1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2017, 2018 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "output/spv/spv.h"
24 #include <libxml/HTMLparser.h>
25 #include <libxml/xmlreader.h>
29 #include "libpspp/assertion.h"
30 #include "libpspp/cast.h"
31 #include "libpspp/hash-functions.h"
32 #include "libpspp/message.h"
33 #include "libpspp/str.h"
34 #include "libpspp/zip-reader.h"
35 #include "output/output-item.h"
36 #include "output/page-setup.h"
37 #include "output/pivot-table.h"
38 #include "output/spv/detail-xml-parser.h"
39 #include "output/spv/light-binary-parser.h"
40 #include "output/spv/spv-css-parser.h"
41 #include "output/spv/spv-legacy-data.h"
42 #include "output/spv/spv-legacy-decoder.h"
43 #include "output/spv/spv-light-decoder.h"
44 #include "output/spv/spv-table-look.h"
45 #include "output/spv/structure-xml-parser.h"
47 #include "gl/c-ctype.h"
48 #include "gl/intprops.h"
49 #include "gl/minmax.h"
50 #include "gl/xalloc.h"
51 #include "gl/xvasprintf.h"
/* Message-translation shorthands: _() passes MSGID to gettext(), and N_()
   expands to MSGID unchanged. */
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) (msgid)
/* NOTE(review): these three declarations look like the members of a
   structure whose opening and closing lines are not part of this listing --
   confirm before relying on them. */
60 struct zip_reader *zip;
61 struct spv_item *root;
62 struct page_setup *page_setup;
/* Walks the children of PARENT in order, looking for an element node whose
   name equals CHILD_NAME.  PARENT is dereferenced unconditionally, so it must
   not be null.

   NOTE(review): the lines that return the result are not visible here;
   presumably the first match is returned, or NULL if there is none (callers
   test the result for null) -- confirm. */
66 find_xml_child_element (xmlNode *parent, const char *child_name)
68 for (xmlNode *node = parent->children; node; node = node->next)
69 if (node->type == XML_ELEMENT_NODE
71 && !strcmp (CHAR_CAST (char *, node->name), child_name))
/* Looks up attribute NAME on NODE with xmlGetProp() and returns the result
   cast to `char *'.

   NOTE(review): libxml2 documents xmlGetProp() as returning a newly allocated
   string, or NULL if the attribute is missing, so callers are presumably
   expected to free the result -- confirm. */
78 get_xml_attr (const xmlNode *node, const char *name)
80 return CHAR_CAST (char *, xmlGetProp (node, CHAR_CAST (xmlChar *, name)));
/* Appends an XML attribute to DST, in the form: space, NAME, `="', VALUE,
   closing `"'.  The bytes of VALUE are copied one at a time; the ones that
   are special inside a double-quoted attribute value are written as the
   predefined XML entities instead (`&' as &amp;, `<' as &lt;, `>' as &gt;,
   `"' as &quot;), and everything else is copied through with ds_put_byte().

   The entity strings had been reduced to the bare characters they stand for,
   which both defeated the escaping and left an invalid string literal for
   the double-quote case.

   NOTE(review): the first special case writes a single space.  Which input
   byte it handles is not visible in this listing, and it may originally have
   been a numeric character reference -- confirm.  The handling of a null
   VALUE is likewise not visible here. */
84 put_xml_attr (const char *name, const char *value, struct string *dst)
89 ds_put_format (dst, " %s=\"", name);
90 for (const char *p = value; *p; p++)
95 ds_put_cstr (dst, " ");
98 ds_put_cstr (dst, "&amp;");
101 ds_put_cstr (dst, "&lt;");
104 ds_put_cstr (dst, "&gt;");
107 ds_put_cstr (dst, "&quot;");
110 ds_put_byte (dst, *p);
114 ds_put_byte (dst, '"');
/* Appends to S a markup rendering of the HTML subtree rooted at NODE,
   recursing into NODE's children.  BASE_FONT_SIZE is the size that a
   <font size="N"> attribute (N from 1 to 7) is scaled against.

   Two fixes relative to the previous text of this function: the comment
   about U+00A0 was never closed, so it swallowed the rest of the function,
   and the three replacement strings for text nodes had been reduced to the
   bare characters `<', `>' and `&', so markup-special characters in text
   were emitted unescaped.

   NOTE(review): this listing omits some lines of the function (braces, a few
   assignments and conditions), so the comments below describe only what the
   visible lines do. */
118 extract_html_text (const xmlNode *node, int base_font_size, struct string *s)
120 if (node->type == XML_ELEMENT_NODE)
122 const char *name = CHAR_CAST (char *, node->name);
/* <br> turns into a newline. */
123 if (!strcmp (name, "br"))
124 ds_put_byte (s, '\n');
/* Every other element except <style> is handled here. */
125 else if (strcmp (name, "style"))
127 const char *tag = NULL;
/* The one-letter elements <b>, <i> and <u> get an opening tag of their
   own. */
128 if (strchr ("biu", name[0]) && name[1] == '\0')
131 ds_put_format (s, "<%s>", tag);
/* <font> gets an opening tag followed by face, color and size
   attributes. */
133 else if (!strcmp (name, "font"))
136 ds_put_format (s, "<%s", tag);
138 char *face = get_xml_attr (node, "face");
139 put_xml_attr ("face", face, s);
142 char *color = get_xml_attr (node, "color");
/* The color is either passed through as is (the condition for this is not
   visible here) or, when written as "rgb (R, G, B)", converted to
   "#rrggbb". */
146 put_xml_attr ("color", color, s);
150 if (sscanf (color, "rgb (%"SCNu8", %"SCNu8", %"SCNu8")",
154 snprintf (color2, sizeof color2,
155 "#%02"PRIx8"%02"PRIx8"%02"PRIx8,
157 put_xml_attr ("color", color2, s);
163 char *size_s = get_xml_attr (node, "size");
164 int html_size = size_s ? atoi (size_s) : 0;
/* HTML sizes 1 through 7 select a multiple of BASE_FONT_SIZE from a fixed
   table; the value written is that size times 1024, rounded by "%.0f".
   Other sizes produce no size attribute. */
166 if (html_size >= 1 && html_size <= 7)
168 static const double scale[7] = {
169 .444, .556, .667, .778, 1.0, 1.33, 2.0
171 double size = base_font_size * scale[html_size - 1];
173 char size2[INT_BUFSIZE_BOUND (int)];
174 snprintf (size2, sizeof size2, "%.0f", size * 1024.);
175 put_xml_attr ("size", size2, s);
/* This closes the opening <font ...> tag; it is a literal `>', not escaped
   text. */
178 ds_put_cstr (s, ">");
180 for (const xmlNode *child = node->children; child;
182 extract_html_text (child, base_font_size, s);
184 ds_put_format (s, "</%s>", tag);
187 else if (node->type == XML_TEXT_NODE)
189 /* U+00A0 NONBREAKING SPACE is really, really common in SPV text and it
190 makes it impossible to break syntax across lines. Translate it into a
191 regular space. (Note that U+00A0 is C2 A0 in UTF-8.)
193 Do the same for U+2007 FIGURE SPACE, which also crops up weirdly
   sometimes.  (U+2007 is E2 80 87 in UTF-8.) */
195 ds_extend (s, ds_length (s) + xmlStrlen (node->content));
196 for (const uint8_t *p = node->content; *p;)
199 if (p[0] == 0xc2 && p[1] == 0xa0)
204 else if (p[0] == 0xe2 && p[1] == 0x80 && p[2] == 0x87)
/* The last byte already in S is examined to see whether it is white
   space.  NOTE(review): the statement this guards is not visible here;
   presumably it avoids emitting consecutive spaces -- confirm. */
214 int last = ds_last (s);
215 if (last != EOF && !c_isspace (last))
/* Characters that are special in markup are written as entities. */
219 ds_put_cstr (s, "&lt;");
221 ds_put_cstr (s, "&gt;");
223 ds_put_cstr (s, "&amp;");
/* Takes the text content of NODE, obtained with xmlNodeGetContent(), and
   parses it as UTF-8 HTML with htmlReadMemory().  The parser runs in recovery
   mode with errors and warnings suppressed, blank nodes dropped, and network
   access disabled.

   NOTE(review): the lines that check and return the result are not visible
   here; presumably the parsed document is returned, or NULL on failure, and
   the caller releases it with xmlFreeDoc() as the callers in this file do --
   confirm. */
231 parse_embedded_html (const xmlNode *node)
233 /* Extract HTML from XML node. */
234 char *html_s = CHAR_CAST (char *, xmlNodeGetContent (node));
238 xmlDoc *html_doc = htmlReadMemory (
239 html_s, strlen (html_s),
240 NULL, "UTF-8", (HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
241 | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS
242 | HTML_PARSE_NONET));
248 /* Given NODE, which should contain HTML content, returns the text within that
249 content as an allocated string. The caller must eventually free the
250 returned string. */
/* The returned string is taken from a `struct string' with ds_steal_cstr(),
   not allocated by libxml2, so it should presumably be released with free()
   rather than xmlFree() -- NOTE(review): confirm against ds_steal_cstr().

   *FONT_STYLE is always overwritten: it starts from FONT_STYLE_INITIALIZER
   with size 10, is then adjusted by any CSS found in the document's
   <head><style> element, and finally has its `markup' flag set.  The caller
   owns whatever *FONT_STYLE ends up holding. */
252 decode_embedded_html (const xmlNode *node, struct font_style *font_style)
254 struct string markup = DS_EMPTY_INITIALIZER;
255 *font_style = (struct font_style) FONT_STYLE_INITIALIZER;
256 font_style->size = 10;
258 xmlDoc *html_doc = parse_embedded_html (node);
/* Locate <head><style> under the document root.  Each step tolerates the
   previous one having found nothing. */
261 xmlNode *root = xmlDocGetRootElement (html_doc);
262 xmlNode *head = root ? find_xml_child_element (root, "head") : NULL;
263 xmlNode *style = head ? find_xml_child_element (head, "style") : NULL;
/* Apply the style sheet text to FONT_STYLE. */
266 uint8_t *style_s = xmlNodeGetContent (style);
267 spv_parse_css_style (CHAR_CAST (char *, style_s), font_style);
/* The size held in FONT_STYLE at this point is the base for scaling
   <font size> attributes. */
272 extract_html_text (root, font_style->size, &markup);
273 xmlFreeDoc (html_doc);
276 font_style->markup = true;
277 return ds_steal_cstr (&markup);
/* Builds a log text item from the embedded HTML in CT.  The HTML is decoded
   with decode_embedded_html() into markup text plus a heap-allocated font
   style; these go into a new user-provided text pivot value, which becomes
   the content of a TEXT_ITEM_LOG item whose command name is copied from CT.

   NOTE(review): some initializer fields and the return statement are not
   visible in this listing; presumably the new item is returned -- confirm. */
280 static struct output_item *
281 decode_container_text (const struct spvsx_container_text *ct)
283 struct font_style *font_style = xmalloc (sizeof *font_style);
284 char *text = decode_embedded_html (ct->html->node_.raw, font_style);
286 struct pivot_value *value = xmalloc (sizeof *value);
287 *value = (struct pivot_value) {
289 .type = PIVOT_VALUE_TEXT,
293 .user_provided = true,
/* Attach the decoded font style to the value's extended data. */
296 pivot_value_ex_rw (value)->font_style = font_style;
298 struct output_item *item = text_item_create_value (TEXT_ITEM_LOG,
300 output_item_set_command_name (item, ct->command_name);
/* Fills in OUT from the element IN.  The horizontal alignment is centered if
   IN's "style" attribute contains "center", otherwise right if it contains
   "right", otherwise (including when there is no such attribute) left.  The
   markup is IN's embedded HTML as decoded by decode_embedded_html(); the font
   style which that decoding also produces is discarded. */
305 decode_page_p (const xmlNode *in, struct page_paragraph *out)
307 char *style = get_xml_attr (in, "style");
308 out->halign = (style && strstr (style, "center") ? TABLE_HALIGN_CENTER
309 : style && strstr (style, "right") ? TABLE_HALIGN_RIGHT
310 : TABLE_HALIGN_LEFT);
313 struct font_style font_style;
314 out->markup = decode_embedded_html (in, &font_style);
315 font_style_uninit (&font_style);
/* Initializes PH to all zeros and then, if PAGE_PARAGRAPH has paragraph
   text, parses that text as HTML and appends one entry to PH->paragraphs
   (growing the array with xrealloc()) for each <p> element that is a direct
   child of the document's <body>, decoding each with decode_page_p().

   The parsed document may have no root element, in which case
   xmlDocGetRootElement() returns a null pointer; find_xml_child_element()
   dereferences its PARENT argument, so the root is checked before the
   search for <body>, the same way decode_embedded_html() does it.

   NOTE(review): the lines between some of the statements below are not
   visible in this listing, including whatever handles a missing HTML
   document or a missing <body> -- confirm that a null BODY is skipped. */
319 decode_page_paragraph (const struct spvsx_page_paragraph *page_paragraph,
320 struct page_heading *ph)
322 memset (ph, 0, sizeof *ph);
324 const struct spvsx_page_paragraph_text *page_paragraph_text
325 = page_paragraph->page_paragraph_text;
326 if (!page_paragraph_text)
329 xmlDoc *html_doc = parse_embedded_html (page_paragraph_text->node_.raw);
333 xmlNode *root = xmlDocGetRootElement (html_doc);
334 xmlNode *body = root ? find_xml_child_element (root, "body") : NULL;
336 for (const xmlNode *node = body->children; node; node = node->next)
337 if (node->type == XML_ELEMENT_NODE
338 && !strcmp (CHAR_CAST (const char *, node->name), "p"))
340 ph->paragraphs = xrealloc (ph->paragraphs,
341 (ph->n + 1) * sizeof *ph->paragraphs);
342 decode_page_p (node, &ph->paragraphs[ph->n++]);
344 xmlFreeDoc (html_doc);
/* Reads all of member BIN_MEMBER from ZIP and parses it as a "light" binary
   table with spvlb_parse_table().  The visible failure cases are: the member
   cannot be read; the member is empty ("light table member is empty"); the
   parser rejects the data (message from spvbin_input_to_error()); and data
   remains after the parsed table ("expected end of file at offset ...").

   NOTE(review): the lines that store into *TABLEP and return are not visible
   in this listing; presumably the result is NULL on success and an allocated
   error string otherwise -- confirm. */
347 char * WARN_UNUSED_RESULT
348 spv_read_light_table (struct zip_reader *zip, const char *bin_member,
349 struct spvlb_table **tablep)
355 char *error = zip_member_read_all (zip, bin_member, &data, &size);
/* Wrap the raw bytes for the binary parser. */
359 struct spvbin_input input;
360 spvbin_input_init (&input, data, size);
362 struct spvlb_table *table = NULL;
364 ? xasprintf ("light table member is empty")
365 : !spvlb_parse_table (&input, &table)
366 ? spvbin_input_to_error (&input, NULL)
367 : input.ofs != input.size
368 ? xasprintf ("expected end of file at offset %#zx", input.ofs)
/* Reads the light binary table in member BIN_MEMBER of ZIP with
   spv_read_light_table(), decodes it into a pivot table with
   decode_spvlb_table(), which stores the result through TABLEP, and then
   frees the raw table.

   NOTE(review): the error checks and the return statement are not visible in
   this listing; presumably the result is NULL on success and an allocated
   error string otherwise -- confirm. */
376 static char * WARN_UNUSED_RESULT
377 pivot_table_open_light (struct zip_reader *zip, const char *bin_member,
378 struct pivot_table **tablep)
382 struct spvlb_table *raw_table;
383 char *error = spv_read_light_table (zip, bin_member, &raw_table);
385 error = decode_spvlb_table (raw_table, tablep);
386 spvlb_free_table (raw_table);
/* Reads all of member BIN_MEMBER from ZIP and decodes it into DATA with
   spv_legacy_data_decode().

   NOTE(review): the error checks, cleanup and return statement are not
   visible in this listing; presumably the result is NULL on success and an
   allocated error string otherwise -- confirm. */
391 char * WARN_UNUSED_RESULT
392 spv_read_legacy_data (struct zip_reader *zip, const char *bin_member,
393 struct spv_data *data)
397 char *error = zip_member_read_all (zip, bin_member, &raw, &size);
400 error = spv_legacy_data_decode (raw, size, data);
/* Opens member XML_MEMBER of ZIP and feeds it, a buffer at a time, to a
   libxml2 push parser.  KEEP_BLANKS is passed to xmlKeepBlanksDefault()
   before the parser is created.  The visible failure cases are: the member
   cannot be opened; the parser cannot be created; reading the member fails
   (error taken from the zip member); the document is not well-formed; and
   the root element is not named ROOT_ELEMENT_NAME.

   NOTE(review): the final parameter and the success path are not visible in
   this listing; callers pass the address of an `xmlDoc *' as the last
   argument, so presumably the parsed document is stored there and NULL is
   returned on success -- confirm.  Also note that xmlKeepBlanksDefault()
   sets a libxml2 default rather than an option on this one parser. */
407 char * WARN_UNUSED_RESULT
408 spv_read_xml_member (struct zip_reader *zip, const char *xml_member,
409 bool keep_blanks, const char *root_element_name,
414 struct zip_member *zm;
415 char *error = zip_member_open (zip, xml_member, &zm);
419 xmlParserCtxt *parser;
420 xmlKeepBlanksDefault (keep_blanks);
421 parser = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
424 zip_member_finish (zm);
425 return xasprintf (_("%s: Failed to create XML parser"), xml_member);
/* Push the member's contents through the parser, then signal end of
   input with a final empty chunk. */
430 while ((retval = zip_member_read (zm, buf, sizeof buf)) > 0)
431 xmlParseChunk (parser, buf, retval, false);
432 xmlParseChunk (parser, NULL, 0, true);
/* Take what is needed out of the parser context before freeing it. */
434 xmlDoc *doc = parser->myDoc;
435 bool well_formed = parser->wellFormed;
436 xmlFreeParserCtxt (parser);
440 char *error = zip_member_steal_error (zm);
441 zip_member_finish (zm);
445 zip_member_finish (zm);
450 return xasprintf(_("%s: document is not well-formed"), xml_member);
/* Check that the document has the root element the caller expects. */
453 const xmlNode *root_node = xmlDocGetRootElement (doc);
454 assert (root_node->type == XML_ELEMENT_NODE);
455 if (strcmp (CHAR_CAST (char *, root_node->name), root_element_name))
458 return xasprintf(_("%s: root node is \"%s\" but \"%s\" was expected"),
460 CHAR_CAST (char *, root_node->name), root_element_name);
/* Builds a pivot table from a "legacy" table, which is stored as two zip
   members: binary data in BIN_MEMBER and an XML "visualization" document in
   XML_MEMBER.  The data is read with spv_read_legacy_data(), the XML is read
   and parsed into a `struct spvdx_visualization', and decode_spvdx_table()
   combines the two, together with SUBTYPE and LOOK, storing the result
   through TABLEP.

   NOTE(review): the error checks between these steps and the return
   statement are not visible in this listing; presumably the result is NULL
   on success and an allocated error string otherwise -- confirm. */
467 static char * WARN_UNUSED_RESULT
468 pivot_table_open_legacy (struct zip_reader *zip, const char *bin_member,
469 const char *xml_member, const char *subtype,
470 const struct pivot_table_look *look,
471 struct pivot_table **tablep)
475 struct spv_data data = SPV_DATA_INITIALIZER;
476 char *error = spv_read_legacy_data (zip, bin_member, &data);
/* The XML is read without keeping blank text nodes. */
481 error = spv_read_xml_member (zip, xml_member, false,
482 "visualization", &doc);
486 struct spvxml_context ctx = SPVXML_CONTEXT_INIT (ctx);
487 struct spvdx_visualization *v;
488 spvdx_parse_visualization (&ctx, xmlDocGetRootElement (doc), &v);
489 error = spvxml_context_finish (&ctx, &v->node_);
493 error = decode_spvdx_table (v, subtype, look, &data, tablep);
495 spvdx_free_visualization (v);
500 spv_data_uninit (&data);
/* Creates a table output item for TABLE, whose content lives in ZIP.  The
   binary member name comes from the table structure's data path and the XML
   member name, if any, from its path.  Two ways of opening the table are
   visible: pivot_table_open_legacy(), which first needs a table look decoded
   from the table properties (a table without them yields the error "Legacy
   table lacks tableProperties"), and pivot_table_open_light().  When an error
   results, a two-cell text pivot table holding "Error" and the error message
   is created in place of the real table, taking ownership of the message.

   The item records the command name, whether an error occurred, a new
   reference to ZIP, and copies of the member names.

   NOTE(review): the condition that chooses between the legacy and light
   paths, and the return statement, are not visible in this listing;
   presumably the choice depends on whether there is an XML member and the
   new item is returned -- confirm. */
505 static struct output_item *
506 spv_read_table_item (struct zip_reader *zip,
507 const struct spvsx_table *table)
509 const struct spvsx_table_structure *ts = table->table_structure;
510 const char *bin_member = ts->data_path->text;
511 const char *xml_member = ts->path ? ts->path->text : NULL;
513 struct pivot_table *pt = NULL;
517 struct pivot_table_look *look;
518 error = (table->table_properties
519 ? spv_table_look_decode (table->table_properties, &look)
520 : xstrdup ("Legacy table lacks tableProperties"));
523 error = pivot_table_open_legacy (zip, bin_member, xml_member,
524 table->sub_type, look, &pt);
525 pivot_table_look_unref (look);
529 error = pivot_table_open_light (zip, bin_member, &pt);
531 pt = pivot_table_create_for_text (
532 pivot_value_new_text (N_("Error")),
533 pivot_value_new_user_text_nocopy (error));
535 struct output_item *item = table_item_create (pt);
536 output_item_set_command_name (item, table->command_name);
537 output_item_add_spv_info (item);
/* Only the fact that there was an error is recorded here; the message
   itself was handed to the substitute table above. */
538 item->spv_info->error = error != NULL;
539 item->spv_info->zip_reader = zip_reader_ref (zip);
540 item->spv_info->bin_member = xstrdup (bin_member);
541 item->spv_info->xml_member = xstrdup_if_nonnull (xml_member);
/* Read callback with the signature cairo expects of a stream reader: ZM_ is
   the `struct zip_member' to read from, DATA the destination buffer, and
   LENGTH the number of bytes wanted.  Data is pulled with zip_member_read();
   the function returns CAIRO_STATUS_READ_ERROR on failure and
   CAIRO_STATUS_SUCCESS otherwise.

   NOTE(review): the conditions that lead to the two error returns, and any
   loop around the read, are not visible in this listing -- confirm. */
545 static cairo_status_t
546 read_from_zip_member (void *zm_, unsigned char *data, unsigned int length)
548 struct zip_member *zm = zm_;
550 return CAIRO_STATUS_READ_ERROR;
554 int n = zip_member_read (zm, data, length);
556 return CAIRO_STATUS_READ_ERROR;
562 return CAIRO_STATUS_SUCCESS;
/* Reads member PNG_MEMBER of ZIP as a PNG image, streaming it into cairo
   through read_from_zip_member(), and wraps the resulting surface in an image
   output item.  The item gets COMMAND_NAME as its command name and records a
   new reference to ZIP and a copy of PNG_MEMBER.  If cairo reports any status
   other than success for the surface, the function returns the error string
   "reading image failed" instead.

   NOTE(review): the store into *ITEMP and the final return are not visible
   in this listing; presumably the item is stored there and NULL is returned
   on success -- confirm. */
565 static char * WARN_UNUSED_RESULT
566 spv_read_image (struct zip_reader *zip, const char *png_member,
567 const char *command_name, struct output_item **itemp)
569 struct zip_member *zm;
570 char *error = zip_member_open (zip, png_member, &zm);
574 cairo_surface_t *surface = cairo_image_surface_create_from_png_stream (
575 read_from_zip_member, zm);
/* The member is no longer needed once cairo has consumed the stream. */
577 zip_member_finish (zm);
579 if (cairo_surface_status (surface) != CAIRO_STATUS_SUCCESS)
580 return xstrdup ("reading image failed");
582 struct output_item *item = image_item_create (surface);
583 output_item_set_command_name (item, command_name);
584 output_item_add_spv_info (item);
585 item->spv_info->zip_reader = zip_reader_ref (zip);
586 item->spv_info->png_member = xstrdup (png_member);
/* Creates a log text item whose text is S and marks it, in its SPV info, as
   an error.  The item is created with the "nocopy" constructor, so S is
   handed over rather than copied.

   NOTE(review): the rest of the constructor call and the return statement
   are not visible in this listing; presumably the new item is returned --
   confirm. */
591 static struct output_item *
592 error_item_create (char *s)
594 struct output_item *item = text_item_create_nocopy (TEXT_ITEM_LOG, s,
596 output_item_add_spv_info (item);
597 item->spv_info->error = true;
/* Converts container C, read from ZIP, into an output item.  C must hold
   exactly one content node (this is asserted).  Text containers and tables
   are decoded directly into items; objects and images are read as PNG images
   with spv_read_image(); graphs, models and trees produce a "not yet
   implemented" error.  An error is turned into an error text item with
   error_item_create().  The item's label is taken from C's label and its
   `show' flag from whether C's visibility is SPVSX_VISIBILITY_VISIBLE.

   NOTE(review): the declaration of `error', the condition guarding the error
   item, and the return statement are not visible in this listing --
   confirm. */
601 static struct output_item *
602 spv_decode_container (struct zip_reader *zip,
603 const struct spvsx_container *c)
605 assert (c->n_seq == 1);
606 struct spvxml_node *content = c->seq[0];
608 struct output_item *item = NULL;
610 if (spvsx_is_container_text (content))
612 item = decode_container_text (spvsx_cast_container_text (content));
615 else if (spvsx_is_table (content))
617 item = spv_read_table_item (zip, spvsx_cast_table (content));
620 else if (spvsx_is_object (content))
622 struct spvsx_object *object = spvsx_cast_object (content);
623 error = spv_read_image (zip, object->uri, object->command_name, &item);
625 else if (spvsx_is_image (content))
627 struct spvsx_image *image = spvsx_cast_image (content);
628 error = spv_read_image (zip, image->data_path->text, image->command_name,
631 else if (spvsx_is_graph (content))
632 error = xstrdup ("graphs not yet implemented");
633 else if (spvsx_is_model (content))
634 error = xstrdup ("models not yet implemented");
635 else if (spvsx_is_tree (content))
636 error = xstrdup ("trees not yet implemented");
641 item = error_item_create (error);
643 output_item_set_label (item, c->label->text);
644 item->show = c->visibility == SPVSX_VISIBILITY_VISIBLE;
/* If STRUCTURE_MEMBER is nonnull, makes sure ITEM has SPV info and fills in
   whichever of its `zip_reader' and `structure_member' fields are still
   unset: the former with a new reference to ZIP, the latter with a copy of
   STRUCTURE_MEMBER.  Fields that are already set are left alone.  Does
   nothing when STRUCTURE_MEMBER is null. */
650 set_structure_member (struct output_item *item, struct zip_reader *zip,
651 const char *structure_member)
653 if (structure_member)
655 output_item_add_spv_info (item);
656 if (!item->spv_info->zip_reader)
657 item->spv_info->zip_reader = zip_reader_ref (zip);
658 if (!item->spv_info->structure_member)
659 item->spv_info->structure_member = xstrdup (structure_member);
/* Converts each of the N_SEQ nodes in SEQ into an output item and adds it to
   group item PARENT, in order.  A container becomes the item returned by
   spv_decode_container(), preceded by a page-break item when the container
   asks for a page break before it.  A heading becomes a new group item whose
   own children are decoded by a recursive call.  Each child is tagged with
   STRUCTURE_MEMBER through set_structure_member(), which does nothing when
   STRUCTURE_MEMBER is null.

   NOTE(review): what happens to a node that is neither a container nor a
   heading is not visible in this listing -- confirm. */
664 spv_decode_children (struct zip_reader *zip, const char *structure_member,
665 struct spvxml_node **seq, size_t n_seq,
666 struct output_item *parent)
668 for (size_t i = 0; i < n_seq; i++)
670 const struct spvxml_node *node = seq[i];
672 struct output_item *child;
673 if (spvsx_is_container (node))
675 const struct spvsx_container *container
676 = spvsx_cast_container (node);
678 if (container->page_break_before_present)
679 group_item_add_child (parent, page_break_item_create ());
681 child = spv_decode_container (zip, container);
683 else if (spvsx_is_heading (node))
685 const struct spvsx_heading *subheading = spvsx_cast_heading (node);
687 child = group_item_create (subheading->command_name,
688 subheading->label->text);
/* The group is shown exactly when the heading carries no visibility
   attribute. */
689 child->show = !subheading->heading_visibility_present;
691 /* Pass NULL for 'structure_member' so that only top-level items get
692 tagged that way. Lower-level items are always in the same
693 structure member as their parent anyway. */
694 spv_decode_children (zip, NULL, subheading->seq,
695 subheading->n_seq, child);
700 set_structure_member (child, zip, structure_member);
701 group_item_add_child (parent, child);
/* Allocates a page setup initialized from PAGE_SETUP_INITIALIZER and then
   overrides it from IN.  The initial page number is always copied.  Paper
   size, the four margins and the object spacing are copied only when the
   corresponding field of IN is not DBL_MAX, so DBL_MAX acts as "not
   specified" and leaves the default in place.  The chart size is mapped from
   the SPVSX_CHART_SIZE_* values to the PAGE_CHART_* values.  The page header
   and footer paragraphs become headings 0 and 1, and FILE_NAME is copied
   into the result.

   NOTE(review): the last arm of the chart-size mapping and the return
   statement are not visible in this listing; presumably the new page setup
   is returned and owned by the caller -- confirm.  IN->page_header and
   IN->page_footer are dereferenced without a visible null check. */
705 static struct page_setup *
706 decode_page_setup (const struct spvsx_page_setup *in, const char *file_name)
708 struct page_setup *out = xmalloc (sizeof *out);
709 *out = (struct page_setup) PAGE_SETUP_INITIALIZER;
711 out->initial_page_number = in->initial_page_number;
713 if (in->paper_width != DBL_MAX)
714 out->paper[TABLE_HORZ] = in->paper_width;
715 if (in->paper_height != DBL_MAX)
716 out->paper[TABLE_VERT] = in->paper_height;
/* Margins: index 0 is left or top, index 1 is right or bottom. */
718 if (in->margin_left != DBL_MAX)
719 out->margins[TABLE_HORZ][0] = in->margin_left;
720 if (in->margin_right != DBL_MAX)
721 out->margins[TABLE_HORZ][1] = in->margin_right;
722 if (in->margin_top != DBL_MAX)
723 out->margins[TABLE_VERT][0] = in->margin_top;
724 if (in->margin_bottom != DBL_MAX)
725 out->margins[TABLE_VERT][1] = in->margin_bottom;
727 if (in->space_after != DBL_MAX)
728 out->object_spacing = in->space_after;
731 out->chart_size = (in->chart_size == SPVSX_CHART_SIZE_FULL_HEIGHT
732 ? PAGE_CHART_FULL_HEIGHT
733 : in->chart_size == SPVSX_CHART_SIZE_HALF_HEIGHT
734 ? PAGE_CHART_HALF_HEIGHT
735 : in->chart_size == SPVSX_CHART_SIZE_QUARTER_HEIGHT
736 ? PAGE_CHART_QUARTER_HEIGHT
739 decode_page_paragraph (in->page_header->page_paragraph, &out->headings[0]);
740 decode_page_paragraph (in->page_footer->page_paragraph, &out->headings[1]);
742 out->file_name = xstrdup (file_name);
/* Adds to ROOT_ITEM an error text item whose message is STRUCTURE_MEMBER,
   a colon and a space, then the error text.  The item is tagged with ZIP
   and STRUCTURE_MEMBER through set_structure_member().

   NOTE(review): the final parameter (the error string) and what is done with
   it after the message is formatted are not visible in this listing;
   presumably this function takes ownership of it and frees it -- confirm. */
748 spv_add_error_heading (struct output_item *root_item,
749 struct zip_reader *zip, const char *structure_member,
752 struct output_item *item = error_item_create (
753 xasprintf ("%s: %s", structure_member, error));
755 set_structure_member (item, zip, structure_member);
756 group_item_add_child (root_item, item);
/* Reads structure member STRUCTURE_MEMBER of ZIP, which holds a root
   heading, and adds the items it describes to ROOT_ITEM.  If the member
   cannot be read as XML or cannot be parsed as a root heading, an error item
   is added to ROOT_ITEM instead.  If the heading has a page setup, PSP is
   nonnull, and *PSP is still null, then *PSP is set to the decoded page
   setup, with FILE_NAME recorded in it; a page setup already present in *PSP
   is kept.

   The children are decoded by a single call to spv_decode_children(), which
   itself iterates over all ROOT->n_seq entries of ROOT->seq.  The call used
   to sit inside a loop that ran ROOT->n_seq times without using its index,
   so a root heading with N children had every child added N times.

   NOTE(review): some lines of this function (the XML document variable, the
   error checks, and cleanup of the XML document) are not visible in this
   listing. */
760 spv_heading_read (struct zip_reader *zip, struct output_item *root_item,
761 struct page_setup **psp, const char *file_name,
762 const char *structure_member)
765 char *error = spv_read_xml_member (zip, structure_member, true,
769 spv_add_error_heading (root_item, zip, structure_member, error);
773 struct spvxml_context ctx = SPVXML_CONTEXT_INIT (ctx);
774 struct spvsx_root_heading *root;
775 spvsx_parse_root_heading (&ctx, xmlDocGetRootElement (doc), &root);
776 error = spvxml_context_finish (&ctx, &root->node_);
780 spv_add_error_heading (root_item, zip, structure_member, error);
784 if (root->page_setup && psp && !*psp)
785 *psp = decode_page_setup (root->page_setup, file_name);
788 spv_decode_children (zip, structure_member, root->seq, root->n_seq,
791 spvsx_free_root_heading (root);
/* Checks whether ZIP looks like an SPV file.  The test is that the archive
   has a member named META-INF/MANIFEST.MF whose entire content is exactly
   the string "allowPivoting=true" (same length, same bytes).  Any error from
   reading the member is stored in *ERRORP.

   The member name is kept in one place, MEMBER, and used for both the
   existence check and the read, so the two cannot drift apart.

   NOTE(review): the return statements are not visible in this listing;
   callers treat a result that is zero or negative as "not an SPV file" --
   confirm the exact values. */
796 spv_detect__ (struct zip_reader *zip, char **errorp)
800 const char *member = "META-INF/MANIFEST.MF";
801 if (!zip_reader_contains_member (zip, member))
806 *errorp = zip_member_read_all (zip, member,
811 const char *magic = "allowPivoting=true";
812 bool is_spv = size == strlen (magic) && !memcmp (magic, data, size);
818 /* Returns NULL if FILENAME is an SPV file, otherwise an error string that the
819 caller must eventually free(). */
/* FILENAME is opened as a zip archive and checked with spv_detect__().  When
   that check fails without supplying an error of its own, the error is
   "FILENAME: not an SPV file".  The archive is released again before
   returning.

   NOTE(review): the handling of a failure to open the archive and the return
   statement are not visible in this listing. */
820 char * WARN_UNUSED_RESULT
821 spv_detect (const char *filename)
823 struct zip_reader *zip;
824 char *error = zip_reader_create (filename, &zip);
828 if (spv_detect__ (zip, &error) <= 0 && !error)
829 error = xasprintf("%s: not an SPV file", filename);
830 zip_reader_unref (zip);
/* Reads the SPV file FILENAME.  The file is opened as a zip archive and
   checked with spv_detect__(); if that check fails, the result is its error,
   or "FILENAME: not an SPV file" when it supplied none.  Otherwise *OUTP is
   set to a new root item, and every archive member whose name starts with
   "outputViewer" and ends with ".xml" is read with spv_heading_read(), in
   archive order, adding its items to *OUTP.  PSP is passed through to
   spv_heading_read(), which may store a page setup there.

   NOTE(review): the condition tested on the detection result, the loop exit,
   and the final return are not visible in this listing; presumably NULL is
   returned on success -- confirm. */
834 char * WARN_UNUSED_RESULT
835 spv_read (const char *filename, struct output_item **outp,
836 struct page_setup **psp)
842 struct zip_reader *zip;
843 char *error = zip_reader_create (filename, &zip);
847 int detect = spv_detect__ (zip, &error);
850 zip_reader_unref (zip);
851 return error ? error : xasprintf ("%s: not an SPV file", filename);
854 *outp = root_item_create ();
/* Walk the archive's members by index until there are no more names. */
855 for (size_t i = 0; ; i++)
857 const char *structure_member = zip_reader_get_member_name (zip, i);
858 if (!structure_member)
861 struct substring structure_member_ss = ss_cstr (structure_member);
862 if (ss_starts_with (structure_member_ss, ss_cstr ("outputViewer"))
863 && ss_ends_with (structure_member_ss, ss_cstr (".xml")))
864 spv_heading_read (zip, *outp, psp, filename, structure_member);
867 zip_reader_unref (zip);
/* Decodes the 32-bit format code U32 into *OUT.  The values 0x10000 and 1,
   among the cases tested first, yield the output format F40.2.  Otherwise
   bits 16 and up give the raw format type and bits 8 through 15 the width,
   each truncated to 8 bits.  A raw type of 40 or more keeps type F; smaller
   raw types are translated with fmt_from_io().  The result is adjusted with
   fmt_fix_output() and checked with fmt_check_width_compat().  On failure
   *OUT is set to F40.2 and the error string "bad format" followed by U32 in
   hexadecimal is returned.

   NOTE(review): the first part of the initial condition, the computation of
   the decimals D, and the success return are not visible in this listing;
   presumably NULL is returned on success -- confirm. */
871 char * WARN_UNUSED_RESULT
872 spv_decode_fmt_spec (uint32_t u32, struct fmt_spec *out)
875 || (u32 == 0x10000 || u32 == 1 /* both used as string formats */))
877 *out = fmt_for_output (FMT_F, 40, 2);
881 uint8_t raw_type = u32 >> 16;
882 uint8_t w = u32 >> 8;
885 *out = (struct fmt_spec) { .type = FMT_F, .w = w, .d = d };
886 bool ok = raw_type >= 40 || fmt_from_io (raw_type, &out->type);
889 fmt_fix_output (out);
890 ok = fmt_check_width_compat (out, NULL, 0);
895 *out = fmt_for_output (FMT_F, 40, 2);
896 return xasprintf ("bad format %#"PRIx32, u32);