1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2017, 2018 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "output/spv/spv.h"
24 #include <libxml/HTMLparser.h>
25 #include <libxml/xmlreader.h>
29 #include "libpspp/assertion.h"
30 #include "libpspp/cast.h"
31 #include "libpspp/hash-functions.h"
32 #include "libpspp/message.h"
33 #include "libpspp/str.h"
34 #include "libpspp/zip-reader.h"
35 #include "output/output-item.h"
36 #include "output/page-setup.h"
37 #include "output/pivot-table.h"
38 #include "output/spv/detail-xml-parser.h"
39 #include "output/spv/light-binary-parser.h"
40 #include "output/spv/spv-css-parser.h"
41 #include "output/spv/spv-legacy-data.h"
42 #include "output/spv/spv-legacy-decoder.h"
43 #include "output/spv/spv-light-decoder.h"
44 #include "output/spv/spv-table-look.h"
45 #include "output/spv/structure-xml-parser.h"
47 #include "gl/c-ctype.h"
48 #include "gl/intprops.h"
49 #include "gl/minmax.h"
50 #include "gl/xalloc.h"
51 #include "gl/xvasprintf.h"
/* Translation helpers.  _() expands to a gettext() call on MSGID; N_()
   expands to MSGID unchanged. */
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) (msgid)
/* Members of a reader-state structure whose opening line is not visible in
   this excerpt.  NOTE(review): presumably `struct spv_reader', the type that
   spv_read() allocates -- confirm. */
60 struct zip_reader *zip;
61 struct spv_item *root;
62 struct page_setup *page_setup;
/* Scans the direct children of PARENT (following `children' and then `next')
   for an element node whose name equals CHILD_NAME.  Descendants further down
   the tree are not examined.  PARENT is dereferenced unconditionally, so it
   must not be null.

   NOTE(review): the return statements are not visible in this excerpt;
   callers in this file treat the result as an xmlNode pointer that may be
   null. */
66 find_xml_child_element (xmlNode *parent, const char *child_name)
68 for (xmlNode *node = parent->children; node; node = node->next)
69 if (node->type == XML_ELEMENT_NODE
71 && !strcmp (CHAR_CAST (char *, node->name), child_name))
/* Looks up attribute NAME on NODE with xmlGetProp() and returns the result
   cast to `char *'.

   NOTE(review): libxml2 documents xmlGetProp() as returning a newly allocated
   string, or null when the attribute is absent, so the caller presumably owns
   the result and must release it -- confirm against callers. */
78 get_xml_attr (const xmlNode *node, const char *name)
80 return CHAR_CAST (char *, xmlGetProp (node, CHAR_CAST (xmlChar *, name)));
/* Appends ` NAME="VALUE"' (with a leading space) to DST, replacing the
   characters in VALUE that may not appear literally inside a double-quoted
   XML attribute value by entity or character references:

     new-line -> &#10;    & -> &amp;    < -> &lt;    > -> &gt;    " -> &quot;

   All other bytes are copied unchanged.  NAME is written as-is and must
   already be a valid attribute name.

   Does nothing if VALUE is null, so that the result of get_xml_attr() for a
   missing attribute can be passed straight through. */
static void
put_xml_attr (const char *name, const char *value, struct string *dst)
{
  if (!value)
    return;

  ds_put_format (dst, " %s=\"", name);
  for (const char *p = value; *p; p++)
    {
      switch (*p)
        {
        case '\n':
          /* A literal new-line would be normalized to a space by an XML
             parser, so keep it as a character reference. */
          ds_put_cstr (dst, "&#10;");
          break;
        case '&':
          ds_put_cstr (dst, "&amp;");
          break;
        case '<':
          ds_put_cstr (dst, "&lt;");
          break;
        case '>':
          ds_put_cstr (dst, "&gt;");
          break;
        case '"':
          ds_put_cstr (dst, "&quot;");
          break;
        default:
          ds_put_byte (dst, *p);
          break;
        }
    }
  ds_put_byte (dst, '"');
}
118 extract_html_text (const xmlNode *node, int base_font_size, struct string *s)
120 if (node->type == XML_ELEMENT_NODE)
122 const char *name = CHAR_CAST (char *, node->name);
123 if (!strcmp (name, "br"))
124 ds_put_byte (s, '\n');
125 else if (strcmp (name, "style"))
127 const char *tag = NULL;
128 if (strchr ("biu", name[0]) && name[1] == '\0')
131 ds_put_format (s, "<%s>", tag);
133 else if (!strcmp (name, "font"))
136 ds_put_format (s, "<%s", tag);
138 char *face = get_xml_attr (node, "face");
139 put_xml_attr ("face", face, s);
142 char *color = get_xml_attr (node, "color");
146 put_xml_attr ("color", color, s);
150 if (sscanf (color, "rgb (%"SCNu8", %"SCNu8", %"SCNu8")",
154 snprintf (color2, sizeof color2,
155 "#%02"PRIx8"%02"PRIx8"%02"PRIx8,
157 put_xml_attr ("color", color2, s);
163 char *size_s = get_xml_attr (node, "size");
164 int html_size = size_s ? atoi (size_s) : 0;
166 if (html_size >= 1 && html_size <= 7)
168 static const double scale[7] = {
169 .444, .556, .667, .778, 1.0, 1.33, 2.0
171 double size = base_font_size * scale[html_size - 1];
173 char size2[INT_BUFSIZE_BOUND (int)];
174 snprintf (size2, sizeof size2, "%.0f", size * 1024.);
175 put_xml_attr ("size", size2, s);
178 ds_put_cstr (s, ">");
180 for (const xmlNode *child = node->children; child;
182 extract_html_text (child, base_font_size, s);
184 ds_put_format (s, "</%s>", tag);
187 else if (node->type == XML_TEXT_NODE)
189 /* U+00A0 NONBREAKING SPACE is really, really common in SPV text and it
190 makes it impossible to break syntax across lines. Translate it into a
191 regular space. (Note that U+00A0 is C2 A0 in UTF-8.)
193 Do the same for U+2007 FIGURE SPACE, which also crops out weirdly
195 ds_extend (s, ds_length (s) + xmlStrlen (node->content));
196 for (const uint8_t *p = node->content; *p;)
199 if (p[0] == 0xc2 && p[1] == 0xa0)
204 else if (p[0] == 0xe2 && p[1] == 0x80 && p[2] == 0x87)
214 int last = ds_last (s);
215 if (last != EOF && !c_isspace (last))
219 ds_put_cstr (s, "<");
221 ds_put_cstr (s, ">");
223 ds_put_cstr (s, "&");
/* Obtains NODE's text content with xmlNodeGetContent() and parses it as a
   UTF-8 HTML document with htmlReadMemory().  The parser options ask for
   recovery from malformed input, suppress error and warning reports, drop
   blank nodes, and forbid network access.

   NOTE(review): null handling for HTML_S, the release of HTML_S, and the
   return statement are not visible in this excerpt; callers use the result
   as an xmlDoc pointer. */
231 parse_embedded_html (const xmlNode *node)
233 /* Extract HTML from XML node. */
234 char *html_s = CHAR_CAST (char *, xmlNodeGetContent (node));
238 xmlDoc *html_doc = htmlReadMemory (
239 html_s, strlen (html_s),
240 NULL, "UTF-8", (HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
241 | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS
242 | HTML_PARSE_NONET));
248 /* Given NODE, which should contain HTML content, returns the text within that
249 content as an allocated string. The caller must eventually free the
250 returned string.  NOTE(review): the string is produced by ds_steal_cstr(),
   not by libxml2, so free() rather than xmlFree() is presumably the matching
   deallocator -- confirm.

   Also fills in *FONT_STYLE: it is reset to FONT_STYLE_INITIALIZER with a
   default size of 10, handed to spv_parse_css_style() together with the
   content of the document's <head><style> element, and finally flagged as
   markup. */
252 decode_embedded_html (const xmlNode *node, struct font_style *font_style)
254 struct string markup = DS_EMPTY_INITIALIZER;
255 *font_style = (struct font_style) FONT_STYLE_INITIALIZER;
256 font_style->size = 10;
258 xmlDoc *html_doc = parse_embedded_html (node);
261 xmlNode *root = xmlDocGetRootElement (html_doc);
262 xmlNode *head = root ? find_xml_child_element (root, "head") : NULL;
263 xmlNode *style = head ? find_xml_child_element (head, "style") : NULL;
/* NOTE(review): any guard around the next two lines and the release of
   STYLE_S are not visible in this excerpt. */
266 uint8_t *style_s = xmlNodeGetContent (style);
267 spv_parse_css_style (CHAR_CAST (char *, style_s), font_style);
/* The font size in effect at this point is the base size handed to
   extract_html_text(). */
272 extract_html_text (root, font_style->size, &markup);
273 xmlFreeDoc (html_doc);
276 font_style->markup = true;
277 return ds_steal_cstr (&markup);
/* Builds a text output item from the embedded HTML of container text CT.

   decode_embedded_html() converts the HTML and fills in a heap-allocated font
   style.  The font style goes into a new heap-allocated pivot value of type
   PIVOT_VALUE_TEXT that is flagged as user-provided.  An item is then created
   with text_item_create_value() using TEXT_ITEM_LOG and given CT's command
   name.

   NOTE(review): the initializer members between `.type' and `.user_provided',
   the remaining arguments of text_item_create_value(), and the return
   statement are not visible in this excerpt. */
280 static struct output_item *
281 decode_container_text (const struct spvsx_container_text *ct)
283 struct font_style *font_style = xmalloc (sizeof *font_style);
284 char *text = decode_embedded_html (ct->html->node_.raw, font_style);
285 struct pivot_value *value = xmalloc (sizeof *value);
286 *value = (struct pivot_value) {
287 .font_style = font_style,
288 .type = PIVOT_VALUE_TEXT,
293 .user_provided = true,
297 struct output_item *item = text_item_create_value (TEXT_ITEM_LOG,
299 output_item_set_command_name (item, ct->command_name);
/* Fills in OUT from the HTML paragraph element IN.

   The horizontal alignment is derived from IN's `style' attribute by
   substring search: a value containing "center" selects centered, otherwise
   one containing "right" selects right-aligned, and anything else (including
   a missing attribute) selects left-aligned.

   The markup is produced by decode_embedded_html(); the font style that it
   reports is only a temporary and is discarded immediately.

   NOTE(review): the release of STYLE is not visible in this excerpt. */
304 decode_page_p (const xmlNode *in, struct page_paragraph *out)
306 char *style = get_xml_attr (in, "style");
307 out->halign = (style && strstr (style, "center") ? TABLE_HALIGN_CENTER
308 : style && strstr (style, "right") ? TABLE_HALIGN_RIGHT
309 : TABLE_HALIGN_LEFT);
312 struct font_style font_style;
313 out->markup = decode_embedded_html (in, &font_style);
314 font_style_uninit (&font_style);
318 decode_page_paragraph (const struct spvsx_page_paragraph *page_paragraph,
319 struct page_heading *ph)
321 memset (ph, 0, sizeof *ph);
323 const struct spvsx_page_paragraph_text *page_paragraph_text
324 = page_paragraph->page_paragraph_text;
325 if (!page_paragraph_text)
328 xmlDoc *html_doc = parse_embedded_html (page_paragraph_text->node_.raw);
332 xmlNode *root = xmlDocGetRootElement (html_doc);
333 xmlNode *body = find_xml_child_element (root, "body");
335 for (const xmlNode *node = body->children; node; node = node->next)
336 if (node->type == XML_ELEMENT_NODE
337 && !strcmp (CHAR_CAST (const char *, node->name), "p"))
339 ph->paragraphs = xrealloc (ph->paragraphs,
340 (ph->n + 1) * sizeof *ph->paragraphs);
341 decode_page_p (node, &ph->paragraphs[ph->n++]);
343 xmlFreeDoc (html_doc);
/* Reads ZIP member BIN_MEMBER in full with zip_member_read_all() and parses
   it as a "light" binary table with spvlb_parse_table().

   Judging from the visible arms of the conditional below, errors are reported
   for an empty member, for a parse failure (message obtained from
   spvbin_input_to_error()), and for bytes left over after the parsed table
   (input.ofs != input.size).

   NOTE(review): the head and tail of the conditional, the store into *TABLEP,
   the cleanup, and the return statement are not visible in this excerpt.
   Presumably the result is null on success and an allocated error message
   otherwise, as for the other readers in this file -- confirm. */
346 char * WARN_UNUSED_RESULT
347 spv_read_light_table (struct zip_reader *zip, const char *bin_member,
348 struct spvlb_table **tablep)
354 char *error = zip_member_read_all (zip, bin_member, &data, &size);
358 struct spvbin_input input;
359 spvbin_input_init (&input, data, size);
361 struct spvlb_table *table = NULL;
363 ? xasprintf ("light table member is empty")
364 : !spvlb_parse_table (&input, &table)
365 ? spvbin_input_to_error (&input, NULL)
366 : input.ofs != input.size
367 ? xasprintf ("expected end of file at offset %#zx", input.ofs)
/* Reads the light binary table in ZIP member BIN_MEMBER with
   spv_read_light_table(), converts it into a pivot table stored in *TABLEP
   with decode_spvlb_table(), and frees the raw table.

   NOTE(review): the condition under which decoding runs and the return
   statement are not visible in this excerpt; presumably decoding is skipped
   when reading failed and the error string (or null) is returned. */
375 static char * WARN_UNUSED_RESULT
376 pivot_table_open_light (struct zip_reader *zip, const char *bin_member,
377 struct pivot_table **tablep)
381 struct spvlb_table *raw_table;
382 char *error = spv_read_light_table (zip, bin_member, &raw_table);
384 error = decode_spvlb_table (raw_table, tablep);
385 spvlb_free_table (raw_table);
/* Reads ZIP member BIN_MEMBER in full with zip_member_read_all() and decodes
   the raw bytes into DATA with spv_legacy_data_decode().

   NOTE(review): the guard between the two calls, the release of RAW, and the
   return statement are not visible in this excerpt. */
390 char * WARN_UNUSED_RESULT
391 spv_read_legacy_data (struct zip_reader *zip, const char *bin_member,
392 struct spv_data *data)
396 char *error = zip_member_read_all (zip, bin_member, &raw, &size);
399 error = spv_legacy_data_decode (raw, size, data);
/* Opens ZIP member XML_MEMBER and parses it as XML with a libxml2 push
   parser, feeding it one buffer at a time.

   KEEP_BLANKS is passed to xmlKeepBlanksDefault(), which changes a libxml2
   default rather than a setting of this one parser.  ROOT_ELEMENT_NAME is the
   required name of the document's root element.

   Visible error cases: the parser context cannot be created, the ZIP member
   reports an error (zip_member_steal_error()), the document is not well
   formed, and the root element's name differs from ROOT_ELEMENT_NAME.

   NOTE(review): the last parameter (the caller passes an `xmlDoc **'), the
   declarations of BUF and RETVAL, most of the branch conditions, and the
   success path are not visible in this excerpt. */
406 char * WARN_UNUSED_RESULT
407 spv_read_xml_member (struct zip_reader *zip, const char *xml_member,
408 bool keep_blanks, const char *root_element_name,
413 struct zip_member *zm;
414 char *error = zip_member_open (zip, xml_member, &zm);
418 xmlParserCtxt *parser;
419 xmlKeepBlanksDefault (keep_blanks);
420 parser = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
423 zip_member_finish (zm);
424 return xasprintf (_("%s: Failed to create XML parser"), xml_member);
/* Feed the member to the parser chunk by chunk, then signal end of input
   with an empty terminating chunk. */
429 while ((retval = zip_member_read (zm, buf, sizeof buf)) > 0)
430 xmlParseChunk (parser, buf, retval, false);
431 xmlParseChunk (parser, NULL, 0, true);
/* Save what is needed from the parser context before freeing it. */
433 xmlDoc *doc = parser->myDoc;
434 bool well_formed = parser->wellFormed;
435 xmlFreeParserCtxt (parser);
439 char *error = zip_member_steal_error (zm);
440 zip_member_finish (zm);
444 zip_member_finish (zm);
449 return xasprintf(_("%s: document is not well-formed"), xml_member);
452 const xmlNode *root_node = xmlDocGetRootElement (doc);
453 assert (root_node->type == XML_ELEMENT_NODE);
454 if (strcmp (CHAR_CAST (char *, root_node->name), root_element_name))
/* NOTE(review): the message below reads root_node->name.  If a line hidden
   from this excerpt frees DOC before this statement, that pointer would
   dangle -- confirm the ordering. */
457 return xasprintf(_("%s: root node is \"%s\" but \"%s\" was expected"),
459 CHAR_CAST (char *, root_node->name), root_element_name);
/* Builds a pivot table in *TABLEP from a "legacy" table, which is split
   across two ZIP members: binary data in BIN_MEMBER and a detail XML document
   in XML_MEMBER.

   Steps visible below: read the binary data with spv_read_legacy_data(); read
   XML_MEMBER with blanks discarded, requiring a root element named
   "visualization"; parse that root into a `struct spvdx_visualization'; and
   combine it with SUBTYPE, LOOK and the data in decode_spvdx_table().  The
   visualization and the legacy data are released afterward.

   NOTE(review): the error checks between the steps, the declaration and
   release of DOC, and the return statement are not visible in this
   excerpt. */
466 static char * WARN_UNUSED_RESULT
467 pivot_table_open_legacy (struct zip_reader *zip, const char *bin_member,
468 const char *xml_member, const char *subtype,
469 const struct pivot_table_look *look,
470 struct pivot_table **tablep)
474 struct spv_data data = SPV_DATA_INITIALIZER;
475 char *error = spv_read_legacy_data (zip, bin_member, &data);
480 error = spv_read_xml_member (zip, xml_member, false,
481 "visualization", &doc);
485 struct spvxml_context ctx = SPVXML_CONTEXT_INIT (ctx);
486 struct spvdx_visualization *v;
487 spvdx_parse_visualization (&ctx, xmlDocGetRootElement (doc), &v);
488 error = spvxml_context_finish (&ctx, &v->node_);
492 error = decode_spvdx_table (v, subtype, look, &data, tablep);
494 spvdx_free_visualization (v);
499 spv_data_uninit (&data);
/* Creates a table output item for TABLE, reading its content from ZIP.

   The binary member name comes from the table structure's data path; the XML
   member name comes from its optional path and is null when no path is
   present.  Two decoders are visible below: pivot_table_open_legacy(), which
   needs a table look decoded from TABLE's table properties (a table without
   them yields "Legacy table lacks tableProperties"), and
   pivot_table_open_light().  NOTE(review): the conditions selecting between
   them are not visible; presumably a non-null XML_MEMBER selects the legacy
   path -- confirm.

   A replacement pivot table titled "Error" is also created from the error
   text; NOTE(review): presumably only when decoding failed -- confirm.

   The item records the command name, whether an error occurred, a new
   reference to ZIP, and copies of the member names. */
504 static struct output_item *
505 spv_read_table_item (struct zip_reader *zip,
506 const struct spvsx_table *table)
508 const struct spvsx_table_structure *ts = table->table_structure;
509 const char *bin_member = ts->data_path->text;
510 const char *xml_member = ts->path ? ts->path->text : NULL;
512 struct pivot_table *pt = NULL;
516 struct pivot_table_look *look;
517 error = (table->table_properties
518 ? spv_table_look_decode (table->table_properties, &look)
519 : xstrdup ("Legacy table lacks tableProperties"));
522 error = pivot_table_open_legacy (zip, bin_member, xml_member,
523 table->sub_type, look, &pt);
524 pivot_table_look_unref (look);
528 error = pivot_table_open_light (zip, bin_member, &pt);
530 pt = pivot_table_create_for_text (
531 pivot_value_new_text (N_("Error")),
532 pivot_value_new_user_text_nocopy (error));
534 struct output_item *item = table_item_create (pt);
535 output_item_set_command_name (item, table->command_name);
536 output_item_add_spv_info (item);
537 item->spv_info->error = error != NULL;
538 item->spv_info->zip_reader = zip_reader_ref (zip);
539 item->spv_info->bin_member = xstrdup (bin_member);
540 item->spv_info->xml_member = xstrdup_if_nonnull (xml_member);
/* Read callback that spv_read_image() hands to
   cairo_image_surface_create_from_png_stream().  ZM_ is the closure argument
   and is used as a `struct zip_member *'; bytes are read from it into DATA
   with zip_member_read().

   Returns CAIRO_STATUS_SUCCESS or CAIRO_STATUS_READ_ERROR.

   NOTE(review): the conditions guarding the two error returns and any loop
   around the read are not visible in this excerpt; presumably the callback
   keeps reading until LENGTH bytes have been delivered -- confirm. */
544 static cairo_status_t
545 read_from_zip_member (void *zm_, unsigned char *data, unsigned int length)
547 struct zip_member *zm = zm_;
549 return CAIRO_STATUS_READ_ERROR;
553 int n = zip_member_read (zm, data, length);
555 return CAIRO_STATUS_READ_ERROR;
561 return CAIRO_STATUS_SUCCESS;
/* Decodes ZIP member PNG_MEMBER as a PNG image into a Cairo surface and wraps
   it in an image output item.

   The member is streamed to Cairo through read_from_zip_member() and closed
   as soon as decoding returns.  If the surface's status is not
   CAIRO_STATUS_SUCCESS, the allocated message "reading image failed" is
   returned.  Otherwise the item gets COMMAND_NAME, a new reference to ZIP,
   and a copy of PNG_MEMBER.

   NOTE(review): the handling of a failed zip_member_open(), the store into
   *ITEMP, and the success return are not visible in this excerpt. */
564 static char * WARN_UNUSED_RESULT
565 spv_read_image (struct zip_reader *zip, const char *png_member,
566 const char *command_name, struct output_item **itemp)
568 struct zip_member *zm;
569 char *error = zip_member_open (zip, png_member, &zm);
573 cairo_surface_t *surface = cairo_image_surface_create_from_png_stream (
574 read_from_zip_member, zm);
576 zip_member_finish (zm);
578 if (cairo_surface_status (surface) != CAIRO_STATUS_SUCCESS)
579 return xstrdup ("reading image failed");
581 struct output_item *item = image_item_create (surface);
582 output_item_set_command_name (item, command_name);
583 output_item_add_spv_info (item);
584 item->spv_info->zip_reader = zip_reader_ref (zip);
585 item->spv_info->png_member = xstrdup (png_member);
/* Creates a TEXT_ITEM_LOG text item from S with text_item_create_nocopy() and
   marks its SPV info as an error.

   NOTE(review): the `_nocopy' suffix suggests that the item takes ownership
   of S -- confirm.  The remaining argument of the call and the return
   statement are not visible in this excerpt. */
590 static struct output_item *
591 error_item_create (char *s)
593 struct output_item *item = text_item_create_nocopy (TEXT_ITEM_LOG, s,
595 output_item_add_spv_info (item);
596 item->spv_info->error = true;
/* Converts container C, which must hold exactly one content node (asserted),
   into an output item.

   Dispatch on the content node's kind:
     - container text -> decode_container_text()
     - table          -> spv_read_table_item()
     - object, image  -> spv_read_image()
     - graph, model, tree -> an error, because they are not yet implemented.

   An error item is created from ERROR with error_item_create();
   NOTE(review): presumably only when ERROR is non-null -- confirm.  The item
   takes its label from C's label and is shown only when C's visibility is
   SPVSX_VISIBILITY_VISIBLE.

   NOTE(review): the declaration of ERROR, any final `else' branch, and the
   return statement are not visible in this excerpt. */
600 static struct output_item *
601 spv_decode_container (struct zip_reader *zip,
602 const struct spvsx_container *c)
604 assert (c->n_seq == 1);
605 struct spvxml_node *content = c->seq[0];
607 struct output_item *item = NULL;
609 if (spvsx_is_container_text (content))
611 item = decode_container_text (spvsx_cast_container_text (content));
614 else if (spvsx_is_table (content))
616 item = spv_read_table_item (zip, spvsx_cast_table (content));
619 else if (spvsx_is_object (content))
621 struct spvsx_object *object = spvsx_cast_object (content);
622 error = spv_read_image (zip, object->uri, object->command_name, &item);
624 else if (spvsx_is_image (content))
626 struct spvsx_image *image = spvsx_cast_image (content);
627 error = spv_read_image (zip, image->data_path->text, image->command_name,
630 else if (spvsx_is_graph (content))
631 error = xstrdup ("graphs not yet implemented");
632 else if (spvsx_is_model (content))
633 error = xstrdup ("models not yet implemented");
634 else if (spvsx_is_tree (content))
635 error = xstrdup ("trees not yet implemented");
640 item = error_item_create (error);
642 output_item_set_label (item, c->label->text);
643 item->show = c->visibility == SPVSX_VISIBILITY_VISIBLE;
/* Records in ITEM's SPV info which structure member ITEM came from.

   Does nothing when STRUCTURE_MEMBER is null.  Otherwise it calls
   output_item_add_spv_info() and then fills in the ZIP reader (as a new
   reference to ZIP) and a copy of STRUCTURE_MEMBER, each only if that field
   is still unset, so existing values are never overwritten. */
649 set_structure_member (struct output_item *item, struct zip_reader *zip,
650 const char *structure_member)
652 if (structure_member)
654 output_item_add_spv_info (item);
655 if (!item->spv_info->zip_reader)
656 item->spv_info->zip_reader = zip_reader_ref (zip);
657 if (!item->spv_info->structure_member)
658 item->spv_info->structure_member = xstrdup (structure_member);
/* Converts each of the N_SEQ nodes in SEQ into an output item and appends it
   to group item PARENT.

   A container node becomes the item returned by spv_decode_container(),
   preceded by a page-break item when the container has a page-break-before
   attribute.  A heading node becomes a group item whose children are decoded
   recursively; the group is shown only when the heading carries no
   heading-visibility attribute.

   Every child is tagged through set_structure_member(), which does nothing
   when STRUCTURE_MEMBER is null.

   NOTE(review): the handling of nodes that are neither containers nor
   headings is not visible in this excerpt. */
663 spv_decode_children (struct zip_reader *zip, const char *structure_member,
664 struct spvxml_node **seq, size_t n_seq,
665 struct output_item *parent)
667 for (size_t i = 0; i < n_seq; i++)
669 const struct spvxml_node *node = seq[i];
671 struct output_item *child;
672 if (spvsx_is_container (node))
674 const struct spvsx_container *container
675 = spvsx_cast_container (node);
677 if (container->page_break_before_present)
678 group_item_add_child (parent, page_break_item_create ());
680 child = spv_decode_container (zip, container);
682 else if (spvsx_is_heading (node))
684 const struct spvsx_heading *subheading = spvsx_cast_heading (node);
686 child = group_item_create (subheading->command_name,
687 subheading->label->text);
688 child->show = !subheading->heading_visibility_present;
690 /* Pass NULL for 'structure_member' so that only top-level items get
691 tagged that way. Lower-level items are always in the same
692 structure member as their parent anyway. */
693 spv_decode_children (zip, NULL, subheading->seq,
694 subheading->n_seq, child);
699 set_structure_member (child, zip, structure_member);
700 group_item_add_child (parent, child);
/* Converts the parsed page setup IN into a newly allocated `struct
   page_setup', starting from PAGE_SETUP_INITIALIZER.

   DBL_MAX acts as the "not specified" marker for the paper size, the four
   margins and the object spacing: each is copied only when IN's value differs
   from DBL_MAX, so the initializer's default survives otherwise.  The initial
   page number is copied unconditionally.

   Headings 0 and 1 are decoded from the page header and page footer
   respectively; IN's `page_header' and `page_footer' are dereferenced
   without a null check.  FILE_NAME is copied into the result.

   NOTE(review): the final arm of the chart-size conditional and the return
   statement are not visible in this excerpt. */
704 static struct page_setup *
705 decode_page_setup (const struct spvsx_page_setup *in, const char *file_name)
707 struct page_setup *out = xmalloc (sizeof *out);
708 *out = (struct page_setup) PAGE_SETUP_INITIALIZER;
710 out->initial_page_number = in->initial_page_number;
712 if (in->paper_width != DBL_MAX)
713 out->paper[TABLE_HORZ] = in->paper_width;
714 if (in->paper_height != DBL_MAX)
715 out->paper[TABLE_VERT] = in->paper_height;
/* Margin index 0 holds left/top and index 1 holds right/bottom. */
717 if (in->margin_left != DBL_MAX)
718 out->margins[TABLE_HORZ][0] = in->margin_left;
719 if (in->margin_right != DBL_MAX)
720 out->margins[TABLE_HORZ][1] = in->margin_right;
721 if (in->margin_top != DBL_MAX)
722 out->margins[TABLE_VERT][0] = in->margin_top;
723 if (in->margin_bottom != DBL_MAX)
724 out->margins[TABLE_VERT][1] = in->margin_bottom;
726 if (in->space_after != DBL_MAX)
727 out->object_spacing = in->space_after;
730 out->chart_size = (in->chart_size == SPVSX_CHART_SIZE_FULL_HEIGHT
731 ? PAGE_CHART_FULL_HEIGHT
732 : in->chart_size == SPVSX_CHART_SIZE_HALF_HEIGHT
733 ? PAGE_CHART_HALF_HEIGHT
734 : in->chart_size == SPVSX_CHART_SIZE_QUARTER_HEIGHT
735 ? PAGE_CHART_QUARTER_HEIGHT
738 decode_page_paragraph (in->page_header->page_paragraph, &out->headings[0]);
739 decode_page_paragraph (in->page_footer->page_paragraph, &out->headings[1]);
741 out->file_name = xstrdup (file_name);
/* Appends to ROOT_ITEM an error item whose text is "STRUCTURE_MEMBER: ERROR",
   tagged with STRUCTURE_MEMBER through set_structure_member().

   NOTE(review): the declaration of the ERROR parameter and whatever happens
   to ERROR after the message is formatted are not visible in this excerpt;
   confirm whether this function takes ownership of it. */
747 spv_add_error_heading (struct output_item *root_item,
748 struct zip_reader *zip, const char *structure_member,
751 struct output_item *item = error_item_create (
752 xasprintf ("%s: %s", structure_member, error));
754 set_structure_member (item, zip, structure_member);
755 group_item_add_child (root_item, item);
759 spv_heading_read (struct zip_reader *zip, struct output_item *root_item,
760 struct page_setup **psp, const char *file_name,
761 const char *structure_member)
764 char *error = spv_read_xml_member (zip, structure_member, true,
768 spv_add_error_heading (root_item, zip, structure_member, error);
772 struct spvxml_context ctx = SPVXML_CONTEXT_INIT (ctx);
773 struct spvsx_root_heading *root;
774 spvsx_parse_root_heading (&ctx, xmlDocGetRootElement (doc), &root);
775 error = spvxml_context_finish (&ctx, &root->node_);
779 spv_add_error_heading (root_item, zip, structure_member, error);
783 if (root->page_setup && psp && !*psp)
784 *psp = decode_page_setup (root->page_setup, file_name);
786 for (size_t i = 0; i < root->n_seq; i++)
787 spv_decode_children (zip, structure_member, root->seq, root->n_seq,
790 spvsx_free_root_heading (root);
/* Checks whether ZIP looks like an SPV file by examining the member
   "META-INF/MANIFEST.MF".  The file is accepted only when that member's
   entire content is exactly the string "allowPivoting=true" (same length and
   same bytes).  A read error from zip_member_read_all() is stored in
   *ERRORP.

   NOTE(review): the return statements are not visible in this excerpt.
   Callers treat a result <= 0 as "not an SPV file", so presumably a positive
   value means detected -- confirm the exact values. */
795 spv_detect__ (struct zip_reader *zip, char **errorp)
799 const char *member = "META-INF/MANIFEST.MF";
800 if (!zip_reader_contains_member (zip, member))
805 *errorp = zip_member_read_all (zip, "META-INF/MANIFEST.MF",
810 const char *magic = "allowPivoting=true";
811 bool is_spv = size == strlen (magic) && !memcmp (magic, data, size);
817 /* Returns NULL if FILENAME is an SPV file, otherwise an error string that the
818 caller must eventually free().

   FILENAME is opened as a ZIP archive and checked with spv_detect__().  When
   that check does not succeed and reports no error of its own, the message
   "FILENAME: not an SPV file" is used.  The ZIP reader is released before
   returning. */
819 char * WARN_UNUSED_RESULT
820 spv_detect (const char *filename)
822 struct zip_reader *zip;
823 char *error = zip_reader_create (filename, &zip);
827 if (spv_detect__ (zip, &error) <= 0 && !error)
828 error = xasprintf("%s: not an SPV file", filename);
829 zip_reader_unref (zip);
/* Reads the SPV file FILENAME and stores a new root output item holding its
   content in *OUTP.

   FILENAME is opened as a ZIP archive and verified with spv_detect__(); on
   failure the detection error, or "FILENAME: not an SPV file" when there is
   none, is returned.  Otherwise every member whose name starts with
   "outputViewer" and ends with ".xml" is read with spv_heading_read(), which
   appends to *OUTP and may set *PSP.  The ZIP reader is released at the end.

   NOTE(review): SPV is allocated with xzalloc() but no line visible in this
   excerpt uses or frees it -- confirm that it is not leaked.  The
   initialization of *OUTP and *PSP, the test applied to DETECT, and the
   final return are not visible either. */
833 char * WARN_UNUSED_RESULT
834 spv_read (const char *filename, struct output_item **outp,
835 struct page_setup **psp)
841 struct spv_reader *spv = xzalloc (sizeof *spv);
842 struct zip_reader *zip;
843 char *error = zip_reader_create (filename, &zip);
847 int detect = spv_detect__ (zip, &error);
850 zip_reader_unref (zip);
851 return error ? error : xasprintf ("%s: not an SPV file", filename);
854 *outp = root_item_create ();
/* zip_reader_get_member_name() yielding null ends the enumeration. */
855 for (size_t i = 0; ; i++)
857 const char *structure_member = zip_reader_get_member_name (zip, i);
858 if (!structure_member)
861 struct substring structure_member_ss = ss_cstr (structure_member);
862 if (ss_starts_with (structure_member_ss, ss_cstr ("outputViewer"))
863 && ss_ends_with (structure_member_ss, ss_cstr (".xml")))
864 spv_heading_read (zip, *outp, psp, filename, structure_member);
867 zip_reader_unref (zip);
/* Decodes the 32-bit packed format specification U32 into *OUT.

   The values 0x10000 and 1 (and whatever the first, hidden part of the
   condition matches) yield F40.2.  Otherwise the raw type is taken from bits
   16-23 and the width from bits 8-15 (each truncated to 8 bits by the
   `uint8_t' conversion).  A raw type of 40 or more keeps FMT_F without
   consulting fmt_from_io(); smaller values are translated by fmt_from_io().
   The result is passed through fmt_fix_output() and
   fmt_check_width_compat().

   On failure *OUT is set to F40.2 and an allocated "bad format" message is
   returned.

   NOTE(review): the extraction of D, the conditions around the fix-up and the
   failure path, and the success returns are not visible in this excerpt. */
871 char * WARN_UNUSED_RESULT
872 spv_decode_fmt_spec (uint32_t u32, struct fmt_spec *out)
875 || (u32 == 0x10000 || u32 == 1 /* both used as string formats */))
877 *out = fmt_for_output (FMT_F, 40, 2);
881 uint8_t raw_type = u32 >> 16;
882 uint8_t w = u32 >> 8;
886 *out = (struct fmt_spec) { .type = FMT_F, .w = w, .d = d };
887 bool ok = raw_type >= 40 || fmt_from_io (raw_type, &out->type);
890 fmt_fix_output (out);
891 ok = fmt_check_width_compat (out, 0);
897 *out = fmt_for_output (FMT_F, 40, 2);
898 return xasprintf ("bad format %#"PRIx32, u32);