1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2017, 2018 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "output/spv/spv.h"
24 #include <libxml/HTMLparser.h>
25 #include <libxml/xmlreader.h>
29 #include "libpspp/assertion.h"
30 #include "libpspp/cast.h"
31 #include "libpspp/hash-functions.h"
32 #include "libpspp/message.h"
33 #include "libpspp/str.h"
34 #include "libpspp/zip-reader.h"
35 #include "output/output-item.h"
36 #include "output/page-setup.h"
37 #include "output/pivot-table.h"
38 #include "output/spv/detail-xml-parser.h"
39 #include "output/spv/light-binary-parser.h"
40 #include "output/spv/spv-css-parser.h"
41 #include "output/spv/spv-legacy-data.h"
42 #include "output/spv/spv-legacy-decoder.h"
43 #include "output/spv/spv-light-decoder.h"
44 #include "output/spv/spv-table-look.h"
45 #include "output/spv/structure-xml-parser.h"
47 #include "gl/c-ctype.h"
48 #include "gl/intprops.h"
49 #include "gl/minmax.h"
50 #include "gl/xalloc.h"
51 #include "gl/xvasprintf.h"
/* Message-translation helpers.  _() passes MSGID through gettext(); N_()
   expands to MSGID unchanged, so it only marks a literal without translating
   it at that point. */
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) (msgid)
/* Three pointers: a ZIP reader, a root item, and a page setup.
   NOTE(review): these look like members of a reader-state aggregate whose
   enclosing declaration is not visible here -- confirm before treating them
   as file-scope objects. */
60 struct zip_reader *zip;
61 struct spv_item *root;
62 struct page_setup *page_setup;
/* Searches the direct children of PARENT (parent->children, then each
   node->next link) for an XML element node whose name compares equal to
   CHILD_NAME under strcmp().  Deeper descendants are never visited.

   NOTE(review): presumably yields the first matching node, or NULL when none
   matches -- confirm against the return statements. */
66 find_xml_child_element (xmlNode *parent, const char *child_name)
68 for (xmlNode *node = parent->children; node; node = node->next)
69 if (node->type == XML_ELEMENT_NODE
71 && !strcmp (CHAR_CAST (char *, node->name), child_name))
/* Returns the value of attribute NAME on NODE, as produced by xmlGetProp()
   and cast to char *.

   NOTE(review): libxml2 documents the xmlGetProp() result as a caller-owned
   copy (released with xmlFree()) that is NULL when the attribute is absent;
   check callers against that. */
78 get_xml_attr (const xmlNode *node, const char *name)
80 return CHAR_CAST (char *, xmlGetProp (node, CHAR_CAST (xmlChar *, name)));
/* Appends attribute NAME with value VALUE to DST: a leading space, NAME, an
   equals sign and an opening double quote, then the bytes of VALUE walked one
   at a time, then a closing double quote.

   NOTE(review): the five ds_put_cstr() calls look like per-character
   substitutions (probably for newline, ampersand, the two angle brackets and
   the double quote), with ds_put_byte() copying every other byte.  Their
   string literals currently hold the raw characters rather than XML
   character references, and the last literal consists of three quote
   characters, which is not a well-formed string.  Confirm that the entity
   text has not been lost, since unescaped output would yield broken markup.

   NOTE(review): VALUE is dereferenced by the loop; callers pass the result of
   get_xml_attr(), so confirm that a NULL VALUE is rejected before the loop. */
84 put_xml_attr (const char *name, const char *value, struct string *dst)
89 ds_put_format (dst, " %s=\"", name);
90 for (const char *p = value; *p; p++)
95 ds_put_cstr (dst, " ");
98 ds_put_cstr (dst, "&");
101 ds_put_cstr (dst, "<");
104 ds_put_cstr (dst, ">");
107 ds_put_cstr (dst, """);
110 ds_put_byte (dst, *p);
114 ds_put_byte (dst, '"');
/* Appends the content of the HTML tree rooted at NODE to S as markup text,
   recursing into child nodes.  BASE_FONT_SIZE is the size against which HTML
   font sizes 1 through 7 are scaled.

   Element nodes: a br element appends a newline.  Any other element except
   style has its children visited in order; for b, i, u and font elements an
   opening tag is written before the children and a closing tag after them.
   A font element also carries over its face, color and size attributes.

   Text nodes: the content is copied into S with special handling for the
   non-breaking and figure space sequences, for white space (the last byte
   already in S is consulted), and for the two angle brackets and the
   ampersand.  NOTE(review): the literals written for those three characters
   are the raw characters themselves; confirm that escaped forms were
   intended, since S is later treated as markup. */
118 extract_html_text (const xmlNode *node, int base_font_size, struct string *s)
120 if (node->type == XML_ELEMENT_NODE)
122 const char *name = CHAR_CAST (char *, node->name);
123 if (!strcmp (name, "br"))
124 ds_put_byte (s, '\n');
/* strcmp() is nonzero here for every element name other than style. */
125 else if (strcmp (name, "style"))
127 const char *tag = NULL;
/* One-letter element names b, i and u. */
128 if (strchr ("biu", name[0]) && name[1] == '\0')
131 ds_put_format (s, "<%s>", tag);
133 else if (!strcmp (name, "font"))
/* The opening font tag is left unterminated so that attributes can follow;
   the closing angle bracket is appended further down. */
136 ds_put_format (s, "<%s", tag);
138 char *face = get_xml_attr (node, "face");
139 put_xml_attr ("face", face, s);
/* The color attribute is either forwarded unchanged or, when it scans as an
   rgb triple of three 8-bit values, rewritten as a hash sign followed by six
   hex digits.  NOTE(review): which of the two applies depends on conditions
   around these calls -- confirm. */
142 char *color = get_xml_attr (node, "color");
146 put_xml_attr ("color", color, s);
150 if (sscanf (color, "rgb (%"SCNu8", %"SCNu8", %"SCNu8")",
154 snprintf (color2, sizeof color2,
155 "#%02"PRIx8"%02"PRIx8"%02"PRIx8,
157 put_xml_attr ("color", color2, s);
/* A missing size attribute gives html_size 0, which fails the range test
   below. */
163 char *size_s = get_xml_attr (node, "size");
164 int html_size = size_s ? atoi (size_s) : 0;
166 if (html_size >= 1 && html_size <= 7)
/* Multipliers for HTML sizes 1 through 7, indexed by html_size - 1. */
168 static const double scale[7] = {
169 .444, .556, .667, .778, 1.0, 1.33, 2.0
171 double size = base_font_size * scale[html_size - 1];
/* The attribute value is the scaled size times 1024, printed without a
   fractional part.  snprintf() bounds the write to the buffer size. */
173 char size2[INT_BUFSIZE_BOUND (int)];
174 snprintf (size2, sizeof size2, "%.0f", size * 1024.);
175 put_xml_attr ("size", size2, s);
178 ds_put_cstr (s, ">");
180 for (const xmlNode *child = node->children; child;
182 extract_html_text (child, base_font_size, s);
184 ds_put_format (s, "</%s>", tag);
187 else if (node->type == XML_TEXT_NODE)
189 /* U+00A0 NONBREAKING SPACE is really, really common in SPV text and it
190 makes it impossible to break syntax across lines. Translate it into a
191 regular space. (Note that U+00A0 is C2 A0 in UTF-8.)
193 Do the same for U+2007 FIGURE SPACE, which also crops out weirdly
195 ds_extend (s, ds_length (s) + xmlStrlen (node->content));
196 for (const uint8_t *p = node->content; *p;)
199 if (p[0] == 0xc2 && p[1] == 0xa0)
204 else if (p[0] == 0xe2 && p[1] == 0x80 && p[2] == 0x87)
214 int last = ds_last (s);
215 if (last != EOF && !c_isspace (last))
219 ds_put_cstr (s, "<");
221 ds_put_cstr (s, ">");
223 ds_put_cstr (s, "&");
231 parse_embedded_html (const xmlNode *node)
233 /* Extract HTML from XML node: the text content of NODE, as returned by
   xmlNodeGetContent(), is the HTML source handed to the HTML parser below. */
234 char *html_s = CHAR_CAST (char *, xmlNodeGetContent (node));
/* Parse the extracted text as UTF-8 with no base URL.  The option flags ask
   libxml2 to recover from malformed input, to stay silent about errors and
   warnings, to drop blank nodes, and to forbid network access.
   NOTE(review): presumably the parsed document is what this function hands
   back, and html_s is released once parsing is done -- confirm. */
238 xmlDoc *html_doc = htmlReadMemory (
239 html_s, strlen (html_s),
240 NULL, "UTF-8", (HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
241 | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS
242 | HTML_PARSE_NONET));
248 /* Given NODE, which should contain HTML content, returns the text within that
   content, rendered as markup, in an allocated string, and fills in
   *FONT_STYLE.

   *FONT_STYLE starts out as FONT_STYLE_INITIALIZER with size 10.  If the
   parsed document has a style element under its head element, the content of
   that element is passed to spv_parse_css_style() to adjust *FONT_STYLE.  The
   resulting font size is then used as the base size for the markup, and the
   markup member of *FONT_STYLE is set to true.

   NOTE(review): the returned string is taken from a struct string with
   ds_steal_cstr(), not allocated by libxml2, so the caller presumably has to
   release it with free() rather than xmlFree() -- confirm against
   ds_steal_cstr(). */
252 decode_embedded_html (const xmlNode *node, struct font_style *font_style)
254 struct string markup = DS_EMPTY_INITIALIZER;
255 *font_style = (struct font_style) FONT_STYLE_INITIALIZER;
256 font_style->size = 10;
258 xmlDoc *html_doc = parse_embedded_html (node);
/* Locate html/head/style; each lookup is skipped when its parent is
   missing. */
261 xmlNode *root = xmlDocGetRootElement (html_doc);
262 xmlNode *head = root ? find_xml_child_element (root, "head") : NULL;
263 xmlNode *style = head ? find_xml_child_element (head, "style") : NULL;
266 uint8_t *style_s = xmlNodeGetContent (style);
267 spv_parse_css_style (CHAR_CAST (char *, style_s), font_style);
272 extract_html_text (root, font_style->size, &markup);
273 xmlFreeDoc (html_doc);
276 font_style->markup = true;
277 return ds_steal_cstr (&markup);
/* Builds a TEXT_ITEM_LOG text item from the embedded HTML of container text
   CT.  The HTML is decoded with decode_embedded_html() into markup text plus
   a heap-allocated font style; the font style is attached to a new
   user-provided PIVOT_VALUE_TEXT pivot value, and the command name of CT is
   copied onto the resulting item.

   NOTE(review): presumably the decoded text becomes the text of the pivot
   value and the new item is what gets handed back -- confirm. */
280 static struct output_item *
281 decode_container_text (const struct spvsx_container_text *ct)
283 struct font_style *font_style = xmalloc (sizeof *font_style);
284 char *text = decode_embedded_html (ct->html->node_.raw, font_style);
286 struct pivot_value *value = xmalloc (sizeof *value);
287 *value = (struct pivot_value) {
289 .type = PIVOT_VALUE_TEXT,
293 .user_provided = true,
296 pivot_value_ex_rw (value)->font_style = font_style;
298 struct output_item *item = text_item_create_value (TEXT_ITEM_LOG,
300 output_item_set_command_name (item, ct->command_name);
/* Decodes one HTML paragraph element IN into page paragraph OUT.

   The horizontal alignment comes from a substring search of the style
   attribute: center if it contains that word, otherwise right if it contains
   that word, otherwise (or with no style attribute at all) left.  The markup
   text comes from decode_embedded_html(); the font style that call also
   produces is discarded right away with font_style_uninit(). */
305 decode_page_p (const xmlNode *in, struct page_paragraph *out)
307 char *style = get_xml_attr (in, "style");
308 out->halign = (style && strstr (style, "center") ? TABLE_HALIGN_CENTER
309 : style && strstr (style, "right") ? TABLE_HALIGN_RIGHT
310 : TABLE_HALIGN_LEFT);
313 struct font_style font_style;
314 out->markup = decode_embedded_html (in, &font_style);
315 font_style_uninit (&font_style);
/* Fills page heading PH from PAGE_PARAGRAPH.  PH is zeroed first.  When the
   paragraph has text, its embedded HTML is parsed, and every p element that
   is a direct child of the body element is decoded with decode_page_p() and
   appended to the ph->paragraphs array, which grows by one entry per
   paragraph.

   NOTE(review): PAGE_PARAGRAPH is dereferenced to reach its text, and the
   body lookup result is dereferenced by the loop; confirm that both are
   checked for NULL first. */
319 decode_page_paragraph (const struct spvsx_page_paragraph *page_paragraph,
320 struct page_heading *ph)
322 memset (ph, 0, sizeof *ph);
327 const struct spvsx_page_paragraph_text *page_paragraph_text
328 = page_paragraph->page_paragraph_text;
329 if (!page_paragraph_text)
332 xmlDoc *html_doc = parse_embedded_html (page_paragraph_text->node_.raw);
336 xmlNode *root = xmlDocGetRootElement (html_doc);
337 xmlNode *body = find_xml_child_element (root, "body");
339 for (const xmlNode *node = body->children; node; node = node->next)
340 if (node->type == XML_ELEMENT_NODE
341 && !strcmp (CHAR_CAST (const char *, node->name), "p"))
/* Grow the array by exactly one slot, then decode into the new slot while
   bumping the count. */
343 ph->paragraphs = xrealloc (ph->paragraphs,
344 (ph->n + 1) * sizeof *ph->paragraphs);
345 decode_page_p (node, &ph->paragraphs[ph->n++]);
347 xmlFreeDoc (html_doc);
/* Reads member BIN_MEMBER of ZIP in full and parses it as a light binary
   table.  The result is an error string, or presumably NULL on success with
   the parsed table stored through TABLEP (NOTE(review): confirm against the
   tail of the function).

   Three failures are distinguished after the read itself: an empty member,
   a parse failure (reported via spvbin_input_to_error()), and trailing bytes
   left over after the table, i.e. the parse offset differing from the input
   size. */
350 char * WARN_UNUSED_RESULT
351 spv_read_light_table (struct zip_reader *zip, const char *bin_member,
352 struct spvlb_table **tablep)
358 char *error = zip_member_read_all (zip, bin_member, &data, &size);
362 struct spvbin_input input;
363 spvbin_input_init (&input, data, size);
365 struct spvlb_table *table = NULL;
367 ? xasprintf ("light table member is empty")
368 : !spvlb_parse_table (&input, &table)
369 ? spvbin_input_to_error (&input, NULL)
370 : input.ofs != input.size
371 ? xasprintf ("expected end of file at offset %#zx", input.ofs)
/* Reads light-table member BIN_MEMBER of ZIP with spv_read_light_table() and
   converts the raw table into a pivot table with decode_spvlb_table(), which
   stores its result through TABLEP.  The raw table is freed after decoding.
   Returns an error string from either step (NOTE(review): presumably NULL on
   success, with decoding skipped when the read fails -- confirm). */
379 static char * WARN_UNUSED_RESULT
380 pivot_table_open_light (struct zip_reader *zip, const char *bin_member,
381 struct pivot_table **tablep)
385 struct spvlb_table *raw_table;
386 char *error = spv_read_light_table (zip, bin_member, &raw_table);
388 error = decode_spvlb_table (raw_table, tablep);
389 spvlb_free_table (raw_table);
/* Reads member BIN_MEMBER of ZIP in full and decodes the raw bytes into DATA
   with spv_legacy_data_decode().  Returns the error string from whichever
   step fails (NOTE(review): presumably NULL on success, with the raw buffer
   released afterward -- confirm). */
394 char * WARN_UNUSED_RESULT
395 spv_read_legacy_data (struct zip_reader *zip, const char *bin_member,
396 struct spv_data *data)
400 char *error = zip_member_read_all (zip, bin_member, &raw, &size);
403 error = spv_legacy_data_decode (raw, size, data);
/* Opens member XML_MEMBER of ZIP and parses it as XML with a libxml2 push
   parser, feeding it one buffer at a time.  KEEP_BLANKS is forwarded to
   xmlKeepBlanksDefault().  ROOT_ELEMENT_NAME is the name the root element
   of the document must have.

   Returns an allocated error string when the member cannot be opened, the
   parser cannot be created, reading fails, the document is not well-formed,
   or the root element has a different name.  NOTE(review): presumably
   returns NULL on success with the document stored through an output
   parameter -- confirm.

   NOTE(review): xmlKeepBlanksDefault() changes a libxml2 default rather than
   a setting of this one parser -- confirm that is acceptable wherever this
   is called. */
410 char * WARN_UNUSED_RESULT
411 spv_read_xml_member (struct zip_reader *zip, const char *xml_member,
412 bool keep_blanks, const char *root_element_name,
417 struct zip_member *zm;
418 char *error = zip_member_open (zip, xml_member, &zm);
422 xmlParserCtxt *parser;
423 xmlKeepBlanksDefault (keep_blanks);
424 parser = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
427 zip_member_finish (zm);
428 return xasprintf (_("%s: Failed to create XML parser"), xml_member);
/* Feed the member to the parser chunk by chunk; the final call with no data
   and a true last argument tells the parser that input has ended. */
433 while ((retval = zip_member_read (zm, buf, sizeof buf)) > 0)
434 xmlParseChunk (parser, buf, retval, false);
435 xmlParseChunk (parser, NULL, 0, true);
/* Save what is needed from the parser context before freeing it. */
437 xmlDoc *doc = parser->myDoc;
438 bool well_formed = parser->wellFormed;
439 xmlFreeParserCtxt (parser);
443 char *error = zip_member_steal_error (zm);
444 zip_member_finish (zm);
448 zip_member_finish (zm);
453 return xasprintf(_("%s: document is not well-formed"), xml_member);
/* NOTE(review): root_node is dereferenced by the assertion; confirm that a
   well-formed document always has a root element here. */
456 const xmlNode *root_node = xmlDocGetRootElement (doc);
457 assert (root_node->type == XML_ELEMENT_NODE);
458 if (strcmp (CHAR_CAST (char *, root_node->name), root_element_name))
461 return xasprintf(_("%s: root node is \"%s\" but \"%s\" was expected"),
463 CHAR_CAST (char *, root_node->name), root_element_name);
/* Opens a legacy table stored as two members of ZIP: binary data in
   BIN_MEMBER and a visualization XML document in XML_MEMBER.  The data is
   read with spv_read_legacy_data(), the XML is read with blanks dropped and
   parsed into a visualization structure, and decode_spvdx_table() combines
   both with SUBTYPE and LOOK to store a pivot table through TABLEP.

   Returns the error string of the first failing step (NOTE(review):
   presumably NULL on success -- confirm).  The visualization and the legacy
   data are released before returning. */
470 static char * WARN_UNUSED_RESULT
471 pivot_table_open_legacy (struct zip_reader *zip, const char *bin_member,
472 const char *xml_member, const char *subtype,
473 const struct pivot_table_look *look,
474 struct pivot_table **tablep)
478 struct spv_data data = SPV_DATA_INITIALIZER;
479 char *error = spv_read_legacy_data (zip, bin_member, &data);
484 error = spv_read_xml_member (zip, xml_member, false,
485 "visualization", &doc);
489 struct spvxml_context ctx = SPVXML_CONTEXT_INIT (ctx);
490 struct spvdx_visualization *v;
491 spvdx_parse_visualization (&ctx, xmlDocGetRootElement (doc), &v);
492 error = spvxml_context_finish (&ctx, &v->node_);
496 error = decode_spvdx_table (v, subtype, look, &data, tablep);
498 spvdx_free_visualization (v);
503 spv_data_uninit (&data);
/* Creates a table output item for TABLE, whose data lives in ZIP.

   The binary member name is the data path of the table structure; the XML
   member name is its path, or NULL when there is none.  The legacy route
   decodes a table look from the table properties (a table without properties
   is an error) and then calls pivot_table_open_legacy(); the other route
   calls pivot_table_open_light().  NOTE(review): presumably the presence of
   the XML member selects the legacy route -- confirm.

   On failure the error text is shown in place of the table: a text-only
   pivot table titled Error is created, taking over the error string.  The
   item records in its SPV info whether an error occurred, a new reference to
   ZIP, and copies of the member names. */
508 static struct output_item *
509 spv_read_table_item (struct zip_reader *zip,
510 const struct spvsx_table *table)
512 const struct spvsx_table_structure *ts = table->table_structure;
513 const char *bin_member = ts->data_path->text;
514 const char *xml_member = ts->path ? ts->path->text : NULL;
516 struct pivot_table *pt = NULL;
520 struct pivot_table_look *look;
521 error = (table->table_properties
522 ? spv_table_look_decode (table->table_properties, &look)
523 : xstrdup ("Legacy table lacks tableProperties"));
526 error = pivot_table_open_legacy (zip, bin_member, xml_member,
527 table->sub_type, look, &pt);
528 pivot_table_look_unref (look);
532 error = pivot_table_open_light (zip, bin_member, &pt);
534 pt = pivot_table_create_for_text (
535 pivot_value_new_text (N_("Error")),
536 pivot_value_new_user_text_nocopy (error));
538 struct output_item *item = table_item_create (pt);
539 output_item_set_command_name (item, table->command_name);
540 output_item_add_spv_info (item);
/* Only the pointer is tested here, so this stays valid even though the
   string itself was handed to the pivot value above. */
541 item->spv_info->error = error != NULL;
542 item->spv_info->zip_reader = zip_reader_ref (zip);
543 item->spv_info->bin_member = xstrdup (bin_member);
544 item->spv_info->xml_member = xstrdup_if_nonnull (xml_member);
/* Read callback with the signature cairo expects for PNG stream input: ZM_
   is the closure, really a struct zip_member *, and up to LENGTH bytes are
   read from it into DATA with zip_member_read().  Returns
   CAIRO_STATUS_READ_ERROR on the failure paths and CAIRO_STATUS_SUCCESS
   otherwise.

   NOTE(review): cairo expects a read callback to deliver exactly LENGTH
   bytes, so presumably the read is repeated until LENGTH is used up and a
   short or failed read is an error -- confirm. */
548 static cairo_status_t
549 read_from_zip_member (void *zm_, unsigned char *data, unsigned int length)
551 struct zip_member *zm = zm_;
553 return CAIRO_STATUS_READ_ERROR;
557 int n = zip_member_read (zm, data, length);
559 return CAIRO_STATUS_READ_ERROR;
565 return CAIRO_STATUS_SUCCESS;
/* Reads PNG image member PNG_MEMBER of ZIP into a cairo image surface and
   wraps it in an image output item with command name COMMAND_NAME.  The item
   records a new reference to ZIP and a copy of PNG_MEMBER in its SPV info.

   Returns an error string if the member cannot be opened or cairo reports a
   failed surface.  NOTE(review): presumably the new item is stored through
   ITEMP and NULL is returned on success -- confirm. */
568 static char * WARN_UNUSED_RESULT
569 spv_read_image (struct zip_reader *zip, const char *png_member,
570 const char *command_name, struct output_item **itemp)
572 struct zip_member *zm;
573 char *error = zip_member_open (zip, png_member, &zm);
/* cairo pulls the PNG bytes through read_from_zip_member() with ZM as the
   closure; the member is finished once the surface has been created. */
577 cairo_surface_t *surface = cairo_image_surface_create_from_png_stream (
578 read_from_zip_member, zm);
580 zip_member_finish (zm);
582 if (cairo_surface_status (surface) != CAIRO_STATUS_SUCCESS)
583 return xstrdup ("reading image failed");
585 struct output_item *item = image_item_create (surface);
586 output_item_set_command_name (item, command_name);
587 output_item_add_spv_info (item);
588 item->spv_info->zip_reader = zip_reader_ref (zip);
589 item->spv_info->png_member = xstrdup (png_member);
/* Creates a TEXT_ITEM_LOG text item whose text is S and marks it as an error
   in its SPV info.  The nocopy creator is used, so S is presumably taken
   over by the item rather than copied (NOTE(review): confirm, and confirm
   that the item is what gets handed back). */
594 static struct output_item *
595 error_item_create (char *s)
597 struct output_item *item = text_item_create_nocopy (TEXT_ITEM_LOG, s,
599 output_item_add_spv_info (item);
600 item->spv_info->error = true;
/* Decodes container C, which must hold exactly one content node (asserted),
   into an output item, reading any referenced members from ZIP.

   Dispatch is by content type: container text and tables are decoded
   directly into an item; objects and images are read as PNG images; graphs,
   models and trees are not implemented and only produce an error string.  An
   error string is turned into an error item.  The item then takes the label
   of C and is shown only if the visibility of C is
   SPVSX_VISIBILITY_VISIBLE.

   NOTE(review): presumably the finished item is what gets handed back, and
   any content type outside this list is handled on a path not visible here
   -- confirm. */
604 static struct output_item *
605 spv_decode_container (struct zip_reader *zip,
606 const struct spvsx_container *c)
608 assert (c->n_seq == 1);
609 struct spvxml_node *content = c->seq[0];
611 struct output_item *item = NULL;
613 if (spvsx_is_container_text (content))
615 item = decode_container_text (spvsx_cast_container_text (content));
618 else if (spvsx_is_table (content))
620 item = spv_read_table_item (zip, spvsx_cast_table (content));
623 else if (spvsx_is_object (content))
625 struct spvsx_object *object = spvsx_cast_object (content);
626 error = spv_read_image (zip, object->uri, object->command_name, &item);
628 else if (spvsx_is_image (content))
630 struct spvsx_image *image = spvsx_cast_image (content);
631 error = spv_read_image (zip, image->data_path->text, image->command_name,
634 else if (spvsx_is_graph (content))
635 error = xstrdup ("graphs not yet implemented");
636 else if (spvsx_is_model (content))
637 error = xstrdup ("models not yet implemented");
638 else if (spvsx_is_tree (content))
639 error = xstrdup ("trees not yet implemented");
644 item = error_item_create (error);
/* NOTE(review): c->label is dereferenced without a visible check; confirm
   that the parser guarantees a label on every container. */
646 output_item_set_label (item, c->label->text);
647 item->show = c->visibility == SPVSX_VISIBILITY_VISIBLE;
/* Tags ITEM with the structure member it was read from.  Does nothing when
   STRUCTURE_MEMBER is NULL.  Otherwise SPV info is added to ITEM, and the
   ZIP reader reference and the structure member name are each filled in only
   if not already set, so earlier values are never overwritten. */
653 set_structure_member (struct output_item *item, struct zip_reader *zip,
654 const char *structure_member)
656 if (structure_member)
658 output_item_add_spv_info (item);
659 if (!item->spv_info->zip_reader)
660 item->spv_info->zip_reader = zip_reader_ref (zip);
661 if (!item->spv_info->structure_member)
662 item->spv_info->structure_member = xstrdup (structure_member);
/* Decodes the N_SEQ nodes in SEQ and adds the resulting items as children of
   group item PARENT, in order.  ZIP supplies any referenced members.
   STRUCTURE_MEMBER, when not NULL, is recorded on each child via
   set_structure_member().

   A container becomes one item from spv_decode_container(), preceded by a
   page break item when the container requests a page break before it.  A
   heading becomes a new group item whose own children are decoded
   recursively; the group is shown only when the heading carries no
   visibility attribute.

   NOTE(review): child is only assigned in the two branches visible here;
   confirm how any other node type is handled before child is used. */
667 spv_decode_children (struct zip_reader *zip, const char *structure_member,
668 struct spvxml_node **seq, size_t n_seq,
669 struct output_item *parent)
671 for (size_t i = 0; i < n_seq; i++)
673 const struct spvxml_node *node = seq[i];
675 struct output_item *child;
676 if (spvsx_is_container (node))
678 const struct spvsx_container *container
679 = spvsx_cast_container (node);
681 if (container->page_break_before_present)
682 group_item_add_child (parent, page_break_item_create ());
684 child = spv_decode_container (zip, container);
686 else if (spvsx_is_heading (node))
688 const struct spvsx_heading *subheading = spvsx_cast_heading (node);
690 child = group_item_create (subheading->command_name,
691 subheading->label->text);
692 child->show = !subheading->heading_visibility_present;
694 /* Pass NULL for 'structure_member' so that only top-level items get
695 tagged that way. Lower-level items are always in the same
696 structure member as their parent anyway. */
697 spv_decode_children (zip, NULL, subheading->seq,
698 subheading->n_seq, child);
703 set_structure_member (child, zip, structure_member);
704 group_item_add_child (parent, child);
/* Converts parsed page setup IN into a newly allocated struct page_setup that
   starts from PAGE_SETUP_INITIALIZER.  FILE_NAME is copied into the result.

   The initial page number is always copied.  Paper size, the four margins
   and the object spacing are copied only when the input field differs from
   DBL_MAX; otherwise the initializer value stays.  NOTE(review): presumably
   DBL_MAX is how the parser marks an absent attribute -- confirm.  The chart
   size enumeration is mapped value by value.  Header and footer paragraphs
   become headings 0 and 1 respectively.

   NOTE(review): in->page_header and in->page_footer are dereferenced without
   a visible check; confirm the parser guarantees both. */
708 static struct page_setup *
709 decode_page_setup (const struct spvsx_page_setup *in, const char *file_name)
711 struct page_setup *out = xmalloc (sizeof *out);
712 *out = (struct page_setup) PAGE_SETUP_INITIALIZER;
714 out->initial_page_number = in->initial_page_number;
716 if (in->paper_width != DBL_MAX)
717 out->paper[TABLE_HORZ] = in->paper_width;
718 if (in->paper_height != DBL_MAX)
719 out->paper[TABLE_VERT] = in->paper_height;
/* Margin index 0 is left or top, index 1 is right or bottom. */
721 if (in->margin_left != DBL_MAX)
722 out->margins[TABLE_HORZ][0] = in->margin_left;
723 if (in->margin_right != DBL_MAX)
724 out->margins[TABLE_HORZ][1] = in->margin_right;
725 if (in->margin_top != DBL_MAX)
726 out->margins[TABLE_VERT][0] = in->margin_top;
727 if (in->margin_bottom != DBL_MAX)
728 out->margins[TABLE_VERT][1] = in->margin_bottom;
730 if (in->space_after != DBL_MAX)
731 out->object_spacing = in->space_after;
734 out->chart_size = (in->chart_size == SPVSX_CHART_SIZE_FULL_HEIGHT
735 ? PAGE_CHART_FULL_HEIGHT
736 : in->chart_size == SPVSX_CHART_SIZE_HALF_HEIGHT
737 ? PAGE_CHART_HALF_HEIGHT
738 : in->chart_size == SPVSX_CHART_SIZE_QUARTER_HEIGHT
739 ? PAGE_CHART_QUARTER_HEIGHT
742 decode_page_paragraph (in->page_header->page_paragraph, &out->headings[0]);
743 decode_page_paragraph (in->page_footer->page_paragraph, &out->headings[1]);
745 out->file_name = xstrdup (file_name);
/* Adds an error item to ROOT_ITEM for a failure in STRUCTURE_MEMBER of ZIP.
   The item text is the member name, a colon and a space, then the error
   message; the item is tagged with the structure member before being added
   as a child.  NOTE(review): whether the incoming error string is freed
   here is not evident from these lines -- confirm ownership. */
751 spv_add_error_heading (struct output_item *root_item,
752 struct zip_reader *zip, const char *structure_member,
755 struct output_item *item = error_item_create (
756 xasprintf ("%s: %s", structure_member, error));
758 set_structure_member (item, zip, structure_member);
759 group_item_add_child (root_item, item);
/* Reads structure member STRUCTURE_MEMBER of ZIP, parses it as a root
   heading, and adds the decoded items under ROOT_ITEM.  The member is read
   with blanks kept.  A read or parse failure is reported by adding an error
   item to ROOT_ITEM instead.

   If the root heading has a page setup, PSP is not NULL, and *PSP is still
   NULL, the page setup is decoded (with FILE_NAME) and stored in *PSP, so
   only the first page setup encountered is kept. */
763 spv_heading_read (struct zip_reader *zip, struct output_item *root_item,
764 struct page_setup **psp, const char *file_name,
765 const char *structure_member)
768 char *error = spv_read_xml_member (zip, structure_member, true,
772 spv_add_error_heading (root_item, zip, structure_member, error);
776 struct spvxml_context ctx = SPVXML_CONTEXT_INIT (ctx);
777 struct spvsx_root_heading *root;
778 spvsx_parse_root_heading (&ctx, xmlDocGetRootElement (doc), &root);
779 error = spvxml_context_finish (&ctx, &root->node_);
783 spv_add_error_heading (root_item, zip, structure_member, error);
787 if (root->page_setup && psp && !*psp)
788 *psp = decode_page_setup (root->page_setup, file_name);
/* NOTE(review): the loop index is not used by the visible part of the call
   below, which passes the whole root->seq array and root->n_seq on every
   iteration.  spv_decode_children() already walks all n_seq entries itself,
   so with n_seq greater than 1 each child would be decoded and added n_seq
   times.  Confirm whether a single call without the loop was intended. */
790 for (size_t i = 0; i < root->n_seq; i++)
791 spv_decode_children (zip, structure_member, root->seq, root->n_seq,
794 spvsx_free_root_heading (root);
/* Checks whether ZIP looks like an SPV file by examining its manifest member
   under META-INF.  The archive qualifies only when the manifest content is
   exactly the magic string below: same length and same bytes, with nothing
   before or after it.  A failure to read the manifest stores an error string
   in *ERRORP.

   NOTE(review): callers test the result with <= 0, so presumably it is
   positive for an SPV file, zero for a non-SPV file, and negative on error
   -- confirm against the return statements. */
799 spv_detect__ (struct zip_reader *zip, char **errorp)
803 const char *member = "META-INF/MANIFEST.MF";
804 if (!zip_reader_contains_member (zip, member))
809 *errorp = zip_member_read_all (zip, "META-INF/MANIFEST.MF",
814 const char *magic = "allowPivoting=true";
815 bool is_spv = size == strlen (magic) && !memcmp (magic, data, size);
821 /* Returns NULL if FILENAME is an SPV file, otherwise an error string that the
   caller must eventually free().

   FILENAME is opened as a ZIP archive and checked with spv_detect__().  When
   that check does not succeed and supplies no error of its own, a generic
   message naming FILENAME is produced.  The ZIP reader is released before
   returning. */
823 char * WARN_UNUSED_RESULT
824 spv_detect (const char *filename)
826 struct zip_reader *zip;
827 char *error = zip_reader_create (filename, &zip);
831 if (spv_detect__ (zip, &error) <= 0 && !error)
832 error = xasprintf("%s: not an SPV file", filename);
833 zip_reader_unref (zip);
/* Reads SPV file FILENAME.  On success *OUTP receives a new root item that
   holds the decoded content.  PSP is forwarded to spv_heading_read(), which
   may store a page setup through it.

   FILENAME is opened as a ZIP archive and checked with spv_detect__(); when
   the check fails, the ZIP reader is released and the result is the error
   from the check or, lacking one, a generic message naming FILENAME.
   Otherwise every archive member whose name starts with outputViewer and
   ends with .xml is read as a structure member, in archive order.  Failures
   inside a structure member are reported as error items under the root item
   by spv_heading_read() rather than through the return value.

   NOTE(review): presumably returns NULL on success -- confirm. */
837 char * WARN_UNUSED_RESULT
838 spv_read (const char *filename, struct output_item **outp,
839 struct page_setup **psp)
845 struct zip_reader *zip;
846 char *error = zip_reader_create (filename, &zip);
850 int detect = spv_detect__ (zip, &error);
853 zip_reader_unref (zip);
854 return error ? error : xasprintf ("%s: not an SPV file", filename);
857 *outp = root_item_create ();
/* The loop has no upper bound; it relies on the member-name lookup coming
   back NULL once the index runs past the last member. */
858 for (size_t i = 0; ; i++)
860 const char *structure_member = zip_reader_get_member_name (zip, i);
861 if (!structure_member)
864 struct substring structure_member_ss = ss_cstr (structure_member);
865 if (ss_starts_with (structure_member_ss, ss_cstr ("outputViewer"))
866 && ss_ends_with (structure_member_ss, ss_cstr (".xml")))
867 spv_heading_read (zip, *outp, psp, filename, structure_member);
870 zip_reader_unref (zip);
/* Decodes the packed SPV format value U32 into *OUT.

   Some values, including 0x10000 and 1, are mapped straight to the default
   F40.2 format.  Otherwise the format type is taken from bits 16 to 23 and
   the width from bits 8 to 15 (each truncated to 8 bits).  A raw type of 40
   or more keeps the FMT_F type; a smaller one is translated with
   fmt_from_io().  The result is adjusted with fmt_fix_output() and
   validated with fmt_check_width_compat().

   On a bad value *OUT is set to F40.2 and an allocated error string naming
   U32 in hex is returned.  NOTE(review): presumably returns NULL on success
   and takes the decimals from the low byte -- confirm. */
874 char * WARN_UNUSED_RESULT
875 spv_decode_fmt_spec (uint32_t u32, struct fmt_spec *out)
878 || (u32 == 0x10000 || u32 == 1 /* both used as string formats */))
880 *out = fmt_for_output (FMT_F, 40, 2);
884 uint8_t raw_type = u32 >> 16;
885 uint8_t w = u32 >> 8;
888 *out = (struct fmt_spec) { .type = FMT_F, .w = w, .d = d };
889 bool ok = raw_type >= 40 || fmt_from_io (raw_type, &out->type);
892 fmt_fix_output (out);
893 ok = fmt_check_width_compat (*out, 0);
898 *out = fmt_for_output (FMT_F, 40, 2);
899 return xasprintf ("bad format %#"PRIx32, u32);