From a997c6cf917d592722d9850bbf63437d4ba1487e Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 6 Aug 2015 23:24:54 -0700 Subject: [PATCH] Work on structure XML description. --- parse-all-xml | 20 ++++++++++++--- parse-xml.c | 58 ++++++++++++++++++++++++++++++++++---------- spv-file-format.texi | 19 +++++++++++++++ 3 files changed, 80 insertions(+), 17 deletions(-) diff --git a/parse-all-xml b/parse-all-xml index 98a4258ddb..8b1fc366f6 100755 --- a/parse-all-xml +++ b/parse-all-xml @@ -1,5 +1,17 @@ #! /bin/sh -for d in `ls -1 unzipped/*/*.xml |grep -vE 'notes|table|warning|chart|model'` -do - ./parse-xml $d -done | sort -u + +# Parse the structure members that do not represent models or graphs +# and that are not page setups (which are always the first structure member). +# Also skip those with borderProperties, which indicate the non-"light" +# format. +lightTables=`ls -1 unzipped/*/*.xml |grep -vE 'notes|table|warning|chart|model' \ + | xargs grep -EL '<([a-z]*:)?(model|graph|pageSetup|borderProperties)'` +printf 'Structure:\n' +for d in $lightTables; do + ./parse-xml $d containment +done | sort | uniq | sort + +printf '\nAttributes:\n' +for d in $lightTables; do + ./parse-xml $d attributes +done | sort | uniq | sort diff --git a/parse-xml.c b/parse-xml.c index c0445052ed..efc7bb3569 100644 --- a/parse-xml.c +++ b/parse-xml.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -7,27 +8,51 @@ print_containment (xmlNode * a_node) { for (xmlNode *node = a_node; node; node = node->next) { + const xmlNode *parent = node->parent; + if (parent->type == XML_ELEMENT_NODE) + printf ("%s ", parent->name); + else + printf (" "); + if (node->type == XML_ELEMENT_NODE) - { - const xmlNode *parent = node->parent; - if (parent->type == XML_ELEMENT_NODE) - printf ("%s %s\n", parent->name, node->name); - else if (parent->type == XML_DOCUMENT_NODE) - printf (" %s\n", node->name); - } + printf ("%s", node->name); + else if (node->type == XML_TEXT_NODE) + printf(""); + else if 
(node->type == XML_CDATA_SECTION_NODE) + printf(""); + else + printf(""); + + putchar('\n'); print_containment (node->children); } } -int -main (int argc, char **argv) +static void +print_attributes (xmlNode * a_node) { - if (argc != 2) + for (xmlNode *node = a_node; node; node = node->next) { - fprintf (stderr, "usage: %s FILE.xml\n", argv[0]); - exit (1); + for (xmlAttr *attr = node->properties; attr; attr = attr->next) + printf ("%s %s\n", node->name, attr->name); + + print_attributes (node->children); } +} + +static void +usage (void) +{ + fprintf (stderr, "usage: parse-xml FILE.xml containment|attributes\n"); + exit (1); +} + +int +main (int argc, char **argv) +{ + if (argc != 3) + usage (); LIBXML_TEST_VERSION; @@ -39,7 +64,14 @@ main (int argc, char **argv) } xmlNode *root = xmlDocGetRootElement(doc); - print_containment(root); + + if (!strcmp(argv[2], "containment")) + print_containment (root); + else if (!strcmp(argv[2], "attributes")) + print_attributes (root); + else + usage (); + xmlFreeDoc(doc); xmlCleanupParser(); diff --git a/spv-file-format.texi b/spv-file-format.texi index 5eae54df73..3c40e76e21 100644 --- a/spv-file-format.texi +++ b/spv-file-format.texi @@ -70,3 +70,22 @@ their exact names do not appear to matter as long as they are unique. @node SPV Structure Member Format @subsection Structure Member Format +Structure members XML files claim conformance with a collection of XML +Schemas. These schemas are distributed, under a nonfree license, with +SPSS binaries. Fortunately, the schemas are not necessary to +understand the structure members. To a degree, the schemas can even +be deceptive because they document elements and attributes that are +not in the corpus and lack documentation of elements and attributes +that are commonly found in the corpus. 
+ +Structure members use a different XML namespace for each schema, but +these namespaces are not entirely consistent: in some SPV files, for +example, the @code{viewer-tree} schema is associated with namespace +@indicateurl{http://xml.spss.com/spss/viewer-tree} and in others with +@indicateurl{http://xml.spss.com/spss/viewer/viewer-tree} (note the +additional @file{viewer/} directory). In any case, the schema URIs are +not resolvable to obtain the schemas themselves. + +One may ignore all of the above in interpreting a structure member. +The actual XML has a simple and straightforward form that does not +require a reader to take schemas or namespaces into account. The -- 2.30.2