From: Ben Pfaff Date: Fri, 7 Aug 2015 09:32:37 +0000 (-0700) Subject: Complete XML structure. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=86b334a2694bd7e55392531ddb6dc0f2eda6a063 Complete XML structure. --- diff --git a/parse-all-xml b/parse-all-xml index 3d91d23006..0dfd810250 100755 --- a/parse-all-xml +++ b/parse-all-xml @@ -13,7 +13,7 @@ lightTables=`ls -1 unzipped/*/*.xml |grep -vE 'notes|table|warning|chart|model' printf '\nAttributes:\n' for d in $lightTables; do - ./parse-xml $d attr:type + ./parse-xml $d text done | sort | uniq -c | sort -rn #printf '\nLabels:\n' diff --git a/parse-xml.c b/parse-xml.c index eac05228f1..9304e8b0dc 100644 --- a/parse-xml.c +++ b/parse-xml.c @@ -79,6 +79,31 @@ print_attributes (xmlNode * a_node) } } +static void +print_string(xmlChar *s) +{ + for (char *p = (char *) s; *p; p++) + if (*p == '\n') + printf ("\\n"); + else + putchar (*p); +} + +static void +print_cdata (xmlNode * a_node) +{ + for (xmlNode *node = a_node; node; node = node->next) + { + if (node->type == XML_CDATA_SECTION_NODE) + { + print_string (node->content); + putchar ('\n'); + } + + print_cdata (node->children); + } +} + static void print_attribute (xmlNode *node, const char *attr) { @@ -92,6 +117,27 @@ print_attribute (xmlNode *node, const char *attr) } } +static void +print_text (xmlNode *node) +{ + for (; node; node = node->next) + { + if (node->type == XML_ELEMENT_NODE) + { + printf ("%s", node->name); + for (xmlNode *child = node->children; child; child = child->next) + if (child->type == XML_TEXT_NODE) + { + putchar (' '); + print_string (child->content); + } + putchar ('\n'); + } + + print_text (node->children); + } +} + static void usage (void) { @@ -120,6 +166,10 @@ main (int argc, char **argv) print_containment (root); else if (!strcmp(argv[2], "attributes")) print_attributes (root); + else if (!strcmp(argv[2], "cdata")) + print_cdata (root); + else if (!strcmp(argv[2], "text")) + print_text (root); else if 
(!strncmp(argv[2], "attr:", 5)) print_attribute (root, argv[2] + 5); else if (!strcmp(argv[2], "labels")) diff --git a/spv-file-format.texi b/spv-file-format.texi index eac26048d9..15f195623d 100644 --- a/spv-file-format.texi +++ b/spv-file-format.texi @@ -219,10 +219,45 @@ As on the @code{heading} element. Parent: @code{text} @* Contents: cdata +The cdata contains an HTML document. In some cases, the document +starts with @code{<html>} and ends with @code{</html>}. The actual content ranges from trivial to simple: +just discarding the CSS and tags yields readable results. + +@table @asis +@item Required attribute: @code{lang} +This always contains @code{en} in the corpus. +@end table + @item table Parent: @code{container} @* Contents: @code{tableStructure} +@table @asis +@item Required attribute: @code{commandName} +As on the @code{heading} element. + +@item Required attribute: @code{type} +One of @code{table}, @code{note}, or @code{warning}. + +@item Required attribute: @code{subType} +The locale-invariant name for the particular kind of output that this +table represents in the procedure. This can be the same as +@code{commandName} e.g.@: @code{Frequencies}, or different, e.g.@: +@code{Case Processing Summary}. Generic subtypes @code{Notes} and +@code{Warnings} are often used. + +@item Required attribute: @code{tableId} +A number that uniquely identifies the table within the SPV file, +typically a large negative number such as @code{-4147135649387905023}. + +@item Optional attribute: @code{creator-version} +As on the @code{heading} element. In the corpus, this is only present +for version 21 and up and always includes all 8 digits. +@end table + @item tableStructure Parent: @code{table} Contents: @code{dataPath} @@ -230,4 +265,7 @@ Contents: @code{dataPath} @item dataPath Parent: @code{tableStructure} Contents: text + +Contains the name of the Zip member that holds the table details, +e.g.@: @code{0000000001437_lightTableData.bin}. @end table