Work on structure XML description.

author Ben Pfaff <blp@cs.stanford.edu>

Fri, 7 Aug 2015 06:24:54 +0000 (23:24 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Fri, 7 Aug 2015 06:24:54 +0000 (23:24 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Fri, 7 Aug 2015 06:24:54 +0000 (23:24 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Fri, 7 Aug 2015 06:24:54 +0000 (23:24 -0700)
diff --git a/parse-all-xml b/parse-all-xml

index 98a4258ddbd50c36bbc47fbca083df5491837844..8b1fc366f692275c045a6bd1a943f144ef0876af 100755 (executable)
--- a/parse-all-xml
+++ b/parse-all-xml
@@ -1,5 +1,17 @@
  #! /bin/sh
-for d in `ls -1 unzipped/*/*.xml |grep -vE 'notes|table|warning|chart|model'`
-do
-    ./parse-xml $d
-done | sort -u
+
+# Parse the structure members that do not represent models or graphs
+# and that are not page setups (which are always the first structure member).
+# Also skip those with borderProperties, which indicate the non-"light"
+# format.
+lightTables=`ls -1 unzipped/*/*.xml |grep -vE 'notes|table|warning|chart|model' \
+          | xargs grep -EL '<([a-z]*:)?(model|graph|pageSetup|borderProperties)'`
+printf 'Structure:\n'
+for d in $lightTables; do
+    ./parse-xml $d containment
+done | sort | uniq | sort
+
+printf '\nAttributes:\n'
+for d in $lightTables; do
+    ./parse-xml $d attributes
+done | sort | uniq | sort
diff --git a/parse-xml.c b/parse-xml.c

index c0445052ed017f6ea80111568dc581884c69d4e1..efc7bb3569df0117bfec656c2739d283deb48c30 100644 (file)
--- a/parse-xml.c
+++ b/parse-xml.c
@@ -1,4 +1,5 @@
  #include <stdio.h>
+#include <string.h>
  #include <libxml/parser.h>
  #include <libxml/tree.h>
  
@@ -7,27 +8,51 @@ print_containment (xmlNode * a_node)
  {
    for (xmlNode *node = a_node; node; node = node->next)
      {
+      const xmlNode *parent = node->parent;
+      if (parent->type == XML_ELEMENT_NODE)
+        printf ("%s ", parent->name);
+      else
+        printf ("<root> ");
+
        if (node->type == XML_ELEMENT_NODE)
-        {
-          const xmlNode *parent = node->parent;
-          if (parent->type == XML_ELEMENT_NODE)
-            printf ("%s %s\n", parent->name, node->name);
-          else if (parent->type == XML_DOCUMENT_NODE)
-            printf ("<root> %s\n", node->name);
-        }
+        printf ("%s", node->name);
+      else if (node->type == XML_TEXT_NODE)
+        printf("<text>");
+      else if (node->type == XML_CDATA_SECTION_NODE)
+        printf("<cdata>");
+      else
+        printf("<other>");
+
+      putchar('\n');
  
        print_containment (node->children);
      }
  }
  
-int
-main (int argc, char **argv)
+static void
+print_attributes (xmlNode * a_node)
  {
-  if (argc != 2)
+  for (xmlNode *node = a_node; node; node = node->next)
      {
-      fprintf (stderr, "usage: %s FILE.xml\n", argv[0]);
-      exit (1);
+      for (xmlAttr *attr = node->properties; attr; attr = attr->next)
+        printf ("%s %s\n", node->name, attr->name);
+
+      print_attributes (node->children);
      }
+}
+
+static void
+usage (void)
+{
+  fprintf (stderr, "usage: parse-xml FILE.xml containment|attributes\n");
+  exit (1);
+}
+
+int
+main (int argc, char **argv)
+{
+  if (argc != 3)
+    usage ();
  
    LIBXML_TEST_VERSION;
  
@@ -39,7 +64,14 @@ main (int argc, char **argv)
      }
  
    xmlNode *root = xmlDocGetRootElement(doc);
-  print_containment(root);
+
+  if (!strcmp(argv[2], "containment"))
+    print_containment (root);
+  else if (!strcmp(argv[2], "attributes"))
+    print_attributes (root);
+  else
+    usage ();
+
    xmlFreeDoc(doc);
    xmlCleanupParser();
  
diff --git a/spv-file-format.texi b/spv-file-format.texi

index 5eae54df739949d50f9c19a544c64c4c3d7ec661..3c40e76e212ed78858821e8036f1c9b94414beca 100644 (file)
--- a/spv-file-format.texi
+++ b/spv-file-format.texi
@@ -70,3 +70,22 @@ their exact names do not appear to matter as long as they are unique.
  @node SPV Structure Member Format
  @subsection Structure Member Format
  
+Structure member XML files claim conformance with a collection of XML
+Schemas.  These schemas are distributed, under a nonfree license, with
+SPSS binaries.  Fortunately, the schemas are not necessary to
+understand the structure members.  To a degree, the schemas can even
+be deceptive because they document elements and attributes that are
+not in the corpus and lack documentation of elements and attributes
+that are commonly found in the corpus.
+
+Structure members use a different XML namespace for each schema, but
+these namespaces are not entirely consistent: in some SPV files, for
+example, the @code{viewer-tree} schema is associated with namespace
+@indicateurl{http://xml.spss.com/spss/viewer-tree} and in others with
+@indicateurl{http://xml.spss.com/spss/viewer/viewer-tree} (note the
+additional @file{viewer/} directory).  In any case, the schema URIs are
+not resolvable to obtain the schemas themselves.
+
+One may ignore all of the above in interpreting a structure member.
+The actual XML has a simple and straightforward form that does not
+require a reader to take schemas or namespaces into account.  The
author	Ben Pfaff <blp@cs.stanford.edu>
	Fri, 7 Aug 2015 06:24:54 +0000 (23:24 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Fri, 7 Aug 2015 06:24:54 +0000 (23:24 -0700)
parse-all-xml		patch \| blob \| history
parse-xml.c		patch \| blob \| history
spv-file-format.texi		patch \| blob \| history