Work on improving distinction between groups and categories.

author Ben Pfaff <blp@cs.stanford.edu>

Sun, 9 Aug 2015 21:31:49 +0000 (14:31 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Sun, 9 Aug 2015 21:31:49 +0000 (14:31 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 9 Aug 2015 21:31:49 +0000 (14:31 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 9 Aug 2015 21:31:49 +0000 (14:31 -0700)
diff --git a/dump.c b/dump.c

index e37202000c64f310a309f872ccbbd0ae2f46cc96..65955d81995b16e50c631e430ee33fe95cf485d1 100644 (file)
--- a/dump.c
+++ b/dump.c
@@ -575,26 +575,32 @@ dump_category(int level, int *indexes, int *n_indexes, int max_indexes)
      fprintf (stdout, "    ");
    printf ("<category>\n");
    dump_value (stdout, level + 1);
+
+  int start = pos;
+  int d0 = data[pos];
    match_byte(1);
-  match_byte(0);
-  match_byte(0);
-  match_byte(0);
+  int c = 0;
+  c += match_byte(0);
+  c += match_byte(0);
+  c += match_byte(0);
  
+  int d1 = data[pos];
+  int d2 = 9;
    if (match_byte (1))
      {
-      if (!match_byte (0) && !match_byte (1))
-        match_byte_assert (2);
+      d2 = data[pos];
+      if (!match_u32 (0) && !match_u32 (1))
+        match_u32_assert (2);
      }
-  else if (!match_byte (2))
-    match_byte_assert (0);
-  match_byte_assert (0);
-  match_byte_assert (0);
-  match_byte_assert (0);
+  else
+    match_u32_assert (2);
+  int len = pos - start;
  
    int indx = get_u32();
    int n_categories = get_u32();
    if (indx != -1)
      {
+      fprintf (stderr, "category %d %d %d %d %d:", d0, c, d1, d2, len);
        if (n_categories != 0)
          {
            fprintf(stderr, "index not -1 but subcategories\n");
@@ -607,6 +613,12 @@ dump_category(int level, int *indexes, int *n_indexes, int max_indexes)
          }
        indexes[(*n_indexes)++] = indx;
      }
+  else
+    fprintf (stderr, "group    %d %d %d %d %d:", d0, c, d1, d2, len);
+  for (int i = 0; i < len; i++)
+    fprintf (stderr, " %02x", data[start + i]);
+  putc ('\n', stderr);
+
    if (n_categories == 0)
      {
        for (int i = 0; i <= level + 1; i++)
diff --git a/spv-file-format.texi b/spv-file-format.texi

index e7c3f888900bcdae677436f01d42cc70e64d248a..f6f8bb27dd116771c1df96cad66f7499867d60d6 100644 (file)
--- a/spv-file-format.texi
+++ b/spv-file-format.texi
@@ -517,6 +517,18 @@ for the first dimension, 1 for the second, and so on.  The latter is
  the case 98% of the time in the corpus.
  
  @example
-category := value i1
-            (00 | 01 (00 | 01 | 02) | 02) 00 00 00
+category := value[name]
+            01? 00? 00? 00?
+            (01 (i0 | i1 | i2) | i2)
+            int[index] int[n-subcategories] category*[n-subcategories]
  @end example
+
+@code{category} can represent a terminal category.  In that case,
+@code{name} is the name of the category, @code{index} is a nonnegative
+integer less than @code{n-categories} in the @code{dimension} in which
+the @code{category} is nested (directly or indirectly), and
+@code{n-subcategories} is 0.
+
+Alternatively, @code{category} can represent a group of nested
+categories.  In that case, @code{name} is the name of the group,
+@code{index} is -1, and @code{n-subcategories} is positive.
author	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 9 Aug 2015 21:31:49 +0000 (14:31 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 9 Aug 2015 21:31:49 +0000 (14:31 -0700)
dump.c		patch \| blob \| history
spv-file-format.texi		patch \| blob \| history