From 05813c9943ff069b70f7251669292d65d70b544a Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@cs.stanford.edu>
Date: Sun, 9 Jul 2017 17:08:36 -0700
Subject: [PATCH] Figure out some details of categories and dimensions.

---
 dump.c               | 57 ++++++++++++++++-------------------------
 spv-file-format.texi | 60 +++++++++++++++++++++++++-------------------
 2 files changed, 55 insertions(+), 62 deletions(-)
diff --git a/dump.c b/dump.c
index 8a820d7b86..62e9307d7c 100644
--- a/dump.c
+++ b/dump.c
@@ -650,15 +650,9 @@ dump_category(FILE *stream, int level, int **indexes, int *allocated_indexes,
   printf ("<category>\n");
   dump_value (stream, level + 1);
 
-  int merge = data[pos];
-  if (!match_byte(0))
-    match_byte_assert (1);
-
+  bool merge = get_bool();
   match_byte_assert (0);
-
-  int unindexed = data[pos];
-  if (!match_byte(0))
-    match_byte_assert (1);
+  int unindexed = get_bool();
 
   int x = get_u32 ();
   pos -= 4;
@@ -675,24 +669,14 @@ dump_category(FILE *stream, int level, int **indexes, int *allocated_indexes,
             fprintf (stream, "    ");
           fprintf (stream, "<merge/>\n");
         }
+      assert (unindexed);
     }
   else
     {
-      if (merge)
-        {
-          fprintf(stderr, "index not -1 but merged\n");
-          exit(1);
-        }
-      if (x != 2)
-        {
-          fprintf(stderr, "index not -1 but x != 2\n");
-          exit(1);
-        }
-      if (n_categories != 0)
-        {
-          fprintf(stderr, "index not -1 but subcategories\n");
-          exit(1);
-        }
+      assert (!merge);
+      assert (!unindexed);
+      assert (x == 2);
+      assert (n_categories == 0);
       if (*n_indexes >= *allocated_indexes)
         {
           *allocated_indexes = *allocated_indexes ? 2 * *allocated_indexes : 16;
@@ -701,14 +685,6 @@ dump_category(FILE *stream, int level, int **indexes, int *allocated_indexes,
       (*indexes)[(*n_indexes)++] = indx;
     }
 
-  int expected_unindexed = indx == -1;
-  if (unindexed != expected_unindexed)
-    {
-      fprintf(stderr, "unindexed (%d) mismatch with indx (%d)\n",
-              unindexed, indx);
-      exit(1);
-    }
-
   if (n_categories == 0)
     {
       for (int i = 0; i <= level + 1; i++)
@@ -730,20 +706,29 @@ dump_dim(int indx)
   printf ("<dimension index=\"%d\">\n", indx);
   dump_value (stdout, 0);
 
-  /* This byte is usually 0 but many other values have been spotted. */
+  /* This byte is usually 0 but many other values have been spotted.
+     No visible effect. */
   pos++;
 
+  /* This byte can cause data to be oddly replicated. */
   if (!match_byte(0) && !match_byte(1))
     match_byte_assert(2);
+
   if (!match_u32(0))
     match_u32_assert(2);
-  if (!match_byte(0))
-    match_byte_assert(1);
-  if (!match_byte(0))
-    match_byte_assert(1);
+
+  bool show_dim_label = get_bool();
+  if (show_dim_label)
+    printf("  <show-dim-label/>\n");
+
+  bool hide_all_labels = get_bool();
+  if (hide_all_labels)
+    printf("  <hide-all-labels/>\n");
+
   match_byte_assert(1);
   if (!match_u32(UINT32_MAX))
     match_u32_assert(indx);
+
   n_categories = get_u32();
 
   int *indexes = NULL;
diff --git a/spv-file-format.texi b/spv-file-format.texi
index ed2e96092d..0fe2865a60 100644
--- a/spv-file-format.texi
+++ b/spv-file-format.texi
@@ -1060,30 +1060,34 @@ the categories associated with each dimension.
 @cartouche
 @format
 Dimensions @result{} int[@t{n-dims}] Dimension*[@t{n-dims}]
-Dimension @result{} Value[@t{name}] DimUnknown int[@t{n-categories}] Category*[@t{n-categories}]
-DimUnknown @result{}
+Dimension @result{} Value[@t{name}] DimProperties int[@t{n-categories}] Category*[@t{n-categories}]
+DimProperties @result{}
     byte[@t{d1}]
     (00 @math{|} 01 @math{|} 02)[@t{d2}]
     (i0 @math{|} i2)[@t{d3}]
-    bool[@t{d4}]
-    bool[@t{d5}]
-    01
-    int[@t{d6}]
+    bool[@t{show-dim-label}]
+    bool[@t{hide-all-labels}]
+    01 int[@t{dim-index}]
 @end format
 @end cartouche
 
 @code{name} is the name of the dimension, e.g. @code{Variables},
 @code{Statistics}, or a variable name.
 
+The meanings of @code{d1}, @code{d2}, and @code{d3} are unknown.
 @code{d1} is usually 0 but many other values have been observed.
 
-@code{d3} is 2 over 99% of the time.
+If @code{show-dim-label} is 01, the pivot table displays a label for
+the dimension itself.  Because usually the group and category labels
+are enough explanation, it is usually 00.
 
-@code{d5} is 0 over 99% of the time.
+If @code{hide-all-labels} is 01, the pivot table omits all labels for
+the dimension, including group and category labels.  It is usually 00.
+When @code{hide-all-labels} is 01, @code{show-dim-label} is ignored.
 
-@code{d6} is either -1 or the 0-based index of the dimension, e.g.@: 0
-for the first dimension, 1 for the second, and so on.  The latter is
-the case 98% of the time in the corpus.
+@code{dim-index} is usually the 0-based index of the dimension, e.g.@:
+0 for the first dimension, 1 for the second, and so on.  Sometimes it
+is -1.  There is no visible difference.
 
 @node SPV Light Member Categories
 @subsection Categories
@@ -1094,7 +1098,7 @@ are really categories; the others just serve as grouping constructs.
 @cartouche
 @format
 Category @result{} Value[@t{name}] (Leaf @math{|} Group)
-Leaf @result{} 00 00 00 i2 int[@t{index}] i0
+Leaf @result{} 00 00 00 i2 int[@t{cat-index}] i0
 Group @result{}
     bool[@t{merge}] 00 01 (i0 @math{|} i2)[@t{data}]
     i-1 int[@t{n-subcategories}] Category*[@t{n-subcategories}]
@@ -1103,14 +1107,17 @@ Group @result{}
 
 @code{name} is the name of the category (or group).
 
-A Leaf represents a leaf category.  The Leaf's @code{index} is a
+A Leaf represents a leaf category.  The Leaf's @code{cat-index} is a
 nonnegative integer less than @code{n-categories} in the Dimension in
-which the Category is nested (directly or indirectly).
+which the Category is nested (directly or indirectly).  These
+indexes represent the original order in which the categories were
+sorted; if the user sorted or rearranged the categories, then the
+order of categories in the file reflects that without changing the
+@code{cat-index} values.
 
-A Group represents a Group of nested categories.  Usually a Group
-contains at least one Category, so that @code{n-subcategories} is
-positive, but a few Groups with @code{n-subcategories} 0 has been
-observed.
+A Group is a group of nested categories.  Usually a Group contains at
+least one Category, so that @code{n-subcategories} is positive, but a
+few Groups with @code{n-subcategories} 0 have been observed.
 
 If a Group's @code{merge} is 00, the most common value, then the group
 is really a distinct group that should be represented as such in the
@@ -1136,23 +1143,23 @@ The final part of an SPV light member contains the actual data.
 Data @result{}
     int[@t{layers}] int[@t{rows}] int[@t{columns}] int*[@t{n-dimensions}]
     int[@t{n-data}] Datum*[@t{n-data}]
-Datum @result{} int64[@t{index}] v3(00?) Value
+Datum @result{} int64[@t{index}] v1(00?) Value
 @end format
 @end cartouche
 
-The values of @code{layers}, @code{rows}, and @code{columns} each
-specifies the number of dimensions displayed in layers, rows, and
+The values of @code{n-layers}, @code{n-rows}, and @code{n-columns}
+each specifies the number of dimensions displayed in layers, rows, and
 columns, respectively.  Any of them may be zero.  Their values sum to
 @code{n-dimensions} from Dimensions (@pxref{SPV Light Member
 Dimensions}).
 
 The @code{n-dimensions} integers are a permutation of the 0-based
-dimension numbers.  The first @code{layers} integers specify each of
-the dimensions represented by layers, the next @code{rows} integers
+dimension numbers.  The first @code{n-layers} integers specify each of
+the dimensions represented by layers, the next @code{n-rows} integers
 specify the dimensions represented by rows, and the final
-@code{columns} integers specify the dimensions represented by columns.
-When there is more than one dimension of a given kind, the inner
-dimensions are given first.
+@code{n-columns} integers specify the dimensions represented by
+columns.  When there is more than one dimension of a given kind, the
+inner dimensions are given first.
 
 The format of a Datum varies slightly from version 1 to version 3: in
 version 1 it allows for an extra optional 00 byte.
@@ -1172,6 +1179,7 @@ for each @math{i} from 0 to @math{d - 1}:
 For example, suppose there are 3 dimensions with 3, 4, and 5
 categories, respectively.  The datum at coordinates (1, 2, 3) has
 index @math{5 \times (4 \times (3 \times 0 + 1) + 2) + 3 = 33}.
+Within a given dimension, the index is the @code{cat-index} in a Leaf.
 
 @node SPV Light Member Value
 @subsection Value
-- 
2.30.2