Figure out some details of categories and dimensions.

author Ben Pfaff <blp@cs.stanford.edu>

Mon, 10 Jul 2017 00:08:36 +0000 (17:08 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Mon, 10 Jul 2017 00:08:36 +0000 (17:08 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 10 Jul 2017 00:08:36 +0000 (17:08 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 10 Jul 2017 00:08:36 +0000 (17:08 -0700)
diff --git a/dump.c b/dump.c

index 8a820d7b86c12ad2ca78b1be232b9993a643cc0a..62e9307d7c8a3a70e5b8261212e65c77248019b2 100644 (file)
--- a/dump.c
+++ b/dump.c
@@ -650,15 +650,9 @@ dump_category(FILE *stream, int level, int **indexes, int *allocated_indexes,
    printf ("<category>\n");
    dump_value (stream, level + 1);
  
    printf ("<category>\n");
    dump_value (stream, level + 1);
  
-  int merge = data[pos];
-  if (!match_byte(0))
-    match_byte_assert (1);
-
+  bool merge = get_bool();
    match_byte_assert (0);
    match_byte_assert (0);
-
-  int unindexed = data[pos];
-  if (!match_byte(0))
-    match_byte_assert (1);
+  int unindexed = get_bool();
  
    int x = get_u32 ();
    pos -= 4;
  
    int x = get_u32 ();
    pos -= 4;
@@ -675,24 +669,14 @@ dump_category(FILE *stream, int level, int **indexes, int *allocated_indexes,
              fprintf (stream, "    ");
            fprintf (stream, "<merge/>\n");
          }
              fprintf (stream, "    ");
            fprintf (stream, "<merge/>\n");
          }
+      assert (unindexed);
      }
    else
      {
      }
    else
      {
-      if (merge)
-        {
-          fprintf(stderr, "index not -1 but merged\n");
-          exit(1);
-        }
-      if (x != 2)
-        {
-          fprintf(stderr, "index not -1 but x != 2\n");
-          exit(1);
-        }
-      if (n_categories != 0)
-        {
-          fprintf(stderr, "index not -1 but subcategories\n");
-          exit(1);
-        }
+      assert (!merge);
+      assert (!unindexed);
+      assert (x == 2);
+      assert (n_categories == 0);
        if (*n_indexes >= *allocated_indexes)
          {
            *allocated_indexes = *allocated_indexes ? 2 * *allocated_indexes : 16;
        if (*n_indexes >= *allocated_indexes)
          {
            *allocated_indexes = *allocated_indexes ? 2 * *allocated_indexes : 16;
@@ -701,14 +685,6 @@ dump_category(FILE *stream, int level, int **indexes, int *allocated_indexes,
        (*indexes)[(*n_indexes)++] = indx;
      }
  
        (*indexes)[(*n_indexes)++] = indx;
      }
  
-  int expected_unindexed = indx == -1;
-  if (unindexed != expected_unindexed)
-    {
-      fprintf(stderr, "unindexed (%d) mismatch with indx (%d)\n",
-              unindexed, indx);
-      exit(1);
-    }
-
    if (n_categories == 0)
      {
        for (int i = 0; i <= level + 1; i++)
    if (n_categories == 0)
      {
        for (int i = 0; i <= level + 1; i++)
@@ -730,20 +706,29 @@ dump_dim(int indx)
    printf ("<dimension index=\"%d\">\n", indx);
    dump_value (stdout, 0);
  
    printf ("<dimension index=\"%d\">\n", indx);
    dump_value (stdout, 0);
  
-  /* This byte is usually 0 but many other values have been spotted. */
+  /* This byte is usually 0 but many other values have been spotted.
+     No visible effect. */
    pos++;
  
    pos++;
  
+  /* This byte can cause data to be oddly replicated. */
    if (!match_byte(0) && !match_byte(1))
      match_byte_assert(2);
    if (!match_byte(0) && !match_byte(1))
      match_byte_assert(2);
+
    if (!match_u32(0))
      match_u32_assert(2);
    if (!match_u32(0))
      match_u32_assert(2);
-  if (!match_byte(0))
-    match_byte_assert(1);
-  if (!match_byte(0))
-    match_byte_assert(1);
+
+  bool show_dim_label = get_bool();
+  if (show_dim_label)
+    printf("  <show-dim-label/>\n");
+
+  bool hide_all_labels = get_bool();
+  if (hide_all_labels)
+    printf("  <hide-all-labels/>\n");
+
    match_byte_assert(1);
    if (!match_u32(UINT32_MAX))
      match_u32_assert(indx);
    match_byte_assert(1);
    if (!match_u32(UINT32_MAX))
      match_u32_assert(indx);
+
    n_categories = get_u32();
  
    int *indexes = NULL;
    n_categories = get_u32();
  
    int *indexes = NULL;
diff --git a/spv-file-format.texi b/spv-file-format.texi

index ed2e96092d75f27a2f5611a1a50cd0c6ca493bc1..0fe2865a60af3bbba5244c47a690d553d4d469bf 100644 (file)
--- a/spv-file-format.texi
+++ b/spv-file-format.texi
@@ -1060,30 +1060,34 @@ the categories associated with each dimension.
  @cartouche
  @format
  Dimensions @result{} int[@t{n-dims}] Dimension*[@t{n-dims}]
  @cartouche
  @format
  Dimensions @result{} int[@t{n-dims}] Dimension*[@t{n-dims}]
-Dimension @result{} Value[@t{name}] DimUnknown int[@t{n-categories}] Category*[@t{n-categories}]
-DimUnknown @result{}
+Dimension @result{} Value[@t{name}] DimProperties int[@t{n-categories}] Category*[@t{n-categories}]
+DimProperties @result{}
      byte[@t{d1}]
      (00 @math{|} 01 @math{|} 02)[@t{d2}]
      (i0 @math{|} i2)[@t{d3}]
      byte[@t{d1}]
      (00 @math{|} 01 @math{|} 02)[@t{d2}]
      (i0 @math{|} i2)[@t{d3}]
-    bool[@t{d4}]
-    bool[@t{d5}]
-    01
-    int[@t{d6}]
+    bool[@t{show-dim-label}]
+    bool[@t{hide-all-labels}]
+    01 int[@t{dim-index}]
  @end format
  @end cartouche
  
  @code{name} is the name of the dimension, e.g. @code{Variables},
  @code{Statistics}, or a variable name.
  
  @end format
  @end cartouche
  
  @code{name} is the name of the dimension, e.g. @code{Variables},
  @code{Statistics}, or a variable name.
  
+The meanings of @code{d1}, @code{d2}, and @code{d3} are unknown.
  @code{d1} is usually 0 but many other values have been observed.
  
  @code{d1} is usually 0 but many other values have been observed.
  
-@code{d3} is 2 over 99% of the time.
+If @code{show-dim-label} is 01, the pivot table displays a label for
+the dimension itself.  Because usually the group and category labels
+are enough explanation, it is usually 00.
  
  
-@code{d5} is 0 over 99% of the time.
+If @code{hide-all-labels} is 01, the pivot table omits all labels for
+the dimension, including group and category labels.  It is usually 00.
+When @code{hide-all-labels} is 01, @code{show-dim-label} is ignored.
  
  
-@code{d6} is either -1 or the 0-based index of the dimension, e.g.@: 0
-for the first dimension, 1 for the second, and so on.  The latter is
-the case 98% of the time in the corpus.
+@code{dim-index} is usually the 0-based index of the dimension, e.g.@:
+0 for the first dimension, 1 for the second, and so on.  Sometimes it
+is -1.  There is no visible difference.
  
  @node SPV Light Member Categories
  @subsection Categories
  
  @node SPV Light Member Categories
  @subsection Categories
@@ -1094,7 +1098,7 @@ are really categories; the others just serve as grouping constructs.
  @cartouche
  @format
  Category @result{} Value[@t{name}] (Leaf @math{|} Group)
  @cartouche
  @format
  Category @result{} Value[@t{name}] (Leaf @math{|} Group)
-Leaf @result{} 00 00 00 i2 int[@t{index}] i0
+Leaf @result{} 00 00 00 i2 int[@t{cat-index}] i0
  Group @result{}
      bool[@t{merge}] 00 01 (i0 @math{|} i2)[@t{data}]
      i-1 int[@t{n-subcategories}] Category*[@t{n-subcategories}]
  Group @result{}
      bool[@t{merge}] 00 01 (i0 @math{|} i2)[@t{data}]
      i-1 int[@t{n-subcategories}] Category*[@t{n-subcategories}]
@@ -1103,14 +1107,17 @@ Group @result{}
  
  @code{name} is the name of the category (or group).
  
  
  @code{name} is the name of the category (or group).
  
-A Leaf represents a leaf category.  The Leaf's @code{index} is a
+A Leaf represents a leaf category.  The Leaf's @code{cat-index} is a
  nonnegative integer less than @code{n-categories} in the Dimension in
  nonnegative integer less than @code{n-categories} in the Dimension in
-which the Category is nested (directly or indirectly).
+which the Category is nested (directly or indirectly).  These
+indexes represent the original order in which the categories were
+sorted; if the user sorted or rearranged the categories, then the
+order of categories in the file reflects that without changing the
+@code{cat-index} values.
  
  
-A Group represents a Group of nested categories.  Usually a Group
-contains at least one Category, so that @code{n-subcategories} is
-positive, but a few Groups with @code{n-subcategories} 0 has been
-observed.
+A Group is a group of nested categories.  Usually a Group contains at
+least one Category, so that @code{n-subcategories} is positive, but a
+few Groups with @code{n-subcategories} 0 have been observed.
  
  If a Group's @code{merge} is 00, the most common value, then the group
  is really a distinct group that should be represented as such in the
  
  If a Group's @code{merge} is 00, the most common value, then the group
  is really a distinct group that should be represented as such in the
@@ -1136,23 +1143,23 @@ The final part of an SPV light member contains the actual data.
  Data @result{}
      int[@t{layers}] int[@t{rows}] int[@t{columns}] int*[@t{n-dimensions}]
      int[@t{n-data}] Datum*[@t{n-data}]
  Data @result{}
      int[@t{layers}] int[@t{rows}] int[@t{columns}] int*[@t{n-dimensions}]
      int[@t{n-data}] Datum*[@t{n-data}]
-Datum @result{} int64[@t{index}] v3(00?) Value
+Datum @result{} int64[@t{index}] v1(00?) Value
  @end format
  @end cartouche
  
  @end format
  @end cartouche
  
-The values of @code{layers}, @code{rows}, and @code{columns} each
-specifies the number of dimensions displayed in layers, rows, and
+The values of @code{n-layers}, @code{n-rows}, and @code{n-columns}
+each specify the number of dimensions displayed in layers, rows, and
  columns, respectively.  Any of them may be zero.  Their values sum to
  @code{n-dimensions} from Dimensions (@pxref{SPV Light Member
  Dimensions}).
  
  The @code{n-dimensions} integers are a permutation of the 0-based
  columns, respectively.  Any of them may be zero.  Their values sum to
  @code{n-dimensions} from Dimensions (@pxref{SPV Light Member
  Dimensions}).
  
  The @code{n-dimensions} integers are a permutation of the 0-based
-dimension numbers.  The first @code{layers} integers specify each of
-the dimensions represented by layers, the next @code{rows} integers
+dimension numbers.  The first @code{n-layers} integers specify each of
+the dimensions represented by layers, the next @code{n-rows} integers
  specify the dimensions represented by rows, and the final
  specify the dimensions represented by rows, and the final
-@code{columns} integers specify the dimensions represented by columns.
-When there is more than one dimension of a given kind, the inner
-dimensions are given first.
+@code{n-columns} integers specify the dimensions represented by
+columns.  When there is more than one dimension of a given kind, the
+inner dimensions are given first.
  
  The format of a Datum varies slightly from version 1 to version 3: in
  version 1 it allows for an extra optional 00 byte.
  
  The format of a Datum varies slightly from version 1 to version 3: in
  version 1 it allows for an extra optional 00 byte.
@@ -1172,6 +1179,7 @@ for each @math{i} from 0 to @math{d - 1}:
  For example, suppose there are 3 dimensions with 3, 4, and 5
  categories, respectively.  The datum at coordinates (1, 2, 3) has
  index @math{5 \times (4 \times (3 \times 0 + 1) + 2) + 3 = 33}.
  For example, suppose there are 3 dimensions with 3, 4, and 5
  categories, respectively.  The datum at coordinates (1, 2, 3) has
  index @math{5 \times (4 \times (3 \times 0 + 1) + 2) + 3 = 33}.
+Within a given dimension, the index is the @code{cat-index} in a Leaf.
  
  @node SPV Light Member Value
  @subsection Value
  
  @node SPV Light Member Value
  @subsection Value
author	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 10 Jul 2017 00:08:36 +0000 (17:08 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 10 Jul 2017 00:08:36 +0000 (17:08 -0700)
dump.c		patch \| blob \| history
spv-file-format.texi		patch \| blob \| history