Figure out some details of categories and dimensions.

[pspp] / spv-file-format.texi
diff --git a/spv-file-format.texi b/spv-file-format.texi

index 6e7480ba12aca0f32cadcd6a016f3ea35ce35061..0fe2865a60af3bbba5244c47a690d553d4d469bf 100644 (file)
--- a/spv-file-format.texi
+++ b/spv-file-format.texi
@@ -607,7 +607,12 @@ An SPV light member begins with a 39-byte header:
  Header @result{}
      01 00
      (i1 @math{|} i3)[@t{version}]
-    01 bool*4 int
+    bool
+    bool[@t{show-numeric-markers}]
+    bool[@t{rotate-inner-column-labels}]
+    bool[@t{rotate-outer-row-labels}]
+    bool
+    int
      int[@t{min-column-width}] int[@t{max-column-width}]
      int[@t{min-row-width}] int[@t{max-row-width}]
      int64[@t{table-id}]
@@ -619,6 +624,18 @@ some of the other data in the member.  We will refer to ``version 1''
  and ``version 3'' later on and use v1(@dots{}) and v3(@dots{}) for
  version-specific formatting (as described previously).
  
+If @code{show-numeric-markers} is 1, footnote markers are shown as
+numbers, starting from 1; otherwise, they are shown as letters,
+starting from @samp{a}.
+
+If @code{rotate-inner-column-labels} is 1, then column labels closest
+to the data are rotated to be vertical; otherwise, they are shown
+in the normal way.
+
+If @code{rotate-outer-row-labels} is 1, then row labels farthest from
+the data are rotated to be vertical; otherwise, they are shown in the
+normal way.
+
  @code{table-id} is a binary version of the @code{tableId} attribute in
  the structure member that refers to the detail member.  For example,
  if @code{tableId} is @code{-4122591256483201023}, then @code{table-id}
@@ -860,11 +877,24 @@ TableSettings @result{}
      bool[@t{footnote-marker-position}]
      v3(
        byte
-      be32[@t{n}] byte*[@t{n}]
+      count(
+        Breakpoints[@t{row-breaks}] Breakpoints[@t{column-breaks}]
+        Keeps[@t{row-keeps}] Keeps[@t{column-keeps}]
+        PointKeeps[@t{row-point-keeps}] PointKeeps[@t{column-point-keeps}]
+      )
        bestring[@t{notes}]
        bestring[@t{table-look}]
        00...
      )
+
+Breakpoints @result{} be32[@t{n-breaks}] be32*[@t{n-breaks}]
+
+Keeps @result{} be32[@t{n-keeps}] Keep*[@t{n-keeps}]
+Keep @result{} be32[@t{offset}] be32[@t{n}]
+
+PointKeeps @result{} be32[@t{n-point-keeps}] PointKeep*[@t{n-point-keeps}]
+PointKeep @result{} be32[@t{offset}] be32 be32
+
  @end format
  @end cartouche
  
@@ -886,6 +916,20 @@ shown as numbers starting from 1.
  When @code{footnote-marker-position} is 1, footnote markers are shown
  as superscripts, otherwise as subscripts.
  
+The Breakpoints are rows or columns after which there is a page break;
+for example, a row break of 1 requests a page break after the second
+row.  Usually no breakpoints are specified, indicating that page
+breaks should be selected automatically.
+
+The Keeps are ranges of rows or columns to be kept together without a
+page break; for example, a row Keep with @code{offset} 1 and @code{n}
+10 requests that the 10 rows starting with the second row be kept
+together.  Usually no Keeps are specified.
+
+The PointKeeps seem to be generated automatically based on
+user-specified Keeps.  They seem to indicate a conversion from rows
+or columns to pixel or point offsets.
+
  @code{notes} is a text string that contains user-specified notes.  It
  is displayed when the user hovers the cursor over the table, like
  ``alt text'' on a webpage.  It is not printed.  It is usually empty.
@@ -901,33 +945,59 @@ TableSettings ends with an arbitrary number of null bytes.
  @cartouche
  @format
  Formats @result{}
-    int[@t{nwidths}] int*[@t{nwidths}]
+    int[@t{n-widths}] int*[@t{n-widths}]
      string[@t{encoding}]
-    int (00 @math{|} 01) 00 (00 @math{|} 01)
+    int[@t{current-layer}]
+    bool[@t{digit-grouping}] bool[@t{leading-zero}] bool
      int[@t{epoch}]
      byte[@t{decimal}] byte[@t{grouping}]
      CustomCurrency
-    v1(i0)
-    v3(count(count(X5) count(X6)))
-
-CustomCurrency @result{} int[@t{n-ccs}] string*[@t{n-ccs}]
+    count(
+      v1(X0?)
+      v3(count(X1 count(X2)) count(X3)))
  
-X5 @result{} byte*33 int[@t{n}] int*[@t{n}]
-X6 @result{}
+X0 @result{}
+    byte*14
+    string[@t{command}] string[@t{command-local}]
+    string[@t{language}] string[@t{charset}] string[@t{locale}]
+    bool 00 bool bool
+    int[@t{epoch}]
+    byte[@t{decimal}] byte[@t{grouping}]
+    CustomCurrency
+    byte[@t{missing}] bool
+
+X1 @result{}
+    byte*2
+    byte[@t{lang}]
+    byte[@t{variable-mode}]
+    byte[@t{value-mode}]
+    int*2
+    00*17
+    bool
+    01
+X2 @result{}
+    int[@t{n-heights}] int*[@t{n-heights}]
+    int[@t{n-style-map}] StyleMap*[@t{n-style-map}]
+    int[@t{n-styles}] StylePair*[@t{n-styles}]
+    count((i0 i0)?)
+StyleMap @result{} int64[@t{cell-index}] int16[@t{style-index}]
+X3 @result{}
      01 00 (03 @math{|} 04) 00 00 00
-    string[@t{command}] string[@t{subcommand}]
+    string[@t{command}] string[@t{command-local}]
      string[@t{language}] string[@t{charset}] string[@t{locale}]
-    (00 @math{|} 01) 00 bool bool
+    bool 00 bool bool
      int[@t{epoch}]
      byte[@t{decimal}] byte[@t{grouping}]
      double[@t{small}] 01
      (string[@t{dataset}] string[@t{datafile}] i0 int[@t{date}] i0)?
      CustomCurrency
      byte[@t{missing}] bool (i2000000 i0)?
+
+CustomCurrency @result{} int[@t{n-ccs}] string*[@t{n-ccs}]
  @end format
  @end cartouche
  
-If @code{nwidths} is nonzero, then the accompanying integers are
+If @code{n-widths} is nonzero, then the accompanying integers are
  column widths as manually adjusted by the user.  (Row heights are
  computed automatically based on the widths.)
  
@@ -950,6 +1020,13 @@ are @samp{.} and @samp{,}.
  @samp{'} (apostrophe), @samp{ } (space), and zero (presumably
  indicating that digits should not be grouped).
  
+@code{command} describes the statistical procedure that generated the
+output, in English.  It is not necessarily the literal syntax name of
+the procedure: for example, NPAR TESTS becomes ``Nonparametric
+Tests.''  @code{command-local} is the procedure's name, translated
+into the output language; it is often empty and, when it is not,
+sometimes the same as @code{command}.
+
  @code{dataset} is the name of the dataset analyzed to produce the
  output, e.g.@: @code{DataSet1}, and @code{datafile} the name of the
  file it was read from, e.g.@: @file{C:\Users\foo\bar.sav}.  The latter
@@ -971,6 +1048,9 @@ following strings are CCA through CCE format strings.  @xref{Custom
  Currency Formats,,, pspp, PSPP}.  Most commonly these are all
  @code{-,,,} but other strings occur.
  
+@code{missing} is the character used to indicate that a cell contains
+a missing value.  It is always observed as @samp{.}.
+
  @node SPV Light Member Dimensions
  @subsection Dimensions
  
@@ -980,30 +1060,34 @@ the categories associated with each dimension.
  @cartouche
  @format
  Dimensions @result{} int[@t{n-dims}] Dimension*[@t{n-dims}]
-Dimension @result{} Value[@t{name}] DimUnknown int[@t{n-categories}] Category*[@t{n-categories}]
-DimUnknown @result{}
+Dimension @result{} Value[@t{name}] DimProperties int[@t{n-categories}] Category*[@t{n-categories}]
+DimProperties @result{}
      byte[@t{d1}]
      (00 @math{|} 01 @math{|} 02)[@t{d2}]
      (i0 @math{|} i2)[@t{d3}]
-    (00 @math{|} 01)[@t{d4}]
-    (00 @math{|} 01)[@t{d5}]
-    01
-    int[@t{d6}]
+    bool[@t{show-dim-label}]
+    bool[@t{hide-all-labels}]
+    01 int[@t{dim-index}]
  @end format
  @end cartouche
  
  @code{name} is the name of the dimension, e.g. @code{Variables},
  @code{Statistics}, or a variable name.
  
+The meanings of @code{d1}, @code{d2}, and @code{d3} are unknown.
  @code{d1} is usually 0 but many other values have been observed.
  
-@code{d3} is 2 over 99% of the time.
+If @code{show-dim-label} is 01, the pivot table displays a label for
+the dimension itself.  Because usually the group and category labels
+are enough explanation, it is usually 00.
  
-@code{d5} is 0 over 99% of the time.
+If @code{hide-all-labels} is 01, the pivot table omits all labels for
+the dimension, including group and category labels.  It is usually 00.
+When @code{hide-all-labels} is 01, @code{show-dim-label} is ignored.
  
-@code{d6} is either -1 or the 0-based index of the dimension, e.g.@: 0
-for the first dimension, 1 for the second, and so on.  The latter is
-the case 98% of the time in the corpus.
+@code{dim-index} is usually the 0-based index of the dimension, e.g.@:
+0 for the first dimension, 1 for the second, and so on.  Sometimes it
+is -1.  There is no visible difference.
  
  @node SPV Light Member Categories
  @subsection Categories
@@ -1014,23 +1098,26 @@ are really categories; the others just serve as grouping constructs.
  @cartouche
  @format
  Category @result{} Value[@t{name}] (Leaf @math{|} Group)
-Leaf @result{} 00 00 00 i2 int[@t{index}] i0
+Leaf @result{} 00 00 00 i2 int[@t{cat-index}] i0
  Group @result{}
-    (00 @math{|} 01)[@t{merge}] 00 01 (i0 @math{|} i2)[@t{data}]
+    bool[@t{merge}] 00 01 (i0 @math{|} i2)[@t{data}]
      i-1 int[@t{n-subcategories}] Category*[@t{n-subcategories}]
  @end format
  @end cartouche
  
  @code{name} is the name of the category (or group).
  
-A Leaf represents a leaf category.  The Leaf's @code{index} is a
+A Leaf represents a leaf category.  The Leaf's @code{cat-index} is a
  nonnegative integer less than @code{n-categories} in the Dimension in
-which the Category is nested (directly or indirectly).
+which the Category is nested (directly or indirectly).  The
+@code{cat-index} values reflect the original order of the categories;
+if the user sorted or rearranged the categories, then the order of
+categories in the file reflects that without changing the
+@code{cat-index} values.
  
-A Group represents a Group of nested categories.  Usually a Group
-contains at least one Category, so that @code{n-subcategories} is
-positive, but a few Groups with @code{n-subcategories} 0 has been
-observed.
+A Group is a group of nested categories.  Usually a Group contains at
+least one Category, so that @code{n-subcategories} is positive, but a
+few Groups with @code{n-subcategories} 0 have been observed.
  
  If a Group's @code{merge} is 00, the most common value, then the group
  is really a distinct group that should be represented as such in the
@@ -1056,23 +1143,23 @@ The final part of an SPV light member contains the actual data.
  Data @result{}
-  int[@t{layers}] int[@t{rows}] int[@t{columns}] int*[@t{n-dimensions}]
+  int[@t{n-layers}] int[@t{n-rows}] int[@t{n-columns}] int*[@t{n-dimensions}]
      int[@t{n-data}] Datum*[@t{n-data}]
-Datum @result{} int64[@t{index}] v3(00?) Value
+Datum @result{} int64[@t{index}] v1(00?) Value
  @end format
  @end cartouche
  
-The values of @code{layers}, @code{rows}, and @code{columns} each
-specifies the number of dimensions displayed in layers, rows, and
+The values of @code{n-layers}, @code{n-rows}, and @code{n-columns}
+each specifies the number of dimensions displayed in layers, rows, and
  columns, respectively.  Any of them may be zero.  Their values sum to
  @code{n-dimensions} from Dimensions (@pxref{SPV Light Member
  Dimensions}).
  
  The @code{n-dimensions} integers are a permutation of the 0-based
-dimension numbers.  The first @code{layers} integers specify each of
-the dimensions represented by layers, the next @code{rows} integers
+dimension numbers.  The first @code{n-layers} integers specify each of
+the dimensions represented by layers, the next @code{n-rows} integers
  specify the dimensions represented by rows, and the final
-@code{columns} integers specify the dimensions represented by columns.
-When there is more than one dimension of a given kind, the inner
-dimensions are given first.
+@code{n-columns} integers specify the dimensions represented by
+columns.  When there is more than one dimension of a given kind, the
+inner dimensions are given first.
  
  The format of a Datum varies slightly from version 1 to version 3: in
  version 1 it allows for an extra optional 00 byte.
@@ -1092,6 +1179,7 @@ for each @math{i} from 0 to @math{d - 1}:
  For example, suppose there are 3 dimensions with 3, 4, and 5
  categories, respectively.  The datum at coordinates (1, 2, 3) has
  index @math{5 \times (4 \times (3 \times 0 + 1) + 2) + 3 = 33}.
+Within a given dimension, a category's coordinate is the @code{cat-index} of its Leaf.
  
  @node SPV Light Member Value
  @subsection Value
@@ -1106,7 +1194,7 @@ RawValue @result{}
      01 ValueMod int[@t{format}] double[@t{x}]
    @math{|} 02 ValueMod int[@t{format}] double[@t{x}]
      string[@t{varname}] string[@t{vallab}] (01 @math{|} 02 @math{|} 03)
-  @math{|} 03 string[@t{local}] ValueMod string[@t{id}] string[@t{c}] (00 @math{|} 01)[@t{type}]
+  @math{|} 03 string[@t{local}] ValueMod string[@t{id}] string[@t{c}] bool[@t{type}]
    @math{|} 04 ValueMod int[@t{format}] string[@t{vallab}] string[@t{varname}]
      (01 @math{|} 02 @math{|} 03) string[@t{s}]
    @math{|} 05 ValueMod string[@t{varname}] string[@t{varlabel}] (01 @math{|} 02 @math{|} 03)
@@ -1261,15 +1349,17 @@ A ValueMod can specify special modifications to a Value.
  ValueMod @result{}
      31 i0 (i0 @math{|} i1 string[@t{subscript}])
      v1(00 (i1 @math{|} i2) 00 00 int 00 00)
-    v3(count(FormatString
-             (31 Style | 58)
-             (31 Style2 | 58)))
+    v3(count(FormatString StylePair))
    @math{|} 31 int[@t{n-refs}] int16*[@t{n-refs}] Format
    @math{|} 58
  
  Format @result{} 00 00 count(FormatString Style 58)
  FormatString @result{} count((count((i0 58)?) (58 @math{|} 31 string))?)
  
+StylePair @result{}
+    (31 Style | 58)
+    (31 Style2 | 58)
+
  Style @result{}
      bool[@t{bold}] bool[@t{italic}] bool[@t{underline}] bool[@t{show}]
      string[@t{fgcolor}] string[@t{bgcolor}]