projects
/
pspp
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
d885d45
)
Figure some details of categories and dimensions.
author
Ben Pfaff
<blp@cs.stanford.edu>
Mon, 10 Jul 2017 00:08:36 +0000
(17:08 -0700)
committer
Ben Pfaff
<blp@cs.stanford.edu>
Mon, 10 Jul 2017 00:08:36 +0000
(17:08 -0700)
dump.c
patch
|
blob
|
history
spv-file-format.texi
patch
|
blob
|
history
diff --git
a/dump.c
b/dump.c
index 8a820d7b86c12ad2ca78b1be232b9993a643cc0a..62e9307d7c8a3a70e5b8261212e65c77248019b2 100644
(file)
--- a/
dump.c
+++ b/
dump.c
@@
-650,15
+650,9
@@
dump_category(FILE *stream, int level, int **indexes, int *allocated_indexes,
printf ("<category>\n");
dump_value (stream, level + 1);
printf ("<category>\n");
dump_value (stream, level + 1);
- int merge = data[pos];
- if (!match_byte(0))
- match_byte_assert (1);
-
+ bool merge = get_bool();
match_byte_assert (0);
match_byte_assert (0);
-
- int unindexed = data[pos];
- if (!match_byte(0))
- match_byte_assert (1);
+ int unindexed = get_bool();
int x = get_u32 ();
pos -= 4;
int x = get_u32 ();
pos -= 4;
@@
-675,24
+669,14
@@
dump_category(FILE *stream, int level, int **indexes, int *allocated_indexes,
fprintf (stream, " ");
fprintf (stream, "<merge/>\n");
}
fprintf (stream, " ");
fprintf (stream, "<merge/>\n");
}
+ assert (unindexed);
}
else
{
}
else
{
- if (merge)
- {
- fprintf(stderr, "index not -1 but merged\n");
- exit(1);
- }
- if (x != 2)
- {
- fprintf(stderr, "index not -1 but x != 2\n");
- exit(1);
- }
- if (n_categories != 0)
- {
- fprintf(stderr, "index not -1 but subcategories\n");
- exit(1);
- }
+ assert (!merge);
+ assert (!unindexed);
+ assert (x == 2);
+ assert (n_categories == 0);
if (*n_indexes >= *allocated_indexes)
{
*allocated_indexes = *allocated_indexes ? 2 * *allocated_indexes : 16;
if (*n_indexes >= *allocated_indexes)
{
*allocated_indexes = *allocated_indexes ? 2 * *allocated_indexes : 16;
@@
-701,14
+685,6
@@
dump_category(FILE *stream, int level, int **indexes, int *allocated_indexes,
(*indexes)[(*n_indexes)++] = indx;
}
(*indexes)[(*n_indexes)++] = indx;
}
- int expected_unindexed = indx == -1;
- if (unindexed != expected_unindexed)
- {
- fprintf(stderr, "unindexed (%d) mismatch with indx (%d)\n",
- unindexed, indx);
- exit(1);
- }
-
if (n_categories == 0)
{
for (int i = 0; i <= level + 1; i++)
if (n_categories == 0)
{
for (int i = 0; i <= level + 1; i++)
@@
-730,20
+706,29
@@
dump_dim(int indx)
printf ("<dimension index=\"%d\">\n", indx);
dump_value (stdout, 0);
printf ("<dimension index=\"%d\">\n", indx);
dump_value (stdout, 0);
- /* This byte is usually 0 but many other values have been spotted. */
+ /* This byte is usually 0 but many other values have been spotted.
+ No visible effect. */
pos++;
pos++;
+ /* This byte can cause data to be oddly replicated. */
if (!match_byte(0) && !match_byte(1))
match_byte_assert(2);
if (!match_byte(0) && !match_byte(1))
match_byte_assert(2);
+
if (!match_u32(0))
match_u32_assert(2);
if (!match_u32(0))
match_u32_assert(2);
- if (!match_byte(0))
- match_byte_assert(1);
- if (!match_byte(0))
- match_byte_assert(1);
+
+ bool show_dim_label = get_bool();
+ if (show_dim_label)
+ printf(" <show-dim-label/>\n");
+
+ bool hide_all_labels = get_bool();
+ if (hide_all_labels)
+ printf(" <hide-all-labels/>\n");
+
match_byte_assert(1);
if (!match_u32(UINT32_MAX))
match_u32_assert(indx);
match_byte_assert(1);
if (!match_u32(UINT32_MAX))
match_u32_assert(indx);
+
n_categories = get_u32();
int *indexes = NULL;
n_categories = get_u32();
int *indexes = NULL;
diff --git
a/spv-file-format.texi
b/spv-file-format.texi
index ed2e96092d75f27a2f5611a1a50cd0c6ca493bc1..0fe2865a60af3bbba5244c47a690d553d4d469bf 100644
(file)
--- a/
spv-file-format.texi
+++ b/
spv-file-format.texi
@@
-1060,30
+1060,34
@@
the categories associated with each dimension.
@cartouche
@format
Dimensions @result{} int[@t{n-dims}] Dimension*[@t{n-dims}]
@cartouche
@format
Dimensions @result{} int[@t{n-dims}] Dimension*[@t{n-dims}]
-Dimension @result{} Value[@t{name}] Dim
Unknown
int[@t{n-categories}] Category*[@t{n-categories}]
-Dim
Unknown
@result{}
+Dimension @result{} Value[@t{name}] DimProperties int[@t{n-categories}] Category*[@t{n-categories}]
+DimProperties @result{}
byte[@t{d1}]
(00 @math{|} 01 @math{|} 02)[@t{d2}]
(i0 @math{|} i2)[@t{d3}]
byte[@t{d1}]
(00 @math{|} 01 @math{|} 02)[@t{d2}]
(i0 @math{|} i2)[@t{d3}]
- bool[@t{d4}]
- bool[@t{d5}]
- 01
- int[@t{d6}]
+ bool[@t{show-dim-label}]
+ bool[@t{hide-all-labels}]
+ 01 int[@t{dim-index}]
@end format
@end cartouche
@code{name} is the name of the dimension, e.g. @code{Variables},
@code{Statistics}, or a variable name.
@end format
@end cartouche
@code{name} is the name of the dimension, e.g. @code{Variables},
@code{Statistics}, or a variable name.
+The meanings of @code{d1}, @code{d2}, and @code{d3} are unknown.
@code{d1} is usually 0 but many other values have been observed.
@code{d1} is usually 0 but many other values have been observed.
-@code{d3} is 2 over 99% of the time.
+If @code{show-dim-label} is 01, the pivot table displays a label for
+the dimension itself.  Because the group and category labels are
+usually enough explanation, it is usually 00.
-@code{d5} is 0 over 99% of the time.
+If @code{hide-all-labels} is 01, the pivot table omits all labels for
+the dimension, including group and category labels. It is usually 00.
+When @code{hide-all-labels} is 01, @code{show-dim-label} is ignored.
-@code{d
6} is either -1 or the 0-based index of the dimension, e.g.@: 0
-for the first dimension, 1 for the second, and so on. The latter is
-
the case 98% of the time in the corpus
.
+@code{dim-index} is usually the 0-based index of the dimension, e.g.@:
+0 for the first dimension, 1 for the second, and so on.  Sometimes it
+is -1.  There is no visible difference between the two cases.
@node SPV Light Member Categories
@subsection Categories
@node SPV Light Member Categories
@subsection Categories
@@
-1094,7
+1098,7
@@
are really categories; the others just serve as grouping constructs.
@cartouche
@format
Category @result{} Value[@t{name}] (Leaf @math{|} Group)
@cartouche
@format
Category @result{} Value[@t{name}] (Leaf @math{|} Group)
-Leaf @result{} 00 00 00 i2 int[@t{index}] i0
+Leaf @result{} 00 00 00 i2 int[@t{cat-index}] i0
Group @result{}
bool[@t{merge}] 00 01 (i0 @math{|} i2)[@t{data}]
i-1 int[@t{n-subcategories}] Category*[@t{n-subcategories}]
Group @result{}
bool[@t{merge}] 00 01 (i0 @math{|} i2)[@t{data}]
i-1 int[@t{n-subcategories}] Category*[@t{n-subcategories}]
@@
-1103,14
+1107,17
@@
Group @result{}
@code{name} is the name of the category (or group).
@code{name} is the name of the category (or group).
-A Leaf represents a leaf category. The Leaf's @code{index} is a
+A Leaf represents a leaf category.  The Leaf's @code{cat-index} is a
nonnegative integer less than @code{n-categories} in the Dimension in
nonnegative integer less than @code{n-categories} in the Dimension in
-which the Category is nested (directly or indirectly).
+which the Category is nested (directly or indirectly). These
+indexes represent the original order in which the categories were
+sorted; if the user sorted or rearranged the categories, then the
+order of categories in the file reflects that without changing the
+@code{cat-index} values.
-A Group represents a Group of nested categories. Usually a Group
-contains at least one Category, so that @code{n-subcategories} is
-positive, but a few Groups with @code{n-subcategories} 0 has been
-observed.
+A Group is a group of nested categories. Usually a Group contains at
+least one Category, so that @code{n-subcategories} is positive, but a
+few Groups with @code{n-subcategories} 0 have been observed.
If a Group's @code{merge} is 00, the most common value, then the group
is really a distinct group that should be represented as such in the
If a Group's @code{merge} is 00, the most common value, then the group
is really a distinct group that should be represented as such in the
@@
-1136,23
+1143,23
@@
The final part of an SPV light member contains the actual data.
Data @result{}
int[@t{layers}] int[@t{rows}] int[@t{columns}] int*[@t{n-dimensions}]
int[@t{n-data}] Datum*[@t{n-data}]
Data @result{}
int[@t{layers}] int[@t{rows}] int[@t{columns}] int*[@t{n-dimensions}]
int[@t{n-data}] Datum*[@t{n-data}]
-Datum @result{} int64[@t{index}] v
3
(00?) Value
+Datum @result{} int64[@t{index}] v1(00?) Value
@end format
@end cartouche
@end format
@end cartouche
-The values of @code{
layers}, @code{rows}, and @code{columns} each
-specifies the number of dimensions displayed in layers, rows, and
+The values of @code{n-layers}, @code{n-rows}, and @code{n-columns}
+each specify the number of dimensions displayed in layers, rows, and
columns, respectively. Any of them may be zero. Their values sum to
@code{n-dimensions} from Dimensions (@pxref{SPV Light Member
Dimensions}).
The @code{n-dimensions} integers are a permutation of the 0-based
columns, respectively. Any of them may be zero. Their values sum to
@code{n-dimensions} from Dimensions (@pxref{SPV Light Member
Dimensions}).
The @code{n-dimensions} integers are a permutation of the 0-based
-dimension numbers. The first @code{layers} integers specify each of
-the dimensions represented by layers, the next @code{rows} integers
+dimension numbers.  The first @code{n-layers} integers specify each of
+the dimensions represented by layers, the next @code{n-rows} integers
specify the dimensions represented by rows, and the final
specify the dimensions represented by rows, and the final
-@code{
columns} integers specify the dimensions represented by columns.
-When there is more than one dimension of a given kind, the inner
-dimensions are given first.
+@code{n-columns} integers specify the dimensions represented by
+columns.  When there is more than one dimension of a given kind, the
+inner dimensions are given first.
The format of a Datum varies slightly from version 1 to version 3: in
version 1 it allows for an extra optional 00 byte.
The format of a Datum varies slightly from version 1 to version 3: in
version 1 it allows for an extra optional 00 byte.
@@
-1172,6
+1179,7
@@
for each @math{i} from 0 to @math{d - 1}:
For example, suppose there are 3 dimensions with 3, 4, and 5
categories, respectively. The datum at coordinates (1, 2, 3) has
index @math{5 \times (4 \times (3 \times 0 + 1) + 2) + 3 = 33}.
For example, suppose there are 3 dimensions with 3, 4, and 5
categories, respectively. The datum at coordinates (1, 2, 3) has
index @math{5 \times (4 \times (3 \times 0 + 1) + 2) + 3 = 33}.
+Within a given dimension, the index is the @code{cat-index} in a Leaf.
@node SPV Light Member Value
@subsection Value
@node SPV Light Member Value
@subsection Value