From 86009a9088ecdfddfacd7974f30b88cb89937d55 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 3 Jan 2016 23:29:49 -0800 Subject: [PATCH] Finish up with categories (for now). --- dump.c | 10 ++++++++ spv-file-format.texi | 56 +++++++++++++++++++++++++++----------------- 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/dump.c b/dump.c index e9a068699e..70ecab6320 100644 --- a/dump.c +++ b/dump.c @@ -579,10 +579,15 @@ dump_category(FILE *stream, int level, int *indexes, int *n_indexes, int max_ind int merge = data[pos]; if (!match_byte(0)) match_byte_assert (1); + match_byte_assert (0); + int unindexed = data[pos]; if (!match_byte(0)) match_byte_assert (1); + + int x = get_u32 (); + pos -= 4; if (!match_u32 (0)) match_u32_assert (2); @@ -604,6 +609,11 @@ dump_category(FILE *stream, int level, int *indexes, int *n_indexes, int max_ind fprintf(stderr, "index not -1 but merged\n"); exit(1); } + if (x != 2) + { + fprintf(stderr, "index not -1 but x != 2\n"); + exit(1); + } if (n_categories != 0) { fprintf(stderr, "index not -1 but subcategories\n"); diff --git a/spv-file-format.texi b/spv-file-format.texi index 5ced97ce61..c587233795 100644 --- a/spv-file-format.texi +++ b/spv-file-format.texi @@ -517,28 +517,40 @@ for the first dimension, 1 for the second, and so on. The latter is the case 98% of the time in the corpus. @example -category := value[name] - (00 | 01)[merge] 00 (00 | 01)[unindexed] (i0 | i2) - int[index] int[n-subcategories] category*[n-subcategories] +category := value[name] (terminal | group) +terminal-category := 00 00 00 i2 int[index] i0 @end example +@code{name} is the name of the category (or group). + @code{category} can represent a terminal category. 
In that case, -@code{name} is the name of the category, @code{merge} is 00, -@code{unindexed} is 00, @code{index} is a nonnegative integer less -than @code{n-categories} in the @code{dimension} in which the -@code{category} is nested (directly or indirectly), and -@code{n-subcategories} is 0. - -Alternatively, @code{category} can represent a group of nested -categories. In that case, @code{name} is the name of the group, -@code{unindexed} is 01, and @code{index} is -1. Ordinarily a group -has some nested content, so that @code{n-subcategories} is positive, -but a few instances of groups with @code{n-subcategories} 0 has been -observed. If @code{merge} is 00, the most common value, then the -group is really a distinct group that should be represented as such in -the visual representation and user interface. If @code{merge} is 01, -however, the categories in this group should be shown and treated as -if they were direct children of the group's parent group (or if it has -no parent group, then direct children of the dimension), and this -group's name is irrelevant and should not be displayed. (Merged -groups can be nested!) +@code{index} is a nonnegative integer less than @code{n-categories} in +the @code{dimension} in which the @code{category} is nested (directly +or indirectly). + +Alternatively, @code{category} can represent a @code{group} of nested +categories: + +@example +group := (00 | 01)[merge] 00 01 (i0 | i2)[data] + i-1 int[n-subcategories] category*[n-subcategories] +@end example + +Ordinarily a group has some nested content, so that +@code{n-subcategories} is positive, but a few instances of groups with +@code{n-subcategories} 0 have been observed. + +If @code{merge} is 00, the most common value, then the group is really +a distinct group that should be represented as such in the visual +representation and user interface. 
If @code{merge} is 01, however, +the categories in this group should be shown and treated as if they +were direct children of the group's parent group (or if it has no +parent group, then direct children of the dimension), and this group's +name is irrelevant and should not be displayed. (Merged groups can be +nested!) + +@code{data} appears to be i2 when all of the categories within a group +are terminal categories that directly represent data values for a +variable (e.g. in a frequency table or crosstabulation, a group of +values in a variable being tabulated) and i0 otherwise, but this might +be naive. -- 2.30.2