From: Ben Pfaff Date: Tue, 12 Jan 2016 06:36:50 +0000 (-0800) Subject: Finish description of legacy ("heavy") binary format. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=30801cdf055bb004cc722d9801e0c67df66d6fd8 Finish description of legacy ("heavy") binary format. --- diff --git a/dump2.c b/dump2.c index d7f8ad3d68..15670ecf4b 100644 --- a/dump2.c +++ b/dump2.c @@ -206,7 +206,7 @@ dump_raw(FILE *stream, int start, int end, const char *separator) } static void -dump_source(int end, int count, int n_series) +dump_source(int end, int count, int n_series, const char *name) { const union { @@ -235,14 +235,19 @@ dump_source(int end, int count, int n_series) if (pos >= end) return; - printf ("\n %08x: (%d sysmis)", pos, n_sysmis); - printf (" %d", get_u32()); - printf (", \"%s\"\n", get_string()); + match_u32_assert(1); + char *name2 = get_string(); + assert(!strcmp(name, name2)); printf ("\n %08x:", pos); int n_more_series = get_u32(); + if (n_series != n_more_series) + printf("different series counts: %d %d\n", n_series, n_more_series); + assert(n_more_series <= n_series); printf (" %d series to come\n", n_more_series); + int max1 = -1; + int ofs = pos; for (int i = 0; i < n_more_series; i++) { printf ("%08x:", pos); @@ -252,21 +257,45 @@ dump_source(int end, int count, int n_series) { int x = get_u32(); int y = get_u32(); - printf (" (%d,%d)", x, y); + printf (" (%d, %d)", x, y); + if (y > max1) + max1 = y; } printf ("\n"); } printf ("\n%08x:", pos); int n_strings = get_u32(); + assert(n_strings == max1 + 1); printf (" %d strings\n", n_strings); + + char **strings = malloc(n_strings * sizeof *strings); for (int i = 0; i < n_strings; i++) { - int x = get_u32(); + int frequency = get_u32(); char *s = get_string(); - printf ("%d: \"%s\" (%d)\n", i, s, x); + printf ("%d: \"%s\" (%d)\n", i, s, frequency); + strings[i] = s; } printf ("\n"); + + assert (pos == end); + pos = ofs; + printf("Strings:\n"); + for (int i = 0; i < n_more_series; 
i++) + { + printf (" \"%s\"\n", get_string()); + int n_pairs = get_u32(); + for (int j = 0; j < n_pairs; j++) + { + int x = get_u32(); + //assert (x == j); + int y = get_u32(); + printf (" %d: \"%s\"\n", x, strings[y]); + } + printf ("\n"); + } + pos = end; } int @@ -312,6 +341,7 @@ main(int argc, char **argv) struct source { int offset, count, n_series; + char *name; } sources[n_sources]; for (int i = 0; i < n_sources; i++) @@ -326,6 +356,7 @@ main(int argc, char **argv) sources[i].offset = offset; sources[i].count = count; sources[i].n_series = n_series; + sources[i].name = name; } for (int i = 0; i < n_sources; i++) @@ -335,7 +366,7 @@ main(int argc, char **argv) fprintf (stderr, "pos=0x%x expected=0x%x reading source %d\n", pos, sources[i].offset, i); exit(1); } - dump_source(i + 1 >= n_sources ? n : sources[i + 1].offset, sources[i].count, sources[i].n_series); + dump_source(i + 1 >= n_sources ? n : sources[i + 1].offset, sources[i].count, sources[i].n_series, sources[i].name); } assert(pos == n); diff --git a/spv-file-format.texi b/spv-file-format.texi index 223b2ef90f..a2b07aa711 100644 --- a/spv-file-format.texi +++ b/spv-file-format.texi @@ -9,7 +9,7 @@ write them. An an aside, SPSS 15 and earlier versions use a completely different output format based on the Microsoft Compound Document Format. This -format is not documented. +format is not documented here. An SPV file is a Zip archive that can be read with @command{zipinfo} and @command{unzip} and similar programs. The final member in the Zip @@ -622,17 +622,17 @@ index @math{5 \times (4 \times (3 \times 0 + 1) + 2) + 3 = 33}. @example value := 00? 00? 00? 00? 
raw-value raw-value := - 01 value-mod int32[format] double[x] - | 02 value-mod int32[format] double[x] + 01 value-mod int[format] double[x] + | 02 value-mod int[format] double[x] string[varname] string[vallab] (01 | 02 | 03) | 03 string[local] value-mod string[id] string[c] (00 | 01)[type] - | 04 value-mod int32[format] string[vallab] string[varname] + | 04 value-mod int[format] string[vallab] string[varname] (01 | 02 | 03) string[s] | 05 value-mod string[varname] string[varlabel] (01 | 02 | 03) - | value-mod string[format] int32[n-args] arg*[n-args] + | value-mod string[format] int[n-args] arg*[n-args] arg := i0 value - | int32[x] i0 value*[x + 1] /* @r{x > 0} */ + | int[x] i0 value*[x + 1] /* @r{x > 0} */ @end example A @code{value} boils down to a number or a string. There are several @@ -774,11 +774,11 @@ The format string is localized to the user's locale. value-mod := 31 i0 (i0 | i1 string[subscript]) value-mod-i0-v1 /* @r{version 1} */ | 31 i0 (i0 | i1 string[subscript]) value-mod-i0-v3 /* @r{version 3} */ - | 31 i1 int32[footnote-number] format + | 31 i1 int[footnote-number] format | 31 i2 (00 | 01 | 02) 00 (i1 | i2 | i3) format | 31 i3 00 00 01 00 i2 format | 58 -value-mod-i0-v1 := 00 (i1 | i2) 00 00 int32 00 00 +value-mod-i0-v1 := 00 (i1 | i2) 00 00 int 00 00 value-mod-i0-v3 := count(format-string (58 | 31 style) (58 @@ -821,11 +821,14 @@ The @code{style}, if present, changes the style for this individual @node SPV Legacy Detail Member Binary Format @subsection SPV Legacy Detail Member Binary Format -A legacy detail member's binary file has a much simpler format than -the light member binary format. +Whereas the light binary format represents everything about a given +pivot table, the legacy binary format conceptually consists of a +number of named sources, each of which consists of a number of named +series, each of which is a 1-dimensional array of numbers or strings +or a mix. Thus, the legacy binary file format is quite simple. 
@example -legacy-member := 00 byte[version] int16[n-sources] int32[file-size] +legacy-binary := 00 byte[version] int16[n-sources] int[file-size] metadata*[n-sources] data*[n-sources] @end example @@ -840,22 +843,77 @@ which has @code{metadata} and @code{data}. @code{file-size} is the size of the file, in bytes. @example -metadata := int32[per-series] int32[n-series] int32[offset] source-name -source-name := byte*[32] /* @r{version 0xaf} */ -source-name := byte*[64] int32 /* @r{version 0xb0} */ +/* @r{version 0xaf} */ +metadata := int[per-series] int[n-series] int[ofs] byte*32[source-name] + +/* @r{version 0xb0} */ +metadata := int[per-series] int[n-series] int[ofs] byte*64[source-name] int[x] @end example A data source consists of @code{n-series} series of data, with -@code{per-series} data items per series. Depending on the version, +@code{per-series} data values per series. + @code{source-name} is a 32- or 64-byte string padded on the right with zero bytes. The names that appear in the corpus are very generic, -usually @code{tableData} or @code{source0}. The @code{offset} is the -offset, in bytes, from the beginning of the file to the start of this -data source's @code{data}. +usually @code{tableData} or @code{source0}. + +The @code{ofs} is the offset, in bytes, from the beginning of the file +to the start of this data source's @code{data}. This allows programs +to skip to the beginning of the data for a particular source; it is +also important to determine whether a source includes any string data +(see below). -The meaning of the number in version 0xb0 @code{source-name} is -unknown. +The meaning of @code{x} in version 0xb0 is unknown. @example +data := numeric-data string-data? +numeric-data := numeric-series*[n-series] +numeric-series := byte*288[series-name] double*[per-series] +@end example + +Data follow the metadata in the legacy binary format, with sources in +the same order. 
Each series begins with a @code{series-name}, which +generally indicates its role in the pivot table, e.g.@: ``cell'', +``cellFormat'', ``dimension0categories'', ``dimension0group0''. The +name is followed by the data, one double per element in the series. A +double with the maximum negative double @code{-DBL_MAX} represents the +system-missing value SYSMIS. +@example +string-data := i1 string[source-name] pairs labels + +pairs := int[n-string-series] pair-series*[n-string-series] +pair-series := string[pair-series-name] int[n-pairs] pair*[n-pairs] +pair := int[i] int[j] + +labels := int[n-labels] label*[n-labels] +label := int[frequency] string[s] @end example + +A source may include a mix of numeric and string data values. When a +source includes any string data, the data values that are strings are +set to SYSMIS in the @code{numeric-series}, and @code{string-data} +follows the @code{numeric-data}. To reliably determine whether a +source includes @code{string-data}, the reader should check whether +the offset following the @code{numeric-data} is the offset of the next +source, as indicated by its @code{metadata} (or end of file, in the +case of the last source in a file). + +@code{string-data} repeats the name of the source. + +The string data overlays the numeric data. @code{n-string-series} is +the number of series within the source that include string data. More +precisely, it is the 1-based index of the last series in the source +that includes any string data; thus, it would be 4 if there are 5 +series and only the fourth one includes string data. + +Each @code{pair-series} consists of a sequence of 0 or more pairs, each +of which maps from a 0-based index within the series @code{i} to a +0-based label index @code{j}. The pair @code{i} = 2, @code{j} = 3, +for example, would mean that the third data value (with value SYSMIS) +is to be replaced by the string of the fourth label. + +The labels themselves follow the pairs. 
The valuable part of each +label is the string @code{s}. Each label also includes a +@code{frequency} that reports the number of pairs that reference it +(although this is not useful).