From d98a8409e3d4163c398b78a7edaa9ff286933805 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 23 May 2017 16:54:03 -0700 Subject: [PATCH] Document lots more. --- .gitignore | 6 +- dump.c | 182 ++++++++++++++++++++++++++++++--------- spv-file-format.texi | 200 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 302 insertions(+), 86 deletions(-) diff --git a/.gitignore b/.gitignore index 95c02797a0..f666851913 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,8 @@ germano/ smekens/ williams/ -spv/ -spv2/ -unzipped/ -unzipped2/ +spv*/ +unzipped*/ webold/ dump parse-xml diff --git a/dump.c b/dump.c index 61ee61ac0c..6cc9e52abe 100644 --- a/dump.c +++ b/dump.c @@ -48,6 +48,12 @@ size_t pos; #define STR(x) XSTR(x) #define WHERE __FILE__":" STR(__LINE__) +static uint8_t +get_byte(void) +{ + return data[pos++]; +} + static unsigned int get_u32(void) { @@ -57,6 +63,15 @@ get_u32(void) return x; } +static unsigned int +get_be32(void) +{ + uint32_t x; + x = (data[pos] << 24) | (data[pos + 1] << 16) | (data[pos + 2] << 8) | data[pos + 3]; + pos += 4; + return x; +} + static unsigned int get_u16(void) { @@ -75,6 +90,15 @@ get_double(void) return x; } +static double __attribute__((unused)) +get_float(void) +{ + float x; + memcpy(&x, &data[pos], 4); + pos += 4; + return x; +} + static bool match_u32(uint32_t x) { @@ -96,6 +120,27 @@ match_u32_assert(uint32_t x, const char *where) } #define match_u32_assert(x) match_u32_assert(x, WHERE) +static bool __attribute__((unused)) +match_be32(uint32_t x) +{ + if (get_be32() == x) + return true; + pos -= 4; + return false; +} + +static void +match_be32_assert(uint32_t x, const char *where) +{ + unsigned int y = get_be32(); + if (x != y) + { + fprintf(stderr, "%s: 0x%x: expected be%u, got be%u\n", where, pos - 4, x, y); + exit(1); + } +} +#define match_be32_assert(x) match_be32_assert(x, WHERE) + static bool match_byte(uint8_t b) { @@ -227,6 +272,29 @@ get_string(const char *where) } #define get_string() get_string(WHERE) 
+static char * +get_string_be(const char *where) +{ + if (1 + /*data[pos + 1] == 0 && data[pos + 2] == 0 && data[pos + 3] == 0*/ + /*&& all_ascii(&data[pos + 4], data[pos])*/) + { + int len = data[pos + 2] * 256 + data[pos + 3]; + char *s = malloc(len + 1); + + memcpy(s, &data[pos + 4], len); + s[len] = 0; + pos += 4 + len; + return s; + } + else + { + fprintf(stderr, "%s: 0x%x: expected string\n", where, pos); + exit(1); + } +} +#define get_string_be() get_string_be(WHERE) + static int get_end(void) { @@ -247,7 +315,11 @@ hex_dump(int ofs, int n) fprintf(stderr, " "); #endif fprintf(stderr, "%02x", c); - //fprintf(stderr, "%c", c >= 32 && c < 127 ? c : '.'); + } + for (int i = 0; i < n; i++) + { + int c = data[ofs + i]; + fprintf(stderr, "%c", c >= 32 && c < 127 ? c : '.'); } fprintf(stderr, "\n"); } @@ -851,28 +923,28 @@ dump_fonts(void) match_byte_assert(i); match_byte_assert(0x31); printf(" font=\"%s\"", get_string()); - match_byte_assert(0); - match_byte_assert(0); - if (!match_byte(0x40) && !match_byte(0x20) && !match_byte(0x80) && !match_byte(0x10) && !match_byte(0x70)) - match_byte_assert(0x50); - match_byte_assert(0x41); - if (!match_u32(0) && !match_u32(1)) - match_u32_assert(2); - match_byte_assert(0); - /* OK, this seems really unlikely to be totally correct, but it matches my corpus... 
*/ - if (!match_u32(0) && !match_u32(2) && !match_u32(4)) - { - if (i == 7) - match_u32_assert(0xfaad); - else - match_u32_assert(0); - } + printf(" size=\"%gpt\"", get_float()); + + int style = get_u32(); + if (style & 1) + printf(" bold=\"true\""); + if (style & 2) + printf(" italic=\"true\""); + + bool underline = data[pos++]; + if (underline) + printf(" underline=\"true\""); + + int halign = get_u32(); + printf("\nhalign=%d\n", halign); + + int valign = get_u32(); + printf("\nvalign=%d\n", valign); - if (!match_u32(0) && !match_u32(1) && !match_u32(2)) - match_u32_assert(3); printf (" fgcolor=\"%s\"", get_string()); printf (" bgcolor=\"%s\"", get_string()); + if (!match_byte(0)) match_byte_assert(1); match_u32_assert(0); @@ -882,42 +954,70 @@ dump_fonts(void) if (version > 1) { - if (i != 3) - { - pos += 12; - } - else - { - get_u32(); - if (!match_u32(-1) && !match_u32(8)) - match_u32_assert(24); - if (!match_u32(-1) && !match_u32(2)) - match_u32_assert(3); - } - - /* Who knows? Ranges from -1 to 8 with no obvious pattern. */ - get_u32(); + printf("\nfonts:"); + for (int i = 0; i < 4; i++) + printf(" %2d", get_u32()); + printf("\n"); } printf ("/>\n"); } - match_u32_assert(240); - pos += 240; + int x1 = get_u32(); + int x1_end = pos + x1; + printf("\n"); + match_be32_assert(1); + int n_borders = get_be32(); + for (int i = 0; i < n_borders; i++) + { + int type = get_be32(); + int stroke = get_be32(); + int color = get_be32(); + printf(" \n", + type, + (stroke == 0 ? "none" + : stroke == 1 ? "solid" + : stroke == 2 ? "dashed" + : stroke == 3 ? "thick" + : stroke == 4 ? "thin" + : stroke == 5 ? "double" + : ""), + color); + } + bool grid = get_byte(); + pos += 3; + printf(" \n", grid ? 
"yes" : "no"); + printf("\n"); + assert(pos == x1_end); int skip = get_u32(); assert(skip == 18 || skip == 25); pos += skip; int x3 = get_u32(); + int x3_end = pos + x3; if (version == 3) { - assert(x3 >= 117); - int len = data[pos + 0x34]; - if (len) - printf("%.*s\n", len, &data[pos + 0x35]); + match_be32_assert(1); + get_be32(); + printf("\n"); } - pos += x3; + pos = x3_end; int count = get_u32(); pos += 4 * count; diff --git a/spv-file-format.texi b/spv-file-format.texi index ecc5f8e1b5..422eeb5a82 100644 --- a/spv-file-format.texi +++ b/spv-file-format.texi @@ -497,12 +497,21 @@ Bytes with fixed values are written in hexadecimal: @item byte An arbitrary byte. +@item bool +A byte with value 0 or 1. + +@item int16 +An arbitrary 16-bit integer. + @item int An arbitrary 32-bit integer. @item double An arbitrary 64-bit IEEE floating-point number. +@item float +An arbitrary 32-bit IEEE floating-point number. + @item string A 32-bit integer followed by the specified number of bytes of character data. (The encoding is indicated by the Formats @@ -603,7 +612,7 @@ The meaning of the other variable parts of the header is not known. Title @result{} Value[@t{title1}] 01? Value[@t{c}] 01? 31 - Value[@t{title2}] 01? 00? 58 + Value[@t{title2}] 01? @end format @end cartouche @@ -622,11 +631,15 @@ well formatted. For example, for a frequency table, @code{title1} and @cartouche @format -Caption @result{} 58 @math{|} 31 Value[@t{caption}] +Caption @result{} Caption1 Caption2 +Caption1 @result{} 31 Value @math{|} 58 +Caption2 @result{} 31 Value @math{|} 58 @end format @end cartouche -The @code{caption}, if presented, is shown below the table. +The Caption, if present, is shown below the table. Caption2 is +normally present. Caption1 is only rarely nonempty; it might reflect +user editing of the caption. 
@node SPV Light Member Footnotes @subsection Footnotes @@ -648,19 +661,18 @@ Each footnote has @code{text} and an optional customer @code{marker} @format Fonts @result{} 00 Font*8 Font @result{} - byte[@t{index}] 31 string[@t{typeface}] 00 00 - (10 @math{|} 20 @math{|} 40 @math{|} 50 @math{|} 70 @math{|} 80)[@t{f1}] 41 - (i0 @math{|} i1 @math{|} i2)[@t{f2}] 00 - (i0 @math{|} i2 @math{|} i64173)[@t{f3}] - (i0 @math{|} i1 @math{|} i2 @math{|} i3)[@t{f4}] - string[@t{fgcolor}] string[@t{bgcolor}] i0 i0 00 - v3(int[@t{f5}] int[@t{f6}] int[@t{f7}] int[@t{f8}])) + byte[@t{index}] 31 + string[@t{typeface}] float[@t{size}] int[@t{style}] bool[@t{underline}] + int[@t{halign}] int[@t{valign}] + string[@t{fgcolor}] string[@t{bgcolor}] + byte[@t{alternate}] string[@t{altfg}] string[@t{altbg}] + v3(int[@t{left-margin}] int[@t{right-margin}] int[@t{top-margin}] int[@t{bottom-margin}]) @end format @end cartouche Each Font represents the font style for a different element, in the -following order: title, caption, footnote, row labels, column labels, -corner labels, data, and layers. +following order: title, caption, footer, corner, column +labels, row labels, data, and layers. @code{index} is the 1-based index of the Font, i.e. 1 for the first Font, through 8 for the final Font. @@ -669,13 +681,32 @@ Font, through 8 for the final Font. is @code{SansSerif} in over 99% of instances and @code{Times New Roman} in the rest. +@code{size} is the size of the font, in points. The most common size +in the corpus is 12 points. + +@code{style} is a bit mask. Bit 0 (with value 1) is set for bold, bit +1 (with value 2) is set for italic. + +@code{underline} is 1 if the font is underlined, 0 otherwise. + +@code{halign} specifies horizontal alignment: 0 for center, 2 for +left, 4 for right, 61453 for decimal, 64173 for mixed. Mixed +alignment varies according to type: string data is left-justified, +numbers and most other formats are right-justified. 
+ +@code{valign} specifies vertical alignment: 0 for center, 1 for top, 3 +for bottom. + @code{fgcolor} and @code{bgcolor} are the foreground color and background color, respectively. In the corpus, these are always @code{#000000} and @code{#ffffff}, respectively. +@code{alternate} is 01 if rows should alternate colors, 00 if all rows +should be the same color. When @code{alternate} is 01, @code{altfg} +and @code{altbg} specify the colors for the alternate rows. + The meaning of the remaining data is unknown. It seems likely to -include font sizes, horizontal and vertical alignment, attributes such -as bold or italic, and margins. +include font sizes, attributes such as bold or italic, and margins. The table below lists the values observed in the corpus. When a cell contains a single value, then 99@math{+}% of the corpus contains that value. @@ -704,9 +735,9 @@ about two-thirds of the time, as does the combination of f4 = 0, f6 = @cartouche @format Formats @result{} - int[@t{n1}] byte*[@t{n1}] - int[@t{n2}] byte*[@t{n2}] - int[@t{n3}] byte*[@t{n3}] + Borders + PrintSettings + TableSettings int[@t{n4}] int*[@t{n4}] string[@t{encoding}] (i0 @math{|} i-1) (00 @math{|} 01) 00 (00 @math{|} 01) @@ -716,6 +747,44 @@ Formats @result{} v1(i0) v3(count(count(X5) count(X6))) +Borders @result{} + int[@t{endian}] + int[@t{n-borders}] Border*[@t{n-borders}] + bool[@t{show-grid-lines}] + 00 00 00 + +Border @result{} + int[@t{border-type}] + int[@t{stroke-type}] + int[@t{color}] + +PrintSettings @result{} + int[@t{endian}] + bool[@t{all-layers}] + bool[@t{new-layers}] + bool[@t{fit-width}] + bool[@t{fit-length}] + bool[@t{top-continuation}] + bool[@t{bottom-continuation}] + int[@t{n-orphan-lines}] + string[@t{continuation-string}] + +TableSettings @result{} + int[@t{endian}] + int + int[@t{current-layer}] + bool[@t{skip-empty}] + bool[@t{show-dimension-in-corner}] + bool[@t{use-alphabetic-markers}] + bool[@t{footnote-marker-position}] + v3( + byte + int[@t{n}] byte*[@t{n}] + 
string + string[@t{table-look}] + 00... + ) + X5 @result{} byte*33 int[@t{n}] int*[@t{n}] X6 @result{} 01 00 (03 @math{|} 04) 00 00 00 @@ -725,25 +794,76 @@ X6 @result{} int byte[@t{decimal}] byte[@t{grouping}] byte*8 01 - (string[@t{dataset}] string[@t{datafile}] i0 int i0)? + (string[@t{dataset}] string[@t{data file}] i0 int i0)? int[@t{n-ccs}] string*[@t{n-ccs}] 2e (00 @math{|} 01) (i2000000 i0)? @end format @end cartouche -In every example in the corpus, @code{n1} is 240. The meaning of the -bytes that follow it is unknown. +The Borders reflect how borders between regions are drawn. If +@code{endian} is 1, then values inside Borders, including +@code{endian} itself, are big-endian, otherwise they are +little-endian. In practice, they seem to always be big-endian, even +though the rest of the file is little-endian. @code{n-borders} seems +to always be 19. @code{show-grid-lines} is 1 to draw grid lines, +otherwise 0. + +Each Border describes one kind of border. Each @code{border-type} +appears once in order, and they correspond to the following borders: + +@table @asis +@item 0 +Title. +@item 1@dots{}4 +Left, top, right, and bottom outer frame. +@item 5@dots{}8 +Left, top, right, and bottom inner frame. +@item 9, 10 +Left and top of data area. +@item 11, 12 +Horizontal and vertical dimension rows. +@item 13, 14 +Horizontal and vertical dimension columns. +@item 15, 16 +Horizontal and vertical category rows. +@item 17, 18 +Horizontal and vertical category columns. +@end table + +@code{stroke-type} describes how a border is drawn, as one of: + +@table @asis +@item 0 +No line. +@item 1 +Solid line. +@item 2 +Dashed line. +@item 3 +Thick line. +@item 4 +Thin line. +@item 5 +Double line. +@end table + +@code{color} is an RGB color. Bits 24--31 are alpha, bits 16--23 are +red, 8--15 are green, 0--7 are blue. An alpha of 255 indicates an +opaque color, therefore opaque black is 0xff000000. 
-In every example in the corpus, @code{n2} is 18 and the bytes that -follow it are @code{00 00 00 01 00 00 00 00 00 00 00 00 00 02 00 00 00 -00}. The meaning of these bytes is unknown. +The PrintSettings reflect settings for printing. Like Borders, they +have independent endianness. The @code{continuation-string} is +usually empty but it may contain a text string such as ``(cont.)''. -In every example in the corpus for version 1, @code{n3} is 16 and the -bytes that follow it are @code{00 00 00 01 00 00 00 01 00 00 00 00 01 -01 01 01}. In version 3, observed @code{n3} varies from 117 to 150, -and its bytes include a 1-byte count at offset 0x34. When the count -is nonzero, a text string of that length at offset 0x35 is the name of -a ``TableLook'', e.g. ``Default'' or ``Academic''. +The TableSettings reflect display settings. Like Borders, they +have independent endianness. @code{current-layer} is the displayed +layer. @code{use-alphabetic-markers} is 1 to show markers as letters +(e.g. @samp{a}, @samp{b}, @samp{c}, @dots{}), otherwise they are shown +as numbers starting from 1. When @code{footnote-marker-position} is +1, footnote markers are shown as superscripts, otherwise as +subscripts. @code{table-look} is the name of a SPSS ``TableLook'' +table style, such as ``Default'' or ``Academic''; it is often empty. +TableSettings ends with an arbitrary number of null bytes. Observed values of @code{n4} vary from 0 to 17. Out of 7,060 examples in the corpus, it is nonzero only 36 times. @@ -1057,9 +1177,7 @@ ValueMod @result{} 31 i0 (i0 @math{|} i1 string[@t{subscript}]) v1(00 (i1 @math{|} i2) 00 00 int 00 00) v3(count(FormatString Style ValueModUnknown)) - @math{|} 31 i1 int[@t{footnote-number}] Format - @math{|} 31 i2 (00 @math{|} 01 @math{|} 02) 00 (i1 @math{|} i2 @math{|} i3) Format - @math{|} 31 i3 00 00 01 00 i2 Format + @math{|} 31 int[@t{n-refs}] int16*[@t{n-refs}] Format @math{|} 58 Style @result{} 58 @math{|} 31 01? 00? 00? 00? 
01 string[@t{fgcolor}] string[@t{bgcolor}] string[@t{typeface}] byte Format @result{} 00 00 count(FormatString Style 58) @@ -1068,16 +1186,16 @@ ValueModUnknown @result{} 58 @math{|} 31 i0 i0 i0 i0 01 00 (01 @math{|} 02 @math @end format @end cartouche -The @code{footnote-number}, if present, specifies a footnote that the -Value references. The footnote's marker is shown appended to the main -text of the Value, as a superscript. +A ValueMod that begins with ``31 i0'' specifies a string to append to +the main text of the Value, as a subscript. The subscript text is a +brief indicator, e.g.@: @samp{a} or @samp{a,b}, with its meaning +indicated by the table caption. In this usage, subscripts are similar +to footnotes. One apparent difference is that a Value can only +reference one footnote but a subscript can list more than one letter. -The @code{subscript}, if present, specifies a string to append to the -main text of the Value, as a subscript. The subscript text is a brief -indicator, e.g.@: @samp{a} or @samp{a,b}, with its meaning indicated -by the table caption. In this usage, subscripts are similar to -footnotes; one apparent difference is that a Value can only reference -one footnote but a subscript can list more than one letter. +A ValueMod that begins with 31 followed by a nonzero ``int'' specifies +a footnote or footnotes that the Value references. Footnote markers +are shown appended to the main text of the Value, as superscripts. The Format, if present, is a format string for substitutions using the syntax explained previously. It appears to be an English-language -- 2.30.2