From 68a20800ccdf1d5263e1a500f1acdaa982789419 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Wed, 6 Jan 2016 22:01:14 -0800 Subject: [PATCH] Some minor refinements from expanding the corpus. --- .gitignore | 2 + dump.c | 97 +++++++++++++++++++++++++------------------- parse-all-light | 2 +- spv-file-format.texi | 20 ++++----- 4 files changed, 68 insertions(+), 53 deletions(-) diff --git a/.gitignore b/.gitignore index fba1fc27c1..3dd074e1cd 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,9 @@ germano/ smekens/ williams/ spv/ +svp2/ unzipped/ +unzipped2/ webold/ dump parse-xml diff --git a/dump.c b/dump.c index 9e1bfd9f96..be5139c604 100644 --- a/dump.c +++ b/dump.c @@ -246,6 +246,39 @@ dump_counted_string(void) return s; } +static void __attribute__((unused)) +hex_dump(int ofs, int n) +{ + for (int i = 0; i < n; i++) + { + int c = data[ofs + i]; +#if 1 + if (i && !(i % 16)) + fprintf(stderr, "-"); + else + fprintf(stderr, " "); +#endif + fprintf(stderr, "%02x", c); + //fprintf(stderr, "%c", c >= 32 && c < 127 ? c : '.'); + } + fprintf(stderr, "\n"); +} + +static void +dump_style(void) +{ + match_byte(1); + match_byte(0); + match_byte(0); + match_byte(0); + match_byte_assert(1); + get_string(); /* foreground */ + get_string(); /* background */ + get_string(); /* font */ + if (!match_byte(14)) + match_byte_assert(12); /* size? */ +} + static char * dump_nested_string(void) { @@ -255,7 +288,10 @@ dump_nested_string(void) match_byte_assert (0); int outer_end = get_end(); s = dump_counted_string(); - match_byte_assert(0x58); + if (match_byte(0x31)) + dump_style(); + else + match_byte_assert(0x58); match_byte_assert(0x58); if (pos != outer_end) { @@ -308,19 +344,7 @@ dump_optional_value(FILE *stream) dump_counted_string(); if (match_byte(0x31)) - { - /* Only one example in the corpus. */ - match_byte(1); - match_byte(0); - match_byte(0); - match_byte(0); - match_byte_assert(1); - get_string(); /* foreground */ - get_string(); /* background */ - get_string(); /* font */ - if (!match_byte(14)) - match_byte_assert(12); /* size? */ - } + dump_style(); else match_byte_assert(0x58); if (match_byte(0x31)) @@ -357,7 +381,7 @@ dump_optional_value(FILE *stream) else if (match_u32 (2)) { fprintf(stream, "(special 2)"); - if (!match_byte(0)) + if (!match_byte(0) && !match_byte(1)) match_byte_assert(2); match_byte_assert(0); if (!match_u32 (2) && !match_u32(1)) @@ -792,24 +816,6 @@ dump_title(void) } } -static void __attribute__((unused)) -hex_dump(int ofs, int n) -{ - for (int i = 0; i < n; i++) - { - int c = data[ofs + i]; -#if 1 - if (i && !(i % 16)) - fprintf(stderr, "-"); - else - fprintf(stderr, " "); -#endif - fprintf(stderr, "%02x", c); - //fprintf(stderr, "%c", c >= 32 && c < 127 ? c : '.'); - } - fprintf(stderr, "\n"); -} - static void dump_fonts(void) { @@ -854,8 +860,8 @@ dump_fonts(void) match_u32_assert(5); if (!match_u32(10) && !match_u32(11) && !match_u32(5)) match_u32_assert(9); - if (!match_u32(0)) - match_u32_assert(1); + if (!match_u32(0) && !match_u32(1)) + match_u32_assert(2); } else { @@ -885,7 +891,7 @@ dump_fonts(void) assert(x3 >= 117); int len = data[pos + 0x34]; if (len) - fprintf(stderr, "%.*s\n", len, &data[pos + 0x35]); + printf("%.*s\n", len, &data[pos + 0x35]); } pos += x3; @@ -904,8 +910,9 @@ dump_fonts(void) match_byte_assert(1); if (version > 1) { - if (!match_byte(0x97) && !match_byte(0x98) && !match_byte(0x99)) - match_byte_assert(0x9a); + if (!match_byte(0x97) && !match_byte(0x98) + && !match_byte(0x99) && !match_byte(0x9a)) + match_byte_assert(0x9b); match_byte_assert(7); match_byte_assert(0); match_byte_assert(0); @@ -953,8 +960,8 @@ dump_fonts(void) match_byte_assert(1); match_byte_assert(0); - if (!match_byte(3)) - match_byte_assert(4); + if (!match_byte(3) && !match_byte(4)) + match_byte_assert(5); match_byte_assert(0); match_byte_assert(0); match_byte_assert(0); @@ -973,8 +980,9 @@ dump_fonts(void) if (!match_byte(0)) match_byte_assert(1); - if (!match_byte(0x97) && !match_byte(0x98) && !match_byte(0x99)) - match_byte_assert(0x9a); + if (!match_byte(0x97) && !match_byte(0x98) + && !match_byte(0x99) && !match_byte(0x9a)) + match_byte_assert(0x9b); match_byte_assert(7); match_byte_assert(0); match_byte_assert(0); @@ -1030,6 +1038,11 @@ dump_fonts(void) if (!match_byte(0)) match_byte_assert(1); + if (pos < outer_end) + { + match_u32_assert(2000000); + match_u32_assert(0); + } assert(pos == outer_end); pos = outer_end; diff --git a/parse-all-light b/parse-all-light index 3925c52f8d..1facaec43d 100755 --- a/parse-all-light +++ b/parse-all-light @@ -1,5 +1,5 @@ #! /bin/bash -for d in {williams,germano,smekens,unzipped}/*/*light*.bin; do +for d in {williams,germano,smekens,unzipped,unzipped2}/*/*light*.bin; do if ! ./dump all < $d >/dev/null; then echo $d ./dump all < $d diff --git a/spv-file-format.texi b/spv-file-format.texi index 2ac8de6700..1aef76c647 100644 --- a/spv-file-format.texi +++ b/spv-file-format.texi @@ -55,10 +55,12 @@ Same format used for tables, with a different name. The structure of a chart plus its data. Charts do not have a ``light'' format. -@item @var{prefix}_model.xml -@itemx @var{prefix}_pmml.xml -@itemx @var{prefix}_stats.xml +@item @var{prefix}_model.scf +@itemx @var{prefix}_pmml.scf Not yet investigated. The corpus contains only one example of each. + +@itemx @var{prefix}_stats.xml +Not yet investigated. The corpus contains few examples. @end table The @file{@var{prefix}} in the names of the detail members is @@ -417,7 +419,7 @@ x6 := 01 00 (03 | 04) 00 00 00 byte*8 01 (string[dataset] string[datafile] i0 int i0)? int[n-ccs] string*[n-ccs] - 2e (00 | 01) + 2e (00 | 01) (i2000000 i0)? @end example In every example in the corpus, @code{x1} is 240. The meaning of the @@ -632,19 +634,17 @@ substitution := i0 value opt-value := 31 i0 (i0 | i1 string) opt-value-i0-v1 /* @r{version 1} */ | 31 i0 (i0 | i1 string) opt-value-i0-v3 /* @r{version 3} */ | 31 i1 int32[footnote-number] nested-string - | 31 i2 (00 | 02) 00 (i1 | i2 | i3) nested-string + | 31 i2 (00 | 01 | 02) 00 (i1 | i2 | i3) nested-string | 31 i3 00 00 01 00 i2 nested-string | 58 opt-value-i0-v1 := 00 (i1 | i2) 00 00 int32 00 00 opt-value-i0-v3 := count(counted-string - (58 - | 31 01? 00? 00? 00? 01 - string[fgcolor] string[bgcolor] string[typeface] - byte) + (58 | 31 style) (58 | 31 i0 i0 i0 i0 01 00 (01 | 02 | 08) 00 08 00 0a 00)) -nested-string := 00 00 count(counted-string 58 58) +style := 01? 00? 00? 00? 01 string[fgcolor] string[bgcolor] string[font] byte +nested-string := 00 00 count(counted-string (58 | 31 style) 58) counted-string := count((i0 (58 | 31 string))?) @end example -- 2.30.2