Figured out more details, documented most values, work on substitutions.

author Ben Pfaff <blp@cs.stanford.edu>

Sun, 10 Jan 2016 00:49:54 +0000 (16:49 -0800)

committer Ben Pfaff <blp@cs.stanford.edu>

Sun, 10 Jan 2016 00:49:54 +0000 (16:49 -0800)
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 10 Jan 2016 00:49:54 +0000 (16:49 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 10 Jan 2016 00:49:54 +0000 (16:49 -0800)
diff --git a/dump.c b/dump.c

index b383f4359083cd6e1bef39c075947e02c324dfb6..a52cafddf0d414cb06ddd8464c9795949b91e2d9 100644 (file)
--- a/dump.c
+++ b/dump.c
@@ -265,22 +265,25 @@ hex_dump(int ofs, int n)
  }
  
  static void
-dump_style(void)
+dump_style(FILE *stream)
  {
    match_byte(1);
    match_byte(0);
    match_byte(0);
    match_byte(0);
    match_byte_assert(1);
-  get_string();     /* foreground */
-  get_string();     /* background */
-  get_string();     /* font */
+  char *fg = get_string();     /* foreground */
+  char *bg = get_string();     /* background */
+  char *font = get_string();     /* font */
+  int size = data[pos];
    if (!match_byte(14))
      match_byte_assert(12); /* size? */
+  fprintf(stream, " fgcolor=\"%s\" bgcolor=\"%s\" font=\"%s\" size=\"%d\"",
+          fg, bg, font, size);
  }
  
  static char *
-dump_nested_string(void)
+dump_nested_string(FILE *stream)
  {
    char *s = NULL;
  
@@ -288,8 +291,10 @@ dump_nested_string(void)
    match_byte_assert (0);
    int outer_end = get_end();
    s = dump_counted_string();
+  if (s)
+    fprintf(stream, " \"%s\"", s);
    if (match_byte(0x31))
-    dump_style();
+    dump_style(stream);
    else
      match_byte_assert(0x58);
    match_byte_assert(0x58);
@@ -303,20 +308,23 @@ dump_nested_string(void)
  }
  
  static void
-dump_optional_value(FILE *stream)
+dump_value_modifier(FILE *stream)
  {
    if (match_byte (0x31))
      {
        if (match_u32 (0))
          {
+          fprintf(stream, "<special0");
            if (match_u32 (1))
              {
                /* Corpus frequencies:
                   124 "a"
                   12 "b"
                   8 "a, b"
+
+                 The given text is appended to the cell in a subscript font.
                */
-              get_string();
+              fprintf(stream, " subscript=\"%s\"", get_string());
              }
            else
              match_u32_assert (0);
@@ -334,6 +342,7 @@ dump_optional_value(FILE *stream)
                  match_u32_assert(10);
                match_byte(0);
                match_byte(0);
+              fprintf(stream, "/>\n");
                return;
              }
  
@@ -341,10 +350,12 @@ dump_optional_value(FILE *stream)
            
            /* This counted-string appears to be a template string,
               e.g. "Design\: [:^1:]1 Within Subjects Design\: [:^1:]2". */
-          dump_counted_string();
+          char *template = dump_counted_string();
+          if (template)
+            fprintf(stream, " template=\"%s\"", template);
  
            if (match_byte(0x31))
-            dump_style();
+            dump_style(stream);
            else
              match_byte_assert(0x58);
            if (match_byte(0x31))
@@ -372,32 +383,36 @@ dump_optional_value(FILE *stream)
                fprintf(stderr, "outer end discrepancy\n");
                exit(1);
              }
+          fprintf(stream, "/>\n");
          }
        else if (match_u32 (1))
          {
-          fprintf(stream, "(footnote %d) ", get_u32());
-          dump_nested_string();
+          fprintf(stream, "<footnote-ref index=%d", get_u32());
+          dump_nested_string(stream);
+          fprintf(stream, "/>\n");
          }
        else if (match_u32 (2))
          {
-          fprintf(stream, "(special 2)");
+          fprintf(stream, "<special2 %d %d", data[pos], data[pos + 2]);
            if (!match_byte(0) && !match_byte(1))
              match_byte_assert(2);
            match_byte_assert(0);
            if (!match_u32 (2) && !match_u32(1))
              match_u32_assert(3);
-          dump_nested_string();
+          dump_nested_string(stream);
+          fprintf(stream, "/>\n");
          }
        else
          {
+          fprintf(stream, "<special3");
            match_u32_assert(3);
-          fprintf(stream, "(special 3)");
            match_byte_assert(0);
            match_byte_assert(0);
            match_byte_assert(1);
            match_byte_assert(0);
            match_u32_assert(2);
-          dump_nested_string(); /* Our corpus doesn't contain any examples with strings though. */
+          dump_nested_string(stream); /* Our corpus doesn't contain any examples with strings though. */
+          fprintf(stream, "/>\n");
          }
      }
    else
@@ -468,7 +483,7 @@ dump_value(FILE *stream, int level)
        unsigned int format;
        double value;
  
-      dump_optional_value(stream);
+      dump_value_modifier(stream);
        format = get_u32 ();
        value = get_double ();
        fprintf (stream, "<number value=\"%.*g\" format=\"%s%d.%d\"/>\n",
@@ -480,7 +495,7 @@ dump_value(FILE *stream, int level)
        char *var, *vallab;
        double value;
  
-      dump_optional_value (stream);
+      dump_value_modifier (stream);
        format = get_u32 ();
        value = get_double ();
        var = get_string ();
@@ -498,7 +513,7 @@ dump_value(FILE *stream, int level)
    else if (match_byte (3))
      {
        char *text =  get_string();
-      dump_optional_value(stream);
+      dump_value_modifier(stream);
        char *identifier = get_string();
        char *text_eng = get_string();
        fprintf (stream, "<string c=\"%s\"", text_eng);
@@ -515,7 +530,7 @@ dump_value(FILE *stream, int level)
        unsigned int format;
        char *var, *vallab, *value;
  
-      dump_optional_value(stream);
+      dump_value_modifier(stream);
        format = get_u32 ();
        vallab = get_string ();
        var = get_string ();
@@ -532,7 +547,7 @@ dump_value(FILE *stream, int level)
      }
    else if (match_byte (5))
      {
-      dump_optional_value(stream);
+      dump_value_modifier(stream);
        char *name = get_string ();
        char *label = get_string ();
        fprintf (stream, "<variable name=\"%s\"", name);
@@ -544,7 +559,7 @@ dump_value(FILE *stream, int level)
      }
    else
      {
-      dump_optional_value(stream);
+      dump_value_modifier(stream);
  
        char *base = get_string();
        int x = get_u32();
@@ -1130,7 +1145,7 @@ main(int argc, char *argv[])
  
        /* This might be a version number of some kind, because value 1 seems
           to only appear in an SPV file that also required its own weird
-         special cases in dump_optional_value(). */
+         special cases in dump_value_modifier(). */
        version = get_u32();
        pos -= 4;
        if (!match_u32(1))
diff --git a/notes b/notes

index c0f2f81d96079321f5f1cf0af2a35dcd2b3fda6e..811a7a1177f41660ecab599a2a75b898ed08c04d 100644 (file)
--- a/notes
+++ b/notes
@@ -2662,11 +2662,22 @@ Substitutions
  -------------
  
  The primary format specifier for substitution 1 is ^1.
-Substitutions with multiple values take the form [:^1:]1.  The : are mandatory even though not apparently useful.
+
+Substitutions with multiple values take the form [:^1:]1, where ]1
+references substitution 1 and ^1 refers to the first value within the
+substitution.  The : are mandatory even though not apparently useful.
+
+There can be extra text, e.g. [:^1\n:]1.
+
  Substitutions with two forms for multiple values are written as [%1:, ^1:]1.
+
+The general form appears to be [a:b:]x.  Apparently, % is used in a and
+^ is used in b, but there's no obvious reason why.
+
  Substitutions can take two values at a time, like: [:^1 = ^2:]1
  but more often there would be two forms: [%1 = %2:, ^1 = ^2:]1.
-
+A literal : is written \: (backslash, colon).
+Presumably a literal ^ is written \^, but there are no examples.
  
  WM is constant when Age = 19.00. It will be included in any boxplots produced but other output will be omitted.
      <template format="^1 is constant when [%1 = %2:, ^1 = ^2:]2. It will be included in any boxplots produced but other output will be omitted.">
diff --git a/spv-file-format.texi b/spv-file-format.texi

index 82367ddef40185f38a5d6a6ff1fc017cdc5d72d3..cb2cae75b42a893f51167a8ffae83aa93c839845 100644 (file)
--- a/spv-file-format.texi
+++ b/spv-file-format.texi
@@ -621,32 +621,35 @@ index @math{5 \times (4 \times (3 \times 0 + 1) + 2) + 3 = 33}.
  
  @example
  value := 00? 00? 00? 00? raw-value
-raw-value := 01 opt-value int32[format] double[x]
-           | 02 opt-value int32[format] double[x]
-             string[varname] string[vallab] (01 | 02 | 03)
-           | 03 string[local] opt-value string[id] string[c] (00 | 01)[type]
-           | 04 opt-value int32[format] string[vallab] string[varname]
-             (01 | 02 | 03) string[vallab]
-           | 05 opt-value string[varname] string[varlabel] (01 | 02 | 03)
-           | opt-value string[format] int32[n-substs] substitution*[n-substs]
-substitution := i0 value
-              | int32[x] value*[x + 1]      /* @r{x > 0} */
-opt-value := 31 i0 (i0 | i1 string) opt-value-i0-v1        /* @r{version 1} */
-           | 31 i0 (i0 | i1 string) opt-value-i0-v3        /* @r{version 3} */
-           | 31 i1 int32[footnote-number] nested-string
-           | 31 i2 (00 | 01 | 02) 00 (i1 | i2 | i3) nested-string
-           | 31 i3 00 00 01 00 i2 nested-string
-           | 58
-opt-value-i0-v1 := 00 (i1 | i2) 00 00 int32 00 00
-opt-value-i0-v3 := count(counted-string
+raw-value :=
+    01 value-mod int32[format] double[x]
+  | 02 value-mod int32[format] double[x]
+    string[varname] string[vallab] (01 | 02 | 03)
+  | 03 string[local] value-mod string[id] string[c] (00 | 01)[type]
+  | 04 value-mod int32[format] string[vallab] string[varname]
+    (01 | 02 | 03) string[s]
+  | 05 value-mod string[varname] string[varlabel] (01 | 02 | 03)
+  | value-mod string[format] int32[n-substs] substitution*[n-substs]
+substitution :=
+    i0 value
+  | int32[x] value*[x + 1]      /* @r{x > 0} */
+value-mod :=
+    31 i0 (i0 | i1 string[subscript]) value-mod-i0-v1 /* @r{version 1} */
+  | 31 i0 (i0 | i1 string[subscript]) value-mod-i0-v3 /* @r{version 3} */
+  | 31 i1 int32[footnote-number] template
+  | 31 i2 (00 | 01 | 02) 00 (i1 | i2 | i3) template
+  | 31 i3 00 00 01 00 i2 template
+  | 58
+value-mod-i0-v1 := 00 (i1 | i2) 00 00 int32 00 00
+value-mod-i0-v3 := count(template-string
                           (58 | 31 style)
                           (58
                            | 31 i0 i0 i0 i0 01 00 (01 | 02 | 08)
                              00 08 00 0a 00))
  
  style := 01? 00? 00? 00? 01 string[fgcolor] string[bgcolor] string[font] byte
-nested-string := 00 00 count(counted-string (58 | 31 style) 58)
-counted-string := count((i0 (58 | 31 string))?)
+template := 00 00 count(template-string (58 | 31 style) 58)
+template-string := count((i0 (58 | 31 string))?)
  @end example
  
  A @code{value} boils down to a number or a string.  There are several
@@ -675,14 +678,16 @@ The meaning of the final byte is unknown.  Possibly it is connected to
  whether the value or the label should be displayed.
  
  @item 03
-A text string that originates from the software program (rather than
-from user data).  The string is provided in two forms: @code{c} is in
-English and @code{local} is localized to the user's language
-environment.  In an English-language locale, the two strings are often
-the same, and in cases where they differ @code{c} is often abbreviated
-or obscure and @code{local} is more appropriate for a user interface,
-e.g.@: @code{c} of ``Not a PxP table for MCN...'' versus @code{local}
-of ``Computed only for a PxP table, where P must be greater than 1.''
+A text string, in two forms: @code{c} is in English, and sometimes
+abbreviated or obscure, and @code{local} is localized to the user's
+locale.  In an English-language locale, the two strings are often the
+same, and in the cases where they differ, @code{local} is more
+appropriate for a user interface, e.g.@: @code{c} of ``Not a PxP table
+for MCN...'' versus @code{local} of ``Computed only for a PxP table,
+where P must be greater than 1.''
+
+@code{c} and @code{local} are always either both empty or both
+nonempty.
  
  @code{id} is a brief identifying string whose form seems to resemble a
  programming language identifier, e.g.@: @code{cumulative_percent} or
@@ -690,6 +695,25 @@ programming language identifier, e.g.@: @code{cumulative_percent} or
  
  @code{type} is 00 for text taken from user input, such as syntax
  fragments, expressions, file names, data set names, and 01 for fixed
-text strings such as names of procedures or statistics.
+text strings such as names of procedures or statistics.  In the former
+case, @code{id} is always the empty string; in the latter case,
+@code{id} is still sometimes empty.
  
  @item 04
+The string value @code{s}, presented to the user formatted according
+to @code{format}.  The format for a string is not too interesting, and
+clearly invalid formats like A16.39 or A255.127 or A134.1 abound in
+the corpus, so readers should probably ignore the format entirely.
+
+@code{s} is a value of variable @code{varname} and has value label
+@code{vallab}.  @code{varname} is never empty but @code{vallab} is
+commonly empty.
+
+The meaning of the final byte is unknown.
+
+@item 05
+Variable @code{varname}, which is rarely observed as empty in the
+corpus, with variable label @code{varlabel}, which is often empty.
+
+The meaning of the final byte is unknown.
+@end itemize
author	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 10 Jan 2016 00:49:54 +0000 (16:49 -0800)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 10 Jan 2016 00:49:54 +0000 (16:49 -0800)
dump.c		patch | blob | history
notes		patch | blob | history
spv-file-format.texi		patch | blob | history