work

[pspp] / doc / dev / system-file-format.texi
diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi

index 116fa6f787445fb5931e9ae2dafe87eb97782988..ba390c1c0243b7e3f32facffb712bc6f936ca6b3 100644 (file)
--- a/doc/dev/system-file-format.texi
+++ b/doc/dev/system-file-format.texi
@@ -1,12 +1,25 @@
+@c PSPP - a program for statistical analysis.
+@c Copyright (C) 2019 Free Software Foundation, Inc.
+@c Permission is granted to copy, distribute and/or modify this document
+@c under the terms of the GNU Free Documentation License, Version 1.3
+@c or any later version published by the Free Software Foundation;
+@c with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
+@c A copy of the license is included in the section entitled "GNU
+@c Free Documentation License".
+@c
+
  @node System File Format
-@appendix System File Format
+@chapter System File Format
  
-A system file encapsulates a set of cases and dictionary information
-that describes how they may be interpreted.  This chapter describes
-the format of a system file.
+An SPSS system file holds a set of cases and dictionary information
+that describes how they may be interpreted.  The system file format
+dates back 40+ years and has evolved greatly over that time to support
+new features, but in a way that facilitates interchange between even the
+oldest and newest versions of software.  This chapter describes the
+system file format.
  
  System files use four data types: 8-bit characters, 32-bit integers,
-64-bit integers, 
+64-bit integers,
  and 64-bit floating points, called here @code{char}, @code{int32},
  @code{int64}, and
  @code{flt64}, respectively.  Data is not necessarily aligned on a word
@@ -79,6 +92,7 @@ possible to artificially synthesize files that use different encodings
  * Multiple Response Sets Records::
  * Extra Product Info Record::
  * Variable Display Parameter Record::
+* Variable Sets Record::
  * Long Variable Names Record::
  * Very Long String Record::
  * Character Encoding Record::
@@ -316,10 +330,10 @@ so readers should take care to parse dummy variable records in the
  same way as other variable records.
  
  @anchor{Dictionary Index}
-The @dfn{dictionary index} of a variable is its offset in the set of
+The @dfn{dictionary index} of a variable is a 1-based offset in the set of
  variable records, including dummy variable records for long string
-variables.  The first variable record has a dictionary index of 0, the
-second has a dictionary index of 1, and so on.
+variables.  The first variable record has a dictionary index of 1, the
+second has a dictionary index of 2, and so on.
  
  The system file format does not directly support string variables
  wider than 255 bytes.  Such very long string variables are represented
@@ -514,6 +528,10 @@ Format types are defined as follows:
  @tab @code{EDATE}
  @item 39
  @tab @code{SDATE}
+@item 40
+@tab @code{MTIME}
+@item 41
+@tab @code{YMDHMS}
  @end multitable
  @end quotation
  
@@ -544,7 +562,7 @@ The value label record has the following format:
  int32               rec_type;
  int32               label_count;
  
-/* @r{Repeated @code{label_cnt} times}. */
+/* @r{Repeated @code{label_count} times}. */
  char                value[8];
  char                label_len;
  char                label[];
@@ -596,7 +614,7 @@ Number of variables that the associated value labels from the value
  label record are to be applied.
  
  @item int32 vars[];
-A list of dictionary indexes of variables to which to apply the value
+A list of 1-based dictionary indexes of variables to which to apply the value
  labels (@pxref{Dictionary Index}).  There are @code{var_count}
  elements.
  
@@ -987,18 +1005,24 @@ members are as follows:
  
  @table @code
  @item int32 measure;
-The measurement type of the variable:
+The measurement level of the variable:
  @table @asis
+@item 0
+Unknown
  @item 1
-Nominal Scale
+Nominal
  @item 2
-Ordinal Scale
+Ordinal
  @item 3
-Continuous Scale
+Scale
  @end table
  
-SPSS sometimes writes a @code{measure} of 0.  PSPP interprets this as
-nominal scale.
+An ``unknown'' @code{measure} of 0 means that the variable was created
+in some way that doesn't make the measurement level clear, e.g.@: with
+a @code{COMPUTE} transformation.  PSPP sets the measurement level the
+first time it reads the data using the rules documented in
+@ref{Measurement Level,,,pspp, PSPP Users Guide}, so this should
+rarely appear.
  
  @item int32 width;
  The width of the display column for the variable in characters.
@@ -1020,6 +1044,54 @@ Centre aligned
  @end table
  @end table
  
+@node Variable Sets Record
+@section Variable Sets Record
+
+The SPSS GUI offers users the ability to arrange variables in sets.
+Users may enable and disable sets individually, and the data editor
+and analysis dialog boxes only show enabled sets.  Syntax does not use
+variable sets.
+
+The variable sets record, if present, has the following format:
+
+@example
+/* @r{Header.} */
+int32               rec_type;
+int32               subtype;
+int32               size;
+int32               count;
+
+/* @r{Exactly @code{count} bytes of text.} */
+char                text[];
+@end example
+
+@table @code
+@item int32 rec_type;
+Record type.  Always set to 7.
+
+@item int32 subtype;
+Record subtype.  Always set to 5.
+
+@item int32 size;
+Always set to 1.
+
+@item int32 count;
+The total number of bytes in @code{text}.
+
+@item char text[];
+The variable sets, in a text-based format.
+
+Each variable set occupies one line of text, each of which ends with a
+line feed (byte 0x0a), optionally preceded by a carriage return (byte
+0x0d).
+
+Each line begins with the name of the variable set, followed by an
+equals sign (@samp{=}) and a space (byte 0x20), followed by the long
+variable names of the members of the set, separated by spaces.  A
+variable set may be empty, in which case the equals sign and the space
+following it are still present.
+@end table
+
  @node Long Variable Names Record
  @section Long Variable Names Record
  
@@ -1292,7 +1364,8 @@ int32               count;
  int32               var_name_len;
  char                var_name[];
  char                n_missing_values;
-long_string_missing_value   values[];
+int32               value_len;
+char                values[value_len * n_missing_values];
  @end example
  
  @table @code
@@ -1319,30 +1392,25 @@ any particular boundary, nor is it null-terminated.
  The number of missing values, either 1, 2, or 3.  (This is, unusually,
  a single byte instead of a 32-bit number.)
  
-@item long_string_missing_value values[];
-The missing values themselves.  This array contains exactly
-@code{n_missing_values} elements, each of which has the following
-substructure:
-
-@example
-int32               value_len;
-char                value[];
-@end example
-
-@table @code
  @item int32 value_len;
-The length of the missing value string, in bytes.  This value should
+The length of each missing value string, in bytes.  This value should
  be 8, because long string variables are at least 8 bytes wide (by
  definition), only the first 8 bytes of a long string variable's
  missing values are allowed to be non-spaces, and any spaces within the
  first 8 bytes are included in the missing value here.
  
-@item char value[];
-The missing value string, exactly @code{value_len} bytes, without
-any padding or null terminator.
-@end table
+@item char values[value_len * n_missing_values];
+The missing values themselves, without any padding or null
+terminators.
  @end table
  
+An earlier version of this document stated that @code{value_len} was
+repeated before each of the missing values, so that there was an extra
+@code{int32} value of 8 before each missing value after the first.
+Old versions of PSPP wrote data files in this format.  Readers can
+tolerate this mistake, if they wish, by noticing and skipping the
+extra @code{int32} values, which wouldn't ordinarily occur in strings.
+
  @node Data File and Variable Attributes Records
  @section Data File and Variable Attributes Records
  
@@ -1503,9 +1571,6 @@ The following extension record subtypes have also been observed, with
  the following believed meanings:
  
  @table @asis
-@item 5
-A set of grouped variables (according to Aapi H@"am@"al@"ainen).
-
  @item 6
  Date info, probably related to USE (according to Aapi H@"am@"al@"ainen).
  
@@ -1637,7 +1702,7 @@ The number of bytes in the ZLIB data trailer.  This and the previous
  field sum to the size of the system file in bytes.
  @end table
  
-The data header is followed by @code{(ztrailer_ofs - 24) / 24} ZLIB
+The data header is followed by @code{(ztrailer_len - 24) / 24} ZLIB
  compressed data blocks.  Each ZLIB compressed data block begins with a
  ZLIB header as specified in RFC@tie{}1950, e.g.@: hex bytes @code{78
  01} (the only header yet observed in practice).  Each block
@@ -1674,7 +1739,7 @@ been observed so far.
  
  @item int32 n_blocks;
  The number of ZLIB compressed data blocks, always exactly
-@code{(ztrailer_ofs - 24) / 24}.
+@code{(ztrailer_len - 24) / 24}.
  @end table
  
  The fixed header is followed by @code{n_blocks} 24-byte ZLIB data