work

[pspp] / doc / dev / system-file-format.texi
diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi

index 116fa6f787445fb5931e9ae2dafe87eb97782988..ba390c1c0243b7e3f32facffb712bc6f936ca6b3 100644 (file)
--- a/doc/dev/system-file-format.texi
+++ b/doc/dev/system-file-format.texi
@@ -1,12 +1,25 @@
+@c PSPP - a program for statistical analysis.
+@c Copyright (C) 2019 Free Software Foundation, Inc.
+@c Permission is granted to copy, distribute and/or modify this document
+@c under the terms of the GNU Free Documentation License, Version 1.3
+@c or any later version published by the Free Software Foundation;
+@c with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
+@c A copy of the license is included in the section entitled "GNU
+@c Free Documentation License".
+@c
+
  @node System File Format
  @node System File Format
-@appendix System File Format
+@chapter System File Format
  
  
-A system file encapsulates a set of cases and dictionary information
-that describes how they may be interpreted.  This chapter describes
-the format of a system file.
+An SPSS system file holds a set of cases and dictionary information
+that describes how they may be interpreted.  The system file format
+dates back 40+ years and has evolved greatly over that time to support
+new features, but in a way that facilitates interchange between even
+the oldest and newest versions of software.  This chapter describes the
+system file format.
  
  System files use four data types: 8-bit characters, 32-bit integers,
  
  System files use four data types: 8-bit characters, 32-bit integers,
-64-bit integers, 
+64-bit integers,
  and 64-bit floating points, called here @code{char}, @code{int32},
  @code{int64}, and
  @code{flt64}, respectively.  Data is not necessarily aligned on a word
  and 64-bit floating points, called here @code{char}, @code{int32},
  @code{int64}, and
  @code{flt64}, respectively.  Data is not necessarily aligned on a word
@@ -79,6 +92,7 @@ possible to artificially synthesize files that use different encodings
  * Multiple Response Sets Records::
  * Extra Product Info Record::
  * Variable Display Parameter Record::
  * Multiple Response Sets Records::
  * Extra Product Info Record::
  * Variable Display Parameter Record::
+* Variable Sets Record::
  * Long Variable Names Record::
  * Very Long String Record::
  * Character Encoding Record::
  * Long Variable Names Record::
  * Very Long String Record::
  * Character Encoding Record::
@@ -316,10 +330,10 @@ so readers should take care to parse dummy variable records in the
  same way as other variable records.
  
  @anchor{Dictionary Index}
  same way as other variable records.
  
  @anchor{Dictionary Index}
-The @dfn{dictionary index} of a variable is its offset in the set of
+The @dfn{dictionary index} of a variable is a 1-based offset in the set of
  variable records, including dummy variable records for long string
  variable records, including dummy variable records for long string
-variables.  The first variable record has a dictionary index of 0, the
-second has a dictionary index of 1, and so on.
+variables.  The first variable record has a dictionary index of 1, the
+second has a dictionary index of 2, and so on.
  
  The system file format does not directly support string variables
  wider than 255 bytes.  Such very long string variables are represented
  
  The system file format does not directly support string variables
  wider than 255 bytes.  Such very long string variables are represented
@@ -514,6 +528,10 @@ Format types are defined as follows:
  @tab @code{EDATE}
  @item 39
  @tab @code{SDATE}
  @tab @code{EDATE}
  @item 39
  @tab @code{SDATE}
+@item 40
+@tab @code{MTIME}
+@item 41
+@tab @code{YMDHMS}
  @end multitable
  @end quotation
  
  @end multitable
  @end quotation
  
@@ -544,7 +562,7 @@ The value label record has the following format:
  int32               rec_type;
  int32               label_count;
  
  int32               rec_type;
  int32               label_count;
  
-/* @r{Repeated @code{label_cnt} times}. */
+/* @r{Repeated @code{label_count} times}. */
  char                value[8];
  char                label_len;
  char                label[];
  char                value[8];
  char                label_len;
  char                label[];
@@ -596,7 +614,7 @@ Number of variables that the associated value labels from the value
  label record are to be applied.
  
  @item int32 vars[];
  label record are to be applied.
  
  @item int32 vars[];
-A list of dictionary indexes of variables to which to apply the value
+A list of 1-based dictionary indexes of variables to which to apply the value
  labels (@pxref{Dictionary Index}).  There are @code{var_count}
  elements.
  
  labels (@pxref{Dictionary Index}).  There are @code{var_count}
  elements.
  
@@ -987,18 +1005,24 @@ members are as follows:
  
  @table @code
  @item int32 measure;
  
  @table @code
  @item int32 measure;
-The measurement type of the variable:
+The measurement level of the variable:
  @table @asis
  @table @asis
+@item 0
+Unknown
  @item 1
  @item 1
-Nominal Scale
+Nominal
  @item 2
  @item 2
-Ordinal Scale
+Ordinal
  @item 3
  @item 3
-Continuous Scale
+Scale
  @end table
  
  @end table
  
-SPSS sometimes writes a @code{measure} of 0.  PSPP interprets this as
-nominal scale.
+An ``unknown'' @code{measure} of 0 means that the variable was created
+in some way that doesn't make the measurement level clear, e.g.@: with
+a @code{COMPUTE} transformation.  PSPP sets the measurement level the
+first time it reads the data using the rules documented in
+@ref{Measurement Level,,,pspp, PSPP Users Guide}, so this should
+rarely appear.
  
  @item int32 width;
  The width of the display column for the variable in characters.
  
  @item int32 width;
  The width of the display column for the variable in characters.
@@ -1020,6 +1044,54 @@ Centre aligned
  @end table
  @end table
  
  @end table
  @end table
  
+@node Variable Sets Record
+@section Variable Sets Record
+
+The SPSS GUI offers users the ability to arrange variables in sets.
+Users may enable and disable sets individually, and the data editor
+and analysis dialog boxes only show enabled sets.  Syntax does not use
+variable sets.
+
+The variable sets record, if present, has the following format:
+
+@example
+/* @r{Header.} */
+int32               rec_type;
+int32               subtype;
+int32               size;
+int32               count;
+
+/* @r{Exactly @code{count} bytes of text.} */
+char                text[];
+@end example
+
+@table @code
+@item int32 rec_type;
+Record type.  Always set to 7.
+
+@item int32 subtype;
+Record subtype.  Always set to 5.
+
+@item int32 size;
+Always set to 1.
+
+@item int32 count;
+The total number of bytes in @code{text}.
+
+@item char text[];
+The variable sets, in a text-based format.
+
+Each variable set occupies one line of text, each of which ends with a
+line feed (byte 0x0a), optionally preceded by a carriage return (byte
+0x0d).
+
+Each line begins with the name of the variable set, followed by an
+equals sign (@samp{=}) and a space (byte 0x20), followed by the long
+variable names of the members of the set, separated by spaces.  A
+variable set may be empty, in which case the equals sign and the space
+following it are still present.
+@end table
+
  @node Long Variable Names Record
  @section Long Variable Names Record
  
  @node Long Variable Names Record
  @section Long Variable Names Record
  
@@ -1292,7 +1364,8 @@ int32               count;
  int32               var_name_len;
  char                var_name[];
  char                n_missing_values;
  int32               var_name_len;
  char                var_name[];
  char                n_missing_values;
-long_string_missing_value   values[];
+int32               value_len;
+char                values[value_len * n_missing_values];
  @end example
  
  @table @code
  @end example
  
  @table @code
@@ -1319,30 +1392,25 @@ any particular boundary, nor is it null-terminated.
  The number of missing values, either 1, 2, or 3.  (This is, unusually,
  a single byte instead of a 32-bit number.)
  
  The number of missing values, either 1, 2, or 3.  (This is, unusually,
  a single byte instead of a 32-bit number.)
  
-@item long_string_missing_value values[];
-The missing values themselves.  This array contains exactly
-@code{n_missing_values} elements, each of which has the following
-substructure:
-
-@example
-int32               value_len;
-char                value[];
-@end example
-
-@table @code
  @item int32 value_len;
  @item int32 value_len;
-The length of the missing value string, in bytes.  This value should
+The length of each missing value string, in bytes.  This value should
  be 8, because long string variables are at least 8 bytes wide (by
  definition), only the first 8 bytes of a long string variable's
  missing values are allowed to be non-spaces, and any spaces within the
  first 8 bytes are included in the missing value here.
  
  be 8, because long string variables are at least 8 bytes wide (by
  definition), only the first 8 bytes of a long string variable's
  missing values are allowed to be non-spaces, and any spaces within the
  first 8 bytes are included in the missing value here.
  
-@item char value[];
-The missing value string, exactly @code{value_len} bytes, without
-any padding or null terminator.
-@end table
+@item char values[value_len * n_missing_values];
+The missing values themselves, without any padding or null
+terminators.
  @end table
  
  @end table
  
+An earlier version of this document stated that @code{value_len} was
+repeated before each of the missing values, so that there was an extra
+@code{int32} value of 8 before each missing value after the first.
+Old versions of PSPP wrote data files in this format.  Readers can
+tolerate this mistake, if they wish, by noticing and skipping the
+extra @code{int32} values, which wouldn't ordinarily occur in strings.
+
  @node Data File and Variable Attributes Records
  @section Data File and Variable Attributes Records
  
  @node Data File and Variable Attributes Records
  @section Data File and Variable Attributes Records
  
@@ -1503,9 +1571,6 @@ The following extension record subtypes have also been observed, with
  the following believed meanings:
  
  @table @asis
  the following believed meanings:
  
  @table @asis
-@item 5
-A set of grouped variables (according to Aapi H@"am@"al@"ainen).
-
  @item 6
  Date info, probably related to USE (according to Aapi H@"am@"al@"ainen).
  
  @item 6
  Date info, probably related to USE (according to Aapi H@"am@"al@"ainen).
  
@@ -1637,7 +1702,7 @@ The number of bytes in the ZLIB data trailer.  This and the previous
  field sum to the size of the system file in bytes.
  @end table
  
  field sum to the size of the system file in bytes.
  @end table
  
-The data header is followed by @code{(ztrailer_ofs - 24) / 24} ZLIB
+The data header is followed by @code{(ztrailer_len - 24) / 24} ZLIB
  compressed data blocks.  Each ZLIB compressed data block begins with a
  ZLIB header as specified in RFC@tie{}1950, e.g.@: hex bytes @code{78
  01} (the only header yet observed in practice).  Each block
  compressed data blocks.  Each ZLIB compressed data block begins with a
  ZLIB header as specified in RFC@tie{}1950, e.g.@: hex bytes @code{78
  01} (the only header yet observed in practice).  Each block
@@ -1674,7 +1739,7 @@ been observed so far.
  
  @item int32 n_blocks;
  The number of ZLIB compressed data blocks, always exactly
  
  @item int32 n_blocks;
  The number of ZLIB compressed data blocks, always exactly
-@code{(ztrailer_ofs - 24) / 24}.
+@code{(ztrailer_len - 24) / 24}.
  @end table
  
  The fixed header is followed by @code{n_blocks} 24-byte ZLIB data
  @end table
  
  The fixed header is followed by @code{n_blocks} 24-byte ZLIB data