rust: Add support for portable files.
authorBen Pfaff <blp@cs.stanford.edu>
Wed, 17 Sep 2025 15:17:38 +0000 (08:17 -0700)
committerBen Pfaff <blp@cs.stanford.edu>
Wed, 17 Sep 2025 15:27:49 +0000 (08:27 -0700)
38 files changed:
rust/Cargo.lock
rust/doc/src/SUMMARY.md
rust/doc/src/commands/export.md
rust/doc/src/commands/import.md
rust/doc/src/invoking/pspp-convert.md
rust/doc/src/invoking/pspp-show-por.md [new file with mode: 0644]
rust/doc/src/invoking/pspp-show.md
rust/doc/src/portable.md
rust/doc/src/spv/index.md
rust/pspp/Cargo.toml
rust/pspp/src/convert.rs
rust/pspp/src/data.rs
rust/pspp/src/file.rs [new file with mode: 0644]
rust/pspp/src/file/testdata/test-encoding.sps [new file with mode: 0644]
rust/pspp/src/file/testdata/test-encrypted.sav [new file with mode: 0644]
rust/pspp/src/file/testdata/test-encrypted.spv [new file with mode: 0644]
rust/pspp/src/file/testdata/test.por [new file with mode: 0644]
rust/pspp/src/file/testdata/test.sav [new file with mode: 0644]
rust/pspp/src/file/testdata/test.sps [new file with mode: 0644]
rust/pspp/src/file/testdata/test.spv [new file with mode: 0644]
rust/pspp/src/format.rs
rust/pspp/src/lib.rs
rust/pspp/src/main.rs
rust/pspp/src/output/pivot.rs
rust/pspp/src/por.rs [new file with mode: 0644]
rust/pspp/src/por/read.rs [new file with mode: 0644]
rust/pspp/src/por/testdata/README.md [new file with mode: 0644]
rust/pspp/src/por/testdata/test1.expected [new file with mode: 0644]
rust/pspp/src/por/testdata/test1.por [new file with mode: 0644]
rust/pspp/src/por/testdata/test2.expected [new file with mode: 0644]
rust/pspp/src/por/testdata/test2.por [new file with mode: 0644]
rust/pspp/src/por/write.rs [new file with mode: 0644]
rust/pspp/src/show.rs
rust/pspp/src/show_por.rs [new file with mode: 0644]
rust/pspp/src/sys.rs
rust/pspp/src/sys/cooked.rs
rust/pspp/src/sys/write.rs
rust/pspp/src/variable.rs

index b98c7354b24fe899d7d3ae3f325150ec7c6788e1..a0088668770939beb0395da9bfeed7c4412b94c7 100644 (file)
@@ -403,6 +403,15 @@ dependencies = [
  "digest",
 ]
 
+[[package]]
+name = "codepage-437"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e40c1169585d8d08e5675a39f2fc056cd19a258fc4cba5e3bbf4a9c1026de535"
+dependencies = [
+ "csv",
+]
+
 [[package]]
 name = "color"
 version = "0.2.4"
@@ -1626,10 +1635,12 @@ dependencies = [
  "chrono",
  "clap",
  "cmac",
+ "codepage-437",
  "color",
  "csv",
  "derive_more",
  "diff",
+ "displaydoc",
  "either",
  "encoding_rs",
  "enum-iterator",
index 5fde2b517dc7425ce5589fa58f60fbb46562200a..1d1c14d2e6918af300a0fc20997e3437edbfbca3 100644 (file)
@@ -4,9 +4,10 @@
 [License](license.md)
 
 - [Running PSPP](invoking/index.md)
-  - [Converting data](invoking/pspp-convert.md)
-  - [Inspecting data](invoking/pspp-show.md)
-  - [Decrypting files](invoking/pspp-decrypt.md)
+  - [Converting Data](invoking/pspp-convert.md)
+  - [Inspecting `.sav` Data](invoking/pspp-show.md)
+  - [Inspecting `.por` Data](invoking/pspp-show-por.md)
+  - [Decrypting Files](invoking/pspp-decrypt.md)
 
 # Language Overview
 
index a704a059a1843b9ed3d610b7e5a1139fc2810743..4376bdf797cbd16126eb0ad7fe3cbf8da152aea5 100644 (file)
@@ -15,6 +15,9 @@ EXPORT
    The `EXPORT` procedure writes the active dataset's dictionary and
 data to a specified portable file.
 
+> `EXPORT` is obsolete and retained only for compatibility.  New
+> syntax should use [`SAVE`](save.md) instead.
+
    `UNSELECTED` controls whether cases excluded with
 [`FILTER`](filter.md) are written to the file.  These can
 be excluded by specifying `DELETE` on the `UNSELECTED` subcommand.
index c183f2cdfb8294463d90bd1bbba566750acf7df0..6cd133a9bf51a4ae4605e1297edb1a87787059a8 100644 (file)
@@ -13,10 +13,13 @@ The `IMPORT` transformation clears the active dataset dictionary and
 data and replaces them with a dictionary and data from a system file or
 portable file.
 
+> `IMPORT` is obsolete and retained only for compatibility with
+> existing portable files.  New syntax should use [`SAVE`](save.md) to
+> write system files instead, and [`GET`](get.md) to read them.
+
 The `FILE` subcommand, which is the only required subcommand,
 specifies the portable file to be read as a file name string or a
 [file handle](../language/files/file-handles.md).
-[file handle](../language/files/file-handles.md).
 
 The `TYPE` subcommand is currently not used.
 
index a1d33a605b53f131f95bf7343795fd6d5f058e0c..e6c2abd95f590432696f7fcc27ebd07bb183d49f 100644 (file)
@@ -7,9 +7,9 @@ another.  The basic syntax is:
 pspp convert <INPUT> [OUTPUT]
 ```
 
-which reads an SPSS data file from `<INPUT>` and writes a copy of it
-to `[OUTPUT]`.  If `[OUTPUT]` is omitted, output is written to the
-terminal.
+which reads an SPSS system file or portable file from `<INPUT>` and
+writes a copy of it to `[OUTPUT]`.  If `[OUTPUT]` is omitted, output
+is written to the terminal.
 
 If `[OUTPUT]` is specified, then `pspp convert` tries to guess the
 output format based on its extension:
diff --git a/rust/doc/src/invoking/pspp-show-por.md b/rust/doc/src/invoking/pspp-show-por.md
new file mode 100644 (file)
index 0000000..4bd208e
--- /dev/null
@@ -0,0 +1,118 @@
+# Inspecting `.por` files with `pspp show-por`
+
+The `pspp show-por` command reads an SPSS "portable file",
+which usually has a `.por` extension, and produces a report.  The
+basic syntax is:
+
+```
+pspp show-por <MODE> <INPUT> [OUTPUT]
+```
+
+where `<MODE>` is a mode of operation (see below), `<INPUT>` is the
+SPSS portable file to read, and `[OUTPUT]` is the output file name.
+If `[OUTPUT]` is omitted, output is written to the terminal.
+
+> The portable file format is mostly obsolete.  The "system file" or
+> `.sav` format should be used for writing new data files.  Use [`pspp
+> show`](pspp-show.md) to inspect `.sav` files.
+
+The following `<MODE>`s are available:
+
+* `dictionary`: Outputs the file dictionary in detail, including
+  variables, value labels, documents, and so on.  With `--data`, also
+  outputs cases from the portable file.
+
+  This can be useful as an alternative to PSPP syntax commands such as
+  [`DISPLAY DICTIONARY`](../commands/display.md).
+
+  [`pspp convert`](pspp-convert.md) is a better way to convert a
+  portable file to another format.
+
+* `metadata`: Outputs portable file metadata not included in the
+  dictionary:
+
+  - The creation date and time declared inside the file (not in the
+    file system).
+
+  - The name of the product and subproduct that wrote the file, if
+    present.
+
+  - The author of the file, if present.  This is usually the name of
+    the organization that licensed the product that wrote the file.
+
+  - The character set [translation table] embedded in the file, as an
+    array with 256 elements, one for each possible value of a byte in
+    the file.  Each array element gives the byte value as a 2-digit
+    hexadecimal number paired with the translation table's entry for
+    that byte.  Since the file can technically be in any encoding
+    (although [the corpus] universally uses extended ASCII), the entry
+    is given as a character interpreted in two character sets:
+    [windows-1252] and [code page 437], in that order.  (If the two
+    character sets agree on the code point, then it is only given
+    once.)
+
+    For example, consider a portable file's translation table at
+    offset 0x9e, which in the [portable character set] is `±`.
+    Suppose it has value 0xb1, which is `±` in [windows-1252] and `▒`
+    in [code page 437].  Then that array element would be `["9e", "±",
+    "▒"]`.
+
+    [translation table]: ../portable.md#translation-table
+    [the corpus]: ../portable.md#corpus
+    [portable character set]: ../portable.md#theory
+    [windows-1252]: https://en.wikipedia.org/wiki/Windows-1252
+    [code page 437]: https://en.wikipedia.org/wiki/Code_page_437
+
+  This command is most useful with some knowledge of the [portable
+  file format].
+
+  [portable file format]: ../portable.md
+
+* `histogram`: Reports on the usage of characters in the portable
+  file.  Produces output in the form of an array for each possible
+  value of a byte in the file.  Each array element gives the byte
+  value, the byte's character, and the number of times that the byte
+  appears in the file.  A given byte is omitted from the table if it
+  does not appear in the file at all, or if the translation table
+  leaves it unmapped.  It is also omitted if the byte's character is
+  the ISO-8859-1 encoding of the byte (for example, if byte 0x41
+  represents `A`, which is `A` in [ISO-8859-1]).
+
+  This command is most useful with some knowledge of the [portable
+  file format].
+
+  [ISO-8859-1]: https://en.wikipedia.org/wiki/ISO/IEC_8859-1
+
+## Options
+
+The following options affect how `pspp show-por` reads `<INPUT>`:
+
+* `--data [<MAX_CASES>]`  
+  For mode `dictionary`, this instructs `pspp
+  show-por` to read cases from the file.  If `<MAX_CASES>` is given,
+  then that sets a limit on the number of cases to read.  Without this
+  option, PSPP will not read any cases.
+
+The following options affect how `pspp show-por` writes its output:
+
+* `-f <FORMAT>`  
+  `--format <FORMAT>`  
+  Specifies the format to use for output.  `<FORMAT>` may be one of
+  the following:
+
+  - `json`: JSON using indentation and spaces for easy human
+    consumption.
+  - `ndjson`: [Newline-delimited JSON].
+  - `output`: Pivot tables with the PSPP output engine.  Use `-o` for
+    additional configuration.
+  - `discard`: Do not produce any output.
+
+  When these options are not used, the default output format is chosen
+  based on the `[OUTPUT]` extension.  If `[OUTPUT]` is not specified,
+  then output defaults to JSON.
+
+  [Newline-delimited JSON]: https://github.com/ndjson/ndjson-spec
+
+* `-o <OUTPUT_OPTIONS>`  
+  Adds `<OUTPUT_OPTIONS>` to the output engine configuration.
+
index 2356eca2beee4446419564dab177c960c9178203..2b4940193289380547c9f9b9ae38066121e538bb 100644 (file)
@@ -1,7 +1,8 @@
-# Inspecting data files with `pspp show`
+# Inspecting `.sav` files with `pspp show`
 
-The `pspp show` command reads an SPSS data file and produces a report.
-The basic syntax is:
+The `pspp show` command reads an SPSS "system file" or data file,
+which usually has a `.sav` extension, and produces a report.  The
+basic syntax is:
 
 ```
 pspp show <MODE> <INPUT> [OUTPUT]
@@ -43,6 +44,9 @@ The following `<MODE>`s are available:
   investigating cases of system file corruption, especially when the
   character encoding is unknown or uncertain.
 
+  This command is most useful with some knowledge of the [system file
+  format](../system-file.md).
+
 * `decoded`: Outputs the raw structure of the system file dictionary
   and (with `--data`) cases.  Versus `raw`, this command does decode
   the dictionary and data with a particular character encoding, which
@@ -51,6 +55,9 @@ The following `<MODE>`s are available:
   This is useful for debugging how PSPP reads system files and for
   investigating cases of system file corruption.
 
+  This command is most useful with some knowledge of the [system file
+  format](../system-file.md).
+
 ## Options
 
 The following options affect how `pspp show` reads `<INPUT>`:
index 62893c59cc91005b1a50c5d3664257fefb96d360..33f4f4a5736d390b8ec0f38af2af9be467d1240a 100644 (file)
@@ -1,20 +1,67 @@
 # Portable File Format
 
 These days, most computers use the same internal data formats for
-integer and floating-point data, if one ignores little differences like
-big- versus little-endian byte ordering.  However, occasionally it is
-necessary to exchange data between systems with incompatible data
-formats.  This is what portable files are designed to do.
+integer and floating-point data, if one ignores little differences
+like big- versus little-endian byte ordering.  This has not always
+been true, particularly in the 1960s or 1970s, when the portable file
+format originated as a way to exchange data between systems with
+incompatible data formats.
+
+At the time, even bytes being 8 bits each was not a given.  For that
+reason, the portable file format is a text format, because text files
+could be exchanged portably among systems slightly more freely.  On
+the other hand, character encoding was not standardized, so exchanging
+data in portable file format required recoding it from the origin
+system's character encoding to the destination's.
+
+Some contemporary systems represented text files as sequences of
+fixed-length (typically 80-byte) records, without new-line sequences.
+These operating systems padded shorter lines with spaces and
+truncated longer lines.  To tolerate files copied from such systems,
+which might drop spaces at the ends of lines, the portable file format
+treats lines less than 80 bytes long as padded with spaces to that
+length.
+
+The portable file format self-identifies the character encoding on the
+system that produced it at the very beginning, in the
+[splash strings](#splash-strings).  Since portable files are normally
+recoded when they are transported from one system to another, this
+identification can be wrong on its face: a file that was started in
+EBCDIC, and is then recoded to ASCII, will still say `EBCDIC SPSS PORT
+FILE` at the beginning, just in ASCII instead of EBCDIC.
+
+The portable file header also contains a table of all of the
+characters that it supports.  Readers use this to translate each byte
+of the file into its local encoding.  Like the rest of the portable
+file, the character table is recoded when the file is moved to a
+system with a different character set so that it remains correct, or
+at least consistent with the rest of the file.
 
 The portable file format is mostly obsolete.  [System
 files](system-file.md) are a better alternative.
 
-> This information is gleaned from examination of ASCII-formatted
-portable files only, so some of it may be incorrect for portable files
-formatted in EBCDIC or other character sets.
-
 <!-- toc -->
 
+## Sources
+
+The information in this chapter is drawn from documentation and source
+code, including:
+
+* `pff.tar.Z`, a Fortran program from the 1980s that reads and writes
+  portable files.  This program contains translation tables from the
+  portable character set to EBCDIC and to ASCII.
+
+* <a name="document">A document</a>, now lost, that describes portable
+  file syntax.
+
+It is further informed by a <a name="corpus">corpus</a> of about 1,400
+portable files.  The plausible creation dates in the corpus range from
+1986 to 2025, in addition to 131 files with alleged creation dates
+between 1900 and 1907 and 21 files with an invalid creation date.
+
+[document]: #document
+[corpus]: #corpus
+
 ## Portable File Characters
 
 Portable files are arranged as a series of lines of 80 characters each.
@@ -44,7 +91,7 @@ contents.
 
 Every portable file consists of the following records, in sequence:
 
-- File header.
+- Splash strings.
 
 - Version and date info.
 
@@ -100,114 +147,431 @@ they may not contain a fraction.
 String fields take the form of an integer field having value N,
 followed by exactly N characters, which are the string content.
 
-## Portable File Header
-
-Every portable file begins with a 464-byte header, consisting of a
-200-byte collection of vanity splash strings, followed by a 256-byte
-character set translation table, followed by an 8-byte tag string.
-
-The 200-byte segment is divided into five 40-byte sections, each of
-which represents the string `CHARSET SPSS PORT FILE` in a different
-character set encoding, where `CHARSET` is the name of the character set
-used in the file, e.g. `ASCII` or `EBCDIC`.  Each string is padded on
-the right with spaces in its respective character set.
-
-It appears that these strings exist only to inform those who might
-view the file on a screen, and that they are not parsed by SPSS
-products.  Thus, they can be safely ignored.  For those interested, the
-strings are supposed to be in the following character sets, in the
-specified order: EBCDIC, 7-bit ASCII, CDC 6-bit ASCII, 6-bit ASCII,
-Honeywell 6-bit ASCII.
-
-The 256-byte segment describes a mapping from the character set used
-in the portable file to an arbitrary character set having characters at
-the following positions:
-
-* 0-60: Control characters.  Not important enough to describe in full here.
-
-* 61-63: Reserved.
-
-* 64-73: Digits `0` through `9`.
-
-* 74-99: Capital letters `A` through `Z`.
-
-* 100-125: Lowercase letters `a` through `z`.
-
-* 126: Space.
-
-* 127-130: Symbols `.<(+`
-
-* 131: Solid vertical pipe.
-
-* 132-142: Symbols `&[]!$*);^-/`
+> Strings longer than 255 bytes exist in the [corpus].
 
-* 143: Broken vertical pipe.
+## Splash Strings
 
-* 144-150: Symbols `,%_>`?``:`
+Every portable file begins with 200 bytes of splash strings that serve
+to identify the file's type and its original character set.  The 200
+bytes are divided into five 40-byte sections, each of which is
+supposed to represent the string `<CHARSET> SPSS PORT FILE` in a
+different character set encoding[^0], where `<CHARSET>` is the name of
+the character set used in the file, e.g. `ASCII` or `EBCDIC`.  Each
+string is padded on the right with spaces in its respective character
+set.
 
-* 151: British pound symbol.
+[^0]: The strings are supposed to be in EBCDIC, 7-bit ASCII, CDC 6-bit
+  ASCII, 6-bit ASCII, and Honeywell 6-bit ASCII.  (It is somewhat
+  astonishing that anyone considered the possibility of 6-bit "ASCII",
+  or that there were at least three incompatible versions of it.)
 
-* 152-155: Symbols `@'="`.
-
-* 156: Less than or equal symbol.
-
-* 157: Empty box.
-
-* 158: Plus or minus.
-
-* 159: Filled box.
-
-* 160: Degree symbol.
-
-* 161: Dagger.
-
-* 162: Symbol `~`.
-
-* 163: En dash.
-
-* 164: Lower left corner box draw.
-
-* 165: Upper left corner box draw.
-
-* 166: Greater than or equal symbol.
-
-* 167-176: Superscript `0` through `9`.
+It appears that these strings exist only to inform those who might
+view the file on a screen, letting them know what character set the
+file is in regardless of how they are viewing it, and that they are
+not parsed by SPSS products.  Thus, they can be safely ignored.  It is
+reasonable to simply write out `ASCII SPSS PORT FILE` five times, each
+time padded to 40 bytes.
+
+## Translation Table
+
+The splash strings are followed by a 256-byte character set translation table.
+This segment describes a mapping from the character set used
+in the portable file to a "portable character set" that does not
+correspond to any known single-byte character set or code page.  Each
+byte in the table reports the byte value that corresponds to the
+character represented by its position.  The following section lists
+the character at each position.
+
+> For example, position 0x4a (decimal 74) in the portable character
+set is uppercase letter A (as shown in the table in the following
+section), so the 75th byte in the table is the value that represents
+`A` in the file.
+
+Any real character set will not necessarily include all of the
+characters in the portable character set.  In the translation table,
+omitted characters are written as digit `0`[^10].
+
+[^10]: Character `0`, not NUL or byte zero.
+
+> For example, in practice, all of the control character positions are
+always written as `0`.
+
+The following section describes how the translation table is supposed
+to act based on looking at the [sources](#sources), and then the
+section after that describes what it actually contains in practice.
+
+### Theory
+
+The table below shows the portable character set.  The columns in the
+table are:
+
+* "Pos", a position within the portable character set, in hex, from 00
+  to FF.
+
+* "EBCDIC", the translation for the given position to EBCDIC, as
+  written in `pff.tar.Z`.
+
+* "ASCII", the translation for the given position to ASCII, as written
+  in `pff.tar.Z`.
+
+* "Unicode", a suggestion for the best translation from this position to
+  Unicode.
+
+* "Notes", which links to additional information for some characters.
+
+In addition to the [sources](#sources) previously cited, some of the
+information below is drawn from [RFC 183], from 1971.  This RFC shows
+many of the "EBCDIC" hex codes in `pff.tar.Z` as corresponding to the
+descriptions in the document, even though no known EBCDIC codepage
+contains those characters with those codes.
+
+[RFC 183]: https://www.rfc-editor.org/rfc/rfc183.pdf
+
+| Pos | EBCDIC | ASCII | Unicode |  | Notes
+| -:  | :----- | :---- | :------ | :-------- | :----------
+| 00 | 00 | — | — | — | [^1]
+| 01 | 01 | — | — | — | [^1]
+| 02 | 02 | — | — | — | [^1]
+| 03 | 03 | — | — | — | [^1]
+| 04 | 04 | — | — | — | [^1]
+| 05 | 05 | — | U+0009 CHARACTER TABULATION | — | [^1]
+| 06 | 06 | — |  — | — | [^1]
+| 07 | 07 | — |  — | — | [^1]
+| 08 | 08 | — |  — | — | [^1]
+| 09 | 09 | — |  — | — | [^1]
+| 0A | 0A | — |  — | — | [^1]
+| 0B | 0B | — | U+000B LINE TABULATION | — | [^1]
+| 0C | 0C | — | U+000C FORM FEED | — | [^1]
+| 0D | 0D | — | U+000D CARRIAGE RETURN | — | [^1]
+| 0E | 0E | — | — | — | [^1]
+| 0F | 0F | — | — | — | [^1]
+| 10 | 10 | — | — | — | [^1]
+| 11 | 11 | — | — | — | [^1]
+| 12 | 12 | — | — | — | [^1]
+| 13 | 13 | — | — | — | [^1]
+| 14 | 3C | — | — | — | [^1]
+| 15 | 15 | — | U+000A LINE FEED | — | [^1]
+| 16 | 16 | — | U+0008 BACKSPACE | — | [^1]
+| 17 | 17 | — | — | — | [^1]
+| 18 | 18 | — | — | — | [^1]
+| 19 | 19 | — | — | — | [^1]
+| 1A | 1A | — | — | — | [^1]
+| 1B | 1B | — | — | — | [^1]
+| 1C | 1C | — | — | — | [^1]
+| 1D | 1D | — | — | — | [^1]
+| 1E | 1E | — | — | — | [^1]
+| 1F | 2A | — | — | — | [^1]
+| 20 | 20 | — | — | — | [^1]
+| 21 | 21 | — | — | — | [^1]
+| 22 | 22 | — | — | — | [^1]
+| 23 | 23 | — | — | — | [^1]
+| 24 | 2B | — | — | — | [^1]
+| 25 | 25 | — | U+000A LINE FEED | — | [^1]
+| 26 | 26 | — | — | — | [^1]
+| 27 | 27 | — | — | — | [^1]
+| 28 | 1F | — | — | — | [^1]
+| 29 | 24 | — | — | — | [^1]
+| 2A | 14 | — | — | — | [^1]
+| 2B | 2D | — | — | — | [^1]
+| 2C | 2E | — | — | — | [^1]
+| 2D | 2F | — | U+0007 BELL | — | [^1]
+| 2E | 32 | — | — | — | [^1]
+| 2F | 33 | — | — | — | [^1]
+| 30 | 34 | — | — | — | [^1]
+| 31 | 35 | — | — | — | [^1]
+| 32 | 36 | — | — | — | [^1]
+| 33 | 37 | — | — | — | [^1]
+| 34 | 38 | — | — | — | [^1]
+| 35 | 39 | — | — | — | [^1]
+| 36 | 3A | — | — | — | [^1]
+| 37 | 3B | — | — | — | [^1]
+| 38 | 3D | — | — | — | [^1]
+| 39 | 3F | — | — | — | [^1]
+| 3A | 28 | — | — | — | [^1]
+| 3B | 29 | — | — | — | [^1]
+| 3C | 2C | — | — | — | [^1]
+| 3D | — | — | — | — | [^8]
+| 3E | — | — | — | — | [^8]
+| 3F | — | — | — | — | [^8]
+| 40 | F0 | 30 | U+0030 DIGIT ZERO | `0` |
+| ... |
+| 49 | F9 | 39 | U+0039 DIGIT NINE | `9` |
+| 4A | C1 | 41 | U+0041 LATIN CAPITAL LETTER A | `A` |
+| ... |
+| 52 | C9 | 49 | U+0049 LATIN CAPITAL LETTER I | `I` |
+| 53 | D1 | 4A | U+004A LATIN CAPITAL LETTER J | `J` |
+| ... |
+| 5B | D9 | 52 | U+0052 LATIN CAPITAL LETTER R | `R` |
+| 5C | E2 | 53 | U+0053 LATIN CAPITAL LETTER S | `S` |
+| ... |
+| 63 | E9 | 5A | U+005A LATIN CAPITAL LETTER Z | `Z` |
+| 64 | 81 | 61 | U+0061 LATIN SMALL LETTER A | `a` |
+| ... |
+| 6C | 89 | 69 | U+0069 LATIN SMALL LETTER I | `i` |
+| 6D | 91 | 6A | U+006A LATIN SMALL LETTER J | `j` |
+| ... |
+| 75 | 99 | 72 | U+0072 LATIN SMALL LETTER R | `r` |
+| 76 | A2 | 73 | U+0073 LATIN SMALL LETTER S | `s` |
+| ... |
+| 7D | A9 | 7A | U+007A LATIN SMALL LETTER Z | `z` |
+| 7E | 40 | 20 | U+0020 SPACE | ` ` |
+| 7F | 4B | 2E | U+002E FULL STOP | `.` |
+| 80 | 4C | 3C | U+003C LESS-THAN SIGN | `<` |
+| 81 | 4D | 28 | U+0028 LEFT PARENTHESIS | `(` |
+| 82 | 4E | 2B | U+002B PLUS SIGN | `+` |
+| 83 | 59 | — | U+007C VERTICAL LINE | `\|` | [^2]
+| 84 | 50 | 26 | U+0026 AMPERSAND | `&` |
+| 85 | AD | 5B | U+005B LEFT SQUARE BRACKET | `[` |
+| 86 | BD | 5D | U+005D RIGHT SQUARE BRACKET | `]` |
+| 87 | 5A | 21 | U+0021 EXCLAMATION MARK | `!` |
+| 88 | 5B | 24 | U+0024 DOLLAR SIGN | `$` |
+| 89 | 5C | 2A | U+002A ASTERISK | `*` |
+| 8A | 5D | 29 | U+0029 RIGHT PARENTHESIS | `)` |
+| 8B | 5E | 3B | U+003B SEMICOLON | `;` |
+| 8C | 5F | 5E | U+005E CIRCUMFLEX ACCENT | `^` |
+| 8D | 60 | 2D | U+002D HYPHEN-MINUS | `-` |
+| 8E | 61 | 2F | U+002F SOLIDUS | `/` |
+| 8F | 6A | 7C | U+00A6 BROKEN BAR | `¦` | [^2]
+| 90 | 6B | 2C | U+002C COMMA | `,` |
+| 91 | 6C | 25 | U+0025 PERCENT SIGN | `%` |
+| 92 | 6D | 5F | U+005F LOW LINE | `_` |
+| 93 | 6E | 3E | U+003E GREATER-THAN SIGN | `>` |
+| 94 | 6F | 3F | U+003F QUESTION MARK | `?` |
+| 95 | 79 | 60 | U+0060 GRAVE ACCENT | \` |
+| 96 | 7A | 3A | U+003A COLON | `:` |
+| 97 | 7B | 23 | U+0023 NUMBER SIGN | `#`
+| 98 | 7C | 40 | U+0040 COMMERCIAL AT | `@` |
+| 99 | 7D | 27 | U+0027 APOSTROPHE | `'` |
+| 9A | 7E | 3D | U+003D EQUALS SIGN | `=` |
+| 9B | 7F | 22 | U+0022 QUOTATION MARK | `"` |
+| 9C | 8C | — | U+2264 LESS-THAN OR EQUAL TO | `≤` |
+| 9D | 9C | — | U+25A1 WHITE SQUARE | `□` | [^3]
+| 9E | 9E | — | U+00B1 PLUS-MINUS SIGN | `±` |
+| 9F | 9F | — | U+25A0 BLACK SQUARE | `■` | [^4]
+| A0 | — | — | U+00B0 DEGREE SIGN | `°` |
+| A1 | 8F | — | U+2020 DAGGER | `†` |
+| A2 | A1 | 7E | U+007E TILDE | `~` |
+| A3 | A0 | — | U+2013 EN DASH | `–` |
+| A4 | AB | — | U+2514 BOX DRAWINGS LIGHT UP AND RIGHT | `└` | [^5]
+| A5 | AC | — | U+250C BOX DRAWINGS LIGHT DOWN AND RIGHT | `┌` | [^5]
+| A6 | AE | — | U+2265 GREATER-THAN OR EQUAL TO | `≥` |
+| A7 | B0 | — | U+2070 SUPERSCRIPT ZERO | `⁰` | [^5]
+| ... |
+| B0 | B9 | — | U+2079 SUPERSCRIPT NINE | `⁹` | [^5]
+| B1 | BB | — | U+2518 BOX DRAWINGS LIGHT UP AND LEFT | `┘` | [^5]
+| B2 | BC | — | U+2510 BOX DRAWINGS LIGHT DOWN AND LEFT | `┐` | [^5]
+| B3 | BE | — | U+2260 NOT EQUAL TO | `≠`
+| B4 | BF | — | U+2014 EM DASH | `—`
+| B5 | 8D | — | U+207D SUPERSCRIPT LEFT PARENTHESIS | `⁽`
+| B6 | 9D | — | U+207E SUPERSCRIPT RIGHT PARENTHESIS | `⁾`
+| B7 | BE | — | U+207A SUPERSCRIPT PLUS SIGN | `⁺` | [^6]
+| B8 | C0 | 7B | U+007B LEFT CURLY BRACKET | `{`
+| B9 | D0 | 7D | U+007D RIGHT CURLY BRACKET | `}`
+| BA | E0 | 5C | U+005C REVERSE SOLIDUS | `\`
+| BB | 4A | — | U+00A2 CENT SIGN | `¢`
+| BC | AF | — | U+00B7 MIDDLE DOT | `·` | [^7]
+| BD | — | — | — | — | [^8]
+| ... |
+| FF | — | — | — | — | [^8]
+
+[^1]: From the EBCDIC translation table in `pff.tar.Z`.  The ASCII
+  translation table leaves all of them undefined.  Code points are
+  only listed for common control characters with some modern relevance.
+
+[^2]: The [document] describes 83 as "a solid vertical pipe" and 8F as
+  "a broken vertical pipe".  Even though the ASCII translation table
+  in `pff.tar.Z` leaves position 83 undefined and translates 8F to
+  U+007C VERTICAL LINE, using U+007C VERTICAL LINE and U+00A6 BROKEN
+  BAR, respectively, seems more accurate in a Unicode environment.
+
+[^3]: Unicode inferred from [document] description as "empty box".
+
+[^4]: Unicode inferred from [document] description as "filled box".
+
+[^5]: These characters are as described in the [document].  Some of
+    these don't appear in any known EBCDIC code page, but the EBCDIC
+    translations given in `pff.tar.Z` match the graphics shown in [RFC
+    183] with those hex codes.
+
+[^6]: Described in [document] as "horizontal dagger", which doesn't
+    appear in Unicode or any known code page.  This interpretation
+    from [RFC 183] seems more likely.
+
+[^7]: Unicode inferred from [document] description as "centered dot,
+    or bullet"
+
+[^8]: Reserved
+
+Summary:
+
+|   Range |         Characters |
+|-------: |:-------------------|
+| 40...4F | `0123456789ABCDEF` |
+| 50...5F | `GHIJKLMNOPQRSTUV` |
+| 60...6F | `WXYZabcdefghijkl` |
+| 70...7F | `mnopqrstuvwxyz .` |
+| 80...8F | `<(+\|&[]!$*);^-/¦` |
+| 90...9F | ``,%_>?`:#@'="≤□±■`` |
+| A0...AF | `°†~–└┌≥⁰¹²³⁴⁵⁶⁷⁸` |
+| B0...BC | `⁹┘┐≠—⁽⁾⁺{}\¢·` |
+
+### Practice: Character Set
+
+The previous section described the translation table in theory.  This
+section describes what it contains in the [corpus].
+
+Every file in the corpus is encoded in (extended) ASCII, although 31
+of them indicate in their splash strings that they were recoded from
+EBCDIC.  This also means that ASCII `0` indicates an unmapped
+character, that is, one not in the character set represented by the
+table.
+
+The files are encoded in different ASCII extensions.  Some appear to be
+encoded in [windows-1252], others in [code page 437], others in
+unidentified character sets.  The particular code page in use does not
+matter to a reader that uses the table for mapping.
+
+[windows-1252]: https://en.wikipedia.org/wiki/Windows-1252
+[code page 437]: https://en.wikipedia.org/wiki/Code_page_437
+
+* There are some invariants across the translation tables for every file
+  in the corpus:
+
+  - All control codes (in the range 0 to 63) are unmapped.
+
+    One consequence is that strings in the corpus can never contain
+    new-lines.  New-lines encoded literally would be problematic
+    anyhow because readers [must ignore
+    them](#portable-file-characters).
+
+  - Digits `0` to `9` and letters `A` to `Z` and `a` to `z` are
+    correctly mapped.
+
+  - Punctuation for space as well as ``(+&$*);-/,%_?`:@'=\`` are
+    correctly mapped.
+
+* Characters `<!^>\"~{}` are mapped correctly in almost every file in
+  the corpus, with a few outliers.
+
+* Characters `[]` are mostly correct with a few problems.
+
+* Position 97 is correctly `#` in most files, and wrongly `$` in some.
+
+* The characters at positions 83 `|` and 8F `¦` have lots of issues,
+  stemming from the history described [on Wikipedia].  In particular,
+  EBCDIC and Unicode have separate characters for `|` and `¦`, but
+  ASCII does not.
+
+  [on Wikipedia]: https://en.wikipedia.org/wiki/Vertical_bar#Broken_bar
+
+  Most of the corpus leaves 83 `|` unmapped.  Most of the rest map it
+  correctly to `|`.  The remainder map it to `!`.
+
+  Most of the corpus maps 8F `¦` to `|`.  Only a few map it correctly
+  to `¦` in [windows-1252] or (creatively) to `║` in [code page 437].
+
+* Characters at the following positions are almost always wrong.  The
+  table shows:
 
-* 177: Lower right corner box draw.
+  - "Character", the character and its position in the portable character set.
 
-* 178: Upper right corner box draw.
+  - "Unmapped", the number of files in the corpus that leave the
+    character unmapped (that is, set to `0`).
 
-* 179: Not equal symbol.
+  - "windows-1252", the number of files that map the character
+    correctly in [windows-1252].  If there is more than one plausible
+    mapping, or if the mapping doesn't exactly match the preferred
+    Unicode, the entry shows the mapped character.
 
-* 180: Em dash.
+  - "cp437", the number of files that map the character correctly in
+    [code page 437].
 
-* 181: Superscript `(`.
+    In a few cases, a plausible mapping in the "windows-1252" column
+    is an ASCII character.  Those aren't separately counted in the
+    "cp437" column, even though ASCII maps the same way in both
+    encodings.
 
-* 182: Superscript `)`.
+  - "Wrong", the number of files that map the character to nothing
+    that makes sense in a known encoding.
+
+  | Character        | Unmapped |           windows-1252 |      cp437 | Wrong |
+  |:-----------------|---------:|-----------------------:|-----------:|------:|
+  | 9C `≤`           |     1366 |                      0 |         10 |    28 |
+  | A6 `≥`           |     1373 |                      0 |         10 |    21 |
+  | 9F `■`           |     1373 |                      0 |         10 |    21 |
+  | 9E `±`           |     1353 |                     15 |         15 |    23 |
+  | A3 `–` (en dash) |     1302 |             as `-`: 65 |  as `─`: 5 |    32 |
+  | B4 `—` (em dash) |     1308 |             as `-`: 65 | as `─`: 10 |    21 |
+  | A4 `└`           |     1367 |                      0 |         15 |    22 |
+  | A5 `┌`           |     1367 |                      0 |         15 |    22 |
+  | B1 `┘`           |     1367 |                      0 |         15 |    22 |
+  | B2 `┐`           |     1367 |                      0 |         15 |    22 |
+  | A8 `¹`           |     1286 | as `¹`: 15; as `1`: 65 |          0 |    38 |
+  | A9 `²`           |     1286 | as `²`: 15; as `2`: 65 |         15 |    23 |
+  | AA `³`           |     1286 | as `³`: 15; as `3`: 65 |          0 |    38 |
+  | AB `⁴`           |     1308 |             as `4`: 65 |          0 |    31 |
+  | ...              |      ... |                    ... |        ... |   ... |
+  | B0 `⁹`           |     1308 |             as `9`: 65 |          0 |    31 |
+  | B3 `≠`           |     1373 |                      0 | as `╪`: 10 |    21 |
+  | B5 `⁽`           |     1308 |                      0 |          0 |    96 |
+  | B6 `⁾`           |     1373 |                      0 |          0 |    31 |
+  | BB `¢`           |     1351 |                     16 |         10 |    27 |
+  | BC `·`           |     1357 |  as `·`: 16; as `×`: 1 | as `∙`: 10 |    20 |
+  | A0 `°`           |     1382 |  as `°`: 15; as `º`: 1 |          5 |     6 |
 
-* 183: Horizontal dagger (?).
+* Characters at the following positions are always unmapped or wrong:
+
+  | Character | Unmapped | windows-1252 |      cp437 |                    Wrong |
+  |:----------|---------:|-------------:|-----------:|-------------------------:|
+  | 9D `□`    |     1373 |            0 | as `╬`: 10 |                       21 |
+  | A1 `†`    |     1364 |            0 | as `┼`: 10 |                       30 |
+  | A7 `⁰`    |     1373 |    as `Ø`: 1 |          0 |                       30 |
+  | B7 `⁺`    |     1373 |            0 |          0 |                       31 |
 
-* 184-186: Symbols `{}\`.
+* Sometimes the reserved characters are mapped (not in any obviously
+  useful way).
 
-* 187: Cents symbol.
+### Practice: Characters in Use
 
-* 188: Centered dot, or bullet.
+The previous section reported on the character sets defined in the
+translation table in the corpus.  This section reports on the
+characters actually found in the corpus.
 
-* 189-255: Reserved.
+In practice, characters in the corpus are in [ISO-8859-1], with very
+few exceptions.  The exceptions are a handful of files that either use
+reserved characters from the portable character set, for unclear
+reasons, or declare surprising encodings for bytes in the normal ASCII
+range.  These exceptions might be file corruption; they do not appear
+to be useful.
 
-Symbols that are not defined in a particular character set are set to
-the same value as symbol 64; i.e., to `0`.
+As a result, a portable file reader could reasonably ignore the
+translation table and simply interpret all portable files as
+[ISO-8859-1] or [windows-1252].
+
+There is no visible distinction in practice between portable files in
+"communication" versus "tape" format.  Neither kind contains control
+characters.
+
+[ISO-8859-1]: https://en.wikipedia.org/wiki/ISO/IEC_8859-1
+
+Files in the corpus have a mix of CRLF and LF-only line ends.
+
+## Tag String
+
+The translation table is followed by an 8-byte tag string that
+consists of the exact characters `SPSSPORT` in the portable file's
+character set.  This can be used to verify that the file is indeed a
+portable file.
 
-The 8-byte tag string consists of the exact characters `SPSSPORT` in
-the portable file's character set, which can be used to verify that the
-file is indeed a portable file.
+> Since every file in the corpus is encoded in (extended) ASCII, this
+> string always appears in ASCII too.
 
 ## Version and Date Info Record
 
 This record does not have a tag code.  It has the following structure:
 
-- A single character identifying the file format version.  The letter
-  A represents version 0, and so on.
+- A single character identifying the file format version.  It is
+  always `A`.
 
 - An 8-character string field giving the file creation date in the
   format YYYYMMDD.
@@ -215,19 +579,65 @@ This record does not have a tag code.  It has the following structure:
 - A 6-character string field giving the file creation time in the
   format HHMMSS.
 
+> In the [corpus], there is some variation for file creation dates and
+> times by product:
+>
+> - `STAT/TRANSFER` often writes dates that are invalid
+>   (e.g. `20040931`) or obviously wrong (e.g. `19040823`, `19000607`).
+>
+> - `STAT/TRANSFER` often writes the time as all spaces.
+>
+> - `IBM SPSS Statistics 19.0` (and probably other versions) writes `HH`
+>   as ` H` for single-digit hours.
+>
+> - `SPSS 6.1 for the Power Macintosh` writes invalid dates such as
+>   `19:11010`.
+
 ## Identification Records
 
 The product identification record has tag code `1`.  It consists of a
 single string field giving the name of the product that wrote the
 portable file.
 
-The author identification record has tag code `2`.  It is optional.
-If present, it consists of a single string field giving the name of the
-person who caused the portable file to be written.
-
-The subproduct identification record has tag code `3`.  It is
-optional.  If present, it consists of a single string field giving
-additional information on the product that wrote the portable file.
+The author identification record has tag code `2`.  It is optional and
+usually omitted.  If present, it consists of a single string field
+giving the name of the person who caused the portable file to be
+written.
+
+> The [corpus] contains a few different kinds of authors:
+>
+> - Organizational names, such as the names of companies or
+>   universities or their departments.
+>
+> - Product names, such as `SPSS for HP-UX`.
+>
+> - Internet host names, such as `icpsr.umich.edu`.
+
+The subproduct identification record has tag code `3`.  It is optional
+and usually omitted.  If present, it consists of a single string field
+giving additional information on the product that wrote the portable
+file.
+
+> The [corpus] contains a few different kinds of subproduct:
+>
+> - `x86_64-w64-mingw32` or another target triple (written by PSPP).
+>
+> - A file name for a `.sav` file.
+>
+> - `SPSS/PC+ Studentware+` written by `SPSS for MS WINDOWS Release 7.0`
+>   in 1996.
+>
+> - `FILE BUILT VIA IMPORT` written by `SPSS RELEASE 4.1 FOR VAX/VMS` in
+>   1998.
+>
+> - `SPSS/PC+` written by `SPSS for MS WINDOWS Release 7.0` in 1996.
+>
+> - Multiple instances of `SPSS/PC+` written by `SPSS/PC+ on IBM PC`,
+>   but with several spaces padding out both product and subproduct
+>   fields.
+>
+> - `PFF TEST FILE` written by `SPSS-X RELEASE 2.1 FOR IBM VM/CMS` in
+>   1986.
 
 ## Variable Count Record
 
@@ -252,14 +662,21 @@ field that names the weighting variable.
 Each variable record represents a single variable.  Variable records
 have tag code `7`.  They have the following structure:
 
-- Width (integer).  This is 0 for a numeric variable, and a number
-  between 1 and 255 for a string variable.
+- Width (integer).  This is 0 for a numeric variable.  For portability
+  to old versions of SPSS, it should be between 1 and 255 for a string
+  variable.
+
+  > Portable files in the [corpus] contain strings as wide as 32000
+  > bytes.  None of these was written by SPSS itself, but rather by a
+  > variety of third-party products: `STAT/TRANSFER`, `inquery export
+  > tool (c) inworks GmbH`, `QDATA Data Entry System for the IBM PC`.
+  > The creation dates in the files range from 2016 to 2024.
 
 - Name (string).  1-8 characters long.  Must be in all capitals.
 
-  A few portable files that contain duplicate variable names have
+  A few portable files that contain duplicate variable names have
   been spotted in the wild.  PSPP handles these by renaming the
-  duplicates with numeric extensions: `VAR_1`, `VAR_2`, and so on.
+  duplicates with numeric extensions: `VAR001`, `VAR002`, and so on.
 
 - Print format.  This is a set of three integer fields:
 
@@ -270,10 +687,10 @@ have tag code `7`.  They have the following structure:
 
   - Number of decimal places.  1-40.
 
-  A few portable files with invalid format types or formats that are
-  not of the appropriate width for their variables have been spotted
-  in the wild.  PSPP assigns a default `F` or `A` format to a variable
-  with an invalid format.
+  A few portable files with invalid format types or formats that are
+  not of the appropriate width or decimals for their variables have
+  been spotted in the wild.  PSPP assigns a default `F` or `A` format
+  to a variable with an invalid format.
 
 - Write format.  Same structure as the print format described above.
 
@@ -282,15 +699,19 @@ record, which has tag code `8`.  A missing value record has one field,
 the missing value itself (a floating-point or string, as appropriate).
 Up to three of these missing value records can be used.
 
-There is also a record for missing value ranges, which has tag code
-`B`.  It is followed by two fields representing the range, which are
-floating-point or string as appropriate.  If a missing value range is
-present, it may be followed by a single missing value record.
+There are also records for missing value ranges:
+
+- Tag code `B` for `X THRU Y` ranges.  It is followed by two
+  floating-point values representing `X` and `Y`.
+
+- Tag code `9` for `LO THRU Y` ranges, followed by a floating-point
+  number representing `Y`.
+
+- Tag code `A` for `X THRU HI` ranges, followed by a floating-point
+  number representing `X`.
 
-Tag codes `9` and `A` represent `LO THRU X` and `X THRU HI` ranges,
-respectively.  Each is followed by a single field representing X.  If
-one of the ranges is present, it may be followed by a single missing
-value record.
+If a missing value range is present, it may be followed by a single
+missing value record.
 
 In addition, each variable record can optionally be followed by a
 variable label record, which has tag code `C`.  A variable label record
@@ -314,9 +735,9 @@ Value label records have tag code `D`.  They have the following format:
   or string as appropriate to the variables, followed by a label
   (string).
 
-A few portable files that specify duplicate value labels, that is,
-two different labels for a single value of a single variable, have been
-spotted in the wild.  PSPP uses the last value label specified in these
+> The corpus contains a few portable files that specify duplicate
+value labels, that is, two different labels for a single value of a
+single variable.  PSPP uses the last value label specified in these
 cases.
 
 ## Document Record
index 3d4e7f35b35814ad6a7e5864393aaa7fdf692499..100a531c41adf78f367b887e28c6abd5dbccb30c 100644 (file)
@@ -20,12 +20,18 @@ new-line.  PSPP uses this string to identify an SPV file; it is
 invariant across the corpus.
 
 > SPV files always begin with the 7-byte sequence 50 4b 03 04 14 00
-08, but this is not a useful magic number because most Zip archives
-start the same way.
-
+> 08, but this is not a useful magic number because most Zip archives
+> start the same way.
+>
+> Checking only for the presence of `META-INF/MANIFEST.MF` is also not
+> a useful magic number because this file name also appears in every
+> [Java JAR archive].
+>
 > SPSS writes `META-INF/MANIFEST.MF` to every SPV file, but it does
-not read it or even require it to exist, so using different contents,
-e.g. `allowPivoting=false`, has no effect.
+> not read it or even require it to exist, so using different
+> contents, e.g. `allowPivoting=false`, has no effect.
+>
+> [Java JAR archive]: https://docs.oracle.com/javase/8/docs/technotes/guides/jar/jar.html
 
 The rest of the members in an SPV file's Zip archive fall into two
 categories: "structure" and "detail" members.  Structure member names
index f0a33d2f56455abe1bd4e6eec3de253892332653..996a75c919adff97d2dadb39c02985b4de4397be 100644 (file)
@@ -53,6 +53,8 @@ unicode-segmentation = "1.12.0"
 serde_json = "1.0.141"
 toml = "0.9.5"
 hashbrown = { version = "0.15.5", features = ["serde"] }
+displaydoc = "0.2.5"
+codepage-437 = "0.1.0"
 
 [target.'cfg(windows)'.dependencies]
 windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
index 949331282d7691bc03bcddad7baee61b71505ea8..aa59cbfa42c2a153f9855214f9646708695adf13 100644 (file)
 use std::{
     fs::File,
     io::{stdout, Write},
-    path::PathBuf,
+    path::{Path, PathBuf},
 };
 
-use anyhow::{bail, Result};
+use anyhow::{anyhow, bail, Error as AnyError, Result};
 use chrono::{Datelike, NaiveTime, Timelike};
-use clap::Args;
+use clap::{Args, ValueEnum};
 use csv::Writer;
 use encoding_rs::Encoding;
 use pspp::{
     calendar::calendar_offset_to_gregorian,
-    data::{ByteString, Datum, WithEncoding},
+    data::{ByteString, Case, Datum, WithEncoding},
+    file::FileType,
     format::{DisplayPlain, Type},
+    por::PortableFile,
     sys::{raw::records::Compression, ReadOptions, WriteOptions},
     util::ToSmallString,
     variable::Variable,
 };
 
-use crate::{parse_encoding, OutputFormat};
+use crate::parse_encoding;
 
 /// Convert SPSS data files into other formats.
 #[derive(Args, Clone, Debug)]
@@ -247,12 +249,42 @@ struct SysOptions {
     compression: Option<Compression>,
 }
 
-impl Convert {
-    pub fn run(self) -> Result<()> {
-        fn warn(warning: anyhow::Error) {
-            eprintln!("warning: {warning}");
+/// Output file format.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)]
+enum OutputFormat {
+    /// Comma-separated values using each variable's print format (variable
+    /// names are written as the first line)
+    Csv,
+
+    /// System file
+    Sys,
+
+    /// Portable file
+    Por,
+}
+
+/// Infers the output format from the extension of an output file name.
+///
+/// Extensions are compared without regard to ASCII case:
+///
+/// * `csv` or `txt`: [OutputFormat::Csv].
+///
+/// * `sav` or `sys`: [OutputFormat::Sys].
+///
+/// * `por`: [OutputFormat::Por].
+///
+/// Any other extension is an error.
+impl TryFrom<&Path> for OutputFormat {
+    type Error = AnyError;
+
+    fn try_from(value: &Path) -> std::result::Result<Self, Self::Error> {
+        // A path without an extension is treated as having an empty one, which
+        // matches nothing below and so is reported as unknown.
+        let extension = value.extension().unwrap_or_default();
+        if extension.eq_ignore_ascii_case("csv") || extension.eq_ignore_ascii_case("txt") {
+            Ok(OutputFormat::Csv)
+        } else if extension.eq_ignore_ascii_case("sav") || extension.eq_ignore_ascii_case("sys") {
+            Ok(OutputFormat::Sys)
+        } else if extension.eq_ignore_ascii_case("por") {
+            Ok(OutputFormat::Por)
+        } else {
+            Err(anyhow!(
+                "Unknown output file extension '{}'",
+                extension.display()
+            ))
         }
+    }
+}
 
+impl Convert {
+    pub fn run(self) -> Result<()> {
         let output_format = match self.output_format {
             Some(format) => format,
             None => match &self.output {
@@ -261,14 +293,39 @@ impl Convert {
             },
         };
 
-        let mut system_file = ReadOptions::new(warn)
-            .with_encoding(self.encoding)
-            .with_password(self.password.clone())
-            .open_file(&self.input)?;
-        if output_format == OutputFormat::Sys && self.sys_options.to_unicode {
-            system_file = system_file.into_unicode();
-        }
-        let (dictionary, _, cases) = system_file.into_parts();
+        let (dictionary, cases) = match FileType::from_file(&self.input)? {
+            Some(FileType::System { .. }) => {
+                fn warn(warning: anyhow::Error) {
+                    eprintln!("warning: {warning}");
+                }
+
+                let mut system_file = ReadOptions::new(warn)
+                    .with_encoding(self.encoding)
+                    .with_password(self.password.clone())
+                    .open_file(&self.input)?;
+                if output_format == OutputFormat::Sys && self.sys_options.to_unicode {
+                    system_file = system_file.into_unicode();
+                }
+                let (dictionary, _, cases) = system_file.into_parts();
+                let cases = cases.map(|result| result.map_err(AnyError::from));
+                let cases = Box::new(cases)
+                    as Box<dyn Iterator<Item = Result<Case<Vec<Datum<ByteString>>>, AnyError>>>;
+                (dictionary, cases)
+            }
+            Some(FileType::Portable) => {
+                fn warn_portable(warning: pspp::por::Warning) {
+                    eprintln!("warning: {warning}");
+                }
+
+                let portable_file = PortableFile::open_file(&self.input, warn_portable)?;
+                let (dictionary, _, cases) = portable_file.into_parts();
+                let cases = cases.map(|result| result.map_err(AnyError::from));
+                let cases = Box::new(cases)
+                    as Box<dyn Iterator<Item = Result<Case<Vec<Datum<ByteString>>>, AnyError>>>;
+                (dictionary, cases)
+            }
+            _ => bail!("{}: not a system or portable file", self.input.display()),
+        };
 
         // Take only the first `self.max_cases` cases.
         let cases = cases.take(self.max_cases.unwrap_or(usize::MAX));
@@ -314,6 +371,15 @@ impl Convert {
                     output.write_case(case?)?;
                 }
             }
+            OutputFormat::Por => {
+                let Some(output) = &self.output else {
+                    bail!("output file name must be specified for output to a portable file")
+                };
+                let mut output = pspp::por::WriteOptions::new().write_file(&dictionary, output)?;
+                for case in cases {
+                    output.write_case(case?)?;
+                }
+            }
         }
         Ok(())
     }
index 28ad0521a7c2d0a91ea50d73ac19263a31f3a0c8..ebacd6108e6eb4df697ee68f240abe3b541bfbd7 100644 (file)
@@ -44,7 +44,12 @@ use serde::{
 };
 
 use crate::{
+    dictionary::Dictionary,
     format::DisplayPlain,
+    output::{
+        pivot::{Axis3, Dimension, Group, PivotTable, Value},
+        Item, Text,
+    },
     variable::{VarType, VarWidth},
 };
 
@@ -774,14 +779,20 @@ pub struct Case<B>
 where
     B: Borrow<[Datum<ByteString>]>,
 {
-    encoding: &'static Encoding,
     data: B,
+    encoding: &'static Encoding,
 }
 
 impl<B> Case<B>
 where
     B: Borrow<[Datum<ByteString>]>,
 {
+    /// Constructs a new `Case` that holds `data` along with its associated
+    /// `encoding`.
+    pub fn new(data: B, encoding: &'static Encoding) -> Self {
+        Self { data, encoding }
+    }
+    /// Returns the encoding associated with this case.
+    pub fn encoding(&self) -> &'static Encoding {
+        self.encoding
+    }
     pub fn is_empty(&self) -> bool {
         self.len() == 0
     }
@@ -816,6 +827,47 @@ impl Case<Vec<Datum<ByteString>>> {
     }
 }
 
+/// Converts `cases` into output items that list their data.
+///
+/// The cases that were read successfully are collected into a single pivot
+/// table, with the variables from `dictionary` along the X axis and the cases,
+/// numbered consecutively, along the Y axis.  Each `Err` in `cases` becomes a
+/// log text item that holds the error's message.
+///
+/// In the returned vector, all of the text items for errors precede the pivot
+/// table, regardless of where the errors occurred among the cases.  The pivot
+/// table is omitted entirely if no case was read successfully.
+pub fn cases_to_output<C, E>(dictionary: &Dictionary, cases: C) -> Vec<Item>
+where
+    C: IntoIterator<Item = Result<Case<Vec<Datum<ByteString>>>, E>>,
+    E: Display,
+{
+    let mut output = Vec::new();
+    let cases = cases.into_iter();
+    let variables =
+        Group::new("Variable").with_multiple(dictionary.variables.iter().map(|var| &**var));
+    let mut case_numbers = Group::new("Case").with_label_shown();
+    // Rows are buffered here because the pivot table is only constructed after
+    // all of the cases have been consumed.
+    let mut data = Vec::new();
+    for case in cases {
+        match case {
+            Ok(case) => {
+                // Number cases by how many have been accepted so far, so that
+                // errors do not leave gaps in the numbering (presumably
+                // `len()` counts the values pushed so far).
+                case_numbers.push(Value::new_integer(Some((case_numbers.len() + 1) as f64)));
+                data.push(
+                    case.into_iter()
+                        .map(|datum| Value::new_datum(&datum))
+                        .collect::<Vec<_>>(),
+                );
+            }
+            Err(error) => {
+                output.push(Item::from(Text::new_log(error.to_string())));
+            }
+        }
+    }
+    if !data.is_empty() {
+        let mut pt = PivotTable::new([
+            (Axis3::X, Dimension::new(variables)),
+            (Axis3::Y, Dimension::new(case_numbers)),
+        ]);
+        for (row_number, row) in data.into_iter().enumerate() {
+            for (column_number, datum) in row.into_iter().enumerate() {
+                // Indexes follow the order of the dimensions above: variable
+                // first, then case.
+                pt.insert(&[column_number, row_number], datum);
+            }
+        }
+        output.push(pt.into());
+    }
+    output
+}
+
 impl<B> Serialize for Case<B>
 where
     B: Borrow<[Datum<ByteString>]>,
diff --git a/rust/pspp/src/file.rs b/rust/pspp/src/file.rs
new file mode 100644 (file)
index 0000000..6e34b4a
--- /dev/null
@@ -0,0 +1,216 @@
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Basic infrastructure for files understood by PSPP.
+
+#![cfg_attr(not(test), warn(missing_docs))]
+use std::{
+    fs::File,
+    io::{Error, Read, Seek},
+    path::Path,
+};
+
+use zip::ZipArchive;
+
+use crate::sys::raw::Magic;
+
+/// Type of a file understood by PSPP.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum FileType {
+    /// A [system file](crate::sys).
+    System {
+        /// Whether the file is encrypted.
+        encrypted: bool,
+    },
+
+    /// A [portable file](crate::por).
+    Portable,
+
+    /// An SPSS PC+ data file.
+    PcPlus,
+
+    /// An [SPSS Viewer file](crate::output::spv).
+    Viewer {
+        /// Whether the file is encrypted.
+        encrypted: bool,
+    },
+
+    /// A file that may be an SPSS syntax file.
+    Syntax {
+        /// True if there's confidence that this is a syntax file, which would
+        /// be either because it has an indicated encoding or because it is
+        /// encrypted.
+        confident: bool,
+
+        /// Whether the file is encrypted.
+        encrypted: bool,
+    },
+}
+
+impl FileType {
+    /// Returns true if we're confident about the file's type.
+    ///
+    /// (We can't always confidently identify syntax files because they look
+    /// mostly like any kind of text file.)
+    pub fn is_confident(&self) -> bool {
+        match self {
+            Self::Syntax { confident, .. } => *confident,
+            _ => true,
+        }
+    }
+
+    /// Returns true if the file is encrypted.
+    pub fn is_encrypted(&self) -> bool {
+        match self {
+            FileType::System { encrypted } => *encrypted,
+            FileType::Viewer { encrypted } => *encrypted,
+            FileType::Syntax {
+                confident: _,
+                encrypted,
+            } => *encrypted,
+            _ => false,
+        }
+    }
+
+    /// Attempts to identify the type of file at `path`.  Returns:
+    ///
+    /// * `Err(error)`: I/O error.
+    ///
+    /// * `Ok(Some(type))`: Identified file type.
+    ///
+    /// * `Ok(None)`: Unknown file type.
+    pub fn from_file<P>(path: P) -> Result<Option<Self>, Error>
+    where
+        P: AsRef<Path>,
+    {
+        Self::from_reader(File::open(path)?)
+    }
+
+    /// Like [from_file](Self::from_file) for an arbitrary `reader`.
+    pub fn from_reader<R>(mut reader: R) -> Result<Option<Self>, Error>
+    where
+        R: Read + Seek,
+    {
+        let mut buf = vec![0; 512];
+        let mut n = 0;
+        while n < buf.capacity() {
+            let count = reader.read(&mut buf[n..])?;
+            n += count;
+            if count == 0 {
+                break;
+            }
+        }
+        buf.truncate(n);
+
+        if let Some(magic) = buf.get(0..4) {
+            let magic: [u8; 4] = magic.try_into().unwrap();
+            if Magic::try_from(magic).is_ok() {
+                return Ok(Some(Self::System { encrypted: false }));
+            }
+        }
+
+        match buf.get(8..20) {
+            Some(b"ENCRYPTEDSAV") => {
+                return Ok(Some(Self::System { encrypted: true }));
+            }
+            Some(b"ENCRYPTEDSPV") => {
+                return Ok(Some(Self::Viewer { encrypted: true }));
+            }
+            Some(b"ENCRYPTEDSPS") => {
+                return Ok(Some(Self::Syntax {
+                    confident: true,
+                    encrypted: true,
+                }));
+            }
+            _ => (),
+        }
+
+        if buf
+            .get(200 + 256..)
+            .unwrap_or_default()
+            .windows(8)
+            .any(|w| w == b"SPSSPORT")
+        {
+            return Ok(Some(Self::Portable));
+        }
+
+        if buf.get(0x104..0x108) == Some(b"SPSS") {
+            return Ok(Some(Self::PcPlus));
+        }
+
+        let mut string = String::new();
+        if buf.get(..7) == Some(&[0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x08])
+            && let Ok(mut archive) = ZipArchive::new(reader)
+            && let Ok(mut file) = archive.by_name("META-INF/MANIFEST.MF")
+            && let Ok(_) = file.read_to_string(&mut string)
+            && string.trim() == "allowPivoting=true"
+        {
+            return Ok(Some(Self::Viewer { encrypted: false }));
+        }
+
+        if !buf.is_empty() && !buf.contains(&0) {
+            return Ok(Some(Self::Syntax {
+                confident: buf.starts_with(b"* Encoding:"),
+                encrypted: false,
+            }));
+        }
+
+        Ok(None)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::file::FileType;
+
+    #[test]
+    fn file_type() {
+        assert_eq!(
+            FileType::from_file("src/file/testdata/test.sav").unwrap(),
+            Some(FileType::System { encrypted: false })
+        );
+        assert_eq!(
+            FileType::from_file("src/file/testdata/test-encrypted.sav").unwrap(),
+            Some(FileType::System { encrypted: true })
+        );
+        assert_eq!(
+            FileType::from_file("src/file/testdata/test.por").unwrap(),
+            Some(FileType::Portable)
+        );
+        assert_eq!(
+            FileType::from_file("src/file/testdata/test-encrypted.spv").unwrap(),
+            Some(FileType::Viewer { encrypted: true })
+        );
+        assert_eq!(
+            FileType::from_file("src/file/testdata/test.spv").unwrap(),
+            Some(FileType::Viewer { encrypted: false })
+        );
+        assert_eq!(
+            FileType::from_file("src/file/testdata/test.sps").unwrap(),
+            Some(FileType::Syntax {
+                confident: false,
+                encrypted: false
+            })
+        );
+        assert_eq!(
+            FileType::from_file("src/file/testdata/test-encoding.sps").unwrap(),
+            Some(FileType::Syntax {
+                confident: true,
+                encrypted: false
+            })
+        );
+    }
+}
diff --git a/rust/pspp/src/file/testdata/test-encoding.sps b/rust/pspp/src/file/testdata/test-encoding.sps
new file mode 100644 (file)
index 0000000..d060b90
--- /dev/null
@@ -0,0 +1,2 @@
+* Encoding: UTF-8.
+DATA LIST /X 1.
\ No newline at end of file
diff --git a/rust/pspp/src/file/testdata/test-encrypted.sav b/rust/pspp/src/file/testdata/test-encrypted.sav
new file mode 100644 (file)
index 0000000..2d9f531
Binary files /dev/null and b/rust/pspp/src/file/testdata/test-encrypted.sav differ
diff --git a/rust/pspp/src/file/testdata/test-encrypted.spv b/rust/pspp/src/file/testdata/test-encrypted.spv
new file mode 100644 (file)
index 0000000..da8be2c
Binary files /dev/null and b/rust/pspp/src/file/testdata/test-encrypted.spv differ
diff --git a/rust/pspp/src/file/testdata/test.por b/rust/pspp/src/file/testdata/test.por
new file mode 100644 (file)
index 0000000..248cc4e
--- /dev/null
@@ -0,0 +1,11 @@
+ÁâÃÉÉ@â×ââ@×ÖÙã@ÆÉÓÅ@@@@@@@@@@@@@@@@@@@@ASCII SPSS PORT FILE                    \r
+00000-0000-0000-0000--------------------!3#))0303300/240&),%00000000000000000000\r
+0200002'220'&)3000#0000000000000000000000000000000000000000000000000000000000000\r
+0000000000000000000000000123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrst\r
+uvwxyz .<(+0&[]!$*);^-/|,%_>?`:#@'="000000~000000000000000000000{}\0000000000000\r
+00000000000000000000000000000000000000000000000000000000SPSSPORTA8/199805296/173\r
+832111/SPSS for MS WINDOWS Release 7.549/5B/68/VAR0000170/8/VAR000015/8/2/5/8/2/\r
+81/82/83/70/8/VAR000025/8/2/5/8/2/BA/K/70/8/VAR000035/8/2/5/8/2/B10/1A/81K/70/8/\r
+VAR000045/8/2/5/8/2/A1/70/8/VAR000055/8/2/5/8/2/92/70/8/VAR000065/8/2/5/8/2/9A9E\r
+17IR6IFNR+6H/70/8/VAR000075/8/2/5/8/2/92/83/70/8/VAR000085/8/2/5/8/2/70/8/VAR000\r
+095/8/2/5/8/2/F0/*.*.*.*.*.*.*.*.ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ\r
diff --git a/rust/pspp/src/file/testdata/test.sav b/rust/pspp/src/file/testdata/test.sav
new file mode 100644 (file)
index 0000000..a84e8f1
Binary files /dev/null and b/rust/pspp/src/file/testdata/test.sav differ
diff --git a/rust/pspp/src/file/testdata/test.sps b/rust/pspp/src/file/testdata/test.sps
new file mode 100644 (file)
index 0000000..3f1afb2
--- /dev/null
@@ -0,0 +1 @@
+DATA LIST /X 1.
diff --git a/rust/pspp/src/file/testdata/test.spv b/rust/pspp/src/file/testdata/test.spv
new file mode 100644 (file)
index 0000000..891263d
Binary files /dev/null and b/rust/pspp/src/file/testdata/test.spv differ
index 43ba5198b81ef6764c662923817a1501bd361a15..401b22f0faba86b9d3c19d41ab66a9ad24fff738 100644 (file)
@@ -695,7 +695,7 @@ impl TryFrom<UncheckedFormat> for Format {
         } else if !format.width_range().contains(&w) {
             Err(Error::BadWidth(source))
         } else if d > max_d {
-            if format.takes_decimals() {
+            if !format.takes_decimals() {
                 Err(Error::DecimalsNotAllowedForFormat(source))
             } else if max_d > 0 {
                 Err(Error::TooManyDecimalsForWidth {
index ee2e8a68ee25200bf4e73b943dfc93c9e8456391..00114eba12468efd7f246932325100dbadb93584 100644 (file)
@@ -107,6 +107,7 @@ pub mod data;
 pub mod dictionary;
 pub mod endian;
 pub mod engine;
+pub mod file;
 pub mod format;
 pub mod hexfloat;
 pub mod identifier;
@@ -116,6 +117,7 @@ pub mod locale_charset;
 pub mod macros;
 pub mod message;
 pub mod output;
+pub mod por;
 pub mod prompt;
 pub mod settings;
 pub mod sys;
index b00e2b4a89155c8ddf5d092658f8463bec75917c..5cdd92ee71534935484964a010a44ce41dce51d4 100644 (file)
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>. */
 
-use anyhow::{anyhow, Error as AnyError, Result};
-use clap::{Parser, Subcommand, ValueEnum};
+use anyhow::Result;
+use clap::{Parser, Subcommand};
 use encoding_rs::Encoding;
-use std::path::Path;
 use thiserror::Error as ThisError;
 
-use crate::{convert::Convert, decrypt::Decrypt, show::Show};
+use crate::{convert::Convert, decrypt::Decrypt, show::Show, show_por::ShowPor};
 
 mod convert;
 mod decrypt;
 mod show;
+mod show_por;
 
 /// PSPP, a program for statistical analysis of sampled data.
 #[derive(Parser, Debug)]
@@ -34,40 +34,12 @@ struct Cli {
     command: Command,
 }
 
-/// Output file format.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)]
-enum OutputFormat {
-    /// Comma-separated values using each variable's print format (variable
-    /// names are written as the first line)
-    Csv,
-
-    /// System file
-    Sys,
-}
-
-impl TryFrom<&Path> for OutputFormat {
-    type Error = AnyError;
-
-    fn try_from(value: &Path) -> std::result::Result<Self, Self::Error> {
-        let extension = value.extension().unwrap_or_default();
-        if extension.eq_ignore_ascii_case("csv") || extension.eq_ignore_ascii_case("txt") {
-            Ok(OutputFormat::Csv)
-        } else if extension.eq_ignore_ascii_case("sav") || extension.eq_ignore_ascii_case("sys") {
-            Ok(OutputFormat::Sys)
-        } else {
-            Err(anyhow!(
-                "Unknown output file extension '{}'",
-                extension.display()
-            ))
-        }
-    }
-}
-
 #[derive(Subcommand, Clone, Debug)]
 enum Command {
     Convert(Convert),
     Decrypt(Decrypt),
     Show(Show),
+    ShowPor(ShowPor),
 }
 
 impl Command {
@@ -76,6 +48,7 @@ impl Command {
             Command::Convert(convert) => convert.run(),
             Command::Decrypt(decrypt) => decrypt.run(),
             Command::Show(show) => show.run(),
+            Command::ShowPor(show_por) => show_por.run(),
         }
     }
 }
index 2c9f17b307c5ec16e57328b75d4f17e2a46c686c..c909d8dd38320d1ddd30acd29c4a6335d9e62933 100644 (file)
@@ -71,6 +71,7 @@ use thiserror::Error as ThisError;
 use tlo::parse_tlo;
 
 use crate::{
+    calendar::date_time_to_pspp,
     data::{ByteString, Datum, EncodedString, RawString},
     format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat},
     settings::{Settings, Show},
@@ -2007,6 +2008,9 @@ impl Value {
             styling: None,
         }
     }
+    pub fn new_date_time(date_time: NaiveDateTime) -> Self {
+        Self::new_number_with_format(Some(date_time_to_pspp(date_time)), Format::DATETIME40_0)
+    }
     pub fn new_number_with_format(x: Option<f64>, format: Format) -> Self {
         Self::new(ValueInner::Number(NumberValue {
             show: None,
diff --git a/rust/pspp/src/por.rs b/rust/pspp/src/por.rs
new file mode 100644 (file)
index 0000000..5488c5a
--- /dev/null
@@ -0,0 +1,56 @@
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Reading and writing portable files.
+//!
+//! This module enables reading and writing “portable files”, a text-based
+//! format for SPSS data files.  The [portable file format] dates back 40+ years.
+//! It was originally designed to facilitate data interchange between systems
+//! with unlike character sets, but it did not continue to evolve after the
+//! system file format was introduced.  It is obsolete.  PSPP includes readers
+//! and writers for portable files only for compatibility; all non-legacy uses
+//! of PSPP should use [system files] instead.
+//!
+//! Use [PortableFile] to read a portable file.  Use [WriteOptions] to write a
+//! portable file.
+//!
+//! [portable file format]: https://pspp.benpfaff.org/manual/portable.html
+//! [system files]: crate::sys
+#![cfg_attr(not(test), warn(missing_docs))]
+
+mod read;
+mod write;
+
+pub use read::{
+    Cases, Error, ErrorDetails, Metadata, PortableFile, ReadPad, ReadTranslate, TranslationTable,
+    Warning,
+};
+pub use write::{WriteOptions, Writer};
+
+static PORTABLE_TO_WINDOWS_1252: &[u8] = {
+    let s =
+    b"                                                                \
+      0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .\
+      <(+|&[]!$*);^-/|,%_>?`:#@'=\"  \xb1 \xb0\x86~\x96   0\xb9\xb2\xb3456789   \x97() {}\\\xa2\x95                                                                   ";
+    assert!(s.len() == 256);
+    s
+};
+
+/// Returns the windows-1252 character corresponding to the given `portable`
+/// character.
+fn portable_to_windows_1252(portable: u8) -> u8 {
+    PORTABLE_TO_WINDOWS_1252[portable as usize]
+}
diff --git a/rust/pspp/src/por/read.rs b/rust/pspp/src/por/read.rs
new file mode 100644 (file)
index 0000000..ac726ca
--- /dev/null
@@ -0,0 +1,1223 @@
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use std::{
+    cmp::Ordering,
+    fmt::{Display, Formatter},
+    fs::File,
+    io::{BufRead, BufReader, Error as IoError, Read, Result as IoResult, Seek, SeekFrom},
+    ops::Index,
+    path::Path,
+};
+
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use codepage_437::CP437_WINGDINGS;
+use encoding_rs::WINDOWS_1252;
+use indexmap::set::MutableValues;
+use num::{Bounded, NumCast};
+use serde::{ser::SerializeSeq, Serialize, Serializer};
+
+use crate::{
+    data::{ByteString, Case, Datum, RawString, WithEncoding},
+    dictionary::{DictIndex, Dictionary},
+    format::{Error as FormatError, Format, Type, UncheckedFormat},
+    identifier::{Error as IdError, Identifier},
+    output::pivot::{MetadataEntry, MetadataValue, PivotTable, Value},
+    por::portable_to_windows_1252,
+    variable::{MissingValueRange, MissingValues, MissingValuesError, VarType, VarWidth, Variable},
+};
+use displaydoc::Display;
+use thiserror::Error as ThisError;
+
+/// An SPSS portable file.
+///
+/// Obtain one with [PortableFile::open] or [PortableFile::open_file].  Use
+/// [PortableFile::into_parts] to take ownership of the individual fields.
+#[derive(Debug)]
+pub struct PortableFile<R> {
+    /// The portable file's dictionary.
+    pub dictionary: Dictionary,
+
+    /// Portable file metadata that is not part of the dictionary.
+    pub metadata: Metadata,
+
+    /// Reader for the cases (data) in the portable file.
+    pub cases: Cases<ReadTranslate<ReadPad<R>>>,
+}
+
+impl<R> PortableFile<R> {
+    /// Consumes the [PortableFile], splitting it into its dictionary,
+    /// metadata, and case reader, in that order.
+    pub fn into_parts(self) -> (Dictionary, Metadata, Cases<ReadTranslate<ReadPad<R>>>) {
+        let Self {
+            dictionary,
+            metadata,
+            cases,
+        } = self;
+        (dictionary, metadata, cases)
+    }
+}
+
+/// Portable file metadata that is not part of [Dictionary].
+///
+/// Every field except `character_set` is optional because the corresponding
+/// record may be absent from the file or, for the creation date, unparseable.
+#[derive(Clone, Debug, PartialEq, Eq, Serialize)]
+pub struct Metadata {
+    /// Creation date and time.
+    ///
+    /// This comes from the file header, not from the file system.
+    pub creation: Option<NaiveDateTime>,
+
+    /// Name of the product that wrote the file.
+    pub product: Option<String>,
+
+    /// Extended name of the product that wrote the file.
+    pub product_ext: Option<String>,
+
+    /// Identifies the organization licensed for the product that wrote the
+    /// file.
+    pub author: Option<String>,
+
+    /// The file's embedded character encoding translation table.
+    ///
+    /// These are the 256 bytes exactly as they appear in the file header.
+    /// They are serialized one element per byte by `serialize_character_set`.
+    #[serde(serialize_with = "serialize_character_set")]
+    pub character_set: [u8; 256],
+}
+
+/// Serializes `translations` as a sequence with one element per byte.
+///
+/// Each element is a tuple that starts with the byte's position as two
+/// hexadecimal digits, followed by the byte converted to a `char` with `as`.
+/// If decoding the byte with [CP437_WINGDINGS] yields a different character,
+/// that character is appended as a third tuple member.
+fn serialize_character_set<S>(translations: &[u8; 256], serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+{
+    let mut seq = serializer.serialize_seq(Some(translations.len()))?;
+    for (index, &byte) in translations.iter().enumerate() {
+        let position = format!("{index:02x}");
+        let windows_1252 = byte as char;
+        let cp_437 = CP437_WINGDINGS.decode(byte);
+        if windows_1252 != cp_437 {
+            seq.serialize_element(&(position, windows_1252, cp_437))?;
+        } else {
+            seq.serialize_element(&(position, windows_1252))?;
+        }
+    }
+    seq.end()
+}
+
+impl From<&Metadata> for PivotTable {
+    /// Renders `value` as a "Portable File Metadata" pivot table with one row
+    /// each for the creation time, product, extended product, and author.
+    /// Fields that are `None` are shown using the default [Value].
+    fn from(value: &Metadata) -> Self {
+        /// Builds a leaf entry called `name`, substituting the default
+        /// [Value] when `content` is absent.
+        fn leaf(name: &str, content: Option<Value>) -> MetadataEntry {
+            MetadataEntry {
+                name: Value::new_user_text(name),
+                value: MetadataValue::Leaf(content.unwrap_or_default()),
+            }
+        }
+
+        /// Builds a leaf entry called `name` from an optional string.
+        fn text(name: &str, s: &Option<String>) -> MetadataEntry {
+            leaf(name, s.clone().map(Value::new_user_text))
+        }
+
+        let entries = vec![
+            leaf("Created", value.creation.map(Value::new_date_time)),
+            text("Product", &value.product),
+            text("Product 2", &value.product_ext),
+            text("Author", &value.author),
+        ];
+        let root = MetadataEntry {
+            name: Value::new_user_text("Portable File Metadata"),
+            value: MetadataValue::Group(entries),
+        };
+        root.into_pivot_table()
+    }
+}
+
+/// Reader for cases in a portable file.
+///
+/// When the underlying reader implements [Read] and [Seek], this is an
+/// [Iterator] that yields one case per call.  After it reports end of data or
+/// an error, it yields nothing further.
+#[derive(Debug)]
+pub struct Cases<R> {
+    /// Source of the case data.
+    reader: R,
+    /// Width of each variable, in the order that values appear in each case.
+    variables: Vec<VarWidth>,
+    /// Set once end of data or an error has been encountered.
+    eof: bool,
+}
+
+impl<R> Cases<R> {
+    /// Creates a case reader that takes cases from `reader`, expecting each
+    /// case to hold one value per entry in `variables`.
+    fn new(reader: R, variables: Vec<VarWidth>) -> Self {
+        let eof = false;
+        Self {
+            reader,
+            variables,
+            eof,
+        }
+    }
+
+    /// Reads a single case.
+    ///
+    /// Returns `Ok(None)` if the next byte is `Z`, which marks the end of the
+    /// data.  String values are padded with spaces, or truncated, to the
+    /// declared width of their variable.
+    fn read_case(&mut self) -> Result<Option<Case<Vec<Datum<ByteString>>>>, ErrorDetails>
+    where
+        R: Read,
+    {
+        // A `Z` where a case would start means that there are no more cases.
+        let first = read_byte(&mut self.reader)?;
+        if first == b'Z' {
+            return Ok(None);
+        }
+
+        // Otherwise the byte we just consumed belongs to the first value, so
+        // put it back in front of the rest of the input.
+        let lookahead = [first];
+        let mut input = lookahead.chain(&mut self.reader);
+
+        let mut data = Vec::with_capacity(self.variables.len());
+        for var_width in &self.variables {
+            let datum = match var_width {
+                VarWidth::Numeric => Datum::Number(read_f64_or_missing(&mut input)?),
+                VarWidth::String(n) => {
+                    let mut bytes = read_raw_string(&mut input)?;
+                    bytes.resize(usize::from(*n), b' ');
+                    Datum::String(bytes.into())
+                }
+            };
+            data.push(datum);
+        }
+        Ok(Some(Case::new(data, WINDOWS_1252)))
+    }
+}
+
+impl<R> Iterator for Cases<R>
+where
+    R: Read + Seek,
+{
+    type Item = Result<Case<Vec<Datum<ByteString>>>, Error>;
+
+    /// Yields the next case, or an error annotated with the reader's current
+    /// position.  Yields nothing if there are no variables, and nothing more
+    /// after the end of the data or after the first error.
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.eof || self.variables.is_empty() {
+            return None;
+        }
+
+        match self.read_case() {
+            Ok(Some(case)) => Some(Ok(case)),
+            Ok(None) => {
+                self.eof = true;
+                None
+            }
+            Err(details) => {
+                self.eof = true;
+                let offset = self.reader.stream_position().ok();
+                Some(Err(Error { offset, details }))
+            }
+        }
+    }
+}
+
+/// An error encountered reading a portable file.
+///
+/// Its [Display] output is the message for `details`, prefixed by the file
+/// offset when one is available.
+#[derive(Debug)]
+pub struct Error {
+    /// Offset where the error occurred.
+    ///
+    /// This is `None` if the offset is not known, e.g. for an error converted
+    /// directly from an I/O error.
+    pub offset: Option<u64>,
+
+    /// Details of the error.
+    pub details: ErrorDetails,
+}
+
+// Uses the trait's default methods only; in particular, `source()` is not
+// overridden to expose `details`.
+impl std::error::Error for Error {}
+
+impl Error {
+    /// Creates an [Error] that reports `details` as having occurred at
+    /// `offset`, if it is known.
+    pub fn new(offset: Option<u64>, details: ErrorDetails) -> Self {
+        Error { offset, details }
+    }
+}
+
+impl From<IoError> for Error {
+    /// Wraps an I/O error as an [Error] with no known file offset.
+    fn from(value: IoError) -> Self {
+        Self {
+            offset: None,
+            details: ErrorDetails::from(value),
+        }
+    }
+}
+
+impl Display for Error {
+    /// Writes the error details, preceded by the hexadecimal file offset if
+    /// one was recorded.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let Self { offset, details } = self;
+        if let Some(offset) = offset {
+            write!(f, "Error at file offset {:#x}: ", offset)?;
+        }
+        write!(f, "{}", details)
+    }
+}
+
+// NOTE: `Display` here is derived by `displaydoc`, which uses the doc comment
+// on each variant as its message format string.  Editing a variant's doc
+// comment therefore changes the text shown to users, and `{...}` placeholders
+// in it must name fields of that variant.
+/// An error for reading a [PortableFile].
+#[derive(Display, ThisError, Debug)]
+pub enum ErrorDetails {
+    /// Not an SPSS portable file.
+    NotAPortableFile,
+
+    /// Unrecognized version code '{0}'.
+    UnrecognizedVersionCode(char),
+
+    /// I/O error ({0}).
+    Io(#[from] IoError),
+
+    /// Number expected.
+    NumberExpected,
+
+    /// Integer expected.
+    InvalidInteger,
+
+    /// Expected integer between {min_value} and {max_value}, instead of {float}.
+    OutOfRangeInteger {
+        /// Value actually read.
+        float: f64,
+        /// Minimum valid integer value.
+        min_value: String,
+        /// Maximum valid integer value.
+        max_value: String,
+    },
+
+    /// Missing numeric terminator.
+    MissingSlash,
+
+    /// Invalid string length {0}.
+    InvalidStringLength(i32),
+
+    /// Expected variable count record with tag 4 (instead of tag {0:?}).
+    ExpectedVariableCountRecord(char),
+
+    /// Invalid number of variables {0}.
+    InvalidNumberOfVariables(i32),
+
+    /// Expected variable record.
+    ExpectedVariableRecord,
+
+    /// Invalid width {width} for variable {name}.
+    InvalidVariableWidth {
+        /// Declared width.
+        width: i32,
+        /// Variable name.
+        name: Identifier,
+    },
+
+    /// System-missing value where number expected.
+    UnexpectedSysmis,
+
+    /// Data record expected.
+    DataRecordExpected,
+
+    /// Value label record had no valid variable indexes.
+    NoValueLabelVariables,
+}
+
+// NOTE: `Display` here is derived by `displaydoc`, which uses the doc comment
+// on each variant as its message format string.  Editing a variant's doc
+// comment therefore changes the text shown to users.
+//
+// NOTE(review): the message for `InvalidVariableName` does not mention its
+// `id_error` or `new_name` fields, unlike `DuplicateVariableName` -- confirm
+// whether that is intentional.
+/// A warning while reading a [PortableFile].
+#[derive(Display, ThisError, Debug)]
+pub enum Warning {
+    /// Invalid date {0}.
+    InvalidDate(String),
+
+    /// Invalid time {0}.
+    InvalidTime(String),
+
+    /// Invalid variable name.
+    InvalidVariableName {
+        /// Identifier error.
+        id_error: IdError,
+        /// New name.
+        new_name: Identifier,
+    },
+
+    /// Renaming variable with duplicate name {duplicate_name} to {new_name}.
+    DuplicateVariableName {
+        /// Duplicate name.
+        duplicate_name: Identifier,
+        /// New name.
+        new_name: Identifier,
+    },
+
+    /// Substituting {new_format} for invalid print format on variable {variable}.  {format_error}
+    InvalidPrintFormat {
+        /// New format.
+        new_format: Format,
+        /// Variable.
+        variable: Identifier,
+        /// Underlying error.
+        format_error: FormatError,
+    },
+
+    /// Substituting {new_format} for invalid write format on variable {variable}.  {format_error}
+    InvalidWriteFormat {
+        /// New format.
+        new_format: Format,
+        /// Variable.
+        variable: Identifier,
+        /// Underlying error.
+        format_error: FormatError,
+    },
+
+    /// Missing value range may not contain system-missing value.
+    MissingValueRangeSysmis,
+
+    /// Invalid missing values for variable {name}: {error}.
+    InvalidMissingValues {
+        /// Variable name.
+        name: Identifier,
+        /// Kind of error with missing values.
+        error: MissingValuesError,
+    },
+
+    /// Unknown weight variable {0}.
+    UnknownWeightVariable(Identifier),
+
+    /// Invalid identifier {string}.  {error}
+    InvalidIdentifier {
+        /// String that should be an identifier.
+        string: String,
+        /// Kind of error with the string.
+        error: IdError,
+    },
+
+    /// Unknown variable name {0}.
+    UnknownVariableName(Identifier),
+
+    /// Mixed variable types in value labels.
+    MixedVariableTypes,
+}
+
+/// Translation table from file bytes to [WINDOWS_1252].
+///
+/// A byte in the file with value `x` is interpreted in [WINDOWS_1252] as
+/// `self.0[x]`.  The table can also be indexed directly with a `u8`.  File
+/// bytes that the file's character set does not define translate to 0.
+#[derive(Debug)]
+pub struct TranslationTable(
+    /// Translation table.
+    [u8; 256],
+);
+
+impl TranslationTable {
+    /// Builds the translation table from `character_set`, the character set
+    /// embedded in a portable file.
+    ///
+    /// `character_set[p]` is the byte that the file uses for portable
+    /// character code `p`.  When the file uses the same byte for more than one
+    /// code, the lowest code wins.  Bytes that the file never uses map to 0.
+    fn new(character_set: &[u8; 256]) -> Self {
+        let mut table = [0u8; 256];
+
+        // The first 64 entries of the character set are skipped.  They are
+        // probably all '0', which marks them as untranslatable, and honoring
+        // them would clobber the translation for the real '0'.
+        for (portable, &file_byte) in character_set.iter().enumerate().skip(64) {
+            let slot = &mut table[usize::from(file_byte)];
+            if *slot == 0 {
+                *slot = portable_to_windows_1252(portable as u8);
+            }
+        }
+        Self(table)
+    }
+}
+
+impl Index<u8> for TranslationTable {
+    type Output = u8;
+
+    /// Returns the translation of file byte `index`.
+    fn index(&self, index: u8) -> &Self::Output {
+        let Self(table) = self;
+        &table[usize::from(index)]
+    }
+}
+
+impl PortableFile<BufReader<File>> {
+    /// Opens the file at `path` with buffering and reads it as a portable
+    /// file, invoking `warn` with any warnings diagnosed along the way.
+    pub fn open_file<P, F>(path: P, warn: F) -> Result<Self, Error>
+    where
+        P: AsRef<Path>,
+        F: FnMut(Warning),
+    {
+        let file = File::open(path)?;
+        Self::open(BufReader::new(file), warn)
+    }
+}
+
+impl<R> PortableFile<R>
+where
+    R: Read + Seek,
+{
+    /// Counts how often each byte value occurs in the body of `reader`, which
+    /// should be in the SPSS portable file format.
+    ///
+    /// The 200-byte header, the 256-byte character set, and the 8 bytes that
+    /// follow are not counted.  Returns the counts, indexed by byte value,
+    /// along with a [TranslationTable] derived from the character set.
+    pub fn read_histogram(reader: R) -> Result<([usize; 256], TranslationTable), Error>
+    where
+        R: BufRead,
+    {
+        let mut reader = ReadPad::new(reader);
+
+        // Skip the header, keeping only the character set.
+        let mut splash = [0; 200];
+        reader.read_exact(&mut splash)?;
+        let mut character_set = [0; 256];
+        reader.read_exact(&mut character_set)?;
+        let mut signature = [0; 8];
+        reader.read_exact(&mut signature)?;
+
+        // Tally everything that follows.
+        let mut histogram = [0; 256];
+        let mut chunk = [0; 4096];
+        loop {
+            match reader.read(&mut chunk)? {
+                0 => break,
+                n => {
+                    for &byte in &chunk[..n] {
+                        histogram[usize::from(byte)] += 1;
+                    }
+                }
+            }
+        }
+
+        let translations = TranslationTable::new(&character_set);
+        Ok((histogram, translations))
+    }
+
+    /// Opens `reader` as a portable file, invoking `warn` with any warnings
+    /// diagnosed while reading it.
+    pub fn open<F>(reader: R, mut warn: F) -> Result<Self, Error>
+    where
+        F: FnMut(Warning),
+    {
+        /// Reads everything from the `SPSSPORT` signature up to and including
+        /// the `F` tag that introduces the data, returning the dictionary and
+        /// metadata found along the way.
+        fn read_inner<R, F>(
+            mut reader: R,
+            mut warn: F,
+            character_set: [u8; 256],
+        ) -> Result<(Dictionary, Metadata), ErrorDetails>
+        where
+            R: Read + Seek,
+            F: FnMut(Warning),
+        {
+            let mut signature = [0; 8];
+            reader.read_exact(&mut signature)?;
+            if signature != *b"SPSSPORT" {
+                return Err(ErrorDetails::NotAPortableFile);
+            }
+
+            // Each reader hands back the tag byte that follows its records.
+            let (tag, metadata) = read_version(&mut reader, &mut warn, character_set)?;
+            let (mut tag, mut dictionary) = read_variables(&mut reader, tag, &mut warn)?;
+
+            // Any number of value label records.
+            while tag == b'D' {
+                read_value_label(&mut reader, &mut dictionary, &mut warn)?;
+                tag = read_byte(&mut reader)?;
+            }
+
+            // At most one documents record.
+            if tag == b'E' {
+                read_documents(&mut reader, &mut dictionary)?;
+                tag = read_byte(&mut reader)?;
+            }
+
+            match tag {
+                b'F' => Ok((dictionary, metadata)),
+                _ => Err(ErrorDetails::DataRecordExpected),
+            }
+        }
+        /// Reads the version code, the creation date and time, and the
+        /// optional product (tag `1`), author (tag `2`), and extended product
+        /// (tag `3`) records.
+        ///
+        /// Returns the tag byte that follows these records along with the
+        /// [Metadata].  A malformed date or time only produces a warning.
+        fn read_version<R, F>(
+            mut reader: R,
+            mut warn: F,
+            character_set: [u8; 256],
+        ) -> Result<(u8, Metadata), ErrorDetails>
+        where
+            R: Read,
+            F: FnMut(Warning),
+        {
+            /// If `*c` is `tag`, reads the string that follows it and then
+            /// advances `*c` to the next tag byte.
+            fn read_tagged_string<R>(
+                reader: &mut R,
+                c: &mut u8,
+                tag: u8,
+            ) -> Result<Option<String>, ErrorDetails>
+            where
+                R: Read,
+            {
+                if *c != tag {
+                    return Ok(None);
+                }
+                let string = read_string(&mut *reader)?;
+                *c = read_byte(&mut *reader)?;
+                Ok(Some(string))
+            }
+
+            let version = read_byte(&mut reader)?;
+            if version != b'A' {
+                return Err(ErrorDetails::UnrecognizedVersionCode(version as char));
+            }
+
+            // The date has the form YYYYMMDD.
+            let date_text = read_string(&mut reader)?;
+            let date = if date_text.len() == 8
+                && date_text.is_ascii()
+                && let Ok(year) = date_text[..4].parse()
+                && let Ok(month) = date_text[4..6].parse()
+                && let Ok(day) = date_text[6..].parse()
+                && let Some(parsed) = NaiveDate::from_ymd_opt(year, month, day)
+            {
+                Some(parsed)
+            } else {
+                warn(Warning::InvalidDate(date_text));
+                None
+            };
+
+            // The time is HHMMSS, read as a single decimal number.  A blank
+            // time is silently treated as absent.
+            let time_text = read_string(&mut reader)?;
+            let time = if let Ok(hms) = time_text.trim().parse::<u32>()
+                && let Some(parsed) =
+                    NaiveTime::from_hms_opt(hms / 10000, (hms % 10000) / 100, hms % 100)
+            {
+                Some(parsed)
+            } else {
+                if !time_text.trim().is_empty() {
+                    warn(Warning::InvalidTime(time_text));
+                }
+                None
+            };
+            let creation = date.map(|date| NaiveDateTime::new(date, time.unwrap_or_default()));
+
+            let mut c = read_byte(&mut reader)?;
+            let product = read_tagged_string(&mut reader, &mut c, b'1')?;
+            let author = read_tagged_string(&mut reader, &mut c, b'2')?;
+            let product_ext = read_tagged_string(&mut reader, &mut c, b'3')?;
+
+            let metadata = Metadata {
+                creation,
+                product,
+                product_ext,
+                author,
+                character_set,
+            };
+            Ok((c, metadata))
+        }
+
+        /// Reads a format specification (type, width, and decimals) for a
+        /// variable of the given `width`.
+        ///
+        /// If the specification is invalid or incompatible with `width`, calls
+        /// `warn` with the default format for `width` and the reason, and
+        /// returns that default format.
+        fn read_format<R, F>(
+            mut reader: R,
+            width: VarWidth,
+            warn: F,
+        ) -> Result<Format, ErrorDetails>
+        where
+            R: Read,
+            F: FnOnce(Format, FormatError),
+        {
+            let type_: u16 = read_integer(&mut reader)?;
+            let w: u16 = read_integer(&mut reader)?;
+            let d: u8 = read_integer(&mut reader)?;
+
+            let checked = Type::try_from(type_)
+                .map(|type_| UncheckedFormat { type_, w, d })
+                .and_then(Format::try_from)
+                .and_then(|format| format.check_width_compatibility(width));
+            let format = match checked {
+                Ok(format) => format,
+                Err(error) => {
+                    let fallback = Format::default_for_width(width);
+                    warn(fallback, error);
+                    fallback
+                }
+            };
+            Ok(format)
+        }
+
+        /// Reads the variable count record and the variable records that
+        /// follow it, building a [Dictionary].
+        ///
+        /// `c` is the tag byte already read by the caller; it must be `4`.
+        /// Returns the first tag byte that follows the last variable record,
+        /// along with the dictionary.  Problems with individual variables
+        /// (invalid or duplicate names, invalid formats or missing values, an
+        /// unknown weight variable) are reported through `warn` rather than
+        /// as errors.
+        fn read_variables<R, F>(
+            mut reader: R,
+            mut c: u8,
+            mut warn: F,
+        ) -> Result<(u8, Dictionary), ErrorDetails>
+        where
+            R: Read + Seek,
+            F: FnMut(Warning),
+        {
+            let mut dictionary = Dictionary::new(WINDOWS_1252);
+
+            if c != b'4' {
+                return Err(ErrorDetails::ExpectedVariableCountRecord(c as char));
+            }
+            let n_vars: usize = read_integer(&mut reader)?;
+
+            // An optional record with tag `5` holds a number that is read and
+            // discarded.
+            c = read_byte(&mut reader)?;
+            if c == b'5' {
+                let _ = read_f64(&mut reader)?;
+                c = read_byte(&mut reader)?;
+            }
+            // An optional record with tag `6` names the weight variable.  It
+            // can only be resolved after all the variables have been read.
+            let weight_name = if c == b'6' {
+                let weight_name = read_identifier(&mut reader, &mut warn)?;
+                c = read_byte(&mut reader)?;
+                weight_name
+            } else {
+                None
+            };
+
+            // Generates replacement names `VAR001`, `VAR002`, ..., skipping
+            // any that are already in the dictionary.
+            let mut n_generated_names = 0;
+            fn generate_name(dictionary: &Dictionary, n_generated_names: &mut usize) -> Identifier {
+                loop {
+                    *n_generated_names = n_generated_names.checked_add(1).unwrap();
+                    let name = Identifier::from_encoding(
+                        format!("VAR{:03}", *n_generated_names),
+                        WINDOWS_1252,
+                    )
+                    .unwrap();
+                    if !dictionary.variables.contains(&name.0) {
+                        return name;
+                    }
+                }
+            }
+
+            for _ in 0..n_vars {
+                // Each variable record starts with tag `7`.
+                if c != b'7' {
+                    return Err(ErrorDetails::ExpectedVariableRecord);
+                }
+                let width: u16 = read_integer(&mut reader)?;
+                let name = read_string(&mut reader)?;
+                // Replace invalid or duplicate names with generated ones.
+                let name = match Identifier::from_encoding(name, WINDOWS_1252)
+                    .and_then(Identifier::must_be_ordinary)
+                {
+                    Ok(name) => {
+                        if !dictionary.variables.contains(&name.0) {
+                            name
+                        } else {
+                            let new_name = generate_name(&dictionary, &mut n_generated_names);
+                            warn(Warning::DuplicateVariableName {
+                                duplicate_name: name.clone(),
+                                new_name: new_name.clone(),
+                            });
+                            new_name
+                        }
+                    }
+                    Err(id_error) => {
+                        let new_name = generate_name(&dictionary, &mut n_generated_names);
+                        warn(Warning::InvalidVariableName {
+                            id_error,
+                            new_name: new_name.clone(),
+                        });
+                        new_name
+                    }
+                };
+                // A width of 0 denotes a numeric variable; anything else is
+                // the width of a string variable.
+                let width = match width {
+                    0 => VarWidth::Numeric,
+                    width => VarWidth::String(width as u16),
+                };
+
+                let print = read_format(&mut reader, width, |new_spec, format_error| {
+                    warn(Warning::InvalidPrintFormat {
+                        new_format: new_spec,
+                        variable: name.clone(),
+                        format_error,
+                    })
+                })?;
+                let write = read_format(&mut reader, width, |new_spec, format_error| {
+                    warn(Warning::InvalidWriteFormat {
+                        new_format: new_spec,
+                        variable: name.clone(),
+                        format_error,
+                    })
+                })?;
+
+                // Missing values: at most one range (tag `B` for a closed
+                // range, `A` for a lower bound only, `9` for an upper bound
+                // only), followed by any number of individual values (tag
+                // `8`).
+                c = read_byte(&mut reader)?;
+                let range = match c {
+                    b'B' => Some(MissingValueRange::In {
+                        low: read_f64(&mut reader)?,
+                        high: read_f64(&mut reader)?,
+                    }),
+                    b'A' => Some(MissingValueRange::From {
+                        low: read_f64(&mut reader)?,
+                    }),
+                    b'9' => Some(MissingValueRange::To {
+                        high: read_f64(&mut reader)?,
+                    }),
+                    _ => None,
+                };
+                // If `c` was a range tag, it has been consumed, so advance to
+                // the next tag.
+                if range.is_some() {
+                    c = read_byte(&mut reader)?;
+                }
+                let mut values = Vec::new();
+                while c == b'8' {
+                    values.push(read_value(&mut reader, width.into())?);
+                    c = read_byte(&mut reader)?;
+                }
+                // Invalid missing values are dropped with a warning.
+                let missing_values = MissingValues::new(values, range)
+                    .inspect_err(|error| {
+                        warn(Warning::InvalidMissingValues {
+                            name: name.clone(),
+                            error: *error,
+                        })
+                    })
+                    .unwrap_or_default();
+
+                // An optional record with tag `C` holds the variable label.
+                let label = if c == b'C' {
+                    let label = read_string(&mut reader)?;
+                    c = read_byte(&mut reader)?;
+                    Some(label)
+                } else {
+                    None
+                };
+
+                let mut variable = Variable::new(name, width, WINDOWS_1252);
+                variable.print_format = print;
+                variable.write_format = write;
+                if let Err(error) = variable.missing_values_mut().replace(missing_values) {
+                    warn(Warning::InvalidMissingValues {
+                        name: variable.name.clone(),
+                        error,
+                    })
+                }
+                variable.label = label;
+                // The name was checked (or generated) above so that it does
+                // not duplicate one already in the dictionary.
+                dictionary.add_var(variable).unwrap();
+            }
+
+            // Now that all the variables are known, resolve the weight
+            // variable.  A failure from `set_weight` is ignored.
+            if let Some(weight_name) = weight_name {
+                if let Some(dict_index) = dictionary.variables.get_index_of(&weight_name.0) {
+                    let _ = dictionary.set_weight(Some(dict_index));
+                } else {
+                    warn(Warning::UnknownWeightVariable(weight_name))
+                }
+            }
+            Ok((c, dictionary))
+        }
+
+        /// Reads one value label record and adds its labels to the named
+        /// variables in `dictionary`.
+        ///
+        /// The record holds a count of variable names, the names themselves, a
+        /// count of labels, and then that many (value, label) pairs.  Invalid
+        /// or unknown variable names, and variables whose type differs from
+        /// that of the first valid variable, are skipped with a warning.
+        ///
+        /// # Errors
+        ///
+        /// Returns [ErrorDetails::NoValueLabelVariables] if none of the names
+        /// refers to a variable in `dictionary`, since without one there is no
+        /// way to tell whether the values are numbers or strings.
+        fn read_value_label<R, F>(
+            mut reader: R,
+            dictionary: &mut Dictionary,
+            mut warn: F,
+        ) -> Result<(), ErrorDetails>
+        where
+            R: Read,
+            F: FnMut(Warning),
+        {
+            let n_variables: usize = read_integer(&mut reader)?;
+
+            // `n_variables` comes straight from the file, so don't trust it
+            // enough to preallocate: a corrupt count could otherwise make the
+            // allocation panic or abort.  If the count is bogus, reading the
+            // names below will fail soon enough.
+            let mut dict_indexes = Vec::new();
+            let mut var_type = None;
+            for _ in 0..n_variables {
+                if let Some(dict_index) = read_variable_name(&mut reader, dictionary, &mut warn)? {
+                    let type_ = VarType::from(dictionary.variables[dict_index].width);
+                    if var_type.is_none() {
+                        var_type = Some(type_);
+                    } else if var_type != Some(type_) {
+                        warn(Warning::MixedVariableTypes);
+                        continue;
+                    }
+                    dict_indexes.push(dict_index);
+                }
+            }
+            let Some(var_type) = var_type else {
+                return Err(ErrorDetails::NoValueLabelVariables);
+            };
+
+            let n_labels = read_integer(&mut reader)?;
+            for _ in 0..n_labels {
+                let value = read_value(&mut reader, var_type)?.without_encoding();
+                let label = read_string(&mut reader)?;
+                for dict_index in dict_indexes.iter().copied() {
+                    dictionary
+                        .variables
+                        .get_index_mut2(dict_index)
+                        .unwrap()
+                        .value_labels
+                        .insert(value.clone(), label.clone());
+                }
+            }
+            Ok(())
+        }
+
+        /// Reads a documents record, which is a line count followed by that
+        /// many strings, appending each line to the documents in `dictionary`.
+        fn read_documents<R>(mut reader: R, dictionary: &mut Dictionary) -> Result<(), ErrorDetails>
+        where
+            R: Read,
+        {
+            let n_lines: usize = read_integer(&mut reader)?;
+            let mut remaining = n_lines;
+            while remaining > 0 {
+                let line = read_string(&mut reader)?;
+                dictionary.documents.push(line);
+                remaining -= 1;
+            }
+            Ok(())
+        }
+
+        let mut reader = ReadPad::new(reader);
+
+        // The file starts with 200 bytes of vanity splash strings, which carry
+        // no information that we need.
+        let mut splash = [0; 200];
+        reader.read_exact(&mut splash)?;
+
+        // Next comes the character set.  Everything after it is read through
+        // the translation table derived from it.
+        let mut character_set = [0; 256];
+        reader.read_exact(&mut character_set)?;
+        let mut reader = ReadTranslate::new(reader, TranslationTable::new(&character_set));
+
+        // Read the dictionary, annotating any error with the file position
+        // that the reader had reached.
+        let (dictionary, metadata) = match read_inner(&mut reader, &mut warn, character_set) {
+            Ok(parts) => parts,
+            Err(details) => {
+                let offset = reader.stream_position().ok();
+                return Err(Error { offset, details });
+            }
+        };
+
+        let variables = dictionary
+            .variables
+            .iter()
+            .map(|variable| variable.width)
+            .collect();
+        let cases = Cases::new(reader, variables);
+        Ok(PortableFile {
+            dictionary,
+            metadata,
+            cases,
+        })
+    }
+}
+
+/// Reads a string, which is an integer length followed by that many bytes,
+/// and returns the bytes exactly as `reader` supplies them.
+fn read_raw_string<R>(mut reader: R) -> Result<Vec<u8>, ErrorDetails>
+where
+    R: Read,
+{
+    let length: u16 = read_integer(&mut reader)?;
+    let mut bytes = vec![0u8; usize::from(length)];
+    reader.read_exact(&mut bytes)?;
+    Ok(bytes)
+}
+
+/// Reads a string (see [read_raw_string]) and decodes it from windows-1252.
+///
+/// `reader` is expected to supply windows-1252 bytes, as a reader that
+/// translates through a [TranslationTable] does.
+fn read_string<R>(reader: R) -> Result<String, ErrorDetails>
+where
+    R: Read,
+{
+    let bytes = read_raw_string(reader)?;
+
+    // The translation table can yield bytes above 0x7f (for example, the
+    // windows-1252 codes for `±` and `°`), so the bytes are not necessarily
+    // valid UTF-8 and must be decoded rather than reinterpreted.  Decoding
+    // windows-1252 cannot fail, and for plain ASCII it yields the same string
+    // as before.
+    let (string, _had_errors) = WINDOWS_1252.decode_without_bom_handling(&bytes);
+    Ok(string.into_owned())
+}
+
+/// Reads a string and converts it to an [Identifier].
+///
+/// If the string is not a valid identifier, reports that through `warn` and
+/// returns `Ok(None)`.
+fn read_identifier<R, F>(reader: R, mut warn: F) -> Result<Option<Identifier>, ErrorDetails>
+where
+    R: Read,
+    F: FnMut(Warning),
+{
+    let string = read_string(reader)?;
+    let parsed = Identifier::from_encoding(string.clone(), WINDOWS_1252);
+    let identifier = match parsed {
+        Ok(identifier) => Some(identifier),
+        Err(error) => {
+            warn(Warning::InvalidIdentifier { string, error });
+            None
+        }
+    };
+    Ok(identifier)
+}
+
+/// Reads a variable name and looks it up in `dictionary`.
+///
+/// Returns the variable's index, or `Ok(None)` after reporting a warning
+/// through `warn` if the name is not a valid identifier or does not name a
+/// variable in `dictionary`.
+fn read_variable_name<R, F>(
+    reader: R,
+    dictionary: &Dictionary,
+    mut warn: F,
+) -> Result<Option<DictIndex>, ErrorDetails>
+where
+    R: Read,
+    F: FnMut(Warning),
+{
+    let Some(var_name) = read_identifier(reader, &mut warn)? else {
+        return Ok(None);
+    };
+    let found = dictionary.variables.get_index_of(&var_name.0);
+    match found {
+        Some(dict_index) => Ok(Some(dict_index)),
+        None => {
+            warn(Warning::UnknownVariableName(var_name));
+            Ok(None)
+        }
+    }
+}
+
+/// Reads a number and converts it to integer type `T`.
+///
+/// # Errors
+///
+/// Fails with [ErrorDetails::InvalidInteger] if the number has a fractional
+/// part or is outside the range of `i64`, and with
+/// [ErrorDetails::OutOfRangeInteger] if it is otherwise an integer but does
+/// not fit in `T`.
+fn read_integer<T, R>(reader: R) -> Result<T, ErrorDetails>
+where
+    R: Read,
+    T: NumCast + Bounded + Display,
+{
+    let float = read_f64(reader)?;
+
+    let is_whole = float.trunc() == float;
+    let fits_i64 = float >= i64::MIN as f64 && float <= i64::MAX as f64;
+    if !(is_whole && fits_i64) {
+        return Err(ErrorDetails::InvalidInteger);
+    }
+
+    match num::cast(float) {
+        Some(integer) => Ok(integer),
+        None => Err(ErrorDetails::OutOfRangeInteger {
+            float,
+            min_value: T::min_value().to_string(),
+            max_value: T::max_value().to_string(),
+        }),
+    }
+}
+
+/// Reads a value of type `var_type`: a number (possibly system-missing) for
+/// [VarType::Numeric], or a string for [VarType::String].
+///
+/// String values are returned as the bytes supplied by `reader`, labeled as
+/// [WINDOWS_1252].
+fn read_value<R>(
+    reader: R,
+    var_type: VarType,
+) -> Result<Datum<WithEncoding<ByteString>>, ErrorDetails>
+where
+    R: Read,
+{
+    match var_type {
+        VarType::Numeric => Ok(Datum::Number(read_f64_or_missing(reader)?)),
+        // Take the raw bytes rather than going through `String`: the bytes
+        // are windows-1252, not necessarily valid UTF-8, and they are stored
+        // as windows-1252 anyway.
+        VarType::String => Ok(Datum::String(
+            ByteString::from(read_raw_string(reader)?).with_encoding(WINDOWS_1252),
+        )),
+    }
+}
+
+/// Reads a number that must not be system-missing.
+///
+/// Fails with [ErrorDetails::UnexpectedSysmis] if the value read is the
+/// system-missing value.
+fn read_f64<R>(reader: R) -> Result<f64, ErrorDetails>
+where
+    R: Read,
+{
+    let Some(value) = read_f64_or_missing(reader)? else {
+        return Err(ErrorDetails::UnexpectedSysmis);
+    };
+    Ok(value)
+}
+
+/// Reads a number in portable file syntax, returning `None` for the
+/// system-missing value.
+///
+/// The syntax accepted is: optional leading spaces; an optional `-`; base-30
+/// digits with at most one `.` among them; an optional exponent that consists
+/// of `+` or `-` followed by base-30 digits and scales the value by that power
+/// of 30; and a terminating `/`.  The system-missing value is written as `*`
+/// followed by one more byte, which is consumed and ignored.
+///
+/// Values too large for `f64` are clamped to `f64::MAX` in magnitude.
+///
+/// # Errors
+///
+/// Fails with [ErrorDetails::NumberExpected] if there are no digits, with
+/// [ErrorDetails::MissingSlash] if the number is not terminated by `/`, and
+/// with [ErrorDetails::Io] on a read error, including end of file.
+fn read_f64_or_missing<R>(mut reader: R) -> Result<Option<f64>, ErrorDetails>
+where
+    R: Read,
+{
+    let mut c = read_byte(&mut reader)?;
+    while c == b' ' {
+        c = read_byte(&mut reader)?;
+    }
+    if c == b'*' {
+        let _ = read_byte(&mut reader)?;
+        return Ok(None);
+    }
+    let negative = c == b'-';
+    if negative {
+        c = read_byte(&mut reader)?;
+    }
+
+    // The value is `significand * 30**exponent`.
+    let mut significand = 0u64;
+    let mut exponent = 0i32;
+    let mut saw_dot = false;
+    let mut saw_digit = false;
+    loop {
+        if let Some(digit) = (c as char).to_digit(30) {
+            saw_digit = true;
+            if significand >= u64::MAX / 30 - 30 {
+                // The value of the digit doesn't matter, since we have
+                // already recorded more digits than can be represented in
+                // `f64`.  We just need to record that there was another digit
+                // so that we can multiply by 30 later.
+                exponent = exponent.saturating_add(1);
+            } else {
+                significand = significand * 30 + digit as u64;
+            }
+
+            if saw_dot {
+                exponent = exponent.saturating_sub(1);
+            }
+        } else if c == b'.' && !saw_dot {
+            saw_dot = true;
+        } else {
+            break;
+        }
+
+        c = read_byte(&mut reader)?;
+    }
+    if !saw_digit {
+        return Err(ErrorDetails::NumberExpected);
+    }
+
+    if c == b'+' || c == b'-' {
+        let negative_exponent = c == b'-';
+        let mut exp = 0i32;
+        c = read_byte(&mut reader)?;
+        while let Some(digit) = (c as char).to_digit(30) {
+            // Saturate instead of overflowing: the file is untrusted input
+            // and an exponent this large is out of range for `f64` anyway.
+            exp = exp.saturating_mul(30).saturating_add(digit as i32);
+            c = read_byte(&mut reader)?;
+        }
+        // `+` scales the value up and `-` scales it down.
+        exponent = if negative_exponent {
+            exponent.saturating_sub(exp)
+        } else {
+            exponent.saturating_add(exp)
+        };
+    }
+
+    if c != b'/' {
+        return Err(ErrorDetails::MissingSlash);
+    }
+
+    let significand = significand as f64;
+    let num = match exponent.cmp(&0) {
+        Ordering::Less => significand * 30.0f64.powi(exponent),
+        Ordering::Equal => significand,
+        // Zero stays zero regardless of the exponent.  Without this arm, a
+        // huge exponent would compute `0.0 * inf`, which is NaN.
+        Ordering::Greater if significand == 0.0 => 0.0,
+        Ordering::Greater if significand > f64::MAX * 30.0f64.powi(-exponent) => f64::MAX,
+        Ordering::Greater => significand * 30.0f64.powi(exponent),
+    };
+    Ok(Some(if negative { -num } else { num }))
+}
+
+fn read_byte<R>(mut reader: R) -> IoResult<u8>
+where
+    R: Read,
+{
+    let mut byte = 0;
+    reader.read_exact(std::slice::from_mut(&mut byte))?;
+    Ok(byte)
+}
+
+/// A [Read] wrapper that translates the bytes it reads using a
+/// [TranslationTable].
+#[derive(Debug)]
+pub struct ReadTranslate<R> {
+    inner: R,
+    translations: TranslationTable,
+}
+
+impl<R> ReadTranslate<R> {
+    /// Create a new [ReadTranslate] with `inner` and `translations`.
+    pub fn new(inner: R, translations: TranslationTable) -> Self {
+        Self {
+            inner,
+            translations,
+        }
+    }
+
+    /// Consumes this [ReadTranslate], returning the inner reader.
+    pub fn into_inner(self) -> R {
+        self.inner
+    }
+}
+
+impl<R> Read for ReadTranslate<R>
+where
+    R: Read,
+{
+    fn read(&mut self, buf: &mut [u8]) -> IoResult<usize> {
+        let n = self.inner.read(buf)?;
+        for c in &mut buf[..n] {
+            *c = self.translations[*c];
+        }
+        Ok(n)
+    }
+}
+
+impl<R> Seek for ReadTranslate<R>
+where
+    R: Seek,
+{
+    fn seek(&mut self, pos: SeekFrom) -> IoResult<u64> {
+        self.inner.seek(pos)
+    }
+}
+
+/// A [Read] wrapper that skips newlines and pads lines to 80 bytes with spaces.
+#[derive(Debug)]
+pub struct ReadPad<R> {
+    inner: R,
+    at_newline: bool,
+    line_length: usize,
+}
+
+impl<R> ReadPad<R> {
+    /// Constructs a [ReadPad] wrapper for `inner`.
+    pub fn new(inner: R) -> Self {
+        Self {
+            inner,
+            at_newline: false,
+            line_length: 0,
+        }
+    }
+
+    /// Consumes this [ReadPad], returning the inner reader.
+    pub fn into_inner(self) -> R {
+        self.inner
+    }
+}
+
+impl<R> Read for ReadPad<R>
+where
+    R: Read,
+{
+    fn read(&mut self, buf: &mut [u8]) -> IoResult<usize> {
+        for (i, c) in buf.into_iter().enumerate() {
+            if self.at_newline {
+                *c = b' ';
+                self.line_length += 1;
+                if self.line_length >= 80 {
+                    self.at_newline = false;
+                    self.line_length = 0;
+                }
+            } else {
+                loop {
+                    match self.inner.read(std::slice::from_mut(c)) {
+                        Ok(1) => (),
+                        other => return if i > 0 { Ok(i) } else { other },
+                    };
+                    match *c {
+                        b'\r' => continue,
+                        b'\n' => match self.line_length {
+                            80.. => {
+                                self.line_length = 0;
+                                continue;
+                            }
+                            79 => {
+                                self.line_length = 0;
+                                *c = b' ';
+                                break;
+                            }
+                            0..79 => {
+                                self.at_newline = true;
+                                self.line_length += 1;
+                                *c = b' ';
+                                break;
+                            }
+                        },
+                        _ => {
+                            self.line_length += 1;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+        Ok(buf.len())
+    }
+}
+
+impl<R> Seek for ReadPad<R>
+where
+    R: Seek,
+{
+    fn seek(&mut self, pos: SeekFrom) -> IoResult<u64> {
+        self.inner.seek(pos)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        io::{BufRead, BufReader, Cursor},
+        path::Path,
+    };
+
+    use itertools::Itertools;
+
+    use crate::{
+        data::cases_to_output,
+        output::{
+            pivot::{tests::assert_lines_eq, PivotTable},
+            Details, Item, Text,
+        },
+        por::{PortableFile, ReadPad},
+    };
+
+    /// Checks that [ReadPad] drops CR LF line endings and pads each short
+    /// line with spaces, so that two short input lines come back as a single
+    /// line of text.
+    #[test]
+    fn read_wrapper() {
+        let mut lines = BufReader::new(ReadPad::new(Cursor::new(
+            b"abcdefghijklmnop\r\n0123456789\r\n",
+        )))
+        .lines();
+        assert_eq!(lines.next().unwrap().unwrap(), "abcdefghijklmnop                                                                0123456789                                                                      ");
+    }
+
+    /// Reads `src/por/testdata/<name>.por` and compares its rendered form
+    /// against `src/por/testdata/<name>.expected`.
+    ///
+    /// On success the rendered form is a group holding any warnings, the
+    /// metadata table, the dictionary's pivot tables, and the cases.  If the
+    /// file cannot be opened, it is the error message instead.
+    ///
+    /// If the output differs and the `PSPP_REFRESH_EXPECTED` environment
+    /// variable is set, the expected file is overwritten with the actual
+    /// output and the test panics to report the refresh.
+    fn test_porfile(name: &str) {
+        let base_filename = Path::new("src/por/testdata").join(name);
+        let input_filename = base_filename.with_extension("por");
+        let expected_filename = base_filename.with_extension("expected");
+
+        let mut warnings = Vec::new();
+        let output = match PortableFile::open_file(input_filename, |warning| warnings.push(warning))
+        {
+            Ok(portable_file) => {
+                let (dictionary, metadata, cases) = portable_file.into_parts();
+
+                let mut output = Vec::new();
+                output.extend(
+                    warnings
+                        .into_iter()
+                        .map(|warning| Item::from(Text::new_log(warning.to_string()))),
+                );
+                output.push(PivotTable::from(&metadata).into());
+                output.extend(dictionary.all_pivot_tables().into_iter().map_into());
+                output.extend(cases_to_output(&dictionary, cases));
+                Item::new(Details::Group(output.into_iter().map_into().collect()))
+            }
+            Err(error) => Item::new(Details::Text(Box::new(Text::new_log(error.to_string())))),
+        };
+
+        let actual = output.to_string();
+        let expected = std::fs::read_to_string(&expected_filename).unwrap();
+        if expected != actual {
+            if std::env::var("PSPP_REFRESH_EXPECTED").is_ok() {
+                std::fs::write(&expected_filename, actual).unwrap();
+                panic!("{}: refreshed output", expected_filename.display());
+            } else {
+                eprintln!("note: rerun with PSPP_REFRESH_EXPECTED=1 to refresh expected output");
+            }
+        }
+        assert_lines_eq(&expected, expected_filename.display(), &actual, "actual");
+    }
+
+    /// Golden-output test for `test1.por`.
+    #[test]
+    fn porfile_test1() {
+        test_porfile("test1");
+    }
+
+    /// Golden-output test for `test2.por`.
+    #[test]
+    fn porfile_test2() {
+        test_porfile("test2");
+    }
+}
diff --git a/rust/pspp/src/por/testdata/README.md b/rust/pspp/src/por/testdata/README.md
new file mode 100644 (file)
index 0000000..c48d1d3
--- /dev/null
@@ -0,0 +1,3 @@
+The two .por files in this directory are old ones found on the
+Internet.  `test1.por` self-identifies as a `PFF TEST FILE`.  They do
+not contain any personally identifying information.
diff --git a/rust/pspp/src/por/testdata/test1.expected b/rust/pspp/src/por/testdata/test1.expected
new file mode 100644 (file)
index 0000000..9105615
--- /dev/null
@@ -0,0 +1,151 @@
+╭─────────┬──────────────────────────────────╮
+│Created  │              15-FEB-1986 14:48:43│
+│Product  │SPSS-X RELEASE 2.1  FOR IBM VM/CMS│
+│Product 2│PFF TEST FILE                     │
+│Author   │HARVARD UNIV COMPUTING CENTER     │
+╰─────────┴──────────────────────────────────╯
+
+╭─────────┬─╮
+│Variables│9│
+╰─────────┴─╯
+
+                                                                   Variables
+╭─────────────────────────┬────────┬─────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮
+│                         │Position│          Label          │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│
+├─────────────────────────┼────────┼─────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤
+│SATISFACTION WITH HOUSING│       1│SATISFACTION WITH HOUSING│                 │Input│    8│Right    │F1.0        │F1.0        │0; 9          │
+│SATISFACTION NEIGHBORHOOD│       2│SATISFACTION NEIGHBORHOOD│                 │Input│    8│Right    │F1.0        │F1.0        │0; 9          │
+│RESPOND. EDUCATION       │       3│RESPOND. EDUCATION       │                 │Input│    8│Right    │F2.0        │F2.0        │0; 99         │
+│INCOME IN 1984           │       4│INCOME IN 1984           │                 │Input│    8│Right    │F3.0        │F3.0        │0; 999        │
+│STANDARD OF LIVING       │       5│STANDARD OF LIVING       │                 │Input│    8│Right    │F1.0        │F1.0        │0; 9          │
+│RESPONDENT RACE          │       6│RESPONDENT RACE          │                 │Input│    8│Right    │F1.0        │F1.0        │0; 9          │
+│RESPONDENT SEX           │       7│RESPONDENT SEX           │                 │Input│    8│Right    │F1.0        │F1.0        │0; 9          │
+│RESPONDENT AGE           │       8│RESPONDENT AGE           │                 │Input│    8│Right    │F2.0        │F2.0        │0; 99         │
+│R MARITAL STATUS         │       9│R MARITAL STATUS         │                 │Input│    8│Right    │F1.0        │F1.0        │0; 9          │
+╰─────────────────────────┴────────┴─────────────────────────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯
+
+              Value Labels
+╭───────────────────────────┬──────────╮
+│Variable Value             │          │
+├───────────────────────────┼──────────┤
+│SATISFACTION WITH HOUSING 1│VERY      │
+│                          2│FAIRLY    │
+│                          3│NOT VERY  │
+├───────────────────────────┼──────────┤
+│SATISFACTION NEIGHBORHOOD 1│VERY      │
+│                          2│FAIRLY    │
+│                          3│NOT VERY  │
+├───────────────────────────┼──────────┤
+│RESPOND. EDUCATION        1│NONE      │
+│                          2│< 8 YRS   │
+│                          3│C GRD SC  │
+│                          4│SOME HS   │
+│                          5│CMPL.H.S. │
+│                          6│1-3 COLL  │
+│                          7│COLL DEG  │
+│                          8│MASTERS   │
+│                          9│PHD-M.D.  │
+├───────────────────────────┼──────────┤
+│STANDARD OF LIVING        1│PROSP     │
+│                          2│VERY COMF │
+│                          3│REAS COMF │
+│                          4│GET BY    │
+│                          5│NEAR POOR │
+│                          6│POOR      │
+├───────────────────────────┼──────────┤
+│RESPONDENT RACE           1│BLACK     │
+│                          2│HISPANIC  │
+│                          3│WHITE     │
+│                          4│ASIAN     │
+│                          5│AM IND    │
+│                          8│OTHER     │
+├───────────────────────────┼──────────┤
+│RESPONDENT SEX            1│MALE      │
+│                          2│FEMALE    │
+├───────────────────────────┼──────────┤
+│R MARITAL STATUS          1│MARRIED   │
+│                          2│DIVORCED  │
+│                          3│SEPARATD  │
+│                          4│WIDOWED   │
+│                          5│NEVER MARR│
+╰───────────────────────────┴──────────╯
+
+╭────┬─────────────────────────┬─────────────────────────┬──────────────────┬──────────────┬──────────────────┬───────────────┬──────────────┬──────────────┬────────────────╮
+│Case│SATISFACTION WITH HOUSING│SATISFACTION NEIGHBORHOOD│RESPOND. EDUCATION│INCOME IN 1984│STANDARD OF LIVING│RESPONDENT RACE│RESPONDENT SEX│RESPONDENT AGE│R MARITAL STATUS│
+├────┼─────────────────────────┼─────────────────────────┼──────────────────┼──────────────┼──────────────────┼───────────────┼──────────────┼──────────────┼────────────────┤
+│1   │                     2.00│                     1.00│              5.00│         50.00│              3.00│           3.00│          2.00│         29.00│            1.00│
+│2   │                     2.00│                     1.00│              7.00│         25.00│              3.00│           3.00│          1.00│         26.00│            5.00│
+│3   │                     1.00│                     2.00│              6.00│         35.00│              3.00│           3.00│          2.00│         53.00│            1.00│
+│4   │                     2.00│                     2.00│              6.00│         25.00│              4.00│           3.00│          2.00│         23.00│            3.00│
+│5   │                     2.00│                     1.00│              6.00│         11.00│              3.00│           3.00│          2.00│         21.00│            5.00│
+│6   │                     1.00│                     1.00│              5.00│         12.00│              3.00│           3.00│          1.00│           .03│            5.00│
+│7   │                     2.00│                     2.00│              8.00│         22.00│              3.00│           3.00│          2.00│         42.00│            2.00│
+│8   │                     1.00│                     1.00│              7.00│         35.00│              3.00│           3.00│          2.00│         35.00│            1.00│
+│9   │                     2.00│                     2.00│              7.00│           .03│              3.00│           3.00│          1.00│           .03│            5.00│
+│10  │                     1.00│                     1.00│              7.00│          8.00│              2.00│           3.00│          1.00│         22.00│            5.00│
+│11  │                     1.00│                     1.00│              5.00│         18.00│              4.00│           3.00│          2.00│         55.00│            4.00│
+│12  │                     1.00│                     1.00│              7.00│         45.00│              3.00│           3.00│          2.00│         56.00│            1.00│
+│13  │                     3.00│                     3.00│              5.00│          2.00│              4.00│           1.00│          2.00│         24.00│            5.00│
+│14  │                     3.00│                     2.00│              7.00│           .07│              3.00│           3.00│          1.00│         42.00│            1.00│
+│15  │                     1.00│                     1.00│              5.00│         35.00│              2.00│           3.00│          2.00│           .03│            1.00│
+│16  │                     1.00│                     1.00│              8.00│         12.00│              4.00│           3.00│          1.00│         29.00│            5.00│
+│17  │                     1.00│                     1.00│              6.00│         25.00│              3.00│           3.00│          1.00│         55.00│            1.00│
+│18  │                     1.00│                     1.00│              5.00│         20.00│              4.00│           9.00│          2.00│           .07│            4.00│
+│19  │                     1.00│                     1.00│              6.00│         42.00│              4.00│           3.00│          1.00│         51.00│            4.00│
+│20  │                     1.00│                     1.00│              6.00│         70.00│              2.00│           3.00│          1.00│         33.00│            1.00│
+│21  │                     1.00│                     2.00│              6.00│          2.00│              2.00│           3.00│          2.00│         20.00│            5.00│
+│22  │                     3.00│                     1.00│              9.00│         20.00│              3.00│           4.00│          2.00│         32.00│            3.00│
+│23  │                     2.00│                     1.00│              6.00│         13.00│              4.00│           3.00│          2.00│         33.00│            1.00│
+│24  │                     3.00│                     3.00│              8.00│          7.00│              4.00│           3.00│          2.00│         28.00│            5.00│
+│25  │                     2.00│                     2.00│              5.00│         35.00│              3.00│           3.00│          1.00│         51.00│            1.00│
+│26  │                     1.00│                     1.00│              6.00│         40.00│              3.00│           3.00│          1.00│         59.00│            1.00│
+│27  │                     2.00│                     1.00│              5.00│         46.00│              4.00│           3.00│          1.00│         47.00│            1.00│
+│28  │                     2.00│                     2.00│              5.00│         25.00│              2.00│           3.00│          1.00│         28.00│            1.00│
+│29  │                     2.00│                     1.00│              8.00│         50.00│              3.00│           3.00│          1.00│         39.00│            2.00│
+│30  │                     3.00│                     2.00│              5.00│          6.00│              4.00│           3.00│          2.00│         86.00│            4.00│
+│31  │                     2.00│                     2.00│              6.00│           .03│              2.00│           3.00│          2.00│         24.00│            5.00│
+│32  │                     2.00│                     1.00│              7.00│         25.00│              4.00│           3.00│          2.00│         38.00│            1.00│
+│33  │                     1.00│                     1.00│              5.00│          8.00│              3.00│           3.00│          1.00│         18.00│            5.00│
+│34  │                     1.00│                     1.00│              8.00│         40.00│              3.00│           3.00│          2.00│         58.00│            1.00│
+│35  │                     2.00│                     2.00│              7.00│         25.00│              3.00│           3.00│          2.00│           .03│            5.00│
+│36  │                     1.00│                     1.00│              5.00│         20.00│              3.00│           3.00│          2.00│         28.00│            5.00│
+│37  │                     1.00│                     1.00│              2.00│         15.00│              3.00│           8.00│          2.00│         49.00│            1.00│
+│38  │                     2.00│                     1.00│              7.00│         35.00│              3.00│           3.00│          2.00│         29.00│            1.00│
+│39  │                     2.00│                     2.00│              7.00│          1.00│              4.00│           3.00│          2.00│         35.00│            5.00│
+│40  │                     2.00│                     1.00│              6.00│         11.00│              4.00│           3.00│          2.00│         48.00│            2.00│
+│41  │                     1.00│                     1.00│              6.00│         20.00│              3.00│           3.00│          2.00│         67.00│            1.00│
+│42  │                     1.00│                     2.00│              5.00│         15.00│              4.00│           3.00│          2.00│         39.00│            3.00│
+│43  │                     2.00│                     2.00│              6.00│           .03│              3.00│           3.00│          1.00│         32.00│            1.00│
+│44  │                     1.00│                     1.00│              5.00│         42.00│              3.00│           3.00│          2.00│         44.00│            1.00│
+│45  │                     3.00│                     3.00│              5.00│         16.00│              3.00│           3.00│          2.00│         29.00│            2.00│
+│46  │                     1.00│                     1.00│              7.00│         14.00│              3.00│           3.00│          1.00│         42.00│            5.00│
+│47  │                     1.00│                     1.00│              7.00│         20.00│              3.00│           3.00│          2.00│         36.00│            2.00│
+│48  │                     2.00│                     2.00│              7.00│         45.00│              2.00│           3.00│          2.00│         33.00│            1.00│
+│49  │                     2.00│                     2.00│              5.00│           .03│              2.00│           3.00│          2.00│         32.00│            1.00│
+│50  │                     2.00│                     2.00│              9.00│          9.00│              3.00│           3.00│          1.00│         24.00│            5.00│
+│51  │                     1.00│                     2.00│              6.00│         35.00│              3.00│           3.00│          2.00│         41.00│            1.00│
+│52  │                     1.00│                     1.00│              5.00│           .03│              3.00│           3.00│          2.00│         29.00│            1.00│
+│53  │                     1.00│                     1.00│              5.00│           .07│              2.00│           3.00│          2.00│         64.00│            1.00│
+│54  │                     1.00│                     1.00│              7.00│         20.00│              4.00│           3.00│          1.00│           .03│            5.00│
+│55  │                     1.00│                     1.00│              5.00│          3.00│              3.00│           8.00│          1.00│         21.00│            5.00│
+│56  │                     1.00│                     2.00│              4.00│         20.00│              2.00│           3.00│          1.00│         22.00│            5.00│
+│57  │                     1.00│                     1.00│              6.00│         45.00│              3.00│           3.00│          2.00│         38.00│            1.00│
+│58  │                     1.00│                     1.00│              7.00│           .03│              3.00│           3.00│          2.00│           .03│            5.00│
+│59  │                     2.00│                     1.00│              7.00│         10.00│              3.00│           3.00│          2.00│         25.00│            5.00│
+│60  │                     1.00│                     1.00│              6.00│         11.00│              4.00│           3.00│          2.00│         38.00│            2.00│
+│61  │                     3.00│                     2.00│              5.00│         15.00│              5.00│           1.00│          2.00│         34.00│            2.00│
+│62  │                     1.00│                     1.00│              3.00│         16.00│              4.00│           3.00│          1.00│         48.00│            2.00│
+│63  │                     1.00│                     1.00│              5.00│         20.00│              4.00│           3.00│          2.00│         37.00│            4.00│
+│64  │                     1.00│                     2.00│              7.00│         14.00│              3.00│           3.00│          2.00│         25.00│            5.00│
+│65  │                     1.00│                     2.00│              6.00│           .10│              2.00│           3.00│          1.00│         57.00│            1.00│
+│66  │                     3.00│                     2.00│              8.00│         15.00│              3.00│           3.00│          2.00│         42.00│            3.00│
+│67  │                     1.00│                     2.00│              8.00│         80.00│              3.00│           3.00│          1.00│         46.00│            1.00│
+│68  │                     3.00│                     2.00│              5.00│          8.00│              5.00│           3.00│          2.00│         58.00│            4.00│
+│69  │                     1.00│                     2.00│              7.00│         20.00│              3.00│           3.00│          2.00│         25.00│            5.00│
+│70  │                     1.00│                     2.00│              5.00│           .03│              4.00│           3.00│          2.00│         37.00│            1.00│
+│71  │                     1.00│                     1.00│              7.00│         25.00│              2.00│           3.00│          2.00│         27.00│            5.00│
+│72  │                     1.00│                     1.00│              5.00│         13.00│              4.00│           3.00│          2.00│         42.00│            2.00│
+│73  │                     2.00│                     3.00│              7.00│         40.00│              3.00│           3.00│          2.00│         32.00│            1.00│
+│74  │                     2.00│                     2.00│              5.00│         40.00│              3.00│           3.00│          1.00│         63.00│            1.00│
+│75  │                     2.00│                     1.00│              5.00│         70.00│              3.00│           3.00│          2.00│         50.00│            1.00│
+╰────┴─────────────────────────┴─────────────────────────┴──────────────────┴──────────────┴──────────────────┴───────────────┴──────────────┴──────────────┴────────────────╯
diff --git a/rust/pspp/src/por/testdata/test1.por b/rust/pspp/src/por/testdata/test1.por
new file mode 100644 (file)
index 0000000..d1ce02a
Binary files /dev/null and b/rust/pspp/src/por/testdata/test1.por differ
diff --git a/rust/pspp/src/por/testdata/test2.expected b/rust/pspp/src/por/testdata/test2.expected
new file mode 100644 (file)
index 0000000..181f96f
--- /dev/null
@@ -0,0 +1,64 @@
+Invalid date 20040931.
+
+╭───────┬─────────────╮
+│Product│STAT/TRANSFER│
+╰───────┴─────────────╯
+
+╭─────────┬─╮
+│Variables│4│
+╰─────────┴─╯
+
+                                                                     Variables
+╭───────────────────────────┬────────┬───────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮
+│                           │Position│           Label           │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│
+├───────────────────────────┼────────┼───────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤
+│ALCOHOL (C)                │       1│ALCOHOL (C)                │                 │Input│    8│Right    │F9.2        │F9.2        │              │
+│SELF-ESTEEM (SE)           │       2│SELF-ESTEEM (SE)           │                 │Input│    8│Right    │F9.2        │F9.2        │              │
+│BLOOD ALCOHOL CONTENT (BAC)│       3│BLOOD ALCOHOL CONTENT (BAC)│                 │Input│    8│Right    │F9.2        │F9.2        │              │
+│SELF-DISCLOSURE            │       4│SELF-DISCLOSURE            │                 │Input│    8│Right    │F9.2        │F9.2        │              │
+╰───────────────────────────┴────────┴───────────────────────────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯
+
+╭────┬───────────┬────────────────┬───────────────────────────┬───────────────╮
+│Case│ALCOHOL (C)│SELF-ESTEEM (SE)│BLOOD ALCOHOL CONTENT (BAC)│SELF-DISCLOSURE│
+├────┼───────────┼────────────────┼───────────────────────────┼───────────────┤
+│1   │        .00│            3.10│                       4.20│           3.70│
+│2   │        .00│            2.50│                       1.50│           5.70│
+│3   │        .00│            3.00│                       2.00│           4.40│
+│4   │        .00│            2.00│                       2.20│           3.20│
+│5   │        .00│            3.20│                       1.00│           6.90│
+│6   │        .00│            1.80│                       1.70│           3.50│
+│7   │        .00│            2.30│                       2.70│           6.40│
+│8   │        .00│            2.40│                       2.60│           5.80│
+│9   │        .00│            1.00│                       2.80│           4.40│
+│10  │        .00│            3.20│                       3.00│           5.80│
+│11  │        .00│            1.00│                       3.70│           3.80│
+│12  │        .00│            4.70│                       3.30│           5.60│
+│13  │        .00│            2.70│                       2.90│           5.00│
+│14  │        .00│            4.70│                       3.70│           7.00│
+│15  │        .00│            1.80│                       2.30│           4.00│
+│16  │        .00│            2.20│                       3.50│           3.60│
+│17  │        .00│            3.60│                       2.20│           5.90│
+│18  │        .00│            3.90│                       3.40│           5.00│
+│19  │        .00│            3.70│                       1.40│           5.70│
+│20  │        .00│            2.90│                       2.90│           3.40│
+│21  │       1.00│            2.40│                       5.30│           5.30│
+│22  │       1.00│            1.70│                       5.40│           5.90│
+│23  │       1.00│            4.10│                       6.20│           5.10│
+│24  │       1.00│            3.60│                       4.50│           5.90│
+│25  │       1.00│            5.00│                       6.90│           5.00│
+│26  │       1.00│            2.70│                       2.30│           6.90│
+│27  │       1.00│            3.60│                       5.60│           6.30│
+│28  │       1.00│            2.60│                       5.50│           3.90│
+│29  │       1.00│            3.50│                       4.50│           5.90│
+│30  │       1.00│            3.30│                       3.00│           5.90│
+│31  │       1.00│            3.40│                       5.10│           4.80│
+│32  │       1.00│            2.80│                       4.00│           4.80│
+│33  │       1.00│            2.10│                       4.60│           5.80│
+│34  │       1.00│            3.30│                       6.70│           6.20│
+│35  │       1.00│            3.30│                       4.90│           7.50│
+│36  │       1.00│            3.00│                       5.50│           5.50│
+│37  │       1.00│            3.50│                       4.60│           7.00│
+│38  │       1.00│            4.50│                       5.70│           6.40│
+│39  │       1.00│            2.60│                       6.20│           6.00│
+│40  │       1.00│            4.60│                       5.10│           4.50│
+╰────┴───────────┴────────────────┴───────────────────────────┴───────────────╯
diff --git a/rust/pspp/src/por/testdata/test2.por b/rust/pspp/src/por/testdata/test2.por
new file mode 100644 (file)
index 0000000..fdcf5b1
--- /dev/null
@@ -0,0 +1,15 @@
+ASCII SPSS PORT FILE                    ASCII SPSS PORT FILE                    \r
+ASCII SPSS PORT FILE                    ASCII SPSS PORT FILE                    \r
+ASCII SPSS PORT FILE                    0000000000000000000000000000000000000000\r
+0000000000000000000000000123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrst\r
+uvwxyz .<(+0&[]!$*);^-/|,%_>?`:#@'="000000~000000000000000000000{}\0000000000000\r
+00000000000000000000000000000000000000000000000000000000SPSSPORTA8/200409316/   \r
+   1D/STAT/TRANSFER44/5A/70/1/C5/9/2/5/9/2/CB/ALCOHOL (C)70/2/SE5/9/2/5/9/2/CG/S\r
+ELF-ESTEEM (SE)70/3/BAC5/9/2/5/9/2/CR/BLOOD ALCOHOL CONTENT (BAC)70/5/SELFD5/9/2\r
+/5/9/2/CF/SELF-DISCLOSUREF0/3.3/4.6/3.L/0/2.F/1.F/5.L/0/3/2/4.C/0/2/2.6/3.6/0/3.\r
+6/1/6.R/0/1.O/1.L/3.F/0/2.9/2.L/6.C/0/2.C/2.I/5.O/0/1/2.O/4.C/0/3.6/3/5.O/0/1/3.\r
+L/3.O/0/4.L/3.9/5.I/0/2.L/2.R/5/0/4.L/3.L/7/0/1.O/2.9/4/0/2.6/3.F/3.I/0/3.I/2.6/\r
+5.R/0/3.R/3.C/5/0/3.L/1.C/5.L/0/2.R/2.R/3.C/1/2.C/5.9/5.9/1/1.L/5.C/5.R/1/4.3/6.\r
+6/5.3/1/3.I/4.F/5.R/1/5/6.R/5/1/2.L/2.9/6.R/1/3.I/5.I/6.9/1/2.I/5.F/3.R/1/3.F/4.\r
+F/5.R/1/3.9/3/5.R/1/3.C/5.3/4.O/1/2.O/4/4.O/1/2.3/4.I/5.O/1/3.9/6.L/6.6/1/3.9/4.\r
+R/7.F/1/3/5.F/5.F/1/3.F/4.I/7/1/4.F/5.L/6.C/1/2.I/6.6/6/1/4.I/5.3/4.F/ZZZZZZZZZZ
\ No newline at end of file
diff --git a/rust/pspp/src/por/write.rs b/rust/pspp/src/por/write.rs
new file mode 100644 (file)
index 0000000..15aad13
--- /dev/null
@@ -0,0 +1,1308 @@
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use std::{
+    borrow::Cow,
+    cmp::Ordering,
+    collections::HashMap,
+    fmt::{Display, Write as _},
+    fs::File,
+    io::{BufWriter, Error, Write},
+    path::Path,
+};
+
+use chrono::{Local, NaiveDateTime};
+use libm::frexp;
+use smallvec::SmallVec;
+
+use crate::{
+    data::{Datum, RawString},
+    dictionary::Dictionary,
+    por::PORTABLE_TO_WINDOWS_1252,
+    variable::{MissingValueRange, ValueLabels},
+};
+
+/// Precision for floating-point numbers in a portable file.
+///
+/// The wrapped digit count is always in `1..=11`.  Eleven base-30 digits is
+/// what [Precision::from_base_10_digits] yields for `f64::DIGITS`, which is
+/// also the [Default], and it is the largest count that
+/// [Precision::as_base_10_digits] accepts.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Precision(
+    /// Precision in base-30 digits (the base used in portable files).
+    u32,
+);
+
+impl Default for Precision {
+    /// Returns the precision equivalent to `f64::DIGITS` decimal digits.
+    fn default() -> Self {
+        Self::from_base_10_digits(f64::DIGITS)
+    }
+}
+
+impl Precision {
+    /// Largest number of base-30 digits a [Precision] can hold.
+    const MAX_BASE_30_DIGITS: u32 = 11;
+
+    /// Converts a precision in decimal `digits` into the smallest number of
+    /// base-30 digits that is at least as precise, that is,
+    /// `ceil(digits * log30(10))`.  Values below 1 are treated as 1 and
+    /// values above 15 as 15.
+    pub fn from_base_10_digits(digits: u32) -> Self {
+        match digits {
+            0..=1 => Self(1),
+            2 => Self(2),
+            3..=4 => Self(3),
+            5 => Self(4),
+            6..=7 => Self(5),
+            8 => Self(6),
+            9..=10 => Self(7),
+            11 => Self(8),
+            12..=13 => Self(9),
+            14 => Self(10),
+            15.. => Self(11),
+        }
+    }
+
+    /// Constructs a precision of `digits` base-30 digits, clamped to the
+    /// supported range `1..=11`.
+    ///
+    /// The upper bound matches the largest value that
+    /// [Precision::from_base_10_digits] produces, so that the default
+    /// precision can be reconstructed from its own base-30 digit count.
+    pub fn from_base_30_digits(digits: u32) -> Self {
+        Self(digits.clamp(1, Self::MAX_BASE_30_DIGITS))
+    }
+
+    /// Returns the largest number of decimal digits that
+    /// [Precision::from_base_10_digits] maps to this precision.
+    pub fn as_base_10_digits(&self) -> u32 {
+        match self.0 {
+            1 => 1,
+            2 => 2,
+            3 => 4,
+            4 => 5,
+            5 => 7,
+            6 => 8,
+            7 => 10,
+            8 => 11,
+            9 => 13,
+            10 => 14,
+            11 => 15,
+            // Both constructors keep the count within 1..=11.
+            _ => unreachable!(),
+        }
+    }
+
+    /// Returns the precision in base-30 digits.
+    pub fn as_base_30_digits(&self) -> u32 {
+        self.0
+    }
+}
+
+/// Settings that control how a portable file is written.
+#[derive(Clone, Debug)]
+pub struct WriteOptions {
+    /// Timestamp recorded in the file.
+    pub timestamp: NaiveDateTime,
+
+    /// Name of the product that wrote the file.
+    pub product: Cow<'static, str>,
+
+    /// Optional subproduct name.
+    pub product_ext: Option<Cow<'static, str>>,
+
+    /// Optional author.
+    pub author: Option<String>,
+
+    /// Precision used for floating-point numbers.
+    pub precision: Precision,
+}
+
+impl Default for WriteOptions {
+    /// Uses the current local time, a product name built from this crate's
+    /// version, no subproduct or author, and the default [Precision].
+    fn default() -> Self {
+        let product = Cow::from(concat!("GNU PSPP (Rust) ", env!("CARGO_PKG_VERSION")));
+        let timestamp = Local::now().naive_local();
+        Self {
+            timestamp,
+            product,
+            product_ext: None,
+            author: None,
+            precision: Precision::default(),
+        }
+    }
+}
+
+impl WriteOptions {
+    /// Constructs a new set of default options.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Returns `self` with the timestamp to be written replaced by
+    /// `timestamp`.
+    pub fn with_timestamp(mut self, timestamp: NaiveDateTime) -> Self {
+        self.timestamp = timestamp;
+        self
+    }
+
+    /// Returns `self` with the product replaced by `product`.
+    pub fn with_product(mut self, product: Cow<'static, str>) -> Self {
+        self.product = product;
+        self
+    }
+
+    /// Returns `self` with the extended product replaced by `product_ext`.
+    pub fn with_product_ext(mut self, product_ext: Cow<'static, str>) -> Self {
+        self.product_ext = Some(product_ext);
+        self
+    }
+
+    /// Returns `self` with the author replaced by `author`.
+    pub fn with_author(mut self, author: String) -> Self {
+        self.author = Some(author);
+        self
+    }
+
+    /// Returns `self` with the precision replaced by `precision`.
+    pub fn with_precision(mut self, precision: Precision) -> Self {
+        self.precision = precision;
+        self
+    }
+
+    /// Creates the file at `path` and writes `dictionary` to it in portable
+    /// file format.  Returns a [Writer] that can be used for writing cases to
+    /// the new file.
+    pub fn write_file(
+        self,
+        dictionary: &Dictionary,
+        path: impl AsRef<Path>,
+    ) -> Result<Writer<BufWriter<File>>, Error> {
+        let file = File::create(path)?;
+        self.write_writer(dictionary, BufWriter::new(file))
+    }
+
+    /// Writes `dictionary` to `writer` in portable file format.  Returns a
+    /// [Writer] that can be used for writing cases to the new file.
+    pub fn write_writer<W>(self, dictionary: &Dictionary, writer: W) -> Result<Writer<W>, Error>
+    where
+        W: Write + 'static,
+    {
+        // Everything goes through the line-wrapping filter, which the
+        // returned `Writer` keeps using for the cases.
+        let mut filter = WriteFilter::new(writer);
+        DictionaryWriter::new(&self, &mut filter, dictionary).write()?;
+        Ok(Writer {
+            precision: self.precision,
+            inner: Some(filter),
+        })
+    }
+
+    /// Returns a [WriteOptions] with members set to fixed values so that
+    /// running at different times or with different crate names or versions
+    /// won't change what's written to the file.
+    #[cfg(test)]
+    pub(super) fn reproducible() -> Self {
+        use chrono::{NaiveDate, NaiveTime};
+        let date = NaiveDate::from_ymd_opt(2025, 7, 30).unwrap();
+        let time = NaiveTime::from_hms_opt(15, 7, 55).unwrap();
+        WriteOptions::new()
+            .with_timestamp(NaiveDateTime::new(date, time))
+            .with_product(Cow::from("PSPP TEST DATA FILE"))
+    }
+}
+
+/// Writes cases to a portable file.
+///
+/// A [Writer] is obtained from [WriteOptions::write_file] or
+/// [WriteOptions::write_writer].
+pub struct Writer<W> {
+    precision: Precision,
+    inner: Option<WriteFilter<W>>,
+}
+
+impl<W> Writer<W>
+where
+    W: Write,
+{
+    /// Finishes writing the file, consuming the writer.
+    pub fn finish(mut self) -> Result<Option<W>, Error> {
+        self.try_finish()
+    }
+
+    /// Tries to finish writing the file.
+    ///
+    /// Returns the underlying writer the first time this succeeds, and
+    /// `Ok(None)` if the file was already finished.
+    ///
+    /// # Panic
+    ///
+    /// Attempts to write more cases after calling this function will panic.
+    pub fn try_finish(&mut self) -> Result<Option<W>, Error> {
+        let Some(mut filter) = self.inner.take() else {
+            return Ok(None);
+        };
+        filter.write_end()?;
+        Ok(Some(filter.into_inner()))
+    }
+
+    /// Writes `case` to the file.
+    pub fn write_case<B>(&mut self, case: impl IntoIterator<Item = Datum<B>>) -> Result<(), Error>
+    where
+        B: RawString,
+    {
+        // `inner` is `None` only after `try_finish`, which is the documented
+        // panic case.
+        let filter = self.inner.as_mut().unwrap();
+        write_case(filter, case, self.precision)
+    }
+}
+
+/// Writes every datum of `case` to `writer`, in order, formatting numbers
+/// with `precision`.  Stops at the first error and returns it.
+fn write_case<W, B>(
+    mut writer: W,
+    case: impl IntoIterator<Item = Datum<B>>,
+    precision: Precision,
+) -> Result<(), Error>
+where
+    W: Write,
+    B: RawString,
+{
+    case.into_iter()
+        .try_for_each(|datum| write_datum(&mut writer, &datum, precision))
+}
+
+/// [Write] adapter that breaks everything written through it into lines of
+/// exactly 80 bytes, each followed by `\r\n`.
+struct WriteFilter<W> {
+    /// Underlying writer.
+    inner: W,
+
+    /// Number of bytes already written on the current line.  It is 80 only
+    /// while a line terminator is still owed to `inner`.
+    line_len: usize,
+}
+
+impl<W> WriteFilter<W> {
+    /// Wraps `inner`, starting at the beginning of a line.
+    fn new(inner: W) -> Self {
+        Self { inner, line_len: 0 }
+    }
+
+    /// Returns the underlying writer.
+    fn into_inner(self) -> W {
+        self.inner
+    }
+}
+
+impl<W> WriteFilter<W>
+where
+    W: Write,
+{
+    /// Writes the end-of-file marker: one `Z`, then as many more `Z`s as it
+    /// takes to fill out the current line.
+    fn write_end(&mut self) -> std::io::Result<()> {
+        // Write 'Z'.
+        self.write_all(b"Z")?;
+
+        // Finish out the current line with more 'Z's.  If the first 'Z'
+        // completed a line then `line_len` was reset to 0 and nothing more is
+        // needed.
+        if self.line_len != 0 {
+            let fill = [b'Z'; 80];
+            self.write_all(&fill[..80 - self.line_len])?;
+        }
+
+        Ok(())
+    }
+}
+
+impl<W> Write for WriteFilter<W>
+where
+    W: Write,
+{
+    /// Writes as much of `buf` as the underlying writer accepts, inserting
+    /// `\r\n` after every 80th byte, and returns the number of bytes of `buf`
+    /// consumed (the inserted line terminators are not counted).
+    ///
+    /// An error is returned only if nothing from `buf` was consumed;
+    /// otherwise the partial count is returned and the error resurfaces on
+    /// the next call.  If the underlying writer accepts no bytes (`Ok(0)`),
+    /// this returns a short count instead of retrying forever, so that
+    /// [Write::write_all] reports [std::io::ErrorKind::WriteZero].
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        /// Reports `error` unless some bytes were already consumed, in which
+        /// case the partial count `ofs` is reported instead.
+        fn handle_error(error: std::io::Error, ofs: usize) -> std::io::Result<usize> {
+            if ofs > 0 {
+                Ok(ofs)
+            } else {
+                Err(error)
+            }
+        }
+
+        /// Writes `chunk`, which must fit on the current line, to `writer`.
+        /// Bytes below 0x20 are passed along one at a time; runs of other
+        /// bytes are passed along together.  Returns the number of bytes
+        /// written, which is less than `chunk.len()` only if `writer` failed
+        /// or stopped accepting data part way through.
+        fn write_chunk<W>(mut writer: W, chunk: &[u8]) -> std::io::Result<usize>
+        where
+            W: Write,
+        {
+            let mut ofs = 0;
+            while ofs < chunk.len() {
+                let result = if chunk[ofs] < 0x20 {
+                    writer.write(&[chunk[ofs]])
+                } else {
+                    let n = chunk[ofs..].iter().take_while(|b| **b >= 0x20).count();
+                    writer.write(&chunk[ofs..ofs + n])
+                };
+                match result {
+                    // The writer is accepting no more data: report a short
+                    // write rather than spinning.
+                    Ok(0) => return Ok(ofs),
+                    Ok(n) => ofs += n,
+                    Err(error) => return handle_error(error, ofs),
+                }
+            }
+            Ok(ofs)
+        }
+
+        let mut ofs = 0;
+        while ofs < buf.len() {
+            // Never write past the end of the current line.  `chunk` is 0 if
+            // an earlier call filled the line but failed to terminate it; in
+            // that case only the terminator is retried below.
+            let chunk = (buf.len() - ofs).min(80 - self.line_len);
+            let n = match write_chunk(&mut self.inner, &buf[ofs..ofs + chunk]) {
+                Ok(n) => n,
+                Err(error) => return handle_error(error, ofs),
+            };
+            self.line_len += n;
+            ofs += n;
+            if n < chunk {
+                // Short write: let the caller decide whether to retry.
+                return Ok(ofs);
+            }
+            if self.line_len == 80 {
+                if let Err(error) = self.inner.write_all(b"\r\n") {
+                    return handle_error(error, ofs);
+                }
+                self.line_len = 0;
+            }
+        }
+        Ok(ofs)
+    }
+
+    /// Flushes the underlying writer.
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.inner.flush()
+    }
+}
+
+/// Writes the part of a portable file that precedes the cases: the header,
+/// the identification records, and the dictionary.
+struct DictionaryWriter<'a, W> {
+    /// Supplies the timestamp, identification strings, and precision.
+    options: &'a WriteOptions,
+
+    /// Destination for the output.
+    writer: &'a mut W,
+
+    /// Dictionary to write.
+    dictionary: &'a Dictionary,
+
+    /// Upper-cased short name for each variable, parallel to
+    /// `dictionary.variables`.
+    short_names: Vec<String>,
+}
+
+impl<'a, W> DictionaryWriter<'a, W>
+where
+    W: Write,
+{
+    /// Constructs a writer for `dictionary`, taking the first short name that
+    /// the dictionary offers for each variable and converting it to ASCII
+    /// upper case.
+    pub fn new(options: &'a WriteOptions, writer: &'a mut W, dictionary: &'a Dictionary) -> Self {
+        Self {
+            options,
+            writer,
+            dictionary,
+            short_names: dictionary
+                .short_names()
+                .into_iter()
+                .map(|names| {
+                    names
+                        .into_iter()
+                        .next()
+                        .unwrap()
+                        .0
+                        .into_inner()
+                        .to_ascii_uppercase()
+                })
+                .collect(),
+        }
+    }
+
+    /// Writes every record that precedes the cases, in file order.
+    pub fn write(&mut self) -> Result<(), Error> {
+        self.write_header()?;
+        self.write_version()?;
+        self.write_identification()?;
+        self.write_variable_count()?;
+        self.write_precision()?;
+        self.write_case_weight()?;
+        self.write_variables()?;
+        self.write_value_labels()?;
+        self.write_documents()?;
+        Ok(())
+    }
+
+    /// Writes the file header: five copies of a 40-byte banner, the
+    /// character translation table, and the `SPSSPORT` signature.
+    pub fn write_header(&mut self) -> Result<(), Error> {
+        for _ in 0..5 {
+            self.writer
+                .write_all(b"ASCII SPSS PORT FILE                    ")?;
+        }
+        for (index, c) in PORTABLE_TO_WINDOWS_1252.iter().enumerate() {
+            // Spaces in the table are written as '0', except for the entry at
+            // index 0x7e, which is written as an actual space.
+            let c = if *c == b' ' && index != 0x7e {
+                b'0'
+            } else {
+                *c
+            };
+            self.writer.write_all(&[c])?;
+        }
+        self.writer.write_all(b"SPSSPORT")
+    }
+
+    /// Writes the version record: tag `A`, then the date as `YYYYMMDD` and
+    /// the time as `HHMMSS`, each as a string.
+    pub fn write_version(&mut self) -> Result<(), Error> {
+        self.writer.write_all(b"A")?;
+        write_string(
+            &mut self.writer,
+            self.options.timestamp.format("%Y%m%d").to_string(),
+        )?;
+        write_string(
+            &mut self.writer,
+            self.options.timestamp.format("%H%M%S").to_string(),
+        )
+    }
+
+    /// Writes the identification records: tag `1` with the product, then, if
+    /// they are set, tag `2` with the subproduct and tag `3` with the author.
+    pub fn write_identification(&mut self) -> Result<(), Error> {
+        self.writer.write_all(b"1")?;
+        write_string(&mut self.writer, self.options.product.as_bytes())?;
+        if let Some(product_ext) = self.options.product_ext.as_ref() {
+            self.writer.write_all(b"2")?;
+            write_string(&mut self.writer, product_ext.as_bytes())?;
+        }
+        if let Some(author) = self.options.author.as_ref() {
+            self.writer.write_all(b"3")?;
+            write_string(&mut self.writer, author.as_bytes())?;
+        }
+        Ok(())
+    }
+
+    /// Writes tag `4` followed by the number of variables.
+    pub fn write_variable_count(&mut self) -> Result<(), Error> {
+        write!(
+            &mut self.writer,
+            "4{}",
+            TrigesimalInt::new(self.dictionary.variables.len() as i64)
+        )
+    }
+
+    /// Writes tag `5` followed by the precision in base-30 digits.
+    pub fn write_precision(&mut self) -> Result<(), Error> {
+        write!(
+            &mut self.writer,
+            "5{}",
+            TrigesimalInt::new(self.options.precision.as_base_30_digits() as i64)
+        )
+    }
+
+    /// If the dictionary has a weight variable, writes tag `6` followed by
+    /// that variable's short name.
+    pub fn write_case_weight(&mut self) -> Result<(), Error> {
+        if let Some(weight_index) = self.dictionary.weight_index() {
+            self.writer.write_all(b"6")?;
+            write_string(&mut self.writer, self.short_names[weight_index].as_bytes())?;
+        }
+        Ok(())
+    }
+
+    /// Writes one variable record per variable: tag `7`, the string width (0
+    /// if the variable has no string width), the short name, and the print
+    /// and write formats, followed by optional missing-value records and an
+    /// optional label record.
+    pub fn write_variables(&mut self) -> Result<(), Error> {
+        let float = |value| TrigesimalFloat::new(value, self.options.precision);
+        for (variable, short_name) in self.dictionary.variables.iter().zip(&self.short_names) {
+            let width = variable.width.as_string_width().unwrap_or_default() as i64;
+            write!(&mut self.writer, "7{}", TrigesimalInt::new(width))?;
+            write_string(&mut self.writer, short_name.as_bytes())?;
+
+            // Each format is written as its type, width, and decimals.
+            for format in [variable.print_format, variable.write_format] {
+                let type_ = u16::from(format.type_()) as i64;
+                write!(
+                    &mut self.writer,
+                    "{}{}{}",
+                    TrigesimalInt::new(type_),
+                    TrigesimalInt::new(format.w() as i64),
+                    TrigesimalInt::new(format.d() as i64)
+                )?;
+            }
+
+            // Missing-value range: tag `B` for a closed range, `A` for a
+            // range with only a lower bound, `9` for one with only an upper
+            // bound.
+            if let Some(range) = variable.missing_values().range() {
+                match range {
+                    MissingValueRange::In { low, high } => {
+                        write!(&mut self.writer, "B{}{}", float(*low), float(*high))?
+                    }
+                    MissingValueRange::From { low } => {
+                        write!(&mut self.writer, "A{}", float(*low))?
+                    }
+                    MissingValueRange::To { high } => {
+                        write!(&mut self.writer, "9{}", float(*high))?
+                    }
+                }
+            }
+
+            // Individual missing values: tag `8` before each one.
+            for value in variable.missing_values().values() {
+                write!(&mut self.writer, "8")?;
+                write_datum(&mut self.writer, value, self.options.precision)?;
+            }
+
+            // Variable label: tag `C`.
+            if let Some(label) = variable.label() {
+                write!(&mut self.writer, "C")?;
+                write_string(&mut self.writer, label.as_bytes())?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Writes one value-label record (tag `D`) per distinct set of value
+    /// labels.  Each record lists the variables that share the set and then
+    /// the set's value/label pairs.
+    ///
+    /// Records are written in the order in which each set first appears
+    /// among the dictionary's variables, so that the same dictionary always
+    /// produces the same output.
+    fn write_value_labels(&mut self) -> Result<(), Error> {
+        // Collect identical sets of value labels.  `set_index` maps each set
+        // to its position in `sets`; `sets` itself stays in first-appearance
+        // order because the iteration order of a `HashMap` is unspecified.
+        let mut set_index = HashMap::<&ValueLabels, usize>::new();
+        let mut sets = Vec::<(&ValueLabels, Vec<_>)>::new();
+        for (variable, short_name) in self.dictionary.variables.iter().zip(&self.short_names) {
+            if !variable.value_labels.is_empty() {
+                let index = *set_index.entry(&variable.value_labels).or_insert_with(|| {
+                    sets.push((&variable.value_labels, Vec::new()));
+                    sets.len() - 1
+                });
+                sets[index].1.push(short_name);
+            }
+        }
+
+        for (value_labels, variables) in sets {
+            write!(
+                &mut self.writer,
+                "D{}",
+                TrigesimalInt::new(variables.len() as i64)
+            )?;
+            for variable in variables {
+                write_string(&mut self.writer, variable)?;
+            }
+
+            write!(
+                &mut self.writer,
+                "{}",
+                TrigesimalInt::new(value_labels.len() as i64)
+            )?;
+            for (value, label) in value_labels {
+                write_datum(&mut self.writer, value, self.options.precision)?;
+                write_string(&mut self.writer, label.as_bytes())?;
+            }
+        }
+        Ok(())
+    }
+
+    /// If the dictionary has any document lines, writes tag `E`, the number
+    /// of lines, and then each line as a string.
+    fn write_documents(&mut self) -> Result<(), Error> {
+        if !self.dictionary.documents.is_empty() {
+            write!(
+                &mut self.writer,
+                "E{}",
+                TrigesimalInt::new(self.dictionary.documents.len() as i64)
+            )?;
+            for line in &self.dictionary.documents {
+                write_string(&mut self.writer, line.as_bytes())?;
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Writes `datum` to `writer`: a number as a base-30 floating-point field
+/// with the given `precision`, a string as a length-prefixed string.
+fn write_datum<W, T>(mut writer: W, datum: &Datum<T>, precision: Precision) -> Result<(), Error>
+where
+    W: Write,
+    T: RawString,
+{
+    match datum {
+        Datum::String(string) => write_string(writer, string.raw_string_bytes()),
+        Datum::Number(number) => {
+            let field = TrigesimalFloat::new_optional(*number, precision);
+            write!(writer, "{}", field)
+        }
+    }
+}
+
+/// Writes `s` to `writer` as a string field: its length in bytes as a base-30
+/// integer field, then the bytes themselves.
+fn write_string<W, S>(mut writer: W, s: S) -> Result<(), Error>
+where
+    W: Write,
+    S: AsRef<[u8]>,
+{
+    let bytes = s.as_ref();
+    let length = TrigesimalInt::new(bytes.len() as i64);
+    write!(&mut writer, "{}", length)?;
+    writer.write_all(bytes)
+}
+
+/// Returns the character for base-30 digit `trig`: `0` through `9`, then `A`
+/// through `T`.
+///
+/// # Panic
+///
+/// Panics if `trig` is 30 or greater.
+fn trig_to_char(trig: u8) -> char {
+    const DIGITS: &[u8; 30] = b"0123456789ABCDEFGHIJKLMNOPQRST";
+    char::from(DIGITS[usize::from(trig)])
+}
+
+/// Displays an integer in base 30.
+struct TrigesimalInt {
+    /// Value to display.
+    value: i64,
+
+    /// Whether to write `+` in front of a nonnegative value.
+    force_sign: bool,
+
+    /// Whether to write `/` after the digits.
+    add_slash: bool,
+}
+
+impl TrigesimalInt {
+    /// Constructs a display of `value` with no forced sign and with a
+    /// trailing `/`.
+    fn new(value: i64) -> Self {
+        Self {
+            value,
+            force_sign: false,
+            add_slash: true,
+        }
+    }
+}
+
+impl Display for TrigesimalInt {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match (self.value < 0, self.force_sign) {
+            (true, _) => f.write_char('-')?,
+            (false, true) => f.write_char('+')?,
+            (false, false) => (),
+        }
+
+        // Collect the digits least-significant first.  The magnitude of an
+        // `i64` is at most 2**63, which is less than 30**13, so 14 slots are
+        // always enough.
+        let mut digits = [0u8; 14];
+        let mut n_digits = 0;
+        let mut remaining = self.value.unsigned_abs();
+        loop {
+            digits[n_digits] = (remaining % 30) as u8;
+            n_digits += 1;
+            remaining /= 30;
+            if remaining == 0 {
+                break;
+            }
+        }
+
+        // Write them most-significant first.
+        for digit in digits[..n_digits].iter().rev() {
+            f.write_char(trig_to_char(*digit))?;
+        }
+
+        if self.add_slash {
+            f.write_char('/')?;
+        }
+        Ok(())
+    }
+}
+
+/// Displays a floating-point number in base 30.
+struct TrigesimalFloat {
+    /// Value to display.
+    value: f64,
+
+    /// Number of significant digits to keep.
+    precision: Precision,
+}
+
+impl TrigesimalFloat {
+    /// Constructs a display of `value` rounded to `precision`.
+    fn new(value: f64, precision: Precision) -> Self {
+        Self { value, precision }
+    }
+
+    /// Like [TrigesimalFloat::new], except that `None` is represented as
+    /// infinity.
+    fn new_optional(value: Option<f64>, precision: Precision) -> Self {
+        Self {
+            value: value.unwrap_or(f64::INFINITY),
+            precision,
+        }
+    }
+}
+
+impl Display for TrigesimalFloat {
+    /// Formats the value as a base-30 number field.
+    ///
+    /// NaN and infinities are written as `*.`, zero and subnormal values as
+    /// `0/`.  Any other value is written as an optional `-`, digits rounded
+    /// to the requested precision (with a trigesimal point and, for very
+    /// large or small magnitudes, a signed base-30 exponent), and a
+    /// terminating `/`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let (value, negative) = match self.value.classify() {
+            std::num::FpCategory::Nan | std::num::FpCategory::Infinite => {
+                return write!(f, "*.");
+            }
+            std::num::FpCategory::Zero | std::num::FpCategory::Subnormal => {
+                return write!(f, "0/");
+            }
+            std::num::FpCategory::Normal if self.value < 0.0 => (-self.value, true),
+            std::num::FpCategory::Normal => (self.value, false),
+        };
+
+        // Adjust `value` to roughly 30**3, by shifting the trigesimal point left or
+        // right as necessary.  We approximate the base-30 exponent by obtaining the
+        // base-2 exponent, then multiplying by log30(2).  This approximation is
+        // sufficient to ensure that the adjusted value is always in the range
+        // 0...30**6, an invariant of the loop below.
+        let binary_exponent = frexp(value).1;
+
+        // This is floor(log30(2**31)), the minimum number of trigesimal
+        // digits that `i32` can hold.
+        const CHUNK_SIZE: usize = 6;
+
+        // Number of trigesimal places for trigs:
+        //
+        // * trigs[0] has coefficient 30**(trig_places - 1),
+        // * trigs[1] has coefficient 30**(trig_places - 2),
+        // * ...
+        //
+        // In other words, the trigesimal point is just before trigs[0].
+        let trig_places = (binary_exponent * 20_379 / 100_000) + CHUNK_SIZE as i32 / 2;
+        let mut value = value * 30f64.powi(CHUNK_SIZE as i32 - trig_places);
+
+        let mut trigs = SmallVec::<[u8; 32]>::new();
+
+        // Append all the trigs to `trigs`, CHUNK_SIZE at a time.
+        let mut trigs_to_output =
+            (f64::DIGITS * 2).div_ceil(3) as i32 + 1 + (CHUNK_SIZE as i32 / 2);
+        while trigs_to_output > 0 {
+            // The current chunk is just the integer part of `value`, truncated to the
+            // nearest integer.  It fits in `usize`.  Append it in base 30.
+            let mut chunk = value as usize;
+            for _ in 0..CHUNK_SIZE {
+                trigs.push((chunk % 30) as u8);
+                chunk /= 30;
+            }
+            // The digits were pushed least-significant first; put them in
+            // reading order.
+            let len = trigs.len();
+            trigs[len - CHUNK_SIZE..].reverse();
+
+            // Proceed to the next chunk.
+            value = value.fract();
+            if value == 0.0 {
+                break;
+            }
+            value *= 30.0f64.powi(CHUNK_SIZE as i32);
+            trigs_to_output -= CHUNK_SIZE as i32;
+        }
+
+        // Strip leading zeros.
+        let leading_zeros = trigs.iter().take_while(|trig| **trig == 0).count();
+        let trigs = &mut trigs[leading_zeros..];
+        let mut trig_places = trig_places - leading_zeros as i32;
+
+        // Round to the requested precision, which is already expressed in
+        // base-30 digits.
+        let base_30_precision = self.precision.as_base_30_digits() as usize;
+        let trigs: &[u8] = if trigs.len() > base_30_precision {
+            if should_round_up(&trigs[base_30_precision - 1..]) {
+                if try_round_up(&mut trigs[..base_30_precision]) {
+                    &trigs[..base_30_precision]
+                } else {
+                    // Couldn't round up because we ran out of trigs to carry
+                    // into.  Do the carry here instead: the result is a
+                    // single 1 one place further to the left, so the
+                    // trigesimal point has to move along with it.
+                    trig_places += 1;
+                    &[1]
+                }
+            } else {
+                // Round down.
+                &trigs[..base_30_precision]
+            }
+        } else {
+            // No rounding required: fewer digits available than requested.
+            &trigs[..]
+        };
+
+        // Strip trailing zeros, but always keep at least one trig.
+        let trailing_zeros = trigs
+            .iter()
+            .rev()
+            .take_while(|trig| **trig == 0)
+            .count()
+            .min(trigs.len().saturating_sub(1));
+        let trigs = &trigs[..trigs.len() - trailing_zeros];
+
+        if negative {
+            write!(f, "-")?;
+        }
+        if (-1..trigs.len() as i32 + 3).contains(&trig_places) {
+            // Use conventional notation.
+            format_trig_digits(f, trigs, trig_places)?;
+        } else {
+            // Use scientific notation: all the trigs as an integer, followed
+            // by a signed base-30 exponent.
+            format_trig_digits(f, trigs, trigs.len() as i32)?;
+            write!(
+                f,
+                "{}",
+                TrigesimalInt {
+                    value: (trig_places - trigs.len() as i32) as i64,
+                    force_sign: true,
+                    add_slash: false
+                }
+            )?;
+        }
+        f.write_char('/')
+    }
+}
+
+/// Writes `trigs` to `f` as base-30 digits with the trigesimal point placed
+/// `trig_places` digits from the left.
+///
+/// If `trig_places` is negative, the output is the point, then
+/// `-trig_places` zeros, then the digits.  Otherwise the point goes in front
+/// of the digit at index `trig_places`, if there is one, and the output is
+/// padded with zeros on the right if there are fewer than `trig_places`
+/// digits.
+fn format_trig_digits(
+    f: &mut std::fmt::Formatter<'_>,
+    trigs: &[u8],
+    trig_places: i32,
+) -> std::fmt::Result {
+    let mut digits = trigs.iter().map(|trig| trig_to_char(*trig));
+
+    if trig_places < 0 {
+        f.write_char('.')?;
+        (trig_places..0).try_for_each(|_| f.write_char('0'))?;
+        return digits.try_for_each(|digit| f.write_char(digit));
+    }
+
+    for (position, digit) in (0i32..).zip(digits) {
+        if position == trig_places {
+            f.write_char('.')?;
+        }
+        f.write_char(digit)?;
+    }
+
+    // Pad with zeros if the point lies beyond the last digit.
+    let padding = trig_places - trigs.len() as i32;
+    for _ in 0..padding {
+        f.write_char('0')?;
+    }
+    Ok(())
+}
+
+/// Decides whether the digits in `trigs[1..]`, which are about to be dropped,
+/// should round `trigs[0]` up.
+///
+/// Returns true if `trigs[1..]` represents more than half a unit of
+/// `trigs[0]`, false if it represents less.  If it is exactly half, returns
+/// true if `trigs[0]` is odd and false if it is even ("round to even").
+///
+/// # Panic
+///
+/// Panics if `trigs` has fewer than two elements.
+fn should_round_up(trigs: &[u8]) -> bool {
+    let first_dropped = trigs[1];
+    if first_dropped != 15 {
+        // Clearly more or clearly less than half.
+        return first_dropped > 15;
+    }
+
+    // About half: anything nonzero further along tips it over; otherwise it
+    // is exactly half and the last kept digit breaks the tie.
+    let beyond_half = trigs[2..].iter().any(|trig| *trig != 0);
+    beyond_half || trigs[0] % 2 != 0
+}
+
+/// Adds one to the number whose base-30 digits are `trigs`, in place.
+///
+/// Returns true on success.  Returns false if the carry ran off the left end,
+/// in which case every digit has been set to zero.
+fn try_round_up(trigs: &mut [u8]) -> bool {
+    // The rightmost digit that is not 29 absorbs the carry; every digit to
+    // its right is a 29 that wraps around to 0.
+    match trigs.iter().rposition(|trig| *trig != 29) {
+        Some(index) => {
+            trigs[index] += 1;
+            trigs[index + 1..].fill(0);
+            true
+        }
+        None => {
+            // Ran out of trigs to carry into.
+            trigs.fill(0);
+            false
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use core::f64;
+    use std::borrow::Cow;
+
+    use encoding_rs::{UTF_8, WINDOWS_1252};
+    use indexmap::set::MutableValues;
+    use itertools::{zip_eq, Itertools};
+
+    use crate::{
+        data::{ByteString, Datum, RawString},
+        dictionary::Dictionary,
+        identifier::Identifier,
+        por::{
+            write::{write_case, DictionaryWriter, Precision, TrigesimalFloat, TrigesimalInt},
+            WriteOptions,
+        },
+        variable::{MissingValueRange, MissingValues, VarWidth, Variable},
+    };
+
+    #[test]
+    fn format_int() {
+        #[track_caller]
+        fn check(value: i64, force_sign: bool, expected: &str) {
+            let s = TrigesimalInt {
+                value,
+                force_sign,
+                add_slash: false,
+            };
+            assert_eq!(&s.to_string(), expected);
+        }
+        check(0, false, "0");
+        check(0, true, "+0");
+        check(1, false, "1");
+        check(2, false, "2");
+        check(10, false, "A");
+        check(29, false, "T");
+        check(123456789, false, "52CE69");
+        check(1, true, "+1");
+        check(2, true, "+2");
+        check(10, true, "+A");
+        check(29, true, "+T");
+        check(123456789, true, "+52CE69");
+        check(-1, false, "-1");
+        check(-2, false, "-2");
+        check(-10, false, "-A");
+        check(-29, false, "-T");
+        check(-123456789, false, "-52CE69");
+        check(-1, true, "-1");
+        check(-2, true, "-2");
+        check(-10, true, "-A");
+        check(-29, true, "-T");
+        check(-123456789, true, "-52CE69");
+    }
+
+    #[test]
+    fn format_float() {
+        #[track_caller]
+        fn check(value: f64, precision: Precision, expected: &str) {
+            let s = TrigesimalFloat { value, precision };
+            assert_eq!(&s.to_string(), expected);
+        }
+
+        fn p(base_30_digits: u32) -> Precision {
+            Precision::from_base_30_digits(base_30_digits)
+        }
+
+        check(0.0, p(10), "0/");
+        check(-0.0, p(10), "0/");
+        check(1.0, p(10), "1/");
+        check(f64::INFINITY, p(10), "*.");
+        check(f64::MIN_POSITIVE / 2.0, p(10), "0/");
+        check(0.5, p(10), ".F/");
+        check(1234.5, p(10), "1B4.F/");
+        check(0.123456789, p(9), ".3L39TT5CR/");
+        check(0.123456789, p(8), ".3L39TT5D/");
+        check(0.123456789, p(7), ".3L39TT5/");
+        check(0.123456789, p(6), ".3L39TT/");
+        check(0.123456789, p(4), ".3L3A/");
+        check(0.123456789, p(3), ".3L3/");
+        check(0.123456789, p(2), ".3L/");
+        check(0.123456789, p(1), ".4/");
+        check(-0.123456789, p(9), "-.3L39TT5CR/");
+        check(-0.123456789, p(8), "-.3L39TT5D/");
+        check(-0.123456789, p(7), "-.3L39TT5/");
+        check(-0.123456789, p(6), "-.3L39TT/");
+        check(-0.123456789, p(4), "-.3L3A/");
+        check(-0.123456789, p(3), "-.3L3/");
+        check(-0.123456789, p(2), "-.3L/");
+        check(-0.123456789, p(1), "-.4/");
+        check(123456789.123456789, p(10), "52CE69.3L3A/");
+        check(123456789.123456789, p(9), "52CE69.3L3/");
+        check(123456789.123456789, p(8), "52CE69.3L/");
+        check(123456789.123456789, p(7), "52CE69.4/");
+        check(123456789.123456789, p(6), "52CE69/");
+        check(123456789.123456789, p(5), "52CE60/");
+        check(123456789.123456789, p(4), "52CE00/");
+        check(123456789.123456789, p(3), "52C+3/");
+        check(123456789.123456789, p(2), "52+4/");
+        check(123456789.123456789, p(1), "5+5/");
+        check(0.00000000987654, p(2), "76-7/");
+    }
+
+    #[test]
+    fn header() {
+        let dictionary = Dictionary::new(UTF_8);
+        let mut output = Vec::new();
+        let options = WriteOptions::reproducible();
+        let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary);
+        writer.write_header().unwrap();
+        assert_eq!(output.len(), 200 + 256 + 8);
+        assert_eq!(
+            &output[..200],
+            b"ASCII SPSS PORT FILE                    \
+ASCII SPSS PORT FILE                    \
+ASCII SPSS PORT FILE                    \
+ASCII SPSS PORT FILE                    \
+ASCII SPSS PORT FILE                    "
+        );
+        assert_eq!(&output[200 + 256..], b"SPSSPORT");
+        assert_eq!(
+            &output[200..200 + 64],
+            b"0000000000000000000000000000000000000000000000000000000000000000"
+        );
+        assert_eq!(
+            &output[200 + 64..200 + 128],
+            b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ."
+        );
+        assert_eq!(&output[200 + 128..200 + 192], b"<(+|&[]!$*);^-/|,%_>?`:#@'=\"00\xb10\xb0\x86~\x960000\xb9\xb2\xb3456789000\x97()0{}\\\xa2\x95000");
+        assert_eq!(
+            &output[200 + 192..200 + 256],
+            b"0000000000000000000000000000000000000000000000000000000000000000"
+        );
+    }
+
+    #[test]
+    fn version() {
+        let dictionary = Dictionary::new(UTF_8);
+        let mut output = Vec::new();
+        let options = WriteOptions::reproducible();
+        let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary);
+        writer.write_version().unwrap();
+        assert_eq!(&String::from_utf8(output).unwrap(), "A8/202507306/150755");
+    }
+
+    #[test]
+    fn identification() {
+        let dictionary = Dictionary::new(UTF_8);
+        let mut output = Vec::new();
+        let options = WriteOptions::reproducible()
+            .with_product_ext(Cow::from("Extra product"))
+            .with_author(String::from("Author"));
+        let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary);
+        writer.write_identification().unwrap();
+        assert_eq!(
+            &String::from_utf8(output).unwrap(),
+            "1J/PSPP TEST DATA FILE2D/Extra product36/Author"
+        );
+    }
+
+    #[test]
+    fn precision() {
+        let dictionary = Dictionary::new(UTF_8);
+        let mut output = Vec::new();
+        let options =
+            WriteOptions::reproducible().with_precision(Precision::from_base_30_digits(3));
+        let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary);
+        writer.write_precision().unwrap();
+        assert_eq!(&String::from_utf8(output).unwrap(), "53/");
+    }
+
+    #[test]
+    fn variables() {
+        {
+            let mut dictionary = Dictionary::new(UTF_8);
+            for (index, width) in [VarWidth::Numeric, VarWidth::String(1), VarWidth::String(15)]
+                .iter()
+                .enumerate()
+            {
+                dictionary
+                    .add_var(Variable::new(
+                        Identifier::new(format!("v{index}")).unwrap(),
+                        *width,
+                        UTF_8,
+                    ))
+                    .unwrap();
+            }
+            dictionary.variables.get_index_mut2(1).unwrap().label =
+                Some(String::from("Variable label."));
+            dictionary.set_weight(Some(0)).unwrap();
+
+            let mut output = Vec::new();
+            let options = WriteOptions::reproducible();
+            let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary);
+            writer.write_variable_count().unwrap();
+            writer.write_case_weight().unwrap();
+            writer.write_variables().unwrap();
+
+            assert_eq!(
+                &String::from_utf8(output).unwrap(),
+                "43/\
+62/V0\
+70/2/V05/8/2/5/8/2/\
+71/2/V11/1/0/1/1/0/\
+CF/Variable label.\
+7F/2/V21/F/0/1/F/0/\
+"
+            );
+        }
+    }
+
+    #[test]
+    fn missing_values() {
+        {
+            let mut dictionary = Dictionary::new(UTF_8);
+            let variables = [
+                (VarWidth::Numeric, vec![Datum::Number(Some(0.0))], None),
+                (
+                    VarWidth::Numeric,
+                    vec![Datum::Number(Some(0.0)), Datum::Number(Some(1.0))],
+                    None,
+                ),
+                (
+                    VarWidth::Numeric,
+                    vec![
+                        Datum::Number(Some(0.0)),
+                        Datum::Number(Some(1.0)),
+                        Datum::Number(Some(2.0)),
+                    ],
+                    None,
+                ),
+                (
+                    VarWidth::Numeric,
+                    vec![Datum::Number(Some(0.0))],
+                    Some(MissingValueRange::new(1.0, 2.0)),
+                ),
+                (
+                    VarWidth::Numeric,
+                    Vec::new(),
+                    Some(MissingValueRange::new(1.0, 2.0)),
+                ),
+                (
+                    VarWidth::Numeric,
+                    vec![Datum::Number(Some(0.0))],
+                    Some(MissingValueRange::From { low: 1.0 }),
+                ),
+                (
+                    VarWidth::Numeric,
+                    Vec::new(),
+                    Some(MissingValueRange::From { low: 1.0 }),
+                ),
+                (
+                    VarWidth::Numeric,
+                    vec![Datum::Number(Some(0.0))],
+                    Some(MissingValueRange::To { high: 1.0 }),
+                ),
+                (
+                    VarWidth::Numeric,
+                    Vec::new(),
+                    Some(MissingValueRange::To { high: 1.0 }),
+                ),
+                (
+                    VarWidth::String(8),
+                    vec![Datum::String(
+                        ByteString::from("abcdefgh").with_encoding(WINDOWS_1252),
+                    )],
+                    None,
+                ),
+                (
+                    VarWidth::String(8),
+                    vec![
+                        Datum::String(ByteString::from("abcdefgh").with_encoding(WINDOWS_1252)),
+                        Datum::String(ByteString::from("ijklmnop").with_encoding(WINDOWS_1252)),
+                    ],
+                    None,
+                ),
+                (
+                    VarWidth::String(8),
+                    vec![
+                        Datum::String(ByteString::from("abcdefgh").with_encoding(WINDOWS_1252)),
+                        Datum::String(ByteString::from("ijklmnop").with_encoding(WINDOWS_1252)),
+                        Datum::String(ByteString::from("qrstuvwx").with_encoding(WINDOWS_1252)),
+                    ],
+                    None,
+                ),
+            ];
+            for (index, (width, values, range)) in variables.into_iter().enumerate() {
+                let mut variable =
+                    Variable::new(Identifier::new(format!("v{index}")).unwrap(), width, UTF_8);
+                variable
+                    .missing_values_mut()
+                    .replace(MissingValues::new(values, range).unwrap())
+                    .unwrap();
+                dictionary.add_var(variable).unwrap();
+            }
+
+            let mut output = Vec::new();
+            let options = WriteOptions::reproducible();
+            let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary);
+            writer.write_variable_count().unwrap();
+            writer.write_case_weight().unwrap();
+            writer.write_variables().unwrap();
+
+            assert_eq!(
+                &String::from_utf8(output).unwrap(),
+                "4C/\
+70/2/V05/8/2/5/8/2/\
+80/\
+70/2/V15/8/2/5/8/2/\
+80/\
+81/\
+70/2/V25/8/2/5/8/2/\
+80/\
+81/\
+82/\
+70/2/V35/8/2/5/8/2/\
+B1/2/\
+80/\
+70/2/V45/8/2/5/8/2/\
+B1/2/\
+70/2/V55/8/2/5/8/2/\
+A1/\
+80/\
+70/2/V65/8/2/5/8/2/\
+A1/\
+70/2/V75/8/2/5/8/2/\
+91/\
+80/\
+70/2/V85/8/2/5/8/2/\
+91/\
+78/2/V91/8/0/1/8/0/\
+88/abcdefgh\
+78/3/V101/8/0/1/8/0/\
+88/abcdefgh\
+88/ijklmnop\
+78/3/V111/8/0/1/8/0/\
+88/abcdefgh\
+88/ijklmnop\
+88/qrstuvwx\
+"
+            );
+        }
+    }
+
+    #[test]
+    fn value_labels() {
+        let variables = [
+            (VarWidth::Numeric, vec![(Datum::Number(Some(1.0)), "One")]),
+            (
+                VarWidth::Numeric,
+                vec![
+                    (Datum::Number(Some(1.0)), "One"),
+                    (Datum::Number(Some(2.0)), "Two"),
+                ],
+            ),
+            (
+                VarWidth::Numeric,
+                vec![
+                    (Datum::Number(Some(1.0)), "One"),
+                    (Datum::Number(Some(2.0)), "Two"),
+                ],
+            ),
+            (
+                VarWidth::String(4),
+                vec![(Datum::String(ByteString::from("abcd")), "One")],
+            ),
+            (
+                VarWidth::String(8),
+                vec![(
+                    Datum::String(ByteString::from("abcdefgh")),
+                    "Longer value label",
+                )],
+            ),
+            (
+                VarWidth::String(9),
+                vec![(
+                    Datum::String(ByteString::from("abcdefghi")),
+                    "value label for 9-byte value",
+                )],
+            ),
+            (
+                VarWidth::String(300),
+                vec![(
+                    Datum::String(ByteString::from(vec![b'x'; 300])),
+                    "value label for 300-byte value",
+                )],
+            ),
+        ];
+
+        let mut dictionary = Dictionary::new(UTF_8);
+        for (index, (width, value_labels)) in variables.iter().enumerate() {
+            let mut variable = Variable::new(
+                Identifier::new(format!("var{index}")).unwrap(),
+                *width,
+                UTF_8,
+            );
+            for (value, label) in value_labels {
+                assert_eq!(variable.value_labels.insert(value.clone(), *label), None);
+            }
+            dictionary.add_var(variable).unwrap();
+        }
+        dbg!(&dictionary);
+
+        let mut output = Vec::new();
+        let options = WriteOptions::reproducible();
+        let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary);
+        writer.write_value_labels().unwrap();
+
+        let output = String::from_utf8(output).unwrap();
+        println!("{output}");
+
+        let mut output = output
+            .split("D")
+            .filter(|s| !s.is_empty())
+            .collect::<Vec<_>>();
+        output.sort();
+
+        let expected = [
+   ("1/4/VAR01/", vec!["1/3/One"]),
+    ("1/4/VAR31/", vec!["4/abcd3/One"]),
+    ("1/4/VAR41/", vec!["8/abcdefghI/Longer value label"]),
+    ("1/4/VAR51/", vec!["9/abcdefghiS/value label for 9-byte value"]),
+    ("1/4/VAR61/", vec!["A0/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx10/value label for 300-byte value"]),
+    ("2/4/VAR14/VAR22/", vec!["1/3/One", "2/3/Two"]),
+        ];
+
+        for (actual, (exp_prefix, exp_suffixes)) in zip_eq(output, expected) {
+            if !exp_suffixes
+                .iter()
+                .permutations(exp_suffixes.len())
+                .any(|exp_suffixes| {
+                    actual
+                        == std::iter::once(exp_prefix)
+                            .chain(exp_suffixes.into_iter().map(|s| *s))
+                            .collect::<String>()
+                })
+            {
+                panic!(
+                    "{actual:?} != {exp_prefix:?} followed by any permutation of {exp_suffixes:?}"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn documents() {
+        let mut dictionary = Dictionary::new(UTF_8);
+        dictionary.documents = vec![
+            String::from("First document line."),
+            String::from("Second document line."),
+        ];
+
+        let mut output = Vec::new();
+        let options = WriteOptions::reproducible();
+        let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary);
+        writer.write_documents().unwrap();
+
+        assert_eq!(
+            &String::from_utf8(output).unwrap(),
+            "E2/\
+K/First document line.\
+L/Second document line."
+        );
+    }
+
+    #[test]
+    fn cases() {
+        let mut output = Vec::new();
+        write_case(
+            &mut output,
+            [
+                Datum::Number(Some(0.0)),
+                Datum::Number(Some(1.0)),
+                Datum::Number(None),
+                Datum::String(ByteString::from("abcdefghi")),
+            ],
+            Precision::default(),
+        )
+        .unwrap();
+        assert_eq!(&String::from_utf8(output).unwrap(), "0/1/*.9/abcdefghi");
+    }
+}
index 9e699d623e80788ce9fe29ecf2caf1c4ed701f60..699425e85cefdde2e3cc3a0c9e5089c4c3d54c2d 100644 (file)
@@ -1,24 +1,25 @@
-/* PSPP - a program for statistical analysis.
- * Copyright (C) 2023 Free Software Foundation, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program.  If not, see <http://www.gnu.org/licenses/>.
 
 use crate::parse_encoding;
 use anyhow::{anyhow, Result};
 use clap::{Args, ValueEnum};
 use encoding_rs::Encoding;
 use pspp::{
+    data::cases_to_output,
     output::{
         driver::{Config, Driver},
         pivot::PivotTable,
@@ -293,17 +294,19 @@ impl Show {
                     .into_parts();
                 match &output {
                     Output::Driver { driver, mode: _ } => {
-                        driver
-                            .borrow_mut()
-                            .write(&Arc::new(Item::new(PivotTable::from(&metadata))));
+                        let mut output = Vec::new();
+                        output.push(Item::new(PivotTable::from(&metadata)));
+                        output.extend(
+                            dictionary
+                                .all_pivot_tables()
+                                .into_iter()
+                                .map(|pivot_table| Item::new(pivot_table)),
+                        );
+                        output.extend(cases_to_output(&dictionary, cases));
                         driver
                             .borrow_mut()
                             .write(&Arc::new(Item::new(Details::Group(
-                                dictionary
-                                    .all_pivot_tables()
-                                    .into_iter()
-                                    .map(|pivot_table| Arc::new(Item::new(pivot_table)))
-                                    .collect(),
+                                output.into_iter().map(Arc::new).collect(),
                             ))));
                     }
                     Output::Json { .. } => {
diff --git a/rust/pspp/src/show_por.rs b/rust/pspp/src/show_por.rs
new file mode 100644 (file)
index 0000000..28529e2
--- /dev/null
@@ -0,0 +1,327 @@
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use anyhow::{anyhow, Result};
+use clap::{Args, ValueEnum};
+use pspp::{
+    data::cases_to_output,
+    output::{
+        driver::{Config, Driver},
+        pivot::PivotTable,
+        Details, Item, Text,
+    },
+    por::PortableFile,
+};
+use serde::Serialize;
+use std::{
+    cell::RefCell,
+    ffi::OsStr,
+    fmt::{Display, Write as _},
+    fs::File,
+    io::{stdout, BufReader, Write},
+    path::{Path, PathBuf},
+    rc::Rc,
+    sync::Arc,
+};
+
+/// Show information about SPSS portable files.
+// The `///` comments on this struct and its fields are picked up by clap as
+// command-line help text, so edit them with the `--help` output in mind.
+#[derive(Args, Clone, Debug)]
+pub struct ShowPor {
+    /// What to show.
+    #[arg(value_enum)]
+    mode: Mode,
+
+    /// File to show.
+    #[arg(required = true)]
+    input: PathBuf,
+
+    /// Output file name.  If omitted, output is written to stdout.
+    output: Option<PathBuf>,
+
+    /// Maximum number of cases to read.
+    ///
+    /// If specified without an argument, all cases will be read.
+    // With no `--data` at all the default of 0 reads no cases.  A bare
+    // `--data` uses `default_missing_value`, which is `usize::MAX` on 64-bit
+    // targets.  NOTE(review): that literal does not fit in a 32-bit `usize`;
+    // confirm whether 32-bit targets need to be supported.
+    #[arg(
+        long = "data",
+        num_args = 0..=1,
+        default_missing_value = "18446744073709551615",
+        default_value_t = 0,
+        help_heading = "Input file options"
+    )]
+    max_cases: usize,
+
+    /// Output driver configuration options.
+    #[arg(short = 'o', help_heading = "Output options")]
+    output_options: Vec<String>,
+
+    /// Output format.
+    #[arg(long, short = 'f', help_heading = "Output options")]
+    format: Option<ShowFormat>,
+}
+
+/// Where and how `ShowPor::run` emits its results.
+enum Output {
+    /// Render output items through an output driver.
+    Driver {
+        /// The driver that items are written to.
+        driver: Rc<RefCell<Box<dyn Driver>>>,
+        /// The mode being shown, used in the error that `show_json` reports
+        /// when JSON-only content is requested from a driver.
+        mode: Mode,
+    },
+    /// Serialize values as JSON.
+    Json {
+        /// Destination for the JSON text.
+        writer: Rc<RefCell<Box<dyn Write>>>,
+        /// Whether to pretty-print instead of writing compact JSON.
+        pretty: bool,
+    },
+    /// Produce no output at all.
+    Discard,
+}
+
+impl Output {
+    fn show_json<T>(&self, value: &T) -> Result<()>
+    where
+        T: Serialize,
+    {
+        match self {
+            Self::Driver { mode, driver: _ } => {
+                Err(anyhow!("Mode '{mode}' only supports output as JSON."))
+            }
+            Self::Json { writer, pretty } => {
+                let mut writer = writer.borrow_mut();
+                match pretty {
+                    true => serde_json::to_writer_pretty(&mut *writer, value)?,
+                    false => serde_json::to_writer(&mut *writer, value)?,
+                };
+                writeln!(writer)?;
+                Ok(())
+            }
+            Self::Discard => Ok(()),
+        }
+    }
+
+    fn warn(&self, warning: &impl Display) {
+        match self {
+            Output::Driver { driver, .. } => {
+                driver
+                    .borrow_mut()
+                    .write(&Arc::new(Item::from(Text::new_log(warning.to_string()))));
+            }
+            Output::Json { .. } => {
+                #[derive(Serialize)]
+                struct Warning {
+                    warning: String,
+                }
+                let warning = Warning {
+                    warning: warning.to_string(),
+                };
+                let _ = self.show_json(&warning);
+            }
+            Self::Discard => (),
+        }
+    }
+}
+
+impl ShowPor {
+    pub fn run(self) -> Result<()> {
+        let format = if let Some(format) = self.format {
+            format
+        } else if let Some(output_file) = &self.output {
+            match output_file
+                .extension()
+                .unwrap_or(OsStr::new(""))
+                .to_str()
+                .unwrap_or("")
+            {
+                "json" => ShowFormat::Json,
+                "ndjson" => ShowFormat::Ndjson,
+                _ => ShowFormat::Output,
+            }
+        } else {
+            ShowFormat::Json
+        };
+
+        let output = match format {
+            ShowFormat::Output => {
+                let mut config = String::new();
+
+                if let Some(file) = &self.output {
+                    #[derive(Serialize)]
+                    struct File<'a> {
+                        file: &'a Path,
+                    }
+                    let file = File {
+                        file: file.as_path(),
+                    };
+                    let toml_file = toml::to_string_pretty(&file).unwrap();
+                    config.push_str(&toml_file);
+                }
+                for option in &self.output_options {
+                    writeln!(&mut config, "{option}").unwrap();
+                }
+
+                let table: toml::Table = toml::from_str(&config)?;
+                if !table.contains_key("driver") {
+                    let driver = if let Some(file) = &self.output {
+                        <dyn Driver>::driver_type_from_filename(file).ok_or_else(|| {
+                            anyhow!("{}: no default output format for file name", file.display())
+                        })?
+                    } else {
+                        "text"
+                    };
+
+                    #[derive(Serialize)]
+                    struct DriverConfig {
+                        driver: &'static str,
+                    }
+                    config.insert_str(
+                        0,
+                        &toml::to_string_pretty(&DriverConfig { driver }).unwrap(),
+                    );
+                }
+
+                let config: Config = toml::from_str(&config)?;
+                Output::Driver {
+                    mode: self.mode,
+                    driver: Rc::new(RefCell::new(Box::new(<dyn Driver>::new(&config)?))),
+                }
+            }
+            ShowFormat::Json | ShowFormat::Ndjson => Output::Json {
+                pretty: format == ShowFormat::Json,
+                writer: if let Some(output_file) = &self.output {
+                    Rc::new(RefCell::new(Box::new(File::create(output_file)?)))
+                } else {
+                    Rc::new(RefCell::new(Box::new(stdout())))
+                },
+            },
+            ShowFormat::Discard => Output::Discard,
+        };
+
+        let reader = BufReader::new(File::open(&self.input)?);
+        match self.mode {
+            Mode::Dictionary => {
+                let PortableFile {
+                    dictionary,
+                    metadata: _,
+                    cases,
+                } = PortableFile::open(reader, |warning| output.warn(&warning))?;
+                let cases = cases.take(self.max_cases);
+
+                match &output {
+                    Output::Driver { driver, mode: _ } => {
+                        let mut output = Vec::new();
+                        output.extend(
+                            dictionary
+                                .all_pivot_tables()
+                                .into_iter()
+                                .map(|pivot_table| Item::new(pivot_table)),
+                        );
+                        output.extend(cases_to_output(&dictionary, cases));
+                        driver
+                            .borrow_mut()
+                            .write(&Arc::new(Item::new(Details::Group(
+                                output.into_iter().map(Arc::new).collect(),
+                            ))));
+                    }
+                    Output::Json { .. } => {
+                        output.show_json(&dictionary)?;
+                        for (_index, case) in (0..self.max_cases).zip(cases) {
+                            output.show_json(&case?)?;
+                        }
+                    }
+                    Output::Discard => (),
+                }
+            }
+            Mode::Metadata => {
+                let metadata =
+                    PortableFile::open(reader, |warning| output.warn(&warning))?.metadata;
+
+                match &output {
+                    Output::Driver { driver, mode: _ } => {
+                        driver
+                            .borrow_mut()
+                            .write(&Arc::new(Item::new(PivotTable::from(&metadata))));
+                    }
+                    Output::Json { .. } => {
+                        output.show_json(&metadata)?;
+                    }
+                    Output::Discard => (),
+                }
+            }
+            Mode::Histogram => {
+                let (histogram, translations) = PortableFile::read_histogram(reader)?;
+                let h = histogram
+                    .into_iter()
+                    .enumerate()
+                    .filter_map(|(index, count)| {
+                        if count > 0
+                            && index != translations[index as u8] as usize
+                            && translations[index as u8] != 0
+                        {
+                            Some((
+                                format!("{index:02x}"),
+                                translations[index as u8] as char,
+                                count,
+                            ))
+                        } else {
+                            None
+                        }
+                    })
+                    .collect::<Vec<_>>();
+                output.show_json(&h)?;
+            }
+        }
+        Ok(())
+    }
+}
+
+/// What to show in a portable file.
+#[derive(Clone, Copy, Debug, Default, PartialEq, ValueEnum)]
+enum Mode {
+    /// File dictionary, with variables, value labels, ...
+    #[default]
+    #[value(alias = "dict")]
+    Dictionary,
+
+    /// File metadata not included in the dictionary.
+    Metadata,
+
+    /// Histogram of character incidence in the file.
+    Histogram,
+}
+
+impl Mode {
+    fn as_str(&self) -> &'static str {
+        match self {
+            Mode::Dictionary => "dictionary",
+            Mode::Metadata => "metadata",
+            Mode::Histogram => "histogram",
+        }
+    }
+}
+
+impl Display for Mode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.as_str())
+    }
+}
+
+// How `ShowPor::run` presents its results.  When the user does not choose a
+// format, `run` infers one from the output file's extension and otherwise
+// uses `Json`.  The `///` comments on the variants are command-line help.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, ValueEnum)]
+#[serde(rename_all = "snake_case")]
+enum ShowFormat {
+    /// Pretty-printed JSON.
+    #[default]
+    Json,
+    /// Newline-delimited JSON.
+    Ndjson,
+    /// Pivot tables.
+    Output,
+    /// No output.
+    Discard,
+}
index 1746781d69241a3d7661b06085b8fcde7981e396..0f405fb4682a30af862a709fcc891563c90bf7b1 100644 (file)
 //! Reading and writing system files.
 //!
 //! This module enables reading and writing "system files", the binary format
-//! for SPSS data files.  The system file format dates back 40+ years and has
+//! for SPSS data files.  The [system file format] dates back 40+ years and has
 //! evolved greatly over that time to support new features, but in a way to
 //! facilitate interchange between even the oldest and newest versions of
 //! software.
 //!
 //! Use [ReadOptions] to read a system file in the simplest way.
 //! Use [WriteOptions] to write a system file.
+//!
+//! [system file format]: https://pspp.benpfaff.org/manual/system-file.html
 
 // Warn about missing docs, but not for items declared with `#[cfg(test)]`.
 #![cfg_attr(not(test), warn(missing_docs))]
index 5702522ec978270fceda22a45d7b402e611561f6..78440d801c35931b4290e34002fa426cc7d9a914 100644 (file)
@@ -24,7 +24,6 @@ use std::{
 };
 
 use crate::{
-    calendar::date_time_to_pspp,
     crypto::EncryptedFile,
     data::{ByteString, Case, Datum, MutRawString, RawString},
     dictionary::{
@@ -579,7 +578,7 @@ impl<F> ReadOptions<F> {
     }
 }
 
-/// The content of an SPSS system file.
+/// An SPSS system file read with [ReadOptions].
 #[derive(Debug)]
 pub struct SystemFile {
     /// The system file dictionary.
@@ -1426,10 +1425,7 @@ impl Metadata {
         let mut values = Vec::new();
 
         group.push("Created");
-        values.push(Value::new_number_with_format(
-            Some(date_time_to_pspp(self.creation)),
-            Format::DATETIME40_0,
-        ));
+        values.push(Value::new_date_time(self.creation));
 
         let mut product = Group::new("Writer");
         product.push("Product");
index 2fe84f2423a901a4b57bb207fc550a96bdcac581..60f4b2cfceb48f90062470453123ed3cedcf2b1e 100644 (file)
@@ -1030,7 +1030,7 @@ where
     ///
     /// # Panic
     ///
-    /// Attempts to write more cases after calling this function may will panic.
+    /// Attempts to write more cases after calling this function will panic.
     pub fn try_finish(&mut self) -> Result<Option<W>, BinError> {
         let Some(inner) = self.inner.take() else {
             return Ok(None);
index 3c182b35b52069f0b1602f027881fc506e70b71e..3b4f72741e76523406d5f1b4130db529fb1a4b2d 100644 (file)
@@ -24,6 +24,7 @@ use std::{
     str::FromStr,
 };
 
+use displaydoc::Display;
 use encoding_rs::{Encoding, UTF_8};
 use hashbrown::HashMap;
 use indexmap::Equivalent;
@@ -47,6 +48,9 @@ pub enum VarType {
     Numeric,
 
     /// A string variable.
+    ///
+    /// The string width is unspecified; use [VarWidth] for type and width
+    /// together.
     String,
 }
 
@@ -78,11 +82,22 @@ impl Display for VarType {
     }
 }
 
-/// [VarType], plus a width for [VarType::String].
+/// A variable's width.
+///
+/// This is essentially [VarType] plus a width for [VarType::String].
 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)]
 pub enum VarWidth {
+    /// A numeric variable.
     Numeric,
-    String(u16), // XXX change to NonZeroU16, or to 1..=32767 range type
+
+    /// A string variable.
+    String(
+        /// The width of the string variable.
+        ///
+        /// Must be in `1..=32767`, although the type system does not yet
+        /// enforce this.
+        u16,
+    ), // XXX change to NonZeroU16, or to 1..=32767 range type
 }
 
 impl VarWidth {
@@ -601,7 +616,11 @@ impl ValueLabels {
     }
 
     pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
+        self.len() == 0
+    }
+
+    /// Returns the number of value labels.
+    pub fn len(&self) -> usize {
+        self.0.len()
     }
 
     pub fn get<T>(&self, value: &Datum<T>) -> Option<&str>
@@ -674,6 +693,16 @@ impl Hash for ValueLabels {
     }
 }
 
+impl<'a> IntoIterator for &'a ValueLabels {
+    type Item = (&'a Datum<ByteString>, &'a String);
+
+    type IntoIter = hashbrown::hash_map::Iter<'a, Datum<ByteString>, String>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.iter()
+    }
+}
+
 pub struct MissingValuesMut<'a> {
     inner: &'a mut MissingValues,
     width: VarWidth,
@@ -778,11 +807,19 @@ impl Display for MissingValues {
     }
 }
 
-#[derive(Copy, Clone, Debug)]
+/// Invalid missing values.
+#[derive(Display, Copy, Clone, Debug, ThisError)]
 pub enum MissingValuesError {
+    /// Too many missing values.
     TooMany,
+
+    /// Missing values too wide (missing values may be no wider than 8 bytes).
     TooWide,
+
+    /// Missing values must be all string or all numeric.
     MixedTypes,
+
+    /// The system-missing value may not be a user-missing value.
     SystemMissing,
 }