From: Ben Pfaff Date: Wed, 17 Sep 2025 15:17:38 +0000 (-0700) Subject: rust: Add support for portable files. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fc58c3113fd726880fc139eeb5235af8a8272c2b;p=pspp rust: Add support for portable files. --- diff --git a/rust/Cargo.lock b/rust/Cargo.lock index b98c7354b2..a008866877 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -403,6 +403,15 @@ dependencies = [ "digest", ] +[[package]] +name = "codepage-437" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e40c1169585d8d08e5675a39f2fc056cd19a258fc4cba5e3bbf4a9c1026de535" +dependencies = [ + "csv", +] + [[package]] name = "color" version = "0.2.4" @@ -1626,10 +1635,12 @@ dependencies = [ "chrono", "clap", "cmac", + "codepage-437", "color", "csv", "derive_more", "diff", + "displaydoc", "either", "encoding_rs", "enum-iterator", diff --git a/rust/doc/src/SUMMARY.md b/rust/doc/src/SUMMARY.md index 5fde2b517d..1d1c14d2e6 100644 --- a/rust/doc/src/SUMMARY.md +++ b/rust/doc/src/SUMMARY.md @@ -4,9 +4,10 @@ [License](license.md) - [Running PSPP](invoking/index.md) - - [Converting data](invoking/pspp-convert.md) - - [Inspecting data](invoking/pspp-show.md) - - [Decrypting files](invoking/pspp-decrypt.md) + - [Converting Data](invoking/pspp-convert.md) + - [Inspecting `.sav` Data](invoking/pspp-show.md) + - [Inspecting `.por` Data](invoking/pspp-show-por.md) + - [Decrypting Files](invoking/pspp-decrypt.md) # Language Overview diff --git a/rust/doc/src/commands/export.md b/rust/doc/src/commands/export.md index a704a059a1..4376bdf797 100644 --- a/rust/doc/src/commands/export.md +++ b/rust/doc/src/commands/export.md @@ -15,6 +15,9 @@ EXPORT The `EXPORT` procedure writes the active dataset's dictionary and data to a specified portable file. +> `EXPORT` is obsolete and retained only for compatibility. New +> syntax should use [`SAVE`](save.md) instead. 
+ `UNSELECTED` controls whether cases excluded with [`FILTER`](filter.md) are written to the file. These can be excluded by specifying `DELETE` on the `UNSELECTED` subcommand. diff --git a/rust/doc/src/commands/import.md b/rust/doc/src/commands/import.md index c183f2cdfb..6cd133a9bf 100644 --- a/rust/doc/src/commands/import.md +++ b/rust/doc/src/commands/import.md @@ -13,10 +13,13 @@ The `IMPORT` transformation clears the active dataset dictionary and data and replaces them with a dictionary and data from a system file or portable file. +> `IMPORT` is obsolete and retained only for compatibility with +> existing portable files. New syntax should use [`SAVE`](save.md) to +> write system files instead, and [`GET`](get.md) to read them. + The `FILE` subcommand, which is the only required subcommand, specifies the portable file to be read as a file name string or a [file handle](../language/files/file-handles.md). -[file handle](../language/files/file-handles.md). The `TYPE` subcommand is currently not used. diff --git a/rust/doc/src/invoking/pspp-convert.md b/rust/doc/src/invoking/pspp-convert.md index a1d33a605b..e6c2abd95f 100644 --- a/rust/doc/src/invoking/pspp-convert.md +++ b/rust/doc/src/invoking/pspp-convert.md @@ -7,9 +7,9 @@ another. The basic syntax is: pspp convert [OUTPUT] ``` -which reads an SPSS data file from `` and writes a copy of it -to `[OUTPUT]`. If `[OUTPUT]` is omitted, output is written to the -terminal. +which reads an SPSS system file or portable file from `` and +writes a copy of it to `[OUTPUT]`. If `[OUTPUT]` is omitted, output +is written to the terminal. 
If `[OUTPUT]` is specified, then `pspp convert` tries to guess the output format based on its extension: diff --git a/rust/doc/src/invoking/pspp-show-por.md b/rust/doc/src/invoking/pspp-show-por.md new file mode 100644 index 0000000000..4bd208e8d2 --- /dev/null +++ b/rust/doc/src/invoking/pspp-show-por.md @@ -0,0 +1,118 @@ +# Inspecting `.por` files with `pspp show-por` + +The `pspp show-por` command reads an SPSS "portable file", +which usually has a `.por` extension, and produces a report. The +basic syntax is: + +``` +pspp show-por <MODE> <INPUT> [OUTPUT] +``` + +where `<MODE>` is a mode of operation (see below), `<INPUT>` is the +SPSS portable file to read, and `[OUTPUT]` is the output file name. +If `[OUTPUT]` is omitted, output is written to the terminal. + +> The portable file format is mostly obsolete. The "system file" or +> `.sav` format should be used for writing new data files. Use [`pspp +> show`](pspp-show.md) to inspect `.sav` files. + +The following `<MODE>`s are available: + +* `dictionary`: Outputs the file dictionary in detail, including + variables, value labels, documents, and so on. With `--data`, also + outputs cases from the portable file. + + This can be useful as an alternative to PSPP syntax commands such as + [`DISPLAY DICTIONARY`](../commands/display.md). + + [`pspp convert`](pspp-convert.md) is a better way to convert a + portable file to another format. + +* `metadata`: Outputs portable file metadata not included in the + dictionary: + + - The creation date and time declared inside the file (not in the + file system). + + - The name of the product and subproduct that wrote the file, if + present. + + - The author of the file, if present. This is usually the name of + the organization that licensed the product that wrote the file. + + - The character set [translation table] embedded in the file, as an + array with 256 elements, one for each possible value of a byte in + the file. 
Each array element gives the byte value as a 2-digit + hexadecimal number paired with the translation table's entry for + that byte. Since the file can technically be in any encoding + (although [the corpus] universally uses extended ASCII), the entry + is given as a character interpreted in two character sets: + [windows-1252] and [code page 437], in that order. (If the two + character sets agree on the code point, then it is only given + once.) + + For example, consider a portable file's translation table at + offset 0x9e, which in the [portable character set] is `±`. + Suppose it has value 0xb1, which is `±` in [windows-1252] and `▒` + in [code page 437]. Then that array element would be `["9e", "±", + "▒"]`. + + [translation table]: ../portable.md#translation-table + [the corpus]: ../portable.md#corpus + [portable character set]: ../portable.md#theory + [windows-1252]: https://en.wikipedia.org/wiki/Windows-1252 + [code page 437]: https://en.wikipedia.org/wiki/Code_page_437 + + This command is most useful with some knowledge of the [portable + file format]. + + [portable file format]: ../portable.md + +* `histogram`: Reports on the usage of characters in the portable + file. Produces output in the form of an array for each possible + value of a byte in the file. Each array element gives the byte + value, the byte's character, and the number of times that the byte + appears in the file. A given byte is omitted from the table if it + does not appear in the file at all, or if the translation table + leaves it unmapped. It is also omitted if the byte's character is + the ISO-8859-1 encoding of the byte (for example, if byte 0x41 + represents `A`, which is `A` in [ISO-8859-1]). + + This command is most useful with some knowledge of the [portable + file format]. 
+ + [ISO-8859-1]: https://en.wikipedia.org/wiki/ISO/IEC_8859-1 + +## Options + +The following options affect how `pspp show-por` reads `<INPUT>`: + +* `--data [<MAX_CASES>]` + For mode `dictionary`, this instructs `pspp + show-por` to read cases from the file. If `<MAX_CASES>` is given, + then that sets a limit on the number of cases to read. Without this + option, PSPP will not read any cases. + +The following options affect how `pspp show-por` writes its output: + +* `-f <FORMAT>` + `--format <FORMAT>` + Specifies the format to use for output. `<FORMAT>` may be one of + the following: + + - `json`: JSON using indentation and spaces for easy human + consumption. + - `ndjson`: [Newline-delimited JSON]. + - `output`: Pivot tables with the PSPP output engine. Use `-o` for + additional configuration. + - `discard`: Do not produce any output. + + When these options are not used, the default output format is chosen + based on the `[OUTPUT]` extension. If `[OUTPUT]` is not specified, + then output defaults to JSON. + + [Newline-delimited JSON]: https://github.com/ndjson/ndjson-spec + +* `-o <OUTPUT_OPTIONS>` + Adds `<OUTPUT_OPTIONS>` to the output engine configuration. + diff --git a/rust/doc/src/invoking/pspp-show.md b/rust/doc/src/invoking/pspp-show.md index 2356eca2be..2b49401932 100644 --- a/rust/doc/src/invoking/pspp-show.md +++ b/rust/doc/src/invoking/pspp-show.md @@ -1,7 +1,8 @@ -# Inspecting data files with `pspp show` +# Inspecting `.sav` files with `pspp show` -The `pspp show` command reads an SPSS data file and produces a report. -The basic syntax is: +The `pspp show` command reads an SPSS "system file" or data file, +which usually has a `.sav` extension, and produces a report. The +basic syntax is: ``` pspp show <MODE> <INPUT> [OUTPUT] @@ -43,6 +44,9 @@ The following `<MODE>`s are available: investigating cases of system file corruption, especially when the character encoding is unknown or uncertain. + This command is most useful with some knowledge of the [system file + format](../system-file.md). 
+ * `decoded`: Outputs the raw structure of the system file dictionary and (with `--data`) cases. Versus `raw`, this command does decode the dictionary and data with a particular character encoding, which @@ -51,6 +55,9 @@ The following ``s are available: This is useful for debugging how PSPP reads system files and for investigating cases of system file corruption. + This command is most useful with some knowledge of the [system file + format](../system-file.md). + ## Options The following options affect how `pspp show` reads ``: diff --git a/rust/doc/src/portable.md b/rust/doc/src/portable.md index 62893c59cc..33f4f4a573 100644 --- a/rust/doc/src/portable.md +++ b/rust/doc/src/portable.md @@ -1,20 +1,67 @@ # Portable File Format These days, most computers use the same internal data formats for -integer and floating-point data, if one ignores little differences like -big- versus little-endian byte ordering. However, occasionally it is -necessary to exchange data between systems with incompatible data -formats. This is what portable files are designed to do. +integer and floating-point data, if one ignores little differences +like big- versus little-endian byte ordering. This has not always +been true, particularly in the 1960s or 1970s, when the portable file +format originated as a way to exchange data between systems with +incompatible data formats. + +At the time, even bytes being 8 bits each was not a given. For that +reason, the portable file format is a text format, because text files +could be exchanged portably among systems slightly more freely. On +the other hand, character encoding was not standardized, so exchanging +data in portable file format required recoding it from the origin +system's character encoding to the destination's. + +Some contemporary systems represented text files as sequences of +fixed-length (typically 80-byte) records, without new-line sequences. 
+These operating systems padded shorter lines with spaces and +truncated longer lines. To tolerate files copied from such systems, +which might drop spaces at the ends of lines, the portable file format +treats lines less than 80 bytes long as padded with spaces to that +length. + +The portable file format self-identifies the character encoding on the +system that produced it at the very beginning, in the +[header](#splash-strings). Since portable files are normally +recoded when they are transported from one system to another, this +identification can be wrong on its face: a file that was started in +EBCDIC, and is then recoded to ASCII, will still say `EBCDIC SPSS PORT +FILE` at the beginning, just in ASCII instead of EBCDIC. + +The portable file header also contains a table of all of the +characters that it supports. Readers use this to translate each byte +of the file into its local encoding. Like the rest of the portable +file, the character table is recoded when the file is moved to a +system with a different character set so that it remains correct, or +at least consistent with the rest of the file. The portable file format is mostly obsolete. [System files](system-file.md) are a better alternative. -> This information is gleaned from examination of ASCII-formatted -portable files only, so some of it may be incorrect for portable files -formatted in EBCDIC or other character sets. - +## Sources + +The information in this chapter is drawn from documentation and source +code, including: + +* `pff.tar.Z`, a Fortran program from the 1980s that reads and writes + portable files. This program contains translation tables from the + portable character set to EBCDIC and to ASCII. + +* A document, now lost, that describes portable + file syntax. + +It is further informed by a corpus of about 1,400 +portable files. 
The plausible creation dates in the corpus range from +1986 to 2025, in addition to 131 files with alleged creation dates +between 1900 and 1907 and 21 files with an invalid creation date. + +[document]: #document +[corpus]: #corpus + ## Portable File Characters Portable files are arranged as a series of lines of 80 characters each. @@ -44,7 +91,7 @@ contents. Every portable file consists of the following records, in sequence: -- File header. +- Splash strings. - Version and date info. @@ -100,114 +147,431 @@ they may not contain a fraction. String fields take the form of a integer field having value N, followed by exactly N characters, which are the string content. -## Portable File Header - -Every portable file begins with a 464-byte header, consisting of a -200-byte collection of vanity splash strings, followed by a 256-byte -character set translation table, followed by an 8-byte tag string. - -The 200-byte segment is divided into five 40-byte sections, each of -which represents the string `CHARSET SPSS PORT FILE` in a different -character set encoding, where `CHARSET` is the name of the character set -used in the file, e.g. `ASCII` or `EBCDIC`. Each string is padded on -the right with spaces in its respective character set. - -It appears that these strings exist only to inform those who might -view the file on a screen, and that they are not parsed by SPSS -products. Thus, they can be safely ignored. For those interested, the -strings are supposed to be in the following character sets, in the -specified order: EBCDIC, 7-bit ASCII, CDC 6-bit ASCII, 6-bit ASCII, -Honeywell 6-bit ASCII. - -The 256-byte segment describes a mapping from the character set used -in the portable file to an arbitrary character set having characters at -the following positions: - -* 0-60: Control characters. Not important enough to describe in full here. - -* 61-63: Reserved. - -* 64-73: Digits `0` through `9`. - -* 74-99: Capital letters `A` through `Z`. 
- -* 100-125: Lowercase letters `a` through `z`. - -* 126: Space. - -* 127-130: Symbols `.<(+` - -* 131: Solid vertical pipe. - -* 132-142: Symbols `&[]!$*);^-/` +> Strings longer than 255 bytes exist in the [corpus]. -* 143: Broken vertical pipe. +## Splash Strings -* 144-150: Symbols `,%_>`?``:` +Every portable file begins with 200 bytes of splash strings that serve +to identify the file's type and its original character set. The 200 +bytes are divided into five 40-byte sections, each of which is +supposed to represent the string `<CHARSET> SPSS PORT FILE` in a +different character set encoding[^0], where `<CHARSET>` is the name of +the character set used in the file, e.g. `ASCII` or `EBCDIC`. Each +string is padded on the right with spaces in its respective character +set. -* 151: British pound symbol. +[^0]: The strings are supposed to be in EBCDIC, 7-bit ASCII, CDC 6-bit + ASCII, 6-bit ASCII, and Honeywell 6-bit ASCII. (It is somewhat + astonishing that anyone considered the possibility of 6-bit "ASCII", + or that there were at least three incompatible versions of it.) -* 152-155: Symbols `@'="`. - -* 156: Less than or equal symbol. - -* 157: Empty box. - -* 158: Plus or minus. - -* 159: Filled box. - -* 160: Degree symbol. - -* 161: Dagger. - -* 162: Symbol `~`. - -* 163: En dash. - -* 164: Lower left corner box draw. - -* 165: Upper left corner box draw. - -* 166: Greater than or equal symbol. - -* 167-176: Superscript `0` through `9`. +It appears that these strings exist only to inform those who might +view the file on a screen, letting them know what character set the +file is in regardless of how they are viewing it, and that they are +not parsed by SPSS products. Thus, they can be safely ignored. It is +reasonable to simply write out `ASCII SPSS PORT FILE` five times, each +time padded to 40 bytes. + +## Translation Table + +The splash strings are followed by a 256-byte character set translation table. 
+This segment describes a mapping from the character set used +in the portable file to a "portable character set" that does not +correspond to any known single-byte character set or code page. Each +byte in the table reports the byte value that corresponds to the +character represented by its position. The following section lists +the character at each position. + +> For example, position 0x4a (decimal 74) in the portable character +set is uppercase letter A (as shown in the table in the following +section), so the 75th byte in the table is the value that represents +`A` in the file. + +Any real character set will not necessarily include all of the +characters in the portable character set. In the translation table, +omitted characters are written as digit `0`[^10]. + +[^10]: Character `0`, not NUL or byte zero. + +> For example, in practice, all of the control character positions are +always written as `0`. + +The following section describes how the translation table is supposed +to act based on looking at the [sources](#sources), and then the +section after that describes what it actually contains in practice. + +### Theory + +The table below shows the portable character set. The columns in the +table are: + +* "Pos", a position within the portable character set, in hex, from 00 + to FF. + +* "EBCDIC", the translation for the given position to EBCDIC, as + written in `pff.tar.Z`. + +* "ASCII", the translation for the given position to ASCII, as written + in `pff.tar.Z`. + +* "Unicode", a suggestion for the best translation from this position to + Unicode. + +* "Notes", which links to additional information for some characters. + +In addition to the [sources](#sources) previously cited, some of the +information below is drawn from [RFC 183], from 1971. This RFC shows +many of the "EBCDIC" hex codes in `pff.tar.Z` as corresponding to the +descriptions in the document, even though no known EBCDIC codepage +contains those characters with those codes. 
+ +[RFC 183]: https://www.rfc-editor.org/rfc/rfc183.pdf + +| Pos | EBCDIC | ASCII | Unicode | | Notes +| -: | :----- | :---- | :------ | :-------- | :---------- +| 00 | 00 | — | — | — | [^1] +| 01 | 01 | — | — | — | [^1] +| 02 | 02 | — | — | — | [^1] +| 03 | 03 | — | — | — | [^1] +| 04 | 04 | — | — | — | [^1] +| 05 | 05 | — | U+0009 CHARACTER TABULATION | — | [^1] +| 06 | 06 | — | — | — | [^1] +| 07 | 07 | — | — | — | [^1] +| 08 | 08 | — | — | — | [^1] +| 09 | 09 | — | — | — | [^1] +| 0A | 0A | — | — | — | [^1] +| 0B | 0B | — | U+000B LINE TABULATION | — | [^1] +| 0C | 0C | — | U+000C FORM FEED | — | [^1] +| 0D | 0D | — | U+000D CARRIAGE RETURN | — | [^1] +| 0E | 0E | — | — | — | [^1] +| 0F | 0F | — | — | — | [^1] +| 10 | 10 | — | — | — | [^1] +| 11 | 11 | — | — | — | [^1] +| 12 | 12 | — | — | — | [^1] +| 13 | 13 | — | — | — | [^1] +| 14 | 3C | — | — | — | [^1] +| 15 | 15 | — | U+000A LINE FEED | — | [^1] +| 16 | 16 | — | U+0008 BACKSPACE | — | [^1] +| 17 | 17 | — | — | — | [^1] +| 18 | 18 | — | — | — | [^1] +| 19 | 19 | — | — | — | [^1] +| 1A | 1A | — | — | — | [^1] +| 1B | 1B | — | — | — | [^1] +| 1C | 1C | — | — | — | [^1] +| 1D | 1D | — | — | — | [^1] +| 1E | 1E | — | — | — | [^1] +| 1F | 2A | — | — | — | [^1] +| 20 | 20 | — | — | — | [^1] +| 21 | 21 | — | — | — | [^1] +| 22 | 22 | — | — | — | [^1] +| 23 | 23 | — | — | — | [^1] +| 24 | 2B | — | — | — | [^1] +| 25 | 25 | — | U+000A LINE FEED | — | [^1] +| 26 | 26 | — | — | — | [^1] +| 27 | 27 | — | — | — | [^1] +| 28 | 1F | — | — | — | [^1] +| 29 | 24 | — | — | — | [^1] +| 2A | 14 | — | — | — | [^1] +| 2B | 2D | — | — | — | [^1] +| 2C | 2E | — | — | — | [^1] +| 2D | 2F | — | U+0007 BELL | — | [^1] +| 2E | 32 | — | — | — | [^1] +| 2F | 33 | — | — | — | [^1] +| 30 | 34 | — | — | — | [^1] +| 31 | 35 | — | — | — | [^1] +| 32 | 36 | — | — | — | [^1] +| 33 | 37 | — | — | — | [^1] +| 34 | 38 | — | — | — | [^1] +| 35 | 39 | — | — | — | [^1] +| 36 | 3A | — | — | — | [^1] +| 37 | 3B | — | — | — | [^1] +| 38 | 3D | — | — | 
— | [^1] +| 39 | 3F | — | — | — | [^1] +| 3A | 28 | — | — | — | [^1] +| 3B | 29 | — | — | — | [^1] +| 3C | 2C | — | — | — | [^1] +| 3D | — | — | — | — | [^8] +| 3E | — | — | — | — | [^8] +| 3F | — | — | — | — | [^8] +| 40 | F0 | 30 | U+0030 DIGIT ZERO | `0` | +| ... | +| 49 | F9 | 39 | U+0039 DIGIT NINE | `9` | +| 4A | C1 | 41 | U+0041 LATIN CAPITAL LETTER A | `A` | +| ... | +| 52 | C9 | 49 | U+0049 LATIN CAPITAL LETTER I | `I` | +| 53 | D1 | 4A | U+004A LATIN CAPITAL LETTER J | `J` | +| ... | +| 5B | D9 | 52 | U+0052 LATIN CAPITAL LETTER R | `R` | +| 5C | E2 | 53 | U+0053 LATIN CAPITAL LETTER S | `S` | +| ... | +| 63 | E9 | 5A | U+005A LATIN CAPITAL LETTER Z | `Z` | +| 64 | 81 | 61 | U+0061 LATIN SMALL LETTER A | `a` | +| ... | +| 6C | 89 | 69 | U+0069 LATIN SMALL LETTER I | `i` | +| 6D | 91 | 6A | U+006A LATIN SMALL LETTER J | `j` | +| ... | +| 75 | 99 | 72 | U+0072 LATIN SMALL LETTER R | `r` | +| 76 | A2 | 73 | U+0073 LATIN SMALL LETTER S | `s` | +| ... | +| 7D | A9 | 7A | U+007A LATIN SMALL LETTER Z | `z` | +| 7E | 40 | 20 | U+0020 SPACE | ` ` | +| 7F | 4B | 2E | U+002E FULL STOP | `.` | +| 80 | 4C | 3C | U+003C LESS-THAN SIGN | `<` | +| 81 | 4D | 28 | U+0028 LEFT PARENTHESIS | `(` | +| 82 | 4E | 2B | U+002B PLUS SIGN | `+` | +| 83 | 59 | — | U+007C VERTICAL LINE | `\|` | [^2] +| 84 | 50 | 26 | U+0026 AMPERSAND | `&` | +| 85 | AD | 5B | U+005B LEFT SQUARE BRACKET | `[` | +| 86 | BD | 5D | U+005D RIGHT SQUARE BRACKET | `]` | +| 87 | 5A | 21 | U+0021 EXCLAMATION MARK | `!` | +| 88 | 5B | 24 | U+0024 DOLLAR SIGN | `$` | +| 89 | 5C | 2A | U+002A ASTERISK | `*` | +| 8A | 5D | 29 | U+0029 RIGHT PARENTHESIS | `)` | +| 8B | 5E | 3B | U+003B SEMICOLON | `;` | +| 8C | 5F | 5E | U+005E CIRCUMFLEX ACCENT | `^` | +| 8D | 60 | 2D | U+002D HYPHEN-MINUS | `-` | +| 8E | 61 | 2F | U+002F SOLIDUS | `/` | +| 8F | 6A | 7C | U+00A6 BROKEN BAR | `¦` | [^2] +| 90 | 6B | 2C | U+002C COMMA | `,` | +| 91 | 6C | 25 | U+0025 PERCENT SIGN | `%` | +| 92 | 6D | 5F | U+005F LOW LINE | `_` | +| 
93 | 6E | 3E | U+003E GREATER-THAN SIGN | `>` | +| 94 | 6F | 3F | U+003F QUESTION MARK | `?` | +| 95 | 79 | 60 | U+0060 GRAVE ACCENT | \` | +| 96 | 7A | 3A | U+003A COLON | `:` | +| 97 | 7B | 23 | U+0023 NUMBER SIGN | `#` +| 98 | 7C | 40 | U+0040 COMMERCIAL AT | `@` | +| 99 | 7D | 27 | U+0027 APOSTROPHE | `'` | +| 9A | 7E | 3D | U+003D EQUALS SIGN | `=` | +| 9B | 7F | 22 | U+0022 QUOTATION MARK | `"` | +| 9C | 8C | — | U+2264 LESS-THAN OR EQUAL TO | `≤` | +| 9D | 9C | — | U+25A1 WHITE SQUARE | `□` | [^3] +| 9E | 9E | — | U+00B1 PLUS-MINUS SIGN | `±` | +| 9F | 9F | — | U+25A0 BLACK SQUARE | `■` | [^4] +| A0 | — | — | U+00B0 DEGREE SIGN | `°` | +| A1 | 8F | — | U+2020 DAGGER | `†` | +| A2 | A1 | 7E | U+007E TILDE | `~` | +| A3 | A0 | — | U+2013 EN DASH | `–` | +| A4 | AB | — | U+2514 BOX DRAWINGS LIGHT UP AND RIGHT | `└` | [^5] +| A5 | AC | — | U+250C BOX DRAWINGS LIGHT DOWN AND RIGHT | `┌` | [^5] +| A6 | AE | — | U+2265 GREATER-THAN OR EQUAL TO | `≥` | +| A7 | B0 | — | U+2070 SUPERSCRIPT ZERO | `⁰` | [^5] +| ... | +| B0 | B9 | — | U+2079 SUPERSCRIPT NINE | `⁹` | [^5] +| B1 | BB | — | U+2518 BOX DRAWINGS LIGHT UP AND LEFT | `┘` | [^5] +| B2 | BC | — | U+2510 BOX DRAWINGS LIGHT DOWN AND LEFT | `┐` | [^5] +| B3 | BE | — | U+2260 NOT EQUAL TO | `≠` +| B4 | BF | — | U+2014 EM DASH | `—` +| B5 | 8D | — | U+207D SUPERSCRIPT LEFT PARENTHESIS | `⁽` +| B6 | 9D | — | U+207E SUPERSCRIPT RIGHT PARENTHESIS | `⁾` +| B7 | BE | — | U+207A SUPERSCRIPT PLUS SIGN | `⁺` | [^6] +| B8 | C0 | 7B | U+007B LEFT CURLY BRACKET | `{` +| B9 | D0 | 7D | U+007D RIGHT CURLY BRACKET | `}` +| BA | E0 | 5C | U+005C REVERSE SOLIDUS | `\` +| BB | 4A | — | U+00A2 CENT SIGN | `¢` +| BC | AF | — | U+00B7 MIDDLE DOT | `·` | [^7] +| BD | — | — | — | — | [^8] +| ... | +| FF | — | — | — | — | [^8] + +[^1]: From the EBCDIC translation table in `pff.tar.Z`. The ASCII + translation table leaves all of them undefined. Code points are + only listed for common control characters with some modern relevance. 
+ +[^2]: The [document] describes 83 as "a solid vertical pipe" and 8F as + "a broken vertical pipe". Even though the ASCII translation table + in `pff.tar.Z` leaves position 83 undefined and translates 8F to + U+007C VERTICAL LINE, using U+007C VERTICAL LINE and U+00A6 BROKEN + BAR, respectively, seems more accurate in a Unicode environment. + +[^3]: Unicode inferred from [document] description as "empty box". + +[^4]: Unicode inferred from [document] description as "filled box". + +[^5]: These characters are as described in the [document]. Some of + these don't appear in any known EBCDIC code page, but the EBCDIC + translations given in `pff.tar.Z` match the graphics shown in [RFC + 183] with those hex codes. + +[^6]: Described in [document] as "horizontal dagger", which doesn't + appear in Unicode or any known code page. This interpretation + from [RFC 183] seems more likely. + +[^7]: Unicode inferred from [document] description as "centered dot, + or bullet" + +[^8]: Reserved + +Summary: + +| Range | Characters | +|-------: |:-------------------| +| 40...4F | `0123456789ABCDEF` | +| 50...5F | `GHIJKLMNOPQRSTUV` | +| 60...6F | `WXYZabcdefghijkl` | +| 70...7F | `mnopqrstuvwxyz .` | +| 80...8F | `<(+\|&[]!$*);^-/¦` | +| 90...9F | ``,%_>?`:#@'="≤□±■`` | +| A0...AF | `°†~–└┌≥⁰¹²³⁴⁵⁶⁷⁸` | +| B0...BC | `⁹┘┐≠—⁽⁾⁺{}\¢·` | + +### Practice: Character Set + +The previous section described the translation table in theory. This +section describes what it contains in the [corpus]. + +Every file in the corpus is encoded in (extended) ASCII, although 31 +of them indicate in their splash strings that they were recoded from +EBCDIC. This also means that ASCII `0` indicates an unmapped +character, that is, one not in the character set represented by the +table. + +The files are encoded in different ASCII extensions. Some appear to be +encoded in [windows-1252], others in [code page 437], others in +unidentified character sets. 
The particular code page in use does not +matter to a reader that uses the table for mapping. + +[windows-1252]: https://en.wikipedia.org/wiki/Windows-1252 +[code page 437]: https://en.wikipedia.org/wiki/Code_page_437 + +* There are some invariants across the translation tables for every file + in the corpus: + + - All control codes (in the range 0 to 63) are unmapped. + + One consequence is that strings in the corpus can never contain + new-lines. New-lines encoded literally would be problematic + anyhow because readers [must ignore + them](#portable-file-characters). + + - Digits `0` to `9` and letters `A` to `Z` and `a` to `z` are + correctly mapped. + + - Punctuation for space as well as ``(+&$*);-/,%_?`:@'=\`` are + correctly mapped. + +* Characters `\"~{}` are mapped correctly in almost every file in + the corpus, with a few outliers. + +* Characters `[]` are mostly correct with a few problems. + +* Position 97 is correctly `#` in most files, and wrongly `$` in some. + +* The characters at positions 83 `|` and 8F `¦` have lots of issues, + stemming from the history described [on Wikipedia]. In particular, + EBCDIC and Unicode have separate characters for `|` and `¦`, but + ASCII does not. + + [on Wikipedia]: https://en.wikipedia.org/wiki/Vertical_bar#Broken_bar + + Most of the corpus leaves 83 `|` unmapped. Most of the rest map it + correctly to `|`. The remainder map it to `!`. + + Most of the corpus maps 8F `¦` to `|`. Only a few map it correctly + to `¦` in [windows-1252] or (creatively) to `║` in [code page 437]. + +* Characters at the following positions are almost always wrong. The + table shows: -* 177: Lower right corner box draw. + - "Character", the character and its position in the portable character set. -* 178: Upper right corner box draw. + - "Unmapped", the number of files in the corpus that leave the + character unmapped (that is, set to `0`). -* 179: Not equal symbol. 
+ - "windows-1252", the number of files that map the character + correctly in [windows-1252]. If there is more than one plausible + mapping, or if the mapping doesn't exactly match the preferred + Unicode, the entry shows the mapped character. -* 180: Em dash. + - "cp437", the number of files that map the character correctly in + [code page 437]. -* 181: Superscript `(`. + In a few cases, a plausible mapping in the "windows-1252" column + is an ASCII character. Those aren't separately counted in the + "cp437" column, even though ASCII maps the same way in both + encodings. -* 182: Superscript `)`. + - "Wrong", the number of files that map the character to nothing + that makes sense in a known encoding. + + | Character | Unmapped | windows-1252 | cp437 | Wrong | + |:-----------------|---------:|-----------------------:|-----------:|------:| + | 9C `≤` | 1366 | 0 | 10 | 28 | + | A6 `≥` | 1373 | 0 | 10 | 21 | + | 9F `■` | 1373 | 0 | 10 | 21 | + | 9E `±` | 1353 | 15 | 15 | 23 | + | A3 `–` (en dash) | 1302 | as `-`: 65 | as `─`: 5 | 32 | + | B4 `—` (em dash) | 1308 | as `-`: 65 | as `─`: 10 | 21 | + | A4 `└` | 1367 | 0 | 15 | 22 | + | A5 `┌` | 1367 | 0 | 15 | 22 | + | B1 `┘` | 1367 | 0 | 15 | 22 | + | B2 `┐` | 1367 | 0 | 15 | 22 | + | A8 `¹` | 1286 | as `¹`: 15; as `1`: 65 | 0 | 38 | + | A9 `²` | 1286 | as `²`: 15; as `2`: 65 | 15 | 23 | + | AA `³` | 1286 | as `³`: 15; as `3`: 65 | 0 | 38 | + | AB `⁴` | 1308 | as `4`: 65 | 0 | 31 | + | ... | ... | ... | ... | ... | + | B0 `⁹` | 1308 | as `9`: 65 | 0 | 31 | + | B3 `≠` | 1373 | 0 | as `╪`: 10 | 21 | + | B5 `⁽` | 1308 | 0 | 0 | 96 | + | B6 `⁾` | 1373 | 0 | 0 | 31 | + | BB `¢` | 1351 | 16 | 10 | 27 | + | BC `·` | 1357 | as `·`: 16; as `×`: 1 | as `∙`: 10 | 20 | + | A0 `°` | 1382 | as `°`: 15; as `º`: 1 | 5 | 6 | -* 183: Horizontal dagger (?). 
+* Characters at the following positions are always unmapped or wrong: + + | Character | Unmapped | windows-1252 | cp437 | Wrong | + |:----------|---------:|-------------:|-----------:|-------------------------:| + | 9D `□` | 1373 | 0 | as `╬`: 10 | 21 | + | A1 `†` | 1364 | 0 | as `┼`: 10 | 30 | + | A7 `⁰` | 1373 | as `Ø`: 1 | 0 | 30 | + | B7 `⁺` | 1373 | 0 | 0 | 31 | -* 184-186: Symbols `{}\`. +* Sometimes the reserved characters are mapped (not in any obviously + useful way). -* 187: Cents symbol. +### Practice: Characters in Use -* 188: Centered dot, or bullet. +The previous section reported on the character sets defined in the +translation table in the corpus. This section reports on the +characters actually found in the corpus. -* 189-255: Reserved. +In practice, characters in the corpus are in [ISO-8859-1], with very +few exceptions. The exceptions are a handful of files that either use +reserved characters from the portable character set, for unclear +reasons, or declare surprising encodings for bytes in the normal ASCII +range. These exceptions might be file corruption; they do not appear +to be useful. -Symbols that are not defined in a particular character set are set to -the same value as symbol 64; i.e., to `0`. +As a result, a portable file reader could reasonably ignore the +translation table and simply interpret all portable files as +[ISO-8859-1] or [windows-1252]. + +There is no visible distinction in practice between portable files in +"communication" versus "tape" format. Neither kind contains control +characters. + +[ISO-8859-1]: https://en.wikipedia.org/wiki/ISO/IEC_8859-1 + +Files in the corpus have a mix of CRLF and LF-only line ends. + +## Tag String + +The translation table is followed by an 8-byte tag string that +consists of the exact characters `SPSSPORT` in the portable file's +character set. This can be used to verify that the file is indeed a +portable file. 
-The 8-byte tag string consists of the exact characters `SPSSPORT` in -the portable file's character set, which can be used to verify that the -file is indeed a portable file. +> Since every file in the corpus is encoded in (extended) ASCII, this +> string always appears in ASCII too. ## Version and Date Info Record This record does not have a tag code. It has the following structure: -- A single character identifying the file format version. The letter - A represents version 0, and so on. +- A single character identifying the file format version. It is + always `A`. - An 8-character string field giving the file creation date in the format YYYYMMDD. @@ -215,19 +579,65 @@ This record does not have a tag code. It has the following structure: - A 6-character string field giving the file creation time in the format HHMMSS. +> In the [corpus], there is some variation for file creation dates and +> times by product: +> +> - `STAT/TRANSFER` often writes dates that are invalid +> (e.g. `20040931`) or obviously wrong (e.g. `19040823`, `19000607`). +> +> - `STAT/TRANSFER` often writes the time as all spaces. +> +> - `IBM SPSS Statistics 19.0` (and probably other versions) writes `HH` +> as ` H` for single-digit hours. +> +> - `SPSS 6.1 for the Power Macintosh` writes invalid dates such as +> `19:11010`. + ## Identification Records The product identification record has tag code `1`. It consists of a single string field giving the name of the product that wrote the portable file. -The author identification record has tag code `2`. It is optional. -If present, it consists of a single string field giving the name of the -person who caused the portable file to be written. - -The subproduct identification record has tag code `3`. It is -optional. If present, it consists of a single string field giving -additional information on the product that wrote the portable file. +The author identification record has tag code `2`. It is optional and +usually omitted. 
If present, it consists of a single string field +giving the name of the person who caused the portable file to be +written. + +> The [corpus] contains a few different kinds of authors: +> +> - Organizational names, such as the names of companies or +> universities or their departments. +> +> - Product names, such as `SPSS for HP-UX`. +> +> - Internet host names, such as `icpsr.umich.edu`. + +The subproduct identification record has tag code `3`. It is optional +and usually omitted. If present, it consists of a single string field +giving additional information on the product that wrote the portable +file. + +> The [corpus] contains a few different kinds of subproduct: +> +> - `x86_64-w64-mingw32` or another target triple (written by PSPP). +> +> - A file name for a `.sav` file. +> +> - `SPSS/PC+ Studentware+` written by `SPSS for MS WINDOWS Release 7.0` +> in 1996. +> +> - `FILE BUILT VIA IMPORT` written by `SPSS RELEASE 4.1 FOR VAX/VMS` in +> 1998. +> +> - `SPSS/PC+` written by `SPSS for MS WINDOWS Release 7.0` in 1996. +> +> - Multiple instances of `SPSS/PC+` written by `SPSS/PC+ on IBM PC`, +> but with several spaces padding out both product and subproduct +> fields. +> +> - `PFF TEST FILE` written by `SPSS-X RELEASE 2.1 FOR IBM VM/CMS` in +> 1986. ## Variable Count Record @@ -252,14 +662,21 @@ field that names the weighting variable. Each variable record represents a single variable. Variable records have tag code `7`. They have the following structure: -- Width (integer). This is 0 for a numeric variable, and a number - between 1 and 255 for a string variable. +- Width (integer). This is 0 for a numeric variable. For portability + to old versions of SPSS, it should be between 1 and 255 for a string + variable. + + > Portable files in the [corpus] contain strings as wide as 32000 + bytes. 
None of these was written by SPSS itself, but by a variety + of third-party products: `STAT/TRANSFER`, `inquery export tool (c) + inworks GmbH`, `QDATA Data Entry System for the IBM PC`. The + creation dates in the files range from 2016 to 2024. - Name (string). 1-8 characters long. Must be in all capitals. - A few portable files that contain duplicate variable names have + > A few portable files that contain duplicate variable names have been spotted in the wild. PSPP handles these by renaming the - duplicates with numeric extensions: `VAR_1`, `VAR_2`, and so on. + duplicates with numeric extensions: `VAR001`, `VAR002`, and so on. - Print format. This is a set of three integer fields: @@ -270,10 +687,10 @@ have tag code `7`. They have the following structure: - Number of decimal places. 1-40. - A few portable files with invalid format types or formats that are - not of the appropriate width for their variables have been spotted - in the wild. PSPP assigns a default `F` or `A` format to a variable - with an invalid format. + > A few portable files with invalid format types or formats that are + not of the appropriate width or decimals for their variables have + been spotted in the wild. PSPP assigns a default `F` or `A` format + to a variable with an invalid format. - Write format. Same structure as the print format described above. @@ -282,15 +699,19 @@ record, which has tag code `8`. A missing value record has one field, the missing value itself (a floating-point or string, as appropriate). Up to three of these missing value records can be used. -There is also a record for missing value ranges, which has tag code -`B`. It is followed by two fields representing the range, which are -floating-point or string as appropriate. If a missing value range is -present, it may be followed by a single missing value record. +There are also records for missing value ranges: + +- Tag code `B` for `X THRU Y` ranges. 
It is followed by two + floating-point values representing `X` and `Y`. + +- Tag code `9` for `LO THRU Y` ranges, followed by a floating-point + number representing `Y`. + +- Tag code `A` for `X THRU HI` ranges, followed by a floating-point + number representing `X`. -Tag codes `9` and `A` represent `LO THRU X` and `X THRU HI` ranges, -respectively. Each is followed by a single field representing X. If -one of the ranges is present, it may be followed by a single missing -value record. +If a missing value range is present, it may be followed by a single +missing value record. In addition, each variable record can optionally be followed by a variable label record, which has tag code `C`. A variable label record @@ -314,9 +735,9 @@ Value label records have tag code `D`. They have the following format: or string as appropriate to the variables, followed by a label (string). -A few portable files that specify duplicate value labels, that is, -two different labels for a single value of a single variable, have been -spotted in the wild. PSPP uses the last value label specified in these +> The corpus contains a few portable files that specify duplicate +value labels, that is, two different labels for a single value of a +single variable. PSPP uses the last value label specified in these cases. ## Document Record diff --git a/rust/doc/src/spv/index.md b/rust/doc/src/spv/index.md index 3d4e7f35b3..100a531c41 100644 --- a/rust/doc/src/spv/index.md +++ b/rust/doc/src/spv/index.md @@ -20,12 +20,18 @@ new-line. PSPP uses this string to identify an SPV file; it is invariant across the corpus. > SPV files always begin with the 7-byte sequence 50 4b 03 04 14 00 -08, but this is not a useful magic number because most Zip archives -start the same way. - +> 08, but this is not a useful magic number because most Zip archives +> start the same way. 
+> +> Checking only for the presence of `META-INF/MANIFEST.MF` is also not +> a useful magic number because this file name also appears in every +> [Java JAR archive]. +> > SPSS writes `META-INF/MANIFEST.MF` to every SPV file, but it does -not read it or even require it to exist, so using different contents, -e.g. `allowPivoting=false`, has no effect. +> not read it or even require it to exist, so using different +> contents, e.g. `allowPivoting=false`, has no effect. +> +> [Java JAR archive]: https://docs.oracle.com/javase/8/docs/technotes/guides/jar/jar.html The rest of the members in an SPV file's Zip archive fall into two categories: "structure" and "detail" members. Structure member names diff --git a/rust/pspp/Cargo.toml b/rust/pspp/Cargo.toml index f0a33d2f56..996a75c919 100644 --- a/rust/pspp/Cargo.toml +++ b/rust/pspp/Cargo.toml @@ -53,6 +53,8 @@ unicode-segmentation = "1.12.0" serde_json = "1.0.141" toml = "0.9.5" hashbrown = { version = "0.15.5", features = ["serde"] } +displaydoc = "0.2.5" +codepage-437 = "0.1.0" [target.'cfg(windows)'.dependencies] windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] } diff --git a/rust/pspp/src/convert.rs b/rust/pspp/src/convert.rs index 949331282d..aa59cbfa42 100644 --- a/rust/pspp/src/convert.rs +++ b/rust/pspp/src/convert.rs @@ -17,24 +17,26 @@ use std::{ fs::File, io::{stdout, Write}, - path::PathBuf, + path::{Path, PathBuf}, }; -use anyhow::{bail, Result}; +use anyhow::{anyhow, bail, Error as AnyError, Result}; use chrono::{Datelike, NaiveTime, Timelike}; -use clap::Args; +use clap::{Args, ValueEnum}; use csv::Writer; use encoding_rs::Encoding; use pspp::{ calendar::calendar_offset_to_gregorian, - data::{ByteString, Datum, WithEncoding}, + data::{ByteString, Case, Datum, WithEncoding}, + file::FileType, format::{DisplayPlain, Type}, + por::PortableFile, sys::{raw::records::Compression, ReadOptions, WriteOptions}, util::ToSmallString, variable::Variable, }; -use crate::{parse_encoding, 
OutputFormat}; +use crate::parse_encoding; /// Convert SPSS data files into other formats. #[derive(Args, Clone, Debug)] @@ -247,12 +249,42 @@ struct SysOptions { compression: Option, } -impl Convert { - pub fn run(self) -> Result<()> { - fn warn(warning: anyhow::Error) { - eprintln!("warning: {warning}"); +/// Output file format. +#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] +enum OutputFormat { + /// Comma-separated values using each variable's print format (variable + /// names are written as the first line) + Csv, + + /// System file + Sys, + + /// Portable file + Por, +} + +impl TryFrom<&Path> for OutputFormat { + type Error = AnyError; + + fn try_from(value: &Path) -> std::result::Result { + let extension = value.extension().unwrap_or_default(); + if extension.eq_ignore_ascii_case("csv") || extension.eq_ignore_ascii_case("txt") { + Ok(OutputFormat::Csv) + } else if extension.eq_ignore_ascii_case("sav") || extension.eq_ignore_ascii_case("sys") { + Ok(OutputFormat::Sys) + } else if extension.eq_ignore_ascii_case("por") { + Ok(OutputFormat::Por) + } else { + Err(anyhow!( + "Unknown output file extension '{}'", + extension.display() + )) } + } +} +impl Convert { + pub fn run(self) -> Result<()> { let output_format = match self.output_format { Some(format) => format, None => match &self.output { @@ -261,14 +293,39 @@ impl Convert { }, }; - let mut system_file = ReadOptions::new(warn) - .with_encoding(self.encoding) - .with_password(self.password.clone()) - .open_file(&self.input)?; - if output_format == OutputFormat::Sys && self.sys_options.to_unicode { - system_file = system_file.into_unicode(); - } - let (dictionary, _, cases) = system_file.into_parts(); + let (dictionary, cases) = match FileType::from_file(&self.input)? { + Some(FileType::System { .. 
}) => { + fn warn(warning: anyhow::Error) { + eprintln!("warning: {warning}"); + } + + let mut system_file = ReadOptions::new(warn) + .with_encoding(self.encoding) + .with_password(self.password.clone()) + .open_file(&self.input)?; + if output_format == OutputFormat::Sys && self.sys_options.to_unicode { + system_file = system_file.into_unicode(); + } + let (dictionary, _, cases) = system_file.into_parts(); + let cases = cases.map(|result| result.map_err(AnyError::from)); + let cases = Box::new(cases) + as Box>>, AnyError>>>; + (dictionary, cases) + } + Some(FileType::Portable) => { + fn warn_portable(warning: pspp::por::Warning) { + eprintln!("warning: {warning}"); + } + + let portable_file = PortableFile::open_file(&self.input, warn_portable)?; + let (dictionary, _, cases) = portable_file.into_parts(); + let cases = cases.map(|result| result.map_err(AnyError::from)); + let cases = Box::new(cases) + as Box>>, AnyError>>>; + (dictionary, cases) + } + _ => bail!("{}: not a system or portable file", self.input.display()), + }; // Take only the first `self.max_cases` cases. 
let cases = cases.take(self.max_cases.unwrap_or(usize::MAX)); @@ -314,6 +371,15 @@ impl Convert { output.write_case(case?)?; } } + OutputFormat::Por => { + let Some(output) = &self.output else { + bail!("output file name must be specified for output to a portable file") + }; + let mut output = pspp::por::WriteOptions::new().write_file(&dictionary, output)?; + for case in cases { + output.write_case(case?)?; + } + } } Ok(()) } diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index 28ad0521a7..ebacd6108e 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -44,7 +44,12 @@ use serde::{ }; use crate::{ + dictionary::Dictionary, format::DisplayPlain, + output::{ + pivot::{Axis3, Dimension, Group, PivotTable, Value}, + Item, Text, + }, variable::{VarType, VarWidth}, }; @@ -774,14 +779,20 @@ pub struct Case where B: Borrow<[Datum]>, { - encoding: &'static Encoding, data: B, + encoding: &'static Encoding, } impl Case where B: Borrow<[Datum]>, { + pub fn new(data: B, encoding: &'static Encoding) -> Self { + Self { data, encoding } + } + pub fn encoding(&self) -> &'static Encoding { + self.encoding + } pub fn is_empty(&self) -> bool { self.len() == 0 } @@ -816,6 +827,47 @@ impl Case>> { } } +pub fn cases_to_output(dictionary: &Dictionary, cases: C) -> Vec +where + C: IntoIterator>>, E>>, + E: Display, +{ + let mut output = Vec::new(); + let cases = cases.into_iter(); + let variables = + Group::new("Variable").with_multiple(dictionary.variables.iter().map(|var| &**var)); + let mut case_numbers = Group::new("Case").with_label_shown(); + let mut data = Vec::new(); + for case in cases { + match case { + Ok(case) => { + case_numbers.push(Value::new_integer(Some((case_numbers.len() + 1) as f64))); + data.push( + case.into_iter() + .map(|datum| Value::new_datum(&datum)) + .collect::>(), + ); + } + Err(error) => { + output.push(Item::from(Text::new_log(error.to_string()))); + } + } + } + if !data.is_empty() { + let mut pt = PivotTable::new([ + (Axis3::X, 
Dimension::new(variables)), + (Axis3::Y, Dimension::new(case_numbers)), + ]); + for (row_number, row) in data.into_iter().enumerate() { + for (column_number, datum) in row.into_iter().enumerate() { + pt.insert(&[column_number, row_number], datum); + } + } + output.push(pt.into()); + } + output +} + impl Serialize for Case where B: Borrow<[Datum]>, diff --git a/rust/pspp/src/file.rs b/rust/pspp/src/file.rs new file mode 100644 index 0000000000..6e34b4a2d9 --- /dev/null +++ b/rust/pspp/src/file.rs @@ -0,0 +1,216 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +//! Basic infrastructure for files understood by PSPP. + +#![cfg_attr(not(test), warn(missing_docs))] +use std::{ + fs::File, + io::{Error, Read, Seek}, + path::Path, +}; + +use zip::ZipArchive; + +use crate::sys::raw::Magic; + +/// Type of a file understood by PSPP. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum FileType { + /// A [system file](crate::sys). + System { + /// Whether the file is encrypted. + encrypted: bool, + }, + + /// A [portable file](crate::por). + Portable, + + /// An SPSS PC+ data file. + PcPlus, + + /// An [SPSS Viewer file](crate::output::spv). + Viewer { + /// Whether the file is encrypted. + encrypted: bool, + }, + + /// A file that may be an SPSS syntax file. 
+ Syntax { + /// True if there's confidence that this is a syntax file, which would + /// be either because it has an indicated encoding or because it is + /// encrypted. + confident: bool, + + /// Whether the file is encrypted. + encrypted: bool, + }, +} + +impl FileType { + /// Returns true if we're confident about the file's type. + /// + /// (We can't always confidently identify syntax files because they look + /// mostly like any kind of text file.) + pub fn is_confident(&self) -> bool { + match self { + Self::Syntax { confident, .. } => *confident, + _ => true, + } + } + + /// Returns true if the file is encrypted. + pub fn is_encrypted(&self) -> bool { + match self { + FileType::System { encrypted } => *encrypted, + FileType::Viewer { encrypted } => *encrypted, + FileType::Syntax { + confident: _, + encrypted, + } => *encrypted, + _ => false, + } + } + + /// Attempts to identify the type of file at `path`. Returns: + /// + /// * `Err(error)`: I/O error. + /// + /// * `Ok(Some(type))`: Identified file type. + /// + /// * `Ok(None)`: Unknown file type. + pub fn from_file

(path: P) -> Result, Error> + where + P: AsRef, + { + Self::from_reader(File::open(path)?) + } + + /// Like [from_file](Self::from_file) for an arbitrary `reader`. + pub fn from_reader(mut reader: R) -> Result, Error> + where + R: Read + Seek, + { + let mut buf = vec![0; 512]; + let mut n = 0; + while n < buf.capacity() { + let count = reader.read(&mut buf[n..])?; + n += count; + if count == 0 { + break; + } + } + buf.truncate(n); + + if let Some(magic) = buf.get(0..4) { + let magic: [u8; 4] = magic.try_into().unwrap(); + if Magic::try_from(magic).is_ok() { + return Ok(Some(Self::System { encrypted: false })); + } + } + + match buf.get(8..20) { + Some(b"ENCRYPTEDSAV") => { + return Ok(Some(Self::System { encrypted: true })); + } + Some(b"ENCRYPTEDSPV") => { + return Ok(Some(Self::Viewer { encrypted: true })); + } + Some(b"ENCRYPTEDSPS") => { + return Ok(Some(Self::Syntax { + confident: true, + encrypted: true, + })); + } + _ => (), + } + + if buf + .get(200 + 256..) + .unwrap_or_default() + .windows(8) + .any(|w| w == b"SPSSPORT") + { + return Ok(Some(Self::Portable)); + } + + if buf.get(0x104..0x108) == Some(b"SPSS") { + return Ok(Some(Self::PcPlus)); + } + + let mut string = String::new(); + if buf.get(..7) == Some(&[0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x08]) + && let Ok(mut archive) = ZipArchive::new(reader) + && let Ok(mut file) = archive.by_name("META-INF/MANIFEST.MF") + && let Ok(_) = file.read_to_string(&mut string) + && string.trim() == "allowPivoting=true" + { + return Ok(Some(Self::Viewer { encrypted: false })); + } + + if !buf.is_empty() && !buf.contains(&0) { + return Ok(Some(Self::Syntax { + confident: buf.starts_with(b"* Encoding:"), + encrypted: false, + })); + } + + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use crate::file::FileType; + + #[test] + fn file_type() { + assert_eq!( + FileType::from_file("src/file/testdata/test.sav").unwrap(), + Some(FileType::System { encrypted: false }) + ); + assert_eq!( + 
FileType::from_file("src/file/testdata/test-encrypted.sav").unwrap(), + Some(FileType::System { encrypted: true }) + ); + assert_eq!( + FileType::from_file("src/file/testdata/test.por").unwrap(), + Some(FileType::Portable) + ); + assert_eq!( + FileType::from_file("src/file/testdata/test-encrypted.spv").unwrap(), + Some(FileType::Viewer { encrypted: true }) + ); + assert_eq!( + FileType::from_file("src/file/testdata/test.spv").unwrap(), + Some(FileType::Viewer { encrypted: false }) + ); + assert_eq!( + FileType::from_file("src/file/testdata/test.sps").unwrap(), + Some(FileType::Syntax { + confident: false, + encrypted: false + }) + ); + assert_eq!( + FileType::from_file("src/file/testdata/test-encoding.sps").unwrap(), + Some(FileType::Syntax { + confident: true, + encrypted: false + }) + ); + } +} diff --git a/rust/pspp/src/file/testdata/test-encoding.sps b/rust/pspp/src/file/testdata/test-encoding.sps new file mode 100644 index 0000000000..d060b90d26 --- /dev/null +++ b/rust/pspp/src/file/testdata/test-encoding.sps @@ -0,0 +1,2 @@ +* Encoding: UTF-8. +DATA LIST /X 1. 
\ No newline at end of file diff --git a/rust/pspp/src/file/testdata/test-encrypted.sav b/rust/pspp/src/file/testdata/test-encrypted.sav new file mode 100644 index 0000000000..2d9f531102 Binary files /dev/null and b/rust/pspp/src/file/testdata/test-encrypted.sav differ diff --git a/rust/pspp/src/file/testdata/test-encrypted.spv b/rust/pspp/src/file/testdata/test-encrypted.spv new file mode 100644 index 0000000000..da8be2c80f Binary files /dev/null and b/rust/pspp/src/file/testdata/test-encrypted.spv differ diff --git a/rust/pspp/src/file/testdata/test.por b/rust/pspp/src/file/testdata/test.por new file mode 100644 index 0000000000..248cc4e7b5 --- /dev/null +++ b/rust/pspp/src/file/testdata/test.por @@ -0,0 +1,11 @@ +ÁâÃÉÉ@â×ââ@×ÖÙã@ÆÉÓÅ@@@@@@@@@@@@@@@@@@@@ASCII SPSS PORT FILE +00000-0000-0000-0000--------------------!3#))0303300/240&),%00000000000000000000 +0200002'220'&)3000#0000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrst +uvwxyz .<(+0&[]!$*);^-/|,%_>?`:#@'="000000~000000000000000000000{}\0000000000000 +00000000000000000000000000000000000000000000000000000000SPSSPORTA8/199805296/173 +832111/SPSS for MS WINDOWS Release 7.549/5B/68/VAR0000170/8/VAR000015/8/2/5/8/2/ +81/82/83/70/8/VAR000025/8/2/5/8/2/BA/K/70/8/VAR000035/8/2/5/8/2/B10/1A/81K/70/8/ +VAR000045/8/2/5/8/2/A1/70/8/VAR000055/8/2/5/8/2/92/70/8/VAR000065/8/2/5/8/2/9A9E +17IR6IFNR+6H/70/8/VAR000075/8/2/5/8/2/92/83/70/8/VAR000085/8/2/5/8/2/70/8/VAR000 +095/8/2/5/8/2/F0/*.*.*.*.*.*.*.*.ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ diff --git a/rust/pspp/src/file/testdata/test.sav b/rust/pspp/src/file/testdata/test.sav new file mode 100644 index 0000000000..a84e8f15a8 Binary files /dev/null and b/rust/pspp/src/file/testdata/test.sav differ diff --git a/rust/pspp/src/file/testdata/test.sps b/rust/pspp/src/file/testdata/test.sps new file mode 100644 index 0000000000..3f1afb2ad4 --- /dev/null +++ 
b/rust/pspp/src/file/testdata/test.sps @@ -0,0 +1 @@ +DATA LIST /X 1. diff --git a/rust/pspp/src/file/testdata/test.spv b/rust/pspp/src/file/testdata/test.spv new file mode 100644 index 0000000000..891263dccd Binary files /dev/null and b/rust/pspp/src/file/testdata/test.spv differ diff --git a/rust/pspp/src/format.rs b/rust/pspp/src/format.rs index 43ba5198b8..401b22f0fa 100644 --- a/rust/pspp/src/format.rs +++ b/rust/pspp/src/format.rs @@ -695,7 +695,7 @@ impl TryFrom for Format { } else if !format.width_range().contains(&w) { Err(Error::BadWidth(source)) } else if d > max_d { - if format.takes_decimals() { + if !format.takes_decimals() { Err(Error::DecimalsNotAllowedForFormat(source)) } else if max_d > 0 { Err(Error::TooManyDecimalsForWidth { diff --git a/rust/pspp/src/lib.rs b/rust/pspp/src/lib.rs index ee2e8a68ee..00114eba12 100644 --- a/rust/pspp/src/lib.rs +++ b/rust/pspp/src/lib.rs @@ -107,6 +107,7 @@ pub mod data; pub mod dictionary; pub mod endian; pub mod engine; +pub mod file; pub mod format; pub mod hexfloat; pub mod identifier; @@ -116,6 +117,7 @@ pub mod locale_charset; pub mod macros; pub mod message; pub mod output; +pub mod por; pub mod prompt; pub mod settings; pub mod sys; diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index b00e2b4a89..5cdd92ee71 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -14,17 +14,17 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -use anyhow::{anyhow, Error as AnyError, Result}; -use clap::{Parser, Subcommand, ValueEnum}; +use anyhow::Result; +use clap::{Parser, Subcommand}; use encoding_rs::Encoding; -use std::path::Path; use thiserror::Error as ThisError; -use crate::{convert::Convert, decrypt::Decrypt, show::Show}; +use crate::{convert::Convert, decrypt::Decrypt, show::Show, show_por::ShowPor}; mod convert; mod decrypt; mod show; +mod show_por; /// PSPP, a program for statistical analysis of sampled data. 
#[derive(Parser, Debug)] @@ -34,40 +34,12 @@ struct Cli { command: Command, } -/// Output file format. -#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] -enum OutputFormat { - /// Comma-separated values using each variable's print format (variable - /// names are written as the first line) - Csv, - - /// System file - Sys, -} - -impl TryFrom<&Path> for OutputFormat { - type Error = AnyError; - - fn try_from(value: &Path) -> std::result::Result { - let extension = value.extension().unwrap_or_default(); - if extension.eq_ignore_ascii_case("csv") || extension.eq_ignore_ascii_case("txt") { - Ok(OutputFormat::Csv) - } else if extension.eq_ignore_ascii_case("sav") || extension.eq_ignore_ascii_case("sys") { - Ok(OutputFormat::Sys) - } else { - Err(anyhow!( - "Unknown output file extension '{}'", - extension.display() - )) - } - } -} - #[derive(Subcommand, Clone, Debug)] enum Command { Convert(Convert), Decrypt(Decrypt), Show(Show), + ShowPor(ShowPor), } impl Command { @@ -76,6 +48,7 @@ impl Command { Command::Convert(convert) => convert.run(), Command::Decrypt(decrypt) => decrypt.run(), Command::Show(show) => show.run(), + Command::ShowPor(show_por) => show_por.run(), } } } diff --git a/rust/pspp/src/output/pivot.rs b/rust/pspp/src/output/pivot.rs index 2c9f17b307..c909d8dd38 100644 --- a/rust/pspp/src/output/pivot.rs +++ b/rust/pspp/src/output/pivot.rs @@ -71,6 +71,7 @@ use thiserror::Error as ThisError; use tlo::parse_tlo; use crate::{ + calendar::date_time_to_pspp, data::{ByteString, Datum, EncodedString, RawString}, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, settings::{Settings, Show}, @@ -2007,6 +2008,9 @@ impl Value { styling: None, } } + pub fn new_date_time(date_time: NaiveDateTime) -> Self { + Self::new_number_with_format(Some(date_time_to_pspp(date_time)), Format::DATETIME40_0) + } pub fn new_number_with_format(x: Option, format: Format) -> Self { Self::new(ValueInner::Number(NumberValue { show: None, diff --git 
a/rust/pspp/src/por.rs b/rust/pspp/src/por.rs new file mode 100644 index 0000000000..5488c5a39a --- /dev/null +++ b/rust/pspp/src/por.rs @@ -0,0 +1,56 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +//! Reading and writing portable files. +//! +//! This module enables reading and writing “portable files”, a text-based +//! format for SPSS data files. The [portable file format] dates back 40+ years. +//! It was originally designed to facilitate data interchange between systems +//! with unlike character sets, but it did not continue to evolve after the +//! system file format was introduced. It is obsolete. PSPP includes readers +//! and writers for portable files only for compatibility; all non-legacy uses +//! of PSPP should use [system files] instead. +//! +//! Use [PortableFile] to read a portable file. Use [WriteOptions] to write a +//! portable file. +//! +//! [portable file format]: https://pspp.benpfaff.org/manual/portable.html +//! 
[system files]: crate::sys +#![cfg_attr(not(test), warn(missing_docs))] + +mod read; +mod write; + +pub use read::{ + Cases, Error, ErrorDetails, Metadata, PortableFile, ReadPad, ReadTranslate, TranslationTable, + Warning, +}; +pub use write::{WriteOptions, Writer}; + +static PORTABLE_TO_WINDOWS_1252: &[u8] = { + let s = + b" \ + 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .\ + <(+|&[]!$*);^-/|,%_>?`:#@'=\" \xb1 \xb0\x86~\x96 0\xb9\xb2\xb3456789 \x97() {}\\\xa2\x95 "; + assert!(s.len() == 256); + s +}; + +/// Returns the windows-1252 character corresponding to the given `portable` +/// character. +fn portable_to_windows_1252(portable: u8) -> u8 { + PORTABLE_TO_WINDOWS_1252[portable as usize] +} diff --git a/rust/pspp/src/por/read.rs b/rust/pspp/src/por/read.rs new file mode 100644 index 0000000000..ac726cae46 --- /dev/null +++ b/rust/pspp/src/por/read.rs @@ -0,0 +1,1223 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . 
+ +use std::{ + cmp::Ordering, + fmt::{Display, Formatter}, + fs::File, + io::{BufRead, BufReader, Error as IoError, Read, Result as IoResult, Seek, SeekFrom}, + ops::Index, + path::Path, +}; + +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use codepage_437::CP437_WINGDINGS; +use encoding_rs::WINDOWS_1252; +use indexmap::set::MutableValues; +use num::{Bounded, NumCast}; +use serde::{ser::SerializeSeq, Serialize, Serializer}; + +use crate::{ + data::{ByteString, Case, Datum, RawString, WithEncoding}, + dictionary::{DictIndex, Dictionary}, + format::{Error as FormatError, Format, Type, UncheckedFormat}, + identifier::{Error as IdError, Identifier}, + output::pivot::{MetadataEntry, MetadataValue, PivotTable, Value}, + por::portable_to_windows_1252, + variable::{MissingValueRange, MissingValues, MissingValuesError, VarType, VarWidth, Variable}, +}; +use displaydoc::Display; +use thiserror::Error as ThisError; + +/// An SPSS portable file. +#[derive(Debug)] +pub struct PortableFile { + /// The portable file dictionary. + pub dictionary: Dictionary, + + /// Portable file metadata that is not part of the dictionary. + pub metadata: Metadata, + + /// Data in the portable file. + pub cases: Cases>>, +} + +impl PortableFile { + /// Returns the individual parts of the [PortableFile]. + pub fn into_parts(self) -> (Dictionary, Metadata, Cases>>) { + (self.dictionary, self.metadata, self.cases) + } +} + +/// Portable file metadata that is not part of [Dictionary]. +#[derive(Clone, Debug, PartialEq, Eq, Serialize)] +pub struct Metadata { + /// Creation date and time. + /// + /// This comes from the file header, not from the file system. + pub creation: Option, + + /// Name of the product that wrote the file. + pub product: Option, + + /// Extended name of the product that wrote the file. + pub product_ext: Option, + + /// Identifies the organization licensed for the product that wrote the + /// file. 
+ pub author: Option, + + /// The file's embedded character encoding translation table. + #[serde(serialize_with = "serialize_character_set")] + pub character_set: [u8; 256], +} + +fn serialize_character_set(translations: &[u8; 256], serializer: S) -> Result +where + S: Serializer, +{ + let mut seq = serializer.serialize_seq(Some(256))?; + for (index, c) in translations.into_iter().enumerate() { + let windows_1252 = *c as char; + let cp_437 = CP437_WINGDINGS.decode(*c); + if windows_1252 == cp_437 { + seq.serialize_element(&(format!("{index:02x}"), windows_1252))?; + } else { + seq.serialize_element(&(format!("{index:02x}"), windows_1252, cp_437))?; + } + } + seq.end() +} + +impl From<&Metadata> for PivotTable { + fn from(value: &Metadata) -> Self { + fn maybe_string(name: &str, s: &Option) -> MetadataEntry { + MetadataEntry { + name: Value::new_user_text(name), + value: MetadataValue::Leaf( + s.as_ref() + .cloned() + .map(Value::new_user_text) + .unwrap_or_default(), + ), + } + } + + MetadataEntry { + name: Value::new_user_text("Portable File Metadata"), + value: MetadataValue::Group(vec![ + MetadataEntry { + name: Value::new_user_text("Created"), + value: MetadataValue::Leaf( + value.creation.map(Value::new_date_time).unwrap_or_default(), + ), + }, + maybe_string("Product", &value.product), + maybe_string("Product 2", &value.product_ext), + maybe_string("Author", &value.author), + ]), + } + .into_pivot_table() + } +} + +/// Reader for cases in a portable file. +#[derive(Debug)] +pub struct Cases { + reader: R, + variables: Vec, + eof: bool, +} + +impl Cases { + fn new(reader: R, variables: Vec) -> Self { + Self { + reader, + variables, + eof: false, + } + } + + fn read_case(&mut self) -> Result>>>, ErrorDetails> + where + R: Read, + { + let mut values = Vec::with_capacity(self.variables.len()); + + // Check whether we're at end of file. 
+ let peek = read_byte(&mut self.reader)?; + if peek == b'Z' { + return Ok(None); + } + + // We're not at EOF, so glue the lookahead byte onto the front of the + // reader and then read a case. + let peek = [peek]; + let mut reader = peek.chain(&mut self.reader); + for width in &self.variables { + match width { + VarWidth::Numeric => values.push(Datum::Number(read_f64_or_missing(&mut reader)?)), + VarWidth::String(width) => { + let mut string = read_raw_string(&mut reader)?; + string.resize(*width as usize, b' '); + values.push(Datum::String(string.into())); + } + } + } + Ok(Some(Case::new(values, WINDOWS_1252))) + } +} + +impl Iterator for Cases +where + R: Read + Seek, +{ + type Item = Result>>, Error>; + + fn next(&mut self) -> Option { + if self.eof || self.variables.is_empty() { + return None; + } + + match self.read_case().transpose() { + Some(Ok(case)) => Some(Ok(case)), + None => { + self.eof = true; + None + } + Some(Err(details)) => { + self.eof = true; + Some(Err(Error { + offset: self.reader.stream_position().ok(), + details, + })) + } + } + } +} + +/// An error encountered reading a portable file. +#[derive(Debug)] +pub struct Error { + /// Offset where the error occurred. + pub offset: Option, + + /// Details of the error. + pub details: ErrorDetails, +} + +impl std::error::Error for Error {} + +impl Error { + /// Constructs an error from `offset` and `details`. + pub fn new(offset: Option, details: ErrorDetails) -> Self { + Self { offset, details } + } +} + +impl From for Error { + fn from(value: IoError) -> Self { + Self::new(None, value.into()) + } +} + +impl Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if let Some(offset) = self.offset { + write!(f, "Error at file offset {:#x}: ", offset)?; + } + write!(f, "{}", &self.details) + } +} + +/// An error for reading a [PortableFile]. +#[derive(Display, ThisError, Debug)] +pub enum ErrorDetails { + /// Not an SPSS portable file. 
+ NotAPortableFile, + + /// Unrecognized version code '{0}'. + UnrecognizedVersionCode(char), + + /// I/O error ({0}). + Io(#[from] IoError), + + /// Number expected. + NumberExpected, + + /// Integer expected. + InvalidInteger, + + /// Expected integer between {min_value} and {max_value}, instead of {float}. + OutOfRangeInteger { + /// Value actually read. + float: f64, + /// Minimum valid integer value. + min_value: String, + /// Maximum valid integer value. + max_value: String, + }, + + /// Missing numeric terminator. + MissingSlash, + + /// Invalid string length {0}. + InvalidStringLength(i32), + + /// Expected variable count record with tag 4 (instead of tag {0:?}). + ExpectedVariableCountRecord(char), + + /// Invalid number of variables {0}. + InvalidNumberOfVariables(i32), + + /// Expected variable record. + ExpectedVariableRecord, + + /// Invalid width {width} for variable {name}. + InvalidVariableWidth { + /// Declared width. + width: i32, + /// Variable name. + name: Identifier, + }, + + /// System-missing value where number expected. + UnexpectedSysmis, + + /// Data record expected. + DataRecordExpected, + + /// Value label record had no valid variable indexes. + NoValueLabelVariables, +} + +/// A warning while reading a [PortableFile]. +#[derive(Display, ThisError, Debug)] +pub enum Warning { + /// Invalid date {0}. + InvalidDate(String), + + /// Invalid time {0}. + InvalidTime(String), + + /// Invalid variable name. + InvalidVariableName { + /// Identifier error. + id_error: IdError, + /// New name. + new_name: Identifier, + }, + + /// Renaming variable with duplicate name {duplicate_name} to {new_name}. + DuplicateVariableName { + /// Duplicate name. + duplicate_name: Identifier, + /// New name. + new_name: Identifier, + }, + + /// Substituting {new_format} for invalid print format on variable {variable}. {format_error} + InvalidPrintFormat { + /// New format. + new_format: Format, + /// Variable. + variable: Identifier, + /// Underlying error. 
+ format_error: FormatError, + }, + + /// Substituting {new_format} for invalid write format on variable {variable}. {format_error} + InvalidWriteFormat { + /// New format. + new_format: Format, + /// Variable. + variable: Identifier, + /// Underlying error. + format_error: FormatError, + }, + + /// Missing value range may not contain system-missing value. + MissingValueRangeSysmis, + + /// Invalid missing values for variable {name}: {error}. + InvalidMissingValues { + /// Variable name. + name: Identifier, + /// Kind of error with missing values. + error: MissingValuesError, + }, + + /// Unknown weight variable {0}. + UnknownWeightVariable(Identifier), + + /// Invalid identifier {string}. {error} + InvalidIdentifier { + /// String that should be an identifier. + string: String, + /// Kind of error with the string. + error: IdError, + }, + + /// Unknown variable name {0}. + UnknownVariableName(Identifier), + + /// Mixed variable types in value labels. + MixedVariableTypes, +} + +/// Translation table from file bytes to [WINDOWS_1252]. +/// +/// A byte in the file with value `x` is interpreted in [WINDOWS_1252] as +/// `self.0[x]`. +#[derive(Debug)] +pub struct TranslationTable( + /// Translation table. + [u8; 256], +); + +impl TranslationTable { + // Create the translation table, given the character set in a portable file. + fn new(character_set: &[u8; 256]) -> Self { + // Skip the first 64 characters of the character set. They are probably + // all set to '0', marking them as untranslatable, and that would screw + // up our actual translation of the real '0'. 
+ let mut translations = [0; 256]; + for portable in 64..=255 { + let c = character_set[portable] as usize; + if translations[c] == 0 { + translations[c] = portable_to_windows_1252(portable as u8); + } + } + Self(translations) + } +} + +impl Index for TranslationTable { + type Output = u8; + + fn index(&self, index: u8) -> &Self::Output { + &self.0[index as usize] + } +} + +impl PortableFile> { + /// Opens the file at `path`. + pub fn open_file(path: P, warn: F) -> Result + where + P: AsRef, + F: FnMut(Warning), + { + let reader = BufReader::new(File::open(path)?); + Self::open(reader, warn) + } +} + +impl PortableFile +where + R: Read + Seek, +{ + /// Reads `reader`, which should be in the SPSS portable file format. + /// Following the file header and character set, counts the incidence of + /// each byte value in the file. Returns a table with those counts, plus a + /// [TranslationTable] derived from the character set in the file header. + pub fn read_histogram(reader: R) -> Result<([usize; 256], TranslationTable), Error> + where + R: BufRead, + { + let mut reader = ReadPad::new(reader); + + // Read and ignore header. + reader.read_exact(&mut [0; 200])?; + let mut character_set = [0; 256]; + reader.read_exact(&mut character_set)?; + reader.read_exact(&mut [0; 8])?; + + let mut buf = [0; 4096]; + let mut histogram = [0; 256]; + loop { + let n = reader.read(&mut buf)?; + if n == 0 { + break; + } + + for c in buf[..n].iter().copied() { + histogram[c as usize] += 1; + } + } + Ok((histogram, TranslationTable::new(&character_set))) + } + + /// Opens `reader` as a portable file, invoking `warn` with any warnings + /// diagnosed while reading it. 
+ pub fn open(reader: R, mut warn: F) -> Result + where + F: FnMut(Warning), + { + fn read_inner( + mut reader: R, + mut warn: F, + character_set: [u8; 256], + ) -> Result<(Dictionary, Metadata), ErrorDetails> + where + R: Read + Seek, + F: FnMut(Warning), + { + let mut signature = [0; 8]; + reader.read_exact(&mut signature)?; + if &signature != b"SPSSPORT" { + return Err(ErrorDetails::NotAPortableFile); + } + let (c, metadata) = read_version(&mut reader, &mut warn, character_set)?; + let (mut c, mut dictionary) = read_variables(&mut reader, c, &mut warn)?; + while c == b'D' { + read_value_label(&mut reader, &mut dictionary, &mut warn)?; + c = read_byte(&mut reader)?; + } + if c == b'E' { + read_documents(&mut reader, &mut dictionary)?; + c = read_byte(&mut reader)?; + } + if c != b'F' { + return Err(ErrorDetails::DataRecordExpected); + } + Ok((dictionary, metadata)) + } + fn read_version( + mut reader: R, + mut warn: F, + character_set: [u8; 256], + ) -> Result<(u8, Metadata), ErrorDetails> + where + R: Read, + F: FnMut(Warning), + { + let byte = read_byte(&mut reader)?; + if byte != b'A' { + return Err(ErrorDetails::UnrecognizedVersionCode(byte as char)); + } + + let date = read_string(&mut reader)?; + let date = if date.len() == 8 + && date.is_ascii() + && let Ok(year) = date[..4].parse() + && let Ok(month) = date[4..6].parse() + && let Ok(day) = date[6..].parse() + && let Some(date) = NaiveDate::from_ymd_opt(year, month, day) + { + Some(date) + } else { + warn(Warning::InvalidDate(date)); + None + }; + let time = read_string(&mut reader)?; + let time = if let Ok(hms) = time.trim().parse::() + && let Some(time) = + NaiveTime::from_hms_opt(hms / 10000, (hms % 10000) / 100, hms % 100) + { + Some(time) + } else { + if !time.trim().is_empty() { + warn(Warning::InvalidTime(time)); + } + None + }; + let creation = date.map(|date| NaiveDateTime::new(date, time.unwrap_or_default())); + + let mut c = read_byte(&mut reader)?; + let product = if c == b'1' { + let product = 
read_string(&mut reader)?; + c = read_byte(&mut reader)?; + Some(product) + } else { + None + }; + let author = if c == b'2' { + let author = read_string(&mut reader)?; + c = read_byte(&mut reader)?; + Some(author) + } else { + None + }; + + let product_ext = if c == b'3' { + let product_ext = read_string(&mut reader)?; + c = read_byte(&mut reader)?; + Some(product_ext) + } else { + None + }; + + Ok(( + c, + Metadata { + creation, + product, + product_ext, + author, + character_set, + }, + )) + } + + fn read_format( + mut reader: R, + width: VarWidth, + warn: F, + ) -> Result + where + R: Read, + F: FnOnce(Format, FormatError), + { + let type_: u16 = read_integer(&mut reader)?; + let w: u16 = read_integer(&mut reader)?; + let d: u8 = read_integer(&mut reader)?; + Ok(Type::try_from(type_) + .map(|type_| UncheckedFormat { type_, w, d }) + .and_then(Format::try_from) + .and_then(|x| x.check_width_compatibility(width)) + .unwrap_or_else(|error| { + let new_format = Format::default_for_width(width); + warn(new_format, error); + new_format + })) + } + + fn read_variables( + mut reader: R, + mut c: u8, + mut warn: F, + ) -> Result<(u8, Dictionary), ErrorDetails> + where + R: Read + Seek, + F: FnMut(Warning), + { + let mut dictionary = Dictionary::new(WINDOWS_1252); + + if c != b'4' { + return Err(ErrorDetails::ExpectedVariableCountRecord(c as char)); + } + let n_vars: usize = read_integer(&mut reader)?; + + c = read_byte(&mut reader)?; + if c == b'5' { + let _ = read_f64(&mut reader)?; + c = read_byte(&mut reader)?; + } + let weight_name = if c == b'6' { + let weight_name = read_identifier(&mut reader, &mut warn)?; + c = read_byte(&mut reader)?; + weight_name + } else { + None + }; + + let mut n_generated_names = 0; + fn generate_name(dictionary: &Dictionary, n_generated_names: &mut usize) -> Identifier { + loop { + *n_generated_names = n_generated_names.checked_add(1).unwrap(); + let name = Identifier::from_encoding( + format!("VAR{:03}", *n_generated_names), + 
WINDOWS_1252, + ) + .unwrap(); + if !dictionary.variables.contains(&name.0) { + return name; + } + } + } + + for _ in 0..n_vars { + if c != b'7' { + return Err(ErrorDetails::ExpectedVariableRecord); + } + let width: u16 = read_integer(&mut reader)?; + let name = read_string(&mut reader)?; + let name = match Identifier::from_encoding(name, WINDOWS_1252) + .and_then(Identifier::must_be_ordinary) + { + Ok(name) => { + if !dictionary.variables.contains(&name.0) { + name + } else { + let new_name = generate_name(&dictionary, &mut n_generated_names); + warn(Warning::DuplicateVariableName { + duplicate_name: name.clone(), + new_name: new_name.clone(), + }); + new_name + } + } + Err(id_error) => { + let new_name = generate_name(&dictionary, &mut n_generated_names); + warn(Warning::InvalidVariableName { + id_error, + new_name: new_name.clone(), + }); + new_name + } + }; + let width = match width { + 0 => VarWidth::Numeric, + width => VarWidth::String(width as u16), + }; + + let print = read_format(&mut reader, width, |new_spec, format_error| { + warn(Warning::InvalidPrintFormat { + new_format: new_spec, + variable: name.clone(), + format_error, + }) + })?; + let write = read_format(&mut reader, width, |new_spec, format_error| { + warn(Warning::InvalidWriteFormat { + new_format: new_spec, + variable: name.clone(), + format_error, + }) + })?; + + c = read_byte(&mut reader)?; + let range = match c { + b'B' => Some(MissingValueRange::In { + low: read_f64(&mut reader)?, + high: read_f64(&mut reader)?, + }), + b'A' => Some(MissingValueRange::From { + low: read_f64(&mut reader)?, + }), + b'9' => Some(MissingValueRange::To { + high: read_f64(&mut reader)?, + }), + _ => None, + }; + if range.is_some() { + c = read_byte(&mut reader)?; + } + let mut values = Vec::new(); + while c == b'8' { + values.push(read_value(&mut reader, width.into())?); + c = read_byte(&mut reader)?; + } + let missing_values = MissingValues::new(values, range) + .inspect_err(|error| { + 
warn(Warning::InvalidMissingValues { + name: name.clone(), + error: *error, + }) + }) + .unwrap_or_default(); + + let label = if c == b'C' { + let label = read_string(&mut reader)?; + c = read_byte(&mut reader)?; + Some(label) + } else { + None + }; + + let mut variable = Variable::new(name, width, WINDOWS_1252); + variable.print_format = print; + variable.write_format = write; + if let Err(error) = variable.missing_values_mut().replace(missing_values) { + warn(Warning::InvalidMissingValues { + name: variable.name.clone(), + error, + }) + } + variable.label = label; + dictionary.add_var(variable).unwrap(); + } + + if let Some(weight_name) = weight_name { + if let Some(dict_index) = dictionary.variables.get_index_of(&weight_name.0) { + let _ = dictionary.set_weight(Some(dict_index)); + } else { + warn(Warning::UnknownWeightVariable(weight_name)) + } + } + Ok((c, dictionary)) + } + + fn read_value_label( + mut reader: R, + dictionary: &mut Dictionary, + mut warn: F, + ) -> Result<(), ErrorDetails> + where + R: Read, + F: FnMut(Warning), + { + let n_variables = read_integer(&mut reader)?; + let mut dict_indexes = Vec::with_capacity(n_variables); + let mut var_type = None; + for _ in 0..n_variables { + if let Some(dict_index) = read_variable_name(&mut reader, dictionary, &mut warn)? 
{ + let type_ = VarType::from(dictionary.variables[dict_index].width); + if var_type.is_none() { + var_type = Some(type_); + } else if var_type != Some(type_) { + warn(Warning::MixedVariableTypes); + continue; + } + dict_indexes.push(dict_index); + } + } + let Some(var_type) = var_type else { + return Err(ErrorDetails::NoValueLabelVariables); + }; + + let n_labels = read_integer(&mut reader)?; + for _ in 0..n_labels { + let value = read_value(&mut reader, var_type)?.without_encoding(); + let label = read_string(&mut reader)?; + for dict_index in dict_indexes.iter().copied() { + dictionary + .variables + .get_index_mut2(dict_index) + .unwrap() + .value_labels + .insert(value.clone(), label.clone()); + } + } + Ok(()) + } + + fn read_documents(mut reader: R, dictionary: &mut Dictionary) -> Result<(), ErrorDetails> + where + R: Read, + { + let n_lines: usize = read_integer(&mut reader)?; + for _ in 0..n_lines { + dictionary.documents.push(read_string(&mut reader)?); + } + Ok(()) + } + + let mut reader = ReadPad::new(reader); + + // Read and ignore vanity splash strings. + reader.read_exact(&mut [0; 200])?; + + // Read the character set. 
+ let mut character_set = [0; 256]; + reader.read_exact(&mut character_set)?; + let translations = TranslationTable::new(&character_set); + + let mut reader = ReadTranslate::new(reader, translations); + let (dictionary, metadata) = + read_inner(&mut reader, &mut warn, character_set).map_err(|details| Error { + offset: reader.stream_position().ok(), + details, + })?; + let variables = dictionary.variables.iter().map(|var| var.width).collect(); + Ok(PortableFile { + dictionary, + metadata, + cases: Cases::new(reader, variables), + }) + } +} + +fn read_raw_string(mut reader: R) -> Result, ErrorDetails> +where + R: Read, +{ + let n: u16 = read_integer(&mut reader)?; + let mut vec = vec![0u8; n as usize]; + reader.read_exact(&mut vec)?; + Ok(vec) +} + +fn read_string(reader: R) -> Result +where + R: Read, +{ + // This `unwrap()` can't panic because the translation table only + // translates to ASCII characters + Ok(String::from_utf8(read_raw_string(reader)?).unwrap()) +} + +fn read_identifier(reader: R, mut warn: F) -> Result, ErrorDetails> +where + R: Read, + F: FnMut(Warning), +{ + let string = read_string(reader)?; + match Identifier::from_encoding(string.clone(), WINDOWS_1252) { + Ok(identifier) => Ok(Some(identifier)), + Err(error) => { + warn(Warning::InvalidIdentifier { string, error }); + Ok(None) + } + } +} + +fn read_variable_name( + reader: R, + dictionary: &Dictionary, + mut warn: F, +) -> Result, ErrorDetails> +where + R: Read, + F: FnMut(Warning), +{ + let Some(var_name) = read_identifier(reader, &mut warn)? 
else { + return Ok(None); + }; + let dict_index = dictionary.variables.get_index_of(&var_name.0); + if dict_index.is_none() { + warn(Warning::UnknownVariableName(var_name)); + } + Ok(dict_index) +} + +fn read_integer(reader: R) -> Result +where + R: Read, + T: NumCast + Bounded + Display, +{ + let float = read_f64(reader)?; + if float.trunc() == float && float >= i64::MIN as f64 && float <= i64::MAX as f64 { + if let Some(integer) = num::cast(float) { + Ok(integer) + } else { + Err(ErrorDetails::OutOfRangeInteger { + float, + min_value: T::min_value().to_string(), + max_value: T::max_value().to_string(), + }) + } + } else { + Err(ErrorDetails::InvalidInteger) + } +} + +fn read_value( + reader: R, + var_type: VarType, +) -> Result>, ErrorDetails> +where + R: Read, +{ + match var_type { + VarType::Numeric => Ok(Datum::Number(read_f64_or_missing(reader)?)), + VarType::String => Ok(Datum::String( + ByteString::from(Vec::from(read_string(reader)?)).with_encoding(WINDOWS_1252), + )), + } +} + +fn read_f64(reader: R) -> Result +where + R: Read, +{ + match read_f64_or_missing(reader)? { + Some(value) => Ok(value), + None => Err(ErrorDetails::UnexpectedSysmis), + } +} + +fn read_f64_or_missing(mut reader: R) -> Result, ErrorDetails> +where + R: Read, +{ + let mut c = read_byte(&mut reader)?; + while c == b' ' { + c = read_byte(&mut reader)?; + } + if c == b'*' { + let _ = read_byte(&mut reader)?; + return Ok(None); + } + let negative = if c == b'-' { + c = read_byte(&mut reader)?; + true + } else { + false + }; + let mut significand = 0; + let mut exponent = 0i32; + let mut saw_dot = false; + let mut saw_digit = false; + loop { + if let Some(digit) = (c as char).to_digit(30) { + saw_digit = true; + if significand >= u64::MAX / 30 - 30 { + // The value of the digit doesn't matter, since we have already + // recorded more digits as can be represented in `f64`. + // We just need to record that there was another digit so that + // we can multiply by 30 later. 
+ exponent += 1; + } else { + significand = significand * 30 + digit as u64; + } + + if saw_dot { + exponent -= 1; + } + } else if c == b'.' && !saw_dot { + saw_dot = true; + } else { + break; + } + + c = read_byte(&mut reader)?; + } + if !saw_digit { + return Err(ErrorDetails::NumberExpected); + } + + if c == b'+' || c == b'-' { + let exp_sign = c; + let mut exp = 0i32; + c = read_byte(&mut reader)?; + while let Some(digit) = (c as char).to_digit(30) { + exp = exp * 30 + digit as i32; + c = read_byte(&mut reader)?; + } + if exp_sign == b'+' { + exponent -= exp; + } else { + exponent += exp; + } + } + + if c != b'/' { + return Err(ErrorDetails::MissingSlash); + } + + let significand = significand as f64; + let num = match exponent.cmp(&0) { + Ordering::Less => significand * 30.0f64.powi(exponent), + Ordering::Equal => significand, + Ordering::Greater if significand > f64::MAX * 30.0f64.powi(-exponent) => f64::MAX, + Ordering::Greater => significand * 30.0f64.powi(exponent), + }; + Ok(Some(if negative { -num } else { num })) +} + +fn read_byte(mut reader: R) -> IoResult +where + R: Read, +{ + let mut byte = 0; + reader.read_exact(std::slice::from_mut(&mut byte))?; + Ok(byte) +} + +/// A [Read] wrapper that translates the bytes it reads using a +/// [TranslationTable]. +#[derive(Debug)] +pub struct ReadTranslate { + inner: R, + translations: TranslationTable, +} + +impl ReadTranslate { + /// Create a new [ReadTranslate] with `inner` and `translations`. + pub fn new(inner: R, translations: TranslationTable) -> Self { + Self { + inner, + translations, + } + } + + /// Consumes this [ReadTranslate], returning the inner reader. 
+ pub fn into_inner(self) -> R { + self.inner + } +} + +impl Read for ReadTranslate +where + R: Read, +{ + fn read(&mut self, buf: &mut [u8]) -> IoResult { + let n = self.inner.read(buf)?; + for c in &mut buf[..n] { + *c = self.translations[*c]; + } + Ok(n) + } +} + +impl Seek for ReadTranslate +where + R: Seek, +{ + fn seek(&mut self, pos: SeekFrom) -> IoResult { + self.inner.seek(pos) + } +} + +/// A [Read] wrapper that skips newlines and pads lines to 80 bytes with spaces. +#[derive(Debug)] +pub struct ReadPad { + inner: R, + at_newline: bool, + line_length: usize, +} + +impl ReadPad { + /// Constructs a [ReadPad] wrapper for `inner`. + pub fn new(inner: R) -> Self { + Self { + inner, + at_newline: false, + line_length: 0, + } + } + + /// Consumes this [ReadPad], returning the inner reader. + pub fn into_inner(self) -> R { + self.inner + } +} + +impl Read for ReadPad +where + R: Read, +{ + fn read(&mut self, buf: &mut [u8]) -> IoResult { + for (i, c) in buf.into_iter().enumerate() { + if self.at_newline { + *c = b' '; + self.line_length += 1; + if self.line_length >= 80 { + self.at_newline = false; + self.line_length = 0; + } + } else { + loop { + match self.inner.read(std::slice::from_mut(c)) { + Ok(1) => (), + other => return if i > 0 { Ok(i) } else { other }, + }; + match *c { + b'\r' => continue, + b'\n' => match self.line_length { + 80.. 
=> { + self.line_length = 0; + continue; + } + 79 => { + self.line_length = 0; + *c = b' '; + break; + } + 0..79 => { + self.at_newline = true; + self.line_length += 1; + *c = b' '; + break; + } + }, + _ => { + self.line_length += 1; + break; + } + } + } + } + } + Ok(buf.len()) + } +} + +impl Seek for ReadPad +where + R: Seek, +{ + fn seek(&mut self, pos: SeekFrom) -> IoResult { + self.inner.seek(pos) + } +} + +#[cfg(test)] +mod tests { + use std::{ + io::{BufRead, BufReader, Cursor}, + path::Path, + }; + + use itertools::Itertools; + + use crate::{ + data::cases_to_output, + output::{ + pivot::{tests::assert_lines_eq, PivotTable}, + Details, Item, Text, + }, + por::{PortableFile, ReadPad}, + }; + + #[test] + fn read_wrapper() { + let mut lines = BufReader::new(ReadPad::new(Cursor::new( + b"abcdefghijklmnop\r\n0123456789\r\n", + ))) + .lines(); + assert_eq!(lines.next().unwrap().unwrap(), "abcdefghijklmnop 0123456789 "); + } + + fn test_porfile(name: &str) { + let base_filename = Path::new("src/por/testdata").join(name); + let input_filename = base_filename.with_extension("por"); + let expected_filename = base_filename.with_extension("expected"); + + let mut warnings = Vec::new(); + let output = match PortableFile::open_file(input_filename, |warning| warnings.push(warning)) + { + Ok(portable_file) => { + let (dictionary, metadata, cases) = portable_file.into_parts(); + + let mut output = Vec::new(); + output.extend( + warnings + .into_iter() + .map(|warning| Item::from(Text::new_log(warning.to_string()))), + ); + output.push(PivotTable::from(&metadata).into()); + output.extend(dictionary.all_pivot_tables().into_iter().map_into()); + output.extend(cases_to_output(&dictionary, cases)); + Item::new(Details::Group(output.into_iter().map_into().collect())) + } + Err(error) => Item::new(Details::Text(Box::new(Text::new_log(error.to_string())))), + }; + + let actual = output.to_string(); + let expected = std::fs::read_to_string(&expected_filename).unwrap(); + if expected 
!= actual { + if std::env::var("PSPP_REFRESH_EXPECTED").is_ok() { + std::fs::write(&expected_filename, actual).unwrap(); + panic!("{}: refreshed output", expected_filename.display()); + } else { + eprintln!("note: rerun with PSPP_REFRESH_EXPECTED=1 to refresh expected output"); + } + } + assert_lines_eq(&expected, expected_filename.display(), &actual, "actual"); + } + + #[test] + fn porfile_test1() { + test_porfile("test1"); + } + + #[test] + fn porfile_test2() { + test_porfile("test2"); + } +} diff --git a/rust/pspp/src/por/testdata/README.md b/rust/pspp/src/por/testdata/README.md new file mode 100644 index 0000000000..c48d1d338e --- /dev/null +++ b/rust/pspp/src/por/testdata/README.md @@ -0,0 +1,3 @@ +The two .por files in this directory are old ones found on the +Internet. `test1.por` self-identifies as a `PFF TEST FILE`. They do +not contain any personally identifying information. diff --git a/rust/pspp/src/por/testdata/test1.expected b/rust/pspp/src/por/testdata/test1.expected new file mode 100644 index 0000000000..9105615e6f --- /dev/null +++ b/rust/pspp/src/por/testdata/test1.expected @@ -0,0 +1,151 @@ +╭─────────┬──────────────────────────────────╮ +│Created │ 15-FEB-1986 14:48:43│ +│Product │SPSS-X RELEASE 2.1 FOR IBM VM/CMS│ +│Product 2│PFF TEST FILE │ +│Author │HARVARD UNIV COMPUTING CENTER │ +╰─────────┴──────────────────────────────────╯ + +╭─────────┬─╮ +│Variables│9│ +╰─────────┴─╯ + + Variables +╭─────────────────────────┬────────┬─────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│ Label │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├─────────────────────────┼────────┼─────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│SATISFACTION WITH HOUSING│ 1│SATISFACTION WITH HOUSING│ │Input│ 8│Right │F1.0 │F1.0 │0; 9 │ +│SATISFACTION NEIGHBORHOOD│ 2│SATISFACTION NEIGHBORHOOD│ │Input│ 8│Right 
│F1.0 │F1.0 │0; 9 │ +│RESPOND. EDUCATION │ 3│RESPOND. EDUCATION │ │Input│ 8│Right │F2.0 │F2.0 │0; 99 │ +│INCOME IN 1984 │ 4│INCOME IN 1984 │ │Input│ 8│Right │F3.0 │F3.0 │0; 999 │ +│STANDARD OF LIVING │ 5│STANDARD OF LIVING │ │Input│ 8│Right │F1.0 │F1.0 │0; 9 │ +│RESPONDENT RACE │ 6│RESPONDENT RACE │ │Input│ 8│Right │F1.0 │F1.0 │0; 9 │ +│RESPONDENT SEX │ 7│RESPONDENT SEX │ │Input│ 8│Right │F1.0 │F1.0 │0; 9 │ +│RESPONDENT AGE │ 8│RESPONDENT AGE │ │Input│ 8│Right │F2.0 │F2.0 │0; 99 │ +│R MARITAL STATUS │ 9│R MARITAL STATUS │ │Input│ 8│Right │F1.0 │F1.0 │0; 9 │ +╰─────────────────────────┴────────┴─────────────────────────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + + Value Labels +╭───────────────────────────┬──────────╮ +│Variable Value │ │ +├───────────────────────────┼──────────┤ +│SATISFACTION WITH HOUSING 1│VERY │ +│ 2│FAIRLY │ +│ 3│NOT VERY │ +├───────────────────────────┼──────────┤ +│SATISFACTION NEIGHBORHOOD 1│VERY │ +│ 2│FAIRLY │ +│ 3│NOT VERY │ +├───────────────────────────┼──────────┤ +│RESPOND. EDUCATION 1│NONE │ +│ 2│< 8 YRS │ +│ 3│C GRD SC │ +│ 4│SOME HS │ +│ 5│CMPL.H.S. │ +│ 6│1-3 COLL │ +│ 7│COLL DEG │ +│ 8│MASTERS │ +│ 9│PHD-M.D. 
│ +├───────────────────────────┼──────────┤ +│STANDARD OF LIVING 1│PROSP │ +│ 2│VERY COMF │ +│ 3│REAS COMF │ +│ 4│GET BY │ +│ 5│NEAR POOR │ +│ 6│POOR │ +├───────────────────────────┼──────────┤ +│RESPONDENT RACE 1│BLACK │ +│ 2│HISPANIC │ +│ 3│WHITE │ +│ 4│ASIAN │ +│ 5│AM IND │ +│ 8│OTHER │ +├───────────────────────────┼──────────┤ +│RESPONDENT SEX 1│MALE │ +│ 2│FEMALE │ +├───────────────────────────┼──────────┤ +│R MARITAL STATUS 1│MARRIED │ +│ 2│DIVORCED │ +│ 3│SEPARATD │ +│ 4│WIDOWED │ +│ 5│NEVER MARR│ +╰───────────────────────────┴──────────╯ + +╭────┬─────────────────────────┬─────────────────────────┬──────────────────┬──────────────┬──────────────────┬───────────────┬──────────────┬──────────────┬────────────────╮ +│Case│SATISFACTION WITH HOUSING│SATISFACTION NEIGHBORHOOD│RESPOND. EDUCATION│INCOME IN 1984│STANDARD OF LIVING│RESPONDENT RACE│RESPONDENT SEX│RESPONDENT AGE│R MARITAL STATUS│ +├────┼─────────────────────────┼─────────────────────────┼──────────────────┼──────────────┼──────────────────┼───────────────┼──────────────┼──────────────┼────────────────┤ +│1 │ 2.00│ 1.00│ 5.00│ 50.00│ 3.00│ 3.00│ 2.00│ 29.00│ 1.00│ +│2 │ 2.00│ 1.00│ 7.00│ 25.00│ 3.00│ 3.00│ 1.00│ 26.00│ 5.00│ +│3 │ 1.00│ 2.00│ 6.00│ 35.00│ 3.00│ 3.00│ 2.00│ 53.00│ 1.00│ +│4 │ 2.00│ 2.00│ 6.00│ 25.00│ 4.00│ 3.00│ 2.00│ 23.00│ 3.00│ +│5 │ 2.00│ 1.00│ 6.00│ 11.00│ 3.00│ 3.00│ 2.00│ 21.00│ 5.00│ +│6 │ 1.00│ 1.00│ 5.00│ 12.00│ 3.00│ 3.00│ 1.00│ .03│ 5.00│ +│7 │ 2.00│ 2.00│ 8.00│ 22.00│ 3.00│ 3.00│ 2.00│ 42.00│ 2.00│ +│8 │ 1.00│ 1.00│ 7.00│ 35.00│ 3.00│ 3.00│ 2.00│ 35.00│ 1.00│ +│9 │ 2.00│ 2.00│ 7.00│ .03│ 3.00│ 3.00│ 1.00│ .03│ 5.00│ +│10 │ 1.00│ 1.00│ 7.00│ 8.00│ 2.00│ 3.00│ 1.00│ 22.00│ 5.00│ +│11 │ 1.00│ 1.00│ 5.00│ 18.00│ 4.00│ 3.00│ 2.00│ 55.00│ 4.00│ +│12 │ 1.00│ 1.00│ 7.00│ 45.00│ 3.00│ 3.00│ 2.00│ 56.00│ 1.00│ +│13 │ 3.00│ 3.00│ 5.00│ 2.00│ 4.00│ 1.00│ 2.00│ 24.00│ 5.00│ +│14 │ 3.00│ 2.00│ 7.00│ .07│ 3.00│ 3.00│ 1.00│ 42.00│ 1.00│ +│15 │ 1.00│ 1.00│ 5.00│ 35.00│ 2.00│ 3.00│ 2.00│ .03│ 
1.00│ +│16 │ 1.00│ 1.00│ 8.00│ 12.00│ 4.00│ 3.00│ 1.00│ 29.00│ 5.00│ +│17 │ 1.00│ 1.00│ 6.00│ 25.00│ 3.00│ 3.00│ 1.00│ 55.00│ 1.00│ +│18 │ 1.00│ 1.00│ 5.00│ 20.00│ 4.00│ 9.00│ 2.00│ .07│ 4.00│ +│19 │ 1.00│ 1.00│ 6.00│ 42.00│ 4.00│ 3.00│ 1.00│ 51.00│ 4.00│ +│20 │ 1.00│ 1.00│ 6.00│ 70.00│ 2.00│ 3.00│ 1.00│ 33.00│ 1.00│ +│21 │ 1.00│ 2.00│ 6.00│ 2.00│ 2.00│ 3.00│ 2.00│ 20.00│ 5.00│ +│22 │ 3.00│ 1.00│ 9.00│ 20.00│ 3.00│ 4.00│ 2.00│ 32.00│ 3.00│ +│23 │ 2.00│ 1.00│ 6.00│ 13.00│ 4.00│ 3.00│ 2.00│ 33.00│ 1.00│ +│24 │ 3.00│ 3.00│ 8.00│ 7.00│ 4.00│ 3.00│ 2.00│ 28.00│ 5.00│ +│25 │ 2.00│ 2.00│ 5.00│ 35.00│ 3.00│ 3.00│ 1.00│ 51.00│ 1.00│ +│26 │ 1.00│ 1.00│ 6.00│ 40.00│ 3.00│ 3.00│ 1.00│ 59.00│ 1.00│ +│27 │ 2.00│ 1.00│ 5.00│ 46.00│ 4.00│ 3.00│ 1.00│ 47.00│ 1.00│ +│28 │ 2.00│ 2.00│ 5.00│ 25.00│ 2.00│ 3.00│ 1.00│ 28.00│ 1.00│ +│29 │ 2.00│ 1.00│ 8.00│ 50.00│ 3.00│ 3.00│ 1.00│ 39.00│ 2.00│ +│30 │ 3.00│ 2.00│ 5.00│ 6.00│ 4.00│ 3.00│ 2.00│ 86.00│ 4.00│ +│31 │ 2.00│ 2.00│ 6.00│ .03│ 2.00│ 3.00│ 2.00│ 24.00│ 5.00│ +│32 │ 2.00│ 1.00│ 7.00│ 25.00│ 4.00│ 3.00│ 2.00│ 38.00│ 1.00│ +│33 │ 1.00│ 1.00│ 5.00│ 8.00│ 3.00│ 3.00│ 1.00│ 18.00│ 5.00│ +│34 │ 1.00│ 1.00│ 8.00│ 40.00│ 3.00│ 3.00│ 2.00│ 58.00│ 1.00│ +│35 │ 2.00│ 2.00│ 7.00│ 25.00│ 3.00│ 3.00│ 2.00│ .03│ 5.00│ +│36 │ 1.00│ 1.00│ 5.00│ 20.00│ 3.00│ 3.00│ 2.00│ 28.00│ 5.00│ +│37 │ 1.00│ 1.00│ 2.00│ 15.00│ 3.00│ 8.00│ 2.00│ 49.00│ 1.00│ +│38 │ 2.00│ 1.00│ 7.00│ 35.00│ 3.00│ 3.00│ 2.00│ 29.00│ 1.00│ +│39 │ 2.00│ 2.00│ 7.00│ 1.00│ 4.00│ 3.00│ 2.00│ 35.00│ 5.00│ +│40 │ 2.00│ 1.00│ 6.00│ 11.00│ 4.00│ 3.00│ 2.00│ 48.00│ 2.00│ +│41 │ 1.00│ 1.00│ 6.00│ 20.00│ 3.00│ 3.00│ 2.00│ 67.00│ 1.00│ +│42 │ 1.00│ 2.00│ 5.00│ 15.00│ 4.00│ 3.00│ 2.00│ 39.00│ 3.00│ +│43 │ 2.00│ 2.00│ 6.00│ .03│ 3.00│ 3.00│ 1.00│ 32.00│ 1.00│ +│44 │ 1.00│ 1.00│ 5.00│ 42.00│ 3.00│ 3.00│ 2.00│ 44.00│ 1.00│ +│45 │ 3.00│ 3.00│ 5.00│ 16.00│ 3.00│ 3.00│ 2.00│ 29.00│ 2.00│ +│46 │ 1.00│ 1.00│ 7.00│ 14.00│ 3.00│ 3.00│ 1.00│ 42.00│ 5.00│ +│47 │ 1.00│ 1.00│ 7.00│ 20.00│ 3.00│ 3.00│ 2.00│ 
36.00│ 2.00│ +│48 │ 2.00│ 2.00│ 7.00│ 45.00│ 2.00│ 3.00│ 2.00│ 33.00│ 1.00│ +│49 │ 2.00│ 2.00│ 5.00│ .03│ 2.00│ 3.00│ 2.00│ 32.00│ 1.00│ +│50 │ 2.00│ 2.00│ 9.00│ 9.00│ 3.00│ 3.00│ 1.00│ 24.00│ 5.00│ +│51 │ 1.00│ 2.00│ 6.00│ 35.00│ 3.00│ 3.00│ 2.00│ 41.00│ 1.00│ +│52 │ 1.00│ 1.00│ 5.00│ .03│ 3.00│ 3.00│ 2.00│ 29.00│ 1.00│ +│53 │ 1.00│ 1.00│ 5.00│ .07│ 2.00│ 3.00│ 2.00│ 64.00│ 1.00│ +│54 │ 1.00│ 1.00│ 7.00│ 20.00│ 4.00│ 3.00│ 1.00│ .03│ 5.00│ +│55 │ 1.00│ 1.00│ 5.00│ 3.00│ 3.00│ 8.00│ 1.00│ 21.00│ 5.00│ +│56 │ 1.00│ 2.00│ 4.00│ 20.00│ 2.00│ 3.00│ 1.00│ 22.00│ 5.00│ +│57 │ 1.00│ 1.00│ 6.00│ 45.00│ 3.00│ 3.00│ 2.00│ 38.00│ 1.00│ +│58 │ 1.00│ 1.00│ 7.00│ .03│ 3.00│ 3.00│ 2.00│ .03│ 5.00│ +│59 │ 2.00│ 1.00│ 7.00│ 10.00│ 3.00│ 3.00│ 2.00│ 25.00│ 5.00│ +│60 │ 1.00│ 1.00│ 6.00│ 11.00│ 4.00│ 3.00│ 2.00│ 38.00│ 2.00│ +│61 │ 3.00│ 2.00│ 5.00│ 15.00│ 5.00│ 1.00│ 2.00│ 34.00│ 2.00│ +│62 │ 1.00│ 1.00│ 3.00│ 16.00│ 4.00│ 3.00│ 1.00│ 48.00│ 2.00│ +│63 │ 1.00│ 1.00│ 5.00│ 20.00│ 4.00│ 3.00│ 2.00│ 37.00│ 4.00│ +│64 │ 1.00│ 2.00│ 7.00│ 14.00│ 3.00│ 3.00│ 2.00│ 25.00│ 5.00│ +│65 │ 1.00│ 2.00│ 6.00│ .10│ 2.00│ 3.00│ 1.00│ 57.00│ 1.00│ +│66 │ 3.00│ 2.00│ 8.00│ 15.00│ 3.00│ 3.00│ 2.00│ 42.00│ 3.00│ +│67 │ 1.00│ 2.00│ 8.00│ 80.00│ 3.00│ 3.00│ 1.00│ 46.00│ 1.00│ +│68 │ 3.00│ 2.00│ 5.00│ 8.00│ 5.00│ 3.00│ 2.00│ 58.00│ 4.00│ +│69 │ 1.00│ 2.00│ 7.00│ 20.00│ 3.00│ 3.00│ 2.00│ 25.00│ 5.00│ +│70 │ 1.00│ 2.00│ 5.00│ .03│ 4.00│ 3.00│ 2.00│ 37.00│ 1.00│ +│71 │ 1.00│ 1.00│ 7.00│ 25.00│ 2.00│ 3.00│ 2.00│ 27.00│ 5.00│ +│72 │ 1.00│ 1.00│ 5.00│ 13.00│ 4.00│ 3.00│ 2.00│ 42.00│ 2.00│ +│73 │ 2.00│ 3.00│ 7.00│ 40.00│ 3.00│ 3.00│ 2.00│ 32.00│ 1.00│ +│74 │ 2.00│ 2.00│ 5.00│ 40.00│ 3.00│ 3.00│ 1.00│ 63.00│ 1.00│ +│75 │ 2.00│ 1.00│ 5.00│ 70.00│ 3.00│ 3.00│ 2.00│ 50.00│ 1.00│ +╰────┴─────────────────────────┴─────────────────────────┴──────────────────┴──────────────┴──────────────────┴───────────────┴──────────────┴──────────────┴────────────────╯ diff --git a/rust/pspp/src/por/testdata/test1.por 
b/rust/pspp/src/por/testdata/test1.por new file mode 100644 index 0000000000..d1ce02ab3b Binary files /dev/null and b/rust/pspp/src/por/testdata/test1.por differ diff --git a/rust/pspp/src/por/testdata/test2.expected b/rust/pspp/src/por/testdata/test2.expected new file mode 100644 index 0000000000..181f96f0ba --- /dev/null +++ b/rust/pspp/src/por/testdata/test2.expected @@ -0,0 +1,64 @@ +Invalid date 20040931. + +╭───────┬─────────────╮ +│Product│STAT/TRANSFER│ +╰───────┴─────────────╯ + +╭─────────┬─╮ +│Variables│4│ +╰─────────┴─╯ + + Variables +╭───────────────────────────┬────────┬───────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│ Label │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├───────────────────────────┼────────┼───────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│ALCOHOL (C) │ 1│ALCOHOL (C) │ │Input│ 8│Right │F9.2 │F9.2 │ │ +│SELF-ESTEEM (SE) │ 2│SELF-ESTEEM (SE) │ │Input│ 8│Right │F9.2 │F9.2 │ │ +│BLOOD ALCOHOL CONTENT (BAC)│ 3│BLOOD ALCOHOL CONTENT (BAC)│ │Input│ 8│Right │F9.2 │F9.2 │ │ +│SELF-DISCLOSURE │ 4│SELF-DISCLOSURE │ │Input│ 8│Right │F9.2 │F9.2 │ │ +╰───────────────────────────┴────────┴───────────────────────────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +╭────┬───────────┬────────────────┬───────────────────────────┬───────────────╮ +│Case│ALCOHOL (C)│SELF-ESTEEM (SE)│BLOOD ALCOHOL CONTENT (BAC)│SELF-DISCLOSURE│ +├────┼───────────┼────────────────┼───────────────────────────┼───────────────┤ +│1 │ .00│ 3.10│ 4.20│ 3.70│ +│2 │ .00│ 2.50│ 1.50│ 5.70│ +│3 │ .00│ 3.00│ 2.00│ 4.40│ +│4 │ .00│ 2.00│ 2.20│ 3.20│ +│5 │ .00│ 3.20│ 1.00│ 6.90│ +│6 │ .00│ 1.80│ 1.70│ 3.50│ +│7 │ .00│ 2.30│ 2.70│ 6.40│ +│8 │ .00│ 2.40│ 2.60│ 5.80│ +│9 │ .00│ 1.00│ 2.80│ 4.40│ +│10 │ .00│ 3.20│ 3.00│ 5.80│ +│11 │ .00│ 1.00│ 3.70│ 3.80│ +│12 │ .00│ 4.70│ 3.30│ 
5.60│ +│13 │ .00│ 2.70│ 2.90│ 5.00│ +│14 │ .00│ 4.70│ 3.70│ 7.00│ +│15 │ .00│ 1.80│ 2.30│ 4.00│ +│16 │ .00│ 2.20│ 3.50│ 3.60│ +│17 │ .00│ 3.60│ 2.20│ 5.90│ +│18 │ .00│ 3.90│ 3.40│ 5.00│ +│19 │ .00│ 3.70│ 1.40│ 5.70│ +│20 │ .00│ 2.90│ 2.90│ 3.40│ +│21 │ 1.00│ 2.40│ 5.30│ 5.30│ +│22 │ 1.00│ 1.70│ 5.40│ 5.90│ +│23 │ 1.00│ 4.10│ 6.20│ 5.10│ +│24 │ 1.00│ 3.60│ 4.50│ 5.90│ +│25 │ 1.00│ 5.00│ 6.90│ 5.00│ +│26 │ 1.00│ 2.70│ 2.30│ 6.90│ +│27 │ 1.00│ 3.60│ 5.60│ 6.30│ +│28 │ 1.00│ 2.60│ 5.50│ 3.90│ +│29 │ 1.00│ 3.50│ 4.50│ 5.90│ +│30 │ 1.00│ 3.30│ 3.00│ 5.90│ +│31 │ 1.00│ 3.40│ 5.10│ 4.80│ +│32 │ 1.00│ 2.80│ 4.00│ 4.80│ +│33 │ 1.00│ 2.10│ 4.60│ 5.80│ +│34 │ 1.00│ 3.30│ 6.70│ 6.20│ +│35 │ 1.00│ 3.30│ 4.90│ 7.50│ +│36 │ 1.00│ 3.00│ 5.50│ 5.50│ +│37 │ 1.00│ 3.50│ 4.60│ 7.00│ +│38 │ 1.00│ 4.50│ 5.70│ 6.40│ +│39 │ 1.00│ 2.60│ 6.20│ 6.00│ +│40 │ 1.00│ 4.60│ 5.10│ 4.50│ +╰────┴───────────┴────────────────┴───────────────────────────┴───────────────╯ diff --git a/rust/pspp/src/por/testdata/test2.por b/rust/pspp/src/por/testdata/test2.por new file mode 100644 index 0000000000..fdcf5b1574 --- /dev/null +++ b/rust/pspp/src/por/testdata/test2.por @@ -0,0 +1,15 @@ +ASCII SPSS PORT FILE ASCII SPSS PORT FILE +ASCII SPSS PORT FILE ASCII SPSS PORT FILE +ASCII SPSS PORT FILE 0000000000000000000000000000000000000000 +0000000000000000000000000123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrst +uvwxyz .<(+0&[]!$*);^-/|,%_>?`:#@'="000000~000000000000000000000{}\0000000000000 +00000000000000000000000000000000000000000000000000000000SPSSPORTA8/200409316/ + 1D/STAT/TRANSFER44/5A/70/1/C5/9/2/5/9/2/CB/ALCOHOL (C)70/2/SE5/9/2/5/9/2/CG/S +ELF-ESTEEM (SE)70/3/BAC5/9/2/5/9/2/CR/BLOOD ALCOHOL CONTENT (BAC)70/5/SELFD5/9/2 +/5/9/2/CF/SELF-DISCLOSUREF0/3.3/4.6/3.L/0/2.F/1.F/5.L/0/3/2/4.C/0/2/2.6/3.6/0/3. +6/1/6.R/0/1.O/1.L/3.F/0/2.9/2.L/6.C/0/2.C/2.I/5.O/0/1/2.O/4.C/0/3.6/3/5.O/0/1/3. 
+L/3.O/0/4.L/3.9/5.I/0/2.L/2.R/5/0/4.L/3.L/7/0/1.O/2.9/4/0/2.6/3.F/3.I/0/3.I/2.6/ +5.R/0/3.R/3.C/5/0/3.L/1.C/5.L/0/2.R/2.R/3.C/1/2.C/5.9/5.9/1/1.L/5.C/5.R/1/4.3/6. +6/5.3/1/3.I/4.F/5.R/1/5/6.R/5/1/2.L/2.9/6.R/1/3.I/5.I/6.9/1/2.I/5.F/3.R/1/3.F/4. +F/5.R/1/3.9/3/5.R/1/3.C/5.3/4.O/1/2.O/4/4.O/1/2.3/4.I/5.O/1/3.9/6.L/6.6/1/3.9/4. +R/7.F/1/3/5.F/5.F/1/3.F/4.I/7/1/4.F/5.L/6.C/1/2.I/6.6/6/1/4.I/5.3/4.F/ZZZZZZZZZZ \ No newline at end of file diff --git a/rust/pspp/src/por/write.rs b/rust/pspp/src/por/write.rs new file mode 100644 index 0000000000..15aad13820 --- /dev/null +++ b/rust/pspp/src/por/write.rs @@ -0,0 +1,1308 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +use std::{ + borrow::Cow, + cmp::Ordering, + collections::HashMap, + fmt::{Display, Write as _}, + fs::File, + io::{BufWriter, Error, Write}, + path::Path, +}; + +use chrono::{Local, NaiveDateTime}; +use libm::frexp; +use smallvec::SmallVec; + +use crate::{ + data::{Datum, RawString}, + dictionary::Dictionary, + por::PORTABLE_TO_WINDOWS_1252, + variable::{MissingValueRange, ValueLabels}, +}; + +/// Precision for floating-point numbers in a portable file. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct Precision( + /// Precision in base-30 digits (the base used in portable files). 
+ u32, +); + +impl Default for Precision { + fn default() -> Self { + Self::from_base_10_digits(f64::DIGITS) + } +} + +impl Precision { + pub fn from_base_10_digits(digits: u32) -> Self { + match digits { + 0..=1 => Self(1), + 2 => Self(2), + 3..=4 => Self(3), + 5 => Self(4), + 6..=7 => Self(5), + 8 => Self(6), + 9..=10 => Self(7), + 11 => Self(8), + 12..=13 => Self(9), + 14 => Self(10), + 15.. => Self(11), + } + } + + pub fn from_base_30_digits(digits: u32) -> Self { + // NOTE(review): this clamps to 10, but `from_base_10_digits` can produce + // 11 and `as_base_10_digits` accepts 11 -- confirm the intended maximum. + Self(digits.clamp(1, 10)) + } + + pub fn as_base_10_digits(&self) -> u32 { + match self.0 { + 1 => 1, + 2 => 2, + 3 => 4, + 4 => 5, + 5 => 7, + 6 => 8, + 7 => 10, + 8 => 11, + 9 => 13, + 10 => 14, + 11 => 15, + _ => unreachable!(), + } + } + + pub fn as_base_30_digits(&self) -> u32 { + self.0 + } +} + +/// Options for writing a portable file. +#[derive(Clone, Debug)] +pub struct WriteOptions { + /// Date and time to write to the file. + pub timestamp: NaiveDateTime, + + /// Product name. + pub product: Cow<'static, str>, + + /// Subproduct name. + pub product_ext: Option<Cow<'static, str>>, + + /// Author. + pub author: Option<String>, + + /// Precision. + pub precision: Precision, +} + +impl Default for WriteOptions { + fn default() -> Self { + Self { + timestamp: Local::now().naive_local(), + product: Cow::from(concat!("GNU PSPP (Rust) ", env!("CARGO_PKG_VERSION"))), + product_ext: None, + author: None, + precision: Precision::default(), + } + } +} + +impl WriteOptions { + /// Constructs a new set of default options. + pub fn new() -> Self { + Self::default() + } + + /// Returns `self` with the timestamp to be written set to `timestamp`. + pub fn with_timestamp(self, timestamp: NaiveDateTime) -> Self { + Self { timestamp, ..self } + } + + /// Returns `self` with the product set to `product`. + pub fn with_product(self, product: Cow<'static, str>) -> Self { + Self { product, ..self } + } + + /// Returns `self` with the extended product set to `product_ext`. 
+ pub fn with_product_ext(self, product_ext: Cow<'static, str>) -> Self { + Self { + product_ext: Some(product_ext), + ..self + } + } + + /// Returns `self` with the author set to `author`. + pub fn with_author(self, author: String) -> Self { + Self { + author: Some(author), + ..self + } + } + + /// Returns `self` with the precision set to `precision`. + pub fn with_precision(self, precision: Precision) -> Self { + Self { precision, ..self } + } + + /// Writes `dictionary` to `path` in portable file format. Returns a [Writer] + /// that can be used for writing cases to the new file. + pub fn write_file( + self, + dictionary: &Dictionary, + path: impl AsRef<Path>, + ) -> Result<Writer<BufWriter<File>>, Error> { + self.write_writer(dictionary, BufWriter::new(File::create(path)?)) + } + + /// Writes `dictionary` to `writer` in portable file format. Returns a + /// [Writer] that can be used for writing cases to the new file. + pub fn write_writer<W>(self, dictionary: &Dictionary, writer: W) -> Result<Writer<W>, Error> + where + W: Write + 'static, + { + let mut writer = WriteFilter::new(writer); + let mut dict_writer = DictionaryWriter::new(&self, &mut writer, dictionary); + dict_writer.write()?; + Ok(Writer { + inner: Some(writer), + precision: self.precision, + }) + } + + /// Returns a [WriteOptions] with members set to fixed values so that + /// running at different times or with different crate names or versions + /// won't change what's written to the file. + #[cfg(test)] + pub(super) fn reproducible() -> Self { + use chrono::{NaiveDate, NaiveTime}; + WriteOptions::new() + .with_timestamp(NaiveDateTime::new( + NaiveDate::from_ymd_opt(2025, 7, 30).unwrap(), + NaiveTime::from_hms_opt(15, 7, 55).unwrap(), + )) + .with_product(Cow::from("PSPP TEST DATA FILE")) + } +} + +/// Portable file case writer. +/// +/// Use [WriteOptions::write_file] or [WriteOptions::write_writer] to obtain a +/// [Writer]. 
+pub struct Writer { + precision: Precision, + inner: Option>, +} + +impl Writer +where + W: Write, +{ + /// Finishes writing the file. + pub fn finish(mut self) -> Result, Error> { + self.try_finish() + } + + /// Tries to finish writing the file. + /// + /// # Panic + /// + /// Attempts to write more cases after calling this function will panic. + pub fn try_finish(&mut self) -> Result, Error> { + match self.inner.take() { + None => Ok(None), + Some(mut inner) => { + inner.write_end()?; + Ok(Some(inner.into_inner())) + } + } + } + + /// Writes `case` to the file. + pub fn write_case(&mut self, case: impl IntoIterator>) -> Result<(), Error> + where + B: RawString, + { + write_case(self.inner.as_mut().unwrap(), case, self.precision) + } +} + +fn write_case( + mut writer: W, + case: impl IntoIterator>, + precision: Precision, +) -> Result<(), Error> +where + W: Write, + B: RawString, +{ + for datum in case { + write_datum(&mut writer, &datum, precision)?; + } + Ok(()) +} + +struct WriteFilter { + inner: W, + line_len: usize, +} + +impl WriteFilter { + fn new(inner: W) -> Self { + Self { inner, line_len: 0 } + } + + fn into_inner(self) -> W { + self.inner + } +} + +impl WriteFilter +where + W: Write, +{ + fn write_end(&mut self) -> std::io::Result<()> { + // Write 'Z'. + self.write_all(b"Z")?; + + // Finish out the current line with more 'Z's. 
+ if self.line_len != 0 { + let rest = std::iter::repeat_n(b'Z', 80 - self.line_len).collect::<Vec<_>>(); + self.write_all(&rest)?; + } + + Ok(()) + } +} + +impl<W> Write for WriteFilter<W> +where + W: Write, +{ + fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { + fn handle_error(error: std::io::Error, ofs: usize) -> std::io::Result<usize> { + if ofs > 0 { + Ok(ofs) + } else { + Err(error) + } + } + + fn write_chunk<W>(mut writer: W, chunk: &[u8]) -> std::io::Result<usize> + where + W: Write, + { + let mut ofs = 0; + while ofs < chunk.len() { + let result = if chunk[ofs] < 0x20 { + writer.write(&[chunk[ofs]]) + } else { + let n = chunk[ofs..].iter().take_while(|b| **b >= 0x20).count(); + writer.write(&chunk[ofs..ofs + n]) + }; + match result { + Ok(n) => ofs += n, + Err(error) => return handle_error(error, ofs), + } + } + Ok(ofs) + } + + let mut ofs = 0; + while ofs < buf.len() { + let chunk = (buf.len() - ofs).min(80 - self.line_len); + let n = match write_chunk(&mut self.inner, &buf[ofs..ofs + chunk]) { + Ok(n) => n, + Err(error) => return handle_error(error, ofs), + }; + self.line_len += n; + ofs += n; + if self.line_len == 80 { + if let Err(error) = self.inner.write_all(b"\r\n") { + return handle_error(error, ofs); + } + self.line_len = 0; + } + } + Ok(ofs) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } +} + +struct DictionaryWriter<'a, W> { + options: &'a WriteOptions, + writer: &'a mut W, + dictionary: &'a Dictionary, + short_names: Vec<String>, +} + +impl<'a, W> DictionaryWriter<'a, W> +where + W: Write, +{ + pub fn new(options: &'a WriteOptions, writer: &'a mut W, dictionary: &'a Dictionary) -> Self { + Self { + options, + writer, + dictionary, + short_names: dictionary + .short_names() + .into_iter() + .map(|names| { + names + .into_iter() + .next() + .unwrap() + .0 + .into_inner() + .to_ascii_uppercase() + }) + .collect(), + } + } + + pub fn write(&mut self) -> Result<(), Error> { + self.write_header()?; + self.write_version()?; + 
self.write_identification()?; + self.write_variable_count()?; + self.write_precision()?; + self.write_case_weight()?; + self.write_variables()?; + self.write_value_labels()?; + self.write_documents()?; + Ok(()) + } + + pub fn write_header(&mut self) -> Result<(), Error> { + for _ in 0..5 { + self.writer + .write_all(b"ASCII SPSS PORT FILE ")?; + } + for (index, c) in PORTABLE_TO_WINDOWS_1252.iter().enumerate() { + let c = if *c == b' ' && index != 0x7e { + b'0' + } else { + *c + }; + self.writer.write_all(&[c])?; + } + self.writer.write_all(b"SPSSPORT") + } + + pub fn write_version(&mut self) -> Result<(), Error> { + self.writer.write_all(b"A")?; + write_string( + &mut self.writer, + self.options.timestamp.format("%Y%m%d").to_string(), + )?; + write_string( + &mut self.writer, + self.options.timestamp.format("%H%M%S").to_string(), + ) + } + + pub fn write_identification(&mut self) -> Result<(), Error> { + self.writer.write_all(b"1")?; + write_string(&mut self.writer, self.options.product.as_bytes())?; + if let Some(product_ext) = self.options.product_ext.as_ref() { + self.writer.write_all(b"2")?; + write_string(&mut self.writer, product_ext.as_bytes())?; + } + if let Some(author) = self.options.author.as_ref() { + self.writer.write_all(b"3")?; + write_string(&mut self.writer, author.as_bytes())?; + } + Ok(()) + } + + pub fn write_variable_count(&mut self) -> Result<(), Error> { + write!( + &mut self.writer, + "4{}", + TrigesimalInt::new(self.dictionary.variables.len() as i64) + ) + } + + pub fn write_precision(&mut self) -> Result<(), Error> { + write!( + &mut self.writer, + "5{}", + TrigesimalInt::new(self.options.precision.as_base_30_digits() as i64) + ) + } + + pub fn write_case_weight(&mut self) -> Result<(), Error> { + if let Some(weight_index) = self.dictionary.weight_index() { + self.writer.write_all(b"6")?; + write_string(&mut self.writer, &self.short_names[weight_index].as_bytes())?; + } + Ok(()) + } + + pub fn write_variables(&mut self) -> Result<(), 
Error> { + let float = |value| TrigesimalFloat::new(value, self.options.precision); + for (variable, short_name) in self.dictionary.variables.iter().zip(&self.short_names) { + let width = variable.width.as_string_width().unwrap_or_default() as i64; + write!(&mut self.writer, "7{}", TrigesimalInt::new(width))?; + write_string(&mut self.writer, short_name.as_bytes())?; + for format in [variable.print_format, variable.write_format] { + let type_ = u16::from(format.type_()) as i64; + write!( + &mut self.writer, + "{}{}{}", + TrigesimalInt::new(type_), + TrigesimalInt::new(format.w() as i64), + TrigesimalInt::new(format.d() as i64) + )?; + } + if let Some(range) = variable.missing_values().range() { + match range { + MissingValueRange::In { low, high } => { + write!(&mut self.writer, "B{}{}", float(*low), float(*high))? + } + MissingValueRange::From { low } => { + write!(&mut self.writer, "A{}", float(*low))? + } + MissingValueRange::To { high } => { + write!(&mut self.writer, "9{}", float(*high))? + } + } + } + for value in variable.missing_values().values() { + write!(&mut self.writer, "8")?; + write_datum(&mut self.writer, value, self.options.precision)?; + } + if let Some(label) = variable.label() { + write!(&mut self.writer, "C")?; + write_string(&mut self.writer, label.as_bytes())?; + } + } + Ok(()) + } + + fn write_value_labels(&mut self) -> Result<(), Error> { + // Collect identical sets of value labels. 
+ let mut sets = HashMap::<&ValueLabels, Vec<_>>::new(); + for (variable, short_name) in self.dictionary.variables.iter().zip(&self.short_names) { + if !variable.value_labels.is_empty() { + sets.entry(&variable.value_labels) + .or_default() + .push(short_name); + } + } + + for (value_labels, variables) in sets { + write!( + &mut self.writer, + "D{}", + TrigesimalInt::new(variables.len() as i64) + )?; + for variable in variables { + write_string(&mut self.writer, variable)?; + } + + write!( + &mut self.writer, + "{}", + TrigesimalInt::new(value_labels.len() as i64) + )?; + for (value, label) in value_labels { + write_datum(&mut self.writer, value, self.options.precision)?; + write_string(&mut self.writer, label.as_bytes())?; + } + } + Ok(()) + } + + fn write_documents(&mut self) -> Result<(), Error> { + if !self.dictionary.documents.is_empty() { + write!( + &mut self.writer, + "E{}", + TrigesimalInt::new(self.dictionary.documents.len() as i64) + )?; + for line in &self.dictionary.documents { + write_string(&mut self.writer, line.as_bytes())?; + } + } + Ok(()) + } +} + +fn write_datum<W, T>(mut writer: W, datum: &Datum<T>, precision: Precision) -> Result<(), Error> +where + W: Write, + T: RawString, +{ + match datum { + Datum::Number(number) => write!( + writer, + "{}", + TrigesimalFloat::new_optional(*number, precision) + ), + Datum::String(string) => write_string(writer, string.raw_string_bytes()), + } +} + +fn write_string<W, S>(mut writer: W, s: S) -> Result<(), Error> +where + W: Write, + S: AsRef<[u8]>, +{ + let s = s.as_ref(); + write!(&mut writer, "{}", TrigesimalInt::new(s.len() as i64))?; + writer.write_all(s) +} + +fn trig_to_char(trig: u8) -> char { + b"0123456789ABCDEFGHIJKLMNOPQRST"[trig as usize] as char +} + +struct TrigesimalInt { + value: i64, + force_sign: bool, + add_slash: bool, +} + +impl TrigesimalInt { + fn new(value: i64) -> Self { + Self { + value, + force_sign: false, + add_slash: true, + } + } +} + +impl Display for TrigesimalInt { + fn fmt(&self, f: 
&mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.value < 0 { + f.write_char('-')?; + } else if self.force_sign { + f.write_char('+')?; + } + let value = self.value.unsigned_abs(); + + fn recursive_format_int(f: &mut std::fmt::Formatter<'_>, value: u64) -> std::fmt::Result { + let trig = value % 30; + if value >= 30 { + recursive_format_int(f, value / 30)?; + } + f.write_char(trig_to_char(trig as u8)) + } + + recursive_format_int(f, value)?; + if self.add_slash { + f.write_char('/')?; + } + Ok(()) + } +} + +struct TrigesimalFloat { + value: f64, + precision: Precision, +} + +impl TrigesimalFloat { + fn new(value: f64, precision: Precision) -> Self { + Self { value, precision } + } + fn new_optional(value: Option<f64>, precision: Precision) -> Self { + Self::new(value.unwrap_or(f64::INFINITY), precision) + } +} + +impl Display for TrigesimalFloat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let (value, negative) = match self.value.classify() { + std::num::FpCategory::Nan | std::num::FpCategory::Infinite => { + return write!(f, "*."); + } + std::num::FpCategory::Zero | std::num::FpCategory::Subnormal => { + return write!(f, "0/"); + } + std::num::FpCategory::Normal if self.value < 0.0 => (-self.value, true), + std::num::FpCategory::Normal => (self.value, false), + }; + + // Adjust `value` to roughly 30**3, by shifting the trigesimal point left or + // right as necessary. We approximate the base-30 exponent by obtaining the + // base-2 exponent, then multiplying by log30(2). This approximation is + // sufficient to ensure that the adjusted value is always in the range + // 0...30**6, an invariant of the loop below. + let binary_exponent = frexp(value).1; + + // This is floor(log30(2**31)), the minimum number of trigesimal + // digits that `i32` can hold. 
+ const CHUNK_SIZE: usize = 6; + + // Number of trigesimal places for trigs: + // + // * trigs[0] has coefficient 30**(trig_places - 1), + // * trigs[1] has coefficient 30**(trig_places - 2), + // * ... + // + // In other words, the trigesimal point is just before trigs[0]. + let trig_places = (binary_exponent * 20_379 / 100_000) + CHUNK_SIZE as i32 / 2; + let mut value = value * 30f64.powi(CHUNK_SIZE as i32 - trig_places); + + let mut trigs = SmallVec::<[u8; 32]>::new(); + + // Dump all the trigs to `trigs`, CHUNK_SIZE at a time. + let mut trigs_to_output = + (f64::DIGITS * 2).div_ceil(3) as i32 + 1 + (CHUNK_SIZE as i32 / 2); + while trigs_to_output > 0 { + // The current chunk is just the integer part of `value`, truncated to the + // nearest integer. It fits in `usize`. Append it in base 30. + let mut chunk = value as usize; + for _ in 0..CHUNK_SIZE { + trigs.push((chunk % 30) as u8); + chunk /= 30; + } + let len = trigs.len(); + trigs[len - CHUNK_SIZE..].reverse(); + + // Proceed to the next chunk. + value = value.fract(); + if value == 0.0 { + break; + } + value *= 30.0f64.powi(CHUNK_SIZE as i32); + trigs_to_output -= CHUNK_SIZE as i32; + } + + // Strip leading zeros. + let leading_zeros = trigs.iter().take_while(|trig| **trig == 0).count(); + let trigs = &mut trigs[leading_zeros..]; + // Mutable because a carry out of the leftmost trig during rounding moves + // the trigesimal point one place to the right. + let mut trig_places = trig_places - leading_zeros as i32; + + // Round to the requested precision, which is already expressed in base-30 + // digits. + let base_30_precision = self.precision.as_base_30_digits() as usize; + let trigs = if trigs.len() > base_30_precision { + if should_round_up(&trigs[base_30_precision - 1..]) { + if try_round_up(&mut trigs[..base_30_precision]) { + &trigs[..base_30_precision] + } else { + // Couldn't round up because we ran out of trigs to carry into: every + // retained trig was 29, so the rounded value is a single 1 one place + // further left. Do the carry here instead. + trig_places += 1; + &[1] + } + } else { + // Round down. 
+ &trigs[..base_30_precision] + } + } else { + // No rounding required: fewer digits available than requested. + &trigs[..] + }; + + // Strip trailing zeros. + let trailing_zeros = trigs + .iter() + .rev() + .take_while(|trig| **trig == 0) + .count() + .min(trigs.len().saturating_sub(1)); + let trigs = &trigs[..trigs.len() - trailing_zeros]; + + if negative { + write!(f, "-")?; + } + if (-1..trigs.len() as i32 + 3).contains(&trig_places) { + // Use conventional notation. + format_trig_digits(f, trigs, trig_places)?; + } else { + // Use scientific notation. + format_trig_digits(f, trigs, trigs.len() as i32)?; + write!( + f, + "{}", + TrigesimalInt { + value: (trig_places - trigs.len() as i32) as i64, + force_sign: true, + add_slash: false + } + )?; + } + f.write_char('/') + } +} + +/// Formats `trigs` into `f`, inserting the trigesimal point after `trig_places` +/// characters have been printed, if necessary adding extra zeros at either end +/// for correctness. +fn format_trig_digits( + f: &mut std::fmt::Formatter<'_>, + trigs: &[u8], + mut trig_places: i32, +) -> std::fmt::Result { + if trig_places < 0 { + f.write_char('.')?; + for _ in trig_places..0 { + f.write_char('0')?; + } + for trig in trigs { + f.write_char(trig_to_char(*trig))?; + } + } else { + for trig in trigs { + if trig_places == 0 { + f.write_char('.')?; + } + trig_places -= 1; + f.write_char(trig_to_char(*trig))?; + } + for _ in 0..trig_places { + f.write_char('0')?; + } + } + Ok(()) +} + +/// Determines whether `trigs[1..]` warrant rounding up or down. Returns true +/// if `trigs[1..]` represents a value greater than half, false if less than +/// half. If `trigs[1..]` is exactly half, examines `trigs[0]` and returns true +/// if odd, false if even ("round to even"). +fn should_round_up(trigs: &[u8]) -> bool { + match trigs[1].cmp(&15) { + Ordering::Less => { + // Less than half: round down. + false + } + Ordering::Greater => { + // More than half: round up. 
+ true + } + Ordering::Equal => { + // About half: look more closely. + if trigs[2..].iter().any(|trig| *trig != 0) { + // Slightly greater than half: round up + true + } else { + // Exactly half: round to even. + trigs[0] % 2 != 0 + } + } + } +} + +/// Rounds up the rightmost trig in `trigs`, carrying to the left as necessary. +/// Returns true if successful, false on failure (due to a carry out of the +/// leftmost position). +fn try_round_up(trigs: &mut [u8]) -> bool { + for trig in trigs.iter_mut().rev() { + if *trig != 29 { + // Round this trig up to the next value. + *trig += 1; + return true; + } + + // Carry over to the next trig to the left. + *trig = 0; + } + + // Ran out of trigs to carry. + false +} + +#[cfg(test)] +mod tests { + use core::f64; + use std::borrow::Cow; + + use encoding_rs::{UTF_8, WINDOWS_1252}; + use indexmap::set::MutableValues; + use itertools::{zip_eq, Itertools}; + + use crate::{ + data::{ByteString, Datum, RawString}, + dictionary::Dictionary, + identifier::Identifier, + por::{ + write::{write_case, DictionaryWriter, Precision, TrigesimalFloat, TrigesimalInt}, + WriteOptions, + }, + variable::{MissingValueRange, MissingValues, VarWidth, Variable}, + }; + + #[test] + fn format_int() { + #[track_caller] + fn check(value: i64, force_sign: bool, expected: &str) { + let s = TrigesimalInt { + value, + force_sign, + add_slash: false, + }; + assert_eq!(&s.to_string(), expected); + } + check(0, false, "0"); + check(0, true, "+0"); + check(1, false, "1"); + check(2, false, "2"); + check(10, false, "A"); + check(29, false, "T"); + check(123456789, false, "52CE69"); + check(1, true, "+1"); + check(2, true, "+2"); + check(10, true, "+A"); + check(29, true, "+T"); + check(123456789, true, "+52CE69"); + check(-1, false, "-1"); + check(-2, false, "-2"); + check(-10, false, "-A"); + check(-29, false, "-T"); + check(-123456789, false, "-52CE69"); + check(-1, true, "-1"); + check(-2, true, "-2"); + check(-10, true, "-A"); + check(-29, true, "-T"); + 
check(-123456789, true, "-52CE69"); + } + + #[test] + fn format_float() { + #[track_caller] + fn check(value: f64, precision: Precision, expected: &str) { + let s = TrigesimalFloat { value, precision }; + assert_eq!(&s.to_string(), expected); + } + + fn p(base_30_digits: u32) -> Precision { + Precision::from_base_30_digits(base_30_digits) + } + + check(0.0, p(10), "0/"); + check(-0.0, p(10), "0/"); + check(1.0, p(10), "1/"); + check(f64::INFINITY, p(10), "*."); + check(f64::MIN_POSITIVE / 2.0, p(10), "0/"); + check(0.5, p(10), ".F/"); + check(1234.5, p(10), "1B4.F/"); + check(0.123456789, p(9), ".3L39TT5CR/"); + check(0.123456789, p(8), ".3L39TT5D/"); + check(0.123456789, p(7), ".3L39TT5/"); + check(0.123456789, p(6), ".3L39TT/"); + check(0.123456789, p(4), ".3L3A/"); + check(0.123456789, p(3), ".3L3/"); + check(0.123456789, p(2), ".3L/"); + check(0.123456789, p(1), ".4/"); + check(-0.123456789, p(9), "-.3L39TT5CR/"); + check(-0.123456789, p(8), "-.3L39TT5D/"); + check(-0.123456789, p(7), "-.3L39TT5/"); + check(-0.123456789, p(6), "-.3L39TT/"); + check(-0.123456789, p(4), "-.3L3A/"); + check(-0.123456789, p(3), "-.3L3/"); + check(-0.123456789, p(2), "-.3L/"); + check(-0.123456789, p(1), "-.4/"); + check(123456789.123456789, p(10), "52CE69.3L3A/"); + check(123456789.123456789, p(9), "52CE69.3L3/"); + check(123456789.123456789, p(8), "52CE69.3L/"); + check(123456789.123456789, p(7), "52CE69.4/"); + check(123456789.123456789, p(6), "52CE69/"); + check(123456789.123456789, p(5), "52CE60/"); + check(123456789.123456789, p(4), "52CE00/"); + check(123456789.123456789, p(3), "52C+3/"); + check(123456789.123456789, p(2), "52+4/"); + check(123456789.123456789, p(1), "5+5/"); + check(0.00000000987654, p(2), "76-7/"); + } + + #[test] + fn header() { + let dictionary = Dictionary::new(UTF_8); + let mut output = Vec::new(); + let options = WriteOptions::reproducible(); + let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary); + writer.write_header().unwrap(); 
+ assert_eq!(output.len(), 200 + 256 + 8); + assert_eq!( + &output[..200], + b"ASCII SPSS PORT FILE \ +ASCII SPSS PORT FILE \ +ASCII SPSS PORT FILE \ +ASCII SPSS PORT FILE \ +ASCII SPSS PORT FILE " + ); + assert_eq!(&output[200 + 256..], b"SPSSPORT"); + assert_eq!( + &output[200..200 + 64], + b"0000000000000000000000000000000000000000000000000000000000000000" + ); + assert_eq!( + &output[200 + 64..200 + 128], + b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ." + ); + assert_eq!(&output[200 + 128..200 + 192], b"<(+|&[]!$*);^-/|,%_>?`:#@'=\"00\xb10\xb0\x86~\x960000\xb9\xb2\xb3456789000\x97()0{}\\\xa2\x95000"); + assert_eq!( + &output[200 + 192..200 + 256], + b"0000000000000000000000000000000000000000000000000000000000000000" + ); + } + + #[test] + fn version() { + let dictionary = Dictionary::new(UTF_8); + let mut output = Vec::new(); + let options = WriteOptions::reproducible(); + let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary); + writer.write_version().unwrap(); + assert_eq!(&String::from_utf8(output).unwrap(), "A8/202507306/150755"); + } + + #[test] + fn identification() { + let dictionary = Dictionary::new(UTF_8); + let mut output = Vec::new(); + let options = WriteOptions::reproducible() + .with_product_ext(Cow::from("Extra product")) + .with_author(String::from("Author")); + let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary); + writer.write_identification().unwrap(); + assert_eq!( + &String::from_utf8(output).unwrap(), + "1J/PSPP TEST DATA FILE2D/Extra product36/Author" + ); + } + + #[test] + fn precision() { + let dictionary = Dictionary::new(UTF_8); + let mut output = Vec::new(); + let options = + WriteOptions::reproducible().with_precision(Precision::from_base_30_digits(3)); + let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary); + writer.write_precision().unwrap(); + assert_eq!(&String::from_utf8(output).unwrap(), "53/"); + } + + #[test] + fn variables() { + { + let 
mut dictionary = Dictionary::new(UTF_8); + for (index, width) in [VarWidth::Numeric, VarWidth::String(1), VarWidth::String(15)] + .iter() + .enumerate() + { + dictionary + .add_var(Variable::new( + Identifier::new(format!("v{index}")).unwrap(), + *width, + UTF_8, + )) + .unwrap(); + } + dictionary.variables.get_index_mut2(1).unwrap().label = + Some(String::from("Variable label.")); + dictionary.set_weight(Some(0)).unwrap(); + + let mut output = Vec::new(); + let options = WriteOptions::reproducible(); + let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary); + writer.write_variable_count().unwrap(); + writer.write_case_weight().unwrap(); + writer.write_variables().unwrap(); + + assert_eq!( + &String::from_utf8(output).unwrap(), + "43/\ +62/V0\ +70/2/V05/8/2/5/8/2/\ +71/2/V11/1/0/1/1/0/\ +CF/Variable label.\ +7F/2/V21/F/0/1/F/0/\ +" + ); + } + } + + #[test] + fn missing_values() { + { + let mut dictionary = Dictionary::new(UTF_8); + let variables = [ + (VarWidth::Numeric, vec![Datum::Number(Some(0.0))], None), + ( + VarWidth::Numeric, + vec![Datum::Number(Some(0.0)), Datum::Number(Some(1.0))], + None, + ), + ( + VarWidth::Numeric, + vec![ + Datum::Number(Some(0.0)), + Datum::Number(Some(1.0)), + Datum::Number(Some(2.0)), + ], + None, + ), + ( + VarWidth::Numeric, + vec![Datum::Number(Some(0.0))], + Some(MissingValueRange::new(1.0, 2.0)), + ), + ( + VarWidth::Numeric, + Vec::new(), + Some(MissingValueRange::new(1.0, 2.0)), + ), + ( + VarWidth::Numeric, + vec![Datum::Number(Some(0.0))], + Some(MissingValueRange::From { low: 1.0 }), + ), + ( + VarWidth::Numeric, + Vec::new(), + Some(MissingValueRange::From { low: 1.0 }), + ), + ( + VarWidth::Numeric, + vec![Datum::Number(Some(0.0))], + Some(MissingValueRange::To { high: 1.0 }), + ), + ( + VarWidth::Numeric, + Vec::new(), + Some(MissingValueRange::To { high: 1.0 }), + ), + ( + VarWidth::String(8), + vec![Datum::String( + ByteString::from("abcdefgh").with_encoding(WINDOWS_1252), + )], + None, + ), + ( 
+ VarWidth::String(8), + vec![ + Datum::String(ByteString::from("abcdefgh").with_encoding(WINDOWS_1252)), + Datum::String(ByteString::from("ijklmnop").with_encoding(WINDOWS_1252)), + ], + None, + ), + ( + VarWidth::String(8), + vec![ + Datum::String(ByteString::from("abcdefgh").with_encoding(WINDOWS_1252)), + Datum::String(ByteString::from("ijklmnop").with_encoding(WINDOWS_1252)), + Datum::String(ByteString::from("qrstuvwx").with_encoding(WINDOWS_1252)), + ], + None, + ), + ]; + for (index, (width, values, range)) in variables.into_iter().enumerate() { + let mut variable = + Variable::new(Identifier::new(format!("v{index}")).unwrap(), width, UTF_8); + variable + .missing_values_mut() + .replace(MissingValues::new(values, range).unwrap()) + .unwrap(); + dictionary.add_var(variable).unwrap(); + } + + let mut output = Vec::new(); + let options = WriteOptions::reproducible(); + let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary); + writer.write_variable_count().unwrap(); + writer.write_case_weight().unwrap(); + writer.write_variables().unwrap(); + + assert_eq!( + &String::from_utf8(output).unwrap(), + "4C/\ +70/2/V05/8/2/5/8/2/\ +80/\ +70/2/V15/8/2/5/8/2/\ +80/\ +81/\ +70/2/V25/8/2/5/8/2/\ +80/\ +81/\ +82/\ +70/2/V35/8/2/5/8/2/\ +B1/2/\ +80/\ +70/2/V45/8/2/5/8/2/\ +B1/2/\ +70/2/V55/8/2/5/8/2/\ +A1/\ +80/\ +70/2/V65/8/2/5/8/2/\ +A1/\ +70/2/V75/8/2/5/8/2/\ +91/\ +80/\ +70/2/V85/8/2/5/8/2/\ +91/\ +78/2/V91/8/0/1/8/0/\ +88/abcdefgh\ +78/3/V101/8/0/1/8/0/\ +88/abcdefgh\ +88/ijklmnop\ +78/3/V111/8/0/1/8/0/\ +88/abcdefgh\ +88/ijklmnop\ +88/qrstuvwx\ +" + ); + } + } + + #[test] + fn value_labels() { + let variables = [ + (VarWidth::Numeric, vec![(Datum::Number(Some(1.0)), "One")]), + ( + VarWidth::Numeric, + vec![ + (Datum::Number(Some(1.0)), "One"), + (Datum::Number(Some(2.0)), "Two"), + ], + ), + ( + VarWidth::Numeric, + vec![ + (Datum::Number(Some(1.0)), "One"), + (Datum::Number(Some(2.0)), "Two"), + ], + ), + ( + VarWidth::String(4), + 
vec![(Datum::String(ByteString::from("abcd")), "One")], + ), + ( + VarWidth::String(8), + vec![( + Datum::String(ByteString::from("abcdefgh")), + "Longer value label", + )], + ), + ( + VarWidth::String(9), + vec![( + Datum::String(ByteString::from("abcdefghi")), + "value label for 9-byte value", + )], + ), + ( + VarWidth::String(300), + vec![( + Datum::String(ByteString::from(vec![b'x'; 300])), + "value label for 300-byte value", + )], + ), + ]; + + let mut dictionary = Dictionary::new(UTF_8); + for (index, (width, value_labels)) in variables.iter().enumerate() { + let mut variable = Variable::new( + Identifier::new(format!("var{index}")).unwrap(), + *width, + UTF_8, + ); + for (value, label) in value_labels { + assert_eq!(variable.value_labels.insert(value.clone(), *label), None); + } + dictionary.add_var(variable).unwrap(); + } + dbg!(&dictionary); + + let mut output = Vec::new(); + let options = WriteOptions::reproducible(); + let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary); + writer.write_value_labels().unwrap(); + + let output = String::from_utf8(output).unwrap(); + println!("{output}"); + + let mut output = output + .split("D") + .filter(|s| !s.is_empty()) + .collect::>(); + output.sort(); + + let expected = [ + ("1/4/VAR01/", vec!["1/3/One"]), + ("1/4/VAR31/", vec!["4/abcd3/One"]), + ("1/4/VAR41/", vec!["8/abcdefghI/Longer value label"]), + ("1/4/VAR51/", vec!["9/abcdefghiS/value label for 9-byte value"]), + ("1/4/VAR61/", vec!["A0/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx10/value label for 300-byte value"]), + ("2/4/VAR14/VAR22/", vec!["1/3/One", "2/3/Two"]), + ]; + + for (actual, (exp_prefix, exp_suffixes)) in zip_eq(output, expected) { + if !exp_suffixes + .iter() + 
.permutations(exp_suffixes.len()) + .any(|exp_suffixes| { + actual + == std::iter::once(exp_prefix) + .chain(exp_suffixes.into_iter().map(|s| *s)) + .collect::() + }) + { + panic!( + "{actual:?} != {exp_prefix:?} followed by any permutation of {exp_suffixes:?}" + ); + } + } + } + + #[test] + fn documents() { + let mut dictionary = Dictionary::new(UTF_8); + dictionary.documents = vec![ + String::from("First document line."), + String::from("Second document line."), + ]; + + let mut output = Vec::new(); + let options = WriteOptions::reproducible(); + let mut writer = DictionaryWriter::new(&options, &mut output, &dictionary); + writer.write_documents().unwrap(); + + assert_eq!( + &String::from_utf8(output).unwrap(), + "E2/\ +K/First document line.\ +L/Second document line." + ); + } + + #[test] + fn cases() { + let mut output = Vec::new(); + write_case( + &mut output, + [ + Datum::Number(Some(0.0)), + Datum::Number(Some(1.0)), + Datum::Number(None), + Datum::String(ByteString::from("abcdefghi")), + ], + Precision::default(), + ) + .unwrap(); + assert_eq!(&String::from_utf8(output).unwrap(), "0/1/*.9/abcdefghi"); + } +} diff --git a/rust/pspp/src/show.rs b/rust/pspp/src/show.rs index 9e699d623e..699425e85c 100644 --- a/rust/pspp/src/show.rs +++ b/rust/pspp/src/show.rs @@ -1,24 +1,25 @@ -/* PSPP - a program for statistical analysis. - * Copyright (C) 2023 Free Software Foundation, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . */ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . use crate::parse_encoding; use anyhow::{anyhow, Result}; use clap::{Args, ValueEnum}; use encoding_rs::Encoding; use pspp::{ + data::cases_to_output, output::{ driver::{Config, Driver}, pivot::PivotTable, @@ -293,17 +294,19 @@ impl Show { .into_parts(); match &output { Output::Driver { driver, mode: _ } => { - driver - .borrow_mut() - .write(&Arc::new(Item::new(PivotTable::from(&metadata)))); + let mut output = Vec::new(); + output.push(Item::new(PivotTable::from(&metadata))); + output.extend( + dictionary + .all_pivot_tables() + .into_iter() + .map(|pivot_table| Item::new(pivot_table)), + ); + output.extend(cases_to_output(&dictionary, cases)); driver .borrow_mut() .write(&Arc::new(Item::new(Details::Group( - dictionary - .all_pivot_tables() - .into_iter() - .map(|pivot_table| Arc::new(Item::new(pivot_table))) - .collect(), + output.into_iter().map(Arc::new).collect(), )))); } Output::Json { .. } => { diff --git a/rust/pspp/src/show_por.rs b/rust/pspp/src/show_por.rs new file mode 100644 index 0000000000..28529e2315 --- /dev/null +++ b/rust/pspp/src/show_por.rs @@ -0,0 +1,327 @@ +// PSPP - a program for statistical analysis. 
+// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +use anyhow::{anyhow, Result}; +use clap::{Args, ValueEnum}; +use pspp::{ + data::cases_to_output, + output::{ + driver::{Config, Driver}, + pivot::PivotTable, + Details, Item, Text, + }, + por::PortableFile, +}; +use serde::Serialize; +use std::{ + cell::RefCell, + ffi::OsStr, + fmt::{Display, Write as _}, + fs::File, + io::{stdout, BufReader, Write}, + path::{Path, PathBuf}, + rc::Rc, + sync::Arc, +}; + +/// Show information about SPSS portable files. +#[derive(Args, Clone, Debug)] +pub struct ShowPor { + /// What to show. + #[arg(value_enum)] + mode: Mode, + + /// File to show. + #[arg(required = true)] + input: PathBuf, + + /// Output file name. If omitted, output is written to stdout. + output: Option, + + /// Maximum number of cases to read. + /// + /// If specified without an argument, all cases will be read. + #[arg( + long = "data", + num_args = 0..=1, + default_missing_value = "18446744073709551615", + default_value_t = 0, + help_heading = "Input file options" + )] + max_cases: usize, + + /// Output driver configuration options. + #[arg(short = 'o', help_heading = "Output options")] + output_options: Vec, + + /// Output format. 
+ #[arg(long, short = 'f', help_heading = "Output options")] + format: Option, +} + +enum Output { + Driver { + driver: Rc>>, + mode: Mode, + }, + Json { + writer: Rc>>, + pretty: bool, + }, + Discard, +} + +impl Output { + fn show_json(&self, value: &T) -> Result<()> + where + T: Serialize, + { + match self { + Self::Driver { mode, driver: _ } => { + Err(anyhow!("Mode '{mode}' only supports output as JSON.")) + } + Self::Json { writer, pretty } => { + let mut writer = writer.borrow_mut(); + match pretty { + true => serde_json::to_writer_pretty(&mut *writer, value)?, + false => serde_json::to_writer(&mut *writer, value)?, + }; + writeln!(writer)?; + Ok(()) + } + Self::Discard => Ok(()), + } + } + + fn warn(&self, warning: &impl Display) { + match self { + Output::Driver { driver, .. } => { + driver + .borrow_mut() + .write(&Arc::new(Item::from(Text::new_log(warning.to_string())))); + } + Output::Json { .. } => { + #[derive(Serialize)] + struct Warning { + warning: String, + } + let warning = Warning { + warning: warning.to_string(), + }; + let _ = self.show_json(&warning); + } + Self::Discard => (), + } + } +} + +impl ShowPor { + pub fn run(self) -> Result<()> { + let format = if let Some(format) = self.format { + format + } else if let Some(output_file) = &self.output { + match output_file + .extension() + .unwrap_or(OsStr::new("")) + .to_str() + .unwrap_or("") + { + "json" => ShowFormat::Json, + "ndjson" => ShowFormat::Ndjson, + _ => ShowFormat::Output, + } + } else { + ShowFormat::Json + }; + + let output = match format { + ShowFormat::Output => { + let mut config = String::new(); + + if let Some(file) = &self.output { + #[derive(Serialize)] + struct File<'a> { + file: &'a Path, + } + let file = File { + file: file.as_path(), + }; + let toml_file = toml::to_string_pretty(&file).unwrap(); + config.push_str(&toml_file); + } + for option in &self.output_options { + writeln!(&mut config, "{option}").unwrap(); + } + + let table: toml::Table = 
toml::from_str(&config)?; + if !table.contains_key("driver") { + let driver = if let Some(file) = &self.output { + ::driver_type_from_filename(file).ok_or_else(|| { + anyhow!("{}: no default output format for file name", file.display()) + })? + } else { + "text" + }; + + #[derive(Serialize)] + struct DriverConfig { + driver: &'static str, + } + config.insert_str( + 0, + &toml::to_string_pretty(&DriverConfig { driver }).unwrap(), + ); + } + + let config: Config = toml::from_str(&config)?; + Output::Driver { + mode: self.mode, + driver: Rc::new(RefCell::new(Box::new(::new(&config)?))), + } + } + ShowFormat::Json | ShowFormat::Ndjson => Output::Json { + pretty: format == ShowFormat::Json, + writer: if let Some(output_file) = &self.output { + Rc::new(RefCell::new(Box::new(File::create(output_file)?))) + } else { + Rc::new(RefCell::new(Box::new(stdout()))) + }, + }, + ShowFormat::Discard => Output::Discard, + }; + + let reader = BufReader::new(File::open(&self.input)?); + match self.mode { + Mode::Dictionary => { + let PortableFile { + dictionary, + metadata: _, + cases, + } = PortableFile::open(reader, |warning| output.warn(&warning))?; + let cases = cases.take(self.max_cases); + + match &output { + Output::Driver { driver, mode: _ } => { + let mut output = Vec::new(); + output.extend( + dictionary + .all_pivot_tables() + .into_iter() + .map(|pivot_table| Item::new(pivot_table)), + ); + output.extend(cases_to_output(&dictionary, cases)); + driver + .borrow_mut() + .write(&Arc::new(Item::new(Details::Group( + output.into_iter().map(Arc::new).collect(), + )))); + } + Output::Json { .. 
} => { + output.show_json(&dictionary)?; + for (_index, case) in (0..self.max_cases).zip(cases) { + output.show_json(&case?)?; + } + } + Output::Discard => (), + } + } + Mode::Metadata => { + let metadata = + PortableFile::open(reader, |warning| output.warn(&warning))?.metadata; + + match &output { + Output::Driver { driver, mode: _ } => { + driver + .borrow_mut() + .write(&Arc::new(Item::new(PivotTable::from(&metadata)))); + } + Output::Json { .. } => { + output.show_json(&metadata)?; + } + Output::Discard => (), + } + } + Mode::Histogram => { + let (histogram, translations) = PortableFile::read_histogram(reader)?; + let h = histogram + .into_iter() + .enumerate() + .filter_map(|(index, count)| { + if count > 0 + && index != translations[index as u8] as usize + && translations[index as u8] != 0 + { + Some(( + format!("{index:02x}"), + translations[index as u8] as char, + count, + )) + } else { + None + } + }) + .collect::>(); + output.show_json(&h)?; + } + } + Ok(()) + } +} + +/// What to show in a system file. +#[derive(Clone, Copy, Debug, Default, PartialEq, ValueEnum)] +enum Mode { + /// File dictionary, with variables, value labels, ... + #[default] + #[value(alias = "dict")] + Dictionary, + + /// File metadata not included in the dictionary. + Metadata, + + /// Histogram of character incidence in the file. + Histogram, +} + +impl Mode { + fn as_str(&self) -> &'static str { + match self { + Mode::Dictionary => "dictionary", + Mode::Metadata => "metadata", + Mode::Histogram => "histogram", + } + } +} + +impl Display for Mode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +enum ShowFormat { + /// Pretty-printed JSON. + #[default] + Json, + /// Newline-delimited JSON. + Ndjson, + /// Pivot tables. + Output, + /// No output. 
+ Discard, +} diff --git a/rust/pspp/src/sys.rs b/rust/pspp/src/sys.rs index 1746781d69..0f405fb468 100644 --- a/rust/pspp/src/sys.rs +++ b/rust/pspp/src/sys.rs @@ -17,13 +17,15 @@ //! Reading and writing system files. //! //! This module enables reading and writing "system files", the binary format -//! for SPSS data files. The system file format dates back 40+ years and has +//! for SPSS data files. The [system file format] dates back 40+ years and has //! evolved greatly over that time to support new features, but in a way to //! facilitate interchange between even the oldest and newest versions of //! software. //! //! Use [ReadOptions] to read a system file in the simplest way. //! Use [WriteOptions] to write a system file. +//! +//! [system file format]: https://pspp.benpfaff.org/manual/system-file.html // Warn about missing docs, but not for items declared with `#[cfg(test)]`. #![cfg_attr(not(test), warn(missing_docs))] diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 5702522ec9..78440d801c 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -24,7 +24,6 @@ use std::{ }; use crate::{ - calendar::date_time_to_pspp, crypto::EncryptedFile, data::{ByteString, Case, Datum, MutRawString, RawString}, dictionary::{ @@ -579,7 +578,7 @@ impl ReadOptions { } } -/// The content of an SPSS system file. +/// An SPSS system file read with [ReadOptions]. #[derive(Debug)] pub struct SystemFile { /// The system file dictionary. 
@@ -1426,10 +1425,7 @@ impl Metadata { let mut values = Vec::new(); group.push("Created"); - values.push(Value::new_number_with_format( - Some(date_time_to_pspp(self.creation)), - Format::DATETIME40_0, - )); + values.push(Value::new_date_time(self.creation)); let mut product = Group::new("Writer"); product.push("Product"); diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index 2fe84f2423..60f4b2cfce 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -1030,7 +1030,7 @@ where /// /// # Panic /// - /// Attempts to write more cases after calling this function may will panic. + /// Attempts to write more cases after calling this function will panic. pub fn try_finish(&mut self) -> Result, BinError> { let Some(inner) = self.inner.take() else { return Ok(None); diff --git a/rust/pspp/src/variable.rs b/rust/pspp/src/variable.rs index 3c182b35b5..3b4f72741e 100644 --- a/rust/pspp/src/variable.rs +++ b/rust/pspp/src/variable.rs @@ -24,6 +24,7 @@ use std::{ str::FromStr, }; +use displaydoc::Display; use encoding_rs::{Encoding, UTF_8}; use hashbrown::HashMap; use indexmap::Equivalent; @@ -47,6 +48,9 @@ pub enum VarType { Numeric, /// A string variable. + /// + /// The string width is unspecified; use [VarWidth] for type and width + /// together. String, } @@ -78,11 +82,22 @@ impl Display for VarType { } } -/// [VarType], plus a width for [VarType::String]. +/// A variable's width. +/// +/// This is essentially [VarType] plus a width for [VarType::String]. #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)] pub enum VarWidth { + /// A numeric variable. Numeric, - String(u16), // XXX change to NonZeroU16, or to 1..=32767 range type + + /// A string variable. + String( + /// The width of the string variable. + /// + /// Must be in `1..=32767`, although the type system does not yet + /// enforce this. 
+ u16, + ), // XXX change to NonZeroU16, or to 1..=32767 range type } impl VarWidth { @@ -601,7 +616,11 @@ impl ValueLabels { } pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.len() == 0 + } + + pub fn len(&self) -> usize { + self.0.len() } pub fn get(&self, value: &Datum) -> Option<&str> @@ -674,6 +693,16 @@ impl Hash for ValueLabels { } } +impl<'a> IntoIterator for &'a ValueLabels { + type Item = (&'a Datum, &'a String); + + type IntoIter = hashbrown::hash_map::Iter<'a, Datum, String>; + + fn into_iter(self) -> Self::IntoIter { + self.0.iter() + } +} + pub struct MissingValuesMut<'a> { inner: &'a mut MissingValues, width: VarWidth, @@ -778,11 +807,19 @@ impl Display for MissingValues { } } -#[derive(Copy, Clone, Debug)] +/// Invalid missing values. +#[derive(Display, Copy, Clone, Debug, ThisError)] pub enum MissingValuesError { + /// Too many missing values. TooMany, + + /// Missing values too wide (missing values may be no wider than 8 bytes). TooWide, + + /// Missing values must be all string or all numeric. MixedTypes, + + /// The system-missing value may not be a user-missing value. SystemMissing, }