From: Ben Pfaff Date: Wed, 17 Sep 2025 15:45:57 +0000 (-0700) Subject: rust: Add support for reading SPSS/PC+ system files. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=93046093ba3550a22d97e68f6d54e53c56651b8c;p=pspp rust: Add support for reading SPSS/PC+ system files. Signed-off-by: Ben Pfaff --- diff --git a/rust/doc/src/SUMMARY.md b/rust/doc/src/SUMMARY.md index 1d1c14d2e6..4c10ae8220 100644 --- a/rust/doc/src/SUMMARY.md +++ b/rust/doc/src/SUMMARY.md @@ -5,8 +5,9 @@ - [Running PSPP](invoking/index.md) - [Converting Data](invoking/pspp-convert.md) - - [Inspecting `.sav` Data](invoking/pspp-show.md) - - [Inspecting `.por` Data](invoking/pspp-show-por.md) + - [Inspecting System Files](invoking/pspp-show.md) + - [Inspecting Portable Files](invoking/pspp-show-por.md) + - [Inspecting SPSS/PC+ Files](invoking/pspp-show-pc.md) - [Decrypting Files](invoking/pspp-decrypt.md) # Language Overview diff --git a/rust/doc/src/invoking/pspp-convert.md b/rust/doc/src/invoking/pspp-convert.md index e6c2abd95f..d1248cd67f 100644 --- a/rust/doc/src/invoking/pspp-convert.md +++ b/rust/doc/src/invoking/pspp-convert.md @@ -7,9 +7,9 @@ another. The basic syntax is: pspp convert [OUTPUT] ``` -which reads an SPSS system file or portable file from `` and -writes a copy of it to `[OUTPUT]`. If `[OUTPUT]` is omitted, output -is written to the terminal. +which reads an SPSS system file or portable file or SPSS/PC+ system +file from `` and writes a copy of it to `[OUTPUT]`. If +`[OUTPUT]` is omitted, output is written to the terminal. 
If `[OUTPUT]` is specified, then `pspp convert` tries to guess the output format based on its extension: diff --git a/rust/doc/src/invoking/pspp-show-pc.md b/rust/doc/src/invoking/pspp-show-pc.md new file mode 100644 index 0000000000..6e85c11691 --- /dev/null +++ b/rust/doc/src/invoking/pspp-show-pc.md @@ -0,0 +1,78 @@ +# Inspecting SPSS/PC+ Files + +The `pspp show-pc` command reads an SPSS/PC+ system file, which +usually has a `.sys` extension, and produces a report. + +> SPSS/PC+ has been obsolete since the 1990s, and its file format is +> also obsolete and rarely encountered. Use [`pspp +> show`](pspp-show.md) to inspect modern SPSS system files. + +The basic syntax is: + +``` +pspp show-pc <MODE> <INPUT> [OUTPUT] +``` + +where `<MODE>` is a mode of operation (see below), `<INPUT>` is the +SPSS/PC+ file to read, and `[OUTPUT]` is the output file name. If +`[OUTPUT]` is omitted, output is written to the terminal. + +The following `<MODE>`s are available: + +* `dictionary`: Outputs the file dictionary in detail, including + variables, value labels, and so on. With `--data`, also outputs + cases from the system file. + + This can be useful as an alternative to PSPP syntax commands such as + [`DISPLAY DICTIONARY`](../commands/display.md). + + [`pspp convert`](pspp-convert.md) is a better way to convert an + SPSS/PC+ file to another format. + +* `metadata`: Outputs metadata not included in the dictionary: + + - The creation date and time declared inside the file (not in the + file system). + + - The name of the product family and product that wrote the file, if + present. + + - The file name embedded inside the file, if one is present. + + - Whether the file is bytecode-compressed. + + - The number of cases in the file. + +## Options + +The following options affect how `pspp show-pc` reads `<INPUT>`: + +* `--data [<MAX_CASES>]` + For mode `dictionary`, this instructs `pspp + show-pc` to read cases from the file. If `<MAX_CASES>` is given, + then that sets a limit on the number of cases to read. 
Without this + option, PSPP will not read any cases. + +The following options affect how `pspp show-pc` writes its output: + +* `-f <FORMAT>` + `--format <FORMAT>` + Specifies the format to use for output. `<FORMAT>` may be one of + the following: + + - `json`: JSON using indentation and spaces for easy human + consumption. + - `ndjson`: [Newline-delimited JSON]. + - `output`: Pivot tables with the PSPP output engine. Use `-o` for + additional configuration. + - `discard`: Do not produce any output. + + When these options are not used, the default output format is chosen + based on the `[OUTPUT]` extension. If `[OUTPUT]` is not specified, + then output defaults to JSON. + + [Newline-delimited JSON]: https://github.com/ndjson/ndjson-spec + +* `-o <OUTPUT_OPTION>` + Adds `<OUTPUT_OPTION>` to the output engine configuration. + diff --git a/rust/doc/src/invoking/pspp-show-por.md b/rust/doc/src/invoking/pspp-show-por.md index 4bd208e8d2..bfe0f669a9 100644 --- a/rust/doc/src/invoking/pspp-show-por.md +++ b/rust/doc/src/invoking/pspp-show-por.md @@ -1,4 +1,4 @@ -# Inspecting `.por` files with `pspp show-por` +# Inspecting Portable (`.por`) Files with `pspp show-por` The `pspp show-por` command reads an SPSS "portable file", which usually has a `.por` extension, and produces a report. The diff --git a/rust/doc/src/invoking/pspp-show.md b/rust/doc/src/invoking/pspp-show.md index 2b49401932..25065438f7 100644 --- a/rust/doc/src/invoking/pspp-show.md +++ b/rust/doc/src/invoking/pspp-show.md @@ -1,4 +1,4 @@ -# Inspecting `.sav` files with `pspp show` +# Inspecting System (`.sav`) Files with `pspp show` The `pspp show` command reads an SPSS "system file" or data file, which usually has a `.sav` extension, and produces a report. The diff --git a/rust/doc/src/pc+.md b/rust/doc/src/pc+.md index 558d413603..5fb318c893 100644 --- a/rust/doc/src/pc+.md +++ b/rust/doc/src/pc+.md @@ -58,9 +58,10 @@ char filename[128]; * `char filename[128];` In most files in the corpus, this field is entirely filled with - spaces. 
In one file, it contains a file name, followed by a null - bytes, followed by spaces to fill the remainder of the field. The - meaning is unknown. + spaces or null bytes. In others, it contains a filename, which + generally contains doubled backslashes, + e.g. `c:\\doli\\altm\\f_sum94.sys`. The unusual extension `(_)` is + also common, e.g. `DER56.(_)`. The following sections describe the contents of each record, identified by the index into the `records` array. @@ -75,7 +76,8 @@ All files in the corpus have this record at offset 0x100 with length ``` uint16 one0; -char product[62]; +char family[2]; +char product[60]; flt64 sysmis; uint32 zero0; uint32 zero1; @@ -84,12 +86,12 @@ uint16 compressed; uint16 nominal_case_size; uint16 n_cases0; uint16 weight_index; -uint16 zero2; +uint16 unknown; uint16 n_cases1; -uint16 zero3; +uint16 zero2; char creation_date[8]; char creation_time[8]; -char label[64]; +char file_label[64]; ``` * `uint16 one0;` @@ -99,23 +101,26 @@ char label[64]; * `uint32 zero0;` `uint32 zero1;` `uint16 zero2;` - `uint16 zero3;` Always set to 0. - It seems likely that one of these variables is set to 1 if - weighting is enabled, but none of the files in the corpus is - weighted. +* `uint16 unknown;` + Unknown meaning. Usually set to 0. + +* `char family[2];` + Identifies the product family that created the file. This is either + `PC` for SPSS/PC+ and related software, or `DE` for SPSS Data Entry + and related software. -* `char product[62];` +* `char product[60];` Name of the program that created the file. Only the following unique values have been observed, in each case padded on the right with spaces: ``` - DESPSS/PC+ System File Written by Data Entry II - PCSPSS SYSTEM FILE. IBM PC DOS, SPSS/PC+ - PCSPSS SYSTEM FILE. IBM PC DOS, SPSS/PC+ V3.0 - PCSPSS SYSTEM FILE. IBM PC DOS, SPSS for Windows + SPSS/PC+ System File Written by Data Entry II + SPSS SYSTEM FILE. IBM PC DOS, SPSS/PC+ + SPSS SYSTEM FILE. IBM PC DOS, SPSS/PC+ V3.0 + SPSS SYSTEM FILE. 
IBM PC DOS, SPSS for Windows ``` Thus, it is reasonable to use the presence of the string `SPSS` at @@ -128,6 +133,8 @@ char label[64]; Set to 0 if the data in the file is not compressed, 1 if the data is compressed with simple bytecode compression. + > The corpus contains a mix of compressed and uncompressed files. + * `uint16 nominal_case_size;` Number of data elements per case. This is the number of variables, except that long string variables add extra data elements (one for @@ -137,8 +144,10 @@ char label[64]; * `uint16 n_cases0;` `uint16 n_cases1;` The number of cases in the data record. Both values are the same. - Some files in the corpus contain data for the number of cases noted - here, followed by garbage that somewhat resembles data. + + > Readers must use these case counts because some files in the corpus + contain garbage that somewhat resembles data after the specified + number of cases. * `uint16 weight_index;` 0, if the file is unweighted, otherwise a 1-based index into the @@ -147,15 +156,17 @@ char label[64]; * `char creation_date[8];` The date that the file was created, in `mm/dd/yy` format. - Single-digit days and months are not prefixed by zeros. The string + + > Single-digit days and months are not prefixed by zeros. The string is padded with spaces on right or left or both, e.g. `_2/4/93_`, `10/5/87_`, and `_1/11/88` (with `_` standing in for a space) are all actual examples from the corpus. * `char creation_time[8];` The time that the file was created, in `HH:MM:SS` format. - Single-digit hours are padded on a left with a space. Minutes and - seconds are always written as two digits. + + > Single-digit hours are padded on the left with a space. Minutes + and seconds are always written as two digits. * `char file_label[64];` [File label](commands/file-label.md) declared by the user, if any. @@ -194,14 +205,21 @@ these additional instances for long strings. 
* `uint32 value_label_start;` `uint32 value_label_end;` - For a variable with value labels, these specify offsets into the - label record of the start and end of this variable's value - labels, respectively. See the [labels - record](#record-2-labels-record), for more information. - - For a variable without any value labels, these are both zero. - - A long string variable may not have value labels. + These specify offsets into the label record of the start and end of + value labels for this variable. They are zero if there are no value + labels. See the [labels record](#record-2-labels-record), for more + information. A long string variable may not have value labels. + + Sometimes the data is, instead of value labels, some form of data + validation rules for SPSS Data Entry. There is no known way to + distinguish, except that data validation rules often cannot be + interpreted as valid value labels because the label length field + makes them not fit exactly in the allocated space. + + > It appears that SPSS products cannot properly read these either. + > All the files in the corpus with these problems are closely + > related, so it's also possible that they are corrupted in some + > way. * `uint32 var_label_ofs;` For a variable with a variable label, this specifies an offset into @@ -224,13 +242,15 @@ these additional instances for long strings. variable's user-missing value. For string variables, `missing.s` is a string missing value. A variable without a user-missing value is indicated with `missing.f` set to the system-missing value, even - for string variables (!). A Long string variable may not have a + for string variables (!). A long string variable may not have a missing value. In addition to the user-defined variables, every SPSS/PC+ system file contains, as its first three variables, the following system-defined variables, in the following order. The system-defined variables have -no variable label, value labels, or missing values. 
+no variable label, value labels, or missing values. PSPP renames +these variables to start with `@` when it reads an SPSS/PC+ system +file. * `$CASENUM` A numeric variable with format `F8.0`. Most of the time this is a @@ -247,8 +267,7 @@ no variable label, value labels, or missing values. * `$WEIGHT` A numeric variable with format `F8.2`. This represents the case's - weight; SPSS/PC+ files do not have a user-defined weighting - variable. If weighting has not been enabled, every case has value + weight. If weighting has not been enabled, every case has value 1.0. ## Record 2: Labels Record @@ -263,7 +282,7 @@ fields in a variable record are all offsets relative to the beginning of the labels record, with an additional 7-byte offset. That is, if the labels record starts at byte offset `labels_ofs` and a variable has a given `var_label_ofs`, then the variable label begins at byte offset -`labels_ofs` + `var_label_ofs` + 7 in the file. +`labels_ofs` + `var_label_ofs + 7` in the file. A variable label, starting at the offset indicated by `var_label_ofs`, consists of a one-byte length followed by the specified @@ -274,7 +293,7 @@ uint8 length; char s[length]; ``` - A set of value labels, extending from `value_label_start` to +A set of value labels, extending from `value_label_start` to `value_label_end` (exclusive), consists of a numeric or string value followed by a string in the format just described. String values are padded on the right with spaces to fill the 8-byte field, like this: @@ -288,10 +307,10 @@ uint8 length; char s[length]; ``` - The labels record begins with a pair of `uint32` values. The first of -these is always 3. The second is between 8 and 16 less than the number -of bytes in the record. Neither value is important for interpreting the -file. +The labels record begins with a pair of `uint32` values. The first of +these is always 3. The second is between 8 and 16 less than the +number of bytes in the record. 
Neither value is important for +interpreting the file. ## Record 3: Data Record @@ -321,15 +340,16 @@ The format of the data record varies depending on the value of following the command bytes, and so on. - 2 through 255 - A number with value CODE - 100, where CODE is the value of the + A number with value `CODE - 100`, where `CODE` is the value of the compression code. For example, code 105 indicates a numeric variable of value 5. - The end of the 8-byte group of bytecodes is followed by any 8-byte - blocks of non-compressible values indicated by code 1. After that - follows another 8-byte group of bytecodes, then those bytecodes' - non-compressible values. The pattern repeats up to the number of - cases specified by the main header record have been seen. + The end of the 8-byte group of command codes is followed by any + 8-byte blocks of non-compressible values indicated by code 1. After + that follows another 8-byte group of command codes, then those + command codes' non-compressible values. The pattern repeats until + the number of cases specified by the main header record has been + seen. 
The corpus does not contain any files with command codes 2 through 95, so it is possible that some of these codes are used for special diff --git a/rust/pspp/src/convert.rs b/rust/pspp/src/convert.rs index 34aed906bb..b38ccb3b10 100644 --- a/rust/pspp/src/convert.rs +++ b/rust/pspp/src/convert.rs @@ -30,6 +30,7 @@ use pspp::{ data::{ByteString, Case, Datum, WithEncoding}, file::FileType, format::{DisplayPlain, Type}, + pc::PcFile, por::PortableFile, sys::{ReadOptions, WriteOptions, raw::records::Compression}, util::ToSmallString, @@ -324,7 +325,22 @@ impl Convert { as Box>>, AnyError>>>; (dictionary, cases) } - _ => bail!("{}: not a system or portable file", self.input.display()), + Some(FileType::Pc) => { + fn warn_pc(warning: pspp::pc::Warning) { + eprintln!("warning: {warning}"); + } + + let pc_file = PcFile::open_file(&self.input, warn_pc)?; + let (dictionary, _, cases) = pc_file.into_parts(); + let cases = cases.map(|result| result.map_err(AnyError::from)); + let cases = Box::new(cases) + as Box>>, AnyError>>>; + (dictionary, cases) + } + _ => bail!( + "{}: not a system, portable, or SPSS/PC+ file", + self.input.display() + ), }; // Take only the first `self.max_cases` cases. diff --git a/rust/pspp/src/file.rs b/rust/pspp/src/file.rs index 6e34b4a2d9..7c9cf4fe52 100644 --- a/rust/pspp/src/file.rs +++ b/rust/pspp/src/file.rs @@ -40,7 +40,7 @@ pub enum FileType { Portable, /// An SPSS PC+ data file. - PcPlus, + Pc, /// An [SPSS Viewer file](crate::output::spv). 
Viewer { @@ -148,7 +148,7 @@ impl FileType { } if buf.get(0x104..0x108) == Some(b"SPSS") { - return Ok(Some(Self::PcPlus)); + return Ok(Some(Self::Pc)); } let mut string = String::new(); diff --git a/rust/pspp/src/lib.rs b/rust/pspp/src/lib.rs index 00114eba12..baff025160 100644 --- a/rust/pspp/src/lib.rs +++ b/rust/pspp/src/lib.rs @@ -117,6 +117,7 @@ pub mod locale_charset; pub mod macros; pub mod message; pub mod output; +pub mod pc; pub mod por; pub mod prompt; pub mod settings; diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index 5cdd92ee71..4fa6f80b55 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -19,11 +19,12 @@ use clap::{Parser, Subcommand}; use encoding_rs::Encoding; use thiserror::Error as ThisError; -use crate::{convert::Convert, decrypt::Decrypt, show::Show, show_por::ShowPor}; +use crate::{convert::Convert, decrypt::Decrypt, show::Show, show_pc::ShowPc, show_por::ShowPor}; mod convert; mod decrypt; mod show; +mod show_pc; mod show_por; /// PSPP, a program for statistical analysis of sampled data. 
@@ -40,6 +41,7 @@ enum Command { Decrypt(Decrypt), Show(Show), ShowPor(ShowPor), + ShowPc(ShowPc), } impl Command { @@ -49,6 +51,7 @@ impl Command { Command::Decrypt(decrypt) => decrypt.run(), Command::Show(show) => show.run(), Command::ShowPor(show_por) => show_por.run(), + Command::ShowPc(show_pc) => show_pc.run(), } } } diff --git a/rust/pspp/src/output/pivot.rs b/rust/pspp/src/output/pivot.rs index 5ceab55e8e..13392f8ea6 100644 --- a/rust/pspp/src/output/pivot.rs +++ b/rust/pspp/src/output/pivot.rs @@ -2704,12 +2704,13 @@ pub struct MetadataEntry { pub value: MetadataValue, } -pub enum MetadataValue { - Leaf(Value), - Group(Vec), -} - impl MetadataEntry { + pub fn new(name: impl Into, value: MetadataValue) -> Self { + Self { + name: name.into(), + value, + } + } pub fn into_pivot_table(self) -> PivotTable { let mut data = Vec::new(); let group = match self.visit(&mut data) { @@ -2736,6 +2737,17 @@ impl MetadataEntry { } } +pub enum MetadataValue { + Leaf(Value), + Group(Vec), +} + +impl MetadataValue { + pub fn new_leaf(value: impl Into) -> Self { + Self::Leaf(value.into()) + } +} + impl Serialize for MetadataValue { fn serialize(&self, serializer: S) -> Result where diff --git a/rust/pspp/src/pc.rs b/rust/pspp/src/pc.rs new file mode 100644 index 0000000000..ce484496a7 --- /dev/null +++ b/rust/pspp/src/pc.rs @@ -0,0 +1,867 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. 
+// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +//! Reading SPSS/PC+ data files. +//! +//! This module enables reading [SPSS/PC+ data files], the data format for the +//! SPSS/PC+ product first released in 1984. It is obsolete. +//! +//! Use [PcFile] to read an SPSS/PC+ file. Writing SPSS/PC+ files is not +//! supported. +//! +//! [SPSS/PC+ data files]: https://pspp.benpfaff.org/manual/pc+.html +#![cfg_attr(not(test), warn(missing_docs))] + +use std::{ + collections::VecDeque, + fmt::{Display, Formatter}, + fs::File, + io::{BufReader, Error as IoError, Read, Seek, SeekFrom, Take}, + path::Path, +}; + +use binrw::{BinRead, Endian, Error as BinError}; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use displaydoc::Display; +use encoding_rs::WINDOWS_1252; +use serde::Serialize; +use thiserror::Error as ThisError; + +use crate::{ + data::{ByteString, Case, Datum, RawString, WithEncoding}, + dictionary::Dictionary, + format::{Error as FormatError, Format, UncheckedFormat}, + identifier::{Error as IdError, Identifier}, + output::pivot::{MetadataEntry, MetadataValue, PivotTable, Value}, + sys::raw::{self, CaseDetails, CaseVar, CompressionAction, records::RawFormat}, + variable::{MissingValues, MissingValuesError, VarWidth, Variable}, +}; + +#[cfg(test)] +mod tests; + +/// An [SPSS/PC+ data file]. +/// +/// [SPSS/PC+ data file]: https://pspp.benpfaff.org/manual/pc+.html +#[derive(Debug)] +pub struct PcFile { + /// The data file's dictionary. + pub dictionary: Dictionary, + + /// SPSS/PC+ file metadata that is not part of the dictionary. + pub metadata: Metadata, + + /// Data in the SPSS/PC+ file. + pub cases: Cases, +} + +impl PcFile { + /// Returns the individual parts of the [PcFile]. + pub fn into_parts(self) -> (Dictionary, Metadata, Cases) { + (self.dictionary, self.metadata, self.cases) + } +} + +/// SPSS/PC+ product family. 
+#[derive(Clone, Debug, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum Family { + /// Data analysis product family. + /// + /// This includes at least: + /// - SPSS/PC+ + /// - SPSS/PC+ V3.0 + Pc, + + /// Data entry product family. + /// + /// This includes at least: + /// - SPSS Data Entry + /// - SPSS Data Entry II + De, +} + +/// SPSS/PC+ file metadata that is not part of [Dictionary]. +#[derive(Clone, Debug, PartialEq, Eq, Serialize)] +pub struct Metadata { + /// Creation date and time. + /// + /// This comes from the file header, not from the file system. + pub creation: NaiveDateTime, + + /// Product family. + pub family: Family, + + /// Name of the product that wrote the file. + pub product: Option, + + /// Additional metadata that in some files identifies a file name. + pub filename: Option, + + /// Whether data in the file is bytecode compressed. + pub compressed: bool, + + /// Number of declared cases in the file. + pub n_cases: u16, +} + +impl From<&Metadata> for PivotTable { + fn from(value: &Metadata) -> Self { + fn maybe_string(name: &str, s: &Option) -> MetadataEntry { + MetadataEntry { + name: Value::new_user_text(name), + value: MetadataValue::Leaf( + s.as_ref() + .cloned() + .map(Value::new_user_text) + .unwrap_or_default(), + ), + } + } + + MetadataEntry { + name: Value::new_user_text("SPSS/PC+ File Metadata"), + value: MetadataValue::Group(vec![ + MetadataEntry { + name: Value::new_user_text("Created"), + value: MetadataValue::new_leaf(Value::new_date_time(value.creation)), + }, + maybe_string("Product", &value.product), + maybe_string("File Name", &value.filename), + MetadataEntry::new( + "Compression", + MetadataValue::new_leaf(if value.compressed { "Simple" } else { "None" }), + ), + MetadataEntry::new( + "Number of Cases", + MetadataValue::new_leaf(Value::new_integer(Some(value.n_cases as f64))), + ), + ]), + } + .into_pivot_table() + } +} + +/// Reader for cases in a SPSS/PC+ file. 
+#[derive(Debug)] +pub struct Cases { + reader: Take, + compressed: bool, + case_vars: Vec, + codes: VecDeque, + read_cases: u64, + sysmis: f64, + n_cases: u16, + eof: bool, +} + +impl Cases { + fn new(reader: Take, dictionary: &Dictionary, metadata: &Metadata, sysmis: f64) -> Self { + Self { + reader, + compressed: metadata.compressed, + case_vars: dictionary + .variables + .iter() + .map(|var| var.width.into()) + .collect(), + codes: VecDeque::new(), + sysmis, + read_cases: 0, + n_cases: metadata.n_cases, + eof: false, + } + } + + fn read_case(&mut self) -> Result>>, raw::Error> + where + R: Read + Seek, + { + let result = if !self.compressed { + Datum::read_case( + &mut self.reader, + self.read_cases + 1, + &self.case_vars, + Endian::Little, + ) + } else { + Datum::read_compressed_case( + &mut self.reader, + self.read_cases + 1, + &self.case_vars, + &mut self.codes, + CompressionAction::from_pc, + Endian::Little, + ) + }; + + match result { + Ok(Some(mut raw_case)) => { + for datum in &mut raw_case.0 { + if let Datum::Number(Some(number)) = datum + && *number == self.sysmis + { + *datum = Datum::Number(None); + } + } + Ok(raw_case.with_encoding(WINDOWS_1252)) + } + Ok(None) => Err(raw::Error::new( + None, + CaseDetails::WrongNumberOfCases { + expected: self.n_cases as u64, + actual: self.read_cases, + }, + )), + Err(error) => Err(error), + } + } +} + +impl CompressionAction { + /// Interprets an SPSS/PC+ system file compression opcode. 
+ fn from_pc(code: u8) -> Self { + match code { + 0 => Self::Sysmis, + 1 => Self::Literal, + other => Self::CompressedInt(other as f64 - 100.0), + } + } +} + +impl Iterator for Cases +where + R: Read + Seek, +{ + type Item = Result>>, raw::Error>; + + fn next(&mut self) -> Option { + if self.eof || self.case_vars.is_empty() || self.read_cases >= self.n_cases as u64 { + return None; + } + + match self.read_case() { + Ok(case) => { + self.read_cases += 1; + Some(Ok(case)) + } + Err(error) => { + self.eof = true; + Some(Err(error)) + } + } + } +} + +/// An error encountered reading a SPSS/PC+ file. +#[derive(Debug)] +pub struct Error { + /// Offset where the error occurred. + pub offset: Option, + + /// Details of the error. + pub details: ErrorDetails, +} + +impl std::error::Error for Error {} + +impl Error { + /// Constructs an error from `offset` and `details`. + pub fn new(offset: Option, details: ErrorDetails) -> Self { + Self { offset, details } + } +} + +impl From for Error { + fn from(value: IoError) -> Self { + Self::new(None, value.into()) + } +} + +impl Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if let Some(offset) = self.offset { + write!(f, "Error at file offset {:#x}: ", offset)?; + } + write!(f, "{}", &self.details) + } +} + +/// An error for reading a [PcFile]. +#[derive(Display, ThisError, Debug)] +pub enum ErrorDetails { + /// Not an SPSS/PC+ data file. + NotPc, + + /// I/O error ({0}). + Io(#[from] IoError), + + /// {0} + BinError(DisplayBinError), + + /// Invalid variable format: {0}. + InvalidFormat(FormatError), + + /// File header record declares {nominal_case_size} variable segments but the variable records contain more than that (at least {n_chunks}). + TooManyVariables { + /// Declared number of variable segments. + nominal_case_size: u16, + /// Actual number of variable segments. + n_chunks: usize, + }, + + /// Labels record ({record}) extends beyond end of file with length {file_size}. 
+ InvalidLabelsRecord { + /// Labels record location. + record: Record, + /// File size. + file_size: u64, + }, +} + +impl From for ErrorDetails { + fn from(value: BinError) -> Self { + Self::BinError(DisplayBinError(value)) + } +} + +/// Newtype that implements [Display] for [BinError]. +#[derive(Debug)] +pub struct DisplayBinError(BinError); + +impl Display for DisplayBinError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if self.0.is_eof() { + write!(f, "Unexpected end-of-file reading {}", self.0) + } else { + write!(f, "Error reading SPSS/PC+ file: {}", self.0.root_cause()) + } + } +} +/// A warning while reading a [PcFile]. +#[derive(Display, ThisError, Debug)] +pub enum Warning { + /// Invalid creation date {0}. + InvalidCreationDate(String), + + /// Invalid creation time {0}. + InvalidCreationTime(String), + + /// Invalid variable name. {id_error} Substituting {new_name} instead. + InvalidVariableName { + /// Identifier error. + id_error: IdError, + /// New name. + new_name: Identifier, + }, + + /// Renaming variable with duplicate name {duplicate_name} to {new_name}. + DuplicateVariableName { + /// Duplicate name. + duplicate_name: Identifier, + /// New name. + new_name: Identifier, + }, + + /// Substituting {new_format} for invalid print format on variable {variable}. {format_error} + InvalidPrintFormat { + /// New format. + new_format: Format, + /// Variable. + variable: Identifier, + /// Underlying error. + format_error: FormatError, + }, + + /// Substituting {new_format} for invalid write format on variable {variable}. {format_error} + InvalidWriteFormat { + /// New format. + new_format: Format, + /// Variable. + variable: Identifier, + /// Underlying error. + format_error: FormatError, + }, + + /// Missing value range may not contain system-missing value. + MissingValueRangeSysmis, + + /// Ignoring missing value for long string variable {0}. 
+ LongStringMissingValue(Identifier), + + /// Invalid missing values for variable {name}: {error}. + InvalidMissingValues { + /// Variable name. + name: Identifier, + /// Kind of error with missing values. + error: MissingValuesError, + }, + + /// Invalid identifier {string}. {error} + InvalidIdentifier { + /// String that should be an identifier. + string: String, + /// Kind of error with the string. + error: IdError, + }, + + /// Unknown variable name {0}. + UnknownVariableName(Identifier), + + /// Mixed variable types in value labels. + MixedVariableTypes, + + /// Cannot weight by string variable {0}. + StringWeight(Identifier), + + /// File's specified weight index {0} does not refer to any variable. + InvalidWeightIndex(u16), + + /// Variable record for {name} refers to invalid variable label starting at offset {offset} in label record. + InvalidVarLabel { + /// Variable name. + name: Identifier, + /// Offset into label record. + offset: usize, + }, + + /// Value labels for {name}, at file offsets {start:#x}..{end:#x}, end with last value label (starting at file offset {offset:#x}) running past end offset. (This warning appears for some system files written by SPSS Data Entry products.) + ValueLabelOverflow { + /// Variable name. + name: Identifier, + /// File starting offset for variable's value labels. + start: usize, + /// File ending offset for variable's value labels. + end: usize, + /// File offset for last value label. + offset: u64, + }, + + /// Ignoring value labels for long string variable {0}. + LongStringValueLabel( + /// Variable name. + Identifier, + ), + + /// Value label for {name} specifies invalid range {start}..{end} into labels record with length {len}. + InvalidValueLabelRange { + /// Variable name. + name: Identifier, + /// Starting offset in labels record. + start: usize, + /// Ending offset in labels record. + end: usize, + /// Length of labels record. 
+ len: usize, + }, + + /// File header inconsistently reports {0} cases in one place and {1} in another; assuming {0} cases. + InconsistentCaseCount(u16, u16), +} + +#[derive(Debug, BinRead)] +#[br(little)] +struct FileHeader { + two: u32, + zero: u32, + main_record: Record, + variables_record: Record, + labels_record: Record, + data_record: Record, + _other_records: [Record; 11], + filename: [u8; 128], +} + +/// A record in an SPSS/PC+ system file. +#[derive(Copy, Clone, Debug, PartialEq, Eq, BinRead)] +#[br(little)] +pub struct Record { + /// File starting offset of the record. + offset: u32, + /// Length of the record in bytes. + len: u32, +} + +impl Record { + fn new(offset: u32, len: u32) -> Self { + Self { offset, len } + } +} + +impl Display for Record { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "offset {}, length {}", self.offset, self.len) + } +} + +#[derive(Debug, BinRead)] +#[br(little)] +struct MainHeader { + one0: u16, + family: [u8; 2], + product: [u8; 60], + sysmis: f64, + zero0: u32, + zero1: u32, + one1: u16, + compressed: u16, + nominal_case_size: u16, + n_cases0: u16, + weight_index: u16, + _unknown: u16, + n_cases1: u16, + zero2: u16, + creation_date: [u8; 8], + creation_time: [u8; 8], + file_label: [u8; 64], +} + +#[derive(BinRead)] +#[br(little)] +struct VariableRecord { + value_label_start: u32, + value_label_end: u32, + var_label_ofs: u32, + format: RawFormat, + name: [u8; 8], + missing: [u8; 8], +} + +impl PcFile> { + /// Opens the file at `path`. + pub fn open_file(path: P, warn: F) -> Result + where + P: AsRef, + F: FnMut(Warning), + { + let reader = BufReader::new(File::open(path)?); + Self::open(reader, warn) + } +} + +impl PcFile +where + R: Read + Seek, +{ + /// Opens `reader` as an SPSS/PC+ file, invoking `warn` with any warnings + /// diagnosed while reading it. 
+ pub fn open(mut reader: R, mut warn: F) -> Result + where + F: FnMut(Warning), + { + /* `read_inner` does all of the parsing, so that the caller can attach the reader's position to any error that it returns. */ + fn read_inner( + mut reader: R, + mut warn: F, + ) -> Result<(Dictionary, Metadata, Record, f64), ErrorDetails> + where + R: Read + Seek, + F: FnMut(Warning), + { + /* Fixed magic values identify the format; a file without them is reported as `NotPc`. */ + let file_header = FileHeader::read(&mut reader)?; + if file_header.two != 2 + || file_header.zero != 0 + || file_header.main_record != Record::new(0x100, 0xb0) + { + return Err(ErrorDetails::NotPc); + } + + reader.seek(SeekFrom::Start(file_header.main_record.offset as u64))?; + let main_header = MainHeader::read(&mut reader)?; + if main_header.one0 != 1 + || main_header.one1 != 1 + || main_header.zero0 != 0 + || main_header.zero1 != 0 + || main_header.zero2 != 0 + { + return Err(ErrorDetails::NotPc); + } + let family = match &main_header.family { + b"DE" => Ok(Family::De), + b"PC" => Ok(Family::Pc), + _ => Err(ErrorDetails::NotPc), + }?; + if main_header.n_cases0 != main_header.n_cases1 { + warn(Warning::InconsistentCaseCount( + main_header.n_cases0, + main_header.n_cases1, + )); + } + let sysmis = main_header.sysmis; + + let mut dictionary = Dictionary::new(WINDOWS_1252); + + let file_label = WINDOWS_1252.decode(&main_header.file_label); + let file_label = file_label.0.trim(); + if !file_label.is_empty() { + dictionary.file_label = Some(file_label.into()); + } + + /* A creation date or time that does not parse only produces a warning; the type's default value is used instead. */ + let creation_date = WINDOWS_1252.decode(&main_header.creation_date).0; + let creation_date = NaiveDate::parse_from_str(creation_date.trim(), "%m/%d/%y") + .unwrap_or_else(|_| { + warn(Warning::InvalidCreationDate(creation_date.into_owned())); + Default::default() + }); + let creation_time = WINDOWS_1252.decode(&main_header.creation_time).0; + let creation_time = NaiveTime::parse_from_str(creation_time.trim(), "%H:%M:%S") + .unwrap_or_else(|_| { + warn(Warning::InvalidCreationTime(creation_time.into_owned())); + Default::default() + }); + let creation = NaiveDateTime::new(creation_date, creation_time); + + /* `generate_name` returns the next `VARnnn` name, counting up from the last number tried, that is not already in `dictionary`. */ + let mut n_generated_names = 0; + fn
generate_name(dictionary: &Dictionary, n_generated_names: &mut usize) -> Identifier { + loop { + *n_generated_names = n_generated_names.checked_add(1).unwrap(); + let name = Identifier::from_encoding( + format!("VAR{:03}", *n_generated_names), + WINDOWS_1252, + ) + .unwrap(); + if !dictionary.variables.contains(&name.0) { + return name; + } + } + } + + /* Check the labels record against the file size before allocating a buffer for it, then read it whole. Variable labels and value labels are located in `labels` by offset below. */ + let file_size = reader.seek(SeekFrom::End(0))?; + if u64::from(file_header.labels_record.offset) + + u64::from(file_header.labels_record.len) + > file_size + { + return Err(ErrorDetails::InvalidLabelsRecord { + record: file_header.labels_record, + file_size, + }); + } + reader.seek(SeekFrom::Start(file_header.labels_record.offset as u64))?; + let mut labels = vec![0; file_header.labels_record.len as usize]; + reader.read_exact(&mut labels)?; + + reader.seek(SeekFrom::Start(file_header.variables_record.offset as u64))?; + + /* `index` counts variable records (one per chunk), which is how `weight_index` and `nominal_case_size` count too, whereas the dictionary counts variables. */ + let mut index = 0; + let mut weight_index = None; + let mut n_overflows = 0; + while index < main_header.nominal_case_size as usize { + if main_header.weight_index as usize == index + 1 { + weight_index = Some(dictionary.variables.len()); + } + + let variable_record = VariableRecord::read(&mut reader)?; + let mut name = String::from(WINDOWS_1252.decode(&variable_record.name).0.trim()); + /* Rewrite a leading `$` to `@` (presumably so that the name can pass `must_be_ordinary`; confirm). */ + if name.starts_with('$') { + name.replace_range(..1, "@"); + } + let name = match Identifier::from_encoding(name, WINDOWS_1252) + .and_then(Identifier::must_be_ordinary) + { + Ok(name) => { + if !dictionary.variables.contains(&name.0) { + name + } else { + let new_name = generate_name(&dictionary, &mut n_generated_names); + warn(Warning::DuplicateVariableName { + duplicate_name: name.clone(), + new_name: new_name.clone(), + }); + new_name + } + } + Err(id_error) => { + let new_name = generate_name(&dictionary, &mut n_generated_names); + warn(Warning::InvalidVariableName { + id_error, + new_name: new_name.clone(), + }); + new_name + } + }; + + let format = UncheckedFormat::try_from(variable_record.format) +
.and_then(Format::try_from) + .map_err(ErrorDetails::InvalidFormat)?; + + let width = format.var_width(); + let mut variable = Variable::new(name, width, WINDOWS_1252); + /* NOTE(review): only the width derived from `format` is used; `format` itself is not stored on `variable` anywhere in this function. Confirm that this is intended. */ + + // This `unwrap` cannot panic because `format`, from + // `RawFormat`, can only represent a width <= 255. + let n_chunks = width.n_chunks().unwrap(); + + /* Interprets 8 raw bytes as a value of the given `width`: a little-endian `f64`, with `sysmis` becoming a missing number, or the first `width` bytes as a string. */ + fn parse_datum( + datum: [u8; 8], + width: VarWidth, + sysmis: f64, + ) -> Datum> { + match width { + VarWidth::Numeric => { + let value = f64::from_le_bytes(datum); + Datum::Number((value != sysmis).then_some(value)) + } + VarWidth::String(width) => Datum::String( + ByteString::from(&datum[..width as usize]).with_encoding(WINDOWS_1252), + ), + } + } + + /* A `missing` field that equals `sysmis` means that the variable has no missing value. */ + if sysmis != f64::from_le_bytes(variable_record.missing) { + if !width.is_long_string() { + let missing_value = MissingValues::new( + vec![parse_datum(variable_record.missing, width, sysmis)], + None, + ) + .unwrap(); + variable + .missing_values_mut() + .replace(missing_value) + .unwrap(); + } else { + warn(Warning::LongStringMissingValue(variable.name.clone())) + } + } + + if variable_record.var_label_ofs != 0 { + /* Label offsets in variable records are 7 less than the corresponding index into `labels`. The variable label is a length byte followed by that many bytes of text. */ + let offset = variable_record.var_label_ofs as usize + 7; + if let Some(len) = labels.get(offset) + && let Some(slice) = labels.get(offset + 1..offset + 1 + *len as usize) + { + variable.label = Some(WINDOWS_1252.decode(slice).0.into_owned()); + } else { + warn(Warning::InvalidVarLabel { + name: variable.name.clone(), + offset, + }); + } + } + + if variable_record.value_label_start != 0 { + if width.is_long_string() { + warn(Warning::LongStringValueLabel(variable.name.clone())); + } else { + let start = variable_record.value_label_start as usize + 7; + let end = variable_record.value_label_end as usize + 7; + if let Some(mut slice) = labels.get(start..end) { + /* Each value label is an 8-byte value, a length byte, and that many bytes of label text. */ + while !slice.is_empty() { + if let Some((value, rest)) = slice.split_at_checked(8) + && let Some((length, rest)) = rest.split_first() + && let Some((label, rest)) = + rest.split_at_checked(*length as usize) + { + let label =
WINDOWS_1252.decode(label).0.into_owned(); + let value = + parse_datum(value.try_into().unwrap(), width, sysmis) + .without_encoding(); + variable.value_labels.insert(value, label); + slice = rest; + } else { + /* The last label runs past the end of the range. Keep the labels already parsed, and warn only the first time this happens in the file. */ + if n_overflows == 0 { + warn(Warning::ValueLabelOverflow { + name: variable.name.clone(), + start: start + + file_header.labels_record.offset as usize, + end: end + file_header.labels_record.offset as usize, + offset: file_header.labels_record.offset as u64 + + variable_record.value_label_start as u64 + + 7 + + (variable_record.value_label_end as u64 + - variable_record.value_label_start as u64 + - slice.len() as u64), + }); + } + n_overflows += 1; + break; + }; + } + } else { + warn(Warning::InvalidValueLabelRange { + name: variable.name.clone(), + start, + end, + len: labels.len(), + }); + } + } + } + + dictionary.add_var(variable).unwrap(); + + /* Read and discard the additional records that follow a variable occupying more than one chunk. */ + for _ in 1..n_chunks { + let _variable_record = VariableRecord::read(&mut reader)?; + } + index += n_chunks; + if index > main_header.nominal_case_size as usize { + return Err(ErrorDetails::TooManyVariables { + nominal_case_size: main_header.nominal_case_size, + n_chunks: index, + }); + } + } + + /* `weight_index` is `None` if `main_header.weight_index` is 0 or does not match the first record of any variable. */ + if let Some(weight_index) = weight_index { + if dictionary.set_weight(Some(weight_index)).is_err() { + warn(Warning::StringWeight( + dictionary + .variables + .get_index(weight_index) + .unwrap() + .name + .clone(), + )) + } + } else if main_header.weight_index != 0 { + warn(Warning::InvalidWeightIndex(main_header.weight_index)) + } + + /* Decodes `s` as Windows-1252 and trims spaces and NULs from both ends; an empty result becomes `None`. */ + fn decode_optional_string(s: &[u8]) -> Option { + let s = WINDOWS_1252.decode(s).0; + let s = s.trim_matches(&[' ', '\0']); + if s.is_empty() { None } else { Some(s.into()) } + } + let metadata = Metadata { + creation, + family, + product: decode_optional_string(&main_header.product), + filename: decode_optional_string(&file_header.filename), + compressed: main_header.compressed != 0, + n_cases: main_header.n_cases0, + }; + + Ok(( + dictionary, + metadata, + file_header.data_record, +
main_header.sysmis, + )) + } + + let (dictionary, metadata, data_record, sysmis) = read_inner(&mut reader, &mut warn) + .map_err(|details| Error { + offset: reader.stream_position().ok(), + details, + })?; + + reader.seek(SeekFrom::Start(data_record.offset as u64))?; + let reader = reader.take(data_record.len as u64); + let cases = Cases::new(reader, &dictionary, &metadata, sysmis); + Ok(PcFile { + dictionary, + metadata, + cases, + }) + } +} diff --git a/rust/pspp/src/pc/testdata/README.md b/rust/pspp/src/pc/testdata/README.md new file mode 100644 index 0000000000..5b623aca1e --- /dev/null +++ b/rust/pspp/src/pc/testdata/README.md @@ -0,0 +1,2 @@ +The two .sys files in this directory are old ones found on the Internet. They +do not contain any personally identifying information. diff --git a/rust/pspp/src/pc/testdata/test1.expected b/rust/pspp/src/pc/testdata/test1.expected new file mode 100644 index 0000000000..04cc739de3 --- /dev/null +++ b/rust/pspp/src/pc/testdata/test1.expected @@ -0,0 +1,68 @@ +╭───────────────┬───────────────────────────────────────╮ +│Created │ 05-OCT-1987 17:18:05│ +│Product │SPSS SYSTEM FILE. 
IBM PC DOS, SPSS/PC+│ +│Compression │Simple │ +│Number of Cases│ 30│ +╰───────────────┴───────────────────────────────────────╯ + +╭─────────┬────────╮ +│Label │SPSS/PC+│ +│Variables│ 7│ +╰─────────┴────────╯ + + Variables +╭──────────────────┬────────┬──────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│ Label │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├──────────────────┼────────┼──────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│@CASENUM │ 1│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│@DATE │ 2│ │Nominal │Input│ 8│Left │A8 │A8 │ │ +│@WEIGHT │ 3│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│Advertiser code │ 4│Advertiser code │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│Microprocessor │ 5│Microprocessor │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│Clock speed in MHz│ 6│Clock speed in MHz│ │Input│ 8│Right │F8.2 │F8.2 │ │ +│Price in dollars │ 7│Price in dollars │ │Input│ 8│Right │F8.2 │F8.2 │ │ +╰──────────────────┴────────┴──────────────────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + + Value Labels +╭───────────────────┬─────╮ +│Variable Value │ │ +├───────────────────┼─────┤ +│Microprocessor 1.00│8088 │ +│ 2.00│80286│ +│ 3.00│80386│ +╰───────────────────┴─────╯ + +╭────┬────────┬────────┬───────┬───────────────┬──────────────┬──────────────────┬────────────────╮ +│Case│@CASENUM│ @DATE │@WEIGHT│Advertiser code│Microprocessor│Clock speed in MHz│Price in dollars│ +├────┼────────┼────────┼───────┼───────────────┼──────────────┼──────────────────┼────────────────┤ +│1 │ 6.00│10/5/87 │ 6.00│ 18.00│ 6.00│ 13.00│ 995.00│ +│2 │ 7.00│10/5/87 │ 6.00│ 23.00│ 6.00│ 13.00│ 699.00│ +│3 │ 8.00│10/5/87 │ 6.00│ 23.00│ 7.00│ 13.00│ 995.00│ +│4 │ 9.00│10/5/87 │ 6.00│ 23.00│ 7.00│ 15.00│ 1529.00│ +│5 │ 10.00│10/5/87 │ 6.00│ 23.00│ 7.00│ 17.00│ 1695.00│ +│6 │ 11.00│10/5/87 │ 6.00│ 43.00│ 8.00│ 21.00│ 2995.00│ +│7 │ 
12.00│10/5/87 │ 6.00│ 50.00│ 7.00│ 13.00│ 1999.00│ +│8 │ 14.00│10/5/87 │ 6.00│ 50.00│ 7.00│ 15.00│ 2995.00│ +│9 │ 16.00│10/5/87 │ 6.00│ 69.00│ 7.00│ 15.00│ 1795.00│ +│10 │ 17.00│10/5/87 │ 6.00│ 69.00│ 6.00│ 4.77│ 995.00│ +│11 │ 18.00│10/5/87 │ 6.00│ 72.00│ 6.00│ 13.00│ 648.00│ +│12 │ 19.00│10/5/87 │ 6.00│ 72.00│ 7.00│ 15.00│ 1248.00│ +│13 │ 20.00│10/5/87 │ 6.00│ 79.00│ 7.00│ 17.00│ 1199.00│ +│14 │ 21.00│10/5/87 │ 6.00│ 82.00│ 7.00│ 15.00│ 1199.00│ +│15 │ 22.00│10/5/87 │ 6.00│ 82.00│ 6.00│ 15.00│ 699.00│ +│16 │ 23.00│10/5/87 │ 6.00│ 90.00│ 6.00│ 13.00│ 537.00│ +│17 │ 24.00│10/5/87 │ 6.00│ 90.00│ 7.00│ 15.00│ 1585.00│ +│18 │ 25.00│10/5/87 │ 6.00│ 91.00│ 7.00│ 13.00│ 1084.00│ +│19 │ 26.00│10/5/87 │ 6.00│ 91.00│ 7.00│ 15.00│ 1134.00│ +│20 │ 27.00│10/5/87 │ 6.00│ 91.00│ 7.00│ 17.00│ 1288.00│ +│21 │ 28.00│10/5/87 │ 6.00│ 91.00│ 6.00│ 15.00│ 634.00│ +│22 │ 29.00│10/5/87 │ 6.00│ 91.00│ 6.00│ 13.00│ 538.00│ +│23 │ 30.00│10/5/87 │ 6.00│ 99.00│ 6.00│ 15.00│ 785.00│ +│24 │ 31.00│10/5/87 │ 6.00│ 99.00│ 7.00│ 15.00│ 1555.00│ +│25 │ 32.00│10/5/87 │ 6.00│ 99.00│ 8.00│ 21.00│ 3620.00│ +│26 │ 33.00│10/5/87 │ 6.00│ 112.00│ 8.00│ 25.00│ 4999.00│ +│27 │ 34.00│10/5/87 │ 6.00│ 112.00│ 7.00│ 17.00│ 2999.00│ +│28 │ 35.00│10/5/87 │ 6.00│ 112.00│ 6.00│ 13.00│ 499.00│ +│29 │ 36.00│10/5/87 │ 6.00│ 112.00│ 8.00│ 21.00│ 2999.00│ +│30 │ 37.00│10/5/87 │ 6.00│ 112.00│ 6.00│ 13.00│ 1799.00│ +╰────┴────────┴────────┴───────┴───────────────┴──────────────┴──────────────────┴────────────────╯ diff --git a/rust/pspp/src/pc/testdata/test1.sys b/rust/pspp/src/pc/testdata/test1.sys new file mode 100644 index 0000000000..64cbf6d2ec Binary files /dev/null and b/rust/pspp/src/pc/testdata/test1.sys differ diff --git a/rust/pspp/src/pc/testdata/test2.expected b/rust/pspp/src/pc/testdata/test2.expected new file mode 100644 index 0000000000..f6827bc668 --- /dev/null +++ b/rust/pspp/src/pc/testdata/test2.expected @@ -0,0 +1,106 @@ +╭───────────────┬───────────────────────────────────────╮ +│Created │ 05-DEC-1989 
23:07:29│ +│Product │SPSS SYSTEM FILE. IBM PC DOS, SPSS/PC+│ +│Compression │None │ +│Number of Cases│ 70│ +╰───────────────┴───────────────────────────────────────╯ + +╭─────────┬────────╮ +│Label │SPSS/PC+│ +│Variables│ 14│ +╰─────────┴────────╯ + + Variables +╭────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│@CASENUM│ 1│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│@DATE │ 2│ │Nominal │Input│ 8│Left │A8 │A8 │ │ +│@WEIGHT │ 3│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│ID │ 4│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│LIVE │ 5│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│SEX │ 6│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│AGE │ 7│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│FIRSTVIS│ 8│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│PROMPT │ 9│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│TIME │ 10│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│PEOPLE │ 11│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│ADVERT │ 12│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│OTHER │ 13│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│COMMENT │ 14│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +╰────────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +╭────┬────────┬────────┬───────┬───────┬────┬────┬────┬────────┬──────┬────┬──────┬──────┬─────┬───────╮ +│Case│@CASENUM│ @DATE │@WEIGHT│ ID │LIVE│ SEX│ AGE│FIRSTVIS│PROMPT│TIME│PEOPLE│ADVERT│OTHER│COMMENT│ +├────┼────────┼────────┼───────┼───────┼────┼────┼────┼────────┼──────┼────┼──────┼──────┼─────┼───────┤ +│1 │ 1.00│12/5/89 │ 1.00│ 87.00│1.00│2.00│5.00│ 2.00│ 1.00│4.00│ 1.00│ 9.00│ 9.00│ 1.00│ +│2 │ 2.00│12/5/89 │ 1.00│ 88.00│1.00│1.00│5.00│ 2.00│ 1.00│4.00│ 9.00│ 9.00│ 1.00│ 1.00│ +│3 │ 3.00│12/5/89 │ 1.00│ 89.00│1.00│2.00│2.00│ 1.00│ 1.00│4.00│ 9.00│ 9.00│ 9.00│ 9.00│ +│4 │ 4.00│12/5/89 │ 1.00│ 90.00│4.00│2.00│2.00│ 1.00│ 
1.00│3.00│ 9.00│ 9.00│ 1.00│ 1.00│ +│5 │ 5.00│12/5/89 │ 1.00│ 91.00│4.00│1.00│2.00│ 1.00│ 1.00│2.00│ 9.00│ 9.00│ 1.00│ 1.00│ +│6 │ 6.00│12/5/89 │ 1.00│ 92.00│4.00│2.00│2.00│ 1.00│ 1.00│2.00│ 9.00│ 4.00│ 1.00│ 1.00│ +│7 │ 7.00│12/5/89 │ 1.00│ 93.00│4.00│2.00│2.00│ 1.00│ 9.00│2.00│ 9.00│ 9.00│ 1.00│ 9.00│ +│8 │ 8.00│12/5/89 │ 1.00│ 94.00│1.00│2.00│4.00│ 1.00│ 1.00│3.00│ 9.00│ 5.00│ 9.00│ 1.00│ +│9 │ 9.00│12/5/89 │ 1.00│ 95.00│1.00│2.00│4.00│ 1.00│ 1.00│2.00│ 9.00│ 6.00│ 9.00│ 9.00│ +│10 │ 10.00│12/5/89 │ 1.00│ 96.00│2.00│2.00│4.00│ 2.00│ 1.00│2.00│ 9.00│ 1.00│ 9.00│ 1.00│ +│11 │ 11.00│12/5/89 │ 1.00│1000.00│4.00│2.00│2.00│ 9.00│ 1.00│1.00│ 1.00│ 9.00│ 9.00│ 9.00│ +│12 │ 12.00│12/5/89 │ 1.00│1001.00│4.00│1.00│3.00│ 9.00│ 1.00│2.00│ 9.00│ 9.00│ 9.00│ 1.00│ +│13 │ 13.00│12/5/89 │ 1.00│1002.00│4.00│2.00│3.00│ 1.00│ 1.00│3.00│ 1.00│ 9.00│ 9.00│ 1.00│ +│14 │ 14.00│12/5/89 │ 1.00│1003.00│2.00│2.00│3.00│ 2.00│ 1.00│3.00│ 9.00│ 2.00│ 9.00│ 9.00│ +│15 │ 15.00│12/5/89 │ 1.00│1004.00│1.00│1.00│3.00│ 2.00│ 1.00│3.00│ 9.00│ 2.00│ 9.00│ 1.00│ +│16 │ 16.00│12/5/89 │ 1.00│1005.00│1.00│1.00│3.00│ 2.00│ 1.00│3.00│ 9.00│ 3.00│ 9.00│ 1.00│ +│17 │ 17.00│12/5/89 │ 1.00│1006.00│4.00│2.00│3.00│ 1.00│ 1.00│2.00│ 1.00│ 4.00│ 9.00│ 1.00│ +│18 │ 18.00│12/5/89 │ 1.00│1007.00│4.00│1.00│3.00│ 1.00│ 1.00│3.00│ 9.00│ 9.00│ 1.00│ 1.00│ +│19 │ 19.00│12/5/89 │ 1.00│1008.00│1.00│1.00│3.00│ 2.00│ 1.00│4.00│ 9.00│ 9.00│ 9.00│ 9.00│ +│20 │ 20.00│12/5/89 │ 1.00│1009.00│1.00│9.00│3.00│ 1.00│ 9.00│4.00│ 9.00│ 9.00│ 1.00│ 1.00│ +│21 │ 21.00│12/5/89 │ 1.00│1010.00│4.00│2.00│1.00│ 1.00│ 1.00│1.00│ 9.00│ 9.00│ 1.00│ 9.00│ +│22 │ 22.00│12/5/89 │ 1.00│1011.00│4.00│9.00│2.00│ 1.00│ 1.00│3.00│ 1.00│ 8.00│ 9.00│ 9.00│ +│23 │ 23.00│12/5/89 │ 1.00│1012.00│4.00│1.00│2.00│ 1.00│ 1.00│2.00│ 1.00│ 7.00│ 9.00│ 1.00│ +│24 │ 24.00│12/5/89 │ 1.00│1013.00│4.00│1.00│2.00│ 1.00│ 9.00│5.00│ 9.00│ 3.00│ 9.00│ 9.00│ +│25 │ 25.00│12/5/89 │ 1.00│1014.00│4.00│1.00│2.00│ 1.00│ 1.00│2.00│ 5.00│ 9.00│ 9.00│ 9.00│ +│26 │ 26.00│12/5/89 │ 
1.00│1015.00│4.00│1.00│2.00│ 1.00│ 1.00│4.00│ 9.00│ 3.00│ 9.00│ 9.00│ +│27 │ 27.00│12/5/89 │ 1.00│1016.00│4.00│2.00│2.00│ 1.00│ 1.00│2.00│ 5.00│ 9.00│ 9.00│ 1.00│ +│28 │ 28.00│12/5/89 │ 1.00│1017.00│1.00│9.00│2.00│ 9.00│ 1.00│2.00│ 2.00│ 9.00│ 9.00│ 9.00│ +│29 │ 29.00│12/5/89 │ 1.00│1018.00│1.00│2.00│2.00│ 2.00│ 1.00│3.00│ 9.00│ 3.00│ 9.00│ 1.00│ +│30 │ 30.00│12/5/89 │ 1.00│1019.00│1.00│2.00│2.00│ 2.00│ 1.00│4.00│ 9.00│ 2.00│ 9.00│ 1.00│ +│31 │ 31.00│12/5/89 │ 1.00│1020.00│4.00│2.00│2.00│ 1.00│ 1.00│1.00│ 4.00│ 9.00│ 9.00│ 9.00│ +│32 │ 32.00│12/5/89 │ 1.00│1021.00│4.00│1.00│3.00│ 1.00│ 1.00│3.00│ 4.00│ 9.00│ 9.00│ 1.00│ +│33 │ 33.00│12/5/89 │ 1.00│1022.00│4.00│2.00│3.00│ 1.00│ 1.00│3.00│ 3.00│ 9.00│ 9.00│ 1.00│ +│34 │ 34.00│12/5/89 │ 1.00│1023.00│4.00│2.00│3.00│ 1.00│ 9.00│9.00│ 5.00│ 8.00│ 9.00│ 9.00│ +│35 │ 35.00│12/5/89 │ 1.00│1024.00│4.00│2.00│3.00│ 1.00│ 9.00│2.00│ 4.00│ 9.00│ 9.00│ 9.00│ +│36 │ 36.00│12/5/89 │ 1.00│1025.00│3.00│1.00│3.00│ 1.00│ 1.00│2.00│ 9.00│ 9.00│ 9.00│ 1.00│ +│37 │ 37.00│12/5/89 │ 1.00│1026.00│2.00│2.00│3.00│ 2.00│ 1.00│9.00│ 9.00│ 9.00│ 1.00│ 9.00│ +│38 │ 38.00│12/5/89 │ 1.00│1027.00│1.00│1.00│3.00│ 1.00│ 1.00│4.00│ 1.00│ 9.00│ 9.00│ 1.00│ +│39 │ 39.00│12/5/89 │ 1.00│1028.00│3.00│2.00│3.00│ 1.00│ 1.00│5.00│ 4.00│ 9.00│ 9.00│ 1.00│ +│40 │ 40.00│12/5/89 │ 1.00│1029.00│1.00│2.00│3.00│ 1.00│ 1.00│4.00│ 1.00│ 2.00│ 9.00│ 1.00│ +│41 │ 41.00│12/5/89 │ 1.00│1030.00│1.00│2.00│2.00│ 1.00│ 1.00│5.00│ 1.00│ 2.00│ 9.00│ 9.00│ +│42 │ 42.00│12/5/89 │ 1.00│1031.00│4.00│2.00│5.00│ 2.00│ 1.00│2.00│ 5.00│ 8.00│ 9.00│ 1.00│ +│43 │ 43.00│12/5/89 │ 1.00│1032.00│1.00│1.00│5.00│ 9.00│ 1.00│2.00│ 9.00│ 9.00│ 1.00│ 9.00│ +│44 │ 44.00│12/5/89 │ 1.00│1033.00│4.00│1.00│1.00│ 1.00│ 1.00│2.00│ 4.00│ 9.00│ 1.00│ 1.00│ +│45 │ 45.00│12/5/89 │ 1.00│1034.00│4.00│2.00│2.00│ 1.00│ 1.00│3.00│ 3.00│ 4.00│ 9.00│ 1.00│ +│46 │ 46.00│12/5/89 │ 1.00│1035.00│4.00│1.00│2.00│ 1.00│ 1.00│1.00│ 9.00│ 9.00│ 1.00│ 1.00│ +│47 │ 47.00│12/5/89 │ 1.00│1036.00│4.00│9.00│5.00│ 1.00│ 1.00│1.00│ 
1.00│ 9.00│ 9.00│ 9.00│ +│48 │ 48.00│12/5/89 │ 1.00│1039.00│2.00│2.00│4.00│ 1.00│ 1.00│1.00│ 4.00│ 8.00│ 9.00│ 1.00│ +│49 │ 49.00│12/5/89 │ 1.00│ 21.00│1.00│9.00│2.00│ 1.00│ 1.00│1.00│ 5.00│ 4.00│ 9.00│ 1.00│ +│50 │ 50.00│12/5/89 │ 1.00│ 23.00│3.00│2.00│5.00│ 2.00│ 1.00│3.00│ 2.00│ 8.00│ 9.00│ 1.00│ +│51 │ 51.00│12/5/89 │ 1.00│ 24.00│2.00│2.00│5.00│ 2.00│ 1.00│5.00│ 9.00│ 9.00│ 1.00│ 1.00│ +│52 │ 52.00│12/5/89 │ 1.00│ 27.00│1.00│2.00│3.00│ 1.00│ 1.00│3.00│ 5.00│ 7.00│ 1.00│ 1.00│ +│53 │ 53.00│12/5/89 │ 1.00│ 28.00│4.00│2.00│3.00│ 1.00│ 1.00│3.00│ 1.00│ 4.00│ 9.00│ 1.00│ +│54 │ 54.00│12/5/89 │ 1.00│1040.00│4.00│1.00│1.00│ 1.00│ 9.00│9.00│ 9.00│ 9.00│ 9.00│ 9.00│ +│55 │ 55.00│12/5/89 │ 1.00│1041.00│4.00│9.00│3.00│ 1.00│ 1.00│4.00│ 9.00│ 4.00│ 9.00│ 1.00│ +│56 │ 56.00│12/5/89 │ 1.00│1043.00│3.00│1.00│3.00│ 1.00│ 1.00│3.00│ 9.00│ 4.00│ 9.00│ 9.00│ +│57 │ 57.00│12/5/89 │ 1.00│1044.00│1.00│1.00│2.00│ 1.00│ 1.00│3.00│ 9.00│ 9.00│ 9.00│ 1.00│ +│58 │ 58.00│12/5/89 │ 1.00│1045.00│2.00│1.00│2.00│ 1.00│ 1.00│3.00│ 9.00│ 1.00│ 9.00│ 9.00│ +│59 │ 59.00│12/5/89 │ 1.00│1046.00│3.00│1.00│2.00│ 1.00│ 1.00│3.00│ 9.00│ 9.00│ 1.00│ 1.00│ +│60 │ 60.00│12/5/89 │ 1.00│1047.00│3.00│1.00│2.00│ 1.00│ 1.00│3.00│ 3.00│ 7.00│ 9.00│ 1.00│ +│61 │ 61.00│12/5/89 │ 1.00│1048.00│3.00│1.00│2.00│ 1.00│ 1.00│2.00│ 9.00│ 8.00│ 1.00│ 9.00│ +│62 │ 62.00│12/5/89 │ 1.00│1049.00│3.00│1.00│2.00│ 1.00│ 1.00│2.00│ 9.00│ 9.00│ 1.00│ 1.00│ +│63 │ 63.00│12/5/89 │ 1.00│1050.00│1.00│2.00│2.00│ 1.00│ 1.00│1.00│ 9.00│ 1.00│ 9.00│ 1.00│ +│64 │ 64.00│12/5/89 │ 1.00│1051.00│1.00│2.00│2.00│ 1.00│ 1.00│5.00│ 1.00│ 2.00│ 9.00│ 1.00│ +│65 │ 65.00│12/5/89 │ 1.00│1052.00│2.00│2.00│2.00│ 1.00│ 1.00│9.00│ 9.00│ 9.00│ 9.00│ 9.00│ +│66 │ 66.00│12/5/89 │ 1.00│1053.00│2.00│2.00│3.00│ 2.00│ 3.00│9.00│ 1.00│ 3.00│ 9.00│ 1.00│ +│67 │ 67.00│12/5/89 │ 1.00│1054.00│1.00│2.00│5.00│ 2.00│ 1.00│1.00│ 1.00│ 3.00│ 9.00│ 1.00│ +│68 │ 68.00│12/5/89 │ 1.00│1055.00│1.00│2.00│5.00│ 2.00│ 1.00│3.00│ 1.00│ 2.00│ 9.00│ 1.00│ +│69 │ 69.00│12/5/89 │ 
1.00│1056.00│2.00│2.00│5.00│ 2.00│ 4.00│9.00│ 1.00│ 1.00│ 9.00│ 9.00│ +│70 │ 70.00│12/5/89 │ 1.00│1057.00│2.00│2.00│5.00│ 1.00│ 4.00│9.00│ 9.00│ 1.00│ 1.00│ 1.00│ +╰────┴────────┴────────┴───────┴───────┴────┴────┴────┴────────┴──────┴────┴──────┴──────┴─────┴───────╯ diff --git a/rust/pspp/src/pc/testdata/test2.sys b/rust/pspp/src/pc/testdata/test2.sys new file mode 100644 index 0000000000..4a759c1e3b Binary files /dev/null and b/rust/pspp/src/pc/testdata/test2.sys differ diff --git a/rust/pspp/src/pc/tests.rs b/rust/pspp/src/pc/tests.rs new file mode 100644 index 0000000000..25dab4f513 --- /dev/null +++ b/rust/pspp/src/pc/tests.rs @@ -0,0 +1,59 @@ +use std::path::Path; + +use itertools::Itertools; + +use crate::{ + data::cases_to_output, + output::{ + Details, Item, Text, + pivot::{PivotTable, tests::assert_lines_eq}, + }, + pc::PcFile, +}; + +/** Reads `src/pc/testdata/NAME.sys` and compares a rendering of its warnings, metadata, dictionary, and cases (or of the error, if the file cannot be read) against `NAME.expected`. If they differ and the `PSPP_REFRESH_EXPECTED` environment variable is set, the expected file is rewritten and the test still fails. */ +fn test_pcfile(name: &str) { + let base_filename = Path::new("src/pc/testdata").join(name); + let input_filename = base_filename.with_extension("sys"); + let expected_filename = base_filename.with_extension("expected"); + + let mut warnings = Vec::new(); + let output = match PcFile::open_file(input_filename, |warning| warnings.push(warning)) { + Ok(pc_file) => { + let (dictionary, metadata, cases) = pc_file.into_parts(); + + let mut output = Vec::new(); + output.extend( + warnings + .into_iter() + .map(|warning| Item::from(Text::new_log(warning.to_string()))), + ); + output.push(PivotTable::from(&metadata).into()); + output.extend(dictionary.all_pivot_tables().into_iter().map_into()); + output.extend(cases_to_output(&dictionary, cases)); + Item::new(Details::Group(output.into_iter().map_into().collect())) + } + Err(error) => Item::new(Details::Text(Box::new(Text::new_log(error.to_string())))), + }; + + let actual = output.to_string(); + let expected = std::fs::read_to_string(&expected_filename).unwrap(); + if expected != actual { + if std::env::var("PSPP_REFRESH_EXPECTED").is_ok() { + std::fs::write(&expected_filename,
actual).unwrap(); + panic!("{}: refreshed output", expected_filename.display()); + } else { + eprintln!("note: rerun with PSPP_REFRESH_EXPECTED=1 to refresh expected output"); + } + } + assert_lines_eq(&expected, expected_filename.display(), &actual, "actual"); +} + +#[test] +fn pcfile_test1() { + test_pcfile("test1"); +} + +#[test] +fn pcfile_test2() { + test_pcfile("test2"); +} diff --git a/rust/pspp/src/show_pc.rs b/rust/pspp/src/show_pc.rs new file mode 100644 index 0000000000..385f8770be --- /dev/null +++ b/rust/pspp/src/show_pc.rs @@ -0,0 +1,300 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see <https://www.gnu.org/licenses/>. + +use anyhow::{Result, anyhow}; +use clap::{Args, ValueEnum}; +use pspp::{ + data::cases_to_output, + output::{ + Details, Item, Text, + driver::{Config, Driver}, + pivot::PivotTable, + }, + pc::PcFile, +}; +use serde::Serialize; +use std::{ + cell::RefCell, + ffi::OsStr, + fmt::{Display, Write as _}, + fs::File, + io::{BufReader, Write, stdout}, + path::{Path, PathBuf}, + rc::Rc, + sync::Arc, +}; + +/// Show information about SPSS/PC+ data files. +#[derive(Args, Clone, Debug)] +pub struct ShowPc { + /// What to show. + #[arg(value_enum)] + mode: Mode, + + /// File to show. + #[arg(required = true)] + input: PathBuf, + + /// Output file name. If omitted, output is written to stdout.
+ output: Option, + + /// Maximum number of cases to read. + /// + /// If specified without an argument, all cases will be read. + #[arg( + long = "data", + num_args = 0..=1, + default_missing_value = "18446744073709551615", + default_value_t = 0, + help_heading = "Input file options" + )] + /* NOTE(review): the `default_missing_value` string is 2^64 - 1, so it parses as `usize` only where `usize` is 64 bits wide. */ + max_cases: usize, + + /// Output driver configuration options. + #[arg(short = 'o', help_heading = "Output options")] + output_options: Vec, + + /// Output format. + #[arg(long, short = 'f', help_heading = "Output options")] + format: Option, +} + +/** Where `ShowPc::run` sends its results and warnings. */ +enum Output { + /** Render through an output driver. `mode` is kept for the error message in `show_json`. */ + Driver { + driver: Rc>>, + mode: Mode, + }, + /** Write JSON to `writer`, pretty-printed if `pretty` is true. */ + Json { + writer: Rc>>, + pretty: bool, + }, + /** Produce no output. */ + Discard, +} + +impl Output { + /** Writes `value` as one JSON document followed by a newline. Returns an error if the output is a driver, and does nothing for `Discard`. */ + fn show_json(&self, value: &T) -> Result<()> + where + T: Serialize, + { + match self { + Self::Driver { mode, driver: _ } => { + Err(anyhow!("Mode '{mode}' only supports output as JSON.")) + } + Self::Json { writer, pretty } => { + let mut writer = writer.borrow_mut(); + match pretty { + true => serde_json::to_writer_pretty(&mut *writer, value)?, + false => serde_json::to_writer(&mut *writer, value)?, + }; + writeln!(writer)?; + Ok(()) + } + Self::Discard => Ok(()), + } + } + + /** Reports `warning`: as a log text item for a driver, or as a JSON object with a single `warning` member for JSON output. An error writing the JSON is ignored. */ + fn warn(&self, warning: &impl Display) { + match self { + Output::Driver { driver, .. } => { + driver + .borrow_mut() + .write(&Arc::new(Item::from(Text::new_log(warning.to_string())))); + } + Output::Json { ..
} => { + #[derive(Serialize)] + struct Warning { + warning: String, + } + let warning = Warning { + warning: warning.to_string(), + }; + let _ = self.show_json(&warning); + } + Self::Discard => (), + } + } +} + +impl ShowPc { + /** Runs the command. Chooses an output format (the explicit format if given, else one implied by the output file extension, else JSON), reads the input as an SPSS/PC+ file, and writes either the dictionary plus up to `max_cases` cases or the metadata, depending on `mode`. Warnings from reading the file go to the same output. */ + pub fn run(self) -> Result<()> { + /* Open the input before creating any output, so that a bad input path cannot truncate an existing output file. */ + let reader = BufReader::new(File::open(&self.input)?); + + let format = if let Some(format) = self.format { + format + } else if let Some(output_file) = &self.output { + /* A missing or non-UTF-8 extension falls through to the default arm. */ + match output_file + .extension() + .and_then(OsStr::to_str) + .unwrap_or("") + { + "json" => ShowFormat::Json, + "ndjson" => ShowFormat::Ndjson, + _ => ShowFormat::Output, + } + } else { + ShowFormat::Json + }; + + let output = match format { + ShowFormat::Output => { + /* Build the driver configuration as TOML text: the output file, then the user's `-o` options, with a default `driver` prepended if none of those set one. */ + let mut config = String::new(); + + if let Some(file) = &self.output { + #[derive(Serialize)] + struct File<'a> { + file: &'a Path, + } + let file = File { + file: file.as_path(), + }; + let toml_file = toml::to_string_pretty(&file).unwrap(); + config.push_str(&toml_file); + } + for option in &self.output_options { + writeln!(&mut config, "{option}").unwrap(); + } + + let table: toml::Table = toml::from_str(&config)?; + if !table.contains_key("driver") { + let driver = if let Some(file) = &self.output { + ::driver_type_from_filename(file).ok_or_else(|| { + anyhow!("{}: no default output format for file name", file.display()) + })?
+ } else { + "text" + }; + + #[derive(Serialize)] + struct DriverConfig { + driver: &'static str, + } + config.insert_str( + 0, + &toml::to_string_pretty(&DriverConfig { driver }).unwrap(), + ); + } + + let config: Config = toml::from_str(&config)?; + Output::Driver { + mode: self.mode, + driver: Rc::new(RefCell::new(Box::new(::new(&config)?))), + } + } + ShowFormat::Json | ShowFormat::Ndjson => Output::Json { + pretty: format == ShowFormat::Json, + writer: if let Some(output_file) = &self.output { + Rc::new(RefCell::new(Box::new(File::create(output_file)?))) + } else { + Rc::new(RefCell::new(Box::new(stdout()))) + }, + }, + ShowFormat::Discard => Output::Discard, + }; + + match self.mode { + Mode::Dictionary => { + let PcFile { + dictionary, + metadata: _, + cases, + } = PcFile::open(reader, |warning| output.warn(&warning))?; + /* This is the only limit needed on the number of cases; both branches below consume `cases` as is. */ + let cases = cases.take(self.max_cases); + + match &output { + Output::Driver { driver, mode: _ } => { + let mut output = Vec::new(); + output.extend( + dictionary + .all_pivot_tables() + .into_iter() + .map(Item::new), + ); + output.extend(cases_to_output(&dictionary, cases)); + driver + .borrow_mut() + .write(&Arc::new(Item::new(Details::Group( + output.into_iter().map(Arc::new).collect(), + )))); + } + Output::Json { .. } => { + output.show_json(&dictionary)?; + for case in cases { + output.show_json(&case?)?; + } + } + Output::Discard => (), + } + } + Mode::Metadata => { + let metadata = PcFile::open(reader, |warning| output.warn(&warning))?.metadata; + + match &output { + Output::Driver { driver, mode: _ } => { + driver + .borrow_mut() + .write(&Arc::new(Item::new(PivotTable::from(&metadata)))); + } + Output::Json { .. } => { + output.show_json(&metadata)?; + } + Output::Discard => (), + } + } + } + Ok(()) + } +} + +/// What to show in a system file.
+#[derive(Clone, Copy, Debug, Default, PartialEq, ValueEnum)] +enum Mode { + /// File dictionary, with variables, value labels, ... + #[default] + #[value(alias = "dict")] + Dictionary, + + /// File metadata not included in the dictionary. + Metadata, +} + +impl Mode { + /* Returns the lowercase name of the mode, which is also what `Display` prints. */ + fn as_str(&self) -> &'static str { + match self { + Mode::Dictionary => "dictionary", + Mode::Metadata => "metadata", + } + } +} + +impl Display for Mode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +/* Output formats. When the user gives none, `ShowPc::run` chooses one from the output file extension, or JSON if there is no output file, rather than relying on `#[default]`. */ +#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +enum ShowFormat { + /// Pretty-printed JSON. + #[default] + Json, + /// Newline-delimited JSON. + Ndjson, + /// Pivot tables. + Output, + /// No output. + Discard, +} diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 7316d29b9d..d3e43b6b15 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -36,7 +36,7 @@ use crate::{ output::pivot::{Axis3, Dimension, Group, PivotTable, Value}, sys::{ raw::{ - self, DecodedRecord, RawCases, RawDatum, RawWidth, Reader, infer_encoding, + self, CaseDetails, DecodedRecord, RawCases, RawDatum, RawWidth, Reader, infer_encoding, records::{ Compression, DocumentRecord, EncodingRecord, Extension, FileAttributesRecord, FileHeader, FloatInfoRecord, IntegerInfoRecord, LongName, LongNamesRecord, @@ -1728,7 +1728,7 @@ impl Debug for Cases { } impl Iterator for Cases { - type Item = Result>>, raw::Error>; + type Item = Result>>, raw::Error>; fn next(&mut self) -> Option { self.inner.next().map(|result| { diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index d9f770ba6a..00779f054f 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -48,6 +48,7 @@ use crate::{ }; use binrw::Endian; +use displaydoc::Display; use encoding_rs::{ BIG5, EUC_JP, EUC_KR, Encoding, GB18030, IBM866, ISO_2022_JP, ISO_8859_2, ISO_8859_3,
ISO_8859_4, ISO_8859_5, ISO_8859_6, ISO_8859_7, ISO_8859_8, ISO_8859_10, ISO_8859_13, @@ -79,24 +80,27 @@ pub mod records; /// /// Any error prevents reading further data from the system file. #[derive(Debug)] -pub struct Error { +pub struct Error { /// Range of file offsets where the error occurred. pub offsets: Option>, /// Details of the error. - pub details: ErrorDetails, + pub details: D, } -impl std::error::Error for Error {} +impl std::error::Error for Error where D: Debug + Display {} -impl Error { +impl Error { /// Constructs an error from `offsets` and `details`. - pub fn new(offsets: Option>, details: ErrorDetails) -> Self { + pub fn new(offsets: Option>, details: D) -> Self { Self { offsets, details } } } -impl Display for Error { +impl Display for Error +where + D: Display, +{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { if let Some(offsets) = &self.offsets && !offsets.is_empty() @@ -115,7 +119,10 @@ impl Display for Error { } } -impl From for Error { +impl From for Error +where + D: From, +{ fn from(value: IoError) -> Self { Self::new(None, value.into()) } @@ -225,33 +232,6 @@ pub enum ErrorDetails { count: u32, }, - /// Unexpected end of file {case_ofs} bytes into a {case_len}-byte case. - #[error( - "Unexpected end of file {case_ofs} bytes into case {case_number} with expected length {case_len} bytes." - )] - EofInCase { - /// Offset into case in bytes. - case_ofs: u64, - /// Expected case length in bytes. - case_len: usize, - /// 1-based case number in file. - case_number: u64, - }, - - /// Unexpected end of file {case_ofs} bytes and {n_chunks} compression - /// chunks into a compressed case. - #[error( - "Unexpected end of file {case_ofs} bytes and {n_chunks} compression chunks into compressed case {case_number}." - )] - EofInCompressedCase { - /// Offset into case in bytes. - case_ofs: u64, - /// Number of compression codes consumed. - n_chunks: usize, - /// 1-based case number in file. 
- case_number: u64, - }, - /// Error reading a [ZHeader]. #[error("Error reading ZLIB header: {0}")] ZHeader(#[from] ZHeaderError), @@ -260,15 +240,6 @@ pub enum ErrorDetails { #[error("Error reading ZLIB trailer: {0}")] ZTrailer(#[from] ZTrailerError), - /// File metadata says it contains {expected} cases, but {actual} cases were read. - #[error("File metadata says it contains {expected} cases, but {actual} cases were read.")] - WrongNumberOfCases { - /// Expected number of cases. - expected: u64, - /// Actual number of cases. - actual: u64, - }, - /// Encoding error. #[error(transparent)] EncodingError( @@ -747,7 +718,7 @@ impl Record { endian: Endian, var_types: &VarTypes, warn: &mut dyn FnMut(Warning), - ) -> Result, Error> + ) -> Result, Error> where R: BufRead + Seek, { @@ -829,7 +800,7 @@ impl Record { pub fn infer_encoding( records: &[Record], mut warn: impl FnMut(Warning), -) -> Result<&'static Encoding, Error> { +) -> Result<&'static Encoding, Error> { let (encoding, character_code) = get_encoding_info(records); match get_encoding(encoding, character_code) { Ok(encoding) => Ok(encoding), @@ -889,7 +860,10 @@ impl<'de> Decoder<'de> { /// EBCDIC encoding, since this crate only supports ASCII-based encodings. /// /// `warn` will be used to report warnings while decoding records. - pub fn with_inferred_encoding(records: &[Record], mut warn: F) -> Result + pub fn with_inferred_encoding( + records: &[Record], + mut warn: F, + ) -> Result> where F: FnMut(Warning) + 'de, { @@ -1098,26 +1072,111 @@ impl RawDatum { } } +/// The meaning of a compression opcode byte. +/// +/// This abstraction exists because SPSS and SPSS/PC+ system files have similar +/// compression structures but their opcodes are slightly different. +pub enum CompressionAction { + /// Ignored. + NoOp, + + /// A compressed integer. + CompressedInt( + /// The compressed integer. + f64, + ), + + /// End of file. + Eof, + + /// Literal 8-byte value follows the block of opcodes. 
+ Literal, + + /// Represents a group of 8 spaces. + Spaces, + + /// Represents the system-missing value. + Sysmis, +} + +impl CompressionAction { + /// Interprets an SPSS system file compression opcode. + fn from_sysfile(code: u8, bias: f64) -> Self { + match code { + 0 => Self::NoOp, + 252 => Self::Eof, + 253 => Self::Literal, + 254 => Self::Spaces, + 255 => Self::Sysmis, + _ => Self::CompressedInt(code as f64 - bias), + } + } +} + +/// An error reading a case from a system file. +/// +/// Used for SPSS system files and SPSS/PC+ system files. +#[derive(ThisError, Display, Debug)] +pub enum CaseDetails { + /// Unexpected end of file {case_ofs} bytes into case {case_number} with expected length {case_len} bytes. + EofInCase { + /// Offset into case in bytes. + case_ofs: u64, + /// Expected case length in bytes. + case_len: usize, + /// 1-based case number in file. + case_number: u64, + }, + + /// Unexpected end of file {case_ofs} bytes and {n_chunks} compression chunks into compressed case {case_number}. + EofInCompressedCase { + /// Offset into case in bytes. + case_ofs: u64, + /// Number of compression codes consumed. + n_chunks: usize, + /// 1-based case number in file. + case_number: u64, + }, + + /// File metadata says it contains {expected} cases, but {actual} cases were read. + WrongNumberOfCases { + /// Expected number of cases. + expected: u64, + /// Actual number of cases. + actual: u64, + }, + + /// I/O error ({0}) + Io(#[from] IoError), +} + impl Datum { - fn read_case( + /// Reads an uncompressed case with variables `case_vars` from `reader`, + /// with numbers in the given `endian`. + /// + /// `case_number` is used in error messages. 
+ pub fn read_case( reader: &mut R, case_number: u64, case_vars: &[CaseVar], endian: Endian, - ) -> Result, Error> { + ) -> Result, Error> + where + R: Read + Seek, + { fn eof( reader: &mut R, case_number: u64, case_vars: &[CaseVar], case_start: u64, - ) -> Result, Error> { + ) -> Result, Error> { let offset = reader.stream_position()?; if offset == case_start { Ok(None) } else { Err(Error::new( Some(case_start..offset), - ErrorDetails::EofInCase { + CaseDetails::EofInCase { case_ofs: offset - case_start, case_len: case_vars.iter().map(CaseVar::bytes).sum(), case_number, @@ -1156,20 +1215,26 @@ impl Datum { Ok(Some(RawCase(values))) } - fn read_compressed_chunk( + fn read_compressed_chunk( reader: &mut R, codes: &mut VecDeque, + decode_compression_action: F, endian: Endian, - bias: f64, - ) -> Result, Error> { + ) -> Result, Error> + where + F: Fn(u8) -> CompressionAction, + R: Read, + { loop { - match codes.pop_front() { - Some(0) => (), - Some(252) => return Ok(None), - Some(253) => return Ok(Some(read_bytes(reader)?)), - Some(254) => return Ok(Some([b' '; 8])), - Some(255) => return Ok(Some(endian.to_bytes(-f64::MAX))), - Some(code) => return Ok(Some(endian.to_bytes(code as f64 - bias))), + match codes.pop_front().map(&decode_compression_action) { + Some(CompressionAction::NoOp) => (), + Some(CompressionAction::Eof) => return Ok(None), + Some(CompressionAction::Literal) => return Ok(Some(read_bytes(reader)?)), + Some(CompressionAction::Spaces) => return Ok(Some([b' '; 8])), + Some(CompressionAction::Sysmis) => return Ok(Some(endian.to_bytes(-f64::MAX))), + Some(CompressionAction::CompressedInt(value)) => { + return Ok(Some(endian.to_bytes(value))); + } None => { match try_read_bytes::<8, _>(reader)? { Some(new_codes) => codes.extend(new_codes), @@ -1179,25 +1244,37 @@ impl Datum { }; } } - fn read_compressed_case( + + /// Reads a compressed case with variables `case_vars` from `reader`, with + /// numbers in the given `endian`.
+ /// + /// `codes` is used for compression codes, which are interpreted using + /// `decode_compression_action`. + /// + /// `case_number` is used in error messages. + pub fn read_compressed_case( reader: &mut R, case_number: u64, case_vars: &[CaseVar], codes: &mut VecDeque, + decode_compression_action: F, endian: Endian, - bias: f64, - ) -> Result, Error> { + ) -> Result, Error> + where + R: Read + Seek, + F: Fn(u8) -> CompressionAction, + { fn eof( reader: &mut R, case_number: u64, case_start: u64, n_chunks: usize, - ) -> Result, Error> { + ) -> Result, Error> { let offset = reader.stream_position()?; if n_chunks > 0 { Err(Error::new( Some(case_start..offset), - ErrorDetails::EofInCompressedCase { + CaseDetails::EofInCompressedCase { case_ofs: offset - case_start, n_chunks, case_number, @@ -1214,7 +1291,12 @@ impl Datum { for var in case_vars { match var { CaseVar::Numeric => { - let Some(raw) = Self::read_compressed_chunk(reader, codes, endian, bias)? + let Some(raw) = Self::read_compressed_chunk( + reader, + codes, + &decode_compression_action, + endian, + )? else { return eof(reader, case_number, case_start, n_chunks); }; @@ -1227,8 +1309,12 @@ impl Datum { let mut data_bytes = segment.data_bytes; let mut padding_bytes = segment.padding_bytes; while data_bytes > 0 || padding_bytes > 0 { - let Some(raw) = - Self::read_compressed_chunk(reader, codes, endian, bias)? + let Some(raw) = Self::read_compressed_chunk( + reader, + codes, + &decode_compression_action, + endian, + )? else { return eof(reader, case_number, case_start, n_chunks); }; @@ -1329,7 +1415,10 @@ where /// /// To read an encrypted system file, wrap `reader` in /// [EncryptedReader](crate::crypto::EncryptedReader). 
- pub fn new(mut reader: R, mut warn: impl FnMut(Warning) + 'a) -> Result { + pub fn new( + mut reader: R, + mut warn: impl FnMut(Warning) + 'a, + ) -> Result> { let header = FileHeader::read(&mut reader, &mut warn)?; Ok(Self { reader: Some(reader), @@ -1451,7 +1540,7 @@ impl<'a, 'b, R> Iterator for Records<'a, 'b, R> where R: BufRead + Seek + 'static, { - type Item = Result; + type Item = Result>; fn next(&mut self) -> Option { self.next_inner().inspect(|retval| { @@ -1465,9 +1554,17 @@ where trait ReadSeek: Read + Seek {} impl ReadSeek for T where T: Read + Seek {} +/// Part of a string variable for reading data from a system file. +/// +/// A string variable in a system file is usually just the string itself +/// followed by padding out to a multiple of 8 bytes. Very long strings (longer +/// than 255 bytes) consist of multiple segments. #[derive(Debug)] -struct StringSegment { +pub struct StringSegment { + /// Number of bytes of string data. data_bytes: usize, + + /// Number of bytes to ignore following the string data. padding_bytes: usize, } @@ -1482,16 +1579,27 @@ fn segment_widths(width: usize) -> impl Iterator { .map(|w| w.next_multiple_of(8)) } -enum CaseVar { +/// Format for reading a variable in a system file. +#[derive(Debug)] +pub enum CaseVar { + /// A numeric variable, represented in the system file as an `f64`. Numeric, + + /// A string variable. String { + /// Total number of bytes (sum of `data_bytes` across the `encoding`). width: usize, + + /// How the string variable is represented in the file. + /// + /// Widths 255 or less have a single [StringSegment]; wider variables + /// have multiple. 
encoding: SmallVec<[StringSegment; 1]>, }, } -impl CaseVar { - fn new(width: VarWidth) -> Self { +impl From for CaseVar { + fn from(width: VarWidth) -> Self { match width { VarWidth::Numeric => Self::Numeric, VarWidth::String(width) => { @@ -1515,7 +1623,9 @@ impl CaseVar { } } } +} +impl CaseVar { fn bytes(&self) -> usize { match self { CaseVar::Numeric => 8, @@ -1595,7 +1705,7 @@ impl RawCases { .iter() .flatten() .copied() - .map(CaseVar::new) + .map_into() .collect::>(), compression: header.compression, bias: header.bias, @@ -1613,7 +1723,7 @@ impl RawCases { /// very long string variables (see [RawCases] for details). pub fn with_widths(self, widths: impl IntoIterator) -> Self { Self { - case_vars: widths.into_iter().map(CaseVar::new).collect::>(), + case_vars: widths.into_iter().map_into().collect::>(), ..self } } @@ -1629,7 +1739,7 @@ impl RawCases { } impl Iterator for RawCases { - type Item = Result; + type Item = Result>; fn next(&mut self) -> Option { if self.eof { @@ -1644,8 +1754,8 @@ impl Iterator for RawCases { self.read_cases + 1, &self.case_vars, &mut self.codes, + |code| CompressionAction::from_sysfile(code, self.bias), self.endian, - self.bias, ) .transpose() } else { @@ -1665,7 +1775,7 @@ impl Iterator for RawCases { { return Some(Err(Error::new( None, - ErrorDetails::WrongNumberOfCases { + CaseDetails::WrongNumberOfCases { expected: expected_cases, actual: self.read_cases, }, @@ -2152,15 +2262,15 @@ impl EncodingReportString { impl EncodingReport { /// Constructs an encoding report from `reader`, reading no more than /// `max_cases` from it. 
- pub fn new(mut reader: Reader, max_cases: u64) -> Result + pub fn new(mut reader: Reader, max_cases: u64) -> anyhow::Result where R: BufRead + Seek + 'static, { fn inner( header: FileHeader, records: &[Record], - cases: impl Iterator>, - ) -> Result { + cases: impl Iterator>>, + ) -> Result> { let (encoding, codepage) = get_encoding_info(records); let label = encoding .map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None))); @@ -2277,7 +2387,11 @@ impl EncodingReport { let records: Vec = reader.records().collect::, _>>()?; let header = reader.header().clone(); - inner(header, &records, reader.cases().take(max_cases as usize)) + Ok(inner( + header, + &records, + reader.cases().take(max_cases as usize), + )?) } } diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index 592d29b54f..1bc79ced89 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -146,7 +146,7 @@ pub struct RawHeader { impl FileHeader { /// Reads a header record from `r`, reporting any warnings via `warn`. - pub fn read(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result + pub fn read(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result> where R: Read + Seek, { @@ -384,7 +384,7 @@ impl RawMissingValues { code: i32, endian: Endian, warn: &mut dyn FnMut(Warning), - ) -> Result + ) -> Result> where R: Read + Seek, { @@ -558,7 +558,11 @@ pub struct RawVariableRecord { impl VariableRecord { /// Reads a variable record from `r`. 
- pub fn read(r: &mut R, endian: Endian, warn: &mut dyn FnMut(Warning)) -> Result + pub fn read( + r: &mut R, + endian: Endian, + warn: &mut dyn FnMut(Warning), + ) -> Result> where R: Read + Seek, { @@ -720,7 +724,7 @@ impl ValueLabelRecord { endian: Endian, var_types: &VarTypes, warn: &mut dyn FnMut(Warning), - ) -> Result, Error> { + ) -> Result, Error> { let label_offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); if n > Self::MAX_LABELS { @@ -885,7 +889,7 @@ impl DocumentRecord { pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN; /// Reads a document record from `r`. - pub fn read(r: &mut R, endian: Endian) -> Result + pub fn read(r: &mut R, endian: Endian) -> Result> where R: Read + Seek, { @@ -2300,7 +2304,7 @@ impl Extension { endian: Endian, var_types: &VarTypes, warn: &mut dyn FnMut(Warning), - ) -> Result, Error> { + ) -> Result, Error> { let subtype = endian.parse(read_bytes(r)?); let header_offset = r.stream_position()?; let size: u32 = endian.parse(read_bytes(r)?); @@ -2489,7 +2493,7 @@ pub struct RawZHeader { impl ZHeader { /// Reads a ZLIB header from `r` using `endian`. - pub fn read(r: &mut R, endian: Endian) -> Result + pub fn read(r: &mut R, endian: Endian) -> Result> where R: Read + Seek, { @@ -2776,7 +2780,7 @@ impl ZTrailer { bias: f64, zheader: &RawZHeader, warn: &mut dyn FnMut(Warning), - ) -> Result, Error> + ) -> Result, Error> where R: Read + Seek, {