From: Ben Pfaff Date: Fri, 18 Jul 2025 16:10:56 +0000 (-0700) Subject: rust: Major additions. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=20626a75474660269ca7facf4305946c87b556a2;p=pspp rust: Major additions. This adds support for writing system files to the Rust pspp library and to the `convert` command in the Rust CLI. It also renames the `dissect` command in the CLI to `show` and adds numerous features to it. This makes many bug fixes across the Rust pspp library and changes the library internals in several ways. --- diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 52e8544160..00e1ff6e1a 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -330,6 +330,7 @@ dependencies = [ "iana-time-zone", "js-sys", "num-traits", + "serde", "wasm-bindgen", "windows-link", ] @@ -582,6 +583,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ "cfg-if", + "serde", ] [[package]] @@ -611,6 +613,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9" dependencies = [ "enum-map-derive", + "serde", ] [[package]] @@ -663,12 +666,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "finl_unicode" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94c970b525906eb37d3940083aa65b95e481fc1857d467d13374e1d925cfc163" - [[package]] name = "flagset" version = "0.4.7" @@ -1089,6 +1086,7 @@ checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", "hashbrown 0.15.3", + "serde", ] [[package]] @@ -1199,9 +1197,9 @@ checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" [[package]] name = "libz-rs-sys" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6489ca9bd760fe9642d7644e827b0c9add07df89857b0416ee15c1cc1a3b8c5a" +checksum = "172a788537a2221661b480fee8dc5f96c580eb34fa88764d3205dc356c7e4221" dependencies = [ "zlib-rs", ] @@ -1599,7 +1597,7 @@ dependencies = [ [[package]] name = "pspp" -version = "0.1.0" +version = "0.2.0" dependencies = [ "aes", "anyhow", @@ -1618,7 +1616,6 @@ dependencies = [ "encoding_rs", "enum-iterator", "enum-map", - "finl_unicode", "flagset", "flate2", "hexplay", @@ -1636,11 +1633,15 @@ dependencies = [ "rand", "readpass", "serde", + "serde_json", "smallstr", "smallvec", "thiserror", + "toml 0.9.5", "unicase", "unicode-linebreak", + "unicode-properties", + "unicode-segmentation", "unicode-width", "windows-sys 0.48.0", "xmlwriter", @@ -1836,9 +1837,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +checksum = "30b9eff21ebe718216c6ec64e1d9ac57087aad11efc64e32002bce4a0d4c03d3" dependencies = [ "itoa", "memchr", @@ -1866,6 +1867,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_spanned" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40734c41988f7306bb04f0ecf60ec0f3f1caa34290e4e8ea471dcd3346483b83" +dependencies = [ + "serde", +] + [[package]] name = "sha1" version = "0.10.6" @@ -1992,7 +2002,7 @@ dependencies = [ "cfg-expr", "heck", "pkg-config", - "toml", + "toml 0.8.22", "version-compare", ] @@ -2119,11 +2129,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05ae329d1f08c4d17a59bed7ff5b5a769d062e64a62d34a3261b219e62cd5aae" dependencies = [ "serde", - "serde_spanned", - "toml_datetime", + "serde_spanned 0.6.8", + "toml_datetime 0.6.9", "toml_edit", ] +[[package]] +name = "toml" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"75129e1dc5000bfbaa9fee9d1b21f974f9fbad9daec557a521ee6e080825f6e8" +dependencies = [ + "indexmap", + "serde", + "serde_spanned 1.0.0", + "toml_datetime 0.7.0", + "toml_parser", + "toml_writer", + "winnow", +] + [[package]] name = "toml_datetime" version = "0.6.9" @@ -2133,6 +2158,15 @@ dependencies = [ "serde", ] +[[package]] +name = "toml_datetime" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bade1c3e902f58d73d3f294cd7f20391c1cb2fbcb643b73566bc773971df91e3" +dependencies = [ + "serde", +] + [[package]] name = "toml_edit" version = "0.22.26" @@ -2141,11 +2175,26 @@ checksum = "310068873db2c5b3e7659d2cc35d21855dbafa50d1ce336397c666e3cb08137e" dependencies = [ "indexmap", "serde", - "serde_spanned", - "toml_datetime", + "serde_spanned 0.6.8", + "toml_datetime 0.6.9", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b551886f449aa90d4fe2bdaa9f4a2577ad2dde302c61ecf262d80b116db95c10" +dependencies = [ "winnow", ] +[[package]] +name = "toml_writer" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc842091f2def52017664b53082ecbbeb5c7731092bad69d2c63050401dfd64" + [[package]] name = "tower" version = "0.4.13" @@ -2261,6 +2310,18 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" +[[package]] +name = "unicode-properties" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.2.0" @@ -2869,9 +2930,9 @@ 
dependencies = [ [[package]] name = "zlib-rs" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "868b928d7949e09af2f6086dfc1e01936064cc7a819253bce650d4e2a2d63ba8" +checksum = "626bd9fa9734751fc50d6060752170984d7053f5a39061f524cda68023d4db8a" [[package]] name = "zopfli" diff --git a/rust/pspp-lsp/Cargo.toml b/rust/pspp-lsp/Cargo.toml index ea911f4e07..f7552707db 100644 --- a/rust/pspp-lsp/Cargo.toml +++ b/rust/pspp-lsp/Cargo.toml @@ -7,6 +7,6 @@ publish = false [dependencies] env_logger = "0.11.5" log = "0.4.22" -pspp = { version = "0.1.0", path = "../pspp" } +pspp = { version = "0.2.0", path = "../pspp" } tokio = { version = "1.39.3", features = ["full"] } tower-lsp = "0.20.0" diff --git a/rust/pspp/Cargo.toml b/rust/pspp/Cargo.toml index 3ad04a1296..31b1d743b3 100644 --- a/rust/pspp/Cargo.toml +++ b/rust/pspp/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pspp" -version = "0.1.0" +version = "0.2.0" edition = "2024" authors = [ "Ben Pfaff", "John Darrington" ] description = "Statistical analysis software" @@ -9,20 +9,20 @@ license = "GPL-3.0-or-later" [dependencies] anyhow = "1.0.69" clap = { version = "4.1.7", features = ["derive", "wrap_help"] } -encoding_rs = "0.8.32" +encoding_rs = { version = "0.8.32", features = ["serde"] } flate2 = "1.0.26" hexplay = "0.2.1" num = "0.4.0" ordered-float = "3.7.0" thiserror = "1.0" -chrono = "0.4.40" +chrono = { version = "0.4.40", features = ["serde"] } unicase = "2.6.0" libc = "0.2.147" -indexmap = "2.1.0" +indexmap = { version = "2.1.0", features = ["serde"] } bitflags = "2.5.0" unicode-width = "0.2.0" chardetng = "0.1.17" -enum-map = "2.7.3" +enum-map = { version = "2.7.3", features = ["serde"] } flagset = "0.4.6" pspp-derive = { version = "0.1.0", path = "../pspp-derive" } either = "1.13.0" @@ -33,7 +33,7 @@ smallstr = "0.3.0" itertools = "0.14.0" unicode-linebreak = "0.1.5" quick-xml = { version = "0.37.2", features = ["serialize"] } -serde = { version = 
"1.0.218", features = ["derive"] } +serde = { version = "1.0.218", features = ["derive", "rc"] } color = { version = "0.2.3", features = ["serde"] } binrw = "0.14.1" ndarray = "0.16.1" @@ -49,6 +49,9 @@ aes = "0.8.4" readpass = "1.0.3" zeroize = "1.8.1" unicode-properties = "0.1.3" +unicode-segmentation = "1.12.0" +serde_json = "1.0.141" +toml = "0.9.5" [target.'cfg(windows)'.dependencies] windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] } diff --git a/rust/pspp/build.rs b/rust/pspp/build.rs index 0535380377..13e9534d16 100644 --- a/rust/pspp/build.rs +++ b/rust/pspp/build.rs @@ -188,6 +188,20 @@ fn main() -> AnyResult<()> { } process_converter(&converter, &mut codepages); + for (codepage, source, name) in [ + (20932, Source::Codepage, "EUC-JP"), + (50220, Source::Codepage, "ISO-2022-JP"), + (28600, Source::Windows, "ISO-8859-10"), + (28604, Source::Windows, "ISO-8859-14"), + (28606, Source::Windows, "ISO-8859-16"), + (99998, Source::Codepage, "replacement"), + (99999, Source::Codepage, "x-user-defined"), + ] { + assert!(codepages + .insert(codepage, [(source, vec![name])].into_iter().collect()) + .is_none()); + } + let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs"); write_output(&codepages, &output_file_name) diff --git a/rust/pspp/src/crypto/mod.rs b/rust/pspp/src/crypto/mod.rs index 8401ef7b72..8685f938af 100644 --- a/rust/pspp/src/crypto/mod.rs +++ b/rust/pspp/src/crypto/mod.rs @@ -588,9 +588,7 @@ mod test { use crate::crypto::{EncodedPassword, EncryptedFile, FileType}; fn test_decrypt(input_name: &Path, expected_name: &Path, password: &str, file_type: FileType) { - let input_filename = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("src/crypto/testdata") - .join(input_name); + let input_filename = Path::new("src/crypto/testdata").join(input_name); let input = std::fs::read(&input_filename).unwrap(); let mut cursor = Cursor::new(&input); let file = EncryptedFile::new(&mut cursor).unwrap(); @@ -600,9 +598,7 @@ 
mod test { let mut actual = Vec::new(); std::io::copy(&mut reader, &mut actual).unwrap(); - let expected_filename = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("src/crypto/testdata") - .join(expected_name); + let expected_filename = Path::new("src/crypto/testdata").join(expected_name); let expected = std::fs::read(&expected_filename).unwrap(); if actual != expected { panic!(); diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index b6fa22bfa5..780e0e1c98 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -21,195 +21,355 @@ //! associated [Variable]. (All the variables in a [Dictionary] have the same //! character encoding.) //! -//! [Variable]: crate::dictionary::Variable +//! [Variable]: crate::variable::Variable //! [Dictionary]: crate::dictionary::Dictionary // Warn about missing docs, but not for items declared with `#[cfg(test)]`. -#![cfg_attr(not(test), warn(missing_docs))] +//#![cfg_attr(not(test), warn(missing_docs))] use std::{ - borrow::{Borrow, Cow}, + borrow::{Borrow, BorrowMut, Cow}, cmp::Ordering, fmt::{Debug, Display, Formatter}, hash::Hash, - ops::Deref, str::from_utf8, }; use encoding_rs::{mem::decode_latin1, Encoding, UTF_8}; +use itertools::Itertools; use ordered_float::OrderedFloat; +use serde::{ + ser::{SerializeSeq, SerializeTupleVariant}, + Serialize, +}; -use crate::dictionary::{VarType, VarWidth}; +use crate::{ + format::DisplayPlain, + variable::{VarType, VarWidth}, +}; -/// An owned string in an unspecified character encoding. -/// -/// A [RawString] is usually associated with a [Variable] and uses the -/// variable's character encoding. We assume that the encoding is one supported -/// by [encoding_rs] with byte units (that is, not a `UTF-16` encoding). All of -/// these encodings have some basic ASCII compatibility. -/// -/// A [RawString] owns its contents and can grow and shrink, like a [Vec] or -/// [String]. For a borrowed raw string, see [RawStr]. 
-/// -/// [Variable]: crate::dictionary::Variable -#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)] -pub struct RawString(pub Vec); +pub trait RawString: Debug + PartialEq + Eq + PartialOrd + Ord + Hash { + fn raw_string_bytes(&self) -> &[u8]; -impl RawString { - /// Creates a new [RawString] that consists of `n` ASCII spaces. - pub fn spaces(n: usize) -> Self { - Self(std::iter::repeat_n(b' ', n).collect()) + /// Compares this string and `other` for equality, ignoring trailing ASCII + /// spaces in either string for the purpose of comparison. (This is + /// acceptable because we assume that the encoding is ASCII-compatible.) + /// + /// This compares the bytes of the strings, disregarding their encodings (if + /// known). + fn eq_ignore_trailing_spaces(&self, other: &R) -> bool + where + R: RawString, + { + self.raw_string_bytes() + .iter() + .copied() + .zip_longest(other.raw_string_bytes().iter().copied()) + .all(|elem| { + let (left, right) = elem.or(b' ', b' '); + left == right + }) } - /// Creates an [EncodedStr] with `encoding` that borrows this string's - /// contents. - pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { - EncodedStr::new(&self.0, encoding) + /// Returns true if this raw string can be resized to `len` bytes without + /// dropping non-space characters. + fn is_resizable(&self, new_len: usize) -> bool { + new_len >= self.len() + || self.raw_string_bytes()[new_len..] + .iter() + .copied() + .all(|b| b == b' ') } - /// Extends or shortens this [RawString] to exactly `len` bytes. If the - /// string needs to be extended, does so by appending spaces. - /// - /// If this shortens the string, it can cut off a multibyte character in the - /// middle. - pub fn resize(&mut self, len: usize) { - self.0.resize(len, b' '); + fn is_empty(&self) -> bool { + self.raw_string_bytes().is_empty() } - /// Removes any trailing ASCII spaces. 
- pub fn trim_end(&mut self) { - while self.0.pop_if(|c| *c == b' ').is_some() {} + fn len(&self) -> usize { + self.raw_string_bytes().len() } -} -impl Borrow for RawString { - fn borrow(&self) -> &RawStr { - RawStr::from_bytes(&self.0) + fn as_ref(&self) -> ByteStr<'_> { + ByteStr(self.raw_string_bytes()) + } + + fn without_trailing_spaces(&self) -> ByteStr<'_> { + let mut raw = self.raw_string_bytes(); + while let Some(trimmed) = raw.strip_suffix(b" ") { + raw = trimmed; + } + ByteStr(raw) + } + + fn as_encoded(&self, encoding: &'static Encoding) -> WithEncoding> + where + Self: Sized, + { + WithEncoding::new(self.as_ref(), encoding) + } + + fn with_encoding(self, encoding: &'static Encoding) -> WithEncoding + where + Self: Sized, + { + WithEncoding::new(self, encoding) } } -impl Deref for RawString { - type Target = RawStr; +pub trait MutRawString: RawString { + fn resize(&mut self, new_len: usize) -> Result<(), ResizeError>; + fn trim_end(&mut self); +} - fn deref(&self) -> &Self::Target { - self.borrow() +impl RawString for &'_ str { + fn raw_string_bytes(&self) -> &[u8] { + self.as_bytes() } } -impl From> for RawString { - fn from(value: Cow<'_, [u8]>) -> Self { - Self(value.into_owned()) +impl RawString for String { + fn raw_string_bytes(&self) -> &[u8] { + self.as_bytes() } } -impl From> for RawString { - fn from(source: Vec) -> Self { - Self(source) +impl RawString for &'_ String { + fn raw_string_bytes(&self) -> &[u8] { + self.as_bytes() } } -impl From<&[u8]> for RawString { - fn from(source: &[u8]) -> Self { - Self(source.into()) +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ByteStr<'a>(pub &'a [u8]); + +impl RawString for ByteStr<'_> { + fn raw_string_bytes(&self) -> &[u8] { + self.0 } } -impl Debug for RawString { +impl Serialize for ByteStr<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if let Ok(s) = str::from_utf8(self.0) { + let (variant_index, variant) = if 
self.0.iter().all(|b| b.is_ascii()) { + (0, "Ascii") + } else { + (1, "Utf8") + }; + let mut tuple = + serializer.serialize_tuple_variant("RawString", variant_index, variant, 1)?; + tuple.serialize_field(s)?; + tuple.end() + } else { + let mut tuple = serializer.serialize_tuple_variant("RawString", 2, "Windows1252", 1)?; + tuple.serialize_field(&decode_latin1(self.0))?; + tuple.end() + } + } +} + +impl Debug for ByteStr<'_> { + // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 + // (actually bytes interpreted as Unicode code points). fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "{:?}", *self) + let s = from_utf8(&self.0).map_or_else(|_| decode_latin1(self.0), Cow::from); + write!(f, "{s:?}") } } -/// A borrowed string in an unspecified encoding. -/// -/// A [RawString] is usually associated with a [Variable] and uses the -/// variable's character encoding. We assume that the encoding is one supported -/// by [encoding_rs] with byte units (that is, not a `UTF-16` encoding). All of -/// these encodings have some basic ASCII compatibility. -/// -/// For an owned raw string, see [RawString]. -/// -/// [Variable]: crate::dictionary::Variable -#[repr(transparent)] -#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct RawStr(pub [u8]); - -impl RawStr { - /// Creates a new [RawStr] that contains `bytes`. - pub fn from_bytes(bytes: &[u8]) -> &Self { - // SAFETY: `RawStr` is a transparent wrapper around `[u8]`, so we can - // turn a reference to the wrapped type into a reference to the wrapper - // type. - unsafe { &*(bytes as *const [u8] as *const Self) } - } - - /// Returns the raw string's contents as a borrowed byte slice. 
- pub fn as_bytes(&self) -> &[u8] { - &self.0 +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ByteCow<'a>(pub Cow<'a, [u8]>); + +impl ByteCow<'_> { + pub fn into_owned(self) -> ByteString { + ByteString(self.0.into_owned()) } +} - /// Returns an object that implements [Display] for printing this [RawStr], - /// given that it is encoded in `encoding`. - pub fn display(&self, encoding: &'static Encoding) -> DisplayRawString { - DisplayRawString(encoding.decode_without_bom_handling(&self.0).0) +impl Serialize for ByteCow<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + ByteStr(&self.0).serialize(serializer) } +} - /// Interprets the raw string's contents as the specified `encoding` and - /// returns it decoded into UTF-8, replacing any malformed sequences by - /// [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn decode(&self, encoding: &'static Encoding) -> Cow<'_, str> { - encoding.decode_without_bom_handling(&self.0).0 +impl RawString for ByteCow<'_> { + fn raw_string_bytes(&self) -> &[u8] { + &self.0 } +} - /// Compares this string and `other` for equality, ignoring trailing ASCII - /// spaces in either string for the purpose of comparison. (This is - /// acceptable because we assume that the encoding is ASCII-compatible.) - pub fn eq_ignore_trailing_spaces(&self, other: &RawStr) -> bool { - let mut this = self.0.iter(); - let mut other = other.0.iter(); - loop { - match (this.next(), other.next()) { - (Some(a), Some(b)) if a == b => (), - (Some(_), Some(_)) => return false, - (None, None) => return true, - (Some(b' '), None) => return this.all(|c| *c == b' '), - (None, Some(b' ')) => return other.all(|c| *c == b' '), - (Some(_), None) | (None, Some(_)) => return false, - } - } +impl Debug for ByteCow<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + ByteStr(&self.0).fmt(f) } +} - /// Returns the string's length in bytes. 
- pub fn len(&self) -> usize { - self.0.len() +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ByteStrArray(pub [u8; N]); + +impl Serialize for ByteStrArray { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + ByteStr(&self.0).serialize(serializer) } } -/// Helper struct for printing [RawStr] with [format!]. -/// -/// Created by [RawStr::display]. -pub struct DisplayRawString<'a>(Cow<'a, str>); +impl RawString for ByteStrArray { + fn raw_string_bytes(&self) -> &[u8] { + &self.0 + } +} -impl<'a> Display for DisplayRawString<'a> { - // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 - // (actually bytes interpreted as Unicode code points). +impl Debug for ByteStrArray { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", &self.0) + ByteStr(&self.0).fmt(f) + } +} + +#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ByteString(pub Vec); + +impl ByteString { + /// Creates a new [ByteString] that consists of `n` ASCII spaces. 
+ pub fn spaces(n: usize) -> Self { + Self(std::iter::repeat_n(b' ', n).collect()) + } +} + +impl From for ByteString { + fn from(value: String) -> Self { + value.into_bytes().into() + } +} + +impl From<&'_ str> for ByteString { + fn from(value: &str) -> Self { + value.as_bytes().into() + } +} + +impl From> for ByteString { + fn from(value: Cow<'_, str>) -> Self { + value.into_owned().into() + } +} + +impl From> for ByteString { + fn from(value: Cow<'_, [u8]>) -> Self { + value.into_owned().into() + } +} + +impl From> for ByteString { + fn from(value: Vec) -> Self { + Self(value) + } +} + +impl From<&[u8]> for ByteString { + fn from(value: &[u8]) -> Self { + Self(value.into()) + } +} + +impl From<&ByteString> for ByteString { + fn from(value: &ByteString) -> Self { + value.clone() + } +} + +impl From<&ByteStrArray> for ByteString { + fn from(value: &ByteStrArray) -> Self { + Self::from(value.raw_string_bytes()) + } +} + +impl From<[u8; N]> for ByteString { + fn from(value: [u8; N]) -> Self { + value.as_slice().into() + } +} + +impl RawString for ByteString { + fn raw_string_bytes(&self) -> &[u8] { + self.0.as_slice() + } +} + +impl Serialize for ByteString { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if let Ok(s) = str::from_utf8(&self.0) { + let (variant_index, variant) = if self.0.iter().all(|b| b.is_ascii()) { + (0, "Ascii") + } else { + (1, "Utf8") + }; + let mut tuple = + serializer.serialize_tuple_variant("RawString", variant_index, variant, 1)?; + tuple.serialize_field(s)?; + tuple.end() + } else { + let mut tuple = serializer.serialize_tuple_variant("RawString", 2, "Windows1252", 1)?; + tuple.serialize_field(&decode_latin1(&self.0))?; + tuple.end() + } } } -impl Debug for RawStr { +impl Debug for ByteString { + // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 + // (actually bytes interpreted as Unicode code points). 
fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let s = from_utf8(&self.0).map_or_else(|_| decode_latin1(&self.0), Cow::from); + let s = + from_utf8(&self.0.borrow()).map_or_else(|_| decode_latin1(self.0.borrow()), Cow::from); write!(f, "{s:?}") } } -/// The value of a [Variable](crate::dictionary::Variable). +impl MutRawString for ByteString { + fn resize(&mut self, new_len: usize) -> Result<(), ResizeError> { + match new_len.cmp(&self.0.len()) { + Ordering::Less => { + if !self.0[new_len..].iter().all(|b| *b == b' ') { + return Err(ResizeError::TooWide); + } + self.0.truncate(new_len); + } + Ordering::Equal => (), + Ordering::Greater => self.0.resize(new_len, b' '), + } + Ok(()) + } + + /// Removes any trailing ASCII spaces. + fn trim_end(&mut self) { + while self.0.pop_if(|c| *c == b' ').is_some() {} + } +} + +mod encoded; +pub use encoded::{Encoded, EncodedString, WithEncoding}; + +/// A [Datum] that owns its string data (if any). +pub type OwnedDatum = Datum>; + +/// The value of a [Variable](crate::variable::Variable). +/// +/// `T` is the type for a string `Datum`, typically [ByteString] or +/// `WithEncoding` or some borrowed type. #[derive(Clone)] -pub enum Datum { +pub enum Datum { /// A numeric value. Number( /// A number, or `None` for the system-missing value. @@ -218,67 +378,159 @@ pub enum Datum { /// A string value. String( /// The value, in the variable's encoding. 
- RawString, + T, ), } -impl Debug for Datum { +impl Datum> { + pub fn new_utf8(s: impl Into) -> Self { + let s: String = s.into(); + Datum::String(ByteString::from(s).with_encoding(UTF_8)) + } + + pub fn codepage_to_unicode(&mut self) { + if let Some(s) = self.as_string_mut() { + s.codepage_to_unicode(); + } + } + + pub fn without_encoding(self) -> Datum { + self.map_string(|s| s.into_inner()) + } +} + +impl<'a> Datum>> { + pub fn into_owned(self) -> Datum> { + self.map_string(|s| s.into_owned()) + } +} + +impl Datum +where + T: EncodedString, +{ + pub fn as_borrowed(&self) -> Datum>> { + self.as_ref().map_string(|s| s.as_encoded_byte_str()) + } + pub fn cloned(&self) -> Datum> { + self.as_ref().map_string(|s| s.cloned()) + } +} + +impl Debug for Datum +where + B: Debug, +{ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { match self { - Datum::Number(Some(number)) => write!(f, "{number:?}"), - Datum::Number(None) => write!(f, "SYSMIS"), - Datum::String(s) => write!(f, "{:?}", s), + Self::Number(Some(number)) => write!(f, "{number:?}"), + Self::Number(None) => write!(f, "SYSMIS"), + Self::String(s) => write!(f, "{:?}", s), } } } -impl PartialEq for Datum { - fn eq(&self, other: &Self) -> bool { +impl Display for Datum +where + T: Display, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Number(None) => write!(f, "SYSMIS"), + Self::Number(Some(number)) => number.display_plain().fmt(f), + Self::String(string) => string.fmt(f), + } + } +} + +impl Serialize for Datum +where + B: Serialize, +{ + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + Self::Number(number) => number.serialize(serializer), + Self::String(raw_string) => raw_string.serialize(serializer), + } + } +} + +impl PartialEq> for Datum +where + T: PartialEq, +{ + fn eq(&self, other: &Datum) -> bool { match (self, other) { - (Self::Number(Some(l0)), Self::Number(Some(r0))) => { - OrderedFloat(*l0) == 
OrderedFloat(*r0) + (Self::Number(Some(n1)), Datum::Number(Some(n2))) => { + OrderedFloat(*n1) == OrderedFloat(*n2) } - (Self::Number(None), Self::Number(None)) => true, - (Self::String(l0), Self::String(r0)) => l0 == r0, + (Self::Number(None), Datum::Number(None)) => true, + (Self::String(s1), Datum::String(s2)) => s1 == s2, _ => false, } } } -impl Eq for Datum {} +impl Eq for Datum where T: Eq {} -impl PartialOrd for Datum { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) +impl PartialOrd> for Datum +where + T: PartialOrd, +{ + fn partial_cmp(&self, other: &Datum) -> Option { + match (self, other) { + (Self::Number(a), Datum::Number(b)) => { + a.map(OrderedFloat).partial_cmp(&b.map(OrderedFloat)) + } + (Self::Number(_), Datum::String(_)) => Some(Ordering::Less), + (Self::String(_), Datum::Number(_)) => Some(Ordering::Greater), + (Self::String(a), Datum::String(b)) => a.partial_cmp(b), + } } } -impl Ord for Datum { +impl Ord for Datum +where + T: Ord, +{ fn cmp(&self, other: &Self) -> Ordering { - match (self, other) { - (Datum::Number(a), Datum::Number(b)) => match (a, b) { - (None, None) => Ordering::Equal, - (None, Some(_)) => Ordering::Less, - (Some(_), None) => Ordering::Greater, - (Some(a), Some(b)) => a.total_cmp(b), - }, - (Datum::Number(_), Datum::String(_)) => Ordering::Less, - (Datum::String(_), Datum::Number(_)) => Ordering::Greater, - (Datum::String(a), Datum::String(b)) => a.cmp(b), - } + self.partial_cmp(other).unwrap() } } -impl Hash for Datum { +impl Hash for Datum +where + T: Hash, +{ fn hash(&self, state: &mut H) { match self { - Datum::Number(number) => number.map(OrderedFloat).hash(state), - Datum::String(string) => string.hash(state), + Self::Number(number) => number.map(OrderedFloat).hash(state), + Self::String(string) => string.hash(state), } } } -impl Datum { +impl Datum { + pub fn as_ref(&self) -> Datum<&B> { + match self { + Datum::Number(number) => Datum::Number(*number), + Datum::String(string) => 
Datum::String(&string), + } + } + + pub fn map_string(self, f: F) -> Datum + where + F: Fn(B) -> R, + { + match self { + Datum::Number(number) => Datum::Number(number), + Datum::String(string) => Datum::String(f(string)), + } + } + /// Constructs a new numerical [Datum] for the system-missing value. pub const fn sysmis() -> Self { Self::Number(None) @@ -288,284 +540,322 @@ impl Datum { /// datum. pub fn as_number(&self) -> Option> { match self { - Datum::Number(number) => Some(*number), - Datum::String(_) => None, + Self::Number(number) => Some(*number), + Self::String(_) => None, } } /// Returns the string inside this datum, or `None` if this is a numeric /// datum. - pub fn as_string(&self) -> Option<&RawString> { + pub fn as_string(&self) -> Option<&B> { match self { - Datum::Number(_) => None, - Datum::String(s) => Some(s), + Self::Number(_) => None, + Self::String(s) => Some(s), } } - /// Returns the string inside this datum as a mutable borrow, or `None` if - /// this is a numeric datum. - pub fn as_string_mut(&mut self) -> Option<&mut RawString> { + /// Returns the string inside this datum, or `None` if this is a numeric + /// datum. + pub fn into_string(self) -> Option { + match self { + Self::Number(_) => None, + Self::String(s) => Some(s), + } + } + + /// Returns the [VarType] corresponding to this datum. + pub fn var_type(&self) -> VarType { match self { - Datum::Number(_) => None, - Datum::String(s) => Some(s), + Self::Number(_) => VarType::Numeric, + Self::String(_) => VarType::String, } } +} +impl Datum +where + T: RawString, +{ /// Returns true if this datum can be resized to the given `width` without /// loss, which is true only if this datum and `width` are both string or /// both numeric and, for string widths, if resizing would not drop any /// non-space characters. 
pub fn is_resizable(&self, width: VarWidth) -> bool { match (self, width) { - (Datum::Number(_), VarWidth::Numeric) => true, - (Datum::String(s), VarWidth::String(new_width)) => { - let new_len = new_width as usize; - new_len >= s.len() || s.0[new_len..].iter().all(|c| *c == b' ') - } + (Self::Number(_), VarWidth::Numeric) => true, + (Self::String(s), VarWidth::String(new_width)) => s.is_resizable(new_width as usize), _ => false, } } - /// Resizes this datum to the given `width`. - /// - /// # Panic - /// - /// Panics if resizing would change the datum from numeric to string or vice - /// versa. - pub fn resize(&mut self, width: VarWidth) { - match (self, width) { - (Datum::Number(_), VarWidth::Numeric) => (), - (Datum::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), - _ => unreachable!(), - } - } - - /// Returns the [VarType] corresponding to this datum. - pub fn var_type(&self) -> VarType { - match self { - Self::Number(_) => VarType::Numeric, - Self::String(_) => VarType::String, - } - } - /// Returns the [VarWidth] corresponding to this datum. pub fn width(&self) -> VarWidth { match self { - Datum::Number(_) => VarWidth::Numeric, - Datum::String(s) => VarWidth::String(s.len().try_into().unwrap()), + Self::Number(_) => VarWidth::Numeric, + Self::String(s) => VarWidth::String(s.len().try_into().unwrap()), } } /// Compares this datum and `other` for equality, ignoring trailing ASCII /// spaces in either, if they are both strings, for the purpose of /// comparison. 
- pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool { + pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool + where + R: RawString, + { match (self, other) { - (Self::String(a), Self::String(b)) => a.eq_ignore_trailing_spaces(b), - _ => self == other, + (Self::String(a), Datum::String(b)) => a.eq_ignore_trailing_spaces(b), + (Self::Number(a), Datum::Number(b)) => a == b, + _ => false, } } - /// Removes trailing ASCII spaces from this datum, if it is a string. - pub fn trim_end(&mut self) { + pub fn as_encoded(&self, encoding: &'static Encoding) -> Datum>> { + self.as_ref().map_string(|s| s.as_encoded(encoding)) + } + + pub fn with_encoding(self, encoding: &'static Encoding) -> Datum> { + self.map_string(|s| s.with_encoding(encoding)) + } +} + +impl Datum +where + B: EncodedString, +{ + pub fn quoted<'a>(&'a self) -> QuotedDatum<'a, B> { + QuotedDatum(self) + } +} + +pub struct QuotedDatum<'a, B>(&'a Datum); + +impl<'a, B> Display for QuotedDatum<'a, B> +where + B: Display, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match &self.0 { + Datum::Number(None) => write!(f, "SYSMIS"), + Datum::Number(Some(number)) => number.display_plain().fmt(f), + Datum::String(string) => write!(f, "\"{string}\""), + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum ResizeError { + MixedTypes, + TooWide, +} + +impl Datum { + /// Returns the string inside this datum as a mutable borrow, or `None` if + /// this is a numeric datum. + pub fn as_string_mut(&mut self) -> Option<&mut T> { match self { - Self::Number(_) => (), - Self::String(s) => s.trim_end(), + Self::Number(_) => None, + Self::String(s) => Some(s.borrow_mut()), + } + } + + /// Removes trailing ASCII spaces from this datum, if it is a string. + pub fn trim_end(&mut self) + where + T: MutRawString, + { + self.as_string_mut().map(|s| s.trim_end()); + } + + /// Resizes this datum to the given `width`. 
Returns an error, without + /// modifying the datum, if [is_resizable](Self::is_resizable) would return + /// false. + pub fn resize(&mut self, width: VarWidth) -> Result<(), ResizeError> + where + T: MutRawString, + { + match (self, width) { + (Self::Number(_), VarWidth::Numeric) => Ok(()), + (Self::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), + _ => Err(ResizeError::MixedTypes), } } } -impl From for Datum { +impl From for Datum { fn from(number: f64) -> Self { Some(number).into() } } -impl From> for Datum { +impl From> for Datum { fn from(value: Option) -> Self { Self::Number(value) } } -impl From<&str> for Datum { - fn from(value: &str) -> Self { - value.as_bytes().into() +impl<'a> From<&'a str> for Datum> { + fn from(value: &'a str) -> Self { + Datum::String(ByteStr(value.as_bytes())) } } -impl From<&[u8]> for Datum { - fn from(value: &[u8]) -> Self { - Self::String(value.into()) +impl<'a> From<&'a [u8]> for Datum> { + fn from(value: &'a [u8]) -> Self { + Self::String(ByteStr(value)) } } /// A case in a data set. -#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Case( +#[derive(Clone, Debug, Serialize)] +pub struct RawCase( /// One [Datum] per variable in the corresponding [Dictionary], in the same /// order. /// /// [Dictionary]: crate::dictionary::Dictionary - pub Vec, + pub Vec>, ); -/// An owned string and its [Encoding]. -/// -/// The string is not guaranteed to be valid in the encoding. -/// -/// The borrowed form of such a string is [EncodedStr]. -#[derive(Clone, Debug)] -pub enum EncodedString { - /// A string in arbitrary encoding. - Encoded { - /// The bytes of the string. - bytes: Vec, - - /// The string's encoding. - /// - /// This can be [UTF_8]. - encoding: &'static Encoding, - }, - - /// A string that is in UTF-8 and known to be valid. - Utf8 { - /// The string. - s: String, - }, -} - -impl EncodedString { - /// Returns the string's [Encoding]. 
- pub fn encoding(&self) -> &'static Encoding { - match self { - EncodedString::Encoded { encoding, .. } => encoding, - EncodedString::Utf8 { .. } => UTF_8, +impl RawCase { + pub fn as_encoding(&self, encoding: &'static Encoding) -> Case<&'_ [Datum]> { + Case { + encoding, + data: &self.0, } } - - /// Returns a borrowed form of this string. - pub fn borrowed(&self) -> EncodedStr<'_> { - match self { - EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding }, - EncodedString::Utf8 { s } => EncodedStr::Utf8 { s }, + pub fn with_encoding(self, encoding: &'static Encoding) -> Case>> { + Case { + encoding, + data: self.0, } } } -impl<'a> From> for EncodedString { - fn from(value: EncodedStr<'a>) -> Self { - match value { - EncodedStr::Encoded { bytes, encoding } => Self::Encoded { - bytes: bytes.into(), - encoding, - }, - EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, - } +pub struct Case +where + B: Borrow<[Datum]>, +{ + encoding: &'static Encoding, + data: B, +} + +impl Case +where + B: Borrow<[Datum]>, +{ + pub fn len(&self) -> usize { + self.data.borrow().len() + } + pub fn iter(&self) -> CaseIter<'_> { + self.into_iter() } } -/// A borrowed string and its [Encoding]. -/// -/// The string is not guaranteed to be valid in the encoding. -/// -/// The owned form of such a string is [EncodedString]. -pub enum EncodedStr<'a> { - /// A string in an arbitrary encoding - Encoded { - /// The bytes of the string. - bytes: &'a [u8], - - /// The string's encoding. - /// - /// THis can be [UTF_8]. - encoding: &'static Encoding, - }, - - /// A string in UTF-8 that is known to be valid. - Utf8 { - /// The string. - s: &'a str, - }, -} - -impl<'a> EncodedStr<'a> { - /// Construct a new string with an arbitrary encoding. - pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { - Self::Encoded { bytes, encoding } - } - - /// Returns this string recoded in UTF-8. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. 
- /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn as_str(&self) -> Cow<'_, str> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - encoding.decode_without_bom_handling(bytes).0 +impl Case>> { + pub fn into_unicode(self) -> Self { + if self.encoding == UTF_8 { + self + } else { + Self { + encoding: UTF_8, + data: self + .data + .into_iter() + .map(|datum| { + datum.map_string(|s| { + let mut s = s.with_encoding(self.encoding); + s.codepage_to_unicode(); + s.into_inner() + }) + }) + .collect(), } - EncodedStr::Utf8 { s } => Cow::from(*s), } } +} - /// Returns the bytes in the string, in its encoding. - pub fn as_bytes(&self) -> &[u8] { - match self { - EncodedStr::Encoded { bytes, .. } => bytes, - EncodedStr::Utf8 { s } => s.as_bytes(), +impl Serialize for Case +where + B: Borrow<[Datum]>, +{ + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut seq = serializer.serialize_seq(Some(self.len()))?; + for datum in self.iter() { + seq.serialize_element(&datum)?; } + seq.end() } +} - /// Returns this string recoded in `encoding`. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - let utf8 = encoding.decode_without_bom_handling(bytes).0; - match encoding.encode(&utf8).0 { - Cow::Borrowed(_) => { - // Recoding into UTF-8 and then back did not change anything. 
- Cow::from(*bytes) - } - Cow::Owned(owned) => Cow::Owned(owned), - } - } - EncodedStr::Utf8 { s } => encoding.encode(s).0, - } +pub struct CaseIter<'a> { + encoding: &'static Encoding, + iter: std::slice::Iter<'a, Datum>, +} + +impl<'a> Iterator for CaseIter<'a> { + type Item = Datum>>; + + fn next(&mut self) -> Option { + self.iter.next().map(|d| d.as_encoded(self.encoding)) } +} - /// Returns true if this string is empty. - pub fn is_empty(&self) -> bool { - match self { - EncodedStr::Encoded { bytes, .. } => bytes.is_empty(), - EncodedStr::Utf8 { s } => s.is_empty(), +impl<'a, B> IntoIterator for &'a Case +where + B: Borrow<[Datum]>, +{ + type Item = Datum>>; + + type IntoIter = CaseIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + CaseIter { + encoding: self.encoding, + iter: self.data.borrow().into_iter(), } } +} + +impl IntoIterator for Case>> { + type Item = Datum>; + + type IntoIter = CaseIntoIter; - /// Returns a helper for displaying this string in double quotes. - pub fn quoted(&self) -> QuotedEncodedStr { - QuotedEncodedStr(self) + fn into_iter(self) -> Self::IntoIter { + CaseIntoIter { + encoding: self.encoding, + iter: self.data.into_iter(), + } } } -impl<'a> From<&'a str> for EncodedStr<'a> { - fn from(s: &'a str) -> Self { - Self::Utf8 { s } - } +pub struct CaseIntoIter { + encoding: &'static Encoding, + iter: std::vec::IntoIter>, } -impl<'a> From<&'a String> for EncodedStr<'a> { - fn from(s: &'a String) -> Self { - Self::Utf8 { s: s.as_str() } +impl Iterator for CaseIntoIter { + type Item = Datum>; + + fn next(&mut self) -> Option { + self.iter + .next() + .map(|datum| datum.with_encoding(self.encoding)) } } -/// Helper struct for displaying a [QuotedEncodedStr] in double quotes. 
-pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>); +pub struct Quoted(T) +where + T: Display; -impl Display for QuotedEncodedStr<'_> { +impl Display for Quoted +where + T: Display, +{ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0.as_str()) + write!(f, "\"{}\"", &self.0) } } diff --git a/rust/pspp/src/data/encoded.rs b/rust/pspp/src/data/encoded.rs new file mode 100644 index 0000000000..304769d20d --- /dev/null +++ b/rust/pspp/src/data/encoded.rs @@ -0,0 +1,296 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . 
+ +use std::{ + borrow::Cow, + cmp::Ordering, + fmt::{Debug, Display}, + hash::Hash, +}; + +use encoding_rs::{Encoding, UTF_8}; +use serde::Serialize; + +use crate::{ + data::{ByteCow, ByteStr, ByteString, MutRawString, Quoted, RawString, ResizeError}, + variable::VarWidth, +}; + +pub trait Encoded { + fn encoding(&self) -> &'static Encoding; +} + +impl Encoded for &'_ str { + fn encoding(&self) -> &'static Encoding { + UTF_8 + } +} + +impl Encoded for String { + fn encoding(&self) -> &'static Encoding { + UTF_8 + } +} + +impl Encoded for &'_ String { + fn encoding(&self) -> &'static Encoding { + UTF_8 + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct WithEncoding { + pub encoding: &'static Encoding, + pub inner: T, +} + +impl WithEncoding { + pub fn new(inner: T, encoding: &'static Encoding) -> Self { + Self { inner, encoding } + } + + pub fn into_inner(self) -> T { + self.inner + } +} + +impl<'a> WithEncoding> { + pub fn into_owned(self) -> WithEncoding { + WithEncoding::new(self.inner.into_owned(), self.encoding) + } +} + +impl PartialOrd for WithEncoding +where + T: PartialOrd, +{ + fn partial_cmp(&self, other: &Self) -> Option { + self.inner.partial_cmp(&other.inner) + } +} + +impl Ord for WithEncoding +where + T: Ord, +{ + fn cmp(&self, other: &Self) -> Ordering { + self.inner.cmp(&other.inner) + } +} + +impl Serialize for WithEncoding +where + WithEncoding: EncodedString, +{ + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.as_str().serialize(serializer) + } +} + +pub trait EncodedString: Encoded + RawString + Display + Debug { + fn as_str(&self) -> Cow<'_, str>; + fn into_string(self) -> String + where + Self: Sized, + { + self.as_str().into_owned() + } + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding>; + fn as_encoded_byte_str(&self) -> WithEncoding> { + WithEncoding::new(ByteStr(self.raw_string_bytes()), self.encoding()) + } + fn cloned(&self) -> WithEncoding { + 
WithEncoding::new(ByteString::from(self.raw_string_bytes()), self.encoding()) + } + fn quoted(&self) -> Quoted<&Self> + where + Self: Sized, + { + Quoted(self) + } +} + +impl<'a> EncodedString for &'a str { + fn as_str(&self) -> Cow<'_, str> { + Cow::from(*self) + } + + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding> { + WithEncoding::new(ByteCow(encoding.encode(self).0), encoding) + } +} + +impl EncodedString for String { + fn as_str(&self) -> Cow<'_, str> { + Cow::from(String::as_str(&self)) + } + + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding> { + WithEncoding::new(ByteCow(encoding.encode(&self).0), encoding) + } +} + +impl EncodedString for &'_ String { + fn as_str(&self) -> Cow<'_, str> { + Cow::from(String::as_str(&self)) + } + + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding> { + WithEncoding::new(ByteCow(encoding.encode(String::as_str(&self)).0), encoding) + } +} + +impl RawString for WithEncoding +where + T: RawString, +{ + fn raw_string_bytes(&self) -> &[u8] { + self.inner.raw_string_bytes() + } +} + +impl MutRawString for WithEncoding +where + T: MutRawString, +{ + fn resize(&mut self, new_len: usize) -> Result<(), ResizeError> { + self.inner.resize(new_len) + } + + fn trim_end(&mut self) { + self.inner.trim_end(); + } +} + +impl EncodedString for WithEncoding +where + T: RawString, +{ + /// Returns this string recoded in UTF-8. Invalid characters will be + /// replaced by [REPLACEMENT_CHARACTER]. + /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + fn as_str(&self) -> Cow<'_, str> { + self.encoding + .decode_without_bom_handling(self.raw_string_bytes()) + .0 + } + + /// Returns this string recoded in `encoding`. Invalid characters will be + /// replaced by [REPLACEMENT_CHARACTER]. 
+ /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding> { + let utf8 = self.as_str(); + let inner = match encoding.encode(&utf8).0 { + Cow::Borrowed(_) => { + // Recoding into UTF-8 and then back did not change anything. + Cow::from(self.raw_string_bytes()) + } + Cow::Owned(owned) => Cow::Owned(owned), + }; + WithEncoding { + encoding, + inner: ByteCow(inner), + } + } +} + +impl WithEncoding { + pub fn codepage_to_unicode(&mut self) { + if self.encoding() != UTF_8 { + let new_len = (self.inner.len() * 3).min(VarWidth::MAX_STRING as usize); + if let Cow::Owned(string) = self + .encoding() + .decode_without_bom_handling(self.raw_string_bytes()) + .0 + { + self.inner = ByteString::from(string); + } + + // Use `self.inner.0.resize` (instead of `self.inner.resize()`) + // because this is a forced resize that can trim off non-spaces. + self.inner.0.resize(new_len, b' '); + + self.encoding = UTF_8; + } + } +} + +impl Encoded for WithEncoding { + fn encoding(&self) -> &'static Encoding { + self.encoding + } +} + +impl Display for WithEncoding +where + T: RawString, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.as_str()) + } +} + +impl Hash for WithEncoding +where + T: Hash, +{ + fn hash(&self, state: &mut H) { + self.inner.hash(state); + } +} + +#[cfg(test)] +mod tests { + use std::{char::REPLACEMENT_CHARACTER, iter::repeat_n}; + + use encoding_rs::{Encoding, UTF_8, WINDOWS_1252}; + + use crate::data::{ByteString, EncodedString, RawString}; + + #[test] + fn codepage_to_unicode() { + fn check_unicode(original: &str, encoding: &'static Encoding, expected: &str) { + let original = ByteString::from(encoding.encode(original).0).with_encoding(encoding); + let mut actual = original.clone(); + actual.codepage_to_unicode(); + assert_eq!(actual.as_str().len(), expected.len()); + assert_eq!(actual.as_str(), expected); + } + + check_unicode("abc", UTF_8, 
"abc"); + check_unicode("abc", WINDOWS_1252, "abc "); + check_unicode("éèäî", WINDOWS_1252, "éèäî "); + check_unicode( + &repeat_n('é', 15000).collect::(), + WINDOWS_1252, + &repeat_n('é', 15000) + .chain(repeat_n(' ', 2767)) + .collect::(), + ); + check_unicode( + &repeat_n('é', 20000).collect::(), + WINDOWS_1252, + &repeat_n('é', 16383) + .chain(std::iter::once(REPLACEMENT_CHARACTER)) + .collect::(), + ); + } +} diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 151e218987..923f289b3e 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -14,226 +14,39 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . -//! Dictionaries and variables. +//! Dictionaries. use core::str; use std::{ borrow::Cow, - cmp::Ordering, - collections::{BTreeMap, BTreeSet, HashMap, HashSet}, - fmt::{Debug, Display, Formatter, Result as FmtResult}, - hash::Hash, - ops::{Bound, Not, RangeBounds, RangeInclusive}, - str::FromStr, + collections::{btree_set, BTreeSet, HashSet}, + ops::{Bound, Index, RangeBounds, RangeInclusive}, }; -use encoding_rs::Encoding; +use encoding_rs::{Encoding, UTF_8}; use enum_map::{Enum, EnumMap}; use indexmap::IndexSet; -use num::integer::div_ceil; +use serde::{ + ser::{SerializeMap, SerializeSeq, SerializeStruct}, + Serialize, +}; +use smallvec::SmallVec; use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ - data::Datum, - format::{DisplayPlain, Format}, + data::{ByteString, Datum, RawString}, identifier::{ByIdentifier, HasIdentifier, Identifier}, - output::pivot::{Axis3, Dimension, Footnote, Footnotes, Group, PivotTable, Value}, + output::pivot::{ + Axis3, Dimension, Display26Adic, Footnote, Footnotes, Group, PivotTable, Value, + }, settings::Show, + variable::{Attributes, VarWidth, Variable}, }; /// An index within [Dictionary::variables]. pub type DictIndex = usize; -/// Variable type. 
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum VarType { - /// A numeric variable. - Numeric, - - /// A string variable. - String, -} - -impl Not for VarType { - type Output = Self; - - fn not(self) -> Self::Output { - match self { - Self::Numeric => Self::String, - Self::String => Self::Numeric, - } - } -} - -impl Not for &VarType { - type Output = VarType; - - fn not(self) -> Self::Output { - !*self - } -} - -impl Display for VarType { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - VarType::Numeric => write!(f, "numeric"), - VarType::String => write!(f, "string"), - } - } -} - -/// [VarType], plus a width for [VarType::String]. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum VarWidth { - Numeric, - String(u16), -} - -impl PartialOrd for VarWidth { - fn partial_cmp(&self, other: &Self) -> Option { - match (self, other) { - (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal), - (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)), - _ => None, - } - } -} - -impl VarWidth { - pub const MAX_STRING: u16 = 32767; - - pub fn n_dict_indexes(self) -> usize { - match self { - VarWidth::Numeric => 1, - VarWidth::String(w) => div_ceil(w as usize, 8), - } - } - - fn width_predicate( - a: Option, - b: Option, - f: impl Fn(u16, u16) -> u16, - ) -> Option { - match (a, b) { - (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), - (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { - Some(VarWidth::String(f(a, b))) - } - _ => None, - } - } - - /// Returns the wider of `self` and `other`: - /// - Numerical variable widths are equally wide. - /// - Longer strings are wider than shorter strings. - /// - Numerical and string types are incomparable, so result in `None`. - /// - Any `None` in the input yields `None` in the output. 
- pub fn wider(a: Option, b: Option) -> Option { - Self::width_predicate(a, b, |a, b| a.max(b)) - } - - /// Returns the narrower of `self` and `other` (see [`Self::wider`]). - pub fn narrower(a: Option, b: Option) -> Option { - Self::width_predicate(a, b, |a, b| a.min(b)) - } - - pub fn default_display_width(&self) -> u32 { - match self { - VarWidth::Numeric => 8, - VarWidth::String(width) => *width.min(&32) as u32, - } - } - - pub fn is_long_string(&self) -> bool { - if let Self::String(width) = self { - *width > 8 - } else { - false - } - } - - pub fn as_string_width(&self) -> Option { - match self { - VarWidth::Numeric => None, - VarWidth::String(width) => Some(*width as usize), - } - } - - pub fn is_numeric(&self) -> bool { - *self == Self::Numeric - } - - pub fn is_string(&self) -> bool { - !self.is_numeric() - } - - pub fn is_very_long(&self) -> bool { - match *self { - VarWidth::Numeric => false, - VarWidth::String(width) => width >= 256, - } - } - - /// Number of bytes per segment by which the amount of space for very long - /// string variables is allocated. - const EFFECTIVE_VLS_CHUNK: usize = 252; - - /// Returns the number of "segments" used for writing case data for a - /// variable with this width. A segment is a physical variable in the - /// system file that represents some piece of a logical variable as seen by - /// a PSPP user. Only very long string variables have more than one - /// segment. - pub fn n_segments(&self) -> usize { - if self.is_very_long() { - self.as_string_width() - .unwrap() - .div_ceil(Self::EFFECTIVE_VLS_CHUNK) - } else { - 1 - } - } - - /// Returns the width to allocate to the segment with the given - /// `segment_idx` within this variable. A segment is a physical variable in - /// the system file that represents some piece of a logical variable as seen - /// by a PSPP user. 
- pub fn segment_alloc_width(&self, segment_idx: usize) -> usize { - debug_assert!(segment_idx < self.n_segments()); - debug_assert!(self.is_very_long()); - - if segment_idx < self.n_segments() - 1 { - 255 - } else { - self.as_string_width().unwrap() - segment_idx * Self::EFFECTIVE_VLS_CHUNK - } - } - - pub fn display_adjective(&self) -> VarWidthAdjective { - VarWidthAdjective(*self) - } -} - -impl From for VarType { - fn from(source: VarWidth) -> Self { - match source { - VarWidth::Numeric => VarType::Numeric, - VarWidth::String(_) => VarType::String, - } - } -} - -pub struct VarWidthAdjective(VarWidth); - -impl Display for VarWidthAdjective { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self.0 { - VarWidth::Numeric => write!(f, "numeric"), - VarWidth::String(width) => write!(f, "{width}-byte string"), - } - } -} - /// A collection of variables, plus additional metadata. #[derive(Clone, Debug)] pub struct Dictionary { @@ -241,19 +54,19 @@ pub struct Dictionary { pub variables: IndexSet>, /// Indexes into `variables` of the `SPLIT FILE` variables. - pub split_file: Vec, + split_file: Vec, /// Index of the weight variable, if any. /// /// The weight variable must be numeric. - pub weight: Option, + weight: Option, /// Index of the filter variable, if any. /// /// The filter variable must be numeric. If there is a filter variable, /// then data analysis excludes cases whose filter value is zero or system- /// or user-missing. - pub filter: Option, + filter: Option, /// An optional limit on the number of cases read by procedures. pub case_limit: Option, @@ -265,7 +78,7 @@ pub struct Dictionary { pub documents: Vec, /// Named collections of variables within the dictionary. - pub vectors: HashSet>, + vectors: HashSet>, /// Attributes for the dictionary itself. /// @@ -273,15 +86,65 @@ pub struct Dictionary { pub attributes: Attributes, /// Multiple response sets. - pub mrsets: BTreeSet>, + mrsets: BTreeSet>, /// Variable sets. 
/// /// Only the GUI makes use of variable sets. - pub variable_sets: Vec, + variable_sets: Vec, /// Character encoding for the dictionary and the data. - pub encoding: &'static Encoding, + encoding: &'static Encoding, +} + +impl PartialEq for Dictionary { + fn eq(&self, other: &Self) -> bool { + // We have to compare the dereferenced versions of fields that use + // [ByIdentifier. Otherwise we would just be comparing their names. + self.variables + .iter() + .map(|var| &*var) + .eq(other.variables.iter().map(|var| &*var)) + && self.split_file == other.split_file + && self.weight == other.weight + && self.filter == other.filter + && self.case_limit == other.case_limit + && self.file_label == other.file_label + && self.documents == other.documents + && self + .vectors + .iter() + .map(|vector| &*vector) + .eq(other.vectors.iter().map(|vector| &*vector)) + && self.attributes == other.attributes + && self + .mrsets + .iter() + .map(|mrset| &*mrset) + .eq(other.mrsets.iter().map(|mrset| &*mrset)) + && self.variable_sets == other.variable_sets + && self.encoding == other.encoding + } +} + +impl Serialize for Dictionary { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut map = serializer.serialize_struct("Dictionary", 12)?; + map.serialize_field("variables", &self.variables)?; + map.serialize_field("split_file", &self.split_vars())?; + map.serialize_field("weight", &self.weight_var())?; + map.serialize_field("filter", &self.filter_var())?; + map.serialize_field("documents", &self.documents)?; + map.serialize_field("vectors", &self.vectors())?; + map.serialize_field("attributes", &self.attributes)?; + map.serialize_field("mrsets", &self.mrsets())?; + map.serialize_field("variable_sets", &self.variable_sets())?; + map.serialize_field("encoding", self.encoding)?; + map.end() + } } #[derive(Debug, ThisError)] @@ -296,6 +159,24 @@ pub enum AddVarError { }, } +/// Weight variable must be numeric. 
+#[derive(Debug, ThisError)] +#[error("Weight variable must be numeric.")] +pub struct InvalidWeightVariable; + +/// Filter variable must be numeric. +#[derive(Debug, ThisError)] +#[error("Filter variable must be numeric.")] +pub struct InvalidFilterVariable; + +/// Invalid dictionary index. +#[derive(Debug, Clone, ThisError)] +#[error("Invalid index {index} in dictionary with {n} variables.")] +pub struct DictIndexError { + index: usize, + n: usize, +} + impl Dictionary { /// Creates a new, empty dictionary with the specified `encoding`. pub fn new(encoding: &'static Encoding) -> Self { @@ -315,17 +196,105 @@ impl Dictionary { } } + pub fn encoding(&self) -> &'static Encoding { + self.encoding + } + /// Returns a reference to the weight variable, if any. pub fn weight_var(&self) -> Option<&Variable> { self.weight.map(|index| &self.variables[index].0) } - /// Returns references to all the split variables, if any. - pub fn split_vars(&self) -> Vec<&Variable> { - self.split_file + /// Returns the weight variable's dictionary index. + pub fn weight_index(&self) -> Option { + self.weight + } + + /// Sets the weight variable to the variable with the given dictionary + /// index. + /// + /// # Panic + /// + /// Panics if `dict_index` is not a valid dictionary index. + pub fn set_weight( + &mut self, + dict_index: Option, + ) -> Result<(), InvalidWeightVariable> { + if let Some(dict_index) = dict_index + && !self.variables[dict_index].width.is_numeric() + { + Err(InvalidWeightVariable) + } else { + self.weight = dict_index; + Ok(()) + } + } + + /// Returns a reference to the filter variable, if any. + pub fn filter_var(&self) -> Option<&Variable> { + self.filter.map(|index| &self.variables[index].0) + } + + /// Returns the filter variable's dictionary index. + pub fn filter_index(&self) -> Option { + self.filter + } + + /// Sets the filter variable to the variable with the given dictionary + /// index. 
+ /// + /// # Panic + /// + /// Panics if `dict_index` is not a valid dictionary index. + pub fn set_filter( + &mut self, + dict_index: Option, + ) -> Result<(), InvalidFilterVariable> { + if let Some(dict_index) = dict_index + && !self.variables[dict_index].width.is_numeric() + { + Err(InvalidFilterVariable) + } else { + self.filter = dict_index; + Ok(()) + } + } + + /// Returns the split variables. + pub fn split_vars(&self) -> MappedVariables<'_> { + MappedVariables::new_unchecked(self, &self.split_file) + } + + pub fn vectors(&self) -> Vectors<'_> { + Vectors::new(self) + } + + pub fn vectors_mut(&mut self) -> VectorsMut<'_> { + VectorsMut::new(self) + } + + pub fn mrsets(&self) -> MultipleResponseSets<'_> { + MultipleResponseSets::new(self) + } + + pub fn mrsets_mut(&mut self) -> MultipleResponseSetsMut<'_> { + MultipleResponseSetsMut::new(self) + } + + pub fn variable_sets(&self) -> VariableSets<'_> { + VariableSets::new(self) + } + + pub fn add_variable_set(&mut self, set: DictIndexVariableSet) { + assert!(set + .variables .iter() - .map(|index| &self.variables[*index].0) - .collect() + .all(|dict_index| *dict_index < self.variables.len())); + self.variable_sets.push(set); + } + + pub fn remove_variable_set(&mut self, var_set_index: usize) { + self.variable_sets.remove(var_set_index); } /// Adds `variable` at the end of the dictionary and returns its index. @@ -334,9 +303,9 @@ impl Dictionary { /// the same name (or a variant with different case), or if `variable`'s /// encoding differs from the dictionary's. 
pub fn add_var(&mut self, variable: Variable) -> Result { - if variable.encoding != self.encoding { + if variable.encoding() != self.encoding { Err(AddVarError::WrongEncoding { - var_encoding: variable.encoding, + var_encoding: variable.encoding(), dict_encoding: self.encoding, }) } else { @@ -431,7 +400,7 @@ impl Dictionary { } else if index < end { None } else { - Some(index - end - start) + Some(index - (end - start)) } }) } @@ -549,6 +518,191 @@ impl Dictionary { (group, values) } + + pub fn to_pivot_table(&self) -> PivotTable { + let (group, data) = self.to_pivot_rows(); + PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data( + data.into_iter() + .enumerate() + .filter(|(_row, value)| !value.is_empty()) + .map(|(row, value)| ([row], value)), + ) + } + + pub fn all_pivot_tables(&self) -> Vec { + let mut pivot_tables = Vec::new(); + pivot_tables.push(self.to_pivot_table()); + pivot_tables.push(self.output_variables().to_pivot_table()); + pivot_tables.extend(self.output_value_labels().to_pivot_table()); + pivot_tables.extend(self.output_mrsets().to_pivot_table()); + pivot_tables.extend(self.output_attributes().to_pivot_table()); + pivot_tables.extend(self.output_variable_sets().to_pivot_table()); + pivot_tables + } + + pub fn short_names(&self) -> Vec> { + struct PickShortName<'a> { + variable_name: &'a Identifier, + used_names: &'a mut HashSet, + encoding: &'static Encoding, + index: usize, + } + impl<'a> PickShortName<'a> { + fn new( + variable_name: &'a Identifier, + used_names: &'a mut HashSet, + encoding: &'static Encoding, + ) -> Self { + Self { + variable_name, + used_names, + encoding, + index: 0, + } + } + + fn next(&mut self) -> Identifier { + loop { + let name = if self.index == 0 { + self.variable_name.shortened(self.encoding) + } else { + self.variable_name + .with_suffix( + &format!("_{}", Display26Adic::new_uppercase(self.index)), + self.encoding, + 8, + ) + .or_else(|_| { + Identifier::new(format!( + "V{}", + 
Display26Adic::new_uppercase(self.index) + )) + }) + .unwrap() + }; + if !self.used_names.contains(&name) { + self.used_names.insert(name.clone()); + return name; + } + self.index += 1; + } + } + } + + let mut used_names = HashSet::new(); + + // Each variable whose name is short has the best claim to its short + // name. + let mut short_names: Vec; 1]>> = self + .variables + .iter() + .map(|variable| { + let n = variable.width.segments().len(); + let mut names = SmallVec::with_capacity(n); + if self.encoding.encode(variable.name.as_str()).0.len() <= 8 { + used_names.insert(variable.name.clone()); + names.push(Some(variable.name.clone())) + } + while names.len() < n { + names.push(None); + } + names + }) + .collect(); + + // Each variable with an assigned short name for its first segment now + // gets it unless there is a conflict. In case of conflict, the + // claimant earlier in dictionary order wins. Then similarly for + // additional segments of very long strings. + for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) { + if short_names[0].is_none() + && let Some(short_name) = variable.short_names.first() + && !used_names.contains(&short_name) + { + used_names.insert(short_name.clone()); + short_names[0] = Some(short_name.clone()); + } + } + for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) { + for (index, assigned_short_name) in short_names.iter_mut().enumerate().skip(1) { + if assigned_short_name.is_none() + && let Some(short_name) = variable.short_names.get(index) + && !used_names.contains(&short_name) + { + used_names.insert(short_name.clone()); + *assigned_short_name = Some(short_name.clone()); + } + } + } + + // Assign short names to first segment of remaining variables, + // then similarly for additional segments. 
+ for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) { + if short_names[0].is_none() { + short_names[0] = + Some(PickShortName::new(&variable.name, &mut used_names, self.encoding).next()); + } + } + for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) { + let mut picker = PickShortName::new(&variable.name, &mut used_names, self.encoding); + for assigned_short_name in short_names.iter_mut().skip(1) { + if assigned_short_name.is_none() { + *assigned_short_name = Some(picker.next()); + } + } + } + + short_names + .into_iter() + .map(|names| names.into_iter().flatten().collect()) + .collect() + } + + pub fn codepage_to_unicode(&mut self) { + if self.encoding == UTF_8 { + return; + } + + let mut variables = IndexSet::new(); + let mut index = 0; + for mut variable in self.variables.drain(..) { + variable.codepage_to_unicode(); + while variables.contains(&variable) { + index += 1; + variable.name = Identifier::new(format!("Var{index}")).unwrap(); + } + variables.insert(variable); + } + self.variables = variables; + + let mut index = 0; + let mut vectors = self.vectors.drain().collect::>(); + vectors.sort(); + for mut vector in vectors { + vector.codepage_to_unicode(); + while self.vectors.contains(&vector) { + index += 1; + vector.name = Identifier::new(format!("Vec{index}")).unwrap(); + } + self.vectors.insert(vector); + } + + self.attributes.codepage_to_unicode(); + + let mut mrsets = BTreeSet::new(); + let mut index = 0; + while let Some(mut mrset) = self.mrsets.pop_first() { + mrset.codepage_to_unicode(); + while mrsets.contains(&mrset) { + index += 1; + mrset.name = Identifier::new(format!("MrSet{index}")).unwrap(); + } + mrsets.insert(mrset); + } + self.mrsets = mrsets; + + self.encoding = UTF_8; + } } pub struct OutputVariables<'a> { @@ -584,6 +738,7 @@ impl<'a> OutputVariables<'a> { (Axis3::Y, Dimension::new(names)), (Axis3::X, Dimension::new(attributes)), ]) + .with_title("Variables") .with_show_empty(); 
for (var_index, variable) in self.dictionary.variables.iter().enumerate() { for (field, field_index) in &columns { @@ -612,13 +767,8 @@ impl<'a> OutputVariables<'a> { VariableField::WriteFormat => { Some(Value::new_user_text(variable.write_format.to_string())) } - VariableField::MissingValues if !variable.missing_values.is_empty() => { - Some(Value::new_user_text( - variable - .missing_values - .display(variable.encoding) - .to_string(), - )) + VariableField::MissingValues if !variable.missing_values().is_empty() => { + Some(Value::new_user_text(variable.missing_values().to_string())) } VariableField::MissingValues => None, } @@ -660,7 +810,10 @@ impl<'a> OutputValueLabels<'a> { for (datum, label) in sorted_value_labels { let mut value = Value::new_variable_value(variable, datum) .with_show_value_label(Some(Show::Value)); - if variable.missing_values.contains(datum) { + if variable + .missing_values() + .contains(&datum.as_encoded(variable.encoding())) + { value.add_footnote(&missing_footnote); } group.push(value); @@ -673,11 +826,15 @@ impl<'a> OutputValueLabels<'a> { } values.push(group); } - let mut pt = PivotTable::new([(Axis3::Y, Dimension::new(values))]); - for (row, datum) in data.into_iter().enumerate() { - pt.insert(&[row], datum); - } - Some(pt) + Some( + PivotTable::new([(Axis3::Y, Dimension::new(values))]) + .with_title("Value Labels") + .with_data( + data.into_iter() + .enumerate() + .map(|(row, datum)| ([row], datum)), + ), + ) } } @@ -740,17 +897,21 @@ impl<'a> OutputVariableSets<'a> { } variable_sets.push(group); } - let mut pt = PivotTable::new([ - (Axis3::Y, Dimension::new(variable_sets)), - ( - Axis3::X, - Dimension::new(Group::new("Attributes").with("Variable")), + Some( + PivotTable::new([ + (Axis3::Y, Dimension::new(variable_sets)), + ( + Axis3::X, + Dimension::new(Group::new("Attributes").with("Variable")), + ), + ]) + .with_title("Variable Sets") + .with_data( + data.into_iter() + .enumerate() + .map(|(row, datum)| ([row, 0], datum)), ), - 
]); - for (row, datum) in data.into_iter().enumerate() { - pt.insert(&[row, 0], datum); - } - Some(pt) + ) } } @@ -783,13 +944,17 @@ impl<'a> OutputMrsets<'a> { let mut pt = PivotTable::new([ (Axis3::Y, Dimension::new(mrsets)), (Axis3::X, Dimension::new(attributes)), - ]); + ]) + .with_title("Multiple Response Sets"); for (row, mrset) in self.dictionary.mrsets.iter().enumerate() { pt.insert(&[row, 0], mrset.label.as_str()); let mr_type_name = match &mrset.mr_type { MultipleResponseType::MultipleDichotomy { datum, .. } => { - pt.insert(&[row, 2], Value::new_datum(datum, self.dictionary.encoding)); + pt.insert( + &[row, 2], + Value::new_datum(&datum.as_encoded(self.dictionary.encoding)), + ); "Dichotomies" } MultipleResponseType::MultipleCategory => "Categories", @@ -873,7 +1038,8 @@ impl<'a> OutputAttributes<'a> { let mut pt = PivotTable::new([ (Axis3::X, Dimension::new(values)), (Axis3::Y, Dimension::new(variables)), - ]); + ]) + .with_title("Data File and Variable Attributes"); for (row, datum) in data.into_iter().enumerate() { pt.insert(&[0, row], datum); } @@ -895,711 +1061,851 @@ where }); } -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] -pub enum Role { - #[default] - Input, - Target, - Both, - None, - Partition, - Split, +pub fn escape_value_label(unescaped: &str) -> Cow<'_, str> { + if unescaped.contains("\n") { + unescaped.replace("\n", "\\n").into() + } else { + unescaped.into() + } } -impl Role { - fn as_str(&self) -> &'static str { - match self { - Role::Input => "Input", - Role::Target => "Target", - Role::Both => "Both", - Role::None => "None", - Role::Partition => "Partition", - Role::Split => "Split", - } +pub fn unescape_value_label(escaped: &str) -> Cow<'_, str> { + if escaped.contains("\\n") { + escaped.replace("\\n", "\n").into() + } else { + escaped.into() } } -impl FromStr for Role { - type Err = InvalidRole; +#[derive(Clone, Debug)] +pub struct DictIndexVector { + pub name: Identifier, + pub variables: Vec, +} - fn from_str(s: &str) -> 
Result { - for (string, value) in [ - ("input", Role::Input), - ("target", Role::Target), - ("both", Role::Both), - ("none", Role::None), - ("partition", Role::Partition), - ("split", Role::Split), - ] { - if string.eq_ignore_ascii_case(s) { - return Ok(value); - } - } - Err(InvalidRole::UnknownRole(s.into())) +impl DictIndexVector { + fn with_updated_dict_indexes( + mut self, + f: impl Fn(DictIndex) -> Option, + ) -> Option { + update_dict_index_vec(&mut self.variables, f); + (!self.variables.is_empty()).then_some(self) } -} -impl TryFrom for Role { - type Error = InvalidRole; + pub fn codepage_to_unicode(&mut self) { + self.name.codepage_to_unicode(); + } +} - fn try_from(value: i32) -> Result { - match value { - 0 => Ok(Role::Input), - 1 => Ok(Role::Target), - 2 => Ok(Role::Both), - 3 => Ok(Role::None), - 4 => Ok(Role::Partition), - 5 => Ok(Role::Split), - _ => Err(InvalidRole::UnknownRole(value.to_string())), - } +impl HasIdentifier for DictIndexVector { + fn identifier(&self) -> &UniCase { + &self.name.0 } } -#[derive(Clone, Debug, Default, PartialEq, Eq)] -pub struct Attributes(pub BTreeMap>); +pub struct Vector<'a> { + dictionary: &'a Dictionary, + vector: &'a DictIndexVector, +} -impl Attributes { - pub fn new() -> Self { - Self(BTreeMap::new()) +impl<'a> Vector<'a> { + fn new_unchecked(dictionary: &'a Dictionary, vector: &'a DictIndexVector) -> Self { + Self { dictionary, vector } + } + pub fn new( + dictionary: &'a Dictionary, + vector: &'a DictIndexVector, + ) -> Result { + MappedVariables::new(dictionary, &vector.variables)?; + Ok(Self::new_unchecked(dictionary, vector)) + } + pub fn name(&self) -> &'a Identifier { + &self.vector.name + } + pub fn variables(&self) -> MappedVariables<'a> { + MappedVariables::new_unchecked(self.dictionary, &self.vector.variables) } +} + +pub struct MappedVariables<'a> { + dictionary: &'a Dictionary, + dict_indexes: &'a [DictIndex], +} - pub fn contains_name(&self, name: &Identifier) -> bool { - self.0.contains_key(name) 
+impl<'a> MappedVariables<'a> { + fn new_unchecked(dictionary: &'a Dictionary, dict_indexes: &'a [DictIndex]) -> Self { + Self { + dictionary, + dict_indexes, + } } - pub fn insert(&mut self, name: Identifier, values: Vec) { - self.0.insert(name, values); + pub fn new( + dictionary: &'a Dictionary, + dict_indexes: &'a [DictIndex], + ) -> Result { + let n = dictionary.variables.len(); + for index in dict_indexes.iter().copied() { + if index >= n { + return Err(DictIndexError { index, n }); + } + } + Ok(Self::new_unchecked(dictionary, dict_indexes)) } - pub fn append(&mut self, other: &mut Self) { - self.0.append(&mut other.0) + pub fn len(&self) -> usize { + self.dict_indexes.len() } - pub fn role(&self) -> Result, InvalidRole> { - self.try_into() + pub fn get(&self, index: usize) -> Option<&'a Variable> { + self.dict_indexes + .get(index) + .map(|dict_index| &*self.dictionary.variables[*dict_index]) } - pub fn iter(&self, include_at: bool) -> impl Iterator { - self.0.iter().filter_map(move |(name, values)| { - if include_at || !name.0.starts_with('@') { - Some((name, values.as_slice())) - } else { - None - } - }) + pub fn iter(&self) -> MappedVariablesIter<'a> { + MappedVariablesIter::new(self.dictionary, self.dict_indexes.iter()) } - pub fn has_any(&self, include_at: bool) -> bool { - self.iter(include_at).next().is_some() + pub fn dict_indexes(&self) -> &[DictIndex] { + self.dict_indexes } } -#[derive(Clone, Debug, ThisError, PartialEq, Eq)] -pub enum InvalidRole { - #[error("Unknown role {0:?}.")] - UnknownRole(String), +impl<'a> Index for MappedVariables<'a> { + type Output = Variable; - #[error("Role attribute $@Role must have exactly one value (not {0}).")] - InvalidValues(usize), + fn index(&self, index: usize) -> &Self::Output { + &*self.dictionary.variables[self.dict_indexes[index]] + } } -impl TryFrom<&Attributes> for Option { - type Error = InvalidRole; - - fn try_from(value: &Attributes) -> Result { - let role = Identifier::new("$@Role").unwrap(); - 
value.0.get(&role).map_or(Ok(None), |attribute| { - if let Ok([string]) = <&[String; 1]>::try_from(attribute.as_slice()) { - match string.parse::() { - Ok(integer) => Ok(Some(Role::try_from(integer)?)), - Err(_) => Err(InvalidRole::UnknownRole(string.clone())), - } - } else { - Err(InvalidRole::InvalidValues(attribute.len())) - } - }) +impl<'a> Serialize for MappedVariables<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut seq = serializer.serialize_seq(Some(self.len()))?; + for variable in self { + seq.serialize_element(&variable.name)?; + } + seq.end() } } -/// A variable, usually inside a [Dictionary]. -#[derive(Clone, Debug)] -pub struct Variable { - /// The variable's name. - /// - /// PSPP variable names are case-insensitive. - pub name: Identifier, - - /// Variable width. - pub width: VarWidth, - - /// User-missing values. - /// - /// Numeric variables also have a system-missing value (represented as - /// `None`). - /// - /// Both kinds of missing values are excluded from most analyses. - pub missing_values: MissingValues, - - /// Output format used in most contexts. - pub print_format: Format, - - /// Output format used on the `WRITE` command. - pub write_format: Format, - - /// Value labels, to associate a number (or a string) with a more meaningful - /// description, e.g. 1 -> Apple, 2 -> Banana, ... - pub value_labels: ValueLabels, - - /// Variable label, an optional meaningful description for the variable - /// itself. - pub label: Option, - - /// Measurement level for the variable's data. - pub measure: Option, - - /// Role in data analysis. - pub role: Role, - - /// Width of data column in GUI. - pub display_width: u32, - - /// Data alignment in GUI. - pub alignment: Alignment, - - /// Whether to retain values of the variable from one case to the next. 
- pub leave: bool, +impl<'a> IntoIterator for &MappedVariables<'a> { + type Item = &'a Variable; - /// For compatibility with old software that supported at most 8-character - /// variable names. - pub short_names: Vec, + type IntoIter = MappedVariablesIter<'a>; - /// Variable attributes. - pub attributes: Attributes, - - /// Encoding for [Value]s inside this variable. - /// - /// The variables in a [Dictionary] must all use the same encoding as the - /// dictionary. - pub encoding: &'static Encoding, -} - -pub fn escape_value_label(unescaped: &str) -> Cow<'_, str> { - if unescaped.contains("\n") { - unescaped.replace("\n", "\\n").into() - } else { - unescaped.into() + fn into_iter(self) -> Self::IntoIter { + self.iter() } } -pub fn unescape_value_label(escaped: &str) -> Cow<'_, str> { - if escaped.contains("\\n") { - escaped.replace("\\n", "\n").into() - } else { - escaped.into() - } +pub struct MappedVariablesIter<'a> { + dictionary: &'a Dictionary, + dict_indexes: std::slice::Iter<'a, DictIndex>, } -impl Variable { - pub fn new(name: Identifier, width: VarWidth, encoding: &'static Encoding) -> Self { - let var_type = VarType::from(width); - let leave = name.class().must_leave(); +impl<'a> MappedVariablesIter<'a> { + pub fn new(dictionary: &'a Dictionary, dict_indexes: std::slice::Iter<'a, DictIndex>) -> Self { Self { - name, - width, - missing_values: MissingValues::default(), - print_format: Format::default_for_width(width), - write_format: Format::default_for_width(width), - value_labels: ValueLabels::new(), - label: None, - measure: Measure::default_for_type(var_type), - role: Role::default(), - display_width: width.default_display_width(), - alignment: Alignment::default_for_type(var_type), - leave, - short_names: Vec::new(), - attributes: Attributes::new(), - encoding, + dictionary, + dict_indexes, } } +} - pub fn is_numeric(&self) -> bool { - self.width.is_numeric() - } - - pub fn is_string(&self) -> bool { - self.width.is_string() - } +impl<'a> Iterator 
for MappedVariablesIter<'a> { + type Item = &'a Variable; - pub fn label(&self) -> Option<&String> { - self.label.as_ref() + fn next(&mut self) -> Option { + self.dict_indexes + .next() + .map(|dict_index| &*self.dictionary.variables[*dict_index]) } +} - pub fn resize(&mut self, width: VarWidth) { - if self.missing_values.is_resizable(width) { - self.missing_values.resize(width); - } else { - self.missing_values = MissingValues::default(); - } +pub struct VectorsIter<'a> { + dictionary: &'a Dictionary, + iter: std::collections::hash_set::Iter<'a, ByIdentifier>, +} - if self.value_labels.is_resizable(width) { - self.value_labels.resize(width); - } else { - self.value_labels = ValueLabels::default(); +impl<'a> VectorsIter<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self { + dictionary, + iter: dictionary.vectors.iter(), } - - self.print_format.resize(width); - self.write_format.resize(width); - - self.width = width; } } +impl<'a> Iterator for VectorsIter<'a> { + type Item = Vector<'a>; -impl HasIdentifier for Variable { - fn identifier(&self) -> &UniCase { - &self.name.0 + fn next(&mut self) -> Option { + self.iter + .next() + .map(|vector| Vector::new_unchecked(self.dictionary, vector)) } } -#[derive(Clone, Debug)] -pub struct Vector { - pub name: Identifier, - pub variables: Vec, -} +#[derive(Debug)] +pub struct VectorsMut<'a>(&'a mut Dictionary); -impl Vector { - fn with_updated_dict_indexes( - mut self, - f: impl Fn(DictIndex) -> Option, - ) -> Option { - update_dict_index_vec(&mut self.variables, f); - (!self.variables.is_empty()).then_some(self) +impl<'a> VectorsMut<'a> { + fn new(dictionary: &'a mut Dictionary) -> Self { + Self(dictionary) } -} - -impl HasIdentifier for Vector { - fn identifier(&self) -> &UniCase { - &self.name.0 + pub fn as_vectors(&'a self) -> Vectors<'a> { + Vectors(self.0) + } + pub fn insert(&mut self, vector: DictIndexVector) -> Result<(), DictIndexError> { + Vector::new(self.0, &vector)?; + 
self.0.vectors.insert(ByIdentifier(vector)); + Ok(()) } } -/// Variables that represent multiple responses to a survey question. -#[derive(Clone, Debug)] -pub struct MultipleResponseSet { - /// The set's name. - pub name: Identifier, - - /// A description for the set. - pub label: String, +#[derive(Debug)] +pub struct Vectors<'a>(&'a Dictionary); - /// Range of widths among the variables. - pub width: RangeInclusive, +impl<'a> Vectors<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self(dictionary) + } + pub fn len(&self) -> usize { + self.0.vectors.len() + } + pub fn get(&self, name: &Identifier) -> Option> { + self.0 + .vectors + .get(&name.0) + .map(|vector| Vector::new_unchecked(self.0, &*vector)) + } + pub fn iter(&self) -> VectorsIter<'a> { + VectorsIter::new(self.0) + } +} - /// What kind of multiple response set this is. - pub mr_type: MultipleResponseType, +impl<'a> IntoIterator for &Vectors<'a> { + type Item = Vector<'a>; - /// The variables comprising the set. - pub variables: Vec, -} + type IntoIter = VectorsIter<'a>; -impl MultipleResponseSet { - fn with_updated_dict_indexes( - mut self, - f: impl Fn(DictIndex) -> Option, - ) -> Option { - update_dict_index_vec(&mut self.variables, f); - (self.variables.len() > 1).then_some(self) + fn into_iter(self) -> Self::IntoIter { + self.iter() } } -impl HasIdentifier for MultipleResponseSet { - fn identifier(&self) -> &UniCase { - &self.name.0 +impl<'a> Serialize for Vectors<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut map = serializer.serialize_map(Some(self.len()))?; + for vector in self { + map.serialize_key(vector.name())?; + map.serialize_value(&vector.variables())?; + } + map.end() } } -/// The type of a [MultipleResponseSet]. -#[derive(Clone, Debug)] -pub enum MultipleResponseType { - /// A "multiple dichotomy set", analogous to a survey question with a set of - /// checkboxes. 
Each variable in the set is treated in a Boolean fashion: - /// one value (the "counted value") means that the box was checked, and any - /// other value means that it was not. - MultipleDichotomy { - datum: Datum, - labels: CategoryLabels, - }, - - /// A "multiple category set", a survey question where the respondent is - /// instructed to list up to N choices. Each variable represents one of the - /// responses. - MultipleCategory, -} - -#[derive(Clone, Debug)] -pub struct VariableSet { - pub name: String, - pub variables: Vec, +#[derive(Copy, Clone, Debug)] +pub struct VariableSet<'a> { + dictionary: &'a Dictionary, + variable_set: &'a DictIndexVariableSet, } -impl VariableSet { - fn with_updated_dict_indexes( - mut self, - f: impl Fn(DictIndex) -> Option, - ) -> Option { - update_dict_index_vec(&mut self.variables, f); - (!self.variables.is_empty()).then_some(self) +impl<'a> PartialEq for VariableSet<'a> { + fn eq(&self, other: &Self) -> bool { + self.variable_set == other.variable_set } } -#[derive(Clone, Debug, Default)] -pub struct ValueLabels(pub HashMap); - -impl ValueLabels { - pub fn new() -> Self { - Self::default() +impl<'a> VariableSet<'a> { + pub fn name(&self) -> &'a String { + &self.variable_set.name } - - pub fn is_empty(&self) -> bool { - self.0.is_empty() + pub fn variables(&self) -> MappedVariables<'a> { + MappedVariables::new_unchecked(self.dictionary, &self.variable_set.variables) } +} - pub fn get(&self, datum: &Datum) -> Option<&str> { - self.0.get(datum).map(|s| s.as_str()) - } +#[derive(Debug)] +pub struct VariableSets<'a>(&'a Dictionary); - pub fn insert(&mut self, datum: Datum, label: String) -> Option { - self.0.insert(datum, label) +impl<'a> VariableSets<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self(dictionary) } - - pub fn is_resizable(&self, width: VarWidth) -> bool { - self.0.keys().all(|datum| datum.is_resizable(width)) + pub fn len(&self) -> usize { + self.0.variable_sets.len() } - - pub fn resize(&mut self, width: 
VarWidth) { - self.0 = self - .0 - .drain() - .map(|(mut datum, string)| { - datum.resize(width); - (datum, string) + pub fn get(&self, index: usize) -> Option> { + self.0 + .variable_sets + .get(index) + .map(|variable_set| VariableSet { + dictionary: self.0, + variable_set: &*variable_set, }) - .collect(); + } + pub fn iter(&self) -> VariableSetsIter<'a> { + VariableSetsIter::new(self.0) } } -#[derive(Clone, Default)] -pub struct MissingValues { - /// Individual missing values, up to 3 of them. - values: Vec, +impl<'a> IntoIterator for &VariableSets<'a> { + type Item = VariableSet<'a>; - /// Optional range of missing values. - range: Option, + type IntoIter = VariableSetsIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } } -impl Debug for MissingValues { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - DisplayMissingValues { - mv: self, - encoding: None, +impl<'a> Serialize for VariableSets<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut map = serializer.serialize_map(Some(self.len()))?; + for variable_set in self { + map.serialize_key(variable_set.name())?; + map.serialize_value(&variable_set.variables())?; } - .fmt(f) + map.end() } } -#[derive(Copy, Clone, Debug)] -pub enum MissingValuesError { - TooMany, - TooWide, - MixedTypes, +pub struct VariableSetsIter<'a> { + dictionary: &'a Dictionary, + iter: std::slice::Iter<'a, DictIndexVariableSet>, } -impl MissingValues { - pub fn new( - mut values: Vec, - range: Option, - ) -> Result { - if values.len() > 3 { - return Err(MissingValuesError::TooMany); +impl<'a> VariableSetsIter<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self { + dictionary, + iter: dictionary.variable_sets.iter(), } + } +} +impl<'a> Iterator for VariableSetsIter<'a> { + type Item = VariableSet<'a>; - let mut var_type = None; - for value in values.iter_mut() { - value.trim_end(); - match value.width() { - VarWidth::String(w) if w > 8 => return 
Err(MissingValuesError::TooWide), - _ => (), - } - if var_type.is_some_and(|t| t != value.var_type()) { - return Err(MissingValuesError::MixedTypes); - } - var_type = Some(value.var_type()); - } + fn next(&mut self) -> Option { + self.iter.next().map(|variable_set| VariableSet { + dictionary: self.dictionary, + variable_set, + }) + } +} - if var_type == Some(VarType::String) && range.is_some() { - return Err(MissingValuesError::MixedTypes); - } +#[derive(Debug)] +pub struct MultipleResponseSetsMut<'a>(&'a mut Dictionary); + +#[derive(ThisError, Clone, Debug)] +pub enum MrSetError { + #[error("{0}")] + DictIndexError(#[from] DictIndexError), + + /// Counted value {value} has width {width}, but it must be no wider than + /// {max_width}, the width of the narrowest variable in multiple response + /// set {mr_set}. + #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] + TooWideMDGroupCountedValue { + /// Multiple response set name. + mr_set: Identifier, + /// Counted value. + value: String, + /// Width of counted value. + width: usize, + /// Maximum allowed width of counted value. + max_width: u16, + }, + + /// Multiple response set {0} contains both string and numeric variables. + #[error("Multiple response set {0} contains both string and numeric variables.")] + MixedMrSet( + /// Multiple response set name. 
+ Identifier, + ), +} - Ok(Self { values, range }) +impl<'a> MultipleResponseSetsMut<'a> { + fn new(dictionary: &'a mut Dictionary) -> Self { + Self(dictionary) } - pub fn is_empty(&self) -> bool { - self.values.is_empty() && self.range.is_none() + pub fn mrsets(&'a self) -> MultipleResponseSets<'a> { + MultipleResponseSets::new(self.0) } - pub fn var_type(&self) -> Option { - if let Some(datum) = self.values.first() { - Some(datum.var_type()) - } else if self.range.is_some() { - Some(VarType::Numeric) - } else { - None - } + pub fn insert(&mut self, mrset: DictIndexMultipleResponseSet) -> Result<(), MrSetError> { + MultipleResponseSet::new(self.0, &mrset)?; + self.0.mrsets.insert(ByIdentifier(mrset)); + Ok(()) } +} - pub fn contains(&self, value: &Datum) -> bool { - if self - .values - .iter() - .any(|datum| datum.eq_ignore_trailing_spaces(value)) - { - return true; - } +#[derive(Clone, Debug)] +pub struct MultipleResponseSets<'a>(&'a Dictionary); - match value { - Datum::Number(Some(number)) => self.range.is_some_and(|range| range.contains(*number)), - _ => false, - } +impl<'a> MultipleResponseSets<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self(dictionary) } - pub fn is_resizable(&self, width: VarWidth) -> bool { - self.values.iter().all(|datum| datum.is_resizable(width)) - && self.range.iter().all(|range| range.is_resizable(width)) + pub fn len(&self) -> usize { + self.0.mrsets.len() } - pub fn resize(&mut self, width: VarWidth) { - for datum in &mut self.values { - datum.resize(width); - } - if let Some(range) = &mut self.range { - range.resize(width); - } + pub fn get(&self, name: &Identifier) -> Option> { + self.0 + .mrsets + .get(&name.0) + .map(|mrset| MultipleResponseSet::new_unchecked(self.0, mrset)) } - pub fn display(&self, encoding: &'static Encoding) -> DisplayMissingValues<'_> { - DisplayMissingValues { - mv: self, - encoding: Some(encoding), - } + pub fn iter(&self) -> MultipleResponseSetIter<'a> { + MultipleResponseSetIter::new(self.0) 
} } -pub struct DisplayMissingValues<'a> { - mv: &'a MissingValues, - encoding: Option<&'static Encoding>, -} +impl<'a> IntoIterator for &MultipleResponseSets<'a> { + type Item = MultipleResponseSet<'a>; -impl<'a> Display for DisplayMissingValues<'a> { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - if let Some(range) = &self.mv.range { - write!(f, "{range}")?; - if !self.mv.values.is_empty() { - write!(f, "; ")?; - } - } + type IntoIter = MultipleResponseSetIter<'a>; - for (i, value) in self.mv.values.iter().enumerate() { - if i > 0 { - write!(f, "; ")?; - } - match self.encoding { - Some(encoding) => value.display_plain(encoding).fmt(f)?, - None => value.fmt(f)?, - } - } + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} - if self.mv.is_empty() { - write!(f, "none")?; +pub struct MultipleResponseSetIter<'a> { + dictionary: &'a Dictionary, + iter: btree_set::Iter<'a, ByIdentifier>, +} + +impl<'a> MultipleResponseSetIter<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self { + dictionary, + iter: dictionary.mrsets.iter(), } - Ok(()) } } -#[derive(Copy, Clone)] -pub enum MissingValueRange { - In { low: f64, high: f64 }, - From { low: f64 }, - To { high: f64 }, +impl<'a> Iterator for MultipleResponseSetIter<'a> { + type Item = MultipleResponseSet<'a>; + + fn next(&mut self) -> Option { + self.iter + .next() + .map(|set| MultipleResponseSet::new_unchecked(self.dictionary, set)) + } } -impl MissingValueRange { - pub fn new(low: f64, high: f64) -> Self { - const LOWEST: f64 = f64::MIN.next_up(); - match (low, high) { - (f64::MIN | LOWEST, _) => Self::To { high }, - (_, f64::MAX) => Self::From { low }, - (_, _) => Self::In { low, high }, +impl<'a> Serialize for MultipleResponseSets<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut seq = serializer.serialize_seq(Some(self.len()))?; + for set in self { + seq.serialize_element(&set)?; } + seq.end() } +} - pub fn low(&self) -> Option { - match self { 
- MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low), - MissingValueRange::To { .. } => None, +/// Variables that represent multiple responses to a survey question. +#[derive(Clone, Debug)] +pub struct MultipleResponseSet<'a> { + dictionary: &'a Dictionary, + mrset: &'a DictIndexMultipleResponseSet, +} + +impl<'a> MultipleResponseSet<'a> { + fn new_unchecked(dictionary: &'a Dictionary, mrset: &'a DictIndexMultipleResponseSet) -> Self { + Self { dictionary, mrset } + } + + fn new( + dictionary: &'a Dictionary, + mrset: &'a DictIndexMultipleResponseSet, + ) -> Result { + let variables = MappedVariables::new(dictionary, &mrset.variables)?; + let (min_width, _max_width) = Self::widths(&variables) + .ok_or_else(|| MrSetError::MixedMrSet(mrset.name.clone()))? + .into_inner(); + + if let MultipleResponseType::MultipleDichotomy { datum, labels: _ } = &mrset.mr_type { + match (datum, min_width) { + (Datum::Number(_), VarWidth::Numeric) => (), + (Datum::String(s), VarWidth::String(min_width)) => { + if s.without_trailing_spaces().len() > min_width as usize {} + } + _ => return Err(MrSetError::MixedMrSet(mrset.name.clone())), + } } + Ok(Self::new_unchecked(dictionary, mrset)) } - pub fn high(&self) -> Option { - match self { - MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high), - MissingValueRange::From { .. 
} => None, - } + fn widths(variables: &MappedVariables<'_>) -> Option> { + variables + .iter() + .map(|v| Some((v.width, v.width))) + .reduce(|a, b| { + let (na, wa) = a?; + let (nb, wb) = b?; + Some((VarWidth::narrower(na, nb)?, VarWidth::wider(wa, wb)?)) + }) + .flatten() + .map(|(min_width, max_width)| min_width..=max_width) } - pub fn contains(&self, number: f64) -> bool { - match self { - MissingValueRange::In { low, high } => (*low..*high).contains(&number), - MissingValueRange::From { low } => number >= *low, - MissingValueRange::To { high } => number <= *high, - } + pub fn name(&self) -> &Identifier { + &self.mrset.name } - pub fn is_resizable(&self, width: VarWidth) -> bool { - width.is_numeric() + pub fn label(&self) -> &String { + &self.mrset.label } - pub fn resize(&self, width: VarWidth) { - assert_eq!(width, VarWidth::Numeric); + pub fn width(&self) -> RangeInclusive { + Self::widths(&self.variables()).unwrap() } -} -impl Display for MissingValueRange { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self.low() { - Some(low) => low.display_plain().fmt(f)?, - None => write!(f, "LOW")?, - } + pub fn mr_type(&self) -> &MultipleResponseType { + &self.mrset.mr_type + } - write!(f, " THRU ")?; + pub fn variables(&self) -> MappedVariables<'a> { + MappedVariables::new_unchecked(self.dictionary, &self.mrset.variables) + } +} - match self.high() { - Some(high) => high.display_plain().fmt(f)?, - None => write!(f, "HIGH")?, - } - Ok(()) +impl<'a> Serialize for MultipleResponseSet<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut map = serializer.serialize_map(Some(5))?; + map.serialize_entry("name", self.name())?; + map.serialize_entry("label", self.label())?; + map.serialize_entry("width", &self.width())?; + map.serialize_entry("type", self.mr_type())?; + map.serialize_entry("variables", &self.variables())?; + map.end() } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub 
enum Alignment { - Left, - Right, - Center, +/// Variables that represent multiple responses to a survey question. +#[derive(Clone, Debug, Serialize)] +pub struct DictIndexMultipleResponseSet { + /// The set's name. + pub name: Identifier, + + /// A description for the set. + pub label: String, + + /// What kind of multiple response set this is. + pub mr_type: MultipleResponseType, + + /// The variables comprising the set. + pub variables: Vec, } -impl Alignment { - pub fn default_for_type(var_type: VarType) -> Self { - match var_type { - VarType::Numeric => Self::Right, - VarType::String => Self::Left, - } +impl DictIndexMultipleResponseSet { + fn with_updated_dict_indexes( + mut self, + f: impl Fn(DictIndex) -> Option, + ) -> Option { + update_dict_index_vec(&mut self.variables, f); + (self.variables.len() > 1).then_some(self) } - pub fn as_str(&self) -> &'static str { - match self { - Alignment::Left => "Left", - Alignment::Right => "Right", - Alignment::Center => "Center", - } + pub fn codepage_to_unicode(&mut self) { + self.name.codepage_to_unicode(); } } -/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement). -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum Measure { - /// Nominal values can only be compared for equality. - Nominal, +impl HasIdentifier for DictIndexMultipleResponseSet { + fn identifier(&self) -> &UniCase { + &self.name.0 + } +} - /// Ordinal values can be meaningfully ordered. - Ordinal, +/// The type of a [MultipleResponseSet]. +#[derive(Clone, Debug, Serialize)] +pub enum MultipleResponseType { + /// A "multiple dichotomy set", analogous to a survey question with a set of + /// checkboxes. Each variable in the set is treated in a Boolean fashion: + /// one value (the "counted value") means that the box was checked, and any + /// other value means that it was not. 
+ MultipleDichotomy { + datum: Datum, + labels: CategoryLabels, + }, - /// Scale values can be meaningfully compared for the degree of difference. - Scale, + /// A "multiple category set", a survey question where the respondent is + /// instructed to list up to N choices. Each variable represents one of the + /// responses. + MultipleCategory, } -impl Measure { - pub fn default_for_type(var_type: VarType) -> Option { - match var_type { - VarType::Numeric => None, - VarType::String => Some(Self::Nominal), +impl MultipleResponseType { + pub fn supported_before_v14(&self) -> bool { + match self { + MultipleResponseType::MultipleDichotomy { + labels: CategoryLabels::CountedValues { .. }, + datum: _, + } => false, + _ => true, } } - pub fn as_str(&self) -> &'static str { + pub fn label_from_var_label(&self) -> bool { match self { - Measure::Nominal => "Nominal", - Measure::Ordinal => "Ordinal", - Measure::Scale => "Scale", + MultipleResponseType::MultipleDichotomy { + labels: + CategoryLabels::CountedValues { + use_var_label_as_mrset_label: true, + }, + .. 
+ } => true, + _ => false, } } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] pub enum CategoryLabels { VarLabels, - CountedValues, + CountedValues { use_var_label_as_mrset_label: bool }, } -#[cfg(test)] -mod test { - use std::collections::HashSet; - - use unicase::UniCase; - - use crate::identifier::Identifier; - - use super::{ByIdentifier, HasIdentifier}; +#[derive(Clone, Debug, PartialEq)] +pub struct DictIndexVariableSet { + pub name: String, + pub variables: Vec, +} - #[derive(PartialEq, Eq, Debug, Clone)] - struct Variable { - name: Identifier, - value: i32, +impl DictIndexVariableSet { + fn with_updated_dict_indexes( + mut self, + f: impl Fn(DictIndex) -> Option, + ) -> Option { + update_dict_index_vec(&mut self.variables, f); + (!self.variables.is_empty()).then_some(self) } +} + +#[cfg(test)] +mod tests { + use encoding_rs::{UTF_8, WINDOWS_1252}; + use smallvec::SmallVec; + + use crate::{ + data::Datum, + dictionary::{ + CategoryLabels, DictIndexMultipleResponseSet, DictIndexVector, Dictionary, + MultipleResponseType, + }, + identifier::Identifier, + variable::{VarWidth, Variable}, + }; - impl HasIdentifier for Variable { - fn identifier(&self) -> &UniCase { - &self.name.0 + #[test] + fn short_names() { + for (variables, expected, encoding) in [ + ( + [("VariableName1", 1), ("VARIABLE", 1), ("VariableName2", 1)], + vec![vec!["Variab_A"], vec!["VARIABLE"], vec!["Variab_B"]], + UTF_8, + ), + ( + [ + ("LongVarNameA", 1), + ("LongVarNameB", 1), + ("LongVarNameC", 1), + ], + vec![vec!["LongVarN"], vec!["LongVa_A"], vec!["LongVa_B"]], + UTF_8, + ), + ( + [ + ("LongVarNameA", 300), + ("LongVarNameB", 1), + ("LongVarNameC", 1), + ], + vec![ + vec!["LongVarN", "LongVa_C"], + vec!["LongVa_A"], + vec!["LongVa_B"], + ], + UTF_8, + ), + ( + [ + // The accented letters are 2 bytes and the katakana is 3 + // bytes in UTF-8. 
+ ("éèäスîVarNameA", 300), + ("éèäスVarNameB", 1), + ("éèäîVarNameC", 1), + ], + vec![vec!["éèä", "éèä_B"], vec!["éèä_A"], vec!["éèäî"]], + UTF_8, + ), + ( + [ + // This version uses `e` with modifying acute accent in the + // first name. + ("e\u{301}èäスîVarNameA", 300), + ("éèäスVarNameB", 1), + ("éèäîVarNameC", 1), + ], + vec![vec!["e\u{301}èä", "e\u{301}è_A"], vec!["éèä"], vec!["éèäî"]], + UTF_8, + ), + ( + [ + // The accented letters are only 1 byte in windows-1252. + ("éèäîVarNameA", 300), + ("éèäîVarNameB", 1), + ("éèäîVarNameC", 1), + ], + vec![ + vec!["éèäîVarN", "éèäîVa_C"], + vec!["éèäîVa_A"], + vec!["éèäîVa_B"], + ], + WINDOWS_1252, + ), + ] { + let mut dict = Dictionary::new(encoding); + for (name, width) in variables { + dict.add_var(Variable::new( + Identifier::new(name).unwrap(), + VarWidth::String(width), + encoding, + )) + .unwrap(); + } + let expected = expected + .into_iter() + .map(|names| { + names + .into_iter() + .map(|name| Identifier::new(name).unwrap()) + .collect::>() + }) + .collect::>(); + assert_eq!(expected, dict.short_names()); } } #[test] - fn test() { - // Variables should not be the same if their values differ. - let abcd = Identifier::new("abcd").unwrap(); - let abcd1 = Variable { - name: abcd.clone(), - value: 1, - }; - let abcd2 = Variable { - name: abcd, - value: 2, - }; - assert_ne!(abcd1, abcd2); - - // But `ByName` should treat them the same. 
- let abcd1_by_name = ByIdentifier::new(abcd1); - let abcd2_by_name = ByIdentifier::new(abcd2); - assert_eq!(abcd1_by_name, abcd2_by_name); + fn codepage_to_unicode() { + let mut dictionary = Dictionary::new(WINDOWS_1252); + + dictionary + .add_var(Variable::new( + Identifier::new("ééééééééééééééééééééééééééééééééa").unwrap(), + VarWidth::Numeric, + WINDOWS_1252, + )) + .unwrap(); + dictionary + .add_var(Variable::new( + Identifier::new("ééééééééééééééééééééééééééééééééb").unwrap(), + VarWidth::Numeric, + WINDOWS_1252, + )) + .unwrap(); + + dictionary + .vectors_mut() + .insert(DictIndexVector { + name: Identifier::new("àààààààààààààààààààààààààààààààà").unwrap(), + variables: vec![0, 1], + }) + .unwrap(); + dictionary + .vectors_mut() + .insert(DictIndexVector { + name: Identifier::new("ààààààààààààààààààààààààààààààààx").unwrap(), + variables: vec![1, 0], + }) + .unwrap(); + + dictionary + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { + name: Identifier::new("üüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüasdf").unwrap(), + label: String::from("my mcgroup"), + mr_type: MultipleResponseType::MultipleCategory, + variables: vec![0, 1], + }) + .unwrap(); + dictionary + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { + name: Identifier::new("üüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüquux").unwrap(), + label: String::new(), + mr_type: MultipleResponseType::MultipleDichotomy { + datum: Datum::Number(Some(55.0)), + labels: CategoryLabels::VarLabels, + }, + variables: vec![0, 1], + }) + .unwrap(); - // And a `HashSet` of `ByName` should also treat them the same. 
- let mut vars: HashSet> = HashSet::new(); - assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone()))); - assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone()))); + dictionary.codepage_to_unicode(); + dbg!(&dictionary); assert_eq!( - vars.get(&UniCase::new(String::from("abcd"))) + &dictionary.variables[0].name, + "éééééééééééééééééééééééééééééééé" + ); + assert_eq!(&dictionary.variables[1].name, "Var1"); + assert_eq!( + dictionary + .vectors() + .get(&Identifier::new("àààààààààààààààààààààààààààààààà").unwrap()) + .unwrap() + .variables() + .dict_indexes(), + &[0, 1] + ); + assert_eq!( + dictionary + .vectors() + .get(&Identifier::new("Vec1").unwrap()) .unwrap() - .0 - .value, - 1 + .variables() + .dict_indexes(), + &[1, 0] ); + assert!(matches!( + dictionary + .mrsets() + .get(&Identifier::new("üüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüü").unwrap()) + .unwrap() + .mr_type(), + MultipleResponseType::MultipleCategory + )); + assert!(matches!( + dictionary + .mrsets() + .get(&Identifier::new("MrSet1").unwrap()) + .unwrap() + .mr_type(), + MultipleResponseType::MultipleDichotomy { .. } + )); } } diff --git a/rust/pspp/src/endian.rs b/rust/pspp/src/endian.rs index 07cfe886c8..ebb2694dc2 100644 --- a/rust/pspp/src/endian.rs +++ b/rust/pspp/src/endian.rs @@ -14,28 +14,11 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . -use smallvec::SmallVec; +//! Converting big- and little-endian `[u8]` arrays to and from primitive types. -pub use binrw::Endian; - -pub fn endian_to_smallvec( - endian: Endian, - mut value: u64, - n: usize, -) -> SmallVec<[u8; N]> { - debug_assert!(n <= 8); - let mut vec = SmallVec::new(); - value <<= 8 * (8 - n); - for _ in 0..n { - vec.push((value >> 56) as u8); - value <<= 8; - } - if endian == Endian::Little { - vec.reverse(); - } - vec -} +use binrw::Endian; +/// Converts a primitive type into a big- or little-endian `[u8]` array. 
pub trait ToBytes { fn to_bytes(self, value: T) -> [u8; N]; } @@ -93,13 +76,12 @@ impl ToBytes for Endian { } } -/// Parses an `N`-byte array in one of the supported formats into native format -/// as type `T`. -pub trait Parse { +/// Parses a `[u8]` array as a big- or little-endian primitive type. +pub trait FromBytes { /// Given 'bytes', returns `T`. fn parse(self, bytes: [u8; N]) -> T; } -impl Parse for Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 8]) -> u64 { match self { Endian::Big => u64::from_be_bytes(bytes), @@ -107,7 +89,7 @@ impl Parse for Endian { } } } -impl Parse for Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 4]) -> u32 { match self { Endian::Big => u32::from_be_bytes(bytes), @@ -115,7 +97,7 @@ impl Parse for Endian { } } } -impl Parse for Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 2]) -> u16 { match self { Endian::Big => u16::from_be_bytes(bytes), @@ -123,7 +105,7 @@ impl Parse for Endian { } } } -impl Parse for Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 1]) -> u8 { match self { Endian::Big => u8::from_be_bytes(bytes), @@ -131,7 +113,7 @@ impl Parse for Endian { } } } -impl Parse for Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 8]) -> i64 { match self { Endian::Big => i64::from_be_bytes(bytes), @@ -139,7 +121,7 @@ impl Parse for Endian { } } } -impl Parse for Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 4]) -> i32 { match self { Endian::Big => i32::from_be_bytes(bytes), @@ -147,7 +129,7 @@ impl Parse for Endian { } } } -impl Parse for Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 2]) -> i16 { match self { Endian::Big => i16::from_be_bytes(bytes), @@ -155,7 +137,7 @@ impl Parse for Endian { } } } -impl Parse for Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 1]) -> i8 { match self { Endian::Big => i8::from_be_bytes(bytes), @@ -163,7 +145,7 @@ impl Parse for Endian { } } } -impl Parse for 
Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 8]) -> f64 { match self { Endian::Big => f64::from_be_bytes(bytes), @@ -171,13 +153,13 @@ impl Parse for Endian { } } } -impl Parse, 8> for Endian { +impl FromBytes, 8> for Endian { fn parse(self, bytes: [u8; 8]) -> Option { let number: f64 = self.parse(bytes); (number != -f64::MAX).then_some(number) } } -impl Parse for Endian { +impl FromBytes for Endian { fn parse(self, bytes: [u8; 4]) -> f32 { match self { Endian::Big => f32::from_be_bytes(bytes), @@ -185,7 +167,7 @@ impl Parse for Endian { } } } -impl Parse, 4> for Endian { +impl FromBytes, 4> for Endian { fn parse(self, bytes: [u8; 4]) -> Option { let number: f32 = self.parse(bytes); (number != -f32::MAX).then_some(number) diff --git a/rust/pspp/src/format/display/mod.rs b/rust/pspp/src/format/display/mod.rs index 755b6bdfc0..0347314a76 100644 --- a/rust/pspp/src/format/display/mod.rs +++ b/rust/pspp/src/format/display/mod.rs @@ -21,6 +21,7 @@ use std::{ str::from_utf8_unchecked, }; +use binrw::Endian; use chrono::{Datelike, NaiveDate}; use encoding_rs::{Encoding, UTF_8}; use libm::frexp; @@ -29,18 +30,17 @@ use smallvec::{Array, SmallVec}; use crate::{ calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name}, - data::Datum, - endian::{endian_to_smallvec, ToBytes}, + data::{ByteStr, Datum, EncodedString, QuotedDatum, WithEncoding}, + endian::ToBytes, format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, }; -pub struct DisplayDatum<'a, 'b> { +pub struct DisplayDatum<'b, B> { format: Format, settings: &'b Settings, endian: EndianSettings, - datum: &'a Datum, - encoding: &'static Encoding, + datum: Datum, /// If true, the output will remove leading and trailing spaces from numeric /// values, and trailing spaces from string values. 
(This might make the @@ -83,68 +83,38 @@ impl Display for DisplayPlainF64 { } } -impl Datum { +impl<'a, D> Datum +where + D: EncodedString, +{ /// Returns an object that implements [Display] for printing this [Datum] as - /// `format`. `encoding` specifies this `Datum`'s encoding (therefore, it - /// is used only if this is a `Datum::String`). + /// `format`. /// /// [Display]: std::fmt::Display - pub fn display(&self, format: Format, encoding: &'static Encoding) -> DisplayDatum { - DisplayDatum::new(format, self, encoding) - } - - pub fn display_plain(&self, encoding: &'static Encoding) -> DisplayDatumPlain { - DisplayDatumPlain { - datum: self, - encoding, - quote_strings: true, - } + pub fn display(&'a self, format: Format) -> DisplayDatum<'a, WithEncoding>> { + DisplayDatum::new(format, self.as_borrowed()) } -} - -pub struct DisplayDatumPlain<'a> { - datum: &'a Datum, - encoding: &'static Encoding, - quote_strings: bool, -} -impl DisplayDatumPlain<'_> { - pub fn without_quotes(self) -> Self { - Self { - quote_strings: false, - ..self - } + pub fn display_plain(&self) -> QuotedDatum<'_, D> { + self.quoted() } } -impl Display for DisplayDatumPlain<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self.datum { - Datum::Number(None) => write!(f, "SYSMIS"), - Datum::Number(Some(number)) => number.display_plain().fmt(f), - Datum::String(string) => { - if self.quote_strings { - write!(f, "\"{}\"", string.display(self.encoding)) - } else { - string.display(self.encoding).fmt(f) - } - } - } - } -} - -impl Display for DisplayDatum<'_, '_> { +impl<'a, 'b, B> Display for DisplayDatum<'b, B> +where + B: EncodedString, +{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let number = match self.datum { + let number = match &self.datum { Datum::Number(number) => *number, Datum::String(string) => { if self.format.type_() == Type::AHex { - for byte in &string.0 { + for byte in string.raw_string_bytes() { write!(f, "{byte:02x}")?; } } else { let quote = if 
self.quote_strings { "\"" } else { "" }; - let s = self.encoding.decode_without_bom_handling(&string.0).0; + let s = string.as_str(); let s = if self.trim_spaces { s.trim_end_matches(' ') } else { @@ -195,13 +165,15 @@ impl Display for DisplayDatum<'_, '_> { } } -impl<'a, 'b> DisplayDatum<'a, 'b> { - pub fn new(format: Format, value: &'a Datum, encoding: &'static Encoding) -> Self { +impl<'b, B> DisplayDatum<'b, B> +where + B: EncodedString, +{ + pub fn new(format: Format, datum: Datum) -> Self { let settings = PsppSettings::global(); Self { format, - datum: value, - encoding, + datum, settings: &settings.formats, endian: settings.endian, trim_spaces: false, @@ -1176,3 +1148,21 @@ where } } } + +pub fn endian_to_smallvec( + endian: Endian, + mut value: u64, + n: usize, +) -> SmallVec<[u8; N]> { + debug_assert!(n <= 8); + let mut vec = SmallVec::new(); + value <<= 8 * (8 - n); + for _ in 0..n { + vec.push((value >> 56) as u8); + value <<= 8; + } + if endian == Endian::Little { + vec.reverse(); + } + vec +} diff --git a/rust/pspp/src/format/display/test.rs b/rust/pspp/src/format/display/test.rs index d9b4dd5f64..9ddd3047f2 100644 --- a/rust/pspp/src/format/display/test.rs +++ b/rust/pspp/src/format/display/test.rs @@ -16,24 +16,21 @@ use std::{fmt::Write, fs::File, io::BufRead, path::Path}; -use binrw::io::BufReader; +use binrw::{io::BufReader, Endian}; use encoding_rs::UTF_8; use itertools::Itertools; use smallstr::SmallString; use smallvec::SmallVec; use crate::{ - data::Datum, - endian::Endian, + data::{ByteString, Datum, WithEncoding}, format::{AbstractFormat, Epoch, Format, Settings, Type, UncheckedFormat, CC}, lex::{scan::StringScanner, segment::Syntax, Punct, Token}, settings::EndianSettings, }; fn test(name: &str) { - let filename = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("src/format/testdata/display") - .join(name); + let filename = Path::new("src/format/testdata/display").join(name); let input = BufReader::new(File::open(&filename).unwrap()); let 
settings = Settings::default() .with_cc(CC::A, ",,,".parse().unwrap()) @@ -75,8 +72,8 @@ fn test(name: &str) { let format: Format = format.try_into().unwrap(); assert_eq!(tokens.get(1), Some(&Token::Punct(Punct::Colon))); let expected = tokens[2].as_string().unwrap(); - let actual = Datum::Number(value) - .display(format, UTF_8) + let actual = Datum::>::Number(value) + .display(format) .with_settings(&settings) .with_endian(endian) .to_string(); @@ -183,11 +180,11 @@ fn leading_zeros() { } fn test_with_settings(value: f64, expected: [&str; 2], settings: &Settings) { - let value = Datum::from(value); + let value = Datum::>::from(value); for (expected, d) in expected.into_iter().zip([2, 1].into_iter()) { assert_eq!( &value - .display(Format::new(Type::F, 5, d).unwrap(), UTF_8) + .display(Format::new(Type::F, 5, d).unwrap()) .with_settings(settings) .to_string(), expected @@ -214,8 +211,8 @@ fn leading_zeros() { fn non_ascii_cc() { fn test(settings: &Settings, value: f64, expected: &str) { assert_eq!( - &Datum::from(value) - .display(Format::new(Type::CC(CC::A), 10, 2).unwrap(), UTF_8) + &Datum::>::from(value) + .display(Format::new(Type::CC(CC::A), 10, 2).unwrap()) .with_settings(settings) .to_string(), expected @@ -233,9 +230,7 @@ fn non_ascii_cc() { } fn test_binhex(name: &str) { - let filename = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("src/format/testdata/display") - .join(name); + let filename = Path::new("src/format/testdata/display").join(name); let input = BufReader::new(File::open(&filename).unwrap()); let mut value = None; let mut value_name = String::new(); @@ -266,8 +261,8 @@ fn test_binhex(name: &str) { assert_eq!(tokens.get(1), Some(&Token::Punct(Punct::Colon))); let expected = tokens[2].as_string().unwrap(); let mut actual = SmallVec::<[u8; 16]>::new(); - Datum::Number(value) - .display(format, UTF_8) + Datum::>::Number(value) + .display(format) .with_endian(endian) .write(&mut actual, UTF_8) .unwrap(); @@ -339,11 +334,8 @@ fn test_dates(format: 
Format, expect: &[&str]) { ]; assert_eq!(expect.len(), INPUTS.len()); for (input, expect) in INPUTS.iter().copied().zip_eq(expect.iter().copied()) { - let value = parser.parse(input).unwrap(); - let formatted = value - .display(format, UTF_8) - .with_settings(&settings) - .to_string(); + let value = parser.parse(input).unwrap().with_encoding(UTF_8); + let formatted = value.display(format).with_settings(&settings).to_string(); assert_eq!(&formatted, expect); } } @@ -1281,7 +1273,7 @@ fn ymdhms25_5() { } fn test_times(format: Format, name: &str) { - let directory = Path::new(env!("CARGO_MANIFEST_DIR")).join("src/format/testdata/display"); + let directory = Path::new("src/format/testdata/display"); let input_filename = directory.join("time-input.txt"); let input = BufReader::new(File::open(&input_filename).unwrap()); @@ -1295,8 +1287,12 @@ fn test_times(format: Format, name: &str) { .zip_eq(output.lines().map(|r| r.unwrap())) .zip(1..) { - let value = parser.parse(&input).unwrap(); - let formatted = value.display(format, UTF_8).to_string(); + let formatted = parser + .parse(input) + .unwrap() + .with_encoding(UTF_8) + .display(format) + .to_string(); assert!( formatted == expect, "formatting {}:{line_number} as {format}:\n actual: {formatted:?}\nexpected: {expect:?}", diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index d46db2d8ee..a9e9e7b78b 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -24,14 +24,15 @@ use std::{ use chrono::{Datelike, Local}; use enum_iterator::{all, Sequence}; use enum_map::{Enum, EnumMap}; +use serde::{Deserialize, Serialize}; +use smallstr::SmallString; use thiserror::Error as ThisError; use unicode_width::UnicodeWidthStr; use crate::{ - data::RawString, - data::Datum, - dictionary::{VarType, VarWidth}, + data::{ByteString, Datum}, sys::raw, + variable::{VarType, VarWidth}, }; mod display; @@ -124,7 +125,7 @@ impl From for Category { } } -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, 
Hash, Sequence)] +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash, Sequence, Serialize)] pub enum CC { A, B, @@ -151,7 +152,7 @@ impl Display for CC { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Sequence)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Sequence, Serialize)] pub enum Type { // Basic numeric formats. F, @@ -392,10 +393,10 @@ impl Type { } } - pub fn default_value(&self) -> Datum { + pub fn default_value(&self) -> Datum { match self.var_type() { VarType::Numeric => Datum::sysmis(), - VarType::String => Datum::String(RawString::default()), + VarType::String => Datum::String(ByteString::default()), } } } @@ -489,6 +490,17 @@ pub struct Format { d: Decimals, } +impl Serialize for Format { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = SmallString::<[u8; 16]>::new(); + write!(&mut s, "{}", self).unwrap(); + s.serialize(serializer) + } +} + impl Format { pub const F40: Format = Format { type_: Type::F, @@ -620,10 +632,10 @@ impl Format { Ok(self) } - pub fn default_value(&self) -> Datum { + pub fn default_value(&self) -> Datum { match self.var_width() { VarWidth::Numeric => Datum::sysmis(), - VarWidth::String(width) => Datum::String(RawString::spaces(width as usize)), + VarWidth::String(width) => Datum::String(ByteString::spaces(width as usize)), } } @@ -640,6 +652,18 @@ impl Format { _ => *self = Self::default_for_width(width), } } + + pub fn codepage_to_unicode(&mut self) { + let mut width = self.var_width(); + width.codepage_to_unicode(); + if let Some(width) = width.as_string_width() { + if self.type_ == Type::AHex { + self.w = width as u16 * 2; + } else { + self.w = width as u16; + } + } + } } impl Debug for Format { @@ -829,7 +853,8 @@ impl Display for UncheckedFormat { } } -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Enum)] +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Enum, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] pub enum Decimal { 
#[default] Dot, @@ -883,7 +908,7 @@ impl Not for Decimal { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)] pub struct Epoch(pub i32); impl Epoch { @@ -922,7 +947,7 @@ impl Display for Epoch { } } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Serialize)] pub struct Settings { pub epoch: Epoch, @@ -1048,7 +1073,7 @@ impl Settings { /// A numeric output style. This can express numeric formats in /// [Category::Basic] and [Category::Custom]. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct NumberStyle { pub neg_prefix: Affix, pub prefix: Affix, @@ -1071,6 +1096,7 @@ pub struct NumberStyle { /// can be used to size memory allocations: for example, the formatted /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in /// UTF-8. + #[serde(skip)] pub extra_bytes: usize, } @@ -1125,11 +1151,12 @@ impl NumberStyle { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct Affix { /// String contents of affix. 
pub s: String, + #[serde(skip)] /// Display width in columns (see [unicode_width]) pub width: usize, } @@ -1334,3 +1361,32 @@ impl Iterator for DateTemplate { Some(TemplateItem { c, n }) } } + +#[cfg(test)] +mod tests { + use crate::format::{Format, Type, Width}; + + #[test] + fn codepage_to_unicode() { + fn check_format(input: Format, expected_width: Width) { + let mut output = input; + output.codepage_to_unicode(); + let expected = Format::new(input.type_, expected_width, input.d).unwrap(); + assert_eq!(output, expected); + } + check_format(Format::new(Type::A, 1, 0).unwrap(), 3); + check_format(Format::new(Type::A, 2, 0).unwrap(), 6); + check_format(Format::new(Type::A, 3, 0).unwrap(), 9); + check_format(Format::new(Type::A, 1000, 0).unwrap(), 3000); + check_format(Format::new(Type::A, 20000, 0).unwrap(), 32767); + + check_format(Format::new(Type::AHex, 2, 0).unwrap(), 6); + check_format(Format::new(Type::AHex, 4, 0).unwrap(), 12); + check_format(Format::new(Type::AHex, 6, 0).unwrap(), 18); + check_format(Format::new(Type::AHex, 2000, 0).unwrap(), 6000); + check_format(Format::new(Type::AHex, 20000, 0).unwrap(), 60000); + check_format(Format::new(Type::AHex, 30000, 0).unwrap(), 65534); + + check_format(Format::new(Type::F, 40, 0).unwrap(), 40); + } +} diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index d2f6b22bc7..3e5d257a6e 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -16,11 +16,12 @@ use crate::{ calendar::{calendar_gregorian_to_offset, DateError}, - data::{Datum, EncodedStr, EncodedString}, - endian::{Endian, Parse}, + data::{ByteString, Datum, EncodedString, OwnedDatum, RawString, WithEncoding}, + endian::FromBytes, format::{DateTemplate, Decimals, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, }; +use binrw::Endian; use encoding_rs::Encoding; use smallstr::SmallString; use std::{ @@ -32,7 +33,7 @@ use thiserror::Error as ThisError; #[derive(Clone, Debug)] 
pub struct ParseError { type_: Type, - input: EncodedString, + input: WithEncoding, kind: ParseErrorKind, } @@ -43,7 +44,7 @@ impl Display for ParseError { write!( f, "{} cannot be parsed as {}: {}", - self.input.borrowed().quoted(), + self.input.quoted(), &self.type_, &self.kind ) @@ -190,13 +191,12 @@ impl<'a> ParseValue<'a> { /// input into UTF-8, but this will screw up parsing of binary formats, /// because recoding bytes from (e.g.) windows-1252 into UTF-8, and then /// interpreting them as a binary number yields nonsense. - pub fn parse<'b, T>(&self, input: T) -> Result - where - T: Into>, - { - let input: EncodedStr = input.into(); + pub fn parse(&self, input: impl EncodedString) -> Result { if input.is_empty() { - return Ok(self.type_.default_value()); + return Ok(self + .type_ + .default_value() + .with_encoding(self.output_encoding)); } match self.type_ { Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => { @@ -222,24 +222,24 @@ impl<'a> ParseValue<'a> { | Type::DTime => self.parse_date(&input.as_str()), Type::WkDay => self.parse_wkday(&input.as_str()), Type::Month => self.parse_month(&input.as_str()), - Type::P => self.parse_p(input.as_bytes()), - Type::PK => self.parse_pk(input.as_bytes()), - Type::IB => self.parse_ib(input.as_bytes()), - Type::PIB => self.parse_pib(input.as_bytes()), - Type::RB => self.parse_rb(input.as_bytes()), + Type::P => self.parse_p(input.raw_string_bytes()), + Type::PK => self.parse_pk(input.raw_string_bytes()), + Type::IB => self.parse_ib(input.raw_string_bytes()), + Type::PIB => self.parse_pib(input.raw_string_bytes()), + Type::RB => self.parse_rb(input.raw_string_bytes()), Type::A => Ok(Datum::String( - input.to_encoding(self.output_encoding).into(), + input.to_encoding(self.output_encoding).into_owned(), )), Type::AHex => self.parse_ahex(&input.as_str()), } .map_err(|kind| ParseError { type_: self.type_, - input: input.into(), + input: input.cloned(), kind, }) } - fn parse_number(&self, input: &str, 
type_: Type) -> Result { + fn parse_number(&self, input: &str, type_: Type) -> Result { let style = self.settings.number_style(type_); let input = input.trim(); @@ -312,14 +312,14 @@ impl<'a> ParseValue<'a> { } } - fn parse_n(&self, input: &str) -> Result { + fn parse_n(&self, input: &str) -> Result { match input.chars().find(|c| !c.is_ascii_digit()) { None => Ok(Datum::Number(Some(input.parse().unwrap()))), Some(nondigit) => Err(ParseErrorKind::Nondigit(nondigit)), } } - fn parse_z(&self, input: &str) -> Result { + fn parse_z(&self, input: &str) -> Result { let input = input.trim(); if input.is_empty() || input == "." { return Ok(Datum::sysmis()); @@ -396,12 +396,12 @@ impl<'a> ParseValue<'a> { } } - fn parse_pk(&self, input: &[u8]) -> Result { + fn parse_pk(&self, input: &[u8]) -> Result { let number = Self::parse_bcd(input)?; Ok(Datum::Number(Some(self.apply_decimals(number as f64)))) } - fn parse_p(&self, input: &[u8]) -> Result { + fn parse_p(&self, input: &[u8]) -> Result { if input.is_empty() { return Ok(Datum::Number(None)); }; @@ -423,7 +423,7 @@ impl<'a> ParseValue<'a> { } } - fn parse_ib(&self, input: &[u8]) -> Result { + fn parse_ib(&self, input: &[u8]) -> Result { let number = self.parse_binary(input); let sign_bit = 1 << (input.len() * 8 - 1); let number = if (number & sign_bit) == 0 { @@ -434,12 +434,12 @@ impl<'a> ParseValue<'a> { Ok(Datum::Number(Some(self.apply_decimals(number as f64)))) } - fn parse_pib(&self, input: &[u8]) -> Result { + fn parse_pib(&self, input: &[u8]) -> Result { let number = self.parse_binary(input); Ok(Datum::Number(Some(self.apply_decimals(number as f64)))) } - fn parse_rb(&self, input: &[u8]) -> Result { + fn parse_rb(&self, input: &[u8]) -> Result { let mut bytes = [0; 8]; let len = input.len().min(8); bytes[..len].copy_from_slice(&input[..len]); @@ -453,7 +453,7 @@ impl<'a> ParseValue<'a> { Ok(Datum::Number(number)) } - fn parse_ahex(&self, input: &str) -> Result { + fn parse_ahex(&self, input: &str) -> Result { let mut 
result = Vec::with_capacity(input.len() / 2); let mut iter = input.chars(); while let Some(hi) = iter.next() { @@ -468,7 +468,9 @@ impl<'a> ParseValue<'a> { }; result.push((hi * 16 + lo) as u8); } - Ok(Datum::String(result.into())) + Ok(Datum::String( + ByteString(result).with_encoding(self.output_encoding), + )) } fn parse_hex(&self, input: &str) -> Result, ParseErrorKind> { @@ -483,17 +485,17 @@ impl<'a> ParseValue<'a> { } } - fn parse_pibhex(&self, input: &str) -> Result { + fn parse_pibhex(&self, input: &str) -> Result { self.parse_hex(input) .map(|value| Datum::Number(value.map(|number| number as f64))) } - fn parse_rbhex(&self, input: &str) -> Result { + fn parse_rbhex(&self, input: &str) -> Result { self.parse_hex(input) .map(|value| Datum::Number(value.map(f64::from_bits))) } - fn parse_date(&self, input: &str) -> Result { + fn parse_date(&self, input: &str) -> Result { let mut p = StrParser(input.trim()); if p.0.is_empty() || p.0 == "." { return Ok(Datum::sysmis()); @@ -609,7 +611,7 @@ impl<'a> ParseValue<'a> { Ok(time + seconds) } - fn parse_wkday(&self, input: &str) -> Result { + fn parse_wkday(&self, input: &str) -> Result { let mut p = StrParser(input.trim()); if p.0.is_empty() || p.0 == "." { Ok(Datum::sysmis()) @@ -620,7 +622,7 @@ impl<'a> ParseValue<'a> { } } - fn parse_month(&self, input: &str) -> Result { + fn parse_month(&self, input: &str) -> Result { let mut p = StrParser(input.trim()); if p.0.is_empty() || p.0 == "." 
{ Ok(Datum::sysmis()) @@ -915,13 +917,13 @@ mod test { path::Path, }; + use binrw::Endian; use encoding_rs::UTF_8; use rand::random; use crate::{ calendar::{days_in_month, is_leap_year}, - data::{Datum, EncodedStr}, - endian::Endian, + data::{ByteStr, Datum, EncodedString, OwnedDatum, RawString}, format::{ parse::{ParseError, ParseErrorKind, Sign}, Epoch, Format, Settings as FormatSettings, Type, @@ -930,7 +932,7 @@ mod test { }; fn test(name: &str, type_: Type) { - let base = Path::new(env!("CARGO_MANIFEST_DIR")).join("src/format/testdata/parse"); + let base = Path::new("src/format/testdata/parse"); let input_stream = BufReader::new(File::open(base.join("num-in.txt")).unwrap()); let expected_stream = BufReader::new(File::open(base.join(name)).unwrap()); for ((input, expected), line_number) in input_stream @@ -942,8 +944,8 @@ mod test { let result = type_.parser(UTF_8).parse(&input); let error = result.clone().err(); let value = result - .unwrap_or(type_.default_value()) - .display(Format::new(Type::F, 10, 4).unwrap(), UTF_8) + .unwrap_or(type_.default_value().with_encoding(UTF_8)) + .display(Format::new(Type::F, 10, 4).unwrap()) .to_string(); if value != expected { panic!( @@ -1229,7 +1231,7 @@ mod test { .with_settings(&settings) .parse(&formatted) .unwrap(); - assert_eq!(parsed, Datum::Number(Some(expected as f64))); + assert_eq!(parsed, OwnedDatum::Number(Some(expected as f64))); } } @@ -1607,8 +1609,8 @@ mod test { assert_eq!(parsed, expected); } } - assert_eq!(parser.parse(".").unwrap(), Datum::Number(None)); - assert_eq!(parser.parse("",).unwrap(), Datum::Number(None)); + assert_eq!(parser.parse(".").unwrap(), OwnedDatum::Number(None)); + assert_eq!(parser.parse("",).unwrap(), OwnedDatum::Number(None)); } #[test] @@ -1635,7 +1637,7 @@ mod test { let parsed = Type::RB .parser(UTF_8) .with_endian(EndianSettings::new(Endian::Big)) - .parse(EncodedStr::new(&raw[..], UTF_8)) + .parse(ByteStr(raw.as_slice()).with_encoding(UTF_8)) .unwrap() .as_number() .unwrap() 
@@ -1697,7 +1699,7 @@ mod test { assert_eq!(parsed, number as f64, "formatted as {formatted:?}"); } } - assert_eq!(parser.parse(".").unwrap(), Datum::Number(None)); + assert_eq!(parser.parse(".").unwrap(), OwnedDatum::Number(None)); let parser = Type::Z.parser(UTF_8).with_implied_decimals(1); for number in -999i32..=999 { @@ -1732,7 +1734,6 @@ mod test { .unwrap() .as_string() .unwrap() - .as_encoded(UTF_8) .as_str(), "abcdefgh" ); diff --git a/rust/pspp/src/identifier.rs b/rust/pspp/src/identifier.rs index 823ba6e971..700b1cd087 100644 --- a/rust/pspp/src/identifier.rs +++ b/rust/pspp/src/identifier.rs @@ -22,10 +22,12 @@ use std::{ ops::{Deref, DerefMut}, }; -use encoding_rs::{EncoderResult, Encoding, UTF_8}; +use encoding_rs::{CoderResult, Encoder, EncoderResult, Encoding, UTF_8}; +use serde::Serialize; use thiserror::Error as ThisError; use unicase::UniCase; use unicode_properties::UnicodeGeneralCategory; +use unicode_segmentation::UnicodeSegmentation; #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum Class { @@ -209,16 +211,38 @@ impl Identifier { /// encoding used by the dictionary, not in UTF-8. pub const MAX_LEN: usize = 64; + fn new_unchecked(s: impl Into>) -> Self { + let s: UniCase = s.into(); + debug_assert!(Self::check_plausible(&s).is_ok()); + Identifier(s) + } + pub fn new(s: impl Into>) -> Result { Self::from_encoding(s, UTF_8) } + /// Converts this identifier to UTF-8. This is generally a no-op, because + /// our internal encoding is UTF-8, but some identifiers are longer in UTF-8 + /// than in their code page, which means that to satisfy the 64-byte limit + /// this function sometimes has to remove trailing grapheme clusters. + pub fn codepage_to_unicode(&mut self) { + while self.len() > Self::MAX_LEN { + let (new_len, _) = self.as_str().grapheme_indices(true).next_back().unwrap(); + self.0.truncate(new_len); + if self.0.is_empty() { + // We had a grapheme cluster longer than 64 bytes! 
+ *self = Identifier::new("VAR").unwrap(); + return; + } + } + } + pub fn from_encoding( s: impl Into>, encoding: &'static Encoding, ) -> Result { let s: UniCase = s.into(); - Self::is_plausible(&s)?; + Self::check_plausible(&s)?; let identifier = Identifier(s); identifier.check_encoding(encoding)?; Ok(identifier) @@ -260,7 +284,7 @@ impl Identifier { }*/ Ok(()) } - pub fn is_plausible(s: &str) -> Result<(), Error> { + pub fn check_plausible(s: &str) -> Result<(), Error> { if s.is_empty() { return Err(Error::Empty); } @@ -330,6 +354,120 @@ impl Identifier { pub fn as_str(&self) -> &str { self.0.as_ref() } + + /// Returns this this identifier truncated to at most 8 bytes in `encoding`. + pub fn shortened(&self, encoding: &'static Encoding) -> Self { + let new_len = shortened_len(self, "", encoding, 8); + Self::new_unchecked(self.0[..new_len].to_string()) + } + + /// Returns a prefix of this identifier concatenated with all of `suffix`, + /// including as many grapheme clusters from the beginning of this + /// identifier as would fit within `max_len` bytes if the resulting string + /// were to be re-encoded in `encoding`. + /// + /// `max_len` would ordinarily be 64, since that's the maximum length of an + /// identifier, but a value of 8 is appropriate for short variable names. + /// + /// This function fails if adding or using `suffix` produces an invalid + /// [Identifier], for example if `max_len` is short enough that none of the + /// identifier can be included and `suffix` begins with `'_'` or another + /// character that may not appear at the beginning of an identifier. 
+ /// + /// # Examples + /// + /// Simple examples for UTF-8 `encoding` with `max_len` of 6: + /// + /// ```text + /// identifier="abc", suffix="xyz" => "abcxyz" + /// identifier="abcd", suffix="xyz" => "abcxyz" + /// identifier="abc", suffix="uvwxyz" => "uvwxyz" + /// identifier="abc", suffix="tuvwxyz" => "tuvwxyz" + /// ``` + /// + /// Examples for windows-1252 `encoding` with `max_len` of 6: + /// + /// ```text + /// identifier="éèä", suffix="xyz" => "éèäxyz" + /// ``` + /// + /// (each letter in the identifier is only 1 byte in windows-1252 even + /// though they each take 2 bytes in UTF-8) + pub fn with_suffix( + &self, + suffix: &str, + encoding: &'static Encoding, + max_len: usize, + ) -> Result { + let prefix_len = shortened_len(self, suffix, encoding, max_len); + if prefix_len == 0 { + Self::new(suffix) + } else { + Self::new(format!("{}{suffix}", &self[..prefix_len])) + } + } +} + +impl Serialize for Identifier { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.0.as_str().serialize(serializer) + } +} + +fn encode_fully(encoder: &mut Encoder, mut src: &str, dst: &mut Vec, last: bool) { + while let (CoderResult::OutputFull, read, _) = encoder.encode_from_utf8_to_vec(src, dst, last) { + src = &src[read..]; + dst.reserve((dst.capacity() * 2) - dst.len()); + } +} + +fn shortened_len(prefix: &str, suffix: &str, encoding: &'static Encoding, max_len: usize) -> usize { + assert!(max_len <= 64); + if encoding == UTF_8 { + if prefix.len() + suffix.len() <= max_len { + prefix.len() + } else if suffix.len() >= max_len { + 0 + } else { + let mut copy_len = 0; + for (cluster_start, cluster) in prefix.grapheme_indices(true) { + let cluster_end = cluster_start + cluster.len(); + if cluster_end > max_len - suffix.len() { + break; + } + copy_len = cluster_end; + } + copy_len + } + } else { + let mut copy_len = 0; + let mut tmp = Vec::with_capacity(max_len); + for (cluster_start, cluster) in prefix.grapheme_indices(true) { + let 
cluster_end = cluster_start + cluster.len(); + let mut encoder = encoding.new_encoder(); + tmp.clear(); + encode_fully(&mut encoder, &prefix[..cluster_end], &mut tmp, false); + if tmp.len() <= max_len { + encode_fully(&mut encoder, suffix, &mut tmp, true); + } + if tmp.len() > max_len { + break; + } + copy_len = cluster_end; + } + copy_len + } +} + +impl Deref for Identifier { + type Target = UniCase; + + fn deref(&self) -> &Self::Target { + &self.0 + } } impl PartialEq for Identifier { @@ -490,3 +628,134 @@ where &mut self.0 } } + +impl Serialize for ByIdentifier +where + T: HasIdentifier + Clone + Serialize, +{ + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.0.serialize(serializer) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use encoding_rs::{Encoding, UTF_8, WINDOWS_1252}; + use unicase::UniCase; + + use crate::identifier::Identifier; + + use super::{ByIdentifier, HasIdentifier}; + + #[derive(PartialEq, Eq, Debug, Clone)] + struct SimpleVar { + name: Identifier, + value: i32, + } + + impl HasIdentifier for SimpleVar { + fn identifier(&self) -> &UniCase { + &self.name.0 + } + } + + #[test] + fn identifier() { + // Variables should not be the same if their values differ. + let abcd = Identifier::new("abcd").unwrap(); + let abcd1 = SimpleVar { + name: abcd.clone(), + value: 1, + }; + let abcd2 = SimpleVar { + name: abcd, + value: 2, + }; + assert_ne!(abcd1, abcd2); + + // But [ByIdentifier]` should treat them the same. + let abcd1_by_name = ByIdentifier::new(abcd1); + let abcd2_by_name = ByIdentifier::new(abcd2); + assert_eq!(abcd1_by_name, abcd2_by_name); + + // And a [HashSet] of [ByIdentifier] should also treat them the same. 
+ let mut vars: HashSet> = HashSet::new(); + assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone()))); + assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone()))); + assert_eq!( + vars.get(&UniCase::new(String::from("abcd"))) + .unwrap() + .0 + .value, + 1 + ); + } + + #[test] + fn with_suffix() { + for (head, suffix, encoding, max_len, expected) in [ + ("abc", "xyz", UTF_8, 6, "abcxyz"), + ("abcd", "xyz", UTF_8, 6, "abcxyz"), + ("abcd", "uvwxyz", UTF_8, 6, "uvwxyz"), + ("abc", "tuvwxyz", UTF_8, 6, "tuvwxyz"), + ("éèä", "xyz", UTF_8, 6, "éxyz"), + ("éèä", "xyz", WINDOWS_1252, 6, "éèäxyz"), + ] { + let head = Identifier::new(head).unwrap(); + let suffix = Identifier::new(suffix).unwrap(); + let actual = head.with_suffix(&suffix, encoding, max_len).unwrap(); + assert_eq!(&actual, expected); + } + } + + #[test] + fn shortened() { + for (long, expected_short, encoding) in [ + ("abc", "abc", UTF_8), + ("éèäîVarNameA", "éèäî", UTF_8), + ("éèäîVarNameA", "éèäîVarN", WINDOWS_1252), + ] { + let long = Identifier::new(long).unwrap(); + let short = long.shortened(encoding); + assert_eq!(&short, expected_short); + } + } + + #[test] + fn codepage_to_unicode() { + fn check_unicode(identifier: &str, encoding: &'static Encoding, expected: &str) { + let identifier = Identifier::from_encoding(String::from(identifier), encoding).unwrap(); + let mut actual = identifier.clone(); + actual.codepage_to_unicode(); + assert_eq!(actual.as_str(), expected); + } + + check_unicode("abc", UTF_8, "abc"); + check_unicode("éèäî", UTF_8, "éèäî"); + + // 32 bytes in windows-1252, 64 bytes in UTF-8, no truncation. + check_unicode( + "éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî", + WINDOWS_1252, + "éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî", + ); + + // 33 or 34 bytes in windows-1252, 65 or 66 bytes in UTF-8, truncate + // last (2-byte) character. 
+ check_unicode( + "xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî", + WINDOWS_1252, + "xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä", + ); + check_unicode( + "xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî", + WINDOWS_1252, + "xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä", + ); + } +} diff --git a/rust/pspp/src/lex/mod.rs b/rust/pspp/src/lex/mod.rs index 2cf29eec92..f92407bb41 100644 --- a/rust/pspp/src/lex/mod.rs +++ b/rust/pspp/src/lex/mod.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . -//! PSPP lexical analysis. +//! Lexical analysis for PSPP syntax. //! //! PSPP divides traditional "lexical analysis" or "tokenization" into three //! phases: diff --git a/rust/pspp/src/lib.rs b/rust/pspp/src/lib.rs index 9ae4bdfcd9..cb2ddc4f80 100644 --- a/rust/pspp/src/lib.rs +++ b/rust/pspp/src/lib.rs @@ -35,6 +35,7 @@ pub mod output; pub mod prompt; pub mod settings; pub mod sys; +pub mod variable; /// This is [slice::element_offset] copied out from the standard library so that /// we can use it while it is still experimental. diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index f3f885d085..48d2e0ada0 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -14,21 +14,35 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, bail, Error as AnyError, Result}; use clap::{Args, Parser, Subcommand, ValueEnum}; use encoding_rs::Encoding; use pspp::{ crypto::EncryptedFile, + data::Datum, + output::{ + driver::{Config, Driver}, + pivot::PivotTable, + Details, Item, Text, + }, sys::{ - raw::{infer_encoding, Decoder, Magic, Reader, Record}, - ReaderOptions, Records, + self, + raw::{ + infer_encoding, records::Compression, Decoder, EncodingReport, Magic, Reader, Record, + }, + ReadOptions, Records, }, }; +use serde::Serialize; use std::{ + cell::RefCell, + ffi::OsStr, + fmt::{Display, Write as _}, fs::File, io::{stdout, BufReader, Write}, path::{Path, PathBuf}, - str, + rc::Rc, + sync::Arc, }; use thiserror::Error as ThisError; use zeroize::Zeroizing; @@ -47,6 +61,27 @@ enum OutputFormat { /// Comma-separated values using each variable's print format (variable /// names are written as the first line) Csv, + + /// System file + Sys, +} + +impl TryFrom<&Path> for OutputFormat { + type Error = AnyError; + + fn try_from(value: &Path) -> std::result::Result { + let extension = value.extension().unwrap_or_default(); + if extension.eq_ignore_ascii_case("csv") || extension.eq_ignore_ascii_case("txt") { + Ok(OutputFormat::Csv) + } else if extension.eq_ignore_ascii_case("sav") || extension.eq_ignore_ascii_case("sys") { + Ok(OutputFormat::Sys) + } else { + Err(anyhow!( + "Unknown output file extension '{}'", + extension.display() + )) + } + } } /// Convert SPSS data files into other formats. @@ -63,10 +98,14 @@ struct Convert { #[arg(short = 'O')] output_format: Option, - /// The encoding to use. + /// The encoding to use for reading the input file. #[arg(short = 'e', long, value_parser = parse_encoding)] encoding: Option<&'static Encoding>, + /// If true, convert to Unicode (UTF-8) encoding. + #[arg(long = "unicode")] + to_unicode: bool, + /// Password for decryption, with or without what SPSS calls "password encryption". 
/// /// Specify only for an encrypted system file. @@ -75,10 +114,13 @@ struct Convert { /// Maximum number of cases to print. #[arg(short = 'c', long = "cases")] - max_cases: Option, + max_cases: Option, #[command(flatten, next_help_heading = "Options for CSV output")] csv_options: CsvOptions, + + #[command(flatten, next_help_heading = "Options for system file output")] + sys_options: SysOptions, } #[derive(Args, Clone, Debug)] @@ -88,34 +130,76 @@ struct CsvOptions { no_var_names: bool, } +#[derive(Args, Clone, Debug)] +struct SysOptions { + /// How to compress data in the system file. + #[arg(long, default_value = "simple")] + compression: Option, +} + impl Convert { fn run(self) -> Result<()> { fn warn(warning: anyhow::Error) { eprintln!("warning: {warning}"); } - let (dictionary, _, cases) = ReaderOptions::new() + let mut system_file = ReadOptions::new(warn) .with_encoding(self.encoding) .with_password(self.password.clone()) - .open_file(&self.input, warn)? - .into_parts(); - let writer = match self.output { - Some(path) => Box::new(File::create(path)?) as Box, - None => Box::new(stdout()), - }; - let mut output = csv::WriterBuilder::new().from_writer(writer); - if !self.csv_options.no_var_names { - output.write_record(dictionary.variables.iter().map(|var| var.name.as_str()))?; + .open_file(&self.input)?; + if self.to_unicode { + system_file = system_file.into_unicode(); } + let (dictionary, _, cases) = system_file.into_parts(); - for (_case_number, case) in (0..self.max_cases.unwrap_or(u64::MAX)).zip(cases) { - output.write_record(case?.0.into_iter().zip(dictionary.variables.iter()).map( - |(datum, variable)| { - datum - .display(variable.print_format, variable.encoding) - .to_string() - }, - ))?; + // Take only the first `self.max_cases` cases. 
+ let cases = cases.take(self.max_cases.unwrap_or(usize::MAX)); + + let output_format = match self.output_format { + Some(format) => format, + None => { + let Some(output) = &self.output else { + bail!("either --output-format or an output file name must be specified"); + }; + output.as_path().try_into()? + } + }; + + match output_format { + OutputFormat::Csv => { + let writer = match self.output { + Some(path) => Box::new(File::create(path)?) as Box, + None => Box::new(stdout()), + }; + let mut output = csv::WriterBuilder::new().from_writer(writer); + if !self.csv_options.no_var_names { + output + .write_record(dictionary.variables.iter().map(|var| var.name.as_str()))?; + } + + for case in cases { + output.write_record(case?.into_iter().zip(dictionary.variables.iter()).map( + |(datum, variable)| { + if datum == Datum::sysmis() { + String::from(" ") + } else { + datum.display(variable.print_format).to_string() + } + }, + ))?; + } + } + OutputFormat::Sys => { + let Some(output) = &self.output else { + bail!("output file name must be specified for output to a system file") + }; + let mut output = sys::WriteOptions::new() + .with_compression(self.sys_options.compression) + .write_file(&dictionary, output)?; + for case in cases { + output.write_case(case?)?; + } + } } Ok(()) } @@ -157,31 +241,286 @@ impl Decrypt { } } -/// Dissects SPSS system files. +/// Show information about SPSS system files. #[derive(Args, Clone, Debug)] -struct Dissect { - /// Maximum number of cases to print. - #[arg(long = "data", default_value_t = 0)] - max_cases: u64, +struct Show { + /// What to show. + #[arg(value_enum)] + mode: Mode, - /// Files to dissect. + /// File to show. #[arg(required = true)] - files: Vec, + input_file: PathBuf, - /// How to dissect the file. - #[arg(short, long, value_enum, default_value_t)] - mode: Mode, + /// Output file name. If omitted, output is written to stdout. + output_file: Option, + + /// Output driver configuration options. 
+ #[arg(short = 'o')] + output_options: Vec, + + /// Maximum number of cases to read. + /// + /// If specified without an argument, all cases will be read. + #[arg( + long = "data", + num_args = 0..=1, + default_missing_value = "18446744073709551615", + default_value_t = 0 + )] + max_cases: u64, + + /// Output format. + #[arg(long, short = 'f')] + format: Option, /// The encoding to use. #[arg(long, value_parser = parse_encoding)] encoding: Option<&'static Encoding>, } -impl Dissect { +enum Output { + Driver { + driver: Rc>>, + mode: Mode, + }, + Json { + writer: Rc>>, + pretty: bool, + }, + Discard, +} + +impl Output { + /* + fn show_metadata(&self, metadata: MetadataEntry) -> Result<()> { + match self { + Self::Driver { driver, .. } => { + driver + .borrow_mut() + .write(&Arc::new(Item::new(metadata.into_pivot_table()))); + Ok(()) + } + Self::Json { .. } => self.show_json(&metadata), + Self::Discard => Ok(()), + } + }*/ + + fn show(&self, value: &T) -> Result<()> + where + T: Serialize, + for<'a> &'a T: Into
, + { + match self { + Self::Driver { driver, .. } => { + driver + .borrow_mut() + .write(&Arc::new(Item::new(value.into()))); + Ok(()) + } + Self::Json { .. } => self.show_json(value), + Self::Discard => Ok(()), + } + } + + fn show_json(&self, value: &T) -> Result<()> + where + T: Serialize, + { + match self { + Self::Driver { mode, driver: _ } => { + Err(anyhow!("Mode '{mode}' only supports output as JSON.")) + } + Self::Json { writer, pretty } => { + let mut writer = writer.borrow_mut(); + match pretty { + true => serde_json::to_writer_pretty(&mut *writer, value)?, + false => serde_json::to_writer(&mut *writer, value)?, + }; + writeln!(writer)?; + Ok(()) + } + Self::Discard => Ok(()), + } + } + + fn warn(&self, warning: &impl Display) { + match self { + Output::Driver { driver, .. } => { + driver + .borrow_mut() + .write(&Arc::new(Item::from(Text::new_log(warning.to_string())))); + } + Output::Json { .. } => { + #[derive(Serialize)] + struct Warning { + warning: String, + } + let warning = Warning { + warning: warning.to_string(), + }; + let _ = self.show_json(&warning); + } + Self::Discard => (), + } + } +} + +impl Show { fn run(self) -> Result<()> { - for file in self.files { - dissect(&file, self.max_cases, self.mode, self.encoding)?; + let format = if let Some(format) = self.format { + format + } else if let Some(output_file) = &self.output_file { + match output_file + .extension() + .unwrap_or(OsStr::new("")) + .to_str() + .unwrap_or("") + { + "json" => ShowFormat::Json, + "ndjson" => ShowFormat::Ndjson, + _ => ShowFormat::Output, + } + } else { + ShowFormat::Json + }; + + let output = match format { + ShowFormat::Output => { + let mut config = String::new(); + + if let Some(file) = &self.output_file { + #[derive(Serialize)] + struct File<'a> { + file: &'a Path, + } + let file = File { + file: file.as_path(), + }; + let toml_file = toml::to_string_pretty(&file).unwrap(); + config.push_str(&toml_file); + } + for option in &self.output_options { + 
writeln!(&mut config, "{option}").unwrap(); + } + + let table: toml::Table = toml::from_str(&config)?; + if !table.contains_key("driver") { + let driver = if let Some(file) = &self.output_file { + ::driver_type_from_filename(file).ok_or_else(|| { + anyhow!("{}: no default output format for file name", file.display()) + })? + } else { + "text" + }; + + #[derive(Serialize)] + struct DriverConfig { + driver: &'static str, + } + config.insert_str( + 0, + &toml::to_string_pretty(&DriverConfig { driver }).unwrap(), + ); + } + + let config: Config = toml::from_str(&config)?; + Output::Driver { + mode: self.mode, + driver: Rc::new(RefCell::new(Box::new(::new(&config)?))), + } + } + ShowFormat::Json | ShowFormat::Ndjson => Output::Json { + pretty: format == ShowFormat::Json, + writer: if let Some(output_file) = &self.output_file { + Rc::new(RefCell::new(Box::new(File::create(output_file)?))) + } else { + Rc::new(RefCell::new(Box::new(stdout()))) + }, + }, + ShowFormat::Discard => Output::Discard, + }; + + let reader = File::open(&self.input_file)?; + let reader = BufReader::new(reader); + let mut reader = Reader::new(reader, Box::new(|warning| output.warn(&warning)))?; + + match self.mode { + Mode::Identity => { + match reader.header().magic { + Magic::Sav => println!("SPSS System File"), + Magic::Zsav => println!("SPSS System File with Zlib compression"), + Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"), + } + return Ok(()); + } + Mode::Raw => { + output.show_json(reader.header())?; + for record in reader.records() { + output.show_json(&record?)?; + } + for (_index, case) in (0..self.max_cases).zip(reader.cases()) { + output.show_json(&case?)?; + } + } + Mode::Decoded => { + let records: Vec = reader.records().collect::, _>>()?; + let encoding = match self.encoding { + Some(encoding) => encoding, + None => infer_encoding(&records, &mut |e| output.warn(&e))?, + }; + let mut decoder = Decoder::new(encoding, |e| output.warn(&e)); + for record in records { + 
output.show_json(&record.decode(&mut decoder))?; + } + } + Mode::Dictionary => { + let records: Vec = reader.records().collect::, _>>()?; + let encoding = match self.encoding { + Some(encoding) => encoding, + None => infer_encoding(&records, &mut |e| output.warn(&e))?, + }; + let mut decoder = Decoder::new(encoding, |e| output.warn(&e)); + let records = Records::from_raw(records, &mut decoder); + let (dictionary, metadata, cases) = records + .decode( + reader.header().clone().decode(&mut decoder), + reader.cases(), + encoding, + |e| output.warn(&e), + ) + .into_parts(); + match &output { + Output::Driver { driver, mode: _ } => { + driver + .borrow_mut() + .write(&Arc::new(Item::new(PivotTable::from(&metadata)))); + driver + .borrow_mut() + .write(&Arc::new(Item::new(Details::Group( + dictionary + .all_pivot_tables() + .into_iter() + .map(|pivot_table| Arc::new(Item::new(pivot_table))) + .collect(), + )))); + } + Output::Json { .. } => { + output.show_json(&dictionary)?; + output.show_json(&metadata)?; + for (_index, case) in (0..self.max_cases).zip(cases) { + output.show_json(&case?)?; + } + } + Output::Discard => (), + } + } + Mode::Encodings => { + let encoding_report = EncodingReport::new(reader, self.max_cases)?; + output.show(&encoding_report)?; + } } + Ok(()) } } @@ -190,7 +529,7 @@ impl Dissect { enum Command { Convert(Convert), Decrypt(Decrypt), - Dissect(Dissect), + Show(Show), } impl Command { @@ -198,7 +537,7 @@ impl Command { match self { Command::Convert(convert) => convert.run(), Command::Decrypt(decrypt) => decrypt.run(), - Command::Dissect(dissect) => dissect.run(), + Command::Show(show) => show.run(), } } } @@ -214,90 +553,59 @@ fn parse_encoding(arg: &str) -> Result<&'static Encoding, UnknownEncodingError> } } -#[derive(Clone, Copy, Debug, Default, ValueEnum)] +/// What to show in a system file. 
+#[derive(Clone, Copy, Debug, Default, PartialEq, ValueEnum)] enum Mode { - Identify, + /// The file dictionary, including variables, value labels, attributes, and so on. + #[default] + #[value(alias = "dict")] + Dictionary, + + /// Possible encodings of text in the file dictionary and (with `--data`) cases. + Encodings, + + /// The kind of file. + Identity, + + /// Raw file records, without assuming a particular character encoding. Raw, + + /// Raw file records decoded with a particular character encoding. Decoded, - #[default] - Cooked, } -fn main() -> Result<()> { - Cli::parse().command.run() +impl Mode { + fn as_str(&self) -> &'static str { + match self { + Mode::Dictionary => "dictionary", + Mode::Identity => "identity", + Mode::Raw => "raw", + Mode::Decoded => "decoded", + Mode::Encodings => "encodings", + } + } } -fn dissect( - file_name: &Path, - max_cases: u64, - mode: Mode, - encoding: Option<&'static Encoding>, -) -> Result<()> { - let reader = File::open(file_name)?; - let reader = BufReader::new(reader); - let mut reader = Reader::new(reader, Box::new(|warning| println!("{warning}")))?; - - match mode { - Mode::Identify => { - match reader.header().magic { - Magic::Sav => println!("SPSS System File"), - Magic::Zsav => println!("SPSS System File with Zlib compression"), - Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"), - } - return Ok(()); - } - Mode::Raw => { - for record in reader.records() { - let header = record?; - println!("{:?}", header); - } - for (_index, case) in (0..max_cases).zip(reader.cases()) { - println!("{:?}", case?); - } - } - Mode::Decoded => { - let records: Vec = reader.records().collect::, _>>()?; - let encoding = match encoding { - Some(encoding) => encoding, - None => infer_encoding(&records, &mut |e| eprintln!("{e}"))?, - }; - let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}")); - for header in records { - let header = header.decode(&mut decoder); - println!("{:?}", header); - /* - if let 
Record::Cases(cases) = header { - let mut cases = cases.borrow_mut(); - for _ in 0..max_cases { - let Some(Ok(record)) = cases.next() else { - break; - }; - println!("{:?}", record); - } - } - */ - } - } - Mode::Cooked => { - let records: Vec = reader.records().collect::, _>>()?; - let encoding = match encoding { - Some(encoding) => encoding, - None => infer_encoding(&records, &mut |e| eprintln!("{e}"))?, - }; - let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}")); - let records = Records::from_raw(records, &mut decoder); - let (dictionary, metadata, _) = records - .decode( - reader.header().clone().decode(&mut decoder), - reader.cases(), - encoding, - |e| eprintln!("{e}"), - ) - .into_parts(); - println!("{dictionary:#?}"); - println!("{metadata:#?}"); - } +impl Display for Mode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) } +} + +#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +enum ShowFormat { + /// Pretty-printed JSON. + #[default] + Json, + /// Newline-delimited JSON. + Ndjson, + /// Pivot tables. + Output, + /// No output. + Discard, +} - Ok(()) +fn main() -> Result<()> { + Cli::parse().command.run() } diff --git a/rust/pspp/src/message.rs b/rust/pspp/src/message.rs index 3d0f667b57..97bcc90b84 100644 --- a/rust/pspp/src/message.rs +++ b/rust/pspp/src/message.rs @@ -22,10 +22,11 @@ use std::{ }; use enum_map::Enum; +use serde::Serialize; use unicode_width::UnicodeWidthStr; /// A line number and optional column number within a source file. -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)] pub struct Point { /// 1-based line number. pub line: i32, @@ -65,7 +66,7 @@ impl Point { } /// Location relevant to an diagnostic message. 
-#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Serialize)] pub struct Location { /// File name, if any. pub file_name: Option>, @@ -76,6 +77,7 @@ pub struct Location { /// Normally, if `span` contains column information, then displaying the /// message will underline the location. Setting this to true disables /// displaying underlines. + #[serde(skip)] pub omit_underlines: bool, } @@ -136,7 +138,8 @@ impl Location { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum, Serialize)] +#[serde(rename_all = "snake_case")] pub enum Severity { Error, Warning, @@ -167,13 +170,15 @@ impl Display for Severity { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] pub enum Category { General, Syntax, Data, } +#[derive(Serialize)] pub struct Stack { location: Location, description: String, @@ -188,6 +193,7 @@ impl From for Diagnostics { } } +#[derive(Serialize)] pub struct Diagnostic { pub severity: Severity, pub category: Category, diff --git a/rust/pspp/src/output/cairo/driver.rs b/rust/pspp/src/output/cairo/driver.rs index 360d14fa89..fbcabc06c0 100644 --- a/rust/pspp/src/output/cairo/driver.rs +++ b/rust/pspp/src/output/cairo/driver.rs @@ -14,11 +14,16 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . 
-use std::{borrow::Cow, path::Path, sync::Arc}; +use std::{ + borrow::Cow, + path::{Path, PathBuf}, + sync::Arc, +}; use cairo::{Context, PdfSurface}; use enum_map::{enum_map, EnumMap}; use pango::SCALE; +use serde::{Deserialize, Serialize}; use crate::output::{ cairo::{ @@ -26,13 +31,31 @@ use crate::output::{ pager::{CairoPageStyle, CairoPager}, }, driver::Driver, - page::Setup, + page::PageSetup, pivot::{Color, Coord2, FontStyle}, Item, }; use crate::output::pivot::Axis2; +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CairoConfig { + /// Output file name. + pub file: PathBuf, + + /// Page setup. + pub page_setup: Option, +} + +impl CairoConfig { + pub fn new(path: impl AsRef) -> Self { + Self { + file: path.as_ref().to_path_buf(), + page_setup: None, + } + } +} + pub struct CairoDriver { fsm_style: Arc, page_style: Arc, @@ -41,12 +64,19 @@ pub struct CairoDriver { } impl CairoDriver { - pub fn new(path: impl AsRef) -> CairoDriver { + pub fn new(config: &CairoConfig) -> cairo::Result { fn scale(inches: f64) -> usize { (inches * 72.0 * SCALE as f64).max(0.0).round() as usize } - let page_setup = Setup::default(); + let default_page_setup; + let page_setup = match &config.page_setup { + Some(page_setup) => page_setup, + None => { + default_page_setup = PageSetup::default(); + &default_page_setup + } + }; let printable = page_setup.printable_size(); let page_style = CairoPageStyle { margins: EnumMap::from_fn(|axis| { @@ -85,15 +115,14 @@ impl CairoDriver { let surface = PdfSurface::new( page_setup.paper[Axis2::X] * 72.0, page_setup.paper[Axis2::Y] * 72.0, - path, - ) - .unwrap(); - Self { + &config.file, + )?; + Ok(Self { fsm_style: Arc::new(fsm_style), page_style: Arc::new(page_style), pager: None, surface, - } + }) } } diff --git a/rust/pspp/src/output/cairo/mod.rs b/rust/pspp/src/output/cairo/mod.rs index 2811bca601..0d6782f142 100644 --- a/rust/pspp/src/output/cairo/mod.rs +++ b/rust/pspp/src/output/cairo/mod.rs @@ -22,7 +22,7 @@ mod driver; pub 
mod fsm; pub mod pager; -pub use driver::CairoDriver; +pub use driver::{CairoConfig, CairoDriver}; /// Conversion from 1/96" units ("pixels") to Cairo/Pango units. fn px_to_xr(x: usize) -> usize { @@ -43,10 +43,10 @@ fn horz_align_to_pango(horz_align: HorzAlign) -> pango::Alignment { #[cfg(test)] mod test { - use crate::output::cairo::CairoDriver; + use crate::output::cairo::{CairoConfig, CairoDriver}; #[test] fn create() { - CairoDriver::new("test.pdf"); + CairoDriver::new(&CairoConfig::new("test.pdf")).unwrap(); } } diff --git a/rust/pspp/src/output/csv.rs b/rust/pspp/src/output/csv.rs index 543e80fae8..5e65b0b75e 100644 --- a/rust/pspp/src/output/csv.rs +++ b/rust/pspp/src/output/csv.rs @@ -18,28 +18,67 @@ use std::{ borrow::Cow, fmt::Display, fs::File, - io::{Error, Write}, + io::{BufWriter, Error, Write}, + path::PathBuf, sync::Arc, }; +use serde::{ + de::{Unexpected, Visitor}, + Deserialize, Deserializer, Serialize, +}; + use crate::output::pivot::Coord2; use super::{driver::Driver, pivot::PivotTable, table::Table, Details, Item, TextType}; -struct CsvDriver { - file: File, +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct CsvConfig { + file: PathBuf, + #[serde(flatten)] + options: CsvOptions, +} + +pub struct CsvDriver { + file: BufWriter, options: CsvOptions, /// Number of items written so far. 
n_items: usize, } -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, Serialize, Deserialize)] +#[serde(default)] struct CsvOptions { + #[serde(deserialize_with = "deserialize_ascii_char")] quote: u8, delimiter: u8, } +fn deserialize_ascii_char<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + struct AsciiCharVisitor; + impl<'de> Visitor<'de> for AsciiCharVisitor { + type Value = u8; + fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "a single ASCII character") + } + fn visit_str(self, s: &str) -> Result + where + E: serde::de::Error, + { + if s.len() == 1 { + Ok(s.chars().next().unwrap() as u8) + } else { + Err(serde::de::Error::invalid_value(Unexpected::Str(s), &self)) + } + } + } + deserializer.deserialize_char(AsciiCharVisitor) +} + impl Default for CsvOptions { fn default() -> Self { Self { @@ -89,12 +128,12 @@ impl Display for CsvField<'_> { } impl CsvDriver { - pub fn new(file: File) -> Self { - Self { - file, - options: CsvOptions::default(), + pub fn new(config: &CsvConfig) -> std::io::Result { + Ok(Self { + file: BufWriter::new(File::create(&config.file)?), + options: config.options.clone(), n_items: 0, - } + }) } fn start_item(&mut self) { @@ -162,7 +201,7 @@ impl Driver for CsvDriver { Details::Message(diagnostic) => { self.start_item(); let text = diagnostic.to_string(); - writeln!(&self.file, "{}", CsvField::new(&text, self.options)).unwrap(); + writeln!(&mut self.file, "{}", CsvField::new(&text, self.options)).unwrap(); } Details::Table(pivot_table) => { for layer in pivot_table.layers(true) { @@ -178,7 +217,7 @@ impl Driver for CsvDriver { TextType::Title | TextType::Log => { self.start_item(); for line in text.content.display(()).to_string().lines() { - writeln!(&self.file, "{}", CsvField::new(line, self.options)).unwrap(); + writeln!(&mut self.file, "{}", CsvField::new(line, self.options)).unwrap(); } } }, diff --git a/rust/pspp/src/output/driver.rs b/rust/pspp/src/output/driver.rs 
index 897ae61de7..963661146e 100644 --- a/rust/pspp/src/output/driver.rs +++ b/rust/pspp/src/output/driver.rs @@ -14,9 +14,21 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . -use std::{borrow::Cow, sync::Arc}; +use std::{borrow::Cow, path::Path, sync::Arc}; -use super::{page::Setup, Item}; +use clap::ValueEnum; +use serde::{Deserialize, Serialize}; + +use crate::output::{ + cairo::{CairoConfig, CairoDriver}, + csv::{CsvConfig, CsvDriver}, + html::{HtmlConfig, HtmlDriver}, + json::{JsonConfig, JsonDriver}, + spv::{SpvConfig, SpvDriver}, + text::{TextConfig, TextDriver}, +}; + +use super::{page::PageSetup, Item}; // An output driver. pub trait Driver { @@ -25,7 +37,7 @@ pub trait Driver { fn write(&mut self, item: &Arc); /// Returns false if the driver doesn't support page setup. - fn setup(&mut self, page_setup: &Setup) -> bool { + fn setup(&mut self, page_setup: &PageSetup) -> bool { let _ = page_setup; false } @@ -53,29 +65,100 @@ pub trait Driver { } } -/* -/// An abstract way for the output subsystem to create an output driver. -trait DriverFactory { - /// The file extension, without the leading dot, e.g. "pdf". - fn extension(&self) -> OsString; +impl Driver for Box { + fn name(&self) -> Cow<'static, str> { + (&**self).name() + } - /// The default file name, including extension. - /// - /// If this is `-`, that implies that by default output will be directed to - /// stdout. - fn default_file_name(&self) -> PathBuf; + fn write(&mut self, item: &Arc) { + (&mut **self).write(item); + } - /// Creates a new output driver of this class. `name` and `type` should be - /// passed directly to output_driver_init. - /// - /// It is up to the driver class to decide how to interpret `options`. The - /// create function should delete pairs that it understands from `options`, - /// because the caller may issue errors about unknown options for any pairs - /// that remain. 
- fn create(&self, file_handle: (), + fn setup(&mut self, page_setup: &PageSetup) -> bool { + (&mut **self).setup(page_setup) + } + + fn flush(&mut self) { + (&mut **self).flush(); + } + + fn handles_show(&self) -> bool { + (&**self).handles_show() + } + + fn handles_groups(&self) -> bool { + (&**self).handles_groups() + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "driver", rename_all = "snake_case")] +pub enum Config { + Text(TextConfig), + Pdf(CairoConfig), + Html(HtmlConfig), + Json(JsonConfig), + Csv(CsvConfig), + Spv(SpvConfig), +} + +#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, ValueEnum)] +#[serde(rename_all = "snake_case")] +pub enum DriverType { + Text, + Pdf, + Html, + Csv, + Json, + Spv, +} + +impl dyn Driver { + pub fn new(config: &Config) -> anyhow::Result> { + match config { + Config::Text(text_config) => Ok(Box::new(TextDriver::new(text_config)?)), + Config::Pdf(cairo_config) => Ok(Box::new(CairoDriver::new(cairo_config)?)), + Config::Html(html_config) => Ok(Box::new(HtmlDriver::new(html_config)?)), + Config::Csv(csv_config) => Ok(Box::new(CsvDriver::new(csv_config)?)), + Config::Json(json_config) => Ok(Box::new(JsonDriver::new(json_config)?)), + Config::Spv(spv_config) => Ok(Box::new(SpvDriver::new(spv_config)?)), + } + } + + pub fn driver_type_from_filename(file: impl AsRef) -> Option<&'static str> { + match file.as_ref().extension()?.to_str()? 
{ + "txt" | "text" => Some("text"), + "pdf" => Some("pdf"), + "htm" | "html" => Some("html"), + "csv" => Some("csv"), + "json" => Some("json"), + "spv" => Some("spv"), + _ => None, + } + } +} - enum settings_output_devices type, - struct driver_options *); +#[cfg(test)] +mod tests { + use serde::Serialize; + use crate::output::driver::Config; + + #[test] + fn toml() { + let config = r#"driver = "text" +file = "filename.text" +"#; + let toml: Config = toml::from_str(config).unwrap(); + println!("{}", toml::to_string_pretty(&toml).unwrap()); + + #[derive(Serialize)] + struct Map<'a> { + file: &'a str, + } + println!( + "{}", + toml::to_string_pretty(&Map { file: "filename" }).unwrap() + ); + } } -*/ diff --git a/rust/pspp/src/output/html.rs b/rust/pspp/src/output/html.rs index 9a80783f3e..cfc515d19a 100644 --- a/rust/pspp/src/output/html.rs +++ b/rust/pspp/src/output/html.rs @@ -17,10 +17,13 @@ use std::{ borrow::Cow, fmt::{Display, Write as _}, + fs::File, io::Write, + path::PathBuf, sync::Arc, }; +use serde::{Deserialize, Serialize}; use smallstr::SmallString; use crate::output::{ @@ -30,7 +33,12 @@ use crate::output::{ Details, Item, }; -pub struct HtmlRenderer { +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct HtmlConfig { + file: PathBuf, +} + +pub struct HtmlDriver { writer: W, fg: Color, bg: Color, @@ -49,11 +57,17 @@ impl Stroke { } } -impl HtmlRenderer +impl HtmlDriver { + pub fn new(config: &HtmlConfig) -> std::io::Result { + Ok(Self::for_writer(File::create(&config.file)?)) + } +} + +impl HtmlDriver where W: Write, { - pub fn new(mut writer: W) -> Self { + pub fn for_writer(mut writer: W) -> Self { let _ = put_header(&mut writer); Self { fg: Color::BLACK, @@ -412,7 +426,7 @@ a:active { "#; -impl Driver for HtmlRenderer +impl Driver for HtmlDriver where W: Write, { diff --git a/rust/pspp/src/output/json.rs b/rust/pspp/src/output/json.rs new file mode 100644 index 0000000000..af6923d390 --- /dev/null +++ b/rust/pspp/src/output/json.rs @@ -0,0 
+1,58 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +use std::{ + borrow::Cow, + fs::File, + io::{BufWriter, Write}, + path::PathBuf, + sync::Arc, +}; + +use serde::{Deserialize, Serialize}; + +use super::{driver::Driver, Item}; + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct JsonConfig { + file: PathBuf, +} + +pub struct JsonDriver { + file: BufWriter, +} + +impl JsonDriver { + pub fn new(config: &JsonConfig) -> std::io::Result { + Ok(Self { + file: BufWriter::new(File::create(&config.file)?), + }) + } +} + +impl Driver for JsonDriver { + fn name(&self) -> Cow<'static, str> { + Cow::from("json") + } + + fn write(&mut self, item: &Arc) { + serde_json::to_writer_pretty(&mut self.file, item).unwrap(); // XXX handle errors + } + + fn flush(&mut self) { + let _ = self.file.flush(); + } +} diff --git a/rust/pspp/src/output/mod.rs b/rust/pspp/src/output/mod.rs index 5417de68b7..28ab4efdd6 100644 --- a/rust/pspp/src/output/mod.rs +++ b/rust/pspp/src/output/mod.rs @@ -22,6 +22,7 @@ use std::{ use enum_map::EnumMap; use pivot::PivotTable; +use serde::Serialize; use crate::{ message::Diagnostic, @@ -34,6 +35,7 @@ pub mod cairo; pub mod csv; pub mod driver; pub mod html; +pub mod json; pub mod page; pub mod pivot; pub mod render; @@ -43,6 +45,7 @@ pub mod text; pub mod text_line; /// 
A single output item. +#[derive(Serialize)] pub struct Item { /// The localized label for the item that appears in the outline pane in the /// output viewer and in PDF outlines. This is `None` if no label has been @@ -94,6 +97,7 @@ where } } +#[derive(Serialize)] pub enum Details { Chart, Image, @@ -144,6 +148,18 @@ impl Details { } } +impl FromIterator for Details +where + A: Into>, +{ + fn from_iter(iter: T) -> Self + where + T: IntoIterator, + { + Self::Group(iter.into_iter().map(|value| value.into()).collect()) + } +} + impl From for Details { fn from(value: Diagnostic) -> Self { Self::Message(Box::new(value)) @@ -180,7 +196,7 @@ impl From> for Details { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct Text { type_: TextType, @@ -188,10 +204,10 @@ pub struct Text { } impl Text { - pub fn new_log(s: impl Into) -> Self { + pub fn new_log(value: impl Into) -> Self { Self { type_: TextType::Log, - content: Value::new_user_text(s), + content: value.into(), } } } @@ -228,7 +244,8 @@ impl From<&Diagnostic> for Text { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] pub enum TextType { /// `TITLE` and `SUBTITLE` commands. PageTitle, diff --git a/rust/pspp/src/output/page.rs b/rust/pspp/src/output/page.rs index ccebb8bcd5..4240b8d9f5 100644 --- a/rust/pspp/src/output/page.rs +++ b/rust/pspp/src/output/page.rs @@ -14,29 +14,38 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . 
-use std::path::PathBuf; - use enum_map::{enum_map, EnumMap}; +use serde::{Deserialize, Serialize}; use super::pivot::{Axis2, HorzAlign}; -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] pub enum Orientation { #[default] Portrait, Landscape, } -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] +/// Chart size. +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] pub enum ChartSize { + /// Size specified in the chart itself. #[default] AsIs, + + /// Full page. FullHeight, + + /// Half-page. HalfHeight, + + /// Quarter-page. QuarterHeight, } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct Paragraph { pub markup: String, pub horz_align: HorzAlign, @@ -51,10 +60,13 @@ impl Default for Paragraph { } } -#[derive(Clone, Debug, Default, PartialEq)] +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] pub struct Heading(pub Vec); -pub struct Setup { +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default)] +pub struct PageSetup { + /// Page number of first page. pub initial_page_number: i32, /// Paper size in inches. @@ -63,20 +75,20 @@ pub struct Setup { /// Margin width in inches. pub margins: EnumMap, + /// Portrait or landscape. pub orientation: Orientation, /// Space between objects, in inches. pub object_spacing: f64, + /// Size of charts. pub chart_size: ChartSize, /// Header and footer. 
pub headings: [Heading; 2], - - file_name: Option, } -impl Default for Setup { +impl Default for PageSetup { fn default() -> Self { Self { initial_page_number: 1, @@ -86,12 +98,11 @@ impl Default for Setup { object_spacing: 12.0 / 72.0, chart_size: Default::default(), headings: Default::default(), - file_name: None, } } } -impl Setup { +impl PageSetup { pub fn printable_size(&self) -> EnumMap { EnumMap::from_fn(|axis| self.paper[axis] - self.margins[axis][0] - self.margins[axis][1]) } diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index 5d94390b30..b3541f1264 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -56,22 +56,25 @@ use binrw::Error as BinError; use chrono::NaiveDateTime; pub use color::ParseError as ParseColorError; use color::{palette::css::TRANSPARENT, AlphaColor, Rgba8, Srgb}; -use encoding_rs::{Encoding, UTF_8}; use enum_iterator::Sequence; use enum_map::{enum_map, Enum, EnumMap}; use look_xml::TableProperties; use quick_xml::{de::from_str, DeError}; -use serde::{de::Visitor, Deserialize}; +use serde::{ + de::Visitor, + ser::{SerializeMap, SerializeStruct}, + Deserialize, Serialize, Serializer, +}; use smallstr::SmallString; use smallvec::SmallVec; use thiserror::Error as ThisError; use tlo::parse_tlo; use crate::{ - data::Datum, - dictionary::{VarType, Variable}, + data::{ByteString, Datum, EncodedString, RawString}, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, settings::{Settings, Show}, + variable::{VarType, Variable}, }; pub mod output; @@ -103,6 +106,31 @@ pub enum Area { Layers, } +impl Display for Area { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Area::Title => write!(f, "title"), + Area::Caption => write!(f, "caption"), + Area::Footer => write!(f, "footer"), + Area::Corner => write!(f, "corner"), + Area::Labels(axis2) => write!(f, "labels({axis2})"), + Area::Data => write!(f, "data"), + 
Area::Layers => write!(f, "layers"), + } + } +} + +impl Serialize for Area { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = SmallString::<[u8; 16]>::new(); + write!(&mut s, "{}", self).unwrap(); + serializer.serialize_str(&s) + } +} + impl Area { fn default_cell_style(self) -> CellStyle { use HorzAlign::*; @@ -188,8 +216,34 @@ impl Border { } } +impl Display for Border { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Border::Title => write!(f, "title"), + Border::OuterFrame(box_border) => write!(f, "outer_frame({box_border})"), + Border::InnerFrame(box_border) => write!(f, "inner_frame({box_border})"), + Border::Dimension(row_col_border) => write!(f, "dimension({row_col_border})"), + Border::Category(row_col_border) => write!(f, "category({row_col_border})"), + Border::DataLeft => write!(f, "data(left)"), + Border::DataTop => write!(f, "data(top)"), + } + } +} + +impl Serialize for Border { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = SmallString::<[u8; 32]>::new(); + write!(&mut s, "{}", self).unwrap(); + serializer.serialize_str(&s) + } +} + /// The borders on a box. -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] pub enum BoxBorder { Left, Top, @@ -197,8 +251,26 @@ pub enum BoxBorder { Bottom, } +impl BoxBorder { + fn as_str(&self) -> &'static str { + match self { + BoxBorder::Left => "left", + BoxBorder::Top => "top", + BoxBorder::Right => "right", + BoxBorder::Bottom => "bottom", + } + } +} + +impl Display for BoxBorder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + /// Borders between rows and columns. 
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] pub struct RowColBorder( /// Row or column headings. pub HeadingRegion, @@ -206,11 +278,17 @@ pub struct RowColBorder( pub Axis2, ); +impl Display for RowColBorder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}", self.0, self.1) + } +} + /// Sizing for rows or columns of a rendered table. /// /// The comments below talk about columns and their widths but they apply /// equally to rows and their heights. -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, Serialize)] pub struct Sizing { /// Specific column widths, in 1/96" units. widths: Vec, @@ -223,7 +301,8 @@ pub struct Sizing { keeps: Vec>, } -#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Sequence)] +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Sequence, Serialize)] +#[serde(rename_all = "snake_case")] pub enum Axis3 { X, Y, @@ -250,7 +329,7 @@ impl From for Axis3 { } /// An axis within a pivot table. -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Serialize)] pub struct Axis { /// `dimensions[0]` is the innermost dimension. pub dimensions: Vec, @@ -304,7 +383,7 @@ impl PivotTable { format, honor_small: class == Class::Other, value: number, - var_name: None, + variable: None, value_label: None, })); self.insert(data_indexes, value); @@ -339,7 +418,7 @@ impl PivotTable { /// (A dimension or a group can contain zero categories, but this is unusual. /// If a dimension contains no categories, then its table cannot contain any /// data.) -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct Dimension { /// Hierarchy of categories within the dimension. The groups and categories /// are sorted in the order that should be used for display. 
This might be @@ -400,8 +479,9 @@ impl Dimension { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct Group { + #[serde(skip)] len: usize, pub name: Box, @@ -416,11 +496,15 @@ pub struct Group { } impl Group { - pub fn new(name: impl Into) -> Group { + pub fn new(name: impl Into) -> Self { + Self::with_capacity(name, 0) + } + + pub fn with_capacity(name: impl Into, capacity: usize) -> Self { Self { len: 0, name: Box::new(name.into()), - children: Vec::new(), + children: Vec::with_capacity(capacity), show_label: false, } } @@ -505,7 +589,7 @@ where } } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Serialize)] pub struct Footnotes(pub Vec>); impl Footnotes { @@ -540,6 +624,15 @@ impl Leaf { } } +impl Serialize for Leaf { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.name.serialize(serializer) + } +} + /// Pivot result classes. /// /// These are used to mark [Leaf] categories as having particular types of data, @@ -556,7 +649,7 @@ pub enum Class { } /// A pivot_category is a leaf (a category) or a group. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub enum Category { Group(Group), Leaf(Leaf), @@ -641,12 +734,24 @@ impl From<&str> for Category { } } +impl From for Category { + fn from(name: String) -> Self { + Self::Leaf(Leaf::new(Value::new_text(name))) + } +} + +impl From<&String> for Category { + fn from(name: &String) -> Self { + Self::Leaf(Leaf::new(Value::new_text(name))) + } +} + /// Styling for a pivot table. /// /// The division between this and the style information in [PivotTable] seems /// fairly arbitrary. The ultimate reason for the division is simply because /// that's how SPSS documentation and file formats do it. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct Look { pub name: Option, @@ -777,7 +882,7 @@ impl Look { } /// Position for group labels. 
-#[derive(Copy, Clone, Debug, Default, Deserialize, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub enum LabelPosition { /// Hierarachically enclosing the categories. /// @@ -828,12 +933,28 @@ pub enum LabelPosition { /// │ │ │ /// └──────────────────┴─────────────────────────────────────────────────┘ /// ``` -#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum, Serialize)] +#[serde(rename_all = "snake_case")] pub enum HeadingRegion { Rows, Columns, } +impl HeadingRegion { + pub fn as_str(&self) -> &'static str { + match self { + HeadingRegion::Rows => "rows", + HeadingRegion::Columns => "columns", + } + } +} + +impl Display for HeadingRegion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + impl From for HeadingRegion { fn from(axis: Axis2) -> Self { match axis { @@ -843,13 +964,13 @@ impl From for HeadingRegion { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct AreaStyle { pub cell_style: CellStyle, pub font_style: FontStyle, } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct CellStyle { /// `None` means "mixed" alignment: align strings to the left, numbers to /// the right. @@ -865,7 +986,8 @@ pub struct CellStyle { pub margins: EnumMap, } -#[derive(Copy, Clone, Debug, PartialEq)] +#[derive(Copy, Clone, Debug, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] pub enum HorzAlign { /// Right aligned. Right, @@ -895,7 +1017,8 @@ impl HorzAlign { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] pub enum VertAlign { /// Top alignment. 
Top, @@ -907,7 +1030,7 @@ pub enum VertAlign { Bottom, } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct FontStyle { pub bold: bool, pub italic: bool, @@ -1001,6 +1124,17 @@ impl FromStr for Color { } } +impl Serialize for Color { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = SmallString::<[u8; 32]>::new(); + write!(&mut s, "{}", self.display_css()).unwrap(); + serializer.serialize_str(&s) + } +} + impl<'de> Deserialize<'de> for Color { fn deserialize(deserializer: D) -> Result where @@ -1048,6 +1182,18 @@ pub struct BorderStyle { pub color: Color, } +impl Serialize for BorderStyle { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = serializer.serialize_struct("BorderStyle", 2)?; + s.serialize_field("stroke", &self.stroke)?; + s.serialize_field("color", &self.color)?; + s.end() + } +} + impl BorderStyle { pub const fn none() -> Self { Self { @@ -1071,7 +1217,7 @@ impl BorderStyle { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Enum, Deserialize)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Enum, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub enum Stroke { None, @@ -1096,7 +1242,8 @@ impl Stroke { } /// An axis of a 2-dimensional table. 
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] pub enum Axis2 { X, Y, @@ -1106,6 +1253,19 @@ impl Axis2 { pub fn new_enum(x: T, y: T) -> EnumMap { EnumMap::from_array([x, y]) } + + pub fn as_str(&self) -> &'static str { + match self { + Axis2::X => "x", + Axis2::Y => "y", + } + } +} + +impl Display for Axis2 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } } impl Not for Axis2 { @@ -1237,7 +1397,7 @@ impl IndexMut for Rect2 { } } -#[derive(Copy, Clone, Debug, Default, Deserialize, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] #[serde(rename_all = "camelCase")] pub enum FootnoteMarkerType { /// a, b, c, ... @@ -1248,7 +1408,7 @@ pub enum FootnoteMarkerType { Numeric, } -#[derive(Copy, Clone, Debug, Default, Deserialize, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] #[serde(rename_all = "camelCase")] pub enum FootnoteMarkerPosition { /// Subscripts. 
@@ -1310,7 +1470,7 @@ impl IntoValueOptions for ValueOptions { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct PivotTable { pub look: Arc, @@ -1373,20 +1533,20 @@ impl PivotTable { self } - pub fn with_caption(mut self, caption: Value) -> Self { - self.caption = Some(Box::new(caption)); + pub fn with_caption(mut self, caption: impl Into) -> Self { + self.caption = Some(Box::new(caption.into())); self.show_caption = true; self } - pub fn with_corner_text(mut self, corner_text: Value) -> Self { - self.corner_text = Some(Box::new(corner_text)); + pub fn with_corner_text(mut self, corner_text: impl Into) -> Self { + self.corner_text = Some(Box::new(corner_text.into())); self } - pub fn with_subtype(self, subtype: Value) -> Self { + pub fn with_subtype(self, subtype: impl Into) -> Self { Self { - subtype: Some(Box::new(subtype)), + subtype: Some(Box::new(subtype.into())), ..self } } @@ -1515,10 +1675,10 @@ where } impl PivotTable { - pub fn new(dimensions_and_axes: impl IntoIterator) -> Self { + pub fn new(axes_and_dimensions: impl IntoIterator) -> Self { let mut dimensions = Vec::new(); let mut axes = EnumMap::::default(); - for (axis, dimension) in dimensions_and_axes { + for (axis, dimension) in axes_and_dimensions { axes[axis].dimensions.push(dimensions.len()); dimensions.push(dimension); } @@ -1665,8 +1825,9 @@ where } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct Footnote { + #[serde(skip)] index: usize, pub content: Box, pub marker: Option>, @@ -1725,23 +1886,42 @@ impl Display for DisplayMarker<'_> { } else { let i = self.footnote.index + 1; match self.options.footnote_marker_type { - FootnoteMarkerType::Alphabetic => write!(f, "{}", Display26Adic(i)), + FootnoteMarkerType::Alphabetic => write!(f, "{}", Display26Adic::new_lowercase(i)), FootnoteMarkerType::Numeric => write!(f, "{i}"), } } } } -pub struct Display26Adic(pub usize); +/// Displays a number in 26adic notation. 
+/// +/// Zero is displayed as the empty string, 1 through 26 as `a` through `z`, 27 +/// through 52 as `aa` through `az`, and so on. +pub struct Display26Adic { + value: usize, + base: u8, +} + +impl Display26Adic { + /// Constructs a `Display26Adic` for `value`, with letters in lowercase. + pub fn new_lowercase(value: usize) -> Self { + Self { value, base: b'a' } + } + + /// Constructs a `Display26Adic` for `value`, with letters in uppercase. + pub fn new_uppercase(value: usize) -> Self { + Self { value, base: b'A' } + } +} impl Display for Display26Adic { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let mut output = SmallVec::<[u8; 16]>::new(); - let mut number = self.0; + let mut number = self.value; while number > 0 { number -= 1; let digit = (number % 26) as u8; - output.push(digit + b'a'); + output.push(digit + self.base); number /= 26; } output.reverse(); @@ -1794,7 +1974,34 @@ pub struct Value { pub styling: Option>, } +impl Serialize for Value { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.inner.serialize(serializer) + } +} + +/// Wrapper for [Value] that uses [Value::serialize_bare] for serialization. 
+#[derive(Serialize)] +struct BareValue<'a>(#[serde(serialize_with = "Value::serialize_bare")] pub &'a Value); + impl Value { + pub fn serialize_bare(&self, serializer: S) -> Result + where + S: Serializer, + { + match &self.inner { + ValueInner::Number(number_value) => number_value.serialize_bare(serializer), + ValueInner::String(string_value) => string_value.s.serialize(serializer), + ValueInner::Variable(variable_value) => variable_value.var_name.serialize(serializer), + ValueInner::Text(text_value) => text_value.localized.serialize(serializer), + ValueInner::Template(template_value) => template_value.localized.serialize(serializer), + ValueInner::Empty => serializer.serialize_none(), + } + } + fn new(inner: ValueInner) -> Self { Self { inner, @@ -1807,7 +2014,7 @@ impl Value { format, honor_small: false, value: x, - var_name: None, + variable: None, value_label: None, })) } @@ -1818,13 +2025,16 @@ impl Value { variable_label: variable.label.clone(), })) } - pub fn new_datum(value: &Datum, encoding: &'static Encoding) -> Self { + pub fn new_datum(value: &Datum) -> Self + where + B: EncodedString, + { match value { Datum::Number(number) => Self::new_number(*number), - Datum::String(string) => Self::new_user_text(string.decode(encoding).into_owned()), + Datum::String(string) => Self::new_user_text(string.as_str()), } } - pub fn new_variable_value(variable: &Variable, value: &Datum) -> Self { + pub fn new_variable_value(variable: &Variable, value: &Datum) -> Self { let var_name = Some(variable.name.as_str().into()); let value_label = variable.value_labels.get(value).map(String::from); match value { @@ -1842,13 +2052,16 @@ impl Value { }, honor_small: false, value: *number, - var_name, + variable: var_name, value_label, })), Datum::String(string) => Self::new(ValueInner::String(StringValue { show: None, hex: variable.print_format.type_() == Type::AHex, - s: string.decode(variable.encoding).into_owned(), + s: string + .as_ref() + .with_encoding(variable.encoding()) + 
.into_string(), var_name, value_label, })), @@ -1870,9 +2083,9 @@ impl Value { } else { Self::new(ValueInner::Text(TextValue { user_provided: true, - local: s.clone(), - c: s.clone(), - id: s.clone(), + localized: s.clone(), + c: None, + id: None, })) } } @@ -2164,7 +2377,12 @@ impl Display for DisplayValue<'_> { *format }; let mut buf = SmallString::<[u8; 40]>::new(); - write!(&mut buf, "{}", Datum::Number(*value).display(format, UTF_8)).unwrap(); + write!( + &mut buf, + "{}", + Datum::<&str>::Number(*value).display(format) + ) + .unwrap(); write!(f, "{}", buf.trim_start_matches(' '))?; } if let Some(label) = self.show_label { @@ -2186,7 +2404,9 @@ impl Display for DisplayValue<'_> { } } - ValueInner::Text(TextValue { local, .. }) => { + ValueInner::Text(TextValue { + localized: local, .. + }) => { /* if self .inner @@ -2199,9 +2419,11 @@ impl Display for DisplayValue<'_> { f.write_str(local) } - ValueInner::Template(TemplateValue { args, local, .. }) => { - self.template(f, local, args) - } + ValueInner::Template(TemplateValue { + args, + localized: local, + .. + }) => self.template(f, local, args), ValueInner::Empty => Ok(()), }?; @@ -2239,27 +2461,83 @@ impl Debug for Value { #[derive(Clone, Debug)] pub struct NumberValue { - pub show: Option, + /// The numerical value, or `None` if it is a missing value. 
+ pub value: Option, pub format: Format, + pub show: Option, pub honor_small: bool, - pub value: Option, - pub var_name: Option, + pub variable: Option, pub value_label: Option, } -#[derive(Clone, Debug)] -pub struct StringValue { - pub show: Option, - pub hex: bool, +impl Serialize for NumberValue { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if self.format.type_() == Type::F && self.variable.is_none() && self.value_label.is_none() { + self.value.serialize(serializer) + } else { + let mut s = serializer.serialize_map(None)?; + s.serialize_entry("value", &self.value)?; + s.serialize_entry("format", &self.format)?; + if let Some(show) = self.show { + s.serialize_entry("show", &show)?; + } + if self.honor_small { + s.serialize_entry("honor_small", &self.honor_small)?; + } + if let Some(variable) = &self.variable { + s.serialize_entry("variable", variable)?; + } + if let Some(value_label) = &self.value_label { + s.serialize_entry("value_label", value_label)?; + } + s.end() + } + } +} - /// If `hex` is true, this string should already be hex digits +impl NumberValue { + pub fn serialize_bare(&self, serializer: S) -> Result + where + S: Serializer, + { + if let Some(number) = self.value + && number.trunc() == number + && number >= -(1i64 << 53) as f64 + && number <= (1i64 << 53) as f64 + { + (number as u64).serialize(serializer) + } else { + self.value.serialize(serializer) + } + } +} + +#[derive(Serialize)] +pub struct BareNumberValue<'a>( + #[serde(serialize_with = "NumberValue::serialize_bare")] pub &'a NumberValue, +); + +#[derive(Clone, Debug, Serialize)] +pub struct StringValue { + /// The string value. + /// + /// If `hex` is true, this should contain hex digits, not raw binary data /// (otherwise it would be impossible to encode non-UTF-8 data). pub s: String, + + /// True if `s` is hex digits. 
+ pub hex: bool, + + pub show: Option, + pub var_name: Option, pub value_label: Option, } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct VariableValue { pub show: Option, pub var_name: String, @@ -2270,21 +2548,59 @@ pub struct VariableValue { pub struct TextValue { pub user_provided: bool, /// Localized. - pub local: String, + pub localized: String, /// English. - pub c: String, + pub c: Option, /// Identifier. - pub id: String, + pub id: Option, } -#[derive(Clone, Debug)] +impl Serialize for TextValue { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if self.user_provided && self.c.is_none() && self.id.is_none() { + serializer.serialize_str(&self.localized) + } else { + let mut s = serializer.serialize_struct( + "TextValue", + 2 + self.c.is_some() as usize + self.id.is_some() as usize, + )?; + s.serialize_field("user_provided", &self.user_provided)?; + s.serialize_field("localized", &self.localized)?; + if let Some(c) = &self.c { + s.serialize_field("c", &c)?; + } + if let Some(id) = &self.id { + s.serialize_field("id", &id)?; + } + s.end() + } + } +} + +impl TextValue { + pub fn localized(&self) -> &str { + self.localized.as_str() + } + pub fn c(&self) -> &str { + self.c.as_ref().unwrap_or(&self.localized).as_str() + } + pub fn id(&self) -> &str { + self.id.as_ref().unwrap_or(&self.localized).as_str() + } +} + +#[derive(Clone, Debug, Serialize)] pub struct TemplateValue { pub args: Vec>, - pub local: String, + pub localized: String, pub id: String, } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Serialize)] +#[serde(rename_all = "snake_case")] pub enum ValueInner { Number(NumberValue), String(StringValue), @@ -2379,3 +2695,166 @@ impl ValueInner { } } } + +pub struct MetadataEntry { + pub name: Value, + pub value: MetadataValue, +} + +pub enum MetadataValue { + Leaf(Value), + Group(Vec), +} + +impl MetadataEntry { + pub fn into_pivot_table(self) -> PivotTable { + let mut data = 
Vec::new(); + let group = match self.visit(&mut data) { + Category::Group(group) => group, + Category::Leaf(leaf) => Group::new("Metadata").with(leaf).with_label_shown(), + }; + PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data( + data.into_iter() + .enumerate() + .filter(|(_row, value)| !value.is_empty()) + .map(|(row, value)| ([row], value)), + ) + } + fn visit(self, data: &mut Vec) -> Category { + match self.value { + MetadataValue::Leaf(value) => { + data.push(value); + Leaf::new(self.name).into() + } + MetadataValue::Group(items) => Group::with_capacity(self.name, items.len()) + .with_multiple(items.into_iter().map(|item| item.visit(data))) + .into(), + } + } +} + +impl Serialize for MetadataValue { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + MetadataValue::Leaf(value) => value.serialize_bare(serializer), + MetadataValue::Group(items) => { + let mut map = serializer.serialize_map(Some(items.len()))?; + for item in items { + let name = item.name.display(()).to_string(); + map.serialize_entry(&name, &item.value)?; + } + map.end() + } + } + } +} +impl Serialize for MetadataEntry { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match &self.value { + MetadataValue::Leaf(value) => { + let mut map = serializer.serialize_map(Some(1))?; + let name = self.name.display(()).to_string(); + map.serialize_entry(&name, &BareValue(&value))?; + map.end() + } + MetadataValue::Group(items) => { + let mut map = serializer.serialize_map(Some(items.len()))?; + for item in items { + let name = item.name.display(()).to_string(); + map.serialize_entry(&name, &item.value)?; + } + map.end() + } + } + } +} + +#[cfg(test)] +mod tests { + use crate::output::pivot::{Display26Adic, MetadataEntry, MetadataValue, Value}; + + #[test] + fn display_26adic() { + for (number, lowercase, uppercase) in [ + (0, "", ""), + (1, "a", "A"), + (2, "b", "B"), + (26, "z", "Z"), + (27, "aa", "AA"), + (28, 
"ab", "AB"), + (29, "ac", "AC"), + (18278, "zzz", "ZZZ"), + (18279, "aaaa", "AAAA"), + (19010, "abcd", "ABCD"), + ] { + assert_eq!(Display26Adic::new_lowercase(number).to_string(), lowercase); + assert_eq!(Display26Adic::new_uppercase(number).to_string(), uppercase); + } + } + + #[test] + fn metadata_entry() { + let tree = MetadataEntry { + name: Value::from("Group"), + value: MetadataValue::Group(vec![ + MetadataEntry { + name: Value::from("Name 1"), + value: MetadataValue::Leaf(Value::from("Value 1")), + }, + MetadataEntry { + name: Value::from("Subgroup 1"), + value: MetadataValue::Group(vec![ + MetadataEntry { + name: Value::from("Subname 1"), + value: MetadataValue::Leaf(Value::from("Subvalue 1")), + }, + MetadataEntry { + name: Value::from("Subname 2"), + value: MetadataValue::Leaf(Value::from("Subvalue 2")), + }, + MetadataEntry { + name: Value::from("Subname 3"), + value: MetadataValue::Leaf(Value::new_integer(Some(3.0))), + }, + ]), + }, + MetadataEntry { + name: Value::from("Name 2"), + value: MetadataValue::Leaf(Value::from("Value 2")), + }, + ]), + }; + assert_eq!( + serde_json::to_string_pretty(&tree).unwrap(), + r#"{ + "Name 1": "Value 1", + "Subgroup 1": { + "Subname 1": "Subvalue 1", + "Subname 2": "Subvalue 2", + "Subname 3": 3 + }, + "Name 2": "Value 2" +}"# + ); + + assert_eq!( + tree.into_pivot_table().to_string(), + r#"╭────────────────────┬──────────╮ +│ Name 1 │Value 1 │ +├────────────────────┼──────────┤ +│Subgroup 1 Subname 1│Subvalue 1│ +│ Subname 2│Subvalue 2│ +│ Subname 3│ 3│ +├────────────────────┼──────────┤ +│ Name 2 │Value 2 │ +╰────────────────────┴──────────╯ +"# + ); + } +} diff --git a/rust/pspp/src/output/pivot/test.rs b/rust/pspp/src/output/pivot/test.rs index cc532fad10..a69f821532 100644 --- a/rust/pspp/src/output/pivot/test.rs +++ b/rust/pspp/src/output/pivot/test.rs @@ -19,9 +19,9 @@ use std::{fmt::Display, fs::File, path::Path, sync::Arc}; use enum_map::EnumMap; use crate::output::{ - cairo::CairoDriver, + 
cairo::{CairoConfig, CairoDriver}, driver::Driver, - html::HtmlRenderer, + html::HtmlDriver, pivot::{ Area, Axis2, Border, BorderStyle, Class, Color, Dimension, Footnote, FootnoteMarkerPosition, FootnoteMarkerType, Footnotes, Group, HeadingRegion, LabelPosition, @@ -175,18 +175,18 @@ pub fn assert_rendering(name: &str, pivot_table: &PivotTable, expected: &str) { let item = Arc::new(Item::new(Details::Table(Box::new(pivot_table.clone())))); if let Some(dir) = std::env::var_os("PSPP_TEST_HTML_DIR") { let writer = File::create(Path::new(&dir).join(name).with_extension("html")).unwrap(); - HtmlRenderer::new(writer).write(&item); + HtmlDriver::for_writer(writer).write(&item); } let item = Arc::new(Item::new(Details::Table(Box::new(pivot_table.clone())))); if let Some(dir) = std::env::var_os("PSPP_TEST_PDF_DIR") { - let path = Path::new(&dir).join(name).with_extension("pdf"); - CairoDriver::new(path).write(&item); + let config = CairoConfig::new(Path::new(&dir).join(name).with_extension("pdf")); + CairoDriver::new(&config).unwrap().write(&item); } if let Some(dir) = std::env::var_os("PSPP_TEST_SPV_DIR") { let writer = File::create(Path::new(&dir).join(name).with_extension("spv")).unwrap(); - SpvDriver::new(writer).write(&item); + SpvDriver::for_writer(writer).write(&item); } } diff --git a/rust/pspp/src/output/spv.rs b/rust/pspp/src/output/spv.rs index f34090fbca..9df728fec1 100644 --- a/rust/pspp/src/output/spv.rs +++ b/rust/pspp/src/output/spv.rs @@ -18,8 +18,10 @@ use core::f64; use std::{ borrow::Cow, fmt::Write as _, + fs::File, io::{Cursor, Seek, Write}, iter::{repeat, repeat_n}, + path::PathBuf, sync::Arc, }; @@ -31,6 +33,7 @@ use quick_xml::{ writer::Writer as XmlWriter, ElementWriter, }; +use serde::{Deserialize, Serialize}; use smallstr::SmallString; use zip::{result::ZipResult, write::SimpleFileOptions, ZipWriter}; @@ -38,6 +41,7 @@ use crate::{ format::{Format, Type}, output::{ driver::Driver, + page::{Heading, PageSetup}, pivot::{ Area, AreaStyle, Axis2, 
Axis3, Border, BorderStyle, BoxBorder, Category, CellStyle, Color, Dimension, FontStyle, Footnote, FootnoteMarkerPosition, FootnoteMarkerType, @@ -60,6 +64,15 @@ fn output_viewer_name(heading_id: u64, is_heading: bool) -> String { ) } +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct SpvConfig { + /// Output file name. + pub file: PathBuf, + + /// Page setup. + pub page_setup: Option, +} + pub struct SpvDriver where W: Write + Seek, @@ -68,13 +81,24 @@ where needs_page_break: bool, next_table_id: u64, next_heading_id: u64, + page_setup: Option, +} + +impl SpvDriver { + pub fn new(config: &SpvConfig) -> std::io::Result { + let mut driver = Self::for_writer(File::create(&config.file)?); + if let Some(page_setup) = &config.page_setup { + driver = driver.with_page_setup(page_setup.clone()); + } + Ok(driver) + } } impl SpvDriver where W: Write + Seek, { - pub fn new(writer: W) -> Self { + pub fn for_writer(writer: W) -> Self { let mut writer = ZipWriter::new(writer); writer .start_file("META-INF/MANIFEST.MF", SimpleFileOptions::default()) @@ -85,6 +109,14 @@ where needs_page_break: false, next_table_id: 1, next_heading_id: 1, + page_setup: None, + } + } + + pub fn with_page_setup(self, page_setup: PageSetup) -> Self { + Self { + page_setup: Some(page_setup), + ..self } } @@ -533,7 +565,9 @@ where .write_inner_content(|w| { w.create_element("label") .write_text_content(BytesText::new("Output"))?; - // XXX page setup + if let Some(page_setup) = self.page_setup.take() { + write_page_setup(&page_setup, w)?; + } self.write_item(item, w); Ok(()) }) @@ -552,6 +586,80 @@ where } } +fn write_page_setup(page_setup: &PageSetup, writer: &mut XmlWriter) -> std::io::Result<()> +where + X: Write, +{ + fn inches<'a>(x: f64) -> Cow<'a, str> { + Cow::from(format!("{:.2}in", x)) + } + + writer + .create_element("vps:pageSetup") + .with_attribute(( + "initial-page-number", + Cow::from(format!("{}", page_setup.initial_page_number)), + )) + .with_attribute(( + "chart-size", + 
match page_setup.chart_size { + super::page::ChartSize::AsIs => "as-is", + super::page::ChartSize::FullHeight => "full-height", + super::page::ChartSize::HalfHeight => "half-height", + super::page::ChartSize::QuarterHeight => "quarter-height", + }, + )) + .with_attribute(("margin-left", inches(page_setup.margins[Axis2::X][0]))) + .with_attribute(("margin-right", inches(page_setup.margins[Axis2::X][1]))) + .with_attribute(("margin-top", inches(page_setup.margins[Axis2::Y][0]))) + .with_attribute(("margin-bottom", inches(page_setup.margins[Axis2::Y][1]))) + .with_attribute(("paper-height", inches(page_setup.paper[Axis2::Y]))) + .with_attribute(("paper-width", inches(page_setup.paper[Axis2::X]))) + .with_attribute(( + "reference-orientation", + match page_setup.orientation { + crate::output::page::Orientation::Portrait => "portrait", + crate::output::page::Orientation::Landscape => "landscape", + }, + )) + .with_attribute(( + "space-after", + Cow::from(format!("{:.1}pt", page_setup.object_spacing * 72.0)), + )) + .write_inner_content(|w| { + write_page_heading(&page_setup.headings[0], "vps:pageHeader", w)?; + write_page_heading(&page_setup.headings[1], "vps:pageFooter", w)?; + Ok(()) + })?; + Ok(()) +} + +fn write_page_heading( + heading: &Heading, + name: &str, + writer: &mut XmlWriter, +) -> std::io::Result<()> +where + X: Write, +{ + let element = writer.create_element(name); + if !heading.0.is_empty() { + element.write_inner_content(|w| { + w.create_element("vps:pageParagraph") + .write_inner_content(|w| { + for paragraph in &heading.0 { + w.create_element("vtx:text") + .with_attribute(("text", "title")) + .write_text_content(BytesText::new(¶graph.markup))?; + } + Ok(()) + })?; + Ok(()) + })?; + } + Ok(()) +} + fn maybe_with_attribute<'a, 'b, W, I>( element: ElementWriter<'a, W>, attr: Option, @@ -899,7 +1007,7 @@ where } } -struct Zeros(usize); +pub struct Zeros(pub usize); impl BinWrite for Zeros { type Args<'a> = (); @@ -1170,13 +1278,13 @@ impl BinWrite for 
Value { format: number.format, honor_small: number.honor_small, }; - if number.var_name.is_some() || number.value_label.is_some() { + if number.variable.is_some() || number.value_label.is_some() { ( 2u8, ValueMod::new(self), format, - number.value.unwrap_or(-f64::MAX), - SpvString::optional(&number.var_name), + number.value.unwrap_or(f64::MIN), + SpvString::optional(&number.variable), SpvString::optional(&number.value_label), Show::as_spv(&number.show), ) @@ -1186,7 +1294,7 @@ impl BinWrite for Value { 1u8, ValueMod::new(self), format, - number.value.unwrap_or(-f64::MAX), + number.value.unwrap_or(f64::MIN), ) .write_options(writer, endian, args)?; } @@ -1223,10 +1331,10 @@ impl BinWrite for Value { ValueInner::Text(text) => { ( 3u8, - SpvString(&text.local), + SpvString(&text.localized), ValueMod::new(self), - SpvString(&text.id), - SpvString(&text.c), + SpvString(text.id()), + SpvString(text.c()), SpvBool(true), ) .write_options(writer, endian, args)?; @@ -1235,7 +1343,7 @@ impl BinWrite for Value { ( 0u8, ValueMod::new(self), - SpvString(&template.local), + SpvString(&template.localized), template.args.len() as u32, ) .write_options(writer, endian, args)?; diff --git a/rust/pspp/src/output/text.rs b/rust/pspp/src/output/text.rs index 7abe32851a..1a5aad6d8e 100644 --- a/rust/pspp/src/output/text.rs +++ b/rust/pspp/src/output/text.rs @@ -20,10 +20,12 @@ use std::{ fs::File, io::{BufWriter, Write as IoWrite}, ops::{Index, Range}, + path::PathBuf, sync::{Arc, LazyLock}, }; use enum_map::{enum_map, Enum, EnumMap}; +use serde::{Deserialize, Serialize}; use unicode_linebreak::{linebreaks, BreakOpportunity}; use unicode_width::UnicodeWidthStr; @@ -38,7 +40,8 @@ use super::{ Details, Item, }; -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] pub enum Boxes { Ascii, #[default] @@ -54,28 +57,29 @@ impl Boxes { } } -#[derive(Clone, Debug)] -pub struct TextRendererConfig { +#[derive(Clone, Debug, 
Deserialize, Serialize)] +pub struct TextConfig { + /// Output file name. + file: Option, + + /// Renderer config. + #[serde(flatten)] + options: TextRendererOptions, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[serde(default)] +pub struct TextRendererOptions { /// Enable bold and underline in output? pub emphasis: bool, /// Page width. - pub width: usize, + pub width: Option, /// ASCII or Unicode pub boxes: Boxes, } -impl Default for TextRendererConfig { - fn default() -> Self { - Self { - emphasis: false, - width: usize::MAX, - boxes: Boxes::default(), - } - } -} - pub struct TextRenderer { /// Enable bold and underline in output? emphasis: bool, @@ -95,20 +99,21 @@ pub struct TextRenderer { impl Default for TextRenderer { fn default() -> Self { - Self::new(&TextRendererConfig::default()) + Self::new(&TextRendererOptions::default()) } } impl TextRenderer { - pub fn new(config: &TextRendererConfig) -> Self { + pub fn new(config: &TextRendererOptions) -> Self { + let width = config.width.unwrap_or(usize::MAX); Self { emphasis: config.emphasis, - width: config.width, + width, min_hbreak: 20, box_chars: config.boxes.box_chars(), n_objects: 0, params: Params { - size: Coord2::new(config.width, usize::MAX), + size: Coord2::new(width, usize::MAX), font_size: EnumMap::from_fn(|_| 1), line_widths: EnumMap::from_fn(|stroke| if stroke == Stroke::None { 0 } else { 1 }), px_size: None, @@ -358,11 +363,14 @@ pub struct TextDriver { } impl TextDriver { - pub fn new(file: File) -> TextDriver { - Self { - file: BufWriter::new(file), - renderer: TextRenderer::default(), - } + pub fn new(config: &TextConfig) -> std::io::Result { + Ok(Self { + file: BufWriter::new(match &config.file { + Some(file) => File::create(&file)?, + None => File::options().write(true).open("/dev/stdout")?, + }), + renderer: TextRenderer::new(&config.options), + }) } } diff --git a/rust/pspp/src/settings.rs b/rust/pspp/src/settings.rs index f4678b1236..aac4a4c8a8 100644 --- 
a/rust/pspp/src/settings.rs +++ b/rust/pspp/src/settings.rs @@ -16,10 +16,11 @@ use std::sync::{Arc, OnceLock}; +use binrw::Endian; use enum_map::EnumMap; +use serde::Serialize; use crate::{ - endian::Endian, format::{Format, Settings as FormatSettings}, message::Severity, output::pivot::Look, @@ -27,7 +28,8 @@ use crate::{ /// Whether to show variable or value labels or the underlying value or variable /// name. -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] pub enum Show { /// Value (or variable name) only. Value, diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 9c5b7e0ab5..c4f5878206 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -16,8 +16,9 @@ use std::{ collections::BTreeMap, + fmt::{Debug, Display}, fs::File, - io::{Read, Seek}, + io::{BufRead, BufReader, Read, Seek}, ops::Range, path::Path, }; @@ -25,35 +26,39 @@ use std::{ use crate::{ calendar::date_time_to_pspp, crypto::EncryptedFile, - data::{Datum, RawString}, + data::{ByteString, Case, Datum, MutRawString, RawString}, dictionary::{ - Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet, - MultipleResponseType, VarWidth, Variable, VariableSet, + DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, MrSetError, + MultipleResponseType, }, - endian::Endian, format::{Error as FormatError, Format, UncheckedFormat}, hexfloat::HexFloat, - identifier::{ByIdentifier, Error as IdError, Identifier}, - output::pivot::{Group, Value}, - sys::raw::{ - self, infer_encoding, - records::{ - Compression, DocumentRecord, EncodingRecord, Extension, FileAttributesRecord, - FileHeader, FloatInfoRecord, IntegerInfoRecord, LongName, LongNamesRecord, - LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord, - NumberOfCasesRecord, ProductInfoRecord, RawFormat, ValueLabel, ValueLabelRecord, - 
VarDisplayRecord, VariableAttributesRecord, VariableRecord, VariableSetRecord, - VeryLongStringsRecord, + identifier::{Error as IdError, Identifier}, + output::pivot::{Axis3, Dimension, Group, PivotTable, Value}, + sys::{ + raw::{ + self, infer_encoding, + records::{ + Compression, DocumentRecord, EncodingRecord, Extension, FileAttributesRecord, + FileHeader, FloatInfoRecord, IntegerInfoRecord, LongName, LongNamesRecord, + LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord, + NumberOfCasesRecord, ProductInfoRecord, RawFormat, ValueLabel, ValueLabelRecord, + VarDisplayRecord, VariableAttributesRecord, VariableRecord, VariableSetRecord, + VeryLongStringsRecord, + }, + DecodedRecord, RawCases, RawDatum, RawWidth, Reader, }, - Cases, DecodedRecord, RawDatum, RawWidth, Reader, + serialize_endian, }, + variable::{InvalidRole, MissingValues, MissingValuesError, VarType, VarWidth, Variable}, }; use anyhow::{anyhow, Error as AnyError}; -use binrw::io::BufReader; +use binrw::{BinRead, BinWrite, Endian}; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; -use encoding_rs::Encoding; +use encoding_rs::{Encoding, UTF_8}; use indexmap::set::MutableValues; use itertools::Itertools; +use serde::Serialize; use thiserror::Error as ThisError; /// A warning for decoding [Records] into a [SystemFile]. @@ -191,12 +196,9 @@ pub enum Error { Identifier, ), - /// Multiple response set {0} contains both string and numeric variables. - #[error("Multiple response set {0} contains both string and numeric variables.")] - MixedMrSet( - /// Multiple response set name. - Identifier, - ), + /// Error adding multiple response set. + #[error("{0}")] + MrSetError(#[from] MrSetError), /// Invalid numeric format for counted value {number} in multiple response set {mr_set}. 
#[error( @@ -209,21 +211,6 @@ pub enum Error { number: String, }, - /// Counted value {value} has width {width}, but it must be no wider than - /// {max_width}, the width of the narrowest variable in multiple response - /// set {mr_set}. - #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] - TooWideMDGroupCountedValue { - /// Multiple response set name. - mr_set: Identifier, - /// Counted value. - value: String, - /// Width of counted value. - width: usize, - /// Maximum allowed width of counted value. - max_width: u16, - }, - /// Ignoring long string value label for unknown variable {0}. #[error("Ignoring long string value label for unknown variable {0}.")] UnknownLongStringValueLabelVariable( @@ -476,8 +463,11 @@ pub enum Error { } /// Options for reading a system file. -#[derive(Default, Clone, Debug)] -pub struct ReaderOptions { +#[derive(Clone, Debug)] +pub struct ReadOptions { + /// Function called to report warnings. + pub warn: F, + /// Character encoding for text in the system file. /// /// If not set, the character encoding will be determined from reading the @@ -493,11 +483,15 @@ pub struct ReaderOptions { pub password: Option, } -impl ReaderOptions { - /// Construct a new `ReaderOptions` that initially does not specify an - /// encoding or password. - pub fn new() -> Self { - Self::default() +impl ReadOptions { + /// Construct a new `ReadOptions` that reports warnings by calling `warn` + /// and initially does not specify an encoding or password. + pub fn new(warn: F) -> Self { + Self { + warn, + encoding: None, + password: None, + } } /// Causes the file to be read using the specified `encoding`, or with a @@ -512,47 +506,56 @@ impl ReaderOptions { Self { password, ..self } } - /// Opens the file at `path`, reporting warnings using `warn`. - pub fn open_file(self, path: P, warn: F) -> Result + /// Opens the file at `path`. 
+ pub fn open_file

(mut self, path: P) -> Result where P: AsRef, F: FnMut(AnyError), { let file = File::open(path)?; - if self.password.is_some() { + if let Some(password) = self.password.take() { // Don't create `BufReader`, because [EncryptedReader] will buffer. - self.open_reader(file, warn) + self.open_reader_encrypted(file, password) } else { - self.open_reader(BufReader::new(file), warn) + Self::open_reader_inner(BufReader::new(file), self.encoding, self.warn) } } - /// Opens the file read from `reader`, reporting warnings using `warn`. - pub fn open_reader(self, reader: R, warn: F) -> Result + /// Opens the file read from `reader`. + fn open_reader_encrypted(self, reader: R, password: String) -> Result where R: Read + Seek + 'static, F: FnMut(AnyError), { - if let Some(password) = &self.password { - Self::open_reader_inner( - EncryptedFile::new(reader)? - .unlock(password.as_bytes()) - .map_err(|_| anyhow!("Incorrect password."))?, - self.encoding, - warn, - ) + Self::open_reader_inner( + EncryptedFile::new(reader)? + .unlock(password.as_bytes()) + .map_err(|_| anyhow!("Incorrect password."))?, + self.encoding, + self.warn, + ) + } + + /// Opens the file read from `reader`. 
+ pub fn open_reader(mut self, reader: R) -> Result + where + R: BufRead + Seek + 'static, + F: FnMut(AnyError), + { + if let Some(password) = self.password.take() { + self.open_reader_encrypted(reader, password) } else { - Self::open_reader_inner(reader, self.encoding, warn) + Self::open_reader_inner(reader, self.encoding, self.warn) } } - fn open_reader_inner( + fn open_reader_inner( reader: R, encoding: Option<&'static Encoding>, mut warn: F, ) -> Result where - R: Read + Seek + 'static, + R: BufRead + Seek + 'static, F: FnMut(AnyError), { let mut reader = Reader::new(reader, |warning| warn(warning.into()))?; @@ -594,6 +597,16 @@ impl SystemFile { pub fn into_parts(self) -> (Dictionary, Metadata, Cases) { (self.dictionary, self.metadata, self.cases) } + + /// Converts this system file reader into one encoded in UTF-8. + pub fn into_unicode(mut self) -> Self { + self.dictionary.codepage_to_unicode(); + Self { + dictionary: self.dictionary, + metadata: self.metadata, + cases: self.cases.into_unicode(), + } + } } /// Decoded records in a system file, arranged by type. 
@@ -754,7 +767,7 @@ impl Records { pub fn decode( mut self, header: FileHeader, - mut cases: Cases, + mut cases: RawCases, encoding: &'static Encoding, mut warn: impl FnMut(Error), ) -> SystemFile { @@ -792,7 +805,7 @@ impl Records { .collect(); if let Some(integer_info) = self.integer_info.first() { - let floating_point_rep = integer_info.floating_point_rep; + let floating_point_rep = integer_info.inner.floating_point_rep; if floating_point_rep != 1 { warn(Error::UnexpectedFloatFormat(floating_point_rep)) } @@ -801,7 +814,7 @@ impl Records { Endian::Big => 1, Endian::Little => 2, }; - let actual = integer_info.endianness; + let actual = integer_info.inner.endianness; if actual != expected { warn(Error::UnexpectedEndianess { actual, expected }); } @@ -834,7 +847,7 @@ impl Records { && self .integer_info .get(0) - .is_none_or(|info| info.version.0 != 13) + .is_none_or(|info| info.inner.version.0 != 13) { warn(Error::WrongVariablePositions { actual: n_vars, @@ -892,7 +905,10 @@ impl Records { variable.label = input.label.clone(); - variable.missing_values = input.missing_values.clone(); + variable + .missing_values_mut() + .replace(input.missing_values.decode(encoding).unwrap()) + .unwrap(); variable.print_format = decode_format( input.print_format, @@ -949,20 +965,17 @@ impl Records { }); } else { let (var_index, dict_index) = var_index_map.range(..=&index).last().unwrap(); - let variable = &dictionary.variables[*dict_index]; if *var_index == index { - if variable.is_numeric() { - dictionary.weight = Some(*dict_index); - } else { + if dictionary.set_weight(Some(*dict_index)).is_err() { warn(Error::InvalidWeightVar { index: weight_index, - name: variable.name.clone(), + name: dictionary.variables[*dict_index].name.clone(), }); } } else { warn(Error::WeightIndexStringContinuation { index: weight_index, - name: variable.name.clone(), + name: dictionary.variables[*dict_index].name.clone(), }); } } @@ -1017,7 +1030,8 @@ impl Records { .map(|value| { value 
.decode(variable.width) - .display(variable.print_format, variable.encoding) + .as_encoded(variable.encoding()) + .display(variable.print_format) .with_trimming() .with_quoted_string() .to_string() @@ -1054,9 +1068,9 @@ impl Records { .iter() .flat_map(|record| record.sets.iter()) { - match MultipleResponseSet::decode(&dictionary, record, &mut warn) { + match DictIndexMultipleResponseSet::decode(&dictionary, record, &mut warn) { Ok(mrset) => { - dictionary.mrsets.insert(ByIdentifier::new(mrset)); + dictionary.mrsets_mut().insert(mrset).unwrap(); } Err(error) => warn(error), } @@ -1073,7 +1087,7 @@ impl Records { continue; }; let width = VarWidth::String(record.length); - let n_segments = width.n_segments(); + let n_segments = width.segments().len(); if n_segments == 1 { warn(Error::ShortVeryLongString { short_name: record.short_name.clone(), @@ -1121,7 +1135,7 @@ impl Records { // converted to lowercase, as the long variable names. for index in 0..dictionary.variables.len() { let lower = dictionary.variables[index].name.0.as_ref().to_lowercase(); - if let Ok(new_name) = Identifier::from_encoding(lower, dictionary.encoding) { + if let Ok(new_name) = Identifier::from_encoding(lower, dictionary.encoding()) { let _ = dictionary.try_rename_var(index, new_name); } } @@ -1237,17 +1251,22 @@ impl Records { .missing_values .into_iter() .map(|v| { - let mut value = RawString::from(v.0.as_slice()); - value.resize(variable.width.as_string_width().unwrap()); - Datum::String(value) + let mut value = ByteString::from(v.0.as_slice()); + let _ = value.resize(variable.width.as_string_width().unwrap()); // XXX check error + Datum::String(value.with_encoding(encoding)) }) .collect::>(); match MissingValues::new(values, None) { - Ok(missing_values) => variable.missing_values = missing_values, + Ok(missing_values) => variable + .missing_values_mut() + .replace(missing_values) + .unwrap(), Err(MissingValuesError::TooWide) => { warn(Error::MissingValuesTooWide(record.var_name.clone())) } 
- Err(MissingValuesError::TooMany) | Err(MissingValuesError::MixedTypes) => { + Err(MissingValuesError::TooMany) + | Err(MissingValuesError::MixedTypes) + | Err(MissingValuesError::SystemMissing) => { unreachable!() } } @@ -1270,11 +1289,11 @@ impl Records { }; variables.push(dict_index); } - let variable_set = VariableSet { + let variable_set = DictIndexVariableSet { name: record.name, variables, }; - dictionary.variable_sets.push(variable_set); + dictionary.add_variable_set(variable_set); } for record in self.other_extension.drain(..) { @@ -1293,15 +1312,81 @@ impl Records { SystemFile { dictionary, metadata, - cases, + cases: Cases::new(encoding, cases), } } } +/// Product version number in a system file. +/// +/// # Example +/// +/// `ProductVersion(1,2,3)` is version 1.2.3. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, BinRead, BinWrite, Serialize)] +pub struct ProductVersion( + /// Major version. + pub i32, + /// Minor version + pub i32, + /// Revision. + pub i32, +); + +impl ProductVersion { + /// This version of PSPP. + pub const VERSION: Self = { + const fn parse_integer(mut s: &[u8]) -> (i32, &[u8]) { + let mut value = 0; + let mut n = 0; + while let Some((c, rest)) = s.split_first() + && *c >= b'0' + && *c <= b'9' + { + value = value * 10 + (*c - b'0') as i32; + n += 1; + s = rest; + } + assert!(n > 0); + (value, s) + } + + const fn skip_dot(s: &[u8]) -> &[u8] { + let Some((c, rest)) = s.split_first() else { + unreachable!() + }; + assert!(*c == b'.'); + rest + } + + // Parse `CARGO_PKG_VERSION`. This could be easier if `const` contexts + // were less restricted. 
+ let s = env!("CARGO_PKG_VERSION").as_bytes(); + let (first, s) = parse_integer(s); + let s = skip_dot(s); + let (second, s) = parse_integer(s); + let s = skip_dot(s); + let (third, s) = parse_integer(s); + assert!(matches!(s.first(), None | Some(b'-' | b'+'))); + Self(first, second, third) + }; +} + +impl Display for ProductVersion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}.{}.{}", self.0, self.1, self.2) + } +} + +impl Debug for ProductVersion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(self, f) + } +} + /// System file metadata that is not part of [Dictionary]. /// /// [Dictionary]: crate::dictionary::Dictionary -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize)] pub struct Metadata { /// Creation date and time. /// @@ -1309,6 +1394,7 @@ pub struct Metadata { pub creation: NaiveDateTime, /// Endianness of integers and floating-point numbers in the file. + #[serde(serialize_with = "serialize_endian")] pub endian: Endian, /// Compression type (if any). @@ -1327,8 +1413,7 @@ pub struct Metadata { /// Version number of the product that wrote the file. /// - /// For example, `(1,2,3)` is version 1.2.3. 
- pub version: Option<(i32, i32, i32)>, + pub version: Option, } impl Metadata { @@ -1403,7 +1488,7 @@ impl Metadata { let product = header .eye_catcher .trim_start_matches("@(#) SPSS DATA FILE") - .trim_end() + .trim() .to_string(); Self { @@ -1413,15 +1498,27 @@ impl Metadata { n_cases: headers .number_of_cases .first() - .map(|record| record.n_cases) + .and_then(|record| record.n_cases) .or_else(|| header.n_cases.map(|n| n as u64)), product, product_ext: headers.product_info.first().map(|pe| fix_line_ends(&pe.0)), - version: headers.integer_info.first().map(|ii| ii.version), + version: headers.integer_info.first().map(|ii| ii.inner.version), } } } +impl From<&Metadata> for PivotTable { + fn from(value: &Metadata) -> Self { + let (group, data) = value.to_pivot_rows(); + PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data( + data.into_iter() + .enumerate() + .filter(|(_row, value)| !value.is_empty()) + .map(|(row, value)| ([row], value)), + ) + } +} + struct Decoder { pub encoding: &'static Encoding, n_generated_names: usize, @@ -1444,7 +1541,7 @@ impl Decoder { } } -impl MultipleResponseSet { +impl DictIndexMultipleResponseSet { fn decode( dictionary: &Dictionary, input: &raw::records::MultipleResponseSet, @@ -1482,21 +1579,18 @@ impl MultipleResponseSet { _ => (), } - let Some((Some(min_width), Some(max_width))) = variables + let Ok(var_type) = variables .iter() - .copied() - .map(|dict_index| dictionary.variables[dict_index].width) - .map(|w| (Some(w), Some(w))) - .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) + .map(|dict_index| VarType::from(dictionary.variables[*dict_index].width)) + .all_equal_value() else { - return Err(Error::MixedMrSet(mr_set_name)); + return Err(MrSetError::MixedMrSet(mr_set_name).into()); }; - let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, min_width)?; + let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, var_type)?; - Ok(MultipleResponseSet 
{ + Ok(DictIndexMultipleResponseSet { name: mr_set_name, - width: min_width..=max_width, label: input.label.to_string(), mr_type, variables, @@ -1548,12 +1642,12 @@ impl MultipleResponseType { fn decode( mr_set: &Identifier, input: &raw::records::MultipleResponseType, - min_width: VarWidth, + var_type: VarType, ) -> Result { match input { raw::records::MultipleResponseType::MultipleDichotomy { value, labels } => { - let value = match min_width { - VarWidth::Numeric => { + let value = match var_type { + VarType::Numeric => { let string = String::from_utf8_lossy(&value.0); let number: f64 = string.trim().parse().map_err(|_| { Error::InvalidMDGroupCountedValue { @@ -1563,21 +1657,10 @@ impl MultipleResponseType { })?; Datum::Number(Some(number)) } - VarWidth::String(max_width) => { - let mut value = value.0.as_slice(); - while value.ends_with(b" ") { - value = &value[..value.len() - 1]; - } - let width = value.len(); - if width > max_width as usize { - return Err(Error::TooWideMDGroupCountedValue { - mr_set: mr_set.clone(), - value: String::from_utf8_lossy(value).into(), - width, - max_width, - }); - }; - Datum::String(value.into()) + VarType::String => { + let mut value = value.clone(); + value.trim_end(); + Datum::String(value) } }; Ok(MultipleResponseType::MultipleDichotomy { @@ -1592,32 +1675,59 @@ impl MultipleResponseType { } } -/* -trait Quoted { - fn quoted(self) -> WithQuotes - where - Self: Display + Sized; +/// Reads cases from a system file. +pub struct Cases { + encoding: &'static Encoding, + into_unicode: bool, + inner: RawCases, } -impl Quoted for T -where - T: Display, -{ - fn quoted(self) -> WithQuotes { - WithQuotes(self) +impl Cases { + /// Constructs a new reader for `inner` with the given `encoding`. + /// + /// No recoding will take place; the caller is simply saying that the cases + /// are already encoded in `encoding`. 
+ pub fn new(encoding: &'static Encoding, inner: RawCases) -> Self { + Self { + encoding, + inner, + into_unicode: false, + } + } + + /// Returns a reader that will recode cases from their existing encoding + /// into UTF-8. If the cases were already in UTF-8, this is a no-op; + /// otherwise, the widths of strings are tripled in the process. + pub fn into_unicode(self) -> Self { + Self { + into_unicode: { + // We only need to convert if we're not starting out as UTF-8. + self.encoding != UTF_8 + }, + ..self + } } } -struct WithQuotes(T) -where - T: Display; +impl Debug for Cases { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "Cases") + } +} -impl Display for WithQuotes -where - T: Display, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "\"{}\"", &self.0) +impl Iterator for Cases { + type Item = Result>>, raw::Error>; + + fn next(&mut self) -> Option { + self.inner.next().map(|result| { + result.map(|case| { + let case = case.with_encoding(self.encoding); + if self.into_unicode { + case.into_unicode() + } else { + case + } + }) + }) } } -*/ diff --git a/rust/pspp/src/sys/encoding.rs b/rust/pspp/src/sys/encoding.rs index f4fecbab62..4e7468829b 100644 --- a/rust/pspp/src/sys/encoding.rs +++ b/rust/pspp/src/sys/encoding.rs @@ -15,26 +15,36 @@ // this program. If not, see . //! Character encodings in system files. +//! +//! These are useful for reading and writing system files at a low level. use std::sync::LazyLock; use crate::locale_charset::locale_charset; use encoding_rs::{Encoding, UTF_8}; +use serde::Serialize; +use thiserror::Error as ThisError; include!(concat!(env!("OUT_DIR"), "/encodings.rs")); /// Returns the code page number corresponding to `encoding`, or `None` if /// unknown. 
-pub fn codepage_from_encoding(encoding: &str) -> Option { +pub fn codepage_from_encoding_name(encoding: &str) -> Option { CODEPAGE_NAME_TO_NUMBER .get(encoding.to_ascii_lowercase().as_str()) .copied() } -use thiserror::Error as ThisError; +/// Returns the code page number for `encoding`. +pub fn codepage_from_encoding(encoding: &'static Encoding) -> u32 { + // This `unwrap()` is tested against all the actual [Encoding]s in a + // #[test]. + codepage_from_encoding_name(encoding.name()).unwrap() +} /// An error or warning related to encodings. -#[derive(Clone, ThisError, Debug, PartialEq, Eq)] +#[derive(Clone, ThisError, Debug, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] pub enum Error { /// Warning that the system file doesn't indicate its own encoding. #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")] @@ -80,27 +90,77 @@ pub fn get_encoding( encoding: Option<&str>, character_code: Option, ) -> Result<&'static Encoding, Error> { - let label = if let Some(encoding) = encoding { - encoding - } else if let Some(codepage) = character_code { - match codepage { - 1 => return Err(Error::Ebcdic), - 2 | 3 => { - // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] - // respectively. However, many files have character code 2 but - // data which are clearly not ASCII. Therefore, ignore these - // values. - return Err(Error::NoEncoding); - } - 4 => "MS_KANJI", - _ => CODEPAGE_NUMBER_TO_NAME + fn inner(label: &str) -> Result<&'static Encoding, Error> { + Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into())) + } + + match (encoding, character_code) { + (Some(encoding), _) => inner(encoding), + (None, Some(1)) => Err(Error::Ebcdic), + (None, Some(2 | 3)) => { + // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] + // respectively. 
However, many files have character code 2 but + // data which are clearly not ASCII. Therefore, ignore these + // values. + Err(Error::NoEncoding) + } + (None, Some(4)) => inner("MS_KANJI"), + (None, Some(codepage)) => inner( + CODEPAGE_NUMBER_TO_NAME .get(&codepage) .copied() .ok_or(Error::UnknownCodepage(codepage))?, - } - } else { - return Err(Error::NoEncoding); - }; + ), + (None, None) => Err(Error::NoEncoding), + } +} + +#[cfg(test)] +mod tests { + use crate::sys::encoding::codepage_from_encoding; - Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into())) + /// Test that every `Encoding` has a codepage. + #[test] + fn codepages() { + codepage_from_encoding(&encoding_rs::BIG5); + codepage_from_encoding(&encoding_rs::EUC_JP); + codepage_from_encoding(&encoding_rs::EUC_KR); + codepage_from_encoding(&encoding_rs::GB18030); + codepage_from_encoding(&encoding_rs::GBK); + codepage_from_encoding(&encoding_rs::IBM866); + codepage_from_encoding(&encoding_rs::ISO_2022_JP); + codepage_from_encoding(&encoding_rs::ISO_8859_2); + codepage_from_encoding(&encoding_rs::ISO_8859_3); + codepage_from_encoding(&encoding_rs::ISO_8859_4); + codepage_from_encoding(&encoding_rs::ISO_8859_5); + codepage_from_encoding(&encoding_rs::ISO_8859_6); + codepage_from_encoding(&encoding_rs::ISO_8859_7); + codepage_from_encoding(&encoding_rs::ISO_8859_8); + codepage_from_encoding(&encoding_rs::ISO_8859_8_I); + codepage_from_encoding(&encoding_rs::ISO_8859_10); + codepage_from_encoding(&encoding_rs::ISO_8859_13); + codepage_from_encoding(&encoding_rs::ISO_8859_14); + codepage_from_encoding(&encoding_rs::ISO_8859_15); + codepage_from_encoding(&encoding_rs::ISO_8859_16); + codepage_from_encoding(&encoding_rs::KOI8_R); + codepage_from_encoding(&encoding_rs::KOI8_U); + codepage_from_encoding(&encoding_rs::MACINTOSH); + codepage_from_encoding(&encoding_rs::REPLACEMENT); + codepage_from_encoding(&encoding_rs::SHIFT_JIS); + codepage_from_encoding(&encoding_rs::UTF_8); + 
codepage_from_encoding(&encoding_rs::UTF_16BE); + codepage_from_encoding(&encoding_rs::UTF_16LE); + codepage_from_encoding(&encoding_rs::WINDOWS_874); + codepage_from_encoding(&encoding_rs::WINDOWS_1250); + codepage_from_encoding(&encoding_rs::WINDOWS_1251); + codepage_from_encoding(&encoding_rs::WINDOWS_1252); + codepage_from_encoding(&encoding_rs::WINDOWS_1253); + codepage_from_encoding(&encoding_rs::WINDOWS_1254); + codepage_from_encoding(&encoding_rs::WINDOWS_1255); + codepage_from_encoding(&encoding_rs::WINDOWS_1256); + codepage_from_encoding(&encoding_rs::WINDOWS_1257); + codepage_from_encoding(&encoding_rs::WINDOWS_1258); + codepage_from_encoding(&encoding_rs::X_MAC_CYRILLIC); + codepage_from_encoding(&encoding_rs::X_USER_DEFINED); + } } diff --git a/rust/pspp/src/sys/mod.rs b/rust/pspp/src/sys/mod.rs index 94e063fcf1..4f59614100 100644 --- a/rust/pspp/src/sys/mod.rs +++ b/rust/pspp/src/sys/mod.rs @@ -22,12 +22,14 @@ //! facilitate interchange between even the oldest and newest versions of //! software. //! -//! To read a system file in the simplest way, use [ReaderOptions]. +//! Use [ReadOptions] to read a system file in the simplest way. +//! Use [WriteOptions] to write a system file. // Warn about missing docs, but not for items declared with `#[cfg(test)]`. 
#![cfg_attr(not(test), warn(missing_docs))] mod cooked; +use binrw::Endian; pub use cooked::*; pub mod encoding; pub mod raw; @@ -35,5 +37,19 @@ pub mod raw; #[cfg(test)] pub mod sack; +mod write; +use serde::Serializer; +pub use write::{SystemFileVersion, WriteOptions, Writer}; + #[cfg(test)] mod test; + +fn serialize_endian(endian: &Endian, serializer: S) -> Result +where + S: Serializer, +{ + match endian { + Endian::Big => serializer.serialize_unit_variant("Endian", 0, "Big"), + Endian::Little => serializer.serialize_unit_variant("Endian", 1, "Little"), + } +} diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 41ab01f42f..e06babe62d 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -20,11 +20,13 @@ //! raw details. Most readers will want to use higher-level interfaces. use crate::{ - data::{Case, Datum, RawStr, RawString}, - dictionary::{VarType, VarWidth}, - endian::{Endian, Parse, ToBytes}, - format::DisplayPlainF64, + data::{ByteStr, ByteString, Datum, MutRawString, RawCase, RawString}, + endian::{FromBytes, ToBytes}, identifier::{Error as IdError, Identifier}, + output::{ + pivot::{Axis3, Dimension, Group, PivotTable, Value}, + Details, Item, Text, + }, sys::{ encoding::{default_encoding, get_encoding, Error as EncodingError}, raw::records::{ @@ -38,20 +40,32 @@ use crate::{ RawVariableSetRecord, RawVeryLongStringsRecord, ValueLabelRecord, ValueLabelWarning, VarDisplayRecord, VariableAttributesRecord, VariableDisplayWarning, VariableRecord, VariableSetRecord, VariableSetWarning, VariableWarning, VeryLongStringWarning, - VeryLongStringsRecord, ZHeader, ZTrailer, ZlibTrailerWarning, + VeryLongStringsRecord, ZHeader, ZHeaderError, ZTrailer, ZTrailerError, + ZlibTrailerWarning, }, }, + variable::{VarType, VarWidth}, }; -use encoding_rs::Encoding; -use flate2::read::ZlibDecoder; +use binrw::Endian; +use encoding_rs::{ + Encoding, BIG5, EUC_JP, EUC_KR, GB18030, IBM866, ISO_2022_JP, ISO_8859_10, ISO_8859_13, + ISO_8859_14, 
ISO_8859_16, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, ISO_8859_6, + ISO_8859_7, ISO_8859_8, KOI8_R, KOI8_U, MACINTOSH, SHIFT_JIS, UTF_8, WINDOWS_1250, + WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1255, WINDOWS_1256, + WINDOWS_1257, WINDOWS_1258, WINDOWS_874, +}; +use flate2::bufread::ZlibDecoder; +use indexmap::IndexMap; +use itertools::{EitherOrBoth, Itertools}; +use serde::Serialize; use smallvec::SmallVec; use std::{ borrow::Cow, cell::RefCell, collections::VecDeque, fmt::{Debug, Display, Formatter, Result as FmtResult}, - io::{empty, Error as IoError, Read, Seek, SeekFrom}, + io::{empty, BufRead, Error as IoError, Read, Seek, SeekFrom}, iter::repeat_n, mem::take, num::NonZeroU8, @@ -215,143 +229,37 @@ pub enum ErrorDetails { }, /// Unexpected end of file {case_ofs} bytes into a {case_len}-byte case. - #[error("Unexpected end of file {case_ofs} bytes into a {case_len}-byte case.")] + #[error("Unexpected end of file {case_ofs} bytes into case {case_number} with expected length {case_len} bytes.")] EofInCase { /// Offset into case in bytes. case_ofs: u64, /// Expected case length in bytes. case_len: usize, + /// 1-based case number in file. + case_number: u64, }, /// Unexpected end of file {case_ofs} bytes and {n_chunks} compression /// chunks into a compressed case. #[error( - "Unexpected end of file {case_ofs} bytes and {n_chunks} compression chunks into a compressed case." + "Unexpected end of file {case_ofs} bytes and {n_chunks} compression chunks into compressed case {case_number}." )] EofInCompressedCase { /// Offset into case in bytes. case_ofs: u64, /// Number of compression codes consumed. n_chunks: usize, + /// 1-based case number in file. + case_number: u64, }, - /// Impossible ztrailer_offset {0:#x}. - #[error("Impossible ztrailer_offset {0:#x}.")] - ImpossibleZTrailerOffset( - /// `ztrailer_offset` - u64, - ), - - /// ZLIB header's zlib_offset is {actual:#x} instead of expected - /// {expected:#x}. 
- #[error("ZLIB header's zlib_offset is {actual:#x} instead of expected {expected:#x}.")] - UnexpectedZHeaderOffset { - /// Actual `zlib_offset`. - actual: u64, - /// Expected `zlib_offset`. - expected: u64, - }, - - /// Invalid ZLIB trailer length {0}. - #[error("Invalid ZLIB trailer length {0}.")] - InvalidZTrailerLength( - /// ZLIB trailer length. - u64, - ), - - /// ZLIB trailer bias {actual} is not {} as expected from file header bias. - #[ - error( - "ZLIB trailer bias {actual} is not {} as expected from file header bias.", - DisplayPlainF64(*expected) - )] - WrongZlibTrailerBias { - /// ZLIB trailer bias read from file. - actual: i64, - /// Expected ZLIB trailer bias. - expected: f64, - }, + /// Error reading a [ZHeader]. + #[error("Error reading ZLIB header: {0}")] + ZHeader(#[from] ZHeaderError), - /// ZLIB trailer \"zero\" field has nonzero value {0}. - #[error("ZLIB trailer \"zero\" field has nonzero value {0}.")] - WrongZlibTrailerZero( - /// Actual value that should have been zero. - u64, - ), - - /// ZLIB trailer specifies unexpected {0}-byte block size. - #[error("ZLIB trailer specifies unexpected {0}-byte block size.")] - WrongZlibTrailerBlockSize( - /// Block size read from file. - u32, - ), - - /// Block count in ZLIB trailer differs from expected block count calculated - /// from trailer length. - #[error( - "Block count {n_blocks} in ZLIB trailer differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}." - )] - BadZlibTrailerNBlocks { - /// Number of blocks. - n_blocks: u32, - /// Expected number of blocks. - expected_n_blocks: u64, - /// ZLIB trailer length in bytes. - ztrailer_len: u64, - }, - - /// ZLIB block descriptor reported uncompressed data offset different from - /// expected. - #[error( - "ZLIB block descriptor {index} reported uncompressed data offset {actual:#x}, when {expected:#x} was expected." - )] - ZlibTrailerBlockWrongUncmpOfs { - /// Block descriptor index. 
- index: usize, - /// Actual uncompressed data offset. - actual: u64, - /// Expected uncompressed data offset. - expected: u64, - }, - - /// ZLIB block descriptor {index} reported compressed data offset - /// {actual:#x}, when {expected:#x} was expected. - #[error( - "ZLIB block descriptor {index} reported compressed data offset {actual:#x}, when {expected:#x} was expected." - )] - ZlibTrailerBlockWrongCmpOfs { - /// Block descriptor index. - index: usize, - /// Actual compressed data offset. - actual: u64, - /// Expected compressed data offset. - expected: u64, - }, - - /// ZLIB block descriptor {index} reports compressed size {compressed_size} - /// and uncompressed size {uncompressed_size}. - #[error( - "ZLIB block descriptor {index} reports compressed size {compressed_size} and uncompressed size {uncompressed_size}." - )] - ZlibExpansion { - /// Block descriptor index. - index: usize, - /// Compressed size. - compressed_size: u32, - /// Uncompressed size. - uncompressed_size: u32, - }, - - /// ZLIB trailer at unexpected offset. - #[error( - "ZLIB trailer is at offset {actual:#x} but {expected:#x} would be expected from block descriptors." - )] - ZlibTrailerOffsetInconsistency { - /// Expected offset. - expected: u64, - /// Actual offset. - actual: u64, - }, + /// Error reading a [ZTrailer]. + #[error("Error reading ZLIB trailer: {0}")] + ZTrailer(#[from] ZTrailerError), /// File metadata says it contains {expected} cases, but {actual} cases were read. #[error("File metadata says it contains {expected} cases, but {actual} cases were read.")] @@ -496,7 +404,7 @@ impl From for WarningDetails { } /// A raw record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub enum Record { /// Variable record. /// @@ -504,7 +412,7 @@ pub enum Record { /// one variable record per 8-byte segment. Variable( /// The record. - VariableRecord, + VariableRecord, ), /// Value labels for numeric and short string variables. 
@@ -512,7 +420,7 @@ pub enum Record { /// These appear after the variable records. ValueLabel( /// The record. - ValueLabelRecord, + ValueLabelRecord, ), /// Document record. @@ -542,13 +450,13 @@ pub enum Record { /// Multiple response variable record. MultipleResponse( /// The record. - MultipleResponseRecord, + MultipleResponseRecord, ), /// Value labels for long string variables. LongStringValueLabels( /// The record. - LongStringValueLabelRecord, + LongStringValueLabelRecord, ), /// Missing values for long string variables. @@ -557,7 +465,7 @@ pub enum Record { /// variable records. LongStringMissingValues( /// The record. - LongStringMissingValueRecord, + LongStringMissingValueRecord, ), /// Encoding record. @@ -641,12 +549,135 @@ pub enum Record { ), } +impl Record { + /// Returns the inner [EncodingRecord], if any. + pub fn as_encoding_record(&self) -> Option<&EncodingRecord> { + match self { + Record::Encoding(encoding_record) => Some(encoding_record), + _ => None, + } + } + + /// Returns the inner [IntegerInfoRecord], if any. + pub fn as_integer_info_record(&self) -> Option<&IntegerInfoRecord> { + match self { + Record::IntegerInfo(integer_info_record) => Some(integer_info_record), + _ => None, + } + } + + /// Returns the inner [LongStringMissingValueRecord], if any. + pub fn as_long_string_missing_values( + &self, + ) -> Option<&LongStringMissingValueRecord> { + match self { + Record::LongStringMissingValues(long_string_missing_value_record) => { + Some(long_string_missing_value_record) + } + _ => None, + } + } + + /// Returns [RecordString]s for the record. These are useful for producing + /// an [EncodingReport]. 
+ pub fn get_strings(&self) -> Vec { + let mut strings = Vec::new(); + match self { + Record::Variable(variable_record) => { + strings.push(RecordString::new( + "Variable Name", + &variable_record.name, + true, + )); + if let Some(label) = &variable_record.label { + strings.push(RecordString::new("Variable Label", label, false)); + } + for missing_value in &variable_record.missing_values.values { + if let Some(string) = missing_value.as_string() { + strings.push(RecordString::new("Missing Value", string, false)); + } + } + } + Record::ValueLabel(value_label_record) => { + for label in &value_label_record.labels { + strings.push(RecordString::new("Value Label", &label.label, false)); + } + } + Record::Document(document_record) => { + for (line, index) in document_record.lines.iter().zip(1..) { + strings.push(RecordString::new( + format!("Document Line {index}"), + line, + false, + )); + } + } + Record::MultipleResponse(multiple_response_record) => { + for set in &multiple_response_record.sets { + strings.push(RecordString::new( + "Multiple Response Set Name", + &set.name, + true, + )); + if !set.label.is_empty() { + strings.push(RecordString::new( + "Multiple Response Set Label", + &set.label, + false, + )); + } + match &set.mr_type { + records::MultipleResponseType::MultipleDichotomy { value, .. 
} => { + strings.push(RecordString::new( + "Multiple Response Set Counted Value", + value, + false, + )); + } + _ => (), + } + } + } + Record::LongStringValueLabels(long_string_value_label_record) => { + for labels in &long_string_value_label_record.labels { + for (_value, label) in &labels.labels { + strings.push(RecordString::new("Value Label", label, false)); + } + } + } + Record::ProductInfo(raw_product_info_record) => { + strings.push(RecordString::new( + "Extra Product Info", + &raw_product_info_record.0.text, + false, + )); + } + Record::IntegerInfo(_) + | Record::FloatInfo(_) + | Record::VarDisplay(_) + | Record::LongStringMissingValues(_) + | Record::Encoding(_) + | Record::NumberOfCases(_) + | Record::VariableSets(_) + | Record::LongNames(_) + | Record::VeryLongStrings(_) + | Record::FileAttributes(_) + | Record::VariableAttributes(_) + | Record::OtherExtension(_) + | Record::EndOfHeaders(_) + | Record::ZHeader(_) + | Record::ZTrailer(_) => (), + } + strings + } +} + /// A [Record] that has been decoded to a more usable form. /// /// Some records can be understand raw, but others need to have strings decoded /// (and interpreted as identifiers) or raw data interpreted as either numbers /// or strings. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub enum DecodedRecord { /// Variable record, with strings decoded. 
Variable(VariableRecord), @@ -720,13 +751,19 @@ impl Record { warn: &mut dyn FnMut(Warning), ) -> Result, Error> where - R: Read + Seek, + R: BufRead + Seek, { let rec_type: u32 = endian.parse(read_bytes(reader)?); match rec_type { - 2 => Ok(Some(VariableRecord::read(reader, endian, warn)?)), - 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?), - 6 => Ok(Some(DocumentRecord::read(reader, endian)?)), + 2 => Ok(Some(Record::Variable(VariableRecord::read( + reader, endian, warn, + )?))), + 3 => Ok( + ValueLabelRecord::read(reader, endian, var_types, warn)?.map(Record::ValueLabel) + ), + 6 => Ok(Some(Record::Document(DocumentRecord::read( + reader, endian, + )?))), 7 => Extension::read(reader, endian, var_types, warn), 999 => Ok(Some(Record::EndOfHeaders( endian.parse(read_bytes(reader)?), @@ -795,24 +832,7 @@ pub fn infer_encoding( records: &[Record], mut warn: impl FnMut(Warning), ) -> Result<&'static Encoding, Error> { - // Get the character encoding from the first (and only) encoding record. - let encoding = records - .iter() - .filter_map(|record| match record { - Record::Encoding(record) => Some(record.0.as_str()), - _ => None, - }) - .next(); - - // Get the character code from the first (only) integer info record. - let character_code = records - .iter() - .filter_map(|record| match record { - Record::IntegerInfo(record) => Some(record.character_code), - _ => None, - }) - .next(); - + let (encoding, character_code) = get_encoding_info(records); match get_encoding(encoding, character_code) { Ok(encoding) => Ok(encoding), Err(err @ EncodingError::Ebcdic) => Err(Error::new(None, err.into())), @@ -824,6 +844,36 @@ pub fn infer_encoding( } } +/// Returns the encoding name from the (first) [EncodingRecord] in `records`, if +/// any, and the codepage from the (first) [IntegerInfoRecord] in `records`, if +/// any. 
+pub fn get_encoding_info(records: &[Record]) -> (Option<&str>, Option) { + ( + get_encoding_record(records).map(|r| r.0.as_str()), + get_integer_info_record(records).map(|r| r.inner.character_code), + ) +} + +/// Returns the (first) [EncodingRecord] in `iter`, if any. +pub fn get_encoding_record<'a, I>(iter: I) -> Option<&'a EncodingRecord> +where + I: IntoIterator, +{ + iter.into_iter() + .filter_map(|record| record.as_encoding_record()) + .next() +} + +/// Returns the (first) [IntegerInfoRecord] in `iter`, if any. +pub fn get_integer_info_record<'a, I>(iter: I) -> Option<&'a IntegerInfoRecord> +where + I: IntoIterator, +{ + iter.into_iter() + .filter_map(|record| record.as_integer_info_record()) + .next() +} + /// An [Encoding] along with a function to report decoding errors. /// /// This is used by functions that decode raw records. @@ -885,12 +935,12 @@ impl<'de> Decoder<'de> { output } - fn decode<'a>(&mut self, input: &'a RawString) -> Cow<'a, str> { + fn decode<'a>(&mut self, input: &'a ByteString) -> Cow<'a, str> { self.decode_slice(input.0.as_slice()) } /// Decodes `input` to an [Identifier] using our encoding. - pub fn decode_identifier(&mut self, input: &RawString) -> Result { + pub fn decode_identifier(&mut self, input: &ByteString) -> Result { let decoded = &self.decode(input); self.new_identifier(decoded) } @@ -904,7 +954,7 @@ impl<'de> Decoder<'de> { /// System file type, inferred from its "magic number". /// /// The magic number is the first four bytes of the file. -#[derive(Copy, Clone, PartialEq, Eq, Hash)] +#[derive(Copy, Clone, PartialEq, Eq, Hash, Serialize)] pub enum Magic { /// Regular system file. 
Sav, @@ -939,6 +989,16 @@ impl Debug for Magic { } } +impl From for [u8; 4] { + fn from(value: Magic) -> Self { + match value { + Magic::Sav => Magic::SAV, + Magic::Zsav => Magic::ZSAV, + Magic::Ebcdic => Magic::EBCDIC, + } + } +} + impl TryFrom<[u8; 4]> for Magic { type Error = ErrorDetails; @@ -1000,7 +1060,19 @@ impl Debug for RawDatum { match self { RawDatum::Number(Some(number)) => write!(f, "{number:?}"), RawDatum::Number(None) => write!(f, "SYSMIS"), - RawDatum::String(s) => write!(f, "{:?}", RawStr::from_bytes(s)), + RawDatum::String(s) => write!(f, "{:?}", ByteStr(s)), + } + } +} + +impl Serialize for RawDatum { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + RawDatum::Number(number) => number.serialize(serializer), + RawDatum::String(s) => ByteStr(s).serialize(serializer), } } } @@ -1017,28 +1089,30 @@ impl RawDatum { /// Decodes a `RawDatum` into a [Datum] given that we now know the string /// width. - pub fn decode(&self, width: VarWidth) -> Datum { + pub fn decode(&self, width: VarWidth) -> Datum { match self { Self::Number(x) => Datum::Number(*x), Self::String(s) => { let width = width.as_string_width().unwrap(); - Datum::String(RawString::from(&s[..width])) + Datum::String(ByteString::from(&s[..width])) } } } } -impl Datum { +impl Datum { fn read_case( reader: &mut R, + case_number: u64, case_vars: &[CaseVar], endian: Endian, - ) -> Result, Error> { + ) -> Result, Error> { fn eof( reader: &mut R, + case_number: u64, case_vars: &[CaseVar], case_start: u64, - ) -> Result, Error> { + ) -> Result, Error> { let offset = reader.stream_position()?; if offset == case_start { Ok(None) @@ -1048,6 +1122,7 @@ impl Datum { ErrorDetails::EofInCase { case_ofs: offset - case_start, case_len: case_vars.iter().map(CaseVar::bytes).sum(), + case_number, }, )) } @@ -1059,7 +1134,7 @@ impl Datum { match var { CaseVar::Numeric => { let Some(raw) = try_read_bytes(reader)? 
else { - return eof(reader, case_vars, case_start); + return eof(reader, case_number, case_vars, case_start); }; values.push(Datum::Number(endian.parse(raw))); } @@ -1071,16 +1146,16 @@ impl Datum { reader, &mut datum[offset..offset + segment.data_bytes], )? { - return eof(reader, case_vars, case_start); + return eof(reader, case_number, case_vars, case_start); } skip_bytes(reader, segment.padding_bytes)?; offset += segment.data_bytes; } - values.push(Datum::String(RawString(datum))); + values.push(Datum::String(datum.into())); } } } - Ok(Some(Case(values))) + Ok(Some(RawCase(values))) } fn read_compressed_chunk( @@ -1108,16 +1183,18 @@ impl Datum { } fn read_compressed_case( reader: &mut R, + case_number: u64, case_vars: &[CaseVar], codes: &mut VecDeque, endian: Endian, bias: f64, - ) -> Result, Error> { + ) -> Result, Error> { fn eof( reader: &mut R, + case_number: u64, case_start: u64, n_chunks: usize, - ) -> Result, Error> { + ) -> Result, Error> { let offset = reader.stream_position()?; if n_chunks > 0 { Err(Error::new( @@ -1125,6 +1202,7 @@ impl Datum { ErrorDetails::EofInCompressedCase { case_ofs: offset - case_start, n_chunks, + case_number, }, )) } else { @@ -1140,7 +1218,7 @@ impl Datum { CaseVar::Numeric => { let Some(raw) = Self::read_compressed_chunk(reader, codes, endian, bias)? else { - return eof(reader, case_start, n_chunks); + return eof(reader, case_number, case_start, n_chunks); }; n_chunks += 1; values.push(Datum::Number(endian.parse(raw))); @@ -1154,7 +1232,7 @@ impl Datum { let Some(raw) = Self::read_compressed_chunk(reader, codes, endian, bias)? 
else { - return eof(reader, case_start, n_chunks); + return eof(reader, case_number, case_start, n_chunks); }; let n_data = data_bytes.min(8); datum.extend_from_slice(&raw[..n_data]); @@ -1163,44 +1241,51 @@ impl Datum { n_chunks += 1; } } - values.push(Datum::String(RawString(datum))); + values.push(Datum::String(datum.into())); } } } - Ok(Some(Case(values))) + Ok(Some(RawCase(values))) } } struct ZlibDecodeMultiple where - R: Read + Seek, + R: BufRead + Seek, { reader: Option>, + limit: u64, } impl ZlibDecodeMultiple where - R: Read + Seek, + R: BufRead + Seek, { - fn new(reader: R) -> ZlibDecodeMultiple { + fn new(reader: R, limit: u64) -> ZlibDecodeMultiple { ZlibDecodeMultiple { reader: Some(ZlibDecoder::new(reader)), + limit, } } } impl Read for ZlibDecodeMultiple where - R: Read + Seek, + R: BufRead + Seek, { fn read(&mut self, buf: &mut [u8]) -> Result { loop { - match self.reader.as_mut().unwrap().read(buf)? { - 0 => { - let inner = self.reader.take().unwrap().into_inner(); + match self.reader.as_mut().unwrap().read(buf) { + Err(error) => return Err(error), + Ok(0) => { + let mut inner = self.reader.take().unwrap().into_inner(); + let position = inner.stream_position(); self.reader = Some(ZlibDecoder::new(inner)); + if position? >= self.limit { + return Ok(0); + } } - n => return Ok(n), + Ok(n) => return Ok(n), }; } } @@ -1208,7 +1293,7 @@ where impl Seek for ZlibDecodeMultiple where - R: Read + Seek, + R: BufRead + Seek, { fn seek(&mut self, pos: SeekFrom) -> Result { self.reader.as_mut().unwrap().get_mut().seek(pos) @@ -1225,21 +1310,21 @@ enum ReaderState { /// Reads records from a system file in their raw form. 
pub struct Reader<'a, R> where - R: Read + Seek + 'static, + R: BufRead + Seek + 'static, { reader: Option, warn: Box, - header: FileHeader, + header: FileHeader, var_types: VarTypes, state: ReaderState, - cases: Option, + cases: Option, } impl<'a, R> Reader<'a, R> where - R: Read + Seek + 'static, + R: BufRead + Seek + 'static, { /// Constructs a new [Reader] from the underlying `reader`. Any warnings /// encountered while reading the system file will be reported with `warn`. @@ -1259,7 +1344,7 @@ where } /// Returns the header in this reader. - pub fn header(&self) -> &FileHeader { + pub fn header(&self) -> &FileHeader { &self.header } @@ -1272,9 +1357,9 @@ where /// /// The cases are only available once all the headers have been read. If /// there is an error reading the headers, or if [cases](Self::cases) is - /// called before all of the headers have been read, the returned [Cases] + /// called before all of the headers have been read, the returned [RawCases] /// will be empty. - pub fn cases(self) -> Cases { + pub fn cases(self) -> RawCases { self.cases.unwrap_or_default() } } @@ -1282,18 +1367,19 @@ where /// Reads raw records from a system file. pub struct Records<'a, 'b, R>(&'b mut Reader<'a, R>) where - R: Read + Seek + 'static; + R: BufRead + Seek + 'static; impl<'a, 'b, R> Records<'a, 'b, R> where - R: Read + Seek + 'static, + R: BufRead + Seek + 'static, { - fn cases(&mut self) { + fn cases(&mut self, ztrailer_offset: Option) { self.0.state = ReaderState::End; - self.0.cases = Some(Cases::new( + self.0.cases = Some(RawCases::new( self.0.reader.take().unwrap(), take(&mut self.0.var_types), &self.0.header, + ztrailer_offset, )); } @@ -1313,12 +1399,16 @@ where } }; match record { - Record::Variable(VariableRecord { width, .. }) => self.0.var_types.push(width), + Record::Variable(VariableRecord { width, .. 
}) => { + if let Ok(width) = width.try_into() { + self.0.var_types.push(width) + } + } Record::EndOfHeaders(_) => { self.0.state = if let Some(Compression::ZLib) = self.0.header.compression { ReaderState::ZlibHeader } else { - self.cases(); + self.cases(None); ReaderState::End }; } @@ -1340,15 +1430,15 @@ where self.0.reader.as_mut().unwrap(), self.0.header.endian, self.0.header.bias, - zheader, + &zheader.inner, &mut self.0.warn, ) { Ok(None) => { - self.cases(); + self.cases(Some(zheader.inner.ztrailer_offset)); None } Ok(Some(ztrailer)) => { - self.cases(); + self.cases(Some(zheader.inner.ztrailer_offset)); Some(Ok(Record::ZTrailer(ztrailer))) } Err(error) => Some(Err(error)), @@ -1361,7 +1451,7 @@ where impl<'a, 'b, R> Iterator for Records<'a, 'b, R> where - R: Read + Seek + 'static, + R: BufRead + Seek + 'static, { type Item = Result; @@ -1384,7 +1474,7 @@ struct StringSegment { } fn segment_widths(width: usize) -> impl Iterator { - let n_segments = width.div_ceil(252); + let n_segments = if width > 255 { width.div_ceil(252) } else { 1 }; repeat_n(255, n_segments - 1) .chain(if n_segments > 1 { std::iter::once(width - (n_segments - 1) * 252) @@ -1441,17 +1531,17 @@ impl CaseVar { /// Reader for cases in a system file. /// -/// - [Reader::cases] returns [Cases] in which very long string variables (those +/// - [Reader::cases] returns [RawCases] in which very long string variables (those /// over 255 bytes wide) are still in their raw format, which means that they /// are divided into multiple, adjacent string variables, approximately one /// variable for each 252 bytes. /// -/// - In the [Cases] in [SystemFile], each [Dictionary] variable corresponds to +/// - In the [RawCases] in [SystemFile], each [Dictionary] variable corresponds to /// one [Datum], even for long string variables. 
/// /// [Dictionary]: crate::dictionary::Dictionary /// [SystemFile]: crate::sys::cooked::SystemFile -pub struct Cases { +pub struct RawCases { reader: Box, case_vars: Vec, compression: Option, @@ -1463,13 +1553,13 @@ pub struct Cases { read_cases: u64, } -impl Debug for Cases { +impl Debug for RawCases { fn fmt(&self, f: &mut Formatter) -> FmtResult { write!(f, "Cases") } } -impl Default for Cases { +impl Default for RawCases { fn default() -> Self { Self { reader: Box::new(empty()), @@ -1485,14 +1575,19 @@ impl Default for Cases { } } -impl Cases { - fn new(reader: R, var_types: VarTypes, header: &FileHeader) -> Self +impl RawCases { + fn new( + reader: R, + var_types: VarTypes, + header: &FileHeader, + ztrailer_offset: Option, + ) -> Self where - R: Read + Seek + 'static, + R: BufRead + Seek + 'static, { Self { reader: if header.compression == Some(Compression::ZLib) { - Box::new(ZlibDecodeMultiple::new(reader)) + Box::new(ZlibDecodeMultiple::new(reader, ztrailer_offset.unwrap())) } else { Box::new(reader) }, @@ -1513,11 +1608,11 @@ impl Cases { } } - /// Returns this [Cases] with its notion of variable widths updated from + /// Returns this [RawCases] with its notion of variable widths updated from /// `widths`. /// /// [Records::decode](crate::sys::Records::decode) uses this to properly handle - /// very long string variables (see [Cases] for details). + /// very long string variables (see [RawCases] for details). pub fn with_widths(self, widths: impl IntoIterator) -> Self { Self { case_vars: widths.into_iter().map(CaseVar::new).collect::>(), @@ -1525,7 +1620,7 @@ impl Cases { } } - /// Returns this [Cases] updated to expect `expected_cases`. If the actual + /// Returns this [RawCases] updated to expect `expected_cases`. If the actual /// number of cases in the file differs, the reader will issue a warning. 
pub fn with_expected_cases(self, expected_cases: u64) -> Self { Self { @@ -1535,8 +1630,8 @@ impl Cases { } } -impl Iterator for Cases { - type Item = Result; +impl Iterator for RawCases { + type Item = Result; fn next(&mut self) -> Option { if self.eof { @@ -1548,6 +1643,7 @@ impl Iterator for Cases { } else if self.compression.is_some() { Datum::read_compressed_case( &mut self.reader, + self.read_cases + 1, &self.case_vars, &mut self.codes, self.endian, @@ -1555,7 +1651,13 @@ impl Iterator for Cases { ) .transpose() } else { - Datum::read_case(&mut self.reader, &self.case_vars, self.endian).transpose() + Datum::read_case( + &mut self.reader, + self.read_cases + 1, + &self.case_vars, + self.endian, + ) + .transpose() }; match &retval { None => { @@ -1584,7 +1686,7 @@ impl Iterator for Cases { } /// Width of a variable record. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize)] pub enum RawWidth { /// String continuation. /// @@ -1653,26 +1755,7 @@ impl Debug for UntypedDatum { } else { big }; - write!(f, "{number}/{:?}", RawStr::from_bytes(&self.0)) - } -} - -/// An 8-byte raw string whose type and encoding are unknown. -#[derive(Copy, Clone)] -pub struct RawStrArray( - /// Content. - pub [u8; N], -); - -impl From<[u8; N]> for RawStrArray { - fn from(source: [u8; N]) -> Self { - Self(source) - } -} - -impl Debug for RawStrArray { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", RawStr::from_bytes(&self.0)) + write!(f, "{number}/{:?}", ByteStr(&self.0)) } } @@ -1722,47 +1805,554 @@ fn read_vec(r: &mut R, n: usize) -> Result, IoError> { Ok(vec) } -fn read_string(r: &mut R, endian: Endian) -> Result { +fn read_string(r: &mut R, endian: Endian) -> Result { let length: u32 = endian.parse(read_bytes(r)?); Ok(read_vec(r, length as usize)?.into()) } +/// A collection of [VarWidth]s indexed for lookup. +/// +/// A system file contains a series of variables. 
Some parts of the system file +/// refer to variables by indexes that take the widths of string variables into +/// account, where a string variable with width 1..=8 occupies one index, with +/// width 9..=16 occupies two, and so on. For string variables that span more +/// than one index, only references to the first are valid. [VarTypes] +/// facilitates this kind of lookup. #[derive(Default)] -struct VarTypes { +pub struct VarTypes { types: Vec>, } impl VarTypes { + /// Construct a new, empty [VarTypes]. pub fn new() -> Self { Self::default() } - pub fn push(&mut self, width: RawWidth) { - if let Ok(var_width) = VarWidth::try_from(width) { - self.types.push(Some(var_width)); - for _ in 1..width.n_values().unwrap() { - self.types.push(None); - } + /// Appends `width`. + pub fn push(&mut self, width: VarWidth) { + self.types.push(Some(width)); + for _ in 1..width.n_chunks().unwrap() { + self.types.push(None); } } + /// Returns the number of indexes spanned by these variables. pub fn n_values(&self) -> usize { self.types.len() } + /// Returns true if 1-based `index` is valid, that is, if it's in the range + /// of indexes and refers to the first index for a given variable. pub fn is_valid_index(&self, index: usize) -> bool { self.var_type_at(index).is_some() } + /// Returns the type of variable with the given 1-based `index`, or `None` + /// if it's out of range or doesn't refer to the first index for a variable. pub fn var_type_at(&self, index: usize) -> Option { - if index >= 1 && index <= self.types.len() { - self.types[index - 1].map(VarType::from) - } else { - None - } + self.types.get(index.checked_sub(1)?)?.map(VarType::from) } + /// Returns the number of variables. pub fn n_vars(&self) -> usize { self.types.iter().flatten().count() } } + +/// A text string with an unknown encoding in a system file. +pub struct RecordString { + /// The name of the text string, e.g. "Variable 1". + pub title: String, + + /// The text string. 
+ pub string: ByteString, + + /// Whether the text string must be a valid identifier. + /// + /// This can allow some otherwise valid encodings to be rejected. + pub is_identifier: bool, +} + +impl RecordString { + /// Constructs a new [RecordString]. + pub fn new( + title: impl Into, + string: impl Into, + is_identifier: bool, + ) -> Self { + Self { + title: title.into(), + string: string.into(), + is_identifier, + } + } +} + +static ENCODINGS: [&Encoding; 32] = [ + UTF_8, + WINDOWS_1252, + ISO_8859_2, + ISO_8859_3, + ISO_8859_4, + ISO_8859_5, + ISO_8859_6, + ISO_8859_7, + ISO_8859_8, + ISO_8859_10, + ISO_8859_13, + ISO_8859_14, + ISO_8859_16, + MACINTOSH, + WINDOWS_874, + WINDOWS_1250, + WINDOWS_1251, + WINDOWS_1253, + WINDOWS_1254, + WINDOWS_1255, + WINDOWS_1256, + WINDOWS_1257, + WINDOWS_1258, + KOI8_R, + KOI8_U, + IBM866, + GB18030, + BIG5, + EUC_JP, + ISO_2022_JP, + SHIFT_JIS, + EUC_KR, +]; + +/// Where the chosen encoding came from. +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub enum EncodingSource { + /// From the encoding name in the system file. + Name, + /// From the code page in the system file. + Codepage, + /// Default encoding. + Default, +} + +impl EncodingSource { + fn as_str(&self) -> &'static str { + match self { + EncodingSource::Name => "name", + EncodingSource::Codepage => "codepage", + EncodingSource::Default => "default", + } + } +} + +/// Information about the character encodings in a system file. +/// +/// This contains two kinds of data: +/// +/// - Information about the encodings indicated by the system file itself and +/// how PSPP interprets it. +/// +/// - Information about the text strings in the system file headers and possibly +/// some of its cases, along with the character encodings that are valid for +/// those text strings and how their interpretations differ based on encoding. 
+#[derive(Serialize)] +pub struct EncodingReport { + /// If the file includes a record that names its encoding, then this is the + /// name and how PSPP interprets that as an encoding. + pub name: Option<(String, Result<&'static Encoding, EncodingError>)>, + + /// If the file includes a record that identifies its encoding as a code + /// page number, then this is the number and how PSPP interprets that as an + /// encoding. + pub codepage: Option<(i32, Result<&'static Encoding, EncodingError>)>, + + /// The overall encoding chosen. + pub inferred_encoding: Result<&'static Encoding, EncodingError>, + + /// Why the overall encoding was chosen. + pub inferred_encoding_source: EncodingSource, + + /// The encodings that are valid for this file, based on looking at all the + /// text data in the file headers and possibly some of its cases. Each + /// array element is a group of encodings that yield the same text data. If + /// there is only one element, then all valid encodings yield the same text + /// data. + pub valid_encodings: Vec>, + + /// Individual strings in the file headers and cases, together with their + /// interpretations for each group of valid encodings. Only strings that + /// don't have the same interpretation for every valid encoding are + /// included. + /// + /// If this is empty, then either: + /// + /// - `valid_encodings` is also empty. In this case, there are no valid + /// encodings, so there are no strings in the valid encodings. + /// + /// - `valid_encodings` has one element (one group of valid encodings). In + /// this case, every valid encoding interprets every string the same way.
+ pub strings: Vec, +} + +impl EncodingReport { + fn metadata_pivot_table(&self) -> PivotTable { + fn result_to_value(result: &Result<&'static Encoding, EncodingError>) -> Value { + match result { + Ok(encoding) => encoding.name().into(), + Err(error) => error.to_string().into(), + } + } + + let cols = Group::new("Distinctions") + .with("Value") + .with("Interpretation"); + let rows = Group::new("Category") + .with("Name") + .with("Codepage") + .with("Overall"); + let mut table = PivotTable::new([ + (Axis3::X, Dimension::new(cols)), + (Axis3::Y, Dimension::new(rows)), + ]) + .with_title("Character encoding information found in system file and its interpretation") + .with_caption("A system file may identify its character encoding by name or by codepage number or both. This table states which were found, how each was interpreted, and the overall interpretation."); + if let Some((label, result)) = &self.name { + table.insert(&[0, 0], label.as_str()); + table.insert(&[1, 0], result_to_value(result)); + } else { + table.insert(&[0, 0], "(none)"); + } + if let Some((codepage, result)) = &self.codepage { + table.insert(&[0, 1], Value::new_integer(Some((*codepage) as f64))); + table.insert(&[1, 1], result_to_value(result)); + } else { + table.insert(&[0, 1], "(none)"); + } + table.insert(&[0, 2], self.inferred_encoding_source.as_str()); + table.insert(&[1, 2], result_to_value(&self.inferred_encoding)); + table + } +} + +impl From<&EncodingReport> for Details { + fn from(value: &EncodingReport) -> Self { + let mut output: Vec = vec![value.metadata_pivot_table().into()]; + + if !value.valid_encodings.is_empty() { + let groups = Group::new("Group").with_label_shown().with_multiple( + (1..=value.valid_encodings.len()).map(|i| Value::new_integer(Some(i as f64))), + ); + let encodings = Group::new("Encoding").with_multiple( + (1..=value + .valid_encodings + .iter() + .map(|encodings| encodings.len()) + .sum()) + .map(|i| Value::new_integer(Some(i as f64))), + ); + let mut data = 
Vec::new(); + let mut index = 0; + for (group, encodings) in value.valid_encodings.iter().enumerate() { + for encoding in encodings { + data.push(([group, index], encoding.name().into())); + index += 1; + } + } + output.push( + PivotTable::new([ + (Axis3::Y, Dimension::new(groups)), + (Axis3::Y, Dimension::new(encodings).with_all_labels_hidden()), + ]) + .with_title("Valid Encodings") + .with_caption("This table lists all of the encodings that were found to successfully interpret text in the file's header records. Encodings in the same group interpret all of the text in the file the same way.") + .with_data(data) + .into(), + ); + + if !value.strings.is_empty() { + let purposes = Group::with_capacity("Purpose", value.strings.len()) + .with_label_shown() + .with_multiple(value.strings.iter().map(|rs| &rs.name)); + let number = Group::new("Encoding").with_label_shown().with_multiple( + value + .valid_encodings + .iter() + .map(|encodings| encodings[0].name()), + ); + output.push( + PivotTable::new([ + (Axis3::X, Dimension::new(Group::new("Text").with("Text"))), + (Axis3::Y, Dimension::new(number)), + (Axis3::Y, Dimension::new(purposes)), + ]) + .with_title("Alternate Encoded Text Strings") + .with_caption("Text strings in the file dictionary that the previously listed encodings interpret differently, along with the interpretations. The listed encodings are the first in each group.") + .with_data(value + .strings + .iter() + .enumerate() + .map(|(purpose, rs)| { + rs.interpretations + .iter() + .enumerate() + .map(move |(encoding, s)| { + ( + [0, encoding, purpose], + Value::new_user_text(rs.ellipsize(s.as_str())), + ) + }) + }) + .flatten() + .collect::>()).into(), + ); + } + } else { + output.push(Text::new_log("No valid encodings were found.").into()); + }; + + output.into_iter().collect() + } +} + +/// All of the (valid) interpretations of a given string in a system file. 
+#[derive(Serialize)] +pub struct EncodingReportString { + /// Name for the string, something like "variable name 1". + name: String, + + /// If the string's interpretations all start with a common prefix, this is + /// it. Only whole words are considered to be common. + common_prefix: String, + + /// All of the interpretations of the string, one per valid encoding, in the + /// order of [EncodingReport::valid_encodings]. + interpretations: Vec, + + /// If the string's interpretations all end with a common suffix, this is + /// it. Only whole words are considered to be common. + common_suffix: String, +} + +impl EncodingReportString { + fn ellipsize<'a>(&self, s: &'a str) -> Cow<'a, str> { + if self.common_prefix.is_empty() && self.common_suffix.is_empty() { + Cow::from(s) + } else { + let mut result = String::with_capacity(s.len() + 6); + if !self.common_prefix.is_empty() { + result.push_str("..."); + } + result.push_str(s); + if !self.common_suffix.is_empty() { + result.push_str("..."); + } + Cow::from(result) + } + } +} + +impl EncodingReport { + /// Constructs an encoding report from `reader`, reading no more than + /// `max_cases` from it. 
+ pub fn new(mut reader: Reader, max_cases: u64) -> Result + where + R: BufRead + Seek + 'static, + { + fn inner( + header: FileHeader, + records: &[Record], + cases: impl Iterator>, + ) -> Result { + let (encoding, codepage) = get_encoding_info(&records); + let label = encoding + .map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None))); + let codepage = codepage.map(|codepage| (codepage, get_encoding(None, Some(codepage)))); + let (inferred_encoding_source, inferred_encoding) = match label + .as_ref() + .map(|(_string, result)| (EncodingSource::Name, result.clone())) + .or(codepage + .as_ref() + .map(|(_codepage, result)| (EncodingSource::Codepage, result.clone()))) + { + Some((source, Ok(encoding))) => (source, Ok(encoding)), + Some((source, Err(EncodingError::Ebcdic))) => (source, Err(EncodingError::Ebcdic)), + _ => (EncodingSource::Default, Ok(default_encoding())), + }; + + let mut record_strings = header.get_strings(); + for record in records { + record_strings.append(&mut record.get_strings()); + } + for (case_number, case) in (1..).zip(cases) { + for (variable_number, datum) in (1..).zip(case?.0) { + if let Some(mut string) = datum.into_string() { + string.trim_end(); + if !string.is_empty() { + record_strings.push(RecordString::new( + format!("Case {case_number}, Variable {variable_number}"), + string, + false, + )); + } + } + } + } + + let record_strings = record_strings + .into_iter() + .unique_by(|rs| rs.string.clone()) + .collect::>(); + + let mut encodings: IndexMap, Vec<&'static Encoding>> = IndexMap::new(); + for encoding in ENCODINGS { + fn recode_as( + record_strings: &[RecordString], + encoding: &'static Encoding, + ) -> Option> { + let mut output = Vec::with_capacity(record_strings.len()); + for rs in record_strings { + let mut s = encoding + .decode_without_bom_handling_and_without_replacement(&rs.string.0)? 
+ .into_owned(); + s.truncate(s.trim_end().len()); + if rs.is_identifier { + Identifier::check_plausible(&s).ok()?; + } + output.push(s); + } + Some(output) + } + if let Some(strings) = recode_as(&record_strings, encoding) { + encodings.entry(strings).or_default().push(encoding); + } + } + + let mut strings = Vec::with_capacity(record_strings.len()); + if !encodings.is_empty() { + for (index, rs) in record_strings.iter().enumerate() { + // Skip strings that decode the same way from every encoding. + if encodings.keys().map(|strings| &strings[index]).all_equal() { + continue; + } + + /// Returns an iterator for the decoded strings for the given + /// `index`. + fn decoded_index<'a>( + encodings: &'a IndexMap, Vec<&'static Encoding>>, + index: usize, + ) -> impl Iterator { + encodings.keys().map(move |strings| strings[index].as_str()) + } + + let common_prefix: String = decoded_index(&encodings, index) + .reduce(common_prefix) + .unwrap() + .trim_end_matches(|c| c != ' ') + .into(); + let common_suffix: String = decoded_index(&encodings, index) + .reduce(common_suffix) + .unwrap() + .trim_start_matches(|c| c != ' ') + .into(); + + let interpretations = decoded_index(&encodings, index) + .map(|s| s[common_prefix.len()..s.len() - common_suffix.len()].into()) + .collect(); + + strings.push(EncodingReportString { + name: rs.title.clone(), + common_prefix, + interpretations, + common_suffix, + }); + } + } + Ok(EncodingReport { + valid_encodings: encodings.values().cloned().collect(), + strings, + name: label, + codepage, + inferred_encoding, + inferred_encoding_source, + }) + } + + let records: Vec = reader.records().collect::, _>>()?; + let header = reader.header().clone(); + inner(header, &records, reader.cases().take(max_cases as usize)) + } +} + +fn common_prefix<'a>(a: &'a str, b: &'a str) -> &'a str { + for elem in a.char_indices().zip_longest(b.char_indices()) { + match elem { + EitherOrBoth::Both((offset, a_char), (_, b_char)) => { + if a_char != b_char { + return 
&a[..offset]; + } + } + EitherOrBoth::Left((offset, _)) | EitherOrBoth::Right((offset, _)) => { + return &a[..offset] + } + } + } + a +} + +fn common_suffix<'a>(a: &'a str, b: &'a str) -> &'a str { + for elem in a.char_indices().rev().zip_longest(b.char_indices().rev()) { + match elem { + EitherOrBoth::Both((offset, a_char), (_, b_char)) => { + if a_char != b_char { + return &a[offset + a_char.len_utf8()..]; + } + } + EitherOrBoth::Left((offset, char)) => { + return &a[offset + char.len_utf8()..]; + } + EitherOrBoth::Right((offset, char)) => { + return &b[offset + char.len_utf8()..]; + } + } + } + a +} + +#[cfg(test)] +mod tests { + use itertools::Itertools; + + use crate::sys::raw::{common_prefix, common_suffix, segment_widths}; + + #[test] + fn test_common_prefix() { + assert_eq!(common_prefix("abc", "abcxyzzy"), "abc"); + assert_eq!(common_prefix("abcxyzzy", "abc"), "abc"); + assert_eq!(common_prefix("abc", "abc"), "abc"); + assert_eq!(common_prefix("", ""), ""); + } + + #[test] + fn test_common_suffix() { + assert_eq!(common_suffix("xyzzyabc", "abc"), "abc"); + assert_eq!(common_suffix("abc", "xyzzyabc"), "abc"); + assert_eq!(common_suffix("abc", "abc"), "abc"); + assert_eq!(common_suffix("", ""), ""); + } + + #[test] + fn test_segment_widths() { + // We had a bug for the range 252..=255. 
+ for i in 1..=255 { + assert_eq!(segment_widths(i).collect_vec(), vec![i.next_multiple_of(8)]); + } + + assert_eq!( + segment_widths(20000).collect_vec(), + std::iter::repeat_n(256, 79) + .chain(std::iter::once(96)) + .collect_vec() + ); + } +} diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index d49437697b..5269d26d98 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -5,38 +5,47 @@ use std::{ borrow::Cow, collections::BTreeMap, - fmt::{Debug, Formatter}, + fmt::{Debug, Display, Formatter}, io::{Cursor, ErrorKind, Read, Seek, SeekFrom}, ops::Range, str::from_utf8, }; use crate::{ - data::{Datum, RawString}, - dictionary::{ - Alignment, Attributes, CategoryLabels, Measure, MissingValueRange, MissingValues, VarType, - VarWidth, - }, - endian::{Endian, Parse}, + data::{ByteStrArray, ByteString, Datum}, + dictionary::CategoryLabels, + endian::FromBytes, + format::{DisplayPlainF64, Format, Type}, identifier::{Error as IdError, Identifier}, - sys::raw::{ - read_bytes, read_string, read_vec, Decoder, Error, ErrorDetails, Magic, RawDatum, - RawStrArray, RawWidth, Record, UntypedDatum, VarTypes, Warning, WarningDetails, + sys::{ + raw::{ + read_bytes, read_string, read_vec, Decoder, Error, ErrorDetails, Magic, RawDatum, + RawWidth, Record, RecordString, UntypedDatum, VarTypes, Warning, WarningDetails, + }, + serialize_endian, ProductVersion, + }, + variable::{ + Alignment, Attributes, Measure, MissingValueRange, MissingValues, MissingValuesError, + VarType, VarWidth, }, }; -use binrw::BinRead; +use binrw::{binrw, BinRead, BinWrite, Endian, Error as BinError}; +use clap::ValueEnum; +use encoding_rs::Encoding; use itertools::Itertools; +use serde::{ser::SerializeTuple, Serialize, Serializer}; use thiserror::Error as ThisError; /// Type of compression in a system file. 
-#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, ValueEnum)] pub enum Compression { /// Simple bytecode-based compression. Simple, /// [ZLIB] compression. /// /// [ZLIB]: https://www.zlib.net/ + #[value(name = "zlib", help = "ZLIB space-efficient compression")] ZLib, } @@ -49,10 +58,10 @@ pub enum HeaderWarning { } /// A file header record in a system file. -#[derive(Clone)] +#[derive(Clone, Debug, Serialize)] pub struct FileHeader where - S: Debug, + S: Debug + Serialize, { /// Magic number. pub magic: Magic, @@ -91,43 +100,50 @@ where pub file_label: S, /// Endianness of the data in the file header. + #[serde(serialize_with = "serialize_endian")] pub endian: Endian, } -impl FileHeader -where - S: Debug, -{ - fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> std::fmt::Result - where - T: Debug, - { - writeln!(f, "{name:>17}: {:?}", value) - } +/// Raw file header. +#[derive(BinRead, BinWrite)] +pub struct RawHeader { + /// Magic number. + pub magic: [u8; 4], + + /// Eye-catcher string and product name. + pub eye_catcher: [u8; 60], + + /// Layout code, normally either 2 or 3. + pub layout_code: u32, + + /// Claimed number of variable positions (not always accurate). + pub nominal_case_size: u32, + + /// Compression type. + pub compression_code: u32, + + /// 1-based variable index of the weight variable, or 0 if the file is + /// unweighted. + pub weight_index: u32, + + /// Claimed number of cases, or [u32::MAX] if unknown. + pub n_cases: u32, + + /// Compression bias, usually 100.0. + pub bias: f64, + + /// `dd mmm yy` in the file's encoding. + pub creation_date: [u8; 9], + + /// `HH:MM:SS` in the file's encoding. + pub creation_time: [u8; 8], + + /// File label, in the file's encoding. Padded on the right with spaces. 
+ #[brw(pad_after = 3)] + pub file_label: [u8; 64], } -impl Debug for FileHeader -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - writeln!(f, "File header record:")?; - self.debug_field(f, "Magic", self.magic)?; - self.debug_field(f, "Product name", &self.eye_catcher)?; - self.debug_field(f, "Layout code", self.layout_code)?; - self.debug_field(f, "Nominal case size", self.nominal_case_size)?; - self.debug_field(f, "Compression", self.compression)?; - self.debug_field(f, "Weight index", self.weight_index)?; - self.debug_field(f, "Number of cases", self.n_cases)?; - self.debug_field(f, "Compression bias", self.bias)?; - self.debug_field(f, "Creation date", &self.creation_date)?; - self.debug_field(f, "Creation time", &self.creation_time)?; - self.debug_field(f, "File label", &self.file_label)?; - self.debug_field(f, "Endianness", self.endian) - } -} - -impl FileHeader { +impl FileHeader { /// Reads a header record from `r`, reporting any warnings via `warn`. 
pub fn read(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result where @@ -150,22 +166,6 @@ impl FileHeader { header_bytes: &[u8], warn: &mut dyn FnMut(Warning), ) -> Result { - #[derive(BinRead)] - struct RawHeader { - magic: [u8; 4], - eye_catcher: [u8; 60], - layout_code: u32, - nominal_case_size: u32, - compression_code: u32, - weight_index: u32, - n_cases: u32, - bias: f64, - creation_date: [u8; 9], - creation_time: [u8; 8], - file_label: [u8; 64], - _padding: [u8; 3], - } - if &header_bytes[8..20] == b"ENCRYPTEDSAV" { return Err(ErrorDetails::Encrypted); } @@ -200,7 +200,7 @@ impl FileHeader { let weight_index = (header.weight_index > 0).then_some(header.weight_index); - let n_cases = (header.n_cases < i32::MAX as u32 / 2).then_some(header.n_cases); + let n_cases = (header.n_cases <= u32::MAX / 2).then_some(header.n_cases); if header.bias != 100.0 && header.bias != 0.0 { warn(Warning::new( @@ -209,10 +209,6 @@ impl FileHeader { )); } - let creation_date = RawString(header.creation_date.into()); - let creation_time = RawString(header.creation_time.into()); - let file_label = RawString(header.file_label.into()); - Ok(FileHeader { magic, layout_code: header.layout_code, @@ -221,10 +217,10 @@ impl FileHeader { weight_index, n_cases, bias: header.bias, - creation_date, - creation_time, - eye_catcher: RawString(header.eye_catcher.into()), - file_label, + creation_date: header.creation_date.into(), + creation_time: header.creation_time.into(), + eye_catcher: header.eye_catcher.into(), + file_label: header.file_label.into(), endian, }) } @@ -250,22 +246,70 @@ impl FileHeader { endian: self.endian, } } + + /// Returns [RecordString]s for this file header. + pub fn get_strings(&self) -> Vec { + vec![ + RecordString::new("Product", &self.eye_catcher.0[5..], false), + RecordString::new("File Label", &self.file_label, false), + ] + } } -/// [Format](crate::format::Format) as represented in a system file. 
-#[derive(Copy, Clone, PartialEq, Eq, Hash)] +/// [Format] as represented in a system file. +#[derive(Copy, Clone, PartialEq, Eq, Hash, BinRead, BinWrite)] pub struct RawFormat( /// The most-significant 16 bits are the type, the next 8 bytes are the /// width, and the least-significant 8 bits are the number of decimals. pub u32, ); +/// Cannot convert very long string (wider than 255 bytes) to [RawFormat]. +#[derive(Copy, Clone, Debug)] +pub struct VeryLongStringError; + +impl TryFrom for RawFormat { + type Error = VeryLongStringError; + + fn try_from(value: Format) -> Result { + let type_ = u16::from(value.type_()) as u32; + let w = match value.var_width() { + VarWidth::Numeric => value.w() as u8, + VarWidth::String(w) if w > 255 => return Err(VeryLongStringError), + VarWidth::String(w) if value.type_() == Type::AHex => (w * 2).min(255) as u8, + VarWidth::String(w) => w as u8, + } as u32; + let d = value.d() as u32; + Ok(Self((type_ << 16) | (w << 8) | d)) + } +} + +struct RawFormatDisplayMeaning(RawFormat); + +impl Display for RawFormatDisplayMeaning { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + let type_ = format_name(self.0 .0 >> 16); + let w = (self.0 .0 >> 8) & 0xff; + let d = self.0 .0 & 0xff; + write!(f, "{type_}{w}.{d}") + } +} + impl Debug for RawFormat { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let type_ = format_name(self.0 >> 16); - let w = (self.0 >> 8) & 0xff; - let d = self.0 & 0xff; - write!(f, "{:06x} ({type_}{w}.{d})", self.0) + write!(f, "{:06x} ({})", self.0, RawFormatDisplayMeaning(*self)) + } +} + +impl Serialize for RawFormat { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut tuple = serializer.serialize_tuple(2)?; + tuple.serialize_element(&self.0)?; + tuple.serialize_element(&RawFormatDisplayMeaning(*self).to_string())?; + tuple.end() } } @@ -313,7 +357,25 @@ fn format_name(type_: u32) -> Cow<'static, str> { .into() } -impl MissingValues { +/// Missing values in a 
[VariableRecord]. +/// +/// This is the format used before we know the character encoding for the system +/// file. +#[derive(Clone, Debug, Default, Serialize)] +pub struct RawMissingValues { + /// Individual missing values, up to 3 of them. + pub values: Vec>, + + /// Optional range of missing values. + pub range: Option, +} + +impl RawMissingValues { + /// Constructs new raw missing values. + pub fn new(values: Vec>, range: Option) -> Self { + Self { values, range } + } + fn read( r: &mut R, offsets: Range, @@ -392,7 +454,7 @@ impl MissingValues { let range = range.map(|(low, high)| { MissingValueRange::new(endian.parse(low), endian.parse(high)) }); - return Ok(Self::new(values, range).unwrap()); + return Ok(Self::new(values, range)); } Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::new( Some(offsets), @@ -402,9 +464,9 @@ impl MissingValues { let width = width.min(8) as usize; let values = values .into_iter() - .map(|value| Datum::String(RawString::from(&value[..width]))) + .map(|value| Datum::String(ByteString::from(&value[..width]))) .collect(); - return Ok(Self::new(values, None).unwrap()); + return Ok(Self::new(values, None)); } Err(()) => warn(Warning::new( Some(offsets), @@ -413,6 +475,17 @@ impl MissingValues { } Ok(Self::default()) } + + /// Returns [MissingValues] for these raw missing values, using `encoding`. + pub fn decode(&self, encoding: &'static Encoding) -> Result { + MissingValues::new( + self.values + .iter() + .map(|datum| datum.clone().with_encoding(encoding)) + .collect(), + self.range, + ) + } } /// Warning for a variable record. @@ -428,10 +501,10 @@ pub enum VariableWarning { } /// A variable record in a system file. -#[derive(Clone)] +#[derive(Clone, Debug, Serialize)] pub struct VariableRecord where - S: Debug, + S: Debug + Serialize, { /// Range of offsets in file. pub offsets: Range, @@ -449,46 +522,45 @@ where pub write_format: RawFormat, /// Missing values. 
- pub missing_values: MissingValues, + pub missing_values: RawMissingValues, /// Optional variable label. pub label: Option, } -impl Debug for VariableRecord -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - writeln!(f, "Width: {}", self.width,)?; - writeln!(f, "Print format: {:?}", self.print_format)?; - writeln!(f, "Write format: {:?}", self.write_format)?; - writeln!(f, "Name: {:?}", &self.name)?; - writeln!(f, "Variable label: {:?}", self.label)?; - writeln!(f, "Missing values: {:?}", self.missing_values) - } +/// Raw variable record. +#[derive(BinRead, BinWrite)] +pub struct RawVariableRecord { + /// Variable width, in the range -1..=255. + pub width: i32, + + /// 1 if the variable has a label, 0 otherwise. + pub has_variable_label: u32, + + /// - 0 for no missing values. + /// - 1 for one missing value. + /// - 2 for two missing values. + /// - 3 for three missing values. + /// - -2 for a range of missing values. + /// - -3 for an individual missing value plus a range. + pub missing_value_code: i32, + + /// Print format. + pub print_format: RawFormat, + + /// Write format. + pub write_format: RawFormat, + + /// Variable name, padded with spaces. + pub name: [u8; 8], } -impl VariableRecord { +impl VariableRecord { /// Reads a variable record from `r`. 
- pub fn read( - r: &mut R, - endian: Endian, - warn: &mut dyn FnMut(Warning), - ) -> Result + pub fn read(r: &mut R, endian: Endian, warn: &mut dyn FnMut(Warning)) -> Result where R: Read + Seek, { - #[derive(BinRead)] - struct RawVariableRecord { - width: i32, - has_variable_label: u32, - missing_value_code: i32, - print_format: u32, - write_format: u32, - name: [u8; 8], - } - let start_offset = r.stream_position()?; let offsets = start_offset..start_offset + 28; let raw_record = @@ -508,12 +580,12 @@ impl VariableRecord { 1 => { let len: u32 = endian.parse(read_bytes(r)?); let read_len = len.min(65535) as usize; - let label = RawString(read_vec(r, read_len)?); + let label = read_vec(r, read_len)?; let padding_bytes = len.next_multiple_of(4) - len; let _ = read_vec(r, padding_bytes as usize)?; - Some(label) + Some(label.into()) } _ => { return Err(Error::new( @@ -523,7 +595,7 @@ impl VariableRecord { } }; - let missing_values = MissingValues::read( + let missing_values = RawMissingValues::read( r, offsets, width, @@ -534,15 +606,15 @@ impl VariableRecord { let end_offset = r.stream_position()?; - Ok(Record::Variable(VariableRecord { + Ok(Self { offsets: start_offset..end_offset, width, - name: RawString(raw_record.name.into()), - print_format: RawFormat(raw_record.print_format), - write_format: RawFormat(raw_record.write_format), + name: raw_record.name.into(), + print_format: raw_record.print_format, + write_format: raw_record.write_format, missing_values, label, - })) + }) } /// Decodes a variable record using `decoder`. @@ -591,11 +663,11 @@ pub enum ValueLabelWarning { } /// A value and label in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct ValueLabel where - D: Debug, - S: Debug, + D: Debug + Serialize, + S: Debug + Serialize, { /// The value being labeled. pub datum: D, @@ -607,11 +679,11 @@ where /// /// This represents both the type-3 and type-4 records together, since they are /// always paired anyway. 
-#[derive(Clone)] +#[derive(Clone, Debug, Serialize)] pub struct ValueLabelRecord where - D: Debug, - S: Debug, + D: Debug + Serialize, + S: Debug + Serialize, { /// Range of offsets in file. pub offsets: Range, @@ -626,28 +698,10 @@ where pub var_type: VarType, } -impl Debug for ValueLabelRecord -where - D: Debug, - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - writeln!(f, "labels: ")?; - for label in self.labels.iter() { - writeln!(f, "{label:?}")?; - } - write!(f, "apply to {} variables", self.var_type)?; - for dict_index in self.dict_indexes.iter() { - write!(f, " #{dict_index}")?; - } - Ok(()) - } -} - impl ValueLabelRecord where - D: Debug, - S: Debug, + D: Debug + Serialize, + S: Debug + Serialize, { /// Maximum number of value labels in a record. pub const MAX_LABELS: u32 = u32::MAX / 8; @@ -656,13 +710,16 @@ where pub const MAX_INDEXES: u32 = u32::MAX / 8; } -impl ValueLabelRecord { - pub(super) fn read( +impl ValueLabelRecord { + /// Reads a value label record from `r`, with the given `endian`, given that + /// the variables in the system file have the types in `var_types`, and + /// using `warn` to report warnings. 
+ pub fn read( r: &mut R, endian: Endian, var_types: &VarTypes, warn: &mut dyn FnMut(Warning), - ) -> Result, Error> { + ) -> Result, Error> { let label_offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); if n > Self::MAX_LABELS { @@ -684,7 +741,7 @@ impl ValueLabelRecord { let mut label = read_vec(r, padded_len - 1)?; label.truncate(label_len); - labels.push((value, RawString(label))); + labels.push((value, label.into())); } let index_offset = r.stream_position()?; @@ -739,10 +796,10 @@ impl ValueLabelRecord { let Some(&first_index) = dict_indexes.first() else { return Ok(None); }; - let var_type = VarType::from(var_types.types[first_index as usize - 1].unwrap()); + let var_type = var_types.var_type_at(first_index as usize).unwrap(); let mut wrong_type_indexes = Vec::new(); dict_indexes.retain(|&index| { - if var_types.types[index as usize - 1].map(VarType::from) != Some(var_type) { + if var_types.var_type_at(index as usize) != Some(var_type) { wrong_type_indexes.push(index); false } else { @@ -768,12 +825,12 @@ impl ValueLabelRecord { .collect(); let end_offset = r.stream_position()?; - Ok(Some(Record::ValueLabel(ValueLabelRecord { + Ok(Some(ValueLabelRecord { offsets: label_offset..end_offset, labels, dict_indexes, var_type, - }))) + })) } /// Decodes a value label record using `decoder`. @@ -801,10 +858,10 @@ impl ValueLabelRecord { } /// A document record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct DocumentRecord where - S: Debug, + S: Debug + Serialize, { /// The range of file offsets occupied by the record. pub offsets: Range, @@ -815,7 +872,7 @@ where } /// One line in a document. -pub type RawDocumentLine = RawStrArray; +pub type RawDocumentLine = ByteStrArray; /// Length of a line in a document. Document lines are fixed-length and /// padded on the right with spaces. 
@@ -827,7 +884,7 @@ impl DocumentRecord { pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN; /// Reads a document record from `r`. - pub fn read(r: &mut R, endian: Endian) -> Result + pub fn read(r: &mut R, endian: Endian) -> Result where R: Read + Seek, { @@ -846,11 +903,11 @@ impl DocumentRecord { let offsets = start_offset..start_offset.saturating_add((n * DOC_LINE_LEN) as u64); let mut lines = Vec::with_capacity(n); for _ in 0..n { - lines.push(RawStrArray( + lines.push(ByteStrArray( read_bytes(r).map_err(|e| Error::new(Some(offsets.clone()), e.into()))?, )); } - Ok(Record::Document(DocumentRecord { offsets, lines })) + Ok(DocumentRecord { offsets, lines }) } } @@ -882,15 +939,21 @@ pub struct ExtensionRecord<'a> { } /// An integer info record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct IntegerInfoRecord { /// File offsets occupied by the record. pub offsets: Range, + /// Details. + #[serde(flatten)] + pub inner: RawIntegerInfoRecord, +} + +/// Machine integer info record in [mod@binrw] format. +#[derive(Clone, Debug, BinRead, BinWrite, Serialize)] +pub struct RawIntegerInfoRecord { /// Version number. - /// - /// e.g. `(1,2,3)` for version 1.2.3. - pub version: (i32, i32, i32), + pub version: ProductVersion, /// Identifies the type of machine. 
/// @@ -915,18 +978,12 @@ impl IntegerInfoRecord { pub fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size(Some(4), Some(8), "integer record")?; - let mut input = &ext.data[..]; - let data: Vec = (0..8) - .map(|_| endian.parse(read_bytes(&mut input).unwrap())) - .collect(); + let inner = + RawIntegerInfoRecord::read_options(&mut Cursor::new(ext.data.as_slice()), endian, ()) + .unwrap(); Ok(Record::IntegerInfo(IntegerInfoRecord { offsets: ext.offsets.clone(), - version: (data[0], data[1], data[2]), - machine_code: data[3], - floating_point_rep: data[4], - compression_code: data[5], - endianness: data[6], - character_code: data[7], + inner, })) } } @@ -936,20 +993,14 @@ impl FloatInfoRecord { pub fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size(Some(8), Some(3), "floating point record")?; - let mut input = &ext.data[..]; - let data: Vec = (0..3) - .map(|_| endian.parse(read_bytes(&mut input).unwrap())) - .collect(); - Ok(Record::FloatInfo(FloatInfoRecord { - sysmis: data[0], - highest: data[1], - lowest: data[2], - })) + let data = FloatInfoRecord::read_options(&mut Cursor::new(ext.data.as_slice()), endian, ()) + .unwrap(); + Ok(Record::FloatInfo(data)) } } /// A floating-point info record. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, BinRead, BinWrite, Serialize)] pub struct FloatInfoRecord { /// Value used for system-missing values. pub sysmis: f64, @@ -962,10 +1013,10 @@ pub struct FloatInfoRecord { } /// Long variable names record. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct RawLongNamesRecord( /// Text contents of record. - TextRecord, + pub TextRecord, ); impl RawLongNamesRecord { @@ -993,13 +1044,13 @@ impl RawLongNamesRecord { } /// An extension record whose contents are a text string. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct TextRecord { /// Range of file offsets for this record in bytes. pub offsets: Range, /// The text content of the record. 
- pub text: RawString, + pub text: ByteString, } impl TextRecord { @@ -1036,7 +1087,7 @@ pub enum VeryLongStringWarning { } /// A very long string parsed from a [VeryLongStringsRecord]. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct VeryLongString { /// Short name of very long string variable. pub short_name: Identifier, @@ -1063,11 +1114,11 @@ impl VeryLongString { } /// A very long string record as text. -#[derive(Clone, Debug)] -pub struct RawVeryLongStringsRecord(TextRecord); +#[derive(Clone, Debug, Serialize)] +pub struct RawVeryLongStringsRecord(pub TextRecord); /// A parsed very long string record. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct VeryLongStringsRecord( /// The very long strings. pub Vec, @@ -1157,12 +1208,12 @@ pub enum MultipleResponseWarning { } /// The type of a multiple-response set. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub enum MultipleResponseType { /// Multiple-dichotomy set. MultipleDichotomy { /// The value that is counted in the set. - value: RawString, + value: ByteString, /// What categories are labeled. 
labels: CategoryLabels, @@ -1189,16 +1240,23 @@ impl MultipleResponseType { ) } Some((b'E', input)) => { - let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") { - (CategoryLabels::CountedValues, rest) + let (use_var_label_as_mrset_label, input) = if let Some(rest) = + input.strip_prefix(b" 1 ") + { + (false, rest) } else if let Some(rest) = input.strip_prefix(b" 11 ") { - (CategoryLabels::VarLabels, rest) + (true, rest) } else { return Err(MultipleResponseWarning::InvalidMultipleDichotomyLabelType.into()); }; let (value, input) = parse_counted_string(input)?; ( - MultipleResponseType::MultipleDichotomy { value, labels }, + MultipleResponseType::MultipleDichotomy { + value, + labels: CategoryLabels::CountedValues { + use_var_label_as_mrset_label, + }, + }, input, ) } @@ -1209,11 +1267,11 @@ impl MultipleResponseType { } /// A multiple-response set in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct MultipleResponseSet where - I: Debug, - S: Debug, + I: Debug + Serialize, + S: Debug + Serialize, { /// The set's name. pub name: I, @@ -1225,7 +1283,7 @@ where pub short_names: Vec, } -impl MultipleResponseSet { +impl MultipleResponseSet { /// Parses a multiple-response set from `input`. Returns the set and the /// input remaining to be parsed following the set. fn parse(input: &[u8]) -> Result<(Self, &[u8]), WarningDetails> { @@ -1309,11 +1367,11 @@ impl MultipleResponseSet { } /// A multiple-response set record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct MultipleResponseRecord where - I: Debug, - S: Debug, + I: Debug + Serialize, + S: Debug + Serialize, { /// File offsets of the record. pub offsets: Range, @@ -1322,7 +1380,7 @@ where pub sets: Vec>, } -impl MultipleResponseRecord { +impl MultipleResponseRecord { /// Parses a multiple-response set from `ext`. 
pub fn parse(ext: &Extension) -> Result { ext.check_size(Some(1), None, "multiple response set record")?; @@ -1347,7 +1405,7 @@ impl MultipleResponseRecord { } } -impl MultipleResponseRecord { +impl MultipleResponseRecord { /// Decodes this record using `decoder`. pub fn decode(self, decoder: &mut Decoder) -> MultipleResponseRecord { let mut sets = Vec::new(); @@ -1366,7 +1424,7 @@ impl MultipleResponseRecord { } } -fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), WarningDetails> { +fn parse_counted_string(input: &[u8]) -> Result<(ByteString, &[u8]), WarningDetails> { let Some(space) = input.iter().position(|&b| b == b' ') else { return Err(MultipleResponseWarning::CountedStringMissingSpace.into()); }; @@ -1436,7 +1494,7 @@ impl Alignment { } /// Variable display settings for one variable, in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct VarDisplay { /// Measurement level. pub measure: Option, @@ -1449,7 +1507,7 @@ pub struct VarDisplay { } /// A variable display record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct VarDisplayRecord( /// Variable display settings for each variable. pub Vec, @@ -1520,19 +1578,19 @@ pub enum LongStringMissingValuesWarning { } /// Missing values for one long string variable. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct LongStringMissingValues where - N: Debug, + N: Debug + Serialize, { /// Variable name. pub var_name: N, /// Missing values. - pub missing_values: Vec>, + pub missing_values: Vec>, } -impl LongStringMissingValues { +impl LongStringMissingValues { /// Decodes these settings using `decoder`. fn decode( &self, @@ -1546,10 +1604,10 @@ impl LongStringMissingValues { } /// Long string missing values record in a sytem file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct LongStringMissingValueRecord where - N: Debug, + N: Debug + Serialize, { /// The record's file offsets. 
pub offsets: Range, @@ -1558,7 +1616,7 @@ where pub values: Vec>, } -impl LongStringMissingValueRecord { +impl LongStringMissingValueRecord { /// Parses this record from `ext`. pub fn parse( ext: &Extension, @@ -1596,7 +1654,7 @@ impl LongStringMissingValueRecord { } let value: [u8; 8] = read_bytes(&mut input)?; - missing_values.push(RawStrArray(value)); + missing_values.push(ByteStrArray(value)); } missing_value_set.push(LongStringMissingValues { var_name, @@ -1631,7 +1689,7 @@ impl LongStringMissingValueRecord { } /// A character encoding record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct EncodingRecord( /// The encoding name. pub String, @@ -1649,13 +1707,13 @@ impl EncodingRecord { } /// The extended number of cases record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct NumberOfCasesRecord { /// Always observed as 1. pub one: u64, /// Number of cases. - pub n_cases: u64, + pub n_cases: Option, } impl NumberOfCasesRecord { @@ -1666,6 +1724,7 @@ impl NumberOfCasesRecord { let mut input = &ext.data[..]; let one = endian.parse(read_bytes(&mut input)?); let n_cases = endian.parse(read_bytes(&mut input)?); + let n_cases = (n_cases < u64::MAX).then_some(n_cases); Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases })) } @@ -1687,7 +1746,7 @@ pub enum VariableSetWarning { } /// Raw (text) version of the variable set record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct RawVariableSetRecord(TextRecord); impl RawVariableSetRecord { @@ -1718,8 +1777,8 @@ impl RawVariableSetRecord { } /// Raw (text) version of a product info record in a system file. -#[derive(Clone, Debug)] -pub struct RawProductInfoRecord(TextRecord); +#[derive(Clone, Debug, Serialize)] +pub struct RawProductInfoRecord(pub TextRecord); impl RawProductInfoRecord { /// Parses the record from `extension`. 
@@ -1893,11 +1952,11 @@ impl Attributes { } /// A raw (text) file attributes record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct RawFileAttributesRecord(TextRecord); /// A decoded file attributes record in a system file. -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Serialize)] pub struct FileAttributesRecord(pub Attributes); impl RawFileAttributesRecord { @@ -1938,7 +1997,7 @@ impl RawFileAttributesRecord { } /// A set of variable attributes in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct VarAttributes { /// The long name of the variable associated with the attributes. pub long_var_name: Identifier, @@ -1983,11 +2042,11 @@ impl VarAttributes { } /// A raw (text) variable attributes record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct RawVariableAttributesRecord(TextRecord); /// A decoded variable attributes record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct VariableAttributesRecord(pub Vec); impl RawVariableAttributesRecord { @@ -2040,7 +2099,7 @@ pub enum LongNameWarning { } /// A long variable name in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct LongName { /// The variable's short name. pub short_name: Identifier, @@ -2071,15 +2130,15 @@ impl LongName { } /// A long variable name record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct LongNamesRecord(pub Vec); /// A product info record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct ProductInfoRecord(pub String); /// A variable set in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct VariableSet { /// Name of the variable set. pub name: String, @@ -2118,7 +2177,7 @@ impl VariableSet { } /// A variable set record in a system file. 
-#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct VariableSetRecord { /// Range of file offsets occupied by the record. pub offsets: Range, @@ -2179,7 +2238,7 @@ pub enum ExtensionWarning { /// /// Most of the records in system files are "extension records". This structure /// collects everything in an extension record for later processing. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct Extension { /// File offsets occupied by the extension record. /// @@ -2232,7 +2291,10 @@ impl Extension { } } - pub(super) fn read( + /// Reads an extension record from `r`, with the given `endian`, given that + /// the variables in the system file have the types in `var_types`, and + /// using `warn` to report warnings. + pub fn read( r: &mut R, endian: Endian, var_types: &VarTypes, @@ -2302,10 +2364,10 @@ pub enum LongStringValueLabelWarning { } /// One set of long string value labels record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct LongStringValueLabels where - S: Debug, + S: Debug + Serialize, { /// The variable being labeled. pub var_name: N, @@ -2314,10 +2376,10 @@ where pub width: u32, /// `(value, label)` pairs, where each value is `width` bytes. - pub labels: Vec<(RawString, S)>, + pub labels: Vec<(ByteString, S)>, } -impl LongStringValueLabels { +impl LongStringValueLabels { /// Decodes a set of long string value labels using `decoder`. fn decode( &self, @@ -2342,11 +2404,11 @@ impl LongStringValueLabels { } /// A long string value labels record in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct LongStringValueLabelRecord where - N: Debug, - S: Debug, + N: Debug + Serialize, + S: Debug + Serialize, { /// File offsets occupied by the record. pub offsets: Range, @@ -2355,7 +2417,7 @@ where pub labels: Vec>, } -impl LongStringValueLabelRecord { +impl LongStringValueLabelRecord { /// Parses this record from `ext` using `endian`. 
fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size(Some(1), None, "long string value labels record")?; @@ -2401,11 +2463,19 @@ impl LongStringValueLabelRecord { } /// ZLIB header, for [Compression::ZLib]. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct ZHeader { /// File offset to the start of the record. pub offset: u64, + /// Raw header. + #[serde(flatten)] + pub inner: RawZHeader, +} + +/// A ZLIB header in a system file. +#[derive(Clone, Debug, BinRead, BinWrite, Serialize)] +pub struct RawZHeader { /// File offset to the ZLIB data header. pub zheader_offset: u64, @@ -2423,37 +2493,74 @@ impl ZHeader { R: Read + Seek, { let offset = r.stream_position()?; - let zheader_offset: u64 = endian.parse(read_bytes(r)?); - let ztrailer_offset: u64 = endian.parse(read_bytes(r)?); - let ztrailer_len: u64 = endian.parse(read_bytes(r)?); + let inner = RawZHeader::read_options(r, endian, ()).map_err(|e| Error { + offsets: Some(offset..offset + 24), + details: ZHeaderError::from(e).into(), + })?; - if zheader_offset != offset { - Err(ErrorDetails::UnexpectedZHeaderOffset { - actual: zheader_offset, + if inner.zheader_offset != offset { + Err(ZHeaderError::UnexpectedZHeaderOffset { + actual: inner.zheader_offset, expected: offset, - }) - } else if ztrailer_offset < offset { - Err(ErrorDetails::ImpossibleZTrailerOffset(ztrailer_offset)) - } else if ztrailer_len < 24 || ztrailer_len % 24 != 0 { - Err(ErrorDetails::InvalidZTrailerLength(ztrailer_len)) + } + .into()) + } else if inner.ztrailer_offset < offset { + Err(ZHeaderError::ImpossibleZTrailerOffset(inner.ztrailer_offset).into()) + } else if inner.ztrailer_len < 24 || inner.ztrailer_len % 24 != 0 { + Err(ZHeaderError::InvalidZTrailerLength(inner.ztrailer_len).into()) } else { - Ok(ZHeader { - offset, - zheader_offset, - ztrailer_offset, - ztrailer_len, - }) + Ok(ZHeader { offset, inner }) } .map_err(|details| Error::new(Some(offset..offset + 12), details)) } } +/// Error reading a 
[ZHeader]. +#[derive(ThisError, Debug)] +pub enum ZHeaderError { + /// I/O error via [mod@binrw]. + #[error("{}", DisplayBinError(&.0, "ZLIB header"))] + BinError(#[from] BinError), + + /// Impossible ztrailer_offset {0:#x}. + #[error("Impossible ztrailer_offset {0:#x}.")] + ImpossibleZTrailerOffset( + /// `ztrailer_offset` + u64, + ), + + /// zlib_offset is {actual:#x} instead of expected {expected:#x}. + #[error("zlib_offset is {actual:#x} instead of expected {expected:#x}.")] + UnexpectedZHeaderOffset { + /// Actual `zlib_offset`. + actual: u64, + /// Expected `zlib_offset`. + expected: u64, + }, + + /// Invalid ZLIB trailer length {0}. + #[error("Invalid ZLIB trailer length {0}.")] + InvalidZTrailerLength( + /// ZLIB trailer length. + u64, + ), +} + /// A ZLIB trailer in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize)] pub struct ZTrailer { /// File offset to the start of the record. pub offset: u64, + /// The raw trailer. + #[serde(flatten)] + pub inner: RawZTrailer, +} + +/// A ZLIB trailer in a system file. +#[binrw] +#[derive(Clone, Debug, Serialize)] +pub struct RawZTrailer { /// Compression bias as a negative integer, e.g. -100. pub int_bias: i64, @@ -2464,16 +2571,28 @@ pub struct ZTrailer { /// `0x3ff000` has been observed so far. pub block_size: u32, + /// Number of blocks. + #[bw(calc(blocks.len() as u32))] + pub n_blocks: u32, + /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them. + #[br(count = n_blocks)] pub blocks: Vec, } +impl RawZTrailer { + /// Returns the length of the trailer when it is written, in bytes. + pub fn len(&self) -> usize { + 24 + self.blocks.len() * 24 + } +} + /// Warning for a ZLIB trailer record. #[derive(ThisError, Debug)] pub enum ZlibTrailerWarning { /// Wrong block size. #[error( - "ZLIB block descriptor {index} reported block size {actual:#x}, when {expected:#x} was expected." + "Block descriptor {index} reported block size {actual:#x}, when {expected:#x} was expected." 
)] ZlibTrailerBlockWrongSize { /// 0-based block descriptor index. @@ -2486,7 +2605,7 @@ pub enum ZlibTrailerWarning { /// Block too big. #[error( - "ZLIB block descriptor {index} reported block size {actual:#x}, when at most {max_expected:#x} was expected." + "Block descriptor {index} reported block size {actual:#x}, when at most {max_expected:#x} was expected." )] ZlibTrailerBlockTooBig { /// 0-based block descriptor index. @@ -2499,7 +2618,7 @@ pub enum ZlibTrailerWarning { } /// A ZLIB block descriptor in a system file. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, BinRead, BinWrite, Serialize)] pub struct ZBlock { /// Offset of block of data if simple compression were used. pub uncompressed_ofs: u64, @@ -2517,15 +2636,6 @@ pub struct ZBlock { } impl ZBlock { - fn read(r: &mut R, endian: Endian) -> Result { - Ok(ZBlock { - uncompressed_ofs: endian.parse(read_bytes(r)?), - compressed_ofs: endian.parse(read_bytes(r)?), - uncompressed_size: endian.parse(read_bytes(r)?), - compressed_size: endian.parse(read_bytes(r)?), - }) - } - /// Returns true if the uncompressed and compressed sizes are plausible. /// /// [zlib Technical Details] says that the maximum expansion from @@ -2540,6 +2650,120 @@ impl ZBlock { } } +struct DisplayBinError<'a>(&'a BinError, &'static str); + +impl<'a> Display for DisplayBinError<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if self.0.is_eof() { + write!(f, "Unexpected end-of-file reading {}", self.1) + } else { + write!(f, "Error reading {}: {}", self.1, self.0.root_cause()) + } + } +} + +/// Error reading a [ZTrailer]. +#[derive(ThisError, Debug)] +pub enum ZTrailerError { + /// I/O error via [mod@binrw]. + #[error("{}", DisplayBinError(&.0, "ZLIB trailer"))] + BinError(#[from] BinError), + + /// ZLIB trailer bias {actual} is not {} as expected from file header bias. 
+ #[ + error( + "Bias {actual} is not {} as expected from file header.", + DisplayPlainF64(*expected) + )] + WrongZlibTrailerBias { + /// ZLIB trailer bias read from file. + actual: i64, + /// Expected ZLIB trailer bias. + expected: f64, + }, + + /// ZLIB trailer zero field has nonzero value {0}. + #[error("Expected zero field has nonzero value {0}.")] + WrongZlibTrailerZero( + /// Actual value that should have been zero. + u64, + ), + + /// ZLIB trailer specifies unexpected {0}-byte block size. + #[error("Unexpected {0:x}-byte block size (expected 0x3ff000).")] + WrongZlibTrailerBlockSize( + /// Block size read from file. + u32, + ), + + /// Block count differs from expected block count calculated from trailer + /// length. + #[error( + "Block count {n_blocks} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}." + )] + BadZlibTrailerNBlocks { + /// Number of blocks. + n_blocks: usize, + /// Expected number of blocks. + expected_n_blocks: u64, + /// ZLIB trailer length in bytes. + ztrailer_len: u64, + }, + + /// ZLIB block descriptor reported uncompressed data offset different from + /// expected. + #[error( + "Block descriptor {index} reported uncompressed data offset {actual:#x}, when {expected:#x} was expected." + )] + ZlibTrailerBlockWrongUncmpOfs { + /// Block descriptor index. + index: usize, + /// Actual uncompressed data offset. + actual: u64, + /// Expected uncompressed data offset. + expected: u64, + }, + + /// Block descriptor {index} reported compressed data offset + /// {actual:#x}, when {expected:#x} was expected. + #[error( + "Block descriptor {index} reported compressed data offset {actual:#x}, when {expected:#x} was expected." + )] + ZlibTrailerBlockWrongCmpOfs { + /// Block descriptor index. + index: usize, + /// Actual compressed data offset. + actual: u64, + /// Expected compressed data offset. 
+ expected: u64, + }, + + /// Block descriptor {index} reports compressed size {compressed_size} + /// and uncompressed size {uncompressed_size}. + #[error( + "Block descriptor {index} reports compressed size {compressed_size} and uncompressed size {uncompressed_size}." + )] + ZlibExpansion { + /// Block descriptor index. + index: usize, + /// Compressed size. + compressed_size: u32, + /// Uncompressed size. + uncompressed_size: u32, + }, + + /// ZLIB trailer at unexpected offset. + #[error( + "ZLIB trailer is at offset {actual:#x} but {expected:#x} would be expected from block descriptors." + )] + ZlibTrailerOffsetInconsistency { + /// Expected offset. + expected: u64, + /// Actual offset. + actual: u64, + }, +} + impl ZTrailer { /// Reads a ZLIB trailer from `reader` using `endian`. `bias` is the /// floating-point bias for confirmation against the trailer, and `zheader` @@ -2548,7 +2772,7 @@ impl ZTrailer { reader: &mut R, endian: Endian, bias: f64, - zheader: &ZHeader, + zheader: &RawZHeader, warn: &mut dyn FnMut(Warning), ) -> Result, Error> where @@ -2561,84 +2785,85 @@ impl ZTrailer { { return Ok(None); } - let int_bias = endian.parse(read_bytes(reader)?); - let zero = endian.parse(read_bytes(reader)?); - let block_size = endian.parse(read_bytes(reader)?); - let n_blocks: u32 = endian.parse(read_bytes(reader)?); - if int_bias as f64 != -bias { - Err(ErrorDetails::WrongZlibTrailerBias { - actual: int_bias, + let inner = RawZTrailer::read_options(reader, endian, ()).map_err(|e| Error { + offsets: Some(zheader.ztrailer_offset..zheader.ztrailer_offset + zheader.ztrailer_len), + details: ZTrailerError::from(e).into(), + })?; + if inner.int_bias as f64 != -bias { + Err(ZTrailerError::WrongZlibTrailerBias { + actual: inner.int_bias, expected: -bias, - }) - } else if zero != 0 { - Err(ErrorDetails::WrongZlibTrailerZero(zero)) - } else if block_size != 0x3ff000 { - Err(ErrorDetails::WrongZlibTrailerBlockSize(block_size)) + } + .into()) + } else if inner.zero != 0 { + 
Err(ZTrailerError::WrongZlibTrailerZero(inner.zero).into()) + } else if inner.block_size != 0x3ff000 { + Err(ZTrailerError::WrongZlibTrailerBlockSize(inner.block_size).into()) } else if let expected_n_blocks = (zheader.ztrailer_len - 24) / 24 - && n_blocks as u64 != expected_n_blocks + && inner.blocks.len() as u64 != expected_n_blocks { - Err(ErrorDetails::BadZlibTrailerNBlocks { - n_blocks, + Err(ZTrailerError::BadZlibTrailerNBlocks { + n_blocks: inner.blocks.len(), expected_n_blocks, ztrailer_len: zheader.ztrailer_len, - }) + } + .into()) } else { Ok(()) } .map_err(|details| Error::new(Some(start_offset..start_offset + 24), details))?; - let blocks = (0..n_blocks) - .map(|_| ZBlock::read(reader, endian)) - .collect::, _>>()?; - let mut expected_uncmp_ofs = zheader.zheader_offset; let mut expected_cmp_ofs = zheader.zheader_offset + 24; - for (index, block) in blocks.iter().enumerate() { + for (index, block) in inner.blocks.iter().enumerate() { let block_start = start_offset + 24 + 24 * index as u64; let block_offsets = block_start..block_start + 24; if block.uncompressed_ofs != expected_uncmp_ofs { - Err(ErrorDetails::ZlibTrailerBlockWrongUncmpOfs { + Err(ZTrailerError::ZlibTrailerBlockWrongUncmpOfs { index, actual: block.uncompressed_ofs, expected: expected_cmp_ofs, - }) + } + .into()) } else if block.compressed_ofs != expected_cmp_ofs { - Err(ErrorDetails::ZlibTrailerBlockWrongCmpOfs { + Err(ZTrailerError::ZlibTrailerBlockWrongCmpOfs { index, actual: block.compressed_ofs, expected: expected_cmp_ofs, - }) + } + .into()) } else if !block.has_plausible_sizes() { - Err(ErrorDetails::ZlibExpansion { + Err(ZTrailerError::ZlibExpansion { index, compressed_size: block.compressed_size, uncompressed_size: block.uncompressed_size, - }) + } + .into()) } else { Ok(()) } .map_err(|details| Error::new(Some(block_offsets.clone()), details))?; - if index < blocks.len() - 1 { - if block.uncompressed_size != block_size { + if index < inner.blocks.len() - 1 { + if 
block.uncompressed_size != inner.block_size { warn(Warning::new( Some(block_offsets), ZlibTrailerWarning::ZlibTrailerBlockWrongSize { index, actual: block.uncompressed_size, - expected: block_size, + expected: inner.block_size, }, )); } } else { - if block.uncompressed_size > block_size { + if block.uncompressed_size > inner.block_size { warn(Warning::new( Some(block_offsets), ZlibTrailerWarning::ZlibTrailerBlockTooBig { index, actual: block.uncompressed_size, - max_expected: block_size, + max_expected: inner.block_size, }, )); } @@ -2650,21 +2875,19 @@ impl ZTrailer { if expected_cmp_ofs != zheader.ztrailer_offset { return Err(Error::new( - Some(start_offset..start_offset + 24 + 24 * n_blocks as u64), - ErrorDetails::ZlibTrailerOffsetInconsistency { + Some(start_offset..start_offset + 24 + 24 * inner.blocks.len() as u64), + ZTrailerError::ZlibTrailerOffsetInconsistency { expected: expected_cmp_ofs, actual: zheader.ztrailer_offset, - }, + } + .into(), )); } reader.seek(SeekFrom::Start(start_offset))?; Ok(Some(ZTrailer { offset: zheader.ztrailer_offset, - int_bias, - zero, - block_size, - blocks, + inner, })) } } diff --git a/rust/pspp/src/sys/sack.rs b/rust/pspp/src/sys/sack.rs index c6695bd1be..b23d0d83ea 100644 --- a/rust/pspp/src/sys/sack.rs +++ b/rust/pspp/src/sys/sack.rs @@ -14,6 +14,7 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . 
+use binrw::Endian; use num::{Bounded, Zero}; use ordered_float::OrderedFloat; use std::{ @@ -24,7 +25,7 @@ use std::{ path::{Path, PathBuf}, }; -use crate::endian::{Endian, ToBytes}; +use crate::endian::ToBytes; pub type Result = std::result::Result; @@ -552,7 +553,7 @@ impl<'a> Lexer<'a> { "i64" => Token::I64, "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)), "PCSYSMIS" => Token::PcSysmis, - "LOWEST" => Token::Float((-f64::MAX).next_up().into()), + "LOWEST" => Token::Float(f64::MIN.next_up().into()), "HIGHEST" => Token::Float(f64::MAX.into()), "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }), "COUNT" => Token::Count, @@ -573,10 +574,9 @@ impl<'a> Lexer<'a> { #[cfg(test)] mod test { - use crate::endian::Endian; use crate::sys::sack::sack; use anyhow::Result; - use hexplay::HexView; + use binrw::Endian; #[test] fn basic_sack() -> Result<()> { @@ -592,8 +592,7 @@ mod test { "PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; i8 0 *3; "#; - let output = sack(input, None, Endian::Big)?; - HexView::new(&output).print()?; + sack(input, None, Endian::Big)?; Ok(()) } @@ -677,8 +676,7 @@ DATA: s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM"; DATA_END: "#; - let output = sack(input, None, Endian::Big)?; - HexView::new(&output).print()?; + sack(input, None, Endian::Big)?; Ok(()) } } diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 48e95a0f6b..9198a71e94 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -16,23 +16,31 @@ use std::{ fs::File, - io::{Cursor, Read, Seek}, - path::Path, + io::{BufRead, BufReader, Cursor, Seek}, + path::{Path, PathBuf}, sync::Arc, }; +use binrw::Endian; +use encoding_rs::UTF_8; +use itertools::Itertools; + use crate::{ crypto::EncryptedFile, - endian::Endian, + data::Datum, + dictionary::Dictionary, + identifier::Identifier, output::{ pivot::{test::assert_lines_eq, Axis3, Dimension, Group, PivotTable, Value}, Details, Item, Text, }, sys::{ - cooked::ReaderOptions, - 
raw::{self, ErrorDetails}, + cooked::ReadOptions, + raw::{self, records::Compression, ErrorDetails}, sack::sack, + WriteOptions, }, + variable::{VarWidth, Variable}, }; #[test] @@ -553,31 +561,159 @@ fn encrypted_file() { #[test] fn encrypted_file_without_password() { - let error = ReaderOptions::new() - .open_file("src/crypto/testdata/test-encrypted.sav", |_| { - panic!(); - }) - .unwrap_err(); + let error = ReadOptions::new(|_| { + panic!(); + }) + .open_file("src/crypto/testdata/test-encrypted.sav") + .unwrap_err(); assert!(matches!( error.downcast::().unwrap().details, ErrorDetails::Encrypted )); } +/// Tests the most basic kind of writing a system file, just writing a few +/// numeric variables and cases. +fn write_numeric(compression: Option, compression_string: &str) { + let mut dictionary = Dictionary::new(UTF_8); + for i in 0..4 { + let name = Identifier::new(format!("variable{i}")).unwrap(); + dictionary + .add_var(Variable::new(name, VarWidth::Numeric, UTF_8)) + .unwrap(); + } + let mut cases = WriteOptions::reproducible(compression) + .write_writer(&dictionary, Cursor::new(Vec::new())) + .unwrap(); + for case in [ + [1, 1, 1, 2], + [1, 1, 2, 30], + [1, 2, 1, 8], + [1, 2, 2, 20], + [2, 1, 1, 2], + [2, 1, 2, 22], + [2, 2, 1, 1], + [2, 2, 2, 3], + ] { + cases + .write_case( + case.into_iter() + .map(|number| Datum::<&str>::Number(Some(number as f64))), + ) + .unwrap(); + } + let sysfile = cases.finish().unwrap().unwrap().into_inner(); + let expected_filename = PathBuf::from(&format!( + "src/sys/testdata/write-numeric-{compression_string}.expected" + )); + let expected = String::from_utf8(std::fs::read(&expected_filename).unwrap()).unwrap(); + test_sysfile(Cursor::new(sysfile), &expected, &expected_filename); +} + +#[test] +fn write_numeric_uncompressed() { + write_numeric(None, "uncompressed"); +} + +#[test] +fn write_numeric_simple() { + write_numeric(Some(Compression::Simple), "simple"); +} + +#[test] +fn write_numeric_zlib() { + 
write_numeric(Some(Compression::ZLib), "zlib"); +} + +/// Tests writing string data. +fn write_string(compression: Option, compression_string: &str) { + let mut dictionary = Dictionary::new(UTF_8); + dictionary + .add_var(Variable::new( + Identifier::new("s1").unwrap(), + VarWidth::String(1), + UTF_8, + )) + .unwrap(); + + dictionary + .add_var(Variable::new( + Identifier::new("s2").unwrap(), + VarWidth::String(2), + UTF_8, + )) + .unwrap(); + + dictionary + .add_var(Variable::new( + Identifier::new("s3").unwrap(), + VarWidth::String(3), + UTF_8, + )) + .unwrap(); + + dictionary + .add_var(Variable::new( + Identifier::new("s4").unwrap(), + VarWidth::String(9), + UTF_8, + )) + .unwrap(); + + dictionary + .add_var(Variable::new( + Identifier::new("s566").unwrap(), + VarWidth::String(566), + UTF_8, + )) + .unwrap(); + + let mut cases = WriteOptions::reproducible(compression) + .write_writer(&dictionary, Cursor::new(Vec::new())) + .unwrap(); + for case in [ + ["1", "1", "1", "xyzzyquux", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n"], + ["1", "2", "1", "8", 
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"], + ] { + cases + .write_case(case.into_iter().map(|s| Datum::String(s))) + .unwrap(); + } + let sysfile = cases.finish().unwrap().unwrap().into_inner(); + let expected_filename = PathBuf::from(&format!( + "src/sys/testdata/write-string-{compression_string}.expected" + )); + let expected = String::from_utf8(std::fs::read(&expected_filename).unwrap()).unwrap(); + test_sysfile(Cursor::new(sysfile), &expected, &expected_filename); +} + +#[test] +fn write_string_uncompressed() { + write_string(None, "uncompressed"); +} + +#[test] +fn write_string_simple() { + write_string(Some(Compression::Simple), "simple"); +} + +#[test] +fn write_string_zlib() { + write_string(Some(Compression::ZLib), "zlib"); +} + fn test_raw_sysfile(name: &str) { - let input_filename = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("src/sys/testdata") + let input_filename = Path::new("src/sys/testdata") .join(name) .with_extension("sav"); - let sysfile = File::open(&input_filename).unwrap(); + let sysfile = BufReader::new(File::open(&input_filename).unwrap()); let expected_filename = input_filename.with_extension("expected"); let expected = String::from_utf8(std::fs::read(&expected_filename).unwrap()).unwrap(); test_sysfile(sysfile, &expected, &expected_filename); } fn test_encrypted_sysfile(name: &str, password: &str) { - let input_filename = Path::new(env!("CARGO_MANIFEST_DIR")) - 
.join("src/sys/testdata") + let input_filename = Path::new("src/sys/testdata") .join(name) .with_extension("sav"); let sysfile = EncryptedFile::new(File::open(&input_filename).unwrap()) @@ -590,8 +726,7 @@ fn test_encrypted_sysfile(name: &str, password: &str) { } fn test_sack_sysfile(name: &str) { - let input_filename = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("src/sys/testdata") + let input_filename = Path::new("src/sys/testdata") .join(name) .with_extension("sack"); let input = String::from_utf8(std::fs::read(&input_filename).unwrap()).unwrap(); @@ -612,49 +747,21 @@ fn test_sack_sysfile(name: &str) { fn test_sysfile(sysfile: R, expected: &str, expected_filename: &Path) where - R: Read + Seek + 'static, + R: BufRead + Seek + 'static, { let mut warnings = Vec::new(); - let output = match ReaderOptions::new().open_reader(sysfile, |warning| warnings.push(warning)) { + let output = match ReadOptions::new(|warning| warnings.push(warning)).open_reader(sysfile) { Ok(system_file) => { let (dictionary, metadata, cases) = system_file.into_parts(); - let (group, data) = metadata.to_pivot_rows(); - let metadata_table = PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data( - data.into_iter() - .enumerate() - .filter(|(_row, value)| !value.is_empty()) - .map(|(row, value)| ([row], value)), - ); - let (group, data) = dictionary.to_pivot_rows(); - let dictionary_table = PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data( - data.into_iter() - .enumerate() - .filter(|(_row, value)| !value.is_empty()) - .map(|(row, value)| ([row], value)), - ); + let mut output = Vec::new(); output.extend( warnings .into_iter() - .map(|warning| Arc::new(Item::from(Text::new_log(warning.to_string())))), + .map(|warning| Item::from(Text::new_log(warning.to_string()))), ); - output.push(Arc::new(metadata_table.into())); - output.push(Arc::new(dictionary_table.into())); - output.push(Arc::new( - dictionary.output_variables().to_pivot_table().into(), - )); - if let Some(pt) = 
dictionary.output_value_labels().to_pivot_table() { - output.push(Arc::new(pt.into())); - } - if let Some(pt) = dictionary.output_mrsets().to_pivot_table() { - output.push(Arc::new(pt.into())); - } - if let Some(pt) = dictionary.output_attributes().to_pivot_table() { - output.push(Arc::new(pt.into())); - } - if let Some(pt) = dictionary.output_variable_sets().to_pivot_table() { - output.push(Arc::new(pt.into())); - } + output.push(PivotTable::from(&metadata).into()); + output.extend(dictionary.all_pivot_tables().into_iter().map_into()); let variables = Group::new("Variable").with_multiple(dictionary.variables.iter().map(|var| &**var)); let mut case_numbers = Group::new("Case").with_label_shown(); @@ -665,14 +772,13 @@ where case_numbers .push(Value::new_integer(Some((case_numbers.len() + 1) as f64))); data.push( - case.0 - .into_iter() - .map(|datum| Value::new_datum(&datum, dictionary.encoding)) + case.into_iter() + .map(|datum| Value::new_datum(&datum)) .collect::>(), ); } Err(error) => { - output.push(Arc::new(Item::from(Text::new_log(error.to_string())))); + output.push(Item::from(Text::new_log(error.to_string()))); } } } @@ -686,17 +792,21 @@ where pt.insert(&[column_number, row_number], datum); } } - output.push(Arc::new(pt.into())); + output.push(pt.into()); } - Item::new(Details::Group(output)) + Item::new(Details::Group(output.into_iter().map(Arc::new).collect())) } Err(error) => Item::new(Details::Text(Box::new(Text::new_log(error.to_string())))), }; let actual = output.to_string(); - if expected != actual && std::env::var("PSPP_REFRESH_EXPECTED").is_ok() { - std::fs::write(expected_filename, actual).unwrap(); - panic!("{}: refreshed output", expected_filename.display()); + if expected != actual { + if std::env::var("PSPP_REFRESH_EXPECTED").is_ok() { + std::fs::write(expected_filename, actual).unwrap(); + panic!("{}: refreshed output", expected_filename.display()); + } else { + eprintln!("note: rerun with PSPP_REFRESH_EXPECTED=1 to refresh expected 
output"); + } } assert_lines_eq(&expected, expected_filename.display(), &actual, "actual"); } diff --git a/rust/pspp/src/sys/testdata/attributes.expected b/rust/pspp/src/sys/testdata/attributes.expected index f0daafed95..577bea6123 100644 --- a/rust/pspp/src/sys/testdata/attributes.expected +++ b/rust/pspp/src/sys/testdata/attributes.expected @@ -13,6 +13,7 @@ │Variables│ 3│ ╰─────────┴────────────────────────╯ + Variables ╭──────────────┬────────┬─────┬─────────────────┬──────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role │Width│Alignment│Print Format│Write Format│Missing Values│ ├──────────────┼────────┼─────┼─────────────────┼──────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -21,6 +22,7 @@ │ThirdVariable │ 3│ │ │Input │ 8│Right │F8.0 │F8.0 │ │ ╰──────────────┴────────┴─────┴─────────────────┴──────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Data File and Variable Attributes ╭───────────────────────────────┬─────────────╮ │Variable and Name │ Value │ ├───────────────────────────────┼─────────────┤ diff --git a/rust/pspp/src/sys/testdata/bad_machine_float_info_size.expected b/rust/pspp/src/sys/testdata/bad_machine_float_info_size.expected index d2542a1194..b09ef9aa98 100644 --- a/rust/pspp/src/sys/testdata/bad_machine_float_info_size.expected +++ b/rust/pspp/src/sys/testdata/bad_machine_float_info_size.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0x100: In extension record: floating point recor │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/bad_machine_integer_info_count.expected 
b/rust/pspp/src/sys/testdata/bad_machine_integer_info_count.expected index 0587a131f8..c6e81da688 100644 --- a/rust/pspp/src/sys/testdata/bad_machine_integer_info_count.expected +++ b/rust/pspp/src/sys/testdata/bad_machine_integer_info_count.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0x104: In extension record: integer record has b │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/bad_machine_integer_info_endianness.expected b/rust/pspp/src/sys/testdata/bad_machine_integer_info_endianness.expected index e66463ad12..ca0229447c 100644 --- a/rust/pspp/src/sys/testdata/bad_machine_integer_info_endianness.expected +++ b/rust/pspp/src/sys/testdata/bad_machine_integer_info_endianness.expected @@ -14,6 +14,7 @@ Integer format indicated by system file (3) differs from expected ({endian}). 
│Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/bad_machine_integer_info_float_format.expected b/rust/pspp/src/sys/testdata/bad_machine_integer_info_float_format.expected index d25902e584..f040255705 100644 --- a/rust/pspp/src/sys/testdata/bad_machine_integer_info_float_format.expected +++ b/rust/pspp/src/sys/testdata/bad_machine_integer_info_float_format.expected @@ -14,6 +14,7 @@ Floating-point representation indicated by system file (2) differs from expected │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/bad_variable_name_in_long_string_value_label.expected b/rust/pspp/src/sys/testdata/bad_variable_name_in_long_string_value_label.expected index 0f9a086b6c..acdbb3f5a3 100644 --- a/rust/pspp/src/sys/testdata/bad_variable_name_in_long_string_value_label.expected +++ b/rust/pspp/src/sys/testdata/bad_variable_name_in_long_string_value_label.expected @@ -15,6 +15,7 @@ Ignoring long string value label for numeric variable NUM1. 
│Variables│2│ ╰─────────┴─╯ + Variables ╭─────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -22,6 +23,7 @@ Ignoring long string value label for numeric variable NUM1. │str14│ 2│ │Nominal │Input│ 14│Left │A14 │A14 │ │ ╰─────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Value Labels ╭─────────────────────────────┬────────────────────────────────────────╮ │Variable Value │ │ ├─────────────────────────────┼────────────────────────────────────────┤ diff --git a/rust/pspp/src/sys/testdata/bad_variable_name_in_variable_value_pair.expected b/rust/pspp/src/sys/testdata/bad_variable_name_in_variable_value_pair.expected index 13d3fb715e..673e178210 100644 --- a/rust/pspp/src/sys/testdata/bad_variable_name_in_variable_value_pair.expected +++ b/rust/pspp/src/sys/testdata/bad_variable_name_in_variable_value_pair.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe5: In long variable name record: Missing `=` │Variables│1│ ╰─────────┴─╯ + Variables ╭────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/bad_very_long_string_length.expected b/rust/pspp/src/sys/testdata/bad_very_long_string_length.expected index 31dafee3ba..7d391c0323 100644 --- a/rust/pspp/src/sys/testdata/bad_very_long_string_length.expected +++ b/rust/pspp/src/sys/testdata/bad_very_long_string_length.expected @@ -17,6 +17,7 @@ Variable with short name NUM1 listed in very long string record with width 256 
r │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/bad_very_long_string_segment_width.expected b/rust/pspp/src/sys/testdata/bad_very_long_string_segment_width.expected index 75d706ab95..6ba24735bd 100644 --- a/rust/pspp/src/sys/testdata/bad_very_long_string_segment_width.expected +++ b/rust/pspp/src/sys/testdata/bad_very_long_string_segment_width.expected @@ -15,6 +15,7 @@ Variable with short name STR1 listed in very long string record with width 256 h │Variables│2│ ╰─────────┴─╯ + Variables ╭──────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/compressed_data.expected b/rust/pspp/src/sys/testdata/compressed_data.expected index 624955c00d..1cf5d48e0f 100644 --- a/rust/pspp/src/sys/testdata/compressed_data.expected +++ b/rust/pspp/src/sys/testdata/compressed_data.expected @@ -12,6 +12,7 @@ │Variables│ 5│ ╰─────────┴────────────────────────╯ + Variables ╭─────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/compressed_data_other_bias.expected b/rust/pspp/src/sys/testdata/compressed_data_other_bias.expected index 21cd534702..372a7f9654 100644 --- 
a/rust/pspp/src/sys/testdata/compressed_data_other_bias.expected +++ b/rust/pspp/src/sys/testdata/compressed_data_other_bias.expected @@ -14,6 +14,7 @@ Warning at file offsets 0x54 to 0x5c: In file header: Compression bias is 50 ins │Variables│ 5│ ╰─────────┴────────────────────────╯ + Variables ╭─────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/compressed_data_zero_bias.expected b/rust/pspp/src/sys/testdata/compressed_data_zero_bias.expected index 96d6e5ac76..9d6989cdf7 100644 --- a/rust/pspp/src/sys/testdata/compressed_data_zero_bias.expected +++ b/rust/pspp/src/sys/testdata/compressed_data_zero_bias.expected @@ -12,6 +12,7 @@ │Variables│ 5│ ╰─────────┴────────────────────────╯ + Variables ╭─────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/documents.expected b/rust/pspp/src/sys/testdata/documents.expected index 42cc6e4c04..762e204069 100644 --- a/rust/pspp/src/sys/testdata/documents.expected +++ b/rust/pspp/src/sys/testdata/documents.expected @@ -17,6 +17,7 @@ │ │Last line of documents │ ╰─────────┴───────────────────────────────────────────────────────╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git 
a/rust/pspp/src/sys/testdata/duplicate_attribute_name.expected b/rust/pspp/src/sys/testdata/duplicate_attribute_name.expected index 3a70ef81ca..78d1ec0b83 100644 --- a/rust/pspp/src/sys/testdata/duplicate_attribute_name.expected +++ b/rust/pspp/src/sys/testdata/duplicate_attribute_name.expected @@ -15,12 +15,14 @@ Warning at file offsets 0x10e to 0x12d: In file or variable attribute record: Du │Variables│1│ ╰─────────┴─╯ + Variables ╭────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ │firstvar│ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ ╰────────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ +Data File and Variable Attributes ╭───────────────────────┬─────╮ │Variable and Name │Value│ ├───────────────────────┼─────┤ diff --git a/rust/pspp/src/sys/testdata/duplicate_long_variable_name.expected b/rust/pspp/src/sys/testdata/duplicate_long_variable_name.expected index 6f17a31f67..a169e66ead 100644 --- a/rust/pspp/src/sys/testdata/duplicate_long_variable_name.expected +++ b/rust/pspp/src/sys/testdata/duplicate_long_variable_name.expected @@ -19,6 +19,7 @@ Duplicate long variable name LONGVARIABLENAME. 
│Variables│4│ ╰─────────┴─╯ + Variables ╭────────────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────────────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/duplicate_value_labels_type.expected b/rust/pspp/src/sys/testdata/duplicate_value_labels_type.expected index e6416d44f3..010bb05806 100644 --- a/rust/pspp/src/sys/testdata/duplicate_value_labels_type.expected +++ b/rust/pspp/src/sys/testdata/duplicate_value_labels_type.expected @@ -15,6 +15,7 @@ NUM1 has duplicate value labels for the following value(s): 1 │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -22,6 +23,7 @@ NUM1 has duplicate value labels for the following value(s): 1 │num1│ 2│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ ╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Value Labels ╭─────────────────────┬───╮ │Variable Value │ │ ├─────────────────────┼───┤ diff --git a/rust/pspp/src/sys/testdata/duplicate_variable_name.expected b/rust/pspp/src/sys/testdata/duplicate_variable_name.expected index ac64a478c3..581bcf61d9 100644 --- a/rust/pspp/src/sys/testdata/duplicate_variable_name.expected +++ b/rust/pspp/src/sys/testdata/duplicate_variable_name.expected @@ -13,6 +13,7 @@ Renaming variable with duplicate name VAR1 to VAR001. 
│Variables│2│ ╰─────────┴─╯ + Variables ╭──────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/empty_document_record.expected b/rust/pspp/src/sys/testdata/empty_document_record.expected index 47d2fac4a4..0bf233ca67 100644 --- a/rust/pspp/src/sys/testdata/empty_document_record.expected +++ b/rust/pspp/src/sys/testdata/empty_document_record.expected @@ -13,6 +13,7 @@ │Variables│ 1│ ╰─────────┴────────────────────────╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/extra_product_info.expected b/rust/pspp/src/sys/testdata/extra_product_info.expected index 63ef54fde9..f5d25a70dd 100644 --- a/rust/pspp/src/sys/testdata/extra_product_info.expected +++ b/rust/pspp/src/sys/testdata/extra_product_info.expected @@ -16,6 +16,7 @@ │Variables│ 4│ ╰─────────┴──────────────╯ + Variables ╭─┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/fewer_data_records_than_indicated_by_file_header.expected b/rust/pspp/src/sys/testdata/fewer_data_records_than_indicated_by_file_header.expected index 41eca0a266..70c4d64598 100644 --- a/rust/pspp/src/sys/testdata/fewer_data_records_than_indicated_by_file_header.expected 
+++ b/rust/pspp/src/sys/testdata/fewer_data_records_than_indicated_by_file_header.expected @@ -11,6 +11,7 @@ │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/integer_overflows_in_long_string_missing_values.expected b/rust/pspp/src/sys/testdata/integer_overflows_in_long_string_missing_values.expected index ea2d86612b..8b1faeb1aa 100644 --- a/rust/pspp/src/sys/testdata/integer_overflows_in_long_string_missing_values.expected +++ b/rust/pspp/src/sys/testdata/integer_overflows_in_long_string_missing_values.expected @@ -18,6 +18,7 @@ This system file does not indicate its own character encoding. For best results │Variables│4│ ╰─────────┴─╯ + Variables ╭────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/invalid_long_string_missing_values.expected b/rust/pspp/src/sys/testdata/invalid_long_string_missing_values.expected index 654448b131..086d49847e 100644 --- a/rust/pspp/src/sys/testdata/invalid_long_string_missing_values.expected +++ b/rust/pspp/src/sys/testdata/invalid_long_string_missing_values.expected @@ -25,6 +25,7 @@ Invalid long string missing value for 7-byte string variable STR4. 
│Variables│ 5│ ╰─────────┴──────────────────────────────╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────────────────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│ Missing Values │ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────────────────────────┤ diff --git a/rust/pspp/src/sys/testdata/invalid_variable_format.expected b/rust/pspp/src/sys/testdata/invalid_variable_format.expected index 9969410ab1..05d55df674 100644 --- a/rust/pspp/src/sys/testdata/invalid_variable_format.expected +++ b/rust/pspp/src/sys/testdata/invalid_variable_format.expected @@ -27,6 +27,7 @@ Substituting A4 for invalid write format on variable STR2. String variable with │Variables│4│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/invalid_variable_name.expected b/rust/pspp/src/sys/testdata/invalid_variable_name.expected index 9ac792f6ba..ee97fa6706 100644 --- a/rust/pspp/src/sys/testdata/invalid_variable_name.expected +++ b/rust/pspp/src/sys/testdata/invalid_variable_name.expected @@ -19,6 +19,7 @@ │Variables│4│ ╰─────────┴─╯ + Variables ╭──────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/long_variable_names.expected b/rust/pspp/src/sys/testdata/long_variable_names.expected index 23b123429a..88dde68c52 100644 --- 
a/rust/pspp/src/sys/testdata/long_variable_names.expected +++ b/rust/pspp/src/sys/testdata/long_variable_names.expected @@ -13,6 +13,7 @@ │Variables│ 7│ ╰─────────┴────────────────────────╯ + Variables ╭─────────────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─────────────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/missing_attribute_value.expected b/rust/pspp/src/sys/testdata/missing_attribute_value.expected index 141a5a0bc7..90adcda997 100644 --- a/rust/pspp/src/sys/testdata/missing_attribute_value.expected +++ b/rust/pspp/src/sys/testdata/missing_attribute_value.expected @@ -15,6 +15,7 @@ Warning at file offsets 0xf6 to 0x109: In file or variable attribute record: Att │Variables│1│ ╰─────────┴─╯ + Variables ╭────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/missing_newline_after_variable_name_in_mrsets.expected b/rust/pspp/src/sys/testdata/missing_newline_after_variable_name_in_mrsets.expected index ca1145046a..7aef1fdfec 100644 --- a/rust/pspp/src/sys/testdata/missing_newline_after_variable_name_in_mrsets.expected +++ b/rust/pspp/src/sys/testdata/missing_newline_after_variable_name_in_mrsets.expected @@ -13,6 +13,7 @@ Multiple response set $a has only one variable. 
│Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/missing_string_continuation.expected b/rust/pspp/src/sys/testdata/missing_string_continuation.expected index 3bb98b0f40..6630473a18 100644 --- a/rust/pspp/src/sys/testdata/missing_string_continuation.expected +++ b/rust/pspp/src/sys/testdata/missing_string_continuation.expected @@ -13,6 +13,7 @@ Variable index 0 is a 10-byte string that should be followed by long string cont │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/mixed_variable_types_in_mrsets.expected b/rust/pspp/src/sys/testdata/mixed_variable_types_in_mrsets.expected index b9a1e3a4c3..4c86e6f7ca 100644 --- a/rust/pspp/src/sys/testdata/mixed_variable_types_in_mrsets.expected +++ b/rust/pspp/src/sys/testdata/mixed_variable_types_in_mrsets.expected @@ -13,6 +13,7 @@ Multiple response set $a contains both string and numeric variables. 
│Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/more_data_records_than_indicated_by_file_header.expected b/rust/pspp/src/sys/testdata/more_data_records_than_indicated_by_file_header.expected index cbe0398bf4..16d81d3cc7 100644 --- a/rust/pspp/src/sys/testdata/more_data_records_than_indicated_by_file_header.expected +++ b/rust/pspp/src/sys/testdata/more_data_records_than_indicated_by_file_header.expected @@ -11,6 +11,7 @@ │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_documents_records.expected b/rust/pspp/src/sys/testdata/multiple_documents_records.expected index 414d01908c..2f45331637 100644 --- a/rust/pspp/src/sys/testdata/multiple_documents_records.expected +++ b/rust/pspp/src/sys/testdata/multiple_documents_records.expected @@ -13,6 +13,7 @@ │ │One line of documents│ ╰─────────┴─────────────────────╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets.expected b/rust/pspp/src/sys/testdata/multiple_response_sets.expected index 47c8a9b46c..6bacc195f3 100644 --- 
a/rust/pspp/src/sys/testdata/multiple_response_sets.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets.expected @@ -13,6 +13,7 @@ │Variables│ 16│ ╰─────────┴────────────────────────╯ + Variables ╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -34,6 +35,7 @@ │p │ 16│ │Nominal │Input│ 6│Left │A6 │A6 │ │ ╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Multiple Response Sets ╭────┬─────────────┬───────────┬─────────────┬────────────────╮ │Name│ Label │ Encoding │Counted Value│Member Variables│ ├────┼─────────────┼───────────┼─────────────┼────────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_bad_counted_string.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_bad_counted_string.expected index 9b849999e5..e418bef4cc 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_bad_counted_string.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_bad_counted_string.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe5: In multiple response set record: Syntax er │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_bad_name.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_bad_name.expected index 6793e1b229..0e7f37317d 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_bad_name.expected +++ 
b/rust/pspp/src/sys/testdata/multiple_response_sets_bad_name.expected @@ -17,6 +17,7 @@ Invalid multiple response set name. Multiple response set name "e" does not beg │Variables│ 16│ ╰─────────┴────────────────────────╯ + Variables ╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -38,6 +39,7 @@ Invalid multiple response set name. Multiple response set name "e" does not beg │p │ 16│ │Nominal │Input│ 6│Left │A6 │A6 │ │ ╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Multiple Response Sets ╭────┬─────────────┬───────────┬─────────────┬────────────────╮ │Name│ Label │ Encoding │Counted Value│Member Variables│ ├────┼─────────────┼───────────┼─────────────┼────────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_counted_string_bad_length.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_counted_string_bad_length.expected index 8fb97dbb0c..9d0a063f51 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_counted_string_bad_length.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_counted_string_bad_length.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe9: In multiple response set record: Syntax er │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_counted_string_missing_space.expected 
b/rust/pspp/src/sys/testdata/multiple_response_sets_counted_string_missing_space.expected index f6e6e03234..8cbd13cc55 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_counted_string_missing_space.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_counted_string_missing_space.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe6: In multiple response set record: Syntax er │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_duplicate_variable_name.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_duplicate_variable_name.expected index e5a0854dc6..a43645718e 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_duplicate_variable_name.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_duplicate_variable_name.expected @@ -15,6 +15,7 @@ Multiple response set $a has only one variable. 
│Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_label_source.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_label_source.expected index fe138f630b..9212037f06 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_label_source.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_label_source.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe5: In multiple response set record: Invalid m │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_newline_after_variable_name.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_newline_after_variable_name.expected index d4c8ec03cb..a4535ccd20 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_newline_after_variable_name.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_newline_after_variable_name.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xec: In multiple response set record: Syntax er │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ 
├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_c.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_c.expected index 5c60f29d73..beca7dd9f0 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_c.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_c.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe5: In multiple response set record: Syntax er │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_counted_string.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_counted_string.expected index 05d8d77537..bdaee73a02 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_counted_string.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_counted_string.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xea: In multiple response set record: Syntax er │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_e.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_e.expected 
index fe138f630b..9212037f06 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_e.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_missing_space_after_e.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe5: In multiple response set record: Invalid m │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/multiple_response_sets_unexpected_label_source.expected b/rust/pspp/src/sys/testdata/multiple_response_sets_unexpected_label_source.expected index 4b0d62e285..c71d6c8949 100644 --- a/rust/pspp/src/sys/testdata/multiple_response_sets_unexpected_label_source.expected +++ b/rust/pspp/src/sys/testdata/multiple_response_sets_unexpected_label_source.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe6: In multiple response set record: Invalid m │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/no_variables.expected b/rust/pspp/src/sys/testdata/no_variables.expected index b10e8d94b3..3580af6a4b 100644 --- a/rust/pspp/src/sys/testdata/no_variables.expected +++ b/rust/pspp/src/sys/testdata/no_variables.expected @@ -12,3 +12,4 @@ │Variables│ 0│ ╰─────────┴────────────────────────╯ + Variables diff --git a/rust/pspp/src/sys/testdata/null_dereference_skipping_bad_extension_record_18.expected 
b/rust/pspp/src/sys/testdata/null_dereference_skipping_bad_extension_record_18.expected index 64940a3880..1cad5bffd3 100644 --- a/rust/pspp/src/sys/testdata/null_dereference_skipping_bad_extension_record_18.expected +++ b/rust/pspp/src/sys/testdata/null_dereference_skipping_bad_extension_record_18.expected @@ -15,6 +15,7 @@ This system file does not indicate its own character encoding. For best results │Variables│4│ ╰─────────┴─╯ + Variables ╭────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -24,6 +25,7 @@ This system file does not indicate its own character encoding. For best results │VAR00004│ 4│ │Nominal │Input│ 8│Left │A1 │A1 │ │ ╰────────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Value Labels ╭───────────────────┬──────────╮ │Variable Value │ │ ├───────────────────┼──────────┤ diff --git a/rust/pspp/src/sys/testdata/partial_compressed_data_record.expected b/rust/pspp/src/sys/testdata/partial_compressed_data_record.expected index d0cc32ccf9..d7b0a07ee7 100644 --- a/rust/pspp/src/sys/testdata/partial_compressed_data_record.expected +++ b/rust/pspp/src/sys/testdata/partial_compressed_data_record.expected @@ -12,6 +12,7 @@ │Variables│ 5│ ╰─────────┴────────────────────────╯ + Variables ╭─────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -22,7 +23,7 @@ │str15│ 5│ │Nominal │Input│ 15│Left │A15 │A15 │ │ 
╰─────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ -Unexpected end of file 0 bytes and 2 compression chunks into a compressed case. +Unexpected end of file 0 bytes and 2 compression chunks into compressed case 2. ╭────┬──────┬────┬────┬────────┬───────────────╮ │Case│ num1 │num2│str4│ str8 │ str15 │ diff --git a/rust/pspp/src/sys/testdata/partial_data_record_between_variables.expected b/rust/pspp/src/sys/testdata/partial_data_record_between_variables.expected index 813c06e834..021b2b3ff6 100644 --- a/rust/pspp/src/sys/testdata/partial_data_record_between_variables.expected +++ b/rust/pspp/src/sys/testdata/partial_data_record_between_variables.expected @@ -11,6 +11,7 @@ │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -18,7 +19,7 @@ │num2│ 2│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ ╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ -Error at file offsets 0x124 to 0x12c: Unexpected end of file 8 bytes into a 16-byte case. +Error at file offsets 0x124 to 0x12c: Unexpected end of file 8 bytes into case 2 with expected length 16 bytes. 
╭────┬────┬────╮ │Case│num1│num2│ diff --git a/rust/pspp/src/sys/testdata/partial_data_record_within_long_string.expected b/rust/pspp/src/sys/testdata/partial_data_record_within_long_string.expected index b1c0844998..b0754f43a0 100644 --- a/rust/pspp/src/sys/testdata/partial_data_record_within_long_string.expected +++ b/rust/pspp/src/sys/testdata/partial_data_record_within_long_string.expected @@ -11,6 +11,7 @@ │Variables│1│ ╰─────────┴─╯ + Variables ╭─────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/test-encrypted.expected b/rust/pspp/src/sys/testdata/test-encrypted.expected index e98ba62f71..840189423b 100644 --- a/rust/pspp/src/sys/testdata/test-encrypted.expected +++ b/rust/pspp/src/sys/testdata/test-encrypted.expected @@ -12,6 +12,7 @@ │Variables│5│ ╰─────────┴─╯ + Variables ╭────────────────────────────────────────────────────┬────────┬────────────────────────────────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│ Label │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────────────────────────────────────────────────────┼────────┼────────────────────────────────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -22,6 +23,7 @@ │There was too much noise in the rooms │ 5│There was too much noise in the rooms │Ordinal │Input│ 8│Right │F8.0 │F8.0 │ │ ╰────────────────────────────────────────────────────┴────────┴────────────────────────────────────────────────────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Value Labels 
╭──────────────────────────────────────────────────────┬─────────────────╮ │Variable Value │ │ ├──────────────────────────────────────────────────────┼─────────────────┤ @@ -56,6 +58,7 @@ │ 5│Strongly Agree │ ╰──────────────────────────────────────────────────────┴─────────────────╯ + Data File and Variable Attributes ╭───────────────────────────────────────────────────────────┬─────╮ │Variable and Name │Value│ ├───────────────────────────────────────────────────────────┼─────┤ diff --git a/rust/pspp/src/sys/testdata/type_4_record_names_long_string_variable.expected b/rust/pspp/src/sys/testdata/type_4_record_names_long_string_variable.expected index ba37a94432..c91be624b3 100644 --- a/rust/pspp/src/sys/testdata/type_4_record_names_long_string_variable.expected +++ b/rust/pspp/src/sys/testdata/type_4_record_names_long_string_variable.expected @@ -13,6 +13,7 @@ At offsets 0xf4...0x114, record types 3 and 4 may not add value labels to one or │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/unknown_encoding.expected b/rust/pspp/src/sys/testdata/unknown_encoding.expected index a02e3abb6a..3d620f7241 100644 --- a/rust/pspp/src/sys/testdata/unknown_encoding.expected +++ b/rust/pspp/src/sys/testdata/unknown_encoding.expected @@ -16,6 +16,7 @@ Text string contains invalid bytes for UTF-8 encoding: "PSPP synthetic test file │Variables│ 4│ ╰─────────┴──────────────────────────────╯ + Variables ╭─┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ 
├─┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/unknown_extension_record.expected b/rust/pspp/src/sys/testdata/unknown_extension_record.expected index 3d4be23ca9..bf775efd47 100644 --- a/rust/pspp/src/sys/testdata/unknown_extension_record.expected +++ b/rust/pspp/src/sys/testdata/unknown_extension_record.expected @@ -13,6 +13,7 @@ Unknown extension record with subtype 30 at offset 0xe0, consisting of 1 1-byte │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/unquoted_attribute_value.expected b/rust/pspp/src/sys/testdata/unquoted_attribute_value.expected index 5a45874292..22a368ee06 100644 --- a/rust/pspp/src/sys/testdata/unquoted_attribute_value.expected +++ b/rust/pspp/src/sys/testdata/unquoted_attribute_value.expected @@ -15,12 +15,14 @@ Warning at file offsets 0xfd to 0x10f: In file or variable attribute record: Att │Variables│1│ ╰─────────┴─╯ + Variables ╭────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ │firstvar│ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ ╰────────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ +Data File and Variable Attributes ╭───────────────────────┬─────╮ │Variable and Name │Value│ ├───────────────────────┼─────┤ diff --git a/rust/pspp/src/sys/testdata/unspecified_number_of_variable_positions.expected 
b/rust/pspp/src/sys/testdata/unspecified_number_of_variable_positions.expected index 12cfebb0c6..ed9a324b8f 100644 --- a/rust/pspp/src/sys/testdata/unspecified_number_of_variable_positions.expected +++ b/rust/pspp/src/sys/testdata/unspecified_number_of_variable_positions.expected @@ -12,6 +12,7 @@ │Variables│ 2│ ╰─────────┴────────────────────────╯ + Variables ╭──────────────────────────┬────────┬──────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│ Label │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──────────────────────────┼────────┼──────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/value_label_variable_indexes_must_be_in_correct_range.expected b/rust/pspp/src/sys/testdata/value_label_variable_indexes_must_be_in_correct_range.expected index 8469f599a8..ceb9b71ee6 100644 --- a/rust/pspp/src/sys/testdata/value_label_variable_indexes_must_be_in_correct_range.expected +++ b/rust/pspp/src/sys/testdata/value_label_variable_indexes_must_be_in_correct_range.expected @@ -17,6 +17,7 @@ Warning at file offsets 0x160 to 0x168: In value label record: One or more varia │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/value_label_variable_indexes_must_not_be_long_string_continuation.expected b/rust/pspp/src/sys/testdata/value_label_variable_indexes_must_not_be_long_string_continuation.expected index 54880f8648..67c64a991e 100644 --- a/rust/pspp/src/sys/testdata/value_label_variable_indexes_must_not_be_long_string_continuation.expected +++ 
b/rust/pspp/src/sys/testdata/value_label_variable_indexes_must_not_be_long_string_continuation.expected @@ -13,6 +13,7 @@ Warning at file offsets 0x110 to 0x114: In value label record: One or more varia │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/value_label_with_no_associated_variables.expected b/rust/pspp/src/sys/testdata/value_label_with_no_associated_variables.expected index 1230c0a499..91ea4380dd 100644 --- a/rust/pspp/src/sys/testdata/value_label_with_no_associated_variables.expected +++ b/rust/pspp/src/sys/testdata/value_label_with_no_associated_variables.expected @@ -15,6 +15,7 @@ This system file does not indicate its own character encoding. For best results │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/value_labels.expected b/rust/pspp/src/sys/testdata/value_labels.expected index 54bd0ec5f0..50932daee1 100644 --- a/rust/pspp/src/sys/testdata/value_labels.expected +++ b/rust/pspp/src/sys/testdata/value_labels.expected @@ -13,6 +13,7 @@ │Variables│ 17│ ╰─────────┴────────────────────────╯ + Variables ╭─────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ 
├─────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -35,6 +36,7 @@ │str17│ 17│ │Nominal │Input│ 17│Left │A17 │A17 │ │ ╰─────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Value Labels ╭────────────────────────────────┬───────────────────────────────────────────────────────────────╮ │Variable Value │ │ ├────────────────────────────────┼───────────────────────────────────────────────────────────────┤ diff --git a/rust/pspp/src/sys/testdata/variable_display_with_width.expected b/rust/pspp/src/sys/testdata/variable_display_with_width.expected index a90f718cbf..898c8350f2 100644 --- a/rust/pspp/src/sys/testdata/variable_display_with_width.expected +++ b/rust/pspp/src/sys/testdata/variable_display_with_width.expected @@ -12,6 +12,7 @@ │Variables│ 12│ ╰─────────┴────────────────────────╯ + Variables ╭─┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/variable_display_without_width.expected b/rust/pspp/src/sys/testdata/variable_display_without_width.expected index 97aa90c9ea..e810dcd53f 100644 --- a/rust/pspp/src/sys/testdata/variable_display_without_width.expected +++ b/rust/pspp/src/sys/testdata/variable_display_without_width.expected @@ -12,6 +12,7 @@ │Variables│ 12│ ╰─────────┴────────────────────────╯ + Variables ╭─┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git 
a/rust/pspp/src/sys/testdata/variable_labels_and_missing_values.expected b/rust/pspp/src/sys/testdata/variable_labels_and_missing_values.expected index 2648723946..6d6b7a751d 100644 --- a/rust/pspp/src/sys/testdata/variable_labels_and_missing_values.expected +++ b/rust/pspp/src/sys/testdata/variable_labels_and_missing_values.expected @@ -13,6 +13,7 @@ │Variables│ 21│ ╰─────────┴──────────────────────────────╯ + Variables ╭────────────────────────────────┬────────┬────────────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬───────────────────────────╮ │ │Position│ Label │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│ Missing Values │ ├────────────────────────────────┼────────┼────────────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼───────────────────────────┤ diff --git a/rust/pspp/src/sys/testdata/variable_roles.expected b/rust/pspp/src/sys/testdata/variable_roles.expected index ce3c4cc80f..5825b90c08 100644 --- a/rust/pspp/src/sys/testdata/variable_roles.expected +++ b/rust/pspp/src/sys/testdata/variable_roles.expected @@ -15,6 +15,7 @@ Unknown role "6". │Variables│ 7│ ╰─────────┴────────────────────────╯ + Variables ╭─┬────────┬─────┬─────────────────┬─────────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role │Width│Alignment│Print Format│Write Format│Missing Values│ ├─┼────────┼─────┼─────────────────┼─────────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -27,6 +28,7 @@ Unknown role "6". 
│x│ 7│ │ │Input │ 8│Right │F8.0 │F8.0 │ │ ╰─┴────────┴─────┴─────────────────┴─────────┴─────┴─────────┴────────────┴────────────┴──────────────╯ +Data File and Variable Attributes ╭────────────────────────┬─────╮ │Variable and Name │Value│ ├────────────────────────┼─────┤ diff --git a/rust/pspp/src/sys/testdata/variable_sets.expected b/rust/pspp/src/sys/testdata/variable_sets.expected index 982db894ea..4832fda179 100644 --- a/rust/pspp/src/sys/testdata/variable_sets.expected +++ b/rust/pspp/src/sys/testdata/variable_sets.expected @@ -13,6 +13,7 @@ │Variables│ 10│ ╰─────────┴────────────────────────╯ + Variables ╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -28,6 +29,7 @@ │j │ 10│ │Nominal │Input│ 4│Left │A4 │A4 │ │ ╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Variable Sets ╭─────────────────────────────┬────────╮ │Variable Set and Position │Variable│ ├─────────────────────────────┼────────┤ diff --git a/rust/pspp/src/sys/testdata/variable_sets_unknown_variable.expected b/rust/pspp/src/sys/testdata/variable_sets_unknown_variable.expected index bf63c319e6..48f024822e 100644 --- a/rust/pspp/src/sys/testdata/variable_sets_unknown_variable.expected +++ b/rust/pspp/src/sys/testdata/variable_sets_unknown_variable.expected @@ -17,6 +17,7 @@ Variable set "vs2" includes unknown variable foo. 
│Variables│ 10│ ╰─────────┴────────────────────────╯ + Variables ╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -32,6 +33,7 @@ Variable set "vs2" includes unknown variable foo. │j │ 10│ │Nominal │Input│ 4│Left │A4 │A4 │ │ ╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Variable Sets ╭───────────────────────────┬────────╮ │Variable Set and Position │Variable│ ├───────────────────────────┼────────┤ diff --git a/rust/pspp/src/sys/testdata/variables_for_value_label_must_all_be_same_type.expected b/rust/pspp/src/sys/testdata/variables_for_value_label_must_all_be_same_type.expected index 0723dc8d92..4302cc3c62 100644 --- a/rust/pspp/src/sys/testdata/variables_for_value_label_must_all_be_same_type.expected +++ b/rust/pspp/src/sys/testdata/variables_for_value_label_must_all_be_same_type.expected @@ -13,6 +13,7 @@ Warning at file offsets 0x110 to 0x118: In value label record: First variable in │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ @@ -20,6 +21,7 @@ Warning at file offsets 0x110 to 0x118: In value label record: First variable in │num1│ 2│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ ╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + Value Labels ╭─────────────────────┬───╮ │Variable Value │ │ ├─────────────────────┼───┤ diff --git a/rust/pspp/src/sys/testdata/very_long_strings.expected 
b/rust/pspp/src/sys/testdata/very_long_strings.expected index 3024855129..35b06625cb 100644 --- a/rust/pspp/src/sys/testdata/very_long_strings.expected +++ b/rust/pspp/src/sys/testdata/very_long_strings.expected @@ -13,6 +13,7 @@ │Variables│ 2│ ╰─────────┴────────────────────────╯ + Variables ╭──────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/weight_must_be_numeric.expected b/rust/pspp/src/sys/testdata/weight_must_be_numeric.expected index a77a5d4fdc..4e1bf58cad 100644 --- a/rust/pspp/src/sys/testdata/weight_must_be_numeric.expected +++ b/rust/pspp/src/sys/testdata/weight_must_be_numeric.expected @@ -13,6 +13,7 @@ File designates string variable STR1 (index 2) as weight variable, but weight va │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/weight_variable_bad_index.expected b/rust/pspp/src/sys/testdata/weight_variable_bad_index.expected index e4b2b8f514..dae8fb9619 100644 --- a/rust/pspp/src/sys/testdata/weight_variable_bad_index.expected +++ b/rust/pspp/src/sys/testdata/weight_variable_bad_index.expected @@ -13,6 +13,7 @@ File weight variable index 3 is invalid because it exceeds maximum variable inde │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing 
Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/weight_variable_continuation.expected b/rust/pspp/src/sys/testdata/weight_variable_continuation.expected index 9dc6cf3df7..da6403f16b 100644 --- a/rust/pspp/src/sys/testdata/weight_variable_continuation.expected +++ b/rust/pspp/src/sys/testdata/weight_variable_continuation.expected @@ -13,6 +13,7 @@ File weight variable index 2 is invalid because it refers to long string continu │Variables│2│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/write-numeric-simple.expected b/rust/pspp/src/sys/testdata/write-numeric-simple.expected new file mode 100644 index 0000000000..4dafcc388f --- /dev/null +++ b/rust/pspp/src/sys/testdata/write-numeric-simple.expected @@ -0,0 +1,49 @@ +╭──────────────────────┬────────────────────╮ +│ Created │30-JUL-2025 15:07:55│ +├──────────────────────┼────────────────────┤ +│Writer Product │PSPP TEST DATA FILE │ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────┤ +│ Compression │SAV │ +│ Number of Cases│ 8│ +╰──────────────────────┴────────────────────╯ + +╭─────────┬─╮ +│Variables│4│ +╰─────────┴─╯ + + Variables +╭─────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├─────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│variable0│ 1│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│variable1│ 2│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│variable2│ 3│ │ │Input│ 8│Right │F8.2 
│F8.2 │ │ +│variable3│ 4│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +╰─────────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +Data File and Variable Attributes +╭────────────────────────┬─────╮ +│Variable and Name │Value│ +├────────────────────────┼─────┤ +│variable0 $@Role│0 │ +├────────────────────────┼─────┤ +│variable1 $@Role│0 │ +├────────────────────────┼─────┤ +│variable2 $@Role│0 │ +├────────────────────────┼─────┤ +│variable3 $@Role│0 │ +╰────────────────────────┴─────╯ + +╭────┬─────────┬─────────┬─────────┬─────────╮ +│Case│variable0│variable1│variable2│variable3│ +├────┼─────────┼─────────┼─────────┼─────────┤ +│1 │ 1.00│ 1.00│ 1.00│ 2.00│ +│2 │ 1.00│ 1.00│ 2.00│ 30.00│ +│3 │ 1.00│ 2.00│ 1.00│ 8.00│ +│4 │ 1.00│ 2.00│ 2.00│ 20.00│ +│5 │ 2.00│ 1.00│ 1.00│ 2.00│ +│6 │ 2.00│ 1.00│ 2.00│ 22.00│ +│7 │ 2.00│ 2.00│ 1.00│ 1.00│ +│8 │ 2.00│ 2.00│ 2.00│ 3.00│ +╰────┴─────────┴─────────┴─────────┴─────────╯ diff --git a/rust/pspp/src/sys/testdata/write-numeric-uncompressed.expected b/rust/pspp/src/sys/testdata/write-numeric-uncompressed.expected new file mode 100644 index 0000000000..ac6eae1832 --- /dev/null +++ b/rust/pspp/src/sys/testdata/write-numeric-uncompressed.expected @@ -0,0 +1,49 @@ +╭──────────────────────┬────────────────────╮ +│ Created │30-JUL-2025 15:07:55│ +├──────────────────────┼────────────────────┤ +│Writer Product │PSPP TEST DATA FILE │ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────┤ +│ Compression │None │ +│ Number of Cases│ 8│ +╰──────────────────────┴────────────────────╯ + +╭─────────┬─╮ +│Variables│4│ +╰─────────┴─╯ + + Variables +╭─────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├─────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│variable0│ 1│ │ │Input│ 8│Right 
│F8.2 │F8.2 │ │ +│variable1│ 2│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│variable2│ 3│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│variable3│ 4│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +╰─────────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +Data File and Variable Attributes +╭────────────────────────┬─────╮ +│Variable and Name │Value│ +├────────────────────────┼─────┤ +│variable0 $@Role│0 │ +├────────────────────────┼─────┤ +│variable1 $@Role│0 │ +├────────────────────────┼─────┤ +│variable2 $@Role│0 │ +├────────────────────────┼─────┤ +│variable3 $@Role│0 │ +╰────────────────────────┴─────╯ + +╭────┬─────────┬─────────┬─────────┬─────────╮ +│Case│variable0│variable1│variable2│variable3│ +├────┼─────────┼─────────┼─────────┼─────────┤ +│1 │ 1.00│ 1.00│ 1.00│ 2.00│ +│2 │ 1.00│ 1.00│ 2.00│ 30.00│ +│3 │ 1.00│ 2.00│ 1.00│ 8.00│ +│4 │ 1.00│ 2.00│ 2.00│ 20.00│ +│5 │ 2.00│ 1.00│ 1.00│ 2.00│ +│6 │ 2.00│ 1.00│ 2.00│ 22.00│ +│7 │ 2.00│ 2.00│ 1.00│ 1.00│ +│8 │ 2.00│ 2.00│ 2.00│ 3.00│ +╰────┴─────────┴─────────┴─────────┴─────────╯ diff --git a/rust/pspp/src/sys/testdata/write-numeric-zlib.expected b/rust/pspp/src/sys/testdata/write-numeric-zlib.expected new file mode 100644 index 0000000000..971572b576 --- /dev/null +++ b/rust/pspp/src/sys/testdata/write-numeric-zlib.expected @@ -0,0 +1,49 @@ +╭──────────────────────┬────────────────────╮ +│ Created │30-JUL-2025 15:07:55│ +├──────────────────────┼────────────────────┤ +│Writer Product │PSPP TEST DATA FILE │ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────┤ +│ Compression │ZSAV │ +│ Number of Cases│ 8│ +╰──────────────────────┴────────────────────╯ + +╭─────────┬─╮ +│Variables│4│ +╰─────────┴─╯ + + Variables +╭─────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ 
+├─────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│variable0│ 1│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│variable1│ 2│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│variable2│ 3│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│variable3│ 4│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +╰─────────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +Data File and Variable Attributes +╭────────────────────────┬─────╮ +│Variable and Name │Value│ +├────────────────────────┼─────┤ +│variable0 $@Role│0 │ +├────────────────────────┼─────┤ +│variable1 $@Role│0 │ +├────────────────────────┼─────┤ +│variable2 $@Role│0 │ +├────────────────────────┼─────┤ +│variable3 $@Role│0 │ +╰────────────────────────┴─────╯ + +╭────┬─────────┬─────────┬─────────┬─────────╮ +│Case│variable0│variable1│variable2│variable3│ +├────┼─────────┼─────────┼─────────┼─────────┤ +│1 │ 1.00│ 1.00│ 1.00│ 2.00│ +│2 │ 1.00│ 1.00│ 2.00│ 30.00│ +│3 │ 1.00│ 2.00│ 1.00│ 8.00│ +│4 │ 1.00│ 2.00│ 2.00│ 20.00│ +│5 │ 2.00│ 1.00│ 1.00│ 2.00│ +│6 │ 2.00│ 1.00│ 2.00│ 22.00│ +│7 │ 2.00│ 2.00│ 1.00│ 1.00│ +│8 │ 2.00│ 2.00│ 2.00│ 3.00│ +╰────┴─────────┴─────────┴─────────┴─────────╯ diff --git a/rust/pspp/src/sys/testdata/write-string-simple.expected b/rust/pspp/src/sys/testdata/write-string-simple.expected new file mode 100644 index 0000000000..34eab53234 --- /dev/null +++ b/rust/pspp/src/sys/testdata/write-string-simple.expected @@ -0,0 +1,63 @@ +╭──────────────────────┬────────────────────╮ +│ Created │30-JUL-2025 15:07:55│ +├──────────────────────┼────────────────────┤ +│Writer Product │PSPP TEST DATA FILE │ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────┤ +│ Compression │SAV │ +│ Number of Cases│ 2│ +╰──────────────────────┴────────────────────╯ + +╭─────────┬─╮ +│Variables│5│ +╰─────────┴─╯ + + Variables +╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ 
│Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│s1 │ 1│ │Nominal │Input│ 1│Left │A1 │A1 │ │ +│s2 │ 2│ │Nominal │Input│ 2│Left │A2 │A2 │ │ +│s3 │ 3│ │Nominal │Input│ 3│Left │A3 │A3 │ │ +│s4 │ 4│ │Nominal │Input│ 9│Left │A9 │A9 │ │ +│s566│ 5│ │Nominal │Input│ 32│Left │A566 │A566 │ │ +╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +Data File and Variable Attributes +╭────────────────────────┬─────╮ +│Variable and Name │Value│ +├────────────────────────┼─────┤ +│s1 $@Role│0 │ +├────────────────────────┼─────┤ +│s2 $@Role│0 │ +├────────────────────────┼─────┤ +│s3 $@Role│0 │ +├────────────────────────┼─────┤ +│s4 $@Role│0 │ +├────────────────────────┼─────┤ +│s566 $@Role│0 │ +╰────────────────────────┴─────╯ + +╭────┬──┬──┬───┬─────────┬──────────────────────────────────────────────────────────────╮ +│Case│s1│s2│ s3│ s4 │ s566 │ +├────┼──┼──┼───┼─────────┼──────────────────────────────────────────────────────────────┤ +│1 │1 │1 │1 │xyzzyquux│abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│2 │1 │2 │1 │8 │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ 
│ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxx │ +╰────┴──┴──┴───┴─────────┴──────────────────────────────────────────────────────────────╯ diff --git a/rust/pspp/src/sys/testdata/write-string-uncompressed.expected b/rust/pspp/src/sys/testdata/write-string-uncompressed.expected new file mode 100644 index 0000000000..aaa9214564 --- /dev/null +++ b/rust/pspp/src/sys/testdata/write-string-uncompressed.expected @@ -0,0 +1,63 @@ +╭──────────────────────┬────────────────────╮ +│ Created │30-JUL-2025 15:07:55│ +├──────────────────────┼────────────────────┤ +│Writer Product │PSPP TEST DATA FILE │ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────┤ +│ Compression │None │ +│ Number of Cases│ 2│ +╰──────────────────────┴────────────────────╯ + +╭─────────┬─╮ +│Variables│5│ +╰─────────┴─╯ + + Variables +╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│s1 │ 1│ │Nominal │Input│ 1│Left │A1 │A1 │ │ +│s2 │ 2│ │Nominal │Input│ 2│Left │A2 │A2 │ │ +│s3 │ 3│ │Nominal │Input│ 3│Left │A3 │A3 │ │ +│s4 │ 4│ │Nominal │Input│ 9│Left │A9 │A9 │ │ +│s566│ 5│ │Nominal │Input│ 32│Left │A566 │A566 │ │ +╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +Data File and Variable 
Attributes +╭────────────────────────┬─────╮ +│Variable and Name │Value│ +├────────────────────────┼─────┤ +│s1 $@Role│0 │ +├────────────────────────┼─────┤ +│s2 $@Role│0 │ +├────────────────────────┼─────┤ +│s3 $@Role│0 │ +├────────────────────────┼─────┤ +│s4 $@Role│0 │ +├────────────────────────┼─────┤ +│s566 $@Role│0 │ +╰────────────────────────┴─────╯ + +╭────┬──┬──┬───┬─────────┬──────────────────────────────────────────────────────────────╮ +│Case│s1│s2│ s3│ s4 │ s566 │ +├────┼──┼──┼───┼─────────┼──────────────────────────────────────────────────────────────┤ +│1 │1 │1 │1 │xyzzyquux│abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│2 │1 │2 │1 │8 │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxx │ 
+╰────┴──┴──┴───┴─────────┴──────────────────────────────────────────────────────────────╯ diff --git a/rust/pspp/src/sys/testdata/write-string-zlib.expected b/rust/pspp/src/sys/testdata/write-string-zlib.expected new file mode 100644 index 0000000000..55aea6b552 --- /dev/null +++ b/rust/pspp/src/sys/testdata/write-string-zlib.expected @@ -0,0 +1,63 @@ +╭──────────────────────┬────────────────────╮ +│ Created │30-JUL-2025 15:07:55│ +├──────────────────────┼────────────────────┤ +│Writer Product │PSPP TEST DATA FILE │ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────┤ +│ Compression │ZSAV │ +│ Number of Cases│ 2│ +╰──────────────────────┴────────────────────╯ + +╭─────────┬─╮ +│Variables│5│ +╰─────────┴─╯ + + Variables +╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│s1 │ 1│ │Nominal │Input│ 1│Left │A1 │A1 │ │ +│s2 │ 2│ │Nominal │Input│ 2│Left │A2 │A2 │ │ +│s3 │ 3│ │Nominal │Input│ 3│Left │A3 │A3 │ │ +│s4 │ 4│ │Nominal │Input│ 9│Left │A9 │A9 │ │ +│s566│ 5│ │Nominal │Input│ 32│Left │A566 │A566 │ │ +╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +Data File and Variable Attributes +╭────────────────────────┬─────╮ +│Variable and Name │Value│ +├────────────────────────┼─────┤ +│s1 $@Role│0 │ +├────────────────────────┼─────┤ +│s2 $@Role│0 │ +├────────────────────────┼─────┤ +│s3 $@Role│0 │ +├────────────────────────┼─────┤ +│s4 $@Role│0 │ +├────────────────────────┼─────┤ +│s566 $@Role│0 │ +╰────────────────────────┴─────╯ + +╭────┬──┬──┬───┬─────────┬──────────────────────────────────────────────────────────────╮ +│Case│s1│s2│ s3│ s4 │ s566 │ +├────┼──┼──┼───┼─────────┼──────────────────────────────────────────────────────────────┤ +│1 │1 
│1 │1 │xyzzyquux│abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│ │ │ │ │ │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789│ +│2 │1 │2 │1 │8 │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx │ +│ │ │ │ │ │xxxxxxxxxxxxxxxxx │ +╰────┴──┴──┴───┴─────────┴──────────────────────────────────────────────────────────────╯ diff --git a/rust/pspp/src/sys/testdata/wrong_display_alignment.expected b/rust/pspp/src/sys/testdata/wrong_display_alignment.expected index 7628df79a2..fa6fa29ac0 100644 --- a/rust/pspp/src/sys/testdata/wrong_display_alignment.expected +++ b/rust/pspp/src/sys/testdata/wrong_display_alignment.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe8: In variable display record: Invalid variab │Variables│1│ ╰─────────┴─╯ + Variables 
╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/wrong_display_measurement_level.expected b/rust/pspp/src/sys/testdata/wrong_display_measurement_level.expected index 783766c84a..5d76ed0661 100644 --- a/rust/pspp/src/sys/testdata/wrong_display_measurement_level.expected +++ b/rust/pspp/src/sys/testdata/wrong_display_measurement_level.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xe8: In variable display record: Invalid variab │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/wrong_display_parameter_count.expected b/rust/pspp/src/sys/testdata/wrong_display_parameter_count.expected index cefc24fa1b..3617b645db 100644 --- a/rust/pspp/src/sys/testdata/wrong_display_parameter_count.expected +++ b/rust/pspp/src/sys/testdata/wrong_display_parameter_count.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xf0: In variable display record: Record contain │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/wrong_display_parameter_size.expected 
b/rust/pspp/src/sys/testdata/wrong_display_parameter_size.expected index 06a93186ed..e8fda3180c 100644 --- a/rust/pspp/src/sys/testdata/wrong_display_parameter_size.expected +++ b/rust/pspp/src/sys/testdata/wrong_display_parameter_size.expected @@ -13,6 +13,7 @@ Warning at file offsets 0xe0 to 0xf0: In extension record: variable display reco │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/wrong_special_floats.expected b/rust/pspp/src/sys/testdata/wrong_special_floats.expected index 6e55891088..b73def632a 100644 --- a/rust/pspp/src/sys/testdata/wrong_special_floats.expected +++ b/rust/pspp/src/sys/testdata/wrong_special_floats.expected @@ -17,6 +17,7 @@ System file specifies value 2.0 (0x1.0p1) as LOWEST but -1.7976931348623157e308 │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/wrong_variable_positions.expected b/rust/pspp/src/sys/testdata/wrong_variable_positions.expected index 7c01e4d720..34858dc921 100644 --- a/rust/pspp/src/sys/testdata/wrong_variable_positions.expected +++ b/rust/pspp/src/sys/testdata/wrong_variable_positions.expected @@ -13,6 +13,7 @@ File header claims 2 variable positions but 1 were read from file. 
│Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/wrong_variable_positions_but_v13.expected b/rust/pspp/src/sys/testdata/wrong_variable_positions_but_v13.expected index d701ada126..463a44e39b 100644 --- a/rust/pspp/src/sys/testdata/wrong_variable_positions_but_v13.expected +++ b/rust/pspp/src/sys/testdata/wrong_variable_positions_but_v13.expected @@ -13,6 +13,7 @@ │Variables│ 2│ ╰─────────┴────────────────────────╯ + Variables ╭──────────────────────────┬────────┬──────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│ Label │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├──────────────────────────┼────────┼──────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/zcompressed_data.expected b/rust/pspp/src/sys/testdata/zcompressed_data.expected index 834891e4f6..66f4355225 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data.expected @@ -12,6 +12,7 @@ │Variables│ 5│ ╰─────────┴────────────────────────╯ + Variables ╭─────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_bad_zheader_ofs.expected b/rust/pspp/src/sys/testdata/zcompressed_data_bad_zheader_ofs.expected index 
6c8ffda947..ef5d97d1e8 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_bad_zheader_ofs.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_bad_zheader_ofs.expected @@ -1 +1 @@ -Error at file offsets 0x194 to 0x1a0: ZLIB header's zlib_offset is 0x0 instead of expected 0x194. +Error at file offsets 0x194 to 0x1a0: Error reading ZLIB header: zlib_offset is 0x0 instead of expected 0x194. diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_bad_ztrailer_ofs.expected b/rust/pspp/src/sys/testdata/zcompressed_data_bad_ztrailer_ofs.expected index 83cbe679c8..ca2d02b3b9 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_bad_ztrailer_ofs.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_bad_ztrailer_ofs.expected @@ -1 +1 @@ -Error at file offsets 0x194 to 0x1a0: Impossible ztrailer_offset 0x0. +Error at file offsets 0x194 to 0x1a0: Error reading ZLIB header: Impossible ztrailer_offset 0x0. diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_compressed_sizes_don_t_add_up.expected b/rust/pspp/src/sys/testdata/zcompressed_data_compressed_sizes_don_t_add_up.expected index 6b728bdfe4..b99f89055b 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_compressed_sizes_don_t_add_up.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_compressed_sizes_don_t_add_up.expected @@ -1 +1 @@ -Error at file offsets 0x1ac to 0x1dc: ZLIB trailer is at offset 0x205 but 0x204 would be expected from block descriptors. +Error at file offsets 0x1ac to 0x1dc: Error reading ZLIB trailer: ZLIB trailer is at offset 0x205 but 0x204 would be expected from block descriptors. 
diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_compressed_sizes_dont_add_up.expected b/rust/pspp/src/sys/testdata/zcompressed_data_compressed_sizes_dont_add_up.expected index effe1768fb..e90459a4a9 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_compressed_sizes_dont_add_up.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_compressed_sizes_dont_add_up.expected @@ -1 +1 @@ -Error at file offsets 0x1dc to 0x1f4: ZLIB block descriptor 1 reported compressed data offset 0x12421, when 0x124f1 was expected. +Error at file offsets 0x1dc to 0x1f4: Error reading ZLIB trailer: Block descriptor 1 reported compressed data offset 0x12421, when 0x124f1 was expected. diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_compression_expands_data_too_much.expected b/rust/pspp/src/sys/testdata/zcompressed_data_compression_expands_data_too_much.expected index 8196e26aca..3e46bb92a3 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_compression_expands_data_too_much.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_compression_expands_data_too_much.expected @@ -1 +1 @@ -Error at file offsets 0x1c4 to 0x1dc: ZLIB block descriptor 0 reports compressed size 100 and uncompressed size 50. +Error at file offsets 0x1c4 to 0x1dc: Error reading ZLIB trailer: Block descriptor 0 reports compressed size 100 and uncompressed size 50. diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_invalid_ztrailer_len.expected b/rust/pspp/src/sys/testdata/zcompressed_data_invalid_ztrailer_len.expected index b70a425e02..d78157a697 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_invalid_ztrailer_len.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_invalid_ztrailer_len.expected @@ -1 +1 @@ -Error at file offsets 0x194 to 0x1a0: Invalid ZLIB trailer length 21. +Error at file offsets 0x194 to 0x1a0: Error reading ZLIB header: Invalid ZLIB trailer length 21. 
diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_uncompressed_size_block_size.expected b/rust/pspp/src/sys/testdata/zcompressed_data_uncompressed_size_block_size.expected index 00be8d4267..b5fabe9da7 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_uncompressed_size_block_size.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_uncompressed_size_block_size.expected @@ -1,4 +1,4 @@ -Warning at file offsets 0x1c4 to 0x1dc: In ZLIB trailer: ZLIB block descriptor 0 reported block size 0x400000, when at most 0x3ff000 was expected. +Warning at file offsets 0x1c4 to 0x1dc: In ZLIB trailer: Block descriptor 0 reported block size 0x400000, when at most 0x3ff000 was expected. ╭──────────────────────┬────────────────────────╮ │ Created │ 01-JAN-2011 20:53:52│ @@ -14,6 +14,7 @@ Warning at file offsets 0x1c4 to 0x1dc: In ZLIB trailer: ZLIB block descriptor 0 │Variables│ 5│ ╰─────────┴────────────────────────╯ + Variables ╭─────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├─────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_block_size.expected b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_block_size.expected index 82c0c1b127..5af8bd1e20 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_block_size.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_block_size.expected @@ -1 +1 @@ -Error at file offsets 0x1ac to 0x1c4: ZLIB trailer specifies unexpected 4096-byte block size. +Error at file offsets 0x1ac to 0x1c4: Error reading ZLIB trailer: Unexpected 1000-byte block size (expected 0x3ff000). 
diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_compressed_ofs.expected b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_compressed_ofs.expected index a7e4b4b935..272feb052a 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_compressed_ofs.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_compressed_ofs.expected @@ -1 +1 @@ -Error at file offsets 0x1c4 to 0x1dc: ZLIB block descriptor 0 reported compressed data offset 0x191, when 0x1ac was expected. +Error at file offsets 0x1c4 to 0x1dc: Error reading ZLIB trailer: Block descriptor 0 reported compressed data offset 0x191, when 0x1ac was expected. diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_n_blocks.expected b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_n_blocks.expected index 37a9a2d422..eef81e946c 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_n_blocks.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_n_blocks.expected @@ -1 +1 @@ -Error at file offsets 0x1ac to 0x1c4: Block count 2 in ZLIB trailer differs from expected block count 1 calculated from trailer length 48. +Error at file offsets 0x205 to 0x235: Error reading ZLIB trailer: Unexpected end-of-file reading ZLIB trailer diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_uncompressed_ofs.expected b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_uncompressed_ofs.expected index ef1bb440f7..d363062077 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_uncompressed_ofs.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_uncompressed_ofs.expected @@ -1 +1 @@ -Error at file offsets 0x1c4 to 0x1dc: ZLIB block descriptor 0 reported uncompressed data offset 0x177, when 0x1ac was expected. +Error at file offsets 0x1c4 to 0x1dc: Error reading ZLIB trailer: Block descriptor 0 reported uncompressed data offset 0x177, when 0x1ac was expected. 
diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_bias.expected b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_bias.expected index d084ea5256..5017da8b03 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_bias.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_bias.expected @@ -1 +1 @@ -Error at file offsets 0x1ac to 0x1c4: ZLIB trailer bias 0 is not -100 as expected from file header bias. +Error at file offsets 0x1ac to 0x1c4: Error reading ZLIB trailer: Bias 0 is not -100 as expected from file header. diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_len.expected b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_len.expected index a9d98d29b5..ba66bc99c3 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_len.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_len.expected @@ -1 +1 @@ -Error at file offsets 0x1ac to 0x1c4: Block count 1 in ZLIB trailer differs from expected block count 2 calculated from trailer length 72. +Error at file offsets 0x1ac to 0x1c4: Error reading ZLIB trailer: Block count 1 differs from expected block count 2 calculated from trailer length 72. diff --git a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_zero.expected b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_zero.expected index 73f5cfd137..578775cc31 100644 --- a/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_zero.expected +++ b/rust/pspp/src/sys/testdata/zcompressed_data_wrong_ztrailer_zero.expected @@ -1 +1 @@ -Error at file offsets 0x1ac to 0x1c4: ZLIB trailer "zero" field has nonzero value 100. +Error at file offsets 0x1ac to 0x1c4: Error reading ZLIB trailer: Expected zero field has nonzero value 100. 
diff --git a/rust/pspp/src/sys/testdata/zero_or_one_variable_in_mrset.expected b/rust/pspp/src/sys/testdata/zero_or_one_variable_in_mrset.expected index 098e5345a5..a7a05a9cce 100644 --- a/rust/pspp/src/sys/testdata/zero_or_one_variable_in_mrset.expected +++ b/rust/pspp/src/sys/testdata/zero_or_one_variable_in_mrset.expected @@ -15,6 +15,7 @@ Multiple response set $b has no variables. │Variables│1│ ╰─────────┴─╯ + Variables ╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ │ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs new file mode 100644 index 0000000000..8b12ea268e --- /dev/null +++ b/rust/pspp/src/sys/write.rs @@ -0,0 +1,1959 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . 
+ +use std::{ + borrow::Cow, + collections::HashMap, + fmt::Write as _, + fs::File, + io::{BufWriter, Cursor, Error as IoError, ErrorKind, Seek, SeekFrom, Write}, + iter::repeat_n, + path::Path, +}; + +use binrw::{BinWrite, Endian, Error as BinError}; +use chrono::{Local, NaiveDateTime}; +use either::Either; +use encoding_rs::Encoding; +use flate2::write::ZlibEncoder; +use itertools::zip_eq; +use smallvec::SmallVec; + +use crate::{ + data::{Datum, RawString}, + dictionary::{CategoryLabels, Dictionary, MultipleResponseType}, + format::{DisplayPlainF64, Format}, + identifier::Identifier, + output::spv::Zeros, + sys::{ + encoding::codepage_from_encoding, + raw::{ + records::{ + Compression, FloatInfoRecord, RawFormat, RawHeader, RawIntegerInfoRecord, + RawVariableRecord, RawZHeader, RawZTrailer, ZBlock, + }, + Magic, + }, + ProductVersion, + }, + variable::{Alignment, Attributes, Measure, ValueLabels, VarWidth}, +}; + +/// System file format version. +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)] +pub enum SystemFileVersion { + /// Obsolete version. + V2, + + /// Current version. + #[default] + V3, +} + +/// Options for writing a system file. +#[derive(Clone, Debug)] +pub struct WriteOptions { + /// How to compress (if at all) data in the system file. + pub compression: Option, + + /// System file version to write. + pub sysfile_version: SystemFileVersion, + + /// Date and time to write to the file. + pub timestamp: NaiveDateTime, + + /// Product name. + /// + /// Only the first 40 bytes are written. + pub product_name: Cow<'static, str>, + + /// Product version number. + /// + /// The default is taken from `CARGO_PKG_VERSION`. 
+ pub product_version: ProductVersion, +} + +impl Default for WriteOptions { + fn default() -> Self { + Self { + compression: Some(Compression::Simple), + sysfile_version: Default::default(), + timestamp: Local::now().naive_local(), + product_name: Cow::from(concat!("GNU PSPP (Rust) ", env!("CARGO_PKG_VERSION"))), + product_version: ProductVersion::VERSION, + } + } +} + +impl WriteOptions { + /// Constructs a new set of default options. + pub fn new() -> Self { + Self::default() + } + + /// Returns `self` with the compression format set to `compression`. + pub fn with_compression(self, compression: Option) -> Self { + Self { + compression, + ..self + } + } + + /// Returns `self` with the timestamp to be written set to `timestamp`. + pub fn with_timestamp(self, timestamp: NaiveDateTime) -> Self { + Self { timestamp, ..self } + } + + /// Returns `self` with the system file version set to `sysfile_version`. + pub fn with_sysfile_version(self, sysfile_version: SystemFileVersion) -> Self { + Self { + sysfile_version, + ..self + } + } + + /// Returns `self` with the product name set to `product_name`. + pub fn with_product_name(self, product_name: Cow<'static, str>) -> Self { + Self { + product_name, + ..self + } + } + + /// Returns `self` with the product version set to `product_version`. + pub fn with_product_version(self, product_version: ProductVersion) -> Self { + Self { + product_version, + ..self + } + } + + /// Writes `dictionary` to `path` in system file format. Returns a [Writer] + /// that can be used for writing cases to the new file. + pub fn write_file( + self, + dictionary: &Dictionary, + path: impl AsRef, + ) -> Result>, BinError> { + self.write_writer(dictionary, BufWriter::new(File::create(path)?)) + } + + /// Writes `dictionary` to `writer` in system file format. Returns a + /// [Writer] that can be used for writing cases to the new file. 
+ pub fn write_writer( + self, + dictionary: &Dictionary, + mut writer: W, + ) -> Result, BinError> + where + W: Write + Seek + 'static, + { + let mut dict_writer = DictionaryWriter::new(&self, &mut writer, dictionary); + dict_writer.write()?; + let DictionaryWriter { case_vars, .. } = dict_writer; + Writer::new(self, case_vars, writer) + } + + /// Returns a [WriteOptions] with the given `compression` and the other + /// members set to fixed values so that running at different times or with + /// different crate names or versions won't change what's written to the + /// file. + #[cfg(test)] + pub(super) fn reproducible(compression: Option) -> Self { + use chrono::{NaiveDate, NaiveTime}; + WriteOptions::new() + .with_compression(compression) + .with_timestamp(NaiveDateTime::new( + NaiveDate::from_ymd_opt(2025, 7, 30).unwrap(), + NaiveTime::from_hms_opt(15, 7, 55).unwrap(), + )) + .with_product_name(Cow::from("PSPP TEST DATA FILE")) + .with_product_version(ProductVersion(1, 2, 3)) + } +} + +struct DictionaryWriter<'a, W> { + options: &'a WriteOptions, + short_names: Vec>, + case_vars: Vec, + writer: &'a mut W, + dictionary: &'a Dictionary, +} + +fn put_attributes(attributes: &Attributes, s: &mut String) { + for (name, values) in attributes.iter(true) { + write!(s, "{name}(").unwrap(); + for value in values { + writeln!(s, "'{value}'").unwrap(); + } + write!(s, ")").unwrap() + } +} + +const BIAS: f64 = 100.0; + +fn encode_fixed_string(s: &str, encoding: &'static Encoding) -> [u8; N] { + let mut encoded = encoding.encode(s).0.into_owned(); + encoded.resize(N, b' '); + encoded.try_into().unwrap() +} + +impl<'a, W> DictionaryWriter<'a, W> +where + W: Write + Seek, +{ + pub fn new(options: &'a WriteOptions, writer: &'a mut W, dictionary: &'a Dictionary) -> Self { + Self { + options, + short_names: dictionary.short_names(), + case_vars: dictionary + .variables + .iter() + .map(|variable| CaseVar::new(variable.width)) + .collect::>(), + writer, + dictionary, + } + } + + pub 
fn write(&mut self) -> Result<(), BinError> { + self.write_header()?; + self.write_variables()?; + self.write_value_labels()?; + self.write_documents()?; + self.write_integer_record()?; + self.write_float_record()?; + self.write_variable_sets()?; + self.write_mrsets(true)?; + self.write_variable_display_parameters()?; + self.write_long_variable_names()?; + self.write_very_long_strings()?; + self.write_long_string_value_labels()?; + self.write_long_string_missing_values()?; + self.write_data_file_attributes()?; + self.write_variable_attributes()?; + self.write_mrsets(false)?; + self.write_encoding()?; + (999u32, 0u32).write_le(self.writer) + } + + fn write_header(&mut self) -> Result<(), BinError> { + fn as_byte_array(s: String) -> [u8; N] { + let mut bytes = s.into_bytes(); + bytes.resize(N, b' '); + bytes.try_into().unwrap() + } + + fn count_variable_positions(case_vars: &[CaseVar]) -> u32 { + case_vars + .iter() + .map(CaseVar::n_variable_positions) + .sum::() as u32 + } + + let header = RawHeader { + magic: if self.options.compression == Some(Compression::ZLib) { + Magic::Zsav + } else { + Magic::Sav + } + .into(), + eye_catcher: encode_fixed_string( + &format!("@(#) SPSS DATA FILE {}", &self.options.product_name), + self.dictionary.encoding(), + ), + layout_code: 2, + nominal_case_size: count_variable_positions(&self.case_vars), + compression_code: match self.options.compression { + Some(Compression::Simple) => 1, + Some(Compression::ZLib) => 2, + None => 0, + }, + weight_index: if let Some(weight_index) = self.dictionary.weight_index() { + count_variable_positions(&self.case_vars[..weight_index]) + 1 + } else { + 0 + }, + n_cases: u32::MAX, + bias: BIAS, + creation_date: as_byte_array(self.options.timestamp.format("%d %b %y").to_string()), + creation_time: as_byte_array(self.options.timestamp.format("%H:%M:%S").to_string()), + file_label: as_byte_array(self.dictionary.file_label.clone().unwrap_or_default()), + }; + header.write_le(self.writer) + } + + fn 
write_variables(&mut self) -> Result<(), BinError> { + for (variable, short_names) in self + .dictionary + .variables + .iter() + .zip(self.short_names.iter()) + { + let mut segments = variable.width.segments(); + let mut short_names = short_names.iter(); + let seg0_width = segments.next().unwrap(); + let name0 = short_names.next().unwrap(); + let record = RawVariableRecord { + width: seg0_width.as_string_width().unwrap_or(0) as i32, + has_variable_label: variable.label.is_some() as u32, + missing_value_code: if !variable.width.is_long_string() { + let n = variable.missing_values().values().len() as i32; + match variable.missing_values().range() { + Some(_) => -(n + 2), + None => n, + } + } else { + 0 + }, + print_format: to_raw_format(variable.print_format, seg0_width), + write_format: to_raw_format(variable.write_format, seg0_width), + name: encode_fixed_string(name0, variable.encoding()), + }; + (2u32, record).write_le(self.writer)?; + + // Variable label. + if let Some(label) = variable.label() { + let label = variable.encoding().encode(&label).0; + let len = label.len().min(255) as u32; + let padded_len = len.next_multiple_of(4); + (len, &*label, Zeros((padded_len - len) as usize)).write_le(self.writer)?; + } + + // Missing values. + if !variable.width.is_long_string() { + if let Some(range) = variable.missing_values().range() { + ( + range.low().unwrap_or(f64::MIN), + range.high().unwrap_or(f64::MAX), + ) + .write_le(self.writer)?; + } + let pad = variable + .width + .as_string_width() + .map_or(0, |width| 8 - width); + for value in variable.missing_values().values() { + (value, Zeros(pad)).write_le(self.writer)?; + } + } + write_variable_continuation_records(&mut self.writer, seg0_width)?; + + // Write additional segments for very long string variables. 
+ for (width, name) in segments.zip(short_names) { + let format: RawFormat = Format::default_for_width(width).try_into().unwrap(); + ( + 2u32, + RawVariableRecord { + width: width.as_string_width().unwrap() as i32, + has_variable_label: 0, + missing_value_code: 0, + print_format: format, + write_format: format, + name: encode_fixed_string(name, variable.encoding()), + }, + ) + .write_le(self.writer)?; + write_variable_continuation_records(&mut self.writer, width)?; + } + } + + fn write_variable_continuation_records( + mut writer: W, + width: VarWidth, + ) -> Result<(), BinError> + where + W: Write + Seek, + { + let continuation = ( + 2u32, + RawVariableRecord { + width: -1, + has_variable_label: 0, + missing_value_code: 0, + print_format: RawFormat(0), + write_format: RawFormat(0), + name: [0; 8], + }, + ); + for _ in 1..width.n_chunks().unwrap() { + continuation.write_le(&mut writer)?; + } + Ok(()) + } + + fn to_raw_format(mut format: Format, width: VarWidth) -> RawFormat { + format.resize(width); + RawFormat::try_from(format).unwrap() + } + + Ok(()) + } + + /// Writes value label records, except for long string variables. + fn write_value_labels(&mut self) -> Result<(), BinError> { + // Collect identical sets of value labels. + let mut sets = HashMap::<&ValueLabels, Vec<_>>::new(); + let mut index = 1usize; + for variable in &self.dictionary.variables { + if !variable.width.is_long_string() && !variable.value_labels.is_empty() { + sets.entry(&variable.value_labels) + .or_default() + .push(index as u32); + } + index += variable + .width + .segments() + .map(|w| w.n_chunks().unwrap()) + .sum::(); + } + + for (value_labels, variables) in sets { + // Label record. 
+ (3u32, value_labels.0.len() as u32).write_le(self.writer)?; + for (datum, label) in &value_labels.0 { + let datum_padding = datum.width().as_string_width().map_or(0, |width| 8 - width); + let label = &*self.dictionary.encoding().encode(&label).0; + let label = if label.len() > 255 { + &label[..255] + } else { + label + }; + let label_padding = (1 + label.len()).next_multiple_of(8) - (1 + label.len()); + ( + datum, + Zeros(datum_padding), + label.len() as u8, + label, + Zeros(label_padding), + ) + .write_le(self.writer)?; + } + + // Variable record. + (4u32, variables.len() as u32, variables).write_le(self.writer)?; + } + Ok(()) + } + + fn write_documents(&mut self) -> Result<(), BinError> { + if !self.dictionary.documents.is_empty() { + (6u32, self.dictionary.documents.len() as u32).write_le(self.writer)?; + for line in &self.dictionary.documents { + Padded::exact(&*self.dictionary.encoding().encode(&line).0, 80, b' ') + .write_le(self.writer)?; + } + } + Ok(()) + } + + fn write_integer_record(&mut self) -> Result<(), BinError> { + ( + 7u32, + 3u32, + 4u32, + 8u32, + RawIntegerInfoRecord { + version: self.options.product_version, + machine_code: -1, + floating_point_rep: 1, + compression_code: 1, + endianness: { + // We always write files in little-endian. 
+ 2 + }, + character_code: codepage_from_encoding(self.dictionary.encoding()) as i32, + }, + ) + .write_le(self.writer) + } + + fn write_float_record(&mut self) -> Result<(), BinError> { + ( + 7u32, + 4u32, + 8u32, + 3u32, + FloatInfoRecord { + sysmis: f64::MIN, + highest: f64::MAX, + lowest: f64::MIN.next_up(), + }, + ) + .write_le(self.writer) + } + + fn write_variable_sets(&mut self) -> Result<(), BinError> { + let mut s = String::new(); + for set in &self.dictionary.variable_sets() { + write!(&mut s, "{}= ", set.name()).unwrap(); + for (index, variable) in set.variables().iter().enumerate() { + let prefix = if index > 0 { " " } else { "" }; + write!(&mut s, "{prefix}{}", &variable.name).unwrap(); + } + writeln!(&mut s).unwrap(); + } + self.write_string_record(5, &s) + } + + /// If `pre_v14` is true, writes only sets supported by SPSS before release + /// 14, otherwise writes sets supported only by later versions. + fn write_mrsets(&mut self, pre_v14: bool) -> Result<(), BinError> { + let mut output = Vec::new(); + for set in self + .dictionary + .mrsets() + .iter() + .filter(|set| set.mr_type().supported_before_v14() == pre_v14) + { + output.extend_from_slice(&self.dictionary.encoding().encode(set.name()).0[..]); + output.push(b'='); + match set.mr_type() { + MultipleResponseType::MultipleDichotomy { datum, labels } => { + let leader = match labels { + CategoryLabels::VarLabels => b"D".as_slice(), + CategoryLabels::CountedValues { + use_var_label_as_mrset_label: true, + } => b"E 11 ".as_slice(), + CategoryLabels::CountedValues { + use_var_label_as_mrset_label: false, + } => b"E 1 ".as_slice(), + }; + output.extend_from_slice(leader); + + let mut value = match datum { + Datum::Number(Some(number)) => { + DisplayPlainF64(*number).to_string().into_bytes() + } + Datum::Number(None) => vec![b'.'], + Datum::String(raw_string) => raw_string.0.clone(), + }; + write!(&mut output, "{} ", value.len()).unwrap(); + output.append(&mut value); + output.push(b' '); + } + 
MultipleResponseType::MultipleCategory => write!(&mut output, "C ").unwrap(), + } + + let label = if set.mr_type().label_from_var_label() { + Cow::from(&[]) + } else { + self.dictionary.encoding().encode(set.label()).0 + }; + write!(&mut output, "{} ", label.len()).unwrap(); + output.extend_from_slice(&label[..]); + + for variable in set.variables().dict_indexes().iter().copied() { + // Only lowercase ASCII characters because other characters + // might expand upon lowercasing. + let short_name = self.short_names[variable][0].as_str().to_ascii_lowercase(); + output.push(b' '); + output.extend_from_slice(&self.dictionary.encoding().encode(&short_name).0); + } + output.push(b'\n'); + } + self.write_bytes_record(if pre_v14 { 7 } else { 19 }, &output) + } + + fn write_variable_display_parameters(&mut self) -> Result<(), BinError> { + ( + 7u32, + 11u32, + 4u32, + self.case_vars + .iter() + .map(CaseVar::n_segments) + .sum::() as u32 + * 3, + ) + .write_le(self.writer)?; + for variable in &self.dictionary.variables { + let measure = match variable.measure { + None => 0, + Some(Measure::Nominal) => 1, + Some(Measure::Ordinal) => 2, + Some(Measure::Scale) => 3, + }; + let alignment = match variable.alignment { + Alignment::Left => 0, + Alignment::Right => 1, + Alignment::Center => 2, + }; + for (index, segment) in variable.width.segments().enumerate() { + let display_width = match index { + 0 => variable.display_width, + _ => segment.default_display_width(), + }; + (measure, display_width, alignment).write_le(self.writer)?; + } + } + Ok(()) + } + + fn write_long_variable_names(&mut self) -> Result<(), BinError> { + if self.options.sysfile_version == SystemFileVersion::V2 { + return Ok(()); + } + + let mut s = String::new(); + for (index, variable) in self.dictionary.variables.iter().enumerate() { + if index > 0 { + s.push('\t'); + } + write!(&mut s, "{}={}", &self.short_names[index][0], variable.name).unwrap(); + } + self.write_string_record(13, &s) + } + + fn 
write_very_long_strings(&mut self) -> Result<(), BinError> { + let mut s = String::new(); + for (index, variable) in self.dictionary.variables.iter().enumerate() { + if variable.width.is_very_long_string() { + let width = variable.width.as_string_width().unwrap(); + write!(&mut s, "{}={width:05}\0\t", &self.short_names[index][0],).unwrap(); + } + } + self.write_string_record(14, &s) + } + + fn write_long_string_value_labels(&mut self) -> Result<(), BinError> { + let mut body = Vec::new(); + let mut cursor = Cursor::new(&mut body); + for variable in &self.dictionary.variables { + if variable.value_labels.is_empty() || !variable.width.is_long_string() { + continue; + } + let name = self.dictionary.encoding().encode(&variable.name).0; + ( + name.len() as u32, + &name[..], + variable.width.as_string_width().unwrap() as u32, + variable.value_labels.0.len() as u32, + ) + .write_le(&mut cursor)?; + + for (value, label) in &variable.value_labels.0 { + let value = value.as_string().unwrap(); + let label = self.dictionary.encoding().encode(&label).0; + ( + value.len() as u32, + value.raw_string_bytes(), + label.len() as u32, + &label[..], + ) + .write_le(&mut cursor)?; + } + } + self.write_bytes_record(21, &body) + } + + fn write_long_string_missing_values(&mut self) -> Result<(), BinError> { + let mut body = Vec::new(); + let mut cursor = Cursor::new(&mut body); + for variable in &self.dictionary.variables { + if variable.missing_values().is_empty() || !variable.width.is_long_string() { + break; + } + let name = self.dictionary.encoding().encode(&variable.name).0; + ( + name.len() as u32, + &name[..], + variable.missing_values().values().len() as u8, + 8u32, + ) + .write_le(&mut cursor)?; + + for value in variable.missing_values().values() { + let value = value.as_string().unwrap().raw_string_bytes(); + let bytes = value.get(..8).unwrap_or(value); + Padded::exact(bytes, 8, b' ').write_le(&mut cursor).unwrap(); + } + } + self.write_bytes_record(22, &body) + } + + fn 
write_data_file_attributes(&mut self) -> Result<(), BinError> { + if self.options.sysfile_version != SystemFileVersion::V3 { + return Ok(()); + } + let mut s = String::new(); + put_attributes(&self.dictionary.attributes, &mut s); + self.write_string_record(17, &s) + } + + fn write_variable_attributes(&mut self) -> Result<(), BinError> { + if self.options.sysfile_version != SystemFileVersion::V3 { + return Ok(()); + } + let mut s = String::new(); + for (index, variable) in self.dictionary.variables.iter().enumerate() { + let mut attributes = variable.attributes.clone(); + attributes.0.insert( + Identifier::new("$@Role").unwrap(), + vec![i32::from(variable.role).to_string()], + ); + + if index > 0 { + s.push('/'); + } + write!(&mut s, "{}:", &variable.name).unwrap(); + put_attributes(&attributes, &mut s); + } + self.write_string_record(18, &s) + } + + fn write_encoding(&mut self) -> Result<(), BinError> { + self.write_string_record(20, self.dictionary.encoding().name()) + } + + fn write_bytes_record(&mut self, subtype: u32, bytes: &[u8]) -> Result<(), BinError> { + if !bytes.is_empty() { + (7u32, subtype, 1u32, bytes.len() as u32, bytes).write_le(self.writer) + } else { + Ok(()) + } + } + + fn write_string_record(&mut self, subtype: u32, s: &str) -> Result<(), BinError> { + self.write_bytes_record(subtype, &self.dictionary.encoding().encode(&s).0) + } +} + +#[derive(BinWrite)] +struct Padded<'a> { + bytes: &'a [u8], + padding: Pad, +} + +impl<'a> Padded<'a> { + pub fn exact(bytes: &'a [u8], length: usize, pad: u8) -> Self { + let min = bytes.len().min(length); + Self { + bytes: &bytes[..min], + padding: Pad::new(length - min, pad), + } + } +} + +pub struct Pad { + n: usize, + pad: u8, +} + +impl Pad { + pub fn new(n: usize, pad: u8) -> Self { + Self { n, pad } + } +} + +impl BinWrite for Pad { + type Args<'a> = (); + + fn write_options( + &self, + writer: &mut W, + _endian: Endian, + _args: Self::Args<'_>, + ) -> binrw::BinResult<()> { + for _ in 0..self.n { + 
writer.write_all(&[self.pad])?; + } + Ok(()) + } +} + +impl BinWrite for Datum +where + B: RawString, +{ + type Args<'a> = (); + + fn write_options( + &self, + writer: &mut W, + endian: binrw::Endian, + _: (), + ) -> binrw::BinResult<()> { + match self { + Datum::Number(number) => number.unwrap_or(f64::MIN).write_options(writer, endian, ()), + Datum::String(raw_string) => { + raw_string + .raw_string_bytes() + .write_options(writer, endian, ()) + } + } + } +} + +#[derive(Debug)] +struct StringSegment { + data_bytes: usize, + padding_bytes: usize, +} + +enum CaseVar { + Numeric, + String(SmallVec<[StringSegment; 1]>), +} + +impl CaseVar { + fn new(width: VarWidth) -> Self { + match width { + VarWidth::Numeric => Self::Numeric, + VarWidth::String(w) => { + let mut encoding = SmallVec::<[StringSegment; 1]>::new(); + let mut remaining = w as usize; + for segment in width.segments() { + let segment = segment.as_string_width().unwrap().next_multiple_of(8); + let data_bytes = remaining.min(segment).min(255); + let padding_bytes = segment - data_bytes; + if data_bytes > 0 { + encoding.push(StringSegment { + data_bytes, + padding_bytes, + }); + remaining -= data_bytes; + } else { + encoding.last_mut().unwrap().padding_bytes += padding_bytes; + } + } + CaseVar::String(encoding) + } + } + } + fn n_segments(&self) -> usize { + match self { + CaseVar::Numeric => 1, + CaseVar::String(encoding) => encoding.len(), + } + } + fn n_variable_positions(&self) -> usize { + match self { + CaseVar::Numeric => 1, + CaseVar::String(encoding) => encoding + .iter() + .map(|segment| (segment.data_bytes + segment.padding_bytes) / 8) + .sum(), + } + } +} + +/// System file writer. 
+pub struct Writer +where + W: Write + Seek, +{ + compression: Option, + case_vars: Vec, + opcodes: Vec, + data: Vec, + inner: Option>>, + n_cases: u64, +} + +pub struct WriterInner<'a, W: Write> { + case_vars: &'a [CaseVar], + opcodes: &'a mut Vec, + data: &'a mut Vec, + inner: &'a mut W, +} + +impl<'a, W> WriterInner<'a, W> +where + W: Write + Seek, +{ + fn new( + case_vars: &'a [CaseVar], + opcodes: &'a mut Vec, + data: &'a mut Vec, + inner: &'a mut W, + ) -> Self { + Self { + case_vars, + opcodes, + data, + inner, + } + } + fn flush_compressed(&mut self) -> Result<(), BinError> { + if !self.opcodes.is_empty() { + self.opcodes.resize(8, 0); + self.inner.write_all(&mut self.opcodes)?; + self.inner.write(&mut self.data)?; + self.opcodes.clear(); + self.data.clear(); + } + Ok(()) + } + fn put_opcode(&mut self, opcode: u8) -> Result<(), BinError> { + if self.opcodes.len() >= 8 { + self.flush_compressed()?; + } + self.opcodes.push(opcode); + Ok(()) + } + + fn write_case_uncompressed<'c, B>( + &mut self, + case: impl Iterator>, + ) -> Result<(), BinError> + where + B: RawString, + { + for (var, datum) in zip_eq(self.case_vars, case) { + match var { + CaseVar::Numeric => datum + .as_number() + .unwrap() + .unwrap_or(f64::MIN) + .write_le(&mut self.inner)?, + CaseVar::String(encoding) => { + let mut s = datum.as_string().unwrap().raw_string_bytes(); + for segment in encoding { + let spaces = segment.data_bytes.saturating_sub(s.len()); + let data_bytes = segment.data_bytes - spaces; + + let data; + (data, s) = s.split_at(data_bytes); + ( + data, + Pad::new(spaces, b' '), + Pad::new(segment.padding_bytes, 0), + ) + .write_le(&mut self.inner)?; + } + } + } + } + Ok(()) + } + fn write_case_compressed<'c, B>( + &mut self, + case: impl Iterator>, + ) -> Result<(), BinError> + where + B: RawString, + { + for (var, datum) in zip_eq(self.case_vars, case) { + match var { + CaseVar::Numeric => match datum.as_number().unwrap() { + None => self.put_opcode(255)?, + Some(number) => { 
+ if number >= 1.0 - BIAS + && number <= 251.0 - BIAS + && number == number.trunc() + { + self.put_opcode((number + BIAS) as u8)? + } else { + self.put_opcode(253)?; + self.data.extend_from_slice(&number.to_le_bytes()); + } + } + }, + + CaseVar::String(encoding) => { + let mut s = datum.as_string().unwrap().raw_string_bytes(); + for segment in encoding { + let excess = segment.data_bytes.saturating_sub(s.len()); + let data_bytes = segment.data_bytes - excess; + let padding_bytes = segment.padding_bytes + excess; + + let data; + (data, s) = s.split_at(data_bytes); + + let (chunks, remainder) = data.as_chunks::<8>(); + for chunk in chunks { + if chunk == b" " { + self.put_opcode(254)?; + } else { + self.put_opcode(253)?; + self.data.extend_from_slice(chunk); + } + } + if !remainder.is_empty() { + if remainder.iter().all(|c| *c == b' ') { + self.put_opcode(254)?; + } else { + self.put_opcode(253)?; + self.data.extend_from_slice(remainder); + self.data.extend(repeat_n(b' ', 8 - remainder.len())); + } + } + for _ in 0..padding_bytes / 8 { + self.put_opcode(254)?; + } + } + } + } + } + Ok(()) + } +} + +impl Writer +where + W: Write + Seek, +{ + fn new(options: WriteOptions, case_vars: Vec, inner: W) -> Result { + Ok(Self { + compression: options.compression, + case_vars, + opcodes: Vec::with_capacity(8), + data: Vec::with_capacity(64), + n_cases: 0, + inner: match options.compression { + Some(Compression::ZLib) => Some(Either::Right(ZlibWriter::new(inner)?)), + _ => Some(Either::Left(inner)), + }, + }) + } + + /// Finishes writing the file, flushing buffers and updating headers to + /// match the final case counts. + pub fn finish(mut self) -> Result, BinError> { + self.try_finish() + } + + /// Tries to finish writing the file, flushing buffers and updating headers + /// to match the final case counts. + /// + /// # Panic + /// + /// Attempts to write more cases after calling this function may will panic. 
+ pub fn try_finish(&mut self) -> Result, BinError> { + let Some(inner) = self.inner.take() else { + return Ok(None); + }; + + let mut inner = match inner { + Either::Left(mut inner) => { + WriterInner::new( + &self.case_vars, + &mut self.opcodes, + &mut self.data, + &mut inner, + ) + .flush_compressed()?; + inner + } + Either::Right(mut zlib_writer) => { + WriterInner::new( + &self.case_vars, + &mut self.opcodes, + &mut self.data, + &mut zlib_writer, + ) + .flush_compressed()?; + zlib_writer.finish()? + } + }; + if let Ok(n_cases) = u32::try_from(self.n_cases) { + if inner.seek(SeekFrom::Start(80)).is_ok() { + let _ = inner.write_all(&n_cases.to_le_bytes()); + } + } + Ok(Some(inner)) + } + + /// Writes `case` to the system file. + /// + /// # Panic + /// + /// Panics if [try_finish](Self::try_finish) has been called. + pub fn write_case<'a, B>( + &mut self, + case: impl IntoIterator>, + ) -> Result<(), BinError> + where + B: RawString, + { + match self.inner.as_mut().unwrap() { + Either::Left(inner) => { + let mut inner = + WriterInner::new(&self.case_vars, &mut self.opcodes, &mut self.data, inner); + match self.compression { + Some(_) => inner.write_case_compressed(case.into_iter())?, + None => inner.write_case_uncompressed(case.into_iter())?, + } + } + Either::Right(inner) => { + WriterInner::new(&self.case_vars, &mut self.opcodes, &mut self.data, inner) + .write_case_compressed(case.into_iter())? 
+ } + } + self.n_cases += 1; + Ok(()) + } +} + +impl Drop for Writer +where + W: Write + Seek, +{ + fn drop(&mut self) { + let _ = self.try_finish(); + } +} + +struct ZlibWriter +where + W: Write + Seek, +{ + header: RawZHeader, + trailer: RawZTrailer, + encoder: ZlibEncoder>, + inner: W, +} + +impl ZlibWriter +where + W: Write + Seek, +{ + fn new(mut inner: W) -> Result { + let header = RawZHeader { + zheader_offset: inner.stream_position()?, + ztrailer_offset: 0, + ztrailer_len: 0, + }; + header.write_le(&mut inner)?; + Ok(Self { + header, + trailer: RawZTrailer { + int_bias: -BIAS as i64, + zero: 0, + block_size: ZBLOCK_SIZE as u32, + blocks: Vec::new(), + }, + encoder: ZlibEncoder::new(Vec::new(), flate2::Compression::new(1)), + inner, + }) + } + + fn flush_block(&mut self) -> std::io::Result<()> { + let total_in = self.encoder.total_in(); + if total_in > 0 { + let buf = self.encoder.reset(Vec::new())?; + let total_out = buf.len(); + self.inner.write_all(&buf)?; + self.encoder.reset(buf).unwrap(); + + self.trailer.blocks.push(ZBlock { + uncompressed_size: total_in as u32, + compressed_size: total_out as u32, + uncompressed_ofs: match self.trailer.blocks.last() { + Some(prev) => prev.uncompressed_ofs + prev.uncompressed_size as u64, + None => self.header.zheader_offset, + }, + compressed_ofs: match self.trailer.blocks.last() { + Some(prev) => prev.compressed_ofs + prev.compressed_size as u64, + None => self.header.zheader_offset + 24, + }, + }); + } + Ok(()) + } + + fn finish(mut self) -> Result { + self.flush_block()?; + let ztrailer_offset = self.inner.stream_position()?; + self.trailer.write_le(&mut self.inner)?; + let header = RawZHeader { + zheader_offset: self.header.zheader_offset, + ztrailer_offset, + ztrailer_len: self.trailer.len() as u64, + }; + self.inner.seek(SeekFrom::Start(header.zheader_offset))?; + header.write_le(&mut self.inner)?; + Ok(self.inner) + } +} + +const ZBLOCK_SIZE: u64 = 0x3ff000; + +impl Write for ZlibWriter +where + W: Write + 
Seek, +{ + fn write(&mut self, mut buf: &[u8]) -> Result { + let n = buf.len(); + while buf.len() > 0 { + if self.encoder.total_in() >= ZBLOCK_SIZE { + self.flush_block()?; + } + + let chunk = buf + .len() + .min((ZBLOCK_SIZE - self.encoder.total_in()) as usize); + self.encoder.write_all(&buf[..chunk])?; + buf = &buf[chunk..]; + } + Ok(n) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +impl Seek for ZlibWriter +where + W: Write + Seek, +{ + fn seek(&mut self, _pos: std::io::SeekFrom) -> Result { + Err(IoError::from(ErrorKind::NotSeekable)) + } +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use binrw::{BinRead, Endian}; + use encoding_rs::UTF_8; + use itertools::Itertools; + use unicase::UniCase; + + use crate::{ + data::{ByteString, Datum, RawString}, + dictionary::{ + CategoryLabels, DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, + MultipleResponseType, + }, + identifier::Identifier, + sys::{ + raw::{ + records::{ + DocumentRecord, Extension, RawHeader, RawVariableRecord, VariableRecord, + }, + Decoder, VarTypes, + }, + write::DictionaryWriter, + ReadOptions, WriteOptions, + }, + variable::{Alignment, Attributes, Measure, MissingValueRange, VarWidth, Variable}, + }; + + /// Checks that the header record has the right nominal case size and weight + /// index, even with long and very long string variables. 
+ #[test] + fn header() { + for variables in [ + (VarWidth::Numeric, 1), + (VarWidth::String(1), 1), + (VarWidth::String(8), 1), + (VarWidth::String(15), 2), + (VarWidth::String(255), 32), + (VarWidth::String(256), 33), + (VarWidth::String(20000), 79 * 32 + 12), + ] + .iter() + .copied() + .combinations_with_replacement(4) + { + let mut dictionary = Dictionary::new(UTF_8); + let mut expected_case_size = 0; + let mut weight_indexes = vec![(None, 0)]; + for (index, (width, n_chunks)) in variables.into_iter().enumerate() { + let index = dictionary + .add_var(Variable::new( + Identifier::new(format!("v{index}")).unwrap(), + width, + UTF_8, + )) + .unwrap(); + if width.is_numeric() { + weight_indexes.push((Some(index), expected_case_size + 1)); + } + expected_case_size += n_chunks; + } + for (weight_index, expected_weight_index) in weight_indexes { + dictionary.set_weight(weight_index).unwrap(); + + let mut raw = Vec::new(); + DictionaryWriter::new( + &WriteOptions::reproducible(None), + &mut Cursor::new(&mut raw), + &dictionary, + ) + .write_header() + .unwrap(); + let header = RawHeader::read_le(&mut Cursor::new(&raw)).unwrap(); + assert_eq!(header.weight_index, expected_weight_index as u32); + assert_eq!(header.nominal_case_size, expected_case_size as u32); + } + } + } + + /// Checks that variable records are followed by the right number of + /// continuation records, and that very long string variables have the right + /// number of segment variables. 
+ #[test] + fn variables_widths() { + let variables = [ + (VarWidth::Numeric, vec![0]), + (VarWidth::String(1), vec![1]), + (VarWidth::String(8), vec![8]), + (VarWidth::String(15), vec![15, -1]), + ( + VarWidth::String(255), + std::iter::once(255) + .chain(std::iter::repeat_n(-1, 31)) + .collect(), + ), + ( + VarWidth::String(256), + std::iter::once(255) + .chain(std::iter::repeat_n(-1, 31)) + .chain(std::iter::once(4)) + .collect(), + ), + ( + VarWidth::String(20000), + std::iter::once(255) + .chain(std::iter::repeat_n(-1, 31)) + .cycle() + .take(32 * 79) + .chain(std::iter::once(92)) + .chain(std::iter::repeat_n(-1, 11)) + .collect(), + ), + ]; + for variables in variables.iter().combinations_with_replacement(3) { + let mut dictionary = Dictionary::new(UTF_8); + for (index, (width, _)) in variables.iter().enumerate() { + dictionary + .add_var(Variable::new( + Identifier::new(format!("v{index}")).unwrap(), + *width, + UTF_8, + )) + .unwrap(); + } + + let widths = variables + .into_iter() + .map(|(_, w)| w.iter()) + .flatten() + .copied(); + + let mut raw = Vec::new(); + DictionaryWriter::new( + &WriteOptions::reproducible(None), + &mut Cursor::new(&mut raw), + &dictionary, + ) + .write_variables() + .unwrap(); + + let mut cursor = Cursor::new(&raw); + let mut records = Vec::new(); + while cursor.position() < raw.len() as u64 { + assert_eq!(u32::read_le(&mut cursor).unwrap(), 2); + records.push(RawVariableRecord::read_le(&mut cursor).unwrap()); + } + for (record, expected_width) in records.iter().zip_eq(widths.into_iter()) { + assert_eq!(record.width, expected_width); + } + } + } + + /// Checks that missing values are written correctly. 
+ #[test] + fn variables_missing_values() { + let test_cases = [ + (VarWidth::Numeric, vec![Datum::Number(Some(1.0))], None), + ( + VarWidth::Numeric, + vec![Datum::Number(Some(1.0)), Datum::Number(Some(2.0))], + None, + ), + ( + VarWidth::Numeric, + vec![ + Datum::Number(Some(1.0)), + Datum::Number(Some(2.0)), + Datum::Number(Some(3.0)), + ], + None, + ), + ( + VarWidth::Numeric, + vec![], + Some(MissingValueRange::In { + low: 10.0, + high: 20.0, + }), + ), + ( + VarWidth::Numeric, + vec![], + Some(MissingValueRange::From { low: 100.0 }), + ), + ( + VarWidth::Numeric, + vec![], + Some(MissingValueRange::To { high: 200.0 }), + ), + ( + VarWidth::Numeric, + vec![Datum::Number(Some(1.0))], + Some(MissingValueRange::In { + low: 10.0, + high: 20.0, + }), + ), + ( + VarWidth::Numeric, + vec![Datum::Number(Some(1.0))], + Some(MissingValueRange::From { low: 100.0 }), + ), + ( + VarWidth::Numeric, + vec![Datum::Number(Some(1.0))], + Some(MissingValueRange::To { high: 200.0 }), + ), + ( + VarWidth::String(5), + vec![Datum::String(ByteString::from("abcde"))], + None, + ), + ( + VarWidth::String(5), + vec![ + Datum::String(ByteString::from("abcde")), + Datum::String(ByteString::from("qwioe")), + ], + None, + ), + ( + VarWidth::String(5), + vec![ + Datum::String(ByteString::from("abcde")), + Datum::String(ByteString::from("qwioe")), + Datum::String(ByteString::from("jksld")), + ], + None, + ), + ( + VarWidth::String(9), + vec![ + Datum::String(ByteString::from("abcdeasd")), + Datum::String(ByteString::from("qwioejdf")), + Datum::String(ByteString::from("jksldiwe")), + ], + None, + ), + ( + VarWidth::String(10), + vec![ + Datum::String(ByteString::from("abcdeasd")), + Datum::String(ByteString::from("qwioejdf")), + ], + None, + ), + ( + VarWidth::String(11), + vec![Datum::String(ByteString::from("abcdeasd"))], + None, + ), + ]; + + for (width, values, range) in test_cases { + let mut dictionary = Dictionary::new(UTF_8); + let mut variable = 
Variable::new(Identifier::new("var").unwrap(), width, UTF_8); + variable + .missing_values_mut() + .add_values(values.iter().map(|value| value.as_encoded(UTF_8).cloned())) + .unwrap(); + if let Some(range) = &range { + variable + .missing_values_mut() + .add_range(range.clone()) + .unwrap(); + } + dictionary.add_var(variable).unwrap(); + + // Write and check variable records. + let mut raw_variables = Vec::new(); + DictionaryWriter::new( + &WriteOptions::reproducible(None), + &mut Cursor::new(&mut raw_variables), + &dictionary, + ) + .write_variables() + .unwrap(); + + let mut cursor = Cursor::new(&raw_variables[4..]); + let record = + VariableRecord::read(&mut cursor, Endian::Little, &mut |_| panic!()).unwrap(); + if !width.is_long_string() { + assert_eq!(&record.missing_values.values, &values); + } else { + assert_eq!(&record.missing_values.values, &vec![]); + } + assert_eq!(&record.missing_values.range, &range); + + // Write and check long string missing value record. + let mut raw_long_missing = Vec::new(); + DictionaryWriter::new( + &WriteOptions::reproducible(None), + &mut Cursor::new(&mut raw_long_missing), + &dictionary, + ) + .write_long_string_missing_values() + .unwrap(); + + if width.is_long_string() { + let mut cursor = Cursor::new(&raw_long_missing[4..]); + let record = Extension::read( + &mut cursor, + Endian::Little, + &VarTypes::new(), + &mut |_| panic!(), + ) + .unwrap() + .unwrap() + .as_long_string_missing_values() + .unwrap() + .clone() + .decode(&mut Decoder::new(UTF_8, |_| panic!())); + + assert_eq!(record.values.len(), 1); + assert_eq!(&record.values[0].var_name.0, &UniCase::new("var")); + let actual = record.values[0] + .missing_values + .iter() + .map(|v| v.raw_string_bytes()); + let expected = values + .iter() + .map(|v| v.as_string().unwrap().raw_string_bytes()); + for (actual, expected) in actual.zip_eq(expected) { + assert_eq!(actual, expected); + } + } else { + assert_eq!(raw_long_missing.len(), 0); + } + } + } + + /// Checks that 
value labels are written correctly. + #[test] + fn variables_value_labels() { + let variables = [ + (VarWidth::Numeric, vec![(Datum::Number(Some(1.0)), "One")]), + ( + VarWidth::Numeric, + vec![ + (Datum::Number(Some(1.0)), "One"), + (Datum::Number(Some(2.0)), "Two"), + ], + ), + ( + VarWidth::Numeric, + vec![ + (Datum::Number(Some(1.0)), "One"), + (Datum::Number(Some(2.0)), "Two"), + (Datum::Number(Some(3.0)), "Three"), + ], + ), + ( + VarWidth::String(4), + vec![(Datum::String(ByteString::from("abcd")), "One")], + ), + ( + VarWidth::String(8), + vec![( + Datum::String(ByteString::from("abcdefgh")), + "Longer value label", + )], + ), + ( + VarWidth::String(9), + vec![( + Datum::String(ByteString::from("abcdefghi")), + "value label for 9-byte value", + )], + ), + ( + VarWidth::String(300), + vec![( + Datum::String(ByteString::from(vec![b'x'; 300])), + "value label for 300-byte value", + )], + ), + ]; + + for test_case in variables.iter().combinations_with_replacement(3) { + let mut dictionary = Dictionary::new(UTF_8); + for (index, (width, value_labels)) in test_case.iter().enumerate() { + let mut variable = Variable::new( + Identifier::new(format!("var{index}")).unwrap(), + *width, + UTF_8, + ); + for (value, label) in value_labels { + assert_eq!(variable.value_labels.insert(value.clone(), *label), None); + } + dictionary.add_var(variable).unwrap(); + } + dbg!(&dictionary); + + let raw = WriteOptions::new() + .write_writer(&dictionary, Cursor::new(Vec::new())) + .unwrap() + .finish() + .unwrap() + .unwrap() + .into_inner(); + let dictionary2 = ReadOptions::new(|_| panic!()) + .open_reader(Cursor::new(raw)) + .unwrap() + .dictionary; + + for (expected, actual) in dictionary + .variables + .iter() + .zip_eq(dictionary2.variables.iter()) + { + assert_eq!(&expected.value_labels, &actual.value_labels); + } + } + } + + #[test] + fn documents() { + let expected = vec![String::from("Line one"), String::from("Line two")]; + let mut dictionary = Dictionary::new(UTF_8); + 
dictionary.documents = expected.clone(); + + let mut raw = Vec::new(); + DictionaryWriter::new( + &WriteOptions::reproducible(None), + &mut Cursor::new(&mut raw), + &dictionary, + ) + .write_documents() + .unwrap(); + + let actual = DocumentRecord::read(&mut Cursor::new(&raw[4..]), Endian::Little) + .unwrap() + .decode(&mut Decoder::new(UTF_8, |_| panic!())) + .lines + .into_iter() + .map(|mut s| { + s.truncate(s.trim_end().len()); + s + }) + .collect::>(); + assert_eq!(&actual, &expected); + } + + #[test] + fn variable_sets() { + let mut expected = Dictionary::new(UTF_8); + for index in 0..10 { + expected + .add_var(Variable::new( + Identifier::new(format!("var{index}")).unwrap(), + VarWidth::Numeric, + UTF_8, + )) + .unwrap(); + } + + for (index, variables) in [vec![0], vec![1, 2], vec![3, 4, 5], vec![6, 7, 8, 9]] + .into_iter() + .enumerate() + { + expected.add_variable_set(DictIndexVariableSet { + name: format!("Variable Set {index}"), + variables, + }); + } + + let raw = WriteOptions::new() + .write_writer(&expected, Cursor::new(Vec::new())) + .unwrap() + .finish() + .unwrap() + .unwrap() + .into_inner(); + let actual = ReadOptions::new(|_| panic!()) + .open_reader(Cursor::new(raw)) + .unwrap() + .dictionary; + + assert!(actual + .variable_sets() + .iter() + .eq(expected.variable_sets().iter()),); + } + + /// Test writing multiple response sets. + /// + /// This is the example given in the documentation for the system file + /// format. 
+ #[test] + fn mrsets() { + let mut dictionary = Dictionary::new(UTF_8); + for (variables, width) in [ + ('a'..='g', VarWidth::Numeric), + ('h'..='j', VarWidth::String(3)), + ('k'..='m', VarWidth::Numeric), + ('n'..='p', VarWidth::String(6)), + ] { + for variable in variables { + dictionary + .add_var(Variable::new( + Identifier::new(variable.to_string()).unwrap(), + width, + UTF_8, + )) + .unwrap(); + } + } + dictionary + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { + name: Identifier::new("$a").unwrap(), + label: String::from("my mcgroup"), + mr_type: MultipleResponseType::MultipleCategory, + variables: vec![0, 1, 2], + }) + .unwrap(); + dictionary + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { + name: Identifier::new("$b").unwrap(), + label: String::new(), + mr_type: MultipleResponseType::MultipleDichotomy { + datum: Datum::Number(Some(55.0)), + labels: CategoryLabels::VarLabels, + }, + variables: vec![6, 4, 5, 3], + }) + .unwrap(); + dictionary + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { + name: Identifier::new("$c").unwrap(), + label: String::from("mdgroup #2"), + mr_type: MultipleResponseType::MultipleDichotomy { + datum: Datum::String("Yes".into()), + labels: CategoryLabels::VarLabels, + }, + variables: vec![7, 8, 9], + }) + .unwrap(); + dictionary + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { + name: Identifier::new("$d").unwrap(), + label: String::from("third mdgroup"), + mr_type: MultipleResponseType::MultipleDichotomy { + datum: Datum::Number(Some(34.0)), + labels: CategoryLabels::CountedValues { + use_var_label_as_mrset_label: false, + }, + }, + variables: vec![10, 11, 12], + }) + .unwrap(); + dictionary + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { + name: Identifier::new("$e").unwrap(), + label: String::new(), + mr_type: MultipleResponseType::MultipleDichotomy { + datum: Datum::String("choice".into()), + labels: CategoryLabels::CountedValues { + use_var_label_as_mrset_label: true, + }, + }, + 
variables: vec![13, 14, 15], + }) + .unwrap(); + + fn get_mrsets(dictionary: &Dictionary, pre_v14: bool) -> String { + let mut raw = Vec::new(); + DictionaryWriter::new( + &WriteOptions::reproducible(None), + &mut Cursor::new(&mut raw), + dictionary, + ) + .write_mrsets(pre_v14) + .unwrap(); + + str::from_utf8(&raw[16..]).unwrap().into() + } + + assert_eq!( + &get_mrsets(&dictionary, true), + "$a=C 10 my mcgroup a b c +$b=D2 55 0 g e f d +$c=D3 Yes 10 mdgroup #2 h i j +" + ); + assert_eq!( + &get_mrsets(&dictionary, false), + "$d=E 1 2 34 13 third mdgroup k l m +$e=E 11 6 choice 0 n o p +" + ); + } + + #[test] + fn variable_display_parameters() { + let variables = [ + (None, Alignment::Left, 10), + (Some(Measure::Nominal), Alignment::Right, 12), + (Some(Measure::Ordinal), Alignment::Center, 14), + (Some(Measure::Scale), Alignment::Right, 16), + ]; + let mut expected = Dictionary::new(UTF_8); + for (index, (measure, alignment, display_width)) in variables.into_iter().enumerate() { + let mut variable = Variable::new( + Identifier::new(format!("v{index}")).unwrap(), + VarWidth::Numeric, + UTF_8, + ); + variable.measure = measure; + variable.alignment = alignment; + variable.display_width = display_width; + expected.add_var(variable).unwrap(); + } + + let raw = WriteOptions::new() + .write_writer(&expected, Cursor::new(Vec::new())) + .unwrap() + .finish() + .unwrap() + .unwrap() + .into_inner(); + let actual = ReadOptions::new(|_| panic!()) + .open_reader(Cursor::new(raw)) + .unwrap() + .dictionary; + + fn display_parameters( + dictionary: &Dictionary, + ) -> impl Iterator, Alignment, u32)> { + dictionary + .variables + .iter() + .map(|variable| (variable.measure, variable.alignment, variable.display_width)) + } + assert!(display_parameters(&expected).eq(display_parameters(&actual))); + } + + #[test] + fn long_variable_names() { + let long_name = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@$"; + + let mut expected = Dictionary::new(UTF_8); + for 
name in (1..=64).map(|len| long_name[..len].to_string()) { + expected + .add_var(Variable::new( + Identifier::new(name).unwrap(), + VarWidth::Numeric, + UTF_8, + )) + .unwrap(); + } + + let raw = WriteOptions::new() + .write_writer(&expected, Cursor::new(Vec::new())) + .unwrap() + .finish() + .unwrap() + .unwrap() + .into_inner(); + let actual = ReadOptions::new(|_| panic!()) + .open_reader(Cursor::new(raw)) + .unwrap() + .dictionary; + + fn names(dictionary: &Dictionary) -> impl Iterator { + dictionary.variables.iter().map(|variable| &variable.name) + } + assert!(names(&expected).eq(names(&actual))); + } + + /// This tests the example from the documentation for the system file + /// format. + #[test] + fn attributes() { + let mut dictionary = Dictionary::new(UTF_8); + let attributes = Attributes::new() + .with( + Identifier::new("fred").unwrap(), + vec![String::from("23"), String::from("34")], + ) + .with(Identifier::new("bert").unwrap(), vec![String::from("123")]); + dictionary.attributes = attributes.clone(); + let mut variable = + Variable::new(Identifier::new("dummy").unwrap(), VarWidth::Numeric, UTF_8); + variable.attributes = attributes; + dictionary.add_var(variable).unwrap(); + + fn get_attributes(dictionary: &Dictionary, vars: bool) -> String { + let mut raw = Vec::new(); + let options = WriteOptions::reproducible(None); + let mut cursor = Cursor::new(&mut raw); + let mut writer = DictionaryWriter::new(&options, &mut cursor, dictionary); + if vars { + writer.write_variable_attributes().unwrap(); + } else { + writer.write_data_file_attributes().unwrap(); + } + if raw.is_empty() { + String::new() + } else { + str::from_utf8(&raw[16..]).unwrap().into() + } + } + + assert_eq!( + &get_attributes(&dictionary, false), + "bert('123' +)fred('23' +'34' +)" + ); + assert_eq!( + &get_attributes(&dictionary, true), + "dummy:$@Role('0' +)bert('123' +)fred('23' +'34' +)" + ); + } + + #[test] + fn encoding() { + let dictionary = Dictionary::new(UTF_8); + let mut raw = 
Vec::new(); + DictionaryWriter::new( + &WriteOptions::reproducible(None), + &mut Cursor::new(&mut raw), + &dictionary, + ) + .write_encoding() + .unwrap(); + assert_eq!(str::from_utf8(&raw[16..]).unwrap(), "UTF-8"); + } +} diff --git a/rust/pspp/src/variable.rs b/rust/pspp/src/variable.rs new file mode 100644 index 0000000000..866de4f836 --- /dev/null +++ b/rust/pspp/src/variable.rs @@ -0,0 +1,1085 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +//! Variables. + +use std::{ + collections::{BTreeMap, HashMap}, + fmt::{Debug, Display}, + hash::{DefaultHasher, Hash, Hasher}, + ops::{Deref, Not}, + str::FromStr, +}; + +use encoding_rs::{Encoding, UTF_8}; +use num::integer::div_ceil; +use serde::{ser::SerializeSeq, Serialize}; +use thiserror::Error as ThisError; +use unicase::UniCase; + +use crate::{ + data::{ByteString, Datum, Encoded, EncodedString, ResizeError, WithEncoding}, + format::{DisplayPlain, Format}, + identifier::{HasIdentifier, Identifier}, +}; + +/// Variable type. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] +pub enum VarType { + /// A numeric variable. + Numeric, + + /// A string variable. 
+ String, +} + +impl Not for VarType { + type Output = Self; + + fn not(self) -> Self::Output { + match self { + Self::Numeric => Self::String, + Self::String => Self::Numeric, + } + } +} + +impl Not for &VarType { + type Output = VarType; + + fn not(self) -> Self::Output { + !*self + } +} + +impl Display for VarType { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + VarType::Numeric => write!(f, "numeric"), + VarType::String => write!(f, "string"), + } + } +} + +/// [VarType], plus a width for [VarType::String]. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)] +pub enum VarWidth { + Numeric, + String(u16), // XXX change to NonZeroU16, or to 1..=32767 range type +} + +impl VarWidth { + pub const MAX_STRING: u16 = 32767; + + pub fn n_dict_indexes(self) -> usize { + match self { + VarWidth::Numeric => 1, + VarWidth::String(w) => div_ceil(w as usize, 8), + } + } + + fn width_predicate(a: VarWidth, b: VarWidth, f: impl Fn(u16, u16) -> u16) -> Option { + match (a, b) { + (VarWidth::Numeric, VarWidth::Numeric) => Some(VarWidth::Numeric), + (VarWidth::String(a), VarWidth::String(b)) => Some(VarWidth::String(f(a, b))), + _ => None, + } + } + + /// Returns the wider of `self` and `other`: + /// - Numerical variable widths are equally wide. + /// - Longer strings are wider than shorter strings. + /// - Numerical and string types are incomparable, so result in `None`. + pub fn wider(a: VarWidth, b: VarWidth) -> Option { + Self::width_predicate(a, b, |a, b| a.max(b)) + } + + /// Returns the narrower of `self` and `other` (see [`Self::wider`]). 
+ pub fn narrower(a: VarWidth, b: VarWidth) -> Option { + Self::width_predicate(a, b, |a, b| a.min(b)) + } + + pub fn default_display_width(&self) -> u32 { + match self { + VarWidth::Numeric => 8, + VarWidth::String(width) => *width.min(&32) as u32, + } + } + + pub fn is_long_string(&self) -> bool { + if let Self::String(width) = self { + *width > 8 + } else { + false + } + } + + pub fn as_string_width(&self) -> Option { + match self { + VarWidth::Numeric => None, + VarWidth::String(width) => Some(*width as usize), + } + } + + pub fn is_numeric(&self) -> bool { + *self == Self::Numeric + } + + pub fn is_string(&self) -> bool { + !self.is_numeric() + } + + /// Returns true if this is a very long string width, meaning wider than 255 + /// bytes, which was the limit for old versions of SPSS. + pub fn is_very_long_string(&self) -> bool { + match *self { + VarWidth::Numeric => false, + VarWidth::String(width) => width > 255, + } + } + + /// Number of bytes per segment by which the amount of space for very long + /// string variables is allocated. + pub const SEGMENT_SIZE: usize = 252; + + /// Returns an iterator over the "segments" used for writing case data for a + /// variable with this width. A segment is a physical variable in the + /// system file that represents some piece of a logical variable as seen by + /// a PSPP user. Most variables have one segment whose width is their own + /// width, but very long string variables, with width greater than 255, have + /// multiple segments each with width 255 or less. + pub fn segments(&self) -> Segments { + Segments::new(*self) + } + + /// Returns the number of 8-byte chunks used for writing case data for a + /// variable with this width. Very long string variables (wider than 255 + /// bytes) cannot directly be divided into chunks: they must first be + /// divided into multiple [segments](Self::segments), which can then be + /// divided into chunks. 
+ pub fn n_chunks(&self) -> Option { + match *self { + VarWidth::Numeric => Some(1), + VarWidth::String(w) if w <= 255 => Some(w.div_ceil(8) as usize), + VarWidth::String(_) => None, + } + } + + /// Returns the width to allocate to the segment with the given + /// `segment_idx` within this variable. A segment is a physical variable in + /// the system file that represents some piece of a logical variable as seen + /// by a PSPP user. + pub fn segment_alloc_width(&self, segment_idx: usize) -> usize { + debug_assert!(segment_idx < self.segments().len()); + debug_assert!(self.is_very_long_string()); + + if segment_idx < self.segments().len() - 1 { + 255 + } else { + self.as_string_width().unwrap() - segment_idx * Self::SEGMENT_SIZE + } + } + + pub fn display_adjective(&self) -> VarWidthAdjective { + VarWidthAdjective(*self) + } + + pub fn codepage_to_unicode(&mut self) { + match self { + VarWidth::Numeric => (), + VarWidth::String(width) => *width = width.saturating_mul(3).min(Self::MAX_STRING), + } + } +} + +pub struct Segments { + width: VarWidth, + i: usize, + n: usize, +} +impl Segments { + pub fn new(width: VarWidth) -> Self { + Self { + width, + i: 0, + n: if width.is_very_long_string() { + width + .as_string_width() + .unwrap() + .div_ceil(VarWidth::SEGMENT_SIZE) + } else { + 1 + }, + } + } +} + +impl Iterator for Segments { + type Item = VarWidth; + + fn next(&mut self) -> Option { + let i = self.i; + if i >= self.n { + None + } else { + self.i += 1; + match self.width { + VarWidth::Numeric => Some(VarWidth::Numeric), + VarWidth::String(_) if i < self.n - 1 => Some(VarWidth::String(255)), + VarWidth::String(width) => Some(VarWidth::String( + width - (self.n as u16 - 1) * VarWidth::SEGMENT_SIZE as u16, + )), + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let n = self.n - self.i; + (n, Some(n)) + } +} + +impl ExactSizeIterator for Segments {} + +impl From for VarType { + fn from(source: VarWidth) -> Self { + match source { + VarWidth::Numeric => 
VarType::Numeric, + VarWidth::String(_) => VarType::String, + } + } +} + +pub struct VarWidthAdjective(VarWidth); + +impl Display for VarWidthAdjective { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.0 { + VarWidth::Numeric => write!(f, "numeric"), + VarWidth::String(width) => write!(f, "{width}-byte string"), + } + } +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize)] +pub enum Role { + #[default] + Input, + Target, + Both, + None, + Partition, + Split, +} + +impl Role { + pub fn as_str(&self) -> &'static str { + match self { + Role::Input => "Input", + Role::Target => "Target", + Role::Both => "Both", + Role::None => "None", + Role::Partition => "Partition", + Role::Split => "Split", + } + } +} + +impl FromStr for Role { + type Err = InvalidRole; + + fn from_str(s: &str) -> Result { + for (string, value) in [ + ("input", Role::Input), + ("target", Role::Target), + ("both", Role::Both), + ("none", Role::None), + ("partition", Role::Partition), + ("split", Role::Split), + ] { + if string.eq_ignore_ascii_case(s) { + return Ok(value); + } + } + Err(InvalidRole::UnknownRole(s.into())) + } +} + +impl TryFrom for Role { + type Error = InvalidRole; + + fn try_from(value: i32) -> Result { + match value { + 0 => Ok(Role::Input), + 1 => Ok(Role::Target), + 2 => Ok(Role::Both), + 3 => Ok(Role::None), + 4 => Ok(Role::Partition), + 5 => Ok(Role::Split), + _ => Err(InvalidRole::UnknownRole(value.to_string())), + } + } +} + +impl From for i32 { + fn from(value: Role) -> Self { + match value { + Role::Input => 0, + Role::Target => 1, + Role::Both => 2, + Role::None => 3, + Role::Partition => 4, + Role::Split => 5, + } + } +} + +#[derive(Clone, Default, PartialEq, Eq, Serialize)] +pub struct Attributes(pub BTreeMap>); + +impl Attributes { + pub fn new() -> Self { + Self(BTreeMap::new()) + } + + pub fn contains_name(&self, name: &Identifier) -> bool { + self.0.contains_key(name) + } + + pub fn insert(&mut self, name: 
Identifier, values: Vec) { + self.0.insert(name, values); + } + + pub fn with(mut self, name: Identifier, values: Vec) -> Self { + self.insert(name, values); + self + } + + pub fn append(&mut self, other: &mut Self) { + self.0.append(&mut other.0) + } + + pub fn role(&self) -> Result, InvalidRole> { + self.try_into() + } + + pub fn iter(&self, include_at: bool) -> impl Iterator { + self.0.iter().filter_map(move |(name, values)| { + if include_at || !name.0.starts_with('@') { + Some((name, values.as_slice())) + } else { + None + } + }) + } + + pub fn has_any(&self, include_at: bool) -> bool { + self.iter(include_at).next().is_some() + } + + pub fn codepage_to_unicode(&mut self) { + let mut new = BTreeMap::new(); + while let Some((mut name, value)) = self.0.pop_first() { + name.codepage_to_unicode(); + new.insert(name, value); + } + self.0 = new; + } +} + +impl Debug for Attributes { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +#[derive(Clone, Debug, ThisError, PartialEq, Eq)] +pub enum InvalidRole { + #[error("Unknown role {0:?}.")] + UnknownRole(String), + + #[error("Role attribute $@Role must have exactly one value (not {0}).")] + InvalidValues(usize), +} + +impl TryFrom<&Attributes> for Option { + type Error = InvalidRole; + + fn try_from(value: &Attributes) -> Result { + let role = Identifier::new("$@Role").unwrap(); + value.0.get(&role).map_or(Ok(None), |attribute| { + if let Ok([string]) = <&[String; 1]>::try_from(attribute.as_slice()) { + match string.parse::() { + Ok(integer) => Ok(Some(Role::try_from(integer)?)), + Err(_) => Err(InvalidRole::UnknownRole(string.clone())), + } + } else { + Err(InvalidRole::InvalidValues(attribute.len())) + } + }) + } +} + +/// A variable, usually inside a [Dictionary]. +/// +/// [Dictionary]: crate::dictionary::Dictionary +#[derive(Clone, Debug, Serialize)] +pub struct Variable { + /// The variable's name. + /// + /// PSPP variable names are case-insensitive. 
+ pub name: Identifier, + + /// Variable width. + pub width: VarWidth, + + /// User-missing values. + /// + /// Numeric variables also have a system-missing value (represented as + /// `None`). + /// + /// Both kinds of missing values are excluded from most analyses. + missing_values: MissingValues, + + /// Output format used in most contexts. + pub print_format: Format, + + /// Output format used on the `WRITE` command. + pub write_format: Format, + + /// Value labels. + pub value_labels: ValueLabels, + + /// Variable label, an optional meaningful description for the variable + /// itself. + pub label: Option, + + /// Measurement level for the variable's data. + pub measure: Option, + + /// Role in data analysis. + pub role: Role, + + /// Width of data column in GUI. + pub display_width: u32, + + /// Data alignment in GUI. + pub alignment: Alignment, + + /// Whether to retain values of the variable from one case to the next. + pub leave: bool, + + /// For compatibility with old software that supported at most 8-character + /// variable names. + pub short_names: Vec, + + /// Variable attributes. + pub attributes: Attributes, + + /// Encoding for [Value]s inside this variable. + /// + /// The variables in a [Dictionary] must all use the same encoding as the + /// dictionary. 
encoding: &'static Encoding,
}

impl Variable {
    /// Creates a variable named `name` with the given `width` whose string
    /// data is in `encoding`.
    ///
    /// All other properties start out at their defaults: no missing values,
    /// value labels, label, short names, or attributes; print and write
    /// formats, display width, measurement level, and alignment derived from
    /// `width`; and `leave` taken from the class of `name`.
    pub fn new(name: Identifier, width: VarWidth, encoding: &'static Encoding) -> Self {
        let var_type = VarType::from(width);
        let leave = name.class().must_leave();
        Self {
            name,
            width,
            missing_values: MissingValues::default(),
            print_format: Format::default_for_width(width),
            write_format: Format::default_for_width(width),
            value_labels: ValueLabels::new(),
            label: None,
            measure: Measure::default_for_type(var_type),
            role: Role::default(),
            display_width: width.default_display_width(),
            alignment: Alignment::default_for_type(var_type),
            leave,
            short_names: Vec::new(),
            attributes: Attributes::new(),
            encoding,
        }
    }

    /// Returns the encoding of the values inside this variable.
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }

    /// Returns true if this is a numeric variable.
    pub fn is_numeric(&self) -> bool {
        self.width.is_numeric()
    }

    /// Returns true if this is a string variable.
    pub fn is_string(&self) -> bool {
        self.width.is_string()
    }

    /// Returns the variable label, if there is one.
    pub fn label(&self) -> Option<&String> {
        self.label.as_ref()
    }

    /// Changes the variable's width to `width`, adjusting the missing values,
    /// value labels, and both formats to match.
    ///
    /// Value labels whose values cannot be resized are dropped.
    pub fn resize(&mut self, width: VarWidth) {
        // `MissingValues::resize` clears the missing values itself when they
        // cannot be resized, so there is nothing more to do with the error.
        let _ = self.missing_values.resize(width);

        self.value_labels.resize(width);

        self.print_format.resize(width);
        self.write_format.resize(width);

        self.width = width;
    }

    /// Returns the variable's user-missing values.
    pub fn missing_values(&self) -> &MissingValues {
        &self.missing_values
    }

    /// Returns a handle for modifying the variable's user-missing values
    /// that checks new values against the variable's width.
    pub fn missing_values_mut(&mut self) -> MissingValuesMut<'_> {
        MissingValuesMut {
            inner: &mut self.missing_values,
            width: self.width,
        }
    }

    /// Converts the variable from its current encoding to UTF-8.
    pub fn codepage_to_unicode(&mut self) {
        // NOTE(review): `value_labels` is not converted here even though
        // `ValueLabels::codepage_to_unicode` exists; confirm that the caller
        // converts it separately.
        self.name.codepage_to_unicode();
        self.width.codepage_to_unicode();
        self.missing_values.codepage_to_unicode();
        self.print_format.codepage_to_unicode();
        self.write_format.codepage_to_unicode();
        self.attributes.codepage_to_unicode();
        self.encoding = UTF_8;

        // Anything old enough to not support long names is old enough not to
        // support Unicode.
        self.short_names.clear();
    }
}

impl HasIdentifier for Variable {
    fn identifier(&self) -> &UniCase<String> {
        &self.name.0
    }
}

/// Associates values of a variable with meaningful labels.
///
/// For example, 1 => strongly disagree, 2 => disagree, 3 => neither agree nor
/// disagree, ...
#[derive(Clone, Default, PartialEq, Eq)]
pub struct ValueLabels(pub HashMap<Datum<ByteString>, String>);

impl ValueLabels {
    /// Creates an empty set of value labels.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns true if there are no value labels.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Returns the label for `value`, if it has one.
    pub fn get(&self, value: &Datum<ByteString>) -> Option<&str> {
        self.0.get(value).map(|s| s.as_str())
    }

    /// Sets the label for `value` to `label`, returning the label that
    /// `value` previously had, if any.
    pub fn insert(&mut self, value: Datum<ByteString>, label: impl Into<String>) -> Option<String> {
        self.0.insert(value, label.into())
    }

    /// Returns true if every labeled value can be resized to `width`.
    pub fn is_resizable(&self, width: VarWidth) -> bool {
        self.0.keys().all(|datum| datum.is_resizable(width))
    }

    /// Resizes every labeled value to `width`, dropping the labels for
    /// values that cannot be resized.
    pub fn resize(&mut self, width: VarWidth) {
        self.0 = self
            .0
            .drain()
            .filter_map(|(mut datum, string)| datum.resize(width).is_ok().then(|| (datum, string)))
            .collect();
    }

    /// Converts the labeled values from `encoding` to UTF-8.  The labels
    /// themselves are left as they are.
    pub fn codepage_to_unicode(&mut self, encoding: &'static Encoding) {
        self.0 = self
            .0
            .drain()
            .map(|(key, value)| {
                let mut key = key.with_encoding(encoding);
                key.codepage_to_unicode();
                (key.without_encoding(), value)
            })
            .collect();
    }
}

impl Serialize for ValueLabels {
    /// Serializes the value labels as a sequence of (value, label) pairs.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let mut map = serializer.serialize_seq(Some(self.0.len()))?;
        for tuple in &self.0 {
            map.serialize_element(&tuple)?;
        }
        map.end()
    }
}

impl Debug for ValueLabels {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.0.fmt(f)
    }
}

impl Hash for ValueLabels {
    /// Hashes each (value, label) pair separately and XORs the results, so
    /// that the hash does not depend on the map's iteration order.
    fn hash<H: Hasher>(&self, state: &mut H) {
        let mut hash = 0;
        for (k, v) in &self.0 {
            let mut hasher = DefaultHasher::new();
            k.hash(&mut hasher);
            v.hash(&mut hasher);
            hash ^= hasher.finish();
        }
        state.write_u64(hash);
    }
}

/// Mutable access to a variable's [MissingValues], paired with the
/// variable's width so that changes can be validated against it.
pub struct MissingValuesMut<'a> {
    inner: &'a mut MissingValues,
    width: VarWidth,
}

impl<'a> Deref for MissingValuesMut<'a> {
    type Target = MissingValues;

    fn deref(&self) -> &Self::Target {
        self.inner
    }
}

impl<'a> MissingValuesMut<'a> {
    /// Replaces the missing values by `new`, after resizing `new` to the
    /// variable's width.
    ///
    /// On error, the existing missing values are left unchanged.
    pub fn replace(&mut self, mut new: MissingValues) -> Result<(), MissingValuesError> {
        new.resize(self.width)?;
        *self.inner = new;
        Ok(())
    }

    /// Adds `value` as an individual missing value.
    ///
    /// # Errors
    ///
    /// - [MissingValuesError::TooMany] if there are already 3 individual
    ///   values, or a range plus 1 individual value.
    /// - [MissingValuesError::MixedTypes] if `value` is not of the
    ///   variable's type.
    /// - [MissingValuesError::SystemMissing] if `value` is the
    ///   system-missing value.
    /// - [MissingValuesError::TooWide] if `value` cannot be resized to the
    ///   smaller of the variable's width and 8 bytes.
    pub fn add_value(
        &mut self,
        mut value: Datum<WithEncoding<ByteString>>,
    ) -> Result<(), MissingValuesError> {
        // A range leaves room for only one individual value, which is the
        // same limit that `add_range` enforces from the other direction.
        let full = self.inner.values.len() > 2
            || (self.inner.range.is_some() && !self.inner.values.is_empty());
        if full {
            Err(MissingValuesError::TooMany)
        } else if value.var_type() != VarType::from(self.width) {
            Err(MissingValuesError::MixedTypes)
        } else if value == Datum::Number(None) {
            Err(MissingValuesError::SystemMissing)
        } else if value.resize(self.width.min(VarWidth::String(8))).is_err() {
            Err(MissingValuesError::TooWide)
        } else {
            value.trim_end();
            self.inner.values.push(value);
            Ok(())
        }
    }

    /// Adds each of `values` as an individual missing value.
    ///
    /// If any of them is rejected, the values added by this call are removed
    /// again and the error is returned.
    pub fn add_values(
        &mut self,
        values: impl IntoIterator<Item = Datum<WithEncoding<ByteString>>>,
    ) -> Result<(), MissingValuesError> {
        let n = self.inner.values.len();
        for value in values {
            self.add_value(value)
                .inspect_err(|_| self.inner.values.truncate(n))?;
        }
        Ok(())
    }

    /// Sets `range` as the range of missing values.
    ///
    /// # Errors
    ///
    /// - [MissingValuesError::TooMany] if there is already a range or more
    ///   than 1 individual value.
    /// - [MissingValuesError::MixedTypes] if the variable is not numeric.
    pub fn add_range(&mut self, range: MissingValueRange) -> Result<(), MissingValuesError> {
        if self.inner.range.is_some() || self.inner.values().len() > 1 {
            Err(MissingValuesError::TooMany)
        } else if self.width != VarWidth::Numeric {
            Err(MissingValuesError::MixedTypes)
        } else {
            self.inner.range = Some(range);
            Ok(())
        }
    }
}

// Currently doesn't filter out duplicates (should it?).
/// User-missing values for a variable: up to 3 individual values, or a
/// range of numbers plus at most 1 individual value.
#[derive(Clone, Default, Serialize, PartialEq)]
pub struct MissingValues {
    /// Individual missing values, up to 3 of them.
    values: Vec<Datum<WithEncoding<ByteString>>>,

    /// Optional range of missing values.
    range: Option<MissingValueRange>,
}

impl Debug for MissingValues {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}", self)
    }
}

impl Display for MissingValues {
    /// Writes the range (if any) followed by the individual values,
    /// separated by `; `, or `none` if there are no missing values.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if let Some(range) = &self.range {
            write!(f, "{range}")?;
            if !self.values.is_empty() {
                write!(f, "; ")?;
            }
        }

        for (i, value) in self.values.iter().enumerate() {
            if i > 0 {
                write!(f, "; ")?;
            }
            write!(f, "{}", value.quoted())?;
        }

        if self.is_empty() {
            write!(f, "none")?;
        }
        Ok(())
    }
}

/// Reasons that a set of missing values can be rejected.
#[derive(Copy, Clone, Debug)]
pub enum MissingValuesError {
    /// More individual values or ranges than allowed.
    TooMany,

    /// A string value is too wide.
    TooWide,

    /// Numeric and string values (or a range and a string) were mixed.
    MixedTypes,

    /// The system-missing value was given as a user-missing value.
    SystemMissing,
}

impl From<ResizeError> for MissingValuesError {
    fn from(value: ResizeError) -> Self {
        match value {
            ResizeError::MixedTypes => MissingValuesError::MixedTypes,
            ResizeError::TooWide => MissingValuesError::TooWide,
        }
    }
}

impl MissingValues {
    /// Removes all the missing values, including any range.
    pub fn clear(&mut self) {
        *self = Self::default();
    }

    /// Returns the individual missing values.
    pub fn values(&self) -> &[Datum<WithEncoding<ByteString>>] {
        &self.values
    }

    /// Returns the range of missing values, if there is one.
    pub fn range(&self) -> Option<&MissingValueRange> {
        self.range.as_ref()
    }

    /// Creates missing values from individual `values` and an optional
    /// `range`.  Trailing spaces are trimmed from `values`.
    ///
    /// # Errors
    ///
    /// - [MissingValuesError::TooMany] if there are more than 3 values, or
    ///   more than 1 value along with a range.
    /// - [MissingValuesError::TooWide] if any value is a long string.
    /// - [MissingValuesError::MixedTypes] if the values are not all of the
    ///   same type, or if string values are combined with a range.
    pub fn new(
        mut values: Vec<Datum<WithEncoding<ByteString>>>,
        range: Option<MissingValueRange>,
    ) -> Result<Self, MissingValuesError> {
        // Same limits that `MissingValuesMut::add_value` and `add_range`
        // enforce incrementally.
        if values.len() > 3 || (range.is_some() && values.len() > 1) {
            return Err(MissingValuesError::TooMany);
        }

        let mut var_type = None;
        for value in values.iter_mut() {
            value.trim_end();
            if value.width().is_long_string() {
                return Err(MissingValuesError::TooWide);
            }
            if var_type.is_some_and(|t| t != value.var_type()) {
                return Err(MissingValuesError::MixedTypes);
            }
            var_type = Some(value.var_type());
        }

        if var_type == Some(VarType::String) && range.is_some() {
            return Err(MissingValuesError::MixedTypes);
        }

        Ok(Self { values, range })
    }

    /// Returns true if there are no individual values and no range.
    pub fn is_empty(&self) -> bool {
        self.values.is_empty() && self.range.is_none()
    }

    /// Returns the type of the missing values, or `None` if there are none.
    pub fn var_type(&self) -> Option<VarType> {
        if let Some(datum) = self.values.first() {
            Some(datum.var_type())
        } else if self.range.is_some() {
            Some(VarType::Numeric)
        } else {
            None
        }
    }

    /// Returns true if `value` is one of the individual missing values
    /// (ignoring trailing spaces) or is a number within the range.
    pub fn contains<S>(&self, value: &Datum<S>) -> bool
    where
        S: EncodedString,
    {
        if self
            .values
            .iter()
            .any(|datum| datum.eq_ignore_trailing_spaces(value))
        {
            return true;
        }

        if let Some(Some(number)) = value.as_number()
            && let Some(range) = self.range
        {
            range.contains(number)
        } else {
            false
        }
    }

    /// Resizes the missing values to `width`.
    ///
    /// On error, all the missing values are cleared.
    pub fn resize(&mut self, width: VarWidth) -> Result<(), MissingValuesError> {
        fn inner(this: &mut MissingValues, width: VarWidth) -> Result<(), MissingValuesError> {
            for datum in &mut this.values {
                datum.resize(width)?;
                datum.trim_end();
            }
            if let Some(range) = &mut this.range {
                range.resize(width)?;
            }
            Ok(())
        }
        inner(self, width).inspect_err(|_| self.clear())
    }

    /// Converts string missing values that are not already UTF-8 to UTF-8,
    /// keeping at most 8 bytes of each.
    pub fn codepage_to_unicode(&mut self) {
        self.values = self
            .values
            .drain(..)
            .map(|value| match value {
                Datum::Number(number) => Datum::Number(number),
                Datum::String(s) => Datum::String(if s.encoding() != UTF_8 {
                    let decoded = s.as_str();

                    // Cut at a character boundary: chopping the bytes at
                    // exactly 8 could split a multi-byte character and leave
                    // invalid UTF-8 behind.
                    let mut end = decoded.len().min(8);
                    while !decoded.is_char_boundary(end) {
                        end -= 1;
                    }
                    WithEncoding::new(ByteString::from(&decoded[..end]), UTF_8)
                } else {
                    s
                }),
            })
            .collect();
    }
}

/// A range of numbers treated as missing.  Both ends are inclusive.
#[derive(Copy, Clone, Debug, Serialize, PartialEq)]
pub enum MissingValueRange {
    /// `low` through `high`.
    In { low: f64, high: f64 },

    /// `low` and everything above it.
    From { low: f64 },

    /// `high` and everything below it.
    To { high: f64 },
}

impl MissingValueRange {
    /// Creates a range from `low` to `high`.
    ///
    /// A `low` of `f64::MIN`, or the next value above it, yields a range with
    /// no lower bound; otherwise a `high` of `f64::MAX` yields a range with
    /// no upper bound.
    pub fn new(low: f64, high: f64) -> Self {
        const LOWEST: f64 = f64::MIN.next_up();
        match (low, high) {
            (f64::MIN | LOWEST, _) => Self::To { high },
            (_, f64::MAX) => Self::From { low },
            (_, _) => Self::In { low, high },
        }
    }

    /// Returns the lower bound, or `None` if the range has none.
    pub fn low(&self) -> Option<f64> {
        match self {
            MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low),
            MissingValueRange::To { .. } => None,
        }
    }

    /// Returns the upper bound, or `None` if the range has none.
    pub fn high(&self) -> Option<f64> {
        match self {
            MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high),
            MissingValueRange::From { .. } => None,
        }
    }

    /// Returns true if `number` is within the range, bounds included.
    pub fn contains(&self, number: f64) -> bool {
        match self {
            // Inclusive at both ends, matching the one-sided variants below.
            MissingValueRange::In { low, high } => (*low..=*high).contains(&number),
            MissingValueRange::From { low } => number >= *low,
            MissingValueRange::To { high } => number <= *high,
        }
    }

    /// Checks that the range is valid for a variable of the given `width`.
    /// Ranges only make sense for numeric variables.
    pub fn resize(&self, width: VarWidth) -> Result<(), MissingValuesError> {
        if width.is_numeric() {
            Ok(())
        } else {
            Err(MissingValuesError::MixedTypes)
        }
    }
}

impl Display for MissingValueRange {
    /// Writes the range as `<low> THRU <high>`, with `LOW` or `HIGH` in place
    /// of a missing bound.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self.low() {
            Some(low) => low.display_plain().fmt(f)?,
            None => write!(f, "LOW")?,
        }

        write!(f, " THRU ")?;

        match self.high() {
            Some(high) => high.display_plain().fmt(f)?,
            None => write!(f, "HIGH")?,
        }
        Ok(())
    }
}

/// How to align a variable's data for display.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
pub enum Alignment {
    Left,
    Right,
    Center,
}

impl Alignment {
    /// Returns the default alignment for `var_type`: right for numbers, left
    /// for strings.
    pub fn default_for_type(var_type: VarType) -> Self {
        match var_type {
            VarType::Numeric => Self::Right,
            VarType::String => Self::Left,
        }
    }

    /// Returns the alignment's name.
    pub fn as_str(&self) -> &'static str {
        match self {
            Alignment::Left => "Left",
            Alignment::Right => "Right",
            Alignment::Center => "Center",
        }
    }
}

/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
pub enum Measure {
    /// Nominal values can only be compared for equality.
    Nominal,

    /// Ordinal values can be meaningfully ordered.
    Ordinal,

    /// Scale values can be meaningfully compared for the degree of difference.
    Scale,
}

impl Measure {
    /// Returns the default measurement level for `var_type`: none for
    /// numbers, nominal for strings.
    pub fn default_for_type(var_type: VarType) -> Option<Self> {
        match var_type {
            VarType::Numeric => None,
            VarType::String => Some(Self::Nominal),
        }
    }

    /// Returns the measurement level's name.
    pub fn as_str(&self) -> &'static str {
        match self {
            Measure::Nominal => "Nominal",
            Measure::Ordinal => "Ordinal",
            Measure::Scale => "Scale",
        }
    }
}

#[cfg(test)]
mod tests {
    use encoding_rs::{UTF_8, WINDOWS_1252};

    use crate::{
        data::{ByteString, Datum, RawString, WithEncoding},
        variable::{MissingValueRange, MissingValues, ValueLabels, VarWidth},
    };

    #[test]
    fn var_width_codepage_to_unicode() {
        fn check_unicode(input: VarWidth, expected: VarWidth) {
            let mut actual = input;
            actual.codepage_to_unicode();
            assert_eq!(actual, expected);
        }

        check_unicode(VarWidth::Numeric, VarWidth::Numeric);
        check_unicode(VarWidth::String(1), VarWidth::String(3));
        check_unicode(VarWidth::String(2), VarWidth::String(6));
        check_unicode(VarWidth::String(3), VarWidth::String(9));
        check_unicode(VarWidth::String(1000), VarWidth::String(3000));
        check_unicode(VarWidth::String(20000), VarWidth::String(32767));
        check_unicode(VarWidth::String(30000), VarWidth::String(32767));
    }

    #[test]
    fn missing_values_codepage_to_unicode() {
        fn windows_1252(s: &str) -> WithEncoding<ByteString> {
            ByteString::from(WINDOWS_1252.encode(s).0).with_encoding(WINDOWS_1252)
        }

        let mut actual = MissingValues::new(
            vec![
                Datum::String(windows_1252("abcdefgh")),
                Datum::String(windows_1252("éèäî ")),
                Datum::String(windows_1252("aaéèäîdf")),
            ],
            None,
        )
        .unwrap();
        actual.codepage_to_unicode();

        fn utf_8(s: &str) -> WithEncoding<ByteString> {
            ByteString::from(s).with_encoding(UTF_8)
        }

        let expected = MissingValues::new(
            vec![
                Datum::String(utf_8("abcdefgh")),
                Datum::String(utf_8("éèäî")),
                Datum::String(utf_8("aaéèä")),
            ],
            None,
        )
        .unwrap();

        assert_eq!(&actual, &expected);
    }

    #[test]
    fn missing_values_codepage_to_unicode_char_boundary() {
        // In UTF-8 the 8-byte limit falls in the middle of "ä", so the whole
        // character has to go.
        let input =
            ByteString::from(WINDOWS_1252.encode("aaaéèäî").0).with_encoding(WINDOWS_1252);
        let mut actual = MissingValues::new(vec![Datum::String(input)], None).unwrap();
        actual.codepage_to_unicode();

        let expected = MissingValues::new(
            vec![Datum::String(ByteString::from("aaaéè").with_encoding(UTF_8))],
            None,
        )
        .unwrap();

        assert_eq!(&actual, &expected);
    }

    #[test]
    fn missing_value_range_contains() {
        let range = MissingValueRange::new(1.0, 5.0);
        assert!(range.contains(1.0));
        assert!(range.contains(3.0));
        assert!(range.contains(5.0));
        assert!(!range.contains(0.5));
        assert!(!range.contains(5.5));

        assert!(MissingValueRange::new(f64::MIN, 5.0).contains(5.0));
        assert!(MissingValueRange::new(1.0, f64::MAX).contains(1.0));
    }

    #[test]
    fn value_labels_codepage_to_unicode() {
        fn windows_1252(s: &str) -> Datum<ByteString> {
            Datum::String(ByteString::from(WINDOWS_1252.encode(s).0))
        }

        let mut actual = ValueLabels::new();
        actual.insert(windows_1252("abcd"), "Label 1");
        actual.insert(windows_1252("éèäî"), "Label 2");
        actual.codepage_to_unicode(WINDOWS_1252);

        // NOTE(review): the trailing padding in these two literals may have
        // been shortened when this text was extracted; confirm the number of
        // spaces against the converted width.
        let mut expected = ValueLabels::new();
        expected.insert(Datum::String(ByteString::from("abcd ")), "Label 1");
        expected.insert(Datum::String(ByteString::from("éèäî ")), "Label 2");

        assert_eq!(&actual, &expected);
    }
}