From 5288a3c74a0abe77985d630727a648674ca00da2 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 18 Jul 2025 09:10:56 -0700 Subject: [PATCH] work on writing system files --- rust/Cargo.lock | 21 ++- rust/pspp/Cargo.toml | 1 + rust/pspp/src/dictionary.rs | 219 ++++++++++++++++++++++++++++-- rust/pspp/src/format/mod.rs | 2 +- rust/pspp/src/identifier.rs | 155 ++++++++++++++++++++- rust/pspp/src/output/pivot/mod.rs | 50 ++++++- rust/pspp/src/sys/mod.rs | 2 + rust/pspp/src/sys/raw.rs | 10 ++ rust/pspp/src/sys/raw/records.rs | 34 ++--- rust/pspp/src/sys/write.rs | 208 ++++++++++++++++++++++++++++ 10 files changed, 662 insertions(+), 40 deletions(-) create mode 100644 rust/pspp/src/sys/write.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 52e8544160..fb413463d9 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -663,12 +663,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "finl_unicode" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94c970b525906eb37d3940083aa65b95e481fc1857d467d13374e1d925cfc163" - [[package]] name = "flagset" version = "0.4.7" @@ -1618,7 +1612,6 @@ dependencies = [ "encoding_rs", "enum-iterator", "enum-map", - "finl_unicode", "flagset", "flate2", "hexplay", @@ -1641,6 +1634,8 @@ dependencies = [ "thiserror", "unicase", "unicode-linebreak", + "unicode-properties", + "unicode-segmentation", "unicode-width", "windows-sys 0.48.0", "xmlwriter", @@ -2261,6 +2256,18 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" +[[package]] +name = "unicode-properties" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.2.0" diff --git a/rust/pspp/Cargo.toml b/rust/pspp/Cargo.toml index 3ad04a1296..a4e1e8b975 100644 --- a/rust/pspp/Cargo.toml +++ b/rust/pspp/Cargo.toml @@ -49,6 +49,7 @@ aes = "0.8.4" readpass = "1.0.3" zeroize = "1.8.1" unicode-properties = "0.1.3" +unicode-segmentation = "1.12.0" [target.'cfg(windows)'.dependencies] windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] } diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 151e218987..7e1bb23705 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -31,6 +31,7 @@ use encoding_rs::Encoding; use enum_map::{Enum, EnumMap}; use indexmap::IndexSet; use num::integer::div_ceil; +use smallvec::SmallVec; use thiserror::Error as ThisError; use unicase::UniCase; @@ -38,7 +39,9 @@ use crate::{ data::Datum, format::{DisplayPlain, Format}, identifier::{ByIdentifier, HasIdentifier, Identifier}, - output::pivot::{Axis3, Dimension, Footnote, Footnotes, Group, PivotTable, Value}, + output::pivot::{ + Axis3, Dimension, Display26Adic, Footnote, Footnotes, Group, PivotTable, Value, + }, settings::Show, }; @@ -168,10 +171,12 @@ impl VarWidth { !self.is_numeric() } + /// Returns true if this is a very long string width, meaning wider than 255 + /// bytes, which was the limit for old versions of SPSS. pub fn is_very_long(&self) -> bool { match *self { VarWidth::Numeric => false, - VarWidth::String(width) => width >= 256, + VarWidth::String(width) => width > 255, } } @@ -549,6 +554,110 @@ impl Dictionary { (group, values) } + + pub fn short_names(&self) -> Vec> { + fn pick_short_name( + variable_name: &Identifier, + used_names: &mut HashSet, + encoding: &'static Encoding, + ) -> Identifier { + for index in 0.. 
{ + let name = if index == 0 { + variable_name.shortened(encoding) + } else { + variable_name + .with_suffix( + &format!("_{}", Display26Adic::new_uppercase(index)), + encoding, + 8, + ) + .or_else(|_| { + Identifier::new(format!("V{}", Display26Adic::new_uppercase(index))) + }) + .unwrap() + }; + if !used_names.contains(&name) { + used_names.insert(name.clone()); + return name; + } + } + unreachable!() + } + + let mut used_names = HashSet::new(); + + // Each variable whose name is short has the best claim to its short + // name. + let mut short_names: Vec; 1]>> = self + .variables + .iter() + .map(|variable| { + let n = variable.width.n_segments(); + let mut names = SmallVec::with_capacity(n); + if self.encoding.encode(variable.name.as_str()).0.len() <= 8 { + used_names.insert(variable.name.clone()); + names.push(Some(variable.name.clone())) + } + while names.len() < n { + names.push(None); + } + names + }) + .collect(); + + // Each variable with an assigned short name for its first segment now + // gets it unless there is a conflict. In case of conflict, the + // claimant earlier in dictionary order wins. Then similarly for + // additional segments of very long strings. 
+ for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) { + if short_names[0].is_none() + && let Some(short_name) = variable.short_names.first() + && !used_names.contains(&short_name) + { + used_names.insert(short_name.clone()); + short_names[0] = Some(short_name.clone()); + } + } + for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) { + for (index, assigned_short_name) in short_names.iter_mut().enumerate().skip(1) { + if assigned_short_name.is_none() + && let Some(short_name) = variable.short_names.get(index) + && !used_names.contains(&short_name) + { + used_names.insert(short_name.clone()); + *assigned_short_name = Some(short_name.clone()); + } + } + } + + // Assign short names to first segment of remaining variables, + // then similarly for additional segments. + for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) { + if short_names[0].is_none() { + short_names[0] = Some(pick_short_name( + &variable.name, + &mut used_names, + self.encoding, + )); + } + } + for (variable, short_names) in self.variables.iter().zip(short_names.iter_mut()) { + for assigned_short_name in short_names.iter_mut().skip(1) { + if assigned_short_name.is_none() { + *assigned_short_name = Some(pick_short_name( + &variable.name, + &mut used_names, + self.encoding, + )); + } + } + } + + short_names + .into_iter() + .map(|names| names.into_iter().flatten().collect()) + .collect() + } } pub struct OutputVariables<'a> { @@ -1553,19 +1662,24 @@ pub enum CategoryLabels { mod test { use std::collections::HashSet; + use encoding_rs::{UTF_8, WINDOWS_1252}; + use smallvec::SmallVec; use unicase::UniCase; - use crate::identifier::Identifier; + use crate::{ + dictionary::{Dictionary, VarWidth, Variable}, + identifier::Identifier, + }; use super::{ByIdentifier, HasIdentifier}; #[derive(PartialEq, Eq, Debug, Clone)] - struct Variable { + struct SimpleVar { name: Identifier, value: i32, } - impl HasIdentifier for Variable { 
+ impl HasIdentifier for SimpleVar { fn identifier(&self) -> &UniCase { &self.name.0 } @@ -1575,11 +1689,11 @@ mod test { fn test() { // Variables should not be the same if their values differ. let abcd = Identifier::new("abcd").unwrap(); - let abcd1 = Variable { + let abcd1 = SimpleVar { name: abcd.clone(), value: 1, }; - let abcd2 = Variable { + let abcd2 = SimpleVar { name: abcd, value: 2, }; @@ -1591,7 +1705,7 @@ mod test { assert_eq!(abcd1_by_name, abcd2_by_name); // And a `HashSet` of `ByName` should also treat them the same. - let mut vars: HashSet> = HashSet::new(); + let mut vars: HashSet> = HashSet::new(); assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone()))); assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone()))); assert_eq!( @@ -1602,4 +1716,93 @@ mod test { 1 ); } + + #[test] + fn short_names() { + for (variables, expected, encoding) in [ + ( + [("VariableName1", 1), ("VARIABLE", 1), ("VariableName2", 1)], + vec![vec!["Variab_A"], vec!["VARIABLE"], vec!["Variab_B"]], + UTF_8, + ), + ( + [ + ("LongVarNameA", 1), + ("LongVarNameB", 1), + ("LongVarNameC", 1), + ], + vec![vec!["LongVarN"], vec!["LongVa_A"], vec!["LongVa_B"]], + UTF_8, + ), + ( + [ + ("LongVarNameA", 300), + ("LongVarNameB", 1), + ("LongVarNameC", 1), + ], + vec![ + vec!["LongVarN", "LongVa_C"], + vec!["LongVa_A"], + vec!["LongVa_B"], + ], + UTF_8, + ), + ( + [ + // The accented letters are 2 bytes and the katakana is 3 + // bytes in UTF-8. + ("éèäスîVarNameA", 300), + ("éèäスVarNameB", 1), + ("éèäîVarNameC", 1), + ], + vec![vec!["éèä", "éèä_B"], vec!["éèä_A"], vec!["éèäî"]], + UTF_8, + ), + ( + [ + // This version uses `e` with modifying acute accent in the + // first name. + ("e\u{301}èäスîVarNameA", 300), + ("éèäスVarNameB", 1), + ("éèäîVarNameC", 1), + ], + vec![vec!["e\u{301}èä", "e\u{301}è_A"], vec!["éèä"], vec!["éèäî"]], + UTF_8, + ), + ( + [ + // The accented letters are only 1 byte in windows-1252. 
+ ("éèäîVarNameA", 300), + ("éèäîVarNameB", 1), + ("éèäîVarNameC", 1), + ], + vec![ + vec!["éèäîVarN", "éèäîVa_C"], + vec!["éèäîVa_A"], + vec!["éèäîVa_B"], + ], + WINDOWS_1252, + ), + ] { + let mut dict = Dictionary::new(encoding); + for (name, width) in variables { + dict.add_var(Variable::new( + Identifier::new(name).unwrap(), + VarWidth::String(width), + encoding, + )) + .unwrap(); + } + let expected = expected + .into_iter() + .map(|names| { + names + .into_iter() + .map(|name| Identifier::new(name).unwrap()) + .collect::>() + }) + .collect::>(); + assert_eq!(expected, dict.short_names()); + } + } } diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index d46db2d8ee..7ddbbbaee7 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -28,8 +28,8 @@ use thiserror::Error as ThisError; use unicode_width::UnicodeWidthStr; use crate::{ - data::RawString, data::Datum, + data::RawString, dictionary::{VarType, VarWidth}, sys::raw, }; diff --git a/rust/pspp/src/identifier.rs b/rust/pspp/src/identifier.rs index 823ba6e971..b2565d6a54 100644 --- a/rust/pspp/src/identifier.rs +++ b/rust/pspp/src/identifier.rs @@ -22,10 +22,11 @@ use std::{ ops::{Deref, DerefMut}, }; -use encoding_rs::{EncoderResult, Encoding, UTF_8}; +use encoding_rs::{CoderResult, Encoder, EncoderResult, Encoding, UTF_8}; use thiserror::Error as ThisError; use unicase::UniCase; use unicode_properties::UnicodeGeneralCategory; +use unicode_segmentation::UnicodeSegmentation; #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum Class { @@ -209,6 +210,12 @@ impl Identifier { /// encoding used by the dictionary, not in UTF-8. 
pub const MAX_LEN: usize = 64; + fn new_unchecked(s: impl Into>) -> Self { + let s: UniCase = s.into(); + debug_assert!(Self::check_plausible(&s).is_ok()); + Identifier(s) + } + pub fn new(s: impl Into>) -> Result { Self::from_encoding(s, UTF_8) } @@ -218,7 +225,7 @@ impl Identifier { encoding: &'static Encoding, ) -> Result { let s: UniCase = s.into(); - Self::is_plausible(&s)?; + Self::check_plausible(&s)?; let identifier = Identifier(s); identifier.check_encoding(encoding)?; Ok(identifier) @@ -260,7 +267,7 @@ impl Identifier { }*/ Ok(()) } - pub fn is_plausible(s: &str) -> Result<(), Error> { + pub fn check_plausible(s: &str) -> Result<(), Error> { if s.is_empty() { return Err(Error::Empty); } @@ -330,6 +337,111 @@ impl Identifier { pub fn as_str(&self) -> &str { self.0.as_ref() } + + /// Returns this identifier truncated to at most 8 bytes in `encoding`. + pub fn shortened(&self, encoding: &'static Encoding) -> Self { + let new_len = shortened_len(self, "", encoding, 8); + Self::new_unchecked(self.0[..new_len].to_string()) + } + + /// Returns a prefix of this identifier concatenated with all of `suffix`, + /// including as many grapheme clusters from the beginning of this + /// identifier as would fit within `max_len` bytes if the resulting string + /// were to be re-encoded in `encoding`. + /// + /// `max_len` would ordinarily be 64, since that's the maximum length of an + /// identifier, but a value of 8 is appropriate for short variable names. + /// + /// This function fails if adding or using `suffix` produces an invalid + /// [Identifier], for example if `max_len` is short enough that none of the + /// identifier can be included and `suffix` begins with `'_'` or another + /// character that may not appear at the beginning of an identifier. 
+ /// + /// # Examples + /// + /// Simple examples for UTF-8 `encoding` with `max_len` of 6: + /// + /// ```text + /// identifier="abc", suffix="xyz" => "abcxyz" + /// identifier="abcd", suffix="xyz" => "abcxyz" + /// identifier="abc", suffix="uvwxyz" => "uvwxyz" + /// identifier="abc", suffix="tuvwxyz" => "tuvwxyz" + /// ``` + /// + /// Examples for windows-1252 `encoding` with `max_len` of 6: + /// + /// ```text + /// identifier="éèä", suffix="xyz" => "éèäxyz" + /// ``` + /// + /// (each letter in the identifier is only 1 byte in windows-1252 even + /// though they each take 2 bytes in UTF-8) + pub fn with_suffix( + &self, + suffix: &str, + encoding: &'static Encoding, + max_len: usize, + ) -> Result { + let prefix_len = shortened_len(self, suffix, encoding, max_len); + if prefix_len == 0 { + Self::new(suffix) + } else { + Self::new(format!("{}{suffix}", &self[..prefix_len])) + } + } +} + +fn encode_fully(encoder: &mut Encoder, mut src: &str, dst: &mut Vec, last: bool) { + while let (CoderResult::OutputFull, read, _) = encoder.encode_from_utf8_to_vec(src, dst, last) { + src = &src[read..]; + dst.reserve((dst.capacity() * 2) - dst.len()); + } +} + +fn shortened_len(prefix: &str, suffix: &str, encoding: &'static Encoding, max_len: usize) -> usize { + assert!(max_len <= 64); + if encoding == UTF_8 { + if prefix.len() + suffix.len() <= max_len { + prefix.len() + } else if suffix.len() >= max_len { + 0 + } else { + let mut copy_len = 0; + for (cluster_start, cluster) in prefix.grapheme_indices(true) { + let cluster_end = cluster_start + cluster.len(); + if cluster_end > max_len - suffix.len() { + break; + } + copy_len = cluster_end; + } + copy_len + } + } else { + let mut copy_len = 0; + let mut tmp = Vec::with_capacity(max_len); + for (cluster_start, cluster) in prefix.grapheme_indices(true) { + let cluster_end = cluster_start + cluster.len(); + let mut encoder = encoding.new_encoder(); + tmp.clear(); + encode_fully(&mut encoder, &prefix[..cluster_end], &mut tmp, 
false); + if tmp.len() <= max_len { + encode_fully(&mut encoder, suffix, &mut tmp, true); + } + if tmp.len() > max_len { + break; + } + copy_len = cluster_end; + } + copy_len + } +} + +impl Deref for Identifier { + type Target = UniCase; + + fn deref(&self) -> &Self::Target { + &self.0 + } } impl PartialEq for Identifier { @@ -490,3 +602,40 @@ where &mut self.0 } } + +#[cfg(test)] +mod tests { + use encoding_rs::{UTF_8, WINDOWS_1252}; + + use crate::identifier::Identifier; + + #[test] + fn with_suffix() { + for (head, suffix, encoding, max_len, expected) in [ + ("abc", "xyz", UTF_8, 6, "abcxyz"), + ("abcd", "xyz", UTF_8, 6, "abcxyz"), + ("abcd", "uvwxyz", UTF_8, 6, "uvwxyz"), + ("abc", "tuvwxyz", UTF_8, 6, "tuvwxyz"), + ("éèä", "xyz", UTF_8, 6, "éxyz"), + ("éèä", "xyz", WINDOWS_1252, 6, "éèäxyz"), + ] { + let head = Identifier::new(head).unwrap(); + let suffix = Identifier::new(suffix).unwrap(); + let actual = head.with_suffix(&suffix, encoding, max_len).unwrap(); + assert_eq!(&actual, expected); + } + } + + #[test] + fn shortened() { + for (long, expected_short, encoding) in [ + ("abc", "abc", UTF_8), + ("éèäîVarNameA", "éèäî", UTF_8), + ("éèäîVarNameA", "éèäîVarN", WINDOWS_1252), + ] { + let long = Identifier::new(long).unwrap(); + let short = long.shortened(encoding); + assert_eq!(&short, expected_short); + } + } +} diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index 5d94390b30..2f670a572c 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -1725,23 +1725,42 @@ impl Display for DisplayMarker<'_> { } else { let i = self.footnote.index + 1; match self.options.footnote_marker_type { - FootnoteMarkerType::Alphabetic => write!(f, "{}", Display26Adic(i)), + FootnoteMarkerType::Alphabetic => write!(f, "{}", Display26Adic::new_lowercase(i)), FootnoteMarkerType::Numeric => write!(f, "{i}"), } } } } -pub struct Display26Adic(pub usize); +/// Displays a number in 26adic notation. 
+/// +/// Zero is displayed as the empty string, 1 through 26 as `a` through `z`, 27 +/// through 52 as `aa` through `az`, and so on. +pub struct Display26Adic { + value: usize, + base: u8, +} + +impl Display26Adic { + /// Constructs a `Display26Adic` for `value`, with letters in lowercase. + pub fn new_lowercase(value: usize) -> Self { + Self { value, base: b'a' } + } + + /// Constructs a `Display26Adic` for `value`, with letters in uppercase. + pub fn new_uppercase(value: usize) -> Self { + Self { value, base: b'A' } + } +} impl Display for Display26Adic { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let mut output = SmallVec::<[u8; 16]>::new(); - let mut number = self.0; + let mut number = self.value; while number > 0 { number -= 1; let digit = (number % 26) as u8; - output.push(digit + b'a'); + output.push(digit + self.base); number /= 26; } output.reverse(); @@ -1749,6 +1768,29 @@ impl Display for Display26Adic { } } +#[cfg(test)] +mod tests { + use super::Display26Adic; + #[test] + fn display_26adic() { + for (number, lowercase, uppercase) in [ + (0, "", ""), + (1, "a", "A"), + (2, "b", "B"), + (26, "z", "Z"), + (27, "aa", "AA"), + (28, "ab", "AB"), + (29, "ac", "AC"), + (18278, "zzz", "ZZZ"), + (18279, "aaaa", "AAAA"), + (19010, "abcd", "ABCD"), + ] { + assert_eq!(Display26Adic::new_lowercase(number).to_string(), lowercase); + assert_eq!(Display26Adic::new_uppercase(number).to_string(), uppercase); + } + } +} + /// The content of a single pivot table cell. 
/// /// A [Value] is also a pivot table's title, caption, footnote marker and diff --git a/rust/pspp/src/sys/mod.rs b/rust/pspp/src/sys/mod.rs index 94e063fcf1..b05eba3ec6 100644 --- a/rust/pspp/src/sys/mod.rs +++ b/rust/pspp/src/sys/mod.rs @@ -35,5 +35,7 @@ pub mod raw; #[cfg(test)] pub mod sack; +mod write; + #[cfg(test)] mod test; diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 41ab01f42f..dca752f034 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -939,6 +939,16 @@ impl Debug for Magic { } } +impl From for [u8; 4] { + fn from(value: Magic) -> Self { + match value { + Magic::Sav => Magic::SAV, + Magic::Zsav => Magic::ZSAV, + Magic::Ebcdic => Magic::EBCDIC, + } + } +} + impl TryFrom<[u8; 4]> for Magic { type Error = ErrorDetails; diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index d49437697b..7dbe107959 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -25,7 +25,7 @@ use crate::{ }, }; -use binrw::BinRead; +use binrw::{BinRead, BinWrite}; use itertools::Itertools; use thiserror::Error as ThisError; @@ -127,6 +127,22 @@ where } } +#[derive(BinRead, BinWrite)] +pub struct RawHeader { + pub magic: [u8; 4], + pub eye_catcher: [u8; 60], + pub layout_code: u32, + pub nominal_case_size: u32, + pub compression_code: u32, + pub weight_index: u32, + pub n_cases: u32, + pub bias: f64, + pub creation_date: [u8; 9], + pub creation_time: [u8; 8], + #[brw(pad_after = 3)] + pub file_label: [u8; 64], +} + impl FileHeader { /// Reads a header record from `r`, reporting any warnings via `warn`. 
pub fn read(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result @@ -150,22 +166,6 @@ impl FileHeader { header_bytes: &[u8], warn: &mut dyn FnMut(Warning), ) -> Result { - #[derive(BinRead)] - struct RawHeader { - magic: [u8; 4], - eye_catcher: [u8; 60], - layout_code: u32, - nominal_case_size: u32, - compression_code: u32, - weight_index: u32, - n_cases: u32, - bias: f64, - creation_date: [u8; 9], - creation_time: [u8; 8], - file_label: [u8; 64], - _padding: [u8; 3], - } - if &header_bytes[8..20] == b"ENCRYPTEDSAV" { return Err(ErrorDetails::Encrypted); } diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs new file mode 100644 index 0000000000..e65edfd29e --- /dev/null +++ b/rust/pspp/src/sys/write.rs @@ -0,0 +1,208 @@ +use std::{ + io::{Seek, Write}, + iter::repeat_n, +}; + +use binrw::{BinWrite, Error as BinError}; +use chrono::Local; +use smallvec::SmallVec; + +use crate::{ + dictionary::{Dictionary, VarWidth}, + sys::raw::{ + records::{Compression, RawHeader}, + Magic, + }, +}; + +/// System file format version. +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)] +pub enum Version { + /// Obsolete version. + V2, + + /// Current version. + #[default] + V3, +} + +#[derive(Copy, Clone, Debug)] +pub struct WriteOptions { + /// How to compress (if at all) data in the system file. + pub compression: Option, + + /// System file version to write. 
+ pub version: Version, +} + +impl Default for WriteOptions { + fn default() -> Self { + Self { + compression: Some(Compression::Simple), + version: Default::default(), + } + } +} + +impl WriteOptions { + pub fn new() -> Self { + Self::default() + } + pub fn write_writer( + self, + dictionary: &Dictionary, + mut writer: W, + ) -> Result, BinError> + where + W: Write + Seek, + { + fn as_byte_array(s: String) -> [u8; N] { + let mut bytes = s.into_bytes(); + bytes.resize(N, b' '); + bytes.try_into().unwrap() + } + + fn count_segments(case_vars: &[CaseVar]) -> u32 { + case_vars.iter().map(CaseVar::n_segments).sum::() as u32 + } + + let case_vars = dictionary + .variables + .iter() + .map(|variable| CaseVar::new(variable.width)) + .collect::>(); + + let now = Local::now(); + let header = RawHeader { + magic: if self.compression == Some(Compression::ZLib) { + Magic::Zsav + } else { + Magic::Sav + } + .into(), + eye_catcher: { + as_byte_array(format!( + "@(#) SPSS DATA FILE GNU pspp (Rust) {}", + env!("CARGO_PKG_VERSION") + )) + }, + layout_code: 2, + nominal_case_size: count_segments(&case_vars), + compression_code: match self.compression { + Some(Compression::Simple) => 1, + Some(Compression::ZLib) => 2, + None => 0, + }, + weight_index: if let Some(weight_index) = dictionary.weight { + count_segments(&case_vars[..weight_index]) + 1 + } else { + 0 + }, + n_cases: u32::MAX, + bias: 100.0, + creation_date: as_byte_array(now.format("%d %b %Y").to_string()), + creation_time: as_byte_array(now.format("%H:%M:%S").to_string()), + file_label: as_byte_array(dictionary.file_label.clone().unwrap_or_default()), + }; + header.write_le(&mut writer)?; + todo!() + } +} + +#[derive(Debug)] +struct StringSegment { + data_bytes: usize, + padding_bytes: usize, +} + +fn segment_widths(width: usize) -> impl Iterator { + let n_segments = width.div_ceil(252); + repeat_n(255, n_segments - 1) + .chain(if n_segments > 1 { + std::iter::once(width - (n_segments - 1) * 252) + } else { + 
std::iter::once(width) + }) + .map(|w| w.next_multiple_of(8)) +} + +enum CaseVar { + Numeric, + String { + width: usize, + encoding: SmallVec<[StringSegment; 1]>, + }, +} + +impl CaseVar { + fn new(width: VarWidth) -> Self { + match width { + VarWidth::Numeric => Self::Numeric, + VarWidth::String(width) => { + let width = width as usize; + let mut encoding = SmallVec::<[StringSegment; 1]>::new(); + let mut remaining = width; + for segment in segment_widths(width) { + let data_bytes = remaining.min(segment).min(255); + let padding_bytes = segment - data_bytes; + if data_bytes > 0 { + encoding.push(StringSegment { + data_bytes, + padding_bytes, + }); + remaining -= data_bytes; + } else { + encoding.last_mut().unwrap().padding_bytes += padding_bytes; + } + } + CaseVar::String { width, encoding } + } + } + } + + fn bytes(&self) -> usize { + match self { + CaseVar::Numeric => 8, + CaseVar::String { width: _, encoding } => encoding + .iter() + .map(|segment| segment.data_bytes + segment.padding_bytes) + .sum(), + } + } + + fn n_segments(&self) -> usize { + match self { + CaseVar::Numeric => 1, + CaseVar::String { encoding, .. } => encoding.len(), + } + } +} +/* +/// A variable in a system file. +struct WriteVar { + width: VarWidth, + segment_width: u8, + case_index: usize, + + /// Offset within string variable in case. + offset: usize, + + /// Number of padding bytes following data. + padding: usize, +} + +impl WriteVar { + fn new_vars(dictionary: &Dictionary) -> Vec { + let mut vars = Vec::new(); + for dv in &dictionary.variables { + + } + } +}*/ + +/// System file writer. +pub struct Writer { + inner: W, +} + +impl Writer where W: Write + Seek {} -- 2.30.2