From 323f2aee77012c9705508bda25549ed7f88b34fa Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 2 Aug 2025 11:48:36 -0700 Subject: [PATCH] write fixes --- rust/pspp/src/endian.rs | 4 +- rust/pspp/src/lex/mod.rs | 2 +- rust/pspp/src/sys/test.rs | 63 ++++++++++++++----- .../sys/testdata/write-string-simple.expected | 47 ++++++++++++++ .../write-string-uncompressed.expected | 47 ++++++++++++++ .../sys/testdata/write-string-zlib.expected | 47 ++++++++++++++ rust/pspp/src/sys/write.rs | 62 +++++++++++++----- 7 files changed, 240 insertions(+), 32 deletions(-) create mode 100644 rust/pspp/src/sys/testdata/write-string-simple.expected create mode 100644 rust/pspp/src/sys/testdata/write-string-uncompressed.expected create mode 100644 rust/pspp/src/sys/testdata/write-string-zlib.expected diff --git a/rust/pspp/src/endian.rs b/rust/pspp/src/endian.rs index 906b23ba61..ebb2694dc2 100644 --- a/rust/pspp/src/endian.rs +++ b/rust/pspp/src/endian.rs @@ -1,5 +1,3 @@ -//! Converting big- and little-endian `[u8]` arrays to and from primitive types. - // PSPP - a program for statistical analysis. // Copyright (C) 2025 Free Software Foundation, Inc. // @@ -16,6 +14,8 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . +//! Converting big- and little-endian `[u8]` arrays to and from primitive types. + use binrw::Endian; /// Converts a primitive type into a big- or little-endian `[u8]` array. diff --git a/rust/pspp/src/lex/mod.rs b/rust/pspp/src/lex/mod.rs index 2cf29eec92..f92407bb41 100644 --- a/rust/pspp/src/lex/mod.rs +++ b/rust/pspp/src/lex/mod.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . -//! PSPP lexical analysis. +//! Lexical analysis for PSPP syntax. //! //! PSPP divides traditional "lexical analysis" or "tokenization" into three //! phases: diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 5e1446d0dc..027ef35c11 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -25,6 +25,7 @@ use std::{ use binrw::Endian; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::UTF_8; +use hexplay::HexView; use crate::{ crypto::EncryptedFile, @@ -660,33 +661,65 @@ fn write_long_string_value_labels() { .add_values([Datum::new_utf8("0")]) .unwrap(); dictionary.add_var(s1).unwrap(); - /* + + let mut s2 = Variable::new(Identifier::new("s2").unwrap(), VarWidth::String(9), UTF_8); + s2.value_labels.insert( + Datum::String(ByteString::from("0 ")), + String::from("Fourth value label"), + ); + s2.value_labels.insert( + Datum::String(ByteString::from("01234567")), + String::from("Fifth value label"), + ); + s2.value_labels.insert( + Datum::String(ByteString::from("012345678")), + String::from("Sixth value label"), + ); + s2.missing_values_mut() + .add_values([Datum::String("12").cloned(), Datum::String("123").cloned()]) + .unwrap(); + dictionary.add_var(s2).unwrap(); + + let mut s3 = Variable::new(Identifier::new("s3").unwrap(), VarWidth::String(9), UTF_8); + s3.missing_values_mut() + .add_values([ + Datum::String("1234").cloned(), + Datum::String("12345").cloned(), + Datum::String("12345678").cloned(), + ]) + .unwrap(); + dictionary.add_var(s3).unwrap(); + + dictionary + .add_var(Variable::new( + Identifier::new("s4").unwrap(), + VarWidth::String(9), + UTF_8, + )) + .unwrap(); let mut cases = WriteOptions::reproducible(compression) .write_writer(&dictionary, Cursor::new(Vec::new())) .unwrap(); for case in [ - [1, 1, 1, 2], - [1, 1, 2, 30], - [1, 2, 1, 8], - [1, 2, 2, 20], - [2, 1, 1, 2], - [2, 1, 2, 22], - [2, 2, 1, 1], - [2, 2, 2, 3], + ["1", "1", "1", "2"], + ["1", "1", "2", "30"], + ["1", "2", "1", "8"], + ["1", "2", "2", "20"], + ["2", "1", "1", "2"], + ["2", "1", "2", "22"], + ["2", "2", "1", "1"], + ["2", "2", "2", "3"], ] { cases - .write_case( - case.into_iter() - .map(|number| Datum::<&str>::Number(Some(number as f64))), - ) + .write_case(case.into_iter().map(|s| Datum::String(s))) .unwrap(); } let sysfile = cases.finish().unwrap().unwrap().into_inner(); let expected_filename = PathBuf::from(&format!( - "src/sys/testdata/write-numeric-{compression_string}.expected" + "src/sys/testdata/write-string-{compression_string}.expected" )); let expected = String::from_utf8(std::fs::read(&expected_filename).unwrap()).unwrap(); - test_sysfile(Cursor::new(sysfile), &expected, &expected_filename);*/ + test_sysfile(Cursor::new(sysfile), &expected, &expected_filename); } } diff --git a/rust/pspp/src/sys/testdata/write-string-simple.expected b/rust/pspp/src/sys/testdata/write-string-simple.expected new file mode 100644 index 0000000000..3eeb7cb056 --- /dev/null +++ b/rust/pspp/src/sys/testdata/write-string-simple.expected @@ -0,0 +1,47 @@ +╭──────────────────────┬────────────────────╮ +│ Created │30-JUL-2025 15:07:55│ +├──────────────────────┼────────────────────┤ +│Writer Product │PSPP TEST DATA FILE │ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────┤ +│ Compression │SAV │ +│ Number of Cases│ 8│ +╰──────────────────────┴────────────────────╯ + +╭─────────┬─╮ +│Variables│4│ +╰─────────┴─╯ + +╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬───────────────────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│ Missing Values │ +├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼───────────────────────────┤ +│s1│ 1│ │Nominal │Input│ 9│Left │A9 │A9 │"0" │ +│s2│ 2│ │Nominal │Input│ 9│Left │A9 │A9 │"12"; "123" │ +│s3│ 3│ │Nominal │Input│ 9│Left │A9 │A9 │"1234"; "12345"; "12345678"│ +│s4│ 4│ │Nominal │Input│ 9│Left │A9 │A9 │ │ +╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴───────────────────────────╯ + +╭────────────────────────┬──────────────────╮ +│Variable Value │ │ +├────────────────────────┼──────────────────┤ +│s1 abc │First value label │ +│ abcdefgh │Second value label│ +│ abcdefghi│Third value label │ +├────────────────────────┼──────────────────┤ +│s2 0 │Fourth value label│ +│ 01234567 │Fifth value label │ +│ 012345678│Sixth value label │ +╰────────────────────────┴──────────────────╯ + +╭────┬─────────┬─────────┬─────────┬─────────╮ +│Case│ s1 │ s2 │ s3 │ s4 │ +├────┼─────────┼─────────┼─────────┼─────────┤ +│1 │1 │1 │1 │2 │ +│2 │1 │1 │2 │30 │ +│3 │1 │2 │1 │8 │ +│4 │1 │2 │2 │20 │ +│5 │2 │1 │1 │2 │ +│6 │2 │1 │2 │22 │ +│7 │2 │2 │1 │1 │ +│8 │2 │2 │2 │3 │ +╰────┴─────────┴─────────┴─────────┴─────────╯ diff --git a/rust/pspp/src/sys/testdata/write-string-uncompressed.expected b/rust/pspp/src/sys/testdata/write-string-uncompressed.expected new file mode 100644 index 0000000000..9c773dc975 --- /dev/null +++ b/rust/pspp/src/sys/testdata/write-string-uncompressed.expected @@ -0,0 +1,47 @@ +╭──────────────────────┬────────────────────╮ +│ Created │30-JUL-2025 15:07:55│ +├──────────────────────┼────────────────────┤ +│Writer Product │PSPP TEST DATA FILE │ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────┤ +│ Compression │None │ +│ Number of Cases│ 8│ +╰──────────────────────┴────────────────────╯ + +╭─────────┬─╮ +│Variables│4│ +╰─────────┴─╯ + +╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬───────────────────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│ Missing Values │ +├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼───────────────────────────┤ +│s1│ 1│ │Nominal │Input│ 9│Left │A9 │A9 │"0" │ +│s2│ 2│ │Nominal │Input│ 9│Left │A9 │A9 │"12"; "123" │ +│s3│ 3│ │Nominal │Input│ 9│Left │A9 │A9 │"1234"; "12345"; "12345678"│ +│s4│ 4│ │Nominal │Input│ 9│Left │A9 │A9 │ │ +╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴───────────────────────────╯ + +╭────────────────────────┬──────────────────╮ +│Variable Value │ │ +├────────────────────────┼──────────────────┤ +│s1 abc │First value label │ +│ abcdefgh │Second value label│ +│ abcdefghi│Third value label │ +├────────────────────────┼──────────────────┤ +│s2 0 │Fourth value label│ +│ 01234567 │Fifth value label │ +│ 012345678│Sixth value label │ +╰────────────────────────┴──────────────────╯ + +╭────┬─────────┬─────────┬─────────┬─────────╮ +│Case│ s1 │ s2 │ s3 │ s4 │ +├────┼─────────┼─────────┼─────────┼─────────┤ +│1 │1 │1 │1 │2 │ +│2 │1 │1 │2 │30 │ +│3 │1 │2 │1 │8 │ +│4 │1 │2 │2 │20 │ +│5 │2 │1 │1 │2 │ +│6 │2 │1 │2 │22 │ +│7 │2 │2 │1 │1 │ +│8 │2 │2 │2 │3 │ +╰────┴─────────┴─────────┴─────────┴─────────╯ diff --git a/rust/pspp/src/sys/testdata/write-string-zlib.expected b/rust/pspp/src/sys/testdata/write-string-zlib.expected new file mode 100644 index 0000000000..6812e1aa48 --- /dev/null +++ b/rust/pspp/src/sys/testdata/write-string-zlib.expected @@ -0,0 +1,47 @@ +╭──────────────────────┬────────────────────╮ +│ Created │30-JUL-2025 15:07:55│ +├──────────────────────┼────────────────────┤ +│Writer Product │PSPP TEST DATA FILE │ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────┤ +│ Compression │ZSAV │ +│ Number of Cases│ 8│ +╰──────────────────────┴────────────────────╯ + +╭─────────┬─╮ +│Variables│4│ +╰─────────┴─╯ + +╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬───────────────────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│ Missing Values │ +├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼───────────────────────────┤ +│s1│ 1│ │Nominal │Input│ 9│Left │A9 │A9 │"0" │ +│s2│ 2│ │Nominal │Input│ 9│Left │A9 │A9 │"12"; "123" │ +│s3│ 3│ │Nominal │Input│ 9│Left │A9 │A9 │"1234"; "12345"; "12345678"│ +│s4│ 4│ │Nominal │Input│ 9│Left │A9 │A9 │ │ +╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴───────────────────────────╯ + +╭────────────────────────┬──────────────────╮ +│Variable Value │ │ +├────────────────────────┼──────────────────┤ +│s1 abc │First value label │ +│ abcdefgh │Second value label│ +│ abcdefghi│Third value label │ +├────────────────────────┼──────────────────┤ +│s2 0 │Fourth value label│ +│ 01234567 │Fifth value label │ +│ 012345678│Sixth value label │ +╰────────────────────────┴──────────────────╯ + +╭────┬─────────┬─────────┬─────────┬─────────╮ +│Case│ s1 │ s2 │ s3 │ s4 │ +├────┼─────────┼─────────┼─────────┼─────────┤ +│1 │1 │1 │1 │2 │ +│2 │1 │1 │2 │30 │ +│3 │1 │2 │1 │8 │ +│4 │1 │2 │2 │20 │ +│5 │2 │1 │1 │2 │ +│6 │2 │1 │2 │22 │ +│7 │2 │2 │1 │1 │ +│8 │2 │2 │2 │3 │ +╰────┴─────────┴─────────┴─────────┴─────────╯ diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index d7d7b84dfe..b595095110 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -161,10 +161,6 @@ struct DictionaryWriter<'a, W> { dictionary: &'a Dictionary, } -fn count_segments(case_vars: &[CaseVar]) -> u32 { - case_vars.iter().map(CaseVar::n_segments).sum::() as u32 -} - fn put_attributes(attributes: &Attributes, s: &mut String) { for (name, values) in attributes.iter(true) { write!(s, "{name}(").unwrap(); @@ -229,6 +225,13 @@ where bytes.try_into().unwrap() } + fn count_variable_positions(case_vars: &[CaseVar]) -> u32 { + case_vars + .iter() + .map(CaseVar::n_variable_positions) + .sum::() as u32 + } + let header = RawHeader { magic: if self.options.compression == Some(Compression::ZLib) { Magic::Zsav @@ -241,14 +244,14 @@ where self.dictionary.encoding(), ), layout_code: 2, - nominal_case_size: count_segments(&self.case_vars), + nominal_case_size: count_variable_positions(&self.case_vars), compression_code: match self.options.compression { Some(Compression::Simple) => 1, Some(Compression::ZLib) => 2, None => 0, }, weight_index: if let Some(weight_index) = self.dictionary.weight_index() { - count_segments(&self.case_vars[..weight_index]) + 1 + count_variable_positions(&self.case_vars[..weight_index]) + 1 } else { 0 }, @@ -275,7 +278,7 @@ where let record = RawVariableRecord { width: seg0_width.as_string_width().unwrap_or(0) as i32, has_variable_label: variable.label.is_some() as u32, - missing_value_code: if variable.width.is_long_string() { + missing_value_code: if !variable.width.is_long_string() { let n = variable.missing_values().values().len() as i32; match variable.missing_values().range() { Some(_) => -(n + 2), @@ -516,7 +519,17 @@ where } fn write_variable_display_parameters(&mut self) -> Result<(), BinError> { - (7u32, 11u32, 4u32, count_segments(&self.case_vars) * 3).write_le(self.writer)?; + ( + 7u32, + 11u32, + 4u32, + self.case_vars + .iter() + .map(CaseVar::n_segments) + .sum::() as u32 + * 3, + ) + .write_le(self.writer)?; for variable in &self.dictionary.variables { let measure = match variable.measure { None => 0, @@ -608,7 +621,7 @@ where ( name.len() as u32, &name[..], - variable.missing_values().values().len() as u32, + variable.missing_values().values().len() as u8, 8u32, ) .write_le(&mut cursor)?; @@ -811,6 +824,15 @@ impl CaseVar { CaseVar::String(encoding) => encoding.len(), } } + fn n_variable_positions(&self) -> usize { + match self { + CaseVar::Numeric => 1, + CaseVar::String(encoding) => encoding + .iter() + .map(|segment| (segment.data_bytes + segment.padding_bytes) / 8) + .sum(), + } + } } /// System file writer. @@ -885,9 +907,17 @@ where CaseVar::String(encoding) => { let mut s = datum.as_string().unwrap().raw_string_bytes(); for segment in encoding { + let spaces = segment.data_bytes.saturating_sub(s.len()); + let data_bytes = segment.data_bytes - spaces; + let data; - (data, s) = s.split_at(segment.data_bytes); - (data, Pad::new(segment.padding_bytes, 0)).write_le(&mut self.inner)?; + (data, s) = s.split_at(data_bytes); + ( + data, + Pad::new(spaces, b' '), + Pad::new(segment.padding_bytes, 0), + ) + .write_le(&mut self.inner)?; } } } @@ -921,8 +951,12 @@ where CaseVar::String(encoding) => { let mut s = datum.as_string().unwrap().raw_string_bytes(); for segment in encoding { + let excess = segment.data_bytes.saturating_sub(s.len()); + let data_bytes = segment.data_bytes - excess; + let padding_bytes = segment.padding_bytes + excess; + let data; - (data, s) = s.split_at(segment.data_bytes); + (data, s) = s.split_at(data_bytes); let (chunks, remainder) = data.as_chunks::<8>(); for chunk in chunks { @@ -939,10 +973,10 @@ where } else { self.put_opcode(253)?; self.data.extend_from_slice(remainder); - self.data.extend(repeat_n(0, 8 - remainder.len())); + self.data.extend(repeat_n(b' ', 8 - remainder.len())); } } - for _ in 0..segment.padding_bytes / 8 { + for _ in 0..padding_bytes / 8 { self.put_opcode(254)?; } } -- 2.30.2