write fixes
authorBen Pfaff <blp@cs.stanford.edu>
Sat, 2 Aug 2025 18:48:36 +0000 (11:48 -0700)
committerBen Pfaff <blp@cs.stanford.edu>
Sat, 2 Aug 2025 18:48:36 +0000 (11:48 -0700)
rust/pspp/src/endian.rs
rust/pspp/src/lex/mod.rs
rust/pspp/src/sys/test.rs
rust/pspp/src/sys/testdata/write-string-simple.expected [new file with mode: 0644]
rust/pspp/src/sys/testdata/write-string-uncompressed.expected [new file with mode: 0644]
rust/pspp/src/sys/testdata/write-string-zlib.expected [new file with mode: 0644]
rust/pspp/src/sys/write.rs

index 906b23ba611ca5608760d7458d113e1c693b7718..ebb2694dc297fd668361405afd6b3319e2879c78 100644 (file)
@@ -1,5 +1,3 @@
-//! Converting big- and little-endian `[u8]` arrays to and from primitive types.
-
 // PSPP - a program for statistical analysis.
 // Copyright (C) 2025 Free Software Foundation, Inc.
 //
@@ -16,6 +14,8 @@
 // You should have received a copy of the GNU General Public License along with
 // this program.  If not, see <http://www.gnu.org/licenses/>.
 
+//! Converting big- and little-endian `[u8]` arrays to and from primitive types.
+
 use binrw::Endian;
 
 /// Converts a primitive type into a big- or little-endian `[u8]` array.
index 2cf29eec920bc6dced22a55df8c507ee98370950..f92407bb418b57694c06d1e846955533619c8e12 100644 (file)
@@ -14,7 +14,7 @@
 // You should have received a copy of the GNU General Public License along with
 // this program.  If not, see <http://www.gnu.org/licenses/>.
 
-//! PSPP lexical analysis.
+//! Lexical analysis for PSPP syntax.
 //!
 //! PSPP divides traditional "lexical analysis" or "tokenization" into three
 //! phases:
index 5e1446d0dc7ba940a083549e3ff2d02f2f85ea66..027ef35c11b19b3996f68286826f1f17ca529072 100644 (file)
@@ -25,6 +25,7 @@ use std::{
 use binrw::Endian;
 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
 use encoding_rs::UTF_8;
+use hexplay::HexView;
 
 use crate::{
     crypto::EncryptedFile,
@@ -660,33 +661,65 @@ fn write_long_string_value_labels() {
             .add_values([Datum::new_utf8("0")])
             .unwrap();
         dictionary.add_var(s1).unwrap();
-        /*
+
+        let mut s2 = Variable::new(Identifier::new("s2").unwrap(), VarWidth::String(9), UTF_8);
+        s2.value_labels.insert(
+            Datum::String(ByteString::from("0        ")),
+            String::from("Fourth value label"),
+        );
+        s2.value_labels.insert(
+            Datum::String(ByteString::from("01234567")),
+            String::from("Fifth value label"),
+        );
+        s2.value_labels.insert(
+            Datum::String(ByteString::from("012345678")),
+            String::from("Sixth value label"),
+        );
+        s2.missing_values_mut()
+            .add_values([Datum::String("12").cloned(), Datum::String("123").cloned()])
+            .unwrap();
+        dictionary.add_var(s2).unwrap();
+
+        let mut s3 = Variable::new(Identifier::new("s3").unwrap(), VarWidth::String(9), UTF_8);
+        s3.missing_values_mut()
+            .add_values([
+                Datum::String("1234").cloned(),
+                Datum::String("12345").cloned(),
+                Datum::String("12345678").cloned(),
+            ])
+            .unwrap();
+        dictionary.add_var(s3).unwrap();
+
+        dictionary
+            .add_var(Variable::new(
+                Identifier::new("s4").unwrap(),
+                VarWidth::String(9),
+                UTF_8,
+            ))
+            .unwrap();
         let mut cases = WriteOptions::reproducible(compression)
             .write_writer(&dictionary, Cursor::new(Vec::new()))
             .unwrap();
         for case in [
-            [1, 1, 1, 2],
-            [1, 1, 2, 30],
-            [1, 2, 1, 8],
-            [1, 2, 2, 20],
-            [2, 1, 1, 2],
-            [2, 1, 2, 22],
-            [2, 2, 1, 1],
-            [2, 2, 2, 3],
+            ["1", "1", "1", "2"],
+            ["1", "1", "2", "30"],
+            ["1", "2", "1", "8"],
+            ["1", "2", "2", "20"],
+            ["2", "1", "1", "2"],
+            ["2", "1", "2", "22"],
+            ["2", "2", "1", "1"],
+            ["2", "2", "2", "3"],
         ] {
             cases
-                .write_case(
-                    case.into_iter()
-                        .map(|number| Datum::<&str>::Number(Some(number as f64))),
-                )
+                .write_case(case.into_iter().map(|s| Datum::String(s)))
                 .unwrap();
         }
         let sysfile = cases.finish().unwrap().unwrap().into_inner();
         let expected_filename = PathBuf::from(&format!(
-            "src/sys/testdata/write-numeric-{compression_string}.expected"
+            "src/sys/testdata/write-string-{compression_string}.expected"
         ));
         let expected = String::from_utf8(std::fs::read(&expected_filename).unwrap()).unwrap();
-        test_sysfile(Cursor::new(sysfile), &expected, &expected_filename);*/
+        test_sysfile(Cursor::new(sysfile), &expected, &expected_filename);
     }
 }
 
diff --git a/rust/pspp/src/sys/testdata/write-string-simple.expected b/rust/pspp/src/sys/testdata/write-string-simple.expected
new file mode 100644 (file)
index 0000000..3eeb7cb
--- /dev/null
@@ -0,0 +1,47 @@
+╭──────────────────────┬────────────────────╮
+│       Created        │30-JUL-2025 15:07:55│
+├──────────────────────┼────────────────────┤
+│Writer Product        │PSPP TEST DATA FILE │
+│       Version        │1.2.3               │
+├──────────────────────┼────────────────────┤
+│       Compression    │SAV                 │
+│       Number of Cases│                   8│
+╰──────────────────────┴────────────────────╯
+
+╭─────────┬─╮
+│Variables│4│
+╰─────────┴─╯
+
+╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬───────────────────────────╮
+│  │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│       Missing Values      │
+├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼───────────────────────────┤
+│s1│       1│     │Nominal          │Input│    9│Left     │A9          │A9          │"0"                        │
+│s2│       2│     │Nominal          │Input│    9│Left     │A9          │A9          │"12"; "123"                │
+│s3│       3│     │Nominal          │Input│    9│Left     │A9          │A9          │"1234"; "12345"; "12345678"│
+│s4│       4│     │Nominal          │Input│    9│Left     │A9          │A9          │                           │
+╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴───────────────────────────╯
+
+╭────────────────────────┬──────────────────╮
+│Variable Value          │                  │
+├────────────────────────┼──────────────────┤
+│s1             abc      │First value label │
+│               abcdefgh │Second value label│
+│               abcdefghi│Third value label │
+├────────────────────────┼──────────────────┤
+│s2             0        │Fourth value label│
+│               01234567 │Fifth value label │
+│               012345678│Sixth value label │
+╰────────────────────────┴──────────────────╯
+
+╭────┬─────────┬─────────┬─────────┬─────────╮
+│Case│    s1   │    s2   │    s3   │    s4   │
+├────┼─────────┼─────────┼─────────┼─────────┤
+│1   │1        │1        │1        │2        │
+│2   │1        │1        │2        │30       │
+│3   │1        │2        │1        │8        │
+│4   │1        │2        │2        │20       │
+│5   │2        │1        │1        │2        │
+│6   │2        │1        │2        │22       │
+│7   │2        │2        │1        │1        │
+│8   │2        │2        │2        │3        │
+╰────┴─────────┴─────────┴─────────┴─────────╯
diff --git a/rust/pspp/src/sys/testdata/write-string-uncompressed.expected b/rust/pspp/src/sys/testdata/write-string-uncompressed.expected
new file mode 100644 (file)
index 0000000..9c773dc
--- /dev/null
@@ -0,0 +1,47 @@
+╭──────────────────────┬────────────────────╮
+│       Created        │30-JUL-2025 15:07:55│
+├──────────────────────┼────────────────────┤
+│Writer Product        │PSPP TEST DATA FILE │
+│       Version        │1.2.3               │
+├──────────────────────┼────────────────────┤
+│       Compression    │None                │
+│       Number of Cases│                   8│
+╰──────────────────────┴────────────────────╯
+
+╭─────────┬─╮
+│Variables│4│
+╰─────────┴─╯
+
+╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬───────────────────────────╮
+│  │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│       Missing Values      │
+├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼───────────────────────────┤
+│s1│       1│     │Nominal          │Input│    9│Left     │A9          │A9          │"0"                        │
+│s2│       2│     │Nominal          │Input│    9│Left     │A9          │A9          │"12"; "123"                │
+│s3│       3│     │Nominal          │Input│    9│Left     │A9          │A9          │"1234"; "12345"; "12345678"│
+│s4│       4│     │Nominal          │Input│    9│Left     │A9          │A9          │                           │
+╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴───────────────────────────╯
+
+╭────────────────────────┬──────────────────╮
+│Variable Value          │                  │
+├────────────────────────┼──────────────────┤
+│s1             abc      │First value label │
+│               abcdefgh │Second value label│
+│               abcdefghi│Third value label │
+├────────────────────────┼──────────────────┤
+│s2             0        │Fourth value label│
+│               01234567 │Fifth value label │
+│               012345678│Sixth value label │
+╰────────────────────────┴──────────────────╯
+
+╭────┬─────────┬─────────┬─────────┬─────────╮
+│Case│    s1   │    s2   │    s3   │    s4   │
+├────┼─────────┼─────────┼─────────┼─────────┤
+│1   │1        │1        │1        │2        │
+│2   │1        │1        │2        │30       │
+│3   │1        │2        │1        │8        │
+│4   │1        │2        │2        │20       │
+│5   │2        │1        │1        │2        │
+│6   │2        │1        │2        │22       │
+│7   │2        │2        │1        │1        │
+│8   │2        │2        │2        │3        │
+╰────┴─────────┴─────────┴─────────┴─────────╯
diff --git a/rust/pspp/src/sys/testdata/write-string-zlib.expected b/rust/pspp/src/sys/testdata/write-string-zlib.expected
new file mode 100644 (file)
index 0000000..6812e1a
--- /dev/null
@@ -0,0 +1,47 @@
+╭──────────────────────┬────────────────────╮
+│       Created        │30-JUL-2025 15:07:55│
+├──────────────────────┼────────────────────┤
+│Writer Product        │PSPP TEST DATA FILE │
+│       Version        │1.2.3               │
+├──────────────────────┼────────────────────┤
+│       Compression    │ZSAV                │
+│       Number of Cases│                   8│
+╰──────────────────────┴────────────────────╯
+
+╭─────────┬─╮
+│Variables│4│
+╰─────────┴─╯
+
+╭──┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬───────────────────────────╮
+│  │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│       Missing Values      │
+├──┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼───────────────────────────┤
+│s1│       1│     │Nominal          │Input│    9│Left     │A9          │A9          │"0"                        │
+│s2│       2│     │Nominal          │Input│    9│Left     │A9          │A9          │"12"; "123"                │
+│s3│       3│     │Nominal          │Input│    9│Left     │A9          │A9          │"1234"; "12345"; "12345678"│
+│s4│       4│     │Nominal          │Input│    9│Left     │A9          │A9          │                           │
+╰──┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴───────────────────────────╯
+
+╭────────────────────────┬──────────────────╮
+│Variable Value          │                  │
+├────────────────────────┼──────────────────┤
+│s1             abc      │First value label │
+│               abcdefgh │Second value label│
+│               abcdefghi│Third value label │
+├────────────────────────┼──────────────────┤
+│s2             0        │Fourth value label│
+│               01234567 │Fifth value label │
+│               012345678│Sixth value label │
+╰────────────────────────┴──────────────────╯
+
+╭────┬─────────┬─────────┬─────────┬─────────╮
+│Case│    s1   │    s2   │    s3   │    s4   │
+├────┼─────────┼─────────┼─────────┼─────────┤
+│1   │1        │1        │1        │2        │
+│2   │1        │1        │2        │30       │
+│3   │1        │2        │1        │8        │
+│4   │1        │2        │2        │20       │
+│5   │2        │1        │1        │2        │
+│6   │2        │1        │2        │22       │
+│7   │2        │2        │1        │1        │
+│8   │2        │2        │2        │3        │
+╰────┴─────────┴─────────┴─────────┴─────────╯
index d7d7b84dfe92bc565e3b4d0299223252af1555bd..b5950951109afff514bfdb0976b0c40e32d9bef6 100644 (file)
@@ -161,10 +161,6 @@ struct DictionaryWriter<'a, W> {
     dictionary: &'a Dictionary,
 }
 
-fn count_segments(case_vars: &[CaseVar]) -> u32 {
-    case_vars.iter().map(CaseVar::n_segments).sum::<usize>() as u32
-}
-
 fn put_attributes(attributes: &Attributes, s: &mut String) {
     for (name, values) in attributes.iter(true) {
         write!(s, "{name}(").unwrap();
@@ -229,6 +225,13 @@ where
             bytes.try_into().unwrap()
         }
 
+        fn count_variable_positions(case_vars: &[CaseVar]) -> u32 {
+            case_vars
+                .iter()
+                .map(CaseVar::n_variable_positions)
+                .sum::<usize>() as u32
+        }
+
         let header = RawHeader {
             magic: if self.options.compression == Some(Compression::ZLib) {
                 Magic::Zsav
@@ -241,14 +244,14 @@ where
                 self.dictionary.encoding(),
             ),
             layout_code: 2,
-            nominal_case_size: count_segments(&self.case_vars),
+            nominal_case_size: count_variable_positions(&self.case_vars),
             compression_code: match self.options.compression {
                 Some(Compression::Simple) => 1,
                 Some(Compression::ZLib) => 2,
                 None => 0,
             },
             weight_index: if let Some(weight_index) = self.dictionary.weight_index() {
-                count_segments(&self.case_vars[..weight_index]) + 1
+                count_variable_positions(&self.case_vars[..weight_index]) + 1
             } else {
                 0
             },
@@ -275,7 +278,7 @@ where
             let record = RawVariableRecord {
                 width: seg0_width.as_string_width().unwrap_or(0) as i32,
                 has_variable_label: variable.label.is_some() as u32,
-                missing_value_code: if variable.width.is_long_string() {
+                missing_value_code: if !variable.width.is_long_string() {
                     let n = variable.missing_values().values().len() as i32;
                     match variable.missing_values().range() {
                         Some(_) => -(n + 2),
@@ -516,7 +519,17 @@ where
     }
 
     fn write_variable_display_parameters(&mut self) -> Result<(), BinError> {
-        (7u32, 11u32, 4u32, count_segments(&self.case_vars) * 3).write_le(self.writer)?;
+        (
+            7u32,
+            11u32,
+            4u32,
+            self.case_vars
+                .iter()
+                .map(CaseVar::n_segments)
+                .sum::<usize>() as u32
+                * 3,
+        )
+            .write_le(self.writer)?;
         for variable in &self.dictionary.variables {
             let measure = match variable.measure {
                 None => 0,
@@ -608,7 +621,7 @@ where
             (
                 name.len() as u32,
                 &name[..],
-                variable.missing_values().values().len() as u32,
+                variable.missing_values().values().len() as u8,
                 8u32,
             )
                 .write_le(&mut cursor)?;
@@ -811,6 +824,15 @@ impl CaseVar {
             CaseVar::String(encoding) => encoding.len(),
         }
     }
+    fn n_variable_positions(&self) -> usize {
+        match self {
+            CaseVar::Numeric => 1,
+            CaseVar::String(encoding) => encoding
+                .iter()
+                .map(|segment| (segment.data_bytes + segment.padding_bytes) / 8)
+                .sum(),
+        }
+    }
 }
 
 /// System file writer.
@@ -885,9 +907,17 @@ where
                 CaseVar::String(encoding) => {
                     let mut s = datum.as_string().unwrap().raw_string_bytes();
                     for segment in encoding {
+                        let spaces = segment.data_bytes.saturating_sub(s.len());
+                        let data_bytes = segment.data_bytes - spaces;
+
                         let data;
-                        (data, s) = s.split_at(segment.data_bytes);
-                        (data, Pad::new(segment.padding_bytes, 0)).write_le(&mut self.inner)?;
+                        (data, s) = s.split_at(data_bytes);
+                        (
+                            data,
+                            Pad::new(spaces, b' '),
+                            Pad::new(segment.padding_bytes, 0),
+                        )
+                            .write_le(&mut self.inner)?;
                     }
                 }
             }
@@ -921,8 +951,12 @@ where
                 CaseVar::String(encoding) => {
                     let mut s = datum.as_string().unwrap().raw_string_bytes();
                     for segment in encoding {
+                        let excess = segment.data_bytes.saturating_sub(s.len());
+                        let data_bytes = segment.data_bytes - excess;
+                        let padding_bytes = segment.padding_bytes + excess;
+
                         let data;
-                        (data, s) = s.split_at(segment.data_bytes);
+                        (data, s) = s.split_at(data_bytes);
 
                         let (chunks, remainder) = data.as_chunks::<8>();
                         for chunk in chunks {
@@ -939,10 +973,10 @@ where
                             } else {
                                 self.put_opcode(253)?;
                                 self.data.extend_from_slice(remainder);
-                                self.data.extend(repeat_n(0, 8 - remainder.len()));
+                                self.data.extend(repeat_n(b' ', 8 - remainder.len()));
                             }
                         }
-                        for _ in 0..segment.padding_bytes / 8 {
+                        for _ in 0..padding_bytes / 8 {
                             self.put_opcode(254)?;
                         }
                     }