work on making data work
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 6 Jul 2025 17:07:10 +0000 (10:07 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 6 Jul 2025 17:07:10 +0000 (10:07 -0700)
rust/pspp/src/output/pivot/mod.rs
rust/pspp/src/sys/cooked.rs
rust/pspp/src/sys/raw.rs
rust/pspp/src/sys/test.rs
rust/pspp/src/sys/testdata/documents.expected
rust/pspp/src/sys/testdata/empty_document_record.expected
rust/pspp/src/sys/testdata/multiple_documents_records.expected
rust/pspp/src/sys/testdata/multiple_documents_records.sack
rust/pspp/src/sys/testdata/very_long_strings.expected

index 2fb4dfa773582df52e25ae23e17f1ae64577b967..811b1ca7695309b55a4e7b77bef6ccb49b01a5bf 100644 (file)
@@ -423,6 +423,14 @@ impl Group {
         self
     }
 
+    pub fn with_multiple<C>(mut self, children: impl IntoIterator<Item = C>) -> Self
+    where
+        C: Into<Category>,
+    {
+        self.extend(children);
+        self
+    }
+
     pub fn with_label_shown(self) -> Self {
         self.with_show_label(true)
     }
@@ -468,6 +476,19 @@ impl Group {
     }
 }
 
+impl<C> Extend<C> for Group
+where
+    C: Into<Category>,
+{
+    fn extend<T: IntoIterator<Item = C>>(&mut self, children: T) {
+        let children = children.into_iter();
+        self.children.reserve(children.size_hint().0);
+        for child in children {
+            self.push(child);
+        }
+    }
+}
+
 #[derive(Clone, Debug, Default)]
 pub struct Footnotes(pub Vec<Arc<Footnote>>);
 
@@ -592,6 +613,12 @@ impl From<Value> for Category {
     }
 }
 
+impl From<&Variable> for Category {
+    fn from(variable: &Variable) -> Self {
+        Value::new_variable(variable).into()
+    }
+}
+
 impl From<&str> for Category {
     fn from(name: &str) -> Self {
         Self::Leaf(Leaf::new(Value::new_text(name)))
index a956dd8605e3e55b86c147c7fece64e5e84328ab..43ad3c5f599209c185dfe9ebb0b8feb31c09addd 100644 (file)
@@ -578,7 +578,7 @@ impl Decoder {
 
 pub fn decode(
     mut headers: Headers,
-    cases: Option<Cases>,
+    mut cases: Option<Cases>,
     encoding: &'static Encoding,
     mut warn: impl FnMut(Error),
 ) -> Result<(Dictionary, Metadata, Option<Cases>), Error> {
@@ -870,55 +870,60 @@ pub fn decode(
         }
     }
 
-    'outer: for record in headers
-        .very_long_strings
-        .drain(..)
-        .flat_map(|record| record.0.into_iter())
-    {
-        let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else {
-            warn(dbg!(Error::TBD));
-            continue;
-        };
-        let width = VarWidth::String(record.length);
-        let n_segments = width.n_segments();
-        if n_segments == 1 {
-            warn(dbg!(Error::ShortVeryLongString {
-                short_name: record.short_name.clone(),
-                width: record.length
-            }));
-            continue;
-        }
-        if index + n_segments > dictionary.variables.len() {
-            warn(dbg!(Error::VeryLongStringOverflow {
-                short_name: record.short_name.clone(),
-                width: record.length,
-                index,
-                n_segments,
-                len: dictionary.variables.len()
-            }));
-            continue;
-        }
-        let mut short_names = Vec::with_capacity(n_segments);
-        for i in 0..n_segments {
-            let alloc_width = width.segment_alloc_width(i);
-            let segment = &dictionary.variables[index + i];
-            short_names.push(segment.short_names[0].clone());
-            let segment_width = segment.width.as_string_width().unwrap_or(0);
-            if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) {
-                warn(Error::VeryLongStringInvalidSegmentWidth {
+    if !headers.very_long_strings.is_empty() {
+        'outer: for record in headers
+            .very_long_strings
+            .drain(..)
+            .flat_map(|record| record.0.into_iter())
+        {
+            let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else {
+                warn(dbg!(Error::TBD));
+                continue;
+            };
+            let width = VarWidth::String(record.length);
+            let n_segments = width.n_segments();
+            if n_segments == 1 {
+                warn(dbg!(Error::ShortVeryLongString {
+                    short_name: record.short_name.clone(),
+                    width: record.length
+                }));
+                continue;
+            }
+            if index + n_segments > dictionary.variables.len() {
+                warn(dbg!(Error::VeryLongStringOverflow {
                     short_name: record.short_name.clone(),
                     width: record.length,
-                    index: i,
-                    actual: segment_width,
-                    expected: alloc_width,
-                });
-                continue 'outer;
+                    index,
+                    n_segments,
+                    len: dictionary.variables.len()
+                }));
+                continue;
             }
+            let mut short_names = Vec::with_capacity(n_segments);
+            for i in 0..n_segments {
+                let alloc_width = width.segment_alloc_width(i);
+                let segment = &dictionary.variables[index + i];
+                short_names.push(segment.short_names[0].clone());
+                let segment_width = segment.width.as_string_width().unwrap_or(0);
+                if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) {
+                    warn(Error::VeryLongStringInvalidSegmentWidth {
+                        short_name: record.short_name.clone(),
+                        width: record.length,
+                        index: i,
+                        actual: segment_width,
+                        expected: alloc_width,
+                    });
+                    continue 'outer;
+                }
+            }
+            dictionary.delete_vars(index + 1..index + n_segments);
+            let variable = dictionary.variables.get_index_mut2(index).unwrap();
+            variable.short_names = short_names;
+            variable.resize(width);
         }
-        dictionary.delete_vars(index + 1..index + n_segments);
-        let variable = dictionary.variables.get_index_mut2(index).unwrap();
-        variable.short_names = short_names;
-        variable.resize(width);
+        cases = cases
+            .take()
+            .map(|cases| cases.with_widths(dictionary.variables.iter().map(|var| var.width)));
     }
 
     if headers.long_names.is_empty() {
index 46c201a86520592176b399a633016b9bac8d2d20..1317c2d8359c45e332c5a784f8721a0f198d8ee1 100644 (file)
@@ -1165,7 +1165,7 @@ impl CaseVar {
     fn bytes(&self) -> usize {
         match self {
             CaseVar::Numeric => 8,
-            CaseVar::String { width, encoding } => encoding
+            CaseVar::String { width: _, encoding } => encoding
                 .iter()
                 .map(|segment| segment.data_bytes + segment.padding_bytes)
                 .sum(),
@@ -1221,12 +1221,21 @@ impl Cases {
             } else {
                 Box::new(reader)
             },
+            eof: case_vars.is_empty(),
             case_vars,
             compression: header.compression,
             bias: header.bias,
             endian: header.endian,
             codes: VecDeque::with_capacity(8),
-            eof: false,
+        }
+    }
+
+    pub fn with_widths(self, widths: impl IntoIterator<Item = VarWidth>) -> Self {
+        let case_vars = widths.into_iter().map(CaseVar::new).collect::<Vec<_>>();
+        Self {
+            eof: self.eof || case_vars.is_empty(),
+            case_vars,
+            ..self
         }
     }
 }
index f3a5ade4c9520cbeb1e253f0be1e4d5b891ff4c5..aa2af100121d336446348c9b342a8d3f7888402b 100644 (file)
@@ -3,7 +3,7 @@ use std::{io::Cursor, path::Path, sync::Arc};
 use crate::{
     endian::Endian,
     output::{
-        pivot::{test::assert_lines_eq, Axis3, Dimension, PivotTable},
+        pivot::{test::assert_lines_eq, Axis3, Dimension, Group, PivotTable, Value},
         Details, Item, Text,
     },
     sys::{
@@ -533,7 +533,7 @@ fn test_sysfile(name: &str) {
         let mut reader = Reader::new(cursor, |warning| warnings.push(warning)).unwrap();
         let output = match reader.headers().collect() {
             Ok(headers) => {
-                drop(reader);
+                let cases = reader.cases();
                 let encoding =
                     encoding_from_headers(&headers, &mut |warning| warnings.push(warning)).unwrap();
                 let mut decoder = Decoder::new(encoding, |warning| warnings.push(warning));
@@ -545,8 +545,8 @@ fn test_sysfile(name: &str) {
 
                 let mut errors = Vec::new();
                 let headers = Headers::new(decoded_records, &mut |e| errors.push(e)).unwrap();
-                let (dictionary, metadata, _cases) =
-                    decode(headers, None, encoding, |e| errors.push(e)).unwrap();
+                let (dictionary, metadata, cases) =
+                    decode(headers, cases, encoding, |e| errors.push(e)).unwrap();
                 let (group, data) = metadata.to_pivot_rows();
                 let metadata_table = PivotTable::new([(Axis3::Y, Dimension::new(group))])
                     .with_data(
@@ -591,6 +591,41 @@ fn test_sysfile(name: &str) {
                 if let Some(pt) = dictionary.output_variable_sets().to_pivot_table() {
                     output.push(Arc::new(pt.into()));
                 }
+                if let Some(cases) = cases {
+                    let variables = Group::new("Variable")
+                        .with_multiple(dictionary.variables.iter().map(|var| &**var));
+                    let mut case_numbers = Group::new("Case").with_label_shown();
+                    let mut data = Vec::new();
+                    for (case_number, case) in cases.enumerate() {
+                        match case {
+                            Ok(case) => {
+                                case_numbers.push(Value::new_integer(Some(
+                                    (case_numbers.len() + 1) as f64,
+                                )));
+                                data.push(
+                                    case.into_iter()
+                                        .map(|datum| Value::new_datum(&datum, dictionary.encoding))
+                                        .collect::<Vec<_>>(),
+                                );
+                            }
+                            Err(error) => {
+                                output.push(Arc::new(Item::from(Text::new_log(error.to_string()))));
+                            }
+                        }
+                    }
+                    if !data.is_empty() {
+                        let mut pt = PivotTable::new([
+                            (Axis3::X, Dimension::new(variables)),
+                            (Axis3::Y, Dimension::new(case_numbers)),
+                        ]);
+                        for (row_number, row) in data.into_iter().enumerate() {
+                            for (column_number, datum) in row.into_iter().enumerate() {
+                                pt.insert(&[column_number, row_number], datum);
+                            }
+                        }
+                        output.push(Arc::new(pt.into()));
+                    }
+                }
                 Item::new(Details::Group(output))
             }
             Err(error) => Item::new(Details::Text(Box::new(Text::new_log(error.to_string())))),
index 65439d5039ae250414c4ea17eda033fd3b78058c..42cc6e4c04b7da8f0312791b00bac8d569e65af6 100644 (file)
@@ -22,3 +22,9 @@
 ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤
 │num1│       1│     │                 │Input│    8│Right    │F8.0        │F8.0        │              │
 ╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯
+
+╭────┬────╮
+│Case│num1│
+├────┼────┤
+│1   │1.00│
+╰────┴────╯
index 4489a0b36307f703f410890c66914e94228f61e8..47d2fac4a43f7362d0fadb63008aa8799a8bd66a 100644 (file)
@@ -18,3 +18,9 @@
 ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤
 │num1│       1│     │                 │Input│    8│Right    │F8.0        │F8.0        │              │
 ╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯
+
+╭────┬────╮
+│Case│num1│
+├────┼────┤
+│1   │1.00│
+╰────┴────╯
index 0c181f3954bf731aca072e2c06bec28c7d97c9f5..634c52ca578f61e5543cca2052167781fc95aaee 100644 (file)
@@ -19,3 +19,9 @@
 │num1│       1│     │                 │Input│    8│Right    │F8.0        │F8.0        │              │
 │num2│       2│     │                 │Input│    8│Right    │F8.0        │F8.0        │              │
 ╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯
+
+╭────┬────┬────╮
+│Case│num1│num2│
+├────┼────┼────┤
+│1   │1.00│2.00│
+╰────┴────┴────╯
index 77c26ff91dc197655350937334bd4ae79b4ba3e6..08013aa8c88dde57e87c685d2bddeee5f325509e 100644 (file)
@@ -1,6 +1,6 @@
 # File header.
 "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
-2; 2; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3;
+2; 2; 0; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3;
 
 # Numeric variables, no label or missing values.
 2; 0; 0; 0; 0x050800 *2; s8 "NUM1";
@@ -16,4 +16,4 @@
 999; 0;
 
 # Data.
-1.0;
+1.0; 2.0;
\ No newline at end of file
index e1e2966dba399b75036ecdb9ece4e9f098c05a3e..3024855129b62ebe9180ec779681baad9b04a27a 100644 (file)
@@ -19,3 +19,9 @@
 │séq256│       1│     │Nominal          │Input│   32│Left     │A256        │A256        │              │
 │str600│       2│     │Nominal          │Input│   32│Left     │A600        │A600        │              │
 ╰──────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯
+
+╭────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+│Case│                                                                                                                             séq256                                                                                                                             │                                                                                                                                                                                                                                                                                                         str600                                                                                                                                                                                                                                                                                                         │
+├────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
+│1   │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@a│abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyz│
+╰────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯