From 0887f63b95ef49b42b95752078e6f3c0dee601ce Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 6 Jul 2025 10:07:10 -0700 Subject: [PATCH] work on making data work --- rust/pspp/src/output/pivot/mod.rs | 27 ++++++ rust/pspp/src/sys/cooked.rs | 97 ++++++++++--------- rust/pspp/src/sys/raw.rs | 13 ++- rust/pspp/src/sys/test.rs | 43 +++++++- rust/pspp/src/sys/testdata/documents.expected | 6 ++ .../testdata/empty_document_record.expected | 6 ++ .../multiple_documents_records.expected | 6 ++ .../testdata/multiple_documents_records.sack | 4 +- .../sys/testdata/very_long_strings.expected | 6 ++ 9 files changed, 154 insertions(+), 54 deletions(-) diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index 2fb4dfa773..811b1ca769 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -423,6 +423,14 @@ impl Group { self } + pub fn with_multiple(mut self, children: impl IntoIterator) -> Self + where + C: Into, + { + self.extend(children); + self + } + pub fn with_label_shown(self) -> Self { self.with_show_label(true) } @@ -468,6 +476,19 @@ impl Group { } } +impl Extend for Group +where + C: Into, +{ + fn extend>(&mut self, children: T) { + let children = children.into_iter(); + self.children.reserve(children.size_hint().0); + for child in children { + self.push(child); + } + } +} + #[derive(Clone, Debug, Default)] pub struct Footnotes(pub Vec>); @@ -592,6 +613,12 @@ impl From for Category { } } +impl From<&Variable> for Category { + fn from(variable: &Variable) -> Self { + Value::new_variable(variable).into() + } +} + impl From<&str> for Category { fn from(name: &str) -> Self { Self::Leaf(Leaf::new(Value::new_text(name))) diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index a956dd8605..43ad3c5f59 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -578,7 +578,7 @@ impl Decoder { pub fn decode( mut headers: Headers, - cases: Option, + mut cases: Option, encoding: &'static Encoding, mut warn: impl FnMut(Error), ) -> Result<(Dictionary, Metadata, Option), Error> { @@ -870,55 +870,60 @@ pub fn decode( } } - 'outer: for record in headers - .very_long_strings - .drain(..) - .flat_map(|record| record.0.into_iter()) - { - let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else { - warn(dbg!(Error::TBD)); - continue; - }; - let width = VarWidth::String(record.length); - let n_segments = width.n_segments(); - if n_segments == 1 { - warn(dbg!(Error::ShortVeryLongString { - short_name: record.short_name.clone(), - width: record.length - })); - continue; - } - if index + n_segments > dictionary.variables.len() { - warn(dbg!(Error::VeryLongStringOverflow { - short_name: record.short_name.clone(), - width: record.length, - index, - n_segments, - len: dictionary.variables.len() - })); - continue; - } - let mut short_names = Vec::with_capacity(n_segments); - for i in 0..n_segments { - let alloc_width = width.segment_alloc_width(i); - let segment = &dictionary.variables[index + i]; - short_names.push(segment.short_names[0].clone()); - let segment_width = segment.width.as_string_width().unwrap_or(0); - if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) { - warn(Error::VeryLongStringInvalidSegmentWidth { + if !headers.very_long_strings.is_empty() { + 'outer: for record in headers + .very_long_strings + .drain(..) + .flat_map(|record| record.0.into_iter()) + { + let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else { + warn(dbg!(Error::TBD)); + continue; + }; + let width = VarWidth::String(record.length); + let n_segments = width.n_segments(); + if n_segments == 1 { + warn(dbg!(Error::ShortVeryLongString { + short_name: record.short_name.clone(), + width: record.length + })); + continue; + } + if index + n_segments > dictionary.variables.len() { + warn(dbg!(Error::VeryLongStringOverflow { short_name: record.short_name.clone(), width: record.length, - index: i, - actual: segment_width, - expected: alloc_width, - }); - continue 'outer; + index, + n_segments, + len: dictionary.variables.len() + })); + continue; } + let mut short_names = Vec::with_capacity(n_segments); + for i in 0..n_segments { + let alloc_width = width.segment_alloc_width(i); + let segment = &dictionary.variables[index + i]; + short_names.push(segment.short_names[0].clone()); + let segment_width = segment.width.as_string_width().unwrap_or(0); + if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) { + warn(Error::VeryLongStringInvalidSegmentWidth { + short_name: record.short_name.clone(), + width: record.length, + index: i, + actual: segment_width, + expected: alloc_width, + }); + continue 'outer; + } + } + dictionary.delete_vars(index + 1..index + n_segments); + let variable = dictionary.variables.get_index_mut2(index).unwrap(); + variable.short_names = short_names; + variable.resize(width); } - dictionary.delete_vars(index + 1..index + n_segments); - let variable = dictionary.variables.get_index_mut2(index).unwrap(); - variable.short_names = short_names; - variable.resize(width); + cases = cases + .take() + .map(|cases| cases.with_widths(dictionary.variables.iter().map(|var| var.width))); } if headers.long_names.is_empty() { diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 46c201a865..1317c2d835 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -1165,7 +1165,7 @@ impl CaseVar { fn bytes(&self) -> usize { match self { CaseVar::Numeric => 8, - CaseVar::String { width, encoding } => encoding + CaseVar::String { width: _, encoding } => encoding .iter() .map(|segment| segment.data_bytes + segment.padding_bytes) .sum(), @@ -1221,12 +1221,21 @@ impl Cases { } else { Box::new(reader) }, + eof: case_vars.is_empty(), case_vars, compression: header.compression, bias: header.bias, endian: header.endian, codes: VecDeque::with_capacity(8), - eof: false, + } + } + + pub fn with_widths(self, widths: impl IntoIterator) -> Self { + let case_vars = widths.into_iter().map(CaseVar::new).collect::>(); + Self { + eof: self.eof || case_vars.is_empty(), + case_vars, + ..self } } } diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index f3a5ade4c9..aa2af10012 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -3,7 +3,7 @@ use std::{io::Cursor, path::Path, sync::Arc}; use crate::{ endian::Endian, output::{ - pivot::{test::assert_lines_eq, Axis3, Dimension, PivotTable}, + pivot::{test::assert_lines_eq, Axis3, Dimension, Group, PivotTable, Value}, Details, Item, Text, }, sys::{ @@ -533,7 +533,7 @@ fn test_sysfile(name: &str) { let mut reader = Reader::new(cursor, |warning| warnings.push(warning)).unwrap(); let output = match reader.headers().collect() { Ok(headers) => { - drop(reader); + let cases = reader.cases(); let encoding = encoding_from_headers(&headers, &mut |warning| warnings.push(warning)).unwrap(); let mut decoder = Decoder::new(encoding, |warning| warnings.push(warning)); @@ -545,8 +545,8 @@ fn test_sysfile(name: &str) { let mut errors = Vec::new(); let headers = Headers::new(decoded_records, &mut |e| errors.push(e)).unwrap(); - let (dictionary, metadata, _cases) = - decode(headers, None, encoding, |e| errors.push(e)).unwrap(); + let (dictionary, metadata, cases) = + decode(headers, cases, encoding, |e| errors.push(e)).unwrap(); let (group, data) = metadata.to_pivot_rows(); let metadata_table = PivotTable::new([(Axis3::Y, Dimension::new(group))]) .with_data( @@ -591,6 +591,41 @@ fn test_sysfile(name: &str) { if let Some(pt) = dictionary.output_variable_sets().to_pivot_table() { output.push(Arc::new(pt.into())); } + if let Some(cases) = cases { + let variables = Group::new("Variable") + .with_multiple(dictionary.variables.iter().map(|var| &**var)); + let mut case_numbers = Group::new("Case").with_label_shown(); + let mut data = Vec::new(); + for (case_number, case) in cases.enumerate() { + match case { + Ok(case) => { + case_numbers.push(Value::new_integer(Some( + (case_numbers.len() + 1) as f64, + ))); + data.push( + case.into_iter() + .map(|datum| Value::new_datum(&datum, dictionary.encoding)) + .collect::>(), + ); + } + Err(error) => { + output.push(Arc::new(Item::from(Text::new_log(error.to_string())))); + } + } + } + if !data.is_empty() { + let mut pt = PivotTable::new([ + (Axis3::X, Dimension::new(variables)), + (Axis3::Y, Dimension::new(case_numbers)), + ]); + for (row_number, row) in data.into_iter().enumerate() { + for (column_number, datum) in row.into_iter().enumerate() { + pt.insert(&[column_number, row_number], datum); + } + } + output.push(Arc::new(pt.into())); + } + } Item::new(Details::Group(output)) } Err(error) => Item::new(Details::Text(Box::new(Text::new_log(error.to_string())))), diff --git a/rust/pspp/src/sys/testdata/documents.expected b/rust/pspp/src/sys/testdata/documents.expected index 65439d5039..42cc6e4c04 100644 --- a/rust/pspp/src/sys/testdata/documents.expected +++ b/rust/pspp/src/sys/testdata/documents.expected @@ -22,3 +22,9 @@ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ │num1│ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ ╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +╭────┬────╮ +│Case│num1│ +├────┼────┤ +│1 │1.00│ +╰────┴────╯ diff --git a/rust/pspp/src/sys/testdata/empty_document_record.expected b/rust/pspp/src/sys/testdata/empty_document_record.expected index 4489a0b363..47d2fac4a4 100644 --- a/rust/pspp/src/sys/testdata/empty_document_record.expected +++ b/rust/pspp/src/sys/testdata/empty_document_record.expected @@ -18,3 +18,9 @@ ├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ │num1│ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ ╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +╭────┬────╮ +│Case│num1│ +├────┼────┤ +│1 │1.00│ +╰────┴────╯ diff --git a/rust/pspp/src/sys/testdata/multiple_documents_records.expected b/rust/pspp/src/sys/testdata/multiple_documents_records.expected index 0c181f3954..634c52ca57 100644 --- a/rust/pspp/src/sys/testdata/multiple_documents_records.expected +++ b/rust/pspp/src/sys/testdata/multiple_documents_records.expected @@ -19,3 +19,9 @@ │num1│ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ │num2│ 2│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ ╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +╭────┬────┬────╮ +│Case│num1│num2│ +├────┼────┼────┤ +│1 │1.00│2.00│ +╰────┴────┴────╯ diff --git a/rust/pspp/src/sys/testdata/multiple_documents_records.sack b/rust/pspp/src/sys/testdata/multiple_documents_records.sack index 77c26ff91d..08013aa8c8 100644 --- a/rust/pspp/src/sys/testdata/multiple_documents_records.sack +++ b/rust/pspp/src/sys/testdata/multiple_documents_records.sack @@ -1,6 +1,6 @@ # File header. "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; -2; 2; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; +2; 2; 0; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; # Numeric variables, no label or missing values. 2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; @@ -16,4 +16,4 @@ 999; 0; # Data. -1.0; +1.0; 2.0; \ No newline at end of file diff --git a/rust/pspp/src/sys/testdata/very_long_strings.expected b/rust/pspp/src/sys/testdata/very_long_strings.expected index e1e2966dba..3024855129 100644 --- a/rust/pspp/src/sys/testdata/very_long_strings.expected +++ b/rust/pspp/src/sys/testdata/very_long_strings.expected @@ -19,3 +19,9 @@ │séq256│ 1│ │Nominal │Input│ 32│Left │A256 │A256 │ │ │str600│ 2│ │Nominal │Input│ 32│Left │A600 │A600 │ │ ╰──────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + +╭────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│Case│ séq256 │ str600 │ +├────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│1 │abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@a│abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#abcdefghijklmnopqrstuvwxyz│ +╰────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -- 2.30.2