From d7c3cdd4cf2ea17254b2cf587ef803600304f295 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 17 Jun 2025 08:26:37 -0700 Subject: [PATCH] add more tests, fix weird linebreaking behavior --- rust/pspp/src/output/text.rs | 36 +++++++++++++-- rust/pspp/src/sys/cooked.rs | 45 +++++++++++++------ rust/pspp/src/sys/test.rs | 20 +++++++++ .../multiple_documents_records.expected | 22 +++++++++ .../testdata/multiple_documents_records.sack | 19 ++++++++ .../testdata/weight_must_be_numeric.expected | 21 +++++++++ .../sys/testdata/weight_must_be_numeric.sack | 15 +++++++ .../weight_variable_bad_index.expected | 21 +++++++++ .../testdata/weight_variable_bad_index.sack | 15 +++++++ .../weight_variable_continuation.expected | 21 +++++++++ .../weight_variable_continuation.sack | 16 +++++++ 11 files changed, 235 insertions(+), 16 deletions(-) create mode 100644 rust/pspp/src/sys/testdata/multiple_documents_records.expected create mode 100644 rust/pspp/src/sys/testdata/multiple_documents_records.sack create mode 100644 rust/pspp/src/sys/testdata/weight_must_be_numeric.expected create mode 100644 rust/pspp/src/sys/testdata/weight_must_be_numeric.sack create mode 100644 rust/pspp/src/sys/testdata/weight_variable_bad_index.expected create mode 100644 rust/pspp/src/sys/testdata/weight_variable_bad_index.sack create mode 100644 rust/pspp/src/sys/testdata/weight_variable_continuation.expected create mode 100644 rust/pspp/src/sys/testdata/weight_variable_continuation.sack diff --git a/rust/pspp/src/output/text.rs b/rust/pspp/src/output/text.rs index 8f556454b3..be67f96576 100644 --- a/rust/pspp/src/output/text.rs +++ b/rust/pspp/src/output/text.rs @@ -430,6 +430,7 @@ where width: usize, saved: Option<(usize, BreakOpportunity)>, breaks: B, + trailing_newlines: usize, } impl<'a, B> Iterator for LineBreaks<'a, B> @@ -450,6 +451,7 @@ where continue; } + let segment = &self.text[self.indexes.end..index]; let segment_width = self.text[self.indexes.end..index].width(); if self.width == 0 || self.width + segment_width <= self.max_width { // Add this segment to the current line. @@ -478,7 +480,12 @@ where return Some(segment); } } - None + if self.trailing_newlines > 1 { + self.trailing_newlines -= 1; + Some("") + } else { + None + } } } @@ -486,13 +493,30 @@ fn new_line_breaks( text: &str, width: usize, ) -> LineBreaks<'_, impl Iterator + Clone + '_> { + // Trim trailing new-lines from the text, because the linebreaking algorithm + // treats them as if they have width. That is, if you break `"a b c\na b + // c\n"` with a 5-character width, then you end up with: + // + // ```text + // a b c + // a b + // c + // ``` + // + // So, we trim trailing new-lines and then add in extra blank lines at the + // end if necessary. + // + // (The linebreaking algorithm treats new-lines in the middle of the text in + // a normal way, though.) + let trimmed = text.trim_end_matches('\n'); LineBreaks { - text, + text: trimmed, max_width: width, indexes: 0..0, width: 0, saved: None, - breaks: linebreaks(text), + breaks: linebreaks(trimmed), + trailing_newlines: text.len() - trimmed.len(), } } @@ -630,6 +654,12 @@ mod test { } #[test] fn line_breaks() { + test_line_breaks( + "One line of text\nOne line of text\n", + 16, + vec!["One line of text", "One line of text"], + ); + test_line_breaks("a b c\na b c\na b c\n", 5, vec!["a b c", "a b c", "a b c"]); for width in 0..=6 { test_line_breaks("abc def ghi", width, vec!["abc", "def", "ghi"]); } diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 0121133528..dba0c40027 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -1,4 +1,9 @@ -use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; +use std::{ + cell::RefCell, + collections::{BTreeMap, HashMap}, + ops::Range, + rc::Rc, +}; use crate::{ calendar::date_time_to_pspp, @@ -182,9 +187,14 @@ pub enum Error { InvalidWeightVar { name: Identifier, index: u32 }, #[error( - "File weight variable index {index} is greater than maximum variable index {max_index}." + "File weight variable index {index} is invalid because it exceeds maximum variable index {max_index}." + )] + WeightIndexOutOfRange { index: u32, max_index: usize }, + + #[error( + "File weight variable index {index} is invalid because it refers to long string continuation for variable {name}." )] - InvalidWeightIndex { index: u32, max_index: usize }, + WeightIndexStringContinuation { index: u32, name: Identifier }, #[error("{0}")] InvalidRole(InvalidRole), @@ -546,7 +556,7 @@ pub fn decode( n_generated_names: 0, }; - let mut var_index_map = HashMap::new(); + let mut var_index_map = BTreeMap::new(); let mut value_index = 0; for (index, input) in headers .variable @@ -639,21 +649,30 @@ pub fn decode( } if let Some(weight_index) = headers.header.weight_index { - if let Some(dict_index) = var_index_map.get(&(weight_index as usize - 1)) { + let index = weight_index as usize - 1; + if index >= value_index { + warn(Error::WeightIndexOutOfRange { + index: weight_index, + max_index: var_index_map.len(), + }); + } else { + let (var_index, dict_index) = var_index_map.range(..=&index).last().unwrap(); let variable = &dictionary.variables[*dict_index]; - if variable.is_numeric() { - dictionary.weight = Some(*dict_index); + if *var_index == index { + if variable.is_numeric() { + dictionary.weight = Some(*dict_index); + } else { + warn(Error::InvalidWeightVar { + index: weight_index, + name: variable.name.clone(), + }); + } } else { - warn(Error::InvalidWeightVar { + warn(Error::WeightIndexStringContinuation { index: weight_index, name: variable.name.clone(), }); } - } else { - warn(Error::InvalidWeightIndex { - index: weight_index, - max_index: var_index_map.len(), - }); } } diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 30021daff2..75207ee356 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -172,6 +172,26 @@ fn invalid_long_string_missing_values() { test_sysfile("invalid_long_string_missing_values"); } +#[test] +fn weight_must_be_numeric() { + test_sysfile("weight_must_be_numeric"); +} + +#[test] +fn weight_variable_bad_index() { + test_sysfile("weight_variable_bad_index"); +} + +#[test] +fn weight_variable_continuation() { + test_sysfile("weight_variable_continuation"); +} + +#[test] +fn multiple_documents_records() { + test_sysfile("multiple_documents_records"); +} + /// Duplicate variable name handling negative test. /// /// SPSS-generated system file can contain duplicate variable names (see bug diff --git a/rust/pspp/src/sys/testdata/multiple_documents_records.expected b/rust/pspp/src/sys/testdata/multiple_documents_records.expected new file mode 100644 index 0000000000..f6f127ec16 --- /dev/null +++ b/rust/pspp/src/sys/testdata/multiple_documents_records.expected @@ -0,0 +1,22 @@ +╭──────────────────────┬────────────────────────╮ +│ Created │ 01-JAN-2011 20:53:52│ +├──────────────────────┼────────────────────────┤ +│Writer Product │PSPP synthetic test file│ +├──────────────────────┼────────────────────────┤ +│ Compression │SAV │ +│ Number of Cases│Unknown │ +╰──────────────────────┴────────────────────────╯ + +╭─────────┬─────────────────────╮ +│Variables│ 2│ +│Documents│One line of documents│ +│ │One line of documents│ +╰─────────┴─────────────────────╯ + +╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│num1│ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│num2│ 2│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ + diff --git a/rust/pspp/src/sys/testdata/multiple_documents_records.sack b/rust/pspp/src/sys/testdata/multiple_documents_records.sack new file mode 100644 index 0000000000..77c26ff91d --- /dev/null +++ b/rust/pspp/src/sys/testdata/multiple_documents_records.sack @@ -0,0 +1,19 @@ +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; 2; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; + +# Numeric variables, no label or missing values. +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; +2; 0; 0; 0; 0x050800 *2; s8 "NUM2"; + +# Two document records. +(6; 1; s80 "One line of documents") >>* 2<<; + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# Dictionary termination record. +999; 0; + +# Data. +1.0; diff --git a/rust/pspp/src/sys/testdata/weight_must_be_numeric.expected b/rust/pspp/src/sys/testdata/weight_must_be_numeric.expected new file mode 100644 index 0000000000..a77a5d4fdc --- /dev/null +++ b/rust/pspp/src/sys/testdata/weight_must_be_numeric.expected @@ -0,0 +1,21 @@ +File designates string variable STR1 (index 2) as weight variable, but weight variables must be numeric. + +╭──────────────────────┬────────────────────────╮ +│ Created │ 01-JAN-2011 20:53:52│ +├──────────────────────┼────────────────────────┤ +│Writer Product │PSPP synthetic test file│ +├──────────────────────┼────────────────────────┤ +│ Compression │SAV │ +│ Number of Cases│Unknown │ +╰──────────────────────┴────────────────────────╯ + +╭─────────┬─╮ +│Variables│2│ +╰─────────┴─╯ + +╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│num1│ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│str1│ 2│ │Nominal │Input│ 4│Left │A4 │A4 │ │ +╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ diff --git a/rust/pspp/src/sys/testdata/weight_must_be_numeric.sack b/rust/pspp/src/sys/testdata/weight_must_be_numeric.sack new file mode 100644 index 0000000000..ad8094a80a --- /dev/null +++ b/rust/pspp/src/sys/testdata/weight_must_be_numeric.sack @@ -0,0 +1,15 @@ +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; 2; 1; >>2<<; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; + +# Numeric variable. +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; + +# String variable. +2; 4; 0; 0; 0x010400 *2; s8 "STR1"; + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# End of dictionary. +999; 0; diff --git a/rust/pspp/src/sys/testdata/weight_variable_bad_index.expected b/rust/pspp/src/sys/testdata/weight_variable_bad_index.expected new file mode 100644 index 0000000000..e4b2b8f514 --- /dev/null +++ b/rust/pspp/src/sys/testdata/weight_variable_bad_index.expected @@ -0,0 +1,21 @@ +File weight variable index 3 is invalid because it exceeds maximum variable index 2. + +╭──────────────────────┬────────────────────────╮ +│ Created │ 01-JAN-2011 20:53:52│ +├──────────────────────┼────────────────────────┤ +│Writer Product │PSPP synthetic test file│ +├──────────────────────┼────────────────────────┤ +│ Compression │SAV │ +│ Number of Cases│Unknown │ +╰──────────────────────┴────────────────────────╯ + +╭─────────┬─╮ +│Variables│2│ +╰─────────┴─╯ + +╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│num1│ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│str1│ 2│ │Nominal │Input│ 4│Left │A4 │A4 │ │ +╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ diff --git a/rust/pspp/src/sys/testdata/weight_variable_bad_index.sack b/rust/pspp/src/sys/testdata/weight_variable_bad_index.sack new file mode 100644 index 0000000000..6fb764c5d7 --- /dev/null +++ b/rust/pspp/src/sys/testdata/weight_variable_bad_index.sack @@ -0,0 +1,15 @@ +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; 2; 1; >>3<<; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; + +# Numeric variable. +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; + +# String variable. +2; 4; 0; 0; 0x010400 *2; s8 "STR1"; + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# End of dictionary. +999; 0; diff --git a/rust/pspp/src/sys/testdata/weight_variable_continuation.expected b/rust/pspp/src/sys/testdata/weight_variable_continuation.expected new file mode 100644 index 0000000000..9dc6cf3df7 --- /dev/null +++ b/rust/pspp/src/sys/testdata/weight_variable_continuation.expected @@ -0,0 +1,21 @@ +File weight variable index 2 is invalid because it refers to long string continuation for variable STR1. + +╭──────────────────────┬────────────────────────╮ +│ Created │ 01-JAN-2011 20:53:52│ +├──────────────────────┼────────────────────────┤ +│Writer Product │PSPP synthetic test file│ +├──────────────────────┼────────────────────────┤ +│ Compression │SAV │ +│ Number of Cases│Unknown │ +╰──────────────────────┴────────────────────────╯ + +╭─────────┬─╮ +│Variables│2│ +╰─────────┴─╯ + +╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│str1│ 1│ │Nominal │Input│ 9│Left │A9 │A9 │ │ +│num1│ 2│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ diff --git a/rust/pspp/src/sys/testdata/weight_variable_continuation.sack b/rust/pspp/src/sys/testdata/weight_variable_continuation.sack new file mode 100644 index 0000000000..cf5eebe076 --- /dev/null +++ b/rust/pspp/src/sys/testdata/weight_variable_continuation.sack @@ -0,0 +1,16 @@ +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; 3; 1; >>2<<; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; + +# Long string variable. +2; 9; 0; 0; 0x010900 *2; s8 "STR1"; +(2; -1; 0; 0; 0; 0; s8 ""); + +# Numeric variable. +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# End of dictionary. +999; 0; -- 2.30.2