From: Ben Pfaff Date: Sun, 8 Jun 2025 20:51:19 +0000 (-0700) Subject: very long strings X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fcec12e6a12120d7e116dc54e3f3efa4a1377fcc;p=pspp very long strings --- diff --git a/rust/doc/src/system-file.md b/rust/doc/src/system-file.md index 91b6f33e88..8fdab46ab5 100644 --- a/rust/doc/src/system-file.md +++ b/rust/doc/src/system-file.md @@ -1238,13 +1238,13 @@ contains very long string variables, has the following format: * `char string_lengths[];` - a list of key-value tuples, where key is the name of a variable, and - value is its length. the key field is at most 8 bytes long and must + A list of key-value tuples, where key is the name of a variable, and + value is its length. The key field is at most 8 bytes long and must match the name of a variable which appears in the [variable - record](#variable-record). the value field is exactly 5 bytes long. - it is a zero-padded, ASCII-encoded string that is the length of the - variable. the key and value fields are separated by a `=` byte. - tuples are delimited by a two-byte sequence {00, 09}. After the + record](#variable-record). The value field is exactly 5 bytes long. + It is a zero-padded, ASCII-encoded string that is the length of the + variable. The key and value fields are separated by a `=` byte. + Tuples are delimited by a two-byte sequence {00, 09}. After the last tuple, there may be a single byte 00, or {00, 09}. The total length is `count` bytes. 
diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index fd27bb9723..caaba5402e 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -249,6 +249,32 @@ impl Datum { Datum::String(s) => Some(s), } } + + pub fn is_resizable(&self, width: VarWidth) -> bool { + match (self, width) { + (Datum::Number(_), VarWidth::Numeric) => true, + (Datum::String(s), VarWidth::String(new_width)) => { + let new_len = new_width as usize; + new_len >= s.len() || s.0[new_len..].iter().all(|c| *c == b' ') + } + _ => false, + } + } + + pub fn resize(&mut self, width: VarWidth) { + match (self, width) { + (Datum::Number(_), VarWidth::Numeric) => (), + (Datum::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), + _ => unreachable!(), + } + } + + pub fn width(&self) -> VarWidth { + match self { + Datum::Number(_) => VarWidth::Numeric, + Datum::String(s) => VarWidth::String(s.len().try_into().unwrap()), + } + } } impl From for Datum { @@ -688,7 +714,7 @@ impl<'a> OutputValueLabels<'a> { .filter(|var| !var.value_labels.is_empty()) { let mut group = Group::new(&**variable); - let mut sorted_value_labels = variable.value_labels.iter().collect::>(); + let mut sorted_value_labels = variable.value_labels.0.iter().collect::>(); sorted_value_labels.sort(); for (datum, label) in sorted_value_labels { let mut value = Value::new_variable_value(variable, datum) @@ -987,7 +1013,7 @@ pub struct Variable { /// Value labels, to associate a number (or a string) with a more meaningful /// description, e.g. 1 -> Apple, 2 -> Banana, ... - pub value_labels: HashMap, + pub value_labels: ValueLabels, /// Variable label, an optional meaningful description for the variable /// itself. 
@@ -1048,7 +1074,7 @@ impl Variable { missing_values: MissingValues::default(), print_format: Format::default_for_width(width), write_format: Format::default_for_width(width), - value_labels: HashMap::new(), + value_labels: ValueLabels::new(), label: None, measure: Measure::default_for_type(var_type), role: Role::default(), @@ -1072,6 +1098,25 @@ impl Variable { pub fn label(&self) -> Option<&String> { self.label.as_ref() } + + pub fn resize(&mut self, width: VarWidth) { + if self.missing_values.is_resizable(width) { + self.missing_values.resize(width); + } else { + self.missing_values = MissingValues::default(); + } + + if self.value_labels.is_resizable(width) { + self.value_labels.resize(width); + } else { + self.value_labels = ValueLabels::default(); + } + + self.print_format.resize(width); + self.write_format.resize(width); + + self.width = width; + } } impl HasIdentifier for Variable { @@ -1171,6 +1216,42 @@ impl VariableSet { } } +#[derive(Clone, Debug, Default)] +pub struct ValueLabels(pub HashMap); + +impl ValueLabels { + pub fn new() -> Self { + Self::default() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn get(&self, datum: &Datum) -> Option<&str> { + self.0.get(datum).map(|s| s.as_str()) + } + + pub fn insert(&mut self, datum: Datum, label: String) -> Option { + self.0.insert(datum, label) + } + + pub fn is_resizable(&self, width: VarWidth) -> bool { + self.0.keys().all(|datum| datum.is_resizable(width)) + } + + pub fn resize(&mut self, width: VarWidth) { + self.0 = self + .0 + .drain() + .map(|(mut datum, string)| { + datum.resize(width); + (datum, string) + }) + .collect(); + } +} + #[cfg(test)] mod test { use std::collections::HashSet; diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index 7bab7bc0da..945cfa6c60 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -608,6 +608,20 @@ impl Format { VarWidth::String(width) => Datum::String(RawString::spaces(width as usize)), } } + 
+ pub fn resize(&mut self, width: VarWidth) { + match (self.var_width(), width) { + (VarWidth::Numeric, VarWidth::Numeric) => {} + (VarWidth::String(_), VarWidth::String(new_width)) => { + self.w = if self.type_ == Type::AHex { + new_width * 2 + } else { + new_width + }; + } + _ => *self = Self::default_for_width(width), + } + } } impl Debug for Format { diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index 35c38244db..ce2db04c91 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -1769,7 +1769,7 @@ impl Value { } pub fn new_variable_value(variable: &Variable, value: &Datum) -> Self { let var_name = Some(variable.name.as_str().into()); - let value_label = variable.value_labels.get(value).cloned(); + let value_label = variable.value_labels.get(value).map(String::from); match value { Datum::Number(number) => Self::new(ValueInner::Number(NumberValue { show: None, diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index b49d21f67b..1bbe6aee16 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -727,7 +727,7 @@ pub fn decode( dictionary.delete_vars(index + 1..index + n_segments); let variable = dictionary.variables.get_index_mut2(index).unwrap(); variable.short_names = short_names; - variable.width = width; + variable.resize(width); } if headers.long_names.is_empty() { diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 34693eb0f4..09b19cd7a7 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -1159,6 +1159,20 @@ impl MissingValues { } } + pub fn is_resizable(&self, width: VarWidth) -> bool { + self.values.iter().all(|datum| datum.is_resizable(width)) + && self.range.iter().all(|range| range.is_resizable(width)) + } + + pub fn resize(&mut self, width: VarWidth) { + for datum in &mut self.values { + datum.resize(width); + } + if let Some(range) = &mut self.range { + range.resize(width); + } + } + fn read( r: &mut R, 
offset: u64, @@ -1293,6 +1307,14 @@ impl MissingValueRange { MissingValueRange::To { high } => number <= *high, } } + + pub fn is_resizable(&self, width: VarWidth) -> bool { + width.is_numeric() + } + + pub fn resize(&self, width: VarWidth) { + assert_eq!(width, VarWidth::Numeric); + } } impl Display for MissingValueRange { @@ -1512,6 +1534,9 @@ impl RawString { pub fn resize(&mut self, len: usize) { self.0.resize(len, b' '); } + pub fn len(&self) -> usize { + self.0.len() + } } impl Borrow for RawString { @@ -2585,7 +2610,7 @@ impl VeryLongStringsRecord { let mut very_long_strings = Vec::new(); for tuple in input .split('\0') - .map(|s| s.trim_end_matches('\t')) + .map(|s| s.trim_start_matches('\t')) .filter(|s| !s.is_empty()) { if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) { diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index eb51ecf126..e2ab17a50c 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -72,6 +72,16 @@ fn variable_display_with_width() { test_sysfile("variable_display_with_width"); } +#[test] +fn long_variable_names() { + test_sysfile("long_variable_names"); +} + +#[test] +fn very_long_strings() { + test_sysfile("very_long_strings"); +} + fn test_sysfile(name: &str) { let input_filename = Path::new(env!("CARGO_MANIFEST_DIR")) .join("src/sys/testdata") diff --git a/rust/pspp/src/sys/testdata/long_variable_names.expected b/rust/pspp/src/sys/testdata/long_variable_names.expected new file mode 100644 index 0000000000..23b123429a --- /dev/null +++ b/rust/pspp/src/sys/testdata/long_variable_names.expected @@ -0,0 +1,26 @@ +╭──────────────────────┬────────────────────────╮ +│ Created │ 01-JAN-2011 20:53:52│ +├──────────────────────┼────────────────────────┤ +│Writer Product │PSPP synthetic test file│ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────────┤ +│ Compression │None │ +│ Number of Cases│ 0│ +╰──────────────────────┴────────────────────────╯ + 
+╭─────────┬────────────────────────╮ +│Label │PSPP synthetic test file│ +│Variables│ 7│ +╰─────────┴────────────────────────╯ + +╭─────────────────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├─────────────────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│LongVariableName1│ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│LongVariableName2│ 2│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│LongVariableName3│ 3│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│LongVariableName4│ 4│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│Coördinate_X │ 5│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│Coördinate_Y │ 6│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│Coördinate_Z │ 7│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +╰─────────────────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ diff --git a/rust/pspp/src/sys/testdata/long_variable_names.sack b/rust/pspp/src/sys/testdata/long_variable_names.sack new file mode 100644 index 0000000000..4e85cf2e4a --- /dev/null +++ b/rust/pspp/src/sys/testdata/long_variable_names.sack @@ -0,0 +1,42 @@ +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; # Layout code +7; # Nominal case size +0; # Not compressed +0; # Not weighted +0; # No cases. +100.0; # Bias. +"01 Jan 11"; "20:53:52"; s64 "PSPP synthetic test file"; +i8 0 *3; + +# Numeric variables. +2; 0; 0; 0; 0x050800 *2; s8 "LONGVARI"; +2; 0; 0; 0; 0x050800 *2; s8 "LONGVA_A"; +2; 0; 0; 0; 0x050800 *2; s8 "LONGVA_B"; +2; 0; 0; 0; 0x050800 *2; s8 "LONGVA_C"; +2; 0; 0; 0; 0x050800 *2; "CO"; i8 214; "RDINA"; +2; 0; 0; 0; 0x050800 *2; "CO"; i8 214; "RDI_A"; +2; 0; 0; 0; 0x050800 *2; "CO"; i8 214; "RDI_B"; + +# Machine integer info record. +7; 3; 4; 8; 1; 2; 3; -1; 1; 1; ENDIAN; 1252; + +# Machine floating-point info record. 
+7; 4; 8; 3; SYSMIS; HIGHEST; LOWEST; + +# Long variable names. +7; 13; 1; COUNT ( +"LONGVARI=LongVariableName1"; i8 9; +"LONGVA_A=LongVariableName2"; i8 9; +"LONGVA_B=LongVariableName3"; i8 9; +"LONGVA_C=LongVariableName4"; i8 9; +"CO"; i8 214; "RDINA=Co"; i8 246; "rdinate_X"; i8 9; +"CO"; i8 214; "RDI_A=Co"; i8 246; "rdinate_Y"; i8 9; +"CO"; i8 214; "RDI_B=Co"; i8 246; "rdinate_Z"; +); + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# Dictionary termination record. +999; 0; diff --git a/rust/pspp/src/sys/testdata/very_long_strings.expected b/rust/pspp/src/sys/testdata/very_long_strings.expected new file mode 100644 index 0000000000..e1e2966dba --- /dev/null +++ b/rust/pspp/src/sys/testdata/very_long_strings.expected @@ -0,0 +1,21 @@ +╭──────────────────────┬────────────────────────╮ +│ Created │ 01-JAN-2011 20:53:52│ +├──────────────────────┼────────────────────────┤ +│Writer Product │PSPP synthetic test file│ +│ Version │1.2.3 │ +├──────────────────────┼────────────────────────┤ +│ Compression │None │ +│ Number of Cases│ 1│ +╰──────────────────────┴────────────────────────╯ + +╭─────────┬────────────────────────╮ +│Label │PSPP synthetic test file│ +│Variables│ 2│ +╰─────────┴────────────────────────╯ + +╭──────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├──────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│séq256│ 1│ │Nominal │Input│ 32│Left │A256 │A256 │ │ +│str600│ 2│ │Nominal │Input│ 32│Left │A600 │A600 │ │ +╰──────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ diff --git a/rust/pspp/src/sys/testdata/very_long_strings.sack b/rust/pspp/src/sys/testdata/very_long_strings.sack new file mode 100644 index 0000000000..ed1fe9d826 --- /dev/null +++ 
b/rust/pspp/src/sys/testdata/very_long_strings.sack @@ -0,0 +1,44 @@ +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; # Layout code +109; # Nominal case size +0; # Not compressed +0; # Not weighted +1; # One case. +100.0; # Bias. +"01 Jan 11"; "20:53:52"; s64 "PSPP synthetic test file"; +i8 0 *3; + +# 256-byte string. +2; 255; 0; 0; 0x01FF00 *2; "S"; i8 201; s6 "Q256"; +(2; -1; 0; 0; 0; 0; s8 "") * 31; +2; 4; 0; 0; 0x010400 *2; "S"; i8 201; "Q256_1"; + +# 600-byte string. +2; 255; 0; 0; 0x01FF00 *2; s8 "STR600"; +(2; -1; 0; 0; 0; 0; s8 "") * 31; +2; 255; 0; 0; 0x01FF00 *2; s8 "STR600_1"; +(2; -1; 0; 0; 0; 0; s8 "") * 31; +2; 96; 0; 0; 0x016000 *2; s8 "STR600_2"; +(2; -1; 0; 0; 0; 0; s8 "") * 11; + +# Machine integer info record. +7; 3; 4; 8; 1; 2; 3; -1; 1; 1; ENDIAN; 1252; + +# Very long string record. +7; 14; 1; COUNT ( +"S"; i8 201; "Q256=00256"; i8 0; i8 9; +"STR600=00600"; i8 0; i8 9; +); + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# Dictionary termination record. +999; 0; + +# Data. +"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#" * 4; +"abcdefgh"; +"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#" * 9; +"abcdefghijklmnopqrstuvwxyzABCDEF";