From 64e86d0d9fe8a0642b839cb1c78037d07f0bf3ff Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 15 Jun 2025 20:30:16 -0700 Subject: [PATCH] getting rid of textrecord --- rust/pspp/src/dictionary.rs | 21 + rust/pspp/src/sys/cooked.rs | 26 +- rust/pspp/src/sys/raw.rs | 421 +++++++++++------- rust/pspp/src/sys/test.rs | 15 + ...nvalid_long_string_missing_values.expected | 0 .../invalid_long_string_missing_values.sack | 61 +++ .../testdata/invalid_variable_format.expected | 37 ++ .../sys/testdata/invalid_variable_format.sack | 23 + .../missing_string_continuation.expected | 21 + .../testdata/missing_string_continuation.sack | 13 + ...ariable_labels_and_missing_values.expected | 50 +-- 11 files changed, 488 insertions(+), 200 deletions(-) create mode 100644 rust/pspp/src/sys/testdata/invalid_long_string_missing_values.expected create mode 100644 rust/pspp/src/sys/testdata/invalid_long_string_missing_values.sack create mode 100644 rust/pspp/src/sys/testdata/invalid_variable_format.expected create mode 100644 rust/pspp/src/sys/testdata/invalid_variable_format.sack create mode 100644 rust/pspp/src/sys/testdata/missing_string_continuation.expected create mode 100644 rust/pspp/src/sys/testdata/missing_string_continuation.sack diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 7b33658ad4..e6154fb4e1 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -269,12 +269,33 @@ impl Datum { } } + pub fn var_type(&self) -> VarType { + match self { + Self::Number(_) => VarType::Numeric, + Self::String(_) => VarType::String, + } + } + pub fn width(&self) -> VarWidth { match self { Datum::Number(_) => VarWidth::Numeric, Datum::String(s) => VarWidth::String(s.len().try_into().unwrap()), } } + + pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool { + match (self, other) { + (Self::String(a), Self::String(b)) => a.eq_ignore_trailing_spaces(b), + _ => self == other, + } + } + + pub fn trim_end(&mut self) { + match self { + Self::Number(_) => (), + Self::String(s) => s.trim_end(), + } + } } impl From for Datum { diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 0978487ae9..aa7e65fbb3 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -1,4 +1,3 @@ -use core::str; use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; use crate::{ @@ -15,12 +14,12 @@ use crate::{ encoding::Error as EncodingError, raw::{ self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, - FileAttributeRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, + FileAttributesRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord, LongStringMissingValueRecord, LongStringValueLabelRecord, - MissingValues, MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, - RawStrArray, RawString, RawWidth, ValueLabel, ValueLabelRecord, VarDisplayRecord, - VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, - ZHeader, ZTrailer, + MissingValues, MissingValuesError, MultipleResponseRecord, NumberOfCasesRecord, + ProductInfoRecord, RawStrArray, RawString, RawWidth, ValueLabel, ValueLabelRecord, + VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord, + VeryLongStringsRecord, ZHeader, ZTrailer, }, }, }; @@ -93,7 +92,7 @@ pub enum Error { #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")] LongStringContinuationIndexes { offset: u64, indexes: Vec }, - #[error("Variable index {start_index} is a {width} that should be followed by long string continuation records up to index {end_index}, but index {error_index} is not a continuation")] + #[error("Variable index {start_index} is a {width} that should be followed by long string continuation records through index {end_index} (inclusive), but index {error_index} is not a continuation")] MissingLongStringContinuation { width: RawWidth, start_index: usize, @@ -215,7 +214,7 @@ pub struct Headers { pub product_info: Option, pub long_names: Vec, pub very_long_strings: Vec, - pub file_attributes: Vec, + pub file_attributes: Vec, pub variable_attributes: Vec, pub other_extension: Vec, pub end_of_headers: Option, @@ -852,10 +851,13 @@ pub fn decode( Datum::String(value) }) .collect::>(); - variable.missing_values = MissingValues { - values, - range: None, - }; + dbg!(&values); + match MissingValues::new(values, None) { + Ok(missing_values) => variable.missing_values = missing_values, + Err(MissingValuesError::TooMany) => warn(Error::TBD), + Err(MissingValuesError::TooWide) => warn(Error::TBD), + Err(MissingValuesError::MixedTypes) => unreachable!(), + } } for record in headers diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 3c8ddb9d9a..9c010ccbe9 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -287,6 +287,11 @@ pub enum Record { LongStringMissingValues(LongStringMissingValueRecord), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), + VariableSets(RawVariableSetRecord), + ProductInfo(RawProductInfoRecord), + LongNames(RawLongNamesRecord), + VeryLongStrings(RawVeryLongStringsRecord), + FileAttributes(RawFileAttributesRecord), Text(TextRecord), OtherExtension(Extension), EndOfHeaders(u32), @@ -313,7 +318,7 @@ pub enum DecodedRecord { ProductInfo(ProductInfoRecord), LongNames(LongNamesRecord), VeryLongStrings(VeryLongStringsRecord), - FileAttributes(FileAttributeRecord), + FileAttributes(FileAttributesRecord), VariableAttributes(VariableAttributeRecord), OtherExtension(Extension), EndOfHeaders(u32), @@ -366,6 +371,13 @@ impl Record { } Record::Encoding(record) => DecodedRecord::Encoding(record.clone()), Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()), + Record::VariableSets(record) => DecodedRecord::VariableSets(record.decode(decoder)), + Record::ProductInfo(record) => DecodedRecord::ProductInfo(record.decode(decoder)), + Record::LongNames(record) => DecodedRecord::LongNames(record.decode(decoder)), + Record::VeryLongStrings(record) => { + DecodedRecord::VeryLongStrings(record.decode(decoder)) + } + Record::FileAttributes(record) => DecodedRecord::FileAttributes(record.decode(decoder)), Record::Text(record) => record.decode(decoder), Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()), Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record), @@ -1135,10 +1147,10 @@ fn format_name(type_: u32) -> Cow<'static, str> { #[derive(Clone, Default)] pub struct MissingValues { /// Individual missing values, up to 3 of them. - pub values: Vec, + values: Vec, /// Optional range of missing values. - pub range: Option, + range: Option, } impl Debug for MissingValues { @@ -1151,13 +1163,62 @@ impl Debug for MissingValues { } } +#[derive(Copy, Clone, Debug)] +pub enum MissingValuesError { + TooMany, + TooWide, + MixedTypes, +} + impl MissingValues { + pub fn new( + mut values: Vec, + range: Option, + ) -> Result { + if values.len() > 3 { + return Err(MissingValuesError::TooMany); + } + + let mut var_type = None; + for value in values.iter_mut() { + value.trim_end(); + match value.width() { + VarWidth::String(w) if w > 8 => return Err(MissingValuesError::TooWide), + _ => (), + } + if var_type.is_some_and(|t| t != value.var_type()) { + return Err(MissingValuesError::MixedTypes); + } + var_type = Some(value.var_type()); + } + + if var_type == Some(VarType::String) && range.is_some() { + return Err(MissingValuesError::MixedTypes); + } + + Ok(Self { values, range }) + } + pub fn is_empty(&self) -> bool { self.values.is_empty() && self.range.is_none() } + pub fn var_type(&self) -> Option { + if let Some(datum) = self.values.first() { + Some(datum.var_type()) + } else if self.range.is_some() { + Some(VarType::Numeric) + } else { + None + } + } + pub fn contains(&self, value: &Datum) -> bool { - if self.values.contains(value) { + if self + .values + .iter() + .any(|datum| datum.eq_ignore_trailing_spaces(value)) + { return true; } @@ -1219,7 +1280,7 @@ impl MissingValues { let range = range.map(|(low, high)| { MissingValueRange::new(endian.parse(low), endian.parse(high)) }); - return Ok(Self { values, range }); + return Ok(Self::new(values, range).unwrap()); } Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::MissingValueStringRange), Ok(VarWidth::String(width)) => { @@ -1228,10 +1289,7 @@ impl MissingValues { .into_iter() .map(|value| Datum::String(RawString::from(&value[..width]))) .collect(); - return Ok(Self { - values, - range: None, - }); + return Ok(Self::new(values, None).unwrap()); } Err(()) => warn(Warning::MissingValueContinuation(offset)), } @@ -1545,6 +1603,9 @@ impl RawString { pub fn len(&self) -> usize { self.0.len() } + pub fn trim_end(&mut self) { + while self.0.pop_if(|c| *c == b' ').is_some() {} + } } impl Borrow for RawString { @@ -1617,6 +1678,21 @@ impl RawStr { pub fn decode(&self, encoding: &'static Encoding) -> Cow<'_, str> { encoding.decode_without_bom_handling(&self.0).0 } + + pub fn eq_ignore_trailing_spaces(&self, other: &RawStr) -> bool { + let mut this = self.0.iter(); + let mut other = other.0.iter(); + loop { + match (this.next(), other.next()) { + (Some(a), Some(b)) if a == b => (), + (Some(_), Some(_)) => return false, + (None, None) => return true, + (Some(b' '), None) => return this.all(|c| *c == b' '), + (None, Some(b' ')) => return other.all(|c| *c == b' '), + (Some(_), None) | (None, Some(_)) => return false, + } + } + } } pub struct DisplayRawString<'a>(Cow<'a, str>); @@ -2003,12 +2079,10 @@ impl DocumentRecord { } } -trait ExtensionRecord { - const SUBTYPE: u32; - const SIZE: Option; - const COUNT: Option; - const NAME: &'static str; - fn parse(ext: &Extension, endian: Endian) -> Result; +struct ExtensionRecord<'a> { + size: Option, + count: Option, + name: &'a str, } #[derive(Clone, Debug)] @@ -2022,14 +2096,15 @@ pub struct IntegerInfoRecord { pub character_code: i32, } -impl ExtensionRecord for IntegerInfoRecord { - const SUBTYPE: u32 = 3; - const SIZE: Option = Some(4); - const COUNT: Option = Some(8); - const NAME: &'static str = "integer record"; +static INTEGER_INFO_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(4), + count: Some(8), + name: "integer record", +}; +impl IntegerInfoRecord { fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; + ext.check_size(&INTEGER_INFO_RECORD)?; let mut input = &ext.data[..]; let data: Vec = (0..8) @@ -2054,14 +2129,15 @@ pub struct FloatInfoRecord { pub lowest: f64, } -impl ExtensionRecord for FloatInfoRecord { - const SUBTYPE: u32 = 4; - const SIZE: Option = Some(8); - const COUNT: Option = Some(3); - const NAME: &'static str = "floating point record"; +static FLOAT_INFO_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(8), + count: Some(3), + name: "floating point record", +}; +impl FloatInfoRecord { fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; + ext.check_size(&FLOAT_INFO_RECORD)?; let mut input = &ext.data[..]; let data: Vec = (0..3) @@ -2209,14 +2285,15 @@ where I: Debug, S: Debug; -impl ExtensionRecord for MultipleResponseRecord { - const SUBTYPE: u32 = 7; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "multiple response set record"; +static MULTIPLE_RESPONSE_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(1), + count: None, + name: "multiple response set record", +}; +impl MultipleResponseRecord { fn parse(ext: &Extension, _endian: Endian) -> Result { - ext.check_size::()?; + ext.check_size(&MULTIPLE_RESPONSE_RECORD)?; let mut input = &ext.data[..]; let mut sets = Vec::new(); @@ -2348,8 +2425,6 @@ pub struct VarDisplay { pub struct VarDisplayRecord(pub Vec); impl VarDisplayRecord { - const SUBTYPE: u32 = 11; - fn parse( ext: &Extension, var_types: &VarTypes, @@ -2427,14 +2502,15 @@ pub struct LongStringMissingValueRecord(pub Vec>) where N: Debug; -impl ExtensionRecord for LongStringMissingValueRecord { - const SUBTYPE: u32 = 22; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "long string missing values record"; +static LONG_STRING_MISSING_VALUE_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(1), + count: None, + name: "long string missing values record", +}; +impl LongStringMissingValueRecord { fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; + ext.check_size(&LONG_STRING_MISSING_VALUE_RECORD)?; let mut input = &ext.data[..]; let mut missing_value_set = Vec::new(); @@ -2494,14 +2570,15 @@ impl LongStringMissingValueRecord { #[derive(Clone, Debug)] pub struct EncodingRecord(pub String); -impl ExtensionRecord for EncodingRecord { - const SUBTYPE: u32 = 20; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "encoding record"; +static ENCODING_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(1), + count: None, + name: "encoding record", +}; +impl EncodingRecord { fn parse(ext: &Extension, _endian: Endian) -> Result { - ext.check_size::()?; + ext.check_size(&ENCODING_RECORD)?; Ok(Record::Encoding(EncodingRecord( String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName { @@ -2520,14 +2597,15 @@ pub struct NumberOfCasesRecord { pub n_cases: u64, } -impl ExtensionRecord for NumberOfCasesRecord { - const SUBTYPE: u32 = 16; - const SIZE: Option = Some(8); - const COUNT: Option = Some(2); - const NAME: &'static str = "extended number of cases record"; +static NUMBER_OF_CASES_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(8), + count: Some(2), + name: "extended number of cases record", +}; +impl NumberOfCasesRecord { fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; + ext.check_size(&NUMBER_OF_CASES_RECORD)?; let mut input = &ext.data[..]; let one = endian.parse(read_bytes(&mut input)?); @@ -2537,6 +2615,69 @@ impl ExtensionRecord for NumberOfCasesRecord { } } +#[derive(Clone, Debug)] +pub struct RawVariableSetRecord(TextRecord); + +impl RawVariableSetRecord { + fn parse(extension: Extension) -> Result { + Ok(Record::VariableSets(Self(TextRecord::parse( + extension, + "variable sets record", + )?))) + } + fn decode(self, decoder: &mut Decoder) -> VariableSetRecord { + let mut sets = Vec::new(); + let input = decoder.decode(&self.0.text); + for line in input.lines() { + if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&mut decoder.warn) { + sets.push(set) + } + } + VariableSetRecord { + offsets: self.0.offsets, + sets, + } + } +} + +#[derive(Clone, Debug)] +pub struct RawProductInfoRecord(TextRecord); + +impl RawProductInfoRecord { + fn parse(extension: Extension) -> Result { + Ok(Record::ProductInfo(Self(TextRecord::parse( + extension, + "product info record", + )?))) + } + fn decode(self, decoder: &mut Decoder) -> ProductInfoRecord { + ProductInfoRecord(decoder.decode(&self.0.text).into()) + } +} + +#[derive(Clone, Debug)] +pub struct RawLongNamesRecord(TextRecord); + +impl RawLongNamesRecord { + fn parse(extension: Extension) -> Result { + Ok(Record::LongNames(Self(TextRecord::parse( + extension, + "long names record", + )?))) + } + fn decode(self, decoder: &mut Decoder) -> LongNamesRecord { + let input = decoder.decode(&self.0.text); + let mut names = Vec::new(); + for pair in input.split('\t').filter(|s| !s.is_empty()) { + if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&mut decoder.warn) + { + names.push(long_name); + } + } + LongNamesRecord(names) + } +} + #[derive(Clone, Debug)] pub struct TextRecord { pub offsets: Range, @@ -2550,15 +2691,18 @@ pub struct TextRecord { #[derive(Clone, Copy, Debug)] pub enum TextRecordType { - VariableSets, - ProductInfo, - LongNames, - VeryLongStrings, - FileAttributes, VariableAttributes, } impl TextRecord { + fn parse(extension: Extension, name: &str) -> Result { + extension.check_size(&ExtensionRecord { + size: Some(1), + count: None, + name, + })?; + Ok(Self::new(extension, TextRecordType::VariableAttributes)) + } fn new(extension: Extension, rec_type: TextRecordType) -> Self { Self { offsets: extension.offsets, @@ -2568,21 +2712,6 @@ impl TextRecord { } pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { match self.rec_type { - TextRecordType::VariableSets => { - DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder)) - } - TextRecordType::ProductInfo => { - DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder)) - } - TextRecordType::LongNames => { - DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder)) - } - TextRecordType::VeryLongStrings => { - DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder)) - } - TextRecordType::FileAttributes => { - DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder)) - } TextRecordType::VariableAttributes => { DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder)) } @@ -2612,12 +2741,21 @@ impl VeryLongString { } } +#[derive(Clone, Debug)] +pub struct RawVeryLongStringsRecord(TextRecord); + #[derive(Clone, Debug)] pub struct VeryLongStringsRecord(pub Vec); -impl VeryLongStringsRecord { - fn decode(source: &TextRecord, decoder: &mut Decoder) -> Self { - let input = decoder.decode(&source.text); +impl RawVeryLongStringsRecord { + fn parse(extension: Extension) -> Result { + Ok(Record::VeryLongStrings(Self(TextRecord::parse( + extension, + "very long strings record", + )?))) + } + fn decode(self, decoder: &mut Decoder) -> VeryLongStringsRecord { + let input = decoder.decode(&self.0.text); let mut very_long_strings = Vec::new(); for tuple in input .split('\0') @@ -2694,20 +2832,29 @@ impl Attributes { } } +#[derive(Clone, Debug)] +pub struct RawFileAttributesRecord(TextRecord); + #[derive(Clone, Debug, Default)] -pub struct FileAttributeRecord(pub Attributes); +pub struct FileAttributesRecord(pub Attributes); -impl FileAttributeRecord { - fn decode(source: &TextRecord, decoder: &mut Decoder) -> Self { - let input = decoder.decode(&source.text); +impl RawFileAttributesRecord { + fn parse(extension: Extension) -> Result { + Ok(Record::FileAttributes(Self(TextRecord::parse( + extension, + "file attributes record", + )?))) + } + fn decode(self, decoder: &mut Decoder) -> FileAttributesRecord { + let input = decoder.decode(&self.0.text); match Attributes::parse(decoder, &input, None).issue_warning(&mut decoder.warn) { Some((set, rest)) => { if !rest.is_empty() { decoder.warn(Warning::TBD); } - FileAttributeRecord(set) + FileAttributesRecord(set) } - None => FileAttributeRecord::default(), + None => FileAttributesRecord::default(), } } } @@ -2789,28 +2936,9 @@ impl LongName { #[derive(Clone, Debug)] pub struct LongNamesRecord(pub Vec); -impl LongNamesRecord { - fn decode(source: &TextRecord, decoder: &mut Decoder) -> Self { - let input = decoder.decode(&source.text); - let mut names = Vec::new(); - for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&mut decoder.warn) - { - names.push(long_name); - } - } - LongNamesRecord(names) - } -} - #[derive(Clone, Debug)] pub struct ProductInfoRecord(pub String); -impl ProductInfoRecord { - fn decode(source: &TextRecord, decoder: &mut Decoder) -> Self { - Self(decoder.decode(&source.text).into()) - } -} #[derive(Clone, Debug)] pub struct VariableSet { pub name: String, @@ -2846,22 +2974,6 @@ pub struct VariableSetRecord { pub sets: Vec, } -impl VariableSetRecord { - fn decode(source: &TextRecord, decoder: &mut Decoder) -> VariableSetRecord { - let mut sets = Vec::new(); - let input = decoder.decode(&source.text); - for line in input.lines() { - if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&mut decoder.warn) { - sets.push(set) - } - } - VariableSetRecord { - offsets: source.offsets.clone(), - sets, - } - } -} - trait IssueWarning { fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option; } @@ -2895,26 +3007,28 @@ pub struct Extension { } impl Extension { - fn check_size(&self) -> Result<(), Warning> { - if let Some(expected_size) = E::SIZE { - if self.size != expected_size { + fn check_size(&self, expected: &ExtensionRecord) -> Result<(), Warning> { + match expected.size { + Some(expected_size) if self.size != expected_size => { return Err(Warning::BadRecordSize { offset: self.offsets.start, - record: E::NAME.into(), + record: expected.name.into(), size: self.size, expected_size, }); } + _ => (), } - if let Some(expected_count) = E::COUNT { - if self.count != expected_count { + match expected.count { + Some(expected_count) if self.count != expected_count => { return Err(Warning::BadRecordCount { offset: self.offsets.start, - record: E::NAME.into(), + record: expected.name.into(), count: self.count, expected_count, }); } + _ => (), } Ok(()) } @@ -2948,39 +3062,19 @@ impl Extension { data, }; let result = match subtype { - IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian), - FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian), - VarDisplayRecord::SUBTYPE => { - VarDisplayRecord::parse(&extension, var_types, endian, warn) - } - MultipleResponseRecord::SUBTYPE | 19 => { - MultipleResponseRecord::parse(&extension, endian) - } - LongStringValueLabelRecord::SUBTYPE => { - LongStringValueLabelRecord::parse(&extension, endian) - } - EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian), - NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian), - 5 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::VariableSets, - ))), - 10 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::ProductInfo, - ))), - 13 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::LongNames, - ))), - 14 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::VeryLongStrings, - ))), - 17 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::FileAttributes, - ))), + 3 => IntegerInfoRecord::parse(&extension, endian), + 4 => FloatInfoRecord::parse(&extension, endian), + 11 => VarDisplayRecord::parse(&extension, var_types, endian, warn), + 7 | 19 => MultipleResponseRecord::parse(&extension, endian), + 21 => LongStringValueLabelRecord::parse(&extension, endian), + 22 => LongStringMissingValueRecord::parse(&extension, endian), + 20 => EncodingRecord::parse(&extension, endian), + 16 => NumberOfCasesRecord::parse(&extension, endian), + 5 => RawVariableSetRecord::parse(extension), + 10 => RawProductInfoRecord::parse(extension), + 13 => RawLongNamesRecord::parse(extension), + 14 => RawVeryLongStringsRecord::parse(extension), + 17 => RawFileAttributesRecord::parse(extension), 18 => Ok(Record::Text(TextRecord::new( extension, TextRecordType::VariableAttributes, @@ -3184,14 +3278,15 @@ where N: Debug, S: Debug; -impl ExtensionRecord for LongStringValueLabelRecord { - const SUBTYPE: u32 = 21; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "long string value labels record"; +static LONG_STRING_VALUE_LABEL_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(1), + count: None, + name: "long string value labels record", +}; +impl LongStringValueLabelRecord { fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; + ext.check_size(&LONG_STRING_VALUE_LABEL_RECORD)?; let mut input = &ext.data[..]; let mut label_set = Vec::new(); diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 978c10df63..30021daff2 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -157,6 +157,21 @@ fn invalid_missing_indicator2() { test_sysfile("invalid_missing_indicator2"); } +#[test] +fn missing_string_continuation() { + test_sysfile("missing_string_continuation"); +} + +#[test] +fn invalid_variable_format() { + test_sysfile("invalid_variable_format"); +} + +#[test] +fn invalid_long_string_missing_values() { + test_sysfile("invalid_long_string_missing_values"); +} + /// Duplicate variable name handling negative test. /// /// SPSS-generated system file can contain duplicate variable names (see bug diff --git a/rust/pspp/src/sys/testdata/invalid_long_string_missing_values.expected b/rust/pspp/src/sys/testdata/invalid_long_string_missing_values.expected new file mode 100644 index 0000000000..e69de29bb2 diff --git a/rust/pspp/src/sys/testdata/invalid_long_string_missing_values.sack b/rust/pspp/src/sys/testdata/invalid_long_string_missing_values.sack new file mode 100644 index 0000000000..d2db3d8070 --- /dev/null +++ b/rust/pspp/src/sys/testdata/invalid_long_string_missing_values.sack @@ -0,0 +1,61 @@ +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; # Layout code +7; # Nominal case size +0; # Not compressed +0; # Not weighted +1; # 1 case. +100.0; # Bias. +"01 Jan 11"; "20:53:52"; +"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; +i8 0 *3; + +# One numeric variable. +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; + +# Long string variables that will have missing values added with a +# later record. +2; 9; 0; 0; 0x010900 *2; s8 "STR1"; +2; -1; 0; 0; 0; 0; s8 ""; +2; 10; 0; 0; 0x010a00 *2; s8 "STR2"; +2; -1; 0; 0; 0; 0; s8 ""; +2; 11; 0; 0; 0x010b00 *2; s8 "STR3"; +2; -1; 0; 0; 0; 0; s8 ""; + +# Machine integer info record. +7; 3; 4; 8; 1; 2; 3; -1; 1; 1; ENDIAN; 1252; + +# Machine floating-point info record. +7; 4; 8; 3; SYSMIS; HIGHEST; LOWEST; + +# Long string variable missing values record. +7; 22; 1; COUNT ( +# Zero missing values (not allowed) for STR1 . +COUNT("STR1"); i8 >>0<<; 8; + +# Four missing values (not allowed) for STR2. +COUNT("STR2"); i8 4; 8; +"abcdefgh"; "ijklmnop"; "qrstuvwx"; "yz012345"; + +# Missing values for unknown variable +COUNT(>>"Nonexistent"<<); i8 1; 8; "abcdefgh"; + +# Missing values for numeric variable +COUNT(>>"NUM1"<<); i8 1; 8; "abcdefgh"; + +# Too long missing value +COUNT("STR3"); i8 1; >>COUNT("abcdefghijkl")<<; + +# Buggy way that this was written in old PSPP, with a length +# before each missing value instead of just once. +COUNT("STR3"); i8 2; 8; "ABCDEFGH"; >>8<<; "IJKLMNOP"; +); + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# Dictionary termination record. +999; 0; +s8 "abcd"; s8 "efgh"; s8 "ijkl"; s8 "mnop"; s8 "qrst"; s8 "uvwx"; +s16 "yzABCDEFGHI"; s16 "JKLMNOPQR"; s16 "STUVWXYZ01"; +s16 "23456789abc"; s32 "defghijklmnopqstuvwxyzABC"; diff --git a/rust/pspp/src/sys/testdata/invalid_variable_format.expected b/rust/pspp/src/sys/testdata/invalid_variable_format.expected new file mode 100644 index 0000000000..9969410ab1 --- /dev/null +++ b/rust/pspp/src/sys/testdata/invalid_variable_format.expected @@ -0,0 +1,37 @@ +Substituting F8.2 for invalid print format on variable NUM1. Unknown format type 255. + +Substituting F8.2 for invalid write format on variable NUM1. Unknown format type 0. + +Substituting F8.2 for invalid print format on variable VAR1. Numeric variable is not compatible with string format A. + +Substituting F8.2 for invalid write format on variable VAR1. Numeric variable is not compatible with string format AHEX. + +Substituting A4 for invalid print format on variable STR1. String variable is not compatible with numeric format F. + +Substituting A4 for invalid write format on variable STR1. String variable is not compatible with numeric format E. + +Substituting A4 for invalid print format on variable STR2. String variable with width 4 is not compatible with format A8. Use format A4 instead. + +Substituting A4 for invalid write format on variable STR2. String variable with width 4 is not compatible with format AHEX4. Use format AHEX8 instead. + +╭──────────────────────┬────────────────────────╮ +│ Created │ 01-JAN-2011 20:53:52│ +├──────────────────────┼────────────────────────┤ +│Writer Product │PSPP synthetic test file│ +├──────────────────────┼────────────────────────┤ +│ Compression │SAV │ +│ Number of Cases│Unknown │ +╰──────────────────────┴────────────────────────╯ + +╭─────────┬─╮ +│Variables│4│ +╰─────────┴─╯ + +╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│num1│ 1│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│var1│ 2│ │ │Input│ 8│Right │F8.2 │F8.2 │ │ +│str1│ 3│ │Nominal │Input│ 4│Left │A4 │A4 │ │ +│str2│ 4│ │Nominal │Input│ 4│Left │A4 │A4 │ │ +╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ diff --git a/rust/pspp/src/sys/testdata/invalid_variable_format.sack b/rust/pspp/src/sys/testdata/invalid_variable_format.sack new file mode 100644 index 0000000000..1064a14f26 --- /dev/null +++ b/rust/pspp/src/sys/testdata/invalid_variable_format.sack @@ -0,0 +1,23 @@ +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; 4; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; + +# Numeric variable, invalid format types. +# No warning is issued for type 0 because it has been observed in real +# system files. +2; 0; 0; 0; >>0xff0800; 0<<; s8 "NUM1"; + +# Numeric variable, string formats. +2; 0; 0; 0; >>0x010800<<; >>0x021000<<; s8 "VAR1"; + +# String variable, numeric formats. +2; 4; 0; 0; >>0x050800<<; >>0x110a01<<; s8 "STR1"; + +# String variable, wrong width formats. +2; 4; 0; 0; >>0x010800<<; >>0x020400<<; s8 "STR2"; + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# End of dictionary. +999; 0; diff --git a/rust/pspp/src/sys/testdata/missing_string_continuation.expected b/rust/pspp/src/sys/testdata/missing_string_continuation.expected new file mode 100644 index 0000000000..3bb98b0f40 --- /dev/null +++ b/rust/pspp/src/sys/testdata/missing_string_continuation.expected @@ -0,0 +1,21 @@ +Variable index 0 is a 10-byte string that should be followed by long string continuation records through index 1 (inclusive), but index 1 is not a continuation + +╭──────────────────────┬────────────────────────╮ +│ Created │ 01-JAN-2011 20:53:52│ +├──────────────────────┼────────────────────────┤ +│Writer Product │PSPP synthetic test file│ +├──────────────────────┼────────────────────────┤ +│ Compression │SAV │ +│ Number of Cases│Unknown │ +╰──────────────────────┴────────────────────────╯ + +╭─────────┬─╮ +│Variables│2│ +╰─────────┴─╯ + +╭────┬────────┬─────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────╮ +│ │Position│Label│Measurement Level│ Role│Width│Alignment│Print Format│Write Format│Missing Values│ +├────┼────────┼─────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────┤ +│var1│ 1│ │Nominal │Input│ 10│Left │A10 │A10 │ │ +│var2│ 2│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯ diff --git a/rust/pspp/src/sys/testdata/missing_string_continuation.sack b/rust/pspp/src/sys/testdata/missing_string_continuation.sack new file mode 100644 index 0000000000..4abfe02faf --- /dev/null +++ b/rust/pspp/src/sys/testdata/missing_string_continuation.sack @@ -0,0 +1,13 @@ +# File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; 2; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; + +# String variable. +2; 10; 0; 0; 0x010a00 *2; s8 "VAR1"; +>>2; 0; 0; 0; 0x050800 *2; s8 "VAR2";<< + +# Character encoding record. +7; 20; 1; 12; "windows-1252"; + +# End of dictionary. +999; 0; diff --git a/rust/pspp/src/sys/testdata/variable_labels_and_missing_values.expected b/rust/pspp/src/sys/testdata/variable_labels_and_missing_values.expected index f3c6549215..f7e5a27ad2 100644 --- a/rust/pspp/src/sys/testdata/variable_labels_and_missing_values.expected +++ b/rust/pspp/src/sys/testdata/variable_labels_and_missing_values.expected @@ -13,28 +13,28 @@ │Variables│ 21│ ╰─────────┴──────────────────────────────╯ -╭────────────────────────────────┬────────┬────────────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬──────────────────────╮ -│ │Position│ Label │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│ Missing Values │ -├────────────────────────────────┼────────┼────────────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼──────────────────────┤ -│num1 │ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ -│Numeric variable 2's label (ùúû)│ 2│Numeric variable 2's label (ùúû)│ │Input│ 8│Right │F8.0 │F8.0 │ │ -│num3 │ 3│ │ │Input│ 8│Right │F8.0 │F8.0 │1 │ -│Another numeric variable label │ 4│Another numeric variable label │ │Input│ 8│Right │F8.0 │F8.0 │1 │ -│num5 │ 5│ │ │Input│ 8│Right │F8.0 │F8.0 │1; 2 │ -│num6 │ 6│ │ │Input│ 8│Right │F8.0 │F8.0 │1; 2; 3 │ -│num7 │ 7│ │ │Input│ 8│Right │F8.0 │F8.0 │1 THRU 3 │ -│num8 │ 8│ │ │Input│ 8│Right │F8.0 │F8.0 │1 THRU 3; 5 │ -│num9 │ 9│ │ │Input│ 8│Right │F8.0 │F8.0 │1 THRU HIGH; -5 │ -│numàèìñò │ 10│ │ │Input│ 8│Right │F8.0 │F8.0 │LOW THRU 1; 5 │ -│str1 │ 11│ │Nominal │Input│ 4│Left │A4 │A4 │ │ -│String variable 2's label │ 12│String variable 2's label │Nominal │Input│ 4│Left │A4 │A4 │ │ -│str3 │ 13│ │Nominal │Input│ 4│Left │A4 │A4 │"MISS" │ -│Another string variable label │ 14│Another string variable label │Nominal │Input│ 4│Left │A4 │A4 │"OTHR" │ -│str5 │ 15│ │Nominal │Input│ 4│Left │A4 │A4 │"MISS"; "OTHR" │ -│str6 │ 16│ │Nominal │Input│ 4│Left │A4 │A4 │"MISS"; "OTHR"; "MORE"│ -│str7 │ 17│ │Nominal │Input│ 11│Left │A11 │A11 │"first8by" │ -│str8 │ 18│ │Nominal │Input│ 9│Left │A9 │A9 │ │ -│str9 │ 19│ │Nominal │Input│ 10│Left │A10 │A10 │ │ -│str10 │ 20│ │Nominal │Input│ 11│Left │A11 │A11 │ │ -│25-byte string │ 21│25-byte string │Nominal │Input│ 25│Left │A25 │A25 │ │ -╰────────────────────────────────┴────────┴────────────────────────────────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────────────╯ +╭────────────────────────────────┬────────┬────────────────────────────────┬─────────────────┬─────┬─────┬─────────┬────────────┬────────────┬───────────────────────────────────────────╮ +│ │Position│ Label │Measurement Level│ Role│Width│Alignment│Print Format│Write Format│ Missing Values │ +├────────────────────────────────┼────────┼────────────────────────────────┼─────────────────┼─────┼─────┼─────────┼────────────┼────────────┼───────────────────────────────────────────┤ +│num1 │ 1│ │ │Input│ 8│Right │F8.0 │F8.0 │ │ +│Numeric variable 2's label (ùúû)│ 2│Numeric variable 2's label (ùúû)│ │Input│ 8│Right │F8.0 │F8.0 │ │ +│num3 │ 3│ │ │Input│ 8│Right │F8.0 │F8.0 │1 │ +│Another numeric variable label │ 4│Another numeric variable label │ │Input│ 8│Right │F8.0 │F8.0 │1 │ +│num5 │ 5│ │ │Input│ 8│Right │F8.0 │F8.0 │1; 2 │ +│num6 │ 6│ │ │Input│ 8│Right │F8.0 │F8.0 │1; 2; 3 │ +│num7 │ 7│ │ │Input│ 8│Right │F8.0 │F8.0 │1 THRU 3 │ +│num8 │ 8│ │ │Input│ 8│Right │F8.0 │F8.0 │1 THRU 3; 5 │ +│num9 │ 9│ │ │Input│ 8│Right │F8.0 │F8.0 │1 THRU HIGH; -5 │ +│numàèìñò │ 10│ │ │Input│ 8│Right │F8.0 │F8.0 │LOW THRU 1; 5 │ +│str1 │ 11│ │Nominal │Input│ 4│Left │A4 │A4 │ │ +│String variable 2's label │ 12│String variable 2's label │Nominal │Input│ 4│Left │A4 │A4 │ │ +│str3 │ 13│ │Nominal │Input│ 4│Left │A4 │A4 │"MISS" │ +│Another string variable label │ 14│Another string variable label │Nominal │Input│ 4│Left │A4 │A4 │"OTHR" │ +│str5 │ 15│ │Nominal │Input│ 4│Left │A4 │A4 │"MISS"; "OTHR" │ +│str6 │ 16│ │Nominal │Input│ 4│Left │A4 │A4 │"MISS"; "OTHR"; "MORE" │ +│str7 │ 17│ │Nominal │Input│ 11│Left │A11 │A11 │"first8by" │ +│str8 │ 18│ │Nominal │Input│ 9│Left │A9 │A9 │"abcdefgh " │ +│str9 │ 19│ │Nominal │Input│ 10│Left │A10 │A10 │"abcdefgh "; "01234567 " │ +│str10 │ 20│ │Nominal │Input│ 11│Left │A11 │A11 │"abcdefgh "; "01234567 "; "0 "│ +│25-byte string │ 21│25-byte string │Nominal │Input│ 25│Left │A25 │A25 │ │ +╰────────────────────────────────┴────────┴────────────────────────────────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴───────────────────────────────────────────╯ -- 2.30.2