From: Ben Pfaff Date: Tue, 10 Jun 2025 00:20:46 +0000 (-0700) Subject: compression warning X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=68515e7d35637791ad1671ddb77f062520edc213;p=pspp compression warning --- diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index 7ffe019775..4f557d9366 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -88,7 +88,7 @@ fn dissect( ) -> Result<()> { let reader = File::open(file_name)?; let reader = BufReader::new(reader); - let mut reader = Reader::new(reader, |warning| println!("{warning}"))?; + let mut reader = Reader::new(reader, Box::new(|warning| println!("{warning}")))?; match mode { Mode::Identify => { @@ -121,11 +121,11 @@ fn dissect( let headers: Vec = reader.collect::, _>>()?; let encoding = match encoding { Some(encoding) => encoding, - None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?, + None => encoding_from_headers(&headers, &mut |e| eprintln!("{e}"))?, }; - let decoder = Decoder::new(encoding, |e| eprintln!("{e}")); + let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}")); for header in headers { - let header = header.decode(&decoder); + let header = header.decode(&mut decoder); println!("{:?}", header); /* if let Record::Cases(cases) = header { @@ -144,12 +144,12 @@ fn dissect( let headers: Vec = reader.collect::, _>>()?; let encoding = match encoding { Some(encoding) => encoding, - None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?, + None => encoding_from_headers(&headers, &mut |e| eprintln!("{e}"))?, }; - let decoder = Decoder::new(encoding, |e| eprintln!("{e}")); + let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}")); let mut decoded_records = Vec::new(); for header in headers { - decoded_records.push(header.decode(&decoder)?); + decoded_records.push(header.decode(&mut decoder)?); } let headers = Headers::new(decoded_records, &mut |e| eprintln!("{e}"))?; let (dictionary, metadata) = decode(headers, encoding, |e| eprintln!("{e}"))?; diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 014134c305..f931d38a50 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -260,6 +260,9 @@ pub enum Warning { #[error("Attribute record missing quotations, in {0:?}.")] AttributeMissingQuotes(String), + #[error("Compression bias is {0} instead of the usual values of 0 or 100.")] + UnexpectedBias(f64), + #[error("Details TBD (raw)")] TBD, } @@ -324,7 +327,7 @@ impl Record { reader: &mut R, endian: Endian, var_types: &VarTypes, - warn: &dyn Fn(Warning), + warn: &mut dyn FnMut(Warning), ) -> Result, Error> where R: Read + Seek, @@ -345,7 +348,7 @@ impl Record { } } - pub fn decode(self, decoder: &Decoder) -> Result { + pub fn decode(self, decoder: &mut Decoder) -> Result { Ok(match self { Record::Header(record) => record.decode(decoder), Record::Variable(record) => record.decode(decoder), @@ -375,7 +378,7 @@ impl Record { pub fn encoding_from_headers( headers: &Vec, - warn: &impl Fn(Warning), + warn: &mut impl FnMut(Warning), ) -> Result<&'static Encoding, Error> { let mut encoding_record = None; let mut integer_info_record = None; @@ -493,7 +496,7 @@ where } impl HeaderRecord { - fn read(r: &mut R) -> Result { + fn read(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result { let start = r.stream_position()?; let magic: [u8; 4] = read_bytes(r)?; @@ -526,6 +529,9 @@ impl HeaderRecord { let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); let bias: f64 = endian.parse(read_bytes(r)?); + if bias != 100.0 && bias != 0.0 { + warn(Warning::UnexpectedBias(bias)); + } let creation_date = RawString(read_vec(r, 9)?); let creation_time = RawString(read_vec(r, 8)?); @@ -549,7 +555,7 @@ impl HeaderRecord { }) } - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { let eye_catcher = decoder.decode(&self.eye_catcher).to_string(); let file_label = decoder.decode(&self.file_label).to_string(); let creation_date = decoder.decode(&self.creation_date).to_string(); @@ -572,25 +578,25 @@ impl HeaderRecord { } } -pub struct Decoder { +pub struct Decoder<'a> { pub encoding: &'static Encoding, - pub warn: Box, + pub warn: Box, } -impl Decoder { +impl<'de> Decoder<'de> { pub fn new(encoding: &'static Encoding, warn: F) -> Self where - F: Fn(Warning) + 'static, + F: FnMut(Warning) + 'de, { Self { encoding, warn: Box::new(warn), } } - fn warn(&self, warning: Warning) { + fn warn(&mut self, warning: Warning) { (self.warn)(warning) } - fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + fn decode_slice<'a>(&mut self, input: &'a [u8]) -> Cow<'a, str> { let (output, malformed) = self.encoding.decode_without_bom_handling(input); if malformed { self.warn(Warning::MalformedString { @@ -601,12 +607,13 @@ impl Decoder { output } - fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> { + fn decode<'a>(&mut self, input: &'a RawString) -> Cow<'a, str> { self.decode_slice(input.0.as_slice()) } - pub fn decode_identifier(&self, input: &RawString) -> Result { - self.new_identifier(&self.decode(input)) + pub fn decode_identifier(&mut self, input: &RawString) -> Result { + let decoded = &self.decode(input); + self.new_identifier(decoded) } pub fn new_identifier(&self, name: &str) -> Result { @@ -885,12 +892,12 @@ enum ReaderState { End, } -pub struct Reader +pub struct Reader<'a, R> where R: Read + Seek + 'static, { reader: Option, - warn: Box, + warn: Box, header: HeaderRecord, var_types: VarTypes, @@ -898,15 +905,12 @@ where state: ReaderState, } -impl Reader +impl<'a, R> Reader<'a, R> where R: Read + Seek + 'static, { - pub fn new(mut reader: R, warn: F) -> Result - where - F: Fn(Warning) + 'static, - { - let header = HeaderRecord::read(&mut reader)?; + pub fn new(mut reader: R, mut warn: impl FnMut(Warning) + 'a) -> Result { + let header = HeaderRecord::read(&mut reader, &mut warn)?; Ok(Self { reader: Some(reader), warn: Box::new(warn), @@ -935,7 +939,7 @@ where self.reader.as_mut().unwrap(), self.header.endian, &self.var_types, - &self.warn, + &mut self.warn, ) { Ok(Some(record)) => break record, Ok(None) => (), @@ -988,7 +992,7 @@ where } } -impl Iterator for Reader +impl<'a, R> Iterator for Reader<'a, R> where R: Read + Seek + 'static, { @@ -1179,7 +1183,7 @@ impl MissingValues { raw_width: RawWidth, code: i32, endian: Endian, - warn: &dyn Fn(Warning), + warn: &mut dyn FnMut(Warning), ) -> Result { let (individual_values, has_range) = match code { 0 => return Ok(Self::default()), @@ -1419,7 +1423,7 @@ impl VariableRecord { fn read( r: &mut R, endian: Endian, - warn: &dyn Fn(Warning), + warn: &mut dyn FnMut(Warning), ) -> Result { let start_offset = r.stream_position()?; let width: i32 = endian.parse(read_bytes(r)?); @@ -1471,7 +1475,7 @@ impl VariableRecord { })) } - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { DecodedRecord::Variable(VariableRecord { offsets: self.offsets.clone(), width: self.width, @@ -1810,7 +1814,7 @@ impl ValueLabelRecord, RawString> { r: &mut R, endian: Endian, var_types: &VarTypes, - warn: &dyn Fn(Warning), + warn: &mut dyn FnMut(Warning), ) -> Result, Error> { let label_offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); @@ -1914,7 +1918,7 @@ impl ValueLabelRecord, RawString> { }))) } - fn decode(self, decoder: &Decoder) -> ValueLabelRecord, String> { + fn decode(self, decoder: &mut Decoder) -> ValueLabelRecord, String> { let labels = self .labels .iter() @@ -1983,7 +1987,7 @@ impl DocumentRecord { } } - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { DecodedRecord::Document(DocumentRecord { offsets: self.offsets.clone(), lines: self @@ -2172,14 +2176,14 @@ impl MultipleResponseSet { fn decode( &self, - decoder: &Decoder, + decoder: &mut Decoder, ) -> Result, Warning> { let mut short_names = Vec::with_capacity(self.short_names.len()); for short_name in self.short_names.iter() { if let Some(short_name) = decoder .decode_identifier(short_name) .map_err(Warning::InvalidMrSetName) - .issue_warning(&decoder.warn) + .issue_warning(&mut decoder.warn) { short_names.push(short_name); } @@ -2228,10 +2232,10 @@ impl ExtensionRecord for MultipleResponseRecord { } impl MultipleResponseRecord { - fn decode(self, decoder: &Decoder) -> DecodedRecord { + fn decode(self, decoder: &mut Decoder) -> DecodedRecord { let mut sets = Vec::new(); for set in self.0.iter() { - if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) { + if let Some(set) = set.decode(decoder).issue_warning(&mut decoder.warn) { sets.push(set); } } @@ -2346,7 +2350,7 @@ impl VarDisplayRecord { ext: &Extension, var_types: &VarTypes, endian: Endian, - warn: &dyn Fn(Warning), + warn: &mut dyn FnMut(Warning), ) -> Result { if ext.size != 4 { return Err(Warning::BadRecordSize { @@ -2374,11 +2378,11 @@ impl VarDisplayRecord { let mut input = &ext.data[..]; for _ in 0..n_vars { let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .issue_warning(&warn) + .issue_warning(warn) .flatten(); let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap())); let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .issue_warning(&warn) + .issue_warning(warn) .flatten(); var_displays.push(VarDisplay { measure, @@ -2403,7 +2407,10 @@ where } impl LongStringMissingValues { - fn decode(&self, decoder: &Decoder) -> Result, IdError> { + fn decode( + &self, + decoder: &mut Decoder, + ) -> Result, IdError> { Ok(LongStringMissingValues { var_name: decoder.decode_identifier(&self.var_name)?, missing_values: self.missing_values.clone(), @@ -2465,13 +2472,13 @@ impl ExtensionRecord for LongStringMissingValueRecord { } impl LongStringMissingValueRecord { - pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord { + pub fn decode(self, decoder: &mut Decoder) -> LongStringMissingValueRecord { let mut mvs = Vec::with_capacity(self.0.len()); for mv in self.0.iter() { if let Some(mv) = mv .decode(decoder) .map_err(Warning::InvalidLongStringMissingValueVariableName) - .issue_warning(&decoder.warn) + .issue_warning(&mut decoder.warn) { mvs.push(mv); } @@ -2555,7 +2562,7 @@ impl TextRecord { text: extension.data.into(), } } - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { match self.rec_type { TextRecordType::VariableSets => { DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder)) @@ -2605,7 +2612,7 @@ impl VeryLongString { pub struct VeryLongStringsRecord(pub Vec); impl VeryLongStringsRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + fn decode(source: &TextRecord, decoder: &mut Decoder) -> Self { let input = decoder.decode(&source.text); let mut very_long_strings = Vec::new(); for tuple in input @@ -2613,7 +2620,9 @@ impl VeryLongStringsRecord { .map(|s| s.trim_start_matches('\t')) .filter(|s| !s.is_empty()) { - if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) { + if let Some(vls) = + VeryLongString::parse(decoder, tuple).issue_warning(&mut decoder.warn) + { very_long_strings.push(vls) } } @@ -2628,7 +2637,7 @@ pub struct Attribute { } impl Attribute { - fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> { + fn parse<'a>(decoder: &mut Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> { let Some((name, mut input)) = input.split_once('(') else { return Err(Warning::AttributeMissingLParen(input.into())); }; @@ -2660,7 +2669,7 @@ impl Attribute { impl Attributes { fn parse<'a>( - decoder: &Decoder, + decoder: &mut Decoder, mut input: &'a str, sentinel: Option, ) -> Result<(Attributes, &'a str), Warning> { @@ -2685,9 +2694,9 @@ impl Attributes { pub struct FileAttributeRecord(pub Attributes); impl FileAttributeRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + fn decode(source: &TextRecord, decoder: &mut Decoder) -> Self { let input = decoder.decode(&source.text); - match Attributes::parse(decoder, &input, None).issue_warning(&decoder.warn) { + match Attributes::parse(decoder, &input, None).issue_warning(&mut decoder.warn) { Some((set, rest)) => { if !rest.is_empty() { decoder.warn(Warning::TBD); @@ -2706,7 +2715,10 @@ pub struct VarAttributes { } impl VarAttributes { - fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributes, &'a str), Warning> { + fn parse<'a>( + decoder: &mut Decoder, + input: &'a str, + ) -> Result<(VarAttributes, &'a str), Warning> { let Some((long_var_name, rest)) = input.split_once(':') else { return Err(Warning::TBD); }; @@ -2727,13 +2739,13 @@ impl VarAttributes { pub struct VariableAttributeRecord(pub Vec); impl VariableAttributeRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + fn decode(source: &TextRecord, decoder: &mut Decoder) -> Self { let decoded = decoder.decode(&source.text); let mut input = decoded.as_ref(); let mut var_attribute_sets = Vec::new(); while !input.is_empty() { let Some((var_attribute, rest)) = - VarAttributes::parse(decoder, input).issue_warning(&decoder.warn) + VarAttributes::parse(decoder, input).issue_warning(&mut decoder.warn) else { break; }; @@ -2774,11 +2786,12 @@ impl LongName { pub struct LongNamesRecord(pub Vec); impl LongNamesRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + fn decode(source: &TextRecord, decoder: &mut Decoder) -> Self { let input = decoder.decode(&source.text); let mut names = Vec::new(); for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) { + if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&mut decoder.warn) + { names.push(long_name); } } @@ -2790,7 +2803,7 @@ impl LongNamesRecord { pub struct ProductInfoRecord(pub String); impl ProductInfoRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + fn decode(source: &TextRecord, decoder: &mut Decoder) -> Self { Self(decoder.decode(&source.text).into()) } } @@ -2801,7 +2814,7 @@ pub struct VariableSet { } impl VariableSet { - fn parse(input: &str, decoder: &Decoder) -> Result { + fn parse(input: &str, decoder: &mut Decoder) -> Result { let (name, input) = input .split_once('=') .ok_or(Warning::VariableSetMissingEquals)?; @@ -2811,7 +2824,7 @@ impl VariableSet { .new_identifier(var) .and_then(Identifier::must_be_ordinary) .map_err(Warning::InvalidVariableSetName) - .issue_warning(&decoder.warn) + .issue_warning(&mut decoder.warn) { vars.push(identifier); } @@ -2830,11 +2843,11 @@ pub struct VariableSetRecord { } impl VariableSetRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord { + fn decode(source: &TextRecord, decoder: &mut Decoder) -> VariableSetRecord { let mut sets = Vec::new(); let input = decoder.decode(&source.text); for line in input.lines() { - if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) { + if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&mut decoder.warn) { sets.push(set) } } @@ -2846,15 +2859,10 @@ impl VariableSetRecord { } trait IssueWarning { - fn issue_warning(self, warn: &F) -> Option - where - F: Fn(Warning); + fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option; } impl IssueWarning for Result { - fn issue_warning(self, warn: &F) -> Option - where - F: Fn(Warning), - { + fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option { match self { Ok(result) => Some(result), Err(error) => { @@ -2911,7 +2919,7 @@ impl Extension { r: &mut R, endian: Endian, var_types: &VarTypes, - warn: &dyn Fn(Warning), + warn: &mut dyn FnMut(Warning), ) -> Result, Error> { let subtype = endian.parse(read_bytes(r)?); let header_offset = r.stream_position()?; @@ -3146,7 +3154,7 @@ where impl LongStringValueLabels { fn decode( &self, - decoder: &Decoder, + decoder: &mut Decoder, ) -> Result, Warning> { let var_name = decoder.decode(&self.var_name); let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding) @@ -3206,7 +3214,7 @@ impl ExtensionRecord for LongStringValueLabelRecord { } impl LongStringValueLabelRecord { - fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord { + fn decode(self, decoder: &mut Decoder) -> LongStringValueLabelRecord { let mut labels = Vec::with_capacity(self.0.len()); for label in &self.0 { match label.decode(decoder) { diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index e5f5f914fe..267c54b439 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -97,6 +97,16 @@ fn compressed_data() { test_sysfile("compressed_data"); } +#[test] +fn compressed_data_zero_bias() { + test_sysfile("compressed_data_zero_bias"); +} + +#[test] +fn compressed_data_other_bias() { + test_sysfile("compressed_data_other_bias"); +} + fn test_sysfile(name: &str) { let input_filename = Path::new(env!("CARGO_MANIFEST_DIR")) .join("src/sys/testdata") @@ -108,14 +118,17 @@ fn test_sysfile(name: &str) { for endian in all::() { let sysfile = sack(&input, Some(&input_filename), endian).unwrap(); let cursor = Cursor::new(sysfile); - let reader = Reader::new(cursor, |warning| println!("{warning}")).unwrap(); + let mut warnings = Vec::new(); + let reader = Reader::new(cursor, |warning| warnings.push(warning)).unwrap(); let headers: Vec = reader.collect::, _>>().unwrap(); - let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}")).unwrap(); - let decoder = Decoder::new(encoding, |e| eprintln!("{e}")); + let encoding = + encoding_from_headers(&headers, &mut |warning| warnings.push(warning)).unwrap(); + let mut decoder = Decoder::new(encoding, |warning| warnings.push(warning)); let mut decoded_records = Vec::new(); for header in headers { - decoded_records.push(header.decode(&decoder).unwrap()); + decoded_records.push(header.decode(&mut decoder).unwrap()); } + drop(decoder); let mut errors = Vec::new(); let headers = Headers::new(decoded_records, &mut |e| errors.push(e)).unwrap(); @@ -135,6 +148,11 @@ fn test_sysfile(name: &str) { .map(|(row, value)| ([row], value)), ); let mut output = Vec::new(); + output.extend( + warnings + .into_iter() + .map(|warning| Arc::new(Item::from(Text::new_log(warning.to_string())))), + ); output.extend( errors .into_iter()