From: Ben Pfaff Date: Mon, 7 Jul 2025 00:08:36 +0000 (-0700) Subject: minor cleanup X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=484e4c84f1e39e2f82d4acdd5ae59529d42f7555;p=pspp minor cleanup --- diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index 57ca1bb260..b8fffb3c64 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -17,7 +17,7 @@ use anyhow::Result; use clap::{Args, Parser, Subcommand, ValueEnum}; use encoding_rs::Encoding; -use pspp::sys::cooked::{decode, Error, Headers}; +use pspp::sys::cooked::{Error, Headers}; use pspp::sys::raw::{encoding_from_headers, Decoder, Magic, Reader, Record, Warning}; use std::fs::File; use std::io::{stdout, BufReader, Write}; @@ -72,20 +72,17 @@ impl Convert { } fn run(self) -> Result<()> { - let input = BufReader::new(File::open(&self.input)?); - let mut reader = Reader::new(input, Self::warn)?; + let mut reader = Reader::new(BufReader::new(File::open(&self.input)?), Self::warn)?; let headers = reader.headers().collect::, _>>()?; - let cases = reader.cases(); let encoding = encoding_from_headers(&headers, &mut |w| Self::warn(w))?; let mut decoder = Decoder::new(encoding, |w| Self::warn(w)); let mut decoded_records = Vec::new(); for header in headers { decoded_records.push(header.decode(&mut decoder)?); } - drop(decoder); - let headers = Headers::new(decoded_records, &mut |e| Self::err(e))?; - let (dictionary, metadata, cases) = decode(headers, cases, encoding, |e| Self::err(e))?; + let (dictionary, _metadata, cases) = + headers.decode(reader.cases(), encoding, |e| Self::err(e))?; let writer = match self.output { Some(path) => Box::new(File::create(path)?) as Box, None => Box::new(stdout()), @@ -245,7 +242,7 @@ fn dissect( } let headers = Headers::new(decoded_records, &mut |e| eprintln!("{e}"))?; let (dictionary, metadata, _cases) = - decode(headers, None, encoding, |e| eprintln!("{e}"))?; + headers.decode(None, encoding, |e| eprintln!("{e}"))?; println!("{dictionary:#?}"); println!("{metadata:#?}"); } diff --git a/rust/pspp/src/output/cairo/fsm.rs b/rust/pspp/src/output/cairo/fsm.rs index e35b7c95b1..7295347908 100644 --- a/rust/pspp/src/output/cairo/fsm.rs +++ b/rust/pspp/src/output/cairo/fsm.rs @@ -532,7 +532,7 @@ impl Device for CairoDevice<'_> { self.layout_cell(cell, bb, &Rect2::default()).y() + margin(cell, Axis2::Y) } - fn adjust_break(&self, cell: &Content, size: Coord2) -> usize { + fn adjust_break(&self, _cell: &Content, _size: Coord2) -> usize { todo!() } diff --git a/rust/pspp/src/output/cairo/pager.rs b/rust/pspp/src/output/cairo/pager.rs index e26a99d46f..2a53c2e3cd 100644 --- a/rust/pspp/src/output/cairo/pager.rs +++ b/rust/pspp/src/output/cairo/pager.rs @@ -192,7 +192,7 @@ fn render_heading( context: &Context, font: &FontDescription, heading: &Heading, - page_number: i32, + _page_number: i32, width: usize, base_y: usize, font_resolution: f64, diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index c513c20934..36c3e80b99 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -454,6 +454,529 @@ impl Headers { z_trailer: take_first(z_trailer, "z_trailer", warn), }) } + + pub fn decode( + mut self, + mut cases: Option, + encoding: &'static Encoding, + mut warn: impl FnMut(Error), + ) -> Result<(Dictionary, Metadata, Option), Error> { + let mut dictionary = Dictionary::new(encoding); + + let file_label = fix_line_ends(self.header.file_label.trim_end_matches(' ')); + if !file_label.is_empty() { + dictionary.file_label = Some(file_label); + } + + for mut attributes in self.file_attributes.drain(..) { + dictionary.attributes.append(&mut attributes.0) + } + + // Concatenate all the document records (really there should only be one) + // and trim off the trailing spaces that pad them to 80 bytes. + dictionary.documents = self + .document + .drain(..) + .flat_map(|record| record.lines) + .map(trim_end_spaces) + .collect(); + + if let Some(integer_info) = &self.integer_info { + let floating_point_rep = integer_info.floating_point_rep; + if floating_point_rep != 1 { + warn(Error::UnexpectedFloatFormat(floating_point_rep)) + } + + let expected = match self.header.endian { + Endian::Big => 1, + Endian::Little => 2, + }; + let actual = integer_info.endianness; + if actual != expected { + warn(Error::UnexpectedEndianess { actual, expected }); + } + }; + + if let Some(float_info) = &self.float_info { + for (expected, expected2, actual, name) in [ + (f64::MIN, None, float_info.sysmis, "SYSMIS"), + (f64::MAX, None, float_info.highest, "HIGHEST"), + ( + f64::MIN, + Some(f64::MIN.next_up()), + float_info.lowest, + "LOWEST", + ), + ] { + if actual != expected && expected2.is_none_or(|expected2| expected2 != actual) { + warn(Error::UnexpectedFloatValue { + expected, + actual, + name, + }); + } + } + } + + if let Some(nominal_case_size) = self.header.nominal_case_size { + let n_vars = self.variable.len(); + if n_vars != nominal_case_size as usize + && self + .integer_info + .as_ref() + .is_none_or(|info| info.version.0 != 13) + { + warn(Error::WrongVariablePositions { + actual: n_vars, + expected: nominal_case_size as usize, + }); + } + } + + let mut decoder = Decoder { + encoding, + n_generated_names: 0, + }; + + let mut var_index_map = BTreeMap::new(); + let mut value_index = 0; + for (index, input) in self + .variable + .iter() + .enumerate() + .filter(|(_index, record)| record.width != RawWidth::Continuation) + { + let name = trim_end_spaces(input.name.to_string()); + let name = match Identifier::from_encoding(name, encoding) + .and_then(Identifier::must_be_ordinary) + { + Ok(name) => { + if !dictionary.variables.contains(&name.0) { + name + } else { + let new_name = decoder.generate_name(&dictionary); + warn(Error::DuplicateVariableName { + duplicate_name: name.clone(), + new_name: new_name.clone(), + }); + new_name + } + } + Err(id_error) => { + let new_name = decoder.generate_name(&dictionary); + warn(Error::InvalidVariableName { + id_error, + new_name: new_name.clone(), + }); + new_name + } + }; + let mut variable = Variable::new( + name.clone(), + VarWidth::try_from(input.width).unwrap(), + encoding, + ); + + // Set the short name the same as the long name (even if we renamed it). + variable.short_names = vec![name]; + + variable.label = input.label.clone(); + + variable.missing_values = input.missing_values.clone(); + + variable.print_format = decode_format( + input.print_format, + variable.width, + |new_spec, format_error| { + warn(Error::InvalidPrintFormat { + new_spec, + variable: variable.name.clone(), + format_error, + }) + }, + ); + variable.write_format = decode_format( + input.write_format, + variable.width, + |new_spec, format_error| { + warn(Error::InvalidWriteFormat { + new_spec, + variable: variable.name.clone(), + format_error, + }) + }, + ); + + // Check for long string continuation records. + let n_values = input.width.n_values().unwrap(); + for offset in 1..n_values { + if self + .variable + .get(index + offset) + .is_none_or(|record| record.width != RawWidth::Continuation) + { + warn(Error::MissingLongStringContinuation { + width: input.width, + start_index: index, + end_index: index + n_values - 1, + error_index: index + offset, + }); + break; + } + } + + let dict_index = dictionary.add_var(variable).unwrap(); + assert_eq!(var_index_map.insert(value_index, dict_index), None); + value_index += n_values; + } + + if let Some(weight_index) = self.header.weight_index { + let index = weight_index as usize - 1; + if index >= value_index { + warn(Error::WeightIndexOutOfRange { + index: weight_index, + max_index: var_index_map.len(), + }); + } else { + let (var_index, dict_index) = var_index_map.range(..=&index).last().unwrap(); + let variable = &dictionary.variables[*dict_index]; + if *var_index == index { + if variable.is_numeric() { + dictionary.weight = Some(*dict_index); + } else { + warn(Error::InvalidWeightVar { + index: weight_index, + name: variable.name.clone(), + }); + } + } else { + warn(Error::WeightIndexStringContinuation { + index: weight_index, + name: variable.name.clone(), + }); + } + } + } + + for record in self.value_label.drain(..) { + let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len()); + let mut long_string_variables = Vec::new(); + for value_index in record.dict_indexes.iter() { + let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) else { + unreachable!() + }; + let variable = &dictionary.variables[*dict_index]; + if variable.width.is_long_string() { + long_string_variables.push(variable.name.clone()); + } else { + dict_indexes.push(*dict_index); + } + } + if !long_string_variables.is_empty() { + warn(Error::InvalidLongStringValueLabels { + offsets: record.offsets.clone(), + variables: long_string_variables, + }); + } + + let written_by_readstat = self.header.eye_catcher.contains("ReadStat"); + for dict_index in dict_indexes { + let variable = dictionary.variables.get_index_mut2(dict_index).unwrap(); + let mut duplicates = Vec::new(); + for ValueLabel { + datum: value, + label, + } in record.labels.iter().cloned() + { + let datum = value.decode(variable.width); + if variable.value_labels.insert(datum, label).is_some() { + duplicates.push(value); + } + } + if written_by_readstat { + // Ignore any possible duplicates. ReadStat is buggy and emits + // value labels whose values are longer than string variables' + // widths, that are identical in the actual width of the + // variable, e.g. both values "ABC123" and "ABC456" for a string + // variable with width 3. + } else if !duplicates.is_empty() { + warn(Error::DuplicateValueLabels { + variable: variable.name.clone(), + values: duplicates + .iter() + .map(|value| { + value + .decode(variable.width) + .display(variable.print_format, variable.encoding) + .with_trimming() + .with_quoted_string() + .to_string() + }) + .collect(), + }); + } + } + } + + if let Some(display) = &self.var_display { + for (index, display) in display.0.iter().enumerate() { + if let Some(variable) = dictionary.variables.get_index_mut2(index) { + if let Some(width) = display.width { + variable.display_width = width; + } + if let Some(alignment) = display.alignment { + variable.alignment = alignment; + } + if let Some(measure) = display.measure { + variable.measure = Some(measure); + } + } else { + warn(dbg!(Error::TBD)); + } + } + } + + for record in self + .multiple_response + .iter() + .flat_map(|record| record.0.iter()) + { + match MultipleResponseSet::decode(&dictionary, record, &mut warn) { + Ok(mrset) => { + dictionary.mrsets.insert(ByIdentifier::new(mrset)); + } + Err(error) => warn(error), + } + } + + if !self.very_long_strings.is_empty() { + 'outer: for record in self + .very_long_strings + .drain(..) + .flat_map(|record| record.0.into_iter()) + { + let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else { + warn(dbg!(Error::TBD)); + continue; + }; + let width = VarWidth::String(record.length); + let n_segments = width.n_segments(); + if n_segments == 1 { + warn(dbg!(Error::ShortVeryLongString { + short_name: record.short_name.clone(), + width: record.length + })); + continue; + } + if index + n_segments > dictionary.variables.len() { + warn(dbg!(Error::VeryLongStringOverflow { + short_name: record.short_name.clone(), + width: record.length, + index, + n_segments, + len: dictionary.variables.len() + })); + continue; + } + let mut short_names = Vec::with_capacity(n_segments); + for i in 0..n_segments { + let alloc_width = width.segment_alloc_width(i); + let segment = &dictionary.variables[index + i]; + short_names.push(segment.short_names[0].clone()); + let segment_width = segment.width.as_string_width().unwrap_or(0); + if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) { + warn(Error::VeryLongStringInvalidSegmentWidth { + short_name: record.short_name.clone(), + width: record.length, + index: i, + actual: segment_width, + expected: alloc_width, + }); + continue 'outer; + } + } + dictionary.delete_vars(index + 1..index + n_segments); + let variable = dictionary.variables.get_index_mut2(index).unwrap(); + variable.short_names = short_names; + variable.resize(width); + } + cases = cases + .take() + .map(|cases| cases.with_widths(dictionary.variables.iter().map(|var| var.width))); + } + + if self.long_names.is_empty() { + // There are no long variable names. Use the short variable names, + // converted to lowercase, as the long variable names. + for index in 0..dictionary.variables.len() { + let lower = dictionary.variables[index].name.0.as_ref().to_lowercase(); + if let Ok(new_name) = Identifier::from_encoding(lower, dictionary.encoding) { + let _ = dictionary.try_rename_var(index, new_name); + } + } + } else { + // Rename each of the variables, one by one. (In a correctly + // constructed system file, this cannot create any intermediate + // duplicate variable names, because all of the new variable names are + // longer than any of the old variable names and thus there cannot be + // any overlaps.) + for renaming in self + .long_names + .iter() + .flat_map(|record| record.0.iter().cloned()) + { + let LongName { + short_name, + long_name, + } = renaming; + if let Some(index) = dictionary.variables.get_index_of(&short_name.0) { + if let Err(long_name) = dictionary.try_rename_var(index, long_name) { + warn(Error::DuplicateLongName(long_name)); + } + dictionary + .variables + .get_index_mut2(index) + .unwrap() + .short_names = vec![short_name]; + } else { + warn(dbg!(Error::TBD)); + } + } + } + + for mut attr_set in self + .variable_attributes + .drain(..) + .flat_map(|record| record.0.into_iter()) + { + if let Some((_, variable)) = dictionary + .variables + .get_full_mut2(&attr_set.long_var_name.0) + { + variable.attributes.append(&mut attr_set.attributes); + } else { + warn(dbg!(Error::TBD)); + } + } + + // Assign variable roles. + for index in 0..dictionary.variables.len() { + let variable = dictionary.variables.get_index_mut2(index).unwrap(); + match variable.attributes.role() { + Ok(Some(role)) => variable.role = role, + Ok(None) => (), + Err(error) => warn(Error::InvalidRole(error)), + } + } + + // Long string value labels. + for record in self + .long_string_value_labels + .drain(..) + .flat_map(|record| record.0.into_iter()) + { + let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else { + warn(Error::UnknownLongStringValueLabelVariable( + record.var_name.clone(), + )); + continue; + }; + let Some(width) = variable.width.as_string_width() else { + warn(Error::LongStringValueLabelNumericVariable( + record.var_name.clone(), + )); + continue; + }; + for (mut value, label) in record.labels.into_iter() { + // XXX warn about too-long value? + value.0.resize(width, b' '); + // XXX warn abouat duplicate value labels? + variable.value_labels.insert(Datum::String(value), label); + } + } + + for mut record in self + .long_string_missing_values + .drain(..) + .flat_map(|record| record.0.into_iter()) + { + let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else { + warn(Error::LongStringMissingValueUnknownVariable { + name: record.var_name.clone(), + }); + continue; + }; + if !variable.width.is_long_string() { + warn(Error::LongStringMissingValueBadWdith { + name: record.var_name.clone(), + width: variable.width, + }); + continue; + } + if record.missing_values.len() > 3 { + warn(Error::LongStringMissingValueInvalidCount { + name: record.var_name.clone(), + count: record.missing_values.len(), + }); + record.missing_values.truncate(3); + } + let values = record + .missing_values + .into_iter() + .map(|v| { + let mut value = RawString::from(v.0.as_slice()); + value.resize(variable.width.as_string_width().unwrap()); + Datum::String(value) + }) + .collect::>(); + match MissingValues::new(values, None) { + Ok(missing_values) => variable.missing_values = missing_values, + Err(MissingValuesError::TooWide) => warn(dbg!(Error::TBD)), + Err(MissingValuesError::TooMany) | Err(MissingValuesError::MixedTypes) => { + unreachable!() + } + } + } + + for record in self + .variable_sets + .drain(..) + .flat_map(|record| record.sets.into_iter()) + { + let mut variables = Vec::with_capacity(record.variable_names.len()); + for variable_name in record.variable_names { + let Some((dict_index, _)) = dictionary.variables.get_full_mut2(&variable_name.0) + else { + warn(Error::UnknownVariableSetVariable { + variable_set: record.name.clone(), + variable: variable_name.clone(), + }); + continue; + }; + variables.push(dict_index); + } + let variable_set = VariableSet { + name: record.name, + variables, + }; + dictionary.variable_sets.push(variable_set); + } + + for record in self.other_extension.drain(..) { + warn(Error::UnknownExtensionRecord { + offset: record.offsets.start, + subtype: record.subtype, + size: record.size, + count: record.count, + }); + } + + let metadata = Metadata::decode(&self, warn); + if let Some(n_cases) = metadata.n_cases { + cases = cases.take().map(|cases| cases.with_expected_cases(n_cases)) + } + Ok((dictionary, metadata, cases)) + } } #[derive(Clone, Debug, PartialEq, Eq)] @@ -580,528 +1103,6 @@ impl Decoder { } } -pub fn decode( - mut headers: Headers, - mut cases: Option, - encoding: &'static Encoding, - mut warn: impl FnMut(Error), -) -> Result<(Dictionary, Metadata, Option), Error> { - let mut dictionary = Dictionary::new(encoding); - - let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' ')); - if !file_label.is_empty() { - dictionary.file_label = Some(file_label); - } - - for mut attributes in headers.file_attributes.drain(..) { - dictionary.attributes.append(&mut attributes.0) - } - - // Concatenate all the document records (really there should only be one) - // and trim off the trailing spaces that pad them to 80 bytes. - dictionary.documents = headers - .document - .drain(..) - .flat_map(|record| record.lines) - .map(trim_end_spaces) - .collect(); - - if let Some(integer_info) = &headers.integer_info { - let floating_point_rep = integer_info.floating_point_rep; - if floating_point_rep != 1 { - warn(Error::UnexpectedFloatFormat(floating_point_rep)) - } - - let expected = match headers.header.endian { - Endian::Big => 1, - Endian::Little => 2, - }; - let actual = integer_info.endianness; - if actual != expected { - warn(Error::UnexpectedEndianess { actual, expected }); - } - }; - - if let Some(float_info) = &headers.float_info { - for (expected, expected2, actual, name) in [ - (f64::MIN, None, float_info.sysmis, "SYSMIS"), - (f64::MAX, None, float_info.highest, "HIGHEST"), - ( - f64::MIN, - Some(f64::MIN.next_up()), - float_info.lowest, - "LOWEST", - ), - ] { - if actual != expected && expected2.is_none_or(|expected2| expected2 != actual) { - warn(Error::UnexpectedFloatValue { - expected, - actual, - name, - }); - } - } - } - - if let Some(nominal_case_size) = headers.header.nominal_case_size { - let n_vars = headers.variable.len(); - if n_vars != nominal_case_size as usize - && headers - .integer_info - .as_ref() - .is_none_or(|info| info.version.0 != 13) - { - warn(Error::WrongVariablePositions { - actual: n_vars, - expected: nominal_case_size as usize, - }); - } - } - - let mut decoder = Decoder { - encoding, - n_generated_names: 0, - }; - - let mut var_index_map = BTreeMap::new(); - let mut value_index = 0; - for (index, input) in headers - .variable - .iter() - .enumerate() - .filter(|(_index, record)| record.width != RawWidth::Continuation) - { - let name = trim_end_spaces(input.name.to_string()); - let name = match Identifier::from_encoding(name, encoding) - .and_then(Identifier::must_be_ordinary) - { - Ok(name) => { - if !dictionary.variables.contains(&name.0) { - name - } else { - let new_name = decoder.generate_name(&dictionary); - warn(Error::DuplicateVariableName { - duplicate_name: name.clone(), - new_name: new_name.clone(), - }); - new_name - } - } - Err(id_error) => { - let new_name = decoder.generate_name(&dictionary); - warn(Error::InvalidVariableName { - id_error, - new_name: new_name.clone(), - }); - new_name - } - }; - let mut variable = Variable::new( - name.clone(), - VarWidth::try_from(input.width).unwrap(), - encoding, - ); - - // Set the short name the same as the long name (even if we renamed it). - variable.short_names = vec![name]; - - variable.label = input.label.clone(); - - variable.missing_values = input.missing_values.clone(); - - variable.print_format = decode_format( - input.print_format, - variable.width, - |new_spec, format_error| { - warn(Error::InvalidPrintFormat { - new_spec, - variable: variable.name.clone(), - format_error, - }) - }, - ); - variable.write_format = decode_format( - input.write_format, - variable.width, - |new_spec, format_error| { - warn(Error::InvalidWriteFormat { - new_spec, - variable: variable.name.clone(), - format_error, - }) - }, - ); - - // Check for long string continuation records. - let n_values = input.width.n_values().unwrap(); - for offset in 1..n_values { - if headers - .variable - .get(index + offset) - .is_none_or(|record| record.width != RawWidth::Continuation) - { - warn(Error::MissingLongStringContinuation { - width: input.width, - start_index: index, - end_index: index + n_values - 1, - error_index: index + offset, - }); - break; - } - } - - let dict_index = dictionary.add_var(variable).unwrap(); - assert_eq!(var_index_map.insert(value_index, dict_index), None); - value_index += n_values; - } - - if let Some(weight_index) = headers.header.weight_index { - let index = weight_index as usize - 1; - if index >= value_index { - warn(Error::WeightIndexOutOfRange { - index: weight_index, - max_index: var_index_map.len(), - }); - } else { - let (var_index, dict_index) = var_index_map.range(..=&index).last().unwrap(); - let variable = &dictionary.variables[*dict_index]; - if *var_index == index { - if variable.is_numeric() { - dictionary.weight = Some(*dict_index); - } else { - warn(Error::InvalidWeightVar { - index: weight_index, - name: variable.name.clone(), - }); - } - } else { - warn(Error::WeightIndexStringContinuation { - index: weight_index, - name: variable.name.clone(), - }); - } - } - } - - for record in headers.value_label.drain(..) { - let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len()); - let mut long_string_variables = Vec::new(); - for value_index in record.dict_indexes.iter() { - let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) else { - unreachable!() - }; - let variable = &dictionary.variables[*dict_index]; - if variable.width.is_long_string() { - long_string_variables.push(variable.name.clone()); - } else { - dict_indexes.push(*dict_index); - } - } - if !long_string_variables.is_empty() { - warn(Error::InvalidLongStringValueLabels { - offsets: record.offsets.clone(), - variables: long_string_variables, - }); - } - - let written_by_readstat = headers.header.eye_catcher.contains("ReadStat"); - for dict_index in dict_indexes { - let variable = dictionary.variables.get_index_mut2(dict_index).unwrap(); - let mut duplicates = Vec::new(); - for ValueLabel { - datum: value, - label, - } in record.labels.iter().cloned() - { - let datum = value.decode(variable.width); - if variable.value_labels.insert(datum, label).is_some() { - duplicates.push(value); - } - } - if written_by_readstat { - // Ignore any possible duplicates. ReadStat is buggy and emits - // value labels whose values are longer than string variables' - // widths, that are identical in the actual width of the - // variable, e.g. both values "ABC123" and "ABC456" for a string - // variable with width 3. - } else if !duplicates.is_empty() { - warn(Error::DuplicateValueLabels { - variable: variable.name.clone(), - values: duplicates - .iter() - .map(|value| { - value - .decode(variable.width) - .display(variable.print_format, variable.encoding) - .with_trimming() - .with_quoted_string() - .to_string() - }) - .collect(), - }); - } - } - } - - if let Some(display) = &headers.var_display { - for (index, display) in display.0.iter().enumerate() { - if let Some(variable) = dictionary.variables.get_index_mut2(index) { - if let Some(width) = display.width { - variable.display_width = width; - } - if let Some(alignment) = display.alignment { - variable.alignment = alignment; - } - if let Some(measure) = display.measure { - variable.measure = Some(measure); - } - } else { - warn(dbg!(Error::TBD)); - } - } - } - - for record in headers - .multiple_response - .iter() - .flat_map(|record| record.0.iter()) - { - match MultipleResponseSet::decode(&dictionary, record, &mut warn) { - Ok(mrset) => { - dictionary.mrsets.insert(ByIdentifier::new(mrset)); - } - Err(error) => warn(error), - } - } - - if !headers.very_long_strings.is_empty() { - 'outer: for record in headers - .very_long_strings - .drain(..) - .flat_map(|record| record.0.into_iter()) - { - let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else { - warn(dbg!(Error::TBD)); - continue; - }; - let width = VarWidth::String(record.length); - let n_segments = width.n_segments(); - if n_segments == 1 { - warn(dbg!(Error::ShortVeryLongString { - short_name: record.short_name.clone(), - width: record.length - })); - continue; - } - if index + n_segments > dictionary.variables.len() { - warn(dbg!(Error::VeryLongStringOverflow { - short_name: record.short_name.clone(), - width: record.length, - index, - n_segments, - len: dictionary.variables.len() - })); - continue; - } - let mut short_names = Vec::with_capacity(n_segments); - for i in 0..n_segments { - let alloc_width = width.segment_alloc_width(i); - let segment = &dictionary.variables[index + i]; - short_names.push(segment.short_names[0].clone()); - let segment_width = segment.width.as_string_width().unwrap_or(0); - if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) { - warn(Error::VeryLongStringInvalidSegmentWidth { - short_name: record.short_name.clone(), - width: record.length, - index: i, - actual: segment_width, - expected: alloc_width, - }); - continue 'outer; - } - } - dictionary.delete_vars(index + 1..index + n_segments); - let variable = dictionary.variables.get_index_mut2(index).unwrap(); - variable.short_names = short_names; - variable.resize(width); - } - cases = cases - .take() - .map(|cases| cases.with_widths(dictionary.variables.iter().map(|var| var.width))); - } - - if headers.long_names.is_empty() { - // There are no long variable names. Use the short variable names, - // converted to lowercase, as the long variable names. - for index in 0..dictionary.variables.len() { - let lower = dictionary.variables[index].name.0.as_ref().to_lowercase(); - if let Ok(new_name) = Identifier::from_encoding(lower, dictionary.encoding) { - let _ = dictionary.try_rename_var(index, new_name); - } - } - } else { - // Rename each of the variables, one by one. (In a correctly - // constructed system file, this cannot create any intermediate - // duplicate variable names, because all of the new variable names are - // longer than any of the old variable names and thus there cannot be - // any overlaps.) - for renaming in headers - .long_names - .iter() - .flat_map(|record| record.0.iter().cloned()) - { - let LongName { - short_name, - long_name, - } = renaming; - if let Some(index) = dictionary.variables.get_index_of(&short_name.0) { - if let Err(long_name) = dictionary.try_rename_var(index, long_name) { - warn(Error::DuplicateLongName(long_name)); - } - dictionary - .variables - .get_index_mut2(index) - .unwrap() - .short_names = vec![short_name]; - } else { - warn(dbg!(Error::TBD)); - } - } - } - - for mut attr_set in headers - .variable_attributes - .drain(..) - .flat_map(|record| record.0.into_iter()) - { - if let Some((_, variable)) = dictionary - .variables - .get_full_mut2(&attr_set.long_var_name.0) - { - variable.attributes.append(&mut attr_set.attributes); - } else { - warn(dbg!(Error::TBD)); - } - } - - // Assign variable roles. - for index in 0..dictionary.variables.len() { - let variable = dictionary.variables.get_index_mut2(index).unwrap(); - match variable.attributes.role() { - Ok(Some(role)) => variable.role = role, - Ok(None) => (), - Err(error) => warn(Error::InvalidRole(error)), - } - } - - // Long string value labels. - for record in headers - .long_string_value_labels - .drain(..) - .flat_map(|record| record.0.into_iter()) - { - let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else { - warn(Error::UnknownLongStringValueLabelVariable( - record.var_name.clone(), - )); - continue; - }; - let Some(width) = variable.width.as_string_width() else { - warn(Error::LongStringValueLabelNumericVariable( - record.var_name.clone(), - )); - continue; - }; - for (mut value, label) in record.labels.into_iter() { - // XXX warn about too-long value? - value.0.resize(width, b' '); - // XXX warn abouat duplicate value labels? - variable.value_labels.insert(Datum::String(value), label); - } - } - - for mut record in headers - .long_string_missing_values - .drain(..) - .flat_map(|record| record.0.into_iter()) - { - let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else { - warn(Error::LongStringMissingValueUnknownVariable { - name: record.var_name.clone(), - }); - continue; - }; - if !variable.width.is_long_string() { - warn(Error::LongStringMissingValueBadWdith { - name: record.var_name.clone(), - width: variable.width, - }); - continue; - } - if record.missing_values.len() > 3 { - warn(Error::LongStringMissingValueInvalidCount { - name: record.var_name.clone(), - count: record.missing_values.len(), - }); - record.missing_values.truncate(3); - } - let values = record - .missing_values - .into_iter() - .map(|v| { - let mut value = RawString::from(v.0.as_slice()); - value.resize(variable.width.as_string_width().unwrap()); - Datum::String(value) - }) - .collect::>(); - match MissingValues::new(values, None) { - Ok(missing_values) => variable.missing_values = missing_values, - Err(MissingValuesError::TooWide) => warn(dbg!(Error::TBD)), - Err(MissingValuesError::TooMany) | Err(MissingValuesError::MixedTypes) => { - unreachable!() - } - } - } - - for record in headers - .variable_sets - .drain(..) - .flat_map(|record| record.sets.into_iter()) - { - let mut variables = Vec::with_capacity(record.variable_names.len()); - for variable_name in record.variable_names { - let Some((dict_index, _)) = dictionary.variables.get_full_mut2(&variable_name.0) else { - warn(Error::UnknownVariableSetVariable { - variable_set: record.name.clone(), - variable: variable_name.clone(), - }); - continue; - }; - variables.push(dict_index); - } - let variable_set = VariableSet { - name: record.name, - variables, - }; - dictionary.variable_sets.push(variable_set); - } - - for record in headers.other_extension.drain(..) { - warn(Error::UnknownExtensionRecord { - offset: record.offsets.start, - subtype: record.subtype, - size: record.size, - count: record.count, - }); - } - - let metadata = Metadata::decode(&headers, warn); - if let Some(n_cases) = metadata.n_cases { - cases = cases.take().map(|cases| cases.with_expected_cases(n_cases)) - } - Ok((dictionary, metadata, cases)) -} - impl MultipleResponseSet { fn decode( dictionary: &Dictionary, diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index b1e0528459..94067aaafa 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -915,7 +915,6 @@ impl Datum { ) -> Result>, Error> { fn eof( reader: &mut R, - case_vars: &[CaseVar], case_start: u64, n_chunks: usize, ) -> Result>, Error> { @@ -939,7 +938,7 @@ impl Datum { CaseVar::Numeric => { let Some(raw) = Self::read_compressed_chunk(reader, codes, endian, bias)? else { - return eof(reader, case_vars, case_start, n_chunks); + return eof(reader, case_start, n_chunks); }; n_chunks += 1; values.push(Datum::Number(endian.parse(raw))); @@ -953,7 +952,7 @@ impl Datum { let Some(raw) = Self::read_compressed_chunk(reader, codes, endian, bias)? else { - return eof(reader, case_vars, case_start, n_chunks); + return eof(reader, case_start, n_chunks); }; let n_data = data_bytes.min(8); datum.extend_from_slice(&raw[..n_data]); @@ -1122,14 +1121,11 @@ where Some(Ok(record)) } ReaderState::ZlibHeader => { - let zheader = match ZHeader::read( - self.0.reader.as_mut().unwrap(), - self.0.header.endian, - &mut self.0.warn, - ) { - Ok(zheader) => zheader, - Err(error) => return Some(Err(error)), - }; + let zheader = + match ZHeader::read(self.0.reader.as_mut().unwrap(), self.0.header.endian) { + Ok(zheader) => zheader, + Err(error) => return Some(Err(error)), + }; self.0.state = ReaderState::ZlibTrailer(zheader.clone()); Some(Ok(Record::ZHeader(zheader))) } @@ -3408,11 +3404,7 @@ pub struct ZHeader { } impl ZHeader { - fn read( - r: &mut R, - endian: Endian, - warn: &mut dyn FnMut(Warning), - ) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { let offset = r.stream_position()?; let zheader_offset: u64 = endian.parse(read_bytes(r)?); let ztrailer_offset: u64 = endian.parse(read_bytes(r)?); diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 075c0384ec..304bea70b6 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -7,7 +7,7 @@ use crate::{ Details, Item, Text, }, sys::{ - cooked::{decode, Headers}, + cooked::Headers, raw::{encoding_from_headers, Decoder, Reader}, sack::sack, }, @@ -534,7 +534,7 @@ fn test_raw_sysfile(name: &str) { let sysfile = std::fs::read(&input_filename).unwrap(); let expected_filename = input_filename.with_extension("expected"); let expected = String::from_utf8(std::fs::read(&expected_filename).unwrap()).unwrap(); - test_sysfile(name, sysfile, &expected, &expected_filename); + test_sysfile(sysfile, &expected, &expected_filename); } fn test_sack_sysfile(name: &str) { @@ -554,11 +554,11 @@ fn test_sack_sysfile(name: &str) { }, ); let sysfile = sack(&input, Some(&input_filename), endian).unwrap(); - test_sysfile(name, sysfile, &expected, &expected_filename); + test_sysfile(sysfile, &expected, &expected_filename); } } -fn test_sysfile(name: &str, sysfile: Vec, expected: &str, expected_filename: &Path) { +fn test_sysfile(sysfile: Vec, expected: &str, expected_filename: &Path) { let cursor = Cursor::new(sysfile); let mut warnings = Vec::new(); let mut reader = Reader::new(cursor, |warning| warnings.push(warning)).unwrap(); @@ -577,7 +577,7 @@ fn test_sysfile(name: &str, sysfile: Vec, expected: &str, expected_filename: let mut errors = Vec::new(); let headers = Headers::new(decoded_records, &mut |e| errors.push(e)).unwrap(); let (dictionary, metadata, cases) = - decode(headers, cases, encoding, |e| errors.push(e)).unwrap(); + headers.decode(cases, encoding, |e| errors.push(e)).unwrap(); let (group, data) = metadata.to_pivot_rows(); let metadata_table = PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data( data.into_iter()