From d8286ea153ab06d2a1c5219e91b2c99f107c91b9 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 23 Aug 2025 20:47:50 -0700 Subject: [PATCH] support case data too --- rust/pspp/src/data.rs | 9 ++ rust/pspp/src/main.rs | 3 +- rust/pspp/src/sys/raw.rs | 199 +++++++++++++++++++++++---------------- 3 files changed, 126 insertions(+), 85 deletions(-) diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index ea7364face..8e52af7035 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -556,6 +556,15 @@ impl Datum { } } + /// Returns the string inside this datum, or `None` if this is a numeric + /// datum. + pub fn into_string(self) -> Option { + match self { + Self::Number(_) => None, + Self::String(s) => Some(s), + } + } + /// Returns the [VarType] corresponding to this datum. pub fn var_type(&self) -> VarType { match self { diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index 606733daf5..966e1cfd30 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -501,8 +501,7 @@ impl Show { } } Mode::Encodings => { - let records: Vec = reader.records().collect::, _>>()?; - let encoding_report = EncodingReport::new(reader.header(), &records); + let encoding_report = EncodingReport::new(reader, self.max_cases)?; output.show(&encoding_report)?; } } diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index e01888a2de..f28590e6ca 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -20,7 +20,7 @@ //! raw details. Most readers will want to use higher-level interfaces. use crate::{ - data::{ByteStr, ByteString, Datum, RawCase, RawString}, + data::{ByteStr, ByteString, Datum, MutRawString, RawCase, RawString}, endian::{FromBytes, ToBytes}, identifier::{Error as IdError, Identifier}, output::{ @@ -2090,99 +2090,132 @@ impl EncodingReportString { } impl EncodingReport { - pub fn new(header: &FileHeader, records: &[Record]) -> Self { - let (encoding, codepage) = get_encoding_info(&records); - let label = - encoding.map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None))); - let codepage = codepage.map(|codepage| (codepage, get_encoding(None, Some(codepage)))); - let (inferred_encoding_source, inferred_encoding) = match label - .as_ref() - .map(|(_string, result)| (EncodingSource::Name, result.clone())) - .or(codepage + pub fn new(mut reader: Reader, max_cases: u64) -> Result + where + R: BufRead + Seek + 'static, + { + fn inner( + header: FileHeader, + records: &[Record], + cases: impl Iterator>, + ) -> Result { + let (encoding, codepage) = get_encoding_info(&records); + let label = encoding + .map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None))); + let codepage = codepage.map(|codepage| (codepage, get_encoding(None, Some(codepage)))); + let (inferred_encoding_source, inferred_encoding) = match label .as_ref() - .map(|(_codepage, result)| (EncodingSource::Codepage, result.clone()))) - { - Some((source, Ok(encoding))) => (source, Ok(encoding)), - Some((source, Err(EncodingError::Ebcdic))) => (source, Err(EncodingError::Ebcdic)), - _ => (EncodingSource::Default, Ok(default_encoding())), - }; + .map(|(_string, result)| (EncodingSource::Name, result.clone())) + .or(codepage + .as_ref() + .map(|(_codepage, result)| (EncodingSource::Codepage, result.clone()))) + { + Some((source, Ok(encoding))) => (source, Ok(encoding)), + Some((source, Err(EncodingError::Ebcdic))) => (source, Err(EncodingError::Ebcdic)), + _ => (EncodingSource::Default, Ok(default_encoding())), + }; - let mut record_strings = header.get_strings(); - for record in records { - record_strings.append(&mut record.get_strings()); - } - let mut encodings: IndexMap, Vec<&'static Encoding>> = IndexMap::new(); - for encoding in ENCODINGS { - fn recode_as( - record_strings: &[RecordString], - encoding: &'static Encoding, - ) -> Option> { - let mut output = Vec::with_capacity(record_strings.len()); - for rs in record_strings { - let mut s = encoding - .decode_without_bom_handling_and_without_replacement(&rs.string.0)? - .into_owned(); - s.truncate(s.trim_end().len()); - if rs.is_identifier { - Identifier::check_plausible(&s).ok()?; + let mut record_strings = header.get_strings(); + for record in records { + record_strings.append(&mut record.get_strings()); + } + for (case_number, case) in (1..).zip(cases) { + for (variable_number, datum) in (1..).zip(case?.0) { + if let Some(mut string) = datum.into_string() { + string.trim_end(); + if !string.is_empty() { + record_strings.push(RecordString::new( + format!("Case {case_number}, Variable {variable_number}"), + string, + false, + )); + } } - output.push(s); } - Some(output) - } - if let Some(strings) = recode_as(&record_strings, encoding) { - encodings.entry(strings).or_default().push(encoding); } - } - let mut strings = Vec::with_capacity(record_strings.len()); - if !encodings.is_empty() { - for (index, rs) in record_strings.iter().enumerate() { - // Skip strings that decode the same way from every encoding. - if encodings.keys().map(|strings| &strings[index]).all_equal() { - continue; + let record_strings = record_strings + .into_iter() + .unique_by(|rs| rs.string.clone()) + .collect::>(); + + let mut encodings: IndexMap, Vec<&'static Encoding>> = IndexMap::new(); + for encoding in ENCODINGS { + fn recode_as( + record_strings: &[RecordString], + encoding: &'static Encoding, + ) -> Option> { + let mut output = Vec::with_capacity(record_strings.len()); + for rs in record_strings { + let mut s = encoding + .decode_without_bom_handling_and_without_replacement(&rs.string.0)? + .into_owned(); + s.truncate(s.trim_end().len()); + if rs.is_identifier { + Identifier::check_plausible(&s).ok()?; + } + output.push(s); + } + Some(output) } - - /// Returns an iterator for the decoded strings for the given - /// `index`. - fn decoded_index<'a>( - encodings: &'a IndexMap, Vec<&'static Encoding>>, - index: usize, - ) -> impl Iterator { - encodings.keys().map(move |strings| strings[index].as_str()) + if let Some(strings) = recode_as(&record_strings, encoding) { + encodings.entry(strings).or_default().push(encoding); } + } + + let mut strings = Vec::with_capacity(record_strings.len()); + if !encodings.is_empty() { + for (index, rs) in record_strings.iter().enumerate() { + // Skip strings that decode the same way from every encoding. + if encodings.keys().map(|strings| &strings[index]).all_equal() { + continue; + } + + /// Returns an iterator for the decoded strings for the given + /// `index`. + fn decoded_index<'a>( + encodings: &'a IndexMap, Vec<&'static Encoding>>, + index: usize, + ) -> impl Iterator { + encodings.keys().map(move |strings| strings[index].as_str()) + } - let common_prefix: String = decoded_index(&encodings, index) - .reduce(common_prefix) - .unwrap() - .trim_end_matches(|c| c != ' ') - .into(); - let common_suffix: String = decoded_index(&encodings, index) - .reduce(common_suffix) - .unwrap() - .trim_start_matches(|c| c != ' ') - .into(); - - let interpretations = decoded_index(&encodings, index) - .map(|s| s[common_prefix.len()..s.len() - common_suffix.len()].into()) - .collect(); - - strings.push(EncodingReportString { - name: rs.title.clone(), - common_prefix, - interpretations, - common_suffix, - }); + let common_prefix: String = decoded_index(&encodings, index) + .reduce(common_prefix) + .unwrap() + .trim_end_matches(|c| c != ' ') + .into(); + let common_suffix: String = decoded_index(&encodings, index) + .reduce(common_suffix) + .unwrap() + .trim_start_matches(|c| c != ' ') + .into(); + + let interpretations = decoded_index(&encodings, index) + .map(|s| s[common_prefix.len()..s.len() - common_suffix.len()].into()) + .collect(); + + strings.push(EncodingReportString { + name: rs.title.clone(), + common_prefix, + interpretations, + common_suffix, + }); + } } + Ok(EncodingReport { + valid_encodings: encodings.values().cloned().collect(), + strings, + name: label, + codepage, + inferred_encoding, + inferred_encoding_source, + }) } - EncodingReport { - valid_encodings: encodings.values().cloned().collect(), - strings, - name: label, - codepage, - inferred_encoding, - inferred_encoding_source, - } + + let records: Vec = reader.records().collect::, _>>()?; + let header = reader.header().clone(); + inner(header, &records, reader.cases().take(max_cases as usize)) } } -- 2.30.2