From 6081babce45a15901d2220d4f44ae8cfe9c171d5 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Tue, 12 Aug 2025 14:33:32 -0700
Subject: [PATCH] work

---
 rust/pspp/src/data.rs             |  12 ++
 rust/pspp/src/output/pivot/mod.rs |  33 ++-
 rust/pspp/src/sys/raw.rs          | 329 +++++++++++++++++++++++++++++-
 rust/pspp/src/sys/raw/records.rs  |  15 +-
 4 files changed, 373 insertions(+), 16 deletions(-)

diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs
index 60c46752db..ea7364face 100644
--- a/rust/pspp/src/data.rs
+++ b/rust/pspp/src/data.rs
@@ -280,6 +280,18 @@ impl From<&[u8]> for ByteString {
     }
 }
 
+impl From<&ByteString> for ByteString {
+    fn from(value: &ByteString) -> Self {
+        value.clone()
+    }
+}
+
+impl<const N: usize> From<&ByteStrArray<N>> for ByteString {
+    fn from(value: &ByteStrArray<N>) -> Self {
+        Self::from(value.raw_string_bytes())
+    }
+}
+
 impl<const N: usize> From<[u8; N]> for ByteString {
     fn from(value: [u8; N]) -> Self {
         value.as_slice().into()
diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs
index 37ad5929da..99d7271070 100644
--- a/rust/pspp/src/output/pivot/mod.rs
+++ b/rust/pspp/src/output/pivot/mod.rs
@@ -424,13 +424,16 @@ impl Group {
         }
     }
 
-    pub fn push(&mut self, child: impl Into<Category>) {
+    /// Appends `child` to this group, then returns the group's new length
+    /// minus 1 (presumably the index of the last leaf that `child` added).
+    pub fn push(&mut self, child: impl Into<Category>) -> usize {
         let mut child = child.into();
         if let Category::Group(group) = &mut child {
             group.show_label = true;
         }
         self.len += child.len();
         self.children.push(child);
+        self.len - 1
     }
 
     pub fn with(mut self, child: impl Into<Category>) -> Self {
@@ -640,6 +643,18 @@ impl From<&str> for Category {
     }
 }
 
+impl From<String> for Category {
+    fn from(name: String) -> Self {
+        Self::Leaf(Leaf::new(Value::new_text(name)))
+    }
+}
+
+impl From<&String> for Category {
+    fn from(name: &String) -> Self {
+        Self::Leaf(Leaf::new(Value::new_text(name)))
+    }
+}
+
 /// Styling for a pivot table.
 ///
 /// The division between this and the style information in [PivotTable] seems
@@ -1374,20 +1389,20 @@ impl PivotTable {
         self
     }
 
-    pub fn with_caption(mut self, caption: Value) -> Self {
-        self.caption = Some(Box::new(caption));
+    pub fn with_caption(mut self, caption: impl Into<Value>) -> Self {
+        self.caption = Some(Box::new(caption.into()));
         self.show_caption = true;
         self
     }
 
-    pub fn with_corner_text(mut self, corner_text: Value) -> Self {
-        self.corner_text = Some(Box::new(corner_text));
+    pub fn with_corner_text(mut self, corner_text: impl Into<Value>) -> Self {
+        self.corner_text = Some(Box::new(corner_text.into()));
         self
     }
 
-    pub fn with_subtype(self, subtype: Value) -> Self {
+    pub fn with_subtype(self, subtype: impl Into<Value>) -> Self {
         Self {
-            subtype: Some(Box::new(subtype)),
+            subtype: Some(Box::new(subtype.into())),
             ..self
         }
     }
@@ -1516,10 +1531,10 @@ where
 }
 
 impl PivotTable {
-    pub fn new(dimensions_and_axes: impl IntoIterator<Item = (Axis3, Dimension)>) -> Self {
+    pub fn new(axes_and_dimensions: impl IntoIterator<Item = (Axis3, Dimension)>) -> Self {
         let mut dimensions = Vec::new();
         let mut axes = EnumMap::<Axis3, Axis>::default();
-        for (axis, dimension) in dimensions_and_axes {
+        for (axis, dimension) in axes_and_dimensions {
             axes[axis].dimensions.push(dimensions.len());
             dimensions.push(dimension);
         }
diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs
index 5b39156691..f6ebfd406b 100644
--- a/rust/pspp/src/sys/raw.rs
+++ b/rust/pspp/src/sys/raw.rs
@@ -20,10 +20,10 @@
 //! raw details. Most readers will want to use higher-level interfaces.
 
 use crate::{
-    data::{ByteStr, ByteString, Datum, RawCase},
-    variable::{VarType, VarWidth},
+    data::{ByteStr, ByteString, Datum, RawCase, RawString},
     endian::{FromBytes, ToBytes},
     identifier::{Error as IdError, Identifier},
+    output::pivot::{Axis3, Dimension, Group, PivotTable, Value},
     sys::{
         encoding::{default_encoding, get_encoding, Error as EncodingError},
         raw::records::{
@@ -41,11 +41,20 @@ use crate::{
             ZlibTrailerWarning,
         },
     },
+    variable::{VarType, VarWidth},
 };
 
 use binrw::Endian;
-use encoding_rs::Encoding;
+use encoding_rs::{
+    Encoding, BIG5, EUC_JP, EUC_KR, GB18030, IBM866, ISO_2022_JP, ISO_8859_10, ISO_8859_13,
+    ISO_8859_14, ISO_8859_16, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, ISO_8859_6,
+    ISO_8859_7, ISO_8859_8, KOI8_R, KOI8_U, MACINTOSH, SHIFT_JIS, UTF_8, WINDOWS_1250,
+    WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1255, WINDOWS_1256,
+    WINDOWS_1257, WINDOWS_1258, WINDOWS_874,
+};
 use flate2::bufread::ZlibDecoder;
+use indexmap::IndexMap;
+use itertools::{EitherOrBoth, Itertools};
 use serde::Serialize;
 use smallvec::SmallVec;
 use std::{
@@ -544,6 +553,97 @@ impl Record {
             _ => None,
         }
     }
+
+    /// Returns the text strings in this record, each titled with its purpose.
+    pub fn get_strings(&self) -> Vec<RecordString> {
+        let mut strings = Vec::new();
+        match self {
+            Record::Variable(variable_record) => {
+                strings.push(RecordString::new(
+                    "Variable Name",
+                    &variable_record.name,
+                    true,
+                ));
+                if let Some(label) = &variable_record.label {
+                    strings.push(RecordString::new("Variable Label", label, false));
+                }
+                for missing_value in &variable_record.missing_values.values {
+                    if let Some(string) = missing_value.as_string() {
+                        strings.push(RecordString::new("Missing Value", string, false));
+                    }
+                }
+            }
+            Record::ValueLabel(value_label_record) => {
+                for label in &value_label_record.labels {
+                    strings.push(RecordString::new("Value Label", &label.label, false));
+                }
+            }
+            Record::Document(document_record) => {
+                for (line, index) in document_record.lines.iter().zip(1..) {
+                    strings.push(RecordString::new(
+                        format!("Document Line {index}"),
+                        line,
+                        false,
+                    ));
+                }
+            }
+            Record::MultipleResponse(multiple_response_record) => {
+                for set in &multiple_response_record.sets {
+                    strings.push(RecordString::new(
+                        "Multiple Response Set Name",
+                        &set.name,
+                        true,
+                    ));
+                    if !set.label.is_empty() {
+                        strings.push(RecordString::new(
+                            "Multiple Response Set Label",
+                            &set.label,
+                            false,
+                        ));
+                    }
+                    if let records::MultipleResponseType::MultipleDichotomy { value, .. } =
+                        &set.mr_type
+                    {
+                        strings.push(RecordString::new(
+                            "Multiple Response Set Counted Value",
+                            value,
+                            false,
+                        ));
+                    }
+                }
+            }
+            Record::LongStringValueLabels(long_string_value_label_record) => {
+                for labels in &long_string_value_label_record.labels {
+                    for (_value, label) in &labels.labels {
+                        strings.push(RecordString::new("Value Label", label, false));
+                    }
+                }
+            }
+            Record::ProductInfo(raw_product_info_record) => {
+                strings.push(RecordString::new(
+                    "Extra Product Info",
+                    &raw_product_info_record.0.text,
+                    false,
+                ));
+            }
+            Record::IntegerInfo(_)
+            | Record::FloatInfo(_)
+            | Record::VarDisplay(_)
+            | Record::LongStringMissingValues(_)
+            | Record::Encoding(_)
+            | Record::NumberOfCases(_)
+            | Record::VariableSets(_)
+            | Record::LongNames(_)
+            | Record::VeryLongStrings(_)
+            | Record::FileAttributes(_)
+            | Record::VariableAttributes(_)
+            | Record::OtherExtension(_)
+            | Record::EndOfHeaders(_)
+            | Record::ZHeader(_)
+            | Record::ZTrailer(_) => (),
+        }
+        strings
+    }
 }
 
 /// A [Record] that has been decoded to a more usable form.
@@ -1695,3 +1795,252 @@ impl VarTypes {
         self.types.iter().flatten().count()
     }
 }
+
+/// A text string extracted from a system file record, along with a title that
+/// describes its purpose.
+pub struct RecordString {
+    /// Describes what the string is used for, e.g. "Variable Name".
+    pub title: String,
+
+    /// The string's raw bytes, before any character encoding is applied.
+    pub string: ByteString,
+
+    /// Whether the string must decode to a plausible [Identifier].
+    pub is_identifier: bool,
+}
+
+impl RecordString {
+    /// Constructs a new `RecordString` from its parts.
+    pub fn new(
+        title: impl Into<String>,
+        string: impl Into<ByteString>,
+        is_identifier: bool,
+    ) -> Self {
+        Self {
+            title: title.into(),
+            string: string.into(),
+            is_identifier,
+        }
+    }
+}
+
+/// The candidate encodings that [EncodingReport] tries, in order.
+static ENCODINGS: [&Encoding; 32] = [
+    UTF_8,
+    WINDOWS_1252,
+    ISO_8859_2,
+    ISO_8859_3,
+    ISO_8859_4,
+    ISO_8859_5,
+    ISO_8859_6,
+    ISO_8859_7,
+    ISO_8859_8,
+    ISO_8859_10,
+    ISO_8859_13,
+    ISO_8859_14,
+    ISO_8859_16,
+    MACINTOSH,
+    WINDOWS_874,
+    WINDOWS_1250,
+    WINDOWS_1251,
+    WINDOWS_1253,
+    WINDOWS_1254,
+    WINDOWS_1255,
+    WINDOWS_1256,
+    WINDOWS_1257,
+    WINDOWS_1258,
+    KOI8_R,
+    KOI8_U,
+    IBM866,
+    GB18030,
+    BIG5,
+    EUC_JP,
+    ISO_2022_JP,
+    SHIFT_JIS,
+    EUC_KR,
+];
+
+/// Reports which of the [ENCODINGS] can decode a set of [RecordString]s.
+pub struct EncodingReport {
+    /// Lists each group of encodings that decode every string identically.
+    pub valid_encodings: PivotTable,
+
+    /// Shows the strings that the groups of encodings decode differently, if
+    /// there are any.
+    pub interpretations: Option<PivotTable>,
+}
+
+impl EncodingReport {
+    /// Decodes `record_strings` with each of the [ENCODINGS] and reports the
+    /// results.
+    ///
+    /// Returns `None` if none of the encodings can decode all of the strings.
+    pub fn new(record_strings: &[RecordString]) -> Option<Self> {
+        let mut encodings: IndexMap<Vec<String>, Vec<&'static Encoding>> = IndexMap::new();
+        for encoding in ENCODINGS {
+            fn recode_as(
+                record_strings: &[RecordString],
+                encoding: &'static Encoding,
+            ) -> Option<Vec<String>> {
+                let mut output = Vec::with_capacity(record_strings.len());
+                for rs in record_strings {
+                    let mut s = encoding
+                        .decode_without_bom_handling_and_without_replacement(&rs.string.0)?
+                        .into_owned();
+                    s.truncate(s.trim_end().len());
+                    if rs.is_identifier {
+                        Identifier::check_plausible(&s).ok()?;
+                    }
+                    output.push(s);
+                }
+                Some(output)
+            }
+            if let Some(strings) = recode_as(record_strings, encoding) {
+                encodings.entry(strings).or_default().push(encoding);
+            }
+        }
+        if encodings.is_empty() {
+            return None;
+        }
+
+        let numbers = Group::new("#").with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
+        let valid_encodings = PivotTable::new([(Axis3::Y, Dimension::new(numbers))]).with_data(
+            encodings
+                .values()
+                .map(|encodings| {
+                    Value::new_user_text(encodings.iter().map(|e| e.name()).join(", "))
+                })
+                .enumerate()
+                .map(|(index, datum)| ([index], datum)),
+        );
+
+        let mut purposes = Group::new("Purpose").with_label_shown();
+        let mut data = Vec::new();
+        for (index, rs) in record_strings.iter().enumerate() {
+            // Skip strings that decode the same way from every encoding.
+            if encodings.keys().map(|strings| &strings[index]).all_equal() {
+                continue;
+            }
+
+            /// Returns an iterator for the decoded strings for the given
+            /// `index`.
+            fn decoded_index<'a>(
+                encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
+                index: usize,
+            ) -> impl Iterator<Item = &'a str> {
+                encodings.keys().map(move |strings| strings[index].as_str())
+            }
+
+            let common_prefix = decoded_index(&encodings, index)
+                .reduce(common_prefix)
+                .unwrap()
+                .trim_end_matches(|c| c != ' ')
+                .len();
+            let common_suffix = decoded_index(&encodings, index)
+                .reduce(common_suffix)
+                .unwrap()
+                .trim_start_matches(|c| c != ' ')
+                .len();
+
+            let purpose = purposes.push(&rs.title);
+
+            for (j, s) in decoded_index(&encodings, index).enumerate() {
+                // The prefix and suffix are trimmed independently, so fall back
+                // to an empty middle instead of panicking if they overlap.
+                let s = s.get(common_prefix..s.len() - common_suffix).unwrap_or_default();
+                let mut entry = String::with_capacity(s.len() + 6);
+                if common_prefix > 0 {
+                    entry.push_str("...");
+                }
+                entry.push_str(s);
+                if common_suffix > 0 {
+                    entry.push_str("...");
+                }
+                data.push(([0, j, purpose], Value::new_user_text(entry)));
+            }
+        }
+        let number = Group::new("Text")
+            .with_label_shown()
+            .with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
+        let interpretations = if !data.is_empty() {
+            Some(
+                PivotTable::new([
+                    (Axis3::X, Dimension::new(Group::new("Text").with("Text"))),
+                    (Axis3::Y, Dimension::new(number)),
+                    (Axis3::Y, Dimension::new(purposes)),
+                ])
+                .with_title("Alternate Encoded Text Strings")
+                .with_caption("Text strings in the file dictionary that the previously listed encodings interpret differently, along with the interpretations.")
+                .with_data(data),
+            )
+        } else {
+            None
+        };
+        Some(Self {
+            valid_encodings,
+            interpretations,
+        })
+    }
+}
+
+/// Returns the longest common prefix of `a` and `b`.
+fn common_prefix<'a>(a: &'a str, b: &'a str) -> &'a str {
+    for elem in a.char_indices().zip_longest(b.char_indices()) {
+        match elem {
+            EitherOrBoth::Both((offset, a_char), (_, b_char)) => {
+                if a_char != b_char {
+                    return &a[..offset];
+                }
+            }
+            EitherOrBoth::Left((offset, _)) | EitherOrBoth::Right((offset, _)) => {
+                return &a[..offset]
+            }
+        }
+    }
+    a
+}
+
+/// Returns the longest common suffix of `a` and `b`.
+fn common_suffix<'a>(a: &'a str, b: &'a str) -> &'a str {
+    for elem in a.char_indices().rev().zip_longest(b.char_indices().rev()) {
+        match elem {
+            EitherOrBoth::Both((offset, a_char), (_, b_char)) => {
+                if a_char != b_char {
+                    return &a[offset + a_char.len_utf8()..];
+                }
+            }
+            EitherOrBoth::Left((offset, char)) => {
+                return &a[offset + char.len_utf8()..];
+            }
+            EitherOrBoth::Right((offset, char)) => {
+                return &b[offset + char.len_utf8()..];
+            }
+        }
+    }
+    a
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::sys::raw::{common_prefix, common_suffix};
+
+    #[test]
+    fn test_common_prefix() {
+        assert_eq!(common_prefix("abc", "abcxyzzy"), "abc");
+        assert_eq!(common_prefix("abcxyzzy", "abc"), "abc");
+        assert_eq!(common_prefix("abc", "abc"), "abc");
+        assert_eq!(common_prefix("", ""), "");
+        assert_eq!(common_prefix("abcd", "abxy"), "ab");
+        assert_eq!(common_prefix("é1", "é2"), "é");
+    }
+
+    #[test]
+    fn test_common_suffix() {
+        assert_eq!(common_suffix("xyzzyabc", "abc"), "abc");
+        assert_eq!(common_suffix("abc", "xyzzyabc"), "abc");
+        assert_eq!(common_suffix("abc", "abc"), "abc");
+        assert_eq!(common_suffix("", ""), "");
+        assert_eq!(common_suffix("xyab", "cdab"), "ab");
+        assert_eq!(common_suffix("1é", "2é"), "é");
+    }
+}
diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs
index 18b76129be..e893dc79ae 100644
--- a/rust/pspp/src/sys/raw/records.rs
+++ b/rust/pspp/src/sys/raw/records.rs
@@ -20,7 +20,7 @@ use crate::{
     sys::{
         raw::{
             read_bytes, read_string, read_vec, Decoder, Error, ErrorDetails, Magic, RawDatum,
-            RawWidth, Record, UntypedDatum, VarTypes, Warning, WarningDetails,
+            RawWidth, Record, RecordString, UntypedDatum, VarTypes, Warning, WarningDetails,
         },
         serialize_endian, ProductVersion,
     },
@@ -246,6 +246,15 @@ impl FileHeader {
             endian: self.endian,
         }
     }
+
+    /// Returns the text strings in the file header, each titled with its purpose.
+    pub fn get_strings(&self) -> Vec<RecordString> {
+        vec![
+            // NOTE(review): `[5..]` presumably skips a fixed eye-catcher prefix; confirm.
+            RecordString::new("Product", &self.eye_catcher.0[5..], false),
+            RecordString::new("File Label", &self.file_label, false),
+        ]
+    }
 }
 
 /// [Format] as represented in a system file.
@@ -999,7 +1008,7 @@ pub struct FloatInfoRecord {
 #[derive(Clone, Debug, Serialize)]
 pub struct RawLongNamesRecord(
     /// Text contents of record.
-    TextRecord,
+    pub TextRecord,
 );
 
 impl RawLongNamesRecord {
@@ -1098,7 +1107,7 @@ impl VeryLongString {
 
 /// A very long string record as text.
 #[derive(Clone, Debug, Serialize)]
-pub struct RawVeryLongStringsRecord(TextRecord);
+pub struct RawVeryLongStringsRecord(pub TextRecord);
 
 /// A parsed very long string record.
 #[derive(Clone, Debug, Serialize)]
@@ -1761,7 +1770,7 @@ impl RawVariableSetRecord {
 
 /// Raw (text) version of a product info record in a system file.
 #[derive(Clone, Debug, Serialize)]
-pub struct RawProductInfoRecord(TextRecord);
+pub struct RawProductInfoRecord(pub TextRecord);
 
 impl RawProductInfoRecord {
     /// Parses the record from `extension`.
-- 
2.30.2