work
author: Ben Pfaff <blp@cs.stanford.edu>
Tue, 12 Aug 2025 21:33:32 +0000 (14:33 -0700)
committer: Ben Pfaff <blp@cs.stanford.edu>
Tue, 12 Aug 2025 21:33:32 +0000 (14:33 -0700)
rust/pspp/src/data.rs
rust/pspp/src/output/pivot/mod.rs
rust/pspp/src/sys/raw.rs
rust/pspp/src/sys/raw/records.rs

index 60c46752db13c51ac23b5269ee8a7ac2f155f55b..ea7364face33b831681b21a90798c6b0f7d44d5a 100644 (file)
@@ -280,6 +280,18 @@ impl From<&[u8]> for ByteString {
     }
 }
 
+/// Converts a borrowed [ByteString] into an owned one by cloning it, so that
+/// a `&ByteString` can be passed where `impl Into<ByteString>` is expected.
+impl From<&ByteString> for ByteString {
+    fn from(value: &ByteString) -> Self {
+        Clone::clone(value)
+    }
+}
+
+/// Builds an owned [ByteString] from the bytes that `raw_string_bytes()`
+/// returns for a borrowed [ByteStrArray].
+impl<const N: usize> From<&ByteStrArray<N>> for ByteString {
+    fn from(value: &ByteStrArray<N>) -> Self {
+        let bytes = value.raw_string_bytes();
+        bytes.into()
+    }
+}
+
 impl<const N: usize> From<[u8; N]> for ByteString {
     fn from(value: [u8; N]) -> Self {
         value.as_slice().into()
index 37ad5929da59f3766befca0a62eb4031bbcee716..99d727107058d6c5f37822a50a1b03952d8bc164 100644 (file)
@@ -424,13 +424,14 @@ impl Group {
         }
     }
 
+    /// Appends `child` to this group and returns `self.len - 1`, evaluated
+    /// after the child's own length has been added to `self.len`.
+    ///
+    /// If `child` is itself a group, its label is forced to be shown.
+    ///
+    /// NOTE(review): for a child of length 1 the return value looks like the
+    /// index of the newly added leaf; for a nested group it would be the
+    /// index of that group's last leaf -- confirm against callers.
-    pub fn push(&mut self, child: impl Into<Category>) {
+    pub fn push(&mut self, child: impl Into<Category>) -> usize {
         let mut child = child.into();
         if let Category::Group(group) = &mut child {
             group.show_label = true;
         }
         self.len += child.len();
         self.children.push(child);
+        self.len - 1
     }
 
     pub fn with(mut self, child: impl Into<Category>) -> Self {
@@ -640,6 +641,18 @@ impl From<&str> for Category {
     }
 }
 
+/// Turns an owned string into a leaf category whose name is a text [Value]
+/// built from that string.
+impl From<String> for Category {
+    fn from(name: String) -> Self {
+        let label = Value::new_text(name);
+        Category::Leaf(Leaf::new(label))
+    }
+}
+
+/// Turns a borrowed string into a leaf category whose name is a text [Value]
+/// built from that string.
+impl From<&String> for Category {
+    fn from(name: &String) -> Self {
+        let label = Value::new_text(name);
+        Category::Leaf(Leaf::new(label))
+    }
+}
+
 /// Styling for a pivot table.
 ///
 /// The division between this and the style information in [PivotTable] seems
@@ -1374,20 +1387,20 @@ impl PivotTable {
         self
     }
 
+    /// Returns this table with its caption set to `caption` (converted into
+    /// a [Value]) and with `show_caption` turned on.
-    pub fn with_caption(mut self, caption: Value) -> Self {
-        self.caption = Some(Box::new(caption));
+    pub fn with_caption(mut self, caption: impl Into<Value>) -> Self {
+        self.caption = Some(Box::new(caption.into()));
         self.show_caption = true;
         self
     }
 
+    /// Returns this table with its corner text set to `corner_text`
+    /// (converted into a [Value]).
-    pub fn with_corner_text(mut self, corner_text: Value) -> Self {
-        self.corner_text = Some(Box::new(corner_text));
+    pub fn with_corner_text(mut self, corner_text: impl Into<Value>) -> Self {
+        self.corner_text = Some(Box::new(corner_text.into()));
         self
     }
 
+    /// Returns this table with its subtype set to `subtype` (converted into
+    /// a [Value]); every other field is carried over unchanged.
-    pub fn with_subtype(self, subtype: Value) -> Self {
+    pub fn with_subtype(self, subtype: impl Into<Value>) -> Self {
         Self {
-            subtype: Some(Box::new(subtype)),
+            subtype: Some(Box::new(subtype.into())),
             ..self
         }
     }
@@ -1516,10 +1529,10 @@ where
 }
 
 impl PivotTable {
-    pub fn new(dimensions_and_axes: impl IntoIterator<Item = (Axis3, Dimension)>) -> Self {
+    pub fn new(axes_and_dimensions: impl IntoIterator<Item = (Axis3, Dimension)>) -> Self {
         let mut dimensions = Vec::new();
         let mut axes = EnumMap::<Axis3, Axis>::default();
-        for (axis, dimension) in dimensions_and_axes {
+        for (axis, dimension) in axes_and_dimensions {
             axes[axis].dimensions.push(dimensions.len());
             dimensions.push(dimension);
         }
index 5b39156691cd19abeb4e27613adeb139d0b74aef..f6ebfd406b504daadbd49743d0bf6745f6ef2eab 100644 (file)
 //! raw details.  Most readers will want to use higher-level interfaces.
 
 use crate::{
-    data::{ByteStr, ByteString, Datum, RawCase},
-    variable::{VarType, VarWidth},
+    data::{ByteStr, ByteString, Datum, RawCase, RawString},
     endian::{FromBytes, ToBytes},
     identifier::{Error as IdError, Identifier},
+    output::pivot::{Axis3, Dimension, Group, PivotTable, Value},
     sys::{
         encoding::{default_encoding, get_encoding, Error as EncodingError},
         raw::records::{
@@ -41,11 +41,20 @@ use crate::{
             ZlibTrailerWarning,
         },
     },
+    variable::{VarType, VarWidth},
 };
 
 use binrw::Endian;
-use encoding_rs::Encoding;
+use encoding_rs::{
+    Encoding, BIG5, EUC_JP, EUC_KR, GB18030, IBM866, ISO_2022_JP, ISO_8859_10, ISO_8859_13,
+    ISO_8859_14, ISO_8859_16, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5, ISO_8859_6,
+    ISO_8859_7, ISO_8859_8, KOI8_R, KOI8_U, MACINTOSH, SHIFT_JIS, UTF_8, WINDOWS_1250,
+    WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1255, WINDOWS_1256,
+    WINDOWS_1257, WINDOWS_1258, WINDOWS_874,
+};
 use flate2::bufread::ZlibDecoder;
+use indexmap::IndexMap;
+use itertools::{EitherOrBoth, Itertools};
 use serde::Serialize;
 use smallvec::SmallVec;
 use std::{
@@ -544,6 +553,97 @@ impl Record {
             _ => None,
         }
     }
+
+    /// Collects the text strings carried by this record, each paired with a
+    /// title that says what the string is used for.
+    ///
+    /// Strings that are names (variable names, multiple response set names)
+    /// are flagged as identifiers; everything else is not.  Record types that
+    /// are listed explicitly at the end contribute no strings.
+    pub fn get_strings(&self) -> Vec<RecordString> {
+        let mut found: Vec<RecordString> = Vec::new();
+        match self {
+            Record::Variable(variable) => {
+                found.push(RecordString::new("Variable Name", &variable.name, true));
+                if let Some(label) = &variable.label {
+                    found.push(RecordString::new("Variable Label", label, false));
+                }
+                // Only missing values that are strings are reported.
+                found.extend(
+                    variable
+                        .missing_values
+                        .values
+                        .iter()
+                        .filter_map(|value| value.as_string())
+                        .map(|string| RecordString::new("Missing Value", string, false)),
+                );
+            }
+            Record::ValueLabel(value_labels) => {
+                found.extend(
+                    value_labels
+                        .labels
+                        .iter()
+                        .map(|label| RecordString::new("Value Label", &label.label, false)),
+                );
+            }
+            Record::Document(document) => {
+                // Document lines are numbered starting from 1 in their titles.
+                found.extend(document.lines.iter().zip(1..).map(|(line, index)| {
+                    RecordString::new(format!("Document Line {index}"), line, false)
+                }));
+            }
+            Record::MultipleResponse(multiple_response) => {
+                for set in &multiple_response.sets {
+                    found.push(RecordString::new(
+                        "Multiple Response Set Name",
+                        &set.name,
+                        true,
+                    ));
+                    if !set.label.is_empty() {
+                        found.push(RecordString::new(
+                            "Multiple Response Set Label",
+                            &set.label,
+                            false,
+                        ));
+                    }
+                    // Only multiple-dichotomy sets carry a counted value.
+                    if let records::MultipleResponseType::MultipleDichotomy { value, .. } =
+                        &set.mr_type
+                    {
+                        found.push(RecordString::new(
+                            "Multiple Response Set Counted Value",
+                            value,
+                            false,
+                        ));
+                    }
+                }
+            }
+            Record::LongStringValueLabels(long_string_labels) => {
+                found.extend(
+                    long_string_labels
+                        .labels
+                        .iter()
+                        .flat_map(|labels| labels.labels.iter())
+                        .map(|(_value, label)| RecordString::new("Value Label", label, false)),
+                );
+            }
+            Record::ProductInfo(product_info) => {
+                found.push(RecordString::new(
+                    "Extra Product Info",
+                    &product_info.0.text,
+                    false,
+                ));
+            }
+            // Listed one by one (rather than `_`) so that adding a new record
+            // type forces a decision here.
+            Record::IntegerInfo(_)
+            | Record::FloatInfo(_)
+            | Record::VarDisplay(_)
+            | Record::LongStringMissingValues(_)
+            | Record::Encoding(_)
+            | Record::NumberOfCases(_)
+            | Record::VariableSets(_)
+            | Record::LongNames(_)
+            | Record::VeryLongStrings(_)
+            | Record::FileAttributes(_)
+            | Record::VariableAttributes(_)
+            | Record::OtherExtension(_)
+            | Record::EndOfHeaders(_)
+            | Record::ZHeader(_)
+            | Record::ZTrailer(_) => (),
+        }
+        found
+    }
 }
 
 /// A [Record] that has been decoded to a more usable form.
@@ -1695,3 +1795,226 @@ impl VarTypes {
         self.types.iter().flatten().count()
     }
 }
+
+/// A byte string taken from a record, together with a description of its
+/// role.
+pub struct RecordString {
+    /// Human-readable description of what the string is, e.g. "Variable
+    /// Name".
+    pub title: String,
+    /// The string's bytes, not yet decoded.
+    pub string: ByteString,
+    /// Whether the string is supposed to be an identifier.
+    pub is_identifier: bool,
+}
+
+impl RecordString {
+    /// Creates a `RecordString` from anything convertible into its `title`
+    /// and `string` fields, plus the `is_identifier` flag.
+    pub fn new(
+        title: impl Into<String>,
+        string: impl Into<ByteString>,
+        is_identifier: bool,
+    ) -> Self {
+        let title = title.into();
+        let string = string.into();
+        Self {
+            title,
+            string,
+            is_identifier,
+        }
+    }
+}
+
+/// The candidate encodings that [EncodingReport::new] tries, in this order.
+///
+/// The order matters: encodings that decode the strings identically are
+/// grouped in the order in which they are first encountered here.
+static ENCODINGS: [&Encoding; 32] = [
+    UTF_8,
+    WINDOWS_1252,
+    ISO_8859_2,
+    ISO_8859_3,
+    ISO_8859_4,
+    ISO_8859_5,
+    ISO_8859_6,
+    ISO_8859_7,
+    ISO_8859_8,
+    ISO_8859_10,
+    ISO_8859_13,
+    ISO_8859_14,
+    ISO_8859_16,
+    MACINTOSH,
+    WINDOWS_874,
+    WINDOWS_1250,
+    WINDOWS_1251,
+    WINDOWS_1253,
+    WINDOWS_1254,
+    WINDOWS_1255,
+    WINDOWS_1256,
+    WINDOWS_1257,
+    WINDOWS_1258,
+    KOI8_R,
+    KOI8_U,
+    IBM866,
+    GB18030,
+    BIG5,
+    EUC_JP,
+    ISO_2022_JP,
+    SHIFT_JIS,
+    EUC_KR,
+];
+
+/// A report on which of the candidate [ENCODINGS] can decode a set of
+/// [RecordString]s, and on how their interpretations differ.
+pub struct EncodingReport {
+    /// One row per distinct interpretation of the strings, listing the names
+    /// of the encodings that produce it.
+    pub valid_encodings: PivotTable,
+    /// The strings that are not decoded the same way by every valid encoding,
+    /// with each interpretation, or `None` if there is no such string.
+    pub interpretations: Option<PivotTable>,
+}
+
+impl EncodingReport {
+    /// Tries to decode `record_strings` with every encoding in [ENCODINGS] and
+    /// reports the results.
+    ///
+    /// An encoding is valid if it decodes every string without error and
+    /// every string flagged as an identifier passes
+    /// `Identifier::check_plausible` after decoding.  Encodings that yield
+    /// exactly the same decoded strings are grouped together.
+    ///
+    /// Returns `None` if no encoding is valid.
+    pub fn new(record_strings: &[RecordString]) -> Option<Self> {
+        /// Decodes all of `record_strings` using `encoding`, dropping
+        /// trailing whitespace.  Returns `None` if any string fails to decode
+        /// or if a string flagged as an identifier is not plausible as one.
+        fn recode_as(
+            record_strings: &[RecordString],
+            encoding: &'static Encoding,
+        ) -> Option<Vec<String>> {
+            let mut output = Vec::with_capacity(record_strings.len());
+            for rs in record_strings {
+                let mut s = encoding
+                    .decode_without_bom_handling_and_without_replacement(&rs.string.0)?
+                    .into_owned();
+                s.truncate(s.trim_end().len());
+                if rs.is_identifier {
+                    Identifier::check_plausible(&s).ok()?;
+                }
+                output.push(s);
+            }
+            Some(output)
+        }
+
+        /// Returns an iterator for the decoded strings for the given
+        /// `index`.
+        fn decoded_index<'a>(
+            encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
+            index: usize,
+        ) -> impl Iterator<Item = &'a str> {
+            encodings.keys().map(move |strings| strings[index].as_str())
+        }
+
+        // Maps each distinct interpretation of the strings to the encodings
+        // that produce it, in the order the encodings are tried.
+        let mut encodings: IndexMap<Vec<String>, Vec<&'static Encoding>> = IndexMap::new();
+        for encoding in ENCODINGS {
+            if let Some(strings) = recode_as(record_strings, encoding) {
+                encodings.entry(strings).or_default().push(encoding);
+            }
+        }
+        if encodings.is_empty() {
+            return None;
+        }
+
+        let numbers = Group::new("#").with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
+        let valid_encodings = PivotTable::new([(Axis3::Y, Dimension::new(numbers))]).with_data(
+            encodings
+                .values()
+                .map(|encodings| {
+                    Value::new_user_text(encodings.iter().map(|e| e.name()).join(", "))
+                })
+                .enumerate()
+                .map(|(index, datum)| ([index], datum)),
+        );
+
+        let mut purposes = Group::new("Purpose").with_label_shown();
+        let mut data = Vec::new();
+        for (index, rs) in record_strings.iter().enumerate() {
+            // Skip strings that decode the same way from every encoding.
+            if encodings.keys().map(|strings| &strings[index]).all_equal() {
+                continue;
+            }
+
+            // Length of the prefix shared by every interpretation, cut back
+            // so that it ends just after a space (or is empty).
+            let prefix_len = decoded_index(&encodings, index)
+                .reduce(common_prefix)
+                .expect("at least one encoding is valid")
+                .trim_end_matches(|c: char| c != ' ')
+                .len();
+
+            // The shared prefix and the shared suffix are found independently,
+            // so in the shortest interpretation they can overlap (e.g. "x y"
+            // versus "x y y").  Limit the suffix to the bytes that remain
+            // after the prefix, so that the slice below never has its start
+            // past its end.
+            let shortest = decoded_index(&encodings, index)
+                .map(str::len)
+                .min()
+                .expect("at least one encoding is valid");
+            let max_suffix_len = shortest - prefix_len;
+            let mut suffix = decoded_index(&encodings, index)
+                .reduce(common_suffix)
+                .expect("at least one encoding is valid");
+            while suffix.len() > max_suffix_len {
+                // Drop whole characters so the suffix stays on a character
+                // boundary.
+                let mut chars = suffix.chars();
+                chars.next();
+                suffix = chars.as_str();
+            }
+            // Cut the suffix forward so that it starts at a space (or is
+            // empty).
+            let suffix_len = suffix.trim_start_matches(|c: char| c != ' ').len();
+
+            let purpose = purposes.push(&rs.title);
+
+            for (j, s) in decoded_index(&encodings, index).enumerate() {
+                let s = &s[prefix_len..s.len() - suffix_len];
+                // Room for the text plus an ellipsis on each side.
+                let mut entry = String::with_capacity(s.len() + 6);
+                if prefix_len > 0 {
+                    entry.push_str("...");
+                }
+                entry.push_str(s);
+                if suffix_len > 0 {
+                    entry.push_str("...");
+                }
+                data.push(([0, j, purpose], Value::new_user_text(entry)));
+            }
+        }
+        // NOTE(review): this dimension numbers the interpretations, yet it is
+        // titled "Text" like the X dimension below, whereas the first table
+        // uses "#" -- confirm that the title is intended.
+        let number = Group::new("Text")
+            .with_label_shown()
+            .with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
+        let interpretations = if !data.is_empty() {
+            Some(
+                PivotTable::new([
+                    (Axis3::X, Dimension::new(Group::new("Text").with("Text"))),
+                    (Axis3::Y, Dimension::new(number)),
+                    (Axis3::Y, Dimension::new(purposes)),
+                ])
+                .with_title("Alternate Encoded Text Strings")
+                .with_caption("Text strings in the file dictionary that the previously listed encodings interpret differently, along with the interpretations.")
+                .with_data(data),
+            )
+        } else {
+            None
+        };
+        Some(Self {
+            valid_encodings,
+            interpretations,
+        })
+    }
+}
+
+/// Returns the longest prefix of `a` that is also a prefix of `b`, compared
+/// character by character.
+fn common_prefix<'a>(a: &'a str, b: &'a str) -> &'a str {
+    let shared_len = a
+        .char_indices()
+        .zip(b.chars())
+        // The byte offset in `a` of the first character that differs...
+        .find(|&((_, a_char), b_char)| a_char != b_char)
+        // ...or, if one string is a prefix of the other, the length of the
+        // shorter one.
+        .map_or_else(|| a.len().min(b.len()), |((offset, _), _)| offset);
+    &a[..shared_len]
+}
+
+/// Returns the longest suffix shared by `a` and `b`, compared character by
+/// character from the end.
+///
+/// The result borrows from `a`, except when all of `a` is a proper suffix of
+/// `b`, in which case it borrows the equal text from the end of `b`.
+fn common_suffix<'a>(a: &'a str, b: &'a str) -> &'a str {
+    let a_backward = a.char_indices().rev();
+    let b_backward = b.char_indices().rev();
+    for pair in a_backward.zip_longest(b_backward) {
+        // Find the first character (from the end) that is not shared, along
+        // with the string it belongs to; the shared suffix starts right
+        // after it.
+        let (source, offset, unshared) = match pair {
+            EitherOrBoth::Both((_, a_char), (_, b_char)) if a_char == b_char => continue,
+            EitherOrBoth::Both((offset, a_char), _) | EitherOrBoth::Left((offset, a_char)) => {
+                (a, offset, a_char)
+            }
+            EitherOrBoth::Right((offset, b_char)) => (b, offset, b_char),
+        };
+        return &source[offset + unshared.len_utf8()..];
+    }
+    a
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::sys::raw::{common_prefix, common_suffix};
+
+    /// Checks `common_prefix` with one string a prefix of the other (in
+    /// either order), equal strings, and empty strings.
+    #[test]
+    fn test_common_prefix() {
+        let cases = [
+            ("abc", "abcxyzzy", "abc"),
+            ("abcxyzzy", "abc", "abc"),
+            ("abc", "abc", "abc"),
+            ("", "", ""),
+        ];
+        for (a, b, expected) in cases {
+            assert_eq!(common_prefix(a, b), expected);
+        }
+    }
+
+    /// Checks `common_suffix` with one string a suffix of the other (in
+    /// either order), equal strings, and empty strings.
+    #[test]
+    fn test_common_suffix() {
+        let cases = [
+            ("xyzzyabc", "abc", "abc"),
+            ("abc", "xyzzyabc", "abc"),
+            ("abc", "abc", "abc"),
+            ("", "", ""),
+        ];
+        for (a, b, expected) in cases {
+            assert_eq!(common_suffix(a, b), expected);
+        }
+    }
+}
index 18b76129bed3b1a41066f9e1e36b102f89a2b330..e893dc79aecf277b7da7c4f64148b485e7d396a6 100644 (file)
@@ -20,7 +20,7 @@ use crate::{
     sys::{
         raw::{
             read_bytes, read_string, read_vec, Decoder, Error, ErrorDetails, Magic, RawDatum,
-            RawWidth, Record, UntypedDatum, VarTypes, Warning, WarningDetails,
+            RawWidth, Record, RecordString, UntypedDatum, VarTypes, Warning, WarningDetails,
         },
         serialize_endian, ProductVersion,
     },
@@ -246,6 +246,13 @@ impl FileHeader<ByteString> {
             endian: self.endian,
         }
     }
+
+    /// Returns the text strings in the file header: the product name and the
+    /// file label, in that order.  Neither is flagged as an identifier.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the eye-catcher is shorter than 5 bytes.
+    pub fn get_strings(&self) -> Vec<RecordString> {
+        // The first 5 bytes of the eye-catcher are left out of the product
+        // string -- presumably a fixed marker; confirm against the file
+        // format.
+        let product = RecordString::new("Product", &self.eye_catcher.0[5..], false);
+        let file_label = RecordString::new("File Label", &self.file_label, false);
+        vec![product, file_label]
+    }
 }
 
 /// [Format] as represented in a system file.
@@ -999,7 +1006,7 @@ pub struct FloatInfoRecord {
 #[derive(Clone, Debug, Serialize)]
 pub struct RawLongNamesRecord(
     /// Text contents of record.
-    TextRecord,
+    pub TextRecord,
 );
 
 impl RawLongNamesRecord {
@@ -1098,7 +1105,7 @@ impl VeryLongString {
 
 /// A very long string record as text.
 #[derive(Clone, Debug, Serialize)]
-pub struct RawVeryLongStringsRecord(TextRecord);
+pub struct RawVeryLongStringsRecord(pub TextRecord);
 
 /// A parsed very long string record.
 #[derive(Clone, Debug, Serialize)]
@@ -1761,7 +1768,7 @@ impl RawVariableSetRecord {
 
 /// Raw (text) version of a product info record in a system file.
 #[derive(Clone, Debug, Serialize)]
-pub struct RawProductInfoRecord(TextRecord);
+pub struct RawProductInfoRecord(pub TextRecord);
 
 impl RawProductInfoRecord {
     /// Parses the record from `extension`.