support case data too rust
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 24 Aug 2025 03:47:50 +0000 (20:47 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 24 Aug 2025 03:47:50 +0000 (20:47 -0700)
rust/pspp/src/data.rs
rust/pspp/src/main.rs
rust/pspp/src/sys/raw.rs

index ea7364face33b831681b21a90798c6b0f7d44d5a..8e52af70358f975b833d9a9583e050ff6ffa354c 100644 (file)
@@ -556,6 +556,15 @@ impl<B> Datum<B> {
         }
     }
 
+    /// Consumes this datum and yields its string payload.  A numeric datum
+    /// has no string, so it yields `None`.
+    pub fn into_string(self) -> Option<B> {
+        match self {
+            Self::String(string) => Some(string),
+            Self::Number(_) => None,
+        }
+    }
+
     /// Returns the [VarType] corresponding to this datum.
     pub fn var_type(&self) -> VarType {
         match self {
index 606733daf5b8efe3be98e5c7543a7bdfa149dc19..966e1cfd30f3325d100e9f7f8eb298b6d58ca451 100644 (file)
@@ -501,8 +501,7 @@ impl Show {
                 }
             }
             Mode::Encodings => {
-                let records: Vec<Record> = reader.records().collect::<Result<Vec<_>, _>>()?;
-                let encoding_report = EncodingReport::new(reader.header(), &records);
+                let encoding_report = EncodingReport::new(reader, self.max_cases)?;
                 output.show(&encoding_report)?;
             }
         }
index e01888a2de4ad0bb26201bb62f1afdc5b40a7208..f28590e6ca9bd0bab216940458902c0cc9aede34 100644 (file)
@@ -20,7 +20,7 @@
 //! raw details.  Most readers will want to use higher-level interfaces.
 
 use crate::{
-    data::{ByteStr, ByteString, Datum, RawCase, RawString},
+    data::{ByteStr, ByteString, Datum, MutRawString, RawCase, RawString},
     endian::{FromBytes, ToBytes},
     identifier::{Error as IdError, Identifier},
     output::{
@@ -2090,99 +2090,159 @@ impl EncodingReportString {
 }
 
 impl EncodingReport {
-    pub fn new(header: &FileHeader<ByteString>, records: &[Record]) -> Self {
-        let (encoding, codepage) = get_encoding_info(&records);
-        let label =
-            encoding.map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None)));
-        let codepage = codepage.map(|codepage| (codepage, get_encoding(None, Some(codepage))));
-        let (inferred_encoding_source, inferred_encoding) = match label
-            .as_ref()
-            .map(|(_string, result)| (EncodingSource::Name, result.clone()))
-            .or(codepage
+    /// Builds an encoding report for the file read through `reader`.
+    ///
+    /// The report is based on the strings in the file header, in the records,
+    /// and in the string data of at most `max_cases` cases.  Case strings that
+    /// are empty after `trim_end` are ignored, and only the first occurrence
+    /// of each distinct string is kept.  Every encoding in `ENCODINGS` that
+    /// decodes all of these strings without replacement (and yields plausible
+    /// identifiers) is a candidate; candidates that decode every string
+    /// identically are grouped together.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if reading the records or any examined case fails.
+    pub fn new<R>(mut reader: Reader<R>, max_cases: u64) -> Result<Self, Error>
+    where
+        R: BufRead + Seek + 'static,
+    {
+        /// Builds the report from a header and records that have already been
+        /// read, plus an iterator over the cases to examine.
+        fn inner(
+            header: FileHeader<ByteString>,
+            records: &[Record],
+            cases: impl Iterator<Item = Result<RawCase, Error>>,
+        ) -> Result<EncodingReport, Error> {
+            let (encoding, codepage) = get_encoding_info(records);
+            let label = encoding
+                .map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None)));
+            let codepage = codepage.map(|codepage| (codepage, get_encoding(None, Some(codepage))));
+            let (inferred_encoding_source, inferred_encoding) = match label
                 .as_ref()
-                .map(|(_codepage, result)| (EncodingSource::Codepage, result.clone())))
-        {
-            Some((source, Ok(encoding))) => (source, Ok(encoding)),
-            Some((source, Err(EncodingError::Ebcdic))) => (source, Err(EncodingError::Ebcdic)),
-            _ => (EncodingSource::Default, Ok(default_encoding())),
-        };
+                .map(|(_string, result)| (EncodingSource::Name, result.clone()))
+                .or(codepage
+                    .as_ref()
+                    .map(|(_codepage, result)| (EncodingSource::Codepage, result.clone())))
+            {
+                Some((source, Ok(encoding))) => (source, Ok(encoding)),
+                Some((source, Err(EncodingError::Ebcdic))) => (source, Err(EncodingError::Ebcdic)),
+                _ => (EncodingSource::Default, Ok(default_encoding())),
+            };
 
-        let mut record_strings = header.get_strings();
-        for record in records {
-            record_strings.append(&mut record.get_strings());
-        }
-        let mut encodings: IndexMap<Vec<String>, Vec<&'static Encoding>> = IndexMap::new();
-        for encoding in ENCODINGS {
-            fn recode_as(
-                record_strings: &[RecordString],
-                encoding: &'static Encoding,
-            ) -> Option<Vec<String>> {
-                let mut output = Vec::with_capacity(record_strings.len());
-                for rs in record_strings {
-                    let mut s = encoding
-                        .decode_without_bom_handling_and_without_replacement(&rs.string.0)?
-                        .into_owned();
-                    s.truncate(s.trim_end().len());
-                    if rs.is_identifier {
-                        Identifier::check_plausible(&s).ok()?;
+            let mut record_strings = header.get_strings();
+            for record in records {
+                record_strings.append(&mut record.get_strings());
+            }
+            // Add the nonempty string values from the cases, titled by their
+            // 1-based case and variable numbers.
+            for (case_number, case) in (1..).zip(cases) {
+                for (variable_number, datum) in (1..).zip(case?.0) {
+                    if let Some(mut string) = datum.into_string() {
+                        string.trim_end();
+                        if !string.is_empty() {
+                            record_strings.push(RecordString::new(
+                                format!("Case {case_number}, Variable {variable_number}"),
+                                string,
+                                false,
+                            ));
+                        }
                     }
-                    output.push(s);
                 }
-                Some(output)
-            }
-            if let Some(strings) = recode_as(&record_strings, encoding) {
-                encodings.entry(strings).or_default().push(encoding);
             }
-        }
 
-        let mut strings = Vec::with_capacity(record_strings.len());
-        if !encodings.is_empty() {
-            for (index, rs) in record_strings.iter().enumerate() {
-                // Skip strings that decode the same way from every encoding.
-                if encodings.keys().map(|strings| &strings[index]).all_equal() {
-                    continue;
+            // Drop repeated strings, keeping the first occurrence of each.
+            let record_strings = record_strings
+                .into_iter()
+                .unique_by(|rs| rs.string.clone())
+                .collect::<Vec<_>>();
+
+            // Group the candidate encodings by how they decode the strings.
+            let mut encodings: IndexMap<Vec<String>, Vec<&'static Encoding>> = IndexMap::new();
+            for encoding in ENCODINGS {
+                /// Decodes every string in `record_strings` using `encoding`,
+                /// dropping trailing whitespace.  Returns `None` if any string
+                /// fails to decode or an identifier decodes implausibly.
+                fn recode_as(
+                    record_strings: &[RecordString],
+                    encoding: &'static Encoding,
+                ) -> Option<Vec<String>> {
+                    let mut output = Vec::with_capacity(record_strings.len());
+                    for rs in record_strings {
+                        let mut s = encoding
+                            .decode_without_bom_handling_and_without_replacement(&rs.string.0)?
+                            .into_owned();
+                        s.truncate(s.trim_end().len());
+                        if rs.is_identifier {
+                            Identifier::check_plausible(&s).ok()?;
+                        }
+                        output.push(s);
+                    }
+                    Some(output)
                 }
-
-                /// Returns an iterator for the decoded strings for the given
-                /// `index`.
-                fn decoded_index<'a>(
-                    encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
-                    index: usize,
-                ) -> impl Iterator<Item = &'a str> {
-                    encodings.keys().map(move |strings| strings[index].as_str())
+                if let Some(strings) = recode_as(&record_strings, encoding) {
+                    encodings.entry(strings).or_default().push(encoding);
                 }
+            }
+
+            let mut strings = Vec::with_capacity(record_strings.len());
+            if !encodings.is_empty() {
+                for (index, rs) in record_strings.iter().enumerate() {
+                    // Skip strings that decode the same way from every encoding.
+                    if encodings.keys().map(|strings| &strings[index]).all_equal() {
+                        continue;
+                    }
+
+                    /// Returns an iterator for the decoded strings for the given
+                    /// `index`.
+                    fn decoded_index<'a>(
+                        encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
+                        index: usize,
+                    ) -> impl Iterator<Item = &'a str> {
+                        encodings.keys().map(move |strings| strings[index].as_str())
+                    }
 
-                let common_prefix: String = decoded_index(&encodings, index)
-                    .reduce(common_prefix)
-                    .unwrap()
-                    .trim_end_matches(|c| c != ' ')
-                    .into();
-                let common_suffix: String = decoded_index(&encodings, index)
-                    .reduce(common_suffix)
-                    .unwrap()
-                    .trim_start_matches(|c| c != ' ')
-                    .into();
-
-                let interpretations = decoded_index(&encodings, index)
-                    .map(|s| s[common_prefix.len()..s.len() - common_suffix.len()].into())
-                    .collect();
-
-                strings.push(EncodingReportString {
-                    name: rs.title.clone(),
-                    common_prefix,
-                    interpretations,
-                    common_suffix,
-                });
+                    let common_prefix: String = decoded_index(&encodings, index)
+                        .reduce(common_prefix)
+                        .unwrap()
+                        .trim_end_matches(|c| c != ' ')
+                        .into();
+                    let common_suffix: String = decoded_index(&encodings, index)
+                        .reduce(common_suffix)
+                        .unwrap()
+                        .trim_start_matches(|c| c != ' ')
+                        .into();
+
+                    // NOTE(review): this slice assumes the trimmed prefix and
+                    // suffix never overlap in any interpretation -- confirm.
+                    let interpretations = decoded_index(&encodings, index)
+                        .map(|s| s[common_prefix.len()..s.len() - common_suffix.len()].into())
+                        .collect();
+
+                    strings.push(EncodingReportString {
+                        name: rs.title.clone(),
+                        common_prefix,
+                        interpretations,
+                        common_suffix,
+                    });
+                }
             }
+            Ok(EncodingReport {
+                valid_encodings: encodings.values().cloned().collect(),
+                strings,
+                name: label,
+                codepage,
+                inferred_encoding,
+                inferred_encoding_source,
+            })
         }
-        EncodingReport {
-            valid_encodings: encodings.values().cloned().collect(),
-            strings,
-            name: label,
-            codepage,
-            inferred_encoding,
-            inferred_encoding_source,
-        }
+
+        let records: Vec<Record> = reader.records().collect::<Result<Vec<_>, _>>()?;
+        let header = reader.header().clone();
+        // Saturate rather than cast with `as`, which would silently truncate
+        // the limit on targets where `usize` is narrower than `u64`.
+        let max_cases = usize::try_from(max_cases).unwrap_or(usize::MAX);
+        inner(header, &records, reader.cases().take(max_cases))
     }
 }