//! raw details. Most readers will want to use higher-level interfaces.
use crate::{
- data::{ByteStr, ByteString, Datum, RawCase, RawString},
+ data::{ByteStr, ByteString, Datum, MutRawString, RawCase, RawString},
endian::{FromBytes, ToBytes},
identifier::{Error as IdError, Identifier},
output::{
}
impl EncodingReport {
- pub fn new(header: &FileHeader<ByteString>, records: &[Record]) -> Self {
- let (encoding, codepage) = get_encoding_info(&records);
- let label =
- encoding.map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None)));
- let codepage = codepage.map(|codepage| (codepage, get_encoding(None, Some(codepage))));
- let (inferred_encoding_source, inferred_encoding) = match label
- .as_ref()
- .map(|(_string, result)| (EncodingSource::Name, result.clone()))
- .or(codepage
/// Builds an encoding report from `reader`.
///
/// Collects every record from `reader`, clones its header, and then
/// examines at most `max_cases` cases for string data, in addition to the
/// strings found in the header and the records.
///
/// # Errors
///
/// Returns an error if collecting the records fails or if one of the
/// examined cases is an `Err`.
+ pub fn new<R>(mut reader: Reader<R>, max_cases: u64) -> Result<Self, Error>
+ where
+ R: BufRead + Seek + 'static,
+ {
// Does the actual work once the header, the records, and an iterator
// over the cases have been obtained from the reader.
+ fn inner(
+ header: FileHeader<ByteString>,
+ records: &[Record],
+ cases: impl Iterator<Item = Result<RawCase, Error>>,
+ ) -> Result<EncodingReport, Error> {
// Encoding name and codepage as reported by `get_encoding_info`, each
// paired with the result of resolving it through `get_encoding`.
+ let (encoding, codepage) = get_encoding_info(&records);
+ let label = encoding
+ .map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None)));
+ let codepage = codepage.map(|codepage| (codepage, get_encoding(None, Some(codepage))));
// Prefer the named encoding and consult the codepage only when no name
// is present. Only an `Ok` result or an `EncodingError::Ebcdic` error is
// kept; every other outcome, including a name that failed to resolve for
// another reason, falls back to `default_encoding()`.
+ let (inferred_encoding_source, inferred_encoding) = match label
.as_ref()
- .map(|(_codepage, result)| (EncodingSource::Codepage, result.clone())))
- {
- Some((source, Ok(encoding))) => (source, Ok(encoding)),
- Some((source, Err(EncodingError::Ebcdic))) => (source, Err(EncodingError::Ebcdic)),
- _ => (EncodingSource::Default, Ok(default_encoding())),
- };
+ .map(|(_string, result)| (EncodingSource::Name, result.clone()))
+ .or(codepage
+ .as_ref()
+ .map(|(_codepage, result)| (EncodingSource::Codepage, result.clone())))
+ {
+ Some((source, Ok(encoding))) => (source, Ok(encoding)),
+ Some((source, Err(EncodingError::Ebcdic))) => (source, Err(EncodingError::Ebcdic)),
+ _ => (EncodingSource::Default, Ok(default_encoding())),
+ };
- let mut record_strings = header.get_strings();
- for record in records {
- record_strings.append(&mut record.get_strings());
- }
- let mut encodings: IndexMap<Vec<String>, Vec<&'static Encoding>> = IndexMap::new();
- for encoding in ENCODINGS {
- fn recode_as(
- record_strings: &[RecordString],
- encoding: &'static Encoding,
- ) -> Option<Vec<String>> {
- let mut output = Vec::with_capacity(record_strings.len());
- for rs in record_strings {
- let mut s = encoding
- .decode_without_bom_handling_and_without_replacement(&rs.string.0)?
- .into_owned();
- s.truncate(s.trim_end().len());
- if rs.is_identifier {
- Identifier::check_plausible(&s).ok()?;
// Gather the strings from the header and from each record.
+ let mut record_strings = header.get_strings();
+ for record in records {
+ record_strings.append(&mut record.get_strings());
+ }
// Add the string data found in the cases. Cases and variables are
// numbered from 1 for the titles. Each string has `trim_end` applied and
// is skipped if it is then empty. A case that is an `Err` aborts the
// whole report through `?`.
+ for (case_number, case) in (1..).zip(cases) {
+ for (variable_number, datum) in (1..).zip(case?.0) {
+ if let Some(mut string) = datum.into_string() {
+ string.trim_end();
+ if !string.is_empty() {
+ record_strings.push(RecordString::new(
+ format!("Case {case_number}, Variable {variable_number}"),
+ string,
+ false,
+ ));
+ }
}
- output.push(s);
}
- Some(output)
- }
- if let Some(strings) = recode_as(&record_strings, encoding) {
- encodings.entry(strings).or_default().push(encoding);
}
- }
- let mut strings = Vec::with_capacity(record_strings.len());
- if !encodings.is_empty() {
- for (index, rs) in record_strings.iter().enumerate() {
- // Skip strings that decode the same way from every encoding.
- if encodings.keys().map(|strings| &strings[index]).all_equal() {
- continue;
// Keep only one entry per distinct raw string.
+ let record_strings = record_strings
+ .into_iter()
+ .unique_by(|rs| rs.string.clone())
+ .collect::<Vec<_>>();
+
// Try every candidate in `ENCODINGS`, grouping together the encodings
// that decode all of the strings to exactly the same text.
+ let mut encodings: IndexMap<Vec<String>, Vec<&'static Encoding>> = IndexMap::new();
+ for encoding in ENCODINGS {
// Decodes every string with `encoding`, dropping trailing whitespace
// from each result. Yields `None` if any string cannot be decoded
// without replacement or if a decoded identifier fails
// `Identifier::check_plausible`.
+ fn recode_as(
+ record_strings: &[RecordString],
+ encoding: &'static Encoding,
+ ) -> Option<Vec<String>> {
+ let mut output = Vec::with_capacity(record_strings.len());
+ for rs in record_strings {
+ let mut s = encoding
+ .decode_without_bom_handling_and_without_replacement(&rs.string.0)?
+ .into_owned();
+ s.truncate(s.trim_end().len());
+ if rs.is_identifier {
+ Identifier::check_plausible(&s).ok()?;
+ }
+ output.push(s);
+ }
+ Some(output)
}
-
- /// Returns an iterator for the decoded strings for the given
- /// `index`.
- fn decoded_index<'a>(
- encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
- index: usize,
- ) -> impl Iterator<Item = &'a str> {
- encodings.keys().map(move |strings| strings[index].as_str())
+ if let Some(strings) = recode_as(&record_strings, encoding) {
+ encodings.entry(strings).or_default().push(encoding);
}
+ }
+
// Describe each string whose decodings differ between the groups of
// encodings. Nothing is reported when no encoding decoded everything.
+ let mut strings = Vec::with_capacity(record_strings.len());
+ if !encodings.is_empty() {
+ for (index, rs) in record_strings.iter().enumerate() {
+ // Skip strings that decode the same way from every encoding.
+ if encodings.keys().map(|strings| &strings[index]).all_equal() {
+ continue;
+ }
+
+ /// Returns an iterator for the decoded strings for the given
+ /// `index`.
+ fn decoded_index<'a>(
+ encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
+ index: usize,
+ ) -> impl Iterator<Item = &'a str> {
+ encodings.keys().map(move |strings| strings[index].as_str())
+ }
- let common_prefix: String = decoded_index(&encodings, index)
- .reduce(common_prefix)
- .unwrap()
- .trim_end_matches(|c| c != ' ')
- .into();
- let common_suffix: String = decoded_index(&encodings, index)
- .reduce(common_suffix)
- .unwrap()
- .trim_start_matches(|c| c != ' ')
- .into();
-
- let interpretations = decoded_index(&encodings, index)
- .map(|s| s[common_prefix.len()..s.len() - common_suffix.len()].into())
- .collect();
-
- strings.push(EncodingReportString {
- name: rs.title.clone(),
- common_prefix,
- interpretations,
- common_suffix,
- });
// Split each decoding into the text shared by all decodings (a prefix
// and a suffix, each cut back to a space boundary) and the middle part
// that differs. The `unwrap` calls rely on `encodings` being non-empty,
// which the enclosing `if` checks.
+ let common_prefix: String = decoded_index(&encodings, index)
+ .reduce(common_prefix)
+ .unwrap()
+ .trim_end_matches(|c| c != ' ')
+ .into();
+ let common_suffix: String = decoded_index(&encodings, index)
+ .reduce(common_suffix)
+ .unwrap()
+ .trim_start_matches(|c| c != ' ')
+ .into();
+
+ let interpretations = decoded_index(&encodings, index)
+ .map(|s| s[common_prefix.len()..s.len() - common_suffix.len()].into())
+ .collect();
+
+ strings.push(EncodingReportString {
+ name: rs.title.clone(),
+ common_prefix,
+ interpretations,
+ common_suffix,
+ });
+ }
}
+ Ok(EncodingReport {
+ valid_encodings: encodings.values().cloned().collect(),
+ strings,
+ name: label,
+ codepage,
+ inferred_encoding,
+ inferred_encoding_source,
+ })
}
- EncodingReport {
- valid_encodings: encodings.values().cloned().collect(),
- strings,
- name: label,
- codepage,
- inferred_encoding,
- inferred_encoding_source,
- }
+
// Collect all of the records, stopping at the first error.
+ let records: Vec<Record> = reader.records().collect::<Result<Vec<_>, _>>()?;
+ let header = reader.header().clone();
// NOTE(review): `max_cases as usize` truncates on targets where `usize`
// is narrower than `u64` - confirm whether that matters for callers.
+ inner(header, &records, reader.cases().take(max_cases as usize))
}
}