use crate::locale_charset::locale_charset;
use encoding_rs::{Encoding, UTF_8};
+use serde::Serialize;
use thiserror::Error as ThisError;
include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
}
/// An error or warning related to encodings.
-#[derive(Clone, ThisError, Debug, PartialEq, Eq)]
+#[derive(Clone, ThisError, Debug, PartialEq, Eq, Serialize)]
+#[serde(rename_all = "snake_case")]
pub enum Error {
/// Warning that the system file doesn't indicate its own encoding.
#[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
encoding: Option<&str>,
character_code: Option<i32>,
) -> Result<&'static Encoding, Error> {
- let label = if let Some(encoding) = encoding {
- encoding
- } else if let Some(codepage) = character_code {
- match codepage {
- 1 => return Err(Error::Ebcdic),
- 2 | 3 => {
- // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
- // respectively. However, many files have character code 2 but
- // data which are clearly not ASCII. Therefore, ignore these
- // values.
- return Err(Error::NoEncoding);
- }
- 4 => "MS_KANJI",
- _ => CODEPAGE_NUMBER_TO_NAME
+ fn inner(label: &str) -> Result<&'static Encoding, Error> {
+ Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
+ }
+
+ match (encoding, character_code) {
+ (Some(encoding), _) => inner(encoding),
+ (None, Some(1)) => Err(Error::Ebcdic),
+ (None, Some(2 | 3)) => {
+ // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
+ // respectively. However, many files have character code 2 but
+ // data which are clearly not ASCII. Therefore, ignore these
+ // values.
+ Err(Error::NoEncoding)
+ }
+ (None, Some(4)) => inner("MS_KANJI"),
+ (None, Some(codepage)) => inner(
+ CODEPAGE_NUMBER_TO_NAME
.get(&codepage)
.copied()
.ok_or(Error::UnknownCodepage(codepage))?,
- }
- } else {
- return Err(Error::NoEncoding);
- };
-
- Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
+ ),
+ (None, None) => Err(Error::NoEncoding),
+ }
}
#[cfg(test)]
data::{ByteStr, ByteString, Datum, RawCase, RawString},
endian::{FromBytes, ToBytes},
identifier::{Error as IdError, Identifier},
- output::pivot::{Axis3, Dimension, Group, PivotTable, Value},
+ output::{
+ pivot::{Axis3, Dimension, Group, PivotTable, Value},
+ Details, Item, Text,
+ },
sys::{
encoding::{default_encoding, get_encoding, Error as EncodingError},
raw::records::{
EUC_KR,
];
+/// Identifies which piece of information determined the overall encoding
+/// reported for a system file.
+///
+/// Serialized in `snake_case`, which yields the same strings that `as_str`
+/// returns.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum EncodingSource {
+    /// The overall result came from the encoding name recorded in the file.
+    Name,
+
+    /// The overall result came from the code page number recorded in the
+    /// file.
+    Codepage,
+
+    /// Neither the name nor the code page yielded a usable result, so the
+    /// default encoding was chosen.
+    Default,
+}
+
+impl EncodingSource {
+    /// Returns the lowercase, human-readable name of this source, as shown in
+    /// the "Overall" row of the encoding metadata table.
+    fn as_str(&self) -> &'static str {
+        match *self {
+            Self::Name => "name",
+            Self::Codepage => "codepage",
+            Self::Default => "default",
+        }
+    }
+}
+
+#[derive(Serialize)]
pub struct EncodingReport {
- pub valid_encodings: PivotTable,
- pub interpretations: Option<PivotTable>,
+ /// If the file includes a record that names its encoding, then this is that
+ /// name paired with how PSPP interprets it: an encoding, or an error.
+ pub name: Option<(String, Result<&'static Encoding, EncodingError>)>,
+
+ /// If the file includes a record that identifies its encoding as a code
+ /// page number, then this is that number paired with how PSPP interprets
+ /// it: an encoding, or an error.
+ pub codepage: Option<(i32, Result<&'static Encoding, EncodingError>)>,
+
+ /// The overall encoding chosen, or the error that prevented choosing one.
+ pub inferred_encoding: Result<&'static Encoding, EncodingError>,
+
+ /// Which source (name, codepage, or default) determined the overall encoding.
+ pub inferred_encoding_source: EncodingSource,
+
+ /// The encodings that are valid for this file, based on looking at all the
+ /// text data in the file headers. Each array element is a group of
+ /// encodings that yield the same text data. If there is only one element,
+ /// then all valid encodings yield the same text data.
+ pub valid_encodings: Vec<Vec<&'static Encoding>>,
+
+ /// Individual strings in the file headers, together with their
+ /// interpretations for each group of valid encodings. Only strings that
+ /// don't have the same interpretation for every valid encoding are
+ /// included.
+ ///
+ /// If this is empty, then either:
+ ///
+ /// - `valid_encodings` is also empty. In this case, there are no valid
+ ///   encodings, so there are no strings in the valid encodings.
+ ///
+ /// - `valid_encodings` has one element (one group of valid encodings). In
+ ///   this case, every valid encoding interprets every string the same way.
+ pub strings: Vec<EncodingReportString>,
+}
+
+impl EncodingReport {
+    /// Builds the pivot table that summarizes what the file says about its own
+    /// encoding.
+    ///
+    /// The table has one row each for the encoding name, the code page number,
+    /// and the overall result, and two columns: the raw value found and how it
+    /// was interpreted. A missing name or code page is shown as `(none)` with
+    /// no interpretation cell. The "Overall" row shows the source of the
+    /// inferred encoding alongside the inferred encoding itself.
+    fn metadata_pivot_table(&self) -> PivotTable {
+        /// Renders an interpretation as a table cell: the encoding's name on
+        /// success, or the error's message on failure.
+        fn describe(outcome: &Result<&'static Encoding, EncodingError>) -> Value {
+            match outcome {
+                Err(problem) => problem.to_string().into(),
+                Ok(found) => found.name().into(),
+            }
+        }
+
+        let categories = Group::new("Category")
+            .with("Name")
+            .with("Codepage")
+            .with("Overall");
+        let distinctions = Group::new("Distinctions")
+            .with("Value")
+            .with("Interpretation");
+        let mut summary = PivotTable::new([
+            (Axis3::X, Dimension::new(distinctions)),
+            (Axis3::Y, Dimension::new(categories)),
+        ])
+        .with_title("Character encoding information found in system file and its interpretation")
+        .with_caption("A system file may identify its character encoding by name or by codepage number or both. This table states which were found, how each was interpreted, and the overall interpretation.");
+
+        // Row 0: the encoding name record, if the file has one.
+        match &self.name {
+            Some((label, outcome)) => {
+                summary.insert(&[0, 0], label.as_str());
+                summary.insert(&[1, 0], describe(outcome));
+            }
+            None => {
+                summary.insert(&[0, 0], "(none)");
+            }
+        }
+
+        // Row 1: the code page number, if the file has one.
+        match &self.codepage {
+            Some((number, outcome)) => {
+                summary.insert(&[0, 1], Value::new_integer(Some(f64::from(*number))));
+                summary.insert(&[1, 1], describe(outcome));
+            }
+            None => {
+                summary.insert(&[0, 1], "(none)");
+            }
+        }
+
+        // Row 2: the overall result and where it came from.
+        summary.insert(&[0, 2], self.inferred_encoding_source.as_str());
+        summary.insert(&[1, 2], describe(&self.inferred_encoding));
+
+        summary
+    }
+}
+
+impl From<&EncodingReport> for Details {
+    /// Converts an encoding report into displayable output items.
+    ///
+    /// The output always starts with the encoding metadata table. After that:
+    ///
+    /// - If there are no valid encodings, a log text item saying so follows.
+    ///
+    /// - Otherwise, a table listing each group of valid encodings follows.
+    ///   If any strings are interpreted differently by different groups, a
+    ///   further table shows each such string's interpretation per group.
+    fn from(value: &EncodingReport) -> Self {
+        let mut output: Vec<Item> = vec![value.metadata_pivot_table().into()];
+
+        if value.valid_encodings.is_empty() {
+            output.push(Text::new_log("No valid encodings were found.").into());
+            return output.into_iter().collect();
+        }
+
+        // One row per group of valid encodings, numbered from 1.
+        let numbers = Group::new("#")
+            .with_multiple((1..=value.valid_encodings.len()).map(|i| i.to_string()));
+        output.push(
+            PivotTable::new([(Axis3::Y, Dimension::new(numbers))])
+                .with_data(value.valid_encodings.iter().enumerate().map(
+                    |(index, encodings)| {
+                        (
+                            [index],
+                            Value::new_user_text(encodings.iter().map(|e| e.name()).join(", ")),
+                        )
+                    },
+                ))
+                .into(),
+        );
+
+        if !value.strings.is_empty() {
+            let purposes = Group::with_capacity("Purpose", value.strings.len())
+                .with_label_shown()
+                .with_multiple(value.strings.iter().map(|rs| &rs.name));
+            let number = Group::new("Text")
+                .with_label_shown()
+                .with_multiple((1..=value.valid_encodings.len()).map(|i| i.to_string()));
+
+            // One cell per (encoding group, string) pair.  The encoding index
+            // matches the numbering used in the table of valid encodings.
+            let data = value
+                .strings
+                .iter()
+                .enumerate()
+                .flat_map(|(purpose, rs)| {
+                    rs.interpretations
+                        .iter()
+                        .enumerate()
+                        .map(move |(encoding, s)| {
+                            (
+                                [0, encoding, purpose],
+                                Value::new_user_text(rs.ellipsize(s.as_str())),
+                            )
+                        })
+                })
+                .collect::<Vec<_>>();
+
+            output.push(
+                PivotTable::new([
+                    (Axis3::X, Dimension::new(Group::new("Text").with("Text"))),
+                    (Axis3::Y, Dimension::new(number)),
+                    (Axis3::Y, Dimension::new(purposes)),
+                ])
+                .with_title("Alternate Encoded Text Strings")
+                .with_caption("Text strings in the file dictionary that the previously listed encodings interpret differently, along with the interpretations.")
+                .with_data(data)
+                .into(),
+            );
+        }
+
+        output.into_iter().collect()
+    }
+}
+
+/// All of the (valid) interpretations of a single string found in a system file.
+#[derive(Serialize)]
+pub struct EncodingReportString {
+ /// Name for the string's purpose, something like "variable name 1".
+ name: String,
+
+ /// The prefix shared by all of the string's interpretations, or empty if
+ /// there is none. Only whole words are considered to be common.
+ common_prefix: String,
+
+ /// The interpretations with the common prefix and suffix removed, one per group
+ /// of valid encodings, in the order of [`EncodingReport::valid_encodings`].
+ interpretations: Vec<String>,
+
+ /// The suffix shared by all of the string's interpretations, or empty if
+ /// there is none. Only whole words are considered to be common.
+ common_suffix: String,
+}
+
+impl EncodingReportString {
+    /// Marks where the common prefix and suffix were cut away from `s`.
+    ///
+    /// Returns `s` with `...` prepended if this string has a nonempty common
+    /// prefix and `...` appended if it has a nonempty common suffix. When
+    /// neither applies, `s` is returned borrowed, without allocating.
+    fn ellipsize<'a>(&self, s: &'a str) -> Cow<'a, str> {
+        let has_prefix = !self.common_prefix.is_empty();
+        let has_suffix = !self.common_suffix.is_empty();
+        if !has_prefix && !has_suffix {
+            return Cow::Borrowed(s);
+        }
+
+        // Room for `s` plus up to two three-character ellipses.
+        let mut marked = String::with_capacity(s.len() + 6);
+        if has_prefix {
+            marked.push_str("...");
+        }
+        marked.push_str(s);
+        if has_suffix {
+            marked.push_str("...");
+        }
+        Cow::Owned(marked)
+    }
}
impl EncodingReport {
- pub fn new(record_strings: &[RecordString]) -> Option<Self> {
+ pub fn new(header: &FileHeader<ByteString>, records: &[Record]) -> Self {
+ let (encoding, codepage) = get_encoding_info(&records);
+ let label =
+ encoding.map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None)));
+ let codepage = codepage.map(|codepage| (codepage, get_encoding(None, Some(codepage))));
+ let (inferred_encoding_source, inferred_encoding) = match label
+ .as_ref()
+ .map(|(_string, result)| (EncodingSource::Name, result.clone()))
+ .or(codepage
+ .as_ref()
+ .map(|(_codepage, result)| (EncodingSource::Codepage, result.clone())))
+ {
+ Some((source, Ok(encoding))) => (source, Ok(encoding)),
+ Some((source, Err(EncodingError::Ebcdic))) => (source, Err(EncodingError::Ebcdic)),
+ _ => (EncodingSource::Default, Ok(default_encoding())),
+ };
+
+ let mut record_strings = header.get_strings();
+ for record in records {
+ record_strings.append(&mut record.get_strings());
+ }
let mut encodings: IndexMap<Vec<String>, Vec<&'static Encoding>> = IndexMap::new();
for encoding in ENCODINGS {
fn recode_as(
}
Some(output)
}
- if let Some(strings) = recode_as(record_strings, encoding) {
+ if let Some(strings) = recode_as(&record_strings, encoding) {
encodings.entry(strings).or_default().push(encoding);
}
}
- if encodings.is_empty() {
- return None;
- }
-
- let numbers = Group::new("#").with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
- let valid_encodings = PivotTable::new([(Axis3::Y, Dimension::new(numbers))]).with_data(
- encodings
- .values()
- .map(|encodings| {
- Value::new_user_text(encodings.iter().map(|e| e.name()).join(", "))
- })
- .enumerate()
- .map(|(index, datum)| ([index], datum)),
- );
-
- let mut purposes = Group::new("Purpose").with_label_shown();
- let mut data = Vec::new();
- for (index, rs) in record_strings.iter().enumerate() {
- // Skip strings that decode the same way from every encoding.
- if encodings.keys().map(|strings| &strings[index]).all_equal() {
- continue;
- }
- /// Returns an iterator for the decoded strings for the given
- /// `index`.
- fn decoded_index<'a>(
- encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
- index: usize,
- ) -> impl Iterator<Item = &'a str> {
- encodings.keys().map(move |strings| strings[index].as_str())
- }
-
- let common_prefix = decoded_index(&encodings, index)
- .reduce(common_prefix)
- .unwrap()
- .trim_end_matches(|c| c != ' ')
- .len();
- let common_suffix = decoded_index(&encodings, index)
- .reduce(common_suffix)
- .unwrap()
- .trim_start_matches(|c| c != ' ')
- .len();
-
- let purpose = purposes.push(&rs.title);
-
- for (j, s) in decoded_index(&encodings, index).enumerate() {
- let s = &s[common_prefix..s.len() - common_suffix];
- let mut entry = String::with_capacity(s.len() + 6);
- if common_prefix > 0 {
- entry.push_str("...");
+ let mut strings = Vec::with_capacity(record_strings.len());
+ if !encodings.is_empty() {
+ for (index, rs) in record_strings.iter().enumerate() {
+ // Skip strings that decode the same way from every encoding.
+ if encodings.keys().map(|strings| &strings[index]).all_equal() {
+ continue;
}
- entry.push_str(s);
- if common_suffix > 0 {
- entry.push_str("...");
+
+ /// Returns an iterator for the decoded strings for the given
+ /// `index`.
+ fn decoded_index<'a>(
+ encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
+ index: usize,
+ ) -> impl Iterator<Item = &'a str> {
+ encodings.keys().map(move |strings| strings[index].as_str())
}
- data.push(([0, j, purpose], Value::new_user_text(entry)));
+
+ let common_prefix: String = decoded_index(&encodings, index)
+ .reduce(common_prefix)
+ .unwrap()
+ .trim_end_matches(|c| c != ' ')
+ .into();
+ let common_suffix: String = decoded_index(&encodings, index)
+ .reduce(common_suffix)
+ .unwrap()
+ .trim_start_matches(|c| c != ' ')
+ .into();
+
+ let interpretations = decoded_index(&encodings, index)
+ .map(|s| s[common_prefix.len()..s.len() - common_suffix.len()].into())
+ .collect();
+
+ strings.push(EncodingReportString {
+ name: rs.title.clone(),
+ common_prefix,
+ interpretations,
+ common_suffix,
+ });
}
}
- let number = Group::new("Text")
- .with_label_shown()
- .with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
- let interpretations = if !data.is_empty() {
- Some(
- PivotTable::new([
- (Axis3::X, Dimension::new(Group::new("Text").with("Text"))),
- (Axis3::Y, Dimension::new(number)),
- (Axis3::Y, Dimension::new(purposes)),
- ])
- .with_title("Alternate Encoded Text Strings")
- .with_caption("Text strings in the file dictionary that the previously listed encodings interpret differently, along with the interpretations.")
- .with_data(data),
- )
- } else {
- None
- };
- Some(Self {
- valid_encodings,
- interpretations,
- })
+ EncodingReport {
+ valid_encodings: encodings.values().cloned().collect(),
+ strings,
+ name: label,
+ codepage,
+ inferred_encoding,
+ inferred_encoding_source,
+ }
}
}