From: Ben Pfaff
Date: Sun, 24 Aug 2025 00:36:59 +0000 (-0700)
Subject: encodings display pretty well now
X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=532c23b91826780677171ca23d73b774c396b213;p=pspp

encodings display pretty well now
---

diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs
index f388f89c6e..606733daf5 100644
--- a/rust/pspp/src/main.rs
+++ b/rust/pspp/src/main.rs
@@ -26,8 +26,7 @@ use pspp::{
     sys::{
         self,
         raw::{
-            get_encoding_info, infer_encoding, records::Compression, Decoder, EncodingReport,
-            Magic, Reader, Record,
+            infer_encoding, records::Compression, Decoder, EncodingReport, Magic, Reader, Record,
         },
         ReadOptions, Records,
     },
@@ -278,6 +277,37 @@ enum Output {
 }
 
 impl Output {
+    /*
+    fn show_metadata(&self, metadata: MetadataEntry) -> Result<()> {
+        match self {
+            Self::Driver { driver, .. } => {
+                driver
+                    .borrow_mut()
+                    .write(&Arc::new(Item::new(metadata.into_pivot_table())));
+                Ok(())
+            }
+            Self::Json { .. } => self.show_json(&metadata),
+            Self::Discard => Ok(()),
+        }
+    }*/
+
+    fn show<T>(&self, value: &T) -> Result<()>
+    where
+        T: Serialize,
+        for<'a> &'a T: Into<Details>,
+    {
+        match self {
+            Self::Driver { driver, .. } => {
+                driver
+                    .borrow_mut()
+                    .write(&Arc::new(Item::new(value.into())));
+                Ok(())
+            }
+            Self::Json { .. } => self.show_json(value),
+            Self::Discard => Ok(()),
+        }
+    }
+
     fn show_json<T>(&self, value: &T) -> Result<()>
     where
         T: Serialize,
@@ -336,8 +366,6 @@ impl Show {
                 "ndjson" => ShowFormat::Ndjson,
                 _ => ShowFormat::Output,
             }
-        } else if self.mode == Mode::Encodings {
-            ShowFormat::Output
         } else {
             ShowFormat::Json
         };
@@ -474,29 +502,8 @@ impl Show {
             }
             Mode::Encodings => {
                 let records: Vec<Record> = reader.records().collect::<Result<Vec<_>, _>>()?;
-                let (encoding, character_code) = get_encoding_info(&records);
-
-                let mut record_strings = reader.header().get_strings();
-                for record in &records {
-                    record_strings.append(&mut record.get_strings());
-                }
-                let Some(encoding_report) = EncodingReport::new(&record_strings) else {
-                    output.warn(&"No valid encodings found.");
-                    return Ok(());
-                };
-                match &output {
-                    Output::Driver { driver, .. } => {
-                        driver
-                            .borrow_mut()
-                            .write(&Arc::new(Item::new(encoding_report.valid_encodings)));
-                        if let Some(interpretations) = encoding_report.interpretations {
-                            driver
-                                .borrow_mut()
-                                .write(&Arc::new(Item::new(interpretations)));
-                        }
-                    }
-                    _ => todo!(),
-                }
+                let encoding_report = EncodingReport::new(reader.header(), &records);
+                output.show(&encoding_report)?;
             }
         }
 
diff --git a/rust/pspp/src/output/mod.rs b/rust/pspp/src/output/mod.rs
index 0779129f8e..28ab4efdd6 100644
--- a/rust/pspp/src/output/mod.rs
+++ b/rust/pspp/src/output/mod.rs
@@ -148,6 +148,18 @@ impl Details {
     }
 }
 
+impl<A> FromIterator<A> for Details
+where
+    A: Into<Details>,
+{
+    fn from_iter<T>(iter: T) -> Self
+    where
+        T: IntoIterator<Item = A>,
+    {
+        Self::Group(iter.into_iter().map(|value| value.into()).collect())
+    }
+}
+
 impl From<Diagnostic> for Details {
     fn from(value: Diagnostic) -> Self {
         Self::Message(Box::new(value))
@@ -192,10 +204,10 @@ pub struct Text {
 }
 
 impl Text {
-    pub fn new_log(s: impl Into<String>) -> Self {
+    pub fn new_log(value: impl Into<Value>) -> Self {
         Self {
             type_: TextType::Log,
-            content: Value::new_user_text(s),
+            content: value.into(),
         }
     }
 }
diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs
index bd6c4242fd..387e0f3d14 100644
--- a/rust/pspp/src/output/pivot/mod.rs
+++ b/rust/pspp/src/output/pivot/mod.rs
@@ -1986,20 +1986,20 @@ impl Serialize for Value {
 
 /// Wrapper for [Value] that uses [Value::serialize_bare] for serialization.
 #[derive(Serialize)]
-struct BareValue<'a>(#[serde(serialize_with = "Value::serialize_bare")] &'a Value);
+struct BareValue<'a>(#[serde(serialize_with = "Value::serialize_bare")] pub &'a Value);
 
 impl Value {
-    fn serialize_bare<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    pub fn serialize_bare<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
         S: Serializer,
     {
         match &self.inner {
-            ValueInner::Number(number_value) => number_value.value.serialize(serializer),
+            ValueInner::Number(number_value) => number_value.serialize_bare(serializer),
             ValueInner::String(string_value) => string_value.s.serialize(serializer),
             ValueInner::Variable(variable_value) => variable_value.var_name.serialize(serializer),
             ValueInner::Text(text_value) => text_value.localized.serialize(serializer),
             ValueInner::Template(template_value) => template_value.localized.serialize(serializer),
-            ValueInner::Empty => ().serialize(serializer),
+            ValueInner::Empty => serializer.serialize_none(),
         }
     }
 
@@ -2499,6 +2499,28 @@ impl Serialize for NumberValue {
     }
 }
 
+impl NumberValue {
+    pub fn serialize_bare<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        if let Some(number) = self.value
+            && number.trunc() == number
+            && number >= -(1i64 << 53) as f64
+            && number <= (1i64 << 53) as f64
+        {
+            (number as u64).serialize(serializer)
+        } else {
+            self.value.serialize(serializer)
+        }
+    }
+}
+
+#[derive(Serialize)]
+pub struct BareNumberValue<'a>(
+    #[serde(serialize_with = "NumberValue::serialize_bare")] pub &'a NumberValue,
+);
+
 #[derive(Clone, Debug, Serialize)]
 pub struct StringValue {
     /// The string value.
@@ -2686,15 +2708,16 @@ pub enum MetadataValue {
 }
 
 impl MetadataEntry {
-    fn into_pivot_table(self) -> PivotTable {
+    pub fn into_pivot_table(self) -> PivotTable {
         let mut data = Vec::new();
         let group = match self.visit(&mut data) {
             Category::Group(group) => group,
-            Category::Leaf(leaf) => Group::new("Metadata").with(leaf),
+            Category::Leaf(leaf) => Group::new("Metadata").with(leaf).with_label_shown(),
         };
         PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data(
             data.into_iter()
                 .enumerate()
+                .filter(|(_row, value)| !value.is_empty())
                 .map(|(row, value)| ([row], value)),
         )
     }
diff --git a/rust/pspp/src/output/text.rs b/rust/pspp/src/output/text.rs
index 61b49a357c..1a5aad6d8e 100644
--- a/rust/pspp/src/output/text.rs
+++ b/rust/pspp/src/output/text.rs
@@ -60,7 +60,7 @@ impl Boxes {
 #[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct TextConfig {
     /// Output file name.
-    file: PathBuf,
+    file: Option<PathBuf>,
 
     /// Renderer config.
     #[serde(flatten)]
@@ -365,7 +365,10 @@ pub struct TextDriver {
 impl TextDriver {
     pub fn new(config: &TextConfig) -> std::io::Result<Self> {
         Ok(Self {
-            file: BufWriter::new(File::create(&config.file)?),
+            file: BufWriter::new(match &config.file {
+                Some(file) => File::create(&file)?,
+                None => File::options().write(true).open("/dev/stdout")?,
+            }),
             renderer: TextRenderer::new(&config.options),
         })
     }
diff --git a/rust/pspp/src/sys/encoding.rs b/rust/pspp/src/sys/encoding.rs
index 29a4f9e45f..4e7468829b 100644
--- a/rust/pspp/src/sys/encoding.rs
+++ b/rust/pspp/src/sys/encoding.rs
@@ -22,6 +22,7 @@ use std::sync::LazyLock;
 
 use crate::locale_charset::locale_charset;
 use encoding_rs::{Encoding, UTF_8};
+use serde::Serialize;
 use thiserror::Error as ThisError;
 
 include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
@@ -42,7 +43,8 @@ pub fn codepage_from_encoding(encoding: &'static Encoding) -> u32 {
 }
 
 /// An error or warning related to encodings.
-#[derive(Clone, ThisError, Debug, PartialEq, Eq)]
+#[derive(Clone, ThisError, Debug, PartialEq, Eq, Serialize)]
+#[serde(rename_all = "snake_case")]
 pub enum Error {
     /// Warning that the system file doesn't indicate its own encoding.
     #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
@@ -88,29 +90,29 @@ pub fn get_encoding(
     encoding: Option<&str>,
     character_code: Option<i32>,
 ) -> Result<&'static Encoding, Error> {
-    let label = if let Some(encoding) = encoding {
-        encoding
-    } else if let Some(codepage) = character_code {
-        match codepage {
-            1 => return Err(Error::Ebcdic),
-            2 | 3 => {
-                // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
-                // respectively. However, many files have character code 2 but
-                // data which are clearly not ASCII. Therefore, ignore these
-                // values.
-                return Err(Error::NoEncoding);
-            }
-            4 => "MS_KANJI",
-            _ => CODEPAGE_NUMBER_TO_NAME
+    fn inner(label: &str) -> Result<&'static Encoding, Error> {
+        Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
+    }
+
+    match (encoding, character_code) {
+        (Some(encoding), _) => inner(encoding),
+        (None, Some(1)) => Err(Error::Ebcdic),
+        (None, Some(2 | 3)) => {
+            // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
+            // respectively. However, many files have character code 2 but
+            // data which are clearly not ASCII. Therefore, ignore these
+            // values.
+            Err(Error::NoEncoding)
+        }
+        (None, Some(4)) => inner("MS_KANJI"),
+        (None, Some(codepage)) => inner(
+            CODEPAGE_NUMBER_TO_NAME
                 .get(&codepage)
                 .copied()
                 .ok_or(Error::UnknownCodepage(codepage))?,
-        }
-    } else {
-        return Err(Error::NoEncoding);
-    };
-
-    Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
+        ),
+        (None, None) => Err(Error::NoEncoding),
+    }
 }
 
 #[cfg(test)]
diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs
index 75218b794f..747c34761c 100644
--- a/rust/pspp/src/sys/raw.rs
+++ b/rust/pspp/src/sys/raw.rs
@@ -23,7 +23,10 @@ use crate::{
     data::{ByteStr, ByteString, Datum, RawCase, RawString},
     endian::{FromBytes, ToBytes},
     identifier::{Error as IdError, Identifier},
-    output::pivot::{Axis3, Dimension, Group, PivotTable, Value},
+    output::{
+        pivot::{Axis3, Dimension, Group, PivotTable, Value},
+        Details, Item, Text,
+    },
     sys::{
         encoding::{default_encoding, get_encoding, Error as EncodingError},
         raw::records::{
@@ -1873,13 +1876,225 @@ static ENCODINGS: [&Encoding; 32] = [
     EUC_KR,
 ];
 
+#[derive(Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum EncodingSource {
+    Name,
+    Codepage,
+    Default,
+}
+
+impl EncodingSource {
+    fn as_str(&self) -> &'static str {
+        match self {
+            EncodingSource::Name => "name",
+            EncodingSource::Codepage => "codepage",
+            EncodingSource::Default => "default",
+        }
+    }
+}
+
+#[derive(Serialize)]
 pub struct EncodingReport {
-    pub valid_encodings: PivotTable,
-    pub interpretations: Option<PivotTable>,
+    /// If the file includes a record that names its encoding, then this is the
+    /// name and how PSPP interprets that as an encoding.
+    pub name: Option<(String, Result<&'static Encoding, EncodingError>)>,
+
+    /// If the file includes a record that identifies its encoding as a code
+    /// page number, then this is the number and how PSPP interprets that as an
+    /// encoding.
+    pub codepage: Option<(i32, Result<&'static Encoding, EncodingError>)>,
+
+    /// The overall encoding chosen.
+    pub inferred_encoding: Result<&'static Encoding, EncodingError>,
+
+    /// Why the overall encoding was chosen.
+    pub inferred_encoding_source: EncodingSource,
+
+    /// The encodings that are valid for this file, based on looking at all the
+    /// text data in the file headers. Each array element is a group of
+    /// encodings that yield the same text data. If there is only one element,
+    /// then all valid encodings yield the same text data.
+    pub valid_encodings: Vec<Vec<&'static Encoding>>,
+
+    /// Individual strings in the file headers, together with their
+    /// interpretations for each group of valid encodings. Only strings that
+    /// don't have the same interpretation for every valid encoding are
+    /// included.
+    ///
+    /// If this is empty, then either:
+    ///
+    /// - `valid_encodings` is also empty. In this case, there are no valid
+    ///   encodings, so there are no interpretations to report.
+    ///
+    /// - `valid_encodings` has one element (one group of valid encodings). In
+    ///   this case, every valid encoding interprets every string the same way.
+    pub strings: Vec<EncodingReportString>,
+}
+
+impl EncodingReport {
+    fn metadata_pivot_table(&self) -> PivotTable {
+        fn result_to_value(result: &Result<&'static Encoding, EncodingError>) -> Value {
+            match result {
+                Ok(encoding) => encoding.name().into(),
+                Err(error) => error.to_string().into(),
+            }
+        }
+
+        let cols = Group::new("Distinctions")
+            .with("Value")
+            .with("Interpretation");
+        let rows = Group::new("Category")
+            .with("Name")
+            .with("Codepage")
+            .with("Overall");
+        let mut table = PivotTable::new([
+            (Axis3::X, Dimension::new(cols)),
+            (Axis3::Y, Dimension::new(rows)),
+        ])
+        .with_title("Character encoding information found in system file and its interpretation")
+        .with_caption("A system file may identify its character encoding by name or by codepage number or both. This table states which were found, how each was interpreted, and the overall interpretation.");
+        if let Some((label, result)) = &self.name {
+            table.insert(&[0, 0], label.as_str());
+            table.insert(&[1, 0], result_to_value(result));
+        } else {
+            table.insert(&[0, 0], "(none)");
+        }
+        if let Some((codepage, result)) = &self.codepage {
+            table.insert(&[0, 1], Value::new_integer(Some((*codepage) as f64)));
+            table.insert(&[1, 1], result_to_value(result));
+        } else {
+            table.insert(&[0, 1], "(none)");
+        }
+        table.insert(&[0, 2], self.inferred_encoding_source.as_str());
+        table.insert(&[1, 2], result_to_value(&self.inferred_encoding));
+        table
+    }
+}
+
+impl From<&EncodingReport> for Details {
+    fn from(value: &EncodingReport) -> Self {
+        let mut output: Vec<Details> = vec![value.metadata_pivot_table().into()];
+
+        if !value.valid_encodings.is_empty() {
+            let numbers = Group::new("#")
+                .with_multiple((1..=value.valid_encodings.len()).map(|i| format!("{i}")));
+            output.push(
+                PivotTable::new([(Axis3::Y, Dimension::new(numbers))])
+                    .with_data(
+                        value
+                            .valid_encodings
+                            .iter()
+                            .map(|encodings| {
+                                Value::new_user_text(encodings.iter().map(|e| e.name()).join(", "))
+                            })
+                            .enumerate()
+                            .map(|(index, datum)| ([index], datum)),
+                    )
+                    .into(),
+            );
+
+            if !value.strings.is_empty() {
+                let purposes = Group::with_capacity("Purpose", value.strings.len())
+                    .with_label_shown()
+                    .with_multiple(value.strings.iter().map(|rs| &rs.name));
+                let number = Group::new("Text")
+                    .with_label_shown()
+                    .with_multiple((1..=value.valid_encodings.len()).map(|i| format!("{i}")));
+                output.push(
+                    PivotTable::new([
+                        (Axis3::X, Dimension::new(Group::new("Text").with("Text"))),
+                        (Axis3::Y, Dimension::new(number)),
+                        (Axis3::Y, Dimension::new(purposes)),
+                    ])
+                    .with_title("Alternate Encoded Text Strings")
+                    .with_caption("Text strings in the file dictionary that the previously listed encodings interpret differently, along with the interpretations.")
+                    .with_data(value
+                        .strings
+                        .iter()
+                        .enumerate()
+                        .map(|(purpose, rs)| {
+                            rs.interpretations
+                                .iter()
+                                .enumerate()
+                                .map(move |(encoding, s)| {
+                                    (
+                                        [0, encoding, purpose],
+                                        Value::new_user_text(rs.ellipsize(s.as_str())),
+                                    )
+                                })
+                        })
+                        .flatten()
+                        .collect::<Vec<_>>()).into(),
+                );
+            }
+        } else {
+            output.push(Text::new_log("No valid encodings were found.").into());
+        };
+
+        output.into_iter().collect()
+    }
+}
+
+/// All of the (valid) interpretations of a given string in a system file.
+#[derive(Serialize)]
+pub struct EncodingReportString {
+    /// Name for the string, something like "variable name 1".
+    name: String,
+
+    /// If the string's interpretations all start with a common prefix, this is
+    /// it. Only whole words are considered to be common.
+    common_prefix: String,
+
+    /// All of the interpretations of the string, one per group of valid
+    /// encodings, in the order of [EncodingReport::valid_encodings].
+    interpretations: Vec<String>,
+
+    /// If the string's interpretations all end with a common suffix, this is
+    /// it. Only whole words are considered to be common.
+    common_suffix: String,
+}
+
+impl EncodingReportString {
+    fn ellipsize<'a>(&self, s: &'a str) -> Cow<'a, str> {
+        if self.common_prefix.is_empty() && self.common_suffix.is_empty() {
+            Cow::from(s)
+        } else {
+            let mut result = String::with_capacity(s.len() + 6);
+            if !self.common_prefix.is_empty() {
+                result.push_str("...");
+            }
+            result.push_str(s);
+            if !self.common_suffix.is_empty() {
+                result.push_str("...");
+            }
+            Cow::from(result)
+        }
+    }
 }
 
 impl EncodingReport {
-    pub fn new(record_strings: &[RecordString]) -> Option<Self> {
+    pub fn new(header: &FileHeader, records: &[Record]) -> Self {
+        let (encoding, codepage) = get_encoding_info(&records);
+        let label =
+            encoding.map(|encoding| (String::from(encoding), get_encoding(Some(encoding), None)));
+        let codepage = codepage.map(|codepage| (codepage, get_encoding(None, Some(codepage))));
+        let (inferred_encoding_source, inferred_encoding) = match label
+            .as_ref()
+            .map(|(_string, result)| (EncodingSource::Name, result.clone()))
+            .or(codepage
+                .as_ref()
+                .map(|(_codepage, result)| (EncodingSource::Codepage, result.clone())))
+        {
+            Some((source, Ok(encoding))) => (source, Ok(encoding)),
+            Some((source, Err(EncodingError::Ebcdic))) => (source, Err(EncodingError::Ebcdic)),
+            _ => (EncodingSource::Default, Ok(default_encoding())),
+        };
+
+        let mut record_strings = header.get_strings();
+        for record in records {
+            record_strings.append(&mut record.get_strings());
+        }
         let mut encodings: IndexMap<Vec<String>, Vec<&'static Encoding>> = IndexMap::new();
         for encoding in ENCODINGS {
             fn recode_as(
@@ -1899,89 +2114,59 @@ impl EncodingReport {
                 }
                 Some(output)
             }
-            if let Some(strings) = recode_as(record_strings, encoding) {
+            if let Some(strings) = recode_as(&record_strings, encoding) {
                 encodings.entry(strings).or_default().push(encoding);
             }
         }
 
-        if encodings.is_empty() {
-            return None;
-        }
-
-        let numbers = Group::new("#").with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
-        let valid_encodings = PivotTable::new([(Axis3::Y, Dimension::new(numbers))]).with_data(
-            encodings
-                .values()
-                .map(|encodings| {
-                    Value::new_user_text(encodings.iter().map(|e| e.name()).join(", "))
-                })
-                .enumerate()
-                .map(|(index, datum)| ([index], datum)),
-        );
-
-        let mut purposes = Group::new("Purpose").with_label_shown();
-        let mut data = Vec::new();
-        for (index, rs) in record_strings.iter().enumerate() {
-            // Skip strings that decode the same way from every encoding.
-            if encodings.keys().map(|strings| &strings[index]).all_equal() {
-                continue;
-            }
-            /// Returns an iterator for the decoded strings for the given
-            /// `index`.
-            fn decoded_index<'a>(
-                encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
-                index: usize,
-            ) -> impl Iterator<Item = &'a str> {
-                encodings.keys().map(move |strings| strings[index].as_str())
-            }
-
-            let common_prefix = decoded_index(&encodings, index)
-                .reduce(common_prefix)
-                .unwrap()
-                .trim_end_matches(|c| c != ' ')
-                .len();
-            let common_suffix = decoded_index(&encodings, index)
-                .reduce(common_suffix)
-                .unwrap()
-                .trim_start_matches(|c| c != ' ')
-                .len();
-
-            let purpose = purposes.push(&rs.title);
-
-            for (j, s) in decoded_index(&encodings, index).enumerate() {
-                let s = &s[common_prefix..s.len() - common_suffix];
-                let mut entry = String::with_capacity(s.len() + 6);
-                if common_prefix > 0 {
-                    entry.push_str("...");
+        let mut strings = Vec::with_capacity(record_strings.len());
+        if !encodings.is_empty() {
+            for (index, rs) in record_strings.iter().enumerate() {
+                // Skip strings that decode the same way from every encoding.
+                if encodings.keys().map(|strings| &strings[index]).all_equal() {
+                    continue;
                 }
-                entry.push_str(s);
-                if common_suffix > 0 {
-                    entry.push_str("...");
+
+                /// Returns an iterator for the decoded strings for the given
+                /// `index`.
+                fn decoded_index<'a>(
+                    encodings: &'a IndexMap<Vec<String>, Vec<&'static Encoding>>,
+                    index: usize,
+                ) -> impl Iterator<Item = &'a str> {
+                    encodings.keys().map(move |strings| strings[index].as_str())
                 }
-                data.push(([0, j, purpose], Value::new_user_text(entry)));
+
+                let common_prefix: String = decoded_index(&encodings, index)
+                    .reduce(common_prefix)
+                    .unwrap()
+                    .trim_end_matches(|c| c != ' ')
+                    .into();
+                let common_suffix: String = decoded_index(&encodings, index)
+                    .reduce(common_suffix)
+                    .unwrap()
+                    .trim_start_matches(|c| c != ' ')
+                    .into();
+
+                let interpretations = decoded_index(&encodings, index)
+                    .map(|s| s[common_prefix.len()..s.len() - common_suffix.len()].into())
+                    .collect();
+
+                strings.push(EncodingReportString {
+                    name: rs.title.clone(),
+                    common_prefix,
+                    interpretations,
+                    common_suffix,
+                });
             }
         }
 
-        let number = Group::new("Text")
-            .with_label_shown()
-            .with_multiple((1..=encodings.len()).map(|i| format!("{i}")));
-        let interpretations = if !data.is_empty() {
-            Some(
-                PivotTable::new([
-                    (Axis3::X, Dimension::new(Group::new("Text").with("Text"))),
-                    (Axis3::Y, Dimension::new(number)),
-                    (Axis3::Y, Dimension::new(purposes)),
-                ])
-                .with_title("Alternate Encoded Text Strings")
-                .with_caption("Text strings in the file dictionary that the previously listed encodings interpret differently, along with the interpretations.")
-                .with_data(data),
-            )
-        } else {
-            None
-        };
-
-        Some(Self {
-            valid_encodings,
-            interpretations,
-        })
+        EncodingReport {
+            valid_encodings: encodings.values().cloned().collect(),
+            strings,
+            name: label,
+            codepage,
+            inferred_encoding,
+            inferred_encoding_source,
+        }
     }
 }
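
A usage sketch, not part of the patch above; it only mirrors what the reworked Mode::Encodings arm in main.rs does. The pspp::sys::raw paths for EncodingReport and Record match the imports shown in the diff; the FileHeader re-export, the report_encodings helper name, and the use of serde_json here are assumptions.

    use pspp::output::Details;
    use pspp::sys::raw::{EncodingReport, FileHeader, Record};

    // Hypothetical helper; the CLI itself routes through Output::show().
    fn report_encodings(header: &FileHeader, records: &[Record]) -> serde_json::Result<()> {
        // Build the report from the file header plus all parsed records, per the
        // new EncodingReport::new(header, records) signature.
        let report = EncodingReport::new(header, records);

        // EncodingReport now derives Serialize, so the JSON path can emit it as is.
        println!("{}", serde_json::to_string_pretty(&report)?);

        // The new From<&EncodingReport> for Details impl builds the pivot-table
        // output: the metadata table, the valid-encoding groups, and any
        // alternate text strings.
        let _as_details: Details = (&report).into();
        Ok(())
    }

Output::show() picks between these two routes, writing the Details to the configured driver or serializing to JSON, so callers normally never invoke either conversion directly.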