From 70611b4f4a6478c1dc9667f6502a62ce95d3868f Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 4 Aug 2025 19:37:18 -0700 Subject: [PATCH] work on converting a file to unicode --- rust/pspp/src/data.rs | 28 +++++- rust/pspp/src/data/encoded.rs | 64 ++++++++++++- rust/pspp/src/dictionary.rs | 173 +++++++++++++++++++++++++++++++++- rust/pspp/src/format/mod.rs | 43 ++++++++- rust/pspp/src/identifier.rs | 51 +++++++++- rust/pspp/src/sys/test.rs | 2 +- 6 files changed, 352 insertions(+), 9 deletions(-) diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index c86e20a090..fe77681998 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -248,6 +248,18 @@ impl From<&'_ str> for ByteString { } } +impl From> for ByteString { + fn from(value: Cow<'_, str>) -> Self { + value.into_owned().into() + } +} + +impl From> for ByteString { + fn from(value: Cow<'_, [u8]>) -> Self { + value.into_owned().into() + } +} + impl From> for ByteString { fn from(value: Vec) -> Self { Self(value) @@ -315,7 +327,7 @@ impl MutRawString for ByteString { self.0.truncate(new_len); } Ordering::Equal => (), - Ordering::Greater => self.0.extend((self.0.len()..new_len).map(|_| b' ')), + Ordering::Greater => self.0.resize(new_len, b' '), } Ok(()) } @@ -357,6 +369,20 @@ impl Datum> { let s: String = s.into(); Datum::String(ByteString::from(s).with_encoding(UTF_8)) } + + pub fn codepage_to_unicode(&mut self) { + match self { + Datum::Number(_) => (), + Datum::String(string) => string.codepage_to_unicode(), + } + } + + pub fn without_encoding(self) -> Datum { + match self { + Datum::Number(number) => Datum::Number(number), + Datum::String(string) => Datum::String(string.inner), + } + } } impl<'a> Datum>> { diff --git a/rust/pspp/src/data/encoded.rs b/rust/pspp/src/data/encoded.rs index a12eccb9bf..3584d05070 100644 --- a/rust/pspp/src/data/encoded.rs +++ b/rust/pspp/src/data/encoded.rs @@ -8,7 +8,10 @@ use std::{ use encoding_rs::{Encoding, UTF_8}; use serde::Serialize; -use crate::data::{ByteCow, ByteStr, ByteString, MutRawString, Quoted, RawString, ResizeError}; +use crate::{ + data::{ByteCow, ByteStr, ByteString, MutRawString, Quoted, RawString, ResizeError}, + dictionary::VarWidth, +}; pub trait Encoded { fn encoding(&self) -> &'static Encoding; @@ -193,6 +196,27 @@ where } } +impl WithEncoding { + pub fn codepage_to_unicode(&mut self) { + if self.encoding() != UTF_8 { + let new_len = (self.inner.len() * 3).min(VarWidth::MAX_STRING as usize); + if let Cow::Owned(string) = self + .encoding() + .decode_without_bom_handling(self.raw_string_bytes()) + .0 + { + self.inner = ByteString::from(string); + } + + // Use `self.inner.0.resize` (instead of `self.inner.resize()`) + // because this is a forced resize that can trim off non-spaces. + self.inner.0.resize(new_len, b' '); + + self.encoding = UTF_8; + } + } +} + impl Encoded for WithEncoding { fn encoding(&self) -> &'static Encoding { self.encoding @@ -216,3 +240,41 @@ where self.inner.hash(state); } } + +#[cfg(test)] +mod tests { + use std::{char::REPLACEMENT_CHARACTER, iter::repeat_n}; + + use encoding_rs::{Encoding, UTF_8, WINDOWS_1252}; + + use crate::data::{ByteString, EncodedString, RawString}; + + #[test] + fn codepage_to_unicode() { + fn check_unicode(original: &str, encoding: &'static Encoding, expected: &str) { + let original = ByteString::from(encoding.encode(original).0).with_encoding(encoding); + let mut actual = original.clone(); + actual.codepage_to_unicode(); + assert_eq!(actual.as_str().len(), expected.len()); + assert_eq!(actual.as_str(), expected); + } + + check_unicode("abc", UTF_8, "abc"); + check_unicode("abc", WINDOWS_1252, "abc "); + check_unicode("éèäî", WINDOWS_1252, "éèäî "); + check_unicode( + &repeat_n('é', 15000).collect::(), + WINDOWS_1252, + &repeat_n('é', 15000) + .chain(repeat_n(' ', 2767)) + .collect::(), + ); + check_unicode( + &repeat_n('é', 20000).collect::(), + WINDOWS_1252, + &repeat_n('é', 16383) + .chain(std::iter::once(REPLACEMENT_CHARACTER)) + .collect::(), + ); + } +} diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 6f30e29972..4608d2dda3 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -26,7 +26,7 @@ use std::{ str::FromStr, }; -use encoding_rs::Encoding; +use encoding_rs::{Encoding, UTF_8}; use enum_map::{Enum, EnumMap}; use indexmap::IndexSet; use num::integer::div_ceil; @@ -39,7 +39,7 @@ use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ - data::{ByteString, Datum, EncodedString, ResizeError, WithEncoding}, + data::{ByteString, Datum, Encoded, EncodedString, ResizeError, WithEncoding}, format::{DisplayPlain, Format}, identifier::{ByIdentifier, HasIdentifier, Identifier}, output::pivot::{ @@ -218,6 +218,13 @@ impl VarWidth { pub fn display_adjective(&self) -> VarWidthAdjective { VarWidthAdjective(*self) } + + pub fn codepage_to_unicode(&mut self) { + match self { + VarWidth::Numeric => (), + VarWidth::String(width) => *width = width.saturating_mul(3).min(Self::MAX_STRING), + } + } } pub struct Segments { @@ -832,6 +839,52 @@ impl Dictionary { .map(|names| names.into_iter().flatten().collect()) .collect() } + + pub fn codepage_to_unicode(&mut self) { + if self.encoding == UTF_8 { + return; + } + + let mut variables = IndexSet::new(); + let mut index = 0; + for mut variable in self.variables.drain(..) { + variable.codepage_to_unicode(); + while variables.contains(&variable) { + index += 1; + variable.name = Identifier::new(format!("Var{index}")).unwrap(); + } + variables.insert(variable); + } + self.variables = variables; + + let mut vectors = HashSet::new(); + let mut index = 0; + for mut vector in self.vectors.drain() { + vector.codepage_to_unicode(); + while vectors.contains(&vector) { + index += 1; + vector.name = Identifier::new(format!("Vec{index}")).unwrap(); + } + vectors.insert(vector); + } + self.vectors = vectors; + + self.attributes.codepage_to_unicode(); + + let mut mrsets = BTreeSet::new(); + let mut index = 0; + while let Some(mut mrset) = self.mrsets.pop_first() { + mrset.codepage_to_unicode(); + while mrsets.contains(&mrset) { + index += 1; + mrset.name = Identifier::new(format!("Mr{index}")).unwrap(); + } + mrsets.insert(mrset); + } + self.mrsets = mrsets; + + self.encoding = UTF_8; + } } pub struct OutputVariables<'a> { @@ -1294,6 +1347,15 @@ impl Attributes { pub fn has_any(&self, include_at: bool) -> bool { self.iter(include_at).next().is_some() } + + pub fn codepage_to_unicode(&mut self) { + let mut new = BTreeMap::new(); + while let Some((mut name, value)) = self.0.pop_first() { + name.codepage_to_unicode(); + new.insert(name, value); + } + self.0 = new; + } } impl Debug for Attributes { @@ -1466,6 +1528,20 @@ impl Variable { width: self.width, } } + + pub fn codepage_to_unicode(&mut self) { + self.name.codepage_to_unicode(); + self.width.codepage_to_unicode(); + self.missing_values.codepage_to_unicode(); + self.print_format.codepage_to_unicode(); + self.write_format.codepage_to_unicode(); + self.attributes.codepage_to_unicode(); + self.encoding = UTF_8; + + // Anything old enough to not support long names is old enough not to + // support Unicode. + self.short_names.clear(); + } } impl HasIdentifier for Variable { @@ -1488,6 +1564,10 @@ impl DictIndexVector { update_dict_index_vec(&mut self.variables, f); (!self.variables.is_empty()).then_some(self) } + + pub fn codepage_to_unicode(&mut self) { + self.name.codepage_to_unicode(); + } } impl HasIdentifier for DictIndexVector { @@ -1901,6 +1981,10 @@ impl DictIndexMultipleResponseSet { update_dict_index_vec(&mut self.variables, f); (self.variables.len() > 1).then_some(self) } + + pub fn codepage_to_unicode(&mut self) { + self.name.codepage_to_unicode(); + } } impl HasIdentifier for DictIndexMultipleResponseSet { @@ -2009,6 +2093,18 @@ impl ValueLabels { .filter_map(|(mut datum, string)| datum.resize(width).is_ok().then(|| (datum, string))) .collect(); } + + pub fn codepage_to_unicode(&mut self, encoding: &'static Encoding) { + self.0 = self + .0 + .drain() + .map(|(key, value)| { + let mut key = key.with_encoding(encoding); + key.codepage_to_unicode(); + (key.without_encoding(), value) + }) + .collect(); + } } impl Debug for ValueLabels { @@ -2095,7 +2191,7 @@ impl<'a> MissingValuesMut<'a> { } } -#[derive(Clone, Default, Serialize)] +#[derive(Clone, Default, Serialize, PartialEq)] pub struct MissingValues { /// Individual missing values, up to 3 of them. values: Vec>>, @@ -2237,6 +2333,23 @@ impl MissingValues { } inner(self, width).inspect_err(|_| self.clear()) } + + pub fn codepage_to_unicode(&mut self) { + self.values = self + .values + .drain(..) + .map(|value| match value { + Datum::Number(number) => Datum::Number(number), + Datum::String(s) => Datum::String(if s.encoding() != UTF_8 { + let mut new_s = ByteString::from(s.as_str()); + new_s.0.truncate(8); + WithEncoding::new(new_s, UTF_8) + } else { + s + }), + }) + .collect(); + } } #[derive(Copy, Clone, Debug, Serialize, PartialEq)] @@ -2367,7 +2480,8 @@ mod test { use unicase::UniCase; use crate::{ - dictionary::{Dictionary, VarWidth, Variable}, + data::{ByteString, Datum, RawString, WithEncoding}, + dictionary::{Dictionary, MissingValues, VarWidth, Variable}, identifier::Identifier, }; @@ -2505,4 +2619,55 @@ mod test { assert_eq!(expected, dict.short_names()); } } + + #[test] + fn var_width_codepage_to_unicode() { + fn check_unicode(input: VarWidth, expected: VarWidth) { + let mut actual = input; + actual.codepage_to_unicode(); + assert_eq!(actual, expected); + } + + check_unicode(VarWidth::Numeric, VarWidth::Numeric); + check_unicode(VarWidth::String(1), VarWidth::String(3)); + check_unicode(VarWidth::String(2), VarWidth::String(6)); + check_unicode(VarWidth::String(3), VarWidth::String(9)); + check_unicode(VarWidth::String(1000), VarWidth::String(3000)); + check_unicode(VarWidth::String(20000), VarWidth::String(32767)); + check_unicode(VarWidth::String(30000), VarWidth::String(32767)); + } + + #[test] + fn missing_values_codepage_to_unicode() { + fn windows_1252(s: &str) -> WithEncoding { + ByteString::from(WINDOWS_1252.encode(s).0).with_encoding(WINDOWS_1252) + } + + let mut actual = MissingValues::new( + vec![ + Datum::String(windows_1252("abcdefgh")), + Datum::String(windows_1252("éèäî ")), + Datum::String(windows_1252("aaéèäîdf")), + ], + None, + ) + .unwrap(); + actual.codepage_to_unicode(); + + fn utf_8(s: &str) -> WithEncoding { + ByteString::from(s).with_encoding(UTF_8) + } + + let expected = MissingValues::new( + vec![ + Datum::String(utf_8("abcdefgh")), + Datum::String(utf_8("éèäî")), + Datum::String(utf_8("aaéèä")), + ], + None, + ) + .unwrap(); + + assert_eq!(&actual, &expected); + } } diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index 6fc8162468..b4109cb206 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -29,7 +29,7 @@ use thiserror::Error as ThisError; use unicode_width::UnicodeWidthStr; use crate::{ - data::{ByteString, Datum, }, + data::{ByteString, Datum}, dictionary::{VarType, VarWidth}, sys::raw, }; @@ -640,6 +640,18 @@ impl Format { _ => *self = Self::default_for_width(width), } } + + pub fn codepage_to_unicode(&mut self) { + let mut width = self.var_width(); + width.codepage_to_unicode(); + if let Some(width) = width.as_string_width() { + if self.type_ == Type::AHex { + self.w = width as u16 * 2; + } else { + self.w = width as u16; + } + } + } } impl Debug for Format { @@ -1334,3 +1346,32 @@ impl Iterator for DateTemplate { Some(TemplateItem { c, n }) } } + +#[cfg(test)] +mod tests { + use crate::format::{Format, Type, Width}; + + #[test] + fn codepage_to_unicode() { + fn check_format(input: Format, expected_width: Width) { + let mut output = input; + output.codepage_to_unicode(); + let expected = Format::new(input.type_, expected_width, input.d).unwrap(); + assert_eq!(output, expected); + } + check_format(Format::new(Type::A, 1, 0).unwrap(), 3); + check_format(Format::new(Type::A, 2, 0).unwrap(), 6); + check_format(Format::new(Type::A, 3, 0).unwrap(), 9); + check_format(Format::new(Type::A, 1000, 0).unwrap(), 3000); + check_format(Format::new(Type::A, 20000, 0).unwrap(), 32767); + + check_format(Format::new(Type::AHex, 2, 0).unwrap(), 6); + check_format(Format::new(Type::AHex, 4, 0).unwrap(), 12); + check_format(Format::new(Type::AHex, 6, 0).unwrap(), 18); + check_format(Format::new(Type::AHex, 2000, 0).unwrap(), 6000); + check_format(Format::new(Type::AHex, 20000, 0).unwrap(), 60000); + check_format(Format::new(Type::AHex, 30000, 0).unwrap(), 65534); + + check_format(Format::new(Type::F, 40, 0).unwrap(), 40); + } +} diff --git a/rust/pspp/src/identifier.rs b/rust/pspp/src/identifier.rs index 0923504b4d..9697cf7673 100644 --- a/rust/pspp/src/identifier.rs +++ b/rust/pspp/src/identifier.rs @@ -221,6 +221,22 @@ impl Identifier { Self::from_encoding(s, UTF_8) } + /// Converts this identifier to UTF-8. This is generally a no-op, because + /// our internal encoding is UTF-8, but some identifiers are longer in UTF-8 + /// than in their code page, which means that to satisfy the 64-byte limit + /// this function sometimes has to remove trailing grapheme clusters. + pub fn codepage_to_unicode(&mut self) { + while self.len() > Self::MAX_LEN { + let (new_len, _) = self.as_str().grapheme_indices(true).next_back().unwrap(); + self.0.truncate(new_len); + if self.0.is_empty() { + // We had a grapheme cluster longer than 64 bytes! + *self = Identifier::new("VAR").unwrap(); + return; + } + } + } + pub fn from_encoding( s: impl Into>, encoding: &'static Encoding, @@ -627,7 +643,7 @@ where #[cfg(test)] mod tests { - use encoding_rs::{UTF_8, WINDOWS_1252}; + use encoding_rs::{Encoding, UTF_8, WINDOWS_1252}; use crate::identifier::Identifier; @@ -660,4 +676,37 @@ mod tests { assert_eq!(&short, expected_short); } } + + #[test] + fn codepage_to_unicode() { + fn check_unicode(identifier: &str, encoding: &'static Encoding, expected: &str) { + let identifier = Identifier::from_encoding(String::from(identifier), encoding).unwrap(); + let mut actual = identifier.clone(); + actual.codepage_to_unicode(); + assert_eq!(actual.as_str(), expected); + } + + check_unicode("abc", UTF_8, "abc"); + check_unicode("éèäî", UTF_8, "éèäî"); + + // 32 bytes in windows-1252, 64 bytes in UTF-8, no truncation. + check_unicode( + "éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî", + WINDOWS_1252, + "éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî", + ); + + // 33 or 34 bytes in windows-1252, 65 or 66 bytes in UTF-8, truncate + // last (2-byte) character. + check_unicode( + "xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî", + WINDOWS_1252, + "xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä", + ); + check_unicode( + "xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî", + WINDOWS_1252, + "xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä", + ); + } } diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index e2e531d12b..c5eacee7b3 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -26,7 +26,7 @@ use encoding_rs::UTF_8; use crate::{ crypto::EncryptedFile, - data::{ByteString, Datum}, + data::Datum, dictionary::{Dictionary, VarWidth, Variable}, identifier::Identifier, output::{ -- 2.30.2