work on converting a file to unicode

author Ben Pfaff <blp@cs.stanford.edu>

Tue, 5 Aug 2025 02:37:18 +0000 (19:37 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Tue, 5 Aug 2025 02:37:18 +0000 (19:37 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Tue, 5 Aug 2025 02:37:18 +0000 (19:37 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Tue, 5 Aug 2025 02:37:18 +0000 (19:37 -0700)
diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs

index c86e20a09074be521f27904078c9a28055751dc6..fe77681998b30fc32b258470cac8d8f2de199efa 100644 (file)
--- a/rust/pspp/src/data.rs
+++ b/rust/pspp/src/data.rs
@@ -248,6 +248,18 @@ impl From<&'_ str> for ByteString {
      }
  }
  
+/// Builds a [`ByteString`] from text that may be either borrowed or owned.
+impl From<Cow<'_, str>> for ByteString {
+    /// Obtains an owned `String` from `text` and converts that `String` into
+    /// a `ByteString`.
+    fn from(text: Cow<'_, str>) -> Self {
+        let owned: String = text.into_owned();
+        Self::from(owned)
+    }
+}
+
+/// Builds a [`ByteString`] from bytes that may be either borrowed or owned.
+impl From<Cow<'_, [u8]>> for ByteString {
+    /// Obtains an owned `Vec<u8>` from `bytes` and converts that vector into
+    /// a `ByteString`.
+    fn from(bytes: Cow<'_, [u8]>) -> Self {
+        let owned: Vec<u8> = bytes.into_owned();
+        Self::from(owned)
+    }
+}
+
  impl From<Vec<u8>> for ByteString {
      fn from(value: Vec<u8>) -> Self {
          Self(value)
@@ -315,7 +327,7 @@ impl MutRawString for ByteString {
                  self.0.truncate(new_len);
              }
              Ordering::Equal => (),
-            Ordering::Greater => self.0.extend((self.0.len()..new_len).map(|_| b' ')),
+            Ordering::Greater => self.0.resize(new_len, b' '),
          }
          Ok(())
      }
@@ -357,6 +369,20 @@ impl Datum<WithEncoding<ByteString>> {
          let s: String = s.into();
          Datum::String(ByteString::from(s).with_encoding(UTF_8))
      }
+
+    /// Converts a string datum to UTF-8 in place by calling
+    /// `codepage_to_unicode` on its string value.  A numeric datum is left
+    /// untouched.
+    pub fn codepage_to_unicode(&mut self) {
+        if let Datum::String(string) = self {
+            string.codepage_to_unicode();
+        }
+    }
+
+    /// Consumes the datum and returns the same value without its encoding
+    /// tag: a string keeps only its inner [`ByteString`], and a number is
+    /// passed through unchanged.
+    pub fn without_encoding(self) -> Datum<ByteString> {
+        match self {
+            Datum::String(WithEncoding { inner, .. }) => Datum::String(inner),
+            Datum::Number(number) => Datum::Number(number),
+        }
+    }
  }
  
  impl<'a> Datum<WithEncoding<ByteCow<'a>>> {
diff --git a/rust/pspp/src/data/encoded.rs b/rust/pspp/src/data/encoded.rs

index a12eccb9bf15bfc8fbdbe628706139d75383c9c6..3584d0507047aadc8d48571a59d832bd2cd17240 100644 (file)
--- a/rust/pspp/src/data/encoded.rs
+++ b/rust/pspp/src/data/encoded.rs
@@ -8,7 +8,10 @@ use std::{
  use encoding_rs::{Encoding, UTF_8};
  use serde::Serialize;
  
-use crate::data::{ByteCow, ByteStr, ByteString, MutRawString, Quoted, RawString, ResizeError};
+use crate::{
+    data::{ByteCow, ByteStr, ByteString, MutRawString, Quoted, RawString, ResizeError},
+    dictionary::VarWidth,
+};
  
  pub trait Encoded {
      fn encoding(&self) -> &'static Encoding;
@@ -193,6 +196,27 @@ where
      }
  }
  
+impl WithEncoding<ByteString> {
+    /// Converts the string to UTF-8 in place.
+    ///
+    /// Nothing happens if the encoding is already UTF-8.  Otherwise the bytes
+    /// are decoded from the current encoding, and the string is then forced
+    /// to three times its original byte length, capped at
+    /// `VarWidth::MAX_STRING`.  Shorter results are padded with spaces and
+    /// longer ones are cut at that byte count.  Finally the encoding is
+    /// recorded as UTF-8.
+    pub fn codepage_to_unicode(&mut self) {
+        if self.encoding() == UTF_8 {
+            return;
+        }
+
+        // Compute the target length from the length before conversion.
+        let target_len = (self.inner.len() * 3).min(VarWidth::MAX_STRING as usize);
+
+        // A borrowed result means the decoder returned the input bytes as
+        // they were, so only an owned result needs to be stored.
+        if let Cow::Owned(decoded) = self
+            .encoding()
+            .decode_without_bom_handling(self.raw_string_bytes())
+            .0
+        {
+            self.inner = ByteString::from(decoded);
+        }
+
+        // Use `self.inner.0.resize` (instead of `self.inner.resize()`)
+        // because this is a forced resize that can trim off non-spaces.
+        self.inner.0.resize(target_len, b' ');
+
+        self.encoding = UTF_8;
+    }
+}
+
  impl<T> Encoded for WithEncoding<T> {
      fn encoding(&self) -> &'static Encoding {
          self.encoding
@@ -216,3 +240,41 @@ where
          self.inner.hash(state);
      }
  }
+
+#[cfg(test)]
+mod tests {
+    use std::char::REPLACEMENT_CHARACTER;
+
+    use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};
+
+    use crate::data::{ByteString, EncodedString, RawString};
+
+    /// Checks `WithEncoding::codepage_to_unicode` on ASCII text, accented
+    /// text, and strings long enough to hit the maximum string width.
+    #[test]
+    fn codepage_to_unicode() {
+        /// Encodes `original` in `encoding`, converts it to Unicode, and
+        /// compares the outcome with `expected`.
+        fn check_unicode(original: &str, encoding: &'static Encoding, expected: &str) {
+            let (bytes, _, _) = encoding.encode(original);
+            let mut actual = ByteString::from(bytes).with_encoding(encoding);
+            actual.codepage_to_unicode();
+            assert_eq!(actual.as_str().len(), expected.len());
+            assert_eq!(actual.as_str(), expected);
+        }
+
+        // Already UTF-8: nothing changes, not even the length.
+        check_unicode("abc", UTF_8, "abc");
+
+        // Short strings triple in length and are padded with spaces.
+        check_unicode("abc", WINDOWS_1252, "abc      ");
+        check_unicode("éèäî", WINDOWS_1252, "éèäî    ");
+
+        // The tripled length is capped, so the padding is shorter here.
+        let mut padded = "é".repeat(15000);
+        padded.push_str(&" ".repeat(2767));
+        check_unicode(&"é".repeat(15000), WINDOWS_1252, &padded);
+
+        // The converted text exceeds the cap and is cut in the middle of a
+        // character.
+        let mut cut = "é".repeat(16383);
+        cut.push(REPLACEMENT_CHARACTER);
+        check_unicode(&"é".repeat(20000), WINDOWS_1252, &cut);
+    }
+}
diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs

index 6f30e299725864d96637a984fee4def9422c54c5..4608d2dda3ebe6eecccb6b38c9214cd4b9a03239 100644 (file)
--- a/rust/pspp/src/dictionary.rs
+++ b/rust/pspp/src/dictionary.rs
@@ -26,7 +26,7 @@ use std::{
      str::FromStr,
  };
  
-use encoding_rs::Encoding;
+use encoding_rs::{Encoding, UTF_8};
  use enum_map::{Enum, EnumMap};
  use indexmap::IndexSet;
  use num::integer::div_ceil;
@@ -39,7 +39,7 @@ use thiserror::Error as ThisError;
  use unicase::UniCase;
  
  use crate::{
-    data::{ByteString, Datum, EncodedString, ResizeError, WithEncoding},
+    data::{ByteString, Datum, Encoded, EncodedString, ResizeError, WithEncoding},
      format::{DisplayPlain, Format},
      identifier::{ByIdentifier, HasIdentifier, Identifier},
      output::pivot::{
@@ -218,6 +218,13 @@ impl VarWidth {
      pub fn display_adjective(&self) -> VarWidthAdjective {
          VarWidthAdjective(*self)
      }
+
+    /// Triples a string width, capped at [`Self::MAX_STRING`].  A numeric
+    /// width is left as it is.
+    pub fn codepage_to_unicode(&mut self) {
+        if let VarWidth::String(width) = self {
+            let tripled = width.saturating_mul(3);
+            *width = tripled.min(Self::MAX_STRING);
+        }
+    }
  }
  
  pub struct Segments {
@@ -832,6 +839,52 @@ impl Dictionary {
              .map(|names| names.into_iter().flatten().collect())
              .collect()
      }
+
+    /// Converts the dictionary to UTF-8 in place.
+    ///
+    /// Does nothing if the dictionary encoding is already UTF-8.  Otherwise
+    /// every variable, vector and multiple response set is converted and put
+    /// into a fresh collection, the dictionary attributes are converted, and
+    /// the encoding is recorded as UTF-8.
+    ///
+    /// If a converted item is already present in its new collection, it is
+    /// renamed `Var<n>`, `Vec<n>` or `Mr<n>`, with `n` counting up from 1
+    /// within each kind until the item can be inserted.
+    pub fn codepage_to_unicode(&mut self) {
+        if self.encoding == UTF_8 {
+            return;
+        }
+
+        // Variables.
+        let mut new_variables = IndexSet::new();
+        let mut suffix = 0;
+        for mut variable in self.variables.drain(..) {
+            variable.codepage_to_unicode();
+            while new_variables.contains(&variable) {
+                suffix += 1;
+                variable.name = Identifier::new(format!("Var{suffix}")).unwrap();
+            }
+            new_variables.insert(variable);
+        }
+        self.variables = new_variables;
+
+        // Vectors.
+        let mut new_vectors = HashSet::new();
+        let mut suffix = 0;
+        for mut vector in self.vectors.drain() {
+            vector.codepage_to_unicode();
+            while new_vectors.contains(&vector) {
+                suffix += 1;
+                vector.name = Identifier::new(format!("Vec{suffix}")).unwrap();
+            }
+            new_vectors.insert(vector);
+        }
+        self.vectors = new_vectors;
+
+        // Dictionary-level attributes.
+        self.attributes.codepage_to_unicode();
+
+        // Multiple response sets.
+        let mut new_mrsets = BTreeSet::new();
+        let mut suffix = 0;
+        while let Some(mut mrset) = self.mrsets.pop_first() {
+            mrset.codepage_to_unicode();
+            while new_mrsets.contains(&mrset) {
+                suffix += 1;
+                mrset.name = Identifier::new(format!("Mr{suffix}")).unwrap();
+            }
+            new_mrsets.insert(mrset);
+        }
+        self.mrsets = new_mrsets;
+
+        self.encoding = UTF_8;
+    }
  }
  
  pub struct OutputVariables<'a> {
@@ -1294,6 +1347,15 @@ impl Attributes {
      pub fn has_any(&self, include_at: bool) -> bool {
          self.iter(include_at).next().is_some()
      }
+
+    /// Converts every attribute name with `codepage_to_unicode`, rebuilding
+    /// the map because the names are its keys.  Attribute values are carried
+    /// over unchanged.
+    ///
+    /// Entries are reinserted in ascending order of their old names.  If two
+    /// converted names compare equal, the value inserted later replaces the
+    /// earlier one.
+    pub fn codepage_to_unicode(&mut self) {
+        let old = std::mem::take(&mut self.0);
+        for (mut name, value) in old {
+            name.codepage_to_unicode();
+            self.0.insert(name, value);
+        }
+    }
  }
  
  impl Debug for Attributes {
@@ -1466,6 +1528,20 @@ impl Variable {
              width: self.width,
          }
      }
+
+    /// Converts this variable to UTF-8 in place.
+    ///
+    /// The name, width, missing values, print and write formats, and
+    /// attributes are each converted with their own `codepage_to_unicode`.
+    /// The short names are discarded and the encoding is recorded as UTF-8.
+    ///
+    /// NOTE(review): nothing here calls `ValueLabels::codepage_to_unicode`;
+    /// confirm whether value labels are converted elsewhere.
+    pub fn codepage_to_unicode(&mut self) {
+        self.name.codepage_to_unicode();
+
+        // Anything old enough to not support long names is old enough not to
+        // support Unicode.
+        self.short_names.clear();
+
+        self.width.codepage_to_unicode();
+        self.print_format.codepage_to_unicode();
+        self.write_format.codepage_to_unicode();
+        self.missing_values.codepage_to_unicode();
+        self.attributes.codepage_to_unicode();
+
+        self.encoding = UTF_8;
+    }
  }
  
  impl HasIdentifier for Variable {
@@ -1488,6 +1564,10 @@ impl DictIndexVector {
          update_dict_index_vec(&mut self.variables, f);
          (!self.variables.is_empty()).then_some(self)
      }
+
+    /// Converts the vector's name with `Identifier::codepage_to_unicode`.
+    /// Nothing else in the vector is touched.
+    pub fn codepage_to_unicode(&mut self) {
+        Identifier::codepage_to_unicode(&mut self.name);
+    }
  }
  
  impl HasIdentifier for DictIndexVector {
@@ -1901,6 +1981,10 @@ impl DictIndexMultipleResponseSet {
          update_dict_index_vec(&mut self.variables, f);
          (self.variables.len() > 1).then_some(self)
      }
+
+    /// Converts the multiple response set's name with
+    /// `Identifier::codepage_to_unicode`.  Nothing else in the set is
+    /// touched.
+    pub fn codepage_to_unicode(&mut self) {
+        Identifier::codepage_to_unicode(&mut self.name);
+    }
  }
  
  impl HasIdentifier for DictIndexMultipleResponseSet {
@@ -2009,6 +2093,18 @@ impl ValueLabels {
              .filter_map(|(mut datum, string)| datum.resize(width).is_ok().then(|| (datum, string)))
              .collect();
      }
+
+    /// Converts the value that each label is attached to from `encoding` to
+    /// UTF-8, rebuilding the map because those values are its keys.  The
+    /// label text itself is carried over unchanged.
+    pub fn codepage_to_unicode(&mut self, encoding: &'static Encoding) {
+        let relabeled = self
+            .0
+            .drain()
+            .map(|(datum, label)| {
+                // Attach the encoding, convert, then drop the tag again.
+                let mut tagged = datum.with_encoding(encoding);
+                tagged.codepage_to_unicode();
+                (tagged.without_encoding(), label)
+            })
+            .collect();
+        self.0 = relabeled;
+    }
  }
  
  impl Debug for ValueLabels {
@@ -2095,7 +2191,7 @@ impl<'a> MissingValuesMut<'a> {
      }
  }
  
-#[derive(Clone, Default, Serialize)]
+#[derive(Clone, Default, Serialize, PartialEq)]
  pub struct MissingValues {
      /// Individual missing values, up to 3 of them.
      values: Vec<Datum<WithEncoding<ByteString>>>,
@@ -2237,6 +2333,23 @@ impl MissingValues {
          }
          inner(self, width).inspect_err(|_| self.clear())
      }
+
+    /// Converts the individual missing values to UTF-8 in place.
+    ///
+    /// Numbers and strings already tagged as UTF-8 are kept as they are.
+    /// Any other string is replaced by the first 8 bytes of its text as
+    /// UTF-8.  The cut is made at a byte index, so it can fall inside a
+    /// multi-byte character.
+    pub fn codepage_to_unicode(&mut self) {
+        for value in &mut self.values {
+            if let Datum::String(s) = value {
+                if s.encoding() != UTF_8 {
+                    let mut utf8 = ByteString::from(s.as_str());
+                    utf8.0.truncate(8);
+                    *s = WithEncoding::new(utf8, UTF_8);
+                }
+            }
+        }
+    }
  }
  
  #[derive(Copy, Clone, Debug, Serialize, PartialEq)]
@@ -2367,7 +2480,8 @@ mod test {
      use unicase::UniCase;
  
      use crate::{
-        dictionary::{Dictionary, VarWidth, Variable},
+        data::{ByteString, Datum, RawString, WithEncoding},
+        dictionary::{Dictionary, MissingValues, VarWidth, Variable},
          identifier::Identifier,
      };
  
@@ -2505,4 +2619,55 @@ mod test {
              assert_eq!(expected, dict.short_names());
          }
      }
+
+    /// Checks that `VarWidth::codepage_to_unicode` triples string widths up
+    /// to the maximum and leaves numeric widths alone.
+    #[test]
+    fn var_width_codepage_to_unicode() {
+        // Each pair is (input width, expected width after conversion).
+        let cases = [
+            (VarWidth::Numeric, VarWidth::Numeric),
+            (VarWidth::String(1), VarWidth::String(3)),
+            (VarWidth::String(2), VarWidth::String(6)),
+            (VarWidth::String(3), VarWidth::String(9)),
+            (VarWidth::String(1000), VarWidth::String(3000)),
+            (VarWidth::String(20000), VarWidth::String(32767)),
+            (VarWidth::String(30000), VarWidth::String(32767)),
+        ];
+
+        for (input, expected) in cases {
+            let mut actual = input;
+            actual.codepage_to_unicode();
+            assert_eq!(actual, expected);
+        }
+    }
+
+    /// Checks that `MissingValues::codepage_to_unicode` re-encodes
+    /// windows-1252 strings as UTF-8 and keeps at most 8 bytes of each.
+    #[test]
+    fn missing_values_codepage_to_unicode() {
+        /// Encodes `s` in windows-1252 and tags it as such.
+        fn windows_1252(s: &str) -> WithEncoding<ByteString> {
+            let (bytes, _, _) = WINDOWS_1252.encode(s);
+            ByteString::from(bytes).with_encoding(WINDOWS_1252)
+        }
+
+        /// Tags `s` as UTF-8.
+        fn utf_8(s: &str) -> WithEncoding<ByteString> {
+            ByteString::from(s).with_encoding(UTF_8)
+        }
+
+        let expected = MissingValues::new(
+            vec![
+                Datum::String(utf_8("abcdefgh")),
+                Datum::String(utf_8("éèäî")),
+                Datum::String(utf_8("aaéèä")),
+            ],
+            None,
+        )
+        .unwrap();
+
+        let mut actual = MissingValues::new(
+            vec![
+                Datum::String(windows_1252("abcdefgh")),
+                Datum::String(windows_1252("éèäî   ")),
+                Datum::String(windows_1252("aaéèäîdf")),
+            ],
+            None,
+        )
+        .unwrap();
+        actual.codepage_to_unicode();
+
+        assert_eq!(&actual, &expected);
+    }
  }
diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs

index 6fc81624686a8dc603c360cc000b83af4f225370..b4109cb2062b6f5093669c1a0f233a55f953328d 100644 (file)
--- a/rust/pspp/src/format/mod.rs
+++ b/rust/pspp/src/format/mod.rs
@@ -29,7 +29,7 @@ use thiserror::Error as ThisError;
  use unicode_width::UnicodeWidthStr;
  
  use crate::{
-    data::{ByteString, Datum, },
+    data::{ByteString, Datum},
      dictionary::{VarType, VarWidth},
      sys::raw,
  };
@@ -640,6 +640,18 @@ impl Format {
              _ => *self = Self::default_for_width(width),
          }
      }
+
+    /// Widens a string format to match `VarWidth::codepage_to_unicode`.
+    ///
+    /// The format's variable width is converted.  If the result is a string
+    /// width, the format width `w` is set to that width, or to twice that
+    /// width for `AHex`.  Formats for which `as_string_width` returns `None`
+    /// are left unchanged.
+    pub fn codepage_to_unicode(&mut self) {
+        let mut var_width = self.var_width();
+        var_width.codepage_to_unicode();
+        let Some(string_width) = var_width.as_string_width() else {
+            return;
+        };
+        let string_width = string_width as u16;
+        self.w = if self.type_ == Type::AHex {
+            string_width * 2
+        } else {
+            string_width
+        };
+    }
  }
  
  impl Debug for Format {
@@ -1334,3 +1346,32 @@ impl Iterator for DateTemplate {
          Some(TemplateItem { c, n })
      }
  }
+
+#[cfg(test)]
+mod tests {
+    use crate::format::{Format, Type, Width};
+
+    /// Checks the width that `Format::codepage_to_unicode` produces for `A`,
+    /// `AHex` and `F` formats.
+    #[test]
+    fn codepage_to_unicode() {
+        // Each entry is (format type, input width, expected output width).
+        let cases: [(Type, Width, Width); 12] = [
+            (Type::A, 1, 3),
+            (Type::A, 2, 6),
+            (Type::A, 3, 9),
+            (Type::A, 1000, 3000),
+            (Type::A, 20000, 32767),
+            (Type::AHex, 2, 6),
+            (Type::AHex, 4, 12),
+            (Type::AHex, 6, 18),
+            (Type::AHex, 2000, 6000),
+            (Type::AHex, 20000, 60000),
+            (Type::AHex, 30000, 65534),
+            (Type::F, 40, 40),
+        ];
+
+        for (type_, width, expected_width) in cases {
+            let input = Format::new(type_, width, 0).unwrap();
+            let mut output = input;
+            output.codepage_to_unicode();
+            let expected = Format::new(input.type_, expected_width, input.d).unwrap();
+            assert_eq!(output, expected);
+        }
+    }
+}
diff --git a/rust/pspp/src/identifier.rs b/rust/pspp/src/identifier.rs

index 0923504b4d2cb5b490b754f99b8166369777d83b..9697cf7673b0804273f39070e7222cf5db47efb4 100644 (file)
--- a/rust/pspp/src/identifier.rs
+++ b/rust/pspp/src/identifier.rs
@@ -221,6 +221,22 @@ impl Identifier {
          Self::from_encoding(s, UTF_8)
      }
  
+    /// Shortens this identifier until it is at most [`Self::MAX_LEN`] bytes
+    /// long.
+    ///
+    /// Usually there is nothing to do, because identifiers are already held
+    /// as UTF-8.  A name can take more bytes in UTF-8 than it did in its code
+    /// page, though, and then whole grapheme clusters are dropped from the
+    /// end until the name fits.  If that would leave the name empty, it
+    /// becomes `VAR` instead.
+    pub fn codepage_to_unicode(&mut self) {
+        loop {
+            if self.len() <= Self::MAX_LEN {
+                return;
+            }
+
+            // Byte offset at which the last grapheme cluster begins.
+            let cut = self.as_str().grapheme_indices(true).next_back().unwrap().0;
+            self.0.truncate(cut);
+
+            if self.0.is_empty() {
+                // We had a grapheme cluster longer than 64 bytes!
+                *self = Identifier::new("VAR").unwrap();
+                return;
+            }
+        }
+    }
+
      pub fn from_encoding(
          s: impl Into<UniCase<String>>,
          encoding: &'static Encoding,
@@ -627,7 +643,7 @@ where
  
  #[cfg(test)]
  mod tests {
-    use encoding_rs::{UTF_8, WINDOWS_1252};
+    use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};
  
      use crate::identifier::Identifier;
  
@@ -660,4 +676,37 @@ mod tests {
              assert_eq!(&short, expected_short);
          }
      }
+
+    /// Checks that `Identifier::codepage_to_unicode` leaves names that fit
+    /// alone and drops trailing characters from names that are too long.
+    #[test]
+    fn codepage_to_unicode() {
+        // Each entry is (identifier, encoding it is created from, expected).
+        let cases: [(&str, &'static Encoding, &str); 5] = [
+            ("abc", UTF_8, "abc"),
+            ("éèäî", UTF_8, "éèäî"),
+            // 32 bytes in windows-1252, 64 bytes in UTF-8, no truncation.
+            (
+                "éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
+                WINDOWS_1252,
+                "éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
+            ),
+            // 33 or 34 bytes in windows-1252, 65 or 66 bytes in UTF-8,
+            // truncate last (2-byte) character.
+            (
+                "xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
+                WINDOWS_1252,
+                "xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä",
+            ),
+            (
+                "xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
+                WINDOWS_1252,
+                "xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä",
+            ),
+        ];
+
+        for (identifier, encoding, expected) in cases {
+            let mut actual =
+                Identifier::from_encoding(String::from(identifier), encoding).unwrap();
+            actual.codepage_to_unicode();
+            assert_eq!(actual.as_str(), expected);
+        }
+    }
  }
diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs

index e2e531d12bdacf4ea0c5fe75033ffd3836ca8feb..c5eacee7b3cd988aa205906b1e901c2aab20fc87 100644 (file)
--- a/rust/pspp/src/sys/test.rs
+++ b/rust/pspp/src/sys/test.rs
@@ -26,7 +26,7 @@ use encoding_rs::UTF_8;
  
  use crate::{
      crypto::EncryptedFile,
-    data::{ByteString, Datum},
+    data::Datum,
      dictionary::{Dictionary, VarWidth, Variable},
      identifier::Identifier,
      output::{
author	Ben Pfaff <blp@cs.stanford.edu>
	Tue, 5 Aug 2025 02:37:18 +0000 (19:37 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Tue, 5 Aug 2025 02:37:18 +0000 (19:37 -0700)
rust/pspp/src/data.rs		patch \| blob \| history
rust/pspp/src/data/encoded.rs		patch \| blob \| history
rust/pspp/src/dictionary.rs		patch \| blob \| history
rust/pspp/src/format/mod.rs		patch \| blob \| history
rust/pspp/src/identifier.rs		patch \| blob \| history
rust/pspp/src/sys/test.rs		patch \| blob \| history