cleanup
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 13 Jul 2025 21:13:53 +0000 (14:13 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 13 Jul 2025 21:13:53 +0000 (14:13 -0700)
rust/pspp/src/data.rs
rust/pspp/src/format/parse.rs
rust/pspp/src/sys/encoding.rs
rust/pspp/src/sys/raw.rs

index 59026039960205e4a28b80d156d866867e1d1eaf..b6fa22bfa5620bfba55230eb8df242d2a7f433d6 100644 (file)
@@ -36,13 +36,10 @@ use std::{
     str::from_utf8,
 };
 
-use encoding_rs::{mem::decode_latin1, Encoding};
+use encoding_rs::{mem::decode_latin1, Encoding, UTF_8};
 use ordered_float::OrderedFloat;
 
-use crate::{
-    dictionary::{VarType, VarWidth},
-    sys::raw::EncodedStr,
-};
+use crate::dictionary::{VarType, VarWidth};
 
 /// An owned string in an unspecified character encoding.
 ///
@@ -411,3 +408,164 @@ pub struct Case(
     /// [Dictionary]: crate::dictionary::Dictionary
     pub Vec<Datum>,
 );
+
+/// An owned string and its [Encoding].
+///
+/// The string is not guaranteed to be valid in the encoding.
+///
+/// The borrowed form of such a string is [EncodedStr].
+#[derive(Clone, Debug)]
+pub enum EncodedString {
+    /// A string in arbitrary encoding.
+    Encoded {
+        /// The bytes of the string.
+        bytes: Vec<u8>,
+
+        /// The string's encoding.
+        ///
+        /// This can be [UTF_8].
+        encoding: &'static Encoding,
+    },
+
+    /// A string that is in UTF-8 and known to be valid.
+    Utf8 {
+        /// The string.
+        s: String,
+    },
+}
+
+impl EncodedString {
+    /// Returns the string's [Encoding].
+    pub fn encoding(&self) -> &'static Encoding {
+        match self {
+            EncodedString::Encoded { encoding, .. } => encoding,
+            EncodedString::Utf8 { .. } => UTF_8,
+        }
+    }
+
+    /// Returns a borrowed form of this string.
+    pub fn borrowed(&self) -> EncodedStr<'_> {
+        match self {
+            EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding },
+            EncodedString::Utf8 { s } => EncodedStr::Utf8 { s },
+        }
+    }
+}
+
+impl<'a> From<EncodedStr<'a>> for EncodedString {
+    fn from(value: EncodedStr<'a>) -> Self {
+        match value {
+            EncodedStr::Encoded { bytes, encoding } => Self::Encoded {
+                bytes: bytes.into(),
+                encoding,
+            },
+            EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() },
+        }
+    }
+}
+
+/// A borrowed string and its [Encoding].
+///
+/// The string is not guaranteed to be valid in the encoding.
+///
+/// The owned form of such a string is [EncodedString].
+pub enum EncodedStr<'a> {
+    /// A string in an arbitrary encoding
+    Encoded {
+        /// The bytes of the string.
+        bytes: &'a [u8],
+
+        /// The string's encoding.
+        ///
+        /// This can be [UTF_8].
+        encoding: &'static Encoding,
+    },
+
+    /// A string in UTF-8 that is known to be valid.
+    Utf8 {
+        /// The string.
+        s: &'a str,
+    },
+}
+
+impl<'a> EncodedStr<'a> {
+    /// Construct a new string with an arbitrary encoding.
+    pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
+        Self::Encoded { bytes, encoding }
+    }
+
+    /// Returns this string recoded in UTF-8.  Invalid characters will be
+    /// replaced by [REPLACEMENT_CHARACTER].
+    ///
+    /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER
+    pub fn as_str(&self) -> Cow<'_, str> {
+        match self {
+            EncodedStr::Encoded { bytes, encoding } => {
+                encoding.decode_without_bom_handling(bytes).0
+            }
+            EncodedStr::Utf8 { s } => Cow::from(*s),
+        }
+    }
+
+    /// Returns the bytes in the string, in its encoding.
+    pub fn as_bytes(&self) -> &[u8] {
+        match self {
+            EncodedStr::Encoded { bytes, .. } => bytes,
+            EncodedStr::Utf8 { s } => s.as_bytes(),
+        }
+    }
+
+    /// Returns this string recoded in `encoding`.  Invalid characters will be
+    /// replaced by [REPLACEMENT_CHARACTER].
+    ///
+    /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER
+    pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
+        match self {
+            EncodedStr::Encoded { bytes, encoding: from } => {
+                let utf8 = from.decode_without_bom_handling(bytes).0;
+                let unchanged = matches!(utf8, Cow::Borrowed(_));
+                match encoding.encode(&utf8).0 {
+                    // Borrowed on both legs: `bytes` is already the answer.
+                    Cow::Borrowed(_) if unchanged => Cow::from(*bytes),
+                    Cow::Borrowed(same) => Cow::Owned(same.to_vec()),
+                    Cow::Owned(owned) => Cow::Owned(owned),
+                }
+            }
+            EncodedStr::Utf8 { s } => encoding.encode(s).0,
+        }
+    }
+
+    /// Returns true if this string is empty.
+    pub fn is_empty(&self) -> bool {
+        match self {
+            EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
+            EncodedStr::Utf8 { s } => s.is_empty(),
+        }
+    }
+
+    /// Returns a helper for displaying this string in double quotes.
+    pub fn quoted(&self) -> QuotedEncodedStr {
+        QuotedEncodedStr(self)
+    }
+}
+
+impl<'a> From<&'a str> for EncodedStr<'a> {
+    fn from(s: &'a str) -> Self {
+        Self::Utf8 { s }
+    }
+}
+
+impl<'a> From<&'a String> for EncodedStr<'a> {
+    fn from(s: &'a String) -> Self {
+        Self::Utf8 { s: s.as_str() }
+    }
+}
+
+/// Helper struct for displaying an [EncodedStr] in double quotes.
+pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>);
+
+impl Display for QuotedEncodedStr<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.0.as_str())
+    }
+}
index b69fbdb48b96d3cb9a7b28416f833ca55699cac8..0c7c6dee378c0d627cf89e288297ff19a1274eec 100644 (file)
 
 use crate::{
     calendar::{calendar_gregorian_to_offset, DateError},
-    data::Datum,
+    data::{Datum, EncodedStr, EncodedString},
     endian::{Endian, Parse},
     format::{DateTemplate, Decimals, Settings, TemplateItem, Type},
     settings::{EndianSettings, Settings as PsppSettings},
-    sys::raw::{EncodedStr, EncodedString},
 };
 use encoding_rs::Encoding;
 use smallstr::SmallString;
@@ -921,14 +920,13 @@ mod test {
 
     use crate::{
         calendar::{days_in_month, is_leap_year},
-        data::Datum,
+        data::{Datum, EncodedStr},
         endian::Endian,
         format::{
             parse::{ParseError, ParseErrorKind, Sign},
             Epoch, Format, Settings as FormatSettings, Type,
         },
         settings::EndianSettings,
-        sys::raw::EncodedStr,
     };
 
     fn test(name: &str, type_: Type) {
index 1510bb8a025d891f284980138b6eb90fce592a76..0f09f6bc5e26a09873a5eaf81329d061acadcf3a 100644 (file)
@@ -78,34 +78,3 @@ pub fn get_encoding(
 
     Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
 }
-
-/*
-#[cfg(test)]
-mod tests {
-    use std::thread::spawn;
-
-    use encoding_rs::{EUC_JP, UTF_8, WINDOWS_1252};
-
-    #[test]
-    fn round_trip() {
-        let mut threads = Vec::new();
-        for thread in 0..128 {
-            let start: u32 = thread << 25;
-            let end = start + ((1 << 25) - 1);
-            threads.push(spawn(move || {
-                for i in start..=end {
-                    let s = i.to_le_bytes();
-                    let (utf8, replacement) = EUC_JP.decode_without_bom_handling(&s);
-                    if !replacement {
-                        let s2 = UTF_8.encode(&utf8).0;
-                        assert_eq!(s.as_slice(), &*s2);
-                    }
-                }
-            }));
-        }
-        for thread in threads {
-            thread.join().unwrap();
-        }
-    }
-}
-*/
index 0623d374e287f783d2bff2a64f724f9c83ee7480..6e822be41ed4e18894d2519c37af1c6aaccdbca4 100644 (file)
@@ -1586,112 +1586,6 @@ impl<const N: usize> Debug for RawStrArray<N> {
     }
 }
 
-#[derive(Clone, Debug)]
-pub enum EncodedString {
-    Encoded {
-        bytes: Vec<u8>,
-        encoding: &'static Encoding,
-    },
-    Utf8 {
-        s: String,
-    },
-}
-
-impl EncodedString {
-    pub fn borrowed(&self) -> EncodedStr<'_> {
-        match self {
-            EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding },
-            EncodedString::Utf8 { s } => EncodedStr::Utf8 { s },
-        }
-    }
-}
-
-impl<'a> From<EncodedStr<'a>> for EncodedString {
-    fn from(value: EncodedStr<'a>) -> Self {
-        match value {
-            EncodedStr::Encoded { bytes, encoding } => Self::Encoded {
-                bytes: bytes.into(),
-                encoding,
-            },
-            EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() },
-        }
-    }
-}
-
-pub enum EncodedStr<'a> {
-    Encoded {
-        bytes: &'a [u8],
-        encoding: &'static Encoding,
-    },
-    Utf8 {
-        s: &'a str,
-    },
-}
-
-impl<'a> EncodedStr<'a> {
-    pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
-        Self::Encoded { bytes, encoding }
-    }
-    pub fn as_str(&self) -> Cow<'_, str> {
-        match self {
-            EncodedStr::Encoded { bytes, encoding } => {
-                encoding.decode_without_bom_handling(bytes).0
-            }
-            EncodedStr::Utf8 { s } => Cow::from(*s),
-        }
-    }
-    pub fn as_bytes(&self) -> &[u8] {
-        match self {
-            EncodedStr::Encoded { bytes, .. } => bytes,
-            EncodedStr::Utf8 { s } => s.as_bytes(),
-        }
-    }
-    pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
-        match self {
-            EncodedStr::Encoded { bytes, encoding } => {
-                let utf8 = encoding.decode_without_bom_handling(bytes).0;
-                match encoding.encode(&utf8).0 {
-                    Cow::Borrowed(_) => {
-                        // Recoding into UTF-8 and then back did not change anything.
-                        Cow::from(*bytes)
-                    }
-                    Cow::Owned(owned) => Cow::Owned(owned),
-                }
-            }
-            EncodedStr::Utf8 { s } => encoding.encode(s).0,
-        }
-    }
-    pub fn is_empty(&self) -> bool {
-        match self {
-            EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
-            EncodedStr::Utf8 { s } => s.is_empty(),
-        }
-    }
-    pub fn quoted(&self) -> QuotedEncodedStr {
-        QuotedEncodedStr(self)
-    }
-}
-
-impl<'a> From<&'a str> for EncodedStr<'a> {
-    fn from(s: &'a str) -> Self {
-        Self::Utf8 { s }
-    }
-}
-
-impl<'a> From<&'a String> for EncodedStr<'a> {
-    fn from(s: &'a String) -> Self {
-        Self::Utf8 { s: s.as_str() }
-    }
-}
-
-pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>);
-
-impl Display for QuotedEncodedStr<'_> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self.0.as_str())
-    }
-}
-
 fn skip_bytes<R: Read>(r: &mut R, mut n: usize) -> Result<(), IoError> {
     thread_local! {
         static BUF: RefCell<[u8; 256]> = RefCell::new([0u8; 256]);