work
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 18 May 2025 18:55:00 +0000 (11:55 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 18 May 2025 18:55:00 +0000 (11:55 -0700)
rust/pspp/src/sys/raw.rs

index ea77e3b4c2b72dd47c08f259707cebe817c02192..c26ae08052c6722d61dae5ab1c634481f8727d35 100644 (file)
@@ -5,11 +5,11 @@ use crate::{
     sys::encoding::{default_encoding, get_encoding, Error as EncodingError},
 };
 
-use encoding_rs::{mem::decode_latin1, Encoding};
+use encoding_rs::{mem::decode_latin1, Encoding, UTF_8};
 use flate2::read::ZlibDecoder;
 use num::Integer;
 use std::{
-    borrow::Cow,
+    borrow::{Borrow, Cow},
     cell::RefCell,
     collections::{HashMap, VecDeque},
     fmt::{Debug, Display, Formatter, Result as FmtResult},
@@ -1516,6 +1516,19 @@ impl EncodedString {
             EncodedString::Utf8 { s } => EncodedStr::Utf8 { s },
         }
     }
+    /// Returns the string's bytes if the string is labeled as UTF-8.
+    ///
+    /// Yields `Some` for the `Utf8` variant and for an `Encoded` variant whose
+    /// encoding is `UTF_8`; every other encoding yields `None`.  The bytes of
+    /// an `Encoded` string are returned as stored; nothing here checks that
+    /// they are well-formed UTF-8.
+    pub fn as_utf8_bytes(&self) -> Option<&[u8]> {
+        match self {
+            EncodedString::Encoded { bytes, encoding } if *encoding == UTF_8 => Some(&bytes),
+            EncodedString::Utf8 { s } => Some(s.as_bytes()),
+            _ => None,
+        }
+    }
+    /// Returns the string's raw bytes paired with the encoding they are in.
+    ///
+    /// An `Encoded` string reports its own bytes and encoding; a `Utf8`
+    /// string reports its bytes with `UTF_8` as the encoding.
+    pub fn as_encoded(&self) -> (&[u8], &'static Encoding) {
+        match self {
+            EncodedString::Encoded { bytes, encoding } => (&bytes, encoding),
+            EncodedString::Utf8 { s } => (s.as_bytes(), UTF_8),
+        }
+    }
 }
 
 impl<'a> From<EncodedStr<'a>> for EncodedString {
@@ -1530,6 +1543,24 @@ impl<'a> From<EncodedStr<'a>> for EncodedString {
     }
 }
 
+/// Equality on [`EncodedString`] compares the text the strings represent, so
+/// two strings stored in different encodings can still compare equal.
+impl PartialEq for EncodedString {
+    /// Returns true if `self` and `other` are equal.
+    ///
+    /// Strings that share an encoding are compared byte for byte.  Strings in
+    /// different encodings are both decoded to UTF-8 and compared as text.
+    fn eq(&self, other: &Self) -> bool {
+        // Both sides are UTF-8: their bytes are directly comparable.
+        if let (Some(lhs), Some(rhs)) = (self.as_utf8_bytes(), other.as_utf8_bytes()) {
+            return lhs == rhs;
+        }
+
+        let (lhs_bytes, lhs_encoding) = self.as_encoded();
+        let (rhs_bytes, rhs_encoding) = other.as_encoded();
+        if lhs_encoding != rhs_encoding {
+            // Raw bytes in different encodings are not comparable, so
+            // compare the decoded text instead.
+            return self.borrowed().to_utf8() == other.borrowed().to_utf8();
+        }
+        lhs_bytes == rhs_bytes
+    }
+}
+
 pub enum EncodedStr<'a> {
     Encoded {
         bytes: &'a [u8],
@@ -1558,19 +1589,18 @@ impl<'a> EncodedStr<'a> {
             EncodedStr::Utf8 { s } => s.as_bytes(),
         }
     }
-    pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
+    /// Returns the string's text as UTF-8.
+    ///
+    /// An `Encoded` string is decoded from its own encoding with
+    /// `decode_without_bom_handling`; only the decoded text (element `.0` of
+    /// the result) is kept and the rest of the result is ignored.  A `Utf8`
+    /// string is returned borrowed, without copying.
+    pub fn to_utf8(&self) -> Cow<'a, str> {
         match self {
             EncodedStr::Encoded { bytes, encoding } => {
-                let utf8 = encoding.decode_without_bom_handling(bytes).0;
-                match encoding.encode(&utf8).0 {
-                    Cow::Borrowed(_) => {
-                        // Recoding into UTF-8 and then back did not change anything.
-                        Cow::from(*bytes)
-                    }
-                    Cow::Owned(owned) => Cow::Owned(owned),
-                }
+                encoding.decode_without_bom_handling(bytes).0
             }
-            EncodedStr::Utf8 { s } => encoding.encode(s).0,
+            EncodedStr::Utf8 { s } => Cow::from(*s),
+        }
+    }
+    /// Returns the string's bytes re-encoded into `encoding`.
+    ///
+    /// The text is first decoded to UTF-8 with [`Self::to_utf8`] and then
+    /// encoded into `encoding`.  The result borrows from the underlying data
+    /// when neither step had to allocate, and is owned otherwise.
+    pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
+        let utf8 = self.to_utf8();
+        // `encode` hands back `Cow::Borrowed` when its output is exactly the
+        // bytes of its UTF-8 input, so there is nothing new to keep then.
+        let reencoded = match encoding.encode(&utf8).0 {
+            Cow::Borrowed(_) => None,
+            Cow::Owned(bytes) => Some(bytes),
+        };
+        // When encoding was a no-op the answer is the *decoded* text's bytes.
+        // Those differ from `self.as_bytes()` whenever decoding changed the
+        // data (e.g. non-ASCII Latin-1 converted to UTF-8), so the original
+        // bytes must not be returned in that case.
+        match (reencoded, utf8) {
+            (Some(bytes), _) => Cow::Owned(bytes),
+            (None, Cow::Borrowed(s)) => Cow::Borrowed(s.as_bytes()),
+            (None, Cow::Owned(s)) => Cow::Owned(s.into_bytes()),
         }
     }
     pub fn is_empty(&self) -> bool {
@@ -1686,7 +1716,7 @@ impl ValueLabelRecord<RawStrArray<8>, RawString> {
             let label_len: u8 = endian.parse(read_bytes(r)?);
             let label_len = label_len as usize;
 
-            let mut label = read_slice(r, label_len)?;
+            let label = read_slice(r, label_len)?;
             let padding_len = Integer::next_multiple_of(&(label_len + 1), &8) - (label_len + 1);
             read_slice(r, padding_len)?;
             labels.push((value, RawString(label)));