From 09d295be4e3da0997be047ba0adcd6c18ca339f0 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Sun, 18 May 2025 11:55:00 -0700
Subject: [PATCH] work

---
Review notes (not part of the commit message):
- EncodedStr::to_encoding() returned the string's original bytes whenever
  encode() borrowed its input.  encode() borrows only when its output is
  byte-identical to the UTF-8 text, so the UTF-8 bytes are returned now.
- Doc comments were added to the new items, so the hunk sizes and the
  diffstat differ from the commit; the commit and blob ids do not reflect
  these changes.
- Nothing in this patch uses the newly imported `Borrow`.

 rust/pspp/src/sys/raw.rs | 76 +++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs
index ea77e3b4c2..c26ae08052 100644
--- a/rust/pspp/src/sys/raw.rs
+++ b/rust/pspp/src/sys/raw.rs
@@ -5,11 +5,11 @@ use crate::{
     sys::encoding::{default_encoding, get_encoding, Error as EncodingError},
 };
 
-use encoding_rs::{mem::decode_latin1, Encoding};
+use encoding_rs::{mem::decode_latin1, Encoding, UTF_8};
 use flate2::read::ZlibDecoder;
 use num::Integer;
 use std::{
-    borrow::Cow,
+    borrow::{Borrow, Cow},
     cell::RefCell,
     collections::{HashMap, VecDeque},
     fmt::{Debug, Display, Formatter, Result as FmtResult},
@@ -1516,6 +1516,24 @@ impl EncodedString {
             EncodedString::Utf8 { s } => EncodedStr::Utf8 { s },
         }
     }
+    /// Returns this string's bytes if they are labeled as UTF-8, that is, if
+    /// this is the `Utf8` variant or an `Encoded` string whose encoding is
+    /// [`UTF_8`].  Returns `None` for any other encoding.
+    pub fn as_utf8_bytes(&self) -> Option<&[u8]> {
+        match self {
+            EncodedString::Encoded { bytes, encoding } if *encoding == UTF_8 => Some(bytes),
+            EncodedString::Utf8 { s } => Some(s.as_bytes()),
+            EncodedString::Encoded { .. } => None,
+        }
+    }
+    /// Returns this string's bytes together with the encoding they are in; the
+    /// `Utf8` variant reports [`UTF_8`].
+    pub fn as_encoded(&self) -> (&[u8], &'static Encoding) {
+        match self {
+            EncodedString::Encoded { bytes, encoding } => (bytes, *encoding),
+            EncodedString::Utf8 { s } => (s.as_bytes(), UTF_8),
+        }
+    }
 }
 
 impl<'a> From<EncodedStr<'a>> for EncodedString {
@@ -1530,6 +1548,27 @@ impl<'a> From<EncodedStr<'a>> for EncodedString {
     }
 }
 
+/// Compares two strings by their raw bytes when both use the same encoding,
+/// and by their decoded UTF-8 text otherwise.
+impl PartialEq for EncodedString {
+    fn eq(&self, other: &Self) -> bool {
+        // Fast path: both sides are UTF-8, so no decoding is needed.
+        if let Some(self_utf8) = self.as_utf8_bytes() {
+            if let Some(other_utf8) = other.as_utf8_bytes() {
+                return self_utf8 == other_utf8;
+            }
+        }
+
+        let (self_bytes, self_encoding) = self.as_encoded();
+        let (other_bytes, other_encoding) = other.as_encoded();
+        if self_encoding == other_encoding {
+            self_bytes == other_bytes
+        } else {
+            self.borrowed().to_utf8() == other.borrowed().to_utf8()
+        }
+    }
+}
+
 pub enum EncodedStr<'a> {
     Encoded {
         bytes: &'a [u8],
@@ -1558,19 +1597,30 @@ impl<'a> EncodedStr<'a> {
             EncodedStr::Utf8 { s } => s.as_bytes(),
         }
     }
-    pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
+    /// Returns this string as UTF-8 text.  `Encoded` strings are decoded with
+    /// `decode_without_bom_handling`; a `Utf8` string is borrowed as is.
+    pub fn to_utf8(&self) -> Cow<'a, str> {
         match self {
             EncodedStr::Encoded { bytes, encoding } => {
-                let utf8 = encoding.decode_without_bom_handling(bytes).0;
-                match encoding.encode(&utf8).0 {
-                    Cow::Borrowed(_) => {
-                        // Recoding into UTF-8 and then back did not change anything.
-                        Cow::from(*bytes)
-                    }
-                    Cow::Owned(owned) => Cow::Owned(owned),
-                }
+                encoding.decode_without_bom_handling(bytes).0
             }
-            EncodedStr::Utf8 { s } => encoding.encode(s).0,
+            EncodedStr::Utf8 { s } => Cow::from(*s),
+        }
+    }
+    /// Returns this string's text re-encoded in `encoding`.
+    pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
+        let utf8 = self.to_utf8();
+        let recoded = match encoding.encode(&utf8).0 {
+            // `encode` borrows its input only when the encoded form is
+            // byte-for-byte the UTF-8 text, so that text is the result; the
+            // string's own bytes may be in a different encoding.
+            Cow::Borrowed(_) => None,
+            Cow::Owned(bytes) => Some(bytes),
+        };
+        match (recoded, utf8) {
+            (Some(bytes), _) => Cow::Owned(bytes),
+            (None, Cow::Borrowed(text)) => Cow::Borrowed(text.as_bytes()),
+            (None, Cow::Owned(text)) => Cow::Owned(text.into_bytes()),
         }
     }
     pub fn is_empty(&self) -> bool {
@@ -1686,7 +1736,7 @@ impl ValueLabelRecord<RawStrArray<8>, RawString> {
             let label_len: u8 = endian.parse(read_bytes(r)?);
             let label_len = label_len as usize;
 
-            let mut label = read_slice(r, label_len)?;
+            let label = read_slice(r, label_len)?;
             let padding_len = Integer::next_multiple_of(&(label_len + 1), &8) - (label_len + 1);
             read_slice(r, padding_len)?;
             labels.push((value, RawString(label)));
-- 
2.30.2