From: Ben Pfaff Date: Sun, 13 Jul 2025 21:13:53 +0000 (-0700) Subject: cleanup X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=040a8178f3c9c7265897c4fcc1dc84a883949472;p=pspp cleanup --- diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index 5902603996..b6fa22bfa5 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -36,13 +36,10 @@ use std::{ str::from_utf8, }; -use encoding_rs::{mem::decode_latin1, Encoding}; +use encoding_rs::{mem::decode_latin1, Encoding, UTF_8}; use ordered_float::OrderedFloat; -use crate::{ - dictionary::{VarType, VarWidth}, - sys::raw::EncodedStr, -}; +use crate::dictionary::{VarType, VarWidth}; /// An owned string in an unspecified character encoding. /// @@ -411,3 +408,164 @@ pub struct Case( /// [Dictionary]: crate::dictionary::Dictionary pub Vec, ); + +/// An owned string and its [Encoding]. +/// +/// The string is not guaranteed to be valid in the encoding. +/// +/// The borrowed form of such a string is [EncodedStr]. +#[derive(Clone, Debug)] +pub enum EncodedString { + /// A string in arbitrary encoding. + Encoded { + /// The bytes of the string. + bytes: Vec<u8>, + + /// The string's encoding. + /// + /// This can be [UTF_8]. + encoding: &'static Encoding, + }, + + /// A string that is in UTF-8 and known to be valid. + Utf8 { + /// The string. + s: String, + }, +} + +impl EncodedString { + /// Returns the string's [Encoding]. + pub fn encoding(&self) -> &'static Encoding { + match self { + EncodedString::Encoded { encoding, .. } => encoding, + EncodedString::Utf8 { .. } => UTF_8, + } + } + + /// Returns a borrowed form of this string. 
+ pub fn borrowed(&self) -> EncodedStr<'_> { + match self { + EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding }, + EncodedString::Utf8 { s } => EncodedStr::Utf8 { s }, + } + } +} + +impl<'a> From<EncodedStr<'a>> for EncodedString { + fn from(value: EncodedStr<'a>) -> Self { + match value { + EncodedStr::Encoded { bytes, encoding } => Self::Encoded { + bytes: bytes.into(), + encoding, + }, + EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, + } + } +} + +/// A borrowed string and its [Encoding]. +/// +/// The string is not guaranteed to be valid in the encoding. +/// +/// The owned form of such a string is [EncodedString]. +pub enum EncodedStr<'a> { + /// A string in an arbitrary encoding. + Encoded { + /// The bytes of the string. + bytes: &'a [u8], + + /// The string's encoding. + /// + /// This can be [UTF_8]. + encoding: &'static Encoding, + }, + + /// A string in UTF-8 that is known to be valid. + Utf8 { + /// The string. + s: &'a str, + }, +} + +impl<'a> EncodedStr<'a> { + /// Construct a new string with an arbitrary encoding. + pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { + Self::Encoded { bytes, encoding } + } + + /// Returns this string recoded in UTF-8. Invalid characters will be + /// replaced by [REPLACEMENT_CHARACTER]. + /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + pub fn as_str(&self) -> Cow<'_, str> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + encoding.decode_without_bom_handling(bytes).0 + } + EncodedStr::Utf8 { s } => Cow::from(*s), + } + } + + /// Returns the bytes in the string, in its encoding. + pub fn as_bytes(&self) -> &[u8] { + match self { + EncodedStr::Encoded { bytes, .. } => bytes, + EncodedStr::Utf8 { s } => s.as_bytes(), + } + } + + /// Returns this string recoded in `encoding`. Invalid characters will be + /// replaced by [REPLACEMENT_CHARACTER]. 
+ /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + let utf8 = encoding.decode_without_bom_handling(bytes).0; + match encoding.encode(&utf8).0 { + Cow::Borrowed(_) => { + // Recoding into UTF-8 and then back did not change anything. + Cow::from(*bytes) + } + Cow::Owned(owned) => Cow::Owned(owned), + } + } + EncodedStr::Utf8 { s } => encoding.encode(s).0, + } + } + + /// Returns true if this string is empty. + pub fn is_empty(&self) -> bool { + match self { + EncodedStr::Encoded { bytes, .. } => bytes.is_empty(), + EncodedStr::Utf8 { s } => s.is_empty(), + } + } + + /// Returns a helper for displaying this string in double quotes. + pub fn quoted(&self) -> QuotedEncodedStr { + QuotedEncodedStr(self) + } +} + +impl<'a> From<&'a str> for EncodedStr<'a> { + fn from(s: &'a str) -> Self { + Self::Utf8 { s } + } +} + +impl<'a> From<&'a String> for EncodedStr<'a> { + fn from(s: &'a String) -> Self { + Self::Utf8 { s: s.as_str() } + } +} + +/// Helper struct for displaying an [EncodedStr] in double quotes. 
+pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>); + +impl Display for QuotedEncodedStr<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.0.as_str()) + } +} diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index b69fbdb48b..0c7c6dee37 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -16,11 +16,10 @@ use crate::{ calendar::{calendar_gregorian_to_offset, DateError}, - data::Datum, + data::{Datum, EncodedStr, EncodedString}, endian::{Endian, Parse}, format::{DateTemplate, Decimals, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, - sys::raw::{EncodedStr, EncodedString}, }; use encoding_rs::Encoding; use smallstr::SmallString; @@ -921,14 +920,13 @@ mod test { use crate::{ calendar::{days_in_month, is_leap_year}, - data::Datum, + data::{Datum, EncodedStr}, endian::Endian, format::{ parse::{ParseError, ParseErrorKind, Sign}, Epoch, Format, Settings as FormatSettings, Type, }, settings::EndianSettings, - sys::raw::EncodedStr, }; fn test(name: &str, type_: Type) { diff --git a/rust/pspp/src/sys/encoding.rs b/rust/pspp/src/sys/encoding.rs index 1510bb8a02..0f09f6bc5e 100644 --- a/rust/pspp/src/sys/encoding.rs +++ b/rust/pspp/src/sys/encoding.rs @@ -78,34 +78,3 @@ pub fn get_encoding( Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into())) } - -/* -#[cfg(test)] -mod tests { - use std::thread::spawn; - - use encoding_rs::{EUC_JP, UTF_8, WINDOWS_1252}; - - #[test] - fn round_trip() { - let mut threads = Vec::new(); - for thread in 0..128 { - let start: u32 = thread << 25; - let end = start + ((1 << 25) - 1); - threads.push(spawn(move || { - for i in start..=end { - let s = i.to_le_bytes(); - let (utf8, replacement) = EUC_JP.decode_without_bom_handling(&s); - if !replacement { - let s2 = UTF_8.encode(&utf8).0; - assert_eq!(s.as_slice(), &*s2); - } - } - })); - } - for thread in threads { - 
thread.join().unwrap(); - } - } -} -*/ diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 0623d374e2..6e822be41e 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -1586,112 +1586,6 @@ impl Debug for RawStrArray { } } -#[derive(Clone, Debug)] -pub enum EncodedString { - Encoded { - bytes: Vec<u8>, - encoding: &'static Encoding, - }, - Utf8 { - s: String, - }, -} - -impl EncodedString { - pub fn borrowed(&self) -> EncodedStr<'_> { - match self { - EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding }, - EncodedString::Utf8 { s } => EncodedStr::Utf8 { s }, - } - } -} - -impl<'a> From<EncodedStr<'a>> for EncodedString { - fn from(value: EncodedStr<'a>) -> Self { - match value { - EncodedStr::Encoded { bytes, encoding } => Self::Encoded { - bytes: bytes.into(), - encoding, - }, - EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, - } - } -} - -pub enum EncodedStr<'a> { - Encoded { - bytes: &'a [u8], - encoding: &'static Encoding, - }, - Utf8 { - s: &'a str, - }, -} - -impl<'a> EncodedStr<'a> { - pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { - Self::Encoded { bytes, encoding } - } - pub fn as_str(&self) -> Cow<'_, str> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - encoding.decode_without_bom_handling(bytes).0 - } - EncodedStr::Utf8 { s } => Cow::from(*s), - } - } - pub fn as_bytes(&self) -> &[u8] { - match self { - EncodedStr::Encoded { bytes, .. } => bytes, - EncodedStr::Utf8 { s } => s.as_bytes(), - } - } - pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - let utf8 = encoding.decode_without_bom_handling(bytes).0; - match encoding.encode(&utf8).0 { - Cow::Borrowed(_) => { - // Recoding into UTF-8 and then back did not change anything. 
- Cow::from(*bytes) - } - Cow::Owned(owned) => Cow::Owned(owned), - } - } - EncodedStr::Utf8 { s } => encoding.encode(s).0, - } - } - pub fn is_empty(&self) -> bool { - match self { - EncodedStr::Encoded { bytes, .. } => bytes.is_empty(), - EncodedStr::Utf8 { s } => s.is_empty(), - } - } - pub fn quoted(&self) -> QuotedEncodedStr { - QuotedEncodedStr(self) - } -} - -impl<'a> From<&'a str> for EncodedStr<'a> { - fn from(s: &'a str) -> Self { - Self::Utf8 { s } - } -} - -impl<'a> From<&'a String> for EncodedStr<'a> { - fn from(s: &'a String) -> Self { - Self::Utf8 { s: s.as_str() } - } -} - -pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>); - -impl Display for QuotedEncodedStr<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0.as_str()) - } -} - fn skip_bytes(r: &mut R, mut n: usize) -> Result<(), IoError> { thread_local! { static BUF: RefCell<[u8; 256]> = RefCell::new([0u8; 256]);