From 69c457792a1a94821358f7b505322c40f4f48f7f Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 19 May 2025 08:31:43 -0700 Subject: [PATCH] Revert "work" This reverts commit 630218d1e867d12c84c46d0e6990e2cbe9c265aa. --- rust/pspp/src/format/parse.rs | 22 +-- rust/pspp/src/output/pivot/test.rs | 2 +- rust/pspp/src/sys/raw.rs | 239 +++++++++++------------------ 3 files changed, 104 insertions(+), 159 deletions(-) diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 623b47ac49..2f5887370f 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -185,13 +185,13 @@ impl<'a> ParseValue<'a> { } match self.type_ { Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => { - self.parse_number(&input.to_str(), self.type_) + self.parse_number(&input.as_str(), self.type_) } - Type::CC(_) => self.parse_number(&input.to_str(), Type::F), - Type::N => self.parse_n(&input.to_str()), - Type::Z => self.parse_z(&input.to_str()), - Type::PIBHex => self.parse_pibhex(&input.to_str()), - Type::RBHex => self.parse_rbhex(&input.to_str()), + Type::CC(_) => self.parse_number(&input.as_str(), Type::F), + Type::N => self.parse_n(&input.as_str()), + Type::Z => self.parse_z(&input.as_str()), + Type::PIBHex => self.parse_pibhex(&input.as_str()), + Type::RBHex => self.parse_rbhex(&input.as_str()), Type::Date | Type::ADate | Type::EDate @@ -204,9 +204,9 @@ impl<'a> ParseValue<'a> { | Type::YmdHms | Type::MTime | Type::Time - | Type::DTime => self.parse_date(&input.to_str()), - Type::WkDay => self.parse_wkday(&input.to_str()), - Type::Month => self.parse_month(&input.to_str()), + | Type::DTime => self.parse_date(&input.as_str()), + Type::WkDay => self.parse_wkday(&input.as_str()), + Type::Month => self.parse_month(&input.as_str()), Type::P => self.parse_p(input.as_bytes()), Type::PK => self.parse_pk(input.as_bytes()), Type::IB => self.parse_ib(input.as_bytes()), @@ -215,7 +215,7 @@ impl<'a> ParseValue<'a> { Type::A => Ok(Value::String( input.to_encoding(self.output_encoding).into(), )), - Type::AHex => self.parse_ahex(&input.to_str()), + Type::AHex => self.parse_ahex(&input.as_str()), } .map_err(|kind| ParseError { type_: self.type_, @@ -1719,7 +1719,7 @@ mod test { .as_string() .unwrap() .as_encoded(UTF_8) - .to_str(), + .as_str(), "abcdefgh" ); diff --git a/rust/pspp/src/output/pivot/test.rs b/rust/pspp/src/output/pivot/test.rs index 1354a66d6b..5788d03288 100644 --- a/rust/pspp/src/output/pivot/test.rs +++ b/rust/pspp/src/output/pivot/test.rs @@ -763,7 +763,7 @@ fn footnote_alphabetic_superscript() { "\ Pivot Table with Alphabetic Superscript Footnotes[*] ╭────────────┬──────────────────╮ -│ │ A[*] │ +│ │ A[*] 1 │ │ ├───────┬──────────┤ │Corner[*][b]│ B[b] │ C[*][b] │ ├────────────┼───────┼──────────┤ diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 18daf2cef7..ec4e3643b7 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -1510,33 +1510,6 @@ impl Debug for RawString { } } -#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)] -pub struct RawStr<'a>(pub &'a [u8]); - -impl<'a> RawStr<'a> { - pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { - EncodedStr::new(&self.0, encoding) - } - pub fn with_encoding(&self, encoding: &'static Encoding) -> EncodedString { - EncodedString::new(&*self.0, encoding) - } - pub fn as_slice(&self) -> &[u8] { - self.0 - } -} - -impl<'a> From<&'a [u8]> for RawStr<'a> { - fn from(source: &'a [u8]) -> Self { - Self(source) - } -} - -impl Debug for RawStr<'_> { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", default_decode(self.as_slice())) - } -} - #[derive(Copy, Clone)] pub struct RawStrArray(pub [u8; N]); @@ -1552,186 +1525,162 @@ impl Debug for RawStrArray { } } -/// Pairs a text string with its [Encoding]. -/// -/// Strings in different encodings are considered to be different, even if they -/// have the same contents. This is an important optimization for hashing: -/// otherwise, strings could only be hashed if they were converted to a common -/// encoding (probably UTF-8), which would be expensive. #[derive(Clone)] -pub struct EncodedString { - /// Raw contents. - bytes: RawString, - - /// Encoding. - encoding: &'static Encoding, - - /// True if `bytes` can be treated as UTF-8, that is, if `bytes` contains - /// valid UTF-8 and that would correctly represent its contents in - /// `encoding`. - valid_utf8: bool, +pub enum EncodedString { + Encoded { + bytes: Vec, + encoding: &'static Encoding, + }, + Utf8(String), } impl EncodedString { /// Creates a new `EncodedString` from `bytes` and `encoding`. /// - /// If the input is in a `&str` or `String`, instead use - /// `EncodedString::from(string)` because it avoids checking for correct - /// UTF-8. + /// It's cheaper to use `EncodedString::from(string)` if the input is in a + /// `&str` or `String`. pub fn new(bytes: impl Into>, encoding: &'static Encoding) -> Self { let bytes: Vec = bytes.into(); - Self { - valid_utf8: matches!( - encoding.decode_without_bom_handling(&bytes).0, - Cow::Borrowed(_), - ), - bytes: RawString::from(bytes), - encoding, + if encoding == UTF_8 { + match String::from_utf8(bytes) { + Ok(string) => Self::Utf8(string), + Err(error) => Self::Encoded { + bytes: error.into_bytes(), + encoding, + }, + } + } else { + Self::Encoded { bytes, encoding } } } pub fn borrowed(&self) -> EncodedStr<'_> { - todo!() - } - - pub fn encoding(&self) -> &'static Encoding { - self.encoding - } - - pub fn as_bytes(&self) -> &[u8] { - self.bytes.0.iter().as_slice() + match self { + EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding }, + EncodedString::Utf8(s) => EncodedStr::Utf8 { s }, + } } - - pub fn as_str(&self) -> Option<&str> { - self.valid_utf8 - .then(|| unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) }) + pub fn as_utf8_bytes(&self) -> Option<&[u8]> { + match self { + EncodedString::Encoded { bytes, encoding } if *encoding == UTF_8 => Some(&bytes), + EncodedString::Utf8(s) => Some(s.as_bytes()), + _ => None, + } } - - pub fn to_utf8<'a>(&'a self) -> Cow<'a, str> { - match self.as_str() { - Some(str) => Cow::from(str), - None => { - self.encoding - .decode_without_bom_handling(self.bytes.as_slice()) - .0 - } + pub fn as_encoded(&self) -> (&[u8], &'static Encoding) { + match self { + EncodedString::Encoded { bytes, encoding } => (&bytes, encoding), + EncodedString::Utf8(s) => (s.as_bytes(), UTF_8), } } } impl Debug for EncodedString { fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{:?}", self.to_utf8())?; - if self.encoding != UTF_8 { - write!(f, "({})", self.encoding.name())?; + match self { + Self::Encoded { bytes, encoding } => { + write!(f, "{:?}({})", self.borrowed().to_utf8(), encoding.name()) + } + Self::Utf8(string) => write!(f, "{string:?}"), } - Ok(()) } } impl From for EncodedString { fn from(value: String) -> Self { - Self { - bytes: RawString(value.into_bytes().into_boxed_slice()), - encoding: UTF_8, - valid_utf8: true, - } + Self::Utf8(value) } } impl From<&'_ str> for EncodedString { fn from(value: &'_ str) -> Self { - Self { - bytes: value.as_bytes().into(), - encoding: UTF_8, - valid_utf8: true, - } + Self::Utf8(value.into()) } } impl Default for EncodedString { fn default() -> Self { - // XXX what encoding? - todo!() + Self::Utf8(String::new()) } } impl<'a> From> for EncodedString { fn from(value: EncodedStr<'a>) -> Self { - Self { - bytes: value.bytes.0.to_vec().into(), - encoding: value.encoding, - valid_utf8: value.valid_utf8, + match value { + EncodedStr::Encoded { bytes, encoding } => Self::Encoded { + bytes: bytes.into(), + encoding, + }, + EncodedStr::Utf8 { s } => Self::Utf8(s.into()), } } } impl PartialEq for EncodedString { fn eq(&self, other: &Self) -> bool { - self.encoding == other.encoding - && self.valid_utf8 == other.valid_utf8 - && self.bytes == other.bytes + if let Some(self_utf8) = self.as_utf8_bytes() { + if let Some(other_utf8) = other.as_utf8_bytes() { + return self_utf8 == other_utf8; + } + } + + let (self_bytes, self_encoding) = self.as_encoded(); + let (other_bytes, other_encoding) = other.as_encoded(); + if self_encoding == other_encoding { + self_bytes == other_bytes + } else { + self.borrowed().to_utf8() == other.borrowed().to_utf8() + } } } -pub struct EncodedStr<'a> { - /// Raw contents. - bytes: RawStr<'a>, - - /// Encoding. - encoding: &'static Encoding, - - /// True if `bytes` can be treated as UTF-8, that is, if `bytes` contains - /// valid UTF-8 and that would correctly represent its contents in - /// `encoding`. - valid_utf8: bool, +pub enum EncodedStr<'a> { + Encoded { + bytes: &'a [u8], + encoding: &'static Encoding, + }, + Utf8 { + s: &'a str, + }, } impl<'a> EncodedStr<'a> { - /// Creates a new `EncodedStr` from `bytes` and `encoding`. - /// - /// If the input is in a `&str` or `String`, instead use - /// `EncodedString::from(string)` because it avoids checking for correct - /// UTF-8. pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { - Self { - valid_utf8: matches!( - encoding.decode_without_bom_handling(bytes).0, - Cow::Borrowed(_), - ), - bytes: RawStr::from(bytes), - encoding, - } - } - - pub fn as_str(&self) -> Option<&str> { - self.valid_utf8 - .then(|| unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) }) + Self::Encoded { bytes, encoding } } - - pub fn to_str(&'a self) -> Cow<'a, str> { - match self.as_str() { - Some(str) => Cow::from(str), - None => { - self.encoding - .decode_without_bom_handling(self.bytes.as_slice()) - .0 + pub fn as_str(&self) -> Cow<'_, str> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + encoding.decode_without_bom_handling(bytes).0 } + EncodedStr::Utf8 { s } => Cow::from(*s), } } - pub fn as_bytes(&self) -> &[u8] { - self.bytes.as_slice() + match self { + EncodedStr::Encoded { bytes, .. } => bytes, + EncodedStr::Utf8 { s } => s.as_bytes(), + } + } + pub fn to_utf8(&self) -> Cow<'a, str> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + encoding.decode_without_bom_handling(bytes).0 + } + EncodedStr::Utf8 { s } => Cow::from(*s), + } } - pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - match encoding.encode(&self.to_str()).0 { + match encoding.encode(&self.to_utf8()).0 { Cow::Borrowed(_) => Cow::Borrowed(self.as_bytes()), Cow::Owned(string) => Cow::Owned(string), } } pub fn is_empty(&self) -> bool { - self.bytes.0.is_empty() + match self { + EncodedStr::Encoded { bytes, .. } => bytes.is_empty(), + EncodedStr::Utf8 { s } => s.is_empty(), + } } pub fn quoted(&self) -> QuotedEncodedStr { QuotedEncodedStr(self) @@ -1740,17 +1689,13 @@ impl<'a> EncodedStr<'a> { impl<'a> From<&'a str> for EncodedStr<'a> { fn from(s: &'a str) -> Self { - Self { - bytes: RawStr(s.as_bytes()), - encoding: UTF_8, - valid_utf8: true, - } + Self::Utf8 { s } } } impl<'a> From<&'a String> for EncodedStr<'a> { fn from(s: &'a String) -> Self { - Self::from(s.as_str()) + Self::Utf8 { s: s.as_str() } } } -- 2.30.2