From: Ben Pfaff Date: Sun, 18 May 2025 23:21:02 +0000 (-0700) Subject: work X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=630218d1e867d12c84c46d0e6990e2cbe9c265aa;p=pspp work --- diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 2f5887370f..623b47ac49 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -185,13 +185,13 @@ impl<'a> ParseValue<'a> { } match self.type_ { Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => { - self.parse_number(&input.as_str(), self.type_) + self.parse_number(&input.to_str(), self.type_) } - Type::CC(_) => self.parse_number(&input.as_str(), Type::F), - Type::N => self.parse_n(&input.as_str()), - Type::Z => self.parse_z(&input.as_str()), - Type::PIBHex => self.parse_pibhex(&input.as_str()), - Type::RBHex => self.parse_rbhex(&input.as_str()), + Type::CC(_) => self.parse_number(&input.to_str(), Type::F), + Type::N => self.parse_n(&input.to_str()), + Type::Z => self.parse_z(&input.to_str()), + Type::PIBHex => self.parse_pibhex(&input.to_str()), + Type::RBHex => self.parse_rbhex(&input.to_str()), Type::Date | Type::ADate | Type::EDate @@ -204,9 +204,9 @@ impl<'a> ParseValue<'a> { | Type::YmdHms | Type::MTime | Type::Time - | Type::DTime => self.parse_date(&input.as_str()), - Type::WkDay => self.parse_wkday(&input.as_str()), - Type::Month => self.parse_month(&input.as_str()), + | Type::DTime => self.parse_date(&input.to_str()), + Type::WkDay => self.parse_wkday(&input.to_str()), + Type::Month => self.parse_month(&input.to_str()), Type::P => self.parse_p(input.as_bytes()), Type::PK => self.parse_pk(input.as_bytes()), Type::IB => self.parse_ib(input.as_bytes()), @@ -215,7 +215,7 @@ impl<'a> ParseValue<'a> { Type::A => Ok(Value::String( input.to_encoding(self.output_encoding).into(), )), - Type::AHex => self.parse_ahex(&input.as_str()), + Type::AHex => self.parse_ahex(&input.to_str()), } .map_err(|kind| ParseError { type_: self.type_, @@ -1719,7 +1719,7 @@ mod test { .as_string() .unwrap() .as_encoded(UTF_8) - .as_str(), + .to_str(), "abcdefgh" ); diff --git a/rust/pspp/src/output/pivot/test.rs b/rust/pspp/src/output/pivot/test.rs index 5788d03288..1354a66d6b 100644 --- a/rust/pspp/src/output/pivot/test.rs +++ b/rust/pspp/src/output/pivot/test.rs @@ -763,7 +763,7 @@ fn footnote_alphabetic_superscript() { "\ Pivot Table with Alphabetic Superscript Footnotes[*] ╭────────────┬──────────────────╮ -│ │ A[*] 1 │ +│ │ A[*] │ │ ├───────┬──────────┤ │Corner[*][b]│ B[b] │ C[*][b] │ ├────────────┼───────┼──────────┤ diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index ec4e3643b7..18daf2cef7 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -1510,6 +1510,33 @@ impl Debug for RawString { } } +#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)] +pub struct RawStr<'a>(pub &'a [u8]); + +impl<'a> RawStr<'a> { + pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { + EncodedStr::new(&self.0, encoding) + } + pub fn with_encoding(&self, encoding: &'static Encoding) -> EncodedString { + EncodedString::new(&*self.0, encoding) + } + pub fn as_slice(&self) -> &[u8] { + self.0 + } +} + +impl<'a> From<&'a [u8]> for RawStr<'a> { + fn from(source: &'a [u8]) -> Self { + Self(source) + } +} + +impl Debug for RawStr<'_> { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", default_decode(self.as_slice())) + } +} + #[derive(Copy, Clone)] pub struct RawStrArray(pub [u8; N]); @@ -1525,162 +1552,186 @@ impl Debug for RawStrArray { } } +/// Pairs a text string with its [Encoding]. +/// +/// Strings in different encodings are considered to be different, even if they +/// have the same contents. This is an important optimization for hashing: +/// otherwise, strings could only be hashed if they were converted to a common +/// encoding (probably UTF-8), which would be expensive. #[derive(Clone)] -pub enum EncodedString { - Encoded { - bytes: Vec, - encoding: &'static Encoding, - }, - Utf8(String), +pub struct EncodedString { + /// Raw contents. + bytes: RawString, + + /// Encoding. + encoding: &'static Encoding, + + /// True if `bytes` can be treated as UTF-8, that is, if `bytes` contains + /// valid UTF-8 and that would correctly represent its contents in + /// `encoding`. + valid_utf8: bool, } impl EncodedString { /// Creates a new `EncodedString` from `bytes` and `encoding`. /// - /// It's cheaper to use `EncodedString::from(string)` if the input is in a - /// `&str` or `String`. + /// If the input is in a `&str` or `String`, instead use + /// `EncodedString::from(string)` because it avoids checking for correct + /// UTF-8. pub fn new(bytes: impl Into>, encoding: &'static Encoding) -> Self { let bytes: Vec = bytes.into(); - if encoding == UTF_8 { - match String::from_utf8(bytes) { - Ok(string) => Self::Utf8(string), - Err(error) => Self::Encoded { - bytes: error.into_bytes(), - encoding, - }, - } - } else { - Self::Encoded { bytes, encoding } + Self { + valid_utf8: matches!( + encoding.decode_without_bom_handling(&bytes).0, + Cow::Borrowed(_), + ), + bytes: RawString::from(bytes), + encoding, } } pub fn borrowed(&self) -> EncodedStr<'_> { - match self { - EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding }, - EncodedString::Utf8(s) => EncodedStr::Utf8 { s }, - } + todo!() } - pub fn as_utf8_bytes(&self) -> Option<&[u8]> { - match self { - EncodedString::Encoded { bytes, encoding } if *encoding == UTF_8 => Some(&bytes), - EncodedString::Utf8(s) => Some(s.as_bytes()), - _ => None, - } + + pub fn encoding(&self) -> &'static Encoding { + self.encoding } - pub fn as_encoded(&self) -> (&[u8], &'static Encoding) { - match self { - EncodedString::Encoded { bytes, encoding } => (&bytes, encoding), - EncodedString::Utf8(s) => (s.as_bytes(), UTF_8), + + pub fn as_bytes(&self) -> &[u8] { + self.bytes.0.iter().as_slice() + } + + pub fn as_str(&self) -> Option<&str> { + self.valid_utf8 + .then(|| unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) }) + } + + pub fn to_utf8<'a>(&'a self) -> Cow<'a, str> { + match self.as_str() { + Some(str) => Cow::from(str), + None => { + self.encoding + .decode_without_bom_handling(self.bytes.as_slice()) + .0 + } } } } impl Debug for EncodedString { fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self { - Self::Encoded { bytes, encoding } => { - write!(f, "{:?}({})", self.borrowed().to_utf8(), encoding.name()) - } - Self::Utf8(string) => write!(f, "{string:?}"), + write!(f, "{:?}", self.to_utf8())?; + if self.encoding != UTF_8 { + write!(f, "({})", self.encoding.name())?; } + Ok(()) } } impl From for EncodedString { fn from(value: String) -> Self { - Self::Utf8(value) + Self { + bytes: RawString(value.into_bytes().into_boxed_slice()), + encoding: UTF_8, + valid_utf8: true, + } } } impl From<&'_ str> for EncodedString { fn from(value: &'_ str) -> Self { - Self::Utf8(value.into()) + Self { + bytes: value.as_bytes().into(), + encoding: UTF_8, + valid_utf8: true, + } } } impl Default for EncodedString { fn default() -> Self { - Self::Utf8(String::new()) + // XXX what encoding? + todo!() } } impl<'a> From> for EncodedString { fn from(value: EncodedStr<'a>) -> Self { - match value { - EncodedStr::Encoded { bytes, encoding } => Self::Encoded { - bytes: bytes.into(), - encoding, - }, - EncodedStr::Utf8 { s } => Self::Utf8(s.into()), + Self { + bytes: value.bytes.0.to_vec().into(), + encoding: value.encoding, + valid_utf8: value.valid_utf8, } } } impl PartialEq for EncodedString { fn eq(&self, other: &Self) -> bool { - if let Some(self_utf8) = self.as_utf8_bytes() { - if let Some(other_utf8) = other.as_utf8_bytes() { - return self_utf8 == other_utf8; - } - } - - let (self_bytes, self_encoding) = self.as_encoded(); - let (other_bytes, other_encoding) = other.as_encoded(); - if self_encoding == other_encoding { - self_bytes == other_bytes - } else { - self.borrowed().to_utf8() == other.borrowed().to_utf8() - } + self.encoding == other.encoding + && self.valid_utf8 == other.valid_utf8 + && self.bytes == other.bytes } } -pub enum EncodedStr<'a> { - Encoded { - bytes: &'a [u8], - encoding: &'static Encoding, - }, - Utf8 { - s: &'a str, - }, +pub struct EncodedStr<'a> { + /// Raw contents. + bytes: RawStr<'a>, + + /// Encoding. + encoding: &'static Encoding, + + /// True if `bytes` can be treated as UTF-8, that is, if `bytes` contains + /// valid UTF-8 and that would correctly represent its contents in + /// `encoding`. + valid_utf8: bool, } impl<'a> EncodedStr<'a> { + /// Creates a new `EncodedStr` from `bytes` and `encoding`. + /// + /// If the input is in a `&str` or `String`, instead use + /// `EncodedString::from(string)` because it avoids checking for correct + /// UTF-8. pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { - Self::Encoded { bytes, encoding } - } - pub fn as_str(&self) -> Cow<'_, str> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - encoding.decode_without_bom_handling(bytes).0 - } - EncodedStr::Utf8 { s } => Cow::from(*s), + Self { + valid_utf8: matches!( + encoding.decode_without_bom_handling(bytes).0, + Cow::Borrowed(_), + ), + bytes: RawStr::from(bytes), + encoding, } } - pub fn as_bytes(&self) -> &[u8] { - match self { - EncodedStr::Encoded { bytes, .. } => bytes, - EncodedStr::Utf8 { s } => s.as_bytes(), - } + + pub fn as_str(&self) -> Option<&str> { + self.valid_utf8 + .then(|| unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) }) } - pub fn to_utf8(&self) -> Cow<'a, str> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - encoding.decode_without_bom_handling(bytes).0 + + pub fn to_str(&'a self) -> Cow<'a, str> { + match self.as_str() { + Some(str) => Cow::from(str), + None => { + self.encoding + .decode_without_bom_handling(self.bytes.as_slice()) + .0 } - EncodedStr::Utf8 { s } => Cow::from(*s), } } + + pub fn as_bytes(&self) -> &[u8] { + self.bytes.as_slice() + } + pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - match encoding.encode(&self.to_utf8()).0 { + match encoding.encode(&self.to_str()).0 { Cow::Borrowed(_) => Cow::Borrowed(self.as_bytes()), Cow::Owned(string) => Cow::Owned(string), } } pub fn is_empty(&self) -> bool { - match self { - EncodedStr::Encoded { bytes, .. } => bytes.is_empty(), - EncodedStr::Utf8 { s } => s.is_empty(), - } + self.bytes.0.is_empty() } pub fn quoted(&self) -> QuotedEncodedStr { QuotedEncodedStr(self) @@ -1689,13 +1740,17 @@ impl<'a> EncodedStr<'a> { impl<'a> From<&'a str> for EncodedStr<'a> { fn from(s: &'a str) -> Self { - Self::Utf8 { s } + Self { + bytes: RawStr(s.as_bytes()), + encoding: UTF_8, + valid_utf8: true, + } } } impl<'a> From<&'a String> for EncodedStr<'a> { fn from(s: &'a String) -> Self { - Self::Utf8 { s: s.as_str() } + Self::from(s.as_str()) } }