From eb63735dba10d4cb284a23da1d7fd99d339509ba Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 26 Jul 2025 09:19:51 -0700 Subject: [PATCH] generalize encodedstring --- rust/pspp/src/data.rs | 142 ++++++++++++++++++++++++++++-------------- 1 file changed, 95 insertions(+), 47 deletions(-) diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index e1c9125b4b..7402d2c9e1 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -141,7 +141,7 @@ impl OwnedRawString { pub fn with_encoding(self, encoding: &'static Encoding) -> EncodedString { EncodedString { - bytes: self.0, + bytes: self, encoding, } } @@ -159,6 +159,10 @@ where self.0.borrow().len() } + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + /// Returns true if this raw string can be resized to `len` bytes without /// dropping non-space characters. pub fn is_resizable(&self, new_len: usize) -> bool { @@ -215,7 +219,7 @@ impl From<[u8; N]> for OwnedRawString { impl From for OwnedRawString { fn from(value: EncodedString) -> Self { - Self(value.bytes) + value.bytes } } @@ -416,7 +420,7 @@ pub enum EncodedDat<'a> { /// A string value. String( /// The value, in the variable's encoding. - EncodedStr<'a>, + EncodedString<&'a BorrowedRawString>, ), } @@ -437,21 +441,22 @@ impl<'a> EncodedDat<'a> { /// Returns the string inside this datum, or `None` if this is a numeric /// datum. - pub fn as_string(&self) -> Option<&EncodedStr> { + pub fn as_string(&self) -> Option> { match self { Self::Number(_) => None, - Self::String(s) => Some(s), + Self::String(s) => Some(*s), } } + /* /// Returns the string inside this datum as a mutable borrow, or `None` if /// this is a numeric datum. - pub fn as_string_mut(&'a mut self) -> Option> { + pub fn as_string_mut(&'a mut self) -> Option> { match self { Self::Number(_) => None, Self::String(s) => Some(*s), } - } + }*/ pub fn eq_ignore_trailing_spaces<'b>(&self, other: EncodedDat<'b>) -> bool { match (self, other) { @@ -472,7 +477,7 @@ impl Display for QuotedEncodedDat<'_> { match &self.0 { EncodedDat::Number(None) => write!(f, "SYSMIS"), EncodedDat::Number(Some(number)) => number.display_plain().fmt(f), - EncodedDat::String(string) => write!(f, "{}", string.quoted()), + EncodedDat::String(string) => write!(f, "\"{}\"", string.as_str()), } } } @@ -691,9 +696,10 @@ where pub fn as_encoded<'a>(&'a self, encoding: &'static Encoding) -> EncodedDat<'a> { match self { Datum::Number(number) => EncodedDat::Number(*number), - Datum::String(raw_string) => { - EncodedDat::String(raw_string.borrow().as_encoded(encoding)) - } + Datum::String(raw_string) => EncodedDat::String(EncodedString { + encoding, + bytes: raw_string.borrow(), + }), } } } @@ -856,18 +862,21 @@ impl Iterator for CaseVecIter { /// The string is not guaranteed to be valid in the encoding. /// /// The borrowed form of such a string is [EncodedStr]. -#[derive(Clone, Debug)] -pub struct EncodedString { +#[derive(Copy, Clone, Debug)] +pub struct EncodedString { /// The bytes of the string. - bytes: Vec, + bytes: R, /// The string's encoding. encoding: &'static Encoding, } -impl EncodedString { +impl EncodedString +where + R: Borrow, +{ pub fn len(&self) -> usize { - self.bytes.len() + self.bytes.borrow().len() } /// Returns this string recoded in UTF-8. Invalid characters will be @@ -875,66 +884,80 @@ impl EncodedString { /// /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER pub fn as_str(&self) -> Cow<'_, str> { - self.encoding.decode_without_bom_handling(&self.bytes).0 + self.encoding.decode_without_bom_handling(self.as_bytes()).0 } /// Returns the bytes in the string, in its encoding. pub fn as_bytes(&self) -> &[u8] { - &self.bytes + &self.bytes.borrow().0 } /// Compares this string and `other` for equality, ignoring trailing ASCII /// spaces in either string for the purpose of comparison. (This is /// acceptable because we assume that the encoding is ASCII-compatible.) - pub fn eq_ignore_trailing_spaces<'a>(&self, other: impl Into>) -> bool { - self.borrowed().eq_ignore_trailing_spaces(other.into()) + pub fn eq_ignore_trailing_spaces(&self, other: impl Into>) -> bool + where + R2: Borrow, + { + self.borrowed() + .bytes + .eq_ignore_trailing_spaces(&other.into().borrowed().bytes) + } + + /// Returns the string's [Encoding]. + pub fn encoding(&self) -> &'static Encoding { + self.encoding + } + + /// Returns a borrowed form of this string. + pub fn borrowed<'a>(&'a self) -> EncodedString<&'a BorrowedRawString> { + EncodedString { + encoding: self.encoding, + bytes: self.bytes.borrow(), + } + } + + /// Returns true if this string is empty. + pub fn is_empty(&self) -> bool { + self.bytes.borrow().is_empty() + } + + /// Returns a helper for displaying this string in double quotes. + pub fn quoted(&self) -> impl Display { + Quoted(self.as_str()) } +} +impl EncodedString { pub fn resize(&mut self, new_len: usize) -> Result<(), ()> { match new_len.cmp(&self.len()) { Ordering::Less => { if !self.as_bytes()[new_len..].iter().all(|b| *b == b' ') { return Err(()); } - self.bytes.truncate(new_len); + self.bytes.0.truncate(new_len); } Ordering::Equal => (), - Ordering::Greater => self.bytes.extend((self.len()..new_len).map(|_| b' ')), + Ordering::Greater => self.bytes.0.extend((self.len()..new_len).map(|_| b' ')), } Ok(()) } - /// Returns the string's [Encoding]. - pub fn encoding(&self) -> &'static Encoding { - self.encoding - } - - /// Returns a borrowed form of this string. - pub fn borrowed(&self) -> EncodedStr<'_> { - EncodedStr::new(&self.bytes, self.encoding) - } - /// Removes any trailing ASCII spaces. pub fn trim_end(&mut self) { - while self.bytes.pop_if(|c| *c == b' ').is_some() {} + while self.bytes.0.pop_if(|c| *c == b' ').is_some() {} } } impl From<&str> for EncodedString { fn from(value: &str) -> Self { Self { - bytes: value.into(), + bytes: RawString(value.into()), encoding: UTF_8, } } } -impl<'a> From<&'a EncodedString> for EncodedStr<'a> { - fn from(value: &'a EncodedString) -> Self { - value.borrowed() - } -} - impl<'a> From> for EncodedString { fn from(value: EncodedStr<'a>) -> Self { Self { @@ -949,7 +972,27 @@ impl Serialize for EncodedString { where S: serde::Serializer, { - self.borrowed().serialize(serializer) + self.as_str().serialize(serializer) + } +} + +impl Display for EncodedString +where + R: Borrow, +{ + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl PartialEq> for EncodedString +where + R: Borrow, + R2: Borrow, +{ + fn eq(&self, other: &EncodedString) -> bool { + // XXX should this consider the encodings? + self.borrowed().bytes.eq(other.borrowed().bytes) } } @@ -1014,8 +1057,8 @@ impl<'a> EncodedStr<'a> { } /// Returns a helper for displaying this string in double quotes. - pub fn quoted(&self) -> QuotedEncodedStr { - QuotedEncodedStr(self) + pub fn quoted(&self) -> impl Display { + Quoted(self.as_str()) } } @@ -1055,11 +1098,16 @@ impl<'a> Serialize for EncodedStr<'a> { } } -/// Helper struct for displaying a [QuotedEncodedStr] in double quotes. -pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>); +/// Helper struct for displaying a value in double quotes. +pub struct Quoted(T) +where + T: Display; -impl Display for QuotedEncodedStr<'_> { +impl Display for Quoted +where + T: Display, +{ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0.as_str()) + write!(f, "\"{}\"", &self.0) } } -- 2.30.2