From: Ben Pfaff Date: Sat, 26 Jul 2025 00:02:54 +0000 (-0700) Subject: wokr on encodedstring X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=412fad0b2ed2f6282fce80cde185c261d87f0b69;p=pspp wokr on encodedstring --- diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index e1c9125b4b..a3e97877b8 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -141,7 +141,7 @@ impl OwnedRawString { pub fn with_encoding(self, encoding: &'static Encoding) -> EncodedString { EncodedString { - bytes: self.0, + bytes: self, encoding, } } @@ -184,8 +184,11 @@ where /// Creates an [EncodedStr] with `encoding` that borrows this string's /// contents. - pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { - EncodedStr::new(self.0.borrow(), encoding) + pub fn as_encoded(&self, encoding: &'static Encoding) -> BorrowedEncodedString { + EncodedString { + encoding, + bytes: self.0.borrow(), + } } } @@ -215,7 +218,7 @@ impl From<[u8; N]> for OwnedRawString { impl From for OwnedRawString { fn from(value: EncodedString) -> Self { - Self(value.bytes) + value.bytes } } @@ -416,7 +419,7 @@ pub enum EncodedDat<'a> { /// A string value. String( /// The value, in the variable's encoding. - EncodedStr<'a>, + &'a BorrowedEncodedString, ), } @@ -437,7 +440,7 @@ impl<'a> EncodedDat<'a> { /// Returns the string inside this datum, or `None` if this is a numeric /// datum. - pub fn as_string(&self) -> Option<&EncodedStr> { + pub fn as_string(&self) -> Option<&BorrowedEncodedString> { match self { Self::Number(_) => None, Self::String(s) => Some(s), @@ -446,7 +449,7 @@ impl<'a> EncodedDat<'a> { /// Returns the string inside this datum as a mutable borrow, or `None` if /// this is a numeric datum. - pub fn as_string_mut(&'a mut self) -> Option> { + pub fn as_string_mut(&'a mut self) -> Option<&mut BorrowedEncodedString> { match self { Self::Number(_) => None, Self::String(s) => Some(*s), @@ -508,7 +511,8 @@ pub type OwnedDatum = Datum; /// A [Datum] that borrows its string data (if any). pub type BorrowedDatum<'a> = Datum<&'a BorrowedRawString>; -/// The value of a [Variable](crate::dictionary::Variable). +/// The value of a [Variable](crate::dictionary::Variable): either a number or a +/// string. /// /// `RawString` is parameterized by its string type, which is either /// [OwnedRawString] if it owns its string value (aliased as [OwnedDatum]) or @@ -851,21 +855,39 @@ impl Iterator for CaseVecIter { } } +pub type OwnedEncodedString = EncodedString>; +pub type BorrowedEncodedString = EncodedString<[u8]>; + /// An owned string and its [Encoding]. /// /// The string is not guaranteed to be valid in the encoding. /// /// The borrowed form of such a string is [EncodedStr]. -#[derive(Clone, Debug)] -pub struct EncodedString { - /// The bytes of the string. - bytes: Vec, - +#[derive(Clone)] +pub struct EncodedString> +where + B: ?Sized, +{ /// The string's encoding. encoding: &'static Encoding, + + /// The bytes of the string. + bytes: RawString, } -impl EncodedString { +impl Debug for EncodedString +where + B: Borrow<[u8]>, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.as_str()) + } +} + +impl EncodedString +where + B: Borrow<[u8]>, +{ pub fn len(&self) -> usize { self.bytes.len() } @@ -875,18 +897,21 @@ impl EncodedString { /// /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER pub fn as_str(&self) -> Cow<'_, str> { - self.encoding.decode_without_bom_handling(&self.bytes).0 + self.encoding.decode_without_bom_handling(self.as_bytes()).0 } /// Returns the bytes in the string, in its encoding. pub fn as_bytes(&self) -> &[u8] { - &self.bytes + self.bytes.as_bytes() } /// Compares this string and `other` for equality, ignoring trailing ASCII /// spaces in either string for the purpose of comparison. (This is /// acceptable because we assume that the encoding is ASCII-compatible.) - pub fn eq_ignore_trailing_spaces<'a>(&self, other: impl Into>) -> bool { + pub fn eq_ignore_trailing_spaces<'a>( + &self, + other: impl Into<&'a BorrowedEncodedString>, + ) -> bool { self.borrowed().eq_ignore_trailing_spaces(other.into()) } @@ -910,8 +935,11 @@ impl EncodedString { } /// Returns a borrowed form of this string. - pub fn borrowed(&self) -> EncodedStr<'_> { - EncodedStr::new(&self.bytes, self.encoding) + pub fn borrowed(&self) -> &EncodedString<[u8]> { + EncodedString { + encoding: self.encoding, + bytes: self.bytes.borrow(), + } } /// Removes any trailing ASCII spaces. @@ -929,14 +957,8 @@ impl From<&str> for EncodedString { } } -impl<'a> From<&'a EncodedString> for EncodedStr<'a> { - fn from(value: &'a EncodedString) -> Self { - value.borrowed() - } -} - -impl<'a> From> for EncodedString { - fn from(value: EncodedStr<'a>) -> Self { +impl<'a> From<&'a BorrowedEncodedString> for EncodedString { + fn from(value: &'a BorrowedEncodedString) -> Self { Self { bytes: value.bytes.into(), encoding: value.encoding, @@ -952,114 +974,3 @@ impl Serialize for EncodedString { self.borrowed().serialize(serializer) } } - -/// A borrowed string and its [Encoding]. -/// -/// The string is not guaranteed to be valid in the encoding. -/// -/// The owned form of such a string is [EncodedString]. -#[derive(Copy, Clone, PartialEq, Eq)] -pub struct EncodedStr<'a> { - /// The bytes of the string. - bytes: &'a [u8], - - /// The string's encoding. - encoding: &'static Encoding, -} - -impl<'a> EncodedStr<'a> { - /// Construct a new string with an arbitrary encoding. - pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { - Self { bytes, encoding } - } - - /// Returns this string recoded in UTF-8. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn as_str(&self) -> Cow<'_, str> { - self.encoding.decode_without_bom_handling(self.bytes).0 - } - - /// Returns the bytes in the string, in its encoding. - pub fn as_bytes(&self) -> &[u8] { - self.bytes - } - - /// Returns this string recoded in `encoding`. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - let utf8 = self.as_str(); - match encoding.encode(&utf8).0 { - Cow::Borrowed(_) => { - // Recoding into UTF-8 and then back did not change anything. - Cow::from(self.bytes) - } - Cow::Owned(owned) => Cow::Owned(owned), - } - } - - /// Returns true if this string is empty. - pub fn is_empty(&self) -> bool { - self.bytes.is_empty() - } - - pub fn eq_ignore_trailing_spaces<'b>(&self, other: EncodedStr<'b>) -> bool { - self.bytes.iter().zip_longest(other.bytes).all(|elem| { - let (left, right) = elem.or(&b' ', &b' '); - *left == *right - }) - } - - /// Returns a helper for displaying this string in double quotes. - pub fn quoted(&self) -> QuotedEncodedStr { - QuotedEncodedStr(self) - } -} - -impl<'a> Display for EncodedStr<'a> { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -impl<'a> Debug for EncodedStr<'a> { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "{:?}", self.as_str()) - } -} - -impl<'a> From<&'a str> for EncodedStr<'a> { - fn from(s: &'a str) -> Self { - Self { - bytes: s.as_bytes(), - encoding: UTF_8, - } - } -} - -impl<'a> From<&'a String> for EncodedStr<'a> { - fn from(s: &'a String) -> Self { - Self::from(s.as_str()) - } -} - -impl<'a> Serialize for EncodedStr<'a> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - self.as_str().serialize(serializer) - } -} - -/// Helper struct for displaying a [QuotedEncodedStr] in double quotes. -pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>); - -impl Display for QuotedEncodedStr<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0.as_str()) - } -} diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 5bff50c528..dd1f79fbd1 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -16,7 +16,7 @@ use crate::{ calendar::{calendar_gregorian_to_offset, DateError}, - data::{Datum, EncodedStr, EncodedString, OwnedDatum, RawString}, + data::{BorrowedEncodedString, Datum, EncodedString, OwnedDatum, RawString}, endian::{Endian, Parse}, format::{DateTemplate, Decimals, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, @@ -192,9 +192,9 @@ impl<'a> ParseValue<'a> { /// interpreting them as a binary number yields nonsense. pub fn parse<'b, T>(&self, input: T) -> Result where - T: Into>, + T: Into<&'b BorrowedEncodedString>, { - let input: EncodedStr = input.into(); + let input: &BorrowedEncodedString = input.into(); if input.is_empty() { return Ok(self.type_.default_value()); } @@ -920,7 +920,7 @@ mod test { use crate::{ calendar::{days_in_month, is_leap_year}, - data::{Datum, EncodedStr, OwnedDatum}, + data::{Datum, OwnedDatum}, endian::Endian, format::{ parse::{ParseError, ParseErrorKind, Sign},