From: Ben Pfaff Date: Sat, 26 Jul 2025 00:02:57 +0000 (-0700) Subject: Revert "wokr on encodedstring" X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1c21af52c39e83cd1c4d07e492a4e2a7572987cf;p=pspp Revert "wokr on encodedstring" This reverts commit 412fad0b2ed2f6282fce80cde185c261d87f0b69. --- diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index a3e97877b8..e1c9125b4b 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -141,7 +141,7 @@ impl OwnedRawString { pub fn with_encoding(self, encoding: &'static Encoding) -> EncodedString { EncodedString { - bytes: self, + bytes: self.0, encoding, } } @@ -184,11 +184,8 @@ where /// Creates an [EncodedStr] with `encoding` that borrows this string's /// contents. - pub fn as_encoded(&self, encoding: &'static Encoding) -> BorrowedEncodedString { - EncodedString { - encoding, - bytes: self.0.borrow(), - } + pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { + EncodedStr::new(self.0.borrow(), encoding) } } @@ -218,7 +215,7 @@ impl From<[u8; N]> for OwnedRawString { impl From for OwnedRawString { fn from(value: EncodedString) -> Self { - value.bytes + Self(value.bytes) } } @@ -419,7 +416,7 @@ pub enum EncodedDat<'a> { /// A string value. String( /// The value, in the variable's encoding. - &'a BorrowedEncodedString, + EncodedStr<'a>, ), } @@ -440,7 +437,7 @@ impl<'a> EncodedDat<'a> { /// Returns the string inside this datum, or `None` if this is a numeric /// datum. - pub fn as_string(&self) -> Option<&BorrowedEncodedString> { + pub fn as_string(&self) -> Option<&EncodedStr> { match self { Self::Number(_) => None, Self::String(s) => Some(s), @@ -449,7 +446,7 @@ impl<'a> EncodedDat<'a> { /// Returns the string inside this datum as a mutable borrow, or `None` if /// this is a numeric datum. - pub fn as_string_mut(&'a mut self) -> Option<&mut BorrowedEncodedString> { + pub fn as_string_mut(&'a mut self) -> Option> { match self { Self::Number(_) => None, Self::String(s) => Some(*s), @@ -511,8 +508,7 @@ pub type OwnedDatum = Datum; /// A [Datum] that borrows its string data (if any). pub type BorrowedDatum<'a> = Datum<&'a BorrowedRawString>; -/// The value of a [Variable](crate::dictionary::Variable): either a number or a -/// string. +/// The value of a [Variable](crate::dictionary::Variable). /// /// `RawString` is parameterized by its string type, which is either /// [OwnedRawString] if it owns its string value (aliased as [OwnedDatum]) or @@ -855,39 +851,21 @@ impl Iterator for CaseVecIter { } } -pub type OwnedEncodedString = EncodedString>; -pub type BorrowedEncodedString = EncodedString<[u8]>; - /// An owned string and its [Encoding]. /// /// The string is not guaranteed to be valid in the encoding. /// /// The borrowed form of such a string is [EncodedStr]. -#[derive(Clone)] -pub struct EncodedString> -where - B: ?Sized, -{ - /// The string's encoding. - encoding: &'static Encoding, - +#[derive(Clone, Debug)] +pub struct EncodedString { /// The bytes of the string. - bytes: RawString, -} + bytes: Vec, -impl Debug for EncodedString -where - B: Borrow<[u8]>, -{ - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.as_str()) - } + /// The string's encoding. + encoding: &'static Encoding, } -impl EncodedString -where - B: Borrow<[u8]>, -{ +impl EncodedString { pub fn len(&self) -> usize { self.bytes.len() } @@ -897,21 +875,18 @@ where /// /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER pub fn as_str(&self) -> Cow<'_, str> { - self.encoding.decode_without_bom_handling(self.as_bytes()).0 + self.encoding.decode_without_bom_handling(&self.bytes).0 } /// Returns the bytes in the string, in its encoding. pub fn as_bytes(&self) -> &[u8] { - self.bytes.as_bytes() + &self.bytes } /// Compares this string and `other` for equality, ignoring trailing ASCII /// spaces in either string for the purpose of comparison. (This is /// acceptable because we assume that the encoding is ASCII-compatible.) - pub fn eq_ignore_trailing_spaces<'a>( - &self, - other: impl Into<&'a BorrowedEncodedString>, - ) -> bool { + pub fn eq_ignore_trailing_spaces<'a>(&self, other: impl Into>) -> bool { self.borrowed().eq_ignore_trailing_spaces(other.into()) } @@ -935,11 +910,8 @@ where } /// Returns a borrowed form of this string. - pub fn borrowed(&self) -> &EncodedString<[u8]> { - EncodedString { - encoding: self.encoding, - bytes: self.bytes.borrow(), - } + pub fn borrowed(&self) -> EncodedStr<'_> { + EncodedStr::new(&self.bytes, self.encoding) } /// Removes any trailing ASCII spaces. @@ -957,8 +929,14 @@ impl From<&str> for EncodedString { } } -impl<'a> From<&'a BorrowedEncodedString> for EncodedString { - fn from(value: &'a BorrowedEncodedString) -> Self { +impl<'a> From<&'a EncodedString> for EncodedStr<'a> { + fn from(value: &'a EncodedString) -> Self { + value.borrowed() + } +} + +impl<'a> From> for EncodedString { + fn from(value: EncodedStr<'a>) -> Self { Self { bytes: value.bytes.into(), encoding: value.encoding, @@ -974,3 +952,114 @@ impl Serialize for EncodedString { self.borrowed().serialize(serializer) } } + +/// A borrowed string and its [Encoding]. +/// +/// The string is not guaranteed to be valid in the encoding. +/// +/// The owned form of such a string is [EncodedString]. +#[derive(Copy, Clone, PartialEq, Eq)] +pub struct EncodedStr<'a> { + /// The bytes of the string. + bytes: &'a [u8], + + /// The string's encoding. + encoding: &'static Encoding, +} + +impl<'a> EncodedStr<'a> { + /// Construct a new string with an arbitrary encoding. + pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { + Self { bytes, encoding } + } + + /// Returns this string recoded in UTF-8. Invalid characters will be + /// replaced by [REPLACEMENT_CHARACTER]. + /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + pub fn as_str(&self) -> Cow<'_, str> { + self.encoding.decode_without_bom_handling(self.bytes).0 + } + + /// Returns the bytes in the string, in its encoding. + pub fn as_bytes(&self) -> &[u8] { + self.bytes + } + + /// Returns this string recoded in `encoding`. Invalid characters will be + /// replaced by [REPLACEMENT_CHARACTER]. + /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { + let utf8 = self.as_str(); + match encoding.encode(&utf8).0 { + Cow::Borrowed(_) => { + // Recoding into UTF-8 and then back did not change anything. + Cow::from(self.bytes) + } + Cow::Owned(owned) => Cow::Owned(owned), + } + } + + /// Returns true if this string is empty. + pub fn is_empty(&self) -> bool { + self.bytes.is_empty() + } + + pub fn eq_ignore_trailing_spaces<'b>(&self, other: EncodedStr<'b>) -> bool { + self.bytes.iter().zip_longest(other.bytes).all(|elem| { + let (left, right) = elem.or(&b' ', &b' '); + *left == *right + }) + } + + /// Returns a helper for displaying this string in double quotes. + pub fn quoted(&self) -> QuotedEncodedStr { + QuotedEncodedStr(self) + } +} + +impl<'a> Display for EncodedStr<'a> { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl<'a> Debug for EncodedStr<'a> { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "{:?}", self.as_str()) + } +} + +impl<'a> From<&'a str> for EncodedStr<'a> { + fn from(s: &'a str) -> Self { + Self { + bytes: s.as_bytes(), + encoding: UTF_8, + } + } +} + +impl<'a> From<&'a String> for EncodedStr<'a> { + fn from(s: &'a String) -> Self { + Self::from(s.as_str()) + } +} + +impl<'a> Serialize for EncodedStr<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.as_str().serialize(serializer) + } +} + +/// Helper struct for displaying a [QuotedEncodedStr] in double quotes. +pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>); + +impl Display for QuotedEncodedStr<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.0.as_str()) + } +} diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index dd1f79fbd1..5bff50c528 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -16,7 +16,7 @@ use crate::{ calendar::{calendar_gregorian_to_offset, DateError}, - data::{BorrowedEncodedString, Datum, EncodedString, OwnedDatum, RawString}, + data::{Datum, EncodedStr, EncodedString, OwnedDatum, RawString}, endian::{Endian, Parse}, format::{DateTemplate, Decimals, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, @@ -192,9 +192,9 @@ impl<'a> ParseValue<'a> { /// interpreting them as a binary number yields nonsense. pub fn parse<'b, T>(&self, input: T) -> Result where - T: Into<&'b BorrowedEncodedString>, + T: Into>, { - let input: &BorrowedEncodedString = input.into(); + let input: EncodedStr = input.into(); if input.is_empty() { return Ok(self.type_.default_value()); } @@ -920,7 +920,7 @@ mod test { use crate::{ calendar::{days_in_month, is_leap_year}, - data::{Datum, OwnedDatum}, + data::{Datum, EncodedStr, OwnedDatum}, endian::Endian, format::{ parse::{ParseError, ParseErrorKind, Sign},