From a5ac77d3382a4e99171a08941a1f10de3d4f2091 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 26 Jul 2025 09:57:51 -0700 Subject: [PATCH] encodedstr is dead --- rust/pspp/src/data.rs | 177 ++++++++++----------------- rust/pspp/src/dictionary.rs | 2 +- rust/pspp/src/format/display/mod.rs | 2 +- rust/pspp/src/format/display/test.rs | 2 +- rust/pspp/src/format/parse.rs | 10 +- rust/pspp/src/output/pivot/mod.rs | 3 +- rust/pspp/src/sys/raw.rs | 2 +- rust/pspp/src/sys/raw/records.rs | 2 +- rust/pspp/src/sys/write.rs | 4 +- 9 files changed, 75 insertions(+), 129 deletions(-) diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index 7402d2c9e1..e59ed6aee3 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -32,7 +32,6 @@ use std::{ cmp::Ordering, fmt::{Debug, Display, Formatter}, hash::Hash, - ops::Deref, str::from_utf8, }; @@ -44,7 +43,6 @@ use serde::{ser::SerializeTupleVariant, Serialize}; use crate::{ dictionary::{VarType, VarWidth}, format::DisplayPlain, - sys::raw::RawDatum, }; /// A string in an unspecified character encoding. @@ -186,10 +184,17 @@ where }) } + pub fn borrowed(&self) -> &BorrowedRawString { + RawString::new(self.0.borrow()) + } + /// Creates an [EncodedStr] with `encoding` that borrows this string's /// contents. - pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { - EncodedStr::new(self.0.borrow(), encoding) + pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedString<&BorrowedRawString> { + EncodedString { + encoding, + bytes: self.borrowed(), + } } } @@ -223,6 +228,12 @@ impl From for OwnedRawString { } } +impl<'a> From<&'a BorrowedRawString> for OwnedRawString { + fn from(value: &'a BorrowedRawString) -> Self { + Self(value.0.into()) + } +} + impl Debug for RawString where B: Borrow<[u8]> + ?Sized, @@ -460,7 +471,7 @@ impl<'a> EncodedDat<'a> { pub fn eq_ignore_trailing_spaces<'b>(&self, other: EncodedDat<'b>) -> bool { match (self, other) { - (Self::String(a), EncodedDat::String(b)) => a.eq_ignore_trailing_spaces(b), + (Self::String(a), EncodedDat::String(b)) => a.eq_ignore_trailing_spaces(&b), _ => *self == other, } } @@ -857,11 +868,12 @@ impl Iterator for CaseVecIter { } } +pub type OwnedEncodedString = EncodedString; +pub type BorrowedEncodedString<'a> = EncodedString<&'a BorrowedRawString>; + /// An owned string and its [Encoding]. /// /// The string is not guaranteed to be valid in the encoding. -/// -/// The borrowed form of such a string is [EncodedStr]. #[derive(Copy, Clone, Debug)] pub struct EncodedString { /// The bytes of the string. @@ -875,6 +887,13 @@ impl EncodedString where R: Borrow, { + pub fn new(raw: R, encoding: &'static Encoding) -> Self { + Self { + bytes: raw, + encoding, + } + } + pub fn len(&self) -> usize { self.bytes.borrow().len() } @@ -887,6 +906,21 @@ where self.encoding.decode_without_bom_handling(self.as_bytes()).0 } + /// Returns this string recoded in `encoding`. Invalid characters will be + /// replaced by [REPLACEMENT_CHARACTER]. + /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { + let utf8 = self.as_str(); + match encoding.encode(&utf8).0 { + Cow::Borrowed(_) => { + // Recoding into UTF-8 and then back did not change anything. + Cow::from(self.as_bytes()) + } + Cow::Owned(owned) => Cow::Owned(owned), + } + } + /// Returns the bytes in the string, in its encoding. pub fn as_bytes(&self) -> &[u8] { &self.bytes.borrow().0 @@ -895,13 +929,13 @@ where /// Compares this string and `other` for equality, ignoring trailing ASCII /// spaces in either string for the purpose of comparison. (This is /// acceptable because we assume that the encoding is ASCII-compatible.) - pub fn eq_ignore_trailing_spaces(&self, other: impl Into>) -> bool + pub fn eq_ignore_trailing_spaces(&self, other: &EncodedString) -> bool where R2: Borrow, { self.borrowed() .bytes - .eq_ignore_trailing_spaces(&other.into().borrowed().bytes) + .eq_ignore_trailing_spaces(&other.borrowed().bytes) } /// Returns the string's [Encoding]. @@ -949,6 +983,15 @@ impl EncodedString { } } +impl<'a> From> for OwnedEncodedString { + fn from(value: BorrowedEncodedString<'a>) -> Self { + Self { + bytes: value.bytes.into(), + encoding: value.encoding, + } + } +} + impl From<&str> for EncodedString { fn from(value: &str) -> Self { Self { @@ -958,15 +1001,21 @@ impl From<&str> for EncodedString { } } -impl<'a> From> for EncodedString { - fn from(value: EncodedStr<'a>) -> Self { +impl<'a> From<&'a str> for BorrowedEncodedString<'a> { + fn from(value: &'a str) -> Self { Self { - bytes: value.bytes.into(), - encoding: value.encoding, + bytes: BorrowedRawString::new(value.as_bytes()), + encoding: UTF_8, } } } +impl<'a> From<&'a String> for BorrowedEncodedString<'a> { + fn from(value: &'a String) -> Self { + value.as_str().into() + } +} + impl Serialize for EncodedString { fn serialize(&self, serializer: S) -> Result where @@ -996,108 +1045,6 @@ where } } -/// A borrowed string and its [Encoding]. -/// -/// The string is not guaranteed to be valid in the encoding. -/// -/// The owned form of such a string is [EncodedString]. -#[derive(Copy, Clone, PartialEq, Eq)] -pub struct EncodedStr<'a> { - /// The bytes of the string. - bytes: &'a [u8], - - /// The string's encoding. - encoding: &'static Encoding, -} - -impl<'a> EncodedStr<'a> { - /// Construct a new string with an arbitrary encoding. - pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { - Self { bytes, encoding } - } - - /// Returns this string recoded in UTF-8. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn as_str(&self) -> Cow<'_, str> { - self.encoding.decode_without_bom_handling(self.bytes).0 - } - - /// Returns the bytes in the string, in its encoding. - pub fn as_bytes(&self) -> &[u8] { - self.bytes - } - - /// Returns this string recoded in `encoding`. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - let utf8 = self.as_str(); - match encoding.encode(&utf8).0 { - Cow::Borrowed(_) => { - // Recoding into UTF-8 and then back did not change anything. - Cow::from(self.bytes) - } - Cow::Owned(owned) => Cow::Owned(owned), - } - } - - /// Returns true if this string is empty. - pub fn is_empty(&self) -> bool { - self.bytes.is_empty() - } - - pub fn eq_ignore_trailing_spaces<'b>(&self, other: EncodedStr<'b>) -> bool { - self.bytes.iter().zip_longest(other.bytes).all(|elem| { - let (left, right) = elem.or(&b' ', &b' '); - *left == *right - }) - } - - /// Returns a helper for displaying this string in double quotes. - pub fn quoted(&self) -> impl Display { - Quoted(self.as_str()) - } -} - -impl<'a> Display for EncodedStr<'a> { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -impl<'a> Debug for EncodedStr<'a> { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "{:?}", self.as_str()) - } -} - -impl<'a> From<&'a str> for EncodedStr<'a> { - fn from(s: &'a str) -> Self { - Self { - bytes: s.as_bytes(), - encoding: UTF_8, - } - } -} - -impl<'a> From<&'a String> for EncodedStr<'a> { - fn from(s: &'a String) -> Self { - Self::from(s.as_str()) - } -} - -impl<'a> Serialize for EncodedStr<'a> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - self.as_str().serialize(serializer) - } -} - /// Helper struct for displaying a value in double quotes. pub struct Quoted(T) where diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index d2d338e23a..053674c2ab 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -40,7 +40,7 @@ use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ - data::{Datum, EncodedDat, EncodedDatum, OwnedRawString, RawString}, + data::{Datum, EncodedDat, EncodedDatum, OwnedRawString}, format::{DisplayPlain, Format}, identifier::{ByIdentifier, HasIdentifier, Identifier}, output::pivot::{ diff --git a/rust/pspp/src/format/display/mod.rs b/rust/pspp/src/format/display/mod.rs index d4638cf34a..94098de7d9 100644 --- a/rust/pspp/src/format/display/mod.rs +++ b/rust/pspp/src/format/display/mod.rs @@ -29,7 +29,7 @@ use smallvec::{Array, SmallVec}; use crate::{ calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name}, - data::{Datum, EncodedDat, EncodedDatum, QuotedEncodedDat}, + data::{EncodedDat, EncodedDatum, QuotedEncodedDat}, endian::{endian_to_smallvec, ToBytes}, format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, diff --git a/rust/pspp/src/format/display/test.rs b/rust/pspp/src/format/display/test.rs index 6def5489b0..f684519776 100644 --- a/rust/pspp/src/format/display/test.rs +++ b/rust/pspp/src/format/display/test.rs @@ -23,7 +23,7 @@ use smallstr::SmallString; use smallvec::SmallVec; use crate::{ - data::{Datum, EncodedDatum}, + data::{ EncodedDatum}, endian::Endian, format::{AbstractFormat, Epoch, Format, Settings, Type, UncheckedFormat, CC}, lex::{scan::StringScanner, segment::Syntax, Punct, Token}, diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 5bff50c528..c960995c81 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -16,7 +16,7 @@ use crate::{ calendar::{calendar_gregorian_to_offset, DateError}, - data::{Datum, EncodedStr, EncodedString, OwnedDatum, RawString}, + data::{BorrowedEncodedString, Datum, EncodedString, OwnedDatum}, endian::{Endian, Parse}, format::{DateTemplate, Decimals, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, @@ -192,9 +192,9 @@ impl<'a> ParseValue<'a> { /// interpreting them as a binary number yields nonsense. pub fn parse<'b, T>(&self, input: T) -> Result where - T: Into>, + T: Into>, { - let input: EncodedStr = input.into(); + let input: BorrowedEncodedString = input.into(); if input.is_empty() { return Ok(self.type_.default_value()); } @@ -920,7 +920,7 @@ mod test { use crate::{ calendar::{days_in_month, is_leap_year}, - data::{Datum, EncodedStr, OwnedDatum}, + data::{BorrowedRawString, Datum, EncodedString, OwnedDatum}, endian::Endian, format::{ parse::{ParseError, ParseErrorKind, Sign}, @@ -1636,7 +1636,7 @@ mod test { let parsed = Type::RB .parser(UTF_8) .with_endian(EndianSettings::new(Endian::Big)) - .parse(EncodedStr::new(&raw[..], UTF_8)) + .parse(EncodedString::new(BorrowedRawString::new(&raw[..]), UTF_8)) .unwrap() .as_number() .unwrap() diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index c7c899dd82..ed1b8ea906 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -56,7 +56,6 @@ use binrw::Error as BinError; use chrono::NaiveDateTime; pub use color::ParseError as ParseColorError; use color::{palette::css::TRANSPARENT, AlphaColor, Rgba8, Srgb}; -use encoding_rs::{Encoding, UTF_8}; use enum_iterator::Sequence; use enum_map::{enum_map, Enum, EnumMap}; use look_xml::TableProperties; @@ -68,7 +67,7 @@ use thiserror::Error as ThisError; use tlo::parse_tlo; use crate::{ - data::{Datum, EncodedDat, EncodedDatum, OwnedRawString}, + data::{Datum, EncodedDat, OwnedRawString, }, dictionary::{VarType, Variable}, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, settings::{Settings, Show}, diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index ee199b191b..3390e5e9f3 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -20,7 +20,7 @@ //! raw details. Most readers will want to use higher-level interfaces. use crate::{ - data::{BorrowedRawString, Datum, OwnedRawString, RawCase, RawString}, + data::{BorrowedRawString, Datum, OwnedRawString, RawCase, }, dictionary::{VarType, VarWidth}, endian::{Endian, Parse, ToBytes}, identifier::{Error as IdError, Identifier}, diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index 789571cc33..b7469fbbbe 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -12,7 +12,7 @@ use std::{ }; use crate::{ - data::{Datum, OwnedRawString, RawString}, + data::{Datum, OwnedRawString, }, dictionary::{ Alignment, Attributes, CategoryLabels, Measure, MissingValueRange, MissingValues, MissingValuesError, VarType, VarWidth, diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index 3a6b13486b..d3a128d1da 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -1,5 +1,5 @@ use std::{ - borrow::{Borrow, Cow}, + borrow::{Cow}, collections::HashMap, fmt::Write as _, fs::File, @@ -17,7 +17,7 @@ use itertools::zip_eq; use smallvec::SmallVec; use crate::{ - data::{Datum, EncodedDatum, OwnedRawString, RawString}, + data::{Datum, EncodedDatum, OwnedRawString, }, dictionary::{ Alignment, Attributes, CategoryLabels, Dictionary, Measure, MultipleResponseType, ValueLabels, VarWidth, -- 2.30.2