From 82cebe5740f345476ac5210767168b2d88ab91c5 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 2 Aug 2025 09:27:48 -0700 Subject: [PATCH] traits experiment worked -- needs cleanup --- rust/pspp/src/data.rs | 680 +++++++++------------------ rust/pspp/src/data/encoded.rs | 311 ++++-------- rust/pspp/src/dictionary.rs | 24 +- rust/pspp/src/format/display/mod.rs | 20 +- rust/pspp/src/format/display/test.rs | 12 +- rust/pspp/src/format/mod.rs | 11 +- rust/pspp/src/format/parse.rs | 43 +- rust/pspp/src/output/pivot/mod.rs | 15 +- rust/pspp/src/sys/cooked.rs | 11 +- rust/pspp/src/sys/raw.rs | 40 +- rust/pspp/src/sys/raw/records.rs | 36 +- rust/pspp/src/sys/test.rs | 8 +- rust/pspp/src/sys/write.rs | 26 +- 13 files changed, 444 insertions(+), 793 deletions(-) diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index 7c97aa4e58..02e0c8cb5c 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -48,7 +48,7 @@ use crate::{ format::DisplayPlain, }; -pub trait RawStringTrait: Debug + PartialEq + Eq + PartialOrd + Ord { +pub trait RawStringTrait: Debug + PartialEq + Eq + PartialOrd + Ord + Hash { fn raw_string_bytes(&self) -> &[u8]; /// Compares this string and `other` for equality, ignoring trailing ASCII @@ -57,7 +57,10 @@ pub trait RawStringTrait: Debug + PartialEq + Eq + PartialOrd + Ord { /// /// This compares the bytes of the strings, disregarding their encodings (if /// known). - fn eq_ignore_trailing_spaces(&self, other: &impl RawStringTrait) -> bool { + fn eq_ignore_trailing_spaces(&self, other: &R) -> bool + where + R: RawStringTrait, + { self.raw_string_bytes() .iter() .copied() @@ -85,14 +88,32 @@ pub trait RawStringTrait: Debug + PartialEq + Eq + PartialOrd + Ord { fn len(&self) -> usize { self.raw_string_bytes().len() } + + fn as_ref(&self) -> ByteStr<'_> { + ByteStr(self.raw_string_bytes()) + } + + fn as_encoded(&self, encoding: &'static Encoding) -> WithEncoding> + where + Self: Sized, + { + WithEncoding::new(self.as_ref(), encoding) + } + + fn with_encoding(self, encoding: &'static Encoding) -> WithEncoding + where + Self: Sized, + { + WithEncoding::new(self, encoding) + } } pub trait MutRawString: RawStringTrait { - fn resize(&mut self, new_len: usize) -> Result<(), ()>; + fn resize(&mut self, new_len: usize) -> Result<(), ResizeError>; fn trim_end(&mut self); } -impl RawStringTrait for str { +impl RawStringTrait for &'_ str { fn raw_string_bytes(&self) -> &[u8] { self.as_bytes() } @@ -104,8 +125,20 @@ impl RawStringTrait for String { } } -#[derive(PartialEq, Eq, PartialOrd, Ord)] -struct ByteStr<'a>(&'a [u8]); +impl RawStringTrait for &'_ String { + fn raw_string_bytes(&self) -> &[u8] { + self.as_bytes() + } +} + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ByteStr<'a>(pub &'a [u8]); + +impl RawStringTrait for ByteStr<'_> { + fn raw_string_bytes(&self) -> &[u8] { + self.0 + } +} impl Serialize for ByteStr<'_> { fn serialize(&self, serializer: S) -> Result @@ -139,287 +172,89 @@ impl Debug for ByteStr<'_> { } } -impl RawStringTrait for ByteString { - fn raw_string_bytes(&self) -> &[u8] { - self.0.as_slice() +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ByteCow<'a>(pub Cow<'a, [u8]>); + +impl ByteCow<'_> { + pub fn into_owned(self) -> ByteString { + ByteString(self.0.into_owned()) } } -#[derive(PartialEq, Eq, PartialOrd, Ord)] -struct ByteString(Vec); - -impl Serialize for ByteString { +impl Serialize for ByteCow<'_> { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - if let Ok(s) = str::from_utf8(&self.0) { - let (variant_index, variant) = if self.0.iter().all(|b| b.is_ascii()) { - (0, "Ascii") - } else { - (1, "Utf8") - }; - let mut tuple = - serializer.serialize_tuple_variant("RawString", variant_index, variant, 1)?; - tuple.serialize_field(s)?; - tuple.end() - } else { - let mut tuple = serializer.serialize_tuple_variant("RawString", 2, "Windows1252", 1)?; - tuple.serialize_field(&decode_latin1(&self.0))?; - tuple.end() - } + ByteStr(&self.0).serialize(serializer) } } -impl Debug for ByteString { - // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 - // (actually bytes interpreted as Unicode code points). - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let s = - from_utf8(&self.0.borrow()).map_or_else(|_| decode_latin1(self.0.borrow()), Cow::from); - write!(f, "{s:?}") - } -} - -impl RawStringTrait for ByteStr<'_> { +impl RawStringTrait for ByteCow<'_> { fn raw_string_bytes(&self) -> &[u8] { - self.0 - } -} - -impl MutRawString for ByteString { - fn resize(&mut self, new_len: usize) -> Result<(), ()> { - match new_len.cmp(&self.0.len()) { - Ordering::Less => { - if !self.0[new_len..].iter().all(|b| *b == b' ') { - return Err(()); - } - self.0.truncate(new_len); - } - Ordering::Equal => (), - Ordering::Greater => self.0.extend((self.0.len()..new_len).map(|_| b' ')), - } - Ok(()) - } - - /// Removes any trailing ASCII spaces. - fn trim_end(&mut self) { - while self.0.pop_if(|c| *c == b' ').is_some() {} + &self.0 } } -/// A string in an unspecified character encoding. -/// -/// `RawString` is usually associated with a [Variable], in the variable's -/// character encoding. We assume that the encoding is one supported by -/// [encoding_rs] with byte units (that is, not a `UTF-16` encoding). All of -/// these encodings have some basic ASCII compatibility. -/// -/// `RawString` is parameterized by its content type, which is either `Vec` -/// for an owned raw string (aliased as [OwnedRawString]) or `[u8]` for a -/// borrowed raw string (aliased as [BorrowedRawString]). -/// -/// [Variable]: crate::dictionary::Variable -#[derive(Clone, Default, Hash)] -pub struct RawString(pub B) -where - B: ?Sized; - -impl PartialEq> for RawString -where - B: Borrow<[u8]> + ?Sized, - B2: Borrow<[u8]> + ?Sized, -{ - fn eq(&self, other: &RawString) -> bool { - self.0.borrow().eq(other.0.borrow()) - } -} - -impl Eq for RawString where B: Borrow<[u8]> + ?Sized {} - -impl PartialOrd> for RawString -where - B: Borrow<[u8]> + ?Sized, - B2: Borrow<[u8]> + ?Sized, -{ - fn partial_cmp(&self, other: &RawString) -> Option { - self.0.borrow().partial_cmp(other.0.borrow()) - } -} - -impl Ord for RawString -where - B: Borrow<[u8]> + ?Sized, -{ - fn cmp(&self, other: &Self) -> Ordering { - self.0.borrow().cmp(other.0.borrow()) - } -} - -/// A [RawString] that owns its contents. -pub type OwnedRawString = RawString>; - -/// A [RawString] that borrows its contents. -/// -/// Because `[u8]` is not [Sized], [BorrowedRawString] may itself only be used -/// borrowed. -pub type BorrowedRawString = RawString<[u8]>; - -impl Borrow for OwnedRawString { - fn borrow(&self) -> &BorrowedRawString { - &BorrowedRawString::new(self.as_bytes()) +impl Debug for ByteCow<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + ByteStr(&self.0).fmt(f) } } -impl BorrowedRawString { - pub fn new(s: &[u8]) -> &Self { - // SAFETY: `RawStr` is a transparent wrapper around `[u8]`, so we can - // turn a reference to the wrapped type into a reference to the wrapper - // type. - unsafe { &*(s as *const [u8] as *const BorrowedRawString) } - } -} +#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ByteString(pub Vec); -impl OwnedRawString { - /// Creates a new [RawString] that consists of `n` ASCII spaces. +impl ByteString { + /// Creates a new [ByteString] that consists of `n` ASCII spaces. pub fn spaces(n: usize) -> Self { Self(std::iter::repeat_n(b' ', n).collect()) } - - /// Extends or shortens this [RawString] to exactly `len` bytes. If the - /// string needs to be extended, does so by appending spaces. - /// - /// If this shortens the string, it can cut off a multibyte character in the - /// middle ([is_resizable](Self::is_resizable) checks for this). - pub fn resize(&mut self, len: usize) { - self.0.resize(len, b' '); - } - - /// Removes any trailing ASCII spaces. - pub fn trim_end(&mut self) { - while self.0.pop_if(|c| *c == b' ').is_some() {} - } - - pub fn with_encoding(self, encoding: &'static Encoding) -> OwnedEncodedString { - EncodedString { - raw: self, - encoding, - } - } } -impl RawString -where - B: Borrow<[u8]> + ?Sized, -{ - pub fn as_bytes(&self) -> &[u8] { - self.0.borrow() - } - - pub fn len(&self) -> usize { - self.0.borrow().len() - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Returns true if this raw string can be resized to `len` bytes without - /// dropping non-space characters. - pub fn is_resizable(&self, new_len: usize) -> bool { - new_len >= self.len() || self.0.borrow()[new_len..].iter().all(|b| *b == b' ') - } - - /// Compares this string and `other` for equality, ignoring trailing ASCII - /// spaces in either string for the purpose of comparison. (This is - /// acceptable because we assume that the encoding is ASCII-compatible.) - pub fn eq_ignore_trailing_spaces(&self, other: &RawString) -> bool - where - B2: Borrow<[u8]> + ?Sized, - { - self.0 - .borrow() - .iter() - .zip_longest(other.0.borrow()) - .all(|elem| { - let (left, right) = elem.or(&b' ', &b' '); - *left == *right - }) - } - - pub fn borrowed(&self) -> &BorrowedRawString { - RawString::new(self.0.borrow()) - } - - /// Creates an [EncodedStr] with `encoding` that borrows this string's - /// contents. - pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedString<&BorrowedRawString> { - EncodedString { - encoding, - raw: self.borrowed(), - } - } -} - -impl From> for OwnedRawString { - fn from(value: Cow<'_, [u8]>) -> Self { - Self(value.into_owned()) +impl From for ByteString { + fn from(value: String) -> Self { + value.into_bytes().into() } } -impl From> for OwnedRawString { - fn from(source: Vec) -> Self { - Self(source) - } -} - -impl From<&[u8]> for OwnedRawString { - fn from(source: &[u8]) -> Self { - Self(source.into()) +impl From<&'_ str> for ByteString { + fn from(value: &str) -> Self { + value.as_bytes().into() } } -impl From<[u8; N]> for OwnedRawString { - fn from(source: [u8; N]) -> Self { - Self(source.into()) +impl From> for ByteString { + fn from(value: Vec) -> Self { + Self(value) } } -impl From for OwnedRawString { - fn from(value: OwnedEncodedString) -> Self { - value.raw +impl From<&[u8]> for ByteString { + fn from(value: &[u8]) -> Self { + Self(value.into()) } } -impl<'a> From<&'a BorrowedRawString> for OwnedRawString { - fn from(value: &'a BorrowedRawString) -> Self { - Self(value.0.into()) +impl From<[u8; N]> for ByteString { + fn from(value: [u8; N]) -> Self { + value.as_slice().into() } } -impl Debug for RawString -where - B: Borrow<[u8]> + ?Sized, -{ - // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 - // (actually bytes interpreted as Unicode code points). - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let s = - from_utf8(&self.0.borrow()).map_or_else(|_| decode_latin1(self.0.borrow()), Cow::from); - write!(f, "{s:?}") +impl RawStringTrait for ByteString { + fn raw_string_bytes(&self) -> &[u8] { + self.0.as_slice() } } -impl Serialize for RawString -where - B: Borrow<[u8]> + ?Sized, -{ +impl Serialize for ByteString { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - if let Ok(s) = str::from_utf8(self.0.borrow()) { - let (variant_index, variant) = if self.0.borrow().iter().all(|b| b.is_ascii()) { + if let Ok(s) = str::from_utf8(&self.0) { + let (variant_index, variant) = if self.0.iter().all(|b| b.is_ascii()) { (0, "Ascii") } else { (1, "Utf8") @@ -430,20 +265,51 @@ where tuple.end() } else { let mut tuple = serializer.serialize_tuple_variant("RawString", 2, "Windows1252", 1)?; - tuple.serialize_field(&decode_latin1(self.0.borrow()))?; + tuple.serialize_field(&decode_latin1(&self.0))?; tuple.end() } } } +impl Debug for ByteString { + // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 + // (actually bytes interpreted as Unicode code points). + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + let s = + from_utf8(&self.0.borrow()).map_or_else(|_| decode_latin1(self.0.borrow()), Cow::from); + write!(f, "{s:?}") + } +} + +impl MutRawString for ByteString { + fn resize(&mut self, new_len: usize) -> Result<(), ResizeError> { + match new_len.cmp(&self.0.len()) { + Ordering::Less => { + if !self.0[new_len..].iter().all(|b| *b == b' ') { + return Err(ResizeError::TooWide); + } + self.0.truncate(new_len); + } + Ordering::Equal => (), + Ordering::Greater => self.0.extend((self.0.len()..new_len).map(|_| b' ')), + } + Ok(()) + } + + /// Removes any trailing ASCII spaces. + fn trim_end(&mut self) { + while self.0.pop_if(|c| *c == b' ').is_some() {} + } +} + mod encoded; -pub use encoded::{BorrowedEncodedString, EncodedString, OwnedEncodedString}; +pub use encoded::{Encoded, EncodedStringTrait, WithEncoding}; /// A [Datum] that owns its string data (if any). -pub type OwnedDatum = Datum; +pub type OwnedDatum = Datum>; /// A [Datum] that borrows its string data (if any). -pub type BorrowedDatum<'a> = Datum<&'a BorrowedRawString>; +pub type BorrowedDatum<'a> = Datum>>; /// The value of a [Variable](crate::dictionary::Variable). /// @@ -452,7 +318,7 @@ pub type BorrowedDatum<'a> = Datum<&'a BorrowedRawString>; /// [&BorrowedRawString](BorrowedRawString) if it borrows it (aliased as /// [BorrowedDatum]). #[derive(Clone)] -pub enum Datum { +pub enum Datum { /// A numeric value. Number( /// A number, or `None` for the system-missing value. @@ -461,13 +327,41 @@ pub enum Datum { /// A string value. String( /// The value, in the variable's encoding. - B, + T, ), } -impl Datum { +impl Datum> { pub fn new_utf8(s: impl Into) -> Self { - Datum::String(OwnedRawString::from(s.into().into_bytes()).with_encoding(UTF_8)) + let s: String = s.into(); + Datum::String(ByteString::from(s).with_encoding(UTF_8)) + } +} + +impl<'a> Datum>> { + pub fn into_owned(self) -> Datum> { + match self { + Self::Number(number) => Datum::Number(number), + Self::String(string) => Datum::String(string.into_owned()), + } + } +} + +impl Datum +where + T: EncodedStringTrait, +{ + pub fn as_borrowed(&self) -> Datum>> { + match self { + Datum::Number(number) => Datum::Number(*number), + Datum::String(string) => Datum::String(string.as_encoded_byte_str()), + } + } + pub fn cloned(&self) -> Datum> { + match self { + Datum::Number(number) => Datum::Number(*number), + Datum::String(string) => Datum::String(string.cloned()), + } } } @@ -484,15 +378,15 @@ where } } -impl<'a, B> Display for Datum +impl Display for Datum where - B: Borrow>, + T: Display, { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { Self::Number(None) => write!(f, "SYSMIS"), Self::Number(Some(number)) => number.display_plain().fmt(f), - Self::String(string) => write!(f, "{}", string.borrow()), + Self::String(string) => string.fmt(f), } } } @@ -512,62 +406,57 @@ where } } -impl PartialEq> for Datum +impl PartialEq> for Datum where - B: Borrow>, - B2: Borrow>, + T: PartialEq, { - fn eq(&self, other: &Datum) -> bool { + fn eq(&self, other: &Datum) -> bool { match (self, other) { - (Self::Number(Some(l0)), Datum::Number(Some(r0))) => { - OrderedFloat(*l0) == OrderedFloat(*r0) + (Self::Number(Some(n1)), Datum::Number(Some(n2))) => { + OrderedFloat(*n1) == OrderedFloat(*n2) } (Self::Number(None), Datum::Number(None)) => true, - (Self::String(l0), Datum::String(r0)) => l0.borrow() == r0.borrow(), + (Self::String(s1), Datum::String(s2)) => s1 == s2, _ => false, } } } -impl Eq for Datum where B: Borrow> {} +impl Eq for Datum where T: Eq {} -impl PartialOrd> for Datum +impl PartialOrd> for Datum where - B: Borrow>, - B2: Borrow>, + T: PartialOrd, { - fn partial_cmp(&self, other: &Datum) -> Option { - Some(match (self, other) { - (Self::Number(a), Datum::Number(b)) => match (a, b) { - (None, None) => Ordering::Equal, - (None, Some(_)) => Ordering::Less, - (Some(_), None) => Ordering::Greater, - (Some(a), Some(b)) => a.total_cmp(b), - }, - (Self::Number(_), Datum::String(_)) => Ordering::Less, - (Self::String(_), Datum::Number(_)) => Ordering::Greater, - (Self::String(a), Datum::String(b)) => a.borrow().cmp(b.borrow()), - }) - } -} - -impl Ord for Datum + fn partial_cmp(&self, other: &Datum) -> Option { + match (self, other) { + (Self::Number(a), Datum::Number(b)) => { + a.map(OrderedFloat).partial_cmp(&b.map(OrderedFloat)) + } + (Self::Number(_), Datum::String(_)) => Some(Ordering::Less), + (Self::String(_), Datum::Number(_)) => Some(Ordering::Greater), + (Self::String(a), Datum::String(b)) => a.partial_cmp(b), + } + } +} + +impl Ord for Datum where - B: Borrow>, + T: Ord, { fn cmp(&self, other: &Self) -> Ordering { self.partial_cmp(other).unwrap() } } -impl Hash for Datum +impl Hash for Datum where - B: Borrow, + T: Hash, { fn hash(&self, state: &mut H) { match self { Self::Number(number) => number.map(OrderedFloat).hash(state), - Self::String(string) => string.borrow().hash(state), + Self::String(string) => string.hash(state), } } } @@ -605,9 +494,9 @@ impl Datum { } } -impl Datum +impl Datum where - B: Borrow>, + T: RawStringTrait, { /// Returns true if this datum can be resized to the given `width` without /// loss, which is true only if this datum and `width` are both string or @@ -616,9 +505,7 @@ where pub fn is_resizable(&self, width: VarWidth) -> bool { match (self, width) { (Self::Number(_), VarWidth::Numeric) => true, - (Self::String(s), VarWidth::String(new_width)) => { - s.borrow().is_resizable(new_width as usize) - } + (Self::String(s), VarWidth::String(new_width)) => s.is_resizable(new_width as usize), _ => false, } } @@ -627,137 +514,42 @@ where pub fn width(&self) -> VarWidth { match self { Self::Number(_) => VarWidth::Numeric, - Self::String(s) => VarWidth::String(s.borrow().len().try_into().unwrap()), + Self::String(s) => VarWidth::String(s.len().try_into().unwrap()), } } /// Compares this datum and `other` for equality, ignoring trailing ASCII /// spaces in either, if they are both strings, for the purpose of /// comparison. - pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool + pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool where - B2: Borrow>, + R: RawStringTrait, { match (self, other) { - (Self::String(a), Datum::String(b)) => a.borrow().eq_ignore_trailing_spaces(b.borrow()), - _ => self == other, - } - } - - pub fn as_encoded<'a>( - &'a self, - encoding: &'static Encoding, - ) -> Datum> { - match self { - Datum::Number(number) => Datum::Number(*number), - Datum::String(raw_string) => Datum::String(EncodedString { - raw: raw_string.borrow(), - encoding, - }), + (Self::String(a), Datum::String(b)) => a.eq_ignore_trailing_spaces(b), + (Self::Number(a), Datum::Number(b)) => a == b, + _ => false, } } -} -impl Datum { - pub fn borrowed(&self) -> BorrowedDatum { + pub fn as_encoded(&self, encoding: &'static Encoding) -> Datum>> { match self { Datum::Number(number) => Datum::Number(*number), - Datum::String(string) => Datum::String(Borrow::borrow(string)), + Datum::String(raw_string) => Datum::String(raw_string.as_encoded(encoding)), } } -} - -impl<'a> Datum<&'a BorrowedRawString> { - pub fn borrowed(&self) -> BorrowedDatum { - self.clone() - } -} -impl Datum { - pub fn borrowed<'a>(&'a self) -> Datum> { + pub fn with_encoding(self, encoding: &'static Encoding) -> Datum> { match self { - Datum::Number(number) => Datum::Number(*number), - Datum::String(string) => Datum::String(string.borrowed()), - } - } -} - -impl<'a> Datum> { - pub fn borrowed(&self) -> Datum> { - self.clone() - } -} - -impl Datum -where - D: BorrowString, -{ - pub fn borrowed_string<'a>(&'a self) -> Datum> { - match self { - Datum::Number(number) => Datum::Number(*number), - Datum::String(string) => Datum::String(string.borrow_string()), + Datum::Number(number) => Datum::Number(number), + Datum::String(string) => Datum::String(string.with_encoding(encoding)), } } } -pub trait BorrowString { - type Borrowed<'a> - where - Self: 'a; - fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a>; -} - -impl BorrowString for OwnedRawString { - type Borrowed<'a> = &'a BorrowedRawString; - fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a> { - BorrowedRawString::new(&self.0) - } -} - -impl BorrowString for BorrowedRawString { - type Borrowed<'a> = &'a BorrowedRawString; - fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a> { - self - } -} - -impl BorrowString for OwnedEncodedString { - type Borrowed<'a> = BorrowedEncodedString<'a>; - fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a> { - BorrowedEncodedString::new(self.raw.borrowed(), self.encoding) - } -} - -impl<'b> BorrowString for BorrowedEncodedString<'b> { - type Borrowed<'a> - = BorrowedEncodedString<'b> - where - Self: 'a; - - fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a> { - self.clone() - } -} - -pub trait AsEncodedString: Borrow { - fn as_encoded_string<'a>(&'a self) -> BorrowedEncodedString<'a>; -} - -impl AsEncodedString for OwnedEncodedString { - fn as_encoded_string<'a>(&'a self) -> BorrowedEncodedString<'a> { - self.borrowed() - } -} - -impl<'b> AsEncodedString for BorrowedEncodedString<'b> { - fn as_encoded_string<'a>(&'a self) -> BorrowedEncodedString<'a> { - self.clone() - } -} - impl Datum where - B: AsEncodedString, + B: EncodedStringTrait, { pub fn quoted<'a>(&'a self) -> QuotedDatum<'a, B> { QuotedDatum(self) @@ -768,13 +560,13 @@ pub struct QuotedDatum<'a, B>(&'a Datum); impl<'a, B> Display for QuotedDatum<'a, B> where - B: AsEncodedString, + B: Display, { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match &self.0 { Datum::Number(None) => write!(f, "SYSMIS"), Datum::Number(Some(number)) => number.display_plain().fmt(f), - Datum::String(string) => write!(f, "\"{}\"", string.as_encoded_string().as_str()), + Datum::String(string) => write!(f, "\"{string}\""), } } } @@ -785,13 +577,10 @@ pub enum ResizeError { TooWide, } -impl Datum -where - B: BorrowMut, -{ +impl Datum { /// Returns the string inside this datum as a mutable borrow, or `None` if /// this is a numeric datum. - pub fn as_string_mut(&mut self) -> Option<&mut OwnedRawString> { + pub fn as_string_mut(&mut self) -> Option<&mut T> { match self { Self::Number(_) => None, Self::String(s) => Some(s.borrow_mut()), @@ -799,72 +588,49 @@ where } /// Removes trailing ASCII spaces from this datum, if it is a string. - pub fn trim_end(&mut self) { + pub fn trim_end(&mut self) + where + T: MutRawString, + { self.as_string_mut().map(|s| s.trim_end()); } /// Resizes this datum to the given `width`. Returns an error, without /// modifying the datum, if [is_resizable](Self::is_resizable) would return /// false. - pub fn resize(&mut self, width: VarWidth) -> Result<(), ResizeError> { + pub fn resize(&mut self, width: VarWidth) -> Result<(), ResizeError> + where + T: MutRawString, + { match (self, width) { (Self::Number(_), VarWidth::Numeric) => Ok(()), - (Self::String(s), VarWidth::String(new_width)) => { - let s = s.borrow_mut(); - if s.is_resizable(new_width as usize) { - s.resize(new_width as usize); - Ok(()) - } else { - Err(ResizeError::TooWide) - } - } + (Self::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), _ => Err(ResizeError::MixedTypes), } } } -impl Datum { - pub fn with_encoding(self, encoding: &'static Encoding) -> Datum { - match self { - Datum::Number(number) => Datum::Number(number), - Datum::String(raw_string) => Datum::String(raw_string.with_encoding(encoding)), - } - } -} - -impl From for Datum -where - B: Borrow, -{ +impl From for Datum { fn from(number: f64) -> Self { Some(number).into() } } -impl From> for Datum -where - B: Borrow, -{ +impl From> for Datum { fn from(value: Option) -> Self { Self::Number(value) } } -impl From<&str> for Datum -where - B: Borrow + for<'a> From<&'a [u8]>, -{ - fn from(value: &str) -> Self { - value.as_bytes().into() +impl<'a> From<&'a str> for Datum> { + fn from(value: &'a str) -> Self { + Datum::String(ByteStr(value.as_bytes())) } } -impl From<&[u8]> for Datum -where - B: Borrow + for<'a> From<&'a [u8]>, -{ - fn from(value: &[u8]) -> Self { - Self::String(value.into()) +impl<'a> From<&'a [u8]> for Datum> { + fn from(value: &'a [u8]) -> Self { + Self::String(ByteStr(value)) } } @@ -875,17 +641,17 @@ pub struct RawCase( /// order. /// /// [Dictionary]: crate::dictionary::Dictionary - pub Vec>, + pub Vec>, ); impl RawCase { - pub fn as_encoding(&self, encoding: &'static Encoding) -> Case<&'_ [Datum]> { + pub fn as_encoding(&self, encoding: &'static Encoding) -> Case<&'_ [Datum]> { Case { encoding, data: &self.0, } } - pub fn with_encoding(self, encoding: &'static Encoding) -> Case>> { + pub fn with_encoding(self, encoding: &'static Encoding) -> Case>> { Case { encoding, data: self.0, @@ -895,7 +661,7 @@ impl RawCase { pub struct Case where - B: Borrow<[Datum]>, + B: Borrow<[Datum]>, { encoding: &'static Encoding, data: B, @@ -903,7 +669,7 @@ where impl Case where - B: Borrow<[Datum]>, + B: Borrow<[Datum]>, { pub fn len(&self) -> usize { self.data.borrow().len() @@ -915,7 +681,7 @@ where impl Serialize for Case where - B: Borrow<[Datum]>, + B: Borrow<[Datum]>, { fn serialize(&self, serializer: S) -> Result where @@ -931,11 +697,11 @@ where pub struct CaseIter<'a> { encoding: &'static Encoding, - iter: std::slice::Iter<'a, Datum>, + iter: std::slice::Iter<'a, Datum>, } impl<'a> Iterator for CaseIter<'a> { - type Item = Datum>; + type Item = Datum>>; fn next(&mut self) -> Option { self.iter.next().map(|d| d.as_encoded(self.encoding)) @@ -944,9 +710,9 @@ impl<'a> Iterator for CaseIter<'a> { impl<'a, B> IntoIterator for &'a Case where - B: Borrow<[Datum]>, + B: Borrow<[Datum]>, { - type Item = Datum>; + type Item = Datum>>; type IntoIter = CaseIter<'a>; @@ -958,8 +724,8 @@ where } } -impl IntoIterator for Case>> { - type Item = Datum; +impl IntoIterator for Case>> { + type Item = Datum>; type IntoIter = CaseIntoIter; @@ -973,11 +739,11 @@ impl IntoIterator for Case>> { pub struct CaseIntoIter { encoding: &'static Encoding, - iter: std::vec::IntoIter>, + iter: std::vec::IntoIter>, } impl Iterator for CaseIntoIter { - type Item = Datum; + type Item = Datum>; fn next(&mut self) -> Option { self.iter diff --git a/rust/pspp/src/data/encoded.rs b/rust/pspp/src/data/encoded.rs index 8e32c862d8..7e3262730b 100644 --- a/rust/pspp/src/data/encoded.rs +++ b/rust/pspp/src/data/encoded.rs @@ -1,19 +1,22 @@ use std::{ - borrow::{Borrow, BorrowMut, Cow}, + borrow::Cow, cmp::Ordering, fmt::{Debug, Display}, + hash::Hash, }; use encoding_rs::{Encoding, UTF_8}; use serde::Serialize; -use crate::data::{BorrowedRawString, OwnedRawString, Quoted, RawString, RawStringTrait}; +use crate::data::{ + ByteCow, ByteStr, ByteString, MutRawString, Quoted, RawStringTrait, ResizeError, +}; pub trait Encoded { fn encoding(&self) -> &'static Encoding; } -impl Encoded for str { +impl Encoded for &'_ str { fn encoding(&self) -> &'static Encoding { UTF_8 } @@ -25,10 +28,16 @@ impl Encoded for String { } } +impl Encoded for &'_ String { + fn encoding(&self) -> &'static Encoding { + UTF_8 + } +} + #[derive(Clone, Debug, PartialEq, Eq)] pub struct WithEncoding { - pub inner: T, pub encoding: &'static Encoding, + pub inner: T, } impl WithEncoding { @@ -41,6 +50,12 @@ impl WithEncoding { } } +impl<'a> WithEncoding> { + pub fn into_owned(self) -> WithEncoding { + WithEncoding::new(self.inner.into_owned(), self.encoding) + } +} + impl PartialOrd for WithEncoding where T: PartialOrd, @@ -59,18 +74,68 @@ where } } +impl Serialize for WithEncoding +where + WithEncoding: EncodedStringTrait, +{ + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.as_str().serialize(serializer) + } +} + pub trait EncodedStringTrait: Encoded + RawStringTrait + Display + Debug { fn as_str(&self) -> Cow<'_, str>; - fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]>; + fn into_string(self) -> String + where + Self: Sized, + { + self.as_str().into_owned() + } + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding>; + fn as_encoded_byte_str(&self) -> WithEncoding> { + WithEncoding::new(ByteStr(self.raw_string_bytes()), self.encoding()) + } + fn cloned(&self) -> WithEncoding { + WithEncoding::new(ByteString::from(self.raw_string_bytes()), self.encoding()) + } + fn quoted(&self) -> Quoted<&Self> + where + Self: Sized, + { + Quoted(self) + } } -impl<'a> EncodedStringTrait for str { +impl<'a> EncodedStringTrait for &'a str { fn as_str(&self) -> Cow<'_, str> { - Cow::from(self) + Cow::from(*self) } - fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - encoding.encode(self).0 + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding> { + WithEncoding::new(ByteCow(encoding.encode(self).0), encoding) + } +} + +impl EncodedStringTrait for String { + fn as_str(&self) -> Cow<'_, str> { + Cow::from(String::as_str(&self)) + } + + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding> { + WithEncoding::new(ByteCow(encoding.encode(&self).0), encoding) + } +} + +impl EncodedStringTrait for &'_ String { + fn as_str(&self) -> Cow<'_, str> { + Cow::from(String::as_str(&self)) + } + + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding> { + WithEncoding::new(ByteCow(encoding.encode(String::as_str(&self)).0), encoding) } } @@ -83,6 +148,19 @@ where } } +impl MutRawString for WithEncoding +where + T: MutRawString, +{ + fn resize(&mut self, new_len: usize) -> Result<(), ResizeError> { + self.inner.resize(new_len) + } + + fn trim_end(&mut self) { + self.inner.trim_end(); + } +} + impl EncodedStringTrait for WithEncoding where T: RawStringTrait, @@ -101,14 +179,18 @@ where /// replaced by [REPLACEMENT_CHARACTER]. /// /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { + fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding> { let utf8 = self.as_str(); - match encoding.encode(&utf8).0 { + let inner = match encoding.encode(&utf8).0 { Cow::Borrowed(_) => { // Recoding into UTF-8 and then back did not change anything. Cow::from(self.raw_string_bytes()) } Cow::Owned(owned) => Cow::Owned(owned), + }; + WithEncoding { + encoding, + inner: ByteCow(inner), } } } @@ -128,210 +210,11 @@ where } } -pub type OwnedEncodedString = EncodedString; -pub type BorrowedEncodedString<'a> = EncodedString<&'a BorrowedRawString>; - -/// An owned string and its [Encoding]. -/// -/// The string is not guaranteed to be valid in the encoding. -#[derive(Copy, Clone, Debug)] -pub struct EncodedString { - /// The bytes of the string. - pub raw: R, - - /// The string's encoding. - pub encoding: &'static Encoding, -} - -impl Encoded for EncodedString { - fn encoding(&self) -> &'static Encoding { - self.encoding - } -} - -impl EncodedString -where - R: Borrow, -{ - pub fn new(raw: R, encoding: &'static Encoding) -> Self { - Self { raw, encoding } - } - - pub fn into_raw(self) -> R { - self.raw - } - - pub fn len(&self) -> usize { - self.raw.borrow().len() - } - - /// Returns this string recoded in UTF-8. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn as_str(&self) -> Cow<'_, str> { - self.encoding.decode_without_bom_handling(self.as_bytes()).0 - } - - /// Returns this string recoded in `encoding`. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - let utf8 = self.as_str(); - match encoding.encode(&utf8).0 { - Cow::Borrowed(_) => { - // Recoding into UTF-8 and then back did not change anything. - Cow::from(self.as_bytes()) - } - Cow::Owned(owned) => Cow::Owned(owned), - } - } - - /// Returns the bytes in the string, in its encoding. - pub fn as_bytes(&self) -> &[u8] { - &self.raw.borrow().0 - } - - /// Compares this string and `other` for equality, ignoring trailing ASCII - /// spaces in either string for the purpose of comparison. (This is - /// acceptable because we assume that the encoding is ASCII-compatible.) - pub fn eq_ignore_trailing_spaces(&self, other: &EncodedString) -> bool - where - R2: Borrow, - { - self.borrowed() - .raw - .eq_ignore_trailing_spaces(&other.borrowed().raw) - } - - /// Returns the string's [Encoding]. - pub fn encoding(&self) -> &'static Encoding { - self.encoding - } - - /// Returns a borrowed form of this string. - pub fn borrowed<'a>(&'a self) -> EncodedString<&'a BorrowedRawString> { - EncodedString { - encoding: self.encoding, - raw: self.raw.borrow(), - } - } - - /// Returns true if this string is empty. - pub fn is_empty(&self) -> bool { - self.raw.borrow().is_empty() - } - - /// Returns a helper for displaying this string in double quotes. - pub fn quoted(&self) -> impl Display { - Quoted(self.as_str()) - } -} - -impl Borrow for EncodedString -where - R: Borrow, -{ - fn borrow(&self) -> &BorrowedRawString { - self.raw.borrow() - } -} - -impl Borrow for OwnedEncodedString { - fn borrow(&self) -> &OwnedRawString { - &self.raw - } -} - -impl BorrowMut for OwnedEncodedString { - fn borrow_mut(&mut self) -> &mut OwnedRawString { - &mut self.raw - } -} - -impl OwnedEncodedString { - pub fn resize(&mut self, new_len: usize) -> Result<(), ()> { - match new_len.cmp(&self.len()) { - Ordering::Less => { - if !self.as_bytes()[new_len..].iter().all(|b| *b == b' ') { - return Err(()); - } - self.raw.0.truncate(new_len); - } - Ordering::Equal => (), - Ordering::Greater => self.raw.0.extend((self.len()..new_len).map(|_| b' ')), - } - Ok(()) - } - - /// Removes any trailing ASCII spaces. - pub fn trim_end(&mut self) { - while self.raw.0.pop_if(|c| *c == b' ').is_some() {} - } -} - -impl<'a> From> for OwnedEncodedString { - fn from(value: BorrowedEncodedString<'a>) -> Self { - Self { - raw: value.raw.into(), - encoding: value.encoding, - } - } -} - -impl From<&str> for OwnedEncodedString { - fn from(value: &str) -> Self { - Self { - raw: RawString(value.into()), - encoding: UTF_8, - } - } -} - -impl<'a> From<&'a str> for BorrowedEncodedString<'a> { - fn from(value: &'a str) -> Self { - Self { - raw: BorrowedRawString::new(value.as_bytes()), - encoding: UTF_8, - } - } -} - -impl<'a> From<&'a String> for BorrowedEncodedString<'a> { - fn from(value: &'a String) -> Self { - value.as_str().into() - } -} - -impl Serialize for EncodedString -where - R: Borrow, -{ - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - self.as_str().serialize(serializer) - } -} - -impl Display for EncodedString -where - R: Borrow, -{ - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -impl PartialEq> for EncodedString +impl Hash for WithEncoding where - R: Borrow, - R2: Borrow, + T: Hash, { - fn eq(&self, other: &EncodedString) -> bool { - // XXX should this consider the encodings? - self.borrowed().raw.eq(other.borrowed().raw) + fn hash(&self, state: &mut H) { + self.inner.hash(state); } } diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 7cd40e8651..d45ff56141 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -40,7 +40,7 @@ use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ - data::{AsEncodedString, Datum, OwnedEncodedString, OwnedRawString, ResizeError}, + data::{ByteString, Datum, EncodedStringTrait, ResizeError, WithEncoding}, format::{DisplayPlain, Format}, identifier::{ByIdentifier, HasIdentifier, Identifier}, output::pivot::{ @@ -1856,7 +1856,7 @@ pub enum MultipleResponseType { /// one value (the "counted value") means that the box was checked, and any /// other value means that it was not. MultipleDichotomy { - datum: Datum, + datum: Datum, labels: CategoryLabels, }, @@ -1914,7 +1914,7 @@ impl DictIndexVariableSet { } #[derive(Clone, Default, PartialEq, Eq, Serialize)] -pub struct ValueLabels(pub HashMap, String>); +pub struct ValueLabels(pub HashMap, String>); impl ValueLabels { pub fn new() -> Self { @@ -1925,11 +1925,11 @@ impl ValueLabels { self.0.is_empty() } - pub fn get(&self, datum: &Datum) -> Option<&str> { + pub fn get(&self, datum: &Datum) -> Option<&str> { self.0.get(datum).map(|s| s.as_str()) } - pub fn insert(&mut self, datum: Datum, label: String) -> Option { + pub fn insert(&mut self, datum: Datum, label: String) -> Option { self.0.insert(datum, label) } @@ -1987,7 +1987,7 @@ impl<'a> MissingValuesMut<'a> { pub fn add_value( &mut self, - mut value: Datum, + mut value: Datum>, ) -> Result<(), MissingValuesError> { if self.inner.values.len() > 2 || (self.inner.range().is_some() && self.inner.values.len() > 1) @@ -2006,7 +2006,7 @@ impl<'a> MissingValuesMut<'a> { pub fn add_values( &mut self, - values: impl IntoIterator>, + values: impl IntoIterator>>, ) -> Result<(), MissingValuesError> { let n = self.inner.values.len(); for value in values { @@ -2031,7 +2031,7 @@ impl<'a> MissingValuesMut<'a> { #[derive(Clone, Default, Serialize)] pub struct MissingValues { /// Individual missing values, up to 3 of them. - values: Vec>, + values: Vec>>, /// Optional range of missing values. range: Option, @@ -2086,7 +2086,7 @@ impl MissingValues { pub fn clear(&mut self) { *self = Self::default(); } - pub fn values(&self) -> &[Datum] { + pub fn values(&self) -> &[Datum>] { &self.values } @@ -2095,7 +2095,7 @@ impl MissingValues { } pub fn new( - mut values: Vec>, + mut values: Vec>>, range: Option, ) -> Result { if values.len() > 3 { @@ -2137,12 +2137,12 @@ impl MissingValues { pub fn contains(&self, value: &Datum) -> bool where - S: AsEncodedString, + S: EncodedStringTrait, { if self .values .iter() - .any(|datum| datum.eq_ignore_trailing_spaces(&value)) + .any(|datum| datum.eq_ignore_trailing_spaces(value)) { return true; } diff --git a/rust/pspp/src/format/display/mod.rs b/rust/pspp/src/format/display/mod.rs index cf3ae72114..337d5136d5 100644 --- a/rust/pspp/src/format/display/mod.rs +++ b/rust/pspp/src/format/display/mod.rs @@ -29,7 +29,7 @@ use smallvec::{Array, SmallVec}; use crate::{ calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name}, - data::{AsEncodedString, BorrowString, Datum, QuotedDatum}, + data::{ByteStr, Datum, EncodedStringTrait, QuotedDatum, WithEncoding}, endian::{endian_to_smallvec, ToBytes}, format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, @@ -102,17 +102,14 @@ where impl<'a, D> Datum where - D: AsEncodedString + BorrowString, + D: EncodedStringTrait, { /// Returns an object that implements [Display] for printing this /// [EncodedDatum] as `format`. /// /// [Display]: std::fmt::Display - pub fn display(&'a self, format: Format) -> DisplayDatum<'a, D::Borrowed<'a>> - where - D::Borrowed<'a>: AsEncodedString, - { - DisplayDatum::new(format, self.borrowed_string()) + pub fn display(&'a self, format: Format) -> DisplayDatum<'a, WithEncoding>> { + DisplayDatum::new(format, self.as_borrowed()) } pub fn display_plain(&self) -> QuotedDatum<'_, D> { @@ -122,20 +119,19 @@ where impl<'a, 'b, B> Display for DisplayDatum<'b, B> where - B: AsEncodedString, + B: EncodedStringTrait, { fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { let number = match &self.datum { Datum::Number(number) => *number, Datum::String(string) => { if self.format.type_() == Type::AHex { - for byte in string.as_encoded_string().as_bytes() { + for byte in string.raw_string_bytes() { write!(f, "{byte:02x}")?; } } else { let quote = if self.quote_strings { "\"" } else { "" }; - let s = string.as_encoded_string(); - let s = s.as_str(); + let s = string.as_str(); let s = if self.trim_spaces { s.trim_end_matches(' ') } else { @@ -188,7 +184,7 @@ where impl<'b, B> DisplayDatum<'b, B> where - B: AsEncodedString, + B: EncodedStringTrait, { pub fn new(format: Format, datum: Datum) -> Self { let settings = PsppSettings::global(); diff --git a/rust/pspp/src/format/display/test.rs b/rust/pspp/src/format/display/test.rs index 4a4ae6a0a4..6eefc4ffd2 100644 --- a/rust/pspp/src/format/display/test.rs +++ b/rust/pspp/src/format/display/test.rs @@ -23,7 +23,7 @@ use smallstr::SmallString; use smallvec::SmallVec; use crate::{ - data::{Datum, OwnedEncodedString}, + data::{ByteString, Datum, WithEncoding}, endian::Endian, format::{AbstractFormat, Epoch, Format, Settings, Type, UncheckedFormat, CC}, lex::{scan::StringScanner, segment::Syntax, Punct, Token}, @@ -73,7 +73,7 @@ fn test(name: &str) { let format: Format = format.try_into().unwrap(); assert_eq!(tokens.get(1), Some(&Token::Punct(Punct::Colon))); let expected = tokens[2].as_string().unwrap(); - let actual = Datum::::Number(value) + let actual = Datum::>::Number(value) .display(format) .with_settings(&settings) .with_endian(endian) @@ -181,7 +181,7 @@ fn leading_zeros() { } fn test_with_settings(value: f64, expected: [&str; 2], settings: &Settings) { - let value = Datum::::from(value); + let value = Datum::>::from(value); for (expected, d) in expected.into_iter().zip([2, 1].into_iter()) { assert_eq!( &value @@ -212,7 +212,7 @@ fn leading_zeros() { fn non_ascii_cc() { fn test(settings: &Settings, value: f64, expected: &str) { assert_eq!( - &Datum::::from(value) + &Datum::>::from(value) .display(Format::new(Type::CC(CC::A), 10, 2).unwrap()) .with_settings(settings) .to_string(), @@ -262,7 +262,7 @@ fn test_binhex(name: &str) { assert_eq!(tokens.get(1), Some(&Token::Punct(Punct::Colon))); let expected = tokens[2].as_string().unwrap(); let mut actual = SmallVec::<[u8; 16]>::new(); - Datum::::Number(value) + Datum::>::Number(value) .display(format) .with_endian(endian) .write(&mut actual, UTF_8) @@ -1289,7 +1289,7 @@ fn test_times(format: Format, name: &str) { .zip(1..) { let formatted = parser - .parse(&input) + .parse(input) .unwrap() .with_encoding(UTF_8) .display(format) diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index aa92728582..6fc8162468 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -29,8 +29,7 @@ use thiserror::Error as ThisError; use unicode_width::UnicodeWidthStr; use crate::{ - data::Datum, - data::OwnedRawString, + data::{ByteString, Datum, }, dictionary::{VarType, VarWidth}, sys::raw, }; @@ -393,10 +392,10 @@ impl Type { } } - pub fn default_value(&self) -> Datum { + pub fn default_value(&self) -> Datum { match self.var_type() { VarType::Numeric => Datum::sysmis(), - VarType::String => Datum::String(OwnedRawString::default()), + VarType::String => Datum::String(ByteString::default()), } } } @@ -621,10 +620,10 @@ impl Format { Ok(self) } - pub fn default_value(&self) -> Datum { + pub fn default_value(&self) -> Datum { match self.var_width() { VarWidth::Numeric => Datum::sysmis(), - VarWidth::String(width) => Datum::String(OwnedRawString::spaces(width as usize)), + VarWidth::String(width) => Datum::String(ByteString::spaces(width as usize)), } } diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index eafddcf930..8a2a3c9c6b 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -16,7 +16,7 @@ use crate::{ calendar::{calendar_gregorian_to_offset, DateError}, - data::{BorrowedEncodedString, Datum, OwnedDatum, OwnedEncodedString}, + data::{ByteString, Datum, EncodedStringTrait, OwnedDatum, RawStringTrait, WithEncoding}, endian::{Endian, Parse}, format::{DateTemplate, Decimals, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, @@ -32,7 +32,7 @@ use thiserror::Error as ThisError; #[derive(Clone, Debug)] pub struct ParseError { type_: Type, - input: OwnedEncodedString, + input: WithEncoding, kind: ParseErrorKind, } @@ -43,7 +43,7 @@ impl Display for ParseError { write!( f, "{} cannot be parsed as {}: {}", - self.input.borrowed().quoted(), + self.input.quoted(), &self.type_, &self.kind ) @@ -190,13 +190,12 @@ impl<'a> ParseValue<'a> { /// input into UTF-8, but this will screw up parsing of binary formats, /// because recoding bytes from (e.g.) windows-1252 into UTF-8, and then /// interpreting them as a binary number yields nonsense. - pub fn parse<'b, T>(&self, input: T) -> Result - where - T: Into>, - { - let input: BorrowedEncodedString = input.into(); + pub fn parse(&self, input: impl EncodedStringTrait) -> Result { if input.is_empty() { - return Ok(self.type_.default_value()); + return Ok(self + .type_ + .default_value() + .with_encoding(self.output_encoding)); } match self.type_ { Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => { @@ -222,19 +221,19 @@ impl<'a> ParseValue<'a> { | Type::DTime => self.parse_date(&input.as_str()), Type::WkDay => self.parse_wkday(&input.as_str()), Type::Month => self.parse_month(&input.as_str()), - Type::P => self.parse_p(input.as_bytes()), - Type::PK => self.parse_pk(input.as_bytes()), - Type::IB => self.parse_ib(input.as_bytes()), - Type::PIB => self.parse_pib(input.as_bytes()), - Type::RB => self.parse_rb(input.as_bytes()), + Type::P => self.parse_p(input.raw_string_bytes()), + Type::PK => self.parse_pk(input.raw_string_bytes()), + Type::IB => self.parse_ib(input.raw_string_bytes()), + Type::PIB => self.parse_pib(input.raw_string_bytes()), + Type::RB => self.parse_rb(input.raw_string_bytes()), Type::A => Ok(Datum::String( - input.to_encoding(self.output_encoding).into(), + input.to_encoding(self.output_encoding).into_owned(), )), Type::AHex => self.parse_ahex(&input.as_str()), } .map_err(|kind| ParseError { type_: self.type_, - input: input.into(), + input: input.cloned(), kind, }) } @@ -468,7 +467,9 @@ impl<'a> ParseValue<'a> { }; result.push((hi * 16 + lo) as u8); } - Ok(Datum::String(result.into())) + Ok(Datum::String( + ByteString(result).with_encoding(self.output_encoding), + )) } fn parse_hex(&self, input: &str) -> Result, ParseErrorKind> { @@ -920,7 +921,7 @@ mod test { use crate::{ calendar::{days_in_month, is_leap_year}, - data::{BorrowedRawString, Datum, EncodedString, OwnedDatum}, + data::{ByteStr, Datum, EncodedStringTrait, OwnedDatum, RawStringTrait}, endian::Endian, format::{ parse::{ParseError, ParseErrorKind, Sign}, @@ -942,8 +943,7 @@ mod test { let result = type_.parser(UTF_8).parse(&input); let error = result.clone().err(); let value = result - .unwrap_or(type_.default_value()) - .with_encoding(UTF_8) + .unwrap_or(type_.default_value().with_encoding(UTF_8)) .display(Format::new(Type::F, 10, 4).unwrap()) .to_string(); if value != expected { @@ -1636,7 +1636,7 @@ mod test { let parsed = Type::RB .parser(UTF_8) .with_endian(EndianSettings::new(Endian::Big)) - .parse(EncodedString::new(BorrowedRawString::new(&raw[..]), UTF_8)) + .parse(ByteStr(raw.as_slice()).with_encoding(UTF_8)) .unwrap() .as_number() .unwrap() @@ -1733,7 +1733,6 @@ mod test { .unwrap() .as_string() .unwrap() - .as_encoded(UTF_8) .as_str(), "abcdefgh" ); diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index 75cc9e3235..1361765dec 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -67,7 +67,7 @@ use thiserror::Error as ThisError; use tlo::parse_tlo; use crate::{ - data::{AsEncodedString, Datum, OwnedEncodedString, OwnedRawString}, + data::{ByteString, Datum, EncodedStringTrait, RawStringTrait}, dictionary::{VarType, Variable}, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, settings::{Settings, Show}, @@ -1861,14 +1861,14 @@ impl Value { } pub fn new_datum(value: &Datum) -> Self where - B: AsEncodedString, + B: EncodedStringTrait, { match value { Datum::Number(number) => Self::new_number(*number), - Datum::String(string) => Self::new_user_text(string.as_encoded_string().as_str()), + Datum::String(string) => Self::new_user_text(string.as_str()), } } - pub fn new_variable_value(variable: &Variable, value: &Datum) -> Self { + pub fn new_variable_value(variable: &Variable, value: &Datum) -> Self { let var_name = Some(variable.name.as_str().into()); let value_label = variable.value_labels.get(value).map(String::from); match value { @@ -1892,7 +1892,10 @@ impl Value { Datum::String(string) => Self::new(ValueInner::String(StringValue { show: None, hex: variable.print_format.type_() == Type::AHex, - s: string.as_encoded(variable.encoding()).as_str().into_owned(), + s: string + .as_ref() + .with_encoding(variable.encoding()) + .into_string(), var_name, value_label, })), @@ -2211,7 +2214,7 @@ impl Display for DisplayValue<'_> { write!( &mut buf, "{}", - Datum::::Number(*value).display(format) + Datum::<&str>::Number(*value).display(format) ) .unwrap(); write!(f, "{}", buf.trim_start_matches(' '))?; diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 82edb76f92..044985146f 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -26,7 +26,7 @@ use std::{ use crate::{ calendar::date_time_to_pspp, crypto::EncryptedFile, - data::{Case, Datum, OwnedRawString}, + data::{ByteString, Case, Datum, MutRawString, RawStringTrait}, dictionary::{ DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseType, VarWidth, Variable, @@ -915,7 +915,8 @@ impl Records { variable .missing_values_mut() - .replace(input.missing_values.decode(encoding).unwrap()); + .replace(input.missing_values.decode(encoding).unwrap()) + .unwrap(); variable.print_format = decode_format( input.print_format, @@ -1258,8 +1259,8 @@ impl Records { .missing_values .into_iter() .map(|v| { - let mut value = OwnedRawString::from(v.0.as_slice()); - value.resize(variable.width.as_string_width().unwrap()); + let mut value = ByteString::from(v.0.as_slice()); + let _ = value.resize(variable.width.as_string_width().unwrap()); // XXX check error Datum::String(value.with_encoding(encoding)) }) .collect::>(); @@ -1700,7 +1701,7 @@ impl Debug for Cases { } impl Iterator for Cases { - type Item = Result>>, raw::Error>; + type Item = Result>>, raw::Error>; fn next(&mut self) -> Option { self.inner diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 3390e5e9f3..be1f741d48 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -20,7 +20,7 @@ //! raw details. Most readers will want to use higher-level interfaces. use crate::{ - data::{BorrowedRawString, Datum, OwnedRawString, RawCase, }, + data::{ByteStr, ByteString, Datum, RawCase}, dictionary::{VarType, VarWidth}, endian::{Endian, Parse, ToBytes}, identifier::{Error as IdError, Identifier}, @@ -395,7 +395,7 @@ pub enum Record { /// one variable record per 8-byte segment. Variable( /// The record. - VariableRecord, + VariableRecord, ), /// Value labels for numeric and short string variables. @@ -403,7 +403,7 @@ pub enum Record { /// These appear after the variable records. ValueLabel( /// The record. - ValueLabelRecord, + ValueLabelRecord, ), /// Document record. @@ -433,13 +433,13 @@ pub enum Record { /// Multiple response variable record. MultipleResponse( /// The record. - MultipleResponseRecord, + MultipleResponseRecord, ), /// Value labels for long string variables. LongStringValueLabels( /// The record. - LongStringValueLabelRecord, + LongStringValueLabelRecord, ), /// Missing values for long string variables. @@ -448,7 +448,7 @@ pub enum Record { /// variable records. LongStringMissingValues( /// The record. - LongStringMissingValueRecord, + LongStringMissingValueRecord, ), /// Encoding record. @@ -776,12 +776,12 @@ impl<'de> Decoder<'de> { output } - fn decode<'a>(&mut self, input: &'a OwnedRawString) -> Cow<'a, str> { + fn decode<'a>(&mut self, input: &'a ByteString) -> Cow<'a, str> { self.decode_slice(input.0.as_slice()) } /// Decodes `input` to an [Identifier] using our encoding. - pub fn decode_identifier(&mut self, input: &OwnedRawString) -> Result { + pub fn decode_identifier(&mut self, input: &ByteString) -> Result { let decoded = &self.decode(input); self.new_identifier(decoded) } @@ -901,7 +901,7 @@ impl Debug for RawDatum { match self { RawDatum::Number(Some(number)) => write!(f, "{number:?}"), RawDatum::Number(None) => write!(f, "SYSMIS"), - RawDatum::String(s) => write!(f, "{:?}", BorrowedRawString::new(s)), + RawDatum::String(s) => write!(f, "{:?}", ByteStr(s)), } } } @@ -913,7 +913,7 @@ impl Serialize for RawDatum { { match self { RawDatum::Number(number) => number.serialize(serializer), - RawDatum::String(s) => BorrowedRawString::new(s).serialize(serializer), + RawDatum::String(s) => ByteStr(s).serialize(serializer), } } } @@ -930,18 +930,18 @@ impl RawDatum { /// Decodes a `RawDatum` into a [Datum] given that we now know the string /// width. - pub fn decode(&self, width: VarWidth) -> Datum { + pub fn decode(&self, width: VarWidth) -> Datum { match self { Self::Number(x) => Datum::Number(*x), Self::String(s) => { let width = width.as_string_width().unwrap(); - Datum::String(OwnedRawString::from(&s[..width])) + Datum::String(ByteString::from(&s[..width])) } } } } -impl Datum { +impl Datum { fn read_case( reader: &mut R, case_vars: &[CaseVar], @@ -1150,7 +1150,7 @@ where reader: Option, warn: Box, - header: FileHeader, + header: FileHeader, var_types: VarTypes, state: ReaderState, @@ -1179,7 +1179,7 @@ where } /// Returns the header in this reader. - pub fn header(&self) -> &FileHeader { + pub fn header(&self) -> &FileHeader { &self.header } @@ -1410,7 +1410,7 @@ impl RawCases { fn new( reader: R, var_types: VarTypes, - header: &FileHeader, + header: &FileHeader, ztrailer_offset: Option, ) -> Self where @@ -1579,7 +1579,7 @@ impl Debug for UntypedDatum { } else { big }; - write!(f, "{number}/{:?}", BorrowedRawString::new(&self.0)) + write!(f, "{number}/{:?}", ByteStr(&self.0)) } } @@ -1598,7 +1598,7 @@ impl From<[u8; N]> for RawStrArray { impl Debug for RawStrArray { fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", BorrowedRawString::new(&self.0)) + write!(f, "{:?}", ByteStr(&self.0)) } } @@ -1607,7 +1607,7 @@ impl Serialize for RawStrArray { where S: serde::Serializer, { - BorrowedRawString::new(&self.0).serialize(serializer) + ByteStr(&self.0).serialize(serializer) } } @@ -1657,7 +1657,7 @@ fn read_vec(r: &mut R, n: usize) -> Result, IoError> { Ok(vec) } -fn read_string(r: &mut R, endian: Endian) -> Result { +fn read_string(r: &mut R, endian: Endian) -> Result { let length: u32 = endian.parse(read_bytes(r)?); Ok(read_vec(r, length as usize)?.into()) } diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index b7469fbbbe..296a0ac73e 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -12,7 +12,7 @@ use std::{ }; use crate::{ - data::{Datum, OwnedRawString, }, + data::{ByteString, Datum}, dictionary::{ Alignment, Attributes, CategoryLabels, Measure, MissingValueRange, MissingValues, MissingValuesError, VarType, VarWidth, @@ -142,7 +142,7 @@ pub struct RawHeader { pub file_label: [u8; 64], } -impl FileHeader { +impl FileHeader { /// Reads a header record from `r`, reporting any warnings via `warn`. pub fn read(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result where @@ -351,14 +351,14 @@ fn format_name(type_: u32) -> Cow<'static, str> { #[derive(Clone, Debug, Default, Serialize)] pub struct RawMissingValues { /// Individual missing values, up to 3 of them. - pub values: Vec>, + pub values: Vec>, /// Optional range of missing values. pub range: Option, } impl RawMissingValues { - pub fn new(values: Vec>, range: Option) -> Self { + pub fn new(values: Vec>, range: Option) -> Self { Self { values, range } } @@ -450,7 +450,7 @@ impl RawMissingValues { let width = width.min(8) as usize; let values = values .into_iter() - .map(|value| Datum::String(OwnedRawString::from(&value[..width]))) + .map(|value| Datum::String(ByteString::from(&value[..width]))) .collect(); return Ok(Self::new(values, None)); } @@ -540,7 +540,7 @@ pub struct RawVariableRecord { pub name: [u8; 8], } -impl VariableRecord { +impl VariableRecord { /// Reads a variable record from `r`. pub fn read( r: &mut R, @@ -699,7 +699,7 @@ where pub const MAX_INDEXES: u32 = u32::MAX / 8; } -impl ValueLabelRecord { +impl ValueLabelRecord { pub(super) fn read( r: &mut R, endian: Endian, @@ -1036,7 +1036,7 @@ pub struct TextRecord { pub offsets: Range, /// The text content of the record. - pub text: OwnedRawString, + pub text: ByteString, } impl TextRecord { @@ -1199,7 +1199,7 @@ pub enum MultipleResponseType { /// Multiple-dichotomy set. MultipleDichotomy { /// The value that is counted in the set. - value: OwnedRawString, + value: ByteString, /// What categories are labeled. labels: CategoryLabels, @@ -1269,7 +1269,7 @@ where pub short_names: Vec, } -impl MultipleResponseSet { +impl MultipleResponseSet { /// Parses a multiple-response set from `input`. Returns the set and the /// input remaining to be parsed following the set. fn parse(input: &[u8]) -> Result<(Self, &[u8]), WarningDetails> { @@ -1366,7 +1366,7 @@ where pub sets: Vec>, } -impl MultipleResponseRecord { +impl MultipleResponseRecord { /// Parses a multiple-response set from `ext`. pub fn parse(ext: &Extension) -> Result { ext.check_size(Some(1), None, "multiple response set record")?; @@ -1391,7 +1391,7 @@ impl MultipleResponseRecord { } } -impl MultipleResponseRecord { +impl MultipleResponseRecord { /// Decodes this record using `decoder`. pub fn decode(self, decoder: &mut Decoder) -> MultipleResponseRecord { let mut sets = Vec::new(); @@ -1410,7 +1410,7 @@ impl MultipleResponseRecord { } } -fn parse_counted_string(input: &[u8]) -> Result<(OwnedRawString, &[u8]), WarningDetails> { +fn parse_counted_string(input: &[u8]) -> Result<(ByteString, &[u8]), WarningDetails> { let Some(space) = input.iter().position(|&b| b == b' ') else { return Err(MultipleResponseWarning::CountedStringMissingSpace.into()); }; @@ -1576,7 +1576,7 @@ where pub missing_values: Vec>, } -impl LongStringMissingValues { +impl LongStringMissingValues { /// Decodes these settings using `decoder`. fn decode( &self, @@ -1602,7 +1602,7 @@ where pub values: Vec>, } -impl LongStringMissingValueRecord { +impl LongStringMissingValueRecord { /// Parses this record from `ext`. pub fn parse( ext: &Extension, @@ -2359,10 +2359,10 @@ where pub width: u32, /// `(value, label)` pairs, where each value is `width` bytes. - pub labels: Vec<(OwnedRawString, S)>, + pub labels: Vec<(ByteString, S)>, } -impl LongStringValueLabels { +impl LongStringValueLabels { /// Decodes a set of long string value labels using `decoder`. fn decode( &self, @@ -2400,7 +2400,7 @@ where pub labels: Vec>, } -impl LongStringValueLabelRecord { +impl LongStringValueLabelRecord { /// Parses this record from `ext` using `endian`. fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size(Some(1), None, "long string value labels record")?; diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 24463aa632..1b5fe0f193 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -27,7 +27,7 @@ use encoding_rs::UTF_8; use crate::{ crypto::EncryptedFile, - data::{BorrowedDatum, Datum, OwnedDatum, RawString}, + data::{BorrowedDatum, ByteString, Datum, OwnedDatum}, dictionary::{Dictionary, VarWidth, Variable}, endian::Endian, identifier::Identifier, @@ -645,15 +645,15 @@ fn write_long_string_value_labels() { let mut dictionary = Dictionary::new(UTF_8); let mut s1 = Variable::new(Identifier::new("s1").unwrap(), VarWidth::String(9), UTF_8); s1.value_labels.insert( - OwnedDatum::String(RawString(String::from("abc ").into_bytes())), + Datum::String(ByteString::from("abc ")), String::from("First value label"), ); s1.value_labels.insert( - OwnedDatum::String(RawString(String::from("abcdefgh ").into_bytes())), + Datum::String(ByteString::from("abcdefgh ")), String::from("Second value label"), ); s1.value_labels.insert( - OwnedDatum::String(RawString(String::from("abcdefghi").into_bytes())), + Datum::String(ByteString::from("abcdefghi")), String::from("Third value label"), ); s1.missing_values_mut() diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index a4da7e6800..4c59603c73 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -1,5 +1,5 @@ use std::{ - borrow::{Borrow, Cow}, + borrow::Cow, collections::HashMap, fmt::Write as _, fs::File, @@ -17,7 +17,7 @@ use itertools::zip_eq; use smallvec::SmallVec; use crate::{ - data::{BorrowedRawString, Datum}, + data::{Datum, RawStringTrait}, dictionary::{ Alignment, Attributes, CategoryLabels, Dictionary, Measure, MultipleResponseType, ValueLabels, VarWidth, @@ -587,7 +587,7 @@ where let label = self.dictionary.encoding().encode(&label).0; ( value.len() as u32, - value.as_bytes(), + value.raw_string_bytes(), label.len() as u32, &label[..], ) @@ -614,7 +614,7 @@ where .write_le(&mut cursor)?; for value in variable.missing_values().values() { - let value = value.as_string().unwrap().as_bytes(); + let value = value.as_string().unwrap().raw_string_bytes(); let bytes = value.get(..8).unwrap_or(value); Padded::exact(bytes, 8, b' ').write_le(&mut cursor).unwrap(); } @@ -713,7 +713,7 @@ impl BinWrite for Pad { impl BinWrite for Datum where - B: Borrow, + B: RawStringTrait, { type Args<'a> = (); @@ -725,7 +725,11 @@ where ) -> binrw::BinResult<()> { match self { Datum::Number(number) => number.unwrap_or(f64::MIN).write_options(writer, endian, ()), - Datum::String(raw_string) => raw_string.borrow().0.write_options(writer, endian, ()), + Datum::String(raw_string) => { + raw_string + .raw_string_bytes() + .write_options(writer, endian, ()) + } } } } @@ -869,7 +873,7 @@ where case: impl Iterator>, ) -> Result<(), BinError> where - B: Borrow, + B: RawStringTrait, { for (var, datum) in zip_eq(self.case_vars, case) { match var { @@ -879,7 +883,7 @@ where .unwrap_or(f64::MIN) .write_le(&mut self.inner)?, CaseVar::String(encoding) => { - let mut s = datum.as_string().unwrap().borrow().as_bytes(); + let mut s = datum.as_string().unwrap().raw_string_bytes(); for segment in encoding { let data; (data, s) = s.split_at(segment.data_bytes); @@ -895,7 +899,7 @@ where case: impl Iterator>, ) -> Result<(), BinError> where - B: Borrow, + B: RawStringTrait, { for (var, datum) in zip_eq(self.case_vars, case) { match var { @@ -915,7 +919,7 @@ where }, CaseVar::String(encoding) => { - let mut s = datum.as_string().unwrap().borrow().as_bytes(); + let mut s = datum.as_string().unwrap().raw_string_bytes(); for segment in encoding { let data; (data, s) = s.split_at(segment.data_bytes); @@ -1024,7 +1028,7 @@ where case: impl IntoIterator>, ) -> Result<(), BinError> where - B: Borrow, + B: RawStringTrait, { match self.inner.as_mut().unwrap() { Either::Left(inner) => { -- 2.30.2