From: Ben Pfaff Date: Mon, 28 Jul 2025 23:23:51 +0000 (-0700) Subject: start experiment with merging encodeddatum into datum X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2957b0accb53037fe04de6b48e2b2563acc89626;p=pspp start experiment with merging encodeddatum into datum --- diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index 581197c438..a16691d0b4 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -35,7 +35,7 @@ use std::{ str::from_utf8, }; -use encoding_rs::{mem::decode_latin1, Encoding, UTF_8}; +use encoding_rs::{mem::decode_latin1, Encoding}; use itertools::Itertools; use ordered_float::OrderedFloat; use serde::{ser::SerializeTupleVariant, Serialize}; @@ -273,193 +273,11 @@ where } } -pub type OwnedEncodedDatum = EncodedDatum; -pub type BorrowedEncodedDatum<'a> = EncodedDatum>; - -/// The value of a [Variable](crate::dictionary::Variable), with a string -/// encoding. -#[derive(Clone)] -pub enum EncodedDatum { - /// A numeric value. - Number( - /// A number, or `None` for the system-missing value. - Option, - ), - /// A string value. - String( - /// The value, in the variable's encoding. - D, - ), -} - -impl EncodedDatum> -where - R: Borrow, -{ - pub fn into_raw(self) -> Datum { - match self { - EncodedDatum::Number(number) => Datum::Number(number), - EncodedDatum::String(encoded_string) => Datum::String(encoded_string.into_raw()), - } - } - - /// Returns the [VarWidth] corresponding to this datum. - pub fn width(&self) -> VarWidth { - match self { - Self::Number(_) => VarWidth::Numeric, - Self::String(s) => VarWidth::String(s.len().try_into().unwrap()), - } - } - - pub fn borrowed<'a>(&'a self) -> EncodedDatum> { - match self { - EncodedDatum::Number(number) => EncodedDatum::Number(*number), - EncodedDatum::String(encoded_string) => EncodedDatum::String(encoded_string.borrowed()), - } - } - - /// Compares this datum and `other` for equality, ignoring trailing ASCII - /// spaces in either, if they are both strings, for the purpose of - /// comparison. - pub fn eq_ignore_trailing_spaces(&self, other: &EncodedDatum>) -> bool - where - R2: Borrow, - { - match (self.borrowed(), other.borrowed()) { - (EncodedDatum::Number(lhs), EncodedDatum::Number(rhs)) => lhs == rhs, - (EncodedDatum::String(lhs), EncodedDatum::String(rhs)) => { - lhs.eq_ignore_trailing_spaces(&rhs) - } - _ => false, - } - } - - pub fn quoted(&self) -> QuotedEncodedDatum<'_> { - QuotedEncodedDatum(self.borrowed()) - } -} - -impl EncodedDatum { - /// Constructs a new numerical [EncodedDatum] for the system-missing value. - pub const fn sysmis() -> Self { - Self::Number(None) - } - - /// Returns the number inside this datum, or `None` if this is a string - /// datum. - pub fn as_number(&self) -> Option> { - match self { - Self::Number(number) => Some(*number), - Self::String(_) => None, - } - } - - /// Returns the string inside this datum, or `None` if this is a numeric - /// datum. - pub fn as_string(&self) -> Option<&D> { - match self { - Self::Number(_) => None, - Self::String(s) => Some(s), - } - } - - /// Returns the string inside this datum as a mutable borrow, or `None` if - /// this is a numeric datum. - pub fn as_string_mut(&mut self) -> Option<&mut D> { - match self { - Self::Number(_) => None, - Self::String(s) => Some(s), - } - } - - /// Returns the [VarType] corresponding to this datum. - pub fn var_type(&self) -> VarType { - match self { - Self::Number(_) => VarType::Numeric, - Self::String(_) => VarType::String, - } - } -} - -impl OwnedEncodedDatum { - /// Resizes this datum to the given `width`. Returns `Ok(())` if - /// successful, if and only if this datum and `width` are both string or - /// both numeric and, for string widths, resizing would not drop any - /// non-space characters. - pub fn resize(&mut self, width: VarWidth) -> Result<(), ()> { - match (self, width) { - (Self::Number(_), VarWidth::Numeric) => Ok(()), - (Self::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), - _ => Err(()), - } - } - - /// Removes trailing ASCII spaces from this datum, if it is a string. - pub fn trim_end(&mut self) { - match self { - Self::Number(_) => (), - Self::String(s) => s.trim_end(), - } - } -} - -impl Display for EncodedDatum -where - D: Display, -{ - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Self::Number(None) => write!(f, "SYSMIS"), - Self::Number(Some(number)) => number.display_plain().fmt(f), - Self::String(string) => write!(f, "{string}"), - } - } -} - -impl Serialize for EncodedDatum -where - D: Serialize, -{ - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - match self { - EncodedDatum::Number(number) => number.serialize(serializer), - EncodedDatum::String(encoded_string) => encoded_string.serialize(serializer), - } - } -} - -impl From for EncodedDatum { - fn from(number: f64) -> Self { - Some(number).into() - } -} - -impl From> for EncodedDatum { - fn from(value: Option) -> Self { - Self::Number(value) - } -} - -impl From<&str> for OwnedEncodedDatum { - fn from(value: &str) -> Self { - Self::String(OwnedEncodedString::from(value)) - } -} - -pub struct QuotedEncodedDatum<'a>(BorrowedEncodedDatum<'a>); - -impl Display for QuotedEncodedDatum<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match &self.0 { - EncodedDatum::Number(None) => write!(f, "SYSMIS"), - EncodedDatum::Number(Some(number)) => number.display_plain().fmt(f), - EncodedDatum::String(string) => write!(f, "\"{}\"", string.as_str()), - } - } -} +mod encoded; +pub use encoded::{ + BorrowedEncodedDatum, BorrowedEncodedString, EncodedDatum, EncodedString, OwnedEncodedDatum, + OwnedEncodedString, QuotedEncodedDatum, +}; /// A [Datum] that owns its string data (if any). pub type OwnedDatum = Datum; @@ -500,6 +318,19 @@ where } } +impl<'a, B> Display for Datum +where + B: Borrow>, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Number(None) => write!(f, "SYSMIS"), + Self::Number(Some(number)) => number.display_plain().fmt(f), + Self::String(string) => write!(f, "{}", string.borrow()), + } + } +} + impl Serialize for Datum where B: Serialize, @@ -590,6 +421,15 @@ impl Datum { } } + /// Returns the string inside this datum, or `None` if this is a numeric + /// datum. + pub fn as_string(&self) -> Option<&B> { + match self { + Self::Number(_) => None, + Self::String(s) => Some(s), + } + } + /// Returns the [VarType] corresponding to this datum. pub fn var_type(&self) -> VarType { match self { @@ -603,15 +443,6 @@ impl Datum where B: Borrow>, { - /// Returns the string inside this datum, or `None` if this is a numeric - /// datum. - pub fn as_string(&self) -> Option<&BorrowedRawString> { - match self { - Self::Number(_) => None, - Self::String(s) => Some(s.borrow()), - } - } - /// Returns true if this datum can be resized to the given `width` without /// loss, which is true only if this datum and `width` are both string or /// both numeric and, for string widths, if resizing would not drop any @@ -811,185 +642,6 @@ impl Iterator for CaseVecIter { } } -pub type OwnedEncodedString = EncodedString; -pub type BorrowedEncodedString<'a> = EncodedString<&'a BorrowedRawString>; - -/// An owned string and its [Encoding]. -/// -/// The string is not guaranteed to be valid in the encoding. -#[derive(Copy, Clone, Debug)] -pub struct EncodedString { - /// The bytes of the string. - raw: R, - - /// The string's encoding. - encoding: &'static Encoding, -} - -impl EncodedString -where - R: Borrow, -{ - pub fn new(raw: R, encoding: &'static Encoding) -> Self { - Self { raw, encoding } - } - - pub fn into_raw(self) -> R { - self.raw - } - - pub fn len(&self) -> usize { - self.raw.borrow().len() - } - - /// Returns this string recoded in UTF-8. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn as_str(&self) -> Cow<'_, str> { - self.encoding.decode_without_bom_handling(self.as_bytes()).0 - } - - /// Returns this string recoded in `encoding`. Invalid characters will be - /// replaced by [REPLACEMENT_CHARACTER]. - /// - /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER - pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - let utf8 = self.as_str(); - match encoding.encode(&utf8).0 { - Cow::Borrowed(_) => { - // Recoding into UTF-8 and then back did not change anything. - Cow::from(self.as_bytes()) - } - Cow::Owned(owned) => Cow::Owned(owned), - } - } - - /// Returns the bytes in the string, in its encoding. - pub fn as_bytes(&self) -> &[u8] { - &self.raw.borrow().0 - } - - /// Compares this string and `other` for equality, ignoring trailing ASCII - /// spaces in either string for the purpose of comparison. (This is - /// acceptable because we assume that the encoding is ASCII-compatible.) - pub fn eq_ignore_trailing_spaces(&self, other: &EncodedString) -> bool - where - R2: Borrow, - { - self.borrowed() - .raw - .eq_ignore_trailing_spaces(&other.borrowed().raw) - } - - /// Returns the string's [Encoding]. - pub fn encoding(&self) -> &'static Encoding { - self.encoding - } - - /// Returns a borrowed form of this string. - pub fn borrowed<'a>(&'a self) -> EncodedString<&'a BorrowedRawString> { - EncodedString { - encoding: self.encoding, - raw: self.raw.borrow(), - } - } - - /// Returns true if this string is empty. - pub fn is_empty(&self) -> bool { - self.raw.borrow().is_empty() - } - - /// Returns a helper for displaying this string in double quotes. - pub fn quoted(&self) -> impl Display { - Quoted(self.as_str()) - } -} - -impl OwnedEncodedString { - pub fn resize(&mut self, new_len: usize) -> Result<(), ()> { - match new_len.cmp(&self.len()) { - Ordering::Less => { - if !self.as_bytes()[new_len..].iter().all(|b| *b == b' ') { - return Err(()); - } - self.raw.0.truncate(new_len); - } - Ordering::Equal => (), - Ordering::Greater => self.raw.0.extend((self.len()..new_len).map(|_| b' ')), - } - Ok(()) - } - - /// Removes any trailing ASCII spaces. - pub fn trim_end(&mut self) { - while self.raw.0.pop_if(|c| *c == b' ').is_some() {} - } -} - -impl<'a> From> for OwnedEncodedString { - fn from(value: BorrowedEncodedString<'a>) -> Self { - Self { - raw: value.raw.into(), - encoding: value.encoding, - } - } -} - -impl From<&str> for OwnedEncodedString { - fn from(value: &str) -> Self { - Self { - raw: RawString(value.into()), - encoding: UTF_8, - } - } -} - -impl<'a> From<&'a str> for BorrowedEncodedString<'a> { - fn from(value: &'a str) -> Self { - Self { - raw: BorrowedRawString::new(value.as_bytes()), - encoding: UTF_8, - } - } -} - -impl<'a> From<&'a String> for BorrowedEncodedString<'a> { - fn from(value: &'a String) -> Self { - value.as_str().into() - } -} - -impl Serialize for OwnedEncodedString { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - self.as_str().serialize(serializer) - } -} - -impl Display for EncodedString -where - R: Borrow, -{ - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -impl PartialEq> for EncodedString -where - R: Borrow, - R2: Borrow, -{ - fn eq(&self, other: &EncodedString) -> bool { - // XXX should this consider the encodings? - self.borrowed().raw.eq(other.borrowed().raw) - } -} - -/// Helper struct for displaying a value in double quotes. pub struct Quoted(T) where T: Display; diff --git a/rust/pspp/src/data/encoded.rs b/rust/pspp/src/data/encoded.rs new file mode 100644 index 0000000000..19fba5f340 --- /dev/null +++ b/rust/pspp/src/data/encoded.rs @@ -0,0 +1,402 @@ +use std::{ + borrow::{Borrow, BorrowMut, Cow}, + cmp::Ordering, + fmt::Display, +}; + +use encoding_rs::{Encoding, UTF_8}; +use serde::Serialize; + +use crate::{ + data::{BorrowedRawString, Datum, OwnedRawString, Quoted, RawString}, + dictionary::{VarType, VarWidth}, + format::DisplayPlain, +}; + +pub type OwnedEncodedDatum = EncodedDatum; +pub type BorrowedEncodedDatum<'a> = EncodedDatum>; + +/// The value of a [Variable](crate::dictionary::Variable), with a string +/// encoding. +#[derive(Clone)] +pub enum EncodedDatum { + /// A numeric value. + Number( + /// A number, or `None` for the system-missing value. + Option, + ), + /// A string value. + String( + /// The value, in the variable's encoding. + D, + ), +} + +impl EncodedDatum> +where + R: Borrow, +{ + pub fn into_raw(self) -> Datum { + match self { + EncodedDatum::Number(number) => Datum::Number(number), + EncodedDatum::String(encoded_string) => Datum::String(encoded_string.into_raw()), + } + } + + /// Returns the [VarWidth] corresponding to this datum. + pub fn width(&self) -> VarWidth { + match self { + Self::Number(_) => VarWidth::Numeric, + Self::String(s) => VarWidth::String(s.len().try_into().unwrap()), + } + } + + pub fn borrowed<'a>(&'a self) -> EncodedDatum> { + match self { + EncodedDatum::Number(number) => EncodedDatum::Number(*number), + EncodedDatum::String(encoded_string) => EncodedDatum::String(encoded_string.borrowed()), + } + } + + /// Compares this datum and `other` for equality, ignoring trailing ASCII + /// spaces in either, if they are both strings, for the purpose of + /// comparison. + pub fn eq_ignore_trailing_spaces(&self, other: &EncodedDatum>) -> bool + where + R2: Borrow, + { + match (self.borrowed(), other.borrowed()) { + (EncodedDatum::Number(lhs), EncodedDatum::Number(rhs)) => lhs == rhs, + (EncodedDatum::String(lhs), EncodedDatum::String(rhs)) => { + lhs.eq_ignore_trailing_spaces(&rhs) + } + _ => false, + } + } + + pub fn quoted(&self) -> QuotedEncodedDatum<'_> { + QuotedEncodedDatum(self.borrowed()) + } +} + +impl EncodedDatum { + /// Constructs a new numerical [EncodedDatum] for the system-missing value. + pub const fn sysmis() -> Self { + Self::Number(None) + } + + /// Returns the number inside this datum, or `None` if this is a string + /// datum. + pub fn as_number(&self) -> Option> { + match self { + Self::Number(number) => Some(*number), + Self::String(_) => None, + } + } + + /// Returns the string inside this datum, or `None` if this is a numeric + /// datum. + pub fn as_string(&self) -> Option<&D> { + match self { + Self::Number(_) => None, + Self::String(s) => Some(s), + } + } + + /// Returns the string inside this datum as a mutable borrow, or `None` if + /// this is a numeric datum. + pub fn as_string_mut(&mut self) -> Option<&mut D> { + match self { + Self::Number(_) => None, + Self::String(s) => Some(s), + } + } + + /// Returns the [VarType] corresponding to this datum. + pub fn var_type(&self) -> VarType { + match self { + Self::Number(_) => VarType::Numeric, + Self::String(_) => VarType::String, + } + } +} + +impl OwnedEncodedDatum { + /// Resizes this datum to the given `width`. Returns `Ok(())` if + /// successful, if and only if this datum and `width` are both string or + /// both numeric and, for string widths, resizing would not drop any + /// non-space characters. + pub fn resize(&mut self, width: VarWidth) -> Result<(), ()> { + match (self, width) { + (Self::Number(_), VarWidth::Numeric) => Ok(()), + (Self::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), + _ => Err(()), + } + } + + /// Removes trailing ASCII spaces from this datum, if it is a string. + pub fn trim_end(&mut self) { + match self { + Self::Number(_) => (), + Self::String(s) => s.trim_end(), + } + } +} + +impl Display for EncodedDatum +where + D: Display, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Number(None) => write!(f, "SYSMIS"), + Self::Number(Some(number)) => number.display_plain().fmt(f), + Self::String(string) => write!(f, "{string}"), + } + } +} + +impl Serialize for EncodedDatum +where + D: Serialize, +{ + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + EncodedDatum::Number(number) => number.serialize(serializer), + EncodedDatum::String(encoded_string) => encoded_string.serialize(serializer), + } + } +} + +impl From for EncodedDatum { + fn from(number: f64) -> Self { + Some(number).into() + } +} + +impl From> for EncodedDatum { + fn from(value: Option) -> Self { + Self::Number(value) + } +} + +impl From<&str> for OwnedEncodedDatum { + fn from(value: &str) -> Self { + Self::String(OwnedEncodedString::from(value)) + } +} + +/// Helper struct for displaying a value in double quotes. +pub struct QuotedEncodedDatum<'a>(BorrowedEncodedDatum<'a>); + +impl Display for QuotedEncodedDatum<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.0 { + EncodedDatum::Number(None) => write!(f, "SYSMIS"), + EncodedDatum::Number(Some(number)) => number.display_plain().fmt(f), + EncodedDatum::String(string) => write!(f, "\"{}\"", string.as_str()), + } + } +} + +pub type OwnedEncodedString = EncodedString; +pub type BorrowedEncodedString<'a> = EncodedString<&'a BorrowedRawString>; + +/// An owned string and its [Encoding]. +/// +/// The string is not guaranteed to be valid in the encoding. +#[derive(Copy, Clone, Debug)] +pub struct EncodedString { + /// The bytes of the string. + pub raw: R, + + /// The string's encoding. + pub encoding: &'static Encoding, +} + +impl EncodedString +where + R: Borrow, +{ + pub fn new(raw: R, encoding: &'static Encoding) -> Self { + Self { raw, encoding } + } + + pub fn into_raw(self) -> R { + self.raw + } + + pub fn len(&self) -> usize { + self.raw.borrow().len() + } + + /// Returns this string recoded in UTF-8. Invalid characters will be + /// replaced by [REPLACEMENT_CHARACTER]. + /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + pub fn as_str(&self) -> Cow<'_, str> { + self.encoding.decode_without_bom_handling(self.as_bytes()).0 + } + + /// Returns this string recoded in `encoding`. Invalid characters will be + /// replaced by [REPLACEMENT_CHARACTER]. + /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { + let utf8 = self.as_str(); + match encoding.encode(&utf8).0 { + Cow::Borrowed(_) => { + // Recoding into UTF-8 and then back did not change anything. + Cow::from(self.as_bytes()) + } + Cow::Owned(owned) => Cow::Owned(owned), + } + } + + /// Returns the bytes in the string, in its encoding. + pub fn as_bytes(&self) -> &[u8] { + &self.raw.borrow().0 + } + + /// Compares this string and `other` for equality, ignoring trailing ASCII + /// spaces in either string for the purpose of comparison. (This is + /// acceptable because we assume that the encoding is ASCII-compatible.) + pub fn eq_ignore_trailing_spaces(&self, other: &EncodedString) -> bool + where + R2: Borrow, + { + self.borrowed() + .raw + .eq_ignore_trailing_spaces(&other.borrowed().raw) + } + + /// Returns the string's [Encoding]. + pub fn encoding(&self) -> &'static Encoding { + self.encoding + } + + /// Returns a borrowed form of this string. + pub fn borrowed<'a>(&'a self) -> EncodedString<&'a BorrowedRawString> { + EncodedString { + encoding: self.encoding, + raw: self.raw.borrow(), + } + } + + /// Returns true if this string is empty. + pub fn is_empty(&self) -> bool { + self.raw.borrow().is_empty() + } + + /// Returns a helper for displaying this string in double quotes. + pub fn quoted(&self) -> impl Display { + Quoted(self.as_str()) + } +} + +impl Borrow for EncodedString +where + R: Borrow, +{ + fn borrow(&self) -> &BorrowedRawString { + self.raw.borrow() + } +} + +impl Borrow for OwnedEncodedString { + fn borrow(&self) -> &OwnedRawString { + &self.raw + } +} + +impl BorrowMut for OwnedEncodedString { + fn borrow_mut(&mut self) -> &mut OwnedRawString { + &mut self.raw + } +} + +impl OwnedEncodedString { + pub fn resize(&mut self, new_len: usize) -> Result<(), ()> { + match new_len.cmp(&self.len()) { + Ordering::Less => { + if !self.as_bytes()[new_len..].iter().all(|b| *b == b' ') { + return Err(()); + } + self.raw.0.truncate(new_len); + } + Ordering::Equal => (), + Ordering::Greater => self.raw.0.extend((self.len()..new_len).map(|_| b' ')), + } + Ok(()) + } + + /// Removes any trailing ASCII spaces. + pub fn trim_end(&mut self) { + while self.raw.0.pop_if(|c| *c == b' ').is_some() {} + } +} + +impl<'a> From> for OwnedEncodedString { + fn from(value: BorrowedEncodedString<'a>) -> Self { + Self { + raw: value.raw.into(), + encoding: value.encoding, + } + } +} + +impl From<&str> for OwnedEncodedString { + fn from(value: &str) -> Self { + Self { + raw: RawString(value.into()), + encoding: UTF_8, + } + } +} + +impl<'a> From<&'a str> for BorrowedEncodedString<'a> { + fn from(value: &'a str) -> Self { + Self { + raw: BorrowedRawString::new(value.as_bytes()), + encoding: UTF_8, + } + } +} + +impl<'a> From<&'a String> for BorrowedEncodedString<'a> { + fn from(value: &'a String) -> Self { + value.as_str().into() + } +} + +impl Serialize for OwnedEncodedString { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.as_str().serialize(serializer) + } +} + +impl Display for EncodedString +where + R: Borrow, +{ + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl PartialEq> for EncodedString +where + R: Borrow, + R2: Borrow, +{ + fn eq(&self, other: &EncodedString) -> bool { + // XXX should this consider the encodings? + self.borrowed().raw.eq(other.borrowed().raw) + } +}