From c113404df33c6910aceafbddffad480641efd1b3 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 24 Jul 2025 10:10:46 -0700 Subject: [PATCH] systematize all the raw/encoded data types --- rust/pspp/src/data.rs | 526 +++++++++++++++++++++------- rust/pspp/src/dictionary.rs | 152 ++++---- rust/pspp/src/format/display/mod.rs | 41 +-- rust/pspp/src/sys/cooked.rs | 6 +- rust/pspp/src/sys/raw/records.rs | 38 +- rust/pspp/src/sys/write.rs | 20 +- 6 files changed, 518 insertions(+), 265 deletions(-) diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index 0b6e7fa47d..c296c8a6af 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -37,10 +37,14 @@ use std::{ }; use encoding_rs::{mem::decode_latin1, Encoding, UTF_8}; +use itertools::Itertools; use ordered_float::OrderedFloat; use serde::{ser::SerializeTupleVariant, Serialize}; -use crate::dictionary::{VarType, VarWidth}; +use crate::{ + dictionary::{VarType, VarWidth}, + format::DisplayPlain, +}; /// An owned string in an unspecified character encoding. /// @@ -68,11 +72,17 @@ impl RawString { EncodedStr::new(&self.0, encoding) } + /// Returns true if this raw string can be resized to `new_len` bytes without + /// dropping non-space characters. + pub fn is_resizable(&self, new_len: usize) -> bool { + new_len >= self.len() || self.0[new_len..].iter().all(|b| *b == b' ') + } + /// Extends or shortens this [RawString] to exactly `len` bytes. If the /// string needs to be extended, does so by appending spaces. /// /// If this shortens the string, it can cut off a multibyte character in the - /// middle. + /// middle ([is_resizable](Self::is_resizable) checks for this).
pub fn resize(&mut self, len: usize) { self.0.resize(len, b' '); } @@ -81,6 +91,13 @@ impl RawString { pub fn trim_end(&mut self) { while self.0.pop_if(|c| *c == b' ').is_some() {} } + + pub fn with_encoding(self, encoding: &'static Encoding) -> EncodedString { + EncodedString { + bytes: self.0, + encoding, + } + } } impl Borrow for RawString { @@ -158,10 +175,11 @@ impl RawStr { &self.0 } - /// Returns an object that implements [Display] for printing this [RawStr], - /// given that it is encoded in `encoding`. - pub fn display(&self, encoding: &'static Encoding) -> DisplayRawString { - DisplayRawString(encoding.decode_without_bom_handling(&self.0).0) + pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { + EncodedStr { + bytes: &self.0, + encoding, + } } /// Interprets the raw string's contents as the specified `encoding` and @@ -220,17 +238,6 @@ impl Serialize for RawStr { } } -/// Helper struct for printing [RawStr] with [format!]. -/// -/// Created by [RawStr::display]. -pub struct DisplayRawString<'a>(Cow<'a, str>); - -impl<'a> Display for DisplayRawString<'a> { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", &self.0) - } -} - impl Debug for RawStr { // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 // (actually bytes interpreted as Unicode code points). @@ -240,6 +247,225 @@ impl Debug for RawStr { } } +/// The value of a [Variable](crate::dictionary::Variable), with a string +/// encoding. +#[derive(Clone)] +pub enum EncodedDatum { + /// A numeric value. + Number( + /// A number, or `None` for the system-missing value. + Option, + ), + /// A string value. + String( + /// The value, in the variable's encoding. + EncodedString, + ), +} + +impl EncodedDatum { + /// Constructs a new numerical [EncodedDatum] for the system-missing value. + pub const fn sysmis() -> Self { + Self::Number(None) + } + + /// Returns the number inside this datum, or `None` if this is a string + /// datum.
+ pub fn as_number(&self) -> Option> { + match self { + Self::Number(number) => Some(*number), + Self::String(_) => None, + } + } + + /// Returns the string inside this datum, or `None` if this is a numeric + /// datum. + pub fn as_string(&self) -> Option<&EncodedString> { + match self { + Self::Number(_) => None, + Self::String(s) => Some(s), + } + } + + /// Returns the string inside this datum as a mutable borrow, or `None` if + /// this is a numeric datum. + pub fn as_string_mut(&mut self) -> Option<&mut EncodedString> { + match self { + Self::Number(_) => None, + Self::String(s) => Some(s), + } + } + + /// Resizes this datum to the given `width`. Succeeds, returning `Ok(())`, + /// if and only if this datum and `width` are both string or both numeric + /// and, for string widths, resizing would not drop any non-space + /// characters; otherwise returns `Err(())` without modifying the datum. + pub fn resize(&mut self, width: VarWidth) -> Result<(), ()> { + match (self, width) { + (Self::Number(_), VarWidth::Numeric) => Ok(()), + (Self::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), + _ => Err(()), + } + } + + /// Returns the [VarType] corresponding to this datum. + pub fn var_type(&self) -> VarType { + match self { + Self::Number(_) => VarType::Numeric, + Self::String(_) => VarType::String, + } + } + + /// Returns the [VarWidth] corresponding to this datum. + pub fn width(&self) -> VarWidth { + match self { + Self::Number(_) => VarWidth::Numeric, + Self::String(s) => VarWidth::String(s.len().try_into().unwrap()), + } + } + + /// Compares this datum and `other` for equality, ignoring trailing ASCII + /// spaces in either, if they are both strings, for the purpose of + /// comparison. + pub fn eq_ignore_trailing_spaces<'a>(&self, other: impl Into>) -> bool { + self.borrowed().eq_ignore_trailing_spaces(other.into()) + } + + /// Removes trailing ASCII spaces from this datum, if it is a string.
+ pub fn trim_end(&mut self) { + match self { + Self::Number(_) => (), + Self::String(s) => s.trim_end(), + } + } + + pub fn borrowed<'a>(&'a self) -> EncodedDat<'a> { + match self { + EncodedDatum::Number(number) => EncodedDat::Number(*number), + EncodedDatum::String(encoded_string) => EncodedDat::String(encoded_string.borrowed()), + } + } + + pub fn quoted(&self) -> QuotedEncodedDat<'_> { + self.borrowed().quoted() + } +} + +impl Display for EncodedDatum { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.borrowed()) + } +} + +impl Serialize for EncodedDatum { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + EncodedDatum::Number(number) => number.serialize(serializer), + EncodedDatum::String(encoded_string) => encoded_string.serialize(serializer), + } + } +} + +/// A borrowed [Datum] with a string encoding. +#[derive(Copy, Clone)] +pub enum EncodedDat<'a> { + /// A numeric value. + Number( + /// A number, or `None` for the system-missing value. + Option, + ), + /// A string value. + String( + /// The value, in the variable's encoding. + EncodedStr<'a>, + ), +} + +impl<'a> EncodedDat<'a> { + /// Constructs a new numerical [EncodedDat] for the system-missing value. + pub const fn sysmis() -> Self { + Self::Number(None) + } + + /// Returns the number inside this datum, or `None` if this is a string + /// datum. + pub fn as_number(&self) -> Option> { + match self { + Self::Number(number) => Some(*number), + Self::String(_) => None, + } + } + + /// Returns the string inside this datum, or `None` if this is a numeric + /// datum. + pub fn as_string(&self) -> Option<&EncodedStr> { + match self { + Self::Number(_) => None, + Self::String(s) => Some(s), + } + } + + /// Returns a copy of the borrowed string inside this datum, or `None` if + /// this is a numeric datum.
+ pub fn as_string_mut(&'a mut self) -> Option> { + match self { + Self::Number(_) => None, + Self::String(s) => Some(*s), + } + } + + pub fn eq_ignore_trailing_spaces<'b>(&self, other: EncodedDat<'b>) -> bool { + match (self, other) { + (Self::String(a), EncodedDat::String(b)) => a.eq_ignore_trailing_spaces(b), + _ => *self == other, + } + } + + pub fn quoted(&self) -> QuotedEncodedDat<'a> { + QuotedEncodedDat(*self) + } +} + +pub struct QuotedEncodedDat<'a>(EncodedDat<'a>); + +impl Display for QuotedEncodedDat<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match &self.0 { + EncodedDat::Number(None) => write!(f, "SYSMIS"), + EncodedDat::Number(Some(number)) => number.display_plain().fmt(f), + EncodedDat::String(string) => write!(f, "{}", string.quoted()), + } + } +} + +impl Display for EncodedDat<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Number(None) => write!(f, "SYSMIS"), + Self::Number(Some(number)) => number.display_plain().fmt(f), + Self::String(string) => write!(f, "{string}"), + } + } +} + +impl<'a> PartialEq for EncodedDat<'a> { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::Number(Some(l0)), Self::Number(Some(r0))) => { + OrderedFloat(*l0) == OrderedFloat(*r0) + } + (Self::Number(None), Self::Number(None)) => true, + (Self::String(l0), Self::String(r0)) => l0 == r0, + _ => false, + } + } +} + +impl<'a> Eq for EncodedDat<'a> {} + /// The value of a [Variable](crate::dictionary::Variable). 
#[derive(Clone)] pub enum Datum { @@ -258,9 +484,9 @@ pub enum Datum { impl Debug for Datum { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { match self { - Datum::Number(Some(number)) => write!(f, "{number:?}"), - Datum::Number(None) => write!(f, "SYSMIS"), - Datum::String(s) => write!(f, "{:?}", s), + Self::Number(Some(number)) => write!(f, "{number:?}"), + Self::Number(None) => write!(f, "SYSMIS"), + Self::String(s) => write!(f, "{:?}", s), } } } @@ -271,8 +497,8 @@ impl Serialize for Datum { S: serde::Serializer, { match self { - Datum::Number(number) => number.serialize(serializer), - Datum::String(raw_string) => raw_string.serialize(serializer), + Self::Number(number) => number.serialize(serializer), + Self::String(raw_string) => raw_string.serialize(serializer), } } } @@ -301,15 +527,15 @@ impl PartialOrd for Datum { impl Ord for Datum { fn cmp(&self, other: &Self) -> Ordering { match (self, other) { - (Datum::Number(a), Datum::Number(b)) => match (a, b) { + (Self::Number(a), Self::Number(b)) => match (a, b) { (None, None) => Ordering::Equal, (None, Some(_)) => Ordering::Less, (Some(_), None) => Ordering::Greater, (Some(a), Some(b)) => a.total_cmp(b), }, - (Datum::Number(_), Datum::String(_)) => Ordering::Less, - (Datum::String(_), Datum::Number(_)) => Ordering::Greater, - (Datum::String(a), Datum::String(b)) => a.cmp(b), + (Self::Number(_), Self::String(_)) => Ordering::Less, + (Self::String(_), Self::Number(_)) => Ordering::Greater, + (Self::String(a), Self::String(b)) => a.cmp(b), } } } @@ -317,8 +543,8 @@ impl Ord for Datum { impl Hash for Datum { fn hash(&self, state: &mut H) { match self { - Datum::Number(number) => number.map(OrderedFloat).hash(state), - Datum::String(string) => string.hash(state), + Self::Number(number) => number.map(OrderedFloat).hash(state), + Self::String(string) => string.hash(state), } } } @@ -333,8 +559,8 @@ impl Datum { /// datum. 
pub fn as_number(&self) -> Option> { match self { - Datum::Number(number) => Some(*number), - Datum::String(_) => None, + Self::Number(number) => Some(*number), + Self::String(_) => None, } } @@ -342,8 +568,8 @@ impl Datum { /// datum. pub fn as_string(&self) -> Option<&RawString> { match self { - Datum::Number(_) => None, - Datum::String(s) => Some(s), + Self::Number(_) => None, + Self::String(s) => Some(s), } } @@ -351,8 +577,15 @@ impl Datum { /// this is a numeric datum. pub fn as_string_mut(&mut self) -> Option<&mut RawString> { match self { - Datum::Number(_) => None, - Datum::String(s) => Some(s), + Self::Number(_) => None, + Self::String(s) => Some(s), + } + } + + pub fn as_encoded<'a>(&'a self, encoding: &'static Encoding) -> EncodedDat<'a> { + match self { + Datum::Number(number) => EncodedDat::Number(*number), + Datum::String(raw_string) => EncodedDat::String(raw_string.as_encoded(encoding)), } } @@ -362,26 +595,27 @@ impl Datum { /// non-space characters. pub fn is_resizable(&self, width: VarWidth) -> bool { match (self, width) { - (Datum::Number(_), VarWidth::Numeric) => true, - (Datum::String(s), VarWidth::String(new_width)) => { - let new_len = new_width as usize; - new_len >= s.len() || s.0[new_len..].iter().all(|c| *c == b' ') - } + (Self::Number(_), VarWidth::Numeric) => true, + (Self::String(s), VarWidth::String(new_width)) => s.is_resizable(new_width as usize), _ => false, } } - /// Resizes this datum to the given `width`. - /// - /// # Panic - /// - /// Panics if resizing would change the datum from numeric to string or vice - /// versa. - pub fn resize(&mut self, width: VarWidth) { + /// Resizes this datum to the given `width`. Returns an error, without + /// modifying the datum, if [is_resizable](Self::is_resizable) would return + /// false. 
+ pub fn resize(&mut self, width: VarWidth) -> Result<(), ()> { match (self, width) { - (Datum::Number(_), VarWidth::Numeric) => (), - (Datum::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), - _ => unreachable!(), + (Self::Number(_), VarWidth::Numeric) => Ok(()), + (Self::String(s), VarWidth::String(new_width)) => { + if s.is_resizable(new_width as usize) { + s.resize(new_width as usize); + Ok(()) + } else { + Err(()) + } + } + _ => Err(()), } } @@ -396,8 +630,8 @@ impl Datum { /// Returns the [VarWidth] corresponding to this datum. pub fn width(&self) -> VarWidth { match self { - Datum::Number(_) => VarWidth::Numeric, - Datum::String(s) => VarWidth::String(s.len().try_into().unwrap()), + Self::Number(_) => VarWidth::Numeric, + Self::String(s) => VarWidth::String(s.len().try_into().unwrap()), } } @@ -418,6 +652,13 @@ impl Datum { Self::String(s) => s.trim_end(), } } + + pub fn with_encoding(self, encoding: &'static Encoding) -> EncodedDatum { + match self { + Datum::Number(number) => EncodedDatum::Number(number), + Datum::String(raw_string) => EncodedDatum::String(raw_string.with_encoding(encoding)), + } + } } impl From for Datum { @@ -460,83 +701,103 @@ pub struct Case( /// /// The borrowed form of such a string is [EncodedStr]. #[derive(Clone, Debug)] -pub enum EncodedString { - /// A string in arbitrary encoding. - Encoded { - /// The bytes of the string. - bytes: Vec, - - /// The string's encoding. - /// - /// This can be [UTF_8]. - encoding: &'static Encoding, - }, +pub struct EncodedString { + /// The bytes of the string. + bytes: Vec, - /// A string that is in UTF-8 and known to be valid. - Utf8 { - /// The string. - s: String, - }, + /// The string's encoding. + encoding: &'static Encoding, } impl EncodedString { + pub fn len(&self) -> usize { + self.bytes.len() + } + + /// Returns the bytes in the string, in its encoding. 
+ pub fn as_bytes(&self) -> &[u8] { + &self.bytes + } + + /// Compares this string and `other` for equality, ignoring trailing ASCII + /// spaces in either string for the purpose of comparison. (This is + /// acceptable because we assume that the encoding is ASCII-compatible.) + pub fn eq_ignore_trailing_spaces<'a>(&self, other: impl Into>) -> bool { + self.borrowed().eq_ignore_trailing_spaces(other.into()) + } + + pub fn resize(&mut self, new_len: usize) -> Result<(), ()> { + match new_len.cmp(&self.len()) { + Ordering::Less => { + if !self.as_bytes()[new_len..].iter().all(|b| *b == b' ') { + return Err(()); + } + self.bytes.truncate(new_len); + } + Ordering::Equal => (), + Ordering::Greater => self.bytes.extend((self.len()..new_len).map(|_| b' ')), + } + Ok(()) + } + /// Returns the string's [Encoding]. pub fn encoding(&self) -> &'static Encoding { - match self { - EncodedString::Encoded { encoding, .. } => encoding, - EncodedString::Utf8 { .. } => UTF_8, - } + self.encoding } /// Returns a borrowed form of this string. pub fn borrowed(&self) -> EncodedStr<'_> { - match self { - EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding }, - EncodedString::Utf8 { s } => EncodedStr::Utf8 { s }, - } + EncodedStr::new(&self.bytes, self.encoding) + } + + /// Removes any trailing ASCII spaces. 
+ pub fn trim_end(&mut self) { + while self.bytes.pop_if(|c| *c == b' ').is_some() {} + } +} + +impl<'a> From<&'a EncodedString> for EncodedStr<'a> { + fn from(value: &'a EncodedString) -> Self { + value.borrowed() } } impl<'a> From> for EncodedString { fn from(value: EncodedStr<'a>) -> Self { - match value { - EncodedStr::Encoded { bytes, encoding } => Self::Encoded { - bytes: bytes.into(), - encoding, - }, - EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, + Self { + bytes: value.bytes.into(), + encoding: value.encoding, } } } +impl Serialize for EncodedString { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.borrowed().serialize(serializer) + } +} + /// A borrowed string and its [Encoding]. /// /// The string is not guaranteed to be valid in the encoding. /// /// The owned form of such a string is [EncodedString]. -pub enum EncodedStr<'a> { - /// A string in an arbitrary encoding - Encoded { - /// The bytes of the string. - bytes: &'a [u8], +#[derive(Copy, Clone, PartialEq, Eq)] +pub struct EncodedStr<'a> { + /// The bytes of the string. + bytes: &'a [u8], - /// The string's encoding. - /// - /// This can be [UTF_8]. - encoding: &'static Encoding, - }, - - /// A string in UTF-8 that is known to be valid. - Utf8 { - /// The string. - s: &'a str, - }, + /// The string's encoding. + encoding: &'static Encoding, } impl<'a> EncodedStr<'a> { /// Construct a new string with an arbitrary encoding. pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { - Self::Encoded { bytes, encoding } + Self { bytes, encoding } } /// Returns this string recoded in UTF-8. 
Invalid characters will be @@ -544,20 +805,12 @@ impl<'a> EncodedStr<'a> { /// /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER pub fn as_str(&self) -> Cow<'_, str> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - encoding.decode_without_bom_handling(bytes).0 - } - EncodedStr::Utf8 { s } => Cow::from(*s), - } + self.encoding.decode_without_bom_handling(self.bytes).0 } /// Returns the bytes in the string, in its encoding. pub fn as_bytes(&self) -> &[u8] { - match self { - EncodedStr::Encoded { bytes, .. } => bytes, - EncodedStr::Utf8 { s } => s.as_bytes(), - } + self.bytes } /// Returns this string recoded in `encoding`. Invalid characters will be @@ -565,27 +818,26 @@ impl<'a> EncodedStr<'a> { /// /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - let utf8 = encoding.decode_without_bom_handling(bytes).0; - match encoding.encode(&utf8).0 { - Cow::Borrowed(_) => { - // Recoding into UTF-8 and then back did not change anything. - Cow::from(*bytes) - } - Cow::Owned(owned) => Cow::Owned(owned), - } + let utf8 = self.as_str(); + match encoding.encode(&utf8).0 { + Cow::Borrowed(_) => { + // Recoding into UTF-8 and then back did not change anything. + Cow::from(self.bytes) } - EncodedStr::Utf8 { s } => encoding.encode(s).0, + Cow::Owned(owned) => Cow::Owned(owned), } } /// Returns true if this string is empty. pub fn is_empty(&self) -> bool { - match self { - EncodedStr::Encoded { bytes, .. } => bytes.is_empty(), - EncodedStr::Utf8 { s } => s.is_empty(), - } + self.bytes.is_empty() + } + + pub fn eq_ignore_trailing_spaces<'b>(&self, other: EncodedStr<'b>) -> bool { + self.bytes.iter().zip_longest(other.bytes).all(|elem| { + let (left, right) = elem.or(&b' ', &b' '); + *left == *right + }) } /// Returns a helper for displaying this string in double quotes. 
@@ -594,15 +846,39 @@ impl<'a> EncodedStr<'a> { } } +impl<'a> Display for EncodedStr<'a> { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl<'a> Debug for EncodedStr<'a> { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "{:?}", self.as_str()) + } +} + impl<'a> From<&'a str> for EncodedStr<'a> { fn from(s: &'a str) -> Self { - Self::Utf8 { s } + Self { + bytes: s.as_bytes(), + encoding: UTF_8, + } } } impl<'a> From<&'a String> for EncodedStr<'a> { fn from(s: &'a String) -> Self { - Self::Utf8 { s: s.as_str() } + Self::from(s.as_str()) + } +} + +impl<'a> Serialize for EncodedStr<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.as_str().serialize(serializer) } } diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index e1a98d6afb..d6b2015909 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -40,7 +40,7 @@ use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ - data::Datum, + data::{Datum, EncodedDat, EncodedDatum}, format::{DisplayPlain, Format}, identifier::{ByIdentifier, HasIdentifier, Identifier}, output::pivot::{ @@ -846,12 +846,7 @@ impl<'a> OutputVariables<'a> { Some(Value::new_user_text(variable.write_format.to_string())) } VariableField::MissingValues if !variable.missing_values.is_empty() => { - Some(Value::new_user_text( - variable - .missing_values - .display(variable.encoding) - .to_string(), - )) + Some(Value::new_user_text(variable.missing_values.to_string())) } VariableField::MissingValues => None, } @@ -893,7 +888,10 @@ impl<'a> OutputValueLabels<'a> { for (datum, label) in sorted_value_labels { let mut value = Value::new_variable_value(variable, datum) .with_show_value_label(Some(Show::Value)); - if variable.missing_values.contains(datum) { + if variable + .missing_values + .contains(datum.as_encoded(variable.encoding())) + { value.add_footnote(&missing_footnote); } 
group.push(value); @@ -1391,17 +1389,9 @@ impl Variable { } pub fn resize(&mut self, width: VarWidth) { - if self.missing_values.is_resizable(width) { - self.missing_values.resize(width); - } else { - self.missing_values = MissingValues::default(); - } + let _ = self.missing_values.resize(width); - if self.value_labels.is_resizable(width) { - self.value_labels.resize(width); - } else { - self.value_labels = ValueLabels::default(); - } + self.value_labels.resize(width); self.print_format.resize(width); self.write_format.resize(width); @@ -1937,10 +1927,7 @@ impl ValueLabels { self.0 = self .0 .drain() - .map(|(mut datum, string)| { - datum.resize(width); - (datum, string) - }) + .filter_map(|(mut datum, string)| datum.resize(width).is_ok().then(|| (datum, string))) .collect(); } } @@ -1967,7 +1954,7 @@ impl Hash for ValueLabels { #[derive(Clone, Default, Serialize)] pub struct MissingValues { /// Individual missing values, up to 3 of them. - values: Vec, + values: Vec, /// Optional range of missing values. 
range: Option, @@ -1975,11 +1962,30 @@ pub struct MissingValues { impl Debug for MissingValues { fn fmt(&self, f: &mut Formatter) -> FmtResult { - DisplayMissingValues { - mv: self, - encoding: None, + write!(f, "{}", self) + } +} + +impl Display for MissingValues { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + if let Some(range) = &self.range { + write!(f, "{range}")?; + if !self.values.is_empty() { + write!(f, "; ")?; + } } - .fmt(f) + + for (i, value) in self.values.iter().enumerate() { + if i > 0 { + write!(f, "; ")?; + } + write!(f, "{}", value.quoted())?; + } + + if self.is_empty() { + write!(f, "none")?; + } + Ok(()) } } @@ -1991,7 +1997,10 @@ pub enum MissingValuesError { } impl MissingValues { - pub fn values(&self) -> &[Datum] { + pub fn clear(&mut self) { + *self = Self::default(); + } + pub fn values(&self) -> &[EncodedDatum] { &self.values } @@ -2000,7 +2009,7 @@ impl MissingValues { } pub fn new( - mut values: Vec, + mut values: Vec, range: Option, ) -> Result { if values.len() > 3 { @@ -2010,9 +2019,8 @@ impl MissingValues { let mut var_type = None; for value in values.iter_mut() { value.trim_end(); - match value.width() { - VarWidth::String(w) if w > 8 => return Err(MissingValuesError::TooWide), - _ => (), + if value.width().is_long_string() { + return Err(MissingValuesError::TooWide); } if var_type.is_some_and(|t| t != value.var_type()) { return Err(MissingValuesError::MixedTypes); @@ -2041,7 +2049,7 @@ impl MissingValues { } } - pub fn contains(&self, value: &Datum) -> bool { + pub fn contains(&self, value: EncodedDat) -> bool { if self .values .iter() @@ -2050,66 +2058,30 @@ impl MissingValues { return true; } - match value { - Datum::Number(Some(number)) => self.range.is_some_and(|range| range.contains(*number)), - _ => false, - } - } - - pub fn is_resizable(&self, width: VarWidth) -> bool { - self.values.iter().all(|datum| datum.is_resizable(width)) - && self.range.iter().all(|range| range.is_resizable(width)) - } - - pub fn 
resize(&mut self, width: VarWidth) { - for datum in &mut self.values { - datum.resize(width); - } - if let Some(range) = &mut self.range { - range.resize(width); - } - } - - pub fn display(&self, encoding: &'static Encoding) -> DisplayMissingValues<'_> { - DisplayMissingValues { - mv: self, - encoding: Some(encoding), + if let EncodedDat::Number(Some(number)) = value + && let Some(range) = self.range + { + range.contains(number) + } else { + false } } -} - -pub struct DisplayMissingValues<'a> { - mv: &'a MissingValues, - encoding: Option<&'static Encoding>, -} -impl<'a> Display for DisplayMissingValues<'a> { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - if let Some(range) = &self.mv.range { - write!(f, "{range}")?; - if !self.mv.values.is_empty() { - write!(f, "; ")?; + pub fn resize(&mut self, width: VarWidth) -> Result<(), ()> { + fn inner(this: &mut MissingValues, width: VarWidth) -> Result<(), ()> { + for datum in &mut this.values { + datum.resize(width)?; } - } - - for (i, value) in self.mv.values.iter().enumerate() { - if i > 0 { - write!(f, "; ")?; - } - match self.encoding { - Some(encoding) => value.display_plain(encoding).fmt(f)?, - None => value.fmt(f)?, + if let Some(range) = &mut this.range { + range.resize(width)?; } + Ok(()) } - - if self.mv.is_empty() { - write!(f, "none")?; - } - Ok(()) + inner(self, width).inspect_err(|_| self.clear()) } } -#[derive(Copy, Clone, Serialize)] +#[derive(Copy, Clone, Debug, Serialize)] pub enum MissingValueRange { In { low: f64, high: f64 }, From { low: f64 }, @@ -2148,12 +2120,12 @@ impl MissingValueRange { } } - pub fn is_resizable(&self, width: VarWidth) -> bool { - width.is_numeric() - } - - pub fn resize(&self, width: VarWidth) { - assert_eq!(width, VarWidth::Numeric); + pub fn resize(&self, width: VarWidth) -> Result<(), ()> { + if width.is_numeric() { + Ok(()) + } else { + Err(()) + } } } diff --git a/rust/pspp/src/format/display/mod.rs b/rust/pspp/src/format/display/mod.rs index 
755b6bdfc0..3a956ee84e 100644 --- a/rust/pspp/src/format/display/mod.rs +++ b/rust/pspp/src/format/display/mod.rs @@ -29,7 +29,7 @@ use smallvec::{Array, SmallVec}; use crate::{ calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name}, - data::Datum, + data::{Datum, QuotedEncodedDat}, endian::{endian_to_smallvec, ToBytes}, format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, @@ -93,43 +93,8 @@ impl Datum { DisplayDatum::new(format, self, encoding) } - pub fn display_plain(&self, encoding: &'static Encoding) -> DisplayDatumPlain { - DisplayDatumPlain { - datum: self, - encoding, - quote_strings: true, - } - } -} - -pub struct DisplayDatumPlain<'a> { - datum: &'a Datum, - encoding: &'static Encoding, - quote_strings: bool, -} - -impl DisplayDatumPlain<'_> { - pub fn without_quotes(self) -> Self { - Self { - quote_strings: false, - ..self - } - } -} - -impl Display for DisplayDatumPlain<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self.datum { - Datum::Number(None) => write!(f, "SYSMIS"), - Datum::Number(Some(number)) => number.display_plain().fmt(f), - Datum::String(string) => { - if self.quote_strings { - write!(f, "\"{}\"", string.display(self.encoding)) - } else { - string.display(self.encoding).fmt(f) - } - } - } + pub fn display_plain(&self, encoding: &'static Encoding) -> QuotedEncodedDat<'_> { + self.as_encoded(encoding).quoted() } } diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 4c51c9cf9e..db67102f9d 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -26,7 +26,7 @@ use std::{ use crate::{ calendar::date_time_to_pspp, crypto::EncryptedFile, - data::{Datum, RawString}, + data::{Datum, EncodedDatum, RawString}, dictionary::{ DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseType, 
VarWidth, Variable, @@ -913,7 +913,7 @@ impl Records { variable.label = input.label.clone(); - variable.missing_values = input.missing_values.clone(); + variable.missing_values = input.missing_values.decode(encoding).unwrap(); variable.print_format = decode_format( input.print_format, @@ -1257,7 +1257,7 @@ impl Records { .map(|v| { let mut value = RawString::from(v.0.as_slice()); value.resize(variable.width.as_string_width().unwrap()); - Datum::String(value) + EncodedDatum::String(value.with_encoding(encoding)) }) .collect::>(); match MissingValues::new(values, None) { diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index c6353d78e8..e46e86e2a0 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -14,8 +14,8 @@ use std::{ use crate::{ data::{Datum, RawString}, dictionary::{ - Alignment, Attributes, CategoryLabels, Measure, MissingValueRange, MissingValues, VarType, - VarWidth, + Alignment, Attributes, CategoryLabels, Measure, MissingValueRange, MissingValues, + MissingValuesError, VarType, VarWidth, }, endian::{Endian, Parse}, format::{DisplayPlainF64, Format, Type}, @@ -31,6 +31,7 @@ use crate::{ use binrw::{binrw, BinRead, BinWrite, Error as BinError}; use clap::ValueEnum; +use encoding_rs::Encoding; use itertools::Itertools; use serde::{ser::SerializeTuple, Serialize, Serializer}; use thiserror::Error as ThisError; @@ -351,7 +352,20 @@ fn format_name(type_: u32) -> Cow<'static, str> { .into() } -impl MissingValues { +#[derive(Clone, Debug, Default, Serialize)] +pub struct RawMissingValues { + /// Individual missing values, up to 3 of them. + pub values: Vec, + + /// Optional range of missing values. 
+ pub range: Option, +} + +impl RawMissingValues { + pub fn new(values: Vec, range: Option) -> Self { + Self { values, range } + } + fn read( r: &mut R, offsets: Range, @@ -430,7 +444,7 @@ impl MissingValues { let range = range.map(|(low, high)| { MissingValueRange::new(endian.parse(low), endian.parse(high)) }); - return Ok(Self::new(values, range).unwrap()); + return Ok(Self::new(values, range)); } Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::new( Some(offsets), @@ -442,7 +456,7 @@ impl MissingValues { .into_iter() .map(|value| Datum::String(RawString::from(&value[..width]))) .collect(); - return Ok(Self::new(values, None).unwrap()); + return Ok(Self::new(values, None)); } Err(()) => warn(Warning::new( Some(offsets), @@ -451,6 +465,16 @@ impl MissingValues { } Ok(Self::default()) } + + pub fn decode(&self, encoding: &'static Encoding) -> Result { + MissingValues::new( + self.values + .iter() + .map(|datum| datum.clone().with_encoding(encoding)) + .collect(), + self.range, + ) + } } /// Warning for a variable record. @@ -487,7 +511,7 @@ where pub write_format: RawFormat, /// Missing values. - pub missing_values: MissingValues, + pub missing_values: RawMissingValues, /// Optional variable label. 
pub label: Option, @@ -564,7 +588,7 @@ impl VariableRecord { } }; - let missing_values = MissingValues::read( + let missing_values = RawMissingValues::read( r, offsets, width, diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index eed16fca49..f7c38ed58d 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -17,7 +17,7 @@ use itertools::zip_eq; use smallvec::SmallVec; use crate::{ - data::Datum, + data::{Datum, EncodedDatum}, dictionary::{ Alignment, Attributes, CategoryLabels, Dictionary, Measure, MultipleResponseType, ValueLabels, VarWidth, @@ -579,7 +579,7 @@ where for value in variable.missing_values.values() { let value = value.as_string().unwrap(); - value.0[..8].write_le(&mut cursor).unwrap(); + value.as_bytes()[..8].write_le(&mut cursor).unwrap(); } } self.write_bytes_record(22, &body) @@ -690,6 +690,22 @@ impl BinWrite for Datum { } } +impl BinWrite for EncodedDatum { + type Args<'a> = (); + + fn write_options( + &self, + writer: &mut W, + endian: binrw::Endian, + _: (), + ) -> binrw::BinResult<()> { + match self { + Self::Number(number) => number.unwrap_or(f64::MIN).write_options(writer, endian, ()), + Self::String(raw_string) => raw_string.as_bytes().write_options(writer, endian, ()), + } + } +} + #[derive(Debug)] struct StringSegment { data_bytes: usize, -- 2.30.2