From bb1170a6e6eafe98a1c673614afe9b9f069e0f53 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 29 Jul 2025 08:27:52 -0700 Subject: [PATCH] passes! --- rust/pspp/src/data.rs | 149 ++++++++++++++++++++++++--- rust/pspp/src/dictionary.rs | 15 +-- rust/pspp/src/format/display/mod.rs | 53 +++++++--- rust/pspp/src/format/display/test.rs | 10 +- rust/pspp/src/main.rs | 2 +- rust/pspp/src/output/pivot/mod.rs | 15 ++- rust/pspp/src/sys/cooked.rs | 4 +- rust/pspp/src/sys/write.rs | 58 +++++------ 8 files changed, 224 insertions(+), 82 deletions(-) diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index a16691d0b4..f047ce93b3 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -274,10 +274,7 @@ where } mod encoded; -pub use encoded::{ - BorrowedEncodedDatum, BorrowedEncodedString, EncodedDatum, EncodedString, OwnedEncodedDatum, - OwnedEncodedString, QuotedEncodedDatum, -}; +pub use encoded::{BorrowedEncodedString, EncodedString, OwnedEncodedString}; /// A [Datum] that owns its string data (if any). pub type OwnedDatum = Datum; @@ -478,17 +475,141 @@ where } } - pub fn as_encoded<'a>(&'a self, encoding: &'static Encoding) -> BorrowedEncodedDatum<'a> { + pub fn as_encoded<'a>( + &'a self, + encoding: &'static Encoding, + ) -> Datum> { match self { - Datum::Number(number) => EncodedDatum::Number(*number), - Datum::String(raw_string) => EncodedDatum::String(EncodedString { - encoding, + Datum::Number(number) => Datum::Number(*number), + Datum::String(raw_string) => Datum::String(EncodedString { raw: raw_string.borrow(), + encoding, }), } } } +impl Datum { + pub fn borrowed(&self) -> BorrowedDatum { + match self { + Datum::Number(number) => Datum::Number(*number), + Datum::String(string) => Datum::String(Borrow::borrow(string)), + } + } +} + +impl<'a> Datum<&'a BorrowedRawString> { + pub fn borrowed(&self) -> BorrowedDatum { + self.clone() + } +} + +impl Datum { + pub fn borrowed<'a>(&'a self) -> Datum> { + match self { + Datum::Number(number) => Datum::Number(*number), + Datum::String(string) => Datum::String(string.borrowed()), + } + } +} + +impl<'a> Datum> { + pub fn borrowed(&self) -> Datum> { + self.clone() + } +} + +impl Datum +where + D: BorrowString, +{ + pub fn borrowed_string<'a>(&'a self) -> Datum> { + match self { + Datum::Number(number) => Datum::Number(*number), + Datum::String(string) => Datum::String(string.borrow_string()), + } + } +} + +pub trait BorrowString { + type Borrowed<'a> + where + Self: 'a; + fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a>; +} + +impl BorrowString for OwnedRawString { + type Borrowed<'a> = &'a BorrowedRawString; + fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a> { + BorrowedRawString::new(&self.0) + } +} + +impl BorrowString for BorrowedRawString { + type Borrowed<'a> = &'a BorrowedRawString; + fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a> { + self.clone() + } +} + +impl BorrowString for OwnedEncodedString { + type Borrowed<'a> = BorrowedEncodedString<'a>; + fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a> { + BorrowedEncodedString::new(self.raw.borrowed(), self.encoding) + } +} + +impl<'b> BorrowString for BorrowedEncodedString<'b> { + type Borrowed<'a> + = BorrowedEncodedString<'b> + where + Self: 'a; + + fn borrow_string<'a>(&'a self) -> Self::Borrowed<'a> { + self.clone() + } +} + +pub trait AsEncodedString: Borrow { + fn as_encoded_string<'a>(&'a self) -> BorrowedEncodedString<'a>; +} + +impl AsEncodedString for OwnedEncodedString { + fn as_encoded_string<'a>(&'a self) -> BorrowedEncodedString<'a> { + self.borrowed() + } +} + +impl<'b> AsEncodedString for BorrowedEncodedString<'b> { + fn as_encoded_string<'a>(&'a self) -> BorrowedEncodedString<'a> { + self.clone() + } +} + +impl Datum +where + B: AsEncodedString, +{ + pub fn quoted<'a>(&'a self) -> QuotedDatum<'a, B> { + QuotedDatum(self) + } +} + +pub struct QuotedDatum<'a, B>(&'a Datum); + +impl<'a, B> Display for QuotedDatum<'a, B> +where + B: AsEncodedString, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match &self.0 { + Datum::Number(None) => write!(f, "SYSMIS"), + Datum::Number(Some(number)) => number.display_plain().fmt(f), + Datum::String(string) => write!(f, "\"{}\"", string.as_encoded_string().as_str()), + } + } +} + impl Datum where B: BorrowMut, @@ -528,10 +649,10 @@ where } impl Datum { - pub fn with_encoding(self, encoding: &'static Encoding) -> OwnedEncodedDatum { + pub fn with_encoding(self, encoding: &'static Encoding) -> Datum { match self { - Datum::Number(number) => EncodedDatum::Number(number), - Datum::String(raw_string) => EncodedDatum::String(raw_string.with_encoding(encoding)), + Datum::Number(number) => Datum::Number(number), + Datum::String(raw_string) => Datum::String(raw_string.with_encoding(encoding)), } } } @@ -609,13 +730,13 @@ impl Case where B: Borrow<[Datum]>, { - fn len(&self) -> usize { + pub fn len(&self) -> usize { self.data.borrow().len() } } impl IntoIterator for Case>> { - type Item = OwnedEncodedDatum; + type Item = Datum; type IntoIter = CaseVecIter; @@ -633,7 +754,7 @@ pub struct CaseVecIter { } impl Iterator for CaseVecIter { - type Item = OwnedEncodedDatum; + type Item = Datum; fn next(&mut self) -> Option { self.iter diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 9ddd5fde42..0198c3bb7e 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -40,7 +40,7 @@ use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ - data::{BorrowedEncodedDatum, Datum, OwnedEncodedDatum, OwnedRawString}, + data::{AsEncodedString, Datum, OwnedEncodedString, OwnedRawString}, format::{DisplayPlain, Format}, identifier::{ByIdentifier, HasIdentifier, Identifier}, output::pivot::{ @@ -890,7 +890,7 @@ impl<'a> OutputValueLabels<'a> { .with_show_value_label(Some(Show::Value)); if variable .missing_values - .contains(datum.as_encoded(variable.encoding())) + .contains(&datum.as_encoded(variable.encoding())) { value.add_footnote(&missing_footnote); } @@ -1957,7 +1957,7 @@ impl Hash for ValueLabels { #[derive(Clone, Default, Serialize)] pub struct MissingValues { /// Individual missing values, up to 3 of them. - values: Vec, + values: Vec>, /// Optional range of missing values. range: Option, @@ -2003,7 +2003,7 @@ impl MissingValues { pub fn clear(&mut self) { *self = Self::default(); } - pub fn values(&self) -> &[OwnedEncodedDatum] { + pub fn values(&self) -> &[Datum] { &self.values } @@ -2012,7 +2012,7 @@ impl MissingValues { } pub fn new( - mut values: Vec, + mut values: Vec>, range: Option, ) -> Result { if values.len() > 3 { @@ -2052,7 +2052,10 @@ impl MissingValues { } } - pub fn contains(&self, value: BorrowedEncodedDatum<'_>) -> bool { + pub fn contains(&self, value: &Datum) -> bool + where + S: AsEncodedString, + { if self .values .iter() diff --git a/rust/pspp/src/format/display/mod.rs b/rust/pspp/src/format/display/mod.rs index 236ada8b8a..cf3ae72114 100644 --- a/rust/pspp/src/format/display/mod.rs +++ b/rust/pspp/src/format/display/mod.rs @@ -15,7 +15,6 @@ // this program. If not, see . use std::{ - borrow::Borrow, cmp::min, fmt::{Display, Error as FmtError, Formatter, Result as FmtResult, Write as _}, io::{Error as IoError, Write as IoWrite}, @@ -30,19 +29,17 @@ use smallvec::{Array, SmallVec}; use crate::{ calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name}, - data::{ - BorrowedEncodedDatum, BorrowedRawString, EncodedDatum, EncodedString, QuotedEncodedDatum, - }, + data::{AsEncodedString, BorrowString, Datum, QuotedDatum}, endian::{endian_to_smallvec, ToBytes}, format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, }; -pub struct DisplayDatum<'a, 'b> { +pub struct DisplayDatum<'b, B> { format: Format, settings: &'b Settings, endian: EndianSettings, - datum: BorrowedEncodedDatum<'a>, + datum: Datum, /// If true, the output will remove leading and trailing spaces from numeric /// values, and trailing spaces from string values. (This might make the @@ -85,6 +82,7 @@ impl Display for DisplayPlainF64 { } } +/* impl EncodedDatum> where R: Borrow, @@ -100,20 +98,44 @@ where pub fn display_plain(&self) -> QuotedEncodedDatum<'_> { self.quoted() } +}*/ + +impl<'a, D> Datum +where + D: AsEncodedString + BorrowString, +{ + /// Returns an object that implements [Display] for printing this + /// [EncodedDatum] as `format`. + /// + /// [Display]: std::fmt::Display + pub fn display(&'a self, format: Format) -> DisplayDatum<'a, D::Borrowed<'a>> + where + D::Borrowed<'a>: AsEncodedString, + { + DisplayDatum::new(format, self.borrowed_string()) + } + + pub fn display_plain(&self) -> QuotedDatum<'_, D> { + self.quoted() + } } -impl Display for DisplayDatum<'_, '_> { +impl<'a, 'b, B> Display for DisplayDatum<'b, B> +where + B: AsEncodedString, +{ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let number = match self.datum { - EncodedDatum::Number(number) => number, - EncodedDatum::String(string) => { + let number = match &self.datum { + Datum::Number(number) => *number, + Datum::String(string) => { if self.format.type_() == Type::AHex { - for byte in string.as_bytes() { + for byte in string.as_encoded_string().as_bytes() { write!(f, "{byte:02x}")?; } } else { let quote = if self.quote_strings { "\"" } else { "" }; - let s = string.as_str(); + let s = string.as_encoded_string(); + let s = s.as_str(); let s = if self.trim_spaces { s.trim_end_matches(' ') } else { @@ -164,8 +186,11 @@ impl Display for DisplayDatum<'_, '_> { } } -impl<'a, 'b> DisplayDatum<'a, 'b> { - pub fn new(format: Format, datum: BorrowedEncodedDatum<'a>) -> Self { +impl<'b, B> DisplayDatum<'b, B> +where + B: AsEncodedString, +{ + pub fn new(format: Format, datum: Datum) -> Self { let settings = PsppSettings::global(); Self { format, diff --git a/rust/pspp/src/format/display/test.rs b/rust/pspp/src/format/display/test.rs index c65697f9cd..5ff3f00165 100644 --- a/rust/pspp/src/format/display/test.rs +++ b/rust/pspp/src/format/display/test.rs @@ -23,7 +23,7 @@ use smallstr::SmallString; use smallvec::SmallVec; use crate::{ - data::OwnedEncodedDatum, + data::{Datum, OwnedEncodedString}, endian::Endian, format::{AbstractFormat, Epoch, Format, Settings, Type, UncheckedFormat, CC}, lex::{scan::StringScanner, segment::Syntax, Punct, Token}, @@ -75,7 +75,7 @@ fn test(name: &str) { let format: Format = format.try_into().unwrap(); assert_eq!(tokens.get(1), Some(&Token::Punct(Punct::Colon))); let expected = tokens[2].as_string().unwrap(); - let actual = OwnedEncodedDatum::Number(value) + let actual = Datum::::Number(value) .display(format) .with_settings(&settings) .with_endian(endian) @@ -183,7 +183,7 @@ fn leading_zeros() { } fn test_with_settings(value: f64, expected: [&str; 2], settings: &Settings) { - let value = OwnedEncodedDatum::from(value); + let value = Datum::::from(value); for (expected, d) in expected.into_iter().zip([2, 1].into_iter()) { assert_eq!( &value @@ -214,7 +214,7 @@ fn leading_zeros() { fn non_ascii_cc() { fn test(settings: &Settings, value: f64, expected: &str) { assert_eq!( - &OwnedEncodedDatum::from(value) + &Datum::::from(value) .display(Format::new(Type::CC(CC::A), 10, 2).unwrap()) .with_settings(settings) .to_string(), @@ -266,7 +266,7 @@ fn test_binhex(name: &str) { assert_eq!(tokens.get(1), Some(&Token::Punct(Punct::Colon))); let expected = tokens[2].as_string().unwrap(); let mut actual = SmallVec::<[u8; 16]>::new(); - OwnedEncodedDatum::Number(value) + Datum::::Number(value) .display(format) .with_endian(endian) .write(&mut actual, UTF_8) diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index 2104cbfa20..8025c6a626 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -172,7 +172,7 @@ impl Convert { .with_compression(self.sys_options.compression) .write_file(&dictionary, output)?; for case in cases { - output.write_case(case?.into_iter().map(|datum| datum.into_raw()))?; + output.write_case(case?)?; } } } diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index f55df6c141..75cc9e3235 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -43,7 +43,6 @@ //! could also be a variable name or an arbitrary text string. use std::{ - borrow::Borrow, collections::HashMap, fmt::{Debug, Display, Write}, io::Read, @@ -68,9 +67,7 @@ use thiserror::Error as ThisError; use tlo::parse_tlo; use crate::{ - data::{ - BorrowedRawString, Datum, EncodedDatum, EncodedString, OwnedEncodedDatum, OwnedRawString, - }, + data::{AsEncodedString, Datum, OwnedEncodedString, OwnedRawString}, dictionary::{VarType, Variable}, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, settings::{Settings, Show}, @@ -1862,13 +1859,13 @@ impl Value { variable_label: variable.label.clone(), })) } - pub fn new_datum(value: &EncodedDatum>) -> Self + pub fn new_datum(value: &Datum) -> Self where - R: Borrow, + B: AsEncodedString, { match value { - EncodedDatum::Number(number) => Self::new_number(*number), - EncodedDatum::String(string) => Self::new_user_text(string.as_str()), + Datum::Number(number) => Self::new_number(*number), + Datum::String(string) => Self::new_user_text(string.as_encoded_string().as_str()), } } pub fn new_variable_value(variable: &Variable, value: &Datum) -> Self { @@ -2214,7 +2211,7 @@ impl Display for DisplayValue<'_> { write!( &mut buf, "{}", - OwnedEncodedDatum::Number(*value).display(format) + Datum::::Number(*value).display(format) ) .unwrap(); write!(f, "{}", buf.trim_start_matches(' '))?; diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index d256183bb2..448f2471d5 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -26,7 +26,7 @@ use std::{ use crate::{ calendar::date_time_to_pspp, crypto::EncryptedFile, - data::{Case, Datum, EncodedDatum, OwnedRawString}, + data::{Case, Datum, OwnedRawString}, dictionary::{ DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseType, VarWidth, Variable, @@ -1258,7 +1258,7 @@ impl Records { .map(|v| { let mut value = OwnedRawString::from(v.0.as_slice()); value.resize(variable.width.as_string_width().unwrap()); - EncodedDatum::String(value.with_encoding(encoding)) + Datum::String(value.with_encoding(encoding)) }) .collect::>(); match MissingValues::new(values, None) { diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index 7a808e6996..8fa7d11410 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -1,5 +1,5 @@ use std::{ - borrow::Cow, + borrow::{Borrow, Cow}, collections::HashMap, fmt::Write as _, fs::File, @@ -17,7 +17,7 @@ use itertools::zip_eq; use smallvec::SmallVec; use crate::{ - data::{Datum, OwnedEncodedDatum, OwnedRawString}, + data::{BorrowedRawString, Datum}, dictionary::{ Alignment, Attributes, CategoryLabels, Dictionary, Measure, MultipleResponseType, ValueLabels, VarWidth, @@ -674,7 +674,10 @@ impl BinWrite for Pad { } } -impl BinWrite for Datum { +impl BinWrite for Datum +where + B: Borrow, +{ type Args<'a> = (); fn write_options( @@ -685,23 +688,7 @@ impl BinWrite for Datum { ) -> binrw::BinResult<()> { match self { Datum::Number(number) => number.unwrap_or(f64::MIN).write_options(writer, endian, ()), - Datum::String(raw_string) => raw_string.0.write_options(writer, endian, ()), - } - } -} - -impl BinWrite for OwnedEncodedDatum { - type Args<'a> = (); - - fn write_options( - &self, - writer: &mut W, - endian: binrw::Endian, - _: (), - ) -> binrw::BinResult<()> { - match self { - Self::Number(number) => number.unwrap_or(f64::MIN).write_options(writer, endian, ()), - Self::String(raw_string) => raw_string.as_bytes().write_options(writer, endian, ()), + Datum::String(raw_string) => raw_string.borrow().0.write_options(writer, endian, ()), } } } @@ -840,10 +827,13 @@ where Ok(()) } - fn write_case_uncompressed<'c>( + fn write_case_uncompressed<'c, B>( &mut self, - case: impl Iterator>, - ) -> Result<(), BinError> { + case: impl Iterator>, + ) -> Result<(), BinError> + where + B: Borrow, + { for (var, datum) in zip_eq(self.case_vars, case) { match var { CaseVar::Numeric => datum @@ -852,7 +842,7 @@ where .unwrap_or(f64::MIN) .write_le(&mut self.inner)?, CaseVar::String(encoding) => { - let mut s = datum.as_string().unwrap().as_bytes(); + let mut s = datum.as_string().unwrap().borrow().as_bytes(); for segment in encoding { let data; (data, s) = s.split_at(segment.data_bytes); @@ -863,10 +853,13 @@ where } Ok(()) } - fn write_case_compressed<'c>( + fn write_case_compressed<'c, B>( &mut self, - case: impl Iterator>, - ) -> Result<(), BinError> { + case: impl Iterator>, + ) -> Result<(), BinError> + where + B: Borrow, + { for (var, datum) in zip_eq(self.case_vars, case) { match var { CaseVar::Numeric => match datum.as_number().unwrap() { @@ -885,7 +878,7 @@ where }, CaseVar::String(encoding) => { - let mut s = datum.as_string().unwrap().as_bytes(); + let mut s = datum.as_string().unwrap().borrow().as_bytes(); for segment in encoding { let data; (data, s) = s.split_at(segment.data_bytes); @@ -989,10 +982,13 @@ where /// # Panic /// /// Panics if [try_finish](Self::try_finish) has been called. - pub fn write_case<'a>( + pub fn write_case<'a, B>( &mut self, - case: impl IntoIterator>, - ) -> Result<(), BinError> { + case: impl IntoIterator>, + ) -> Result<(), BinError> + where + B: Borrow, + { match self.inner.as_mut().unwrap() { Either::Left(inner) => { let mut inner = -- 2.30.2