From: Ben Pfaff Date: Sun, 10 Aug 2025 03:38:36 +0000 (-0700) Subject: codepage_to_unicode tests pass X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b7a17b49dc108a1d5ae97517878dfce55da16b4c;p=pspp codepage_to_unicode tests pass --- diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index f17441070e..7b1364dfb9 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -44,8 +44,8 @@ use serde::{ }; use crate::{ - variable::{VarType, VarWidth}, format::DisplayPlain, + variable::{VarType, VarWidth}, }; pub trait RawString: Debug + PartialEq + Eq + PartialOrd + Ord + Hash { @@ -93,6 +93,14 @@ pub trait RawString: Debug + PartialEq + Eq + PartialOrd + Ord + Hash { ByteStr(self.raw_string_bytes()) } + fn without_trailing_spaces(&self) -> ByteStr<'_> { + let mut raw = self.raw_string_bytes(); + while let Some(trimmed) = raw.strip_suffix(b" ") { + raw = trimmed; + } + ByteStr(raw) + } + fn as_encoded(&self, encoding: &'static Encoding) -> WithEncoding> where Self: Sized, diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index d548c9b4f7..d7db117319 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -35,7 +35,7 @@ use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ - data::{ByteString, Datum}, + data::{ByteString, Datum, RawString}, identifier::{ByIdentifier, HasIdentifier, Identifier}, output::pivot::{ Axis3, Dimension, Display26Adic, Footnote, Footnotes, Group, PivotTable, Value, @@ -86,7 +86,7 @@ pub struct Dictionary { pub attributes: Attributes, /// Multiple response sets. - pub mrsets: BTreeSet>, + mrsets: BTreeSet>, /// Variable sets. /// @@ -170,9 +170,12 @@ pub struct InvalidWeightVariable; pub struct InvalidFilterVariable; /// Invalid dictionary index. -#[derive(Debug, ThisError)] -#[error("Invalid dictionary index.")] -pub struct DictIndexError; +#[derive(Debug, Clone, ThisError)] +#[error("Invalid index {index} in dictionary with {n} variables.")] +pub struct DictIndexError { + index: usize, + n: usize, +} impl Dictionary { /// Creates a new, empty dictionary with the specified `encoding`. @@ -274,6 +277,10 @@ impl Dictionary { MultipleResponseSets::new(self) } + pub fn mrsets_mut(&mut self) -> MultipleResponseSetsMut<'_> { + MultipleResponseSetsMut::new(self) + } + pub fn variable_sets(&self) -> VariableSets<'_> { VariableSets::new(self) } @@ -647,17 +654,17 @@ impl Dictionary { } self.variables = variables; - let mut vectors = HashSet::new(); let mut index = 0; - for mut vector in self.vectors.drain() { + let mut vectors = self.vectors.drain().collect::>(); + vectors.sort(); + for mut vector in vectors { vector.codepage_to_unicode(); - while vectors.contains(&vector) { + while self.vectors.contains(&vector) { index += 1; vector.name = Identifier::new(format!("Vec{index}")).unwrap(); } - vectors.insert(vector); + self.vectors.insert(vector); } - self.vectors = vectors; self.attributes.codepage_to_unicode(); @@ -667,7 +674,7 @@ impl Dictionary { mrset.codepage_to_unicode(); while mrsets.contains(&mrset) { index += 1; - mrset.name = Identifier::new(format!("Mr{index}")).unwrap(); + mrset.name = Identifier::new(format!("MrSet{index}")).unwrap(); } mrsets.insert(mrset); } @@ -1105,14 +1112,13 @@ impl<'a> MappedVariables<'a> { dictionary: &'a Dictionary, dict_indexes: &'a [DictIndex], ) -> Result { - if dict_indexes - .iter() - .all(|dict_index| *dict_index < dictionary.variables.len()) - { - Ok(Self::new_unchecked(dictionary, dict_indexes)) - } else { - Err(DictIndexError) + let n = dictionary.variables.len(); + for index in dict_indexes.iter().copied() { + if index >= n { + return Err(DictIndexError { index, n }); + } } + Ok(Self::new_unchecked(dictionary, dict_indexes)) } pub fn len(&self) -> usize { @@ -1128,6 +1134,10 @@ impl<'a> MappedVariables<'a> { pub fn iter(&self) -> MappedVariablesIter<'a> { MappedVariablesIter::new(self.dictionary, self.dict_indexes.iter()) } + + pub fn dict_indexes(&self) -> &[DictIndex] { + self.dict_indexes + } } impl<'a> Index for MappedVariables<'a> { @@ -1363,6 +1373,53 @@ impl<'a> Iterator for VariableSetsIter<'a> { } } +#[derive(Debug)] +pub struct MultipleResponseSetsMut<'a>(&'a mut Dictionary); + +#[derive(ThisError, Clone, Debug)] +pub enum MrSetError { + #[error("{0}")] + DictIndexError(#[from] DictIndexError), + + /// Counted value {value} has width {width}, but it must be no wider than + /// {max_width}, the width of the narrowest variable in multiple response + /// set {mr_set}. + #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] + TooWideMDGroupCountedValue { + /// Multiple response set name. + mr_set: Identifier, + /// Counted value. + value: String, + /// Width of counted value. + width: usize, + /// Maximum allowed width of counted value. + max_width: u16, + }, + + /// Multiple response set {0} contains both string and numeric variables. + #[error("Multiple response set {0} contains both string and numeric variables.")] + MixedMrSet( + /// Multiple response set name. + Identifier, + ), +} + +impl<'a> MultipleResponseSetsMut<'a> { + fn new(dictionary: &'a mut Dictionary) -> Self { + Self(dictionary) + } + + pub fn mrsets(&'a self) -> MultipleResponseSets<'a> { + MultipleResponseSets::new(self.0) + } + + pub fn insert(&mut self, mrset: DictIndexMultipleResponseSet) -> Result<(), MrSetError> { + MultipleResponseSet::new(self.0, &mrset)?; + self.0.mrsets.insert(ByIdentifier(mrset)); + Ok(()) + } +} + #[derive(Clone, Debug)] pub struct MultipleResponseSets<'a>(&'a Dictionary); @@ -1379,7 +1436,7 @@ impl<'a> MultipleResponseSets<'a> { self.0 .mrsets .get(&name.0) - .map(|mrset| MultipleResponseSet::new(self.0, mrset)) + .map(|mrset| MultipleResponseSet::new_unchecked(self.0, mrset)) } pub fn iter(&self) -> MultipleResponseSetIter<'a> { @@ -1417,7 +1474,7 @@ impl<'a> Iterator for MultipleResponseSetIter<'a> { fn next(&mut self) -> Option { self.iter .next() - .map(|set| MultipleResponseSet::new(self.dictionary, set)) + .map(|set| MultipleResponseSet::new_unchecked(self.dictionary, set)) } } @@ -1442,10 +1499,44 @@ pub struct MultipleResponseSet<'a> { } impl<'a> MultipleResponseSet<'a> { - fn new(dictionary: &'a Dictionary, mrset: &'a DictIndexMultipleResponseSet) -> Self { + fn new_unchecked(dictionary: &'a Dictionary, mrset: &'a DictIndexMultipleResponseSet) -> Self { Self { dictionary, mrset } } + fn new( + dictionary: &'a Dictionary, + mrset: &'a DictIndexMultipleResponseSet, + ) -> Result { + let variables = MappedVariables::new(dictionary, &mrset.variables)?; + let (min_width, _max_width) = Self::widths(&variables) + .ok_or_else(|| MrSetError::MixedMrSet(mrset.name.clone()))? + .into_inner(); + + if let MultipleResponseType::MultipleDichotomy { datum, labels: _ } = &mrset.mr_type { + match (datum, min_width) { + (Datum::Number(_), VarWidth::Numeric) => (), + (Datum::String(s), VarWidth::String(min_width)) => { + if s.without_trailing_spaces().len() > min_width as usize {} + } + _ => return Err(MrSetError::MixedMrSet(mrset.name.clone())), + } + } + Ok(Self::new_unchecked(dictionary, mrset)) + } + + fn widths(variables: &MappedVariables<'_>) -> Option> { + variables + .iter() + .map(|v| Some((v.width, v.width))) + .reduce(|a, b| { + let (na, wa) = a?; + let (nb, wb) = b?; + Some((VarWidth::narrower(na, nb)?, VarWidth::wider(wa, wb)?)) + }) + .flatten() + .map(|(min_width, max_width)| min_width..=max_width) + } + pub fn name(&self) -> &Identifier { &self.mrset.name } @@ -1455,7 +1546,7 @@ impl<'a> MultipleResponseSet<'a> { } pub fn width(&self) -> RangeInclusive { - self.mrset.width.clone() + Self::widths(&self.variables()).unwrap() } pub fn mr_type(&self) -> &MultipleResponseType { @@ -1491,9 +1582,6 @@ pub struct DictIndexMultipleResponseSet { /// A description for the set. pub label: String, - /// Range of widths among the variables. - pub width: RangeInclusive, - /// What kind of multiple response set this is. pub mr_type: MultipleResponseType, @@ -1592,7 +1680,11 @@ mod tests { use smallvec::SmallVec; use crate::{ - dictionary::{DictIndexVector, Dictionary}, + data::Datum, + dictionary::{ + CategoryLabels, DictIndexMultipleResponseSet, DictIndexVector, Dictionary, + MultipleResponseType, + }, identifier::Identifier, variable::{VarWidth, Variable}, }; @@ -1716,12 +1808,72 @@ mod tests { .vectors_mut() .insert(DictIndexVector { name: Identifier::new("ààààààààààààààààààààààààààààààààx").unwrap(), + variables: vec![1, 0], + }) + .unwrap(); + + dictionary + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { + name: Identifier::new("üüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüasdf").unwrap(), + label: String::from("my mcgroup"), + mr_type: MultipleResponseType::MultipleCategory, + variables: vec![0, 1], + }) + .unwrap(); + dictionary + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { + name: Identifier::new("üüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüquux").unwrap(), + label: String::new(), + mr_type: MultipleResponseType::MultipleDichotomy { + datum: Datum::Number(Some(55.0)), + labels: CategoryLabels::VarLabels, + }, variables: vec![0, 1], }) .unwrap(); dictionary.codepage_to_unicode(); dbg!(&dictionary); - todo!() + assert_eq!( + &dictionary.variables[0].name, + "éééééééééééééééééééééééééééééééé" + ); + assert_eq!(&dictionary.variables[1].name, "Var1"); + assert_eq!( + dictionary + .vectors() + .get(&Identifier::new("àààààààààààààààààààààààààààààààà").unwrap()) + .unwrap() + .variables() + .dict_indexes(), + &[0, 1] + ); + assert_eq!( + dictionary + .vectors() + .get(&Identifier::new("Vec1").unwrap()) + .unwrap() + .variables() + .dict_indexes(), + &[1, 0] + ); + assert!(matches!( + dictionary + .mrsets() + .get(&Identifier::new("üüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüü").unwrap()) + .unwrap() + .mr_type(), + MultipleResponseType::MultipleCategory + )); + assert!(matches!( + dictionary + .mrsets() + .get(&Identifier::new("MrSet1").unwrap()) + .unwrap() + .mr_type(), + MultipleResponseType::MultipleDichotomy { .. } + )); } } diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 049c152791..f6e3233a13 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -28,11 +28,12 @@ use crate::{ crypto::EncryptedFile, data::{ByteString, Case, Datum, MutRawString, RawString}, dictionary::{ - DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, MultipleResponseType, + DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, MrSetError, + MultipleResponseType, }, format::{Error as FormatError, Format, UncheckedFormat}, hexfloat::HexFloat, - identifier::{ByIdentifier, Error as IdError, Identifier}, + identifier::{Error as IdError, Identifier}, output::pivot::{Group, Value}, sys::{ raw::{ @@ -49,7 +50,7 @@ use crate::{ }, serialize_endian, }, - variable::{InvalidRole, MissingValues, MissingValuesError, VarWidth, Variable}, + variable::{InvalidRole, MissingValues, MissingValuesError, VarType, VarWidth, Variable}, }; use anyhow::{anyhow, Error as AnyError}; use binrw::{BinRead, BinWrite, Endian}; @@ -195,12 +196,9 @@ pub enum Error { Identifier, ), - /// Multiple response set {0} contains both string and numeric variables. - #[error("Multiple response set {0} contains both string and numeric variables.")] - MixedMrSet( - /// Multiple response set name. - Identifier, - ), + /// Error adding multiple response set. + #[error("{0}")] + MrSetError(#[from] MrSetError), /// Invalid numeric format for counted value {number} in multiple response set {mr_set}. #[error( @@ -213,21 +211,6 @@ pub enum Error { number: String, }, - /// Counted value {value} has width {width}, but it must be no wider than - /// {max_width}, the width of the narrowest variable in multiple response - /// set {mr_set}. - #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] - TooWideMDGroupCountedValue { - /// Multiple response set name. - mr_set: Identifier, - /// Counted value. - value: String, - /// Width of counted value. - width: usize, - /// Maximum allowed width of counted value. - max_width: u16, - }, - /// Ignoring long string value label for unknown variable {0}. #[error("Ignoring long string value label for unknown variable {0}.")] UnknownLongStringValueLabelVariable( @@ -1077,7 +1060,7 @@ impl Records { { match DictIndexMultipleResponseSet::decode(&dictionary, record, &mut warn) { Ok(mrset) => { - dictionary.mrsets.insert(ByIdentifier::new(mrset)); + dictionary.mrsets_mut().insert(mrset).unwrap(); } Err(error) => warn(error), } @@ -1574,21 +1557,18 @@ impl DictIndexMultipleResponseSet { _ => (), } - let Some((Some(min_width), Some(max_width))) = variables + let Ok(var_type) = variables .iter() - .copied() - .map(|dict_index| dictionary.variables[dict_index].width) - .map(|w| (Some(w), Some(w))) - .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) + .map(|dict_index| VarType::from(dictionary.variables[*dict_index].width)) + .all_equal_value() else { - return Err(Error::MixedMrSet(mr_set_name)); + return Err(MrSetError::MixedMrSet(mr_set_name).into()); }; - let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, min_width)?; + let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, var_type)?; Ok(DictIndexMultipleResponseSet { name: mr_set_name, - width: min_width..=max_width, label: input.label.to_string(), mr_type, variables, @@ -1640,12 +1620,12 @@ impl MultipleResponseType { fn decode( mr_set: &Identifier, input: &raw::records::MultipleResponseType, - min_width: VarWidth, + var_type: VarType, ) -> Result { match input { raw::records::MultipleResponseType::MultipleDichotomy { value, labels } => { - let value = match min_width { - VarWidth::Numeric => { + let value = match var_type { + VarType::Numeric => { let string = String::from_utf8_lossy(&value.0); let number: f64 = string.trim().parse().map_err(|_| { Error::InvalidMDGroupCountedValue { @@ -1655,21 +1635,10 @@ impl MultipleResponseType { })?; Datum::Number(Some(number)) } - VarWidth::String(max_width) => { - let mut value = value.0.as_slice(); - while value.ends_with(b" ") { - value = &value[..value.len() - 1]; - } - let width = value.len(); - if width > max_width as usize { - return Err(Error::TooWideMDGroupCountedValue { - mr_set: mr_set.clone(), - value: String::from_utf8_lossy(value).into(), - width, - max_width, - }); - }; - Datum::String(value.into()) + VarType::String => { + let mut value = value.clone(); + value.trim_end(); + Datum::String(value) } }; Ok(MultipleResponseType::MultipleDichotomy { diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index 651d4254f6..16e54e6538 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -498,13 +498,13 @@ where let mut output = Vec::new(); for set in self .dictionary - .mrsets + .mrsets() .iter() - .filter(|set| set.mr_type.supported_before_v14() == pre_v14) + .filter(|set| set.mr_type().supported_before_v14() == pre_v14) { - output.extend_from_slice(&self.dictionary.encoding().encode(&set.name).0[..]); + output.extend_from_slice(&self.dictionary.encoding().encode(set.name()).0[..]); output.push(b'='); - match &set.mr_type { + match set.mr_type() { MultipleResponseType::MultipleDichotomy { datum, labels } => { let leader = match labels { CategoryLabels::VarLabels => b"D".as_slice(), @@ -531,15 +531,15 @@ where MultipleResponseType::MultipleCategory => write!(&mut output, "C ").unwrap(), } - let label = if set.mr_type.label_from_var_label() { + let label = if set.mr_type().label_from_var_label() { Cow::from(&[]) } else { - self.dictionary.encoding().encode(&set.label).0 + self.dictionary.encoding().encode(set.label()).0 }; write!(&mut output, "{} ", label.len()).unwrap(); output.extend_from_slice(&label[..]); - for variable in set.variables.iter().copied() { + for variable in set.variables().dict_indexes().iter().copied() { // Only lowercase ASCII characters because other characters // might expand upon lowercasing. let short_name = self.short_names[variable][0].as_str().to_ascii_lowercase(); @@ -1215,7 +1215,7 @@ mod tests { CategoryLabels, DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, MultipleResponseType, }, - identifier::{ByIdentifier, Identifier}, + identifier::Identifier, sys::{ raw::{ records::{ @@ -1709,44 +1709,43 @@ mod tests { } } dictionary - .mrsets - .insert(ByIdentifier::new(DictIndexMultipleResponseSet { + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { name: Identifier::new("$a").unwrap(), label: String::from("my mcgroup"), - width: VarWidth::Numeric..=VarWidth::Numeric, mr_type: MultipleResponseType::MultipleCategory, variables: vec![0, 1, 2], - })); + }) + .unwrap(); dictionary - .mrsets - .insert(ByIdentifier::new(DictIndexMultipleResponseSet { + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { name: Identifier::new("$b").unwrap(), label: String::new(), - width: VarWidth::Numeric..=VarWidth::Numeric, mr_type: MultipleResponseType::MultipleDichotomy { datum: Datum::Number(Some(55.0)), labels: CategoryLabels::VarLabels, }, variables: vec![6, 4, 5, 3], - })); + }) + .unwrap(); dictionary - .mrsets - .insert(ByIdentifier::new(DictIndexMultipleResponseSet { + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { name: Identifier::new("$c").unwrap(), label: String::from("mdgroup #2"), - width: VarWidth::String(3)..=VarWidth::String(3), mr_type: MultipleResponseType::MultipleDichotomy { datum: Datum::String("Yes".into()), labels: CategoryLabels::VarLabels, }, variables: vec![7, 8, 9], - })); + }) + .unwrap(); dictionary - .mrsets - .insert(ByIdentifier::new(DictIndexMultipleResponseSet { + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { name: Identifier::new("$d").unwrap(), label: String::from("third mdgroup"), - width: VarWidth::Numeric..=VarWidth::Numeric, mr_type: MultipleResponseType::MultipleDichotomy { datum: Datum::Number(Some(34.0)), labels: CategoryLabels::CountedValues { @@ -1754,13 +1753,13 @@ mod tests { }, }, variables: vec![10, 11, 12], - })); + }) + .unwrap(); dictionary - .mrsets - .insert(ByIdentifier::new(DictIndexMultipleResponseSet { + .mrsets_mut() + .insert(DictIndexMultipleResponseSet { name: Identifier::new("$e").unwrap(), label: String::new(), - width: VarWidth::String(6)..=VarWidth::String(6), mr_type: MultipleResponseType::MultipleDichotomy { datum: Datum::String("choice".into()), labels: CategoryLabels::CountedValues { @@ -1768,7 +1767,8 @@ mod tests { }, }, variables: vec![13, 14, 15], - })); + }) + .unwrap(); fn get_mrsets(dictionary: &Dictionary, pre_v14: bool) -> String { let mut raw = Vec::new(); diff --git a/rust/pspp/src/variable.rs b/rust/pspp/src/variable.rs index 3e09f16c2e..01913dfd0f 100644 --- a/rust/pspp/src/variable.rs +++ b/rust/pspp/src/variable.rs @@ -78,7 +78,7 @@ impl Display for VarType { #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)] pub enum VarWidth { Numeric, - String(u16), + String(u16), // XXX change to NonZeroU16, or to 1..=32767 range type } impl VarWidth { @@ -91,16 +91,10 @@ impl VarWidth { } } - fn width_predicate( - a: Option, - b: Option, - f: impl Fn(u16, u16) -> u16, - ) -> Option { + fn width_predicate(a: VarWidth, b: VarWidth, f: impl Fn(u16, u16) -> u16) -> Option { match (a, b) { - (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), - (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { - Some(VarWidth::String(f(a, b))) - } + (VarWidth::Numeric, VarWidth::Numeric) => Some(VarWidth::Numeric), + (VarWidth::String(a), VarWidth::String(b)) => Some(VarWidth::String(f(a, b))), _ => None, } } @@ -109,13 +103,12 @@ impl VarWidth { /// - Numerical variable widths are equally wide. /// - Longer strings are wider than shorter strings. /// - Numerical and string types are incomparable, so result in `None`. - /// - Any `None` in the input yields `None` in the output. - pub fn wider(a: Option, b: Option) -> Option { + pub fn wider(a: VarWidth, b: VarWidth) -> Option { Self::width_predicate(a, b, |a, b| a.max(b)) } /// Returns the narrower of `self` and `other` (see [`Self::wider`]). - pub fn narrower(a: Option, b: Option) -> Option { + pub fn narrower(a: VarWidth, b: VarWidth) -> Option { Self::width_predicate(a, b, |a, b| a.min(b)) }