From 339c00fe61e04e683d3a16fd0e42e80e6fa7fe18 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 7 Aug 2025 08:32:30 -0700 Subject: [PATCH] vectors - mrsets next! --- rust/pspp/src/dictionary.rs | 197 ++++++++++++++++++++++++------------ rust/pspp/src/identifier.rs | 49 +++++++++ rust/pspp/src/sys/write.rs | 5 +- rust/pspp/src/variable.rs | 25 ++++- 4 files changed, 205 insertions(+), 71 deletions(-) diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index ae74dfd1da..d548c9b4f7 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -97,6 +97,36 @@ pub struct Dictionary { encoding: &'static Encoding, } +impl PartialEq for Dictionary { + fn eq(&self, other: &Self) -> bool { + // We have to compare the dereferenced versions of fields that use + // [ByIdentifier. Otherwise we would just be comparing their names. + self.variables + .iter() + .map(|var| &*var) + .eq(other.variables.iter().map(|var| &*var)) + && self.split_file == other.split_file + && self.weight == other.weight + && self.filter == other.filter + && self.case_limit == other.case_limit + && self.file_label == other.file_label + && self.documents == other.documents + && self + .vectors + .iter() + .map(|vector| &*vector) + .eq(other.vectors.iter().map(|vector| &*vector)) + && self.attributes == other.attributes + && self + .mrsets + .iter() + .map(|mrset| &*mrset) + .eq(other.mrsets.iter().map(|mrset| &*mrset)) + && self.variable_sets == other.variable_sets + && self.encoding == other.encoding + } +} + impl Serialize for Dictionary { fn serialize(&self, serializer: S) -> Result where @@ -139,6 +169,11 @@ pub struct InvalidWeightVariable; #[error("Filter variable must be numeric.")] pub struct InvalidFilterVariable; +/// Invalid dictionary index. +#[derive(Debug, ThisError)] +#[error("Invalid dictionary index.")] +pub struct DictIndexError; + impl Dictionary { /// Creates a new, empty dictionary with the specified `encoding`. pub fn new(encoding: &'static Encoding) -> Self { @@ -224,13 +259,17 @@ impl Dictionary { /// Returns the split variables. pub fn split_vars(&self) -> MappedVariables<'_> { - MappedVariables::new(self, &self.split_file) + MappedVariables::new_unchecked(self, &self.split_file) } pub fn vectors(&self) -> Vectors<'_> { Vectors::new(self) } + pub fn vectors_mut(&mut self) -> VectorsMut<'_> { + VectorsMut::new(self) + } + pub fn mrsets(&self) -> MultipleResponseSets<'_> { MultipleResponseSets::new(self) } @@ -1031,11 +1070,21 @@ pub struct Vector<'a> { } impl<'a> Vector<'a> { + fn new_unchecked(dictionary: &'a Dictionary, vector: &'a DictIndexVector) -> Self { + Self { dictionary, vector } + } + pub fn new( + dictionary: &'a Dictionary, + vector: &'a DictIndexVector, + ) -> Result { + MappedVariables::new(dictionary, &vector.variables)?; + Ok(Self::new_unchecked(dictionary, vector)) + } pub fn name(&self) -> &'a Identifier { &self.vector.name } pub fn variables(&self) -> MappedVariables<'a> { - MappedVariables::new(self.dictionary, &self.vector.variables) + MappedVariables::new_unchecked(self.dictionary, &self.vector.variables) } } @@ -1045,13 +1094,27 @@ pub struct MappedVariables<'a> { } impl<'a> MappedVariables<'a> { - fn new(dictionary: &'a Dictionary, dict_indexes: &'a [DictIndex]) -> Self { + fn new_unchecked(dictionary: &'a Dictionary, dict_indexes: &'a [DictIndex]) -> Self { Self { dictionary, dict_indexes, } } + pub fn new( + dictionary: &'a Dictionary, + dict_indexes: &'a [DictIndex], + ) -> Result { + if dict_indexes + .iter() + .all(|dict_index| *dict_index < dictionary.variables.len()) + { + Ok(Self::new_unchecked(dictionary, dict_indexes)) + } else { + Err(DictIndexError) + } + } + pub fn len(&self) -> usize { self.dict_indexes.len() } @@ -1139,10 +1202,26 @@ impl<'a> Iterator for VectorsIter<'a> { type Item = Vector<'a>; fn next(&mut self) -> Option { - self.iter.next().map(|vector| Vector { - dictionary: self.dictionary, - vector, - }) + self.iter + .next() + .map(|vector| Vector::new_unchecked(self.dictionary, vector)) + } +} + +#[derive(Debug)] +pub struct VectorsMut<'a>(&'a mut Dictionary); + +impl<'a> VectorsMut<'a> { + fn new(dictionary: &'a mut Dictionary) -> Self { + Self(dictionary) + } + pub fn as_vectors(&'a self) -> Vectors<'a> { + Vectors(self.0) + } + pub fn insert(&mut self, vector: DictIndexVector) -> Result<(), DictIndexError> { + Vector::new(self.0, &vector)?; + self.0.vectors.insert(ByIdentifier(vector)); + Ok(()) } } @@ -1157,10 +1236,10 @@ impl<'a> Vectors<'a> { self.0.vectors.len() } pub fn get(&self, name: &Identifier) -> Option> { - self.0.vectors.get(&name.0).map(|vector| Vector { - dictionary: self.0, - vector: &*vector, - }) + self.0 + .vectors + .get(&name.0) + .map(|vector| Vector::new_unchecked(self.0, &*vector)) } pub fn iter(&self) -> VectorsIter<'a> { VectorsIter::new(self.0) @@ -1208,7 +1287,7 @@ impl<'a> VariableSet<'a> { &self.variable_set.name } pub fn variables(&self) -> MappedVariables<'a> { - MappedVariables::new(self.dictionary, &self.variable_set.variables) + MappedVariables::new_unchecked(self.dictionary, &self.variable_set.variables) } } @@ -1384,7 +1463,7 @@ impl<'a> MultipleResponseSet<'a> { } pub fn variables(&self) -> MappedVariables<'a> { - MappedVariables::new(self.dictionary, &self.mrset.variables) + MappedVariables::new_unchecked(self.dictionary, &self.mrset.variables) } } @@ -1508,65 +1587,16 @@ impl DictIndexVariableSet { } #[cfg(test)] -mod test { - use std::collections::HashSet; - +mod tests { use encoding_rs::{UTF_8, WINDOWS_1252}; use smallvec::SmallVec; - use unicase::UniCase; use crate::{ - dictionary::Dictionary, + dictionary::{DictIndexVector, Dictionary}, identifier::Identifier, variable::{VarWidth, Variable}, }; - use super::{ByIdentifier, HasIdentifier}; - - #[derive(PartialEq, Eq, Debug, Clone)] - struct SimpleVar { - name: Identifier, - value: i32, - } - - impl HasIdentifier for SimpleVar { - fn identifier(&self) -> &UniCase { - &self.name.0 - } - } - - #[test] - fn test() { - // Variables should not be the same if their values differ. - let abcd = Identifier::new("abcd").unwrap(); - let abcd1 = SimpleVar { - name: abcd.clone(), - value: 1, - }; - let abcd2 = SimpleVar { - name: abcd, - value: 2, - }; - assert_ne!(abcd1, abcd2); - - // But `ByName` should treat them the same. - let abcd1_by_name = ByIdentifier::new(abcd1); - let abcd2_by_name = ByIdentifier::new(abcd2); - assert_eq!(abcd1_by_name, abcd2_by_name); - - // And a `HashSet` of `ByName` should also treat them the same. - let mut vars: HashSet> = HashSet::new(); - assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone()))); - assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone()))); - assert_eq!( - vars.get(&UniCase::new(String::from("abcd"))) - .unwrap() - .0 - .value, - 1 - ); - } - #[test] fn short_names() { for (variables, expected, encoding) in [ @@ -1655,4 +1685,43 @@ mod test { assert_eq!(expected, dict.short_names()); } } + + #[test] + fn codepage_to_unicode() { + let mut dictionary = Dictionary::new(WINDOWS_1252); + + dictionary + .add_var(Variable::new( + Identifier::new("ééééééééééééééééééééééééééééééééa").unwrap(), + VarWidth::Numeric, + WINDOWS_1252, + )) + .unwrap(); + dictionary + .add_var(Variable::new( + Identifier::new("ééééééééééééééééééééééééééééééééb").unwrap(), + VarWidth::Numeric, + WINDOWS_1252, + )) + .unwrap(); + + dictionary + .vectors_mut() + .insert(DictIndexVector { + name: Identifier::new("àààààààààààààààààààààààààààààààà").unwrap(), + variables: vec![0, 1], + }) + .unwrap(); + dictionary + .vectors_mut() + .insert(DictIndexVector { + name: Identifier::new("ààààààààààààààààààààààààààààààààx").unwrap(), + variables: vec![0, 1], + }) + .unwrap(); + + dictionary.codepage_to_unicode(); + dbg!(&dictionary); + todo!() + } } diff --git a/rust/pspp/src/identifier.rs b/rust/pspp/src/identifier.rs index 9697cf7673..700b1cd087 100644 --- a/rust/pspp/src/identifier.rs +++ b/rust/pspp/src/identifier.rs @@ -643,10 +643,59 @@ where #[cfg(test)] mod tests { + use std::collections::HashSet; + use encoding_rs::{Encoding, UTF_8, WINDOWS_1252}; + use unicase::UniCase; use crate::identifier::Identifier; + use super::{ByIdentifier, HasIdentifier}; + + #[derive(PartialEq, Eq, Debug, Clone)] + struct SimpleVar { + name: Identifier, + value: i32, + } + + impl HasIdentifier for SimpleVar { + fn identifier(&self) -> &UniCase { + &self.name.0 + } + } + + #[test] + fn identifier() { + // Variables should not be the same if their values differ. + let abcd = Identifier::new("abcd").unwrap(); + let abcd1 = SimpleVar { + name: abcd.clone(), + value: 1, + }; + let abcd2 = SimpleVar { + name: abcd, + value: 2, + }; + assert_ne!(abcd1, abcd2); + + // But [ByIdentifier]` should treat them the same. + let abcd1_by_name = ByIdentifier::new(abcd1); + let abcd2_by_name = ByIdentifier::new(abcd2); + assert_eq!(abcd1_by_name, abcd2_by_name); + + // And a [HashSet] of [ByIdentifier] should also treat them the same. + let mut vars: HashSet> = HashSet::new(); + assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone()))); + assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone()))); + assert_eq!( + vars.get(&UniCase::new(String::from("abcd"))) + .unwrap() + .0 + .value, + 1 + ); + } + #[test] fn with_suffix() { for (head, suffix, encoding, max_len, expected) in [ diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index 320820146d..651d4254f6 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -1588,10 +1588,7 @@ mod tests { UTF_8, ); for (value, label) in value_labels { - assert_eq!( - variable.value_labels.insert(value.clone(), (*label).into()), - None - ); + assert_eq!(variable.value_labels.insert(value.clone(), *label), None); } dictionary.add_var(variable).unwrap(); } diff --git a/rust/pspp/src/variable.rs b/rust/pspp/src/variable.rs index ea1a86f885..3e09f16c2e 100644 --- a/rust/pspp/src/variable.rs +++ b/rust/pspp/src/variable.rs @@ -603,8 +603,8 @@ impl ValueLabels { self.0.get(value).map(|s| s.as_str()) } - pub fn insert(&mut self, value: Datum, label: String) -> Option { - self.0.insert(value, label) + pub fn insert(&mut self, value: Datum, label: impl Into) -> Option { + self.0.insert(value, label.into()) } pub fn is_resizable(&self, width: VarWidth) -> bool { @@ -716,6 +716,7 @@ impl<'a> MissingValuesMut<'a> { } } +// Currently doesn't filter out duplicates (should it?). #[derive(Clone, Default, Serialize, PartialEq)] pub struct MissingValues { /// Individual missing values, up to 3 of them. @@ -1002,7 +1003,7 @@ mod tests { use crate::{ data::{ByteString, Datum, RawString, WithEncoding}, - variable::{MissingValues, VarWidth}, + variable::{MissingValues, ValueLabels, VarWidth}, }; #[test] @@ -1055,4 +1056,22 @@ mod tests { assert_eq!(&actual, &expected); } + + #[test] + fn value_labels_codepage_to_unicode() { + fn windows_1252(s: &str) -> Datum { + Datum::String(ByteString::from(WINDOWS_1252.encode(s).0)) + } + + let mut actual = ValueLabels::new(); + actual.insert(windows_1252("abcd"), "Label 1"); + actual.insert(windows_1252("éèäî"), "Label 2"); + actual.codepage_to_unicode(WINDOWS_1252); + + let mut expected = ValueLabels::new(); + expected.insert(Datum::String(ByteString::from("abcd ")), "Label 1"); + expected.insert(Datum::String(ByteString::from("éèäî ")), "Label 2"); + + assert_eq!(&actual, &expected); + } } -- 2.30.2