vectors - mrsets next!
authorBen Pfaff <blp@cs.stanford.edu>
Thu, 7 Aug 2025 15:32:30 +0000 (08:32 -0700)
committerBen Pfaff <blp@cs.stanford.edu>
Thu, 7 Aug 2025 15:32:30 +0000 (08:32 -0700)
rust/pspp/src/dictionary.rs
rust/pspp/src/identifier.rs
rust/pspp/src/sys/write.rs
rust/pspp/src/variable.rs

index ae74dfd1da45bd3c56a57f9bd33bbdcfa85d80b1..d548c9b4f7436c0d2904d508b0ea3efc9736fc15 100644 (file)
@@ -97,6 +97,36 @@ pub struct Dictionary {
     encoding: &'static Encoding,
 }
 
+impl PartialEq for Dictionary {
+    fn eq(&self, other: &Self) -> bool {
+        // We have to compare the dereferenced versions of fields that use
+        // [ByIdentifier.  Otherwise we would just be comparing their names.
+        self.variables
+            .iter()
+            .map(|var| &*var)
+            .eq(other.variables.iter().map(|var| &*var))
+            && self.split_file == other.split_file
+            && self.weight == other.weight
+            && self.filter == other.filter
+            && self.case_limit == other.case_limit
+            && self.file_label == other.file_label
+            && self.documents == other.documents
+            && self
+                .vectors
+                .iter()
+                .map(|vector| &*vector)
+                .eq(other.vectors.iter().map(|vector| &*vector))
+            && self.attributes == other.attributes
+            && self
+                .mrsets
+                .iter()
+                .map(|mrset| &*mrset)
+                .eq(other.mrsets.iter().map(|mrset| &*mrset))
+            && self.variable_sets == other.variable_sets
+            && self.encoding == other.encoding
+    }
+}
+
 impl Serialize for Dictionary {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
@@ -139,6 +169,11 @@ pub struct InvalidWeightVariable;
 #[error("Filter variable must be numeric.")]
 pub struct InvalidFilterVariable;
 
+/// Invalid dictionary index.
+#[derive(Debug, ThisError)]
+#[error("Invalid dictionary index.")]
+pub struct DictIndexError;
+
 impl Dictionary {
     /// Creates a new, empty dictionary with the specified `encoding`.
     pub fn new(encoding: &'static Encoding) -> Self {
@@ -224,13 +259,17 @@ impl Dictionary {
 
     /// Returns the split variables.
     pub fn split_vars(&self) -> MappedVariables<'_> {
-        MappedVariables::new(self, &self.split_file)
+        MappedVariables::new_unchecked(self, &self.split_file)
     }
 
     pub fn vectors(&self) -> Vectors<'_> {
         Vectors::new(self)
     }
 
+    pub fn vectors_mut(&mut self) -> VectorsMut<'_> {
+        VectorsMut::new(self)
+    }
+
     pub fn mrsets(&self) -> MultipleResponseSets<'_> {
         MultipleResponseSets::new(self)
     }
@@ -1031,11 +1070,21 @@ pub struct Vector<'a> {
 }
 
 impl<'a> Vector<'a> {
+    fn new_unchecked(dictionary: &'a Dictionary, vector: &'a DictIndexVector) -> Self {
+        Self { dictionary, vector }
+    }
+    pub fn new(
+        dictionary: &'a Dictionary,
+        vector: &'a DictIndexVector,
+    ) -> Result<Self, DictIndexError> {
+        MappedVariables::new(dictionary, &vector.variables)?;
+        Ok(Self::new_unchecked(dictionary, vector))
+    }
     pub fn name(&self) -> &'a Identifier {
         &self.vector.name
     }
     pub fn variables(&self) -> MappedVariables<'a> {
-        MappedVariables::new(self.dictionary, &self.vector.variables)
+        MappedVariables::new_unchecked(self.dictionary, &self.vector.variables)
     }
 }
 
@@ -1045,13 +1094,27 @@ pub struct MappedVariables<'a> {
 }
 
 impl<'a> MappedVariables<'a> {
-    fn new(dictionary: &'a Dictionary, dict_indexes: &'a [DictIndex]) -> Self {
+    fn new_unchecked(dictionary: &'a Dictionary, dict_indexes: &'a [DictIndex]) -> Self {
         Self {
             dictionary,
             dict_indexes,
         }
     }
 
+    pub fn new(
+        dictionary: &'a Dictionary,
+        dict_indexes: &'a [DictIndex],
+    ) -> Result<Self, DictIndexError> {
+        if dict_indexes
+            .iter()
+            .all(|dict_index| *dict_index < dictionary.variables.len())
+        {
+            Ok(Self::new_unchecked(dictionary, dict_indexes))
+        } else {
+            Err(DictIndexError)
+        }
+    }
+
     pub fn len(&self) -> usize {
         self.dict_indexes.len()
     }
@@ -1139,10 +1202,26 @@ impl<'a> Iterator for VectorsIter<'a> {
     type Item = Vector<'a>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        self.iter.next().map(|vector| Vector {
-            dictionary: self.dictionary,
-            vector,
-        })
+        self.iter
+            .next()
+            .map(|vector| Vector::new_unchecked(self.dictionary, vector))
+    }
+}
+
+#[derive(Debug)]
+pub struct VectorsMut<'a>(&'a mut Dictionary);
+
+impl<'a> VectorsMut<'a> {
+    fn new(dictionary: &'a mut Dictionary) -> Self {
+        Self(dictionary)
+    }
+    pub fn as_vectors(&'a self) -> Vectors<'a> {
+        Vectors(self.0)
+    }
+    pub fn insert(&mut self, vector: DictIndexVector) -> Result<(), DictIndexError> {
+        Vector::new(self.0, &vector)?;
+        self.0.vectors.insert(ByIdentifier(vector));
+        Ok(())
     }
 }
 
@@ -1157,10 +1236,10 @@ impl<'a> Vectors<'a> {
         self.0.vectors.len()
     }
     pub fn get(&self, name: &Identifier) -> Option<Vector<'a>> {
-        self.0.vectors.get(&name.0).map(|vector| Vector {
-            dictionary: self.0,
-            vector: &*vector,
-        })
+        self.0
+            .vectors
+            .get(&name.0)
+            .map(|vector| Vector::new_unchecked(self.0, &*vector))
     }
     pub fn iter(&self) -> VectorsIter<'a> {
         VectorsIter::new(self.0)
@@ -1208,7 +1287,7 @@ impl<'a> VariableSet<'a> {
         &self.variable_set.name
     }
     pub fn variables(&self) -> MappedVariables<'a> {
-        MappedVariables::new(self.dictionary, &self.variable_set.variables)
+        MappedVariables::new_unchecked(self.dictionary, &self.variable_set.variables)
     }
 }
 
@@ -1384,7 +1463,7 @@ impl<'a> MultipleResponseSet<'a> {
     }
 
     pub fn variables(&self) -> MappedVariables<'a> {
-        MappedVariables::new(self.dictionary, &self.mrset.variables)
+        MappedVariables::new_unchecked(self.dictionary, &self.mrset.variables)
     }
 }
 
@@ -1508,65 +1587,16 @@ impl DictIndexVariableSet {
 }
 
 #[cfg(test)]
-mod test {
-    use std::collections::HashSet;
-
+mod tests {
     use encoding_rs::{UTF_8, WINDOWS_1252};
     use smallvec::SmallVec;
-    use unicase::UniCase;
 
     use crate::{
-        dictionary::Dictionary,
+        dictionary::{DictIndexVector, Dictionary},
         identifier::Identifier,
         variable::{VarWidth, Variable},
     };
 
-    use super::{ByIdentifier, HasIdentifier};
-
-    #[derive(PartialEq, Eq, Debug, Clone)]
-    struct SimpleVar {
-        name: Identifier,
-        value: i32,
-    }
-
-    impl HasIdentifier for SimpleVar {
-        fn identifier(&self) -> &UniCase<String> {
-            &self.name.0
-        }
-    }
-
-    #[test]
-    fn test() {
-        // Variables should not be the same if their values differ.
-        let abcd = Identifier::new("abcd").unwrap();
-        let abcd1 = SimpleVar {
-            name: abcd.clone(),
-            value: 1,
-        };
-        let abcd2 = SimpleVar {
-            name: abcd,
-            value: 2,
-        };
-        assert_ne!(abcd1, abcd2);
-
-        // But `ByName` should treat them the same.
-        let abcd1_by_name = ByIdentifier::new(abcd1);
-        let abcd2_by_name = ByIdentifier::new(abcd2);
-        assert_eq!(abcd1_by_name, abcd2_by_name);
-
-        // And a `HashSet` of `ByName` should also treat them the same.
-        let mut vars: HashSet<ByIdentifier<SimpleVar>> = HashSet::new();
-        assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
-        assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
-        assert_eq!(
-            vars.get(&UniCase::new(String::from("abcd")))
-                .unwrap()
-                .0
-                .value,
-            1
-        );
-    }
-
     #[test]
     fn short_names() {
         for (variables, expected, encoding) in [
@@ -1655,4 +1685,43 @@ mod test {
             assert_eq!(expected, dict.short_names());
         }
     }
+
+    #[test]
+    fn codepage_to_unicode() {
+        let mut dictionary = Dictionary::new(WINDOWS_1252);
+
+        dictionary
+            .add_var(Variable::new(
+                Identifier::new("ééééééééééééééééééééééééééééééééa").unwrap(),
+                VarWidth::Numeric,
+                WINDOWS_1252,
+            ))
+            .unwrap();
+        dictionary
+            .add_var(Variable::new(
+                Identifier::new("ééééééééééééééééééééééééééééééééb").unwrap(),
+                VarWidth::Numeric,
+                WINDOWS_1252,
+            ))
+            .unwrap();
+
+        dictionary
+            .vectors_mut()
+            .insert(DictIndexVector {
+                name: Identifier::new("àààààààààààààààààààààààààààààààà").unwrap(),
+                variables: vec![0, 1],
+            })
+            .unwrap();
+        dictionary
+            .vectors_mut()
+            .insert(DictIndexVector {
+                name: Identifier::new("ààààààààààààààààààààààààààààààààx").unwrap(),
+                variables: vec![0, 1],
+            })
+            .unwrap();
+
+        dictionary.codepage_to_unicode();
+        dbg!(&dictionary);
+        todo!()
+    }
 }
index 9697cf7673b0804273f39070e7222cf5db47efb4..700b1cd087784f39163c65b819bc3df5d147487e 100644 (file)
@@ -643,10 +643,59 @@ where
 
 #[cfg(test)]
 mod tests {
+    use std::collections::HashSet;
+
     use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};
+    use unicase::UniCase;
 
     use crate::identifier::Identifier;
 
+    use super::{ByIdentifier, HasIdentifier};
+
+    #[derive(PartialEq, Eq, Debug, Clone)]
+    struct SimpleVar {
+        name: Identifier,
+        value: i32,
+    }
+
+    impl HasIdentifier for SimpleVar {
+        fn identifier(&self) -> &UniCase<String> {
+            &self.name.0
+        }
+    }
+
+    #[test]
+    fn identifier() {
+        // Variables should not be the same if their values differ.
+        let abcd = Identifier::new("abcd").unwrap();
+        let abcd1 = SimpleVar {
+            name: abcd.clone(),
+            value: 1,
+        };
+        let abcd2 = SimpleVar {
+            name: abcd,
+            value: 2,
+        };
+        assert_ne!(abcd1, abcd2);
+
+        // But [ByIdentifier]` should treat them the same.
+        let abcd1_by_name = ByIdentifier::new(abcd1);
+        let abcd2_by_name = ByIdentifier::new(abcd2);
+        assert_eq!(abcd1_by_name, abcd2_by_name);
+
+        // And a [HashSet] of [ByIdentifier] should also treat them the same.
+        let mut vars: HashSet<ByIdentifier<SimpleVar>> = HashSet::new();
+        assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
+        assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
+        assert_eq!(
+            vars.get(&UniCase::new(String::from("abcd")))
+                .unwrap()
+                .0
+                .value,
+            1
+        );
+    }
+
     #[test]
     fn with_suffix() {
         for (head, suffix, encoding, max_len, expected) in [
index 320820146d6c6ac5d50081951912315369e03a21..651d4254f68d552cc8963586d666df819dc5320b 100644 (file)
@@ -1588,10 +1588,7 @@ mod tests {
                     UTF_8,
                 );
                 for (value, label) in value_labels {
-                    assert_eq!(
-                        variable.value_labels.insert(value.clone(), (*label).into()),
-                        None
-                    );
+                    assert_eq!(variable.value_labels.insert(value.clone(), *label), None);
                 }
                 dictionary.add_var(variable).unwrap();
             }
index ea1a86f8850ce0712a81c7b4c37a0fbf5bcea871..3e09f16c2e3207f96862f09ee84f54ebb71853d7 100644 (file)
@@ -603,8 +603,8 @@ impl ValueLabels {
         self.0.get(value).map(|s| s.as_str())
     }
 
-    pub fn insert(&mut self, value: Datum<ByteString>, label: String) -> Option<String> {
-        self.0.insert(value, label)
+    pub fn insert(&mut self, value: Datum<ByteString>, label: impl Into<String>) -> Option<String> {
+        self.0.insert(value, label.into())
     }
 
     pub fn is_resizable(&self, width: VarWidth) -> bool {
@@ -716,6 +716,7 @@ impl<'a> MissingValuesMut<'a> {
     }
 }
 
+// Currently doesn't filter out duplicates (should it?).
 #[derive(Clone, Default, Serialize, PartialEq)]
 pub struct MissingValues {
     /// Individual missing values, up to 3 of them.
@@ -1002,7 +1003,7 @@ mod tests {
 
     use crate::{
         data::{ByteString, Datum, RawString, WithEncoding},
-        variable::{MissingValues, VarWidth},
+        variable::{MissingValues, ValueLabels, VarWidth},
     };
 
     #[test]
@@ -1055,4 +1056,22 @@ mod tests {
 
         assert_eq!(&actual, &expected);
     }
+
+    #[test]
+    fn value_labels_codepage_to_unicode() {
+        fn windows_1252(s: &str) -> Datum<ByteString> {
+            Datum::String(ByteString::from(WINDOWS_1252.encode(s).0))
+        }
+
+        let mut actual = ValueLabels::new();
+        actual.insert(windows_1252("abcd"), "Label 1");
+        actual.insert(windows_1252("éèäî"), "Label 2");
+        actual.codepage_to_unicode(WINDOWS_1252);
+
+        let mut expected = ValueLabels::new();
+        expected.insert(Datum::String(ByteString::from("abcd        ")), "Label 1");
+        expected.insert(Datum::String(ByteString::from("éèäî    ")), "Label 2");
+
+        assert_eq!(&actual, &expected);
+    }
 }