Implement vectors accessor for dictionary
author Ben Pfaff <blp@cs.stanford.edu>
Tue, 22 Jul 2025 21:40:43 +0000 (14:40 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Tue, 22 Jul 2025 21:40:43 +0000 (14:40 -0700)
rust/pspp/src/dictionary.rs

index 60dc4f9d5213886c680f968a768933583121bf99..623d7f3b06e4233d7f462a75ad775064035437a1 100644 (file)
@@ -23,7 +23,7 @@ use std::{
     collections::{BTreeMap, BTreeSet, HashMap, HashSet},
     fmt::{Debug, Display, Formatter, Result as FmtResult},
     hash::{DefaultHasher, Hash, Hasher},
-    ops::{Bound, Not, RangeBounds, RangeInclusive},
+    ops::{Bound, Index, Not, RangeBounds, RangeInclusive},
     str::FromStr,
 };
 
@@ -31,7 +31,10 @@ use encoding_rs::Encoding;
 use enum_map::{Enum, EnumMap};
 use indexmap::IndexSet;
 use num::integer::div_ceil;
-use serde::{ser::SerializeStruct, Serialize};
+use serde::{
+    ser::{SerializeMap, SerializeSeq, SerializeStruct},
+    Serialize,
+};
 use smallvec::SmallVec;
 use thiserror::Error as ThisError;
 use unicase::UniCase;
@@ -281,7 +284,7 @@ pub struct Dictionary {
     pub documents: Vec<String>,
 
     /// Named collections of variables within the dictionary.
-    pub vectors: HashSet<ByIdentifier<Vector>>,
+    pub vectors: HashSet<ByIdentifier<DictIndexVector>>,
 
     /// Attributes for the dictionary itself.
     ///
@@ -311,7 +314,7 @@ impl Serialize for Dictionary {
         map.serialize_field("weight", &self.weight_var())?;
         map.serialize_field("filter", &self.filter_var())?;
         map.serialize_field("documents", &self.documents)?;
-        // vectors
+        map.serialize_field("vectors", &self.vectors())?;
         map.serialize_field("attributes", &self.attributes)?;
         map.serialize_field("mrsets", &self.mrsets)?;
         //variable sets
@@ -369,6 +372,10 @@ impl Dictionary {
             .collect()
     }
 
+    /// Returns a [`Vectors`] view over this dictionary.
+    ///
+    /// The view borrows the dictionary and hands out its vectors as
+    /// [`Vector`] values instead of the stored [`DictIndexVector`]s.
+    pub fn vectors(&self) -> Vectors<'_> {
+        Vectors::new(self)
+    }
+
     /// Adds `variable` at the end of the dictionary and returns its index.
     ///
     /// The operation fails if the dictionary already contains a variable with
@@ -1325,12 +1332,12 @@ impl HasIdentifier for Variable {
 }
 
+/// A named vector in the form it is stored inside a dictionary: its member
+/// variables are recorded as dictionary indexes.
 #[derive(Clone, Debug)]
-pub struct Vector {
+pub struct DictIndexVector {
+    /// The vector's name.
     pub name: Identifier,
+    /// Dictionary indexes of the vector's member variables, in vector order.
     pub variables: Vec<DictIndex>,
 }
 
-impl Vector {
+impl DictIndexVector {
     fn with_updated_dict_indexes(
         mut self,
         f: impl Fn(DictIndex) -> Option<DictIndex>,
@@ -1340,12 +1347,178 @@ impl Vector {
     }
 }
 
-impl HasIdentifier for Vector {
+impl HasIdentifier for DictIndexVector {
+    /// Returns the `UniCase<String>` wrapped by the vector's name.
     fn identifier(&self) -> &UniCase<String> {
         &self.name.0
     }
 }
 
+/// A borrowed view of one vector in a [`Dictionary`].
+///
+/// Pairs a stored [`DictIndexVector`] with the dictionary it belongs to, so
+/// that the vector's dictionary indexes can be resolved to variables.  The
+/// view holds only shared references, so copying it is cheap.
+#[derive(Clone, Copy, Debug)]
+pub struct Vector<'a> {
+    /// The dictionary in which the vector's indexes are resolved.
+    dictionary: &'a Dictionary,
+    /// The stored vector: a name plus dictionary indexes.
+    vector: &'a DictIndexVector,
+}
+
+impl<'a> Vector<'a> {
+    /// Returns the vector's name.
+    pub fn name(&self) -> &'a Identifier {
+        &self.vector.name
+    }
+    /// Returns a [`MappedVariables`] view of the vector's member variables,
+    /// built from the vector's dictionary indexes and its dictionary.
+    pub fn variables(&self) -> MappedVariables<'a> {
+        MappedVariables::new(self.dictionary, &self.vector.variables)
+    }
+}
+
+/// A borrowed, slice-like view of variables identified by dictionary index.
+///
+/// Each element of `dict_indexes` is resolved against
+/// `dictionary.variables` on access.  The view holds only shared references,
+/// so copying it is cheap.
+#[derive(Clone, Copy, Debug)]
+pub struct MappedVariables<'a> {
+    /// The dictionary whose variables the indexes refer to.
+    dictionary: &'a Dictionary,
+    /// The dictionary indexes, in the order the view presents them.
+    dict_indexes: &'a [DictIndex],
+}
+
+impl<'a> MappedVariables<'a> {
+    /// Creates a view that resolves each of `dict_indexes` in `dictionary`.
+    fn new(dictionary: &'a Dictionary, dict_indexes: &'a [DictIndex]) -> Self {
+        Self {
+            dictionary,
+            dict_indexes,
+        }
+    }
+
+    /// Returns the number of variables in the view.
+    pub fn len(&self) -> usize {
+        self.dict_indexes.len()
+    }
+
+    /// Returns true if the view contains no variables.
+    pub fn is_empty(&self) -> bool {
+        self.dict_indexes.is_empty()
+    }
+
+    /// Returns the variable at position `index` in the view, or `None` if
+    /// `index` is not less than [`len`](Self::len).
+    ///
+    /// NOTE(review): the stored dictionary index is applied to
+    /// `dictionary.variables` with `[]`, which presumably panics if that
+    /// index is not valid for the dictionary -- confirm.
+    pub fn get(&self, index: usize) -> Option<&'a Variable> {
+        self.dict_indexes
+            .get(index)
+            .map(|dict_index| &*self.dictionary.variables[*dict_index])
+    }
+
+    /// Returns an iterator over the variables, in the order of the
+    /// underlying dictionary indexes.
+    pub fn iter(&self) -> MappedVariablesIter<'a> {
+        MappedVariablesIter::new(self.dictionary, self.dict_indexes.iter())
+    }
+}
+
+impl<'a> Index<usize> for MappedVariables<'a> {
+    type Output = Variable;
+
+    /// Returns the variable at position `index` in the view.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is out of bounds for the view's slice of dictionary
+    /// indexes.
+    ///
+    /// NOTE(review): the stored dictionary index is then applied to
+    /// `dictionary.variables` with `[]`, which presumably also panics if it
+    /// is not valid for the dictionary -- confirm.
+    fn index(&self, index: usize) -> &Self::Output {
+        &*self.dictionary.variables[self.dict_indexes[index]]
+    }
+}
+
+impl<'a> Serialize for MappedVariables<'a> {
+    /// Serializes the view as a sequence holding each variable's `name`, in
+    /// the order of the view.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let count = self.len();
+        let mut names = serializer.serialize_seq(Some(count))?;
+        self.iter().try_for_each(|variable| names.serialize_element(&variable.name))?;
+        names.end()
+    }
+}
+
+impl<'a> IntoIterator for &MappedVariables<'a> {
+    type Item = &'a Variable;
+
+    type IntoIter = MappedVariablesIter<'a>;
+
+    /// Returns the same iterator as [`MappedVariables::iter`], which allows
+    /// `for variable in &mapped_variables`.
+    fn into_iter(self) -> Self::IntoIter {
+        self.iter()
+    }
+}
+
+/// Iterator over the variables of a [`MappedVariables`] view.
+pub struct MappedVariablesIter<'a> {
+    /// The dictionary in which each index is resolved.
+    dictionary: &'a Dictionary,
+    /// The dictionary indexes that have not been yielded yet.
+    dict_indexes: std::slice::Iter<'a, DictIndex>,
+}
+
+impl<'a> MappedVariablesIter<'a> {
+    /// Creates an iterator that resolves each index yielded by
+    /// `dict_indexes` to a variable in `dictionary`.
+    pub fn new(dictionary: &'a Dictionary, dict_indexes: std::slice::Iter<'a, DictIndex>) -> Self {
+        Self {
+            dictionary,
+            dict_indexes,
+        }
+    }
+}
+
+impl<'a> Iterator for MappedVariablesIter<'a> {
+    type Item = &'a Variable;
+
+    /// Advances to the next dictionary index and returns the variable it
+    /// refers to, or `None` once the indexes are exhausted.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.dict_indexes
+            .next()
+            .map(|dict_index| &*self.dictionary.variables[*dict_index])
+    }
+
+    /// Reports the exact number of remaining variables, taken from the
+    /// underlying slice iterator (each index maps to exactly one item).
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.dict_indexes.size_hint()
+    }
+}
+
+// `size_hint` is exact because it is forwarded from `std::slice::Iter`.
+impl<'a> ExactSizeIterator for MappedVariablesIter<'a> {}
+
+/// Iterator over the vectors of a dictionary, yielding [`Vector`] views.
+pub struct VectorsIter<'a> {
+    /// The dictionary attached to every yielded [`Vector`].
+    dictionary: &'a Dictionary,
+    /// Iterator over the dictionary's stored vectors.
+    iter: std::collections::hash_set::Iter<'a, ByIdentifier<DictIndexVector>>,
+}
+
+impl<'a> VectorsIter<'a> {
+    /// Creates an iterator over the vectors stored in `dictionary`.
+    fn new(dictionary: &'a Dictionary) -> Self {
+        Self {
+            dictionary,
+            iter: dictionary.vectors.iter(),
+        }
+    }
+}
+impl<'a> Iterator for VectorsIter<'a> {
+    type Item = Vector<'a>;
+
+    /// Returns the next stored vector wrapped in a [`Vector`] view, or
+    /// `None` once every vector has been yielded.
+    ///
+    /// The order is that of the underlying `HashSet` iterator, which is
+    /// unspecified.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.iter.next().map(|vector| Vector {
+            dictionary: self.dictionary,
+            vector,
+        })
+    }
+
+    /// Reports the exact number of remaining vectors, taken from the
+    /// underlying set iterator (each entry maps to exactly one item).
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.iter.size_hint()
+    }
+}
+
+// `size_hint` is exact because it is forwarded from `hash_set::Iter`.
+impl<'a> ExactSizeIterator for VectorsIter<'a> {}
+
+/// A borrowed view of all the vectors in a [`Dictionary`].
+///
+/// Wraps a reference to the dictionary; lookups and iteration return
+/// [`Vector`] views.
+#[derive(Debug)]
+pub struct Vectors<'a>(&'a Dictionary);
+
+impl<'a> Vectors<'a> {
+    /// Creates a view of the vectors in `dictionary`.
+    fn new(dictionary: &'a Dictionary) -> Self {
+        Self(dictionary)
+    }
+
+    /// Returns the number of vectors in the dictionary.
+    pub fn len(&self) -> usize {
+        self.0.vectors.len()
+    }
+
+    /// Returns true if the dictionary has no vectors.
+    pub fn is_empty(&self) -> bool {
+        self.0.vectors.is_empty()
+    }
+
+    /// Returns the vector named `name`, or `None` if the dictionary has no
+    /// such vector.
+    ///
+    /// The lookup key is the `UniCase` string inside `name`.
+    pub fn get(&self, name: &Identifier) -> Option<Vector<'a>> {
+        self.0.vectors.get(&name.0).map(|vector| Vector {
+            dictionary: self.0,
+            vector,
+        })
+    }
+
+    /// Returns an iterator over the vectors, in the unspecified order of the
+    /// underlying `HashSet`.
+    pub fn iter(&self) -> VectorsIter<'a> {
+        VectorsIter::new(self.0)
+    }
+}
+
+impl<'a> IntoIterator for &Vectors<'a> {
+    type Item = Vector<'a>;
+
+    type IntoIter = VectorsIter<'a>;
+
+    /// Returns the same iterator as [`Vectors::iter`], which allows
+    /// `for vector in &vectors`.
+    fn into_iter(self) -> Self::IntoIter {
+        self.iter()
+    }
+}
+
+impl<'a> Serialize for Vectors<'a> {
+    /// Serializes the vectors as a map from each vector's name to its
+    /// serialized [`MappedVariables`].
+    ///
+    /// NOTE: entries are emitted in the iteration order of the dictionary's
+    /// `HashSet` of vectors, which is unspecified, so the entry order of the
+    /// output is not guaranteed to be stable.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let mut map = serializer.serialize_map(Some(self.len()))?;
+        for vector in self {
+            // Key and value are both available, so hand them to the
+            // serializer together rather than in two separate calls.
+            map.serialize_entry(vector.name(), &vector.variables())?;
+        }
+        map.end()
+    }
+}
+
 /// Variables that represent multiple responses to a survey question.
 #[derive(Clone, Debug, Serialize)]
 pub struct MultipleResponseSet {