From 9e5e645370759a85b5117a58677ff828e1dca7d8 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 22 Jul 2025 14:40:43 -0700 Subject: [PATCH] Implement vectors accessor for dictionary --- rust/pspp/src/dictionary.rs | 187 ++++++++++++++++++++++++++++++++++-- 1 file changed, 180 insertions(+), 7 deletions(-) diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 60dc4f9d52..623d7f3b06 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -23,7 +23,7 @@ use std::{ collections::{BTreeMap, BTreeSet, HashMap, HashSet}, fmt::{Debug, Display, Formatter, Result as FmtResult}, hash::{DefaultHasher, Hash, Hasher}, - ops::{Bound, Not, RangeBounds, RangeInclusive}, + ops::{Bound, Index, Not, RangeBounds, RangeInclusive}, str::FromStr, }; @@ -31,7 +31,10 @@ use encoding_rs::Encoding; use enum_map::{Enum, EnumMap}; use indexmap::IndexSet; use num::integer::div_ceil; -use serde::{ser::SerializeStruct, Serialize}; +use serde::{ + ser::{SerializeMap, SerializeSeq, SerializeStruct}, + Serialize, +}; use smallvec::SmallVec; use thiserror::Error as ThisError; use unicase::UniCase; @@ -281,7 +284,7 @@ pub struct Dictionary { pub documents: Vec, /// Named collections of variables within the dictionary. - pub vectors: HashSet>, + pub vectors: HashSet>, /// Attributes for the dictionary itself. /// @@ -311,7 +314,7 @@ impl Serialize for Dictionary { map.serialize_field("weight", &self.weight_var())?; map.serialize_field("filter", &self.filter_var())?; map.serialize_field("documents", &self.documents)?; - // vectors + map.serialize_field("vectors", &self.vectors())?; map.serialize_field("attributes", &self.attributes)?; map.serialize_field("mrsets", &self.mrsets)?; //variable sets @@ -369,6 +372,10 @@ impl Dictionary { .collect() } + pub fn vectors(&self) -> Vectors<'_> { + Vectors::new(self) + } + /// Adds `variable` at the end of the dictionary and returns its index. /// /// The operation fails if the dictionary already contains a variable with @@ -1325,12 +1332,12 @@ impl HasIdentifier for Variable { } #[derive(Clone, Debug)] -pub struct Vector { +pub struct DictIndexVector { pub name: Identifier, pub variables: Vec, } -impl Vector { +impl DictIndexVector { fn with_updated_dict_indexes( mut self, f: impl Fn(DictIndex) -> Option, @@ -1340,12 +1347,178 @@ impl Vector { } } -impl HasIdentifier for Vector { +impl HasIdentifier for DictIndexVector { fn identifier(&self) -> &UniCase { &self.name.0 } } +pub struct Vector<'a> { + dictionary: &'a Dictionary, + vector: &'a DictIndexVector, +} + +impl<'a> Vector<'a> { + pub fn name(&self) -> &'a Identifier { + &self.vector.name + } + pub fn variables(&self) -> MappedVariables<'a> { + MappedVariables::new(self.dictionary, &self.vector.variables) + } +} + +pub struct MappedVariables<'a> { + dictionary: &'a Dictionary, + dict_indexes: &'a [DictIndex], +} + +impl<'a> MappedVariables<'a> { + fn new(dictionary: &'a Dictionary, dict_indexes: &'a [DictIndex]) -> Self { + Self { + dictionary, + dict_indexes, + } + } + + pub fn len(&self) -> usize { + self.dict_indexes.len() + } + + pub fn get(&self, index: usize) -> Option<&'a Variable> { + self.dict_indexes + .get(index) + .map(|dict_index| &*self.dictionary.variables[*dict_index]) + } + + pub fn iter(&self) -> MappedVariablesIter<'a> { + MappedVariablesIter::new(self.dictionary, self.dict_indexes.iter()) + } +} + +impl<'a> Index for MappedVariables<'a> { + type Output = Variable; + + fn index(&self, index: usize) -> &Self::Output { + &*self.dictionary.variables[self.dict_indexes[index]] + } +} + +impl<'a> Serialize for MappedVariables<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut seq = serializer.serialize_seq(Some(self.len()))?; + for variable in self { + seq.serialize_element(&variable.name)?; + } + seq.end() + } +} + +impl<'a> IntoIterator for &MappedVariables<'a> { + type Item = &'a Variable; + + type IntoIter = MappedVariablesIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +pub struct MappedVariablesIter<'a> { + dictionary: &'a Dictionary, + dict_indexes: std::slice::Iter<'a, DictIndex>, +} + +impl<'a> MappedVariablesIter<'a> { + pub fn new(dictionary: &'a Dictionary, dict_indexes: std::slice::Iter<'a, DictIndex>) -> Self { + Self { + dictionary, + dict_indexes, + } + } +} + +impl<'a> Iterator for MappedVariablesIter<'a> { + type Item = &'a Variable; + + fn next(&mut self) -> Option { + self.dict_indexes + .next() + .map(|dict_index| &*self.dictionary.variables[*dict_index]) + } +} + +pub struct VectorsIter<'a> { + dictionary: &'a Dictionary, + iter: std::collections::hash_set::Iter<'a, ByIdentifier>, +} + +impl<'a> VectorsIter<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self { + dictionary, + iter: dictionary.vectors.iter(), + } + } +} +impl<'a> Iterator for VectorsIter<'a> { + type Item = Vector<'a>; + + fn next(&mut self) -> Option { + self.iter.next().map(|vector| Vector { + dictionary: self.dictionary, + vector, + }) + } +} + +#[derive(Debug)] +pub struct Vectors<'a>(&'a Dictionary); + +impl<'a> Vectors<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self(dictionary) + } + pub fn len(&self) -> usize { + self.0.vectors.len() + } + pub fn get(&self, name: &Identifier) -> Option> { + self.0.vectors.get(&name.0).map(|vector| Vector { + dictionary: self.0, + vector: &*vector, + }) + } + pub fn iter(&self) -> VectorsIter<'a> { + VectorsIter::new(self.0) + } +} + +impl<'a> IntoIterator for &Vectors<'a> { + type Item = Vector<'a>; + + type IntoIter = VectorsIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl<'a> Serialize for Vectors<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut map = serializer.serialize_map(Some(self.len()))?; + for vector in self { + map.serialize_key(vector.name())?; + map.serialize_value(&vector.variables())?; + } + map.end() + } +} + /// Variables that represent multiple responses to a survey question. #[derive(Clone, Debug, Serialize)] pub struct MultipleResponseSet { -- 2.30.2