From ac32d9e296986c677a076326b11c9a9b04ddfa1c Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 22 Jul 2025 17:26:14 -0700 Subject: [PATCH] work on making dictionary fields private --- rust/pspp/src/dictionary.rs | 308 +++++++++++++++++++++++++++++++++--- rust/pspp/src/sys/cooked.rs | 21 ++- rust/pspp/src/sys/write.rs | 2 +- 3 files changed, 299 insertions(+), 32 deletions(-) diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 623d7f3b06..181639c3c1 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -20,7 +20,7 @@ use core::str; use std::{ borrow::Cow, cmp::Ordering, - collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + collections::{btree_set, BTreeMap, BTreeSet, HashMap, HashSet}, fmt::{Debug, Display, Formatter, Result as FmtResult}, hash::{DefaultHasher, Hash, Hasher}, ops::{Bound, Index, Not, RangeBounds, RangeInclusive}, @@ -260,19 +260,19 @@ pub struct Dictionary { pub variables: IndexSet>, /// Indexes into `variables` of the `SPLIT FILE` variables. - pub split_file: Vec, + split_file: Vec, /// Index of the weight variable, if any. /// /// The weight variable must be numeric. - pub weight: Option, + weight: Option, /// Index of the filter variable, if any. /// /// The filter variable must be numeric. If there is a filter variable, /// then data analysis excludes cases whose filter value is zero or system- /// or user-missing. - pub filter: Option, + filter: Option, /// An optional limit on the number of cases read by procedures. pub case_limit: Option, @@ -292,12 +292,12 @@ pub struct Dictionary { pub attributes: Attributes, /// Multiple response sets. - pub mrsets: BTreeSet>, + pub mrsets: BTreeSet>, /// Variable sets. /// /// Only the GUI makes use of variable sets. - pub variable_sets: Vec, + pub variable_sets: Vec, /// Character encoding for the dictionary and the data. pub encoding: &'static Encoding, @@ -316,8 +316,8 @@ impl Serialize for Dictionary { map.serialize_field("documents", &self.documents)?; map.serialize_field("vectors", &self.vectors())?; map.serialize_field("attributes", &self.attributes)?; - map.serialize_field("mrsets", &self.mrsets)?; - //variable sets + map.serialize_field("mrsets", &self.mrsets())?; + map.serialize_field("variable_sets", &self.variable_sets())?; map.serialize_field("encoding", self.encoding)?; map.end() } @@ -335,6 +335,16 @@ pub enum AddVarError { }, } +/// Weight variable must be numeric. +#[derive(Debug, ThisError)] +#[error("Weight variable must be numeric.")] +pub struct InvalidWeightVariable; + +/// Filter variable must be numeric. +#[derive(Debug, ThisError)] +#[error("Filter variable must be numeric.")] +pub struct InvalidFilterVariable; + impl Dictionary { /// Creates a new, empty dictionary with the specified `encoding`. pub fn new(encoding: &'static Encoding) -> Self { @@ -359,23 +369,78 @@ impl Dictionary { self.weight.map(|index| &self.variables[index].0) } + /// Returns the weight variable's dictionary index. + pub fn weight_index(&self) -> Option { + self.weight + } + + /// Sets the weight variable to the variable with the given dictionary + /// index. + /// + /// # Panic + /// + /// Panics if `dict_index` is not a valid dictionary index. + pub fn set_weight( + &mut self, + dict_index: Option, + ) -> Result<(), InvalidWeightVariable> { + if let Some(dict_index) = dict_index + && !self.variables[dict_index].width.is_numeric() + { + Err(InvalidWeightVariable) + } else { + self.weight = dict_index; + Ok(()) + } + } + /// Returns a reference to the filter variable, if any. pub fn filter_var(&self) -> Option<&Variable> { self.filter.map(|index| &self.variables[index].0) } - /// Returns references to all the split variables, if any. - pub fn split_vars(&self) -> Vec<&Variable> { - self.split_file - .iter() - .map(|index| &self.variables[*index].0) - .collect() + /// Returns the filter variable's dictionary index. + pub fn filter_index(&self) -> Option { + self.filter + } + + /// Sets the filter variable to the variable with the given dictionary + /// index. + /// + /// # Panic + /// + /// Panics if `dict_index` is not a valid dictionary index. + pub fn set_filter( + &mut self, + dict_index: Option, + ) -> Result<(), InvalidFilterVariable> { + if let Some(dict_index) = dict_index + && !self.variables[dict_index].width.is_numeric() + { + Err(InvalidFilterVariable) + } else { + self.filter = dict_index; + Ok(()) + } + } + + /// Returns the split variables. + pub fn split_vars(&self) -> MappedVariables<'_> { + MappedVariables::new(self, &self.split_file) } pub fn vectors(&self) -> Vectors<'_> { Vectors::new(self) } + pub fn mrsets(&self) -> MultipleResponseSets<'_> { + MultipleResponseSets::new(self) + } + + pub fn variable_sets(&self) -> VariableSets<'_> { + VariableSets::new(self) + } + /// Adds `variable` at the end of the dictionary and returns its index. /// /// The operation fails if the dictionary already contains a variable with @@ -1519,9 +1584,214 @@ impl<'a> Serialize for Vectors<'a> { } } +pub struct VariableSet<'a> { + dictionary: &'a Dictionary, + variable_set: &'a DictIndexVariableSet, +} + +impl<'a> VariableSet<'a> { + pub fn name(&self) -> &'a String { + &self.variable_set.name + } + pub fn variables(&self) -> MappedVariables<'a> { + MappedVariables::new(self.dictionary, &self.variable_set.variables) + } +} + +#[derive(Debug)] +pub struct VariableSets<'a>(&'a Dictionary); + +impl<'a> VariableSets<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self(dictionary) + } + pub fn len(&self) -> usize { + self.0.variable_sets.len() + } + pub fn get(&self, index: usize) -> Option> { + self.0 + .variable_sets + .get(index) + .map(|variable_set| VariableSet { + dictionary: self.0, + variable_set: &*variable_set, + }) + } + pub fn iter(&self) -> VariableSetsIter<'a> { + VariableSetsIter::new(self.0) + } +} + +impl<'a> IntoIterator for &VariableSets<'a> { + type Item = VariableSet<'a>; + + type IntoIter = VariableSetsIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl<'a> Serialize for VariableSets<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut map = serializer.serialize_map(Some(self.len()))?; + for variable_set in self { + map.serialize_key(variable_set.name())?; + map.serialize_value(&variable_set.variables())?; + } + map.end() + } +} + +pub struct VariableSetsIter<'a> { + dictionary: &'a Dictionary, + iter: std::slice::Iter<'a, DictIndexVariableSet>, +} + +impl<'a> VariableSetsIter<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self { + dictionary, + iter: dictionary.variable_sets.iter(), + } + } +} +impl<'a> Iterator for VariableSetsIter<'a> { + type Item = VariableSet<'a>; + + fn next(&mut self) -> Option { + self.iter.next().map(|variable_set| VariableSet { + dictionary: self.dictionary, + variable_set, + }) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSets<'a>(&'a Dictionary); + +impl<'a> MultipleResponseSets<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self(dictionary) + } + + pub fn len(&self) -> usize { + self.0.mrsets.len() + } + + pub fn get(&self, name: &Identifier) -> Option> { + self.0 + .mrsets + .get(&name.0) + .map(|mrset| MultipleResponseSet::new(self.0, mrset)) + } + + pub fn iter(&self) -> MultipleResponseSetIter<'a> { + MultipleResponseSetIter::new(self.0) + } +} + +impl<'a> IntoIterator for &MultipleResponseSets<'a> { + type Item = MultipleResponseSet<'a>; + + type IntoIter = MultipleResponseSetIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +pub struct MultipleResponseSetIter<'a> { + dictionary: &'a Dictionary, + iter: btree_set::Iter<'a, ByIdentifier>, +} + +impl<'a> MultipleResponseSetIter<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self { + dictionary, + iter: dictionary.mrsets.iter(), + } + } +} + +impl<'a> Iterator for MultipleResponseSetIter<'a> { + type Item = MultipleResponseSet<'a>; + + fn next(&mut self) -> Option { + self.iter + .next() + .map(|set| MultipleResponseSet::new(self.dictionary, set)) + } +} + +impl<'a> Serialize for MultipleResponseSets<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut seq = serializer.serialize_seq(Some(self.len()))?; + for set in self { + seq.serialize_element(&set)?; + } + seq.end() + } +} + +/// Variables that represent multiple responses to a survey question. +#[derive(Clone, Debug)] +pub struct MultipleResponseSet<'a> { + dictionary: &'a Dictionary, + mrset: &'a DictIndexMultipleResponseSet, +} + +impl<'a> MultipleResponseSet<'a> { + fn new(dictionary: &'a Dictionary, mrset: &'a DictIndexMultipleResponseSet) -> Self { + Self { dictionary, mrset } + } + + pub fn name(&self) -> &Identifier { + &self.mrset.name + } + + pub fn label(&self) -> &String { + &self.mrset.label + } + + pub fn width(&self) -> RangeInclusive { + self.mrset.width.clone() + } + + pub fn mr_type(&self) -> &MultipleResponseType { + &self.mrset.mr_type + } + + pub fn variables(&self) -> MappedVariables<'a> { + MappedVariables::new(self.dictionary, &self.mrset.variables) + } +} + +impl<'a> Serialize for MultipleResponseSet<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut map = serializer.serialize_map(Some(5))?; + map.serialize_entry("name", self.name())?; + map.serialize_entry("label", self.label())?; + map.serialize_entry("width", &self.width())?; + map.serialize_entry("type", self.mr_type())?; + map.serialize_entry("variables", &self.variables())?; + map.end() + } +} + /// Variables that represent multiple responses to a survey question. #[derive(Clone, Debug, Serialize)] -pub struct MultipleResponseSet { +pub struct DictIndexMultipleResponseSet { /// The set's name. pub name: Identifier, @@ -1538,7 +1808,7 @@ pub struct MultipleResponseSet { pub variables: Vec, } -impl MultipleResponseSet { +impl DictIndexMultipleResponseSet { fn with_updated_dict_indexes( mut self, f: impl Fn(DictIndex) -> Option, @@ -1548,7 +1818,7 @@ impl MultipleResponseSet { } } -impl HasIdentifier for MultipleResponseSet { +impl HasIdentifier for DictIndexMultipleResponseSet { fn identifier(&self) -> &UniCase { &self.name.0 } @@ -1604,12 +1874,12 @@ pub enum CategoryLabels { } #[derive(Clone, Debug)] -pub struct VariableSet { +pub struct DictIndexVariableSet { pub name: String, pub variables: Vec, } -impl VariableSet { +impl DictIndexVariableSet { fn with_updated_dict_indexes( mut self, f: impl Fn(DictIndex) -> Option, diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index c4c67bf753..14aafcf167 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -28,8 +28,8 @@ use crate::{ crypto::EncryptedFile, data::{Datum, RawString}, dictionary::{ - Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet, - MultipleResponseType, VarWidth, Variable, VariableSet, + DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, InvalidRole, MissingValues, + MissingValuesError, MultipleResponseType, VarWidth, Variable, }, endian::Endian, format::{Error as FormatError, Format, UncheckedFormat}, @@ -970,20 +970,17 @@ impl Records { }); } else { let (var_index, dict_index) = var_index_map.range(..=&index).last().unwrap(); - let variable = &dictionary.variables[*dict_index]; if *var_index == index { - if variable.is_numeric() { - dictionary.weight = Some(*dict_index); - } else { + if dictionary.set_weight(Some(*dict_index)).is_err() { warn(Error::InvalidWeightVar { index: weight_index, - name: variable.name.clone(), + name: dictionary.variables[*dict_index].name.clone(), }); } } else { warn(Error::WeightIndexStringContinuation { index: weight_index, - name: variable.name.clone(), + name: dictionary.variables[*dict_index].name.clone(), }); } } @@ -1075,7 +1072,7 @@ impl Records { .iter() .flat_map(|record| record.sets.iter()) { - match MultipleResponseSet::decode(&dictionary, record, &mut warn) { + match DictIndexMultipleResponseSet::decode(&dictionary, record, &mut warn) { Ok(mrset) => { dictionary.mrsets.insert(ByIdentifier::new(mrset)); } @@ -1291,7 +1288,7 @@ impl Records { }; variables.push(dict_index); } - let variable_set = VariableSet { + let variable_set = DictIndexVariableSet { name: record.name, variables, }; @@ -1531,7 +1528,7 @@ impl Decoder { } } -impl MultipleResponseSet { +impl DictIndexMultipleResponseSet { fn decode( dictionary: &Dictionary, input: &raw::records::MultipleResponseSet, @@ -1581,7 +1578,7 @@ impl MultipleResponseSet { let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, min_width)?; - Ok(MultipleResponseSet { + Ok(DictIndexMultipleResponseSet { name: mr_set_name, width: min_width..=max_width, label: input.label.to_string(), diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index 293c9beee1..859bdad154 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -206,7 +206,7 @@ where Some(Compression::ZLib) => 2, None => 0, }, - weight_index: if let Some(weight_index) = self.dictionary.weight { + weight_index: if let Some(weight_index) = self.dictionary.weight_index() { count_segments(&self.case_vars[..weight_index]) + 1 } else { 0 -- 2.30.2