work
[pspp] / rust / src / dictionary.rs
diff --git a/rust/src/dictionary.rs b/rust/src/dictionary.rs
new file mode 100644 (file)
index 0000000..fd408d7
--- /dev/null
@@ -0,0 +1,292 @@
+use std::{
+    collections::{HashMap, HashSet},
+    fmt::Debug,
+};
+
+use encoding_rs::Encoding;
+use indexmap::IndexSet;
+
+use crate::{
+    cooked::{Alignment, Measure, MissingValues, Value, VarWidth},
+    format::Format,
+    identifier::{ByIdentifier, HasIdentifier, Identifier},
+    raw::CategoryLabels,
+};
+
+/// Index type used for references between the parts of a [`Dictionary`]: its split-file list,
+/// its weight and filter settings, and the members of its vectors and sets.
+///
+/// NOTE(review): presumably a position within `Dictionary::variables` -- confirm.
+pub type DictIndex = usize;
+
+/// A set of variables together with dictionary-wide settings and named groups that refer to
+/// variables by [`DictIndex`].
+#[derive(Clone, Debug)]
+pub struct Dictionary {
+    /// The variables, as an ordered set keyed by identifier.
+    pub variables: IndexSet<ByIdentifier<Variable>>,
+    /// Dictionary indexes recorded as the split-file variables.
+    pub split_file: Vec<DictIndex>,
+    /// Dictionary index recorded as the weight variable, if any.
+    pub weight: Option<DictIndex>,
+    /// Dictionary index recorded as the filter variable, if any.
+    pub filter: Option<DictIndex>,
+    /// Optional case limit.
+    ///
+    /// NOTE(review): presumably the maximum number of cases to process -- confirm.
+    pub case_limit: Option<u64>,
+    /// Optional file label.
+    pub file_label: Option<String>,
+    /// Document strings (NOTE(review): presumably one entry per document line -- confirm).
+    pub documents: Vec<String>,
+    /// Named vectors, keyed by identifier.
+    pub vectors: HashSet<ByIdentifier<Vector>>,
+    /// Dictionary-level attributes, keyed by identifier.
+    pub attributes: HashSet<ByIdentifier<Attribute>>,
+    /// Multiple response sets, keyed by identifier.
+    pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
+    /// Variable sets, keyed by identifier.
+    pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
+    /// Character encoding associated with the dictionary.
+    pub encoding: &'static Encoding,
+}
+
+impl Dictionary {
+    pub fn new(encoding: &'static Encoding) -> Self {
+        Self {
+            variables: IndexSet::new(),
+            split_file: Vec::new(),
+            weight: None,
+            filter: None,
+            case_limit: None,
+            file_label: None,
+            documents: Vec::new(),
+            vectors: HashSet::new(),
+            attributes: HashSet::new(),
+            mrsets: HashSet::new(),
+            variable_sets: HashSet::new(),
+            encoding,
+        }
+    }
+
+    pub fn delete_vars(&mut self, start: DictIndex, count: usize) {
+        self.update_dict_indexes(&|index| {
+            if index < start {
+                Some(index)
+            } else if index < start + count {
+                None
+            } else {
+                Some(index - count)
+            }
+        })
+    }
+
+    fn update_dict_indexes<F>(&mut self, f: &F)
+    where
+        F: Fn(DictIndex) -> Option<DictIndex>,
+    {
+        update_dict_index_vec(&mut self.split_file, f);
+        self.weight = self.weight.map(|index| f(index)).flatten();
+        self.filter = self.filter.map(|index| f(index)).flatten();
+        self.vectors = self
+            .vectors
+            .drain()
+            .filter_map(|vector_by_id| {
+                vector_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(|vector| ByIdentifier::new(vector))
+            })
+            .collect();
+        self.mrsets = self
+            .mrsets
+            .drain()
+            .filter_map(|mrset_by_id| {
+                mrset_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(|mrset| ByIdentifier::new(mrset))
+            })
+            .collect();
+        self.variable_sets = self
+            .variable_sets
+            .drain()
+            .filter_map(|var_set_by_id| {
+                var_set_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(|var_set| ByIdentifier::new(var_set))
+            })
+            .collect();
+    }
+}
+
+fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
+where
+    F: Fn(DictIndex) -> Option<DictIndex>,
+{
+    dict_indexes.retain_mut(|index| {
+        if let Some(new) = f(*index) {
+            *index = new;
+            true
+        } else {
+            false
+        }
+    });
+}
+
+/// Role of a variable, as stored in `Variable::role`.
+///
+/// NOTE(review): the variants presumably mirror the variable roles defined by SPSS -- confirm.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub enum Role {
+    Input,
+    Target,
+    Both,
+    None,
+    Partition,
+    Split,
+}
+
+/// A variable in a [`Dictionary`], identified by its `name`.
+#[derive(Clone, Debug)]
+pub struct Variable {
+    /// Name, which also serves as the variable's identifier.
+    pub name: Identifier,
+    /// Width of the variable's values.
+    pub width: VarWidth,
+    /// Missing-value specification.
+    pub missing_values: MissingValues,
+    /// Print format.
+    pub print_format: Format,
+    /// Write format.
+    pub write_format: Format,
+    /// Labels for individual values, keyed by value.
+    pub value_labels: HashMap<Value, String>,
+    /// Optional variable label.
+    pub label: Option<String>,
+    /// Measurement level.
+    pub measure: Measure,
+    /// Role of the variable.
+    pub role: Role,
+    /// Display width (NOTE(review): units are not visible here -- confirm).
+    pub display_width: u32,
+    /// Display alignment.
+    pub alignment: Alignment,
+    /// "Leave" flag.
+    ///
+    /// NOTE(review): presumably whether the value is retained between cases -- confirm.
+    pub leave: bool,
+    /// Short names associated with the variable.
+    pub short_names: Vec<Identifier>,
+    /// Variable-level attributes, keyed by identifier.
+    pub attributes: HashSet<ByIdentifier<Attribute>>,
+}
+
+impl HasIdentifier for Variable {
+    /// Returns the variable's `name`, which serves as its identifier.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// A named list of variables, referenced by dictionary index.
+#[derive(Clone, Debug)]
+pub struct Vector {
+    /// Name, which also serves as the vector's identifier.
+    pub name: Identifier,
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl Vector {
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (!self.variables.is_empty()).then_some(self)
+    }
+}
+
+impl HasIdentifier for Vector {
+    /// Returns the vector's `name`, which serves as its identifier.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// A named attribute carrying a list of string values.
+#[derive(Clone, Debug)]
+pub struct Attribute {
+    /// Name, which also serves as the attribute's identifier.
+    pub name: Identifier,
+    /// The attribute's values.
+    pub values: Vec<String>,
+}
+
+impl HasIdentifier for Attribute {
+    /// Returns the attribute's `name`, which serves as its identifier.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// A named multiple response set: a group of variables, referenced by dictionary index,
+/// together with a label and a [`MultipleResponseType`].
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+    /// Name, which also serves as the set's identifier.
+    pub name: Identifier,
+    /// Label for the set.
+    pub label: String,
+    /// Kind of multiple response set.
+    pub mr_type: MultipleResponseType,
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl MultipleResponseSet {
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (self.variables.len() > 1).then_some(self)
+    }
+}
+
+impl HasIdentifier for MultipleResponseSet {
+    /// Returns the set's `name`, which serves as its identifier.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// Kind of a [`MultipleResponseSet`].
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+    /// Multiple dichotomy set.
+    MultipleDichotomy {
+        /// Value associated with the set.
+        ///
+        /// NOTE(review): presumably the "counted" value that marks a response -- confirm.
+        value: Value,
+        /// Source of the category labels.
+        labels: CategoryLabels,
+    },
+    /// Multiple category set.
+    MultipleCategory,
+}
+
+/// A named set of variables, referenced by dictionary index.
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+    /// Name, which also serves as the set's identifier.
+    pub name: Identifier,
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl VariableSet {
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (!self.variables.is_empty()).then_some(self)
+    }
+}
+
+impl HasIdentifier for VariableSet {
+    /// Returns the set's `name`, which serves as its identifier.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::collections::HashSet;
+
+    use crate::identifier::Identifier;
+
+    use super::{ByIdentifier, HasIdentifier};
+
+    #[derive(PartialEq, Eq, Debug, Clone)]
+    struct Variable {
+        name: Identifier,
+        value: i32,
+    }
+
+    impl HasIdentifier for Variable {
+        fn identifier(&self) -> &Identifier {
+            &self.name
+        }
+    }
+
+    #[test]
+    fn test() {
+        // Variables should not be the same if their values differ.
+        let abcd = Identifier::new_utf8("abcd").unwrap();
+        let abcd1 = Variable {
+            name: abcd.clone(),
+            value: 1,
+        };
+        let abcd2 = Variable {
+            name: abcd,
+            value: 2,
+        };
+        assert_ne!(abcd1, abcd2);
+
+        // But `ByName` should treat them the same.
+        let abcd1_by_name = ByIdentifier::new(abcd1);
+        let abcd2_by_name = ByIdentifier::new(abcd2);
+        assert_eq!(abcd1_by_name, abcd2_by_name);
+
+        // And a `HashSet` of `ByName` should also treat them the same.
+        let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
+        assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
+        assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
+        assert_eq!(
+            vars.get(&Identifier::new_utf8("abcd").unwrap())
+                .unwrap()
+                .0
+                .value,
+            1
+        );
+    }
+}