+++ /dev/null
-use std::{
- collections::{HashMap, HashSet},
- fmt::Debug,
- ops::{Bound, RangeBounds},
-};
-
-use encoding_rs::Encoding;
-use indexmap::IndexSet;
-
-use crate::{
- cooked::{Value, VarWidth},
- format::Spec,
- identifier::{ByIdentifier, HasIdentifier, Identifier},
- raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType},
-};
-
/// Position of a variable within [`Dictionary::variables`].
///
/// The methods on [`Dictionary`] that move or remove variables renumber the
/// `DictIndex` values stored in the dictionary so that they keep referring to
/// the same variables.
pub type DictIndex = usize;
-
-#[derive(Clone, Debug)]
-pub struct Dictionary {
- pub variables: IndexSet<ByIdentifier<Variable>>,
- pub split_file: Vec<DictIndex>,
- pub weight: Option<DictIndex>,
- pub filter: Option<DictIndex>,
- pub case_limit: Option<u64>,
- pub file_label: Option<String>,
- pub documents: Vec<String>,
- pub vectors: HashSet<ByIdentifier<Vector>>,
- pub attributes: HashSet<ByIdentifier<Attribute>>,
- pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
- pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
- pub encoding: &'static Encoding,
-}
-
-impl Dictionary {
- pub fn new(encoding: &'static Encoding) -> Self {
- Self {
- variables: IndexSet::new(),
- split_file: Vec::new(),
- weight: None,
- filter: None,
- case_limit: None,
- file_label: None,
- documents: Vec::new(),
- vectors: HashSet::new(),
- attributes: HashSet::new(),
- mrsets: HashSet::new(),
- variable_sets: HashSet::new(),
- encoding,
- }
- }
-
- pub fn add_var(&mut self, variable: Variable) -> Result<(), ()> {
- if self.variables.insert(ByIdentifier::new(variable)) {
- Ok(())
- } else {
- Err(())
- }
- }
-
- pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
- if from_index != to_index {
- self.variables.move_index(from_index, to_index);
- self.update_dict_indexes(&|index| {
- if index == from_index {
- Some(to_index)
- } else if from_index < to_index {
- if index > from_index && index <= to_index {
- Some(index - 1)
- } else {
- Some(index)
- }
- } else {
- if index >= to_index && index < from_index {
- Some(index + 1)
- } else {
- Some(index)
- }
- }
- })
- }
- }
-
- pub fn retain_vars<F>(&mut self, keep: F)
- where
- F: Fn(&Variable) -> bool,
- {
- let mut deleted = Vec::new();
- let mut index = 0;
- self.variables.retain(|var_by_id| {
- let keep = keep(&var_by_id.0);
- if !keep {
- deleted.push(index);
- }
- index += 1;
- keep
- });
- if !deleted.is_empty() {
- self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
- Ok(_) => None,
- Err(position) => Some(position),
- })
- }
- }
-
- pub fn delete_vars<R>(&mut self, range: R)
- where
- R: RangeBounds<DictIndex>,
- {
- let start = match range.start_bound() {
- Bound::Included(&start) => start,
- Bound::Excluded(&start) => start + 1,
- Bound::Unbounded => 0,
- };
- let end = match range.end_bound() {
- Bound::Included(&end) => end + 1,
- Bound::Excluded(&end) => end,
- Bound::Unbounded => self.variables.len(),
- };
- if end > start {
- self.variables.drain(start..end);
- self.update_dict_indexes(&|index| {
- if index < start {
- Some(index)
- } else if index < end {
- None
- } else {
- Some(index - end - start)
- }
- })
- }
- }
-
- fn update_dict_indexes<F>(&mut self, f: &F)
- where
- F: Fn(DictIndex) -> Option<DictIndex>,
- {
- update_dict_index_vec(&mut self.split_file, f);
- self.weight = self.weight.map(|index| f(index)).flatten();
- self.filter = self.filter.map(|index| f(index)).flatten();
- self.vectors = self
- .vectors
- .drain()
- .filter_map(|vector_by_id| {
- vector_by_id
- .0
- .with_updated_dict_indexes(f)
- .map(|vector| ByIdentifier::new(vector))
- })
- .collect();
- self.mrsets = self
- .mrsets
- .drain()
- .filter_map(|mrset_by_id| {
- mrset_by_id
- .0
- .with_updated_dict_indexes(f)
- .map(|mrset| ByIdentifier::new(mrset))
- })
- .collect();
- self.variable_sets = self
- .variable_sets
- .drain()
- .filter_map(|var_set_by_id| {
- var_set_by_id
- .0
- .with_updated_dict_indexes(f)
- .map(|var_set| ByIdentifier::new(var_set))
- })
- .collect();
- }
-}
-
-fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
-where
- F: Fn(DictIndex) -> Option<DictIndex>,
-{
- dict_indexes.retain_mut(|index| {
- if let Some(new) = f(*index) {
- *index = new;
- true
- } else {
- false
- }
- });
-}
-
/// The role assigned to a variable (see [`Variable::role`]).
///
/// The default role is [`Role::Input`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Hash)]
pub enum Role {
    #[default]
    Input,
    Target,
    Both,
    None,
    Partition,
    Split,
}
-
/// Classification of a variable according to the first character of its
/// name, as computed by `DictClass::from_identifier`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum DictClass {
    /// The name starts with neither `$` nor `#`.
    Ordinary,

    /// The name starts with `$`.
    System,

    /// The name starts with `#`.
    Scratch,
}
-
-impl DictClass {
- pub fn from_identifier(id: &Identifier) -> Self {
- if id.0.starts_with('$') {
- Self::System
- } else if id.0.starts_with('#') {
- Self::Scratch
- } else {
- Self::Ordinary
- }
- }
-
- pub fn must_leave(self) -> bool {
- match self {
- DictClass::Ordinary => false,
- DictClass::System => false,
- DictClass::Scratch => true,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Variable {
- pub name: Identifier,
- pub width: VarWidth,
- pub missing_values: MissingValues,
- pub print_format: Spec,
- pub write_format: Spec,
- pub value_labels: HashMap<Value, String>,
- pub label: Option<String>,
- pub measure: Option<Measure>,
- pub role: Role,
- pub display_width: u32,
- pub alignment: Alignment,
- pub leave: bool,
- pub short_names: Vec<Identifier>,
- pub attributes: HashSet<ByIdentifier<Attribute>>,
-}
-
-impl Variable {
- pub fn new(name: Identifier, width: VarWidth) -> Self {
- let var_type = VarType::from_width(width);
- let leave = DictClass::from_identifier(&name).must_leave();
- Self {
- name,
- width,
- missing_values: MissingValues::default(),
- print_format: Spec::default_for_width(width),
- write_format: Spec::default_for_width(width),
- value_labels: HashMap::new(),
- label: None,
- measure: Measure::default_for_type(var_type),
- role: Role::default(),
- display_width: width.default_display_width(),
- alignment: Alignment::default_for_type(var_type),
- leave,
- short_names: Vec::new(),
- attributes: HashSet::new()
- }
- }
-}
-
-impl HasIdentifier for Variable {
- fn identifier(&self) -> &Identifier {
- &self.name
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Vector {
- pub name: Identifier,
- pub variables: Vec<DictIndex>,
-}
-
-impl Vector {
- fn with_updated_dict_indexes(
- mut self,
- f: impl Fn(DictIndex) -> Option<DictIndex>,
- ) -> Option<Self> {
- update_dict_index_vec(&mut self.variables, f);
- (!self.variables.is_empty()).then_some(self)
- }
-}
-
-impl HasIdentifier for Vector {
- fn identifier(&self) -> &Identifier {
- &self.name
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl HasIdentifier for Attribute {
- fn identifier(&self) -> &Identifier {
- &self.name
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet {
- pub name: Identifier,
- pub label: String,
- pub mr_type: MultipleResponseType,
- pub variables: Vec<DictIndex>,
-}
-
-impl MultipleResponseSet {
- fn with_updated_dict_indexes(
- mut self,
- f: impl Fn(DictIndex) -> Option<DictIndex>,
- ) -> Option<Self> {
- update_dict_index_vec(&mut self.variables, f);
- (self.variables.len() > 1).then_some(self)
- }
-}
-
-impl HasIdentifier for MultipleResponseSet {
- fn identifier(&self) -> &Identifier {
- &self.name
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: Value,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: Identifier,
- pub variables: Vec<DictIndex>,
-}
-
-impl VariableSet {
- fn with_updated_dict_indexes(
- mut self,
- f: impl Fn(DictIndex) -> Option<DictIndex>,
- ) -> Option<Self> {
- update_dict_index_vec(&mut self.variables, f);
- (!self.variables.is_empty()).then_some(self)
- }
-}
-
-impl HasIdentifier for VariableSet {
- fn identifier(&self) -> &Identifier {
- &self.name
- }
-}
-
#[cfg(test)]
mod test {
    use std::collections::HashSet;

    use crate::identifier::Identifier;

    use super::{ByIdentifier, HasIdentifier};

    /// Minimal identifier-bearing type for exercising `ByIdentifier`; it
    /// shadows the real `Variable` inside this module.
    #[derive(PartialEq, Eq, Debug, Clone)]
    struct Variable {
        name: Identifier,
        value: i32,
    }

    impl HasIdentifier for Variable {
        fn identifier(&self) -> &Identifier {
            &self.name
        }
    }

    #[test]
    fn test() {
        // Two variables with the same name but different values compare
        // unequal on their own...
        let name = Identifier::new_utf8("abcd").unwrap();
        let first = Variable {
            name: name.clone(),
            value: 1,
        };
        let second = Variable { name, value: 2 };
        assert_ne!(first, second);

        // ...but compare equal once wrapped in `ByIdentifier`.
        let first_by_id = ByIdentifier::new(first);
        let second_by_id = ByIdentifier::new(second);
        assert_eq!(first_by_id, second_by_id);

        // A `HashSet` of `ByIdentifier` therefore rejects the second one, and
        // a lookup by identifier finds the first.
        let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
        assert!(vars.insert(ByIdentifier::new(first_by_id.0.clone())));
        assert!(!vars.insert(ByIdentifier::new(second_by_id.0.clone())));
        let lookup_key = Identifier::new_utf8("abcd").unwrap();
        assert_eq!(vars.get(&lookup_key).unwrap().0.value, 1);
    }
}