work
[pspp] / rust / src / dictionary.rs
1 use std::{
2     collections::{HashMap, HashSet},
3     fmt::Debug,
4     ops::{Bound, RangeBounds},
5 };
6
7 use encoding_rs::Encoding;
8 use indexmap::IndexSet;
9
10 use crate::{
11     cooked::{Alignment, Measure, MissingValues, Value, VarWidth},
12     format::Format,
13     identifier::{ByIdentifier, HasIdentifier, Identifier},
14     raw::CategoryLabels,
15 };
16
17 pub type DictIndex = usize;
18
19 #[derive(Clone, Debug)]
20 pub struct Dictionary {
21     pub variables: IndexSet<ByIdentifier<Variable>>,
22     pub split_file: Vec<DictIndex>,
23     pub weight: Option<DictIndex>,
24     pub filter: Option<DictIndex>,
25     pub case_limit: Option<u64>,
26     pub file_label: Option<String>,
27     pub documents: Vec<String>,
28     pub vectors: HashSet<ByIdentifier<Vector>>,
29     pub attributes: HashSet<ByIdentifier<Attribute>>,
30     pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
31     pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
32     pub encoding: &'static Encoding,
33 }
34
35 impl Dictionary {
36     pub fn new(encoding: &'static Encoding) -> Self {
37         Self {
38             variables: IndexSet::new(),
39             split_file: Vec::new(),
40             weight: None,
41             filter: None,
42             case_limit: None,
43             file_label: None,
44             documents: Vec::new(),
45             vectors: HashSet::new(),
46             attributes: HashSet::new(),
47             mrsets: HashSet::new(),
48             variable_sets: HashSet::new(),
49             encoding,
50         }
51     }
52
53     pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
54         if from_index != to_index {
55             self.variables.move_index(from_index, to_index);
56             self.update_dict_indexes(&|index| {
57                 if index == from_index {
58                     Some(to_index)
59                 } else if from_index < to_index {
60                     if index > from_index && index <= to_index {
61                         Some(index - 1)
62                     } else {
63                         Some(index)
64                     }
65                 } else {
66                     if index >= to_index && index < from_index {
67                         Some(index + 1)
68                     } else {
69                         Some(index)
70                     }
71                 }
72             })
73         }
74     }
75
76     pub fn retain_vars<F>(&mut self, keep: F)
77     where
78         F: Fn(&Variable) -> bool,
79     {
80         let mut deleted = Vec::new();
81         let mut index = 0;
82         self.variables.retain(|var_by_id| {
83             let keep = keep(&var_by_id.0);
84             if !keep {
85                 deleted.push(index);
86             }
87             index += 1;
88             keep
89         });
90         if !deleted.is_empty() {
91             self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
92                 Ok(_) => None,
93                 Err(position) => Some(position),
94             })
95         }
96     }
97
98     pub fn delete_vars<R>(&mut self, range: R)
99     where
100         R: RangeBounds<DictIndex>,
101     {
102         let start = match range.start_bound() {
103             Bound::Included(&start) => start,
104             Bound::Excluded(&start) => start + 1,
105             Bound::Unbounded => 0,
106         };
107         let end = match range.end_bound() {
108             Bound::Included(&end) => end + 1,
109             Bound::Excluded(&end) => end,
110             Bound::Unbounded => self.variables.len(),
111         };
112         if end > start {
113             self.variables.drain(start..end);
114             self.update_dict_indexes(&|index| {
115                 if index < start {
116                     Some(index)
117                 } else if index < end {
118                     None
119                 } else {
120                     Some(index - end - start)
121                 }
122             })
123         }
124     }
125
126     fn update_dict_indexes<F>(&mut self, f: &F)
127     where
128         F: Fn(DictIndex) -> Option<DictIndex>,
129     {
130         update_dict_index_vec(&mut self.split_file, f);
131         self.weight = self.weight.map(|index| f(index)).flatten();
132         self.filter = self.filter.map(|index| f(index)).flatten();
133         self.vectors = self
134             .vectors
135             .drain()
136             .filter_map(|vector_by_id| {
137                 vector_by_id
138                     .0
139                     .with_updated_dict_indexes(f)
140                     .map(|vector| ByIdentifier::new(vector))
141             })
142             .collect();
143         self.mrsets = self
144             .mrsets
145             .drain()
146             .filter_map(|mrset_by_id| {
147                 mrset_by_id
148                     .0
149                     .with_updated_dict_indexes(f)
150                     .map(|mrset| ByIdentifier::new(mrset))
151             })
152             .collect();
153         self.variable_sets = self
154             .variable_sets
155             .drain()
156             .filter_map(|var_set_by_id| {
157                 var_set_by_id
158                     .0
159                     .with_updated_dict_indexes(f)
160                     .map(|var_set| ByIdentifier::new(var_set))
161             })
162             .collect();
163     }
164 }
165
166 fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
167 where
168     F: Fn(DictIndex) -> Option<DictIndex>,
169 {
170     dict_indexes.retain_mut(|index| {
171         if let Some(new) = f(*index) {
172             *index = new;
173             true
174         } else {
175             false
176         }
177     });
178 }
179
/// The role assigned to a [`Variable`] (see its `role` field).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Role {
    Input,
    Target,
    Both,
    None,
    Partition,
    Split,
}
189
190 #[derive(Clone, Debug)]
191 pub struct Variable {
192     pub name: Identifier,
193     pub width: VarWidth,
194     pub missing_values: MissingValues,
195     pub print_format: Format,
196     pub write_format: Format,
197     pub value_labels: HashMap<Value, String>,
198     pub label: Option<String>,
199     pub measure: Measure,
200     pub role: Role,
201     pub display_width: u32,
202     pub alignment: Alignment,
203     pub leave: bool,
204     pub short_names: Vec<Identifier>,
205     pub attributes: HashSet<ByIdentifier<Attribute>>,
206 }
207
208 impl HasIdentifier for Variable {
209     fn identifier(&self) -> &Identifier {
210         &self.name
211     }
212 }
213
214 #[derive(Clone, Debug)]
215 pub struct Vector {
216     pub name: Identifier,
217     pub variables: Vec<DictIndex>,
218 }
219
220 impl Vector {
221     fn with_updated_dict_indexes(
222         mut self,
223         f: impl Fn(DictIndex) -> Option<DictIndex>,
224     ) -> Option<Self> {
225         update_dict_index_vec(&mut self.variables, f);
226         (!self.variables.is_empty()).then_some(self)
227     }
228 }
229
230 impl HasIdentifier for Vector {
231     fn identifier(&self) -> &Identifier {
232         &self.name
233     }
234 }
235
236 #[derive(Clone, Debug)]
237 pub struct Attribute {
238     pub name: Identifier,
239     pub values: Vec<String>,
240 }
241
242 impl HasIdentifier for Attribute {
243     fn identifier(&self) -> &Identifier {
244         &self.name
245     }
246 }
247
248 #[derive(Clone, Debug)]
249 pub struct MultipleResponseSet {
250     pub name: Identifier,
251     pub label: String,
252     pub mr_type: MultipleResponseType,
253     pub variables: Vec<DictIndex>,
254 }
255
256 impl MultipleResponseSet {
257     fn with_updated_dict_indexes(
258         mut self,
259         f: impl Fn(DictIndex) -> Option<DictIndex>,
260     ) -> Option<Self> {
261         update_dict_index_vec(&mut self.variables, f);
262         (self.variables.len() > 1).then_some(self)
263     }
264 }
265
266 impl HasIdentifier for MultipleResponseSet {
267     fn identifier(&self) -> &Identifier {
268         &self.name
269     }
270 }
271
272 #[derive(Clone, Debug)]
273 pub enum MultipleResponseType {
274     MultipleDichotomy {
275         value: Value,
276         labels: CategoryLabels,
277     },
278     MultipleCategory,
279 }
280
281 #[derive(Clone, Debug)]
282 pub struct VariableSet {
283     pub name: Identifier,
284     pub variables: Vec<DictIndex>,
285 }
286
287 impl VariableSet {
288     fn with_updated_dict_indexes(
289         mut self,
290         f: impl Fn(DictIndex) -> Option<DictIndex>,
291     ) -> Option<Self> {
292         update_dict_index_vec(&mut self.variables, f);
293         (!self.variables.is_empty()).then_some(self)
294     }
295 }
296
297 impl HasIdentifier for VariableSet {
298     fn identifier(&self) -> &Identifier {
299         &self.name
300     }
301 }
302
#[cfg(test)]
mod test {
    use std::collections::HashSet;

    use crate::identifier::Identifier;

    use super::{ByIdentifier, HasIdentifier};

    /// Minimal stand-in for a dictionary variable: a name plus a payload that
    /// takes part in `PartialEq` but not in the identifier.
    #[derive(Clone, Debug, Eq, PartialEq)]
    struct Variable {
        name: Identifier,
        value: i32,
    }

    impl HasIdentifier for Variable {
        fn identifier(&self) -> &Identifier {
            &self.name
        }
    }

    #[test]
    fn test() {
        // Two variables with the same name but different values compare
        // unequal on their own.
        let name = Identifier::new_utf8("abcd").unwrap();
        let first = Variable {
            name: name.clone(),
            value: 1,
        };
        let second = Variable { name, value: 2 };
        assert_ne!(first, second);

        // Wrapped in `ByIdentifier`, they compare equal.
        let first = ByIdentifier::new(first);
        let second = ByIdentifier::new(second);
        assert_eq!(first, second);

        // A `HashSet` of `ByIdentifier` treats them as the same element, so
        // the second insertion is rejected and the first value is retained.
        let mut set: HashSet<ByIdentifier<Variable>> = HashSet::new();
        assert!(set.insert(ByIdentifier::new(first.0.clone())));
        assert!(!set.insert(ByIdentifier::new(second.0.clone())));

        let key = Identifier::new_utf8("abcd").unwrap();
        let found = set.get(&key).unwrap();
        assert_eq!(found.0.value, 1);
    }
}