2 collections::{HashMap, HashSet},
4 ops::{Bound, RangeBounds}, cmp::Ordering,
7 use encoding_rs::Encoding;
8 use indexmap::IndexSet;
9 use num::integer::div_ceil;
10 use ordered_float::OrderedFloat;
14 identifier::{ByIdentifier, HasIdentifier, Identifier},
15 raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType, self, RawStr, Decoder},
/// Position of a variable within a dictionary's `variables` collection
/// (the methods below pass values of this type to `move_index` and `drain`).
18 pub type DictIndex = usize;
// NOTE(review): the item these derives apply to is not visible in this
// excerpt; presumably it is the `VarWidth` enum (variants `Numeric` and
// `String(u16)` are used below) — confirm.
20 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// Partial ordering of variable widths.
26 impl PartialOrd for VarWidth {
27 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
// Two numeric widths always compare as equal.
29 (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
// Two string widths are ordered by their width values.
30 (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
// NOTE(review): the arm for a numeric/string pair is not visible in this
// excerpt; presumably it yields `None` — confirm.
/// The constant 32767; judging by its name, the largest permitted string
/// width — confirm against callers.
37 pub const MAX_STRING: u16 = 32767;
/// Returns a count of dictionary indexes for this width: 1 for a numeric
/// width, and the string width divided by 8, rounded up, for a string width.
39 pub fn n_dict_indexes(self) -> usize {
41 VarWidth::Numeric => 1,
// `div_ceil` rounds up, so e.g. widths 1 through 8 all yield 1.
42 VarWidth::String(w) => div_ceil(w as usize, 8),
// NOTE(review): the start of this signature is not visible in this excerpt;
// `wider` and `narrower` below call it as `Self::width_predicate(a, b, f)`
// with two `Option<VarWidth>` values and a closure.
49 f: impl Fn(u16, u16) -> u16,
50 ) -> Option<VarWidth> {
// Two numeric widths combine to a numeric width.
52 (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
// Two string widths combine by applying `f` to the two widths.
53 (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
54 Some(VarWidth::String(f(a, b)))
// NOTE(review): the remaining arm(s) are not visible here; presumably every
// other combination yields `None` — confirm.
60 /// Returns the wider of `a` and `b`:
61 /// - Numerical variable widths are equally wide.
62 /// - Longer strings are wider than shorter strings.
63 /// - Numerical and string types are incomparable, so result in `None`.
64 /// - Any `None` in the input yields `None` in the output.
65 pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
// For two string widths, the result takes the larger of the two widths.
66 Self::width_predicate(a, b, |a, b| a.max(b))
69 /// Returns the narrower of `a` and `b` (see [`Self::wider`]).
70 pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
// For two string widths, the result takes the smaller of the two widths.
71 Self::width_predicate(a, b, |a, b| a.min(b))
/// Returns the default display width for a variable of this width: 8 for a
/// numeric width, and the string width capped at 32 for a string width.
74 pub fn default_display_width(&self) -> u32 {
76 VarWidth::Numeric => 8,
// `min` picks the smaller of the width and 32; the result is then widened
// from `u16` to `u32`.
77 VarWidth::String(width) => *width.min(&32) as u32,
/// Converts a variable width into its variable type, discarding the width of
/// a string.
82 impl From<VarWidth> for VarType {
83 fn from(source: VarWidth) -> Self {
85 VarWidth::Numeric => VarType::Numeric,
86 VarWidth::String(_) => VarType::String,
// NOTE(review): the `enum` header and the remaining variants are not visible
// in this excerpt; `decode` below also constructs a `Value::String` variant.
91 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
// A numeric value; `OrderedFloat` gives the `f64` the `Eq`, `Ord`, and `Hash`
// implementations that the derives above require.  `None` presumably stands
// for a missing value — confirm.
93 Number(Option<OrderedFloat<f64>>),
/// Converts a raw value into a `Value`, using `decoder` to decode string
/// contents.
98 pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
// Numbers are converted with `into()`; a `None` stays `None`.
100 raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
// Strings are decoded from the raw bytes with `decode_exact_length`.
101 raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
/// A dictionary: a collection of variables plus dictionary-level settings
/// that refer to those variables by `DictIndex`.
106 #[derive(Clone, Debug)]
107 pub struct Dictionary {
// The variables, in order, wrapped in `ByIdentifier`.
108 pub variables: IndexSet<ByIdentifier<Variable>>,
// Indexes into `variables`; remapped by `update_dict_indexes` whenever
// variables are moved or deleted.
109 pub split_file: Vec<DictIndex>,
// Optional index into `variables`; cleared by `update_dict_indexes` if the
// mapping yields `None` for it.
110 pub weight: Option<DictIndex>,
// Optional index into `variables`; treated the same way as `weight`.
111 pub filter: Option<DictIndex>,
112 pub case_limit: Option<u64>,
113 pub file_label: Option<String>,
114 pub documents: Vec<String>,
// Named groups of variable indexes, wrapped in `ByIdentifier`.
115 pub vectors: HashSet<ByIdentifier<Vector>>,
116 pub attributes: HashMap<Identifier, Vec<String>>,
117 pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
118 pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
// Character encoding associated with the dictionary.
119 pub encoding: &'static Encoding,
/// Creates a dictionary for `encoding` whose collections start out empty.
// NOTE(review): the initializers for some fields are not visible in this
// excerpt.
123 pub fn new(encoding: &'static Encoding) -> Self {
125 variables: IndexSet::new(),
126 split_file: Vec::new(),
131 documents: Vec::new(),
132 vectors: HashSet::new(),
133 attributes: HashMap::new(),
134 mrsets: HashSet::new(),
135 variable_sets: HashSet::new(),
/// Attempts to add `variable` to the dictionary's variables.
// NOTE(review): the two result branches are not visible in this excerpt;
// presumably `Ok(())` when the variable was inserted and `Err(())` when an
// equal `ByIdentifier` element was already present — confirm.
140 pub fn add_var(&mut self, variable: Variable) -> Result<(), ()> {
// `insert` returns true only if the set did not already hold an equal element.
141 if self.variables.insert(ByIdentifier::new(variable)) {
/// Moves the variable at `from_index` to `to_index` and remaps the dictionary
/// indexes stored elsewhere in the dictionary.  Does nothing if the two
/// indexes are equal.
148 pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
149 if from_index != to_index {
150 self.variables.move_index(from_index, to_index);
151 self.update_dict_indexes(&|index| {
// The moved variable itself.
152 if index == from_index {
// Moving toward the end: indexes in `(from_index, to_index]` are affected.
154 } else if from_index < to_index {
155 if index > from_index && index <= to_index {
// Moving toward the start: indexes in `[to_index, from_index)` are affected.
161 if index >= to_index && index < from_index {
// NOTE(review): the replacement values for each branch are not visible in
// this excerpt; presumably the moved index becomes `to_index` and the
// affected ranges shift by one in the opposite direction — confirm.
171 pub fn retain_vars<F>(&mut self, keep: F)
173 F: Fn(&Variable) -> bool,
175 let mut deleted = Vec::new();
177 self.variables.retain(|var_by_id| {
178 let keep = keep(&var_by_id.0);
185 if !deleted.is_empty() {
186 self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
188 Err(position) => Some(position),
193 pub fn delete_vars<R>(&mut self, range: R)
195 R: RangeBounds<DictIndex>,
197 let start = match range.start_bound() {
198 Bound::Included(&start) => start,
199 Bound::Excluded(&start) => start + 1,
200 Bound::Unbounded => 0,
202 let end = match range.end_bound() {
203 Bound::Included(&end) => end + 1,
204 Bound::Excluded(&end) => end,
205 Bound::Unbounded => self.variables.len(),
208 self.variables.drain(start..end);
209 self.update_dict_indexes(&|index| {
212 } else if index < end {
215 Some(index - end - start)
/// Applies the index mapping `f` to the dictionary indexes stored in this
/// dictionary.  `f` returns the new index for an old index, or `None` if the
/// variable at the old index no longer exists.
221 fn update_dict_indexes<F>(&mut self, f: &F)
223 F: Fn(DictIndex) -> Option<DictIndex>,
// Split-file indexes are remapped in place by `update_dict_index_vec`.
225 update_dict_index_vec(&mut self.split_file, f);
// Weight and filter: an index that `f` maps to `None` is cleared.
// (`map(..).flatten()` here is equivalent to `and_then(f)`.)
226 self.weight = self.weight.map(|index| f(index)).flatten();
227 self.filter = self.filter.map(|index| f(index)).flatten();
// NOTE(review): the head and tail of each of the following iterator chains
// are not fully visible in this excerpt.  Each visible part remaps one
// element with `with_updated_dict_indexes`, drops it if that returns `None`
// (via `filter_map`), and rewraps the survivor in `ByIdentifier`.
231 .filter_map(|vector_by_id| {
234 .with_updated_dict_indexes(f)
235 .map(|vector| ByIdentifier::new(vector))
241 .filter_map(|mrset_by_id| {
244 .with_updated_dict_indexes(f)
245 .map(|mrset| ByIdentifier::new(mrset))
248 self.variable_sets = self
251 .filter_map(|var_set_by_id| {
254 .with_updated_dict_indexes(f)
255 .map(|var_set| ByIdentifier::new(var_set))
/// Passes each index in `dict_indexes` through `f`, inside `retain_mut`.
// NOTE(review): the bodies of the two branches are not visible in this
// excerpt; presumably `Some(new)` overwrites the index with `new` and keeps
// it, while `None` removes it from the vector — confirm.
261 fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
263 F: Fn(DictIndex) -> Option<DictIndex>,
265 dict_indexes.retain_mut(|index| {
266 if let Some(new) = f(*index) {
// NOTE(review): the item these derives apply to is not visible in this
// excerpt; presumably it is the `Role` enum whose `Default` impl follows —
// confirm.
275 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
/// Supplies the default `Role` (used by `Variable::new`).
// NOTE(review): the value returned by `default` is not visible in this excerpt.
285 impl Default for Role {
286 fn default() -> Self {
/// Classifies `id` by the first character of its name.
// NOTE(review): the value returned by each branch is not visible in this
// excerpt; presumably `$` selects `DictClass::System`, `#` selects
// `DictClass::Scratch`, and anything else `DictClass::Ordinary` — confirm.
298 pub fn from_identifier(id: &Identifier) -> Self {
299 if id.0.starts_with('$') {
301 } else if id.0.starts_with('#') {
/// Returns true only for `DictClass::Scratch`; `Variable::new` uses the
/// result to compute its `leave` value.
308 pub fn must_leave(self) -> bool {
310 DictClass::Ordinary => false,
311 DictClass::System => false,
312 DictClass::Scratch => true,
/// A variable in a dictionary.
// NOTE(review): some fields are not visible in this excerpt (`Variable::new`
// also initializes a `role`, for example).
317 #[derive(Clone, Debug)]
318 pub struct Variable {
// The variable's name.
319 pub name: Identifier,
321 pub missing_values: MissingValues,
// Output formats; `Variable::new` initializes both from the variable's width.
322 pub print_format: Spec,
323 pub write_format: Spec,
// Labels keyed by value.
324 pub value_labels: HashMap<Value, String>,
325 pub label: Option<String>,
326 pub measure: Option<Measure>,
// `Variable::new` initializes this from `VarWidth::default_display_width`.
328 pub display_width: u32,
329 pub alignment: Alignment,
331 pub short_names: Vec<Identifier>,
332 pub attributes: HashSet<ByIdentifier<Attribute>>,
/// Creates a variable named `name` with the given `width`, filling the
/// remaining fields with defaults derived from the width, the variable type,
/// and the name.
// NOTE(review): the initializers for some fields are not visible in this
// excerpt.
336 pub fn new(name: Identifier, width: VarWidth) -> Self {
337 let var_type = VarType::from_width(width);
// Whether the variable must be "left" depends on the class of its name.
338 let leave = DictClass::from_identifier(&name).must_leave();
342 missing_values: MissingValues::default(),
343 print_format: Spec::default_for_width(width),
344 write_format: Spec::default_for_width(width),
345 value_labels: HashMap::new(),
347 measure: Measure::default_for_type(var_type),
348 role: Role::default(),
349 display_width: width.default_display_width(),
350 alignment: Alignment::default_for_type(var_type),
352 short_names: Vec::new(),
353 attributes: HashSet::new()
/// Lets a `Variable` be wrapped in `ByIdentifier`.
// NOTE(review): the method body is not visible in this excerpt; presumably
// it returns `&self.name` — confirm.
358 impl HasIdentifier for Variable {
359 fn identifier(&self) -> &Identifier {
/// A named list of variables, referenced by dictionary index.
// NOTE(review): the `struct` header line is not visible in this excerpt;
// the name `Vector` is taken from the impls that follow.
364 #[derive(Clone, Debug)]
366 pub name: Identifier,
// Indexes into the dictionary's variables.
367 pub variables: Vec<DictIndex>,
/// Remaps `self.variables` through `f` and returns `Some(self)` if at least
/// one variable index remains, or `None` if none do.
371 fn with_updated_dict_indexes(
373 f: impl Fn(DictIndex) -> Option<DictIndex>,
375 update_dict_index_vec(&mut self.variables, f);
// A vector left with no variables is dropped.
376 (!self.variables.is_empty()).then_some(self)
/// Lets a `Vector` be wrapped in `ByIdentifier`.
// NOTE(review): the method body is not visible in this excerpt; presumably
// it returns `&self.name` — confirm.
380 impl HasIdentifier for Vector {
381 fn identifier(&self) -> &Identifier {
/// A named attribute holding a list of string values.
386 #[derive(Clone, Debug)]
387 pub struct Attribute {
388 pub name: Identifier,
389 pub values: Vec<String>,
/// Lets an `Attribute` be wrapped in `ByIdentifier`.
// NOTE(review): the method body is not visible in this excerpt; presumably
// it returns `&self.name` — confirm.
392 impl HasIdentifier for Attribute {
393 fn identifier(&self) -> &Identifier {
/// A named multiple response set over a list of variables.
// NOTE(review): at least one field is not visible in this excerpt.
398 #[derive(Clone, Debug)]
399 pub struct MultipleResponseSet {
400 pub name: Identifier,
402 pub mr_type: MultipleResponseType,
// Indexes into the dictionary's variables.
403 pub variables: Vec<DictIndex>,
406 impl MultipleResponseSet {
/// Remaps `self.variables` through `f` and returns `Some(self)` if more than
/// one variable index remains, or `None` otherwise.
407 fn with_updated_dict_indexes(
409 f: impl Fn(DictIndex) -> Option<DictIndex>,
411 update_dict_index_vec(&mut self.variables, f);
// Unlike vectors and variable sets, a set needs at least two variables to
// be kept.
412 (self.variables.len() > 1).then_some(self)
/// Lets a `MultipleResponseSet` be wrapped in `ByIdentifier`.
// NOTE(review): the method body is not visible in this excerpt; presumably
// it returns `&self.name` — confirm.
416 impl HasIdentifier for MultipleResponseSet {
417 fn identifier(&self) -> &Identifier {
/// The kind of a multiple response set.
// NOTE(review): the variant names and most of their fields are not visible
// in this excerpt; only a `labels` field of one variant is shown.
422 #[derive(Clone, Debug)]
423 pub enum MultipleResponseType {
426 labels: CategoryLabels,
/// A named set of variables, referenced by dictionary index.
431 #[derive(Clone, Debug)]
432 pub struct VariableSet {
433 pub name: Identifier,
// Indexes into the dictionary's variables.
434 pub variables: Vec<DictIndex>,
/// Remaps `self.variables` through `f` and returns `Some(self)` if at least
/// one variable index remains, or `None` if none do.
438 fn with_updated_dict_indexes(
440 f: impl Fn(DictIndex) -> Option<DictIndex>,
442 update_dict_index_vec(&mut self.variables, f);
// A variable set left with no variables is dropped.
443 (!self.variables.is_empty()).then_some(self)
/// Lets a `VariableSet` be wrapped in `ByIdentifier`.
// NOTE(review): the method body is not visible in this excerpt; presumably
// it returns `&self.name` — confirm.
447 impl HasIdentifier for VariableSet {
448 fn identifier(&self) -> &Identifier {
455 use std::collections::HashSet;
457 use crate::identifier::Identifier;
459 use super::{ByIdentifier, HasIdentifier};
// Test-only `Variable` type, distinct from the dictionary's `Variable`.
// NOTE(review): its fields and the body of `identifier` are not visible in
// this excerpt.
461 #[derive(PartialEq, Eq, Debug, Clone)]
467 impl HasIdentifier for Variable {
468 fn identifier(&self) -> &Identifier {
475 // Variables should not be the same if their values differ.
476 let abcd = Identifier::new_utf8("abcd").unwrap();
477 let abcd1 = Variable {
481 let abcd2 = Variable {
485 assert_ne!(abcd1, abcd2);
487 // But `ByIdentifier` should treat them the same.
488 let abcd1_by_name = ByIdentifier::new(abcd1);
489 let abcd2_by_name = ByIdentifier::new(abcd2);
490 assert_eq!(abcd1_by_name, abcd2_by_name);
492 // And a `HashSet` of `ByIdentifier` should also treat them the same.
493 let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
// The first insert succeeds; the second is rejected because the set already
// holds an element that compares equal.
494 assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
495 assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
// Look the stored element up using only an identifier.
497 vars.get(&Identifier::new_utf8("abcd").unwrap())