3 collections::{HashMap, HashSet},
5 ops::{Bound, RangeBounds},
8 use encoding_rs::Encoding;
9 use indexmap::IndexSet;
10 use num::integer::div_ceil;
11 use ordered_float::OrderedFloat;
15 identifier::{ByIdentifier, HasIdentifier, Identifier},
16 raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
19 pub type DictIndex = usize;
21 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
27 impl PartialOrd for VarWidth {
28 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
30 (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
31 (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
38 pub const MAX_STRING: u16 = 32767;
40 pub fn n_dict_indexes(self) -> usize {
42 VarWidth::Numeric => 1,
43 VarWidth::String(w) => div_ceil(w as usize, 8),
50 f: impl Fn(u16, u16) -> u16,
51 ) -> Option<VarWidth> {
53 (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
54 (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
55 Some(VarWidth::String(f(a, b)))
61 /// Returns the wider of `self` and `other`:
62 /// - Numerical variable widths are equally wide.
63 /// - Longer strings are wider than shorter strings.
64 /// - Numerical and string types are incomparable, so result in `None`.
65 /// - Any `None` in the input yields `None` in the output.
66 pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
67 Self::width_predicate(a, b, |a, b| a.max(b))
70 /// Returns the narrower of `self` and `other` (see [`Self::wider`]).
71 pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
72 Self::width_predicate(a, b, |a, b| a.min(b))
75 pub fn default_display_width(&self) -> u32 {
77 VarWidth::Numeric => 8,
78 VarWidth::String(width) => *width.min(&32) as u32,
83 impl From<VarWidth> for VarType {
84 fn from(source: VarWidth) -> Self {
86 VarWidth::Numeric => VarType::Numeric,
87 VarWidth::String(_) => VarType::String,
92 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
94 Number(Option<OrderedFloat<f64>>),
99 pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
101 raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
102 raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
107 #[derive(Clone, Debug)]
108 pub struct Dictionary {
109 pub variables: IndexSet<ByIdentifier<Variable>>,
110 pub split_file: Vec<DictIndex>,
111 pub weight: Option<DictIndex>,
112 pub filter: Option<DictIndex>,
113 pub case_limit: Option<u64>,
114 pub file_label: Option<String>,
115 pub documents: Vec<String>,
116 pub vectors: HashSet<ByIdentifier<Vector>>,
117 pub attributes: HashMap<Identifier, Vec<String>>,
118 pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
119 pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
120 pub encoding: &'static Encoding,
123 pub struct DuplicateVariableName;
126 pub fn new(encoding: &'static Encoding) -> Self {
128 variables: IndexSet::new(),
129 split_file: Vec::new(),
134 documents: Vec::new(),
135 vectors: HashSet::new(),
136 attributes: HashMap::new(),
137 mrsets: HashSet::new(),
138 variable_sets: HashSet::new(),
143 pub fn add_var(&mut self, variable: Variable) -> Result<(), DuplicateVariableName> {
144 if self.variables.insert(ByIdentifier::new(variable)) {
147 Err(DuplicateVariableName)
151 pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
152 if from_index != to_index {
153 self.variables.move_index(from_index, to_index);
154 self.update_dict_indexes(&|index| {
155 #[allow(clippy::collapsible_else_if)]
156 if index == from_index {
158 } else if from_index < to_index {
159 if index > from_index && index <= to_index {
165 if index >= to_index && index < from_index {
175 pub fn retain_vars<F>(&mut self, keep: F)
177 F: Fn(&Variable) -> bool,
179 let mut deleted = Vec::new();
181 self.variables.retain(|var_by_id| {
182 let keep = keep(&var_by_id.0);
189 if !deleted.is_empty() {
190 self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
192 Err(position) => Some(position),
197 pub fn delete_vars<R>(&mut self, range: R)
199 R: RangeBounds<DictIndex>,
201 let start = match range.start_bound() {
202 Bound::Included(&start) => start,
203 Bound::Excluded(&start) => start + 1,
204 Bound::Unbounded => 0,
206 let end = match range.end_bound() {
207 Bound::Included(&end) => end + 1,
208 Bound::Excluded(&end) => end,
209 Bound::Unbounded => self.variables.len(),
212 self.variables.drain(start..end);
213 self.update_dict_indexes(&|index| {
216 } else if index < end {
219 Some(index - end - start)
225 fn update_dict_indexes<F>(&mut self, f: &F)
227 F: Fn(DictIndex) -> Option<DictIndex>,
229 update_dict_index_vec(&mut self.split_file, f);
230 self.weight = self.weight.and_then(f);
231 self.filter = self.filter.and_then(f);
235 .filter_map(|vector_by_id| {
238 .with_updated_dict_indexes(f)
239 .map(ByIdentifier::new)
245 .filter_map(|mrset_by_id| {
248 .with_updated_dict_indexes(f)
249 .map(ByIdentifier::new)
252 self.variable_sets = self
255 .filter_map(|var_set_by_id| {
258 .with_updated_dict_indexes(f)
259 .map(ByIdentifier::new)
265 fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
267 F: Fn(DictIndex) -> Option<DictIndex>,
269 dict_indexes.retain_mut(|index| {
270 if let Some(new) = f(*index) {
279 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
289 impl Default for Role {
290 fn default() -> Self {
302 pub fn from_identifier(id: &Identifier) -> Self {
303 if id.0.starts_with('$') {
305 } else if id.0.starts_with('#') {
312 pub fn must_leave(self) -> bool {
314 DictClass::Ordinary => false,
315 DictClass::System => false,
316 DictClass::Scratch => true,
321 #[derive(Clone, Debug)]
322 pub struct Variable {
323 pub name: Identifier,
325 pub missing_values: MissingValues,
326 pub print_format: Spec,
327 pub write_format: Spec,
328 pub value_labels: HashMap<Value, String>,
329 pub label: Option<String>,
330 pub measure: Option<Measure>,
332 pub display_width: u32,
333 pub alignment: Alignment,
335 pub short_names: Vec<Identifier>,
336 pub attributes: HashSet<ByIdentifier<Attribute>>,
340 pub fn new(name: Identifier, width: VarWidth) -> Self {
341 let var_type = VarType::from_width(width);
342 let leave = DictClass::from_identifier(&name).must_leave();
346 missing_values: MissingValues::default(),
347 print_format: Spec::default_for_width(width),
348 write_format: Spec::default_for_width(width),
349 value_labels: HashMap::new(),
351 measure: Measure::default_for_type(var_type),
352 role: Role::default(),
353 display_width: width.default_display_width(),
354 alignment: Alignment::default_for_type(var_type),
356 short_names: Vec::new(),
357 attributes: HashSet::new(),
362 impl HasIdentifier for Variable {
363 fn identifier(&self) -> &Identifier {
368 #[derive(Clone, Debug)]
370 pub name: Identifier,
371 pub variables: Vec<DictIndex>,
375 fn with_updated_dict_indexes(
377 f: impl Fn(DictIndex) -> Option<DictIndex>,
379 update_dict_index_vec(&mut self.variables, f);
380 (!self.variables.is_empty()).then_some(self)
384 impl HasIdentifier for Vector {
385 fn identifier(&self) -> &Identifier {
390 #[derive(Clone, Debug)]
391 pub struct Attribute {
392 pub name: Identifier,
393 pub values: Vec<String>,
396 impl HasIdentifier for Attribute {
397 fn identifier(&self) -> &Identifier {
402 #[derive(Clone, Debug)]
403 pub struct MultipleResponseSet {
404 pub name: Identifier,
406 pub mr_type: MultipleResponseType,
407 pub variables: Vec<DictIndex>,
410 impl MultipleResponseSet {
411 fn with_updated_dict_indexes(
413 f: impl Fn(DictIndex) -> Option<DictIndex>,
415 update_dict_index_vec(&mut self.variables, f);
416 (self.variables.len() > 1).then_some(self)
420 impl HasIdentifier for MultipleResponseSet {
421 fn identifier(&self) -> &Identifier {
426 #[derive(Clone, Debug)]
427 pub enum MultipleResponseType {
430 labels: CategoryLabels,
435 #[derive(Clone, Debug)]
436 pub struct VariableSet {
437 pub name: Identifier,
438 pub variables: Vec<DictIndex>,
442 fn with_updated_dict_indexes(
444 f: impl Fn(DictIndex) -> Option<DictIndex>,
446 update_dict_index_vec(&mut self.variables, f);
447 (!self.variables.is_empty()).then_some(self)
451 impl HasIdentifier for VariableSet {
452 fn identifier(&self) -> &Identifier {
459 use std::collections::HashSet;
461 use crate::identifier::Identifier;
463 use super::{ByIdentifier, HasIdentifier};
465 #[derive(PartialEq, Eq, Debug, Clone)]
471 impl HasIdentifier for Variable {
472 fn identifier(&self) -> &Identifier {
479 // Variables should not be the same if their values differ.
480 let abcd = Identifier::new_utf8("abcd").unwrap();
481 let abcd1 = Variable {
485 let abcd2 = Variable {
489 assert_ne!(abcd1, abcd2);
// But `ByIdentifier` should treat them the same.
492 let abcd1_by_name = ByIdentifier::new(abcd1);
493 let abcd2_by_name = ByIdentifier::new(abcd2);
494 assert_eq!(abcd1_by_name, abcd2_by_name);
// And a `HashSet` of `ByIdentifier` should also treat them the same.
497 let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
498 assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
499 assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
501 vars.get(&Identifier::new_utf8("abcd").unwrap())