3 collections::{HashMap, HashSet},
5 ops::{Bound, RangeBounds},
8 use encoding_rs::Encoding;
9 use indexmap::IndexSet;
10 use num::integer::div_ceil;
11 use ordered_float::OrderedFloat;
15 identifier::{ByIdentifier, HasIdentifier, Identifier},
16 raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
19 pub type DictIndex = usize;
21 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
27 impl PartialOrd for VarWidth {
28 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
30 (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
31 (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
38 pub const MAX_STRING: u16 = 32767;
40 pub fn n_dict_indexes(self) -> usize {
42 VarWidth::Numeric => 1,
43 VarWidth::String(w) => div_ceil(w as usize, 8),
50 f: impl Fn(u16, u16) -> u16,
51 ) -> Option<VarWidth> {
53 (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
54 (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
55 Some(VarWidth::String(f(a, b)))
61 /// Returns the wider of `self` and `other`:
62 /// - Numerical variable widths are equally wide.
63 /// - Longer strings are wider than shorter strings.
64 /// - Numerical and string types are incomparable, so result in `None`.
65 /// - Any `None` in the input yields `None` in the output.
66 pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
67 Self::width_predicate(a, b, |a, b| a.max(b))
70 /// Returns the narrower of `self` and `other` (see [`Self::wider`]).
71 pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
72 Self::width_predicate(a, b, |a, b| a.min(b))
75 pub fn default_display_width(&self) -> u32 {
77 VarWidth::Numeric => 8,
78 VarWidth::String(width) => *width.min(&32) as u32,
82 pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
83 let raw: i32 = raw.into();
85 0 => Ok(Self::Numeric),
86 1..=255 => Ok(Self::String(raw as u16)),
91 pub fn is_long_string(&self) -> bool {
92 if let Self::String(width) = self {
100 impl From<VarWidth> for VarType {
101 fn from(source: VarWidth) -> Self {
103 VarWidth::Numeric => VarType::Numeric,
104 VarWidth::String(_) => VarType::String,
109 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
111 Number(Option<OrderedFloat<f64>>),
116 pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
118 raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
119 raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
124 #[derive(Clone, Debug)]
125 pub struct Dictionary {
126 pub variables: IndexSet<ByIdentifier<Variable>>,
127 pub split_file: Vec<DictIndex>,
128 pub weight: Option<DictIndex>,
129 pub filter: Option<DictIndex>,
130 pub case_limit: Option<u64>,
131 pub file_label: Option<String>,
132 pub documents: Vec<String>,
133 pub vectors: HashSet<ByIdentifier<Vector>>,
134 pub attributes: HashMap<Identifier, Vec<String>>,
135 pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
136 pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
137 pub encoding: &'static Encoding,
141 pub struct DuplicateVariableName;
144 pub fn new(encoding: &'static Encoding) -> Self {
146 variables: IndexSet::new(),
147 split_file: Vec::new(),
152 documents: Vec::new(),
153 vectors: HashSet::new(),
154 attributes: HashMap::new(),
155 mrsets: HashSet::new(),
156 variable_sets: HashSet::new(),
161 pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
162 let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable));
166 Err(DuplicateVariableName)
170 pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
171 if from_index != to_index {
172 self.variables.move_index(from_index, to_index);
173 self.update_dict_indexes(&|index| {
174 #[allow(clippy::collapsible_else_if)]
175 if index == from_index {
177 } else if from_index < to_index {
178 if index > from_index && index <= to_index {
184 if index >= to_index && index < from_index {
194 pub fn retain_vars<F>(&mut self, keep: F)
196 F: Fn(&Variable) -> bool,
198 let mut deleted = Vec::new();
200 self.variables.retain(|var_by_id| {
201 let keep = keep(&var_by_id.0);
208 if !deleted.is_empty() {
209 self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
211 Err(position) => Some(position),
216 pub fn delete_vars<R>(&mut self, range: R)
218 R: RangeBounds<DictIndex>,
220 let start = match range.start_bound() {
221 Bound::Included(&start) => start,
222 Bound::Excluded(&start) => start + 1,
223 Bound::Unbounded => 0,
225 let end = match range.end_bound() {
226 Bound::Included(&end) => end + 1,
227 Bound::Excluded(&end) => end,
228 Bound::Unbounded => self.variables.len(),
231 self.variables.drain(start..end);
232 self.update_dict_indexes(&|index| {
235 } else if index < end {
238 Some(index - end - start)
244 fn update_dict_indexes<F>(&mut self, f: &F)
246 F: Fn(DictIndex) -> Option<DictIndex>,
248 update_dict_index_vec(&mut self.split_file, f);
249 self.weight = self.weight.and_then(f);
250 self.filter = self.filter.and_then(f);
254 .filter_map(|vector_by_id| {
257 .with_updated_dict_indexes(f)
258 .map(ByIdentifier::new)
264 .filter_map(|mrset_by_id| {
267 .with_updated_dict_indexes(f)
268 .map(ByIdentifier::new)
271 self.variable_sets = self
274 .filter_map(|var_set_by_id| {
277 .with_updated_dict_indexes(f)
278 .map(ByIdentifier::new)
284 fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
286 F: Fn(DictIndex) -> Option<DictIndex>,
288 dict_indexes.retain_mut(|index| {
289 if let Some(new) = f(*index) {
298 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
308 impl Default for Role {
309 fn default() -> Self {
321 pub fn from_identifier(id: &Identifier) -> Self {
322 if id.0.starts_with('$') {
324 } else if id.0.starts_with('#') {
331 pub fn must_leave(self) -> bool {
333 DictClass::Ordinary => false,
334 DictClass::System => false,
335 DictClass::Scratch => true,
340 #[derive(Clone, Debug)]
341 pub struct Variable {
342 pub name: Identifier,
344 pub missing_values: MissingValues,
345 pub print_format: Spec,
346 pub write_format: Spec,
347 pub value_labels: HashMap<Value, String>,
348 pub label: Option<String>,
349 pub measure: Option<Measure>,
351 pub display_width: u32,
352 pub alignment: Alignment,
354 pub short_names: Vec<Identifier>,
355 pub attributes: HashSet<ByIdentifier<Attribute>>,
359 pub fn new(name: Identifier, width: VarWidth) -> Self {
360 let var_type = VarType::from_width(width);
361 let leave = DictClass::from_identifier(&name).must_leave();
365 missing_values: MissingValues::default(),
366 print_format: Spec::default_for_width(width),
367 write_format: Spec::default_for_width(width),
368 value_labels: HashMap::new(),
370 measure: Measure::default_for_type(var_type),
371 role: Role::default(),
372 display_width: width.default_display_width(),
373 alignment: Alignment::default_for_type(var_type),
375 short_names: Vec::new(),
376 attributes: HashSet::new(),
381 impl HasIdentifier for Variable {
382 fn identifier(&self) -> &Identifier {
387 #[derive(Clone, Debug)]
389 pub name: Identifier,
390 pub variables: Vec<DictIndex>,
394 fn with_updated_dict_indexes(
396 f: impl Fn(DictIndex) -> Option<DictIndex>,
398 update_dict_index_vec(&mut self.variables, f);
399 (!self.variables.is_empty()).then_some(self)
403 impl HasIdentifier for Vector {
404 fn identifier(&self) -> &Identifier {
409 #[derive(Clone, Debug)]
410 pub struct Attribute {
411 pub name: Identifier,
412 pub values: Vec<String>,
415 impl HasIdentifier for Attribute {
416 fn identifier(&self) -> &Identifier {
421 #[derive(Clone, Debug)]
422 pub struct MultipleResponseSet {
423 pub name: Identifier,
425 pub mr_type: MultipleResponseType,
426 pub variables: Vec<DictIndex>,
429 impl MultipleResponseSet {
430 fn with_updated_dict_indexes(
432 f: impl Fn(DictIndex) -> Option<DictIndex>,
434 update_dict_index_vec(&mut self.variables, f);
435 (self.variables.len() > 1).then_some(self)
439 impl HasIdentifier for MultipleResponseSet {
440 fn identifier(&self) -> &Identifier {
445 #[derive(Clone, Debug)]
446 pub enum MultipleResponseType {
449 labels: CategoryLabels,
454 #[derive(Clone, Debug)]
455 pub struct VariableSet {
456 pub name: Identifier,
457 pub variables: Vec<DictIndex>,
461 fn with_updated_dict_indexes(
463 f: impl Fn(DictIndex) -> Option<DictIndex>,
465 update_dict_index_vec(&mut self.variables, f);
466 (!self.variables.is_empty()).then_some(self)
470 impl HasIdentifier for VariableSet {
471 fn identifier(&self) -> &Identifier {
478 use std::collections::HashSet;
480 use crate::identifier::Identifier;
482 use super::{ByIdentifier, HasIdentifier};
484 #[derive(PartialEq, Eq, Debug, Clone)]
490 impl HasIdentifier for Variable {
491 fn identifier(&self) -> &Identifier {
498 // Variables should not be the same if their values differ.
499 let abcd = Identifier::new_utf8("abcd").unwrap();
500 let abcd1 = Variable {
504 let abcd2 = Variable {
508 assert_ne!(abcd1, abcd2);
// But `ByIdentifier` should treat them the same.
511 let abcd1_by_name = ByIdentifier::new(abcd1);
512 let abcd2_by_name = ByIdentifier::new(abcd2);
513 assert_eq!(abcd1_by_name, abcd2_by_name);
// And a `HashSet` of `ByIdentifier` should also treat them the same.
516 let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
517 assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
518 assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
520 vars.get(&Identifier::new_utf8("abcd").unwrap())