3 collections::{HashMap, HashSet},
5 ops::{Bound, RangeBounds},
8 use encoding_rs::Encoding;
9 use indexmap::IndexSet;
10 use num::integer::div_ceil;
11 use ordered_float::OrderedFloat;
15 identifier::{ByIdentifier, HasIdentifier, Identifier},
16 raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
19 pub type DictIndex = usize;
21 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
27 impl PartialOrd for VarWidth {
28 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
30 (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
31 (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
38 pub const MAX_STRING: u16 = 32767;
40 pub fn n_dict_indexes(self) -> usize {
42 VarWidth::Numeric => 1,
43 VarWidth::String(w) => div_ceil(w as usize, 8),
50 f: impl Fn(u16, u16) -> u16,
51 ) -> Option<VarWidth> {
53 (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
54 (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
55 Some(VarWidth::String(f(a, b)))
61 /// Returns the wider of `self` and `other`:
62 /// - Numerical variable widths are equally wide.
63 /// - Longer strings are wider than shorter strings.
64 /// - Numerical and string types are incomparable, so result in `None`.
65 /// - Any `None` in the input yields `None` in the output.
66 pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
67 Self::width_predicate(a, b, |a, b| a.max(b))
70 /// Returns the narrower of `self` and `other` (see [`Self::wider`]).
71 pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
72 Self::width_predicate(a, b, |a, b| a.min(b))
75 pub fn default_display_width(&self) -> u32 {
77 VarWidth::Numeric => 8,
78 VarWidth::String(width) => *width.min(&32) as u32,
82 pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
83 let raw: i32 = raw.into();
85 0 => Ok(Self::Numeric),
86 1..=255 => Ok(Self::String(raw as u16)),
92 impl From<VarWidth> for VarType {
93 fn from(source: VarWidth) -> Self {
95 VarWidth::Numeric => VarType::Numeric,
96 VarWidth::String(_) => VarType::String,
101 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
103 Number(Option<OrderedFloat<f64>>),
108 pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
110 raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
111 raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
116 #[derive(Clone, Debug)]
117 pub struct Dictionary {
118 pub variables: IndexSet<ByIdentifier<Variable>>,
119 pub split_file: Vec<DictIndex>,
120 pub weight: Option<DictIndex>,
121 pub filter: Option<DictIndex>,
122 pub case_limit: Option<u64>,
123 pub file_label: Option<String>,
124 pub documents: Vec<String>,
125 pub vectors: HashSet<ByIdentifier<Vector>>,
126 pub attributes: HashMap<Identifier, Vec<String>>,
127 pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
128 pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
129 pub encoding: &'static Encoding,
133 pub struct DuplicateVariableName;
136 pub fn new(encoding: &'static Encoding) -> Self {
138 variables: IndexSet::new(),
139 split_file: Vec::new(),
144 documents: Vec::new(),
145 vectors: HashSet::new(),
146 attributes: HashMap::new(),
147 mrsets: HashSet::new(),
148 variable_sets: HashSet::new(),
153 pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
154 let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable));
158 Err(DuplicateVariableName)
162 pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
163 if from_index != to_index {
164 self.variables.move_index(from_index, to_index);
165 self.update_dict_indexes(&|index| {
166 #[allow(clippy::collapsible_else_if)]
167 if index == from_index {
169 } else if from_index < to_index {
170 if index > from_index && index <= to_index {
176 if index >= to_index && index < from_index {
186 pub fn retain_vars<F>(&mut self, keep: F)
188 F: Fn(&Variable) -> bool,
190 let mut deleted = Vec::new();
192 self.variables.retain(|var_by_id| {
193 let keep = keep(&var_by_id.0);
200 if !deleted.is_empty() {
201 self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
203 Err(position) => Some(position),
208 pub fn delete_vars<R>(&mut self, range: R)
210 R: RangeBounds<DictIndex>,
212 let start = match range.start_bound() {
213 Bound::Included(&start) => start,
214 Bound::Excluded(&start) => start + 1,
215 Bound::Unbounded => 0,
217 let end = match range.end_bound() {
218 Bound::Included(&end) => end + 1,
219 Bound::Excluded(&end) => end,
220 Bound::Unbounded => self.variables.len(),
223 self.variables.drain(start..end);
224 self.update_dict_indexes(&|index| {
227 } else if index < end {
230 Some(index - end - start)
236 fn update_dict_indexes<F>(&mut self, f: &F)
238 F: Fn(DictIndex) -> Option<DictIndex>,
240 update_dict_index_vec(&mut self.split_file, f);
241 self.weight = self.weight.and_then(f);
242 self.filter = self.filter.and_then(f);
246 .filter_map(|vector_by_id| {
249 .with_updated_dict_indexes(f)
250 .map(ByIdentifier::new)
256 .filter_map(|mrset_by_id| {
259 .with_updated_dict_indexes(f)
260 .map(ByIdentifier::new)
263 self.variable_sets = self
266 .filter_map(|var_set_by_id| {
269 .with_updated_dict_indexes(f)
270 .map(ByIdentifier::new)
276 fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
278 F: Fn(DictIndex) -> Option<DictIndex>,
280 dict_indexes.retain_mut(|index| {
281 if let Some(new) = f(*index) {
290 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
300 impl Default for Role {
301 fn default() -> Self {
313 pub fn from_identifier(id: &Identifier) -> Self {
314 if id.0.starts_with('$') {
316 } else if id.0.starts_with('#') {
323 pub fn must_leave(self) -> bool {
325 DictClass::Ordinary => false,
326 DictClass::System => false,
327 DictClass::Scratch => true,
332 #[derive(Clone, Debug)]
333 pub struct Variable {
334 pub name: Identifier,
336 pub missing_values: MissingValues,
337 pub print_format: Spec,
338 pub write_format: Spec,
339 pub value_labels: HashMap<Value, String>,
340 pub label: Option<String>,
341 pub measure: Option<Measure>,
343 pub display_width: u32,
344 pub alignment: Alignment,
346 pub short_names: Vec<Identifier>,
347 pub attributes: HashSet<ByIdentifier<Attribute>>,
351 pub fn new(name: Identifier, width: VarWidth) -> Self {
352 let var_type = VarType::from_width(width);
353 let leave = DictClass::from_identifier(&name).must_leave();
357 missing_values: MissingValues::default(),
358 print_format: Spec::default_for_width(width),
359 write_format: Spec::default_for_width(width),
360 value_labels: HashMap::new(),
362 measure: Measure::default_for_type(var_type),
363 role: Role::default(),
364 display_width: width.default_display_width(),
365 alignment: Alignment::default_for_type(var_type),
367 short_names: Vec::new(),
368 attributes: HashSet::new(),
373 impl HasIdentifier for Variable {
374 fn identifier(&self) -> &Identifier {
379 #[derive(Clone, Debug)]
381 pub name: Identifier,
382 pub variables: Vec<DictIndex>,
386 fn with_updated_dict_indexes(
388 f: impl Fn(DictIndex) -> Option<DictIndex>,
390 update_dict_index_vec(&mut self.variables, f);
391 (!self.variables.is_empty()).then_some(self)
395 impl HasIdentifier for Vector {
396 fn identifier(&self) -> &Identifier {
401 #[derive(Clone, Debug)]
402 pub struct Attribute {
403 pub name: Identifier,
404 pub values: Vec<String>,
407 impl HasIdentifier for Attribute {
408 fn identifier(&self) -> &Identifier {
413 #[derive(Clone, Debug)]
414 pub struct MultipleResponseSet {
415 pub name: Identifier,
417 pub mr_type: MultipleResponseType,
418 pub variables: Vec<DictIndex>,
421 impl MultipleResponseSet {
422 fn with_updated_dict_indexes(
424 f: impl Fn(DictIndex) -> Option<DictIndex>,
426 update_dict_index_vec(&mut self.variables, f);
427 (self.variables.len() > 1).then_some(self)
431 impl HasIdentifier for MultipleResponseSet {
432 fn identifier(&self) -> &Identifier {
437 #[derive(Clone, Debug)]
438 pub enum MultipleResponseType {
441 labels: CategoryLabels,
446 #[derive(Clone, Debug)]
447 pub struct VariableSet {
448 pub name: Identifier,
449 pub variables: Vec<DictIndex>,
453 fn with_updated_dict_indexes(
455 f: impl Fn(DictIndex) -> Option<DictIndex>,
457 update_dict_index_vec(&mut self.variables, f);
458 (!self.variables.is_empty()).then_some(self)
462 impl HasIdentifier for VariableSet {
463 fn identifier(&self) -> &Identifier {
470 use std::collections::HashSet;
472 use crate::identifier::Identifier;
474 use super::{ByIdentifier, HasIdentifier};
476 #[derive(PartialEq, Eq, Debug, Clone)]
482 impl HasIdentifier for Variable {
483 fn identifier(&self) -> &Identifier {
490 // Variables should not be the same if their values differ.
491 let abcd = Identifier::new_utf8("abcd").unwrap();
492 let abcd1 = Variable {
496 let abcd2 = Variable {
500 assert_ne!(abcd1, abcd2);
502 // But `ByName` should treat them the same.
503 let abcd1_by_name = ByIdentifier::new(abcd1);
504 let abcd2_by_name = ByIdentifier::new(abcd2);
505 assert_eq!(abcd1_by_name, abcd2_by_name);
507 // And a `HashSet` of `ByName` should also treat them the same.
508 let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
509 assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
510 assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
512 vars.get(&Identifier::new_utf8("abcd").unwrap())