use std::{
+ cmp::Ordering,
collections::{HashMap, HashSet},
fmt::Debug,
ops::{Bound, RangeBounds},
use encoding_rs::Encoding;
use indexmap::IndexSet;
+use num::integer::div_ceil;
+use ordered_float::OrderedFloat;
use crate::{
- cooked::{Value, VarWidth},
format::Spec,
identifier::{ByIdentifier, HasIdentifier, Identifier},
- raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType},
+ raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
};
/// Index type passed to and returned by the remapping closure in `update_dict_indexes`.
/// NOTE(review): presumably a position within `Dictionary::variables` — confirm.
pub type DictIndex = usize;
/// Width of a variable: either numeric, or a string with an explicit width.
/// NOTE(review): the unit of the string width is not shown here (presumably bytes) — confirm.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum VarWidth {
/// A numeric variable; carries no width value.
+ Numeric,
/// A string variable with the given width.
+ String(u16),
+}
+
/// Partial ordering over widths: two numeric widths are equal, two string
/// widths compare by their width value, and a numeric width is incomparable
/// with a string width.
+impl PartialOrd for VarWidth {
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ match (self, other) {
+ (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
+ (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
// Mixed numeric/string pairs have no defined order.
+ _ => None,
+ }
+ }
+}
+
+impl VarWidth {
/// NOTE(review): by its name, the largest string width; nothing in this
/// block enforces it (`from_raw` only accepts 1..=255) — confirm intended use.
+ pub const MAX_STRING: u16 = 32767;
+
/// Returns 1 for a numeric width, or the string width divided by 8 and
/// rounded up for a string width.
+ pub fn n_dict_indexes(self) -> usize {
+ match self {
+ VarWidth::Numeric => 1,
+ VarWidth::String(w) => div_ceil(w as usize, 8),
+ }
+ }
+
/// Combines two optional widths: two numeric widths give numeric, two
/// string widths give a string whose width is `f` applied to both widths,
/// and anything else (a `None` input or mixed kinds) gives `None`.
+ fn width_predicate(
+ a: Option<VarWidth>,
+ b: Option<VarWidth>,
+ f: impl Fn(u16, u16) -> u16,
+ ) -> Option<VarWidth> {
+ match (a, b) {
+ (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
+ (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
+ Some(VarWidth::String(f(a, b)))
+ }
+ _ => None,
+ }
+ }
+
+ /// Returns the wider of `a` and `b`:
+ /// - Numerical variable widths are equally wide.
+ /// - Longer strings are wider than shorter strings.
+ /// - Numerical and string types are incomparable, so result in `None`.
+ /// - Any `None` in the input yields `None` in the output.
+ pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+ Self::width_predicate(a, b, |a, b| a.max(b))
+ }
+
+ /// Returns the narrower of `a` and `b` (see [`Self::wider`]).
+ pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+ Self::width_predicate(a, b, |a, b| a.min(b))
+ }
+
/// Returns 8 for a numeric width, or the string width capped at 32 for a
/// string width.
+ pub fn default_display_width(&self) -> u32 {
+ match self {
+ VarWidth::Numeric => 8,
+ VarWidth::String(width) => *width.min(&32) as u32,
+ }
+ }
+
/// Converts a raw integer width: 0 is numeric, 1 through 255 is a string
/// of that width, and any other value is rejected with `Err(())`.
+ pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
+ let raw: i32 = raw.into();
+ match raw {
+ 0 => Ok(Self::Numeric),
// The range pattern guarantees the value fits in `u16`.
+ 1..=255 => Ok(Self::String(raw as u16)),
+ _ => Err(()),
+ }
+ }
+
/// Returns true only for a string width greater than 8.
+ pub fn is_long_string(&self) -> bool {
+ if let Self::String(width) = self {
+ *width > 8
+ } else {
+ false
+ }
+ }
+}
+
/// Reduces a width to its variable type, discarding the string width.
+impl From<VarWidth> for VarType {
+ fn from(source: VarWidth) -> Self {
+ match source {
+ VarWidth::Numeric => VarType::Numeric,
+ VarWidth::String(_) => VarType::String,
+ }
+ }
+}
+
/// A decoded value: an optional number or an owned string.
/// Wrapping the float in `OrderedFloat` is what allows the `Eq`, `Ord` and
/// `Hash` derives.
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Value {
/// NOTE(review): `None` presumably represents a missing value — confirm.
+ Number(Option<OrderedFloat<f64>>),
+ String(String),
+}
+
+impl Value {
/// Builds a `Value` from a raw value: a number is converted with `into`
/// (keeping `None` as `None`), and a string's bytes are decoded with
/// `decoder.decode_exact_length` and stored as an owned `String`.
+ pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
+ match raw {
+ raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
+ raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
+ }
+ }
+}
+
/// A dictionary: an ordered set of variables plus file-level metadata.
#[derive(Clone, Debug)]
pub struct Dictionary {
// Variables, kept in insertion order and keyed through `ByIdentifier`.
pub variables: IndexSet<ByIdentifier<Variable>>,
pub file_label: Option<String>,
pub documents: Vec<String>,
pub vectors: HashSet<ByIdentifier<Vector>>,
// This hunk changes attributes from a set of `Attribute` to a map from
// identifier to a list of strings.
- pub attributes: HashSet<ByIdentifier<Attribute>>,
+ pub attributes: HashMap<Identifier, Vec<String>>,
pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
pub encoding: &'static Encoding,
}
/// Error returned by `Dictionary::add_var` when the variable was not inserted.
+#[derive(Debug)]
+pub struct DuplicateVariableName;
+
impl Dictionary {
/// Creates a dictionary that uses `encoding`, with no file label and with
/// the collections shown here empty.
pub fn new(encoding: &'static Encoding) -> Self {
Self {
file_label: None,
documents: Vec::new(),
vectors: HashSet::new(),
// Matches the change of the `attributes` field type to a `HashMap`.
- attributes: HashSet::new(),
+ attributes: HashMap::new(),
mrsets: HashSet::new(),
variable_sets: HashSet::new(),
encoding,
}
}
/// Adds `variable` to the dictionary's variable set.
///
/// This hunk replaces the old `Result<(), ()>` signature: on success the
/// new version returns the index reported by `insert_full`, and when the
/// set reports that nothing was inserted it returns `DuplicateVariableName`.
/// NOTE(review): presumably `ByIdentifier` makes the variable's name the
/// uniqueness key — confirm against its `Eq`/`Hash` impls.
- pub fn add_var(&mut self, variable: Variable) -> Result<(), ()> {
- if self.variables.insert(ByIdentifier::new(variable)) {
- Ok(())
+ pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
+ let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable));
+ if inserted {
+ Ok(index)
} else {
- Err(())
+ Err(DuplicateVariableName)
}
}
if from_index != to_index {
self.variables.move_index(from_index, to_index);
self.update_dict_indexes(&|index| {
+ #[allow(clippy::collapsible_else_if)]
if index == from_index {
Some(to_index)
} else if from_index < to_index {
F: Fn(DictIndex) -> Option<DictIndex>,
{
update_dict_index_vec(&mut self.split_file, f);
- self.weight = self.weight.map(|index| f(index)).flatten();
- self.filter = self.filter.map(|index| f(index)).flatten();
+ self.weight = self.weight.and_then(f);
+ self.filter = self.filter.and_then(f);
self.vectors = self
.vectors
.drain()
vector_by_id
.0
.with_updated_dict_indexes(f)
- .map(|vector| ByIdentifier::new(vector))
+ .map(ByIdentifier::new)
})
.collect();
self.mrsets = self
mrset_by_id
.0
.with_updated_dict_indexes(f)
- .map(|mrset| ByIdentifier::new(mrset))
+ .map(ByIdentifier::new)
})
.collect();
self.variable_sets = self
var_set_by_id
.0
.with_updated_dict_indexes(f)
- .map(|var_set| ByIdentifier::new(var_set))
+ .map(ByIdentifier::new)
})
.collect();
}
alignment: Alignment::default_for_type(var_type),
leave,
short_names: Vec::new(),
- attributes: HashSet::new()
+ attributes: HashSet::new(),
}
}
}