use std::{
+ cmp::Ordering,
collections::{HashMap, HashSet},
fmt::Debug,
ops::{Bound, RangeBounds},
use encoding_rs::Encoding;
use indexmap::IndexSet;
+use num::integer::div_ceil;
+use ordered_float::OrderedFloat;
use crate::{
- cooked::{MissingValues, Value, VarWidth},
- format::Format,
+ format::Spec,
identifier::{ByIdentifier, HasIdentifier, Identifier},
- raw::{CategoryLabels, Alignment, Measure},
+ raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
};
pub type DictIndex = usize;
/// Width of a variable: either numeric or a string carrying a `u16` width.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum VarWidth {
/// A numeric variable.
+ Numeric,
/// A string variable; the payload is its width.
/// NOTE(review): the unit is presumably bytes — confirm.
+ String(u16),
+}
+
/// Partial ordering on widths: two numeric widths compare as equal, two
/// string widths compare by their `u16` width, and a numeric width is
/// incomparable with a string width.
+impl PartialOrd for VarWidth {
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ match (self, other) {
+ (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
+ (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
// Mixed numeric/string pairs have no defined order.
+ _ => None,
+ }
+ }
+}
+
+impl VarWidth {
/// NOTE(review): by its name, the largest string width; nothing in this
/// block enforces it (`from_raw` only accepts 1..=255) — confirm intent.
+ pub const MAX_STRING: u16 = 32767;
+
/// Returns 1 for a numeric width; for a string of width `w`, returns `w`
/// divided by 8, rounded up.
+ pub fn n_dict_indexes(self) -> usize {
+ match self {
+ VarWidth::Numeric => 1,
+ VarWidth::String(w) => div_ceil(w as usize, 8),
+ }
+ }
+
/// Shared helper for [`Self::wider`] and [`Self::narrower`]: two numeric
/// widths yield `Numeric`, two string widths yield a string whose width is
/// `f` applied to both widths, and every other combination (including any
/// `None`) yields `None`.
+ fn width_predicate(
+ a: Option<VarWidth>,
+ b: Option<VarWidth>,
+ f: impl Fn(u16, u16) -> u16,
+ ) -> Option<VarWidth> {
+ match (a, b) {
+ (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
+ (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
+ Some(VarWidth::String(f(a, b)))
+ }
+ _ => None,
+ }
+ }
+
+ /// Returns the wider of `a` and `b`:
+ /// - Numerical variable widths are equally wide.
+ /// - Longer strings are wider than shorter strings.
+ /// - Numerical and string types are incomparable, so result in `None`.
+ /// - Any `None` in the input yields `None` in the output.
+ pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+ Self::width_predicate(a, b, |a, b| a.max(b))
+ }
+
+ /// Returns the narrower of `a` and `b` (see [`Self::wider`]).
+ pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+ Self::width_predicate(a, b, |a, b| a.min(b))
+ }
+
/// Returns the default display width: 8 for a numeric variable, and the
/// string width capped at 32 for a string variable.
+ pub fn default_display_width(&self) -> u32 {
+ match self {
+ VarWidth::Numeric => 8,
+ VarWidth::String(width) => *width.min(&32) as u32,
+ }
+ }
+
/// Converts a raw integer width: 0 becomes `Numeric`, 1 through 255
/// becomes `String` with that width, and anything else is `Err(())`.
+ pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
+ let raw: i32 = raw.into();
+ match raw {
+ 0 => Ok(Self::Numeric),
// The range pattern guarantees the value fits in `u16`.
+ 1..=255 => Ok(Self::String(raw as u16)),
+ _ => Err(()),
+ }
+ }
+
/// Returns true only for a string width greater than 8; numeric widths
/// and strings of width 8 or less return false.
+ pub fn is_long_string(&self) -> bool {
+ if let Self::String(width) = self {
+ *width > 8
+ } else {
+ false
+ }
+ }
+}
+
/// Reduces a width to its type: `Numeric` maps to `VarType::Numeric` and any
/// `String(_)` maps to `VarType::String`, discarding the string width.
+impl From<VarWidth> for VarType {
+ fn from(source: VarWidth) -> Self {
+ match source {
+ VarWidth::Numeric => VarType::Numeric,
+ VarWidth::String(_) => VarType::String,
+ }
+ }
+}
+
/// A value that is either an optional number or a string. The number is
/// wrapped in `OrderedFloat`, which lets the type derive `Eq`, `Ord`, and
/// `Hash`.
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Value {
/// NOTE(review): `None` presumably represents a missing number — confirm.
+ Number(Option<OrderedFloat<f64>>),
+ String(String),
+}
+
+impl Value {
/// Converts a raw value into a [`Value`]. A raw number keeps its
/// `Some`/`None` state and is converted with `into()`; a raw string has
/// its inner bytes (`s.0`) passed to `decoder.decode_exact_length` and the
/// result converted into an owned `String`.
/// NOTE(review): the exact semantics of `decode_exact_length` are defined
/// elsewhere — confirm how padding and invalid bytes are handled.
+ pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
+ match raw {
+ raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
+ raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
+ }
+ }
+}
+
/// A dictionary: a set of variables plus file-level metadata (file label,
/// documents, vectors, attributes, multiple response sets, variable sets) and
/// the character encoding.
#[derive(Clone, Debug)]
pub struct Dictionary {
// `IndexSet` keeps insertion order and gives each entry a positional index.
pub variables: IndexSet<ByIdentifier<Variable>>,
pub file_label: Option<String>,
pub documents: Vec<String>,
pub vectors: HashSet<ByIdentifier<Vector>>,
- pub attributes: HashSet<ByIdentifier<Attribute>>,
// Maps an identifier to a list of strings (presumably attribute name to
// its values — confirm).
+ pub attributes: HashMap<Identifier, Vec<String>>,
pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
pub encoding: &'static Encoding,
}
/// Error returned by `Dictionary::add_var` when the variable set reports that
/// the new variable was not inserted.
+#[derive(Debug)]
+pub struct DuplicateVariableName;
+
impl Dictionary {
/// Creates a dictionary that uses `encoding`, with no file label and empty
/// documents, vectors, attributes, multiple response sets, and variable sets.
pub fn new(encoding: &'static Encoding) -> Self {
Self {
file_label: None,
documents: Vec::new(),
vectors: HashSet::new(),
- attributes: HashSet::new(),
+ attributes: HashMap::new(),
mrsets: HashSet::new(),
variable_sets: HashSet::new(),
encoding,
}
}
/// Adds `variable` to the dictionary's variable set.
///
/// Returns the index reported by `insert_full` when the variable was newly
/// inserted, or `DuplicateVariableName` when the set reports that it was not
/// inserted (an equivalent entry, presumably one with the same identifier,
/// already exists).
+ pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
+ let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable));
+ if inserted {
+ Ok(index)
+ } else {
+ Err(DuplicateVariableName)
+ }
+ }
+
pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
if from_index != to_index {
self.variables.move_index(from_index, to_index);
self.update_dict_indexes(&|index| {
+ #[allow(clippy::collapsible_else_if)]
if index == from_index {
Some(to_index)
} else if from_index < to_index {
F: Fn(DictIndex) -> Option<DictIndex>,
{
update_dict_index_vec(&mut self.split_file, f);
- self.weight = self.weight.map(|index| f(index)).flatten();
- self.filter = self.filter.map(|index| f(index)).flatten();
+ self.weight = self.weight.and_then(f);
+ self.filter = self.filter.and_then(f);
self.vectors = self
.vectors
.drain()
vector_by_id
.0
.with_updated_dict_indexes(f)
- .map(|vector| ByIdentifier::new(vector))
+ .map(ByIdentifier::new)
})
.collect();
self.mrsets = self
mrset_by_id
.0
.with_updated_dict_indexes(f)
- .map(|mrset| ByIdentifier::new(mrset))
+ .map(ByIdentifier::new)
})
.collect();
self.variable_sets = self
var_set_by_id
.0
.with_updated_dict_indexes(f)
- .map(|var_set| ByIdentifier::new(var_set))
+ .map(ByIdentifier::new)
})
.collect();
}
Split,
}
/// The default role is `Role::Input`.
+impl Default for Role {
+ fn default() -> Self {
+ Self::Input
+ }
+}
+
/// Classification of an identifier by its leading character, as computed by
/// `DictClass::from_identifier`.
+pub enum DictClass {
/// Identifier starting with neither `$` nor `#`.
+ Ordinary,
/// Identifier starting with `$`.
+ System,
/// Identifier starting with `#`.
+ Scratch,
+}
+
+impl DictClass {
/// Classifies `id` by the first character of its inner string: `$` gives
/// `System`, `#` gives `Scratch`, and anything else gives `Ordinary`.
+ pub fn from_identifier(id: &Identifier) -> Self {
+ if id.0.starts_with('$') {
+ Self::System
+ } else if id.0.starts_with('#') {
+ Self::Scratch
+ } else {
+ Self::Ordinary
+ }
+ }
+
/// Returns true only for `Scratch`; `Ordinary` and `System` return false.
/// `Variable::new` uses the result as the initial value of `leave`.
+ pub fn must_leave(self) -> bool {
+ match self {
+ DictClass::Ordinary => false,
+ DictClass::System => false,
+ DictClass::Scratch => true,
+ }
+ }
+}
+
/// A variable in a dictionary: its name, width, missing values, print and
/// write formats, value labels, and display-related metadata.
#[derive(Clone, Debug)]
pub struct Variable {
pub name: Identifier,
pub width: VarWidth,
pub missing_values: MissingValues,
- pub print_format: Format,
- pub write_format: Format,
+ pub print_format: Spec,
+ pub write_format: Spec,
// Maps a value to its label text.
pub value_labels: HashMap<Value, String>,
pub label: Option<String>,
- pub measure: Measure,
// NOTE(review): `None` presumably means no measurement level is set — confirm.
+ pub measure: Option<Measure>,
pub role: Role,
pub display_width: u32,
pub alignment: Alignment,
pub attributes: HashSet<ByIdentifier<Attribute>>,
}
+impl Variable {
/// Creates a variable named `name` with the given `width` and default
/// settings: default missing values, print and write formats from
/// `Spec::default_for_width`, no value labels or label, measure and
/// alignment defaults for the variable's type, the default role, the
/// width's default display width, and no attributes.
///
/// NOTE(review): `leave` and `short_names` are initialized here but are not
/// among the `Variable` fields visible in this excerpt — confirm the struct
/// declares them.
+ pub fn new(name: Identifier, width: VarWidth) -> Self {
+ let var_type = VarType::from_width(width);
// Computed before `name` is moved into the struct below.
+ let leave = DictClass::from_identifier(&name).must_leave();
+ Self {
+ name,
+ width,
+ missing_values: MissingValues::default(),
+ print_format: Spec::default_for_width(width),
+ write_format: Spec::default_for_width(width),
+ value_labels: HashMap::new(),
+ label: None,
+ measure: Measure::default_for_type(var_type),
+ role: Role::default(),
+ display_width: width.default_display_width(),
+ alignment: Alignment::default_for_type(var_type),
+ leave,
+ short_names: Vec::new(),
+ attributes: HashSet::new(),
+ }
+ }
+}
+
impl HasIdentifier for Variable {
fn identifier(&self) -> &Identifier {
&self.name