work on lexer
[pspp] / rust / src / dictionary.rs
index f9886641f7f582398627b9ab937cf63b1ab8caa3..8d28ff329bceedca534bd59b852fef6f0d78b4d6 100644 (file)
@@ -1,4 +1,5 @@
 use std::{
+    cmp::Ordering,
     collections::{HashMap, HashSet},
     fmt::Debug,
     ops::{Bound, RangeBounds},
@@ -6,16 +7,120 @@ use std::{
 
 use encoding_rs::Encoding;
 use indexmap::IndexSet;
+use num::integer::div_ceil;
+use ordered_float::OrderedFloat;
 
 use crate::{
-    cooked::{MissingValues, Value, VarWidth},
-    format::Format,
+    format::Spec,
     identifier::{ByIdentifier, HasIdentifier, Identifier},
-    raw::{CategoryLabels, Alignment, Measure},
+    raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
 };
 
 pub type DictIndex = usize;
 
+/// The width of a variable: either numeric or a string of a given width.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum VarWidth {
+    /// A numeric variable.
+    Numeric,
+    /// A string variable; the payload is the string's width.
+    String(u16),
+}
+
+/// Partial ordering of variable widths.
+///
+/// Two numeric widths compare equal, two string widths compare by their
+/// widths, and a numeric width is incomparable with a string width.
+impl PartialOrd for VarWidth {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        match (*self, *other) {
+            (Self::Numeric, Self::Numeric) => Some(Ordering::Equal),
+            (Self::String(lhs), Self::String(rhs)) => Some(lhs.cmp(&rhs)),
+            (Self::Numeric, Self::String(_)) | (Self::String(_), Self::Numeric) => None,
+        }
+    }
+}
+
+impl VarWidth {
+    /// Maximum string width.
+    pub const MAX_STRING: u16 = 32767;
+
+    /// Returns 1 for a numeric width, or the string width divided by 8
+    /// (rounded up) for a string width.
+    pub fn n_dict_indexes(self) -> usize {
+        match self {
+            Self::Numeric => 1,
+            Self::String(width) => div_ceil(usize::from(width), 8),
+        }
+    }
+
+    /// Combines two optional widths.
+    ///
+    /// Two numeric widths yield `Numeric`; two string widths yield a string
+    /// width computed by `f` from both widths. A `None` input or a mix of
+    /// numeric and string yields `None`.
+    fn width_predicate(
+        a: Option<VarWidth>,
+        b: Option<VarWidth>,
+        f: impl Fn(u16, u16) -> u16,
+    ) -> Option<VarWidth> {
+        match (a?, b?) {
+            (Self::Numeric, Self::Numeric) => Some(Self::Numeric),
+            (Self::String(left), Self::String(right)) => Some(Self::String(f(left, right))),
+            (Self::Numeric, Self::String(_)) | (Self::String(_), Self::Numeric) => None,
+        }
+    }
+
+    /// Returns the wider of `a` and `b`:
+    /// - Numeric widths are equally wide.
+    /// - Longer strings are wider than shorter strings.
+    /// - Numeric and string widths are incomparable, so the result is `None`.
+    /// - Any `None` in the input yields `None` in the output.
+    pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+        Self::width_predicate(a, b, u16::max)
+    }
+
+    /// Returns the narrower of `a` and `b`, following the same rules as
+    /// [`Self::wider`] but picking the shorter of two strings.
+    pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+        Self::width_predicate(a, b, u16::min)
+    }
+
+    /// Returns the default display width: 8 for a numeric width, or the
+    /// string width capped at 32 for a string width.
+    pub fn default_display_width(&self) -> u32 {
+        match *self {
+            Self::Numeric => 8,
+            Self::String(width) => u32::from(width.min(32)),
+        }
+    }
+
+    /// Converts a raw integer width: 0 is numeric, 1 through 255 is a string
+    /// of that width, and anything else is an error.
+    pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
+        let code: i32 = raw.into();
+        match code {
+            0 => Ok(Self::Numeric),
+            // The range pattern guarantees the value fits in a `u16`.
+            width @ 1..=255 => Ok(Self::String(width as u16)),
+            _ => Err(()),
+        }
+    }
+
+    /// Returns true only for a string width greater than 8.
+    pub fn is_long_string(&self) -> bool {
+        matches!(self, Self::String(width) if *width > 8)
+    }
+}
+
+/// Maps a variable width to its variable type, discarding any string width.
+impl From<VarWidth> for VarType {
+    fn from(source: VarWidth) -> Self {
+        match source {
+            VarWidth::Numeric => Self::Numeric,
+            VarWidth::String(_) => Self::String,
+        }
+    }
+}
+
+/// A variable value: a number or a string.
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Value {
+    /// A numeric value.
+    /// NOTE(review): `None` presumably stands for a missing value; confirm.
+    Number(Option<OrderedFloat<f64>>),
+    /// A string value.
+    String(String),
+}
+
+impl Value {
+    /// Converts a raw value into a [`Value`].
+    ///
+    /// A raw number is converted with `Into`; a raw string has its contents
+    /// decoded by `decoder` and stored as an owned `String`.
+    pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
+        match raw {
+            raw::Value::Number(number) => Self::Number(number.map(Into::into)),
+            raw::Value::String(string) => {
+                let decoded = decoder.decode_exact_length(&string.0);
+                Self::String(decoded.into())
+            }
+        }
+    }
+}
+
 #[derive(Clone, Debug)]
 pub struct Dictionary {
     pub variables: IndexSet<ByIdentifier<Variable>>,
@@ -26,12 +131,15 @@ pub struct Dictionary {
     pub file_label: Option<String>,
     pub documents: Vec<String>,
     pub vectors: HashSet<ByIdentifier<Vector>>,
-    pub attributes: HashSet<ByIdentifier<Attribute>>,
+    pub attributes: HashMap<Identifier, Vec<String>>,
     pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
     pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
     pub encoding: &'static Encoding,
 }
 
+/// Error returned by [`Dictionary::add_var`] when the variable was not
+/// inserted because the dictionary already holds an equivalent entry.
+#[derive(Debug)]
+pub struct DuplicateVariableName;
+
 impl Dictionary {
     pub fn new(encoding: &'static Encoding) -> Self {
         Self {
@@ -43,17 +151,27 @@ impl Dictionary {
             file_label: None,
             documents: Vec::new(),
             vectors: HashSet::new(),
-            attributes: HashSet::new(),
+            attributes: HashMap::new(),
             mrsets: HashSet::new(),
             variable_sets: HashSet::new(),
             encoding,
         }
     }
 
+    /// Adds `variable` to the dictionary's variable set.
+    ///
+    /// Returns the index reported by the set for the new entry, or
+    /// [`DuplicateVariableName`] if the set did not insert it because an
+    /// equivalent entry was already present.
+    pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
+        match self.variables.insert_full(ByIdentifier::new(variable)) {
+            (index, true) => Ok(index),
+            (_, false) => Err(DuplicateVariableName),
+        }
+    }
+
     pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
         if from_index != to_index {
             self.variables.move_index(from_index, to_index);
             self.update_dict_indexes(&|index| {
+                #[allow(clippy::collapsible_else_if)]
                 if index == from_index {
                     Some(to_index)
                 } else if from_index < to_index {
@@ -128,8 +246,8 @@ impl Dictionary {
         F: Fn(DictIndex) -> Option<DictIndex>,
     {
         update_dict_index_vec(&mut self.split_file, f);
-        self.weight = self.weight.map(|index| f(index)).flatten();
-        self.filter = self.filter.map(|index| f(index)).flatten();
+        self.weight = self.weight.and_then(f);
+        self.filter = self.filter.and_then(f);
         self.vectors = self
             .vectors
             .drain()
@@ -137,7 +255,7 @@ impl Dictionary {
                 vector_by_id
                     .0
                     .with_updated_dict_indexes(f)
-                    .map(|vector| ByIdentifier::new(vector))
+                    .map(ByIdentifier::new)
             })
             .collect();
         self.mrsets = self
@@ -147,7 +265,7 @@ impl Dictionary {
                 mrset_by_id
                     .0
                     .with_updated_dict_indexes(f)
-                    .map(|mrset| ByIdentifier::new(mrset))
+                    .map(ByIdentifier::new)
             })
             .collect();
         self.variable_sets = self
@@ -157,7 +275,7 @@ impl Dictionary {
                 var_set_by_id
                     .0
                     .with_updated_dict_indexes(f)
-                    .map(|var_set| ByIdentifier::new(var_set))
+                    .map(ByIdentifier::new)
             })
             .collect();
     }
@@ -187,16 +305,48 @@ pub enum Role {
     Split,
 }
 
+/// The default role is [`Role::Input`].
+impl Default for Role {
+    fn default() -> Self {
+        Role::Input
+    }
+}
+
+/// Class of a dictionary identifier, as determined by
+/// [`DictClass::from_identifier`] from the identifier's first character.
+///
+/// The derives make this small field-less enum copyable, comparable and
+/// printable, matching the other public types in this file that derive
+/// `Debug`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum DictClass {
+    /// An identifier that starts with neither `$` nor `#`.
+    Ordinary,
+    /// An identifier that starts with `$`.
+    System,
+    /// An identifier that starts with `#`.
+    Scratch,
+}
+
+impl DictClass {
+    /// Classifies `id` by its first character: `$` is [`Self::System`], `#`
+    /// is [`Self::Scratch`], and anything else is [`Self::Ordinary`].
+    pub fn from_identifier(id: &Identifier) -> Self {
+        let name = &id.0;
+        if name.starts_with('$') {
+            return Self::System;
+        }
+        if name.starts_with('#') {
+            return Self::Scratch;
+        }
+        Self::Ordinary
+    }
+
+    /// Returns true only for [`Self::Scratch`].
+    pub fn must_leave(self) -> bool {
+        matches!(self, Self::Scratch)
+    }
+}
+
 #[derive(Clone, Debug)]
 pub struct Variable {
     pub name: Identifier,
     pub width: VarWidth,
     pub missing_values: MissingValues,
-    pub print_format: Format,
-    pub write_format: Format,
+    pub print_format: Spec,
+    pub write_format: Spec,
     pub value_labels: HashMap<Value, String>,
     pub label: Option<String>,
-    pub measure: Measure,
+    pub measure: Option<Measure>,
     pub role: Role,
     pub display_width: u32,
     pub alignment: Alignment,
@@ -205,6 +355,29 @@ pub struct Variable {
     pub attributes: HashSet<ByIdentifier<Attribute>>,
 }
 
+impl Variable {
+    /// Creates a variable called `name` with the given `width`, filling every
+    /// other field with a default.
+    ///
+    /// The formats and display width defaults are derived from `width`; the
+    /// measure and alignment defaults are derived from the variable type
+    /// obtained from `width`; `leave` comes from the dictionary class of
+    /// `name`. The label is `None`, and the value labels, short names and
+    /// attributes start empty.
+    pub fn new(name: Identifier, width: VarWidth) -> Self {
+        let kind = VarType::from_width(width);
+        // Must be computed before `name` is moved into the struct below.
+        let must_leave = DictClass::from_identifier(&name).must_leave();
+        Self {
+            name,
+            width,
+            missing_values: MissingValues::default(),
+            print_format: Spec::default_for_width(width),
+            write_format: Spec::default_for_width(width),
+            value_labels: HashMap::new(),
+            label: None,
+            measure: Measure::default_for_type(kind),
+            role: Role::default(),
+            display_width: width.default_display_width(),
+            alignment: Alignment::default_for_type(kind),
+            leave: must_leave,
+            short_names: Vec::new(),
+            attributes: HashSet::new(),
+        }
+    }
+}
+
 impl HasIdentifier for Variable {
     fn identifier(&self) -> &Identifier {
         &self.name