work
author: Ben Pfaff <blp@cs.stanford.edu>
Sun, 10 Dec 2023 21:12:51 +0000 (13:12 -0800)
committer: Ben Pfaff <blp@cs.stanford.edu>
Sun, 10 Dec 2023 21:12:51 +0000 (13:12 -0800)
rust/Cargo.lock
rust/Cargo.toml
rust/src/cooked.rs
rust/src/dictionary.rs [new file with mode: 0644]
rust/src/identifier.rs
rust/src/lib.rs

index d9d4fc6e37f9d2206802ec5733f61a84a700703d..8b89b6fb95bc7e1b659885472c3aae11cd20da64 100644 (file)
@@ -147,6 +147,12 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
 [[package]]
 name = "errno"
 version = "0.2.8"
@@ -201,6 +207,12 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
 
+[[package]]
+name = "hashbrown"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
+
 [[package]]
 name = "heck"
 version = "0.4.1"
@@ -255,6 +267,16 @@ dependencies = [
  "cc",
 ]
 
+[[package]]
+name = "indexmap"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
 [[package]]
 name = "io-lifetimes"
 version = "1.0.5"
@@ -478,6 +500,7 @@ dependencies = [
  "flate2",
  "float_next_after",
  "hexplay",
+ "indexmap",
  "lazy_static",
  "libc",
  "num",
index 0e7523976483d7bd727aac2c861bea536d934102..5131409ac01d631ca5144640c128c159585fddb8 100644 (file)
@@ -21,6 +21,7 @@ chrono = "0.4.26"
 finl_unicode = "1.2.0"
 unicase = "2.6.0"
 libc = "0.2.147"
+indexmap = "2.1.0"
 
 [target.'cfg(windows)'.dependencies]
 windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
index f1753846c21f1c135750a533a845df87d7583e69..be1b4c4b71fd77fa9cb344dec127ef9491cb2b87 100644 (file)
@@ -210,54 +210,6 @@ pub struct Decoder {
     n_generated_names: usize,
 }
 
-fn decode_sort_order(record: &raw::Record) -> i32 {
-    match record {
-        // File header record.
-        raw::Record::Header(_) => 0,
-
-        // Then the records used to decide character encoding.
-        raw::Record::Encoding(_) => 1,
-        raw::Record::IntegerInfo(_) => 2,
-
-        // Then the other records that don't use variables at all.
-        raw::Record::Document(_) => 3,
-        raw::Record::FloatInfo(_) => 4,
-        raw::Record::ProductInfo(_) => 5,
-        raw::Record::FileAttributes(_) => 6,
-
-        // Variable records.
-        raw::Record::Variable(_) => 7,
-
-        // These records use variable indexes that would be invalidated by very
-        // long string variables.
-        raw::Record::ValueLabel(_) => 8,
-        raw::Record::VarDisplay(_) => 9,
-
-        // These records use short names.
-        raw::Record::MultipleResponse(_) => 10,
-        raw::Record::VeryLongStrings(_) => 11,
-
-        // Rename short names to long names.
-        raw::Record::LongNames(_) => 12,
-
-        // These records use long names.
-        raw::Record::VariableAttributes(_) => 13,
-        raw::Record::LongStringValueLabels(_) => 14,
-        raw::Record::LongStringMissingValues(_) => 15,
-        raw::Record::VariableSets(_) => 16,
-
-        // Cases come last.
-        raw::Record::Cases(_) => 17,
-
-        // We don't use these records at all.
-        raw::Record::NumberOfCases(_) => i32::MAX,
-        raw::Record::OtherExtension(_) => i32::MAX,
-        raw::Record::EndOfHeaders(_) => i32::MAX,
-        raw::Record::ZHeader(_) => i32::MAX,
-        raw::Record::ZTrailer(_) => i32::MAX,
-    }
-}
-
 #[derive(Default)]
 struct Headers<'a> {
     header: Option<&'a raw::HeaderRecord>,
@@ -274,7 +226,7 @@ struct Headers<'a> {
     encoding: Option<&'a raw::EncodingRecord>,
     number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
     product_info: Option<&'a raw::TextRecord>,
-    long_names: Vec<&'a raw::TextRecord>,
+    long_names: Option<&'a raw::TextRecord>,
     very_long_strings: Vec<&'a raw::TextRecord>,
     file_attributes: Vec<&'a raw::TextRecord>,
     variable_attributes: Vec<&'a raw::TextRecord>,
@@ -315,14 +267,14 @@ impl<'a> Headers<'a> {
                     set_or_warn(&mut h.number_of_cases, record, warn)
                 }
                 raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn),
-                raw::Record::LongNames(record) => h.long_names.push(record),
+                raw::Record::LongNames(record) => set_or_warn(&mut h.long_names, record, warn),
                 raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record),
                 raw::Record::FileAttributes(record) => h.file_attributes.push(record),
                 raw::Record::VariableAttributes(record) => h.variable_attributes.push(record),
                 raw::Record::OtherExtension(record) => h.other_extensions.push(record),
-                raw::Record::EndOfHeaders(_) => todo!(),
-                raw::Record::ZHeader(_) => todo!(),
-                raw::Record::ZTrailer(_) => todo!(),
+                raw::Record::EndOfHeaders(_) => (),
+                raw::Record::ZHeader(_) => (),
+                raw::Record::ZTrailer(_) => (),
                 raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn),
             }
         }
diff --git a/rust/src/dictionary.rs b/rust/src/dictionary.rs
new file mode 100644 (file)
index 0000000..fd408d7
--- /dev/null
@@ -0,0 +1,292 @@
+use std::{
+    collections::{HashMap, HashSet},
+    fmt::Debug,
+};
+
+use encoding_rs::Encoding;
+use indexmap::IndexSet;
+
+use crate::{
+    cooked::{Alignment, Measure, MissingValues, Value, VarWidth},
+    format::Format,
+    identifier::{ByIdentifier, HasIdentifier, Identifier},
+    raw::CategoryLabels,
+};
+
+pub type DictIndex = usize;
+
+/// A set of variables plus dictionary-wide settings.
+///
+/// Several fields refer to variables by [`DictIndex`] rather than by
+/// reference; `Dictionary::delete_vars` renumbers those indexes.
+#[derive(Clone, Debug)]
+pub struct Dictionary {
+    /// The variables, held in an `IndexSet` whose elements hash and compare by
+    /// identifier (see `ByIdentifier`).
+    pub variables: IndexSet<ByIdentifier<Variable>>,
+    /// Dictionary indexes; presumably the split-file variables -- TODO confirm.
+    pub split_file: Vec<DictIndex>,
+    /// Dictionary index, if any; presumably the weighting variable -- TODO
+    /// confirm.
+    pub weight: Option<DictIndex>,
+    /// Dictionary index, if any; presumably the filter variable -- TODO
+    /// confirm.
+    pub filter: Option<DictIndex>,
+    /// Optional case limit.  NOTE(review): exact semantics are not visible in
+    /// this file.
+    pub case_limit: Option<u64>,
+    /// Optional file label.
+    pub file_label: Option<String>,
+    /// Document strings.
+    pub documents: Vec<String>,
+    /// Vectors, hashed and compared by identifier.
+    pub vectors: HashSet<ByIdentifier<Vector>>,
+    /// Dictionary-level attributes, hashed and compared by identifier.
+    pub attributes: HashSet<ByIdentifier<Attribute>>,
+    /// Multiple response sets, hashed and compared by identifier.
+    pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
+    /// Variable sets, hashed and compared by identifier.
+    pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
+    /// Character encoding associated with this dictionary.
+    pub encoding: &'static Encoding,
+}
+
+impl Dictionary {
+    pub fn new(encoding: &'static Encoding) -> Self {
+        Self {
+            variables: IndexSet::new(),
+            split_file: Vec::new(),
+            weight: None,
+            filter: None,
+            case_limit: None,
+            file_label: None,
+            documents: Vec::new(),
+            vectors: HashSet::new(),
+            attributes: HashSet::new(),
+            mrsets: HashSet::new(),
+            variable_sets: HashSet::new(),
+            encoding,
+        }
+    }
+
+    pub fn delete_vars(&mut self, start: DictIndex, count: usize) {
+        self.update_dict_indexes(&|index| {
+            if index < start {
+                Some(index)
+            } else if index < start + count {
+                None
+            } else {
+                Some(index - count)
+            }
+        })
+    }
+
+    fn update_dict_indexes<F>(&mut self, f: &F)
+    where
+        F: Fn(DictIndex) -> Option<DictIndex>,
+    {
+        update_dict_index_vec(&mut self.split_file, f);
+        self.weight = self.weight.map(|index| f(index)).flatten();
+        self.filter = self.filter.map(|index| f(index)).flatten();
+        self.vectors = self
+            .vectors
+            .drain()
+            .filter_map(|vector_by_id| {
+                vector_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(|vector| ByIdentifier::new(vector))
+            })
+            .collect();
+        self.mrsets = self
+            .mrsets
+            .drain()
+            .filter_map(|mrset_by_id| {
+                mrset_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(|mrset| ByIdentifier::new(mrset))
+            })
+            .collect();
+        self.variable_sets = self
+            .variable_sets
+            .drain()
+            .filter_map(|var_set_by_id| {
+                var_set_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(|var_set| ByIdentifier::new(var_set))
+            })
+            .collect();
+    }
+}
+
+fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
+where
+    F: Fn(DictIndex) -> Option<DictIndex>,
+{
+    dict_indexes.retain_mut(|index| {
+        if let Some(new) = f(*index) {
+            *index = new;
+            true
+        } else {
+            false
+        }
+    });
+}
+
+/// The role assigned to a [`Variable`] (see `Variable::role`).
+///
+/// NOTE(review): the meaning of the individual roles is not visible in this
+/// file -- confirm against the file format documentation before relying on it.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub enum Role {
+    Input,
+    Target,
+    Both,
+    None,
+    Partition,
+    Split,
+}
+
+/// A variable in a [`Dictionary`].
+#[derive(Clone, Debug)]
+pub struct Variable {
+    /// The variable's name; this is what `HasIdentifier::identifier` returns.
+    pub name: Identifier,
+    /// The variable's width.
+    pub width: VarWidth,
+    /// The variable's missing values.
+    pub missing_values: MissingValues,
+    /// Print format.
+    pub print_format: Format,
+    /// Write format.
+    pub write_format: Format,
+    /// Labels for individual values, keyed by value.
+    pub value_labels: HashMap<Value, String>,
+    /// Optional variable label.
+    pub label: Option<String>,
+    /// Measurement level.
+    pub measure: Measure,
+    /// Role.
+    pub role: Role,
+    /// Display width.  NOTE(review): units are not visible in this file.
+    pub display_width: u32,
+    /// Display alignment.
+    pub alignment: Alignment,
+    /// NOTE(review): semantics not visible in this file -- TODO document.
+    pub leave: bool,
+    /// Short names associated with this variable.  NOTE(review): presumably
+    /// one per segment of a very long string variable -- confirm.
+    pub short_names: Vec<Identifier>,
+    /// Variable-level attributes, hashed and compared by identifier.
+    pub attributes: HashSet<ByIdentifier<Attribute>>,
+}
+
+impl HasIdentifier for Variable {
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// A named, ordered list of variables, referred to by dictionary index.
+#[derive(Clone, Debug)]
+pub struct Vector {
+    /// The vector's name; this is what `HasIdentifier::identifier` returns.
+    pub name: Identifier,
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl Vector {
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (!self.variables.is_empty()).then_some(self)
+    }
+}
+
+impl HasIdentifier for Vector {
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// A named attribute carrying a list of string values.
+#[derive(Clone, Debug)]
+pub struct Attribute {
+    /// The attribute's name; this is what `HasIdentifier::identifier` returns.
+    pub name: Identifier,
+    /// The attribute's values.
+    pub values: Vec<String>,
+}
+
+impl HasIdentifier for Attribute {
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// A named multiple response set over variables referred to by dictionary
+/// index.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+    /// The set's name; this is what `HasIdentifier::identifier` returns.
+    pub name: Identifier,
+    /// The set's label.
+    pub label: String,
+    /// The kind of multiple response set.
+    pub mr_type: MultipleResponseType,
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl MultipleResponseSet {
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (self.variables.len() > 1).then_some(self)
+    }
+}
+
+impl HasIdentifier for MultipleResponseSet {
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// The kind of a [`MultipleResponseSet`] (see `MultipleResponseSet::mr_type`).
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+    /// Multiple-dichotomy set.
+    MultipleDichotomy {
+        /// NOTE(review): presumably the value that counts as "selected" --
+        /// confirm.
+        value: Value,
+        /// How category labels are chosen (see `raw::CategoryLabels`).
+        labels: CategoryLabels,
+    },
+    /// Multiple-category set.
+    MultipleCategory,
+}
+
+/// A named set of variables, referred to by dictionary index.
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+    /// The set's name; this is what `HasIdentifier::identifier` returns.
+    pub name: Identifier,
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl VariableSet {
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (!self.variables.is_empty()).then_some(self)
+    }
+}
+
+impl HasIdentifier for VariableSet {
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::collections::HashSet;
+
+    use crate::identifier::Identifier;
+
+    use super::{ByIdentifier, HasIdentifier};
+
+    #[derive(PartialEq, Eq, Debug, Clone)]
+    struct Variable {
+        name: Identifier,
+        value: i32,
+    }
+
+    impl HasIdentifier for Variable {
+        fn identifier(&self) -> &Identifier {
+            &self.name
+        }
+    }
+
+    #[test]
+    fn test() {
+        // Variables should not be the same if their values differ.
+        let abcd = Identifier::new_utf8("abcd").unwrap();
+        let abcd1 = Variable {
+            name: abcd.clone(),
+            value: 1,
+        };
+        let abcd2 = Variable {
+            name: abcd,
+            value: 2,
+        };
+        assert_ne!(abcd1, abcd2);
+
+        // But `ByName` should treat them the same.
+        let abcd1_by_name = ByIdentifier::new(abcd1);
+        let abcd2_by_name = ByIdentifier::new(abcd2);
+        assert_eq!(abcd1_by_name, abcd2_by_name);
+
+        // And a `HashSet` of `ByName` should also treat them the same.
+        let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
+        assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
+        assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
+        assert_eq!(
+            vars.get(&Identifier::new_utf8("abcd").unwrap())
+                .unwrap()
+                .0
+                .value,
+            1
+        );
+    }
+}
index 8727bf1ea360454bdc15a911dc7ec1a0cd563f49..d8b5219920b9958233d3f71e97304b0cbf72b4d5 100644 (file)
@@ -1,6 +1,11 @@
-use std::fmt::{Display, Formatter, Result as FmtResult};
-
-use encoding_rs::{EncoderResult, Encoding};
+use std::{
+    borrow::Borrow,
+    cmp::Ordering,
+    fmt::{Debug, Display, Formatter, Result as FmtResult},
+    hash::{Hash, Hasher},
+};
+
+use encoding_rs::{EncoderResult, Encoding, UTF_8};
 use finl_unicode::categories::{CharacterCategories, MajorCategory};
 use thiserror::Error as ThisError;
 use unicase::UniCase;
@@ -71,7 +76,7 @@ fn is_reserved_word(s: &str) -> bool {
     false
 }
 
-#[derive(Clone, PartialEq, Eq, Debug, Hash)]
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
 pub struct Identifier(pub UniCase<String>);
 
 impl Identifier {
@@ -79,6 +84,9 @@ impl Identifier {
     /// encoding used by the dictionary, not in UTF-8.
     pub const MAX_LEN: usize = 64;
 
+    pub fn new_utf8(s: &str) -> Result<Identifier, Error> {
+        Self::new(s, UTF_8)
+    }
     pub fn new(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
         Self::is_plausible(s)?;
         let (encoded, _, unencodable) = encoding.encode(s);
@@ -138,3 +146,85 @@ impl Display for Identifier {
         write!(f, "{}", self.0)
     }
 }
+
+/// Implemented by types that carry an [`Identifier`].
+///
+/// [`ByIdentifier`] uses this to compare, order, and hash values by their
+/// identifier alone.
+pub trait HasIdentifier {
+    /// Returns a reference to this value's identifier.
+    fn identifier(&self) -> &Identifier;
+}
+
+/// Wrapper around a `T` whose equality, ordering, and hashing consider only
+/// `T`'s identifier.
+///
+/// The wrapper also implements `Borrow<Identifier>`, so that hashed
+/// collections of `ByIdentifier<T>` can be searched with a plain `Identifier`.
+/// The wrapped value is available as field `0`.
+pub struct ByIdentifier<T>(pub T)
+where
+    T: HasIdentifier;
+
+impl<T> ByIdentifier<T>
+where
+    T: HasIdentifier,
+{
+    pub fn new(inner: T) -> Self {
+        Self(inner)
+    }
+}
+
+impl<T> PartialEq for ByIdentifier<T>
+where
+    T: HasIdentifier,
+{
+    fn eq(&self, other: &Self) -> bool {
+        self.0.identifier().eq(other.0.identifier())
+    }
+}
+
+impl<T> Eq for ByIdentifier<T> where T: HasIdentifier {}
+
+impl<T> PartialOrd for ByIdentifier<T>
+where
+    T: HasIdentifier,
+{
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl<T> Ord for ByIdentifier<T>
+where
+    T: HasIdentifier,
+{
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.0.identifier().cmp(other.0.identifier())
+    }
+}
+
+impl<T> Hash for ByIdentifier<T>
+where
+    T: HasIdentifier,
+{
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.0.identifier().hash(state)
+    }
+}
+
+impl<T> Borrow<Identifier> for ByIdentifier<T>
+where
+    T: HasIdentifier,
+{
+    fn borrow(&self) -> &Identifier {
+        self.0.identifier()
+    }
+}
+
+impl<T> Debug for ByIdentifier<T>
+where
+    T: HasIdentifier + Debug,
+{
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        self.0.fmt(f)
+    }
+}
+
+impl<T> Clone for ByIdentifier<T>
+where
+    T: HasIdentifier + Clone,
+{
+    fn clone(&self) -> Self {
+        Self(self.0.clone())
+    }
+}
index 86422046bb2991981e4a3a7c4708935dfb58cfa0..f8e880c14e21641d96ea5a8f8e626f0963ba263c 100644 (file)
@@ -1,4 +1,5 @@
 pub mod cooked;
+pub mod dictionary;
 pub mod encoding;
 pub mod endian;
 pub mod format;