From 98eeb0ffdd812543c011752cfb74e375cbf715ba Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 10 Dec 2023 13:12:51 -0800 Subject: [PATCH] work --- rust/Cargo.lock | 23 ++++ rust/Cargo.toml | 1 + rust/src/cooked.rs | 58 +------- rust/src/dictionary.rs | 292 +++++++++++++++++++++++++++++++++++++++++ rust/src/identifier.rs | 98 +++++++++++++- rust/src/lib.rs | 1 + 6 files changed, 416 insertions(+), 57 deletions(-) create mode 100644 rust/src/dictionary.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index d9d4fc6e37..8b89b6fb95 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -147,6 +147,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.2.8" @@ -201,6 +207,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" + [[package]] name = "heck" version = "0.4.1" @@ -255,6 +267,16 @@ dependencies = [ "cc", ] +[[package]] +name = "indexmap" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "io-lifetimes" version = "1.0.5" @@ -478,6 +500,7 @@ dependencies = [ "flate2", "float_next_after", "hexplay", + "indexmap", "lazy_static", "libc", "num", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 0e75239764..5131409ac0 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -21,6 +21,7 @@ chrono = "0.4.26" finl_unicode = "1.2.0" unicase = "2.6.0" libc = "0.2.147" +indexmap = "2.1.0" [target.'cfg(windows)'.dependencies] windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] } diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index f1753846c2..be1b4c4b71 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -210,54 +210,6 @@ pub struct Decoder { n_generated_names: usize, } -fn decode_sort_order(record: &raw::Record) -> i32 { - match record { - // File header record. - raw::Record::Header(_) => 0, - - // Then the records used to decide character encoding. - raw::Record::Encoding(_) => 1, - raw::Record::IntegerInfo(_) => 2, - - // Then the other records that don't use variables at all. - raw::Record::Document(_) => 3, - raw::Record::FloatInfo(_) => 4, - raw::Record::ProductInfo(_) => 5, - raw::Record::FileAttributes(_) => 6, - - // Variable records. - raw::Record::Variable(_) => 7, - - // These records use variable indexes that would be invalidated by very - // long string variables. - raw::Record::ValueLabel(_) => 8, - raw::Record::VarDisplay(_) => 9, - - // These records use short names. - raw::Record::MultipleResponse(_) => 10, - raw::Record::VeryLongStrings(_) => 11, - - // Rename short names to long names. - raw::Record::LongNames(_) => 12, - - // These records use long names. - raw::Record::VariableAttributes(_) => 13, - raw::Record::LongStringValueLabels(_) => 14, - raw::Record::LongStringMissingValues(_) => 15, - raw::Record::VariableSets(_) => 16, - - // Cases come last. - raw::Record::Cases(_) => 17, - - // We don't use these records at all. - raw::Record::NumberOfCases(_) => i32::MAX, - raw::Record::OtherExtension(_) => i32::MAX, - raw::Record::EndOfHeaders(_) => i32::MAX, - raw::Record::ZHeader(_) => i32::MAX, - raw::Record::ZTrailer(_) => i32::MAX, - } -} - #[derive(Default)] struct Headers<'a> { header: Option<&'a raw::HeaderRecord>, @@ -274,7 +226,7 @@ struct Headers<'a> { encoding: Option<&'a raw::EncodingRecord>, number_of_cases: Option<&'a raw::NumberOfCasesRecord>, product_info: Option<&'a raw::TextRecord>, - long_names: Vec<&'a raw::TextRecord>, + long_names: Option<&'a raw::TextRecord>, very_long_strings: Vec<&'a raw::TextRecord>, file_attributes: Vec<&'a raw::TextRecord>, variable_attributes: Vec<&'a raw::TextRecord>, @@ -315,14 +267,14 @@ impl<'a> Headers<'a> { set_or_warn(&mut h.number_of_cases, record, warn) } raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn), - raw::Record::LongNames(record) => h.long_names.push(record), + raw::Record::LongNames(record) => set_or_warn(&mut h.long_names, record, warn), raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record), raw::Record::FileAttributes(record) => h.file_attributes.push(record), raw::Record::VariableAttributes(record) => h.variable_attributes.push(record), raw::Record::OtherExtension(record) => h.other_extensions.push(record), - raw::Record::EndOfHeaders(_) => todo!(), - raw::Record::ZHeader(_) => todo!(), - raw::Record::ZTrailer(_) => todo!(), + raw::Record::EndOfHeaders(_) => (), + raw::Record::ZHeader(_) => (), + raw::Record::ZTrailer(_) => (), raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn), } } diff --git a/rust/src/dictionary.rs b/rust/src/dictionary.rs new file mode 100644 index 0000000000..fd408d7725 --- /dev/null +++ b/rust/src/dictionary.rs @@ -0,0 +1,292 @@ +use std::{ + collections::{HashMap, HashSet}, + fmt::Debug, +}; + +use encoding_rs::Encoding; +use indexmap::IndexSet; + +use crate::{ + cooked::{Alignment, Measure, MissingValues, Value, VarWidth}, + format::Format, + identifier::{ByIdentifier, HasIdentifier, Identifier}, + raw::CategoryLabels, +}; + +pub type DictIndex = usize; + +#[derive(Clone, Debug)] +pub struct Dictionary { + pub variables: IndexSet>, + pub split_file: Vec, + pub weight: Option, + pub filter: Option, + pub case_limit: Option, + pub file_label: Option, + pub documents: Vec, + pub vectors: HashSet>, + pub attributes: HashSet>, + pub mrsets: HashSet>, + pub variable_sets: HashSet>, + pub encoding: &'static Encoding, +} + +impl Dictionary { + pub fn new(encoding: &'static Encoding) -> Self { + Self { + variables: IndexSet::new(), + split_file: Vec::new(), + weight: None, + filter: None, + case_limit: None, + file_label: None, + documents: Vec::new(), + vectors: HashSet::new(), + attributes: HashSet::new(), + mrsets: HashSet::new(), + variable_sets: HashSet::new(), + encoding, + } + } + + pub fn delete_vars(&mut self, start: DictIndex, count: usize) { + self.update_dict_indexes(&|index| { + if index < start { + Some(index) + } else if index < start + count { + None + } else { + Some(index - count) + } + }) + } + + fn update_dict_indexes(&mut self, f: &F) + where + F: Fn(DictIndex) -> Option, + { + update_dict_index_vec(&mut self.split_file, f); + self.weight = self.weight.map(|index| f(index)).flatten(); + self.filter = self.filter.map(|index| f(index)).flatten(); + self.vectors = self + .vectors + .drain() + .filter_map(|vector_by_id| { + vector_by_id + .0 + .with_updated_dict_indexes(f) + .map(|vector| ByIdentifier::new(vector)) + }) + .collect(); + self.mrsets = self + .mrsets + .drain() + .filter_map(|mrset_by_id| { + mrset_by_id + .0 + .with_updated_dict_indexes(f) + .map(|mrset| ByIdentifier::new(mrset)) + }) + .collect(); + self.variable_sets = self + .variable_sets + .drain() + .filter_map(|var_set_by_id| { + var_set_by_id + .0 + .with_updated_dict_indexes(f) + .map(|var_set| ByIdentifier::new(var_set)) + }) + .collect(); + } +} + +fn update_dict_index_vec(dict_indexes: &mut Vec, f: F) +where + F: Fn(DictIndex) -> Option, +{ + dict_indexes.retain_mut(|index| { + if let Some(new) = f(*index) { + *index = new; + true + } else { + false + } + }); +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +pub enum Role { + Input, + Target, + Both, + None, + Partition, + Split, +} + +#[derive(Clone, Debug)] +pub struct Variable { + pub name: Identifier, + pub width: VarWidth, + pub missing_values: MissingValues, + pub print_format: Format, + pub write_format: Format, + pub value_labels: HashMap, + pub label: Option, + pub measure: Measure, + pub role: Role, + pub display_width: u32, + pub alignment: Alignment, + pub leave: bool, + pub short_names: Vec, + pub attributes: HashSet>, +} + +impl HasIdentifier for Variable { + fn identifier(&self) -> &Identifier { + &self.name + } +} + +#[derive(Clone, Debug)] +pub struct Vector { + pub name: Identifier, + pub variables: Vec, +} + +impl Vector { + fn with_updated_dict_indexes( + mut self, + f: impl Fn(DictIndex) -> Option, + ) -> Option { + update_dict_index_vec(&mut self.variables, f); + (!self.variables.is_empty()).then_some(self) + } +} + +impl HasIdentifier for Vector { + fn identifier(&self) -> &Identifier { + &self.name + } +} + +#[derive(Clone, Debug)] +pub struct Attribute { + pub name: Identifier, + pub values: Vec, +} + +impl HasIdentifier for Attribute { + fn identifier(&self) -> &Identifier { + &self.name + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet { + pub name: Identifier, + pub label: String, + pub mr_type: MultipleResponseType, + pub variables: Vec, +} + +impl MultipleResponseSet { + fn with_updated_dict_indexes( + mut self, + f: impl Fn(DictIndex) -> Option, + ) -> Option { + update_dict_index_vec(&mut self.variables, f); + (self.variables.len() > 1).then_some(self) + } +} + +impl HasIdentifier for MultipleResponseSet { + fn identifier(&self) -> &Identifier { + &self.name + } +} + +#[derive(Clone, Debug)] +pub enum MultipleResponseType { + MultipleDichotomy { + value: Value, + labels: CategoryLabels, + }, + MultipleCategory, +} + +#[derive(Clone, Debug)] +pub struct VariableSet { + pub name: Identifier, + pub variables: Vec, +} + +impl VariableSet { + fn with_updated_dict_indexes( + mut self, + f: impl Fn(DictIndex) -> Option, + ) -> Option { + update_dict_index_vec(&mut self.variables, f); + (!self.variables.is_empty()).then_some(self) + } +} + +impl HasIdentifier for VariableSet { + fn identifier(&self) -> &Identifier { + &self.name + } +} + +#[cfg(test)] +mod test { + use std::collections::HashSet; + + use crate::identifier::Identifier; + + use super::{ByIdentifier, HasIdentifier}; + + #[derive(PartialEq, Eq, Debug, Clone)] + struct Variable { + name: Identifier, + value: i32, + } + + impl HasIdentifier for Variable { + fn identifier(&self) -> &Identifier { + &self.name + } + } + + #[test] + fn test() { + // Variables should not be the same if their values differ. + let abcd = Identifier::new_utf8("abcd").unwrap(); + let abcd1 = Variable { + name: abcd.clone(), + value: 1, + }; + let abcd2 = Variable { + name: abcd, + value: 2, + }; + assert_ne!(abcd1, abcd2); + + // But `ByName` should treat them the same. + let abcd1_by_name = ByIdentifier::new(abcd1); + let abcd2_by_name = ByIdentifier::new(abcd2); + assert_eq!(abcd1_by_name, abcd2_by_name); + + // And a `HashSet` of `ByName` should also treat them the same. + let mut vars: HashSet> = HashSet::new(); + assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone()))); + assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone()))); + assert_eq!( + vars.get(&Identifier::new_utf8("abcd").unwrap()) + .unwrap() + .0 + .value, + 1 + ); + } +} diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs index 8727bf1ea3..d8b5219920 100644 --- a/rust/src/identifier.rs +++ b/rust/src/identifier.rs @@ -1,6 +1,11 @@ -use std::fmt::{Display, Formatter, Result as FmtResult}; - -use encoding_rs::{EncoderResult, Encoding}; +use std::{ + borrow::Borrow, + cmp::Ordering, + fmt::{Debug, Display, Formatter, Result as FmtResult}, + hash::{Hash, Hasher}, +}; + +use encoding_rs::{EncoderResult, Encoding, UTF_8}; use finl_unicode::categories::{CharacterCategories, MajorCategory}; use thiserror::Error as ThisError; use unicase::UniCase; @@ -71,7 +76,7 @@ fn is_reserved_word(s: &str) -> bool { false } -#[derive(Clone, PartialEq, Eq, Debug, Hash)] +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] pub struct Identifier(pub UniCase); impl Identifier { @@ -79,6 +84,9 @@ impl Identifier { /// encoding used by the dictionary, not in UTF-8. pub const MAX_LEN: usize = 64; + pub fn new_utf8(s: &str) -> Result { + Self::new(s, UTF_8) + } pub fn new(s: &str, encoding: &'static Encoding) -> Result { Self::is_plausible(s)?; let (encoded, _, unencodable) = encoding.encode(s); @@ -138,3 +146,85 @@ impl Display for Identifier { write!(f, "{}", self.0) } } + +pub trait HasIdentifier { + fn identifier(&self) -> &Identifier; +} + +pub struct ByIdentifier(pub T) +where + T: HasIdentifier; + +impl ByIdentifier +where + T: HasIdentifier, +{ + pub fn new(inner: T) -> Self { + Self(inner) + } +} + +impl PartialEq for ByIdentifier +where + T: HasIdentifier, +{ + fn eq(&self, other: &Self) -> bool { + self.0.identifier().eq(other.0.identifier()) + } +} + +impl Eq for ByIdentifier where T: HasIdentifier {} + +impl PartialOrd for ByIdentifier +where + T: HasIdentifier, +{ + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ByIdentifier +where + T: HasIdentifier, +{ + fn cmp(&self, other: &Self) -> Ordering { + self.0.identifier().cmp(other.0.identifier()) + } +} + +impl Hash for ByIdentifier +where + T: HasIdentifier, +{ + fn hash(&self, state: &mut H) { + self.0.identifier().hash(state) + } +} + +impl Borrow for ByIdentifier +where + T: HasIdentifier, +{ + fn borrow(&self) -> &Identifier { + self.0.identifier() + } +} + +impl Debug for ByIdentifier +where + T: HasIdentifier + Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + self.0.fmt(f) + } +} + +impl Clone for ByIdentifier +where + T: HasIdentifier + Clone, +{ + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 86422046bb..f8e880c14e 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,4 +1,5 @@ pub mod cooked; +pub mod dictionary; pub mod encoding; pub mod endian; pub mod format; -- 2.30.2