"cfg-if",
]
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
[[package]]
name = "errno"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
+[[package]]
+name = "hashbrown"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
+
[[package]]
name = "heck"
version = "0.4.1"
"cc",
]
+[[package]]
+name = "indexmap"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
[[package]]
name = "io-lifetimes"
version = "1.0.5"
"flate2",
"float_next_after",
"hexplay",
+ "indexmap",
"lazy_static",
"libc",
"num",
finl_unicode = "1.2.0"
unicase = "2.6.0"
libc = "0.2.147"
+indexmap = "2.1.0"
[target.'cfg(windows)'.dependencies]
windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
n_generated_names: usize,
}
-fn decode_sort_order(record: &raw::Record) -> i32 {
- match record {
- // File header record.
- raw::Record::Header(_) => 0,
-
- // Then the records used to decide character encoding.
- raw::Record::Encoding(_) => 1,
- raw::Record::IntegerInfo(_) => 2,
-
- // Then the other records that don't use variables at all.
- raw::Record::Document(_) => 3,
- raw::Record::FloatInfo(_) => 4,
- raw::Record::ProductInfo(_) => 5,
- raw::Record::FileAttributes(_) => 6,
-
- // Variable records.
- raw::Record::Variable(_) => 7,
-
- // These records use variable indexes that would be invalidated by very
- // long string variables.
- raw::Record::ValueLabel(_) => 8,
- raw::Record::VarDisplay(_) => 9,
-
- // These records use short names.
- raw::Record::MultipleResponse(_) => 10,
- raw::Record::VeryLongStrings(_) => 11,
-
- // Rename short names to long names.
- raw::Record::LongNames(_) => 12,
-
- // These records use long names.
- raw::Record::VariableAttributes(_) => 13,
- raw::Record::LongStringValueLabels(_) => 14,
- raw::Record::LongStringMissingValues(_) => 15,
- raw::Record::VariableSets(_) => 16,
-
- // Cases come last.
- raw::Record::Cases(_) => 17,
-
- // We don't use these records at all.
- raw::Record::NumberOfCases(_) => i32::MAX,
- raw::Record::OtherExtension(_) => i32::MAX,
- raw::Record::EndOfHeaders(_) => i32::MAX,
- raw::Record::ZHeader(_) => i32::MAX,
- raw::Record::ZTrailer(_) => i32::MAX,
- }
-}
-
#[derive(Default)]
struct Headers<'a> {
header: Option<&'a raw::HeaderRecord>,
encoding: Option<&'a raw::EncodingRecord>,
number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
product_info: Option<&'a raw::TextRecord>,
- long_names: Vec<&'a raw::TextRecord>,
+ long_names: Option<&'a raw::TextRecord>,
very_long_strings: Vec<&'a raw::TextRecord>,
file_attributes: Vec<&'a raw::TextRecord>,
variable_attributes: Vec<&'a raw::TextRecord>,
set_or_warn(&mut h.number_of_cases, record, warn)
}
raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn),
- raw::Record::LongNames(record) => h.long_names.push(record),
+ raw::Record::LongNames(record) => set_or_warn(&mut h.long_names, record, warn),
raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record),
raw::Record::FileAttributes(record) => h.file_attributes.push(record),
raw::Record::VariableAttributes(record) => h.variable_attributes.push(record),
raw::Record::OtherExtension(record) => h.other_extensions.push(record),
- raw::Record::EndOfHeaders(_) => todo!(),
- raw::Record::ZHeader(_) => todo!(),
- raw::Record::ZTrailer(_) => todo!(),
+ raw::Record::EndOfHeaders(_) => (),
+ raw::Record::ZHeader(_) => (),
+ raw::Record::ZTrailer(_) => (),
raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn),
}
}
--- /dev/null
+use std::{
+ collections::{HashMap, HashSet},
+ fmt::Debug,
+};
+
+use encoding_rs::Encoding;
+use indexmap::IndexSet;
+
+use crate::{
+ cooked::{Alignment, Measure, MissingValues, Value, VarWidth},
+ format::Format,
+ identifier::{ByIdentifier, HasIdentifier, Identifier},
+ raw::CategoryLabels,
+};
+
+/// Index type that the rest of a [`Dictionary`] uses to refer to a variable.
+///
+/// NOTE(review): presumably a position within `Dictionary::variables`; confirm
+/// against the code that populates the dictionary.
+pub type DictIndex = usize;
+
+/// A set of variables together with dictionary-wide settings and named
+/// groupings of those variables.
+///
+/// NOTE(review): the per-field descriptions below are inferred from field
+/// names and types only; confirm them against the code that fills them in.
+#[derive(Clone, Debug)]
+pub struct Dictionary {
+    /// The variables, hashed and compared by identifier via [`ByIdentifier`].
+    pub variables: IndexSet<ByIdentifier<Variable>>,
+
+    /// Indexes recorded as split-file variables.
+    pub split_file: Vec<DictIndex>,
+
+    /// Index recorded as the weight variable, if any.
+    pub weight: Option<DictIndex>,
+
+    /// Index recorded as the filter variable, if any.
+    pub filter: Option<DictIndex>,
+
+    /// Case limit, if any.
+    pub case_limit: Option<u64>,
+
+    /// File label, if any.
+    pub file_label: Option<String>,
+
+    /// Document lines.
+    pub documents: Vec<String>,
+
+    /// Vectors, keyed by identifier.
+    pub vectors: HashSet<ByIdentifier<Vector>>,
+
+    /// Dictionary-level attributes, keyed by identifier.
+    pub attributes: HashSet<ByIdentifier<Attribute>>,
+
+    /// Multiple response sets, keyed by identifier.
+    pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
+
+    /// Variable sets, keyed by identifier.
+    pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
+
+    /// Character encoding associated with the dictionary.
+    pub encoding: &'static Encoding,
+}
+
+impl Dictionary {
+    /// Creates a dictionary with no variables, no groupings and no optional
+    /// settings, associated with `encoding`.
+    pub fn new(encoding: &'static Encoding) -> Self {
+        Self {
+            variables: IndexSet::new(),
+            split_file: Vec::new(),
+            weight: None,
+            filter: None,
+            case_limit: None,
+            file_label: None,
+            documents: Vec::new(),
+            vectors: HashSet::new(),
+            attributes: HashSet::new(),
+            mrsets: HashSet::new(),
+            variable_sets: HashSet::new(),
+            encoding,
+        }
+    }
+
+    /// Deletes the `count` variables at positions `start..start + count`.
+    ///
+    /// The variables themselves are removed from `self.variables`, and every
+    /// dictionary index stored elsewhere in the dictionary is then fixed up:
+    /// indexes below `start` are kept, indexes inside the deleted range are
+    /// dropped, and indexes past the range are shifted down by `count`.
+    ///
+    /// The part of the range that lies beyond the last variable is ignored
+    /// when removing variables, so an out-of-range request does not panic.
+    pub fn delete_vars(&mut self, start: DictIndex, count: usize) {
+        // Saturate so that a huge `count` cannot overflow the range end.
+        let end = start.saturating_add(count);
+
+        // Remove the variables themselves.  Without this step the stored
+        // indexes would be renumbered while the variables stayed in place.
+        // The range is clamped to the set's length because `drain` panics on
+        // an out-of-bounds range.
+        let len = self.variables.len();
+        let (first, last) = (start.min(len), end.min(len));
+        if first < last {
+            // Dropping the `Drain` iterator removes the whole range and
+            // shifts the later variables down, preserving their order.
+            self.variables.drain(first..last);
+        }
+
+        self.update_dict_indexes(&|index| {
+            if index < start {
+                Some(index)
+            } else if index < end {
+                None
+            } else {
+                Some(index - count)
+            }
+        })
+    }
+
+    /// Passes every dictionary index held outside `self.variables` through
+    /// `f`.
+    ///
+    /// `f` returns the replacement for an index, or `None` if the index no
+    /// longer refers to anything.  Vectors, multiple response sets and
+    /// variable sets are rebuilt from their updated contents; any of them
+    /// whose own `with_updated_dict_indexes` returns `None` is discarded.
+    fn update_dict_indexes<F>(&mut self, f: &F)
+    where
+        F: Fn(DictIndex) -> Option<DictIndex>,
+    {
+        update_dict_index_vec(&mut self.split_file, f);
+        self.weight = self.weight.and_then(f);
+        self.filter = self.filter.and_then(f);
+        self.vectors = self
+            .vectors
+            .drain()
+            .filter_map(|vector| vector.0.with_updated_dict_indexes(f).map(ByIdentifier::new))
+            .collect();
+        self.mrsets = self
+            .mrsets
+            .drain()
+            .filter_map(|mrset| mrset.0.with_updated_dict_indexes(f).map(ByIdentifier::new))
+            .collect();
+        self.variable_sets = self
+            .variable_sets
+            .drain()
+            .filter_map(|var_set| var_set.0.with_updated_dict_indexes(f).map(ByIdentifier::new))
+            .collect();
+    }
+}
+
+/// Rewrites each index in `dict_indexes` through `f`, in place.
+///
+/// An index for which `f` returns `Some(new)` is replaced by `new`; an index
+/// for which `f` returns `None` is removed from the vector.
+fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
+where
+    F: Fn(DictIndex) -> Option<DictIndex>,
+{
+    dict_indexes.retain_mut(|slot| match f(*slot) {
+        Some(updated) => {
+            *slot = updated;
+            true
+        }
+        None => false,
+    });
+}
+
+/// The role stored in [`Variable::role`].
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub enum Role {
+    Input,
+    Target,
+    Both,
+    None,
+    Partition,
+    Split,
+}
+
+/// A single variable of a [`Dictionary`].
+///
+/// NOTE(review): the per-field descriptions below are inferred from field
+/// names and types only; confirm them against the code that fills them in.
+#[derive(Clone, Debug)]
+pub struct Variable {
+    /// Name of the variable; this is what [`HasIdentifier`] reports.
+    pub name: Identifier,
+
+    /// Width of the variable.
+    pub width: VarWidth,
+
+    /// Missing values of the variable.
+    pub missing_values: MissingValues,
+
+    /// Print format.
+    pub print_format: Format,
+
+    /// Write format.
+    pub write_format: Format,
+
+    /// Labels attached to individual values.
+    pub value_labels: HashMap<Value, String>,
+
+    /// Variable label, if any.
+    pub label: Option<String>,
+
+    /// Measure of the variable.
+    pub measure: Measure,
+
+    /// Role of the variable.
+    pub role: Role,
+
+    /// Display width.
+    pub display_width: u32,
+
+    /// Display alignment.
+    pub alignment: Alignment,
+
+    /// "Leave" flag.
+    pub leave: bool,
+
+    /// Short names associated with the variable.
+    pub short_names: Vec<Identifier>,
+
+    /// Variable-level attributes, keyed by identifier.
+    pub attributes: HashSet<ByIdentifier<Attribute>>,
+}
+
+impl HasIdentifier for Variable {
+    /// A variable is identified by its `name`.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// A named list of dictionary indexes.
+#[derive(Clone, Debug)]
+pub struct Vector {
+    /// Name of the vector; this is what [`HasIdentifier`] reports.
+    pub name: Identifier,
+
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl Vector {
+    /// Consumes the vector and passes each member index through `f`.
+    ///
+    /// Members for which `f` returns `None` are removed.  Returns `None` if
+    /// no member is left afterwards, otherwise the updated vector.
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        if self.variables.is_empty() {
+            None
+        } else {
+            Some(self)
+        }
+    }
+}
+
+impl HasIdentifier for Vector {
+    /// A vector is identified by its `name`.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// A named attribute carrying a list of string values.
+#[derive(Clone, Debug)]
+pub struct Attribute {
+    /// Name of the attribute; this is what [`HasIdentifier`] reports.
+    pub name: Identifier,
+
+    /// Values of the attribute.
+    pub values: Vec<String>,
+}
+
+impl HasIdentifier for Attribute {
+    /// An attribute is identified by its `name`.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// A named multiple response set over a list of dictionary indexes.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+    /// Name of the set; this is what [`HasIdentifier`] reports.
+    pub name: Identifier,
+
+    /// Label of the set.
+    pub label: String,
+
+    /// Kind of multiple response set.
+    pub mr_type: MultipleResponseType,
+
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl MultipleResponseSet {
+    /// Consumes the set and passes each member index through `f`.
+    ///
+    /// Members for which `f` returns `None` are removed.  The set is kept
+    /// only if more than one member is left; otherwise `None` is returned.
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        if self.variables.len() > 1 {
+            Some(self)
+        } else {
+            None
+        }
+    }
+}
+
+impl HasIdentifier for MultipleResponseSet {
+    /// A multiple response set is identified by its `name`.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+/// The kind of a [`MultipleResponseSet`], stored in its `mr_type` field.
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+    /// Multiple-dichotomy set, carrying a value and a category-label setting.
+    MultipleDichotomy {
+        value: Value,
+        labels: CategoryLabels,
+    },
+
+    /// Multiple-category set; carries no extra data.
+    MultipleCategory,
+}
+
+/// A named set of dictionary indexes.
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+    /// Name of the set; this is what [`HasIdentifier`] reports.
+    pub name: Identifier,
+
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl VariableSet {
+    /// Consumes the set and passes each member index through `f`.
+    ///
+    /// Members for which `f` returns `None` are removed.  Returns `None` if
+    /// no member is left afterwards, otherwise the updated set.
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        if self.variables.is_empty() {
+            None
+        } else {
+            Some(self)
+        }
+    }
+}
+
+impl HasIdentifier for VariableSet {
+    /// A variable set is identified by its `name`.
+    fn identifier(&self) -> &Identifier {
+        &self.name
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::collections::HashSet;
+
+    use crate::identifier::Identifier;
+
+    use super::{ByIdentifier, HasIdentifier};
+
+    /// Minimal identifier-bearing type with a payload, so that two values can
+    /// share a name and still differ.
+    #[derive(PartialEq, Eq, Debug, Clone)]
+    struct Variable {
+        name: Identifier,
+        value: i32,
+    }
+
+    impl HasIdentifier for Variable {
+        fn identifier(&self) -> &Identifier {
+            &self.name
+        }
+    }
+
+    #[test]
+    fn test() {
+        // As plain values, two variables with the same name but different
+        // payloads are not equal.
+        let name = Identifier::new_utf8("abcd").unwrap();
+        let first = Variable {
+            name: name.clone(),
+            value: 1,
+        };
+        let second = Variable { name, value: 2 };
+        assert_ne!(first, second);
+
+        // Wrapped in `ByIdentifier`, only the name counts, so they are equal.
+        let first = ByIdentifier::new(first);
+        let second = ByIdentifier::new(second);
+        assert_eq!(first, second);
+
+        // A `HashSet` of `ByIdentifier` therefore accepts the first, rejects
+        // the second, and can be probed with a bare `Identifier`.
+        let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
+        assert!(vars.insert(ByIdentifier::new(first.0.clone())));
+        assert!(!vars.insert(ByIdentifier::new(second.0.clone())));
+        let probe = Identifier::new_utf8("abcd").unwrap();
+        assert_eq!(vars.get(&probe).unwrap().0.value, 1);
+    }
+}
-use std::fmt::{Display, Formatter, Result as FmtResult};
-
-use encoding_rs::{EncoderResult, Encoding};
+use std::{
+ borrow::Borrow,
+ cmp::Ordering,
+ fmt::{Debug, Display, Formatter, Result as FmtResult},
+ hash::{Hash, Hasher},
+};
+
+use encoding_rs::{EncoderResult, Encoding, UTF_8};
use finl_unicode::categories::{CharacterCategories, MajorCategory};
use thiserror::Error as ThisError;
use unicase::UniCase;
false
}
-#[derive(Clone, PartialEq, Eq, Debug, Hash)]
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
pub struct Identifier(pub UniCase<String>);
impl Identifier {
/// encoding used by the dictionary, not in UTF-8.
pub const MAX_LEN: usize = 64;
+    /// Creates an identifier from `s` for the UTF-8 encoding.
+    ///
+    /// Shorthand for `Identifier::new(s, UTF_8)`; returns whatever that call
+    /// returns.
+    pub fn new_utf8(s: &str) -> Result<Identifier, Error> {
+        Identifier::new(s, UTF_8)
+    }
pub fn new(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
Self::is_plausible(s)?;
let (encoded, _, unencodable) = encoding.encode(s);
write!(f, "{}", self.0)
}
}
+
+/// Implemented by types that carry an [`Identifier`], so that they can be
+/// wrapped in [`ByIdentifier`].
+pub trait HasIdentifier {
+    /// Returns the identifier of `self`.
+    fn identifier(&self) -> &Identifier;
+}
+
+/// Wrapper around a `T` whose equality, ordering and hashing consider only
+/// the wrapped value's identifier.
+///
+/// The wrapped value is available as field `0`.
+pub struct ByIdentifier<T>(pub T)
+where
+    T: HasIdentifier;
+
+impl<T: HasIdentifier> ByIdentifier<T> {
+    /// Wraps `inner`.
+    pub fn new(inner: T) -> Self {
+        ByIdentifier(inner)
+    }
+}
+
+/// Two wrappers are equal exactly when their identifiers are equal; the rest
+/// of the wrapped value is ignored.
+impl<T: HasIdentifier> PartialEq for ByIdentifier<T> {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.identifier() == other.0.identifier()
+    }
+}
+
+impl<T: HasIdentifier> Eq for ByIdentifier<T> {}
+
+/// Always `Some`: the ordering is the total one defined by [`Ord`].
+impl<T: HasIdentifier> PartialOrd for ByIdentifier<T> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+}
+
+/// Wrappers are ordered by their identifiers alone.
+impl<T: HasIdentifier> Ord for ByIdentifier<T> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        Ord::cmp(self.0.identifier(), other.0.identifier())
+    }
+}
+
+/// Hashes only the identifier, matching the identifier-only equality of
+/// `ByIdentifier`.
+impl<T: HasIdentifier> Hash for ByIdentifier<T> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        Hash::hash(self.0.identifier(), state)
+    }
+}
+
+/// Lets a collection keyed by `ByIdentifier<T>` be queried with a plain
+/// `&Identifier`; the borrowed value is the wrapped value's identifier.
+impl<T: HasIdentifier> Borrow<Identifier> for ByIdentifier<T> {
+    fn borrow(&self) -> &Identifier {
+        let inner = &self.0;
+        inner.identifier()
+    }
+}
+
+/// Formats exactly like the wrapped value; the wrapper itself adds nothing
+/// to the output.
+impl<T: HasIdentifier + Debug> Debug for ByIdentifier<T> {
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        Debug::fmt(&self.0, f)
+    }
+}
+
+/// Clones the wrapped value and wraps the copy.
+impl<T: HasIdentifier + Clone> Clone for ByIdentifier<T> {
+    fn clone(&self) -> Self {
+        let inner = self.0.clone();
+        ByIdentifier(inner)
+    }
+}
pub mod cooked;
+pub mod dictionary;
pub mod encoding;
pub mod endian;
pub mod format;