work on sysfile reader
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 23 Dec 2024 03:26:34 +0000 (19:26 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 23 Dec 2024 03:26:34 +0000 (19:26 -0800)
rust/pspp/src/cooked.rs
rust/pspp/src/dictionary.rs
rust/pspp/src/identifier.rs
rust/pspp/src/raw.rs
rust/pspp/src/settings.rs

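diff --git a/rust/pspp/src/cooked.rs b/rust/pspp/src/cooked.rs
index 7a3c62e70c711fb2e8cd78b14b2091cdcb77e5db..84b473fdf7bd452642ab47fd44b19105cdf60d87 100644
--- a/rust/pspp/src/cooked.rs
+++ b/rust/pspp/src/cooked.rs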
@@ -11,7 +11,7 @@ use crate::{
     identifier::{ByIdentifier, Error as IdError, Identifier},
     raw::{
         self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
-        FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
+        FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord,
         LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
         NumberOfCasesRecord, ProductInfoRecord, RawStr, RawWidth, ValueLabel, ValueLabelRecord,
         VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord,
@@ -405,7 +405,7 @@ impl Decoder {
         loop {
             self.n_generated_names += 1;
             let name = Identifier::from_encoding(
-                &format!("VAR{:03}", self.n_generated_names),
+                format!("VAR{:03}", self.n_generated_names),
                 self.encoding,
             )
             .unwrap();
@@ -459,7 +459,7 @@ pub fn decode(
         .filter(|(_index, record)| record.width != RawWidth::Continuation)
     {
         let name = trim_end_spaces(input.name.to_string());
-        let name = match Identifier::from_encoding(&name, encoding) {
+        let name = match Identifier::from_encoding(name, encoding) {
             Ok(name) => {
                 if !dictionary.variables.contains(&name.0) {
                     name
@@ -610,6 +610,43 @@ pub fn decode(
         }
     }
 
+    if headers.long_names.is_empty() {
+        // There are no long variable names.  Use the short variable names,
+        // converted to lowercase, as the long variable names.
+        for index in 0..dictionary.variables.len() {
+            let lower = dictionary.variables[index].name.0.as_ref().to_lowercase();
+            if let Ok(new_name) = Identifier::from_encoding(lower, dictionary.encoding) {
+                dictionary.try_rename_var(index, new_name);
+            }
+        }
+    } else {
+        // Rename each of the variables, one by one.  (In a correctly
+        // constructed system file, this cannot create any intermediate
+        // duplicate variable names, because all of the new variable names are
+        // longer than any of the old variable names and thus there cannot be
+        // any overlaps.)
+        for renaming in headers
+            .long_names
+            .iter()
+            .flat_map(|record| record.0.iter().cloned())
+        {
+            let LongName {
+                short_name,
+                long_name,
+            } = renaming;
+            if let Some(index) = dictionary.variables.get_index_of(&short_name.0) {
+                dictionary.try_rename_var(index, long_name);
+                dictionary
+                    .variables
+                    .get_index_mut2(index)
+                    .unwrap()
+                    .short_names = vec![short_name];
+            } else {
+                warn(Error::TBD);
+            }
+        }
+    }
+
     let metadata = Metadata::decode(&headers, warn);
     Ok((dictionary, metadata))
 }
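diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs
index 5e7f25d208595424c9e8cdf98ecdf839d225772a..e8988ca2c1d6cb00ededbbb515fad1d78201557f 100644
--- a/rust/pspp/src/dictionary.rs
+++ b/rust/pspp/src/dictionary.rs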
@@ -374,6 +374,46 @@ impl Dictionary {
             })
             .collect();
     }
+
+    /// Tries to rename the variable at `index` to `new_name`, keeping the
+    /// variable at the same index.
+    ///
+    /// Returns true if the variable was renamed.  Returns false, leaving the
+    /// variable unchanged, if another variable already has the name
+    /// `new_name`.  A successful rename also clears the variable's short
+    /// names.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is out of range.
+    pub fn try_rename_var(&mut self, index: usize, new_name: Identifier) -> bool {
+        // Remove the variable before checking for a conflict, so that its own
+        // current name cannot count as a duplicate of `new_name`.
+        let mut variable = self.variables.swap_remove_index(index).unwrap();
+        let may_rename = !self.variables.contains(&new_name.0);
+        if may_rename {
+            variable.name = new_name;
+            variable.short_names = Vec::new();
+        }
+
+        // Reinsert the variable and swap it back to `index`.  NOTE(review):
+        // this presumes that `insert` appends at the end and that
+        // `swap_remove_index` moved the former last element to `index`;
+        // confirm against the collection's documentation.
+        assert!(self.variables.insert(variable));
+        self.variables.swap_indices(self.variables.len() - 1, index);
+        may_rename
+    }
+
+    /// Renames the variable at `index` to `new_name`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is out of range or if another variable already has
+    /// the name `new_name`.
+    pub fn rename_var(&mut self, index: usize, new_name: Identifier) {
+        assert!(self.try_rename_var(index, new_name));
+    }
 }
 
 fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
@@ -390,8 +406,9 @@ where
     });
 }
 
-#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
 pub enum Role {
+    #[default]
     Input,
     Target,
     Both,
@@ -400,12 +417,7 @@ pub enum Role {
     Split,
 }
 
-impl Default for Role {
-    fn default() -> Self {
-        Self::Input
-    }
-}
-
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub enum DictClass {
     Ordinary,
     System,
@@ -413,7 +425,13 @@ pub enum DictClass {
 }
 
 impl DictClass {
-    pub fn from_identifier(id: &Identifier) -> Self {
+    pub fn must_leave(self) -> bool {
+        self == DictClass::Scratch
+    }
+}
+
+impl From<&Identifier> for DictClass {
+    fn from(id: &Identifier) -> Self {
         if id.0.starts_with('$') {
             Self::System
         } else if id.0.starts_with('#') {
@@ -422,14 +440,6 @@ impl DictClass {
             Self::Ordinary
         }
     }
-
-    pub fn must_leave(self) -> bool {
-        match self {
-            DictClass::Ordinary => false,
-            DictClass::System => false,
-            DictClass::Scratch => true,
-        }
-    }
 }
 
 #[derive(Clone, Debug)]
@@ -453,7 +463,7 @@ pub struct Variable {
 impl Variable {
     pub fn new(name: Identifier, width: VarWidth) -> Self {
         let var_type = VarType::from_width(width);
-        let leave = DictClass::from_identifier(&name).must_leave();
+        let leave = DictClass::from(&name).must_leave();
         Self {
             name,
             width,
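diff --git a/rust/pspp/src/identifier.rs b/rust/pspp/src/identifier.rs
index c6909fd58afd095aa76bd407af44a59237688edd..1f6694b6332ba9cf2a7ecfbdf4b209066c48f21a 100644
--- a/rust/pspp/src/identifier.rs
+++ b/rust/pspp/src/identifier.rs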
@@ -11,6 +11,8 @@ use finl_unicode::categories::{CharacterCategories, MajorCategory};
 use thiserror::Error as ThisError;
 use unicase::UniCase;
 
+use crate::dictionary::DictClass;
+
 pub trait IdentifierChar {
     /// Returns true if `self` is an ASCII character that may be the first
     /// character in an identifier.
@@ -153,12 +155,30 @@ impl Identifier {
     /// encoding used by the dictionary, not in UTF-8.
     pub const MAX_LEN: usize = 64;
 
-    pub fn new(s: &str) -> Result<Self, Error> {
+    /// Creates an identifier from `s`, using UTF-8 as the encoding to check
+    /// it against.
+    ///
+    /// # Errors
+    ///
+    /// Fails under the same conditions as `from_encoding` with `UTF_8`.
+    pub fn new(s: impl Into<UniCase<String>>) -> Result<Self, Error> {
         Self::from_encoding(s, UTF_8)
     }
-    pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
-        Self::is_plausible(s)?;
-        let identifier = Identifier(s.into());
+
+    /// Creates an identifier from `s` for a dictionary that uses `encoding`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error from `is_plausible` if `s` is rejected there, or the
+    /// error from `check_encoding` if the identifier is rejected for
+    /// `encoding`.
+    pub fn from_encoding(
+        s: impl Into<UniCase<String>>,
+        encoding: &'static Encoding,
+    ) -> Result<Identifier, Error> {
+        let s: UniCase<String> = s.into();
+        Self::is_plausible(&s)?;
+        let identifier = Identifier(s);
         identifier.check_encoding(encoding)?;
         Ok(identifier)
     }
@@ -241,6 +244,28 @@ impl Identifier {
     pub fn matches_keyword_n(&self, keyword: &str, n: usize) -> bool {
         id_match_n_nonstatic(keyword, self.0.as_str(), n)
     }
+
+    /// Returns `self` if it names an ordinary variable, that is, if its
+    /// [`DictClass`] is `Ordinary`.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::BadFirstCharacter`], carrying the name and its first
+    /// character, if the identifier is a system or scratch name.
+    pub fn must_be_ordinary(self) -> Result<Self, Error> {
+        match DictClass::from(&self) {
+            DictClass::Ordinary => Ok(self),
+            // Listed explicitly, rather than with `_`, so that adding a
+            // `DictClass` variant forces this match to be revisited.
+            DictClass::System | DictClass::Scratch => {
+                let s = self.0.into_inner();
+                // The name cannot be empty: `DictClass::from` classified it
+                // by its first character.
+                let first = s.chars().next().unwrap();
+                Err(Error::BadFirstCharacter(s, first))
+            }
+        }
+    }
 }
 
 impl PartialEq<str> for Identifier {
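diff --git a/rust/pspp/src/raw.rs b/rust/pspp/src/raw.rs
index e5933d3756ca67b67a1da05d195bcdfee28188ac..63b7c04c3e1c66878f5b5688ceb9394892dee67e 100644
--- a/rust/pspp/src/raw.rs
+++ b/rust/pspp/src/raw.rs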
@@ -2301,6 +2301,7 @@ impl VeryLongString {
         };
         let short_name = decoder
             .new_identifier(short_name)
+            .and_then(Identifier::must_be_ordinary)
             .map_err(Warning::InvalidLongStringName)?;
         let length = length.parse().map_err(|_| Warning::TBD)?;
         Ok(VeryLongString { short_name, length })
@@ -2421,6 +2422,7 @@ impl VarAttributeSet {
         };
         let long_var_name = decoder
             .new_identifier(long_var_name)
+            .and_then(Identifier::must_be_ordinary)
             .map_err(Warning::InvalidAttributeVariableName)?;
         let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
         let var_attribute = VarAttributeSet {
@@ -2465,9 +2467,11 @@ impl LongName {
         };
         let short_name = decoder
             .new_identifier(short_name)
+            .and_then(Identifier::must_be_ordinary)
             .map_err(Warning::InvalidShortName)?;
         let long_name = decoder
             .new_identifier(long_name)
+            .and_then(Identifier::must_be_ordinary)
             .map_err(Warning::InvalidLongName)?;
         Ok(LongName {
             short_name,
@@ -2477,7 +2481,7 @@ impl LongName {
 }
 
 #[derive(Clone, Debug)]
-pub struct LongNamesRecord(Vec<LongName>);
+pub struct LongNamesRecord(pub Vec<LongName>);
 
 impl LongNamesRecord {
     fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
@@ -2513,6 +2517,7 @@ impl VariableSet {
         for var in input.split_ascii_whitespace() {
             if let Some(identifier) = decoder
                 .new_identifier(var)
+                .and_then(Identifier::must_be_ordinary)
                 .map_err(Warning::InvalidVariableSetName)
                 .issue_warning(&decoder.warn)
             {
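diff --git a/rust/pspp/src/settings.rs b/rust/pspp/src/settings.rs
index de519512025b6af1181207ddbc6a391d1e490676..b48420aefa4ef3f8f39b6aa9e20dadc19f542f97 100644
--- a/rust/pspp/src/settings.rs
+++ b/rust/pspp/src/settings.rs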
@@ -68,9 +68,9 @@ impl Default for Settings {
             testing: false,
             fuzz_bits: 6,
             scale_min: 24,
-            commands: Compatibility::Enhanced,
-            global: Compatibility::Enhanced,
-            syntax: Compatibility::Enhanced,
+            commands: Compatibility::default(),
+            global: Compatibility::default(),
+            syntax: Compatibility::default(),
             formats: FormatSettings::default(),
             small: 0.0001,
         }
@@ -84,9 +84,16 @@ impl Settings {
     }
 }
 
+/// Compatibility mode for the `commands`, `global`, and `syntax` settings.
+/// The default is [`Compatibility::Enhanced`].
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
 pub enum Compatibility {
-    Compatible,
+    /// Use improved PSPP behavior.
+    #[default]
     Enhanced,
+
+    /// Be as compatible as possible.
+    Compatible,
 }
 
 pub struct MacroSettings {