variables parsed
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 26 Feb 2024 17:11:17 +0000 (09:11 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 26 Feb 2024 17:11:17 +0000 (09:11 -0800)
rust/src/cooked.rs
rust/src/dictionary.rs
rust/src/raw.rs

index 5d32c91ced3c7cdf91b5414e7ec6ce944d9630f6..ee66890027040eba699b7de61c04894279a7eaf2 100644 (file)
@@ -1,10 +1,10 @@
-use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
+use std::{cell::RefCell, ops::Range, rc::Rc, collections::HashMap};
 
 use crate::{
-    dictionary::{Dictionary, VarWidth},
+    dictionary::{Dictionary, VarWidth, Variable},
     encoding::Error as EncodingError,
     endian::Endian,
-    format::{Error as FormatError, Spec},
+    format::{Error as FormatError, Spec, UncheckedSpec},
     identifier::{Error as IdError, Identifier},
     raw::{
         self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
@@ -17,6 +17,7 @@ use crate::{
 };
 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
 use encoding_rs::Encoding;
+use num::Integer;
 use thiserror::Error as ThisError;
 
 pub use crate::raw::{CategoryLabels, Compression};
@@ -162,23 +163,6 @@ pub enum Error {
 
 type DictIndex = usize;
 
-pub struct Variable {
-    pub dict_index: DictIndex,
-    pub short_name: Identifier,
-    pub long_name: Option<Identifier>,
-    pub width: VarWidth,
-}
-
-pub struct Decoder {
-    pub raw: raw::Decoder,
-    pub encoding: &'static Encoding,
-    pub variables: HashMap<DictIndex, Variable>,
-    pub var_names: HashMap<Identifier, DictIndex>,
-    pub dictionary: Dictionary,
-    n_dict_indexes: usize,
-    n_generated_names: usize,
-}
-
 #[derive(Clone, Debug)]
 pub struct Headers {
     pub header: HeaderRecord<String>,
@@ -396,6 +380,30 @@ impl Metadata {
     }
 }
 
+struct Decoder { // State used while generating replacement variable names; commented-out fields mirror the removed public `Decoder`.
+    //pub raw: raw::Decoder,
+    pub encoding: &'static Encoding, // Passed to `Identifier::new` when building generated names.
+    //pub variables: HashMap<DictIndex, Variable>,
+    //pub var_names: HashMap<Identifier, DictIndex>,
+    //pub dictionary: Dictionary,
+    //n_dict_indexes: usize,
+    n_generated_names: usize, // Counter incremented on every `generate_name` attempt; supplies the digits in "VARnnn".
+}
+
+impl Decoder {
+    fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier { // Returns a "VARnnn" identifier that `dictionary.variables` does not already contain.
+        loop {
+            self.n_generated_names += 1; // Increment before use, so a counter starting at 0 yields "VAR001" first.
+            let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
+                .unwrap(); // Panics if `Identifier::new` rejects the generated text.
+            if !dictionary.variables.contains(&name) {
+                return name;
+            }
+            assert!(self.n_generated_names < usize::MAX); // Panics once the counter hits usize::MAX, before the next increment could overflow.
+        }
+    }
+}
+
 pub fn decode(
     mut headers: Headers,
     encoding: &'static Encoding,
@@ -409,7 +417,7 @@ pub fn decode(
     }
 
     for attributes in headers.file_attributes.drain(..) {
-        dictionary.attributes.extend(attributes.0.0.into_iter())
+        dictionary.attributes.extend(attributes.0 .0.into_iter())
     }
 
     // Concatenate all the document records (really there should only be one)
@@ -424,16 +432,86 @@ pub fn decode(
     // XXX warn for weird integer format
     // XXX warn for weird floating-point format, etc.
 
-    /*
-        let mut decoder = Decoder {
-            raw: decoder,
-            variables: HashMap::new(),
-            var_names: HashMap::new(),
-            dictionary,
-            n_dict_indexes: 0,
-            n_generated_names: 0,
+    let mut decoder = Decoder {
+        encoding,
+        n_generated_names: 0,
+    };
+
+    let mut header_vars = headers.variable.iter().enumerate();
+    let mut var_index_map = HashMap::new();
+    while let Some((value_index, input)) = header_vars.next() {
+        let name = trim_end_spaces(input.name.to_string());
+        let name = match Identifier::new(&name, encoding) {
+            Ok(name) => {
+                if !dictionary.variables.contains(&name) {
+                    name
+                } else {
+                    let new_name = decoder.generate_name(&dictionary);
+                    warn(Error::DuplicateVariableName {
+                        duplicate_name: name.clone(),
+                        new_name: new_name.clone(),
+                    });
+                    new_name
+                }
+            }
+            Err(id_error) => {
+                let new_name = decoder.generate_name(&dictionary);
+                warn(Error::InvalidVariableName {
+                    id_error,
+                    new_name: new_name.clone(),
+                });
+                new_name
+            }
         };
-    */
+        let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap());
+
+        // Set the short name the same as the long name (even if we renamed it).
+        variable.short_names = vec![name];
+
+        variable.label = input.label.clone();
+
+        variable.missing_values = input.missing_values.clone();
+
+        variable.print_format = decode_format(
+            input.print_format,
+            variable.width,
+            |new_spec, format_error| {
+                warn(Error::InvalidPrintFormat {
+                    new_spec,
+                    variable: variable.name.clone(),
+                    format_error,
+                })
+            },
+        );
+        variable.write_format = decode_format(
+            input.write_format,
+            variable.width,
+            |new_spec, format_error| {
+                warn(Error::InvalidWriteFormat {
+                    new_spec,
+                    variable: variable.name.clone(),
+                    format_error,
+                })
+            },
+        );
+
+        // Skip long string continuation records.
+        if input.width > 0 {
+            #[allow(unstable_name_collisions)]
+            for _ in 1..input.width.div_ceil(&8) {
+                if let Some((_, continuation)) = header_vars.next() {
+                    if continuation.width == -1 {
+                        continue;
+                    }
+                }
+                return Err(Error::TBD);
+            }
+        }
+
+        let dict_index = dictionary.add_var(variable).unwrap();
+        assert_eq!(var_index_map.insert(value_index, dict_index), None);
+    }
+
     let metadata = Metadata::decode(&headers, warn);
     Ok((dictionary, metadata))
 }
@@ -464,6 +542,17 @@ fn fix_line_ends(s: &str) -> String {
     out
 }
 
+fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec { // Turns a raw format spec into a `Spec`, substituting a width-based default on any error.
+    UncheckedSpec::try_from(raw)
+        .and_then(Spec::try_from)
+        .and_then(|x| x.check_width_compatibility(width))
+        .unwrap_or_else(|error| { // Reached when any of the three steps above returns an error.
+            let new_format = Spec::default_for_width(width);
+            warn(new_format, error); // Report the substitute format along with the error that caused it.
+            new_format
+        })
+}
+
 /*
 impl Decoder {
     fn generate_name(&mut self) -> Identifier {
@@ -677,16 +766,6 @@ pub struct VariableRecord {
     pub label: Option<String>,
 }
 
-fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
-    UncheckedSpec::try_from(raw)
-        .and_then(Spec::try_from)
-        .and_then(|x| x.check_width_compatibility(width))
-        .unwrap_or_else(|error| {
-            let new_format = Spec::default_for_width(width);
-            warn(new_format, error);
-            new_format
-        })
-}
 
 fn parse_variable_record(
     decoder: &mut Decoder,
index 59e1e3a853dcc59191d75d61d650dce27b37be5f..042a294452a2e572bd4dc95ed8b890b77269a839 100644 (file)
@@ -78,6 +78,15 @@ impl VarWidth {
             VarWidth::String(width) => *width.min(&32) as u32,
         }
     }
+
+    pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> { // Maps a raw integer width to a `VarWidth`; `Err(())` for anything outside 0..=255.
+        let raw: i32 = raw.into();
+        match raw {
+            0 => Ok(Self::Numeric), // A raw width of 0 is a numeric variable.
+            1..=255 => Ok(Self::String(raw as u16)), // The arm bounds `raw` to 1..=255, so the cast to u16 cannot truncate.
+            _ => Err(()), // Covers negative values (including -1) and anything above 255.
+        }
+    }
 }
 
 impl From<VarWidth> for VarType {
@@ -120,6 +129,7 @@ pub struct Dictionary {
     pub encoding: &'static Encoding,
 }
 
+#[derive(Debug)]
 pub struct DuplicateVariableName;
 
 impl Dictionary {
@@ -140,9 +150,10 @@ impl Dictionary {
         }
     }
 
-    pub fn add_var(&mut self, variable: Variable) -> Result<(), DuplicateVariableName> {
-        if self.variables.insert(ByIdentifier::new(variable)) {
-            Ok(())
+    pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
+        let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable));
+        if inserted {
+            Ok(index)
         } else {
             Err(DuplicateVariableName)
         }
index 0620d4eea6884a19318a2bd089970841c4579553..e8a279f5e848418e0fdb846333cc80f5e0a60ce4 100644 (file)
@@ -46,6 +46,12 @@ pub enum Error {
     #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
     BadRecordType { offset: u64, rec_type: u32 },
 
+    #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
+    BadVariableWidth {
+        start_offset: u64,
+        width: i32,
+    },
+
     #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
     BadVariableLabelCode {
         start_offset: u64,
@@ -1298,6 +1304,9 @@ impl VariableRecord<RawString, RawStr<8>> {
     fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
         let start_offset = r.stream_position()?;
         let width: i32 = endian.parse(read_bytes(r)?);
+        if !(-1..=255).contains(&width) {
+            return Err(Error::BadVariableWidth { start_offset, width });
+        }
         let code_offset = r.stream_position()?;
         let has_variable_label: u32 = endian.parse(read_bytes(r)?);
         let missing_value_code: i32 = endian.parse(read_bytes(r)?);