continue implementation
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 7 Jan 2024 19:41:28 +0000 (11:41 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 7 Jan 2024 19:41:28 +0000 (11:41 -0800)
rust/src/cooked.rs
rust/src/dictionary.rs
rust/src/main.rs
rust/src/raw.rs

diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs
index 3e4677e71c1bbbe80191617e92f8e935ece67fe4..56d6aa1c82254e1fa44bd3b9e3992ade55743df3 100644
--- a/rust/src/cooked.rs
+++ b/rust/src/cooked.rs
@@ -4,11 +4,15 @@ use std::{
 };
 
 use crate::{
+    dictionary::{self, Dictionary},
     encoding::{default_encoding, get_encoding, Error as EncodingError},
     endian::Endian,
     format::{Error as FormatError, Spec, UncheckedSpec},
     identifier::{Error as IdError, Identifier},
-    raw::{self, ProductInfoRecord, RawDocumentLine, RawStr, RawString, VarDisplayRecord, VarType},
+    raw::{
+        self, LongStringMissingValueRecord, MissingValues, ProductInfoRecord, RawDocumentLine,
+        RawStr, RawString, VarDisplayRecord, VarType,
+    },
 };
 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
 use encoding_rs::{DecoderResult, Encoding};
@@ -166,7 +170,7 @@ pub enum Record {
     VariableSets(VariableSetRecord),
     VarDisplay(VarDisplayRecord),
     MultipleResponse(MultipleResponseRecord),
-    LongStringMissingValues(LongStringMissingValuesRecord),
+    LongStringMissingValues(LongStringMissingValueRecord<String, String>),
     LongStringValueLabels(LongStringValueLabelRecord),
     Encoding(EncodingRecord),
     NumberOfCases(NumberOfCasesRecord),
@@ -195,28 +199,28 @@ pub struct Variable {
 }
 
 pub struct Decoder {
-    pub compression: Option<Compression>,
-    pub endian: Endian,
+    pub raw: raw::Decoder,
     pub encoding: &'static Encoding,
     pub variables: HashMap<DictIndex, Variable>,
     pub var_names: HashMap<Identifier, DictIndex>,
+    pub dictionary: Dictionary,
     n_dict_indexes: usize,
     n_generated_names: usize,
 }
 
 #[derive(Default)]
 struct Headers<'a> {
-    header: Option<&'a raw::HeaderRecord<RawString>>,
-    variables: Vec<&'a raw::VariableRecord<RawString, RawStr<8>>>,
+    header: Option<raw::HeaderRecord<Cow<'a, str>>>,
+    variables: Vec<raw::VariableRecord<Cow<'a, str>, String>>,
     value_labels: Vec<&'a raw::ValueLabelRecord<RawStr<8>, RawString>>,
-    document: Option<&'a raw::DocumentRecord<RawDocumentLine>>,
+    documents: Vec<raw::DocumentRecord<Cow<'a, str>>>,
     integer_info: Option<&'a raw::IntegerInfoRecord>,
     float_info: Option<&'a raw::FloatInfoRecord>,
     variable_sets: Vec<&'a raw::VariableSetRecord>,
     var_display: Option<&'a raw::VarDisplayRecord>,
     multiple_response: Vec<&'a raw::MultipleResponseRecord<RawString, RawString>>,
     long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord<RawString>>,
-    long_string_missing_values: Vec<&'a raw::LongStringMissingValueRecord<RawString, RawStr<8>>>,
+    long_string_missing_values: Vec<raw::LongStringMissingValueRecord<Identifier, String>>,
     encoding: Option<&'a raw::EncodingRecord>,
     number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
     product_info: Option<&'a raw::ProductInfoRecord>,
@@ -237,14 +241,16 @@ fn set_or_warn<T>(option: &mut Option<T>, value: T, warn: &impl Fn(Error)) {
 }
 
 impl<'a> Headers<'a> {
-    fn new(headers: &'a Vec<raw::Record>, warn: &impl Fn(Error)) -> Headers<'a> {
+    fn new(headers: &'a Vec<raw::Record>, decoder: &Decoder, warn: &impl Fn(Error)) -> Headers<'a> {
         let mut h = Headers::default();
         for header in headers {
             match header {
-                raw::Record::Header(record) => set_or_warn(&mut h.header, record, warn),
-                raw::Record::Variable(record) => h.variables.push(record),
+                raw::Record::Header(record) => {
+                    set_or_warn(&mut h.header, record.decode(&decoder.raw), warn)
+                }
+                raw::Record::Variable(record) => h.variables.push(record.decode(&decoder.raw)),
                 raw::Record::ValueLabel(record) => h.value_labels.push(record),
-                raw::Record::Document(record) => set_or_warn(&mut h.document, record, warn),
+                raw::Record::Document(record) => h.documents.push(record.decode(&decoder.raw)),
                 raw::Record::IntegerInfo(record) => set_or_warn(&mut h.integer_info, record, warn),
                 raw::Record::FloatInfo(record) => set_or_warn(&mut h.float_info, record, warn),
                 raw::Record::VariableSets(record) => h.variable_sets.push(record),
@@ -253,9 +259,9 @@ impl<'a> Headers<'a> {
                 raw::Record::LongStringValueLabels(record) => {
                     h.long_string_value_labels.push(record)
                 }
-                raw::Record::LongStringMissingValues(record) => {
-                    h.long_string_missing_values.push(record)
-                }
+                raw::Record::LongStringMissingValues(record) => h
+                    .long_string_missing_values
+                    .push(record.decode(&decoder.raw)),
                 raw::Record::Encoding(record) => set_or_warn(&mut h.encoding, record, warn),
                 raw::Record::NumberOfCases(record) => {
                     set_or_warn(&mut h.number_of_cases, record, warn)
@@ -277,85 +283,83 @@ impl<'a> Headers<'a> {
     }
 }
 
-pub fn decode(
-    headers: Vec<raw::Record>,
-    encoding: Option<&'static Encoding>,
+pub fn encoding_from_headers(
+    headers: &Vec<raw::Record>,
     warn: &impl Fn(Error),
-) -> Result<Vec<Record>, Error> {
-    let h = Headers::new(&headers, warn);
-    let Some(header) = h.header else {
-        return Err(Error::MissingHeaderRecord);
-    };
-    let encoding = match encoding {
-        Some(encoding) => encoding,
-        None => {
-            let encoding = h.encoding.map(|record| record.0.as_str());
-            let character_code = h.integer_info.map(|record| record.character_code);
-            match get_encoding(encoding, character_code) {
-                Ok(encoding) => encoding,
-                Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)),
-                Err(err) => {
-                    warn(Error::EncodingError(err));
-                    // Warn that we're using the default encoding.
-                    default_encoding()
-                }
-            }
+) -> Result<&'static Encoding, Error> {
+    let mut encoding_record = None;
+    let mut integer_info_record = None;
+    for record in headers {
+        match record {
+            raw::Record::Encoding(record) => encoding_record = Some(record),
+            raw::Record::IntegerInfo(record) => integer_info_record = Some(record),
+            _ => (),
         }
-    };
-
-    //let mut dictionary = Dictionary::new(encoding);
+    }
+    let encoding = encoding_record.map(|record| record.0.as_str());
+    let character_code = integer_info_record.map(|record| record.character_code);
+    match get_encoding(encoding, character_code) {
+        Ok(encoding) => Ok(encoding),
+        Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
+        Err(err) => {
+            warn(Error::EncodingError(err));
+            // Warn that we're using the default encoding.
+            Ok(default_encoding())
+        }
+    }
+}
 
+pub fn decode(
+    headers: Vec<raw::Record>,
+    encoding: &'static Encoding,
+    warn: &impl Fn(Error),
+) -> Result<(Vec<Record>, Metadata), Error> {
     let mut decoder = Decoder {
-        compression: header.compression,
-        endian: header.endian,
+        raw: raw::Decoder {
+            encoding,
+            warn: Box::new(|error| println!("{error}")),
+        },
         encoding,
         variables: HashMap::new(),
         var_names: HashMap::new(),
+        dictionary: Dictionary::new(encoding),
         n_dict_indexes: 0,
         n_generated_names: 0,
     };
 
+    let h = Headers::new(&headers, &decoder, warn);
+    let Some(header) = h.header else {
+        return Err(Error::MissingHeaderRecord);
+    };
+
     let mut output = Vec::with_capacity(headers.len());
 
     // Decode the records that don't use variables at all.
-    if let Some(header) = HeaderRecord::try_decode(&mut decoder, header, warn)? {
+    if let Some(header) = HeaderRecord::try_decode(&mut decoder, &header, warn)? {
         output.push(Record::Header(header))
     }
-    if let Some(raw) = h.document {
-        if let Some(document) = DocumentRecord::try_decode(&mut decoder, raw, warn)? {
-            output.push(Record::Document(document))
+    for document in h.documents {
+        for line in &document.lines {
+            decoder.dictionary.documents.push(line.to_string())
         }
     }
-    if let Some(raw) = h.integer_info {
-        output.push(Record::IntegerInfo(raw.clone()));
-    }
-    if let Some(raw) = h.float_info {
-        output.push(Record::FloatInfo(raw.clone()));
-    }
-    if let Some(raw) = h.product_info {
-        output.push(Record::ProductInfo(raw.clone()));
+    /*
+            for &raw in &h.file_attributes {
+                let s = decoder.decode_string_cow(&raw.text.0, warn);
+                output.push(Record::FileAttributes(FileAttributeRecord::parse(
+                    &decoder, &s, warn,
+                )?));
+            }
+            for &raw in &h.other_extensions {
+                output.push(Record::OtherExtension(raw.clone()));
     }
-    if let Some(raw) = h.number_of_cases {
-        output.push(Record::NumberOfCases(raw.clone()))
+        */
+    // Decode the variable records, which are the basis of almost everything
+    // else.
+    for raw in &h.variables {
+        parse_variable_record(&mut decoder, raw, warn)?;
     }
     /*
-        for &raw in &h.file_attributes {
-            let s = decoder.decode_string_cow(&raw.text.0, warn);
-            output.push(Record::FileAttributes(FileAttributeRecord::parse(
-                &decoder, &s, warn,
-            )?));
-        }
-        for &raw in &h.other_extensions {
-            output.push(Record::OtherExtension(raw.clone()));
-        }
-        // Decode the variable records, which are the basis of almost everything
-        // else.
-        for &raw in &h.variables {
-            if let Some(variable) = VariableRecord::try_decode(&mut decoder, raw, warn)? {
-                output.push(Record::Variable(variable));
-            }
-        }
-
         // Decode value labels and weight variable.  These use indexes into the
         // variable records, so we need to parse them before those indexes become
         // invalidated by very long string variables.
@@ -413,8 +417,9 @@ pub fn decode(
             let s = decoder.decode_string_cow(&raw.text.0, warn);
             output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
         }
-    */
-    Ok(output)
+     */
+    let metadata = Metadata::decode(&header, h.integer_info, h.product_info, warn);
+    Ok((output, metadata))
 }
 
 impl Decoder {
@@ -532,29 +537,79 @@ fn trim_end_spaces(mut s: String) -> String {
     s
 }
 
+/// Data file info that doesn't fit in [Dictionary].
+pub struct Metadata {
+    creation: NaiveDateTime,
+    endian: Endian,
+    compression: Option<Compression>,
+    n_cases: Option<u64>,
+    product: String,
+    product_ext: Option<String>,
+    version: Option<(i32, i32, i32)>,
+}
+
+impl Metadata {
+    fn decode(
+        header: &crate::raw::HeaderRecord<Cow<str>>,
+        integer_info: Option<&IntegerInfoRecord>,
+        product_ext: Option<&ProductInfoRecord>,
+        warn: impl Fn(Error),
+    ) -> Self {
+        let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
+            .unwrap_or_else(|_| {
+                warn(Error::InvalidCreationDate {
+                    creation_date: header.creation_date.to_string(),
+                });
+                Default::default()
+            });
+        let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
+            .unwrap_or_else(|_| {
+                warn(Error::InvalidCreationTime {
+                    creation_time: header.creation_time.to_string(),
+                });
+                Default::default()
+            });
+        let creation = NaiveDateTime::new(creation_date, creation_time);
+
+        let product = header
+            .eye_catcher
+            .trim_start_matches("@(#) SPSS DATA FILE")
+            .trim_end()
+            .to_string();
+
+        Self {
+            creation,
+            endian: header.endian,
+            compression: header.compression,
+            n_cases: header.n_cases.map(|n| n as u64),
+            product,
+            product_ext: product_ext.map(|pe| pe.0.clone()),
+            version: integer_info.map(|ii| ii.version),
+        }
+    }
+}
+
 impl TryDecode for HeaderRecord {
-    type Input<'a> = crate::raw::HeaderRecord<RawString>;
+    type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
 
     fn try_decode(
-        decoder: &mut Decoder,
+        _decoder: &mut Decoder,
         input: &Self::Input<'_>,
         warn: impl Fn(Error),
     ) -> Result<Option<Self>, Error> {
-        let eye_catcher = trim_end_spaces(decoder.decode_string(&input.eye_catcher.0, &warn));
-        let file_label = trim_end_spaces(decoder.decode_string(&input.file_label.0, &warn));
-        let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn);
-        let creation_date =
-            NaiveDate::parse_from_str(&creation_date, "%e %b %Y").unwrap_or_else(|_| {
+        let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
+        let file_label = trim_end_spaces(input.file_label.to_string());
+        let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
+            .unwrap_or_else(|_| {
                 warn(Error::InvalidCreationDate {
-                    creation_date: creation_date.into(),
+                    creation_date: input.creation_date.to_string(),
                 });
                 Default::default()
             });
-        let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn);
-        let creation_time =
-            NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
+        let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
+            .unwrap_or_else(|_| {
                 warn(Error::InvalidCreationTime {
-                    creation_time: creation_time.into(),
+                    creation_time: input.creation_time.to_string(),
                 });
                 Default::default()
             });
@@ -621,6 +676,13 @@ impl VarWidth {
     pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
         Self::width_predicate(a, b, |a, b| a.min(b))
     }
+
+    pub fn default_display_width(&self) -> u32 {
+        match self {
+            VarWidth::Numeric => 8,
+            VarWidth::String(width) => *width.min(&32) as u32,
+        }
+    }
 }
 
 impl From<VarWidth> for VarType {
@@ -638,39 +700,10 @@ pub struct VariableRecord {
     pub name: Identifier,
     pub print_format: Spec,
     pub write_format: Spec,
-    pub missing_values: MissingValues,
+    pub missing_values: MissingValues<String>,
     pub label: Option<String>,
 }
 
-#[derive(Clone, Debug)]
-pub struct MissingValues {
-    /// Individual missing values, up to 3 of them.
-    pub values: Vec<Value>,
-
-    /// Optional range of missing values.
-    pub range: Option<(Value, Value)>,
-}
-
-impl Decode<raw::MissingValues<RawStr<8>>> for MissingValues {
-    fn decode(
-        decoder: &Decoder,
-        input: &raw::MissingValues<RawStr<8>>,
-        _warn: impl Fn(Error),
-    ) -> Self {
-        MissingValues {
-            values: input
-                .values
-                .iter()
-                .map(|value| Value::decode(value, decoder))
-                .collect(),
-            range: input
-                .range
-                .as_ref()
-                .map(|(low, high)| (Value::decode(low, decoder), Value::decode(high, decoder))),
-        }
-    }
-}
-
 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
     UncheckedSpec::try_from(raw)
         .and_then(Spec::try_from)
@@ -682,91 +715,84 @@ fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatErro
         })
 }
 
-impl TryDecode for VariableRecord {
-    type Input<'a> = raw::VariableRecord<RawString, RawStr<8>>;
-
-    fn try_decode(
-        decoder: &mut Decoder,
-        input: &Self::Input<'_>,
-        warn: impl Fn(Error),
-    ) -> Result<Option<VariableRecord>, Error> {
-        let width = match input.width {
-            0 => VarWidth::Numeric,
-            w @ 1..=255 => VarWidth::String(w as u16),
-            -1 => return Ok(None),
-            _ => {
-                return Err(Error::InvalidVariableWidth {
-                    offsets: input.offsets.clone(),
-                    width: input.width,
-                })
-            }
-        };
-        let name = trim_end_spaces(decoder.decode_string(&input.name.0, &warn));
-        let name = match Identifier::new(&name, decoder.encoding) {
-            Ok(name) => {
-                if !decoder.var_names.contains_key(&name) {
-                    name
-                } else {
-                    let new_name = decoder.generate_name();
-                    warn(Error::DuplicateVariableName {
-                        duplicate_name: name.clone(),
-                        new_name: new_name.clone(),
-                    });
-                    new_name
-                }
-            }
-            Err(id_error) => {
+fn parse_variable_record(
+    decoder: &mut Decoder,
+    input: &raw::VariableRecord<Cow<str>, String>,
+    warn: impl Fn(Error),
+) -> Result<(), Error> {
+    let width = match input.width {
+        0 => VarWidth::Numeric,
+        w @ 1..=255 => VarWidth::String(w as u16),
+        -1 => return Ok(()),
+        _ => {
+            return Err(Error::InvalidVariableWidth {
+                offsets: input.offsets.clone(),
+                width: input.width,
+            })
+        }
+    };
+    let name = trim_end_spaces(input.name.to_string());
+    let name = match Identifier::new(&name, decoder.encoding) {
+        Ok(name) => {
+            if !decoder.var_names.contains_key(&name) {
+                name
+            } else {
                 let new_name = decoder.generate_name();
-                warn(Error::InvalidVariableName {
-                    id_error,
+                warn(Error::DuplicateVariableName {
+                    duplicate_name: name.clone(),
                     new_name: new_name.clone(),
                 });
                 new_name
             }
-        };
-        let variable = Variable {
-            dict_index: decoder.n_dict_indexes,
-            short_name: name.clone(),
-            long_name: None,
-            width,
-        };
-        decoder.n_dict_indexes += width.n_dict_indexes();
-        assert!(decoder
-            .var_names
-            .insert(name.clone(), variable.dict_index)
-            .is_none());
-        assert!(decoder
-            .variables
-            .insert(variable.dict_index, variable)
-            .is_none());
-
-        let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
-            warn(Error::InvalidPrintFormat {
-                new_spec,
-                variable: name.clone(),
-                format_error,
-            })
-        });
-        let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
-            warn(Error::InvalidWriteFormat {
-                new_spec,
-                variable: name.clone(),
-                format_error,
-            })
-        });
-        let label = input
-            .label
-            .as_ref()
-            .map(|label| decoder.decode_string(&label.0, &warn));
-        Ok(Some(VariableRecord {
-            width,
-            name,
-            print_format,
-            write_format,
-            missing_values: MissingValues::decode(decoder, &input.missing_values, warn),
-            label,
-        }))
+        }
+        Err(id_error) => {
+            let new_name = decoder.generate_name();
+            warn(Error::InvalidVariableName {
+                id_error,
+                new_name: new_name.clone(),
+            });
+            new_name
+        }
+    };
+    let variable = Variable {
+        dict_index: decoder.n_dict_indexes,
+        short_name: name.clone(),
+        long_name: None,
+        width,
+    };
+    decoder.n_dict_indexes += width.n_dict_indexes();
+    assert!(decoder
+        .var_names
+        .insert(name.clone(), variable.dict_index)
+        .is_none());
+    assert!(decoder
+        .variables
+        .insert(variable.dict_index, variable)
+        .is_none());
+
+    let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
+        warn(Error::InvalidPrintFormat {
+            new_spec,
+            variable: name.clone(),
+            format_error,
+        })
+    });
+    let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
+        warn(Error::InvalidWriteFormat {
+            new_spec,
+            variable: name.clone(),
+            format_error,
+        })
+    });
+    let mut variable = dictionary::Variable::new(name, width);
+    variable.print_format = print_format;
+    variable.write_format = write_format;
+    variable.missing_values = input.missing_values.clone();
+    if let Some(ref label) = input.label {
+        variable.label = Some(label.to_string());
     }
+    decoder.dictionary.add_var(variable).unwrap();
+    Ok(())
 }
 
 #[derive(Clone, Debug)]
@@ -1284,56 +1310,6 @@ impl TryDecode for MultipleResponseRecord {
     }
 }
 
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValues {
-    /// Variable name.
-    pub var_name: Identifier,
-
-    /// Missing values.
-    pub missing_values: MissingValues,
-}
-
-impl LongStringMissingValues {
-    fn decode(
-        decoder: &Decoder,
-        input: &raw::LongStringMissingValues<RawString, RawStr<8>>,
-        warn: &impl Fn(Error),
-    ) -> Result<Self, Error> {
-        let var_name = decoder.decode_string(&input.var_name.0, warn);
-        let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
-            .map_err(Error::InvalidLongStringValueLabelName)?;
-
-        let missing_values = MissingValues::decode(decoder, &input.missing_values, warn);
-
-        Ok(LongStringMissingValues {
-            var_name,
-            missing_values,
-        })
-    }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValuesRecord(Vec<LongStringMissingValues>);
-
-impl TryDecode for LongStringMissingValuesRecord {
-    type Input<'a> = raw::LongStringMissingValueRecord<RawString, RawStr<8>>;
-
-    fn try_decode(
-        decoder: &mut Decoder,
-        input: &Self::Input<'_>,
-        warn: impl Fn(Error),
-    ) -> Result<Option<Self>, Error> {
-        let mut labels = Vec::with_capacity(input.0.len());
-        for label in &input.0 {
-            match LongStringMissingValues::decode(decoder, label, &warn) {
-                Ok(set) => labels.push(set),
-                Err(error) => warn(error),
-            }
-        }
-        Ok(Some(LongStringMissingValuesRecord(labels)))
-    }
-}
-
 #[derive(Clone, Debug)]
 pub struct LongStringValueLabels {
     pub var_name: Identifier,
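diff --git a/rust/src/dictionary.rs b/rust/src/dictionary.rs
index f9886641f7f582398627b9ab937cf63b1ab8caa3..e9eca118abe71218f8122e70629eccfcf820500f 100644
--- a/rust/src/dictionary.rs
+++ b/rust/src/dictionary.rs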
@@ -8,10 +8,10 @@ use encoding_rs::Encoding;
 use indexmap::IndexSet;
 
 use crate::{
-    cooked::{MissingValues, Value, VarWidth},
-    format::Format,
+    cooked::{Value, VarWidth},
+    format::Spec,
     identifier::{ByIdentifier, HasIdentifier, Identifier},
-    raw::{CategoryLabels, Alignment, Measure},
+    raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType},
 };
 
 pub type DictIndex = usize;
@@ -50,6 +50,14 @@ impl Dictionary {
         }
     }
 
+    pub fn add_var(&mut self, variable: Variable) -> Result<(), ()> {
+        if self.variables.insert(ByIdentifier::new(variable)) {
+            Ok(())
+        } else {
+            Err(())
+        }
+    }
+
     pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
         if from_index != to_index {
             self.variables.move_index(from_index, to_index);
@@ -187,16 +195,48 @@ pub enum Role {
     Split,
 }
 
+impl Default for Role {
+    fn default() -> Self {
+        Self::Input
+    }
+}
+
+pub enum DictClass {
+    Ordinary,
+    System,
+    Scratch,
+}
+
+impl DictClass {
+    pub fn from_identifier(id: &Identifier) -> Self {
+        if id.0.starts_with('$') {
+            Self::System
+        } else if id.0.starts_with('#') {
+            Self::Scratch
+        } else {
+            Self::Ordinary
+        }
+    }
+
+    pub fn must_leave(self) -> bool {
+        match self {
+            DictClass::Ordinary => false,
+            DictClass::System => false,
+            DictClass::Scratch => true,
+        }
+    }
+}
+
 #[derive(Clone, Debug)]
 pub struct Variable {
     pub name: Identifier,
     pub width: VarWidth,
     pub missing_values: MissingValues,
-    pub print_format: Format,
-    pub write_format: Format,
+    pub print_format: Spec,
+    pub write_format: Spec,
     pub value_labels: HashMap<Value, String>,
     pub label: Option<String>,
-    pub measure: Measure,
+    pub measure: Option<Measure>,
     pub role: Role,
     pub display_width: u32,
     pub alignment: Alignment,
@@ -205,6 +245,29 @@ pub struct Variable {
     pub attributes: HashSet<ByIdentifier<Attribute>>,
 }
 
+impl Variable {
+    pub fn new(name: Identifier, width: VarWidth) -> Self {
+        let var_type = VarType::from_width(width);
+        let leave = DictClass::from_identifier(&name).must_leave();
+        Self {
+            name,
+            width,
+            missing_values: MissingValues::default(),
+            print_format: Spec::default_for_width(width),
+            write_format: Spec::default_for_width(width),
+            value_labels: HashMap::new(),
+            label: None,
+            measure: Measure::default_for_type(var_type),
+            role: Role::default(),
+            display_width: width.default_display_width(),
+            alignment: Alignment::default_for_type(var_type),
+            leave,
+            short_names: Vec::new(),
+            attributes: HashSet::new()
+        }
+    }
+}
+
 impl HasIdentifier for Variable {
     fn identifier(&self) -> &Identifier {
         &self.name
diff --git a/rust/src/main.rs b/rust/src/main.rs
index 45d0622f0d4e039f09c4973596d2d4a5b1b9078c..473062183ba4a427171370dbcac596ac955b3574 100644
--- a/rust/src/main.rs
+++ b/rust/src/main.rs
@@ -17,7 +17,7 @@
 use anyhow::Result;
 use clap::{Parser, ValueEnum};
 use encoding_rs::Encoding;
-use pspp::cooked::decode;
+use pspp::cooked::{decode, encoding_from_headers};
 use pspp::raw::{Reader, Record, Magic};
 use std::fs::File;
 use std::io::BufReader;
@@ -111,7 +111,8 @@ fn dissect(file_name: &Path, max_cases: u64, mode: Mode, encoding: Option<&'stat
         }
         Mode::Cooked => {
             let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
-            let headers = decode(headers, encoding, &|e| eprintln!("{e}"))?;
+            let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?;
+            let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?;
             for header in headers {
                 println!("{header:?}");
             }
diff --git a/rust/src/raw.rs b/rust/src/raw.rs
index 544481906f20153aabd415e50f507b198375c73f..986bb92a52bb389726c8875582130305aa3772d2 100644
--- a/rust/src/raw.rs
+++ b/rust/src/raw.rs
@@ -1,4 +1,5 @@
 use crate::{
+    cooked::VarWidth,
     endian::{Endian, Parse, ToBytes},
     identifier::{Error as IdError, Identifier},
 };
@@ -185,6 +186,9 @@ pub enum Error {
     #[error("Invalid multiple response set variable name.  {0}")]
     InvalidMrSetVariableName(IdError),
 
+    #[error("Invalid variable name in long string missing values record.  {0}")]
+    InvalidLongStringMissingValueVariableName(IdError),
+
     #[error("Details TBD")]
     TBD,
 }
@@ -398,7 +402,7 @@ impl HeaderRecord<RawString> {
         })
     }
 
-    fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord<Cow<'a, str>> {
+    pub fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord<Cow<'a, str>> {
         let eye_catcher = decoder.decode(&self.eye_catcher);
         let file_label = decoder.decode(&self.file_label);
         let creation_date = decoder.decode(&self.creation_date);
@@ -421,9 +425,9 @@ impl HeaderRecord<RawString> {
     }
 }
 
-struct Decoder {
-    encoding: &'static Encoding,
-    warn: Box<dyn Fn(Error)>,
+pub struct Decoder {
+    pub encoding: &'static Encoding,
+    pub warn: Box<dyn Fn(Error)>,
 }
 
 impl Decoder {
@@ -552,14 +556,14 @@ pub enum VarType {
 }
 
 impl VarType {
-    fn from_width(width: i32) -> VarType {
+    pub fn from_width(width: VarWidth) -> VarType {
         match width {
-            0 => VarType::Numeric,
-            _ => VarType::String,
+            VarWidth::Numeric => Self::Numeric,
+            VarWidth::String(_) => Self::String,
         }
     }
 
-    fn opposite(self) -> VarType {
+    pub fn opposite(self) -> VarType {
         match self {
             Self::Numeric => Self::String,
             Self::String => Self::Numeric,
@@ -848,7 +852,11 @@ where
                 };
                 match record {
                     Record::Variable(VariableRecord { width, .. }) => {
-                        self.var_types.push(VarType::from_width(width));
+                        self.var_types.push(if width == 0 {
+                            VarType::Numeric
+                        } else {
+                            VarType::String
+                        });
                     }
                     Record::EndOfHeaders(_) => {
                         self.state = if let Some(Compression::ZLib) = self.header.compression {
@@ -1016,7 +1024,7 @@ fn format_name(type_: u32) -> Cow<'static, str> {
 }
 
 #[derive(Clone)]
-pub struct MissingValues<S>
+pub struct MissingValues<S = String>
 where
     S: Debug,
 {
@@ -1063,6 +1071,18 @@ where
     }
 }
 
+impl<S> Default for MissingValues<S>
+where
+    S: Debug,
+{
+    fn default() -> Self {
+        Self {
+            values: Vec::new(),
+            range: None,
+        }
+    }
+}
+
 impl MissingValues<RawStr<8>> {
     fn read<R: Read + Seek>(
         r: &mut R,
@@ -1079,7 +1099,11 @@ impl MissingValues<RawStr<8>> {
             (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
         };
 
-        let var_type = VarType::from_width(width);
+        let var_type = if width == 0 {
+            VarType::Numeric
+        } else {
+            VarType::String
+        };
 
         let mut values = Vec::new();
         for _ in 0..n_values {
@@ -1209,7 +1233,7 @@ impl VariableRecord<RawString, RawStr<8>> {
         }))
     }
 
-    fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord<Cow<'a, str>, String> {
+    pub fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord<Cow<'a, str>, String> {
         VariableRecord {
             offsets: self.offsets.clone(),
             width: self.width,
@@ -1506,7 +1530,7 @@ impl DocumentRecord<RawDocumentLine> {
         }
     }
 
-    fn decode<'a>(&'a self, decoder: &Decoder) -> DocumentRecord<Cow<'a, str>> {
+    pub fn decode<'a>(&'a self, decoder: &Decoder) -> DocumentRecord<Cow<'a, str>> {
         DocumentRecord {
             offsets: self.offsets.clone(),
             lines: self
@@ -1792,6 +1816,13 @@ pub enum Measure {
 }
 
 impl Measure {
+    pub fn default_for_type(var_type: VarType) -> Option<Measure> {
+        match var_type {
+            VarType::Numeric => None,
+            VarType::String => Some(Self::Nominal),
+        }
+    }
+
     fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
         match source {
             0 => Ok(None),
@@ -1820,6 +1851,13 @@ impl Alignment {
             _ => Err(Error::InvalidAlignment(source)),
         }
     }
+
+    pub fn default_for_type(var_type: VarType) -> Self {
+        match var_type {
+            VarType::Numeric => Self::Right,
+            VarType::String => Self::Left,
+        }
+    }
 }
 
 #[derive(Clone, Debug)]
@@ -1892,11 +1930,14 @@ where
 }
 
 impl LongStringMissingValues<RawString, RawStr<8>> {
-    fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValues<String, String> {
-        LongStringMissingValues {
-            var_name: decoder.decode(&self.var_name).to_string(),
+    fn decode<'a>(
+        &self,
+        decoder: &Decoder,
+    ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
+        Ok(LongStringMissingValues {
+            var_name: decoder.decode_identifier(&self.var_name)?,
             missing_values: self.missing_values.decode(decoder),
-        }
+        })
     }
 }
 
@@ -1959,8 +2000,21 @@ impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
 }
 
 impl LongStringMissingValueRecord<RawString, RawStr<8>> {
-    fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValueRecord<String, String> {
-        LongStringMissingValueRecord(self.0.iter().map(|mv| mv.decode(decoder)).collect())
+    pub fn decode<'a>(
+        &self,
+        decoder: &Decoder,
+    ) -> LongStringMissingValueRecord<Identifier, String> {
+        let mut mvs = Vec::with_capacity(self.0.len());
+        for mv in self.0.iter() {
+            if let Some(mv) = mv
+                .decode(decoder)
+                .map_err(|err| Error::InvalidLongStringMissingValueVariableName(err))
+                .warn_on_error(&decoder.warn)
+            {
+                mvs.push(mv);
+            }
+        }
+        LongStringMissingValueRecord(mvs)
     }
 }