work on new decoder - about to start decoding variables
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 25 Feb 2024 20:01:10 +0000 (12:01 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 25 Feb 2024 20:01:10 +0000 (12:01 -0800)
rust/src/cooked.rs
rust/src/lib.rs
rust/src/raw.rs

index 78eaf138c2951fc831d72d5da98cbd8005542c73..5d32c91ced3c7cdf91b5414e7ec6ce944d9630f6 100644 (file)
@@ -1,34 +1,35 @@
-use std::{
-    borrow::Cow, cell::RefCell, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range,
-    rc::Rc,
-};
+use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
 
 use crate::{
-    dictionary::{self, Dictionary, VarWidth},
-    encoding::{default_encoding, get_encoding, Error as EncodingError},
+    dictionary::{Dictionary, VarWidth},
+    encoding::Error as EncodingError,
     endian::Endian,
-    format::{Error as FormatError, Spec, UncheckedSpec},
+    format::{Error as FormatError, Spec},
     identifier::{Error as IdError, Identifier},
     raw::{
-        self, LongStringMissingValueRecord, MissingValues, ProductInfoRecord, RawDocumentLine,
-        RawStr, RawString, VarDisplayRecord, VarType, DecodedRecord,
+        self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
+        FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
+        LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
+        NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabelRecord, VarDisplayRecord,
+        VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader,
+        ZTrailer,
     },
 };
 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
-use encoding_rs::{DecoderResult, Encoding};
-use num::integer::div_ceil;
-use ordered_float::OrderedFloat;
+use encoding_rs::Encoding;
 use thiserror::Error as ThisError;
 
 pub use crate::raw::{CategoryLabels, Compression};
 
 #[derive(ThisError, Debug)]
 pub enum Error {
-    // XXX this is really an internal error and maybe we should change the
-    // interfaces to make it impossible
     #[error("Missing header record")]
     MissingHeaderRecord,
 
+    // XXX this is an internal error
+    #[error("More than one file header record")]
+    DuplicateHeaderRecord,
+
     #[error("{0}")]
     EncodingError(EncodingError),
 
@@ -159,36 +160,6 @@ pub enum Error {
     TBD,
 }
 
-#[derive(Clone, Debug)]
-pub enum Record {
-    Header(HeaderRecord),
-    Variable(VariableRecord),
-    ValueLabel(ValueLabelRecord),
-    Document(DocumentRecord),
-    IntegerInfo(IntegerInfoRecord),
-    FloatInfo(FloatInfoRecord),
-    VariableSets(VariableSetRecord),
-    VarDisplay(VarDisplayRecord),
-    MultipleResponse(MultipleResponseRecord),
-    LongStringMissingValues(LongStringMissingValueRecord<String, String>),
-    LongStringValueLabels(LongStringValueLabelRecord),
-    Encoding(EncodingRecord),
-    NumberOfCases(NumberOfCasesRecord),
-    ProductInfo(ProductInfoRecord),
-    LongNames(LongNameRecord),
-    VeryLongStrings(VeryLongStringRecord),
-    FileAttributes(FileAttributeRecord),
-    VariableAttributes(VariableAttributeRecord),
-    OtherExtension(Extension),
-    //Case(Vec<Value>),
-}
-
-pub use crate::raw::EncodingRecord;
-pub use crate::raw::Extension;
-pub use crate::raw::FloatInfoRecord;
-pub use crate::raw::IntegerInfoRecord;
-pub use crate::raw::NumberOfCasesRecord;
-
 type DictIndex = usize;
 
 pub struct Variable {
@@ -208,191 +179,292 @@ pub struct Decoder {
     n_generated_names: usize,
 }
 
-#[derive(Default)]
-struct Headers<'a> {
-    header: Option<raw::HeaderRecord<Cow<'a, str>>>,
-    variables: Vec<raw::VariableRecord<Cow<'a, str>, String>>,
-    value_labels: Vec<&'a raw::ValueLabelRecord<RawStr<8>, RawString>>,
-    documents: Vec<raw::DocumentRecord<Cow<'a, str>>>,
-    integer_info: Option<&'a raw::IntegerInfoRecord>,
-    float_info: Option<&'a raw::FloatInfoRecord>,
-    variable_sets: Vec<&'a raw::VariableSetRecord>,
-    var_display: Option<&'a raw::VarDisplayRecord>,
-    multiple_response: Vec<&'a raw::MultipleResponseRecord<RawString, RawString>>,
-    long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord<RawString>>,
-    long_string_missing_values: Vec<raw::LongStringMissingValueRecord<Identifier, String>>,
-    encoding: Option<&'a raw::EncodingRecord>,
-    number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
-    product_info: Option<&'a raw::ProductInfoRecord>,
-    long_names: Option<&'a raw::LongNamesRecord>,
-    very_long_strings: Vec<&'a raw::VeryLongStringsRecord>,
-    file_attributes: Vec<&'a raw::FileAttributeRecord>,
-    variable_attributes: Vec<&'a raw::VariableAttributeRecord>,
-    other_extensions: Vec<&'a raw::Extension>,
-    cases: Option<&'a Rc<RefCell<raw::Cases>>>,
+#[derive(Clone, Debug)]
+pub struct Headers {
+    pub header: HeaderRecord<String>,
+    pub variable: Vec<VariableRecord<String, String>>,
+    pub value_label: Vec<ValueLabelRecord<RawStr<8>, String>>,
+    pub document: Vec<DocumentRecord<String>>,
+    pub integer_info: Option<IntegerInfoRecord>,
+    pub float_info: Option<FloatInfoRecord>,
+    pub var_display: Option<VarDisplayRecord>,
+    pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
+    pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
+    pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier, String>>,
+    pub encoding: Option<EncodingRecord>,
+    pub number_of_cases: Option<NumberOfCasesRecord>,
+    pub variable_sets: Vec<VariableSetRecord>,
+    pub product_info: Option<ProductInfoRecord>,
+    pub long_names: Vec<LongNamesRecord>,
+    pub very_long_strings: Vec<VeryLongStringsRecord>,
+    pub file_attributes: Vec<FileAttributeRecord>,
+    pub variable_attributes: Vec<VariableAttributeRecord>,
+    pub other_extension: Vec<Extension>,
+    pub end_of_headers: Option<u32>,
+    pub z_header: Option<ZHeader>,
+    pub z_trailer: Option<ZTrailer>,
+    pub cases: Option<Rc<RefCell<Cases>>>,
 }
 
-fn set_or_warn<T>(option: &mut Option<T>, value: T, warn: &impl Fn(Error)) {
-    if option.is_none() {
-        let _ = option.insert(value);
-    } else {
-        warn(Error::TBD);
+fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
+where
+    F: FnOnce(),
+{
+    if vec.len() > 1 {
+        more_than_one();
     }
+    vec.drain(..).next()
 }
 
-impl<'a> Headers<'a> {
-    fn new(headers: &'a Vec<raw::Record>, decoder: &Decoder, warn: &impl Fn(Error)) -> Headers<'a> {
-        let mut h = Headers::default();
+impl Headers {
+    pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
+        let mut file_header = Vec::new();
+        let mut variable = Vec::new();
+        let mut value_label = Vec::new();
+        let mut document = Vec::new();
+        let mut integer_info = Vec::new();
+        let mut float_info = Vec::new();
+        let mut var_display = Vec::new();
+        let mut multiple_response = Vec::new();
+        let mut long_string_value_labels = Vec::new();
+        let mut long_string_missing_values = Vec::new();
+        let mut encoding = Vec::new();
+        let mut number_of_cases = Vec::new();
+        let mut variable_sets = Vec::new();
+        let mut product_info = Vec::new();
+        let mut long_names = Vec::new();
+        let mut very_long_strings = Vec::new();
+        let mut file_attributes = Vec::new();
+        let mut variable_attributes = Vec::new();
+        let mut other_extension = Vec::new();
+        let mut end_of_headers = Vec::new();
+        let mut z_header = Vec::new();
+        let mut z_trailer = Vec::new();
+        let mut cases = Vec::new();
+
         for header in headers {
             match header {
-                raw::Record::Header(record) => {
-                    set_or_warn(&mut h.header, record.decode(&decoder.raw), warn)
+                DecodedRecord::Header(record) => {
+                    file_header.push(record);
+                }
+                DecodedRecord::Variable(record) => {
+                    variable.push(record);
+                }
+                DecodedRecord::ValueLabel(record) => {
+                    value_label.push(record);
+                }
+                DecodedRecord::Document(record) => {
+                    document.push(record);
+                }
+                DecodedRecord::IntegerInfo(record) => {
+                    integer_info.push(record);
+                }
+                DecodedRecord::FloatInfo(record) => {
+                    float_info.push(record);
+                }
+                DecodedRecord::VariableSets(record) => {
+                    variable_sets.push(record);
+                }
+                DecodedRecord::VarDisplay(record) => {
+                    var_display.push(record);
+                }
+                DecodedRecord::MultipleResponse(record) => {
+                    multiple_response.push(record);
+                }
+                DecodedRecord::LongStringValueLabels(record) => {
+                    long_string_value_labels.push(record)
+                }
+                DecodedRecord::LongStringMissingValues(record) => {
+                    long_string_missing_values.push(record);
+                }
+                DecodedRecord::Encoding(record) => {
+                    encoding.push(record);
+                }
+                DecodedRecord::NumberOfCases(record) => {
+                    number_of_cases.push(record);
+                }
+                DecodedRecord::ProductInfo(record) => {
+                    product_info.push(record);
+                }
+                DecodedRecord::LongNames(record) => {
+                    long_names.push(record);
+                }
+                DecodedRecord::VeryLongStrings(record) => {
+                    very_long_strings.push(record);
+                }
+                DecodedRecord::FileAttributes(record) => {
+                    file_attributes.push(record);
                 }
-                raw::Record::Variable(record) => h.variables.push(record.decode(&decoder.raw)),
-                raw::Record::ValueLabel(record) => h.value_labels.push(record),
-                raw::Record::Document(record) => h.documents.push(record.decode(&decoder.raw)),
-                raw::Record::IntegerInfo(record) => set_or_warn(&mut h.integer_info, record, warn),
-                raw::Record::FloatInfo(record) => set_or_warn(&mut h.float_info, record, warn),
-                raw::Record::VariableSets(record) => h.variable_sets.push(record),
-                raw::Record::VarDisplay(record) => set_or_warn(&mut h.var_display, record, warn),
-                raw::Record::MultipleResponse(record) => h.multiple_response.push(record),
-                raw::Record::LongStringValueLabels(record) => {
-                    h.long_string_value_labels.push(record)
+                DecodedRecord::VariableAttributes(record) => {
+                    variable_attributes.push(record);
                 }
-                raw::Record::LongStringMissingValues(record) => h
-                    .long_string_missing_values
-                    .push(record.decode(&decoder.raw)),
-                raw::Record::Encoding(record) => set_or_warn(&mut h.encoding, record, warn),
-                raw::Record::NumberOfCases(record) => {
-                    set_or_warn(&mut h.number_of_cases, record, warn)
+                DecodedRecord::OtherExtension(record) => {
+                    other_extension.push(record);
+                }
+                DecodedRecord::EndOfHeaders(record) => {
+                    end_of_headers.push(record);
+                }
+                DecodedRecord::ZHeader(record) => {
+                    z_header.push(record);
+                }
+                DecodedRecord::ZTrailer(record) => {
+                    z_trailer.push(record);
+                }
+                DecodedRecord::Cases(record) => {
+                    cases.push(record);
                 }
-                raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn),
-                raw::Record::LongNames(record) => set_or_warn(&mut h.long_names, record, warn),
-                raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record),
-                raw::Record::FileAttributes(record) => h.file_attributes.push(record),
-                raw::Record::VariableAttributes(record) => h.variable_attributes.push(record),
-                raw::Record::OtherExtension(record) => h.other_extensions.push(record),
-                raw::Record::EndOfHeaders(_) => (),
-                raw::Record::ZHeader(_) => (),
-                raw::Record::ZTrailer(_) => (),
-                raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn),
-                raw::Record::Text(_) => todo!(),
             }
         }
-        h
+
+        let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
+        else {
+            return Err(Error::MissingHeaderRecord);
+        };
+
+        Ok(Headers {
+            header: file_header,
+            variable,
+            value_label,
+            document,
+            integer_info: take_first(integer_info, || warn(Error::TBD)),
+            float_info: take_first(float_info, || warn(Error::TBD)),
+            var_display: take_first(var_display, || warn(Error::TBD)),
+            multiple_response,
+            long_string_value_labels,
+            long_string_missing_values,
+            encoding: take_first(encoding, || warn(Error::TBD)),
+            number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
+            variable_sets,
+            product_info: take_first(product_info, || warn(Error::TBD)),
+            long_names,
+            very_long_strings,
+            file_attributes,
+            variable_attributes,
+            other_extension,
+            end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
+            z_header: take_first(z_header, || warn(Error::TBD)),
+            z_trailer: take_first(z_trailer, || warn(Error::TBD)),
+            cases: take_first(cases, || warn(Error::TBD)),
+        })
     }
 }
 
+pub struct Metadata {
+    creation: NaiveDateTime,
+    endian: Endian,
+    compression: Option<Compression>,
+    n_cases: Option<u64>,
+    product: String,
+    product_ext: Option<String>,
+    version: Option<(i32, i32, i32)>,
+}
 
-pub fn decode(
-    headers: Vec<DecodedRecord>,
-    decoder: raw::Decoder,
-) -> Result<(Vec<Record>, Metadata), Error> {
-    let dictionary = Dictionary::new(decoder.encoding);
-    let mut decoder = Decoder {
-        raw: decoder,
-        variables: HashMap::new(),
-        var_names: HashMap::new(),
-        dictionary,
-        n_dict_indexes: 0,
-        n_generated_names: 0,
-    };
-
-    let h = Headers::new(&headers, &decoder);
-    let Some(header) = h.header else {
-        return Err(Error::MissingHeaderRecord);
-    };
+impl Metadata {
+    fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
+        let header = &headers.header;
+        let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
+            .unwrap_or_else(|_| {
+                warn(Error::InvalidCreationDate {
+                    creation_date: header.creation_date.to_string(),
+                });
+                Default::default()
+            });
+        let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
+            .unwrap_or_else(|_| {
+                warn(Error::InvalidCreationTime {
+                    creation_time: header.creation_time.to_string(),
+                });
+                Default::default()
+            });
+        let creation = NaiveDateTime::new(creation_date, creation_time);
 
-    let mut output = Vec::with_capacity(headers.len());
+        let product = header
+            .eye_catcher
+            .trim_start_matches("@(#) SPSS DATA FILE")
+            .trim_end()
+            .to_string();
 
-    // Decode the records that don't use variables at all.
-    if let Some(header) = HeaderRecord::try_decode(&mut decoder, &header, warn)? {
-        output.push(Record::Header(header))
-    }
-    for document in h.documents {
-        for line in &document.lines {
-            decoder.dictionary.documents.push(line.to_string())
+        Self {
+            creation,
+            endian: header.endian,
+            compression: header.compression,
+            n_cases: header.n_cases.map(|n| n as u64),
+            product,
+            product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
+            version: headers.integer_info.as_ref().map(|ii| ii.version),
         }
     }
-    /*
-            for &raw in &h.file_attributes {
-                let s = decoder.decode_string_cow(&raw.text.0, warn);
-                output.push(Record::FileAttributes(FileAttributeRecord::parse(
-                    &decoder, &s, warn,
-                )?));
-            }
-            for &raw in &h.other_extensions {
-                output.push(Record::OtherExtension(raw.clone()));
+}
+
+pub fn decode(
+    mut headers: Headers,
+    encoding: &'static Encoding,
+    warn: impl Fn(Error),
+) -> Result<(Dictionary, Metadata), Error> {
+    let mut dictionary = Dictionary::new(encoding);
+
+    let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
+    if !file_label.is_empty() {
+        dictionary.file_label = Some(file_label);
     }
-        */
-    // Decode the variable records, which are the basis of almost everything
-    // else.
-    for raw in &h.variables {
-        parse_variable_record(&mut decoder, raw, warn)?;
+
+    for attributes in headers.file_attributes.drain(..) {
+        dictionary.attributes.extend(attributes.0.0.into_iter())
     }
-    /*
-        // Decode value labels and weight variable.  These use indexes into the
-        // variable records, so we need to parse them before those indexes become
-        // invalidated by very long string variables.
-        for &raw in &h.value_labels {
-            if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
-                output.push(Record::ValueLabel(value_label));
-            }
-        }
-        // XXX weight
-        if let Some(raw) = h.var_display {
-            output.push(Record::VarDisplay(raw.clone()));
-        }
 
-        // Decode records that use short names.
-            for &raw in &h.multiple_response {
-                if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, raw, warn)? {
-                    output.push(Record::MultipleResponse(mrr))
-                }
-            }
-        for &raw in &h.very_long_strings {
-            let s = decoder.decode_string_cow(&raw.text.0, warn);
-            output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
-                &decoder, &s, warn,
-            )?));
-        }
+    // Concatenate all the document records (really there should only be one)
+    // and trim off the trailing spaces that pad them to 80 bytes.
+    dictionary.documents = headers
+        .document
+        .drain(..)
+        .flat_map(|record| record.lines)
+        .map(trim_end_spaces)
+        .collect();
 
-        // Rename variables to their long names.
-        for &raw in &h.long_names {
-            let s = decoder.decode_string_cow(&raw.text.0, warn);
-            output.push(Record::LongNames(LongNameRecord::parse(
-                &mut decoder,
-                &s,
-                warn,
-            )?));
-        }
+    // XXX warn for weird integer format
+    // XXX warn for weird floating-point format, etc.
 
-        // Decode recods that use long names.
-        for &raw in &h.variable_attributes {
-            let s = decoder.decode_string_cow(&raw.text.0, warn);
-            output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
-                &decoder, &s, warn,
-            )?));
-        }
-        for &raw in &h.long_string_value_labels {
-            if let Some(mrr) = LongStringValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
-                output.push(Record::LongStringValueLabels(mrr))
-            }
-        }
-        for &raw in &h.long_string_missing_values {
-            if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, raw, warn)? {
-                output.push(Record::LongStringMissingValues(mrr))
+    /*
+        let mut decoder = Decoder {
+            raw: decoder,
+            variables: HashMap::new(),
+            var_names: HashMap::new(),
+            dictionary,
+            n_dict_indexes: 0,
+            n_generated_names: 0,
+        };
+    */
+    let metadata = Metadata::decode(&headers, warn);
+    Ok((dictionary, metadata))
+}
+
+fn trim_end_spaces(mut s: String) -> String {
+    s.truncate(s.trim_end_matches(' ').len());
+    s
+}
+
+/// Returns a copy of `s` in which all lone CR and CR LF pairs have been
+/// replaced by LF.
+///
+/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
+/// files that use CR-only line ends in the file label and extra product
+/// info.)
+fn fix_line_ends(s: &str) -> String {
+    let mut out = String::with_capacity(s.len());
+    let mut s = s.chars().peekable();
+    while let Some(c) = s.next() {
+        match c {
+            '\r' => {
+                s.next_if_eq(&'\n');
+                out.push('\n')
             }
+            c => out.push(c),
         }
-        for &raw in &h.variable_sets {
-            let s = decoder.decode_string_cow(&raw.text.0, warn);
-            output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
-        }
-     */
-    let metadata = Metadata::decode(&header, h.integer_info, h.product_info, warn);
-    Ok((output, metadata))
+    }
+    out
 }
 
+/*
 impl Decoder {
     fn generate_name(&mut self) -> Identifier {
         loop {
@@ -493,7 +565,8 @@ impl<const N: usize> Decode<RawStr<N>> for String {
         decoder.decode_string(&input.0, &warn)
     }
 }
-
+*/
+/*
 #[derive(Clone, Debug)]
 pub struct HeaderRecord {
     pub eye_catcher: String,
@@ -1285,3 +1358,4 @@ mod test {
         assert_eq!(&charset[..], &encoded[..]);
     }
 }
+*/
index 7f0ec2aadd0508c737cd815e085cd7474de7c677..f8e880c14e21641d96ea5a8f8e626f0963ba263c 100644 (file)
@@ -1,4 +1,4 @@
-//pub mod cooked;
+pub mod cooked;
 pub mod dictionary;
 pub mod encoding;
 pub mod endian;
index bbe2b46d36754ff3f7aee5fcd83ef8b7c91a675a..0620d4eea6884a19318a2bd089970841c4579553 100644 (file)
@@ -1621,7 +1621,8 @@ where
 {
     pub offsets: Range<u64>,
 
-    /// The document, as an array of 80-byte lines.
+    /// The document, as an array of lines.  Raw lines are exactly 80 bytes long
+    /// and are right-padded with spaces without any new-line termination.
     pub lines: Vec<S>,
 }
 
@@ -2346,7 +2347,7 @@ impl AttributeSet {
 }
 
 #[derive(Clone, Debug, Default)]
-pub struct FileAttributeRecord(AttributeSet);
+pub struct FileAttributeRecord(pub AttributeSet);
 
 impl FileAttributeRecord {
     fn decode(source: &TextRecord, decoder: &Decoder) -> Self {