continue implementation
[pspp] / rust / src / cooked.rs
index 3e4677e71c1bbbe80191617e92f8e935ece67fe4..56d6aa1c82254e1fa44bd3b9e3992ade55743df3 100644 (file)
@@ -4,11 +4,15 @@ use std::{
 };
 
 use crate::{
+    dictionary::{self, Dictionary},
     encoding::{default_encoding, get_encoding, Error as EncodingError},
     endian::Endian,
     format::{Error as FormatError, Spec, UncheckedSpec},
     identifier::{Error as IdError, Identifier},
-    raw::{self, ProductInfoRecord, RawDocumentLine, RawStr, RawString, VarDisplayRecord, VarType},
+    raw::{
+        self, LongStringMissingValueRecord, MissingValues, ProductInfoRecord, RawDocumentLine,
+        RawStr, RawString, VarDisplayRecord, VarType,
+    },
 };
 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
 use encoding_rs::{DecoderResult, Encoding};
@@ -166,7 +170,7 @@ pub enum Record {
     VariableSets(VariableSetRecord),
     VarDisplay(VarDisplayRecord),
     MultipleResponse(MultipleResponseRecord),
-    LongStringMissingValues(LongStringMissingValuesRecord),
+    LongStringMissingValues(LongStringMissingValueRecord<String, String>),
     LongStringValueLabels(LongStringValueLabelRecord),
     Encoding(EncodingRecord),
     NumberOfCases(NumberOfCasesRecord),
@@ -195,28 +199,28 @@ pub struct Variable {
 }
 
 pub struct Decoder {
-    pub compression: Option<Compression>,
-    pub endian: Endian,
+    pub raw: raw::Decoder, // Raw-level decoder handed to each `record.decode(..)` call.
     pub encoding: &'static Encoding,
     pub variables: HashMap<DictIndex, Variable>,
     pub var_names: HashMap<Identifier, DictIndex>,
+    pub dictionary: Dictionary, // Filled in as documents and variables are decoded.
     n_dict_indexes: usize,
     n_generated_names: usize,
 }
 
 #[derive(Default)]
 struct Headers<'a> {
-    header: Option<&'a raw::HeaderRecord<RawString>>,
-    variables: Vec<&'a raw::VariableRecord<RawString, RawStr<8>>>,
+    header: Option<raw::HeaderRecord<Cow<'a, str>>>,
+    variables: Vec<raw::VariableRecord<Cow<'a, str>, String>>,
     value_labels: Vec<&'a raw::ValueLabelRecord<RawStr<8>, RawString>>,
-    document: Option<&'a raw::DocumentRecord<RawDocumentLine>>,
+    documents: Vec<raw::DocumentRecord<Cow<'a, str>>>,
     integer_info: Option<&'a raw::IntegerInfoRecord>,
     float_info: Option<&'a raw::FloatInfoRecord>,
     variable_sets: Vec<&'a raw::VariableSetRecord>,
     var_display: Option<&'a raw::VarDisplayRecord>,
     multiple_response: Vec<&'a raw::MultipleResponseRecord<RawString, RawString>>,
     long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord<RawString>>,
-    long_string_missing_values: Vec<&'a raw::LongStringMissingValueRecord<RawString, RawStr<8>>>,
+    long_string_missing_values: Vec<raw::LongStringMissingValueRecord<Identifier, String>>,
     encoding: Option<&'a raw::EncodingRecord>,
     number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
     product_info: Option<&'a raw::ProductInfoRecord>,
@@ -237,14 +241,16 @@ fn set_or_warn<T>(option: &mut Option<T>, value: T, warn: &impl Fn(Error)) {
 }
 
 impl<'a> Headers<'a> {
-    fn new(headers: &'a Vec<raw::Record>, warn: &impl Fn(Error)) -> Headers<'a> {
+    fn new(headers: &'a Vec<raw::Record>, decoder: &Decoder, warn: &impl Fn(Error)) -> Headers<'a> {
         let mut h = Headers::default();
         for header in headers {
             match header {
-                raw::Record::Header(record) => set_or_warn(&mut h.header, record, warn),
-                raw::Record::Variable(record) => h.variables.push(record),
+                raw::Record::Header(record) => {
+                    set_or_warn(&mut h.header, record.decode(&decoder.raw), warn)
+                }
+                raw::Record::Variable(record) => h.variables.push(record.decode(&decoder.raw)),
                 raw::Record::ValueLabel(record) => h.value_labels.push(record),
-                raw::Record::Document(record) => set_or_warn(&mut h.document, record, warn),
+                raw::Record::Document(record) => h.documents.push(record.decode(&decoder.raw)),
                 raw::Record::IntegerInfo(record) => set_or_warn(&mut h.integer_info, record, warn),
                 raw::Record::FloatInfo(record) => set_or_warn(&mut h.float_info, record, warn),
                 raw::Record::VariableSets(record) => h.variable_sets.push(record),
@@ -253,9 +259,9 @@ impl<'a> Headers<'a> {
                 raw::Record::LongStringValueLabels(record) => {
                     h.long_string_value_labels.push(record)
                 }
-                raw::Record::LongStringMissingValues(record) => {
-                    h.long_string_missing_values.push(record)
-                }
+                raw::Record::LongStringMissingValues(record) => h
+                    .long_string_missing_values
+                    .push(record.decode(&decoder.raw)),
                 raw::Record::Encoding(record) => set_or_warn(&mut h.encoding, record, warn),
                 raw::Record::NumberOfCases(record) => {
                     set_or_warn(&mut h.number_of_cases, record, warn)
@@ -277,85 +283,83 @@ impl<'a> Headers<'a> {
     }
 }
 
-pub fn decode(
-    headers: Vec<raw::Record>,
-    encoding: Option<&'static Encoding>,
+pub fn encoding_from_headers( // Chooses the encoding from the encoding/integer-info records.
+    headers: &Vec<raw::Record>,
     warn: &impl Fn(Error),
-) -> Result<Vec<Record>, Error> {
-    let h = Headers::new(&headers, warn);
-    let Some(header) = h.header else {
-        return Err(Error::MissingHeaderRecord);
-    };
-    let encoding = match encoding {
-        Some(encoding) => encoding,
-        None => {
-            let encoding = h.encoding.map(|record| record.0.as_str());
-            let character_code = h.integer_info.map(|record| record.character_code);
-            match get_encoding(encoding, character_code) {
-                Ok(encoding) => encoding,
-                Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)),
-                Err(err) => {
-                    warn(Error::EncodingError(err));
-                    // Warn that we're using the default encoding.
-                    default_encoding()
-                }
-            }
+) -> Result<&'static Encoding, Error> {
+    let mut encoding_record = None; // For both records, the last one seen wins.
+    let mut integer_info_record = None;
+    for record in headers {
+        match record {
+            raw::Record::Encoding(record) => encoding_record = Some(record),
+            raw::Record::IntegerInfo(record) => integer_info_record = Some(record),
+            _ => (),
         }
-    };
-
-    //let mut dictionary = Dictionary::new(encoding);
+    }
+    let encoding = encoding_record.map(|record| record.0.as_str());
+    let character_code = integer_info_record.map(|record| record.character_code);
+    match get_encoding(encoding, character_code) {
+        Ok(encoding) => Ok(encoding),
+        Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)), // Only fatal case.
+        Err(err) => {
+            warn(Error::EncodingError(err));
+            // Non-fatal: already reported via `warn`; fall back to the default encoding.
+            Ok(default_encoding())
+        }
+    }
+}
 
+pub fn decode(
+    headers: Vec<raw::Record>,
+    encoding: &'static Encoding,
+    warn: &impl Fn(Error),
+) -> Result<(Vec<Record>, Metadata), Error> {
     let mut decoder = Decoder {
-        compression: header.compression,
-        endian: header.endian,
+        raw: raw::Decoder {
+            encoding,
+            warn: Box::new(|error| println!("{error}")),
+        },
         encoding,
         variables: HashMap::new(),
         var_names: HashMap::new(),
+        dictionary: Dictionary::new(encoding),
         n_dict_indexes: 0,
         n_generated_names: 0,
     };
 
+    let h = Headers::new(&headers, &decoder, warn);
+    let Some(header) = h.header else {
+        return Err(Error::MissingHeaderRecord);
+    };
+
     let mut output = Vec::with_capacity(headers.len());
 
     // Decode the records that don't use variables at all.
-    if let Some(header) = HeaderRecord::try_decode(&mut decoder, header, warn)? {
+    if let Some(header) = HeaderRecord::try_decode(&mut decoder, &header, warn)? {
         output.push(Record::Header(header))
     }
-    if let Some(raw) = h.document {
-        if let Some(document) = DocumentRecord::try_decode(&mut decoder, raw, warn)? {
-            output.push(Record::Document(document))
+    for document in h.documents {
+        for line in &document.lines {
+            decoder.dictionary.documents.push(line.to_string())
         }
     }
-    if let Some(raw) = h.integer_info {
-        output.push(Record::IntegerInfo(raw.clone()));
-    }
-    if let Some(raw) = h.float_info {
-        output.push(Record::FloatInfo(raw.clone()));
-    }
-    if let Some(raw) = h.product_info {
-        output.push(Record::ProductInfo(raw.clone()));
+    /*
+            for &raw in &h.file_attributes {
+                let s = decoder.decode_string_cow(&raw.text.0, warn);
+                output.push(Record::FileAttributes(FileAttributeRecord::parse(
+                    &decoder, &s, warn,
+                )?));
+            }
+            for &raw in &h.other_extensions {
+                output.push(Record::OtherExtension(raw.clone()));
     }
-    if let Some(raw) = h.number_of_cases {
-        output.push(Record::NumberOfCases(raw.clone()))
+        */
+    // Decode the variable records, which are the basis of almost everything
+    // else.
+    for raw in &h.variables {
+        parse_variable_record(&mut decoder, raw, warn)?;
     }
     /*
-        for &raw in &h.file_attributes {
-            let s = decoder.decode_string_cow(&raw.text.0, warn);
-            output.push(Record::FileAttributes(FileAttributeRecord::parse(
-                &decoder, &s, warn,
-            )?));
-        }
-        for &raw in &h.other_extensions {
-            output.push(Record::OtherExtension(raw.clone()));
-        }
-        // Decode the variable records, which are the basis of almost everything
-        // else.
-        for &raw in &h.variables {
-            if let Some(variable) = VariableRecord::try_decode(&mut decoder, raw, warn)? {
-                output.push(Record::Variable(variable));
-            }
-        }
-
         // Decode value labels and weight variable.  These use indexes into the
         // variable records, so we need to parse them before those indexes become
         // invalidated by very long string variables.
@@ -413,8 +417,9 @@ pub fn decode(
             let s = decoder.decode_string_cow(&raw.text.0, warn);
             output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
         }
-    */
-    Ok(output)
+     */
+    let metadata = Metadata::decode(&header, h.integer_info, h.product_info, warn);
+    Ok((output, metadata))
 }
 
 impl Decoder {
@@ -532,29 +537,79 @@ fn trim_end_spaces(mut s: String) -> String {
     s
 }
 
+/// Data file info that doesn't fit in [Dictionary].
+pub struct Metadata {
+    creation: NaiveDateTime, // Creation date and time from the file header.
+    endian: Endian, // Copied from the file header.
+    compression: Option<Compression>, // Copied from the file header.
+    n_cases: Option<u64>, // Header's case count, when it has one.
+    product: String, // Header eye-catcher without its "@(#) SPSS DATA FILE" prefix.
+    product_ext: Option<String>, // Contents of the product info record, if any.
+    version: Option<(i32, i32, i32)>, // `version` of the integer info record, if any.
+}
+
+impl Metadata {
+    fn decode(
+        header: &crate::raw::HeaderRecord<Cow<str>>,
+        integer_info: Option<&IntegerInfoRecord>,
+        product_ext: Option<&ProductInfoRecord>,
+        warn: impl Fn(Error),
+    ) -> Self {
+        let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
+            .unwrap_or_else(|_| {
+                warn(Error::InvalidCreationDate {
+                    creation_date: header.creation_date.to_string(),
+                });
+                Default::default()
+            });
+        let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
+            .unwrap_or_else(|_| {
+                warn(Error::InvalidCreationTime {
+                    creation_time: header.creation_time.to_string(),
+                });
+                Default::default()
+            });
+        let creation = NaiveDateTime::new(creation_date, creation_time);
+
+        let product = header
+            .eye_catcher
+            .trim_start_matches("@(#) SPSS DATA FILE")
+            .trim_end()
+            .to_string();
+
+        Self {
+            creation,
+            endian: header.endian,
+            compression: header.compression,
+            n_cases: header.n_cases.map(|n| n as u64),
+            product,
+            product_ext: product_ext.map(|pe| pe.0.clone()),
+            version: integer_info.map(|ii| ii.version),
+        }
+    }
+}
+
 impl TryDecode for HeaderRecord {
-    type Input<'a> = crate::raw::HeaderRecord<RawString>;
+    type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
 
     fn try_decode(
-        decoder: &mut Decoder,
+        _decoder: &mut Decoder,
         input: &Self::Input<'_>,
         warn: impl Fn(Error),
     ) -> Result<Option<Self>, Error> {
-        let eye_catcher = trim_end_spaces(decoder.decode_string(&input.eye_catcher.0, &warn));
-        let file_label = trim_end_spaces(decoder.decode_string(&input.file_label.0, &warn));
-        let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn);
-        let creation_date =
-            NaiveDate::parse_from_str(&creation_date, "%e %b %Y").unwrap_or_else(|_| {
+        let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
+        let file_label = trim_end_spaces(input.file_label.to_string());
+        let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
+            .unwrap_or_else(|_| {
                 warn(Error::InvalidCreationDate {
-                    creation_date: creation_date.into(),
+                    creation_date: input.creation_date.to_string(),
                 });
                 Default::default()
             });
-        let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn);
-        let creation_time =
-            NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
+        let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
+            .unwrap_or_else(|_| {
                 warn(Error::InvalidCreationTime {
-                    creation_time: creation_time.into(),
+                    creation_time: input.creation_time.to_string(),
                 });
                 Default::default()
             });
@@ -621,6 +676,13 @@ impl VarWidth {
     pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
         Self::width_predicate(a, b, |a, b| a.min(b))
     }
+
+    pub fn default_display_width(&self) -> u32 {
+        match self {
+            VarWidth::Numeric => 8,
+            VarWidth::String(width) => *width.min(&32) as u32,
+        }
+    }
 }
 
 impl From<VarWidth> for VarType {
@@ -638,39 +700,10 @@ pub struct VariableRecord {
     pub name: Identifier,
     pub print_format: Spec,
     pub write_format: Spec,
-    pub missing_values: MissingValues,
+    pub missing_values: MissingValues<String>,
     pub label: Option<String>,
 }
 
-#[derive(Clone, Debug)]
-pub struct MissingValues {
-    /// Individual missing values, up to 3 of them.
-    pub values: Vec<Value>,
-
-    /// Optional range of missing values.
-    pub range: Option<(Value, Value)>,
-}
-
-impl Decode<raw::MissingValues<RawStr<8>>> for MissingValues {
-    fn decode(
-        decoder: &Decoder,
-        input: &raw::MissingValues<RawStr<8>>,
-        _warn: impl Fn(Error),
-    ) -> Self {
-        MissingValues {
-            values: input
-                .values
-                .iter()
-                .map(|value| Value::decode(value, decoder))
-                .collect(),
-            range: input
-                .range
-                .as_ref()
-                .map(|(low, high)| (Value::decode(low, decoder), Value::decode(high, decoder))),
-        }
-    }
-}
-
 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
     UncheckedSpec::try_from(raw)
         .and_then(Spec::try_from)
@@ -682,91 +715,84 @@ fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatErro
         })
 }
 
-impl TryDecode for VariableRecord {
-    type Input<'a> = raw::VariableRecord<RawString, RawStr<8>>;
-
-    fn try_decode(
-        decoder: &mut Decoder,
-        input: &Self::Input<'_>,
-        warn: impl Fn(Error),
-    ) -> Result<Option<VariableRecord>, Error> {
-        let width = match input.width {
-            0 => VarWidth::Numeric,
-            w @ 1..=255 => VarWidth::String(w as u16),
-            -1 => return Ok(None),
-            _ => {
-                return Err(Error::InvalidVariableWidth {
-                    offsets: input.offsets.clone(),
-                    width: input.width,
-                })
-            }
-        };
-        let name = trim_end_spaces(decoder.decode_string(&input.name.0, &warn));
-        let name = match Identifier::new(&name, decoder.encoding) {
-            Ok(name) => {
-                if !decoder.var_names.contains_key(&name) {
-                    name
-                } else {
-                    let new_name = decoder.generate_name();
-                    warn(Error::DuplicateVariableName {
-                        duplicate_name: name.clone(),
-                        new_name: new_name.clone(),
-                    });
-                    new_name
-                }
-            }
-            Err(id_error) => {
+fn parse_variable_record( // Adds one decoded variable record to the decoder's dictionary.
+    decoder: &mut Decoder,
+    input: &raw::VariableRecord<Cow<str>, String>,
+    warn: impl Fn(Error),
+) -> Result<(), Error> {
+    let width = match input.width {
+        0 => VarWidth::Numeric,
+        w @ 1..=255 => VarWidth::String(w as u16),
+        -1 => return Ok(()), // Adds no variable; presumably a string continuation (verify).
+        _ => {
+            return Err(Error::InvalidVariableWidth {
+                offsets: input.offsets.clone(),
+                width: input.width,
+            })
+        }
+    };
+    let name = trim_end_spaces(input.name.to_string());
+    let name = match Identifier::new(&name, decoder.encoding) {
+        Ok(name) => {
+            if !decoder.var_names.contains_key(&name) {
+                name
+            } else {
                 let new_name = decoder.generate_name();
-                warn(Error::InvalidVariableName {
-                    id_error,
+                warn(Error::DuplicateVariableName {
+                    duplicate_name: name.clone(),
                     new_name: new_name.clone(),
                 });
                 new_name
             }
-        };
-        let variable = Variable {
-            dict_index: decoder.n_dict_indexes,
-            short_name: name.clone(),
-            long_name: None,
-            width,
-        };
-        decoder.n_dict_indexes += width.n_dict_indexes();
-        assert!(decoder
-            .var_names
-            .insert(name.clone(), variable.dict_index)
-            .is_none());
-        assert!(decoder
-            .variables
-            .insert(variable.dict_index, variable)
-            .is_none());
-
-        let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
-            warn(Error::InvalidPrintFormat {
-                new_spec,
-                variable: name.clone(),
-                format_error,
-            })
-        });
-        let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
-            warn(Error::InvalidWriteFormat {
-                new_spec,
-                variable: name.clone(),
-                format_error,
-            })
-        });
-        let label = input
-            .label
-            .as_ref()
-            .map(|label| decoder.decode_string(&label.0, &warn));
-        Ok(Some(VariableRecord {
-            width,
-            name,
-            print_format,
-            write_format,
-            missing_values: MissingValues::decode(decoder, &input.missing_values, warn),
-            label,
-        }))
+        }
+        Err(id_error) => {
+            let new_name = decoder.generate_name();
+            warn(Error::InvalidVariableName {
+                id_error,
+                new_name: new_name.clone(),
+            });
+            new_name
+        }
+    };
+    let variable = Variable {
+        dict_index: decoder.n_dict_indexes,
+        short_name: name.clone(),
+        long_name: None,
+        width,
+    };
+    decoder.n_dict_indexes += width.n_dict_indexes();
+    assert!(decoder
+        .var_names
+        .insert(name.clone(), variable.dict_index)
+        .is_none());
+    assert!(decoder
+        .variables
+        .insert(variable.dict_index, variable)
+        .is_none());
+
+    let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
+        warn(Error::InvalidPrintFormat {
+            new_spec,
+            variable: name.clone(),
+            format_error,
+        })
+    });
+    let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
+        warn(Error::InvalidWriteFormat {
+            new_spec,
+            variable: name.clone(),
+            format_error,
+        })
+    });
+    let mut variable = dictionary::Variable::new(name, width); // Shadows the earlier `variable`.
+    variable.print_format = print_format;
+    variable.write_format = write_format;
+    variable.missing_values = input.missing_values.clone();
+    if let Some(ref label) = input.label {
+        variable.label = Some(label.to_string());
     }
+    decoder.dictionary.add_var(variable).unwrap(); // NOTE(review): panics if `add_var` fails.
+    Ok(())
 }
 
 #[derive(Clone, Debug)]
@@ -1284,56 +1310,6 @@ impl TryDecode for MultipleResponseRecord {
     }
 }
 
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValues {
-    /// Variable name.
-    pub var_name: Identifier,
-
-    /// Missing values.
-    pub missing_values: MissingValues,
-}
-
-impl LongStringMissingValues {
-    fn decode(
-        decoder: &Decoder,
-        input: &raw::LongStringMissingValues<RawString, RawStr<8>>,
-        warn: &impl Fn(Error),
-    ) -> Result<Self, Error> {
-        let var_name = decoder.decode_string(&input.var_name.0, warn);
-        let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
-            .map_err(Error::InvalidLongStringValueLabelName)?;
-
-        let missing_values = MissingValues::decode(decoder, &input.missing_values, warn);
-
-        Ok(LongStringMissingValues {
-            var_name,
-            missing_values,
-        })
-    }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValuesRecord(Vec<LongStringMissingValues>);
-
-impl TryDecode for LongStringMissingValuesRecord {
-    type Input<'a> = raw::LongStringMissingValueRecord<RawString, RawStr<8>>;
-
-    fn try_decode(
-        decoder: &mut Decoder,
-        input: &Self::Input<'_>,
-        warn: impl Fn(Error),
-    ) -> Result<Option<Self>, Error> {
-        let mut labels = Vec::with_capacity(input.0.len());
-        for label in &input.0 {
-            match LongStringMissingValues::decode(decoder, label, &warn) {
-                Ok(set) => labels.push(set),
-                Err(error) => warn(error),
-            }
-        }
-        Ok(Some(LongStringMissingValuesRecord(labels)))
-    }
-}
-
 #[derive(Clone, Debug)]
 pub struct LongStringValueLabels {
     pub var_name: Identifier,