From: Ben Pfaff Date: Sun, 25 Feb 2024 20:01:10 +0000 (-0800) Subject: work on new decoder - about to start decoding variables X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cbcfea4612b66e57816d40d9e23c8fa302ff19ad;p=pspp work on new decoder - about to start decoding variables --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 78eaf138c2..5d32c91ced 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -1,34 +1,35 @@ -use std::{ - borrow::Cow, cell::RefCell, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range, - rc::Rc, -}; +use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; use crate::{ - dictionary::{self, Dictionary, VarWidth}, - encoding::{default_encoding, get_encoding, Error as EncodingError}, + dictionary::{Dictionary, VarWidth}, + encoding::Error as EncodingError, endian::Endian, - format::{Error as FormatError, Spec, UncheckedSpec}, + format::{Error as FormatError, Spec}, identifier::{Error as IdError, Identifier}, raw::{ - self, LongStringMissingValueRecord, MissingValues, ProductInfoRecord, RawDocumentLine, - RawStr, RawString, VarDisplayRecord, VarType, DecodedRecord, + self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, + FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord, + LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord, + NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabelRecord, VarDisplayRecord, + VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader, + ZTrailer, }, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; -use encoding_rs::{DecoderResult, Encoding}; -use num::integer::div_ceil; -use ordered_float::OrderedFloat; +use encoding_rs::Encoding; use thiserror::Error as ThisError; pub use crate::raw::{CategoryLabels, Compression}; #[derive(ThisError, Debug)] pub enum Error { - // XXX this is really an internal error and maybe we should change the - // interfaces to make it impossible #[error("Missing header record")] MissingHeaderRecord, + // XXX this is an internal error + #[error("More than one file header record")] + DuplicateHeaderRecord, + #[error("{0}")] EncodingError(EncodingError), @@ -159,36 +160,6 @@ pub enum Error { TBD, } -#[derive(Clone, Debug)] -pub enum Record { - Header(HeaderRecord), - Variable(VariableRecord), - ValueLabel(ValueLabelRecord), - Document(DocumentRecord), - IntegerInfo(IntegerInfoRecord), - FloatInfo(FloatInfoRecord), - VariableSets(VariableSetRecord), - VarDisplay(VarDisplayRecord), - MultipleResponse(MultipleResponseRecord), - LongStringMissingValues(LongStringMissingValueRecord), - LongStringValueLabels(LongStringValueLabelRecord), - Encoding(EncodingRecord), - NumberOfCases(NumberOfCasesRecord), - ProductInfo(ProductInfoRecord), - LongNames(LongNameRecord), - VeryLongStrings(VeryLongStringRecord), - FileAttributes(FileAttributeRecord), - VariableAttributes(VariableAttributeRecord), - OtherExtension(Extension), - //Case(Vec), -} - -pub use crate::raw::EncodingRecord; -pub use crate::raw::Extension; -pub use crate::raw::FloatInfoRecord; -pub use crate::raw::IntegerInfoRecord; -pub use crate::raw::NumberOfCasesRecord; - type DictIndex = usize; pub struct Variable { @@ -208,191 +179,292 @@ pub struct Decoder { n_generated_names: usize, } -#[derive(Default)] -struct Headers<'a> { - header: Option>>, - variables: Vec, String>>, - value_labels: Vec<&'a raw::ValueLabelRecord, RawString>>, - documents: Vec>>, - integer_info: Option<&'a raw::IntegerInfoRecord>, - float_info: Option<&'a raw::FloatInfoRecord>, - variable_sets: Vec<&'a raw::VariableSetRecord>, - var_display: Option<&'a raw::VarDisplayRecord>, - multiple_response: Vec<&'a raw::MultipleResponseRecord>, - long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord>, - long_string_missing_values: Vec>, - encoding: Option<&'a raw::EncodingRecord>, - number_of_cases: Option<&'a raw::NumberOfCasesRecord>, - product_info: Option<&'a raw::ProductInfoRecord>, - long_names: Option<&'a raw::LongNamesRecord>, - very_long_strings: Vec<&'a raw::VeryLongStringsRecord>, - file_attributes: Vec<&'a raw::FileAttributeRecord>, - variable_attributes: Vec<&'a raw::VariableAttributeRecord>, - other_extensions: Vec<&'a raw::Extension>, - cases: Option<&'a Rc>>, +#[derive(Clone, Debug)] +pub struct Headers { + pub header: HeaderRecord, + pub variable: Vec>, + pub value_label: Vec, String>>, + pub document: Vec>, + pub integer_info: Option, + pub float_info: Option, + pub var_display: Option, + pub multiple_response: Vec>, + pub long_string_value_labels: Vec>, + pub long_string_missing_values: Vec>, + pub encoding: Option, + pub number_of_cases: Option, + pub variable_sets: Vec, + pub product_info: Option, + pub long_names: Vec, + pub very_long_strings: Vec, + pub file_attributes: Vec, + pub variable_attributes: Vec, + pub other_extension: Vec, + pub end_of_headers: Option, + pub z_header: Option, + pub z_trailer: Option, + pub cases: Option>>, } -fn set_or_warn(option: &mut Option, value: T, warn: &impl Fn(Error)) { - if option.is_none() { - let _ = option.insert(value); - } else { - warn(Error::TBD); +fn take_first(mut vec: Vec, more_than_one: F) -> Option +where + F: FnOnce(), +{ + if vec.len() > 1 { + more_than_one(); } + vec.drain(..).next() } -impl<'a> Headers<'a> { - fn new(headers: &'a Vec, decoder: &Decoder, warn: &impl Fn(Error)) -> Headers<'a> { - let mut h = Headers::default(); +impl Headers { + pub fn new(headers: Vec, warn: &impl Fn(Error)) -> Result { + let mut file_header = Vec::new(); + let mut variable = Vec::new(); + let mut value_label = Vec::new(); + let mut document = Vec::new(); + let mut integer_info = Vec::new(); + let mut float_info = Vec::new(); + let mut var_display = Vec::new(); + let mut multiple_response = Vec::new(); + let mut long_string_value_labels = Vec::new(); + let mut long_string_missing_values = Vec::new(); + let mut encoding = Vec::new(); + let mut number_of_cases = Vec::new(); + let mut variable_sets = Vec::new(); + let mut product_info = Vec::new(); + let mut long_names = Vec::new(); + let mut very_long_strings = Vec::new(); + let mut file_attributes = Vec::new(); + let mut variable_attributes = Vec::new(); + let mut other_extension = Vec::new(); + let mut end_of_headers = Vec::new(); + let mut z_header = Vec::new(); + let mut z_trailer = Vec::new(); + let mut cases = Vec::new(); + for header in headers { match header { - raw::Record::Header(record) => { - set_or_warn(&mut h.header, record.decode(&decoder.raw), warn) + DecodedRecord::Header(record) => { + file_header.push(record); + } + DecodedRecord::Variable(record) => { + variable.push(record); + } + DecodedRecord::ValueLabel(record) => { + value_label.push(record); + } + DecodedRecord::Document(record) => { + document.push(record); + } + DecodedRecord::IntegerInfo(record) => { + integer_info.push(record); + } + DecodedRecord::FloatInfo(record) => { + float_info.push(record); + } + DecodedRecord::VariableSets(record) => { + variable_sets.push(record); + } + DecodedRecord::VarDisplay(record) => { + var_display.push(record); + } + DecodedRecord::MultipleResponse(record) => { + multiple_response.push(record); + } + DecodedRecord::LongStringValueLabels(record) => { + long_string_value_labels.push(record) + } + DecodedRecord::LongStringMissingValues(record) => { + long_string_missing_values.push(record); + } + DecodedRecord::Encoding(record) => { + encoding.push(record); + } + DecodedRecord::NumberOfCases(record) => { + number_of_cases.push(record); + } + DecodedRecord::ProductInfo(record) => { + product_info.push(record); + } + DecodedRecord::LongNames(record) => { + long_names.push(record); + } + DecodedRecord::VeryLongStrings(record) => { + very_long_strings.push(record); + } + DecodedRecord::FileAttributes(record) => { + file_attributes.push(record); } - raw::Record::Variable(record) => h.variables.push(record.decode(&decoder.raw)), - raw::Record::ValueLabel(record) => h.value_labels.push(record), - raw::Record::Document(record) => h.documents.push(record.decode(&decoder.raw)), - raw::Record::IntegerInfo(record) => set_or_warn(&mut h.integer_info, record, warn), - raw::Record::FloatInfo(record) => set_or_warn(&mut h.float_info, record, warn), - raw::Record::VariableSets(record) => h.variable_sets.push(record), - raw::Record::VarDisplay(record) => set_or_warn(&mut h.var_display, record, warn), - raw::Record::MultipleResponse(record) => h.multiple_response.push(record), - raw::Record::LongStringValueLabels(record) => { - h.long_string_value_labels.push(record) + DecodedRecord::VariableAttributes(record) => { + variable_attributes.push(record); } - raw::Record::LongStringMissingValues(record) => h - .long_string_missing_values - .push(record.decode(&decoder.raw)), - raw::Record::Encoding(record) => set_or_warn(&mut h.encoding, record, warn), - raw::Record::NumberOfCases(record) => { - set_or_warn(&mut h.number_of_cases, record, warn) + DecodedRecord::OtherExtension(record) => { + other_extension.push(record); + } + DecodedRecord::EndOfHeaders(record) => { + end_of_headers.push(record); + } + DecodedRecord::ZHeader(record) => { + z_header.push(record); + } + DecodedRecord::ZTrailer(record) => { + z_trailer.push(record); + } + DecodedRecord::Cases(record) => { + cases.push(record); } - raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn), - raw::Record::LongNames(record) => set_or_warn(&mut h.long_names, record, warn), - raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record), - raw::Record::FileAttributes(record) => h.file_attributes.push(record), - raw::Record::VariableAttributes(record) => h.variable_attributes.push(record), - raw::Record::OtherExtension(record) => h.other_extensions.push(record), - raw::Record::EndOfHeaders(_) => (), - raw::Record::ZHeader(_) => (), - raw::Record::ZTrailer(_) => (), - raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn), - raw::Record::Text(_) => todo!(), } } - h + + let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord)) + else { + return Err(Error::MissingHeaderRecord); + }; + + Ok(Headers { + header: file_header, + variable, + value_label, + document, + integer_info: take_first(integer_info, || warn(Error::TBD)), + float_info: take_first(float_info, || warn(Error::TBD)), + var_display: take_first(var_display, || warn(Error::TBD)), + multiple_response, + long_string_value_labels, + long_string_missing_values, + encoding: take_first(encoding, || warn(Error::TBD)), + number_of_cases: take_first(number_of_cases, || warn(Error::TBD)), + variable_sets, + product_info: take_first(product_info, || warn(Error::TBD)), + long_names, + very_long_strings, + file_attributes, + variable_attributes, + other_extension, + end_of_headers: take_first(end_of_headers, || warn(Error::TBD)), + z_header: take_first(z_header, || warn(Error::TBD)), + z_trailer: take_first(z_trailer, || warn(Error::TBD)), + cases: take_first(cases, || warn(Error::TBD)), + }) } } +pub struct Metadata { + creation: NaiveDateTime, + endian: Endian, + compression: Option, + n_cases: Option, + product: String, + product_ext: Option, + version: Option<(i32, i32, i32)>, +} -pub fn decode( - headers: Vec, - decoder: raw::Decoder, -) -> Result<(Vec, Metadata), Error> { - let dictionary = Dictionary::new(decoder.encoding); - let mut decoder = Decoder { - raw: decoder, - variables: HashMap::new(), - var_names: HashMap::new(), - dictionary, - n_dict_indexes: 0, - n_generated_names: 0, - }; - - let h = Headers::new(&headers, &decoder); - let Some(header) = h.header else { - return Err(Error::MissingHeaderRecord); - }; +impl Metadata { + fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self { + let header = &headers.header; + let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationDate { + creation_date: header.creation_date.to_string(), + }); + Default::default() + }); + let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationTime { + creation_time: header.creation_time.to_string(), + }); + Default::default() + }); + let creation = NaiveDateTime::new(creation_date, creation_time); - let mut output = Vec::with_capacity(headers.len()); + let product = header + .eye_catcher + .trim_start_matches("@(#) SPSS DATA FILE") + .trim_end() + .to_string(); - // Decode the records that don't use variables at all. - if let Some(header) = HeaderRecord::try_decode(&mut decoder, &header, warn)? { - output.push(Record::Header(header)) - } - for document in h.documents { - for line in &document.lines { - decoder.dictionary.documents.push(line.to_string()) + Self { + creation, + endian: header.endian, + compression: header.compression, + n_cases: header.n_cases.map(|n| n as u64), + product, + product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)), + version: headers.integer_info.as_ref().map(|ii| ii.version), } } - /* - for &raw in &h.file_attributes { - let s = decoder.decode_string_cow(&raw.text.0, warn); - output.push(Record::FileAttributes(FileAttributeRecord::parse( - &decoder, &s, warn, - )?)); - } - for &raw in &h.other_extensions { - output.push(Record::OtherExtension(raw.clone())); +} + +pub fn decode( + mut headers: Headers, + encoding: &'static Encoding, + warn: impl Fn(Error), +) -> Result<(Dictionary, Metadata), Error> { + let mut dictionary = Dictionary::new(encoding); + + let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' ')); + if !file_label.is_empty() { + dictionary.file_label = Some(file_label); } - */ - // Decode the variable records, which are the basis of almost everything - // else. - for raw in &h.variables { - parse_variable_record(&mut decoder, raw, warn)?; + + for attributes in headers.file_attributes.drain(..) { + dictionary.attributes.extend(attributes.0.0.into_iter()) } - /* - // Decode value labels and weight variable. These use indexes into the - // variable records, so we need to parse them before those indexes become - // invalidated by very long string variables. - for &raw in &h.value_labels { - if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, raw, warn)? { - output.push(Record::ValueLabel(value_label)); - } - } - // XXX weight - if let Some(raw) = h.var_display { - output.push(Record::VarDisplay(raw.clone())); - } - // Decode records that use short names. - for &raw in &h.multiple_response { - if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, raw, warn)? { - output.push(Record::MultipleResponse(mrr)) - } - } - for &raw in &h.very_long_strings { - let s = decoder.decode_string_cow(&raw.text.0, warn); - output.push(Record::VeryLongStrings(VeryLongStringRecord::parse( - &decoder, &s, warn, - )?)); - } + // Concatenate all the document records (really there should only be one) + // and trim off the trailing spaces that pad them to 80 bytes. + dictionary.documents = headers + .document + .drain(..) + .flat_map(|record| record.lines) + .map(trim_end_spaces) + .collect(); - // Rename variables to their long names. - for &raw in &h.long_names { - let s = decoder.decode_string_cow(&raw.text.0, warn); - output.push(Record::LongNames(LongNameRecord::parse( - &mut decoder, - &s, - warn, - )?)); - } + // XXX warn for weird integer format + // XXX warn for weird floating-point format, etc. - // Decode recods that use long names. - for &raw in &h.variable_attributes { - let s = decoder.decode_string_cow(&raw.text.0, warn); - output.push(Record::VariableAttributes(VariableAttributeRecord::parse( - &decoder, &s, warn, - )?)); - } - for &raw in &h.long_string_value_labels { - if let Some(mrr) = LongStringValueLabelRecord::try_decode(&mut decoder, raw, warn)? { - output.push(Record::LongStringValueLabels(mrr)) - } - } - for &raw in &h.long_string_missing_values { - if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, raw, warn)? { - output.push(Record::LongStringMissingValues(mrr)) + /* + let mut decoder = Decoder { + raw: decoder, + variables: HashMap::new(), + var_names: HashMap::new(), + dictionary, + n_dict_indexes: 0, + n_generated_names: 0, + }; + */ + let metadata = Metadata::decode(&headers, warn); + Ok((dictionary, metadata)) +} + +fn trim_end_spaces(mut s: String) -> String { + s.truncate(s.trim_end_matches(' ').len()); + s +} + +/// Returns a copy of `s` in which all lone CR and CR LF pairs have been +/// replaced by LF. +/// +/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system +/// files that use CR-only line ends in the file label and extra product +/// info.) */ +fn fix_line_ends(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut s = s.chars().peekable(); + while let Some(c) = s.next() { + match c { + '\r' => { + s.next_if_eq(&'\n'); + out.push('\n') } + c => out.push(c), } - for &raw in &h.variable_sets { - let s = decoder.decode_string_cow(&raw.text.0, warn); - output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?)); - } - */ - let metadata = Metadata::decode(&header, h.integer_info, h.product_info, warn); - Ok((output, metadata)) + } + out } +/* impl Decoder { fn generate_name(&mut self) -> Identifier { loop { @@ -493,7 +565,8 @@ impl Decode> for String { decoder.decode_string(&input.0, &warn) } } - +*/ +/* #[derive(Clone, Debug)] pub struct HeaderRecord { pub eye_catcher: String, @@ -1285,3 +1358,4 @@ mod test { assert_eq!(&charset[..], &encoded[..]); } } +*/ diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 7f0ec2aadd..f8e880c14e 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,4 +1,4 @@ -//pub mod cooked; +pub mod cooked; pub mod dictionary; pub mod encoding; pub mod endian; diff --git a/rust/src/raw.rs b/rust/src/raw.rs index bbe2b46d36..0620d4eea6 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1621,7 +1621,8 @@ where { pub offsets: Range, - /// The document, as an array of 80-byte lines. + /// The document, as an array of lines. Raw lines are exactly 80 bytes long + /// and are right-padded with spaces without any new-line termination. pub lines: Vec, } @@ -2346,7 +2347,7 @@ impl AttributeSet { } #[derive(Clone, Debug, Default)] -pub struct FileAttributeRecord(AttributeSet); +pub struct FileAttributeRecord(pub AttributeSet); impl FileAttributeRecord { fn decode(source: &TextRecord, decoder: &Decoder) -> Self {