-use std::{
- borrow::Cow, cell::RefCell, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range,
- rc::Rc,
-};
+use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
use crate::{
- dictionary::{self, Dictionary, VarWidth},
- encoding::{default_encoding, get_encoding, Error as EncodingError},
+ dictionary::{Dictionary, VarWidth},
+ encoding::Error as EncodingError,
endian::Endian,
- format::{Error as FormatError, Spec, UncheckedSpec},
+ format::{Error as FormatError, Spec},
identifier::{Error as IdError, Identifier},
raw::{
- self, LongStringMissingValueRecord, MissingValues, ProductInfoRecord, RawDocumentLine,
- RawStr, RawString, VarDisplayRecord, VarType, DecodedRecord,
+ self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
+ FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
+ LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
+ NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabelRecord, VarDisplayRecord,
+ VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader,
+ ZTrailer,
},
};
use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
-use encoding_rs::{DecoderResult, Encoding};
-use num::integer::div_ceil;
-use ordered_float::OrderedFloat;
+use encoding_rs::Encoding;
use thiserror::Error as ThisError;
pub use crate::raw::{CategoryLabels, Compression};
#[derive(ThisError, Debug)]
pub enum Error {
- // XXX this is really an internal error and maybe we should change the
- // interfaces to make it impossible
#[error("Missing header record")]
MissingHeaderRecord,
+ // XXX this is an internal error
+ #[error("More than one file header record")]
+ DuplicateHeaderRecord,
+
#[error("{0}")]
EncodingError(EncodingError),
TBD,
}
-#[derive(Clone, Debug)]
-pub enum Record {
- Header(HeaderRecord),
- Variable(VariableRecord),
- ValueLabel(ValueLabelRecord),
- Document(DocumentRecord),
- IntegerInfo(IntegerInfoRecord),
- FloatInfo(FloatInfoRecord),
- VariableSets(VariableSetRecord),
- VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord),
- LongStringMissingValues(LongStringMissingValueRecord<String, String>),
- LongStringValueLabels(LongStringValueLabelRecord),
- Encoding(EncodingRecord),
- NumberOfCases(NumberOfCasesRecord),
- ProductInfo(ProductInfoRecord),
- LongNames(LongNameRecord),
- VeryLongStrings(VeryLongStringRecord),
- FileAttributes(FileAttributeRecord),
- VariableAttributes(VariableAttributeRecord),
- OtherExtension(Extension),
- //Case(Vec<Value>),
-}
-
-pub use crate::raw::EncodingRecord;
-pub use crate::raw::Extension;
-pub use crate::raw::FloatInfoRecord;
-pub use crate::raw::IntegerInfoRecord;
-pub use crate::raw::NumberOfCasesRecord;
-
type DictIndex = usize;
pub struct Variable {
n_generated_names: usize,
}
-#[derive(Default)]
-struct Headers<'a> {
- header: Option<raw::HeaderRecord<Cow<'a, str>>>,
- variables: Vec<raw::VariableRecord<Cow<'a, str>, String>>,
- value_labels: Vec<&'a raw::ValueLabelRecord<RawStr<8>, RawString>>,
- documents: Vec<raw::DocumentRecord<Cow<'a, str>>>,
- integer_info: Option<&'a raw::IntegerInfoRecord>,
- float_info: Option<&'a raw::FloatInfoRecord>,
- variable_sets: Vec<&'a raw::VariableSetRecord>,
- var_display: Option<&'a raw::VarDisplayRecord>,
- multiple_response: Vec<&'a raw::MultipleResponseRecord<RawString, RawString>>,
- long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord<RawString>>,
- long_string_missing_values: Vec<raw::LongStringMissingValueRecord<Identifier, String>>,
- encoding: Option<&'a raw::EncodingRecord>,
- number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
- product_info: Option<&'a raw::ProductInfoRecord>,
- long_names: Option<&'a raw::LongNamesRecord>,
- very_long_strings: Vec<&'a raw::VeryLongStringsRecord>,
- file_attributes: Vec<&'a raw::FileAttributeRecord>,
- variable_attributes: Vec<&'a raw::VariableAttributeRecord>,
- other_extensions: Vec<&'a raw::Extension>,
- cases: Option<&'a Rc<RefCell<raw::Cases>>>,
+/// The decoded records of a system file, grouped by record type.
+///
+/// Built by `Headers::new`: `header` is mandatory, each `Option` field keeps
+/// only the first record of its type, and each `Vec` field keeps every record
+/// of its type in the order in which the records were supplied.
+#[derive(Clone, Debug)]
+pub struct Headers {
+ pub header: HeaderRecord<String>,
+ pub variable: Vec<VariableRecord<String, String>>,
+ pub value_label: Vec<ValueLabelRecord<RawStr<8>, String>>,
+ pub document: Vec<DocumentRecord<String>>,
+ pub integer_info: Option<IntegerInfoRecord>,
+ pub float_info: Option<FloatInfoRecord>,
+ pub var_display: Option<VarDisplayRecord>,
+ pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
+ pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
+ pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier, String>>,
+ pub encoding: Option<EncodingRecord>,
+ pub number_of_cases: Option<NumberOfCasesRecord>,
+ pub variable_sets: Vec<VariableSetRecord>,
+ pub product_info: Option<ProductInfoRecord>,
+ pub long_names: Vec<LongNamesRecord>,
+ pub very_long_strings: Vec<VeryLongStringsRecord>,
+ pub file_attributes: Vec<FileAttributeRecord>,
+ pub variable_attributes: Vec<VariableAttributeRecord>,
+ pub other_extension: Vec<Extension>,
+ pub end_of_headers: Option<u32>,
+ pub z_header: Option<ZHeader>,
+ pub z_trailer: Option<ZTrailer>,
+ pub cases: Option<Rc<RefCell<Cases>>>,
}
-fn set_or_warn<T>(option: &mut Option<T>, value: T, warn: &impl Fn(Error)) {
- if option.is_none() {
- let _ = option.insert(value);
- } else {
- warn(Error::TBD);
+/// Returns the first element of `vec`, or `None` if `vec` is empty.
+///
+/// If `vec` holds more than one element, `more_than_one` is called before the
+/// first element is returned; the remaining elements are dropped.
+fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
+where
+ F: FnOnce(),
+{
+ if vec.len() > 1 {
+ more_than_one();
 }
+ vec.drain(..).next()
}
-impl<'a> Headers<'a> {
- fn new(headers: &'a Vec<raw::Record>, decoder: &Decoder, warn: &impl Fn(Error)) -> Headers<'a> {
- let mut h = Headers::default();
+impl Headers {
+ /// Sorts `headers` into a `Headers` value, one field per record type.
+ ///
+ /// Record types stored as `Vec` keep every record. For record types stored
+ /// as `Option`, only the first record is kept and `warn` is called when
+ /// more than one was supplied.
+ ///
+ /// # Errors
+ ///
+ /// Returns `Error::MissingHeaderRecord` if `headers` contains no file
+ /// header record. More than one file header record only produces a
+ /// `Error::DuplicateHeaderRecord` warning.
+ pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
+ // Collect every record of each type first; the types that are stored as
+ // a single record are reduced with `take_first` after the loop.
+ let mut file_header = Vec::new();
+ let mut variable = Vec::new();
+ let mut value_label = Vec::new();
+ let mut document = Vec::new();
+ let mut integer_info = Vec::new();
+ let mut float_info = Vec::new();
+ let mut var_display = Vec::new();
+ let mut multiple_response = Vec::new();
+ let mut long_string_value_labels = Vec::new();
+ let mut long_string_missing_values = Vec::new();
+ let mut encoding = Vec::new();
+ let mut number_of_cases = Vec::new();
+ let mut variable_sets = Vec::new();
+ let mut product_info = Vec::new();
+ let mut long_names = Vec::new();
+ let mut very_long_strings = Vec::new();
+ let mut file_attributes = Vec::new();
+ let mut variable_attributes = Vec::new();
+ let mut other_extension = Vec::new();
+ let mut end_of_headers = Vec::new();
+ let mut z_header = Vec::new();
+ let mut z_trailer = Vec::new();
+ let mut cases = Vec::new();
+
 for header in headers {
 match header {
- raw::Record::Header(record) => {
- set_or_warn(&mut h.header, record.decode(&decoder.raw), warn)
+ DecodedRecord::Header(record) => {
+ file_header.push(record);
+ }
+ DecodedRecord::Variable(record) => {
+ variable.push(record);
+ }
+ DecodedRecord::ValueLabel(record) => {
+ value_label.push(record);
+ }
+ DecodedRecord::Document(record) => {
+ document.push(record);
+ }
+ DecodedRecord::IntegerInfo(record) => {
+ integer_info.push(record);
+ }
+ DecodedRecord::FloatInfo(record) => {
+ float_info.push(record);
+ }
+ DecodedRecord::VariableSets(record) => {
+ variable_sets.push(record);
+ }
+ DecodedRecord::VarDisplay(record) => {
+ var_display.push(record);
+ }
+ DecodedRecord::MultipleResponse(record) => {
+ multiple_response.push(record);
+ }
+ DecodedRecord::LongStringValueLabels(record) => {
+ long_string_value_labels.push(record)
+ }
+ DecodedRecord::LongStringMissingValues(record) => {
+ long_string_missing_values.push(record);
+ }
+ DecodedRecord::Encoding(record) => {
+ encoding.push(record);
+ }
+ DecodedRecord::NumberOfCases(record) => {
+ number_of_cases.push(record);
+ }
+ DecodedRecord::ProductInfo(record) => {
+ product_info.push(record);
+ }
+ DecodedRecord::LongNames(record) => {
+ long_names.push(record);
+ }
+ DecodedRecord::VeryLongStrings(record) => {
+ very_long_strings.push(record);
+ }
+ DecodedRecord::FileAttributes(record) => {
+ file_attributes.push(record);
 }
- raw::Record::Variable(record) => h.variables.push(record.decode(&decoder.raw)),
- raw::Record::ValueLabel(record) => h.value_labels.push(record),
- raw::Record::Document(record) => h.documents.push(record.decode(&decoder.raw)),
- raw::Record::IntegerInfo(record) => set_or_warn(&mut h.integer_info, record, warn),
- raw::Record::FloatInfo(record) => set_or_warn(&mut h.float_info, record, warn),
- raw::Record::VariableSets(record) => h.variable_sets.push(record),
- raw::Record::VarDisplay(record) => set_or_warn(&mut h.var_display, record, warn),
- raw::Record::MultipleResponse(record) => h.multiple_response.push(record),
- raw::Record::LongStringValueLabels(record) => {
- h.long_string_value_labels.push(record)
+ DecodedRecord::VariableAttributes(record) => {
+ variable_attributes.push(record);
 }
- raw::Record::LongStringMissingValues(record) => h
- .long_string_missing_values
- .push(record.decode(&decoder.raw)),
- raw::Record::Encoding(record) => set_or_warn(&mut h.encoding, record, warn),
- raw::Record::NumberOfCases(record) => {
- set_or_warn(&mut h.number_of_cases, record, warn)
+ DecodedRecord::OtherExtension(record) => {
+ other_extension.push(record);
+ }
+ DecodedRecord::EndOfHeaders(record) => {
+ end_of_headers.push(record);
+ }
+ DecodedRecord::ZHeader(record) => {
+ z_header.push(record);
+ }
+ DecodedRecord::ZTrailer(record) => {
+ z_trailer.push(record);
+ }
+ DecodedRecord::Cases(record) => {
+ cases.push(record);
 }
- raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn),
- raw::Record::LongNames(record) => set_or_warn(&mut h.long_names, record, warn),
- raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record),
- raw::Record::FileAttributes(record) => h.file_attributes.push(record),
- raw::Record::VariableAttributes(record) => h.variable_attributes.push(record),
- raw::Record::OtherExtension(record) => h.other_extensions.push(record),
- raw::Record::EndOfHeaders(_) => (),
- raw::Record::ZHeader(_) => (),
- raw::Record::ZTrailer(_) => (),
- raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn),
- raw::Record::Text(_) => todo!(),
 }
 }
- h
+
+ // The file header is the only record whose absence is an error; a
+ // duplicate is reported through `warn` and the first one is used.
+ let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
+ else {
+ return Err(Error::MissingHeaderRecord);
+ };
+
+ // Every other single-instance record type keeps its first record and
+ // reports extras with the placeholder `Error::TBD`.
+ Ok(Headers {
+ header: file_header,
+ variable,
+ value_label,
+ document,
+ integer_info: take_first(integer_info, || warn(Error::TBD)),
+ float_info: take_first(float_info, || warn(Error::TBD)),
+ var_display: take_first(var_display, || warn(Error::TBD)),
+ multiple_response,
+ long_string_value_labels,
+ long_string_missing_values,
+ encoding: take_first(encoding, || warn(Error::TBD)),
+ number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
+ variable_sets,
+ product_info: take_first(product_info, || warn(Error::TBD)),
+ long_names,
+ very_long_strings,
+ file_attributes,
+ variable_attributes,
+ other_extension,
+ end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
+ z_header: take_first(z_header, || warn(Error::TBD)),
+ z_trailer: take_first(z_trailer, || warn(Error::TBD)),
+ cases: take_first(cases, || warn(Error::TBD)),
+ })
 }
}
+/// File-level metadata gathered from the headers by `Metadata::decode`.
+pub struct Metadata {
+ /// Creation date and time parsed from the file header.
+ creation: NaiveDateTime,
+ /// Copied from the file header.
+ endian: Endian,
+ /// Copied from the file header.
+ compression: Option<Compression>,
+ /// Number of cases from the file header, if the header records one.
+ n_cases: Option<u64>,
+ /// The file header's eye-catcher with the `@(#) SPSS DATA FILE` prefix
+ /// and trailing whitespace removed.
+ product: String,
+ /// Text of the product info record, if present, with line ends
+ /// normalized to LF.
+ product_ext: Option<String>,
+ /// `version` field of the integer info record, if present.
+ version: Option<(i32, i32, i32)>,
+}
-pub fn decode(
- headers: Vec<DecodedRecord>,
- decoder: raw::Decoder,
-) -> Result<(Vec<Record>, Metadata), Error> {
- let dictionary = Dictionary::new(decoder.encoding);
- let mut decoder = Decoder {
- raw: decoder,
- variables: HashMap::new(),
- var_names: HashMap::new(),
- dictionary,
- n_dict_indexes: 0,
- n_generated_names: 0,
- };
-
- let h = Headers::new(&headers, &decoder);
- let Some(header) = h.header else {
- return Err(Error::MissingHeaderRecord);
- };
+impl Metadata {
+ /// Builds a `Metadata` from the file header plus the optional product
+ /// info and integer info records in `headers`.
+ ///
+ /// A creation date or time that cannot be parsed is reported through
+ /// `warn` and replaced by that type's `Default` value, so this function
+ /// never fails.
+ fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
+ let header = &headers.header;
+ // Expected form: day of month, abbreviated month name, four-digit year.
+ let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationDate {
+ creation_date: header.creation_date.to_string(),
+ });
+ Default::default()
+ });
+ let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationTime {
+ creation_time: header.creation_time.to_string(),
+ });
+ Default::default()
+ });
+ let creation = NaiveDateTime::new(creation_date, creation_time);
- let mut output = Vec::with_capacity(headers.len());
+ // NOTE(review): `trim_start_matches` removes every leading repetition of
+ // the prefix, and only trailing whitespace is trimmed afterwards, so a
+ // space that follows the prefix stays at the start of `product`.
+ // Confirm this is intended (`strip_prefix` + `trim` may be what is meant).
+ let product = header
+ .eye_catcher
+ .trim_start_matches("@(#) SPSS DATA FILE")
+ .trim_end()
+ .to_string();
- // Decode the records that don't use variables at all.
- if let Some(header) = HeaderRecord::try_decode(&mut decoder, &header, warn)? {
- output.push(Record::Header(header))
- }
- for document in h.documents {
- for line in &document.lines {
- decoder.dictionary.documents.push(line.to_string())
+ Self {
+ creation,
+ endian: header.endian,
+ compression: header.compression,
+ n_cases: header.n_cases.map(|n| n as u64),
+ product,
+ product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
+ version: headers.integer_info.as_ref().map(|ii| ii.version),
 }
 }
- /*
- for &raw in &h.file_attributes {
- let s = decoder.decode_string_cow(&raw.text.0, warn);
- output.push(Record::FileAttributes(FileAttributeRecord::parse(
- &decoder, &s, warn,
- )?));
- }
- for &raw in &h.other_extensions {
- output.push(Record::OtherExtension(raw.clone()));
+}
+
+/// Builds a `Dictionary` (created with `encoding`) and a `Metadata` from
+/// `headers`.
+///
+/// So far only the file label, the file attributes and the document lines are
+/// transferred into the dictionary; the variable-related decoding below is
+/// still commented out. `warn` is handed on to `Metadata::decode`.
+///
+/// # Errors
+///
+/// No path in the current body returns `Err`.
+pub fn decode(
+ mut headers: Headers,
+ encoding: &'static Encoding,
+ warn: impl Fn(Error),
+) -> Result<(Dictionary, Metadata), Error> {
+ let mut dictionary = Dictionary::new(encoding);
+
+ // Drop trailing spaces and normalize line ends; a label that is empty
+ // after that is not stored.
+ let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
+ if !file_label.is_empty() {
+ dictionary.file_label = Some(file_label);
 }
- */
- // Decode the variable records, which are the basis of almost everything
- // else.
- for raw in &h.variables {
- parse_variable_record(&mut decoder, raw, warn)?;
+
+ // Merge the attributes of every file attributes record into the dictionary.
+ for attributes in headers.file_attributes.drain(..) {
+ dictionary.attributes.extend(attributes.0.0.into_iter())
 }
- /*
- // Decode value labels and weight variable. These use indexes into the
- // variable records, so we need to parse them before those indexes become
- // invalidated by very long string variables.
- for &raw in &h.value_labels {
- if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
- output.push(Record::ValueLabel(value_label));
- }
- }
- // XXX weight
- if let Some(raw) = h.var_display {
- output.push(Record::VarDisplay(raw.clone()));
- }
- // Decode records that use short names.
- for &raw in &h.multiple_response {
- if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, raw, warn)? {
- output.push(Record::MultipleResponse(mrr))
- }
- }
- for &raw in &h.very_long_strings {
- let s = decoder.decode_string_cow(&raw.text.0, warn);
- output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
- &decoder, &s, warn,
- )?));
- }
+ // Concatenate all the document records (really there should only be one)
+ // and trim off the trailing spaces that pad them to 80 bytes.
+ dictionary.documents = headers
+ .document
+ .drain(..)
+ .flat_map(|record| record.lines)
+ .map(trim_end_spaces)
+ .collect();
- // Rename variables to their long names.
- for &raw in &h.long_names {
- let s = decoder.decode_string_cow(&raw.text.0, warn);
- output.push(Record::LongNames(LongNameRecord::parse(
- &mut decoder,
- &s,
- warn,
- )?));
- }
+ // XXX warn for weird integer format
+ // XXX warn for weird floating-point format, etc.
- // Decode recods that use long names.
- for &raw in &h.variable_attributes {
- let s = decoder.decode_string_cow(&raw.text.0, warn);
- output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
- &decoder, &s, warn,
- )?));
- }
- for &raw in &h.long_string_value_labels {
- if let Some(mrr) = LongStringValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
- output.push(Record::LongStringValueLabels(mrr))
- }
- }
- for &raw in &h.long_string_missing_values {
- if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, raw, warn)? {
- output.push(Record::LongStringMissingValues(mrr))
+ /*
+ let mut decoder = Decoder {
+ raw: decoder,
+ variables: HashMap::new(),
+ var_names: HashMap::new(),
+ dictionary,
+ n_dict_indexes: 0,
+ n_generated_names: 0,
+ };
+ */
+ let metadata = Metadata::decode(&headers, warn);
+ Ok((dictionary, metadata))
+}
+
+/// Removes trailing space characters (`' '` only, not other whitespace) from
+/// `s` and returns it.
+///
+/// The string is shortened in place with `truncate` rather than copied.
+fn trim_end_spaces(mut s: String) -> String {
+ s.truncate(s.trim_end_matches(' ').len());
+ s
+}
+
+/// Returns a copy of `s` in which all lone CR and CR LF pairs have been
+/// replaced by LF.
+///
+/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
+/// files that use CR-only line ends in the file label and extra product
+/// info.)
+fn fix_line_ends(s: &str) -> String {
+ let mut out = String::with_capacity(s.len());
+ let mut s = s.chars().peekable();
+ while let Some(c) = s.next() {
+ match c {
+ '\r' => {
+ // Swallow the LF of a CR LF pair so the pair yields a single LF.
+ s.next_if_eq(&'\n');
+ out.push('\n')
 }
+ c => out.push(c),
 }
- for &raw in &h.variable_sets {
- let s = decoder.decode_string_cow(&raw.text.0, warn);
- output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
- }
- */
- let metadata = Metadata::decode(&header, h.integer_info, h.product_info, warn);
- Ok((output, metadata))
+ }
+ out
}
+/*
impl Decoder {
fn generate_name(&mut self) -> Identifier {
loop {
decoder.decode_string(&input.0, &warn)
}
}
-
+*/
+/*
#[derive(Clone, Debug)]
pub struct HeaderRecord {
pub eye_catcher: String,
assert_eq!(&charset[..], &encoded[..]);
}
}
+*/