X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Fcooked.rs;h=d00f3f3c34f7a6087570c6315e98b0539f82c2ab;hb=e0cbdf0daefcca81be9572aab0deedf945687f5a;hp=93fe21f2bd39edbb0963ea35c94445439dad530b;hpb=4a37f273df6dc6a1ed9788986ce7d9908445a458;p=pspp diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 93fe21f2bd..d00f3f3c34 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -1,23 +1,38 @@ -use std::{borrow::Cow, collections::{HashSet, HashMap}, cmp::Ordering}; +use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range}; -use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; -use encoding_rs::Encoding; -use num::integer::div_ceil; use crate::{ - format::{Spec, UncheckedSpec}, + encoding::{default_encoding, get_encoding, Error as EncodingError}, + endian::Endian, + format::{Error as FormatError, Spec, UncheckedSpec}, identifier::{Error as IdError, Identifier}, - raw::{self, MissingValues, VarType}, - {endian::Endian, Compression}, + raw::{self, UnencodedStr, VarType}, }; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use encoding_rs::{DecoderResult, Encoding}; +use num::integer::div_ceil; +use ordered_float::OrderedFloat; use thiserror::Error as ThisError; +pub use crate::raw::{CategoryLabels, Compression}; + #[derive(ThisError, Debug)] pub enum Error { - #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] - BadVariableWidth { offset: u64, width: i32 }, + // XXX this is really an internal error and maybe we should change the + // interfaces to make it impossible + #[error("Missing header record")] + MissingHeaderRecord, + + #[error("{0}")] + EncodingError(EncodingError), + + #[error("Using default encoding {0}.")] + UsingDefaultEncoding(String), + + #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)] + InvalidVariableWidth { offsets: Range, width: i32 }, #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] - BadLongMissingValueFormat, + InvalidLongMissingValueFormat, #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")] InvalidCreationDate { creation_date: String }, @@ -25,65 +40,430 @@ pub enum Error { #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")] InvalidCreationTime { creation_time: String }, - #[error("Invalid variable name: {0}")] - BadIdentifier(#[from] IdError), + #[error("{id_error} Renaming variable to {new_name}.")] + InvalidVariableName { + id_error: IdError, + new_name: Identifier, + }, + + #[error( + "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}" + )] + InvalidPrintFormat { + new_spec: Spec, + variable: Identifier, + format_error: FormatError, + }, + + #[error( + "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}" + )] + InvalidWriteFormat { + new_spec: Spec, + variable: Identifier, + format_error: FormatError, + }, + + #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")] + DuplicateVariableName { + duplicate_name: Identifier, + new_name: Identifier, + }, + + #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")] + InvalidDictIndex { dict_index: usize, max_index: usize }, + + #[error("Dictionary index {0} refers to a long string continuation.")] + DictIndexIsContinuation(usize), + + #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")] + ValueLabelsDifferentTypes { + numeric_var: Identifier, + string_var: Identifier, + }, + + #[error( + "Value labels may not be added to long string variable {0} using record types 3 or 4." + )] + InvalidLongStringValueLabel(Identifier), + + #[error("Invalid multiple response set name. {0}")] + InvalidMrSetName(IdError), + + #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")] + UnknownMrSetVariable { + mr_set: Identifier, + short_name: Identifier, + }, + + #[error("Multiple response set {0} has no variables.")] + EmptyMrSet(Identifier), + + #[error("Multiple response set {0} has only one variable.")] + OneVarMrSet(Identifier), + + #[error("Multiple response set {0} contains both string and numeric variables.")] + MixedMrSet(Identifier), + + #[error( + "Invalid numeric format for counted value {number} in multiple response set {mr_set}." + )] + InvalidMDGroupCountedValue { mr_set: Identifier, number: String }, + + #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] + TooWideMDGroupCountedValue { + mr_set: Identifier, + value: String, + width: usize, + max_width: u16, + }, + + #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")] + InvalidLongValueLabelWidth { + name: Identifier, + width: u32, + min_width: u16, + max_width: u16, + }, + + #[error("Invalid attribute name. {0}")] + InvalidAttributeName(IdError), + + #[error("Invalid short name in long variable name record. {0}")] + InvalidShortName(IdError), + + #[error("Invalid name in long variable name record. {0}")] + InvalidLongName(IdError), + + #[error("Invalid variable name in very long string record. {0}")] + InvalidLongStringName(IdError), + + #[error("Invalid variable name in long string value label record. {0}")] + InvalidLongStringValueLabelName(IdError), + + #[error("Invalid variable name in attribute record. {0}")] + InvalidAttributeVariableName(IdError), + + // XXX This is risky because `text` might be arbitarily long. + #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] + MalformedString { encoding: String, text: String }, + + #[error("Invalid variable measurement level value {0}")] + InvalidMeasurement(u32), + + #[error("Invalid variable display alignment value {0}")] + InvalidAlignment(u32), #[error("Details TBD")] TBD, } +#[derive(Clone, Debug)] +pub enum Record { + Header(HeaderRecord), + Variable(VariableRecord), + ValueLabel(ValueLabelRecord), + Document(DocumentRecord), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VariableSets(VariableSetRecord), + VarDisplay(VarDisplayRecord), + MultipleResponse(MultipleResponseRecord), + LongStringMissingValues(LongStringMissingValuesRecord), + LongStringValueLabels(LongStringValueLabelRecord), + Encoding(EncodingRecord), + NumberOfCases(NumberOfCasesRecord), + ProductInfo(ProductInfoRecord), + LongNames(LongNameRecord), + VeryLongStrings(VeryLongStringRecord), + FileAttributes(FileAttributeRecord), + VariableAttributes(VariableAttributeRecord), + OtherExtension(Extension), + //EndOfHeaders(u32), + //ZHeader(ZHeader), + //ZTrailer(ZTrailer), + //Case(Vec), +} + +pub use crate::raw::EncodingRecord; +pub use crate::raw::Extension; +pub use crate::raw::FloatInfoRecord; +pub use crate::raw::IntegerInfoRecord; +pub use crate::raw::NumberOfCasesRecord; + +type DictIndex = usize; + +pub struct Variable { + pub dict_index: DictIndex, + pub short_name: Identifier, + pub long_name: Option, + pub width: VarWidth, +} + pub struct Decoder { pub compression: Option, pub endian: Endian, pub encoding: &'static Encoding, - pub var_names: HashSet, - pub dict_indexes: HashMap, + pub variables: HashMap, + pub var_names: HashMap, n_dict_indexes: usize, n_generated_names: usize, } -impl Decoder { - fn take_name(&mut self, id: &Identifier) -> bool { - self.var_names.insert(id.clone()) +pub fn decode( + headers: Vec, + encoding: Option<&'static Encoding>, + warn: &impl Fn(Error), +) -> Result, Error> { + let Some(header_record) = headers.iter().find_map(|rec| { + if let raw::Record::Header(header) = rec { + Some(header) + } else { + None + } + }) else { + return Err(Error::MissingHeaderRecord); + }; + let encoding = match encoding { + Some(encoding) => encoding, + None => { + let encoding = headers.iter().find_map(|rec| { + if let raw::Record::Encoding(ref e) = rec { + Some(e.0.as_str()) + } else { + None + } + }); + let character_code = headers.iter().find_map(|rec| { + if let raw::Record::IntegerInfo(ref r) = rec { + Some(r.character_code) + } else { + None + } + }); + match get_encoding(encoding, character_code) { + Ok(encoding) => encoding, + Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)), + Err(err) => { + warn(Error::EncodingError(err)); + // Warn that we're using the default encoding. + default_encoding() + } + } + } + }; + + let mut decoder = Decoder { + compression: header_record.compression, + endian: header_record.endian, + encoding, + variables: HashMap::new(), + var_names: HashMap::new(), + n_dict_indexes: 0, + n_generated_names: 0, + }; + + let mut output = Vec::with_capacity(headers.len()); + for header in &headers { + match header { + raw::Record::Header(ref input) => { + if let Some(header) = HeaderRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::Header(header)) + } + } + raw::Record::Variable(ref input) => { + if let Some(variable) = VariableRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::Variable(variable)); + } + } + raw::Record::ValueLabel(ref input) => { + if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, input, warn)? + { + output.push(Record::ValueLabel(value_label)); + } + } + raw::Record::Document(ref input) => { + if let Some(document) = DocumentRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::Document(document)) + } + } + raw::Record::IntegerInfo(ref input) => output.push(Record::IntegerInfo(input.clone())), + raw::Record::FloatInfo(ref input) => output.push(Record::FloatInfo(input.clone())), + raw::Record::VariableSets(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?)); + } + raw::Record::VarDisplay(ref input) => { + if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::VarDisplay(vdr)) + } + } + raw::Record::MultipleResponse(ref input) => { + if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::MultipleResponse(mrr)) + } + } + raw::Record::LongStringMissingValues(ref input) => { + if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::LongStringMissingValues(mrr)) + } + } + raw::Record::LongStringValueLabels(ref input) => { + if let Some(mrr) = + LongStringValueLabelRecord::try_decode(&mut decoder, input, warn)? + { + output.push(Record::LongStringValueLabels(mrr)) + } + } + raw::Record::Encoding(ref input) => output.push(Record::Encoding(input.clone())), + raw::Record::NumberOfCases(ref input) => { + output.push(Record::NumberOfCases(input.clone())) + } + raw::Record::ProductInfo(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?)); + } + raw::Record::LongNames(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::LongNames(LongNameRecord::parse( + &mut decoder, + &s, + warn, + )?)); + } + raw::Record::VeryLongStrings(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::VeryLongStrings(VeryLongStringRecord::parse( + &decoder, &s, warn, + )?)); + } + raw::Record::FileAttributes(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::FileAttributes(FileAttributeRecord::parse( + &decoder, &s, warn, + )?)); + } + raw::Record::VariableAttributes(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::VariableAttributes(VariableAttributeRecord::parse( + &decoder, &s, warn, + )?)); + } + raw::Record::OtherExtension(ref input) => { + output.push(Record::OtherExtension(input.clone())) + } + raw::Record::EndOfHeaders(_) => (), + raw::Record::ZHeader(_) => (), + raw::Record::ZTrailer(_) => (), + raw::Record::Case(_) => (), + }; } + Ok(output) +} + +impl Decoder { fn generate_name(&mut self) -> Identifier { loop { self.n_generated_names += 1; let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding) .unwrap(); - if self.take_name(&name) { + if !self.var_names.contains_key(&name) { return name; } assert!(self.n_generated_names < usize::MAX); } } - fn take_dict_indexes(&mut self, id: &Identifier, width: VarWidth) -> usize { - let n = match width { - VarWidth::Numeric => 1, - VarWidth::String(w) => div_ceil(w as usize, 8), - }; - let dict_index = self.n_dict_indexes; - self.dict_indexes.insert(self.n_dict_indexes, id.clone()); - self.n_dict_indexes += n; - dict_index - - } - fn decode_string<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> { + fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> { let (output, malformed) = self.encoding.decode_without_bom_handling(input); if malformed { - warn(Error::TBD); + warn(Error::MalformedString { + encoding: self.encoding.name().into(), + text: output.clone().into(), + }); } output } + fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String { + self.decode_string_cow(input, warn).into() + } + pub fn decode_identifier( + &self, + input: &[u8], + warn: &impl Fn(Error), + ) -> Result { + let s = self.decode_string_cow(input, warn); + Identifier::new(&s, self.encoding) + } + fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> { + let max_index = self.n_dict_indexes; + if dict_index == 0 || dict_index > max_index { + return Err(Error::InvalidDictIndex { + dict_index, + max_index, + }); + } + let Some(variable) = self.variables.get(&(dict_index - 1)) else { + return Err(Error::DictIndexIsContinuation(dict_index)); + }; + Ok(variable) + } + + /// Returns `input` decoded from `self.encoding` into UTF-8 such that + /// re-encoding the result back into `self.encoding` will have exactly the + /// same length in bytes. + /// + /// XXX warn about errors? + fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + if let (s, false) = self.encoding.decode_without_bom_handling(input) { + // This is the common case. Usually there will be no errors. + s + } else { + // Unusual case. Don't bother to optimize it much. + let mut decoder = self.encoding.new_decoder_without_bom_handling(); + let mut output = String::with_capacity( + decoder + .max_utf8_buffer_length_without_replacement(input.len()) + .unwrap(), + ); + let mut rest = input; + while !rest.is_empty() { + match decoder.decode_to_string_without_replacement(rest, &mut output, true) { + (DecoderResult::InputEmpty, _) => break, + (DecoderResult::OutputFull, _) => unreachable!(), + (DecoderResult::Malformed(a, b), consumed) => { + let skipped = a as usize + b as usize; + output.extend(repeat('?').take(skipped)); + rest = &rest[consumed..]; + } + } + } + assert_eq!(self.encoding.encode(&output).0.len(), input.len()); + output.into() + } + } } -pub trait Decode: Sized { +pub trait TryDecode: Sized { type Input; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result; + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error>; +} + +pub trait Decode: Sized { + fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self; } -#[derive(Clone)] -pub struct Header { +impl Decode> for String { + fn decode(decoder: &Decoder, input: &UnencodedStr, warn: impl Fn(Error)) -> Self { + decoder.decode_string(&input.0, &warn) + } +} + +#[derive(Clone, Debug)] +pub struct HeaderRecord { pub eye_catcher: String, pub weight_index: Option, pub n_cases: Option, @@ -91,20 +471,30 @@ pub struct Header { pub file_label: String, } -impl Decode for Header { - type Input = crate::raw::Header; +fn trim_end_spaces(mut s: String) -> String { + s.truncate(s.trim_end_matches(' ').len()); + s +} + +impl TryDecode for HeaderRecord { + type Input = crate::raw::HeaderRecord; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result { - let eye_catcher = decoder.decode_string(&input.eye_catcher.0, &warn); - let file_label = decoder.decode_string(&input.file_label.0, &warn); - let creation_date = decoder.decode_string(&input.creation_date.0, &warn); - let creation_date = NaiveDate::parse_from_str(&creation_date, "%v").unwrap_or_else(|_| { - warn(Error::InvalidCreationDate { - creation_date: creation_date.into(), + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error> { + let eye_catcher = trim_end_spaces(decoder.decode_string(&input.eye_catcher.0, &warn)); + let file_label = trim_end_spaces(decoder.decode_string(&input.file_label.0, &warn)); + let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn); + let creation_date = + NaiveDate::parse_from_str(&creation_date, "%e %b %Y").unwrap_or_else(|_| { + warn(Error::InvalidCreationDate { + creation_date: creation_date.into(), + }); + Default::default() }); - Default::default() - }); - let creation_time = decoder.decode_string(&input.creation_time.0, &warn); + let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn); let creation_time = NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| { warn(Error::InvalidCreationTime { @@ -112,17 +502,17 @@ impl Decode for Header { }); Default::default() }); - Ok(Header { - eye_catcher: eye_catcher.into(), + Ok(Some(HeaderRecord { + eye_catcher, weight_index: input.weight_index.map(|n| n as usize), n_cases: input.n_cases.map(|n| n as u64), creation: NaiveDateTime::new(creation_date, creation_time), - file_label: file_label.into(), - }) + file_label, + })) } } -#[derive(Copy, Clone, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum VarWidth { Numeric, String(u16), @@ -138,6 +528,45 @@ impl PartialOrd for VarWidth { } } +impl VarWidth { + const MAX_STRING: u16 = 32767; + + fn n_dict_indexes(self) -> usize { + match self { + VarWidth::Numeric => 1, + VarWidth::String(w) => div_ceil(w as usize, 8), + } + } + + fn width_predicate( + a: Option, + b: Option, + f: impl Fn(u16, u16) -> u16, + ) -> Option { + match (a, b) { + (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), + (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { + Some(VarWidth::String(f(a, b))) + } + _ => None, + } + } + + /// Returns the wider of `self` and `other`: + /// - Numerical variable widths are equally wide. + /// - Longer strings are wider than shorter strings. + /// - Numerical and string types are incomparable, so result in `None`. + /// - Any `None` in the input yields `None` in the output. + pub fn wider(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.max(b)) + } + + /// Returns the narrower of `self` and `other` (see [`Self::wider`]). + pub fn narrower(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.min(b)) + } +} + impl From for VarType { fn from(source: VarWidth) -> Self { match source { @@ -147,7 +576,8 @@ impl From for VarType { } } -pub struct Variable { +#[derive(Clone, Debug)] +pub struct VariableRecord { pub width: VarWidth, pub name: Identifier, pub print_format: Spec, @@ -156,85 +586,150 @@ pub struct Variable { pub label: Option, } -fn decode_format(raw: raw::Spec, name: &str, width: VarWidth) -> Spec { +#[derive(Clone, Debug)] +pub struct MissingValues { + /// Individual missing values, up to 3 of them. + pub values: Vec, + + /// Optional range of missing values. + pub range: Option<(Value, Value)>, +} + +impl Decode for MissingValues { + fn decode(decoder: &Decoder, input: &raw::MissingValues, _warn: impl Fn(Error)) -> Self { + MissingValues { + values: input + .values + .iter() + .map(|value| Value::decode(value, decoder)) + .collect(), + range: input + .range + .as_ref() + .map(|(low, high)| (Value::decode(low, decoder), Value::decode(high, decoder))), + } + } +} + +fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec { UncheckedSpec::try_from(raw) .and_then(Spec::try_from) - .and_then(|x| x.check_width_compatibility(Some(name), width)) - .unwrap_or_else(|_warning| { - /*warn(warning);*/ - Spec::default_for_width(width) + .and_then(|x| x.check_width_compatibility(width)) + .unwrap_or_else(|error| { + let new_format = Spec::default_for_width(width); + warn(new_format, error); + new_format }) } -impl Variable { - pub fn decode( +impl TryDecode for VariableRecord { + type Input = raw::VariableRecord; + + fn try_decode( decoder: &mut Decoder, - input: &crate::raw::Variable, + input: &crate::raw::VariableRecord, warn: impl Fn(Error), - ) -> Result, Error> { + ) -> Result, Error> { let width = match input.width { 0 => VarWidth::Numeric, w @ 1..=255 => VarWidth::String(w as u16), -1 => return Ok(None), _ => { - return Err(Error::BadVariableWidth { - offset: input.offset, + return Err(Error::InvalidVariableWidth { + offsets: input.offsets.clone(), width: input.width, }) } }; - let name = decoder.decode_string(&input.name.0, &warn); + let name = trim_end_spaces(decoder.decode_string(&input.name.0, &warn)); let name = match Identifier::new(&name, decoder.encoding) { Ok(name) => { - if !decoder.take_name(&name) { - decoder.generate_name() - } else { + if !decoder.var_names.contains_key(&name) { name + } else { + let new_name = decoder.generate_name(); + warn(Error::DuplicateVariableName { + duplicate_name: name.clone(), + new_name: new_name.clone(), + }); + new_name } } - Err(error) => { - warn(error.into()); - decoder.generate_name() + Err(id_error) => { + let new_name = decoder.generate_name(); + warn(Error::InvalidVariableName { + id_error, + new_name: new_name.clone(), + }); + new_name } }; - let print_format = decode_format(input.print_format, &name.0, width); - let write_format = decode_format(input.write_format, &name.0, width); + let variable = Variable { + dict_index: decoder.n_dict_indexes, + short_name: name.clone(), + long_name: None, + width, + }; + decoder.n_dict_indexes += width.n_dict_indexes(); + assert!(decoder + .var_names + .insert(name.clone(), variable.dict_index) + .is_none()); + assert!(decoder + .variables + .insert(variable.dict_index, variable) + .is_none()); + + let print_format = decode_format(input.print_format, width, |new_spec, format_error| { + warn(Error::InvalidPrintFormat { + new_spec, + variable: name.clone(), + format_error, + }) + }); + let write_format = decode_format(input.write_format, width, |new_spec, format_error| { + warn(Error::InvalidWriteFormat { + new_spec, + variable: name.clone(), + format_error, + }) + }); let label = input .label .as_ref() - .map(|label| decoder.decode_string(&label.0, &warn).into()); - decoder.take_dict_indexes(&name, width); - Ok(Some(Variable { + .map(|label| decoder.decode_string(&label.0, &warn)); + Ok(Some(VariableRecord { width, name, print_format, write_format, - missing_values: input.missing_values.clone(), + missing_values: MissingValues::decode(decoder, &input.missing_values, warn), label, })) } } -#[derive(Clone)] -pub struct Document(Vec); +#[derive(Clone, Debug)] +pub struct DocumentRecord(Vec); -impl Decode for Document { - type Input = crate::raw::Document; +impl TryDecode for DocumentRecord { + type Input = crate::raw::DocumentRecord; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result { - Ok(Document( + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error> { + Ok(Some(DocumentRecord( input .lines .iter() - .map(|s| decoder.decode_string(&s.0, &warn).into()) + .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn))) .collect(), - )) + ))) } } -pub use crate::raw::FloatInfo; -pub use crate::raw::IntegerInfo; - trait TextRecord where Self: Sized, @@ -243,6 +738,7 @@ where fn parse(input: &str, warn: impl Fn(Error)) -> Result; } +#[derive(Clone, Debug)] pub struct VariableSet { pub name: String, pub vars: Vec, @@ -259,11 +755,118 @@ impl VariableSet { } } -/* +trait WarnOnError { + fn warn_on_error(self, warn: &F) -> Option; +} +impl WarnOnError for Result { + fn warn_on_error(self, warn: &F) -> Option { + match self { + Ok(result) => Some(result), + Err(error) => { + warn(error); + None + } + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Value { + Number(Option>), + String(String), +} + +impl Value { + pub fn decode(raw: &raw::Value, decoder: &Decoder) -> Self { + match raw { + raw::Value::Number(x) => Value::Number(x.map(|x| x.into())), + raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), + } + } +} + +#[derive(Clone, Debug)] +pub struct ValueLabel { + pub value: Value, + pub label: String, +} + +#[derive(Clone, Debug)] pub struct ValueLabelRecord { - pub labels: Vec<( + pub var_type: VarType, + pub labels: Vec, + pub variables: Vec, } -*/ + +impl TryDecode for ValueLabelRecord { + type Input = crate::raw::ValueLabelRecord; + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error> { + let variables: Vec<&Variable> = input + .dict_indexes + .iter() + .filter_map(|&dict_index| { + decoder + .get_var_by_index(dict_index as usize) + .warn_on_error(&warn) + }) + .filter(|&variable| match variable.width { + VarWidth::String(width) if width > 8 => { + warn(Error::InvalidLongStringValueLabel( + variable.short_name.clone(), + )); + false + } + _ => true, + }) + .collect(); + let mut i = variables.iter(); + let Some(&first_var) = i.next() else { + return Ok(None); + }; + let var_type: VarType = first_var.width.into(); + for &variable in i { + let this_type: VarType = variable.width.into(); + if var_type != this_type { + let (numeric_var, string_var) = match var_type { + VarType::Numeric => (first_var, variable), + VarType::String => (variable, first_var), + }; + warn(Error::ValueLabelsDifferentTypes { + numeric_var: numeric_var.short_name.clone(), + string_var: string_var.short_name.clone(), + }); + return Ok(None); + } + } + let labels = input + .labels + .iter() + .map(|(value, label)| { + let label = decoder.decode_string(&label.0, &warn); + let value = Value::decode( + &raw::Value::from_raw(value, var_type, decoder.endian), + decoder, + ); + ValueLabel { value, label } + }) + .collect(); + let variables = variables + .iter() + .map(|&variable| variable.short_name.clone()) + .collect(); + Ok(Some(ValueLabelRecord { + var_type, + labels, + variables, + })) + } +} + +#[derive(Clone, Debug)] pub struct VariableSetRecord(Vec); impl TextRecord for VariableSetRecord { @@ -271,95 +874,116 @@ impl TextRecord for VariableSetRecord { fn parse(input: &str, warn: impl Fn(Error)) -> Result { let mut sets = Vec::new(); for line in input.lines() { - match VariableSet::parse(line) { - Ok(set) => sets.push(set), - Err(error) => warn(error), + if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) { + sets.push(set) } } Ok(VariableSetRecord(sets)) } } -pub struct ProductInfo(pub String); +#[derive(Clone, Debug)] +pub struct ProductInfoRecord(pub String); -impl TextRecord for ProductInfo { +impl TextRecord for ProductInfoRecord { const NAME: &'static str = "extra product info"; fn parse(input: &str, _warn: impl Fn(Error)) -> Result { - Ok(ProductInfo(input.into())) + Ok(ProductInfoRecord(input.into())) } } -pub struct LongVariableName { - pub short_name: String, - pub long_name: String, +#[derive(Clone, Debug)] +pub struct LongName { + pub short_name: Identifier, + pub long_name: Identifier, } -pub struct LongVariableNameRecord(Vec); +impl LongName { + fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result { + let short_name = + Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?; + let long_name = + Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?; + Ok(LongName { + short_name, + long_name, + }) + } +} -impl TextRecord for LongVariableNameRecord { - const NAME: &'static str = "long variable names"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { +#[derive(Clone, Debug)] +pub struct LongNameRecord(Vec); + +impl LongNameRecord { + pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result { let mut names = Vec::new(); for pair in input.split('\t').filter(|s| !s.is_empty()) { if let Some((short_name, long_name)) = pair.split_once('=') { - let name = LongVariableName { - short_name: short_name.into(), - long_name: long_name.into(), - }; - names.push(name); + if let Some(long_name) = + LongName::new(decoder, short_name, long_name).warn_on_error(&warn) + { + names.push(long_name); + } } else { warn(Error::TBD) } } - Ok(LongVariableNameRecord(names)) + Ok(LongNameRecord(names)) } } +#[derive(Clone, Debug)] pub struct VeryLongString { - pub short_name: String, - pub length: usize, + pub short_name: Identifier, + pub length: u16, } impl VeryLongString { - fn parse(input: &str) -> Result { + fn parse(decoder: &Decoder, input: &str) -> Result { let Some((short_name, length)) = input.split_once('=') else { return Err(Error::TBD); }; - let length: usize = length.parse().map_err(|_| Error::TBD)?; - Ok(VeryLongString { - short_name: short_name.into(), - length, - }) + let short_name = + Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?; + let length: u16 = length.parse().map_err(|_| Error::TBD)?; + if length > VarWidth::MAX_STRING { + return Err(Error::TBD); + } + Ok(VeryLongString { short_name, length }) } } +#[derive(Clone, Debug)] pub struct VeryLongStringRecord(Vec); -impl TextRecord for VeryLongStringRecord { - const NAME: &'static str = "very long strings"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { +impl VeryLongStringRecord { + pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result { let mut very_long_strings = Vec::new(); for tuple in input .split('\0') .map(|s| s.trim_end_matches('\t')) .filter(|s| !s.is_empty()) { - match VeryLongString::parse(tuple) { - Ok(vls) => very_long_strings.push(vls), - Err(error) => warn(error), + if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) { + very_long_strings.push(vls) } } Ok(VeryLongStringRecord(very_long_strings)) } } +#[derive(Clone, Debug)] pub struct Attribute { - pub name: String, + pub name: Identifier, pub values: Vec, } impl Attribute { - fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> { + fn parse<'a>( + decoder: &Decoder, + input: &'a str, + warn: &impl Fn(Error), + ) -> Result<(Option, &'a str), Error> { let Some((name, mut input)) = input.split_once('(') else { return Err(Error::TBD); }; @@ -378,23 +1002,23 @@ impl Attribute { values.push(value.into()); } if let Some(rest) = rest.strip_prefix(')') { - return Ok(( - Attribute { - name: name.into(), - values, - }, - rest, - )); - } + let attribute = Identifier::new(name, decoder.encoding) + .map_err(Error::InvalidAttributeName) + .warn_on_error(warn) + .map(|name| Attribute { name, values }); + return Ok((attribute, rest)); + }; input = rest; } } } +#[derive(Clone, Debug)] pub struct AttributeSet(pub Vec); impl AttributeSet { fn parse<'a>( + decoder: &Decoder, mut input: &'a str, sentinel: Option, warn: &impl Fn(Error), @@ -405,8 +1029,10 @@ impl AttributeSet { None => break input, c if c == sentinel => break &input[1..], _ => { - let (attribute, rest) = Attribute::parse(input, &warn)?; - attributes.push(attribute); + let (attribute, rest) = Attribute::parse(decoder, input, &warn)?; + if let Some(attribute) = attribute { + attributes.push(attribute); + } input = rest; } } @@ -415,12 +1041,12 @@ impl AttributeSet { } } +#[derive(Clone, Debug)] pub struct FileAttributeRecord(AttributeSet); -impl TextRecord for FileAttributeRecord { - const NAME: &'static str = "data file attributes"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let (set, rest) = AttributeSet::parse(input, None, &warn)?; +impl FileAttributeRecord { + pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result { + let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?; if !rest.is_empty() { warn(Error::TBD); } @@ -428,68 +1054,425 @@ impl TextRecord for FileAttributeRecord { } } +#[derive(Clone, Debug)] pub struct VarAttributeSet { - pub long_var_name: String, + pub long_var_name: Identifier, pub attributes: AttributeSet, } impl VarAttributeSet { fn parse<'a>( + decoder: &Decoder, input: &'a str, warn: &impl Fn(Error), - ) -> Result<(VarAttributeSet, &'a str), Error> { + ) -> Result<(Option, &'a str), Error> { let Some((long_var_name, rest)) = input.split_once(':') else { return Err(Error::TBD); }; - let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?; - Ok(( - VarAttributeSet { - long_var_name: long_var_name.into(), + let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?; + let var_attribute = Identifier::new(long_var_name, decoder.encoding) + .map_err(Error::InvalidAttributeVariableName) + .warn_on_error(warn) + .map(|name| VarAttributeSet { + long_var_name: name, attributes, - }, - rest, - )) + }); + Ok((var_attribute, rest)) } } +#[derive(Clone, Debug)] pub struct VariableAttributeRecord(Vec); -impl TextRecord for VariableAttributeRecord { - const NAME: &'static str = "variable attributes"; - fn parse(mut input: &str, warn: impl Fn(Error)) -> Result { +impl VariableAttributeRecord { + pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result { let mut var_attribute_sets = Vec::new(); while !input.is_empty() { - match VarAttributeSet::parse(input, &warn) { - Ok((var_attribute, rest)) => { - var_attribute_sets.push(var_attribute); - input = rest; - } - Err(error) => { - warn(error); - break; - } + let Some((var_attribute, rest)) = + VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn) + else { + break; + }; + if let Some(var_attribute) = var_attribute { + var_attribute_sets.push(var_attribute); } + input = rest; } Ok(VariableAttributeRecord(var_attribute_sets)) } } +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Measure { Nominal, Ordinal, Scale, } +impl Measure { + fn try_decode(source: u32) -> Result, Error> { + match source { + 0 => Ok(None), + 1 => Ok(Some(Measure::Nominal)), + 2 => Ok(Some(Measure::Ordinal)), + 3 => Ok(Some(Measure::Scale)), + _ => Err(Error::InvalidMeasurement(source)), + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Alignment { Left, Right, Center, } +impl Alignment { + fn try_decode(source: u32) -> Result, Error> { + match source { + 0 => Ok(None), + 1 => Ok(Some(Alignment::Left)), + 2 => Ok(Some(Alignment::Right)), + 3 => Ok(Some(Alignment::Center)), + _ => Err(Error::InvalidAlignment(source)), + } + } +} + +#[derive(Clone, Debug)] pub struct VarDisplay { pub measure: Option, - pub width: u32, - pub align: Option, + pub width: Option, + pub alignment: Option, } +#[derive(Clone, Debug)] pub struct VarDisplayRecord(pub Vec); + +impl TryDecode for VarDisplayRecord { + type Input = raw::VarDisplayRecord; + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error> { + let n_vars = decoder.variables.len(); + let n_per_var = if input.0.len() == 3 * n_vars { + 3 + } else if input.0.len() == 2 * n_vars { + 2 + } else { + return Err(Error::TBD); + }; + + let var_displays = input + .0 + .chunks(n_per_var) + .map(|chunk| { + let (measure, width, alignment) = match n_per_var == 3 { + true => (chunk[0], Some(chunk[1]), chunk[2]), + false => (chunk[0], None, chunk[1]), + }; + let measure = Measure::try_decode(measure).warn_on_error(&warn).flatten(); + let alignment = Alignment::try_decode(alignment) + .warn_on_error(&warn) + .flatten(); + VarDisplay { + measure, + width, + alignment, + } + }) + .collect(); + Ok(Some(VarDisplayRecord(var_displays))) + } +} + +#[derive(Clone, Debug)] +pub enum MultipleResponseType { + MultipleDichotomy { + value: Value, + labels: CategoryLabels, + }, + MultipleCategory, +} + +impl MultipleResponseType { + fn decode( + decoder: &Decoder, + mr_set: &Identifier, + input: &raw::MultipleResponseType, + min_width: VarWidth, + warn: &impl Fn(Error), + ) -> Result { + let mr_type = match input { + raw::MultipleResponseType::MultipleDichotomy { value, labels } => { + let value = decoder.decode_string_cow(&value.0, warn); + let value = match min_width { + VarWidth::Numeric => { + let number: f64 = value.trim().parse().map_err(|_| { + Error::InvalidMDGroupCountedValue { + mr_set: mr_set.clone(), + number: value.into(), + } + })?; + Value::Number(Some(number.into())) + } + VarWidth::String(max_width) => { + let value = value.trim_end_matches(' '); + let width = value.len(); + if width > max_width as usize { + return Err(Error::TooWideMDGroupCountedValue { + mr_set: mr_set.clone(), + value: value.into(), + width, + max_width, + }); + }; + Value::String(value.into()) + } + }; + MultipleResponseType::MultipleDichotomy { + value, + labels: *labels, + } + } + raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory, + }; + Ok(mr_type) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet { + pub name: Identifier, + pub min_width: VarWidth, + pub max_width: VarWidth, + pub label: String, + pub mr_type: MultipleResponseType, + pub dict_indexes: Vec, +} + +impl MultipleResponseSet { + fn decode( + decoder: &Decoder, + input: &raw::MultipleResponseSet, + warn: &impl Fn(Error), + ) -> Result { + let mr_set_name = decoder + .decode_identifier(&input.name.0, warn) + .map_err(Error::InvalidMrSetName)?; + + let label = decoder.decode_string(&input.label.0, warn); + + let mut dict_indexes = Vec::with_capacity(input.short_names.len()); + for short_name in input.short_names.iter() { + let short_name = match decoder.decode_identifier(&short_name.0, warn) { + Ok(name) => name, + Err(error) => { + warn(Error::InvalidMrSetName(error)); + continue; + } + }; + let Some(&dict_index) = decoder.var_names.get(&short_name) else { + warn(Error::UnknownMrSetVariable { + mr_set: mr_set_name.clone(), + short_name: short_name.clone(), + }); + continue; + }; + dict_indexes.push(dict_index); + } + + match dict_indexes.len() { + 0 => return Err(Error::EmptyMrSet(mr_set_name)), + 1 => return Err(Error::OneVarMrSet(mr_set_name)), + _ => (), + } + + let Some((Some(min_width), Some(max_width))) = dict_indexes + .iter() + .map(|dict_index| decoder.variables[dict_index].width) + .map(|w| (Some(w), Some(w))) + .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) + else { + return Err(Error::MixedMrSet(mr_set_name)); + }; + + let mr_type = + MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?; + + Ok(MultipleResponseSet { + name: mr_set_name, + min_width, + max_width, + label, + mr_type, + dict_indexes, + }) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseRecord(pub Vec); + +impl TryDecode for MultipleResponseRecord { + type Input = raw::MultipleResponseRecord; + + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error> { + let mut sets = Vec::with_capacity(input.0.len()); + for set in &input.0 { + match MultipleResponseSet::decode(decoder, set, &warn) { + Ok(set) => sets.push(set), + Err(error) => warn(error), + } + } + Ok(Some(MultipleResponseRecord(sets))) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValues { + /// Variable name. + pub var_name: Identifier, + + /// Missing values. + pub missing_values: MissingValues, +} + +impl LongStringMissingValues { + fn decode( + decoder: &Decoder, + input: &raw::LongStringMissingValues, + warn: &impl Fn(Error), + ) -> Result { + let var_name = decoder.decode_string(&input.var_name.0, warn); + let var_name = Identifier::new(var_name.trim_end(), decoder.encoding) + .map_err(Error::InvalidLongStringValueLabelName)?; + + let missing_values = MissingValues::decode(decoder, &input.missing_values, warn); + + Ok(LongStringMissingValues { + var_name, + missing_values + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValuesRecord(Vec); + +impl TryDecode for LongStringMissingValuesRecord { + type Input = raw::LongStringMissingValueSet; + + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error> { + let mut labels = Vec::with_capacity(input.0.len()); + for label in &input.0 { + match LongStringMissingValues::decode(decoder, label, &warn) { + Ok(set) => labels.push(set), + Err(error) => warn(error), + } + } + Ok(Some(LongStringMissingValuesRecord(labels))) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabels { + pub var_name: Identifier, + pub width: VarWidth, + pub labels: Vec, +} + +impl LongStringValueLabels { + fn decode( + decoder: &Decoder, + input: &raw::LongStringValueLabels, + warn: &impl Fn(Error), + ) -> Result { + let var_name = decoder.decode_string(&input.var_name.0, warn); + let var_name = Identifier::new(var_name.trim_end(), decoder.encoding) + .map_err(Error::InvalidLongStringValueLabelName)?; + + let min_width = 9; + let max_width = VarWidth::MAX_STRING; + if input.width < 9 || input.width > max_width as u32 { + return Err(Error::InvalidLongValueLabelWidth { + name: var_name, + width: input.width, + min_width, + max_width, + }); + } + let width = input.width as u16; + + let mut labels = Vec::with_capacity(input.labels.len()); + for (value, label) in input.labels.iter() { + let value = Value::String(decoder.decode_exact_length(&value.0).into()); + let label = decoder.decode_string(&label.0, warn); + labels.push(ValueLabel { value, label }); + } + + Ok(LongStringValueLabels { + var_name, + width: VarWidth::String(width), + labels, + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabelRecord(pub Vec); + +impl TryDecode for LongStringValueLabelRecord { + type Input = raw::LongStringValueLabelRecord; + + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error> { + let mut labels = Vec::with_capacity(input.0.len()); + for label in &input.0 { + match LongStringValueLabels::decode(decoder, label, &warn) { + Ok(set) => labels.push(set), + Err(error) => warn(error), + } + } + Ok(Some(LongStringValueLabelRecord(labels))) + } +} + +#[cfg(test)] +mod test { + use encoding_rs::WINDOWS_1252; + + #[test] + fn test() { + let mut s = String::new(); + s.push(char::REPLACEMENT_CHARACTER); + let encoded = WINDOWS_1252.encode(&s).0; + let decoded = WINDOWS_1252.decode(&encoded[..]).0; + println!("{:?}", decoded); + } + + #[test] + fn test2() { + let charset: Vec = (0..=255).collect(); + println!("{}", charset.len()); + let decoded = WINDOWS_1252.decode(&charset[..]).0; + println!("{}", decoded.len()); + let encoded = WINDOWS_1252.encode(&decoded[..]).0; + println!("{}", encoded.len()); + assert_eq!(&charset[..], &encoded[..]); + } +}