X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Fcooked.rs;h=d00f3f3c34f7a6087570c6315e98b0539f82c2ab;hb=e0cbdf0daefcca81be9572aab0deedf945687f5a;hp=30430c101e5c568c47bcec0d0148179272f216b6;hpb=1b75557bce9dc7cd9a146563ac4da39bf5884846;p=pspp diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 30430c101e..d00f3f3c34 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -1,10 +1,11 @@ -use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat}; +use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range}; use crate::{ + encoding::{default_encoding, get_encoding, Error as EncodingError}, endian::Endian, format::{Error as FormatError, Spec, UncheckedSpec}, identifier::{Error as IdError, Identifier}, - raw::{self, MissingValues, UnencodedStr, VarType}, + raw::{self, UnencodedStr, VarType}, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::{DecoderResult, Encoding}; @@ -16,8 +17,19 @@ pub use crate::raw::{CategoryLabels, Compression}; #[derive(ThisError, Debug)] pub enum Error { - #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] - InvalidVariableWidth { offset: u64, width: i32 }, + // XXX this is really an internal error and maybe we should change the + // interfaces to make it impossible + #[error("Missing header record")] + MissingHeaderRecord, + + #[error("{0}")] + EncodingError(EncodingError), + + #[error("Using default encoding {0}.")] + UsingDefaultEncoding(String), + + #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)] + InvalidVariableWidth { offsets: Range, width: i32 }, #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] InvalidLongMissingValueFormat, @@ -132,6 +144,16 @@ pub enum Error { #[error("Invalid variable name in attribute record. {0}")] InvalidAttributeVariableName(IdError), + // XXX This is risky because `text` might be arbitarily long. + #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] + MalformedString { encoding: String, text: String }, + + #[error("Invalid variable measurement level value {0}")] + InvalidMeasurement(u32), + + #[error("Invalid variable display alignment value {0}")] + InvalidAlignment(u32), + #[error("Details TBD")] TBD, } @@ -147,6 +169,7 @@ pub enum Record { VariableSets(VariableSetRecord), VarDisplay(VarDisplayRecord), MultipleResponse(MultipleResponseRecord), + LongStringMissingValues(LongStringMissingValuesRecord), LongStringValueLabels(LongStringValueLabelRecord), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), @@ -155,7 +178,7 @@ pub enum Record { VeryLongStrings(VeryLongStringRecord), FileAttributes(FileAttributeRecord), VariableAttributes(VariableAttributeRecord), - //OtherExtension(Extension), + OtherExtension(Extension), //EndOfHeaders(u32), //ZHeader(ZHeader), //ZTrailer(ZTrailer), @@ -163,6 +186,7 @@ pub enum Record { } pub use crate::raw::EncodingRecord; +pub use crate::raw::Extension; pub use crate::raw::FloatInfoRecord; pub use crate::raw::IntegerInfoRecord; pub use crate::raw::NumberOfCasesRecord; @@ -186,6 +210,157 @@ pub struct Decoder { n_generated_names: usize, } +pub fn decode( + headers: Vec, + encoding: Option<&'static Encoding>, + warn: &impl Fn(Error), +) -> Result, Error> { + let Some(header_record) = headers.iter().find_map(|rec| { + if let raw::Record::Header(header) = rec { + Some(header) + } else { + None + } + }) else { + return Err(Error::MissingHeaderRecord); + }; + let encoding = match encoding { + Some(encoding) => encoding, + None => { + let encoding = headers.iter().find_map(|rec| { + if let raw::Record::Encoding(ref e) = rec { + Some(e.0.as_str()) + } else { + None + } + }); + let character_code = headers.iter().find_map(|rec| { + if let raw::Record::IntegerInfo(ref r) = rec { + Some(r.character_code) + } else { + None + } + }); + match get_encoding(encoding, character_code) { + Ok(encoding) => encoding, + Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)), + Err(err) => { + warn(Error::EncodingError(err)); + // Warn that we're using the default encoding. + default_encoding() + } + } + } + }; + + let mut decoder = Decoder { + compression: header_record.compression, + endian: header_record.endian, + encoding, + variables: HashMap::new(), + var_names: HashMap::new(), + n_dict_indexes: 0, + n_generated_names: 0, + }; + + let mut output = Vec::with_capacity(headers.len()); + for header in &headers { + match header { + raw::Record::Header(ref input) => { + if let Some(header) = HeaderRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::Header(header)) + } + } + raw::Record::Variable(ref input) => { + if let Some(variable) = VariableRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::Variable(variable)); + } + } + raw::Record::ValueLabel(ref input) => { + if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, input, warn)? + { + output.push(Record::ValueLabel(value_label)); + } + } + raw::Record::Document(ref input) => { + if let Some(document) = DocumentRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::Document(document)) + } + } + raw::Record::IntegerInfo(ref input) => output.push(Record::IntegerInfo(input.clone())), + raw::Record::FloatInfo(ref input) => output.push(Record::FloatInfo(input.clone())), + raw::Record::VariableSets(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?)); + } + raw::Record::VarDisplay(ref input) => { + if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::VarDisplay(vdr)) + } + } + raw::Record::MultipleResponse(ref input) => { + if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::MultipleResponse(mrr)) + } + } + raw::Record::LongStringMissingValues(ref input) => { + if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, input, warn)? { + output.push(Record::LongStringMissingValues(mrr)) + } + } + raw::Record::LongStringValueLabels(ref input) => { + if let Some(mrr) = + LongStringValueLabelRecord::try_decode(&mut decoder, input, warn)? + { + output.push(Record::LongStringValueLabels(mrr)) + } + } + raw::Record::Encoding(ref input) => output.push(Record::Encoding(input.clone())), + raw::Record::NumberOfCases(ref input) => { + output.push(Record::NumberOfCases(input.clone())) + } + raw::Record::ProductInfo(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?)); + } + raw::Record::LongNames(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::LongNames(LongNameRecord::parse( + &mut decoder, + &s, + warn, + )?)); + } + raw::Record::VeryLongStrings(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::VeryLongStrings(VeryLongStringRecord::parse( + &decoder, &s, warn, + )?)); + } + raw::Record::FileAttributes(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::FileAttributes(FileAttributeRecord::parse( + &decoder, &s, warn, + )?)); + } + raw::Record::VariableAttributes(ref input) => { + let s = decoder.decode_string_cow(&input.text.0, warn); + output.push(Record::VariableAttributes(VariableAttributeRecord::parse( + &decoder, &s, warn, + )?)); + } + raw::Record::OtherExtension(ref input) => { + output.push(Record::OtherExtension(input.clone())) + } + raw::Record::EndOfHeaders(_) => (), + raw::Record::ZHeader(_) => (), + raw::Record::ZTrailer(_) => (), + raw::Record::Case(_) => (), + }; + } + Ok(output) +} + impl Decoder { fn generate_name(&mut self) -> Identifier { loop { @@ -201,7 +376,10 @@ impl Decoder { fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> { let (output, malformed) = self.encoding.decode_without_bom_handling(input); if malformed { - warn(Error::TBD); + warn(Error::MalformedString { + encoding: self.encoding.name().into(), + text: output.clone().into(), + }); } output } @@ -217,14 +395,14 @@ impl Decoder { Identifier::new(&s, self.encoding) } fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> { - let max_index = self.n_dict_indexes - 1; - if dict_index == 0 || dict_index as usize > max_index { + let max_index = self.n_dict_indexes; + if dict_index == 0 || dict_index > max_index { return Err(Error::InvalidDictIndex { dict_index, max_index, }); } - let Some(variable) = self.variables.get(&dict_index) else { + let Some(variable) = self.variables.get(&(dict_index - 1)) else { return Err(Error::DictIndexIsContinuation(dict_index)); }; Ok(variable) @@ -238,7 +416,7 @@ impl Decoder { fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { if let (s, false) = self.encoding.decode_without_bom_handling(input) { // This is the common case. Usually there will be no errors. - s.into() + s } else { // Unusual case. Don't bother to optimize it much. let mut decoder = self.encoding.new_decoder_without_bom_handling(); @@ -268,10 +446,10 @@ impl Decoder { pub trait TryDecode: Sized { type Input; fn try_decode( - decoder: &Decoder, + decoder: &mut Decoder, input: &Self::Input, warn: impl Fn(Error), - ) -> Result; + ) -> Result, Error>; } pub trait Decode: Sized { @@ -293,23 +471,29 @@ pub struct HeaderRecord { pub file_label: String, } +fn trim_end_spaces(mut s: String) -> String { + s.truncate(s.trim_end_matches(' ').len()); + s +} + impl TryDecode for HeaderRecord { type Input = crate::raw::HeaderRecord; fn try_decode( - decoder: &Decoder, + decoder: &mut Decoder, input: &Self::Input, warn: impl Fn(Error), - ) -> Result { - let eye_catcher = decoder.decode_string(&input.eye_catcher.0, &warn); - let file_label = decoder.decode_string(&input.file_label.0, &warn); + ) -> Result, Error> { + let eye_catcher = trim_end_spaces(decoder.decode_string(&input.eye_catcher.0, &warn)); + let file_label = trim_end_spaces(decoder.decode_string(&input.file_label.0, &warn)); let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn); - let creation_date = NaiveDate::parse_from_str(&creation_date, "%v").unwrap_or_else(|_| { - warn(Error::InvalidCreationDate { - creation_date: creation_date.into(), + let creation_date = + NaiveDate::parse_from_str(&creation_date, "%e %b %Y").unwrap_or_else(|_| { + warn(Error::InvalidCreationDate { + creation_date: creation_date.into(), + }); + Default::default() }); - Default::default() - }); let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn); let creation_time = NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| { @@ -318,13 +502,13 @@ impl TryDecode for HeaderRecord { }); Default::default() }); - Ok(HeaderRecord { + Ok(Some(HeaderRecord { eye_catcher, weight_index: input.weight_index.map(|n| n as usize), n_cases: input.n_cases.map(|n| n as u64), creation: NaiveDateTime::new(creation_date, creation_time), file_label, - }) + })) } } @@ -402,6 +586,31 @@ pub struct VariableRecord { pub label: Option, } +#[derive(Clone, Debug)] +pub struct MissingValues { + /// Individual missing values, up to 3 of them. + pub values: Vec, + + /// Optional range of missing values. + pub range: Option<(Value, Value)>, +} + +impl Decode for MissingValues { + fn decode(decoder: &Decoder, input: &raw::MissingValues, _warn: impl Fn(Error)) -> Self { + MissingValues { + values: input + .values + .iter() + .map(|value| Value::decode(value, decoder)) + .collect(), + range: input + .range + .as_ref() + .map(|(low, high)| (Value::decode(low, decoder), Value::decode(high, decoder))), + } + } +} + fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec { UncheckedSpec::try_from(raw) .and_then(Spec::try_from) @@ -413,8 +622,10 @@ fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatErro }) } -impl VariableRecord { - pub fn decode( +impl TryDecode for VariableRecord { + type Input = raw::VariableRecord; + + fn try_decode( decoder: &mut Decoder, input: &crate::raw::VariableRecord, warn: impl Fn(Error), @@ -425,12 +636,13 @@ impl VariableRecord { -1 => return Ok(None), _ => { return Err(Error::InvalidVariableWidth { - offset: input.offset, + offsets: input.offsets.clone(), width: input.width, }) } }; - let name = match decoder.decode_identifier(&input.name.0, &warn) { + let name = trim_end_spaces(decoder.decode_string(&input.name.0, &warn)); + let name = match Identifier::new(&name, decoder.encoding) { Ok(name) => { if !decoder.var_names.contains_key(&name) { name @@ -491,7 +703,7 @@ impl VariableRecord { name, print_format, write_format, - missing_values: input.missing_values.clone(), + missing_values: MissingValues::decode(decoder, &input.missing_values, warn), label, })) } @@ -504,17 +716,17 @@ impl TryDecode for DocumentRecord { type Input = crate::raw::DocumentRecord; fn try_decode( - decoder: &Decoder, + decoder: &mut Decoder, input: &Self::Input, warn: impl Fn(Error), - ) -> Result { - Ok(DocumentRecord( + ) -> Result, Error> { + Ok(Some(DocumentRecord( input .lines .iter() - .map(|s| decoder.decode_string(&s.0, &warn)) + .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn))) .collect(), - )) + ))) } } @@ -565,7 +777,7 @@ pub enum Value { } impl Value { - pub fn decode(raw: raw::Value, decoder: &Decoder) -> Self { + pub fn decode(raw: &raw::Value, decoder: &Decoder) -> Self { match raw { raw::Value::Number(x) => Value::Number(x.map(|x| x.into())), raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), @@ -586,14 +798,14 @@ pub struct ValueLabelRecord { pub variables: Vec, } -impl ValueLabelRecord { - pub fn decode( +impl TryDecode for ValueLabelRecord { + type Input = crate::raw::ValueLabelRecord; + fn try_decode( decoder: &mut Decoder, - raw_value_label: &crate::raw::ValueLabelRecord, - dict_indexes: &crate::raw::VarIndexRecord, + input: &Self::Input, warn: impl Fn(Error), ) -> Result, Error> { - let variables: Vec<&Variable> = dict_indexes + let variables: Vec<&Variable> = input .dict_indexes .iter() .filter_map(|&dict_index| { @@ -630,14 +842,14 @@ impl ValueLabelRecord { return Ok(None); } } - let labels = raw_value_label + let labels = input .labels .iter() .map(|(value, label)| { let label = decoder.decode_string(&label.0, &warn); let value = Value::decode( - raw::Value::from_raw(*value, var_type, decoder.endian), - &decoder, + &raw::Value::from_raw(value, var_type, decoder.endian), + decoder, ); ValueLabel { value, label } }) @@ -688,10 +900,10 @@ pub struct LongName { impl LongName { fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result { - let short_name = Identifier::new(short_name, decoder.encoding) - .map_err(|e| Error::InvalidShortName(e))?; + let short_name = + Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?; let long_name = - Identifier::new(long_name, decoder.encoding).map_err(|e| Error::InvalidLongName(e))?; + Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?; Ok(LongName { short_name, long_name, @@ -731,16 +943,13 @@ impl VeryLongString { let Some((short_name, length)) = input.split_once('=') else { return Err(Error::TBD); }; - let short_name = Identifier::new(short_name, decoder.encoding) - .map_err(|e| Error::InvalidLongStringName(e))?; + let short_name = + Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?; let length: u16 = length.parse().map_err(|_| Error::TBD)?; if length > VarWidth::MAX_STRING { return Err(Error::TBD); } - Ok(VeryLongString { - short_name: short_name.into(), - length, - }) + Ok(VeryLongString { short_name, length }) } } @@ -794,7 +1003,7 @@ impl Attribute { } if let Some(rest) = rest.strip_prefix(')') { let attribute = Identifier::new(name, decoder.encoding) - .map_err(|e| Error::InvalidAttributeName(e)) + .map_err(Error::InvalidAttributeName) .warn_on_error(warn) .map(|name| Attribute { name, values }); return Ok((attribute, rest)); @@ -862,7 +1071,7 @@ impl VarAttributeSet { }; let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?; let var_attribute = Identifier::new(long_var_name, decoder.encoding) - .map_err(|e| Error::InvalidAttributeVariableName(e)) + .map_err(Error::InvalidAttributeVariableName) .warn_on_error(warn) .map(|name| VarAttributeSet { long_var_name: name, @@ -900,6 +1109,18 @@ pub enum Measure { Scale, } +impl Measure { + fn try_decode(source: u32) -> Result, Error> { + match source { + 0 => Ok(None), + 1 => Ok(Some(Measure::Nominal)), + 2 => Ok(Some(Measure::Ordinal)), + 3 => Ok(Some(Measure::Scale)), + _ => Err(Error::InvalidMeasurement(source)), + } + } +} + #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Alignment { Left, @@ -907,16 +1128,67 @@ pub enum Alignment { Center, } +impl Alignment { + fn try_decode(source: u32) -> Result, Error> { + match source { + 0 => Ok(None), + 1 => Ok(Some(Alignment::Left)), + 2 => Ok(Some(Alignment::Right)), + 3 => Ok(Some(Alignment::Center)), + _ => Err(Error::InvalidAlignment(source)), + } + } +} + #[derive(Clone, Debug)] pub struct VarDisplay { pub measure: Option, - pub width: u32, - pub align: Option, + pub width: Option, + pub alignment: Option, } #[derive(Clone, Debug)] pub struct VarDisplayRecord(pub Vec); +impl TryDecode for VarDisplayRecord { + type Input = raw::VarDisplayRecord; + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error> { + let n_vars = decoder.variables.len(); + let n_per_var = if input.0.len() == 3 * n_vars { + 3 + } else if input.0.len() == 2 * n_vars { + 2 + } else { + return Err(Error::TBD); + }; + + let var_displays = input + .0 + .chunks(n_per_var) + .map(|chunk| { + let (measure, width, alignment) = match n_per_var == 3 { + true => (chunk[0], Some(chunk[1]), chunk[2]), + false => (chunk[0], None, chunk[1]), + }; + let measure = Measure::try_decode(measure).warn_on_error(&warn).flatten(); + let alignment = Alignment::try_decode(alignment) + .warn_on_error(&warn) + .flatten(); + VarDisplay { + measure, + width, + alignment, + } + }) + .collect(); + Ok(Some(VarDisplayRecord(var_displays))) + } +} + #[derive(Clone, Debug)] pub enum MultipleResponseType { MultipleDichotomy { @@ -990,7 +1262,7 @@ impl MultipleResponseSet { ) -> Result { let mr_set_name = decoder .decode_identifier(&input.name.0, warn) - .map_err(|error| Error::InvalidMrSetName(error))?; + .map_err(Error::InvalidMrSetName)?; let label = decoder.decode_string(&input.label.0, warn); @@ -1049,10 +1321,10 @@ impl TryDecode for MultipleResponseRecord { type Input = raw::MultipleResponseRecord; fn try_decode( - decoder: &Decoder, + decoder: &mut Decoder, input: &Self::Input, warn: impl Fn(Error), - ) -> Result { + ) -> Result, Error> { let mut sets = Vec::with_capacity(input.0.len()); for set in &input.0 { match MultipleResponseSet::decode(decoder, set, &warn) { @@ -1060,7 +1332,57 @@ impl TryDecode for MultipleResponseRecord { Err(error) => warn(error), } } - Ok(MultipleResponseRecord(sets)) + Ok(Some(MultipleResponseRecord(sets))) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValues { + /// Variable name. + pub var_name: Identifier, + + /// Missing values. + pub missing_values: MissingValues, +} + +impl LongStringMissingValues { + fn decode( + decoder: &Decoder, + input: &raw::LongStringMissingValues, + warn: &impl Fn(Error), + ) -> Result { + let var_name = decoder.decode_string(&input.var_name.0, warn); + let var_name = Identifier::new(var_name.trim_end(), decoder.encoding) + .map_err(Error::InvalidLongStringValueLabelName)?; + + let missing_values = MissingValues::decode(decoder, &input.missing_values, warn); + + Ok(LongStringMissingValues { + var_name, + missing_values + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValuesRecord(Vec); + +impl TryDecode for LongStringMissingValuesRecord { + type Input = raw::LongStringMissingValueSet; + + fn try_decode( + decoder: &mut Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result, Error> { + let mut labels = Vec::with_capacity(input.0.len()); + for label in &input.0 { + match LongStringMissingValues::decode(decoder, label, &warn) { + Ok(set) => labels.push(set), + Err(error) => warn(error), + } + } + Ok(Some(LongStringMissingValuesRecord(labels))) } } @@ -1077,15 +1399,15 @@ impl LongStringValueLabels { input: &raw::LongStringValueLabels, warn: &impl Fn(Error), ) -> Result { - let var_name = decoder - .decode_identifier(&input.var_name.0, warn) - .map_err(|e| Error::InvalidLongStringValueLabelName(e))?; + let var_name = decoder.decode_string(&input.var_name.0, warn); + let var_name = Identifier::new(var_name.trim_end(), decoder.encoding) + .map_err(Error::InvalidLongStringValueLabelName)?; let min_width = 9; let max_width = VarWidth::MAX_STRING; if input.width < 9 || input.width > max_width as u32 { return Err(Error::InvalidLongValueLabelWidth { - name: var_name.into(), + name: var_name, width: input.width, min_width, max_width, @@ -1115,10 +1437,10 @@ impl TryDecode for LongStringValueLabelRecord { type Input = raw::LongStringValueLabelRecord; fn try_decode( - decoder: &Decoder, + decoder: &mut Decoder, input: &Self::Input, warn: impl Fn(Error), - ) -> Result { + ) -> Result, Error> { let mut labels = Vec::with_capacity(input.0.len()); for label in &input.0 { match LongStringValueLabels::decode(decoder, label, &warn) { @@ -1126,7 +1448,7 @@ impl TryDecode for LongStringValueLabelRecord { Err(error) => warn(error), } } - Ok(LongStringValueLabelRecord(labels)) + Ok(Some(LongStringValueLabelRecord(labels))) } }