From: Ben Pfaff Date: Mon, 28 Aug 2023 16:04:24 +0000 (-0700) Subject: work X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=926ed34468e873fd82bc2999ca229cfb424e7fc0;p=pspp work --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index fcc15901fe..58e9c27051 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -4,7 +4,7 @@ use crate::{ endian::Endian, format::{Error as FormatError, Spec, UncheckedSpec}, identifier::{Error as IdError, Identifier}, - raw::{self, MissingValues, VarType}, + raw::{self, MissingValues, UnencodedStr, VarType}, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::{DecoderResult, Encoding}; @@ -76,7 +76,7 @@ pub enum Error { InvalidLongStringValueLabel(Identifier), #[error("Invalid multiple response set name. {0}")] - InvalidMrSetName(#[from] IdError), + InvalidMrSetName(IdError), #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")] UnknownMrSetVariable { @@ -93,11 +93,41 @@ pub enum Error { #[error("Multiple response set {0} contains both string and numeric variables.")] MixedMrSet(Identifier), - #[error("Invalid numeric format for counted value {number} in multiple response set {mr_set}.")] + #[error( + "Invalid numeric format for counted value {number} in multiple response set {mr_set}." + )] InvalidMDGroupCountedValue { mr_set: Identifier, number: String }, #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] - TooWideMDGroupCountedValue { mr_set: Identifier, value: String, width: usize, max_width: u16 }, + TooWideMDGroupCountedValue { + mr_set: Identifier, + value: String, + width: usize, + max_width: u16, + }, + + #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")] + InvalidLongValueLabelWidth { + name: Identifier, + width: u32, + min_width: u16, + max_width: u16, + }, + + #[error("Invalid attribute name. {0}")] + InvalidAttributeName(IdError), + + #[error("Invalid short name in long variable name record. {0}")] + InvalidShortName(IdError), + + #[error("Invalid name in long variable name record. {0}")] + InvalidLongName(IdError), + + #[error("Invalid variable name in very long string record. {0}")] + InvalidLongStringName(IdError), + + #[error("Invalid variable name in long string value label record. {0}")] + InvalidLongStringValueLabelName(IdError), #[error("Details TBD")] TBD, @@ -114,13 +144,13 @@ pub enum Record { VariableSets(VariableSetRecord), VarDisplay(VarDisplayRecord), MultipleResponse(MultipleResponseRecord), - //LongStringValueLabels(LongStringValueLabelRecord), + LongStringValueLabels(LongStringValueLabelRecord), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), ProductInfo(ProductInfoRecord), - //LongNames(UnencodedString), - //LongStrings(UnencodedString), - //FileAttributes(UnencodedString), + LongNames(LongNameRecord), + VeryLongStrings(VeryLongStringRecord), + FileAttributes(FileAttributeRecord), //VariableAttributes(UnencodedString), //OtherExtension(Extension), //EndOfHeaders(u32), @@ -165,19 +195,22 @@ impl Decoder { assert!(self.n_generated_names < usize::MAX); } } - fn decode_string<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> { + fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> { let (output, malformed) = self.encoding.decode_without_bom_handling(input); if malformed { warn(Error::TBD); } output } + fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String { + self.decode_string_cow(input, warn).into() + } pub fn decode_identifier( &self, input: &[u8], warn: &impl Fn(Error), ) -> Result { - let s = self.decode_string(input, warn); + let s = self.decode_string_cow(input, warn); Identifier::new(&s, self.encoding) } fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> { @@ -229,9 +262,23 @@ impl Decoder { } } -pub trait Decode: Sized { +pub trait TryDecode: Sized { type Input; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result; + fn try_decode( + decoder: &Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result; +} + +pub trait Decode: Sized { + fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self; +} + +impl Decode> for String { + fn decode(decoder: &Decoder, input: &UnencodedStr, warn: impl Fn(Error)) -> Self { + decoder.decode_string(&input.0, &warn) + } } #[derive(Clone, Debug)] @@ -243,20 +290,24 @@ pub struct HeaderRecord { pub file_label: String, } -impl Decode for HeaderRecord { +impl TryDecode for HeaderRecord { type Input = crate::raw::HeaderRecord; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result { + fn try_decode( + decoder: &Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result { let eye_catcher = decoder.decode_string(&input.eye_catcher.0, &warn); let file_label = decoder.decode_string(&input.file_label.0, &warn); - let creation_date = decoder.decode_string(&input.creation_date.0, &warn); + let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn); let creation_date = NaiveDate::parse_from_str(&creation_date, "%v").unwrap_or_else(|_| { warn(Error::InvalidCreationDate { creation_date: creation_date.into(), }); Default::default() }); - let creation_time = decoder.decode_string(&input.creation_time.0, &warn); + let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn); let creation_time = NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| { warn(Error::InvalidCreationTime { @@ -265,11 +316,11 @@ impl Decode for HeaderRecord { Default::default() }); Ok(HeaderRecord { - eye_catcher: eye_catcher.into(), + eye_catcher, weight_index: input.weight_index.map(|n| n as usize), n_cases: input.n_cases.map(|n| n as u64), creation: NaiveDateTime::new(creation_date, creation_time), - file_label: file_label.into(), + file_label, }) } } @@ -291,6 +342,8 @@ impl PartialOrd for VarWidth { } impl VarWidth { + const MAX_STRING: u16 = 32767; + fn n_dict_indexes(self) -> usize { match self { VarWidth::Numeric => 1, @@ -429,7 +482,7 @@ impl VariableRecord { let label = input .label .as_ref() - .map(|label| decoder.decode_string(&label.0, &warn).into()); + .map(|label| decoder.decode_string(&label.0, &warn)); Ok(Some(VariableRecord { width, name, @@ -444,15 +497,19 @@ impl VariableRecord { #[derive(Clone, Debug)] pub struct DocumentRecord(Vec); -impl Decode for DocumentRecord { +impl TryDecode for DocumentRecord { type Input = crate::raw::DocumentRecord; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result { + fn try_decode( + decoder: &Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result { Ok(DocumentRecord( input .lines .iter() - .map(|s| decoder.decode_string(&s.0, &warn).into()) + .map(|s| decoder.decode_string(&s.0, &warn)) .collect(), )) } @@ -483,6 +540,21 @@ impl VariableSet { } } +trait WarnOnError { + fn warn_on_error(self, warn: &F) -> Option; +} +impl WarnOnError for Result { + fn warn_on_error(self, warn: &F) -> Option { + match self { + Ok(result) => Some(result), + Err(error) => { + warn(error); + None + } + } + } +} + #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Value { Number(Option>), @@ -498,28 +570,19 @@ impl Value { } } +#[derive(Clone, Debug)] +pub struct ValueLabel { + pub value: Value, + pub label: String, +} + #[derive(Clone, Debug)] pub struct ValueLabelRecord { pub var_type: VarType, - pub labels: Vec<(Value, String)>, + pub labels: Vec, pub variables: Vec, } -trait WarnOnError { - fn warn_on_error(self, warn: &F) -> Option; -} -impl WarnOnError for Result { - fn warn_on_error(self, warn: &F) -> Option { - match self { - Ok(result) => Some(result), - Err(error) => { - warn(error); - None - } - } - } -} - impl ValueLabelRecord { pub fn decode( decoder: &mut Decoder, @@ -573,7 +636,7 @@ impl ValueLabelRecord { raw::Value::from_raw(*value, var_type, decoder.endian), &decoder, ); - (value, label.into()) + ValueLabel { value, label } }) .collect(); let variables = variables @@ -614,43 +677,63 @@ impl TextRecord for ProductInfoRecord { } } -pub struct LongVariableName { - pub short_name: String, - pub long_name: String, +#[derive(Clone, Debug)] +pub struct LongName { + pub short_name: Identifier, + pub long_name: Identifier, +} + +impl LongName { + fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result { + let short_name = Identifier::new(short_name, decoder.encoding) + .map_err(|e| Error::InvalidShortName(e))?; + let long_name = + Identifier::new(long_name, decoder.encoding).map_err(|e| Error::InvalidLongName(e))?; + Ok(LongName { + short_name, + long_name, + }) + } } -pub struct LongVariableNameRecord(Vec); +#[derive(Clone, Debug)] +pub struct LongNameRecord(Vec); -impl TextRecord for LongVariableNameRecord { - const NAME: &'static str = "long variable names"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { +impl LongNameRecord { + pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result { let mut names = Vec::new(); for pair in input.split('\t').filter(|s| !s.is_empty()) { if let Some((short_name, long_name)) = pair.split_once('=') { - let name = LongVariableName { - short_name: short_name.into(), - long_name: long_name.into(), - }; - names.push(name); + if let Some(long_name) = + LongName::new(decoder, short_name, long_name).warn_on_error(&warn) + { + names.push(long_name); + } } else { warn(Error::TBD) } } - Ok(LongVariableNameRecord(names)) + Ok(LongNameRecord(names)) } } +#[derive(Clone, Debug)] pub struct VeryLongString { - pub short_name: String, - pub length: usize, + pub short_name: Identifier, + pub length: u16, } impl VeryLongString { - fn parse(input: &str) -> Result { + fn parse(decoder: &Decoder, input: &str) -> Result { let Some((short_name, length)) = input.split_once('=') else { return Err(Error::TBD); }; - let length: usize = length.parse().map_err(|_| Error::TBD)?; + let short_name = Identifier::new(short_name, decoder.encoding) + .map_err(|e| Error::InvalidLongStringName(e))?; + let length: u16 = length.parse().map_err(|_| Error::TBD)?; + if length > VarWidth::MAX_STRING { + return Err(Error::TBD); + } Ok(VeryLongString { short_name: short_name.into(), length, @@ -658,18 +741,18 @@ impl VeryLongString { } } +#[derive(Clone, Debug)] pub struct VeryLongStringRecord(Vec); -impl TextRecord for VeryLongStringRecord { - const NAME: &'static str = "very long strings"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { +impl VeryLongStringRecord { + pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result { let mut very_long_strings = Vec::new(); for tuple in input .split('\0') .map(|s| s.trim_end_matches('\t')) .filter(|s| !s.is_empty()) { - if let Some(vls) = VeryLongString::parse(tuple).warn_on_error(&warn) { + if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) { very_long_strings.push(vls) } } @@ -677,13 +760,18 @@ impl TextRecord for VeryLongStringRecord { } } +#[derive(Clone, Debug)] pub struct Attribute { - pub name: String, + pub name: Identifier, pub values: Vec, } impl Attribute { - fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> { + fn parse<'a>( + decoder: &Decoder, + input: &'a str, + warn: &impl Fn(Error), + ) -> Result<(Option, &'a str), Error> { let Some((name, mut input)) = input.split_once('(') else { return Err(Error::TBD); }; @@ -702,23 +790,23 @@ impl Attribute { values.push(value.into()); } if let Some(rest) = rest.strip_prefix(')') { - return Ok(( - Attribute { - name: name.into(), - values, - }, - rest, - )); - } + let attribute = Identifier::new(name, decoder.encoding) + .map_err(|e| Error::InvalidAttributeName(e)) + .warn_on_error(warn) + .map(|name| Attribute { name, values }); + return Ok((attribute, rest)); + }; input = rest; } } } +#[derive(Clone, Debug)] pub struct AttributeSet(pub Vec); impl AttributeSet { fn parse<'a>( + decoder: &Decoder, mut input: &'a str, sentinel: Option, warn: &impl Fn(Error), @@ -729,8 +817,10 @@ impl AttributeSet { None => break input, c if c == sentinel => break &input[1..], _ => { - let (attribute, rest) = Attribute::parse(input, &warn)?; - attributes.push(attribute); + let (attribute, rest) = Attribute::parse(decoder, input, &warn)?; + if let Some(attribute) = attribute { + attributes.push(attribute); + } input = rest; } } @@ -739,12 +829,12 @@ impl AttributeSet { } } +#[derive(Clone, Debug)] pub struct FileAttributeRecord(AttributeSet); -impl TextRecord for FileAttributeRecord { - const NAME: &'static str = "data file attributes"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let (set, rest) = AttributeSet::parse(input, None, &warn)?; +impl FileAttributeRecord { + pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result { + let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?; if !rest.is_empty() { warn(Error::TBD); } @@ -759,13 +849,14 @@ pub struct VarAttributeSet { impl VarAttributeSet { fn parse<'a>( + decoder: &Decoder, input: &'a str, warn: &impl Fn(Error), ) -> Result<(VarAttributeSet, &'a str), Error> { let Some((long_var_name, rest)) = input.split_once(':') else { return Err(Error::TBD); }; - let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?; + let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?; Ok(( VarAttributeSet { long_var_name: long_var_name.into(), @@ -778,13 +869,12 @@ impl VarAttributeSet { pub struct VariableAttributeRecord(Vec); -impl TextRecord for VariableAttributeRecord { - const NAME: &'static str = "variable attributes"; - fn parse(mut input: &str, warn: impl Fn(Error)) -> Result { +impl VariableAttributeRecord { + pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result { let mut var_attribute_sets = Vec::new(); while !input.is_empty() { let Some((var_attribute, rest)) = - VarAttributeSet::parse(input, &warn).warn_on_error(&warn) + VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn) else { break; }; @@ -838,24 +928,36 @@ impl MultipleResponseType { ) -> Result { let mr_type = match input { raw::MultipleResponseType::MultipleDichotomy { value, labels } => { - let value = decoder.decode_string(&value.0, warn); + let value = decoder.decode_string_cow(&value.0, warn); let value = match min_width { VarWidth::Numeric => { - let number: f64 = value.trim().parse() - .map_err(|_| Error::InvalidMDGroupCountedValue { mr_set: mr_set.clone(), number: value.into() })?; + let number: f64 = value.trim().parse().map_err(|_| { + Error::InvalidMDGroupCountedValue { + mr_set: mr_set.clone(), + number: value.into(), + } + })?; Value::Number(Some(number.into())) - }, + } VarWidth::String(max_width) => { let value = value.trim_end_matches(' '); let width = value.len(); if width > max_width as usize { - return Err(Error::TooWideMDGroupCountedValue { mr_set: mr_set.clone(), value: value.into(), width, max_width }); + return Err(Error::TooWideMDGroupCountedValue { + mr_set: mr_set.clone(), + value: value.into(), + width, + max_width, + }); }; Value::String(value.into()) } }; - MultipleResponseType::MultipleDichotomy { value, labels: *labels } - }, + MultipleResponseType::MultipleDichotomy { + value, + labels: *labels, + } + } raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory, }; Ok(mr_type) @@ -882,7 +984,7 @@ impl MultipleResponseSet { .decode_identifier(&input.name.0, warn) .map_err(|error| Error::InvalidMrSetName(error))?; - let label = decoder.decode_string(&input.label.0, warn).into(); + let label = decoder.decode_string(&input.label.0, warn); let mut dict_indexes = Vec::with_capacity(input.short_names.len()); for short_name in input.short_names.iter() { @@ -918,7 +1020,8 @@ impl MultipleResponseSet { return Err(Error::MixedMrSet(mr_set_name)); }; - let mr_type = MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?; + let mr_type = + MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?; Ok(MultipleResponseSet { name: mr_set_name, @@ -934,10 +1037,14 @@ impl MultipleResponseSet { #[derive(Clone, Debug)] pub struct MultipleResponseRecord(pub Vec); -impl Decode for MultipleResponseRecord { +impl TryDecode for MultipleResponseRecord { type Input = raw::MultipleResponseRecord; - fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result { + fn try_decode( + decoder: &Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result { let mut sets = Vec::with_capacity(input.0.len()); for set in &input.0 { match MultipleResponseSet::decode(decoder, set, &warn) { @@ -949,6 +1056,72 @@ impl Decode for MultipleResponseRecord { } } +#[derive(Clone, Debug)] +pub struct LongStringValueLabels { + pub var_name: Identifier, + pub width: VarWidth, + pub labels: Vec, +} + +impl LongStringValueLabels { + fn decode( + decoder: &Decoder, + input: &raw::LongStringValueLabels, + warn: &impl Fn(Error), + ) -> Result { + let var_name = decoder + .decode_identifier(&input.var_name.0, warn) + .map_err(|e| Error::InvalidLongStringValueLabelName(e))?; + + let min_width = 9; + let max_width = VarWidth::MAX_STRING; + if input.width < 9 || input.width > max_width as u32 { + return Err(Error::InvalidLongValueLabelWidth { + name: var_name.into(), + width: input.width, + min_width, + max_width, + }); + } + let width = input.width as u16; + + let mut labels = Vec::with_capacity(input.labels.len()); + for (value, label) in input.labels.iter() { + let value = Value::String(decoder.decode_exact_length(&value.0).into()); + let label = decoder.decode_string(&label.0, warn); + labels.push(ValueLabel { value, label }); + } + + Ok(LongStringValueLabels { + var_name, + width: VarWidth::String(width), + labels, + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabelRecord(pub Vec); + +impl TryDecode for LongStringValueLabelRecord { + type Input = raw::LongStringValueLabelRecord; + + fn try_decode( + decoder: &Decoder, + input: &Self::Input, + warn: impl Fn(Error), + ) -> Result { + let mut labels = Vec::with_capacity(input.0.len()); + for label in &input.0 { + match LongStringValueLabels::decode(decoder, label, &warn) { + Ok(set) => labels.push(set), + Err(error) => warn(error), + } + } + Ok(LongStringValueLabelRecord(labels)) + } +} + #[cfg(test)] mod test { use encoding_rs::WINDOWS_1252; diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs index 5b0649f533..8727bf1ea3 100644 --- a/rust/src/identifier.rs +++ b/rust/src/identifier.rs @@ -84,18 +84,30 @@ impl Identifier { let (encoded, _, unencodable) = encoding.encode(s); if unencodable { let mut encoder = encoding.new_encoder(); - let mut buf = - Vec::with_capacity(encoder.max_buffer_length_from_utf8_without_replacement(s.len()).unwrap()); + let mut buf = Vec::with_capacity( + encoder + .max_buffer_length_from_utf8_without_replacement(s.len()) + .unwrap(), + ); let EncoderResult::Unmappable(c) = encoder .encode_from_utf8_to_vec_without_replacement(s, &mut buf, true) .0 else { unreachable!(); }; - return Err(Error::NotEncodable { id: s.into(), encoding: encoding.name(), c }); + return Err(Error::NotEncodable { + id: s.into(), + encoding: encoding.name(), + c, + }); } if encoded.len() > Self::MAX_LEN { - return Err(Error::TooLong { id: s.into(), length: encoded.len(), encoding: encoding.name(), max: Self::MAX_LEN }); + return Err(Error::TooLong { + id: s.into(), + length: encoded.len(), + encoding: encoding.name(), + max: Self::MAX_LEN, + }); } Ok(Identifier(s.into())) } diff --git a/rust/src/raw.rs b/rust/src/raw.rs index a8c8ff7b46..a8c7c858e9 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -142,7 +142,7 @@ pub enum Record { NumberOfCases(NumberOfCasesRecord), ProductInfo(TextRecord), LongNames(TextRecord), - LongStrings(TextRecord), + VeryLongStrings(TextRecord), FileAttributes(TextRecord), VariableAttributes(TextRecord), OtherExtension(Extension), @@ -1592,7 +1592,7 @@ impl Extension { 5 => Ok(Record::VariableSets(extension.into())), 10 => Ok(Record::ProductInfo(extension.into())), 13 => Ok(Record::LongNames(extension.into())), - 14 => Ok(Record::LongStrings(extension.into())), + 14 => Ok(Record::VeryLongStrings(extension.into())), 17 => Ok(Record::FileAttributes(extension.into())), 18 => Ok(Record::VariableAttributes(extension.into())), _ => Ok(Record::OtherExtension(extension)), @@ -1756,7 +1756,7 @@ pub struct LongStringValueLabels { } #[derive(Clone, Debug)] -pub struct LongStringValueLabelRecord(Vec); +pub struct LongStringValueLabelRecord(pub Vec); impl ExtensionRecord for LongStringValueLabelRecord { const SUBTYPE: u32 = 21;