From fa709bc6e88c78308254a7536c3d100df4ded67a Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 25 Dec 2023 11:17:16 -0800 Subject: [PATCH] work --- rust/src/cooked.rs | 36 ++++------ rust/src/identifier.rs | 11 ++- rust/src/raw.rs | 151 ++++++++++++++++++++++++++++------------- 3 files changed, 124 insertions(+), 74 deletions(-) diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 4c0135b949..b7802e91cd 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -8,7 +8,7 @@ use crate::{ endian::Endian, format::{Error as FormatError, Spec, UncheckedSpec}, identifier::{Error as IdError, Identifier}, - raw::{self, RawDocumentLine, RawStr, RawString, VarDisplayRecord, VarType}, + raw::{self, RawDocumentLine, RawStr, RawString, VarDisplayRecord, VarType, ProductInfoRecord}, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::{DecoderResult, Encoding}; @@ -212,18 +212,18 @@ struct Headers<'a> { document: Option<&'a raw::DocumentRecord>, integer_info: Option<&'a raw::IntegerInfoRecord>, float_info: Option<&'a raw::FloatInfoRecord>, - variable_sets: Vec<&'a raw::TextRecord>, + variable_sets: Vec<&'a raw::VariableSetRecord>, var_display: Option<&'a raw::VarDisplayRecord>, multiple_response: Vec<&'a raw::MultipleResponseRecord>, long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord>, long_string_missing_values: Vec<&'a raw::LongStringMissingValueRecord>>, encoding: Option<&'a raw::EncodingRecord>, number_of_cases: Option<&'a raw::NumberOfCasesRecord>, - product_info: Option<&'a raw::TextRecord>, - long_names: Option<&'a raw::TextRecord>, - very_long_strings: Vec<&'a raw::TextRecord>, - file_attributes: Vec<&'a raw::TextRecord>, - variable_attributes: Vec<&'a raw::TextRecord>, + product_info: Option<&'a raw::ProductInfoRecord>, + long_names: Option<&'a raw::LongNamesRecord>, + very_long_strings: Vec<&'a raw::VeryLongStringsRecord>, + file_attributes: Vec<&'a raw::FileAttributeRecord>, + variable_attributes: Vec<&'a raw::VariableAttributeRecord>, other_extensions: Vec<&'a raw::Extension>, cases: Option<&'a Rc>>, } @@ -239,7 +239,6 @@ fn set_or_warn(option: &mut Option, value: T, warn: &impl Fn(Error)) { impl<'a> Headers<'a> { fn new(headers: &'a Vec, warn: &impl Fn(Error)) -> Headers<'a> { let mut h = Headers::default(); -/* for header in headers { match header { raw::Record::Header(record) => set_or_warn(&mut h.header, record, warn), @@ -271,9 +270,10 @@ impl<'a> Headers<'a> { raw::Record::ZHeader(_) => (), raw::Record::ZTrailer(_) => (), raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn), + raw::Record::Text(_) => todo!(), + } } -*/ h } } @@ -334,12 +334,12 @@ pub fn decode( output.push(Record::FloatInfo(raw.clone())); } if let Some(raw) = h.product_info { - let s = decoder.decode_string_cow(&raw.text.0, warn); - output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?)); + output.push(Record::ProductInfo(raw.clone())); } if let Some(raw) = h.number_of_cases { output.push(Record::NumberOfCases(raw.clone())) } +/* for &raw in &h.file_attributes { let s = decoder.decode_string_cow(&raw.text.0, warn); output.push(Record::FileAttributes(FileAttributeRecord::parse( @@ -349,7 +349,6 @@ pub fn decode( for &raw in &h.other_extensions { output.push(Record::OtherExtension(raw.clone())); } - // Decode the variable records, which are the basis of almost everything // else. for &raw in &h.variables { @@ -372,13 +371,11 @@ pub fn decode( } // Decode records that use short names. - /* for &raw in &h.multiple_response { if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, raw, warn)? { output.push(Record::MultipleResponse(mrr)) } } - */ for &raw in &h.very_long_strings { let s = decoder.decode_string_cow(&raw.text.0, warn); output.push(Record::VeryLongStrings(VeryLongStringRecord::parse( @@ -417,6 +414,7 @@ pub fn decode( let s = decoder.decode_string_cow(&raw.text.0, warn); output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?)); } +*/ Ok(output) } @@ -942,16 +940,6 @@ impl TextRecord for VariableSetRecord { } } -#[derive(Clone, Debug)] -pub struct ProductInfoRecord(pub String); - -impl TextRecord for ProductInfoRecord { - const NAME: &'static str = "extra product info"; - fn parse(input: &str, _warn: impl Fn(Error)) -> Result { - Ok(ProductInfoRecord(input.into())) - } -} - #[derive(Clone, Debug)] pub struct LongName { pub short_name: Identifier, diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs index d8b5219920..70fbc00aa1 100644 --- a/rust/src/identifier.rs +++ b/rust/src/identifier.rs @@ -89,6 +89,15 @@ impl Identifier { } pub fn new(s: &str, encoding: &'static Encoding) -> Result { Self::is_plausible(s)?; + let identifier = Identifier(s.into()); + identifier.check_encoding(encoding)?; + Ok(identifier) + } + /// Checks whether this is a valid identifier in the given `encoding`. An + /// identifier that is valid in one encoding might be invalid in another + /// because some characters are unencodable or because it is too long. + pub fn check_encoding(&self, encoding: &'static Encoding) -> Result<(), Error> { + let s = self.0.as_str(); let (encoded, _, unencodable) = encoding.encode(s); if unencodable { let mut encoder = encoding.new_encoder(); @@ -117,7 +126,7 @@ impl Identifier { max: Self::MAX_LEN, }); } - Ok(Identifier(s.into())) + Ok(()) } pub fn is_plausible(s: &str) -> Result<(), Error> { if s.is_empty() { diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 4ab11c3043..ba7124a385 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,4 +1,7 @@ -use crate::endian::{Endian, Parse, ToBytes}; +use crate::{ + endian::{Endian, Parse, ToBytes}, + identifier::{Error as IdError, Identifier}, +}; use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding}; use flate2::read::ZlibDecoder; @@ -158,6 +161,24 @@ pub enum Error { #[error("Invalid variable display alignment value {0}")] InvalidAlignment(u32), + #[error("Invalid attribute name. {0}")] + InvalidAttributeName(IdError), + + #[error("Invalid variable name in attribute record. {0}")] + InvalidAttributeVariableName(IdError), + + #[error("Invalid short name in long variable name record. {0}")] + InvalidShortName(IdError), + + #[error("Invalid name in long variable name record. {0}")] + InvalidLongName(IdError), + + #[error("Invalid variable name in very long string record. {0}")] + InvalidLongStringName(IdError), + + #[error("Invalid variable name in variable set record. {0}")] + InvalidVariableSetName(IdError), + #[error("Details TBD")] TBD, } @@ -451,6 +472,14 @@ impl Decoder { output.into() } } + + pub fn decode_identifier(&self, input: &RawString) -> Result { + self.new_identifier(&self.decode(input)) + } + + pub fn new_identifier(&self, name: &str) -> Result { + Identifier::new(name, self.encoding) + } } impl Header for HeaderRecord @@ -1998,17 +2027,16 @@ impl TextRecord { TextRecordType::FileAttributes => { Ok(FileAttributeRecord::decode(self, decoder).map(|fa| Record::FileAttributes(fa))) } - TextRecordType::VariableAttributes => { - Ok(Some(Record::VariableAttributes( -VariableAttributeRecord::decode(self, decoder)))) - } + TextRecordType::VariableAttributes => Ok(Some(Record::VariableAttributes( + VariableAttributeRecord::decode(self, decoder), + ))), } } } #[derive(Clone, Debug)] pub struct VeryLongString { - pub short_name: String, + pub short_name: Identifier, pub length: u16, } @@ -2017,17 +2045,37 @@ impl VeryLongString { let Some((short_name, length)) = input.split_once('=') else { return Err(Error::TBD); }; + let short_name = decoder + .new_identifier(short_name) + .map_err(Error::InvalidLongStringName)?; let length = length.parse().map_err(|_| Error::TBD)?; - Ok(VeryLongString { - short_name: short_name.into(), - length, - }) + Ok(VeryLongString { short_name, length }) + } +} + +#[derive(Clone, Debug)] +pub struct VeryLongStringsRecord(Vec); + +impl VeryLongStringsRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + let input = decoder.decode(&source.text); + let mut very_long_strings = Vec::new(); + for tuple in input + .split('\0') + .map(|s| s.trim_end_matches('\t')) + .filter(|s| !s.is_empty()) + { + if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&decoder.warn) { + very_long_strings.push(vls) + } + } + VeryLongStringsRecord(very_long_strings) } } #[derive(Clone, Debug)] pub struct Attribute { - pub name: String, + pub name: Identifier, pub values: Vec, } @@ -2036,6 +2084,9 @@ impl Attribute { let Some((name, mut input)) = input.split_once('(') else { return Err(Error::TBD); }; + let name = decoder + .new_identifier(name) + .map_err(Error::InvalidAttributeName)?; let mut values = Vec::new(); loop { let Some((value, rest)) = input.split_once('\n') else { @@ -2051,10 +2102,7 @@ impl Attribute { values.push(value.into()); } if let Some(rest) = rest.strip_prefix(')') { - let attribute = Attribute { - name: name.into(), - values, - }; + let attribute = Attribute { name, values }; return Ok((attribute, rest)); }; input = rest; @@ -2107,7 +2155,7 @@ impl FileAttributeRecord { #[derive(Clone, Debug)] pub struct VarAttributeSet { - pub long_var_name: String, + pub long_var_name: Identifier, pub attributes: AttributeSet, } @@ -2116,9 +2164,12 @@ impl VarAttributeSet { let Some((long_var_name, rest)) = input.split_once(':') else { return Err(Error::TBD); }; + let long_var_name = decoder + .new_identifier(long_var_name) + .map_err(Error::InvalidAttributeVariableName)?; let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?; let var_attribute = VarAttributeSet { - long_var_name: long_var_name.into(), + long_var_name, attributes, }; Ok((var_attribute, rest)) @@ -2147,29 +2198,27 @@ impl VariableAttributeRecord { } #[derive(Clone, Debug)] -pub struct VeryLongStringsRecord(Vec); - -impl VeryLongStringsRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - let input = decoder.decode(&source.text); - let mut very_long_strings = Vec::new(); - for tuple in input - .split('\0') - .map(|s| s.trim_end_matches('\t')) - .filter(|s| !s.is_empty()) - { - if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&decoder.warn) { - very_long_strings.push(vls) - } - } - VeryLongStringsRecord(very_long_strings) - } +pub struct LongName { + pub short_name: Identifier, + pub long_name: Identifier, } -#[derive(Clone, Debug)] -pub struct LongName { - pub short_name: String, - pub long_name: String, +impl LongName { + fn parse(input: &str, decoder: &Decoder) -> Result { + let Some((short_name, long_name)) = input.split_once('=') else { + return Err(Error::TBD); + }; + let short_name = decoder + .new_identifier(short_name) + .map_err(Error::InvalidShortName)?; + let long_name = decoder + .new_identifier(long_name) + .map_err(Error::InvalidLongName)?; + Ok(LongName { + short_name, + long_name, + }) + } } #[derive(Clone, Debug)] @@ -2180,13 +2229,8 @@ impl LongNamesRecord { let input = decoder.decode(&source.text); let mut names = Vec::new(); for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some((short_name, long_name)) = pair.split_once('=') { - names.push(LongName { - short_name: short_name.into(), - long_name: long_name.into(), - }); - } else { - decoder.warn(Error::TBD) + if let Some(long_name) = LongName::parse(pair, decoder).warn_on_error(&decoder.warn) { + names.push(long_name); } } LongNamesRecord(names) @@ -2205,13 +2249,22 @@ impl ProductInfoRecord { #[derive(Clone, Debug)] pub struct VariableSet { pub name: String, - pub vars: Vec, + pub vars: Vec, } impl VariableSet { - fn parse(input: &str) -> Result { + fn parse(input: &str, decoder: &Decoder) -> Result { let (name, input) = input.split_once('=').ok_or(Error::TBD)?; - let vars = input.split_ascii_whitespace().map(String::from).collect(); + let mut vars = Vec::new(); + for var in input.split_ascii_whitespace() { + if let Some(identifier) = decoder + .new_identifier(var) + .map_err(Error::InvalidVariableSetName) + .warn_on_error(&decoder.warn) + { + vars.push(identifier); + } + } Ok(VariableSet { name: name.into(), vars, @@ -2230,7 +2283,7 @@ impl VariableSetRecord { let mut sets = Vec::new(); let input = decoder.decode(&source.text); for line in input.lines() { - if let Some(set) = VariableSet::parse(line).warn_on_error(&decoder.warn) { + if let Some(set) = VariableSet::parse(line, decoder).warn_on_error(&decoder.warn) { sets.push(set) } } -- 2.30.2