X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Fraw.rs;h=e8a279f5e848418e0fdb846333cc80f5e0a60ce4;hb=7e2346251b2a07f03e2b5e77f2f9b938a9b00ab7;hp=544481906f20153aabd415e50f507b198375c73f;hpb=55c50f4539128d520f9517abd3c40d38b3b39519;p=pspp diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 544481906f..e8a279f5e8 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,4 +1,6 @@ use crate::{ + dictionary::VarWidth, + encoding::{default_encoding, get_encoding, Error as EncodingError}, endian::{Endian, Parse, ToBytes}, identifier::{Error as IdError, Identifier}, }; @@ -10,7 +12,7 @@ use std::{ borrow::Cow, cell::RefCell, cmp::Ordering, - collections::VecDeque, + collections::{HashMap, VecDeque}, fmt::{Debug, Display, Formatter, Result as FmtResult}, io::{Error as IoError, Read, Seek, SeekFrom}, iter::repeat, @@ -38,15 +40,18 @@ pub enum Error { #[error("Invalid ZSAV compression code {0}")] InvalidZsavCompression(u32), - #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] - BadVariableWidth { offset: u64, width: i32 }, - #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] BadDocumentLength { offset: u64, n: usize, max: usize }, #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] BadRecordType { offset: u64, rec_type: u32 }, + #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")] + BadVariableWidth { + start_offset: u64, + width: i32, + }, + #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")] BadVariableLabelCode { start_offset: u64, @@ -71,23 +76,6 @@ pub enum Error { #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")] TooManyVarIndexes { offset: u64, n: u32, max: u32 }, - #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")] - NoVarIndexes { offset: u64 }, - - #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())] - MixedVarTypes { - offset: u64, - var_type: VarType, - wrong_types: Vec, - }, - - #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")] - InvalidVarIndexes { - offset: u64, - max: usize, - invalid: Vec, - }, - #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] ExtensionRecordTooLarge { offset: u64, @@ -125,6 +113,32 @@ pub enum Error { ztrailer_len: u64, }, + #[error("{0}")] + EncodingError(EncodingError), +} + +#[derive(ThisError, Debug)] +pub enum Warning { + #[error("Unexpected end of data inside extension record.")] + UnexpectedEndOfData, + + #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")] + NoVarIndexes { offset: u64 }, + + #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())] + MixedVarTypes { + offset: u64, + var_type: VarType, + wrong_types: Vec, + }, + + #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")] + InvalidVarIndexes { + offset: u64, + max: usize, + invalid: Vec, + }, + #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] BadRecordSize { offset: u64, @@ -185,10 +199,25 @@ pub enum Error { #[error("Invalid multiple response set variable name. {0}")] InvalidMrSetVariableName(IdError), + #[error("Invalid variable name in long string missing values record. {0}")] + InvalidLongStringMissingValueVariableName(IdError), + + #[error("Invalid variable name in long string value label record. {0}")] + InvalidLongStringValueLabelName(IdError), + + #[error("{0}")] + EncodingError(EncodingError), + #[error("Details TBD")] TBD, } +impl From for Warning { + fn from(_source: IoError) -> Self { + Self::UnexpectedEndOfData + } +} + #[derive(Clone, Debug)] pub enum Record { Header(HeaderRecord), @@ -197,19 +226,40 @@ pub enum Record { Document(DocumentRecord), IntegerInfo(IntegerInfoRecord), FloatInfo(FloatInfoRecord), - VariableSets(VariableSetRecord), VarDisplay(VarDisplayRecord), MultipleResponse(MultipleResponseRecord), - LongStringValueLabels(LongStringValueLabelRecord), + LongStringValueLabels(LongStringValueLabelRecord), LongStringMissingValues(LongStringMissingValueRecord>), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), + Text(TextRecord), + OtherExtension(Extension), + EndOfHeaders(u32), + ZHeader(ZHeader), + ZTrailer(ZTrailer), + Cases(Rc>), +} + +#[derive(Clone, Debug)] +pub enum DecodedRecord { + Header(HeaderRecord), + Variable(VariableRecord), + ValueLabel(ValueLabelRecord, String>), + Document(DocumentRecord), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VarDisplay(VarDisplayRecord), + MultipleResponse(MultipleResponseRecord), + LongStringValueLabels(LongStringValueLabelRecord), + LongStringMissingValues(LongStringMissingValueRecord), + Encoding(EncodingRecord), + NumberOfCases(NumberOfCasesRecord), + VariableSets(VariableSetRecord), ProductInfo(ProductInfoRecord), LongNames(LongNamesRecord), VeryLongStrings(VeryLongStringsRecord), FileAttributes(FileAttributeRecord), VariableAttributes(VariableAttributeRecord), - Text(TextRecord), OtherExtension(Extension), EndOfHeaders(u32), ZHeader(ZHeader), @@ -222,7 +272,7 @@ impl Record { reader: &mut R, endian: Endian, var_types: &[VarType], - warn: &Box, + warn: &dyn Fn(Warning), ) -> Result, Error> where R: Read + Seek, @@ -242,6 +292,59 @@ impl Record { }), } } + + pub fn decode(self, decoder: &Decoder) -> Result { + Ok(match self { + Record::Header(record) => record.decode(decoder), + Record::Variable(record) => record.decode(decoder), + Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)), + Record::Document(record) => record.decode(decoder), + Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()), + Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()), + Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()), + Record::MultipleResponse(record) => record.decode(decoder), + Record::LongStringValueLabels(record) => { + DecodedRecord::LongStringValueLabels(record.decode(decoder)) + } + Record::LongStringMissingValues(record) => { + DecodedRecord::LongStringMissingValues(record.decode(decoder)) + } + Record::Encoding(record) => DecodedRecord::Encoding(record.clone()), + Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()), + Record::Text(record) => record.decode(decoder), + Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()), + Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record), + Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()), + Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()), + Record::Cases(record) => DecodedRecord::Cases(record.clone()), + }) + } +} + +pub fn encoding_from_headers( + headers: &Vec, + warn: &impl Fn(Warning), +) -> Result<&'static Encoding, Error> { + let mut encoding_record = None; + let mut integer_info_record = None; + for record in headers { + match record { + Record::Encoding(record) => encoding_record = Some(record), + Record::IntegerInfo(record) => integer_info_record = Some(record), + _ => (), + } + } + let encoding = encoding_record.map(|record| record.0.as_str()); + let character_code = integer_info_record.map(|record| record.character_code); + match get_encoding(encoding, character_code) { + Ok(encoding) => Ok(encoding), + Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)), + Err(err) => { + warn(Warning::EncodingError(err)); + // Warn that we're using the default encoding. + Ok(default_encoding()) + } + } } // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it @@ -398,12 +501,12 @@ impl HeaderRecord { }) } - fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord> { - let eye_catcher = decoder.decode(&self.eye_catcher); - let file_label = decoder.decode(&self.file_label); - let creation_date = decoder.decode(&self.creation_date); - let creation_time = decoder.decode(&self.creation_time); - HeaderRecord { + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + let eye_catcher = decoder.decode(&self.eye_catcher).to_string(); + let file_label = decoder.decode(&self.file_label).to_string(); + let creation_date = decoder.decode(&self.creation_date).to_string(); + let creation_time = decoder.decode(&self.creation_time).to_string(); + DecodedRecord::Header(HeaderRecord { eye_catcher, weight_index: self.weight_index, n_cases: self.n_cases, @@ -417,23 +520,32 @@ impl HeaderRecord { creation_date, creation_time, endian: self.endian, - } + }) } } -struct Decoder { - encoding: &'static Encoding, - warn: Box, +pub struct Decoder { + pub encoding: &'static Encoding, + pub warn: Box, } impl Decoder { - fn warn(&self, error: Error) { - (self.warn)(error) + pub fn new(encoding: &'static Encoding, warn: F) -> Self + where + F: Fn(Warning) + 'static, + { + Self { + encoding, + warn: Box::new(warn), + } + } + fn warn(&self, warning: Warning) { + (self.warn)(warning) } fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { let (output, malformed) = self.encoding.decode_without_bom_handling(input); if malformed { - self.warn(Error::MalformedString { + self.warn(Warning::MalformedString { encoding: self.encoding.name().into(), text: output.clone().into(), }); @@ -450,7 +562,7 @@ impl Decoder { /// same length in bytes. /// /// XXX warn about errors? - fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { if let (s, false) = self.encoding.decode_without_bom_handling(input) { // This is the common case. Usually there will be no errors. s @@ -552,14 +664,14 @@ pub enum VarType { } impl VarType { - fn from_width(width: i32) -> VarType { + pub fn from_width(width: VarWidth) -> VarType { match width { - 0 => VarType::Numeric, - _ => VarType::String, + VarWidth::Numeric => Self::Numeric, + VarWidth::String(_) => Self::String, } } - fn opposite(self) -> VarType { + pub fn opposite(self) -> VarType { match self { Self::Numeric => Self::String, Self::String => Self::Numeric, @@ -717,9 +829,9 @@ impl RawValue { Ok(Some(values)) } - fn decode(&self, decoder: &Decoder) -> Value { + fn decode(self, decoder: &Decoder) -> Value { match self { - Self::Number(x) => Value::Number(*x), + Self::Number(x) => Value::Number(x), Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), } } @@ -786,7 +898,7 @@ where R: Read + Seek + 'static, { reader: Option, - warn: Box, + warn: Box, header: HeaderRecord, var_types: Vec, @@ -800,7 +912,7 @@ where { pub fn new(mut reader: R, warn: F) -> Result where - F: Fn(Error) + 'static, + F: Fn(Warning) + 'static, { let header = HeaderRecord::read(&mut reader)?; Ok(Self { @@ -819,15 +931,7 @@ where &self.header, ) } -} - -impl Iterator for Reader -where - R: Read + Seek + 'static, -{ - type Item = Result; - - fn next(&mut self) -> Option { + fn _next(&mut self) -> Option<::Item> { match self.state { ReaderState::Start => { self.state = ReaderState::Headers; @@ -848,7 +952,11 @@ where }; match record { Record::Variable(VariableRecord { width, .. }) => { - self.var_types.push(VarType::from_width(width)); + self.var_types.push(if width == 0 { + VarType::Numeric + } else { + VarType::String + }); } Record::EndOfHeaders(_) => { self.state = if let Some(Compression::ZLib) = self.header.compression { @@ -894,6 +1002,21 @@ where } } +impl Iterator for Reader +where + R: Read + Seek + 'static, +{ + type Item = Result; + + fn next(&mut self) -> Option { + let retval = self._next(); + if matches!(retval, Some(Err(_))) { + self.state = ReaderState::End; + } + retval + } +} + trait ReadSeek: Read + Seek {} impl ReadSeek for T where T: Read + Seek {} @@ -1016,7 +1139,7 @@ fn format_name(type_: u32) -> Cow<'static, str> { } #[derive(Clone)] -pub struct MissingValues +pub struct MissingValues where S: Debug, { @@ -1063,6 +1186,18 @@ where } } +impl Default for MissingValues +where + S: Debug, +{ + fn default() -> Self { + Self { + values: Vec::new(), + range: None, + } + } +} + impl MissingValues> { fn read( r: &mut R, @@ -1079,7 +1214,11 @@ impl MissingValues> { (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }), }; - let var_type = VarType::from_width(width); + let var_type = if width == 0 { + VarType::Numeric + } else { + VarType::String + }; let mut values = Vec::new(); for _ in 0..n_values { @@ -1094,7 +1233,7 @@ impl MissingValues> { }; Ok(Self { values, range }) } - fn decode<'a>(&'a self, decoder: &Decoder) -> MissingValues { + fn decode(&self, decoder: &Decoder) -> MissingValues { MissingValues { values: self .values @@ -1165,6 +1304,9 @@ impl VariableRecord> { fn read(r: &mut R, endian: Endian) -> Result { let start_offset = r.stream_position()?; let width: i32 = endian.parse(read_bytes(r)?); + if !(-1..=255).contains(&width) { + return Err(Error::BadVariableWidth { start_offset, width }); + } let code_offset = r.stream_position()?; let has_variable_label: u32 = endian.parse(read_bytes(r)?); let missing_value_code: i32 = endian.parse(read_bytes(r)?); @@ -1209,16 +1351,19 @@ impl VariableRecord> { })) } - fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord, String> { - VariableRecord { + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + DecodedRecord::Variable(VariableRecord { offsets: self.offsets.clone(), width: self.width, - name: decoder.decode(&self.name), + name: decoder.decode(&self.name).to_string(), print_format: self.print_format, write_format: self.write_format, missing_values: self.missing_values.decode(decoder), - label: self.label.as_ref().map(|label| decoder.decode(label)), - } + label: self + .label + .as_ref() + .map(|label| decoder.decode(label).to_string()), + }) } } @@ -1358,7 +1503,7 @@ impl ValueLabelRecord, RawString> { r: &mut R, endian: Endian, var_types: &[VarType], - warn: &Box, + warn: &dyn Fn(Warning), ) -> Result, Error> { let label_offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); @@ -1412,7 +1557,7 @@ impl ValueLabelRecord, RawString> { } } if !invalid_indexes.is_empty() { - warn(Error::InvalidVarIndexes { + warn(Warning::InvalidVarIndexes { offset: index_offset, max: var_types.len(), invalid: invalid_indexes, @@ -1420,7 +1565,7 @@ impl ValueLabelRecord, RawString> { } let Some(&first_index) = dict_indexes.first() else { - warn(Error::NoVarIndexes { + warn(Warning::NoVarIndexes { offset: index_offset, }); return Ok(None); @@ -1436,7 +1581,7 @@ impl ValueLabelRecord, RawString> { } }); if !wrong_type_indexes.is_empty() { - warn(Error::MixedVarTypes { + warn(Warning::MixedVarTypes { offset: index_offset, var_type, wrong_types: wrong_type_indexes, @@ -1459,6 +1604,23 @@ impl ValueLabelRecord, RawString> { var_type, }))) } + + fn decode(self, decoder: &Decoder) -> ValueLabelRecord, String> { + let labels = self + .labels + .iter() + .map(|ValueLabel { value, label }| ValueLabel { + value: *value, + label: decoder.decode(label).to_string(), + }) + .collect(); + ValueLabelRecord { + offsets: self.offsets.clone(), + labels, + dict_indexes: self.dict_indexes.clone(), + var_type: self.var_type, + } + } } #[derive(Clone, Debug)] @@ -1468,7 +1630,8 @@ where { pub offsets: Range, - /// The document, as an array of 80-byte lines. + /// The document, as an array of lines. Raw lines are exactly 80 bytes long + /// and are right-padded with spaces without any new-line termination. pub lines: Vec, } @@ -1506,15 +1669,15 @@ impl DocumentRecord { } } - fn decode<'a>(&'a self, decoder: &Decoder) -> DocumentRecord> { - DocumentRecord { + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + DecodedRecord::Document(DocumentRecord { offsets: self.offsets.clone(), lines: self .lines .iter() - .map(|s| decoder.decode_slice(&s.0)) + .map(|s| decoder.decode_slice(&s.0).to_string()) .collect(), - } + }) } } @@ -1532,7 +1695,7 @@ trait ExtensionRecord { const SIZE: Option; const COUNT: Option; const NAME: &'static str; - fn parse(ext: &Extension, endian: Endian) -> Result; + fn parse(ext: &Extension, endian: Endian) -> Result; } #[derive(Clone, Debug)] @@ -1552,7 +1715,7 @@ impl ExtensionRecord for IntegerInfoRecord { const COUNT: Option = Some(8); const NAME: &'static str = "integer record"; - fn parse(ext: &Extension, endian: Endian) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -1584,7 +1747,7 @@ impl ExtensionRecord for FloatInfoRecord { const COUNT: Option = Some(3); const NAME: &'static str = "floating point record"; - fn parse(ext: &Extension, endian: Endian) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -1615,7 +1778,7 @@ pub enum MultipleResponseType { } impl MultipleResponseType { - fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> { + fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> { let (mr_type, input) = match input.split_first() { Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input), Some((b'D', input)) => { @@ -1634,7 +1797,7 @@ impl MultipleResponseType { } else if let Some(rest) = input.strip_prefix(b" 11 ") { (CategoryLabels::VarLabels, rest) } else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let (value, input) = parse_counted_string(input)?; ( @@ -1642,7 +1805,7 @@ impl MultipleResponseType { input, ) } - _ => return Err(Error::TBD), + _ => return Err(Warning::TBD), }; Ok((mr_type, input)) } @@ -1661,14 +1824,14 @@ where } impl MultipleResponseSet { - fn parse(input: &[u8]) -> Result<(Self, &[u8]), Error> { + fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> { let Some(equals) = input.iter().position(|&b| b == b'=') else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let (name, input) = input.split_at(equals); let (mr_type, input) = MultipleResponseType::parse(input)?; let Some(input) = input.strip_prefix(b" ") else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let (label, mut input) = parse_counted_string(input)?; let mut vars = Vec::new(); @@ -1676,7 +1839,7 @@ impl MultipleResponseSet { match input.split_first() { Some((b' ', rest)) => { let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let (var, rest) = rest.split_at(length); if !var.is_empty() { @@ -1684,7 +1847,7 @@ impl MultipleResponseSet { } input = rest; } - _ => return Err(Error::TBD), + _ => return Err(Warning::TBD), } } while input.first() == Some(&b'\n') { @@ -1701,16 +1864,16 @@ impl MultipleResponseSet { )) } - fn decode<'a>( - &'a self, + fn decode( + &self, decoder: &Decoder, - ) -> Result>, Error> { + ) -> Result, Warning> { let mut short_names = Vec::with_capacity(self.short_names.len()); for short_name in self.short_names.iter() { if let Some(short_name) = decoder .decode_identifier(short_name) - .map_err(|err| Error::InvalidMrSetName(err)) - .warn_on_error(&decoder.warn) + .map_err(Warning::InvalidMrSetName) + .issue_warning(&decoder.warn) { short_names.push(short_name); } @@ -1718,10 +1881,10 @@ impl MultipleResponseSet { Ok(MultipleResponseSet { name: decoder .decode_identifier(&self.name) - .map_err(|err| Error::InvalidMrSetVariableName(err))?, - label: decoder.decode(&self.label), + .map_err(Warning::InvalidMrSetVariableName)?, + label: decoder.decode(&self.label).to_string(), mr_type: self.mr_type.clone(), - short_names: short_names, + short_names, }) } } @@ -1738,7 +1901,7 @@ impl ExtensionRecord for MultipleResponseRecord { const COUNT: Option = None; const NAME: &'static str = "multiple response set record"; - fn parse(ext: &Extension, _endian: Endian) -> Result { + fn parse(ext: &Extension, _endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -1753,31 +1916,31 @@ impl ExtensionRecord for MultipleResponseRecord { } impl MultipleResponseRecord { - fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseRecord> { + fn decode(self, decoder: &Decoder) -> DecodedRecord { let mut sets = Vec::new(); for set in self.0.iter() { - if let Some(set) = set.decode(decoder).warn_on_error(&decoder.warn) { + if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) { sets.push(set); } } - MultipleResponseRecord(sets) + DecodedRecord::MultipleResponse(MultipleResponseRecord(sets)) } } -fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Error> { +fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> { let Some(space) = input.iter().position(|&b| b == b' ') else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let Ok(length) = from_utf8(&input[..space]) else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let Ok(length): Result = length.parse() else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let input = &input[space + 1..]; if input.len() < length { - return Err(Error::TBD); + return Err(Warning::TBD); }; let (string, rest) = input.split_at(length); @@ -1792,13 +1955,20 @@ pub enum Measure { } impl Measure { - fn try_decode(source: u32) -> Result, Error> { + pub fn default_for_type(var_type: VarType) -> Option { + match var_type { + VarType::Numeric => None, + VarType::String => Some(Self::Nominal), + } + } + + fn try_decode(source: u32) -> Result, Warning> { match source { 0 => Ok(None), 1 => Ok(Some(Measure::Nominal)), 2 => Ok(Some(Measure::Ordinal)), 3 => Ok(Some(Measure::Scale)), - _ => Err(Error::InvalidMeasurement(source)), + _ => Err(Warning::InvalidMeasurement(source)), } } } @@ -1811,13 +1981,20 @@ pub enum Alignment { } impl Alignment { - fn try_decode(source: u32) -> Result, Error> { + fn try_decode(source: u32) -> Result, Warning> { match source { 0 => Ok(None), 1 => Ok(Some(Alignment::Left)), 2 => Ok(Some(Alignment::Right)), 3 => Ok(Some(Alignment::Center)), - _ => Err(Error::InvalidAlignment(source)), + _ => Err(Warning::InvalidAlignment(source)), + } + } + + pub fn default_for_type(var_type: VarType) -> Self { + match var_type { + VarType::Numeric => Self::Right, + VarType::String => Self::Left, } } } @@ -1839,10 +2016,10 @@ impl VarDisplayRecord { ext: &Extension, n_vars: usize, endian: Endian, - warn: &Box, - ) -> Result { + warn: &dyn Fn(Warning), + ) -> Result { if ext.size != 4 { - return Err(Error::BadRecordSize { + return Err(Warning::BadRecordSize { offset: ext.offsets.start, record: String::from("variable display record"), size: ext.size, @@ -1855,18 +2032,18 @@ impl VarDisplayRecord { } else if ext.count as usize == 2 * n_vars { false } else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let mut var_displays = Vec::new(); let mut input = &ext.data[..]; for _ in 0..n_vars { let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .warn_on_error(&warn) + .issue_warning(&warn) .flatten(); let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap())); let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .warn_on_error(&warn) + .issue_warning(&warn) .flatten(); var_displays.push(VarDisplay { measure, @@ -1892,11 +2069,14 @@ where } impl LongStringMissingValues> { - fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValues { - LongStringMissingValues { - var_name: decoder.decode(&self.var_name).to_string(), + fn decode( + &self, + decoder: &Decoder, + ) -> Result, IdError> { + Ok(LongStringMissingValues { + var_name: decoder.decode_identifier(&self.var_name)?, missing_values: self.missing_values.decode(decoder), - } + }) } } @@ -1912,7 +2092,7 @@ impl ExtensionRecord for LongStringMissingValueRecord> { const COUNT: Option = None; const NAME: &'static str = "long string missing values record"; - fn parse(ext: &Extension, endian: Endian) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -1923,7 +2103,7 @@ impl ExtensionRecord for LongStringMissingValueRecord> { let value_len: u32 = endian.parse(read_bytes(&mut input)?); if value_len != 8 { let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start; - return Err(Error::BadLongMissingValueLength { + return Err(Warning::BadLongMissingValueLength { record_offset: ext.offsets.start, offset, value_len, @@ -1959,8 +2139,18 @@ impl ExtensionRecord for LongStringMissingValueRecord> { } impl LongStringMissingValueRecord> { - fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValueRecord { - LongStringMissingValueRecord(self.0.iter().map(|mv| mv.decode(decoder)).collect()) + pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord { + let mut mvs = Vec::with_capacity(self.0.len()); + for mv in self.0.iter() { + if let Some(mv) = mv + .decode(decoder) + .map_err(Warning::InvalidLongStringMissingValueVariableName) + .issue_warning(&decoder.warn) + { + mvs.push(mv); + } + } + LongStringMissingValueRecord(mvs) } } @@ -1973,18 +2163,18 @@ impl ExtensionRecord for EncodingRecord { const COUNT: Option = None; const NAME: &'static str = "encoding record"; - fn parse(ext: &Extension, _endian: Endian) -> Result { + fn parse(ext: &Extension, _endian: Endian) -> Result { ext.check_size::()?; Ok(Record::Encoding(EncodingRecord( - String::from_utf8(ext.data.clone()).map_err(|_| Error::BadEncodingName { + String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName { offset: ext.offsets.start, })?, ))) } } -#[derive(Copy, Clone, Debug)] +#[derive(Clone, Debug)] pub struct NumberOfCasesRecord { /// Always observed as 1. pub one: u64, @@ -1999,7 +2189,7 @@ impl ExtensionRecord for NumberOfCasesRecord { const COUNT: Option = Some(2); const NAME: &'static str = "extended number of cases record"; - fn parse(ext: &Extension, endian: Endian) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -2039,26 +2229,26 @@ impl TextRecord { text: extension.data.into(), } } - fn decode<'a>(&self, decoder: &Decoder) -> Result, Error> { + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { match self.rec_type { - TextRecordType::VariableSets => Ok(Some(Record::VariableSets( - VariableSetRecord::decode(self, decoder), - ))), - TextRecordType::ProductInfo => Ok(Some(Record::ProductInfo( - ProductInfoRecord::decode(self, decoder), - ))), - TextRecordType::LongNames => Ok(Some(Record::LongNames(LongNamesRecord::decode( - self, decoder, - )))), - TextRecordType::VeryLongStrings => Ok(Some(Record::VeryLongStrings( - VeryLongStringsRecord::decode(self, decoder), - ))), + TextRecordType::VariableSets => { + DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder)) + } + TextRecordType::ProductInfo => { + DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder)) + } + TextRecordType::LongNames => { + DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder)) + } + TextRecordType::VeryLongStrings => { + DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder)) + } TextRecordType::FileAttributes => { - Ok(FileAttributeRecord::decode(self, decoder).map(|fa| Record::FileAttributes(fa))) + DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder)) + } + TextRecordType::VariableAttributes => { + DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder)) } - TextRecordType::VariableAttributes => Ok(Some(Record::VariableAttributes( - VariableAttributeRecord::decode(self, decoder), - ))), } } } @@ -2070,14 +2260,14 @@ pub struct VeryLongString { } impl VeryLongString { - fn parse(decoder: &Decoder, input: &str) -> Result { + fn parse(decoder: &Decoder, input: &str) -> Result { let Some((short_name, length)) = input.split_once('=') else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let short_name = decoder .new_identifier(short_name) - .map_err(Error::InvalidLongStringName)?; - let length = length.parse().map_err(|_| Error::TBD)?; + .map_err(Warning::InvalidLongStringName)?; + let length = length.parse().map_err(|_| Warning::TBD)?; Ok(VeryLongString { short_name, length }) } } @@ -2094,7 +2284,7 @@ impl VeryLongStringsRecord { .map(|s| s.trim_end_matches('\t')) .filter(|s| !s.is_empty()) { - if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&decoder.warn) { + if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) { very_long_strings.push(vls) } } @@ -2109,17 +2299,17 @@ pub struct Attribute { } impl Attribute { - fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Error> { + fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> { let Some((name, mut input)) = input.split_once('(') else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let name = decoder .new_identifier(name) - .map_err(Error::InvalidAttributeName)?; + .map_err(Warning::InvalidAttributeName)?; let mut values = Vec::new(); loop { let Some((value, rest)) = input.split_once('\n') else { - return Err(Error::TBD); + return Err(Warning::TBD); }; if let Some(stripped) = value .strip_prefix('\'') @@ -2127,7 +2317,7 @@ impl Attribute { { values.push(stripped.into()); } else { - decoder.warn(Error::TBD); + decoder.warn(Warning::TBD); values.push(value.into()); } if let Some(rest) = rest.strip_prefix(')') { @@ -2139,23 +2329,24 @@ impl Attribute { } } -#[derive(Clone, Debug)] -pub struct AttributeSet(pub Vec); +#[derive(Clone, Debug, Default)] +pub struct AttributeSet(pub HashMap>); impl AttributeSet { fn parse<'a>( decoder: &Decoder, mut input: &'a str, sentinel: Option, - ) -> Result<(AttributeSet, &'a str), Error> { - let mut attributes = Vec::new(); + ) -> Result<(AttributeSet, &'a str), Warning> { + let mut attributes = HashMap::new(); let rest = loop { match input.chars().next() { None => break input, c if c == sentinel => break &input[1..], _ => { let (attribute, rest) = Attribute::parse(decoder, input)?; - attributes.push(attribute); + // XXX report duplicate name + attributes.insert(attribute.name, attribute.values); input = rest; } } @@ -2164,20 +2355,20 @@ impl AttributeSet { } } -#[derive(Clone, Debug)] -pub struct FileAttributeRecord(AttributeSet); +#[derive(Clone, Debug, Default)] +pub struct FileAttributeRecord(pub AttributeSet); impl FileAttributeRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Option { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { let input = decoder.decode(&source.text); - match AttributeSet::parse(decoder, &input, None).warn_on_error(&decoder.warn) { + match AttributeSet::parse(decoder, &input, None).issue_warning(&decoder.warn) { Some((set, rest)) => { if !rest.is_empty() { - decoder.warn(Error::TBD); + decoder.warn(Warning::TBD); } - Some(FileAttributeRecord(set)) + FileAttributeRecord(set) } - None => None, + None => FileAttributeRecord::default(), } } } @@ -2189,13 +2380,13 @@ pub struct VarAttributeSet { } impl VarAttributeSet { - fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Error> { + fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> { let Some((long_var_name, rest)) = input.split_once(':') else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let long_var_name = decoder .new_identifier(long_var_name) - .map_err(Error::InvalidAttributeVariableName)?; + .map_err(Warning::InvalidAttributeVariableName)?; let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?; let var_attribute = VarAttributeSet { long_var_name, @@ -2215,12 +2406,12 @@ impl VariableAttributeRecord { let mut var_attribute_sets = Vec::new(); while !input.is_empty() { let Some((var_attribute, rest)) = - VarAttributeSet::parse(decoder, &input).warn_on_error(&decoder.warn) + VarAttributeSet::parse(decoder, input).issue_warning(&decoder.warn) else { break; }; var_attribute_sets.push(var_attribute); - input = rest.into(); + input = rest; } VariableAttributeRecord(var_attribute_sets) } @@ -2233,16 +2424,16 @@ pub struct LongName { } impl LongName { - fn parse(input: &str, decoder: &Decoder) -> Result { + fn parse(input: &str, decoder: &Decoder) -> Result { let Some((short_name, long_name)) = input.split_once('=') else { - return Err(Error::TBD); + return Err(Warning::TBD); }; let short_name = decoder .new_identifier(short_name) - .map_err(Error::InvalidShortName)?; + .map_err(Warning::InvalidShortName)?; let long_name = decoder .new_identifier(long_name) - .map_err(Error::InvalidLongName)?; + .map_err(Warning::InvalidLongName)?; Ok(LongName { short_name, long_name, @@ -2258,7 +2449,7 @@ impl LongNamesRecord { let input = decoder.decode(&source.text); let mut names = Vec::new(); for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some(long_name) = LongName::parse(pair, decoder).warn_on_error(&decoder.warn) { + if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) { names.push(long_name); } } @@ -2270,7 +2461,6 @@ impl LongNamesRecord { pub struct ProductInfoRecord(pub String); impl ProductInfoRecord { - const NAME: &'static str = "extra product info"; fn decode(source: &TextRecord, decoder: &Decoder) -> Self { Self(decoder.decode(&source.text).into()) } @@ -2282,14 +2472,14 @@ pub struct VariableSet { } impl VariableSet { - fn parse(input: &str, decoder: &Decoder) -> Result { - let (name, input) = input.split_once('=').ok_or(Error::TBD)?; + fn parse(input: &str, decoder: &Decoder) -> Result { + let (name, input) = input.split_once('=').ok_or(Warning::TBD)?; let mut vars = Vec::new(); for var in input.split_ascii_whitespace() { if let Some(identifier) = decoder .new_identifier(var) - .map_err(Error::InvalidVariableSetName) - .warn_on_error(&decoder.warn) + .map_err(Warning::InvalidVariableSetName) + .issue_warning(&decoder.warn) { vars.push(identifier); } @@ -2312,7 +2502,7 @@ impl VariableSetRecord { let mut sets = Vec::new(); let input = decoder.decode(&source.text); for line in input.lines() { - if let Some(set) = VariableSet::parse(line, decoder).warn_on_error(&decoder.warn) { + if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) { sets.push(set) } } @@ -2323,11 +2513,16 @@ impl VariableSetRecord { } } -trait WarnOnError { - fn warn_on_error(self, warn: &F) -> Option; +trait IssueWarning { + fn issue_warning(self, warn: &F) -> Option + where + F: Fn(Warning); } -impl WarnOnError for Result { - fn warn_on_error(self, warn: &F) -> Option { +impl IssueWarning for Result { + fn issue_warning(self, warn: &F) -> Option + where + F: Fn(Warning), + { match self { Ok(result) => Some(result), Err(error) => { @@ -2356,10 +2551,10 @@ pub struct Extension { } impl Extension { - fn check_size(&self) -> Result<(), Error> { + fn check_size(&self) -> Result<(), Warning> { if let Some(expected_size) = E::SIZE { if self.size != expected_size { - return Err(Error::BadRecordSize { + return Err(Warning::BadRecordSize { offset: self.offsets.start, record: E::NAME.into(), size: self.size, @@ -2369,7 +2564,7 @@ impl Extension { } if let Some(expected_count) = E::COUNT { if self.count != expected_count { - return Err(Error::BadRecordCount { + return Err(Warning::BadRecordCount { offset: self.offsets.start, record: E::NAME.into(), count: self.count, @@ -2384,7 +2579,7 @@ impl Extension { r: &mut R, endian: Endian, n_vars: usize, - warn: &Box, + warn: &dyn Fn(Warning), ) -> Result, Error> { let subtype = endian.parse(read_bytes(r)?); let header_offset = r.stream_position()?; @@ -2603,29 +2798,54 @@ fn read_string(r: &mut R, endian: Endian) -> Result } #[derive(Clone, Debug)] -pub struct LongStringValueLabels +pub struct LongStringValueLabels where S: Debug, { - pub var_name: S, + pub var_name: N, pub width: u32, /// `(value, label)` pairs, where each value is `width` bytes. pub labels: Vec<(S, S)>, } +impl LongStringValueLabels { + fn decode( + &self, + decoder: &Decoder, + ) -> Result, Warning> { + let var_name = decoder.decode(&self.var_name); + let var_name = Identifier::new(var_name.trim_end(), decoder.encoding) + .map_err(Warning::InvalidLongStringValueLabelName)?; + + let mut labels = Vec::with_capacity(self.labels.len()); + for (value, label) in self.labels.iter() { + let value = decoder.decode_exact_length(&value.0).to_string(); + let label = decoder.decode(label).to_string(); + labels.push((value, label)); + } + + Ok(LongStringValueLabels { + var_name, + width: self.width, + labels, + }) + } +} + #[derive(Clone, Debug)] -pub struct LongStringValueLabelRecord(pub Vec>) +pub struct LongStringValueLabelRecord(pub Vec>) where + N: Debug, S: Debug; -impl ExtensionRecord for LongStringValueLabelRecord { +impl ExtensionRecord for LongStringValueLabelRecord { const SUBTYPE: u32 = 21; const SIZE: Option = Some(1); const COUNT: Option = None; const NAME: &'static str = "long string value labels record"; - fn parse(ext: &Extension, endian: Endian) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -2651,3 +2871,16 @@ impl ExtensionRecord for LongStringValueLabelRecord { ))) } } + +impl LongStringValueLabelRecord { + fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord { + let mut labels = Vec::with_capacity(self.0.len()); + for label in &self.0 { + match label.decode(decoder) { + Ok(set) => labels.push(set), + Err(error) => decoder.warn(error), + } + } + LongStringValueLabelRecord(labels) + } +}