From bb850126810785d9d4308890390fad1b68535305 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 16 Nov 2023 15:42:35 -0800 Subject: [PATCH] Work --- rust/src/raw.rs | 191 +++++++++++++++++++++++++----------------------- 1 file changed, 99 insertions(+), 92 deletions(-) diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 345a719e47..db508ae93f 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -44,7 +44,11 @@ pub enum Error { BadRecordType { offset: u64, rec_type: u32 }, #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")] - BadVariableLabelCode { start_offset: u64, code_offset: u64, code: u32 }, + BadVariableLabelCode { + start_offset: u64, + code_offset: u64, + code: u32, + }, #[error( "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." @@ -142,6 +146,7 @@ pub enum Record { VarDisplay(VarDisplayRecord), MultipleResponse(MultipleResponseRecord), LongStringValueLabels(LongStringValueLabelRecord), + LongStringMissingValues(LongStringMissingValueSet), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), ProductInfo(TextRecord), @@ -175,7 +180,7 @@ impl Record { // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it // decoded as Latin-1 (actually bytes interpreted as Unicode code points). -fn default_decode<>(s: &[u8]) -> Cow { +fn default_decode(s: &[u8]) -> Cow { from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) } @@ -938,7 +943,8 @@ impl VariableRecord { } }; - let missing_values = MissingValues::read(r, start_offset, width, missing_value_code, endian)?; + let missing_values = + MissingValues::read(r, start_offset, width, missing_value_code, endian)?; let end_offset = r.stream_position()?; @@ -1018,17 +1024,12 @@ impl Debug for UnencodedStr { #[derive(Clone)] pub struct ValueLabelRecord { - /// Offset from the start of the file to the start of the value label - /// record. - pub label_offset: u64, + /// Range of offsets in file. + pub offsets: Range, /// The labels. pub labels: Vec<(UntypedValue, UnencodedString)>, - /// Offset from the start of the file to the start of the variable index - /// record. - pub index_offset: u64, - /// The 1-based indexes of the variable indexes. pub dict_indexes: Vec, } @@ -1047,6 +1048,12 @@ impl Debug for ValueLabelRecord { } } +impl Header for ValueLabelRecord { + fn offsets(&self) -> Range { + self.offsets.clone() + } +} + impl ValueLabelRecord { /// Maximum number of value labels in a record. pub const MAX_LABELS: u32 = u32::MAX / 8; @@ -1099,10 +1106,10 @@ impl ValueLabelRecord { dict_indexes.push(endian.parse(read_bytes(r)?)); } + let end_offset = r.stream_position()?; Ok(ValueLabelRecord { - label_offset, + offsets: label_offset..end_offset, labels, - index_offset, dict_indexes, }) } @@ -1110,8 +1117,7 @@ impl ValueLabelRecord { #[derive(Clone, Debug)] pub struct DocumentRecord { - /// Offset from the start of the file to the start of the record. - pub pos: u64, + pub offsets: Range, /// The document, as an array of 80-byte lines. pub lines: Vec, @@ -1129,39 +1135,46 @@ impl DocumentRecord { pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN; fn read(r: &mut R, endian: Endian) -> Result { - let offset = r.stream_position()?; + let start_offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); let n = n as usize; if n > Self::MAX_LINES { Err(Error::BadDocumentLength { - offset, + offset: start_offset, n, max: Self::MAX_LINES, }) } else { - let pos = r.stream_position()?; let mut lines = Vec::with_capacity(n); for _ in 0..n { lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?)); } - Ok(DocumentRecord { pos, lines }) + let end_offset = r.stream_position()?; + Ok(DocumentRecord { + offsets: start_offset..end_offset, + lines, + }) } } } -trait ExtensionRecord -where - Self: Sized, -{ +impl Header for DocumentRecord { + fn offsets(&self) -> Range { + self.offsets.clone() + } +} + +trait ExtensionRecord { const SUBTYPE: u32; const SIZE: Option; const COUNT: Option; const NAME: &'static str; - fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result; + fn parse(ext: &Extension, endian: Endian) -> Result; } #[derive(Clone, Debug)] pub struct IntegerInfoRecord { + pub offsets: Range, pub version: (i32, i32, i32), pub machine_code: i32, pub floating_point_rep: i32, @@ -1176,21 +1189,22 @@ impl ExtensionRecord for IntegerInfoRecord { const COUNT: Option = Some(8); const NAME: &'static str = "integer record"; - fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; let data: Vec = (0..8) .map(|_| endian.parse(read_bytes(&mut input).unwrap())) .collect(); - Ok(IntegerInfoRecord { + Ok(Record::IntegerInfo(IntegerInfoRecord { + offsets: ext.offsets.clone(), version: (data[0], data[1], data[2]), machine_code: data[3], floating_point_rep: data[4], compression_code: data[5], endianness: data[6], character_code: data[7], - }) + })) } } @@ -1207,18 +1221,18 @@ impl ExtensionRecord for FloatInfoRecord { const COUNT: Option = Some(3); const NAME: &'static str = "floating point record"; - fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; let data: Vec = (0..3) .map(|_| endian.parse(read_bytes(&mut input).unwrap())) .collect(); - Ok(FloatInfoRecord { + Ok(Record::FloatInfo(FloatInfoRecord { sysmis: data[0], highest: data[1], lowest: data[2], - }) + })) } } @@ -1265,10 +1279,7 @@ impl MultipleResponseType { }; let (value, input) = parse_counted_string(input)?; ( - MultipleResponseType::MultipleDichotomy { - value, - labels, - }, + MultipleResponseType::MultipleDichotomy { value, labels }, input, ) } @@ -1335,7 +1346,7 @@ impl ExtensionRecord for MultipleResponseRecord { const COUNT: Option = None; const NAME: &'static str = "multiple response set record"; - fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result { + fn parse(ext: &Extension, _endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -1345,7 +1356,7 @@ impl ExtensionRecord for MultipleResponseRecord { sets.push(set); input = rest; } - Ok(MultipleResponseRecord(sets)) + Ok(Record::MultipleResponse(MultipleResponseRecord(sets))) } } @@ -1378,17 +1389,18 @@ impl ExtensionRecord for VarDisplayRecord { const COUNT: Option = None; const NAME: &'static str = "variable display record"; - fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; let display = (0..ext.count) .map(|_| endian.parse(read_bytes(&mut input).unwrap())) .collect(); - Ok(VarDisplayRecord(display)) + Ok(Record::VarDisplay(VarDisplayRecord(display))) } } +#[derive(Clone, Debug)] pub struct LongStringMissingValues { /// Variable name. pub var_name: UnencodedString, @@ -1397,6 +1409,7 @@ pub struct LongStringMissingValues { pub missing_values: MissingValues, } +#[derive(Clone, Debug)] pub struct LongStringMissingValueSet(Vec); impl ExtensionRecord for LongStringMissingValueSet { @@ -1405,7 +1418,7 @@ impl ExtensionRecord for LongStringMissingValueSet { const COUNT: Option = None; const NAME: &'static str = "long string missing values record"; - fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -1415,9 +1428,9 @@ impl ExtensionRecord for LongStringMissingValueSet { let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?); let value_len: u32 = endian.parse(read_bytes(&mut input)?); if value_len != 8 { - let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset; + let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start; return Err(Error::BadLongMissingValueLength { - record_offset: ext.offset, + record_offset: ext.offsets.start, offset, value_len, }); @@ -1445,7 +1458,9 @@ impl ExtensionRecord for LongStringMissingValueSet { missing_values, }); } - Ok(LongStringMissingValueSet(missing_value_set)) + Ok(Record::LongStringMissingValues(LongStringMissingValueSet( + missing_value_set, + ))) } } @@ -1458,13 +1473,14 @@ impl ExtensionRecord for EncodingRecord { const COUNT: Option = None; const NAME: &'static str = "encoding record"; - fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result { + fn parse(ext: &Extension, _endian: Endian) -> Result { ext.check_size::()?; - Ok(EncodingRecord( - String::from_utf8(ext.data.clone()) - .map_err(|_| Error::BadEncodingName { offset: ext.offset })?, - )) + Ok(Record::Encoding(EncodingRecord( + String::from_utf8(ext.data.clone()).map_err(|_| Error::BadEncodingName { + offset: ext.offsets.start, + })?, + ))) } } @@ -1483,21 +1499,20 @@ impl ExtensionRecord for NumberOfCasesRecord { const COUNT: Option = Some(2); const NAME: &'static str = "extended number of cases record"; - fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; let one = endian.parse(read_bytes(&mut input)?); let n_cases = endian.parse(read_bytes(&mut input)?); - Ok(NumberOfCasesRecord { one, n_cases }) + Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases })) } } #[derive(Clone, Debug)] pub struct TextRecord { - /// Offset from the start of the file to the start of the record. - pub offset: u64, + pub offsets: Range, /// The text content of the record. pub text: UnencodedString, @@ -1506,7 +1521,7 @@ pub struct TextRecord { impl From for TextRecord { fn from(source: Extension) -> Self { TextRecord { - offset: source.offset, + offsets: source.offsets, text: source.data.into(), } } @@ -1514,8 +1529,7 @@ impl From for TextRecord { #[derive(Clone, Debug)] pub struct Extension { - /// Offset from the start of the file to the start of the record. - pub offset: u64, + pub offsets: Range, /// Record subtype. pub subtype: u32, @@ -1535,7 +1549,7 @@ impl Extension { if let Some(expected_size) = E::SIZE { if self.size != expected_size { return Err(Error::BadRecordSize { - offset: self.offset, + offset: self.offsets.start, record: E::NAME.into(), size: self.size, expected_size, @@ -1545,7 +1559,7 @@ impl Extension { if let Some(expected_count) = E::COUNT { if self.count != expected_count { return Err(Error::BadRecordCount { - offset: self.offset, + offset: self.offsets.start, record: E::NAME.into(), count: self.count, expected_count, @@ -1557,58 +1571,49 @@ impl Extension { fn read(r: &mut R, endian: Endian) -> Result { let subtype = endian.parse(read_bytes(r)?); - let offset = r.stream_position()?; + let header_offset = r.stream_position()?; let size: u32 = endian.parse(read_bytes(r)?); let count = endian.parse(read_bytes(r)?); let Some(product) = size.checked_mul(count) else { return Err(Error::ExtensionRecordTooLarge { - offset, + offset: header_offset, subtype, size, count, }); }; - let offset = r.stream_position()?; + let start_offset = r.stream_position()?; let data = read_vec(r, product as usize)?; + let end_offset = start_offset + product as u64; let extension = Extension { - offset, + offsets: start_offset..end_offset, subtype, size, count, data, }; match subtype { - IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse( - &extension, - endian, - |_| (), - )?)), - FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse( - &extension, - endian, - |_| (), - )?)), - VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse( - &extension, - endian, - |_| (), - )?)), - MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse( - MultipleResponseRecord::parse(&extension, endian, |_| ())?, - )), - LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels( - LongStringValueLabelRecord::parse(&extension, endian, |_| ())?, - )), - EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse( - &extension, - endian, - |_| (), - )?)), - NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse( - &extension, - endian, - |_| (), - )?)), + IntegerInfoRecord::SUBTYPE => Ok(IntegerInfoRecord::parse( + &extension, endian, + )?), + FloatInfoRecord::SUBTYPE => Ok(FloatInfoRecord::parse( + &extension, endian, + )?), + VarDisplayRecord::SUBTYPE => Ok(VarDisplayRecord::parse( + &extension, endian, + )?), + MultipleResponseRecord::SUBTYPE | 19 => Ok( + MultipleResponseRecord::parse(&extension, endian)?, + ), + LongStringValueLabelRecord::SUBTYPE => Ok( + LongStringValueLabelRecord::parse(&extension, endian)?, + ), + EncodingRecord::SUBTYPE => { + Ok(EncodingRecord::parse(&extension, endian)?) + } + NumberOfCasesRecord::SUBTYPE => Ok(NumberOfCasesRecord::parse( + &extension, endian, + )?), 5 => Ok(Record::VariableSets(extension.into())), 10 => Ok(Record::ProductInfo(extension.into())), 13 => Ok(Record::LongNames(extension.into())), @@ -1784,7 +1789,7 @@ impl ExtensionRecord for LongStringValueLabelRecord { const COUNT: Option = None; const NAME: &'static str = "long string value labels record"; - fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + fn parse(ext: &Extension, endian: Endian) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -1805,6 +1810,8 @@ impl ExtensionRecord for LongStringValueLabelRecord { labels, }) } - Ok(LongStringValueLabelRecord(label_set)) + Ok(Record::LongStringValueLabels(LongStringValueLabelRecord( + label_set, + ))) } } -- 2.30.2