From 85c5363cf5b8f7bb95652d209ba6d46ef806ceab Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 5 Aug 2023 14:02:55 -0700 Subject: [PATCH] more work on parser --- doc/dev/system-file-format.texi | 4 +- rust/src/lib.rs | 9 ++ rust/src/raw.rs | 161 ++++++++++++++++++++++++++++++-- 3 files changed, 165 insertions(+), 9 deletions(-) diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index db8af7e374..a12c353d96 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -1511,7 +1511,7 @@ int32 var_name_len; char var_name[]; char n_missing_values; int32 value_len; -char values[values_len * n_missing_values]; +char values[value_len * n_missing_values]; @end example @table @code @@ -1545,7 +1545,7 @@ definition), only the first 8 bytes of a long string variable's missing values are allowed to be non-spaces, and any spaces within the first 8 bytes are included in the missing value here. -@item char values[values_len * n_missing_values] +@item char values[value_len * n_missing_values] The missing values themselves, without any padding or null terminators. @end table diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 680d3a3a51..8a4c7d0704 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -92,4 +92,13 @@ pub enum Error { #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 }, + + #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] + BadEncodingName { offset: u64 }, + + #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")] + BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 }, + + #[error("This file has corrupted metadata written by a buggy version of PSPP. To fix it, save a new copy of the file.")] + BadLongMissingValueFormat, } diff --git a/rust/src/raw.rs b/rust/src/raw.rs index e017f74ac4..48ff3ee5fa 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -759,11 +759,14 @@ enum ExtensionType { } */ -trait ExtensionRecord where Self: Sized { +trait ExtensionRecord +where + Self: Sized, +{ const SIZE: Option; const COUNT: Option; const NAME: &'static str; - fn parse(ext: &Extension, endian: Endian) -> Result; + fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result; } pub struct IntegerInfo { @@ -780,7 +783,7 @@ impl ExtensionRecord for IntegerInfo { const COUNT: Option = Some(8); const NAME: &'static str = "integer record"; - fn parse(ext: &Extension, endian: Endian) -> Result{ + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -793,7 +796,7 @@ impl ExtensionRecord for IntegerInfo { floating_point_rep: data[4], compression_code: data[5], endianness: data[6], - character_code: data[7] + character_code: data[7], }) } } @@ -809,7 +812,7 @@ impl ExtensionRecord for FloatInfo { const COUNT: Option = Some(3); const NAME: &'static str = "floating point record"; - fn parse(ext: &Extension, endian: Endian) -> Result{ + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; @@ -824,6 +827,119 @@ impl ExtensionRecord for FloatInfo { } } +pub struct VarDisplayRecord(Vec); + +impl ExtensionRecord for VarDisplayRecord { + const SIZE: Option = Some(4); + const COUNT: Option = None; + const NAME: &'static str = "variable display record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let display = (0..ext.count) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(VarDisplayRecord(display)) + } +} + +pub struct LongStringValueLabels { + pub var_name: Vec, + pub width: u32, + + /// `(value, label)` pairs, where each value is `width` bytes. + pub labels: Vec<(Vec, Vec)>, +} + +pub struct LongStringValueLabelSet(Vec); + +impl ExtensionRecord for LongStringValueLabelSet { + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string value labels record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut label_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let width: u32 = endian.parse(read_bytes(&mut input)?); + let n_labels: u32 = endian.parse(read_bytes(&mut input)?); + let mut labels = Vec::new(); + for _ in 0..n_labels { + let value = read_string(&mut input, endian)?; + let label = read_string(&mut input, endian)?; + labels.push((value, label)); + } + label_set.push(LongStringValueLabels { + var_name, + width, + labels, + }) + } + Ok(LongStringValueLabelSet(label_set)) + } +} + +pub struct LongStringMissingValues { + /// Variable name. + pub var_name: Vec, + + /// Up to three missing values. + pub missing_values: Vec<[u8; 8]>, +} + +pub struct LongStringMissingValueSet(Vec); + +impl ExtensionRecord for LongStringMissingValueSet { + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string missing values record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut missing_value_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?); + let value_len: u32 = endian.parse(read_bytes(&mut input)?); + if value_len != 8 { + let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset; + return Err(Error::BadLongMissingValueLength { + record_offset: ext.offset, + offset, + value_len, + }); + } + let mut missing_values = Vec::new(); + for i in 0..n_missing_values { + let value: [u8; 8] = read_bytes(&mut input)?; + let numeric_value: u64 = endian.parse(value); + let value = if i > 0 && numeric_value == 8 { + // Tolerate files written by old, buggy versions of PSPP + // where we believed that the value_length was repeated + // before each missing value. + read_bytes(&mut input)? + } else { + value + }; + missing_values.push(value); + } + missing_value_set.push(LongStringMissingValues { + var_name, + missing_values, + }); + } + Ok(LongStringMissingValueSet(missing_value_set)) + } +} + pub struct Encoding(pub String); impl ExtensionRecord for Encoding { @@ -831,10 +947,36 @@ impl ExtensionRecord for Encoding { const COUNT: Option = None; const NAME: &'static str = "encoding record"; - fn parse(ext: &Extension, endian: Endian) -> Result{ + fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + Ok(Encoding(String::from_utf8(ext.data.clone()).map_err( + |_| Error::BadEncodingName { offset: ext.offset }, + )?)) + } +} + +pub struct NumberOfCasesRecord { + /// Always observed as 1. + one: u64, + + /// Number of cases. + n_cases: u64, +} + +impl ExtensionRecord for NumberOfCasesRecord { + const SIZE: Option = Some(8); + const COUNT: Option = Some(2); + const NAME: &'static str = "extended number of cases record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { ext.check_size::()?; - Ok(Encoding(String::from_utf8(ext.data)?)) + let mut input = &ext.data[..]; + let one = endian.parse(read_bytes(&mut input)?); + let n_cases = endian.parse(read_bytes(&mut input)?); + + Ok(NumberOfCasesRecord { one, n_cases }) } } @@ -1070,3 +1212,8 @@ fn read_vec(r: &mut R, n: usize) -> Result, IoError> { r.read_exact(&mut vec)?; Ok(vec) } + +fn read_string(r: &mut R, endian: Endian) -> Result, IoError> { + let length: u32 = endian.parse(read_bytes(r)?); + read_vec(r, length as usize) +} -- 2.30.2