X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Fraw.rs;h=ac8b3a057027cc6649a39c99d294014102da7cec;hb=a8331d2f67af24ce1f9f5da99641b8d1cdc21300;hp=ab3ad844d9a3cee79b731e00733ebb0a2e0145f3;hpb=168475a9c04b570944e03eb7eda05f7dfd946213;p=pspp diff --git a/rust/src/raw.rs b/rust/src/raw.rs index ab3ad844d9..ac8b3a0570 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,5 +1,4 @@ use crate::endian::{Endian, Parse, ToBytes}; -use crate::{CategoryLabels, Compression}; use encoding_rs::mem::decode_latin1; use flate2::read::ZlibDecoder; @@ -56,6 +55,9 @@ pub enum Error { #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, + #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")] + ExpectedVarIndexRecord { offset: u64, rec_type: u32 }, + #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, @@ -97,13 +99,27 @@ pub enum Error { }, #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] - BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 }, + BadRecordSize { + offset: u64, + record: String, + size: u32, + expected_size: u32, + }, #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] - BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 }, + BadRecordCount { + offset: u64, + record: String, + count: u32, + expected_count: u32, + }, #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")] - BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 }, + BadLongMissingValueLength { + record_offset: u64, + offset: u64, + value_len: u32, + }, #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] BadEncodingName { offset: u64 }, @@ -114,25 +130,23 @@ pub enum Error { #[derive(Clone, Debug)] pub enum Record { - Header(Header), - Variable(Variable), - ValueLabel(ValueLabel), - VarIndexes(VarIndexes), - Document(Document), - IntegerInfo(IntegerInfo), - FloatInfo(FloatInfo), - VariableSets(UnencodedString), + Header(HeaderRecord), + Variable(VariableRecord), + ValueLabel(ValueLabelRecord), + Document(DocumentRecord), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VariableSets(TextRecord), VarDisplay(VarDisplayRecord), MultipleResponse(MultipleResponseRecord), LongStringValueLabels(LongStringValueLabelRecord), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), - ProductInfo(UnencodedString), - LongNames(UnencodedString), - LongStrings(UnencodedString), - FileAttributes(UnencodedString), - VariableAttributes(UnencodedString), - TextExtension(TextExtension), + ProductInfo(TextRecord), + LongNames(TextRecord), + VeryLongStrings(TextRecord), + FileAttributes(TextRecord), + VariableAttributes(TextRecord), OtherExtension(Extension), EndOfHeaders(u32), ZHeader(ZHeader), @@ -144,10 +158,9 @@ impl Record { fn read(reader: &mut R, endian: Endian) -> Result { let rec_type: u32 = endian.parse(read_bytes(reader)?); match rec_type { - 2 => Ok(Record::Variable(Variable::read(reader, endian)?)), - 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)), - 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)), - 6 => Ok(Record::Document(Document::read(reader, endian)?)), + 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)), + 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)), + 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)), 7 => Ok(Extension::read(reader, endian)?), 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))), _ => Err(Error::BadRecordType { @@ -164,8 +177,14 @@ fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> { from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) } +#[derive(Copy, Clone, Debug)] +pub enum Compression { + Simple, + ZLib, +} + #[derive(Clone)] -pub struct Header { +pub struct HeaderRecord { /// Magic number. pub magic: Magic, @@ -183,7 +202,7 @@ pub struct Header { /// Compression type, if any, pub compression: Option, - /// 0-based variable index of the weight variable, or `None` if the file is + /// 1-based variable index of the weight variable, or `None` if the file is /// unweighted. pub weight_index: Option, @@ -206,13 +225,13 @@ pub struct Header { pub endian: Endian, } -impl Header { +impl HeaderRecord { fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult { writeln!(f, "{name:>17}: {:?}", value) } } -impl Debug for Header { +impl Debug for HeaderRecord { fn fmt(&self, f: &mut Formatter) -> FmtResult { writeln!(f, "File header record:")?; self.debug_field(f, "Magic", self.magic)?; @@ -230,8 +249,8 @@ impl Debug for Header { } } -impl Header { - fn read(r: &mut R) -> Result { +impl HeaderRecord { + fn read(r: &mut R) -> Result { let magic: [u8; 4] = read_bytes(r)?; let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; @@ -256,7 +275,7 @@ impl Header { }; let weight_index: u32 = endian.parse(read_bytes(r)?); - let weight_index = (weight_index > 0).then(|| weight_index - 1); + let weight_index = (weight_index > 0).then_some(weight_index); let n_cases: u32 = endian.parse(read_bytes(r)?); let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); @@ -268,7 +287,7 @@ impl Header { let file_label = UnencodedStr::<64>(read_bytes(r)?); let _: [u8; 3] = read_bytes(r)?; - Ok(Header { + Ok(HeaderRecord { magic, layout_code, nominal_case_size, @@ -324,16 +343,16 @@ impl TryFrom<[u8; 4]> for Magic { } } -#[derive(Copy, Clone, PartialEq, Eq, Hash)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum VarType { - Number, + Numeric, String, } impl VarType { fn from_width(width: i32) -> VarType { match width { - 0 => VarType::Number, + 0 => VarType::Numeric, _ => VarType::String, } } @@ -341,8 +360,8 @@ impl VarType { mod state { use super::{ - Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer, - ZlibDecodeMultiple, + Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader, + ZTrailer, ZlibDecodeMultiple, }; use crate::endian::Endian; use std::{ @@ -373,7 +392,7 @@ mod state { impl State for Start { fn read(mut self: Box) -> Result)>, Error> { - let header = Header::read(&mut self.reader)?; + let header = HeaderRecord::read(&mut self.reader)?; let next_state = Headers(CommonState { reader: self.reader, endian: header.endian, @@ -391,7 +410,7 @@ mod state { fn read(mut self: Box) -> Result)>, Error> { let record = Record::read(&mut self.0.reader, self.0.endian)?; match record { - Record::Variable(Variable { width, .. }) => { + Record::Variable(VariableRecord { width, .. }) => { self.0.var_types.push(VarType::from_width(width)); } Record::EndOfHeaders(_) => { @@ -500,14 +519,18 @@ impl Debug for Value { impl Value { fn read(r: &mut R, var_type: VarType, endian: Endian) -> Result { - Ok(Self::from_raw(var_type, read_bytes(r)?, endian)) + Ok(Self::from_raw( + UntypedValue(read_bytes(r)?), + var_type, + endian, + )) } - pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value { + pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value { match var_type { - VarType::String => Value::String(UnencodedStr(raw)), - VarType::Number => { - let number: f64 = endian.parse(raw); + VarType::String => Value::String(UnencodedStr(raw.0)), + VarType::Numeric => { + let number: f64 = endian.parse(raw.0); Value::Number((number != -f64::MAX).then_some(number)) } } @@ -533,7 +556,7 @@ impl Value { }); } }; - values.push(Value::from_raw(var_type, raw, endian)); + values.push(Value::from_raw(UntypedValue(raw), var_type, endian)); } Ok(Some(values)) } @@ -567,7 +590,7 @@ impl Value { match code { 0 => (), 1..=251 => match var_type { - VarType::Number => break Value::Number(Some(code as f64 - bias)), + VarType::Numeric => break Value::Number(Some(code as f64 - bias)), VarType::String => { break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias))) } @@ -583,10 +606,12 @@ impl Value { }); } } - 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian), + 253 => { + break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian) + } 254 => match var_type { VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC - VarType::Number => { + VarType::Numeric => { return Err(Error::CompressedStringExpected { offset: case_start, case_ofs: reader.stream_position()? - case_start, @@ -594,7 +619,7 @@ impl Value { } }, 255 => match var_type { - VarType::Number => break Value::Number(None), + VarType::Numeric => break Value::Number(None), VarType::String => { return Err(Error::CompressedNumberExpected { offset: case_start, @@ -744,8 +769,9 @@ fn format_name(type_: u32) -> Cow<'static, str> { 39 => "SDATE", 40 => "MTIME", 41 => "YMDHMS", - _ => return format!("").into() - }.into() + _ => return format!("").into(), + } + .into() } #[derive(Clone)] @@ -819,7 +845,7 @@ impl MissingValues { } #[derive(Clone)] -pub struct Variable { +pub struct VariableRecord { /// Offset from the start of the file to the start of the record. pub offset: u64, @@ -842,7 +868,7 @@ pub struct Variable { pub label: Option, } -impl Debug for Variable { +impl Debug for VariableRecord { fn fmt(&self, f: &mut Formatter) -> FmtResult { writeln!( f, @@ -864,8 +890,8 @@ impl Debug for Variable { } } -impl Variable { - fn read(r: &mut R, endian: Endian) -> Result { +impl VariableRecord { + fn read(r: &mut R, endian: Endian) -> Result { let offset = r.stream_position()?; let width: i32 = endian.parse(read_bytes(r)?); let has_variable_label: u32 = endian.parse(read_bytes(r)?); @@ -896,7 +922,7 @@ impl Variable { let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?; - Ok(Variable { + Ok(VariableRecord { offset, width, name, @@ -971,35 +997,51 @@ impl Debug for UnencodedStr { } #[derive(Clone)] -pub struct ValueLabel { - /// Offset from the start of the file to the start of the record. - pub offset: u64, +pub struct ValueLabelRecord { + /// Offset from the start of the file to the start of the value label + /// record. + pub label_offset: u64, /// The labels. pub labels: Vec<(UntypedValue, UnencodedString)>, + + /// Offset from the start of the file to the start of the variable index + /// record. + pub index_offset: u64, + + /// The 1-based indexes of the variable indexes. + pub dict_indexes: Vec, } -impl Debug for ValueLabel { +impl Debug for ValueLabelRecord { fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!(f, "labels: ")?; for (value, label) in self.labels.iter() { writeln!(f, "{value:?}: {label:?}")?; } + write!(f, "apply to variables")?; + for dict_index in self.dict_indexes.iter() { + write!(f, " #{dict_index}")?; + } Ok(()) } } -impl ValueLabel { +impl ValueLabelRecord { /// Maximum number of value labels in a record. - pub const MAX: u32 = u32::MAX / 8; + pub const MAX_LABELS: u32 = u32::MAX / 8; - fn read(r: &mut R, endian: Endian) -> Result { - let offset = r.stream_position()?; + /// Maximum number of variable indexes in a record. + pub const MAX_INDEXES: u32 = u32::MAX / 8; + + fn read(r: &mut R, endian: Endian) -> Result { + let label_offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); - if n > ValueLabel::MAX { + if n > Self::MAX_LABELS { return Err(Error::BadNumberOfValueLabels { - offset, + offset: label_offset, n, - max: ValueLabel::MAX, + max: Self::MAX_LABELS, }); } @@ -1014,67 +1056,50 @@ impl ValueLabel { label.truncate(label_len); labels.push((value, UnencodedString(label))); } - Ok(ValueLabel { offset, labels }) - } -} - -#[derive(Clone)] -pub struct VarIndexes { - /// Offset from the start of the file to the start of the record. - pub offset: u64, - /// The 0-based indexes of the variable indexes. - pub var_indexes: Vec, -} - -impl Debug for VarIndexes { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "apply to variables")?; - for var_index in self.var_indexes.iter() { - write!(f, " #{var_index}")?; + let index_offset = r.stream_position()?; + let rec_type: u32 = endian.parse(read_bytes(r)?); + if rec_type != 4 { + return Err(Error::ExpectedVarIndexRecord { + offset: index_offset, + rec_type, + }); } - Ok(()) - } -} -impl VarIndexes { - /// Maximum number of variable indexes in a record. - pub const MAX: u32 = u32::MAX / 8; - - fn read(r: &mut R, endian: Endian) -> Result { - let offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); - if n > VarIndexes::MAX { + if n > Self::MAX_INDEXES { return Err(Error::BadNumberOfVarIndexes { - offset, + offset: index_offset, n, - max: VarIndexes::MAX, + max: Self::MAX_INDEXES, }); } - let mut var_indexes = Vec::with_capacity(n as usize); + let mut dict_indexes = Vec::with_capacity(n as usize); for _ in 0..n { - var_indexes.push(endian.parse(read_bytes(r)?)); + dict_indexes.push(endian.parse(read_bytes(r)?)); } - Ok(VarIndexes { - offset, - var_indexes, + Ok(ValueLabelRecord { + label_offset, + labels, + index_offset, + dict_indexes, }) } } #[derive(Clone, Debug)] -pub struct Document { +pub struct DocumentRecord { /// Offset from the start of the file to the start of the record. pub pos: u64, /// The document, as an array of 80-byte lines. - pub lines: Vec + pub lines: Vec, } -pub type DocumentLine = UnencodedStr<{Document::LINE_LEN}>; +pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>; -impl Document { +impl DocumentRecord { /// Length of a line in a document. Document lines are fixed-length and /// padded on the right with spaces. pub const LINE_LEN: usize = 80; @@ -1083,7 +1108,7 @@ impl Document { /// the maximum number that will fit in a 32-bit space. pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN; - fn read(r: &mut R, endian: Endian) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { let offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); let n = n as usize; @@ -1097,9 +1122,9 @@ impl Document { let pos = r.stream_position()?; let mut lines = Vec::with_capacity(n); for _ in 0..n { - lines.push(UnencodedStr::<{Document::LINE_LEN}>(read_bytes(r)?)); + lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?)); } - Ok(Document { pos, lines }) + Ok(DocumentRecord { pos, lines }) } } } @@ -1116,7 +1141,7 @@ where } #[derive(Clone, Debug)] -pub struct IntegerInfo { +pub struct IntegerInfoRecord { pub version: (i32, i32, i32), pub machine_code: i32, pub floating_point_rep: i32, @@ -1125,7 +1150,7 @@ pub struct IntegerInfo { pub character_code: i32, } -impl ExtensionRecord for IntegerInfo { +impl ExtensionRecord for IntegerInfoRecord { const SUBTYPE: u32 = 3; const SIZE: Option = Some(4); const COUNT: Option = Some(8); @@ -1138,7 +1163,7 @@ impl ExtensionRecord for IntegerInfo { let data: Vec = (0..8) .map(|_| endian.parse(read_bytes(&mut input).unwrap())) .collect(); - Ok(IntegerInfo { + Ok(IntegerInfoRecord { version: (data[0], data[1], data[2]), machine_code: data[3], floating_point_rep: data[4], @@ -1150,13 +1175,13 @@ impl ExtensionRecord for IntegerInfo { } #[derive(Clone, Debug)] -pub struct FloatInfo { +pub struct FloatInfoRecord { pub sysmis: f64, pub highest: f64, pub lowest: f64, } -impl ExtensionRecord for FloatInfo { +impl ExtensionRecord for FloatInfoRecord { const SUBTYPE: u32 = 4; const SIZE: Option = Some(8); const COUNT: Option = Some(3); @@ -1169,7 +1194,7 @@ impl ExtensionRecord for FloatInfo { let data: Vec = (0..3) .map(|_| endian.parse(read_bytes(&mut input).unwrap())) .collect(); - Ok(FloatInfo { + Ok(FloatInfoRecord { sysmis: data[0], highest: data[1], lowest: data[2], @@ -1177,6 +1202,12 @@ impl ExtensionRecord for FloatInfo { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CategoryLabels { + VarLabels, + CountedValues, +} + #[derive(Clone, Debug)] pub enum MultipleResponseType { MultipleDichotomy { @@ -1185,20 +1216,9 @@ pub enum MultipleResponseType { }, MultipleCategory, } -#[derive(Clone, Debug)] -pub struct MultipleResponseSet { - pub name: UnencodedString, - pub label: UnencodedString, - pub mr_type: MultipleResponseType, - pub vars: Vec, -} -impl MultipleResponseSet { - fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> { - let Some(equals) = input.iter().position(|&b| b == b'=') else { - return Err(Error::TBD); - }; - let (name, input) = input.split_at(equals); +impl MultipleResponseType { + fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> { let (mr_type, input) = match input.get(0) { Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]), Some(b'D') => { @@ -1234,6 +1254,25 @@ impl MultipleResponseSet { } _ => return Err(Error::TBD), }; + Ok((mr_type, input)) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet { + pub name: UnencodedString, + pub label: UnencodedString, + pub mr_type: MultipleResponseType, + pub short_names: Vec, +} + +impl MultipleResponseSet { + fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> { + let Some(equals) = input.iter().position(|&b| b == b'=') else { + return Err(Error::TBD); + }; + let (name, input) = input.split_at(equals); + let (mr_type, input) = MultipleResponseType::parse(input)?; let Some(b' ') = input.get(0) else { return Err(Error::TBD); }; @@ -1260,7 +1299,7 @@ impl MultipleResponseSet { name: name.into(), label: label.into(), mr_type, - vars, + short_names: vars, }, input, )) @@ -1268,7 +1307,7 @@ impl MultipleResponseSet { } #[derive(Clone, Debug)] -pub struct MultipleResponseRecord(Vec); +pub struct MultipleResponseRecord(pub Vec); impl ExtensionRecord for MultipleResponseRecord { const SUBTYPE: u32 = 7; @@ -1409,7 +1448,6 @@ impl ExtensionRecord for EncodingRecord { } } - #[derive(Clone, Debug)] pub struct NumberOfCasesRecord { /// Always observed as 1. @@ -1436,20 +1474,22 @@ impl ExtensionRecord for NumberOfCasesRecord { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub enum TextExtensionSubtype { - VariableSets = 5, - ProductInfo = 10, - LongNames = 13, - LongStrings = 14, - FileAttributes = 17, - VariableAttributes = 18, +#[derive(Clone, Debug)] +pub struct TextRecord { + /// Offset from the start of the file to the start of the record. + pub offset: u64, + + /// The text content of the record. + pub text: UnencodedString, } -#[derive(Clone, Debug)] -pub struct TextExtension { - pub subtype: TextExtensionSubtype, - pub string: UnencodedString, +impl From for TextRecord { + fn from(source: Extension) -> Self { + TextRecord { + offset: source.offset, + text: source.data.into(), + } + } } #[derive(Clone, Debug)] @@ -1518,12 +1558,12 @@ impl Extension { data, }; match subtype { - IntegerInfo::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfo::parse( + IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse( &extension, endian, |_| (), )?)), - FloatInfo::SUBTYPE => Ok(Record::FloatInfo(FloatInfo::parse( + FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse( &extension, endian, |_| (), @@ -1549,24 +1589,12 @@ impl Extension { endian, |_| (), )?)), - x if x == TextExtensionSubtype::VariableSets as u32 => { - Ok(Record::VariableSets(UnencodedString(extension.data))) - } - x if x == TextExtensionSubtype::ProductInfo as u32 => { - Ok(Record::ProductInfo(UnencodedString(extension.data))) - } - x if x == TextExtensionSubtype::LongNames as u32 => { - Ok(Record::LongNames(UnencodedString(extension.data))) - } - x if x == TextExtensionSubtype::LongStrings as u32 => { - Ok(Record::LongStrings(UnencodedString(extension.data))) - } - x if x == TextExtensionSubtype::FileAttributes as u32 => { - Ok(Record::FileAttributes(UnencodedString(extension.data))) - } - x if x == TextExtensionSubtype::VariableAttributes as u32 => { - Ok(Record::VariableAttributes(UnencodedString(extension.data))) - } + 5 => Ok(Record::VariableSets(extension.into())), + 10 => Ok(Record::ProductInfo(extension.into())), + 13 => Ok(Record::LongNames(extension.into())), + 14 => Ok(Record::VeryLongStrings(extension.into())), + 17 => Ok(Record::FileAttributes(extension.into())), + 18 => Ok(Record::VariableAttributes(extension.into())), _ => Ok(Record::OtherExtension(extension)), } } @@ -1728,7 +1756,7 @@ pub struct LongStringValueLabels { } #[derive(Clone, Debug)] -pub struct LongStringValueLabelRecord(Vec); +pub struct LongStringValueLabelRecord(pub Vec); impl ExtensionRecord for LongStringValueLabelRecord { const SUBTYPE: u32 = 21; @@ -1760,4 +1788,3 @@ impl ExtensionRecord for LongStringValueLabelRecord { Ok(LongStringValueLabelRecord(label_set)) } } -