From 34128124500f22989706bdf71230dd71a1eb4cfd Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 24 Jul 2023 19:48:39 -0700 Subject: [PATCH] work --- rust/src/lib.rs | 243 +++++++++++++++++++++++++++--------------------- 1 file changed, 136 insertions(+), 107 deletions(-) diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 64e9801447..becdb20ad5 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -53,8 +53,8 @@ pub enum Error { #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")] MissingVariableIndexRecord { offset: u64 }, - #[error("At offset {offset:#x}, number of variables associated with a value label ({n}) is not between 1 and the number of variables ({max}).")] - BadNumberOfValueLabelVariables { offset: u64, n: u32, max: u32 }, + #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] + BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] ExtensionRecordTooLarge { @@ -89,16 +89,17 @@ pub enum Compression { ZLib, } -pub struct Reader { - r: BufReader, - documents: Vec, - variables: Vec, - value_labels: Vec, - extensions: Vec, - zheader: Option, +pub enum Record { + Header(Header), + Document(Document), + Variable(Variable), + ValueLabel(ValueLabel), + VarIndexes(VarIndexes), + Extension(Extension), + EndOfHeaders, } -pub struct FileHeader { +pub struct Header { /// Magic number. pub magic: Magic, @@ -154,47 +155,67 @@ impl TryFrom<[u8; 4]> for Magic { } } +pub struct Reader { + r: BufReader, + state: u32, + endianness: Option, +} + impl Reader { - pub fn new(r: R, warn: impl Fn(Warning)) -> Result, Error> { - let mut r = BufReader::new(r); - - let header = read_header(&mut r, &warn)?; - let e = header.endianness; - let mut documents = Vec::new(); - let mut variables = Vec::new(); - let mut value_labels = Vec::new(); - let mut extensions = Vec::new(); - loop { - let offset = r.stream_position()?; - let rec_type: u32 = e.parse(read_bytes(&mut r)?); - match rec_type { - 2 => variables.push(read_variable_record(&mut r, e)?), - 3 => value_labels.push(read_value_label_record(&mut r, e, variables.len())?), - 4 => return Err(Error::MisplacedType4Record(offset)), - 6 => documents.push(read_document_record(&mut r, e)?), - 7 => extensions.push(read_extension_record(&mut r, e)?), - 999 => break, - _ => return Err(Error::BadRecordType { offset, rec_type }), + pub fn new(r: R) -> Result, Error> { + Ok(Reader { + r: BufReader::new(r), + state: 0, + endianness: None, + }) + } + + pub fn read(&mut self) -> Result, Error> { + let retval = self.do_read(); + match retval { + Ok(None) => { + self.state = u32::MAX; + Ok(None) } + Ok(Some((record, next_state))) => { + self.state = next_state; + Ok(Some(record)) + } + Err(error) => Err(error), } - let _: [u8; 4] = read_bytes(&mut r)?; - let zheader = match header.magic { - Magic::ZSAV => Some(read_zheader(&mut r, e)?), - _ => None, - }; + } - Ok(Reader { - r, - documents, - variables, - value_labels, - extensions, - zheader, - }) + pub fn do_read(&mut self) -> Result, Error> { + match self.state { + 0 => { + let header = read_header(&mut self.r)?; + self.endianness = Some(header.endianness); + Ok(Some((Record::Header(header), 1))) + } + 1 => { + let e = self.endianness.unwrap(); + let rec_type: u32 = e.parse(read_bytes(&mut self.r)?); + let record = match rec_type { + 2 => Record::Variable(read_variable_record(&mut self.r, e)?), + 3 => Record::ValueLabel(read_value_label_record(&mut self.r, e)?), + 4 => Record::VarIndexes(read_var_indexes_record(&mut self.r, e)?), + 6 => Record::Document(read_document_record(&mut self.r, e)?), + 7 => Record::Extension(read_extension_record(&mut self.r, e)?), + 999 => Record::EndOfHeaders, + _ => { + return Err(Error::BadRecordType { + offset: self.r.stream_position()?, + rec_type, + }) + } + }; + Ok(Some((record, 1))) + } + } } } -fn read_header(r: &mut R, warn: impl Fn(Warning)) -> Result { +fn read_header(r: &mut R) -> Result { let magic: [u8; 4] = read_bytes(r)?; let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; @@ -224,16 +245,13 @@ fn read_header(r: &mut R, warn: impl Fn(Warning)) -> Result(r: &mut R, warn: impl Fn(Warning)) -> Result( r: &mut BufReader, e: Endian, -) -> Result { +) -> Result { let offset = r.stream_position()?; let width: i32 = e.parse(read_bytes(r)?); let has_variable_label: u32 = e.parse(read_bytes(r)?); @@ -327,7 +345,7 @@ fn read_variable_record( } } - Ok(VariableRecord { + Ok(Variable { offset, width, name, @@ -339,31 +357,30 @@ fn read_variable_record( }) } -pub struct ValueLabelRecord { +pub struct ValueLabel { /// Offset from the start of the file to the start of the record. pub offset: u64, /// The labels. pub labels: Vec<([u8; 8], Vec)>, - - /// The 0-based indexes of the variables to which the labels are assigned. - pub var_indexes: Vec, } -pub const MAX_VALUE_LABELS: u32 = u32::MAX / 8; +impl ValueLabel { + /// Maximum number of value labels in a record. + pub const MAX: u32 = u32::MAX / 8; +} fn read_value_label_record( r: &mut BufReader, e: Endian, - n_var_records: usize, -) -> Result { +) -> Result { let offset = r.stream_position()?; let n: u32 = e.parse(read_bytes(r)?); - if n > MAX_VALUE_LABELS { + if n > ValueLabel::MAX { return Err(Error::BadNumberOfValueLabels { offset, n, - max: MAX_VALUE_LABELS, + max: ValueLabel::MAX, }); } @@ -378,30 +395,42 @@ fn read_value_label_record( label.truncate(label_len); labels.push((value, label)); } + Ok(ValueLabel { offset, labels }) +} - let rec_type: u32 = e.parse(read_bytes(r)?); - if rec_type != 4 { - return Err(Error::MissingVariableIndexRecord { - offset: r.stream_position()?, - }); - } +pub struct VarIndexes { + /// Offset from the start of the file to the start of the record. + pub offset: u64, + + /// The 0-based indexes of the variable indexes. + pub var_indexes: Vec, +} + +impl VarIndexes { + /// Maximum number of variable indexes in a record. + pub const MAX: u32 = u32::MAX / 8; +} - let n_vars: u32 = e.parse(read_bytes(r)?); - if n_vars < 1 || n_vars as usize > n_var_records { - return Err(Error::BadNumberOfValueLabelVariables { - offset: r.stream_position()?, - n: n_vars, - max: n_var_records as u32, +fn read_var_indexes_record( + r: &mut BufReader, + e: Endian, +) -> Result { + let offset = r.stream_position()?; + let n: u32 = e.parse(read_bytes(r)?); + if n > VarIndexes::MAX { + return Err(Error::BadNumberOfVarIndexes { + offset, + n, + max: VarIndexes::MAX, }); } - let mut var_indexes = Vec::with_capacity(n_vars as usize); - for _ in 0..n_vars { + let mut var_indexes = Vec::with_capacity(n as usize); + for _ in 0..n { var_indexes.push(e.parse(read_bytes(r)?)); } - Ok(ValueLabelRecord { + Ok(VarIndexes { offset, - labels, var_indexes, }) } @@ -409,7 +438,7 @@ fn read_value_label_record( pub const DOC_LINE_LEN: u32 = 80; pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN; -pub struct DocumentRecord { +pub struct Document { /// Offset from the start of the file to the start of the record. pub pos: u64, @@ -420,7 +449,7 @@ pub struct DocumentRecord { fn read_document_record( r: &mut BufReader, e: Endian, -) -> Result { +) -> Result { let offset = r.stream_position()?; let n: u32 = e.parse(read_bytes(r)?); match n { @@ -431,7 +460,7 @@ fn read_document_record( let line: [u8; 80] = read_bytes(r)?; lines.push(line); } - Ok(DocumentRecord { pos, lines }) + Ok(Document { pos, lines }) } _ => Err(Error::BadDocumentLength { offset, @@ -442,7 +471,7 @@ fn read_document_record( } #[derive(FromPrimitive)] -enum Extension { +enum ExtensionType { /// Machine integer info. Integer = 3, /// Machine floating-point info. @@ -481,53 +510,53 @@ enum Extension { Dataview = 24, } -struct ExtensionRecord { +pub struct Extension { /// Offset from the start of the file to the start of the record. - offset: u64, + pub offset: u64, /// Record subtype. - subtype: u32, + pub subtype: u32, /// Size of each data element. - size: u32, + pub size: u32, /// Number of data elements. - count: u32, + pub count: u32, /// `size * count` bytes of data. - data: Vec, + pub data: Vec, } -fn extension_record_size_requirements(extension: Extension) -> (u32, u32) { +fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) { match extension { /* Implemented record types. */ - Extension::Integer => (4, 8), - Extension::Float => (8, 3), - Extension::VarSets => (1, 0), - Extension::Mrsets => (1, 0), - Extension::ProductInfo => (1, 0), - Extension::Display => (4, 0), - Extension::LongNames => (1, 0), - Extension::LongStrings => (1, 0), - Extension::Ncases => (8, 2), - Extension::FileAttrs => (1, 0), - Extension::VarAttrs => (1, 0), - Extension::Mrsets2 => (1, 0), - Extension::Encoding => (1, 0), - Extension::LongLabels => (1, 0), - Extension::LongMissing => (1, 0), + ExtensionType::Integer => (4, 8), + ExtensionType::Float => (8, 3), + ExtensionType::VarSets => (1, 0), + ExtensionType::Mrsets => (1, 0), + ExtensionType::ProductInfo => (1, 0), + ExtensionType::Display => (4, 0), + ExtensionType::LongNames => (1, 0), + ExtensionType::LongStrings => (1, 0), + ExtensionType::Ncases => (8, 2), + ExtensionType::FileAttrs => (1, 0), + ExtensionType::VarAttrs => (1, 0), + ExtensionType::Mrsets2 => (1, 0), + ExtensionType::Encoding => (1, 0), + ExtensionType::LongLabels => (1, 0), + ExtensionType::LongMissing => (1, 0), /* Ignored record types. */ - Extension::Date => (0, 0), - Extension::DataEntry => (0, 0), - Extension::Dataview => (0, 0), + ExtensionType::Date => (0, 0), + ExtensionType::DataEntry => (0, 0), + ExtensionType::Dataview => (0, 0), } } fn read_extension_record( r: &mut BufReader, e: Endian, -) -> Result { +) -> Result { let subtype = e.parse(read_bytes(r)?); let offset = r.stream_position()?; let size: u32 = e.parse(read_bytes(r)?); @@ -542,7 +571,7 @@ fn read_extension_record( }; let offset = r.stream_position()?; let data = read_vec(r, product as usize)?; - Ok(ExtensionRecord { + Ok(Extension { offset, subtype, size, -- 2.30.2