From: Ben Pfaff Date: Mon, 24 Jul 2023 16:02:56 +0000 (-0700) Subject: work on rust X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e56b75378779551cff64de941bf29bd502c7ed15;p=pspp work on rust --- diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 09889b3500..38a454c4b7 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -62,7 +62,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -194,6 +194,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-derive" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.27", +] + [[package]] name = "num-integer" version = "0.1.45" @@ -229,9 +240,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" dependencies = [ "autocfg", ] @@ -257,7 +268,7 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn", + "syn 1.0.109", "version_check", ] @@ -274,9 +285,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.51" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" dependencies = [ "unicode-ident", ] @@ -289,14 +300,16 @@ dependencies = [ "clap", "hexplay", "num", + "num-derive", + "num-traits", "thiserror", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" dependencies = [ "proc-macro2", ] @@ -332,6 +345,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn" +version = "2.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "termcolor" version = "0.3.6" @@ -367,7 +391,7 @@ checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 09324d1dfd..d276b1ae18 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -9,6 +9,8 @@ anyhow = "1.0.69" clap = { version = "4.1.7", features = ["derive"] } hexplay = "0.2.1" num = "0.4.0" +num-derive = "0.4.0" +num-traits = "0.2.16" thiserror = "1.0" [[bin]] @@ -16,4 +18,4 @@ name = "pspp-dump-sav" path = "src/main.rs" [lib] -path = "src/lib.rs" \ No newline at end of file +path = "src/lib.rs" diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 26db62dc11..2bde62b758 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,6 +1,7 @@ #![allow(unused_variables)] use endian::{Endian, Parse}; use num::Integer; +use num_derive::FromPrimitive; use std::io::{BufReader, Error as IoError, Read, Seek}; use thiserror::Error; @@ -51,6 +52,23 @@ pub enum Error { #[error("At offset {offset:#x}, number of variables associated with a value label ({n}) is not between 1 and the number of variables ({max}).")] BadNumberOfValueLabelVariables { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] + ExtensionRecordTooLarge { + offset: u64, + subtype: u32, + size: u32, + count: u32, + }, + + #[error("Wrong ZLIB data header offset {zheader_offset:#x} (expected {offset:#x}).")] + BadZlibHeaderOffset { offset: u64, zheader_offset: u64 }, + + #[error("At offset {offset:#x}, impossible ZLIB trailer offset {ztrailer_offset:#x}.")] + BadZlibTrailerOffset { offset: u64, ztrailer_offset: u64 }, + + #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")] + BadZlibTrailerLen { offset: u64, ztrailer_len: u64 }, } #[derive(Error, Debug)] @@ -70,12 +88,11 @@ pub enum Compression { pub struct Reader { r: BufReader, - - document_record: Option, - + documents: Vec, variables: Vec, - value_labels: Vec, + extensions: Vec, + zheader: Option, } /// Magic number for a regular system file. @@ -124,49 +141,42 @@ pub struct FileHeader { pub file_label: [u8; 64], } -pub const DOC_LINE_LEN: u32 = 80; -pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN; - impl Reader { pub fn new(r: R, warn: impl Fn(Warning)) -> Result, Error> { let mut r = BufReader::new(r); let header = read_header(&mut r, &warn)?; let e = header.endianness; - let mut document_record = None; + let mut documents = Vec::new(); let mut variables = Vec::new(); let mut value_labels = Vec::new(); + let mut extensions = Vec::new(); loop { let offset = r.stream_position()?; let rec_type: u32 = e.parse(read_bytes(&mut r)?); match rec_type { 2 => variables.push(read_variable_record(&mut r, e)?), 3 => value_labels.push(read_value_label_record(&mut r, e, variables.len())?), - // A Type 4 record is always immediately after a type 3 record, - // the code for type 3 records reads the type 4 record too. 4 => return Err(Error::MisplacedType4Record(offset)), - - 6 => { - let d = read_document_record(&mut r, e)?; - if document_record.is_some() { - warn(Warning::DuplicateDocumentRecord); - } else { - document_record = d; - } - } - /* - 7 => d.read_extension_record()?, - */ + 6 => documents.push(read_document_record(&mut r, e)?), + 7 => extensions.push(read_extension_record(&mut r, e)?), 999 => break, _ => return Err(Error::BadRecordType { offset, rec_type }), } } + let _: [u8; 4] = read_bytes(&mut r)?; + let zheader = match header.is_zsav { + true => Some(read_zheader(&mut r, e)?), + false => None, + }; Ok(Reader { r, - document_record, + documents, variables, value_labels, + extensions, + zheader, }) } } @@ -390,6 +400,9 @@ fn read_value_label_record( }) } +pub const DOC_LINE_LEN: u32 = 80; +pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN; + pub struct DocumentRecord { /// Offset from the start of the file to the start of the record. pub pos: u64, @@ -401,28 +414,184 @@ pub struct DocumentRecord { fn read_document_record( r: &mut BufReader, e: Endian, -) -> Result, Error> { +) -> Result { let offset = r.stream_position()?; let n: u32 = e.parse(read_bytes(r)?); - if n == 0 { - Ok(None) - } else if n > DOC_MAX_LINES { - Err(Error::BadDocumentLength { + match n { + 0..=DOC_MAX_LINES => { + let pos = r.stream_position()?; + let mut lines = Vec::with_capacity(n as usize); + for _ in 0..n { + let line: [u8; 80] = read_bytes(r)?; + lines.push(line); + } + Ok(DocumentRecord { pos, lines }) + } + _ => Err(Error::BadDocumentLength { offset, n, max: DOC_MAX_LINES, - }) - } else { - let pos = r.stream_position()?; - let mut lines = Vec::with_capacity(n as usize); - for i in 0..n { - let line: [u8; 80] = read_bytes(r)?; - lines.push(line); - } - Ok(Some(DocumentRecord { pos, lines })) + }), + } +} + +#[derive(FromPrimitive)] +enum Extension { + /// Machine integer info. + Integer = 3, + /// Machine floating-point info. + Float = 4, + /// Variable sets. + VarSets = 5, + /// DATE. + Date = 6, + /// Multiple response sets. + Mrsets = 7, + /// SPSS Data Entry. + DataEntry = 8, + /// Extra product info text. + ProductInfo = 10, + /// Variable display parameters. + Display = 11, + /// Long variable names. + LongNames = 13, + /// Long strings. + LongStrings = 14, + /// Extended number of cases. + Ncases = 16, + /// Data file attributes. + FileAttrs = 17, + /// Variable attributes. + VarAttrs = 18, + /// Multiple response sets (extended). + Mrsets2 = 19, + /// Character encoding. + Encoding = 20, + /// Value labels for long strings. + LongLabels = 21, + /// Missing values for long strings. + LongMissing = 22, + /// "Format properties in dataview table". + Dataview = 24, +} + +struct ExtensionRecord { + /// Offset from the start of the file to the start of the record. + offset: u64, + + /// Record subtype. + subtype: u32, + + /// Size of each data element. + size: u32, + + /// Number of data elements. + count: u32, + + /// `size * count` bytes of data. + data: Vec, +} + +fn extension_record_size_requirements(extension: Extension) -> (u32, u32) { + match extension { + /* Implemented record types. */ + Extension::Integer => (4, 8), + Extension::Float => (8, 3), + Extension::VarSets => (1, 0), + Extension::Mrsets => (1, 0), + Extension::ProductInfo => (1, 0), + Extension::Display => (4, 0), + Extension::LongNames => (1, 0), + Extension::LongStrings => (1, 0), + Extension::Ncases => (8, 2), + Extension::FileAttrs => (1, 0), + Extension::VarAttrs => (1, 0), + Extension::Mrsets2 => (1, 0), + Extension::Encoding => (1, 0), + Extension::LongLabels => (1, 0), + Extension::LongMissing => (1, 0), + + /* Ignored record types. */ + Extension::Date => (0, 0), + Extension::DataEntry => (0, 0), + Extension::Dataview => (0, 0), } } +fn read_extension_record( + r: &mut BufReader, + e: Endian, +) -> Result { + let subtype = e.parse(read_bytes(r)?); + let offset = r.stream_position()?; + let size: u32 = e.parse(read_bytes(r)?); + let count = e.parse(read_bytes(r)?); + let Some(product) = size.checked_mul(count) else { + return Err(Error::ExtensionRecordTooLarge { + offset, + subtype, + size, + count, + }); + }; + let offset = r.stream_position()?; + let data = read_vec(r, product as usize)?; + Ok(ExtensionRecord { + offset, + subtype, + size, + count, + data, + }) +} + +struct ZHeader { + /// File offset to the start of the record. + offset: u64, + + /// File offset to the ZLIB data header. + zheader_offset: u64, + + /// File offset to the ZLIB trailer. + ztrailer_offset: u64, + + /// Length of the ZLIB trailer in bytes. + ztrailer_len: u64, +} + +fn read_zheader(r: &mut BufReader, e: Endian) -> Result { + let offset = r.stream_position()?; + let zheader_offset: u64 = e.parse(read_bytes(r)?); + let ztrailer_offset: u64 = e.parse(read_bytes(r)?); + let ztrailer_len: u64 = e.parse(read_bytes(r)?); + + if zheader_offset != offset { + return Err(Error::BadZlibHeaderOffset { + offset, + zheader_offset, + }); + } + if ztrailer_offset < offset { + return Err(Error::BadZlibTrailerOffset { + offset, + ztrailer_offset, + }); + } + if ztrailer_len < 24 || ztrailer_len % 24 != 0 { + return Err(Error::BadZlibTrailerLen { + offset, + ztrailer_len, + }); + } + + Ok(ZHeader { + offset, + zheader_offset, + ztrailer_offset, + ztrailer_len, + }) +} + fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { let mut buf = [0; N]; r.read_exact(&mut buf)?;