From 853329fd7c721a0b430a80abe0308d5eb1b1002c Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 28 Jul 2023 06:38:56 -0700 Subject: [PATCH] work --- rust/Cargo.lock | 41 ++++++++++ rust/Cargo.toml | 1 + rust/src/lib.rs | 164 ++++++++++++++++++++++++++++--------- src/data/sys-file-reader.c | 1 + 4 files changed, 167 insertions(+), 40 deletions(-) diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 38a454c4b7..950e262847 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "anyhow" version = "1.0.69" @@ -37,6 +43,12 @@ version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "clap" version = "4.1.7" @@ -74,6 +86,15 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + [[package]] name = "errno" version = "0.2.8" @@ -95,6 +116,16 @@ dependencies = [ "libc", ] +[[package]] +name = "flate2" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "heck" version = "0.4.1" @@ -160,6 +191,15 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + [[package]] name = "num" version = "0.4.0" @@ -298,6 +338,7 @@ version = "1.0.0" dependencies = [ "anyhow", "clap", + "flate2", "hexplay", "num", "num-derive", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index d276b1ae18..a58a5fafa6 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -7,6 +7,7 @@ authors = [ "Ben Pfaff", "John Darrington" ] [dependencies] anyhow = "1.0.69" clap = { version = "4.1.7", features = ["derive"] } +flate2 = "1.0.26" hexplay = "0.2.1" num = "0.4.0" num-derive = "0.4.0" diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 354b7c0950..d237f81b80 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,10 +1,11 @@ #![allow(unused_variables)] use endian::{Endian, Parse, ToBytes}; +//use flate2::bufread::ZlibDecoder; use num::Integer; use num_derive::FromPrimitive; use std::{ collections::VecDeque, - io::{BufReader, Error as IoError, Read, Seek}, + io::{BufReader, Error as IoError, Read, Seek, SeekFrom}, }; use thiserror::Error; @@ -96,6 +97,9 @@ pub enum Error { #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] CompressedStringExpected { offset: u64, case_ofs: u64 }, + + #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] + BadZlibTrailerNBlocks { offset: u64, n_blocks: u32, expected_n_blocks: u64, ztrailer_len: u64 } } #[derive(Error, Debug)] @@ -121,6 +125,8 @@ pub enum Record { VarIndexes(VarIndexes), Extension(Extension), EndOfHeaders, + ZHeader(ZHeader), + ZTrailer(ZTrailer), Case(Vec), } @@ -218,6 +224,9 @@ enum ReaderState { Headers(Endian, Option), Data(Endian), CompressedData(Endian, VecDeque), + ZHeader(Endian), + ZTrailer { endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64 }, + //ZData, End, } @@ -273,7 +282,7 @@ impl Reader { Some(Compression::Simple) => { ReaderState::CompressedData(endian, VecDeque::new()) } - _ => ReaderState::End, + Some(Compression::ZLib) => ReaderState::ZHeader(endian), }; return Ok(Some(Record::EndOfHeaders)); } @@ -339,11 +348,9 @@ impl Reader { }); } } - 253 => break Value::from_raw( - var_type, - read_bytes(&mut self.r)?, - endian, - ), + 253 => { + break Value::from_raw(var_type, read_bytes(&mut self.r)?, endian) + } 254 => match var_type { VarType::String => break Value::String(*b" "), // XXX EBCDIC VarType::Number => { @@ -358,15 +365,35 @@ impl Reader { VarType::String => { return Err(Error::CompressedNumberExpected { offset: case_start, - case_ofs: self.r.stream_position()? - case_start,}) + case_ofs: self.r.stream_position()? - case_start, + }) } - } + }, } }; values.push(value); } Ok(Some(Record::Case(values))) } + ReaderState::ZHeader(endian) => { + let zheader = read_zheader(&mut self.r, endian)?; + self.state = ReaderState::ZTrailer { endian, ztrailer_ofs: zheader.ztrailer_offset, ztrailer_len: zheader.ztrailer_len}; + Ok(Some(Record::ZHeader(zheader))) + } + ReaderState::ZTrailer { endian, ztrailer_ofs, ztrailer_len } => { + //self.state = ReaderState::ZData; + match read_ztrailer(&mut self.r, endian, ztrailer_ofs, ztrailer_len)? { + Some(ztrailer) => { + Ok(Some(Record::ZTrailer(ztrailer))) + }, + None => self._next() + } + } +/* + ReaderState::ZData(zlib_decoder) => { + let zlib_decoder = zlib_decoder.unwrap_or_else( + }, +*/ ReaderState::End => Ok(None), } } @@ -382,9 +409,7 @@ impl Iterator for Reader { self.state = ReaderState::End; None } - Ok(Some(record)) => { - Some(Ok(record)) - } + Ok(Some(record)) => Some(Ok(record)), Err(error) => { self.state = ReaderState::End; Some(Err(error)) @@ -474,20 +499,20 @@ pub struct Variable { fn read_variable_record( r: &mut BufReader, - e: Endian, + endian: Endian, ) -> Result { let offset = r.stream_position()?; - let width: i32 = e.parse(read_bytes(r)?); - let has_variable_label: u32 = e.parse(read_bytes(r)?); - let missing_value_code: i32 = e.parse(read_bytes(r)?); - let print_format: u32 = e.parse(read_bytes(r)?); - let write_format: u32 = e.parse(read_bytes(r)?); + let width: i32 = endian.parse(read_bytes(r)?); + let has_variable_label: u32 = endian.parse(read_bytes(r)?); + let missing_value_code: i32 = endian.parse(read_bytes(r)?); + let print_format: u32 = endian.parse(read_bytes(r)?); + let write_format: u32 = endian.parse(read_bytes(r)?); let name: [u8; 8] = read_bytes(r)?; let label = match has_variable_label { 0 => None, 1 => { - let len: u32 = e.parse(read_bytes(r)?); + let len: u32 = endian.parse(read_bytes(r)?); let read_len = len.min(65535) as usize; let label = Some(read_vec(r, read_len)?); @@ -555,10 +580,10 @@ impl ValueLabel { fn read_value_label_record( r: &mut BufReader, - e: Endian, + endian: Endian, ) -> Result { let offset = r.stream_position()?; - let n: u32 = e.parse(read_bytes(r)?); + let n: u32 = endian.parse(read_bytes(r)?); if n > ValueLabel::MAX { return Err(Error::BadNumberOfValueLabels { offset, @@ -570,7 +595,7 @@ fn read_value_label_record( let mut labels = Vec::new(); for _ in 0..n { let value: [u8; 8] = read_bytes(r)?; - let label_len: u8 = e.parse(read_bytes(r)?); + let label_len: u8 = endian.parse(read_bytes(r)?); let label_len = label_len as usize; let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); @@ -596,10 +621,10 @@ impl VarIndexes { fn read_var_indexes_record( r: &mut BufReader, - e: Endian, + endian: Endian, ) -> Result { let offset = r.stream_position()?; - let n: u32 = e.parse(read_bytes(r)?); + let n: u32 = endian.parse(read_bytes(r)?); if n > VarIndexes::MAX { return Err(Error::BadNumberOfVarIndexes { offset, @@ -609,7 +634,7 @@ fn read_var_indexes_record( } let mut var_indexes = Vec::with_capacity(n as usize); for _ in 0..n { - var_indexes.push(e.parse(read_bytes(r)?)); + var_indexes.push(endian.parse(read_bytes(r)?)); } Ok(VarIndexes { @@ -631,10 +656,10 @@ pub struct Document { fn read_document_record( r: &mut BufReader, - e: Endian, + endian: Endian, ) -> Result { let offset = r.stream_position()?; - let n: u32 = e.parse(read_bytes(r)?); + let n: u32 = endian.parse(read_bytes(r)?); match n { 0..=DOC_MAX_LINES => { let pos = r.stream_position()?; @@ -738,12 +763,12 @@ fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) { fn read_extension_record( r: &mut BufReader, - e: Endian, + endian: Endian, ) -> Result { - let subtype = e.parse(read_bytes(r)?); + let subtype = endian.parse(read_bytes(r)?); let offset = r.stream_position()?; - let size: u32 = e.parse(read_bytes(r)?); - let count = e.parse(read_bytes(r)?); + let size: u32 = endian.parse(read_bytes(r)?); + let count = endian.parse(read_bytes(r)?); let Some(product) = size.checked_mul(count) else { return Err(Error::ExtensionRecordTooLarge { offset, @@ -763,25 +788,25 @@ fn read_extension_record( }) } -struct ZHeader { +pub struct ZHeader { /// File offset to the start of the record. - offset: u64, + pub offset: u64, /// File offset to the ZLIB data header. - zheader_offset: u64, + pub zheader_offset: u64, /// File offset to the ZLIB trailer. - ztrailer_offset: u64, + pub ztrailer_offset: u64, /// Length of the ZLIB trailer in bytes. - ztrailer_len: u64, + pub ztrailer_len: u64, } -fn read_zheader(r: &mut BufReader, e: Endian) -> Result { +fn read_zheader(r: &mut BufReader, endian: Endian) -> Result { let offset = r.stream_position()?; - let zheader_offset: u64 = e.parse(read_bytes(r)?); - let ztrailer_offset: u64 = e.parse(read_bytes(r)?); - let ztrailer_len: u64 = e.parse(read_bytes(r)?); + let zheader_offset: u64 = endian.parse(read_bytes(r)?); + let ztrailer_offset: u64 = endian.parse(read_bytes(r)?); + let ztrailer_len: u64 = endian.parse(read_bytes(r)?); Ok(ZHeader { offset, @@ -791,6 +816,65 @@ fn read_zheader(r: &mut BufReader, e: Endian) -> Result, +} + +pub struct ZBlock { + /// Offset of block of data if simple compression were used. + pub uncompressed_ofs: u64, + + /// Actual offset within the file of the compressed data block. + pub compressed_ofs: u64, + + /// The number of bytes in this data block after decompression. This is + /// `block_size` in every data block but the last, which may be smaller. + pub uncompressed_size: u32, + + /// The number of bytes in this data block, as stored compressed in this + /// file. + pub compressed_size: u32, +} + +fn read_ztrailer(r: &mut BufReader, endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64) -> Result, Error> { + let start_offset = r.stream_position()?; + if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() { + return Ok(None) + } + let int_bias = endian.parse(read_bytes(r)?); + let zero = endian.parse(read_bytes(r)?); + let block_size = endian.parse(read_bytes(r)?); + let n_blocks: u32 = endian.parse(read_bytes(r)?); + let expected_n_blocks = (ztrailer_len - 24) / 24; + if n_blocks as u64 != expected_n_blocks { + return Err(Error::BadZlibTrailerNBlocks { offset: ztrailer_ofs, n_blocks, expected_n_blocks, ztrailer_len }) + } + let mut blocks = Vec::with_capacity(n_blocks as usize); + for _ in 0..n_blocks { + let uncompressed_ofs = endian.parse(read_bytes(r)?); + let compressed_ofs = endian.parse(read_bytes(r)?); + let uncompressed_size = endian.parse(read_bytes(r)?); + let compressed_size = endian.parse(read_bytes(r)?); + blocks.push(ZBlock { uncompressed_ofs, compressed_ofs, uncompressed_size, compressed_size }); + } + r.seek(SeekFrom::Start(start_offset))?; + Ok(Some(ZTrailer { offset: ztrailer_ofs, int_bias, zero, block_size, blocks })) +} + fn try_read_bytes(r: &mut R) -> Result, IoError> { let mut buf = [0; N]; let n = r.read(&mut buf)?; diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 491df7f996..eb19db479b 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -3562,6 +3562,7 @@ read_zheader (struct sfm_reader *r) return false; } + /* XXX allow a value of 0 for these two? */ if (ztrailer_ofs < r->pos) { sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."), -- 2.30.2