From 818db07841e1410219f3c6c5adf383bda060ac83 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 28 Jul 2023 13:19:45 -0700 Subject: [PATCH] work (zlib works?) --- rust/src/lib.rs | 359 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 336 insertions(+), 23 deletions(-) diff --git a/rust/src/lib.rs b/rust/src/lib.rs index d237f81b80..963e4ef448 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,6 +1,6 @@ #![allow(unused_variables)] use endian::{Endian, Parse, ToBytes}; -//use flate2::bufread::ZlibDecoder; +use flate2::bufread::ZlibDecoder; use num::Integer; use num_derive::FromPrimitive; use std::{ @@ -99,7 +99,12 @@ pub enum Error { CompressedStringExpected { offset: u64, case_ofs: u64 }, #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] - BadZlibTrailerNBlocks { offset: u64, n_blocks: u32, expected_n_blocks: u64, ztrailer_len: u64 } + BadZlibTrailerNBlocks { + offset: u64, + n_blocks: u32, + expected_n_blocks: u64, + ztrailer_len: u64, + }, } #[derive(Error, Debug)] @@ -213,19 +218,292 @@ impl VarType { } } -pub struct Reader { +pub struct Reader { r: BufReader, var_types: Vec, state: ReaderState, } +trait State { + fn read(self: Box) -> Result)>, Error>; +} + +struct Start { + r: BufReader, +} + +impl State for Start { + fn read(mut self: Box) -> Result)>, Error> { + let header = read_header(&mut self.r)?; + Ok(Some((Record::Header(header), self))) + } +} + +struct Headers { + reader: BufReader, + endian: Endian, + compression: Option, + var_types: Vec, +} + +impl State for Headers { + fn read(mut self: Box) -> Result)>, Error> { + let rec_type: u32 = self.endian.parse(read_bytes(&mut self.reader)?); + let record = match rec_type { + 2 => { + let variable = read_variable_record(&mut self.reader, self.endian)?; + self.var_types.push(VarType::from_width(variable.width)); + Record::Variable(variable) + } + 3 => Record::ValueLabel(read_value_label_record(&mut self.reader, self.endian)?), + 4 => Record::VarIndexes(read_var_indexes_record(&mut self.reader, self.endian)?), + 6 => Record::Document(read_document_record(&mut self.reader, self.endian)?), + 7 => Record::Extension(read_extension_record(&mut self.reader, self.endian)?), + 999 => { + let _: [u8; 4] = read_bytes(&mut self.reader)?; + let next_state: Box = match self.compression { + None => Box::new(Data { + reader: self.reader, + endian: self.endian, + var_types: self.var_types, + }), + Some(Compression::Simple) => Box::new(CompressedData { + reader: self.reader, + endian: self.endian, + var_types: self.var_types, + codes: VecDeque::new(), + }), + Some(Compression::ZLib) => Box::new(ZlibData { + reader: ZlibDecodeMultiple::new(self.reader), + endian: self.endian, + var_types: self.var_types, + codes: VecDeque::new(), + }), + }; + return Ok(Some((Record::EndOfHeaders, next_state))); + } + _ => { + return Err(Error::BadRecordType { + offset: self.reader.stream_position()?, + rec_type, + }) + } + }; + Ok(Some((record, self))) + } +} + +struct Data { + reader: BufReader, + endian: Endian, + var_types: Vec, +} + +impl State for Data { + fn read(mut self: Box) -> Result)>, Error> { + let case_start = self.reader.stream_position()?; + let mut values = Vec::with_capacity(self.var_types.len()); + for (i, &var_type) in self.var_types.iter().enumerate() { + let Some(raw) = try_read_bytes(&mut self.reader)? else { + if i == 0 { + return Ok(None); + } else { + let offset = self.reader.stream_position()?; + return Err(Error::EofInCase { + offset, + case_ofs: offset - case_start, + case_len: self.var_types.len() * 8, + }); + } + }; + values.push(Value::from_raw(var_type, raw, self.endian)); + } + Ok(Some((Record::Case(values), self))) + } +} + +struct CompressedData { + reader: BufReader, + endian: Endian, + var_types: Vec, + codes: VecDeque, +} + +fn read_compressed_data( + reader: &mut R, + endian: Endian, + var_types: &Vec, + codes: &mut VecDeque, +) -> Result, Error> +where + R: Read + Seek, +{ + let case_start = reader.stream_position()?; + let mut values = Vec::with_capacity(var_types.len()); + let bias = 100.0; // XXX + for (i, &var_type) in var_types.iter().enumerate() { + let value = loop { + let Some(code) = codes.pop_front() else { + let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else { + if i == 0 { + return Ok(None); + } else { + let offset = reader.stream_position()?; + return Err(Error::EofInCompressedCase { + offset, + case_ofs: offset - case_start, + }); + } + }; + codes.extend(new_codes.into_iter()); + continue; + }; + match code { + 0 => (), + 1..=251 => match var_type { + VarType::Number => break Value::Number(Some(code as f64 - bias)), + VarType::String => break Value::String(endian.to_bytes(code as f64 - bias)), + }, + 252 => { + if i == 0 { + return Ok(None); + } else { + let offset = reader.stream_position()?; + return Err(Error::PartialCompressedCase { + offset, + case_ofs: offset - case_start, + }); + } + } + 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian), + 254 => match var_type { + VarType::String => break Value::String(*b" "), // XXX EBCDIC + VarType::Number => { + return Err(Error::CompressedStringExpected { + offset: case_start, + case_ofs: reader.stream_position()? - case_start, + }) + } + }, + 255 => match var_type { + VarType::Number => break Value::Number(None), + VarType::String => { + return Err(Error::CompressedNumberExpected { + offset: case_start, + case_ofs: reader.stream_position()? - case_start, + }) + } + }, + } + }; + values.push(value); + } + Ok(Some(Record::Case(values))) +} + +impl State for CompressedData { + fn read(mut self: Box) -> Result)>, Error> { + match read_compressed_data( + &mut self.reader, + self.endian, + &self.var_types, + &mut self.codes, + )? { + None => Ok(None), + Some(record) => Ok(Some((record, self))), + } + } +} + +struct ZlibData { + reader: ZlibDecodeMultiple, + endian: Endian, + var_types: Vec, + codes: VecDeque, +} + +impl State for ZlibData { + fn read(mut self: Box) -> Result)>, Error> { + match read_compressed_data( + &mut self.reader, + self.endian, + &self.var_types, + &mut self.codes, + )? { + None => Ok(None), + Some(record) => Ok(Some((record, self))), + } + } +} + +struct ZlibDecodeMultiple +where + R: Read + Seek, +{ + reader: Option>>, +} + +impl ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn new(reader: BufReader) -> ZlibDecodeMultiple { + ZlibDecodeMultiple { + reader: Some(ZlibDecoder::new(reader)), + } + } +} + +impl Read for ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn read(&mut self, buf: &mut [u8]) -> Result { + loop { + match self.reader.as_mut().unwrap().read(buf)? { + 0 => { + let inner = self.reader.take().unwrap().into_inner(); + self.reader = Some(ZlibDecoder::new(inner)); + } + n => return Ok(n), + }; + } + } +} + +impl Seek for ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn seek(&mut self, pos: SeekFrom) -> Result { + unimplemented!(); + } +} + +/* +impl BufRead for ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn fill_buf(&mut self) -> Result<&[u8], IoError> { + self.reader.as_mut().unwrap().fill_buf() + } + fn consume(&mut self, amt: usize) { + self.reader.as_mut().unwrap().consume(amt) + } +}*/ + enum ReaderState { Start, Headers(Endian, Option), Data(Endian), CompressedData(Endian, VecDeque), ZHeader(Endian), - ZTrailer { endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64 }, + ZTrailer { + endian: Endian, + ztrailer_ofs: u64, + ztrailer_len: u64, + }, //ZData, End, } @@ -304,7 +582,11 @@ impl Reader { return Ok(None); } else { let offset = self.r.stream_position()?; - return Err(Error::EofInCase { offset, case_ofs: offset - case_start, case_len: self.var_types.len() * 8}); + return Err(Error::EofInCase { + offset, + case_ofs: offset - case_start, + case_len: self.var_types.len() * 8, + }); } }; values.push(Value::from_raw(var_type, raw, endian)); @@ -318,12 +600,16 @@ impl Reader { for (i, &var_type) in self.var_types.iter().enumerate() { let value = loop { let Some(code) = codes.pop_front() else { - let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.r)? else { + let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.r)? + else { if i == 0 { return Ok(None); } else { let offset = self.r.stream_position()?; - return Err(Error::EofInCompressedCase { offset, case_ofs: offset - case_start}); + return Err(Error::EofInCompressedCase { + offset, + case_ofs: offset - case_start, + }); } }; codes.extend(new_codes.into_iter()); @@ -377,23 +663,29 @@ impl Reader { } ReaderState::ZHeader(endian) => { let zheader = read_zheader(&mut self.r, endian)?; - self.state = ReaderState::ZTrailer { endian, ztrailer_ofs: zheader.ztrailer_offset, ztrailer_len: zheader.ztrailer_len}; + self.state = ReaderState::ZTrailer { + endian, + ztrailer_ofs: zheader.ztrailer_offset, + ztrailer_len: zheader.ztrailer_len, + }; Ok(Some(Record::ZHeader(zheader))) } - ReaderState::ZTrailer { endian, ztrailer_ofs, ztrailer_len } => { + ReaderState::ZTrailer { + endian, + ztrailer_ofs, + ztrailer_len, + } => { //self.state = ReaderState::ZData; match read_ztrailer(&mut self.r, endian, ztrailer_ofs, ztrailer_len)? { - Some(ztrailer) => { - Ok(Some(Record::ZTrailer(ztrailer))) - }, - None => self._next() + Some(ztrailer) => Ok(Some(Record::ZTrailer(ztrailer))), + None => self._next(), } } -/* - ReaderState::ZData(zlib_decoder) => { - let zlib_decoder = zlib_decoder.unwrap_or_else( - }, -*/ + /* + ReaderState::ZData(zlib_decoder) => { + let zlib_decoder = zlib_decoder.unwrap_or_else( + }, + */ ReaderState::End => Ok(None), } } @@ -850,10 +1142,15 @@ pub struct ZBlock { pub compressed_size: u32, } -fn read_ztrailer(r: &mut BufReader, endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64) -> Result, Error> { +fn read_ztrailer( + r: &mut BufReader, + endian: Endian, + ztrailer_ofs: u64, + ztrailer_len: u64, +) -> Result, Error> { let start_offset = r.stream_position()?; if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() { - return Ok(None) + return Ok(None); } let int_bias = endian.parse(read_bytes(r)?); let zero = endian.parse(read_bytes(r)?); @@ -861,7 +1158,12 @@ fn read_ztrailer(r: &mut BufReader, endian: Endian, ztrailer_ let n_blocks: u32 = endian.parse(read_bytes(r)?); let expected_n_blocks = (ztrailer_len - 24) / 24; if n_blocks as u64 != expected_n_blocks { - return Err(Error::BadZlibTrailerNBlocks { offset: ztrailer_ofs, n_blocks, expected_n_blocks, ztrailer_len }) + return Err(Error::BadZlibTrailerNBlocks { + offset: ztrailer_ofs, + n_blocks, + expected_n_blocks, + ztrailer_len, + }); } let mut blocks = Vec::with_capacity(n_blocks as usize); for _ in 0..n_blocks { @@ -869,10 +1171,21 @@ fn read_ztrailer(r: &mut BufReader, endian: Endian, ztrailer_ let compressed_ofs = endian.parse(read_bytes(r)?); let uncompressed_size = endian.parse(read_bytes(r)?); let compressed_size = endian.parse(read_bytes(r)?); - blocks.push(ZBlock { uncompressed_ofs, compressed_ofs, uncompressed_size, compressed_size }); + blocks.push(ZBlock { + uncompressed_ofs, + compressed_ofs, + uncompressed_size, + compressed_size, + }); } r.seek(SeekFrom::Start(start_offset))?; - Ok(Some(ZTrailer { offset: ztrailer_ofs, int_bias, zero, block_size, blocks })) + Ok(Some(ZTrailer { + offset: ztrailer_ofs, + int_bias, + zero, + block_size, + blocks, + })) } fn try_read_bytes(r: &mut R) -> Result, IoError> { -- 2.30.2