From 692571bde4c05649467a631b7ab8fda556e7fe19 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 9 Dec 2023 13:07:41 -0800 Subject: [PATCH] work --- rust/src/cooked.rs | 2 +- rust/src/main.rs | 25 +-- rust/src/raw.rs | 390 +++++++++++++++++++++++---------------------- 3 files changed, 213 insertions(+), 204 deletions(-) diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index d00f3f3c34..0206b84b75 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -355,7 +355,7 @@ pub fn decode( raw::Record::EndOfHeaders(_) => (), raw::Record::ZHeader(_) => (), raw::Record::ZTrailer(_) => (), - raw::Record::Case(_) => (), + raw::Record::Cases(_) => (), }; } Ok(output) diff --git a/rust/src/main.rs b/rust/src/main.rs index 213b381a6e..45d0622f0d 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -95,25 +95,28 @@ fn dissect(file_name: &Path, max_cases: u64, mode: Mode, encoding: Option<&'stat return Ok(()) } Mode::Raw => { - let headers: Vec = reader.collect_headers()?; - for header in headers { - println!("{header:?}"); + for header in reader { + let header = header?; + println!("{:?}", header); + if let Record::Cases(cases) = header { + let mut cases = cases.borrow_mut(); + for _ in 0..max_cases { + let Some(Ok(record)) = cases.next() else { + break; + }; + println!("{:?}", record); + } + } } } Mode::Cooked => { - let headers: Vec = reader.collect_headers()?; - let headers = decode(headers, encoding, &|e| panic!("{e}"))?; + let headers: Vec = reader.collect::, _>>()?; + let headers = decode(headers, encoding, &|e| eprintln!("{e}"))?; for header in headers { println!("{header:?}"); } } } - for _ in 0..max_cases { - let Some(Ok(record)) = reader.next() else { - break; - }; - println!("{:?}", record); - } Ok(()) } diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 222a39b01d..8b69f760d6 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -3,20 +3,19 @@ use crate::endian::{Endian, Parse, ToBytes}; use encoding_rs::mem::decode_latin1; use flate2::read::ZlibDecoder; use num::Integer; -use std::borrow::Cow; -use std::cmp::Ordering; -use std::fmt::{Debug, Formatter, Result as FmtResult}; -use std::ops::Range; -use std::str::from_utf8; use std::{ + borrow::Cow, + cmp::Ordering, collections::VecDeque, + fmt::{Debug, Formatter, Result as FmtResult}, io::{Error as IoError, Read, Seek, SeekFrom}, - iter::FusedIterator, + mem::take, + ops::Range, + rc::Rc, + str::from_utf8, cell::RefCell, }; use thiserror::Error as ThisError; -use self::state::State; - #[derive(ThisError, Debug)] pub enum Error { #[error("Not an SPSS system file")] @@ -158,26 +157,27 @@ pub enum Record { EndOfHeaders(u32), ZHeader(ZHeader), ZTrailer(ZTrailer), - Case(Vec), + Cases(Rc>), } impl Record { - fn read(reader: &mut R, endian: Endian, warn: &Box) -> Result { - loop { - if let Some(record) = Self::_read(reader, endian, warn)? { - return Ok(record); - } - } - } - - fn _read(reader: &mut R, endian: Endian, warn: &Box) -> Result, Error> { + fn read( + reader: &mut R, + endian: Endian, + warn: &Box, + ) -> Result, Error> + where + R: Read + Seek, + { let rec_type: u32 = endian.parse(read_bytes(reader)?); match rec_type { 2 => Ok(Some(VariableRecord::read(reader, endian)?)), 3 => Ok(Some(ValueLabelRecord::read(reader, endian)?)), 6 => Ok(Some(DocumentRecord::read(reader, endian)?)), 7 => Extension::read(reader, endian, warn), - 999 => Ok(Some(Record::EndOfHeaders(endian.parse(read_bytes(reader)?)))), + 999 => Ok(Some(Record::EndOfHeaders( + endian.parse(read_bytes(reader)?), + ))), _ => Err(Error::BadRecordType { offset: reader.stream_position()?, rec_type, @@ -192,7 +192,7 @@ fn default_decode(s: &[u8]) -> Cow { from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) } -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum Compression { Simple, ZLib, @@ -398,154 +398,6 @@ impl VarType { } } -mod state { - use super::{ - Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader, - ZTrailer, ZlibDecodeMultiple, - }; - use crate::endian::Endian; - use std::{ - collections::VecDeque, - io::{Read, Seek}, - }; - - pub trait State { - #[allow(clippy::type_complexity)] - fn read(self: Box) -> Result)>, Error>; - } - - struct Start { - reader: R, - warn: Box - } - - pub fn new(reader: R, warn: F) -> Box { - Box::new(Start { reader, warn: Box::new(warn) }) - } - - struct CommonState { - reader: R, - warn: Box, - endian: Endian, - bias: f64, - compression: Option, - var_types: Vec, - } - - impl State for Start { - fn read(mut self: Box) -> Result)>, Error> { - let header = HeaderRecord::read(&mut self.reader)?; - let next_state = Headers(CommonState { - reader: self.reader, - warn: self.warn, - endian: header.endian, - bias: header.bias, - compression: header.compression, - var_types: Vec::new(), - }); - Ok(Some((Record::Header(header), Box::new(next_state)))) - } - } - - struct Headers(CommonState); - - impl State for Headers { - fn read(mut self: Box) -> Result)>, Error> { - let record = Record::read(&mut self.0.reader, self.0.endian, &self.0.warn)?; - match record { - Record::Variable(VariableRecord { width, .. }) => { - self.0.var_types.push(VarType::from_width(width)); - } - Record::EndOfHeaders(_) => { - let next_state: Box = match self.0.compression { - None => Box::new(Data(self.0)), - Some(Compression::Simple) => Box::new(CompressedData::new(self.0)), - Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)), - }; - return Ok(Some((record, next_state))); - } - _ => (), - }; - Ok(Some((record, self))) - } - } - - struct ZlibHeader(CommonState); - - impl State for ZlibHeader { - fn read(mut self: Box) -> Result)>, Error> { - let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?; - let next_state = Box::new(ZlibTrailer(self.0, zheader.clone())); - Ok(Some((Record::ZHeader(zheader), next_state))) - } - } - - struct ZlibTrailer(CommonState, ZHeader); - - impl State for ZlibTrailer { - fn read(mut self: Box) -> Result)>, Error> { - let retval = ZTrailer::read( - &mut self.0.reader, - self.0.endian, - self.1.ztrailer_offset, - self.1.ztrailer_len, - )?; - let next_state = Box::new(CompressedData::new(CommonState { - reader: ZlibDecodeMultiple::new(self.0.reader), - warn: self.0.warn, - endian: self.0.endian, - bias: self.0.bias, - compression: self.0.compression, - var_types: self.0.var_types, - })); - match retval { - None => next_state.read(), - Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))), - } - } - } - - struct Data(CommonState); - - impl State for Data { - fn read(mut self: Box) -> Result)>, Error> { - match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? { - None => Ok(None), - Some(values) => Ok(Some((Record::Case(values), self))), - } - } - } - - struct CompressedData { - common: CommonState, - codes: VecDeque, - } - - impl CompressedData { - fn new(common: CommonState) -> CompressedData { - CompressedData { - common, - codes: VecDeque::new(), - } - } - } - - impl State for CompressedData { - fn read(mut self: Box) -> Result)>, Error> { - match Value::read_compressed_case( - &mut self.common.reader, - &self.common.var_types, - &mut self.codes, - self.common.endian, - self.common.bias, - )? { - None => Ok(None), - Some(values) => Ok(Some((Record::Case(values), self))), - } - } - } -} - #[derive(Copy, Clone)] pub enum Value { Number(Option), @@ -724,44 +576,194 @@ where } } -pub struct Reader { - state: Option>, +enum ReaderState { + Start, + Headers, + ZlibHeader, + ZlibTrailer { + ztrailer_offset: u64, + ztrailer_len: u64, + }, + Cases, + End, +} + +pub struct Reader +where + R: Read + Seek + 'static, +{ + reader: Option, + warn: Box, + + header: HeaderRecord, + var_types: Vec, + + state: ReaderState, } -impl Reader { - pub fn new(reader: R, warn: F) -> Result { - Ok(Reader { - state: Some(state::new(reader, warn)), +impl Reader +where + R: Read + Seek + 'static, +{ + pub fn new(mut reader: R, warn: F) -> Result + where + F: Fn(Error) + 'static, + { + let header = HeaderRecord::read(&mut reader)?; + Ok(Self { + reader: Some(reader), + warn: Box::new(warn), + header, + var_types: Vec::new(), + state: ReaderState::Start, }) } - pub fn collect_headers(&mut self) -> Result, Error> { - let mut headers = Vec::new(); - for record in self { - match record? { - Record::EndOfHeaders(_) => break, - r => headers.push(r), - }; - } - Ok(headers) + fn cases(&mut self) -> Cases { + self.state = ReaderState::End; + Cases::new( + self.reader.take().unwrap(), + take(&mut self.var_types), + &self.header, + ) } } -impl Iterator for Reader { +impl Iterator for Reader +where + R: Read + Seek + 'static, +{ type Item = Result; fn next(&mut self) -> Option { - match self.state.take()?.read() { - Ok(Some((record, next_state))) => { - self.state = Some(next_state); + match self.state { + ReaderState::Start => { + self.state = ReaderState::Headers; + Some(Ok(Record::Header(self.header.clone()))) + } + ReaderState::Headers => { + let record = loop { + match Record::read( + self.reader.as_mut().unwrap(), + self.header.endian, + &self.warn, + ) { + Ok(Some(record)) => break record, + Ok(None) => (), + Err(error) => return Some(Err(error)), + } + }; + match record { + Record::Variable(VariableRecord { width, .. }) => { + self.var_types.push(VarType::from_width(width)); + } + Record::EndOfHeaders(_) => { + self.state = if let Some(Compression::ZLib) = self.header.compression { + ReaderState::ZlibHeader + } else { + ReaderState::Cases + }; + } + _ => (), + }; Some(Ok(record)) } - Ok(None) => None, - Err(error) => Some(Err(error)), + ReaderState::ZlibHeader => { + let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian) + { + Ok(zheader) => zheader, + Err(error) => return Some(Err(error)), + }; + self.state = ReaderState::ZlibTrailer { + ztrailer_offset: zheader.ztrailer_offset, + ztrailer_len: zheader.ztrailer_len, + }; + Some(Ok(Record::ZHeader(zheader))) + } + ReaderState::ZlibTrailer { + ztrailer_offset, + ztrailer_len, + } => { + match ZTrailer::read( + self.reader.as_mut().unwrap(), + self.header.endian, + ztrailer_offset, + ztrailer_len, + ) { + Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), + Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))), + Err(error) => Some(Err(error)), + } + } + ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), + ReaderState::End => None, + } + } +} + +trait ReadSeek: Read + Seek {} +impl ReadSeek for T where T: Read + Seek {} + +pub struct Cases { + reader: Box, + var_types: Vec, + compression: Option, + bias: f64, + endian: Endian, + codes: VecDeque, + eof: bool +} + +impl Debug for Cases { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "Cases") + } +} + +impl Cases { + fn new(reader: R, var_types: Vec, header: &HeaderRecord) -> Self + where + R: Read + Seek + 'static, + { + Self { + reader: if header.compression == Some(Compression::ZLib) { + Box::new(ZlibDecodeMultiple::new(reader)) + } else { + Box::new(reader) + }, + var_types, + compression: header.compression, + bias: header.bias, + endian: header.endian, + codes: VecDeque::with_capacity(8), + eof: false, } } } -impl FusedIterator for Reader {} +impl Iterator for Cases { + type Item = Result, Error>; + + fn next(&mut self) -> Option { + if self.eof { + return None; + } + + let retval = if self.compression.is_some() { + Value::read_compressed_case( + &mut self.reader, + &self.var_types, + &mut self.codes, + self.endian, + self.bias, + ) + .transpose() + } else { + Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose() + }; + self.eof = matches!(retval, None | Some(Err(_))); + retval + } +} #[derive(Copy, Clone, PartialEq, Eq, Hash)] pub struct Spec(pub u32); @@ -1589,7 +1591,11 @@ impl Extension { Ok(()) } - fn read(r: &mut R, endian: Endian, warn: &Box) -> Result, Error> { + fn read( + r: &mut R, + endian: Endian, + warn: &Box, + ) -> Result, Error> { let subtype = endian.parse(read_bytes(r)?); let header_offset = r.stream_position()?; let size: u32 = endian.parse(read_bytes(r)?); @@ -1637,7 +1643,7 @@ impl Extension { Err(error) => { warn(error); Ok(None) - }, + } } } } -- 2.30.2