From e404e91fe1237d5dcca500ae178afcecb7907664 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 27 Jul 2023 09:08:45 -0700 Subject: [PATCH] work ---
Review note: in the ReaderState::CompressedData arm of Reader::_next, code 254 (the string code) met for a numeric variable now returns Error::CompressedNumberExpected, and code 255 (the number code) met for a string variable now returns Error::CompressedStringExpected. The two variants were swapped, so each error message described the opposite of the mismatch that had occurred.
 Makefile.am | 2 +- rust/src/endian.rs | 12 ++++ rust/src/lib.rs | 147 ++++++++++++++++++++++++++++++++++++++------- 3 files changed, 137 insertions(+), 24 deletions(-) diff --git a/Makefile.am b/Makefile.am index 0fdd46bb2c..1283efb019 100644 --- a/Makefile.am +++ b/Makefile.am @@ -160,4 +160,4 @@ mimedir = $(datadir)/mime/packages mime_DATA = org.gnu.pspp.xml EXTRA_DIST += org.gnu.pspp.xml -EXTRA_DIST += rust/Cargo.lock rust/Cargo.toml rust/src/main.rs +EXTRA_DIST += rust/Cargo.lock rust/Cargo.toml rust/src/main.rs rust/src/lib.rs rust/src/endian.rs rust/src/hexfloat.rs diff --git a/rust/src/endian.rs b/rust/src/endian.rs index 51952137b7..6bd25ab95a 100644 --- a/rust/src/endian.rs +++ b/rust/src/endian.rs @@ -34,6 +34,18 @@ impl Endian { } } +pub trait ToBytes { + fn to_bytes(self, value: T) -> [u8; N]; +} +impl ToBytes for Endian { + fn to_bytes(self, value: f64) -> [u8; 8] { + match self { + Endian::Big => f64::to_be_bytes(value), + Endian::Little => f64::to_le_bytes(value), + } + } +} + /// Parses an `N`-byte slice in one of the supported formats into native format /// as type `T`. 
pub trait Parse { diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 0b8c541e54..354b7c0950 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,8 +1,11 @@ #![allow(unused_variables)] -use endian::{Endian, Parse}; +use endian::{Endian, Parse, ToBytes}; use num::Integer; use num_derive::FromPrimitive; -use std::io::{BufReader, Error as IoError, ErrorKind, Read, Seek}; +use std::{ + collections::VecDeque, + io::{BufReader, Error as IoError, Read, Seek}, +}; use thiserror::Error; pub mod endian; @@ -16,10 +19,7 @@ pub enum Error { BadMagic([u8; 4]), #[error("I/O error ({0})")] - Io( - #[from] - IoError, - ), + Io(#[from] IoError), #[error("Invalid SAV compression code {0}")] InvalidSavCompression(u32), @@ -75,6 +75,27 @@ pub enum Error { #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")] BadZlibTrailerLen { offset: u64, ztrailer_len: u64 }, + + #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] + EofInCase { + offset: u64, + case_ofs: u64, + case_len: usize, + }, + + #[error( + "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." 
+ )] + EofInCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] + PartialCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] + CompressedNumberExpected { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] + CompressedStringExpected { offset: u64, case_ofs: u64 }, } #[derive(Error, Debug)] @@ -196,6 +217,7 @@ enum ReaderState { Start, Headers(Endian, Option), Data(Endian), + CompressedData(Endian, VecDeque), End, } @@ -225,12 +247,12 @@ impl Reader { state: ReaderState::Start, }) } - fn _next(&mut self) -> Result, Error> { + fn _next(&mut self) -> Result, Error> { match self.state { ReaderState::Start => { let header = read_header(&mut self.r)?; - let next_state = ReaderState::Headers(header.endianness, header.compression); - Ok(Some((Record::Header(header), next_state))) + self.state = ReaderState::Headers(header.endianness, header.compression); + Ok(Some(Record::Header(header))) } ReaderState::Headers(endian, compression) => { let rec_type: u32 = endian.parse(read_bytes(&mut self.r)?); @@ -246,11 +268,14 @@ impl Reader { 7 => Record::Extension(read_extension_record(&mut self.r, endian)?), 999 => { let _: [u8; 4] = read_bytes(&mut self.r)?; - let next_state = match compression { + self.state = match compression { None => ReaderState::Data(endian), + Some(Compression::Simple) => { + ReaderState::CompressedData(endian, VecDeque::new()) + } _ => ReaderState::End, }; - return Ok(Some((Record::EndOfHeaders, next_state))); + return Ok(Some(Record::EndOfHeaders)); } _ => { return Err(Error::BadRecordType { @@ -259,24 +284,88 @@ impl Reader { }) } }; - Ok(Some((record, ReaderState::Headers(endian, compression)))) + Ok(Some(record)) } 
ReaderState::Data(endian) => { + let case_start = self.r.stream_position()?; let mut values = Vec::with_capacity(self.var_types.len()); for (i, &var_type) in self.var_types.iter().enumerate() { - let raw = match read_bytes(&mut self.r) { - Ok(raw) => raw, - Err(err) => { - if i == 0 && err.kind() == ErrorKind::UnexpectedEof { - return Ok(None); - } else { - return Err(Error::Io(err)); - } + let Some(raw) = try_read_bytes(&mut self.r)? else { + if i == 0 { + return Ok(None); + } else { + let offset = self.r.stream_position()?; + return Err(Error::EofInCase { offset, case_ofs: offset - case_start, case_len: self.var_types.len() * 8}); } }; values.push(Value::from_raw(var_type, raw, endian)); } - Ok(Some((Record::Case(values), ReaderState::Data(endian)))) + Ok(Some(Record::Case(values))) + } + ReaderState::CompressedData(endian, ref mut codes) => { + let case_start = self.r.stream_position()?; + let mut values = Vec::with_capacity(self.var_types.len()); + let bias = 100.0; // XXX + for (i, &var_type) in self.var_types.iter().enumerate() { + let value = loop { + let Some(code) = codes.pop_front() else { + let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.r)? 
else { + if i == 0 { + return Ok(None); + } else { + let offset = self.r.stream_position()?; + return Err(Error::EofInCompressedCase { offset, case_ofs: offset - case_start}); + } + }; + codes.extend(new_codes.into_iter()); + continue; + }; + match code { + 0 => (), + 1..=251 => match var_type { + VarType::Number => break Value::Number(Some(code as f64 - bias)), + VarType::String => { + break Value::String(endian.to_bytes(code as f64 - bias)) + } + }, + 252 => { + if i == 0 { + return Ok(None); + } else { + let offset = self.r.stream_position()?; + return Err(Error::PartialCompressedCase { + offset, + case_ofs: offset - case_start, + }); + } + } + 253 => break Value::from_raw( + var_type, + read_bytes(&mut self.r)?, + endian, + ), + 254 => match var_type { + VarType::String => break Value::String(*b" "), // XXX EBCDIC + VarType::Number => { + return Err(Error::CompressedNumberExpected { + offset: case_start, + case_ofs: self.r.stream_position()? - case_start, + }) + } + }, + 255 => match var_type { + VarType::Number => break Value::Number(None), + VarType::String => { + return Err(Error::CompressedStringExpected { + offset: case_start, + case_ofs: self.r.stream_position()? - case_start,}) + } + } + } + }; + values.push(value); + } + Ok(Some(Record::Case(values))) } ReaderState::End => Ok(None), } @@ -293,8 +382,7 @@ impl Iterator for Reader { self.state = ReaderState::End; None } - Ok(Some((record, next_state))) => { - self.state = next_state; + Ok(Some(record)) => { Some(Ok(record)) } Err(error) => { @@ -703,6 +791,19 @@ fn read_zheader(r: &mut BufReader, e: Endian) -> Result(r: &mut R) -> Result, IoError> { + let mut buf = [0; N]; + let n = r.read(&mut buf)?; + if n > 0 { + if n < N { + r.read_exact(&mut buf[n..])?; + } + Ok(Some(buf)) + } else { + Ok(None) + } +} + fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { let mut buf = [0; N]; r.read_exact(&mut buf)?; -- 2.30.2