work
author Ben Pfaff <blp@cs.stanford.edu>
Thu, 27 Jul 2023 16:08:45 +0000 (09:08 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Thu, 27 Jul 2023 16:08:45 +0000 (09:08 -0700)
Makefile.am
rust/src/endian.rs
rust/src/lib.rs

index 0fdd46bb2c1f4f4f758abf61a48149c947ef78b0..1283efb019923f78b67c3e88f52d2b5723c25301 100644 (file)
@@ -160,4 +160,4 @@ mimedir = $(datadir)/mime/packages
 mime_DATA = org.gnu.pspp.xml
 EXTRA_DIST += org.gnu.pspp.xml
 
-EXTRA_DIST += rust/Cargo.lock rust/Cargo.toml rust/src/main.rs
+EXTRA_DIST += rust/Cargo.lock rust/Cargo.toml rust/src/main.rs rust/src/lib.rs rust/src/endian.rs rust/src/hexfloat.rs
index 51952137b7b657d123d9394413388f6340f65642..6bd25ab95ac0408793983a34ed18ca8ad68f0db6 100644 (file)
@@ -34,6 +34,18 @@ impl Endian {
     }
 }
 
+pub trait ToBytes<T, const N: usize> {
+    fn to_bytes(self, value: T) -> [u8; N];
+}
+impl ToBytes<f64, 8> for Endian {
+    fn to_bytes(self, value: f64) -> [u8; 8] {
+        match self {
+            Endian::Big => f64::to_be_bytes(value),
+            Endian::Little => f64::to_le_bytes(value),
+        }
+    }
+}
+
 /// Parses an `N`-byte slice in one of the supported formats into native format
 /// as type `T`.
 pub trait Parse<T, const N: usize> {
index 0b8c541e54612fcee67b45596a6370ec128f5c5b..354b7c09501f9c0e3ba3677c06453d4dd7debb75 100644 (file)
@@ -1,8 +1,11 @@
 #![allow(unused_variables)]
-use endian::{Endian, Parse};
+use endian::{Endian, Parse, ToBytes};
 use num::Integer;
 use num_derive::FromPrimitive;
-use std::io::{BufReader, Error as IoError, ErrorKind, Read, Seek};
+use std::{
+    collections::VecDeque,
+    io::{BufReader, Error as IoError, Read, Seek},
+};
 use thiserror::Error;
 
 pub mod endian;
@@ -16,10 +19,7 @@ pub enum Error {
     BadMagic([u8; 4]),
 
     #[error("I/O error ({0})")]
-    Io(
-        #[from]
-        IoError,
-    ),
+    Io(#[from] IoError),
 
     #[error("Invalid SAV compression code {0}")]
     InvalidSavCompression(u32),
@@ -75,6 +75,27 @@ pub enum Error {
 
     #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")]
     BadZlibTrailerLen { offset: u64, ztrailer_len: u64 },
+
+    #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
+    EofInCase {
+        offset: u64,
+        case_ofs: u64,
+        case_len: usize,
+    },
+
+    #[error(
+        "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
+    )]
+    EofInCompressedCase { offset: u64, case_ofs: u64 },
+
+    #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
+    PartialCompressedCase { offset: u64, case_ofs: u64 },
+
+    #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
+    CompressedNumberExpected { offset: u64, case_ofs: u64 },
+
+    #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
+    CompressedStringExpected { offset: u64, case_ofs: u64 },
 }
 
 #[derive(Error, Debug)]
@@ -196,6 +217,7 @@ enum ReaderState {
     Start,
     Headers(Endian, Option<Compression>),
     Data(Endian),
+    CompressedData(Endian, VecDeque<u8>),
     End,
 }
 
@@ -225,12 +247,12 @@ impl<R: Read + Seek> Reader<R> {
             state: ReaderState::Start,
         })
     }
-    fn _next(&mut self) -> Result<Option<(Record, ReaderState)>, Error> {
+    fn _next(&mut self) -> Result<Option<Record>, Error> {
         match self.state {
             ReaderState::Start => {
                 let header = read_header(&mut self.r)?;
-                let next_state = ReaderState::Headers(header.endianness, header.compression);
-                Ok(Some((Record::Header(header), next_state)))
+                self.state = ReaderState::Headers(header.endianness, header.compression);
+                Ok(Some(Record::Header(header)))
             }
             ReaderState::Headers(endian, compression) => {
                 let rec_type: u32 = endian.parse(read_bytes(&mut self.r)?);
@@ -246,11 +268,14 @@ impl<R: Read + Seek> Reader<R> {
                     7 => Record::Extension(read_extension_record(&mut self.r, endian)?),
                     999 => {
                         let _: [u8; 4] = read_bytes(&mut self.r)?;
-                        let next_state = match compression {
+                        self.state = match compression {
                             None => ReaderState::Data(endian),
+                            Some(Compression::Simple) => {
+                                ReaderState::CompressedData(endian, VecDeque::new())
+                            }
                             _ => ReaderState::End,
                         };
-                        return Ok(Some((Record::EndOfHeaders, next_state)));
+                        return Ok(Some(Record::EndOfHeaders));
                     }
                     _ => {
                         return Err(Error::BadRecordType {
@@ -259,24 +284,88 @@ impl<R: Read + Seek> Reader<R> {
                         })
                     }
                 };
-                Ok(Some((record, ReaderState::Headers(endian, compression))))
+                Ok(Some(record))
             }
             ReaderState::Data(endian) => {
+                let case_start = self.r.stream_position()?;
                 let mut values = Vec::with_capacity(self.var_types.len());
                 for (i, &var_type) in self.var_types.iter().enumerate() {
-                    let raw = match read_bytes(&mut self.r) {
-                        Ok(raw) => raw,
-                        Err(err) => {
-                            if i == 0 && err.kind() == ErrorKind::UnexpectedEof {
-                                return Ok(None);
-                            } else {
-                                return Err(Error::Io(err));
-                            }
+                    let Some(raw) = try_read_bytes(&mut self.r)? else {
+                        if i == 0 {
+                            return Ok(None);
+                        } else {
+                            let offset = self.r.stream_position()?;
+                            return Err(Error::EofInCase { offset, case_ofs: offset - case_start, case_len: self.var_types.len() * 8});
                         }
                     };
                     values.push(Value::from_raw(var_type, raw, endian));
                 }
-                Ok(Some((Record::Case(values), ReaderState::Data(endian))))
+                Ok(Some(Record::Case(values)))
+            }
+            ReaderState::CompressedData(endian, ref mut codes) => {
+                let case_start = self.r.stream_position()?;
+                let mut values = Vec::with_capacity(self.var_types.len());
+                let bias = 100.0; // XXX
+                for (i, &var_type) in self.var_types.iter().enumerate() {
+                    let value = loop {
+                        let Some(code) = codes.pop_front() else {
+                            let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.r)? else {
+                                if i == 0 {
+                                    return Ok(None);
+                                } else {
+                                    let offset = self.r.stream_position()?;
+                                    return Err(Error::EofInCompressedCase { offset, case_ofs: offset - case_start});
+                                }
+                            };
+                            codes.extend(new_codes.into_iter());
+                            continue;
+                        };
+                        match code {
+                            0 => (),
+                            1..=251 => match var_type {
+                                VarType::Number => break Value::Number(Some(code as f64 - bias)),
+                                VarType::String => {
+                                    break Value::String(endian.to_bytes(code as f64 - bias))
+                                }
+                            },
+                            252 => {
+                                if i == 0 {
+                                    return Ok(None);
+                                } else {
+                                    let offset = self.r.stream_position()?;
+                                    return Err(Error::PartialCompressedCase {
+                                        offset,
+                                        case_ofs: offset - case_start,
+                                    });
+                                }
+                            }
+                            253 => break Value::from_raw(
+                                var_type,
+                                read_bytes(&mut self.r)?,
+                                endian,
+                            ),
+                            254 => match var_type {
+                                VarType::String => break Value::String(*b"        "), // XXX EBCDIC
+                                VarType::Number => {
+                                    return Err(Error::CompressedStringExpected {
+                                        offset: case_start,
+                                        case_ofs: self.r.stream_position()? - case_start,
+                                    })
+                                }
+                            },
+                            255 => match var_type {
+                                VarType::Number => break Value::Number(None),
+                                VarType::String => {
+                                    return Err(Error::CompressedNumberExpected {
+                                        offset: case_start,
+                                        case_ofs: self.r.stream_position()? - case_start,})
+                                }
+                            }
+                        }
+                    };
+                    values.push(value);
+                }
+                Ok(Some(Record::Case(values)))
             }
             ReaderState::End => Ok(None),
         }
@@ -293,8 +382,7 @@ impl<R: Read + Seek> Iterator for Reader<R> {
                 self.state = ReaderState::End;
                 None
             }
-            Ok(Some((record, next_state))) => {
-                self.state = next_state;
+            Ok(Some(record)) => {
                 Some(Ok(record))
             }
             Err(error) => {
@@ -703,6 +791,19 @@ fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHead
     })
 }
 
+fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
+    let mut buf = [0; N];
+    let n = r.read(&mut buf)?;
+    if n > 0 {
+        if n < N {
+            r.read_exact(&mut buf[n..])?;
+        }
+        Ok(Some(buf))
+    } else {
+        Ok(None)
+    }
+}
+
 fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
     let mut buf = [0; N];
     r.read_exact(&mut buf)?;