work
[pspp] / rust / src / lib.rs
index 354b7c09501f9c0e3ba3677c06453d4dd7debb75..d237f81b801aca3b91b4dce45e9da5ebd2d56015 100644 (file)
@@ -1,10 +1,11 @@
 #![allow(unused_variables)]
 use endian::{Endian, Parse, ToBytes};
+//use flate2::bufread::ZlibDecoder;
 use num::Integer;
 use num_derive::FromPrimitive;
 use std::{
     collections::VecDeque,
-    io::{BufReader, Error as IoError, Read, Seek},
+    io::{BufReader, Error as IoError, Read, Seek, SeekFrom},
 };
 use thiserror::Error;
 
@@ -96,6 +97,9 @@ pub enum Error {
 
     #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
     CompressedStringExpected { offset: u64, case_ofs: u64 },
+
+    #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
+    BadZlibTrailerNBlocks { offset: u64, n_blocks: u32, expected_n_blocks: u64, ztrailer_len: u64 }
 }
 
 #[derive(Error, Debug)]
@@ -121,6 +125,8 @@ pub enum Record {
     VarIndexes(VarIndexes),
     Extension(Extension),
     EndOfHeaders,
+    ZHeader(ZHeader),
+    ZTrailer(ZTrailer),
     Case(Vec<Value>),
 }
 
@@ -218,6 +224,9 @@ enum ReaderState {
     Headers(Endian, Option<Compression>),
     Data(Endian),
     CompressedData(Endian, VecDeque<u8>),
+    ZHeader(Endian),
+    ZTrailer { endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64 },
+    //ZData,
     End,
 }
 
@@ -273,7 +282,7 @@ impl<R: Read + Seek> Reader<R> {
                             Some(Compression::Simple) => {
                                 ReaderState::CompressedData(endian, VecDeque::new())
                             }
-                            _ => ReaderState::End,
+                            Some(Compression::ZLib) => ReaderState::ZHeader(endian),
                         };
                         return Ok(Some(Record::EndOfHeaders));
                     }
@@ -339,11 +348,9 @@ impl<R: Read + Seek> Reader<R> {
                                     });
                                 }
                             }
-                            253 => break Value::from_raw(
-                                var_type,
-                                read_bytes(&mut self.r)?,
-                                endian,
-                            ),
+                            253 => {
+                                break Value::from_raw(var_type, read_bytes(&mut self.r)?, endian)
+                            }
                             254 => match var_type {
                                 VarType::String => break Value::String(*b"        "), // XXX EBCDIC
                                 VarType::Number => {
@@ -358,15 +365,35 @@ impl<R: Read + Seek> Reader<R> {
                                 VarType::String => {
                                     return Err(Error::CompressedNumberExpected {
                                         offset: case_start,
-                                        case_ofs: self.r.stream_position()? - case_start,})
+                                        case_ofs: self.r.stream_position()? - case_start,
+                                    })
                                 }
-                            }
+                            },
                         }
                     };
                     values.push(value);
                 }
                 Ok(Some(Record::Case(values)))
             }
+            ReaderState::ZHeader(endian) => {
+                let zheader = read_zheader(&mut self.r, endian)?;
+                self.state = ReaderState::ZTrailer { endian, ztrailer_ofs: zheader.ztrailer_offset, ztrailer_len: zheader.ztrailer_len};
+                Ok(Some(Record::ZHeader(zheader)))
+            }
+            ReaderState::ZTrailer { endian, ztrailer_ofs, ztrailer_len } => {
+                //self.state = ReaderState::ZData;
+                match read_ztrailer(&mut self.r, endian, ztrailer_ofs, ztrailer_len)? {
+                    Some(ztrailer) => {
+                        Ok(Some(Record::ZTrailer(ztrailer)))
+                    },
+                    None => self._next()
+                }
+            }
+/*
+            ReaderState::ZData(zlib_decoder) => {
+                let zlib_decoder = zlib_decoder.unwrap_or_else(
+            },
+*/
             ReaderState::End => Ok(None),
         }
     }
@@ -382,9 +409,7 @@ impl<R: Read + Seek> Iterator for Reader<R> {
                 self.state = ReaderState::End;
                 None
             }
-            Ok(Some(record)) => {
-                Some(Ok(record))
-            }
+            Ok(Some(record)) => Some(Ok(record)),
             Err(error) => {
                 self.state = ReaderState::End;
                 Some(Err(error))
@@ -474,20 +499,20 @@ pub struct Variable {
 
 fn read_variable_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<Variable, Error> {
     let offset = r.stream_position()?;
-    let width: i32 = e.parse(read_bytes(r)?);
-    let has_variable_label: u32 = e.parse(read_bytes(r)?);
-    let missing_value_code: i32 = e.parse(read_bytes(r)?);
-    let print_format: u32 = e.parse(read_bytes(r)?);
-    let write_format: u32 = e.parse(read_bytes(r)?);
+    let width: i32 = endian.parse(read_bytes(r)?);
+    let has_variable_label: u32 = endian.parse(read_bytes(r)?);
+    let missing_value_code: i32 = endian.parse(read_bytes(r)?);
+    let print_format: u32 = endian.parse(read_bytes(r)?);
+    let write_format: u32 = endian.parse(read_bytes(r)?);
     let name: [u8; 8] = read_bytes(r)?;
 
     let label = match has_variable_label {
         0 => None,
         1 => {
-            let len: u32 = e.parse(read_bytes(r)?);
+            let len: u32 = endian.parse(read_bytes(r)?);
             let read_len = len.min(65535) as usize;
             let label = Some(read_vec(r, read_len)?);
 
@@ -555,10 +580,10 @@ impl ValueLabel {
 
 fn read_value_label_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<ValueLabel, Error> {
     let offset = r.stream_position()?;
-    let n: u32 = e.parse(read_bytes(r)?);
+    let n: u32 = endian.parse(read_bytes(r)?);
     if n > ValueLabel::MAX {
         return Err(Error::BadNumberOfValueLabels {
             offset,
@@ -570,7 +595,7 @@ fn read_value_label_record<R: Read + Seek>(
     let mut labels = Vec::new();
     for _ in 0..n {
         let value: [u8; 8] = read_bytes(r)?;
-        let label_len: u8 = e.parse(read_bytes(r)?);
+        let label_len: u8 = endian.parse(read_bytes(r)?);
         let label_len = label_len as usize;
         let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
 
@@ -596,10 +621,10 @@ impl VarIndexes {
 
 fn read_var_indexes_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<VarIndexes, Error> {
     let offset = r.stream_position()?;
-    let n: u32 = e.parse(read_bytes(r)?);
+    let n: u32 = endian.parse(read_bytes(r)?);
     if n > VarIndexes::MAX {
         return Err(Error::BadNumberOfVarIndexes {
             offset,
@@ -609,7 +634,7 @@ fn read_var_indexes_record<R: Read + Seek>(
     }
     let mut var_indexes = Vec::with_capacity(n as usize);
     for _ in 0..n {
-        var_indexes.push(e.parse(read_bytes(r)?));
+        var_indexes.push(endian.parse(read_bytes(r)?));
     }
 
     Ok(VarIndexes {
@@ -631,10 +656,10 @@ pub struct Document {
 
 fn read_document_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<Document, Error> {
     let offset = r.stream_position()?;
-    let n: u32 = e.parse(read_bytes(r)?);
+    let n: u32 = endian.parse(read_bytes(r)?);
     match n {
         0..=DOC_MAX_LINES => {
             let pos = r.stream_position()?;
@@ -738,12 +763,12 @@ fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
 
 fn read_extension_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<Extension, Error> {
-    let subtype = e.parse(read_bytes(r)?);
+    let subtype = endian.parse(read_bytes(r)?);
     let offset = r.stream_position()?;
-    let size: u32 = e.parse(read_bytes(r)?);
-    let count = e.parse(read_bytes(r)?);
+    let size: u32 = endian.parse(read_bytes(r)?);
+    let count = endian.parse(read_bytes(r)?);
     let Some(product) = size.checked_mul(count) else {
         return Err(Error::ExtensionRecordTooLarge {
             offset,
@@ -763,25 +788,25 @@ fn read_extension_record<R: Read + Seek>(
     })
 }
 
-struct ZHeader {
+pub struct ZHeader {
     /// File offset to the start of the record.
-    offset: u64,
+    pub offset: u64,
 
     /// File offset to the ZLIB data header.
-    zheader_offset: u64,
+    pub zheader_offset: u64,
 
     /// File offset to the ZLIB trailer.
-    ztrailer_offset: u64,
+    pub ztrailer_offset: u64,
 
     /// Length of the ZLIB trailer in bytes.
-    ztrailer_len: u64,
+    pub ztrailer_len: u64,
 }
 
-fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHeader, Error> {
+fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, endian: Endian) -> Result<ZHeader, Error> {
     let offset = r.stream_position()?;
-    let zheader_offset: u64 = e.parse(read_bytes(r)?);
-    let ztrailer_offset: u64 = e.parse(read_bytes(r)?);
-    let ztrailer_len: u64 = e.parse(read_bytes(r)?);
+    let zheader_offset: u64 = endian.parse(read_bytes(r)?);
+    let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
+    let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
 
     Ok(ZHeader {
         offset,
@@ -791,6 +816,65 @@ fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHead
     })
 }
 
+pub struct ZTrailer {
+    /// File offset to the start of the record.
+    pub offset: u64,
+
+    /// Compression bias as a negative integer, e.g. -100.
+    pub int_bias: i64,
+
+    /// Always observed as zero.
+    pub zero: u64,
+
+    /// Uncompressed size of each block, except possibly the last.  Only
+    /// `0x3ff000` has been observed so far.
+    pub block_size: u32,
+
+    /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
+    pub blocks: Vec<ZBlock>,
+}
+
+pub struct ZBlock {
+    /// Offset of block of data if simple compression were used.
+    pub uncompressed_ofs: u64,
+
+    /// Actual offset within the file of the compressed data block.
+    pub compressed_ofs: u64,
+
+    /// The number of bytes in this data block after decompression.  This is
+    /// `block_size` in every data block but the last, which may be smaller.
+    pub uncompressed_size: u32,
+
+    /// The number of bytes in this data block, as stored compressed in this
+    /// file.
+    pub compressed_size: u32,
+}
+
+fn read_ztrailer<R: Read + Seek>(r: &mut BufReader<R>, endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64) -> Result<Option<ZTrailer>, Error> {
+    let start_offset = r.stream_position()?;
+    if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
+        return Ok(None)
+    }
+    let int_bias = endian.parse(read_bytes(r)?);
+    let zero = endian.parse(read_bytes(r)?);
+    let block_size = endian.parse(read_bytes(r)?);
+    let n_blocks: u32 = endian.parse(read_bytes(r)?);
+    let expected_n_blocks = (ztrailer_len - 24) / 24;
+    if n_blocks as u64 != expected_n_blocks {
+        return Err(Error::BadZlibTrailerNBlocks { offset: ztrailer_ofs, n_blocks, expected_n_blocks, ztrailer_len })
+    }
+    let mut blocks = Vec::with_capacity(n_blocks as usize);
+    for _ in 0..n_blocks {
+        let uncompressed_ofs = endian.parse(read_bytes(r)?);
+        let compressed_ofs = endian.parse(read_bytes(r)?);
+        let uncompressed_size = endian.parse(read_bytes(r)?);
+        let compressed_size = endian.parse(read_bytes(r)?);
+        blocks.push(ZBlock { uncompressed_ofs, compressed_ofs, uncompressed_size, compressed_size });
+    }
+    r.seek(SeekFrom::Start(start_offset))?;
+    Ok(Some(ZTrailer { offset: ztrailer_ofs, int_bias, zero, block_size, blocks }))
+}
+
 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
     let mut buf = [0; N];
     let n = r.read(&mut buf)?;