work
author Ben Pfaff <blp@cs.stanford.edu>
Fri, 28 Jul 2023 13:38:56 +0000 (06:38 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Fri, 28 Jul 2023 13:38:56 +0000 (06:38 -0700)
rust/Cargo.lock
rust/Cargo.toml
rust/src/lib.rs
src/data/sys-file-reader.c

index 38a454c4b767fc9d819c3770b54c559913505c22..950e262847199a3c314ecf69ce1b28985e576e11 100644 (file)
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
 [[package]]
 name = "anyhow"
 version = "1.0.69"
@@ -37,6 +43,12 @@ version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
 
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
 [[package]]
 name = "clap"
 version = "4.1.7"
@@ -74,6 +86,15 @@ dependencies = [
  "os_str_bytes",
 ]
 
+[[package]]
+name = "crc32fast"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "errno"
 version = "0.2.8"
@@ -95,6 +116,16 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "flate2"
+version = "1.0.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
 [[package]]
 name = "heck"
 version = "0.4.1"
@@ -160,6 +191,15 @@ version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
 
+[[package]]
+name = "miniz_oxide"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+dependencies = [
+ "adler",
+]
+
 [[package]]
 name = "num"
 version = "0.4.0"
@@ -298,6 +338,7 @@ version = "1.0.0"
 dependencies = [
  "anyhow",
  "clap",
+ "flate2",
  "hexplay",
  "num",
  "num-derive",
index d276b1ae18eb32b03163c6d317185417f10ab8bc..a58a5fafa671aee11a8cabf80068428f192a0d1d 100644 (file)
@@ -7,6 +7,7 @@ authors = [ "Ben Pfaff", "John Darrington" ]
 [dependencies]
 anyhow = "1.0.69"
 clap = { version = "4.1.7", features = ["derive"] }
+flate2 = "1.0.26"
 hexplay = "0.2.1"
 num = "0.4.0"
 num-derive = "0.4.0"
index 354b7c09501f9c0e3ba3677c06453d4dd7debb75..d237f81b801aca3b91b4dce45e9da5ebd2d56015 100644 (file)
@@ -1,10 +1,11 @@
 #![allow(unused_variables)]
 use endian::{Endian, Parse, ToBytes};
+//use flate2::bufread::ZlibDecoder;
 use num::Integer;
 use num_derive::FromPrimitive;
 use std::{
     collections::VecDeque,
-    io::{BufReader, Error as IoError, Read, Seek},
+    io::{BufReader, Error as IoError, Read, Seek, SeekFrom},
 };
 use thiserror::Error;
 
@@ -96,6 +97,9 @@ pub enum Error {
 
     #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
     CompressedStringExpected { offset: u64, case_ofs: u64 },
+
+    #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
+    BadZlibTrailerNBlocks { offset: u64, n_blocks: u32, expected_n_blocks: u64, ztrailer_len: u64 }
 }
 
 #[derive(Error, Debug)]
@@ -121,6 +125,8 @@ pub enum Record {
     VarIndexes(VarIndexes),
     Extension(Extension),
     EndOfHeaders,
+    ZHeader(ZHeader),
+    ZTrailer(ZTrailer),
     Case(Vec<Value>),
 }
 
@@ -218,6 +224,9 @@ enum ReaderState {
     Headers(Endian, Option<Compression>),
     Data(Endian),
     CompressedData(Endian, VecDeque<u8>),
+    ZHeader(Endian),
+    ZTrailer { endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64 },
+    //ZData,
     End,
 }
 
@@ -273,7 +282,7 @@ impl<R: Read + Seek> Reader<R> {
                             Some(Compression::Simple) => {
                                 ReaderState::CompressedData(endian, VecDeque::new())
                             }
-                            _ => ReaderState::End,
+                            Some(Compression::ZLib) => ReaderState::ZHeader(endian),
                         };
                         return Ok(Some(Record::EndOfHeaders));
                     }
@@ -339,11 +348,9 @@ impl<R: Read + Seek> Reader<R> {
                                     });
                                 }
                             }
-                            253 => break Value::from_raw(
-                                var_type,
-                                read_bytes(&mut self.r)?,
-                                endian,
-                            ),
+                            253 => {
+                                break Value::from_raw(var_type, read_bytes(&mut self.r)?, endian)
+                            }
                             254 => match var_type {
                                 VarType::String => break Value::String(*b"        "), // XXX EBCDIC
                                 VarType::Number => {
@@ -358,15 +365,35 @@ impl<R: Read + Seek> Reader<R> {
                                 VarType::String => {
                                     return Err(Error::CompressedNumberExpected {
                                         offset: case_start,
-                                        case_ofs: self.r.stream_position()? - case_start,})
+                                        case_ofs: self.r.stream_position()? - case_start,
+                                    })
                                 }
-                            }
+                            },
                         }
                     };
                     values.push(value);
                 }
                 Ok(Some(Record::Case(values)))
             }
+            ReaderState::ZHeader(endian) => {
+                let zheader = read_zheader(&mut self.r, endian)?;
+                self.state = ReaderState::ZTrailer { endian, ztrailer_ofs: zheader.ztrailer_offset, ztrailer_len: zheader.ztrailer_len};
+                Ok(Some(Record::ZHeader(zheader)))
+            }
+            ReaderState::ZTrailer { endian, ztrailer_ofs, ztrailer_len } => {
+                //self.state = ReaderState::ZData;
+                match read_ztrailer(&mut self.r, endian, ztrailer_ofs, ztrailer_len)? {
+                    Some(ztrailer) => {
+                        Ok(Some(Record::ZTrailer(ztrailer)))
+                    },
+                    None => self._next()
+                }
+            }
+/*
+            ReaderState::ZData(zlib_decoder) => {
+                let zlib_decoder = zlib_decoder.unwrap_or_else(
+            },
+*/
             ReaderState::End => Ok(None),
         }
     }
@@ -382,9 +409,7 @@ impl<R: Read + Seek> Iterator for Reader<R> {
                 self.state = ReaderState::End;
                 None
             }
-            Ok(Some(record)) => {
-                Some(Ok(record))
-            }
+            Ok(Some(record)) => Some(Ok(record)),
             Err(error) => {
                 self.state = ReaderState::End;
                 Some(Err(error))
@@ -474,20 +499,20 @@ pub struct Variable {
 
 fn read_variable_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<Variable, Error> {
     let offset = r.stream_position()?;
-    let width: i32 = e.parse(read_bytes(r)?);
-    let has_variable_label: u32 = e.parse(read_bytes(r)?);
-    let missing_value_code: i32 = e.parse(read_bytes(r)?);
-    let print_format: u32 = e.parse(read_bytes(r)?);
-    let write_format: u32 = e.parse(read_bytes(r)?);
+    let width: i32 = endian.parse(read_bytes(r)?);
+    let has_variable_label: u32 = endian.parse(read_bytes(r)?);
+    let missing_value_code: i32 = endian.parse(read_bytes(r)?);
+    let print_format: u32 = endian.parse(read_bytes(r)?);
+    let write_format: u32 = endian.parse(read_bytes(r)?);
     let name: [u8; 8] = read_bytes(r)?;
 
     let label = match has_variable_label {
         0 => None,
         1 => {
-            let len: u32 = e.parse(read_bytes(r)?);
+            let len: u32 = endian.parse(read_bytes(r)?);
             let read_len = len.min(65535) as usize;
             let label = Some(read_vec(r, read_len)?);
 
@@ -555,10 +580,10 @@ impl ValueLabel {
 
 fn read_value_label_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<ValueLabel, Error> {
     let offset = r.stream_position()?;
-    let n: u32 = e.parse(read_bytes(r)?);
+    let n: u32 = endian.parse(read_bytes(r)?);
     if n > ValueLabel::MAX {
         return Err(Error::BadNumberOfValueLabels {
             offset,
@@ -570,7 +595,7 @@ fn read_value_label_record<R: Read + Seek>(
     let mut labels = Vec::new();
     for _ in 0..n {
         let value: [u8; 8] = read_bytes(r)?;
-        let label_len: u8 = e.parse(read_bytes(r)?);
+        let label_len: u8 = endian.parse(read_bytes(r)?);
         let label_len = label_len as usize;
         let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
 
@@ -596,10 +621,10 @@ impl VarIndexes {
 
 fn read_var_indexes_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<VarIndexes, Error> {
     let offset = r.stream_position()?;
-    let n: u32 = e.parse(read_bytes(r)?);
+    let n: u32 = endian.parse(read_bytes(r)?);
     if n > VarIndexes::MAX {
         return Err(Error::BadNumberOfVarIndexes {
             offset,
@@ -609,7 +634,7 @@ fn read_var_indexes_record<R: Read + Seek>(
     }
     let mut var_indexes = Vec::with_capacity(n as usize);
     for _ in 0..n {
-        var_indexes.push(e.parse(read_bytes(r)?));
+        var_indexes.push(endian.parse(read_bytes(r)?));
     }
 
     Ok(VarIndexes {
@@ -631,10 +656,10 @@ pub struct Document {
 
 fn read_document_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<Document, Error> {
     let offset = r.stream_position()?;
-    let n: u32 = e.parse(read_bytes(r)?);
+    let n: u32 = endian.parse(read_bytes(r)?);
     match n {
         0..=DOC_MAX_LINES => {
             let pos = r.stream_position()?;
@@ -738,12 +763,12 @@ fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
 
 fn read_extension_record<R: Read + Seek>(
     r: &mut BufReader<R>,
-    e: Endian,
+    endian: Endian,
 ) -> Result<Extension, Error> {
-    let subtype = e.parse(read_bytes(r)?);
+    let subtype = endian.parse(read_bytes(r)?);
     let offset = r.stream_position()?;
-    let size: u32 = e.parse(read_bytes(r)?);
-    let count = e.parse(read_bytes(r)?);
+    let size: u32 = endian.parse(read_bytes(r)?);
+    let count = endian.parse(read_bytes(r)?);
     let Some(product) = size.checked_mul(count) else {
         return Err(Error::ExtensionRecordTooLarge {
             offset,
@@ -763,25 +788,25 @@ fn read_extension_record<R: Read + Seek>(
     })
 }
 
-struct ZHeader {
+pub struct ZHeader {
     /// File offset to the start of the record.
-    offset: u64,
+    pub offset: u64,
 
     /// File offset to the ZLIB data header.
-    zheader_offset: u64,
+    pub zheader_offset: u64,
 
     /// File offset to the ZLIB trailer.
-    ztrailer_offset: u64,
+    pub ztrailer_offset: u64,
 
     /// Length of the ZLIB trailer in bytes.
-    ztrailer_len: u64,
+    pub ztrailer_len: u64,
 }
 
-fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHeader, Error> {
+fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, endian: Endian) -> Result<ZHeader, Error> {
     let offset = r.stream_position()?;
-    let zheader_offset: u64 = e.parse(read_bytes(r)?);
-    let ztrailer_offset: u64 = e.parse(read_bytes(r)?);
-    let ztrailer_len: u64 = e.parse(read_bytes(r)?);
+    let zheader_offset: u64 = endian.parse(read_bytes(r)?);
+    let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
+    let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
 
     Ok(ZHeader {
         offset,
@@ -791,6 +816,65 @@ fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHead
     })
 }
 
+pub struct ZTrailer {
+    /// File offset to the start of the record.
+    pub offset: u64,
+
+    /// Compression bias as a negative integer, e.g. -100.
+    pub int_bias: i64,
+
+    /// Always observed as zero.
+    pub zero: u64,
+
+    /// Uncompressed size of each block, except possibly the last.  Only
+    /// `0x3ff000` has been observed so far.
+    pub block_size: u32,
+
+    /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
+    pub blocks: Vec<ZBlock>,
+}
+
+pub struct ZBlock {
+    /// Offset that this block of data would have if simple compression were used.
+    pub uncompressed_ofs: u64,
+
+    /// Actual offset within the file of the compressed data block.
+    pub compressed_ofs: u64,
+
+    /// The number of bytes in this data block after decompression.  This is
+    /// `block_size` in every data block but the last, which may be smaller.
+    pub uncompressed_size: u32,
+
+    /// The number of bytes in this data block, as stored compressed in this
+    /// file.
+    pub compressed_size: u32,
+}
+
+fn read_ztrailer<R: Read + Seek>(r: &mut BufReader<R>, endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64) -> Result<Option<ZTrailer>, Error> {
+    let start_offset = r.stream_position()?;
+    if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
+        return Ok(None)
+    }
+    let int_bias = endian.parse(read_bytes(r)?);
+    let zero = endian.parse(read_bytes(r)?);
+    let block_size = endian.parse(read_bytes(r)?);
+    let n_blocks: u32 = endian.parse(read_bytes(r)?);
+    let expected_n_blocks = (ztrailer_len - 24) / 24;
+    if n_blocks as u64 != expected_n_blocks {
+        return Err(Error::BadZlibTrailerNBlocks { offset: ztrailer_ofs, n_blocks, expected_n_blocks, ztrailer_len })
+    }
+    let mut blocks = Vec::with_capacity(n_blocks as usize);
+    for _ in 0..n_blocks {
+        let uncompressed_ofs = endian.parse(read_bytes(r)?);
+        let compressed_ofs = endian.parse(read_bytes(r)?);
+        let uncompressed_size = endian.parse(read_bytes(r)?);
+        let compressed_size = endian.parse(read_bytes(r)?);
+        blocks.push(ZBlock { uncompressed_ofs, compressed_ofs, uncompressed_size, compressed_size });
+    }
+    r.seek(SeekFrom::Start(start_offset))?;
+    Ok(Some(ZTrailer { offset: ztrailer_ofs, int_bias, zero, block_size, blocks }))
+}
+
 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
     let mut buf = [0; N];
     let n = r.read(&mut buf)?;
index 491df7f996c349a853c3492d0a49ca575e717161..eb19db479b7c43751e0d2fb4b79c26c30d00d26d 100644 (file)
@@ -3562,6 +3562,7 @@ read_zheader (struct sfm_reader *r)
       return false;
     }
 
+  /* XXX allow a value of 0 for these two? */
   if (ztrailer_ofs < r->pos)
     {
       sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),