work on rust

author Ben Pfaff <blp@cs.stanford.edu>

Mon, 24 Jul 2023 16:02:56 +0000 (09:02 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Mon, 24 Jul 2023 16:02:56 +0000 (09:02 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 24 Jul 2023 16:02:56 +0000 (09:02 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 24 Jul 2023 16:02:56 +0000 (09:02 -0700)
diff --git a/rust/Cargo.lock b/rust/Cargo.lock

index 09889b3500ccb520f9d602396c24b06dc3934f5a..38a454c4b767fc9d819c3770b54c559913505c22 100644 (file)
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -62,7 +62,7 @@ dependencies = [
   "proc-macro-error",
   "proc-macro2",
   "quote",
- "syn",
+ "syn 1.0.109",
  ]
  
  [[package]]
@@ -194,6 +194,17 @@ dependencies = [
   "num-traits",
  ]
  
+[[package]]
+name = "num-derive"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
  [[package]]
  name = "num-integer"
  version = "0.1.45"
@@ -229,9 +240,9 @@ dependencies = [
  
  [[package]]
  name = "num-traits"
-version = "0.2.15"
+version = "0.2.16"
  source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
+checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
  dependencies = [
   "autocfg",
  ]
@@ -257,7 +268,7 @@ dependencies = [
   "proc-macro-error-attr",
   "proc-macro2",
   "quote",
- "syn",
+ "syn 1.0.109",
   "version_check",
  ]
  
@@ -274,9 +285,9 @@ dependencies = [
  
  [[package]]
  name = "proc-macro2"
-version = "1.0.51"
+version = "1.0.66"
  source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
+checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
  dependencies = [
   "unicode-ident",
  ]
@@ -289,14 +300,16 @@ dependencies = [
   "clap",
   "hexplay",
   "num",
+ "num-derive",
+ "num-traits",
   "thiserror",
  ]
  
  [[package]]
  name = "quote"
-version = "1.0.23"
+version = "1.0.32"
  source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
+checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
  dependencies = [
   "proc-macro2",
  ]
@@ -332,6 +345,17 @@ dependencies = [
   "unicode-ident",
  ]
  
+[[package]]
+name = "syn"
+version = "2.0.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
  [[package]]
  name = "termcolor"
  version = "0.3.6"
@@ -367,7 +391,7 @@ checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e"
  dependencies = [
   "proc-macro2",
   "quote",
- "syn",
+ "syn 1.0.109",
  ]
  
  [[package]]
diff --git a/rust/Cargo.toml b/rust/Cargo.toml

index 09324d1dfdbc3d6b22e3e9658cf5d0e64ec50247..d276b1ae18eb32b03163c6d317185417f10ab8bc 100644 (file)
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -9,6 +9,8 @@ anyhow = "1.0.69"
  clap = { version = "4.1.7", features = ["derive"] }
  hexplay = "0.2.1"
  num = "0.4.0"
+num-derive = "0.4.0"
+num-traits = "0.2.16"
  thiserror = "1.0"
  
  [[bin]]
@@ -16,4 +18,4 @@ name = "pspp-dump-sav"
  path = "src/main.rs"
  
  [lib]
-path = "src/lib.rs"
-\ No newline at end of file
+path = "src/lib.rs"
diff --git a/rust/src/lib.rs b/rust/src/lib.rs

index 26db62dc119548097ebff22762aecf9c73c0dd27..2bde62b758b0de13c74896205a3a9a905cd1a717 100644 (file)
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -1,6 +1,7 @@
  #![allow(unused_variables)]
  use endian::{Endian, Parse};
  use num::Integer;
+use num_derive::FromPrimitive;
  use std::io::{BufReader, Error as IoError, Read, Seek};
  use thiserror::Error;
  
@@ -51,6 +52,23 @@ pub enum Error {
  
      #[error("At offset {offset:#x}, number of variables associated with a value label ({n}) is not between 1 and the number of variables ({max}).")]
      BadNumberOfValueLabelVariables { offset: u64, n: u32, max: u32 },
+
+    #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
+    ExtensionRecordTooLarge {
+        offset: u64,
+        subtype: u32,
+        size: u32,
+        count: u32,
+    },
+
+    #[error("Wrong ZLIB data header offset {zheader_offset:#x} (expected {offset:#x}).")]
+    BadZlibHeaderOffset { offset: u64, zheader_offset: u64 },
+
+    #[error("At offset {offset:#x}, impossible ZLIB trailer offset {ztrailer_offset:#x}.")]
+    BadZlibTrailerOffset { offset: u64, ztrailer_offset: u64 },
+
+    #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")]
+    BadZlibTrailerLen { offset: u64, ztrailer_len: u64 },
  }
  
  #[derive(Error, Debug)]
@@ -70,12 +88,11 @@ pub enum Compression {
  
  pub struct Reader<R: Read> {
      r: BufReader<R>,
-
-    document_record: Option<DocumentRecord>,
-
+    documents: Vec<DocumentRecord>,
      variables: Vec<VariableRecord>,
-
      value_labels: Vec<ValueLabelRecord>,
+    extensions: Vec<ExtensionRecord>,
+    zheader: Option<ZHeader>,
  }
  
  /// Magic number for a regular system file.
@@ -124,49 +141,42 @@ pub struct FileHeader {
      pub file_label: [u8; 64],
  }
  
-pub const DOC_LINE_LEN: u32 = 80;
-pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
-
  impl<R: Read + Seek> Reader<R> {
      pub fn new(r: R, warn: impl Fn(Warning)) -> Result<Reader<R>, Error> {
          let mut r = BufReader::new(r);
  
          let header = read_header(&mut r, &warn)?;
          let e = header.endianness;
-        let mut document_record = None;
+        let mut documents = Vec::new();
          let mut variables = Vec::new();
          let mut value_labels = Vec::new();
+        let mut extensions = Vec::new();
          loop {
              let offset = r.stream_position()?;
              let rec_type: u32 = e.parse(read_bytes(&mut r)?);
              match rec_type {
                  2 => variables.push(read_variable_record(&mut r, e)?),
                  3 => value_labels.push(read_value_label_record(&mut r, e, variables.len())?),
-                // A Type 4 record is always immediately after a type 3 record,
-                // the code for type 3 records reads the type 4 record too.
                  4 => return Err(Error::MisplacedType4Record(offset)),
-
-                6 => {
-                    let d = read_document_record(&mut r, e)?;
-                    if document_record.is_some() {
-                        warn(Warning::DuplicateDocumentRecord);
-                    } else {
-                        document_record = d;
-                    }
-                }
-                /*
-                                7 => d.read_extension_record()?,
-                */
+                6 => documents.push(read_document_record(&mut r, e)?),
+                7 => extensions.push(read_extension_record(&mut r, e)?),
                  999 => break,
                  _ => return Err(Error::BadRecordType { offset, rec_type }),
              }
          }
+        let _: [u8; 4] = read_bytes(&mut r)?;
+        let zheader = match header.is_zsav {
+            true => Some(read_zheader(&mut r, e)?),
+            false => None,
+        };
  
          Ok(Reader {
              r,
-            document_record,
+            documents,
              variables,
              value_labels,
+            extensions,
+            zheader,
          })
      }
  }
@@ -390,6 +400,9 @@ fn read_value_label_record<R: Read + Seek>(
      })
  }
  
+pub const DOC_LINE_LEN: u32 = 80;
+pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
+
  pub struct DocumentRecord {
      /// Offset from the start of the file to the start of the record.
      pub pos: u64,
@@ -401,28 +414,184 @@ pub struct DocumentRecord {
  fn read_document_record<R: Read + Seek>(
      r: &mut BufReader<R>,
      e: Endian,
-) -> Result<Option<DocumentRecord>, Error> {
+) -> Result<DocumentRecord, Error> {
      let offset = r.stream_position()?;
      let n: u32 = e.parse(read_bytes(r)?);
-    if n == 0 {
-        Ok(None)
-    } else if n > DOC_MAX_LINES {
-        Err(Error::BadDocumentLength {
+    match n {
+        0..=DOC_MAX_LINES => {
+            let pos = r.stream_position()?;
+            let mut lines = Vec::with_capacity(n as usize);
+            for _ in 0..n {
+                let line: [u8; 80] = read_bytes(r)?;
+                lines.push(line);
+            }
+            Ok(DocumentRecord { pos, lines })
+        }
+        _ => Err(Error::BadDocumentLength {
              offset,
              n,
              max: DOC_MAX_LINES,
-        })
-    } else {
-        let pos = r.stream_position()?;
-        let mut lines = Vec::with_capacity(n as usize);
-        for i in 0..n {
-            let line: [u8; 80] = read_bytes(r)?;
-            lines.push(line);
-        }
-        Ok(Some(DocumentRecord { pos, lines }))
+        }),
+    }
+}
+
+#[derive(FromPrimitive)] // Derives integer-to-variant conversion; NOTE(review): discriminants are presumably type 7 record subtypes -- confirm.
+enum Extension {
+    /// Machine integer info.
+    Integer = 3,
+    /// Machine floating-point info.
+    Float = 4,
+    /// Variable sets.
+    VarSets = 5,
+    /// DATE.
+    Date = 6,
+    /// Multiple response sets.
+    Mrsets = 7,
+    /// SPSS Data Entry.
+    DataEntry = 8,
+    /// Extra product info text.
+    ProductInfo = 10,
+    /// Variable display parameters.
+    Display = 11,
+    /// Long variable names.
+    LongNames = 13,
+    /// Long strings.
+    LongStrings = 14,
+    /// Extended number of cases.
+    Ncases = 16,
+    /// Data file attributes.
+    FileAttrs = 17,
+    /// Variable attributes.
+    VarAttrs = 18,
+    /// Multiple response sets (extended).
+    Mrsets2 = 19,
+    /// Character encoding.
+    Encoding = 20,
+    /// Value labels for long strings.
+    LongLabels = 21,
+    /// Missing values for long strings.
+    LongMissing = 22,
+    /// "Format properties in dataview table".
+    Dataview = 24,
+}
+
+struct ExtensionRecord {
+    /// File offset of the record's data, just past the subtype, size, and count fields.
+    offset: u64,
+
+    /// Record subtype.
+    subtype: u32,
+
+    /// Size of each data element.
+    size: u32,
+
+    /// Number of data elements.
+    count: u32,
+
+    /// `size * count` bytes of data.
+    data: Vec<u8>,
+}
+
+fn extension_record_size_requirements(extension: Extension) -> (u32, u32) {
+    match extension {
+        /* Implemented record types.  NOTE(review): pairs look like (element size, element count) with 0 = any; confirm. */
+        Extension::Integer => (4, 8),
+        Extension::Float => (8, 3),
+        Extension::VarSets => (1, 0),
+        Extension::Mrsets => (1, 0),
+        Extension::ProductInfo => (1, 0),
+        Extension::Display => (4, 0),
+        Extension::LongNames => (1, 0),
+        Extension::LongStrings => (1, 0),
+        Extension::Ncases => (8, 2),
+        Extension::FileAttrs => (1, 0),
+        Extension::VarAttrs => (1, 0),
+        Extension::Mrsets2 => (1, 0),
+        Extension::Encoding => (1, 0),
+        Extension::LongLabels => (1, 0),
+        Extension::LongMissing => (1, 0),
+
+        /* Ignored record types.  Every one of these maps to (0, 0). */
+        Extension::Date => (0, 0),
+        Extension::DataEntry => (0, 0),
+        Extension::Dataview => (0, 0),
      }
  }
  
+fn read_extension_record<R: Read + Seek>( // Parses subtype, size, and count, then reads `size * count` bytes of data.
+    r: &mut BufReader<R>,
+    e: Endian,
+) -> Result<ExtensionRecord, Error> {
+    let subtype = e.parse(read_bytes(r)?);
+    let offset = r.stream_position()?; // Taken just after the subtype field; only used in the error below.
+    let size: u32 = e.parse(read_bytes(r)?);
+    let count = e.parse(read_bytes(r)?);
+    let Some(product) = size.checked_mul(count) else { // `size * count` overflowed u32.
+        return Err(Error::ExtensionRecordTooLarge {
+            offset,
+            subtype,
+            size,
+            count,
+        });
+    };
+    let offset = r.stream_position()?; // Shadows the earlier `offset`: position where the data begins; this is the value stored.
+    let data = read_vec(r, product as usize)?;
+    Ok(ExtensionRecord {
+        offset,
+        subtype,
+        size,
+        count,
+        data,
+    })
+}
+
+struct ZHeader {
+    /// File offset to the start of the record.
+    offset: u64,
+
+    /// File offset to the ZLIB data header.
+    zheader_offset: u64,
+
+    /// File offset to the ZLIB trailer.
+    ztrailer_offset: u64,
+
+    /// Length of the ZLIB trailer in bytes.
+    ztrailer_len: u64,
+}
+
+fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHeader, Error> {
+    let offset = r.stream_position()?; // Stream position before any header field is read.
+    let zheader_offset: u64 = e.parse(read_bytes(r)?);
+    let ztrailer_offset: u64 = e.parse(read_bytes(r)?);
+    let ztrailer_len: u64 = e.parse(read_bytes(r)?);
+
+    if zheader_offset != offset { // The header must record the position it was actually read from.
+        return Err(Error::BadZlibHeaderOffset {
+            offset,
+            zheader_offset,
+        });
+    }
+    if ztrailer_offset < offset { // A trailer offset before this header is rejected.
+        return Err(Error::BadZlibTrailerOffset {
+            offset,
+            ztrailer_offset,
+        });
+    }
+    if ztrailer_len < 24 || ztrailer_len % 24 != 0 { // Length must be a non-zero multiple of 24.
+        return Err(Error::BadZlibTrailerLen {
+            offset,
+            ztrailer_len,
+        });
+    }
+
+    Ok(ZHeader {
+        offset,
+        zheader_offset,
+        ztrailer_offset,
+        ztrailer_len,
+    })
+}
+
  fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
      let mut buf = [0; N];
      r.read_exact(&mut buf)?;
author	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 24 Jul 2023 16:02:56 +0000 (09:02 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 24 Jul 2023 16:02:56 +0000 (09:02 -0700)
rust/Cargo.lock		patch \| blob \| history
rust/Cargo.toml		patch \| blob \| history
rust/src/lib.rs		patch \| blob \| history