Work on system file library.
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 23 Jul 2023 21:19:26 +0000 (14:19 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 23 Jul 2023 21:19:26 +0000 (14:19 -0700)
rust/Cargo.lock
rust/Cargo.toml
rust/src/endian.rs [new file with mode: 0644]
rust/src/lib.rs [new file with mode: 0644]
utilities/pspp-dump-sav.c

index 6ebd99f2c1d4d674aae0b62d28847a7713be1747..09889b3500ccb520f9d602396c24b06dc3934f5a 100644 (file)
@@ -282,13 +282,14 @@ dependencies = [
 ]
 
 [[package]]
-name = "pspp-dump-sav"
+name = "pspp"
 version = "1.0.0"
 dependencies = [
  "anyhow",
  "clap",
  "hexplay",
  "num",
+ "thiserror",
 ]
 
 [[package]]
@@ -349,6 +350,26 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "thiserror"
+version = "1.0.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.6"
index fe9af6885a4825f89d8d45dea130ce3bfd43e959..09324d1dfdbc3d6b22e3e9658cf5d0e64ec50247 100644 (file)
@@ -1,5 +1,5 @@
 [package]
-name = "pspp-dump-sav"
+name = "pspp"
 version = "1.0.0"
 edition = "2021"
 authors = [ "Ben Pfaff", "John Darrington" ]
@@ -9,3 +9,11 @@ anyhow = "1.0.69"
 clap = { version = "4.1.7", features = ["derive"] }
 hexplay = "0.2.1"
 num = "0.4.0"
+thiserror = "1.0"
+
+[[bin]]
+name = "pspp-dump-sav"
+path = "src/main.rs"
+
+[lib]
+path = "src/lib.rs"
\ No newline at end of file
diff --git a/rust/src/endian.rs b/rust/src/endian.rs
new file mode 100644 (file)
index 0000000..5195213
--- /dev/null
@@ -0,0 +1,115 @@
+#[derive(Copy, Clone, Debug)]
+/// The endianness for integer and floating-point numbers in SPSS system files.
+///
+/// SPSS system files can declare IBM 370 and DEC VAX floating-point
+/// representations, but no file that uses either of these has ever been found
+/// in the wild, so this code does not handle them.
+pub enum Endian {
+    /// Big-endian: MSB at lowest address.
+    Big,
+
+    /// Little-endian: LSB at lowest address.
+    Little,
+}
+
+impl Endian {
+    pub fn identify_u32(expected_value: u32, bytes: [u8; 4]) -> Option<Self> {
+        let as_big: u32 = Endian::Big.parse(bytes);
+        let as_little: u32 = Endian::Little.parse(bytes);
+        match (as_big == expected_value, as_little == expected_value) {
+            (true, false) => Some(Endian::Big),
+            (false, true) => Some(Endian::Little),
+            _ => None
+        }
+    }
+
+    pub fn identify_f64(expected_value: f64, bytes: [u8; 8]) -> Option<Self> {
+        let as_big: f64 = Endian::Big.parse(bytes);
+        let as_little: f64 = Endian::Little.parse(bytes);
+        match (as_big == expected_value, as_little == expected_value) {
+            (true, false) => Some(Endian::Big),
+            (false, true) => Some(Endian::Little),
+            _ => None
+        }
+    }
+}
+
+/// Parses an `N`-byte array in one of the supported formats into native format
+/// as type `T`.
+///
+/// The implementor describes the format of the bytes; `T` and `N` together
+/// select which conversion is performed.
+pub trait Parse<T, const N: usize> {
+    /// Interprets `bytes` in the format described by `self` and returns the
+    /// resulting value of type `T`.
+    fn parse(self, bytes: [u8; N]) -> T;
+}
+impl Parse<u64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> u64 {
+        match self {
+            Endian::Big => u64::from_be_bytes(bytes),
+            Endian::Little => u64::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u32, 4> for Endian {
+    fn parse(self, bytes: [u8; 4]) -> u32 {
+        match self {
+            Endian::Big => u32::from_be_bytes(bytes),
+            Endian::Little => u32::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u16, 2> for Endian {
+    fn parse(self, bytes: [u8; 2]) -> u16 {
+        match self {
+            Endian::Big => u16::from_be_bytes(bytes),
+            Endian::Little => u16::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u8, 1> for Endian {
+    fn parse(self, bytes: [u8; 1]) -> u8 {
+        match self {
+            Endian::Big => u8::from_be_bytes(bytes),
+            Endian::Little => u8::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> i64 {
+        match self {
+            Endian::Big => i64::from_be_bytes(bytes),
+            Endian::Little => i64::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i32, 4> for Endian {
+    fn parse(self, bytes: [u8; 4]) -> i32 {
+        match self {
+            Endian::Big => i32::from_be_bytes(bytes),
+            Endian::Little => i32::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i16, 2> for Endian {
+    fn parse(self, bytes: [u8; 2]) -> i16 {
+        match self {
+            Endian::Big => i16::from_be_bytes(bytes),
+            Endian::Little => i16::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i8, 1> for Endian {
+    fn parse(self, bytes: [u8; 1]) -> i8 {
+        match self {
+            Endian::Big => i8::from_be_bytes(bytes),
+            Endian::Little => i8::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<f64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> f64 {
+        match self {
+            Endian::Big => f64::from_be_bytes(bytes),
+            Endian::Little => f64::from_le_bytes(bytes),
+        }
+    }
+}
+
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
new file mode 100644 (file)
index 0000000..5d6bf5c
--- /dev/null
@@ -0,0 +1,165 @@
+#![allow(unused_variables)]
+use endian::{Endian, Parse};
+use std::io::{BufReader, Error as IoError, Read};
+use thiserror::Error;
+
+pub mod endian;
+
+/// Fatal problems reported while reading a system file.
+#[derive(Error, Debug)]
+pub enum Error {
+    /// The input does not start with a recognized magic number, or its layout
+    /// code could not be used to identify the byte order.
+    #[error("Not an SPSS system file")]
+    NotASystemFile,
+
+    /// Reading from the underlying input failed.  Converted automatically
+    /// from `std::io::Error` by `?`.
+    #[error("I/O error ({source})")]
+    Io {
+        #[from]
+        source: IoError,
+    },
+
+    /// The header of a file without the zlib magic number carries the given
+    /// unsupported compression code.
+    #[error("Invalid SAV compression code {0}")]
+    InvalidSavCompression(u32),
+
+    /// The header of a file with the zlib magic number carries the given
+    /// unsupported compression code.
+    #[error("Invalid ZSAV compression code {0}")]
+    InvalidZsavCompression(u32),
+}
+
+/// Non-fatal oddities reported through the `warn` callback given to
+/// `Reader::new`.
+#[derive(Error, Debug)]
+pub enum Warning {
+    /// The floating-point bias in the header, carried here, is not 100.0.
+    #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")]
+    UnexpectedBias(f64),
+}
+
+/// Compression scheme declared by the compression code in the file header.
+#[derive(Copy, Clone, Debug)]
+pub enum Compression {
+    /// Selected by compression code 1 in a file without the zlib magic number.
+    Simple,
+    /// Selected by compression code 2 in a file with the zlib magic number.
+    ZLib,
+}
+
+/// Reader for an SPSS system file, created with `Reader::new`.
+pub struct Reader<R: Read> {
+    /// The caller's input wrapped in a buffer.
+    r: BufReader<R>,
+}
+
+/// Magic number that starts a system file written in an ASCII-based encoding.
+pub const ASCII_MAGIC: &[u8; 4] = b"$FL2";
+/// Magic number that starts a system file containing zlib-compressed data.
+pub const ASCII_ZMAGIC: &[u8; 4] = b"$FL3";
+/// Magic number that starts a system file containing EBCDIC data: the
+/// characters `$FL2` encoded in EBCDIC.
+pub const EBCDIC_MAGIC: &[u8; 4] = &[0x5b, 0xc6, 0xd3, 0xf2];
+
+/// Fields decoded from the header at the start of a system file.
+///
+/// Text fields are kept as raw bytes in the file's own encoding; no character
+/// set conversion is applied.
+pub struct FileHeader {
+    /// First 4 bytes of the file, one of `ASCII_MAGIC`, `ASCII_ZMAGIC`, and
+    /// `EBCDIC_MAGIC`.
+    pub magic: [u8; 4],
+
+    /// True if `magic` indicates that this file contained zlib-compressed data.
+    pub is_zsav: bool,
+
+    /// True if `magic` indicates that this file contained EBCDIC data.
+    pub is_ebcdic: bool,
+
+    /// 0-based variable index of the weight variable, or `None` if the file is
+    /// unweighted.
+    pub weight_index: Option<u32>,
+
+    /// Number of variable positions, or `None` if the value in the file is
+    /// questionably trustworthy.
+    pub nominal_case_size: Option<u32>,
+
+    /// `dd mmm yy` in the file's encoding.
+    pub creation_date: [u8; 9],
+
+    /// `HH:MM:SS` in the file's encoding.
+    pub creation_time: [u8; 8],
+
+    /// Eye-catcher string, then product name, in the file's encoding.  Padded
+    /// on the right with spaces.
+    pub eye_catcher: [u8; 60],
+
+    /// File label, in the file's encoding.  Padded on the right with spaces.
+    pub file_label: [u8; 64],
+}
+
+impl<R: Read> Reader<R> {
+    pub fn new(r: R, warn: impl Fn(Warning)) -> Result<Reader<R>, Error> {
+        let mut r = BufReader::new(r);
+
+        let magic: [u8; 4] = read_bytes(&mut r)?;
+        let (is_zsav, is_ebcdic) = match &magic {
+            ASCII_MAGIC => (false, false),
+            ASCII_ZMAGIC => (true, false),
+            EBCDIC_MAGIC => (false, true),
+            _ => return Err(Error::NotASystemFile),
+        };
+
+        let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
+        let layout_code: [u8; 4] = read_bytes(&mut r)?;
+        let endianness = Endian::identify_u32(2, layout_code)
+            .or_else(|| Endian::identify_u32(2, layout_code))
+            .ok_or_else(|| Error::NotASystemFile)?;
+
+        let nominal_case_size: u32 = endianness.parse(read_bytes(&mut r)?);
+        let nominal_case_size = (nominal_case_size <= u32::MAX / 32).then_some(nominal_case_size);
+
+        let compression_code: u32 = endianness.parse(read_bytes(&mut r)?);
+        let compression = match (is_zsav, compression_code) {
+            (false, 0) => None,
+            (false, 1) => Some(Compression::Simple),
+            (true, 2) => Some(Compression::ZLib),
+            (false, code) => return Err(Error::InvalidSavCompression(code)),
+            (true, code) => return Err(Error::InvalidZsavCompression(code)),
+        };
+
+        let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
+        let weight_index = (weight_index > 0).then_some(weight_index - 1);
+
+        let n_cases: u32 = endianness.parse(read_bytes(&mut r)?);
+        let n_cases = (n_cases <= u32::MAX / 4).then_some(n_cases);
+
+        let bias: f64 = endianness.parse(read_bytes(&mut r)?);
+        if bias != 100.0 {
+            warn(Warning::UnexpectedBias(bias))
+        }
+
+        let creation_date: [u8; 9] = read_bytes(&mut r)?;
+        let creation_time: [u8; 8] = read_bytes(&mut r)?;
+        let file_label: [u8; 64] = read_bytes(&mut r)?;
+        let _: [u8; 3] = read_bytes(&mut r)?;
+
+        let header = FileHeader {
+            magic,
+            is_zsav,
+            is_ebcdic,
+            weight_index,
+            nominal_case_size,
+            creation_date,
+            creation_time,
+            eye_catcher,
+            file_label,
+        };
+
+        Ok(Reader { r })
+    }
+}
+
+fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
+    let mut buf = [0; N];
+    r.read_exact(&mut buf)?;
+    Ok(buf)
+}
+
+/*
+fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
+    while s.last() == Some(&c) {
+        s.pop();
+    }
+    s
+}
+
+fn skip_bytes<R: Read>(r: &mut R, mut n: u64) -> Result<(), IoError> {
+    let mut buf = [0; 1024];
+    while n > 0 {
+        let chunk = u64::min(n, buf.len() as u64);
+        r.read_exact(&mut buf[0..chunk as usize])?;
+        n -= chunk;
+    }
+    Ok(())
+}
+
+*/
index ba5a0c9045c3c78734c2386bf897e08c66b963cb..d08999e01b673984a4af349c5d862783fea17bf3 100644 (file)
@@ -318,6 +318,19 @@ read_header (struct sfm_reader *r)
       else
         r->float_format = FLOAT_IEEE_DOUBLE_LE;
     }
+  if ((r->integer_format == INTEGER_MSB_FIRST && r->float_format != FLOAT_IEEE_DOUBLE_BE) ||
+      (r->integer_format == INTEGER_LSB_FIRST && r->float_format != FLOAT_IEEE_DOUBLE_LE)) 
+    {
+      printf ("unexpected floating-point format\n");
+    }
+
+  if (r->float_format != FLOAT_IEEE_DOUBLE_LE && r->float_format != FLOAT_IEEE_DOUBLE_BE)
+    {
+      printf ("non-IEEE format\n");
+    } else
+    {
+      printf ("IEEE format\n");
+    }
   r->bias = float_get_double (r->float_format, raw_bias);
 
   read_string (r, creation_date, sizeof creation_date);