From: Ben Pfaff
Date: Sun, 23 Jul 2023 21:19:26 +0000 (-0700)
Subject: Work on system file library.
X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e62271c65d61e9e84a6eb97a9db4673e710761c4;p=pspp

Work on system file library.
---

diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index 6ebd99f2c1..09889b3500 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -282,13 +282,14 @@ dependencies = [
 ]
 
 [[package]]
-name = "pspp-dump-sav"
+name = "pspp"
 version = "1.0.0"
 dependencies = [
  "anyhow",
  "clap",
  "hexplay",
  "num",
+ "thiserror",
 ]
 
 [[package]]
@@ -349,6 +350,26 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "thiserror"
+version = "1.0.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.6"
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index fe9af6885a..09324d1dfd 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "pspp-dump-sav"
+name = "pspp"
 version = "1.0.0"
 edition = "2021"
 authors = [ "Ben Pfaff", "John Darrington" ]
@@ -9,3 +9,11 @@ anyhow = "1.0.69"
 clap = { version = "4.1.7", features = ["derive"] }
 hexplay = "0.2.1"
 num = "0.4.0"
+thiserror = "1.0"
+
+[[bin]]
+name = "pspp-dump-sav"
+path = "src/main.rs"
+
+[lib]
+path = "src/lib.rs"
\ No newline at end of file
diff --git a/rust/src/endian.rs b/rust/src/endian.rs
new file mode 100644
index 0000000000..51952137b7
--- /dev/null
+++ b/rust/src/endian.rs
@@ -0,0 +1,119 @@
+#[derive(Copy, Clone, Debug)]
+/// The endianness for integer and floating-point numbers in SPSS system files.
+///
+/// SPSS system files can declare IBM 370 and DEC VAX floating-point
+/// representations, but no file that uses either of these has ever been found
+/// in the wild, so this code does not handle them.
+pub enum Endian {
+    /// Big-endian: MSB at lowest address.
+    Big,
+
+    /// Little-endian: LSB at lowest address.
+    Little,
+}
+
+impl Endian {
+    /// Returns the byte order under which `bytes` decodes to `expected_value`,
+    /// or `None` if neither byte order or both byte orders produce it.
+    pub fn identify_u32(expected_value: u32, bytes: [u8; 4]) -> Option<Endian> {
+        let as_big: u32 = Endian::Big.parse(bytes);
+        let as_little: u32 = Endian::Little.parse(bytes);
+        match (as_big == expected_value, as_little == expected_value) {
+            (true, false) => Some(Endian::Big),
+            (false, true) => Some(Endian::Little),
+            _ => None
+        }
+    }
+
+    /// Returns the byte order under which `bytes` decodes to `expected_value`,
+    /// or `None` if neither byte order or both byte orders produce it.
+    pub fn identify_f64(expected_value: f64, bytes: [u8; 8]) -> Option<Endian> {
+        let as_big: f64 = Endian::Big.parse(bytes);
+        let as_little: f64 = Endian::Little.parse(bytes);
+        match (as_big == expected_value, as_little == expected_value) {
+            (true, false) => Some(Endian::Big),
+            (false, true) => Some(Endian::Little),
+            _ => None
+        }
+    }
+}
+
+/// Parses an `N`-byte slice in one of the supported formats into native format
+/// as type `T`.
+pub trait Parse<T, const N: usize> {
+    /// Given 'bytes', returns `T`.
+    fn parse(self, bytes: [u8; N]) -> T;
+}
+impl Parse<u64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> u64 {
+        match self {
+            Endian::Big => u64::from_be_bytes(bytes),
+            Endian::Little => u64::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u32, 4> for Endian {
+    fn parse(self, bytes: [u8; 4]) -> u32 {
+        match self {
+            Endian::Big => u32::from_be_bytes(bytes),
+            Endian::Little => u32::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u16, 2> for Endian {
+    fn parse(self, bytes: [u8; 2]) -> u16 {
+        match self {
+            Endian::Big => u16::from_be_bytes(bytes),
+            Endian::Little => u16::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u8, 1> for Endian {
+    fn parse(self, bytes: [u8; 1]) -> u8 {
+        match self {
+            Endian::Big => u8::from_be_bytes(bytes),
+            Endian::Little => u8::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> i64 {
+        match self {
+            Endian::Big => i64::from_be_bytes(bytes),
+            Endian::Little => i64::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i32, 4> for Endian {
+    fn parse(self, bytes: [u8; 4]) -> i32 {
+        match self {
+            Endian::Big => i32::from_be_bytes(bytes),
+            Endian::Little => i32::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i16, 2> for Endian {
+    fn parse(self, bytes: [u8; 2]) -> i16 {
+        match self {
+            Endian::Big => i16::from_be_bytes(bytes),
+            Endian::Little => i16::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i8, 1> for Endian {
+    fn parse(self, bytes: [u8; 1]) -> i8 {
+        match self {
+            Endian::Big => i8::from_be_bytes(bytes),
+            Endian::Little => i8::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<f64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> f64 {
+        match self {
+            Endian::Big => f64::from_be_bytes(bytes),
+            Endian::Little => f64::from_le_bytes(bytes),
+        }
+    }
+}
+
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
new file mode 100644
index 0000000000..5d6bf5c208
--- /dev/null
+++ b/rust/src/lib.rs
@@ -0,0 +1,165 @@
+#![allow(unused_variables)]
+use endian::{Endian, Parse};
+use std::io::{BufReader, Error as IoError, Read};
+use thiserror::Error;
+
+pub mod endian;
+
+#[derive(Error, Debug)]
+pub enum Error {
+    #[error("Not an SPSS system file")]
+    NotASystemFile,
+
+    #[error("I/O error ({source})")]
+    Io {
+        #[from]
+        source: IoError,
+    },
+
+    #[error("Invalid SAV compression code {0}")]
+    InvalidSavCompression(u32),
+
+    #[error("Invalid ZSAV compression code {0}")]
+    InvalidZsavCompression(u32),
+}
+
+#[derive(Error, Debug)]
+pub enum Warning {
+    #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")]
+    UnexpectedBias(f64),
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum Compression {
+    Simple,
+    ZLib,
+}
+
+pub struct Reader<R: Read> {
+    r: BufReader<R>,
+}
+
+pub const ASCII_MAGIC: &[u8; 4] = b"$FL2";
+pub const ASCII_ZMAGIC: &[u8; 4] = b"$FL3";
+pub const EBCDIC_MAGIC: &[u8; 4] = &[0x5b, 0xc6, 0xd3, 0xf2];
+
+pub struct FileHeader {
+    /// First 4 bytes of the file, one of `ASCII_MAGIC`, `ASCII_ZMAGIC`, and
+    /// `EBCDIC_MAGIC`.
+    pub magic: [u8; 4],
+
+    /// True if `magic` indicates that this file contained zlib-compressed data.
+    pub is_zsav: bool,
+
+    /// True if `magic` indicates that this file contained EBCDIC data.
+    pub is_ebcdic: bool,
+
+    /// 0-based variable index of the weight variable, or `None` if the file is
+    /// unweighted.
+    pub weight_index: Option<u32>,
+
+    /// Number of variable positions, or `None` if the value in the file is
+    /// questionably trustworthy.
+    pub nominal_case_size: Option<u32>,
+
+    /// `dd mmm yy` in the file's encoding.
+    pub creation_date: [u8; 9],
+
+    /// `HH:MM:SS` in the file's encoding.
+    pub creation_time: [u8; 8],
+
+    /// Eye-catcher string, then product name, in the file's encoding. Padded
+    /// on the right with spaces.
+    pub eye_catcher: [u8; 60],
+
+    /// File label, in the file's encoding. Padded on the right with spaces.
+    pub file_label: [u8; 64],
+}
+
+impl<R: Read> Reader<R> {
+    pub fn new(r: R, warn: impl Fn(Warning)) -> Result<Reader<R>, Error> {
+        let mut r = BufReader::new(r);
+
+        let magic: [u8; 4] = read_bytes(&mut r)?;
+        let (is_zsav, is_ebcdic) = match &magic {
+            ASCII_MAGIC => (false, false),
+            ASCII_ZMAGIC => (true, false),
+            EBCDIC_MAGIC => (false, true),
+            _ => return Err(Error::NotASystemFile),
+        };
+
+        let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
+        let layout_code: [u8; 4] = read_bytes(&mut r)?;
+        let endianness = Endian::identify_u32(2, layout_code)
+            .or_else(|| Endian::identify_u32(3, layout_code)) // some files use layout code 3
+            .ok_or(Error::NotASystemFile)?;
+
+        let nominal_case_size: u32 = endianness.parse(read_bytes(&mut r)?);
+        let nominal_case_size = (nominal_case_size <= u32::MAX / 32).then_some(nominal_case_size);
+
+        let compression_code: u32 = endianness.parse(read_bytes(&mut r)?);
+        let compression = match (is_zsav, compression_code) {
+            (false, 0) => None,
+            (false, 1) => Some(Compression::Simple),
+            (true, 2) => Some(Compression::ZLib),
+            (false, code) => return Err(Error::InvalidSavCompression(code)),
+            (true, code) => return Err(Error::InvalidZsavCompression(code)),
+        };
+
+        let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
+        let weight_index = weight_index.checked_sub(1); // 0 means unweighted; no eager `0 - 1`
+
+        let n_cases: u32 = endianness.parse(read_bytes(&mut r)?);
+        let n_cases = (n_cases <= u32::MAX / 4).then_some(n_cases);
+
+        let bias: f64 = endianness.parse(read_bytes(&mut r)?);
+        if bias != 100.0 {
+            warn(Warning::UnexpectedBias(bias))
+        }
+
+        let creation_date: [u8; 9] = read_bytes(&mut r)?;
+        let creation_time: [u8; 8] = read_bytes(&mut r)?;
+        let file_label: [u8; 64] = read_bytes(&mut r)?;
+        let _: [u8; 3] = read_bytes(&mut r)?;
+
+        let header = FileHeader {
+            magic,
+            is_zsav,
+            is_ebcdic,
+            weight_index,
+            nominal_case_size,
+            creation_date,
+            creation_time,
+            eye_catcher,
+            file_label,
+        };
+
+        Ok(Reader { r })
+    }
+}
+
+fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
+    let mut buf = [0; N];
+    r.read_exact(&mut buf)?;
+    Ok(buf)
+}
+
+/*
+fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
+    while s.last() == Some(&c) {
+        s.pop();
+    }
+    s
+}
+
+fn skip_bytes<R: Read>(r: &mut R, mut n: u64) -> Result<(), IoError> {
+    let mut buf = [0; 1024];
+    while n > 0 {
+        let chunk = u64::min(n, buf.len() as u64);
+        r.read_exact(&mut buf[0..chunk as usize])?;
+        n -= chunk;
+    }
+    Ok(())
+}
+
+*/
diff --git a/utilities/pspp-dump-sav.c b/utilities/pspp-dump-sav.c
index ba5a0c9045..d08999e01b 100644
--- a/utilities/pspp-dump-sav.c
+++ b/utilities/pspp-dump-sav.c
@@ -318,6 +318,19 @@ read_header (struct sfm_reader *r)
       else
         r->float_format = FLOAT_IEEE_DOUBLE_LE;
     }
+  if ((r->integer_format == INTEGER_MSB_FIRST && r->float_format != FLOAT_IEEE_DOUBLE_BE) ||
+      (r->integer_format == INTEGER_LSB_FIRST && r->float_format != FLOAT_IEEE_DOUBLE_LE))
+    {
+      printf ("unexpected floating-point format\n");
+    }
+
+  if (r->float_format != FLOAT_IEEE_DOUBLE_LE && r->float_format != FLOAT_IEEE_DOUBLE_BE)
+    {
+      printf ("non-IEEE format\n");
+    } else
+    {
+      printf ("IEEE format\n");
+    }
   r->bias = float_get_double (r->float_format, raw_bias);
 
   read_string (r, creation_date, sizeof creation_date);