I think it's complete.

author Ben Pfaff <blp@cs.stanford.edu>

Sun, 23 Jul 2023 05:48:28 +0000 (22:48 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Sun, 23 Jul 2023 05:48:28 +0000 (22:48 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 23 Jul 2023 05:48:28 +0000 (22:48 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 23 Jul 2023 05:48:28 +0000 (22:48 -0700)
diff --git a/rust/src/hexfloat.rs b/rust/src/hexfloat.rs

new file mode 100644 (file)

index 0000000..b885fb2
--- /dev/null
+++ b/rust/src/hexfloat.rs
@@ -0,0 +1,52 @@
+use num::Float;
+use std::{num::FpCategory, fmt::{Display, Formatter, Result}};
+
+/// Wrapper around a floating-point value whose `Display` implementation
+/// writes the value in hexadecimal floating-point notation.
+pub struct HexFloat<T: Float>(pub T);
+
+impl<T: Float> Display for HexFloat<T> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
+        let sign = if self.0.is_sign_negative() { "-" } else { "" };
+        match self.0.classify() {
+            FpCategory::Nan => return write!(f, "NaN"),
+            FpCategory::Infinite => return write!(f, "{sign}Infinity"),
+            FpCategory::Zero => return write!(f, "{sign}0.0"),
+            _ => (),
+        };
+        let (significand, mut exponent, _) = self.0.integer_decode();
+        let mut hex_sig = format!("{:x}", significand);
+        while hex_sig.ends_with('0') {
+            hex_sig.pop();
+            exponent += 4;
+        }
+        match hex_sig.len() {
+            0 => write!(f, "{sign}0.0"),
+            1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
+            len => write!(
+                f,
+                "{sign}0x{}.{}p{}",
+                hex_sig.chars().next().unwrap(),
+                &hex_sig[1..],
+                exponent + 4 * (len as i16 - 1)
+            ),
+        }
+    }
+}
+
+#[cfg(test)]
+mod hex_float_tests {
+    use crate::HexFloat;
+    use num::Float;
+
+    #[test]
+    fn test() {
+        assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
+        assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
+        assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
+        assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
+        assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
+        assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
+        assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
+        assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
+    }
+}
+
diff --git a/rust/src/main.rs b/rust/src/main.rs

index 9c92b5515d50f8dd6c2c10f4cce46258d0cd1b5b..7ee84de9ce16cc0a80b0d4742bc3964278d248fa 100644 (file)
--- a/rust/src/main.rs
+++ b/rust/src/main.rs
@@ -1,5 +1,3 @@
-#![allow(unused_variables)]
-#![allow(dead_code)]
  /* PSPP - a program for statistical analysis.
   * Copyright (C) 2023 Free Software Foundation, Inc.
   *
@@ -20,14 +18,19 @@ use anyhow::{anyhow, Result};
  use clap::Parser;
  use hexplay::HexView;
  use hexplay::HexViewBuilder;
-use num::{Float, Num};
+use num::Num;
  use std::cmp::Ordering;
+use std::collections::VecDeque;
+use std::fmt;
  use std::fs::File;
  use std::io::prelude::*;
  use std::io::BufReader;
+use std::io::ErrorKind;
  use std::path::{Path, PathBuf};
  use std::str;
-use std::{fmt, num::FpCategory};
+
+mod hexfloat;
+use hexfloat::HexFloat;
  
  /// A utility to dissect SPSS system files.
  #[derive(Parser, Debug)]
@@ -46,7 +49,7 @@ fn main() -> Result<()> {
      let Args { max_cases, files } = Args::parse();
  
      for file in files {
-        Dissector::new(file)?;
+        Dissector::new(file, max_cases)?;
      }
      Ok(())
  }
@@ -156,13 +159,18 @@ trait ReadSwap<T> {
      fn read_swap(&mut self) -> Result<T>;
  }
  
+/// Reads a single `u8`: the raw bytes come from `read_bytes` on the
+/// dissector's reader and are converted by `self.endianness.parse`.
+impl ReadSwap<u8> for Dissector {
+    fn read_swap(&mut self) -> Result<u8> {
+        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
+    }
+}
  impl ReadSwap<u32> for Dissector {
      fn read_swap(&mut self) -> Result<u32> {
          Ok(self.endianness.parse(read_bytes(&mut self.r)?))
      }
  }
-impl ReadSwap<u8> for Dissector {
-    fn read_swap(&mut self) -> Result<u8> {
+/// Reads a `u64`: the raw bytes come from `read_bytes` on the dissector's
+/// reader and are converted by `self.endianness.parse`.
+impl ReadSwap<u64> for Dissector {
+    fn read_swap(&mut self) -> Result<u64> {
          Ok(self.endianness.parse(read_bytes(&mut self.r)?))
      }
  }
@@ -182,7 +190,6 @@ impl ReadSwap<f64> for Dissector {
  struct Dissector {
      filename: String,
      r: BufReader<File>,
-    compression: Option<Compression>,
      endianness: Endianness,
      fp_format: Endianness,
      bias: f64,
@@ -218,13 +225,6 @@ fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
      s
  }
  
-fn slice_trim_end(mut s: &[u8], c: u8) -> &[u8] {
-    while s.last() == Some(&c) {
-        s = s.split_last().unwrap().1;
-    }
-    s
-}
-
  fn format_name(type_: u32) -> &'static str {
      match type_ {
          1 => "A",
@@ -296,57 +296,8 @@ impl fmt::Display for UntypedValue {
      }
  }
  
-struct HexFloat<T: Float>(T);
-
-impl<T: Float> fmt::Display for HexFloat<T> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let sign = if self.0.is_sign_negative() { "-" } else { "" };
-        match self.0.classify() {
-            FpCategory::Nan => return write!(f, "NaN"),
-            FpCategory::Infinite => return write!(f, "{sign}Infinity"),
-            FpCategory::Zero => return write!(f, "{sign}0.0"),
-            _ => (),
-        };
-        let (significand, mut exponent, _) = self.0.integer_decode();
-        let mut hex_sig = format!("{:x}", significand);
-        while hex_sig.ends_with('0') {
-            hex_sig.pop();
-            exponent += 4;
-        }
-        match hex_sig.len() {
-            0 => write!(f, "{sign}0.0"),
-            1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
-            len => write!(
-                f,
-                "{sign}0x{}.{}p{}",
-                hex_sig.chars().next().unwrap(),
-                &hex_sig[1..],
-                exponent + 4 * (len as i16 - 1)
-            ),
-        }
-    }
-}
-
-#[cfg(test)]
-mod hex_float_tests {
-    use crate::HexFloat;
-    use num::Float;
-
-    #[test]
-    fn test() {
-        assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
-        assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
-        assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
-        assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
-        assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
-        assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
-        assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
-        assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
-    }
-}
-
  impl Dissector {
-    fn new<P: AsRef<Path>>(filename: P) -> Result<Dissector> {
+    fn new<P: AsRef<Path>>(filename: P, max_cases: usize) -> Result<Dissector> {
          let mut r = BufReader::new(File::open(&filename)?);
          let filename = filename.as_ref().to_string_lossy().into_owned();
          let rec_type: [u8; 4] = read_bytes(&mut r)?;
@@ -384,7 +335,6 @@ impl Dissector {
          let mut d = Dissector {
              filename,
              r,
-            compression,
              endianness,
              fp_format,
              bias,
@@ -455,9 +405,183 @@ impl Dissector {
              pos + 4
          );
  
+        match compression {
+            Some(Compression::Simple) => {
+                if max_cases > 0 {
+                    d.read_simple_compressed_data(max_cases)?;
+                }
+            }
+            Some(Compression::ZLib) => d.read_zlib_compressed_data()?,
+            None => (),
+        }
+
          Ok(d)
      }
  
+    fn read_simple_compressed_data(&mut self, max_cases: usize) -> Result<()> {
+        let _: i32 = self.read_swap()?;
+        println!("\n{:08x}: compressed data:", self.r.stream_position()?);
+
+        const N_OPCODES: usize = 8;
+        let mut opcodes = VecDeque::<u8>::with_capacity(8);
+        let mut opcode_ofs = 0;
+        for case_num in 0..max_cases {
+            println!(
+                "{:08x}: case {case_num}'s uncompressible data begins",
+                self.r.stream_position()?
+            );
+            let mut i = 0;
+            while i < self.var_widths.len() {
+                let width = self.var_widths[i];
+
+                let opcode_idx = N_OPCODES - opcodes.len();
+                let Some(opcode) = opcodes.pop_back() else {
+                    opcode_ofs = self.r.stream_position()?;
+                    let mut new_opcodes = [0; N_OPCODES];
+                    if let Err(error) = self.r.read_exact(&mut new_opcodes) {
+                        if i == 0 && error.kind() == ErrorKind::UnexpectedEof {
+                            return Ok(());
+                        } else {
+                            return Err(error.into());
+                        }
+                    };
+                    opcodes.extend(new_opcodes.into_iter());
+                    continue;
+                };
+
+                print!(
+                    "{:08x}: variable {i}: opcode {opcode}: ",
+                    opcode_ofs + opcode_idx as u64
+                );
+                match opcode {
+                    0 => println!("ignored padding"),
+                    252 => {
+                        println!("end of data");
+                        break;
+                    }
+                    253 => {
+                        let raw: [u8; 8] = read_bytes(&mut self.r)?;
+                        let value = UntypedValue::new(raw, self.fp_format);
+                        println!("uncompressible data: {value}");
+                        i += 1;
+                    }
+                    254 => {
+                        print!("spaces");
+                        if width == 0 {
+                            print!(", but this is a numeric variable");
+                        }
+                        println!();
+                        i += 1;
+                    }
+                    255 => {
+                        print!("SYSMIS");
+                        if width != 0 {
+                            print!(", but this is a string variable (width={width})");
+                        }
+                        println!();
+                        i += 1;
+                    }
+                    _ => {
+                        print!("{}", opcode as f64 - self.bias);
+                        if width != 0 {
+                            print!(", but this is a string variable (width={width})");
+                        }
+                        println!();
+                        i += 1;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn read_zlib_compressed_data(&mut self) -> Result<()> {
+        let _: i32 = self.read_swap()?;
+        let ofs = self.r.stream_position()?;
+        println!("\n{ofs:08x}: ZLIB compressed data header:");
+
+        let this_ofs: u64 = self.read_swap()?;
+        let next_ofs: u64 = self.read_swap()?;
+        let next_len: u64 = self.read_swap()?;
+
+        println!("\theader_ofs: {this_ofs:#x}");
+        if this_ofs != ofs {
+            println!("\t\t(Expected {ofs:#x}.)");
+        }
+        println!("\ttrailer_ofs: {next_ofs:#x}");
+        println!("\ttrailer_len: {next_len}");
+        if next_len < 24 || next_len % 24 != 0 {
+            println!("\t\t(Trailer length is not positive multiple of 24.)");
+        }
+
+        let zlib_data_len = next_ofs - (ofs + 8 * 3);
+        println!(
+            "\n{:08x}: {zlib_data_len:#x} bytes of ZLIB compressed data",
+            ofs + 8 * 3
+        );
+
+        self.skip_bytes(zlib_data_len)?;
+
+        println!("\n{next_ofs:08x}: ZLIB trailer fixed header");
+        let bias: u64 = self.read_swap()?;
+        let zero: u64 = self.read_swap()?;
+        let block_size: u32 = self.read_swap()?;
+        let n_blocks: u32 = self.read_swap()?;
+        println!("\tbias: {bias}");
+        println!("\tzero: {zero:#x}");
+        if zero != 0 {
+            println!("\t\t(Expected 0.)");
+        }
+        println!("\tblock size: {block_size:#x}");
+        if block_size != 0x3ff000 {
+            println!("\t\t(Expected 0x3ff000.)");
+        }
+        println!("\tn_blocks: {n_blocks}");
+        if n_blocks as u64 != next_len / 24 - 1 {
+            println!("\t\t(Expected {}.)", next_len / 24 - 1);
+        }
+
+        let mut expected_uncmp_ofs = ofs;
+        let mut expected_cmp_ofs = ofs + 24;
+        for i in 1..=n_blocks {
+            let blockinfo_ofs = self.r.stream_position()?;
+            let uncompressed_ofs: u64 = self.read_swap()?;
+            let compressed_ofs: u64 = self.read_swap()?;
+            let uncompressed_size: u32 = self.read_swap()?;
+            let compressed_size: u32 = self.read_swap()?;
+
+            println!("\n{blockinfo_ofs:08x}: ZLIB block descriptor {i}");
+
+            println!("\tuncompressed_ofs: {uncompressed_ofs:#x}");
+            if uncompressed_ofs != expected_uncmp_ofs {
+                println!("\t\t(Expected {ofs:#x}.)");
+            }
+
+            println!("\tcompressed_ofs: {compressed_ofs:#x}");
+            if compressed_ofs != expected_cmp_ofs {
+                println!("\t\t(Expected {expected_cmp_ofs:#x}.)");
+            }
+
+            println!("\tuncompressed_size: {uncompressed_size:#x}");
+            if i < n_blocks && uncompressed_size != block_size {
+                println!("\t\t(Expected {block_size:#x}.)");
+            }
+
+            println!("\tcompressed_size: {compressed_size:#x}");
+            if i == n_blocks && compressed_ofs.checked_add(compressed_size as u64) != Some(next_ofs)
+            {
+                println!(
+                    "\t\t(This was expected to be {:#x}.)",
+                    next_ofs - compressed_size as u64
+                );
+            }
+
+            expected_uncmp_ofs += uncompressed_size as u64;
+            expected_cmp_ofs += uncompressed_size as u64;
+        }
+        Ok(())
+    }
+
      fn read_extension_record(&mut self) -> Result<()> {
          let offset = self.r.stream_position()?;
          let subtype: u32 = self.read_swap()?;
@@ -475,6 +599,7 @@ impl Dissector {
              7 | 19 => self.read_mrsets(size, count),
              10 => self.read_extra_product_info(size, count),
              11 => self.read_display_parameters(size, count),
+            13 => self.read_long_string_map(size, count),
              _ => self.read_unknown_extension(subtype, size, count),
          }
      }
@@ -782,6 +907,7 @@ impl Dissector {
              let Some(name) = text.tokenize(b'=') else {
                  break;
              };
+            let name = Vec::from(name);
  
              let (mrset, cat_label_from_counted_values, label_from_var_label) = if text
                  .match_byte(b'C')
@@ -827,17 +953,45 @@ impl Dissector {
              };
  
              let counted_value = if mrset == MrSet::MD {
-                Some(text.parse_counted_string()?)
-            } else { None };
+                Some(Vec::from(text.parse_counted_string()?))
+            } else {
+                None
+            };
  
-            let label = text.parse_counted_string()?;
+            let label = Vec::from(text.parse_counted_string()?);
  
              let variables = text.tokenize(b'\n');
  
-            print!("\t\"{}\": multiple {} set",
-                   String::from_utf8_lossy(name),
-                   if mrset == MrSet::MC { "category" } else { "dichotomy" });
-            
+            print!(
+                "\t\"{}\": multiple {} set",
+                String::from_utf8_lossy(&name),
+                if mrset == MrSet::MC {
+                    "category"
+                } else {
+                    "dichotomy"
+                }
+            );
+            if let Some(counted_value) = counted_value {
+                print!(
+                    ", counted value \"{}\"",
+                    String::from_utf8_lossy(&counted_value)
+                );
+            }
+            if cat_label_from_counted_values {
+                println!(", category labels from counted values");
+            }
+            if label != b"" {
+                print!(", label \"{}\"", String::from_utf8_lossy(&label));
+            }
+            if label_from_var_label {
+                print!(", label from variable label");
+            }
+            if let Some(variables) = variables {
+                print!(", variables \"{}\"", String::from_utf8_lossy(variables));
+            } else {
+                print!("no variables");
+            }
+            println!();
          }
          Ok(())
      }
@@ -899,12 +1053,31 @@ impl Dissector {
          Ok(())
      }
  
-    fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
-        let n_bytes = match u32::checked_mul(size, count) {
-            Some(n) => n,
-            None => Err(anyhow!("Extension record too large."))?,
+    fn read_long_string_map(&mut self, size: u32, count: u32) -> Result<()> {
+        print!(
+            "{:08x}: very long strings (variable => length)",
+            self.r.stream_position()?
+        );
+        let mut text = self.open_text_record(size, count)?;
+        while let Some((var, length)) = text.read_variable_to_value_pair() {
+            println!(
+                "\t{} => {}",
+                String::from_utf8_lossy(&var),
+                String::from_utf8_lossy(&length)
+            );
+        }
+        Ok(())
+    }
+
+    /// Reads `size * count` bytes from the reader with `read_vec` and returns
+    /// them.
+    ///
+    /// Fails with "Extension record too large." if `size * count` does not
+    /// fit in a `u32`, or with whatever error `read_vec` reports.
+    fn read_text_record(&mut self, size: u32, count: u32) -> Result<Vec<u8>> {
+        let Some(n_bytes) = u32::checked_mul(size, count) else {
+            Err(anyhow!("Extension record too large."))?
          };
-        Ok(TextRecord::new(read_vec(&mut self.r, n_bytes as usize)?))
+        read_vec(&mut self.r, n_bytes as usize)
+    }
+
+    /// Reads a text record of `size * count` bytes with `read_text_record`
+    /// and wraps the bytes in a `TextRecord`.
+    fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
+        Ok(TextRecord::new(self.read_text_record(size, count)?))
      }
  }
  
@@ -996,8 +1169,11 @@ impl TextRecord {
          }
  
          let Some((start, end)) = self.get_n_bytes(length) else {
-            Err(anyhow!("{length}-byte string starting at offset {} exceeds record length {}",
-                        self.pos, self.buffer.len()))?
+            Err(anyhow!(
+                "{length}-byte string starting at offset {} exceeds record length {}",
+                self.pos,
+                self.buffer.len()
+            ))?
          };
          if !self.match_byte(b' ') {
              Err(anyhow!(
@@ -1008,4 +1184,12 @@ impl TextRecord {
          }
          Ok(&self.buffer[start..end])
      }
+
+    fn read_variable_to_value_pair(&mut self) -> Option<(Vec<u8>, Vec<u8>)> {
+        let key = self.tokenize(b'=')?.into();
+        let value = self.tokenize(b'\t')?.into();
+
+        while self.match_byte(b'\t') || self.match_byte(b'\0') {}
+        Some((key, value))
+    }
  }
author	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 23 Jul 2023 05:48:28 +0000 (22:48 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 23 Jul 2023 05:48:28 +0000 (22:48 -0700)
rust/src/hexfloat.rs	[new file with mode: 0644]	patch \| blob
rust/src/main.rs		patch \| blob \| history