work

author Ben Pfaff <blp@cs.stanford.edu>
Sun, 6 Aug 2023 19:08:19 +0000 (12:08 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 6 Aug 2023 19:08:19 +0000 (12:08 -0700)
diff --git a/rust/src/main.rs b/rust/src/main.rs

index 5da01dd0243f3a9efd32ffac42f01ccd771e79cb..0bb33bbbcc23db9d4a520403e37e883a2b0d0453 100644 (file)
--- a/rust/src/main.rs
+++ b/rust/src/main.rs
@@ -14,26 +14,14 @@
   * You should have received a copy of the GNU General Public License
   * along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  
-use anyhow::{anyhow, Result};
+use anyhow::{Result};
  use clap::Parser;
-use hexplay::HexView;
-use hexplay::HexViewBuilder;
-use num::Num;
-use std::cmp::Ordering;
-use std::collections::VecDeque;
-use std::fmt;
+use pspp::raw::Reader;
  use std::fs::File;
-use std::io::prelude::*;
  use std::io::BufReader;
-use std::io::ErrorKind;
  use std::path::{Path, PathBuf};
  use std::str;
  
-mod hexfloat;
-use hexfloat::HexFloat;
-
-const ID_MAX_LEN: u32 = 64;
-
  /// A utility to dissect SPSS system files.
  #[derive(Parser, Debug)]
  #[command(author, version, about, long_about = None)]
@@ -48,1314 +36,20 @@ struct Args {
  }
  
  fn main() -> Result<()> {
-    let Args { max_cases, files } = Args::parse();
+    let Args { files, .. } = Args::parse();
  
      for file in files {
-        Dissector::new(file, max_cases)?;
+        dissect(&file)?;
      }
      Ok(())
  }
  
-#[derive(Copy, Clone, Debug)]
-enum Compression {
-    Simple,
-    ZLib,
-}
-
-#[derive(Copy, Clone, Debug)]
-enum Endianness {
-    BigEndian,
-    LittleEndian,
-}
-use Endianness::*;
-
-trait Parse<T, const N: usize> {
-    fn parse(self, bytes: [u8; N]) -> T;
-}
-impl Parse<u64, 8> for Endianness {
-    fn parse(self, bytes: [u8; 8]) -> u64 {
-        match self {
-            BigEndian => u64::from_be_bytes(bytes),
-            LittleEndian => u64::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<u32, 4> for Endianness {
-    fn parse(self, bytes: [u8; 4]) -> u32 {
-        match self {
-            BigEndian => u32::from_be_bytes(bytes),
-            LittleEndian => u32::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<u16, 2> for Endianness {
-    fn parse(self, bytes: [u8; 2]) -> u16 {
-        match self {
-            BigEndian => u16::from_be_bytes(bytes),
-            LittleEndian => u16::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<u8, 1> for Endianness {
-    fn parse(self, bytes: [u8; 1]) -> u8 {
-        match self {
-            BigEndian => u8::from_be_bytes(bytes),
-            LittleEndian => u8::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<i64, 8> for Endianness {
-    fn parse(self, bytes: [u8; 8]) -> i64 {
-        match self {
-            BigEndian => i64::from_be_bytes(bytes),
-            LittleEndian => i64::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<i32, 4> for Endianness {
-    fn parse(self, bytes: [u8; 4]) -> i32 {
-        match self {
-            BigEndian => i32::from_be_bytes(bytes),
-            LittleEndian => i32::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<i16, 2> for Endianness {
-    fn parse(self, bytes: [u8; 2]) -> i16 {
-        match self {
-            BigEndian => i16::from_be_bytes(bytes),
-            LittleEndian => i16::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<i8, 1> for Endianness {
-    fn parse(self, bytes: [u8; 1]) -> i8 {
-        match self {
-            BigEndian => i8::from_be_bytes(bytes),
-            LittleEndian => i8::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<f64, 8> for Endianness {
-    fn parse(self, bytes: [u8; 8]) -> f64 {
-        match self {
-            BigEndian => f64::from_be_bytes(bytes),
-            LittleEndian => f64::from_le_bytes(bytes),
-        }
-    }
-}
-
-fn read_bytes<const N: usize>(r: &mut BufReader<File>) -> Result<[u8; N]> {
-    let mut buf = [0; N];
-    r.read_exact(&mut buf)?;
-    Ok(buf)
-}
-
-fn read_vec(r: &mut BufReader<File>, n: usize) -> Result<Vec<u8>> {
-    let mut vec = vec![0; n];
-    r.read_exact(&mut vec)?;
-    Ok(vec)
-}
-
-trait ReadSwap<T> {
-    fn read_swap(&mut self) -> Result<T>;
-}
-
-impl ReadSwap<u8> for Dissector {
-    fn read_swap(&mut self) -> Result<u8> {
-        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
-    }
-}
-impl ReadSwap<u32> for Dissector {
-    fn read_swap(&mut self) -> Result<u32> {
-        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
-    }
-}
-impl ReadSwap<u64> for Dissector {
-    fn read_swap(&mut self) -> Result<u64> {
-        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
-    }
-}
-
-impl ReadSwap<i32> for Dissector {
-    fn read_swap(&mut self) -> Result<i32> {
-        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
-    }
-}
-
-impl ReadSwap<f64> for Dissector {
-    fn read_swap(&mut self) -> Result<f64> {
-        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
-    }
-}
-
-struct Dissector {
-    filename: String,
-    r: BufReader<File>,
-    endianness: Endianness,
-    fp_format: Endianness,
-    bias: f64,
-    n_variable_records: usize,
-    n_variables: usize,
-    var_widths: Vec<i32>,
-}
-
-fn detect_endianness(layout_code: [u8; 4]) -> Option<Endianness> {
-    for endianness in [BigEndian, LittleEndian] {
-        match endianness.parse(layout_code) {
-            2 | 3 => return Some(endianness),
-            _ => (),
-        }
-    }
-    None
-}
-
-fn detect_fp_format(bias: [u8; 8]) -> Option<Endianness> {
-    for endianness in [BigEndian, LittleEndian] {
-        let value: f64 = endianness.parse(bias);
-        if value == 100.0 {
-            return Some(endianness);
-        }
-    }
-    None
-}
-
-fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
-    while s.last() == Some(&c) {
-        s.pop();
-    }
-    s
-}
-
-fn format_name(type_: u32) -> &'static str {
-    match type_ {
-        1 => "A",
-        2 => "AHEX",
-        3 => "COMMA",
-        4 => "DOLLAR",
-        5 => "F",
-        6 => "IB",
-        7 => "PIBHEX",
-        8 => "P",
-        9 => "PIB",
-        10 => "PK",
-        11 => "RB",
-        12 => "RBHEX",
-        15 => "Z",
-        16 => "N",
-        17 => "E",
-        20 => "DATE",
-        21 => "TIME",
-        22 => "DATETIME",
-        23 => "ADATE",
-        24 => "JDATE",
-        25 => "DTIME",
-        26 => "WKDAY",
-        27 => "MONTH",
-        28 => "MOYR",
-        29 => "QYR",
-        30 => "WKYR",
-        31 => "PCT",
-        32 => "DOT",
-        33 => "CCA",
-        34 => "CCB",
-        35 => "CCC",
-        36 => "CCD",
-        37 => "CCE",
-        38 => "EDATE",
-        39 => "SDATE",
-        40 => "MTIME",
-        41 => "YMDHMS",
-        _ => "invalid",
-    }
-}
-
-fn round_up<T: Num + Copy>(x: T, y: T) -> T {
-    (x + (y - T::one())) / y * y
-}
-
-struct UntypedValue {
-    raw: [u8; 8],
-    endianness: Endianness,
-}
-
-impl UntypedValue {
-    fn new(raw: [u8; 8], endianness: Endianness) -> UntypedValue {
-        UntypedValue { raw, endianness }
-    }
-}
-
-impl fmt::Display for UntypedValue {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let numeric: f64 = self.endianness.parse(self.raw);
-        let n_printable = self
-            .raw
-            .iter()
-            .take_while(|&&x| x == b' ' || x.is_ascii_graphic())
-            .count();
-        let printable_prefix = std::str::from_utf8(&self.raw[0..n_printable]).unwrap();
-        write!(f, "{numeric}/\"{printable_prefix}\"")
-    }
-}
-
-impl Dissector {
-    fn new<P: AsRef<Path>>(filename: P, max_cases: usize) -> Result<Dissector> {
-        let mut r = BufReader::new(File::open(&filename)?);
-        let filename = filename.as_ref().to_string_lossy().into_owned();
-        let rec_type: [u8; 4] = read_bytes(&mut r)?;
-        let zmagic = match &rec_type {
-            b"$FL2" => false,
-            b"$FL3" => true,
-            _ => Err(anyhow!("This is not an SPSS system file."))?,
-        };
-
-        let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
-        let layout_code: [u8; 4] = read_bytes(&mut r)?;
-        let endianness = detect_endianness(layout_code)
-            .ok_or_else(|| anyhow!("This is not an SPSS system file."))?;
-        let layout_code: u32 = endianness.parse(layout_code);
-        let _nominal_case_size: [u8; 4] = read_bytes(&mut r)?;
-        let compressed: u32 = endianness.parse(read_bytes(&mut r)?);
-        let compression = match (zmagic, compressed) {
-            (false, 0) => None,
-            (false, 1) => Some(Compression::Simple),
-            (true, 2) => Some(Compression::ZLib),
-            _ => Err(anyhow!(
-                "{} file header has invalid compression value {compressed}.",
-                if zmagic { "ZSAV" } else { "SAV" }
-            ))?,
-        };
-
-        let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
-        let n_cases: u32 = endianness.parse(read_bytes(&mut r)?);
-
-        let bias: [u8; 8] = read_bytes(&mut r)?;
-        let fp_format = detect_fp_format(bias)
-            .unwrap_or_else(|| { eprintln!("Compression bias is not the usual value of 100, or system file uses unrecognized floating-point format."); endianness });
-        let bias: f64 = fp_format.parse(bias);
-
-        let mut d = Dissector {
-            filename,
-            r,
-            endianness,
-            fp_format,
-            bias,
-            n_variable_records: 0,
-            n_variables: 0,
-            var_widths: Vec::new(),
-        };
-
-        let creation_date: [u8; 9] = read_bytes(&mut d.r)?;
-        let creation_time: [u8; 8] = read_bytes(&mut d.r)?;
-        let file_label: [u8; 64] = read_bytes(&mut d.r)?;
-        let file_label = trim_end(Vec::from(file_label), b' ');
-        d.skip_bytes(3)?;
-
-        println!("File header record:");
-        println!(
-            "{:>17}: {}",
-            "Product name",
-            String::from_utf8_lossy(&eye_catcher)
-        );
-        println!("{:>17}: {}", "Layout code", layout_code);
-        println!(
-            "{:>17}: {} ({})",
-            "Compressed",
-            compressed,
-            match compression {
-                None => "no compression",
-                Some(Compression::Simple) => "simple compression",
-                Some(Compression::ZLib) => "ZLIB compression",
-            }
-        );
-        println!("{:>17}: {}", "Weight index", weight_index);
-        println!("{:>17}: {}", "Number of cases", n_cases);
-        println!("{:>17}: {}", "Compression bias", bias);
-        println!(
-            "{:>17}: {}",
-            "Creation date",
-            String::from_utf8_lossy(&creation_date)
-        );
-        println!(
-            "{:>17}: {}",
-            "Creation time",
-            String::from_utf8_lossy(&creation_time)
-        );
-        println!(
-            "{:>17}: \"{}\"",
-            "File label",
-            String::from_utf8_lossy(&file_label)
-        );
-
-        loop {
-            let rec_type: u32 = d.read_swap()?;
-            match rec_type {
-                2 => d.read_variable_record()?,
-                3 => d.read_value_label_record()?,
-                4 => Err(anyhow!("Misplaced type 4 record."))?,
-                6 => d.read_document_record()?,
-                7 => d.read_extension_record()?,
-                999 => break,
-                _ => Err(anyhow!("Unrecognized record type {rec_type}."))?,
-            }
-        }
-
-        let pos = d.r.stream_position()?;
-        println!(
-            "{:08x}: end-of-dictionary record (first byte of data at {:0x})",
-            pos,
-            pos + 4
-        );
-
-        match compression {
-            Some(Compression::Simple) => {
-                if max_cases > 0 {
-                    d.read_simple_compressed_data(max_cases)?;
-                }
-            }
-            Some(Compression::ZLib) => d.read_zlib_compressed_data()?,
-            None => (),
-        }
-
-        Ok(d)
-    }
-
-    fn read_simple_compressed_data(&mut self, max_cases: usize) -> Result<()> {
-        let _: i32 = self.read_swap()?;
-        println!("\n{:08x}: compressed data:", self.r.stream_position()?);
-
-        const N_OPCODES: usize = 8;
-        let mut opcodes = VecDeque::<u8>::with_capacity(8);
-        let mut opcode_ofs = 0;
-        for case_num in 0..max_cases {
-            println!(
-                "{:08x}: case {case_num}'s uncompressible data begins",
-                self.r.stream_position()?
-            );
-            let mut i = 0;
-            while i < self.var_widths.len() {
-                let width = self.var_widths[i];
-
-                let opcode_idx = N_OPCODES - opcodes.len();
-                let Some(opcode) = opcodes.pop_back() else {
-                    opcode_ofs = self.r.stream_position()?;
-                    let mut new_opcodes = [0; N_OPCODES];
-                    if let Err(error) = self.r.read_exact(&mut new_opcodes) {
-                        if i == 0 && error.kind() == ErrorKind::UnexpectedEof {
-                            return Ok(());
-                        } else {
-                            return Err(error.into());
-                        }
-                    };
-                    opcodes.extend(new_opcodes.into_iter());
-                    continue;
-                };
-
-                print!(
-                    "{:08x}: variable {i}: opcode {opcode}: ",
-                    opcode_ofs + opcode_idx as u64
-                );
-                match opcode {
-                    0 => println!("ignored padding"),
-                    252 => {
-                        println!("end of data");
-                        break;
-                    }
-                    253 => {
-                        let raw: [u8; 8] = read_bytes(&mut self.r)?;
-                        let value = UntypedValue::new(raw, self.fp_format);
-                        println!("uncompressible data: {value}");
-                        i += 1;
-                    }
-                    254 => {
-                        print!("spaces");
-                        if width == 0 {
-                            print!(", but this is a numeric variable");
-                        }
-                        println!();
-                        i += 1;
-                    }
-                    255 => {
-                        print!("SYSMIS");
-                        if width != 0 {
-                            print!(", but this is a string variable (width={width})");
-                        }
-                        println!();
-                        i += 1;
-                    }
-                    _ => {
-                        print!("{}", opcode as f64 - self.bias);
-                        if width != 0 {
-                            print!(", but this is a string variable (width={width})");
-                        }
-                        println!();
-                        i += 1;
-                    }
-                }
-            }
-        }
-        Ok(())
-    }
-
-    fn read_zlib_compressed_data(&mut self) -> Result<()> {
-        let _: i32 = self.read_swap()?;
-        let ofs = self.r.stream_position()?;
-        println!("\n{ofs:08x}: ZLIB compressed data header:");
-
-        let this_ofs: u64 = self.read_swap()?;
-        let next_ofs: u64 = self.read_swap()?;
-        let next_len: u64 = self.read_swap()?;
-
-        println!("\theader_ofs: {this_ofs:#x}");
-        if this_ofs != ofs {
-            println!("\t\t(Expected {ofs:#x}.)");
-        }
-        println!("\ttrailer_ofs: {next_ofs:#x}");
-        println!("\ttrailer_len: {next_len}");
-        if next_len < 24 || next_len % 24 != 0 {
-            println!("\t\t(Trailer length is not positive multiple of 24.)");
-        }
-
-        let zlib_data_len = next_ofs - (ofs + 8 * 3);
-        println!(
-            "\n{:08x}: {zlib_data_len:#x} bytes of ZLIB compressed data",
-            ofs + 8 * 3
-        );
-
-        self.skip_bytes(zlib_data_len)?;
-
-        println!("\n{next_ofs:08x}: ZLIB trailer fixed header");
-        let bias: u64 = self.read_swap()?;
-        let zero: u64 = self.read_swap()?;
-        let block_size: u32 = self.read_swap()?;
-        let n_blocks: u32 = self.read_swap()?;
-        println!("\tbias: {bias}");
-        println!("\tzero: {zero:#x}");
-        if zero != 0 {
-            println!("\t\t(Expected 0.)");
-        }
-        println!("\tblock size: {block_size:#x}");
-        if block_size != 0x3ff000 {
-            println!("\t\t(Expected 0x3ff000.)");
-        }
-        println!("\tn_blocks: {n_blocks}");
-        if n_blocks as u64 != next_len / 24 - 1 {
-            println!("\t\t(Expected {}.)", next_len / 24 - 1);
-        }
-
-        let mut expected_uncmp_ofs = ofs;
-        let mut expected_cmp_ofs = ofs + 24;
-        for i in 1..=n_blocks {
-            let blockinfo_ofs = self.r.stream_position()?;
-            let uncompressed_ofs: u64 = self.read_swap()?;
-            let compressed_ofs: u64 = self.read_swap()?;
-            let uncompressed_size: u32 = self.read_swap()?;
-            let compressed_size: u32 = self.read_swap()?;
-
-            println!("\n{blockinfo_ofs:08x}: ZLIB block descriptor {i}");
-
-            println!("\tuncompressed_ofs: {uncompressed_ofs:#x}");
-            if uncompressed_ofs != expected_uncmp_ofs {
-                println!("\t\t(Expected {ofs:#x}.)");
-            }
-
-            println!("\tcompressed_ofs: {compressed_ofs:#x}");
-            if compressed_ofs != expected_cmp_ofs {
-                println!("\t\t(Expected {expected_cmp_ofs:#x}.)");
-            }
-
-            println!("\tuncompressed_size: {uncompressed_size:#x}");
-            if i < n_blocks && uncompressed_size != block_size {
-                println!("\t\t(Expected {block_size:#x}.)");
-            }
-
-            println!("\tcompressed_size: {compressed_size:#x}");
-            if i == n_blocks && compressed_ofs.checked_add(compressed_size as u64) != Some(next_ofs)
-            {
-                println!(
-                    "\t\t(This was expected to be {:#x}.)",
-                    next_ofs - compressed_size as u64
-                );
-            }
-
-            expected_uncmp_ofs += uncompressed_size as u64;
-            expected_cmp_ofs += uncompressed_size as u64;
-        }
-        Ok(())
-    }
-
-    fn read_extension_record(&mut self) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        let subtype: u32 = self.read_swap()?;
-        let size: u32 = self.read_swap()?;
-        let count: u32 = self.read_swap()?;
-        println!("{offset:08x}: Record 7, subtype {subtype}, size={size}, count={count}");
-        if size.checked_mul(count).is_none() {
-            Err(anyhow!("{size} * {count} exceeds {}", u32::MAX))?
-        }
-        match subtype {
-            3 => self.read_machine_integer_info(size, count),
-            4 => self.read_machine_float_info(size, count),
-            5 => self.read_variable_sets(size, count),
-            6 => {
-                // DATE variable information.  We don't use it yet, but we should.
-                Ok(())
-            }
-            7 | 19 => self.read_mrsets(size, count),
-            10 => self.read_extra_product_info(size, count),
-            11 => self.read_display_parameters(size, count),
-            13 => self.read_long_var_name_map(size, count),
-            14 => self.read_long_string_map(size, count),
-            16 => self.read_ncases64(size, count),
-            17 => self.read_datafile_attributes(size, count),
-            18 => self.read_variable_attributes(size, count),
-            20 => self.read_character_encoding(size, count),
-            21 => self.read_long_string_value_labels(size, count),
-            22 => self.read_long_string_missing_values(size, count),
-            _ => self.read_unknown_extension(subtype, size, count),
-        }
-    }
-
-    fn warn(&mut self, s: String) -> Result<()> {
-        println!(
-            "\"{}\" near offset 0x{:08x}: {s}",
-            self.filename,
-            self.r.stream_position()?
-        );
-        Ok(())
-    }
-
-    fn skip_bytes(&mut self, mut n: u64) -> Result<()> {
-        let mut buf = [0; 1024];
-        while n > 0 {
-            let chunk = u64::min(n, buf.len() as u64);
-            self.r.read_exact(&mut buf[0..chunk as usize])?;
-            n -= chunk;
-        }
-        Ok(())
-    }
-
-    fn read_unknown_extension(&mut self, subtype: u32, size: u32, count: u32) -> Result<()> {
-        self.warn(format!("Unrecognized record type 7, subtype {subtype}."))?;
-        if size == 0 || count > 65536 / size {
-            self.skip_bytes(size as u64 * count as u64)?;
-        } else if size != 1 {
-            let mut offset = 0;
-            for _ in 0..count {
-                let vec = read_vec(&mut self.r, size as usize)?;
-                println!(
-                    "{}",
-                    HexViewBuilder::new(&vec).address_offset(offset).finish()
-                );
-                offset += size as usize;
-            }
-        }
-        Ok(())
-    }
-
-    fn read_variable_record(&mut self) -> Result<()> {
-        self.n_variable_records += 1;
-        println!(
-            "{:08x}: variable record {}",
-            self.r.stream_position()?,
-            self.n_variable_records
-        );
-        let width: i32 = self.read_swap()?;
-        let has_variable_label: u32 = self.read_swap()?;
-        let missing_value_code: i32 = self.read_swap()?;
-        let print_format: u32 = self.read_swap()?;
-        let write_format: u32 = self.read_swap()?;
-        let name: [u8; 8] = read_bytes(&mut self.r)?;
-        let name: Vec<u8> = trim_end(Vec::from(name), b'\0');
-
-        if width >= 0 {
-            self.n_variables += 1;
-        }
-        self.var_widths.push(width);
-
-        println!(
-            "\tWidth: {width} ({})",
-            match width {
-                _ if width > 0 => "string",
-                _ if width == 0 => "numeric",
-                _ => "long string continuation record",
-            }
-        );
-
-        println!("\tVariable label: {has_variable_label}");
-        println!(
-            "\tMissing values code: {missing_value_code} ({})",
-            match missing_value_code {
-                0 => "no missing values",
-                1 => "one missing value",
-                2 => "two missing values",
-                3 => "three missing values",
-                -2 => "one missing value range",
-                -3 => "one missing value, one range",
-                _ => "bad value",
-            }
-        );
-        for (which, format) in [("Print", print_format), ("Worite", write_format)] {
-            let type_ = format_name(format >> 16);
-            let w = (format >> 8) & 0xff;
-            let d = format & 0xff;
-            println!("\t{which} format: {format:06x} ({type_}{w}.{d})");
-        }
-        println!("\tName: {}", String::from_utf8_lossy(&name));
-
-        // Read variable label.
-        match has_variable_label {
-            0 => (),
-            1 => {
-                let offset = self.r.stream_position()?;
-                let len: u32 = self.read_swap()?;
-                let read_len = len.min(65535) as usize;
-                let label = read_vec(&mut self.r, read_len)?;
-                println!(
-                    "\t{offset:08x} Variable label: \"{}\"",
-                    String::from_utf8_lossy(&label)
-                );
-
-                self.skip_bytes((round_up(len, 4) - len).into())?;
-            }
-            _ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?,
-        };
-
-        // Read missing values.
-        if missing_value_code != 0 {
-            print!("\t{:08x} Missing values:", self.r.stream_position()?);
-            match width.cmp(&0) {
-                Ordering::Equal => {
-                    let (has_range, n_individual) = match missing_value_code {
-                        -3 => (true, 1),
-                        -2 => (true, 0),
-                        1 | 2 | 3 => (false, missing_value_code),
-                        _ => Err(anyhow!(
-                            "Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."
-                        ))?,
-                    };
-                    if has_range {
-                        let low: f64 = self.read_swap()?;
-                        let high: f64 = self.read_swap()?;
-                        print!(" {low}...{high}");
-                    }
-                    for _ in 0..n_individual {
-                        let value: f64 = self.read_swap()?;
-                        print!(" {value}");
-                    }
-                }
-                Ordering::Greater => {
-                    if !(0..=3).contains(&missing_value_code) {
-                        Err(anyhow!(
-                            "String missing value indicator field is not 0, 1, 2, or 3."
-                        ))?;
-                    }
-                    for _ in 0..missing_value_code {
-                        let string: [u8; 8] = read_bytes(&mut self.r)?;
-                        let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
-                        println!(" {}", String::from_utf8_lossy(&string));
-                    }
-                }
-                Ordering::Less => (),
-            }
-            println!();
-        }
-
-        Ok(())
-    }
-
-    fn read_value_label_record(&mut self) -> Result<()> {
-        println!("{:08x}: value labels record", self.r.stream_position()?);
-
-        // Read the labels.
-        let n_labels: u32 = self.read_swap()?;
-        for _ in 0..n_labels {
-            let raw: [u8; 8] = read_bytes(&mut self.r)?;
-            let value = UntypedValue::new(raw, self.fp_format);
-            let label_len: u8 = self.read_swap()?;
-            let padded_len = round_up(label_len as usize + 1, 8);
-
-            let mut label = read_vec(&mut self.r, padded_len)?;
-            label.truncate(label_len as usize);
-            let label = String::from_utf8_lossy(&label);
-
-            println!("\t{value}: {label}");
-        }
-
-        // Read the type-4 record with the corresponding variable indexes.
-        let rec_type: u32 = self.read_swap()?;
-        if rec_type != 4 {
-            Err(anyhow!(
-                "Variable index record (type 4) does not immediately \
-                         follow value label record (type 3) as it should."
-            ))?;
-        }
-
-        println!("\t{:08x}: apply to variables", self.r.stream_position()?);
-        let n_vars: u32 = self.read_swap()?;
-        for _ in 0..n_vars {
-            let index: u32 = self.read_swap()?;
-            print!(" {index}");
-        }
-        println!();
-
-        Ok(())
-    }
-
-    fn read_document_record(&mut self) -> Result<()> {
-        println!("{:08x}: document record", self.r.stream_position()?);
-        let n_lines: u32 = self.read_swap()?;
-        println!("\t{n_lines} lines of documents");
-
-        for i in 0..n_lines {
-            print!("\t{:08x}: ", self.r.stream_position()?);
-            let line: [u8; 64] = read_bytes(&mut self.r)?;
-            let line = trim_end(Vec::from(line), b' ');
-            println!("line {i}: \"{}\"", String::from_utf8_lossy(&line));
-        }
-        Ok(())
-    }
-
-    fn read_machine_integer_info(&mut self, size: u32, count: u32) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        let version_major: u32 = self.read_swap()?;
-        let version_minor: u32 = self.read_swap()?;
-        let version_revision: u32 = self.read_swap()?;
-        let machine_code: u32 = self.read_swap()?;
-        let float_representation: u32 = self.read_swap()?;
-        let compression_code: u32 = self.read_swap()?;
-        let integer_representation: u32 = self.read_swap()?;
-        let character_code: u32 = self.read_swap()?;
-
-        println!("{offset:08x}: machine integer info");
-        if size != 4 || count != 8 {
-            Err(anyhow!(
-                "Bad size ({size}) or count ({count}) field on record type 7, subtype 3"
-            ))?;
-        }
-        println!("\tVersion: {version_major}.{version_minor}.{version_revision}");
-        println!("\tMachine code: {machine_code}");
-        println!(
-            "\tFloating point representation: {float_representation} ({})",
-            match float_representation {
-                1 => "IEEE 754",
-                2 => "IBM 370",
-                3 => "DEC VAX",
-                _ => "unknown",
-            }
-        );
-        println!("\tCompression code: {compression_code}");
-        println!(
-            "\tEndianness: {integer_representation} ({})",
-            match integer_representation {
-                1 => "big",
-                2 => "little",
-                _ => "unknown",
-            }
-        );
-        println!("\tCharacter code: {character_code}");
-        Ok(())
-    }
-
-    fn read_machine_float_info(&mut self, size: u32, count: u32) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        let sysmis: f64 = self.read_swap()?;
-        let highest: f64 = self.read_swap()?;
-        let lowest: f64 = self.read_swap()?;
-
-        println!("{offset:08x}: machine float info");
-        if size != 4 || count != 8 {
-            Err(anyhow!(
-                "Bad size ({size}) or count ({count}) field on extension 4."
-            ))?;
-        }
-
-        println!("\tsysmis: {sysmis} ({})", HexFloat(sysmis));
-        println!("\thighest: {highest} ({})", HexFloat(highest));
-        println!("\tlowest: {lowest} ({})", HexFloat(lowest));
-        Ok(())
-    }
-
-    fn read_variable_sets(&mut self, size: u32, count: u32) -> Result<()> {
-        println!("{:08x}: variable sets", self.r.stream_position()?);
-        let mut text = self.open_text_record(size, count)?;
-        loop {
-            while text.match_byte(b'\n') {
-                continue;
-            }
-            let set = match text.tokenize(b'=') {
-                Some(set) => String::from_utf8_lossy(set).into_owned(),
-                None => break,
-            };
-
-            // Always present even for an empty set.
-            text.match_byte(b' ');
-
-            match text.tokenize(b'\n') {
-                None => println!("\tset \"{set}\" is empty"),
-                Some(variables) => {
-                    println!(
-                        "\tset \"{set}\" contains \"{}\"",
-                        String::from_utf8_lossy(variables).trim_end_matches('\r')
-                    );
-                }
-            };
-        }
-        Ok(())
-    }
-
-    // Read record type 7, subtype 7.
-    fn read_mrsets(&mut self, size: u32, count: u32) -> Result<()> {
-        print!("{:08x}: multiple response sets", self.r.stream_position()?);
-        let mut text = self.open_text_record(size, count)?;
-        loop {
-            #[derive(PartialEq, Eq)]
-            enum MrSet {
-                MC,
-                MD,
-            }
-
-            while text.match_byte(b'\n') {}
-            let Some(name) = text.tokenize(b'=') else {
-                break;
-            };
-            let name = Vec::from(name);
-
-            let (mrset, cat_label_from_counted_values, label_from_var_label) = if text
-                .match_byte(b'C')
-            {
-                if !text.match_byte(b' ') {
-                    Err(anyhow!(
-                        "missing space following 'C' at offset {} in mrsets record",
-                        text.pos
-                    ))?;
-                }
-                (MrSet::MC, false, false)
-            } else if text.match_byte(b'D') {
-                (MrSet::MD, false, false)
-            } else if text.match_byte(b'E') {
-                if !text.match_byte(b' ') {
-                    Err(anyhow!(
-                        "missing space following 'E' at offset {} in mrsets record",
-                        text.pos
-                    ))?;
-                }
-
-                let pos = text.pos;
-                let Some(number) = text.tokenize(b' ') else {
-                    Err(anyhow!(
-                        "Missing label source value following `E' at offset {}u in MRSETS record",
-                        text.pos
-                    ))?
-                };
-
-                let label_from_var_label = if number == b"11" {
-                    true
-                } else if number == b"1" {
-                    false
-                } else {
-                    Err(anyhow!("Unexpected label source value `{}' following `E' at offset {pos} in MRSETS record", String::from_utf8_lossy(number)))?
-                };
-                (MrSet::MD, true, label_from_var_label)
-            } else {
-                Err(anyhow!(
-                    "missing `C', `D', or `E' at offset {} in mrsets record",
-                    text.pos
-                ))?
-            };
-
-            let counted_value = if mrset == MrSet::MD {
-                Some(Vec::from(text.parse_counted_string()?))
-            } else {
-                None
-            };
-
-            let label = Vec::from(text.parse_counted_string()?);
-
-            let variables = text.tokenize(b'\n');
-
-            print!(
-                "\t\"{}\": multiple {} set",
-                String::from_utf8_lossy(&name),
-                if mrset == MrSet::MC {
-                    "category"
-                } else {
-                    "dichotomy"
-                }
-            );
-            if let Some(counted_value) = counted_value {
-                print!(
-                    ", counted value \"{}\"",
-                    String::from_utf8_lossy(&counted_value)
-                );
-            }
-            if cat_label_from_counted_values {
-                println!(", category labels from counted values");
-            }
-            if label != b"" {
-                print!(", label \"{}\"", String::from_utf8_lossy(&label));
-            }
-            if label_from_var_label {
-                print!(", label from variable label");
-            }
-            if let Some(variables) = variables {
-                print!(", variables \"{}\"", String::from_utf8_lossy(variables));
-            } else {
-                print!("no variables");
-            }
-            println!();
-        }
-        Ok(())
-    }
-
-    fn read_extra_product_info(&mut self, size: u32, count: u32) -> Result<()> {
-        print!("{:08x}: extra product info", self.r.stream_position()?);
-        let text = self.open_text_record(size, count)?;
-        print_string(&text.buffer);
-        Ok(())
-    }
-
-    fn read_display_parameters(&mut self, size: u32, count: u32) -> Result<()> {
-        println!(
-            "{:08x}: variable display parameters",
-            self.r.stream_position()?
-        );
-        if size != 4 {
-            Err(anyhow!("Bad size ({size}) on extension 11."))?;
-        }
-        let n_vars = self.n_variables;
-        let includes_width = if count as usize == 3 * n_vars {
-            true
-        } else if count as usize == 2 * n_vars {
-            false
-        } else {
-            Err(anyhow!(
-                "Extension 11 has bad count {count} (for {n_vars} variables)."
-            ))?
-        };
-
-        for i in 0..n_vars {
-            let measure: u32 = self.read_swap()?;
-            print!(
-                "\tVar #{i}: measure={measure} ({})",
-                match measure {
-                    1 => "nominal",
-                    2 => "ordinal",
-                    3 => "scale",
-                    _ => "invalid",
-                }
-            );
-
-            if includes_width {
-                let width: u32 = self.read_swap()?;
-                print!(", width={width}");
-            }
-
-            let align: u32 = self.read_swap()?;
-            println!(
-                ", align={align} ({})",
-                match align {
-                    0 => "left",
-                    1 => "right",
-                    2 => "centre",
-                    _ => "invalid",
-                }
-            );
-        }
-        Ok(())
-    }
-
-    fn read_long_var_name_map(&mut self, size: u32, count: u32) -> Result<()> {
-        print!(
-            "{:08x}: long variable names (short => long)",
-            self.r.stream_position()?
-        );
-        let mut text = self.open_text_record(size, count)?;
-        while let Some((var, long_name)) = text.read_variable_to_value_pair() {
-            println!(
-                "\t{} => {}",
-                String::from_utf8_lossy(&var),
-                String::from_utf8_lossy(&long_name)
-            );
-        }
-        Ok(())
-    }
-
-    fn read_long_string_map(&mut self, size: u32, count: u32) -> Result<()> {
-        print!(
-            "{:08x}: very long strings (variable => length)",
-            self.r.stream_position()?
-        );
-        let mut text = self.open_text_record(size, count)?;
-        while let Some((var, length)) = text.read_variable_to_value_pair() {
-            println!(
-                "\t{} => {}",
-                String::from_utf8_lossy(&var),
-                String::from_utf8_lossy(&length)
-            );
-        }
-        Ok(())
-    }
-
-    fn read_ncases64(&mut self, size: u32, count: u32) -> Result<()> {
-        if size != 8 {
-            Err(anyhow!("Bad size {size} for extended number of cases."))?
-        }
-        if count != 2 {
-            Err(anyhow!("Bad count {count} for extended number of cases."))?
-        }
-        let unknown: u64 = self.read_swap()?;
-        let ncases64: u64 = self.read_swap()?;
-        print!(
-            "{:08x}: extended number of cases: unknown={unknown}, ncases64={ncases64}",
-            self.r.stream_position()?
-        );
-        Ok(())
-    }
-
-    fn read_attributes(&mut self, text: &mut TextRecord, variable: &str) -> Result<()> {
-        loop {
-            let Some(key) = text.tokenize_string(b'(') else {
-                break;
-            };
-            for index in 1.. {
-                let Some(value) = text.tokenize_string(b'\n') else {
-                    Err(anyhow!(
-                        "{variable}: Error parsing attribute value {key}[{index}]"
-                    ))?
-                };
-                if value.starts_with('\'') && value.ends_with('\'') && value.len() >= 2 {
-                    let middle = &value[1..value.len() - 2];
-                    println!("\t{variable}: {key}[{index}] = \"{middle}\"");
-                } else {
-                    self.warn(format!(
-                        "{variable}: Attribute value {key}[{index}] is not quoted: {value}"
-                    ))?;
-                }
-                if text.match_byte(b')') {
-                    break;
-                }
-            }
-
-            if text.match_byte(b'/') {
-                break;
-            }
-        }
-        Ok(())
-    }
-
-    fn read_datafile_attributes(&mut self, size: u32, count: u32) -> Result<()> {
-        print!("{:08x}: datafile attributes", self.r.stream_position()?);
-        let mut text = self.open_text_record(size, count)?;
-        self.read_attributes(&mut text, "datafile")?;
-        Ok(())
-    }
-
-    fn read_variable_attributes(&mut self, size: u32, count: u32) -> Result<()> {
-        print!("{:08x}: variable attributes", self.r.stream_position()?);
-        let mut text = self.open_text_record(size, count)?;
-        loop {
-            let Some(variable) = text.tokenize_string(b':') else {
-                break;
-            };
-            self.read_attributes(&mut text, &variable)?;
-        }
-        Ok(())
-    }
-
-    fn read_character_encoding(&mut self, size: u32, count: u32) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        let encoding = read_vec(&mut self.r, (size * count) as usize)?;
-        println!("{offset:08x}: Character Encoding: {}", String::from_utf8_lossy(&encoding));
-        Ok(())
-    }
-
-    fn read_long_string_value_labels(&mut self, size: u32, count: u32) -> Result<()> {
-        let start = self.r.stream_position()?;
-
-        println!("{start:08x}: long string value labels");
-        while self.r.stream_position()? - start < (size * count) as u64 {
-            let position = self.r.stream_position()?;
-
-            let var_name_len: u32 = self.read_swap()?;
-            if var_name_len > ID_MAX_LEN {
-                Err(anyhow!("Variable name length in long string value label record ({var_name_len} exceeds {ID_MAX_LEN}-byte limit."))?
-            }
-            let var_name = read_vec(&mut self.r, var_name_len as usize)?;
-
-            let width: u32 = self.read_swap()?;
-            let n_values: u32 = self.read_swap()?;
-
-            println!("\t{position:08x}: {}, width {width}, {n_values} values",
-                     String::from_utf8_lossy(&var_name));
-
-            for _ in 0..n_values {
-                let position = self.r.stream_position()?;
-                let value_length: u32 = self.read_swap()?;
-                let value = read_vec(&mut self.r, value_length as usize)?;
-                let label_length: u32 = self.read_swap()?;
-                let label = read_vec(&mut self.r, value_length as usize)?;
-                println!("\t\t{position:08x}: \"{}\" ({value_length} bytes) => \"{}\" ({label_length} bytes)",
-                         String::from_utf8_lossy(&value),
-                         String::from_utf8_lossy(&label));
-            }
-        }
-        Ok(())
-    }
-
-    fn read_long_string_missing_values(&mut self, size: u32, count: u32) -> Result<()> {
-        let start = self.r.stream_position()?;
-
-        println!("{start:08x}: long string missing values");
-        while self.r.stream_position()? - start < (size * count) as u64 {
-            let position = self.r.stream_position()?;
-
-            let var_name_len: u32 = self.read_swap()?;
-            if var_name_len > ID_MAX_LEN {
-                Err(anyhow!("Variable name length in long string missing value record ({var_name_len} exceeds {ID_MAX_LEN}-byte limit."))?
-            }
-            let var_name = read_vec(&mut self.r, var_name_len as usize)?;
-
-            let n_missing_values: u8 = self.read_swap()?;
-            let value_length: u32 = self.read_swap()?;
-
-            println!("\t{position:08x}: {}, {n_missing_values}, each {value_length} bytes:",
-                     String::from_utf8_lossy(&var_name));
-
-            for _ in 0..n_missing_values {
-                let value = read_vec(&mut self.r, value_length as usize)?;
-                println!(" \"{}\"", String::from_utf8_lossy(&value));
-            }
-        }
-        Ok(())
-    }
-
-    fn read_text_record(&mut self, size: u32, count: u32) -> Result<Vec<u8>> {
-        let Some(n_bytes) = u32::checked_mul(size, count) else {
-            Err(anyhow!("Extension record too large."))?
-        };
-        read_vec(&mut self.r, n_bytes as usize)
-    }
-
-    fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
-        Ok(TextRecord::new(self.read_text_record(size, count)?))
-    }
-}
-
-fn print_string(s: &[u8]) {
-    if s.contains(&b'\0') {
-        println!("{}", HexView::new(s));
-    } else {
-        for &c in s {
-            match c {
-                b'\\' => print!("\\\\"),
-                b'\n' => println!(),
-                c if (b' '..=b'~').contains(&c) => print!("{}", c as char),
-                c => print!("\\{:2x}", c),
-            }
-        }
-    }
-}
-
-struct TextRecord {
-    buffer: Vec<u8>,
-    pos: usize,
-}
-
-impl TextRecord {
-    fn new(buffer: Vec<u8>) -> TextRecord {
-        TextRecord { buffer, pos: 0 }
-    }
-
-    fn tokenize(&mut self, delimiter: u8) -> Option<&[u8]> {
-        let start = self.pos;
-        while self.pos < self.buffer.len()
-            && self.buffer[self.pos] != delimiter
-            && self.buffer[self.pos] != 0
-        {
-            self.pos += 1
-        }
-        if start == self.pos {
-            None
-        } else {
-            Some(&self.buffer[start..self.pos])
-        }
-    }
-
-    fn tokenize_string(&mut self, delimiter: u8) -> Option<String> {
-        self.tokenize(delimiter)
-            .map(|s| String::from_utf8_lossy(s).into_owned())
-    }
-
-    fn match_byte(&mut self, c: u8) -> bool {
-        if self.pos < self.buffer.len() && self.buffer[self.pos] == c {
-            self.pos += 1;
-            true
-        } else {
-            false
-        }
-    }
-
-    fn parse_usize(&mut self) -> Result<usize> {
-        let n_digits = self.buffer[self.pos..]
-            .iter()
-            .take_while(|c| c.is_ascii_digit())
-            .count();
-        if n_digits == 0 {
-            Err(anyhow!("expecting digit at offset {} in record", self.pos))?;
-        }
-        let start = self.pos;
-        self.pos += n_digits;
-        let end = self.pos;
-        let digits = str::from_utf8(&self.buffer[start..end]).unwrap();
-        let Ok(number) = digits.parse::<usize>() else {
-            Err(anyhow!(
-                "expecting number in [0,{}] at offset {} in record",
-                usize::MAX,
-                self.pos
-            ))?
-        };
-        self.pos = end;
-        Ok(number)
-    }
-
-    fn get_n_bytes(&mut self, n: usize) -> Option<(usize, usize)> {
-        let start = self.pos;
-        let Some(end) = start.checked_add(n) else {
-            return None;
-        };
-        self.pos = end;
-        Some((start, end))
-    }
-
-    fn parse_counted_string(&mut self) -> Result<&[u8]> {
-        let length = self.parse_usize()?;
-        if !self.match_byte(b' ') {
-            Err(anyhow!("expecting space at offset {} in record", self.pos))?;
-        }
-
-        let Some((start, end)) = self.get_n_bytes(length) else {
-            Err(anyhow!(
-                "{length}-byte string starting at offset {} exceeds record length {}",
-                self.pos,
-                self.buffer.len()
-            ))?
-        };
-        if !self.match_byte(b' ') {
-            Err(anyhow!(
-                "expecting space at offset {} following {}-byte string",
-                self.pos,
-                end - start
-            ))?;
-        }
-        Ok(&self.buffer[start..end])
-    }
-
-    fn read_variable_to_value_pair(&mut self) -> Option<(Vec<u8>, Vec<u8>)> {
-        let key = self.tokenize(b'=')?.into();
-        let value = self.tokenize(b'\t')?.into();
-
-        while self.match_byte(b'\t') || self.match_byte(b'\0') {}
-        Some((key, value))
+fn dissect(file_name: &Path) -> Result<()> {
+    let reader = File::open(file_name)?;
+    let reader = BufReader::new(reader);
+    let reader = Reader::new(reader)?;
+    for record in reader {
+        println!("{record:?}");
      }
+    Ok(())
  }
diff --git a/rust/src/raw.rs b/rust/src/raw.rs

index f0e8c540c2ac170f6fe18682bd8358ba817bc85d..ca0596f5414aecdb378b38d1e5e8cd3824fe6a18 100644 (file)
--- a/rust/src/raw.rs
+++ b/rust/src/raw.rs
@@ -3,6 +3,7 @@ use crate::Error;
  
  use flate2::read::ZlibDecoder;
  use num::Integer;
+use std::fmt::{Debug, Formatter, Result as FmtResult};
  use std::str::from_utf8;
  use std::{
      collections::VecDeque,
@@ -18,6 +19,7 @@ pub enum Compression {
      ZLib,
  }
  
+#[derive(Clone, Debug)]
  pub enum Record {
      Header(Header),
      Document(Document),
@@ -49,6 +51,27 @@ impl Record {
      }
  }
  
+pub struct FallbackEncoding<'a>(&'a [u8]);
+
+impl<'a> Debug for FallbackEncoding<'a> {
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        if let Ok(s) = from_utf8(self.0) {
+            let s = s.trim_end();
+            write!(f, "\"{s}\"")
+        } else {
+            let s: String = self
+                .0
+                .iter()
+                .map(|c| char::from(*c).escape_default())
+                .flatten()
+                .collect();
+            let s = s.trim_end();
+            write!(f, "\"{s}\"")
+        }
+    }
+}
+
+#[derive(Clone)]
  pub struct Header {
      /// Magic number.
      pub magic: Magic,
@@ -90,6 +113,30 @@ pub struct Header {
      pub endian: Endian,
  }
  
+impl Header {
+    fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
+        writeln!(f, "{name:>17}: {:?}", value)
+    }
+}
+
+impl Debug for Header {
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        writeln!(f, "File header record:")?;
+        self.debug_field(f, "Magic", self.magic)?;
+        self.debug_field(f, "Product name", FallbackEncoding(&self.eye_catcher))?;
+        self.debug_field(f, "Layout code", self.layout_code)?;
+        self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
+        self.debug_field(f, "Compression", self.compression)?;
+        self.debug_field(f, "Weight index", self.weight_index)?;
+        self.debug_field(f, "Number of cases", self.n_cases)?;
+        self.debug_field(f, "Compression bias", self.bias)?;
+        self.debug_field(f, "Creation date", FallbackEncoding(&self.creation_date))?;
+        self.debug_field(f, "Creation time", FallbackEncoding(&self.creation_time))?;
+        self.debug_field(f, "File label", FallbackEncoding(&self.file_label))?;
+        self.debug_field(f, "Endianness", self.endian)
+    }
+}
+
  impl Header {
      fn read<R: Read>(r: &mut R) -> Result<Header, Error> {
          let magic: [u8; 4] = read_bytes(r)?;
@@ -116,7 +163,7 @@ impl Header {
          };
  
          let weight_index: u32 = endian.parse(read_bytes(r)?);
-        let weight_index = (weight_index > 0).then_some(weight_index - 1);
+        let weight_index = (weight_index > 0).then(|| weight_index - 1);
  
          let n_cases: u32 = endian.parse(read_bytes(r)?);
          let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
@@ -160,6 +207,18 @@ impl Magic {
      pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
  }
  
+impl Debug for Magic {
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let s = match self {
+            &Magic::SAV => "$FL2",
+            &Magic::ZSAV => "$FL3",
+            &Magic::EBCDIC => "($FL2 in EBCDIC)",
+            _ => return write!(f, "{:?}", self.0),
+        };
+        write!(f, "{s}")
+    }
+}
+
  impl TryFrom<[u8; 4]> for Magic {
      type Error = Error;
  
@@ -336,7 +395,21 @@ pub enum Value {
      String([u8; 8]),
  }
  
+impl Debug for Value {
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        match self {
+            Value::Number(Some(number)) => write!(f, "{number:?}"),
+            Value::Number(None) => write!(f, "SYSMIS"),
+            Value::String(bytes) => write!(f, "{:?}", FallbackEncoding(bytes)),
+        }
+    }
+}
+
  impl Value {
+    fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
+        Ok(Self::from_raw(var_type, read_bytes(r)?, endian))
+    }
+
      pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
          match var_type {
              VarType::String => Value::String(raw),
@@ -517,6 +590,132 @@ impl Iterator for Reader {
  
  impl FusedIterator for Reader {}
  
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub struct Format(pub u32);
+
+impl Debug for Format {
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let type_ = format_name(self.0 >> 16);
+        let w = (self.0 >> 8) & 0xff;
+        let d = self.0 & 0xff;
+        write!(f, "{:06x} ({type_}{w}.{d})", self.0)
+    }
+}
+
+fn format_name(type_: u32) -> &'static str {
+    match type_ {
+        1 => "A",
+        2 => "AHEX",
+        3 => "COMMA",
+        4 => "DOLLAR",
+        5 => "F",
+        6 => "IB",
+        7 => "PIBHEX",
+        8 => "P",
+        9 => "PIB",
+        10 => "PK",
+        11 => "RB",
+        12 => "RBHEX",
+        15 => "Z",
+        16 => "N",
+        17 => "E",
+        20 => "DATE",
+        21 => "TIME",
+        22 => "DATETIME",
+        23 => "ADATE",
+        24 => "JDATE",
+        25 => "DTIME",
+        26 => "WKDAY",
+        27 => "MONTH",
+        28 => "MOYR",
+        29 => "QYR",
+        30 => "WKYR",
+        31 => "PCT",
+        32 => "DOT",
+        33 => "CCA",
+        34 => "CCB",
+        35 => "CCC",
+        36 => "CCD",
+        37 => "CCE",
+        38 => "EDATE",
+        39 => "SDATE",
+        40 => "MTIME",
+        41 => "YMDHMS",
+        _ => "(unknown)",
+    }
+}
+
+#[derive(Clone)]
+pub struct MissingValues {
+    /// Individual missing values, up to 3 of them.
+    pub values: Vec<Value>,
+
+    /// Optional range of missing values.
+    pub range: Option<(Value, Value)>,
+}
+
+impl Debug for MissingValues {
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        for (i, value) in self.values.iter().enumerate() {
+            if i > 0 {
+                write!(f, ", ")?;
+            }
+            write!(f, "{value:?}")?;
+        }
+
+        if let Some((low, high)) = self.range {
+            if !self.values.is_empty() {
+                write!(f, ", ")?;
+            }
+            write!(f, "{low:?} THRU {high:?}")?;
+        }
+
+        if self.is_empty() {
+            write!(f, "none")?;
+        }
+
+        Ok(())
+    }
+}
+
+impl MissingValues {
+    fn is_empty(&self) -> bool {
+        self.values.is_empty() && self.range.is_none()
+    }
+
+    fn read<R: Read + Seek>(
+        r: &mut R,
+        offset: u64,
+        width: i32,
+        code: i32,
+        endian: Endian,
+    ) -> Result<MissingValues, Error> {
+        let (n_values, has_range) = match (width, code) {
+            (_, 0..=3) => (code, false),
+            (0, -2) => (0, true),
+            (0, -3) => (1, true),
+            (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
+            (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
+        };
+
+        let var_type = VarType::from_width(width);
+
+        let mut values = Vec::new();
+        for _ in 0..n_values {
+            values.push(Value::read(r, var_type, endian)?);
+        }
+        let range = if has_range {
+            let low = Value::read(r, var_type, endian)?;
+            let high = Value::read(r, var_type, endian)?;
+            Some((low, high))
+        } else {
+            None
+        };
+        Ok(MissingValues { values, range })
+    }
+}
+
+#[derive(Clone)]
  pub struct Variable {
      /// Offset from the start of the file to the start of the record.
      pub offset: u64,
@@ -533,16 +732,41 @@ pub struct Variable {
      /// Write format.
      pub write_format: u32,
  
-    /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
-    pub missing_value_code: i32,
-
-    /// Raw missing values, up to 3 of them.
-    pub missing: Vec<[u8; 8]>,
+    /// Missing values.
+    pub missing_values: MissingValues,
  
      /// Optional variable label.
      pub label: Option<Vec<u8>>,
  }
  
+impl Debug for Variable {
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        writeln!(
+            f,
+            "Width: {} ({})",
+            self.width,
+            if self.width > 0 {
+                "string"
+            } else if self.width == 0 {
+                "numeric"
+            } else {
+                "long string continuation record"
+            }
+        )?;
+        writeln!(f, "Print format: {:?}", Format(self.print_format))?;
+        writeln!(f, "Write format: {:?}", Format(self.write_format))?;
+        writeln!(f, "Name: {:?}", FallbackEncoding(&self.name))?;
+        writeln!(
+            f,
+            "Variable label: {:?}",
+            self.label
+                .as_ref()
+                .map(|label| FallbackEncoding(&label[..]))
+        )?;
+        writeln!(f, "Missing values: {:?}", self.missing_values)
+    }
+}
+
  impl Variable {
      fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
          let offset = r.stream_position()?;
@@ -573,29 +797,7 @@ impl Variable {
              }
          };
  
-        let mut missing = Vec::new();
-        if missing_value_code != 0 {
-            match (width, missing_value_code) {
-                (0, -3 | -2 | 1 | 2 | 3) => (),
-                (0, _) => {
-                    return Err(Error::BadNumericMissingValueCode {
-                        offset,
-                        code: missing_value_code,
-                    })
-                }
-                (_, 0..=3) => (),
-                (_, _) => {
-                    return Err(Error::BadStringMissingValueCode {
-                        offset,
-                        code: missing_value_code,
-                    })
-                }
-            }
-
-            for _ in 0..missing_value_code.abs() {
-                missing.push(read_bytes(r)?);
-            }
-        }
+        let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
  
          Ok(Variable {
              offset,
@@ -603,13 +805,13 @@ impl Variable {
              name,
              print_format,
              write_format,
-            missing_value_code,
-            missing,
+            missing_values,
              label,
          })
      }
  }
  
+#[derive(Clone, Debug)]
  pub struct ValueLabel {
      /// Offset from the start of the file to the start of the record.
      pub offset: u64,
@@ -648,6 +850,7 @@ impl ValueLabel {
      }
  }
  
+#[derive(Clone, Debug)]
  pub struct VarIndexes {
      /// Offset from the start of the file to the start of the record.
      pub offset: u64,
@@ -682,6 +885,7 @@ impl VarIndexes {
      }
  }
  
+#[derive(Clone, Debug)]
  pub struct Document {
      /// Offset from the start of the file to the start of the record.
      pub pos: u64,
@@ -1348,6 +1552,7 @@ impl ExtensionRecord for NumberOfCasesRecord {
      }
  }
  
+#[derive(Clone, Debug)]
  pub struct Extension {
      /// Offset from the start of the file to the start of the record.
      pub offset: u64,
@@ -1443,6 +1648,7 @@ impl Extension {
      }
  }
  
+#[derive(Clone, Debug)]
  pub struct ZHeader {
      /// File offset to the start of the record.
      pub offset: u64,
@@ -1473,6 +1679,7 @@ impl ZHeader {
      }
  }
  
+#[derive(Clone, Debug)]
  pub struct ZTrailer {
      /// File offset to the start of the record.
      pub offset: u64,
@@ -1491,6 +1698,7 @@ pub struct ZTrailer {
      pub blocks: Vec<ZBlock>,
  }
  
+#[derive(Clone, Debug)]
  pub struct ZBlock {
      /// Offset of block of data if simple compression were used.
      pub uncompressed_ofs: u64,
author	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 6 Aug 2023 19:08:19 +0000 (12:08 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 6 Aug 2023 19:08:19 +0000 (12:08 -0700)
rust/src/main.rs		patch \| blob \| history
rust/src/raw.rs		patch \| blob \| history