cleanup
[pspp] / rust / src / main.rs
index 9c92b5515d50f8dd6c2c10f4cce46258d0cd1b5b..404e96d57d07becaf09c4688319a0ac3739ceee4 100644 (file)
@@ -1,5 +1,3 @@
-#![allow(unused_variables)]
-#![allow(dead_code)]
 /* PSPP - a program for statistical analysis.
  * Copyright (C) 2023 Free Software Foundation, Inc.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>. */
 
-use anyhow::{anyhow, Result};
-use clap::Parser;
-use hexplay::HexView;
-use hexplay::HexViewBuilder;
-use num::{Float, Num};
-use std::cmp::Ordering;
+use anyhow::Result;
+use clap::{Parser, ValueEnum};
+use encoding_rs::Encoding;
+use pspp::cooked::decode;
+use pspp::raw::{Reader, Record, Magic};
 use std::fs::File;
-use std::io::prelude::*;
 use std::io::BufReader;
 use std::path::{Path, PathBuf};
 use std::str;
-use std::{fmt, num::FpCategory};
+use thiserror::Error as ThisError;
 
 /// A utility to dissect SPSS system files.
 #[derive(Parser, Debug)]
@@ -35,977 +31,89 @@ use std::{fmt, num::FpCategory};
 struct Args {
     /// Maximum number of cases to print.
     #[arg(long = "data", default_value_t = 0)]
-    max_cases: usize,
+    max_cases: u64,
 
     /// Files to dissect.
     #[arg(required = true)]
     files: Vec<PathBuf>,
-}
-
-fn main() -> Result<()> {
-    let Args { max_cases, files } = Args::parse();
-
-    for file in files {
-        Dissector::new(file)?;
-    }
-    Ok(())
-}
-
-#[derive(Copy, Clone, Debug)]
-enum Compression {
-    Simple,
-    ZLib,
-}
-
-#[derive(Copy, Clone, Debug)]
-enum Endianness {
-    BigEndian,
-    LittleEndian,
-}
-use Endianness::*;
-
-trait Parse<T, const N: usize> {
-    fn parse(self, bytes: [u8; N]) -> T;
-}
-impl Parse<u64, 8> for Endianness {
-    fn parse(self, bytes: [u8; 8]) -> u64 {
-        match self {
-            BigEndian => u64::from_be_bytes(bytes),
-            LittleEndian => u64::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<u32, 4> for Endianness {
-    fn parse(self, bytes: [u8; 4]) -> u32 {
-        match self {
-            BigEndian => u32::from_be_bytes(bytes),
-            LittleEndian => u32::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<u16, 2> for Endianness {
-    fn parse(self, bytes: [u8; 2]) -> u16 {
-        match self {
-            BigEndian => u16::from_be_bytes(bytes),
-            LittleEndian => u16::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<u8, 1> for Endianness {
-    fn parse(self, bytes: [u8; 1]) -> u8 {
-        match self {
-            BigEndian => u8::from_be_bytes(bytes),
-            LittleEndian => u8::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<i64, 8> for Endianness {
-    fn parse(self, bytes: [u8; 8]) -> i64 {
-        match self {
-            BigEndian => i64::from_be_bytes(bytes),
-            LittleEndian => i64::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<i32, 4> for Endianness {
-    fn parse(self, bytes: [u8; 4]) -> i32 {
-        match self {
-            BigEndian => i32::from_be_bytes(bytes),
-            LittleEndian => i32::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<i16, 2> for Endianness {
-    fn parse(self, bytes: [u8; 2]) -> i16 {
-        match self {
-            BigEndian => i16::from_be_bytes(bytes),
-            LittleEndian => i16::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<i8, 1> for Endianness {
-    fn parse(self, bytes: [u8; 1]) -> i8 {
-        match self {
-            BigEndian => i8::from_be_bytes(bytes),
-            LittleEndian => i8::from_le_bytes(bytes),
-        }
-    }
-}
-impl Parse<f64, 8> for Endianness {
-    fn parse(self, bytes: [u8; 8]) -> f64 {
-        match self {
-            BigEndian => f64::from_be_bytes(bytes),
-            LittleEndian => f64::from_le_bytes(bytes),
-        }
-    }
-}
-
-fn read_bytes<const N: usize>(r: &mut BufReader<File>) -> Result<[u8; N]> {
-    let mut buf = [0; N];
-    r.read_exact(&mut buf)?;
-    Ok(buf)
-}
-
-fn read_vec(r: &mut BufReader<File>, n: usize) -> Result<Vec<u8>> {
-    let mut vec = vec![0; n];
-    r.read_exact(&mut vec)?;
-    Ok(vec)
-}
-
-trait ReadSwap<T> {
-    fn read_swap(&mut self) -> Result<T>;
-}
-
-impl ReadSwap<u32> for Dissector {
-    fn read_swap(&mut self) -> Result<u32> {
-        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
-    }
-}
-impl ReadSwap<u8> for Dissector {
-    fn read_swap(&mut self) -> Result<u8> {
-        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
-    }
-}
-
-impl ReadSwap<i32> for Dissector {
-    fn read_swap(&mut self) -> Result<i32> {
-        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
-    }
-}
 
-impl ReadSwap<f64> for Dissector {
-    fn read_swap(&mut self) -> Result<f64> {
-        Ok(self.endianness.parse(read_bytes(&mut self.r)?))
-    }
-}
+    /// How to dissect the file.
+    #[arg(short, long, value_enum, default_value_t)]
+    mode: Mode,
 
-struct Dissector {
-    filename: String,
-    r: BufReader<File>,
-    compression: Option<Compression>,
-    endianness: Endianness,
-    fp_format: Endianness,
-    bias: f64,
-    n_variable_records: usize,
-    n_variables: usize,
-    var_widths: Vec<i32>,
-}
-
-fn detect_endianness(layout_code: [u8; 4]) -> Option<Endianness> {
-    for endianness in [BigEndian, LittleEndian] {
-        match endianness.parse(layout_code) {
-            2 | 3 => return Some(endianness),
-            _ => (),
-        }
-    }
-    None
+    /// The encoding to use.
+    #[arg(long, value_parser = parse_encoding)]
+    encoding: Option<&'static Encoding>,
 }
 
-fn detect_fp_format(bias: [u8; 8]) -> Option<Endianness> {
-    for endianness in [BigEndian, LittleEndian] {
-        let value: f64 = endianness.parse(bias);
-        if value == 100.0 {
-            return Some(endianness);
-        }
-    }
-    None
-}
+/// Error produced by `parse_encoding` when it finds no encoding for the label
+/// it was given.  The wrapped string is that label, and the error displays as
+/// `<label>: unknown encoding`.
+#[derive(ThisError, Debug)]
+#[error("{0}: unknown encoding")]
+struct UnknownEncodingError(String);
 
-fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
-    while s.last() == Some(&c) {
-        s.pop();
+/// Value parser for the `--encoding` command-line option.
+///
+/// Looks `arg` up with `Encoding::for_label_no_replacement`.  Returns the
+/// encoding that lookup yields, or an `UnknownEncodingError` carrying a copy
+/// of `arg` when the lookup yields nothing.
+fn parse_encoding(arg: &str) -> Result<&'static Encoding, UnknownEncodingError> {
+    Encoding::for_label_no_replacement(arg.as_bytes())
+        .ok_or_else(|| UnknownEncodingError(arg.to_string()))
-    }
-    s
 }
 
-fn slice_trim_end(mut s: &[u8], c: u8) -> &[u8] {
-    while s.last() == Some(&c) {
-        s = s.split_last().unwrap().1;
-    }
-    s
+// Selects what `dissect` does with a file.  Chosen on the command line with
+// `--mode`; `Cooked` is the default.
+#[derive(Clone, Copy, Debug, Default, ValueEnum)]
+enum Mode {
+    // Only report which kind of system file this is, judging by the magic in
+    // its header record.
+    Identify,
+    // Print the header records exactly as the raw reader collected them.
+    Raw,
+    // Pass the collected header records through `decode` before printing them.
+    #[default]
+    Cooked,
 }
 
-fn format_name(type_: u32) -> &'static str {
-    match type_ {
-        1 => "A",
-        2 => "AHEX",
-        3 => "COMMA",
-        4 => "DOLLAR",
-        5 => "F",
-        6 => "IB",
-        7 => "PIBHEX",
-        8 => "P",
-        9 => "PIB",
-        10 => "PK",
-        11 => "RB",
-        12 => "RBHEX",
-        15 => "Z",
-        16 => "N",
-        17 => "E",
-        20 => "DATE",
-        21 => "TIME",
-        22 => "DATETIME",
-        23 => "ADATE",
-        24 => "JDATE",
-        25 => "DTIME",
-        26 => "WKDAY",
-        27 => "MONTH",
-        28 => "MOYR",
-        29 => "QYR",
-        30 => "WKYR",
-        31 => "PCT",
-        32 => "DOT",
-        33 => "CCA",
-        34 => "CCB",
-        35 => "CCC",
-        36 => "CCD",
-        37 => "CCE",
-        38 => "EDATE",
-        39 => "SDATE",
-        40 => "MTIME",
-        41 => "YMDHMS",
-        _ => "invalid",
-    }
-}
-
-fn round_up<T: Num + Copy>(x: T, y: T) -> T {
-    (x + (y - T::one())) / y * y
-}
-
-struct UntypedValue {
-    raw: [u8; 8],
-    endianness: Endianness,
-}
-
-impl UntypedValue {
-    fn new(raw: [u8; 8], endianness: Endianness) -> UntypedValue {
-        UntypedValue { raw, endianness }
-    }
-}
-
-impl fmt::Display for UntypedValue {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let numeric: f64 = self.endianness.parse(self.raw);
-        let n_printable = self
-            .raw
-            .iter()
-            .take_while(|&&x| x == b' ' || x.is_ascii_graphic())
-            .count();
-        let printable_prefix = std::str::from_utf8(&self.raw[0..n_printable]).unwrap();
-        write!(f, "{numeric}/\"{printable_prefix}\"")
-    }
-}
-
-struct HexFloat<T: Float>(T);
-
-impl<T: Float> fmt::Display for HexFloat<T> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let sign = if self.0.is_sign_negative() { "-" } else { "" };
-        match self.0.classify() {
-            FpCategory::Nan => return write!(f, "NaN"),
-            FpCategory::Infinite => return write!(f, "{sign}Infinity"),
-            FpCategory::Zero => return write!(f, "{sign}0.0"),
-            _ => (),
-        };
-        let (significand, mut exponent, _) = self.0.integer_decode();
-        let mut hex_sig = format!("{:x}", significand);
-        while hex_sig.ends_with('0') {
-            hex_sig.pop();
-            exponent += 4;
-        }
-        match hex_sig.len() {
-            0 => write!(f, "{sign}0.0"),
-            1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
-            len => write!(
-                f,
-                "{sign}0x{}.{}p{}",
-                hex_sig.chars().next().unwrap(),
-                &hex_sig[1..],
-                exponent + 4 * (len as i16 - 1)
-            ),
-        }
-    }
-}
-
-#[cfg(test)]
-mod hex_float_tests {
-    use crate::HexFloat;
-    use num::Float;
+/// Program entry point.
+///
+/// Parses the command line into `Args`, then runs `dissect` on each listed
+/// file in the order given.  The first file that fails stops the run and its
+/// error becomes the program's result.
+fn main() -> Result<()> {
+    let args = Args::parse();
 
-    #[test]
-    fn test() {
-        assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
-        assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
-        assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
-        assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
-        assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
-        assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
-        assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
-        assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
+    for file in &args.files {
+        dissect(file, args.max_cases, args.mode, args.encoding)?;
     }
+    Ok(())
 }
 
-impl Dissector {
-    fn new<P: AsRef<Path>>(filename: P) -> Result<Dissector> {
-        let mut r = BufReader::new(File::open(&filename)?);
-        let filename = filename.as_ref().to_string_lossy().into_owned();
-        let rec_type: [u8; 4] = read_bytes(&mut r)?;
-        let zmagic = match &rec_type {
-            b"$FL2" => false,
-            b"$FL3" => true,
-            _ => Err(anyhow!("This is not an SPSS system file."))?,
-        };
-
-        let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
-        let layout_code: [u8; 4] = read_bytes(&mut r)?;
-        let endianness = detect_endianness(layout_code)
-            .ok_or_else(|| anyhow!("This is not an SPSS system file."))?;
-        let layout_code: u32 = endianness.parse(layout_code);
-        let _nominal_case_size: [u8; 4] = read_bytes(&mut r)?;
-        let compressed: u32 = endianness.parse(read_bytes(&mut r)?);
-        let compression = match (zmagic, compressed) {
-            (false, 0) => None,
-            (false, 1) => Some(Compression::Simple),
-            (true, 2) => Some(Compression::ZLib),
-            _ => Err(anyhow!(
-                "{} file header has invalid compression value {compressed}.",
-                if zmagic { "ZSAV" } else { "SAV" }
-            ))?,
-        };
-
-        let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
-        let n_cases: u32 = endianness.parse(read_bytes(&mut r)?);
-
-        let bias: [u8; 8] = read_bytes(&mut r)?;
-        let fp_format = detect_fp_format(bias)
-            .unwrap_or_else(|| { eprintln!("Compression bias is not the usual value of 100, or system file uses unrecognized floating-point format."); endianness });
-        let bias: f64 = fp_format.parse(bias);
-
-        let mut d = Dissector {
-            filename,
-            r,
-            compression,
-            endianness,
-            fp_format,
-            bias,
-            n_variable_records: 0,
-            n_variables: 0,
-            var_widths: Vec::new(),
-        };
-
-        let creation_date: [u8; 9] = read_bytes(&mut d.r)?;
-        let creation_time: [u8; 8] = read_bytes(&mut d.r)?;
-        let file_label: [u8; 64] = read_bytes(&mut d.r)?;
-        let file_label = trim_end(Vec::from(file_label), b' ');
-        d.skip_bytes(3)?;
-
-        println!("File header record:");
-        println!(
-            "{:>17}: {}",
-            "Product name",
-            String::from_utf8_lossy(&eye_catcher)
-        );
-        println!("{:>17}: {}", "Layout code", layout_code);
-        println!(
-            "{:>17}: {} ({})",
-            "Compressed",
-            compressed,
-            match compression {
-                None => "no compression",
-                Some(Compression::Simple) => "simple compression",
-                Some(Compression::ZLib) => "ZLIB compression",
-            }
-        );
-        println!("{:>17}: {}", "Weight index", weight_index);
-        println!("{:>17}: {}", "Number of cases", n_cases);
-        println!("{:>17}: {}", "Compression bias", bias);
-        println!(
-            "{:>17}: {}",
-            "Creation date",
-            String::from_utf8_lossy(&creation_date)
-        );
-        println!(
-            "{:>17}: {}",
-            "Creation time",
-            String::from_utf8_lossy(&creation_time)
-        );
-        println!(
-            "{:>17}: \"{}\"",
-            "File label",
-            String::from_utf8_lossy(&file_label)
-        );
-
-        loop {
-            let rec_type: u32 = d.read_swap()?;
-            match rec_type {
-                2 => d.read_variable_record()?,
-                3 => d.read_value_label_record()?,
-                4 => Err(anyhow!("Misplaced type 4 record."))?,
-                6 => d.read_document_record()?,
-                7 => d.read_extension_record()?,
-                999 => break,
-                _ => Err(anyhow!("Unrecognized record type {rec_type}."))?,
-            }
-        }
-
-        let pos = d.r.stream_position()?;
-        println!(
-            "{:08x}: end-of-dictionary record (first byte of data at {:0x})",
-            pos,
-            pos + 4
-        );
-
-        Ok(d)
-    }
-
-    fn read_extension_record(&mut self) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        let subtype: u32 = self.read_swap()?;
-        let size: u32 = self.read_swap()?;
-        let count: u32 = self.read_swap()?;
-        println!("{offset:08x}: Record 7, subtype {subtype}, size={size}, count={count}");
-        match subtype {
-            3 => self.read_machine_integer_info(size, count),
-            4 => self.read_machine_float_info(size, count),
-            5 => self.read_variable_sets(size, count),
-            6 => {
-                // DATE variable information.  We don't use it yet, but we should.
-                Ok(())
-            }
-            7 | 19 => self.read_mrsets(size, count),
-            10 => self.read_extra_product_info(size, count),
-            11 => self.read_display_parameters(size, count),
-            _ => self.read_unknown_extension(subtype, size, count),
-        }
-    }
-
-    fn warn(&mut self, s: String) -> Result<()> {
-        println!(
-            "\"{}\" near offset 0x{:08x}: {s}",
-            self.filename,
-            self.r.stream_position()?
-        );
-        Ok(())
-    }
-
-    fn skip_bytes(&mut self, mut n: u64) -> Result<()> {
-        let mut buf = [0; 1024];
-        while n > 0 {
-            let chunk = u64::min(n, buf.len() as u64);
-            self.r.read_exact(&mut buf[0..chunk as usize])?;
-            n -= chunk;
-        }
-        Ok(())
-    }
-
-    fn read_unknown_extension(&mut self, subtype: u32, size: u32, count: u32) -> Result<()> {
-        self.warn(format!("Unrecognized record type 7, subtype {subtype}."))?;
-        if size == 0 || count > 65536 / size {
-            self.skip_bytes(size as u64 * count as u64)?;
-        } else if size != 1 {
-            let mut offset = 0;
-            for _ in 0..count {
-                let vec = read_vec(&mut self.r, size as usize)?;
-                println!(
-                    "{}",
-                    HexViewBuilder::new(&vec).address_offset(offset).finish()
-                );
-                offset += size as usize;
-            }
-        }
-        Ok(())
-    }
-
-    fn read_variable_record(&mut self) -> Result<()> {
-        self.n_variable_records += 1;
-        println!(
-            "{:08x}: variable record {}",
-            self.r.stream_position()?,
-            self.n_variable_records
-        );
-        let width: i32 = self.read_swap()?;
-        let has_variable_label: u32 = self.read_swap()?;
-        let missing_value_code: i32 = self.read_swap()?;
-        let print_format: u32 = self.read_swap()?;
-        let write_format: u32 = self.read_swap()?;
-        let name: [u8; 8] = read_bytes(&mut self.r)?;
-        let name: Vec<u8> = trim_end(Vec::from(name), b'\0');
-
-        if width >= 0 {
-            self.n_variables += 1;
-        }
-        self.var_widths.push(width);
-
-        println!(
-            "\tWidth: {width} ({})",
-            match width {
-                _ if width > 0 => "string",
-                _ if width == 0 => "numeric",
-                _ => "long string continuation record",
-            }
-        );
-
-        println!("\tVariable label: {has_variable_label}");
-        println!(
-            "\tMissing values code: {missing_value_code} ({})",
-            match missing_value_code {
-                0 => "no missing values",
-                1 => "one missing value",
-                2 => "two missing values",
-                3 => "three missing values",
-                -2 => "one missing value range",
-                -3 => "one missing value, one range",
-                _ => "bad value",
-            }
-        );
-        for (which, format) in [("Print", print_format), ("Worite", write_format)] {
-            let type_ = format_name(format >> 16);
-            let w = (format >> 8) & 0xff;
-            let d = format & 0xff;
-            println!("\t{which} format: {format:06x} ({type_}{w}.{d})");
-        }
-        println!("\tName: {}", String::from_utf8_lossy(&name));
-
-        // Read variable label.
-        match has_variable_label {
-            0 => (),
-            1 => {
-                let offset = self.r.stream_position()?;
-                let len: u32 = self.read_swap()?;
-                let read_len = len.min(65535) as usize;
-                let label = read_vec(&mut self.r, read_len)?;
-                println!(
-                    "\t{offset:08x} Variable label: \"{}\"",
-                    String::from_utf8_lossy(&label)
-                );
-
-                self.skip_bytes((round_up(len, 4) - len).into())?;
-            }
-            _ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?,
-        };
-
-        // Read missing values.
-        if missing_value_code != 0 {
-            print!("\t{:08x} Missing values:", self.r.stream_position()?);
-            match width.cmp(&0) {
-                Ordering::Equal => {
-                    let (has_range, n_individual) = match missing_value_code {
-                        -3 => (true, 1),
-                        -2 => (true, 0),
-                        1 | 2 | 3 => (false, missing_value_code),
-                        _ => Err(anyhow!(
-                            "Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."
-                        ))?,
-                    };
-                    if has_range {
-                        let low: f64 = self.read_swap()?;
-                        let high: f64 = self.read_swap()?;
-                        print!(" {low}...{high}");
-                    }
-                    for _ in 0..n_individual {
-                        let value: f64 = self.read_swap()?;
-                        print!(" {value}");
-                    }
-                }
-                Ordering::Greater => {
-                    if !(0..=3).contains(&missing_value_code) {
-                        Err(anyhow!(
-                            "String missing value indicator field is not 0, 1, 2, or 3."
-                        ))?;
-                    }
-                    for _ in 0..missing_value_code {
-                        let string: [u8; 8] = read_bytes(&mut self.r)?;
-                        let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
-                        println!(" {}", String::from_utf8_lossy(&string));
-                    }
-                }
-                Ordering::Less => (),
-            }
-            println!();
-        }
-
-        Ok(())
-    }
-
-    fn read_value_label_record(&mut self) -> Result<()> {
-        println!("{:08x}: value labels record", self.r.stream_position()?);
-
-        // Read the labels.
-        let n_labels: u32 = self.read_swap()?;
-        for _ in 0..n_labels {
-            let raw: [u8; 8] = read_bytes(&mut self.r)?;
-            let value = UntypedValue::new(raw, self.fp_format);
-            let label_len: u8 = self.read_swap()?;
-            let padded_len = round_up(label_len as usize + 1, 8);
-
-            let mut label = read_vec(&mut self.r, padded_len)?;
-            label.truncate(label_len as usize);
-            let label = String::from_utf8_lossy(&label);
-
-            println!("\t{value}: {label}");
-        }
-
-        // Read the type-4 record with the corresponding variable indexes.
-        let rec_type: u32 = self.read_swap()?;
-        if rec_type != 4 {
-            Err(anyhow!(
-                "Variable index record (type 4) does not immediately \
-                         follow value label record (type 3) as it should."
-            ))?;
-        }
-
-        println!("\t{:08x}: apply to variables", self.r.stream_position()?);
-        let n_vars: u32 = self.read_swap()?;
-        for _ in 0..n_vars {
-            let index: u32 = self.read_swap()?;
-            print!(" {index}");
-        }
-        println!();
-
-        Ok(())
-    }
-
-    fn read_document_record(&mut self) -> Result<()> {
-        println!("{:08x}: document record", self.r.stream_position()?);
-        let n_lines: u32 = self.read_swap()?;
-        println!("\t{n_lines} lines of documents");
-
-        for i in 0..n_lines {
-            print!("\t{:08x}: ", self.r.stream_position()?);
-            let line: [u8; 64] = read_bytes(&mut self.r)?;
-            let line = trim_end(Vec::from(line), b' ');
-            println!("line {i}: \"{}\"", String::from_utf8_lossy(&line));
-        }
-        Ok(())
-    }
-
-    fn read_machine_integer_info(&mut self, size: u32, count: u32) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        let version_major: u32 = self.read_swap()?;
-        let version_minor: u32 = self.read_swap()?;
-        let version_revision: u32 = self.read_swap()?;
-        let machine_code: u32 = self.read_swap()?;
-        let float_representation: u32 = self.read_swap()?;
-        let compression_code: u32 = self.read_swap()?;
-        let integer_representation: u32 = self.read_swap()?;
-        let character_code: u32 = self.read_swap()?;
-
-        println!("{offset:08x}: machine integer info");
-        if size != 4 || count != 8 {
-            Err(anyhow!(
-                "Bad size ({size}) or count ({count}) field on record type 7, subtype 3"
-            ))?;
-        }
-        println!("\tVersion: {version_major}.{version_minor}.{version_revision}");
-        println!("\tMachine code: {machine_code}");
-        println!(
-            "\tFloating point representation: {float_representation} ({})",
-            match float_representation {
-                1 => "IEEE 754",
-                2 => "IBM 370",
-                3 => "DEC VAX",
-                _ => "unknown",
-            }
-        );
-        println!("\tCompression code: {compression_code}");
-        println!(
-            "\tEndianness: {integer_representation} ({})",
-            match integer_representation {
-                1 => "big",
-                2 => "little",
-                _ => "unknown",
-            }
-        );
-        println!("\tCharacter code: {character_code}");
-        Ok(())
-    }
-
-    fn read_machine_float_info(&mut self, size: u32, count: u32) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        let sysmis: f64 = self.read_swap()?;
-        let highest: f64 = self.read_swap()?;
-        let lowest: f64 = self.read_swap()?;
-
-        println!("{offset:08x}: machine float info");
-        if size != 4 || count != 8 {
-            Err(anyhow!(
-                "Bad size ({size}) or count ({count}) field on extension 4."
-            ))?;
-        }
-
-        println!("\tsysmis: {sysmis} ({})", HexFloat(sysmis));
-        println!("\thighest: {highest} ({})", HexFloat(highest));
-        println!("\tlowest: {lowest} ({})", HexFloat(lowest));
-        Ok(())
-    }
+/// Dissects one system file and prints the result to standard output.
+///
+/// Opens `file_name`, wraps it in a buffered reader and a `Reader`, and then
+/// acts according to `mode`:
+///
+/// - `Mode::Identify` prints which kind of system file this is, judging by the
+///   magic in its header record, and returns without reading any further.
+/// - `Mode::Raw` prints every record returned by `collect_headers` in `Debug`
+///   form.
+/// - `Mode::Cooked` passes those records through `decode`, together with
+///   `encoding`, and prints what comes back in `Debug` form.
+///
+/// In the latter two modes, up to `max_cases` further records are then read
+/// from the reader and printed, stopping early if the reader runs out.
+///
+/// # Errors
+///
+/// Fails if the file cannot be opened or if reading or decoding any record
+/// fails, including the records read after the headers.
+fn dissect(
+    file_name: &Path,
+    max_cases: u64,
+    mode: Mode,
+    encoding: Option<&'static Encoding>,
+) -> Result<()> {
+    let reader = File::open(file_name)?;
+    let reader = BufReader::new(reader);
+    let mut reader = Reader::new(reader)?;
 
-    fn read_variable_sets(&mut self, size: u32, count: u32) -> Result<()> {
-        println!("{:08x}: variable sets", self.r.stream_position()?);
-        let mut text = self.open_text_record(size, count)?;
-        loop {
-            while text.match_byte(b'\n') {
-                continue;
+    match mode {
+        Mode::Identify => {
+            // NOTE(review): this assumes a fresh `Reader` always yields the file
+            // header as its first record; otherwise the `unwrap` or the
+            // `unreachable!` panics.  Confirm against `pspp::raw::Reader`.
+            let Record::Header(header) = reader.next().unwrap()? else { unreachable!() };
+            match header.magic {
+                Magic::Sav => println!("SPSS System File"),
+                Magic::Zsav => println!("SPSS System File with Zlib compression"),
+                Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"),
             }
-            let set = match text.tokenize(b'=') {
-                Some(set) => String::from_utf8_lossy(set).into_owned(),
-                None => break,
-            };
-
-            // Always present even for an empty set.
-            text.match_byte(b' ');
-
-            match text.tokenize(b'\n') {
-                None => println!("\tset \"{set}\" is empty"),
-                Some(variables) => {
-                    println!(
-                        "\tset \"{set}\" contains \"{}\"",
-                        String::from_utf8_lossy(variables).trim_end_matches('\r')
-                    );
-                }
-            };
+            // Identification never prints cases, so skip the loop below.
+            return Ok(());
         }
-        Ok(())
-    }
-
-    // Read record type 7, subtype 7.
-    fn read_mrsets(&mut self, size: u32, count: u32) -> Result<()> {
-        print!("{:08x}: multiple response sets", self.r.stream_position()?);
-        let mut text = self.open_text_record(size, count)?;
-        loop {
-            #[derive(PartialEq, Eq)]
-            enum MrSet {
-                MC,
-                MD,
+        Mode::Raw => {
+            let headers: Vec<Record> = reader.collect_headers()?;
+            for header in headers {
+                println!("{header:?}");
             }
-
-            while text.match_byte(b'\n') {}
-            let Some(name) = text.tokenize(b'=') else {
-                break;
-            };
-
-            let (mrset, cat_label_from_counted_values, label_from_var_label) = if text
-                .match_byte(b'C')
-            {
-                if !text.match_byte(b' ') {
-                    Err(anyhow!(
-                        "missing space following 'C' at offset {} in mrsets record",
-                        text.pos
-                    ))?;
-                }
-                (MrSet::MC, false, false)
-            } else if text.match_byte(b'D') {
-                (MrSet::MD, false, false)
-            } else if text.match_byte(b'E') {
-                if !text.match_byte(b' ') {
-                    Err(anyhow!(
-                        "missing space following 'E' at offset {} in mrsets record",
-                        text.pos
-                    ))?;
-                }
-
-                let pos = text.pos;
-                let Some(number) = text.tokenize(b' ') else {
-                    Err(anyhow!(
-                        "Missing label source value following `E' at offset {}u in MRSETS record",
-                        text.pos
-                    ))?
-                };
-
-                let label_from_var_label = if number == b"11" {
-                    true
-                } else if number == b"1" {
-                    false
-                } else {
-                    Err(anyhow!("Unexpected label source value `{}' following `E' at offset {pos} in MRSETS record", String::from_utf8_lossy(number)))?
-                };
-                (MrSet::MD, true, label_from_var_label)
-            } else {
-                Err(anyhow!(
-                    "missing `C', `D', or `E' at offset {} in mrsets record",
-                    text.pos
-                ))?
-            };
-
-            let counted_value = if mrset == MrSet::MD {
-                Some(text.parse_counted_string()?)
-            } else { None };
-
-            let label = text.parse_counted_string()?;
-
-            let variables = text.tokenize(b'\n');
-
-            print!("\t\"{}\": multiple {} set",
-                   String::from_utf8_lossy(name),
-                   if mrset == MrSet::MC { "category" } else { "dichotomy" });
-            
         }
-        Ok(())
-    }
-
-    fn read_extra_product_info(&mut self, size: u32, count: u32) -> Result<()> {
-        print!("{:08x}: extra product info", self.r.stream_position()?);
-        let text = self.open_text_record(size, count)?;
-        print_string(&text.buffer);
-        Ok(())
-    }
-
-    fn read_display_parameters(&mut self, size: u32, count: u32) -> Result<()> {
-        println!(
-            "{:08x}: variable display parameters",
-            self.r.stream_position()?
-        );
-        if size != 4 {
-            Err(anyhow!("Bad size ({size}) on extension 11."))?;
-        }
-        let n_vars = self.n_variables;
-        let includes_width = if count as usize == 3 * n_vars {
-            true
-        } else if count as usize == 2 * n_vars {
-            false
-        } else {
-            Err(anyhow!(
-                "Extension 11 has bad count {count} (for {n_vars} variables)."
-            ))?
-        };
-
-        for i in 0..n_vars {
-            let measure: u32 = self.read_swap()?;
-            print!(
-                "\tVar #{i}: measure={measure} ({})",
-                match measure {
-                    1 => "nominal",
-                    2 => "ordinal",
-                    3 => "scale",
-                    _ => "invalid",
-                }
-            );
-
-            if includes_width {
-                let width: u32 = self.read_swap()?;
-                print!(", width={width}");
+        Mode::Cooked => {
+            let headers: Vec<Record> = reader.collect_headers()?;
+            // NOTE(review): the callback handed to `decode` panics with whatever
+            // it is given; presumably these are decoding diagnostics, so any
+            // such report aborts the program.  Confirm against `pspp::cooked`.
+            let headers = decode(headers, encoding, &|e| panic!("{e}"))?;
+            for header in headers {
+                println!("{header:?}");
             }
-
-            let align: u32 = self.read_swap()?;
-            println!(
-                ", align={align} ({})",
-                match align {
-                    0 => "left",
-                    1 => "right",
-                    2 => "centre",
-                    _ => "invalid",
-                }
-            );
         }
-        Ok(())
     }
 
-    fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
-        let n_bytes = match u32::checked_mul(size, count) {
-            Some(n) => n,
-            None => Err(anyhow!("Extension record too large."))?,
+    // Print up to `max_cases` of the records that follow the headers.  Running
+    // out of records ends the loop quietly, but a read error is propagated to
+    // the caller rather than being mistaken for the end of the data.
+    for _ in 0..max_cases {
+        let Some(record) = reader.next() else {
+            break;
         };
-        Ok(TextRecord::new(read_vec(&mut self.r, n_bytes as usize)?))
-    }
-}
-
-fn print_string(s: &[u8]) {
-    if s.contains(&b'\0') {
-        println!("{}", HexView::new(s));
-    } else {
-        for &c in s {
-            match c {
-                b'\\' => print!("\\\\"),
-                b'\n' => println!(),
-                c if (b' '..=b'~').contains(&c) => print!("{}", c as char),
-                c => print!("\\{:2x}", c),
-            }
-        }
-    }
-}
-
-struct TextRecord {
-    buffer: Vec<u8>,
-    pos: usize,
-}
-
-impl TextRecord {
-    fn new(buffer: Vec<u8>) -> TextRecord {
-        TextRecord { buffer, pos: 0 }
-    }
-
-    fn tokenize(&mut self, delimiter: u8) -> Option<&[u8]> {
-        let start = self.pos;
-        while self.pos < self.buffer.len()
-            && self.buffer[self.pos] != delimiter
-            && self.buffer[self.pos] != 0
-        {
-            self.pos += 1
-        }
-        if start == self.pos {
-            None
-        } else {
-            Some(&self.buffer[start..self.pos])
-        }
-    }
-
-    fn match_byte(&mut self, c: u8) -> bool {
-        if self.pos < self.buffer.len() && self.buffer[self.pos] == c {
-            self.pos += 1;
-            true
-        } else {
-            false
-        }
-    }
-
-    fn parse_usize(&mut self) -> Result<usize> {
-        let n_digits = self.buffer[self.pos..]
-            .iter()
-            .take_while(|c| c.is_ascii_digit())
-            .count();
-        if n_digits == 0 {
-            Err(anyhow!("expecting digit at offset {} in record", self.pos))?;
-        }
-        let start = self.pos;
-        self.pos += n_digits;
-        let end = self.pos;
-        let digits = str::from_utf8(&self.buffer[start..end]).unwrap();
-        let Ok(number) = digits.parse::<usize>() else {
-            Err(anyhow!(
-                "expecting number in [0,{}] at offset {} in record",
-                usize::MAX,
-                self.pos
-            ))?
-        };
-        self.pos = end;
-        Ok(number)
-    }
-
-    fn get_n_bytes(&mut self, n: usize) -> Option<(usize, usize)> {
-        let start = self.pos;
-        let Some(end) = start.checked_add(n) else {
-            return None;
-        };
-        self.pos = end;
-        Some((start, end))
-    }
-
-    fn parse_counted_string(&mut self) -> Result<&[u8]> {
-        let length = self.parse_usize()?;
-        if !self.match_byte(b' ') {
-            Err(anyhow!("expecting space at offset {} in record", self.pos))?;
-        }
-
-        let Some((start, end)) = self.get_n_bytes(length) else {
-            Err(anyhow!("{length}-byte string starting at offset {} exceeds record length {}",
-                        self.pos, self.buffer.len()))?
-        };
-        if !self.match_byte(b' ') {
-            Err(anyhow!(
-                "expecting space at offset {} following {}-byte string",
-                self.pos,
-                end - start
-            ))?;
-        }
-        Ok(&self.buffer[start..end])
+        println!("{:?}", record?);
     }
+    Ok(())
 }