+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+use anyhow::{anyhow, Result};
+use clap::Parser;
+use num::Num;
+use std::fs::File;
+use std::io::prelude::*;
+use std::io::BufReader;
+use std::path::{Path, PathBuf};
+
+/// A utility to dissect SPSS system files.
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+ /// Maximum number of cases to print.
+ #[arg(long = "data", default_value_t = 0)]
+ max_cases: usize,
+
+ /// Files to dissect.
+ #[arg(required = true)]
+ files: Vec<PathBuf>
+}
+
+fn main() -> Result<()> {
+ let Args { max_cases, files } = Args::parse();
+
+ let error = false;
+ for file in files {
+ Dissector::new(file)?;
+ }
+ Ok(())
+}
+
/// Compression scheme that a system file header can declare.
#[derive(Clone, Copy, Debug)]
enum Compression {
    /// Reported to the user as "simple compression".
    Simple,
    /// Reported to the user as "ZLIB compression".
    ZLib,
}
+
/// Byte order used to decode multi-byte fields.
#[derive(Clone, Copy, Debug)]
enum Endianness {
    /// Most significant byte first.
    BigEndian,
    /// Least significant byte first.
    LittleEndian,
}
// Bring the variants into scope so the rest of the file can name them unqualified.
use Endianness::*;
+
+trait Parse<T, const N: usize> {
+ fn parse(self, bytes: [u8; N]) -> T;
+}
+impl Parse<u64, 8> for Endianness {
+ fn parse(self, bytes: [u8; 8]) -> u64 {
+ match self {
+ BigEndian => u64::from_be_bytes(bytes),
+ LittleEndian => u64::from_le_bytes(bytes)
+ }
+ }
+}
+impl Parse<u32, 4> for Endianness {
+ fn parse(self, bytes: [u8; 4]) -> u32 {
+ match self {
+ BigEndian => u32::from_be_bytes(bytes),
+ LittleEndian => u32::from_le_bytes(bytes)
+ }
+ }
+}
+impl Parse<u16, 2> for Endianness {
+ fn parse(self, bytes: [u8; 2]) -> u16 {
+ match self {
+ BigEndian => u16::from_be_bytes(bytes),
+ LittleEndian => u16::from_le_bytes(bytes)
+ }
+ }
+}
+impl Parse<u8, 1> for Endianness {
+ fn parse(self, bytes: [u8; 1]) -> u8 {
+ match self {
+ BigEndian => u8::from_be_bytes(bytes),
+ LittleEndian => u8::from_le_bytes(bytes)
+ }
+ }
+}
+impl Parse<i64, 8> for Endianness {
+ fn parse(self, bytes: [u8; 8]) -> i64 {
+ match self {
+ BigEndian => i64::from_be_bytes(bytes),
+ LittleEndian => i64::from_le_bytes(bytes)
+ }
+ }
+}
+impl Parse<i32, 4> for Endianness {
+ fn parse(self, bytes: [u8; 4]) -> i32 {
+ match self {
+ BigEndian => i32::from_be_bytes(bytes),
+ LittleEndian => i32::from_le_bytes(bytes)
+ }
+ }
+}
+impl Parse<i16, 2> for Endianness {
+ fn parse(self, bytes: [u8; 2]) -> i16 {
+ match self {
+ BigEndian => i16::from_be_bytes(bytes),
+ LittleEndian => i16::from_le_bytes(bytes)
+ }
+ }
+}
+impl Parse<i8, 1> for Endianness {
+ fn parse(self, bytes: [u8; 1]) -> i8 {
+ match self {
+ BigEndian => i8::from_be_bytes(bytes),
+ LittleEndian => i8::from_le_bytes(bytes)
+ }
+ }
+}
+impl Parse<f64, 8> for Endianness {
+ fn parse(self, bytes: [u8; 8]) -> f64 {
+ match self {
+ BigEndian => f64::from_be_bytes(bytes),
+ LittleEndian => f64::from_le_bytes(bytes)
+ }
+ }
+}
+
+fn read_bytes<const N: usize>(r: &mut BufReader<File>) -> Result<[u8; N]> {
+ let mut buf = [0; N];
+ r.read_exact(&mut buf)?;
+ Ok(buf)
+}
+
+fn read_vec(r: &mut BufReader<File>, n: usize) -> Result<Vec<u8>> {
+ let mut vec = Vec::with_capacity(n);
+ vec.resize(n, 0);
+ r.read_exact(&mut vec)?;
+ Ok(vec)
+}
+
+trait ReadSwap<T> {
+ fn read_swap(&mut self) -> Result<T>;
+}
+
+impl ReadSwap<u32> for Dissector {
+ fn read_swap(&mut self) -> Result<u32> {
+ Ok(self.endianness.parse(read_bytes(&mut self.r)?))
+ }
+}
+impl ReadSwap<u8> for Dissector {
+ fn read_swap(&mut self) -> Result<u8> {
+ Ok(self.endianness.parse(read_bytes(&mut self.r)?))
+ }
+}
+
+impl ReadSwap<i32> for Dissector {
+ fn read_swap(&mut self) -> Result<i32> {
+ Ok(self.endianness.parse(read_bytes(&mut self.r)?))
+ }
+}
+
+impl ReadSwap<f64> for Dissector {
+ fn read_swap(&mut self) -> Result<f64> {
+ Ok(self.endianness.parse(read_bytes(&mut self.r)?))
+ }
+}
+
+struct Dissector {
+ filename: String,
+ r: BufReader<File>,
+ compression: Option<Compression>,
+ endianness: Endianness,
+ fp_format: Endianness,
+ bias: f64,
+ n_variable_records: usize,
+ n_variables: usize,
+ var_widths: Vec<i32>,
+}
+
+fn detect_endianness(layout_code: [u8; 4]) -> Option<Endianness> {
+ for endianness in [BigEndian, LittleEndian] {
+ match endianness.parse(layout_code) {
+ 2 | 3 => return Some(endianness),
+ _ => ()
+ }
+ }
+ None
+}
+
+fn detect_fp_format(bias: [u8; 8]) -> Option<Endianness> {
+ for endianness in [BigEndian, LittleEndian] {
+ let value: f64 = endianness.parse(bias);
+ if value == 100.0 {
+ return Some(endianness)
+ }
+ }
+ None
+}
+
/// Returns `s` with every trailing occurrence of the byte `c` removed.
///
/// Occurrences of `c` that are followed by some other byte are left in place.
fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
    // Index just past the last byte that differs from `c`; 0 if there is no such byte.
    let kept = s.iter().rposition(|&byte| byte != c).map_or(0, |last| last + 1);
    s.truncate(kept);
    s
}
+
/// Maps a numeric format type code to its name.
///
/// Returns `"invalid"` for any code that is not in the table.
fn format_name(type_: u32) -> &'static str {
    // (code, name) pairs; codes absent from this table have no name.
    const FORMAT_NAMES: &[(u32, &str)] = &[
        (1, "A"),
        (2, "AHEX"),
        (3, "COMMA"),
        (4, "DOLLAR"),
        (5, "F"),
        (6, "IB"),
        (7, "PIBHEX"),
        (8, "P"),
        (9, "PIB"),
        (10, "PK"),
        (11, "RB"),
        (12, "RBHEX"),
        (15, "Z"),
        (16, "N"),
        (17, "E"),
        (20, "DATE"),
        (21, "TIME"),
        (22, "DATETIME"),
        (23, "ADATE"),
        (24, "JDATE"),
        (25, "DTIME"),
        (26, "WKDAY"),
        (27, "MONTH"),
        (28, "MOYR"),
        (29, "QYR"),
        (30, "WKYR"),
        (31, "PCT"),
        (32, "DOT"),
        (33, "CCA"),
        (34, "CCB"),
        (35, "CCC"),
        (36, "CCD"),
        (37, "CCE"),
        (38, "EDATE"),
        (39, "SDATE"),
        (40, "MTIME"),
        (41, "YMDHMS"),
    ];

    FORMAT_NAMES
        .iter()
        .find(|&&(code, _)| code == type_)
        .map_or("invalid", |&(_, name)| name)
}
+
+fn round_up<T: Num + Copy>(x: T, y: T) -> T
+{
+ (x + (y - T::one())) / y * y
+}
+
// TODO: this `impl UntypedValue` block was left as an unfinished stub: `fn new(` has no
// parameter list or body, and no `UntypedValue` type is defined in the visible part of
// this file. It is disabled so that the file parses; restore it once the type and its
// constructor are actually written.
//
// impl UntypedValue {
//     fn new(
// }
+
+impl Dissector {
+ fn new<P: AsRef<Path>>(filename: P) -> Result<Dissector> {
+ let mut r = BufReader::new(File::open(&filename)?);
+ let filename = filename.as_ref().to_string_lossy().into_owned();
+ let rec_type: [u8; 4] = read_bytes(&mut r)?;
+ let zmagic = match &rec_type {
+ b"$FL2" => false,
+ b"$FL3" => true,
+ _ => Err(anyhow!("This is not an SPSS system file."))?
+ };
+
+ let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
+ let layout_code: [u8; 4] = read_bytes(&mut r)?;
+ let endianness = detect_endianness(layout_code)
+ .ok_or_else(|| anyhow!("This is not an SPSS system file."))?;
+ let layout_code: u32 = endianness.parse(layout_code);
+ let _nominal_case_size: [u8; 4] = read_bytes(&mut r)?;
+ let compressed: u32 = endianness.parse(read_bytes(&mut r)?);
+ let compression = match (zmagic, compressed) {
+ (false, 0) => None,
+ (false, 1) => Some(Compression::Simple),
+ (true, 2) => Some(Compression::ZLib),
+ _ => Err(anyhow!("{} file header has invalid compression value {compressed}.",
+ if zmagic { "ZSAV" } else { "SAV" }))?,
+ };
+
+ let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
+ let n_cases: u32 = endianness.parse(read_bytes(&mut r)?);
+
+ let bias: [u8; 8] = read_bytes(&mut r)?;
+ let fp_format = detect_fp_format(bias)
+ .unwrap_or_else(|| { eprintln!("Compression bias is not the usual value of 100, or system file uses unrecognized floating-point format."); endianness });
+ let bias: f64 = fp_format.parse(bias);
+
+ let mut d = Dissector {
+ filename,
+ r,
+ compression,
+ endianness,
+ fp_format,
+ bias,
+ n_variable_records: 0,
+ n_variables: 0,
+ var_widths: Vec::new(),
+ };
+
+ let creation_date: [u8; 9] = read_bytes(&mut d.r)?;
+ let creation_time: [u8; 8] = read_bytes(&mut d.r)?;
+ let file_label: [u8; 64] = read_bytes(&mut d.r)?;
+ let mut file_label = trim_end(Vec::from(file_label), b' ');
+ d.r.seek_relative(3)?;
+
+ println!("File header record:");
+ println!("{:>17}: {}", "Product name", String::from_utf8_lossy(&eye_catcher));
+ println!("{:>17}: {}", "Layout code", layout_code);
+ println!("{:>17}: {} ({})", "Compressed", compressed, match compression {
+ None => "no compression",
+ Some(Compression::Simple) => "simple compression",
+ Some(Compression::ZLib) => "ZLIB compression",
+ });
+ println!("{:>17}: {}", "Weight index", weight_index);
+ println!("{:>17}: {}", "Number of cases", n_cases);
+ println!("{:>17}: {}", "Compression bias", bias);
+ println!("{:>17}: {}", "Creation date", String::from_utf8_lossy(&creation_date));
+ println!("{:>17}: {}", "Creation time", String::from_utf8_lossy(&creation_time));
+ println!("{:>17}: \"{}\"", "File label", String::from_utf8_lossy(&file_label));
+
+ loop {
+ let rec_type: u32 = d.read_swap()?;
+ match rec_type {
+ 2 => d.read_variable_record()?,
+ 3 => d.read_value_label_record()?,
+ 4 => Err(anyhow!("Misplaced type 4 record."))?,
+ 999 => break,
+ _ => Err(anyhow!("Unrecognized record type {rec_type}."))?
+ }
+ }
+
+ let pos = d.r.stream_position()?;
+ println!("{:08x}: end-of-dictionary record (first byte of data at {:0x})", pos, pos + 4);
+
+ Ok(d)
+ }
+
+ fn read_variable_record(&mut self) -> Result<()> {
+ self.n_variable_records += 1;
+ println!("{:08x}: variable record {}", self.r.stream_position()?, self.n_variable_records);
+ let width: i32 = self.read_swap()?;
+ let has_variable_label: u32 = self.read_swap()?;
+ let missing_value_code: i32 = self.read_swap()?;
+ let print_format: u32 = self.read_swap()?;
+ let write_format: u32 = self.read_swap()?;
+ let name: [u8; 8] = read_bytes(&mut self.r)?;
+ let name: Vec<u8> = trim_end(Vec::from(name), b'\0');
+
+ if width >= 0 {
+ self.n_variables += 1;
+ }
+ self.var_widths.push(width);
+
+ println!("\tWidth: {width} ({})", match width {
+ _ if width > 0 => "string",
+ _ if width == 0 => "numeric",
+ _ => "long string continuation record"
+ });
+
+ println!("\tVariable label: {has_variable_label}");
+ println!("\tMissing values code: {missing_value_code} ({})",
+ match missing_value_code {
+ 0 => "no missing values",
+ 1 => "one missing value",
+ 2 => "two missing values",
+ 3 => "three missing values",
+ -2 => "one missing value range",
+ -3 => "one missing value, one range",
+ _ => "bad value"
+ });
+ for (which, format) in [("Print", print_format),
+ ("Worite", write_format)] {
+ let type_ = format_name(format >> 16);
+ let w = (format >> 8) & 0xff;
+ let d = format & 0xff;
+ println!("\t{which} format: {format:06x} ({type_}{w}.{d})");
+ }
+ println!("\tName: {}", String::from_utf8_lossy(&name));
+
+ // Read variable label.
+ match has_variable_label {
+ 0 => (),
+ 1 => {
+ let offset = self.r.stream_position()?;
+ let len: u32 = self.read_swap()?;
+ let read_len = len.min(65535) as usize;
+ let label = read_vec(&mut self.r, read_len)?;
+ println!("\t{offset:08x} Variable label: \"{}\"", String::from_utf8_lossy(&label));
+
+ self.r.seek_relative((round_up(len, 4) - len).into())?;
+ },
+ _ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?,
+ };
+
+ // Read missing values.
+ if missing_value_code != 0 {
+ print!("\t{:08x} Missing values:", self.r.stream_position()?);
+ if width == 0 {
+ let (has_range, n_individual) = match missing_value_code {
+ -3 => (true, 1),
+ -2 => (true, 0),
+ 1 | 2 | 3 => (false, missing_value_code),
+ _ => Err(anyhow!("Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."))?,
+ };
+ if has_range {
+ let low: f64 = self.read_swap()?;
+ let high: f64 = self.read_swap()?;
+ print!(" {low}...{high}");
+ }
+ for _i in 0..n_individual {
+ let value: f64 = self.read_swap()?;
+ print!(" {value}");
+ }
+ } else if width > 0 {
+ if missing_value_code < 1 || missing_value_code > 3 {
+ Err(anyhow!("String missing value indicator field is not 0, 1, 2, or 3."))?;
+ }
+ for _i in 0..missing_value_code {
+ let string: [u8; 8] = read_bytes(&mut self.r)?;
+ let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
+ println!(" {}", String::from_utf8_lossy(&string));
+ }
+ }
+ println!();
+ }
+
+ Ok(())
+ }
+
+ fn read_value_label_record(&mut self) -> Result<()> {
+ println!("{:08x}: value labels record", self.r.stream_position()?);
+
+ let n_labels: u32 = self.read_swap()?;
+ for _i in 0..n_labels {
+ let raw: [u8; 8] = read_bytes(&mut self.r)?;
+ let label_len: u8 = self.read_swap()?;
+ let padded_len = round_up(label_len as usize + 1, 8);
+
+ let mut label = read_vec(&mut self.r, padded_len)?;
+ label.truncate(label_len as usize);
+ print
+ }
+
+ Ok(())
+ }
+}