/* PSPP - a program for statistical analysis.
- Copyright (C) 2023 Free Software Foundation, Inc.
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>. */
+ * Copyright (C) 2023 Free Software Foundation, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>. */
use anyhow::{anyhow, Result};
use clap::Parser;
+use hexplay::HexView;
use hexplay::HexViewBuilder;
-use num::{Float, Num};
-use std::{fmt, num::FpCategory};
+use num::Num;
+use std::cmp::Ordering;
+use std::collections::VecDeque;
+use std::fmt;
use std::fs::File;
use std::io::prelude::*;
use std::io::BufReader;
+use std::io::ErrorKind;
use std::path::{Path, PathBuf};
+use std::str;
+
+mod hexfloat;
+use hexfloat::HexFloat;
+
+const ID_MAX_LEN: u32 = 64;
/// A utility to dissect SPSS system files.
#[derive(Parser, Debug)]
/// Files to dissect.
#[arg(required = true)]
- files: Vec<PathBuf>
+ files: Vec<PathBuf>,
}
fn main() -> Result<()> {
let Args { max_cases, files } = Args::parse();
for file in files {
- Dissector::new(file)?;
+ Dissector::new(file, max_cases)?;
}
Ok(())
}
#[derive(Copy, Clone, Debug)]
enum Compression {
Simple,
- ZLib
+ ZLib,
}
#[derive(Copy, Clone, Debug)]
enum Endianness {
BigEndian,
- LittleEndian
+ LittleEndian,
}
use Endianness::*;
fn parse(self, bytes: [u8; 8]) -> u64 {
match self {
BigEndian => u64::from_be_bytes(bytes),
- LittleEndian => u64::from_le_bytes(bytes)
+ LittleEndian => u64::from_le_bytes(bytes),
}
}
}
fn parse(self, bytes: [u8; 4]) -> u32 {
match self {
BigEndian => u32::from_be_bytes(bytes),
- LittleEndian => u32::from_le_bytes(bytes)
+ LittleEndian => u32::from_le_bytes(bytes),
}
}
}
fn parse(self, bytes: [u8; 2]) -> u16 {
match self {
BigEndian => u16::from_be_bytes(bytes),
- LittleEndian => u16::from_le_bytes(bytes)
+ LittleEndian => u16::from_le_bytes(bytes),
}
}
}
fn parse(self, bytes: [u8; 1]) -> u8 {
match self {
BigEndian => u8::from_be_bytes(bytes),
- LittleEndian => u8::from_le_bytes(bytes)
+ LittleEndian => u8::from_le_bytes(bytes),
}
}
}
fn parse(self, bytes: [u8; 8]) -> i64 {
match self {
BigEndian => i64::from_be_bytes(bytes),
- LittleEndian => i64::from_le_bytes(bytes)
+ LittleEndian => i64::from_le_bytes(bytes),
}
}
}
fn parse(self, bytes: [u8; 4]) -> i32 {
match self {
BigEndian => i32::from_be_bytes(bytes),
- LittleEndian => i32::from_le_bytes(bytes)
+ LittleEndian => i32::from_le_bytes(bytes),
}
}
}
fn parse(self, bytes: [u8; 2]) -> i16 {
match self {
BigEndian => i16::from_be_bytes(bytes),
- LittleEndian => i16::from_le_bytes(bytes)
+ LittleEndian => i16::from_le_bytes(bytes),
}
}
}
fn parse(self, bytes: [u8; 1]) -> i8 {
match self {
BigEndian => i8::from_be_bytes(bytes),
- LittleEndian => i8::from_le_bytes(bytes)
+ LittleEndian => i8::from_le_bytes(bytes),
}
}
}
fn parse(self, bytes: [u8; 8]) -> f64 {
match self {
BigEndian => f64::from_be_bytes(bytes),
- LittleEndian => f64::from_le_bytes(bytes)
+ LittleEndian => f64::from_le_bytes(bytes),
}
}
}
}
fn read_vec(r: &mut BufReader<File>, n: usize) -> Result<Vec<u8>> {
- let mut vec = Vec::with_capacity(n);
- vec.resize(n, 0);
+ let mut vec = vec![0; n];
r.read_exact(&mut vec)?;
Ok(vec)
-}
+}
/// Reads a single value of type `T` from the implementor.
///
/// The `Dissector` implementations in this file fetch the raw bytes from the
/// underlying reader and decode them with `self.endianness.parse`.
trait ReadSwap<T> {
// Returns the decoded value, or an error if it could not be read.
fn read_swap(&mut self) -> Result<T>;
}
+impl ReadSwap<u8> for Dissector {
+ fn read_swap(&mut self) -> Result<u8> {
+ Ok(self.endianness.parse(read_bytes(&mut self.r)?))
+ }
+}
impl ReadSwap<u32> for Dissector {
fn read_swap(&mut self) -> Result<u32> {
Ok(self.endianness.parse(read_bytes(&mut self.r)?))
}
}
-impl ReadSwap<u8> for Dissector {
- fn read_swap(&mut self) -> Result<u8> {
+impl ReadSwap<u64> for Dissector {
+ fn read_swap(&mut self) -> Result<u64> {
Ok(self.endianness.parse(read_bytes(&mut self.r)?))
}
}
struct Dissector {
filename: String,
r: BufReader<File>,
- compression: Option<Compression>,
endianness: Endianness,
fp_format: Endianness,
bias: f64,
for endianness in [BigEndian, LittleEndian] {
match endianness.parse(layout_code) {
2 | 3 => return Some(endianness),
- _ => ()
+ _ => (),
}
}
None
for endianness in [BigEndian, LittleEndian] {
let value: f64 = endianness.parse(bias);
if value == 100.0 {
- return Some(endianness)
+ return Some(endianness);
}
}
None
s
}
-fn slice_trim_end(mut s: &[u8], c: u8) -> &[u8] {
- while s.last() == Some(&c) {
- s = s.split_last().unwrap().1;
- }
- s
-}
-
fn format_name(type_: u32) -> &'static str {
match type_ {
1 => "A",
39 => "SDATE",
40 => "MTIME",
41 => "YMDHMS",
- _ => "invalid"
+ _ => "invalid",
}
}
-fn round_up<T: Num + Copy>(x: T, y: T) -> T
-{
+fn round_up<T: Num + Copy>(x: T, y: T) -> T {
(x + (y - T::one())) / y * y
}
struct UntypedValue {
raw: [u8; 8],
- endianness: Endianness
+ endianness: Endianness,
}
impl UntypedValue {
impl fmt::Display for UntypedValue {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let numeric: f64 = self.endianness.parse(self.raw);
- let n_printable = self.raw.iter().take_while(|&&x| x == b' ' || x.is_ascii_graphic()).count();
+ let n_printable = self
+ .raw
+ .iter()
+ .take_while(|&&x| x == b' ' || x.is_ascii_graphic())
+ .count();
let printable_prefix = std::str::from_utf8(&self.raw[0..n_printable]).unwrap();
write!(f, "{numeric}/\"{printable_prefix}\"")
}
}
-struct HexFloat<T: Float>(T);
-
-impl<T: Float> fmt::Display for HexFloat<T> {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- let sign = if self.0.is_sign_negative() { "-" } else { "" };
- match self.0.classify() {
- FpCategory::Nan => return write!(f, "NaN"),
- FpCategory::Infinite => return write!(f, "{sign}Infinity"),
- FpCategory::Zero => return write!(f, "{sign}0.0"),
- _ => (),
- };
- let (significand, mut exponent, _) = self.0.integer_decode();
- let mut hex_sig = format!("{:x}", significand);
- while hex_sig.ends_with('0') {
- hex_sig.pop();
- exponent += 4;
- }
- match hex_sig.len() {
- 0 => write!(f, "{sign}0.0"),
- 1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
- len => write!(f, "{sign}0x{}.{}p{}",
- hex_sig.chars().nth(0).unwrap(),
- &hex_sig[1..],
- exponent + 4 * (len as i16 - 1))
- }
- }
-}
-
-#[cfg(test)]
-mod hex_float_tests {
- use crate::HexFloat;
- use num::Float;
-
- #[test]
- fn test() {
- assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
- assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
- assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
- assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
- assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
- assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
- assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
- assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
- }
-}
-
impl Dissector {
- fn new<P: AsRef<Path>>(filename: P) -> Result<Dissector> {
+ fn new<P: AsRef<Path>>(filename: P, max_cases: usize) -> Result<Dissector> {
let mut r = BufReader::new(File::open(&filename)?);
let filename = filename.as_ref().to_string_lossy().into_owned();
let rec_type: [u8; 4] = read_bytes(&mut r)?;
let zmagic = match &rec_type {
b"$FL2" => false,
b"$FL3" => true,
- _ => Err(anyhow!("This is not an SPSS system file."))?
+ _ => Err(anyhow!("This is not an SPSS system file."))?,
};
let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
(false, 0) => None,
(false, 1) => Some(Compression::Simple),
(true, 2) => Some(Compression::ZLib),
- _ => Err(anyhow!("{} file header has invalid compression value {compressed}.",
- if zmagic { "ZSAV" } else { "SAV" }))?,
+ _ => Err(anyhow!(
+ "{} file header has invalid compression value {compressed}.",
+ if zmagic { "ZSAV" } else { "SAV" }
+ ))?,
};
let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
let mut d = Dissector {
filename,
r,
- compression,
endianness,
fp_format,
bias,
d.skip_bytes(3)?;
println!("File header record:");
- println!("{:>17}: {}", "Product name", String::from_utf8_lossy(&eye_catcher));
+ println!(
+ "{:>17}: {}",
+ "Product name",
+ String::from_utf8_lossy(&eye_catcher)
+ );
println!("{:>17}: {}", "Layout code", layout_code);
- println!("{:>17}: {} ({})", "Compressed", compressed, match compression {
- None => "no compression",
- Some(Compression::Simple) => "simple compression",
- Some(Compression::ZLib) => "ZLIB compression",
- });
+ println!(
+ "{:>17}: {} ({})",
+ "Compressed",
+ compressed,
+ match compression {
+ None => "no compression",
+ Some(Compression::Simple) => "simple compression",
+ Some(Compression::ZLib) => "ZLIB compression",
+ }
+ );
println!("{:>17}: {}", "Weight index", weight_index);
println!("{:>17}: {}", "Number of cases", n_cases);
println!("{:>17}: {}", "Compression bias", bias);
- println!("{:>17}: {}", "Creation date", String::from_utf8_lossy(&creation_date));
- println!("{:>17}: {}", "Creation time", String::from_utf8_lossy(&creation_time));
- println!("{:>17}: \"{}\"", "File label", String::from_utf8_lossy(&file_label));
+ println!(
+ "{:>17}: {}",
+ "Creation date",
+ String::from_utf8_lossy(&creation_date)
+ );
+ println!(
+ "{:>17}: {}",
+ "Creation time",
+ String::from_utf8_lossy(&creation_time)
+ );
+ println!(
+ "{:>17}: \"{}\"",
+ "File label",
+ String::from_utf8_lossy(&file_label)
+ );
loop {
let rec_type: u32 = d.read_swap()?;
6 => d.read_document_record()?,
7 => d.read_extension_record()?,
999 => break,
- _ => Err(anyhow!("Unrecognized record type {rec_type}."))?
+ _ => Err(anyhow!("Unrecognized record type {rec_type}."))?,
}
}
let pos = d.r.stream_position()?;
- println!("{:08x}: end-of-dictionary record (first byte of data at {:0x})", pos, pos + 4);
+ println!(
+ "{:08x}: end-of-dictionary record (first byte of data at {:0x})",
+ pos,
+ pos + 4
+ );
+
+ match compression {
+ Some(Compression::Simple) => {
+ if max_cases > 0 {
+ d.read_simple_compressed_data(max_cases)?;
+ }
+ }
+ Some(Compression::ZLib) => d.read_zlib_compressed_data()?,
+ None => (),
+ }
Ok(d)
}
+ /// Dumps up to `max_cases` cases of simple-compressed data.
+ ///
+ /// One 32-bit field is read and discarded first.  After that the data is
+ /// consumed as groups of `N_OPCODES` opcode bytes; each opcode describes
+ /// one value of the current case, and opcode 253 is followed by 8 bytes of
+ /// literal data.  Opcodes are consumed in file order, so the printed offset
+ /// of each opcode is `opcode_ofs` plus its index within its group.
+ ///
+ /// Returns `Ok(())` on the end-of-data opcode (252), on a clean end of file
+ /// at a case boundary, or after `max_cases` cases.  Any other read failure
+ /// is returned as an error.
+ fn read_simple_compressed_data(&mut self, max_cases: usize) -> Result<()> {
+     let _: i32 = self.read_swap()?;
+     println!("\n{:08x}: compressed data:", self.r.stream_position()?);
+
+     const N_OPCODES: usize = 8;
+     let mut opcodes = VecDeque::<u8>::with_capacity(N_OPCODES);
+     let mut opcode_ofs = 0;
+     for case_num in 0..max_cases {
+         println!(
+             "{:08x}: case {case_num}'s uncompressible data begins",
+             self.r.stream_position()?
+         );
+         let mut i = 0;
+         while i < self.var_widths.len() {
+             let width = self.var_widths[i];
+
+             // Index of the next opcode within its group: 0 when the queue
+             // is full, N_OPCODES - 1 when one opcode is left.
+             let opcode_idx = N_OPCODES - opcodes.len();
+             // Take opcodes from the front so they are handled in the order
+             // they appear in the file, matching `opcode_idx` above.
+             let Some(opcode) = opcodes.pop_front() else {
+                 opcode_ofs = self.r.stream_position()?;
+                 let mut new_opcodes = [0; N_OPCODES];
+                 if let Err(error) = self.r.read_exact(&mut new_opcodes) {
+                     // Running out of data is only acceptable exactly at
+                     // the start of a case.
+                     if i == 0 && error.kind() == ErrorKind::UnexpectedEof {
+                         return Ok(());
+                     } else {
+                         return Err(error.into());
+                     }
+                 };
+                 opcodes.extend(new_opcodes);
+                 continue;
+             };
+
+             print!(
+                 "{:08x}: variable {i}: opcode {opcode}: ",
+                 opcode_ofs + opcode_idx as u64
+             );
+             match opcode {
+                 // Padding does not stand for a value, so `i` stays put.
+                 0 => println!("ignored padding"),
+                 252 => {
+                     println!("end of data");
+                     // Nothing follows the end-of-data opcode, so stop
+                     // dumping instead of starting another case.
+                     return Ok(());
+                 }
+                 253 => {
+                     let raw: [u8; 8] = read_bytes(&mut self.r)?;
+                     let value = UntypedValue::new(raw, self.fp_format);
+                     println!("uncompressible data: {value}");
+                     i += 1;
+                 }
+                 254 => {
+                     print!("spaces");
+                     if width == 0 {
+                         print!(", but this is a numeric variable");
+                     }
+                     println!();
+                     i += 1;
+                 }
+                 255 => {
+                     print!("SYSMIS");
+                     if width != 0 {
+                         print!(", but this is a string variable (width={width})");
+                     }
+                     println!();
+                     i += 1;
+                 }
+                 _ => {
+                     // Any other opcode is a number stored as opcode - bias.
+                     print!("{}", opcode as f64 - self.bias);
+                     if width != 0 {
+                         print!(", but this is a string variable (width={width})");
+                     }
+                     println!();
+                     i += 1;
+                 }
+             }
+         }
+     }
+     Ok(())
+ }
+
+ /// Dumps the framing of ZLIB-compressed data: the three-field data header,
+ /// the size of the compressed payload (which is skipped, not inflated),
+ /// the fixed trailer header and every block descriptor in the trailer.
+ ///
+ /// Each value that disagrees with what the preceding fields imply gets an
+ /// "(Expected ...)" note.  Returns an error on read failure, or when the
+ /// trailer offset lies before the start of the compressed payload.
+ fn read_zlib_compressed_data(&mut self) -> Result<()> {
+     let _: i32 = self.read_swap()?;
+     let ofs = self.r.stream_position()?;
+     println!("\n{ofs:08x}: ZLIB compressed data header:");
+
+     let this_ofs: u64 = self.read_swap()?;
+     let next_ofs: u64 = self.read_swap()?;
+     let next_len: u64 = self.read_swap()?;
+
+     println!("\theader_ofs: {this_ofs:#x}");
+     if this_ofs != ofs {
+         println!("\t\t(Expected {ofs:#x}.)");
+     }
+     println!("\ttrailer_ofs: {next_ofs:#x}");
+     println!("\ttrailer_len: {next_len}");
+     if next_len < 24 || next_len % 24 != 0 {
+         println!("\t\t(Trailer length is not positive multiple of 24.)");
+     }
+
+     // The payload starts right after the three 8-byte header fields.
+     let data_start = ofs + 8 * 3;
+     // A trailer offset before the payload would underflow the subtraction.
+     let Some(zlib_data_len) = next_ofs.checked_sub(data_start) else {
+         return Err(anyhow!(
+             "ZLIB trailer offset {next_ofs:#x} precedes the start of compressed data at {data_start:#x}."
+         ));
+     };
+     println!("\n{data_start:08x}: {zlib_data_len:#x} bytes of ZLIB compressed data");
+
+     self.skip_bytes(zlib_data_len)?;
+
+     println!("\n{next_ofs:08x}: ZLIB trailer fixed header");
+     let bias: u64 = self.read_swap()?;
+     let zero: u64 = self.read_swap()?;
+     let block_size: u32 = self.read_swap()?;
+     let n_blocks: u32 = self.read_swap()?;
+     println!("\tbias: {bias}");
+     println!("\tzero: {zero:#x}");
+     if zero != 0 {
+         println!("\t\t(Expected 0.)");
+     }
+     println!("\tblock size: {block_size:#x}");
+     if block_size != 0x3ff000 {
+         println!("\t\t(Expected 0x3ff000.)");
+     }
+     println!("\tn_blocks: {n_blocks}");
+     // The trailer is one 24-byte fixed header plus 24 bytes per block.
+     // Saturate so a trailer shorter than 24 bytes cannot underflow.
+     let expected_n_blocks = (next_len / 24).saturating_sub(1);
+     if n_blocks as u64 != expected_n_blocks {
+         println!("\t\t(Expected {expected_n_blocks}.)");
+     }
+
+     let mut expected_uncmp_ofs = ofs;
+     let mut expected_cmp_ofs = ofs + 24;
+     for i in 1..=n_blocks {
+         let blockinfo_ofs = self.r.stream_position()?;
+         let uncompressed_ofs: u64 = self.read_swap()?;
+         let compressed_ofs: u64 = self.read_swap()?;
+         let uncompressed_size: u32 = self.read_swap()?;
+         let compressed_size: u32 = self.read_swap()?;
+
+         println!("\n{blockinfo_ofs:08x}: ZLIB block descriptor {i}");
+
+         println!("\tuncompressed_ofs: {uncompressed_ofs:#x}");
+         if uncompressed_ofs != expected_uncmp_ofs {
+             println!("\t\t(Expected {expected_uncmp_ofs:#x}.)");
+         }
+
+         println!("\tcompressed_ofs: {compressed_ofs:#x}");
+         if compressed_ofs != expected_cmp_ofs {
+             println!("\t\t(Expected {expected_cmp_ofs:#x}.)");
+         }
+
+         println!("\tuncompressed_size: {uncompressed_size:#x}");
+         if i < n_blocks && uncompressed_size != block_size {
+             println!("\t\t(Expected {block_size:#x}.)");
+         }
+
+         println!("\tcompressed_size: {compressed_size:#x}");
+         if i == n_blocks && compressed_ofs.checked_add(compressed_size as u64) != Some(next_ofs)
+         {
+             // NOTE(review): this prints `next_ofs - compressed_size`, which
+             // is an offset although the note follows a size; confirm which
+             // value was intended.  `wrapping_sub` keeps a bogus size from
+             // panicking.
+             println!(
+                 "\t\t(This was expected to be {:#x}.)",
+                 next_ofs.wrapping_sub(compressed_size as u64)
+             );
+         }
+
+         // Each running offset advances by the size of its own stream.
+         expected_uncmp_ofs += uncompressed_size as u64;
+         expected_cmp_ofs += compressed_size as u64;
+     }
+     Ok(())
+ }
+
+ /// Reads the subtype, element size and element count of a type-7
+ /// extension record, prints them, and hands the payload to the reader for
+ /// that subtype.  `size * count` is rejected if it does not fit in `u32`,
+ /// so the per-subtype readers may multiply the two safely.
fn read_extension_record(&mut self) -> Result<()> {
let offset = self.r.stream_position()?;
let subtype: u32 = self.read_swap()?;
let size: u32 = self.read_swap()?;
let count: u32 = self.read_swap()?;
println!("{offset:08x}: Record 7, subtype {subtype}, size={size}, count={count}");
+     if size.checked_mul(count).is_none() {
+         Err(anyhow!("{size} * {count} exceeds {}", u32::MAX))?
+     }
match subtype {
3 => self.read_machine_integer_info(size, count),
4 => self.read_machine_float_info(size, count),
+         5 => self.read_variable_sets(size, count),
+         6 => {
+             // DATE variable information.  We don't use it yet, but we should.
+             // The payload still has to be consumed, or the next record
+             // type would be read from the middle of this record.
+             self.skip_bytes(u64::from(size) * u64::from(count))?;
+             Ok(())
+         }
+         7 | 19 => self.read_mrsets(size, count),
+         10 => self.read_extra_product_info(size, count),
+         11 => self.read_display_parameters(size, count),
+         13 => self.read_long_var_name_map(size, count),
+         14 => self.read_long_string_map(size, count),
+         16 => self.read_ncases64(size, count),
+         17 => self.read_datafile_attributes(size, count),
+         18 => self.read_variable_attributes(size, count),
+         20 => self.read_character_encoding(size, count),
+         21 => self.read_long_string_value_labels(size, count),
+         22 => self.read_long_string_missing_values(size, count),
_ => self.read_unknown_extension(subtype, size, count),
}
}
fn warn(&mut self, s: String) -> Result<()> {
- println!("\"{}\" near offset 0x{:08x}: {s}", self.filename, self.r.stream_position()?);
+ println!(
+ "\"{}\" near offset 0x{:08x}: {s}",
+ self.filename,
+ self.r.stream_position()?
+ );
Ok(())
}
let mut offset = 0;
for _ in 0..count {
let vec = read_vec(&mut self.r, size as usize)?;
- println!("{}", HexViewBuilder::new(&vec).address_offset(offset).finish());
+ println!(
+ "{}",
+ HexViewBuilder::new(&vec).address_offset(offset).finish()
+ );
offset += size as usize;
}
}
fn read_variable_record(&mut self) -> Result<()> {
self.n_variable_records += 1;
- println!("{:08x}: variable record {}", self.r.stream_position()?, self.n_variable_records);
+ println!(
+ "{:08x}: variable record {}",
+ self.r.stream_position()?,
+ self.n_variable_records
+ );
let width: i32 = self.read_swap()?;
let has_variable_label: u32 = self.read_swap()?;
let missing_value_code: i32 = self.read_swap()?;
}
self.var_widths.push(width);
- println!("\tWidth: {width} ({})", match width {
- _ if width > 0 => "string",
- _ if width == 0 => "numeric",
- _ => "long string continuation record"
- });
+ println!(
+ "\tWidth: {width} ({})",
+ match width {
+ _ if width > 0 => "string",
+ _ if width == 0 => "numeric",
+ _ => "long string continuation record",
+ }
+ );
println!("\tVariable label: {has_variable_label}");
- println!("\tMissing values code: {missing_value_code} ({})",
- match missing_value_code {
- 0 => "no missing values",
- 1 => "one missing value",
- 2 => "two missing values",
- 3 => "three missing values",
- -2 => "one missing value range",
- -3 => "one missing value, one range",
- _ => "bad value"
- });
- for (which, format) in [("Print", print_format),
- ("Worite", write_format)] {
+ println!(
+ "\tMissing values code: {missing_value_code} ({})",
+ match missing_value_code {
+ 0 => "no missing values",
+ 1 => "one missing value",
+ 2 => "two missing values",
+ 3 => "three missing values",
+ -2 => "one missing value range",
+ -3 => "one missing value, one range",
+ _ => "bad value",
+ }
+ );
+ for (which, format) in [("Print", print_format), ("Worite", write_format)] {
let type_ = format_name(format >> 16);
let w = (format >> 8) & 0xff;
let d = format & 0xff;
let len: u32 = self.read_swap()?;
let read_len = len.min(65535) as usize;
let label = read_vec(&mut self.r, read_len)?;
- println!("\t{offset:08x} Variable label: \"{}\"", String::from_utf8_lossy(&label));
+ println!(
+ "\t{offset:08x} Variable label: \"{}\"",
+ String::from_utf8_lossy(&label)
+ );
self.skip_bytes((round_up(len, 4) - len).into())?;
- },
+ }
_ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?,
};
// Read missing values.
if missing_value_code != 0 {
print!("\t{:08x} Missing values:", self.r.stream_position()?);
- if width == 0 {
- let (has_range, n_individual) = match missing_value_code {
- -3 => (true, 1),
- -2 => (true, 0),
- 1 | 2 | 3 => (false, missing_value_code),
- _ => Err(anyhow!("Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."))?,
- };
- if has_range {
- let low: f64 = self.read_swap()?;
- let high: f64 = self.read_swap()?;
- print!(" {low}...{high}");
- }
- for _ in 0..n_individual {
- let value: f64 = self.read_swap()?;
- print!(" {value}");
- }
- } else if width > 0 {
- if missing_value_code < 1 || missing_value_code > 3 {
- Err(anyhow!("String missing value indicator field is not 0, 1, 2, or 3."))?;
+ match width.cmp(&0) {
+ Ordering::Equal => {
+ let (has_range, n_individual) = match missing_value_code {
+ -3 => (true, 1),
+ -2 => (true, 0),
+ 1 | 2 | 3 => (false, missing_value_code),
+ _ => Err(anyhow!(
+ "Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."
+ ))?,
+ };
+ if has_range {
+ let low: f64 = self.read_swap()?;
+ let high: f64 = self.read_swap()?;
+ print!(" {low}...{high}");
+ }
+ for _ in 0..n_individual {
+ let value: f64 = self.read_swap()?;
+ print!(" {value}");
+ }
}
- for _ in 0..missing_value_code {
- let string: [u8; 8] = read_bytes(&mut self.r)?;
- let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
- println!(" {}", String::from_utf8_lossy(&string));
+ Ordering::Greater => {
+ if !(0..=3).contains(&missing_value_code) {
+ Err(anyhow!(
+ "String missing value indicator field is not 0, 1, 2, or 3."
+ ))?;
+ }
+ for _ in 0..missing_value_code {
+ let string: [u8; 8] = read_bytes(&mut self.r)?;
+ let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
+ println!(" {}", String::from_utf8_lossy(&string));
+ }
}
+ Ordering::Less => (),
}
println!();
}
// Read the type-4 record with the corresponding variable indexes.
let rec_type: u32 = self.read_swap()?;
if rec_type != 4 {
- Err(anyhow!("Variable index record (type 4) does not immediately \
- follow value label record (type 3) as it should."))?;
+ Err(anyhow!(
+ "Variable index record (type 4) does not immediately \
+ follow value label record (type 3) as it should."
+ ))?;
}
println!("\t{:08x}: apply to variables", self.r.stream_position()?);
println!("{offset:08x}: machine integer info");
if size != 4 || count != 8 {
- Err(anyhow!("Bad size ({size}) or count ({count}) field on record type 7, subtype 3"))?;
+ Err(anyhow!(
+ "Bad size ({size}) or count ({count}) field on record type 7, subtype 3"
+ ))?;
}
println!("\tVersion: {version_major}.{version_minor}.{version_revision}");
println!("\tMachine code: {machine_code}");
- println!("\tFloating point representation: {float_representation} ({})",
- match float_representation {
- 1 => "IEEE 754",
- 2 => "IBM 370",
- 3 => "DEC VAX",
- _ => "unknown"
- });
+ println!(
+ "\tFloating point representation: {float_representation} ({})",
+ match float_representation {
+ 1 => "IEEE 754",
+ 2 => "IBM 370",
+ 3 => "DEC VAX",
+ _ => "unknown",
+ }
+ );
println!("\tCompression code: {compression_code}");
- println!("\tEndianness: {integer_representation} ({})",
- match integer_representation {
- 1 => "big",
- 2 => "little",
- _ => "unknown"
- });
+ println!(
+ "\tEndianness: {integer_representation} ({})",
+ match integer_representation {
+ 1 => "big",
+ 2 => "little",
+ _ => "unknown",
+ }
+ );
println!("\tCharacter code: {character_code}");
Ok(())
}
println!("{offset:08x}: machine float info");
if size != 4 || count != 8 {
- Err(anyhow!("Bad size ({size}) or count ({count}) field on extension 4."))?;
+ Err(anyhow!(
+ "Bad size ({size}) or count ({count}) field on extension 4."
+ ))?;
}
println!("\tsysmis: {sysmis} ({})", HexFloat(sysmis));
continue;
}
let set = match text.tokenize(b'=') {
- Some(set) => String::from_utf8_lossy(&set).into_owned(),
+ Some(set) => String::from_utf8_lossy(set).into_owned(),
None => break,
};
match text.tokenize(b'\n') {
None => println!("\tset \"{set}\" is empty"),
Some(variables) => {
- println!("\tset \"{set}\" contains \"{}\"", String::from_utf8_lossy(variables).trim_end_matches('\r'));
- },
+ println!(
+ "\tset \"{set}\" contains \"{}\"",
+ String::from_utf8_lossy(variables).trim_end_matches('\r')
+ );
+ }
};
-
+ }
+ Ok(())
+ }
+
+ // Read record type 7, subtype 7 or 19 (multiple response sets).
+ //
+ // The record is text.  Each set is `NAME=` followed by a type letter:
+ // `C` (multiple category), `D` (multiple dichotomy) or `E` (multiple
+ // dichotomy with category labels taken from counted values, followed by a
+ // label-source number).  Dichotomy sets then carry a counted string for
+ // the counted value.  Every set ends with a counted string for the label
+ // and the variable list up to the end of the line.
+ fn read_mrsets(&mut self, size: u32, count: u32) -> Result<()> {
+     // End the header line so the first set starts on a line of its own.
+     println!("{:08x}: multiple response sets", self.r.stream_position()?);
+     let mut text = self.open_text_record(size, count)?;
+     loop {
+         #[derive(PartialEq, Eq)]
+         enum MrSet {
+             MC,
+             MD,
+         }
+
+         // Skip blank lines between sets.
+         while text.match_byte(b'\n') {}
+         let Some(name) = text.tokenize(b'=') else {
+             break;
+         };
+         let name = Vec::from(name);
+
+         let (mrset, cat_label_from_counted_values, label_from_var_label) = if text
+             .match_byte(b'C')
+         {
+             if !text.match_byte(b' ') {
+                 Err(anyhow!(
+                     "missing space following 'C' at offset {} in mrsets record",
+                     text.pos
+                 ))?;
+             }
+             (MrSet::MC, false, false)
+         } else if text.match_byte(b'D') {
+             (MrSet::MD, false, false)
+         } else if text.match_byte(b'E') {
+             if !text.match_byte(b' ') {
+                 Err(anyhow!(
+                     "missing space following 'E' at offset {} in mrsets record",
+                     text.pos
+                 ))?;
+             }
+
+             let pos = text.pos;
+             let Some(number) = text.tokenize(b' ') else {
+                 Err(anyhow!(
+                     "Missing label source value following `E' at offset {} in MRSETS record",
+                     text.pos
+                 ))?
+             };
+
+             let label_from_var_label = if number == b"11" {
+                 true
+             } else if number == b"1" {
+                 false
+             } else {
+                 Err(anyhow!(
+                     "Unexpected label source value `{}' following `E' at offset {pos} in MRSETS record",
+                     String::from_utf8_lossy(number)
+                 ))?
+             };
+             (MrSet::MD, true, label_from_var_label)
+         } else {
+             Err(anyhow!(
+                 "missing `C', `D', or `E' at offset {} in mrsets record",
+                 text.pos
+             ))?
+         };
+
+         // Only dichotomy sets carry a counted value.
+         let counted_value = if mrset == MrSet::MD {
+             Some(Vec::from(text.parse_counted_string()?))
+         } else {
+             None
+         };
+
+         let label = Vec::from(text.parse_counted_string()?);
+
+         let variables = text.tokenize(b'\n');
+
+         // Everything about one set goes on a single output line.
+         print!(
+             "\t\"{}\": multiple {} set",
+             String::from_utf8_lossy(&name),
+             if mrset == MrSet::MC {
+                 "category"
+             } else {
+                 "dichotomy"
+             }
+         );
+         if let Some(counted_value) = counted_value {
+             print!(
+                 ", counted value \"{}\"",
+                 String::from_utf8_lossy(&counted_value)
+             );
+         }
+         if cat_label_from_counted_values {
+             print!(", category labels from counted values");
+         }
+         if label != b"" {
+             print!(", label \"{}\"", String::from_utf8_lossy(&label));
+         }
+         if label_from_var_label {
+             print!(", label from variable label");
+         }
+         if let Some(variables) = variables {
+             print!(", variables \"{}\"", String::from_utf8_lossy(variables));
+         } else {
+             print!(", no variables");
+         }
+         println!();
}
Ok(())
}
/// Prints the record's offset and title, then the whole record payload via
/// `print_string`.
fn read_extra_product_info(&mut self, size: u32, count: u32) -> Result<()> {
// NOTE(review): `print!` leaves the payload on the same line as the title;
// confirm whether a newline was intended here.
print!("{:08x}: extra product info", self.r.stream_position()?);
+ let text = self.open_text_record(size, count)?;
+ print_string(&text.buffer);
+ Ok(())
+ }
+
+ /// Dumps the variable display parameters extension (subtype 11).
+ ///
+ /// Every element must be 4 bytes.  The record holds either three fields
+ /// per variable (measure, width, alignment) or two (measure, alignment);
+ /// which one is decided by comparing `count` with `self.n_variables`.
+ /// Any other size or count is an error.
+ fn read_display_parameters(&mut self, size: u32, count: u32) -> Result<()> {
+     let offset = self.r.stream_position()?;
+     println!("{offset:08x}: variable display parameters");
+     if size != 4 {
+         return Err(anyhow!("Bad size ({size}) on extension 11."));
+     }
+
+     let n_vars = self.n_variables;
+     let includes_width = match count as usize {
+         n if n == 3 * n_vars => true,
+         n if n == 2 * n_vars => false,
+         _ => {
+             return Err(anyhow!(
+                 "Extension 11 has bad count {count} (for {n_vars} variables)."
+             ))
+         }
+     };
+
+     for i in 0..n_vars {
+         let measure: u32 = self.read_swap()?;
+         let measure_name = match measure {
+             1 => "nominal",
+             2 => "ordinal",
+             3 => "scale",
+             _ => "invalid",
+         };
+         print!("\tVar #{i}: measure={measure} ({measure_name})");
+
+         // The width field is only present in the three-field layout.
+         if includes_width {
+             let width: u32 = self.read_swap()?;
+             print!(", width={width}");
+         }
+
+         let align: u32 = self.read_swap()?;
+         let align_name = match align {
+             0 => "left",
+             1 => "right",
+             2 => "centre",
+             _ => "invalid",
+         };
+         println!(", align={align} ({align_name})");
+     }
+     Ok(())
+ }
+
+ /// Dumps the long variable name map: a text record of `short=long`
+ /// pairs, printed one pair per line after the header.
+ fn read_long_var_name_map(&mut self, size: u32, count: u32) -> Result<()> {
+     // End the header line so the first pair starts on a line of its own.
+     println!(
+         "{:08x}: long variable names (short => long)",
+         self.r.stream_position()?
+     );
let mut text = self.open_text_record(size, count)?;
+     while let Some((var, long_name)) = text.read_variable_to_value_pair() {
+         println!(
+             "\t{} => {}",
+             String::from_utf8_lossy(&var),
+             String::from_utf8_lossy(&long_name)
+         );
+     }
+     Ok(())
+ }
+
+ /// Dumps the very long string map: a text record of `variable=length`
+ /// pairs, printed one pair per line after the header.
+ fn read_long_string_map(&mut self, size: u32, count: u32) -> Result<()> {
+     // End the header line so the first pair starts on a line of its own.
+     println!(
+         "{:08x}: very long strings (variable => length)",
+         self.r.stream_position()?
+     );
+     let mut text = self.open_text_record(size, count)?;
+     while let Some((var, length)) = text.read_variable_to_value_pair() {
+         println!(
+             "\t{} => {}",
+             String::from_utf8_lossy(&var),
+             String::from_utf8_lossy(&length)
+         );
+     }
+     Ok(())
+ }
+ /// Dumps the extended number-of-cases extension (subtype 16), which must
+ /// consist of exactly two 8-byte fields.  Any other size or count is an
+ /// error.
+ fn read_ncases64(&mut self, size: u32, count: u32) -> Result<()> {
+     // Take the offset before reading, so it names the start of the
+     // payload rather than its end.
+     let offset = self.r.stream_position()?;
+     if size != 8 {
+         return Err(anyhow!("Bad size {size} for extended number of cases."));
+     }
+     if count != 2 {
+         return Err(anyhow!("Bad count {count} for extended number of cases."));
+     }
+     let unknown: u64 = self.read_swap()?;
+     let ncases64: u64 = self.read_swap()?;
+     // End the line so the next record starts on a line of its own.
+     println!(
+         "{offset:08x}: extended number of cases: unknown={unknown}, ncases64={ncases64}"
+     );
+     Ok(())
}
- fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
- let n_bytes = match u32::checked_mul(size, count) {
- Some(n) => n,
- None => Err(anyhow!("Extension record too large."))?
+ /// Dumps one attribute set from `text`, labelling each line with
+ /// `variable`.
+ ///
+ /// A set is a sequence of `key(value\nvalue\n...)` groups and ends at a
+ /// `/` or at the end of the text.  Each value is expected to be wrapped
+ /// in single quotes; the quotes are removed for display, and an unquoted
+ /// value produces a warning.  Returns an error if a value is missing.
+ fn read_attributes(&mut self, text: &mut TextRecord, variable: &str) -> Result<()> {
+     loop {
+         let Some(key) = text.tokenize_string(b'(') else {
+             break;
+         };
+         for index in 1.. {
+             let Some(value) = text.tokenize_string(b'\n') else {
+                 Err(anyhow!(
+                     "{variable}: Error parsing attribute value {key}[{index}]"
+                 ))?
+             };
+             // Remove exactly one quote from each end.  A lone `'` fails
+             // the second step and is reported as unquoted.
+             let unquoted = value
+                 .strip_prefix('\'')
+                 .and_then(|rest| rest.strip_suffix('\''));
+             if let Some(middle) = unquoted {
+                 println!("\t{variable}: {key}[{index}] = \"{middle}\"");
+             } else {
+                 self.warn(format!(
+                     "{variable}: Attribute value {key}[{index}] is not quoted: {value}"
+                 ))?;
+             }
+             if text.match_byte(b')') {
+                 break;
+             }
+         }
+
+         if text.match_byte(b'/') {
+             break;
+         }
+     }
+     Ok(())
+ }
+
+ /// Dumps the datafile attributes extension: prints the header, then the
+ /// record's attribute set under the label "datafile".
+ fn read_datafile_attributes(&mut self, size: u32, count: u32) -> Result<()> {
+     // End the header line so the first attribute starts on its own line.
+     println!("{:08x}: datafile attributes", self.r.stream_position()?);
+     let mut text = self.open_text_record(size, count)?;
+     self.read_attributes(&mut text, "datafile")?;
+     Ok(())
+ }
+
+ /// Dumps the variable attributes extension: a text record of
+ /// `variable:` prefixes, each followed by that variable's attribute set.
+ fn read_variable_attributes(&mut self, size: u32, count: u32) -> Result<()> {
+     // End the header line so the first attribute starts on its own line.
+     println!("{:08x}: variable attributes", self.r.stream_position()?);
+     let mut text = self.open_text_record(size, count)?;
+     while let Some(variable) = text.tokenize_string(b':') {
+         self.read_attributes(&mut text, &variable)?;
+     }
+     Ok(())
+ }
+
+ /// Dumps the character encoding extension: reads the `size * count`
+ /// payload bytes and prints them as (lossily decoded) text after the
+ /// record's offset.
+ fn read_character_encoding(&mut self, size: u32, count: u32) -> Result<()> {
+     let offset = self.r.stream_position()?;
+     let n_bytes = (size * count) as usize;
+     let encoding = read_vec(&mut self.r, n_bytes)?;
+     let name = String::from_utf8_lossy(&encoding);
+     println!("{offset:08x}: Character Encoding: {name}");
+     Ok(())
+ }
+
+ /// Dumps the long string value labels extension (subtype 21).
+ ///
+ /// The payload is a sequence of per-variable entries read until
+ /// `size * count` bytes have been consumed.  Each entry is a
+ /// length-prefixed variable name, a width, and a count of value/label
+ /// pairs whose two parts are each length-prefixed.  A name longer than
+ /// `ID_MAX_LEN` is an error.
+ fn read_long_string_value_labels(&mut self, size: u32, count: u32) -> Result<()> {
+     let start = self.r.stream_position()?;
+     let record_len = u64::from(size) * u64::from(count);
+
+     println!("{start:08x}: long string value labels");
+     while self.r.stream_position()? - start < record_len {
+         let position = self.r.stream_position()?;
+
+         let var_name_len: u32 = self.read_swap()?;
+         if var_name_len > ID_MAX_LEN {
+             return Err(anyhow!(
+                 "Variable name length in long string value label record ({var_name_len}) exceeds {ID_MAX_LEN}-byte limit."
+             ));
+         }
+         let var_name = read_vec(&mut self.r, var_name_len as usize)?;
+
+         let width: u32 = self.read_swap()?;
+         let n_values: u32 = self.read_swap()?;
+
+         println!(
+             "\t{position:08x}: {}, width {width}, {n_values} values",
+             String::from_utf8_lossy(&var_name)
+         );
+
+         for _ in 0..n_values {
+             let position = self.r.stream_position()?;
+             let value_length: u32 = self.read_swap()?;
+             let value = read_vec(&mut self.r, value_length as usize)?;
+             let label_length: u32 = self.read_swap()?;
+             // The label has its own length prefix, separate from the
+             // value's.
+             let label = read_vec(&mut self.r, label_length as usize)?;
+             println!(
+                 "\t\t{position:08x}: \"{}\" ({value_length} bytes) => \"{}\" ({label_length} bytes)",
+                 String::from_utf8_lossy(&value),
+                 String::from_utf8_lossy(&label)
+             );
+         }
+     }
+     Ok(())
+ }
+
+ /// Dumps the long string missing values extension (subtype 22).
+ ///
+ /// The payload is a sequence of per-variable entries read until
+ /// `size * count` bytes have been consumed.  Each entry is a
+ /// length-prefixed variable name, a one-byte count of missing values, a
+ /// value length, and then that many values of that length.  A name
+ /// longer than `ID_MAX_LEN` is an error.
+ fn read_long_string_missing_values(&mut self, size: u32, count: u32) -> Result<()> {
+     let start = self.r.stream_position()?;
+     let record_len = u64::from(size) * u64::from(count);
+
+     println!("{start:08x}: long string missing values");
+     while self.r.stream_position()? - start < record_len {
+         let position = self.r.stream_position()?;
+
+         let var_name_len: u32 = self.read_swap()?;
+         if var_name_len > ID_MAX_LEN {
+             return Err(anyhow!(
+                 "Variable name length in long string missing value record ({var_name_len}) exceeds {ID_MAX_LEN}-byte limit."
+             ));
+         }
+         let var_name = read_vec(&mut self.r, var_name_len as usize)?;
+
+         let n_missing_values: u8 = self.read_swap()?;
+         let value_length: u32 = self.read_swap()?;
+
+         println!(
+             "\t{position:08x}: {}, {n_missing_values}, each {value_length} bytes:",
+             String::from_utf8_lossy(&var_name)
+         );
+
+         for _ in 0..n_missing_values {
+             let value = read_vec(&mut self.r, value_length as usize)?;
+             println!(" \"{}\"", String::from_utf8_lossy(&value));
+         }
+     }
+     Ok(())
+ }
+
+ fn read_text_record(&mut self, size: u32, count: u32) -> Result<Vec<u8>> {
+ let Some(n_bytes) = u32::checked_mul(size, count) else {
+ Err(anyhow!("Extension record too large."))?
};
- Ok(TextRecord::new(read_vec(&mut self.r, n_bytes as usize)?))
+ read_vec(&mut self.r, n_bytes as usize)
+ }
+
+ /// Reads the `size * count` payload bytes of a text extension record and
+ /// wraps them in a `TextRecord` for tokenizing.
+ fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
+     let buffer = self.read_text_record(size, count)?;
+     Ok(TextRecord::new(buffer))
+ }
+}
+
+/// Prints `s` to standard output in a readable form.
+///
+/// A buffer containing a NUL byte is shown as a hex dump.  Otherwise
+/// printable ASCII is printed as is, a backslash is doubled, a newline ends
+/// the output line, and any other byte becomes a backslash followed by two
+/// zero-padded hex digits.
+fn print_string(s: &[u8]) {
+     if s.contains(&b'\0') {
+         println!("{}", HexView::new(s));
+     } else {
+         for &c in s {
+             match c {
+                 b'\\' => print!("\\\\"),
+                 b'\n' => println!(),
+                 c if (b' '..=b'~').contains(&c) => print!("{}", c as char),
+                 // Zero-pad so that, e.g., 0x07 prints as `\07`, not `\ 7`.
+                 c => print!("\\{:02x}", c),
+             }
+         }
}
}
struct TextRecord {
buffer: Vec<u8>,
- pos: usize
+ pos: usize,
}
impl TextRecord {
TextRecord { buffer, pos: 0 }
}
- fn tokenize<'a>(&'a mut self, delimiter: u8) -> Option<&'a [u8]> {
- let mut start = self.pos;
- while self.pos < self.buffer.len() && self.buffer[self.pos] != delimiter && self.buffer[self.pos] != 0 {
+ fn tokenize(&mut self, delimiter: u8) -> Option<&[u8]> {
+ let start = self.pos;
+ while self.pos < self.buffer.len()
+ && self.buffer[self.pos] != delimiter
+ && self.buffer[self.pos] != 0
+ {
self.pos += 1
}
if start == self.pos {
}
}
+ /// Same as `tokenize`, but returns the token as an owned `String`, with
+ /// invalid UTF-8 replaced lossily.  `None` if `tokenize` yields no token.
+ fn tokenize_string(&mut self, delimiter: u8) -> Option<String> {
+     let token = self.tokenize(delimiter)?;
+     Some(String::from_utf8_lossy(token).into_owned())
+ }
+
fn match_byte(&mut self, c: u8) -> bool {
if self.pos < self.buffer.len() && self.buffer[self.pos] == c {
self.pos += 1;
false
}
}
+
+ /// Parses a run of ASCII decimal digits at the current position and
+ /// advances past it.
+ ///
+ /// Returns an error, leaving the position unchanged, if there is no digit
+ /// at the current position (including when the position is at or past the
+ /// end of the buffer) or if the number does not fit in `usize`.  Error
+ /// messages give the offset where the number starts.
+ fn parse_usize(&mut self) -> Result<usize> {
+     let start = self.pos;
+     // `get` rather than indexing, so an out-of-range position reads as
+     // "no digits" instead of panicking.
+     let rest = self.buffer.get(start..).unwrap_or(&[]);
+     let n_digits = rest.iter().take_while(|c| c.is_ascii_digit()).count();
+     if n_digits == 0 {
+         return Err(anyhow!("expecting digit at offset {} in record", start));
+     }
+     let end = start + n_digits;
+     let digits =
+         str::from_utf8(&self.buffer[start..end]).expect("ASCII digits are valid UTF-8");
+     // Parsing can now fail only on overflow.
+     let Ok(number) = digits.parse::<usize>() else {
+         return Err(anyhow!(
+             "expecting number in [0,{}] at offset {} in record",
+             usize::MAX,
+             start
+         ));
+     };
+     self.pos = end;
+     Ok(number)
+ }
+
+ /// Reserves the next `n` bytes of the buffer and advances past them.
+ ///
+ /// Returns the `(start, end)` byte range, or `None`, leaving the position
+ /// unchanged, if fewer than `n` bytes remain or `start + n` overflows.
+ fn get_n_bytes(&mut self, n: usize) -> Option<(usize, usize)> {
+     let start = self.pos;
+     // Reject ranges that run past the buffer, so callers can slice with
+     // the result without panicking.
+     let end = start
+         .checked_add(n)
+         .filter(|&end| end <= self.buffer.len())?;
+     self.pos = end;
+     Some((start, end))
+ }
+
+ /// Parses a counted string of the form `<length> <bytes> `: a decimal
+ /// byte count, one space, exactly that many bytes, and one more space.
+ ///
+ /// Returns the string's bytes.  Returns an error if the count is missing,
+ /// either space is missing, or the string would run past the end of the
+ /// record.
+ fn parse_counted_string(&mut self) -> Result<&[u8]> {
+     let length = self.parse_usize()?;
+     if !self.match_byte(b' ') {
+         Err(anyhow!("expecting space at offset {} in record", self.pos))?;
+     }
+
+     let string_ofs = self.pos;
+     // Check the range against the buffer here as well, so the slice at
+     // the end cannot go out of bounds.
+     let span = self
+         .get_n_bytes(length)
+         .filter(|&(_, end)| end <= self.buffer.len());
+     let Some((start, end)) = span else {
+         // Undo any advance past the end of the buffer.
+         self.pos = string_ofs;
+         return Err(anyhow!(
+             "{length}-byte string starting at offset {} exceeds record length {}",
+             string_ofs,
+             self.buffer.len()
+         ));
+     };
+     if !self.match_byte(b' ') {
+         Err(anyhow!(
+             "expecting space at offset {} following {}-byte string",
+             self.pos,
+             end - start
+         ))?;
+     }
+     Ok(&self.buffer[start..end])
+ }
+
+ /// Reads one `key=value` pair, with the key delimited by `=` and the
+ /// value by a tab, then skips any tabs and NUL bytes that follow.
+ /// Returns `None` as soon as either token cannot be read.
+ fn read_variable_to_value_pair(&mut self) -> Option<(Vec<u8>, Vec<u8>)> {
+     let key: Vec<u8> = self.tokenize(b'=')?.to_vec();
+     let value: Vec<u8> = self.tokenize(b'\t')?.to_vec();
+
+     // Swallow the separators between this pair and the next.
+     loop {
+         let skipped = self.match_byte(b'\t') || self.match_byte(b'\0');
+         if !skipped {
+             break;
+         }
+     }
+     Some((key, value))
+ }
}