1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 use anyhow::{anyhow, Result};
19 use hexplay::HexViewBuilder;
20 use num::{Float, Num};
21 use std::{fmt, num::FpCategory};
23 use std::io::prelude::*;
24 use std::io::BufReader;
25 use std::path::{Path, PathBuf};
27 /// A utility to dissect SPSS system files.
28 #[derive(Parser, Debug)]
29 #[command(author, version, about, long_about = None)]
31 /// Maximum number of cases to print.
32 #[arg(long = "data", default_value_t = 0)]
36 #[arg(required = true)]
// Program entry point: parses the command line into `Args`, then builds a
// `Dissector` for an input file, propagating any error to the caller with `?`.
// NOTE(review): the statement that binds `file` and the end of the function
// are not visible in this extract; `max_cases` is destructured here but no
// visible line uses it.
40 fn main() -> Result<()> {
41 let Args { max_cases, files } = Args::parse();
44 Dissector::new(file)?;
49 #[derive(Copy, Clone, Debug)]
55 #[derive(Copy, Clone, Debug)]
/// Decodes a fixed-size byte array into a value of type `T`.
///
/// `N` is the number of bytes consumed. The implementing value (`self`)
/// decides how those bytes are interpreted.
trait Parse<T, const N: usize> {
    /// Converts the `N` bytes in `bytes` into a `T`.
    fn parse(self, bytes: [u8; N]) -> T;
}
65 impl Parse<u64, 8> for Endianness {
66 fn parse(self, bytes: [u8; 8]) -> u64 {
68 BigEndian => u64::from_be_bytes(bytes),
69 LittleEndian => u64::from_le_bytes(bytes)
73 impl Parse<u32, 4> for Endianness {
74 fn parse(self, bytes: [u8; 4]) -> u32 {
76 BigEndian => u32::from_be_bytes(bytes),
77 LittleEndian => u32::from_le_bytes(bytes)
81 impl Parse<u16, 2> for Endianness {
82 fn parse(self, bytes: [u8; 2]) -> u16 {
84 BigEndian => u16::from_be_bytes(bytes),
85 LittleEndian => u16::from_le_bytes(bytes)
89 impl Parse<u8, 1> for Endianness {
90 fn parse(self, bytes: [u8; 1]) -> u8 {
92 BigEndian => u8::from_be_bytes(bytes),
93 LittleEndian => u8::from_le_bytes(bytes)
97 impl Parse<i64, 8> for Endianness {
98 fn parse(self, bytes: [u8; 8]) -> i64 {
100 BigEndian => i64::from_be_bytes(bytes),
101 LittleEndian => i64::from_le_bytes(bytes)
105 impl Parse<i32, 4> for Endianness {
106 fn parse(self, bytes: [u8; 4]) -> i32 {
108 BigEndian => i32::from_be_bytes(bytes),
109 LittleEndian => i32::from_le_bytes(bytes)
113 impl Parse<i16, 2> for Endianness {
114 fn parse(self, bytes: [u8; 2]) -> i16 {
116 BigEndian => i16::from_be_bytes(bytes),
117 LittleEndian => i16::from_le_bytes(bytes)
121 impl Parse<i8, 1> for Endianness {
122 fn parse(self, bytes: [u8; 1]) -> i8 {
124 BigEndian => i8::from_be_bytes(bytes),
125 LittleEndian => i8::from_le_bytes(bytes)
129 impl Parse<f64, 8> for Endianness {
130 fn parse(self, bytes: [u8; 8]) -> f64 {
132 BigEndian => f64::from_be_bytes(bytes),
133 LittleEndian => f64::from_le_bytes(bytes)
138 fn read_bytes<const N: usize>(r: &mut BufReader<File>) -> Result<[u8; N]> {
139 let mut buf = [0; N];
140 r.read_exact(&mut buf)?;
144 fn read_vec(r: &mut BufReader<File>, n: usize) -> Result<Vec<u8>> {
145 let mut vec = Vec::with_capacity(n);
147 r.read_exact(&mut vec)?;
152 fn read_swap(&mut self) -> Result<T>;
155 impl ReadSwap<u32> for Dissector {
156 fn read_swap(&mut self) -> Result<u32> {
157 Ok(self.endianness.parse(read_bytes(&mut self.r)?))
160 impl ReadSwap<u8> for Dissector {
161 fn read_swap(&mut self) -> Result<u8> {
162 Ok(self.endianness.parse(read_bytes(&mut self.r)?))
166 impl ReadSwap<i32> for Dissector {
167 fn read_swap(&mut self) -> Result<i32> {
168 Ok(self.endianness.parse(read_bytes(&mut self.r)?))
172 impl ReadSwap<f64> for Dissector {
173 fn read_swap(&mut self) -> Result<f64> {
174 Ok(self.endianness.parse(read_bytes(&mut self.r)?))
// Fields of the `Dissector` state. The `struct` header and some shorter field
// lines are not visible in this extract.
// Compression scheme named by the file header, or `None` when the header
// reports no compression.
181 compression: Option<Compression>,
// Byte order used for integers, detected from the header's layout code.
182 endianness: Endianness,
// Byte order detected from the compression bias; falls back to `endianness`
// when the bias does not decode to 100.
183 fp_format: Endianness,
// Number of variable records read so far (incremented in `read_variable_record`).
185 n_variable_records: usize,
// The width field of each variable record, in the order the records were read.
187 var_widths: Vec<i32>,
190 fn detect_endianness(layout_code: [u8; 4]) -> Option<Endianness> {
191 for endianness in [BigEndian, LittleEndian] {
192 match endianness.parse(layout_code) {
193 2 | 3 => return Some(endianness),
200 fn detect_fp_format(bias: [u8; 8]) -> Option<Endianness> {
201 for endianness in [BigEndian, LittleEndian] {
202 let value: f64 = endianness.parse(bias);
204 return Some(endianness)
/// Removes every trailing occurrence of the byte `c` from `s` and returns the
/// shortened vector.
fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
    // Index just past the last byte that differs from `c`, or zero when every
    // byte is `c` (or `s` is empty).
    let keep = s.iter().rposition(|&byte| byte != c).map_or(0, |last| last + 1);
    s.truncate(keep);
    s
}
/// Returns the prefix of `s` that remains after dropping every trailing
/// occurrence of the byte `c`. Nothing is copied.
fn slice_trim_end(s: &[u8], c: u8) -> &[u8] {
    // Index just past the last byte that differs from `c`, or zero when every
    // byte is `c` (or `s` is empty).
    let keep = s.iter().rposition(|&byte| byte != c).map_or(0, |last| last + 1);
    &s[..keep]
}
224 fn format_name(type_: u32) -> &'static str {
267 fn round_up<T: Num + Copy>(x: T, y: T) -> T
269 (x + (y - T::one())) / y * y
272 struct UntypedValue {
274 endianness: Endianness
278 fn new(raw: [u8; 8], endianness: Endianness) -> UntypedValue {
279 UntypedValue { raw, endianness }
283 impl fmt::Display for UntypedValue {
284 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
285 let numeric: f64 = self.endianness.parse(self.raw);
286 let n_printable = self.raw.iter().take_while(|&&x| x == b' ' || x.is_ascii_graphic()).count();
287 let printable_prefix = std::str::from_utf8(&self.raw[0..n_printable]).unwrap();
288 write!(f, "{numeric}/\"{printable_prefix}\"")
292 struct HexFloat<T: Float>(T);
294 impl<T: Float> fmt::Display for HexFloat<T> {
295 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
296 let sign = if self.0.is_sign_negative() { "-" } else { "" };
297 match self.0.classify() {
298 FpCategory::Nan => return write!(f, "NaN"),
299 FpCategory::Infinite => return write!(f, "{sign}Infinity"),
300 FpCategory::Zero => return write!(f, "{sign}0.0"),
303 let (significand, mut exponent, _) = self.0.integer_decode();
304 let mut hex_sig = format!("{:x}", significand);
305 while hex_sig.ends_with('0') {
309 match hex_sig.len() {
310 0 => write!(f, "{sign}0.0"),
311 1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
312 len => write!(f, "{sign}0x{}.{}p{}",
313 hex_sig.chars().nth(0).unwrap(),
315 exponent + 4 * (len as i16 - 1))
// Unit tests pinning the exact text produced by `HexFloat`'s `Display` impl.
321 mod hex_float_tests {
// Finite values: one leading hex digit, the fraction digits, then the binary
// exponent after `p`.
327 assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
328 assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
329 assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
// Infinities carry a sign; NaN is printed without one.
330 assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
331 assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
332 assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
// Zero keeps its sign and is printed in plain decimal form.
333 assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
334 assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
// Opens `filename`, reads and prints the file header record, then reads
// records by type code and hands each to its handler.
// Errors from I/O, an unrecognized magic or layout code, an invalid
// compression code, or an unexpected record type are returned to the caller.
// NOTE(review): several short lines (match arms, struct-literal fields, the
// record loop header, closing braces) are not visible in this extract. The
// comments below describe only what the visible lines establish.
339 fn new<P: AsRef<Path>>(filename: P) -> Result<Dissector> {
340 let mut r = BufReader::new(File::open(&filename)?);
// Owned, printable copy of the path.
341 let filename = filename.as_ref().to_string_lossy().into_owned();
// The first four bytes select `zmagic`; anything the hidden arms do not
// accept is rejected here.
342 let rec_type: [u8; 4] = read_bytes(&mut r)?;
343 let zmagic = match &rec_type {
346 _ => Err(anyhow!("This is not an SPSS system file."))?
// 60-byte field, printed below as "Product name".
349 let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
// The layout code is read raw first because it is what reveals the byte
// order; only then can it be decoded as a number.
350 let layout_code: [u8; 4] = read_bytes(&mut r)?;
351 let endianness = detect_endianness(layout_code)
352 .ok_or_else(|| anyhow!("This is not an SPSS system file."))?;
353 let layout_code: u32 = endianness.parse(layout_code);
// Read to advance past the field; the value is not used.
354 let _nominal_case_size: [u8; 4] = read_bytes(&mut r)?;
355 let compressed: u32 = endianness.parse(read_bytes(&mut r)?);
// The magic and the compression code must agree: code 1 is accepted only
// without `zmagic` (reported as "SAV"), code 2 only with it ("ZSAV").
356 let compression = match (zmagic, compressed) {
358 (false, 1) => Some(Compression::Simple),
359 (true, 2) => Some(Compression::ZLib),
360 _ => Err(anyhow!("{} file header has invalid compression value {compressed}.",
361 if zmagic { "ZSAV" } else { "SAV" }))?,
364 let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
// NOTE(review): the system-file format documents -1 for an unknown case
// count; decoded as u32 that would print as 4294967295. Confirm whether a
// signed type is wanted here.
365 let n_cases: u32 = endianness.parse(read_bytes(&mut r)?);
// The raw bias doubles as the probe for the floating-point byte order. When
// detection fails, a warning goes to stderr and the integer order is used.
367 let bias: [u8; 8] = read_bytes(&mut r)?;
368 let fp_format = detect_fp_format(bias)
369 .unwrap_or_else(|| { eprintln!("Compression bias is not the usual value of 100, or system file uses unrecognized floating-point format."); endianness });
370 let bias: f64 = fp_format.parse(bias);
// From here on, reads go through the reader owned by `d`.
372 let mut d = Dissector {
379 n_variable_records: 0,
381 var_widths: Vec::new(),
384 let creation_date: [u8; 9] = read_bytes(&mut d.r)?;
385 let creation_time: [u8; 8] = read_bytes(&mut d.r)?;
386 let file_label: [u8; 64] = read_bytes(&mut d.r)?;
// Trailing spaces are stripped from the label before it is printed.
387 let file_label = trim_end(Vec::from(file_label), b' ');
// Print the header fields, with labels right-aligned in a 17-column field.
390 println!("File header record:");
391 println!("{:>17}: {}", "Product name", String::from_utf8_lossy(&eye_catcher));
392 println!("{:>17}: {}", "Layout code", layout_code);
393 println!("{:>17}: {} ({})", "Compressed", compressed, match compression {
394 None => "no compression",
395 Some(Compression::Simple) => "simple compression",
396 Some(Compression::ZLib) => "ZLIB compression",
398 println!("{:>17}: {}", "Weight index", weight_index);
399 println!("{:>17}: {}", "Number of cases", n_cases);
400 println!("{:>17}: {}", "Compression bias", bias);
401 println!("{:>17}: {}", "Creation date", String::from_utf8_lossy(&creation_date));
402 println!("{:>17}: {}", "Creation time", String::from_utf8_lossy(&creation_time));
403 println!("{:>17}: \"{}\"", "File label", String::from_utf8_lossy(&file_label));
// Dispatch on a record's type code. Type 4 is an error here because it is
// only valid directly after a type 3 record, where
// `read_value_label_record` consumes it.
406 let rec_type: u32 = d.read_swap()?;
408 2 => d.read_variable_record()?,
409 3 => d.read_value_label_record()?,
410 4 => Err(anyhow!("Misplaced type 4 record."))?,
411 6 => d.read_document_record()?,
412 7 => d.read_extension_record()?,
414 _ => Err(anyhow!("Unrecognized record type {rec_type}."))?
// Report the current position and the position four bytes later, which the
// message labels as the first byte of data.
418 let pos = d.r.stream_position()?;
419 println!("{:08x}: end-of-dictionary record (first byte of data at {:0x})", pos, pos + 4);
424 fn read_extension_record(&mut self) -> Result<()> {
425 let offset = self.r.stream_position()?;
426 let subtype: u32 = self.read_swap()?;
427 let size: u32 = self.read_swap()?;
428 let count: u32 = self.read_swap()?;
429 println!("{offset:08x}: Record 7, subtype {subtype}, size={size}, count={count}");
431 3 => self.read_machine_integer_info(size, count),
432 4 => self.read_machine_float_info(size, count),
433 _ => self.read_unknown_extension(subtype, size, count),
437 fn warn(&mut self, s: String) -> Result<()> {
438 println!("\"{}\" near offset 0x{:08x}: {s}", self.filename, self.r.stream_position()?);
442 fn skip_bytes(&mut self, mut n: u64) -> Result<()> {
443 let mut buf = [0; 1024];
445 let chunk = u64::min(n, buf.len() as u64);
446 self.r.read_exact(&mut buf[0..chunk as usize])?;
452 fn read_unknown_extension(&mut self, subtype: u32, size: u32, count: u32) -> Result<()> {
453 self.warn(format!("Unrecognized record type 7, subtype {subtype}."))?;
454 if size == 0 || count > 65536 / size {
455 self.skip_bytes(size as u64 * count as u64)?;
456 } else if size != 1 {
459 let vec = read_vec(&mut self.r, size as usize)?;
460 println!("{}", HexViewBuilder::new(&vec).address_offset(offset).finish());
461 offset += size as usize;
// Reads one variable record (type 2) and prints its fields: width, label
// flag, missing-value code, print and write formats, name, then the optional
// label and any missing values.
// Errors are returned for I/O failures, a label flag other than 0 or 1, and
// an invalid missing-value code.
// NOTE(review): several short lines (conditions, match arms, closing braces)
// are not visible in this extract; the visible lines are kept as they are,
// apart from the two fixes marked below.
467 fn read_variable_record(&mut self) -> Result<()> {
468 self.n_variable_records += 1;
469 println!("{:08x}: variable record {}", self.r.stream_position()?, self.n_variable_records);
470 let width: i32 = self.read_swap()?;
471 let has_variable_label: u32 = self.read_swap()?;
472 let missing_value_code: i32 = self.read_swap()?;
473 let print_format: u32 = self.read_swap()?;
474 let write_format: u32 = self.read_swap()?;
// The name is an 8-byte field; trailing NUL bytes are stripped.
475 let name: [u8; 8] = read_bytes(&mut self.r)?;
476 let name: Vec<u8> = trim_end(Vec::from(name), b'\0');
479 self.n_variables += 1;
// Every record's width is remembered, in the order the records are read.
481 self.var_widths.push(width);
// Positive widths are strings, zero is numeric, negative widths mark the
// continuation of a long string.
483 println!("\tWidth: {width} ({})", match width {
484 _ if width > 0 => "string",
485 _ if width == 0 => "numeric",
486 _ => "long string continuation record"
489 println!("\tVariable label: {has_variable_label}");
490 println!("\tMissing values code: {missing_value_code} ({})",
491 match missing_value_code {
492 0 => "no missing values",
493 1 => "one missing value",
494 2 => "two missing values",
495 3 => "three missing values",
496 -2 => "one missing value range",
497 -3 => "one missing value, one range",
// A format word packs the type in bits 16 and up, the width in bits 8-15 and
// the decimals in bits 0-7.
// Fix: the second label was misspelled "Worite".
500 for (which, format) in [("Print", print_format),
501 ("Write", write_format)] {
502 let type_ = format_name(format >> 16);
503 let w = (format >> 8) & 0xff;
504 let d = format & 0xff;
505 println!("\t{which} format: {format:06x} ({type_}{w}.{d})");
507 println!("\tName: {}", String::from_utf8_lossy(&name));
509 // Read variable label.
510 match has_variable_label {
513 let offset = self.r.stream_position()?;
514 let len: u32 = self.read_swap()?;
// At most 65535 bytes of the label are read and shown.
515 let read_len = len.min(65535) as usize;
516 let label = read_vec(&mut self.r, read_len)?;
517 println!("\t{offset:08x} Variable label: \"{}\"", String::from_utf8_lossy(&label));
// Fix: skip everything up to the label's padded end that was not read above.
// Subtracting `len` instead of `read_len` left the tail of an over-long
// label unread; widening to u64 keeps the rounding from overflowing.
519 self.skip_bytes(round_up(u64::from(len), 4) - read_len as u64)?;
521 _ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?,
524 // Read missing values.
525 if missing_value_code != 0 {
526 print!("\t{:08x} Missing values:", self.r.stream_position()?);
// This branch handles codes -3, -2, 1, 2 and 3: a range is a low/high pair of
// f64 values, followed by the individual values.
528 let (has_range, n_individual) = match missing_value_code {
531 1 | 2 | 3 => (false, missing_value_code),
532 _ => Err(anyhow!("Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."))?,
535 let low: f64 = self.read_swap()?;
536 let high: f64 = self.read_swap()?;
537 print!(" {low}...{high}");
539 for _ in 0..n_individual {
540 let value: f64 = self.read_swap()?;
// String variables accept only 1 to 3 individual values, each an 8-byte
// field with trailing NUL bytes stripped.
543 } else if width > 0 {
544 if missing_value_code < 1 || missing_value_code > 3 {
545 Err(anyhow!("String missing value indicator field is not 0, 1, 2, or 3."))?;
547 for _ in 0..missing_value_code {
548 let string: [u8; 8] = read_bytes(&mut self.r)?;
549 let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
550 println!(" {}", String::from_utf8_lossy(&string));
// Reads a value label record (type 3) and prints each value with its label,
// then reads the type-4 record that must follow and its variable indexes.
// Errors are returned for I/O failures and when the next record is not type 4.
// NOTE(review): several short lines (a condition, a loop header, closing
// braces) are not visible in this extract; the visible lines are kept as they
// are, apart from the fix marked below.
559 fn read_value_label_record(&mut self) -> Result<()> {
560 println!("{:08x}: value labels record", self.r.stream_position()?);
563 let n_labels: u32 = self.read_swap()?;
564 for _ in 0..n_labels {
// The value is kept untyped: it is printed both as a number and as text.
565 let raw: [u8; 8] = read_bytes(&mut self.r)?;
566 let value = UntypedValue::new(raw, self.fp_format);
567 let label_len: u8 = self.read_swap()?;
// The length byte and the label together are padded to a multiple of 8.
568 let padded_len = round_up(label_len as usize + 1, 8);
// Fix: the length byte has already been consumed, so only `padded_len - 1`
// bytes remain. Reading `padded_len` took one byte from the next field.
570 let mut label = read_vec(&mut self.r, padded_len - 1)?;
571 label.truncate(label_len as usize);
572 let label = String::from_utf8_lossy(&label);
574 println!("\t{value}: {label}");
577 // Read the type-4 record with the corresponding variable indexes.
578 let rec_type: u32 = self.read_swap()?;
580 Err(anyhow!("Variable index record (type 4) does not immediately \
581 follow value label record (type 3) as it should."))?;
584 println!("\t{:08x}: apply to variables", self.r.stream_position()?);
585 let n_vars: u32 = self.read_swap()?;
587 let index: u32 = self.read_swap()?;
595 fn read_document_record(&mut self) -> Result<()> {
596 println!("{:08x}: document record", self.r.stream_position()?);
597 let n_lines: u32 = self.read_swap()?;
598 println!("\t{n_lines} lines of documents");
600 for i in 0..n_lines {
601 print!("\t{:08x}: ", self.r.stream_position()?);
602 let line: [u8; 64] = read_bytes(&mut self.r)?;
603 let line = trim_end(Vec::from(line), b' ');
604 println!("line {i}: \"{}\"", String::from_utf8_lossy(&line));
// Reads and prints the machine integer info record (type 7, subtype 3), which
// holds eight 32-bit fields.
// An error is returned on I/O failure or when `size`/`count` are not 4 and 8.
// NOTE(review): the arms of both `match` expressions and the closing lines
// are not visible in this extract.
609 fn read_machine_integer_info(&mut self, size: u32, count: u32) -> Result<()> {
610 let offset = self.r.stream_position()?;
611 let version_major: u32 = self.read_swap()?;
612 let version_minor: u32 = self.read_swap()?;
613 let version_revision: u32 = self.read_swap()?;
614 let machine_code: u32 = self.read_swap()?;
615 let float_representation: u32 = self.read_swap()?;
616 let compression_code: u32 = self.read_swap()?;
617 let integer_representation: u32 = self.read_swap()?;
618 let character_code: u32 = self.read_swap()?;
620 println!("{offset:08x}: machine integer info");
// NOTE(review): the size/count check runs only after all eight fields have
// been read, so a record with other dimensions has already had 32 bytes
// consumed by the time it is rejected.
621 if size != 4 || count != 8 {
622 Err(anyhow!("Bad size ({size}) or count ({count}) field on record type 7, subtype 3"))?;
624 println!("\tVersion: {version_major}.{version_minor}.{version_revision}");
625 println!("\tMachine code: {machine_code}");
// The numeric code is printed followed by a description chosen by the match.
626 println!("\tFloating point representation: {float_representation} ({})",
627 match float_representation {
633 println!("\tCompression code: {compression_code}");
634 println!("\tEndianness: {integer_representation} ({})",
635 match integer_representation {
640 println!("\tCharacter code: {character_code}");
644 fn read_machine_float_info(&mut self, size: u32, count: u32) -> Result<()> {
645 let offset = self.r.stream_position()?;
646 let sysmis: f64 = self.read_swap()?;
647 let highest: f64 = self.read_swap()?;
648 let lowest: f64 = self.read_swap()?;
650 println!("{offset:08x}: machine float info");
651 if size != 4 || count != 8 {
652 Err(anyhow!("Bad size ({size}) or count ({count}) field on extension 4."))?;
655 println!("\tsysmis: {sysmis} ({})", HexFloat(sysmis));
656 println!("\thighest: {highest} ({})", HexFloat(highest));
657 println!("\tlowest: {lowest} ({})", HexFloat(lowest));
// Reads a variable-sets record as text and prints each set name with the
// text that follows it.
// NOTE(review): the loop header, several match arms and the closing lines are
// not visible in this extract. No visible line calls this function;
// `read_extension_record` does not dispatch to it.
661 fn read_variable_sets(&mut self, size: u32, count: u32) -> Result<()> {
662 println!("{:08x}: variable sets", self.r.stream_position()?);
// The whole record payload is loaded into memory as a text cursor.
663 let mut text = self.open_text_record(size, count)?;
// Consume a newline at the cursor, if there is one.
665 while text.match_byte(b'\n') {
// The set name is the text up to `=`, kept as an owned string.
668 let set = match text.tokenize(b'=') {
669 Some(set) => String::from_utf8_lossy(&set).into_owned(),
673 // Always present even for an empty set.
674 text.match_byte(b' ');
// The rest of the line is the set's contents; a trailing carriage return is
// removed before printing.
676 match text.tokenize(b'\n') {
677 None => println!("\tset \"{set}\" is empty"),
679 println!("\tset \"{set}\" contains \"{}\"", String::from_utf8_lossy(variables).trim_end_matches('\r'));
// Prints the heading for an extra-product-info record and loads the record's
// payload as text.
// NOTE(review): the rest of the body is not visible in this extract, so what
// is done with `text` cannot be seen here. No visible line calls this
// function.
687 fn read_extra_product_info(&mut self, size: u32, count: u32) -> Result<()> {
// `print!` rather than `println!`: the heading is left without a newline.
688 print!("{:08x}: extra product info", self.r.stream_position()?);
689 let mut text = self.open_text_record(size, count)?;
693 fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
694 let n_bytes = match u32::checked_mul(size, count) {
696 None => Err(anyhow!("Extension record too large."))?
698 Ok(TextRecord::new(read_vec(&mut self.r, n_bytes as usize)?))
708 fn new(buffer: Vec<u8>) -> TextRecord {
709 TextRecord { buffer, pos: 0 }
712 fn tokenize<'a>(&'a mut self, delimiter: u8) -> Option<&'a [u8]> {
713 let mut start = self.pos;
714 while self.pos < self.buffer.len() && self.buffer[self.pos] != delimiter && self.buffer[self.pos] != 0 {
717 if start == self.pos {
720 Some(&self.buffer[start..self.pos])
724 fn match_byte(&mut self, c: u8) -> bool {
725 if self.pos < self.buffer.len() && self.buffer[self.pos] == c {