pintos-os.org Git - pspp/blob - rust/src/main.rs

   1 /* PSPP - a program for statistical analysis.
   2  * Copyright (C) 2023 Free Software Foundation, Inc.
   3  *
   4  * This program is free software: you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation, either version 3 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 use anyhow::{anyhow, Result};
  18 use clap::Parser;
  19 use hexplay::HexView;
  20 use hexplay::HexViewBuilder;
  21 use num::Num;
  22 use std::cmp::Ordering;
  23 use std::collections::VecDeque;
  24 use std::fmt;
  25 use std::fs::File;
  26 use std::io::prelude::*;
  27 use std::io::BufReader;
  28 use std::io::ErrorKind;
  29 use std::path::{Path, PathBuf};
  30 use std::str;
  31
  32 mod hexfloat;
  33 use hexfloat::HexFloat;
  34
  35 /// A utility to dissect SPSS system files.
  36 #[derive(Parser, Debug)]
  37 #[command(author, version, about, long_about = None)]
  38 struct Args {
  39     /// Maximum number of cases to print.
  40     #[arg(long = "data", default_value_t = 0)]
  41     max_cases: usize,
  42
  43     /// Files to dissect.
  44     #[arg(required = true)]
  45     files: Vec<PathBuf>,
  46 }
  47
  48 fn main() -> Result<()> {
  49     let Args { max_cases, files } = Args::parse();
  50
  51     for file in files {
  52         Dissector::new(file, max_cases)?;
  53     }
  54     Ok(())
  55 }
  56
  57 #[derive(Copy, Clone, Debug)]
  58 enum Compression {
  59     Simple,
  60     ZLib,
  61 }
  62
  63 #[derive(Copy, Clone, Debug)]
  64 enum Endianness {
  65     BigEndian,
  66     LittleEndian,
  67 }
  68 use Endianness::*;
  69
  70 trait Parse<T, const N: usize> {
  71     fn parse(self, bytes: [u8; N]) -> T;
  72 }
  73 impl Parse<u64, 8> for Endianness {
  74     fn parse(self, bytes: [u8; 8]) -> u64 {
  75         match self {
  76             BigEndian => u64::from_be_bytes(bytes),
  77             LittleEndian => u64::from_le_bytes(bytes),
  78         }
  79     }
  80 }
  81 impl Parse<u32, 4> for Endianness {
  82     fn parse(self, bytes: [u8; 4]) -> u32 {
  83         match self {
  84             BigEndian => u32::from_be_bytes(bytes),
  85             LittleEndian => u32::from_le_bytes(bytes),
  86         }
  87     }
  88 }
  89 impl Parse<u16, 2> for Endianness {
  90     fn parse(self, bytes: [u8; 2]) -> u16 {
  91         match self {
  92             BigEndian => u16::from_be_bytes(bytes),
  93             LittleEndian => u16::from_le_bytes(bytes),
  94         }
  95     }
  96 }
  97 impl Parse<u8, 1> for Endianness {
  98     fn parse(self, bytes: [u8; 1]) -> u8 {
  99         match self {
 100             BigEndian => u8::from_be_bytes(bytes),
 101             LittleEndian => u8::from_le_bytes(bytes),
 102         }
 103     }
 104 }
 105 impl Parse<i64, 8> for Endianness {
 106     fn parse(self, bytes: [u8; 8]) -> i64 {
 107         match self {
 108             BigEndian => i64::from_be_bytes(bytes),
 109             LittleEndian => i64::from_le_bytes(bytes),
 110         }
 111     }
 112 }
 113 impl Parse<i32, 4> for Endianness {
 114     fn parse(self, bytes: [u8; 4]) -> i32 {
 115         match self {
 116             BigEndian => i32::from_be_bytes(bytes),
 117             LittleEndian => i32::from_le_bytes(bytes),
 118         }
 119     }
 120 }
 121 impl Parse<i16, 2> for Endianness {
 122     fn parse(self, bytes: [u8; 2]) -> i16 {
 123         match self {
 124             BigEndian => i16::from_be_bytes(bytes),
 125             LittleEndian => i16::from_le_bytes(bytes),
 126         }
 127     }
 128 }
 129 impl Parse<i8, 1> for Endianness {
 130     fn parse(self, bytes: [u8; 1]) -> i8 {
 131         match self {
 132             BigEndian => i8::from_be_bytes(bytes),
 133             LittleEndian => i8::from_le_bytes(bytes),
 134         }
 135     }
 136 }
 137 impl Parse<f64, 8> for Endianness {
 138     fn parse(self, bytes: [u8; 8]) -> f64 {
 139         match self {
 140             BigEndian => f64::from_be_bytes(bytes),
 141             LittleEndian => f64::from_le_bytes(bytes),
 142         }
 143     }
 144 }
 145
 146 fn read_bytes<const N: usize>(r: &mut BufReader<File>) -> Result<[u8; N]> {
 147     let mut buf = [0; N];
 148     r.read_exact(&mut buf)?;
 149     Ok(buf)
 150 }
 151
 152 fn read_vec(r: &mut BufReader<File>, n: usize) -> Result<Vec<u8>> {
 153     let mut vec = vec![0; n];
 154     r.read_exact(&mut vec)?;
 155     Ok(vec)
 156 }
 157
 158 trait ReadSwap<T> {
 159     fn read_swap(&mut self) -> Result<T>;
 160 }
 161
 162 impl ReadSwap<u8> for Dissector {
 163     fn read_swap(&mut self) -> Result<u8> {
 164         Ok(self.endianness.parse(read_bytes(&mut self.r)?))
 165     }
 166 }
 167 impl ReadSwap<u32> for Dissector {
 168     fn read_swap(&mut self) -> Result<u32> {
 169         Ok(self.endianness.parse(read_bytes(&mut self.r)?))
 170     }
 171 }
 172 impl ReadSwap<u64> for Dissector {
 173     fn read_swap(&mut self) -> Result<u64> {
 174         Ok(self.endianness.parse(read_bytes(&mut self.r)?))
 175     }
 176 }
 177
 178 impl ReadSwap<i32> for Dissector {
 179     fn read_swap(&mut self) -> Result<i32> {
 180         Ok(self.endianness.parse(read_bytes(&mut self.r)?))
 181     }
 182 }
 183
 184 impl ReadSwap<f64> for Dissector {
 185     fn read_swap(&mut self) -> Result<f64> {
 186         Ok(self.endianness.parse(read_bytes(&mut self.r)?))
 187     }
 188 }
 189
 190 struct Dissector {
 191     filename: String,
 192     r: BufReader<File>,
 193     endianness: Endianness,
 194     fp_format: Endianness,
 195     bias: f64,
 196     n_variable_records: usize,
 197     n_variables: usize,
 198     var_widths: Vec<i32>,
 199 }
 200
 201 fn detect_endianness(layout_code: [u8; 4]) -> Option<Endianness> {
 202     for endianness in [BigEndian, LittleEndian] {
 203         match endianness.parse(layout_code) {
 204             2 | 3 => return Some(endianness),
 205             _ => (),
 206         }
 207     }
 208     None
 209 }
 210
 211 fn detect_fp_format(bias: [u8; 8]) -> Option<Endianness> {
 212     for endianness in [BigEndian, LittleEndian] {
 213         let value: f64 = endianness.parse(bias);
 214         if value == 100.0 {
 215             return Some(endianness);
 216         }
 217     }
 218     None
 219 }
 220
 221 fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
 222     while s.last() == Some(&c) {
 223         s.pop();
 224     }
 225     s
 226 }
 227
 228 fn format_name(type_: u32) -> &'static str {
 229     match type_ {
 230         1 => "A",
 231         2 => "AHEX",
 232         3 => "COMMA",
 233         4 => "DOLLAR",
 234         5 => "F",
 235         6 => "IB",
 236         7 => "PIBHEX",
 237         8 => "P",
 238         9 => "PIB",
 239         10 => "PK",
 240         11 => "RB",
 241         12 => "RBHEX",
 242         15 => "Z",
 243         16 => "N",
 244         17 => "E",
 245         20 => "DATE",
 246         21 => "TIME",
 247         22 => "DATETIME",
 248         23 => "ADATE",
 249         24 => "JDATE",
 250         25 => "DTIME",
 251         26 => "WKDAY",
 252         27 => "MONTH",
 253         28 => "MOYR",
 254         29 => "QYR",
 255         30 => "WKYR",
 256         31 => "PCT",
 257         32 => "DOT",
 258         33 => "CCA",
 259         34 => "CCB",
 260         35 => "CCC",
 261         36 => "CCD",
 262         37 => "CCE",
 263         38 => "EDATE",
 264         39 => "SDATE",
 265         40 => "MTIME",
 266         41 => "YMDHMS",
 267         _ => "invalid",
 268     }
 269 }
 270
 271 fn round_up<T: Num + Copy>(x: T, y: T) -> T {
 272     (x + (y - T::one())) / y * y
 273 }
 274
 275 struct UntypedValue {
 276     raw: [u8; 8],
 277     endianness: Endianness,
 278 }
 279
 280 impl UntypedValue {
 281     fn new(raw: [u8; 8], endianness: Endianness) -> UntypedValue {
 282         UntypedValue { raw, endianness }
 283     }
 284 }
 285
 286 impl fmt::Display for UntypedValue {
 287     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 288         let numeric: f64 = self.endianness.parse(self.raw);
 289         let n_printable = self
 290             .raw
 291             .iter()
 292             .take_while(|&&x| x == b' ' || x.is_ascii_graphic())
 293             .count();
 294         let printable_prefix = std::str::from_utf8(&self.raw[0..n_printable]).unwrap();
 295         write!(f, "{numeric}/\"{printable_prefix}\"")
 296     }
 297 }
 298
 299 impl Dissector {
 300     fn new<P: AsRef<Path>>(filename: P, max_cases: usize) -> Result<Dissector> {
 301         let mut r = BufReader::new(File::open(&filename)?);
 302         let filename = filename.as_ref().to_string_lossy().into_owned();
 303         let rec_type: [u8; 4] = read_bytes(&mut r)?;
 304         let zmagic = match &rec_type {
 305             b"$FL2" => false,
 306             b"$FL3" => true,
 307             _ => Err(anyhow!("This is not an SPSS system file."))?,
 308         };
 309
 310         let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
 311         let layout_code: [u8; 4] = read_bytes(&mut r)?;
 312         let endianness = detect_endianness(layout_code)
 313             .ok_or_else(|| anyhow!("This is not an SPSS system file."))?;
 314         let layout_code: u32 = endianness.parse(layout_code);
 315         let _nominal_case_size: [u8; 4] = read_bytes(&mut r)?;
 316         let compressed: u32 = endianness.parse(read_bytes(&mut r)?);
 317         let compression = match (zmagic, compressed) {
 318             (false, 0) => None,
 319             (false, 1) => Some(Compression::Simple),
 320             (true, 2) => Some(Compression::ZLib),
 321             _ => Err(anyhow!(
 322                 "{} file header has invalid compression value {compressed}.",
 323                 if zmagic { "ZSAV" } else { "SAV" }
 324             ))?,
 325         };
 326
 327         let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
 328         let n_cases: u32 = endianness.parse(read_bytes(&mut r)?);
 329
 330         let bias: [u8; 8] = read_bytes(&mut r)?;
 331         let fp_format = detect_fp_format(bias)
 332             .unwrap_or_else(|| { eprintln!("Compression bias is not the usual value of 100, or system file uses unrecognized floating-point format."); endianness });
 333         let bias: f64 = fp_format.parse(bias);
 334
 335         let mut d = Dissector {
 336             filename,
 337             r,
 338             endianness,
 339             fp_format,
 340             bias,
 341             n_variable_records: 0,
 342             n_variables: 0,
 343             var_widths: Vec::new(),
 344         };
 345
 346         let creation_date: [u8; 9] = read_bytes(&mut d.r)?;
 347         let creation_time: [u8; 8] = read_bytes(&mut d.r)?;
 348         let file_label: [u8; 64] = read_bytes(&mut d.r)?;
 349         let file_label = trim_end(Vec::from(file_label), b' ');
 350         d.skip_bytes(3)?;
 351
 352         println!("File header record:");
 353         println!(
 354             "{:>17}: {}",
 355             "Product name",
 356             String::from_utf8_lossy(&eye_catcher)
 357         );
 358         println!("{:>17}: {}", "Layout code", layout_code);
 359         println!(
 360             "{:>17}: {} ({})",
 361             "Compressed",
 362             compressed,
 363             match compression {
 364                 None => "no compression",
 365                 Some(Compression::Simple) => "simple compression",
 366                 Some(Compression::ZLib) => "ZLIB compression",
 367             }
 368         );
 369         println!("{:>17}: {}", "Weight index", weight_index);
 370         println!("{:>17}: {}", "Number of cases", n_cases);
 371         println!("{:>17}: {}", "Compression bias", bias);
 372         println!(
 373             "{:>17}: {}",
 374             "Creation date",
 375             String::from_utf8_lossy(&creation_date)
 376         );
 377         println!(
 378             "{:>17}: {}",
 379             "Creation time",
 380             String::from_utf8_lossy(&creation_time)
 381         );
 382         println!(
 383             "{:>17}: \"{}\"",
 384             "File label",
 385             String::from_utf8_lossy(&file_label)
 386         );
 387
 388         loop {
 389             let rec_type: u32 = d.read_swap()?;
 390             match rec_type {
 391                 2 => d.read_variable_record()?,
 392                 3 => d.read_value_label_record()?,
 393                 4 => Err(anyhow!("Misplaced type 4 record."))?,
 394                 6 => d.read_document_record()?,
 395                 7 => d.read_extension_record()?,
 396                 999 => break,
 397                 _ => Err(anyhow!("Unrecognized record type {rec_type}."))?,
 398             }
 399         }
 400
 401         let pos = d.r.stream_position()?;
 402         println!(
 403             "{:08x}: end-of-dictionary record (first byte of data at {:0x})",
 404             pos,
 405             pos + 4
 406         );
 407
 408         match compression {
 409             Some(Compression::Simple) => {
 410                 if max_cases > 0 {
 411                     d.read_simple_compressed_data(max_cases)?;
 412                 }
 413             }
 414             Some(Compression::ZLib) => d.read_zlib_compressed_data()?,
 415             None => (),
 416         }
 417
 418         Ok(d)
 419     }
 420
 421     fn read_simple_compressed_data(&mut self, max_cases: usize) -> Result<()> {
 422         let _: i32 = self.read_swap()?;
 423         println!("\n{:08x}: compressed data:", self.r.stream_position()?);
 424
 425         const N_OPCODES: usize = 8;
 426         let mut opcodes = VecDeque::<u8>::with_capacity(8);
 427         let mut opcode_ofs = 0;
 428         for case_num in 0..max_cases {
 429             println!(
 430                 "{:08x}: case {case_num}'s uncompressible data begins",
 431                 self.r.stream_position()?
 432             );
 433             let mut i = 0;
 434             while i < self.var_widths.len() {
 435                 let width = self.var_widths[i];
 436
 437                 let opcode_idx = N_OPCODES - opcodes.len();
 438                 let Some(opcode) = opcodes.pop_back() else {
 439                     opcode_ofs = self.r.stream_position()?;
 440                     let mut new_opcodes = [0; N_OPCODES];
 441                     if let Err(error) = self.r.read_exact(&mut new_opcodes) {
 442                         if i == 0 && error.kind() == ErrorKind::UnexpectedEof {
 443                             return Ok(());
 444                         } else {
 445                             return Err(error.into());
 446                         }
 447                     };
 448                     opcodes.extend(new_opcodes.into_iter());
 449                     continue;
 450                 };
 451
 452                 print!(
 453                     "{:08x}: variable {i}: opcode {opcode}: ",
 454                     opcode_ofs + opcode_idx as u64
 455                 );
 456                 match opcode {
 457                     0 => println!("ignored padding"),
 458                     252 => {
 459                         println!("end of data");
 460                         break;
 461                     }
 462                     253 => {
 463                         let raw: [u8; 8] = read_bytes(&mut self.r)?;
 464                         let value = UntypedValue::new(raw, self.fp_format);
 465                         println!("uncompressible data: {value}");
 466                         i += 1;
 467                     }
 468                     254 => {
 469                         print!("spaces");
 470                         if width == 0 {
 471                             print!(", but this is a numeric variable");
 472                         }
 473                         println!();
 474                         i += 1;
 475                     }
 476                     255 => {
 477                         print!("SYSMIS");
 478                         if width != 0 {
 479                             print!(", but this is a string variable (width={width})");
 480                         }
 481                         println!();
 482                         i += 1;
 483                     }
 484                     _ => {
 485                         print!("{}", opcode as f64 - self.bias);
 486                         if width != 0 {
 487                             print!(", but this is a string variable (width={width})");
 488                         }
 489                         println!();
 490                         i += 1;
 491                     }
 492                 }
 493             }
 494         }
 495         Ok(())
 496     }
 497
 498     fn read_zlib_compressed_data(&mut self) -> Result<()> {
 499         let _: i32 = self.read_swap()?;
 500         let ofs = self.r.stream_position()?;
 501         println!("\n{ofs:08x}: ZLIB compressed data header:");
 502
 503         let this_ofs: u64 = self.read_swap()?;
 504         let next_ofs: u64 = self.read_swap()?;
 505         let next_len: u64 = self.read_swap()?;
 506
 507         println!("\theader_ofs: {this_ofs:#x}");
 508         if this_ofs != ofs {
 509             println!("\t\t(Expected {ofs:#x}.)");
 510         }
 511         println!("\ttrailer_ofs: {next_ofs:#x}");
 512         println!("\ttrailer_len: {next_len}");
 513         if next_len < 24 || next_len % 24 != 0 {
 514             println!("\t\t(Trailer length is not positive multiple of 24.)");
 515         }
 516
 517         let zlib_data_len = next_ofs - (ofs + 8 * 3);
 518         println!(
 519             "\n{:08x}: {zlib_data_len:#x} bytes of ZLIB compressed data",
 520             ofs + 8 * 3
 521         );
 522
 523         self.skip_bytes(zlib_data_len)?;
 524
 525         println!("\n{next_ofs:08x}: ZLIB trailer fixed header");
 526         let bias: u64 = self.read_swap()?;
 527         let zero: u64 = self.read_swap()?;
 528         let block_size: u32 = self.read_swap()?;
 529         let n_blocks: u32 = self.read_swap()?;
 530         println!("\tbias: {bias}");
 531         println!("\tzero: {zero:#x}");
 532         if zero != 0 {
 533             println!("\t\t(Expected 0.)");
 534         }
 535         println!("\tblock size: {block_size:#x}");
 536         if block_size != 0x3ff000 {
 537             println!("\t\t(Expected 0x3ff000.)");
 538         }
 539         println!("\tn_blocks: {n_blocks}");
 540         if n_blocks as u64 != next_len / 24 - 1 {
 541             println!("\t\t(Expected {}.)", next_len / 24 - 1);
 542         }
 543
 544         let mut expected_uncmp_ofs = ofs;
 545         let mut expected_cmp_ofs = ofs + 24;
 546         for i in 1..=n_blocks {
 547             let blockinfo_ofs = self.r.stream_position()?;
 548             let uncompressed_ofs: u64 = self.read_swap()?;
 549             let compressed_ofs: u64 = self.read_swap()?;
 550             let uncompressed_size: u32 = self.read_swap()?;
 551             let compressed_size: u32 = self.read_swap()?;
 552
 553             println!("\n{blockinfo_ofs:08x}: ZLIB block descriptor {i}");
 554
 555             println!("\tuncompressed_ofs: {uncompressed_ofs:#x}");
 556             if uncompressed_ofs != expected_uncmp_ofs {
 557                 println!("\t\t(Expected {ofs:#x}.)");
 558             }
 559
 560             println!("\tcompressed_ofs: {compressed_ofs:#x}");
 561             if compressed_ofs != expected_cmp_ofs {
 562                 println!("\t\t(Expected {expected_cmp_ofs:#x}.)");
 563             }
 564
 565             println!("\tuncompressed_size: {uncompressed_size:#x}");
 566             if i < n_blocks && uncompressed_size != block_size {
 567                 println!("\t\t(Expected {block_size:#x}.)");
 568             }
 569
 570             println!("\tcompressed_size: {compressed_size:#x}");
 571             if i == n_blocks && compressed_ofs.checked_add(compressed_size as u64) != Some(next_ofs)
 572             {
 573                 println!(
 574                     "\t\t(This was expected to be {:#x}.)",
 575                     next_ofs - compressed_size as u64
 576                 );
 577             }
 578
 579             expected_uncmp_ofs += uncompressed_size as u64;
 580             expected_cmp_ofs += uncompressed_size as u64;
 581         }
 582         Ok(())
 583     }
 584
 585     fn read_extension_record(&mut self) -> Result<()> {
 586         let offset = self.r.stream_position()?;
 587         let subtype: u32 = self.read_swap()?;
 588         let size: u32 = self.read_swap()?;
 589         let count: u32 = self.read_swap()?;
 590         println!("{offset:08x}: Record 7, subtype {subtype}, size={size}, count={count}");
 591         match subtype {
 592             3 => self.read_machine_integer_info(size, count),
 593             4 => self.read_machine_float_info(size, count),
 594             5 => self.read_variable_sets(size, count),
 595             6 => {
 596                 // DATE variable information.  We don't use it yet, but we should.
 597                 Ok(())
 598             }
 599             7 | 19 => self.read_mrsets(size, count),
 600             10 => self.read_extra_product_info(size, count),
 601             11 => self.read_display_parameters(size, count),
 602             13 => self.read_long_string_map(size, count),
 603             _ => self.read_unknown_extension(subtype, size, count),
 604         }
 605     }
 606
 607     fn warn(&mut self, s: String) -> Result<()> {
 608         println!(
 609             "\"{}\" near offset 0x{:08x}: {s}",
 610             self.filename,
 611             self.r.stream_position()?
 612         );
 613         Ok(())
 614     }
 615
 616     fn skip_bytes(&mut self, mut n: u64) -> Result<()> {
 617         let mut buf = [0; 1024];
 618         while n > 0 {
 619             let chunk = u64::min(n, buf.len() as u64);
 620             self.r.read_exact(&mut buf[0..chunk as usize])?;
 621             n -= chunk;
 622         }
 623         Ok(())
 624     }
 625
 626     fn read_unknown_extension(&mut self, subtype: u32, size: u32, count: u32) -> Result<()> {
 627         self.warn(format!("Unrecognized record type 7, subtype {subtype}."))?;
 628         if size == 0 || count > 65536 / size {
 629             self.skip_bytes(size as u64 * count as u64)?;
 630         } else if size != 1 {
 631             let mut offset = 0;
 632             for _ in 0..count {
 633                 let vec = read_vec(&mut self.r, size as usize)?;
 634                 println!(
 635                     "{}",
 636                     HexViewBuilder::new(&vec).address_offset(offset).finish()
 637                 );
 638                 offset += size as usize;
 639             }
 640         }
 641         Ok(())
 642     }
 643
 644     fn read_variable_record(&mut self) -> Result<()> {
 645         self.n_variable_records += 1;
 646         println!(
 647             "{:08x}: variable record {}",
 648             self.r.stream_position()?,
 649             self.n_variable_records
 650         );
 651         let width: i32 = self.read_swap()?;
 652         let has_variable_label: u32 = self.read_swap()?;
 653         let missing_value_code: i32 = self.read_swap()?;
 654         let print_format: u32 = self.read_swap()?;
 655         let write_format: u32 = self.read_swap()?;
 656         let name: [u8; 8] = read_bytes(&mut self.r)?;
 657         let name: Vec<u8> = trim_end(Vec::from(name), b'\0');
 658
 659         if width >= 0 {
 660             self.n_variables += 1;
 661         }
 662         self.var_widths.push(width);
 663
 664         println!(
 665             "\tWidth: {width} ({})",
 666             match width {
 667                 _ if width > 0 => "string",
 668                 _ if width == 0 => "numeric",
 669                 _ => "long string continuation record",
 670             }
 671         );
 672
 673         println!("\tVariable label: {has_variable_label}");
 674         println!(
 675             "\tMissing values code: {missing_value_code} ({})",
 676             match missing_value_code {
 677                 0 => "no missing values",
 678                 1 => "one missing value",
 679                 2 => "two missing values",
 680                 3 => "three missing values",
 681                 -2 => "one missing value range",
 682                 -3 => "one missing value, one range",
 683                 _ => "bad value",
 684             }
 685         );
 686         for (which, format) in [("Print", print_format), ("Worite", write_format)] {
 687             let type_ = format_name(format >> 16);
 688             let w = (format >> 8) & 0xff;
 689             let d = format & 0xff;
 690             println!("\t{which} format: {format:06x} ({type_}{w}.{d})");
 691         }
 692         println!("\tName: {}", String::from_utf8_lossy(&name));
 693
 694         // Read variable label.
 695         match has_variable_label {
 696             0 => (),
 697             1 => {
 698                 let offset = self.r.stream_position()?;
 699                 let len: u32 = self.read_swap()?;
 700                 let read_len = len.min(65535) as usize;
 701                 let label = read_vec(&mut self.r, read_len)?;
 702                 println!(
 703                     "\t{offset:08x} Variable label: \"{}\"",
 704                     String::from_utf8_lossy(&label)
 705                 );
 706
 707                 self.skip_bytes((round_up(len, 4) - len).into())?;
 708             }
 709             _ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?,
 710         };
 711
 712         // Read missing values.
 713         if missing_value_code != 0 {
 714             print!("\t{:08x} Missing values:", self.r.stream_position()?);
 715             match width.cmp(&0) {
 716                 Ordering::Equal => {
 717                     let (has_range, n_individual) = match missing_value_code {
 718                         -3 => (true, 1),
 719                         -2 => (true, 0),
 720                         1 | 2 | 3 => (false, missing_value_code),
 721                         _ => Err(anyhow!(
 722                             "Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."
 723                         ))?,
 724                     };
 725                     if has_range {
 726                         let low: f64 = self.read_swap()?;
 727                         let high: f64 = self.read_swap()?;
 728                         print!(" {low}...{high}");
 729                     }
 730                     for _ in 0..n_individual {
 731                         let value: f64 = self.read_swap()?;
 732                         print!(" {value}");
 733                     }
 734                 }
 735                 Ordering::Greater => {
 736                     if !(0..=3).contains(&missing_value_code) {
 737                         Err(anyhow!(
 738                             "String missing value indicator field is not 0, 1, 2, or 3."
 739                         ))?;
 740                     }
 741                     for _ in 0..missing_value_code {
 742                         let string: [u8; 8] = read_bytes(&mut self.r)?;
 743                         let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
 744                         println!(" {}", String::from_utf8_lossy(&string));
 745                     }
 746                 }
 747                 Ordering::Less => (),
 748             }
 749             println!();
 750         }
 751
 752         Ok(())
 753     }
 754
 755     fn read_value_label_record(&mut self) -> Result<()> {
 756         println!("{:08x}: value labels record", self.r.stream_position()?);
 757
 758         // Read the labels.
 759         let n_labels: u32 = self.read_swap()?;
 760         for _ in 0..n_labels {
 761             let raw: [u8; 8] = read_bytes(&mut self.r)?;
 762             let value = UntypedValue::new(raw, self.fp_format);
 763             let label_len: u8 = self.read_swap()?;
 764             let padded_len = round_up(label_len as usize + 1, 8);
 765
 766             let mut label = read_vec(&mut self.r, padded_len)?;
 767             label.truncate(label_len as usize);
 768             let label = String::from_utf8_lossy(&label);
 769
 770             println!("\t{value}: {label}");
 771         }
 772
 773         // Read the type-4 record with the corresponding variable indexes.
 774         let rec_type: u32 = self.read_swap()?;
 775         if rec_type != 4 {
 776             Err(anyhow!(
 777                 "Variable index record (type 4) does not immediately \
 778                          follow value label record (type 3) as it should."
 779             ))?;
 780         }
 781
 782         println!("\t{:08x}: apply to variables", self.r.stream_position()?);
 783         let n_vars: u32 = self.read_swap()?;
 784         for _ in 0..n_vars {
 785             let index: u32 = self.read_swap()?;
 786             print!(" {index}");
 787         }
 788         println!();
 789
 790         Ok(())
 791     }
 792
 793     fn read_document_record(&mut self) -> Result<()> {
 794         println!("{:08x}: document record", self.r.stream_position()?);
 795         let n_lines: u32 = self.read_swap()?;
 796         println!("\t{n_lines} lines of documents");
 797
 798         for i in 0..n_lines {
 799             print!("\t{:08x}: ", self.r.stream_position()?);
 800             let line: [u8; 64] = read_bytes(&mut self.r)?;
 801             let line = trim_end(Vec::from(line), b' ');
 802             println!("line {i}: \"{}\"", String::from_utf8_lossy(&line));
 803         }
 804         Ok(())
 805     }
 806
 807     fn read_machine_integer_info(&mut self, size: u32, count: u32) -> Result<()> {
 808         let offset = self.r.stream_position()?;
 809         let version_major: u32 = self.read_swap()?;
 810         let version_minor: u32 = self.read_swap()?;
 811         let version_revision: u32 = self.read_swap()?;
 812         let machine_code: u32 = self.read_swap()?;
 813         let float_representation: u32 = self.read_swap()?;
 814         let compression_code: u32 = self.read_swap()?;
 815         let integer_representation: u32 = self.read_swap()?;
 816         let character_code: u32 = self.read_swap()?;
 817
 818         println!("{offset:08x}: machine integer info");
 819         if size != 4 || count != 8 {
 820             Err(anyhow!(
 821                 "Bad size ({size}) or count ({count}) field on record type 7, subtype 3"
 822             ))?;
 823         }
 824         println!("\tVersion: {version_major}.{version_minor}.{version_revision}");
 825         println!("\tMachine code: {machine_code}");
 826         println!(
 827             "\tFloating point representation: {float_representation} ({})",
 828             match float_representation {
 829                 1 => "IEEE 754",
 830                 2 => "IBM 370",
 831                 3 => "DEC VAX",
 832                 _ => "unknown",
 833             }
 834         );
 835         println!("\tCompression code: {compression_code}");
 836         println!(
 837             "\tEndianness: {integer_representation} ({})",
 838             match integer_representation {
 839                 1 => "big",
 840                 2 => "little",
 841                 _ => "unknown",
 842             }
 843         );
 844         println!("\tCharacter code: {character_code}");
 845         Ok(())
 846     }
 847
 848     fn read_machine_float_info(&mut self, size: u32, count: u32) -> Result<()> {
 849         let offset = self.r.stream_position()?;
 850         let sysmis: f64 = self.read_swap()?;
 851         let highest: f64 = self.read_swap()?;
 852         let lowest: f64 = self.read_swap()?;
 853
 854         println!("{offset:08x}: machine float info");
 855         if size != 4 || count != 8 {
 856             Err(anyhow!(
 857                 "Bad size ({size}) or count ({count}) field on extension 4."
 858             ))?;
 859         }
 860
 861         println!("\tsysmis: {sysmis} ({})", HexFloat(sysmis));
 862         println!("\thighest: {highest} ({})", HexFloat(highest));
 863         println!("\tlowest: {lowest} ({})", HexFloat(lowest));
 864         Ok(())
 865     }
 866
 867     fn read_variable_sets(&mut self, size: u32, count: u32) -> Result<()> {
 868         println!("{:08x}: variable sets", self.r.stream_position()?);
 869         let mut text = self.open_text_record(size, count)?;
 870         loop {
 871             while text.match_byte(b'\n') {
 872                 continue;
 873             }
 874             let set = match text.tokenize(b'=') {
 875                 Some(set) => String::from_utf8_lossy(set).into_owned(),
 876                 None => break,
 877             };
 878
 879             // Always present even for an empty set.
 880             text.match_byte(b' ');
 881
 882             match text.tokenize(b'\n') {
 883                 None => println!("\tset \"{set}\" is empty"),
 884                 Some(variables) => {
 885                     println!(
 886                         "\tset \"{set}\" contains \"{}\"",
 887                         String::from_utf8_lossy(variables).trim_end_matches('\r')
 888                     );
 889                 }
 890             };
 891         }
 892         Ok(())
 893     }
 894
 895     // Read record type 7, subtype 7.
 896     fn read_mrsets(&mut self, size: u32, count: u32) -> Result<()> {
 897         print!("{:08x}: multiple response sets", self.r.stream_position()?);
 898         let mut text = self.open_text_record(size, count)?;
 899         loop {
 900             #[derive(PartialEq, Eq)]
 901             enum MrSet {
 902                 MC,
 903                 MD,
 904             }
 905
 906             while text.match_byte(b'\n') {}
 907             let Some(name) = text.tokenize(b'=') else {
 908                 break;
 909             };
 910             let name = Vec::from(name);
 911
 912             let (mrset, cat_label_from_counted_values, label_from_var_label) = if text
 913                 .match_byte(b'C')
 914             {
 915                 if !text.match_byte(b' ') {
 916                     Err(anyhow!(
 917                         "missing space following 'C' at offset {} in mrsets record",
 918                         text.pos
 919                     ))?;
 920                 }
 921                 (MrSet::MC, false, false)
 922             } else if text.match_byte(b'D') {
 923                 (MrSet::MD, false, false)
 924             } else if text.match_byte(b'E') {
 925                 if !text.match_byte(b' ') {
 926                     Err(anyhow!(
 927                         "missing space following 'E' at offset {} in mrsets record",
 928                         text.pos
 929                     ))?;
 930                 }
 931
 932                 let pos = text.pos;
 933                 let Some(number) = text.tokenize(b' ') else {
 934                     Err(anyhow!(
 935                         "Missing label source value following `E' at offset {}u in MRSETS record",
 936                         text.pos
 937                     ))?
 938                 };
 939
 940                 let label_from_var_label = if number == b"11" {
 941                     true
 942                 } else if number == b"1" {
 943                     false
 944                 } else {
 945                     Err(anyhow!("Unexpected label source value `{}' following `E' at offset {pos} in MRSETS record", String::from_utf8_lossy(number)))?
 946                 };
 947                 (MrSet::MD, true, label_from_var_label)
 948             } else {
 949                 Err(anyhow!(
 950                     "missing `C', `D', or `E' at offset {} in mrsets record",
 951                     text.pos
 952                 ))?
 953             };
 954
 955             let counted_value = if mrset == MrSet::MD {
 956                 Some(Vec::from(text.parse_counted_string()?))
 957             } else {
 958                 None
 959             };
 960
 961             let label = Vec::from(text.parse_counted_string()?);
 962
 963             let variables = text.tokenize(b'\n');
 964
 965             print!(
 966                 "\t\"{}\": multiple {} set",
 967                 String::from_utf8_lossy(&name),
 968                 if mrset == MrSet::MC {
 969                     "category"
 970                 } else {
 971                     "dichotomy"
 972                 }
 973             );
 974             if let Some(counted_value) = counted_value {
 975                 print!(
 976                     ", counted value \"{}\"",
 977                     String::from_utf8_lossy(&counted_value)
 978                 );
 979             }
 980             if cat_label_from_counted_values {
 981                 println!(", category labels from counted values");
 982             }
 983             if label != b"" {
 984                 print!(", label \"{}\"", String::from_utf8_lossy(&label));
 985             }
 986             if label_from_var_label {
 987                 print!(", label from variable label");
 988             }
 989             if let Some(variables) = variables {
 990                 print!(", variables \"{}\"", String::from_utf8_lossy(variables));
 991             } else {
 992                 print!("no variables");
 993             }
 994             println!();
 995         }
 996         Ok(())
 997     }
 998
 999     fn read_extra_product_info(&mut self, size: u32, count: u32) -> Result<()> {
1000         print!("{:08x}: extra product info", self.r.stream_position()?);
1001         let text = self.open_text_record(size, count)?;
1002         print_string(&text.buffer);
1003         Ok(())
1004     }
1005
1006     fn read_display_parameters(&mut self, size: u32, count: u32) -> Result<()> {
1007         println!(
1008             "{:08x}: variable display parameters",
1009             self.r.stream_position()?
1010         );
1011         if size != 4 {
1012             Err(anyhow!("Bad size ({size}) on extension 11."))?;
1013         }
1014         let n_vars = self.n_variables;
1015         let includes_width = if count as usize == 3 * n_vars {
1016             true
1017         } else if count as usize == 2 * n_vars {
1018             false
1019         } else {
1020             Err(anyhow!(
1021                 "Extension 11 has bad count {count} (for {n_vars} variables)."
1022             ))?
1023         };
1024
1025         for i in 0..n_vars {
1026             let measure: u32 = self.read_swap()?;
1027             print!(
1028                 "\tVar #{i}: measure={measure} ({})",
1029                 match measure {
1030                     1 => "nominal",
1031                     2 => "ordinal",
1032                     3 => "scale",
1033                     _ => "invalid",
1034                 }
1035             );
1036
1037             if includes_width {
1038                 let width: u32 = self.read_swap()?;
1039                 print!(", width={width}");
1040             }
1041
1042             let align: u32 = self.read_swap()?;
1043             println!(
1044                 ", align={align} ({})",
1045                 match align {
1046                     0 => "left",
1047                     1 => "right",
1048                     2 => "centre",
1049                     _ => "invalid",
1050                 }
1051             );
1052         }
1053         Ok(())
1054     }
1055
1056     fn read_long_string_map(&mut self, size: u32, count: u32) -> Result<()> {
1057         print!(
1058             "{:08x}: very long strings (variable => length)",
1059             self.r.stream_position()?
1060         );
1061         let mut text = self.open_text_record(size, count)?;
1062         while let Some((var, length)) = text.read_variable_to_value_pair() {
1063             println!(
1064                 "\t{} => {}",
1065                 String::from_utf8_lossy(&var),
1066                 String::from_utf8_lossy(&length)
1067             );
1068         }
1069         Ok(())
1070     }
1071
1072     fn read_text_record(&mut self, size: u32, count: u32) -> Result<Vec<u8>> {
1073         let Some(n_bytes) = u32::checked_mul(size, count) else {
1074             Err(anyhow!("Extension record too large."))?
1075         };
1076         read_vec(&mut self.r, n_bytes as usize)
1077     }
1078
1079     fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
1080         Ok(TextRecord::new(self.read_text_record(size, count)?))
1081     }
1082 }
1083
1084 fn print_string(s: &[u8]) {
1085     if s.contains(&b'\0') {
1086         println!("{}", HexView::new(s));
1087     } else {
1088         for &c in s {
1089             match c {
1090                 b'\\' => print!("\\\\"),
1091                 b'\n' => println!(),
1092                 c if (b' '..=b'~').contains(&c) => print!("{}", c as char),
1093                 c => print!("\\{:2x}", c),
1094             }
1095         }
1096     }
1097 }
1098
1099 struct TextRecord {
1100     buffer: Vec<u8>,
1101     pos: usize,
1102 }
1103
1104 impl TextRecord {
1105     fn new(buffer: Vec<u8>) -> TextRecord {
1106         TextRecord { buffer, pos: 0 }
1107     }
1108
1109     fn tokenize(&mut self, delimiter: u8) -> Option<&[u8]> {
1110         let start = self.pos;
1111         while self.pos < self.buffer.len()
1112             && self.buffer[self.pos] != delimiter
1113             && self.buffer[self.pos] != 0
1114         {
1115             self.pos += 1
1116         }
1117         if start == self.pos {
1118             None
1119         } else {
1120             Some(&self.buffer[start..self.pos])
1121         }
1122     }
1123
1124     fn match_byte(&mut self, c: u8) -> bool {
1125         if self.pos < self.buffer.len() && self.buffer[self.pos] == c {
1126             self.pos += 1;
1127             true
1128         } else {
1129             false
1130         }
1131     }
1132
1133     fn parse_usize(&mut self) -> Result<usize> {
1134         let n_digits = self.buffer[self.pos..]
1135             .iter()
1136             .take_while(|c| c.is_ascii_digit())
1137             .count();
1138         if n_digits == 0 {
1139             Err(anyhow!("expecting digit at offset {} in record", self.pos))?;
1140         }
1141         let start = self.pos;
1142         self.pos += n_digits;
1143         let end = self.pos;
1144         let digits = str::from_utf8(&self.buffer[start..end]).unwrap();
1145         let Ok(number) = digits.parse::<usize>() else {
1146             Err(anyhow!(
1147                 "expecting number in [0,{}] at offset {} in record",
1148                 usize::MAX,
1149                 self.pos
1150             ))?
1151         };
1152         self.pos = end;
1153         Ok(number)
1154     }
1155
1156     fn get_n_bytes(&mut self, n: usize) -> Option<(usize, usize)> {
1157         let start = self.pos;
1158         let Some(end) = start.checked_add(n) else {
1159             return None;
1160         };
1161         self.pos = end;
1162         Some((start, end))
1163     }
1164
1165     fn parse_counted_string(&mut self) -> Result<&[u8]> {
1166         let length = self.parse_usize()?;
1167         if !self.match_byte(b' ') {
1168             Err(anyhow!("expecting space at offset {} in record", self.pos))?;
1169         }
1170
1171         let Some((start, end)) = self.get_n_bytes(length) else {
1172             Err(anyhow!(
1173                 "{length}-byte string starting at offset {} exceeds record length {}",
1174                 self.pos,
1175                 self.buffer.len()
1176             ))?
1177         };
1178         if !self.match_byte(b' ') {
1179             Err(anyhow!(
1180                 "expecting space at offset {} following {}-byte string",
1181                 self.pos,
1182                 end - start
1183             ))?;
1184         }
1185         Ok(&self.buffer[start..end])
1186     }
1187
1188     fn read_variable_to_value_pair(&mut self) -> Option<(Vec<u8>, Vec<u8>)> {
1189         let key = self.tokenize(b'=')?.into();
1190         let value = self.tokenize(b'\t')?.into();
1191
1192         while self.match_byte(b'\t') || self.match_byte(b'\0') {}
1193         Some((key, value))
1194     }
1195 }