pintos-os.org Git - pspp/blob - rust/src/main.rs

   1 #![allow(unused_variables)]
   2 #![allow(dead_code)]
   3 /* PSPP - a program for statistical analysis.
   4  * Copyright (C) 2023 Free Software Foundation, Inc.
   5  *
   6  * This program is free software: you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation, either version 3 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  18
  19 use anyhow::{anyhow, Result};
  20 use clap::Parser;
  21 use hexplay::HexView;
  22 use hexplay::HexViewBuilder;
  23 use num::{Float, Num};
  24 use std::cmp::Ordering;
  25 use std::fs::File;
  26 use std::io::prelude::*;
  27 use std::io::BufReader;
  28 use std::path::{Path, PathBuf};
  29 use std::str;
  30 use std::{fmt, num::FpCategory};
  31
  32 /// A utility to dissect SPSS system files.
  33 #[derive(Parser, Debug)]
  34 #[command(author, version, about, long_about = None)]
  35 struct Args {
  36     /// Maximum number of cases to print.
  37     #[arg(long = "data", default_value_t = 0)]
  38     max_cases: usize,
  39
  40     /// Files to dissect.
  41     #[arg(required = true)]
  42     files: Vec<PathBuf>,
  43 }
  44
  45 fn main() -> Result<()> {
  46     let Args { max_cases, files } = Args::parse();
  47
  48     for file in files {
  49         Dissector::new(file)?;
  50     }
  51     Ok(())
  52 }
  53
  54 #[derive(Copy, Clone, Debug)]
  55 enum Compression {
  56     Simple,
  57     ZLib,
  58 }
  59
  60 #[derive(Copy, Clone, Debug)]
  61 enum Endianness {
  62     BigEndian,
  63     LittleEndian,
  64 }
  65 use Endianness::*;
  66
  67 trait Parse<T, const N: usize> {
  68     fn parse(self, bytes: [u8; N]) -> T;
  69 }
  70 impl Parse<u64, 8> for Endianness {
  71     fn parse(self, bytes: [u8; 8]) -> u64 {
  72         match self {
  73             BigEndian => u64::from_be_bytes(bytes),
  74             LittleEndian => u64::from_le_bytes(bytes),
  75         }
  76     }
  77 }
  78 impl Parse<u32, 4> for Endianness {
  79     fn parse(self, bytes: [u8; 4]) -> u32 {
  80         match self {
  81             BigEndian => u32::from_be_bytes(bytes),
  82             LittleEndian => u32::from_le_bytes(bytes),
  83         }
  84     }
  85 }
  86 impl Parse<u16, 2> for Endianness {
  87     fn parse(self, bytes: [u8; 2]) -> u16 {
  88         match self {
  89             BigEndian => u16::from_be_bytes(bytes),
  90             LittleEndian => u16::from_le_bytes(bytes),
  91         }
  92     }
  93 }
  94 impl Parse<u8, 1> for Endianness {
  95     fn parse(self, bytes: [u8; 1]) -> u8 {
  96         match self {
  97             BigEndian => u8::from_be_bytes(bytes),
  98             LittleEndian => u8::from_le_bytes(bytes),
  99         }
 100     }
 101 }
 102 impl Parse<i64, 8> for Endianness {
 103     fn parse(self, bytes: [u8; 8]) -> i64 {
 104         match self {
 105             BigEndian => i64::from_be_bytes(bytes),
 106             LittleEndian => i64::from_le_bytes(bytes),
 107         }
 108     }
 109 }
 110 impl Parse<i32, 4> for Endianness {
 111     fn parse(self, bytes: [u8; 4]) -> i32 {
 112         match self {
 113             BigEndian => i32::from_be_bytes(bytes),
 114             LittleEndian => i32::from_le_bytes(bytes),
 115         }
 116     }
 117 }
 118 impl Parse<i16, 2> for Endianness {
 119     fn parse(self, bytes: [u8; 2]) -> i16 {
 120         match self {
 121             BigEndian => i16::from_be_bytes(bytes),
 122             LittleEndian => i16::from_le_bytes(bytes),
 123         }
 124     }
 125 }
 126 impl Parse<i8, 1> for Endianness {
 127     fn parse(self, bytes: [u8; 1]) -> i8 {
 128         match self {
 129             BigEndian => i8::from_be_bytes(bytes),
 130             LittleEndian => i8::from_le_bytes(bytes),
 131         }
 132     }
 133 }
 134 impl Parse<f64, 8> for Endianness {
 135     fn parse(self, bytes: [u8; 8]) -> f64 {
 136         match self {
 137             BigEndian => f64::from_be_bytes(bytes),
 138             LittleEndian => f64::from_le_bytes(bytes),
 139         }
 140     }
 141 }
 142
 143 fn read_bytes<const N: usize>(r: &mut BufReader<File>) -> Result<[u8; N]> {
 144     let mut buf = [0; N];
 145     r.read_exact(&mut buf)?;
 146     Ok(buf)
 147 }
 148
 149 fn read_vec(r: &mut BufReader<File>, n: usize) -> Result<Vec<u8>> {
 150     let mut vec = vec![0; n];
 151     r.read_exact(&mut vec)?;
 152     Ok(vec)
 153 }
 154
 155 trait ReadSwap<T> {
 156     fn read_swap(&mut self) -> Result<T>;
 157 }
 158
 159 impl ReadSwap<u32> for Dissector {
 160     fn read_swap(&mut self) -> Result<u32> {
 161         Ok(self.endianness.parse(read_bytes(&mut self.r)?))
 162     }
 163 }
 164 impl ReadSwap<u8> for Dissector {
 165     fn read_swap(&mut self) -> Result<u8> {
 166         Ok(self.endianness.parse(read_bytes(&mut self.r)?))
 167     }
 168 }
 169
 170 impl ReadSwap<i32> for Dissector {
 171     fn read_swap(&mut self) -> Result<i32> {
 172         Ok(self.endianness.parse(read_bytes(&mut self.r)?))
 173     }
 174 }
 175
 176 impl ReadSwap<f64> for Dissector {
 177     fn read_swap(&mut self) -> Result<f64> {
 178         Ok(self.endianness.parse(read_bytes(&mut self.r)?))
 179     }
 180 }
 181
 182 struct Dissector {
 183     filename: String,
 184     r: BufReader<File>,
 185     compression: Option<Compression>,
 186     endianness: Endianness,
 187     fp_format: Endianness,
 188     bias: f64,
 189     n_variable_records: usize,
 190     n_variables: usize,
 191     var_widths: Vec<i32>,
 192 }
 193
 194 fn detect_endianness(layout_code: [u8; 4]) -> Option<Endianness> {
 195     for endianness in [BigEndian, LittleEndian] {
 196         match endianness.parse(layout_code) {
 197             2 | 3 => return Some(endianness),
 198             _ => (),
 199         }
 200     }
 201     None
 202 }
 203
 204 fn detect_fp_format(bias: [u8; 8]) -> Option<Endianness> {
 205     for endianness in [BigEndian, LittleEndian] {
 206         let value: f64 = endianness.parse(bias);
 207         if value == 100.0 {
 208             return Some(endianness);
 209         }
 210     }
 211     None
 212 }
 213
 214 fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
 215     while s.last() == Some(&c) {
 216         s.pop();
 217     }
 218     s
 219 }
 220
 221 fn slice_trim_end(mut s: &[u8], c: u8) -> &[u8] {
 222     while s.last() == Some(&c) {
 223         s = s.split_last().unwrap().1;
 224     }
 225     s
 226 }
 227
 228 fn format_name(type_: u32) -> &'static str {
 229     match type_ {
 230         1 => "A",
 231         2 => "AHEX",
 232         3 => "COMMA",
 233         4 => "DOLLAR",
 234         5 => "F",
 235         6 => "IB",
 236         7 => "PIBHEX",
 237         8 => "P",
 238         9 => "PIB",
 239         10 => "PK",
 240         11 => "RB",
 241         12 => "RBHEX",
 242         15 => "Z",
 243         16 => "N",
 244         17 => "E",
 245         20 => "DATE",
 246         21 => "TIME",
 247         22 => "DATETIME",
 248         23 => "ADATE",
 249         24 => "JDATE",
 250         25 => "DTIME",
 251         26 => "WKDAY",
 252         27 => "MONTH",
 253         28 => "MOYR",
 254         29 => "QYR",
 255         30 => "WKYR",
 256         31 => "PCT",
 257         32 => "DOT",
 258         33 => "CCA",
 259         34 => "CCB",
 260         35 => "CCC",
 261         36 => "CCD",
 262         37 => "CCE",
 263         38 => "EDATE",
 264         39 => "SDATE",
 265         40 => "MTIME",
 266         41 => "YMDHMS",
 267         _ => "invalid",
 268     }
 269 }
 270
 271 fn round_up<T: Num + Copy>(x: T, y: T) -> T {
 272     (x + (y - T::one())) / y * y
 273 }
 274
 275 struct UntypedValue {
 276     raw: [u8; 8],
 277     endianness: Endianness,
 278 }
 279
 280 impl UntypedValue {
 281     fn new(raw: [u8; 8], endianness: Endianness) -> UntypedValue {
 282         UntypedValue { raw, endianness }
 283     }
 284 }
 285
 286 impl fmt::Display for UntypedValue {
 287     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 288         let numeric: f64 = self.endianness.parse(self.raw);
 289         let n_printable = self
 290             .raw
 291             .iter()
 292             .take_while(|&&x| x == b' ' || x.is_ascii_graphic())
 293             .count();
 294         let printable_prefix = std::str::from_utf8(&self.raw[0..n_printable]).unwrap();
 295         write!(f, "{numeric}/\"{printable_prefix}\"")
 296     }
 297 }
 298
 299 struct HexFloat<T: Float>(T);
 300
 301 impl<T: Float> fmt::Display for HexFloat<T> {
 302     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 303         let sign = if self.0.is_sign_negative() { "-" } else { "" };
 304         match self.0.classify() {
 305             FpCategory::Nan => return write!(f, "NaN"),
 306             FpCategory::Infinite => return write!(f, "{sign}Infinity"),
 307             FpCategory::Zero => return write!(f, "{sign}0.0"),
 308             _ => (),
 309         };
 310         let (significand, mut exponent, _) = self.0.integer_decode();
 311         let mut hex_sig = format!("{:x}", significand);
 312         while hex_sig.ends_with('0') {
 313             hex_sig.pop();
 314             exponent += 4;
 315         }
 316         match hex_sig.len() {
 317             0 => write!(f, "{sign}0.0"),
 318             1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
 319             len => write!(
 320                 f,
 321                 "{sign}0x{}.{}p{}",
 322                 hex_sig.chars().next().unwrap(),
 323                 &hex_sig[1..],
 324                 exponent + 4 * (len as i16 - 1)
 325             ),
 326         }
 327     }
 328 }
 329
 330 #[cfg(test)]
 331 mod hex_float_tests {
 332     use crate::HexFloat;
 333     use num::Float;
 334
 335     #[test]
 336     fn test() {
 337         assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
 338         assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
 339         assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
 340         assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
 341         assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
 342         assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
 343         assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
 344         assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
 345     }
 346 }
 347
 348 impl Dissector {
 349     fn new<P: AsRef<Path>>(filename: P) -> Result<Dissector> {
 350         let mut r = BufReader::new(File::open(&filename)?);
 351         let filename = filename.as_ref().to_string_lossy().into_owned();
 352         let rec_type: [u8; 4] = read_bytes(&mut r)?;
 353         let zmagic = match &rec_type {
 354             b"$FL2" => false,
 355             b"$FL3" => true,
 356             _ => Err(anyhow!("This is not an SPSS system file."))?,
 357         };
 358
 359         let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
 360         let layout_code: [u8; 4] = read_bytes(&mut r)?;
 361         let endianness = detect_endianness(layout_code)
 362             .ok_or_else(|| anyhow!("This is not an SPSS system file."))?;
 363         let layout_code: u32 = endianness.parse(layout_code);
 364         let _nominal_case_size: [u8; 4] = read_bytes(&mut r)?;
 365         let compressed: u32 = endianness.parse(read_bytes(&mut r)?);
 366         let compression = match (zmagic, compressed) {
 367             (false, 0) => None,
 368             (false, 1) => Some(Compression::Simple),
 369             (true, 2) => Some(Compression::ZLib),
 370             _ => Err(anyhow!(
 371                 "{} file header has invalid compression value {compressed}.",
 372                 if zmagic { "ZSAV" } else { "SAV" }
 373             ))?,
 374         };
 375
 376         let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
 377         let n_cases: u32 = endianness.parse(read_bytes(&mut r)?);
 378
 379         let bias: [u8; 8] = read_bytes(&mut r)?;
 380         let fp_format = detect_fp_format(bias)
 381             .unwrap_or_else(|| { eprintln!("Compression bias is not the usual value of 100, or system file uses unrecognized floating-point format."); endianness });
 382         let bias: f64 = fp_format.parse(bias);
 383
 384         let mut d = Dissector {
 385             filename,
 386             r,
 387             compression,
 388             endianness,
 389             fp_format,
 390             bias,
 391             n_variable_records: 0,
 392             n_variables: 0,
 393             var_widths: Vec::new(),
 394         };
 395
 396         let creation_date: [u8; 9] = read_bytes(&mut d.r)?;
 397         let creation_time: [u8; 8] = read_bytes(&mut d.r)?;
 398         let file_label: [u8; 64] = read_bytes(&mut d.r)?;
 399         let file_label = trim_end(Vec::from(file_label), b' ');
 400         d.skip_bytes(3)?;
 401
 402         println!("File header record:");
 403         println!(
 404             "{:>17}: {}",
 405             "Product name",
 406             String::from_utf8_lossy(&eye_catcher)
 407         );
 408         println!("{:>17}: {}", "Layout code", layout_code);
 409         println!(
 410             "{:>17}: {} ({})",
 411             "Compressed",
 412             compressed,
 413             match compression {
 414                 None => "no compression",
 415                 Some(Compression::Simple) => "simple compression",
 416                 Some(Compression::ZLib) => "ZLIB compression",
 417             }
 418         );
 419         println!("{:>17}: {}", "Weight index", weight_index);
 420         println!("{:>17}: {}", "Number of cases", n_cases);
 421         println!("{:>17}: {}", "Compression bias", bias);
 422         println!(
 423             "{:>17}: {}",
 424             "Creation date",
 425             String::from_utf8_lossy(&creation_date)
 426         );
 427         println!(
 428             "{:>17}: {}",
 429             "Creation time",
 430             String::from_utf8_lossy(&creation_time)
 431         );
 432         println!(
 433             "{:>17}: \"{}\"",
 434             "File label",
 435             String::from_utf8_lossy(&file_label)
 436         );
 437
 438         loop {
 439             let rec_type: u32 = d.read_swap()?;
 440             match rec_type {
 441                 2 => d.read_variable_record()?,
 442                 3 => d.read_value_label_record()?,
 443                 4 => Err(anyhow!("Misplaced type 4 record."))?,
 444                 6 => d.read_document_record()?,
 445                 7 => d.read_extension_record()?,
 446                 999 => break,
 447                 _ => Err(anyhow!("Unrecognized record type {rec_type}."))?,
 448             }
 449         }
 450
 451         let pos = d.r.stream_position()?;
 452         println!(
 453             "{:08x}: end-of-dictionary record (first byte of data at {:0x})",
 454             pos,
 455             pos + 4
 456         );
 457
 458         Ok(d)
 459     }
 460
 461     fn read_extension_record(&mut self) -> Result<()> {
 462         let offset = self.r.stream_position()?;
 463         let subtype: u32 = self.read_swap()?;
 464         let size: u32 = self.read_swap()?;
 465         let count: u32 = self.read_swap()?;
 466         println!("{offset:08x}: Record 7, subtype {subtype}, size={size}, count={count}");
 467         match subtype {
 468             3 => self.read_machine_integer_info(size, count),
 469             4 => self.read_machine_float_info(size, count),
 470             5 => self.read_variable_sets(size, count),
 471             6 => {
 472                 // DATE variable information.  We don't use it yet, but we should.
 473                 Ok(())
 474             }
 475             7 | 19 => self.read_mrsets(size, count),
 476             10 => self.read_extra_product_info(size, count),
 477             11 => self.read_display_parameters(size, count),
 478             _ => self.read_unknown_extension(subtype, size, count),
 479         }
 480     }
 481
 482     fn warn(&mut self, s: String) -> Result<()> {
 483         println!(
 484             "\"{}\" near offset 0x{:08x}: {s}",
 485             self.filename,
 486             self.r.stream_position()?
 487         );
 488         Ok(())
 489     }
 490
 491     fn skip_bytes(&mut self, mut n: u64) -> Result<()> {
 492         let mut buf = [0; 1024];
 493         while n > 0 {
 494             let chunk = u64::min(n, buf.len() as u64);
 495             self.r.read_exact(&mut buf[0..chunk as usize])?;
 496             n -= chunk;
 497         }
 498         Ok(())
 499     }
 500
 501     fn read_unknown_extension(&mut self, subtype: u32, size: u32, count: u32) -> Result<()> {
 502         self.warn(format!("Unrecognized record type 7, subtype {subtype}."))?;
 503         if size == 0 || count > 65536 / size {
 504             self.skip_bytes(size as u64 * count as u64)?;
 505         } else if size != 1 {
 506             let mut offset = 0;
 507             for _ in 0..count {
 508                 let vec = read_vec(&mut self.r, size as usize)?;
 509                 println!(
 510                     "{}",
 511                     HexViewBuilder::new(&vec).address_offset(offset).finish()
 512                 );
 513                 offset += size as usize;
 514             }
 515         }
 516         Ok(())
 517     }
 518
 519     fn read_variable_record(&mut self) -> Result<()> {
 520         self.n_variable_records += 1;
 521         println!(
 522             "{:08x}: variable record {}",
 523             self.r.stream_position()?,
 524             self.n_variable_records
 525         );
 526         let width: i32 = self.read_swap()?;
 527         let has_variable_label: u32 = self.read_swap()?;
 528         let missing_value_code: i32 = self.read_swap()?;
 529         let print_format: u32 = self.read_swap()?;
 530         let write_format: u32 = self.read_swap()?;
 531         let name: [u8; 8] = read_bytes(&mut self.r)?;
 532         let name: Vec<u8> = trim_end(Vec::from(name), b'\0');
 533
 534         if width >= 0 {
 535             self.n_variables += 1;
 536         }
 537         self.var_widths.push(width);
 538
 539         println!(
 540             "\tWidth: {width} ({})",
 541             match width {
 542                 _ if width > 0 => "string",
 543                 _ if width == 0 => "numeric",
 544                 _ => "long string continuation record",
 545             }
 546         );
 547
 548         println!("\tVariable label: {has_variable_label}");
 549         println!(
 550             "\tMissing values code: {missing_value_code} ({})",
 551             match missing_value_code {
 552                 0 => "no missing values",
 553                 1 => "one missing value",
 554                 2 => "two missing values",
 555                 3 => "three missing values",
 556                 -2 => "one missing value range",
 557                 -3 => "one missing value, one range",
 558                 _ => "bad value",
 559             }
 560         );
 561         for (which, format) in [("Print", print_format), ("Worite", write_format)] {
 562             let type_ = format_name(format >> 16);
 563             let w = (format >> 8) & 0xff;
 564             let d = format & 0xff;
 565             println!("\t{which} format: {format:06x} ({type_}{w}.{d})");
 566         }
 567         println!("\tName: {}", String::from_utf8_lossy(&name));
 568
 569         // Read variable label.
 570         match has_variable_label {
 571             0 => (),
 572             1 => {
 573                 let offset = self.r.stream_position()?;
 574                 let len: u32 = self.read_swap()?;
 575                 let read_len = len.min(65535) as usize;
 576                 let label = read_vec(&mut self.r, read_len)?;
 577                 println!(
 578                     "\t{offset:08x} Variable label: \"{}\"",
 579                     String::from_utf8_lossy(&label)
 580                 );
 581
 582                 self.skip_bytes((round_up(len, 4) - len).into())?;
 583             }
 584             _ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?,
 585         };
 586
 587         // Read missing values.
 588         if missing_value_code != 0 {
 589             print!("\t{:08x} Missing values:", self.r.stream_position()?);
 590             match width.cmp(&0) {
 591                 Ordering::Equal => {
 592                     let (has_range, n_individual) = match missing_value_code {
 593                         -3 => (true, 1),
 594                         -2 => (true, 0),
 595                         1 | 2 | 3 => (false, missing_value_code),
 596                         _ => Err(anyhow!(
 597                             "Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."
 598                         ))?,
 599                     };
 600                     if has_range {
 601                         let low: f64 = self.read_swap()?;
 602                         let high: f64 = self.read_swap()?;
 603                         print!(" {low}...{high}");
 604                     }
 605                     for _ in 0..n_individual {
 606                         let value: f64 = self.read_swap()?;
 607                         print!(" {value}");
 608                     }
 609                 }
 610                 Ordering::Greater => {
 611                     if !(0..=3).contains(&missing_value_code) {
 612                         Err(anyhow!(
 613                             "String missing value indicator field is not 0, 1, 2, or 3."
 614                         ))?;
 615                     }
 616                     for _ in 0..missing_value_code {
 617                         let string: [u8; 8] = read_bytes(&mut self.r)?;
 618                         let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
 619                         println!(" {}", String::from_utf8_lossy(&string));
 620                     }
 621                 }
 622                 Ordering::Less => (),
 623             }
 624             println!();
 625         }
 626
 627         Ok(())
 628     }
 629
 630     fn read_value_label_record(&mut self) -> Result<()> {
 631         println!("{:08x}: value labels record", self.r.stream_position()?);
 632
 633         // Read the labels.
 634         let n_labels: u32 = self.read_swap()?;
 635         for _ in 0..n_labels {
 636             let raw: [u8; 8] = read_bytes(&mut self.r)?;
 637             let value = UntypedValue::new(raw, self.fp_format);
 638             let label_len: u8 = self.read_swap()?;
 639             let padded_len = round_up(label_len as usize + 1, 8);
 640
 641             let mut label = read_vec(&mut self.r, padded_len)?;
 642             label.truncate(label_len as usize);
 643             let label = String::from_utf8_lossy(&label);
 644
 645             println!("\t{value}: {label}");
 646         }
 647
 648         // Read the type-4 record with the corresponding variable indexes.
 649         let rec_type: u32 = self.read_swap()?;
 650         if rec_type != 4 {
 651             Err(anyhow!(
 652                 "Variable index record (type 4) does not immediately \
 653                          follow value label record (type 3) as it should."
 654             ))?;
 655         }
 656
 657         println!("\t{:08x}: apply to variables", self.r.stream_position()?);
 658         let n_vars: u32 = self.read_swap()?;
 659         for _ in 0..n_vars {
 660             let index: u32 = self.read_swap()?;
 661             print!(" {index}");
 662         }
 663         println!();
 664
 665         Ok(())
 666     }
 667
 668     fn read_document_record(&mut self) -> Result<()> {
 669         println!("{:08x}: document record", self.r.stream_position()?);
 670         let n_lines: u32 = self.read_swap()?;
 671         println!("\t{n_lines} lines of documents");
 672
 673         for i in 0..n_lines {
 674             print!("\t{:08x}: ", self.r.stream_position()?);
 675             let line: [u8; 64] = read_bytes(&mut self.r)?;
 676             let line = trim_end(Vec::from(line), b' ');
 677             println!("line {i}: \"{}\"", String::from_utf8_lossy(&line));
 678         }
 679         Ok(())
 680     }
 681
 682     fn read_machine_integer_info(&mut self, size: u32, count: u32) -> Result<()> {
 683         let offset = self.r.stream_position()?;
 684         let version_major: u32 = self.read_swap()?;
 685         let version_minor: u32 = self.read_swap()?;
 686         let version_revision: u32 = self.read_swap()?;
 687         let machine_code: u32 = self.read_swap()?;
 688         let float_representation: u32 = self.read_swap()?;
 689         let compression_code: u32 = self.read_swap()?;
 690         let integer_representation: u32 = self.read_swap()?;
 691         let character_code: u32 = self.read_swap()?;
 692
 693         println!("{offset:08x}: machine integer info");
 694         if size != 4 || count != 8 {
 695             Err(anyhow!(
 696                 "Bad size ({size}) or count ({count}) field on record type 7, subtype 3"
 697             ))?;
 698         }
 699         println!("\tVersion: {version_major}.{version_minor}.{version_revision}");
 700         println!("\tMachine code: {machine_code}");
 701         println!(
 702             "\tFloating point representation: {float_representation} ({})",
 703             match float_representation {
 704                 1 => "IEEE 754",
 705                 2 => "IBM 370",
 706                 3 => "DEC VAX",
 707                 _ => "unknown",
 708             }
 709         );
 710         println!("\tCompression code: {compression_code}");
 711         println!(
 712             "\tEndianness: {integer_representation} ({})",
 713             match integer_representation {
 714                 1 => "big",
 715                 2 => "little",
 716                 _ => "unknown",
 717             }
 718         );
 719         println!("\tCharacter code: {character_code}");
 720         Ok(())
 721     }
 722
 723     fn read_machine_float_info(&mut self, size: u32, count: u32) -> Result<()> {
 724         let offset = self.r.stream_position()?;
 725         let sysmis: f64 = self.read_swap()?;
 726         let highest: f64 = self.read_swap()?;
 727         let lowest: f64 = self.read_swap()?;
 728
 729         println!("{offset:08x}: machine float info");
 730         if size != 4 || count != 8 {
 731             Err(anyhow!(
 732                 "Bad size ({size}) or count ({count}) field on extension 4."
 733             ))?;
 734         }
 735
 736         println!("\tsysmis: {sysmis} ({})", HexFloat(sysmis));
 737         println!("\thighest: {highest} ({})", HexFloat(highest));
 738         println!("\tlowest: {lowest} ({})", HexFloat(lowest));
 739         Ok(())
 740     }
 741
 742     fn read_variable_sets(&mut self, size: u32, count: u32) -> Result<()> {
 743         println!("{:08x}: variable sets", self.r.stream_position()?);
 744         let mut text = self.open_text_record(size, count)?;
 745         loop {
 746             while text.match_byte(b'\n') {
 747                 continue;
 748             }
 749             let set = match text.tokenize(b'=') {
 750                 Some(set) => String::from_utf8_lossy(set).into_owned(),
 751                 None => break,
 752             };
 753
 754             // Always present even for an empty set.
 755             text.match_byte(b' ');
 756
 757             match text.tokenize(b'\n') {
 758                 None => println!("\tset \"{set}\" is empty"),
 759                 Some(variables) => {
 760                     println!(
 761                         "\tset \"{set}\" contains \"{}\"",
 762                         String::from_utf8_lossy(variables).trim_end_matches('\r')
 763                     );
 764                 }
 765             };
 766         }
 767         Ok(())
 768     }
 769
 770     // Read record type 7, subtype 7.
 771     fn read_mrsets(&mut self, size: u32, count: u32) -> Result<()> {
 772         print!("{:08x}: multiple response sets", self.r.stream_position()?);
 773         let mut text = self.open_text_record(size, count)?;
 774         loop {
 775             #[derive(PartialEq, Eq)]
 776             enum MrSet {
 777                 MC,
 778                 MD,
 779             }
 780
 781             while text.match_byte(b'\n') {}
 782             let Some(name) = text.tokenize(b'=') else {
 783                 break;
 784             };
 785
 786             let (mrset, cat_label_from_counted_values, label_from_var_label) = if text
 787                 .match_byte(b'C')
 788             {
 789                 if !text.match_byte(b' ') {
 790                     Err(anyhow!(
 791                         "missing space following 'C' at offset {} in mrsets record",
 792                         text.pos
 793                     ))?;
 794                 }
 795                 (MrSet::MC, false, false)
 796             } else if text.match_byte(b'D') {
 797                 (MrSet::MD, false, false)
 798             } else if text.match_byte(b'E') {
 799                 if !text.match_byte(b' ') {
 800                     Err(anyhow!(
 801                         "missing space following 'E' at offset {} in mrsets record",
 802                         text.pos
 803                     ))?;
 804                 }
 805
 806                 let pos = text.pos;
 807                 let Some(number) = text.tokenize(b' ') else {
 808                     Err(anyhow!(
 809                         "Missing label source value following `E' at offset {}u in MRSETS record",
 810                         text.pos
 811                     ))?
 812                 };
 813
 814                 let label_from_var_label = if number == b"11" {
 815                     true
 816                 } else if number == b"1" {
 817                     false
 818                 } else {
 819                     Err(anyhow!("Unexpected label source value `{}' following `E' at offset {pos} in MRSETS record", String::from_utf8_lossy(number)))?
 820                 };
 821                 (MrSet::MD, true, label_from_var_label)
 822             } else {
 823                 Err(anyhow!(
 824                     "missing `C', `D', or `E' at offset {} in mrsets record",
 825                     text.pos
 826                 ))?
 827             };
 828
 829             let counted_value = if mrset == MrSet::MD {
 830                 Some(text.parse_counted_string()?)
 831             } else { None };
 832
 833             let label = text.parse_counted_string()?;
 834
 835             let variables = text.tokenize(b'\n');
 836
 837             print!("\t\"{}\": multiple {} set",
 838                    String::from_utf8_lossy(name),
 839                    if mrset == MrSet::MC { "category" } else { "dichotomy" });
 840
 841         }
 842         Ok(())
 843     }
 844
 845     fn read_extra_product_info(&mut self, size: u32, count: u32) -> Result<()> {
 846         print!("{:08x}: extra product info", self.r.stream_position()?);
 847         let text = self.open_text_record(size, count)?;
 848         print_string(&text.buffer);
 849         Ok(())
 850     }
 851
 852     fn read_display_parameters(&mut self, size: u32, count: u32) -> Result<()> {
 853         println!(
 854             "{:08x}: variable display parameters",
 855             self.r.stream_position()?
 856         );
 857         if size != 4 {
 858             Err(anyhow!("Bad size ({size}) on extension 11."))?;
 859         }
 860         let n_vars = self.n_variables;
 861         let includes_width = if count as usize == 3 * n_vars {
 862             true
 863         } else if count as usize == 2 * n_vars {
 864             false
 865         } else {
 866             Err(anyhow!(
 867                 "Extension 11 has bad count {count} (for {n_vars} variables)."
 868             ))?
 869         };
 870
 871         for i in 0..n_vars {
 872             let measure: u32 = self.read_swap()?;
 873             print!(
 874                 "\tVar #{i}: measure={measure} ({})",
 875                 match measure {
 876                     1 => "nominal",
 877                     2 => "ordinal",
 878                     3 => "scale",
 879                     _ => "invalid",
 880                 }
 881             );
 882
 883             if includes_width {
 884                 let width: u32 = self.read_swap()?;
 885                 print!(", width={width}");
 886             }
 887
 888             let align: u32 = self.read_swap()?;
 889             println!(
 890                 ", align={align} ({})",
 891                 match align {
 892                     0 => "left",
 893                     1 => "right",
 894                     2 => "centre",
 895                     _ => "invalid",
 896                 }
 897             );
 898         }
 899         Ok(())
 900     }
 901
 902     fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
 903         let n_bytes = match u32::checked_mul(size, count) {
 904             Some(n) => n,
 905             None => Err(anyhow!("Extension record too large."))?,
 906         };
 907         Ok(TextRecord::new(read_vec(&mut self.r, n_bytes as usize)?))
 908     }
 909 }
 910
 911 fn print_string(s: &[u8]) {
 912     if s.contains(&b'\0') {
 913         println!("{}", HexView::new(s));
 914     } else {
 915         for &c in s {
 916             match c {
 917                 b'\\' => print!("\\\\"),
 918                 b'\n' => println!(),
 919                 c if (b' '..=b'~').contains(&c) => print!("{}", c as char),
 920                 c => print!("\\{:2x}", c),
 921             }
 922         }
 923     }
 924 }
 925
 926 struct TextRecord {
 927     buffer: Vec<u8>,
 928     pos: usize,
 929 }
 930
 931 impl TextRecord {
 932     fn new(buffer: Vec<u8>) -> TextRecord {
 933         TextRecord { buffer, pos: 0 }
 934     }
 935
 936     fn tokenize(&mut self, delimiter: u8) -> Option<&[u8]> {
 937         let start = self.pos;
 938         while self.pos < self.buffer.len()
 939             && self.buffer[self.pos] != delimiter
 940             && self.buffer[self.pos] != 0
 941         {
 942             self.pos += 1
 943         }
 944         if start == self.pos {
 945             None
 946         } else {
 947             Some(&self.buffer[start..self.pos])
 948         }
 949     }
 950
 951     fn match_byte(&mut self, c: u8) -> bool {
 952         if self.pos < self.buffer.len() && self.buffer[self.pos] == c {
 953             self.pos += 1;
 954             true
 955         } else {
 956             false
 957         }
 958     }
 959
 960     fn parse_usize(&mut self) -> Result<usize> {
 961         let n_digits = self.buffer[self.pos..]
 962             .iter()
 963             .take_while(|c| c.is_ascii_digit())
 964             .count();
 965         if n_digits == 0 {
 966             Err(anyhow!("expecting digit at offset {} in record", self.pos))?;
 967         }
 968         let start = self.pos;
 969         self.pos += n_digits;
 970         let end = self.pos;
 971         let digits = str::from_utf8(&self.buffer[start..end]).unwrap();
 972         let Ok(number) = digits.parse::<usize>() else {
 973             Err(anyhow!(
 974                 "expecting number in [0,{}] at offset {} in record",
 975                 usize::MAX,
 976                 self.pos
 977             ))?
 978         };
 979         self.pos = end;
 980         Ok(number)
 981     }
 982
 983     fn get_n_bytes(&mut self, n: usize) -> Option<(usize, usize)> {
 984         let start = self.pos;
 985         let Some(end) = start.checked_add(n) else {
 986             return None;
 987         };
 988         self.pos = end;
 989         Some((start, end))
 990     }
 991
 992     fn parse_counted_string(&mut self) -> Result<&[u8]> {
 993         let length = self.parse_usize()?;
 994         if !self.match_byte(b' ') {
 995             Err(anyhow!("expecting space at offset {} in record", self.pos))?;
 996         }
 997
 998         let Some((start, end)) = self.get_n_bytes(length) else {
 999             Err(anyhow!("{length}-byte string starting at offset {} exceeds record length {}",
1000                         self.pos, self.buffer.len()))?
1001         };
1002         if !self.match_byte(b' ') {
1003             Err(anyhow!(
1004                 "expecting space at offset {} following {}-byte string",
1005                 self.pos,
1006                 end - start
1007             ))?;
1008         }
1009         Ok(&self.buffer[start..end])
1010     }
1011 }