1 /* PSPP - a program for statistical analysis.
2 * Copyright (C) 2023 Free Software Foundation, Inc.
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 use anyhow::{anyhow, Result};
20 use hexplay::HexViewBuilder;
22 use std::cmp::Ordering;
23 use std::collections::VecDeque;
26 use std::io::prelude::*;
27 use std::io::BufReader;
28 use std::io::ErrorKind;
29 use std::path::{Path, PathBuf};
33 use hexfloat::HexFloat;
35 const ID_MAX_LEN: u32 = 64;
37 /// A utility to dissect SPSS system files.
38 #[derive(Parser, Debug)]
39 #[command(author, version, about, long_about = None)]
41 /// Maximum number of cases to print.
42 #[arg(long = "data", default_value_t = 0)]
46 #[arg(required = true)]
/// Program entry point: parses the command line into `Args` (a case limit
/// and a list of files) and constructs a `Dissector` for `file`, propagating
/// any error it reports.
50 fn main() -> Result<()> {
51 let Args { max_cases, files } = Args::parse();
54 Dissector::new(file, max_cases)?;
59 #[derive(Copy, Clone, Debug)]
65 #[derive(Copy, Clone, Debug)]
/// Decoding of a fixed-size byte array `[u8; N]` into a value of type `T`.
72 trait Parse<T, const N: usize> {
/// Consumes `self` and the `N` raw bytes and returns the decoded value.
73 fn parse(self, bytes: [u8; N]) -> T;
// Decodes 8 bytes as a `u64`: big-endian via `from_be_bytes`, little-endian
// via `from_le_bytes`.
75 impl Parse<u64, 8> for Endianness {
76 fn parse(self, bytes: [u8; 8]) -> u64 {
78 BigEndian => u64::from_be_bytes(bytes),
79 LittleEndian => u64::from_le_bytes(bytes),
// Decodes 4 bytes as a `u32`: big-endian via `from_be_bytes`, little-endian
// via `from_le_bytes`.
83 impl Parse<u32, 4> for Endianness {
84 fn parse(self, bytes: [u8; 4]) -> u32 {
86 BigEndian => u32::from_be_bytes(bytes),
87 LittleEndian => u32::from_le_bytes(bytes),
// Decodes 2 bytes as a `u16`: big-endian via `from_be_bytes`, little-endian
// via `from_le_bytes`.
91 impl Parse<u16, 2> for Endianness {
92 fn parse(self, bytes: [u8; 2]) -> u16 {
94 BigEndian => u16::from_be_bytes(bytes),
95 LittleEndian => u16::from_le_bytes(bytes),
// Decodes 1 byte as a `u8`. Both arms yield the same value for a single
// byte; the impl exists so `u8` can be read through the same `Parse` API.
99 impl Parse<u8, 1> for Endianness {
100 fn parse(self, bytes: [u8; 1]) -> u8 {
102 BigEndian => u8::from_be_bytes(bytes),
103 LittleEndian => u8::from_le_bytes(bytes),
// Decodes 8 bytes as an `i64`: big-endian via `from_be_bytes`, little-endian
// via `from_le_bytes`.
107 impl Parse<i64, 8> for Endianness {
108 fn parse(self, bytes: [u8; 8]) -> i64 {
110 BigEndian => i64::from_be_bytes(bytes),
111 LittleEndian => i64::from_le_bytes(bytes),
// Decodes 4 bytes as an `i32`: big-endian via `from_be_bytes`, little-endian
// via `from_le_bytes`.
115 impl Parse<i32, 4> for Endianness {
116 fn parse(self, bytes: [u8; 4]) -> i32 {
118 BigEndian => i32::from_be_bytes(bytes),
119 LittleEndian => i32::from_le_bytes(bytes),
// Decodes 2 bytes as an `i16`: big-endian via `from_be_bytes`, little-endian
// via `from_le_bytes`.
123 impl Parse<i16, 2> for Endianness {
124 fn parse(self, bytes: [u8; 2]) -> i16 {
126 BigEndian => i16::from_be_bytes(bytes),
127 LittleEndian => i16::from_le_bytes(bytes),
// Decodes 1 byte as an `i8`. Both arms yield the same value for a single
// byte; the impl exists so `i8` can be read through the same `Parse` API.
131 impl Parse<i8, 1> for Endianness {
132 fn parse(self, bytes: [u8; 1]) -> i8 {
134 BigEndian => i8::from_be_bytes(bytes),
135 LittleEndian => i8::from_le_bytes(bytes),
// Decodes 8 bytes as an `f64`: big-endian via `from_be_bytes`, little-endian
// via `from_le_bytes`.
139 impl Parse<f64, 8> for Endianness {
140 fn parse(self, bytes: [u8; 8]) -> f64 {
142 BigEndian => f64::from_be_bytes(bytes),
143 LittleEndian => f64::from_le_bytes(bytes),
/// Reads exactly `N` bytes from `r` into a zero-initialized array.
///
/// A failed or short read is propagated to the caller through `?`.
148 fn read_bytes<const N: usize>(r: &mut BufReader<File>) -> Result<[u8; N]> {
149 let mut buf = [0; N];
150 r.read_exact(&mut buf)?;
/// Reads exactly `n` bytes from `r` into a freshly allocated, zero-filled
/// vector of length `n`.
///
/// A failed or short read is propagated to the caller through `?`.
154 fn read_vec(r: &mut BufReader<File>, n: usize) -> Result<Vec<u8>> {
155 let mut vec = vec![0; n];
156 r.read_exact(&mut vec)?;
161 fn read_swap(&mut self) -> Result<T>;
// Reads one byte from `self.r` and decodes it with `self.endianness`.
164 impl ReadSwap<u8> for Dissector {
165 fn read_swap(&mut self) -> Result<u8> {
166 Ok(self.endianness.parse(read_bytes(&mut self.r)?))
// Reads four bytes from `self.r` and decodes them as a `u32` with
// `self.endianness`.
169 impl ReadSwap<u32> for Dissector {
170 fn read_swap(&mut self) -> Result<u32> {
171 Ok(self.endianness.parse(read_bytes(&mut self.r)?))
// Reads eight bytes from `self.r` and decodes them as a `u64` with
// `self.endianness`.
174 impl ReadSwap<u64> for Dissector {
175 fn read_swap(&mut self) -> Result<u64> {
176 Ok(self.endianness.parse(read_bytes(&mut self.r)?))
// Reads four bytes from `self.r` and decodes them as an `i32` with
// `self.endianness`.
180 impl ReadSwap<i32> for Dissector {
181 fn read_swap(&mut self) -> Result<i32> {
182 Ok(self.endianness.parse(read_bytes(&mut self.r)?))
// Reads eight bytes from `self.r` and decodes them as an `f64`.
// NOTE(review): this decodes with `self.endianness` (the integer byte order)
// rather than `self.fp_format`, which is what `UntypedValue` is given
// elsewhere in this file — confirm that is intended.
186 impl ReadSwap<f64> for Dissector {
187 fn read_swap(&mut self) -> Result<f64> {
188 Ok(self.endianness.parse(read_bytes(&mut self.r)?))
195 endianness: Endianness,
196 fp_format: Endianness,
198 n_variable_records: usize,
200 var_widths: Vec<i32>,
/// Infers the integer byte order from the header's raw layout code.
///
/// Tries big-endian first, then little-endian, and returns the first byte
/// order under which `layout_code` decodes to 2 or 3.
203 fn detect_endianness(layout_code: [u8; 4]) -> Option<Endianness> {
204 for endianness in [BigEndian, LittleEndian] {
205 match endianness.parse(layout_code) {
206 2 | 3 => return Some(endianness),
/// Infers the floating-point byte order from the raw compression bias.
///
/// Decodes `bias` as an `f64` in each byte order in turn (big-endian first)
/// and returns the byte order whose decoded value is accepted.
// NOTE(review): the caller's warning text suggests the accepted value is
// 100 — confirm against the comparison in this function.
213 fn detect_fp_format(bias: [u8; 8]) -> Option<Endianness> {
214 for endianness in [BigEndian, LittleEndian] {
215 let value: f64 = endianness.parse(bias);
217 return Some(endianness);
/// Takes ownership of `s` and returns a byte vector; the loop runs for as
/// long as the last byte of `s` equals `c`.
// NOTE(review): presumably each iteration removes that trailing byte, so the
// result is `s` without trailing `c` bytes — confirm against the loop body.
223 fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
224 while s.last() == Some(&c) {
230 fn format_name(type_: u32) -> &'static str {
/// Computes `(x + (y - 1)) / y * y`.
///
/// With truncating integer division this is `x` rounded up to the next
/// multiple of `y`. The intermediate sum `x + (y - 1)` is not checked for
/// overflow.
273 fn round_up<T: Num + Copy>(x: T, y: T) -> T {
274 (x + (y - T::one())) / y * y
/// A raw value paired with the byte order used to interpret it as a number.
/// Its `Display` impl prints both a numeric and a textual reading of the
/// bytes.
277 struct UntypedValue {
// Byte order applied when the raw bytes are decoded as an `f64`.
279 endianness: Endianness,
/// Bundles the 8 raw bytes with the byte order to use when decoding them.
283 fn new(raw: [u8; 8], endianness: Endianness) -> UntypedValue {
284 UntypedValue { raw, endianness }
// Formats the value as `<number>/"<text>"`, showing both plausible readings
// of the same 8 bytes.
288 impl fmt::Display for UntypedValue {
289 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// Numeric reading: the raw bytes decoded as an `f64`.
290 let numeric: f64 = self.endianness.parse(self.raw);
// Textual reading: the leading run of bytes that are spaces or ASCII
// graphic characters.
291 let n_printable = self
294 .take_while(|&&x| x == b' ' || x.is_ascii_graphic())
// The prefix contains only ASCII bytes, so it is always valid UTF-8 and the
// `unwrap` cannot fail.
296 let printable_prefix = std::str::from_utf8(&self.raw[0..n_printable]).unwrap();
297 write!(f, "{numeric}/\"{printable_prefix}\"")
/// Opens `filename` and dissects it: reads and prints the file header
/// record, dispatches on each following record's type code, and then hands
/// off to the reader for the compression scheme named in the header.
///
/// # Errors
///
/// Fails if the file cannot be opened or read, if the leading magic or the
/// layout code is not recognized, if the compression value does not fit the
/// file flavor, or if a record type is misplaced or unrecognized.
302 fn new<P: AsRef<Path>>(filename: P, max_cases: usize) -> Result<Dissector> {
303 let mut r = BufReader::new(File::open(&filename)?);
304 let filename = filename.as_ref().to_string_lossy().into_owned();
// The first four bytes decide whether this is a ZSAV file (`zmagic`).
305 let rec_type: [u8; 4] = read_bytes(&mut r)?;
306 let zmagic = match &rec_type {
309 _ => Err(anyhow!("This is not an SPSS system file."))?,
312 let eye_catcher: [u8; 60] = read_bytes(&mut r)?;
// The integer byte order is inferred from the layout code, which must decode
// to 2 or 3 in one of the two byte orders.
313 let layout_code: [u8; 4] = read_bytes(&mut r)?;
314 let endianness = detect_endianness(layout_code)
315 .ok_or_else(|| anyhow!("This is not an SPSS system file."))?;
316 let layout_code: u32 = endianness.parse(layout_code);
317 let _nominal_case_size: [u8; 4] = read_bytes(&mut r)?;
// The compression value is validated against the file flavor.
318 let compressed: u32 = endianness.parse(read_bytes(&mut r)?);
319 let compression = match (zmagic, compressed) {
321 (false, 1) => Some(Compression::Simple),
322 (true, 2) => Some(Compression::ZLib),
324 "{} file header has invalid compression value {compressed}.",
325 if zmagic { "ZSAV" } else { "SAV" }
329 let weight_index: u32 = endianness.parse(read_bytes(&mut r)?);
330 let n_cases: u32 = endianness.parse(read_bytes(&mut r)?);
// If the floating-point format cannot be detected from the bias, warn on
// stderr and fall back to the integer byte order.
332 let bias: [u8; 8] = read_bytes(&mut r)?;
333 let fp_format = detect_fp_format(bias)
334 .unwrap_or_else(|| { eprintln!("Compression bias is not the usual value of 100, or system file uses unrecognized floating-point format."); endianness });
335 let bias: f64 = fp_format.parse(bias);
337 let mut d = Dissector {
343 n_variable_records: 0,
345 var_widths: Vec::new(),
// Remaining header fields are fixed-width text; the file label has its
// trailing spaces trimmed.
348 let creation_date: [u8; 9] = read_bytes(&mut d.r)?;
349 let creation_time: [u8; 8] = read_bytes(&mut d.r)?;
350 let file_label: [u8; 64] = read_bytes(&mut d.r)?;
351 let file_label = trim_end(Vec::from(file_label), b' ');
354 println!("File header record:");
358 String::from_utf8_lossy(&eye_catcher)
360 println!("{:>17}: {}", "Layout code", layout_code);
366 None => "no compression",
367 Some(Compression::Simple) => "simple compression",
368 Some(Compression::ZLib) => "ZLIB compression",
371 println!("{:>17}: {}", "Weight index", weight_index);
372 println!("{:>17}: {}", "Number of cases", n_cases);
373 println!("{:>17}: {}", "Compression bias", bias);
377 String::from_utf8_lossy(&creation_date)
382 String::from_utf8_lossy(&creation_time)
387 String::from_utf8_lossy(&file_label)
// Dispatch on each record's 32-bit type code. A type 4 record is an error
// here because it is only valid when consumed by the type 3 reader.
391 let rec_type: u32 = d.read_swap()?;
393 2 => d.read_variable_record()?,
394 3 => d.read_value_label_record()?,
395 4 => Err(anyhow!("Misplaced type 4 record."))?,
396 6 => d.read_document_record()?,
397 7 => d.read_extension_record()?,
399 _ => Err(anyhow!("Unrecognized record type {rec_type}."))?,
403 let pos = d.r.stream_position()?;
405 "{:08x}: end-of-dictionary record (first byte of data at {:0x})",
// Only the simple-compression reader takes the case limit.
411 Some(Compression::Simple) => {
413 d.read_simple_compressed_data(max_cases)?;
416 Some(Compression::ZLib) => d.read_zlib_compressed_data()?,
/// Dissects up to `max_cases` cases of simple (bytecode) compressed data,
/// printing one line per opcode with the file offset the opcode came from.
///
/// Opcodes are read from the file eight at a time and consumed one per
/// variable-width slot in `self.var_widths`.
423 fn read_simple_compressed_data(&mut self, max_cases: usize) -> Result<()> {
// A 32-bit field is read and discarded before the data begins.
424 let _: i32 = self.read_swap()?;
425 println!("\n{:08x}: compressed data:", self.r.stream_position()?);
427 const N_OPCODES: usize = 8;
428 let mut opcodes = VecDeque::<u8>::with_capacity(8);
429 let mut opcode_ofs = 0;
430 for case_num in 0..max_cases {
432 "{:08x}: case {case_num}'s uncompressible data begins",
433 self.r.stream_position()?
436 while i < self.var_widths.len() {
437 let width = self.var_widths[i];
// Index of the next opcode within the current group of eight, used only to
// compute the offset printed below.
439 let opcode_idx = N_OPCODES - opcodes.len();
// NOTE(review): the group is appended in file order with `extend`, and
// `opcode_idx` counts from the start of the group, yet `pop_back` takes the
// LAST opcode first. This looks like it should be `pop_front` — confirm.
440 let Some(opcode) = opcodes.pop_back() else {
// The queue is empty: remember where the next group starts and read it.
441 opcode_ofs = self.r.stream_position()?;
442 let mut new_opcodes = [0; N_OPCODES];
443 if let Err(error) = self.r.read_exact(&mut new_opcodes) {
// End of file is treated specially only at the start of a case (`i == 0`);
// any other read failure is returned as an error.
444 if i == 0 && error.kind() == ErrorKind::UnexpectedEof {
447 return Err(error.into());
450 opcodes.extend(new_opcodes.into_iter());
455 "{:08x}: variable {i}: opcode {opcode}: ",
456 opcode_ofs + opcode_idx as u64
459 0 => println!("ignored padding"),
461 println!("end of data");
// An uncompressible value follows inline as 8 raw bytes.
465 let raw: [u8; 8] = read_bytes(&mut self.r)?;
466 let value = UntypedValue::new(raw, self.fp_format);
467 println!("uncompressible data: {value}");
473 print!(", but this is a numeric variable");
481 print!(", but this is a string variable (width={width})");
// The opcode encodes a number: its value minus the compression bias.
487 print!("{}", opcode as f64 - self.bias);
489 print!(", but this is a string variable (width={width})");
500 fn read_zlib_compressed_data(&mut self) -> Result<()> {
501 let _: i32 = self.read_swap()?;
502 let ofs = self.r.stream_position()?;
503 println!("\n{ofs:08x}: ZLIB compressed data header:");
505 let this_ofs: u64 = self.read_swap()?;
506 let next_ofs: u64 = self.read_swap()?;
507 let next_len: u64 = self.read_swap()?;
509 println!("\theader_ofs: {this_ofs:#x}");
511 println!("\t\t(Expected {ofs:#x}.)");
513 println!("\ttrailer_ofs: {next_ofs:#x}");
514 println!("\ttrailer_len: {next_len}");
515 if next_len < 24 || next_len % 24 != 0 {
516 println!("\t\t(Trailer length is not positive multiple of 24.)");
519 let zlib_data_len = next_ofs - (ofs + 8 * 3);
521 "\n{:08x}: {zlib_data_len:#x} bytes of ZLIB compressed data",
525 self.skip_bytes(zlib_data_len)?;
527 println!("\n{next_ofs:08x}: ZLIB trailer fixed header");
528 let bias: u64 = self.read_swap()?;
529 let zero: u64 = self.read_swap()?;
530 let block_size: u32 = self.read_swap()?;
531 let n_blocks: u32 = self.read_swap()?;
532 println!("\tbias: {bias}");
533 println!("\tzero: {zero:#x}");
535 println!("\t\t(Expected 0.)");
537 println!("\tblock size: {block_size:#x}");
538 if block_size != 0x3ff000 {
539 println!("\t\t(Expected 0x3ff000.)");
541 println!("\tn_blocks: {n_blocks}");
542 if n_blocks as u64 != next_len / 24 - 1 {
543 println!("\t\t(Expected {}.)", next_len / 24 - 1);
546 let mut expected_uncmp_ofs = ofs;
547 let mut expected_cmp_ofs = ofs + 24;
548 for i in 1..=n_blocks {
549 let blockinfo_ofs = self.r.stream_position()?;
550 let uncompressed_ofs: u64 = self.read_swap()?;
551 let compressed_ofs: u64 = self.read_swap()?;
552 let uncompressed_size: u32 = self.read_swap()?;
553 let compressed_size: u32 = self.read_swap()?;
555 println!("\n{blockinfo_ofs:08x}: ZLIB block descriptor {i}");
557 println!("\tuncompressed_ofs: {uncompressed_ofs:#x}");
558 if uncompressed_ofs != expected_uncmp_ofs {
559 println!("\t\t(Expected {ofs:#x}.)");
562 println!("\tcompressed_ofs: {compressed_ofs:#x}");
563 if compressed_ofs != expected_cmp_ofs {
564 println!("\t\t(Expected {expected_cmp_ofs:#x}.)");
567 println!("\tuncompressed_size: {uncompressed_size:#x}");
568 if i < n_blocks && uncompressed_size != block_size {
569 println!("\t\t(Expected {block_size:#x}.)");
572 println!("\tcompressed_size: {compressed_size:#x}");
573 if i == n_blocks && compressed_ofs.checked_add(compressed_size as u64) != Some(next_ofs)
576 "\t\t(This was expected to be {:#x}.)",
577 next_ofs - compressed_size as u64
581 expected_uncmp_ofs += uncompressed_size as u64;
582 expected_cmp_ofs += uncompressed_size as u64;
/// Reads the subtype, element size and element count of a type 7
/// (extension) record, prints them, and dispatches to the reader for that
/// subtype. Unlisted subtypes go to `read_unknown_extension`.
///
/// # Errors
///
/// Fails if `size * count` does not fit in a `u32`, or if the selected
/// reader fails.
587 fn read_extension_record(&mut self) -> Result<()> {
588 let offset = self.r.stream_position()?;
589 let subtype: u32 = self.read_swap()?;
590 let size: u32 = self.read_swap()?;
591 let count: u32 = self.read_swap()?;
592 println!("{offset:08x}: Record 7, subtype {subtype}, size={size}, count={count}");
// Rejecting overflow here lets the subtype readers multiply `size * count`
// in `u32` without checking again.
593 if size.checked_mul(count).is_none() {
594 Err(anyhow!("{size} * {count} exceeds {}", u32::MAX))?
597 3 => self.read_machine_integer_info(size, count),
598 4 => self.read_machine_float_info(size, count),
599 5 => self.read_variable_sets(size, count),
601 // DATE variable information. We don't use it yet, but we should.
// Subtypes 7 and 19 share one reader.
604 7 | 19 => self.read_mrsets(size, count),
605 10 => self.read_extra_product_info(size, count),
606 11 => self.read_display_parameters(size, count),
607 13 => self.read_long_var_name_map(size, count),
608 14 => self.read_long_string_map(size, count),
609 16 => self.read_ncases64(size, count),
610 17 => self.read_datafile_attributes(size, count),
611 18 => self.read_variable_attributes(size, count),
612 20 => self.read_character_encoding(size, count),
613 21 => self.read_long_string_value_labels(size, count),
614 22 => self.read_long_string_missing_values(size, count),
615 _ => self.read_unknown_extension(subtype, size, count),
/// Emits a diagnostic of the form `"<name>" near offset 0x<pos>: <s>`, where
/// `<pos>` is the reader's current stream position.
///
/// Fails only if the stream position cannot be queried.
619 fn warn(&mut self, s: String) -> Result<()> {
621 "\"{}\" near offset 0x{:08x}: {s}",
623 self.r.stream_position()?
/// Consumes `n` bytes from the reader by reading them into a 1024-byte
/// scratch buffer, at most one buffer's worth per read.
///
/// Fails if the file ends before the requested bytes have been read.
628 fn skip_bytes(&mut self, mut n: u64) -> Result<()> {
629 let mut buf = [0; 1024];
// Each read covers the smaller of the remaining count and the buffer size.
631 let chunk = u64::min(n, buf.len() as u64);
632 self.r.read_exact(&mut buf[0..chunk as usize])?;
/// Handles a type 7 record whose subtype has no dedicated reader: warns
/// about it, then either skips the payload or dumps it in hex.
638 fn read_unknown_extension(&mut self, subtype: u32, size: u32, count: u32) -> Result<()> {
639 self.warn(format!("Unrecognized record type 7, subtype {subtype}."))?;
// Payloads with a zero element size, or with more than 65536 bytes in
// total, are skipped. The division form avoids overflowing `size * count`.
640 if size == 0 || count > 65536 / size {
641 self.skip_bytes(size as u64 * count as u64)?;
642 } else if size != 1 {
// Multi-byte elements are read and hex-dumped one element at a time, with
// `offset` tracking the address shown for each element.
645 let vec = read_vec(&mut self.r, size as usize)?;
648 HexViewBuilder::new(&vec).address_offset(offset).finish()
650 offset += size as usize;
/// Reads and prints one type 2 (variable) record: width, label flag,
/// missing-value code, print and write formats, short name, and then the
/// optional variable label and missing values.
///
/// Increments `self.n_variable_records` and appends the width to
/// `self.var_widths`.
///
/// # Errors
///
/// Fails on a read error, or if the label indicator or missing-value code
/// is not one of the values accepted below.
656 fn read_variable_record(&mut self) -> Result<()> {
657 self.n_variable_records += 1;
659 "{:08x}: variable record {}",
660 self.r.stream_position()?,
661 self.n_variable_records
663 let width: i32 = self.read_swap()?;
664 let has_variable_label: u32 = self.read_swap()?;
665 let missing_value_code: i32 = self.read_swap()?;
666 let print_format: u32 = self.read_swap()?;
667 let write_format: u32 = self.read_swap()?;
// The 8-byte name field has its trailing NUL bytes trimmed.
668 let name: [u8; 8] = read_bytes(&mut self.r)?;
669 let name: Vec<u8> = trim_end(Vec::from(name), b'\0');
672 self.n_variables += 1;
674 self.var_widths.push(width);
// Width classifies the record: positive is a string, zero is numeric, and
// negative marks a long string continuation record.
677 "\tWidth: {width} ({})",
679 _ if width > 0 => "string",
680 _ if width == 0 => "numeric",
681 _ => "long string continuation record",
685 println!("\tVariable label: {has_variable_label}");
687 "\tMissing values code: {missing_value_code} ({})",
688 match missing_value_code {
689 0 => "no missing values",
690 1 => "one missing value",
691 2 => "two missing values",
692 3 => "three missing values",
693 -2 => "one missing value range",
694 -3 => "one missing value, one range",
// A format word packs the type in the bits above 16, the width in bits
// 8..16 and the number of decimals in the low 8 bits.
// NOTE(review): the label "Worite" looks like a typo for "Write" — confirm
// and fix.
698 for (which, format) in [("Print", print_format), ("Worite", write_format)] {
699 let type_ = format_name(format >> 16);
700 let w = (format >> 8) & 0xff;
701 let d = format & 0xff;
702 println!("\t{which} format: {format:06x} ({type_}{w}.{d})");
704 println!("\tName: {}", String::from_utf8_lossy(&name));
706 // Read variable label.
707 match has_variable_label {
710 let offset = self.r.stream_position()?;
711 let len: u32 = self.read_swap()?;
// At most 65535 bytes of label text are read.
712 let read_len = len.min(65535) as usize;
713 let label = read_vec(&mut self.r, read_len)?;
715 "\t{offset:08x} Variable label: \"{}\"",
716 String::from_utf8_lossy(&label)
// Skip the padding that rounds the label up to a multiple of 4 bytes.
// NOTE(review): only `read_len` bytes were consumed above but this skip is
// computed from `len`, so for `len > 65535` the unread tail of the label
// appears not to be skipped. `round_up(len, 4)` can also overflow `u32` for
// very large `len` — confirm.
719 self.skip_bytes((round_up(len, 4) - len).into())?;
721 _ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?,
724 // Read missing values.
725 if missing_value_code != 0 {
726 print!("\t{:08x} Missing values:", self.r.stream_position()?);
// How missing values are read depends on the sign of the width.
727 match width.cmp(&0) {
729 let (has_range, n_individual) = match missing_value_code {
732 1 | 2 | 3 => (false, missing_value_code),
734 "Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."
// A range is stored as a low/high pair of doubles.
738 let low: f64 = self.read_swap()?;
739 let high: f64 = self.read_swap()?;
740 print!(" {low}...{high}");
742 for _ in 0..n_individual {
743 let value: f64 = self.read_swap()?;
747 Ordering::Greater => {
748 if !(0..=3).contains(&missing_value_code) {
750 "String missing value indicator field is not 0, 1, 2, or 3."
// String missing values are 8-byte fields with trailing NULs trimmed.
753 for _ in 0..missing_value_code {
754 let string: [u8; 8] = read_bytes(&mut self.r)?;
755 let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
756 println!(" {}", String::from_utf8_lossy(&string));
// Negative width: nothing is read here.
759 Ordering::Less => (),
/// Reads and prints a type 3 (value labels) record and the type 4 record
/// that must follow it, which lists the variable indexes the labels apply to.
///
/// # Errors
///
/// Fails on a read error; the error text below also shows that a missing
/// type 4 record is reported.
767 fn read_value_label_record(&mut self) -> Result<()> {
768 println!("{:08x}: value labels record", self.r.stream_position()?);
771 let n_labels: u32 = self.read_swap()?;
772 for _ in 0..n_labels {
// Each entry is an 8-byte untyped value, a 1-byte label length, and the
// label text.
773 let raw: [u8; 8] = read_bytes(&mut self.r)?;
774 let value = UntypedValue::new(raw, self.fp_format);
775 let label_len: u8 = self.read_swap()?;
776 let padded_len = round_up(label_len as usize + 1, 8);
// NOTE(review): `padded_len` is `label_len + 1` rounded up to 8, which
// looks like it already counts the length byte read above. Reading
// `padded_len` more bytes may therefore be one byte too many
// (`padded_len - 1` expected?) — confirm against the file format.
778 let mut label = read_vec(&mut self.r, padded_len)?;
779 label.truncate(label_len as usize);
780 let label = String::from_utf8_lossy(&label);
782 println!("\t{value}: {label}");
785 // Read the type-4 record with the corresponding variable indexes.
786 let rec_type: u32 = self.read_swap()?;
789 "Variable index record (type 4) does not immediately \
790 follow value label record (type 3) as it should."
794 println!("\t{:08x}: apply to variables", self.r.stream_position()?);
795 let n_vars: u32 = self.read_swap()?;
797 let index: u32 = self.read_swap()?;
/// Reads and prints a type 6 (document) record: a line count followed by
/// that many fixed 64-byte lines, each printed with its file offset and with
/// trailing spaces trimmed.
805 fn read_document_record(&mut self) -> Result<()> {
806 println!("{:08x}: document record", self.r.stream_position()?);
807 let n_lines: u32 = self.read_swap()?;
808 println!("\t{n_lines} lines of documents");
810 for i in 0..n_lines {
// The offset is printed before the line is read, so it is the line's start.
811 print!("\t{:08x}: ", self.r.stream_position()?);
812 let line: [u8; 64] = read_bytes(&mut self.r)?;
813 let line = trim_end(Vec::from(line), b' ');
814 println!("line {i}: \"{}\"", String::from_utf8_lossy(&line));
/// Reads and prints the machine integer info extension (record type 7,
/// subtype 3): eight 32-bit fields covering version, machine code,
/// floating-point representation, compression code, integer representation
/// and character code.
///
/// All eight fields are read before `size` and `count` are checked against
/// the expected 4 and 8.
819 fn read_machine_integer_info(&mut self, size: u32, count: u32) -> Result<()> {
820 let offset = self.r.stream_position()?;
821 let version_major: u32 = self.read_swap()?;
822 let version_minor: u32 = self.read_swap()?;
823 let version_revision: u32 = self.read_swap()?;
824 let machine_code: u32 = self.read_swap()?;
825 let float_representation: u32 = self.read_swap()?;
826 let compression_code: u32 = self.read_swap()?;
827 let integer_representation: u32 = self.read_swap()?;
828 let character_code: u32 = self.read_swap()?;
830 println!("{offset:08x}: machine integer info");
// Eight 4-byte integers were read above, hence size 4 and count 8.
831 if size != 4 || count != 8 {
833 "Bad size ({size}) or count ({count}) field on record type 7, subtype 3"
836 println!("\tVersion: {version_major}.{version_minor}.{version_revision}");
837 println!("\tMachine code: {machine_code}");
839 "\tFloating point representation: {float_representation} ({})",
840 match float_representation {
847 println!("\tCompression code: {compression_code}");
849 "\tEndianness: {integer_representation} ({})",
850 match integer_representation {
856 println!("\tCharacter code: {character_code}");
860 fn read_machine_float_info(&mut self, size: u32, count: u32) -> Result<()> {
861 let offset = self.r.stream_position()?;
862 let sysmis: f64 = self.read_swap()?;
863 let highest: f64 = self.read_swap()?;
864 let lowest: f64 = self.read_swap()?;
866 println!("{offset:08x}: machine float info");
867 if size != 4 || count != 8 {
869 "Bad size ({size}) or count ({count}) field on extension 4."
873 println!("\tsysmis: {sysmis} ({})", HexFloat(sysmis));
874 println!("\thighest: {highest} ({})", HexFloat(highest));
875 println!("\tlowest: {lowest} ({})", HexFloat(lowest));
/// Reads the variable sets extension as a text record and prints each set.
///
/// The text is parsed as `name=` entries, each followed by a
/// newline-terminated variable list.
879 fn read_variable_sets(&mut self, size: u32, count: u32) -> Result<()> {
880 println!("{:08x}: variable sets", self.r.stream_position()?);
881 let mut text = self.open_text_record(size, count)?;
883 while text.match_byte(b'\n') {
// The set name runs up to the `=` delimiter.
886 let set = match text.tokenize(b'=') {
887 Some(set) => String::from_utf8_lossy(set).into_owned(),
891 // Always present even for an empty set.
892 text.match_byte(b' ');
894 match text.tokenize(b'\n') {
895 None => println!("\tset \"{set}\" is empty"),
// A trailing carriage return is dropped so CRLF-terminated lists print
// cleanly.
898 "\tset \"{set}\" contains \"{}\"",
899 String::from_utf8_lossy(variables).trim_end_matches('\r')
907 // Read record type 7, subtype 7 or 19 (multiple response sets).
//
// The record is text. Each set is `name=` followed by a type letter (`C`,
// `D` or `E`), type-specific fields, a counted label string, and a
// newline-terminated variable list.
908 fn read_mrsets(&mut self, size: u32, count: u32) -> Result<()> {
909 print!("{:08x}: multiple response sets", self.r.stream_position()?);
910 let mut text = self.open_text_record(size, count)?;
912 #[derive(PartialEq, Eq)]
// Skip any blank lines before the next set.
918 while text.match_byte(b'\n') {}
919 let Some(name) = text.tokenize(b'=') else {
// The name is copied out so it no longer borrows `text`.
922 let name = Vec::from(name);
// The type letter selects the set kind and the two label-source flags.
924 let (mrset, cat_label_from_counted_values, label_from_var_label) = if text
927 if !text.match_byte(b' ') {
929 "missing space following 'C' at offset {} in mrsets record",
933 (MrSet::MC, false, false)
934 } else if text.match_byte(b'D') {
935 (MrSet::MD, false, false)
936 } else if text.match_byte(b'E') {
937 if !text.match_byte(b' ') {
939 "missing space following 'E' at offset {} in mrsets record",
945 let Some(number) = text.tokenize(b' ') else {
// NOTE(review): the `u` after `{}` in this message looks like a stray
// character — confirm.
947 "Missing label source value following `E' at offset {}u in MRSETS record",
// Only the label source values "11" and "1" are accepted.
952 let label_from_var_label = if number == b"11" {
954 } else if number == b"1" {
957 Err(anyhow!("Unexpected label source value `{}' following `E' at offset {pos} in MRSETS record", String::from_utf8_lossy(number)))?
959 (MrSet::MD, true, label_from_var_label)
962 "missing `C', `D', or `E' at offset {} in mrsets record",
// Only `MD` sets carry a counted value.
967 let counted_value = if mrset == MrSet::MD {
968 Some(Vec::from(text.parse_counted_string()?))
973 let label = Vec::from(text.parse_counted_string()?);
975 let variables = text.tokenize(b'\n');
978 "\t\"{}\": multiple {} set",
979 String::from_utf8_lossy(&name),
980 if mrset == MrSet::MC {
986 if let Some(counted_value) = counted_value {
988 ", counted value \"{}\"",
989 String::from_utf8_lossy(&counted_value)
992 if cat_label_from_counted_values {
993 println!(", category labels from counted values");
996 print!(", label \"{}\"", String::from_utf8_lossy(&label));
998 if label_from_var_label {
999 print!(", label from variable label");
1001 if let Some(variables) = variables {
1002 print!(", variables \"{}\"", String::from_utf8_lossy(variables));
1004 print!("no variables");
/// Reads the extra product info extension as a text record and prints its
/// whole buffer via `print_string`.
1011 fn read_extra_product_info(&mut self, size: u32, count: u32) -> Result<()> {
1012 print!("{:08x}: extra product info", self.r.stream_position()?);
1013 let text = self.open_text_record(size, count)?;
1014 print_string(&text.buffer);
/// Reads and prints the variable display parameters extension (extension
/// 11): a measure and an alignment per variable, plus a width when the
/// record carries three values per variable.
///
/// # Errors
///
/// Fails on a bad element size, or if `count` is neither two nor three times
/// the number of variables.
1018 fn read_display_parameters(&mut self, size: u32, count: u32) -> Result<()> {
1020 "{:08x}: variable display parameters",
1021 self.r.stream_position()?
1024 Err(anyhow!("Bad size ({size}) on extension 11."))?;
// Three values per variable means the width is included; two means it is
// not.
1026 let n_vars = self.n_variables;
1027 let includes_width = if count as usize == 3 * n_vars {
1029 } else if count as usize == 2 * n_vars {
1033 "Extension 11 has bad count {count} (for {n_vars} variables)."
1037 for i in 0..n_vars {
1038 let measure: u32 = self.read_swap()?;
1040 "\tVar #{i}: measure={measure} ({})",
1050 let width: u32 = self.read_swap()?;
1051 print!(", width={width}");
1054 let align: u32 = self.read_swap()?;
1056 ", align={align} ({})",
/// Reads the long variable names extension as a text record and prints each
/// `short => long` pair it contains.
1068 fn read_long_var_name_map(&mut self, size: u32, count: u32) -> Result<()> {
1070 "{:08x}: long variable names (short => long)",
1071 self.r.stream_position()?
1073 let mut text = self.open_text_record(size, count)?;
// Iterate until the record yields no further pair.
1074 while let Some((var, long_name)) = text.read_variable_to_value_pair() {
1077 String::from_utf8_lossy(&var),
1078 String::from_utf8_lossy(&long_name)
/// Reads the very long strings extension as a text record and prints each
/// `variable => length` pair it contains.
1084 fn read_long_string_map(&mut self, size: u32, count: u32) -> Result<()> {
1086 "{:08x}: very long strings (variable => length)",
1087 self.r.stream_position()?
1089 let mut text = self.open_text_record(size, count)?;
// Iterate until the record yields no further pair.
1090 while let Some((var, length)) = text.read_variable_to_value_pair() {
1093 String::from_utf8_lossy(&var),
1094 String::from_utf8_lossy(&length)
/// Reads and prints the extended (64-bit) number-of-cases extension: two
/// 64-bit fields, the second being the case count.
///
/// # Errors
///
/// Fails on a bad element size or count, or on a read error.
1100 fn read_ncases64(&mut self, size: u32, count: u32) -> Result<()> {
1102 Err(anyhow!("Bad size {size} for extended number of cases."))?
1105 Err(anyhow!("Bad count {count} for extended number of cases."))?
1107 let unknown: u64 = self.read_swap()?;
1108 let ncases64: u64 = self.read_swap()?;
// NOTE(review): the offset is queried after both fields have been read, so
// it is the position just past the record's data. The other readers print
// the start offset — confirm which is intended.
1110 "{:08x}: extended number of cases: unknown={unknown}, ncases64={ncases64}",
1111 self.r.stream_position()?
/// Parses and prints one attribute set from `text`, labelling each line of
/// output with `variable`.
///
/// Each attribute is a key terminated by `(`, followed by newline-terminated
/// values, with `)` closing the value list. A `/` is tested for after each
/// attribute.
1116 fn read_attributes(&mut self, text: &mut TextRecord, variable: &str) -> Result<()> {
1118 let Some(key) = text.tokenize_string(b'(') else {
1122 let Some(value) = text.tokenize_string(b'\n') else {
1124 "{variable}: Error parsing attribute value {key}[{index}]"
// Values are expected to be wrapped in single quotes; the quotes are
// stripped for display.
1127 if value.starts_with('\'') && value.ends_with('\'') && value.len() >= 2 {
// NOTE(review): `value.len() - 2` as the end bound drops the last character
// of the content as well as the closing quote, and for the two-character
// value `''` it yields the range `1..0`, which panics. `value.len() - 1`
// looks like what was intended — confirm and fix.
1128 let middle = &value[1..value.len() - 2];
1129 println!("\t{variable}: {key}[{index}] = \"{middle}\"");
1132 "{variable}: Attribute value {key}[{index}] is not quoted: {value}"
1135 if text.match_byte(b')') {
1140 if text.match_byte(b'/') {
/// Reads the data file attributes extension as a text record and prints its
/// attributes under the label "datafile".
1147 fn read_datafile_attributes(&mut self, size: u32, count: u32) -> Result<()> {
1148 print!("{:08x}: datafile attributes", self.r.stream_position()?);
1149 let mut text = self.open_text_record(size, count)?;
1150 self.read_attributes(&mut text, "datafile")?;
/// Reads the variable attributes extension as a text record. Each entry is a
/// variable name terminated by `:` followed by that variable's attribute
/// set, which is printed under the variable's name.
1154 fn read_variable_attributes(&mut self, size: u32, count: u32) -> Result<()> {
1155 print!("{:08x}: variable attributes", self.r.stream_position()?);
1156 let mut text = self.open_text_record(size, count)?;
1158 let Some(variable) = text.tokenize_string(b':') else {
1161 self.read_attributes(&mut text, &variable)?;
/// Reads the character encoding extension (`size * count` bytes) and prints
/// it as lossily decoded UTF-8 together with the record's start offset.
1166 fn read_character_encoding(&mut self, size: u32, count: u32) -> Result<()> {
1167 let offset = self.r.stream_position()?;
// `size * count` is a `u32` multiplication; `read_extension_record` rejects
// overflowing products before dispatching here.
1168 let encoding = read_vec(&mut self.r, (size * count) as usize)?;
1169 println!("{offset:08x}: Character Encoding: {}", String::from_utf8_lossy(&encoding));
1173 fn read_long_string_value_labels(&mut self, size: u32, count: u32) -> Result<()> {
1174 let start = self.r.stream_position()?;
1176 println!("{start:08x}: long string value labels");
1177 while self.r.stream_position()? - start < (size * count) as u64 {
1178 let position = self.r.stream_position()?;
1180 let var_name_len: u32 = self.read_swap()?;
1181 if var_name_len > ID_MAX_LEN {
1182 Err(anyhow!("Variable name length in long string value label record ({var_name_len} exceeds {ID_MAX_LEN}-byte limit."))?
1184 let var_name = read_vec(&mut self.r, var_name_len as usize)?;
1186 let width: u32 = self.read_swap()?;
1187 let n_values: u32 = self.read_swap()?;
1189 println!("\t{position:08x}: {}, width {width}, {n_values} values",
1190 String::from_utf8_lossy(&var_name));
1192 for _ in 0..n_values {
1193 let position = self.r.stream_position()?;
1194 let value_length: u32 = self.read_swap()?;
1195 let value = read_vec(&mut self.r, value_length as usize)?;
1196 let label_length: u32 = self.read_swap()?;
1197 let label = read_vec(&mut self.r, value_length as usize)?;
1198 println!("\t\t{position:08x}: \"{}\" ({value_length} bytes) => \"{}\" ({label_length} bytes)",
1199 String::from_utf8_lossy(&value),
1200 String::from_utf8_lossy(&label));
/// Reads and prints the long string missing values extension.
///
/// The record is a sequence of per-variable groups that together occupy
/// `size * count` bytes. Each group holds a length-prefixed variable name, a
/// 1-byte missing value count, one 32-bit value length, and then that many
/// values of that length.
///
/// # Errors
///
/// Fails on a read error, or if a variable name length exceeds `ID_MAX_LEN`.
1206 fn read_long_string_missing_values(&mut self, size: u32, count: u32) -> Result<()> {
1207 let start = self.r.stream_position()?;
1209 println!("{start:08x}: long string missing values");
// `size * count` is a `u32` multiplication; `read_extension_record` rejects
// overflowing products before dispatching here.
1210 while self.r.stream_position()? - start < (size * count) as u64 {
1211 let position = self.r.stream_position()?;
1213 let var_name_len: u32 = self.read_swap()?;
1214 if var_name_len > ID_MAX_LEN {
1215 Err(anyhow!("Variable name length in long string missing value record ({var_name_len} exceeds {ID_MAX_LEN}-byte limit."))?
1217 let var_name = read_vec(&mut self.r, var_name_len as usize)?;
// The value length is read once and applied to every value in the group.
1219 let n_missing_values: u8 = self.read_swap()?;
1220 let value_length: u32 = self.read_swap()?;
1222 println!("\t{position:08x}: {}, {n_missing_values}, each {value_length} bytes:",
1223 String::from_utf8_lossy(&var_name));
1225 for _ in 0..n_missing_values {
1226 let value = read_vec(&mut self.r, value_length as usize)?;
1227 println!(" \"{}\"", String::from_utf8_lossy(&value));
/// Reads the `size * count` bytes of an extension record's payload into a
/// vector.
///
/// # Errors
///
/// Fails if `size * count` overflows `u32`, or if the read fails.
1233 fn read_text_record(&mut self, size: u32, count: u32) -> Result<Vec<u8>> {
1234 let Some(n_bytes) = u32::checked_mul(size, count) else {
1235 Err(anyhow!("Extension record too large."))?
1237 read_vec(&mut self.r, n_bytes as usize)
/// Reads an extension record's payload with `read_text_record` and wraps it
/// in a `TextRecord` for tokenized parsing.
1240 fn open_text_record(&mut self, size: u32, count: u32) -> Result<TextRecord> {
1241 Ok(TextRecord::new(self.read_text_record(size, count)?))
1245 fn print_string(s: &[u8]) {
1246 if s.contains(&b'\0') {
1247 println!("{}", HexView::new(s));
1251 b'\\' => print!("\\\\"),
1252 b'\n' => println!(),
1253 c if (b' '..=b'~').contains(&c) => print!("{}", c as char),
1254 c => print!("\\{:2x}", c),
/// Wraps `buffer` in a `TextRecord` with the read position at its start.
1266 fn new(buffer: Vec<u8>) -> TextRecord {
1267 TextRecord { buffer, pos: 0 }
/// Scans forward from the current position to the next `delimiter`, NUL
/// byte, or end of buffer, and returns the bytes scanned over.
///
/// An empty scan (`start == self.pos`) is handled separately from the
/// non-empty case, which returns the slice.
1270 fn tokenize(&mut self, delimiter: u8) -> Option<&[u8]> {
1271 let start = self.pos;
// The scan stops at the delimiter, at a NUL byte, or at the end of the
// buffer, whichever comes first.
1272 while self.pos < self.buffer.len()
1273 && self.buffer[self.pos] != delimiter
1274 && self.buffer[self.pos] != 0
1278 if start == self.pos {
1281 Some(&self.buffer[start..self.pos])
/// Like `tokenize`, but returns the token as an owned `String`, decoding it
/// lossily as UTF-8.
1285 fn tokenize_string(&mut self, delimiter: u8) -> Option<String> {
1286 self.tokenize(delimiter)
1287 .map(|s| String::from_utf8_lossy(s).into_owned())
/// Tests whether the byte at the current position exists and equals `c`.
// NOTE(review): callers loop on the result (`while text.match_byte(..) {}`),
// so a successful match presumably advances the position — confirm against
// the body.
1290 fn match_byte(&mut self, c: u8) -> bool {
1291 if self.pos < self.buffer.len() && self.buffer[self.pos] == c {
/// Parses a run of ASCII digits at the current position as a `usize` and
/// advances past them.
///
/// # Errors
///
/// The error texts below show failures for a missing digit and for a number
/// that does not parse as a `usize`.
1299 fn parse_usize(&mut self) -> Result<usize> {
// Count the leading ASCII digits from the current position.
1300 let n_digits = self.buffer[self.pos..]
1302 .take_while(|c| c.is_ascii_digit())
1305 Err(anyhow!("expecting digit at offset {} in record", self.pos))?;
1307 let start = self.pos;
1308 self.pos += n_digits;
// The slice holds only ASCII digits, so it is valid UTF-8 and the `unwrap`
// cannot fail.
1310 let digits = str::from_utf8(&self.buffer[start..end]).unwrap();
1311 let Ok(number) = digits.parse::<usize>() else {
1313 "expecting number in [0,{}] at offset {} in record",
/// Computes the `(start, end)` byte range of the next `n` bytes, starting at
/// the current position.
///
/// The end offset is computed with `checked_add`, so an overflowing `n` is
/// handled instead of wrapping.
1322 fn get_n_bytes(&mut self, n: usize) -> Option<(usize, usize)> {
1323 let start = self.pos;
1324 let Some(end) = start.checked_add(n) else {
/// Parses a counted string of the form `<length> <bytes> `: a decimal
/// length, a space, exactly that many bytes, and a trailing space. Returns
/// the bytes.
///
/// # Errors
///
/// Fails if the length is missing, if either space is missing, or if the
/// string would run past the end of the record.
1331 fn parse_counted_string(&mut self) -> Result<&[u8]> {
1332 let length = self.parse_usize()?;
1333 if !self.match_byte(b' ') {
1334 Err(anyhow!("expecting space at offset {} in record", self.pos))?;
// The range is obtained as indexes first, so the slice is borrowed only
// after the position updates are done.
1337 let Some((start, end)) = self.get_n_bytes(length) else {
1339 "{length}-byte string starting at offset {} exceeds record length {}",
1344 if !self.match_byte(b' ') {
1346 "expecting space at offset {} following {}-byte string",
1351 Ok(&self.buffer[start..end])
1354 fn read_variable_to_value_pair(&mut self) -> Option<(Vec<u8>, Vec<u8>)> {
1355 let key = self.tokenize(b'=')?.into();
1356 let value = self.tokenize(b'\t')?.into();
1358 while self.match_byte(b'\t') || self.match_byte(b'\0') {}