-impl Dissector {
-    /// Opens `filename`, prints a dissection of its file header and dictionary
-    /// records to stdout, then dumps the compressed case data, if any.
-    ///
-    /// `max_cases` caps how many cases are dumped from a simple-compressed file;
-    /// zero suppresses that dump.  It is not consulted for ZLIB-compressed files.
-    ///
-    /// # Errors
-    ///
-    /// Fails on I/O errors, on an unrecognized magic number or layout code, on a
-    /// compression code that does not agree with the magic number, on an
-    /// unrecognized or misplaced record type, and on any error reported by the
-    /// individual record readers.
-    fn new<P: AsRef<Path>>(filename: P, max_cases: usize) -> Result<Dissector> {
-        let mut reader = BufReader::new(File::open(&filename)?);
-        let filename = filename.as_ref().to_string_lossy().into_owned();
-
-        // The magic number tells plain ("$FL2") files from ZSAV ("$FL3") ones.
-        let magic: [u8; 4] = read_bytes(&mut reader)?;
-        let zmagic = match &magic {
-            b"$FL2" => false,
-            b"$FL3" => true,
-            _ => return Err(anyhow!("This is not an SPSS system file.")),
-        };
-
-        let product_name: [u8; 60] = read_bytes(&mut reader)?;
-
-        // The layout code is read raw first because it is what reveals the
-        // byte order used to decode every later integer field.
-        let raw_layout_code: [u8; 4] = read_bytes(&mut reader)?;
-        let Some(endianness) = detect_endianness(raw_layout_code) else {
-            return Err(anyhow!("This is not an SPSS system file."));
-        };
-        let layout_code: u32 = endianness.parse(raw_layout_code);
-        let _nominal_case_size: [u8; 4] = read_bytes(&mut reader)?;
-
-        let compressed: u32 = endianness.parse(read_bytes(&mut reader)?);
-        let compression = match (zmagic, compressed) {
-            (false, 0) => None,
-            (false, 1) => Some(Compression::Simple),
-            (true, 2) => Some(Compression::ZLib),
-            _ => {
-                let flavor = if zmagic { "ZSAV" } else { "SAV" };
-                return Err(anyhow!(
-                    "{} file header has invalid compression value {compressed}.",
-                    flavor
-                ));
-            }
-        };
-
-        let weight_index: u32 = endianness.parse(read_bytes(&mut reader)?);
-        let n_cases: u32 = endianness.parse(read_bytes(&mut reader)?);
-
-        // Fall back to the integer byte order when the bias bytes do not
-        // identify a floating-point format.
-        let raw_bias: [u8; 8] = read_bytes(&mut reader)?;
-        let fp_format = match detect_fp_format(raw_bias) {
-            Some(format) => format,
-            None => {
-                eprintln!("Compression bias is not the usual value of 100, or system file uses unrecognized floating-point format.");
-                endianness
-            }
-        };
-        let bias: f64 = fp_format.parse(raw_bias);
-
-        let mut d = Dissector {
-            filename,
-            r: reader,
-            endianness,
-            fp_format,
-            bias,
-            n_variable_records: 0,
-            n_variables: 0,
-            var_widths: Vec::new(),
-        };
-
-        let creation_date: [u8; 9] = read_bytes(&mut d.r)?;
-        let creation_time: [u8; 8] = read_bytes(&mut d.r)?;
-        let raw_file_label: [u8; 64] = read_bytes(&mut d.r)?;
-        let file_label = trim_end(raw_file_label.to_vec(), b' ');
-        d.skip_bytes(3)?;
-
-        // Everything in the header has been read; now report it.
-        let product_text = String::from_utf8_lossy(&product_name);
-        let date_text = String::from_utf8_lossy(&creation_date);
-        let time_text = String::from_utf8_lossy(&creation_time);
-        let label_text = String::from_utf8_lossy(&file_label);
-        let compression_text = match compression {
-            None => "no compression",
-            Some(Compression::Simple) => "simple compression",
-            Some(Compression::ZLib) => "ZLIB compression",
-        };
-
-        println!("File header record:");
-        println!("{:>17}: {}", "Product name", product_text);
-        println!("{:>17}: {}", "Layout code", layout_code);
-        println!("{:>17}: {} ({})", "Compressed", compressed, compression_text);
-        println!("{:>17}: {}", "Weight index", weight_index);
-        println!("{:>17}: {}", "Number of cases", n_cases);
-        println!("{:>17}: {}", "Compression bias", bias);
-        println!("{:>17}: {}", "Creation date", date_text);
-        println!("{:>17}: {}", "Creation time", time_text);
-        println!("{:>17}: \"{}\"", "File label", label_text);
-
-        // Dictionary records follow until the type-999 terminator.
-        loop {
-            let rec_type: u32 = d.read_swap()?;
-            if rec_type == 999 {
-                break;
-            }
-            match rec_type {
-                2 => d.read_variable_record(),
-                3 => d.read_value_label_record(),
-                4 => Err(anyhow!("Misplaced type 4 record.")),
-                6 => d.read_document_record(),
-                7 => d.read_extension_record(),
-                _ => Err(anyhow!("Unrecognized record type {rec_type}.")),
-            }?;
-        }
-
-        let pos = d.r.stream_position()?;
-        println!(
-            "{:08x}: end-of-dictionary record (first byte of data at {:0x})",
-            pos,
-            pos + 4
-        );
-
-        match compression {
-            Some(Compression::Simple) if max_cases > 0 => {
-                d.read_simple_compressed_data(max_cases)?
-            }
-            Some(Compression::ZLib) => d.read_zlib_compressed_data()?,
-            Some(Compression::Simple) | None => (),
-        }
-
-        Ok(d)
-    }
-
-    /// Dumps up to `max_cases` cases of simple-compressed data to stdout.
-    ///
-    /// Opcodes are read from the stream 8 at a time and consumed in file order,
-    /// one per variable slot in `self.var_widths`.  Opcode 0 is padding and does
-    /// not advance to the next variable; opcode 252 ends the data and makes this
-    /// function return; opcode 253 is followed by 8 bytes of literal data;
-    /// 254 stands for spaces; 255 for SYSMIS; any other opcode is printed as
-    /// the opcode value minus `self.bias`.
-    ///
-    /// # Errors
-    ///
-    /// Returns any I/O error, except that end of file hit while fetching opcodes
-    /// at the start of a case is treated as a normal end of data.
-    fn read_simple_compressed_data(&mut self, max_cases: usize) -> Result<()> {
-        // Discard the 32-bit field that precedes the data.
-        let _: i32 = self.read_swap()?;
-        println!("\n{:08x}: compressed data:", self.r.stream_position()?);
-
-        const N_OPCODES: usize = 8;
-        let mut opcodes = VecDeque::<u8>::with_capacity(N_OPCODES);
-        let mut opcode_ofs = 0;
-        for case_num in 0..max_cases {
-            println!(
-                "{:08x}: case {case_num}'s uncompressible data begins",
-                self.r.stream_position()?
-            );
-            let mut i = 0;
-            while i < self.var_widths.len() {
-                let width = self.var_widths[i];
-
-                // Index of the next opcode within its 8-byte batch, used to
-                // report the opcode's own file offset.
-                let opcode_idx = N_OPCODES - opcodes.len();
-
-                // Take opcodes from the front so they are processed in the
-                // order they appear in the file (and so `opcode_idx` matches).
-                let Some(opcode) = opcodes.pop_front() else {
-                    opcode_ofs = self.r.stream_position()?;
-                    let mut new_opcodes = [0; N_OPCODES];
-                    if let Err(error) = self.r.read_exact(&mut new_opcodes) {
-                        if i == 0 && error.kind() == ErrorKind::UnexpectedEof {
-                            return Ok(());
-                        } else {
-                            return Err(error.into());
-                        }
-                    };
-                    opcodes.extend(new_opcodes);
-                    continue;
-                };
-
-                print!(
-                    "{:08x}: variable {i}: opcode {opcode}: ",
-                    opcode_ofs + opcode_idx as u64
-                );
-                match opcode {
-                    0 => println!("ignored padding"),
-                    252 => {
-                        // Nothing follows the end-of-data opcode, so stop
-                        // dissecting altogether rather than starting a new case.
-                        println!("end of data");
-                        return Ok(());
-                    }
-                    253 => {
-                        let raw: [u8; 8] = read_bytes(&mut self.r)?;
-                        let value = UntypedValue::new(raw, self.fp_format);
-                        println!("uncompressible data: {value}");
-                        i += 1;
-                    }
-                    254 => {
-                        print!("spaces");
-                        if width == 0 {
-                            print!(", but this is a numeric variable");
-                        }
-                        println!();
-                        i += 1;
-                    }
-                    255 => {
-                        print!("SYSMIS");
-                        if width != 0 {
-                            print!(", but this is a string variable (width={width})");
-                        }
-                        println!();
-                        i += 1;
-                    }
-                    _ => {
-                        print!("{}", opcode as f64 - self.bias);
-                        if width != 0 {
-                            print!(", but this is a string variable (width={width})");
-                        }
-                        println!();
-                        i += 1;
-                    }
-                }
-            }
-        }
-        Ok(())
-    }
-
-    /// Dumps the ZLIB data header, skips over the compressed data itself, and
-    /// dumps the ZLIB trailer (fixed header plus one descriptor per block).
-    ///
-    /// Values that disagree with what the surrounding fields imply are flagged
-    /// with an indented "(Expected ...)" note rather than treated as errors.
-    ///
-    /// # Errors
-    ///
-    /// Returns any I/O error, and fails if the trailer offset lies before the
-    /// start of the compressed data (the data length would be negative).
-    fn read_zlib_compressed_data(&mut self) -> Result<()> {
-        // Discard the 32-bit field that precedes the ZLIB data header.
-        let _: i32 = self.read_swap()?;
-        let ofs = self.r.stream_position()?;
-        println!("\n{ofs:08x}: ZLIB compressed data header:");
-
-        let this_ofs: u64 = self.read_swap()?;
-        let next_ofs: u64 = self.read_swap()?;
-        let next_len: u64 = self.read_swap()?;
-
-        println!("\theader_ofs: {this_ofs:#x}");
-        if this_ofs != ofs {
-            println!("\t\t(Expected {ofs:#x}.)");
-        }
-        println!("\ttrailer_ofs: {next_ofs:#x}");
-        println!("\ttrailer_len: {next_len}");
-        if next_len < 24 || next_len % 24 != 0 {
-            println!("\t\t(Trailer length is not positive multiple of 24.)");
-        }
-
-        // The compressed data starts right after the three 8-byte header fields.
-        let data_ofs = ofs + 8 * 3;
-        let zlib_data_len = next_ofs.checked_sub(data_ofs).ok_or_else(|| {
-            anyhow!(
-                "ZLIB trailer offset {next_ofs:#x} precedes start of compressed data at {data_ofs:#x}."
-            )
-        })?;
-        println!("\n{data_ofs:08x}: {zlib_data_len:#x} bytes of ZLIB compressed data");
-
-        self.skip_bytes(zlib_data_len)?;
-
-        println!("\n{next_ofs:08x}: ZLIB trailer fixed header");
-        let bias: u64 = self.read_swap()?;
-        let zero: u64 = self.read_swap()?;
-        let block_size: u32 = self.read_swap()?;
-        let n_blocks: u32 = self.read_swap()?;
-        println!("\tbias: {bias}");
-        println!("\tzero: {zero:#x}");
-        if zero != 0 {
-            println!("\t\t(Expected 0.)");
-        }
-        println!("\tblock size: {block_size:#x}");
-        if block_size != 0x3ff000 {
-            println!("\t\t(Expected 0x3ff000.)");
-        }
-        println!("\tn_blocks: {n_blocks}");
-        // The trailer is one 24-byte fixed header plus 24 bytes per block.  A
-        // trailer shorter than 24 bytes was already flagged above and cannot
-        // yield an expected block count.
-        if let Some(expected_n_blocks) = (next_len / 24).checked_sub(1) {
-            if u64::from(n_blocks) != expected_n_blocks {
-                println!("\t\t(Expected {expected_n_blocks}.)");
-            }
-        }
-
-        let mut expected_uncmp_ofs = ofs;
-        let mut expected_cmp_ofs = data_ofs;
-        for i in 1..=n_blocks {
-            let blockinfo_ofs = self.r.stream_position()?;
-            let uncompressed_ofs: u64 = self.read_swap()?;
-            let compressed_ofs: u64 = self.read_swap()?;
-            let uncompressed_size: u32 = self.read_swap()?;
-            let compressed_size: u32 = self.read_swap()?;
-
-            println!("\n{blockinfo_ofs:08x}: ZLIB block descriptor {i}");
-
-            println!("\tuncompressed_ofs: {uncompressed_ofs:#x}");
-            if uncompressed_ofs != expected_uncmp_ofs {
-                println!("\t\t(Expected {expected_uncmp_ofs:#x}.)");
-            }
-
-            println!("\tcompressed_ofs: {compressed_ofs:#x}");
-            if compressed_ofs != expected_cmp_ofs {
-                println!("\t\t(Expected {expected_cmp_ofs:#x}.)");
-            }
-
-            println!("\tuncompressed_size: {uncompressed_size:#x}");
-            if i < n_blocks && uncompressed_size != block_size {
-                println!("\t\t(Expected {block_size:#x}.)");
-            }
-
-            println!("\tcompressed_size: {compressed_size:#x}");
-            // The last block should end exactly where the trailer begins.
-            if i == n_blocks
-                && compressed_ofs.checked_add(u64::from(compressed_size)) != Some(next_ofs)
-            {
-                match next_ofs.checked_sub(compressed_ofs) {
-                    Some(expected_size) => {
-                        println!("\t\t(This was expected to be {expected_size:#x}.)")
-                    }
-                    None => {
-                        println!("\t\t(Block starts beyond the trailer at {next_ofs:#x}.)")
-                    }
-                }
-            }
-
-            // Sizes come straight from the file, so use wrapping arithmetic
-            // to keep a corrupt descriptor from aborting the dump.
-            expected_uncmp_ofs = expected_uncmp_ofs.wrapping_add(u64::from(uncompressed_size));
-            expected_cmp_ofs = expected_cmp_ofs.wrapping_add(u64::from(compressed_size));
-        }
-        Ok(())
-    }
-
-    /// Reads the subtype, element size, and element count of a type-7
-    /// (extension) record, prints them, and dispatches to the reader for that
-    /// subtype.
-    ///
-    /// Subtypes without a dedicated reader go to `read_unknown_extension`.
-    ///
-    /// # Errors
-    ///
-    /// Returns any I/O error and any error from the subtype reader.
-    fn read_extension_record(&mut self) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        let subtype: u32 = self.read_swap()?;
-        let size: u32 = self.read_swap()?;
-        let count: u32 = self.read_swap()?;
-        println!("{offset:08x}: Record 7, subtype {subtype}, size={size}, count={count}");
-        match subtype {
-            3 => self.read_machine_integer_info(size, count),
-            4 => self.read_machine_float_info(size, count),
-            5 => self.read_variable_sets(size, count),
-            6 => {
-                // DATE variable information.  We don't use it yet, but we should.
-                // Its payload must still be consumed so that the next record
-                // type is read from the right position.
-                self.skip_bytes(u64::from(size) * u64::from(count))
-            }
-            7 | 19 => self.read_mrsets(size, count),
-            10 => self.read_extra_product_info(size, count),
-            11 => self.read_display_parameters(size, count),
-            13 => self.read_long_string_map(size, count),
-            _ => self.read_unknown_extension(subtype, size, count),
-        }
-    }
-
-    /// Prints the warning `s` to stdout, prefixed with the quoted file name and
-    /// the reader's current offset.
-    ///
-    /// # Errors
-    ///
-    /// Fails only if the current stream position cannot be obtained.
-    fn warn(&mut self, s: String) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        println!("\"{}\" near offset 0x{:08x}: {s}", self.filename, offset);
-        Ok(())
-    }
-
-    /// Consumes and discards exactly `n` bytes from the reader.
-    ///
-    /// The bytes are read through a fixed 1024-byte scratch buffer, so skipping
-    /// uses constant memory however large `n` is.
-    ///
-    /// # Errors
-    ///
-    /// Returns the underlying I/O error, including end of file reached before
-    /// `n` bytes were consumed.
-    fn skip_bytes(&mut self, mut n: u64) -> Result<()> {
-        let mut scratch = [0; 1024];
-        let capacity = scratch.len() as u64;
-        while n != 0 {
-            let step = n.min(capacity);
-            self.r.read_exact(&mut scratch[..step as usize])?;
-            n -= step;
-        }
-        Ok(())
-    }
-
-    /// Handles an extension record whose subtype has no dedicated reader:
-    /// warns about it, then consumes its `size * count` bytes of payload.
-    ///
-    /// A payload with zero-sized elements, or one larger than 65536 bytes, is
-    /// skipped silently.  Otherwise the payload is hex-dumped: element by
-    /// element when `size > 1`, or as one block of `count` bytes when
-    /// `size == 1`.
-    ///
-    /// # Errors
-    ///
-    /// Returns any I/O error encountered while warning, reading, or skipping.
-    fn read_unknown_extension(&mut self, subtype: u32, size: u32, count: u32) -> Result<()> {
-        self.warn(format!("Unrecognized record type 7, subtype {subtype}."))?;
-        if size == 0 || count > 65536 / size {
-            self.skip_bytes(u64::from(size) * u64::from(count))?;
-        } else if size == 1 {
-            // Byte-sized elements: the payload still has to be consumed, so
-            // dump all of it as a single block.
-            let vec = read_vec(&mut self.r, count as usize)?;
-            if !vec.is_empty() {
-                println!("{}", HexViewBuilder::new(&vec).address_offset(0).finish());
-            }
-        } else {
-            let mut offset = 0;
-            for _ in 0..count {
-                let vec = read_vec(&mut self.r, size as usize)?;
-                println!(
-                    "{}",
-                    HexViewBuilder::new(&vec).address_offset(offset).finish()
-                );
-                offset += size as usize;
-            }
-        }
-        Ok(())
-    }
-
-    /// Reads one type-2 (variable) record and prints its fields: width,
-    /// label flag, missing-value code, print and write formats, name, the
-    /// optional variable label, and any missing values.
-    ///
-    /// The record's width is appended to `self.var_widths`; `self.n_variables`
-    /// is incremented only for non-negative widths (negative widths mark long
-    /// string continuation records).
-    ///
-    /// # Errors
-    ///
-    /// Returns any I/O error, and fails if the label indicator is not 0 or 1
-    /// or if the missing-value code is invalid for the variable's type.
-    fn read_variable_record(&mut self) -> Result<()> {
-        self.n_variable_records += 1;
-        println!(
-            "{:08x}: variable record {}",
-            self.r.stream_position()?,
-            self.n_variable_records
-        );
-        let width: i32 = self.read_swap()?;
-        let has_variable_label: u32 = self.read_swap()?;
-        let missing_value_code: i32 = self.read_swap()?;
-        let print_format: u32 = self.read_swap()?;
-        let write_format: u32 = self.read_swap()?;
-        let name: [u8; 8] = read_bytes(&mut self.r)?;
-        let name: Vec<u8> = trim_end(Vec::from(name), b'\0');
-
-        if width >= 0 {
-            self.n_variables += 1;
-        }
-        self.var_widths.push(width);
-
-        println!(
-            "\tWidth: {width} ({})",
-            match width.cmp(&0) {
-                Ordering::Greater => "string",
-                Ordering::Equal => "numeric",
-                Ordering::Less => "long string continuation record",
-            }
-        );
-
-        println!("\tVariable label: {has_variable_label}");
-        println!(
-            "\tMissing values code: {missing_value_code} ({})",
-            match missing_value_code {
-                0 => "no missing values",
-                1 => "one missing value",
-                2 => "two missing values",
-                3 => "three missing values",
-                -2 => "one missing value range",
-                -3 => "one missing value, one range",
-                _ => "bad value",
-            }
-        );
-        // A format packs its type above bit 16, its width in bits 8..16, and
-        // its decimals in the low byte.
-        for (which, format) in [("Print", print_format), ("Write", write_format)] {
-            let type_ = format_name(format >> 16);
-            let w = (format >> 8) & 0xff;
-            let d = format & 0xff;
-            println!("\t{which} format: {format:06x} ({type_}{w}.{d})");
-        }
-        println!("\tName: {}", String::from_utf8_lossy(&name));
-
-        // Read variable label.
-        match has_variable_label {
-            0 => (),
-            1 => {
-                let offset = self.r.stream_position()?;
-                let len: u32 = self.read_swap()?;
-                // Display at most 65535 bytes of the label.
-                let read_len = len.min(65535);
-                let label = read_vec(&mut self.r, read_len as usize)?;
-                println!(
-                    "\t{offset:08x} Variable label: \"{}\"",
-                    String::from_utf8_lossy(&label)
-                );
-
-                // Skip any part of the label that was not displayed, plus the
-                // padding that rounds the label up to a multiple of 4 bytes.
-                self.skip_bytes((round_up(len, 4) - read_len).into())?;
-            }
-            _ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?,
-        };
-
-        // Read missing values.
-        if missing_value_code != 0 {
-            print!("\t{:08x} Missing values:", self.r.stream_position()?);
-            match width.cmp(&0) {
-                Ordering::Equal => {
-                    let (has_range, n_individual) = match missing_value_code {
-                        -3 => (true, 1),
-                        -2 => (true, 0),
-                        1 | 2 | 3 => (false, missing_value_code),
-                        _ => Err(anyhow!(
-                            "Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."
-                        ))?,
-                    };
-                    if has_range {
-                        let low: f64 = self.read_swap()?;
-                        let high: f64 = self.read_swap()?;
-                        print!(" {low}...{high}");
-                    }
-                    for _ in 0..n_individual {
-                        let value: f64 = self.read_swap()?;
-                        print!(" {value}");
-                    }
-                }
-                Ordering::Greater => {
-                    if !(0..=3).contains(&missing_value_code) {
-                        Err(anyhow!(
-                            "String missing value indicator field is not 0, 1, 2, or 3."
-                        ))?;
-                    }
-                    for _ in 0..missing_value_code {
-                        let string: [u8; 8] = read_bytes(&mut self.r)?;
-                        let string: Vec<u8> = trim_end(Vec::from(string), b'\0');
-                        // Stay on the "Missing values:" line, as the numeric
-                        // branch does; the line is terminated below.
-                        print!(" {}", String::from_utf8_lossy(&string));
-                    }
-                }
-                Ordering::Less => (),
-            }
-            println!();
-        }
-
-        Ok(())
-    }
-
-    /// Reads a type-3 (value labels) record and the type-4 (variable index)
-    /// record that must follow it, printing each value/label pair and then
-    /// the indexes of the variables the labels apply to.
-    ///
-    /// # Errors
-    ///
-    /// Returns any I/O error, and fails if the record after the labels is not
-    /// of type 4.
-    fn read_value_label_record(&mut self) -> Result<()> {
-        println!("{:08x}: value labels record", self.r.stream_position()?);
-
-        // Read the labels.
-        let n_labels: u32 = self.read_swap()?;
-        for _ in 0..n_labels {
-            let raw: [u8; 8] = read_bytes(&mut self.r)?;
-            let value = UntypedValue::new(raw, self.fp_format);
-            let label_len: u8 = self.read_swap()?;
-
-            // The length byte plus the label text are padded together to a
-            // multiple of 8 bytes.  The length byte has already been
-            // consumed, so one byte fewer than the padded total remains.
-            let padded_len = round_up(label_len as usize + 1, 8);
-            let mut label = read_vec(&mut self.r, padded_len - 1)?;
-            label.truncate(label_len as usize);
-            let label = String::from_utf8_lossy(&label);
-
-            println!("\t{value}: {label}");
-        }
-
-        // Read the type-4 record with the corresponding variable indexes.
-        let rec_type: u32 = self.read_swap()?;
-        if rec_type != 4 {
-            Err(anyhow!(
-                "Variable index record (type 4) does not immediately \
-                 follow value label record (type 3) as it should."
-            ))?;
-        }
-
-        // The indexes are appended to this header line, which is terminated
-        // after the loop.
-        print!("\t{:08x}: apply to variables", self.r.stream_position()?);
-        let n_vars: u32 = self.read_swap()?;
-        for _ in 0..n_vars {
-            let index: u32 = self.read_swap()?;
-            print!(" {index}");
-        }
-        println!();
-
-        Ok(())
-    }
-
-    /// Reads a type-6 (document) record and prints each of its 64-byte lines
-    /// with trailing spaces removed, prefixed by the line's file offset.
-    ///
-    /// # Errors
-    ///
-    /// Returns any I/O error encountered while reading the record.
-    fn read_document_record(&mut self) -> Result<()> {
-        let record_ofs = self.r.stream_position()?;
-        println!("{:08x}: document record", record_ofs);
-
-        let n_lines: u32 = self.read_swap()?;
-        println!("\t{n_lines} lines of documents");
-
-        for i in 0..n_lines {
-            // The offset is printed before the line is read so that it names
-            // the line's first byte.
-            let line_ofs = self.r.stream_position()?;
-            print!("\t{:08x}: ", line_ofs);
-
-            let raw_line: [u8; 64] = read_bytes(&mut self.r)?;
-            let text = trim_end(raw_line.to_vec(), b' ');
-            println!("line {i}: \"{}\"", String::from_utf8_lossy(&text));
-        }
-        Ok(())
-    }
-
-    /// Reads the eight 32-bit fields of the machine integer info extension
-    /// record (type 7, subtype 3) and prints them.
-    ///
-    /// `size` and `count` are the element size and count from the extension
-    /// record header; they must be 4 and 8.
-    ///
-    /// # Errors
-    ///
-    /// Returns any I/O error, and fails if `size` or `count` has an
-    /// unexpected value (checked after the fields have been read).
-    fn read_machine_integer_info(&mut self, size: u32, count: u32) -> Result<()> {
-        let offset = self.r.stream_position()?;
-
-        // Pull in all eight fields in file order, then give them names.
-        let mut fields = [0u32; 8];
-        for field in fields.iter_mut() {
-            *field = self.read_swap()?;
-        }
-        let [version_major, version_minor, version_revision, machine_code, float_representation, compression_code, integer_representation, character_code] =
-            fields;
-
-        println!("{offset:08x}: machine integer info");
-        if !(size == 4 && count == 8) {
-            return Err(anyhow!(
-                "Bad size ({size}) or count ({count}) field on record type 7, subtype 3"
-            ));
-        }
-
-        let float_name = match float_representation {
-            1 => "IEEE 754",
-            2 => "IBM 370",
-            3 => "DEC VAX",
-            _ => "unknown",
-        };
-        let endian_name = match integer_representation {
-            1 => "big",
-            2 => "little",
-            _ => "unknown",
-        };
-
-        println!("\tVersion: {version_major}.{version_minor}.{version_revision}");
-        println!("\tMachine code: {machine_code}");
-        println!(
-            "\tFloating point representation: {float_representation} ({})",
-            float_name
-        );
-        println!("\tCompression code: {compression_code}");
-        println!(
-            "\tEndianness: {integer_representation} ({})",
-            endian_name
-        );
-        println!("\tCharacter code: {character_code}");
-        Ok(())
-    }
-
-    /// Reads the three 64-bit floating-point fields of the machine float info
-    /// extension record (type 7, subtype 4) and prints each one in decimal
-    /// and in hexadecimal floating-point form.
-    ///
-    /// `size` and `count` are the element size and count from the extension
-    /// record header; because the record holds three 8-byte values they must
-    /// be 8 and 3.
-    ///
-    /// # Errors
-    ///
-    /// Returns any I/O error, and fails if `size` or `count` has an
-    /// unexpected value (checked after the fields have been read).
-    fn read_machine_float_info(&mut self, size: u32, count: u32) -> Result<()> {
-        let offset = self.r.stream_position()?;
-        let sysmis: f64 = self.read_swap()?;
-        let highest: f64 = self.read_swap()?;
-        let lowest: f64 = self.read_swap()?;
-
-        println!("{offset:08x}: machine float info");
-        // Three f64 values were just read: element size 8, element count 3.
-        if size != 8 || count != 3 {
-            Err(anyhow!(
-                "Bad size ({size}) or count ({count}) field on extension 4."
-            ))?;
-        }
-
-        println!("\tsysmis: {sysmis} ({})", HexFloat(sysmis));
-        println!("\thighest: {highest} ({})", HexFloat(highest));
-        println!("\tlowest: {lowest} ({})", HexFloat(lowest));
-        Ok(())
-    }
-
- fn read_variable_sets(&mut self, size: u32, count: u32) -> Result<()> {
- println!("{:08x}: variable sets", self.r.stream_position()?);
- let mut text = self.open_text_record(size, count)?;
- loop {
- while text.match_byte(b'\n') {
- continue;
- }
- let set = match text.tokenize(b'=') {
- Some(set) => String::from_utf8_lossy(set).into_owned(),
- None => break,
- };
-
- // Always present even for an empty set.
- text.match_byte(b' ');