X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=rust%2Fsrc%2Fmain.rs;h=5da01dd0243f3a9efd32ffac42f01ccd771e79cb;hb=2f5387c960ae55adfd09d698566430947c28615e;hp=9c92b5515d50f8dd6c2c10f4cce46258d0cd1b5b;hpb=0b4388bfca70b1ce3e9daeb00d48681db823a337;p=pspp diff --git a/rust/src/main.rs b/rust/src/main.rs index 9c92b5515d..5da01dd024 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -1,5 +1,3 @@ -#![allow(unused_variables)] -#![allow(dead_code)] /* PSPP - a program for statistical analysis. * Copyright (C) 2023 Free Software Foundation, Inc. * @@ -20,14 +18,21 @@ use anyhow::{anyhow, Result}; use clap::Parser; use hexplay::HexView; use hexplay::HexViewBuilder; -use num::{Float, Num}; +use num::Num; use std::cmp::Ordering; +use std::collections::VecDeque; +use std::fmt; use std::fs::File; use std::io::prelude::*; use std::io::BufReader; +use std::io::ErrorKind; use std::path::{Path, PathBuf}; use std::str; -use std::{fmt, num::FpCategory}; + +mod hexfloat; +use hexfloat::HexFloat; + +const ID_MAX_LEN: u32 = 64; /// A utility to dissect SPSS system files. #[derive(Parser, Debug)] @@ -46,7 +51,7 @@ fn main() -> Result<()> { let Args { max_cases, files } = Args::parse(); for file in files { - Dissector::new(file)?; + Dissector::new(file, max_cases)?; } Ok(()) } @@ -156,13 +161,18 @@ trait ReadSwap { fn read_swap(&mut self) -> Result; } +impl ReadSwap for Dissector { + fn read_swap(&mut self) -> Result { + Ok(self.endianness.parse(read_bytes(&mut self.r)?)) + } +} impl ReadSwap for Dissector { fn read_swap(&mut self) -> Result { Ok(self.endianness.parse(read_bytes(&mut self.r)?)) } } -impl ReadSwap for Dissector { - fn read_swap(&mut self) -> Result { +impl ReadSwap for Dissector { + fn read_swap(&mut self) -> Result { Ok(self.endianness.parse(read_bytes(&mut self.r)?)) } } @@ -182,7 +192,6 @@ impl ReadSwap for Dissector { struct Dissector { filename: String, r: BufReader, - compression: Option, endianness: Endianness, fp_format: Endianness, bias: f64, @@ -218,13 +227,6 @@ fn trim_end(mut s: Vec, c: u8) -> Vec { s } -fn slice_trim_end(mut s: &[u8], c: u8) -> &[u8] { - while s.last() == Some(&c) { - s = s.split_last().unwrap().1; - } - s -} - fn format_name(type_: u32) -> &'static str { match type_ { 1 => "A", @@ -296,57 +298,8 @@ impl fmt::Display for UntypedValue { } } -struct HexFloat(T); - -impl fmt::Display for HexFloat { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let sign = if self.0.is_sign_negative() { "-" } else { "" }; - match self.0.classify() { - FpCategory::Nan => return write!(f, "NaN"), - FpCategory::Infinite => return write!(f, "{sign}Infinity"), - FpCategory::Zero => return write!(f, "{sign}0.0"), - _ => (), - }; - let (significand, mut exponent, _) = self.0.integer_decode(); - let mut hex_sig = format!("{:x}", significand); - while hex_sig.ends_with('0') { - hex_sig.pop(); - exponent += 4; - } - match hex_sig.len() { - 0 => write!(f, "{sign}0.0"), - 1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"), - len => write!( - f, - "{sign}0x{}.{}p{}", - hex_sig.chars().next().unwrap(), - &hex_sig[1..], - exponent + 4 * (len as i16 - 1) - ), - } - } -} - -#[cfg(test)] -mod hex_float_tests { - use crate::HexFloat; - use num::Float; - - #[test] - fn test() { - assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0"); - assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6"); - assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4"); - assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity"); - assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity"); - assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN"); - assert_eq!(format!("{}", HexFloat(0.0)), "0.0"); - assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0"); - } -} - impl Dissector { - fn new>(filename: P) -> Result { + fn new>(filename: P, max_cases: usize) -> Result { let mut r = BufReader::new(File::open(&filename)?); let filename = filename.as_ref().to_string_lossy().into_owned(); let rec_type: [u8; 4] = read_bytes(&mut r)?; @@ -384,7 +337,6 @@ impl Dissector { let mut d = Dissector { filename, r, - compression, endianness, fp_format, bias, @@ -455,15 +407,192 @@ impl Dissector { pos + 4 ); + match compression { + Some(Compression::Simple) => { + if max_cases > 0 { + d.read_simple_compressed_data(max_cases)?; + } + } + Some(Compression::ZLib) => d.read_zlib_compressed_data()?, + None => (), + } + Ok(d) } + fn read_simple_compressed_data(&mut self, max_cases: usize) -> Result<()> { + let _: i32 = self.read_swap()?; + println!("\n{:08x}: compressed data:", self.r.stream_position()?); + + const N_OPCODES: usize = 8; + let mut opcodes = VecDeque::::with_capacity(8); + let mut opcode_ofs = 0; + for case_num in 0..max_cases { + println!( + "{:08x}: case {case_num}'s uncompressible data begins", + self.r.stream_position()? + ); + let mut i = 0; + while i < self.var_widths.len() { + let width = self.var_widths[i]; + + let opcode_idx = N_OPCODES - opcodes.len(); + let Some(opcode) = opcodes.pop_back() else { + opcode_ofs = self.r.stream_position()?; + let mut new_opcodes = [0; N_OPCODES]; + if let Err(error) = self.r.read_exact(&mut new_opcodes) { + if i == 0 && error.kind() == ErrorKind::UnexpectedEof { + return Ok(()); + } else { + return Err(error.into()); + } + }; + opcodes.extend(new_opcodes.into_iter()); + continue; + }; + + print!( + "{:08x}: variable {i}: opcode {opcode}: ", + opcode_ofs + opcode_idx as u64 + ); + match opcode { + 0 => println!("ignored padding"), + 252 => { + println!("end of data"); + break; + } + 253 => { + let raw: [u8; 8] = read_bytes(&mut self.r)?; + let value = UntypedValue::new(raw, self.fp_format); + println!("uncompressible data: {value}"); + i += 1; + } + 254 => { + print!("spaces"); + if width == 0 { + print!(", but this is a numeric variable"); + } + println!(); + i += 1; + } + 255 => { + print!("SYSMIS"); + if width != 0 { + print!(", but this is a string variable (width={width})"); + } + println!(); + i += 1; + } + _ => { + print!("{}", opcode as f64 - self.bias); + if width != 0 { + print!(", but this is a string variable (width={width})"); + } + println!(); + i += 1; + } + } + } + } + Ok(()) + } + + fn read_zlib_compressed_data(&mut self) -> Result<()> { + let _: i32 = self.read_swap()?; + let ofs = self.r.stream_position()?; + println!("\n{ofs:08x}: ZLIB compressed data header:"); + + let this_ofs: u64 = self.read_swap()?; + let next_ofs: u64 = self.read_swap()?; + let next_len: u64 = self.read_swap()?; + + println!("\theader_ofs: {this_ofs:#x}"); + if this_ofs != ofs { + println!("\t\t(Expected {ofs:#x}.)"); + } + println!("\ttrailer_ofs: {next_ofs:#x}"); + println!("\ttrailer_len: {next_len}"); + if next_len < 24 || next_len % 24 != 0 { + println!("\t\t(Trailer length is not positive multiple of 24.)"); + } + + let zlib_data_len = next_ofs - (ofs + 8 * 3); + println!( + "\n{:08x}: {zlib_data_len:#x} bytes of ZLIB compressed data", + ofs + 8 * 3 + ); + + self.skip_bytes(zlib_data_len)?; + + println!("\n{next_ofs:08x}: ZLIB trailer fixed header"); + let bias: u64 = self.read_swap()?; + let zero: u64 = self.read_swap()?; + let block_size: u32 = self.read_swap()?; + let n_blocks: u32 = self.read_swap()?; + println!("\tbias: {bias}"); + println!("\tzero: {zero:#x}"); + if zero != 0 { + println!("\t\t(Expected 0.)"); + } + println!("\tblock size: {block_size:#x}"); + if block_size != 0x3ff000 { + println!("\t\t(Expected 0x3ff000.)"); + } + println!("\tn_blocks: {n_blocks}"); + if n_blocks as u64 != next_len / 24 - 1 { + println!("\t\t(Expected {}.)", next_len / 24 - 1); + } + + let mut expected_uncmp_ofs = ofs; + let mut expected_cmp_ofs = ofs + 24; + for i in 1..=n_blocks { + let blockinfo_ofs = self.r.stream_position()?; + let uncompressed_ofs: u64 = self.read_swap()?; + let compressed_ofs: u64 = self.read_swap()?; + let uncompressed_size: u32 = self.read_swap()?; + let compressed_size: u32 = self.read_swap()?; + + println!("\n{blockinfo_ofs:08x}: ZLIB block descriptor {i}"); + + println!("\tuncompressed_ofs: {uncompressed_ofs:#x}"); + if uncompressed_ofs != expected_uncmp_ofs { + println!("\t\t(Expected {ofs:#x}.)"); + } + + println!("\tcompressed_ofs: {compressed_ofs:#x}"); + if compressed_ofs != expected_cmp_ofs { + println!("\t\t(Expected {expected_cmp_ofs:#x}.)"); + } + + println!("\tuncompressed_size: {uncompressed_size:#x}"); + if i < n_blocks && uncompressed_size != block_size { + println!("\t\t(Expected {block_size:#x}.)"); + } + + println!("\tcompressed_size: {compressed_size:#x}"); + if i == n_blocks && compressed_ofs.checked_add(compressed_size as u64) != Some(next_ofs) + { + println!( + "\t\t(This was expected to be {:#x}.)", + next_ofs - compressed_size as u64 + ); + } + + expected_uncmp_ofs += uncompressed_size as u64; + expected_cmp_ofs += uncompressed_size as u64; + } + Ok(()) + } + fn read_extension_record(&mut self) -> Result<()> { let offset = self.r.stream_position()?; let subtype: u32 = self.read_swap()?; let size: u32 = self.read_swap()?; let count: u32 = self.read_swap()?; println!("{offset:08x}: Record 7, subtype {subtype}, size={size}, count={count}"); + if size.checked_mul(count).is_none() { + Err(anyhow!("{size} * {count} exceeds {}", u32::MAX))? + } match subtype { 3 => self.read_machine_integer_info(size, count), 4 => self.read_machine_float_info(size, count), @@ -475,6 +604,14 @@ impl Dissector { 7 | 19 => self.read_mrsets(size, count), 10 => self.read_extra_product_info(size, count), 11 => self.read_display_parameters(size, count), + 13 => self.read_long_var_name_map(size, count), + 14 => self.read_long_string_map(size, count), + 16 => self.read_ncases64(size, count), + 17 => self.read_datafile_attributes(size, count), + 18 => self.read_variable_attributes(size, count), + 20 => self.read_character_encoding(size, count), + 21 => self.read_long_string_value_labels(size, count), + 22 => self.read_long_string_missing_values(size, count), _ => self.read_unknown_extension(subtype, size, count), } } @@ -782,6 +919,7 @@ impl Dissector { let Some(name) = text.tokenize(b'=') else { break; }; + let name = Vec::from(name); let (mrset, cat_label_from_counted_values, label_from_var_label) = if text .match_byte(b'C') @@ -827,17 +965,45 @@ impl Dissector { }; let counted_value = if mrset == MrSet::MD { - Some(text.parse_counted_string()?) - } else { None }; + Some(Vec::from(text.parse_counted_string()?)) + } else { + None + }; - let label = text.parse_counted_string()?; + let label = Vec::from(text.parse_counted_string()?); let variables = text.tokenize(b'\n'); - print!("\t\"{}\": multiple {} set", - String::from_utf8_lossy(name), - if mrset == MrSet::MC { "category" } else { "dichotomy" }); - + print!( + "\t\"{}\": multiple {} set", + String::from_utf8_lossy(&name), + if mrset == MrSet::MC { + "category" + } else { + "dichotomy" + } + ); + if let Some(counted_value) = counted_value { + print!( + ", counted value \"{}\"", + String::from_utf8_lossy(&counted_value) + ); + } + if cat_label_from_counted_values { + println!(", category labels from counted values"); + } + if label != b"" { + print!(", label \"{}\"", String::from_utf8_lossy(&label)); + } + if label_from_var_label { + print!(", label from variable label"); + } + if let Some(variables) = variables { + print!(", variables \"{}\"", String::from_utf8_lossy(variables)); + } else { + print!("no variables"); + } + println!(); } Ok(()) } @@ -899,12 +1065,180 @@ impl Dissector { Ok(()) } - fn open_text_record(&mut self, size: u32, count: u32) -> Result { - let n_bytes = match u32::checked_mul(size, count) { - Some(n) => n, - None => Err(anyhow!("Extension record too large."))?, + fn read_long_var_name_map(&mut self, size: u32, count: u32) -> Result<()> { + print!( + "{:08x}: long variable names (short => long)", + self.r.stream_position()? + ); + let mut text = self.open_text_record(size, count)?; + while let Some((var, long_name)) = text.read_variable_to_value_pair() { + println!( + "\t{} => {}", + String::from_utf8_lossy(&var), + String::from_utf8_lossy(&long_name) + ); + } + Ok(()) + } + + fn read_long_string_map(&mut self, size: u32, count: u32) -> Result<()> { + print!( + "{:08x}: very long strings (variable => length)", + self.r.stream_position()? + ); + let mut text = self.open_text_record(size, count)?; + while let Some((var, length)) = text.read_variable_to_value_pair() { + println!( + "\t{} => {}", + String::from_utf8_lossy(&var), + String::from_utf8_lossy(&length) + ); + } + Ok(()) + } + + fn read_ncases64(&mut self, size: u32, count: u32) -> Result<()> { + if size != 8 { + Err(anyhow!("Bad size {size} for extended number of cases."))? + } + if count != 2 { + Err(anyhow!("Bad count {count} for extended number of cases."))? + } + let unknown: u64 = self.read_swap()?; + let ncases64: u64 = self.read_swap()?; + print!( + "{:08x}: extended number of cases: unknown={unknown}, ncases64={ncases64}", + self.r.stream_position()? + ); + Ok(()) + } + + fn read_attributes(&mut self, text: &mut TextRecord, variable: &str) -> Result<()> { + loop { + let Some(key) = text.tokenize_string(b'(') else { + break; + }; + for index in 1.. { + let Some(value) = text.tokenize_string(b'\n') else { + Err(anyhow!( + "{variable}: Error parsing attribute value {key}[{index}]" + ))? + }; + if value.starts_with('\'') && value.ends_with('\'') && value.len() >= 2 { + let middle = &value[1..value.len() - 2]; + println!("\t{variable}: {key}[{index}] = \"{middle}\""); + } else { + self.warn(format!( + "{variable}: Attribute value {key}[{index}] is not quoted: {value}" + ))?; + } + if text.match_byte(b')') { + break; + } + } + + if text.match_byte(b'/') { + break; + } + } + Ok(()) + } + + fn read_datafile_attributes(&mut self, size: u32, count: u32) -> Result<()> { + print!("{:08x}: datafile attributes", self.r.stream_position()?); + let mut text = self.open_text_record(size, count)?; + self.read_attributes(&mut text, "datafile")?; + Ok(()) + } + + fn read_variable_attributes(&mut self, size: u32, count: u32) -> Result<()> { + print!("{:08x}: variable attributes", self.r.stream_position()?); + let mut text = self.open_text_record(size, count)?; + loop { + let Some(variable) = text.tokenize_string(b':') else { + break; + }; + self.read_attributes(&mut text, &variable)?; + } + Ok(()) + } + + fn read_character_encoding(&mut self, size: u32, count: u32) -> Result<()> { + let offset = self.r.stream_position()?; + let encoding = read_vec(&mut self.r, (size * count) as usize)?; + println!("{offset:08x}: Character Encoding: {}", String::from_utf8_lossy(&encoding)); + Ok(()) + } + + fn read_long_string_value_labels(&mut self, size: u32, count: u32) -> Result<()> { + let start = self.r.stream_position()?; + + println!("{start:08x}: long string value labels"); + while self.r.stream_position()? - start < (size * count) as u64 { + let position = self.r.stream_position()?; + + let var_name_len: u32 = self.read_swap()?; + if var_name_len > ID_MAX_LEN { + Err(anyhow!("Variable name length in long string value label record ({var_name_len} exceeds {ID_MAX_LEN}-byte limit."))? + } + let var_name = read_vec(&mut self.r, var_name_len as usize)?; + + let width: u32 = self.read_swap()?; + let n_values: u32 = self.read_swap()?; + + println!("\t{position:08x}: {}, width {width}, {n_values} values", + String::from_utf8_lossy(&var_name)); + + for _ in 0..n_values { + let position = self.r.stream_position()?; + let value_length: u32 = self.read_swap()?; + let value = read_vec(&mut self.r, value_length as usize)?; + let label_length: u32 = self.read_swap()?; + let label = read_vec(&mut self.r, value_length as usize)?; + println!("\t\t{position:08x}: \"{}\" ({value_length} bytes) => \"{}\" ({label_length} bytes)", + String::from_utf8_lossy(&value), + String::from_utf8_lossy(&label)); + } + } + Ok(()) + } + + fn read_long_string_missing_values(&mut self, size: u32, count: u32) -> Result<()> { + let start = self.r.stream_position()?; + + println!("{start:08x}: long string missing values"); + while self.r.stream_position()? - start < (size * count) as u64 { + let position = self.r.stream_position()?; + + let var_name_len: u32 = self.read_swap()?; + if var_name_len > ID_MAX_LEN { + Err(anyhow!("Variable name length in long string missing value record ({var_name_len} exceeds {ID_MAX_LEN}-byte limit."))? + } + let var_name = read_vec(&mut self.r, var_name_len as usize)?; + + let n_missing_values: u8 = self.read_swap()?; + let value_length: u32 = self.read_swap()?; + + println!("\t{position:08x}: {}, {n_missing_values}, each {value_length} bytes:", + String::from_utf8_lossy(&var_name)); + + for _ in 0..n_missing_values { + let value = read_vec(&mut self.r, value_length as usize)?; + println!(" \"{}\"", String::from_utf8_lossy(&value)); + } + } + Ok(()) + } + + fn read_text_record(&mut self, size: u32, count: u32) -> Result> { + let Some(n_bytes) = u32::checked_mul(size, count) else { + Err(anyhow!("Extension record too large."))? }; - Ok(TextRecord::new(read_vec(&mut self.r, n_bytes as usize)?)) + read_vec(&mut self.r, n_bytes as usize) + } + + fn open_text_record(&mut self, size: u32, count: u32) -> Result { + Ok(TextRecord::new(self.read_text_record(size, count)?)) } } @@ -948,6 +1282,11 @@ impl TextRecord { } } + fn tokenize_string(&mut self, delimiter: u8) -> Option { + self.tokenize(delimiter) + .map(|s| String::from_utf8_lossy(s).into_owned()) + } + fn match_byte(&mut self, c: u8) -> bool { if self.pos < self.buffer.len() && self.buffer[self.pos] == c { self.pos += 1; @@ -996,8 +1335,11 @@ impl TextRecord { } let Some((start, end)) = self.get_n_bytes(length) else { - Err(anyhow!("{length}-byte string starting at offset {} exceeds record length {}", - self.pos, self.buffer.len()))? + Err(anyhow!( + "{length}-byte string starting at offset {} exceeds record length {}", + self.pos, + self.buffer.len() + ))? }; if !self.match_byte(b' ') { Err(anyhow!( @@ -1008,4 +1350,12 @@ impl TextRecord { } Ok(&self.buffer[start..end]) } + + fn read_variable_to_value_pair(&mut self) -> Option<(Vec, Vec)> { + let key = self.tokenize(b'=')?.into(); + let value = self.tokenize(b'\t')?.into(); + + while self.match_byte(b'\t') || self.match_byte(b'\0') {} + Some((key, value)) + } }