X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Fmain.rs;h=404e96d57d07becaf09c4688319a0ac3739ceee4;hb=e0cbdf0daefcca81be9572aab0deedf945687f5a;hp=41dec4d92fe3c33cad9dec9d511717900d209191;hpb=5619e0513e9d2246c7e1d36a519f38cd432d2f6c;p=pspp diff --git a/rust/src/main.rs b/rust/src/main.rs index 41dec4d92f..404e96d57d 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -1,26 +1,29 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2023 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -use anyhow::{anyhow, Result}; -use clap::Parser; -use num::Num; + * Copyright (C) 2023 Free Software Foundation, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . */ + +use anyhow::Result; +use clap::{Parser, ValueEnum}; +use encoding_rs::Encoding; +use pspp::cooked::decode; +use pspp::raw::{Reader, Record, Magic}; use std::fs::File; -use std::io::prelude::*; use std::io::BufReader; use std::path::{Path, PathBuf}; +use std::str; +use thiserror::Error as ThisError; /// A utility to dissect SPSS system files. #[derive(Parser, Debug)] @@ -28,433 +31,89 @@ use std::path::{Path, PathBuf}; struct Args { /// Maximum number of cases to print. #[arg(long = "data", default_value_t = 0)] - max_cases: usize, + max_cases: u64, /// Files to dissect. #[arg(required = true)] - files: Vec -} - -fn main() -> Result<()> { - let Args { max_cases, files } = Args::parse(); - - let error = false; - for file in files { - Dissector::new(file)?; - } - Ok(()) -} - -#[derive(Copy, Clone, Debug)] -enum Compression { - Simple, - ZLib -} - -#[derive(Copy, Clone, Debug)] -enum Endianness { - BigEndian, - LittleEndian -} -use Endianness::*; - -trait Parse { - fn parse(self, bytes: [u8; N]) -> T; -} -impl Parse for Endianness { - fn parse(self, bytes: [u8; 8]) -> u64 { - match self { - BigEndian => u64::from_be_bytes(bytes), - LittleEndian => u64::from_le_bytes(bytes) - } - } -} -impl Parse for Endianness { - fn parse(self, bytes: [u8; 4]) -> u32 { - match self { - BigEndian => u32::from_be_bytes(bytes), - LittleEndian => u32::from_le_bytes(bytes) - } - } -} -impl Parse for Endianness { - fn parse(self, bytes: [u8; 2]) -> u16 { - match self { - BigEndian => u16::from_be_bytes(bytes), - LittleEndian => u16::from_le_bytes(bytes) - } - } -} -impl Parse for Endianness { - fn parse(self, bytes: [u8; 1]) -> u8 { - match self { - BigEndian => u8::from_be_bytes(bytes), - LittleEndian => u8::from_le_bytes(bytes) - } - } -} -impl Parse for Endianness { - fn parse(self, bytes: [u8; 8]) -> i64 { - match self { - BigEndian => i64::from_be_bytes(bytes), - LittleEndian => i64::from_le_bytes(bytes) - } - } -} -impl Parse for Endianness { - fn parse(self, bytes: [u8; 4]) -> i32 { - match self { - BigEndian => i32::from_be_bytes(bytes), - LittleEndian => i32::from_le_bytes(bytes) - } - } -} -impl Parse for Endianness { - fn parse(self, bytes: [u8; 2]) -> i16 { - match self { - BigEndian => i16::from_be_bytes(bytes), - LittleEndian => i16::from_le_bytes(bytes) - } - } -} -impl Parse for Endianness { - fn parse(self, bytes: [u8; 1]) -> i8 { - match self { - BigEndian => i8::from_be_bytes(bytes), - LittleEndian => i8::from_le_bytes(bytes) - } - } -} -impl Parse for Endianness { - fn parse(self, bytes: [u8; 8]) -> f64 { - match self { - BigEndian => f64::from_be_bytes(bytes), - LittleEndian => f64::from_le_bytes(bytes) - } - } -} - -fn read_bytes(r: &mut BufReader) -> Result<[u8; N]> { - let mut buf = [0; N]; - r.read_exact(&mut buf)?; - Ok(buf) -} - -fn read_vec(r: &mut BufReader, n: usize) -> Result> { - let mut vec = Vec::with_capacity(n); - vec.resize(n, 0); - r.read_exact(&mut vec)?; - Ok(vec) -} - -trait ReadSwap { - fn read_swap(&mut self) -> Result; -} + files: Vec, -impl ReadSwap for Dissector { - fn read_swap(&mut self) -> Result { - Ok(self.endianness.parse(read_bytes(&mut self.r)?)) - } -} -impl ReadSwap for Dissector { - fn read_swap(&mut self) -> Result { - Ok(self.endianness.parse(read_bytes(&mut self.r)?)) - } -} + /// How to dissect the file. + #[arg(short, long, value_enum, default_value_t)] + mode: Mode, -impl ReadSwap for Dissector { - fn read_swap(&mut self) -> Result { - Ok(self.endianness.parse(read_bytes(&mut self.r)?)) - } + /// The encoding to use. + #[arg(long, value_parser = parse_encoding)] + encoding: Option<&'static Encoding>, } -impl ReadSwap for Dissector { - fn read_swap(&mut self) -> Result { - Ok(self.endianness.parse(read_bytes(&mut self.r)?)) - } -} +#[derive(ThisError, Debug)] +#[error("{0}: unknown encoding")] +struct UnknownEncodingError(String); -struct Dissector { - filename: String, - r: BufReader, - compression: Option, - endianness: Endianness, - fp_format: Endianness, - bias: f64, - n_variable_records: usize, - n_variables: usize, - var_widths: Vec, -} - -fn detect_endianness(layout_code: [u8; 4]) -> Option { - for endianness in [BigEndian, LittleEndian] { - match endianness.parse(layout_code) { - 2 | 3 => return Some(endianness), - _ => () - } +fn parse_encoding(arg: &str) -> Result<&'static Encoding, UnknownEncodingError> { + match Encoding::for_label_no_replacement(arg.as_bytes()) { + Some(encoding) => Ok(encoding), + None => Err(UnknownEncodingError(arg.to_string())), } - None } -fn detect_fp_format(bias: [u8; 8]) -> Option { - for endianness in [BigEndian, LittleEndian] { - let value: f64 = endianness.parse(bias); - if value == 100.0 { - return Some(endianness) - } - } - None +#[derive(Clone, Copy, Debug, Default, ValueEnum)] +enum Mode { + Identify, + Raw, + #[default] + Cooked, } -fn trim_end(mut s: Vec, c: u8) -> Vec { - while s.last() == Some(&c) { - s.pop(); - } - s -} +fn main() -> Result<()> { + let Args { + max_cases, + files, + mode, + encoding, + } = Args::parse(); -fn format_name(type_: u32) -> &'static str { - match type_ { - 1 => "A", - 2 => "AHEX", - 3 => "COMMA", - 4 => "DOLLAR", - 5 => "F", - 6 => "IB", - 7 => "PIBHEX", - 8 => "P", - 9 => "PIB", - 10 => "PK", - 11 => "RB", - 12 => "RBHEX", - 15 => "Z", - 16 => "N", - 17 => "E", - 20 => "DATE", - 21 => "TIME", - 22 => "DATETIME", - 23 => "ADATE", - 24 => "JDATE", - 25 => "DTIME", - 26 => "WKDAY", - 27 => "MONTH", - 28 => "MOYR", - 29 => "QYR", - 30 => "WKYR", - 31 => "PCT", - 32 => "DOT", - 33 => "CCA", - 34 => "CCB", - 35 => "CCC", - 36 => "CCD", - 37 => "CCE", - 38 => "EDATE", - 39 => "SDATE", - 40 => "MTIME", - 41 => "YMDHMS", - _ => "invalid" + for file in files { + dissect(&file, max_cases, mode, encoding)?; } + Ok(()) } -fn round_up(x: T, y: T) -> T -{ - (x + (y - T::one())) / y * y -} - -impl UntypedValue { - fn new( -} - -impl Dissector { - fn new>(filename: P) -> Result { - let mut r = BufReader::new(File::open(&filename)?); - let filename = filename.as_ref().to_string_lossy().into_owned(); - let rec_type: [u8; 4] = read_bytes(&mut r)?; - let zmagic = match &rec_type { - b"$FL2" => false, - b"$FL3" => true, - _ => Err(anyhow!("This is not an SPSS system file."))? - }; - - let eye_catcher: [u8; 60] = read_bytes(&mut r)?; - let layout_code: [u8; 4] = read_bytes(&mut r)?; - let endianness = detect_endianness(layout_code) - .ok_or_else(|| anyhow!("This is not an SPSS system file."))?; - let layout_code: u32 = endianness.parse(layout_code); - let _nominal_case_size: [u8; 4] = read_bytes(&mut r)?; - let compressed: u32 = endianness.parse(read_bytes(&mut r)?); - let compression = match (zmagic, compressed) { - (false, 0) => None, - (false, 1) => Some(Compression::Simple), - (true, 2) => Some(Compression::ZLib), - _ => Err(anyhow!("{} file header has invalid compression value {compressed}.", - if zmagic { "ZSAV" } else { "SAV" }))?, - }; - - let weight_index: u32 = endianness.parse(read_bytes(&mut r)?); - let n_cases: u32 = endianness.parse(read_bytes(&mut r)?); +fn dissect(file_name: &Path, max_cases: u64, mode: Mode, encoding: Option<&'static Encoding>) -> Result<()> { + let reader = File::open(file_name)?; + let reader = BufReader::new(reader); + let mut reader = Reader::new(reader)?; - let bias: [u8; 8] = read_bytes(&mut r)?; - let fp_format = detect_fp_format(bias) - .unwrap_or_else(|| { eprintln!("Compression bias is not the usual value of 100, or system file uses unrecognized floating-point format."); endianness }); - let bias: f64 = fp_format.parse(bias); - - let mut d = Dissector { - filename, - r, - compression, - endianness, - fp_format, - bias, - n_variable_records: 0, - n_variables: 0, - var_widths: Vec::new(), - }; - - let creation_date: [u8; 9] = read_bytes(&mut d.r)?; - let creation_time: [u8; 8] = read_bytes(&mut d.r)?; - let file_label: [u8; 64] = read_bytes(&mut d.r)?; - let mut file_label = trim_end(Vec::from(file_label), b' '); - d.r.seek_relative(3)?; - - println!("File header record:"); - println!("{:>17}: {}", "Product name", String::from_utf8_lossy(&eye_catcher)); - println!("{:>17}: {}", "Layout code", layout_code); - println!("{:>17}: {} ({})", "Compressed", compressed, match compression { - None => "no compression", - Some(Compression::Simple) => "simple compression", - Some(Compression::ZLib) => "ZLIB compression", - }); - println!("{:>17}: {}", "Weight index", weight_index); - println!("{:>17}: {}", "Number of cases", n_cases); - println!("{:>17}: {}", "Compression bias", bias); - println!("{:>17}: {}", "Creation date", String::from_utf8_lossy(&creation_date)); - println!("{:>17}: {}", "Creation time", String::from_utf8_lossy(&creation_time)); - println!("{:>17}: \"{}\"", "File label", String::from_utf8_lossy(&file_label)); - - loop { - let rec_type: u32 = d.read_swap()?; - match rec_type { - 2 => d.read_variable_record()?, - 3 => d.read_value_label_record()?, - 4 => Err(anyhow!("Misplaced type 4 record."))?, - 999 => break, - _ => Err(anyhow!("Unrecognized record type {rec_type}."))? + match mode { + Mode::Identify => { + let Record::Header(header) = reader.next().unwrap()? else { unreachable!() }; + match header.magic { + Magic::Sav => println!("SPSS System File"), + Magic::Zsav => println!("SPSS System File with Zlib compression"), + Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"), } + return Ok(()) } - - let pos = d.r.stream_position()?; - println!("{:08x}: end-of-dictionary record (first byte of data at {:0x})", pos, pos + 4); - - Ok(d) - } - - fn read_variable_record(&mut self) -> Result<()> { - self.n_variable_records += 1; - println!("{:08x}: variable record {}", self.r.stream_position()?, self.n_variable_records); - let width: i32 = self.read_swap()?; - let has_variable_label: u32 = self.read_swap()?; - let missing_value_code: i32 = self.read_swap()?; - let print_format: u32 = self.read_swap()?; - let write_format: u32 = self.read_swap()?; - let name: [u8; 8] = read_bytes(&mut self.r)?; - let name: Vec = trim_end(Vec::from(name), b'\0'); - - if width >= 0 { - self.n_variables += 1; - } - self.var_widths.push(width); - - println!("\tWidth: {width} ({})", match width { - _ if width > 0 => "string", - _ if width == 0 => "numeric", - _ => "long string continuation record" - }); - - println!("\tVariable label: {has_variable_label}"); - println!("\tMissing values code: {missing_value_code} ({})", - match missing_value_code { - 0 => "no missing values", - 1 => "one missing value", - 2 => "two missing values", - 3 => "three missing values", - -2 => "one missing value range", - -3 => "one missing value, one range", - _ => "bad value" - }); - for (which, format) in [("Print", print_format), - ("Worite", write_format)] { - let type_ = format_name(format >> 16); - let w = (format >> 8) & 0xff; - let d = format & 0xff; - println!("\t{which} format: {format:06x} ({type_}{w}.{d})"); + Mode::Raw => { + let headers: Vec = reader.collect_headers()?; + for header in headers { + println!("{header:?}"); + } } - println!("\tName: {}", String::from_utf8_lossy(&name)); - - // Read variable label. - match has_variable_label { - 0 => (), - 1 => { - let offset = self.r.stream_position()?; - let len: u32 = self.read_swap()?; - let read_len = len.min(65535) as usize; - let label = read_vec(&mut self.r, read_len)?; - println!("\t{offset:08x} Variable label: \"{}\"", String::from_utf8_lossy(&label)); - - self.r.seek_relative((round_up(len, 4) - len).into())?; - }, - _ => Err(anyhow!("Variable label indicator field is not 0 or 1."))?, - }; - - // Read missing values. - if missing_value_code != 0 { - print!("\t{:08x} Missing values:", self.r.stream_position()?); - if width == 0 { - let (has_range, n_individual) = match missing_value_code { - -3 => (true, 1), - -2 => (true, 0), - 1 | 2 | 3 => (false, missing_value_code), - _ => Err(anyhow!("Numeric missing value indicator field is not -3, -2, 0, 1, 2, or 3."))?, - }; - if has_range { - let low: f64 = self.read_swap()?; - let high: f64 = self.read_swap()?; - print!(" {low}...{high}"); - } - for _i in 0..n_individual { - let value: f64 = self.read_swap()?; - print!(" {value}"); - } - } else if width > 0 { - if missing_value_code < 1 || missing_value_code > 3 { - Err(anyhow!("String missing value indicator field is not 0, 1, 2, or 3."))?; - } - for _i in 0..missing_value_code { - let string: [u8; 8] = read_bytes(&mut self.r)?; - let string: Vec = trim_end(Vec::from(string), b'\0'); - println!(" {}", String::from_utf8_lossy(&string)); - } + Mode::Cooked => { + let headers: Vec = reader.collect_headers()?; + let headers = decode(headers, encoding, &|e| panic!("{e}"))?; + for header in headers { + println!("{header:?}"); } - println!(); } - - Ok(()) } - fn read_value_label_record(&mut self) -> Result<()> { - println!("{:08x}: value labels record", self.r.stream_position()?); - - let n_labels: u32 = self.read_swap()?; - for _i in 0..n_labels { - let raw: [u8; 8] = read_bytes(&mut self.r)?; - let label_len: u8 = self.read_swap()?; - let padded_len = round_up(label_len as usize + 1, 8); - - let mut label = read_vec(&mut self.r, padded_len)?; - label.truncate(label_len as usize); - print - } - - Ok(()) + for _ in 0..max_cases { + let Some(Ok(record)) = reader.next() else { + break; + }; + println!("{:?}", record); } + Ok(()) }