X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Flib.rs;h=482bd081979a8ec060945304729680ad8c081323;hb=163d8e396c5b2fd5afd68903d08bff938f13d048;hp=26db62dc119548097ebff22762aecf9c73c0dd27;hpb=7434003a421667a79280c63277ee8c7e8b0e9352;p=pspp diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 26db62dc11..482bd08197 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,21 +1,23 @@ -#![allow(unused_variables)] -use endian::{Endian, Parse}; -use num::Integer; -use std::io::{BufReader, Error as IoError, Read, Seek}; -use thiserror::Error; +use std::io::Error as IoError; +use thiserror::Error as ThisError; pub mod endian; +pub mod raw; +pub mod cooked; +pub mod sack; +pub mod encoding; +pub mod format; -#[derive(Error, Debug)] +#[derive(ThisError, Debug)] pub enum Error { #[error("Not an SPSS system file")] NotASystemFile, - #[error("I/O error ({source})")] - Io { - #[from] - source: IoError, - }, + #[error("Invalid magic number {0:?}")] + BadMagic([u8; 4]), + + #[error("I/O error ({0})")] + Io(#[from] IoError), #[error("Invalid SAV compression code {0}")] InvalidSavCompression(u32), @@ -23,13 +25,13 @@ pub enum Error { #[error("Invalid ZSAV compression code {0}")] InvalidZsavCompression(u32), - #[error("Misplaced type 4 record near offset {0:#x}.")] - MisplacedType4Record(u64), + #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] + BadVariableWidth { offset: u64, width: i32 }, #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] BadDocumentLength { offset: u64, n: u32, max: u32 }, - #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")] + #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] BadRecordType { offset: u64, rec_type: u32 }, #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")] @@ -46,411 +48,79 @@ pub enum Error { #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, - #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")] - MissingVariableIndexRecord { offset: u64 }, - - #[error("At offset {offset:#x}, number of variables associated with a value label ({n}) is not between 1 and the number of variables ({max}).")] - BadNumberOfValueLabelVariables { offset: u64, n: u32, max: u32 }, -} - -#[derive(Error, Debug)] -pub enum Warning { - #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")] - UnexpectedBias(f64), - - #[error("Duplicate type 6 (document) record.")] - DuplicateDocumentRecord, -} - -#[derive(Copy, Clone, Debug)] -pub enum Compression { - Simple, - ZLib, -} - -pub struct Reader { - r: BufReader, - - document_record: Option, - - variables: Vec, - - value_labels: Vec, -} - -/// Magic number for a regular system file. -pub const ASCII_MAGIC: &[u8; 4] = b"$FL2"; - -/// Magic number for a system file that contains zlib-compressed data. -pub const ASCII_ZMAGIC: &[u8; 4] = b"$FL3"; - -/// Magic number for an EBDIC-encoded system file. This is `$FL2` encoded in -/// EBCDIC. -pub const EBCDIC_MAGIC: &[u8; 4] = &[0x5b, 0xc6, 0xd3, 0xf2]; - -pub struct FileHeader { - /// First 4 bytes of the file, one of `ASCII_MAGIC`, `ASCII_ZMAGIC`, and - /// `EBCDIC_MAGIC`. - pub magic: [u8; 4], - - /// True if `magic` indicates that this file contained zlib-compressed data. - pub is_zsav: bool, - - /// True if `magic` indicates that this file contained EBCDIC data. - pub is_ebcdic: bool, - - /// Endianness of the data in the file header. - pub endianness: Endian, - - /// 0-based variable index of the weight variable, or `None` if the file is - /// unweighted. - pub weight_index: Option, - - /// Number of variable positions, or `None` if the value in the file is - /// questionably trustworthy. - pub nominal_case_size: Option, - - /// `dd mmm yy` in the file's encoding. - pub creation_date: [u8; 9], - - /// `HH:MM:SS` in the file's encoding. - pub creation_time: [u8; 8], - - /// Eye-catcher string, then product name, in the file's encoding. Padded - /// on the right with spaces. - pub eye_catcher: [u8; 60], - - /// File label, in the file's encoding. Padded on the right with spaces. - pub file_label: [u8; 64], -} - -pub const DOC_LINE_LEN: u32 = 80; -pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN; - -impl Reader { - pub fn new(r: R, warn: impl Fn(Warning)) -> Result, Error> { - let mut r = BufReader::new(r); - - let header = read_header(&mut r, &warn)?; - let e = header.endianness; - let mut document_record = None; - let mut variables = Vec::new(); - let mut value_labels = Vec::new(); - loop { - let offset = r.stream_position()?; - let rec_type: u32 = e.parse(read_bytes(&mut r)?); - match rec_type { - 2 => variables.push(read_variable_record(&mut r, e)?), - 3 => value_labels.push(read_value_label_record(&mut r, e, variables.len())?), - // A Type 4 record is always immediately after a type 3 record, - // the code for type 3 records reads the type 4 record too. - 4 => return Err(Error::MisplacedType4Record(offset)), - - 6 => { - let d = read_document_record(&mut r, e)?; - if document_record.is_some() { - warn(Warning::DuplicateDocumentRecord); - } else { - document_record = d; - } - } - /* - 7 => d.read_extension_record()?, - */ - 999 => break, - _ => return Err(Error::BadRecordType { offset, rec_type }), - } - } - - Ok(Reader { - r, - document_record, - variables, - value_labels, - }) - } -} - -fn read_header(r: &mut R, warn: impl Fn(Warning)) -> Result { - let magic: [u8; 4] = read_bytes(r)?; - let (is_zsav, is_ebcdic) = match &magic { - ASCII_MAGIC => (false, false), - ASCII_ZMAGIC => (true, false), - EBCDIC_MAGIC => (false, true), - _ => return Err(Error::NotASystemFile), - }; - - let eye_catcher: [u8; 60] = read_bytes(r)?; - let layout_code: [u8; 4] = read_bytes(r)?; - let endianness = Endian::identify_u32(2, layout_code) - .or_else(|| Endian::identify_u32(2, layout_code)) - .ok_or_else(|| Error::NotASystemFile)?; + #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] + BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, - let nominal_case_size: u32 = endianness.parse(read_bytes(r)?); - let nominal_case_size = - (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size); - - let compression_code: u32 = endianness.parse(read_bytes(r)?); - let compression = match (is_zsav, compression_code) { - (false, 0) => None, - (false, 1) => Some(Compression::Simple), - (true, 2) => Some(Compression::ZLib), - (false, code) => return Err(Error::InvalidSavCompression(code)), - (true, code) => return Err(Error::InvalidZsavCompression(code)), - }; - - let weight_index: u32 = endianness.parse(read_bytes(r)?); - let weight_index = (weight_index > 0).then_some(weight_index - 1); - - let n_cases: u32 = endianness.parse(read_bytes(r)?); - let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); - - let bias: f64 = endianness.parse(read_bytes(r)?); - if bias != 100.0 { - warn(Warning::UnexpectedBias(bias)) - } - - let creation_date: [u8; 9] = read_bytes(r)?; - let creation_time: [u8; 8] = read_bytes(r)?; - let file_label: [u8; 64] = read_bytes(r)?; - let _: [u8; 3] = read_bytes(r)?; - - Ok(FileHeader { - magic, - is_zsav, - is_ebcdic, - endianness, - weight_index, - nominal_case_size, - creation_date, - creation_time, - eye_catcher, - file_label, - }) -} - -pub struct VariableRecord { - /// Offset from the start of the file to the start of the record. - pub offset: u64, - - /// Variable width, in the range -1..=255. - pub width: i32, - - /// Variable name, padded on the right with spaces. - pub name: [u8; 8], - - /// Print format. - pub print_format: u32, - - /// Write format. - pub write_format: u32, - - /// Missing value code, one of -3, -2, 0, 1, 2, or 3. - pub missing_value_code: i32, - - /// Raw missing values, up to 3 of them. - pub missing: Vec<[u8; 8]>, - - /// Optional variable label. - pub label: Option>, -} - -fn read_variable_record( - r: &mut BufReader, - e: Endian, -) -> Result { - let offset = r.stream_position()?; - let width: i32 = e.parse(read_bytes(r)?); - let has_variable_label: u32 = e.parse(read_bytes(r)?); - let missing_value_code: i32 = e.parse(read_bytes(r)?); - let print_format: u32 = e.parse(read_bytes(r)?); - let write_format: u32 = e.parse(read_bytes(r)?); - let name: [u8; 8] = read_bytes(r)?; - - let label = match has_variable_label { - 0 => None, - 1 => { - let len: u32 = e.parse(read_bytes(r)?); - let read_len = len.min(65535) as usize; - let label = Some(read_vec(r, read_len)?); - - let padding_bytes = Integer::next_multiple_of(&len, &4) - len; - let _ = read_vec(r, padding_bytes as usize)?; - - label - } - _ => { - return Err(Error::BadVariableLabelCode { - offset, - code: has_variable_label, - }) - } - }; - - let mut missing = Vec::new(); - if missing_value_code != 0 { - match (width, missing_value_code) { - (0, -3 | -2 | 1 | 2 | 3) => (), - (0, _) => { - return Err(Error::BadNumericMissingValueCode { - offset, - code: missing_value_code, - }) - } - (_, 0..=3) => (), - (_, _) => { - return Err(Error::BadStringMissingValueCode { - offset, - code: missing_value_code, - }) - } - } - - for _ in 0..missing_value_code.abs() { - missing.push(read_bytes(r)?); - } - } - - Ok(VariableRecord { - offset, - width, - name, - print_format, - write_format, - missing_value_code, - missing, - label, - }) -} - -pub struct ValueLabelRecord { - /// Offset from the start of the file to the start of the record. - pub offset: u64, + #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] + ExtensionRecordTooLarge { + offset: u64, + subtype: u32, + size: u32, + count: u32, + }, - /// The labels. - pub labels: Vec<([u8; 8], Vec)>, + #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] + EofInCase { + offset: u64, + case_ofs: u64, + case_len: usize, + }, - /// The 0-based indexes of the variables to which the labels are assigned. - pub var_indexes: Vec, -} + #[error( + "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." + )] + EofInCompressedCase { offset: u64, case_ofs: u64 }, -pub const MAX_VALUE_LABELS: u32 = u32::MAX / 8; + #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] + PartialCompressedCase { offset: u64, case_ofs: u64 }, -fn read_value_label_record( - r: &mut BufReader, - e: Endian, - n_var_records: usize, -) -> Result { - let offset = r.stream_position()?; - let n: u32 = e.parse(read_bytes(r)?); - if n > MAX_VALUE_LABELS { - return Err(Error::BadNumberOfValueLabels { - offset, - n, - max: MAX_VALUE_LABELS, - }); - } + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] + CompressedNumberExpected { offset: u64, case_ofs: u64 }, - let mut labels = Vec::new(); - for _ in 0..n { - let value: [u8; 8] = read_bytes(r)?; - let label_len: u8 = e.parse(read_bytes(r)?); - let label_len = label_len as usize; - let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] + CompressedStringExpected { offset: u64, case_ofs: u64 }, - let mut label = read_vec(r, padded_len)?; - label.truncate(label_len); - labels.push((value, label)); - } + #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] + BadZlibTrailerNBlocks { + offset: u64, + n_blocks: u32, + expected_n_blocks: u64, + ztrailer_len: u64, + }, - let rec_type: u32 = e.parse(read_bytes(r)?); - if rec_type != 4 { - return Err(Error::MissingVariableIndexRecord { - offset: r.stream_position()?, - }); - } + #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] + BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 }, - let n_vars: u32 = e.parse(read_bytes(r)?); - if n_vars < 1 || n_vars as usize > n_var_records { - return Err(Error::BadNumberOfValueLabelVariables { - offset: r.stream_position()?, - n: n_vars, - max: n_var_records as u32, - }); - } - let mut var_indexes = Vec::with_capacity(n_vars as usize); - for _ in 0..n_vars { - var_indexes.push(e.parse(read_bytes(r)?)); - } + #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] + BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 }, - Ok(ValueLabelRecord { - offset, - labels, - var_indexes, - }) -} + #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] + BadEncodingName { offset: u64 }, -pub struct DocumentRecord { - /// Offset from the start of the file to the start of the record. - pub pos: u64, + #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")] + BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 }, - /// The document, as an array of 80-byte lines. - pub lines: Vec<[u8; DOC_LINE_LEN as usize]>, -} + #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] + BadLongMissingValueFormat, -fn read_document_record( - r: &mut BufReader, - e: Endian, -) -> Result, Error> { - let offset = r.stream_position()?; - let n: u32 = e.parse(read_bytes(r)?); - if n == 0 { - Ok(None) - } else if n > DOC_MAX_LINES { - Err(Error::BadDocumentLength { - offset, - n, - max: DOC_MAX_LINES, - }) - } else { - let pos = r.stream_position()?; - let mut lines = Vec::with_capacity(n as usize); - for i in 0..n { - let line: [u8; 80] = read_bytes(r)?; - lines.push(line); - } - Ok(Some(DocumentRecord { pos, lines })) - } -} + #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")] + InvalidCreationDate { creation_date: String }, -fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { - let mut buf = [0; N]; - r.read_exact(&mut buf)?; - Ok(buf) -} + #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")] + InvalidCreationTime { creation_time: String }, -fn read_vec(r: &mut BufReader, n: usize) -> Result, IoError> { - let mut vec = vec![0; n]; - r.read_exact(&mut vec)?; - Ok(vec) + #[error("Details TBD")] + TBD, } -/* -fn trim_end(mut s: Vec, c: u8) -> Vec { - while s.last() == Some(&c) { - s.pop(); - } - s +#[derive(Copy, Clone, Debug)] +pub enum Compression { + Simple, + ZLib, } -fn skip_bytes(r: &mut R, mut n: u64) -> Result<(), IoError> { - let mut buf = [0; 1024]; - while n > 0 { - let chunk = u64::min(n, buf.len() as u64); - r.read_exact(&mut buf[0..chunk as usize])?; - n -= chunk; - } - Ok(()) +#[derive(Clone, Debug)] +pub enum CategoryLabels { + VarLabels, + CountedValues, } - -*/