X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Flib.rs;h=0b8c541e54612fcee67b45596a6370ec128f5c5b;hb=20aa2ce6ec383c360658a4a4a46f6d51af5dc096;hp=5d6bf5c20876b4f0eb5e457adfc67eaa1362842e;hpb=e62271c65d61e9e84a6eb97a9db4673e710761c4;p=pspp diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 5d6bf5c208..0b8c541e54 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,6 +1,8 @@ #![allow(unused_variables)] use endian::{Endian, Parse}; -use std::io::{BufReader, Error as IoError, Read}; +use num::Integer; +use num_derive::FromPrimitive; +use std::io::{BufReader, Error as IoError, ErrorKind, Read, Seek}; use thiserror::Error; pub mod endian; @@ -10,23 +12,78 @@ pub enum Error { #[error("Not an SPSS system file")] NotASystemFile, - #[error("I/O error ({source})")] - Io { + #[error("Invalid magic number {0:?}")] + BadMagic([u8; 4]), + + #[error("I/O error ({0})")] + Io( #[from] - source: IoError, - }, + IoError, + ), #[error("Invalid SAV compression code {0}")] InvalidSavCompression(u32), #[error("Invalid ZSAV compression code {0}")] InvalidZsavCompression(u32), + + #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] + BadVariableWidth { offset: u64, width: i32 }, + + #[error("Misplaced type 4 record near offset {0:#x}.")] + MisplacedType4Record(u64), + + #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] + BadDocumentLength { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")] + BadRecordType { offset: u64, rec_type: u32 }, + + #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")] + BadVariableLabelCode { offset: u64, code: u32 }, + + #[error( + "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." + )] + BadNumericMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] + BadStringMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] + BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")] + MissingVariableIndexRecord { offset: u64 }, + + #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] + BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] + ExtensionRecordTooLarge { + offset: u64, + subtype: u32, + size: u32, + count: u32, + }, + + #[error("Wrong ZLIB data header offset {zheader_offset:#x} (expected {offset:#x}).")] + BadZlibHeaderOffset { offset: u64, zheader_offset: u64 }, + + #[error("At offset {offset:#x}, impossible ZLIB trailer offset {ztrailer_offset:#x}.")] + BadZlibTrailerOffset { offset: u64, ztrailer_offset: u64 }, + + #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")] + BadZlibTrailerLen { offset: u64, ztrailer_len: u64 }, } #[derive(Error, Debug)] pub enum Warning { #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")] UnexpectedBias(f64), + + #[error("Duplicate type 6 (document) record.")] + DuplicateDocumentRecord, } #[derive(Copy, Clone, Debug)] @@ -35,32 +92,44 @@ pub enum Compression { ZLib, } -pub struct Reader { - r: BufReader, +pub enum Record { + Header(Header), + Document(Document), + Variable(Variable), + ValueLabel(ValueLabel), + VarIndexes(VarIndexes), + Extension(Extension), + EndOfHeaders, + Case(Vec), } -pub const ASCII_MAGIC: &[u8; 4] = b"$FL2"; -pub const ASCII_ZMAGIC: &[u8; 4] = b"$FL3"; -pub const EBCDIC_MAGIC: &[u8; 4] = &[0x5b, 0xc6, 0xd3, 0xf2]; +pub struct Header { + /// Magic number. + pub magic: Magic, -pub struct FileHeader { - /// First 4 bytes of the file, one of `ASCII_MAGIC`, `ASCII_ZMAGIC`, and - /// `EBCDIC_MAGIC`. - pub magic: [u8; 4], + /// Eye-catcher string, product name, in the file's encoding. Padded + /// on the right with spaces. + pub eye_catcher: [u8; 60], - /// True if `magic` indicates that this file contained zlib-compressed data. - pub is_zsav: bool, + /// Layout code, normally either 2 or 3. + pub layout_code: u32, - /// True if `magic` indicates that this file contained EBCDIC data. - pub is_ebcdic: bool, + /// Number of variable positions, or `None` if the value in the file is + /// questionably trustworthy. + pub nominal_case_size: Option, + + /// Compression type, if any, + pub compression: Option, /// 0-based variable index of the weight variable, or `None` if the file is /// unweighted. pub weight_index: Option, - /// Number of variable positions, or `None` if the value in the file is - /// questionably trustworthy. - pub nominal_case_size: Option, + /// Claimed number of cases, if known. + pub n_cases: Option, + + /// Compression bias, usually 100.0. + pub bias: f64, /// `dd mmm yy` in the file's encoding. pub creation_date: [u8; 9], @@ -68,74 +137,570 @@ pub struct FileHeader { /// `HH:MM:SS` in the file's encoding. pub creation_time: [u8; 8], - /// Eye-catcher string, then product name, in the file's encoding. Padded - /// on the right with spaces. - pub eye_catcher: [u8; 60], - /// File label, in the file's encoding. Padded on the right with spaces. pub file_label: [u8; 64], + + /// Endianness of the data in the file header. + pub endianness: Endian, } -impl Reader { - pub fn new(r: R, warn: impl Fn(Warning)) -> Result, Error> { - let mut r = BufReader::new(r); - - let magic: [u8; 4] = read_bytes(&mut r)?; - let (is_zsav, is_ebcdic) = match &magic { - ASCII_MAGIC => (false, false), - ASCII_ZMAGIC => (true, false), - EBCDIC_MAGIC => (false, true), - _ => return Err(Error::NotASystemFile), - }; - - let eye_catcher: [u8; 60] = read_bytes(&mut r)?; - let layout_code: [u8; 4] = read_bytes(&mut r)?; - let endianness = Endian::identify_u32(2, layout_code) - .or_else(|| Endian::identify_u32(2, layout_code)) - .ok_or_else(|| Error::NotASystemFile)?; - - let nominal_case_size: u32 = endianness.parse(read_bytes(&mut r)?); - let nominal_case_size = (nominal_case_size <= u32::MAX / 32).then_some(nominal_case_size); - - let compression_code: u32 = endianness.parse(read_bytes(&mut r)?); - let compression = match (is_zsav, compression_code) { - (false, 0) => None, - (false, 1) => Some(Compression::Simple), - (true, 2) => Some(Compression::ZLib), - (false, code) => return Err(Error::InvalidSavCompression(code)), - (true, code) => return Err(Error::InvalidZsavCompression(code)), - }; - - let weight_index: u32 = endianness.parse(read_bytes(&mut r)?); - let weight_index = (weight_index > 0).then_some(weight_index - 1); - - let n_cases: u32 = endianness.parse(read_bytes(&mut r)?); - let n_cases = (n_cases <= u32::MAX / 4).then_some(n_cases); - - let bias: f64 = endianness.parse(read_bytes(&mut r)?); - if bias != 100.0 { - warn(Warning::UnexpectedBias(bias)) +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Magic([u8; 4]); + +impl Magic { + /// Magic number for a regular system file. + pub const SAV: Magic = Magic(*b"$FL2"); + + /// Magic number for a system file that contains zlib-compressed data. + pub const ZSAV: Magic = Magic(*b"$FL3"); + + /// Magic number for an EBDIC-encoded system file. This is `$FL2` encoded + /// in EBCDIC. + pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]); +} + +impl TryFrom<[u8; 4]> for Magic { + type Error = Error; + + fn try_from(value: [u8; 4]) -> Result { + let magic = Magic(value); + match magic { + Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic), + _ => Err(Error::BadMagic(value)), + } + } +} + +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub enum VarType { + Number, + String, +} + +impl VarType { + fn from_width(width: i32) -> VarType { + match width { + 0 => VarType::Number, + _ => VarType::String, + } + } +} + +pub struct Reader { + r: BufReader, + var_types: Vec, + state: ReaderState, +} + +enum ReaderState { + Start, + Headers(Endian, Option), + Data(Endian), + End, +} + +#[derive(Copy, Clone)] +pub enum Value { + Number(Option), + String([u8; 8]), +} + +impl Value { + pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value { + match var_type { + VarType::String => Value::String(raw), + VarType::Number => { + let number: f64 = endian.parse(raw); + Value::Number((number != -f64::MAX).then_some(number)) + } } + } +} - let creation_date: [u8; 9] = read_bytes(&mut r)?; - let creation_time: [u8; 8] = read_bytes(&mut r)?; - let file_label: [u8; 64] = read_bytes(&mut r)?; - let _: [u8; 3] = read_bytes(&mut r)?; - - let header = FileHeader { - magic, - is_zsav, - is_ebcdic, - weight_index, - nominal_case_size, - creation_date, - creation_time, - eye_catcher, - file_label, - }; - - Ok(Reader { r }) +impl Reader { + pub fn new(r: R) -> Result, Error> { + Ok(Reader { + r: BufReader::new(r), + var_types: Vec::new(), + state: ReaderState::Start, + }) } + fn _next(&mut self) -> Result, Error> { + match self.state { + ReaderState::Start => { + let header = read_header(&mut self.r)?; + let next_state = ReaderState::Headers(header.endianness, header.compression); + Ok(Some((Record::Header(header), next_state))) + } + ReaderState::Headers(endian, compression) => { + let rec_type: u32 = endian.parse(read_bytes(&mut self.r)?); + let record = match rec_type { + 2 => { + let variable = read_variable_record(&mut self.r, endian)?; + self.var_types.push(VarType::from_width(variable.width)); + Record::Variable(variable) + } + 3 => Record::ValueLabel(read_value_label_record(&mut self.r, endian)?), + 4 => Record::VarIndexes(read_var_indexes_record(&mut self.r, endian)?), + 6 => Record::Document(read_document_record(&mut self.r, endian)?), + 7 => Record::Extension(read_extension_record(&mut self.r, endian)?), + 999 => { + let _: [u8; 4] = read_bytes(&mut self.r)?; + let next_state = match compression { + None => ReaderState::Data(endian), + _ => ReaderState::End, + }; + return Ok(Some((Record::EndOfHeaders, next_state))); + } + _ => { + return Err(Error::BadRecordType { + offset: self.r.stream_position()?, + rec_type, + }) + } + }; + Ok(Some((record, ReaderState::Headers(endian, compression)))) + } + ReaderState::Data(endian) => { + let mut values = Vec::with_capacity(self.var_types.len()); + for (i, &var_type) in self.var_types.iter().enumerate() { + let raw = match read_bytes(&mut self.r) { + Ok(raw) => raw, + Err(err) => { + if i == 0 && err.kind() == ErrorKind::UnexpectedEof { + return Ok(None); + } else { + return Err(Error::Io(err)); + } + } + }; + values.push(Value::from_raw(var_type, raw, endian)); + } + Ok(Some((Record::Case(values), ReaderState::Data(endian)))) + } + ReaderState::End => Ok(None), + } + } +} + +impl Iterator for Reader { + type Item = Result; + + fn next(&mut self) -> Option { + let retval = self._next(); + match retval { + Ok(None) => { + self.state = ReaderState::End; + None + } + Ok(Some((record, next_state))) => { + self.state = next_state; + Some(Ok(record)) + } + Err(error) => { + self.state = ReaderState::End; + Some(Err(error)) + } + } + } +} + +fn read_header(r: &mut R) -> Result { + let magic: [u8; 4] = read_bytes(r)?; + let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; + + let eye_catcher: [u8; 60] = read_bytes(r)?; + let layout_code: [u8; 4] = read_bytes(r)?; + let endianness = Endian::identify_u32(2, layout_code) + .or_else(|| Endian::identify_u32(2, layout_code)) + .ok_or_else(|| Error::NotASystemFile)?; + let layout_code = endianness.parse(layout_code); + + let nominal_case_size: u32 = endianness.parse(read_bytes(r)?); + let nominal_case_size = + (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size); + + let compression_code: u32 = endianness.parse(read_bytes(r)?); + let compression = match (magic, compression_code) { + (Magic::ZSAV, 2) => Some(Compression::ZLib), + (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)), + (_, 0) => None, + (_, 1) => Some(Compression::Simple), + (_, code) => return Err(Error::InvalidSavCompression(code)), + }; + + let weight_index: u32 = endianness.parse(read_bytes(r)?); + let weight_index = (weight_index > 0).then_some(weight_index - 1); + + let n_cases: u32 = endianness.parse(read_bytes(r)?); + let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); + + let bias: f64 = endianness.parse(read_bytes(r)?); + + let creation_date: [u8; 9] = read_bytes(r)?; + let creation_time: [u8; 8] = read_bytes(r)?; + let file_label: [u8; 64] = read_bytes(r)?; + let _: [u8; 3] = read_bytes(r)?; + + Ok(Header { + magic, + layout_code, + nominal_case_size, + compression, + weight_index, + n_cases, + bias, + creation_date, + creation_time, + eye_catcher, + file_label, + endianness, + }) +} + +pub struct Variable { + /// Offset from the start of the file to the start of the record. + pub offset: u64, + + /// Variable width, in the range -1..=255. + pub width: i32, + + /// Variable name, padded on the right with spaces. + pub name: [u8; 8], + + /// Print format. + pub print_format: u32, + + /// Write format. + pub write_format: u32, + + /// Missing value code, one of -3, -2, 0, 1, 2, or 3. + pub missing_value_code: i32, + + /// Raw missing values, up to 3 of them. + pub missing: Vec<[u8; 8]>, + + /// Optional variable label. + pub label: Option>, +} + +fn read_variable_record( + r: &mut BufReader, + e: Endian, +) -> Result { + let offset = r.stream_position()?; + let width: i32 = e.parse(read_bytes(r)?); + let has_variable_label: u32 = e.parse(read_bytes(r)?); + let missing_value_code: i32 = e.parse(read_bytes(r)?); + let print_format: u32 = e.parse(read_bytes(r)?); + let write_format: u32 = e.parse(read_bytes(r)?); + let name: [u8; 8] = read_bytes(r)?; + + let label = match has_variable_label { + 0 => None, + 1 => { + let len: u32 = e.parse(read_bytes(r)?); + let read_len = len.min(65535) as usize; + let label = Some(read_vec(r, read_len)?); + + let padding_bytes = Integer::next_multiple_of(&len, &4) - len; + let _ = read_vec(r, padding_bytes as usize)?; + + label + } + _ => { + return Err(Error::BadVariableLabelCode { + offset, + code: has_variable_label, + }) + } + }; + + let mut missing = Vec::new(); + if missing_value_code != 0 { + match (width, missing_value_code) { + (0, -3 | -2 | 1 | 2 | 3) => (), + (0, _) => { + return Err(Error::BadNumericMissingValueCode { + offset, + code: missing_value_code, + }) + } + (_, 0..=3) => (), + (_, _) => { + return Err(Error::BadStringMissingValueCode { + offset, + code: missing_value_code, + }) + } + } + + for _ in 0..missing_value_code.abs() { + missing.push(read_bytes(r)?); + } + } + + Ok(Variable { + offset, + width, + name, + print_format, + write_format, + missing_value_code, + missing, + label, + }) +} + +pub struct ValueLabel { + /// Offset from the start of the file to the start of the record. + pub offset: u64, + + /// The labels. + pub labels: Vec<([u8; 8], Vec)>, +} + +impl ValueLabel { + /// Maximum number of value labels in a record. + pub const MAX: u32 = u32::MAX / 8; +} + +fn read_value_label_record( + r: &mut BufReader, + e: Endian, +) -> Result { + let offset = r.stream_position()?; + let n: u32 = e.parse(read_bytes(r)?); + if n > ValueLabel::MAX { + return Err(Error::BadNumberOfValueLabels { + offset, + n, + max: ValueLabel::MAX, + }); + } + + let mut labels = Vec::new(); + for _ in 0..n { + let value: [u8; 8] = read_bytes(r)?; + let label_len: u8 = e.parse(read_bytes(r)?); + let label_len = label_len as usize; + let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); + + let mut label = read_vec(r, padded_len)?; + label.truncate(label_len); + labels.push((value, label)); + } + Ok(ValueLabel { offset, labels }) +} + +pub struct VarIndexes { + /// Offset from the start of the file to the start of the record. + pub offset: u64, + + /// The 0-based indexes of the variable indexes. + pub var_indexes: Vec, +} + +impl VarIndexes { + /// Maximum number of variable indexes in a record. + pub const MAX: u32 = u32::MAX / 8; +} + +fn read_var_indexes_record( + r: &mut BufReader, + e: Endian, +) -> Result { + let offset = r.stream_position()?; + let n: u32 = e.parse(read_bytes(r)?); + if n > VarIndexes::MAX { + return Err(Error::BadNumberOfVarIndexes { + offset, + n, + max: VarIndexes::MAX, + }); + } + let mut var_indexes = Vec::with_capacity(n as usize); + for _ in 0..n { + var_indexes.push(e.parse(read_bytes(r)?)); + } + + Ok(VarIndexes { + offset, + var_indexes, + }) +} + +pub const DOC_LINE_LEN: u32 = 80; +pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN; + +pub struct Document { + /// Offset from the start of the file to the start of the record. + pub pos: u64, + + /// The document, as an array of 80-byte lines. + pub lines: Vec<[u8; DOC_LINE_LEN as usize]>, +} + +fn read_document_record( + r: &mut BufReader, + e: Endian, +) -> Result { + let offset = r.stream_position()?; + let n: u32 = e.parse(read_bytes(r)?); + match n { + 0..=DOC_MAX_LINES => { + let pos = r.stream_position()?; + let mut lines = Vec::with_capacity(n as usize); + for _ in 0..n { + let line: [u8; 80] = read_bytes(r)?; + lines.push(line); + } + Ok(Document { pos, lines }) + } + _ => Err(Error::BadDocumentLength { + offset, + n, + max: DOC_MAX_LINES, + }), + } +} + +#[derive(FromPrimitive)] +enum ExtensionType { + /// Machine integer info. + Integer = 3, + /// Machine floating-point info. + Float = 4, + /// Variable sets. + VarSets = 5, + /// DATE. + Date = 6, + /// Multiple response sets. + Mrsets = 7, + /// SPSS Data Entry. + DataEntry = 8, + /// Extra product info text. + ProductInfo = 10, + /// Variable display parameters. + Display = 11, + /// Long variable names. + LongNames = 13, + /// Long strings. + LongStrings = 14, + /// Extended number of cases. + Ncases = 16, + /// Data file attributes. + FileAttrs = 17, + /// Variable attributes. + VarAttrs = 18, + /// Multiple response sets (extended). + Mrsets2 = 19, + /// Character encoding. + Encoding = 20, + /// Value labels for long strings. + LongLabels = 21, + /// Missing values for long strings. + LongMissing = 22, + /// "Format properties in dataview table". + Dataview = 24, +} + +pub struct Extension { + /// Offset from the start of the file to the start of the record. + pub offset: u64, + + /// Record subtype. + pub subtype: u32, + + /// Size of each data element. + pub size: u32, + + /// Number of data elements. + pub count: u32, + + /// `size * count` bytes of data. + pub data: Vec, +} + +fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) { + match extension { + /* Implemented record types. */ + ExtensionType::Integer => (4, 8), + ExtensionType::Float => (8, 3), + ExtensionType::VarSets => (1, 0), + ExtensionType::Mrsets => (1, 0), + ExtensionType::ProductInfo => (1, 0), + ExtensionType::Display => (4, 0), + ExtensionType::LongNames => (1, 0), + ExtensionType::LongStrings => (1, 0), + ExtensionType::Ncases => (8, 2), + ExtensionType::FileAttrs => (1, 0), + ExtensionType::VarAttrs => (1, 0), + ExtensionType::Mrsets2 => (1, 0), + ExtensionType::Encoding => (1, 0), + ExtensionType::LongLabels => (1, 0), + ExtensionType::LongMissing => (1, 0), + + /* Ignored record types. */ + ExtensionType::Date => (0, 0), + ExtensionType::DataEntry => (0, 0), + ExtensionType::Dataview => (0, 0), + } +} + +fn read_extension_record( + r: &mut BufReader, + e: Endian, +) -> Result { + let subtype = e.parse(read_bytes(r)?); + let offset = r.stream_position()?; + let size: u32 = e.parse(read_bytes(r)?); + let count = e.parse(read_bytes(r)?); + let Some(product) = size.checked_mul(count) else { + return Err(Error::ExtensionRecordTooLarge { + offset, + subtype, + size, + count, + }); + }; + let offset = r.stream_position()?; + let data = read_vec(r, product as usize)?; + Ok(Extension { + offset, + subtype, + size, + count, + data, + }) +} + +struct ZHeader { + /// File offset to the start of the record. + offset: u64, + + /// File offset to the ZLIB data header. + zheader_offset: u64, + + /// File offset to the ZLIB trailer. + ztrailer_offset: u64, + + /// Length of the ZLIB trailer in bytes. + ztrailer_len: u64, +} + +fn read_zheader(r: &mut BufReader, e: Endian) -> Result { + let offset = r.stream_position()?; + let zheader_offset: u64 = e.parse(read_bytes(r)?); + let ztrailer_offset: u64 = e.parse(read_bytes(r)?); + let ztrailer_len: u64 = e.parse(read_bytes(r)?); + + Ok(ZHeader { + offset, + zheader_offset, + ztrailer_offset, + ztrailer_len, + }) } fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { @@ -144,6 +709,12 @@ fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { Ok(buf) } +fn read_vec(r: &mut BufReader, n: usize) -> Result, IoError> { + let mut vec = vec![0; n]; + r.read_exact(&mut vec)?; + Ok(vec) +} + /* fn trim_end(mut s: Vec, c: u8) -> Vec { while s.last() == Some(&c) {