X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Flib.rs;h=3eb4bbae4eba2032a46a954c05c98d0d99959525;hb=6165ed413d9aa818e3246d0a063c646dc4efc7e5;hp=8e8ee6cab4fad89ec9e749960b40f8330cf0cdde;hpb=eb8cd5c6597e10e02bc0321ac5893ec7463797fa;p=pspp diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 8e8ee6cab4..3eb4bbae4e 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,699 +1,8 @@ -#![allow(unused_variables)] -use endian::{Endian, Parse}; -use num::Integer; -use num_derive::FromPrimitive; -use std::io::{BufReader, Error as IoError, Read, Seek}; -use thiserror::Error; - +pub mod cooked; +pub mod encoding; pub mod endian; - -#[derive(Error, Debug)] -pub enum Error { - #[error("Not an SPSS system file")] - NotASystemFile, - - #[error("Invalid magic number {0:?}")] - BadMagic([u8; 4]), - - #[error("I/O error ({source})")] - Io { - #[from] - source: IoError, - }, - - #[error("Invalid SAV compression code {0}")] - InvalidSavCompression(u32), - - #[error("Invalid ZSAV compression code {0}")] - InvalidZsavCompression(u32), - - #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] - BadVariableWidth { offset: u64, width: i32 }, - - #[error("Misplaced type 4 record near offset {0:#x}.")] - MisplacedType4Record(u64), - - #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] - BadDocumentLength { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")] - BadRecordType { offset: u64, rec_type: u32 }, - - #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")] - BadVariableLabelCode { offset: u64, code: u32 }, - - #[error( - "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." - )] - BadNumericMissingValueCode { offset: u64, code: i32 }, - - #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] - BadStringMissingValueCode { offset: u64, code: i32 }, - - #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] - BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")] - MissingVariableIndexRecord { offset: u64 }, - - #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] - BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] - ExtensionRecordTooLarge { - offset: u64, - subtype: u32, - size: u32, - count: u32, - }, - - #[error("Wrong ZLIB data header offset {zheader_offset:#x} (expected {offset:#x}).")] - BadZlibHeaderOffset { offset: u64, zheader_offset: u64 }, - - #[error("At offset {offset:#x}, impossible ZLIB trailer offset {ztrailer_offset:#x}.")] - BadZlibTrailerOffset { offset: u64, ztrailer_offset: u64 }, - - #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")] - BadZlibTrailerLen { offset: u64, ztrailer_len: u64 }, -} - -#[derive(Error, Debug)] -pub enum Warning { - #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")] - UnexpectedBias(f64), - - #[error("Duplicate type 6 (document) record.")] - DuplicateDocumentRecord, -} - -#[derive(Copy, Clone, Debug)] -pub enum Compression { - Simple, - ZLib, -} - -pub enum Record { - Header(Header), - Document(Document), - Variable(Variable), - ValueLabel(ValueLabel), - VarIndexes(VarIndexes), - Extension(Extension), - EndOfHeaders, -} - -pub struct Header { - /// Magic number. - pub magic: Magic, - - /// Eye-catcher string, product name, in the file's encoding. Padded - /// on the right with spaces. - pub eye_catcher: [u8; 60], - - /// Layout code, normally either 2 or 3. - pub layout_code: u32, - - /// Number of variable positions, or `None` if the value in the file is - /// questionably trustworthy. - pub nominal_case_size: Option, - - /// Compression type, if any, - pub compression: Option, - - /// 0-based variable index of the weight variable, or `None` if the file is - /// unweighted. - pub weight_index: Option, - - /// Claimed number of cases, if known. - pub n_cases: Option, - - /// Compression bias, usually 100.0. - pub bias: f64, - - /// `dd mmm yy` in the file's encoding. - pub creation_date: [u8; 9], - - /// `HH:MM:SS` in the file's encoding. - pub creation_time: [u8; 8], - - /// File label, in the file's encoding. Padded on the right with spaces. - pub file_label: [u8; 64], - - /// Endianness of the data in the file header. - pub endianness: Endian, -} - -#[derive(Copy, Clone, PartialEq, Eq, Hash)] -pub struct Magic([u8; 4]); - -impl Magic { - /// Magic number for a regular system file. - pub const SAV: Magic = Magic(*b"$FL2"); - - /// Magic number for a system file that contains zlib-compressed data. - pub const ZSAV: Magic = Magic(*b"$FL3"); - - /// Magic number for an EBDIC-encoded system file. This is `$FL2` encoded - /// in EBCDIC. - pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]); -} - -impl TryFrom<[u8; 4]> for Magic { - type Error = Error; - - fn try_from(value: [u8; 4]) -> Result { - let magic = Magic(value); - match magic { - Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic), - _ => Err(Error::BadMagic(value)), - } - } -} - -enum VarType { - Number, - String, -} - -impl VarType { - fn from_width(width: i32) -> VarType { - match width { - 0 => VarType::Number, - _ => VarType::String, - } - } -} - -pub struct Reader { - r: BufReader, - var_types: Vec, - state: ReaderState, -} - -enum ReaderState { - Start, - Headers(Endian, Option), - Data(Endian), - End, -} - -impl Reader { - pub fn new(r: R) -> Result, Error> { - Ok(Reader { - r: BufReader::new(r), - var_types: Vec::new(), - state: ReaderState::Start, - }) - } - fn _next(&mut self) -> Result, Error> { - match self.state { - ReaderState::Start => { - let header = read_header(&mut self.r)?; - let next_state = ReaderState::Headers(header.endianness, header.compression); - Ok(Some((Record::Header(header), next_state))) - } - ReaderState::Headers(endian, compression) => { - let rec_type: u32 = endian.parse(read_bytes(&mut self.r)?); - let record = match rec_type { - 2 => { - let variable = read_variable_record(&mut self.r, endian)?; - self.var_types.push(VarType::from_width(variable.width)); - Record::Variable(variable) - } - 3 => Record::ValueLabel(read_value_label_record(&mut self.r, endian)?), - 4 => Record::VarIndexes(read_var_indexes_record(&mut self.r, endian)?), - 6 => Record::Document(read_document_record(&mut self.r, endian)?), - 7 => Record::Extension(read_extension_record(&mut self.r, endian)?), - 999 => { - let _: [u8; 4] = read_bytes(&mut self.r)?; - let next_state = match compression { - None => ReaderState::Data(endian), - _ => ReaderState::End, - }; - return Ok(Some((Record::EndOfHeaders, next_state))); - } - _ => { - return Err(Error::BadRecordType { - offset: self.r.stream_position()?, - rec_type, - }) - } - }; - Ok(Some((record, ReaderState::Headers(endian, compression)))) - } - ReaderState::End => Ok(None), - } - } -} - -impl Iterator for Reader { - type Item = Result; - - fn next(&mut self) -> Option { - let retval = self._next(); - match retval { - Ok(None) => { - self.state = ReaderState::End; - None - } - Ok(Some((record, next_state))) => { - self.state = next_state; - Some(Ok(record)) - } - Err(error) => { - self.state = ReaderState::End; - Some(Err(error)) - } - } - } -} - -fn read_header(r: &mut R) -> Result { - let magic: [u8; 4] = read_bytes(r)?; - let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; - - let eye_catcher: [u8; 60] = read_bytes(r)?; - let layout_code: [u8; 4] = read_bytes(r)?; - let endianness = Endian::identify_u32(2, layout_code) - .or_else(|| Endian::identify_u32(2, layout_code)) - .ok_or_else(|| Error::NotASystemFile)?; - let layout_code = endianness.parse(layout_code); - - let nominal_case_size: u32 = endianness.parse(read_bytes(r)?); - let nominal_case_size = - (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size); - - let compression_code: u32 = endianness.parse(read_bytes(r)?); - let compression = match (magic, compression_code) { - (Magic::ZSAV, 2) => Some(Compression::ZLib), - (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)), - (_, 0) => None, - (_, 1) => Some(Compression::Simple), - (_, code) => return Err(Error::InvalidSavCompression(code)), - }; - - let weight_index: u32 = endianness.parse(read_bytes(r)?); - let weight_index = (weight_index > 0).then_some(weight_index - 1); - - let n_cases: u32 = endianness.parse(read_bytes(r)?); - let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); - - let bias: f64 = endianness.parse(read_bytes(r)?); - - let creation_date: [u8; 9] = read_bytes(r)?; - let creation_time: [u8; 8] = read_bytes(r)?; - let file_label: [u8; 64] = read_bytes(r)?; - let _: [u8; 3] = read_bytes(r)?; - - Ok(Header { - magic, - layout_code, - nominal_case_size, - compression, - weight_index, - n_cases, - bias, - creation_date, - creation_time, - eye_catcher, - file_label, - endianness, - }) -} - -pub struct Variable { - /// Offset from the start of the file to the start of the record. - pub offset: u64, - - /// Variable width, in the range -1..=255. - pub width: i32, - - /// Variable name, padded on the right with spaces. - pub name: [u8; 8], - - /// Print format. - pub print_format: u32, - - /// Write format. - pub write_format: u32, - - /// Missing value code, one of -3, -2, 0, 1, 2, or 3. - pub missing_value_code: i32, - - /// Raw missing values, up to 3 of them. - pub missing: Vec<[u8; 8]>, - - /// Optional variable label. - pub label: Option>, -} - -fn read_variable_record( - r: &mut BufReader, - e: Endian, -) -> Result { - let offset = r.stream_position()?; - let width: i32 = e.parse(read_bytes(r)?); - let has_variable_label: u32 = e.parse(read_bytes(r)?); - let missing_value_code: i32 = e.parse(read_bytes(r)?); - let print_format: u32 = e.parse(read_bytes(r)?); - let write_format: u32 = e.parse(read_bytes(r)?); - let name: [u8; 8] = read_bytes(r)?; - - let label = match has_variable_label { - 0 => None, - 1 => { - let len: u32 = e.parse(read_bytes(r)?); - let read_len = len.min(65535) as usize; - let label = Some(read_vec(r, read_len)?); - - let padding_bytes = Integer::next_multiple_of(&len, &4) - len; - let _ = read_vec(r, padding_bytes as usize)?; - - label - } - _ => { - return Err(Error::BadVariableLabelCode { - offset, - code: has_variable_label, - }) - } - }; - - let mut missing = Vec::new(); - if missing_value_code != 0 { - match (width, missing_value_code) { - (0, -3 | -2 | 1 | 2 | 3) => (), - (0, _) => { - return Err(Error::BadNumericMissingValueCode { - offset, - code: missing_value_code, - }) - } - (_, 0..=3) => (), - (_, _) => { - return Err(Error::BadStringMissingValueCode { - offset, - code: missing_value_code, - }) - } - } - - for _ in 0..missing_value_code.abs() { - missing.push(read_bytes(r)?); - } - } - - Ok(Variable { - offset, - width, - name, - print_format, - write_format, - missing_value_code, - missing, - label, - }) -} - -pub struct ValueLabel { - /// Offset from the start of the file to the start of the record. - pub offset: u64, - - /// The labels. - pub labels: Vec<([u8; 8], Vec)>, -} - -impl ValueLabel { - /// Maximum number of value labels in a record. - pub const MAX: u32 = u32::MAX / 8; -} - -fn read_value_label_record( - r: &mut BufReader, - e: Endian, -) -> Result { - let offset = r.stream_position()?; - let n: u32 = e.parse(read_bytes(r)?); - if n > ValueLabel::MAX { - return Err(Error::BadNumberOfValueLabels { - offset, - n, - max: ValueLabel::MAX, - }); - } - - let mut labels = Vec::new(); - for _ in 0..n { - let value: [u8; 8] = read_bytes(r)?; - let label_len: u8 = e.parse(read_bytes(r)?); - let label_len = label_len as usize; - let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); - - let mut label = read_vec(r, padded_len)?; - label.truncate(label_len); - labels.push((value, label)); - } - Ok(ValueLabel { offset, labels }) -} - -pub struct VarIndexes { - /// Offset from the start of the file to the start of the record. - pub offset: u64, - - /// The 0-based indexes of the variable indexes. - pub var_indexes: Vec, -} - -impl VarIndexes { - /// Maximum number of variable indexes in a record. - pub const MAX: u32 = u32::MAX / 8; -} - -fn read_var_indexes_record( - r: &mut BufReader, - e: Endian, -) -> Result { - let offset = r.stream_position()?; - let n: u32 = e.parse(read_bytes(r)?); - if n > VarIndexes::MAX { - return Err(Error::BadNumberOfVarIndexes { - offset, - n, - max: VarIndexes::MAX, - }); - } - let mut var_indexes = Vec::with_capacity(n as usize); - for _ in 0..n { - var_indexes.push(e.parse(read_bytes(r)?)); - } - - Ok(VarIndexes { - offset, - var_indexes, - }) -} - -pub const DOC_LINE_LEN: u32 = 80; -pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN; - -pub struct Document { - /// Offset from the start of the file to the start of the record. - pub pos: u64, - - /// The document, as an array of 80-byte lines. - pub lines: Vec<[u8; DOC_LINE_LEN as usize]>, -} - -fn read_document_record( - r: &mut BufReader, - e: Endian, -) -> Result { - let offset = r.stream_position()?; - let n: u32 = e.parse(read_bytes(r)?); - match n { - 0..=DOC_MAX_LINES => { - let pos = r.stream_position()?; - let mut lines = Vec::with_capacity(n as usize); - for _ in 0..n { - let line: [u8; 80] = read_bytes(r)?; - lines.push(line); - } - Ok(Document { pos, lines }) - } - _ => Err(Error::BadDocumentLength { - offset, - n, - max: DOC_MAX_LINES, - }), - } -} - -#[derive(FromPrimitive)] -enum ExtensionType { - /// Machine integer info. - Integer = 3, - /// Machine floating-point info. - Float = 4, - /// Variable sets. - VarSets = 5, - /// DATE. - Date = 6, - /// Multiple response sets. - Mrsets = 7, - /// SPSS Data Entry. - DataEntry = 8, - /// Extra product info text. - ProductInfo = 10, - /// Variable display parameters. - Display = 11, - /// Long variable names. - LongNames = 13, - /// Long strings. - LongStrings = 14, - /// Extended number of cases. - Ncases = 16, - /// Data file attributes. - FileAttrs = 17, - /// Variable attributes. - VarAttrs = 18, - /// Multiple response sets (extended). - Mrsets2 = 19, - /// Character encoding. - Encoding = 20, - /// Value labels for long strings. - LongLabels = 21, - /// Missing values for long strings. - LongMissing = 22, - /// "Format properties in dataview table". - Dataview = 24, -} - -pub struct Extension { - /// Offset from the start of the file to the start of the record. - pub offset: u64, - - /// Record subtype. - pub subtype: u32, - - /// Size of each data element. - pub size: u32, - - /// Number of data elements. - pub count: u32, - - /// `size * count` bytes of data. - pub data: Vec, -} - -fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) { - match extension { - /* Implemented record types. */ - ExtensionType::Integer => (4, 8), - ExtensionType::Float => (8, 3), - ExtensionType::VarSets => (1, 0), - ExtensionType::Mrsets => (1, 0), - ExtensionType::ProductInfo => (1, 0), - ExtensionType::Display => (4, 0), - ExtensionType::LongNames => (1, 0), - ExtensionType::LongStrings => (1, 0), - ExtensionType::Ncases => (8, 2), - ExtensionType::FileAttrs => (1, 0), - ExtensionType::VarAttrs => (1, 0), - ExtensionType::Mrsets2 => (1, 0), - ExtensionType::Encoding => (1, 0), - ExtensionType::LongLabels => (1, 0), - ExtensionType::LongMissing => (1, 0), - - /* Ignored record types. */ - ExtensionType::Date => (0, 0), - ExtensionType::DataEntry => (0, 0), - ExtensionType::Dataview => (0, 0), - } -} - -fn read_extension_record( - r: &mut BufReader, - e: Endian, -) -> Result { - let subtype = e.parse(read_bytes(r)?); - let offset = r.stream_position()?; - let size: u32 = e.parse(read_bytes(r)?); - let count = e.parse(read_bytes(r)?); - let Some(product) = size.checked_mul(count) else { - return Err(Error::ExtensionRecordTooLarge { - offset, - subtype, - size, - count, - }); - }; - let offset = r.stream_position()?; - let data = read_vec(r, product as usize)?; - Ok(Extension { - offset, - subtype, - size, - count, - data, - }) -} - -struct ZHeader { - /// File offset to the start of the record. - offset: u64, - - /// File offset to the ZLIB data header. - zheader_offset: u64, - - /// File offset to the ZLIB trailer. - ztrailer_offset: u64, - - /// Length of the ZLIB trailer in bytes. - ztrailer_len: u64, -} - -fn read_zheader(r: &mut BufReader, e: Endian) -> Result { - let offset = r.stream_position()?; - let zheader_offset: u64 = e.parse(read_bytes(r)?); - let ztrailer_offset: u64 = e.parse(read_bytes(r)?); - let ztrailer_len: u64 = e.parse(read_bytes(r)?); - - Ok(ZHeader { - offset, - zheader_offset, - ztrailer_offset, - ztrailer_len, - }) -} - -fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { - let mut buf = [0; N]; - r.read_exact(&mut buf)?; - Ok(buf) -} - -fn read_vec(r: &mut BufReader, n: usize) -> Result, IoError> { - let mut vec = vec![0; n]; - r.read_exact(&mut vec)?; - Ok(vec) -} - -/* -fn trim_end(mut s: Vec, c: u8) -> Vec { - while s.last() == Some(&c) { - s.pop(); - } - s -} - -fn skip_bytes(r: &mut R, mut n: u64) -> Result<(), IoError> { - let mut buf = [0; 1024]; - while n > 0 { - let chunk = u64::min(n, buf.len() as u64); - r.read_exact(&mut buf[0..chunk as usize])?; - n -= chunk; - } - Ok(()) -} - -*/ +pub mod format; +pub mod identifier; +pub mod raw; +pub mod sack; +pub mod locale_charset;