From: Ben Pfaff Date: Sun, 23 Jul 2023 22:25:27 +0000 (-0700) Subject: more work on library X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2d8fda335c37b783807d4ddc6f7474cc8c87cdf3;p=pspp more work on library --- diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 5d6bf5c208..401a6ecb94 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,6 +1,7 @@ #![allow(unused_variables)] use endian::{Endian, Parse}; -use std::io::{BufReader, Error as IoError, Read}; +use num::Integer; +use std::io::{BufReader, Error as IoError, Read, Seek}; use thiserror::Error; pub mod endian; @@ -21,12 +22,33 @@ pub enum Error { #[error("Invalid ZSAV compression code {0}")] InvalidZsavCompression(u32), + + #[error("Misplaced type 4 record.")] + MisplacedType4Record, + + #[error("Number of document lines ({n}) must be greater than 0 and less than {max}.")] + BadDocumentLength { n: u32, max: u32 }, + + #[error("Unrecognized record type {0}.")] + BadRecordType(u32), + + #[error("Variable label indicator ({0}) is not 0 or 1.")] + BadVariableLabelIndicator(u32), + + #[error("Numeric missing value indicator ({0}) is not -3, -2, 0, 1, 2, or 3.")] + BadNumericMissingValueIndicator(i32), + + #[error("String missing value indicator ({0}) is not 0, 1, 2, or 3.")] + BadStringMissingValueIndicator(i32), } #[derive(Error, Debug)] pub enum Warning { #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")] UnexpectedBias(f64), + + #[error("Duplicate type 6 (document) record.")] + DuplicateDocumentRecord, } #[derive(Copy, Clone, Debug)] @@ -37,10 +59,18 @@ pub enum Compression { pub struct Reader { r: BufReader, + + document_record: Option, } +/// Magic number for a regular system file. pub const ASCII_MAGIC: &[u8; 4] = b"$FL2"; + +/// Magic number for a system file that contains zlib-compressed data. pub const ASCII_ZMAGIC: &[u8; 4] = b"$FL3"; + +/// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded in +/// EBCDIC. 
pub const EBCDIC_MAGIC: &[u8; 4] = &[0x5b, 0xc6, 0xd3, 0xf2]; pub struct FileHeader { @@ -54,6 +84,9 @@ pub struct FileHeader { /// True if `magic` indicates that this file contained EBCDIC data. pub is_ebcdic: bool, + /// Endianness of the data in the file header. + pub endianness: Endian, + /// 0-based variable index of the weight variable, or `None` if the file is /// unweighted. pub weight_index: Option, @@ -76,65 +109,213 @@ pub struct FileHeader { pub file_label: [u8; 64], } -impl Reader { +pub const DOC_LINE_LEN: u32 = 80; +pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN; + +impl Reader { pub fn new(r: R, warn: impl Fn(Warning)) -> Result, Error> { let mut r = BufReader::new(r); - let magic: [u8; 4] = read_bytes(&mut r)?; - let (is_zsav, is_ebcdic) = match &magic { - ASCII_MAGIC => (false, false), - ASCII_ZMAGIC => (true, false), - EBCDIC_MAGIC => (false, true), - _ => return Err(Error::NotASystemFile), - }; - - let eye_catcher: [u8; 60] = read_bytes(&mut r)?; - let layout_code: [u8; 4] = read_bytes(&mut r)?; - let endianness = Endian::identify_u32(2, layout_code) - .or_else(|| Endian::identify_u32(2, layout_code)) - .ok_or_else(|| Error::NotASystemFile)?; - - let nominal_case_size: u32 = endianness.parse(read_bytes(&mut r)?); - let nominal_case_size = (nominal_case_size <= u32::MAX / 32).then_some(nominal_case_size); - - let compression_code: u32 = endianness.parse(read_bytes(&mut r)?); - let compression = match (is_zsav, compression_code) { - (false, 0) => None, - (false, 1) => Some(Compression::Simple), - (true, 2) => Some(Compression::ZLib), - (false, code) => return Err(Error::InvalidSavCompression(code)), - (true, code) => return Err(Error::InvalidZsavCompression(code)), - }; - - let weight_index: u32 = endianness.parse(read_bytes(&mut r)?); - let weight_index = (weight_index > 0).then_some(weight_index - 1); - - let n_cases: u32 = endianness.parse(read_bytes(&mut r)?); - let n_cases = (n_cases <= u32::MAX / 4).then_some(n_cases); - - let 
bias: f64 = endianness.parse(read_bytes(&mut r)?); - if bias != 100.0 { - warn(Warning::UnexpectedBias(bias)) + let header = read_header(&mut r, &warn)?; + let e = header.endianness; + let mut document_record = None; + let mut variables = Vec::new(); + loop { + let rec_type: u32 = e.parse(read_bytes(&mut r)?); + match rec_type { + 2 => variables.push(read_variable_record(&mut r, e)?), + /* + 3 => d.read_value_label_record()?, + */ + // A type 4 record always immediately follows a type 3 record, so + // the code for type 3 records reads the type 4 record too. + 4 => return Err(Error::MisplacedType4Record), + + 6 => { + let d = read_document_record(&mut r, e)?; + if document_record.is_some() { + warn(Warning::DuplicateDocumentRecord); + } else { + document_record = d; + } + } + /* + 7 => d.read_extension_record()?, + */ + 999 => break, + _ => return Err(Error::BadRecordType(rec_type)), + } + } + + Ok(Reader { r, document_record }) + } +} + +fn read_header(r: &mut R, warn: impl Fn(Warning)) -> Result { + let magic: [u8; 4] = read_bytes(r)?; + let (is_zsav, is_ebcdic) = match &magic { + ASCII_MAGIC => (false, false), + ASCII_ZMAGIC => (true, false), + EBCDIC_MAGIC => (false, true), + _ => return Err(Error::NotASystemFile), + }; + + let eye_catcher: [u8; 60] = read_bytes(r)?; + let layout_code: [u8; 4] = read_bytes(r)?; + let endianness = Endian::identify_u32(2, layout_code) + .or_else(|| Endian::identify_u32(2, layout_code)) + .ok_or_else(|| Error::NotASystemFile)?; + + let nominal_case_size: u32 = endianness.parse(read_bytes(r)?); + let nominal_case_size = + (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size); + + let compression_code: u32 = endianness.parse(read_bytes(r)?); + let compression = match (is_zsav, compression_code) { + (false, 0) => None, + (false, 1) => Some(Compression::Simple), + (true, 2) => Some(Compression::ZLib), + (false, code) => return Err(Error::InvalidSavCompression(code)), + (true, code) => return 
Err(Error::InvalidZsavCompression(code)), + }; + + let weight_index: u32 = endianness.parse(read_bytes(r)?); + let weight_index = (weight_index > 0).then_some(weight_index - 1); + + let n_cases: u32 = endianness.parse(read_bytes(r)?); + let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); + + let bias: f64 = endianness.parse(read_bytes(r)?); + if bias != 100.0 { + warn(Warning::UnexpectedBias(bias)) + } + + let creation_date: [u8; 9] = read_bytes(r)?; + let creation_time: [u8; 8] = read_bytes(r)?; + let file_label: [u8; 64] = read_bytes(r)?; + let _: [u8; 3] = read_bytes(r)?; + + Ok(FileHeader { + magic, + is_zsav, + is_ebcdic, + endianness, + weight_index, + nominal_case_size, + creation_date, + creation_time, + eye_catcher, + file_label, + }) +} + +pub struct VariableRecord { + /// Offset from the start of the file to the start of the record. + pub pos: u64, + + /// Variable width, in the range -1..=255. + pub width: i32, + + /// Variable name, padded on the right with spaces. + pub name: [u8; 8], + + /// Print format. + pub print_format: u32, + + /// Write format. + pub write_format: u32, + + /// Missing value code, one of -3, -2, 0, 1, 2, or 3. + pub missing_value_code: i32, + + /// Raw missing values, up to 3 of them. + pub missing: Vec<[u8; 8]>, + + /// Optional variable label. 
+ pub label: Option>, +} + +fn read_variable_record( + r: &mut BufReader, + e: Endian, +) -> Result { + let pos = r.stream_position()?; + let width: i32 = e.parse(read_bytes(r)?); + let has_variable_label: u32 = e.parse(read_bytes(r)?); + let missing_value_code: i32 = e.parse(read_bytes(r)?); + let print_format: u32 = e.parse(read_bytes(r)?); + let write_format: u32 = e.parse(read_bytes(r)?); + let name: [u8; 8] = read_bytes(r)?; + + let label = match has_variable_label { + 0 => None, + 1 => { + let len: u32 = e.parse(read_bytes(r)?); + let read_len = len.min(65535) as usize; + let label = Some(read_vec(r, read_len)?); + + let padding_bytes = Integer::next_multiple_of(&len, &4) - len; + let _ = read_vec(r, padding_bytes as usize)?; + + label + } + _ => return Err(Error::BadVariableLabelIndicator(has_variable_label)), + }; + + let mut missing = Vec::new(); + if missing_value_code != 0 { + match (width, missing_value_code) { + (0, -3 | -2 | 1 | 2 | 3) => (), + (0, _) => return Err(Error::BadNumericMissingValueIndicator(missing_value_code)), + (_, 0..=3) => (), + (_, _) => return Err(Error::BadStringMissingValueIndicator(missing_value_code)), } - let creation_date: [u8; 9] = read_bytes(&mut r)?; - let creation_time: [u8; 8] = read_bytes(&mut r)?; - let file_label: [u8; 64] = read_bytes(&mut r)?; - let _: [u8; 3] = read_bytes(&mut r)?; - - let header = FileHeader { - magic, - is_zsav, - is_ebcdic, - weight_index, - nominal_case_size, - creation_date, - creation_time, - eye_catcher, - file_label, - }; - - Ok(Reader { r }) + for _ in 0..missing_value_code.abs() { + missing.push(read_bytes(r)?); + } + } + + Ok(VariableRecord { + pos, + width, + name, + print_format, + write_format, + missing_value_code, + missing, + label, + }) +} + +pub struct DocumentRecord { + /// Offset from the start of the file to the start of the record. + pub pos: u64, + + /// The document, as an array of 80-byte lines. 
+ pub lines: Vec<[u8; DOC_LINE_LEN as usize]>, +} + +fn read_document_record( + r: &mut BufReader, + e: Endian, +) -> Result, Error> { + let n: u32 = e.parse(read_bytes(r)?); + if n == 0 { + Ok(None) + } else if n > DOC_MAX_LINES { + Err(Error::BadDocumentLength { + n, + max: DOC_MAX_LINES, + }) + } else { + let pos = r.stream_position()?; + let mut lines = Vec::with_capacity(n as usize); + for i in 0..n { + let line: [u8; 80] = read_bytes(r)?; + lines.push(line); + } + Ok(Some(DocumentRecord { pos, lines })) } } @@ -144,6 +325,12 @@ fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { Ok(buf) } +fn read_vec(r: &mut BufReader, n: usize) -> Result, IoError> { + let mut vec = vec![0; n]; + r.read_exact(&mut vec)?; + Ok(vec) +} + /* fn trim_end(mut s: Vec, c: u8) -> Vec { while s.last() == Some(&c) {