X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Fraw.rs;h=ac8b3a057027cc6649a39c99d294014102da7cec;hb=a8331d2f67af24ce1f9f5da99641b8d1cdc21300;hp=3f7309c7ce67cb0584a83c2efb1f415ec1100674;hpb=92bba7a2dd4dff9030989f4459902f47d504752a;p=pspp diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 3f7309c7ce..ac8b3a0570 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,29 +1,153 @@ use crate::endian::{Endian, Parse, ToBytes}; -use crate::Error; +use encoding_rs::mem::decode_latin1; use flate2::read::ZlibDecoder; use num::Integer; +use std::borrow::Cow; +use std::fmt::{Debug, Formatter, Result as FmtResult}; +use std::str::from_utf8; use std::{ collections::VecDeque, io::{Error as IoError, Read, Seek, SeekFrom}, iter::FusedIterator, }; +use thiserror::Error as ThisError; use self::state::State; -#[derive(Copy, Clone, Debug)] -pub enum Compression { - Simple, - ZLib, +#[derive(ThisError, Debug)] +pub enum Error { + #[error("Not an SPSS system file")] + NotASystemFile, + + #[error("Invalid magic number {0:?}")] + BadMagic([u8; 4]), + + #[error("I/O error ({0})")] + Io(#[from] IoError), + + #[error("Invalid SAV compression code {0}")] + InvalidSavCompression(u32), + + #[error("Invalid ZSAV compression code {0}")] + InvalidZsavCompression(u32), + + #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] + BadVariableWidth { offset: u64, width: i32 }, + + #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] + BadDocumentLength { offset: u64, n: usize, max: usize }, + + #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] + BadRecordType { offset: u64, rec_type: u32 }, + + #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")] + BadVariableLabelCode { offset: u64, code: u32 }, + + #[error( + "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." + )] + BadNumericMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] + BadStringMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] + BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")] + ExpectedVarIndexRecord { offset: u64, rec_type: u32 }, + + #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] + BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] + ExtensionRecordTooLarge { + offset: u64, + subtype: u32, + size: u32, + count: u32, + }, + + #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] + EofInCase { + offset: u64, + case_ofs: u64, + case_len: usize, + }, + + #[error( + "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." + )] + EofInCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] + PartialCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] + CompressedNumberExpected { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] + CompressedStringExpected { offset: u64, case_ofs: u64 }, + + #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] + BadZlibTrailerNBlocks { + offset: u64, + n_blocks: u32, + expected_n_blocks: u64, + ztrailer_len: u64, + }, + + #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] + BadRecordSize { + offset: u64, + record: String, + size: u32, + expected_size: u32, + }, + + #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] + BadRecordCount { + offset: u64, + record: String, + count: u32, + expected_count: u32, + }, + + #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")] + BadLongMissingValueLength { + record_offset: u64, + offset: u64, + value_len: u32, + }, + + #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] + BadEncodingName { offset: u64 }, + + #[error("Details TBD")] + TBD, } +#[derive(Clone, Debug)] pub enum Record { - Header(Header), - Document(Document), - Variable(Variable), - ValueLabel(ValueLabel), - VarIndexes(VarIndexes), - Extension(Extension), + Header(HeaderRecord), + Variable(VariableRecord), + ValueLabel(ValueLabelRecord), + Document(DocumentRecord), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VariableSets(TextRecord), + VarDisplay(VarDisplayRecord), + MultipleResponse(MultipleResponseRecord), + LongStringValueLabels(LongStringValueLabelRecord), + Encoding(EncodingRecord), + NumberOfCases(NumberOfCasesRecord), + ProductInfo(TextRecord), + LongNames(TextRecord), + VeryLongStrings(TextRecord), + FileAttributes(TextRecord), + VariableAttributes(TextRecord), + OtherExtension(Extension), EndOfHeaders(u32), ZHeader(ZHeader), ZTrailer(ZTrailer), @@ -34,11 +158,10 @@ impl Record { fn read(reader: &mut R, endian: Endian) -> Result { let rec_type: u32 = endian.parse(read_bytes(reader)?); match rec_type { - 2 => Ok(Record::Variable(Variable::read(reader, endian)?)), - 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)), - 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)), - 6 => Ok(Record::Document(Document::read(reader, endian)?)), - 7 => Ok(Record::Extension(Extension::read(reader, endian)?)), + 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)), + 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)), + 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)), + 7 => Ok(Extension::read(reader, endian)?), 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))), _ => Err(Error::BadRecordType { offset: reader.stream_position()?, @@ -48,13 +171,26 @@ impl Record { } } -pub struct Header { +// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it +// decoded as Latin-1 (actually bytes interpreted as Unicode code points). +fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> { + from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) +} + +#[derive(Copy, Clone, Debug)] +pub enum Compression { + Simple, + ZLib, +} + +#[derive(Clone)] +pub struct HeaderRecord { /// Magic number. pub magic: Magic, /// Eye-catcher string, product name, in the file's encoding. Padded /// on the right with spaces. - pub eye_catcher: [u8; 60], + pub eye_catcher: UnencodedStr<60>, /// Layout code, normally either 2 or 3. pub layout_code: u32, @@ -66,7 +202,7 @@ pub struct Header { /// Compression type, if any, pub compression: Option, - /// 0-based variable index of the weight variable, or `None` if the file is + /// 1-based variable index of the weight variable, or `None` if the file is /// unweighted. pub weight_index: Option, @@ -77,24 +213,48 @@ pub struct Header { pub bias: f64, /// `dd mmm yy` in the file's encoding. - pub creation_date: [u8; 9], + pub creation_date: UnencodedStr<9>, /// `HH:MM:SS` in the file's encoding. - pub creation_time: [u8; 8], + pub creation_time: UnencodedStr<8>, /// File label, in the file's encoding. Padded on the right with spaces. - pub file_label: [u8; 64], + pub file_label: UnencodedStr<64>, /// Endianness of the data in the file header. pub endian: Endian, } -impl Header { - fn read(r: &mut R) -> Result { +impl HeaderRecord { + fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult { + writeln!(f, "{name:>17}: {:?}", value) + } +} + +impl Debug for HeaderRecord { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!(f, "File header record:")?; + self.debug_field(f, "Magic", self.magic)?; + self.debug_field(f, "Product name", &self.eye_catcher)?; + self.debug_field(f, "Layout code", self.layout_code)?; + self.debug_field(f, "Nominal case size", self.nominal_case_size)?; + self.debug_field(f, "Compression", self.compression)?; + self.debug_field(f, "Weight index", self.weight_index)?; + self.debug_field(f, "Number of cases", self.n_cases)?; + self.debug_field(f, "Compression bias", self.bias)?; + self.debug_field(f, "Creation date", &self.creation_date)?; + self.debug_field(f, "Creation time", &self.creation_time)?; + self.debug_field(f, "File label", &self.file_label)?; + self.debug_field(f, "Endianness", self.endian) + } +} + +impl HeaderRecord { + fn read(r: &mut R) -> Result { let magic: [u8; 4] = read_bytes(r)?; let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; - let eye_catcher: [u8; 60] = read_bytes(r)?; + let eye_catcher = UnencodedStr::<60>(read_bytes(r)?); let layout_code: [u8; 4] = read_bytes(r)?; let endian = Endian::identify_u32(2, layout_code) .or_else(|| Endian::identify_u32(2, layout_code)) @@ -115,19 +275,19 @@ impl Header { }; let weight_index: u32 = endian.parse(read_bytes(r)?); - let weight_index = (weight_index > 0).then_some(weight_index - 1); + let weight_index = (weight_index > 0).then_some(weight_index); let n_cases: u32 = endian.parse(read_bytes(r)?); let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); let bias: f64 = endian.parse(read_bytes(r)?); - let creation_date: [u8; 9] = read_bytes(r)?; - let creation_time: [u8; 8] = read_bytes(r)?; - let file_label: [u8; 64] = read_bytes(r)?; + let creation_date = UnencodedStr::<9>(read_bytes(r)?); + let creation_time = UnencodedStr::<8>(read_bytes(r)?); + let file_label = UnencodedStr::<64>(read_bytes(r)?); let _: [u8; 3] = read_bytes(r)?; - Ok(Header { + Ok(HeaderRecord { magic, layout_code, nominal_case_size, @@ -159,6 +319,18 @@ impl Magic { pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]); } +impl Debug for Magic { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let s = match self { + &Magic::SAV => "$FL2", + &Magic::ZSAV => "$FL3", + &Magic::EBCDIC => "($FL2 in EBCDIC)", + _ => return write!(f, "{:?}", self.0), + }; + write!(f, "{s}") + } +} + impl TryFrom<[u8; 4]> for Magic { type Error = Error; @@ -171,16 +343,16 @@ impl TryFrom<[u8; 4]> for Magic { } } -#[derive(Copy, Clone, PartialEq, Eq, Hash)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum VarType { - Number, + Numeric, String, } impl VarType { fn from_width(width: i32) -> VarType { match width { - 0 => VarType::Number, + 0 => VarType::Numeric, _ => VarType::String, } } @@ -188,8 +360,8 @@ impl VarType { mod state { use super::{ - Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer, - ZlibDecodeMultiple, + Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader, + ZTrailer, ZlibDecodeMultiple, }; use crate::endian::Endian; use std::{ @@ -220,7 +392,7 @@ mod state { impl State for Start { fn read(mut self: Box) -> Result)>, Error> { - let header = Header::read(&mut self.reader)?; + let header = HeaderRecord::read(&mut self.reader)?; let next_state = Headers(CommonState { reader: self.reader, endian: header.endian, @@ -238,7 +410,7 @@ mod state { fn read(mut self: Box) -> Result)>, Error> { let record = Record::read(&mut self.0.reader, self.0.endian)?; match record { - Record::Variable(Variable { width, .. }) => { + Record::Variable(VariableRecord { width, .. }) => { self.0.var_types.push(VarType::from_width(width)); } Record::EndOfHeaders(_) => { @@ -332,15 +504,33 @@ mod state { #[derive(Copy, Clone)] pub enum Value { Number(Option), - String([u8; 8]), + String(UnencodedStr<8>), +} + +impl Debug for Value { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match self { + Value::Number(Some(number)) => write!(f, "{number:?}"), + Value::Number(None) => write!(f, "SYSMIS"), + Value::String(bytes) => write!(f, "{:?}", bytes), + } + } } impl Value { - pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value { + fn read(r: &mut R, var_type: VarType, endian: Endian) -> Result { + Ok(Self::from_raw( + UntypedValue(read_bytes(r)?), + var_type, + endian, + )) + } + + pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value { match var_type { - VarType::String => Value::String(raw), - VarType::Number => { - let number: f64 = endian.parse(raw); + VarType::String => Value::String(UnencodedStr(raw.0)), + VarType::Numeric => { + let number: f64 = endian.parse(raw.0); Value::Number((number != -f64::MAX).then_some(number)) } } @@ -366,7 +556,7 @@ impl Value { }); } }; - values.push(Value::from_raw(var_type, raw, endian)); + values.push(Value::from_raw(UntypedValue(raw), var_type, endian)); } Ok(Some(values)) } @@ -400,9 +590,9 @@ impl Value { match code { 0 => (), 1..=251 => match var_type { - VarType::Number => break Value::Number(Some(code as f64 - bias)), + VarType::Numeric => break Value::Number(Some(code as f64 - bias)), VarType::String => { - break Value::String(endian.to_bytes(code as f64 - bias)) + break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias))) } }, 252 => { @@ -416,10 +606,12 @@ impl Value { }); } } - 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian), + 253 => { + break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian) + } 254 => match var_type { - VarType::String => break Value::String(*b" "), // XXX EBCDIC - VarType::Number => { + VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC + VarType::Numeric => { return Err(Error::CompressedStringExpected { offset: case_start, case_ofs: reader.stream_position()? - case_start, @@ -427,7 +619,7 @@ impl Value { } }, 255 => match var_type { - VarType::Number => break Value::Number(None), + VarType::Numeric => break Value::Number(None), VarType::String => { return Err(Error::CompressedNumberExpected { offset: case_start, @@ -497,6 +689,16 @@ impl Reader { state: Some(state::new(reader)), }) } + pub fn collect_headers(&mut self) -> Result, Error> { + let mut headers = Vec::new(); + for record in self { + match record? { + Record::EndOfHeaders(_) => break, + r => headers.push(r), + }; + } + Ok(headers) + } } impl Iterator for Reader { @@ -516,7 +718,134 @@ impl Iterator for Reader { impl FusedIterator for Reader {} -pub struct Variable { +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Spec(pub u32); + +impl Debug for Spec { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let type_ = format_name(self.0 >> 16); + let w = (self.0 >> 8) & 0xff; + let d = self.0 & 0xff; + write!(f, "{:06x} ({type_}{w}.{d})", self.0) + } +} + +fn format_name(type_: u32) -> Cow<'static, str> { + match type_ { + 1 => "A", + 2 => "AHEX", + 3 => "COMMA", + 4 => "DOLLAR", + 5 => "F", + 6 => "IB", + 7 => "PIBHEX", + 8 => "P", + 9 => "PIB", + 10 => "PK", + 11 => "RB", + 12 => "RBHEX", + 15 => "Z", + 16 => "N", + 17 => "E", + 20 => "DATE", + 21 => "TIME", + 22 => "DATETIME", + 23 => "ADATE", + 24 => "JDATE", + 25 => "DTIME", + 26 => "WKDAY", + 27 => "MONTH", + 28 => "MOYR", + 29 => "QYR", + 30 => "WKYR", + 31 => "PCT", + 32 => "DOT", + 33 => "CCA", + 34 => "CCB", + 35 => "CCC", + 36 => "CCD", + 37 => "CCE", + 38 => "EDATE", + 39 => "SDATE", + 40 => "MTIME", + 41 => "YMDHMS", + _ => return format!("").into(), + } + .into() +} + +#[derive(Clone)] +pub struct MissingValues { + /// Individual missing values, up to 3 of them. + pub values: Vec, + + /// Optional range of missing values. + pub range: Option<(Value, Value)>, +} + +impl Debug for MissingValues { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + for (i, value) in self.values.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{value:?}")?; + } + + if let Some((low, high)) = self.range { + if !self.values.is_empty() { + write!(f, ", ")?; + } + write!(f, "{low:?} THRU {high:?}")?; + } + + if self.is_empty() { + write!(f, "none")?; + } + + Ok(()) + } +} + +impl MissingValues { + fn is_empty(&self) -> bool { + self.values.is_empty() && self.range.is_none() + } + + fn read( + r: &mut R, + offset: u64, + width: i32, + code: i32, + endian: Endian, + ) -> Result { + let (n_values, has_range) = match (width, code) { + (_, 0..=3) => (code, false), + (0, -2) => (0, true), + (0, -3) => (1, true), + (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }), + (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }), + }; + + let var_type = VarType::from_width(width); + + let mut values = Vec::new(); + for _ in 0..n_values { + values.push(Value::read(r, var_type, endian)?); + } + let range = if has_range { + let low = Value::read(r, var_type, endian)?; + let high = Value::read(r, var_type, endian)?; + Some((low, high)) + } else { + None + }; + Ok(MissingValues { values, range }) + } +} + +#[derive(Clone)] +pub struct VariableRecord { /// Offset from the start of the file to the start of the record. pub offset: u64, @@ -524,45 +853,64 @@ pub struct Variable { pub width: i32, /// Variable name, padded on the right with spaces. - pub name: [u8; 8], + pub name: UnencodedStr<8>, /// Print format. - pub print_format: u32, + pub print_format: Spec, /// Write format. - pub write_format: u32, + pub write_format: Spec, - /// Missing value code, one of -3, -2, 0, 1, 2, or 3. - pub missing_value_code: i32, - - /// Raw missing values, up to 3 of them. - pub missing: Vec<[u8; 8]>, + /// Missing values. + pub missing_values: MissingValues, /// Optional variable label. - pub label: Option>, + pub label: Option, +} + +impl Debug for VariableRecord { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!( + f, + "Width: {} ({})", + self.width, + if self.width > 0 { + "string" + } else if self.width == 0 { + "numeric" + } else { + "long string continuation record" + } + )?; + writeln!(f, "Print format: {:?}", self.print_format)?; + writeln!(f, "Write format: {:?}", self.write_format)?; + writeln!(f, "Name: {:?}", &self.name)?; + writeln!(f, "Variable label: {:?}", self.label)?; + writeln!(f, "Missing values: {:?}", self.missing_values) + } } -impl Variable { - fn read(r: &mut R, endian: Endian) -> Result { +impl VariableRecord { + fn read(r: &mut R, endian: Endian) -> Result { let offset = r.stream_position()?; let width: i32 = endian.parse(read_bytes(r)?); let has_variable_label: u32 = endian.parse(read_bytes(r)?); let missing_value_code: i32 = endian.parse(read_bytes(r)?); - let print_format: u32 = endian.parse(read_bytes(r)?); - let write_format: u32 = endian.parse(read_bytes(r)?); - let name: [u8; 8] = read_bytes(r)?; + let print_format = Spec(endian.parse(read_bytes(r)?)); + let write_format = Spec(endian.parse(read_bytes(r)?)); + let name = UnencodedStr::<8>(read_bytes(r)?); let label = match has_variable_label { 0 => None, 1 => { let len: u32 = endian.parse(read_bytes(r)?); let read_len = len.min(65535) as usize; - let label = Some(read_vec(r, read_len)?); + let label = UnencodedString(read_vec(r, read_len)?); let padding_bytes = Integer::next_multiple_of(&len, &4) - len; let _ = read_vec(r, padding_bytes as usize)?; - label + Some(label) } _ => { return Err(Error::BadVariableLabelCode { @@ -572,251 +920,281 @@ impl Variable { } }; - let mut missing = Vec::new(); - if missing_value_code != 0 { - match (width, missing_value_code) { - (0, -3 | -2 | 1 | 2 | 3) => (), - (0, _) => { - return Err(Error::BadNumericMissingValueCode { - offset, - code: missing_value_code, - }) - } - (_, 0..=3) => (), - (_, _) => { - return Err(Error::BadStringMissingValueCode { - offset, - code: missing_value_code, - }) - } - } - - for _ in 0..missing_value_code.abs() { - missing.push(read_bytes(r)?); - } - } + let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?; - Ok(Variable { + Ok(VariableRecord { offset, width, name, print_format, write_format, - missing_value_code, - missing, + missing_values, label, }) } } -pub struct ValueLabel { - /// Offset from the start of the file to the start of the record. - pub offset: u64, +#[derive(Copy, Clone)] +pub struct UntypedValue(pub [u8; 8]); + +impl Debug for UntypedValue { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let little: f64 = Endian::Little.parse(self.0); + let little = format!("{:?}", little); + let big: f64 = Endian::Big.parse(self.0); + let big = format!("{:?}", big); + let number = if little.len() <= big.len() { + little + } else { + big + }; + write!(f, "{number}")?; + + let string = default_decode(&self.0); + let string = string + .split(|c: char| c == '\0' || c.is_control()) + .next() + .unwrap(); + write!(f, "{string:?}")?; + Ok(()) + } +} + +#[derive(Clone)] +pub struct UnencodedString(pub Vec); + +impl From> for UnencodedString { + fn from(source: Vec) -> Self { + Self(source) + } +} + +impl From<&[u8]> for UnencodedString { + fn from(source: &[u8]) -> Self { + Self(source.into()) + } +} + +impl Debug for UnencodedString { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", default_decode(self.0.as_slice())) + } +} + +#[derive(Copy, Clone)] +pub struct UnencodedStr(pub [u8; N]); + +impl From<[u8; N]> for UnencodedStr { + fn from(source: [u8; N]) -> Self { + Self(source) + } +} + +impl Debug for UnencodedStr { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", default_decode(&self.0)) + } +} + +#[derive(Clone)] +pub struct ValueLabelRecord { + /// Offset from the start of the file to the start of the value label + /// record. + pub label_offset: u64, /// The labels. - pub labels: Vec<([u8; 8], Vec)>, + pub labels: Vec<(UntypedValue, UnencodedString)>, + + /// Offset from the start of the file to the start of the variable index + /// record. + pub index_offset: u64, + + /// The 1-based indexes of the variable indexes. + pub dict_indexes: Vec, } -impl ValueLabel { +impl Debug for ValueLabelRecord { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!(f, "labels: ")?; + for (value, label) in self.labels.iter() { + writeln!(f, "{value:?}: {label:?}")?; + } + write!(f, "apply to variables")?; + for dict_index in self.dict_indexes.iter() { + write!(f, " #{dict_index}")?; + } + Ok(()) + } +} + +impl ValueLabelRecord { /// Maximum number of value labels in a record. - pub const MAX: u32 = u32::MAX / 8; + pub const MAX_LABELS: u32 = u32::MAX / 8; - fn read(r: &mut R, endian: Endian) -> Result { - let offset = r.stream_position()?; + /// Maximum number of variable indexes in a record. + pub const MAX_INDEXES: u32 = u32::MAX / 8; + + fn read(r: &mut R, endian: Endian) -> Result { + let label_offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); - if n > ValueLabel::MAX { + if n > Self::MAX_LABELS { return Err(Error::BadNumberOfValueLabels { - offset, + offset: label_offset, n, - max: ValueLabel::MAX, + max: Self::MAX_LABELS, }); } let mut labels = Vec::new(); for _ in 0..n { - let value: [u8; 8] = read_bytes(r)?; + let value = UntypedValue(read_bytes(r)?); let label_len: u8 = endian.parse(read_bytes(r)?); let label_len = label_len as usize; let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); - let mut label = read_vec(r, padded_len)?; + let mut label = read_vec(r, padded_len - 1)?; label.truncate(label_len); - labels.push((value, label)); + labels.push((value, UnencodedString(label))); } - Ok(ValueLabel { offset, labels }) - } -} -pub struct VarIndexes { - /// Offset from the start of the file to the start of the record. - pub offset: u64, - - /// The 0-based indexes of the variable indexes. - pub var_indexes: Vec, -} - -impl VarIndexes { - /// Maximum number of variable indexes in a record. - pub const MAX: u32 = u32::MAX / 8; + let index_offset = r.stream_position()?; + let rec_type: u32 = endian.parse(read_bytes(r)?); + if rec_type != 4 { + return Err(Error::ExpectedVarIndexRecord { + offset: index_offset, + rec_type, + }); + } - fn read(r: &mut R, endian: Endian) -> Result { - let offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); - if n > VarIndexes::MAX { + if n > Self::MAX_INDEXES { return Err(Error::BadNumberOfVarIndexes { - offset, + offset: index_offset, n, - max: VarIndexes::MAX, + max: Self::MAX_INDEXES, }); } - let mut var_indexes = Vec::with_capacity(n as usize); + let mut dict_indexes = Vec::with_capacity(n as usize); for _ in 0..n { - var_indexes.push(endian.parse(read_bytes(r)?)); + dict_indexes.push(endian.parse(read_bytes(r)?)); } - Ok(VarIndexes { - offset, - var_indexes, + Ok(ValueLabelRecord { + label_offset, + labels, + index_offset, + dict_indexes, }) } } -pub struct Document { +#[derive(Clone, Debug)] +pub struct DocumentRecord { /// Offset from the start of the file to the start of the record. pub pos: u64, /// The document, as an array of 80-byte lines. - pub lines: Vec<[u8; Document::LINE_LEN as usize]>, + pub lines: Vec, } -impl Document { +pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>; + +impl DocumentRecord { /// Length of a line in a document. Document lines are fixed-length and /// padded on the right with spaces. - pub const LINE_LEN: u32 = 80; + pub const LINE_LEN: usize = 80; /// Maximum number of lines we will accept in a document. This is simply /// the maximum number that will fit in a 32-bit space. - pub const MAX_LINES: u32 = i32::MAX as u32 / Self::LINE_LEN; + pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN; - fn read(r: &mut R, endian: Endian) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { let offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); - match n { - 0..=Self::MAX_LINES => Ok(Document { - pos: r.stream_position()?, - lines: (0..n) - .map(|_| read_bytes(r)) - .collect::, _>>()?, - }), - _ => Err(Error::BadDocumentLength { + let n = n as usize; + if n > Self::MAX_LINES { + Err(Error::BadDocumentLength { offset, n, max: Self::MAX_LINES, - }), + }) + } else { + let pos = r.stream_position()?; + let mut lines = Vec::with_capacity(n); + for _ in 0..n { + lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?)); + } + Ok(DocumentRecord { pos, lines }) } } } -/* -#[derive(FromPrimitive)] -enum ExtensionType { - /// Machine integer info. - Integer = 3, - /// Machine floating-point info. - Float = 4, - /// Variable sets. - VarSets = 5, - /// DATE. - Date = 6, - /// Multiple response sets. - Mrsets = 7, - /// SPSS Data Entry. - DataEntry = 8, - /// Extra product info text. - ProductInfo = 10, - /// Variable display parameters. - Display = 11, - /// Long variable names. - LongNames = 13, - /// Long strings. - LongStrings = 14, - /// Extended number of cases. - Ncases = 16, - /// Data file attributes. - FileAttrs = 17, - /// Variable attributes. - VarAttrs = 18, - /// Multiple response sets (extended). - Mrsets2 = 19, - /// Character encoding. - Encoding = 20, - /// Value labels for long strings. - LongLabels = 21, - /// Missing values for long strings. - LongMissing = 22, - /// "Format properties in dataview table". - Dataview = 24, -} - */ - -trait ExtensionRecord where Self: Sized { +trait ExtensionRecord +where + Self: Sized, +{ + const SUBTYPE: u32; const SIZE: Option; const COUNT: Option; const NAME: &'static str; - fn parse(ext: &Extension, endian: Endian) -> Result; + fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result; } -pub struct IntegerInfo { - version: (i32, i32, i32), - machine_code: i32, - floating_point_rep: i32, - compression_code: i32, - endianness: i32, - character_code: i32, +#[derive(Clone, Debug)] +pub struct IntegerInfoRecord { + pub version: (i32, i32, i32), + pub machine_code: i32, + pub floating_point_rep: i32, + pub compression_code: i32, + pub endianness: i32, + pub character_code: i32, } -impl ExtensionRecord for IntegerInfo { +impl ExtensionRecord for IntegerInfoRecord { + const SUBTYPE: u32 = 3; const SIZE: Option = Some(4); const COUNT: Option = Some(8); const NAME: &'static str = "integer record"; - fn parse(ext: &Extension, endian: Endian) -> Result{ + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; let data: Vec = (0..8) .map(|_| endian.parse(read_bytes(&mut input).unwrap())) .collect(); - Ok(IntegerInfo { + Ok(IntegerInfoRecord { version: (data[0], data[1], data[2]), machine_code: data[3], floating_point_rep: data[4], compression_code: data[5], endianness: data[6], - character_code: data[7] + character_code: data[7], }) } } -pub struct FloatInfo { - sysmis: f64, - highest: f64, - lowest: f64, +#[derive(Clone, Debug)] +pub struct FloatInfoRecord { + pub sysmis: f64, + pub highest: f64, + pub lowest: f64, } -impl ExtensionRecord for FloatInfo { +impl ExtensionRecord for FloatInfoRecord { + const SUBTYPE: u32 = 4; const SIZE: Option = Some(8); const COUNT: Option = Some(3); const NAME: &'static str = "floating point record"; - fn parse(ext: &Extension, endian: Endian) -> Result{ + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { ext.check_size::()?; let mut input = &ext.data[..]; let data: Vec = (0..3) .map(|_| endian.parse(read_bytes(&mut input).unwrap())) .collect(); - Ok(FloatInfo { + Ok(FloatInfoRecord { sysmis: data[0], highest: data[1], lowest: data[2], @@ -824,6 +1202,297 @@ impl ExtensionRecord for FloatInfo { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CategoryLabels { + VarLabels, + CountedValues, +} + +#[derive(Clone, Debug)] +pub enum MultipleResponseType { + MultipleDichotomy { + value: UnencodedString, + labels: CategoryLabels, + }, + MultipleCategory, +} + +impl MultipleResponseType { + fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> { + let (mr_type, input) = match input.get(0) { + Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]), + Some(b'D') => { + let (value, input) = parse_counted_string(&input[1..])?; + ( + MultipleResponseType::MultipleDichotomy { + value: value.into(), + labels: CategoryLabels::VarLabels, + }, + input, + ) + } + Some(b'E') => { + let Some(b' ') = input.get(1) else { + return Err(Error::TBD); + }; + let input = &input[2..]; + let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") { + (CategoryLabels::CountedValues, rest) + } else if let Some(rest) = input.strip_prefix(b" 11 ") { + (CategoryLabels::VarLabels, rest) + } else { + return Err(Error::TBD); + }; + let (value, input) = parse_counted_string(input)?; + ( + MultipleResponseType::MultipleDichotomy { + value: value.into(), + labels, + }, + input, + ) + } + _ => return Err(Error::TBD), + }; + Ok((mr_type, input)) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet { + pub name: UnencodedString, + pub label: UnencodedString, + pub mr_type: MultipleResponseType, + pub short_names: Vec, +} + +impl MultipleResponseSet { + fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> { + let Some(equals) = input.iter().position(|&b| b == b'=') else { + return Err(Error::TBD); + }; + let (name, input) = input.split_at(equals); + let (mr_type, input) = MultipleResponseType::parse(input)?; + let Some(b' ') = input.get(0) else { + return Err(Error::TBD); + }; + let (label, mut input) = parse_counted_string(&input[1..])?; + let mut vars = Vec::new(); + while input.get(0) == Some(&b' ') { + input = &input[1..]; + let Some(length) = input.iter().position(|b| b" \n".contains(b)) else { + return Err(Error::TBD); + }; + if length > 0 { + vars.push(input[..length].into()); + } + input = &input[length..]; + } + if input.get(0) != Some(&b'\n') { + return Err(Error::TBD); + } + while input.get(0) == Some(&b'\n') { + input = &input[1..]; + } + Ok(( + MultipleResponseSet { + name: name.into(), + label: label.into(), + mr_type, + short_names: vars, + }, + input, + )) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseRecord(pub Vec); + +impl ExtensionRecord for MultipleResponseRecord { + const SUBTYPE: u32 = 7; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "multiple response set record"; + + fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut sets = Vec::new(); + while !input.is_empty() { + let (set, rest) = MultipleResponseSet::parse(input)?; + sets.push(set); + input = rest; + } + Ok(MultipleResponseRecord(sets)) + } +} + +fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> { + let Some(space) = input.iter().position(|&b| b == b' ') else { + return Err(Error::TBD); + }; + let Ok(length) = from_utf8(&input[..space]) else { + return Err(Error::TBD); + }; + let Ok(length): Result = length.parse() else { + return Err(Error::TBD); + }; + + let input = &input[space + 1..]; + if input.len() < length { + return Err(Error::TBD); + }; + + let (string, rest) = input.split_at(length); + Ok((string.into(), rest)) +} + +#[derive(Clone, Debug)] +pub struct VarDisplayRecord(pub Vec); + +impl ExtensionRecord for VarDisplayRecord { + const SUBTYPE: u32 = 11; + const SIZE: Option = Some(4); + const COUNT: Option = None; + const NAME: &'static str = "variable display record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let display = (0..ext.count) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(VarDisplayRecord(display)) + } +} + +pub struct LongStringMissingValues { + /// Variable name. + pub var_name: UnencodedString, + + /// Missing values. + pub missing_values: MissingValues, +} + +pub struct LongStringMissingValueSet(Vec); + +impl ExtensionRecord for LongStringMissingValueSet { + const SUBTYPE: u32 = 22; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string missing values record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut missing_value_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?); + let value_len: u32 = endian.parse(read_bytes(&mut input)?); + if value_len != 8 { + let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset; + return Err(Error::BadLongMissingValueLength { + record_offset: ext.offset, + offset, + value_len, + }); + } + let mut values = Vec::new(); + for i in 0..n_missing_values { + let value: [u8; 8] = read_bytes(&mut input)?; + let numeric_value: u64 = endian.parse(value); + let value = if i > 0 && numeric_value == 8 { + // Tolerate files written by old, buggy versions of PSPP + // where we believed that the value_length was repeated + // before each missing value. + read_bytes(&mut input)? + } else { + value + }; + values.push(Value::String(UnencodedStr(value))); + } + let missing_values = MissingValues { + values, + range: None, + }; + missing_value_set.push(LongStringMissingValues { + var_name, + missing_values, + }); + } + Ok(LongStringMissingValueSet(missing_value_set)) + } +} + +#[derive(Clone, Debug)] +pub struct EncodingRecord(pub String); + +impl ExtensionRecord for EncodingRecord { + const SUBTYPE: u32 = 20; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "encoding record"; + + fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + Ok(EncodingRecord( + String::from_utf8(ext.data.clone()) + .map_err(|_| Error::BadEncodingName { offset: ext.offset })?, + )) + } +} + +#[derive(Clone, Debug)] +pub struct NumberOfCasesRecord { + /// Always observed as 1. + pub one: u64, + + /// Number of cases. + pub n_cases: u64, +} + +impl ExtensionRecord for NumberOfCasesRecord { + const SUBTYPE: u32 = 16; + const SIZE: Option = Some(8); + const COUNT: Option = Some(2); + const NAME: &'static str = "extended number of cases record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let one = endian.parse(read_bytes(&mut input)?); + let n_cases = endian.parse(read_bytes(&mut input)?); + + Ok(NumberOfCasesRecord { one, n_cases }) + } +} + +#[derive(Clone, Debug)] +pub struct TextRecord { + /// Offset from the start of the file to the start of the record. + pub offset: u64, + + /// The text content of the record. + pub text: UnencodedString, +} + +impl From for TextRecord { + fn from(source: Extension) -> Self { + TextRecord { + offset: source.offset, + text: source.data.into(), + } + } +} + +#[derive(Clone, Debug)] pub struct Extension { /// Offset from the start of the file to the start of the record. pub offset: u64, @@ -841,34 +1510,6 @@ pub struct Extension { pub data: Vec, } -/* -fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) { - match extension { - /* Implemented record types. */ - ExtensionType::Integer => (4, 8), - ExtensionType::Float => (8, 3), - ExtensionType::VarSets => (1, 0), - ExtensionType::Mrsets => (1, 0), - ExtensionType::ProductInfo => (1, 0), - ExtensionType::Display => (4, 0), - ExtensionType::LongNames => (1, 0), - ExtensionType::LongStrings => (1, 0), - ExtensionType::Ncases => (8, 2), - ExtensionType::FileAttrs => (1, 0), - ExtensionType::VarAttrs => (1, 0), - ExtensionType::Mrsets2 => (1, 0), - ExtensionType::Encoding => (1, 0), - ExtensionType::LongLabels => (1, 0), - ExtensionType::LongMissing => (1, 0), - - /* Ignored record types. */ - ExtensionType::Date => (0, 0), - ExtensionType::DataEntry => (0, 0), - ExtensionType::Dataview => (0, 0), - } -} - */ - impl Extension { fn check_size(&self) -> Result<(), Error> { if let Some(expected_size) = E::SIZE { @@ -894,7 +1535,7 @@ impl Extension { Ok(()) } - fn read(r: &mut R, endian: Endian) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { let subtype = endian.parse(read_bytes(r)?); let offset = r.stream_position()?; let size: u32 = endian.parse(read_bytes(r)?); @@ -909,16 +1550,57 @@ impl Extension { }; let offset = r.stream_position()?; let data = read_vec(r, product as usize)?; - Ok(Extension { + let extension = Extension { offset, subtype, size, count, data, - }) + }; + match subtype { + IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse( + &extension, + endian, + |_| (), + )?)), + FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse( + &extension, + endian, + |_| (), + )?)), + VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse( + &extension, + endian, + |_| (), + )?)), + MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse( + MultipleResponseRecord::parse(&extension, endian, |_| ())?, + )), + LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels( + LongStringValueLabelRecord::parse(&extension, endian, |_| ())?, + )), + EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse( + &extension, + endian, + |_| (), + )?)), + NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse( + &extension, + endian, + |_| (), + )?)), + 5 => Ok(Record::VariableSets(extension.into())), + 10 => Ok(Record::ProductInfo(extension.into())), + 13 => Ok(Record::LongNames(extension.into())), + 14 => Ok(Record::VeryLongStrings(extension.into())), + 17 => Ok(Record::FileAttributes(extension.into())), + 18 => Ok(Record::VariableAttributes(extension.into())), + _ => Ok(Record::OtherExtension(extension)), + } } } +#[derive(Clone, Debug)] pub struct ZHeader { /// File offset to the start of the record. pub offset: u64, @@ -949,6 +1631,7 @@ impl ZHeader { } } +#[derive(Clone, Debug)] pub struct ZTrailer { /// File offset to the start of the record. pub offset: u64, @@ -967,6 +1650,7 @@ pub struct ZTrailer { pub blocks: Vec, } +#[derive(Clone, Debug)] pub struct ZBlock { /// Offset of block of data if simple compression were used. pub uncompressed_ofs: u64, @@ -1056,3 +1740,51 @@ fn read_vec(r: &mut R, n: usize) -> Result, IoError> { r.read_exact(&mut vec)?; Ok(vec) } + +fn read_string(r: &mut R, endian: Endian) -> Result { + let length: u32 = endian.parse(read_bytes(r)?); + Ok(read_vec(r, length as usize)?.into()) +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabels { + pub var_name: UnencodedString, + pub width: u32, + + /// `(value, label)` pairs, where each value is `width` bytes. + pub labels: Vec<(UnencodedString, UnencodedString)>, +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabelRecord(pub Vec); + +impl ExtensionRecord for LongStringValueLabelRecord { + const SUBTYPE: u32 = 21; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string value labels record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut label_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let width: u32 = endian.parse(read_bytes(&mut input)?); + let n_labels: u32 = endian.parse(read_bytes(&mut input)?); + let mut labels = Vec::new(); + for _ in 0..n_labels { + let value = read_string(&mut input, endian)?; + let label = read_string(&mut input, endian)?; + labels.push((value, label)); + } + label_set.push(LongStringValueLabels { + var_name, + width, + labels, + }) + } + Ok(LongStringValueLabelRecord(label_set)) + } +}