X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=rust%2Fsrc%2Fraw.rs;h=a8c7c858e9e0a766243226066023acd6bc1aead4;hb=6165ed413d9aa818e3246d0a063c646dc4efc7e5;hp=ca0596f5414aecdb378b38d1e5e8cd3824fe6a18;hpb=559e5e5035fd98393beac3ddfd70d08dc2134c23;p=pspp diff --git a/rust/src/raw.rs b/rust/src/raw.rs index ca0596f541..a8c7c858e9 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,8 +1,9 @@ use crate::endian::{Endian, Parse, ToBytes}; -use crate::Error; +use encoding_rs::mem::decode_latin1; use flate2::read::ZlibDecoder; use num::Integer; +use std::borrow::Cow; use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::str::from_utf8; use std::{ @@ -10,23 +11,141 @@ use std::{ io::{Error as IoError, Read, Seek, SeekFrom}, iter::FusedIterator, }; +use thiserror::Error as ThisError; use self::state::State; -#[derive(Copy, Clone, Debug)] -pub enum Compression { - Simple, - ZLib, +#[derive(ThisError, Debug)] +pub enum Error { + #[error("Not an SPSS system file")] + NotASystemFile, + + #[error("Invalid magic number {0:?}")] + BadMagic([u8; 4]), + + #[error("I/O error ({0})")] + Io(#[from] IoError), + + #[error("Invalid SAV compression code {0}")] + InvalidSavCompression(u32), + + #[error("Invalid ZSAV compression code {0}")] + InvalidZsavCompression(u32), + + #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] + BadVariableWidth { offset: u64, width: i32 }, + + #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] + BadDocumentLength { offset: u64, n: usize, max: usize }, + + #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] + BadRecordType { offset: u64, rec_type: u32 }, + + #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")] + BadVariableLabelCode { offset: u64, code: u32 }, + + #[error( + "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." + )] + BadNumericMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] + BadStringMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] + BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")] + BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] + ExtensionRecordTooLarge { + offset: u64, + subtype: u32, + size: u32, + count: u32, + }, + + #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] + EofInCase { + offset: u64, + case_ofs: u64, + case_len: usize, + }, + + #[error( + "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." + )] + EofInCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] + PartialCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] + CompressedNumberExpected { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] + CompressedStringExpected { offset: u64, case_ofs: u64 }, + + #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] + BadZlibTrailerNBlocks { + offset: u64, + n_blocks: u32, + expected_n_blocks: u64, + ztrailer_len: u64, + }, + + #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] + BadRecordSize { + offset: u64, + record: String, + size: u32, + expected_size: u32, + }, + + #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] + BadRecordCount { + offset: u64, + record: String, + count: u32, + expected_count: u32, + }, + + #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")] + BadLongMissingValueLength { + record_offset: u64, + offset: u64, + value_len: u32, + }, + + #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] + BadEncodingName { offset: u64 }, + + #[error("Details TBD")] + TBD, } #[derive(Clone, Debug)] pub enum Record { - Header(Header), - Document(Document), - Variable(Variable), - ValueLabel(ValueLabel), - VarIndexes(VarIndexes), - Extension(Extension), + Header(HeaderRecord), + Variable(VariableRecord), + ValueLabel(ValueLabelRecord), + VarIndexes(VarIndexRecord), + Document(DocumentRecord), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VariableSets(TextRecord), + VarDisplay(VarDisplayRecord), + MultipleResponse(MultipleResponseRecord), + LongStringValueLabels(LongStringValueLabelRecord), + Encoding(EncodingRecord), + NumberOfCases(NumberOfCasesRecord), + ProductInfo(TextRecord), + LongNames(TextRecord), + VeryLongStrings(TextRecord), + FileAttributes(TextRecord), + VariableAttributes(TextRecord), + OtherExtension(Extension), EndOfHeaders(u32), ZHeader(ZHeader), ZTrailer(ZTrailer), @@ -37,11 +156,11 @@ impl Record { fn read(reader: &mut R, endian: Endian) -> Result { let rec_type: u32 = endian.parse(read_bytes(reader)?); match rec_type { - 2 => Ok(Record::Variable(Variable::read(reader, endian)?)), - 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)), - 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)), - 6 => Ok(Record::Document(Document::read(reader, endian)?)), - 7 => Ok(Record::Extension(Extension::read(reader, endian)?)), + 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)), + 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)), + 4 => Ok(Record::VarIndexes(VarIndexRecord::read(reader, endian)?)), + 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)), + 7 => Ok(Extension::read(reader, endian)?), 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))), _ => Err(Error::BadRecordType { offset: reader.stream_position()?, @@ -51,34 +170,26 @@ impl Record { } } -pub struct FallbackEncoding<'a>(&'a [u8]); +// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it +// decoded as Latin-1 (actually bytes interpreted as Unicode code points). +fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> { + from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) +} -impl<'a> Debug for FallbackEncoding<'a> { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - if let Ok(s) = from_utf8(self.0) { - let s = s.trim_end(); - write!(f, "\"{s}\"") - } else { - let s: String = self - .0 - .iter() - .map(|c| char::from(*c).escape_default()) - .flatten() - .collect(); - let s = s.trim_end(); - write!(f, "\"{s}\"") - } - } +#[derive(Copy, Clone, Debug)] +pub enum Compression { + Simple, + ZLib, } #[derive(Clone)] -pub struct Header { +pub struct HeaderRecord { /// Magic number. pub magic: Magic, /// Eye-catcher string, product name, in the file's encoding. Padded /// on the right with spaces. - pub eye_catcher: [u8; 60], + pub eye_catcher: UnencodedStr<60>, /// Layout code, normally either 2 or 3. pub layout_code: u32, @@ -90,7 +201,7 @@ pub struct Header { /// Compression type, if any, pub compression: Option, - /// 0-based variable index of the weight variable, or `None` if the file is + /// 1-based variable index of the weight variable, or `None` if the file is /// unweighted. pub weight_index: Option, @@ -101,48 +212,48 @@ pub struct Header { pub bias: f64, /// `dd mmm yy` in the file's encoding. - pub creation_date: [u8; 9], + pub creation_date: UnencodedStr<9>, /// `HH:MM:SS` in the file's encoding. - pub creation_time: [u8; 8], + pub creation_time: UnencodedStr<8>, /// File label, in the file's encoding. Padded on the right with spaces. - pub file_label: [u8; 64], + pub file_label: UnencodedStr<64>, /// Endianness of the data in the file header. pub endian: Endian, } -impl Header { +impl HeaderRecord { fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult { writeln!(f, "{name:>17}: {:?}", value) } } -impl Debug for Header { +impl Debug for HeaderRecord { fn fmt(&self, f: &mut Formatter) -> FmtResult { writeln!(f, "File header record:")?; self.debug_field(f, "Magic", self.magic)?; - self.debug_field(f, "Product name", FallbackEncoding(&self.eye_catcher))?; + self.debug_field(f, "Product name", &self.eye_catcher)?; self.debug_field(f, "Layout code", self.layout_code)?; self.debug_field(f, "Nominal case size", self.nominal_case_size)?; self.debug_field(f, "Compression", self.compression)?; self.debug_field(f, "Weight index", self.weight_index)?; self.debug_field(f, "Number of cases", self.n_cases)?; self.debug_field(f, "Compression bias", self.bias)?; - self.debug_field(f, "Creation date", FallbackEncoding(&self.creation_date))?; - self.debug_field(f, "Creation time", FallbackEncoding(&self.creation_time))?; - self.debug_field(f, "File label", FallbackEncoding(&self.file_label))?; + self.debug_field(f, "Creation date", &self.creation_date)?; + self.debug_field(f, "Creation time", &self.creation_time)?; + self.debug_field(f, "File label", &self.file_label)?; self.debug_field(f, "Endianness", self.endian) } } -impl Header { - fn read(r: &mut R) -> Result { +impl HeaderRecord { + fn read(r: &mut R) -> Result { let magic: [u8; 4] = read_bytes(r)?; let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; - let eye_catcher: [u8; 60] = read_bytes(r)?; + let eye_catcher = UnencodedStr::<60>(read_bytes(r)?); let layout_code: [u8; 4] = read_bytes(r)?; let endian = Endian::identify_u32(2, layout_code) .or_else(|| Endian::identify_u32(2, layout_code)) @@ -163,19 +274,19 @@ impl Header { }; let weight_index: u32 = endian.parse(read_bytes(r)?); - let weight_index = (weight_index > 0).then(|| weight_index - 1); + let weight_index = (weight_index > 0).then_some(weight_index); let n_cases: u32 = endian.parse(read_bytes(r)?); let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); let bias: f64 = endian.parse(read_bytes(r)?); - let creation_date: [u8; 9] = read_bytes(r)?; - let creation_time: [u8; 8] = read_bytes(r)?; - let file_label: [u8; 64] = read_bytes(r)?; + let creation_date = UnencodedStr::<9>(read_bytes(r)?); + let creation_time = UnencodedStr::<8>(read_bytes(r)?); + let file_label = UnencodedStr::<64>(read_bytes(r)?); let _: [u8; 3] = read_bytes(r)?; - Ok(Header { + Ok(HeaderRecord { magic, layout_code, nominal_case_size, @@ -231,16 +342,16 @@ impl TryFrom<[u8; 4]> for Magic { } } -#[derive(Copy, Clone, PartialEq, Eq, Hash)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum VarType { - Number, + Numeric, String, } impl VarType { fn from_width(width: i32) -> VarType { match width { - 0 => VarType::Number, + 0 => VarType::Numeric, _ => VarType::String, } } @@ -248,8 +359,8 @@ impl VarType { mod state { use super::{ - Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer, - ZlibDecodeMultiple, + Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader, + ZTrailer, ZlibDecodeMultiple, }; use crate::endian::Endian; use std::{ @@ -280,7 +391,7 @@ mod state { impl State for Start { fn read(mut self: Box) -> Result)>, Error> { - let header = Header::read(&mut self.reader)?; + let header = HeaderRecord::read(&mut self.reader)?; let next_state = Headers(CommonState { reader: self.reader, endian: header.endian, @@ -298,7 +409,7 @@ mod state { fn read(mut self: Box) -> Result)>, Error> { let record = Record::read(&mut self.0.reader, self.0.endian)?; match record { - Record::Variable(Variable { width, .. }) => { + Record::Variable(VariableRecord { width, .. }) => { self.0.var_types.push(VarType::from_width(width)); } Record::EndOfHeaders(_) => { @@ -392,7 +503,7 @@ mod state { #[derive(Copy, Clone)] pub enum Value { Number(Option), - String([u8; 8]), + String(UnencodedStr<8>), } impl Debug for Value { @@ -400,21 +511,25 @@ impl Debug for Value { match self { Value::Number(Some(number)) => write!(f, "{number:?}"), Value::Number(None) => write!(f, "SYSMIS"), - Value::String(bytes) => write!(f, "{:?}", FallbackEncoding(bytes)), + Value::String(bytes) => write!(f, "{:?}", bytes), } } } impl Value { fn read(r: &mut R, var_type: VarType, endian: Endian) -> Result { - Ok(Self::from_raw(var_type, read_bytes(r)?, endian)) + Ok(Self::from_raw( + UntypedValue(read_bytes(r)?), + var_type, + endian, + )) } - pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value { + pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value { match var_type { - VarType::String => Value::String(raw), - VarType::Number => { - let number: f64 = endian.parse(raw); + VarType::String => Value::String(UnencodedStr(raw.0)), + VarType::Numeric => { + let number: f64 = endian.parse(raw.0); Value::Number((number != -f64::MAX).then_some(number)) } } @@ -440,7 +555,7 @@ impl Value { }); } }; - values.push(Value::from_raw(var_type, raw, endian)); + values.push(Value::from_raw(UntypedValue(raw), var_type, endian)); } Ok(Some(values)) } @@ -474,9 +589,9 @@ impl Value { match code { 0 => (), 1..=251 => match var_type { - VarType::Number => break Value::Number(Some(code as f64 - bias)), + VarType::Numeric => break Value::Number(Some(code as f64 - bias)), VarType::String => { - break Value::String(endian.to_bytes(code as f64 - bias)) + break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias))) } }, 252 => { @@ -490,10 +605,12 @@ impl Value { }); } } - 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian), + 253 => { + break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian) + } 254 => match var_type { - VarType::String => break Value::String(*b" "), // XXX EBCDIC - VarType::Number => { + VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC + VarType::Numeric => { return Err(Error::CompressedStringExpected { offset: case_start, case_ofs: reader.stream_position()? - case_start, @@ -501,7 +618,7 @@ impl Value { } }, 255 => match var_type { - VarType::Number => break Value::Number(None), + VarType::Numeric => break Value::Number(None), VarType::String => { return Err(Error::CompressedNumberExpected { offset: case_start, @@ -571,6 +688,16 @@ impl Reader { state: Some(state::new(reader)), }) } + pub fn collect_headers(&mut self) -> Result, Error> { + let mut headers = Vec::new(); + for record in self { + match record? { + Record::EndOfHeaders(_) => break, + r => headers.push(r), + }; + } + Ok(headers) + } } impl Iterator for Reader { @@ -591,9 +718,9 @@ impl Iterator for Reader { impl FusedIterator for Reader {} #[derive(Copy, Clone, PartialEq, Eq, Hash)] -pub struct Format(pub u32); +pub struct Spec(pub u32); -impl Debug for Format { +impl Debug for Spec { fn fmt(&self, f: &mut Formatter) -> FmtResult { let type_ = format_name(self.0 >> 16); let w = (self.0 >> 8) & 0xff; @@ -602,7 +729,7 @@ impl Debug for Format { } } -fn format_name(type_: u32) -> &'static str { +fn format_name(type_: u32) -> Cow<'static, str> { match type_ { 1 => "A", 2 => "AHEX", @@ -641,8 +768,9 @@ fn format_name(type_: u32) -> &'static str { 39 => "SDATE", 40 => "MTIME", 41 => "YMDHMS", - _ => "(unknown)", + _ => return format!("").into(), } + .into() } #[derive(Clone)] @@ -716,7 +844,7 @@ impl MissingValues { } #[derive(Clone)] -pub struct Variable { +pub struct VariableRecord { /// Offset from the start of the file to the start of the record. pub offset: u64, @@ -724,22 +852,22 @@ pub struct Variable { pub width: i32, /// Variable name, padded on the right with spaces. - pub name: [u8; 8], + pub name: UnencodedStr<8>, /// Print format. - pub print_format: u32, + pub print_format: Spec, /// Write format. - pub write_format: u32, + pub write_format: Spec, /// Missing values. pub missing_values: MissingValues, /// Optional variable label. - pub label: Option>, + pub label: Option, } -impl Debug for Variable { +impl Debug for VariableRecord { fn fmt(&self, f: &mut Formatter) -> FmtResult { writeln!( f, @@ -753,41 +881,35 @@ impl Debug for Variable { "long string continuation record" } )?; - writeln!(f, "Print format: {:?}", Format(self.print_format))?; - writeln!(f, "Write format: {:?}", Format(self.write_format))?; - writeln!(f, "Name: {:?}", FallbackEncoding(&self.name))?; - writeln!( - f, - "Variable label: {:?}", - self.label - .as_ref() - .map(|label| FallbackEncoding(&label[..])) - )?; + writeln!(f, "Print format: {:?}", self.print_format)?; + writeln!(f, "Write format: {:?}", self.write_format)?; + writeln!(f, "Name: {:?}", &self.name)?; + writeln!(f, "Variable label: {:?}", self.label)?; writeln!(f, "Missing values: {:?}", self.missing_values) } } -impl Variable { - fn read(r: &mut R, endian: Endian) -> Result { +impl VariableRecord { + fn read(r: &mut R, endian: Endian) -> Result { let offset = r.stream_position()?; let width: i32 = endian.parse(read_bytes(r)?); let has_variable_label: u32 = endian.parse(read_bytes(r)?); let missing_value_code: i32 = endian.parse(read_bytes(r)?); - let print_format: u32 = endian.parse(read_bytes(r)?); - let write_format: u32 = endian.parse(read_bytes(r)?); - let name: [u8; 8] = read_bytes(r)?; + let print_format = Spec(endian.parse(read_bytes(r)?)); + let write_format = Spec(endian.parse(read_bytes(r)?)); + let name = UnencodedStr::<8>(read_bytes(r)?); let label = match has_variable_label { 0 => None, 1 => { let len: u32 = endian.parse(read_bytes(r)?); let read_len = len.min(65535) as usize; - let label = Some(read_vec(r, read_len)?); + let label = UnencodedString(read_vec(r, read_len)?); let padding_bytes = Integer::next_multiple_of(&len, &4) - len; let _ = read_vec(r, padding_bytes as usize)?; - label + Some(label) } _ => { return Err(Error::BadVariableLabelCode { @@ -799,7 +921,7 @@ impl Variable { let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?; - Ok(Variable { + Ok(VariableRecord { offset, width, name, @@ -811,178 +933,215 @@ impl Variable { } } -#[derive(Clone, Debug)] -pub struct ValueLabel { +#[derive(Copy, Clone)] +pub struct UntypedValue(pub [u8; 8]); + +impl Debug for UntypedValue { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let little: f64 = Endian::Little.parse(self.0); + let little = format!("{:?}", little); + let big: f64 = Endian::Big.parse(self.0); + let big = format!("{:?}", big); + let number = if little.len() <= big.len() { + little + } else { + big + }; + write!(f, "{number}")?; + + let string = default_decode(&self.0); + let string = string + .split(|c: char| c == '\0' || c.is_control()) + .next() + .unwrap(); + write!(f, "{string:?}")?; + Ok(()) + } +} + +#[derive(Clone)] +pub struct UnencodedString(pub Vec); + +impl From> for UnencodedString { + fn from(source: Vec) -> Self { + Self(source) + } +} + +impl From<&[u8]> for UnencodedString { + fn from(source: &[u8]) -> Self { + Self(source.into()) + } +} + +impl Debug for UnencodedString { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", default_decode(self.0.as_slice())) + } +} + +#[derive(Copy, Clone)] +pub struct UnencodedStr(pub [u8; N]); + +impl From<[u8; N]> for UnencodedStr { + fn from(source: [u8; N]) -> Self { + Self(source) + } +} + +impl Debug for UnencodedStr { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", default_decode(&self.0)) + } +} + +#[derive(Clone)] +pub struct ValueLabelRecord { /// Offset from the start of the file to the start of the record. pub offset: u64, /// The labels. - pub labels: Vec<([u8; 8], Vec)>, + pub labels: Vec<(UntypedValue, UnencodedString)>, } -impl ValueLabel { +impl Debug for ValueLabelRecord { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + for (value, label) in self.labels.iter() { + writeln!(f, "{value:?}: {label:?}")?; + } + Ok(()) + } +} + +impl ValueLabelRecord { /// Maximum number of value labels in a record. pub const MAX: u32 = u32::MAX / 8; - fn read(r: &mut R, endian: Endian) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { let offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); - if n > ValueLabel::MAX { + if n > ValueLabelRecord::MAX { return Err(Error::BadNumberOfValueLabels { offset, n, - max: ValueLabel::MAX, + max: ValueLabelRecord::MAX, }); } let mut labels = Vec::new(); for _ in 0..n { - let value: [u8; 8] = read_bytes(r)?; + let value = UntypedValue(read_bytes(r)?); let label_len: u8 = endian.parse(read_bytes(r)?); let label_len = label_len as usize; let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); - let mut label = read_vec(r, padded_len)?; + let mut label = read_vec(r, padded_len - 1)?; label.truncate(label_len); - labels.push((value, label)); + labels.push((value, UnencodedString(label))); } - Ok(ValueLabel { offset, labels }) + Ok(ValueLabelRecord { offset, labels }) } } -#[derive(Clone, Debug)] -pub struct VarIndexes { +#[derive(Clone)] +pub struct VarIndexRecord { /// Offset from the start of the file to the start of the record. pub offset: u64, - /// The 0-based indexes of the variable indexes. - pub var_indexes: Vec, + /// The 1-based indexes of the variable indexes. + pub dict_indexes: Vec, } -impl VarIndexes { +impl Debug for VarIndexRecord { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "apply to variables")?; + for dict_index in self.dict_indexes.iter() { + write!(f, " #{dict_index}")?; + } + Ok(()) + } +} + +impl VarIndexRecord { /// Maximum number of variable indexes in a record. pub const MAX: u32 = u32::MAX / 8; - fn read(r: &mut R, endian: Endian) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { let offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); - if n > VarIndexes::MAX { + if n > VarIndexRecord::MAX { return Err(Error::BadNumberOfVarIndexes { offset, n, - max: VarIndexes::MAX, + max: VarIndexRecord::MAX, }); } - let mut var_indexes = Vec::with_capacity(n as usize); + let mut dict_indexes = Vec::with_capacity(n as usize); for _ in 0..n { - var_indexes.push(endian.parse(read_bytes(r)?)); + dict_indexes.push(endian.parse(read_bytes(r)?)); } - Ok(VarIndexes { + Ok(VarIndexRecord { offset, - var_indexes, + dict_indexes, }) } } #[derive(Clone, Debug)] -pub struct Document { +pub struct DocumentRecord { /// Offset from the start of the file to the start of the record. pub pos: u64, /// The document, as an array of 80-byte lines. - pub lines: Vec<[u8; Document::LINE_LEN as usize]>, + pub lines: Vec, } -impl Document { +pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>; + +impl DocumentRecord { /// Length of a line in a document. Document lines are fixed-length and /// padded on the right with spaces. - pub const LINE_LEN: u32 = 80; + pub const LINE_LEN: usize = 80; /// Maximum number of lines we will accept in a document. This is simply /// the maximum number that will fit in a 32-bit space. - pub const MAX_LINES: u32 = i32::MAX as u32 / Self::LINE_LEN; + pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN; - fn read(r: &mut R, endian: Endian) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { let offset = r.stream_position()?; let n: u32 = endian.parse(read_bytes(r)?); - match n { - 0..=Self::MAX_LINES => Ok(Document { - pos: r.stream_position()?, - lines: (0..n) - .map(|_| read_bytes(r)) - .collect::, _>>()?, - }), - _ => Err(Error::BadDocumentLength { + let n = n as usize; + if n > Self::MAX_LINES { + Err(Error::BadDocumentLength { offset, n, max: Self::MAX_LINES, - }), + }) + } else { + let pos = r.stream_position()?; + let mut lines = Vec::with_capacity(n); + for _ in 0..n { + lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?)); + } + Ok(DocumentRecord { pos, lines }) } } } -/* -#[derive(FromPrimitive)] -enum ExtensionType { - /// Machine integer info. - Integer = 3, - /// Machine floating-point info. - Float = 4, - /// Variable sets. - VarSets = 5, - /// DATE. - Date = 6, - /// Multiple response sets. - Mrsets = 7, - /// SPSS Data Entry. - DataEntry = 8, - /// Extra product info text. - ProductInfo = 10, - /// Variable display parameters. - Display = 11, - /// Long variable names. - LongNames = 13, - /// Long strings. - LongStrings = 14, - /// Extended number of cases. - Ncases = 16, - /// Data file attributes. - FileAttrs = 17, - /// Variable attributes. - VarAttrs = 18, - /// Multiple response sets (extended). - Mrsets2 = 19, - /// Character encoding. - Encoding = 20, - /// Value labels for long strings. - LongLabels = 21, - /// Missing values for long strings. - LongMissing = 22, - /// "Format properties in dataview table". - Dataview = 24, -} - */ - -trait TextRecord -where - Self: Sized, -{ - const NAME: &'static str; - fn parse(input: &str, warn: impl Fn(Error)) -> Result; -} - trait ExtensionRecord where Self: Sized, { + const SUBTYPE: u32; const SIZE: Option; const COUNT: Option; const NAME: &'static str; fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result; } -pub struct IntegerInfo { +#[derive(Clone, Debug)] +pub struct IntegerInfoRecord { pub version: (i32, i32, i32), pub machine_code: i32, pub floating_point_rep: i32, @@ -991,7 +1150,8 @@ pub struct IntegerInfo { pub character_code: i32, } -impl ExtensionRecord for IntegerInfo { +impl ExtensionRecord for IntegerInfoRecord { + const SUBTYPE: u32 = 3; const SIZE: Option = Some(4); const COUNT: Option = Some(8); const NAME: &'static str = "integer record"; @@ -1003,7 +1163,7 @@ impl ExtensionRecord for IntegerInfo { let data: Vec = (0..8) .map(|_| endian.parse(read_bytes(&mut input).unwrap())) .collect(); - Ok(IntegerInfo { + Ok(IntegerInfoRecord { version: (data[0], data[1], data[2]), machine_code: data[3], floating_point_rep: data[4], @@ -1014,13 +1174,15 @@ impl ExtensionRecord for IntegerInfo { } } -pub struct FloatInfo { +#[derive(Clone, Debug)] +pub struct FloatInfoRecord { pub sysmis: f64, pub highest: f64, pub lowest: f64, } -impl ExtensionRecord for FloatInfo { +impl ExtensionRecord for FloatInfoRecord { + const SUBTYPE: u32 = 4; const SIZE: Option = Some(8); const COUNT: Option = Some(3); const NAME: &'static str = "floating point record"; @@ -1032,7 +1194,7 @@ impl ExtensionRecord for FloatInfo { let data: Vec = (0..3) .map(|_| endian.parse(read_bytes(&mut input).unwrap())) .collect(); - Ok(FloatInfo { + Ok(FloatInfoRecord { sysmis: data[0], highest: data[1], lowest: data[2], @@ -1040,30 +1202,23 @@ impl ExtensionRecord for FloatInfo { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum CategoryLabels { VarLabels, CountedValues, } + +#[derive(Clone, Debug)] pub enum MultipleResponseType { MultipleDichotomy { - value: Vec, + value: UnencodedString, labels: CategoryLabels, }, MultipleCategory, } -pub struct MultipleResponseSet { - pub name: Vec, - pub label: Vec, - pub mr_type: MultipleResponseType, - pub vars: Vec>, -} -impl MultipleResponseSet { - fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> { - let Some(equals) = input.iter().position(|&b| b == b'=') else { - return Err(Error::TBD); - }; - let (name, input) = input.split_at(equals); +impl MultipleResponseType { + fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> { let (mr_type, input) = match input.get(0) { Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]), Some(b'D') => { @@ -1099,6 +1254,25 @@ impl MultipleResponseSet { } _ => return Err(Error::TBD), }; + Ok((mr_type, input)) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet { + pub name: UnencodedString, + pub label: UnencodedString, + pub mr_type: MultipleResponseType, + pub short_names: Vec, +} + +impl MultipleResponseSet { + fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> { + let Some(equals) = input.iter().position(|&b| b == b'=') else { + return Err(Error::TBD); + }; + let (name, input) = input.split_at(equals); + let (mr_type, input) = MultipleResponseType::parse(input)?; let Some(b' ') = input.get(0) else { return Err(Error::TBD); }; @@ -1125,16 +1299,18 @@ impl MultipleResponseSet { name: name.into(), label: label.into(), mr_type, - vars, + short_names: vars, }, input, )) } } -pub struct MultipleResponseSets(Vec); +#[derive(Clone, Debug)] +pub struct MultipleResponseRecord(pub Vec); -impl ExtensionRecord for MultipleResponseSets { +impl ExtensionRecord for MultipleResponseRecord { + const SUBTYPE: u32 = 7; const SIZE: Option = Some(1); const COUNT: Option = None; const NAME: &'static str = "multiple response set record"; @@ -1149,11 +1325,11 @@ impl ExtensionRecord for MultipleResponseSets { sets.push(set); input = rest; } - Ok(MultipleResponseSets(sets)) + Ok(MultipleResponseRecord(sets)) } } -fn parse_counted_string(input: &[u8]) -> Result<(&[u8], &[u8]), Error> { +fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> { let Some(space) = input.iter().position(|&b| b == b' ') else { return Err(Error::TBD); }; @@ -1170,21 +1346,14 @@ fn parse_counted_string(input: &[u8]) -> Result<(&[u8], &[u8]), Error> { }; let (string, rest) = input.split_at(length); - Ok((string, rest)) -} - -pub struct ExtraProductInfo(String); - -impl TextRecord for ExtraProductInfo { - const NAME: &'static str = "extra product info"; - fn parse(input: &str, _warn: impl Fn(Error)) -> Result { - Ok(ExtraProductInfo(input.into())) - } + Ok((string.into(), rest)) } -pub struct VarDisplayRecord(Vec); +#[derive(Clone, Debug)] +pub struct VarDisplayRecord(pub Vec); impl ExtensionRecord for VarDisplayRecord { + const SUBTYPE: u32 = 11; const SIZE: Option = Some(4); const COUNT: Option = None; const NAME: &'static str = "variable display record"; @@ -1200,153 +1369,18 @@ impl ExtensionRecord for VarDisplayRecord { } } -pub struct VariableSet { - pub name: String, - pub vars: Vec, -} - -impl VariableSet { - fn parse(input: &str) -> Result { - let (name, input) = input.split_once('=').ok_or(Error::TBD)?; - let vars = input.split_ascii_whitespace().map(String::from).collect(); - Ok(VariableSet { - name: name.into(), - vars, - }) - } -} - -pub struct VariableSetRecord(Vec); - -impl TextRecord for VariableSetRecord { - const NAME: &'static str = "variable set"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let mut sets = Vec::new(); - for line in input.lines() { - match VariableSet::parse(line) { - Ok(set) => sets.push(set), - Err(error) => warn(error), - } - } - Ok(VariableSetRecord(sets)) - } -} - -pub struct LongVariableName { - pub short_name: String, - pub long_name: String, -} - -pub struct LongVariableNameRecord(Vec); - -impl TextRecord for LongVariableNameRecord { - const NAME: &'static str = "long variable names"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let mut names = Vec::new(); - for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some((short_name, long_name)) = pair.split_once('=') { - let name = LongVariableName { - short_name: short_name.into(), - long_name: long_name.into(), - }; - names.push(name); - } else { - warn(Error::TBD) - } - } - Ok(LongVariableNameRecord(names)) - } -} - -pub struct VeryLongString { - pub short_name: String, - pub length: usize, -} - -impl VeryLongString { - fn parse(input: &str) -> Result { - let Some((short_name, length)) = input.split_once('=') else { - return Err(Error::TBD); - }; - let length: usize = length.parse().map_err(|_| Error::TBD)?; - Ok(VeryLongString { - short_name: short_name.into(), - length, - }) - } -} - -pub struct VeryLongStringRecord(Vec); - -impl TextRecord for VeryLongStringRecord { - const NAME: &'static str = "very long strings"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let mut very_long_strings = Vec::new(); - for tuple in input - .split('\0') - .map(|s| s.trim_end_matches('\t')) - .filter(|s| !s.is_empty()) - { - match VeryLongString::parse(tuple) { - Ok(vls) => very_long_strings.push(vls), - Err(error) => warn(error), - } - } - Ok(VeryLongStringRecord(very_long_strings)) - } -} - -pub struct LongStringValueLabels { - pub var_name: Vec, - pub width: u32, - - /// `(value, label)` pairs, where each value is `width` bytes. - pub labels: Vec<(Vec, Vec)>, -} - -pub struct LongStringValueLabelSet(Vec); - -impl ExtensionRecord for LongStringValueLabelSet { - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "long string value labels record"; - - fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let mut label_set = Vec::new(); - while !input.is_empty() { - let var_name = read_string(&mut input, endian)?; - let width: u32 = endian.parse(read_bytes(&mut input)?); - let n_labels: u32 = endian.parse(read_bytes(&mut input)?); - let mut labels = Vec::new(); - for _ in 0..n_labels { - let value = read_string(&mut input, endian)?; - let label = read_string(&mut input, endian)?; - labels.push((value, label)); - } - label_set.push(LongStringValueLabels { - var_name, - width, - labels, - }) - } - Ok(LongStringValueLabelSet(label_set)) - } -} - pub struct LongStringMissingValues { /// Variable name. - pub var_name: Vec, + pub var_name: UnencodedString, - /// Up to three missing values. - pub missing_values: Vec<[u8; 8]>, + /// Missing values. + pub missing_values: MissingValues, } pub struct LongStringMissingValueSet(Vec); impl ExtensionRecord for LongStringMissingValueSet { + const SUBTYPE: u32 = 22; const SIZE: Option = Some(1); const COUNT: Option = None; const NAME: &'static str = "long string missing values record"; @@ -1368,7 +1402,7 @@ impl ExtensionRecord for LongStringMissingValueSet { value_len, }); } - let mut missing_values = Vec::new(); + let mut values = Vec::new(); for i in 0..n_missing_values { let value: [u8; 8] = read_bytes(&mut input)?; let numeric_value: u64 = endian.parse(value); @@ -1380,8 +1414,12 @@ impl ExtensionRecord for LongStringMissingValueSet { } else { value }; - missing_values.push(value); + values.push(Value::String(UnencodedStr(value))); } + let missing_values = MissingValues { + values, + range: None, + }; missing_value_set.push(LongStringMissingValues { var_name, missing_values, @@ -1391,9 +1429,11 @@ impl ExtensionRecord for LongStringMissingValueSet { } } -pub struct Encoding(pub String); +#[derive(Clone, Debug)] +pub struct EncodingRecord(pub String); -impl ExtensionRecord for Encoding { +impl ExtensionRecord for EncodingRecord { + const SUBTYPE: u32 = 20; const SIZE: Option = Some(1); const COUNT: Option = None; const NAME: &'static str = "encoding record"; @@ -1401,133 +1441,14 @@ impl ExtensionRecord for Encoding { fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result { ext.check_size::()?; - Ok(Encoding(String::from_utf8(ext.data.clone()).map_err( - |_| Error::BadEncodingName { offset: ext.offset }, - )?)) - } -} - -pub struct Attribute { - pub name: String, - pub values: Vec, -} - -impl Attribute { - fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> { - let Some((name, mut input)) = input.split_once('(') else { - return Err(Error::TBD); - }; - let mut values = Vec::new(); - loop { - let Some((value, rest)) = input.split_once('\n') else { - return Err(Error::TBD); - }; - if let Some(stripped) = value - .strip_prefix('\'') - .and_then(|value| value.strip_suffix('\'')) - { - values.push(stripped.into()); - } else { - warn(Error::TBD); - values.push(value.into()); - } - if let Some(rest) = rest.strip_prefix(')') { - return Ok(( - Attribute { - name: name.into(), - values, - }, - rest, - )); - } - input = rest; - } - } -} - -pub struct AttributeSet(pub Vec); - -impl AttributeSet { - fn parse<'a>( - mut input: &'a str, - sentinel: Option, - warn: &impl Fn(Error), - ) -> Result<(AttributeSet, &'a str), Error> { - let mut attributes = Vec::new(); - let rest = loop { - match input.chars().next() { - None => break input, - c if c == sentinel => break &input[1..], - _ => { - let (attribute, rest) = Attribute::parse(input, &warn)?; - attributes.push(attribute); - input = rest; - } - } - }; - Ok((AttributeSet(attributes), rest)) - } -} - -pub struct FileAttributeRecord(AttributeSet); - -impl TextRecord for FileAttributeRecord { - const NAME: &'static str = "data file attributes"; - fn parse(input: &str, warn: impl Fn(Error)) -> Result { - let (set, rest) = AttributeSet::parse(input, None, &warn)?; - if !rest.is_empty() { - warn(Error::TBD); - } - Ok(FileAttributeRecord(set)) - } -} - -pub struct VarAttributeSet { - pub long_var_name: String, - pub attributes: AttributeSet, -} - -impl VarAttributeSet { - fn parse<'a>( - input: &'a str, - warn: &impl Fn(Error), - ) -> Result<(VarAttributeSet, &'a str), Error> { - let Some((long_var_name, rest)) = input.split_once(':') else { - return Err(Error::TBD); - }; - let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?; - Ok(( - VarAttributeSet { - long_var_name: long_var_name.into(), - attributes, - }, - rest, + Ok(EncodingRecord( + String::from_utf8(ext.data.clone()) + .map_err(|_| Error::BadEncodingName { offset: ext.offset })?, )) } } -pub struct VariableAttributeRecord(Vec); - -impl TextRecord for VariableAttributeRecord { - const NAME: &'static str = "variable attributes"; - fn parse(mut input: &str, warn: impl Fn(Error)) -> Result { - let mut var_attribute_sets = Vec::new(); - while !input.is_empty() { - match VarAttributeSet::parse(input, &warn) { - Ok((var_attribute, rest)) => { - var_attribute_sets.push(var_attribute); - input = rest; - } - Err(error) => { - warn(error); - break; - } - } - } - Ok(VariableAttributeRecord(var_attribute_sets)) - } -} - +#[derive(Clone, Debug)] pub struct NumberOfCasesRecord { /// Always observed as 1. pub one: u64, @@ -1537,6 +1458,7 @@ pub struct NumberOfCasesRecord { } impl ExtensionRecord for NumberOfCasesRecord { + const SUBTYPE: u32 = 16; const SIZE: Option = Some(8); const COUNT: Option = Some(2); const NAME: &'static str = "extended number of cases record"; @@ -1552,6 +1474,24 @@ impl ExtensionRecord for NumberOfCasesRecord { } } +#[derive(Clone, Debug)] +pub struct TextRecord { + /// Offset from the start of the file to the start of the record. + pub offset: u64, + + /// The text content of the record. + pub text: UnencodedString, +} + +impl From for TextRecord { + fn from(source: Extension) -> Self { + TextRecord { + offset: source.offset, + text: source.data.into(), + } + } +} + #[derive(Clone, Debug)] pub struct Extension { /// Offset from the start of the file to the start of the record. @@ -1570,34 +1510,6 @@ pub struct Extension { pub data: Vec, } -/* -fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) { - match extension { - /* Implemented record types. */ - ExtensionType::Integer => (4, 8), - ExtensionType::Float => (8, 3), - ExtensionType::VarSets => (1, 0), - ExtensionType::Mrsets => (1, 0), - ExtensionType::ProductInfo => (1, 0), - ExtensionType::Display => (4, 0), - ExtensionType::LongNames => (1, 0), - ExtensionType::LongStrings => (1, 0), - ExtensionType::Ncases => (8, 2), - ExtensionType::FileAttrs => (1, 0), - ExtensionType::VarAttrs => (1, 0), - ExtensionType::Mrsets2 => (1, 0), - ExtensionType::Encoding => (1, 0), - ExtensionType::LongLabels => (1, 0), - ExtensionType::LongMissing => (1, 0), - - /* Ignored record types. */ - ExtensionType::Date => (0, 0), - ExtensionType::DataEntry => (0, 0), - ExtensionType::Dataview => (0, 0), - } -} - */ - impl Extension { fn check_size(&self) -> Result<(), Error> { if let Some(expected_size) = E::SIZE { @@ -1623,7 +1535,7 @@ impl Extension { Ok(()) } - fn read(r: &mut R, endian: Endian) -> Result { + fn read(r: &mut R, endian: Endian) -> Result { let subtype = endian.parse(read_bytes(r)?); let offset = r.stream_position()?; let size: u32 = endian.parse(read_bytes(r)?); @@ -1638,13 +1550,53 @@ impl Extension { }; let offset = r.stream_position()?; let data = read_vec(r, product as usize)?; - Ok(Extension { + let extension = Extension { offset, subtype, size, count, data, - }) + }; + match subtype { + IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse( + &extension, + endian, + |_| (), + )?)), + FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse( + &extension, + endian, + |_| (), + )?)), + VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse( + &extension, + endian, + |_| (), + )?)), + MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse( + MultipleResponseRecord::parse(&extension, endian, |_| ())?, + )), + LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels( + LongStringValueLabelRecord::parse(&extension, endian, |_| ())?, + )), + EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse( + &extension, + endian, + |_| (), + )?)), + NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse( + &extension, + endian, + |_| (), + )?)), + 5 => Ok(Record::VariableSets(extension.into())), + 10 => Ok(Record::ProductInfo(extension.into())), + 13 => Ok(Record::LongNames(extension.into())), + 14 => Ok(Record::VeryLongStrings(extension.into())), + 17 => Ok(Record::FileAttributes(extension.into())), + 18 => Ok(Record::VariableAttributes(extension.into())), + _ => Ok(Record::OtherExtension(extension)), + } } } @@ -1789,7 +1741,50 @@ fn read_vec(r: &mut R, n: usize) -> Result, IoError> { Ok(vec) } -fn read_string(r: &mut R, endian: Endian) -> Result, IoError> { +fn read_string(r: &mut R, endian: Endian) -> Result { let length: u32 = endian.parse(read_bytes(r)?); - read_vec(r, length as usize) + Ok(read_vec(r, length as usize)?.into()) +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabels { + pub var_name: UnencodedString, + pub width: u32, + + /// `(value, label)` pairs, where each value is `width` bytes. + pub labels: Vec<(UnencodedString, UnencodedString)>, +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabelRecord(pub Vec); + +impl ExtensionRecord for LongStringValueLabelRecord { + const SUBTYPE: u32 = 21; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string value labels record"; + + fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut label_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let width: u32 = endian.parse(read_bytes(&mut input)?); + let n_labels: u32 = endian.parse(read_bytes(&mut input)?); + let mut labels = Vec::new(); + for _ in 0..n_labels { + let value = read_string(&mut input, endian)?; + let label = read_string(&mut input, endian)?; + labels.push((value, label)); + } + label_set.push(LongStringValueLabels { + var_name, + width, + labels, + }) + } + Ok(LongStringValueLabelRecord(label_set)) + } }