From ab8a1e2bad84f97efd821d813cb463cbbf8fb852 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 12 Jul 2025 17:17:53 -0700 Subject: [PATCH] more cleanup --- rust/pspp/src/dictionary.rs | 57 +- rust/pspp/src/format/mod.rs | 8 +- rust/pspp/src/output/pivot/mod.rs | 3 +- rust/pspp/src/sys/cooked.rs | 31 +- rust/pspp/src/sys/raw.rs | 2058 +---------------------------- rust/pspp/src/sys/raw/records.rs | 1952 +++++++++++++++++++++++++++ 6 files changed, 2072 insertions(+), 2037 deletions(-) create mode 100644 rust/pspp/src/sys/raw/records.rs diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index c555b30abf..e5e8f1df07 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -23,7 +23,7 @@ use std::{ collections::{BTreeMap, BTreeSet, HashMap, HashSet}, fmt::{Debug, Display, Formatter, Result as FmtResult}, hash::Hash, - ops::{Bound, RangeBounds, RangeInclusive}, + ops::{Bound, Not, RangeBounds, RangeInclusive}, str::FromStr, }; @@ -40,12 +40,50 @@ use crate::{ identifier::{ByIdentifier, HasIdentifier, Identifier}, output::pivot::{Axis3, Dimension, Footnote, Footnotes, Group, PivotTable, Value}, settings::Show, - sys::raw::{CategoryLabels, RawString, VarType}, + sys::raw::RawString, }; /// An index within [Dictionary::variables]. pub type DictIndex = usize; +/// Variable type. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum VarType { + /// A numeric variable. + Numeric, + + /// A string variable. + String, +} + +impl Not for VarType { + type Output = Self; + + fn not(self) -> Self::Output { + match self { + Self::Numeric => Self::String, + Self::String => Self::Numeric, + } + } +} + +impl Not for &VarType { + type Output = VarType; + + fn not(self) -> Self::Output { + !*self + } +} + +impl Display for VarType { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match self { + VarType::Numeric => write!(f, "numeric"), + VarType::String => write!(f, "string"), + } + } +} + /// [VarType], plus a width for [VarType::String]. 
#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum VarWidth { @@ -198,15 +236,12 @@ impl Display for VarWidthAdjective { } #[derive(Clone)] -pub enum Datum { +pub enum Datum { Number(Option), - String(S), + String(RawString), } -impl Debug for Datum -where - S: Debug, -{ +impl Debug for Datum { fn fmt(&self, f: &mut Formatter) -> FmtResult { match self { Datum::Number(Some(number)) => write!(f, "{number:?}"), @@ -1662,6 +1697,12 @@ impl Measure { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CategoryLabels { + VarLabels, + CountedValues, +} + #[cfg(test)] mod test { use std::collections::HashSet; diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index ff36586fe0..f7abf58757 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -28,8 +28,8 @@ use thiserror::Error as ThisError; use unicode_width::UnicodeWidthStr; use crate::{ - dictionary::{Datum, VarWidth}, - sys::raw::{self, RawString, VarType}, + dictionary::{Datum, VarType, VarWidth}, + sys::raw::{self, RawString}, }; mod display; @@ -800,10 +800,10 @@ impl UncheckedFormat { } } -impl TryFrom for UncheckedFormat { +impl TryFrom for UncheckedFormat { type Error = Error; - fn try_from(raw: raw::RawFormat) -> Result { + fn try_from(raw: raw::records::RawFormat) -> Result { let raw = raw.0; let raw_format = (raw >> 16) as u16; let format = raw_format.try_into()?; diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index 6c34441add..a085ef4929 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -68,10 +68,9 @@ use thiserror::Error as ThisError; use tlo::parse_tlo; use crate::{ - dictionary::{Datum, Variable}, + dictionary::{Datum, VarType, Variable}, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, settings::{Settings, Show}, - sys::raw::VarType, }; pub mod output; diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 3b50a63ce5..914f03c10d 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -19,7 +19,8 @@ use std::{collections::BTreeMap, ops::Range}; use crate::{ calendar::date_time_to_pspp, dictionary::{ - Datum, Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet, MultipleResponseType, VarWidth, Variable, VariableSet + Datum, Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet, + MultipleResponseType, VarWidth, Variable, VariableSet, }, endian::Endian, format::{Error as FormatError, Format, UncheckedFormat}, @@ -29,12 +30,16 @@ use crate::{ sys::{ encoding::Error as EncodingError, raw::{ - self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, - FileAttributesRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, - LongNamesRecord, LongStringMissingValueRecord, LongStringValueLabelRecord, - MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawDatum, RawString, - RawWidth, ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributesRecord, - VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer, + self, + records::{ + Compression, DocumentRecord, EncodingRecord, Extension, FileAttributesRecord, + FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord, + LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord, + NumberOfCasesRecord, ProductInfoRecord, RawFormat, ValueLabel, ValueLabelRecord, + VarDisplayRecord, VariableAttributesRecord, 
VariableRecord, VariableSetRecord, + VeryLongStringsRecord, ZHeader, ZTrailer, + }, + Cases, DecodedRecord, RawDatum, RawString, RawWidth, }, }, }; @@ -44,8 +49,6 @@ use indexmap::set::MutableValues; use itertools::Itertools; use thiserror::Error as ThisError; -pub use crate::sys::raw::{CategoryLabels, Compression}; - #[derive(ThisError, Clone, Debug)] pub enum Error { #[error("Missing header record")] @@ -1118,7 +1121,7 @@ impl Decoder { impl MultipleResponseSet { fn decode( dictionary: &Dictionary, - input: &raw::MultipleResponseSet, + input: &raw::records::MultipleResponseSet, warn: &mut impl FnMut(Error), ) -> Result { let mr_set_name = input.name.clone(); @@ -1201,7 +1204,7 @@ fn fix_line_ends(s: &str) -> String { } fn decode_format( - raw: raw::RawFormat, + raw: RawFormat, width: VarWidth, mut warn: impl FnMut(Format, FormatError), ) -> Format { @@ -1218,11 +1221,11 @@ fn decode_format( impl MultipleResponseType { fn decode( mr_set: &Identifier, - input: &raw::MultipleResponseType, + input: &raw::records::MultipleResponseType, min_width: VarWidth, ) -> Result { match input { - raw::MultipleResponseType::MultipleDichotomy { value, labels } => { + raw::records::MultipleResponseType::MultipleDichotomy { value, labels } => { let value = match min_width { VarWidth::Numeric => { let string = String::from_utf8_lossy(&value.0); @@ -1256,7 +1259,7 @@ impl MultipleResponseType { labels: *labels, }) } - raw::MultipleResponseType::MultipleCategory => { + raw::records::MultipleResponseType::MultipleCategory => { Ok(MultipleResponseType::MultipleCategory) } } diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index a1da804a6f..f4e7034ca0 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -20,34 +20,45 @@ //! raw details. Most readers will want to use higher-level interfaces. use crate::{ - dictionary::{ - Alignment, Attributes, Datum, Measure, MissingValueRange, MissingValues, VarWidth, - }, + dictionary::{Datum, VarType, VarWidth}, endian::{Endian, Parse, ToBytes}, format::DisplayPlainF64, identifier::{Error as IdError, Identifier}, - sys::encoding::{default_encoding, get_encoding, Error as EncodingError}, + sys::{ + encoding::{default_encoding, get_encoding, Error as EncodingError}, + raw::records::{ + Compression, DocumentRecord, EncodingRecord, Extension, FileAttributesRecord, + FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord, + LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord, + NumberOfCasesRecord, ProductInfoRecord, RawDocumentLine, RawFileAttributesRecord, + RawLongNamesRecord, RawProductInfoRecord, RawVariableAttributesRecord, + RawVariableSetRecord, RawVeryLongStringsRecord, ValueLabelRecord, VarDisplayRecord, + VariableAttributesRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, + ZHeader, ZTrailer, + }, + }, }; use encoding_rs::{mem::decode_latin1, Encoding}; use flate2::read::ZlibDecoder; use itertools::Itertools; -use num::Integer; use smallvec::SmallVec; use std::{ borrow::{Borrow, Cow}, cell::RefCell, - collections::{BTreeMap, VecDeque}, + collections::VecDeque, fmt::{Debug, Display, Formatter, Result as FmtResult}, io::{empty, Error as IoError, Read, Seek, SeekFrom}, iter::repeat_n, mem::take, num::NonZeroU8, - ops::{Deref, Not, Range}, + ops::Deref, str::from_utf8, }; use thiserror::Error as ThisError; +pub mod records; + /// An error encountered reading raw system file records. /// /// Any error prevents reading further data from the system file. 
@@ -708,177 +719,6 @@ fn default_decode(s: &[u8]) -> Cow { from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Compression { - Simple, - ZLib, -} - -#[derive(Clone)] -pub struct HeaderRecord -where - S: Debug, -{ - /// Offset in file. - pub offsets: Range, - - /// Magic number. - pub magic: Magic, - - /// Eye-catcher string, product name, in the file's encoding. Padded - /// on the right with spaces. - pub eye_catcher: S, - - /// Layout code, normally either 2 or 3. - pub layout_code: u32, - - /// Number of variable positions, or `None` if the value in the file is - /// questionably trustworthy. - pub nominal_case_size: Option, - - /// Compression type, if any, - pub compression: Option, - - /// 1-based variable index of the weight variable, or `None` if the file is - /// unweighted. - pub weight_index: Option, - - /// Claimed number of cases, if known. - pub n_cases: Option, - - /// Compression bias, usually 100.0. - pub bias: f64, - - /// `dd mmm yy` in the file's encoding. - pub creation_date: S, - - /// `HH:MM:SS` in the file's encoding. - pub creation_time: S, - - /// File label, in the file's encoding. Padded on the right with spaces. - pub file_label: S, - - /// Endianness of the data in the file header. - pub endian: Endian, -} - -impl HeaderRecord -where - S: Debug, -{ - fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult - where - T: Debug, - { - writeln!(f, "{name:>17}: {:?}", value) - } -} - -impl Debug for HeaderRecord -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!(f, "File header record:")?; - self.debug_field(f, "Magic", self.magic)?; - self.debug_field(f, "Product name", &self.eye_catcher)?; - self.debug_field(f, "Layout code", self.layout_code)?; - self.debug_field(f, "Nominal case size", self.nominal_case_size)?; - self.debug_field(f, "Compression", self.compression)?; - self.debug_field(f, "Weight index", self.weight_index)?; - self.debug_field(f, "Number of cases", self.n_cases)?; - self.debug_field(f, "Compression bias", self.bias)?; - self.debug_field(f, "Creation date", &self.creation_date)?; - self.debug_field(f, "Creation time", &self.creation_time)?; - self.debug_field(f, "File label", &self.file_label)?; - self.debug_field(f, "Endianness", self.endian) - } -} - -impl HeaderRecord { - fn read(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result { - let start = r.stream_position()?; - - let magic: [u8; 4] = read_bytes(r)?; - let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; - - let eye_catcher = RawString(read_vec(r, 60)?); - let layout_code: [u8; 4] = read_bytes(r)?; - let endian = Endian::identify_u32(2, layout_code) - .or_else(|| Endian::identify_u32(2, layout_code)) - .ok_or(Error::NotASystemFile)?; - let layout_code = endian.parse(layout_code); - - let nominal_case_size: u32 = endian.parse(read_bytes(r)?); - let nominal_case_size = (1..i32::MAX as u32 / 16) - .contains(&nominal_case_size) - .then_some(nominal_case_size); - - let compression_code: u32 = endian.parse(read_bytes(r)?); - let compression = match (magic, compression_code) { - (Magic::Zsav, 2) => Some(Compression::ZLib), - (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)), - (_, 0) => None, - (_, 1) => Some(Compression::Simple), - (_, code) => return Err(Error::InvalidSavCompression(code)), - }; - - let weight_index: u32 = endian.parse(read_bytes(r)?); - let weight_index = (weight_index > 0).then_some(weight_index); - - let n_cases: u32 
= endian.parse(read_bytes(r)?); - let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); - - let bias: f64 = endian.parse(read_bytes(r)?); - if bias != 100.0 && bias != 0.0 { - warn(Warning::UnexpectedBias(bias)); - } - - let creation_date = RawString(read_vec(r, 9)?); - let creation_time = RawString(read_vec(r, 8)?); - let file_label = RawString(read_vec(r, 64)?); - let _: [u8; 3] = read_bytes(r)?; - - Ok(HeaderRecord { - offsets: start..r.stream_position()?, - magic, - layout_code, - nominal_case_size, - compression, - weight_index, - n_cases, - bias, - creation_date, - creation_time, - eye_catcher, - file_label, - endian, - }) - } - - pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { - let eye_catcher = decoder.decode(&self.eye_catcher).to_string(); - let file_label = decoder.decode(&self.file_label).to_string(); - let creation_date = decoder.decode(&self.creation_date).to_string(); - let creation_time = decoder.decode(&self.creation_time).to_string(); - DecodedRecord::Header(HeaderRecord { - eye_catcher, - weight_index: self.weight_index, - n_cases: self.n_cases, - file_label, - offsets: self.offsets.clone(), - magic: self.magic, - layout_code: self.layout_code, - nominal_case_size: self.nominal_case_size, - compression: self.compression, - bias: self.bias, - creation_date, - creation_time, - endian: self.endian, - }) - } -} - /// An [Encoding] along with a function to report decoding errors. /// /// This is used by functions that decode raw records. @@ -996,44 +836,6 @@ impl TryFrom<[u8; 4]> for Magic { } } -/// Variable type. -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum VarType { - /// A numeric variable. - Numeric, - - /// A string variable. - String, -} - -impl Not for VarType { - type Output = Self; - - fn not(self) -> Self::Output { - match self { - Self::Numeric => Self::String, - Self::String => Self::Numeric, - } - } -} - -impl Not for &VarType { - type Output = VarType; - - fn not(self) -> Self::Output { - !*self - } -} - -impl Display for VarType { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - VarType::Numeric => write!(f, "numeric"), - VarType::String => write!(f, "string"), - } - } -} - impl TryFrom for VarType { type Error = (); @@ -1662,150 +1464,6 @@ impl Iterator for Cases { } } -/// [crate::format::Format] as represented in a system file. -#[derive(Copy, Clone, PartialEq, Eq, Hash)] -pub struct RawFormat( - /// The most-significant 16 bits are the type, the next 8 bytes are the - /// width, and the least-significant 8 bits are the number of decimals. 
- pub u32, -); - -impl Debug for RawFormat { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let type_ = format_name(self.0 >> 16); - let w = (self.0 >> 8) & 0xff; - let d = self.0 & 0xff; - write!(f, "{:06x} ({type_}{w}.{d})", self.0) - } -} - -fn format_name(type_: u32) -> Cow<'static, str> { - match type_ { - 1 => "A", - 2 => "AHEX", - 3 => "COMMA", - 4 => "DOLLAR", - 5 => "F", - 6 => "IB", - 7 => "PIBHEX", - 8 => "P", - 9 => "PIB", - 10 => "PK", - 11 => "RB", - 12 => "RBHEX", - 15 => "Z", - 16 => "N", - 17 => "E", - 20 => "DATE", - 21 => "TIME", - 22 => "DATETIME", - 23 => "ADATE", - 24 => "JDATE", - 25 => "DTIME", - 26 => "WKDAY", - 27 => "MONTH", - 28 => "MOYR", - 29 => "QYR", - 30 => "WKYR", - 31 => "PCT", - 32 => "DOT", - 33 => "CCA", - 34 => "CCB", - 35 => "CCC", - 36 => "CCD", - 37 => "CCE", - 38 => "EDATE", - 39 => "SDATE", - 40 => "MTIME", - 41 => "YMDHMS", - _ => return format!("").into(), - } - .into() -} - -impl MissingValues { - fn read( - r: &mut R, - offset: u64, - raw_width: RawWidth, - code: i32, - endian: Endian, - warn: &mut dyn FnMut(Warning), - ) -> Result { - let (individual_values, has_range) = match code { - 0 => return Ok(Self::default()), - 1..=3 => (code as usize, false), - -2 => (0, true), - -3 => (1, true), - _ => return Err(Error::BadMissingValueCode { offset, code }), - }; - - let mut values = Vec::with_capacity(individual_values); - let range = if has_range { - let low = read_bytes::<8, _>(r)?; - let high = read_bytes::<8, _>(r)?; - Some((low, high)) - } else { - None - }; - for _ in 0..individual_values { - values.push(read_bytes::<8, _>(r)?); - } - - match VarWidth::try_from(raw_width) { - Ok(VarWidth::Numeric) => { - let values = values - .into_iter() - .map(|v| Datum::Number(endian.parse(v))) - .collect(); - - let range = range.map(|(low, high)| { - MissingValueRange::new(endian.parse(low), endian.parse(high)) - }); - return Ok(Self::new(values, range).unwrap()); - } - Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::MissingValueStringRange), - Ok(VarWidth::String(width)) => { - let width = width.min(8) as usize; - let values = values - .into_iter() - .map(|value| Datum::String(RawString::from(&value[..width]))) - .collect(); - return Ok(Self::new(values, None).unwrap()); - } - Err(()) => warn(Warning::MissingValueContinuation(offset)), - } - Ok(Self::default()) - } -} - -#[derive(Clone)] -pub struct VariableRecord -where - S: Debug, -{ - /// Range of offsets in file. - pub offsets: Range, - - /// Variable width, in the range -1..=255. - pub width: RawWidth, - - /// Variable name, padded on the right with spaces. - pub name: S, - - /// Print format. - pub print_format: RawFormat, - - /// Write format. - pub write_format: RawFormat, - - /// Missing values. - pub missing_values: MissingValues, - - /// Optional variable label. - pub label: Option, -} - /// Width of a variable record. 
#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum RawWidth { @@ -1857,92 +1515,6 @@ impl Display for RawWidth { } } -impl Debug for VariableRecord -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!(f, "Width: {}", self.width,)?; - writeln!(f, "Print format: {:?}", self.print_format)?; - writeln!(f, "Write format: {:?}", self.write_format)?; - writeln!(f, "Name: {:?}", &self.name)?; - writeln!(f, "Variable label: {:?}", self.label)?; - writeln!(f, "Missing values: {:?}", self.missing_values) - } -} - -impl VariableRecord { - fn read( - r: &mut R, - endian: Endian, - warn: &mut dyn FnMut(Warning), - ) -> Result { - let start_offset = r.stream_position()?; - let width: i32 = endian.parse(read_bytes(r)?); - let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth { - start_offset, - width, - })?; - let code_offset = r.stream_position()?; - let has_variable_label: u32 = endian.parse(read_bytes(r)?); - let missing_value_code: i32 = endian.parse(read_bytes(r)?); - let print_format = RawFormat(endian.parse(read_bytes(r)?)); - let write_format = RawFormat(endian.parse(read_bytes(r)?)); - let name = RawString(read_vec(r, 8)?); - - let label = match has_variable_label { - 0 => None, - 1 => { - let len: u32 = endian.parse(read_bytes(r)?); - let read_len = len.min(65535) as usize; - let label = RawString(read_vec(r, read_len)?); - - let padding_bytes = Integer::next_multiple_of(&len, &4) - len; - let _ = read_vec(r, padding_bytes as usize)?; - - Some(label) - } - _ => { - return Err(Error::BadVariableLabelCode { - start_offset, - code_offset, - code: has_variable_label, - }); - } - }; - - let missing_values = - MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?; - - let end_offset = r.stream_position()?; - - Ok(Record::Variable(VariableRecord { - offsets: start_offset..end_offset, - width, - name, - print_format, - write_format, - missing_values, - label, - })) - } - - pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { - DecodedRecord::Variable(VariableRecord { - offsets: self.offsets.clone(), - width: self.width, - name: decoder.decode(&self.name).to_string(), - print_format: self.print_format, - write_format: self.write_format, - missing_values: self.missing_values, - label: self - .label - .as_ref() - .map(|label| decoder.decode(label).to_string()), - }) - } -} - /// 8 bytes that represent a number or a string (but that's all we know). /// /// Used when we don't know whether it's a number or a string, or the numerical @@ -2225,1494 +1797,50 @@ impl Display for QuotedEncodedStr<'_> { } } -#[derive(Clone, Debug)] -pub struct ValueLabel -where - D: Debug, - S: Debug, -{ - pub datum: D, - pub label: S, -} - -#[derive(Clone)] -pub struct ValueLabelRecord -where - D: Debug, - S: Debug, -{ - /// Range of offsets in file. - pub offsets: Range, - - /// The labels. - pub labels: Vec>, - - /// The 1-based indexes of the variable indexes. - pub dict_indexes: Vec, - - /// The types of the variables. - pub var_type: VarType, -} - -impl Debug for ValueLabelRecord -where - D: Debug, - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!(f, "labels: ")?; - for label in self.labels.iter() { - writeln!(f, "{label:?}")?; - } - write!(f, "apply to {} variables", self.var_type)?; - for dict_index in self.dict_indexes.iter() { - write!(f, " #{dict_index}")?; - } - Ok(()) +fn skip_bytes(r: &mut R, mut n: usize) -> Result<(), IoError> { + thread_local! 
{ + static BUF: RefCell<[u8; 256]> = RefCell::new([0u8; 256]); } -} - -impl ValueLabelRecord -where - D: Debug, - S: Debug, -{ - /// Maximum number of value labels in a record. - pub const MAX_LABELS: u32 = u32::MAX / 8; - - /// Maximum number of variable indexes in a record. - pub const MAX_INDEXES: u32 = u32::MAX / 8; -} - -impl ValueLabelRecord { - fn read( - r: &mut R, - endian: Endian, - var_types: &VarTypes, - warn: &mut dyn FnMut(Warning), - ) -> Result, Error> { - let label_offset = r.stream_position()?; - let n: u32 = endian.parse(read_bytes(r)?); - if n > Self::MAX_LABELS { - return Err(Error::BadNumberOfValueLabels { - offset: label_offset, - n, - max: Self::MAX_LABELS, - }); - } - - let mut labels = Vec::new(); - for _ in 0..n { - let value = UntypedDatum(read_bytes(r)?); - let label_len: u8 = endian.parse(read_bytes(r)?); - let label_len = label_len as usize; - let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); - - let mut label = read_vec(r, padded_len - 1)?; - label.truncate(label_len); - labels.push((value, RawString(label))); - } - - let index_offset = r.stream_position()?; - let rec_type: u32 = endian.parse(read_bytes(r)?); - if rec_type != 4 { - return Err(Error::ExpectedVarIndexRecord { - offset: index_offset, - rec_type, - }); - } - - let n: u32 = endian.parse(read_bytes(r)?); - if n > Self::MAX_INDEXES { - return Err(Error::TooManyVarIndexes { - offset: index_offset, - n, - max: Self::MAX_INDEXES, - }); - } else if n == 0 { - dbg!(); - warn(Warning::NoVarIndexes { - offset: index_offset, - }); - return Ok(None); - } - - let index_offset = r.stream_position()?; - let mut dict_indexes = Vec::with_capacity(n as usize); - let mut invalid_indexes = Vec::new(); - for _ in 0..n { - let index: u32 = endian.parse(read_bytes(r)?); - if var_types.is_valid_index(index as usize) { - dict_indexes.push(index); - } else { - invalid_indexes.push(index); - } - } - if !invalid_indexes.is_empty() { - warn(Warning::InvalidVarIndexes { - offset: index_offset, - max: var_types.n_values(), - invalid: invalid_indexes, - }); - } - - let Some(&first_index) = dict_indexes.first() else { - return Ok(None); - }; - let var_type = VarType::from(var_types.types[first_index as usize - 1].unwrap()); - let mut wrong_type_indexes = Vec::new(); - dict_indexes.retain(|&index| { - if var_types.types[index as usize - 1].map(VarType::from) != Some(var_type) { - wrong_type_indexes.push(index); - false - } else { - true - } - }); - if !wrong_type_indexes.is_empty() { - warn(Warning::MixedVarTypes { - offset: index_offset, - var_type, - wrong_types: wrong_type_indexes, - }); - } - - let labels = labels - .into_iter() - .map(|(value, label)| ValueLabel { - datum: RawDatum::from_raw(&value, var_type, endian), - label, - }) - .collect(); - - let end_offset = r.stream_position()?; - Ok(Some(Record::ValueLabel(ValueLabelRecord { - offsets: label_offset..end_offset, - labels, - dict_indexes, - var_type, - }))) - } - - fn decode(self, decoder: &mut Decoder) -> ValueLabelRecord { - let labels = self - .labels - .iter() - .map( - |ValueLabel { - datum: value, - label, - }| ValueLabel { - datum: value.clone(), - label: decoder.decode(label).to_string(), - }, - ) - .collect(); - ValueLabelRecord { - offsets: self.offsets.clone(), - labels, - dict_indexes: self.dict_indexes.clone(), - var_type: self.var_type, + BUF.with_borrow_mut(|buf| { + while n > 0 { + let chunk = n.min(buf.len()); + r.read_exact(&mut buf[..n])?; + n -= chunk; } - } + Ok(()) + }) } -#[derive(Clone, Debug)] -pub struct DocumentRecord 
-where - S: Debug, -{ - pub offsets: Range, - - /// The document, as an array of lines. Raw lines are exactly 80 bytes long - /// and are right-padded with spaces without any new-line termination. - pub lines: Vec, -} - -pub type RawDocumentLine = RawStrArray; - -/// Length of a line in a document. Document lines are fixed-length and -/// padded on the right with spaces. -pub const DOC_LINE_LEN: usize = 80; - -impl DocumentRecord { - /// Maximum number of lines we will accept in a document. This is simply - /// the maximum number that will fit in a 32-bit space. - pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN; - - fn read(r: &mut R, endian: Endian) -> Result { - let start_offset = r.stream_position()?; - let n: u32 = endian.parse(read_bytes(r)?); - let n = n as usize; - if n > Self::MAX_LINES { - Err(Error::BadDocumentLength { - offset: start_offset, - n, - max: Self::MAX_LINES, - }) - } else { - let mut lines = Vec::with_capacity(n); - for _ in 0..n { - lines.push(RawStrArray(read_bytes(r)?)); - } - let end_offset = r.stream_position()?; - Ok(Record::Document(DocumentRecord { - offsets: start_offset..end_offset, - lines, - })) +fn try_read_bytes_into(r: &mut R, buf: &mut [u8]) -> Result { + let n = r.read(buf)?; + if n > 0 { + if n < buf.len() { + r.read_exact(&mut buf[n..])?; } - } - - pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { - DecodedRecord::Document(DocumentRecord { - offsets: self.offsets.clone(), - lines: self - .lines - .iter() - .map(|s| decoder.decode_slice(&s.0).to_string()) - .collect(), - }) - } -} - -struct ExtensionRecord<'a> { - size: Option, - count: Option, - name: &'a str, -} - -#[derive(Clone, Debug)] -pub struct IntegerInfoRecord { - pub offsets: Range, - pub version: (i32, i32, i32), - pub machine_code: i32, - pub floating_point_rep: i32, - pub compression_code: i32, - pub endianness: i32, - pub character_code: i32, -} - -static INTEGER_INFO_RECORD: ExtensionRecord = ExtensionRecord { - size: Some(4), - count: Some(8), - name: "integer record", -}; - -impl IntegerInfoRecord { - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size(&INTEGER_INFO_RECORD)?; - - let mut input = &ext.data[..]; - let data: Vec = (0..8) - .map(|_| endian.parse(read_bytes(&mut input).unwrap())) - .collect(); - Ok(Record::IntegerInfo(IntegerInfoRecord { - offsets: ext.offsets.clone(), - version: (data[0], data[1], data[2]), - machine_code: data[3], - floating_point_rep: data[4], - compression_code: data[5], - endianness: data[6], - character_code: data[7], - })) - } -} - -#[derive(Clone, Debug)] -pub struct FloatInfoRecord { - pub sysmis: f64, - pub highest: f64, - pub lowest: f64, -} - -static FLOAT_INFO_RECORD: ExtensionRecord = ExtensionRecord { - size: Some(8), - count: Some(3), - name: "floating point record", -}; - -impl FloatInfoRecord { - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size(&FLOAT_INFO_RECORD)?; - - let mut input = &ext.data[..]; - let data: Vec = (0..3) - .map(|_| endian.parse(read_bytes(&mut input).unwrap())) - .collect(); - Ok(Record::FloatInfo(FloatInfoRecord { - sysmis: data[0], - highest: data[1], - lowest: data[2], - })) + Ok(true) + } else { + Ok(false) } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum CategoryLabels { - VarLabels, - CountedValues, -} - -#[derive(Clone, Debug)] -pub enum MultipleResponseType { - MultipleDichotomy { - value: RawString, - labels: CategoryLabels, - }, - MultipleCategory, -} - -impl MultipleResponseType { - fn parse(input: &[u8]) 
-> Result<(MultipleResponseType, &[u8]), Warning> { - let (mr_type, input) = match input.split_first() { - Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input), - Some((b'D', input)) => { - let (value, input) = parse_counted_string(input)?; - ( - MultipleResponseType::MultipleDichotomy { - value, - labels: CategoryLabels::VarLabels, - }, - input, - ) - } - Some((b'E', input)) => { - let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") { - (CategoryLabels::CountedValues, rest) - } else if let Some(rest) = input.strip_prefix(b" 11 ") { - (CategoryLabels::VarLabels, rest) - } else { - return Err(Warning::InvalidMultipleDichotomyLabelType); - }; - let (value, input) = parse_counted_string(input)?; - ( - MultipleResponseType::MultipleDichotomy { value, labels }, - input, - ) - } - _ => return Err(Warning::InvalidMultipleResponseType), - }; - Ok((mr_type, input)) +fn try_read_bytes(r: &mut R) -> Result, IoError> { + let mut buf = [0; N]; + match try_read_bytes_into(r, &mut buf)? { + true => Ok(Some(buf)), + false => Ok(None), } } -#[derive(Clone, Debug)] -pub struct MultipleResponseSet -where - I: Debug, - S: Debug, -{ - pub name: I, - pub label: S, - pub mr_type: MultipleResponseType, - pub short_names: Vec, +fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { + let mut buf = [0; N]; + r.read_exact(&mut buf)?; + Ok(buf) } -impl MultipleResponseSet { - fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> { - let Some(equals) = input.iter().position(|&b| b == b'=') else { - return Err(Warning::MultipleResponseSyntaxError("missing `=`")); - }; - let (name, input) = input.split_at(equals); - let input = input.strip_prefix(b"=").unwrap(); - let (mr_type, input) = MultipleResponseType::parse(input)?; - let Some(input) = input.strip_prefix(b" ") else { - return Err(Warning::MultipleResponseSyntaxError( - "missing space after multiple response type", - )); - }; - let (label, mut input) = parse_counted_string(input)?; - let mut vars = Vec::new(); - while input.first() != Some(&b'\n') { - match input.split_first() { - Some((b' ', rest)) => { - let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else { - return Err(Warning::MultipleResponseSyntaxError( - "missing variable name delimiter", - )); - }; - let (var, rest) = rest.split_at(length); - if !var.is_empty() { - vars.push(var.into()); - } - input = rest; - } - _ => { - return Err(Warning::MultipleResponseSyntaxError( - "missing space preceding variable name", - )); - } - } - } - while input.first() == Some(&b'\n') { - input = &input[1..]; - } - Ok(( - MultipleResponseSet { - name: name.into(), - label, - mr_type, - short_names: vars, - }, - input, - )) - } - - fn decode( - &self, - decoder: &mut Decoder, - ) -> Result, Warning> { - let mut short_names = Vec::with_capacity(self.short_names.len()); - for short_name in self.short_names.iter() { - if let Some(short_name) = decoder - .decode_identifier(short_name) - .map_err(Warning::InvalidMrSetName) - .issue_warning(&mut decoder.warn) - { - short_names.push(short_name); - } - } - Ok(MultipleResponseSet { - name: decoder - .decode_identifier(&self.name) - .map_err(Warning::InvalidMrSetVariableName)?, - label: decoder.decode(&self.label).to_string(), - mr_type: self.mr_type.clone(), - short_names, - }) - } -} - -#[derive(Clone, Debug)] -pub struct MultipleResponseRecord(pub Vec>) -where - I: Debug, - S: Debug; - -static MULTIPLE_RESPONSE_RECORD: ExtensionRecord = ExtensionRecord { - size: Some(1), - count: None, - name: "multiple response set record", 
-}; - -impl MultipleResponseRecord { - fn parse(ext: &Extension, _endian: Endian) -> Result { - ext.check_size(&MULTIPLE_RESPONSE_RECORD)?; - - let mut input = &ext.data[..]; - let mut sets = Vec::new(); - loop { - while let Some(suffix) = input.strip_prefix(b"\n") { - input = suffix; - } - if input.is_empty() { - break; - } - let (set, rest) = MultipleResponseSet::parse(input)?; - sets.push(set); - input = rest; - } - Ok(Record::MultipleResponse(MultipleResponseRecord(sets))) - } -} - -impl MultipleResponseRecord { - fn decode(self, decoder: &mut Decoder) -> DecodedRecord { - let mut sets = Vec::new(); - for set in self.0.iter() { - if let Some(set) = set.decode(decoder).issue_warning(&mut decoder.warn) { - sets.push(set); - } - } - DecodedRecord::MultipleResponse(MultipleResponseRecord(sets)) - } -} - -fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> { - let Some(space) = input.iter().position(|&b| b == b' ') else { - return Err(Warning::CountedStringMissingSpace); - }; - let Ok(length) = from_utf8(&input[..space]) else { - return Err(Warning::CountedStringInvalidUTF8); - }; - let Ok(length): Result = length.parse() else { - return Err(Warning::CountedStringInvalidLength(length.into())); - }; - - let Some((string, rest)) = input[space + 1..].split_at_checked(length) else { - return Err(Warning::CountedStringTooLong(length)); - }; - Ok((string.into(), rest)) -} - -impl Measure { - fn try_decode(source: u32) -> Result, Warning> { - match source { - 0 => Ok(None), - 1 => Ok(Some(Measure::Nominal)), - 2 => Ok(Some(Measure::Ordinal)), - 3 => Ok(Some(Measure::Scale)), - _ => Err(Warning::InvalidMeasurement(source)), - } - } -} - -impl Alignment { - fn try_decode(source: u32) -> Result, Warning> { - match source { - 0 => Ok(Some(Alignment::Left)), - 1 => Ok(Some(Alignment::Right)), - 2 => Ok(Some(Alignment::Center)), - _ => Err(Warning::InvalidAlignment(source)), - } - } -} - -#[derive(Clone, Debug)] -pub struct VarDisplay { - pub measure: Option, - pub width: Option, - pub alignment: Option, -} - -#[derive(Clone, Debug)] -pub struct VarDisplayRecord(pub Vec); - -impl VarDisplayRecord { - fn parse( - ext: &Extension, - var_types: &VarTypes, - endian: Endian, - warn: &mut dyn FnMut(Warning), - ) -> Result { - if ext.size != 4 { - return Err(Warning::BadRecordSize { - offset: ext.offsets.start, - record: String::from("variable display record"), - size: ext.size, - expected_size: 4, - }); - } - - let n_vars = var_types.n_vars(); - let has_width = if ext.count as usize == 3 * n_vars { - true - } else if ext.count as usize == 2 * n_vars { - false - } else { - return Err(Warning::InvalidVariableDisplayCount { - count: ext.count as usize, - first: 2 * n_vars, - second: 3 * n_vars, - }); - }; - - let mut var_displays = Vec::new(); - let mut input = &ext.data[..]; - for _ in 0..n_vars { - let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .issue_warning(warn) - .flatten(); - let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap())); - let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .issue_warning(warn) - .flatten(); - var_displays.push(VarDisplay { - measure, - width, - alignment, - }); - } - Ok(Record::VarDisplay(VarDisplayRecord(var_displays))) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringMissingValues -where - N: Debug, -{ - /// Variable name. - pub var_name: N, - - /// Missing values. 
- pub missing_values: Vec>, -} - -impl LongStringMissingValues { - fn decode( - &self, - decoder: &mut Decoder, - ) -> Result, IdError> { - Ok(LongStringMissingValues { - var_name: decoder.decode_identifier(&self.var_name)?, - missing_values: self.missing_values.clone(), - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringMissingValueRecord(pub Vec>) -where - N: Debug; - -static LONG_STRING_MISSING_VALUE_RECORD: ExtensionRecord = ExtensionRecord { - size: Some(1), - count: None, - name: "long string missing values record", -}; - -impl LongStringMissingValueRecord { - fn parse( - ext: &Extension, - endian: Endian, - warn: &mut dyn FnMut(Warning), - ) -> Result { - ext.check_size(&LONG_STRING_MISSING_VALUE_RECORD)?; - - let mut input = &ext.data[..]; - let mut missing_value_set = Vec::new(); - while !input.is_empty() { - let var_name = read_string(&mut input, endian)?; - dbg!(&var_name); - let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?); - let value_len: u32 = endian.parse(read_bytes(&mut input)?); - if value_len != 8 { - let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start; - warn(Warning::BadLongMissingValueLength { - record_offset: ext.offsets.start, - offset, - value_len, - }); - read_vec( - &mut input, - dbg!(value_len as usize * n_missing_values as usize), - )?; - continue; - } - let mut missing_values = Vec::new(); - for i in 0..n_missing_values { - if i > 0 { - // Tolerate files written by old, buggy versions of PSPP - // where we believed that the value_length was repeated - // before each missing value. - let mut peek = input; - let number: u32 = endian.parse(read_bytes(&mut peek)?); - if number == 8 { - input = peek; - } - } - - let value: [u8; 8] = read_bytes(&mut input)?; - missing_values.push(RawStrArray(value)); - } - missing_value_set.push(LongStringMissingValues { - var_name, - missing_values, - }); - } - Ok(Record::LongStringMissingValues( - LongStringMissingValueRecord(missing_value_set), - )) - } -} - -impl LongStringMissingValueRecord { - pub fn decode(self, decoder: &mut Decoder) -> LongStringMissingValueRecord { - let mut mvs = Vec::with_capacity(self.0.len()); - for mv in self.0.iter() { - if let Some(mv) = mv - .decode(decoder) - .map_err(Warning::InvalidLongStringMissingValueVariableName) - .issue_warning(&mut decoder.warn) - { - mvs.push(mv); - } - } - LongStringMissingValueRecord(mvs) - } -} - -#[derive(Clone, Debug)] -pub struct EncodingRecord(pub String); - -static ENCODING_RECORD: ExtensionRecord = ExtensionRecord { - size: Some(1), - count: None, - name: "encoding record", -}; - -impl EncodingRecord { - fn parse(ext: &Extension, _endian: Endian) -> Result { - ext.check_size(&ENCODING_RECORD)?; - - Ok(Record::Encoding(EncodingRecord( - String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName { - offset: ext.offsets.start, - })?, - ))) - } -} - -#[derive(Clone, Debug)] -pub struct NumberOfCasesRecord { - /// Always observed as 1. - pub one: u64, - - /// Number of cases. 
- pub n_cases: u64, -} - -static NUMBER_OF_CASES_RECORD: ExtensionRecord = ExtensionRecord { - size: Some(8), - count: Some(2), - name: "extended number of cases record", -}; - -impl NumberOfCasesRecord { - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size(&NUMBER_OF_CASES_RECORD)?; - - let mut input = &ext.data[..]; - let one = endian.parse(read_bytes(&mut input)?); - let n_cases = endian.parse(read_bytes(&mut input)?); - - Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases })) - } -} - -#[derive(Clone, Debug)] -pub struct RawVariableSetRecord(TextRecord); - -impl RawVariableSetRecord { - fn parse(extension: Extension) -> Result { - Ok(Record::VariableSets(Self(TextRecord::parse( - extension, - "variable sets record", - )?))) - } - fn decode(self, decoder: &mut Decoder) -> VariableSetRecord { - let mut sets = Vec::new(); - let input = decoder.decode(&self.0.text); - for line in input.lines() { - if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&mut decoder.warn) { - sets.push(set) - } - } - VariableSetRecord { - offsets: self.0.offsets, - sets, - } - } -} - -#[derive(Clone, Debug)] -pub struct RawProductInfoRecord(TextRecord); - -impl RawProductInfoRecord { - fn parse(extension: Extension) -> Result { - Ok(Record::ProductInfo(Self(TextRecord::parse( - extension, - "product info record", - )?))) - } - fn decode(self, decoder: &mut Decoder) -> ProductInfoRecord { - ProductInfoRecord(decoder.decode(&self.0.text).into()) - } -} - -#[derive(Clone, Debug)] -pub struct RawLongNamesRecord(TextRecord); - -impl RawLongNamesRecord { - fn parse(extension: Extension) -> Result { - Ok(Record::LongNames(Self(TextRecord::parse( - extension, - "long names record", - )?))) - } - fn decode(self, decoder: &mut Decoder) -> LongNamesRecord { - let input = decoder.decode(&self.0.text); - let mut names = Vec::new(); - for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&mut decoder.warn) - { - names.push(long_name); - } - } - LongNamesRecord(names) - } -} - -#[derive(Clone, Debug)] -pub struct TextRecord { - pub offsets: Range, - - /// The text content of the record. 
- pub text: RawString, -} - -impl TextRecord { - fn parse(extension: Extension, name: &str) -> Result { - extension.check_size(&ExtensionRecord { - size: Some(1), - count: None, - name, - })?; - Ok(Self { - offsets: extension.offsets, - text: extension.data.into(), - }) - } -} - -#[derive(Clone, Debug)] -pub struct VeryLongString { - pub short_name: Identifier, - pub length: u16, -} - -impl VeryLongString { - fn parse(decoder: &Decoder, input: &str) -> Result { - let Some((short_name, length)) = input.split_once('=') else { - return Err(Warning::VeryLongStringMissingDelimiter(input.into())); - }; - let short_name = decoder - .new_identifier(short_name) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidLongStringName)?; - let length = length - .parse() - .map_err(|_| Warning::VeryLongStringInvalidLength(input.into()))?; - Ok(VeryLongString { short_name, length }) - } -} - -#[derive(Clone, Debug)] -pub struct RawVeryLongStringsRecord(TextRecord); - -#[derive(Clone, Debug)] -pub struct VeryLongStringsRecord(pub Vec); - -impl RawVeryLongStringsRecord { - fn parse(extension: Extension) -> Result { - Ok(Record::VeryLongStrings(Self(TextRecord::parse( - extension, - "very long strings record", - )?))) - } - fn decode(self, decoder: &mut Decoder) -> VeryLongStringsRecord { - let input = decoder.decode(&self.0.text); - let mut very_long_strings = Vec::new(); - for tuple in input - .split('\0') - .map(|s| s.trim_start_matches('\t')) - .filter(|s| !s.is_empty()) - { - if let Some(vls) = - VeryLongString::parse(decoder, tuple).issue_warning(&mut decoder.warn) - { - very_long_strings.push(vls) - } - } - VeryLongStringsRecord(very_long_strings) - } -} - -#[derive(Clone, Debug)] -pub struct Attribute { - pub name: Identifier, - pub values: Vec, -} - -impl Attribute { - fn parse<'a>(decoder: &mut Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> { - let Some((name, mut input)) = input.split_once('(') else { - return Err(Warning::AttributeMissingLParen(input.into())); - }; - let name = decoder - .new_identifier(name) - .map_err(Warning::InvalidAttributeName)?; - let mut values = Vec::new(); - loop { - let Some((value, rest)) = input.split_once('\n') else { - return Err(Warning::AttributeMissingValue { - name: name.clone(), - index: values.len(), - }); - }; - if let Some(stripped) = value - .strip_prefix('\'') - .and_then(|value| value.strip_suffix('\'')) - { - values.push(stripped.into()); - } else { - decoder.warn(Warning::AttributeMissingQuotes { - name: name.clone(), - index: values.len(), - }); - values.push(value.into()); - } - if let Some(rest) = rest.strip_prefix(')') { - let attribute = Attribute { name, values }; - return Ok((attribute, rest)); - }; - input = rest; - } - } -} - -impl Attributes { - fn parse<'a>( - decoder: &mut Decoder, - mut input: &'a str, - sentinel: Option, - ) -> Result<(Attributes, &'a str, Vec), Warning> { - let mut attributes = BTreeMap::new(); - let mut duplicates = Vec::new(); - let rest = loop { - match input.chars().next() { - None => break input, - c if c == sentinel => break &input[1..], - _ => { - let (attribute, rest) = Attribute::parse(decoder, input)?; - if attributes.contains_key(&attribute.name) { - duplicates.push(attribute.name.clone()); - } - attributes.insert(attribute.name, attribute.values); - input = rest; - } - } - }; - Ok((Attributes(attributes), rest, duplicates)) - } -} - -#[derive(Clone, Debug)] -pub struct RawFileAttributesRecord(TextRecord); - -#[derive(Clone, Debug, Default)] -pub struct FileAttributesRecord(pub 
Attributes); - -impl RawFileAttributesRecord { - fn parse(extension: Extension) -> Result { - Ok(Record::FileAttributes(Self(TextRecord::parse( - extension, - "file attributes record", - )?))) - } - fn decode(self, decoder: &mut Decoder) -> FileAttributesRecord { - let input = decoder.decode(&self.0.text); - match Attributes::parse(decoder, &input, None).issue_warning(&mut decoder.warn) { - Some((set, rest, duplicates)) => { - if !duplicates.is_empty() { - decoder.warn(Warning::DuplicateFileAttributes { - attributes: duplicates, - }); - } - if !rest.is_empty() { - decoder.warn(dbg!(Warning::TBD)); - } - FileAttributesRecord(set) - } - None => FileAttributesRecord::default(), - } - } -} - -#[derive(Clone, Debug)] -pub struct VarAttributes { - pub long_var_name: Identifier, - pub attributes: Attributes, -} - -impl VarAttributes { - fn parse<'a>( - decoder: &mut Decoder, - input: &'a str, - ) -> Result<(VarAttributes, &'a str), Warning> { - let Some((long_var_name, rest)) = input.split_once(':') else { - return Err(dbg!(Warning::TBD)); - }; - let long_var_name = decoder - .new_identifier(long_var_name) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidAttributeVariableName)?; - let (attributes, rest, duplicates) = Attributes::parse(decoder, rest, Some('/'))?; - if !duplicates.is_empty() { - decoder.warn(Warning::DuplicateVariableAttributes { - variable: long_var_name.clone(), - attributes: duplicates, - }); - } - let var_attribute = VarAttributes { - long_var_name, - attributes, - }; - Ok((var_attribute, rest)) - } -} - -#[derive(Clone, Debug)] -pub struct RawVariableAttributesRecord(TextRecord); - -#[derive(Clone, Debug)] -pub struct VariableAttributesRecord(pub Vec); - -impl RawVariableAttributesRecord { - fn parse(extension: Extension) -> Result { - Ok(Record::VariableAttributes(Self(TextRecord::parse( - extension, - "variable attributes record", - )?))) - } - fn decode(self, decoder: &mut Decoder) -> VariableAttributesRecord { - let decoded = decoder.decode(&self.0.text); - let mut input = decoded.as_ref(); - let mut var_attribute_sets = Vec::new(); - while !input.is_empty() { - let Some((var_attribute, rest)) = - VarAttributes::parse(decoder, input).issue_warning(&mut decoder.warn) - else { - break; - }; - var_attribute_sets.push(var_attribute); - input = rest; - } - VariableAttributesRecord(var_attribute_sets) - } -} - -#[derive(Clone, Debug)] -pub struct LongName { - pub short_name: Identifier, - pub long_name: Identifier, -} - -impl LongName { - fn parse(input: &str, decoder: &Decoder) -> Result { - let Some((short_name, long_name)) = input.split_once('=') else { - return Err(dbg!(Warning::LongNameMissingEquals)); - }; - let short_name = decoder - .new_identifier(short_name) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidShortName)?; - let long_name = decoder - .new_identifier(long_name) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidLongName)?; - Ok(LongName { - short_name, - long_name, - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongNamesRecord(pub Vec); - -#[derive(Clone, Debug)] -pub struct ProductInfoRecord(pub String); - -#[derive(Clone, Debug)] -pub struct VariableSet { - pub name: String, - pub variable_names: Vec, -} - -impl VariableSet { - fn parse(input: &str, decoder: &mut Decoder) -> Result { - let (name, input) = input - .split_once('=') - .ok_or(Warning::VariableSetMissingEquals)?; - let mut vars = Vec::new(); - for var in input.split_ascii_whitespace() { - if let Some(identifier) = decoder - 
.new_identifier(var) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidVariableSetName) - .issue_warning(&mut decoder.warn) - { - vars.push(identifier); - } - } - Ok(VariableSet { - name: name.to_string(), - variable_names: vars, - }) - } -} - -#[derive(Clone, Debug)] -pub struct VariableSetRecord { - pub offsets: Range, - pub sets: Vec, -} - -trait IssueWarning { - fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option; -} -impl IssueWarning for Result { - fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option { - match self { - Ok(result) => Some(result), - Err(error) => { - warn(error); - None - } - } - } -} - -#[derive(Clone, Debug)] -pub struct Extension { - pub offsets: Range, - - /// Record subtype. - pub subtype: u32, - - /// Size of each data element. - pub size: u32, - - /// Number of data elements. - pub count: u32, - - /// `size * count` bytes of data. - pub data: Vec, -} - -impl Extension { - fn check_size(&self, expected: &ExtensionRecord) -> Result<(), Warning> { - match expected.size { - Some(expected_size) if self.size != expected_size => { - return Err(Warning::BadRecordSize { - offset: self.offsets.start, - record: expected.name.into(), - size: self.size, - expected_size, - }); - } - _ => (), - } - match expected.count { - Some(expected_count) if self.count != expected_count => { - return Err(Warning::BadRecordCount { - offset: self.offsets.start, - record: expected.name.into(), - count: self.count, - expected_count, - }); - } - _ => (), - } - Ok(()) - } - - fn read( - r: &mut R, - endian: Endian, - var_types: &VarTypes, - warn: &mut dyn FnMut(Warning), - ) -> Result, Error> { - let subtype = endian.parse(read_bytes(r)?); - let header_offset = r.stream_position()?; - let size: u32 = endian.parse(read_bytes(r)?); - let count = endian.parse(read_bytes(r)?); - let Some(product) = size.checked_mul(count) else { - return Err(Error::ExtensionRecordTooLarge { - offset: header_offset, - subtype, - size, - count, - }); - }; - let start_offset = r.stream_position()?; - let data = read_vec(r, product as usize)?; - let end_offset = start_offset + product as u64; - let extension = Extension { - offsets: start_offset..end_offset, - subtype, - size, - count, - data, - }; - let result = match subtype { - 3 => IntegerInfoRecord::parse(&extension, endian), - 4 => FloatInfoRecord::parse(&extension, endian), - 11 => VarDisplayRecord::parse(&extension, var_types, endian, warn), - 7 | 19 => MultipleResponseRecord::parse(&extension, endian), - 21 => LongStringValueLabelRecord::parse(&extension, endian), - 22 => LongStringMissingValueRecord::parse(&extension, endian, warn), - 20 => EncodingRecord::parse(&extension, endian), - 16 => NumberOfCasesRecord::parse(&extension, endian), - 5 => RawVariableSetRecord::parse(extension), - 10 => RawProductInfoRecord::parse(extension), - 13 => RawLongNamesRecord::parse(extension), - 14 => RawVeryLongStringsRecord::parse(extension), - 17 => RawFileAttributesRecord::parse(extension), - 18 => RawVariableAttributesRecord::parse(extension), - _ => Ok(Record::OtherExtension(extension)), - }; - match result { - Ok(result) => Ok(Some(result)), - Err(error) => { - warn(error); - Ok(None) - } - } - } -} - -#[derive(Clone, Debug)] -pub struct ZHeader { - /// File offset to the start of the record. - pub offset: u64, - - /// File offset to the ZLIB data header. - pub zheader_offset: u64, - - /// File offset to the ZLIB trailer. - pub ztrailer_offset: u64, - - /// Length of the ZLIB trailer in bytes. 
- pub ztrailer_len: u64, -} - -impl ZHeader { - fn read(r: &mut R, endian: Endian) -> Result { - let offset = r.stream_position()?; - let zheader_offset: u64 = endian.parse(read_bytes(r)?); - let ztrailer_offset: u64 = endian.parse(read_bytes(r)?); - let ztrailer_len: u64 = endian.parse(read_bytes(r)?); - - if zheader_offset != offset { - return Err(Error::UnexpectedZHeaderOffset { - actual: zheader_offset, - expected: offset, - }); - } - - if ztrailer_offset < offset { - return Err(Error::ImpossibleZTrailerOffset(ztrailer_offset)); - } - - if ztrailer_len < 24 || ztrailer_len % 24 != 0 { - return Err(Error::InvalidZTrailerLength(ztrailer_len)); - } - - Ok(ZHeader { - offset, - zheader_offset, - ztrailer_offset, - ztrailer_len, - }) - } -} - -#[derive(Clone, Debug)] -pub struct ZTrailer { - /// File offset to the start of the record. - pub offset: u64, - - /// Compression bias as a negative integer, e.g. -100. - pub int_bias: i64, - - /// Always observed as zero. - pub zero: u64, - - /// Uncompressed size of each block, except possibly the last. Only - /// `0x3ff000` has been observed so far. - pub block_size: u32, - - /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them. - pub blocks: Vec, -} - -#[derive(Clone, Debug)] -pub struct ZBlock { - /// Offset of block of data if simple compression were used. - pub uncompressed_ofs: u64, - - /// Actual offset within the file of the compressed data block. - pub compressed_ofs: u64, - - /// The number of bytes in this data block after decompression. This is - /// `block_size` in every data block but the last, which may be smaller. - pub uncompressed_size: u32, - - /// The number of bytes in this data block, as stored compressed in this - /// file. - pub compressed_size: u32, -} - -impl ZBlock { - fn read(r: &mut R, endian: Endian) -> Result { - Ok(ZBlock { - uncompressed_ofs: endian.parse(read_bytes(r)?), - compressed_ofs: endian.parse(read_bytes(r)?), - uncompressed_size: endian.parse(read_bytes(r)?), - compressed_size: endian.parse(read_bytes(r)?), - }) - } -} - -impl ZTrailer { - fn read( - reader: &mut R, - endian: Endian, - bias: f64, - zheader: &ZHeader, - warn: &mut dyn FnMut(Warning), - ) -> Result, Error> { - let start_offset = reader.stream_position()?; - if reader - .seek(SeekFrom::Start(zheader.ztrailer_offset)) - .is_err() - { - return Ok(None); - } - let int_bias = endian.parse(read_bytes(reader)?); - if int_bias as f64 != -bias { - return Err(Error::WrongZlibTrailerBias { - actual: int_bias, - expected: -bias, - }); - } - let zero = endian.parse(read_bytes(reader)?); - if zero != 0 { - return Err(Error::WrongZlibTrailerZero(zero)); - } - let block_size = endian.parse(read_bytes(reader)?); - if block_size != 0x3ff000 { - return Err(Error::WrongZlibTrailerBlockSize(block_size)); - } - let n_blocks: u32 = endian.parse(read_bytes(reader)?); - let expected_n_blocks = (zheader.ztrailer_len - 24) / 24; - if n_blocks as u64 != expected_n_blocks { - return Err(Error::BadZlibTrailerNBlocks { - offset: zheader.ztrailer_offset, - n_blocks, - expected_n_blocks, - ztrailer_len: zheader.ztrailer_len, - }); - } - let blocks = (0..n_blocks) - .map(|_| ZBlock::read(reader, endian)) - .collect::, _>>()?; - - let mut expected_uncmp_ofs = zheader.zheader_offset; - let mut expected_cmp_ofs = zheader.zheader_offset + 24; - for (index, block) in blocks.iter().enumerate() { - if block.uncompressed_ofs != expected_uncmp_ofs { - return Err(Error::ZlibTrailerBlockWrongUncmpOfs { - index, - actual: block.uncompressed_ofs, - expected: 
expected_cmp_ofs, - }); - } - if block.compressed_ofs != expected_cmp_ofs { - return Err(Error::ZlibTrailerBlockWrongCmpOfs { - index, - actual: block.compressed_ofs, - expected: expected_cmp_ofs, - }); - } - if index < blocks.len() - 1 { - if block.uncompressed_size != block_size { - warn(Warning::ZlibTrailerBlockWrongSize { - index, - actual: block.uncompressed_size, - expected: block_size, - }); - } - } else { - if block.uncompressed_size > block_size { - warn(Warning::ZlibTrailerBlockTooBig { - index, - actual: block.uncompressed_size, - max_expected: block_size, - }); - } - } - // http://www.zlib.net/zlib_tech.html says that the maximum - // expansion from compression, with worst-case parameters, is 13.5% - // plus 11 bytes. This code checks for an expansion of more than - // 14.3% plus 11 bytes. - if block.compressed_size > block.uncompressed_size + block.uncompressed_size / 7 + 11 { - return Err(Error::ZlibExpansion { - index, - compressed_size: block.compressed_size, - uncompressed_size: block.uncompressed_size, - }); - } - - expected_cmp_ofs += block.compressed_size as u64; - expected_uncmp_ofs += block.uncompressed_size as u64; - } - - if expected_cmp_ofs != zheader.ztrailer_offset { - return Err(Error::ZlibTrailerOffsetInconsistency { - descriptors: expected_cmp_ofs, - zheader: zheader.ztrailer_offset, - }); - } - - reader.seek(SeekFrom::Start(start_offset))?; - Ok(Some(ZTrailer { - offset: zheader.ztrailer_offset, - int_bias, - zero, - block_size, - blocks, - })) - } -} - -fn skip_bytes(r: &mut R, mut n: usize) -> Result<(), IoError> { - thread_local! { - static BUF: RefCell<[u8; 256]> = RefCell::new([0u8; 256]); - } - BUF.with_borrow_mut(|buf| { - while n > 0 { - let chunk = n.min(buf.len()); - r.read_exact(&mut buf[..n])?; - n -= chunk; - } - Ok(()) - }) -} - -fn try_read_bytes_into(r: &mut R, buf: &mut [u8]) -> Result { - let n = r.read(buf)?; - if n > 0 { - if n < buf.len() { - r.read_exact(&mut buf[n..])?; - } - Ok(true) - } else { - Ok(false) - } -} - -fn try_read_bytes(r: &mut R) -> Result, IoError> { - let mut buf = [0; N]; - match try_read_bytes_into(r, &mut buf)? { - true => Ok(Some(buf)), - false => Ok(None), - } -} - -fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { - let mut buf = [0; N]; - r.read_exact(&mut buf)?; - Ok(buf) -} - -fn read_vec(r: &mut R, n: usize) -> Result, IoError> { - let mut vec = vec![0; n]; - r.read_exact(&mut vec)?; - Ok(vec) +fn read_vec(r: &mut R, n: usize) -> Result, IoError> { + let mut vec = vec![0; n]; + r.read_exact(&mut vec)?; + Ok(vec) } fn read_string(r: &mut R, endian: Endian) -> Result { @@ -3720,94 +1848,6 @@ fn read_string(r: &mut R, endian: Endian) -> Result Ok(read_vec(r, length as usize)?.into()) } -#[derive(Clone, Debug)] -pub struct LongStringValueLabels -where - S: Debug, -{ - pub var_name: N, - pub width: u32, - - /// `(value, label)` pairs, where each value is `width` bytes. 
- pub labels: Vec<(RawString, S)>, -} - -impl LongStringValueLabels { - fn decode( - &self, - decoder: &mut Decoder, - ) -> Result, Warning> { - let var_name = decoder.decode(&self.var_name); - let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding) - .map_err(Warning::InvalidLongStringValueLabelName)?; - - let mut labels = Vec::with_capacity(self.labels.len()); - for (value, label) in self.labels.iter() { - let label = decoder.decode(label).to_string(); - labels.push((value.clone(), label)); - } - - Ok(LongStringValueLabels { - var_name, - width: self.width, - labels, - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringValueLabelRecord(pub Vec>) -where - N: Debug, - S: Debug; - -static LONG_STRING_VALUE_LABEL_RECORD: ExtensionRecord = ExtensionRecord { - size: Some(1), - count: None, - name: "long string value labels record", -}; - -impl LongStringValueLabelRecord { - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size(&LONG_STRING_VALUE_LABEL_RECORD)?; - - let mut input = &ext.data[..]; - let mut label_set = Vec::new(); - while !input.is_empty() { - let var_name = read_string(&mut input, endian)?; - let width: u32 = endian.parse(read_bytes(&mut input)?); - let n_labels: u32 = endian.parse(read_bytes(&mut input)?); - let mut labels = Vec::new(); - for _ in 0..n_labels { - let value = read_string(&mut input, endian)?; - let label = read_string(&mut input, endian)?; - labels.push((value, label)); - } - label_set.push(LongStringValueLabels { - var_name, - width, - labels, - }) - } - Ok(Record::LongStringValueLabels(LongStringValueLabelRecord( - label_set, - ))) - } -} - -impl LongStringValueLabelRecord { - fn decode(self, decoder: &mut Decoder) -> LongStringValueLabelRecord { - let mut labels = Vec::with_capacity(self.0.len()); - for label in &self.0 { - match label.decode(decoder) { - Ok(set) => labels.push(set), - Err(error) => decoder.warn(error), - } - } - LongStringValueLabelRecord(labels) - } -} - #[derive(Default)] pub struct VarTypes { pub types: Vec>, diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs new file mode 100644 index 0000000000..00cbc9ca2c --- /dev/null +++ b/rust/pspp/src/sys/raw/records.rs @@ -0,0 +1,1952 @@ +//! Raw records. +//! +//! Separated into a submodule just to reduce clutter. + +use std::{ + borrow::Cow, + collections::BTreeMap, + fmt::{Debug, Formatter}, + io::{Read, Seek, SeekFrom}, + ops::Range, + str::from_utf8, +}; + +use crate::{ + dictionary::{ + Alignment, Attributes, CategoryLabels, Datum, Measure, MissingValueRange, MissingValues, + VarType, VarWidth, + }, + endian::{Endian, Parse}, + identifier::{Error as IdError, Identifier}, + sys::raw::{ + read_bytes, read_string, read_vec, DecodedRecord, Decoder, Error, Magic, RawDatum, + RawStrArray, RawString, RawWidth, Record, VarTypes, Warning, + }, +}; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Compression { + Simple, + ZLib, +} + +#[derive(Clone)] +pub struct HeaderRecord +where + S: Debug, +{ + /// Offset in file. + pub offsets: Range, + + /// Magic number. + pub magic: Magic, + + /// Eye-catcher string, product name, in the file's encoding. Padded + /// on the right with spaces. + pub eye_catcher: S, + + /// Layout code, normally either 2 or 3. + pub layout_code: u32, + + /// Number of variable positions, or `None` if the value in the file is + /// questionably trustworthy. 
+ pub nominal_case_size: Option, + + /// Compression type, if any, + pub compression: Option, + + /// 1-based variable index of the weight variable, or `None` if the file is + /// unweighted. + pub weight_index: Option, + + /// Claimed number of cases, if known. + pub n_cases: Option, + + /// Compression bias, usually 100.0. + pub bias: f64, + + /// `dd mmm yy` in the file's encoding. + pub creation_date: S, + + /// `HH:MM:SS` in the file's encoding. + pub creation_time: S, + + /// File label, in the file's encoding. Padded on the right with spaces. + pub file_label: S, + + /// Endianness of the data in the file header. + pub endian: Endian, +} + +impl HeaderRecord +where + S: Debug, +{ + fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> std::fmt::Result + where + T: Debug, + { + writeln!(f, "{name:>17}: {:?}", value) + } +} + +impl Debug for HeaderRecord +where + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + writeln!(f, "File header record:")?; + self.debug_field(f, "Magic", self.magic)?; + self.debug_field(f, "Product name", &self.eye_catcher)?; + self.debug_field(f, "Layout code", self.layout_code)?; + self.debug_field(f, "Nominal case size", self.nominal_case_size)?; + self.debug_field(f, "Compression", self.compression)?; + self.debug_field(f, "Weight index", self.weight_index)?; + self.debug_field(f, "Number of cases", self.n_cases)?; + self.debug_field(f, "Compression bias", self.bias)?; + self.debug_field(f, "Creation date", &self.creation_date)?; + self.debug_field(f, "Creation time", &self.creation_time)?; + self.debug_field(f, "File label", &self.file_label)?; + self.debug_field(f, "Endianness", self.endian) + } +} + +impl HeaderRecord { + pub fn read(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result { + let start = r.stream_position()?; + + let magic: [u8; 4] = read_bytes(r)?; + let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; + + let eye_catcher = RawString(read_vec(r, 60)?); + let layout_code: [u8; 4] = read_bytes(r)?; + let endian = Endian::identify_u32(2, layout_code) + .or_else(|| Endian::identify_u32(2, layout_code)) + .ok_or(Error::NotASystemFile)?; + let layout_code = endian.parse(layout_code); + + let nominal_case_size: u32 = endian.parse(read_bytes(r)?); + let nominal_case_size = (1..i32::MAX as u32 / 16) + .contains(&nominal_case_size) + .then_some(nominal_case_size); + + let compression_code: u32 = endian.parse(read_bytes(r)?); + let compression = match (magic, compression_code) { + (Magic::Zsav, 2) => Some(Compression::ZLib), + (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)), + (_, 0) => None, + (_, 1) => Some(Compression::Simple), + (_, code) => return Err(Error::InvalidSavCompression(code)), + }; + + let weight_index: u32 = endian.parse(read_bytes(r)?); + let weight_index = (weight_index > 0).then_some(weight_index); + + let n_cases: u32 = endian.parse(read_bytes(r)?); + let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); + + let bias: f64 = endian.parse(read_bytes(r)?); + if bias != 100.0 && bias != 0.0 { + warn(Warning::UnexpectedBias(bias)); + } + + let creation_date = RawString(read_vec(r, 9)?); + let creation_time = RawString(read_vec(r, 8)?); + let file_label = RawString(read_vec(r, 64)?); + let _: [u8; 3] = read_bytes(r)?; + + Ok(HeaderRecord { + offsets: start..r.stream_position()?, + magic, + layout_code, + nominal_case_size, + compression, + weight_index, + n_cases, + bias, + creation_date, + creation_time, + eye_catcher, + file_label, + endian, + }) 
+ } + + pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { + let eye_catcher = decoder.decode(&self.eye_catcher).to_string(); + let file_label = decoder.decode(&self.file_label).to_string(); + let creation_date = decoder.decode(&self.creation_date).to_string(); + let creation_time = decoder.decode(&self.creation_time).to_string(); + DecodedRecord::Header(HeaderRecord { + eye_catcher, + weight_index: self.weight_index, + n_cases: self.n_cases, + file_label, + offsets: self.offsets.clone(), + magic: self.magic, + layout_code: self.layout_code, + nominal_case_size: self.nominal_case_size, + compression: self.compression, + bias: self.bias, + creation_date, + creation_time, + endian: self.endian, + }) + } +} + +/// [crate::format::Format] as represented in a system file. +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct RawFormat( + /// The most-significant 16 bits are the type, the next 8 bytes are the + /// width, and the least-significant 8 bits are the number of decimals. + pub u32, +); + +impl Debug for RawFormat { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + let type_ = format_name(self.0 >> 16); + let w = (self.0 >> 8) & 0xff; + let d = self.0 & 0xff; + write!(f, "{:06x} ({type_}{w}.{d})", self.0) + } +} + +fn format_name(type_: u32) -> Cow<'static, str> { + match type_ { + 1 => "A", + 2 => "AHEX", + 3 => "COMMA", + 4 => "DOLLAR", + 5 => "F", + 6 => "IB", + 7 => "PIBHEX", + 8 => "P", + 9 => "PIB", + 10 => "PK", + 11 => "RB", + 12 => "RBHEX", + 15 => "Z", + 16 => "N", + 17 => "E", + 20 => "DATE", + 21 => "TIME", + 22 => "DATETIME", + 23 => "ADATE", + 24 => "JDATE", + 25 => "DTIME", + 26 => "WKDAY", + 27 => "MONTH", + 28 => "MOYR", + 29 => "QYR", + 30 => "WKYR", + 31 => "PCT", + 32 => "DOT", + 33 => "CCA", + 34 => "CCB", + 35 => "CCC", + 36 => "CCD", + 37 => "CCE", + 38 => "EDATE", + 39 => "SDATE", + 40 => "MTIME", + 41 => "YMDHMS", + _ => return format!("").into(), + } + .into() +} + +impl MissingValues { + pub fn read( + r: &mut R, + offset: u64, + raw_width: RawWidth, + code: i32, + endian: Endian, + warn: &mut dyn FnMut(Warning), + ) -> Result { + let (individual_values, has_range) = match code { + 0 => return Ok(Self::default()), + 1..=3 => (code as usize, false), + -2 => (0, true), + -3 => (1, true), + _ => return Err(Error::BadMissingValueCode { offset, code }), + }; + + let mut values = Vec::with_capacity(individual_values); + let range = if has_range { + let low = read_bytes::<8, _>(r)?; + let high = read_bytes::<8, _>(r)?; + Some((low, high)) + } else { + None + }; + for _ in 0..individual_values { + values.push(read_bytes::<8, _>(r)?); + } + + match VarWidth::try_from(raw_width) { + Ok(VarWidth::Numeric) => { + let values = values + .into_iter() + .map(|v| Datum::Number(endian.parse(v))) + .collect(); + + let range = range.map(|(low, high)| { + MissingValueRange::new(endian.parse(low), endian.parse(high)) + }); + return Ok(Self::new(values, range).unwrap()); + } + Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::MissingValueStringRange), + Ok(VarWidth::String(width)) => { + let width = width.min(8) as usize; + let values = values + .into_iter() + .map(|value| Datum::String(RawString::from(&value[..width]))) + .collect(); + return Ok(Self::new(values, None).unwrap()); + } + Err(()) => warn(Warning::MissingValueContinuation(offset)), + } + Ok(Self::default()) + } +} + +#[derive(Clone)] +pub struct VariableRecord +where + S: Debug, +{ + /// Range of offsets in file. + pub offsets: Range, + + /// Variable width, in the range -1..=255. 
+ pub width: RawWidth, + + /// Variable name, padded on the right with spaces. + pub name: S, + + /// Print format. + pub print_format: RawFormat, + + /// Write format. + pub write_format: RawFormat, + + /// Missing values. + pub missing_values: MissingValues, + + /// Optional variable label. + pub label: Option, +} + +impl Debug for VariableRecord +where + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + writeln!(f, "Width: {}", self.width,)?; + writeln!(f, "Print format: {:?}", self.print_format)?; + writeln!(f, "Write format: {:?}", self.write_format)?; + writeln!(f, "Name: {:?}", &self.name)?; + writeln!(f, "Variable label: {:?}", self.label)?; + writeln!(f, "Missing values: {:?}", self.missing_values) + } +} + +impl VariableRecord { + pub fn read( + r: &mut R, + endian: Endian, + warn: &mut dyn FnMut(Warning), + ) -> Result { + let start_offset = r.stream_position()?; + let width: i32 = endian.parse(read_bytes(r)?); + let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth { + start_offset, + width, + })?; + let code_offset = r.stream_position()?; + let has_variable_label: u32 = endian.parse(read_bytes(r)?); + let missing_value_code: i32 = endian.parse(read_bytes(r)?); + let print_format = RawFormat(endian.parse(read_bytes(r)?)); + let write_format = RawFormat(endian.parse(read_bytes(r)?)); + let name = RawString(read_vec(r, 8)?); + + let label = match has_variable_label { + 0 => None, + 1 => { + let len: u32 = endian.parse(read_bytes(r)?); + let read_len = len.min(65535) as usize; + let label = RawString(read_vec(r, read_len)?); + + let padding_bytes = len.next_multiple_of(4) - len; + let _ = read_vec(r, padding_bytes as usize)?; + + Some(label) + } + _ => { + return Err(Error::BadVariableLabelCode { + start_offset, + code_offset, + code: has_variable_label, + }); + } + }; + + let missing_values = + MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?; + + let end_offset = r.stream_position()?; + + Ok(Record::Variable(VariableRecord { + offsets: start_offset..end_offset, + width, + name, + print_format, + write_format, + missing_values, + label, + })) + } + + pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { + DecodedRecord::Variable(VariableRecord { + offsets: self.offsets.clone(), + width: self.width, + name: decoder.decode(&self.name).to_string(), + print_format: self.print_format, + write_format: self.write_format, + missing_values: self.missing_values, + label: self + .label + .as_ref() + .map(|label| decoder.decode(label).to_string()), + }) + } +} + +#[derive(Clone, Debug)] +pub struct ValueLabel +where + D: Debug, + S: Debug, +{ + pub datum: D, + pub label: S, +} + +#[derive(Clone)] +pub struct ValueLabelRecord +where + D: Debug, + S: Debug, +{ + /// Range of offsets in file. + pub offsets: Range, + + /// The labels. + pub labels: Vec>, + + /// The 1-based indexes of the variable indexes. + pub dict_indexes: Vec, + + /// The types of the variables. + pub var_type: VarType, +} + +impl Debug for ValueLabelRecord +where + D: Debug, + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + writeln!(f, "labels: ")?; + for label in self.labels.iter() { + writeln!(f, "{label:?}")?; + } + write!(f, "apply to {} variables", self.var_type)?; + for dict_index in self.dict_indexes.iter() { + write!(f, " #{dict_index}")?; + } + Ok(()) + } +} + +impl ValueLabelRecord +where + D: Debug, + S: Debug, +{ + /// Maximum number of value labels in a record. 
+ pub const MAX_LABELS: u32 = u32::MAX / 8; + + /// Maximum number of variable indexes in a record. + pub const MAX_INDEXES: u32 = u32::MAX / 8; +} + +impl ValueLabelRecord { + pub fn read( + r: &mut R, + endian: Endian, + var_types: &VarTypes, + warn: &mut dyn FnMut(Warning), + ) -> Result, Error> { + let label_offset = r.stream_position()?; + let n: u32 = endian.parse(read_bytes(r)?); + if n > Self::MAX_LABELS { + return Err(Error::BadNumberOfValueLabels { + offset: label_offset, + n, + max: Self::MAX_LABELS, + }); + } + + let mut labels = Vec::new(); + for _ in 0..n { + let value = super::UntypedDatum(read_bytes(r)?); + let label_len: u8 = endian.parse(read_bytes(r)?); + let label_len = label_len as usize; + let padded_len = (label_len + 1).next_multiple_of(8); + + let mut label = read_vec(r, padded_len - 1)?; + label.truncate(label_len); + labels.push((value, RawString(label))); + } + + let index_offset = r.stream_position()?; + let rec_type: u32 = endian.parse(read_bytes(r)?); + if rec_type != 4 { + return Err(Error::ExpectedVarIndexRecord { + offset: index_offset, + rec_type, + }); + } + + let n: u32 = endian.parse(read_bytes(r)?); + if n > Self::MAX_INDEXES { + return Err(Error::TooManyVarIndexes { + offset: index_offset, + n, + max: Self::MAX_INDEXES, + }); + } else if n == 0 { + dbg!(); + warn(Warning::NoVarIndexes { + offset: index_offset, + }); + return Ok(None); + } + + let index_offset = r.stream_position()?; + let mut dict_indexes = Vec::with_capacity(n as usize); + let mut invalid_indexes = Vec::new(); + for _ in 0..n { + let index: u32 = endian.parse(read_bytes(r)?); + if var_types.is_valid_index(index as usize) { + dict_indexes.push(index); + } else { + invalid_indexes.push(index); + } + } + if !invalid_indexes.is_empty() { + warn(Warning::InvalidVarIndexes { + offset: index_offset, + max: var_types.n_values(), + invalid: invalid_indexes, + }); + } + + let Some(&first_index) = dict_indexes.first() else { + return Ok(None); + }; + let var_type = VarType::from(var_types.types[first_index as usize - 1].unwrap()); + let mut wrong_type_indexes = Vec::new(); + dict_indexes.retain(|&index| { + if var_types.types[index as usize - 1].map(VarType::from) != Some(var_type) { + wrong_type_indexes.push(index); + false + } else { + true + } + }); + if !wrong_type_indexes.is_empty() { + warn(Warning::MixedVarTypes { + offset: index_offset, + var_type, + wrong_types: wrong_type_indexes, + }); + } + + let labels = labels + .into_iter() + .map(|(value, label)| ValueLabel { + datum: RawDatum::from_raw(&value, var_type, endian), + label, + }) + .collect(); + + let end_offset = r.stream_position()?; + Ok(Some(Record::ValueLabel(ValueLabelRecord { + offsets: label_offset..end_offset, + labels, + dict_indexes, + var_type, + }))) + } + + pub fn decode(self, decoder: &mut Decoder) -> ValueLabelRecord { + let labels = self + .labels + .iter() + .map( + |ValueLabel { + datum: value, + label, + }| ValueLabel { + datum: value.clone(), + label: decoder.decode(label).to_string(), + }, + ) + .collect(); + ValueLabelRecord { + offsets: self.offsets.clone(), + labels, + dict_indexes: self.dict_indexes.clone(), + var_type: self.var_type, + } + } +} + +#[derive(Clone, Debug)] +pub struct DocumentRecord +where + S: Debug, +{ + pub offsets: Range, + + /// The document, as an array of lines. Raw lines are exactly 80 bytes long + /// and are right-padded with spaces without any new-line termination. + pub lines: Vec, +} + +pub type RawDocumentLine = RawStrArray; + +/// Length of a line in a document. 
Document lines are fixed-length and +/// padded on the right with spaces. +pub const DOC_LINE_LEN: usize = 80; + +impl DocumentRecord { + /// Maximum number of lines we will accept in a document. This is simply + /// the maximum number that will fit in a 32-bit space. + pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN; + + pub fn read(r: &mut R, endian: Endian) -> Result { + let start_offset = r.stream_position()?; + let n: u32 = endian.parse(read_bytes(r)?); + let n = n as usize; + if n > Self::MAX_LINES { + Err(Error::BadDocumentLength { + offset: start_offset, + n, + max: Self::MAX_LINES, + }) + } else { + let mut lines = Vec::with_capacity(n); + for _ in 0..n { + lines.push(RawStrArray(read_bytes(r)?)); + } + let end_offset = r.stream_position()?; + Ok(Record::Document(DocumentRecord { + offsets: start_offset..end_offset, + lines, + })) + } + } + + pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { + DecodedRecord::Document(DocumentRecord { + offsets: self.offsets.clone(), + lines: self + .lines + .iter() + .map(|s| decoder.decode_slice(&s.0).to_string()) + .collect(), + }) + } +} + +pub struct ExtensionRecord<'a> { + pub size: Option, + pub count: Option, + pub name: &'a str, +} + +#[derive(Clone, Debug)] +pub struct IntegerInfoRecord { + pub offsets: Range, + pub version: (i32, i32, i32), + pub machine_code: i32, + pub floating_point_rep: i32, + pub compression_code: i32, + pub endianness: i32, + pub character_code: i32, +} + +static INTEGER_INFO_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(4), + count: Some(8), + name: "integer record", +}; + +impl IntegerInfoRecord { + pub fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size(&INTEGER_INFO_RECORD)?; + + let mut input = &ext.data[..]; + let data: Vec = (0..8) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(Record::IntegerInfo(IntegerInfoRecord { + offsets: ext.offsets.clone(), + version: (data[0], data[1], data[2]), + machine_code: data[3], + floating_point_rep: data[4], + compression_code: data[5], + endianness: data[6], + character_code: data[7], + })) + } +} + +static FLOAT_INFO_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(8), + count: Some(3), + name: "floating point record", +}; + +impl FloatInfoRecord { + pub fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size(&FLOAT_INFO_RECORD)?; + + let mut input = &ext.data[..]; + let data: Vec = (0..3) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(Record::FloatInfo(FloatInfoRecord { + sysmis: data[0], + highest: data[1], + lowest: data[2], + })) + } +} + +#[derive(Clone, Debug)] +pub struct FloatInfoRecord { + pub sysmis: f64, + pub highest: f64, + pub lowest: f64, +} + +#[derive(Clone, Debug)] +pub struct RawLongNamesRecord(TextRecord); + +impl RawLongNamesRecord { + pub fn parse(extension: Extension) -> Result { + Ok(Record::LongNames(Self(TextRecord::parse( + extension, + "long names record", + )?))) + } + pub fn decode(self, decoder: &mut Decoder) -> LongNamesRecord { + let input = decoder.decode(&self.0.text); + let mut names = Vec::new(); + for pair in input.split('\t').filter(|s| !s.is_empty()) { + if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&mut decoder.warn) + { + names.push(long_name); + } + } + LongNamesRecord(names) + } +} + +#[derive(Clone, Debug)] +pub struct TextRecord { + pub offsets: Range, + + /// The text content of the record. 
+ pub text: RawString, +} + +impl TextRecord { + pub fn parse(extension: Extension, name: &str) -> Result { + extension.check_size(&ExtensionRecord { + size: Some(1), + count: None, + name, + })?; + Ok(Self { + offsets: extension.offsets, + text: extension.data.into(), + }) + } +} + +#[derive(Clone, Debug)] +pub struct VeryLongString { + pub short_name: Identifier, + pub length: u16, +} + +impl VeryLongString { + fn parse(decoder: &Decoder, input: &str) -> Result { + let Some((short_name, length)) = input.split_once('=') else { + return Err(Warning::VeryLongStringMissingDelimiter(input.into())); + }; + let short_name = decoder + .new_identifier(short_name) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidLongStringName)?; + let length = length + .parse() + .map_err(|_| Warning::VeryLongStringInvalidLength(input.into()))?; + Ok(VeryLongString { short_name, length }) + } +} + +#[derive(Clone, Debug)] +pub struct RawVeryLongStringsRecord(TextRecord); + +#[derive(Clone, Debug)] +pub struct VeryLongStringsRecord(pub Vec); + +impl RawVeryLongStringsRecord { + pub fn parse(extension: Extension) -> Result { + Ok(Record::VeryLongStrings(Self(TextRecord::parse( + extension, + "very long strings record", + )?))) + } + pub fn decode(self, decoder: &mut Decoder) -> VeryLongStringsRecord { + let input = decoder.decode(&self.0.text); + let mut very_long_strings = Vec::new(); + for tuple in input + .split('\0') + .map(|s| s.trim_start_matches('\t')) + .filter(|s| !s.is_empty()) + { + if let Some(vls) = + VeryLongString::parse(decoder, tuple).issue_warning(&mut decoder.warn) + { + very_long_strings.push(vls) + } + } + VeryLongStringsRecord(very_long_strings) + } +} + +#[derive(Clone, Debug)] +pub enum MultipleResponseType { + MultipleDichotomy { + value: RawString, + labels: CategoryLabels, + }, + MultipleCategory, +} + +impl MultipleResponseType { + fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> { + let (mr_type, input) = match input.split_first() { + Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input), + Some((b'D', input)) => { + let (value, input) = parse_counted_string(input)?; + ( + MultipleResponseType::MultipleDichotomy { + value, + labels: CategoryLabels::VarLabels, + }, + input, + ) + } + Some((b'E', input)) => { + let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") { + (CategoryLabels::CountedValues, rest) + } else if let Some(rest) = input.strip_prefix(b" 11 ") { + (CategoryLabels::VarLabels, rest) + } else { + return Err(Warning::InvalidMultipleDichotomyLabelType); + }; + let (value, input) = parse_counted_string(input)?; + ( + MultipleResponseType::MultipleDichotomy { value, labels }, + input, + ) + } + _ => return Err(Warning::InvalidMultipleResponseType), + }; + Ok((mr_type, input)) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet +where + I: Debug, + S: Debug, +{ + pub name: I, + pub label: S, + pub mr_type: MultipleResponseType, + pub short_names: Vec, +} + +impl MultipleResponseSet { + fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> { + let Some(equals) = input.iter().position(|&b| b == b'=') else { + return Err(Warning::MultipleResponseSyntaxError("missing `=`")); + }; + let (name, input) = input.split_at(equals); + let input = input.strip_prefix(b"=").unwrap(); + let (mr_type, input) = MultipleResponseType::parse(input)?; + let Some(input) = input.strip_prefix(b" ") else { + return Err(Warning::MultipleResponseSyntaxError( + "missing space after multiple response type", + )); + 
}; + let (label, mut input) = parse_counted_string(input)?; + let mut vars = Vec::new(); + while input.first() != Some(&b'\n') { + match input.split_first() { + Some((b' ', rest)) => { + let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else { + return Err(Warning::MultipleResponseSyntaxError( + "missing variable name delimiter", + )); + }; + let (var, rest) = rest.split_at(length); + if !var.is_empty() { + vars.push(var.into()); + } + input = rest; + } + _ => { + return Err(Warning::MultipleResponseSyntaxError( + "missing space preceding variable name", + )); + } + } + } + while input.first() == Some(&b'\n') { + input = &input[1..]; + } + Ok(( + MultipleResponseSet { + name: name.into(), + label, + mr_type, + short_names: vars, + }, + input, + )) + } + + fn decode( + &self, + decoder: &mut Decoder, + ) -> Result, Warning> { + let mut short_names = Vec::with_capacity(self.short_names.len()); + for short_name in self.short_names.iter() { + if let Some(short_name) = decoder + .decode_identifier(short_name) + .map_err(Warning::InvalidMrSetName) + .issue_warning(&mut decoder.warn) + { + short_names.push(short_name); + } + } + Ok(MultipleResponseSet { + name: decoder + .decode_identifier(&self.name) + .map_err(Warning::InvalidMrSetVariableName)?, + label: decoder.decode(&self.label).to_string(), + mr_type: self.mr_type.clone(), + short_names, + }) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseRecord(pub Vec>) +where + I: Debug, + S: Debug; + +static MULTIPLE_RESPONSE_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(1), + count: None, + name: "multiple response set record", +}; + +impl MultipleResponseRecord { + fn parse(ext: &Extension, _endian: Endian) -> Result { + ext.check_size(&MULTIPLE_RESPONSE_RECORD)?; + + let mut input = &ext.data[..]; + let mut sets = Vec::new(); + loop { + while let Some(suffix) = input.strip_prefix(b"\n") { + input = suffix; + } + if input.is_empty() { + break; + } + let (set, rest) = MultipleResponseSet::parse(input)?; + sets.push(set); + input = rest; + } + Ok(Record::MultipleResponse(MultipleResponseRecord(sets))) + } +} + +impl MultipleResponseRecord { + pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { + let mut sets = Vec::new(); + for set in self.0.iter() { + if let Some(set) = set.decode(decoder).issue_warning(&mut decoder.warn) { + sets.push(set); + } + } + DecodedRecord::MultipleResponse(MultipleResponseRecord(sets)) + } +} + +fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> { + let Some(space) = input.iter().position(|&b| b == b' ') else { + return Err(Warning::CountedStringMissingSpace); + }; + let Ok(length) = from_utf8(&input[..space]) else { + return Err(Warning::CountedStringInvalidUTF8); + }; + let Ok(length): Result = length.parse() else { + return Err(Warning::CountedStringInvalidLength(length.into())); + }; + + let Some((string, rest)) = input[space + 1..].split_at_checked(length) else { + return Err(Warning::CountedStringTooLong(length)); + }; + Ok((string.into(), rest)) +} + +impl Measure { + fn try_decode(source: u32) -> Result, Warning> { + match source { + 0 => Ok(None), + 1 => Ok(Some(Measure::Nominal)), + 2 => Ok(Some(Measure::Ordinal)), + 3 => Ok(Some(Measure::Scale)), + _ => Err(Warning::InvalidMeasurement(source)), + } + } +} + +impl Alignment { + fn try_decode(source: u32) -> Result, Warning> { + match source { + 0 => Ok(Some(Alignment::Left)), + 1 => Ok(Some(Alignment::Right)), + 2 => Ok(Some(Alignment::Center)), + _ => 
Err(Warning::InvalidAlignment(source)), + } + } +} + +#[derive(Clone, Debug)] +pub struct VarDisplay { + pub measure: Option, + pub width: Option, + pub alignment: Option, +} + +#[derive(Clone, Debug)] +pub struct VarDisplayRecord(pub Vec); + +impl VarDisplayRecord { + fn parse( + ext: &Extension, + var_types: &VarTypes, + endian: Endian, + warn: &mut dyn FnMut(Warning), + ) -> Result { + if ext.size != 4 { + return Err(Warning::BadRecordSize { + offset: ext.offsets.start, + record: String::from("variable display record"), + size: ext.size, + expected_size: 4, + }); + } + + let n_vars = var_types.n_vars(); + let has_width = if ext.count as usize == 3 * n_vars { + true + } else if ext.count as usize == 2 * n_vars { + false + } else { + return Err(Warning::InvalidVariableDisplayCount { + count: ext.count as usize, + first: 2 * n_vars, + second: 3 * n_vars, + }); + }; + + let mut var_displays = Vec::new(); + let mut input = &ext.data[..]; + for _ in 0..n_vars { + let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap())) + .issue_warning(warn) + .flatten(); + let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap())); + let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap())) + .issue_warning(warn) + .flatten(); + var_displays.push(VarDisplay { + measure, + width, + alignment, + }); + } + Ok(Record::VarDisplay(VarDisplayRecord(var_displays))) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValues +where + N: Debug, +{ + /// Variable name. + pub var_name: N, + + /// Missing values. + pub missing_values: Vec>, +} + +impl LongStringMissingValues { + fn decode( + &self, + decoder: &mut Decoder, + ) -> Result, IdError> { + Ok(LongStringMissingValues { + var_name: decoder.decode_identifier(&self.var_name)?, + missing_values: self.missing_values.clone(), + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValueRecord(pub Vec>) +where + N: Debug; + +static LONG_STRING_MISSING_VALUE_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(1), + count: None, + name: "long string missing values record", +}; + +impl LongStringMissingValueRecord { + fn parse( + ext: &Extension, + endian: Endian, + warn: &mut dyn FnMut(Warning), + ) -> Result { + ext.check_size(&LONG_STRING_MISSING_VALUE_RECORD)?; + + let mut input = &ext.data[..]; + let mut missing_value_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + dbg!(&var_name); + let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?); + let value_len: u32 = endian.parse(read_bytes(&mut input)?); + if value_len != 8 { + let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start; + warn(Warning::BadLongMissingValueLength { + record_offset: ext.offsets.start, + offset, + value_len, + }); + read_vec( + &mut input, + dbg!(value_len as usize * n_missing_values as usize), + )?; + continue; + } + let mut missing_values = Vec::new(); + for i in 0..n_missing_values { + if i > 0 { + // Tolerate files written by old, buggy versions of PSPP + // where we believed that the value_length was repeated + // before each missing value. 
+ let mut peek = input; + let number: u32 = endian.parse(read_bytes(&mut peek)?); + if number == 8 { + input = peek; + } + } + + let value: [u8; 8] = read_bytes(&mut input)?; + missing_values.push(RawStrArray(value)); + } + missing_value_set.push(LongStringMissingValues { + var_name, + missing_values, + }); + } + Ok(Record::LongStringMissingValues( + LongStringMissingValueRecord(missing_value_set), + )) + } +} + +impl LongStringMissingValueRecord { + pub fn decode(self, decoder: &mut Decoder) -> LongStringMissingValueRecord { + let mut mvs = Vec::with_capacity(self.0.len()); + for mv in self.0.iter() { + if let Some(mv) = mv + .decode(decoder) + .map_err(Warning::InvalidLongStringMissingValueVariableName) + .issue_warning(&mut decoder.warn) + { + mvs.push(mv); + } + } + LongStringMissingValueRecord(mvs) + } +} + +#[derive(Clone, Debug)] +pub struct EncodingRecord(pub String); + +static ENCODING_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(1), + count: None, + name: "encoding record", +}; + +impl EncodingRecord { + fn parse(ext: &Extension, _endian: Endian) -> Result { + ext.check_size(&ENCODING_RECORD)?; + + Ok(Record::Encoding(EncodingRecord( + String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName { + offset: ext.offsets.start, + })?, + ))) + } +} + +#[derive(Clone, Debug)] +pub struct NumberOfCasesRecord { + /// Always observed as 1. + pub one: u64, + + /// Number of cases. + pub n_cases: u64, +} + +static NUMBER_OF_CASES_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(8), + count: Some(2), + name: "extended number of cases record", +}; + +impl NumberOfCasesRecord { + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size(&NUMBER_OF_CASES_RECORD)?; + + let mut input = &ext.data[..]; + let one = endian.parse(read_bytes(&mut input)?); + let n_cases = endian.parse(read_bytes(&mut input)?); + + Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases })) + } +} + +#[derive(Clone, Debug)] +pub struct RawVariableSetRecord(TextRecord); + +impl RawVariableSetRecord { + fn parse(extension: Extension) -> Result { + Ok(Record::VariableSets(Self(TextRecord::parse( + extension, + "variable sets record", + )?))) + } + pub fn decode(self, decoder: &mut Decoder) -> VariableSetRecord { + let mut sets = Vec::new(); + let input = decoder.decode(&self.0.text); + for line in input.lines() { + if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&mut decoder.warn) { + sets.push(set) + } + } + VariableSetRecord { + offsets: self.0.offsets, + sets, + } + } +} + +#[derive(Clone, Debug)] +pub struct RawProductInfoRecord(TextRecord); + +impl RawProductInfoRecord { + fn parse(extension: Extension) -> Result { + Ok(Record::ProductInfo(Self(TextRecord::parse( + extension, + "product info record", + )?))) + } + pub fn decode(self, decoder: &mut Decoder) -> ProductInfoRecord { + ProductInfoRecord(decoder.decode(&self.0.text).into()) + } +} + +#[derive(Clone, Debug)] +pub struct Attribute { + pub name: Identifier, + pub values: Vec, +} + +impl Attribute { + fn parse<'a>(decoder: &mut Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> { + let Some((name, mut input)) = input.split_once('(') else { + return Err(Warning::AttributeMissingLParen(input.into())); + }; + let name = decoder + .new_identifier(name) + .map_err(Warning::InvalidAttributeName)?; + let mut values = Vec::new(); + loop { + let Some((value, rest)) = input.split_once('\n') else { + return Err(Warning::AttributeMissingValue { + name: name.clone(), + index: values.len(), 
+ }); + }; + if let Some(stripped) = value + .strip_prefix('\'') + .and_then(|value| value.strip_suffix('\'')) + { + values.push(stripped.into()); + } else { + decoder.warn(Warning::AttributeMissingQuotes { + name: name.clone(), + index: values.len(), + }); + values.push(value.into()); + } + if let Some(rest) = rest.strip_prefix(')') { + let attribute = Attribute { name, values }; + return Ok((attribute, rest)); + }; + input = rest; + } + } +} + +impl Attributes { + fn parse<'a>( + decoder: &mut Decoder, + mut input: &'a str, + sentinel: Option, + ) -> Result<(Attributes, &'a str, Vec), Warning> { + let mut attributes = BTreeMap::new(); + let mut duplicates = Vec::new(); + let rest = loop { + match input.chars().next() { + None => break input, + c if c == sentinel => break &input[1..], + _ => { + let (attribute, rest) = Attribute::parse(decoder, input)?; + if attributes.contains_key(&attribute.name) { + duplicates.push(attribute.name.clone()); + } + attributes.insert(attribute.name, attribute.values); + input = rest; + } + } + }; + Ok((Attributes(attributes), rest, duplicates)) + } +} + +#[derive(Clone, Debug)] +pub struct RawFileAttributesRecord(TextRecord); + +#[derive(Clone, Debug, Default)] +pub struct FileAttributesRecord(pub Attributes); + +impl RawFileAttributesRecord { + fn parse(extension: Extension) -> Result { + Ok(Record::FileAttributes(Self(TextRecord::parse( + extension, + "file attributes record", + )?))) + } + pub fn decode(self, decoder: &mut Decoder) -> FileAttributesRecord { + let input = decoder.decode(&self.0.text); + match Attributes::parse(decoder, &input, None).issue_warning(&mut decoder.warn) { + Some((set, rest, duplicates)) => { + if !duplicates.is_empty() { + decoder.warn(Warning::DuplicateFileAttributes { + attributes: duplicates, + }); + } + if !rest.is_empty() { + decoder.warn(dbg!(Warning::TBD)); + } + FileAttributesRecord(set) + } + None => FileAttributesRecord::default(), + } + } +} + +#[derive(Clone, Debug)] +pub struct VarAttributes { + pub long_var_name: Identifier, + pub attributes: Attributes, +} + +impl VarAttributes { + fn parse<'a>( + decoder: &mut Decoder, + input: &'a str, + ) -> Result<(VarAttributes, &'a str), Warning> { + let Some((long_var_name, rest)) = input.split_once(':') else { + return Err(dbg!(Warning::TBD)); + }; + let long_var_name = decoder + .new_identifier(long_var_name) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidAttributeVariableName)?; + let (attributes, rest, duplicates) = Attributes::parse(decoder, rest, Some('/'))?; + if !duplicates.is_empty() { + decoder.warn(Warning::DuplicateVariableAttributes { + variable: long_var_name.clone(), + attributes: duplicates, + }); + } + let var_attribute = VarAttributes { + long_var_name, + attributes, + }; + Ok((var_attribute, rest)) + } +} + +#[derive(Clone, Debug)] +pub struct RawVariableAttributesRecord(TextRecord); + +#[derive(Clone, Debug)] +pub struct VariableAttributesRecord(pub Vec); + +impl RawVariableAttributesRecord { + fn parse(extension: Extension) -> Result { + Ok(Record::VariableAttributes(Self(TextRecord::parse( + extension, + "variable attributes record", + )?))) + } + pub fn decode(self, decoder: &mut Decoder) -> VariableAttributesRecord { + let decoded = decoder.decode(&self.0.text); + let mut input = decoded.as_ref(); + let mut var_attribute_sets = Vec::new(); + while !input.is_empty() { + let Some((var_attribute, rest)) = + VarAttributes::parse(decoder, input).issue_warning(&mut decoder.warn) + else { + break; + }; + 
var_attribute_sets.push(var_attribute); + input = rest; + } + VariableAttributesRecord(var_attribute_sets) + } +} + +#[derive(Clone, Debug)] +pub struct LongName { + pub short_name: Identifier, + pub long_name: Identifier, +} + +impl LongName { + fn parse(input: &str, decoder: &Decoder) -> Result { + let Some((short_name, long_name)) = input.split_once('=') else { + return Err(dbg!(Warning::LongNameMissingEquals)); + }; + let short_name = decoder + .new_identifier(short_name) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidShortName)?; + let long_name = decoder + .new_identifier(long_name) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidLongName)?; + Ok(LongName { + short_name, + long_name, + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongNamesRecord(pub Vec); + +#[derive(Clone, Debug)] +pub struct ProductInfoRecord(pub String); + +#[derive(Clone, Debug)] +pub struct VariableSet { + pub name: String, + pub variable_names: Vec, +} + +impl VariableSet { + fn parse(input: &str, decoder: &mut Decoder) -> Result { + let (name, input) = input + .split_once('=') + .ok_or(Warning::VariableSetMissingEquals)?; + let mut vars = Vec::new(); + for var in input.split_ascii_whitespace() { + if let Some(identifier) = decoder + .new_identifier(var) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidVariableSetName) + .issue_warning(&mut decoder.warn) + { + vars.push(identifier); + } + } + Ok(VariableSet { + name: name.to_string(), + variable_names: vars, + }) + } +} + +#[derive(Clone, Debug)] +pub struct VariableSetRecord { + pub offsets: Range, + pub sets: Vec, +} + +trait IssueWarning { + fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option; +} +impl IssueWarning for Result { + fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option { + match self { + Ok(result) => Some(result), + Err(error) => { + warn(error); + None + } + } + } +} + +#[derive(Clone, Debug)] +pub struct Extension { + pub offsets: Range, + + /// Record subtype. + pub subtype: u32, + + /// Size of each data element. + pub size: u32, + + /// Number of data elements. + pub count: u32, + + /// `size * count` bytes of data. 
+ pub data: Vec, +} + +impl Extension { + pub fn check_size(&self, expected: &ExtensionRecord) -> Result<(), Warning> { + match expected.size { + Some(expected_size) if self.size != expected_size => { + return Err(Warning::BadRecordSize { + offset: self.offsets.start, + record: expected.name.into(), + size: self.size, + expected_size, + }); + } + _ => (), + } + match expected.count { + Some(expected_count) if self.count != expected_count => { + return Err(Warning::BadRecordCount { + offset: self.offsets.start, + record: expected.name.into(), + count: self.count, + expected_count, + }); + } + _ => (), + } + Ok(()) + } + + pub fn read( + r: &mut R, + endian: Endian, + var_types: &VarTypes, + warn: &mut dyn FnMut(Warning), + ) -> Result, Error> { + let subtype = endian.parse(read_bytes(r)?); + let header_offset = r.stream_position()?; + let size: u32 = endian.parse(read_bytes(r)?); + let count = endian.parse(read_bytes(r)?); + let Some(product) = size.checked_mul(count) else { + return Err(Error::ExtensionRecordTooLarge { + offset: header_offset, + subtype, + size, + count, + }); + }; + let start_offset = r.stream_position()?; + let data = read_vec(r, product as usize)?; + let end_offset = start_offset + product as u64; + let extension = Extension { + offsets: start_offset..end_offset, + subtype, + size, + count, + data, + }; + let result = match subtype { + 3 => IntegerInfoRecord::parse(&extension, endian), + 4 => FloatInfoRecord::parse(&extension, endian), + 11 => VarDisplayRecord::parse(&extension, var_types, endian, warn), + 7 | 19 => MultipleResponseRecord::parse(&extension, endian), + 21 => LongStringValueLabelRecord::parse(&extension, endian), + 22 => LongStringMissingValueRecord::parse(&extension, endian, warn), + 20 => EncodingRecord::parse(&extension, endian), + 16 => NumberOfCasesRecord::parse(&extension, endian), + 5 => RawVariableSetRecord::parse(extension), + 10 => RawProductInfoRecord::parse(extension), + 13 => RawLongNamesRecord::parse(extension), + 14 => RawVeryLongStringsRecord::parse(extension), + 17 => RawFileAttributesRecord::parse(extension), + 18 => RawVariableAttributesRecord::parse(extension), + _ => Ok(Record::OtherExtension(extension)), + }; + match result { + Ok(result) => Ok(Some(result)), + Err(error) => { + warn(error); + Ok(None) + } + } + } +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabels +where + S: Debug, +{ + pub var_name: N, + pub width: u32, + + /// `(value, label)` pairs, where each value is `width` bytes. 
+ pub labels: Vec<(RawString, S)>, +} + +impl LongStringValueLabels { + fn decode( + &self, + decoder: &mut Decoder, + ) -> Result, Warning> { + let var_name = decoder.decode(&self.var_name); + let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding) + .map_err(Warning::InvalidLongStringValueLabelName)?; + + let mut labels = Vec::with_capacity(self.labels.len()); + for (value, label) in self.labels.iter() { + let label = decoder.decode(label).to_string(); + labels.push((value.clone(), label)); + } + + Ok(LongStringValueLabels { + var_name, + width: self.width, + labels, + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabelRecord(pub Vec>) +where + N: Debug, + S: Debug; + +static LONG_STRING_VALUE_LABEL_RECORD: ExtensionRecord = ExtensionRecord { + size: Some(1), + count: None, + name: "long string value labels record", +}; + +impl LongStringValueLabelRecord { + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size(&LONG_STRING_VALUE_LABEL_RECORD)?; + + let mut input = &ext.data[..]; + let mut label_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let width: u32 = endian.parse(read_bytes(&mut input)?); + let n_labels: u32 = endian.parse(read_bytes(&mut input)?); + let mut labels = Vec::new(); + for _ in 0..n_labels { + let value = read_string(&mut input, endian)?; + let label = read_string(&mut input, endian)?; + labels.push((value, label)); + } + label_set.push(LongStringValueLabels { + var_name, + width, + labels, + }) + } + Ok(Record::LongStringValueLabels(LongStringValueLabelRecord( + label_set, + ))) + } +} + +impl LongStringValueLabelRecord { + pub fn decode(self, decoder: &mut Decoder) -> LongStringValueLabelRecord { + let mut labels = Vec::with_capacity(self.0.len()); + for label in &self.0 { + match label.decode(decoder) { + Ok(set) => labels.push(set), + Err(error) => decoder.warn(error), + } + } + LongStringValueLabelRecord(labels) + } +} + +#[derive(Clone, Debug)] +pub struct ZHeader { + /// File offset to the start of the record. + pub offset: u64, + + /// File offset to the ZLIB data header. + pub zheader_offset: u64, + + /// File offset to the ZLIB trailer. + pub ztrailer_offset: u64, + + /// Length of the ZLIB trailer in bytes. + pub ztrailer_len: u64, +} + +impl ZHeader { + pub fn read(r: &mut R, endian: Endian) -> Result { + let offset = r.stream_position()?; + let zheader_offset: u64 = endian.parse(read_bytes(r)?); + let ztrailer_offset: u64 = endian.parse(read_bytes(r)?); + let ztrailer_len: u64 = endian.parse(read_bytes(r)?); + + if zheader_offset != offset { + return Err(Error::UnexpectedZHeaderOffset { + actual: zheader_offset, + expected: offset, + }); + } + + if ztrailer_offset < offset { + return Err(Error::ImpossibleZTrailerOffset(ztrailer_offset)); + } + + if ztrailer_len < 24 || ztrailer_len % 24 != 0 { + return Err(Error::InvalidZTrailerLength(ztrailer_len)); + } + + Ok(ZHeader { + offset, + zheader_offset, + ztrailer_offset, + ztrailer_len, + }) + } +} + +#[derive(Clone, Debug)] +pub struct ZTrailer { + /// File offset to the start of the record. + pub offset: u64, + + /// Compression bias as a negative integer, e.g. -100. + pub int_bias: i64, + + /// Always observed as zero. + pub zero: u64, + + /// Uncompressed size of each block, except possibly the last. Only + /// `0x3ff000` has been observed so far. + pub block_size: u32, + + /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them. 
+ pub blocks: Vec, +} + +#[derive(Clone, Debug)] +pub struct ZBlock { + /// Offset of block of data if simple compression were used. + pub uncompressed_ofs: u64, + + /// Actual offset within the file of the compressed data block. + pub compressed_ofs: u64, + + /// The number of bytes in this data block after decompression. This is + /// `block_size` in every data block but the last, which may be smaller. + pub uncompressed_size: u32, + + /// The number of bytes in this data block, as stored compressed in this + /// file. + pub compressed_size: u32, +} + +impl ZBlock { + fn read(r: &mut R, endian: Endian) -> Result { + Ok(ZBlock { + uncompressed_ofs: endian.parse(read_bytes(r)?), + compressed_ofs: endian.parse(read_bytes(r)?), + uncompressed_size: endian.parse(read_bytes(r)?), + compressed_size: endian.parse(read_bytes(r)?), + }) + } +} + +impl ZTrailer { + pub fn read( + reader: &mut R, + endian: Endian, + bias: f64, + zheader: &ZHeader, + warn: &mut dyn FnMut(Warning), + ) -> Result, Error> { + let start_offset = reader.stream_position()?; + if reader + .seek(SeekFrom::Start(zheader.ztrailer_offset)) + .is_err() + { + return Ok(None); + } + let int_bias = endian.parse(read_bytes(reader)?); + if int_bias as f64 != -bias { + return Err(Error::WrongZlibTrailerBias { + actual: int_bias, + expected: -bias, + }); + } + let zero = endian.parse(read_bytes(reader)?); + if zero != 0 { + return Err(Error::WrongZlibTrailerZero(zero)); + } + let block_size = endian.parse(read_bytes(reader)?); + if block_size != 0x3ff000 { + return Err(Error::WrongZlibTrailerBlockSize(block_size)); + } + let n_blocks: u32 = endian.parse(read_bytes(reader)?); + let expected_n_blocks = (zheader.ztrailer_len - 24) / 24; + if n_blocks as u64 != expected_n_blocks { + return Err(Error::BadZlibTrailerNBlocks { + offset: zheader.ztrailer_offset, + n_blocks, + expected_n_blocks, + ztrailer_len: zheader.ztrailer_len, + }); + } + let blocks = (0..n_blocks) + .map(|_| ZBlock::read(reader, endian)) + .collect::, _>>()?; + + let mut expected_uncmp_ofs = zheader.zheader_offset; + let mut expected_cmp_ofs = zheader.zheader_offset + 24; + for (index, block) in blocks.iter().enumerate() { + if block.uncompressed_ofs != expected_uncmp_ofs { + return Err(Error::ZlibTrailerBlockWrongUncmpOfs { + index, + actual: block.uncompressed_ofs, + expected: expected_cmp_ofs, + }); + } + if block.compressed_ofs != expected_cmp_ofs { + return Err(Error::ZlibTrailerBlockWrongCmpOfs { + index, + actual: block.compressed_ofs, + expected: expected_cmp_ofs, + }); + } + if index < blocks.len() - 1 { + if block.uncompressed_size != block_size { + warn(Warning::ZlibTrailerBlockWrongSize { + index, + actual: block.uncompressed_size, + expected: block_size, + }); + } + } else { + if block.uncompressed_size > block_size { + warn(Warning::ZlibTrailerBlockTooBig { + index, + actual: block.uncompressed_size, + max_expected: block_size, + }); + } + } + // http://www.zlib.net/zlib_tech.html says that the maximum + // expansion from compression, with worst-case parameters, is 13.5% + // plus 11 bytes. This code checks for an expansion of more than + // 14.3% plus 11 bytes. 
+ if block.compressed_size > block.uncompressed_size + block.uncompressed_size / 7 + 11 { + return Err(Error::ZlibExpansion { + index, + compressed_size: block.compressed_size, + uncompressed_size: block.uncompressed_size, + }); + } + + expected_cmp_ofs += block.compressed_size as u64; + expected_uncmp_ofs += block.uncompressed_size as u64; + } + + if expected_cmp_ofs != zheader.ztrailer_offset { + return Err(Error::ZlibTrailerOffsetInconsistency { + descriptors: expected_cmp_ofs, + zheader: zheader.ztrailer_offset, + }); + } + + reader.seek(SeekFrom::Start(start_offset))?; + Ok(Some(ZTrailer { + offset: zheader.ztrailer_offset, + int_bias, + zero, + block_size, + blocks, + })) + } +} -- 2.30.2