collections::{BTreeMap, BTreeSet, HashMap, HashSet},
fmt::{Debug, Display, Formatter, Result as FmtResult},
hash::Hash,
- ops::{Bound, RangeBounds, RangeInclusive},
+ ops::{Bound, Not, RangeBounds, RangeInclusive},
str::FromStr,
};
identifier::{ByIdentifier, HasIdentifier, Identifier},
output::pivot::{Axis3, Dimension, Footnote, Footnotes, Group, PivotTable, Value},
settings::Show,
- sys::raw::{CategoryLabels, RawString, VarType},
+ sys::raw::RawString,
};
/// An index within [Dictionary::variables].
pub type DictIndex = usize;
+/// Variable type.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum VarType {
+ /// A numeric variable.
+ Numeric,
+
+ /// A string variable.
+ String,
+}
+
+/// `!var_type` yields the opposite type: numeric <-> string.
+impl Not for VarType {
+ type Output = Self;
+
+ fn not(self) -> Self::Output {
+ match self {
+ Self::Numeric => Self::String,
+ Self::String => Self::Numeric,
+ }
+ }
+}
+
+/// `!&var_type` delegates to the by-value [`Not`] impl above.
+impl Not for &VarType {
+ type Output = VarType;
+
+ fn not(self) -> Self::Output {
+ !*self
+ }
+}
+
+/// Displays as the lowercase word `numeric` or `string`.
+impl Display for VarType {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ match self {
+ VarType::Numeric => write!(f, "numeric"),
+ VarType::String => write!(f, "string"),
+ }
+ }
+}
+
/// [VarType], plus a width for [VarType::String].
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VarWidth {
}
#[derive(Clone)]
-pub enum Datum<S = RawString> {
+pub enum Datum {
Number(Option<f64>),
- String(S),
+ String(RawString),
}
-impl<S> Debug for Datum<S>
-where
- S: Debug,
-{
+impl Debug for Datum {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
match self {
Datum::Number(Some(number)) => write!(f, "{number:?}"),
}
}
+/// Source of category labels in a multiple-dichotomy set
+/// (carried by [MultipleResponseType::MultipleDichotomy]).
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum CategoryLabels {
+ /// Take labels from the variables' own labels.
+ VarLabels,
+ /// Take labels from the counted values' labels.
+ CountedValues,
+}
+
#[cfg(test)]
mod test {
use std::collections::HashSet;
use unicode_width::UnicodeWidthStr;
use crate::{
- dictionary::{Datum, VarWidth},
- sys::raw::{self, RawString, VarType},
+ dictionary::{Datum, VarType, VarWidth},
+ sys::raw::{self, RawString},
};
mod display;
}
}
-impl TryFrom<raw::RawFormat> for UncheckedFormat {
+impl TryFrom<raw::records::RawFormat> for UncheckedFormat {
type Error = Error;
- fn try_from(raw: raw::RawFormat) -> Result<Self, Self::Error> {
+ fn try_from(raw: raw::records::RawFormat) -> Result<Self, Self::Error> {
let raw = raw.0;
let raw_format = (raw >> 16) as u16;
let format = raw_format.try_into()?;
use tlo::parse_tlo;
use crate::{
- dictionary::{Datum, Variable},
+ dictionary::{Datum, VarType, Variable},
format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat},
settings::{Settings, Show},
- sys::raw::VarType,
};
pub mod output;
use crate::{
calendar::date_time_to_pspp,
dictionary::{
- Datum, Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet, MultipleResponseType, VarWidth, Variable, VariableSet
+ Datum, Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet,
+ MultipleResponseType, VarWidth, Variable, VariableSet,
},
endian::Endian,
format::{Error as FormatError, Format, UncheckedFormat},
sys::{
encoding::Error as EncodingError,
raw::{
- self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension,
- FileAttributesRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName,
- LongNamesRecord, LongStringMissingValueRecord, LongStringValueLabelRecord,
- MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawDatum, RawString,
- RawWidth, ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributesRecord,
- VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer,
+ self,
+ records::{
+ Compression, DocumentRecord, EncodingRecord, Extension, FileAttributesRecord,
+ FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord,
+ LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
+ NumberOfCasesRecord, ProductInfoRecord, RawFormat, ValueLabel, ValueLabelRecord,
+ VarDisplayRecord, VariableAttributesRecord, VariableRecord, VariableSetRecord,
+ VeryLongStringsRecord, ZHeader, ZTrailer,
+ },
+ Cases, DecodedRecord, RawDatum, RawString, RawWidth,
},
},
};
use itertools::Itertools;
use thiserror::Error as ThisError;
-pub use crate::sys::raw::{CategoryLabels, Compression};
-
#[derive(ThisError, Clone, Debug)]
pub enum Error {
#[error("Missing header record")]
impl MultipleResponseSet {
fn decode(
dictionary: &Dictionary,
- input: &raw::MultipleResponseSet<Identifier, String>,
+ input: &raw::records::MultipleResponseSet<Identifier, String>,
warn: &mut impl FnMut(Error),
) -> Result<Self, Error> {
let mr_set_name = input.name.clone();
}
fn decode_format(
- raw: raw::RawFormat,
+ raw: RawFormat,
width: VarWidth,
mut warn: impl FnMut(Format, FormatError),
) -> Format {
impl MultipleResponseType {
fn decode(
mr_set: &Identifier,
- input: &raw::MultipleResponseType,
+ input: &raw::records::MultipleResponseType,
min_width: VarWidth,
) -> Result<Self, Error> {
match input {
- raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
+ raw::records::MultipleResponseType::MultipleDichotomy { value, labels } => {
let value = match min_width {
VarWidth::Numeric => {
let string = String::from_utf8_lossy(&value.0);
labels: *labels,
})
}
- raw::MultipleResponseType::MultipleCategory => {
+ raw::records::MultipleResponseType::MultipleCategory => {
Ok(MultipleResponseType::MultipleCategory)
}
}
//! raw details. Most readers will want to use higher-level interfaces.
use crate::{
- dictionary::{
- Alignment, Attributes, Datum, Measure, MissingValueRange, MissingValues, VarWidth,
- },
+ dictionary::{Datum, VarType, VarWidth},
endian::{Endian, Parse, ToBytes},
format::DisplayPlainF64,
identifier::{Error as IdError, Identifier},
- sys::encoding::{default_encoding, get_encoding, Error as EncodingError},
+ sys::{
+ encoding::{default_encoding, get_encoding, Error as EncodingError},
+ raw::records::{
+ Compression, DocumentRecord, EncodingRecord, Extension, FileAttributesRecord,
+ FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
+ LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
+ NumberOfCasesRecord, ProductInfoRecord, RawDocumentLine, RawFileAttributesRecord,
+ RawLongNamesRecord, RawProductInfoRecord, RawVariableAttributesRecord,
+ RawVariableSetRecord, RawVeryLongStringsRecord, ValueLabelRecord, VarDisplayRecord,
+ VariableAttributesRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord,
+ ZHeader, ZTrailer,
+ },
+ },
};
use encoding_rs::{mem::decode_latin1, Encoding};
use flate2::read::ZlibDecoder;
use itertools::Itertools;
-use num::Integer;
use smallvec::SmallVec;
use std::{
borrow::{Borrow, Cow},
cell::RefCell,
- collections::{BTreeMap, VecDeque},
+ collections::VecDeque,
fmt::{Debug, Display, Formatter, Result as FmtResult},
io::{empty, Error as IoError, Read, Seek, SeekFrom},
iter::repeat_n,
mem::take,
num::NonZeroU8,
- ops::{Deref, Not, Range},
+ ops::Deref,
str::from_utf8,
};
use thiserror::Error as ThisError;
+pub mod records;
+
/// An error encountered reading raw system file records.
///
/// Any error prevents reading further data from the system file.
from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
}
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Compression {
- Simple,
- ZLib,
-}
-
-#[derive(Clone)]
-pub struct HeaderRecord<S>
-where
- S: Debug,
-{
- /// Offset in file.
- pub offsets: Range<u64>,
-
- /// Magic number.
- pub magic: Magic,
-
- /// Eye-catcher string, product name, in the file's encoding. Padded
- /// on the right with spaces.
- pub eye_catcher: S,
-
- /// Layout code, normally either 2 or 3.
- pub layout_code: u32,
-
- /// Number of variable positions, or `None` if the value in the file is
- /// questionably trustworthy.
- pub nominal_case_size: Option<u32>,
-
- /// Compression type, if any,
- pub compression: Option<Compression>,
-
- /// 1-based variable index of the weight variable, or `None` if the file is
- /// unweighted.
- pub weight_index: Option<u32>,
-
- /// Claimed number of cases, if known.
- pub n_cases: Option<u32>,
-
- /// Compression bias, usually 100.0.
- pub bias: f64,
-
- /// `dd mmm yy` in the file's encoding.
- pub creation_date: S,
-
- /// `HH:MM:SS` in the file's encoding.
- pub creation_time: S,
-
- /// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: S,
-
- /// Endianness of the data in the file header.
- pub endian: Endian,
-}
-
-impl<S> HeaderRecord<S>
-where
- S: Debug,
-{
- fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
- where
- T: Debug,
- {
- writeln!(f, "{name:>17}: {:?}", value)
- }
-}
-
-impl<S> Debug for HeaderRecord<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "File header record:")?;
- self.debug_field(f, "Magic", self.magic)?;
- self.debug_field(f, "Product name", &self.eye_catcher)?;
- self.debug_field(f, "Layout code", self.layout_code)?;
- self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
- self.debug_field(f, "Compression", self.compression)?;
- self.debug_field(f, "Weight index", self.weight_index)?;
- self.debug_field(f, "Number of cases", self.n_cases)?;
- self.debug_field(f, "Compression bias", self.bias)?;
- self.debug_field(f, "Creation date", &self.creation_date)?;
- self.debug_field(f, "Creation time", &self.creation_time)?;
- self.debug_field(f, "File label", &self.file_label)?;
- self.debug_field(f, "Endianness", self.endian)
- }
-}
-
-impl HeaderRecord<RawString> {
- fn read<R: Read + Seek>(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result<Self, Error> {
- let start = r.stream_position()?;
-
- let magic: [u8; 4] = read_bytes(r)?;
- let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
-
- let eye_catcher = RawString(read_vec(r, 60)?);
- let layout_code: [u8; 4] = read_bytes(r)?;
- let endian = Endian::identify_u32(2, layout_code)
- .or_else(|| Endian::identify_u32(2, layout_code))
- .ok_or(Error::NotASystemFile)?;
- let layout_code = endian.parse(layout_code);
-
- let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
- let nominal_case_size = (1..i32::MAX as u32 / 16)
- .contains(&nominal_case_size)
- .then_some(nominal_case_size);
-
- let compression_code: u32 = endian.parse(read_bytes(r)?);
- let compression = match (magic, compression_code) {
- (Magic::Zsav, 2) => Some(Compression::ZLib),
- (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
- (_, 0) => None,
- (_, 1) => Some(Compression::Simple),
- (_, code) => return Err(Error::InvalidSavCompression(code)),
- };
-
- let weight_index: u32 = endian.parse(read_bytes(r)?);
- let weight_index = (weight_index > 0).then_some(weight_index);
-
- let n_cases: u32 = endian.parse(read_bytes(r)?);
- let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
-
- let bias: f64 = endian.parse(read_bytes(r)?);
- if bias != 100.0 && bias != 0.0 {
- warn(Warning::UnexpectedBias(bias));
- }
-
- let creation_date = RawString(read_vec(r, 9)?);
- let creation_time = RawString(read_vec(r, 8)?);
- let file_label = RawString(read_vec(r, 64)?);
- let _: [u8; 3] = read_bytes(r)?;
-
- Ok(HeaderRecord {
- offsets: start..r.stream_position()?,
- magic,
- layout_code,
- nominal_case_size,
- compression,
- weight_index,
- n_cases,
- bias,
- creation_date,
- creation_time,
- eye_catcher,
- file_label,
- endian,
- })
- }
-
- pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
- let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
- let file_label = decoder.decode(&self.file_label).to_string();
- let creation_date = decoder.decode(&self.creation_date).to_string();
- let creation_time = decoder.decode(&self.creation_time).to_string();
- DecodedRecord::Header(HeaderRecord {
- eye_catcher,
- weight_index: self.weight_index,
- n_cases: self.n_cases,
- file_label,
- offsets: self.offsets.clone(),
- magic: self.magic,
- layout_code: self.layout_code,
- nominal_case_size: self.nominal_case_size,
- compression: self.compression,
- bias: self.bias,
- creation_date,
- creation_time,
- endian: self.endian,
- })
- }
-}
-
/// An [Encoding] along with a function to report decoding errors.
///
/// This is used by functions that decode raw records.
}
}
-/// Variable type.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum VarType {
- /// A numeric variable.
- Numeric,
-
- /// A string variable.
- String,
-}
-
-impl Not for VarType {
- type Output = Self;
-
- fn not(self) -> Self::Output {
- match self {
- Self::Numeric => Self::String,
- Self::String => Self::Numeric,
- }
- }
-}
-
-impl Not for &VarType {
- type Output = VarType;
-
- fn not(self) -> Self::Output {
- !*self
- }
-}
-
-impl Display for VarType {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match self {
- VarType::Numeric => write!(f, "numeric"),
- VarType::String => write!(f, "string"),
- }
- }
-}
-
impl TryFrom<RawWidth> for VarType {
type Error = ();
}
}
-/// [crate::format::Format] as represented in a system file.
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub struct RawFormat(
- /// The most-significant 16 bits are the type, the next 8 bytes are the
- /// width, and the least-significant 8 bits are the number of decimals.
- pub u32,
-);
-
-impl Debug for RawFormat {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let type_ = format_name(self.0 >> 16);
- let w = (self.0 >> 8) & 0xff;
- let d = self.0 & 0xff;
- write!(f, "{:06x} ({type_}{w}.{d})", self.0)
- }
-}
-
-fn format_name(type_: u32) -> Cow<'static, str> {
- match type_ {
- 1 => "A",
- 2 => "AHEX",
- 3 => "COMMA",
- 4 => "DOLLAR",
- 5 => "F",
- 6 => "IB",
- 7 => "PIBHEX",
- 8 => "P",
- 9 => "PIB",
- 10 => "PK",
- 11 => "RB",
- 12 => "RBHEX",
- 15 => "Z",
- 16 => "N",
- 17 => "E",
- 20 => "DATE",
- 21 => "TIME",
- 22 => "DATETIME",
- 23 => "ADATE",
- 24 => "JDATE",
- 25 => "DTIME",
- 26 => "WKDAY",
- 27 => "MONTH",
- 28 => "MOYR",
- 29 => "QYR",
- 30 => "WKYR",
- 31 => "PCT",
- 32 => "DOT",
- 33 => "CCA",
- 34 => "CCB",
- 35 => "CCC",
- 36 => "CCD",
- 37 => "CCE",
- 38 => "EDATE",
- 39 => "SDATE",
- 40 => "MTIME",
- 41 => "YMDHMS",
- _ => return format!("<unknown format {type_}>").into(),
- }
- .into()
-}
-
-impl MissingValues {
- fn read<R: Read + Seek>(
- r: &mut R,
- offset: u64,
- raw_width: RawWidth,
- code: i32,
- endian: Endian,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Self, Error> {
- let (individual_values, has_range) = match code {
- 0 => return Ok(Self::default()),
- 1..=3 => (code as usize, false),
- -2 => (0, true),
- -3 => (1, true),
- _ => return Err(Error::BadMissingValueCode { offset, code }),
- };
-
- let mut values = Vec::with_capacity(individual_values);
- let range = if has_range {
- let low = read_bytes::<8, _>(r)?;
- let high = read_bytes::<8, _>(r)?;
- Some((low, high))
- } else {
- None
- };
- for _ in 0..individual_values {
- values.push(read_bytes::<8, _>(r)?);
- }
-
- match VarWidth::try_from(raw_width) {
- Ok(VarWidth::Numeric) => {
- let values = values
- .into_iter()
- .map(|v| Datum::Number(endian.parse(v)))
- .collect();
-
- let range = range.map(|(low, high)| {
- MissingValueRange::new(endian.parse(low), endian.parse(high))
- });
- return Ok(Self::new(values, range).unwrap());
- }
- Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::MissingValueStringRange),
- Ok(VarWidth::String(width)) => {
- let width = width.min(8) as usize;
- let values = values
- .into_iter()
- .map(|value| Datum::String(RawString::from(&value[..width])))
- .collect();
- return Ok(Self::new(values, None).unwrap());
- }
- Err(()) => warn(Warning::MissingValueContinuation(offset)),
- }
- Ok(Self::default())
- }
-}
-
-#[derive(Clone)]
-pub struct VariableRecord<S>
-where
- S: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// Variable width, in the range -1..=255.
- pub width: RawWidth,
-
- /// Variable name, padded on the right with spaces.
- pub name: S,
-
- /// Print format.
- pub print_format: RawFormat,
-
- /// Write format.
- pub write_format: RawFormat,
-
- /// Missing values.
- pub missing_values: MissingValues,
-
- /// Optional variable label.
- pub label: Option<S>,
-}
-
/// Width of a variable record.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum RawWidth {
}
}
-impl<S> Debug for VariableRecord<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "Width: {}", self.width,)?;
- writeln!(f, "Print format: {:?}", self.print_format)?;
- writeln!(f, "Write format: {:?}", self.write_format)?;
- writeln!(f, "Name: {:?}", &self.name)?;
- writeln!(f, "Variable label: {:?}", self.label)?;
- writeln!(f, "Missing values: {:?}", self.missing_values)
- }
-}
-
-impl VariableRecord<RawString> {
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let width: i32 = endian.parse(read_bytes(r)?);
- let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth {
- start_offset,
- width,
- })?;
- let code_offset = r.stream_position()?;
- let has_variable_label: u32 = endian.parse(read_bytes(r)?);
- let missing_value_code: i32 = endian.parse(read_bytes(r)?);
- let print_format = RawFormat(endian.parse(read_bytes(r)?));
- let write_format = RawFormat(endian.parse(read_bytes(r)?));
- let name = RawString(read_vec(r, 8)?);
-
- let label = match has_variable_label {
- 0 => None,
- 1 => {
- let len: u32 = endian.parse(read_bytes(r)?);
- let read_len = len.min(65535) as usize;
- let label = RawString(read_vec(r, read_len)?);
-
- let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
- let _ = read_vec(r, padding_bytes as usize)?;
-
- Some(label)
- }
- _ => {
- return Err(Error::BadVariableLabelCode {
- start_offset,
- code_offset,
- code: has_variable_label,
- });
- }
- };
-
- let missing_values =
- MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?;
-
- let end_offset = r.stream_position()?;
-
- Ok(Record::Variable(VariableRecord {
- offsets: start_offset..end_offset,
- width,
- name,
- print_format,
- write_format,
- missing_values,
- label,
- }))
- }
-
- pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
- DecodedRecord::Variable(VariableRecord {
- offsets: self.offsets.clone(),
- width: self.width,
- name: decoder.decode(&self.name).to_string(),
- print_format: self.print_format,
- write_format: self.write_format,
- missing_values: self.missing_values,
- label: self
- .label
- .as_ref()
- .map(|label| decoder.decode(label).to_string()),
- })
- }
-}
-
/// 8 bytes that represent a number or a string (but that's all we know).
///
/// Used when we don't know whether it's a number or a string, or the numerical
}
}
-#[derive(Clone, Debug)]
-pub struct ValueLabel<D, S>
-where
- D: Debug,
- S: Debug,
-{
- pub datum: D,
- pub label: S,
-}
-
-#[derive(Clone)]
-pub struct ValueLabelRecord<D, S>
-where
- D: Debug,
- S: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// The labels.
- pub labels: Vec<ValueLabel<D, S>>,
-
- /// The 1-based indexes of the variable indexes.
- pub dict_indexes: Vec<u32>,
-
- /// The types of the variables.
- pub var_type: VarType,
-}
-
-impl<D, S> Debug for ValueLabelRecord<D, S>
-where
- D: Debug,
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "labels: ")?;
- for label in self.labels.iter() {
- writeln!(f, "{label:?}")?;
- }
- write!(f, "apply to {} variables", self.var_type)?;
- for dict_index in self.dict_indexes.iter() {
- write!(f, " #{dict_index}")?;
- }
- Ok(())
+/// Discards exactly `n` bytes from `r`, failing if `r` ends first.
+///
+/// Reads through a small thread-local scratch buffer, so skipping never
+/// allocates regardless of how large `n` is.
+fn skip_bytes<R: Read>(r: &mut R, mut n: usize) -> Result<(), IoError> {
+ thread_local! {
+ // 256-byte scratch buffer, reused across calls on the same thread.
+ static BUF: RefCell<[u8; 256]> = RefCell::new([0u8; 256]);
}
-}
-
-impl<D, S> ValueLabelRecord<D, S>
-where
- D: Debug,
- S: Debug,
-{
- /// Maximum number of value labels in a record.
- pub const MAX_LABELS: u32 = u32::MAX / 8;
-
- /// Maximum number of variable indexes in a record.
- pub const MAX_INDEXES: u32 = u32::MAX / 8;
-}
-
-impl ValueLabelRecord<RawDatum, RawString> {
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- var_types: &VarTypes,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Option<Record>, Error> {
- let label_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_LABELS {
- return Err(Error::BadNumberOfValueLabels {
- offset: label_offset,
- n,
- max: Self::MAX_LABELS,
- });
- }
-
- let mut labels = Vec::new();
- for _ in 0..n {
- let value = UntypedDatum(read_bytes(r)?);
- let label_len: u8 = endian.parse(read_bytes(r)?);
- let label_len = label_len as usize;
- let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
-
- let mut label = read_vec(r, padded_len - 1)?;
- label.truncate(label_len);
- labels.push((value, RawString(label)));
- }
-
- let index_offset = r.stream_position()?;
- let rec_type: u32 = endian.parse(read_bytes(r)?);
- if rec_type != 4 {
- return Err(Error::ExpectedVarIndexRecord {
- offset: index_offset,
- rec_type,
- });
- }
-
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_INDEXES {
- return Err(Error::TooManyVarIndexes {
- offset: index_offset,
- n,
- max: Self::MAX_INDEXES,
- });
- } else if n == 0 {
- dbg!();
- warn(Warning::NoVarIndexes {
- offset: index_offset,
- });
- return Ok(None);
- }
-
- let index_offset = r.stream_position()?;
- let mut dict_indexes = Vec::with_capacity(n as usize);
- let mut invalid_indexes = Vec::new();
- for _ in 0..n {
- let index: u32 = endian.parse(read_bytes(r)?);
- if var_types.is_valid_index(index as usize) {
- dict_indexes.push(index);
- } else {
- invalid_indexes.push(index);
- }
- }
- if !invalid_indexes.is_empty() {
- warn(Warning::InvalidVarIndexes {
- offset: index_offset,
- max: var_types.n_values(),
- invalid: invalid_indexes,
- });
- }
-
- let Some(&first_index) = dict_indexes.first() else {
- return Ok(None);
- };
- let var_type = VarType::from(var_types.types[first_index as usize - 1].unwrap());
- let mut wrong_type_indexes = Vec::new();
- dict_indexes.retain(|&index| {
- if var_types.types[index as usize - 1].map(VarType::from) != Some(var_type) {
- wrong_type_indexes.push(index);
- false
- } else {
- true
- }
- });
- if !wrong_type_indexes.is_empty() {
- warn(Warning::MixedVarTypes {
- offset: index_offset,
- var_type,
- wrong_types: wrong_type_indexes,
- });
- }
-
- let labels = labels
- .into_iter()
- .map(|(value, label)| ValueLabel {
- datum: RawDatum::from_raw(&value, var_type, endian),
- label,
- })
- .collect();
-
- let end_offset = r.stream_position()?;
- Ok(Some(Record::ValueLabel(ValueLabelRecord {
- offsets: label_offset..end_offset,
- labels,
- dict_indexes,
- var_type,
- })))
- }
-
- fn decode(self, decoder: &mut Decoder) -> ValueLabelRecord<RawDatum, String> {
- let labels = self
- .labels
- .iter()
- .map(
- |ValueLabel {
- datum: value,
- label,
- }| ValueLabel {
- datum: value.clone(),
- label: decoder.decode(label).to_string(),
- },
- )
- .collect();
- ValueLabelRecord {
- offsets: self.offsets.clone(),
- labels,
- dict_indexes: self.dict_indexes.clone(),
- var_type: self.var_type,
+ BUF.with_borrow_mut(|buf| {
+ // Drain `n` bytes in buffer-sized chunks.
+ while n > 0 {
+ let chunk = n.min(buf.len());
+ // Must slice by `chunk`, not `n`: `buf[..n]` panics (slice out of
+ // bounds) whenever the remaining count exceeds the 256-byte buffer.
+ r.read_exact(&mut buf[..chunk])?;
+ n -= chunk;
}
- }
+ Ok(())
+ })
}
-#[derive(Clone, Debug)]
-pub struct DocumentRecord<S>
-where
- S: Debug,
-{
- pub offsets: Range<u64>,
-
- /// The document, as an array of lines. Raw lines are exactly 80 bytes long
- /// and are right-padded with spaces without any new-line termination.
- pub lines: Vec<S>,
-}
-
-pub type RawDocumentLine = RawStrArray<DOC_LINE_LEN>;
-
-/// Length of a line in a document. Document lines are fixed-length and
-/// padded on the right with spaces.
-pub const DOC_LINE_LEN: usize = 80;
-
-impl DocumentRecord<RawDocumentLine> {
- /// Maximum number of lines we will accept in a document. This is simply
- /// the maximum number that will fit in a 32-bit space.
- pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
-
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- let n = n as usize;
- if n > Self::MAX_LINES {
- Err(Error::BadDocumentLength {
- offset: start_offset,
- n,
- max: Self::MAX_LINES,
- })
- } else {
- let mut lines = Vec::with_capacity(n);
- for _ in 0..n {
- lines.push(RawStrArray(read_bytes(r)?));
- }
- let end_offset = r.stream_position()?;
- Ok(Record::Document(DocumentRecord {
- offsets: start_offset..end_offset,
- lines,
- }))
+/// Fills `buf` from `r`.
+///
+/// Returns `Ok(false)` if `r` is already at end of input, `Ok(true)` if
+/// `buf` was filled completely, and an error otherwise (including the case
+/// where the input ends partway through `buf`).
+fn try_read_bytes_into<R: Read>(r: &mut R, buf: &mut [u8]) -> Result<bool, IoError> {
+ let n = r.read(buf)?;
+ if n > 0 {
+ if n < buf.len() {
+ // Short first read: `read_exact` fills the remainder or reports
+ // `UnexpectedEof` if the input runs out.
+ r.read_exact(&mut buf[n..])?;
}
- }
-
- pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
- DecodedRecord::Document(DocumentRecord {
- offsets: self.offsets.clone(),
- lines: self
- .lines
- .iter()
- .map(|s| decoder.decode_slice(&s.0).to_string())
- .collect(),
- })
- }
-}
-
-struct ExtensionRecord<'a> {
- size: Option<u32>,
- count: Option<u32>,
- name: &'a str,
-}
-
-#[derive(Clone, Debug)]
-pub struct IntegerInfoRecord {
- pub offsets: Range<u64>,
- pub version: (i32, i32, i32),
- pub machine_code: i32,
- pub floating_point_rep: i32,
- pub compression_code: i32,
- pub endianness: i32,
- pub character_code: i32,
-}
-
-static INTEGER_INFO_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(4),
- count: Some(8),
- name: "integer record",
-};
-
-impl IntegerInfoRecord {
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&INTEGER_INFO_RECORD)?;
-
- let mut input = &ext.data[..];
- let data: Vec<i32> = (0..8)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::IntegerInfo(IntegerInfoRecord {
- offsets: ext.offsets.clone(),
- version: (data[0], data[1], data[2]),
- machine_code: data[3],
- floating_point_rep: data[4],
- compression_code: data[5],
- endianness: data[6],
- character_code: data[7],
- }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct FloatInfoRecord {
- pub sysmis: f64,
- pub highest: f64,
- pub lowest: f64,
-}
-
-static FLOAT_INFO_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(8),
- count: Some(3),
- name: "floating point record",
-};
-
-impl FloatInfoRecord {
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&FLOAT_INFO_RECORD)?;
-
- let mut input = &ext.data[..];
- let data: Vec<f64> = (0..3)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::FloatInfo(FloatInfoRecord {
- sysmis: data[0],
- highest: data[1],
- lowest: data[2],
- }))
+ Ok(true)
+ } else {
+ Ok(false)
}
}
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum CategoryLabels {
- VarLabels,
- CountedValues,
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: RawString,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-impl MultipleResponseType {
- fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
- let (mr_type, input) = match input.split_first() {
- Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
- Some((b'D', input)) => {
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy {
- value,
- labels: CategoryLabels::VarLabels,
- },
- input,
- )
- }
- Some((b'E', input)) => {
- let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
- (CategoryLabels::CountedValues, rest)
- } else if let Some(rest) = input.strip_prefix(b" 11 ") {
- (CategoryLabels::VarLabels, rest)
- } else {
- return Err(Warning::InvalidMultipleDichotomyLabelType);
- };
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy { value, labels },
- input,
- )
- }
- _ => return Err(Warning::InvalidMultipleResponseType),
- };
- Ok((mr_type, input))
+/// Reads an `N`-byte array from `r`, returning `None` at clean end of input
+/// and an error if the input ends partway through the array.
+fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
+ let mut buf = [0; N];
+ match try_read_bytes_into(r, &mut buf)? {
+ true => Ok(Some(buf)),
+ false => Ok(None),
}
}
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet<I, S>
-where
- I: Debug,
- S: Debug,
-{
- pub name: I,
- pub label: S,
- pub mr_type: MultipleResponseType,
- pub short_names: Vec<I>,
+/// Reads exactly `N` bytes from `r` into a new array, failing if the input
+/// ends first.
+fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
+ let mut buf = [0; N];
+ r.read_exact(&mut buf)?;
+ Ok(buf)
}
-impl MultipleResponseSet<RawString, RawString> {
- fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
- let Some(equals) = input.iter().position(|&b| b == b'=') else {
- return Err(Warning::MultipleResponseSyntaxError("missing `=`"));
- };
- let (name, input) = input.split_at(equals);
- let input = input.strip_prefix(b"=").unwrap();
- let (mr_type, input) = MultipleResponseType::parse(input)?;
- let Some(input) = input.strip_prefix(b" ") else {
- return Err(Warning::MultipleResponseSyntaxError(
- "missing space after multiple response type",
- ));
- };
- let (label, mut input) = parse_counted_string(input)?;
- let mut vars = Vec::new();
- while input.first() != Some(&b'\n') {
- match input.split_first() {
- Some((b' ', rest)) => {
- let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
- return Err(Warning::MultipleResponseSyntaxError(
- "missing variable name delimiter",
- ));
- };
- let (var, rest) = rest.split_at(length);
- if !var.is_empty() {
- vars.push(var.into());
- }
- input = rest;
- }
- _ => {
- return Err(Warning::MultipleResponseSyntaxError(
- "missing space preceding variable name",
- ));
- }
- }
- }
- while input.first() == Some(&b'\n') {
- input = &input[1..];
- }
- Ok((
- MultipleResponseSet {
- name: name.into(),
- label,
- mr_type,
- short_names: vars,
- },
- input,
- ))
- }
-
- fn decode(
- &self,
- decoder: &mut Decoder,
- ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
- let mut short_names = Vec::with_capacity(self.short_names.len());
- for short_name in self.short_names.iter() {
- if let Some(short_name) = decoder
- .decode_identifier(short_name)
- .map_err(Warning::InvalidMrSetName)
- .issue_warning(&mut decoder.warn)
- {
- short_names.push(short_name);
- }
- }
- Ok(MultipleResponseSet {
- name: decoder
- .decode_identifier(&self.name)
- .map_err(Warning::InvalidMrSetVariableName)?,
- label: decoder.decode(&self.label).to_string(),
- mr_type: self.mr_type.clone(),
- short_names,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
-where
- I: Debug,
- S: Debug;
-
-static MULTIPLE_RESPONSE_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(1),
- count: None,
- name: "multiple response set record",
-};
-
-impl MultipleResponseRecord<RawString, RawString> {
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&MULTIPLE_RESPONSE_RECORD)?;
-
- let mut input = &ext.data[..];
- let mut sets = Vec::new();
- loop {
- while let Some(suffix) = input.strip_prefix(b"\n") {
- input = suffix;
- }
- if input.is_empty() {
- break;
- }
- let (set, rest) = MultipleResponseSet::parse(input)?;
- sets.push(set);
- input = rest;
- }
- Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
- }
-}
-
-impl MultipleResponseRecord<RawString, RawString> {
- fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
- let mut sets = Vec::new();
- for set in self.0.iter() {
- if let Some(set) = set.decode(decoder).issue_warning(&mut decoder.warn) {
- sets.push(set);
- }
- }
- DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
- }
-}
-
-fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
- let Some(space) = input.iter().position(|&b| b == b' ') else {
- return Err(Warning::CountedStringMissingSpace);
- };
- let Ok(length) = from_utf8(&input[..space]) else {
- return Err(Warning::CountedStringInvalidUTF8);
- };
- let Ok(length): Result<usize, _> = length.parse() else {
- return Err(Warning::CountedStringInvalidLength(length.into()));
- };
-
- let Some((string, rest)) = input[space + 1..].split_at_checked(length) else {
- return Err(Warning::CountedStringTooLong(length));
- };
- Ok((string.into(), rest))
-}
-
-impl Measure {
- fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
- match source {
- 0 => Ok(None),
- 1 => Ok(Some(Measure::Nominal)),
- 2 => Ok(Some(Measure::Ordinal)),
- 3 => Ok(Some(Measure::Scale)),
- _ => Err(Warning::InvalidMeasurement(source)),
- }
- }
-}
-
-impl Alignment {
- fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
- match source {
- 0 => Ok(Some(Alignment::Left)),
- 1 => Ok(Some(Alignment::Right)),
- 2 => Ok(Some(Alignment::Center)),
- _ => Err(Warning::InvalidAlignment(source)),
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplay {
- pub measure: Option<Measure>,
- pub width: Option<u32>,
- pub alignment: Option<Alignment>,
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplayRecord(pub Vec<VarDisplay>);
-
-impl VarDisplayRecord {
- fn parse(
- ext: &Extension,
- var_types: &VarTypes,
- endian: Endian,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Record, Warning> {
- if ext.size != 4 {
- return Err(Warning::BadRecordSize {
- offset: ext.offsets.start,
- record: String::from("variable display record"),
- size: ext.size,
- expected_size: 4,
- });
- }
-
- let n_vars = var_types.n_vars();
- let has_width = if ext.count as usize == 3 * n_vars {
- true
- } else if ext.count as usize == 2 * n_vars {
- false
- } else {
- return Err(Warning::InvalidVariableDisplayCount {
- count: ext.count as usize,
- first: 2 * n_vars,
- second: 3 * n_vars,
- });
- };
-
- let mut var_displays = Vec::new();
- let mut input = &ext.data[..];
- for _ in 0..n_vars {
- let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(warn)
- .flatten();
- let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
- let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(warn)
- .flatten();
- var_displays.push(VarDisplay {
- measure,
- width,
- alignment,
- });
- }
- Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValues<N>
-where
- N: Debug,
-{
- /// Variable name.
- pub var_name: N,
-
- /// Missing values.
- pub missing_values: Vec<RawStrArray<8>>,
-}
-
-impl LongStringMissingValues<RawString> {
- fn decode(
- &self,
- decoder: &mut Decoder,
- ) -> Result<LongStringMissingValues<Identifier>, IdError> {
- Ok(LongStringMissingValues {
- var_name: decoder.decode_identifier(&self.var_name)?,
- missing_values: self.missing_values.clone(),
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValueRecord<N>(pub Vec<LongStringMissingValues<N>>)
-where
- N: Debug;
-
-static LONG_STRING_MISSING_VALUE_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(1),
- count: None,
- name: "long string missing values record",
-};
-
-impl LongStringMissingValueRecord<RawString> {
- fn parse(
- ext: &Extension,
- endian: Endian,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Record, Warning> {
- ext.check_size(&LONG_STRING_MISSING_VALUE_RECORD)?;
-
- let mut input = &ext.data[..];
- let mut missing_value_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- dbg!(&var_name);
- let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
- let value_len: u32 = endian.parse(read_bytes(&mut input)?);
- if value_len != 8 {
- let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
- warn(Warning::BadLongMissingValueLength {
- record_offset: ext.offsets.start,
- offset,
- value_len,
- });
- read_vec(
- &mut input,
- dbg!(value_len as usize * n_missing_values as usize),
- )?;
- continue;
- }
- let mut missing_values = Vec::new();
- for i in 0..n_missing_values {
- if i > 0 {
- // Tolerate files written by old, buggy versions of PSPP
- // where we believed that the value_length was repeated
- // before each missing value.
- let mut peek = input;
- let number: u32 = endian.parse(read_bytes(&mut peek)?);
- if number == 8 {
- input = peek;
- }
- }
-
- let value: [u8; 8] = read_bytes(&mut input)?;
- missing_values.push(RawStrArray(value));
- }
- missing_value_set.push(LongStringMissingValues {
- var_name,
- missing_values,
- });
- }
- Ok(Record::LongStringMissingValues(
- LongStringMissingValueRecord(missing_value_set),
- ))
- }
-}
-
-impl LongStringMissingValueRecord<RawString> {
- pub fn decode(self, decoder: &mut Decoder) -> LongStringMissingValueRecord<Identifier> {
- let mut mvs = Vec::with_capacity(self.0.len());
- for mv in self.0.iter() {
- if let Some(mv) = mv
- .decode(decoder)
- .map_err(Warning::InvalidLongStringMissingValueVariableName)
- .issue_warning(&mut decoder.warn)
- {
- mvs.push(mv);
- }
- }
- LongStringMissingValueRecord(mvs)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct EncodingRecord(pub String);
-
-static ENCODING_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(1),
- count: None,
- name: "encoding record",
-};
-
-impl EncodingRecord {
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&ENCODING_RECORD)?;
-
- Ok(Record::Encoding(EncodingRecord(
- String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
- offset: ext.offsets.start,
- })?,
- )))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct NumberOfCasesRecord {
- /// Always observed as 1.
- pub one: u64,
-
- /// Number of cases.
- pub n_cases: u64,
-}
-
-static NUMBER_OF_CASES_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(8),
- count: Some(2),
- name: "extended number of cases record",
-};
-
-impl NumberOfCasesRecord {
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&NUMBER_OF_CASES_RECORD)?;
-
- let mut input = &ext.data[..];
- let one = endian.parse(read_bytes(&mut input)?);
- let n_cases = endian.parse(read_bytes(&mut input)?);
-
- Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawVariableSetRecord(TextRecord);
-
-impl RawVariableSetRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::VariableSets(Self(TextRecord::parse(
- extension,
- "variable sets record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> VariableSetRecord {
- let mut sets = Vec::new();
- let input = decoder.decode(&self.0.text);
- for line in input.lines() {
- if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&mut decoder.warn) {
- sets.push(set)
- }
- }
- VariableSetRecord {
- offsets: self.0.offsets,
- sets,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawProductInfoRecord(TextRecord);
-
-impl RawProductInfoRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::ProductInfo(Self(TextRecord::parse(
- extension,
- "product info record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> ProductInfoRecord {
- ProductInfoRecord(decoder.decode(&self.0.text).into())
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawLongNamesRecord(TextRecord);
-
-impl RawLongNamesRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::LongNames(Self(TextRecord::parse(
- extension,
- "long names record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> LongNamesRecord {
- let input = decoder.decode(&self.0.text);
- let mut names = Vec::new();
- for pair in input.split('\t').filter(|s| !s.is_empty()) {
- if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&mut decoder.warn)
- {
- names.push(long_name);
- }
- }
- LongNamesRecord(names)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct TextRecord {
- pub offsets: Range<u64>,
-
- /// The text content of the record.
- pub text: RawString,
-}
-
-impl TextRecord {
- fn parse(extension: Extension, name: &str) -> Result<TextRecord, Warning> {
- extension.check_size(&ExtensionRecord {
- size: Some(1),
- count: None,
- name,
- })?;
- Ok(Self {
- offsets: extension.offsets,
- text: extension.data.into(),
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongString {
- pub short_name: Identifier,
- pub length: u16,
-}
-
-impl VeryLongString {
- fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
- let Some((short_name, length)) = input.split_once('=') else {
- return Err(Warning::VeryLongStringMissingDelimiter(input.into()));
- };
- let short_name = decoder
- .new_identifier(short_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidLongStringName)?;
- let length = length
- .parse()
- .map_err(|_| Warning::VeryLongStringInvalidLength(input.into()))?;
- Ok(VeryLongString { short_name, length })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawVeryLongStringsRecord(TextRecord);
-
-#[derive(Clone, Debug)]
-pub struct VeryLongStringsRecord(pub Vec<VeryLongString>);
-
-impl RawVeryLongStringsRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::VeryLongStrings(Self(TextRecord::parse(
- extension,
- "very long strings record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> VeryLongStringsRecord {
- let input = decoder.decode(&self.0.text);
- let mut very_long_strings = Vec::new();
- for tuple in input
- .split('\0')
- .map(|s| s.trim_start_matches('\t'))
- .filter(|s| !s.is_empty())
- {
- if let Some(vls) =
- VeryLongString::parse(decoder, tuple).issue_warning(&mut decoder.warn)
- {
- very_long_strings.push(vls)
- }
- }
- VeryLongStringsRecord(very_long_strings)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl Attribute {
- fn parse<'a>(decoder: &mut Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
- let Some((name, mut input)) = input.split_once('(') else {
- return Err(Warning::AttributeMissingLParen(input.into()));
- };
- let name = decoder
- .new_identifier(name)
- .map_err(Warning::InvalidAttributeName)?;
- let mut values = Vec::new();
- loop {
- let Some((value, rest)) = input.split_once('\n') else {
- return Err(Warning::AttributeMissingValue {
- name: name.clone(),
- index: values.len(),
- });
- };
- if let Some(stripped) = value
- .strip_prefix('\'')
- .and_then(|value| value.strip_suffix('\''))
- {
- values.push(stripped.into());
- } else {
- decoder.warn(Warning::AttributeMissingQuotes {
- name: name.clone(),
- index: values.len(),
- });
- values.push(value.into());
- }
- if let Some(rest) = rest.strip_prefix(')') {
- let attribute = Attribute { name, values };
- return Ok((attribute, rest));
- };
- input = rest;
- }
- }
-}
-
-impl Attributes {
- fn parse<'a>(
- decoder: &mut Decoder,
- mut input: &'a str,
- sentinel: Option<char>,
- ) -> Result<(Attributes, &'a str, Vec<Identifier>), Warning> {
- let mut attributes = BTreeMap::new();
- let mut duplicates = Vec::new();
- let rest = loop {
- match input.chars().next() {
- None => break input,
- c if c == sentinel => break &input[1..],
- _ => {
- let (attribute, rest) = Attribute::parse(decoder, input)?;
- if attributes.contains_key(&attribute.name) {
- duplicates.push(attribute.name.clone());
- }
- attributes.insert(attribute.name, attribute.values);
- input = rest;
- }
- }
- };
- Ok((Attributes(attributes), rest, duplicates))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawFileAttributesRecord(TextRecord);
-
-#[derive(Clone, Debug, Default)]
-pub struct FileAttributesRecord(pub Attributes);
-
-impl RawFileAttributesRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::FileAttributes(Self(TextRecord::parse(
- extension,
- "file attributes record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> FileAttributesRecord {
- let input = decoder.decode(&self.0.text);
- match Attributes::parse(decoder, &input, None).issue_warning(&mut decoder.warn) {
- Some((set, rest, duplicates)) => {
- if !duplicates.is_empty() {
- decoder.warn(Warning::DuplicateFileAttributes {
- attributes: duplicates,
- });
- }
- if !rest.is_empty() {
- decoder.warn(dbg!(Warning::TBD));
- }
- FileAttributesRecord(set)
- }
- None => FileAttributesRecord::default(),
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarAttributes {
- pub long_var_name: Identifier,
- pub attributes: Attributes,
-}
-
-impl VarAttributes {
- fn parse<'a>(
- decoder: &mut Decoder,
- input: &'a str,
- ) -> Result<(VarAttributes, &'a str), Warning> {
- let Some((long_var_name, rest)) = input.split_once(':') else {
- return Err(dbg!(Warning::TBD));
- };
- let long_var_name = decoder
- .new_identifier(long_var_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidAttributeVariableName)?;
- let (attributes, rest, duplicates) = Attributes::parse(decoder, rest, Some('/'))?;
- if !duplicates.is_empty() {
- decoder.warn(Warning::DuplicateVariableAttributes {
- variable: long_var_name.clone(),
- attributes: duplicates,
- });
- }
- let var_attribute = VarAttributes {
- long_var_name,
- attributes,
- };
- Ok((var_attribute, rest))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawVariableAttributesRecord(TextRecord);
-
-#[derive(Clone, Debug)]
-pub struct VariableAttributesRecord(pub Vec<VarAttributes>);
-
-impl RawVariableAttributesRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::VariableAttributes(Self(TextRecord::parse(
- extension,
- "variable attributes record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> VariableAttributesRecord {
- let decoded = decoder.decode(&self.0.text);
- let mut input = decoded.as_ref();
- let mut var_attribute_sets = Vec::new();
- while !input.is_empty() {
- let Some((var_attribute, rest)) =
- VarAttributes::parse(decoder, input).issue_warning(&mut decoder.warn)
- else {
- break;
- };
- var_attribute_sets.push(var_attribute);
- input = rest;
- }
- VariableAttributesRecord(var_attribute_sets)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongName {
- pub short_name: Identifier,
- pub long_name: Identifier,
-}
-
-impl LongName {
- fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
- let Some((short_name, long_name)) = input.split_once('=') else {
- return Err(dbg!(Warning::LongNameMissingEquals));
- };
- let short_name = decoder
- .new_identifier(short_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidShortName)?;
- let long_name = decoder
- .new_identifier(long_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidLongName)?;
- Ok(LongName {
- short_name,
- long_name,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongNamesRecord(pub Vec<LongName>);
-
-#[derive(Clone, Debug)]
-pub struct ProductInfoRecord(pub String);
-
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: String,
- pub variable_names: Vec<Identifier>,
-}
-
-impl VariableSet {
- fn parse(input: &str, decoder: &mut Decoder) -> Result<Self, Warning> {
- let (name, input) = input
- .split_once('=')
- .ok_or(Warning::VariableSetMissingEquals)?;
- let mut vars = Vec::new();
- for var in input.split_ascii_whitespace() {
- if let Some(identifier) = decoder
- .new_identifier(var)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidVariableSetName)
- .issue_warning(&mut decoder.warn)
- {
- vars.push(identifier);
- }
- }
- Ok(VariableSet {
- name: name.to_string(),
- variable_names: vars,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSetRecord {
- pub offsets: Range<u64>,
- pub sets: Vec<VariableSet>,
-}
-
-trait IssueWarning<T> {
- fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option<T>;
-}
-impl<T> IssueWarning<T> for Result<T, Warning> {
- fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option<T> {
- match self {
- Ok(result) => Some(result),
- Err(error) => {
- warn(error);
- None
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Extension {
- pub offsets: Range<u64>,
-
- /// Record subtype.
- pub subtype: u32,
-
- /// Size of each data element.
- pub size: u32,
-
- /// Number of data elements.
- pub count: u32,
-
- /// `size * count` bytes of data.
- pub data: Vec<u8>,
-}
-
-impl Extension {
- fn check_size(&self, expected: &ExtensionRecord) -> Result<(), Warning> {
- match expected.size {
- Some(expected_size) if self.size != expected_size => {
- return Err(Warning::BadRecordSize {
- offset: self.offsets.start,
- record: expected.name.into(),
- size: self.size,
- expected_size,
- });
- }
- _ => (),
- }
- match expected.count {
- Some(expected_count) if self.count != expected_count => {
- return Err(Warning::BadRecordCount {
- offset: self.offsets.start,
- record: expected.name.into(),
- count: self.count,
- expected_count,
- });
- }
- _ => (),
- }
- Ok(())
- }
-
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- var_types: &VarTypes,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Option<Record>, Error> {
- let subtype = endian.parse(read_bytes(r)?);
- let header_offset = r.stream_position()?;
- let size: u32 = endian.parse(read_bytes(r)?);
- let count = endian.parse(read_bytes(r)?);
- let Some(product) = size.checked_mul(count) else {
- return Err(Error::ExtensionRecordTooLarge {
- offset: header_offset,
- subtype,
- size,
- count,
- });
- };
- let start_offset = r.stream_position()?;
- let data = read_vec(r, product as usize)?;
- let end_offset = start_offset + product as u64;
- let extension = Extension {
- offsets: start_offset..end_offset,
- subtype,
- size,
- count,
- data,
- };
- let result = match subtype {
- 3 => IntegerInfoRecord::parse(&extension, endian),
- 4 => FloatInfoRecord::parse(&extension, endian),
- 11 => VarDisplayRecord::parse(&extension, var_types, endian, warn),
- 7 | 19 => MultipleResponseRecord::parse(&extension, endian),
- 21 => LongStringValueLabelRecord::parse(&extension, endian),
- 22 => LongStringMissingValueRecord::parse(&extension, endian, warn),
- 20 => EncodingRecord::parse(&extension, endian),
- 16 => NumberOfCasesRecord::parse(&extension, endian),
- 5 => RawVariableSetRecord::parse(extension),
- 10 => RawProductInfoRecord::parse(extension),
- 13 => RawLongNamesRecord::parse(extension),
- 14 => RawVeryLongStringsRecord::parse(extension),
- 17 => RawFileAttributesRecord::parse(extension),
- 18 => RawVariableAttributesRecord::parse(extension),
- _ => Ok(Record::OtherExtension(extension)),
- };
- match result {
- Ok(result) => Ok(Some(result)),
- Err(error) => {
- warn(error);
- Ok(None)
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZHeader {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// File offset to the ZLIB data header.
- pub zheader_offset: u64,
-
- /// File offset to the ZLIB trailer.
- pub ztrailer_offset: u64,
-
- /// Length of the ZLIB trailer in bytes.
- pub ztrailer_len: u64,
-}
-
-impl ZHeader {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
- let offset = r.stream_position()?;
- let zheader_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
-
- if zheader_offset != offset {
- return Err(Error::UnexpectedZHeaderOffset {
- actual: zheader_offset,
- expected: offset,
- });
- }
-
- if ztrailer_offset < offset {
- return Err(Error::ImpossibleZTrailerOffset(ztrailer_offset));
- }
-
- if ztrailer_len < 24 || ztrailer_len % 24 != 0 {
- return Err(Error::InvalidZTrailerLength(ztrailer_len));
- }
-
- Ok(ZHeader {
- offset,
- zheader_offset,
- ztrailer_offset,
- ztrailer_len,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZTrailer {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// Compression bias as a negative integer, e.g. -100.
- pub int_bias: i64,
-
- /// Always observed as zero.
- pub zero: u64,
-
- /// Uncompressed size of each block, except possibly the last. Only
- /// `0x3ff000` has been observed so far.
- pub block_size: u32,
-
- /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
- pub blocks: Vec<ZBlock>,
-}
-
-#[derive(Clone, Debug)]
-pub struct ZBlock {
- /// Offset of block of data if simple compression were used.
- pub uncompressed_ofs: u64,
-
- /// Actual offset within the file of the compressed data block.
- pub compressed_ofs: u64,
-
- /// The number of bytes in this data block after decompression. This is
- /// `block_size` in every data block but the last, which may be smaller.
- pub uncompressed_size: u32,
-
- /// The number of bytes in this data block, as stored compressed in this
- /// file.
- pub compressed_size: u32,
-}
-
-impl ZBlock {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
- Ok(ZBlock {
- uncompressed_ofs: endian.parse(read_bytes(r)?),
- compressed_ofs: endian.parse(read_bytes(r)?),
- uncompressed_size: endian.parse(read_bytes(r)?),
- compressed_size: endian.parse(read_bytes(r)?),
- })
- }
-}
-
-impl ZTrailer {
- fn read<R: Read + Seek>(
- reader: &mut R,
- endian: Endian,
- bias: f64,
- zheader: &ZHeader,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Option<ZTrailer>, Error> {
- let start_offset = reader.stream_position()?;
- if reader
- .seek(SeekFrom::Start(zheader.ztrailer_offset))
- .is_err()
- {
- return Ok(None);
- }
- let int_bias = endian.parse(read_bytes(reader)?);
- if int_bias as f64 != -bias {
- return Err(Error::WrongZlibTrailerBias {
- actual: int_bias,
- expected: -bias,
- });
- }
- let zero = endian.parse(read_bytes(reader)?);
- if zero != 0 {
- return Err(Error::WrongZlibTrailerZero(zero));
- }
- let block_size = endian.parse(read_bytes(reader)?);
- if block_size != 0x3ff000 {
- return Err(Error::WrongZlibTrailerBlockSize(block_size));
- }
- let n_blocks: u32 = endian.parse(read_bytes(reader)?);
- let expected_n_blocks = (zheader.ztrailer_len - 24) / 24;
- if n_blocks as u64 != expected_n_blocks {
- return Err(Error::BadZlibTrailerNBlocks {
- offset: zheader.ztrailer_offset,
- n_blocks,
- expected_n_blocks,
- ztrailer_len: zheader.ztrailer_len,
- });
- }
- let blocks = (0..n_blocks)
- .map(|_| ZBlock::read(reader, endian))
- .collect::<Result<Vec<_>, _>>()?;
-
- let mut expected_uncmp_ofs = zheader.zheader_offset;
- let mut expected_cmp_ofs = zheader.zheader_offset + 24;
- for (index, block) in blocks.iter().enumerate() {
- if block.uncompressed_ofs != expected_uncmp_ofs {
- return Err(Error::ZlibTrailerBlockWrongUncmpOfs {
- index,
- actual: block.uncompressed_ofs,
- expected: expected_cmp_ofs,
- });
- }
- if block.compressed_ofs != expected_cmp_ofs {
- return Err(Error::ZlibTrailerBlockWrongCmpOfs {
- index,
- actual: block.compressed_ofs,
- expected: expected_cmp_ofs,
- });
- }
- if index < blocks.len() - 1 {
- if block.uncompressed_size != block_size {
- warn(Warning::ZlibTrailerBlockWrongSize {
- index,
- actual: block.uncompressed_size,
- expected: block_size,
- });
- }
- } else {
- if block.uncompressed_size > block_size {
- warn(Warning::ZlibTrailerBlockTooBig {
- index,
- actual: block.uncompressed_size,
- max_expected: block_size,
- });
- }
- }
- // http://www.zlib.net/zlib_tech.html says that the maximum
- // expansion from compression, with worst-case parameters, is 13.5%
- // plus 11 bytes. This code checks for an expansion of more than
- // 14.3% plus 11 bytes.
- if block.compressed_size > block.uncompressed_size + block.uncompressed_size / 7 + 11 {
- return Err(Error::ZlibExpansion {
- index,
- compressed_size: block.compressed_size,
- uncompressed_size: block.uncompressed_size,
- });
- }
-
- expected_cmp_ofs += block.compressed_size as u64;
- expected_uncmp_ofs += block.uncompressed_size as u64;
- }
-
- if expected_cmp_ofs != zheader.ztrailer_offset {
- return Err(Error::ZlibTrailerOffsetInconsistency {
- descriptors: expected_cmp_ofs,
- zheader: zheader.ztrailer_offset,
- });
- }
-
- reader.seek(SeekFrom::Start(start_offset))?;
- Ok(Some(ZTrailer {
- offset: zheader.ztrailer_offset,
- int_bias,
- zero,
- block_size,
- blocks,
- }))
- }
-}
-
-fn skip_bytes<R: Read>(r: &mut R, mut n: usize) -> Result<(), IoError> {
- thread_local! {
- static BUF: RefCell<[u8; 256]> = RefCell::new([0u8; 256]);
- }
- BUF.with_borrow_mut(|buf| {
- while n > 0 {
- let chunk = n.min(buf.len());
- r.read_exact(&mut buf[..n])?;
- n -= chunk;
- }
- Ok(())
- })
-}
-
-fn try_read_bytes_into<R: Read>(r: &mut R, buf: &mut [u8]) -> Result<bool, IoError> {
- let n = r.read(buf)?;
- if n > 0 {
- if n < buf.len() {
- r.read_exact(&mut buf[n..])?;
- }
- Ok(true)
- } else {
- Ok(false)
- }
-}
-
-fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
- let mut buf = [0; N];
- match try_read_bytes_into(r, &mut buf)? {
- true => Ok(Some(buf)),
- false => Ok(None),
- }
-}
-
-fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
- let mut buf = [0; N];
- r.read_exact(&mut buf)?;
- Ok(buf)
-}
-
-fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
- let mut vec = vec![0; n];
- r.read_exact(&mut vec)?;
- Ok(vec)
+fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
+ let mut vec = vec![0; n];
+ r.read_exact(&mut vec)?;
+ Ok(vec)
}
fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
Ok(read_vec(r, length as usize)?.into())
}
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabels<N, S>
-where
- S: Debug,
-{
- pub var_name: N,
- pub width: u32,
-
- /// `(value, label)` pairs, where each value is `width` bytes.
- pub labels: Vec<(RawString, S)>,
-}
-
-impl LongStringValueLabels<RawString, RawString> {
- fn decode(
- &self,
- decoder: &mut Decoder,
- ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
- let var_name = decoder.decode(&self.var_name);
- let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
- .map_err(Warning::InvalidLongStringValueLabelName)?;
-
- let mut labels = Vec::with_capacity(self.labels.len());
- for (value, label) in self.labels.iter() {
- let label = decoder.decode(label).to_string();
- labels.push((value.clone(), label));
- }
-
- Ok(LongStringValueLabels {
- var_name,
- width: self.width,
- labels,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
-where
- N: Debug,
- S: Debug;
-
-static LONG_STRING_VALUE_LABEL_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(1),
- count: None,
- name: "long string value labels record",
-};
-
-impl LongStringValueLabelRecord<RawString, RawString> {
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&LONG_STRING_VALUE_LABEL_RECORD)?;
-
- let mut input = &ext.data[..];
- let mut label_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- let width: u32 = endian.parse(read_bytes(&mut input)?);
- let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
- let mut labels = Vec::new();
- for _ in 0..n_labels {
- let value = read_string(&mut input, endian)?;
- let label = read_string(&mut input, endian)?;
- labels.push((value, label));
- }
- label_set.push(LongStringValueLabels {
- var_name,
- width,
- labels,
- })
- }
- Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
- label_set,
- )))
- }
-}
-
-impl LongStringValueLabelRecord<RawString, RawString> {
- fn decode(self, decoder: &mut Decoder) -> LongStringValueLabelRecord<Identifier, String> {
- let mut labels = Vec::with_capacity(self.0.len());
- for label in &self.0 {
- match label.decode(decoder) {
- Ok(set) => labels.push(set),
- Err(error) => decoder.warn(error),
- }
- }
- LongStringValueLabelRecord(labels)
- }
-}
-
#[derive(Default)]
pub struct VarTypes {
pub types: Vec<Option<VarWidth>>,
--- /dev/null
+//! Raw records.
+//!
+//! Separated into a submodule just to reduce clutter.
+
+use std::{
+ borrow::Cow,
+ collections::BTreeMap,
+ fmt::{Debug, Formatter},
+ io::{Read, Seek, SeekFrom},
+ ops::Range,
+ str::from_utf8,
+};
+
+use crate::{
+ dictionary::{
+ Alignment, Attributes, CategoryLabels, Datum, Measure, MissingValueRange, MissingValues,
+ VarType, VarWidth,
+ },
+ endian::{Endian, Parse},
+ identifier::{Error as IdError, Identifier},
+ sys::raw::{
+ read_bytes, read_string, read_vec, DecodedRecord, Decoder, Error, Magic, RawDatum,
+ RawStrArray, RawString, RawWidth, Record, VarTypes, Warning,
+ },
+};
+
+/// How the case data in a system file are compressed.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Compression {
+ /// Simple bytecode compression (compression code 1).
+ Simple,
+ /// ZLIB compression, required for `.zsav` files (compression code 2).
+ ZLib,
+}
+
+/// A system file's file header record.
+///
+/// `S` is the string type: [RawString] as read from disk, or [String] after
+/// decoding (see [HeaderRecord::decode]).
+#[derive(Clone)]
+pub struct HeaderRecord<S>
+where
+ S: Debug,
+{
+ /// Offset in file.
+ pub offsets: Range<u64>,
+
+ /// Magic number.
+ pub magic: Magic,
+
+ /// Eye-catcher string, product name, in the file's encoding. Padded
+ /// on the right with spaces.
+ pub eye_catcher: S,
+
+ /// Layout code, normally either 2 or 3.
+ pub layout_code: u32,
+
+ /// Number of variable positions, or `None` if the value in the file is
+ /// questionably trustworthy.
+ pub nominal_case_size: Option<u32>,
+
+ /// Compression type, if any,
+ pub compression: Option<Compression>,
+
+ /// 1-based variable index of the weight variable, or `None` if the file is
+ /// unweighted.
+ pub weight_index: Option<u32>,
+
+ /// Claimed number of cases, if known.
+ pub n_cases: Option<u32>,
+
+ /// Compression bias, usually 100.0.
+ pub bias: f64,
+
+ /// `dd mmm yy` in the file's encoding.
+ pub creation_date: S,
+
+ /// `HH:MM:SS` in the file's encoding.
+ pub creation_time: S,
+
+ /// File label, in the file's encoding. Padded on the right with spaces.
+ pub file_label: S,
+
+ /// Endianness of the data in the file header.
+ pub endian: Endian,
+}
+
+impl<S> HeaderRecord<S>
+where
+ S: Debug,
+{
+ /// Writes one `name: value` line for the `Debug` output, right-aligning
+ /// `name` in a 17-column field so the values line up.
+ fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> std::fmt::Result
+ where
+ T: Debug,
+ {
+ writeln!(f, "{name:>17}: {:?}", value)
+ }
+}
+
+/// Multi-line, human-readable dump of every header field.
+impl<S> Debug for HeaderRecord<S>
+where
+ S: Debug,
+{
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ writeln!(f, "File header record:")?;
+ self.debug_field(f, "Magic", self.magic)?;
+ self.debug_field(f, "Product name", &self.eye_catcher)?;
+ self.debug_field(f, "Layout code", self.layout_code)?;
+ self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
+ self.debug_field(f, "Compression", self.compression)?;
+ self.debug_field(f, "Weight index", self.weight_index)?;
+ self.debug_field(f, "Number of cases", self.n_cases)?;
+ self.debug_field(f, "Compression bias", self.bias)?;
+ self.debug_field(f, "Creation date", &self.creation_date)?;
+ self.debug_field(f, "Creation time", &self.creation_time)?;
+ self.debug_field(f, "File label", &self.file_label)?;
+ self.debug_field(f, "Endianness", self.endian)
+ }
+}
+
+impl HeaderRecord<RawString> {
+    /// Reads a file header record from `r`, reporting recoverable problems
+    /// (such as an unexpected compression bias) to `warn`.
+    ///
+    /// On success, `r` is left positioned just past the header.
+    pub fn read<R: Read + Seek>(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result<Self, Error> {
+        let start = r.stream_position()?;
+
+        let magic: [u8; 4] = read_bytes(r)?;
+        let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
+
+        let eye_catcher = RawString(read_vec(r, 60)?);
+        let layout_code: [u8; 4] = read_bytes(r)?;
+        // The layout code is normally either 2 or 3 (see
+        // `HeaderRecord::layout_code`), so try both values to identify the
+        // file's endianness.  (Bug fix: this previously tried 2 twice, so
+        // files with layout code 3 were rejected as not system files.)
+        let endian = Endian::identify_u32(2, layout_code)
+            .or_else(|| Endian::identify_u32(3, layout_code))
+            .ok_or(Error::NotASystemFile)?;
+        let layout_code = endian.parse(layout_code);
+
+        // Distrust case sizes that are nonpositive or implausibly large.
+        let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
+        let nominal_case_size = (1..i32::MAX as u32 / 16)
+            .contains(&nominal_case_size)
+            .then_some(nominal_case_size);
+
+        // `.zsav` files must use ZLIB compression; other files may be
+        // uncompressed (0) or use simple bytecode compression (1).
+        let compression_code: u32 = endian.parse(read_bytes(r)?);
+        let compression = match (magic, compression_code) {
+            (Magic::Zsav, 2) => Some(Compression::ZLib),
+            (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
+            (_, 0) => None,
+            (_, 1) => Some(Compression::Simple),
+            (_, code) => return Err(Error::InvalidSavCompression(code)),
+        };
+
+        // Index 0 means the file is unweighted.
+        let weight_index: u32 = endian.parse(read_bytes(r)?);
+        let weight_index = (weight_index > 0).then_some(weight_index);
+
+        let n_cases: u32 = endian.parse(read_bytes(r)?);
+        let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
+
+        let bias: f64 = endian.parse(read_bytes(r)?);
+        if bias != 100.0 && bias != 0.0 {
+            warn(Warning::UnexpectedBias(bias));
+        }
+
+        let creation_date = RawString(read_vec(r, 9)?);
+        let creation_time = RawString(read_vec(r, 8)?);
+        let file_label = RawString(read_vec(r, 64)?);
+        let _: [u8; 3] = read_bytes(r)?; // Padding at the end of the header.
+
+        Ok(HeaderRecord {
+            offsets: start..r.stream_position()?,
+            magic,
+            layout_code,
+            nominal_case_size,
+            compression,
+            weight_index,
+            n_cases,
+            bias,
+            creation_date,
+            creation_time,
+            eye_catcher,
+            file_label,
+            endian,
+        })
+    }
+
+    /// Decodes the header's string fields from the file's encoding, yielding
+    /// a [DecodedRecord::Header].
+    pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
+        let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
+        let file_label = decoder.decode(&self.file_label).to_string();
+        let creation_date = decoder.decode(&self.creation_date).to_string();
+        let creation_time = decoder.decode(&self.creation_time).to_string();
+        DecodedRecord::Header(HeaderRecord {
+            eye_catcher,
+            weight_index: self.weight_index,
+            n_cases: self.n_cases,
+            file_label,
+            // `self` is consumed, so the remaining fields move out directly.
+            offsets: self.offsets,
+            magic: self.magic,
+            layout_code: self.layout_code,
+            nominal_case_size: self.nominal_case_size,
+            compression: self.compression,
+            bias: self.bias,
+            creation_date,
+            creation_time,
+            endian: self.endian,
+        })
+    }
+}
+
+/// [crate::format::Format] as represented in a system file.
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub struct RawFormat(
+ /// The most-significant 16 bits are the type, the next 8 bits are the
+ /// width, and the least-significant 8 bits are the number of decimals.
+ pub u32,
+);
+
+/// Prints the raw hex value plus a decoded `TYPEw.d` rendering.
+impl Debug for RawFormat {
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ let type_ = format_name(self.0 >> 16);
+ let w = (self.0 >> 8) & 0xff;
+ let d = self.0 & 0xff;
+ write!(f, "{:06x} ({type_}{w}.{d})", self.0)
+ }
+}
+
+/// Returns the printable name for format type code `type_`, or a
+/// `<unknown format N>` placeholder (allocated, hence the `Cow`).
+fn format_name(type_: u32) -> Cow<'static, str> {
+ match type_ {
+ 1 => "A",
+ 2 => "AHEX",
+ 3 => "COMMA",
+ 4 => "DOLLAR",
+ 5 => "F",
+ 6 => "IB",
+ 7 => "PIBHEX",
+ 8 => "P",
+ 9 => "PIB",
+ 10 => "PK",
+ 11 => "RB",
+ 12 => "RBHEX",
+ 15 => "Z",
+ 16 => "N",
+ 17 => "E",
+ 20 => "DATE",
+ 21 => "TIME",
+ 22 => "DATETIME",
+ 23 => "ADATE",
+ 24 => "JDATE",
+ 25 => "DTIME",
+ 26 => "WKDAY",
+ 27 => "MONTH",
+ 28 => "MOYR",
+ 29 => "QYR",
+ 30 => "WKYR",
+ 31 => "PCT",
+ 32 => "DOT",
+ 33 => "CCA",
+ 34 => "CCB",
+ 35 => "CCC",
+ 36 => "CCD",
+ 37 => "CCE",
+ 38 => "EDATE",
+ 39 => "SDATE",
+ 40 => "MTIME",
+ 41 => "YMDHMS",
+ _ => return format!("<unknown format {type_}>").into(),
+ }
+ .into()
+}
+
+impl MissingValues {
+ /// Reads the missing values for a variable of width `raw_width` from `r`.
+ ///
+ /// `code` comes from the variable record: 0 = no missing values,
+ /// 1..=3 = that many individual values, -2 = a low..high range,
+ /// -3 = a range plus one individual value; anything else is an error.
+ ///
+ /// For string variables a range is not allowed (warned and dropped) and
+ /// individual values are truncated to at most 8 bytes. A width that is a
+ /// long-string continuation yields a warning and an empty set. `offset`
+ /// is used only for diagnostics.
+ pub fn read<R: Read + Seek>(
+ r: &mut R,
+ offset: u64,
+ raw_width: RawWidth,
+ code: i32,
+ endian: Endian,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Self, Error> {
+ let (individual_values, has_range) = match code {
+ 0 => return Ok(Self::default()),
+ 1..=3 => (code as usize, false),
+ -2 => (0, true),
+ -3 => (1, true),
+ _ => return Err(Error::BadMissingValueCode { offset, code }),
+ };
+
+ // Raw 8-byte values are read first and interpreted afterward, once we
+ // know whether the variable is numeric or string.
+ let mut values = Vec::with_capacity(individual_values);
+ let range = if has_range {
+ let low = read_bytes::<8, _>(r)?;
+ let high = read_bytes::<8, _>(r)?;
+ Some((low, high))
+ } else {
+ None
+ };
+ for _ in 0..individual_values {
+ values.push(read_bytes::<8, _>(r)?);
+ }
+
+ match VarWidth::try_from(raw_width) {
+ Ok(VarWidth::Numeric) => {
+ let values = values
+ .into_iter()
+ .map(|v| Datum::Number(endian.parse(v)))
+ .collect();
+
+ let range = range.map(|(low, high)| {
+ MissingValueRange::new(endian.parse(low), endian.parse(high))
+ });
+ return Ok(Self::new(values, range).unwrap());
+ }
+ Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::MissingValueStringRange),
+ Ok(VarWidth::String(width)) => {
+ // Only the first 8 bytes of a string missing value are stored.
+ let width = width.min(8) as usize;
+ let values = values
+ .into_iter()
+ .map(|value| Datum::String(RawString::from(&value[..width])))
+ .collect();
+ return Ok(Self::new(values, None).unwrap());
+ }
+ Err(()) => warn(Warning::MissingValueContinuation(offset)),
+ }
+ Ok(Self::default())
+ }
+}
+
+/// A raw variable record, as read from a system file.
+///
+/// `S` is the string type: [RawString] raw, [String] once decoded.
+#[derive(Clone)]
+pub struct VariableRecord<S>
+where
+ S: Debug,
+{
+ /// Range of offsets in file.
+ pub offsets: Range<u64>,
+
+ /// Variable width, in the range -1..=255.
+ pub width: RawWidth,
+
+ /// Variable name, padded on the right with spaces.
+ pub name: S,
+
+ /// Print format.
+ pub print_format: RawFormat,
+
+ /// Write format.
+ pub write_format: RawFormat,
+
+ /// Missing values.
+ pub missing_values: MissingValues,
+
+ /// Optional variable label.
+ pub label: Option<S>,
+}
+
+/// Multi-line dump of the record's fields.
+impl<S> Debug for VariableRecord<S>
+where
+ S: Debug,
+{
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ writeln!(f, "Width: {}", self.width,)?;
+ writeln!(f, "Print format: {:?}", self.print_format)?;
+ writeln!(f, "Write format: {:?}", self.write_format)?;
+ writeln!(f, "Name: {:?}", &self.name)?;
+ writeln!(f, "Variable label: {:?}", self.label)?;
+ writeln!(f, "Missing values: {:?}", self.missing_values)
+ }
+}
+
+impl VariableRecord<RawString> {
+ /// Reads a variable record (record type 2, with the type word already
+ /// consumed) from `r`: width, formats, name, optional label, and missing
+ /// values (delegated to [MissingValues::read], which receives `warn`).
+ pub fn read<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Record, Error> {
+ let start_offset = r.stream_position()?;
+ let width: i32 = endian.parse(read_bytes(r)?);
+ let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth {
+ start_offset,
+ width,
+ })?;
+ let code_offset = r.stream_position()?;
+ let has_variable_label: u32 = endian.parse(read_bytes(r)?);
+ let missing_value_code: i32 = endian.parse(read_bytes(r)?);
+ let print_format = RawFormat(endian.parse(read_bytes(r)?));
+ let write_format = RawFormat(endian.parse(read_bytes(r)?));
+ let name = RawString(read_vec(r, 8)?);
+
+ // `has_variable_label` must be exactly 0 (no label) or 1 (label).
+ let label = match has_variable_label {
+ 0 => None,
+ 1 => {
+ let len: u32 = endian.parse(read_bytes(r)?);
+ // Cap the label we keep at 65535 bytes.
+ // NOTE(review): if `len > 65535`, the excess label bytes are
+ // never skipped, leaving `r` mispositioned -- confirm whether
+ // such lengths occur in practice.
+ let read_len = len.min(65535) as usize;
+ let label = RawString(read_vec(r, read_len)?);
+
+ // The label is padded in the file to a multiple of 4 bytes.
+ let padding_bytes = len.next_multiple_of(4) - len;
+ let _ = read_vec(r, padding_bytes as usize)?;
+
+ Some(label)
+ }
+ _ => {
+ return Err(Error::BadVariableLabelCode {
+ start_offset,
+ code_offset,
+ code: has_variable_label,
+ });
+ }
+ };
+
+ let missing_values =
+ MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?;
+
+ let end_offset = r.stream_position()?;
+
+ Ok(Record::Variable(VariableRecord {
+ offsets: start_offset..end_offset,
+ width,
+ name,
+ print_format,
+ write_format,
+ missing_values,
+ label,
+ }))
+ }
+
+ /// Decodes the name and label from the file's encoding.
+ pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
+ DecodedRecord::Variable(VariableRecord {
+ offsets: self.offsets.clone(),
+ width: self.width,
+ name: decoder.decode(&self.name).to_string(),
+ print_format: self.print_format,
+ write_format: self.write_format,
+ missing_values: self.missing_values,
+ label: self
+ .label
+ .as_ref()
+ .map(|label| decoder.decode(label).to_string()),
+ })
+ }
+}
+
+/// One value-to-label pairing within a [ValueLabelRecord].
+#[derive(Clone, Debug)]
+pub struct ValueLabel<D, S>
+where
+ D: Debug,
+ S: Debug,
+{
+ /// The value being labeled.
+ pub datum: D,
+ /// The label text.
+ pub label: S,
+}
+
+/// A value label record (type 3) together with its variable-index record
+/// (type 4).
+#[derive(Clone)]
+pub struct ValueLabelRecord<D, S>
+where
+ D: Debug,
+ S: Debug,
+{
+ /// Range of offsets in file.
+ pub offsets: Range<u64>,
+
+ /// The labels.
+ pub labels: Vec<ValueLabel<D, S>>,
+
+ /// The 1-based indexes of the variables to which the labels apply.
+ pub dict_indexes: Vec<u32>,
+
+ /// The types of the variables.
+ pub var_type: VarType,
+}
+
+impl<D, S> Debug for ValueLabelRecord<D, S>
+where
+ D: Debug,
+ S: Debug,
+{
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ writeln!(f, "labels: ")?;
+ for label in self.labels.iter() {
+ writeln!(f, "{label:?}")?;
+ }
+ write!(f, "apply to {} variables", self.var_type)?;
+ for dict_index in self.dict_indexes.iter() {
+ write!(f, " #{dict_index}")?;
+ }
+ Ok(())
+ }
+}
+
+impl<D, S> ValueLabelRecord<D, S>
+where
+ D: Debug,
+ S: Debug,
+{
+ /// Maximum number of value labels in a record.
+ pub const MAX_LABELS: u32 = u32::MAX / 8;
+
+ /// Maximum number of variable indexes in a record.
+ pub const MAX_INDEXES: u32 = u32::MAX / 8;
+}
+
+impl ValueLabelRecord<RawDatum, RawString> {
+    /// Reads a value-label record (type 3) and the variable-index record
+    /// (type 4) that must immediately follow it.
+    ///
+    /// Returns `Ok(None)`, after warning, if no usable variable indexes
+    /// remain.  `var_types` validates the 1-based indexes and determines
+    /// whether the labeled values are numeric or string.
+    pub fn read<R: Read + Seek>(
+        r: &mut R,
+        endian: Endian,
+        var_types: &VarTypes,
+        warn: &mut dyn FnMut(Warning),
+    ) -> Result<Option<Record>, Error> {
+        let label_offset = r.stream_position()?;
+        let n: u32 = endian.parse(read_bytes(r)?);
+        if n > Self::MAX_LABELS {
+            return Err(Error::BadNumberOfValueLabels {
+                offset: label_offset,
+                n,
+                max: Self::MAX_LABELS,
+            });
+        }
+
+        // Each label is an 8-byte raw value, a 1-byte length, then the label
+        // text padded so that length byte plus text occupy a multiple of 8.
+        let mut labels = Vec::new();
+        for _ in 0..n {
+            let value = super::UntypedDatum(read_bytes(r)?);
+            let label_len: u8 = endian.parse(read_bytes(r)?);
+            let label_len = label_len as usize;
+            let padded_len = (label_len + 1).next_multiple_of(8);
+
+            let mut label = read_vec(r, padded_len - 1)?;
+            label.truncate(label_len);
+            labels.push((value, RawString(label)));
+        }
+
+        let index_offset = r.stream_position()?;
+        let rec_type: u32 = endian.parse(read_bytes(r)?);
+        if rec_type != 4 {
+            return Err(Error::ExpectedVarIndexRecord {
+                offset: index_offset,
+                rec_type,
+            });
+        }
+
+        let n: u32 = endian.parse(read_bytes(r)?);
+        if n > Self::MAX_INDEXES {
+            return Err(Error::TooManyVarIndexes {
+                offset: index_offset,
+                n,
+                max: Self::MAX_INDEXES,
+            });
+        } else if n == 0 {
+            // (A stray `dbg!();` left over from debugging was removed here.)
+            warn(Warning::NoVarIndexes {
+                offset: index_offset,
+            });
+            return Ok(None);
+        }
+
+        // Partition indexes into valid and invalid, warning about (but
+        // otherwise ignoring) the invalid ones.
+        let index_offset = r.stream_position()?;
+        let mut dict_indexes = Vec::with_capacity(n as usize);
+        let mut invalid_indexes = Vec::new();
+        for _ in 0..n {
+            let index: u32 = endian.parse(read_bytes(r)?);
+            if var_types.is_valid_index(index as usize) {
+                dict_indexes.push(index);
+            } else {
+                invalid_indexes.push(index);
+            }
+        }
+        if !invalid_indexes.is_empty() {
+            warn(Warning::InvalidVarIndexes {
+                offset: index_offset,
+                max: var_types.n_values(),
+                invalid: invalid_indexes,
+            });
+        }
+
+        let Some(&first_index) = dict_indexes.first() else {
+            return Ok(None);
+        };
+        // All variables must share the first variable's type; drop (and warn
+        // about) any that do not.
+        let var_type = VarType::from(var_types.types[first_index as usize - 1].unwrap());
+        let mut wrong_type_indexes = Vec::new();
+        dict_indexes.retain(|&index| {
+            if var_types.types[index as usize - 1].map(VarType::from) != Some(var_type) {
+                wrong_type_indexes.push(index);
+                false
+            } else {
+                true
+            }
+        });
+        if !wrong_type_indexes.is_empty() {
+            warn(Warning::MixedVarTypes {
+                offset: index_offset,
+                var_type,
+                wrong_types: wrong_type_indexes,
+            });
+        }
+
+        // Now that the type is known, interpret the raw 8-byte values.
+        let labels = labels
+            .into_iter()
+            .map(|(value, label)| ValueLabel {
+                datum: RawDatum::from_raw(&value, var_type, endian),
+                label,
+            })
+            .collect();
+
+        let end_offset = r.stream_position()?;
+        Ok(Some(Record::ValueLabel(ValueLabelRecord {
+            offsets: label_offset..end_offset,
+            labels,
+            dict_indexes,
+            var_type,
+        })))
+    }
+
+    /// Decodes the label text from the file's encoding; the data themselves
+    /// stay raw.
+    pub fn decode(self, decoder: &mut Decoder) -> ValueLabelRecord<RawDatum, String> {
+        let labels = self
+            .labels
+            .iter()
+            .map(
+                |ValueLabel {
+                     datum: value,
+                     label,
+                 }| ValueLabel {
+                    datum: value.clone(),
+                    label: decoder.decode(label).to_string(),
+                },
+            )
+            .collect();
+        ValueLabelRecord {
+            // `self` is consumed, so move the remaining fields out directly.
+            offsets: self.offsets,
+            labels,
+            dict_indexes: self.dict_indexes,
+            var_type: self.var_type,
+        }
+    }
+}
+
+/// A document record: free-form text attached to the file's dictionary.
+#[derive(Clone, Debug)]
+pub struct DocumentRecord<S>
+where
+ S: Debug,
+{
+ pub offsets: Range<u64>,
+
+ /// The document, as an array of lines. Raw lines are exactly 80 bytes long
+ /// and are right-padded with spaces without any new-line termination.
+ pub lines: Vec<S>,
+}
+
+pub type RawDocumentLine = RawStrArray<DOC_LINE_LEN>;
+
+/// Length of a line in a document. Document lines are fixed-length and
+/// padded on the right with spaces.
+pub const DOC_LINE_LEN: usize = 80;
+
+impl DocumentRecord<RawDocumentLine> {
+ /// Maximum number of lines we will accept in a document. This is simply
+ /// the maximum number that will fit in a 32-bit space.
+ pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
+
+ /// Reads a document record: a line count followed by that many fixed
+ /// 80-byte lines.
+ pub fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
+ let start_offset = r.stream_position()?;
+ let n: u32 = endian.parse(read_bytes(r)?);
+ let n = n as usize;
+ if n > Self::MAX_LINES {
+ Err(Error::BadDocumentLength {
+ offset: start_offset,
+ n,
+ max: Self::MAX_LINES,
+ })
+ } else {
+ let mut lines = Vec::with_capacity(n);
+ for _ in 0..n {
+ lines.push(RawStrArray(read_bytes(r)?));
+ }
+ let end_offset = r.stream_position()?;
+ Ok(Record::Document(DocumentRecord {
+ offsets: start_offset..end_offset,
+ lines,
+ }))
+ }
+ }
+
+ /// Decodes each line from the file's encoding.
+ pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
+ DecodedRecord::Document(DocumentRecord {
+ offsets: self.offsets.clone(),
+ lines: self
+ .lines
+ .iter()
+ .map(|s| decoder.decode_slice(&s.0).to_string())
+ .collect(),
+ })
+ }
+}
+
+/// Expected layout of an extension (type 7) record, used to validate one
+/// before parsing (see the `check_size` calls on `Extension`).
+pub struct ExtensionRecord<'a> {
+ /// Expected size of each data element in bytes, or `None` for any size.
+ pub size: Option<u32>,
+ /// Expected number of data elements, or `None` for any count.
+ pub count: Option<u32>,
+ /// Human-readable record name for diagnostics.
+ pub name: &'a str,
+}
+
+/// Machine integer info record (extension subtype with version and
+/// machine/encoding codes).
+#[derive(Clone, Debug)]
+pub struct IntegerInfoRecord {
+ pub offsets: Range<u64>,
+ pub version: (i32, i32, i32),
+ pub machine_code: i32,
+ pub floating_point_rep: i32,
+ pub compression_code: i32,
+ pub endianness: i32,
+ pub character_code: i32,
+}
+
+/// Expected layout: eight 4-byte integers.
+static INTEGER_INFO_RECORD: ExtensionRecord = ExtensionRecord {
+ size: Some(4),
+ count: Some(8),
+ name: "integer record",
+};
+
+impl IntegerInfoRecord {
+ /// Parses the integer info record from an extension record's data.
+ pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+ ext.check_size(&INTEGER_INFO_RECORD)?;
+
+ let mut input = &ext.data[..];
+ // `unwrap()` cannot fail: `check_size` verified the data holds
+ // exactly 8 elements of 4 bytes each.
+ let data: Vec<i32> = (0..8)
+ .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
+ .collect();
+ Ok(Record::IntegerInfo(IntegerInfoRecord {
+ offsets: ext.offsets.clone(),
+ version: (data[0], data[1], data[2]),
+ machine_code: data[3],
+ floating_point_rep: data[4],
+ compression_code: data[5],
+ endianness: data[6],
+ character_code: data[7],
+ }))
+ }
+}
+
+/// Expected layout: three 8-byte floating-point values.
+static FLOAT_INFO_RECORD: ExtensionRecord = ExtensionRecord {
+ size: Some(8),
+ count: Some(3),
+ name: "floating point record",
+};
+
+impl FloatInfoRecord {
+ /// Parses the floating point info record from an extension record's data.
+ pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+ ext.check_size(&FLOAT_INFO_RECORD)?;
+
+ let mut input = &ext.data[..];
+ // `unwrap()` cannot fail: `check_size` verified 3 elements of 8 bytes.
+ let data: Vec<f64> = (0..3)
+ .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
+ .collect();
+ Ok(Record::FloatInfo(FloatInfoRecord {
+ sysmis: data[0],
+ highest: data[1],
+ lowest: data[2],
+ }))
+ }
+}
+
+/// Machine floating-point info record: the file's system-missing, highest,
+/// and lowest representable values.
+#[derive(Clone, Debug)]
+pub struct FloatInfoRecord {
+ pub sysmis: f64,
+ pub highest: f64,
+ pub lowest: f64,
+}
+
+/// Undecoded long variable names extension record.
+#[derive(Clone, Debug)]
+pub struct RawLongNamesRecord(TextRecord);
+
+impl RawLongNamesRecord {
+ /// Wraps the extension record as a text record for later decoding.
+ pub fn parse(extension: Extension) -> Result<Record, Warning> {
+ Ok(Record::LongNames(Self(TextRecord::parse(
+ extension,
+ "long names record",
+ )?)))
+ }
+ /// Decodes the text and parses the tab-separated `SHORT=LONG` pairs,
+ /// warning about (and skipping) malformed ones.
+ pub fn decode(self, decoder: &mut Decoder) -> LongNamesRecord {
+ let input = decoder.decode(&self.0.text);
+ let mut names = Vec::new();
+ for pair in input.split('\t').filter(|s| !s.is_empty()) {
+ if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&mut decoder.warn)
+ {
+ names.push(long_name);
+ }
+ }
+ LongNamesRecord(names)
+ }
+}
+
+/// An extension record whose payload is uninterpreted text.
+#[derive(Clone, Debug)]
+pub struct TextRecord {
+ pub offsets: Range<u64>,
+
+ /// The text content of the record.
+ pub text: RawString,
+}
+
+impl TextRecord {
+ /// Validates that `extension` has 1-byte elements and takes its data as
+ /// raw text.
+ pub fn parse(extension: Extension, name: &str) -> Result<TextRecord, Warning> {
+ extension.check_size(&ExtensionRecord {
+ size: Some(1),
+ count: None,
+ name,
+ })?;
+ Ok(Self {
+ offsets: extension.offsets,
+ text: extension.data.into(),
+ })
+ }
+}
+
+/// One entry of the very long string record: a short name mapped to the
+/// string variable's true length.
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+ pub short_name: Identifier,
+ pub length: u16,
+}
+
+impl VeryLongString {
+ /// Parses one `NAME=LENGTH` entry.
+ fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
+ let Some((short_name, length)) = input.split_once('=') else {
+ return Err(Warning::VeryLongStringMissingDelimiter(input.into()));
+ };
+ let short_name = decoder
+ .new_identifier(short_name)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(Warning::InvalidLongStringName)?;
+ let length = length
+ .parse()
+ .map_err(|_| Warning::VeryLongStringInvalidLength(input.into()))?;
+ Ok(VeryLongString { short_name, length })
+ }
+}
+
+/// Undecoded very long strings extension record.
+#[derive(Clone, Debug)]
+pub struct RawVeryLongStringsRecord(TextRecord);
+
+/// Decoded very long strings record.
+#[derive(Clone, Debug)]
+pub struct VeryLongStringsRecord(pub Vec<VeryLongString>);
+
+impl RawVeryLongStringsRecord {
+ /// Wraps the extension record as a text record for later decoding.
+ pub fn parse(extension: Extension) -> Result<Record, Warning> {
+ Ok(Record::VeryLongStrings(Self(TextRecord::parse(
+ extension,
+ "very long strings record",
+ )?)))
+ }
+ /// Decodes the text and parses NUL-separated, tab-trimmed entries,
+ /// warning about (and skipping) malformed ones.
+ pub fn decode(self, decoder: &mut Decoder) -> VeryLongStringsRecord {
+ let input = decoder.decode(&self.0.text);
+ let mut very_long_strings = Vec::new();
+ for tuple in input
+ .split('\0')
+ .map(|s| s.trim_start_matches('\t'))
+ .filter(|s| !s.is_empty())
+ {
+ if let Some(vls) =
+ VeryLongString::parse(decoder, tuple).issue_warning(&mut decoder.warn)
+ {
+ very_long_strings.push(vls)
+ }
+ }
+ VeryLongStringsRecord(very_long_strings)
+ }
+}
+
+/// The kind of a multiple response set.
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+ MultipleDichotomy {
+ value: RawString,
+ labels: CategoryLabels,
+ },
+ MultipleCategory,
+}
+
+impl MultipleResponseType {
+ /// Parses the type tag at the start of `input`, returning the type and
+ /// the remaining bytes.
+ ///
+ /// `C` is a multiple category set; `D` is a multiple dichotomy set with
+ /// a counted value and variable-label categories; `E` is an extended
+ /// dichotomy form where ` 1 ` selects counted-value categories and
+ /// ` 11 ` selects variable-label categories.
+ fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
+ let (mr_type, input) = match input.split_first() {
+ Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
+ Some((b'D', input)) => {
+ let (value, input) = parse_counted_string(input)?;
+ (
+ MultipleResponseType::MultipleDichotomy {
+ value,
+ labels: CategoryLabels::VarLabels,
+ },
+ input,
+ )
+ }
+ Some((b'E', input)) => {
+ let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
+ (CategoryLabels::CountedValues, rest)
+ } else if let Some(rest) = input.strip_prefix(b" 11 ") {
+ (CategoryLabels::VarLabels, rest)
+ } else {
+ return Err(Warning::InvalidMultipleDichotomyLabelType);
+ };
+ let (value, input) = parse_counted_string(input)?;
+ (
+ MultipleResponseType::MultipleDichotomy { value, labels },
+ input,
+ )
+ }
+ _ => return Err(Warning::InvalidMultipleResponseType),
+ };
+ Ok((mr_type, input))
+ }
+}
+
+/// One multiple response set: a name, label, type, and member variables.
+///
+/// `I` is the identifier type and `S` the string type (raw or decoded).
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet<I, S>
+where
+ I: Debug,
+ S: Debug,
+{
+ pub name: I,
+ pub label: S,
+ pub mr_type: MultipleResponseType,
+ pub short_names: Vec<I>,
+}
+
+impl MultipleResponseSet<RawString, RawString> {
+ /// Parses one `NAME=TYPE LABEL var var ...\n` set from `input`, returning
+ /// the set and the bytes following its trailing newline(s).
+ fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
+ let Some(equals) = input.iter().position(|&b| b == b'=') else {
+ return Err(Warning::MultipleResponseSyntaxError("missing `=`"));
+ };
+ let (name, input) = input.split_at(equals);
+ let input = input.strip_prefix(b"=").unwrap();
+ let (mr_type, input) = MultipleResponseType::parse(input)?;
+ let Some(input) = input.strip_prefix(b" ") else {
+ return Err(Warning::MultipleResponseSyntaxError(
+ "missing space after multiple response type",
+ ));
+ };
+ let (label, mut input) = parse_counted_string(input)?;
+ // Variable names are space-prefixed and run until the set's newline.
+ let mut vars = Vec::new();
+ while input.first() != Some(&b'\n') {
+ match input.split_first() {
+ Some((b' ', rest)) => {
+ let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
+ return Err(Warning::MultipleResponseSyntaxError(
+ "missing variable name delimiter",
+ ));
+ };
+ let (var, rest) = rest.split_at(length);
+ if !var.is_empty() {
+ vars.push(var.into());
+ }
+ input = rest;
+ }
+ _ => {
+ return Err(Warning::MultipleResponseSyntaxError(
+ "missing space preceding variable name",
+ ));
+ }
+ }
+ }
+ while input.first() == Some(&b'\n') {
+ input = &input[1..];
+ }
+ Ok((
+ MultipleResponseSet {
+ name: name.into(),
+ label,
+ mr_type,
+ short_names: vars,
+ },
+ input,
+ ))
+ }
+
+ /// Decodes the set's name, label, and member names; undecodable member
+ /// names are warned about and skipped, but a bad set name is an error.
+ fn decode(
+ &self,
+ decoder: &mut Decoder,
+ ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
+ let mut short_names = Vec::with_capacity(self.short_names.len());
+ for short_name in self.short_names.iter() {
+ if let Some(short_name) = decoder
+ .decode_identifier(short_name)
+ .map_err(Warning::InvalidMrSetName)
+ .issue_warning(&mut decoder.warn)
+ {
+ short_names.push(short_name);
+ }
+ }
+ Ok(MultipleResponseSet {
+ name: decoder
+ .decode_identifier(&self.name)
+ .map_err(Warning::InvalidMrSetVariableName)?,
+ label: decoder.decode(&self.label).to_string(),
+ mr_type: self.mr_type.clone(),
+ short_names,
+ })
+ }
+}
+
+/// A multiple response sets extension record: zero or more sets.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
+where
+ I: Debug,
+ S: Debug;
+
+static MULTIPLE_RESPONSE_RECORD: ExtensionRecord = ExtensionRecord {
+ size: Some(1),
+ count: None,
+ name: "multiple response set record",
+};
+
+impl MultipleResponseRecord<RawString, RawString> {
+ /// Parses the newline-separated sets in the extension record's data.
+ fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
+ ext.check_size(&MULTIPLE_RESPONSE_RECORD)?;
+
+ let mut input = &ext.data[..];
+ let mut sets = Vec::new();
+ loop {
+ while let Some(suffix) = input.strip_prefix(b"\n") {
+ input = suffix;
+ }
+ if input.is_empty() {
+ break;
+ }
+ let (set, rest) = MultipleResponseSet::parse(input)?;
+ sets.push(set);
+ input = rest;
+ }
+ Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
+ }
+}
+
+impl MultipleResponseRecord<RawString, RawString> {
+ /// Decodes each set, warning about (and dropping) undecodable ones.
+ pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
+ let mut sets = Vec::new();
+ for set in self.0.iter() {
+ if let Some(set) = set.decode(decoder).issue_warning(&mut decoder.warn) {
+ sets.push(set);
+ }
+ }
+ DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
+ }
+}
+
+/// Parses a counted string of the form `LENGTH SPACE BYTES`, where `LENGTH`
+/// is the decimal byte count of `BYTES`. Returns the string and the
+/// remaining input.
+fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
+ let Some(space) = input.iter().position(|&b| b == b' ') else {
+ return Err(Warning::CountedStringMissingSpace);
+ };
+ let Ok(length) = from_utf8(&input[..space]) else {
+ return Err(Warning::CountedStringInvalidUTF8);
+ };
+ let Ok(length): Result<usize, _> = length.parse() else {
+ return Err(Warning::CountedStringInvalidLength(length.into()));
+ };
+
+ // `split_at_checked` fails if fewer than `length` bytes remain.
+ let Some((string, rest)) = input[space + 1..].split_at_checked(length) else {
+ return Err(Warning::CountedStringTooLong(length));
+ };
+ Ok((string.into(), rest))
+}
+
+impl Measure {
+ /// Decodes a measurement level code: 0 = unspecified, 1 = nominal,
+ /// 2 = ordinal, 3 = scale; anything else is a warning.
+ fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
+ match source {
+ 0 => Ok(None),
+ 1 => Ok(Some(Measure::Nominal)),
+ 2 => Ok(Some(Measure::Ordinal)),
+ 3 => Ok(Some(Measure::Scale)),
+ _ => Err(Warning::InvalidMeasurement(source)),
+ }
+ }
+}
+
+impl Alignment {
+ /// Decodes an alignment code: 0 = left, 1 = right, 2 = center; anything
+ /// else is a warning. (Unlike [Measure::try_decode], 0 is a real value.)
+ fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
+ match source {
+ 0 => Ok(Some(Alignment::Left)),
+ 1 => Ok(Some(Alignment::Right)),
+ 2 => Ok(Some(Alignment::Center)),
+ _ => Err(Warning::InvalidAlignment(source)),
+ }
+ }
+}
+
+/// Per-variable display parameters from the variable display extension
+/// record. Each field may be absent.
+#[derive(Clone, Debug)]
+pub struct VarDisplay {
+ pub measure: Option<Measure>,
+ pub width: Option<u32>,
+ pub alignment: Option<Alignment>,
+}
+
+/// Variable display extension record, one [VarDisplay] per variable.
+#[derive(Clone, Debug)]
+pub struct VarDisplayRecord(pub Vec<VarDisplay>);
+
+impl VarDisplayRecord {
+ /// Parses the variable display record.
+ ///
+ /// The record holds either 2 values per variable (measure, alignment) or
+ /// 3 (measure, width, alignment); the element count determines which.
+ /// Invalid measure or alignment codes are warned about and treated as
+ /// absent.
+ fn parse(
+ ext: &Extension,
+ var_types: &VarTypes,
+ endian: Endian,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Record, Warning> {
+ if ext.size != 4 {
+ return Err(Warning::BadRecordSize {
+ offset: ext.offsets.start,
+ record: String::from("variable display record"),
+ size: ext.size,
+ expected_size: 4,
+ });
+ }
+
+ let n_vars = var_types.n_vars();
+ let has_width = if ext.count as usize == 3 * n_vars {
+ true
+ } else if ext.count as usize == 2 * n_vars {
+ false
+ } else {
+ return Err(Warning::InvalidVariableDisplayCount {
+ count: ext.count as usize,
+ first: 2 * n_vars,
+ second: 3 * n_vars,
+ });
+ };
+
+ // `unwrap()` on `read_bytes` cannot fail: the count check above
+ // guarantees the data holds 2 or 3 4-byte values per variable.
+ let mut var_displays = Vec::new();
+ let mut input = &ext.data[..];
+ for _ in 0..n_vars {
+ let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
+ .issue_warning(warn)
+ .flatten();
+ let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
+ let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
+ .issue_warning(warn)
+ .flatten();
+ var_displays.push(VarDisplay {
+ measure,
+ width,
+ alignment,
+ });
+ }
+ Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
+ }
+}
+
+/// Missing values for one long string variable. Only the first 8 bytes of
+/// each missing value are stored.
+#[derive(Clone, Debug)]
+pub struct LongStringMissingValues<N>
+where
+ N: Debug,
+{
+ /// Variable name.
+ pub var_name: N,
+
+ /// Missing values.
+ pub missing_values: Vec<RawStrArray<8>>,
+}
+
+impl LongStringMissingValues<RawString> {
+ /// Decodes the variable name; the values themselves stay raw.
+ fn decode(
+ &self,
+ decoder: &mut Decoder,
+ ) -> Result<LongStringMissingValues<Identifier>, IdError> {
+ Ok(LongStringMissingValues {
+ var_name: decoder.decode_identifier(&self.var_name)?,
+ missing_values: self.missing_values.clone(),
+ })
+ }
+}
+
+/// Long string missing values extension record.
+#[derive(Clone, Debug)]
+pub struct LongStringMissingValueRecord<N>(pub Vec<LongStringMissingValues<N>>)
+where
+ N: Debug;
+
+static LONG_STRING_MISSING_VALUE_RECORD: ExtensionRecord = ExtensionRecord {
+ size: Some(1),
+ count: None,
+ name: "long string missing values record",
+};
+
+impl LongStringMissingValueRecord<RawString> {
+    /// Parses a long string missing values extension record.
+    ///
+    /// Each element is a variable name, a 1-byte count of missing values, a
+    /// 4-byte value length (8 in well-formed files), and that many values.
+    fn parse(
+        ext: &Extension,
+        endian: Endian,
+        warn: &mut dyn FnMut(Warning),
+    ) -> Result<Record, Warning> {
+        ext.check_size(&LONG_STRING_MISSING_VALUE_RECORD)?;
+
+        let mut input = &ext.data[..];
+        let mut missing_value_set = Vec::new();
+        while !input.is_empty() {
+            let var_name = read_string(&mut input, endian)?;
+            let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
+            let value_len: u32 = endian.parse(read_bytes(&mut input)?);
+            if value_len != 8 {
+                // Unexpected value length: warn, skip this variable's values
+                // entirely, and move on to the next element.  (Stray `dbg!`
+                // calls left over from debugging were removed here.)
+                let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
+                warn(Warning::BadLongMissingValueLength {
+                    record_offset: ext.offsets.start,
+                    offset,
+                    value_len,
+                });
+                read_vec(&mut input, value_len as usize * n_missing_values as usize)?;
+                continue;
+            }
+            let mut missing_values = Vec::new();
+            for i in 0..n_missing_values {
+                if i > 0 {
+                    // Tolerate files written by old, buggy versions of PSPP
+                    // where we believed that the value_length was repeated
+                    // before each missing value.
+                    let mut peek = input;
+                    let number: u32 = endian.parse(read_bytes(&mut peek)?);
+                    if number == 8 {
+                        input = peek;
+                    }
+                }
+
+                let value: [u8; 8] = read_bytes(&mut input)?;
+                missing_values.push(RawStrArray(value));
+            }
+            missing_value_set.push(LongStringMissingValues {
+                var_name,
+                missing_values,
+            });
+        }
+        Ok(Record::LongStringMissingValues(
+            LongStringMissingValueRecord(missing_value_set),
+        ))
+    }
+}
+
+impl LongStringMissingValueRecord<RawString> {
+ pub fn decode(self, decoder: &mut Decoder) -> LongStringMissingValueRecord<Identifier> {
+ let mut mvs = Vec::with_capacity(self.0.len());
+ for mv in self.0.iter() {
+ if let Some(mv) = mv
+ .decode(decoder)
+ .map_err(Warning::InvalidLongStringMissingValueVariableName)
+ .issue_warning(&mut decoder.warn)
+ {
+ mvs.push(mv);
+ }
+ }
+ LongStringMissingValueRecord(mvs)
+ }
+}
+
/// A character encoding record, naming the encoding of the file's text.
#[derive(Clone, Debug)]
pub struct EncodingRecord(pub String);

// Expected layout for the extension record: 1-byte elements, any count.
static ENCODING_RECORD: ExtensionRecord = ExtensionRecord {
    size: Some(1),
    count: None,
    name: "encoding record",
};
+
+impl EncodingRecord {
+ fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
+ ext.check_size(&ENCODING_RECORD)?;
+
+ Ok(Record::Encoding(EncodingRecord(
+ String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
+ offset: ext.offsets.start,
+ })?,
+ )))
+ }
+}
+
/// An extended number of cases record.
#[derive(Clone, Debug)]
pub struct NumberOfCasesRecord {
    /// Always observed as 1.
    pub one: u64,

    /// Number of cases.
    pub n_cases: u64,
}

// Expected layout for the extension record: exactly two 8-byte elements.
static NUMBER_OF_CASES_RECORD: ExtensionRecord = ExtensionRecord {
    size: Some(8),
    count: Some(2),
    name: "extended number of cases record",
};
+
+impl NumberOfCasesRecord {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+ ext.check_size(&NUMBER_OF_CASES_RECORD)?;
+
+ let mut input = &ext.data[..];
+ let one = endian.parse(read_bytes(&mut input)?);
+ let n_cases = endian.parse(read_bytes(&mut input)?);
+
+ Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct RawVariableSetRecord(TextRecord);
+
+impl RawVariableSetRecord {
+ fn parse(extension: Extension) -> Result<Record, Warning> {
+ Ok(Record::VariableSets(Self(TextRecord::parse(
+ extension,
+ "variable sets record",
+ )?)))
+ }
+ pub fn decode(self, decoder: &mut Decoder) -> VariableSetRecord {
+ let mut sets = Vec::new();
+ let input = decoder.decode(&self.0.text);
+ for line in input.lines() {
+ if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&mut decoder.warn) {
+ sets.push(set)
+ }
+ }
+ VariableSetRecord {
+ offsets: self.0.offsets,
+ sets,
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct RawProductInfoRecord(TextRecord);
+
+impl RawProductInfoRecord {
+ fn parse(extension: Extension) -> Result<Record, Warning> {
+ Ok(Record::ProductInfo(Self(TextRecord::parse(
+ extension,
+ "product info record",
+ )?)))
+ }
+ pub fn decode(self, decoder: &mut Decoder) -> ProductInfoRecord {
+ ProductInfoRecord(decoder.decode(&self.0.text).into())
+ }
+}
+
/// A single attribute: a name plus one or more values.
#[derive(Clone, Debug)]
pub struct Attribute {
    /// Attribute name.
    pub name: Identifier,
    /// Attribute values.
    pub values: Vec<String>,
}
+
impl Attribute {
    /// Parses one attribute, written as `name(value\n[value\n]...)`, from
    /// `input`, returning the attribute and the text that follows it.
    fn parse<'a>(decoder: &mut Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
        let Some((name, mut input)) = input.split_once('(') else {
            return Err(Warning::AttributeMissingLParen(input.into()));
        };
        let name = decoder
            .new_identifier(name)
            .map_err(Warning::InvalidAttributeName)?;
        let mut values = Vec::new();
        loop {
            // Each value occupies one line.
            let Some((value, rest)) = input.split_once('\n') else {
                return Err(Warning::AttributeMissingValue {
                    name: name.clone(),
                    index: values.len(),
                });
            };
            // Values should be 'single-quoted'; tolerate unquoted values
            // with a warning.
            if let Some(stripped) = value
                .strip_prefix('\'')
                .and_then(|value| value.strip_suffix('\''))
            {
                values.push(stripped.into());
            } else {
                decoder.warn(Warning::AttributeMissingQuotes {
                    name: name.clone(),
                    index: values.len(),
                });
                values.push(value.into());
            }
            // A `)` directly after the newline ends the attribute.
            if let Some(rest) = rest.strip_prefix(')') {
                let attribute = Attribute { name, values };
                return Ok((attribute, rest));
            };
            input = rest;
        }
    }
}
+
impl Attributes {
    /// Parses a sequence of attributes from `input`, stopping at end of
    /// input or at `sentinel` (which is consumed), whichever comes first.
    /// Returns the attributes, the remaining input, and the names of any
    /// duplicate attributes (later values replace earlier ones).
    fn parse<'a>(
        decoder: &mut Decoder,
        mut input: &'a str,
        sentinel: Option<char>,
    ) -> Result<(Attributes, &'a str, Vec<Identifier>), Warning> {
        let mut attributes = BTreeMap::new();
        let mut duplicates = Vec::new();
        let rest = loop {
            match input.chars().next() {
                None => break input,
                // NOTE(review): skipping one byte assumes the sentinel is an
                // ASCII character — holds for the '/' used by callers.
                c if c == sentinel => break &input[1..],
                _ => {
                    let (attribute, rest) = Attribute::parse(decoder, input)?;
                    if attributes.contains_key(&attribute.name) {
                        duplicates.push(attribute.name.clone());
                    }
                    attributes.insert(attribute.name, attribute.values);
                    input = rest;
                }
            }
        };
        Ok((Attributes(attributes), rest, duplicates))
    }
}
+
/// An undecoded file attributes record (subtype 17).
#[derive(Clone, Debug)]
pub struct RawFileAttributesRecord(TextRecord);

/// A decoded file attributes record.
#[derive(Clone, Debug, Default)]
pub struct FileAttributesRecord(pub Attributes);
+
+impl RawFileAttributesRecord {
+ fn parse(extension: Extension) -> Result<Record, Warning> {
+ Ok(Record::FileAttributes(Self(TextRecord::parse(
+ extension,
+ "file attributes record",
+ )?)))
+ }
+ pub fn decode(self, decoder: &mut Decoder) -> FileAttributesRecord {
+ let input = decoder.decode(&self.0.text);
+ match Attributes::parse(decoder, &input, None).issue_warning(&mut decoder.warn) {
+ Some((set, rest, duplicates)) => {
+ if !duplicates.is_empty() {
+ decoder.warn(Warning::DuplicateFileAttributes {
+ attributes: duplicates,
+ });
+ }
+ if !rest.is_empty() {
+ decoder.warn(dbg!(Warning::TBD));
+ }
+ FileAttributesRecord(set)
+ }
+ None => FileAttributesRecord::default(),
+ }
+ }
+}
+
/// Attributes of one variable.
#[derive(Clone, Debug)]
pub struct VarAttributes {
    /// The variable's (long) name.
    pub long_var_name: Identifier,
    /// The variable's attributes.
    pub attributes: Attributes,
}
+
+impl VarAttributes {
+ fn parse<'a>(
+ decoder: &mut Decoder,
+ input: &'a str,
+ ) -> Result<(VarAttributes, &'a str), Warning> {
+ let Some((long_var_name, rest)) = input.split_once(':') else {
+ return Err(dbg!(Warning::TBD));
+ };
+ let long_var_name = decoder
+ .new_identifier(long_var_name)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(Warning::InvalidAttributeVariableName)?;
+ let (attributes, rest, duplicates) = Attributes::parse(decoder, rest, Some('/'))?;
+ if !duplicates.is_empty() {
+ decoder.warn(Warning::DuplicateVariableAttributes {
+ variable: long_var_name.clone(),
+ attributes: duplicates,
+ });
+ }
+ let var_attribute = VarAttributes {
+ long_var_name,
+ attributes,
+ };
+ Ok((var_attribute, rest))
+ }
+}
+
/// An undecoded variable attributes record (subtype 18).
#[derive(Clone, Debug)]
pub struct RawVariableAttributesRecord(TextRecord);

/// A decoded variable attributes record.
#[derive(Clone, Debug)]
pub struct VariableAttributesRecord(pub Vec<VarAttributes>);
+
+impl RawVariableAttributesRecord {
+ fn parse(extension: Extension) -> Result<Record, Warning> {
+ Ok(Record::VariableAttributes(Self(TextRecord::parse(
+ extension,
+ "variable attributes record",
+ )?)))
+ }
+ pub fn decode(self, decoder: &mut Decoder) -> VariableAttributesRecord {
+ let decoded = decoder.decode(&self.0.text);
+ let mut input = decoded.as_ref();
+ let mut var_attribute_sets = Vec::new();
+ while !input.is_empty() {
+ let Some((var_attribute, rest)) =
+ VarAttributes::parse(decoder, input).issue_warning(&mut decoder.warn)
+ else {
+ break;
+ };
+ var_attribute_sets.push(var_attribute);
+ input = rest;
+ }
+ VariableAttributesRecord(var_attribute_sets)
+ }
+}
+
/// A short-name-to-long-name mapping from a long variable names record.
#[derive(Clone, Debug)]
pub struct LongName {
    /// The variable's short name.
    pub short_name: Identifier,
    /// The variable's long name.
    pub long_name: Identifier,
}
+
+impl LongName {
+ fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+ let Some((short_name, long_name)) = input.split_once('=') else {
+ return Err(dbg!(Warning::LongNameMissingEquals));
+ };
+ let short_name = decoder
+ .new_identifier(short_name)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(Warning::InvalidShortName)?;
+ let long_name = decoder
+ .new_identifier(long_name)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(Warning::InvalidLongName)?;
+ Ok(LongName {
+ short_name,
+ long_name,
+ })
+ }
+}
+
/// A decoded long variable names record.
#[derive(Clone, Debug)]
pub struct LongNamesRecord(pub Vec<LongName>);

/// A decoded product info record.
#[derive(Clone, Debug)]
pub struct ProductInfoRecord(pub String);

/// A named set of variables.
#[derive(Clone, Debug)]
pub struct VariableSet {
    /// The set's name.
    pub name: String,
    /// Names of the variables in the set.
    pub variable_names: Vec<Identifier>,
}
+
+impl VariableSet {
+ fn parse(input: &str, decoder: &mut Decoder) -> Result<Self, Warning> {
+ let (name, input) = input
+ .split_once('=')
+ .ok_or(Warning::VariableSetMissingEquals)?;
+ let mut vars = Vec::new();
+ for var in input.split_ascii_whitespace() {
+ if let Some(identifier) = decoder
+ .new_identifier(var)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(Warning::InvalidVariableSetName)
+ .issue_warning(&mut decoder.warn)
+ {
+ vars.push(identifier);
+ }
+ }
+ Ok(VariableSet {
+ name: name.to_string(),
+ variable_names: vars,
+ })
+ }
+}
+
/// A decoded variable sets record.
#[derive(Clone, Debug)]
pub struct VariableSetRecord {
    /// File offsets of the text record this was decoded from.
    pub offsets: Range<u64>,
    /// The variable sets.
    pub sets: Vec<VariableSet>,
}
+
+trait IssueWarning<T> {
+ fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option<T>;
+}
+impl<T> IssueWarning<T> for Result<T, Warning> {
+ fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option<T> {
+ match self {
+ Ok(result) => Some(result),
+ Err(error) => {
+ warn(error);
+ None
+ }
+ }
+ }
+}
+
/// A raw extension (type 7) record.
#[derive(Clone, Debug)]
pub struct Extension {
    /// Range of file offsets occupied by the record's data.
    pub offsets: Range<u64>,

    /// Record subtype.
    pub subtype: u32,

    /// Size of each data element.
    pub size: u32,

    /// Number of data elements.
    pub count: u32,

    /// `size * count` bytes of data.
    pub data: Vec<u8>,
}
+
impl Extension {
    /// Checks this record's `size` and `count` against `expected`, returning
    /// a warning naming the record type on a mismatch.  A `None` in
    /// `expected` means any value is acceptable.
    pub fn check_size(&self, expected: &ExtensionRecord) -> Result<(), Warning> {
        match expected.size {
            Some(expected_size) if self.size != expected_size => {
                return Err(Warning::BadRecordSize {
                    offset: self.offsets.start,
                    record: expected.name.into(),
                    size: self.size,
                    expected_size,
                });
            }
            _ => (),
        }
        match expected.count {
            Some(expected_count) if self.count != expected_count => {
                return Err(Warning::BadRecordCount {
                    offset: self.offsets.start,
                    record: expected.name.into(),
                    count: self.count,
                    expected_count,
                });
            }
            _ => (),
        }
        Ok(())
    }

    /// Reads one extension record from `r` and dispatches it to the parser
    /// for its subtype.  Unknown subtypes are kept as
    /// [Record::OtherExtension].
    ///
    /// Returns `Ok(None)` when the record fails to parse; the problem is
    /// reported through `warn` in that case.
    pub fn read<R: Read + Seek>(
        r: &mut R,
        endian: Endian,
        var_types: &VarTypes,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Option<Record>, Error> {
        let subtype = endian.parse(read_bytes(r)?);
        // Offset of the size/count fields, used for error reporting.
        let header_offset = r.stream_position()?;
        let size: u32 = endian.parse(read_bytes(r)?);
        let count = endian.parse(read_bytes(r)?);
        // `size * count` can overflow on a corrupted file; fail cleanly.
        let Some(product) = size.checked_mul(count) else {
            return Err(Error::ExtensionRecordTooLarge {
                offset: header_offset,
                subtype,
                size,
                count,
            });
        };
        let start_offset = r.stream_position()?;
        let data = read_vec(r, product as usize)?;
        let end_offset = start_offset + product as u64;
        let extension = Extension {
            offsets: start_offset..end_offset,
            subtype,
            size,
            count,
            data,
        };
        let result = match subtype {
            3 => IntegerInfoRecord::parse(&extension, endian),
            4 => FloatInfoRecord::parse(&extension, endian),
            11 => VarDisplayRecord::parse(&extension, var_types, endian, warn),
            7 | 19 => MultipleResponseRecord::parse(&extension, endian),
            21 => LongStringValueLabelRecord::parse(&extension, endian),
            22 => LongStringMissingValueRecord::parse(&extension, endian, warn),
            20 => EncodingRecord::parse(&extension, endian),
            16 => NumberOfCasesRecord::parse(&extension, endian),
            5 => RawVariableSetRecord::parse(extension),
            10 => RawProductInfoRecord::parse(extension),
            13 => RawLongNamesRecord::parse(extension),
            14 => RawVeryLongStringsRecord::parse(extension),
            17 => RawFileAttributesRecord::parse(extension),
            18 => RawVariableAttributesRecord::parse(extension),
            _ => Ok(Record::OtherExtension(extension)),
        };
        // A parse failure in an extension record is recoverable: warn and
        // drop the record.
        match result {
            Ok(result) => Ok(Some(result)),
            Err(error) => {
                warn(error);
                Ok(None)
            }
        }
    }
}
+
/// Value labels for one long string variable.
#[derive(Clone, Debug)]
pub struct LongStringValueLabels<N, S>
where
    S: Debug,
{
    /// Variable name.
    pub var_name: N,
    /// Variable width.
    pub width: u32,

    /// `(value, label)` pairs, where each value is `width` bytes.
    pub labels: Vec<(RawString, S)>,
}
+
+impl LongStringValueLabels<RawString, RawString> {
+ fn decode(
+ &self,
+ decoder: &mut Decoder,
+ ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
+ let var_name = decoder.decode(&self.var_name);
+ let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
+ .map_err(Warning::InvalidLongStringValueLabelName)?;
+
+ let mut labels = Vec::with_capacity(self.labels.len());
+ for (value, label) in self.labels.iter() {
+ let label = decoder.decode(label).to_string();
+ labels.push((value.clone(), label));
+ }
+
+ Ok(LongStringValueLabels {
+ var_name,
+ width: self.width,
+ labels,
+ })
+ }
+}
+
/// A long string value labels record, with one entry per variable.
#[derive(Clone, Debug)]
pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
where
    N: Debug,
    S: Debug;

// Expected layout for the extension record: 1-byte elements, any count.
static LONG_STRING_VALUE_LABEL_RECORD: ExtensionRecord = ExtensionRecord {
    size: Some(1),
    count: None,
    name: "long string value labels record",
};
+
+impl LongStringValueLabelRecord<RawString, RawString> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+ ext.check_size(&LONG_STRING_VALUE_LABEL_RECORD)?;
+
+ let mut input = &ext.data[..];
+ let mut label_set = Vec::new();
+ while !input.is_empty() {
+ let var_name = read_string(&mut input, endian)?;
+ let width: u32 = endian.parse(read_bytes(&mut input)?);
+ let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
+ let mut labels = Vec::new();
+ for _ in 0..n_labels {
+ let value = read_string(&mut input, endian)?;
+ let label = read_string(&mut input, endian)?;
+ labels.push((value, label));
+ }
+ label_set.push(LongStringValueLabels {
+ var_name,
+ width,
+ labels,
+ })
+ }
+ Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
+ label_set,
+ )))
+ }
+}
+
+impl LongStringValueLabelRecord<RawString, RawString> {
+ pub fn decode(self, decoder: &mut Decoder) -> LongStringValueLabelRecord<Identifier, String> {
+ let mut labels = Vec::with_capacity(self.0.len());
+ for label in &self.0 {
+ match label.decode(decoder) {
+ Ok(set) => labels.push(set),
+ Err(error) => decoder.warn(error),
+ }
+ }
+ LongStringValueLabelRecord(labels)
+ }
+}
+
/// The fixed-size header that introduces the ZLIB data in a compressed
/// system file.
#[derive(Clone, Debug)]
pub struct ZHeader {
    /// File offset to the start of the record.
    pub offset: u64,

    /// File offset to the ZLIB data header.
    pub zheader_offset: u64,

    /// File offset to the ZLIB trailer.
    pub ztrailer_offset: u64,

    /// Length of the ZLIB trailer in bytes.
    pub ztrailer_len: u64,
}
+
+impl ZHeader {
+ pub fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
+ let offset = r.stream_position()?;
+ let zheader_offset: u64 = endian.parse(read_bytes(r)?);
+ let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
+ let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
+
+ if zheader_offset != offset {
+ return Err(Error::UnexpectedZHeaderOffset {
+ actual: zheader_offset,
+ expected: offset,
+ });
+ }
+
+ if ztrailer_offset < offset {
+ return Err(Error::ImpossibleZTrailerOffset(ztrailer_offset));
+ }
+
+ if ztrailer_len < 24 || ztrailer_len % 24 != 0 {
+ return Err(Error::InvalidZTrailerLength(ztrailer_len));
+ }
+
+ Ok(ZHeader {
+ offset,
+ zheader_offset,
+ ztrailer_offset,
+ ztrailer_len,
+ })
+ }
+}
+
/// The trailer that ends the ZLIB data in a compressed system file.
#[derive(Clone, Debug)]
pub struct ZTrailer {
    /// File offset to the start of the record.
    pub offset: u64,

    /// Compression bias as a negative integer, e.g. -100.
    pub int_bias: i64,

    /// Always observed as zero.
    pub zero: u64,

    /// Uncompressed size of each block, except possibly the last.  Only
    /// `0x3ff000` has been observed so far.
    pub block_size: u32,

    /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
    pub blocks: Vec<ZBlock>,
}

/// A descriptor for one compressed block of data.
#[derive(Clone, Debug)]
pub struct ZBlock {
    /// Offset of block of data if simple compression were used.
    pub uncompressed_ofs: u64,

    /// Actual offset within the file of the compressed data block.
    pub compressed_ofs: u64,

    /// The number of bytes in this data block after decompression. This is
    /// `block_size` in every data block but the last, which may be smaller.
    pub uncompressed_size: u32,

    /// The number of bytes in this data block, as stored compressed in this
    /// file.
    pub compressed_size: u32,
}
+
+impl ZBlock {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
+ Ok(ZBlock {
+ uncompressed_ofs: endian.parse(read_bytes(r)?),
+ compressed_ofs: endian.parse(read_bytes(r)?),
+ uncompressed_size: endian.parse(read_bytes(r)?),
+ compressed_size: endian.parse(read_bytes(r)?),
+ })
+ }
+}
+
+impl ZTrailer {
+ pub fn read<R: Read + Seek>(
+ reader: &mut R,
+ endian: Endian,
+ bias: f64,
+ zheader: &ZHeader,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Option<ZTrailer>, Error> {
+ let start_offset = reader.stream_position()?;
+ if reader
+ .seek(SeekFrom::Start(zheader.ztrailer_offset))
+ .is_err()
+ {
+ return Ok(None);
+ }
+ let int_bias = endian.parse(read_bytes(reader)?);
+ if int_bias as f64 != -bias {
+ return Err(Error::WrongZlibTrailerBias {
+ actual: int_bias,
+ expected: -bias,
+ });
+ }
+ let zero = endian.parse(read_bytes(reader)?);
+ if zero != 0 {
+ return Err(Error::WrongZlibTrailerZero(zero));
+ }
+ let block_size = endian.parse(read_bytes(reader)?);
+ if block_size != 0x3ff000 {
+ return Err(Error::WrongZlibTrailerBlockSize(block_size));
+ }
+ let n_blocks: u32 = endian.parse(read_bytes(reader)?);
+ let expected_n_blocks = (zheader.ztrailer_len - 24) / 24;
+ if n_blocks as u64 != expected_n_blocks {
+ return Err(Error::BadZlibTrailerNBlocks {
+ offset: zheader.ztrailer_offset,
+ n_blocks,
+ expected_n_blocks,
+ ztrailer_len: zheader.ztrailer_len,
+ });
+ }
+ let blocks = (0..n_blocks)
+ .map(|_| ZBlock::read(reader, endian))
+ .collect::<Result<Vec<_>, _>>()?;
+
+ let mut expected_uncmp_ofs = zheader.zheader_offset;
+ let mut expected_cmp_ofs = zheader.zheader_offset + 24;
+ for (index, block) in blocks.iter().enumerate() {
+ if block.uncompressed_ofs != expected_uncmp_ofs {
+ return Err(Error::ZlibTrailerBlockWrongUncmpOfs {
+ index,
+ actual: block.uncompressed_ofs,
+ expected: expected_cmp_ofs,
+ });
+ }
+ if block.compressed_ofs != expected_cmp_ofs {
+ return Err(Error::ZlibTrailerBlockWrongCmpOfs {
+ index,
+ actual: block.compressed_ofs,
+ expected: expected_cmp_ofs,
+ });
+ }
+ if index < blocks.len() - 1 {
+ if block.uncompressed_size != block_size {
+ warn(Warning::ZlibTrailerBlockWrongSize {
+ index,
+ actual: block.uncompressed_size,
+ expected: block_size,
+ });
+ }
+ } else {
+ if block.uncompressed_size > block_size {
+ warn(Warning::ZlibTrailerBlockTooBig {
+ index,
+ actual: block.uncompressed_size,
+ max_expected: block_size,
+ });
+ }
+ }
+ // http://www.zlib.net/zlib_tech.html says that the maximum
+ // expansion from compression, with worst-case parameters, is 13.5%
+ // plus 11 bytes. This code checks for an expansion of more than
+ // 14.3% plus 11 bytes.
+ if block.compressed_size > block.uncompressed_size + block.uncompressed_size / 7 + 11 {
+ return Err(Error::ZlibExpansion {
+ index,
+ compressed_size: block.compressed_size,
+ uncompressed_size: block.uncompressed_size,
+ });
+ }
+
+ expected_cmp_ofs += block.compressed_size as u64;
+ expected_uncmp_ofs += block.uncompressed_size as u64;
+ }
+
+ if expected_cmp_ofs != zheader.ztrailer_offset {
+ return Err(Error::ZlibTrailerOffsetInconsistency {
+ descriptors: expected_cmp_ofs,
+ zheader: zheader.ztrailer_offset,
+ });
+ }
+
+ reader.seek(SeekFrom::Start(start_offset))?;
+ Ok(Some(ZTrailer {
+ offset: zheader.ztrailer_offset,
+ int_bias,
+ zero,
+ block_size,
+ blocks,
+ }))
+ }
+}