-use endian::{Endian, Parse, ToBytes};
-use flate2::read::ZlibDecoder;
-use num::Integer;
-use num_derive::FromPrimitive;
-use std::{
- collections::VecDeque,
- io::{Error as IoError, Read, Seek, SeekFrom},
- iter::FusedIterator,
-};
-use thiserror::Error;
+use std::io::Error as IoError;
+use thiserror::Error as ThisError;
pub mod endian;
+pub mod raw;
pub mod sack;
-#[derive(Error, Debug)]
+#[derive(ThisError, Debug)]
pub enum Error {
#[error("Not an SPSS system file")]
NotASystemFile,
#[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
BadVariableWidth { offset: u64, width: i32 },
- #[error("Misplaced type 4 record near offset {0:#x}.")]
- MisplacedType4Record(u64),
-
#[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
BadDocumentLength { offset: u64, n: u32, max: u32 },
#[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
- #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")]
- MissingVariableIndexRecord { offset: u64 },
-
#[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
count: u32,
},
- #[error("Wrong ZLIB data header offset {zheader_offset:#x} (expected {offset:#x}).")]
- BadZlibHeaderOffset { offset: u64, zheader_offset: u64 },
-
- #[error("At offset {offset:#x}, impossible ZLIB trailer offset {ztrailer_offset:#x}.")]
- BadZlibTrailerOffset { offset: u64, ztrailer_offset: u64 },
-
- #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")]
- BadZlibTrailerLen { offset: u64, ztrailer_len: u64 },
-
#[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
EofInCase {
offset: u64,
ztrailer_len: u64,
},
}
-
-#[derive(Error, Debug)]
-pub enum Warning {
- #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")]
- UnexpectedBias(f64),
-
- #[error("Duplicate type 6 (document) record.")]
- DuplicateDocumentRecord,
-}
-
-#[derive(Copy, Clone, Debug)]
-pub enum Compression {
- Simple,
- ZLib,
-}
-
-pub enum Record {
- Header(Header),
- Document(Document),
- Variable(Variable),
- ValueLabel(ValueLabel),
- VarIndexes(VarIndexes),
- Extension(Extension),
- EndOfHeaders,
- ZHeader(ZHeader),
- ZTrailer(ZTrailer),
- Case(Vec<Value>),
-}
-
-pub struct Header {
- /// Magic number.
- pub magic: Magic,
-
- /// Eye-catcher string, product name, in the file's encoding. Padded
- /// on the right with spaces.
- pub eye_catcher: [u8; 60],
-
- /// Layout code, normally either 2 or 3.
- pub layout_code: u32,
-
- /// Number of variable positions, or `None` if the value in the file is
- /// questionably trustworthy.
- pub nominal_case_size: Option<u32>,
-
- /// Compression type, if any,
- pub compression: Option<Compression>,
-
- /// 0-based variable index of the weight variable, or `None` if the file is
- /// unweighted.
- pub weight_index: Option<u32>,
-
- /// Claimed number of cases, if known.
- pub n_cases: Option<u32>,
-
- /// Compression bias, usually 100.0.
- pub bias: f64,
-
- /// `dd mmm yy` in the file's encoding.
- pub creation_date: [u8; 9],
-
- /// `HH:MM:SS` in the file's encoding.
- pub creation_time: [u8; 8],
-
- /// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: [u8; 64],
-
- /// Endianness of the data in the file header.
- pub endian: Endian,
-}
-
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub struct Magic([u8; 4]);
-
-impl Magic {
- /// Magic number for a regular system file.
- pub const SAV: Magic = Magic(*b"$FL2");
-
- /// Magic number for a system file that contains zlib-compressed data.
- pub const ZSAV: Magic = Magic(*b"$FL3");
-
- /// Magic number for an EBDIC-encoded system file. This is `$FL2` encoded
- /// in EBCDIC.
- pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
-}
-
-impl TryFrom<[u8; 4]> for Magic {
- type Error = Error;
-
- fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
- let magic = Magic(value);
- match magic {
- Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
- _ => Err(Error::BadMagic(value)),
- }
- }
-}
-
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub enum VarType {
- Number,
- String,
-}
-
-impl VarType {
- fn from_width(width: i32) -> VarType {
- match width {
- 0 => VarType::Number,
- _ => VarType::String,
- }
- }
-}
-
-trait State {
- #[allow(clippy::type_complexity)]
- fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
-}
-
-struct Start<R: Read + Seek> {
- reader: R,
-}
-
-struct CommonState<R: Read + Seek> {
- reader: R,
- endian: Endian,
- bias: f64,
- compression: Option<Compression>,
- var_types: Vec<VarType>,
-}
-
-impl<R: Read + Seek + 'static> State for Start<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let header = read_header(&mut self.reader)?;
- let next_state = Headers(CommonState {
- reader: self.reader,
- endian: header.endian,
- bias: header.bias,
- compression: header.compression,
- var_types: Vec::new(),
- });
- Ok(Some((Record::Header(header), Box::new(next_state))))
- }
-}
-
-struct Headers<R: Read + Seek>(CommonState<R>);
-
-impl<R: Read + Seek + 'static> State for Headers<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let endian = self.0.endian;
- let rec_type: u32 = endian.parse(read_bytes(&mut self.0.reader)?);
- let record = match rec_type {
- 2 => {
- let variable = read_variable_record(&mut self.0.reader, endian)?;
- self.0.var_types.push(VarType::from_width(variable.width));
- Record::Variable(variable)
- }
- 3 => Record::ValueLabel(read_value_label_record(&mut self.0.reader, endian)?),
- 4 => Record::VarIndexes(read_var_indexes_record(&mut self.0.reader, endian)?),
- 6 => Record::Document(read_document_record(&mut self.0.reader, endian)?),
- 7 => Record::Extension(read_extension_record(&mut self.0.reader, endian)?),
- 999 => {
- let _: [u8; 4] = read_bytes(&mut self.0.reader)?;
- let next_state: Box<dyn State> = match self.0.compression {
- None => Box::new(Data(self.0)),
- Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
- Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
- };
- return Ok(Some((Record::EndOfHeaders, next_state)));
- }
- _ => {
- return Err(Error::BadRecordType {
- offset: self.0.reader.stream_position()?,
- rec_type,
- })
- }
- };
- Ok(Some((record, self)))
- }
-}
-
-struct ZlibHeader<R: Read + Seek>(CommonState<R>);
-
-impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let zheader = read_zheader(&mut self.0.reader, self.0.endian)?;
- Ok(Some((Record::ZHeader(zheader), self)))
- }
-}
-
-struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
-
-impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let retval = read_ztrailer(&mut self.0.reader, self.0.endian, self.1.ztrailer_offset, self.1.ztrailer_len)?;
- let next_state = Box::new(CompressedData::new(CommonState {
- reader: ZlibDecodeMultiple::new(self.0.reader),
- endian: self.0.endian,
- bias: self.0.bias,
- compression: self.0.compression,
- var_types: self.0.var_types
- }));
- match retval {
- None => next_state.read(),
- Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state)))
- }
- }
-}
-
-struct Data<R: Read + Seek>(CommonState<R>);
-
-impl<R: Read + Seek + 'static> State for Data<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let case_start = self.0.reader.stream_position()?;
- let mut values = Vec::with_capacity(self.0.var_types.len());
- for (i, &var_type) in self.0.var_types.iter().enumerate() {
- let Some(raw) = try_read_bytes(&mut self.0.reader)? else {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = self.0.reader.stream_position()?;
- return Err(Error::EofInCase {
- offset,
- case_ofs: offset - case_start,
- case_len: self.0.var_types.len() * 8,
- });
- }
- };
- values.push(Value::from_raw(var_type, raw, self.0.endian));
- }
- Ok(Some((Record::Case(values), self)))
- }
-}
-
-struct CompressedData<R: Read + Seek> {
- common: CommonState<R>,
- codes: VecDeque<u8>,
-}
-
-impl<R: Read + Seek + 'static> CompressedData<R> {
- fn new(common: CommonState<R>) -> CompressedData<R> {
- CompressedData { common, codes: VecDeque::new() }
- }
-}
-
-impl<R: Read + Seek + 'static> State for CompressedData<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let case_start = self.common.reader.stream_position()?;
- let mut values = Vec::with_capacity(self.common.var_types.len());
- for (i, &var_type) in self.common.var_types.iter().enumerate() {
- let value = loop {
- let Some(code) = self.codes.pop_front() else {
- let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.common.reader)?
- else {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = self.common.reader.stream_position()?;
- return Err(Error::EofInCompressedCase {
- offset,
- case_ofs: offset - case_start,
- });
- }
- };
- self.codes.extend(new_codes.into_iter());
- continue;
- };
- match code {
- 0 => (),
- 1..=251 => match var_type {
- VarType::Number => break Value::Number(Some(code as f64 - self.common.bias)),
- VarType::String => {
- break Value::String(self.common.endian.to_bytes(code as f64 - self.common.bias))
- }
- },
- 252 => {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = self.common.reader.stream_position()?;
- return Err(Error::PartialCompressedCase {
- offset,
- case_ofs: offset - case_start,
- });
- }
- }
- 253 => {
- break Value::from_raw(
- var_type,
- read_bytes(&mut self.common.reader)?,
- self.common.endian,
- )
- }
- 254 => match var_type {
-                        VarType::String => break Value::String(*b"        "), // XXX EBCDIC
- VarType::Number => {
- return Err(Error::CompressedStringExpected {
- offset: case_start,
- case_ofs: self.common.reader.stream_position()? - case_start,
- })
- }
- },
- 255 => match var_type {
- VarType::Number => break Value::Number(None),
- VarType::String => {
- return Err(Error::CompressedNumberExpected {
- offset: case_start,
- case_ofs: self.common.reader.stream_position()? - case_start,
- })
- }
- },
- }
- };
- values.push(value);
- }
- Ok(Some((Record::Case(values), self)))
- }
-}
-
-struct ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- reader: Option<ZlibDecoder<R>>,
-}
-
-impl<R> ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn new(reader: R) -> ZlibDecodeMultiple<R> {
- ZlibDecodeMultiple {
- reader: Some(ZlibDecoder::new(reader)),
- }
- }
-}
-
-impl<R> Read for ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
- loop {
- match self.reader.as_mut().unwrap().read(buf)? {
- 0 => {
- let inner = self.reader.take().unwrap().into_inner();
- self.reader = Some(ZlibDecoder::new(inner));
- }
- n => return Ok(n),
- };
- }
- }
-}
-
-impl<R> Seek for ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
- self.reader.as_mut().unwrap().get_mut().seek(pos)
- }
-}
-
-#[derive(Copy, Clone)]
-pub enum Value {
- Number(Option<f64>),
- String([u8; 8]),
-}
-
-impl Value {
- pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
- match var_type {
- VarType::String => Value::String(raw),
- VarType::Number => {
- let number: f64 = endian.parse(raw);
- Value::Number((number != -f64::MAX).then_some(number))
- }
- }
- }
-}
-
-pub struct Reader {
- state: Option<Box<dyn State>>,
-}
-
-impl Reader {
- pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
- Ok(Reader {
- state: Some(Box::new(Start { reader })),
- })
- }
-}
-
-impl Iterator for Reader {
- type Item = Result<Record, Error>;
-
- fn next(&mut self) -> Option<Self::Item> {
- match self.state.take()?.read() {
- Ok(Some((record, next_state))) => {
- self.state = Some(next_state);
- Some(Ok(record))
- }
- Ok(None) => None,
- Err(error) => Some(Err(error)),
- }
- }
-}
-
-impl FusedIterator for Reader {}
-
-fn read_header<R: Read>(r: &mut R) -> Result<Header, Error> {
- let magic: [u8; 4] = read_bytes(r)?;
- let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
-
- let eye_catcher: [u8; 60] = read_bytes(r)?;
- let layout_code: [u8; 4] = read_bytes(r)?;
- let endian = Endian::identify_u32(2, layout_code)
- .or_else(|| Endian::identify_u32(2, layout_code))
- .ok_or_else(|| Error::NotASystemFile)?;
- let layout_code = endian.parse(layout_code);
-
- let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
- let nominal_case_size =
- (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
-
- let compression_code: u32 = endian.parse(read_bytes(r)?);
- let compression = match (magic, compression_code) {
- (Magic::ZSAV, 2) => Some(Compression::ZLib),
- (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
- (_, 0) => None,
- (_, 1) => Some(Compression::Simple),
- (_, code) => return Err(Error::InvalidSavCompression(code)),
- };
-
- let weight_index: u32 = endian.parse(read_bytes(r)?);
- let weight_index = (weight_index > 0).then_some(weight_index - 1);
-
- let n_cases: u32 = endian.parse(read_bytes(r)?);
- let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
-
- let bias: f64 = endian.parse(read_bytes(r)?);
-
- let creation_date: [u8; 9] = read_bytes(r)?;
- let creation_time: [u8; 8] = read_bytes(r)?;
- let file_label: [u8; 64] = read_bytes(r)?;
- let _: [u8; 3] = read_bytes(r)?;
-
- Ok(Header {
- magic,
- layout_code,
- nominal_case_size,
- compression,
- weight_index,
- n_cases,
- bias,
- creation_date,
- creation_time,
- eye_catcher,
- file_label,
- endian,
- })
-}
-
-pub struct Variable {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
-
- /// Variable width, in the range -1..=255.
- pub width: i32,
-
- /// Variable name, padded on the right with spaces.
- pub name: [u8; 8],
-
- /// Print format.
- pub print_format: u32,
-
- /// Write format.
- pub write_format: u32,
-
- /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
- pub missing_value_code: i32,
-
- /// Raw missing values, up to 3 of them.
- pub missing: Vec<[u8; 8]>,
-
- /// Optional variable label.
- pub label: Option<Vec<u8>>,
-}
-
-fn read_variable_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
- let offset = r.stream_position()?;
- let width: i32 = endian.parse(read_bytes(r)?);
- let has_variable_label: u32 = endian.parse(read_bytes(r)?);
- let missing_value_code: i32 = endian.parse(read_bytes(r)?);
- let print_format: u32 = endian.parse(read_bytes(r)?);
- let write_format: u32 = endian.parse(read_bytes(r)?);
- let name: [u8; 8] = read_bytes(r)?;
-
- let label = match has_variable_label {
- 0 => None,
- 1 => {
- let len: u32 = endian.parse(read_bytes(r)?);
- let read_len = len.min(65535) as usize;
- let label = Some(read_vec(r, read_len)?);
-
- let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
- let _ = read_vec(r, padding_bytes as usize)?;
-
- label
- }
- _ => {
- return Err(Error::BadVariableLabelCode {
- offset,
- code: has_variable_label,
- })
- }
- };
-
- let mut missing = Vec::new();
- if missing_value_code != 0 {
- match (width, missing_value_code) {
- (0, -3 | -2 | 1 | 2 | 3) => (),
- (0, _) => {
- return Err(Error::BadNumericMissingValueCode {
- offset,
- code: missing_value_code,
- })
- }
- (_, 0..=3) => (),
- (_, _) => {
- return Err(Error::BadStringMissingValueCode {
- offset,
- code: missing_value_code,
- })
- }
- }
-
- for _ in 0..missing_value_code.abs() {
- missing.push(read_bytes(r)?);
- }
- }
-
- Ok(Variable {
- offset,
- width,
- name,
- print_format,
- write_format,
- missing_value_code,
- missing,
- label,
- })
-}
-
-pub struct ValueLabel {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
-
- /// The labels.
- pub labels: Vec<([u8; 8], Vec<u8>)>,
-}
-
-impl ValueLabel {
- /// Maximum number of value labels in a record.
- pub const MAX: u32 = u32::MAX / 8;
-}
-
-fn read_value_label_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
- let offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > ValueLabel::MAX {
- return Err(Error::BadNumberOfValueLabels {
- offset,
- n,
- max: ValueLabel::MAX,
- });
- }
-
- let mut labels = Vec::new();
- for _ in 0..n {
- let value: [u8; 8] = read_bytes(r)?;
- let label_len: u8 = endian.parse(read_bytes(r)?);
- let label_len = label_len as usize;
- let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
-
- let mut label = read_vec(r, padded_len)?;
- label.truncate(label_len);
- labels.push((value, label));
- }
- Ok(ValueLabel { offset, labels })
-}
-
-pub struct VarIndexes {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
-
- /// The 0-based indexes of the variable indexes.
- pub var_indexes: Vec<u32>,
-}
-
-impl VarIndexes {
- /// Maximum number of variable indexes in a record.
- pub const MAX: u32 = u32::MAX / 8;
-}
-
-fn read_var_indexes_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
- let offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > VarIndexes::MAX {
- return Err(Error::BadNumberOfVarIndexes {
- offset,
- n,
- max: VarIndexes::MAX,
- });
- }
- let mut var_indexes = Vec::with_capacity(n as usize);
- for _ in 0..n {
- var_indexes.push(endian.parse(read_bytes(r)?));
- }
-
- Ok(VarIndexes {
- offset,
- var_indexes,
- })
-}
-
-pub const DOC_LINE_LEN: u32 = 80;
-pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
-
-pub struct Document {
- /// Offset from the start of the file to the start of the record.
- pub pos: u64,
-
- /// The document, as an array of 80-byte lines.
- pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
-}
-
-fn read_document_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
- let offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- match n {
- 0..=DOC_MAX_LINES => {
- let pos = r.stream_position()?;
- let mut lines = Vec::with_capacity(n as usize);
- for _ in 0..n {
- let line: [u8; 80] = read_bytes(r)?;
- lines.push(line);
- }
- Ok(Document { pos, lines })
- }
- _ => Err(Error::BadDocumentLength {
- offset,
- n,
- max: DOC_MAX_LINES,
- }),
- }
-}
-
-#[derive(FromPrimitive)]
-enum ExtensionType {
- /// Machine integer info.
- Integer = 3,
- /// Machine floating-point info.
- Float = 4,
- /// Variable sets.
- VarSets = 5,
- /// DATE.
- Date = 6,
- /// Multiple response sets.
- Mrsets = 7,
- /// SPSS Data Entry.
- DataEntry = 8,
- /// Extra product info text.
- ProductInfo = 10,
- /// Variable display parameters.
- Display = 11,
- /// Long variable names.
- LongNames = 13,
- /// Long strings.
- LongStrings = 14,
- /// Extended number of cases.
- Ncases = 16,
- /// Data file attributes.
- FileAttrs = 17,
- /// Variable attributes.
- VarAttrs = 18,
- /// Multiple response sets (extended).
- Mrsets2 = 19,
- /// Character encoding.
- Encoding = 20,
- /// Value labels for long strings.
- LongLabels = 21,
- /// Missing values for long strings.
- LongMissing = 22,
- /// "Format properties in dataview table".
- Dataview = 24,
-}
-
-pub struct Extension {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
-
- /// Record subtype.
- pub subtype: u32,
-
- /// Size of each data element.
- pub size: u32,
-
- /// Number of data elements.
- pub count: u32,
-
- /// `size * count` bytes of data.
- pub data: Vec<u8>,
-}
-
-fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
- match extension {
- /* Implemented record types. */
- ExtensionType::Integer => (4, 8),
- ExtensionType::Float => (8, 3),
- ExtensionType::VarSets => (1, 0),
- ExtensionType::Mrsets => (1, 0),
- ExtensionType::ProductInfo => (1, 0),
- ExtensionType::Display => (4, 0),
- ExtensionType::LongNames => (1, 0),
- ExtensionType::LongStrings => (1, 0),
- ExtensionType::Ncases => (8, 2),
- ExtensionType::FileAttrs => (1, 0),
- ExtensionType::VarAttrs => (1, 0),
- ExtensionType::Mrsets2 => (1, 0),
- ExtensionType::Encoding => (1, 0),
- ExtensionType::LongLabels => (1, 0),
- ExtensionType::LongMissing => (1, 0),
-
- /* Ignored record types. */
- ExtensionType::Date => (0, 0),
- ExtensionType::DataEntry => (0, 0),
- ExtensionType::Dataview => (0, 0),
- }
-}
-
-fn read_extension_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Extension, Error> {
- let subtype = endian.parse(read_bytes(r)?);
- let offset = r.stream_position()?;
- let size: u32 = endian.parse(read_bytes(r)?);
- let count = endian.parse(read_bytes(r)?);
- let Some(product) = size.checked_mul(count) else {
- return Err(Error::ExtensionRecordTooLarge {
- offset,
- subtype,
- size,
- count,
- });
- };
- let offset = r.stream_position()?;
- let data = read_vec(r, product as usize)?;
- Ok(Extension {
- offset,
- subtype,
- size,
- count,
- data,
- })
-}
-
-pub struct ZHeader {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// File offset to the ZLIB data header.
- pub zheader_offset: u64,
-
- /// File offset to the ZLIB trailer.
- pub ztrailer_offset: u64,
-
- /// Length of the ZLIB trailer in bytes.
- pub ztrailer_len: u64,
-}
-
-fn read_zheader<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
- let offset = r.stream_position()?;
- let zheader_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
-
- Ok(ZHeader {
- offset,
- zheader_offset,
- ztrailer_offset,
- ztrailer_len,
- })
-}
-
-pub struct ZTrailer {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// Compression bias as a negative integer, e.g. -100.
- pub int_bias: i64,
-
- /// Always observed as zero.
- pub zero: u64,
-
- /// Uncompressed size of each block, except possibly the last. Only
- /// `0x3ff000` has been observed so far.
- pub block_size: u32,
-
- /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
- pub blocks: Vec<ZBlock>,
-}
-
-pub struct ZBlock {
- /// Offset of block of data if simple compression were used.
- pub uncompressed_ofs: u64,
-
- /// Actual offset within the file of the compressed data block.
- pub compressed_ofs: u64,
-
- /// The number of bytes in this data block after decompression. This is
- /// `block_size` in every data block but the last, which may be smaller.
- pub uncompressed_size: u32,
-
- /// The number of bytes in this data block, as stored compressed in this
- /// file.
- pub compressed_size: u32,
-}
-
-fn read_ztrailer<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- ztrailer_ofs: u64,
- ztrailer_len: u64,
-) -> Result<Option<ZTrailer>, Error> {
- let start_offset = r.stream_position()?;
- if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
- return Ok(None);
- }
- let int_bias = endian.parse(read_bytes(r)?);
- let zero = endian.parse(read_bytes(r)?);
- let block_size = endian.parse(read_bytes(r)?);
- let n_blocks: u32 = endian.parse(read_bytes(r)?);
- let expected_n_blocks = (ztrailer_len - 24) / 24;
- if n_blocks as u64 != expected_n_blocks {
- return Err(Error::BadZlibTrailerNBlocks {
- offset: ztrailer_ofs,
- n_blocks,
- expected_n_blocks,
- ztrailer_len,
- });
- }
- let mut blocks = Vec::with_capacity(n_blocks as usize);
- for _ in 0..n_blocks {
- let uncompressed_ofs = endian.parse(read_bytes(r)?);
- let compressed_ofs = endian.parse(read_bytes(r)?);
- let uncompressed_size = endian.parse(read_bytes(r)?);
- let compressed_size = endian.parse(read_bytes(r)?);
- blocks.push(ZBlock {
- uncompressed_ofs,
- compressed_ofs,
- uncompressed_size,
- compressed_size,
- });
- }
- r.seek(SeekFrom::Start(start_offset))?;
- Ok(Some(ZTrailer {
- offset: ztrailer_ofs,
- int_bias,
- zero,
- block_size,
- blocks,
- }))
-}
-
-fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
- let mut buf = [0; N];
- let n = r.read(&mut buf)?;
- if n > 0 {
- if n < N {
- r.read_exact(&mut buf[n..])?;
- }
- Ok(Some(buf))
- } else {
- Ok(None)
- }
-}
-
-fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
- let mut buf = [0; N];
- r.read_exact(&mut buf)?;
- Ok(buf)
-}
-
-fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
- let mut vec = vec![0; n];
- r.read_exact(&mut vec)?;
- Ok(vec)
-}
-
-/*
-fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
- while s.last() == Some(&c) {
- s.pop();
- }
- s
-}
-
-fn skip_bytes<R: Read>(r: &mut R, mut n: u64) -> Result<(), IoError> {
- let mut buf = [0; 1024];
- while n > 0 {
- let chunk = u64::min(n, buf.len() as u64);
- r.read_exact(&mut buf[0..chunk as usize])?;
- n -= chunk;
- }
- Ok(())
-}
-
-*/
--- /dev/null
+use crate::endian::{Endian, Parse, ToBytes};
+use crate::Error;
+
+use flate2::read::ZlibDecoder;
+use num::Integer;
+use num_derive::FromPrimitive;
+use std::{
+ collections::VecDeque,
+ io::{Error as IoError, Read, Seek, SeekFrom},
+ iter::FusedIterator,
+};
+
+#[derive(Copy, Clone, Debug)]
+pub enum Compression {
+ Simple,
+ ZLib,
+}
+
+pub enum Record {
+ Header(Header),
+ Document(Document),
+ Variable(Variable),
+ ValueLabel(ValueLabel),
+ VarIndexes(VarIndexes),
+ Extension(Extension),
+ EndOfHeaders,
+ ZHeader(ZHeader),
+ ZTrailer(ZTrailer),
+ Case(Vec<Value>),
+}
+
+pub struct Header {
+ /// Magic number.
+ pub magic: Magic,
+
+ /// Eye-catcher string, product name, in the file's encoding. Padded
+ /// on the right with spaces.
+ pub eye_catcher: [u8; 60],
+
+ /// Layout code, normally either 2 or 3.
+ pub layout_code: u32,
+
+    /// Number of variable positions, or `None` if the value in the file is
+    /// not trustworthy.
+ pub nominal_case_size: Option<u32>,
+
+    /// Compression type, if any.
+ pub compression: Option<Compression>,
+
+ /// 0-based variable index of the weight variable, or `None` if the file is
+ /// unweighted.
+ pub weight_index: Option<u32>,
+
+ /// Claimed number of cases, if known.
+ pub n_cases: Option<u32>,
+
+ /// Compression bias, usually 100.0.
+ pub bias: f64,
+
+ /// `dd mmm yy` in the file's encoding.
+ pub creation_date: [u8; 9],
+
+ /// `HH:MM:SS` in the file's encoding.
+ pub creation_time: [u8; 8],
+
+ /// File label, in the file's encoding. Padded on the right with spaces.
+ pub file_label: [u8; 64],
+
+ /// Endianness of the data in the file header.
+ pub endian: Endian,
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub struct Magic([u8; 4]);
+
+impl Magic {
+ /// Magic number for a regular system file.
+ pub const SAV: Magic = Magic(*b"$FL2");
+
+ /// Magic number for a system file that contains zlib-compressed data.
+ pub const ZSAV: Magic = Magic(*b"$FL3");
+
+    /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
+ /// in EBCDIC.
+ pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
+}
+
+impl TryFrom<[u8; 4]> for Magic {
+ type Error = Error;
+
+ fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
+ let magic = Magic(value);
+ match magic {
+ Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
+ _ => Err(Error::BadMagic(value)),
+ }
+ }
+}
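+
+// For example, `Magic::try_from(*b"$FL2")` yields `Ok(Magic::SAV)`, while any
+// unrecognized four-byte sequence is rejected with `Error::BadMagic`.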
+
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub enum VarType {
+ Number,
+ String,
+}
+
+impl VarType {
+ fn from_width(width: i32) -> VarType {
+ match width {
+ 0 => VarType::Number,
+ _ => VarType::String,
+ }
+ }
+}
+
+trait State {
+ #[allow(clippy::type_complexity)]
+ fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
+}
+
+struct Start<R: Read + Seek> {
+ reader: R,
+}
+
+struct CommonState<R: Read + Seek> {
+ reader: R,
+ endian: Endian,
+ bias: f64,
+ compression: Option<Compression>,
+ var_types: Vec<VarType>,
+}
+
+impl<R: Read + Seek + 'static> State for Start<R> {
+ fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+ let header = read_header(&mut self.reader)?;
+ let next_state = Headers(CommonState {
+ reader: self.reader,
+ endian: header.endian,
+ bias: header.bias,
+ compression: header.compression,
+ var_types: Vec::new(),
+ });
+ Ok(Some((Record::Header(header), Box::new(next_state))))
+ }
+}
+
+struct Headers<R: Read + Seek>(CommonState<R>);
+
+impl<R: Read + Seek + 'static> State for Headers<R> {
+ fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+ let endian = self.0.endian;
+ let rec_type: u32 = endian.parse(read_bytes(&mut self.0.reader)?);
+ let record = match rec_type {
+ 2 => {
+ let variable = read_variable_record(&mut self.0.reader, endian)?;
+ self.0.var_types.push(VarType::from_width(variable.width));
+ Record::Variable(variable)
+ }
+ 3 => Record::ValueLabel(read_value_label_record(&mut self.0.reader, endian)?),
+ 4 => Record::VarIndexes(read_var_indexes_record(&mut self.0.reader, endian)?),
+ 6 => Record::Document(read_document_record(&mut self.0.reader, endian)?),
+ 7 => Record::Extension(read_extension_record(&mut self.0.reader, endian)?),
+ 999 => {
+ let _: [u8; 4] = read_bytes(&mut self.0.reader)?;
+ let next_state: Box<dyn State> = match self.0.compression {
+ None => Box::new(Data(self.0)),
+ Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
+ Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
+ };
+ return Ok(Some((Record::EndOfHeaders, next_state)));
+ }
+ _ => {
+ return Err(Error::BadRecordType {
+ offset: self.0.reader.stream_position()?,
+ rec_type,
+ })
+ }
+ };
+ Ok(Some((record, self)))
+ }
+}
+
+struct ZlibHeader<R: Read + Seek>(CommonState<R>);
+
+impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
+ fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+ let zheader = read_zheader(&mut self.0.reader, self.0.endian)?;
+ Ok(Some((Record::ZHeader(zheader), self)))
+ }
+}
+
+struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
+
+impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
+ fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+ let retval = read_ztrailer(&mut self.0.reader, self.0.endian, self.1.ztrailer_offset, self.1.ztrailer_len)?;
+ let next_state = Box::new(CompressedData::new(CommonState {
+ reader: ZlibDecodeMultiple::new(self.0.reader),
+ endian: self.0.endian,
+ bias: self.0.bias,
+ compression: self.0.compression,
+ var_types: self.0.var_types
+ }));
+ match retval {
+ None => next_state.read(),
+ Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state)))
+ }
+ }
+}
+
+struct Data<R: Read + Seek>(CommonState<R>);
+
+impl<R: Read + Seek + 'static> State for Data<R> {
+ fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+ let case_start = self.0.reader.stream_position()?;
+ let mut values = Vec::with_capacity(self.0.var_types.len());
+ for (i, &var_type) in self.0.var_types.iter().enumerate() {
+ let Some(raw) = try_read_bytes(&mut self.0.reader)? else {
+ if i == 0 {
+ return Ok(None);
+ } else {
+ let offset = self.0.reader.stream_position()?;
+ return Err(Error::EofInCase {
+ offset,
+ case_ofs: offset - case_start,
+ case_len: self.0.var_types.len() * 8,
+ });
+ }
+ };
+ values.push(Value::from_raw(var_type, raw, self.0.endian));
+ }
+ Ok(Some((Record::Case(values), self)))
+ }
+}
+
+struct CompressedData<R: Read + Seek> {
+ common: CommonState<R>,
+ codes: VecDeque<u8>,
+}
+
+impl<R: Read + Seek + 'static> CompressedData<R> {
+ fn new(common: CommonState<R>) -> CompressedData<R> {
+ CompressedData { common, codes: VecDeque::new() }
+ }
+}
+
+impl<R: Read + Seek + 'static> State for CompressedData<R> {
+ fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+ let case_start = self.common.reader.stream_position()?;
+ let mut values = Vec::with_capacity(self.common.var_types.len());
+ for (i, &var_type) in self.common.var_types.iter().enumerate() {
+ let value = loop {
+ let Some(code) = self.codes.pop_front() else {
+ let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.common.reader)?
+ else {
+ if i == 0 {
+ return Ok(None);
+ } else {
+ let offset = self.common.reader.stream_position()?;
+ return Err(Error::EofInCompressedCase {
+ offset,
+ case_ofs: offset - case_start,
+ });
+ }
+ };
+ self.codes.extend(new_codes.into_iter());
+ continue;
+ };
+ match code {
+ 0 => (),
+ 1..=251 => match var_type {
+ VarType::Number => break Value::Number(Some(code as f64 - self.common.bias)),
+ VarType::String => {
+ break Value::String(self.common.endian.to_bytes(code as f64 - self.common.bias))
+ }
+ },
+ 252 => {
+ if i == 0 {
+ return Ok(None);
+ } else {
+ let offset = self.common.reader.stream_position()?;
+ return Err(Error::PartialCompressedCase {
+ offset,
+ case_ofs: offset - case_start,
+ });
+ }
+ }
+ 253 => {
+ break Value::from_raw(
+ var_type,
+ read_bytes(&mut self.common.reader)?,
+ self.common.endian,
+ )
+ }
+ 254 => match var_type {
+                        VarType::String => break Value::String(*b"        "), // XXX EBCDIC
+ VarType::Number => {
+ return Err(Error::CompressedStringExpected {
+ offset: case_start,
+ case_ofs: self.common.reader.stream_position()? - case_start,
+ })
+ }
+ },
+ 255 => match var_type {
+ VarType::Number => break Value::Number(None),
+ VarType::String => {
+ return Err(Error::CompressedNumberExpected {
+ offset: case_start,
+ case_ofs: self.common.reader.stream_position()? - case_start,
+ })
+ }
+ },
+ }
+ };
+ values.push(value);
+ }
+ Ok(Some((Record::Case(values), self)))
+ }
+}
+
+struct ZlibDecodeMultiple<R>
+where
+ R: Read + Seek,
+{
+ reader: Option<ZlibDecoder<R>>,
+}
+
+impl<R> ZlibDecodeMultiple<R>
+where
+ R: Read + Seek,
+{
+ fn new(reader: R) -> ZlibDecodeMultiple<R> {
+ ZlibDecodeMultiple {
+ reader: Some(ZlibDecoder::new(reader)),
+ }
+ }
+}
+
+impl<R> Read for ZlibDecodeMultiple<R>
+where
+ R: Read + Seek,
+{
+ fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
+ loop {
+ match self.reader.as_mut().unwrap().read(buf)? {
+ 0 => {
+ let inner = self.reader.take().unwrap().into_inner();
+ self.reader = Some(ZlibDecoder::new(inner));
+ }
+ n => return Ok(n),
+ };
+ }
+ }
+}
+
+impl<R> Seek for ZlibDecodeMultiple<R>
+where
+ R: Read + Seek,
+{
+ fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
+ self.reader.as_mut().unwrap().get_mut().seek(pos)
+ }
+}
+
+#[derive(Copy, Clone)]
+pub enum Value {
+ Number(Option<f64>),
+ String([u8; 8]),
+}
+
+impl Value {
+ pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
+ match var_type {
+ VarType::String => Value::String(raw),
+ VarType::Number => {
+ let number: f64 = endian.parse(raw);
+ Value::Number((number != -f64::MAX).then_some(number))
+ }
+ }
+ }
+}
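+
+// An illustrative sketch only (not part of the functional changes): given some
+// `endian: Endian`, a raw numeric field holding the lowest finite double
+// decodes to the system-missing value, and any other bit pattern is kept:
+//
+//     let raw = endian.to_bytes(-f64::MAX);
+//     assert!(matches!(
+//         Value::from_raw(VarType::Number, raw, endian),
+//         Value::Number(None)
+//     ));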
+
+pub struct Reader {
+ state: Option<Box<dyn State>>,
+}
+
+impl Reader {
+ pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
+ Ok(Reader {
+ state: Some(Box::new(Start { reader })),
+ })
+ }
+}
+
+impl Iterator for Reader {
+ type Item = Result<Record, Error>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ match self.state.take()?.read() {
+ Ok(Some((record, next_state))) => {
+ self.state = Some(next_state);
+ Some(Ok(record))
+ }
+ Ok(None) => None,
+ Err(error) => Some(Err(error)),
+ }
+ }
+}
+
+impl FusedIterator for Reader {}
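+
+// A minimal usage sketch: the file name and the `count_cases` helper below are
+// hypothetical, and only show how the `Reader` iterator is meant to be driven,
+// skipping header records and counting data cases.
+//
+//     fn count_cases() -> Result<usize, Error> {
+//         let file = std::fs::File::open("example.sav").expect("open example.sav");
+//         let mut n_cases = 0;
+//         for record in Reader::new(file)? {
+//             if let Record::Case(_) = record? {
+//                 n_cases += 1;
+//             }
+//         }
+//         Ok(n_cases)
+//     }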
+
+fn read_header<R: Read>(r: &mut R) -> Result<Header, Error> {
+ let magic: [u8; 4] = read_bytes(r)?;
+ let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
+
+ let eye_catcher: [u8; 60] = read_bytes(r)?;
+ let layout_code: [u8; 4] = read_bytes(r)?;
+ let endian = Endian::identify_u32(2, layout_code)
+        .or_else(|| Endian::identify_u32(3, layout_code))
+ .ok_or_else(|| Error::NotASystemFile)?;
+ let layout_code = endian.parse(layout_code);
+
+ let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
+ let nominal_case_size =
+ (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
+
+ let compression_code: u32 = endian.parse(read_bytes(r)?);
+ let compression = match (magic, compression_code) {
+ (Magic::ZSAV, 2) => Some(Compression::ZLib),
+ (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
+ (_, 0) => None,
+ (_, 1) => Some(Compression::Simple),
+ (_, code) => return Err(Error::InvalidSavCompression(code)),
+ };
+
+ let weight_index: u32 = endian.parse(read_bytes(r)?);
+ let weight_index = (weight_index > 0).then_some(weight_index - 1);
+
+ let n_cases: u32 = endian.parse(read_bytes(r)?);
+ let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
+
+ let bias: f64 = endian.parse(read_bytes(r)?);
+
+ let creation_date: [u8; 9] = read_bytes(r)?;
+ let creation_time: [u8; 8] = read_bytes(r)?;
+ let file_label: [u8; 64] = read_bytes(r)?;
+ let _: [u8; 3] = read_bytes(r)?;
+
+ Ok(Header {
+ magic,
+ layout_code,
+ nominal_case_size,
+ compression,
+ weight_index,
+ n_cases,
+ bias,
+ creation_date,
+ creation_time,
+ eye_catcher,
+ file_label,
+ endian,
+ })
+}
+
+pub struct Variable {
+ /// Offset from the start of the file to the start of the record.
+ pub offset: u64,
+
+ /// Variable width, in the range -1..=255.
+ pub width: i32,
+
+ /// Variable name, padded on the right with spaces.
+ pub name: [u8; 8],
+
+ /// Print format.
+ pub print_format: u32,
+
+ /// Write format.
+ pub write_format: u32,
+
+ /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
+ pub missing_value_code: i32,
+
+ /// Raw missing values, up to 3 of them.
+ pub missing: Vec<[u8; 8]>,
+
+ /// Optional variable label.
+ pub label: Option<Vec<u8>>,
+}
+
+fn read_variable_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
+ let offset = r.stream_position()?;
+ let width: i32 = endian.parse(read_bytes(r)?);
+ let has_variable_label: u32 = endian.parse(read_bytes(r)?);
+ let missing_value_code: i32 = endian.parse(read_bytes(r)?);
+ let print_format: u32 = endian.parse(read_bytes(r)?);
+ let write_format: u32 = endian.parse(read_bytes(r)?);
+ let name: [u8; 8] = read_bytes(r)?;
+
+ let label = match has_variable_label {
+ 0 => None,
+ 1 => {
+ let len: u32 = endian.parse(read_bytes(r)?);
+ let read_len = len.min(65535) as usize;
+ let label = Some(read_vec(r, read_len)?);
+
+ let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
+ let _ = read_vec(r, padding_bytes as usize)?;
+
+ label
+ }
+ _ => {
+ return Err(Error::BadVariableLabelCode {
+ offset,
+ code: has_variable_label,
+ })
+ }
+ };
+
+ let mut missing = Vec::new();
+ if missing_value_code != 0 {
+ match (width, missing_value_code) {
+ (0, -3 | -2 | 1 | 2 | 3) => (),
+ (0, _) => {
+ return Err(Error::BadNumericMissingValueCode {
+ offset,
+ code: missing_value_code,
+ })
+ }
+ (_, 0..=3) => (),
+ (_, _) => {
+ return Err(Error::BadStringMissingValueCode {
+ offset,
+ code: missing_value_code,
+ })
+ }
+ }
+
+ for _ in 0..missing_value_code.abs() {
+ missing.push(read_bytes(r)?);
+ }
+ }
+
+ Ok(Variable {
+ offset,
+ width,
+ name,
+ print_format,
+ write_format,
+ missing_value_code,
+ missing,
+ label,
+ })
+}
+
+pub struct ValueLabel {
+ /// Offset from the start of the file to the start of the record.
+ pub offset: u64,
+
+ /// The labels.
+ pub labels: Vec<([u8; 8], Vec<u8>)>,
+}
+
+impl ValueLabel {
+ /// Maximum number of value labels in a record.
+ pub const MAX: u32 = u32::MAX / 8;
+}
+
+fn read_value_label_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
+ let offset = r.stream_position()?;
+ let n: u32 = endian.parse(read_bytes(r)?);
+ if n > ValueLabel::MAX {
+ return Err(Error::BadNumberOfValueLabels {
+ offset,
+ n,
+ max: ValueLabel::MAX,
+ });
+ }
+
+ let mut labels = Vec::new();
+ for _ in 0..n {
+ let value: [u8; 8] = read_bytes(r)?;
+ let label_len: u8 = endian.parse(read_bytes(r)?);
+ let label_len = label_len as usize;
+        // The length byte plus the label text are padded together to a
+        // multiple of 8 bytes, so after the length byte itself there are
+        // `padded_len - 1` bytes of label and padding left to read.
+        let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
+        let mut label = read_vec(r, padded_len - 1)?;
+ label.truncate(label_len);
+ labels.push((value, label));
+ }
+ Ok(ValueLabel { offset, labels })
+}
+
+pub struct VarIndexes {
+ /// Offset from the start of the file to the start of the record.
+ pub offset: u64,
+
+    /// The 0-based indexes of the variables.
+ pub var_indexes: Vec<u32>,
+}
+
+impl VarIndexes {
+ /// Maximum number of variable indexes in a record.
+ pub const MAX: u32 = u32::MAX / 8;
+}
+
+fn read_var_indexes_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
+ let offset = r.stream_position()?;
+ let n: u32 = endian.parse(read_bytes(r)?);
+ if n > VarIndexes::MAX {
+ return Err(Error::BadNumberOfVarIndexes {
+ offset,
+ n,
+ max: VarIndexes::MAX,
+ });
+ }
+ let mut var_indexes = Vec::with_capacity(n as usize);
+ for _ in 0..n {
+ var_indexes.push(endian.parse(read_bytes(r)?));
+ }
+
+ Ok(VarIndexes {
+ offset,
+ var_indexes,
+ })
+}
+
+pub const DOC_LINE_LEN: u32 = 80;
+pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
+
+pub struct Document {
+ /// Offset from the start of the file to the start of the record.
+ pub pos: u64,
+
+ /// The document, as an array of 80-byte lines.
+ pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
+}
+
+fn read_document_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
+ let offset = r.stream_position()?;
+ let n: u32 = endian.parse(read_bytes(r)?);
+ match n {
+ 0..=DOC_MAX_LINES => {
+ let pos = r.stream_position()?;
+ let mut lines = Vec::with_capacity(n as usize);
+ for _ in 0..n {
+ let line: [u8; 80] = read_bytes(r)?;
+ lines.push(line);
+ }
+ Ok(Document { pos, lines })
+ }
+ _ => Err(Error::BadDocumentLength {
+ offset,
+ n,
+ max: DOC_MAX_LINES,
+ }),
+ }
+}
+
+#[derive(FromPrimitive)]
+enum ExtensionType {
+ /// Machine integer info.
+ Integer = 3,
+ /// Machine floating-point info.
+ Float = 4,
+ /// Variable sets.
+ VarSets = 5,
+ /// DATE.
+ Date = 6,
+ /// Multiple response sets.
+ Mrsets = 7,
+ /// SPSS Data Entry.
+ DataEntry = 8,
+ /// Extra product info text.
+ ProductInfo = 10,
+ /// Variable display parameters.
+ Display = 11,
+ /// Long variable names.
+ LongNames = 13,
+ /// Long strings.
+ LongStrings = 14,
+ /// Extended number of cases.
+ Ncases = 16,
+ /// Data file attributes.
+ FileAttrs = 17,
+ /// Variable attributes.
+ VarAttrs = 18,
+ /// Multiple response sets (extended).
+ Mrsets2 = 19,
+ /// Character encoding.
+ Encoding = 20,
+ /// Value labels for long strings.
+ LongLabels = 21,
+ /// Missing values for long strings.
+ LongMissing = 22,
+ /// "Format properties in dataview table".
+ Dataview = 24,
+}
+
+pub struct Extension {
+ /// Offset from the start of the file to the start of the record.
+ pub offset: u64,
+
+ /// Record subtype.
+ pub subtype: u32,
+
+ /// Size of each data element.
+ pub size: u32,
+
+ /// Number of data elements.
+ pub count: u32,
+
+ /// `size * count` bytes of data.
+ pub data: Vec<u8>,
+}
+
+/*
+fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
+ match extension {
+ /* Implemented record types. */
+ ExtensionType::Integer => (4, 8),
+ ExtensionType::Float => (8, 3),
+ ExtensionType::VarSets => (1, 0),
+ ExtensionType::Mrsets => (1, 0),
+ ExtensionType::ProductInfo => (1, 0),
+ ExtensionType::Display => (4, 0),
+ ExtensionType::LongNames => (1, 0),
+ ExtensionType::LongStrings => (1, 0),
+ ExtensionType::Ncases => (8, 2),
+ ExtensionType::FileAttrs => (1, 0),
+ ExtensionType::VarAttrs => (1, 0),
+ ExtensionType::Mrsets2 => (1, 0),
+ ExtensionType::Encoding => (1, 0),
+ ExtensionType::LongLabels => (1, 0),
+ ExtensionType::LongMissing => (1, 0),
+
+ /* Ignored record types. */
+ ExtensionType::Date => (0, 0),
+ ExtensionType::DataEntry => (0, 0),
+ ExtensionType::Dataview => (0, 0),
+ }
+}
+ */
+
+fn read_extension_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Extension, Error> {
+ let subtype = endian.parse(read_bytes(r)?);
+ let offset = r.stream_position()?;
+ let size: u32 = endian.parse(read_bytes(r)?);
+ let count = endian.parse(read_bytes(r)?);
+ let Some(product) = size.checked_mul(count) else {
+ return Err(Error::ExtensionRecordTooLarge {
+ offset,
+ subtype,
+ size,
+ count,
+ });
+ };
+ let offset = r.stream_position()?;
+ let data = read_vec(r, product as usize)?;
+ Ok(Extension {
+ offset,
+ subtype,
+ size,
+ count,
+ data,
+ })
+}
+
+pub struct ZHeader {
+ /// File offset to the start of the record.
+ pub offset: u64,
+
+ /// File offset to the ZLIB data header.
+ pub zheader_offset: u64,
+
+ /// File offset to the ZLIB trailer.
+ pub ztrailer_offset: u64,
+
+ /// Length of the ZLIB trailer in bytes.
+ pub ztrailer_len: u64,
+}
+
+fn read_zheader<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
+ let offset = r.stream_position()?;
+ let zheader_offset: u64 = endian.parse(read_bytes(r)?);
+ let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
+ let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
+
+ Ok(ZHeader {
+ offset,
+ zheader_offset,
+ ztrailer_offset,
+ ztrailer_len,
+ })
+}
+
+pub struct ZTrailer {
+ /// File offset to the start of the record.
+ pub offset: u64,
+
+ /// Compression bias as a negative integer, e.g. -100.
+ pub int_bias: i64,
+
+ /// Always observed as zero.
+ pub zero: u64,
+
+ /// Uncompressed size of each block, except possibly the last. Only
+ /// `0x3ff000` has been observed so far.
+ pub block_size: u32,
+
+    /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
+ pub blocks: Vec<ZBlock>,
+}
+
+pub struct ZBlock {
+ /// Offset of block of data if simple compression were used.
+ pub uncompressed_ofs: u64,
+
+ /// Actual offset within the file of the compressed data block.
+ pub compressed_ofs: u64,
+
+ /// The number of bytes in this data block after decompression. This is
+ /// `block_size` in every data block but the last, which may be smaller.
+ pub uncompressed_size: u32,
+
+ /// The number of bytes in this data block, as stored compressed in this
+ /// file.
+ pub compressed_size: u32,
+}
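+
+// Worked example of the trailer layout above: the four fixed `ZTrailer` fields
+// occupy 24 bytes and each `ZBlock` descriptor another 24, so a 72-byte
+// trailer holds (72 - 24) / 24 = 2 block descriptors.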
+
+fn read_ztrailer<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ ztrailer_ofs: u64,
+ ztrailer_len: u64,
+) -> Result<Option<ZTrailer>, Error> {
+ let start_offset = r.stream_position()?;
+ if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
+ return Ok(None);
+ }
+ let int_bias = endian.parse(read_bytes(r)?);
+ let zero = endian.parse(read_bytes(r)?);
+ let block_size = endian.parse(read_bytes(r)?);
+ let n_blocks: u32 = endian.parse(read_bytes(r)?);
+ let expected_n_blocks = (ztrailer_len - 24) / 24;
+ if n_blocks as u64 != expected_n_blocks {
+ return Err(Error::BadZlibTrailerNBlocks {
+ offset: ztrailer_ofs,
+ n_blocks,
+ expected_n_blocks,
+ ztrailer_len,
+ });
+ }
+ let mut blocks = Vec::with_capacity(n_blocks as usize);
+ for _ in 0..n_blocks {
+ let uncompressed_ofs = endian.parse(read_bytes(r)?);
+ let compressed_ofs = endian.parse(read_bytes(r)?);
+ let uncompressed_size = endian.parse(read_bytes(r)?);
+ let compressed_size = endian.parse(read_bytes(r)?);
+ blocks.push(ZBlock {
+ uncompressed_ofs,
+ compressed_ofs,
+ uncompressed_size,
+ compressed_size,
+ });
+ }
+ r.seek(SeekFrom::Start(start_offset))?;
+ Ok(Some(ZTrailer {
+ offset: ztrailer_ofs,
+ int_bias,
+ zero,
+ block_size,
+ blocks,
+ }))
+}
+
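+// Reads exactly `N` bytes, returning `Ok(None)` only when the reader is
+// already at end of file (zero bytes available). A partial read part of the
+// way through the block is reported as an `UnexpectedEof` I/O error by
+// `read_exact`.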
+fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
+ let mut buf = [0; N];
+ let n = r.read(&mut buf)?;
+ if n > 0 {
+ if n < N {
+ r.read_exact(&mut buf[n..])?;
+ }
+ Ok(Some(buf))
+ } else {
+ Ok(None)
+ }
+}
+
+fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
+ let mut buf = [0; N];
+ r.read_exact(&mut buf)?;
+ Ok(buf)
+}
+
+fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
+ let mut vec = vec![0; n];
+ r.read_exact(&mut vec)?;
+ Ok(vec)
+}