#[error("Not an SPSS system file")]
NotASystemFile,
+ #[error("Invalid magic number {0:?}")]
+ BadMagic([u8; 4]),
+
#[error("I/O error ({source})")]
Io {
#[from]
#[error("Invalid ZSAV compression code {0}")]
InvalidZsavCompression(u32),
+ #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
+ BadVariableWidth { offset: u64, width: i32 },
+
#[error("Misplaced type 4 record near offset {0:#x}.")]
MisplacedType4Record(u64),
#[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")]
MissingVariableIndexRecord { offset: u64 },
- #[error("At offset {offset:#x}, number of variables associated with a value label ({n}) is not between 1 and the number of variables ({max}).")]
- BadNumberOfValueLabelVariables { offset: u64, n: u32, max: u32 },
+    /// A variable index record (type 4) claims more indexes than `VarIndexes::MAX`.
+    #[error("At offset {offset:#x}, number of variable indexes ({n}) is greater than the maximum number ({max}).")]
+    BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
#[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
ExtensionRecordTooLarge {
ZLib,
}
-pub struct Reader<R: Read> {
- r: BufReader<R>,
- documents: Vec<DocumentRecord>,
- variables: Vec<VariableRecord>,
- value_labels: Vec<ValueLabelRecord>,
- extensions: Vec<ExtensionRecord>,
- zheader: Option<ZHeader>,
+/// A single record read from a system file, as yielded by iterating over a `Reader`.
+pub enum Record {
+ /// The file header, read first.
+ Header(Header),
+ /// A document record (record type 6).
+ Document(Document),
+ /// A variable record (record type 2).
+ Variable(Variable),
+ /// A value label record (record type 3).
+ ValueLabel(ValueLabel),
+ /// A variable index record (record type 4).
+ VarIndexes(VarIndexes),
+ /// An extension record (record type 7).
+ Extension(Extension),
+ /// The marker that ends the header records (record type 999).
+ EndOfHeaders,
}
-/// Magic number for a regular system file.
-pub const ASCII_MAGIC: &[u8; 4] = b"$FL2";
-
-/// Magic number for a system file that contains zlib-compressed data.
-pub const ASCII_ZMAGIC: &[u8; 4] = b"$FL3";
-
-/// Magic number for an EBDIC-encoded system file. This is `$FL2` encoded in
-/// EBCDIC.
-pub const EBCDIC_MAGIC: &[u8; 4] = &[0x5b, 0xc6, 0xd3, 0xf2];
+pub struct Header {
+ /// Magic number.
+ pub magic: Magic,
-pub struct FileHeader {
- /// First 4 bytes of the file, one of `ASCII_MAGIC`, `ASCII_ZMAGIC`, and
- /// `EBCDIC_MAGIC`.
- pub magic: [u8; 4],
+ /// Eye-catcher string, product name, in the file's encoding. Padded
+ /// on the right with spaces.
+ pub eye_catcher: [u8; 60],
- /// True if `magic` indicates that this file contained zlib-compressed data.
- pub is_zsav: bool,
+ /// Layout code, normally either 2 or 3.
+ pub layout_code: u32,
- /// True if `magic` indicates that this file contained EBCDIC data.
- pub is_ebcdic: bool,
+ /// Number of variable positions, or `None` if the value in the file is
+ /// questionably trustworthy.
+ pub nominal_case_size: Option<u32>,
- /// Endianness of the data in the file header.
- pub endianness: Endian,
+ /// Compression type, if any.
+ pub compression: Option<Compression>,
/// 0-based variable index of the weight variable, or `None` if the file is
/// unweighted.
pub weight_index: Option<u32>,
- /// Number of variable positions, or `None` if the value in the file is
- /// questionably trustworthy.
- pub nominal_case_size: Option<u32>,
+ /// Claimed number of cases, if known.
+ pub n_cases: Option<u32>,
+
+ /// Compression bias, usually 100.0.
+ pub bias: f64,
/// `dd mmm yy` in the file's encoding.
pub creation_date: [u8; 9],
/// `HH:MM:SS` in the file's encoding.
pub creation_time: [u8; 8],
- /// Eye-catcher string, then product name, in the file's encoding. Padded
- /// on the right with spaces.
- pub eye_catcher: [u8; 60],
-
/// File label, in the file's encoding. Padded on the right with spaces.
pub file_label: [u8; 64],
+
+ /// Endianness of the data in the file header.
+ pub endianness: Endian,
}
-impl<R: Read + Seek> Reader<R> {
- pub fn new(r: R, warn: impl Fn(Warning)) -> Result<Reader<R>, Error> {
- let mut r = BufReader::new(r);
-
- let header = read_header(&mut r, &warn)?;
- let e = header.endianness;
- let mut documents = Vec::new();
- let mut variables = Vec::new();
- let mut value_labels = Vec::new();
- let mut extensions = Vec::new();
- loop {
- let offset = r.stream_position()?;
- let rec_type: u32 = e.parse(read_bytes(&mut r)?);
- match rec_type {
- 2 => variables.push(read_variable_record(&mut r, e)?),
- 3 => value_labels.push(read_value_label_record(&mut r, e, variables.len())?),
- 4 => return Err(Error::MisplacedType4Record(offset)),
- 6 => documents.push(read_document_record(&mut r, e)?),
- 7 => extensions.push(read_extension_record(&mut r, e)?),
- 999 => break,
- _ => return Err(Error::BadRecordType { offset, rec_type }),
- }
+/// The 4-byte magic number read from the very start of a system file.
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub struct Magic([u8; 4]);
+
+impl Magic {
+ /// Magic number for a regular system file.
+ pub const SAV: Magic = Magic(*b"$FL2");
+
+ /// Magic number for a system file that contains zlib-compressed data.
+ pub const ZSAV: Magic = Magic(*b"$FL3");
+
+ /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
+ /// in EBCDIC.
+ pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
+}
+
+impl TryFrom<[u8; 4]> for Magic {
+    type Error = Error;
+
+    /// Accepts `value` only if it is one of the known magic numbers
+    /// (`Magic::SAV`, `Magic::ZSAV`, `Magic::EBCDIC`); any other byte
+    /// sequence is reported as `Error::BadMagic`.
+    fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
+        let candidate = Magic(value);
+        if [Magic::SAV, Magic::ZSAV, Magic::EBCDIC].contains(&candidate) {
+            Ok(candidate)
+        } else {
+            Err(Error::BadMagic(value))
+        }
+    }
+}
+
+/// Whether a variable is numeric or string, as derived from its width by
+/// `VarType::from_width`.
+enum VarType {
+ /// Chosen when the width is 0.
+ Number,
+ /// Chosen for every other width.
+ String,
+}
+
+impl VarType {
+ fn from_width(width: i32) -> VarType {
+ match width {
+ 0 => VarType::Number,
+ _ => VarType::String,
}
- let _: [u8; 4] = read_bytes(&mut r)?;
- let zheader = match header.is_zsav {
- true => Some(read_zheader(&mut r, e)?),
- false => None,
- };
+ }
+}
+
+/// Reads a system file one record at a time through its `Iterator`
+/// implementation.
+pub struct Reader<R: Read> {
+ /// Buffered source that records are read from.
+ r: BufReader<R>,
+ /// One entry per variable record read so far, derived from that record's
+ /// width.
+ var_types: Vec<VarType>,
+ /// What the reader will try to read next.
+ state: ReaderState,
+}
+
+/// Position of a `Reader` within the file; decides what the next call to
+/// `next` reads.
+enum ReaderState {
+ /// Nothing has been read yet; the file header is read next.
+ Start,
+ /// Reading header records, using the endianness and compression taken from
+ /// the file header.
+ Headers(Endian, Option<Compression>),
+ /// Entered after the end-of-headers record (type 999) when the header
+ /// reported no compression.
+ Data(Endian),
+ /// Nothing more will be yielded. Entered after an error, after the
+ /// end-of-headers record of a compressed file, or once `_next` reports that
+ /// there is nothing left.
+ End,
+}
+impl<R: Read + Seek> Reader<R> {
+ pub fn new(r: R) -> Result<Reader<R>, Error> {
Ok(Reader {
- r,
- documents,
- variables,
- value_labels,
- extensions,
- zheader,
+ r: BufReader::new(r),
+ var_types: Vec::new(),
+ state: ReaderState::Start,
})
}
+    /// Reads one record and works out which state the reader moves to next.
+    ///
+    /// Returns `Ok(Some((record, next_state)))` after reading a record,
+    /// `Ok(None)` when there is nothing left to yield, or an error if a
+    /// record could not be read.  `self.state` is not modified here; the
+    /// `Iterator` implementation stores the returned state.  Reading a
+    /// variable record does append that variable's type to `self.var_types`.
+    fn _next(&mut self) -> Result<Option<(Record, ReaderState)>, Error> {
+        match self.state {
+            ReaderState::Start => {
+                let header = read_header(&mut self.r)?;
+                let next_state = ReaderState::Headers(header.endianness, header.compression);
+                Ok(Some((Record::Header(header), next_state)))
+            }
+            ReaderState::Headers(endian, compression) => {
+                let rec_type: u32 = endian.parse(read_bytes(&mut self.r)?);
+                let record = match rec_type {
+                    2 => {
+                        let variable = read_variable_record(&mut self.r, endian)?;
+                        self.var_types.push(VarType::from_width(variable.width));
+                        Record::Variable(variable)
+                    }
+                    3 => Record::ValueLabel(read_value_label_record(&mut self.r, endian)?),
+                    4 => Record::VarIndexes(read_var_indexes_record(&mut self.r, endian)?),
+                    6 => Record::Document(read_document_record(&mut self.r, endian)?),
+                    7 => Record::Extension(read_extension_record(&mut self.r, endian)?),
+                    999 => {
+                        // Skip the 4 bytes that follow the record type.
+                        let _: [u8; 4] = read_bytes(&mut self.r)?;
+                        let next_state = match compression {
+                            None => ReaderState::Data(endian),
+                            _ => ReaderState::End,
+                        };
+                        return Ok(Some((Record::EndOfHeaders, next_state)));
+                    }
+                    _ => {
+                        // The offset reported is the position just past the
+                        // record type field, which has already been consumed.
+                        return Err(Error::BadRecordType {
+                            offset: self.r.stream_position()?,
+                            rec_type,
+                        })
+                    }
+                };
+                Ok(Some((record, ReaderState::Headers(endian, compression))))
+            }
+            // Case data is not read here, so there is nothing further to
+            // yield once the headers are done.  Covering `Data` also makes
+            // the `match` exhaustive, which it must be to compile.
+            ReaderState::Data(_) | ReaderState::End => Ok(None),
+        }
+    }
+}
+
+impl<R: Read + Seek> Iterator for Reader<R> {
+    type Item = Result<Record, Error>;
+
+    /// Yields the next record, or the error that stopped reading.
+    ///
+    /// The state returned by `_next` is stored for the following call.  When
+    /// `_next` reports an error or has nothing left, the reader moves to
+    /// `ReaderState::End`, so an error is yielded at most once.
+    fn next(&mut self) -> Option<Self::Item> {
+        let (item, following) = match self._next() {
+            Ok(Some((record, next_state))) => (Some(Ok(record)), next_state),
+            Ok(None) => (None, ReaderState::End),
+            Err(error) => (Some(Err(error)), ReaderState::End),
+        };
+        self.state = following;
+        item
+    }
}
-fn read_header<R: Read>(r: &mut R, warn: impl Fn(Warning)) -> Result<FileHeader, Error> {
+fn read_header<R: Read>(r: &mut R) -> Result<Header, Error> {
let magic: [u8; 4] = read_bytes(r)?;
- let (is_zsav, is_ebcdic) = match &magic {
- ASCII_MAGIC => (false, false),
- ASCII_ZMAGIC => (true, false),
- EBCDIC_MAGIC => (false, true),
- _ => return Err(Error::NotASystemFile),
- };
+ let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
let eye_catcher: [u8; 60] = read_bytes(r)?;
let layout_code: [u8; 4] = read_bytes(r)?;
let endianness = Endian::identify_u32(2, layout_code)
.or_else(|| Endian::identify_u32(2, layout_code))
.ok_or_else(|| Error::NotASystemFile)?;
+ let layout_code = endianness.parse(layout_code);
let nominal_case_size: u32 = endianness.parse(read_bytes(r)?);
let nominal_case_size =
(nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
let compression_code: u32 = endianness.parse(read_bytes(r)?);
- let compression = match (is_zsav, compression_code) {
- (false, 0) => None,
- (false, 1) => Some(Compression::Simple),
- (true, 2) => Some(Compression::ZLib),
- (false, code) => return Err(Error::InvalidSavCompression(code)),
- (true, code) => return Err(Error::InvalidZsavCompression(code)),
+ let compression = match (magic, compression_code) {
+ (Magic::ZSAV, 2) => Some(Compression::ZLib),
+ (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
+ (_, 0) => None,
+ (_, 1) => Some(Compression::Simple),
+ (_, code) => return Err(Error::InvalidSavCompression(code)),
};
let weight_index: u32 = endianness.parse(read_bytes(r)?);
let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
let bias: f64 = endianness.parse(read_bytes(r)?);
- if bias != 100.0 {
- warn(Warning::UnexpectedBias(bias))
- }
let creation_date: [u8; 9] = read_bytes(r)?;
let creation_time: [u8; 8] = read_bytes(r)?;
let file_label: [u8; 64] = read_bytes(r)?;
let _: [u8; 3] = read_bytes(r)?;
- Ok(FileHeader {
+ Ok(Header {
magic,
- is_zsav,
- is_ebcdic,
- endianness,
- weight_index,
+ layout_code,
nominal_case_size,
+ compression,
+ weight_index,
+ n_cases,
+ bias,
creation_date,
creation_time,
eye_catcher,
file_label,
+ endianness,
})
}
-pub struct VariableRecord {
+pub struct Variable {
/// Offset from the start of the file to the start of the record.
pub offset: u64,
fn read_variable_record<R: Read + Seek>(
r: &mut BufReader<R>,
e: Endian,
-) -> Result<VariableRecord, Error> {
+) -> Result<Variable, Error> {
let offset = r.stream_position()?;
let width: i32 = e.parse(read_bytes(r)?);
let has_variable_label: u32 = e.parse(read_bytes(r)?);
}
}
- Ok(VariableRecord {
+ Ok(Variable {
offset,
width,
name,
})
}
-pub struct ValueLabelRecord {
+pub struct ValueLabel {
/// Offset from the start of the file to the start of the record.
pub offset: u64,
/// The labels.
pub labels: Vec<([u8; 8], Vec<u8>)>,
-
- /// The 0-based indexes of the variables to which the labels are assigned.
- pub var_indexes: Vec<u32>,
}
-pub const MAX_VALUE_LABELS: u32 = u32::MAX / 8;
+impl ValueLabel {
+ /// Maximum number of value labels in a record.
+ pub const MAX: u32 = u32::MAX / 8;
+}
fn read_value_label_record<R: Read + Seek>(
r: &mut BufReader<R>,
e: Endian,
- n_var_records: usize,
-) -> Result<ValueLabelRecord, Error> {
+) -> Result<ValueLabel, Error> {
let offset = r.stream_position()?;
let n: u32 = e.parse(read_bytes(r)?);
- if n > MAX_VALUE_LABELS {
+ if n > ValueLabel::MAX {
return Err(Error::BadNumberOfValueLabels {
offset,
n,
- max: MAX_VALUE_LABELS,
+ max: ValueLabel::MAX,
});
}
label.truncate(label_len);
labels.push((value, label));
}
+ Ok(ValueLabel { offset, labels })
+}
- let rec_type: u32 = e.parse(read_bytes(r)?);
- if rec_type != 4 {
- return Err(Error::MissingVariableIndexRecord {
- offset: r.stream_position()?,
- });
- }
+pub struct VarIndexes {
+ /// Offset from the start of the file to the start of the record.
+ pub offset: u64,
- let n_vars: u32 = e.parse(read_bytes(r)?);
- if n_vars < 1 || n_vars as usize > n_var_records {
- return Err(Error::BadNumberOfValueLabelVariables {
- offset: r.stream_position()?,
- n: n_vars,
- max: n_var_records as u32,
+ /// The 0-based indexes of the variables to which the value labels are assigned.
+ pub var_indexes: Vec<u32>,
+}
+
+impl VarIndexes {
+ /// Maximum number of variable indexes in a record.
+ pub const MAX: u32 = u32::MAX / 8;
+}
+
+fn read_var_indexes_record<R: Read + Seek>(
+ r: &mut BufReader<R>,
+ e: Endian,
+) -> Result<VarIndexes, Error> {
+ let offset = r.stream_position()?;
+ let n: u32 = e.parse(read_bytes(r)?);
+ if n > VarIndexes::MAX {
+ return Err(Error::BadNumberOfVarIndexes {
+ offset,
+ n,
+ max: VarIndexes::MAX,
});
}
- let mut var_indexes = Vec::with_capacity(n_vars as usize);
- for _ in 0..n_vars {
+ let mut var_indexes = Vec::with_capacity(n as usize);
+ for _ in 0..n {
var_indexes.push(e.parse(read_bytes(r)?));
}
- Ok(ValueLabelRecord {
+ Ok(VarIndexes {
offset,
- labels,
var_indexes,
})
}
pub const DOC_LINE_LEN: u32 = 80;
pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
-pub struct DocumentRecord {
+pub struct Document {
/// Offset from the start of the file to the start of the record.
pub pos: u64,
fn read_document_record<R: Read + Seek>(
r: &mut BufReader<R>,
e: Endian,
-) -> Result<DocumentRecord, Error> {
+) -> Result<Document, Error> {
let offset = r.stream_position()?;
let n: u32 = e.parse(read_bytes(r)?);
match n {
let line: [u8; 80] = read_bytes(r)?;
lines.push(line);
}
- Ok(DocumentRecord { pos, lines })
+ Ok(Document { pos, lines })
}
_ => Err(Error::BadDocumentLength {
offset,
}
#[derive(FromPrimitive)]
-enum Extension {
+enum ExtensionType {
/// Machine integer info.
Integer = 3,
/// Machine floating-point info.
Dataview = 24,
}
-struct ExtensionRecord {
+pub struct Extension {
/// Offset from the start of the file to the start of the record.
- offset: u64,
+ pub offset: u64,
/// Record subtype.
- subtype: u32,
+ pub subtype: u32,
/// Size of each data element.
- size: u32,
+ pub size: u32,
/// Number of data elements.
- count: u32,
+ pub count: u32,
/// `size * count` bytes of data.
- data: Vec<u8>,
+ pub data: Vec<u8>,
}
-fn extension_record_size_requirements(extension: Extension) -> (u32, u32) {
+fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
match extension {
/* Implemented record types. */
- Extension::Integer => (4, 8),
- Extension::Float => (8, 3),
- Extension::VarSets => (1, 0),
- Extension::Mrsets => (1, 0),
- Extension::ProductInfo => (1, 0),
- Extension::Display => (4, 0),
- Extension::LongNames => (1, 0),
- Extension::LongStrings => (1, 0),
- Extension::Ncases => (8, 2),
- Extension::FileAttrs => (1, 0),
- Extension::VarAttrs => (1, 0),
- Extension::Mrsets2 => (1, 0),
- Extension::Encoding => (1, 0),
- Extension::LongLabels => (1, 0),
- Extension::LongMissing => (1, 0),
+ ExtensionType::Integer => (4, 8),
+ ExtensionType::Float => (8, 3),
+ ExtensionType::VarSets => (1, 0),
+ ExtensionType::Mrsets => (1, 0),
+ ExtensionType::ProductInfo => (1, 0),
+ ExtensionType::Display => (4, 0),
+ ExtensionType::LongNames => (1, 0),
+ ExtensionType::LongStrings => (1, 0),
+ ExtensionType::Ncases => (8, 2),
+ ExtensionType::FileAttrs => (1, 0),
+ ExtensionType::VarAttrs => (1, 0),
+ ExtensionType::Mrsets2 => (1, 0),
+ ExtensionType::Encoding => (1, 0),
+ ExtensionType::LongLabels => (1, 0),
+ ExtensionType::LongMissing => (1, 0),
/* Ignored record types. */
- Extension::Date => (0, 0),
- Extension::DataEntry => (0, 0),
- Extension::Dataview => (0, 0),
+ ExtensionType::Date => (0, 0),
+ ExtensionType::DataEntry => (0, 0),
+ ExtensionType::Dataview => (0, 0),
}
}
fn read_extension_record<R: Read + Seek>(
r: &mut BufReader<R>,
e: Endian,
-) -> Result<ExtensionRecord, Error> {
+) -> Result<Extension, Error> {
let subtype = e.parse(read_bytes(r)?);
let offset = r.stream_position()?;
let size: u32 = e.parse(read_bytes(r)?);
};
let offset = r.stream_position()?;
let data = read_vec(r, product as usize)?;
- Ok(ExtensionRecord {
+ Ok(Extension {
offset,
subtype,
size,
let ztrailer_offset: u64 = e.parse(read_bytes(r)?);
let ztrailer_len: u64 = e.parse(read_bytes(r)?);
- if zheader_offset != offset {
- return Err(Error::BadZlibHeaderOffset {
- offset,
- zheader_offset,
- });
- }
- if ztrailer_offset < offset {
- return Err(Error::BadZlibTrailerOffset {
- offset,
- ztrailer_offset,
- });
- }
- if ztrailer_len < 24 || ztrailer_len % 24 != 0 {
- return Err(Error::BadZlibTrailerLen {
- offset,
- ztrailer_len,
- });
- }
-
Ok(ZHeader {
offset,
zheader_offset,