-#![allow(unused_variables)]
-use endian::{Endian, Parse};
-use num::Integer;
-use std::io::{BufReader, Error as IoError, Read, Seek};
-use thiserror::Error;
-
pub mod endian;
-
-#[derive(Error, Debug)]
-pub enum Error {
- #[error("Not an SPSS system file")]
- NotASystemFile,
-
- #[error("I/O error ({source})")]
- Io {
- #[from]
- source: IoError,
- },
-
- #[error("Invalid SAV compression code {0}")]
- InvalidSavCompression(u32),
-
- #[error("Invalid ZSAV compression code {0}")]
- InvalidZsavCompression(u32),
-
- #[error("Misplaced type 4 record near offset {0:#x}.")]
- MisplacedType4Record(u64),
-
- #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
- BadDocumentLength { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")]
- BadRecordType { offset: u64, rec_type: u32 },
-
- #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
- BadVariableLabelCode { offset: u64, code: u32 },
-
- #[error(
- "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
- )]
- BadNumericMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
- BadStringMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
- BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")]
- MissingVariableIndexRecord { offset: u64 },
-
- #[error("At offset {offset:#x}, number of variables associated with a value label ({n}) is not between 1 and the number of variables ({max}).")]
- BadNumberOfValueLabelVariables { offset: u64, n: u32, max: u32 },
-}
-
-#[derive(Error, Debug)]
-pub enum Warning {
- #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")]
- UnexpectedBias(f64),
-
- #[error("Duplicate type 6 (document) record.")]
- DuplicateDocumentRecord,
-}
+pub mod raw;
+pub mod cooked;
+pub mod sack;
+pub mod encoding;
+pub mod format;
+pub mod identifier;
+/// Compression scheme identified by the compression code in a system file
+/// header.
#[derive(Copy, Clone, Debug)]
pub enum Compression {
+ /// zlib compression: header compression code 2 in a file whose magic
+ /// number is `$FL3`.
ZLib,
}
-pub struct Reader<R: Read> {
- r: BufReader<R>,
-
- document_record: Option<DocumentRecord>,
-
- variables: Vec<VariableRecord>,
-
- value_labels: Vec<ValueLabelRecord>,
-}
-
-/// Magic number for a regular system file.
-pub const ASCII_MAGIC: &[u8; 4] = b"$FL2";
-
-/// Magic number for a system file that contains zlib-compressed data.
-pub const ASCII_ZMAGIC: &[u8; 4] = b"$FL3";
-
-/// Magic number for an EBDIC-encoded system file. This is `$FL2` encoded in
-/// EBCDIC.
-pub const EBCDIC_MAGIC: &[u8; 4] = &[0x5b, 0xc6, 0xd3, 0xf2];
-
-pub struct FileHeader {
- /// First 4 bytes of the file, one of `ASCII_MAGIC`, `ASCII_ZMAGIC`, and
- /// `EBCDIC_MAGIC`.
- pub magic: [u8; 4],
-
- /// True if `magic` indicates that this file contained zlib-compressed data.
- pub is_zsav: bool,
-
- /// True if `magic` indicates that this file contained EBCDIC data.
- pub is_ebcdic: bool,
-
- /// Endianness of the data in the file header.
- pub endianness: Endian,
-
- /// 0-based variable index of the weight variable, or `None` if the file is
- /// unweighted.
- pub weight_index: Option<u32>,
-
- /// Number of variable positions, or `None` if the value in the file is
- /// questionably trustworthy.
- pub nominal_case_size: Option<u32>,
-
- /// `dd mmm yy` in the file's encoding.
- pub creation_date: [u8; 9],
-
- /// `HH:MM:SS` in the file's encoding.
- pub creation_time: [u8; 8],
-
- /// Eye-catcher string, then product name, in the file's encoding. Padded
- /// on the right with spaces.
- pub eye_catcher: [u8; 60],
-
- /// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: [u8; 64],
-}
-
-pub const DOC_LINE_LEN: u32 = 80;
-pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
-
-impl<R: Read + Seek> Reader<R> {
- pub fn new(r: R, warn: impl Fn(Warning)) -> Result<Reader<R>, Error> {
- let mut r = BufReader::new(r);
-
- let header = read_header(&mut r, &warn)?;
- let e = header.endianness;
- let mut document_record = None;
- let mut variables = Vec::new();
- let mut value_labels = Vec::new();
- loop {
- let offset = r.stream_position()?;
- let rec_type: u32 = e.parse(read_bytes(&mut r)?);
- match rec_type {
- 2 => variables.push(read_variable_record(&mut r, e)?),
- 3 => value_labels.push(read_value_label_record(&mut r, e, variables.len())?),
- // A Type 4 record is always immediately after a type 3 record,
- // the code for type 3 records reads the type 4 record too.
- 4 => return Err(Error::MisplacedType4Record(offset)),
-
- 6 => {
- let d = read_document_record(&mut r, e)?;
- if document_record.is_some() {
- warn(Warning::DuplicateDocumentRecord);
- } else {
- document_record = d;
- }
- }
- /*
- 7 => d.read_extension_record()?,
- */
- 999 => break,
- _ => return Err(Error::BadRecordType { offset, rec_type }),
- }
- }
-
- Ok(Reader {
- r,
- document_record,
- variables,
- value_labels,
- })
- }
-}
-
-fn read_header<R: Read>(r: &mut R, warn: impl Fn(Warning)) -> Result<FileHeader, Error> {
- let magic: [u8; 4] = read_bytes(r)?;
- let (is_zsav, is_ebcdic) = match &magic {
- ASCII_MAGIC => (false, false),
- ASCII_ZMAGIC => (true, false),
- EBCDIC_MAGIC => (false, true),
- _ => return Err(Error::NotASystemFile),
- };
-
- let eye_catcher: [u8; 60] = read_bytes(r)?;
- let layout_code: [u8; 4] = read_bytes(r)?;
- let endianness = Endian::identify_u32(2, layout_code)
- .or_else(|| Endian::identify_u32(2, layout_code))
- .ok_or_else(|| Error::NotASystemFile)?;
-
- let nominal_case_size: u32 = endianness.parse(read_bytes(r)?);
- let nominal_case_size =
- (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
-
- let compression_code: u32 = endianness.parse(read_bytes(r)?);
- let compression = match (is_zsav, compression_code) {
- (false, 0) => None,
- (false, 1) => Some(Compression::Simple),
- (true, 2) => Some(Compression::ZLib),
- (false, code) => return Err(Error::InvalidSavCompression(code)),
- (true, code) => return Err(Error::InvalidZsavCompression(code)),
- };
-
- let weight_index: u32 = endianness.parse(read_bytes(r)?);
- let weight_index = (weight_index > 0).then_some(weight_index - 1);
-
- let n_cases: u32 = endianness.parse(read_bytes(r)?);
- let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
-
- let bias: f64 = endianness.parse(read_bytes(r)?);
- if bias != 100.0 {
- warn(Warning::UnexpectedBias(bias))
- }
-
- let creation_date: [u8; 9] = read_bytes(r)?;
- let creation_time: [u8; 8] = read_bytes(r)?;
- let file_label: [u8; 64] = read_bytes(r)?;
- let _: [u8; 3] = read_bytes(r)?;
-
- Ok(FileHeader {
- magic,
- is_zsav,
- is_ebcdic,
- endianness,
- weight_index,
- nominal_case_size,
- creation_date,
- creation_time,
- eye_catcher,
- file_label,
- })
-}
-
-pub struct VariableRecord {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
-
- /// Variable width, in the range -1..=255.
- pub width: i32,
-
- /// Variable name, padded on the right with spaces.
- pub name: [u8; 8],
-
- /// Print format.
- pub print_format: u32,
-
- /// Write format.
- pub write_format: u32,
-
- /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
- pub missing_value_code: i32,
-
- /// Raw missing values, up to 3 of them.
- pub missing: Vec<[u8; 8]>,
-
- /// Optional variable label.
- pub label: Option<Vec<u8>>,
-}
-
-fn read_variable_record<R: Read + Seek>(
- r: &mut BufReader<R>,
- e: Endian,
-) -> Result<VariableRecord, Error> {
- let offset = r.stream_position()?;
- let width: i32 = e.parse(read_bytes(r)?);
- let has_variable_label: u32 = e.parse(read_bytes(r)?);
- let missing_value_code: i32 = e.parse(read_bytes(r)?);
- let print_format: u32 = e.parse(read_bytes(r)?);
- let write_format: u32 = e.parse(read_bytes(r)?);
- let name: [u8; 8] = read_bytes(r)?;
-
- let label = match has_variable_label {
- 0 => None,
- 1 => {
- let len: u32 = e.parse(read_bytes(r)?);
- let read_len = len.min(65535) as usize;
- let label = Some(read_vec(r, read_len)?);
-
- let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
- let _ = read_vec(r, padding_bytes as usize)?;
-
- label
- }
- _ => {
- return Err(Error::BadVariableLabelCode {
- offset,
- code: has_variable_label,
- })
- }
- };
-
- let mut missing = Vec::new();
- if missing_value_code != 0 {
- match (width, missing_value_code) {
- (0, -3 | -2 | 1 | 2 | 3) => (),
- (0, _) => {
- return Err(Error::BadNumericMissingValueCode {
- offset,
- code: missing_value_code,
- })
- }
- (_, 0..=3) => (),
- (_, _) => {
- return Err(Error::BadStringMissingValueCode {
- offset,
- code: missing_value_code,
- })
- }
- }
-
- for _ in 0..missing_value_code.abs() {
- missing.push(read_bytes(r)?);
- }
- }
-
- Ok(VariableRecord {
- offset,
- width,
- name,
- print_format,
- write_format,
- missing_value_code,
- missing,
- label,
- })
+/// Selects where category labels are taken from.
+///
+/// NOTE(review): nothing in this file uses this type; presumably it describes
+/// the label source for a multiple response set -- confirm against the module
+/// that consumes it.
+#[derive(Clone, Debug)]
+pub enum CategoryLabels {
+ /// Presumably: use the variables' labels -- TODO confirm.
+ VarLabels,
+ /// Presumably: use the labels of the counted values -- TODO confirm.
+ CountedValues,
}
-
-pub struct ValueLabelRecord {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
-
- /// The labels.
- pub labels: Vec<([u8; 8], Vec<u8>)>,
-
- /// The 0-based indexes of the variables to which the labels are assigned.
- pub var_indexes: Vec<u32>,
-}
-
-pub const MAX_VALUE_LABELS: u32 = u32::MAX / 8;
-
-fn read_value_label_record<R: Read + Seek>(
- r: &mut BufReader<R>,
- e: Endian,
- n_var_records: usize,
-) -> Result<ValueLabelRecord, Error> {
- let offset = r.stream_position()?;
- let n: u32 = e.parse(read_bytes(r)?);
- if n > MAX_VALUE_LABELS {
- return Err(Error::BadNumberOfValueLabels {
- offset,
- n,
- max: MAX_VALUE_LABELS,
- });
- }
-
- let mut labels = Vec::new();
- for _ in 0..n {
- let value: [u8; 8] = read_bytes(r)?;
- let label_len: u8 = e.parse(read_bytes(r)?);
- let label_len = label_len as usize;
- let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
-
- let mut label = read_vec(r, padded_len)?;
- label.truncate(label_len);
- labels.push((value, label));
- }
-
- let rec_type: u32 = e.parse(read_bytes(r)?);
- if rec_type != 4 {
- return Err(Error::MissingVariableIndexRecord {
- offset: r.stream_position()?,
- });
- }
-
- let n_vars: u32 = e.parse(read_bytes(r)?);
- if n_vars < 1 || n_vars as usize > n_var_records {
- return Err(Error::BadNumberOfValueLabelVariables {
- offset: r.stream_position()?,
- n: n_vars,
- max: n_var_records as u32,
- });
- }
- let mut var_indexes = Vec::with_capacity(n_vars as usize);
- for _ in 0..n_vars {
- var_indexes.push(e.parse(read_bytes(r)?));
- }
-
- Ok(ValueLabelRecord {
- offset,
- labels,
- var_indexes,
- })
-}
-
-pub struct DocumentRecord {
- /// Offset from the start of the file to the start of the record.
- pub pos: u64,
-
- /// The document, as an array of 80-byte lines.
- pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
-}
-
-fn read_document_record<R: Read + Seek>(
- r: &mut BufReader<R>,
- e: Endian,
-) -> Result<Option<DocumentRecord>, Error> {
- let offset = r.stream_position()?;
- let n: u32 = e.parse(read_bytes(r)?);
- if n == 0 {
- Ok(None)
- } else if n > DOC_MAX_LINES {
- Err(Error::BadDocumentLength {
- offset,
- n,
- max: DOC_MAX_LINES,
- })
- } else {
- let pos = r.stream_position()?;
- let mut lines = Vec::with_capacity(n as usize);
- for i in 0..n {
- let line: [u8; 80] = read_bytes(r)?;
- lines.push(line);
- }
- Ok(Some(DocumentRecord { pos, lines }))
- }
-}
-
-fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
- let mut buf = [0; N];
- r.read_exact(&mut buf)?;
- Ok(buf)
-}
-
-fn read_vec<R: Read>(r: &mut BufReader<R>, n: usize) -> Result<Vec<u8>, IoError> {
- let mut vec = vec![0; n];
- r.read_exact(&mut vec)?;
- Ok(vec)
-}
-
-/*
-fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
- while s.last() == Some(&c) {
- s.pop();
- }
- s
-}
-
-fn skip_bytes<R: Read>(r: &mut R, mut n: u64) -> Result<(), IoError> {
- let mut buf = [0; 1024];
- while n > 0 {
- let chunk = u64::min(n, buf.len() as u64);
- r.read_exact(&mut buf[0..chunk as usize])?;
- n -= chunk;
- }
- Ok(())
-}
-
-*/