1 #![allow(unused_variables)]
2 use endian::{Endian, Parse};
4 use num_derive::FromPrimitive;
5 use std::io::{BufReader, Error as IoError, Read, Seek};
10 #[derive(Error, Debug)]
12 #[error("Not an SPSS system file")]
15 #[error("Invalid magic number {0:?}")]
18 #[error("I/O error ({source})")]
24 #[error("Invalid SAV compression code {0}")]
25 InvalidSavCompression(u32),
27 #[error("Invalid ZSAV compression code {0}")]
28 InvalidZsavCompression(u32),
30 #[error("Misplaced type 4 record near offset {0:#x}.")]
31 MisplacedType4Record(u64),
33 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
34 BadDocumentLength { offset: u64, n: u32, max: u32 },
36 #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")]
37 BadRecordType { offset: u64, rec_type: u32 },
39 #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
40 BadVariableLabelCode { offset: u64, code: u32 },
43 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
45 BadNumericMissingValueCode { offset: u64, code: i32 },
47 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
48 BadStringMissingValueCode { offset: u64, code: i32 },
50 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
51 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
53 #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")]
54 MissingVariableIndexRecord { offset: u64 },
56 #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
57 BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
59 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
60 ExtensionRecordTooLarge {
67 #[error("Wrong ZLIB data header offset {zheader_offset:#x} (expected {offset:#x}).")]
68 BadZlibHeaderOffset { offset: u64, zheader_offset: u64 },
70 #[error("At offset {offset:#x}, impossible ZLIB trailer offset {ztrailer_offset:#x}.")]
71 BadZlibTrailerOffset { offset: u64, ztrailer_offset: u64 },
73 #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")]
74 BadZlibTrailerLen { offset: u64, ztrailer_len: u64 },
77 #[derive(Error, Debug)]
79 #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")]
82 #[error("Duplicate type 6 (document) record.")]
83 DuplicateDocumentRecord,
86 #[derive(Copy, Clone, Debug)]
87 pub enum Compression {
96 ValueLabel(ValueLabel),
97 VarIndexes(VarIndexes),
106 /// Endianness of the data in the file header.
107 pub endianness: Endian,
109 /// 0-based variable index of the weight variable, or `None` if the file is
111 pub weight_index: Option<u32>,
113 /// Number of variable positions, or `None` if the value in the file is
114 /// questionably trustworthy.
115 pub nominal_case_size: Option<u32>,
117 /// `dd mmm yy` in the file's encoding.
118 pub creation_date: [u8; 9],
120 /// `HH:MM:SS` in the file's encoding.
121 pub creation_time: [u8; 8],
123 /// Eye-catcher string, then product name, in the file's encoding. Padded
124 /// on the right with spaces.
125 pub eye_catcher: [u8; 60],
127 /// File label, in the file's encoding. Padded on the right with spaces.
128 pub file_label: [u8; 64],
131 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
132 pub struct Magic([u8; 4]);
135 /// Magic number for a regular system file.
136 pub const SAV: Magic = Magic(*b"$FL2");
138 /// Magic number for a system file that contains zlib-compressed data.
139 pub const ZSAV: Magic = Magic(*b"$FL3");
141 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
143 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
146 impl TryFrom<[u8; 4]> for Magic {
149 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
150 let magic = Magic(value);
152 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
153 _ => Err(Error::BadMagic(value)),
158 pub struct Reader<R: Read> {
161 endianness: Option<Endian>,
170 impl<R: Read + Seek> Reader<R> {
171 pub fn new(r: R) -> Result<Reader<R>, Error> {
173 r: BufReader::new(r),
174 state: ReaderState::Start,
178 fn _next(&mut self) -> Result<Option<(Record, ReaderState)>, Error> {
180 ReaderState::Start => {
181 let header = read_header(&mut self.r)?;
182 let endianness = header.endianness;
183 Ok(Some((Record::Header(header), ReaderState::Headers(endianness))))
185 ReaderState::Headers(e) => {
186 let rec_type: u32 = e.parse(read_bytes(&mut self.r)?);
187 let record = match rec_type {
188 2 => Record::Variable(read_variable_record(&mut self.r, e)?),
189 3 => Record::ValueLabel(read_value_label_record(&mut self.r, e)?),
190 4 => Record::VarIndexes(read_var_indexes_record(&mut self.r, e)?),
191 6 => Record::Document(read_document_record(&mut self.r, e)?),
192 7 => Record::Extension(read_extension_record(&mut self.r, e)?),
194 let _: [u8; 4] = read_bytes(&mut self.r)?;
195 return Ok(Some((Record::EndOfHeaders, ReaderState::End)))
198 return Err(Error::BadRecordType {
199 offset: self.r.stream_position()?,
204 Ok(Some((record, ReaderState::Headers(e))))
206 ReaderState::End => Ok(None),
211 impl<R: Read + Seek> Iterator for Reader<R> {
212 type Item = Result<Record, Error>;
214 fn next(&mut self) -> Option<Self::Item> {
215 let retval = self._next();
218 self.state = ReaderState::End;
221 Ok(Some((record, next_state))) => {
222 self.state = next_state;
226 self.state = ReaderState::End;
233 fn read_header<R: Read>(r: &mut R) -> Result<Header, Error> {
234 let magic: [u8; 4] = read_bytes(r)?;
235 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
237 let eye_catcher: [u8; 60] = read_bytes(r)?;
238 let layout_code: [u8; 4] = read_bytes(r)?;
// The layout code is written as either 2 or 3. Try both values so that the
// byte order can be identified whichever one the writer used; if neither
// decodes in any byte order, this is not a system file.
239 let endianness = Endian::identify_u32(2, layout_code)
240 .or_else(|| Endian::identify_u32(3, layout_code))
241 .ok_or_else(|| Error::NotASystemFile)?;
243 let nominal_case_size: u32 = endianness.parse(read_bytes(r)?);
244 let nominal_case_size =
245 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
247 let compression_code: u32 = endianness.parse(read_bytes(r)?);
248 let compression = match (magic, compression_code) {
249 (Magic::ZSAV, 2) => Some(Compression::ZLib),
250 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
252 (_, 1) => Some(Compression::Simple),
253 (_, code) => return Err(Error::InvalidSavCompression(code)),
256 let weight_index: u32 = endianness.parse(read_bytes(r)?);
// A stored value of 0 maps to `None`; any other value is converted to a
// 0-based index. `checked_sub` is used instead of
// `(weight_index > 0).then_some(weight_index - 1)` because `then_some`
// evaluates its argument eagerly, so the subtraction would underflow (and
// panic in debug builds) when the stored value is 0.
257 let weight_index = weight_index.checked_sub(1);
259 let n_cases: u32 = endianness.parse(read_bytes(r)?);
260 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
262 let bias: f64 = endianness.parse(read_bytes(r)?);
264 let creation_date: [u8; 9] = read_bytes(r)?;
265 let creation_time: [u8; 8] = read_bytes(r)?;
266 let file_label: [u8; 64] = read_bytes(r)?;
267 let _: [u8; 3] = read_bytes(r)?;
281 pub struct Variable {
282 /// Offset from the start of the file to the start of the record.
285 /// Variable width, in the range -1..=255.
288 /// Variable name, padded on the right with spaces.
292 pub print_format: u32,
295 pub write_format: u32,
297 /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
298 pub missing_value_code: i32,
300 /// Raw missing values, up to 3 of them.
301 pub missing: Vec<[u8; 8]>,
303 /// Optional variable label.
304 pub label: Option<Vec<u8>>,
307 fn read_variable_record<R: Read + Seek>(
308 r: &mut BufReader<R>,
310 ) -> Result<Variable, Error> {
311 let offset = r.stream_position()?;
312 let width: i32 = e.parse(read_bytes(r)?);
313 let has_variable_label: u32 = e.parse(read_bytes(r)?);
314 let missing_value_code: i32 = e.parse(read_bytes(r)?);
315 let print_format: u32 = e.parse(read_bytes(r)?);
316 let write_format: u32 = e.parse(read_bytes(r)?);
317 let name: [u8; 8] = read_bytes(r)?;
319 let label = match has_variable_label {
322 let len: u32 = e.parse(read_bytes(r)?);
323 let read_len = len.min(65535) as usize;
324 let label = Some(read_vec(r, read_len)?);
// Only `read_len` bytes of the label were consumed above (labels longer
// than 65535 bytes are truncated), so skip the unread remainder of the
// label together with the padding that rounds `len` up to a multiple of 4.
// The rounding is done in `u64` so that a `len` near `u32::MAX` cannot
// overflow, and the bytes are discarded in chunks rather than allocated.
326 let skip_len = Integer::next_multiple_of(&u64::from(len), &4) - read_len as u64;
327 skip_bytes(r, skip_len)?;
332 return Err(Error::BadVariableLabelCode {
334 code: has_variable_label,
339 let mut missing = Vec::new();
340 if missing_value_code != 0 {
341 match (width, missing_value_code) {
342 (0, -3 | -2 | 1 | 2 | 3) => (),
344 return Err(Error::BadNumericMissingValueCode {
346 code: missing_value_code,
351 return Err(Error::BadStringMissingValueCode {
353 code: missing_value_code,
358 for _ in 0..missing_value_code.abs() {
359 missing.push(read_bytes(r)?);
375 pub struct ValueLabel {
376 /// Offset from the start of the file to the start of the record.
380 pub labels: Vec<([u8; 8], Vec<u8>)>,
384 /// Maximum number of value labels in a record.
385 pub const MAX: u32 = u32::MAX / 8;
388 fn read_value_label_record<R: Read + Seek>(
389 r: &mut BufReader<R>,
391 ) -> Result<ValueLabel, Error> {
392 let offset = r.stream_position()?;
393 let n: u32 = e.parse(read_bytes(r)?);
394 if n > ValueLabel::MAX {
395 return Err(Error::BadNumberOfValueLabels {
398 max: ValueLabel::MAX,
402 let mut labels = Vec::new();
404 let value: [u8; 8] = read_bytes(r)?;
405 let label_len: u8 = e.parse(read_bytes(r)?);
406 let label_len = label_len as usize;
// The length byte and the label text together are padded to a multiple of
// 8 bytes. `padded_len` is therefore at least 8 and includes the length
// byte that has already been read, so only `padded_len - 1` bytes remain.
407 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
409 let mut label = read_vec(r, padded_len - 1)?;
// Drop the trailing padding, keeping just the label text.
410 label.truncate(label_len);
411 labels.push((value, label));
413 Ok(ValueLabel { offset, labels })
416 pub struct VarIndexes {
417 /// Offset from the start of the file to the start of the record.
420 /// The 0-based indexes of the variable indexes.
421 pub var_indexes: Vec<u32>,
425 /// Maximum number of variable indexes in a record.
426 pub const MAX: u32 = u32::MAX / 8;
429 fn read_var_indexes_record<R: Read + Seek>(
430 r: &mut BufReader<R>,
432 ) -> Result<VarIndexes, Error> {
433 let offset = r.stream_position()?;
434 let n: u32 = e.parse(read_bytes(r)?);
435 if n > VarIndexes::MAX {
436 return Err(Error::BadNumberOfVarIndexes {
439 max: VarIndexes::MAX,
442 let mut var_indexes = Vec::with_capacity(n as usize);
444 var_indexes.push(e.parse(read_bytes(r)?));
453 pub const DOC_LINE_LEN: u32 = 80;
454 pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
456 pub struct Document {
457 /// Offset from the start of the file to the start of the record.
460 /// The document, as an array of 80-byte lines.
461 pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
464 fn read_document_record<R: Read + Seek>(
465 r: &mut BufReader<R>,
467 ) -> Result<Document, Error> {
468 let offset = r.stream_position()?;
469 let n: u32 = e.parse(read_bytes(r)?);
471 0..=DOC_MAX_LINES => {
472 let pos = r.stream_position()?;
473 let mut lines = Vec::with_capacity(n as usize);
475 let line: [u8; 80] = read_bytes(r)?;
478 Ok(Document { pos, lines })
480 _ => Err(Error::BadDocumentLength {
488 #[derive(FromPrimitive)]
490 /// Machine integer info.
492 /// Machine floating-point info.
498 /// Multiple response sets.
502 /// Extra product info text.
504 /// Variable display parameters.
506 /// Long variable names.
510 /// Extended number of cases.
512 /// Data file attributes.
514 /// Variable attributes.
516 /// Multiple response sets (extended).
518 /// Character encoding.
520 /// Value labels for long strings.
522 /// Missing values for long strings.
524 /// "Format properties in dataview table".
528 pub struct Extension {
529 /// Offset from the start of the file to the start of the record.
535 /// Size of each data element.
538 /// Number of data elements.
541 /// `size * count` bytes of data.
545 fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
547 /* Implemented record types. */
548 ExtensionType::Integer => (4, 8),
549 ExtensionType::Float => (8, 3),
550 ExtensionType::VarSets => (1, 0),
551 ExtensionType::Mrsets => (1, 0),
552 ExtensionType::ProductInfo => (1, 0),
553 ExtensionType::Display => (4, 0),
554 ExtensionType::LongNames => (1, 0),
555 ExtensionType::LongStrings => (1, 0),
556 ExtensionType::Ncases => (8, 2),
557 ExtensionType::FileAttrs => (1, 0),
558 ExtensionType::VarAttrs => (1, 0),
559 ExtensionType::Mrsets2 => (1, 0),
560 ExtensionType::Encoding => (1, 0),
561 ExtensionType::LongLabels => (1, 0),
562 ExtensionType::LongMissing => (1, 0),
564 /* Ignored record types. */
565 ExtensionType::Date => (0, 0),
566 ExtensionType::DataEntry => (0, 0),
567 ExtensionType::Dataview => (0, 0),
571 fn read_extension_record<R: Read + Seek>(
572 r: &mut BufReader<R>,
574 ) -> Result<Extension, Error> {
575 let subtype = e.parse(read_bytes(r)?);
576 let offset = r.stream_position()?;
577 let size: u32 = e.parse(read_bytes(r)?);
578 let count = e.parse(read_bytes(r)?);
579 let Some(product) = size.checked_mul(count) else {
580 return Err(Error::ExtensionRecordTooLarge {
587 let offset = r.stream_position()?;
588 let data = read_vec(r, product as usize)?;
599 /// File offset to the start of the record.
602 /// File offset to the ZLIB data header.
605 /// File offset to the ZLIB trailer.
606 ztrailer_offset: u64,
608 /// Length of the ZLIB trailer in bytes.
612 fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHeader, Error> {
613 let offset = r.stream_position()?;
614 let zheader_offset: u64 = e.parse(read_bytes(r)?);
615 let ztrailer_offset: u64 = e.parse(read_bytes(r)?);
616 let ztrailer_len: u64 = e.parse(read_bytes(r)?);
618 if zheader_offset != offset {
619 return Err(Error::BadZlibHeaderOffset {
624 if ztrailer_offset < offset {
625 return Err(Error::BadZlibTrailerOffset {
630 if ztrailer_len < 24 || ztrailer_len % 24 != 0 {
631 return Err(Error::BadZlibTrailerLen {
645 fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
646 let mut buf = [0; N];
647 r.read_exact(&mut buf)?;
651 fn read_vec<R: Read>(r: &mut BufReader<R>, n: usize) -> Result<Vec<u8>, IoError> {
652 let mut vec = vec![0; n];
653 r.read_exact(&mut vec)?;
658 fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
659 while s.last() == Some(&c) {
665 fn skip_bytes<R: Read>(r: &mut R, mut n: u64) -> Result<(), IoError> {
666 let mut buf = [0; 1024];
668 let chunk = u64::min(n, buf.len() as u64);
669 r.read_exact(&mut buf[0..chunk as usize])?;