1 #![allow(unused_variables)]
2 use endian::{Endian, Parse};
4 use num_derive::FromPrimitive;
5 use std::io::{BufReader, Error as IoError, Read, Seek};
/// Errors detected while reading a system file. Most variants carry the file
/// offset involved, which their `#[error]` message prints in hexadecimal.
10 #[derive(Error, Debug)]
12 #[error("Not an SPSS system file")]
15 #[error("Invalid magic number {0:?}")]
18 #[error("I/O error ({source})")]
24 #[error("Invalid SAV compression code {0}")]
25 InvalidSavCompression(u32),
27 #[error("Invalid ZSAV compression code {0}")]
28 InvalidZsavCompression(u32),
30 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
31 BadVariableWidth { offset: u64, width: i32 },
33 #[error("Misplaced type 4 record near offset {0:#x}.")]
34 MisplacedType4Record(u64),
36 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
37 BadDocumentLength { offset: u64, n: u32, max: u32 },
39 #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")]
40 BadRecordType { offset: u64, rec_type: u32 },
42 #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
43 BadVariableLabelCode { offset: u64, code: u32 },
46 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
48 BadNumericMissingValueCode { offset: u64, code: i32 },
50 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
51 BadStringMissingValueCode { offset: u64, code: i32 },
53 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
54 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
56 #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")]
57 MissingVariableIndexRecord { offset: u64 },
59 #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
60 BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
62 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
// Returned by `read_extension_record` when `size * count` overflows `u32`.
63 ExtensionRecordTooLarge {
70 #[error("Wrong ZLIB data header offset {zheader_offset:#x} (expected {offset:#x}).")]
71 BadZlibHeaderOffset { offset: u64, zheader_offset: u64 },
73 #[error("At offset {offset:#x}, impossible ZLIB trailer offset {ztrailer_offset:#x}.")]
74 BadZlibTrailerOffset { offset: u64, ztrailer_offset: u64 },
76 #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")]
77 BadZlibTrailerLen { offset: u64, ztrailer_len: u64 },
// A second `Error`-derived enum. Its visible messages cover an unexpected
// floating-point bias and a duplicate type 6 (document) record.
// NOTE(review): the enum header is not visible here; these look like
// non-fatal diagnostics rather than hard errors — confirm.
80 #[derive(Error, Debug)]
82 #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")]
85 #[error("Duplicate type 6 (document) record.")]
86 DuplicateDocumentRecord,
/// Compression scheme declared in the file header. `read_header` produces
/// `Compression::Simple` for code 1 and `Compression::ZLib` for a ZSAV file
/// with code 2.
89 #[derive(Copy, Clone, Debug)]
90 pub enum Compression {
// Record variants wrapping the payloads built by `read_value_label_record`
// (type 3), `read_var_indexes_record` (type 4) and `read_extension_record`
// (type 7).
99 ValueLabel(ValueLabel),
100 VarIndexes(VarIndexes),
101 Extension(Extension),
109 /// Eye-catcher string, product name, in the file's encoding. Padded
110 /// on the right with spaces.
111 pub eye_catcher: [u8; 60],
113 /// Layout code, normally either 2 or 3.
114 pub layout_code: u32,
116 /// Number of variable positions, or `None` if the value in the file is
117 /// questionably trustworthy.
// `read_header` keeps the value only when it is at most `i32::MAX / 16`.
118 pub nominal_case_size: Option<u32>,
120 /// Compression type, if any.
121 pub compression: Option<Compression>,
123 /// 0-based variable index of the weight variable, or `None` if the file is
125 pub weight_index: Option<u32>,
127 /// Claimed number of cases, if known.
// `read_header` keeps the value only when it is below `i32::MAX / 2`.
128 pub n_cases: Option<u32>,
130 /// Compression bias, usually 100.0.
133 /// `dd mmm yy` in the file's encoding.
134 pub creation_date: [u8; 9],
136 /// `HH:MM:SS` in the file's encoding.
137 pub creation_time: [u8; 8],
139 /// File label, in the file's encoding. Padded on the right with spaces.
140 pub file_label: [u8; 64],
142 /// Endianness of the data in the file header.
// Inferred by `read_header` from the byte order of the layout code.
143 pub endianness: Endian,
/// Four-byte magic number; `read_header` reads it as the first field of the
/// file.
146 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
147 pub struct Magic([u8; 4]);
150 /// Magic number for a regular system file.
151 pub const SAV: Magic = Magic(*b"$FL2");
153 /// Magic number for a system file that contains zlib-compressed data.
154 pub const ZSAV: Magic = Magic(*b"$FL3");
156 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
158 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
/// Converts four raw bytes into a `Magic`, accepting only the three known
/// values (`SAV`, `ZSAV`, `EBCDIC`).
161 impl TryFrom<[u8; 4]> for Magic {
164 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
165 let magic = Magic(value);
167 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
// Anything else is rejected, reporting the raw bytes.
168 _ => Err(Error::BadMagic(value)),
/// Classifies a variable by its width. Of the visible arms, 0 maps to
/// `VarType::Number` and the catch-all maps to `VarType::String`.
// NOTE(review): the `match` scrutinee line is not visible here; presumably
// it matches on `width` directly — confirm.
179 fn from_width(width: i32) -> VarType {
181 0 => VarType::Number,
182 _ => VarType::String,
/// Record-by-record reader over a system file; it is also an `Iterator` of
/// `Result<Record, Error>`.
187 pub struct Reader<R: Read> {
// One entry per variable record seen so far, pushed by `_next`.
189 var_types: Vec<VarType>,
// State while reading header records: the file's endianness plus the
// compression declared in the file header.
195 Headers(Endian, Option<Compression>),
// Construction plus the state machine that yields one record per step.
200 impl<R: Read + Seek> Reader<R> {
/// Wraps `r` in a `BufReader` and starts in `ReaderState::Start` with no
/// variable types recorded. Nothing is read until iteration begins.
201 pub fn new(r: R) -> Result<Reader<R>, Error> {
203 r: BufReader::new(r),
204 var_types: Vec::new(),
205 state: ReaderState::Start,
/// Reads the next record and returns it together with the state to enter
/// afterwards, or `Ok(None)` once the `End` state has been reached.
208 fn _next(&mut self) -> Result<Option<(Record, ReaderState)>, Error> {
// First step: parse the file header, which fixes endianness and compression
// for every later record.
210 ReaderState::Start => {
211 let header = read_header(&mut self.r)?;
212 let next_state = ReaderState::Headers(header.endianness, header.compression);
213 Ok(Some((Record::Header(header), next_state)))
215 ReaderState::Headers(endian, compression) => {
// Every header record begins with a 32-bit record type.
216 let rec_type: u32 = endian.parse(read_bytes(&mut self.r)?);
217 let record = match rec_type {
219 let variable = read_variable_record(&mut self.r, endian)?;
// Remember the variable's type, derived from its width.
220 self.var_types.push(VarType::from_width(variable.width));
221 Record::Variable(variable)
223 3 => Record::ValueLabel(read_value_label_record(&mut self.r, endian)?),
224 4 => Record::VarIndexes(read_var_indexes_record(&mut self.r, endian)?),
225 6 => Record::Document(read_document_record(&mut self.r, endian)?),
226 7 => Record::Extension(read_extension_record(&mut self.r, endian)?),
// End-of-headers: discard 4 bytes, then pick the next state. Only files
// with no compression continue into `Data`; any compression goes to `End`.
228 let _: [u8; 4] = read_bytes(&mut self.r)?;
229 let next_state = match compression {
230 None => ReaderState::Data(endian),
231 _ => ReaderState::End,
233 return Ok(Some((Record::EndOfHeaders, next_state)));
// Unknown record type: report it with the current stream position, which is
// just past the record-type field.
236 return Err(Error::BadRecordType {
237 offset: self.r.stream_position()?,
// Ordinary header records keep the reader in the `Headers` state.
242 Ok(Some((record, ReaderState::Headers(endian, compression))))
244 ReaderState::End => Ok(None),
/// Iterator adapter over `_next`: each call advances the state machine by
/// one record.
249 impl<R: Read + Seek> Iterator for Reader<R> {
250 type Item = Result<Record, Error>;
252 fn next(&mut self) -> Option<Self::Item> {
253 let retval = self._next();
// NOTE(review): the match arms guarding the two `End` assignments are not
// visible here; presumably they are the error and exhausted cases, so the
// iterator stops after either — confirm.
256 self.state = ReaderState::End;
259 Ok(Some((record, next_state))) => {
// Adopt the state chosen by `_next` before handing out the record.
260 self.state = next_state;
264 self.state = ReaderState::End;
271 fn read_header<R: Read>(r: &mut R) -> Result<Header, Error> {
272 let magic: [u8; 4] = read_bytes(r)?;
273 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
275 let eye_catcher: [u8; 60] = read_bytes(r)?;
276 let layout_code: [u8; 4] = read_bytes(r)?;
277 let endianness = Endian::identify_u32(2, layout_code)
278 .or_else(|| Endian::identify_u32(2, layout_code))
279 .ok_or_else(|| Error::NotASystemFile)?;
280 let layout_code = endianness.parse(layout_code);
282 let nominal_case_size: u32 = endianness.parse(read_bytes(r)?);
283 let nominal_case_size =
284 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
286 let compression_code: u32 = endianness.parse(read_bytes(r)?);
287 let compression = match (magic, compression_code) {
288 (Magic::ZSAV, 2) => Some(Compression::ZLib),
289 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
291 (_, 1) => Some(Compression::Simple),
292 (_, code) => return Err(Error::InvalidSavCompression(code)),
295 let weight_index: u32 = endianness.parse(read_bytes(r)?);
296 let weight_index = (weight_index > 0).then_some(weight_index - 1);
298 let n_cases: u32 = endianness.parse(read_bytes(r)?);
299 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
301 let bias: f64 = endianness.parse(read_bytes(r)?);
303 let creation_date: [u8; 9] = read_bytes(r)?;
304 let creation_time: [u8; 8] = read_bytes(r)?;
305 let file_label: [u8; 64] = read_bytes(r)?;
306 let _: [u8; 3] = read_bytes(r)?;
/// A variable record as parsed by `read_variable_record`.
324 pub struct Variable {
325 /// Offset from the start of the file to the start of the record.
328 /// Variable width, in the range -1..=255.
331 /// Variable name, padded on the right with spaces.
335 pub print_format: u32,
338 pub write_format: u32,
340 /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
341 pub missing_value_code: i32,
343 /// Raw missing values, up to 3 of them.
// `read_variable_record` reads `missing_value_code.abs()` 8-byte entries.
344 pub missing: Vec<[u8; 8]>,
346 /// Optional variable label.
// Capped at 65535 bytes by `read_variable_record`.
347 pub label: Option<Vec<u8>>,
350 fn read_variable_record<R: Read + Seek>(
351 r: &mut BufReader<R>,
353 ) -> Result<Variable, Error> {
354 let offset = r.stream_position()?;
355 let width: i32 = e.parse(read_bytes(r)?);
356 let has_variable_label: u32 = e.parse(read_bytes(r)?);
357 let missing_value_code: i32 = e.parse(read_bytes(r)?);
358 let print_format: u32 = e.parse(read_bytes(r)?);
359 let write_format: u32 = e.parse(read_bytes(r)?);
360 let name: [u8; 8] = read_bytes(r)?;
362 let label = match has_variable_label {
365 let len: u32 = e.parse(read_bytes(r)?);
366 let read_len = len.min(65535) as usize;
367 let label = Some(read_vec(r, read_len)?);
369 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
370 let _ = read_vec(r, padding_bytes as usize)?;
375 return Err(Error::BadVariableLabelCode {
377 code: has_variable_label,
382 let mut missing = Vec::new();
383 if missing_value_code != 0 {
384 match (width, missing_value_code) {
385 (0, -3 | -2 | 1 | 2 | 3) => (),
387 return Err(Error::BadNumericMissingValueCode {
389 code: missing_value_code,
394 return Err(Error::BadStringMissingValueCode {
396 code: missing_value_code,
401 for _ in 0..missing_value_code.abs() {
402 missing.push(read_bytes(r)?);
/// A value label record as parsed by `read_value_label_record`.
418 pub struct ValueLabel {
419 /// Offset from the start of the file to the start of the record.
// Pairs of (raw 8-byte value, label bytes truncated to the declared length).
423 pub labels: Vec<([u8; 8], Vec<u8>)>,
427 /// Maximum number of value labels in a record.
428 pub const MAX: u32 = u32::MAX / 8;
431 fn read_value_label_record<R: Read + Seek>(
432 r: &mut BufReader<R>,
434 ) -> Result<ValueLabel, Error> {
435 let offset = r.stream_position()?;
436 let n: u32 = e.parse(read_bytes(r)?);
437 if n > ValueLabel::MAX {
438 return Err(Error::BadNumberOfValueLabels {
441 max: ValueLabel::MAX,
445 let mut labels = Vec::new();
447 let value: [u8; 8] = read_bytes(r)?;
448 let label_len: u8 = e.parse(read_bytes(r)?);
449 let label_len = label_len as usize;
450 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
452 let mut label = read_vec(r, padded_len)?;
453 label.truncate(label_len);
454 labels.push((value, label));
456 Ok(ValueLabel { offset, labels })
/// A variable index record (type 4) as parsed by `read_var_indexes_record`.
459 pub struct VarIndexes {
460 /// Offset from the start of the file to the start of the record.
463 /// The 0-based indexes of the variable indexes.
// NOTE(review): the reader stores the parsed `u32` values unchanged; whether
// they are 0-based in the file is not established by this code — confirm.
464 pub var_indexes: Vec<u32>,
468 /// Maximum number of variable indexes in a record.
469 pub const MAX: u32 = u32::MAX / 8;
/// Reads one variable index record: a 32-bit count followed by that many
/// 32-bit indexes. A count above `VarIndexes::MAX` is rejected with
/// `Error::BadNumberOfVarIndexes`.
472 fn read_var_indexes_record<R: Read + Seek>(
473 r: &mut BufReader<R>,
475 ) -> Result<VarIndexes, Error> {
476 let offset = r.stream_position()?;
477 let n: u32 = e.parse(read_bytes(r)?);
478 if n > VarIndexes::MAX {
479 return Err(Error::BadNumberOfVarIndexes {
482 max: VarIndexes::MAX,
// NOTE(review): capacity comes straight from the file-supplied count (up to
// `u32::MAX / 8` entries) before any index is read; `read_value_label_record`
// uses `Vec::new()` instead — consider matching it.
485 let mut var_indexes = Vec::with_capacity(n as usize);
487 var_indexes.push(e.parse(read_bytes(r)?));
/// Length in bytes of one document line.
496 pub const DOC_LINE_LEN: u32 = 80;
/// Largest line count `read_document_record` accepts, chosen so that
/// `DOC_MAX_LINES * DOC_LINE_LEN` does not exceed `i32::MAX`.
497 pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
/// A document record (type 6) as parsed by `read_document_record`.
499 pub struct Document {
500 /// Offset from the start of the file to the start of the record.
503 /// The document, as an array of 80-byte lines.
504 pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
/// Reads one document record: a 32-bit line count followed by that many
/// 80-byte lines. A count above `DOC_MAX_LINES` yields
/// `Error::BadDocumentLength`.
507 fn read_document_record<R: Read + Seek>(
508 r: &mut BufReader<R>,
510 ) -> Result<Document, Error> {
// Position before the line count; used for the error report.
511 let offset = r.stream_position()?;
512 let n: u32 = e.parse(read_bytes(r)?);
514 0..=DOC_MAX_LINES => {
// Position after the line count; this is the value stored in the record.
515 let pos = r.stream_position()?;
// NOTE(review): capacity comes straight from the file-supplied count, which
// can reserve up to `DOC_MAX_LINES * 80` bytes before any line is read.
516 let mut lines = Vec::with_capacity(n as usize);
518 let line: [u8; 80] = read_bytes(r)?;
521 Ok(Document { pos, lines })
523 _ => Err(Error::BadDocumentLength {
/// Subtypes of the type 7 (extension) record. `FromPrimitive` is derived so
/// a subtype can be obtained from a raw integer.
531 #[derive(FromPrimitive)]
533 /// Machine integer info.
535 /// Machine floating-point info.
541 /// Multiple response sets.
545 /// Extra product info text.
547 /// Variable display parameters.
549 /// Long variable names.
553 /// Extended number of cases.
555 /// Data file attributes.
557 /// Variable attributes.
559 /// Multiple response sets (extended).
561 /// Character encoding.
563 /// Value labels for long strings.
565 /// Missing values for long strings.
567 /// "Format properties in dataview table".
/// A type 7 (extension) record as parsed by `read_extension_record`.
571 pub struct Extension {
572 /// Offset from the start of the file to the start of the record.
578 /// Size of each data element.
581 /// Number of data elements.
584 /// `size * count` bytes of data.
/// Returns a pair of `u32` requirements for the given extension subtype.
// NOTE(review): presumably the pair is (element size, element count), with 0
// meaning "not checked" — confirm against the caller, which is not visible
// here.
588 fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
590 /* Implemented record types. */
591 ExtensionType::Integer => (4, 8),
592 ExtensionType::Float => (8, 3),
593 ExtensionType::VarSets => (1, 0),
594 ExtensionType::Mrsets => (1, 0),
595 ExtensionType::ProductInfo => (1, 0),
596 ExtensionType::Display => (4, 0),
597 ExtensionType::LongNames => (1, 0),
598 ExtensionType::LongStrings => (1, 0),
599 ExtensionType::Ncases => (8, 2),
600 ExtensionType::FileAttrs => (1, 0),
601 ExtensionType::VarAttrs => (1, 0),
602 ExtensionType::Mrsets2 => (1, 0),
603 ExtensionType::Encoding => (1, 0),
604 ExtensionType::LongLabels => (1, 0),
605 ExtensionType::LongMissing => (1, 0),
607 /* Ignored record types. */
608 ExtensionType::Date => (0, 0),
609 ExtensionType::DataEntry => (0, 0),
610 ExtensionType::Dataview => (0, 0),
/// Reads one extension record: subtype, element size, element count, then
/// `size * count` bytes of data. Fails with `Error::ExtensionRecordTooLarge`
/// when that product does not fit in a `u32`.
614 fn read_extension_record<R: Read + Seek>(
615 r: &mut BufReader<R>,
617 ) -> Result<Extension, Error> {
618 let subtype = e.parse(read_bytes(r)?);
// Position just after the subtype field; this binding is the one in scope
// for the too-large error below.
619 let offset = r.stream_position()?;
620 let size: u32 = e.parse(read_bytes(r)?);
621 let count = e.parse(read_bytes(r)?);
// `checked_mul` catches a byte count that would overflow `u32`.
622 let Some(product) = size.checked_mul(count) else {
623 return Err(Error::ExtensionRecordTooLarge {
// NOTE(review): this rebinding shadows the earlier `offset` and points at
// the start of the data, not the start of the record — confirm which one
// the `Extension::offset` field is meant to hold.
630 let offset = r.stream_position()?;
631 let data = read_vec(r, product as usize)?;
// Fields of the ZLIB data header; `read_zheader` parses the three 64-bit
// values after recording the record's own offset.
642 /// File offset to the start of the record.
645 /// File offset to the ZLIB data header.
648 /// File offset to the ZLIB trailer.
649 ztrailer_offset: u64,
651 /// Length of the ZLIB trailer in bytes.
/// Reads the ZLIB data header: records the current stream position, then
/// parses three 64-bit fields (header offset, trailer offset, trailer
/// length) using endianness `e`.
655 fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHeader, Error> {
656 let offset = r.stream_position()?;
657 let zheader_offset: u64 = e.parse(read_bytes(r)?);
658 let ztrailer_offset: u64 = e.parse(read_bytes(r)?);
659 let ztrailer_len: u64 = e.parse(read_bytes(r)?);
/// Reads exactly `N` bytes from `r` into a zero-initialized fixed-size
/// array. `read_exact` fails with an I/O error if fewer than `N` bytes are
/// available.
669 fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
670 let mut buf = [0; N];
671 r.read_exact(&mut buf)?;
/// Reads exactly `n` bytes from `r` into a freshly allocated, zero-filled
/// `Vec<u8>`. The buffer is allocated at full size before reading.
675 fn read_vec<R: Read>(r: &mut BufReader<R>, n: usize) -> Result<Vec<u8>, IoError> {
676 let mut vec = vec![0; n];
677 r.read_exact(&mut vec)?;
/// Takes `s` by value and returns a `Vec<u8>`, looping for as long as the
/// last byte of `s` equals `c`.
// NOTE(review): the loop body is not visible here; presumably it pops the
// trailing byte so the result has no trailing `c` — confirm.
682 fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
683 while s.last() == Some(&c) {
689 fn skip_bytes<R: Read>(r: &mut R, mut n: u64) -> Result<(), IoError> {
690 let mut buf = [0; 1024];
692 let chunk = u64::min(n, buf.len() as u64);
693 r.read_exact(&mut buf[0..chunk as usize])?;