1 #![allow(unused_variables)]
2 use endian::{Endian, Parse, ToBytes};
4 use num_derive::FromPrimitive;
7 io::{BufReader, Error as IoError, Read, Seek},
13 #[derive(Error, Debug)]
15 #[error("Not an SPSS system file")]
18 #[error("Invalid magic number {0:?}")]
21 #[error("I/O error ({0})")]
24 #[error("Invalid SAV compression code {0}")]
25 InvalidSavCompression(u32),
27 #[error("Invalid ZSAV compression code {0}")]
28 InvalidZsavCompression(u32),
30 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
31 BadVariableWidth { offset: u64, width: i32 },
33 #[error("Misplaced type 4 record near offset {0:#x}.")]
34 MisplacedType4Record(u64),
36 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
37 BadDocumentLength { offset: u64, n: u32, max: u32 },
39 #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")]
40 BadRecordType { offset: u64, rec_type: u32 },
42 #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
43 BadVariableLabelCode { offset: u64, code: u32 },
46 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
48 BadNumericMissingValueCode { offset: u64, code: i32 },
50 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
51 BadStringMissingValueCode { offset: u64, code: i32 },
53 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
54 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
56 #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")]
57 MissingVariableIndexRecord { offset: u64 },
59 #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
60 BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
62 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
63 ExtensionRecordTooLarge {
70 #[error("Wrong ZLIB data header offset {zheader_offset:#x} (expected {offset:#x}).")]
71 BadZlibHeaderOffset { offset: u64, zheader_offset: u64 },
73 #[error("At offset {offset:#x}, impossible ZLIB trailer offset {ztrailer_offset:#x}.")]
74 BadZlibTrailerOffset { offset: u64, ztrailer_offset: u64 },
76 #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")]
77 BadZlibTrailerLen { offset: u64, ztrailer_len: u64 },
79 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
87 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
89 EofInCompressedCase { offset: u64, case_ofs: u64 },
91 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
92 PartialCompressedCase { offset: u64, case_ofs: u64 },
94 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
95 CompressedNumberExpected { offset: u64, case_ofs: u64 },
97 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
98 CompressedStringExpected { offset: u64, case_ofs: u64 },
101 #[derive(Error, Debug)]
103 #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")]
106 #[error("Duplicate type 6 (document) record.")]
107 DuplicateDocumentRecord,
110 #[derive(Copy, Clone, Debug)]
111 pub enum Compression {
120 ValueLabel(ValueLabel),
121 VarIndexes(VarIndexes),
122 Extension(Extension),
131 /// Eye-catcher string, product name, in the file's encoding. Padded
132 /// on the right with spaces.
133 pub eye_catcher: [u8; 60],
135 /// Layout code, normally either 2 or 3.
136 pub layout_code: u32,
138 /// Number of variable positions, or `None` if the value in the file is
139 /// questionably trustworthy.
140 pub nominal_case_size: Option<u32>,
142 /// Compression type, if any.
143 pub compression: Option<Compression>,
145 /// 0-based variable index of the weight variable, or `None` if the file is
147 pub weight_index: Option<u32>,
149 /// Claimed number of cases, if known.
150 pub n_cases: Option<u32>,
152 /// Compression bias, usually 100.0.
155 /// `dd mmm yy` in the file's encoding.
156 pub creation_date: [u8; 9],
158 /// `HH:MM:SS` in the file's encoding.
159 pub creation_time: [u8; 8],
161 /// File label, in the file's encoding. Padded on the right with spaces.
162 pub file_label: [u8; 64],
164 /// Endianness of the data in the file header.
165 pub endianness: Endian,
/// A system file magic number: a newtype over 4 raw bytes.
///
/// The recognized values are the associated constants `SAV`, `ZSAV`, and
/// `EBCDIC`; the `TryFrom<[u8; 4]>` implementation rejects every other byte
/// sequence.
168 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
169 pub struct Magic([u8; 4]);
172 /// Magic number for a regular system file.
173 pub const SAV: Magic = Magic(*b"$FL2");
175 /// Magic number for a system file that contains zlib-compressed data.
176 pub const ZSAV: Magic = Magic(*b"$FL3");
178 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
180 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
183 impl TryFrom<[u8; 4]> for Magic {
// Validates 4 raw bytes as a magic number.  Only the `SAV`, `ZSAV`, and
// `EBCDIC` constants are accepted; any other bytes are handed back to the
// caller inside `Error::BadMagic`.
186 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
187 let magic = Magic(value);
189 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
190 _ => Err(Error::BadMagic(value)),
195 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
/// Maps a variable width to a `VarType`: `0` selects `Number`, every other
/// value (negative widths included) selects `String`.
202 fn from_width(width: i32) -> VarType {
204 0 => VarType::Number,
205 _ => VarType::String,
210 pub struct Reader<R: Read> {
212 var_types: Vec<VarType>,
218 Headers(Endian, Option<Compression>),
220 CompressedData(Endian, VecDeque<u8>),
224 #[derive(Copy, Clone)]
/// Interprets 8 raw bytes as a value of type `var_type`.
///
/// String values keep `raw` unchanged.  Otherwise `raw` is decoded as an
/// `f64` using `endian`, and a decoded value equal to `-f64::MAX` becomes
/// `Value::Number(None)`.
// NOTE(review): `-f64::MAX` is presumably the format's system-missing
// sentinel -- confirm against the file format documentation.
231 pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
233 VarType::String => Value::String(raw),
235 let number: f64 = endian.parse(raw);
236 Value::Number((number != -f64::MAX).then_some(number))
242 impl<R: Read + Seek> Reader<R> {
/// Creates a reader over `r`: the stream is wrapped in a `BufReader`, the
/// list of variable types starts empty, and the state machine starts in
/// `ReaderState::Start`.
243 pub fn new(r: R) -> Result<Reader<R>, Error> {
245 r: BufReader::new(r),
246 var_types: Vec::new(),
247 state: ReaderState::Start,
/// Performs one step of the reader state machine and returns the record it
/// produced, or `Ok(None)` once the `End` state has been reached.
///
/// * `Start`: reads the file header and moves to `Headers`, carrying the
///   header's endianness and compression setting.
/// * `Headers`: reads a 4-byte record type and dispatches to the matching
///   record reader.  Each variable record also appends its `VarType` to
///   `self.var_types`, which later determines how many values make up a case.
/// * `Data`: reads one 8-byte value per entry in `self.var_types`.
/// * `CompressedData`: decodes one value per entry in `self.var_types` from
///   a queue of one-byte codes that is refilled 8 bytes at a time.
250 fn _next(&mut self) -> Result<Option<Record>, Error> {
252 ReaderState::Start => {
253 let header = read_header(&mut self.r)?;
254 self.state = ReaderState::Headers(header.endianness, header.compression);
255 Ok(Some(Record::Header(header)))
257 ReaderState::Headers(endian, compression) => {
258 let rec_type: u32 = endian.parse(read_bytes(&mut self.r)?);
259 let record = match rec_type {
// Variable record: remember its type so that cases can be decoded later.
261 let variable = read_variable_record(&mut self.r, endian)?;
262 self.var_types.push(VarType::from_width(variable.width));
263 Record::Variable(variable)
265 3 => Record::ValueLabel(read_value_label_record(&mut self.r, endian)?),
266 4 => Record::VarIndexes(read_var_indexes_record(&mut self.r, endian)?),
267 6 => Record::Document(read_document_record(&mut self.r, endian)?),
268 7 => Record::Extension(read_extension_record(&mut self.r, endian)?),
// End of the header section: 4 more bytes are read and discarded, then the
// data state is chosen from the header's compression setting.
270 let _: [u8; 4] = read_bytes(&mut self.r)?;
271 self.state = match compression {
272 None => ReaderState::Data(endian),
273 Some(Compression::Simple) => {
274 ReaderState::CompressedData(endian, VecDeque::new())
// Any other compression setting goes straight to `End`, so no cases are
// produced for such files.
276 _ => ReaderState::End,
278 return Ok(Some(Record::EndOfHeaders));
// NOTE(review): the position is queried after the 4-byte record type has
// already been consumed, so `offset` points just past the type field rather
// than at it.
281 return Err(Error::BadRecordType {
282 offset: self.r.stream_position()?,
289 ReaderState::Data(endian) => {
290 let case_start = self.r.stream_position()?;
291 let mut values = Vec::with_capacity(self.var_types.len());
292 for (i, &var_type) in self.var_types.iter().enumerate() {
293 let Some(raw) = try_read_bytes(&mut self.r)? else {
// Input ended inside a case: report how far into the case the reader got.
// The full case length is 8 bytes per variable.
297 let offset = self.r.stream_position()?;
298 return Err(Error::EofInCase { offset, case_ofs: offset - case_start, case_len: self.var_types.len() * 8});
301 values.push(Value::from_raw(var_type, raw, endian));
303 Ok(Some(Record::Case(values)))
305 ReaderState::CompressedData(endian, ref mut codes) => {
306 let case_start = self.r.stream_position()?;
307 let mut values = Vec::with_capacity(self.var_types.len());
// NOTE(review): the bias is hard-coded here, although `read_header` parses a
// bias from the file; the parsed value is not carried in the reader state.
308 let bias = 100.0; // XXX
309 for (i, &var_type) in self.var_types.iter().enumerate() {
// Take the next code byte; when the queue is empty, refill it with the next
// 8 bytes of input first.
311 let Some(code) = codes.pop_front() else {
312 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.r)? else {
316 let offset = self.r.stream_position()?;
317 return Err(Error::EofInCompressedCase { offset, case_ofs: offset - case_start});
320 codes.extend(new_codes.into_iter());
// Codes 1..=251 encode the number `code - bias` directly.
325 1..=251 => match var_type {
326 VarType::Number => break Value::Number(Some(code as f64 - bias)),
// NOTE(review): in this branch the same number is stored as the 8 raw bytes
// of a string value -- confirm that this is the intended handling.
328 break Value::String(endian.to_bytes(code as f64 - bias))
335 let offset = self.r.stream_position()?;
336 return Err(Error::PartialCompressedCase {
338 case_ofs: offset - case_start,
// Code 253: the value follows in the input as 8 uncompressed bytes.
342 253 => break Value::from_raw(
344 read_bytes(&mut self.r)?,
// NOTE(review): for code 254 on a non-string variable this returns
// `CompressedStringExpected`, whose message says a number was found where a
// string was expected; code 255 below mirrors this with
// `CompressedNumberExpected`.  Confirm that the variant/message pairing is
// the intended one.
347 254 => match var_type {
348 VarType::String => break Value::String(*b" "), // XXX EBCDIC
350 return Err(Error::CompressedStringExpected {
352 case_ofs: self.r.stream_position()? - case_start,
// Code 255 on a numeric variable yields a number with no value (`None`).
356 255 => match var_type {
357 VarType::Number => break Value::Number(None),
359 return Err(Error::CompressedNumberExpected {
361 case_ofs: self.r.stream_position()? - case_start,})
368 Ok(Some(Record::Case(values)))
370 ReaderState::End => Ok(None),
375 impl<R: Read + Seek> Iterator for Reader<R> {
376 type Item = Result<Record, Error>;
378 fn next(&mut self) -> Option<Self::Item> {
379 let retval = self._next();
382 self.state = ReaderState::End;
385 Ok(Some(record)) => {
389 self.state = ReaderState::End;
/// Reads and validates the fixed-layout file header from `r`.
///
/// Fields are consumed in file order: magic, eye-catcher, layout code,
/// nominal case size, compression code, weight index, case count, bias,
/// creation date, creation time, file label, and 3 trailing bytes that are
/// discarded.  The byte order of the file is inferred from the layout code
/// and then used to decode every following numeric field.
///
/// # Errors
///
/// Returns `Error::NotASystemFile` when the magic number is not recognized
/// or the byte order cannot be identified, an invalid-compression error for
/// an unsupported compression code, and I/O errors from the reads.
396 fn read_header<R: Read>(r: &mut R) -> Result<Header, Error> {
397 let magic: [u8; 4] = read_bytes(r)?;
// The conversion's own error is dropped here in favor of `NotASystemFile`.
398 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
400 let eye_catcher: [u8; 60] = read_bytes(r)?;
401 let layout_code: [u8; 4] = read_bytes(r)?;
// NOTE(review): the fallback repeats the first call with identical arguments
// (expected value 2 both times), so it cannot succeed where the first call
// failed.  The `layout_code` field is documented as "normally either 2 or
// 3", so the second call was probably meant to test 3 -- confirm and fix.
402 let endianness = Endian::identify_u32(2, layout_code)
403 .or_else(|| Endian::identify_u32(2, layout_code))
404 .ok_or_else(|| Error::NotASystemFile)?;
405 let layout_code = endianness.parse(layout_code);
// Case sizes above `i32::MAX / 16` are treated as untrustworthy (`None`).
407 let nominal_case_size: u32 = endianness.parse(read_bytes(r)?);
408 let nominal_case_size =
409 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
// The set of valid compression codes depends on the magic number.
411 let compression_code: u32 = endianness.parse(read_bytes(r)?);
412 let compression = match (magic, compression_code) {
413 (Magic::ZSAV, 2) => Some(Compression::ZLib),
414 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
416 (_, 1) => Some(Compression::Simple),
417 (_, code) => return Err(Error::InvalidSavCompression(code)),
// The file stores a 1-based index, with 0 meaning "no weight variable".
// NOTE(review): `then_some` evaluates its argument eagerly, so
// `weight_index - 1` is computed even when `weight_index` is 0.  That
// underflows a `u32`: a panic in builds with overflow checks, a discarded
// wrapped value otherwise.  `.then(|| weight_index - 1)` or
// `weight_index.checked_sub(1)` avoids it.
420 let weight_index: u32 = endianness.parse(read_bytes(r)?);
421 let weight_index = (weight_index > 0).then_some(weight_index - 1);
// Case counts of `i32::MAX / 2` or more are treated as unknown (`None`).
423 let n_cases: u32 = endianness.parse(read_bytes(r)?);
424 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
426 let bias: f64 = endianness.parse(read_bytes(r)?);
428 let creation_date: [u8; 9] = read_bytes(r)?;
429 let creation_time: [u8; 8] = read_bytes(r)?;
430 let file_label: [u8; 64] = read_bytes(r)?;
// Three bytes that follow the file label are read and ignored.
431 let _: [u8; 3] = read_bytes(r)?;
449 pub struct Variable {
450 /// Offset from the start of the file to the start of the record.
453 /// Variable width, in the range -1..=255.
456 /// Variable name, padded on the right with spaces.
460 pub print_format: u32,
463 pub write_format: u32,
465 /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
466 pub missing_value_code: i32,
468 /// Raw missing values, up to 3 of them.
469 pub missing: Vec<[u8; 8]>,
471 /// Optional variable label.
472 pub label: Option<Vec<u8>>,
/// Reads the body of a variable record from `r`, decoding integers with `e`.
///
/// Fields are consumed in order: width, variable-label flag, missing-value
/// code, print format, write format, and the 8-byte name; then the optional
/// label and the missing values.
///
/// # Errors
///
/// Returns `BadVariableLabelCode`, `BadNumericMissingValueCode`, or
/// `BadStringMissingValueCode` for invalid field values, and I/O errors from
/// the reads.
475 fn read_variable_record<R: Read + Seek>(
476 r: &mut BufReader<R>,
478 ) -> Result<Variable, Error> {
// NOTE(review): the caller in `_next` has already consumed the 4-byte record
// type, so this position is that of the first body field, not of the record
// type -- confirm this is what "start of the record" should mean.
479 let offset = r.stream_position()?;
480 let width: i32 = e.parse(read_bytes(r)?);
481 let has_variable_label: u32 = e.parse(read_bytes(r)?);
482 let missing_value_code: i32 = e.parse(read_bytes(r)?);
483 let print_format: u32 = e.parse(read_bytes(r)?);
484 let write_format: u32 = e.parse(read_bytes(r)?);
485 let name: [u8; 8] = read_bytes(r)?;
487 let label = match has_variable_label {
// The label is stored as a 32-bit length followed by the text, padded to a
// multiple of 4 bytes.  At most 65535 bytes of text are read.
490 let len: u32 = e.parse(read_bytes(r)?);
491 let read_len = len.min(65535) as usize;
492 let label = Some(read_vec(r, read_len)?);
// NOTE(review): the padding is computed from the uncapped `len`.  When
// `len` exceeds 65535, the `len - 65535` bytes of text that were not read
// are not skipped by these two lines, which would leave the stream
// misaligned -- confirm.
494 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
495 let _ = read_vec(r, padding_bytes as usize)?;
500 return Err(Error::BadVariableLabelCode {
502 code: has_variable_label,
507 let mut missing = Vec::new();
508 if missing_value_code != 0 {
// A width of 0 (numeric variable) permits the codes -3, -2, 1, 2, and 3.
509 match (width, missing_value_code) {
510 (0, -3 | -2 | 1 | 2 | 3) => (),
512 return Err(Error::BadNumericMissingValueCode {
514 code: missing_value_code,
519 return Err(Error::BadStringMissingValueCode {
521 code: missing_value_code,
// The absolute value of the code is the number of 8-byte raw values stored.
526 for _ in 0..missing_value_code.abs() {
527 missing.push(read_bytes(r)?);
543 pub struct ValueLabel {
544 /// Offset from the start of the file to the start of the record.
548 pub labels: Vec<([u8; 8], Vec<u8>)>,
552 /// Maximum number of value labels in a record.
553 pub const MAX: u32 = u32::MAX / 8;
/// Reads a value label record from `r`, decoding integers with `e`.
///
/// The record holds a 32-bit count followed by that many entries, each made
/// of an 8-byte raw value, a one-byte label length, and the label text.
///
/// # Errors
///
/// Returns `BadNumberOfValueLabels` when the count exceeds
/// `ValueLabel::MAX`, and I/O errors from the reads.
556 fn read_value_label_record<R: Read + Seek>(
557 r: &mut BufReader<R>,
559 ) -> Result<ValueLabel, Error> {
560 let offset = r.stream_position()?;
561 let n: u32 = e.parse(read_bytes(r)?);
562 if n > ValueLabel::MAX {
563 return Err(Error::BadNumberOfValueLabels {
566 max: ValueLabel::MAX,
// The vector starts empty and grows as entries are read, so a bogus count
// cannot force a large allocation up front.
570 let mut labels = Vec::new();
572 let value: [u8; 8] = read_bytes(r)?;
573 let label_len: u8 = e.parse(read_bytes(r)?);
574 let label_len = label_len as usize;
// NOTE(review): `label_len + 1` suggests that the length byte and the text
// together are padded to a multiple of 8 bytes.  The length byte has already
// been consumed above, yet the full `padded_len` is read after it.  If the
// padding does include the length byte, this reads one byte too many per
// label and `padded_len - 1` is the intended read size -- confirm against
// the file format documentation.
575 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
577 let mut label = read_vec(r, padded_len)?;
// Drop the padding, keeping only the label text.
578 label.truncate(label_len);
579 labels.push((value, label));
581 Ok(ValueLabel { offset, labels })
584 pub struct VarIndexes {
585 /// Offset from the start of the file to the start of the record.
588 /// The 0-based indexes of the variable indexes.
589 pub var_indexes: Vec<u32>,
593 /// Maximum number of variable indexes in a record.
594 pub const MAX: u32 = u32::MAX / 8;
/// Reads a variable index record from `r`, decoding integers with `e`.
///
/// The record holds a 32-bit count followed by the indexes themselves.
///
/// # Errors
///
/// Returns `BadNumberOfVarIndexes` when the count exceeds `VarIndexes::MAX`,
/// and I/O errors from the reads.
597 fn read_var_indexes_record<R: Read + Seek>(
598 r: &mut BufReader<R>,
600 ) -> Result<VarIndexes, Error> {
601 let offset = r.stream_position()?;
602 let n: u32 = e.parse(read_bytes(r)?);
603 if n > VarIndexes::MAX {
604 return Err(Error::BadNumberOfVarIndexes {
607 max: VarIndexes::MAX,
// NOTE(review): the capacity is reserved before any index has been read.
// With `n` allowed up to `u32::MAX / 8` and 4-byte elements, a corrupt or
// hostile count can request roughly 2 GiB.  `read_value_label_record` grows
// its vector incrementally instead; consider doing the same or capping the
// reservation.
610 let mut var_indexes = Vec::with_capacity(n as usize);
612 var_indexes.push(e.parse(read_bytes(r)?));
621 pub const DOC_LINE_LEN: u32 = 80;
622 pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
624 pub struct Document {
625 /// Offset from the start of the file to the start of the record.
628 /// The document, as an array of 80-byte lines.
629 pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
/// Reads a document record from `r`, decoding integers with `e`.
///
/// The record holds a 32-bit line count followed by that many 80-byte lines.
///
/// # Errors
///
/// Returns `BadDocumentLength` when the line count exceeds `DOC_MAX_LINES`,
/// and I/O errors from the reads.
632 fn read_document_record<R: Read + Seek>(
633 r: &mut BufReader<R>,
635 ) -> Result<Document, Error> {
636 let offset = r.stream_position()?;
637 let n: u32 = e.parse(read_bytes(r)?);
639 0..=DOC_MAX_LINES => {
// `pos` is taken after the line count has been consumed, so the position
// stored in the returned `Document` is that of the first line.
640 let pos = r.stream_position()?;
// NOTE(review): the capacity is reserved before any line has been read.  At
// 80 bytes per line and `n` allowed up to `DOC_MAX_LINES`, a corrupt count
// can request close to 2 GiB; consider growing the vector incrementally.
641 let mut lines = Vec::with_capacity(n as usize);
// NOTE(review): the literal 80 duplicates `DOC_LINE_LEN`.
643 let line: [u8; 80] = read_bytes(r)?;
646 Ok(Document { pos, lines })
648 _ => Err(Error::BadDocumentLength {
656 #[derive(FromPrimitive)]
658 /// Machine integer info.
660 /// Machine floating-point info.
666 /// Multiple response sets.
670 /// Extra product info text.
672 /// Variable display parameters.
674 /// Long variable names.
678 /// Extended number of cases.
680 /// Data file attributes.
682 /// Variable attributes.
684 /// Multiple response sets (extended).
686 /// Character encoding.
688 /// Value labels for long strings.
690 /// Missing values for long strings.
692 /// "Format properties in dataview table".
696 pub struct Extension {
697 /// Offset from the start of the file to the start of the record.
703 /// Size of each data element.
706 /// Number of data elements.
709 /// `size * count` bytes of data.
/// Returns a pair of `u32` requirements for the given extension record type.
// NOTE(review): judging by the function name and the `size`/`count` fields
// of `Extension`, the pair is presumably (element size, element count) with
// 0 meaning "no constraint" -- confirm at the call site.
713 fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
715 /* Implemented record types. */
716 ExtensionType::Integer => (4, 8),
717 ExtensionType::Float => (8, 3),
718 ExtensionType::VarSets => (1, 0),
719 ExtensionType::Mrsets => (1, 0),
720 ExtensionType::ProductInfo => (1, 0),
721 ExtensionType::Display => (4, 0),
722 ExtensionType::LongNames => (1, 0),
723 ExtensionType::LongStrings => (1, 0),
724 ExtensionType::Ncases => (8, 2),
725 ExtensionType::FileAttrs => (1, 0),
726 ExtensionType::VarAttrs => (1, 0),
727 ExtensionType::Mrsets2 => (1, 0),
728 ExtensionType::Encoding => (1, 0),
729 ExtensionType::LongLabels => (1, 0),
730 ExtensionType::LongMissing => (1, 0),
732 /* Ignored record types. */
733 ExtensionType::Date => (0, 0),
734 ExtensionType::DataEntry => (0, 0),
735 ExtensionType::Dataview => (0, 0),
/// Reads an extension record from `r`, decoding integers with `e`.
///
/// The record holds a subtype, an element size, an element count, and then
/// `size * count` bytes of data.
///
/// # Errors
///
/// Returns `ExtensionRecordTooLarge` when `size * count` overflows a `u32`,
/// and I/O errors from the reads.
739 fn read_extension_record<R: Read + Seek>(
740 r: &mut BufReader<R>,
742 ) -> Result<Extension, Error> {
743 let subtype = e.parse(read_bytes(r)?);
// NOTE(review): unlike the other record readers, the position is captured
// after the first field (`subtype`) has been read, and `offset` is shadowed
// again below once `size` and `count` have been consumed.  Confirm which
// position each later use of `offset` is meant to carry.
744 let offset = r.stream_position()?;
745 let size: u32 = e.parse(read_bytes(r)?);
746 let count = e.parse(read_bytes(r)?);
// Guard against `size * count` overflowing 32 bits.
747 let Some(product) = size.checked_mul(count) else {
748 return Err(Error::ExtensionRecordTooLarge {
755 let offset = r.stream_position()?;
// NOTE(review): `read_vec` allocates and zero-fills the whole buffer before
// reading, so any `product` that fits in a `u32` (up to about 4 GiB) is
// allocated on the strength of two header fields alone.
756 let data = read_vec(r, product as usize)?;
767 /// File offset to the start of the record.
770 /// File offset to the ZLIB data header.
773 /// File offset to the ZLIB trailer.
774 ztrailer_offset: u64,
776 /// Length of the ZLIB trailer in bytes.
/// Reads the ZLIB data header from `r`: records the current stream position,
/// then decodes three consecutive 64-bit fields with `e` (header offset,
/// trailer offset, and trailer length).
780 fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHeader, Error> {
781 let offset = r.stream_position()?;
782 let zheader_offset: u64 = e.parse(read_bytes(r)?);
783 let ztrailer_offset: u64 = e.parse(read_bytes(r)?);
784 let ztrailer_len: u64 = e.parse(read_bytes(r)?);
/// Attempts to read `N` bytes from `r`.
///
/// A single `read` call is issued first; `read_exact` then fills whatever
/// part of the buffer that call left unfilled.
// NOTE(review): presumably a first read that returns 0 bytes is reported as
// `Ok(None)` (clean end of input), while running out of input after a
// partial read surfaces as an I/O error from `read_exact` -- confirm.
794 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
795 let mut buf = [0; N];
796 let n = r.read(&mut buf)?;
799 r.read_exact(&mut buf[n..])?;
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// # Errors
///
/// Propagates the error from `read_exact`, including the one raised when
/// fewer than `N` bytes remain.
807 fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
808 let mut buf = [0; N];
809 r.read_exact(&mut buf)?;
/// Reads exactly `n` bytes from `r` into a freshly allocated vector.
///
/// The vector is allocated and zero-filled at its full length before any
/// byte is read, so `n` should be validated by the caller.
///
/// # Errors
///
/// Propagates the error from `read_exact`, including the one raised when
/// fewer than `n` bytes remain.
813 fn read_vec<R: Read>(r: &mut BufReader<R>, n: usize) -> Result<Vec<u8>, IoError> {
814 let mut vec = vec![0; n];
815 r.read_exact(&mut vec)?;
820 fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
821 while s.last() == Some(&c) {
/// Discards bytes from `r` by reading them into a 1024-byte scratch buffer,
/// at most one buffer's worth per `read_exact` call.
// NOTE(review): presumably this loops until `n` bytes have been consumed --
// confirm.
827 fn skip_bytes<R: Read>(r: &mut R, mut n: u64) -> Result<(), IoError> {
828 let mut buf = [0; 1024];
830 let chunk = u64::min(n, buf.len() as u64);
831 r.read_exact(&mut buf[0..chunk as usize])?;