1 use crate::endian::{Endian, Parse, ToBytes};
4 use flate2::read::ZlibDecoder;
6 use num_derive::FromPrimitive;
9 io::{Error as IoError, Read, Seek, SeekFrom},
13 #[derive(Copy, Clone, Debug)]
14 pub enum Compression {
23 ValueLabel(ValueLabel),
24 VarIndexes(VarIndexes),
36 /// Eye-catcher string, product name, in the file's encoding. Padded
37 /// on the right with spaces.
38 pub eye_catcher: [u8; 60],
40 /// Layout code, normally either 2 or 3.
43 /// Number of variable positions, or `None` if the value in the file is
44 /// questionably trustworthy.
45 pub nominal_case_size: Option<u32>,
47 /// Compression type, if any.
48 pub compression: Option<Compression>,
50 /// 0-based variable index of the weight variable, or `None` if the file is
52 pub weight_index: Option<u32>,
54 /// Claimed number of cases, if known.
55 pub n_cases: Option<u32>,
57 /// Compression bias, usually 100.0.
60 /// `dd mmm yy` in the file's encoding.
61 pub creation_date: [u8; 9],
63 /// `HH:MM:SS` in the file's encoding.
64 pub creation_time: [u8; 8],
66 /// File label, in the file's encoding. Padded on the right with spaces.
67 pub file_label: [u8; 64],
69 /// Endianness of the data in the file header.
73 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
74 pub struct Magic([u8; 4]);
77 /// Magic number for a regular system file.
78 pub const SAV: Magic = Magic(*b"$FL2");
80 /// Magic number for a system file that contains zlib-compressed data.
81 pub const ZSAV: Magic = Magic(*b"$FL3");
83 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
85 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
88 impl TryFrom<[u8; 4]> for Magic {
91 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
92 let magic = Magic(value);
94 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
95 _ => Err(Error::BadMagic(value)),
100 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
107 fn from_width(width: i32) -> VarType {
109 0 => VarType::Number,
110 _ => VarType::String,
116 #[allow(clippy::type_complexity)]
117 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
120 struct Start<R: Read + Seek> {
124 struct CommonState<R: Read + Seek> {
128 compression: Option<Compression>,
129 var_types: Vec<VarType>,
132 impl<R: Read + Seek + 'static> State for Start<R> {
133 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
134 let header = read_header(&mut self.reader)?;
135 let next_state = Headers(CommonState {
137 endian: header.endian,
139 compression: header.compression,
140 var_types: Vec::new(),
142 Ok(Some((Record::Header(header), Box::new(next_state))))
146 struct Headers<R: Read + Seek>(CommonState<R>);
148 impl<R: Read + Seek + 'static> State for Headers<R> {
149 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
150 let endian = self.0.endian;
151 let rec_type: u32 = endian.parse(read_bytes(&mut self.0.reader)?);
152 let record = match rec_type {
154 let variable = read_variable_record(&mut self.0.reader, endian)?;
155 self.0.var_types.push(VarType::from_width(variable.width));
156 Record::Variable(variable)
158 3 => Record::ValueLabel(read_value_label_record(&mut self.0.reader, endian)?),
159 4 => Record::VarIndexes(read_var_indexes_record(&mut self.0.reader, endian)?),
160 6 => Record::Document(read_document_record(&mut self.0.reader, endian)?),
161 7 => Record::Extension(read_extension_record(&mut self.0.reader, endian)?),
163 let _: [u8; 4] = read_bytes(&mut self.0.reader)?;
164 let next_state: Box<dyn State> = match self.0.compression {
165 None => Box::new(Data(self.0)),
166 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
167 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
169 return Ok(Some((Record::EndOfHeaders, next_state)));
172 return Err(Error::BadRecordType {
173 offset: self.0.reader.stream_position()?,
178 Ok(Some((record, self)))
182 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
184 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
185 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
186 let zheader = read_zheader(&mut self.0.reader, self.0.endian)?;
187 Ok(Some((Record::ZHeader(zheader), self)))
191 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
193 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
194 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
195 let retval = read_ztrailer(&mut self.0.reader, self.0.endian, self.1.ztrailer_offset, self.1.ztrailer_len)?;
196 let next_state = Box::new(CompressedData::new(CommonState {
197 reader: ZlibDecodeMultiple::new(self.0.reader),
198 endian: self.0.endian,
200 compression: self.0.compression,
201 var_types: self.0.var_types
204 None => next_state.read(),
205 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state)))
210 struct Data<R: Read + Seek>(CommonState<R>);
212 impl<R: Read + Seek + 'static> State for Data<R> {
213 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
214 let case_start = self.0.reader.stream_position()?;
215 let mut values = Vec::with_capacity(self.0.var_types.len());
216 for (i, &var_type) in self.0.var_types.iter().enumerate() {
217 let Some(raw) = try_read_bytes(&mut self.0.reader)? else {
221 let offset = self.0.reader.stream_position()?;
222 return Err(Error::EofInCase {
224 case_ofs: offset - case_start,
225 case_len: self.0.var_types.len() * 8,
229 values.push(Value::from_raw(var_type, raw, self.0.endian));
231 Ok(Some((Record::Case(values), self)))
235 struct CompressedData<R: Read + Seek> {
236 common: CommonState<R>,
240 impl<R: Read + Seek + 'static> CompressedData<R> {
241 fn new(common: CommonState<R>) -> CompressedData<R> {
242 CompressedData { common, codes: VecDeque::new() }
246 impl<R: Read + Seek + 'static> State for CompressedData<R> {
247 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
248 let case_start = self.common.reader.stream_position()?;
249 let mut values = Vec::with_capacity(self.common.var_types.len());
250 for (i, &var_type) in self.common.var_types.iter().enumerate() {
252 let Some(code) = self.codes.pop_front() else {
253 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.common.reader)?
258 let offset = self.common.reader.stream_position()?;
259 return Err(Error::EofInCompressedCase {
261 case_ofs: offset - case_start,
265 self.codes.extend(new_codes.into_iter());
270 1..=251 => match var_type {
271 VarType::Number => break Value::Number(Some(code as f64 - self.common.bias)),
273 break Value::String(self.common.endian.to_bytes(code as f64 - self.common.bias))
280 let offset = self.common.reader.stream_position()?;
281 return Err(Error::PartialCompressedCase {
283 case_ofs: offset - case_start,
288 break Value::from_raw(
290 read_bytes(&mut self.common.reader)?,
294 254 => match var_type {
295 VarType::String => break Value::String([b' '; 8]), // code 254: an 8-byte value of ASCII spaces; XXX EBCDIC
297 return Err(Error::CompressedStringExpected {
299 case_ofs: self.common.reader.stream_position()? - case_start,
303 255 => match var_type {
304 VarType::Number => break Value::Number(None),
306 return Err(Error::CompressedNumberExpected {
308 case_ofs: self.common.reader.stream_position()? - case_start,
316 Ok(Some((Record::Case(values), self)))
320 struct ZlibDecodeMultiple<R>
324 reader: Option<ZlibDecoder<R>>,
327 impl<R> ZlibDecodeMultiple<R>
331 fn new(reader: R) -> ZlibDecodeMultiple<R> {
333 reader: Some(ZlibDecoder::new(reader)),
338 impl<R> Read for ZlibDecodeMultiple<R>
342 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
344 match self.reader.as_mut().unwrap().read(buf)? {
346 let inner = self.reader.take().unwrap().into_inner();
347 self.reader = Some(ZlibDecoder::new(inner));
355 impl<R> Seek for ZlibDecodeMultiple<R>
359 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
360 self.reader.as_mut().unwrap().get_mut().seek(pos)
364 #[derive(Copy, Clone)]
371 pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
373 VarType::String => Value::String(raw),
375 let number: f64 = endian.parse(raw);
376 Value::Number((number != -f64::MAX).then_some(number))
383 state: Option<Box<dyn State>>,
387 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
389 state: Some(Box::new(Start { reader })),
394 impl Iterator for Reader {
395 type Item = Result<Record, Error>;
397 fn next(&mut self) -> Option<Self::Item> {
398 match self.state.take()?.read() {
399 Ok(Some((record, next_state))) => {
400 self.state = Some(next_state);
404 Err(error) => Some(Err(error)),
409 impl FusedIterator for Reader {}
411 fn read_header<R: Read>(r: &mut R) -> Result<Header, Error> {
412 let magic: [u8; 4] = read_bytes(r)?;
413 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
415 let eye_catcher: [u8; 60] = read_bytes(r)?;
416 let layout_code: [u8; 4] = read_bytes(r)?;
417 let endian = Endian::identify_u32(2, layout_code) // try each accepted layout code in turn: 2, then 3
418 .or_else(|| Endian::identify_u32(3, layout_code))
419 .ok_or_else(|| Error::NotASystemFile)?;
420 let layout_code = endian.parse(layout_code);
422 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
423 let nominal_case_size =
424 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
426 let compression_code: u32 = endian.parse(read_bytes(r)?);
427 let compression = match (magic, compression_code) {
428 (Magic::ZSAV, 2) => Some(Compression::ZLib),
429 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
431 (_, 1) => Some(Compression::Simple),
432 (_, code) => return Err(Error::InvalidSavCompression(code)),
435 let weight_index: u32 = endian.parse(read_bytes(r)?);
436 let weight_index = weight_index.checked_sub(1); // 0 maps to `None`; otherwise convert the stored 1-based value to 0-based without underflow
438 let n_cases: u32 = endian.parse(read_bytes(r)?);
439 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
441 let bias: f64 = endian.parse(read_bytes(r)?);
443 let creation_date: [u8; 9] = read_bytes(r)?;
444 let creation_time: [u8; 8] = read_bytes(r)?;
445 let file_label: [u8; 64] = read_bytes(r)?;
446 let _: [u8; 3] = read_bytes(r)?;
464 pub struct Variable {
465 /// Offset from the start of the file to the start of the record.
468 /// Variable width, in the range -1..=255.
471 /// Variable name, padded on the right with spaces.
475 pub print_format: u32,
478 pub write_format: u32,
480 /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
481 pub missing_value_code: i32,
483 /// Raw missing values, up to 3 of them.
484 pub missing: Vec<[u8; 8]>,
486 /// Optional variable label.
487 pub label: Option<Vec<u8>>,
490 fn read_variable_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
491 let offset = r.stream_position()?;
492 let width: i32 = endian.parse(read_bytes(r)?);
493 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
494 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
495 let print_format: u32 = endian.parse(read_bytes(r)?);
496 let write_format: u32 = endian.parse(read_bytes(r)?);
497 let name: [u8; 8] = read_bytes(r)?;
499 let label = match has_variable_label {
502 let len: u32 = endian.parse(read_bytes(r)?);
503 let read_len = len.min(65535) as usize;
504 let label = Some(read_vec(r, read_len)?);
506 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
507 let _ = read_vec(r, padding_bytes as usize)?;
512 return Err(Error::BadVariableLabelCode {
514 code: has_variable_label,
519 let mut missing = Vec::new();
520 if missing_value_code != 0 {
521 match (width, missing_value_code) {
522 (0, -3 | -2 | 1 | 2 | 3) => (),
524 return Err(Error::BadNumericMissingValueCode {
526 code: missing_value_code,
531 return Err(Error::BadStringMissingValueCode {
533 code: missing_value_code,
538 for _ in 0..missing_value_code.abs() {
539 missing.push(read_bytes(r)?);
555 pub struct ValueLabel {
556 /// Offset from the start of the file to the start of the record.
560 pub labels: Vec<([u8; 8], Vec<u8>)>,
564 /// Maximum number of value labels in a record.
565 pub const MAX: u32 = u32::MAX / 8;
568 fn read_value_label_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
569 let offset = r.stream_position()?;
570 let n: u32 = endian.parse(read_bytes(r)?);
571 if n > ValueLabel::MAX {
572 return Err(Error::BadNumberOfValueLabels {
575 max: ValueLabel::MAX,
579 let mut labels = Vec::new();
581 let value: [u8; 8] = read_bytes(r)?;
582 let label_len: u8 = endian.parse(read_bytes(r)?);
583 let label_len = label_len as usize;
584 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
586 let mut label = read_vec(r, padded_len)?;
587 label.truncate(label_len);
588 labels.push((value, label));
590 Ok(ValueLabel { offset, labels })
593 pub struct VarIndexes {
594 /// Offset from the start of the file to the start of the record.
597 /// The 0-based variable indexes.
598 pub var_indexes: Vec<u32>,
602 /// Maximum number of variable indexes in a record.
603 pub const MAX: u32 = u32::MAX / 8;
606 fn read_var_indexes_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
607 let offset = r.stream_position()?;
608 let n: u32 = endian.parse(read_bytes(r)?);
609 if n > VarIndexes::MAX {
610 return Err(Error::BadNumberOfVarIndexes {
613 max: VarIndexes::MAX,
616 let mut var_indexes = Vec::with_capacity(n as usize);
618 var_indexes.push(endian.parse(read_bytes(r)?));
627 pub const DOC_LINE_LEN: u32 = 80;
628 pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
630 pub struct Document {
631 /// Offset from the start of the file to the start of the record.
634 /// The document, as an array of 80-byte lines.
635 pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
638 fn read_document_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
639 let offset = r.stream_position()?;
640 let n: u32 = endian.parse(read_bytes(r)?);
642 0..=DOC_MAX_LINES => {
643 let pos = r.stream_position()?;
644 let mut lines = Vec::with_capacity(n as usize);
646 let line: [u8; 80] = read_bytes(r)?;
649 Ok(Document { pos, lines })
651 _ => Err(Error::BadDocumentLength {
659 #[derive(FromPrimitive)]
661 /// Machine integer info.
663 /// Machine floating-point info.
669 /// Multiple response sets.
673 /// Extra product info text.
675 /// Variable display parameters.
677 /// Long variable names.
681 /// Extended number of cases.
683 /// Data file attributes.
685 /// Variable attributes.
687 /// Multiple response sets (extended).
689 /// Character encoding.
691 /// Value labels for long strings.
693 /// Missing values for long strings.
695 /// "Format properties in dataview table".
699 pub struct Extension {
700 /// Offset from the start of the file to the start of the record.
706 /// Size of each data element.
709 /// Number of data elements.
712 /// `size * count` bytes of data.
717 fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
719 /* Implemented record types. */
720 ExtensionType::Integer => (4, 8),
721 ExtensionType::Float => (8, 3),
722 ExtensionType::VarSets => (1, 0),
723 ExtensionType::Mrsets => (1, 0),
724 ExtensionType::ProductInfo => (1, 0),
725 ExtensionType::Display => (4, 0),
726 ExtensionType::LongNames => (1, 0),
727 ExtensionType::LongStrings => (1, 0),
728 ExtensionType::Ncases => (8, 2),
729 ExtensionType::FileAttrs => (1, 0),
730 ExtensionType::VarAttrs => (1, 0),
731 ExtensionType::Mrsets2 => (1, 0),
732 ExtensionType::Encoding => (1, 0),
733 ExtensionType::LongLabels => (1, 0),
734 ExtensionType::LongMissing => (1, 0),
736 /* Ignored record types. */
737 ExtensionType::Date => (0, 0),
738 ExtensionType::DataEntry => (0, 0),
739 ExtensionType::Dataview => (0, 0),
744 fn read_extension_record<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Extension, Error> {
745 let subtype = endian.parse(read_bytes(r)?);
746 let offset = r.stream_position()?;
747 let size: u32 = endian.parse(read_bytes(r)?);
748 let count = endian.parse(read_bytes(r)?);
749 let Some(product) = size.checked_mul(count) else {
750 return Err(Error::ExtensionRecordTooLarge {
757 let offset = r.stream_position()?;
758 let data = read_vec(r, product as usize)?;
769 /// File offset to the start of the record.
772 /// File offset to the ZLIB data header.
773 pub zheader_offset: u64,
775 /// File offset to the ZLIB trailer.
776 pub ztrailer_offset: u64,
778 /// Length of the ZLIB trailer in bytes.
779 pub ztrailer_len: u64,
782 fn read_zheader<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
783 let offset = r.stream_position()?;
784 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
785 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
786 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
796 pub struct ZTrailer {
797 /// File offset to the start of the record.
800 /// Compression bias as a negative integer, e.g. -100.
803 /// Always observed as zero.
806 /// Uncompressed size of each block, except possibly the last. Only
807 /// `0x3ff000` has been observed so far.
810 /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
811 pub blocks: Vec<ZBlock>,
815 /// Offset of block of data if simple compression were used.
816 pub uncompressed_ofs: u64,
818 /// Actual offset within the file of the compressed data block.
819 pub compressed_ofs: u64,
821 /// The number of bytes in this data block after decompression. This is
822 /// `block_size` in every data block but the last, which may be smaller.
823 pub uncompressed_size: u32,
825 /// The number of bytes in this data block, as stored compressed in this
827 pub compressed_size: u32,
830 fn read_ztrailer<R: Read + Seek>(
835 ) -> Result<Option<ZTrailer>, Error> {
836 let start_offset = r.stream_position()?;
837 if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
840 let int_bias = endian.parse(read_bytes(r)?);
841 let zero = endian.parse(read_bytes(r)?);
842 let block_size = endian.parse(read_bytes(r)?);
843 let n_blocks: u32 = endian.parse(read_bytes(r)?);
844 let expected_n_blocks = ztrailer_len.saturating_sub(24) / 24; // ztrailer_len comes from the file; saturate so a length below 24 cannot underflow
845 if n_blocks as u64 != expected_n_blocks {
846 return Err(Error::BadZlibTrailerNBlocks {
847 offset: ztrailer_ofs,
853 let mut blocks = Vec::with_capacity(n_blocks as usize);
854 for _ in 0..n_blocks {
855 let uncompressed_ofs = endian.parse(read_bytes(r)?);
856 let compressed_ofs = endian.parse(read_bytes(r)?);
857 let uncompressed_size = endian.parse(read_bytes(r)?);
858 let compressed_size = endian.parse(read_bytes(r)?);
866 r.seek(SeekFrom::Start(start_offset))?;
868 offset: ztrailer_ofs,
876 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
877 let mut buf = [0; N];
878 let n = r.read(&mut buf)?;
881 r.read_exact(&mut buf[n..])?;
889 fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
890 let mut buf = [0; N];
891 r.read_exact(&mut buf)?;
895 fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
896 let mut vec = vec![0; n];
897 r.read_exact(&mut vec)?;