1 use crate::endian::{Endian, Parse, ToBytes};
4 use flate2::read::ZlibDecoder;
8 io::{Error as IoError, Read, Seek, SeekFrom},
12 use self::state::State;
14 #[derive(Copy, Clone, Debug)]
15 pub enum Compression {
24 ValueLabel(ValueLabel),
25 VarIndexes(VarIndexes),
34 fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
35 let rec_type: u32 = endian.parse(read_bytes(reader)?);
37 2 => Ok(Record::Variable(Variable::read(reader, endian)?)),
38 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)),
39 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)),
40 6 => Ok(Record::Document(Document::read(reader, endian)?)),
41 7 => Ok(Record::Extension(Extension::read(reader, endian)?)),
42 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
43 _ => Err(Error::BadRecordType {
44 offset: reader.stream_position()?,
55 /// Eye-catcher string, product name, in the file's encoding. Padded
56 /// on the right with spaces.
57 pub eye_catcher: [u8; 60],
59 /// Layout code, normally either 2 or 3.
62 /// Number of variable positions, or `None` if the value in the file is
63 /// questionably trustworthy.
64 pub nominal_case_size: Option<u32>,
/// Compression type, if any.
67 pub compression: Option<Compression>,
69 /// 0-based variable index of the weight variable, or `None` if the file is
71 pub weight_index: Option<u32>,
73 /// Claimed number of cases, if known.
74 pub n_cases: Option<u32>,
76 /// Compression bias, usually 100.0.
79 /// `dd mmm yy` in the file's encoding.
80 pub creation_date: [u8; 9],
82 /// `HH:MM:SS` in the file's encoding.
83 pub creation_time: [u8; 8],
85 /// File label, in the file's encoding. Padded on the right with spaces.
86 pub file_label: [u8; 64],
88 /// Endianness of the data in the file header.
93 fn read<R: Read>(r: &mut R) -> Result<Header, Error> {
94 let magic: [u8; 4] = read_bytes(r)?;
95 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
97 let eye_catcher: [u8; 60] = read_bytes(r)?;
98 let layout_code: [u8; 4] = read_bytes(r)?;
99 let endian = Endian::identify_u32(2, layout_code)
100 .or_else(|| Endian::identify_u32(2, layout_code))
101 .ok_or_else(|| Error::NotASystemFile)?;
102 let layout_code = endian.parse(layout_code);
104 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
105 let nominal_case_size =
106 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
108 let compression_code: u32 = endian.parse(read_bytes(r)?);
109 let compression = match (magic, compression_code) {
110 (Magic::ZSAV, 2) => Some(Compression::ZLib),
111 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
113 (_, 1) => Some(Compression::Simple),
114 (_, code) => return Err(Error::InvalidSavCompression(code)),
117 let weight_index: u32 = endian.parse(read_bytes(r)?);
118 let weight_index = (weight_index > 0).then_some(weight_index - 1);
120 let n_cases: u32 = endian.parse(read_bytes(r)?);
121 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
123 let bias: f64 = endian.parse(read_bytes(r)?);
125 let creation_date: [u8; 9] = read_bytes(r)?;
126 let creation_time: [u8; 8] = read_bytes(r)?;
127 let file_label: [u8; 64] = read_bytes(r)?;
128 let _: [u8; 3] = read_bytes(r)?;
147 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
148 pub struct Magic([u8; 4]);
151 /// Magic number for a regular system file.
152 pub const SAV: Magic = Magic(*b"$FL2");
154 /// Magic number for a system file that contains zlib-compressed data.
155 pub const ZSAV: Magic = Magic(*b"$FL3");
/// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
159 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
162 impl TryFrom<[u8; 4]> for Magic {
165 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
166 let magic = Magic(value);
168 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
169 _ => Err(Error::BadMagic(value)),
174 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
181 fn from_width(width: i32) -> VarType {
183 0 => VarType::Number,
184 _ => VarType::String,
191 Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer,
194 use crate::endian::Endian;
196 collections::VecDeque,
201 #[allow(clippy::type_complexity)]
202 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
205 struct Start<R: Read + Seek> {
209 pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
210 Box::new(Start { reader })
213 struct CommonState<R: Read + Seek> {
217 compression: Option<Compression>,
218 var_types: Vec<VarType>,
221 impl<R: Read + Seek + 'static> State for Start<R> {
222 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
223 let header = Header::read(&mut self.reader)?;
224 let next_state = Headers(CommonState {
226 endian: header.endian,
228 compression: header.compression,
229 var_types: Vec::new(),
231 Ok(Some((Record::Header(header), Box::new(next_state))))
235 struct Headers<R: Read + Seek>(CommonState<R>);
237 impl<R: Read + Seek + 'static> State for Headers<R> {
238 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
239 let record = Record::read(&mut self.0.reader, self.0.endian)?;
241 Record::Variable(Variable { width, .. }) => {
242 self.0.var_types.push(VarType::from_width(width));
244 Record::EndOfHeaders(_) => {
245 let next_state: Box<dyn State> = match self.0.compression {
246 None => Box::new(Data(self.0)),
247 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
248 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
250 return Ok(Some((record, next_state)));
254 Ok(Some((record, self)))
258 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
260 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
261 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
262 let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
263 Ok(Some((Record::ZHeader(zheader), self)))
267 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
269 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
270 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
271 let retval = ZTrailer::read(
274 self.1.ztrailer_offset,
277 let next_state = Box::new(CompressedData::new(CommonState {
278 reader: ZlibDecodeMultiple::new(self.0.reader),
279 endian: self.0.endian,
281 compression: self.0.compression,
282 var_types: self.0.var_types,
285 None => next_state.read(),
286 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
291 struct Data<R: Read + Seek>(CommonState<R>);
293 impl<R: Read + Seek + 'static> State for Data<R> {
294 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
295 match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
297 Some(values) => Ok(Some((Record::Case(values), self))),
302 struct CompressedData<R: Read + Seek> {
303 common: CommonState<R>,
307 impl<R: Read + Seek + 'static> CompressedData<R> {
308 fn new(common: CommonState<R>) -> CompressedData<R> {
311 codes: VecDeque::new(),
316 impl<R: Read + Seek + 'static> State for CompressedData<R> {
317 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
318 match Value::read_compressed_case(
319 &mut self.common.reader,
320 &self.common.var_types,
326 Some(values) => Ok(Some((Record::Case(values), self))),
332 #[derive(Copy, Clone)]
339 pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
341 VarType::String => Value::String(raw),
343 let number: f64 = endian.parse(raw);
344 Value::Number((number != -f64::MAX).then_some(number))
349 fn read_case<R: Read + Seek>(
351 var_types: &[VarType],
353 ) -> Result<Option<Vec<Value>>, Error> {
354 let case_start = reader.stream_position()?;
355 let mut values = Vec::with_capacity(var_types.len());
356 for (i, &var_type) in var_types.iter().enumerate() {
357 let Some(raw) = try_read_bytes(reader)? else {
361 let offset = reader.stream_position()?;
362 return Err(Error::EofInCase {
364 case_ofs: offset - case_start,
365 case_len: var_types.len() * 8,
369 values.push(Value::from_raw(var_type, raw, endian));
374 fn read_compressed_case<R: Read + Seek>(
376 var_types: &[VarType],
377 codes: &mut VecDeque<u8>,
380 ) -> Result<Option<Vec<Value>>, Error> {
381 let case_start = reader.stream_position()?;
382 let mut values = Vec::with_capacity(var_types.len());
383 for (i, &var_type) in var_types.iter().enumerate() {
385 let Some(code) = codes.pop_front() else {
386 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
390 let offset = reader.stream_position()?;
391 return Err(Error::EofInCompressedCase {
393 case_ofs: offset - case_start,
397 codes.extend(new_codes.into_iter());
402 1..=251 => match var_type {
403 VarType::Number => break Value::Number(Some(code as f64 - bias)),
405 break Value::String(endian.to_bytes(code as f64 - bias))
412 let offset = reader.stream_position()?;
413 return Err(Error::PartialCompressedCase {
415 case_ofs: offset - case_start,
419 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian),
420 254 => match var_type {
421 VarType::String => break Value::String(*b" "), // XXX EBCDIC
423 return Err(Error::CompressedStringExpected {
425 case_ofs: reader.stream_position()? - case_start,
429 255 => match var_type {
430 VarType::Number => break Value::Number(None),
432 return Err(Error::CompressedNumberExpected {
434 case_ofs: reader.stream_position()? - case_start,
446 struct ZlibDecodeMultiple<R>
450 reader: Option<ZlibDecoder<R>>,
453 impl<R> ZlibDecodeMultiple<R>
457 fn new(reader: R) -> ZlibDecodeMultiple<R> {
459 reader: Some(ZlibDecoder::new(reader)),
464 impl<R> Read for ZlibDecodeMultiple<R>
468 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
470 match self.reader.as_mut().unwrap().read(buf)? {
472 let inner = self.reader.take().unwrap().into_inner();
473 self.reader = Some(ZlibDecoder::new(inner));
481 impl<R> Seek for ZlibDecodeMultiple<R>
485 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
486 self.reader.as_mut().unwrap().get_mut().seek(pos)
491 state: Option<Box<dyn State>>,
495 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
497 state: Some(state::new(reader)),
502 impl Iterator for Reader {
503 type Item = Result<Record, Error>;
505 fn next(&mut self) -> Option<Self::Item> {
506 match self.state.take()?.read() {
507 Ok(Some((record, next_state))) => {
508 self.state = Some(next_state);
512 Err(error) => Some(Err(error)),
517 impl FusedIterator for Reader {}
519 pub struct Variable {
520 /// Offset from the start of the file to the start of the record.
523 /// Variable width, in the range -1..=255.
526 /// Variable name, padded on the right with spaces.
530 pub print_format: u32,
533 pub write_format: u32,
535 /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
536 pub missing_value_code: i32,
538 /// Raw missing values, up to 3 of them.
539 pub missing: Vec<[u8; 8]>,
541 /// Optional variable label.
542 pub label: Option<Vec<u8>>,
546 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
547 let offset = r.stream_position()?;
548 let width: i32 = endian.parse(read_bytes(r)?);
549 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
550 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
551 let print_format: u32 = endian.parse(read_bytes(r)?);
552 let write_format: u32 = endian.parse(read_bytes(r)?);
553 let name: [u8; 8] = read_bytes(r)?;
555 let label = match has_variable_label {
558 let len: u32 = endian.parse(read_bytes(r)?);
559 let read_len = len.min(65535) as usize;
560 let label = Some(read_vec(r, read_len)?);
562 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
563 let _ = read_vec(r, padding_bytes as usize)?;
568 return Err(Error::BadVariableLabelCode {
570 code: has_variable_label,
575 let mut missing = Vec::new();
576 if missing_value_code != 0 {
577 match (width, missing_value_code) {
578 (0, -3 | -2 | 1 | 2 | 3) => (),
580 return Err(Error::BadNumericMissingValueCode {
582 code: missing_value_code,
587 return Err(Error::BadStringMissingValueCode {
589 code: missing_value_code,
594 for _ in 0..missing_value_code.abs() {
595 missing.push(read_bytes(r)?);
612 pub struct ValueLabel {
613 /// Offset from the start of the file to the start of the record.
617 pub labels: Vec<([u8; 8], Vec<u8>)>,
621 /// Maximum number of value labels in a record.
622 pub const MAX: u32 = u32::MAX / 8;
624 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
625 let offset = r.stream_position()?;
626 let n: u32 = endian.parse(read_bytes(r)?);
627 if n > ValueLabel::MAX {
628 return Err(Error::BadNumberOfValueLabels {
631 max: ValueLabel::MAX,
635 let mut labels = Vec::new();
637 let value: [u8; 8] = read_bytes(r)?;
638 let label_len: u8 = endian.parse(read_bytes(r)?);
639 let label_len = label_len as usize;
640 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
642 let mut label = read_vec(r, padded_len)?;
643 label.truncate(label_len);
644 labels.push((value, label));
646 Ok(ValueLabel { offset, labels })
650 pub struct VarIndexes {
651 /// Offset from the start of the file to the start of the record.
/// The 0-based variable indexes.
655 pub var_indexes: Vec<u32>,
659 /// Maximum number of variable indexes in a record.
660 pub const MAX: u32 = u32::MAX / 8;
662 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
663 let offset = r.stream_position()?;
664 let n: u32 = endian.parse(read_bytes(r)?);
665 if n > VarIndexes::MAX {
666 return Err(Error::BadNumberOfVarIndexes {
669 max: VarIndexes::MAX,
672 let mut var_indexes = Vec::with_capacity(n as usize);
674 var_indexes.push(endian.parse(read_bytes(r)?));
684 pub struct Document {
685 /// Offset from the start of the file to the start of the record.
688 /// The document, as an array of 80-byte lines.
689 pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
693 /// Length of a line in a document. Document lines are fixed-length and
694 /// padded on the right with spaces.
695 pub const LINE_LEN: u32 = 80;
697 /// Maximum number of lines we will accept in a document. This is simply
698 /// the maximum number that will fit in a 32-bit space.
699 pub const MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
701 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
702 let offset = r.stream_position()?;
703 let n: u32 = endian.parse(read_bytes(r)?);
705 0..=DOC_MAX_LINES => {
706 let pos = r.stream_position()?;
707 let mut lines = Vec::with_capacity(n as usize);
709 let line: [u8; 80] = read_bytes(r)?;
712 Ok(Document { pos, lines })
714 _ => Err(Error::BadDocumentLength {
724 #[derive(FromPrimitive)]
726 /// Machine integer info.
728 /// Machine floating-point info.
734 /// Multiple response sets.
738 /// Extra product info text.
740 /// Variable display parameters.
742 /// Long variable names.
746 /// Extended number of cases.
748 /// Data file attributes.
750 /// Variable attributes.
752 /// Multiple response sets (extended).
754 /// Character encoding.
756 /// Value labels for long strings.
758 /// Missing values for long strings.
760 /// "Format properties in dataview table".
765 pub struct Extension {
766 /// Offset from the start of the file to the start of the record.
772 /// Size of each data element.
775 /// Number of data elements.
778 /// `size * count` bytes of data.
783 fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
785 /* Implemented record types. */
786 ExtensionType::Integer => (4, 8),
787 ExtensionType::Float => (8, 3),
788 ExtensionType::VarSets => (1, 0),
789 ExtensionType::Mrsets => (1, 0),
790 ExtensionType::ProductInfo => (1, 0),
791 ExtensionType::Display => (4, 0),
792 ExtensionType::LongNames => (1, 0),
793 ExtensionType::LongStrings => (1, 0),
794 ExtensionType::Ncases => (8, 2),
795 ExtensionType::FileAttrs => (1, 0),
796 ExtensionType::VarAttrs => (1, 0),
797 ExtensionType::Mrsets2 => (1, 0),
798 ExtensionType::Encoding => (1, 0),
799 ExtensionType::LongLabels => (1, 0),
800 ExtensionType::LongMissing => (1, 0),
802 /* Ignored record types. */
803 ExtensionType::Date => (0, 0),
804 ExtensionType::DataEntry => (0, 0),
805 ExtensionType::Dataview => (0, 0),
811 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Extension, Error> {
812 let subtype = endian.parse(read_bytes(r)?);
813 let offset = r.stream_position()?;
814 let size: u32 = endian.parse(read_bytes(r)?);
815 let count = endian.parse(read_bytes(r)?);
816 let Some(product) = size.checked_mul(count) else {
817 return Err(Error::ExtensionRecordTooLarge {
824 let offset = r.stream_position()?;
825 let data = read_vec(r, product as usize)?;
837 /// File offset to the start of the record.
840 /// File offset to the ZLIB data header.
841 pub zheader_offset: u64,
843 /// File offset to the ZLIB trailer.
844 pub ztrailer_offset: u64,
846 /// Length of the ZLIB trailer in bytes.
847 pub ztrailer_len: u64,
851 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
852 let offset = r.stream_position()?;
853 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
854 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
855 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
866 pub struct ZTrailer {
867 /// File offset to the start of the record.
870 /// Compression bias as a negative integer, e.g. -100.
873 /// Always observed as zero.
876 /// Uncompressed size of each block, except possibly the last. Only
877 /// `0x3ff000` has been observed so far.
/// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
881 pub blocks: Vec<ZBlock>,
885 /// Offset of block of data if simple compression were used.
886 pub uncompressed_ofs: u64,
888 /// Actual offset within the file of the compressed data block.
889 pub compressed_ofs: u64,
891 /// The number of bytes in this data block after decompression. This is
892 /// `block_size` in every data block but the last, which may be smaller.
893 pub uncompressed_size: u32,
895 /// The number of bytes in this data block, as stored compressed in this
897 pub compressed_size: u32,
901 fn read<R: Read + Seek>(
906 ) -> Result<Option<ZTrailer>, Error> {
907 let start_offset = r.stream_position()?;
908 if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
911 let int_bias = endian.parse(read_bytes(r)?);
912 let zero = endian.parse(read_bytes(r)?);
913 let block_size = endian.parse(read_bytes(r)?);
914 let n_blocks: u32 = endian.parse(read_bytes(r)?);
915 let expected_n_blocks = (ztrailer_len - 24) / 24;
916 if n_blocks as u64 != expected_n_blocks {
917 return Err(Error::BadZlibTrailerNBlocks {
918 offset: ztrailer_ofs,
924 let mut blocks = Vec::with_capacity(n_blocks as usize);
925 for _ in 0..n_blocks {
926 let uncompressed_ofs = endian.parse(read_bytes(r)?);
927 let compressed_ofs = endian.parse(read_bytes(r)?);
928 let uncompressed_size = endian.parse(read_bytes(r)?);
929 let compressed_size = endian.parse(read_bytes(r)?);
937 r.seek(SeekFrom::Start(start_offset))?;
939 offset: ztrailer_ofs,
948 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
949 let mut buf = [0; N];
950 let n = r.read(&mut buf)?;
953 r.read_exact(&mut buf[n..])?;
961 fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
962 let mut buf = [0; N];
963 r.read_exact(&mut buf)?;
967 fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
968 let mut vec = vec![0; n];
969 r.read_exact(&mut vec)?;