1 #![allow(unused_variables)]
2 use endian::{Endian, Parse};
4 use num_derive::FromPrimitive;
5 use std::io::{BufReader, Error as IoError, Read, Seek};
10 #[derive(Error, Debug)]
12 #[error("Not an SPSS system file")]
15 #[error("Invalid magic number {0:?}")]
18 #[error("I/O error ({source})")]
24 #[error("Invalid SAV compression code {0}")]
25 InvalidSavCompression(u32),
27 #[error("Invalid ZSAV compression code {0}")]
28 InvalidZsavCompression(u32),
30 #[error("Misplaced type 4 record near offset {0:#x}.")]
31 MisplacedType4Record(u64),
33 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
34 BadDocumentLength { offset: u64, n: u32, max: u32 },
36 #[error("At offset {offset:#x}, Unrecognized record type {rec_type}.")]
37 BadRecordType { offset: u64, rec_type: u32 },
39 #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
40 BadVariableLabelCode { offset: u64, code: u32 },
43 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
45 BadNumericMissingValueCode { offset: u64, code: i32 },
47 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
48 BadStringMissingValueCode { offset: u64, code: i32 },
50 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
51 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
53 #[error("At offset {offset:#x}, variable index record (type 4) does not immediately follow value label record (type 3) as it should.")]
54 MissingVariableIndexRecord { offset: u64 },
56 #[error("At offset {offset:#x}, number of variables associated with a value label ({n}) is not between 1 and the number of variables ({max}).")]
57 BadNumberOfValueLabelVariables { offset: u64, n: u32, max: u32 },
59 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
60 ExtensionRecordTooLarge {
67 #[error("Wrong ZLIB data header offset {zheader_offset:#x} (expected {offset:#x}).")]
68 BadZlibHeaderOffset { offset: u64, zheader_offset: u64 },
70 #[error("At offset {offset:#x}, impossible ZLIB trailer offset {ztrailer_offset:#x}.")]
71 BadZlibTrailerOffset { offset: u64, ztrailer_offset: u64 },
73 #[error("At offset {offset:#x}, impossible ZLIB trailer length {ztrailer_len}.")]
74 BadZlibTrailerLen { offset: u64, ztrailer_len: u64 },
77 #[derive(Error, Debug)]
79 #[error("Unexpected floating-point bias {0} or unrecognized floating-point format.")]
82 #[error("Duplicate type 6 (document) record.")]
83 DuplicateDocumentRecord,
86 #[derive(Copy, Clone, Debug)]
87 pub enum Compression {
92 pub struct Reader<R: Read> {
94 documents: Vec<DocumentRecord>,
95 variables: Vec<VariableRecord>,
96 value_labels: Vec<ValueLabelRecord>,
97 extensions: Vec<ExtensionRecord>,
98 zheader: Option<ZHeader>,
101 pub struct FileHeader {
105 /// Endianness of the data in the file header.
106 pub endianness: Endian,
108 /// 0-based variable index of the weight variable, or `None` if the file is
110 pub weight_index: Option<u32>,
112 /// Number of variable positions, or `None` if the value in the file is
113 /// questionably trustworthy.
114 pub nominal_case_size: Option<u32>,
116 /// `dd mmm yy` in the file's encoding.
117 pub creation_date: [u8; 9],
119 /// `HH:MM:SS` in the file's encoding.
120 pub creation_time: [u8; 8],
122 /// Eye-catcher string, then product name, in the file's encoding. Padded
123 /// on the right with spaces.
124 pub eye_catcher: [u8; 60],
126 /// File label, in the file's encoding. Padded on the right with spaces.
127 pub file_label: [u8; 64],
/// A four-byte magic number, stored as the raw bytes.
///
/// The `TryFrom<[u8; 4]>` conversion accepts only the values of the
/// `Magic::SAV`, `Magic::ZSAV`, and `Magic::EBCDIC` constants and returns
/// `Error::BadMagic` for any other byte sequence.
130 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
131 pub struct Magic([u8; 4]);
134 /// Magic number for a regular system file.
135 pub const SAV: Magic = Magic(*b"$FL2");
137 /// Magic number for a system file that contains zlib-compressed data.
138 pub const ZSAV: Magic = Magic(*b"$FL3");
140 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
142 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
145 impl TryFrom<[u8; 4]> for Magic {
148 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
149 let magic = Magic(value);
151 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
152 _ => Err(Error::BadMagic(value)),
157 impl<R: Read + Seek> Reader<R> {
158 pub fn new(r: R, warn: impl Fn(Warning)) -> Result<Reader<R>, Error> {
159 let mut r = BufReader::new(r);
161 let header = read_header(&mut r, &warn)?;
162 let e = header.endianness;
163 let mut documents = Vec::new();
164 let mut variables = Vec::new();
165 let mut value_labels = Vec::new();
166 let mut extensions = Vec::new();
168 let offset = r.stream_position()?;
169 let rec_type: u32 = e.parse(read_bytes(&mut r)?);
171 2 => variables.push(read_variable_record(&mut r, e)?),
172 3 => value_labels.push(read_value_label_record(&mut r, e, variables.len())?),
173 4 => return Err(Error::MisplacedType4Record(offset)),
174 6 => documents.push(read_document_record(&mut r, e)?),
175 7 => extensions.push(read_extension_record(&mut r, e)?),
177 _ => return Err(Error::BadRecordType { offset, rec_type }),
180 let _: [u8; 4] = read_bytes(&mut r)?;
181 let zheader = match header.magic {
182 Magic::ZSAV => Some(read_zheader(&mut r, e)?),
197 fn read_header<R: Read>(r: &mut R, warn: impl Fn(Warning)) -> Result<FileHeader, Error> {
198 let magic: [u8; 4] = read_bytes(r)?;
199 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
201 let eye_catcher: [u8; 60] = read_bytes(r)?;
202 let layout_code: [u8; 4] = read_bytes(r)?;
203 let endianness = Endian::identify_u32(2, layout_code)
204 .or_else(|| Endian::identify_u32(2, layout_code))
205 .ok_or_else(|| Error::NotASystemFile)?;
207 let nominal_case_size: u32 = endianness.parse(read_bytes(r)?);
208 let nominal_case_size =
209 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
211 let compression_code: u32 = endianness.parse(read_bytes(r)?);
212 let compression = match (magic, compression_code) {
213 (Magic::ZSAV, 2) => Some(Compression::ZLib),
214 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
216 (_, 1) => Some(Compression::Simple),
217 (_, code) => return Err(Error::InvalidSavCompression(code)),
220 let weight_index: u32 = endianness.parse(read_bytes(r)?);
221 let weight_index = (weight_index > 0).then_some(weight_index - 1);
223 let n_cases: u32 = endianness.parse(read_bytes(r)?);
224 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
226 let bias: f64 = endianness.parse(read_bytes(r)?);
228 warn(Warning::UnexpectedBias(bias))
231 let creation_date: [u8; 9] = read_bytes(r)?;
232 let creation_time: [u8; 8] = read_bytes(r)?;
233 let file_label: [u8; 64] = read_bytes(r)?;
234 let _: [u8; 3] = read_bytes(r)?;
248 pub struct VariableRecord {
249 /// Offset from the start of the file to the start of the record.
252 /// Variable width, in the range -1..=255.
255 /// Variable name, padded on the right with spaces.
259 pub print_format: u32,
262 pub write_format: u32,
264 /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
265 pub missing_value_code: i32,
267 /// Raw missing values, up to 3 of them.
268 pub missing: Vec<[u8; 8]>,
270 /// Optional variable label.
271 pub label: Option<Vec<u8>>,
274 fn read_variable_record<R: Read + Seek>(
275 r: &mut BufReader<R>,
277 ) -> Result<VariableRecord, Error> {
278 let offset = r.stream_position()?;
279 let width: i32 = e.parse(read_bytes(r)?);
280 let has_variable_label: u32 = e.parse(read_bytes(r)?);
281 let missing_value_code: i32 = e.parse(read_bytes(r)?);
282 let print_format: u32 = e.parse(read_bytes(r)?);
283 let write_format: u32 = e.parse(read_bytes(r)?);
284 let name: [u8; 8] = read_bytes(r)?;
286 let label = match has_variable_label {
289 let len: u32 = e.parse(read_bytes(r)?);
290 let read_len = len.min(65535) as usize;
291 let label = Some(read_vec(r, read_len)?);
293 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
294 let _ = read_vec(r, padding_bytes as usize)?;
299 return Err(Error::BadVariableLabelCode {
301 code: has_variable_label,
306 let mut missing = Vec::new();
307 if missing_value_code != 0 {
308 match (width, missing_value_code) {
309 (0, -3 | -2 | 1 | 2 | 3) => (),
311 return Err(Error::BadNumericMissingValueCode {
313 code: missing_value_code,
318 return Err(Error::BadStringMissingValueCode {
320 code: missing_value_code,
325 for _ in 0..missing_value_code.abs() {
326 missing.push(read_bytes(r)?);
342 pub struct ValueLabelRecord {
343 /// Offset from the start of the file to the start of the record.
347 pub labels: Vec<([u8; 8], Vec<u8>)>,
349 /// The 0-based indexes of the variables to which the labels are assigned.
350 pub var_indexes: Vec<u32>,
/// Largest label count that `read_value_label_record` accepts in a value
/// label record; a larger count is reported as
/// `Error::BadNumberOfValueLabels`.
353 pub const MAX_VALUE_LABELS: u32 = u32::MAX / 8;
355 fn read_value_label_record<R: Read + Seek>(
356 r: &mut BufReader<R>,
358 n_var_records: usize,
359 ) -> Result<ValueLabelRecord, Error> {
360 let offset = r.stream_position()?;
361 let n: u32 = e.parse(read_bytes(r)?);
362 if n > MAX_VALUE_LABELS {
363 return Err(Error::BadNumberOfValueLabels {
366 max: MAX_VALUE_LABELS,
370 let mut labels = Vec::new();
372 let value: [u8; 8] = read_bytes(r)?;
373 let label_len: u8 = e.parse(read_bytes(r)?);
374 let label_len = label_len as usize;
375 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
377 let mut label = read_vec(r, padded_len)?;
378 label.truncate(label_len);
379 labels.push((value, label));
382 let rec_type: u32 = e.parse(read_bytes(r)?);
384 return Err(Error::MissingVariableIndexRecord {
385 offset: r.stream_position()?,
389 let n_vars: u32 = e.parse(read_bytes(r)?);
390 if n_vars < 1 || n_vars as usize > n_var_records {
391 return Err(Error::BadNumberOfValueLabelVariables {
392 offset: r.stream_position()?,
394 max: n_var_records as u32,
397 let mut var_indexes = Vec::with_capacity(n_vars as usize);
399 var_indexes.push(e.parse(read_bytes(r)?));
402 Ok(ValueLabelRecord {
/// Length, in bytes, of one line of a document record.
409 pub const DOC_LINE_LEN: u32 = 80;
/// Largest line count accepted in a document record: the number of whole
/// `DOC_LINE_LEN`-byte lines that fit in `i32::MAX` bytes.
410 pub const DOC_MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
412 pub struct DocumentRecord {
413 /// Offset from the start of the file to the start of the record.
416 /// The document, as an array of 80-byte lines.
417 pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
420 fn read_document_record<R: Read + Seek>(
421 r: &mut BufReader<R>,
423 ) -> Result<DocumentRecord, Error> {
424 let offset = r.stream_position()?;
425 let n: u32 = e.parse(read_bytes(r)?);
427 0..=DOC_MAX_LINES => {
428 let pos = r.stream_position()?;
429 let mut lines = Vec::with_capacity(n as usize);
431 let line: [u8; 80] = read_bytes(r)?;
434 Ok(DocumentRecord { pos, lines })
436 _ => Err(Error::BadDocumentLength {
444 #[derive(FromPrimitive)]
446 /// Machine integer info.
448 /// Machine floating-point info.
454 /// Multiple response sets.
458 /// Extra product info text.
460 /// Variable display parameters.
462 /// Long variable names.
466 /// Extended number of cases.
468 /// Data file attributes.
470 /// Variable attributes.
472 /// Multiple response sets (extended).
474 /// Character encoding.
476 /// Value labels for long strings.
478 /// Missing values for long strings.
480 /// "Format properties in dataview table".
484 struct ExtensionRecord {
485 /// Offset from the start of the file to the start of the record.
491 /// Size of each data element.
494 /// Number of data elements.
497 /// `size * count` bytes of data.
501 fn extension_record_size_requirements(extension: Extension) -> (u32, u32) {
503 /* Implemented record types. */
504 Extension::Integer => (4, 8),
505 Extension::Float => (8, 3),
506 Extension::VarSets => (1, 0),
507 Extension::Mrsets => (1, 0),
508 Extension::ProductInfo => (1, 0),
509 Extension::Display => (4, 0),
510 Extension::LongNames => (1, 0),
511 Extension::LongStrings => (1, 0),
512 Extension::Ncases => (8, 2),
513 Extension::FileAttrs => (1, 0),
514 Extension::VarAttrs => (1, 0),
515 Extension::Mrsets2 => (1, 0),
516 Extension::Encoding => (1, 0),
517 Extension::LongLabels => (1, 0),
518 Extension::LongMissing => (1, 0),
520 /* Ignored record types. */
521 Extension::Date => (0, 0),
522 Extension::DataEntry => (0, 0),
523 Extension::Dataview => (0, 0),
527 fn read_extension_record<R: Read + Seek>(
528 r: &mut BufReader<R>,
530 ) -> Result<ExtensionRecord, Error> {
531 let subtype = e.parse(read_bytes(r)?);
532 let offset = r.stream_position()?;
533 let size: u32 = e.parse(read_bytes(r)?);
534 let count = e.parse(read_bytes(r)?);
535 let Some(product) = size.checked_mul(count) else {
536 return Err(Error::ExtensionRecordTooLarge {
543 let offset = r.stream_position()?;
544 let data = read_vec(r, product as usize)?;
555 /// File offset to the start of the record.
558 /// File offset to the ZLIB data header.
561 /// File offset to the ZLIB trailer.
562 ztrailer_offset: u64,
564 /// Length of the ZLIB trailer in bytes.
568 fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHeader, Error> {
569 let offset = r.stream_position()?;
570 let zheader_offset: u64 = e.parse(read_bytes(r)?);
571 let ztrailer_offset: u64 = e.parse(read_bytes(r)?);
572 let ztrailer_len: u64 = e.parse(read_bytes(r)?);
574 if zheader_offset != offset {
575 return Err(Error::BadZlibHeaderOffset {
580 if ztrailer_offset < offset {
581 return Err(Error::BadZlibTrailerOffset {
586 if ztrailer_len < 24 || ztrailer_len % 24 != 0 {
587 return Err(Error::BadZlibTrailerLen {
601 fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
602 let mut buf = [0; N];
603 r.read_exact(&mut buf)?;
607 fn read_vec<R: Read>(r: &mut BufReader<R>, n: usize) -> Result<Vec<u8>, IoError> {
608 let mut vec = vec![0; n];
609 r.read_exact(&mut vec)?;
614 fn trim_end(mut s: Vec<u8>, c: u8) -> Vec<u8> {
615 while s.last() == Some(&c) {
621 fn skip_bytes<R: Read>(r: &mut R, mut n: u64) -> Result<(), IoError> {
622 let mut buf = [0; 1024];
624 let chunk = u64::min(n, buf.len() as u64);
625 r.read_exact(&mut buf[0..chunk as usize])?;