1 use crate::endian::{Endian, Parse, ToBytes};
3 use encoding_rs::mem::decode_latin1;
4 use flate2::read::ZlibDecoder;
7 use std::cmp::Ordering;
8 use std::fmt::{Debug, Formatter, Result as FmtResult};
10 use std::str::from_utf8;
12 collections::VecDeque,
13 io::{Error as IoError, Read, Seek, SeekFrom},
16 use thiserror::Error as ThisError;
18 use self::state::State;
20 #[derive(ThisError, Debug)]
22 #[error("Not an SPSS system file")]
25 #[error("Invalid magic number {0:?}")]
28 #[error("I/O error ({0})")]
31 #[error("Invalid SAV compression code {0}")]
32 InvalidSavCompression(u32),
34 #[error("Invalid ZSAV compression code {0}")]
35 InvalidZsavCompression(u32),
37 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
38 BadVariableWidth { offset: u64, width: i32 },
40 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
41 BadDocumentLength { offset: u64, n: usize, max: usize },
43 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
44 BadRecordType { offset: u64, rec_type: u32 },
46 #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
47 BadVariableLabelCode { start_offset: u64, code_offset: u64, code: u32 },
50 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
52 BadNumericMissingValueCode { offset: u64, code: i32 },
54 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
55 BadStringMissingValueCode { offset: u64, code: i32 },
57 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
58 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
60 #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
61 ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
63 #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
64 BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
66 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
67 ExtensionRecordTooLarge {
74 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
82 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
84 EofInCompressedCase { offset: u64, case_ofs: u64 },
86 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
87 PartialCompressedCase { offset: u64, case_ofs: u64 },
89 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
90 CompressedNumberExpected { offset: u64, case_ofs: u64 },
92 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
93 CompressedStringExpected { offset: u64, case_ofs: u64 },
95 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
96 BadZlibTrailerNBlocks {
99 expected_n_blocks: u64,
103 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
111 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
119 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
120 BadLongMissingValueLength {
126 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
127 BadEncodingName { offset: u64 },
129 #[error("Details TBD")]
133 #[derive(Clone, Debug)]
135 Header(HeaderRecord),
136 Variable(VariableRecord),
137 ValueLabel(ValueLabelRecord),
138 Document(DocumentRecord),
139 IntegerInfo(IntegerInfoRecord),
140 FloatInfo(FloatInfoRecord),
141 VariableSets(TextRecord),
142 VarDisplay(VarDisplayRecord),
143 MultipleResponse(MultipleResponseRecord),
144 LongStringValueLabels(LongStringValueLabelRecord),
145 Encoding(EncodingRecord),
146 NumberOfCases(NumberOfCasesRecord),
147 ProductInfo(TextRecord),
148 LongNames(TextRecord),
149 VeryLongStrings(TextRecord),
150 FileAttributes(TextRecord),
151 VariableAttributes(TextRecord),
152 OtherExtension(Extension),
160 fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
161 let rec_type: u32 = endian.parse(read_bytes(reader)?);
163 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)),
164 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)),
165 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)),
166 7 => Ok(Extension::read(reader, endian)?),
167 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
168 _ => Err(Error::BadRecordType {
169 offset: reader.stream_position()?,
176 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
177 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
178 fn default_decode<>(s: &[u8]) -> Cow<str> {
179 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
182 #[derive(Copy, Clone, Debug)]
183 pub enum Compression {
189 fn offsets(&self) -> Range<u64>;
193 pub struct HeaderRecord {
195 pub offsets: Range<u64>,
200 /// Eye-catcher string, product name, in the file's encoding. Padded
201 /// on the right with spaces.
202 pub eye_catcher: UnencodedStr<60>,
204 /// Layout code, normally either 2 or 3.
205 pub layout_code: u32,
207 /// Number of variable positions, or `None` if the value in the file is
208 /// questionably trustworthy.
209 pub nominal_case_size: Option<u32>,
211 /// Compression type, if any.
212 pub compression: Option<Compression>,
214 /// 1-based variable index of the weight variable, or `None` if the file is
216 pub weight_index: Option<u32>,
218 /// Claimed number of cases, if known.
219 pub n_cases: Option<u32>,
221 /// Compression bias, usually 100.0.
224 /// `dd mmm yy` in the file's encoding.
225 pub creation_date: UnencodedStr<9>,
227 /// `HH:MM:SS` in the file's encoding.
228 pub creation_time: UnencodedStr<8>,
230 /// File label, in the file's encoding. Padded on the right with spaces.
231 pub file_label: UnencodedStr<64>,
233 /// Endianness of the data in the file header.
238 fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
239 writeln!(f, "{name:>17}: {:?}", value)
243 impl Debug for HeaderRecord {
244 fn fmt(&self, f: &mut Formatter) -> FmtResult {
245 writeln!(f, "File header record:")?;
246 self.debug_field(f, "Magic", self.magic)?;
247 self.debug_field(f, "Product name", self.eye_catcher)?;
248 self.debug_field(f, "Layout code", self.layout_code)?;
249 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
250 self.debug_field(f, "Compression", self.compression)?;
251 self.debug_field(f, "Weight index", self.weight_index)?;
252 self.debug_field(f, "Number of cases", self.n_cases)?;
253 self.debug_field(f, "Compression bias", self.bias)?;
254 self.debug_field(f, "Creation date", self.creation_date)?;
255 self.debug_field(f, "Creation time", self.creation_time)?;
256 self.debug_field(f, "File label", self.file_label)?;
257 self.debug_field(f, "Endianness", self.endian)
262 fn read<R: Read + Seek>(r: &mut R) -> Result<HeaderRecord, Error> {
263 let start = r.stream_position()?;
265 let magic: [u8; 4] = read_bytes(r)?;
266 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
268 let eye_catcher = UnencodedStr::<60>(read_bytes(r)?);
269 let layout_code: [u8; 4] = read_bytes(r)?;
270 let endian = Endian::identify_u32(2, layout_code)
271 .or_else(|| Endian::identify_u32(2, layout_code))
272 .ok_or_else(|| Error::NotASystemFile)?;
273 let layout_code = endian.parse(layout_code);
275 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
276 let nominal_case_size =
277 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
279 let compression_code: u32 = endian.parse(read_bytes(r)?);
280 let compression = match (magic, compression_code) {
281 (Magic::ZSAV, 2) => Some(Compression::ZLib),
282 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
284 (_, 1) => Some(Compression::Simple),
285 (_, code) => return Err(Error::InvalidSavCompression(code)),
288 let weight_index: u32 = endian.parse(read_bytes(r)?);
289 let weight_index = (weight_index > 0).then_some(weight_index);
291 let n_cases: u32 = endian.parse(read_bytes(r)?);
292 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
294 let bias: f64 = endian.parse(read_bytes(r)?);
296 let creation_date = UnencodedStr::<9>(read_bytes(r)?);
297 let creation_time = UnencodedStr::<8>(read_bytes(r)?);
298 let file_label = UnencodedStr::<64>(read_bytes(r)?);
299 let _: [u8; 3] = read_bytes(r)?;
302 offsets: start..r.stream_position()?,
319 impl Header for HeaderRecord {
320 fn offsets(&self) -> Range<u64> {
325 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
326 pub struct Magic([u8; 4]);
329 /// Magic number for a regular system file.
330 pub const SAV: Magic = Magic(*b"$FL2");
332 /// Magic number for a system file that contains zlib-compressed data.
333 pub const ZSAV: Magic = Magic(*b"$FL3");
335 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
337 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
340 impl Debug for Magic {
341 fn fmt(&self, f: &mut Formatter) -> FmtResult {
342 let s = match *self {
343 Magic::SAV => "$FL2",
344 Magic::ZSAV => "$FL3",
345 Magic::EBCDIC => "($FL2 in EBCDIC)",
346 _ => return write!(f, "{:?}", self.0),
352 impl TryFrom<[u8; 4]> for Magic {
355 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
356 let magic = Magic(value);
358 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
359 _ => Err(Error::BadMagic(value)),
364 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
371 fn from_width(width: i32) -> VarType {
373 0 => VarType::Numeric,
374 _ => VarType::String,
381 Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader,
382 ZTrailer, ZlibDecodeMultiple,
384 use crate::endian::Endian;
386 collections::VecDeque,
391 #[allow(clippy::type_complexity)]
392 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
395 struct Start<R: Read + Seek> {
399 pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
400 Box::new(Start { reader })
403 struct CommonState<R: Read + Seek> {
407 compression: Option<Compression>,
408 var_types: Vec<VarType>,
411 impl<R: Read + Seek + 'static> State for Start<R> {
412 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
413 let header = HeaderRecord::read(&mut self.reader)?;
414 let next_state = Headers(CommonState {
416 endian: header.endian,
418 compression: header.compression,
419 var_types: Vec::new(),
421 Ok(Some((Record::Header(header), Box::new(next_state))))
425 struct Headers<R: Read + Seek>(CommonState<R>);
427 impl<R: Read + Seek + 'static> State for Headers<R> {
428 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
429 let record = Record::read(&mut self.0.reader, self.0.endian)?;
431 Record::Variable(VariableRecord { width, .. }) => {
432 self.0.var_types.push(VarType::from_width(width));
434 Record::EndOfHeaders(_) => {
435 let next_state: Box<dyn State> = match self.0.compression {
436 None => Box::new(Data(self.0)),
437 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
438 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
440 return Ok(Some((record, next_state)));
444 Ok(Some((record, self)))
448 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
450 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
451 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
452 let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
453 Ok(Some((Record::ZHeader(zheader), self)))
457 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
459 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
460 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
461 let retval = ZTrailer::read(
464 self.1.ztrailer_offset,
467 let next_state = Box::new(CompressedData::new(CommonState {
468 reader: ZlibDecodeMultiple::new(self.0.reader),
469 endian: self.0.endian,
471 compression: self.0.compression,
472 var_types: self.0.var_types,
475 None => next_state.read(),
476 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
481 struct Data<R: Read + Seek>(CommonState<R>);
483 impl<R: Read + Seek + 'static> State for Data<R> {
484 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
485 match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
487 Some(values) => Ok(Some((Record::Case(values), self))),
492 struct CompressedData<R: Read + Seek> {
493 common: CommonState<R>,
497 impl<R: Read + Seek + 'static> CompressedData<R> {
498 fn new(common: CommonState<R>) -> CompressedData<R> {
501 codes: VecDeque::new(),
506 impl<R: Read + Seek + 'static> State for CompressedData<R> {
507 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
508 match Value::read_compressed_case(
509 &mut self.common.reader,
510 &self.common.var_types,
516 Some(values) => Ok(Some((Record::Case(values), self))),
522 #[derive(Copy, Clone)]
525 String(UnencodedStr<8>),
528 impl Debug for Value {
529 fn fmt(&self, f: &mut Formatter) -> FmtResult {
531 Value::Number(Some(number)) => write!(f, "{number:?}"),
532 Value::Number(None) => write!(f, "SYSMIS"),
533 Value::String(bytes) => write!(f, "{:?}", bytes),
539 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
541 UntypedValue(read_bytes(r)?),
547 pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value {
549 VarType::String => Value::String(UnencodedStr(raw.0)),
550 VarType::Numeric => {
551 let number: f64 = endian.parse(raw.0);
552 Value::Number((number != -f64::MAX).then_some(number))
557 fn read_case<R: Read + Seek>(
559 var_types: &[VarType],
561 ) -> Result<Option<Vec<Value>>, Error> {
562 let case_start = reader.stream_position()?;
563 let mut values = Vec::with_capacity(var_types.len());
564 for (i, &var_type) in var_types.iter().enumerate() {
565 let Some(raw) = try_read_bytes(reader)? else {
569 let offset = reader.stream_position()?;
570 return Err(Error::EofInCase {
572 case_ofs: offset - case_start,
573 case_len: var_types.len() * 8,
577 values.push(Value::from_raw(UntypedValue(raw), var_type, endian));
582 fn read_compressed_case<R: Read + Seek>(
584 var_types: &[VarType],
585 codes: &mut VecDeque<u8>,
588 ) -> Result<Option<Vec<Value>>, Error> {
589 let case_start = reader.stream_position()?;
590 let mut values = Vec::with_capacity(var_types.len());
591 for (i, &var_type) in var_types.iter().enumerate() {
593 let Some(code) = codes.pop_front() else {
594 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
598 let offset = reader.stream_position()?;
599 return Err(Error::EofInCompressedCase {
601 case_ofs: offset - case_start,
605 codes.extend(new_codes.into_iter());
610 1..=251 => match var_type {
611 VarType::Numeric => break Value::Number(Some(code as f64 - bias)),
613 break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias)))
620 let offset = reader.stream_position()?;
621 return Err(Error::PartialCompressedCase {
623 case_ofs: offset - case_start,
628 break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian)
630 254 => match var_type {
631 VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
632 VarType::Numeric => {
633 return Err(Error::CompressedStringExpected {
635 case_ofs: reader.stream_position()? - case_start,
639 255 => match var_type {
640 VarType::Numeric => break Value::Number(None),
642 return Err(Error::CompressedNumberExpected {
644 case_ofs: reader.stream_position()? - case_start,
656 struct ZlibDecodeMultiple<R>
660 reader: Option<ZlibDecoder<R>>,
663 impl<R> ZlibDecodeMultiple<R>
667 fn new(reader: R) -> ZlibDecodeMultiple<R> {
669 reader: Some(ZlibDecoder::new(reader)),
674 impl<R> Read for ZlibDecodeMultiple<R>
678 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
680 match self.reader.as_mut().unwrap().read(buf)? {
682 let inner = self.reader.take().unwrap().into_inner();
683 self.reader = Some(ZlibDecoder::new(inner));
691 impl<R> Seek for ZlibDecodeMultiple<R>
695 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
696 self.reader.as_mut().unwrap().get_mut().seek(pos)
701 state: Option<Box<dyn State>>,
705 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
707 state: Some(state::new(reader)),
710 pub fn collect_headers(&mut self) -> Result<Vec<Record>, Error> {
711 let mut headers = Vec::new();
714 Record::EndOfHeaders(_) => break,
715 r => headers.push(r),
722 impl Iterator for Reader {
723 type Item = Result<Record, Error>;
725 fn next(&mut self) -> Option<Self::Item> {
726 match self.state.take()?.read() {
727 Ok(Some((record, next_state))) => {
728 self.state = Some(next_state);
732 Err(error) => Some(Err(error)),
737 impl FusedIterator for Reader {}
739 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
740 pub struct Spec(pub u32);
742 impl Debug for Spec {
743 fn fmt(&self, f: &mut Formatter) -> FmtResult {
744 let type_ = format_name(self.0 >> 16);
745 let w = (self.0 >> 8) & 0xff;
746 let d = self.0 & 0xff;
747 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
751 fn format_name(type_: u32) -> Cow<'static, str> {
790 _ => return format!("<unknown format {type_}>").into(),
796 pub struct MissingValues {
797 /// Individual missing values, up to 3 of them.
798 pub values: Vec<Value>,
800 /// Optional range of missing values.
801 pub range: Option<(Value, Value)>,
804 impl Debug for MissingValues {
805 fn fmt(&self, f: &mut Formatter) -> FmtResult {
806 for (i, value) in self.values.iter().enumerate() {
810 write!(f, "{value:?}")?;
813 if let Some((low, high)) = self.range {
814 if !self.values.is_empty() {
817 write!(f, "{low:?} THRU {high:?}")?;
829 fn is_empty(&self) -> bool {
830 self.values.is_empty() && self.range.is_none()
833 fn read<R: Read + Seek>(
839 ) -> Result<MissingValues, Error> {
840 let (n_values, has_range) = match (width, code) {
841 (_, 0..=3) => (code, false),
842 (0, -2) => (0, true),
843 (0, -3) => (1, true),
844 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
845 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
848 let var_type = VarType::from_width(width);
850 let mut values = Vec::new();
851 for _ in 0..n_values {
852 values.push(Value::read(r, var_type, endian)?);
854 let range = if has_range {
855 let low = Value::read(r, var_type, endian)?;
856 let high = Value::read(r, var_type, endian)?;
861 Ok(MissingValues { values, range })
866 pub struct VariableRecord {
867 /// Range of offsets in file.
868 pub offsets: Range<u64>,
870 /// Variable width, in the range -1..=255.
873 /// Variable name, padded on the right with spaces.
874 pub name: UnencodedStr<8>,
877 pub print_format: Spec,
880 pub write_format: Spec,
883 pub missing_values: MissingValues,
885 /// Optional variable label.
886 pub label: Option<UnencodedString>,
889 impl Debug for VariableRecord {
890 fn fmt(&self, f: &mut Formatter) -> FmtResult {
895 match self.width.cmp(&0) {
896 Ordering::Greater => "string",
897 Ordering::Equal => "numeric",
898 Ordering::Less => "long string continuation record",
901 writeln!(f, "Print format: {:?}", self.print_format)?;
902 writeln!(f, "Write format: {:?}", self.write_format)?;
903 writeln!(f, "Name: {:?}", &self.name)?;
904 writeln!(f, "Variable label: {:?}", self.label)?;
905 writeln!(f, "Missing values: {:?}", self.missing_values)
909 impl VariableRecord {
910 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VariableRecord, Error> {
911 let start_offset = r.stream_position()?;
912 let width: i32 = endian.parse(read_bytes(r)?);
913 let code_offset = r.stream_position()?;
914 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
915 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
916 let print_format = Spec(endian.parse(read_bytes(r)?));
917 let write_format = Spec(endian.parse(read_bytes(r)?));
918 let name = UnencodedStr::<8>(read_bytes(r)?);
920 let label = match has_variable_label {
923 let len: u32 = endian.parse(read_bytes(r)?);
924 let read_len = len.min(65535) as usize;
925 let label = UnencodedString(read_vec(r, read_len)?);
927 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
928 let _ = read_vec(r, padding_bytes as usize)?;
933 return Err(Error::BadVariableLabelCode {
936 code: has_variable_label,
941 let missing_values = MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
943 let end_offset = r.stream_position()?;
946 offsets: start_offset..end_offset,
957 #[derive(Copy, Clone)]
958 pub struct UntypedValue(pub [u8; 8]);
960 impl Debug for UntypedValue {
961 fn fmt(&self, f: &mut Formatter) -> FmtResult {
962 let little: f64 = Endian::Little.parse(self.0);
963 let little = format!("{:?}", little);
964 let big: f64 = Endian::Big.parse(self.0);
965 let big = format!("{:?}", big);
966 let number = if little.len() <= big.len() {
971 write!(f, "{number}")?;
973 let string = default_decode(&self.0);
975 .split(|c: char| c == '\0' || c.is_control())
978 write!(f, "{string:?}")?;
984 pub struct UnencodedString(pub Vec<u8>);
986 impl From<Vec<u8>> for UnencodedString {
987 fn from(source: Vec<u8>) -> Self {
992 impl From<&[u8]> for UnencodedString {
993 fn from(source: &[u8]) -> Self {
998 impl Debug for UnencodedString {
999 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1000 write!(f, "{:?}", default_decode(self.0.as_slice()))
1004 #[derive(Copy, Clone)]
1005 pub struct UnencodedStr<const N: usize>(pub [u8; N]);
1007 impl<const N: usize> From<[u8; N]> for UnencodedStr<N> {
1008 fn from(source: [u8; N]) -> Self {
1013 impl<const N: usize> Debug for UnencodedStr<N> {
1014 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1015 write!(f, "{:?}", default_decode(&self.0))
1020 pub struct ValueLabelRecord {
1021 /// Offset from the start of the file to the start of the value label
1023 pub label_offset: u64,
1026 pub labels: Vec<(UntypedValue, UnencodedString)>,
1028 /// Offset from the start of the file to the start of the variable index
1030 pub index_offset: u64,
1032 /// The 1-based indexes of the variables to which the labels apply.
1033 pub dict_indexes: Vec<u32>,
1036 impl Debug for ValueLabelRecord {
1037 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1038 writeln!(f, "labels: ")?;
1039 for (value, label) in self.labels.iter() {
1040 writeln!(f, "{value:?}: {label:?}")?;
1042 write!(f, "apply to variables")?;
1043 for dict_index in self.dict_indexes.iter() {
1044 write!(f, " #{dict_index}")?;
1050 impl ValueLabelRecord {
1051 /// Maximum number of value labels in a record.
1052 pub const MAX_LABELS: u32 = u32::MAX / 8;
1054 /// Maximum number of variable indexes in a record.
1055 pub const MAX_INDEXES: u32 = u32::MAX / 8;
1057 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabelRecord, Error> {
1058 let label_offset = r.stream_position()?;
1059 let n: u32 = endian.parse(read_bytes(r)?);
1060 if n > Self::MAX_LABELS {
1061 return Err(Error::BadNumberOfValueLabels {
1062 offset: label_offset,
1064 max: Self::MAX_LABELS,
1068 let mut labels = Vec::new();
1070 let value = UntypedValue(read_bytes(r)?);
1071 let label_len: u8 = endian.parse(read_bytes(r)?);
1072 let label_len = label_len as usize;
1073 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1075 let mut label = read_vec(r, padded_len - 1)?;
1076 label.truncate(label_len);
1077 labels.push((value, UnencodedString(label)));
1080 let index_offset = r.stream_position()?;
1081 let rec_type: u32 = endian.parse(read_bytes(r)?);
1083 return Err(Error::ExpectedVarIndexRecord {
1084 offset: index_offset,
1089 let n: u32 = endian.parse(read_bytes(r)?);
1090 if n > Self::MAX_INDEXES {
1091 return Err(Error::BadNumberOfVarIndexes {
1092 offset: index_offset,
1094 max: Self::MAX_INDEXES,
1097 let mut dict_indexes = Vec::with_capacity(n as usize);
1099 dict_indexes.push(endian.parse(read_bytes(r)?));
1102 Ok(ValueLabelRecord {
1111 #[derive(Clone, Debug)]
1112 pub struct DocumentRecord {
1113 /// Offset from the start of the file to the start of the record.
1116 /// The document, as an array of 80-byte lines.
1117 pub lines: Vec<DocumentLine>,
1120 pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>;
1122 impl DocumentRecord {
1123 /// Length of a line in a document. Document lines are fixed-length and
1124 /// padded on the right with spaces.
1125 pub const LINE_LEN: usize = 80;
1127 /// Maximum number of lines we will accept in a document. This is simply
1128 /// the maximum number that will fit in a 32-bit space.
1129 pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN;
1131 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<DocumentRecord, Error> {
1132 let offset = r.stream_position()?;
1133 let n: u32 = endian.parse(read_bytes(r)?);
1135 if n > Self::MAX_LINES {
1136 Err(Error::BadDocumentLength {
1139 max: Self::MAX_LINES,
1142 let pos = r.stream_position()?;
1143 let mut lines = Vec::with_capacity(n);
1145 lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
1147 Ok(DocumentRecord { pos, lines })
1152 trait ExtensionRecord
1157 const SIZE: Option<u32>;
1158 const COUNT: Option<u32>;
1159 const NAME: &'static str;
1160 fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
1163 #[derive(Clone, Debug)]
1164 pub struct IntegerInfoRecord {
1165 pub version: (i32, i32, i32),
1166 pub machine_code: i32,
1167 pub floating_point_rep: i32,
1168 pub compression_code: i32,
1169 pub endianness: i32,
1170 pub character_code: i32,
1173 impl ExtensionRecord for IntegerInfoRecord {
1174 const SUBTYPE: u32 = 3;
1175 const SIZE: Option<u32> = Some(4);
1176 const COUNT: Option<u32> = Some(8);
1177 const NAME: &'static str = "integer record";
1179 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1180 ext.check_size::<Self>()?;
1182 let mut input = &ext.data[..];
1183 let data: Vec<i32> = (0..8)
1184 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1186 Ok(IntegerInfoRecord {
1187 version: (data[0], data[1], data[2]),
1188 machine_code: data[3],
1189 floating_point_rep: data[4],
1190 compression_code: data[5],
1191 endianness: data[6],
1192 character_code: data[7],
1197 #[derive(Clone, Debug)]
1198 pub struct FloatInfoRecord {
1204 impl ExtensionRecord for FloatInfoRecord {
1205 const SUBTYPE: u32 = 4;
1206 const SIZE: Option<u32> = Some(8);
1207 const COUNT: Option<u32> = Some(3);
1208 const NAME: &'static str = "floating point record";
1210 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1211 ext.check_size::<Self>()?;
1213 let mut input = &ext.data[..];
1214 let data: Vec<f64> = (0..3)
1215 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1217 Ok(FloatInfoRecord {
1225 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1226 pub enum CategoryLabels {
1231 #[derive(Clone, Debug)]
1232 pub enum MultipleResponseType {
1234 value: UnencodedString,
1235 labels: CategoryLabels,
1240 impl MultipleResponseType {
1241 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
1242 let (mr_type, input) = match input.first() {
1243 Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
1245 let (value, input) = parse_counted_string(&input[1..])?;
1247 MultipleResponseType::MultipleDichotomy {
1249 labels: CategoryLabels::VarLabels,
1255 let Some(b' ') = input.get(1) else {
1256 return Err(Error::TBD);
1258 let input = &input[2..];
1259 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1260 (CategoryLabels::CountedValues, rest)
1261 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1262 (CategoryLabels::VarLabels, rest)
1264 return Err(Error::TBD);
1266 let (value, input) = parse_counted_string(input)?;
1268 MultipleResponseType::MultipleDichotomy {
1275 _ => return Err(Error::TBD),
1277 Ok((mr_type, input))
1281 #[derive(Clone, Debug)]
1282 pub struct MultipleResponseSet {
1283 pub name: UnencodedString,
1284 pub label: UnencodedString,
1285 pub mr_type: MultipleResponseType,
1286 pub short_names: Vec<UnencodedString>,
1289 impl MultipleResponseSet {
1290 fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
1291 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1292 return Err(Error::TBD);
1294 let (name, input) = input.split_at(equals);
1295 let (mr_type, input) = MultipleResponseType::parse(input)?;
1296 let Some(b' ') = input.first() else {
1297 return Err(Error::TBD);
1299 let (label, mut input) = parse_counted_string(&input[1..])?;
1300 let mut vars = Vec::new();
1301 while input.first() == Some(&b' ') {
1302 input = &input[1..];
1303 let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
1304 return Err(Error::TBD);
1307 vars.push(input[..length].into());
1309 input = &input[length..];
1311 if input.first() != Some(&b'\n') {
1312 return Err(Error::TBD);
1314 while input.first() == Some(&b'\n') {
1315 input = &input[1..];
1318 MultipleResponseSet {
1329 #[derive(Clone, Debug)]
1330 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
1332 impl ExtensionRecord for MultipleResponseRecord {
1333 const SUBTYPE: u32 = 7;
1334 const SIZE: Option<u32> = Some(1);
1335 const COUNT: Option<u32> = None;
1336 const NAME: &'static str = "multiple response set record";
1338 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1339 ext.check_size::<Self>()?;
1341 let mut input = &ext.data[..];
1342 let mut sets = Vec::new();
1343 while !input.is_empty() {
1344 let (set, rest) = MultipleResponseSet::parse(input)?;
1348 Ok(MultipleResponseRecord(sets))
1352 fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
1353 let Some(space) = input.iter().position(|&b| b == b' ') else {
1354 return Err(Error::TBD);
1356 let Ok(length) = from_utf8(&input[..space]) else {
1357 return Err(Error::TBD);
1359 let Ok(length): Result<usize, _> = length.parse() else {
1360 return Err(Error::TBD);
1363 let input = &input[space + 1..];
1364 if input.len() < length {
1365 return Err(Error::TBD);
1368 let (string, rest) = input.split_at(length);
1369 Ok((string.into(), rest))
1372 #[derive(Clone, Debug)]
1373 pub struct VarDisplayRecord(pub Vec<u32>);
1375 impl ExtensionRecord for VarDisplayRecord {
1376 const SUBTYPE: u32 = 11;
1377 const SIZE: Option<u32> = Some(4);
1378 const COUNT: Option<u32> = None;
1379 const NAME: &'static str = "variable display record";
1381 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1382 ext.check_size::<Self>()?;
1384 let mut input = &ext.data[..];
1385 let display = (0..ext.count)
1386 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1388 Ok(VarDisplayRecord(display))
1392 pub struct LongStringMissingValues {
1394 pub var_name: UnencodedString,
1397 pub missing_values: MissingValues,
1400 pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
1402 impl ExtensionRecord for LongStringMissingValueSet {
1403 const SUBTYPE: u32 = 22;
1404 const SIZE: Option<u32> = Some(1);
1405 const COUNT: Option<u32> = None;
1406 const NAME: &'static str = "long string missing values record";
/// Parses a long string missing values record.
///
/// The payload is a sequence of entries, each consisting of a
/// length-prefixed variable name, a one-byte missing value count, a
/// 32-bit value length, and then the missing values as 8-byte strings.
/// Entries are read until the payload is exhausted.
///
/// # Errors
///
/// Propagates size-check and read errors, and returns
/// `Error::BadLongMissingValueLength` for an unacceptable value length.
1408 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1409 ext.check_size::<Self>()?;
// The payload is consumed front to back through this shrinking slice.
1411 let mut input = &ext.data[..];
1412 let mut missing_value_set = Vec::new();
1413 while !input.is_empty() {
1414 let var_name = read_string(&mut input, endian)?;
1415 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1416 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
// Offset reported with the error: bytes consumed so far, less 8, added
// to `ext.offset`.
1418 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset;
1419 return Err(Error::BadLongMissingValueLength {
1420 record_offset: ext.offset,
1425 let mut values = Vec::new();
1426 for i in 0..n_missing_values {
// Every missing value occupies exactly 8 bytes.
1427 let value: [u8; 8] = read_bytes(&mut input)?;
// Also decode the same 8 bytes as an integer so that a stray repeated
// length field (the number 8) can be recognized below.
1428 let numeric_value: u64 = endian.parse(value);
1429 let value = if i > 0 && numeric_value == 8 {
1430 // Tolerate files written by old, buggy versions of PSPP
1431 // where we believed that the value_length was repeated
1432 // before each missing value.
1433 read_bytes(&mut input)?
1437 values.push(Value::String(UnencodedStr(value)));
1439 let missing_values = MissingValues {
1443 missing_value_set.push(LongStringMissingValues {
1448 Ok(LongStringMissingValueSet(missing_value_set))
/// Contents of an encoding extension record (subtype 20): the record's
/// payload as a UTF-8 string.
1452 #[derive(Clone, Debug)]
1453 pub struct EncodingRecord(pub String);
1455 impl ExtensionRecord for EncodingRecord {
1456 const SUBTYPE: u32 = 20;
1457 const SIZE: Option<u32> = Some(1);
1458 const COUNT: Option<u32> = None;
1459 const NAME: &'static str = "encoding record";
1461 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1462 ext.check_size::<Self>()?;
1465 String::from_utf8(ext.data.clone())
1466 .map_err(|_| Error::BadEncodingName { offset: ext.offset })?,
/// Contents of an extended number of cases record (subtype 16), which holds
/// two 8-byte values.
1471 #[derive(Clone, Debug)]
1472 pub struct NumberOfCasesRecord {
1473 /// Always observed as 1.
1476 /// Number of cases.
1480 impl ExtensionRecord for NumberOfCasesRecord {
1481 const SUBTYPE: u32 = 16;
1482 const SIZE: Option<u32> = Some(8);
1483 const COUNT: Option<u32> = Some(2);
1484 const NAME: &'static str = "extended number of cases record";
1486 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1487 ext.check_size::<Self>()?;
1489 let mut input = &ext.data[..];
1490 let one = endian.parse(read_bytes(&mut input)?);
1491 let n_cases = endian.parse(read_bytes(&mut input)?);
1493 Ok(NumberOfCasesRecord { one, n_cases })
/// An extension record whose payload is kept as a single block of text that
/// has not yet been decoded.
1497 #[derive(Clone, Debug)]
1498 pub struct TextRecord {
1499 /// Offset from the start of the file to the start of the record.
1502 /// The text content of the record.
1503 pub text: UnencodedString,
1506 impl From<Extension> for TextRecord {
1507 fn from(source: Extension) -> Self {
1509 offset: source.offset,
1510 text: source.data.into(),
/// A generic extension record: its header fields plus the raw payload, before
/// any subtype-specific parsing.
1515 #[derive(Clone, Debug)]
1516 pub struct Extension {
1517 /// Offset from the start of the file to the start of the record.
1523 /// Size of each data element.
1526 /// Number of data elements.
1529 /// `size * count` bytes of data.
/// Checks this record's `size` and `count` against what record type `E`
/// expects.
///
/// A `None` in `E::SIZE` or `E::COUNT` skips the corresponding check.
///
/// # Errors
///
/// Returns `Error::BadRecordSize` if `size` differs from `E::SIZE`, or
/// `Error::BadRecordCount` if `count` differs from `E::COUNT`.
1534 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
1535 if let Some(expected_size) = E::SIZE {
1536 if self.size != expected_size {
1537 return Err(Error::BadRecordSize {
1538 offset: self.offset,
1539 record: E::NAME.into(),
1545 if let Some(expected_count) = E::COUNT {
1546 if self.count != expected_count {
1547 return Err(Error::BadRecordCount {
1548 offset: self.offset,
1549 record: E::NAME.into(),
/// Reads one extension record from `r` and converts it into a `Record`.
///
/// Reads the subtype, element size, and element count, rejects records
/// whose `size * count` overflows `u32`, and then reads that many bytes of
/// payload. Subtypes with a dedicated parser become their own `Record`
/// variant, a fixed set of subtypes is wrapped via `into()`, and anything
/// else is returned as `Record::OtherExtension`.
///
/// # Errors
///
/// Propagates I/O errors and parse errors, and returns
/// `Error::ExtensionRecordTooLarge` when `size * count` overflows.
1558 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1559 let subtype = endian.parse(read_bytes(r)?);
// Position just after the subtype field.
1560 let offset = r.stream_position()?;
1561 let size: u32 = endian.parse(read_bytes(r)?);
1562 let count = endian.parse(read_bytes(r)?);
// Guard the multiplication: both factors come straight from the file.
1563 let Some(product) = size.checked_mul(count) else {
1564 return Err(Error::ExtensionRecordTooLarge {
// Shadows the earlier `offset` with the position where the payload begins.
1571 let offset = r.stream_position()?;
1572 let data = read_vec(r, product as usize)?;
1573 let extension = Extension {
1581 IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse(
1586 FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse(
1591 VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse(
// Subtype 19 is handled by the same parser as
// `MultipleResponseRecord::SUBTYPE`.
1596 MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse(
// The warning callback passed here is a no-op, so warnings are dropped.
1597 MultipleResponseRecord::parse(&extension, endian, |_| ())?,
1599 LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels(
1600 LongStringValueLabelRecord::parse(&extension, endian, |_| ())?,
1602 EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse(
1607 NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse(
// These subtypes are not parsed here; the extension is converted with
// `into()` and wrapped in the matching `Record` variant.
1612 5 => Ok(Record::VariableSets(extension.into())),
1613 10 => Ok(Record::ProductInfo(extension.into())),
1614 13 => Ok(Record::LongNames(extension.into())),
1615 14 => Ok(Record::VeryLongStrings(extension.into())),
1616 17 => Ok(Record::FileAttributes(extension.into())),
1617 18 => Ok(Record::VariableAttributes(extension.into())),
// Unrecognized subtypes are preserved verbatim instead of being rejected.
1618 _ => Ok(Record::OtherExtension(extension)),
/// Header of the ZLIB-compressed data section, giving the location of the
/// ZLIB data header and of the trailer.
1623 #[derive(Clone, Debug)]
1624 pub struct ZHeader {
1625 /// File offset to the start of the record.
1628 /// File offset to the ZLIB data header.
1629 pub zheader_offset: u64,
1631 /// File offset to the ZLIB trailer.
1632 pub ztrailer_offset: u64,
1634 /// Length of the ZLIB trailer in bytes.
1635 pub ztrailer_len: u64,
/// Reads a `ZHeader` from the current position of `r`.
///
/// Records the starting stream position, then reads three consecutive
/// 64-bit fields in `endian` byte order: the ZLIB header offset, the
/// trailer offset, and the trailer length. I/O errors are propagated.
1639 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
1640 let offset = r.stream_position()?;
1641 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
1642 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
1643 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
/// Trailer of the ZLIB-compressed data section, listing the compressed data
/// blocks.
1654 #[derive(Clone, Debug)]
1655 pub struct ZTrailer {
1656 /// File offset to the start of the record.
1659 /// Compression bias as a negative integer, e.g. -100.
1662 /// Always observed as zero.
1665 /// Uncompressed size of each block, except possibly the last. Only
1666 /// `0x3ff000` has been observed so far.
1667 pub block_size: u32,
1669 /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
1670 pub blocks: Vec<ZBlock>,
/// Descriptor for one compressed data block, as listed in the ZLIB trailer.
1673 #[derive(Clone, Debug)]
1675 /// Offset of block of data if simple compression were used.
1676 pub uncompressed_ofs: u64,
1678 /// Actual offset within the file of the compressed data block.
1679 pub compressed_ofs: u64,
1681 /// The number of bytes in this data block after decompression. This is
1682 /// `block_size` in every data block but the last, which may be smaller.
1683 pub uncompressed_size: u32,
1685 /// The number of bytes in this data block, as stored compressed in this
1687 pub compressed_size: u32,
1691 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
1693 uncompressed_ofs: endian.parse(read_bytes(r)?),
1694 compressed_ofs: endian.parse(read_bytes(r)?),
1695 uncompressed_size: endian.parse(read_bytes(r)?),
1696 compressed_size: endian.parse(read_bytes(r)?),
/// Reads the ZLIB trailer located at `ztrailer_ofs`, then seeks `reader`
/// back to where it was on entry.
///
/// The trailer consists of the bias, a zero field, the block size, and a
/// block count, followed by that many `ZBlock` descriptors. The block
/// count must equal `(ztrailer_len - 24) / 24`.
///
/// # Errors
///
/// Returns `Error::BadZlibTrailerNBlocks` if the stored block count does
/// not match the expected one, and propagates I/O errors from reading.
///
/// NOTE(review): `ztrailer_len - 24` is an unsigned subtraction on a value
/// that appears to come from the file; a trailer length below 24 would
/// overflow (panic in debug builds, wrap in release). Consider
/// `checked_sub` — confirm against the caller.
1702 fn read<R: Read + Seek>(
1707 ) -> Result<Option<ZTrailer>, Error> {
// Remember the current position so it can be restored after the trailer
// has been read.
1708 let start_offset = reader.stream_position()?;
// A failed seek is detected here rather than propagated with `?`
// (presumably yielding `Ok(None)` — confirm in the elided lines).
1709 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
1712 let int_bias = endian.parse(read_bytes(reader)?);
1713 let zero = endian.parse(read_bytes(reader)?);
1714 let block_size = endian.parse(read_bytes(reader)?);
1715 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
// Block count implied by the trailer length given by the caller.
1716 let expected_n_blocks = (ztrailer_len - 24) / 24;
1717 if n_blocks as u64 != expected_n_blocks {
1718 return Err(Error::BadZlibTrailerNBlocks {
1719 offset: ztrailer_ofs,
// Read every descriptor, stopping at the first read error.
1725 let blocks = (0..n_blocks)
1726 .map(|_| ZBlock::read(reader, endian))
1727 .collect::<Result<Vec<_>, _>>()?;
1728 reader.seek(SeekFrom::Start(start_offset))?;
1730 offset: ztrailer_ofs,
/// Attempts to read `N` bytes from `r`, returning an `Option` so that the
/// caller can tell "nothing to read" apart from a successful read.
///
/// A single `read` call is made first; `read_exact` then fills whatever part
/// of the buffer that call left unfilled. I/O errors are propagated.
///
/// NOTE(review): presumably returns `Ok(None)` when the first read yields
/// zero bytes — confirm against the elided lines.
1739 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
1740 let mut buf = [0; N];
1741 let n = r.read(&mut buf)?;
// Complete a partial first read; fails if the input ends mid-value.
1744 r.read_exact(&mut buf[n..])?;
/// Reads exactly `N` bytes from `r` and returns them as an array.
///
/// # Errors
///
/// Returns the underlying I/O error, which is `UnexpectedEof` if the input
/// ends before `N` bytes have been read.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
/// Reads exactly `n` bytes from `r` into a freshly allocated vector.
///
/// `n` usually comes straight from the file being parsed, so it is not
/// trusted: the buffer grows as data actually arrives instead of being
/// allocated up front. A corrupt length therefore produces an error rather
/// than an allocation failure or a multi-gigabyte allocation.
///
/// # Errors
///
/// Returns the underlying I/O error, or an `UnexpectedEof` error if the input
/// ends before `n` bytes have been read.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    // Upper bound on speculative preallocation for an unverified length.
    const MAX_PREALLOC: usize = 64 * 1024;

    let mut vec = Vec::with_capacity(n.min(MAX_PREALLOC));
    // `take` caps the read at `n` bytes; `read_to_end` stops early at EOF.
    let n_read = r.by_ref().take(n as u64).read_to_end(&mut vec)?;
    if n_read != n {
        return Err(IoError::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to fill whole buffer",
        ));
    }
    Ok(vec)
}
1764 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
1765 let length: u32 = endian.parse(read_bytes(r)?);
1766 Ok(read_vec(r, length as usize)?.into())
/// Value labels for one long string variable, as read from a long string
/// value labels record.
1769 #[derive(Clone, Debug)]
1770 pub struct LongStringValueLabels {
1771 pub var_name: UnencodedString,
1774 /// `(value, label)` pairs, where each value is `width` bytes.
1775 pub labels: Vec<(UnencodedString, UnencodedString)>,
/// Parsed long string value labels record (subtype 21): one entry per
/// variable listed in the record.
1778 #[derive(Clone, Debug)]
1779 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
1781 impl ExtensionRecord for LongStringValueLabelRecord {
1782 const SUBTYPE: u32 = 21;
1783 const SIZE: Option<u32> = Some(1);
1784 const COUNT: Option<u32> = None;
1785 const NAME: &'static str = "long string value labels record";
/// Parses a long string value labels record.
///
/// The payload is a sequence of entries, each consisting of a
/// length-prefixed variable name, a 32-bit width, a 32-bit label count,
/// and then that many `(value, label)` pairs of length-prefixed strings.
/// Entries are read until the payload is exhausted.
///
/// # Errors
///
/// Propagates the size-check error and any read error, such as a payload
/// that ends in the middle of an entry.
1787 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1788 ext.check_size::<Self>()?;
// The payload is consumed front to back through this shrinking slice.
1790 let mut input = &ext.data[..];
1791 let mut label_set = Vec::new();
1792 while !input.is_empty() {
1793 let var_name = read_string(&mut input, endian)?;
1794 let width: u32 = endian.parse(read_bytes(&mut input)?);
1795 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
1796 let mut labels = Vec::new();
1797 for _ in 0..n_labels {
// Each label is stored as the value string followed by the label string.
1798 let value = read_string(&mut input, endian)?;
1799 let label = read_string(&mut input, endian)?;
1800 labels.push((value, label));
1802 label_set.push(LongStringValueLabels {
1808 Ok(LongStringValueLabelRecord(label_set))