1 use crate::endian::{Endian, Parse, ToBytes};
3 use encoding_rs::mem::decode_latin1;
4 use flate2::read::ZlibDecoder;
7 use std::fmt::{Debug, Formatter, Result as FmtResult};
8 use std::str::from_utf8;
10 collections::VecDeque,
11 io::{Error as IoError, Read, Seek, SeekFrom},
14 use thiserror::Error as ThisError;
16 use self::state::State;
18 #[derive(ThisError, Debug)]
20 #[error("Not an SPSS system file")]
23 #[error("Invalid magic number {0:?}")]
26 #[error("I/O error ({0})")]
29 #[error("Invalid SAV compression code {0}")]
30 InvalidSavCompression(u32),
32 #[error("Invalid ZSAV compression code {0}")]
33 InvalidZsavCompression(u32),
35 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
36 BadVariableWidth { offset: u64, width: i32 },
38 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
39 BadDocumentLength { offset: u64, n: usize, max: usize },
41 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
42 BadRecordType { offset: u64, rec_type: u32 },
44 #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
45 BadVariableLabelCode { offset: u64, code: u32 },
48 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
50 BadNumericMissingValueCode { offset: u64, code: i32 },
52 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
53 BadStringMissingValueCode { offset: u64, code: i32 },
55 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
56 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
58 #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
59 BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
61 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
62 ExtensionRecordTooLarge {
69 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
77 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
79 EofInCompressedCase { offset: u64, case_ofs: u64 },
81 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
82 PartialCompressedCase { offset: u64, case_ofs: u64 },
84 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
85 CompressedNumberExpected { offset: u64, case_ofs: u64 },
87 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
88 CompressedStringExpected { offset: u64, case_ofs: u64 },
90 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
91 BadZlibTrailerNBlocks {
94 expected_n_blocks: u64,
98 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
106 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
114 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
115 BadLongMissingValueLength {
121 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
122 BadEncodingName { offset: u64 },
124 #[error("Details TBD")]
128 #[derive(Clone, Debug)]
130 Header(HeaderRecord),
131 Variable(VariableRecord),
132 ValueLabel(ValueLabelRecord),
133 VarIndexes(VarIndexRecord),
134 Document(DocumentRecord),
135 IntegerInfo(IntegerInfoRecord),
136 FloatInfo(FloatInfoRecord),
137 VariableSets(TextRecord),
138 VarDisplay(VarDisplayRecord),
139 MultipleResponse(MultipleResponseRecord),
140 LongStringValueLabels(LongStringValueLabelRecord),
141 Encoding(EncodingRecord),
142 NumberOfCases(NumberOfCasesRecord),
143 ProductInfo(TextRecord),
144 LongNames(TextRecord),
145 VeryLongStrings(TextRecord),
146 FileAttributes(TextRecord),
147 VariableAttributes(TextRecord),
148 OtherExtension(Extension),
156 fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
157 let rec_type: u32 = endian.parse(read_bytes(reader)?);
159 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)),
160 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)),
161 4 => Ok(Record::VarIndexes(VarIndexRecord::read(reader, endian)?)),
162 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)),
163 7 => Ok(Extension::read(reader, endian)?),
164 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
165 _ => Err(Error::BadRecordType {
166 offset: reader.stream_position()?,
173 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
174 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
175 fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> {
176 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
179 #[derive(Copy, Clone, Debug)]
180 pub enum Compression {
186 pub struct HeaderRecord {
190 /// Eye-catcher string, product name, in the file's encoding. Padded
191 /// on the right with spaces.
192 pub eye_catcher: UnencodedStr<60>,
194 /// Layout code, normally either 2 or 3.
195 pub layout_code: u32,
197 /// Number of variable positions, or `None` if the value in the file is
198 /// questionably trustworthy.
199 pub nominal_case_size: Option<u32>,
201 /// Compression type, if any.
202 pub compression: Option<Compression>,
204 /// 1-based variable index of the weight variable, or `None` if the file is
206 pub weight_index: Option<u32>,
208 /// Claimed number of cases, if known.
209 pub n_cases: Option<u32>,
211 /// Compression bias, usually 100.0.
214 /// `dd mmm yy` in the file's encoding.
215 pub creation_date: UnencodedStr<9>,
217 /// `HH:MM:SS` in the file's encoding.
218 pub creation_time: UnencodedStr<8>,
220 /// File label, in the file's encoding. Padded on the right with spaces.
221 pub file_label: UnencodedStr<64>,
223 /// Endianness of the data in the file header.
228 fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
229 writeln!(f, "{name:>17}: {:?}", value)
233 impl Debug for HeaderRecord {
234 fn fmt(&self, f: &mut Formatter) -> FmtResult {
235 writeln!(f, "File header record:")?;
236 self.debug_field(f, "Magic", self.magic)?;
237 self.debug_field(f, "Product name", &self.eye_catcher)?;
238 self.debug_field(f, "Layout code", self.layout_code)?;
239 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
240 self.debug_field(f, "Compression", self.compression)?;
241 self.debug_field(f, "Weight index", self.weight_index)?;
242 self.debug_field(f, "Number of cases", self.n_cases)?;
243 self.debug_field(f, "Compression bias", self.bias)?;
244 self.debug_field(f, "Creation date", &self.creation_date)?;
245 self.debug_field(f, "Creation time", &self.creation_time)?;
246 self.debug_field(f, "File label", &self.file_label)?;
247 self.debug_field(f, "Endianness", self.endian)
252 fn read<R: Read>(r: &mut R) -> Result<HeaderRecord, Error> {
253 let magic: [u8; 4] = read_bytes(r)?;
254 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
256 let eye_catcher = UnencodedStr::<60>(read_bytes(r)?);
257 let layout_code: [u8; 4] = read_bytes(r)?;
258 let endian = Endian::identify_u32(2, layout_code)
259 .or_else(|| Endian::identify_u32(2, layout_code))
260 .ok_or_else(|| Error::NotASystemFile)?;
261 let layout_code = endian.parse(layout_code);
263 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
264 let nominal_case_size =
265 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
267 let compression_code: u32 = endian.parse(read_bytes(r)?);
268 let compression = match (magic, compression_code) {
269 (Magic::ZSAV, 2) => Some(Compression::ZLib),
270 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
272 (_, 1) => Some(Compression::Simple),
273 (_, code) => return Err(Error::InvalidSavCompression(code)),
276 let weight_index: u32 = endian.parse(read_bytes(r)?);
277 let weight_index = (weight_index > 0).then_some(weight_index);
279 let n_cases: u32 = endian.parse(read_bytes(r)?);
280 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
282 let bias: f64 = endian.parse(read_bytes(r)?);
284 let creation_date = UnencodedStr::<9>(read_bytes(r)?);
285 let creation_time = UnencodedStr::<8>(read_bytes(r)?);
286 let file_label = UnencodedStr::<64>(read_bytes(r)?);
287 let _: [u8; 3] = read_bytes(r)?;
306 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
307 pub struct Magic([u8; 4]);
310 /// Magic number for a regular system file.
311 pub const SAV: Magic = Magic(*b"$FL2");
313 /// Magic number for a system file that contains zlib-compressed data.
314 pub const ZSAV: Magic = Magic(*b"$FL3");
316 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
318 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
321 impl Debug for Magic {
322 fn fmt(&self, f: &mut Formatter) -> FmtResult {
324 &Magic::SAV => "$FL2",
325 &Magic::ZSAV => "$FL3",
326 &Magic::EBCDIC => "($FL2 in EBCDIC)",
327 _ => return write!(f, "{:?}", self.0),
333 impl TryFrom<[u8; 4]> for Magic {
336 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
337 let magic = Magic(value);
339 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
340 _ => Err(Error::BadMagic(value)),
345 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
352 fn from_width(width: i32) -> VarType {
354 0 => VarType::Numeric,
355 _ => VarType::String,
362 Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader,
363 ZTrailer, ZlibDecodeMultiple,
365 use crate::endian::Endian;
367 collections::VecDeque,
372 #[allow(clippy::type_complexity)]
373 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
376 struct Start<R: Read + Seek> {
380 pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
381 Box::new(Start { reader })
384 struct CommonState<R: Read + Seek> {
388 compression: Option<Compression>,
389 var_types: Vec<VarType>,
392 impl<R: Read + Seek + 'static> State for Start<R> {
393 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
394 let header = HeaderRecord::read(&mut self.reader)?;
395 let next_state = Headers(CommonState {
397 endian: header.endian,
399 compression: header.compression,
400 var_types: Vec::new(),
402 Ok(Some((Record::Header(header), Box::new(next_state))))
406 struct Headers<R: Read + Seek>(CommonState<R>);
408 impl<R: Read + Seek + 'static> State for Headers<R> {
409 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
410 let record = Record::read(&mut self.0.reader, self.0.endian)?;
412 Record::Variable(VariableRecord { width, .. }) => {
413 self.0.var_types.push(VarType::from_width(width));
415 Record::EndOfHeaders(_) => {
416 let next_state: Box<dyn State> = match self.0.compression {
417 None => Box::new(Data(self.0)),
418 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
419 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
421 return Ok(Some((record, next_state)));
425 Ok(Some((record, self)))
429 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
431 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
432 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
433 let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
434 Ok(Some((Record::ZHeader(zheader), self)))
438 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
440 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
441 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
442 let retval = ZTrailer::read(
445 self.1.ztrailer_offset,
448 let next_state = Box::new(CompressedData::new(CommonState {
449 reader: ZlibDecodeMultiple::new(self.0.reader),
450 endian: self.0.endian,
452 compression: self.0.compression,
453 var_types: self.0.var_types,
456 None => next_state.read(),
457 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
462 struct Data<R: Read + Seek>(CommonState<R>);
464 impl<R: Read + Seek + 'static> State for Data<R> {
465 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
466 match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
468 Some(values) => Ok(Some((Record::Case(values), self))),
473 struct CompressedData<R: Read + Seek> {
474 common: CommonState<R>,
478 impl<R: Read + Seek + 'static> CompressedData<R> {
479 fn new(common: CommonState<R>) -> CompressedData<R> {
482 codes: VecDeque::new(),
487 impl<R: Read + Seek + 'static> State for CompressedData<R> {
488 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
489 match Value::read_compressed_case(
490 &mut self.common.reader,
491 &self.common.var_types,
497 Some(values) => Ok(Some((Record::Case(values), self))),
503 #[derive(Copy, Clone)]
506 String(UnencodedStr<8>),
509 impl Debug for Value {
510 fn fmt(&self, f: &mut Formatter) -> FmtResult {
512 Value::Number(Some(number)) => write!(f, "{number:?}"),
513 Value::Number(None) => write!(f, "SYSMIS"),
514 Value::String(bytes) => write!(f, "{:?}", bytes),
520 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
522 UntypedValue(read_bytes(r)?),
528 pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value {
530 VarType::String => Value::String(UnencodedStr(raw.0)),
531 VarType::Numeric => {
532 let number: f64 = endian.parse(raw.0);
533 Value::Number((number != -f64::MAX).then_some(number))
538 fn read_case<R: Read + Seek>(
540 var_types: &[VarType],
542 ) -> Result<Option<Vec<Value>>, Error> {
543 let case_start = reader.stream_position()?;
544 let mut values = Vec::with_capacity(var_types.len());
545 for (i, &var_type) in var_types.iter().enumerate() {
546 let Some(raw) = try_read_bytes(reader)? else {
550 let offset = reader.stream_position()?;
551 return Err(Error::EofInCase {
553 case_ofs: offset - case_start,
554 case_len: var_types.len() * 8,
558 values.push(Value::from_raw(UntypedValue(raw), var_type, endian));
563 fn read_compressed_case<R: Read + Seek>(
565 var_types: &[VarType],
566 codes: &mut VecDeque<u8>,
569 ) -> Result<Option<Vec<Value>>, Error> {
570 let case_start = reader.stream_position()?;
571 let mut values = Vec::with_capacity(var_types.len());
572 for (i, &var_type) in var_types.iter().enumerate() {
574 let Some(code) = codes.pop_front() else {
575 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
579 let offset = reader.stream_position()?;
580 return Err(Error::EofInCompressedCase {
582 case_ofs: offset - case_start,
586 codes.extend(new_codes.into_iter());
591 1..=251 => match var_type {
592 VarType::Numeric => break Value::Number(Some(code as f64 - bias)),
594 break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias)))
601 let offset = reader.stream_position()?;
602 return Err(Error::PartialCompressedCase {
604 case_ofs: offset - case_start,
609 break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian)
611 254 => match var_type {
612 VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
613 VarType::Numeric => {
614 return Err(Error::CompressedStringExpected {
616 case_ofs: reader.stream_position()? - case_start,
620 255 => match var_type {
621 VarType::Numeric => break Value::Number(None),
623 return Err(Error::CompressedNumberExpected {
625 case_ofs: reader.stream_position()? - case_start,
637 struct ZlibDecodeMultiple<R>
641 reader: Option<ZlibDecoder<R>>,
644 impl<R> ZlibDecodeMultiple<R>
648 fn new(reader: R) -> ZlibDecodeMultiple<R> {
650 reader: Some(ZlibDecoder::new(reader)),
655 impl<R> Read for ZlibDecodeMultiple<R>
659 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
661 match self.reader.as_mut().unwrap().read(buf)? {
663 let inner = self.reader.take().unwrap().into_inner();
664 self.reader = Some(ZlibDecoder::new(inner));
672 impl<R> Seek for ZlibDecodeMultiple<R>
676 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
677 self.reader.as_mut().unwrap().get_mut().seek(pos)
682 state: Option<Box<dyn State>>,
686 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
688 state: Some(state::new(reader)),
691 pub fn collect_headers(&mut self) -> Result<Vec<Record>, Error> {
692 let mut headers = Vec::new();
695 Record::EndOfHeaders(_) => break,
696 r => headers.push(r),
703 impl Iterator for Reader {
704 type Item = Result<Record, Error>;
706 fn next(&mut self) -> Option<Self::Item> {
707 match self.state.take()?.read() {
708 Ok(Some((record, next_state))) => {
709 self.state = Some(next_state);
713 Err(error) => Some(Err(error)),
718 impl FusedIterator for Reader {}
720 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
721 pub struct Spec(pub u32);
723 impl Debug for Spec {
724 fn fmt(&self, f: &mut Formatter) -> FmtResult {
725 let type_ = format_name(self.0 >> 16);
726 let w = (self.0 >> 8) & 0xff;
727 let d = self.0 & 0xff;
728 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
732 fn format_name(type_: u32) -> Cow<'static, str> {
771 _ => return format!("<unknown format {type_}>").into(),
777 pub struct MissingValues {
778 /// Individual missing values, up to 3 of them.
779 pub values: Vec<Value>,
781 /// Optional range of missing values.
782 pub range: Option<(Value, Value)>,
785 impl Debug for MissingValues {
786 fn fmt(&self, f: &mut Formatter) -> FmtResult {
787 for (i, value) in self.values.iter().enumerate() {
791 write!(f, "{value:?}")?;
794 if let Some((low, high)) = self.range {
795 if !self.values.is_empty() {
798 write!(f, "{low:?} THRU {high:?}")?;
810 fn is_empty(&self) -> bool {
811 self.values.is_empty() && self.range.is_none()
814 fn read<R: Read + Seek>(
820 ) -> Result<MissingValues, Error> {
821 let (n_values, has_range) = match (width, code) {
822 (_, 0..=3) => (code, false),
823 (0, -2) => (0, true),
824 (0, -3) => (1, true),
825 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
826 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
829 let var_type = VarType::from_width(width);
831 let mut values = Vec::new();
832 for _ in 0..n_values {
833 values.push(Value::read(r, var_type, endian)?);
835 let range = if has_range {
836 let low = Value::read(r, var_type, endian)?;
837 let high = Value::read(r, var_type, endian)?;
842 Ok(MissingValues { values, range })
847 pub struct VariableRecord {
848 /// Offset from the start of the file to the start of the record.
851 /// Variable width, in the range -1..=255.
854 /// Variable name, padded on the right with spaces.
855 pub name: UnencodedStr<8>,
858 pub print_format: Spec,
861 pub write_format: Spec,
864 pub missing_values: MissingValues,
866 /// Optional variable label.
867 pub label: Option<UnencodedString>,
870 impl Debug for VariableRecord {
871 fn fmt(&self, f: &mut Formatter) -> FmtResult {
878 } else if self.width == 0 {
881 "long string continuation record"
884 writeln!(f, "Print format: {:?}", self.print_format)?;
885 writeln!(f, "Write format: {:?}", self.write_format)?;
886 writeln!(f, "Name: {:?}", &self.name)?;
887 writeln!(f, "Variable label: {:?}", self.label)?;
888 writeln!(f, "Missing values: {:?}", self.missing_values)
892 impl VariableRecord {
893 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VariableRecord, Error> {
894 let offset = r.stream_position()?;
895 let width: i32 = endian.parse(read_bytes(r)?);
896 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
897 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
898 let print_format = Spec(endian.parse(read_bytes(r)?));
899 let write_format = Spec(endian.parse(read_bytes(r)?));
900 let name = UnencodedStr::<8>(read_bytes(r)?);
902 let label = match has_variable_label {
905 let len: u32 = endian.parse(read_bytes(r)?);
906 let read_len = len.min(65535) as usize;
907 let label = UnencodedString(read_vec(r, read_len)?);
909 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
910 let _ = read_vec(r, padding_bytes as usize)?;
915 return Err(Error::BadVariableLabelCode {
917 code: has_variable_label,
922 let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
936 #[derive(Copy, Clone)]
937 pub struct UntypedValue(pub [u8; 8]);
939 impl Debug for UntypedValue {
940 fn fmt(&self, f: &mut Formatter) -> FmtResult {
941 let little: f64 = Endian::Little.parse(self.0);
942 let little = format!("{:?}", little);
943 let big: f64 = Endian::Big.parse(self.0);
944 let big = format!("{:?}", big);
945 let number = if little.len() <= big.len() {
950 write!(f, "{number}")?;
952 let string = default_decode(&self.0);
954 .split(|c: char| c == '\0' || c.is_control())
957 write!(f, "{string:?}")?;
963 pub struct UnencodedString(pub Vec<u8>);
965 impl From<Vec<u8>> for UnencodedString {
966 fn from(source: Vec<u8>) -> Self {
971 impl From<&[u8]> for UnencodedString {
972 fn from(source: &[u8]) -> Self {
977 impl Debug for UnencodedString {
978 fn fmt(&self, f: &mut Formatter) -> FmtResult {
979 write!(f, "{:?}", default_decode(self.0.as_slice()))
983 #[derive(Copy, Clone)]
984 pub struct UnencodedStr<const N: usize>(pub [u8; N]);
986 impl<const N: usize> From<[u8; N]> for UnencodedStr<N> {
987 fn from(source: [u8; N]) -> Self {
992 impl<const N: usize> Debug for UnencodedStr<N> {
993 fn fmt(&self, f: &mut Formatter) -> FmtResult {
994 write!(f, "{:?}", default_decode(&self.0))
999 pub struct ValueLabelRecord {
1000 /// Offset from the start of the file to the start of the record.
1004 pub labels: Vec<(UntypedValue, UnencodedString)>,
1007 impl Debug for ValueLabelRecord {
1008 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1009 for (value, label) in self.labels.iter() {
1010 writeln!(f, "{value:?}: {label:?}")?;
1016 impl ValueLabelRecord {
1017 /// Maximum number of value labels in a record.
1018 pub const MAX: u32 = u32::MAX / 8;
1020 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabelRecord, Error> {
1021 let offset = r.stream_position()?;
1022 let n: u32 = endian.parse(read_bytes(r)?);
1023 if n > ValueLabelRecord::MAX {
1024 return Err(Error::BadNumberOfValueLabels {
1027 max: ValueLabelRecord::MAX,
1031 let mut labels = Vec::new();
1033 let value = UntypedValue(read_bytes(r)?);
1034 let label_len: u8 = endian.parse(read_bytes(r)?);
1035 let label_len = label_len as usize;
1036 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1038 let mut label = read_vec(r, padded_len - 1)?;
1039 label.truncate(label_len);
1040 labels.push((value, UnencodedString(label)));
1042 Ok(ValueLabelRecord { offset, labels })
1047 pub struct VarIndexRecord {
1048 /// Offset from the start of the file to the start of the record.
1051 /// The 1-based dictionary indexes of the variables.
1052 pub dict_indexes: Vec<u32>,
1055 impl Debug for VarIndexRecord {
1056 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1057 write!(f, "apply to variables")?;
1058 for dict_index in self.dict_indexes.iter() {
1059 write!(f, " #{dict_index}")?;
1065 impl VarIndexRecord {
1066 /// Maximum number of variable indexes in a record.
1067 pub const MAX: u32 = u32::MAX / 8;
1069 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexRecord, Error> {
1070 let offset = r.stream_position()?;
1071 let n: u32 = endian.parse(read_bytes(r)?);
1072 if n > VarIndexRecord::MAX {
1073 return Err(Error::BadNumberOfVarIndexes {
1076 max: VarIndexRecord::MAX,
1079 let mut dict_indexes = Vec::with_capacity(n as usize);
1081 dict_indexes.push(endian.parse(read_bytes(r)?));
1091 #[derive(Clone, Debug)]
1092 pub struct DocumentRecord {
1093 /// Offset from the start of the file to the start of the record.
1096 /// The document, as an array of 80-byte lines.
1097 pub lines: Vec<DocumentLine>,
1100 pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>;
1102 impl DocumentRecord {
1103 /// Length of a line in a document. Document lines are fixed-length and
1104 /// padded on the right with spaces.
1105 pub const LINE_LEN: usize = 80;
1107 /// Maximum number of lines we will accept in a document. This is simply
1108 /// the maximum number that will fit in a 32-bit space.
1109 pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN;
1111 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<DocumentRecord, Error> {
1112 let offset = r.stream_position()?;
1113 let n: u32 = endian.parse(read_bytes(r)?);
1115 if n > Self::MAX_LINES {
1116 Err(Error::BadDocumentLength {
1119 max: Self::MAX_LINES,
1122 let pos = r.stream_position()?;
1123 let mut lines = Vec::with_capacity(n);
1125 lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
1127 Ok(DocumentRecord { pos, lines })
1132 trait ExtensionRecord
1137 const SIZE: Option<u32>;
1138 const COUNT: Option<u32>;
1139 const NAME: &'static str;
1140 fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
1143 #[derive(Clone, Debug)]
1144 pub struct IntegerInfoRecord {
1145 pub version: (i32, i32, i32),
1146 pub machine_code: i32,
1147 pub floating_point_rep: i32,
1148 pub compression_code: i32,
1149 pub endianness: i32,
1150 pub character_code: i32,
1153 impl ExtensionRecord for IntegerInfoRecord {
1154 const SUBTYPE: u32 = 3;
1155 const SIZE: Option<u32> = Some(4);
1156 const COUNT: Option<u32> = Some(8);
1157 const NAME: &'static str = "integer record";
1159 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1160 ext.check_size::<Self>()?;
1162 let mut input = &ext.data[..];
1163 let data: Vec<i32> = (0..8)
1164 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1166 Ok(IntegerInfoRecord {
1167 version: (data[0], data[1], data[2]),
1168 machine_code: data[3],
1169 floating_point_rep: data[4],
1170 compression_code: data[5],
1171 endianness: data[6],
1172 character_code: data[7],
1177 #[derive(Clone, Debug)]
1178 pub struct FloatInfoRecord {
1184 impl ExtensionRecord for FloatInfoRecord {
1185 const SUBTYPE: u32 = 4;
1186 const SIZE: Option<u32> = Some(8);
1187 const COUNT: Option<u32> = Some(3);
1188 const NAME: &'static str = "floating point record";
1190 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1191 ext.check_size::<Self>()?;
1193 let mut input = &ext.data[..];
1194 let data: Vec<f64> = (0..3)
1195 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1197 Ok(FloatInfoRecord {
1205 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1206 pub enum CategoryLabels {
1211 #[derive(Clone, Debug)]
1212 pub enum MultipleResponseType {
1214 value: UnencodedString,
1215 labels: CategoryLabels,
1220 impl MultipleResponseType {
1221 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
1222 let (mr_type, input) = match input.get(0) {
1223 Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
1225 let (value, input) = parse_counted_string(&input[1..])?;
1227 MultipleResponseType::MultipleDichotomy {
1228 value: value.into(),
1229 labels: CategoryLabels::VarLabels,
1235 let Some(b' ') = input.get(1) else {
1236 return Err(Error::TBD);
1238 let input = &input[2..];
1239 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1240 (CategoryLabels::CountedValues, rest)
1241 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1242 (CategoryLabels::VarLabels, rest)
1244 return Err(Error::TBD);
1246 let (value, input) = parse_counted_string(input)?;
1248 MultipleResponseType::MultipleDichotomy {
1249 value: value.into(),
1255 _ => return Err(Error::TBD),
1257 Ok((mr_type, input))
1261 #[derive(Clone, Debug)]
1262 pub struct MultipleResponseSet {
1263 pub name: UnencodedString,
1264 pub label: UnencodedString,
1265 pub mr_type: MultipleResponseType,
1266 pub short_names: Vec<UnencodedString>,
1269 impl MultipleResponseSet {
1270 fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
1271 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1272 return Err(Error::TBD);
1274 let (name, input) = input.split_at(equals);
1275 let (mr_type, input) = MultipleResponseType::parse(input)?;
1276 let Some(b' ') = input.get(0) else {
1277 return Err(Error::TBD);
1279 let (label, mut input) = parse_counted_string(&input[1..])?;
1280 let mut vars = Vec::new();
1281 while input.get(0) == Some(&b' ') {
1282 input = &input[1..];
1283 let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
1284 return Err(Error::TBD);
1287 vars.push(input[..length].into());
1289 input = &input[length..];
1291 if input.get(0) != Some(&b'\n') {
1292 return Err(Error::TBD);
1294 while input.get(0) == Some(&b'\n') {
1295 input = &input[1..];
1298 MultipleResponseSet {
1300 label: label.into(),
1309 #[derive(Clone, Debug)]
1310 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
1312 impl ExtensionRecord for MultipleResponseRecord {
1313 const SUBTYPE: u32 = 7;
1314 const SIZE: Option<u32> = Some(1);
1315 const COUNT: Option<u32> = None;
1316 const NAME: &'static str = "multiple response set record";
1318 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1319 ext.check_size::<Self>()?;
1321 let mut input = &ext.data[..];
1322 let mut sets = Vec::new();
1323 while !input.is_empty() {
1324 let (set, rest) = MultipleResponseSet::parse(input)?;
1328 Ok(MultipleResponseRecord(sets))
1332 fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
1333 let Some(space) = input.iter().position(|&b| b == b' ') else {
1334 return Err(Error::TBD);
1336 let Ok(length) = from_utf8(&input[..space]) else {
1337 return Err(Error::TBD);
1339 let Ok(length): Result<usize, _> = length.parse() else {
1340 return Err(Error::TBD);
1343 let input = &input[space + 1..];
1344 if input.len() < length {
1345 return Err(Error::TBD);
1348 let (string, rest) = input.split_at(length);
1349 Ok((string.into(), rest))
1352 #[derive(Clone, Debug)]
1353 pub struct VarDisplayRecord(pub Vec<u32>);
1355 impl ExtensionRecord for VarDisplayRecord {
1356 const SUBTYPE: u32 = 11;
1357 const SIZE: Option<u32> = Some(4);
1358 const COUNT: Option<u32> = None;
1359 const NAME: &'static str = "variable display record";
1361 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1362 ext.check_size::<Self>()?;
1364 let mut input = &ext.data[..];
1365 let display = (0..ext.count)
1366 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1368 Ok(VarDisplayRecord(display))
1372 pub struct LongStringMissingValues {
1374 pub var_name: UnencodedString,
1377 pub missing_values: MissingValues,
/// Parsed long string missing values record (extension subtype 22): one
/// `LongStringMissingValues` entry per variable listed in the record.
1380 pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
1382 impl ExtensionRecord for LongStringMissingValueSet {
1383 const SUBTYPE: u32 = 22;
// The payload is a byte stream: element size 1, no fixed element count.
1384 const SIZE: Option<u32> = Some(1);
1385 const COUNT: Option<u32> = None;
1386 const NAME: &'static str = "long string missing values record";
/// Parses `ext.data` as a sequence of entries.  Each entry is a
/// length-prefixed variable name, a one-byte count of missing values, a
/// 32-bit value length, and then the missing values as 8-byte strings.
///
/// # Errors
///
/// Fails if `ext` does not have the expected element size, if the data
/// ends in the middle of an entry, or with
/// `Error::BadLongMissingValueLength` for an unacceptable value length.
1388 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1389 ext.check_size::<Self>()?;
// `input` is a shrinking view of the payload; each read advances it.
1391 let mut input = &ext.data[..];
1392 let mut missing_value_set = Vec::new();
// Keep consuming entries until the payload is exhausted.
1393 while !input.is_empty() {
1394 let var_name = read_string(&mut input, endian)?;
1395 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1396 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
// NOTE(review): `value_len` was just read as 4 bytes, so subtracting 8
// from the consumed length appears to point 4 bytes before that field --
// confirm which position this error offset is meant to report.
1398 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset;
1399 return Err(Error::BadLongMissingValueLength {
1400 record_offset: ext.offset,
1405 let mut values = Vec::new();
1406 for i in 0..n_missing_values {
// Every missing value occupies exactly 8 bytes.
1407 let value: [u8; 8] = read_bytes(&mut input)?;
// Also decode the same bytes as an integer so that a stray repeated
// length field (the number 8) can be recognized below.
1408 let numeric_value: u64 = endian.parse(value);
1409 let value = if i > 0 && numeric_value == 8 {
1410 // Tolerate files written by old, buggy versions of PSPP
1411 // where we believed that the value_length was repeated
1412 // before each missing value.
// In that case the real value is the following 8 bytes.
1413 read_bytes(&mut input)?
1417 values.push(Value::String(UnencodedStr(value)));
1419 let missing_values = MissingValues {
1423 missing_value_set.push(LongStringMissingValues {
1428 Ok(LongStringMissingValueSet(missing_value_set))
/// Contents of an encoding record (extension subtype 20): the record's data
/// bytes as a UTF-8 string.
#[derive(Debug, Clone)]
pub struct EncodingRecord(pub String);
1435 impl ExtensionRecord for EncodingRecord {
1436 const SUBTYPE: u32 = 20;
1437 const SIZE: Option<u32> = Some(1);
1438 const COUNT: Option<u32> = None;
1439 const NAME: &'static str = "encoding record";
1441 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1442 ext.check_size::<Self>()?;
1445 String::from_utf8(ext.data.clone())
1446 .map_err(|_| Error::BadEncodingName { offset: ext.offset })?,
/// Contents of an extended number of cases record (extension subtype 16),
/// which consists of two 8-byte elements.
1451 #[derive(Clone, Debug)]
1452 pub struct NumberOfCasesRecord {
1453 /// Always observed as 1.
1456 /// Number of cases.
1460 impl ExtensionRecord for NumberOfCasesRecord {
1461 const SUBTYPE: u32 = 16;
1462 const SIZE: Option<u32> = Some(8);
1463 const COUNT: Option<u32> = Some(2);
1464 const NAME: &'static str = "extended number of cases record";
1466 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1467 ext.check_size::<Self>()?;
1469 let mut input = &ext.data[..];
1470 let one = endian.parse(read_bytes(&mut input)?);
1471 let n_cases = endian.parse(read_bytes(&mut input)?);
1473 Ok(NumberOfCasesRecord { one, n_cases })
/// An extension record whose data is kept as text; built from an
/// `Extension` by carrying over its offset and data bytes.
1477 #[derive(Clone, Debug)]
1478 pub struct TextRecord {
1479 /// Offset from the start of the file to the start of the record.
1482 /// The text content of the record.
1483 pub text: UnencodedString,
1486 impl From<Extension> for TextRecord {
1487 fn from(source: Extension) -> Self {
1489 offset: source.offset,
1490 text: source.data.into(),
/// An extension record as read from the file, before any subtype-specific
/// interpretation: a header describing the element size and count, plus the
/// raw data bytes.
1495 #[derive(Clone, Debug)]
1496 pub struct Extension {
1497 /// Offset from the start of the file to the start of the record.
1503 /// Size of each data element.
1506 /// Number of data elements.
1509 /// `size * count` bytes of data.
/// Checks this record's element size and count against what record type `E`
/// requires.  A requirement of `None` (see `E::SIZE` and `E::COUNT`) means
/// that any value is accepted.
///
/// # Errors
///
/// Returns `Error::BadRecordSize` on an element size mismatch and
/// `Error::BadRecordCount` on an element count mismatch; both name the
/// record type via `E::NAME`.
1514 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
// Element size is checked only when `E` prescribes one.
1515 if let Some(expected_size) = E::SIZE {
1516 if self.size != expected_size {
1517 return Err(Error::BadRecordSize {
1518 offset: self.offset,
1519 record: E::NAME.into(),
// Likewise for the element count.
1525 if let Some(expected_count) = E::COUNT {
1526 if self.count != expected_count {
1527 return Err(Error::BadRecordCount {
1528 offset: self.offset,
1529 record: E::NAME.into(),
/// Reads one extension record from `r`: the subtype, element size and
/// element count, followed by `size * count` bytes of data.  The result is
/// then dispatched on the subtype to the matching record parser; subtypes
/// without a dedicated parser are returned with their data unparsed.
///
/// # Errors
///
/// Fails on I/O errors, with `Error::ExtensionRecordTooLarge` if
/// `size * count` overflows `u32`, or with whatever error the
/// subtype-specific parser reports.
1538 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1539 let subtype = endian.parse(read_bytes(r)?);
// Position just after the subtype field.
1540 let offset = r.stream_position()?;
1541 let size: u32 = endian.parse(read_bytes(r)?);
1542 let count = endian.parse(read_bytes(r)?);
// Reject records whose total data size does not fit in a `u32`.
1543 let Some(product) = size.checked_mul(count) else {
1544 return Err(Error::ExtensionRecordTooLarge {
// Shadows the earlier `offset`: from here on it is the position just past
// the size and count fields, where the data begins.
1551 let offset = r.stream_position()?;
// NOTE(review): `product` comes straight from the file and `read_vec`
// allocates that many bytes before reading any -- confirm this is acceptable
// for corrupt or hostile input.
1552 let data = read_vec(r, product as usize)?;
1553 let extension = Extension {
// Subtypes with a dedicated parser.  Where the warning callback is visible
// it is `|_| ()`, which discards warnings.
1561 IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse(
1566 FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse(
1571 VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse(
// Subtype 19 is handled by the same parser as
// `MultipleResponseRecord::SUBTYPE`.
1576 MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse(
1577 MultipleResponseRecord::parse(&extension, endian, |_| ())?,
1579 LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels(
1580 LongStringValueLabelRecord::parse(&extension, endian, |_| ())?,
1582 EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse(
1587 NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse(
// These subtypes are converted with `.into()` rather than parsed here
// (presumably into `TextRecord` -- confirm against the `Record` variants).
1592 5 => Ok(Record::VariableSets(extension.into())),
1593 10 => Ok(Record::ProductInfo(extension.into())),
1594 13 => Ok(Record::LongNames(extension.into())),
1595 14 => Ok(Record::VeryLongStrings(extension.into())),
1596 17 => Ok(Record::FileAttributes(extension.into())),
1597 18 => Ok(Record::VariableAttributes(extension.into())),
// Anything else is kept as-is instead of being rejected.
// NOTE(review): no arm for `LongStringMissingValueSet::SUBTYPE` (22) is
// visible, so those records appear to land here -- confirm that is intended.
1598 _ => Ok(Record::OtherExtension(extension)),
/// Header of the ZLIB-compressed data: three 64-bit values that
/// `ZHeader::read` reads consecutively, locating the ZLIB data header and
/// the ZLIB trailer.
1603 #[derive(Clone, Debug)]
1604 pub struct ZHeader {
1605 /// File offset to the start of the record.
1608 /// File offset to the ZLIB data header.
1609 pub zheader_offset: u64,
1611 /// File offset to the ZLIB trailer.
1612 pub ztrailer_offset: u64,
1614 /// Length of the ZLIB trailer in bytes.
1615 pub ztrailer_len: u64,
1619 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
1620 let offset = r.stream_position()?;
1621 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
1622 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
1623 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
/// Trailer of the ZLIB-compressed data: a few fixed fields followed by one
/// `ZBlock` descriptor per compressed block.
1634 #[derive(Clone, Debug)]
1635 pub struct ZTrailer {
1636 /// File offset to the start of the record.
1639 /// Compression bias as a negative integer, e.g. -100.
1642 /// Always observed as zero.
1645 /// Uncompressed size of each block, except possibly the last. Only
1646 /// `0x3ff000` has been observed so far.
1647 pub block_size: u32,
1649 /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
1650 pub blocks: Vec<ZBlock>,
/// Descriptor of one compressed data block, as listed in the ZLIB trailer:
/// two 64-bit offsets followed by two 32-bit sizes.
1653 #[derive(Clone, Debug)]
1655 /// Offset of block of data if simple compression were used.
1656 pub uncompressed_ofs: u64,
1658 /// Actual offset within the file of the compressed data block.
1659 pub compressed_ofs: u64,
1661 /// The number of bytes in this data block after decompression. This is
1662 /// `block_size` in every data block but the last, which may be smaller.
1663 pub uncompressed_size: u32,
1665 /// The number of bytes in this data block, as stored compressed in this
/// file.
1667 pub compressed_size: u32,
1671 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
1673 uncompressed_ofs: endian.parse(read_bytes(r)?),
1674 compressed_ofs: endian.parse(read_bytes(r)?),
1675 uncompressed_size: endian.parse(read_bytes(r)?),
1676 compressed_size: endian.parse(read_bytes(r)?),
/// Reads the ZLIB trailer stored at file offset `ztrailer_ofs`, including
/// all of its block descriptors, and then seeks `reader` back to the
/// position it had on entry.
///
/// # Errors
///
/// Fails on I/O errors while reading the trailer, or with
/// `Error::BadZlibTrailerNBlocks` if the block count stored in the trailer
/// disagrees with the count implied by `ztrailer_len`.
1682 fn read<R: Read + Seek>(
1687 ) -> Result<Option<ZTrailer>, Error> {
// Remember where the caller was so the position can be restored afterward.
1688 let start_offset = reader.stream_position()?;
// NOTE(review): a failed seek is deliberately not propagated with `?`;
// presumably this is the `None` case of the return type -- confirm.
1689 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
1692 let int_bias = endian.parse(read_bytes(reader)?);
1693 let zero = endian.parse(read_bytes(reader)?);
1694 let block_size = endian.parse(read_bytes(reader)?);
1695 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
// NOTE(review): this subtraction underflows if `ztrailer_len` is below 24
// (a panic in debug builds, wraparound in release) -- confirm that callers
// validate the length first.
1696 let expected_n_blocks = (ztrailer_len - 24) / 24;
1697 if n_blocks as u64 != expected_n_blocks {
1698 return Err(Error::BadZlibTrailerNBlocks {
1699 offset: ztrailer_ofs,
// Read the descriptors one after another, stopping at the first failure.
1705 let blocks = (0..n_blocks)
1706 .map(|_| ZBlock::read(reader, endian))
1707 .collect::<Result<Vec<_>, _>>()?;
// Put the reader back where the caller left it.
1708 reader.seek(SeekFrom::Start(start_offset))?;
1710 offset: ztrailer_ofs,
/// Tries to read exactly `N` bytes from `r`.
///
/// Returns `Ok(None)` if `r` is already at end of input, so that callers can
/// tell a clean end of data from a truncated item, and `Ok(Some(bytes))`
/// once all `N` bytes have been read.
///
/// # Errors
///
/// Returns an error of kind `UnexpectedEof` if the input ends after at least
/// one byte but before `N` bytes, or any other I/O error reported by `r`.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0u8; N];

    // `Read::read` may fail spuriously with `Interrupted`; retry in that
    // case, as `read_exact` below already does internally.
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    };
    if n == 0 {
        return Ok(None);
    }

    // A short first read is not end of input: insist on the remainder.
    if n < N {
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` and returns them as an array.
///
/// # Errors
///
/// Returns whatever error `Read::read_exact` reports, in particular one of
/// kind `UnexpectedEof` if `r` ends before `N` bytes are available.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
/// Reads exactly `n` bytes from `r` into a newly allocated vector.
///
/// Callers pass lengths taken directly from the file, so the buffer grows as
/// data actually arrives instead of being allocated and zeroed up front: a
/// corrupt or truncated file that claims a huge length fails with an error
/// rather than attempting an enormous allocation.
///
/// # Errors
///
/// Returns an error of kind `UnexpectedEof` if `r` ends before `n` bytes are
/// available, or any other I/O error reported by `r`.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    /// Largest capacity reserved before any data has been read.
    const MAX_PREALLOC: usize = 64 * 1024;

    let mut vec = Vec::with_capacity(n.min(MAX_PREALLOC));
    // `usize` is never wider than 64 bits, so the conversion is lossless.
    let n_read = r.by_ref().take(n as u64).read_to_end(&mut vec)?;
    if n_read != n {
        return Err(IoError::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to fill whole buffer",
        ));
    }
    Ok(vec)
}
1744 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
1745 let length: u32 = endian.parse(read_bytes(r)?);
1746 Ok(read_vec(r, length as usize)?.into())
/// Value labels for one long string variable, as one entry of a long string
/// value labels record.
1749 #[derive(Clone, Debug)]
1750 pub struct LongStringValueLabels {
/// Name of the variable, as read from the record.
1751 pub var_name: UnencodedString,
1754 /// `(value, label)` pairs, where each value is `width` bytes.
1755 pub labels: Vec<(UnencodedString, UnencodedString)>,
1758 #[derive(Clone, Debug)]
1759 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
1761 impl ExtensionRecord for LongStringValueLabelRecord {
1762 const SUBTYPE: u32 = 21;
1763 const SIZE: Option<u32> = Some(1);
1764 const COUNT: Option<u32> = None;
1765 const NAME: &'static str = "long string value labels record";
1767 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1768 ext.check_size::<Self>()?;
1770 let mut input = &ext.data[..];
1771 let mut label_set = Vec::new();
1772 while !input.is_empty() {
1773 let var_name = read_string(&mut input, endian)?;
1774 let width: u32 = endian.parse(read_bytes(&mut input)?);
1775 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
1776 let mut labels = Vec::new();
1777 for _ in 0..n_labels {
1778 let value = read_string(&mut input, endian)?;
1779 let label = read_string(&mut input, endian)?;
1780 labels.push((value, label));
1782 label_set.push(LongStringValueLabels {
1788 Ok(LongStringValueLabelRecord(label_set))