1 use crate::endian::{Endian, Parse, ToBytes};
3 use encoding_rs::mem::decode_latin1;
4 use flate2::read::ZlibDecoder;
10 fmt::{Debug, Formatter, Result as FmtResult},
11 io::{Error as IoError, Read, Seek, SeekFrom},
15 str::from_utf8, cell::RefCell,
17 use thiserror::Error as ThisError;
19 #[derive(ThisError, Debug)]
21 #[error("Not an SPSS system file")]
24 #[error("Invalid magic number {0:?}")]
27 #[error("I/O error ({0})")]
30 #[error("Invalid SAV compression code {0}")]
31 InvalidSavCompression(u32),
33 #[error("Invalid ZSAV compression code {0}")]
34 InvalidZsavCompression(u32),
36 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
37 BadVariableWidth { offset: u64, width: i32 },
39 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
40 BadDocumentLength { offset: u64, n: usize, max: usize },
42 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
43 BadRecordType { offset: u64, rec_type: u32 },
45 #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
46 BadVariableLabelCode {
53 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
55 BadNumericMissingValueCode { offset: u64, code: i32 },
57 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
58 BadStringMissingValueCode { offset: u64, code: i32 },
60 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
61 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
63 #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
64 ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
66 #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
67 BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
69 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
70 ExtensionRecordTooLarge {
77 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
85 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
87 EofInCompressedCase { offset: u64, case_ofs: u64 },
89 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
90 PartialCompressedCase { offset: u64, case_ofs: u64 },
92 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
93 CompressedNumberExpected { offset: u64, case_ofs: u64 },
95 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
96 CompressedStringExpected { offset: u64, case_ofs: u64 },
98 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
99 BadZlibTrailerNBlocks {
102 expected_n_blocks: u64,
106 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
114 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
122 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
123 BadLongMissingValueLength {
129 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
130 BadEncodingName { offset: u64 },
132 #[error("Details TBD")]
136 #[derive(Clone, Debug)]
138 Header(HeaderRecord),
139 Variable(VariableRecord),
140 ValueLabel(ValueLabelRecord),
141 Document(DocumentRecord),
142 IntegerInfo(IntegerInfoRecord),
143 FloatInfo(FloatInfoRecord),
144 VariableSets(TextRecord),
145 VarDisplay(VarDisplayRecord),
146 MultipleResponse(MultipleResponseRecord),
147 LongStringValueLabels(LongStringValueLabelRecord),
148 LongStringMissingValues(LongStringMissingValueRecord),
149 Encoding(EncodingRecord),
150 NumberOfCases(NumberOfCasesRecord),
151 ProductInfo(TextRecord),
152 LongNames(TextRecord),
153 VeryLongStrings(TextRecord),
154 FileAttributes(TextRecord),
155 VariableAttributes(TextRecord),
156 OtherExtension(Extension),
160 Cases(Rc<RefCell<Cases>>),
167 warn: &Box<dyn Fn(Error)>,
168 ) -> Result<Option<Record>, Error>
172 let rec_type: u32 = endian.parse(read_bytes(reader)?);
174 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
175 3 => Ok(Some(ValueLabelRecord::read(reader, endian)?)),
176 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
177 7 => Extension::read(reader, endian, warn),
178 999 => Ok(Some(Record::EndOfHeaders(
179 endian.parse(read_bytes(reader)?),
181 _ => Err(Error::BadRecordType {
182 offset: reader.stream_position()?,
189 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
190 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
191 fn default_decode(s: &[u8]) -> Cow<str> {
192 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
195 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
196 pub enum Compression {
202 fn offsets(&self) -> Range<u64>;
206 pub struct HeaderRecord {
208 pub offsets: Range<u64>,
213 /// Eye-catcher string, product name, in the file's encoding. Padded
214 /// on the right with spaces.
215 pub eye_catcher: UnencodedStr<60>,
217 /// Layout code, normally either 2 or 3.
218 pub layout_code: u32,
220 /// Number of variable positions, or `None` if the value in the file is
221 /// questionably trustworthy.
222 pub nominal_case_size: Option<u32>,
224 /// Compression type, if any.
225 pub compression: Option<Compression>,
227 /// 1-based variable index of the weight variable, or `None` if the file is
229 pub weight_index: Option<u32>,
231 /// Claimed number of cases, if known.
232 pub n_cases: Option<u32>,
234 /// Compression bias, usually 100.0.
237 /// `dd mmm yy` in the file's encoding.
238 pub creation_date: UnencodedStr<9>,
240 /// `HH:MM:SS` in the file's encoding.
241 pub creation_time: UnencodedStr<8>,
243 /// File label, in the file's encoding. Padded on the right with spaces.
244 pub file_label: UnencodedStr<64>,
246 /// Endianness of the data in the file header.
251 fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
252 writeln!(f, "{name:>17}: {:?}", value)
256 impl Debug for HeaderRecord {
257 fn fmt(&self, f: &mut Formatter) -> FmtResult {
258 writeln!(f, "File header record:")?;
259 self.debug_field(f, "Magic", self.magic)?;
260 self.debug_field(f, "Product name", self.eye_catcher)?;
261 self.debug_field(f, "Layout code", self.layout_code)?;
262 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
263 self.debug_field(f, "Compression", self.compression)?;
264 self.debug_field(f, "Weight index", self.weight_index)?;
265 self.debug_field(f, "Number of cases", self.n_cases)?;
266 self.debug_field(f, "Compression bias", self.bias)?;
267 self.debug_field(f, "Creation date", self.creation_date)?;
268 self.debug_field(f, "Creation time", self.creation_time)?;
269 self.debug_field(f, "File label", self.file_label)?;
270 self.debug_field(f, "Endianness", self.endian)
275 fn read<R: Read + Seek>(r: &mut R) -> Result<HeaderRecord, Error> {
276 let start = r.stream_position()?;
278 let magic: [u8; 4] = read_bytes(r)?;
279 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
281 let eye_catcher = UnencodedStr::<60>(read_bytes(r)?);
282 let layout_code: [u8; 4] = read_bytes(r)?;
283 let endian = Endian::identify_u32(2, layout_code)
284 .or_else(|| Endian::identify_u32(2, layout_code))
285 .ok_or_else(|| Error::NotASystemFile)?;
286 let layout_code = endian.parse(layout_code);
288 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
289 let nominal_case_size =
290 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
292 let compression_code: u32 = endian.parse(read_bytes(r)?);
293 let compression = match (magic, compression_code) {
294 (Magic::Zsav, 2) => Some(Compression::ZLib),
295 (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
297 (_, 1) => Some(Compression::Simple),
298 (_, code) => return Err(Error::InvalidSavCompression(code)),
301 let weight_index: u32 = endian.parse(read_bytes(r)?);
302 let weight_index = (weight_index > 0).then_some(weight_index);
304 let n_cases: u32 = endian.parse(read_bytes(r)?);
305 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
307 let bias: f64 = endian.parse(read_bytes(r)?);
309 let creation_date = UnencodedStr::<9>(read_bytes(r)?);
310 let creation_time = UnencodedStr::<8>(read_bytes(r)?);
311 let file_label = UnencodedStr::<64>(read_bytes(r)?);
312 let _: [u8; 3] = read_bytes(r)?;
315 offsets: start..r.stream_position()?,
332 impl Header for HeaderRecord {
333 fn offsets(&self) -> Range<u64> {
338 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
340 /// Regular system file.
343 /// System file with Zlib-compressed data.
346 /// EBCDIC-encoded system file.
351 /// Magic number for a regular system file.
352 pub const SAV: [u8; 4] = *b"$FL2";
354 /// Magic number for a system file that contains zlib-compressed data.
355 pub const ZSAV: [u8; 4] = *b"$FL3";
357 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
359 pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
362 impl Debug for Magic {
363 fn fmt(&self, f: &mut Formatter) -> FmtResult {
364 let s = match *self {
365 Magic::Sav => "$FL2",
366 Magic::Zsav => "$FL3",
367 Magic::Ebcdic => "($FL2 in EBCDIC)",
373 impl TryFrom<[u8; 4]> for Magic {
376 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
378 Magic::SAV => Ok(Magic::Sav),
379 Magic::ZSAV => Ok(Magic::Zsav),
380 Magic::EBCDIC => Ok(Magic::Ebcdic),
381 _ => Err(Error::BadMagic(value)),
386 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
393 fn from_width(width: i32) -> VarType {
395 0 => VarType::Numeric,
396 _ => VarType::String,
401 #[derive(Copy, Clone)]
404 String(UnencodedStr<8>),
407 impl Debug for Value {
408 fn fmt(&self, f: &mut Formatter) -> FmtResult {
410 Value::Number(Some(number)) => write!(f, "{number:?}"),
411 Value::Number(None) => write!(f, "SYSMIS"),
412 Value::String(bytes) => write!(f, "{:?}", bytes),
418 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
420 &UntypedValue(read_bytes(r)?),
426 pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Value {
428 VarType::String => Value::String(UnencodedStr(raw.0)),
429 VarType::Numeric => {
430 let number: f64 = endian.parse(raw.0);
431 Value::Number((number != -f64::MAX).then_some(number))
436 fn read_case<R: Read + Seek>(
438 var_types: &[VarType],
440 ) -> Result<Option<Vec<Value>>, Error> {
441 let case_start = reader.stream_position()?;
442 let mut values = Vec::with_capacity(var_types.len());
443 for (i, &var_type) in var_types.iter().enumerate() {
444 let Some(raw) = try_read_bytes(reader)? else {
448 let offset = reader.stream_position()?;
449 return Err(Error::EofInCase {
451 case_ofs: offset - case_start,
452 case_len: var_types.len() * 8,
456 values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
461 fn read_compressed_case<R: Read + Seek>(
463 var_types: &[VarType],
464 codes: &mut VecDeque<u8>,
467 ) -> Result<Option<Vec<Value>>, Error> {
468 let case_start = reader.stream_position()?;
469 let mut values = Vec::with_capacity(var_types.len());
470 for (i, &var_type) in var_types.iter().enumerate() {
472 let Some(code) = codes.pop_front() else {
473 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
477 let offset = reader.stream_position()?;
478 return Err(Error::EofInCompressedCase {
480 case_ofs: offset - case_start,
484 codes.extend(new_codes.into_iter());
489 1..=251 => match var_type {
490 VarType::Numeric => break Value::Number(Some(code as f64 - bias)),
492 break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias)))
499 let offset = reader.stream_position()?;
500 return Err(Error::PartialCompressedCase {
502 case_ofs: offset - case_start,
507 break Value::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
509 254 => match var_type {
510 VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
511 VarType::Numeric => {
512 return Err(Error::CompressedStringExpected {
514 case_ofs: reader.stream_position()? - case_start,
518 255 => match var_type {
519 VarType::Numeric => break Value::Number(None),
521 return Err(Error::CompressedNumberExpected {
523 case_ofs: reader.stream_position()? - case_start,
535 struct ZlibDecodeMultiple<R>
539 reader: Option<ZlibDecoder<R>>,
542 impl<R> ZlibDecodeMultiple<R>
546 fn new(reader: R) -> ZlibDecodeMultiple<R> {
548 reader: Some(ZlibDecoder::new(reader)),
553 impl<R> Read for ZlibDecodeMultiple<R>
557 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
559 match self.reader.as_mut().unwrap().read(buf)? {
561 let inner = self.reader.take().unwrap().into_inner();
562 self.reader = Some(ZlibDecoder::new(inner));
570 impl<R> Seek for ZlibDecodeMultiple<R>
574 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
575 self.reader.as_mut().unwrap().get_mut().seek(pos)
584 ztrailer_offset: u64,
593 R: Read + Seek + 'static,
596 warn: Box<dyn Fn(Error)>,
598 header: HeaderRecord,
599 var_types: Vec<VarType>,
606 R: Read + Seek + 'static,
608 pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
610 F: Fn(Error) + 'static,
612 let header = HeaderRecord::read(&mut reader)?;
614 reader: Some(reader),
615 warn: Box::new(warn),
617 var_types: Vec::new(),
618 state: ReaderState::Start,
621 fn cases(&mut self) -> Cases {
622 self.state = ReaderState::End;
624 self.reader.take().unwrap(),
625 take(&mut self.var_types),
631 impl<R> Iterator for Reader<R>
633 R: Read + Seek + 'static,
635 type Item = Result<Record, Error>;
637 fn next(&mut self) -> Option<Self::Item> {
639 ReaderState::Start => {
640 self.state = ReaderState::Headers;
641 Some(Ok(Record::Header(self.header.clone())))
643 ReaderState::Headers => {
646 self.reader.as_mut().unwrap(),
650 Ok(Some(record)) => break record,
652 Err(error) => return Some(Err(error)),
656 Record::Variable(VariableRecord { width, .. }) => {
657 self.var_types.push(VarType::from_width(width));
659 Record::EndOfHeaders(_) => {
660 self.state = if let Some(Compression::ZLib) = self.header.compression {
661 ReaderState::ZlibHeader
670 ReaderState::ZlibHeader => {
671 let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
673 Ok(zheader) => zheader,
674 Err(error) => return Some(Err(error)),
676 self.state = ReaderState::ZlibTrailer {
677 ztrailer_offset: zheader.ztrailer_offset,
678 ztrailer_len: zheader.ztrailer_len,
680 Some(Ok(Record::ZHeader(zheader)))
682 ReaderState::ZlibTrailer {
686 match ZTrailer::read(
687 self.reader.as_mut().unwrap(),
692 Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
693 Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
694 Err(error) => Some(Err(error)),
697 ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
698 ReaderState::End => None,
703 trait ReadSeek: Read + Seek {}
704 impl<T> ReadSeek for T where T: Read + Seek {}
707 reader: Box<dyn ReadSeek>,
708 var_types: Vec<VarType>,
709 compression: Option<Compression>,
716 impl Debug for Cases {
717 fn fmt(&self, f: &mut Formatter) -> FmtResult {
723 fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord) -> Self
725 R: Read + Seek + 'static,
728 reader: if header.compression == Some(Compression::ZLib) {
729 Box::new(ZlibDecodeMultiple::new(reader))
734 compression: header.compression,
736 endian: header.endian,
737 codes: VecDeque::with_capacity(8),
743 impl Iterator for Cases {
744 type Item = Result<Vec<Value>, Error>;
746 fn next(&mut self) -> Option<Self::Item> {
751 let retval = if self.compression.is_some() {
752 Value::read_compressed_case(
761 Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
763 self.eof = matches!(retval, None | Some(Err(_)));
768 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
769 pub struct Spec(pub u32);
771 impl Debug for Spec {
772 fn fmt(&self, f: &mut Formatter) -> FmtResult {
773 let type_ = format_name(self.0 >> 16);
774 let w = (self.0 >> 8) & 0xff;
775 let d = self.0 & 0xff;
776 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
780 fn format_name(type_: u32) -> Cow<'static, str> {
819 _ => return format!("<unknown format {type_}>").into(),
825 pub struct MissingValues {
826 /// Individual missing values, up to 3 of them.
827 pub values: Vec<Value>,
829 /// Optional range of missing values.
830 pub range: Option<(Value, Value)>,
833 impl Debug for MissingValues {
834 fn fmt(&self, f: &mut Formatter) -> FmtResult {
835 for (i, value) in self.values.iter().enumerate() {
839 write!(f, "{value:?}")?;
842 if let Some((low, high)) = self.range {
843 if !self.values.is_empty() {
846 write!(f, "{low:?} THRU {high:?}")?;
858 fn is_empty(&self) -> bool {
859 self.values.is_empty() && self.range.is_none()
862 fn read<R: Read + Seek>(
868 ) -> Result<MissingValues, Error> {
869 let (n_values, has_range) = match (width, code) {
870 (_, 0..=3) => (code, false),
871 (0, -2) => (0, true),
872 (0, -3) => (1, true),
873 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
874 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
877 let var_type = VarType::from_width(width);
879 let mut values = Vec::new();
880 for _ in 0..n_values {
881 values.push(Value::read(r, var_type, endian)?);
883 let range = if has_range {
884 let low = Value::read(r, var_type, endian)?;
885 let high = Value::read(r, var_type, endian)?;
890 Ok(MissingValues { values, range })
895 pub struct VariableRecord {
896 /// Range of offsets in file.
897 pub offsets: Range<u64>,
899 /// Variable width, in the range -1..=255.
902 /// Variable name, padded on the right with spaces.
903 pub name: UnencodedStr<8>,
906 pub print_format: Spec,
909 pub write_format: Spec,
912 pub missing_values: MissingValues,
914 /// Optional variable label.
915 pub label: Option<UnencodedString>,
918 impl Debug for VariableRecord {
919 fn fmt(&self, f: &mut Formatter) -> FmtResult {
924 match self.width.cmp(&0) {
925 Ordering::Greater => "string",
926 Ordering::Equal => "numeric",
927 Ordering::Less => "long string continuation record",
930 writeln!(f, "Print format: {:?}", self.print_format)?;
931 writeln!(f, "Write format: {:?}", self.write_format)?;
932 writeln!(f, "Name: {:?}", &self.name)?;
933 writeln!(f, "Variable label: {:?}", self.label)?;
934 writeln!(f, "Missing values: {:?}", self.missing_values)
938 impl VariableRecord {
939 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
940 let start_offset = r.stream_position()?;
941 let width: i32 = endian.parse(read_bytes(r)?);
942 let code_offset = r.stream_position()?;
943 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
944 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
945 let print_format = Spec(endian.parse(read_bytes(r)?));
946 let write_format = Spec(endian.parse(read_bytes(r)?));
947 let name = UnencodedStr::<8>(read_bytes(r)?);
949 let label = match has_variable_label {
952 let len: u32 = endian.parse(read_bytes(r)?);
953 let read_len = len.min(65535) as usize;
954 let label = UnencodedString(read_vec(r, read_len)?);
956 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
957 let _ = read_vec(r, padding_bytes as usize)?;
962 return Err(Error::BadVariableLabelCode {
965 code: has_variable_label,
971 MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
973 let end_offset = r.stream_position()?;
975 Ok(Record::Variable(VariableRecord {
976 offsets: start_offset..end_offset,
987 #[derive(Copy, Clone)]
988 pub struct UntypedValue(pub [u8; 8]);
990 impl Debug for UntypedValue {
991 fn fmt(&self, f: &mut Formatter) -> FmtResult {
992 let little: f64 = Endian::Little.parse(self.0);
993 let little = format!("{:?}", little);
994 let big: f64 = Endian::Big.parse(self.0);
995 let big = format!("{:?}", big);
996 let number = if little.len() <= big.len() {
1001 write!(f, "{number}")?;
1003 let string = default_decode(&self.0);
1005 .split(|c: char| c == '\0' || c.is_control())
1008 write!(f, "{string:?}")?;
1014 pub struct UnencodedString(pub Vec<u8>);
1016 impl From<Vec<u8>> for UnencodedString {
1017 fn from(source: Vec<u8>) -> Self {
1022 impl From<&[u8]> for UnencodedString {
1023 fn from(source: &[u8]) -> Self {
1028 impl Debug for UnencodedString {
1029 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1030 write!(f, "{:?}", default_decode(self.0.as_slice()))
1034 #[derive(Copy, Clone)]
1035 pub struct UnencodedStr<const N: usize>(pub [u8; N]);
1037 impl<const N: usize> From<[u8; N]> for UnencodedStr<N> {
1038 fn from(source: [u8; N]) -> Self {
1043 impl<const N: usize> Debug for UnencodedStr<N> {
1044 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1045 write!(f, "{:?}", default_decode(&self.0))
1050 pub struct ValueLabelRecord {
1051 /// Range of offsets in file.
1052 pub offsets: Range<u64>,
1055 pub labels: Vec<(UntypedValue, UnencodedString)>,
1057 /// The 1-based indexes of the variables to which the labels apply.
1058 pub dict_indexes: Vec<u32>,
1061 impl Debug for ValueLabelRecord {
1062 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1063 writeln!(f, "labels: ")?;
1064 for (value, label) in self.labels.iter() {
1065 writeln!(f, "{value:?}: {label:?}")?;
1067 write!(f, "apply to variables")?;
1068 for dict_index in self.dict_indexes.iter() {
1069 write!(f, " #{dict_index}")?;
1075 impl Header for ValueLabelRecord {
1076 fn offsets(&self) -> Range<u64> {
1077 self.offsets.clone()
1081 impl ValueLabelRecord {
1082 /// Maximum number of value labels in a record.
1083 pub const MAX_LABELS: u32 = u32::MAX / 8;
1085 /// Maximum number of variable indexes in a record.
1086 pub const MAX_INDEXES: u32 = u32::MAX / 8;
1088 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1089 let label_offset = r.stream_position()?;
1090 let n: u32 = endian.parse(read_bytes(r)?);
1091 if n > Self::MAX_LABELS {
1092 return Err(Error::BadNumberOfValueLabels {
1093 offset: label_offset,
1095 max: Self::MAX_LABELS,
1099 let mut labels = Vec::new();
1101 let value = UntypedValue(read_bytes(r)?);
1102 let label_len: u8 = endian.parse(read_bytes(r)?);
1103 let label_len = label_len as usize;
1104 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1106 let mut label = read_vec(r, padded_len - 1)?;
1107 label.truncate(label_len);
1108 labels.push((value, UnencodedString(label)));
1111 let index_offset = r.stream_position()?;
1112 let rec_type: u32 = endian.parse(read_bytes(r)?);
1114 return Err(Error::ExpectedVarIndexRecord {
1115 offset: index_offset,
1120 let n: u32 = endian.parse(read_bytes(r)?);
1121 if n > Self::MAX_INDEXES {
1122 return Err(Error::BadNumberOfVarIndexes {
1123 offset: index_offset,
1125 max: Self::MAX_INDEXES,
1128 let mut dict_indexes = Vec::with_capacity(n as usize);
1130 dict_indexes.push(endian.parse(read_bytes(r)?));
1133 let end_offset = r.stream_position()?;
1134 Ok(Record::ValueLabel(ValueLabelRecord {
1135 offsets: label_offset..end_offset,
1142 #[derive(Clone, Debug)]
1143 pub struct DocumentRecord {
1144 pub offsets: Range<u64>,
1146 /// The document, as an array of 80-byte lines.
1147 pub lines: Vec<DocumentLine>,
1150 pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>;
1152 impl DocumentRecord {
1153 /// Length of a line in a document. Document lines are fixed-length and
1154 /// padded on the right with spaces.
1155 pub const LINE_LEN: usize = 80;
1157 /// Maximum number of lines we will accept in a document. This is simply
1158 /// the maximum number that will fit in a 32-bit space.
1159 pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN;
1161 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1162 let start_offset = r.stream_position()?;
1163 let n: u32 = endian.parse(read_bytes(r)?);
1165 if n > Self::MAX_LINES {
1166 Err(Error::BadDocumentLength {
1167 offset: start_offset,
1169 max: Self::MAX_LINES,
1172 let mut lines = Vec::with_capacity(n);
1174 lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
1176 let end_offset = r.stream_position()?;
1177 Ok(Record::Document(DocumentRecord {
1178 offsets: start_offset..end_offset,
1185 impl Header for DocumentRecord {
1186 fn offsets(&self) -> Range<u64> {
1187 self.offsets.clone()
1191 trait ExtensionRecord {
1193 const SIZE: Option<u32>;
1194 const COUNT: Option<u32>;
1195 const NAME: &'static str;
1196 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error>;
1199 #[derive(Clone, Debug)]
1200 pub struct IntegerInfoRecord {
1201 pub offsets: Range<u64>,
1202 pub version: (i32, i32, i32),
1203 pub machine_code: i32,
1204 pub floating_point_rep: i32,
1205 pub compression_code: i32,
1206 pub endianness: i32,
1207 pub character_code: i32,
1210 impl ExtensionRecord for IntegerInfoRecord {
1211 const SUBTYPE: u32 = 3;
1212 const SIZE: Option<u32> = Some(4);
1213 const COUNT: Option<u32> = Some(8);
1214 const NAME: &'static str = "integer record";
1216 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1217 ext.check_size::<Self>()?;
1219 let mut input = &ext.data[..];
1220 let data: Vec<i32> = (0..8)
1221 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1223 Ok(Record::IntegerInfo(IntegerInfoRecord {
1224 offsets: ext.offsets.clone(),
1225 version: (data[0], data[1], data[2]),
1226 machine_code: data[3],
1227 floating_point_rep: data[4],
1228 compression_code: data[5],
1229 endianness: data[6],
1230 character_code: data[7],
1235 #[derive(Clone, Debug)]
1236 pub struct FloatInfoRecord {
1242 impl ExtensionRecord for FloatInfoRecord {
1243 const SUBTYPE: u32 = 4;
1244 const SIZE: Option<u32> = Some(8);
1245 const COUNT: Option<u32> = Some(3);
1246 const NAME: &'static str = "floating point record";
1248 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1249 ext.check_size::<Self>()?;
1251 let mut input = &ext.data[..];
1252 let data: Vec<f64> = (0..3)
1253 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1255 Ok(Record::FloatInfo(FloatInfoRecord {
1263 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1264 pub enum CategoryLabels {
1269 #[derive(Clone, Debug)]
1270 pub enum MultipleResponseType {
1272 value: UnencodedString,
1273 labels: CategoryLabels,
1278 impl MultipleResponseType {
1279 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
1280 let (mr_type, input) = match input.split_first() {
1281 Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
1282 Some((b'D', input)) => {
1283 let (value, input) = parse_counted_string(input)?;
1285 MultipleResponseType::MultipleDichotomy {
1287 labels: CategoryLabels::VarLabels,
1292 Some((b'E', input)) => {
1293 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1294 (CategoryLabels::CountedValues, rest)
1295 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1296 (CategoryLabels::VarLabels, rest)
1298 return Err(Error::TBD);
1300 let (value, input) = parse_counted_string(input)?;
1302 MultipleResponseType::MultipleDichotomy { value, labels },
1306 _ => return Err(Error::TBD),
1308 Ok((mr_type, input))
1312 #[derive(Clone, Debug)]
1313 pub struct MultipleResponseSet {
1314 pub name: UnencodedString,
1315 pub label: UnencodedString,
1316 pub mr_type: MultipleResponseType,
1317 pub short_names: Vec<UnencodedString>,
1320 impl MultipleResponseSet {
1321 fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
1322 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1323 return Err(Error::TBD);
1325 let (name, input) = input.split_at(equals);
1326 let (mr_type, input) = MultipleResponseType::parse(input)?;
1327 let Some(input) = input.strip_prefix(b" ") else {
1328 return Err(Error::TBD);
1330 let (label, mut input) = parse_counted_string(input)?;
1331 let mut vars = Vec::new();
1332 while input.first() != Some(&b'\n') {
1333 match input.split_first() {
1334 Some((b' ', rest)) => {
1335 let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
1336 return Err(Error::TBD);
1338 let (var, rest) = rest.split_at(length);
1339 if !var.is_empty() {
1340 vars.push(var.into());
1344 _ => return Err(Error::TBD),
1347 while input.first() == Some(&b'\n') {
1348 input = &input[1..];
1351 MultipleResponseSet {
1362 #[derive(Clone, Debug)]
1363 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
1365 impl ExtensionRecord for MultipleResponseRecord {
1366 const SUBTYPE: u32 = 7;
1367 const SIZE: Option<u32> = Some(1);
1368 const COUNT: Option<u32> = None;
1369 const NAME: &'static str = "multiple response set record";
1371 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
1372 ext.check_size::<Self>()?;
1374 let mut input = &ext.data[..];
1375 let mut sets = Vec::new();
1376 while !input.is_empty() {
1377 let (set, rest) = MultipleResponseSet::parse(input)?;
1381 Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
1385 fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
1386 let Some(space) = input.iter().position(|&b| b == b' ') else {
1387 return Err(Error::TBD);
1389 let Ok(length) = from_utf8(&input[..space]) else {
1390 return Err(Error::TBD);
1392 let Ok(length): Result<usize, _> = length.parse() else {
1393 return Err(Error::TBD);
1396 let input = &input[space + 1..];
1397 if input.len() < length {
1398 return Err(Error::TBD);
1401 let (string, rest) = input.split_at(length);
1402 Ok((string.into(), rest))
1405 #[derive(Clone, Debug)]
1406 pub struct VarDisplayRecord(pub Vec<u32>);
1408 impl ExtensionRecord for VarDisplayRecord {
1409 const SUBTYPE: u32 = 11;
1410 const SIZE: Option<u32> = Some(4);
1411 const COUNT: Option<u32> = None;
1412 const NAME: &'static str = "variable display record";
1414 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1415 ext.check_size::<Self>()?;
1417 let mut input = &ext.data[..];
1418 let display = (0..ext.count)
1419 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1421 Ok(Record::VarDisplay(VarDisplayRecord(display)))
1425 #[derive(Clone, Debug)]
1426 pub struct LongStringMissingValues {
1428 pub var_name: UnencodedString,
1431 pub missing_values: MissingValues,
1434 #[derive(Clone, Debug)]
1435 pub struct LongStringMissingValueRecord(pub Vec<LongStringMissingValues>);
1437 impl ExtensionRecord for LongStringMissingValueRecord {
1438 const SUBTYPE: u32 = 22;
1439 const SIZE: Option<u32> = Some(1);
1440 const COUNT: Option<u32> = None;
1441 const NAME: &'static str = "long string missing values record";
1443 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1444 ext.check_size::<Self>()?;
1446 let mut input = &ext.data[..];
1447 let mut missing_value_set = Vec::new();
1448 while !input.is_empty() {
1449 let var_name = read_string(&mut input, endian)?;
1450 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1451 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1453 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
1454 return Err(Error::BadLongMissingValueLength {
1455 record_offset: ext.offsets.start,
1460 let mut values = Vec::new();
1461 for i in 0..n_missing_values {
1462 let value: [u8; 8] = read_bytes(&mut input)?;
1463 let numeric_value: u64 = endian.parse(value);
1464 let value = if i > 0 && numeric_value == 8 {
1465 // Tolerate files written by old, buggy versions of PSPP
1466 // where we believed that the value_length was repeated
1467 // before each missing value.
1468 read_bytes(&mut input)?
1472 values.push(Value::String(UnencodedStr(value)));
1474 let missing_values = MissingValues {
1478 missing_value_set.push(LongStringMissingValues {
1483 Ok(Record::LongStringMissingValues(LongStringMissingValueRecord(
/// Contents of an encoding extension record (subtype 20): the record data
/// as a UTF-8 string.
#[derive(Debug, Clone)]
pub struct EncodingRecord(pub String);
1492 impl ExtensionRecord for EncodingRecord {
1493 const SUBTYPE: u32 = 20;
1494 const SIZE: Option<u32> = Some(1);
1495 const COUNT: Option<u32> = None;
1496 const NAME: &'static str = "encoding record";
1498 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
1499 ext.check_size::<Self>()?;
1501 Ok(Record::Encoding(EncodingRecord(
1502 String::from_utf8(ext.data.clone()).map_err(|_| Error::BadEncodingName {
1503 offset: ext.offsets.start,
/// Contents of an extended number of cases extension record (subtype 16),
/// which consists of two 8-byte values.
#[derive(Debug, Clone, Copy)]
pub struct NumberOfCasesRecord {
    /// Always observed as 1.
    pub one: u64,

    /// Number of cases.
    pub n_cases: u64,
}
1518 impl ExtensionRecord for NumberOfCasesRecord {
1519 const SUBTYPE: u32 = 16;
1520 const SIZE: Option<u32> = Some(8);
1521 const COUNT: Option<u32> = Some(2);
1522 const NAME: &'static str = "extended number of cases record";
1524 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1525 ext.check_size::<Self>()?;
1527 let mut input = &ext.data[..];
1528 let one = endian.parse(read_bytes(&mut input)?);
1529 let n_cases = endian.parse(read_bytes(&mut input)?);
1531 Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
1535 #[derive(Clone, Debug)]
1536 pub struct TextRecord {
1537 pub offsets: Range<u64>,
1539 /// The text content of the record.
1540 pub text: UnencodedString,
1543 impl From<Extension> for TextRecord {
1544 fn from(source: Extension) -> Self {
1546 offsets: source.offsets,
1547 text: source.data.into(),
/// A raw extension record: its header fields plus the undecoded data.
#[derive(Debug, Clone)]
pub struct Extension {
    /// File offsets of the record's data, which follows the subtype, size
    /// and count fields.
    pub offsets: Range<u64>,

    /// Record subtype.
    pub subtype: u32,

    /// Size of each data element.
    pub size: u32,

    /// Number of data elements.
    pub count: u32,

    /// `size * count` bytes of data.
    pub data: Vec<u8>,
}
1570 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
1571 if let Some(expected_size) = E::SIZE {
1572 if self.size != expected_size {
1573 return Err(Error::BadRecordSize {
1574 offset: self.offsets.start,
1575 record: E::NAME.into(),
1581 if let Some(expected_count) = E::COUNT {
1582 if self.count != expected_count {
1583 return Err(Error::BadRecordCount {
1584 offset: self.offsets.start,
1585 record: E::NAME.into(),
1594 fn read<R: Read + Seek>(
1597 warn: &Box<dyn Fn(Error)>,
1598 ) -> Result<Option<Record>, Error> {
1599 let subtype = endian.parse(read_bytes(r)?);
1600 let header_offset = r.stream_position()?;
1601 let size: u32 = endian.parse(read_bytes(r)?);
1602 let count = endian.parse(read_bytes(r)?);
1603 let Some(product) = size.checked_mul(count) else {
1604 return Err(Error::ExtensionRecordTooLarge {
1605 offset: header_offset,
1611 let start_offset = r.stream_position()?;
1612 let data = read_vec(r, product as usize)?;
1613 let end_offset = start_offset + product as u64;
1614 let extension = Extension {
1615 offsets: start_offset..end_offset,
1621 let result = match subtype {
1622 IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
1623 FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
1624 VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, endian),
1625 MultipleResponseRecord::SUBTYPE | 19 => {
1626 MultipleResponseRecord::parse(&extension, endian)
1628 LongStringValueLabelRecord::SUBTYPE => {
1629 LongStringValueLabelRecord::parse(&extension, endian)
1631 EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
1632 NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
1633 5 => Ok(Record::VariableSets(extension.into())),
1634 10 => Ok(Record::ProductInfo(extension.into())),
1635 13 => Ok(Record::LongNames(extension.into())),
1636 14 => Ok(Record::VeryLongStrings(extension.into())),
1637 17 => Ok(Record::FileAttributes(extension.into())),
1638 18 => Ok(Record::VariableAttributes(extension.into())),
1639 _ => Ok(Record::OtherExtension(extension)),
1642 Ok(result) => Ok(Some(result)),
/// The header of a ZLIB-compressed data section: where the record itself
/// starts plus the three 64-bit fields it contains.
#[derive(Debug, Clone)]
pub struct ZHeader {
    /// File offset to the start of the record.
    pub offset: u64,

    /// File offset to the ZLIB data header.
    pub zheader_offset: u64,

    /// File offset to the ZLIB trailer.
    pub ztrailer_offset: u64,

    /// Length of the ZLIB trailer in bytes.
    pub ztrailer_len: u64,
}
1667 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
1668 let offset = r.stream_position()?;
1669 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
1670 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
1671 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
1682 #[derive(Clone, Debug)]
1683 pub struct ZTrailer {
1684 /// File offset to the start of the record.
1687 /// Compression bias as a negative integer, e.g. -100.
1690 /// Always observed as zero.
1693 /// Uncompressed size of each block, except possibly the last. Only
1694 /// `0x3ff000` has been observed so far.
1695 pub block_size: u32,
1697 /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
1698 pub blocks: Vec<ZBlock>,
/// Descriptor of one compressed data block, as listed in the ZLIB trailer.
#[derive(Debug, Clone)]
pub struct ZBlock {
    /// Offset of block of data if simple compression were used.
    pub uncompressed_ofs: u64,

    /// Actual offset within the file of the compressed data block.
    pub compressed_ofs: u64,

    /// The number of bytes in this data block after decompression.  This is
    /// `block_size` in every data block but the last, which may be smaller.
    pub uncompressed_size: u32,

    /// The number of bytes in this data block, as stored compressed in this
    /// file.
    pub compressed_size: u32,
}
1719 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
1721 uncompressed_ofs: endian.parse(read_bytes(r)?),
1722 compressed_ofs: endian.parse(read_bytes(r)?),
1723 uncompressed_size: endian.parse(read_bytes(r)?),
1724 compressed_size: endian.parse(read_bytes(r)?),
1730 fn read<R: Read + Seek>(
1735 ) -> Result<Option<ZTrailer>, Error> {
1736 let start_offset = reader.stream_position()?;
1737 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
1740 let int_bias = endian.parse(read_bytes(reader)?);
1741 let zero = endian.parse(read_bytes(reader)?);
1742 let block_size = endian.parse(read_bytes(reader)?);
1743 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
1744 let expected_n_blocks = (ztrailer_len - 24) / 24;
1745 if n_blocks as u64 != expected_n_blocks {
1746 return Err(Error::BadZlibTrailerNBlocks {
1747 offset: ztrailer_ofs,
1753 let blocks = (0..n_blocks)
1754 .map(|_| ZBlock::read(reader, endian))
1755 .collect::<Result<Vec<_>, _>>()?;
1756 reader.seek(SeekFrom::Start(start_offset))?;
1758 offset: ztrailer_ofs,
/// Reads exactly `N` bytes from `r`, or reports a clean end of input.
///
/// Returns `Ok(None)` if the first read yields no bytes (the input is at its
/// end), and `Ok(Some(bytes))` once all `N` bytes have been read.
///
/// # Errors
///
/// Fails if the input ends after at least one byte but before `N` bytes, or
/// on any other I/O error.  A first read that is merely interrupted
/// (`ErrorKind::Interrupted`) is retried, matching what `read_exact` does
/// for the remaining bytes.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    };
    if n == 0 {
        return Ok(None);
    }
    if n < N {
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` and returns them as an array.
///
/// # Errors
///
/// Fails if the input ends before `N` bytes are available or on any other
/// I/O error.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
/// Reads exactly `n` bytes from `r` into a newly allocated vector.
///
/// # Errors
///
/// Fails if the input ends before `n` bytes are available or on any other
/// I/O error.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    let mut bytes = vec![0u8; n];
    r.read_exact(bytes.as_mut_slice()).map(|()| bytes)
}
1792 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
1793 let length: u32 = endian.parse(read_bytes(r)?);
1794 Ok(read_vec(r, length as usize)?.into())
1797 #[derive(Clone, Debug)]
1798 pub struct LongStringValueLabels {
1799 pub var_name: UnencodedString,
1802 /// `(value, label)` pairs, where each value is `width` bytes.
1803 pub labels: Vec<(UnencodedString, UnencodedString)>,
1806 #[derive(Clone, Debug)]
1807 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
1809 impl ExtensionRecord for LongStringValueLabelRecord {
1810 const SUBTYPE: u32 = 21;
1811 const SIZE: Option<u32> = Some(1);
1812 const COUNT: Option<u32> = None;
1813 const NAME: &'static str = "long string value labels record";
1815 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1816 ext.check_size::<Self>()?;
1818 let mut input = &ext.data[..];
1819 let mut label_set = Vec::new();
1820 while !input.is_empty() {
1821 let var_name = read_string(&mut input, endian)?;
1822 let width: u32 = endian.parse(read_bytes(&mut input)?);
1823 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
1824 let mut labels = Vec::new();
1825 for _ in 0..n_labels {
1826 let value = read_string(&mut input, endian)?;
1827 let label = read_string(&mut input, endian)?;
1828 labels.push((value, label));
1830 label_set.push(LongStringValueLabels {
1836 Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(