1 use crate::endian::{Endian, Parse, ToBytes};
2 use crate::{CategoryLabels, Compression};
4 use encoding_rs::mem::decode_latin1;
5 use flate2::read::ZlibDecoder;
8 use std::fmt::{Debug, Formatter, Result as FmtResult};
9 use std::str::from_utf8;
11 collections::VecDeque,
12 io::{Error as IoError, Read, Seek, SeekFrom},
15 use thiserror::Error as ThisError;
17 use self::state::State;
19 #[derive(ThisError, Debug)]
21 #[error("Not an SPSS system file")]
24 #[error("Invalid magic number {0:?}")]
27 #[error("I/O error ({0})")]
30 #[error("Invalid SAV compression code {0}")]
31 InvalidSavCompression(u32),
33 #[error("Invalid ZSAV compression code {0}")]
34 InvalidZsavCompression(u32),
36 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
37 BadVariableWidth { offset: u64, width: i32 },
39 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
40 BadDocumentLength { offset: u64, n: usize, max: usize },
42 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
43 BadRecordType { offset: u64, rec_type: u32 },
45 #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
46 BadVariableLabelCode { offset: u64, code: u32 },
49 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
51 BadNumericMissingValueCode { offset: u64, code: i32 },
53 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
54 BadStringMissingValueCode { offset: u64, code: i32 },
56 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
57 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
59 #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
60 BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
62 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
63 ExtensionRecordTooLarge {
70 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
78 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
80 EofInCompressedCase { offset: u64, case_ofs: u64 },
82 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
83 PartialCompressedCase { offset: u64, case_ofs: u64 },
85 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
86 CompressedNumberExpected { offset: u64, case_ofs: u64 },
88 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
89 CompressedStringExpected { offset: u64, case_ofs: u64 },
91 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
92 BadZlibTrailerNBlocks {
95 expected_n_blocks: u64,
99 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
107 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
115 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
116 BadLongMissingValueLength {
122 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
123 BadEncodingName { offset: u64 },
125 #[error("Details TBD")]
129 #[derive(Clone, Debug)]
131 Header(HeaderRecord),
132 Variable(VariableRecord),
133 ValueLabel(ValueLabelRecord),
134 VarIndexes(VarIndexRecord),
135 Document(DocumentRecord),
136 IntegerInfo(IntegerInfoRecord),
137 FloatInfo(FloatInfoRecord),
138 VariableSets(UnencodedString),
139 VarDisplay(VarDisplayRecord),
140 MultipleResponse(MultipleResponseRecord),
141 LongStringValueLabels(LongStringValueLabelRecord),
142 Encoding(EncodingRecord),
143 NumberOfCases(NumberOfCasesRecord),
144 ProductInfo(UnencodedString),
145 LongNames(UnencodedString),
146 LongStrings(UnencodedString),
147 FileAttributes(UnencodedString),
148 VariableAttributes(UnencodedString),
149 OtherExtension(Extension),
157 fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
158 let rec_type: u32 = endian.parse(read_bytes(reader)?);
160 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)),
161 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)),
162 4 => Ok(Record::VarIndexes(VarIndexRecord::read(reader, endian)?)),
163 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)),
164 7 => Ok(Extension::read(reader, endian)?),
165 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
166 _ => Err(Error::BadRecordType {
167 offset: reader.stream_position()?,
174 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
175 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
176 fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> {
177 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
181 pub struct HeaderRecord {
185 /// Eye-catcher string, product name, in the file's encoding. Padded
186 /// on the right with spaces.
187 pub eye_catcher: UnencodedStr<60>,
189 /// Layout code, normally either 2 or 3.
190 pub layout_code: u32,
192 /// Number of variable positions, or `None` if the value in the file is
193 /// questionably trustworthy.
194 pub nominal_case_size: Option<u32>,
196 /// Compression type, if any,
197 pub compression: Option<Compression>,
199 /// 1-based variable index of the weight variable, or `None` if the file is
201 pub weight_index: Option<u32>,
203 /// Claimed number of cases, if known.
204 pub n_cases: Option<u32>,
206 /// Compression bias, usually 100.0.
209 /// `dd mmm yy` in the file's encoding.
210 pub creation_date: UnencodedStr<9>,
212 /// `HH:MM:SS` in the file's encoding.
213 pub creation_time: UnencodedStr<8>,
215 /// File label, in the file's encoding. Padded on the right with spaces.
216 pub file_label: UnencodedStr<64>,
218 /// Endianness of the data in the file header.
223 fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
224 writeln!(f, "{name:>17}: {:?}", value)
228 impl Debug for HeaderRecord {
229 fn fmt(&self, f: &mut Formatter) -> FmtResult {
230 writeln!(f, "File header record:")?;
231 self.debug_field(f, "Magic", self.magic)?;
232 self.debug_field(f, "Product name", &self.eye_catcher)?;
233 self.debug_field(f, "Layout code", self.layout_code)?;
234 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
235 self.debug_field(f, "Compression", self.compression)?;
236 self.debug_field(f, "Weight index", self.weight_index)?;
237 self.debug_field(f, "Number of cases", self.n_cases)?;
238 self.debug_field(f, "Compression bias", self.bias)?;
239 self.debug_field(f, "Creation date", &self.creation_date)?;
240 self.debug_field(f, "Creation time", &self.creation_time)?;
241 self.debug_field(f, "File label", &self.file_label)?;
242 self.debug_field(f, "Endianness", self.endian)
247 fn read<R: Read>(r: &mut R) -> Result<HeaderRecord, Error> {
248 let magic: [u8; 4] = read_bytes(r)?;
249 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
251 let eye_catcher = UnencodedStr::<60>(read_bytes(r)?);
252 let layout_code: [u8; 4] = read_bytes(r)?;
253 let endian = Endian::identify_u32(2, layout_code)
254 .or_else(|| Endian::identify_u32(2, layout_code))
255 .ok_or_else(|| Error::NotASystemFile)?;
256 let layout_code = endian.parse(layout_code);
258 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
259 let nominal_case_size =
260 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
262 let compression_code: u32 = endian.parse(read_bytes(r)?);
263 let compression = match (magic, compression_code) {
264 (Magic::ZSAV, 2) => Some(Compression::ZLib),
265 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
267 (_, 1) => Some(Compression::Simple),
268 (_, code) => return Err(Error::InvalidSavCompression(code)),
271 let weight_index: u32 = endian.parse(read_bytes(r)?);
272 let weight_index = (weight_index > 0).then_some(weight_index);
274 let n_cases: u32 = endian.parse(read_bytes(r)?);
275 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
277 let bias: f64 = endian.parse(read_bytes(r)?);
279 let creation_date = UnencodedStr::<9>(read_bytes(r)?);
280 let creation_time = UnencodedStr::<8>(read_bytes(r)?);
281 let file_label = UnencodedStr::<64>(read_bytes(r)?);
282 let _: [u8; 3] = read_bytes(r)?;
301 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
302 pub struct Magic([u8; 4]);
305 /// Magic number for a regular system file.
306 pub const SAV: Magic = Magic(*b"$FL2");
308 /// Magic number for a system file that contains zlib-compressed data.
309 pub const ZSAV: Magic = Magic(*b"$FL3");
/// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
313 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
316 impl Debug for Magic {
317 fn fmt(&self, f: &mut Formatter) -> FmtResult {
319 &Magic::SAV => "$FL2",
320 &Magic::ZSAV => "$FL3",
321 &Magic::EBCDIC => "($FL2 in EBCDIC)",
322 _ => return write!(f, "{:?}", self.0),
328 impl TryFrom<[u8; 4]> for Magic {
331 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
332 let magic = Magic(value);
334 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
335 _ => Err(Error::BadMagic(value)),
340 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
347 fn from_width(width: i32) -> VarType {
349 0 => VarType::Numeric,
350 _ => VarType::String,
357 Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader,
358 ZTrailer, ZlibDecodeMultiple,
360 use crate::endian::Endian;
362 collections::VecDeque,
367 #[allow(clippy::type_complexity)]
368 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
371 struct Start<R: Read + Seek> {
375 pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
376 Box::new(Start { reader })
379 struct CommonState<R: Read + Seek> {
383 compression: Option<Compression>,
384 var_types: Vec<VarType>,
387 impl<R: Read + Seek + 'static> State for Start<R> {
388 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
389 let header = HeaderRecord::read(&mut self.reader)?;
390 let next_state = Headers(CommonState {
392 endian: header.endian,
394 compression: header.compression,
395 var_types: Vec::new(),
397 Ok(Some((Record::Header(header), Box::new(next_state))))
401 struct Headers<R: Read + Seek>(CommonState<R>);
403 impl<R: Read + Seek + 'static> State for Headers<R> {
404 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
405 let record = Record::read(&mut self.0.reader, self.0.endian)?;
407 Record::Variable(VariableRecord { width, .. }) => {
408 self.0.var_types.push(VarType::from_width(width));
410 Record::EndOfHeaders(_) => {
411 let next_state: Box<dyn State> = match self.0.compression {
412 None => Box::new(Data(self.0)),
413 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
414 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
416 return Ok(Some((record, next_state)));
420 Ok(Some((record, self)))
424 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
426 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
427 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
428 let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
429 Ok(Some((Record::ZHeader(zheader), self)))
433 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
435 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
436 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
437 let retval = ZTrailer::read(
440 self.1.ztrailer_offset,
443 let next_state = Box::new(CompressedData::new(CommonState {
444 reader: ZlibDecodeMultiple::new(self.0.reader),
445 endian: self.0.endian,
447 compression: self.0.compression,
448 var_types: self.0.var_types,
451 None => next_state.read(),
452 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
457 struct Data<R: Read + Seek>(CommonState<R>);
459 impl<R: Read + Seek + 'static> State for Data<R> {
460 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
461 match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
463 Some(values) => Ok(Some((Record::Case(values), self))),
468 struct CompressedData<R: Read + Seek> {
469 common: CommonState<R>,
473 impl<R: Read + Seek + 'static> CompressedData<R> {
474 fn new(common: CommonState<R>) -> CompressedData<R> {
477 codes: VecDeque::new(),
482 impl<R: Read + Seek + 'static> State for CompressedData<R> {
483 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
484 match Value::read_compressed_case(
485 &mut self.common.reader,
486 &self.common.var_types,
492 Some(values) => Ok(Some((Record::Case(values), self))),
498 #[derive(Copy, Clone)]
501 String(UnencodedStr<8>),
504 impl Debug for Value {
505 fn fmt(&self, f: &mut Formatter) -> FmtResult {
507 Value::Number(Some(number)) => write!(f, "{number:?}"),
508 Value::Number(None) => write!(f, "SYSMIS"),
509 Value::String(bytes) => write!(f, "{:?}", bytes),
515 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
517 UntypedValue(read_bytes(r)?),
523 pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value {
525 VarType::String => Value::String(UnencodedStr(raw.0)),
526 VarType::Numeric => {
527 let number: f64 = endian.parse(raw.0);
528 Value::Number((number != -f64::MAX).then_some(number))
533 fn read_case<R: Read + Seek>(
535 var_types: &[VarType],
537 ) -> Result<Option<Vec<Value>>, Error> {
538 let case_start = reader.stream_position()?;
539 let mut values = Vec::with_capacity(var_types.len());
540 for (i, &var_type) in var_types.iter().enumerate() {
541 let Some(raw) = try_read_bytes(reader)? else {
545 let offset = reader.stream_position()?;
546 return Err(Error::EofInCase {
548 case_ofs: offset - case_start,
549 case_len: var_types.len() * 8,
553 values.push(Value::from_raw(UntypedValue(raw), var_type, endian));
558 fn read_compressed_case<R: Read + Seek>(
560 var_types: &[VarType],
561 codes: &mut VecDeque<u8>,
564 ) -> Result<Option<Vec<Value>>, Error> {
565 let case_start = reader.stream_position()?;
566 let mut values = Vec::with_capacity(var_types.len());
567 for (i, &var_type) in var_types.iter().enumerate() {
569 let Some(code) = codes.pop_front() else {
570 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
574 let offset = reader.stream_position()?;
575 return Err(Error::EofInCompressedCase {
577 case_ofs: offset - case_start,
581 codes.extend(new_codes.into_iter());
586 1..=251 => match var_type {
587 VarType::Numeric => break Value::Number(Some(code as f64 - bias)),
589 break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias)))
596 let offset = reader.stream_position()?;
597 return Err(Error::PartialCompressedCase {
599 case_ofs: offset - case_start,
604 break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian)
606 254 => match var_type {
607 VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
608 VarType::Numeric => {
609 return Err(Error::CompressedStringExpected {
611 case_ofs: reader.stream_position()? - case_start,
615 255 => match var_type {
616 VarType::Numeric => break Value::Number(None),
618 return Err(Error::CompressedNumberExpected {
620 case_ofs: reader.stream_position()? - case_start,
632 struct ZlibDecodeMultiple<R>
636 reader: Option<ZlibDecoder<R>>,
639 impl<R> ZlibDecodeMultiple<R>
643 fn new(reader: R) -> ZlibDecodeMultiple<R> {
645 reader: Some(ZlibDecoder::new(reader)),
650 impl<R> Read for ZlibDecodeMultiple<R>
654 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
656 match self.reader.as_mut().unwrap().read(buf)? {
658 let inner = self.reader.take().unwrap().into_inner();
659 self.reader = Some(ZlibDecoder::new(inner));
667 impl<R> Seek for ZlibDecodeMultiple<R>
671 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
672 self.reader.as_mut().unwrap().get_mut().seek(pos)
677 state: Option<Box<dyn State>>,
681 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
683 state: Some(state::new(reader)),
686 pub fn collect_headers(&mut self) -> Result<Vec<Record>, Error> {
687 let mut headers = Vec::new();
690 Record::EndOfHeaders(_) => break,
691 r => headers.push(r),
698 impl Iterator for Reader {
699 type Item = Result<Record, Error>;
701 fn next(&mut self) -> Option<Self::Item> {
702 match self.state.take()?.read() {
703 Ok(Some((record, next_state))) => {
704 self.state = Some(next_state);
708 Err(error) => Some(Err(error)),
713 impl FusedIterator for Reader {}
715 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
716 pub struct Spec(pub u32);
718 impl Debug for Spec {
719 fn fmt(&self, f: &mut Formatter) -> FmtResult {
720 let type_ = format_name(self.0 >> 16);
721 let w = (self.0 >> 8) & 0xff;
722 let d = self.0 & 0xff;
723 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
727 fn format_name(type_: u32) -> Cow<'static, str> {
766 _ => return format!("<unknown format {type_}>").into(),
772 pub struct MissingValues {
773 /// Individual missing values, up to 3 of them.
774 pub values: Vec<Value>,
776 /// Optional range of missing values.
777 pub range: Option<(Value, Value)>,
780 impl Debug for MissingValues {
781 fn fmt(&self, f: &mut Formatter) -> FmtResult {
782 for (i, value) in self.values.iter().enumerate() {
786 write!(f, "{value:?}")?;
789 if let Some((low, high)) = self.range {
790 if !self.values.is_empty() {
793 write!(f, "{low:?} THRU {high:?}")?;
805 fn is_empty(&self) -> bool {
806 self.values.is_empty() && self.range.is_none()
809 fn read<R: Read + Seek>(
815 ) -> Result<MissingValues, Error> {
816 let (n_values, has_range) = match (width, code) {
817 (_, 0..=3) => (code, false),
818 (0, -2) => (0, true),
819 (0, -3) => (1, true),
820 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
821 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
824 let var_type = VarType::from_width(width);
826 let mut values = Vec::new();
827 for _ in 0..n_values {
828 values.push(Value::read(r, var_type, endian)?);
830 let range = if has_range {
831 let low = Value::read(r, var_type, endian)?;
832 let high = Value::read(r, var_type, endian)?;
837 Ok(MissingValues { values, range })
842 pub struct VariableRecord {
843 /// Offset from the start of the file to the start of the record.
846 /// Variable width, in the range -1..=255.
849 /// Variable name, padded on the right with spaces.
850 pub name: UnencodedStr<8>,
853 pub print_format: Spec,
856 pub write_format: Spec,
859 pub missing_values: MissingValues,
861 /// Optional variable label.
862 pub label: Option<UnencodedString>,
865 impl Debug for VariableRecord {
866 fn fmt(&self, f: &mut Formatter) -> FmtResult {
873 } else if self.width == 0 {
876 "long string continuation record"
879 writeln!(f, "Print format: {:?}", self.print_format)?;
880 writeln!(f, "Write format: {:?}", self.write_format)?;
881 writeln!(f, "Name: {:?}", &self.name)?;
882 writeln!(f, "Variable label: {:?}", self.label)?;
883 writeln!(f, "Missing values: {:?}", self.missing_values)
887 impl VariableRecord {
888 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VariableRecord, Error> {
889 let offset = r.stream_position()?;
890 let width: i32 = endian.parse(read_bytes(r)?);
891 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
892 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
893 let print_format = Spec(endian.parse(read_bytes(r)?));
894 let write_format = Spec(endian.parse(read_bytes(r)?));
895 let name = UnencodedStr::<8>(read_bytes(r)?);
897 let label = match has_variable_label {
900 let len: u32 = endian.parse(read_bytes(r)?);
901 let read_len = len.min(65535) as usize;
902 let label = UnencodedString(read_vec(r, read_len)?);
904 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
905 let _ = read_vec(r, padding_bytes as usize)?;
910 return Err(Error::BadVariableLabelCode {
912 code: has_variable_label,
917 let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
931 #[derive(Copy, Clone)]
932 pub struct UntypedValue(pub [u8; 8]);
934 impl Debug for UntypedValue {
935 fn fmt(&self, f: &mut Formatter) -> FmtResult {
936 let little: f64 = Endian::Little.parse(self.0);
937 let little = format!("{:?}", little);
938 let big: f64 = Endian::Big.parse(self.0);
939 let big = format!("{:?}", big);
940 let number = if little.len() <= big.len() {
945 write!(f, "{number}")?;
947 let string = default_decode(&self.0);
949 .split(|c: char| c == '\0' || c.is_control())
952 write!(f, "{string:?}")?;
958 pub struct UnencodedString(pub Vec<u8>);
960 impl From<Vec<u8>> for UnencodedString {
961 fn from(source: Vec<u8>) -> Self {
966 impl From<&[u8]> for UnencodedString {
967 fn from(source: &[u8]) -> Self {
972 impl Debug for UnencodedString {
973 fn fmt(&self, f: &mut Formatter) -> FmtResult {
974 write!(f, "{:?}", default_decode(self.0.as_slice()))
978 #[derive(Copy, Clone)]
979 pub struct UnencodedStr<const N: usize>(pub [u8; N]);
981 impl<const N: usize> From<[u8; N]> for UnencodedStr<N> {
982 fn from(source: [u8; N]) -> Self {
987 impl<const N: usize> Debug for UnencodedStr<N> {
988 fn fmt(&self, f: &mut Formatter) -> FmtResult {
989 write!(f, "{:?}", default_decode(&self.0))
994 pub struct ValueLabelRecord {
995 /// Offset from the start of the file to the start of the record.
999 pub labels: Vec<(UntypedValue, UnencodedString)>,
1002 impl Debug for ValueLabelRecord {
1003 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1004 for (value, label) in self.labels.iter() {
1005 writeln!(f, "{value:?}: {label:?}")?;
1011 impl ValueLabelRecord {
1012 /// Maximum number of value labels in a record.
1013 pub const MAX: u32 = u32::MAX / 8;
1015 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabelRecord, Error> {
1016 let offset = r.stream_position()?;
1017 let n: u32 = endian.parse(read_bytes(r)?);
1018 if n > ValueLabelRecord::MAX {
1019 return Err(Error::BadNumberOfValueLabels {
1022 max: ValueLabelRecord::MAX,
1026 let mut labels = Vec::new();
1028 let value = UntypedValue(read_bytes(r)?);
1029 let label_len: u8 = endian.parse(read_bytes(r)?);
1030 let label_len = label_len as usize;
1031 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1033 let mut label = read_vec(r, padded_len - 1)?;
1034 label.truncate(label_len);
1035 labels.push((value, UnencodedString(label)));
1037 Ok(ValueLabelRecord { offset, labels })
1042 pub struct VarIndexRecord {
1043 /// Offset from the start of the file to the start of the record.
1046 /// The 1-based indexes of the variable indexes.
1047 pub dict_indexes: Vec<u32>,
1050 impl Debug for VarIndexRecord {
1051 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1052 write!(f, "apply to variables")?;
1053 for dict_index in self.dict_indexes.iter() {
1054 write!(f, " #{dict_index}")?;
1060 impl VarIndexRecord {
1061 /// Maximum number of variable indexes in a record.
1062 pub const MAX: u32 = u32::MAX / 8;
1064 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexRecord, Error> {
1065 let offset = r.stream_position()?;
1066 let n: u32 = endian.parse(read_bytes(r)?);
1067 if n > VarIndexRecord::MAX {
1068 return Err(Error::BadNumberOfVarIndexes {
1071 max: VarIndexRecord::MAX,
1074 let mut dict_indexes = Vec::with_capacity(n as usize);
1076 dict_indexes.push(endian.parse(read_bytes(r)?));
1086 #[derive(Clone, Debug)]
1087 pub struct DocumentRecord {
1088 /// Offset from the start of the file to the start of the record.
1091 /// The document, as an array of 80-byte lines.
1092 pub lines: Vec<DocumentLine>,
1095 pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>;
1097 impl DocumentRecord {
1098 /// Length of a line in a document. Document lines are fixed-length and
1099 /// padded on the right with spaces.
1100 pub const LINE_LEN: usize = 80;
1102 /// Maximum number of lines we will accept in a document. This is simply
1103 /// the maximum number that will fit in a 32-bit space.
1104 pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN;
1106 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<DocumentRecord, Error> {
1107 let offset = r.stream_position()?;
1108 let n: u32 = endian.parse(read_bytes(r)?);
1110 if n > Self::MAX_LINES {
1111 Err(Error::BadDocumentLength {
1114 max: Self::MAX_LINES,
1117 let pos = r.stream_position()?;
1118 let mut lines = Vec::with_capacity(n);
1120 lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
1122 Ok(DocumentRecord { pos, lines })
1127 trait ExtensionRecord
1132 const SIZE: Option<u32>;
1133 const COUNT: Option<u32>;
1134 const NAME: &'static str;
1135 fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
1138 #[derive(Clone, Debug)]
1139 pub struct IntegerInfoRecord {
1140 pub version: (i32, i32, i32),
1141 pub machine_code: i32,
1142 pub floating_point_rep: i32,
1143 pub compression_code: i32,
1144 pub endianness: i32,
1145 pub character_code: i32,
1148 impl ExtensionRecord for IntegerInfoRecord {
1149 const SUBTYPE: u32 = 3;
1150 const SIZE: Option<u32> = Some(4);
1151 const COUNT: Option<u32> = Some(8);
1152 const NAME: &'static str = "integer record";
1154 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1155 ext.check_size::<Self>()?;
1157 let mut input = &ext.data[..];
1158 let data: Vec<i32> = (0..8)
1159 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1161 Ok(IntegerInfoRecord {
1162 version: (data[0], data[1], data[2]),
1163 machine_code: data[3],
1164 floating_point_rep: data[4],
1165 compression_code: data[5],
1166 endianness: data[6],
1167 character_code: data[7],
1172 #[derive(Clone, Debug)]
1173 pub struct FloatInfoRecord {
1179 impl ExtensionRecord for FloatInfoRecord {
1180 const SUBTYPE: u32 = 4;
1181 const SIZE: Option<u32> = Some(8);
1182 const COUNT: Option<u32> = Some(3);
1183 const NAME: &'static str = "floating point record";
1185 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1186 ext.check_size::<Self>()?;
1188 let mut input = &ext.data[..];
1189 let data: Vec<f64> = (0..3)
1190 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1192 Ok(FloatInfoRecord {
1200 #[derive(Clone, Debug)]
1201 pub enum MultipleResponseType {
1203 value: UnencodedString,
1204 labels: CategoryLabels,
1208 #[derive(Clone, Debug)]
1209 pub struct MultipleResponseSet {
1210 pub name: UnencodedString,
1211 pub label: UnencodedString,
1212 pub mr_type: MultipleResponseType,
1213 pub vars: Vec<UnencodedString>,
1216 impl MultipleResponseSet {
1217 fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
1218 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1219 return Err(Error::TBD);
1221 let (name, input) = input.split_at(equals);
1222 let (mr_type, input) = match input.get(0) {
1223 Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
1225 let (value, input) = parse_counted_string(&input[1..])?;
1227 MultipleResponseType::MultipleDichotomy {
1228 value: value.into(),
1229 labels: CategoryLabels::VarLabels,
1235 let Some(b' ') = input.get(1) else {
1236 return Err(Error::TBD);
1238 let input = &input[2..];
1239 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1240 (CategoryLabels::CountedValues, rest)
1241 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1242 (CategoryLabels::VarLabels, rest)
1244 return Err(Error::TBD);
1246 let (value, input) = parse_counted_string(input)?;
1248 MultipleResponseType::MultipleDichotomy {
1249 value: value.into(),
1255 _ => return Err(Error::TBD),
1257 let Some(b' ') = input.get(0) else {
1258 return Err(Error::TBD);
1260 let (label, mut input) = parse_counted_string(&input[1..])?;
1261 let mut vars = Vec::new();
1262 while input.get(0) == Some(&b' ') {
1263 input = &input[1..];
1264 let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
1265 return Err(Error::TBD);
1268 vars.push(input[..length].into());
1270 input = &input[length..];
1272 if input.get(0) != Some(&b'\n') {
1273 return Err(Error::TBD);
1275 while input.get(0) == Some(&b'\n') {
1276 input = &input[1..];
1279 MultipleResponseSet {
1281 label: label.into(),
1290 #[derive(Clone, Debug)]
1291 pub struct MultipleResponseRecord(Vec<MultipleResponseSet>);
1293 impl ExtensionRecord for MultipleResponseRecord {
1294 const SUBTYPE: u32 = 7;
1295 const SIZE: Option<u32> = Some(1);
1296 const COUNT: Option<u32> = None;
1297 const NAME: &'static str = "multiple response set record";
1299 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1300 ext.check_size::<Self>()?;
1302 let mut input = &ext.data[..];
1303 let mut sets = Vec::new();
1304 while !input.is_empty() {
1305 let (set, rest) = MultipleResponseSet::parse(input)?;
1309 Ok(MultipleResponseRecord(sets))
1313 fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
1314 let Some(space) = input.iter().position(|&b| b == b' ') else {
1315 return Err(Error::TBD);
1317 let Ok(length) = from_utf8(&input[..space]) else {
1318 return Err(Error::TBD);
1320 let Ok(length): Result<usize, _> = length.parse() else {
1321 return Err(Error::TBD);
1324 let input = &input[space + 1..];
1325 if input.len() < length {
1326 return Err(Error::TBD);
1329 let (string, rest) = input.split_at(length);
1330 Ok((string.into(), rest))
1333 #[derive(Clone, Debug)]
1334 pub struct VarDisplayRecord(pub Vec<u32>);
1336 impl ExtensionRecord for VarDisplayRecord {
1337 const SUBTYPE: u32 = 11;
1338 const SIZE: Option<u32> = Some(4);
1339 const COUNT: Option<u32> = None;
1340 const NAME: &'static str = "variable display record";
1342 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1343 ext.check_size::<Self>()?;
1345 let mut input = &ext.data[..];
1346 let display = (0..ext.count)
1347 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1349 Ok(VarDisplayRecord(display))
1353 pub struct LongStringMissingValues {
1355 pub var_name: UnencodedString,
1358 pub missing_values: MissingValues,
1361 pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
1363 impl ExtensionRecord for LongStringMissingValueSet {
1364 const SUBTYPE: u32 = 22;
1365 const SIZE: Option<u32> = Some(1);
1366 const COUNT: Option<u32> = None;
1367 const NAME: &'static str = "long string missing values record";
1369 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1370 ext.check_size::<Self>()?;
1372 let mut input = &ext.data[..];
1373 let mut missing_value_set = Vec::new();
1374 while !input.is_empty() {
1375 let var_name = read_string(&mut input, endian)?;
1376 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1377 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1379 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset;
1380 return Err(Error::BadLongMissingValueLength {
1381 record_offset: ext.offset,
1386 let mut values = Vec::new();
1387 for i in 0..n_missing_values {
1388 let value: [u8; 8] = read_bytes(&mut input)?;
1389 let numeric_value: u64 = endian.parse(value);
1390 let value = if i > 0 && numeric_value == 8 {
1391 // Tolerate files written by old, buggy versions of PSPP
1392 // where we believed that the value_length was repeated
1393 // before each missing value.
1394 read_bytes(&mut input)?
1398 values.push(Value::String(UnencodedStr(value)));
1400 let missing_values = MissingValues {
1404 missing_value_set.push(LongStringMissingValues {
1409 Ok(LongStringMissingValueSet(missing_value_set))
/// Encoding record (extension subtype 20): the record's data decoded as a
/// UTF-8 string.
#[derive(Clone, Debug)]
pub struct EncodingRecord(pub String);
1416 impl ExtensionRecord for EncodingRecord {
1417 const SUBTYPE: u32 = 20;
1418 const SIZE: Option<u32> = Some(1);
1419 const COUNT: Option<u32> = None;
1420 const NAME: &'static str = "encoding record";
1422 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1423 ext.check_size::<Self>()?;
1426 String::from_utf8(ext.data.clone())
1427 .map_err(|_| Error::BadEncodingName { offset: ext.offset })?,
/// Extended number of cases record (extension subtype 16): two 64-bit values.
#[derive(Clone, Debug)]
pub struct NumberOfCasesRecord {
    /// Always observed as 1.
    pub one: u64,

    /// Number of cases.
    pub n_cases: u64,
}
1441 impl ExtensionRecord for NumberOfCasesRecord {
1442 const SUBTYPE: u32 = 16;
1443 const SIZE: Option<u32> = Some(8);
1444 const COUNT: Option<u32> = Some(2);
1445 const NAME: &'static str = "extended number of cases record";
1447 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1448 ext.check_size::<Self>()?;
1450 let mut input = &ext.data[..];
1451 let one = endian.parse(read_bytes(&mut input)?);
1452 let n_cases = endian.parse(read_bytes(&mut input)?);
1454 Ok(NumberOfCasesRecord { one, n_cases })
/// A raw extension record: its header fields plus the undecoded data bytes.
#[derive(Clone, Debug)]
pub struct Extension {
    /// Offset from the start of the file to the start of the record.
    pub offset: u64,

    /// Record subtype.
    pub subtype: u32,

    /// Size of each data element.
    pub size: u32,

    /// Number of data elements.
    pub count: u32,

    /// `size * count` bytes of data.
    pub data: Vec<u8>,
}
1477 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
1478 if let Some(expected_size) = E::SIZE {
1479 if self.size != expected_size {
1480 return Err(Error::BadRecordSize {
1481 offset: self.offset,
1482 record: E::NAME.into(),
1488 if let Some(expected_count) = E::COUNT {
1489 if self.count != expected_count {
1490 return Err(Error::BadRecordCount {
1491 offset: self.offset,
1492 record: E::NAME.into(),
1501 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1502 let subtype = endian.parse(read_bytes(r)?);
1503 let offset = r.stream_position()?;
1504 let size: u32 = endian.parse(read_bytes(r)?);
1505 let count = endian.parse(read_bytes(r)?);
1506 let Some(product) = size.checked_mul(count) else {
1507 return Err(Error::ExtensionRecordTooLarge {
1514 let offset = r.stream_position()?;
1515 let data = read_vec(r, product as usize)?;
1516 let extension = Extension {
1524 IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse(
1529 FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse(
1534 VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse(
1539 MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse(
1540 MultipleResponseRecord::parse(&extension, endian, |_| ())?,
1542 LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels(
1543 LongStringValueLabelRecord::parse(&extension, endian, |_| ())?,
1545 EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse(
1550 NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse(
1555 5 => Ok(Record::VariableSets(UnencodedString(extension.data))),
1556 10 => Ok(Record::ProductInfo(UnencodedString(extension.data))),
1557 13 => Ok(Record::LongNames(UnencodedString(extension.data))),
1558 14 => Ok(Record::LongStrings(UnencodedString(extension.data))),
1559 17 => Ok(Record::FileAttributes(UnencodedString(extension.data))),
1560 18 => Ok(Record::VariableAttributes(UnencodedString(extension.data))),
1561 _ => Ok(Record::OtherExtension(extension)),
/// ZLIB data header: where the header itself and the ZLIB trailer are located
/// in the file.
#[derive(Clone, Debug)]
pub struct ZHeader {
    /// File offset to the start of the record.
    pub offset: u64,

    /// File offset to the ZLIB data header.
    pub zheader_offset: u64,

    /// File offset to the ZLIB trailer.
    pub ztrailer_offset: u64,

    /// Length of the ZLIB trailer in bytes.
    pub ztrailer_len: u64,
}
1582 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
1583 let offset = r.stream_position()?;
1584 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
1585 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
1586 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
/// ZLIB trailer: compression parameters followed by one descriptor per
/// compressed data block.
#[derive(Clone, Debug)]
pub struct ZTrailer {
    /// File offset to the start of the record.
    pub offset: u64,

    /// Compression bias as a negative integer, e.g. -100.
    pub int_bias: i64,

    /// Always observed as zero.
    pub zero: u64,

    /// Uncompressed size of each block, except possibly the last.  Only
    /// `0x3ff000` has been observed so far.
    pub block_size: u32,

    /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
    pub blocks: Vec<ZBlock>,
}
/// Descriptor for one compressed data block, as listed in the ZLIB trailer.
#[derive(Clone, Debug)]
pub struct ZBlock {
    /// Offset of block of data if simple compression were used.
    pub uncompressed_ofs: u64,

    /// Actual offset within the file of the compressed data block.
    pub compressed_ofs: u64,

    /// The number of bytes in this data block after decompression.  This is
    /// `block_size` in every data block but the last, which may be smaller.
    pub uncompressed_size: u32,

    /// The number of bytes in this data block, as stored compressed in this
    /// file.
    pub compressed_size: u32,
}
1634 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
1636 uncompressed_ofs: endian.parse(read_bytes(r)?),
1637 compressed_ofs: endian.parse(read_bytes(r)?),
1638 uncompressed_size: endian.parse(read_bytes(r)?),
1639 compressed_size: endian.parse(read_bytes(r)?),
1645 fn read<R: Read + Seek>(
1650 ) -> Result<Option<ZTrailer>, Error> {
1651 let start_offset = reader.stream_position()?;
1652 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
1655 let int_bias = endian.parse(read_bytes(reader)?);
1656 let zero = endian.parse(read_bytes(reader)?);
1657 let block_size = endian.parse(read_bytes(reader)?);
1658 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
1659 let expected_n_blocks = (ztrailer_len - 24) / 24;
1660 if n_blocks as u64 != expected_n_blocks {
1661 return Err(Error::BadZlibTrailerNBlocks {
1662 offset: ztrailer_ofs,
1668 let blocks = (0..n_blocks)
1669 .map(|_| ZBlock::read(reader, endian))
1670 .collect::<Result<Vec<_>, _>>()?;
1671 reader.seek(SeekFrom::Start(start_offset))?;
1673 offset: ztrailer_ofs,
/// Reads exactly `N` bytes from `r`, or reports a clean end of input.
///
/// Returns `Ok(None)` if the first read yields no bytes at all (end of
/// input), and `Ok(Some(bytes))` once all `N` bytes have been read.
///
/// # Errors
///
/// Returns an I/O error if reading fails, including
/// `ErrorKind::UnexpectedEof` when the input ends after some but not all of
/// the `N` bytes.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];
    // `Read::read` may fail with `Interrupted`, which is transient and must be
    // retried; `read_exact` below already does so internally.
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    };
    if n == 0 {
        return Ok(None);
    }
    if n < N {
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` and returns them as an array.
///
/// # Errors
///
/// Returns the error from `Read::read_exact`, which is
/// `ErrorKind::UnexpectedEof` if the input ends before `N` bytes are read.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
/// Reads exactly `n` bytes from `r` into a new vector.
///
/// `n` usually comes from a length field in the file being parsed, so it
/// cannot be trusted.  The buffer therefore grows as data actually arrives
/// instead of being allocated at full size up front, which keeps a corrupt
/// length from causing a huge allocation or a capacity-overflow panic.
///
/// # Errors
///
/// Returns an I/O error if reading fails, or `ErrorKind::UnexpectedEof` if the
/// input ends before `n` bytes have been read.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    /// Largest buffer reserved before any data has been seen.
    const MAX_PREALLOCATION: usize = 1 << 16;

    let mut vec = Vec::with_capacity(n.min(MAX_PREALLOCATION));
    // `take` guarantees that no more than `n` bytes are consumed from `r`.
    r.by_ref().take(n as u64).read_to_end(&mut vec)?;
    if vec.len() == n {
        Ok(vec)
    } else {
        Err(IoError::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to fill whole buffer",
        ))
    }
}
1707 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
1708 let length: u32 = endian.parse(read_bytes(r)?);
1709 Ok(read_vec(r, length as usize)?.into())
/// Value labels for a single variable in a long string value labels record.
#[derive(Clone, Debug)]
pub struct LongStringValueLabels {
    /// Variable name, as stored in the record.
    pub var_name: UnencodedString,

    /// Width field read from the record.
    pub width: u32,

    /// `(value, label)` pairs, where each value is `width` bytes.
    pub labels: Vec<(UnencodedString, UnencodedString)>,
}
/// Parsed contents of a long string value labels record (extension subtype
/// 21): one entry per variable listed in the record.
#[derive(Clone, Debug)]
pub struct LongStringValueLabelRecord(Vec<LongStringValueLabels>);
1724 impl ExtensionRecord for LongStringValueLabelRecord {
1725 const SUBTYPE: u32 = 21;
1726 const SIZE: Option<u32> = Some(1);
1727 const COUNT: Option<u32> = None;
1728 const NAME: &'static str = "long string value labels record";
1730 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1731 ext.check_size::<Self>()?;
1733 let mut input = &ext.data[..];
1734 let mut label_set = Vec::new();
1735 while !input.is_empty() {
1736 let var_name = read_string(&mut input, endian)?;
1737 let width: u32 = endian.parse(read_bytes(&mut input)?);
1738 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
1739 let mut labels = Vec::new();
1740 for _ in 0..n_labels {
1741 let value = read_string(&mut input, endian)?;
1742 let label = read_string(&mut input, endian)?;
1743 labels.push((value, label));
1745 label_set.push(LongStringValueLabels {
1751 Ok(LongStringValueLabelRecord(label_set))