1 use crate::endian::{Endian, Parse, ToBytes};
2 use crate::{CategoryLabels, Compression};
4 use encoding_rs::mem::decode_latin1;
5 use flate2::read::ZlibDecoder;
8 use std::fmt::{Debug, Formatter, Result as FmtResult};
9 use std::str::from_utf8;
11 collections::VecDeque,
12 io::{Error as IoError, Read, Seek, SeekFrom},
15 use thiserror::Error as ThisError;
17 use self::state::State;
19 #[derive(ThisError, Debug)]
21 #[error("Not an SPSS system file")]
24 #[error("Invalid magic number {0:?}")]
27 #[error("I/O error ({0})")]
30 #[error("Invalid SAV compression code {0}")]
31 InvalidSavCompression(u32),
33 #[error("Invalid ZSAV compression code {0}")]
34 InvalidZsavCompression(u32),
36 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
37 BadVariableWidth { offset: u64, width: i32 },
39 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
40 BadDocumentLength { offset: u64, n: usize, max: usize },
42 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
43 BadRecordType { offset: u64, rec_type: u32 },
45 #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
46 BadVariableLabelCode { offset: u64, code: u32 },
49 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
51 BadNumericMissingValueCode { offset: u64, code: i32 },
53 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
54 BadStringMissingValueCode { offset: u64, code: i32 },
56 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
57 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
59 #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
60 BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
62 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
63 ExtensionRecordTooLarge {
70 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
78 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
80 EofInCompressedCase { offset: u64, case_ofs: u64 },
82 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
83 PartialCompressedCase { offset: u64, case_ofs: u64 },
85 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
86 CompressedNumberExpected { offset: u64, case_ofs: u64 },
88 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
89 CompressedStringExpected { offset: u64, case_ofs: u64 },
91 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
92 BadZlibTrailerNBlocks {
95 expected_n_blocks: u64,
99 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
100 BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 },
102 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
103 BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 },
105 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
106 BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 },
108 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
109 BadEncodingName { offset: u64 },
111 #[error("Details TBD")]
115 #[derive(Clone, Debug)]
119 ValueLabel(ValueLabel),
120 VarIndexes(VarIndexes),
122 IntegerInfo(IntegerInfo),
123 FloatInfo(FloatInfo),
124 VariableSets(UnencodedString),
125 VarDisplay(VarDisplayRecord),
126 MultipleResponse(MultipleResponseRecord),
127 LongStringValueLabels(LongStringValueLabelRecord),
128 Encoding(EncodingRecord),
129 NumberOfCases(NumberOfCasesRecord),
130 ProductInfo(UnencodedString),
131 LongNames(UnencodedString),
132 LongStrings(UnencodedString),
133 FileAttributes(UnencodedString),
134 VariableAttributes(UnencodedString),
135 TextExtension(TextExtension),
136 OtherExtension(Extension),
144 fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
145 let rec_type: u32 = endian.parse(read_bytes(reader)?);
147 2 => Ok(Record::Variable(Variable::read(reader, endian)?)),
148 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)),
149 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)),
150 6 => Ok(Record::Document(Document::read(reader, endian)?)),
151 7 => Ok(Extension::read(reader, endian)?),
152 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
153 _ => Err(Error::BadRecordType {
154 offset: reader.stream_position()?,
161 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
162 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
163 fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> {
164 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
172 /// Eye-catcher string, product name, in the file's encoding. Padded
173 /// on the right with spaces.
174 pub eye_catcher: UnencodedStr<60>,
176 /// Layout code, normally either 2 or 3.
177 pub layout_code: u32,
179 /// Number of variable positions, or `None` if the value in the file is
180 /// questionably trustworthy.
181 pub nominal_case_size: Option<u32>,
/// Compression type, if any.
184 pub compression: Option<Compression>,
186 /// 0-based variable index of the weight variable, or `None` if the file is
188 pub weight_index: Option<u32>,
190 /// Claimed number of cases, if known.
191 pub n_cases: Option<u32>,
193 /// Compression bias, usually 100.0.
196 /// `dd mmm yy` in the file's encoding.
197 pub creation_date: UnencodedStr<9>,
199 /// `HH:MM:SS` in the file's encoding.
200 pub creation_time: UnencodedStr<8>,
202 /// File label, in the file's encoding. Padded on the right with spaces.
203 pub file_label: UnencodedStr<64>,
205 /// Endianness of the data in the file header.
210 fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
211 writeln!(f, "{name:>17}: {:?}", value)
215 impl Debug for Header {
216 fn fmt(&self, f: &mut Formatter) -> FmtResult {
217 writeln!(f, "File header record:")?;
218 self.debug_field(f, "Magic", self.magic)?;
219 self.debug_field(f, "Product name", &self.eye_catcher)?;
220 self.debug_field(f, "Layout code", self.layout_code)?;
221 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
222 self.debug_field(f, "Compression", self.compression)?;
223 self.debug_field(f, "Weight index", self.weight_index)?;
224 self.debug_field(f, "Number of cases", self.n_cases)?;
225 self.debug_field(f, "Compression bias", self.bias)?;
226 self.debug_field(f, "Creation date", &self.creation_date)?;
227 self.debug_field(f, "Creation time", &self.creation_time)?;
228 self.debug_field(f, "File label", &self.file_label)?;
229 self.debug_field(f, "Endianness", self.endian)
234 fn read<R: Read>(r: &mut R) -> Result<Header, Error> {
235 let magic: [u8; 4] = read_bytes(r)?;
236 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
238 let eye_catcher = UnencodedStr::<60>(read_bytes(r)?);
239 let layout_code: [u8; 4] = read_bytes(r)?;
240 let endian = Endian::identify_u32(2, layout_code)
241 .or_else(|| Endian::identify_u32(2, layout_code))
242 .ok_or_else(|| Error::NotASystemFile)?;
243 let layout_code = endian.parse(layout_code);
245 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
246 let nominal_case_size =
247 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
249 let compression_code: u32 = endian.parse(read_bytes(r)?);
250 let compression = match (magic, compression_code) {
251 (Magic::ZSAV, 2) => Some(Compression::ZLib),
252 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
254 (_, 1) => Some(Compression::Simple),
255 (_, code) => return Err(Error::InvalidSavCompression(code)),
258 let weight_index: u32 = endian.parse(read_bytes(r)?);
259 let weight_index = (weight_index > 0).then(|| weight_index - 1);
261 let n_cases: u32 = endian.parse(read_bytes(r)?);
262 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
264 let bias: f64 = endian.parse(read_bytes(r)?);
266 let creation_date = UnencodedStr::<9>(read_bytes(r)?);
267 let creation_time = UnencodedStr::<8>(read_bytes(r)?);
268 let file_label = UnencodedStr::<64>(read_bytes(r)?);
269 let _: [u8; 3] = read_bytes(r)?;
288 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
289 pub struct Magic([u8; 4]);
/// Magic number for a regular system file.
pub const SAV: Magic = Magic(*b"$FL2");

/// Magic number for a system file that contains zlib-compressed data.
pub const ZSAV: Magic = Magic(*b"$FL3");

/// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
/// in EBCDIC.
pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
303 impl Debug for Magic {
304 fn fmt(&self, f: &mut Formatter) -> FmtResult {
306 &Magic::SAV => "$FL2",
307 &Magic::ZSAV => "$FL3",
308 &Magic::EBCDIC => "($FL2 in EBCDIC)",
309 _ => return write!(f, "{:?}", self.0),
315 impl TryFrom<[u8; 4]> for Magic {
318 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
319 let magic = Magic(value);
321 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
322 _ => Err(Error::BadMagic(value)),
327 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
334 fn from_width(width: i32) -> VarType {
336 0 => VarType::Number,
337 _ => VarType::String,
344 Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer,
347 use crate::endian::Endian;
349 collections::VecDeque,
354 #[allow(clippy::type_complexity)]
355 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
358 struct Start<R: Read + Seek> {
362 pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
363 Box::new(Start { reader })
366 struct CommonState<R: Read + Seek> {
370 compression: Option<Compression>,
371 var_types: Vec<VarType>,
374 impl<R: Read + Seek + 'static> State for Start<R> {
375 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
376 let header = Header::read(&mut self.reader)?;
377 let next_state = Headers(CommonState {
379 endian: header.endian,
381 compression: header.compression,
382 var_types: Vec::new(),
384 Ok(Some((Record::Header(header), Box::new(next_state))))
388 struct Headers<R: Read + Seek>(CommonState<R>);
390 impl<R: Read + Seek + 'static> State for Headers<R> {
391 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
392 let record = Record::read(&mut self.0.reader, self.0.endian)?;
394 Record::Variable(Variable { width, .. }) => {
395 self.0.var_types.push(VarType::from_width(width));
397 Record::EndOfHeaders(_) => {
398 let next_state: Box<dyn State> = match self.0.compression {
399 None => Box::new(Data(self.0)),
400 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
401 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
403 return Ok(Some((record, next_state)));
407 Ok(Some((record, self)))
411 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
413 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
414 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
415 let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
416 Ok(Some((Record::ZHeader(zheader), self)))
420 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
422 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
423 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
424 let retval = ZTrailer::read(
427 self.1.ztrailer_offset,
430 let next_state = Box::new(CompressedData::new(CommonState {
431 reader: ZlibDecodeMultiple::new(self.0.reader),
432 endian: self.0.endian,
434 compression: self.0.compression,
435 var_types: self.0.var_types,
438 None => next_state.read(),
439 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
444 struct Data<R: Read + Seek>(CommonState<R>);
446 impl<R: Read + Seek + 'static> State for Data<R> {
447 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
448 match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
450 Some(values) => Ok(Some((Record::Case(values), self))),
455 struct CompressedData<R: Read + Seek> {
456 common: CommonState<R>,
460 impl<R: Read + Seek + 'static> CompressedData<R> {
461 fn new(common: CommonState<R>) -> CompressedData<R> {
464 codes: VecDeque::new(),
469 impl<R: Read + Seek + 'static> State for CompressedData<R> {
470 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
471 match Value::read_compressed_case(
472 &mut self.common.reader,
473 &self.common.var_types,
479 Some(values) => Ok(Some((Record::Case(values), self))),
485 #[derive(Copy, Clone)]
488 String(UnencodedStr<8>),
491 impl Debug for Value {
492 fn fmt(&self, f: &mut Formatter) -> FmtResult {
494 Value::Number(Some(number)) => write!(f, "{number:?}"),
495 Value::Number(None) => write!(f, "SYSMIS"),
496 Value::String(bytes) => write!(f, "{:?}", bytes),
502 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
503 Ok(Self::from_raw(var_type, read_bytes(r)?, endian))
506 pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
508 VarType::String => Value::String(UnencodedStr(raw)),
510 let number: f64 = endian.parse(raw);
511 Value::Number((number != -f64::MAX).then_some(number))
516 fn read_case<R: Read + Seek>(
518 var_types: &[VarType],
520 ) -> Result<Option<Vec<Value>>, Error> {
521 let case_start = reader.stream_position()?;
522 let mut values = Vec::with_capacity(var_types.len());
523 for (i, &var_type) in var_types.iter().enumerate() {
524 let Some(raw) = try_read_bytes(reader)? else {
528 let offset = reader.stream_position()?;
529 return Err(Error::EofInCase {
531 case_ofs: offset - case_start,
532 case_len: var_types.len() * 8,
536 values.push(Value::from_raw(var_type, raw, endian));
541 fn read_compressed_case<R: Read + Seek>(
543 var_types: &[VarType],
544 codes: &mut VecDeque<u8>,
547 ) -> Result<Option<Vec<Value>>, Error> {
548 let case_start = reader.stream_position()?;
549 let mut values = Vec::with_capacity(var_types.len());
550 for (i, &var_type) in var_types.iter().enumerate() {
552 let Some(code) = codes.pop_front() else {
553 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
557 let offset = reader.stream_position()?;
558 return Err(Error::EofInCompressedCase {
560 case_ofs: offset - case_start,
564 codes.extend(new_codes.into_iter());
569 1..=251 => match var_type {
570 VarType::Number => break Value::Number(Some(code as f64 - bias)),
572 break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias)))
579 let offset = reader.stream_position()?;
580 return Err(Error::PartialCompressedCase {
582 case_ofs: offset - case_start,
586 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian),
587 254 => match var_type {
588 VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
590 return Err(Error::CompressedStringExpected {
592 case_ofs: reader.stream_position()? - case_start,
596 255 => match var_type {
597 VarType::Number => break Value::Number(None),
599 return Err(Error::CompressedNumberExpected {
601 case_ofs: reader.stream_position()? - case_start,
613 struct ZlibDecodeMultiple<R>
617 reader: Option<ZlibDecoder<R>>,
620 impl<R> ZlibDecodeMultiple<R>
624 fn new(reader: R) -> ZlibDecodeMultiple<R> {
626 reader: Some(ZlibDecoder::new(reader)),
631 impl<R> Read for ZlibDecodeMultiple<R>
635 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
637 match self.reader.as_mut().unwrap().read(buf)? {
639 let inner = self.reader.take().unwrap().into_inner();
640 self.reader = Some(ZlibDecoder::new(inner));
648 impl<R> Seek for ZlibDecodeMultiple<R>
652 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
653 self.reader.as_mut().unwrap().get_mut().seek(pos)
658 state: Option<Box<dyn State>>,
662 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
664 state: Some(state::new(reader)),
667 pub fn collect_headers(&mut self) -> Result<Vec<Record>, Error> {
668 let mut headers = Vec::new();
671 Record::EndOfHeaders(_) => break,
672 r => headers.push(r),
679 impl Iterator for Reader {
680 type Item = Result<Record, Error>;
682 fn next(&mut self) -> Option<Self::Item> {
683 match self.state.take()?.read() {
684 Ok(Some((record, next_state))) => {
685 self.state = Some(next_state);
689 Err(error) => Some(Err(error)),
694 impl FusedIterator for Reader {}
696 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
697 pub struct Spec(pub u32);
699 impl Debug for Spec {
700 fn fmt(&self, f: &mut Formatter) -> FmtResult {
701 let type_ = format_name(self.0 >> 16);
702 let w = (self.0 >> 8) & 0xff;
703 let d = self.0 & 0xff;
704 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
708 fn format_name(type_: u32) -> Cow<'static, str> {
747 _ => return format!("<unknown format {type_}>").into()
752 pub struct MissingValues {
753 /// Individual missing values, up to 3 of them.
754 pub values: Vec<Value>,
756 /// Optional range of missing values.
757 pub range: Option<(Value, Value)>,
760 impl Debug for MissingValues {
761 fn fmt(&self, f: &mut Formatter) -> FmtResult {
762 for (i, value) in self.values.iter().enumerate() {
766 write!(f, "{value:?}")?;
769 if let Some((low, high)) = self.range {
770 if !self.values.is_empty() {
773 write!(f, "{low:?} THRU {high:?}")?;
785 fn is_empty(&self) -> bool {
786 self.values.is_empty() && self.range.is_none()
789 fn read<R: Read + Seek>(
795 ) -> Result<MissingValues, Error> {
796 let (n_values, has_range) = match (width, code) {
797 (_, 0..=3) => (code, false),
798 (0, -2) => (0, true),
799 (0, -3) => (1, true),
800 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
801 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
804 let var_type = VarType::from_width(width);
806 let mut values = Vec::new();
807 for _ in 0..n_values {
808 values.push(Value::read(r, var_type, endian)?);
810 let range = if has_range {
811 let low = Value::read(r, var_type, endian)?;
812 let high = Value::read(r, var_type, endian)?;
817 Ok(MissingValues { values, range })
822 pub struct Variable {
823 /// Offset from the start of the file to the start of the record.
826 /// Variable width, in the range -1..=255.
829 /// Variable name, padded on the right with spaces.
830 pub name: UnencodedStr<8>,
833 pub print_format: Spec,
836 pub write_format: Spec,
839 pub missing_values: MissingValues,
841 /// Optional variable label.
842 pub label: Option<UnencodedString>,
845 impl Debug for Variable {
846 fn fmt(&self, f: &mut Formatter) -> FmtResult {
853 } else if self.width == 0 {
856 "long string continuation record"
859 writeln!(f, "Print format: {:?}", self.print_format)?;
860 writeln!(f, "Write format: {:?}", self.write_format)?;
861 writeln!(f, "Name: {:?}", &self.name)?;
862 writeln!(f, "Variable label: {:?}", self.label)?;
863 writeln!(f, "Missing values: {:?}", self.missing_values)
868 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
869 let offset = r.stream_position()?;
870 let width: i32 = endian.parse(read_bytes(r)?);
871 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
872 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
873 let print_format = Spec(endian.parse(read_bytes(r)?));
874 let write_format = Spec(endian.parse(read_bytes(r)?));
875 let name = UnencodedStr::<8>(read_bytes(r)?);
877 let label = match has_variable_label {
880 let len: u32 = endian.parse(read_bytes(r)?);
881 let read_len = len.min(65535) as usize;
882 let label = UnencodedString(read_vec(r, read_len)?);
884 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
885 let _ = read_vec(r, padding_bytes as usize)?;
890 return Err(Error::BadVariableLabelCode {
892 code: has_variable_label,
897 let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
911 #[derive(Copy, Clone)]
912 pub struct UntypedValue(pub [u8; 8]);
914 impl Debug for UntypedValue {
915 fn fmt(&self, f: &mut Formatter) -> FmtResult {
916 let little: f64 = Endian::Little.parse(self.0);
917 let little = format!("{:?}", little);
918 let big: f64 = Endian::Big.parse(self.0);
919 let big = format!("{:?}", big);
920 let number = if little.len() <= big.len() {
925 write!(f, "{number}")?;
927 let string = default_decode(&self.0);
929 .split(|c: char| c == '\0' || c.is_control())
932 write!(f, "{string:?}")?;
938 pub struct UnencodedString(pub Vec<u8>);
940 impl From<Vec<u8>> for UnencodedString {
941 fn from(source: Vec<u8>) -> Self {
946 impl From<&[u8]> for UnencodedString {
947 fn from(source: &[u8]) -> Self {
952 impl Debug for UnencodedString {
953 fn fmt(&self, f: &mut Formatter) -> FmtResult {
954 write!(f, "{:?}", default_decode(self.0.as_slice()))
958 #[derive(Copy, Clone)]
959 pub struct UnencodedStr<const N: usize>(pub [u8; N]);
961 impl<const N: usize> From<[u8; N]> for UnencodedStr<N> {
962 fn from(source: [u8; N]) -> Self {
967 impl<const N: usize> Debug for UnencodedStr<N> {
968 fn fmt(&self, f: &mut Formatter) -> FmtResult {
969 write!(f, "{:?}", default_decode(&self.0))
974 pub struct ValueLabel {
975 /// Offset from the start of the file to the start of the record.
979 pub labels: Vec<(UntypedValue, UnencodedString)>,
982 impl Debug for ValueLabel {
983 fn fmt(&self, f: &mut Formatter) -> FmtResult {
984 for (value, label) in self.labels.iter() {
985 writeln!(f, "{value:?}: {label:?}")?;
992 /// Maximum number of value labels in a record.
993 pub const MAX: u32 = u32::MAX / 8;
995 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
996 let offset = r.stream_position()?;
997 let n: u32 = endian.parse(read_bytes(r)?);
998 if n > ValueLabel::MAX {
999 return Err(Error::BadNumberOfValueLabels {
1002 max: ValueLabel::MAX,
1006 let mut labels = Vec::new();
1008 let value = UntypedValue(read_bytes(r)?);
1009 let label_len: u8 = endian.parse(read_bytes(r)?);
1010 let label_len = label_len as usize;
1011 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1013 let mut label = read_vec(r, padded_len - 1)?;
1014 label.truncate(label_len);
1015 labels.push((value, UnencodedString(label)));
1017 Ok(ValueLabel { offset, labels })
1022 pub struct VarIndexes {
1023 /// Offset from the start of the file to the start of the record.
1026 /// The 0-based indexes of the variable indexes.
1027 pub var_indexes: Vec<u32>,
1030 impl Debug for VarIndexes {
1031 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1032 write!(f, "apply to variables")?;
1033 for var_index in self.var_indexes.iter() {
1034 write!(f, " #{var_index}")?;
1041 /// Maximum number of variable indexes in a record.
1042 pub const MAX: u32 = u32::MAX / 8;
1044 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
1045 let offset = r.stream_position()?;
1046 let n: u32 = endian.parse(read_bytes(r)?);
1047 if n > VarIndexes::MAX {
1048 return Err(Error::BadNumberOfVarIndexes {
1051 max: VarIndexes::MAX,
1054 let mut var_indexes = Vec::with_capacity(n as usize);
1056 var_indexes.push(endian.parse(read_bytes(r)?));
1066 #[derive(Clone, Debug)]
1067 pub struct Document {
1068 /// Offset from the start of the file to the start of the record.
1071 /// The document, as an array of 80-byte lines.
1072 pub lines: Vec<DocumentLine>
1075 pub type DocumentLine = UnencodedStr<{Document::LINE_LEN}>;
1078 /// Length of a line in a document. Document lines are fixed-length and
1079 /// padded on the right with spaces.
1080 pub const LINE_LEN: usize = 80;
1082 /// Maximum number of lines we will accept in a document. This is simply
1083 /// the maximum number that will fit in a 32-bit space.
1084 pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN;
1086 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
1087 let offset = r.stream_position()?;
1088 let n: u32 = endian.parse(read_bytes(r)?);
1090 if n > Self::MAX_LINES {
1091 Err(Error::BadDocumentLength {
1094 max: Self::MAX_LINES,
1097 let pos = r.stream_position()?;
1098 let mut lines = Vec::with_capacity(n);
1100 lines.push(UnencodedStr::<{Document::LINE_LEN}>(read_bytes(r)?));
1102 Ok(Document { pos, lines })
1107 trait ExtensionRecord
1112 const SIZE: Option<u32>;
1113 const COUNT: Option<u32>;
1114 const NAME: &'static str;
1115 fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
1118 #[derive(Clone, Debug)]
1119 pub struct IntegerInfo {
1120 pub version: (i32, i32, i32),
1121 pub machine_code: i32,
1122 pub floating_point_rep: i32,
1123 pub compression_code: i32,
1124 pub endianness: i32,
1125 pub character_code: i32,
1128 impl ExtensionRecord for IntegerInfo {
1129 const SUBTYPE: u32 = 3;
1130 const SIZE: Option<u32> = Some(4);
1131 const COUNT: Option<u32> = Some(8);
1132 const NAME: &'static str = "integer record";
1134 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1135 ext.check_size::<Self>()?;
1137 let mut input = &ext.data[..];
1138 let data: Vec<i32> = (0..8)
1139 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1142 version: (data[0], data[1], data[2]),
1143 machine_code: data[3],
1144 floating_point_rep: data[4],
1145 compression_code: data[5],
1146 endianness: data[6],
1147 character_code: data[7],
1152 #[derive(Clone, Debug)]
1153 pub struct FloatInfo {
1159 impl ExtensionRecord for FloatInfo {
1160 const SUBTYPE: u32 = 4;
1161 const SIZE: Option<u32> = Some(8);
1162 const COUNT: Option<u32> = Some(3);
1163 const NAME: &'static str = "floating point record";
1165 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1166 ext.check_size::<Self>()?;
1168 let mut input = &ext.data[..];
1169 let data: Vec<f64> = (0..3)
1170 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1180 #[derive(Clone, Debug)]
1181 pub enum MultipleResponseType {
1183 value: UnencodedString,
1184 labels: CategoryLabels,
1188 #[derive(Clone, Debug)]
1189 pub struct MultipleResponseSet {
1190 pub name: UnencodedString,
1191 pub label: UnencodedString,
1192 pub mr_type: MultipleResponseType,
1193 pub vars: Vec<UnencodedString>,
1196 impl MultipleResponseSet {
1197 fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
1198 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1199 return Err(Error::TBD);
1201 let (name, input) = input.split_at(equals);
1202 let (mr_type, input) = match input.get(0) {
1203 Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
1205 let (value, input) = parse_counted_string(&input[1..])?;
1207 MultipleResponseType::MultipleDichotomy {
1208 value: value.into(),
1209 labels: CategoryLabels::VarLabels,
1215 let Some(b' ') = input.get(1) else {
1216 return Err(Error::TBD);
1218 let input = &input[2..];
1219 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1220 (CategoryLabels::CountedValues, rest)
1221 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1222 (CategoryLabels::VarLabels, rest)
1224 return Err(Error::TBD);
1226 let (value, input) = parse_counted_string(input)?;
1228 MultipleResponseType::MultipleDichotomy {
1229 value: value.into(),
1235 _ => return Err(Error::TBD),
1237 let Some(b' ') = input.get(0) else {
1238 return Err(Error::TBD);
1240 let (label, mut input) = parse_counted_string(&input[1..])?;
1241 let mut vars = Vec::new();
1242 while input.get(0) == Some(&b' ') {
1243 input = &input[1..];
1244 let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
1245 return Err(Error::TBD);
1248 vars.push(input[..length].into());
1250 input = &input[length..];
1252 if input.get(0) != Some(&b'\n') {
1253 return Err(Error::TBD);
1255 while input.get(0) == Some(&b'\n') {
1256 input = &input[1..];
1259 MultipleResponseSet {
1261 label: label.into(),
1270 #[derive(Clone, Debug)]
1271 pub struct MultipleResponseRecord(Vec<MultipleResponseSet>);
1273 impl ExtensionRecord for MultipleResponseRecord {
1274 const SUBTYPE: u32 = 7;
1275 const SIZE: Option<u32> = Some(1);
1276 const COUNT: Option<u32> = None;
1277 const NAME: &'static str = "multiple response set record";
1279 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1280 ext.check_size::<Self>()?;
1282 let mut input = &ext.data[..];
1283 let mut sets = Vec::new();
1284 while !input.is_empty() {
1285 let (set, rest) = MultipleResponseSet::parse(input)?;
1289 Ok(MultipleResponseRecord(sets))
1293 fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
1294 let Some(space) = input.iter().position(|&b| b == b' ') else {
1295 return Err(Error::TBD);
1297 let Ok(length) = from_utf8(&input[..space]) else {
1298 return Err(Error::TBD);
1300 let Ok(length): Result<usize, _> = length.parse() else {
1301 return Err(Error::TBD);
1304 let input = &input[space + 1..];
1305 if input.len() < length {
1306 return Err(Error::TBD);
1309 let (string, rest) = input.split_at(length);
1310 Ok((string.into(), rest))
1313 #[derive(Clone, Debug)]
1314 pub struct VarDisplayRecord(pub Vec<u32>);
1316 impl ExtensionRecord for VarDisplayRecord {
1317 const SUBTYPE: u32 = 11;
1318 const SIZE: Option<u32> = Some(4);
1319 const COUNT: Option<u32> = None;
1320 const NAME: &'static str = "variable display record";
1322 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1323 ext.check_size::<Self>()?;
1325 let mut input = &ext.data[..];
1326 let display = (0..ext.count)
1327 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1329 Ok(VarDisplayRecord(display))
1333 pub struct LongStringMissingValues {
1335 pub var_name: UnencodedString,
1338 pub missing_values: MissingValues,
1341 pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
1343 impl ExtensionRecord for LongStringMissingValueSet {
1344 const SUBTYPE: u32 = 22;
1345 const SIZE: Option<u32> = Some(1);
1346 const COUNT: Option<u32> = None;
1347 const NAME: &'static str = "long string missing values record";
1349 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1350 ext.check_size::<Self>()?;
1352 let mut input = &ext.data[..];
1353 let mut missing_value_set = Vec::new();
1354 while !input.is_empty() {
1355 let var_name = read_string(&mut input, endian)?;
1356 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1357 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1359 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset;
1360 return Err(Error::BadLongMissingValueLength {
1361 record_offset: ext.offset,
1366 let mut values = Vec::new();
1367 for i in 0..n_missing_values {
1368 let value: [u8; 8] = read_bytes(&mut input)?;
1369 let numeric_value: u64 = endian.parse(value);
1370 let value = if i > 0 && numeric_value == 8 {
1371 // Tolerate files written by old, buggy versions of PSPP
1372 // where we believed that the value_length was repeated
1373 // before each missing value.
1374 read_bytes(&mut input)?
1378 values.push(Value::String(UnencodedStr(value)));
1380 let missing_values = MissingValues {
1384 missing_value_set.push(LongStringMissingValues {
1389 Ok(LongStringMissingValueSet(missing_value_set))
1393 #[derive(Clone, Debug)]
1394 pub struct EncodingRecord(pub String);
1396 impl ExtensionRecord for EncodingRecord {
1397 const SUBTYPE: u32 = 20;
1398 const SIZE: Option<u32> = Some(1);
1399 const COUNT: Option<u32> = None;
1400 const NAME: &'static str = "encoding record";
1402 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1403 ext.check_size::<Self>()?;
1406 String::from_utf8(ext.data.clone())
1407 .map_err(|_| Error::BadEncodingName { offset: ext.offset })?,
1413 #[derive(Clone, Debug)]
1414 pub struct NumberOfCasesRecord {
1415 /// Always observed as 1.
1418 /// Number of cases.
1422 impl ExtensionRecord for NumberOfCasesRecord {
1423 const SUBTYPE: u32 = 16;
1424 const SIZE: Option<u32> = Some(8);
1425 const COUNT: Option<u32> = Some(2);
1426 const NAME: &'static str = "extended number of cases record";
1428 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1429 ext.check_size::<Self>()?;
1431 let mut input = &ext.data[..];
1432 let one = endian.parse(read_bytes(&mut input)?);
1433 let n_cases = endian.parse(read_bytes(&mut input)?);
1435 Ok(NumberOfCasesRecord { one, n_cases })
1439 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
1440 pub enum TextExtensionSubtype {
1445 FileAttributes = 17,
1446 VariableAttributes = 18,
1449 #[derive(Clone, Debug)]
1450 pub struct TextExtension {
1451 pub subtype: TextExtensionSubtype,
1452 pub string: UnencodedString,
1455 #[derive(Clone, Debug)]
1456 pub struct Extension {
1457 /// Offset from the start of the file to the start of the record.
1463 /// Size of each data element.
1466 /// Number of data elements.
1469 /// `size * count` bytes of data.
1474 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
1475 if let Some(expected_size) = E::SIZE {
1476 if self.size != expected_size {
1477 return Err(Error::BadRecordSize {
1478 offset: self.offset,
1479 record: E::NAME.into(),
1485 if let Some(expected_count) = E::COUNT {
1486 if self.count != expected_count {
1487 return Err(Error::BadRecordCount {
1488 offset: self.offset,
1489 record: E::NAME.into(),
1498 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1499 let subtype = endian.parse(read_bytes(r)?);
1500 let offset = r.stream_position()?;
1501 let size: u32 = endian.parse(read_bytes(r)?);
1502 let count = endian.parse(read_bytes(r)?);
1503 let Some(product) = size.checked_mul(count) else {
1504 return Err(Error::ExtensionRecordTooLarge {
1511 let offset = r.stream_position()?;
1512 let data = read_vec(r, product as usize)?;
1513 let extension = Extension {
1521 IntegerInfo::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfo::parse(
1526 FloatInfo::SUBTYPE => Ok(Record::FloatInfo(FloatInfo::parse(
1531 VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse(
1536 MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse(
1537 MultipleResponseRecord::parse(&extension, endian, |_| ())?,
1539 LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels(
1540 LongStringValueLabelRecord::parse(&extension, endian, |_| ())?,
1542 EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse(
1547 NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse(
1552 x if x == TextExtensionSubtype::VariableSets as u32 => {
1553 Ok(Record::VariableSets(UnencodedString(extension.data)))
1555 x if x == TextExtensionSubtype::ProductInfo as u32 => {
1556 Ok(Record::ProductInfo(UnencodedString(extension.data)))
1558 x if x == TextExtensionSubtype::LongNames as u32 => {
1559 Ok(Record::LongNames(UnencodedString(extension.data)))
1561 x if x == TextExtensionSubtype::LongStrings as u32 => {
1562 Ok(Record::LongStrings(UnencodedString(extension.data)))
1564 x if x == TextExtensionSubtype::FileAttributes as u32 => {
1565 Ok(Record::FileAttributes(UnencodedString(extension.data)))
1567 x if x == TextExtensionSubtype::VariableAttributes as u32 => {
1568 Ok(Record::VariableAttributes(UnencodedString(extension.data)))
1570 _ => Ok(Record::OtherExtension(extension)),
1575 #[derive(Clone, Debug)]
1576 pub struct ZHeader {
1577 /// File offset to the start of the record.
1580 /// File offset to the ZLIB data header.
1581 pub zheader_offset: u64,
1583 /// File offset to the ZLIB trailer.
1584 pub ztrailer_offset: u64,
1586 /// Length of the ZLIB trailer in bytes.
1587 pub ztrailer_len: u64,
1591 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
1592 let offset = r.stream_position()?;
1593 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
1594 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
1595 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
1606 #[derive(Clone, Debug)]
1607 pub struct ZTrailer {
1608 /// File offset to the start of the record.
1611 /// Compression bias as a negative integer, e.g. -100.
1614 /// Always observed as zero.
1617 /// Uncompressed size of each block, except possibly the last. Only
1618 /// `0x3ff000` has been observed so far.
1619 pub block_size: u32,
1621 /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
1622 pub blocks: Vec<ZBlock>,
1625 #[derive(Clone, Debug)]
1627 /// Offset of block of data if simple compression were used.
1628 pub uncompressed_ofs: u64,
1630 /// Actual offset within the file of the compressed data block.
1631 pub compressed_ofs: u64,
1633 /// The number of bytes in this data block after decompression. This is
1634 /// `block_size` in every data block but the last, which may be smaller.
1635 pub uncompressed_size: u32,
1637 /// The number of bytes in this data block, as stored compressed in this
1639 pub compressed_size: u32,
1643 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
1645 uncompressed_ofs: endian.parse(read_bytes(r)?),
1646 compressed_ofs: endian.parse(read_bytes(r)?),
1647 uncompressed_size: endian.parse(read_bytes(r)?),
1648 compressed_size: endian.parse(read_bytes(r)?),
1654 fn read<R: Read + Seek>(
1659 ) -> Result<Option<ZTrailer>, Error> {
1660 let start_offset = reader.stream_position()?;
1661 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
1664 let int_bias = endian.parse(read_bytes(reader)?);
1665 let zero = endian.parse(read_bytes(reader)?);
1666 let block_size = endian.parse(read_bytes(reader)?);
1667 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
1668 let expected_n_blocks = (ztrailer_len - 24) / 24;
1669 if n_blocks as u64 != expected_n_blocks {
1670 return Err(Error::BadZlibTrailerNBlocks {
1671 offset: ztrailer_ofs,
1677 let blocks = (0..n_blocks)
1678 .map(|_| ZBlock::read(reader, endian))
1679 .collect::<Result<Vec<_>, _>>()?;
1680 reader.seek(SeekFrom::Start(start_offset))?;
1682 offset: ztrailer_ofs,
/// Reads exactly `N` bytes from `r`, or reports a clean end of input.
///
/// Returns `Ok(None)` when the very first read yields no bytes, and
/// `Ok(Some(bytes))` once all `N` bytes have been obtained.
///
/// # Errors
///
/// Returns the underlying I/O error.  In particular, input that ends after
/// at least one byte but before `N` bytes fails with the error produced by
/// `read_exact` (`UnexpectedEof`).
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];

    // `read_exact` retries interrupted reads internally; give the initial
    // probe read the same treatment so a transient interruption is not
    // reported as a hard failure.
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    };

    if n == 0 {
        return Ok(None);
    }
    if n < N {
        // A short first read is legal; top up the remainder.
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// # Errors
///
/// Propagates the error from `read_exact`, including `UnexpectedEof` when
/// fewer than `N` bytes remain.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
/// Reads exactly `n` bytes from `r` into a freshly allocated vector.
///
/// `n` typically comes straight from the file being parsed, so the buffer is
/// grown as data actually arrives instead of being allocated up front: a
/// corrupt length field in a short file now fails with `UnexpectedEof`
/// rather than attempting an `n`-byte allocation first.
///
/// # Errors
///
/// Propagates any I/O error from `r`, and returns `UnexpectedEof` if the
/// input ends before `n` bytes have been read.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    // Upper bound on the speculative preallocation for untrusted lengths.
    const PREALLOC_LIMIT: usize = 64 * 1024;

    let limit = u64::try_from(n).unwrap_or(u64::MAX);
    let mut vec = Vec::with_capacity(n.min(PREALLOC_LIMIT));
    let n_read = r.by_ref().take(limit).read_to_end(&mut vec)?;
    if n_read != n {
        return Err(IoError::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to fill whole buffer",
        ));
    }
    Ok(vec)
}
1716 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
1717 let length: u32 = endian.parse(read_bytes(r)?);
1718 Ok(read_vec(r, length as usize)?.into())
1721 #[derive(Clone, Debug)]
1722 pub struct LongStringValueLabels {
1723 pub var_name: UnencodedString,
1726 /// `(value, label)` pairs, where each value is `width` bytes.
1727 pub labels: Vec<(UnencodedString, UnencodedString)>,
/// Parsed contents of a "long string value labels record" extension
/// (subtype 21): one [`LongStringValueLabels`] entry for each variable block
/// decoded from the record data.
#[derive(Clone, Debug)]
pub struct LongStringValueLabelRecord(Vec<LongStringValueLabels>);
1733 impl ExtensionRecord for LongStringValueLabelRecord {
1734 const SUBTYPE: u32 = 21;
1735 const SIZE: Option<u32> = Some(1);
1736 const COUNT: Option<u32> = None;
1737 const NAME: &'static str = "long string value labels record";
1739 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1740 ext.check_size::<Self>()?;
1742 let mut input = &ext.data[..];
1743 let mut label_set = Vec::new();
1744 while !input.is_empty() {
1745 let var_name = read_string(&mut input, endian)?;
1746 let width: u32 = endian.parse(read_bytes(&mut input)?);
1747 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
1748 let mut labels = Vec::new();
1749 for _ in 0..n_labels {
1750 let value = read_string(&mut input, endian)?;
1751 let label = read_string(&mut input, endian)?;
1752 labels.push((value, label));
1754 label_set.push(LongStringValueLabels {
1760 Ok(LongStringValueLabelRecord(label_set))