1 use crate::endian::{Endian, Parse, ToBytes};
3 use encoding_rs::mem::decode_latin1;
4 use flate2::read::ZlibDecoder;
7 use std::fmt::{Debug, Formatter, Result as FmtResult};
8 use std::str::from_utf8;
10 collections::VecDeque,
11 io::{Error as IoError, Read, Seek, SeekFrom},
14 use thiserror::Error as ThisError;
16 use self::state::State;
18 #[derive(ThisError, Debug)]
20 #[error("Not an SPSS system file")]
23 #[error("Invalid magic number {0:?}")]
26 #[error("I/O error ({0})")]
29 #[error("Invalid SAV compression code {0}")]
30 InvalidSavCompression(u32),
32 #[error("Invalid ZSAV compression code {0}")]
33 InvalidZsavCompression(u32),
35 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
36 BadVariableWidth { offset: u64, width: i32 },
38 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
39 BadDocumentLength { offset: u64, n: usize, max: usize },
41 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
42 BadRecordType { offset: u64, rec_type: u32 },
44 #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
45 BadVariableLabelCode { offset: u64, code: u32 },
48 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
50 BadNumericMissingValueCode { offset: u64, code: i32 },
52 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
53 BadStringMissingValueCode { offset: u64, code: i32 },
55 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
56 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
58 #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
59 ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
61 #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
62 BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
64 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
65 ExtensionRecordTooLarge {
72 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
80 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
82 EofInCompressedCase { offset: u64, case_ofs: u64 },
84 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
85 PartialCompressedCase { offset: u64, case_ofs: u64 },
87 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
88 CompressedNumberExpected { offset: u64, case_ofs: u64 },
90 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
91 CompressedStringExpected { offset: u64, case_ofs: u64 },
93 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
94 BadZlibTrailerNBlocks {
97 expected_n_blocks: u64,
101 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
109 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
117 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
118 BadLongMissingValueLength {
124 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
125 BadEncodingName { offset: u64 },
127 #[error("Details TBD")]
131 #[derive(Clone, Debug)]
133 Header(HeaderRecord),
134 Variable(VariableRecord),
135 ValueLabel(ValueLabelRecord),
136 Document(DocumentRecord),
137 IntegerInfo(IntegerInfoRecord),
138 FloatInfo(FloatInfoRecord),
139 VariableSets(TextRecord),
140 VarDisplay(VarDisplayRecord),
141 MultipleResponse(MultipleResponseRecord),
142 LongStringValueLabels(LongStringValueLabelRecord),
143 Encoding(EncodingRecord),
144 NumberOfCases(NumberOfCasesRecord),
145 ProductInfo(TextRecord),
146 LongNames(TextRecord),
147 VeryLongStrings(TextRecord),
148 FileAttributes(TextRecord),
149 VariableAttributes(TextRecord),
150 OtherExtension(Extension),
158 fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
159 let rec_type: u32 = endian.parse(read_bytes(reader)?);
161 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)),
162 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)),
163 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)),
164 7 => Ok(Extension::read(reader, endian)?),
165 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
166 _ => Err(Error::BadRecordType {
167 offset: reader.stream_position()?,
174 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
175 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
176 fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> {
177 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
180 #[derive(Copy, Clone, Debug)]
181 pub enum Compression {
187 pub struct HeaderRecord {
191 /// Eye-catcher string, product name, in the file's encoding. Padded
192 /// on the right with spaces.
193 pub eye_catcher: UnencodedStr<60>,
195 /// Layout code, normally either 2 or 3.
196 pub layout_code: u32,
198 /// Number of variable positions, or `None` if the value in the file is
199 /// questionably trustworthy.
200 pub nominal_case_size: Option<u32>,
202 /// Compression type, if any.
203 pub compression: Option<Compression>,
205 /// 1-based variable index of the weight variable, or `None` if the file is
207 pub weight_index: Option<u32>,
209 /// Claimed number of cases, if known.
210 pub n_cases: Option<u32>,
212 /// Compression bias, usually 100.0.
215 /// `dd mmm yy` in the file's encoding.
216 pub creation_date: UnencodedStr<9>,
218 /// `HH:MM:SS` in the file's encoding.
219 pub creation_time: UnencodedStr<8>,
221 /// File label, in the file's encoding. Padded on the right with spaces.
222 pub file_label: UnencodedStr<64>,
224 /// Endianness of the data in the file header.
229 fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
230 writeln!(f, "{name:>17}: {:?}", value)
234 impl Debug for HeaderRecord {
235 fn fmt(&self, f: &mut Formatter) -> FmtResult {
236 writeln!(f, "File header record:")?;
237 self.debug_field(f, "Magic", self.magic)?;
238 self.debug_field(f, "Product name", &self.eye_catcher)?;
239 self.debug_field(f, "Layout code", self.layout_code)?;
240 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
241 self.debug_field(f, "Compression", self.compression)?;
242 self.debug_field(f, "Weight index", self.weight_index)?;
243 self.debug_field(f, "Number of cases", self.n_cases)?;
244 self.debug_field(f, "Compression bias", self.bias)?;
245 self.debug_field(f, "Creation date", &self.creation_date)?;
246 self.debug_field(f, "Creation time", &self.creation_time)?;
247 self.debug_field(f, "File label", &self.file_label)?;
248 self.debug_field(f, "Endianness", self.endian)
253 fn read<R: Read>(r: &mut R) -> Result<HeaderRecord, Error> {
254 let magic: [u8; 4] = read_bytes(r)?;
255 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
257 let eye_catcher = UnencodedStr::<60>(read_bytes(r)?);
258 let layout_code: [u8; 4] = read_bytes(r)?;
259 let endian = Endian::identify_u32(2, layout_code)
260 .or_else(|| Endian::identify_u32(2, layout_code))
261 .ok_or_else(|| Error::NotASystemFile)?;
262 let layout_code = endian.parse(layout_code);
264 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
265 let nominal_case_size =
266 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
268 let compression_code: u32 = endian.parse(read_bytes(r)?);
269 let compression = match (magic, compression_code) {
270 (Magic::ZSAV, 2) => Some(Compression::ZLib),
271 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
273 (_, 1) => Some(Compression::Simple),
274 (_, code) => return Err(Error::InvalidSavCompression(code)),
277 let weight_index: u32 = endian.parse(read_bytes(r)?);
278 let weight_index = (weight_index > 0).then_some(weight_index);
280 let n_cases: u32 = endian.parse(read_bytes(r)?);
281 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
283 let bias: f64 = endian.parse(read_bytes(r)?);
285 let creation_date = UnencodedStr::<9>(read_bytes(r)?);
286 let creation_time = UnencodedStr::<8>(read_bytes(r)?);
287 let file_label = UnencodedStr::<64>(read_bytes(r)?);
288 let _: [u8; 3] = read_bytes(r)?;
/// The four-byte magic number that identifies the kind of system file.
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub struct Magic([u8; 4]);

impl Magic {
    /// Magic number for a regular system file.
    pub const SAV: Magic = Magic(*b"$FL2");

    /// Magic number for a system file that contains zlib-compressed data.
    pub const ZSAV: Magic = Magic(*b"$FL3");

    /// Magic number for an EBCDIC-encoded system file.  This is `$FL2` encoded
    /// in EBCDIC.
    pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
}

impl Debug for Magic {
    /// Writes the well-known magic numbers symbolically and any other value
    /// as its raw byte array.
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        match *self {
            Magic::SAV => write!(f, "$FL2"),
            Magic::ZSAV => write!(f, "$FL3"),
            Magic::EBCDIC => write!(f, "($FL2 in EBCDIC)"),
            Magic(other) => write!(f, "{:?}", other),
        }
    }
}
334 impl TryFrom<[u8; 4]> for Magic {
337 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
338 let magic = Magic(value);
340 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
341 _ => Err(Error::BadMagic(value)),
346 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
353 fn from_width(width: i32) -> VarType {
355 0 => VarType::Numeric,
356 _ => VarType::String,
363 Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader,
364 ZTrailer, ZlibDecodeMultiple,
366 use crate::endian::Endian;
368 collections::VecDeque,
373 #[allow(clippy::type_complexity)]
374 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
377 struct Start<R: Read + Seek> {
381 pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
382 Box::new(Start { reader })
385 struct CommonState<R: Read + Seek> {
389 compression: Option<Compression>,
390 var_types: Vec<VarType>,
393 impl<R: Read + Seek + 'static> State for Start<R> {
394 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
395 let header = HeaderRecord::read(&mut self.reader)?;
396 let next_state = Headers(CommonState {
398 endian: header.endian,
400 compression: header.compression,
401 var_types: Vec::new(),
403 Ok(Some((Record::Header(header), Box::new(next_state))))
407 struct Headers<R: Read + Seek>(CommonState<R>);
409 impl<R: Read + Seek + 'static> State for Headers<R> {
410 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
411 let record = Record::read(&mut self.0.reader, self.0.endian)?;
413 Record::Variable(VariableRecord { width, .. }) => {
414 self.0.var_types.push(VarType::from_width(width));
416 Record::EndOfHeaders(_) => {
417 let next_state: Box<dyn State> = match self.0.compression {
418 None => Box::new(Data(self.0)),
419 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
420 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
422 return Ok(Some((record, next_state)));
426 Ok(Some((record, self)))
430 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
432 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
433 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
434 let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
435 Ok(Some((Record::ZHeader(zheader), self)))
439 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
441 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
442 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
443 let retval = ZTrailer::read(
446 self.1.ztrailer_offset,
449 let next_state = Box::new(CompressedData::new(CommonState {
450 reader: ZlibDecodeMultiple::new(self.0.reader),
451 endian: self.0.endian,
453 compression: self.0.compression,
454 var_types: self.0.var_types,
457 None => next_state.read(),
458 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
463 struct Data<R: Read + Seek>(CommonState<R>);
465 impl<R: Read + Seek + 'static> State for Data<R> {
466 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
467 match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
469 Some(values) => Ok(Some((Record::Case(values), self))),
474 struct CompressedData<R: Read + Seek> {
475 common: CommonState<R>,
479 impl<R: Read + Seek + 'static> CompressedData<R> {
480 fn new(common: CommonState<R>) -> CompressedData<R> {
483 codes: VecDeque::new(),
488 impl<R: Read + Seek + 'static> State for CompressedData<R> {
489 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
490 match Value::read_compressed_case(
491 &mut self.common.reader,
492 &self.common.var_types,
498 Some(values) => Ok(Some((Record::Case(values), self))),
504 #[derive(Copy, Clone)]
507 String(UnencodedStr<8>),
510 impl Debug for Value {
511 fn fmt(&self, f: &mut Formatter) -> FmtResult {
513 Value::Number(Some(number)) => write!(f, "{number:?}"),
514 Value::Number(None) => write!(f, "SYSMIS"),
515 Value::String(bytes) => write!(f, "{:?}", bytes),
521 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
523 UntypedValue(read_bytes(r)?),
529 pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value {
531 VarType::String => Value::String(UnencodedStr(raw.0)),
532 VarType::Numeric => {
533 let number: f64 = endian.parse(raw.0);
534 Value::Number((number != -f64::MAX).then_some(number))
539 fn read_case<R: Read + Seek>(
541 var_types: &[VarType],
543 ) -> Result<Option<Vec<Value>>, Error> {
544 let case_start = reader.stream_position()?;
545 let mut values = Vec::with_capacity(var_types.len());
546 for (i, &var_type) in var_types.iter().enumerate() {
547 let Some(raw) = try_read_bytes(reader)? else {
551 let offset = reader.stream_position()?;
552 return Err(Error::EofInCase {
554 case_ofs: offset - case_start,
555 case_len: var_types.len() * 8,
559 values.push(Value::from_raw(UntypedValue(raw), var_type, endian));
564 fn read_compressed_case<R: Read + Seek>(
566 var_types: &[VarType],
567 codes: &mut VecDeque<u8>,
570 ) -> Result<Option<Vec<Value>>, Error> {
571 let case_start = reader.stream_position()?;
572 let mut values = Vec::with_capacity(var_types.len());
573 for (i, &var_type) in var_types.iter().enumerate() {
575 let Some(code) = codes.pop_front() else {
576 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
580 let offset = reader.stream_position()?;
581 return Err(Error::EofInCompressedCase {
583 case_ofs: offset - case_start,
587 codes.extend(new_codes.into_iter());
592 1..=251 => match var_type {
593 VarType::Numeric => break Value::Number(Some(code as f64 - bias)),
595 break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias)))
602 let offset = reader.stream_position()?;
603 return Err(Error::PartialCompressedCase {
605 case_ofs: offset - case_start,
610 break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian)
612 254 => match var_type {
613 VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
614 VarType::Numeric => {
615 return Err(Error::CompressedStringExpected {
617 case_ofs: reader.stream_position()? - case_start,
621 255 => match var_type {
622 VarType::Numeric => break Value::Number(None),
624 return Err(Error::CompressedNumberExpected {
626 case_ofs: reader.stream_position()? - case_start,
638 struct ZlibDecodeMultiple<R>
642 reader: Option<ZlibDecoder<R>>,
645 impl<R> ZlibDecodeMultiple<R>
649 fn new(reader: R) -> ZlibDecodeMultiple<R> {
651 reader: Some(ZlibDecoder::new(reader)),
656 impl<R> Read for ZlibDecodeMultiple<R>
660 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
662 match self.reader.as_mut().unwrap().read(buf)? {
664 let inner = self.reader.take().unwrap().into_inner();
665 self.reader = Some(ZlibDecoder::new(inner));
673 impl<R> Seek for ZlibDecodeMultiple<R>
677 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
678 self.reader.as_mut().unwrap().get_mut().seek(pos)
683 state: Option<Box<dyn State>>,
687 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
689 state: Some(state::new(reader)),
692 pub fn collect_headers(&mut self) -> Result<Vec<Record>, Error> {
693 let mut headers = Vec::new();
696 Record::EndOfHeaders(_) => break,
697 r => headers.push(r),
704 impl Iterator for Reader {
705 type Item = Result<Record, Error>;
707 fn next(&mut self) -> Option<Self::Item> {
708 match self.state.take()?.read() {
709 Ok(Some((record, next_state))) => {
710 self.state = Some(next_state);
714 Err(error) => Some(Err(error)),
719 impl FusedIterator for Reader {}
721 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
722 pub struct Spec(pub u32);
724 impl Debug for Spec {
725 fn fmt(&self, f: &mut Formatter) -> FmtResult {
726 let type_ = format_name(self.0 >> 16);
727 let w = (self.0 >> 8) & 0xff;
728 let d = self.0 & 0xff;
729 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
733 fn format_name(type_: u32) -> Cow<'static, str> {
772 _ => return format!("<unknown format {type_}>").into(),
778 pub struct MissingValues {
779 /// Individual missing values, up to 3 of them.
780 pub values: Vec<Value>,
782 /// Optional range of missing values.
783 pub range: Option<(Value, Value)>,
786 impl Debug for MissingValues {
787 fn fmt(&self, f: &mut Formatter) -> FmtResult {
788 for (i, value) in self.values.iter().enumerate() {
792 write!(f, "{value:?}")?;
795 if let Some((low, high)) = self.range {
796 if !self.values.is_empty() {
799 write!(f, "{low:?} THRU {high:?}")?;
811 fn is_empty(&self) -> bool {
812 self.values.is_empty() && self.range.is_none()
815 fn read<R: Read + Seek>(
821 ) -> Result<MissingValues, Error> {
822 let (n_values, has_range) = match (width, code) {
823 (_, 0..=3) => (code, false),
824 (0, -2) => (0, true),
825 (0, -3) => (1, true),
826 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
827 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
830 let var_type = VarType::from_width(width);
832 let mut values = Vec::new();
833 for _ in 0..n_values {
834 values.push(Value::read(r, var_type, endian)?);
836 let range = if has_range {
837 let low = Value::read(r, var_type, endian)?;
838 let high = Value::read(r, var_type, endian)?;
843 Ok(MissingValues { values, range })
848 pub struct VariableRecord {
849 /// Offset from the start of the file to the start of the record.
852 /// Variable width, in the range -1..=255.
855 /// Variable name, padded on the right with spaces.
856 pub name: UnencodedStr<8>,
859 pub print_format: Spec,
862 pub write_format: Spec,
865 pub missing_values: MissingValues,
867 /// Optional variable label.
868 pub label: Option<UnencodedString>,
871 impl Debug for VariableRecord {
872 fn fmt(&self, f: &mut Formatter) -> FmtResult {
879 } else if self.width == 0 {
882 "long string continuation record"
885 writeln!(f, "Print format: {:?}", self.print_format)?;
886 writeln!(f, "Write format: {:?}", self.write_format)?;
887 writeln!(f, "Name: {:?}", &self.name)?;
888 writeln!(f, "Variable label: {:?}", self.label)?;
889 writeln!(f, "Missing values: {:?}", self.missing_values)
893 impl VariableRecord {
894 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VariableRecord, Error> {
895 let offset = r.stream_position()?;
896 let width: i32 = endian.parse(read_bytes(r)?);
897 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
898 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
899 let print_format = Spec(endian.parse(read_bytes(r)?));
900 let write_format = Spec(endian.parse(read_bytes(r)?));
901 let name = UnencodedStr::<8>(read_bytes(r)?);
903 let label = match has_variable_label {
906 let len: u32 = endian.parse(read_bytes(r)?);
907 let read_len = len.min(65535) as usize;
908 let label = UnencodedString(read_vec(r, read_len)?);
910 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
911 let _ = read_vec(r, padding_bytes as usize)?;
916 return Err(Error::BadVariableLabelCode {
918 code: has_variable_label,
923 let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
937 #[derive(Copy, Clone)]
938 pub struct UntypedValue(pub [u8; 8]);
940 impl Debug for UntypedValue {
941 fn fmt(&self, f: &mut Formatter) -> FmtResult {
942 let little: f64 = Endian::Little.parse(self.0);
943 let little = format!("{:?}", little);
944 let big: f64 = Endian::Big.parse(self.0);
945 let big = format!("{:?}", big);
946 let number = if little.len() <= big.len() {
951 write!(f, "{number}")?;
953 let string = default_decode(&self.0);
955 .split(|c: char| c == '\0' || c.is_control())
958 write!(f, "{string:?}")?;
964 pub struct UnencodedString(pub Vec<u8>);
966 impl From<Vec<u8>> for UnencodedString {
967 fn from(source: Vec<u8>) -> Self {
972 impl From<&[u8]> for UnencodedString {
973 fn from(source: &[u8]) -> Self {
978 impl Debug for UnencodedString {
979 fn fmt(&self, f: &mut Formatter) -> FmtResult {
980 write!(f, "{:?}", default_decode(self.0.as_slice()))
984 #[derive(Copy, Clone)]
985 pub struct UnencodedStr<const N: usize>(pub [u8; N]);
987 impl<const N: usize> From<[u8; N]> for UnencodedStr<N> {
988 fn from(source: [u8; N]) -> Self {
993 impl<const N: usize> Debug for UnencodedStr<N> {
994 fn fmt(&self, f: &mut Formatter) -> FmtResult {
995 write!(f, "{:?}", default_decode(&self.0))
1000 pub struct ValueLabelRecord {
1001 /// Offset from the start of the file to the start of the value label
1003 pub label_offset: u64,
1006 pub labels: Vec<(UntypedValue, UnencodedString)>,
1008 /// Offset from the start of the file to the start of the variable index
1010 pub index_offset: u64,
1012 /// The 1-based indexes of the variables to which the labels apply.
1013 pub dict_indexes: Vec<u32>,
1016 impl Debug for ValueLabelRecord {
1017 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1018 writeln!(f, "labels: ")?;
1019 for (value, label) in self.labels.iter() {
1020 writeln!(f, "{value:?}: {label:?}")?;
1022 write!(f, "apply to variables")?;
1023 for dict_index in self.dict_indexes.iter() {
1024 write!(f, " #{dict_index}")?;
1030 impl ValueLabelRecord {
1031 /// Maximum number of value labels in a record.
1032 pub const MAX_LABELS: u32 = u32::MAX / 8;
1034 /// Maximum number of variable indexes in a record.
1035 pub const MAX_INDEXES: u32 = u32::MAX / 8;
1037 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabelRecord, Error> {
1038 let label_offset = r.stream_position()?;
1039 let n: u32 = endian.parse(read_bytes(r)?);
1040 if n > Self::MAX_LABELS {
1041 return Err(Error::BadNumberOfValueLabels {
1042 offset: label_offset,
1044 max: Self::MAX_LABELS,
1048 let mut labels = Vec::new();
1050 let value = UntypedValue(read_bytes(r)?);
1051 let label_len: u8 = endian.parse(read_bytes(r)?);
1052 let label_len = label_len as usize;
1053 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1055 let mut label = read_vec(r, padded_len - 1)?;
1056 label.truncate(label_len);
1057 labels.push((value, UnencodedString(label)));
1060 let index_offset = r.stream_position()?;
1061 let rec_type: u32 = endian.parse(read_bytes(r)?);
1063 return Err(Error::ExpectedVarIndexRecord {
1064 offset: index_offset,
1069 let n: u32 = endian.parse(read_bytes(r)?);
1070 if n > Self::MAX_INDEXES {
1071 return Err(Error::BadNumberOfVarIndexes {
1072 offset: index_offset,
1074 max: Self::MAX_INDEXES,
1077 let mut dict_indexes = Vec::with_capacity(n as usize);
1079 dict_indexes.push(endian.parse(read_bytes(r)?));
1082 Ok(ValueLabelRecord {
1091 #[derive(Clone, Debug)]
1092 pub struct DocumentRecord {
1093 /// Offset from the start of the file to the start of the record.
1096 /// The document, as an array of 80-byte lines.
1097 pub lines: Vec<DocumentLine>,
1100 pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>;
1102 impl DocumentRecord {
1103 /// Length of a line in a document. Document lines are fixed-length and
1104 /// padded on the right with spaces.
1105 pub const LINE_LEN: usize = 80;
1107 /// Maximum number of lines we will accept in a document. This is simply
1108 /// the maximum number that will fit in a 32-bit space.
1109 pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN;
1111 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<DocumentRecord, Error> {
1112 let offset = r.stream_position()?;
1113 let n: u32 = endian.parse(read_bytes(r)?);
1115 if n > Self::MAX_LINES {
1116 Err(Error::BadDocumentLength {
1119 max: Self::MAX_LINES,
1122 let pos = r.stream_position()?;
1123 let mut lines = Vec::with_capacity(n);
1125 lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
1127 Ok(DocumentRecord { pos, lines })
1132 trait ExtensionRecord
1137 const SIZE: Option<u32>;
1138 const COUNT: Option<u32>;
1139 const NAME: &'static str;
1140 fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
1143 #[derive(Clone, Debug)]
1144 pub struct IntegerInfoRecord {
1145 pub version: (i32, i32, i32),
1146 pub machine_code: i32,
1147 pub floating_point_rep: i32,
1148 pub compression_code: i32,
1149 pub endianness: i32,
1150 pub character_code: i32,
1153 impl ExtensionRecord for IntegerInfoRecord {
1154 const SUBTYPE: u32 = 3;
1155 const SIZE: Option<u32> = Some(4);
1156 const COUNT: Option<u32> = Some(8);
1157 const NAME: &'static str = "integer record";
1159 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1160 ext.check_size::<Self>()?;
1162 let mut input = &ext.data[..];
1163 let data: Vec<i32> = (0..8)
1164 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1166 Ok(IntegerInfoRecord {
1167 version: (data[0], data[1], data[2]),
1168 machine_code: data[3],
1169 floating_point_rep: data[4],
1170 compression_code: data[5],
1171 endianness: data[6],
1172 character_code: data[7],
1177 #[derive(Clone, Debug)]
1178 pub struct FloatInfoRecord {
1184 impl ExtensionRecord for FloatInfoRecord {
1185 const SUBTYPE: u32 = 4;
1186 const SIZE: Option<u32> = Some(8);
1187 const COUNT: Option<u32> = Some(3);
1188 const NAME: &'static str = "floating point record";
1190 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1191 ext.check_size::<Self>()?;
1193 let mut input = &ext.data[..];
1194 let data: Vec<f64> = (0..3)
1195 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1197 Ok(FloatInfoRecord {
1205 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1206 pub enum CategoryLabels {
1211 #[derive(Clone, Debug)]
1212 pub enum MultipleResponseType {
1214 value: UnencodedString,
1215 labels: CategoryLabels,
1220 impl MultipleResponseType {
1221 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
1222 let (mr_type, input) = match input.get(0) {
1223 Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
1225 let (value, input) = parse_counted_string(&input[1..])?;
1227 MultipleResponseType::MultipleDichotomy {
1228 value: value.into(),
1229 labels: CategoryLabels::VarLabels,
1235 let Some(b' ') = input.get(1) else {
1236 return Err(Error::TBD);
1238 let input = &input[2..];
1239 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1240 (CategoryLabels::CountedValues, rest)
1241 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1242 (CategoryLabels::VarLabels, rest)
1244 return Err(Error::TBD);
1246 let (value, input) = parse_counted_string(input)?;
1248 MultipleResponseType::MultipleDichotomy {
1249 value: value.into(),
1255 _ => return Err(Error::TBD),
1257 Ok((mr_type, input))
1261 #[derive(Clone, Debug)]
1262 pub struct MultipleResponseSet {
1263 pub name: UnencodedString,
1264 pub label: UnencodedString,
1265 pub mr_type: MultipleResponseType,
1266 pub short_names: Vec<UnencodedString>,
1269 impl MultipleResponseSet {
1270 fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
1271 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1272 return Err(Error::TBD);
1274 let (name, input) = input.split_at(equals);
1275 let (mr_type, input) = MultipleResponseType::parse(input)?;
1276 let Some(b' ') = input.get(0) else {
1277 return Err(Error::TBD);
1279 let (label, mut input) = parse_counted_string(&input[1..])?;
1280 let mut vars = Vec::new();
1281 while input.get(0) == Some(&b' ') {
1282 input = &input[1..];
1283 let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
1284 return Err(Error::TBD);
1287 vars.push(input[..length].into());
1289 input = &input[length..];
1291 if input.get(0) != Some(&b'\n') {
1292 return Err(Error::TBD);
1294 while input.get(0) == Some(&b'\n') {
1295 input = &input[1..];
1298 MultipleResponseSet {
1300 label: label.into(),
1309 #[derive(Clone, Debug)]
1310 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
1312 impl ExtensionRecord for MultipleResponseRecord {
1313 const SUBTYPE: u32 = 7;
1314 const SIZE: Option<u32> = Some(1);
1315 const COUNT: Option<u32> = None;
1316 const NAME: &'static str = "multiple response set record";
1318 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1319 ext.check_size::<Self>()?;
1321 let mut input = &ext.data[..];
1322 let mut sets = Vec::new();
1323 while !input.is_empty() {
1324 let (set, rest) = MultipleResponseSet::parse(input)?;
1328 Ok(MultipleResponseRecord(sets))
1332 fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
1333 let Some(space) = input.iter().position(|&b| b == b' ') else {
1334 return Err(Error::TBD);
1336 let Ok(length) = from_utf8(&input[..space]) else {
1337 return Err(Error::TBD);
1339 let Ok(length): Result<usize, _> = length.parse() else {
1340 return Err(Error::TBD);
1343 let input = &input[space + 1..];
1344 if input.len() < length {
1345 return Err(Error::TBD);
1348 let (string, rest) = input.split_at(length);
1349 Ok((string.into(), rest))
1352 #[derive(Clone, Debug)]
1353 pub struct VarDisplayRecord(pub Vec<u32>);
1355 impl ExtensionRecord for VarDisplayRecord {
1356 const SUBTYPE: u32 = 11;
1357 const SIZE: Option<u32> = Some(4);
1358 const COUNT: Option<u32> = None;
1359 const NAME: &'static str = "variable display record";
1361 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1362 ext.check_size::<Self>()?;
1364 let mut input = &ext.data[..];
1365 let display = (0..ext.count)
1366 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1368 Ok(VarDisplayRecord(display))
1372 pub struct LongStringMissingValues {
1374 pub var_name: UnencodedString,
1377 pub missing_values: MissingValues,
1380 pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
1382 impl ExtensionRecord for LongStringMissingValueSet {
1383 const SUBTYPE: u32 = 22;
1384 const SIZE: Option<u32> = Some(1);
1385 const COUNT: Option<u32> = None;
1386 const NAME: &'static str = "long string missing values record";
1388 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1389 ext.check_size::<Self>()?;
1391 let mut input = &ext.data[..];
1392 let mut missing_value_set = Vec::new();
1393 while !input.is_empty() {
1394 let var_name = read_string(&mut input, endian)?;
1395 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1396 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1398 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset;
1399 return Err(Error::BadLongMissingValueLength {
1400 record_offset: ext.offset,
1405 let mut values = Vec::new();
1406 for i in 0..n_missing_values {
1407 let value: [u8; 8] = read_bytes(&mut input)?;
1408 let numeric_value: u64 = endian.parse(value);
1409 let value = if i > 0 && numeric_value == 8 {
1410 // Tolerate files written by old, buggy versions of PSPP
1411 // where we believed that the value_length was repeated
1412 // before each missing value.
1413 read_bytes(&mut input)?
1417 values.push(Value::String(UnencodedStr(value)));
1419 let missing_values = MissingValues {
1423 missing_value_set.push(LongStringMissingValues {
1428 Ok(LongStringMissingValueSet(missing_value_set))
1432 #[derive(Clone, Debug)]
1433 pub struct EncodingRecord(pub String);
1435 impl ExtensionRecord for EncodingRecord {
1436 const SUBTYPE: u32 = 20;
1437 const SIZE: Option<u32> = Some(1);
1438 const COUNT: Option<u32> = None;
1439 const NAME: &'static str = "encoding record";
1441 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1442 ext.check_size::<Self>()?;
1445 String::from_utf8(ext.data.clone())
1446 .map_err(|_| Error::BadEncodingName { offset: ext.offset })?,
1451 #[derive(Clone, Debug)]
1452 pub struct NumberOfCasesRecord {
1453 /// Always observed as 1.
1456 /// Number of cases.
1460 impl ExtensionRecord for NumberOfCasesRecord {
1461 const SUBTYPE: u32 = 16;
1462 const SIZE: Option<u32> = Some(8);
1463 const COUNT: Option<u32> = Some(2);
1464 const NAME: &'static str = "extended number of cases record";
1466 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1467 ext.check_size::<Self>()?;
1469 let mut input = &ext.data[..];
1470 let one = endian.parse(read_bytes(&mut input)?);
1471 let n_cases = endian.parse(read_bytes(&mut input)?);
1473 Ok(NumberOfCasesRecord { one, n_cases })
1477 #[derive(Clone, Debug)]
1478 pub struct TextRecord {
1479 /// Offset from the start of the file to the start of the record.
1482 /// The text content of the record.
1483 pub text: UnencodedString,
1486 impl From<Extension> for TextRecord {
1487 fn from(source: Extension) -> Self {
1489 offset: source.offset,
1490 text: source.data.into(),
1495 #[derive(Clone, Debug)]
1496 pub struct Extension {
1497 /// Offset from the start of the file to the start of the record.
1503 /// Size of each data element.
1506 /// Number of data elements.
1509 /// `size * count` bytes of data.
1514 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
1515 if let Some(expected_size) = E::SIZE {
1516 if self.size != expected_size {
1517 return Err(Error::BadRecordSize {
1518 offset: self.offset,
1519 record: E::NAME.into(),
1525 if let Some(expected_count) = E::COUNT {
1526 if self.count != expected_count {
1527 return Err(Error::BadRecordCount {
1528 offset: self.offset,
1529 record: E::NAME.into(),
1538 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1539 let subtype = endian.parse(read_bytes(r)?);
1540 let offset = r.stream_position()?;
1541 let size: u32 = endian.parse(read_bytes(r)?);
1542 let count = endian.parse(read_bytes(r)?);
1543 let Some(product) = size.checked_mul(count) else {
1544 return Err(Error::ExtensionRecordTooLarge {
1551 let offset = r.stream_position()?;
1552 let data = read_vec(r, product as usize)?;
1553 let extension = Extension {
1561 IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse(
1566 FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse(
1571 VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse(
1576 MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse(
1577 MultipleResponseRecord::parse(&extension, endian, |_| ())?,
1579 LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels(
1580 LongStringValueLabelRecord::parse(&extension, endian, |_| ())?,
1582 EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse(
1587 NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse(
1592 5 => Ok(Record::VariableSets(extension.into())),
1593 10 => Ok(Record::ProductInfo(extension.into())),
1594 13 => Ok(Record::LongNames(extension.into())),
1595 14 => Ok(Record::VeryLongStrings(extension.into())),
1596 17 => Ok(Record::FileAttributes(extension.into())),
1597 18 => Ok(Record::VariableAttributes(extension.into())),
1598 _ => Ok(Record::OtherExtension(extension)),
1603 #[derive(Clone, Debug)]
1604 pub struct ZHeader {
1605 /// File offset to the start of the record.
1608 /// File offset to the ZLIB data header.
1609 pub zheader_offset: u64,
1611 /// File offset to the ZLIB trailer.
1612 pub ztrailer_offset: u64,
1614 /// Length of the ZLIB trailer in bytes.
1615 pub ztrailer_len: u64,
1619 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
1620 let offset = r.stream_position()?;
1621 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
1622 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
1623 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
1634 #[derive(Clone, Debug)]
1635 pub struct ZTrailer {
1636 /// File offset to the start of the record.
1639 /// Compression bias as a negative integer, e.g. -100.
1642 /// Always observed as zero.
1645 /// Uncompressed size of each block, except possibly the last. Only
1646 /// `0x3ff000` has been observed so far.
1647 pub block_size: u32,
1649 /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
1650 pub blocks: Vec<ZBlock>,
1653 #[derive(Clone, Debug)]
1655 /// Offset of block of data if simple compression were used.
1656 pub uncompressed_ofs: u64,
1658 /// Actual offset within the file of the compressed data block.
1659 pub compressed_ofs: u64,
1661 /// The number of bytes in this data block after decompression. This is
1662 /// `block_size` in every data block but the last, which may be smaller.
1663 pub uncompressed_size: u32,
1665 /// The number of bytes in this data block, as stored compressed in this
1667 pub compressed_size: u32,
1671 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
1673 uncompressed_ofs: endian.parse(read_bytes(r)?),
1674 compressed_ofs: endian.parse(read_bytes(r)?),
1675 uncompressed_size: endian.parse(read_bytes(r)?),
1676 compressed_size: endian.parse(read_bytes(r)?),
1682 fn read<R: Read + Seek>(
1687 ) -> Result<Option<ZTrailer>, Error> {
1688 let start_offset = reader.stream_position()?;
1689 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
1692 let int_bias = endian.parse(read_bytes(reader)?);
1693 let zero = endian.parse(read_bytes(reader)?);
1694 let block_size = endian.parse(read_bytes(reader)?);
1695 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
1696 let expected_n_blocks = (ztrailer_len - 24) / 24;
1697 if n_blocks as u64 != expected_n_blocks {
1698 return Err(Error::BadZlibTrailerNBlocks {
1699 offset: ztrailer_ofs,
1705 let blocks = (0..n_blocks)
1706 .map(|_| ZBlock::read(reader, endian))
1707 .collect::<Result<Vec<_>, _>>()?;
1708 reader.seek(SeekFrom::Start(start_offset))?;
1710 offset: ztrailer_ofs,
/// Reads exactly `N` bytes from `r`, or reports a clean end of input.
///
/// Returns `Ok(None)` if the first read yields no bytes at all, and
/// `Ok(Some(bytes))` once all `N` bytes have been read.
///
/// # Errors
///
/// Returns the underlying I/O error if reading fails, including the error
/// from `read_exact` when the input ends after some but not all of the `N`
/// bytes.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];
    // `Read::read` may fail with `Interrupted`, which is non-fatal; retry it
    // here so this behaves like the `read_exact` call below.
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    };
    if n == 0 {
        return Ok(None);
    }
    if n < N {
        // Short first read: the rest is now mandatory.
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// # Errors
///
/// Returns the error from `read_exact` if `N` bytes cannot be read.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
/// Reads exactly `n` bytes from `r` into a new vector.
///
/// The buffer grows as data arrives rather than being allocated up front, so
/// a bogus length taken from a corrupt file cannot force a huge allocation
/// before the data is known to exist.
///
/// # Errors
///
/// Returns the underlying I/O error if reading fails, or an
/// `UnexpectedEof` error if the input ends before `n` bytes were read.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    let mut vec = Vec::new();
    r.by_ref().take(n as u64).read_to_end(&mut vec)?;
    if vec.len() != n {
        return Err(IoError::from(std::io::ErrorKind::UnexpectedEof));
    }
    Ok(vec)
}
1744 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
1745 let length: u32 = endian.parse(read_bytes(r)?);
1746 Ok(read_vec(r, length as usize)?.into())
1749 #[derive(Clone, Debug)]
1750 pub struct LongStringValueLabels {
1751 pub var_name: UnencodedString,
1754 /// `(value, label)` pairs, where each value is `width` bytes.
1755 pub labels: Vec<(UnencodedString, UnencodedString)>,
1758 #[derive(Clone, Debug)]
1759 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
1761 impl ExtensionRecord for LongStringValueLabelRecord {
1762 const SUBTYPE: u32 = 21;
1763 const SIZE: Option<u32> = Some(1);
1764 const COUNT: Option<u32> = None;
1765 const NAME: &'static str = "long string value labels record";
1767 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1768 ext.check_size::<Self>()?;
1770 let mut input = &ext.data[..];
1771 let mut label_set = Vec::new();
1772 while !input.is_empty() {
1773 let var_name = read_string(&mut input, endian)?;
1774 let width: u32 = endian.parse(read_bytes(&mut input)?);
1775 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
1776 let mut labels = Vec::new();
1777 for _ in 0..n_labels {
1778 let value = read_string(&mut input, endian)?;
1779 let label = read_string(&mut input, endian)?;
1780 labels.push((value, label));
1782 label_set.push(LongStringValueLabels {
1788 Ok(LongStringValueLabelRecord(label_set))