3 endian::{Endian, Parse, ToBytes},
4 identifier::{Error as IdError, Identifier},
7 use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
8 use flate2::read::ZlibDecoder;
14 collections::{HashMap, VecDeque},
15 fmt::{Debug, Display, Formatter, Result as FmtResult},
16 io::{Error as IoError, Read, Seek, SeekFrom},
23 use thiserror::Error as ThisError;
25 #[derive(ThisError, Debug)]
27 #[error("Not an SPSS system file")]
30 #[error("Invalid magic number {0:?}")]
33 #[error("I/O error ({0})")]
36 #[error("Invalid SAV compression code {0}")]
37 InvalidSavCompression(u32),
39 #[error("Invalid ZSAV compression code {0}")]
40 InvalidZsavCompression(u32),
42 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
43 BadVariableWidth { offset: u64, width: i32 },
45 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
46 BadDocumentLength { offset: u64, n: usize, max: usize },
48 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
49 BadRecordType { offset: u64, rec_type: u32 },
51 #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
52 BadVariableLabelCode {
59 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
61 BadNumericMissingValueCode { offset: u64, code: i32 },
63 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
64 BadStringMissingValueCode { offset: u64, code: i32 },
66 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
67 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
69 #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
70 ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
72 #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
73 TooManyVarIndexes { offset: u64, n: u32, max: u32 },
75 #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
76 NoVarIndexes { offset: u64 },
78 #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
82 wrong_types: Vec<u32>,
85 #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
92 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
93 ExtensionRecordTooLarge {
100 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
108 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
110 EofInCompressedCase { offset: u64, case_ofs: u64 },
112 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
113 PartialCompressedCase { offset: u64, case_ofs: u64 },
115 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
116 CompressedNumberExpected { offset: u64, case_ofs: u64 },
118 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
119 CompressedStringExpected { offset: u64, case_ofs: u64 },
121 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
122 BadZlibTrailerNBlocks {
125 expected_n_blocks: u64,
129 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
137 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
145 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
146 BadLongMissingValueLength {
152 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
153 BadEncodingName { offset: u64 },
155 // XXX This is risky because `text` might be arbitrarily long.
156 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
157 MalformedString { encoding: String, text: String },
159 #[error("Invalid variable measurement level value {0}")]
160 InvalidMeasurement(u32),
162 #[error("Invalid variable display alignment value {0}")]
163 InvalidAlignment(u32),
165 #[error("Invalid attribute name. {0}")]
166 InvalidAttributeName(IdError),
168 #[error("Invalid variable name in attribute record. {0}")]
169 InvalidAttributeVariableName(IdError),
171 #[error("Invalid short name in long variable name record. {0}")]
172 InvalidShortName(IdError),
174 #[error("Invalid name in long variable name record. {0}")]
175 InvalidLongName(IdError),
177 #[error("Invalid variable name in very long string record. {0}")]
178 InvalidLongStringName(IdError),
180 #[error("Invalid variable name in variable set record. {0}")]
181 InvalidVariableSetName(IdError),
183 #[error("Invalid multiple response set name. {0}")]
184 InvalidMrSetName(IdError),
186 #[error("Invalid multiple response set variable name. {0}")]
187 InvalidMrSetVariableName(IdError),
189 #[error("Invalid variable name in long string missing values record. {0}")]
190 InvalidLongStringMissingValueVariableName(IdError),
192 #[error("Invalid variable name in long string value label record. {0}")]
193 InvalidLongStringValueLabelName(IdError),
195 #[error("Details TBD")]
199 #[derive(Clone, Debug)]
201 Header(HeaderRecord<RawString>),
202 Variable(VariableRecord<RawString, RawStr<8>>),
203 ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
204 Document(DocumentRecord<RawDocumentLine>),
205 IntegerInfo(IntegerInfoRecord),
206 FloatInfo(FloatInfoRecord),
207 VarDisplay(VarDisplayRecord),
208 MultipleResponse(MultipleResponseRecord<RawString, RawString>),
209 LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
210 LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
211 Encoding(EncodingRecord),
212 NumberOfCases(NumberOfCasesRecord),
214 OtherExtension(Extension),
218 Cases(Rc<RefCell<Cases>>),
// A record after text decoding. The variant names match those of the raw
// `Record` enum, but text-bearing payloads carry `Cow<'a, str>`, `String`, or
// `Identifier` instead of raw byte strings; the `'a` lifetime lets decoded
// text borrow from the raw record it was decoded from.
221 pub enum DecodedRecord<'a> {
222 Header(HeaderRecord<Cow<'a, str>>),
223 Variable(VariableRecord<Cow<'a, str>, String>),
224 ValueLabel(ValueLabelRecord<RawStr<8>, Cow<'a, str>>),
225 Document(DocumentRecord<Cow<'a, str>>),
// Records without text fields use the same payload type as the raw form.
226 IntegerInfo(IntegerInfoRecord),
227 FloatInfo(FloatInfoRecord),
228 VarDisplay(VarDisplayRecord),
229 MultipleResponse(MultipleResponseRecord<Identifier, Cow<'a, str>>),
230 LongStringValueLabels(LongStringValueLabelRecord<Identifier, Cow<'a, str>>),
231 LongStringMissingValues(LongStringMissingValueRecord<Identifier, String>),
232 Encoding(EncodingRecord),
233 NumberOfCases(NumberOfCasesRecord),
234 VariableSets(VariableSetRecord),
235 ProductInfo(ProductInfoRecord),
236 LongNames(LongNamesRecord),
237 VeryLongStrings(VeryLongStringsRecord),
238 FileAttributes(FileAttributeRecord),
239 VariableAttributes(VariableAttributeRecord),
240 OtherExtension(Extension),
250 var_types: &[VarType],
251 warn: &Box<dyn Fn(Error)>,
252 ) -> Result<Option<Record>, Error>
256 let rec_type: u32 = endian.parse(read_bytes(reader)?);
258 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
259 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
260 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
261 7 => Extension::read(reader, endian, var_types.len(), warn),
262 999 => Ok(Some(Record::EndOfHeaders(
263 endian.parse(read_bytes(reader)?),
265 _ => Err(Error::BadRecordType {
266 offset: reader.stream_position()?,
// Converts this raw record into a `DecodedRecord`, using `decoder` to turn
// raw byte strings into text. Most text-bearing records delegate to their own
// `decode` method; records with no text to decode are cloned unchanged.
272 fn decode<'a>(&'a self, decoder: &Decoder) -> Result<DecodedRecord<'a>, Error> {
274 Record::Header(record) => record.decode(decoder),
275 Record::Variable(record) => record.decode(decoder),
276 Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
277 Record::Document(record) => record.decode(decoder),
278 Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()),
279 Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()),
280 Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()),
281 Record::MultipleResponse(record) => record.decode(decoder),
// Decoding long string value labels is fallible, hence the `?`.
282 Record::LongStringValueLabels(record) => {
283 DecodedRecord::LongStringValueLabels(record.decode(decoder)?)
285 Record::LongStringMissingValues(record) => {
286 DecodedRecord::LongStringMissingValues(record.decode(decoder))
288 Record::Encoding(record) => DecodedRecord::Encoding(record.clone()),
289 Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()),
290 Record::Text(record) => record.decode(decoder)?,
291 Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()),
292 Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record.clone()),
293 Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()),
294 Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()),
// NOTE(review): `todo!()` panics if a `Cases` record is ever passed here.
295 Record::Cases(_) => todo!(),
300 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
301 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
// This never fails, so the `Debug` impls in this file (`UntypedValue`,
// `RawString`, `RawStr`) use it to print raw bytes readably.
302 fn default_decode(s: &[u8]) -> Cow<str> {
// On a UTF-8 error fall back to Latin-1; on success wrap the borrowed `&str`.
303 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
306 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
307 pub enum Compression {
313 fn offsets(&self) -> Range<u64>;
// The file header record. `S` is the string representation used for the
// text fields: `RawString` as read from the file, or `Cow<str>` once decoded.
317 pub struct HeaderRecord<S>
322 pub offsets: Range<u64>,
327 /// Eye-catcher string, product name, in the file's encoding. Padded
328 /// on the right with spaces.
331 /// Layout code, normally either 2 or 3.
332 pub layout_code: u32,
334 /// Number of variable positions, or `None` if the value in the file is
335 /// questionably trustworthy.
336 pub nominal_case_size: Option<u32>,
338 /// Compression type, if any.
339 pub compression: Option<Compression>,
341 /// 1-based variable index of the weight variable, or `None` if the file is
343 pub weight_index: Option<u32>,
345 /// Claimed number of cases, if known.
346 pub n_cases: Option<u32>,
348 /// Compression bias, usually 100.0.
351 /// `dd mmm yy` in the file's encoding.
352 pub creation_date: S,
354 /// `HH:MM:SS` in the file's encoding.
355 pub creation_time: S,
357 /// File label, in the file's encoding. Padded on the right with spaces.
360 /// Endianness of the data in the file header.
364 impl<S> HeaderRecord<S>
// Writes one `name: value` line to `f`, right-aligning `name` in a 17-column
// field and formatting `value` with `Debug`.
368 fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
372 writeln!(f, "{name:>17}: {:?}", value)
// Multi-line `Debug` output: a title line followed by one aligned line per
// header field, each written through `debug_field`.
376 impl<S> Debug for HeaderRecord<S>
380 fn fmt(&self, f: &mut Formatter) -> FmtResult {
381 writeln!(f, "File header record:")?;
382 self.debug_field(f, "Magic", self.magic)?;
383 self.debug_field(f, "Product name", &self.eye_catcher)?;
384 self.debug_field(f, "Layout code", self.layout_code)?;
385 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
386 self.debug_field(f, "Compression", self.compression)?;
387 self.debug_field(f, "Weight index", self.weight_index)?;
388 self.debug_field(f, "Number of cases", self.n_cases)?;
389 self.debug_field(f, "Compression bias", self.bias)?;
390 self.debug_field(f, "Creation date", &self.creation_date)?;
391 self.debug_field(f, "Creation time", &self.creation_time)?;
392 self.debug_field(f, "File label", &self.file_label)?;
// The last field's result is returned directly as the result of `fmt`.
393 self.debug_field(f, "Endianness", self.endian)
397 impl HeaderRecord<RawString> {
// Reads the file header record from `r`, starting at the reader's current
// position, and returns it with its text fields still raw.
//
// Errors: `Error::NotASystemFile` if the magic number is not recognized or
// the layout code does not identify a byte order, an invalid-compression
// error for an unknown compression code, or an I/O error from `r`.
398 fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
399 let start = r.stream_position()?;
401 let magic: [u8; 4] = read_bytes(r)?;
402 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
404 let eye_catcher = RawString(read_vec(r, 60)?);
405 let layout_code: [u8; 4] = read_bytes(r)?;
// The layout code is normally 2 or 3, so the byte order is inferred by
// testing which endianness makes the field read as one of those values.
// (The fallback previously re-tested 2, so layout-3 files were rejected.)
406 let endian = Endian::identify_u32(2, layout_code)
407 .or_else(|| Endian::identify_u32(3, layout_code))
408 .ok_or_else(|| Error::NotASystemFile)?;
409 let layout_code = endian.parse(layout_code);
411 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
// Implausibly large case sizes are treated as unknown.
412 let nominal_case_size =
413 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
415 let compression_code: u32 = endian.parse(read_bytes(r)?);
// Which compression codes are valid depends on the magic number.
416 let compression = match (magic, compression_code) {
417 (Magic::Zsav, 2) => Some(Compression::ZLib),
418 (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
420 (_, 1) => Some(Compression::Simple),
421 (_, code) => return Err(Error::InvalidSavCompression(code)),
424 let weight_index: u32 = endian.parse(read_bytes(r)?);
// A weight index of 0 means there is no weight variable.
425 let weight_index = (weight_index > 0).then_some(weight_index);
427 let n_cases: u32 = endian.parse(read_bytes(r)?);
// Implausibly large case counts are treated as unknown.
428 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
430 let bias: f64 = endian.parse(read_bytes(r)?);
432 let creation_date = RawString(read_vec(r, 9)?);
433 let creation_time = RawString(read_vec(r, 8)?);
434 let file_label = RawString(read_vec(r, 64)?);
// Three trailing bytes are read and discarded.
435 let _: [u8; 3] = read_bytes(r)?;
438 offsets: start..r.stream_position()?,
// Returns a decoded copy of this header: the four text fields are decoded
// with `decoder`, and the remaining fields are copied across unchanged.
454 pub fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord<'a> {
455 let eye_catcher = decoder.decode(&self.eye_catcher);
456 let file_label = decoder.decode(&self.file_label);
457 let creation_date = decoder.decode(&self.creation_date);
458 let creation_time = decoder.decode(&self.creation_time);
459 DecodedRecord::Header(HeaderRecord {
461 weight_index: self.weight_index,
462 n_cases: self.n_cases,
464 offsets: self.offsets.clone(),
466 layout_code: self.layout_code,
467 nominal_case_size: self.nominal_case_size,
468 compression: self.compression,
478 pub encoding: &'static Encoding,
479 pub warn: Box<dyn Fn(Error)>,
483 fn warn(&self, error: Error) {
// Decodes `input` from `self.encoding` without BOM handling, borrowing from
// `input` when the decoder can avoid allocating.
486 fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
487 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
// Reports a `MalformedString` warning carrying a copy of the decoded text;
// presumably guarded by `malformed` on a line not shown here.
489 self.warn(Error::MalformedString {
490 encoding: self.encoding.name().into(),
491 text: output.clone().into(),
// Decodes the bytes of `input` via `decode_slice`.
497 fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
498 self.decode_slice(input.0.as_slice())
501 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
502 /// re-encoding the result back into `self.encoding` will have exactly the
503 /// same length in bytes.
505 /// XXX warn about errors?
506 pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
507 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
508 // This is the common case. Usually there will be no errors.
511 // Unusual case. Don't bother to optimize it much.
512 let mut decoder = self.encoding.new_decoder_without_bom_handling();
513 let mut output = String::with_capacity(
515 .max_utf8_buffer_length_without_replacement(input.len())
518 let mut rest = input;
// Decode without replacement so each malformed run can be substituted by hand.
519 while !rest.is_empty() {
520 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
521 (DecoderResult::InputEmpty, _) => break,
// `output` was pre-sized for the whole input, so it cannot fill up.
522 (DecoderResult::OutputFull, _) => unreachable!(),
523 (DecoderResult::Malformed(a, b), consumed) => {
// Emit one `?` per byte reported in the malformed result, then resume
// decoding after the bytes consumed so far.
524 let skipped = a as usize + b as usize;
525 output.extend(repeat('?').take(skipped));
526 rest = &rest[consumed..];
// Panics if re-encoding does not reproduce the input length.
530 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
// Decodes `input` and validates the result as an identifier via
// `new_identifier`.
535 pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
536 self.new_identifier(&self.decode(input))
// Builds an `Identifier` from `name`, passing along this decoder's encoding.
539 pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
540 Identifier::new(name, self.encoding)
544 impl<S> Header for HeaderRecord<S>
548 fn offsets(&self) -> Range<u64> {
553 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
555 /// Regular system file.
558 /// System file with Zlib-compressed data.
561 /// EBCDIC-encoded system file.
566 /// Magic number for a regular system file.
567 pub const SAV: [u8; 4] = *b"$FL2";
569 /// Magic number for a system file that contains zlib-compressed data.
570 pub const ZSAV: [u8; 4] = *b"$FL3";
572 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
574 pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
// `Debug` output is the magic number's printable form.
577 impl Debug for Magic {
578 fn fmt(&self, f: &mut Formatter) -> FmtResult {
579 let s = match *self {
580 Magic::Sav => "$FL2",
581 Magic::Zsav => "$FL3",
// The EBCDIC magic is described rather than shown as raw bytes.
582 Magic::Ebcdic => "($FL2 in EBCDIC)",
// Recognizes the three known 4-byte magic numbers; any other value yields
// `Error::BadMagic` carrying the bytes that were found.
588 impl TryFrom<[u8; 4]> for Magic {
591 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
593 Magic::SAV => Ok(Magic::Sav),
594 Magic::ZSAV => Ok(Magic::Zsav),
595 Magic::EBCDIC => Ok(Magic::Ebcdic),
596 _ => Err(Error::BadMagic(value)),
601 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
// Returns the variable type implied by `width`: any string width maps to
// `String`, regardless of its length.
608 pub fn from_width(width: VarWidth) -> VarType {
610 VarWidth::Numeric => Self::Numeric,
611 VarWidth::String(_) => Self::String,
// Returns the other variable type (numeric <-> string). Used when formatting
// the `MixedVarTypes` error message.
615 pub fn opposite(self) -> VarType {
617 Self::Numeric => Self::String,
618 Self::String => Self::Numeric,
// User-facing lowercase name of the variable type.
623 impl Display for VarType {
624 fn fmt(&self, f: &mut Formatter) -> FmtResult {
626 VarType::Numeric => write!(f, "numeric"),
627 VarType::String => write!(f, "string"),
632 #[derive(Copy, Clone)]
641 type RawValue = Value<RawStr<8>>;
// `Debug` output: numbers print with their own `Debug` form, a missing number
// prints as `SYSMIS`, and strings print with the string type's `Debug` form.
643 impl<S> Debug for Value<S>
647 fn fmt(&self, f: &mut Formatter) -> FmtResult {
649 Value::Number(Some(number)) => write!(f, "{number:?}"),
650 Value::Number(None) => write!(f, "SYSMIS"),
651 Value::String(s) => write!(f, "{:?}", s),
// Reads one raw value from `r` and wraps it as `UntypedValue` for
// interpretation according to `var_type` and `endian`.
657 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
659 &UntypedValue(read_bytes(r)?),
// Interprets the 8 raw bytes in `raw` as a value of type `var_type`.
665 pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
// Strings keep the raw bytes as they are.
667 VarType::String => Value::String(RawStr(raw.0)),
668 VarType::Numeric => {
669 let number: f64 = endian.parse(raw.0);
// `-f64::MAX` is the sentinel for a missing number and becomes `None`.
670 Value::Number((number != -f64::MAX).then_some(number))
// Reads one uncompressed case: one 8-byte value per entry in `var_types`.
// The `Option` in the return type allows for "no case"; the branch that
// produces `None` is on lines not shown here.
675 fn read_case<R: Read + Seek>(
677 var_types: &[VarType],
679 ) -> Result<Option<Vec<Self>>, Error> {
680 let case_start = reader.stream_position()?;
681 let mut values = Vec::with_capacity(var_types.len());
682 for (i, &var_type) in var_types.iter().enumerate() {
683 let Some(raw) = try_read_bytes(reader)? else {
// Running out of data is reported as `EofInCase`, with the byte position
// inside the case and the full case length (8 bytes per variable).
687 let offset = reader.stream_position()?;
688 return Err(Error::EofInCase {
690 case_ofs: offset - case_start,
691 case_len: var_types.len() * 8,
695 values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
// Reads one case from bytecode-compressed data: one value per entry in
// `var_types`. `codes` holds compression codes carried over between calls
// and is refilled 8 codes at a time from `reader`.
//
// Errors: `EofInCompressedCase` / `PartialCompressedCase` when the data ends
// inside a case, `CompressedNumberExpected` when a string code appears for a
// numeric variable, `CompressedStringExpected` when a number code appears
// for a string variable, or an I/O error from `reader`.
700 fn read_compressed_case<R: Read + Seek>(
702 var_types: &[VarType],
703 codes: &mut VecDeque<u8>,
706 ) -> Result<Option<Vec<Self>>, Error> {
707 let case_start = reader.stream_position()?;
708 let mut values = Vec::with_capacity(var_types.len());
709 for (i, &var_type) in var_types.iter().enumerate() {
// Take the next code, refilling the queue from the reader when it is empty.
711 let Some(code) = codes.pop_front() else {
712 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
716 let offset = reader.stream_position()?;
717 return Err(Error::EofInCompressedCase {
719 case_ofs: offset - case_start,
723 codes.extend(new_codes.into_iter());
// Codes 1..=251 stand for the number `code - bias`.
728 1..=251 => match var_type {
729 VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
731 break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
738 let offset = reader.stream_position()?;
739 return Err(Error::PartialCompressedCase {
741 case_ofs: offset - case_start,
// The value follows literally as 8 raw bytes.
746 break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
// Code 254 is a string code, so a numeric variable here means a string was
// found where a number was expected: `CompressedNumberExpected`.
748 254 => match var_type {
749 VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
750 VarType::Numeric => {
751 return Err(Error::CompressedNumberExpected {
753 case_ofs: reader.stream_position()? - case_start,
// Code 255 is the missing number, so a string variable here means a number
// was found where a string was expected: `CompressedStringExpected`.
757 255 => match var_type {
758 VarType::Numeric => break Self::Number(None),
760 return Err(Error::CompressedStringExpected {
762 case_ofs: reader.stream_position()? - case_start,
// Returns this value with any string decoded to an owned `String`. Strings go
// through `decode_exact_length`; numbers are copied as they are.
773 fn decode(&self, decoder: &Decoder) -> Value<String> {
775 Self::Number(x) => Value::Number(*x),
776 Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
// Reader wrapper around a `ZlibDecoder`. The decoder sits in an `Option` so
// it can be taken out and replaced by a fresh one (see the `Read` impl).
781 struct ZlibDecodeMultiple<R>
785 reader: Option<ZlibDecoder<R>>,
788 impl<R> ZlibDecodeMultiple<R>
// Wraps `reader` in an initial `ZlibDecoder`.
792 fn new(reader: R) -> ZlibDecodeMultiple<R> {
794 reader: Some(ZlibDecoder::new(reader)),
799 impl<R> Read for ZlibDecodeMultiple<R>
803 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
// `self.reader` is always `Some` between calls, so the `unwrap`s hold.
805 match self.reader.as_mut().unwrap().read(buf)? {
// Replace the current decoder with a new one over the same underlying
// reader; presumably this arm handles a zero-length read, i.e. the end of
// one zlib stream (the match pattern is on a line not shown here).
807 let inner = self.reader.take().unwrap().into_inner();
808 self.reader = Some(ZlibDecoder::new(inner));
816 impl<R> Seek for ZlibDecodeMultiple<R>
// Seeks the underlying reader directly, so positions refer to the underlying
// (compressed) stream rather than to decompressed output.
// NOTE(review): the decoder's internal state is not reset here; confirm
// callers only rely on this for position queries.
820 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
821 self.reader.as_mut().unwrap().get_mut().seek(pos)
830 ztrailer_offset: u64,
839 R: Read + Seek + 'static,
842 warn: Box<dyn Fn(Error)>,
844 header: HeaderRecord<RawString>,
845 var_types: Vec<VarType>,
852 R: Read + Seek + 'static,
// Creates a reader over `reader`. The file header is read immediately, so an
// invalid header is reported here; `warn` is boxed and stored for later use.
854 pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
856 F: Fn(Error) + 'static,
858 let header = HeaderRecord::read(&mut reader)?;
860 reader: Some(reader),
861 warn: Box::new(warn),
863 var_types: Vec::new(),
864 state: ReaderState::Start,
// Builds the `Cases` reader and moves this reader to its final state. The
// underlying reader and the collected variable types are moved out of
// `self`, so a second call would panic on the `unwrap`.
867 fn cases(&mut self) -> Cases {
868 self.state = ReaderState::End;
870 self.reader.take().unwrap(),
871 take(&mut self.var_types),
// Yields the file's records one at a time, driven by `self.state`:
// the header first, then the dictionary records, then (for zlib-compressed
// files) the zlib header and trailer, then a `Record::Cases`, then nothing.
877 impl<R> Iterator for Reader<R>
879 R: Read + Seek + 'static,
881 type Item = Result<Record, Error>;
883 fn next(&mut self) -> Option<Self::Item> {
// The header was already read by `new`; return a copy of it.
885 ReaderState::Start => {
886 self.state = ReaderState::Headers;
887 Some(Ok(Record::Header(self.header.clone())))
889 ReaderState::Headers => {
892 self.reader.as_mut().unwrap(),
894 self.var_types.as_slice(),
// A record was read; leave the read loop with it. Errors are returned at once.
897 Ok(Some(record)) => break record,
899 Err(error) => return Some(Err(error)),
// Record the type of each variable as its record goes by; later records
// and the case data are interpreted using `var_types`.
903 Record::Variable(VariableRecord { width, .. }) => {
904 self.var_types.push(if width == 0 {
// After the end of the headers, zlib-compressed files continue with a
// zlib header.
910 Record::EndOfHeaders(_) => {
911 self.state = if let Some(Compression::ZLib) = self.header.compression {
912 ReaderState::ZlibHeader
921 ReaderState::ZlibHeader => {
922 let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
924 Ok(zheader) => zheader,
925 Err(error) => return Some(Err(error)),
// Remember where the header says the trailer is, for the next state.
927 self.state = ReaderState::ZlibTrailer {
928 ztrailer_offset: zheader.ztrailer_offset,
929 ztrailer_len: zheader.ztrailer_len,
931 Some(Ok(Record::ZHeader(zheader)))
933 ReaderState::ZlibTrailer {
937 match ZTrailer::read(
938 self.reader.as_mut().unwrap(),
// With no trailer, go straight to the cases.
943 Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
944 Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
945 Err(error) => Some(Err(error)),
// `cases()` also moves the state to `End`, so this is yielded only once.
948 ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
949 ReaderState::End => None,
// Combines `Read` and `Seek` into a single trait so that both can be named
// in one trait object (`Box<dyn ReadSeek>`, as used by `Cases`).
954 trait ReadSeek: Read + Seek {}
// Blanket impl: every `Read + Seek` type is automatically a `ReadSeek`.
955 impl<T> ReadSeek for T where T: Read + Seek {}
958 reader: Box<dyn ReadSeek>,
959 var_types: Vec<VarType>,
960 compression: Option<Compression>,
967 impl Debug for Cases {
968 fn fmt(&self, f: &mut Formatter) -> FmtResult {
// Creates a case reader over `reader`, taking the compression mode and byte
// order from `header`.
974 fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
976 R: Read + Seek + 'static,
// Zlib-compressed data is read through the decompressing wrapper; the reader
// is boxed so both cases share one field type.
979 reader: if header.compression == Some(Compression::ZLib) {
980 Box::new(ZlibDecodeMultiple::new(reader))
985 compression: header.compression,
987 endian: header.endian,
// Compression codes arrive 8 at a time (see `Value::read_compressed_case`).
988 codes: VecDeque::with_capacity(8),
// Yields one case (a vector of raw values) per call.
994 impl Iterator for Cases {
995 type Item = Result<Vec<RawValue>, Error>;
997 fn next(&mut self) -> Option<Self::Item> {
// Any compression mode uses the compressed-case reader; otherwise cases are
// read as plain 8-byte values.
1002 let retval = if self.compression.is_some() {
1003 Value::read_compressed_case(
1012 Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
// Both end of data and an error set the `eof` flag.
1014 self.eof = matches!(retval, None | Some(Err(_)));
// A format specification packed into a `u32`: the format type in the bits
// above 16, the width in bits 8..16, and the decimals in the low 8 bits.
1019 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
1020 pub struct Spec(pub u32);
// `Debug` prints the raw value in hex followed by `(<type><w>.<d>)`.
1022 impl Debug for Spec {
1023 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1024 let type_ = format_name(self.0 >> 16);
1025 let w = (self.0 >> 8) & 0xff;
1026 let d = self.0 & 0xff;
1027 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
// Returns a display name for format type code `type_`.
1031 fn format_name(type_: u32) -> Cow<'static, str> {
// Unrecognized codes get an allocated placeholder that includes the number.
1070 _ => return format!("<unknown format {type_}>").into(),
// The missing values of a variable: discrete values, a range, or both.
// `S` is the string type used for string values and defaults to `String`.
1076 pub struct MissingValues<S = String>
1080 /// Individual missing values, up to 3 of them.
1081 pub values: Vec<Value<S>>,
1083 /// Optional range of missing values.
1084 pub range: Option<(Value<S>, Value<S>)>,
// `Debug` output lists the discrete values, then the range as
// `low THRU high`; there is a separate branch for the empty case.
1087 impl<S> Debug for MissingValues<S>
1091 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1092 for (i, value) in self.values.iter().enumerate() {
1096 write!(f, "{value:?}")?;
1099 if let Some((low, high)) = &self.range {
// Something is written between the values and the range (line not shown).
1100 if !self.values.is_empty() {
1103 write!(f, "{low:?} THRU {high:?}")?;
1106 if self.is_empty() {
1114 impl<S> MissingValues<S>
// True when there are neither discrete missing values nor a range.
1118 fn is_empty(&self) -> bool {
1119 self.values.is_empty() && self.range.is_none()
1123 impl<S> Default for MissingValues<S>
1127 fn default() -> Self {
1135 impl MissingValues<RawStr<8>> {
// Reads the missing values that follow a variable record, as described by
// the variable's `width` and the record's missing-value `code`.
1136 fn read<R: Read + Seek>(
1142 ) -> Result<Self, Error> {
// Codes 0..=3 give that many discrete values. Numeric variables (width 0)
// also allow -2 (a range only) and -3 (a range plus one discrete value).
// Any other code is an error, reported separately for numeric and string
// variables.
1143 let (n_values, has_range) = match (width, code) {
1144 (_, 0..=3) => (code, false),
1145 (0, -2) => (0, true),
1146 (0, -3) => (1, true),
1147 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
1148 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
1151 let var_type = if width == 0 {
1157 let mut values = Vec::new();
1158 for _ in 0..n_values {
1159 values.push(RawValue::read(r, var_type, endian)?);
// A range is stored as two consecutive values: low, then high.
1161 let range = if has_range {
1162 let low = RawValue::read(r, var_type, endian)?;
1163 let high = RawValue::read(r, var_type, endian)?;
1168 Ok(Self { values, range })
// Returns these missing values with each value, and both range endpoints,
// decoded using `decoder`.
1170 fn decode<'a>(&'a self, decoder: &Decoder) -> MissingValues<String> {
1175 .map(|value| value.decode(decoder))
1180 .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
// A variable record. `S` is the string type for the name and label; `V` is
// the string type used inside the missing values.
1186 pub struct VariableRecord<S, V>
1191 /// Range of offsets in file.
1192 pub offsets: Range<u64>,
// NOTE(review): the `BadVariableWidth` error message gives the valid range
// as `[-1,255)`, which disagrees with the inclusive upper bound stated here.
1194 /// Variable width, in the range -1..=255.
1197 /// Variable name, padded on the right with spaces.
1201 pub print_format: Spec,
1204 pub write_format: Spec,
1207 pub missing_values: MissingValues<V>,
1209 /// Optional variable label.
1210 pub label: Option<S>,
// Multi-line `Debug` output: a description of the width followed by one line
// per remaining field.
1213 impl<S, V> Debug for VariableRecord<S, V>
1218 fn fmt(&self, f: &mut Formatter) -> FmtResult {
// The sign of the width tells what kind of record this is.
1223 match self.width.cmp(&0) {
1224 Ordering::Greater => "string",
1225 Ordering::Equal => "numeric",
1226 Ordering::Less => "long string continuation record",
1229 writeln!(f, "Print format: {:?}", self.print_format)?;
1230 writeln!(f, "Write format: {:?}", self.write_format)?;
1231 writeln!(f, "Name: {:?}", &self.name)?;
1232 writeln!(f, "Variable label: {:?}", self.label)?;
1233 writeln!(f, "Missing values: {:?}", self.missing_values)
1237 impl VariableRecord<RawString, RawStr<8>> {
// Reads a variable record from `r` (positioned just after the record type)
// and returns it as `Record::Variable`.
//
// Errors: `BadVariableLabelCode` if the label flag is not 0 or 1, a missing
// value code error from `MissingValues::read`, or an I/O error from `r`.
1238 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1239 let start_offset = r.stream_position()?;
1240 let width: i32 = endian.parse(read_bytes(r)?);
// Remember where the label flag lives so an error can point at it.
1241 let code_offset = r.stream_position()?;
1242 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
1243 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
1244 let print_format = Spec(endian.parse(read_bytes(r)?));
1245 let write_format = Spec(endian.parse(read_bytes(r)?));
1246 let name = RawString(read_vec(r, 8)?);
1248 let label = match has_variable_label {
1251 let len: u32 = endian.parse(read_bytes(r)?);
// Keep at most 65535 bytes of the label.
1252 let read_len = len.min(65535) as usize;
1253 let label = RawString(read_vec(r, read_len)?);
// Skip everything up to the next 4-byte boundary after the *declared*
// length: the alignment padding plus, for an over-long label, the part of
// the label that was not read. Subtracting `len` here instead would leave
// that tail in the stream and misalign every following record.
1255 let padding_bytes = Integer::next_multiple_of(&len, &4) - read_len as u32;
1256 let _ = read_vec(r, padding_bytes as usize)?;
1261 return Err(Error::BadVariableLabelCode {
1264 code: has_variable_label,
1269 let missing_values =
1270 MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
1272 let end_offset = r.stream_position()?;
1274 Ok(Record::Variable(VariableRecord {
1275 offsets: start_offset..end_offset,
// Returns a decoded copy of this record: the name, label and missing values
// are decoded with `decoder`; formats and offsets are copied unchanged.
1285 pub fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord {
1286 DecodedRecord::Variable(VariableRecord {
1287 offsets: self.offsets.clone(),
1289 name: decoder.decode(&self.name),
1290 print_format: self.print_format,
1291 write_format: self.write_format,
1292 missing_values: self.missing_values.decode(decoder),
1293 label: self.label.as_ref().map(|label| decoder.decode(label)),
// Eight raw bytes whose interpretation (number or string) is not yet known.
1298 #[derive(Copy, Clone)]
1299 pub struct UntypedValue(pub [u8; 8]);
// `Debug` shows both plausible readings of the bytes: as a number and as a
// string.
1301 impl Debug for UntypedValue {
1302 fn fmt(&self, f: &mut Formatter) -> FmtResult {
// The byte order is unknown here, so format the bytes as a float both ways
// and compare the lengths of the two renderings to pick one.
1303 let little: f64 = Endian::Little.parse(self.0);
1304 let little = format!("{:?}", little);
1305 let big: f64 = Endian::Big.parse(self.0);
1306 let big = format!("{:?}", big);
1307 let number = if little.len() <= big.len() {
1312 write!(f, "{number}")?;
1314 let string = default_decode(&self.0);
// The decoded text is split at NULs and other control characters.
1316 .split(|c: char| c == '\0' || c.is_control())
1319 write!(f, "{string:?}")?;
// A variable-length byte string in the file's encoding, not yet decoded.
1325 pub struct RawString(pub Vec<u8>);
1327 impl From<Vec<u8>> for RawString {
1328 fn from(source: Vec<u8>) -> Self {
1333 impl From<&[u8]> for RawString {
1334 fn from(source: &[u8]) -> Self {
// `Debug` prints a best-effort decoding (UTF-8, else Latin-1) of the bytes.
1339 impl Debug for RawString {
1340 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1341 write!(f, "{:?}", default_decode(self.0.as_slice()))
// A fixed-length (`N`-byte) string in the file's encoding, not yet decoded.
1345 #[derive(Copy, Clone)]
1346 pub struct RawStr<const N: usize>(pub [u8; N]);
1348 impl<const N: usize> From<[u8; N]> for RawStr<N> {
1349 fn from(source: [u8; N]) -> Self {
// `Debug` prints a best-effort decoding (UTF-8, else Latin-1) of the bytes.
1354 impl<const N: usize> Debug for RawStr<N> {
1355 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1356 write!(f, "{:?}", default_decode(&self.0))
// One value paired with its label. `V` is the string type inside the value;
// `S` is the label's string type.
1360 #[derive(Clone, Debug)]
1361 pub struct ValueLabel<V, S>
1366 pub value: Value<V>,
// A set of value labels together with the variables they apply to.
1371 pub struct ValueLabelRecord<V, S>
1376 /// Range of offsets in file.
1377 pub offsets: Range<u64>,
1380 pub labels: Vec<ValueLabel<V, S>>,
1382 /// The 1-based indexes of the variables that the labels apply to.
1383 pub dict_indexes: Vec<u32>,
1385 /// The types of the variables.
1386 pub var_type: VarType,
// `Debug` output: one line per label, then the variable type and the
// `#`-prefixed indexes of the variables the labels apply to.
1389 impl<V, S> Debug for ValueLabelRecord<V, S>
1394 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1395 writeln!(f, "labels: ")?;
1396 for label in self.labels.iter() {
1397 writeln!(f, "{label:?}")?;
1399 write!(f, "apply to {} variables", self.var_type)?;
1400 for dict_index in self.dict_indexes.iter() {
1401 write!(f, " #{dict_index}")?;
1407 impl<V, S> Header for ValueLabelRecord<V, S>
// Returns a copy of the record's range of file offsets.
1412 fn offsets(&self) -> Range<u64> {
1413 self.offsets.clone()
// Upper bounds applied to the counts read from the file before they are
// used (see `read`).
1417 impl<V, S> ValueLabelRecord<V, S>
1422 /// Maximum number of value labels in a record.
1423 pub const MAX_LABELS: u32 = u32::MAX / 8;
1425 /// Maximum number of variable indexes in a record.
1426 pub const MAX_INDEXES: u32 = u32::MAX / 8;
1429 impl ValueLabelRecord<RawStr<8>, RawString> {
// Reads a value label record and the variable index record that must follow
// it, returning them combined as one `Record::ValueLabel`.
//
// `var_types` gives the type of each variable seen so far and is used to
// validate the 1-based variable indexes. Problems that can be survived
// (out-of-range indexes, no usable indexes, indexes of mixed variable types)
// are reported through `warn`; the return type allows `Ok(None)` for a
// record that is dropped.
//
// Errors: `BadNumberOfValueLabels`, `ExpectedVarIndexRecord`,
// `TooManyVarIndexes`, or an I/O error from `r`.
1430 fn read<R: Read + Seek>(
1433 var_types: &[VarType],
1434 warn: &Box<dyn Fn(Error)>,
1435 ) -> Result<Option<Record>, Error> {
1436 let label_offset = r.stream_position()?;
1437 let n: u32 = endian.parse(read_bytes(r)?);
1438 if n > Self::MAX_LABELS {
1439 return Err(Error::BadNumberOfValueLabels {
1440 offset: label_offset,
1442 max: Self::MAX_LABELS,
// Values are kept untyped until the variable type is known from the indexes.
1446 let mut labels = Vec::new();
1448 let value = UntypedValue(read_bytes(r)?);
1449 let label_len: u8 = endian.parse(read_bytes(r)?);
1450 let label_len = label_len as usize;
// The length byte plus the label text are padded to a multiple of 8 bytes.
1451 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1453 let mut label = read_vec(r, padded_len - 1)?;
1454 label.truncate(label_len);
1455 labels.push((value, RawString(label)));
1458 let index_offset = r.stream_position()?;
1459 let rec_type: u32 = endian.parse(read_bytes(r)?);
1461 return Err(Error::ExpectedVarIndexRecord {
1462 offset: index_offset,
1467 let n: u32 = endian.parse(read_bytes(r)?);
1468 if n > Self::MAX_INDEXES {
1469 return Err(Error::TooManyVarIndexes {
1470 offset: index_offset,
1472 max: Self::MAX_INDEXES,
1476 let index_offset = r.stream_position()?;
1477 let mut dict_indexes = Vec::with_capacity(n as usize);
1478 let mut invalid_indexes = Vec::new();
1480 let index: u32 = endian.parse(read_bytes(r)?);
// Keep only indexes in the valid range [1, var_types.len()]. The test was
// previously inverted, which kept the invalid indexes and then indexed
// `var_types` out of bounds below.
1481 if index != 0 && index as usize <= var_types.len() {
1482 dict_indexes.push(index);
1484 invalid_indexes.push(index);
1487 if !invalid_indexes.is_empty() {
1488 warn(Error::InvalidVarIndexes {
1489 offset: index_offset,
1490 max: var_types.len(),
1491 invalid: invalid_indexes,
1495 let Some(&first_index) = dict_indexes.first() else {
1496 warn(Error::NoVarIndexes {
1497 offset: index_offset,
// The first valid index decides the record's variable type; indexes of
// variables with a different type are collected for a warning.
1501 let var_type = var_types[first_index as usize - 1];
1502 let mut wrong_type_indexes = Vec::new();
1503 dict_indexes.retain(|&index| {
1504 if var_types[index as usize - 1] != var_type {
1505 wrong_type_indexes.push(index);
1511 if !wrong_type_indexes.is_empty() {
1512 warn(Error::MixedVarTypes {
1513 offset: index_offset,
1515 wrong_types: wrong_type_indexes,
// Now that the type is known, give each untyped value its final form.
1521 .map(|(value, label)| ValueLabel {
1522 value: Value::from_raw(&value, var_type, endian),
1527 let end_offset = r.stream_position()?;
1528 Ok(Some(Record::ValueLabel(ValueLabelRecord {
1529 offsets: label_offset..end_offset,
1536 fn decode<'a>(&'a self, decoder: &Decoder) -> ValueLabelRecord<RawStr<8>, Cow<'a, str>> {
1540 .map(|ValueLabel { value, label }| ValueLabel {
1541 value: value.clone(),
1542 label: decoder.decode(label),
1546 offsets: self.offsets.clone(),
1548 dict_indexes: self.dict_indexes.clone(),
1549 var_type: self.var_type,
1554 #[derive(Clone, Debug)]
1555 pub struct DocumentRecord<S>
1559 pub offsets: Range<u64>,
1561 /// The document, as an array of 80-byte lines.
1565 pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;
1567 /// Length of a line in a document. Document lines are fixed-length and
1568 /// padded on the right with spaces.
1569 pub const DOC_LINE_LEN: usize = 80;
1571 impl DocumentRecord<RawDocumentLine> {
1572 /// Maximum number of lines we will accept in a document. This is simply
1573 /// the maximum number that will fit in a 32-bit space.
1574 pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
1576 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1577 let start_offset = r.stream_position()?;
1578 let n: u32 = endian.parse(read_bytes(r)?);
1580 if n > Self::MAX_LINES {
1581 Err(Error::BadDocumentLength {
1582 offset: start_offset,
1584 max: Self::MAX_LINES,
1587 let mut lines = Vec::with_capacity(n);
1589 lines.push(RawStr(read_bytes(r)?));
1591 let end_offset = r.stream_position()?;
1592 Ok(Record::Document(DocumentRecord {
1593 offsets: start_offset..end_offset,
1599 pub fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord {
1600 DecodedRecord::Document(DocumentRecord {
1601 offsets: self.offsets.clone(),
1605 .map(|s| decoder.decode_slice(&s.0))
1611 impl<S> Header for DocumentRecord<S>
1615 fn offsets(&self) -> Range<u64> {
1616 self.offsets.clone()
1620 trait ExtensionRecord {
1622 const SIZE: Option<u32>;
1623 const COUNT: Option<u32>;
1624 const NAME: &'static str;
1625 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error>;
1628 #[derive(Clone, Debug)]
1629 pub struct IntegerInfoRecord {
1630 pub offsets: Range<u64>,
1631 pub version: (i32, i32, i32),
1632 pub machine_code: i32,
1633 pub floating_point_rep: i32,
1634 pub compression_code: i32,
1635 pub endianness: i32,
1636 pub character_code: i32,
1639 impl ExtensionRecord for IntegerInfoRecord {
1640 const SUBTYPE: u32 = 3;
1641 const SIZE: Option<u32> = Some(4);
1642 const COUNT: Option<u32> = Some(8);
1643 const NAME: &'static str = "integer record";
1645 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1646 ext.check_size::<Self>()?;
1648 let mut input = &ext.data[..];
1649 let data: Vec<i32> = (0..8)
1650 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1652 Ok(Record::IntegerInfo(IntegerInfoRecord {
1653 offsets: ext.offsets.clone(),
1654 version: (data[0], data[1], data[2]),
1655 machine_code: data[3],
1656 floating_point_rep: data[4],
1657 compression_code: data[5],
1658 endianness: data[6],
1659 character_code: data[7],
1664 #[derive(Clone, Debug)]
1665 pub struct FloatInfoRecord {
1671 impl ExtensionRecord for FloatInfoRecord {
1672 const SUBTYPE: u32 = 4;
1673 const SIZE: Option<u32> = Some(8);
1674 const COUNT: Option<u32> = Some(3);
1675 const NAME: &'static str = "floating point record";
1677 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1678 ext.check_size::<Self>()?;
1680 let mut input = &ext.data[..];
1681 let data: Vec<f64> = (0..3)
1682 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1684 Ok(Record::FloatInfo(FloatInfoRecord {
1692 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1693 pub enum CategoryLabels {
1698 #[derive(Clone, Debug)]
1699 pub enum MultipleResponseType {
1702 labels: CategoryLabels,
1707 impl MultipleResponseType {
1708 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
1709 let (mr_type, input) = match input.split_first() {
1710 Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
1711 Some((b'D', input)) => {
1712 let (value, input) = parse_counted_string(input)?;
1714 MultipleResponseType::MultipleDichotomy {
1716 labels: CategoryLabels::VarLabels,
1721 Some((b'E', input)) => {
1722 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1723 (CategoryLabels::CountedValues, rest)
1724 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1725 (CategoryLabels::VarLabels, rest)
1727 return Err(Error::TBD);
1729 let (value, input) = parse_counted_string(input)?;
1731 MultipleResponseType::MultipleDichotomy { value, labels },
1735 _ => return Err(Error::TBD),
1737 Ok((mr_type, input))
1741 #[derive(Clone, Debug)]
1742 pub struct MultipleResponseSet<I, S>
1749 pub mr_type: MultipleResponseType,
1750 pub short_names: Vec<I>,
1753 impl MultipleResponseSet<RawString, RawString> {
1754 fn parse(input: &[u8]) -> Result<(Self, &[u8]), Error> {
1755 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1756 return Err(Error::TBD);
1758 let (name, input) = input.split_at(equals);
1759 let (mr_type, input) = MultipleResponseType::parse(input)?;
1760 let Some(input) = input.strip_prefix(b" ") else {
1761 return Err(Error::TBD);
1763 let (label, mut input) = parse_counted_string(input)?;
1764 let mut vars = Vec::new();
1765 while input.first() != Some(&b'\n') {
1766 match input.split_first() {
1767 Some((b' ', rest)) => {
1768 let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
1769 return Err(Error::TBD);
1771 let (var, rest) = rest.split_at(length);
1772 if !var.is_empty() {
1773 vars.push(var.into());
1777 _ => return Err(Error::TBD),
1780 while input.first() == Some(&b'\n') {
1781 input = &input[1..];
1784 MultipleResponseSet {
1797 ) -> Result<MultipleResponseSet<Identifier, Cow<'a, str>>, Error> {
1798 let mut short_names = Vec::with_capacity(self.short_names.len());
1799 for short_name in self.short_names.iter() {
1800 if let Some(short_name) = decoder
1801 .decode_identifier(short_name)
1802 .map_err(|err| Error::InvalidMrSetName(err))
1803 .warn_on_error(&decoder.warn)
1805 short_names.push(short_name);
1808 Ok(MultipleResponseSet {
1810 .decode_identifier(&self.name)
1811 .map_err(|err| Error::InvalidMrSetVariableName(err))?,
1812 label: decoder.decode(&self.label),
1813 mr_type: self.mr_type.clone(),
1814 short_names: short_names,
1819 #[derive(Clone, Debug)]
1820 pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
1825 impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
1826 const SUBTYPE: u32 = 7;
1827 const SIZE: Option<u32> = Some(1);
1828 const COUNT: Option<u32> = None;
1829 const NAME: &'static str = "multiple response set record";
1831 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
1832 ext.check_size::<Self>()?;
1834 let mut input = &ext.data[..];
1835 let mut sets = Vec::new();
1836 while !input.is_empty() {
1837 let (set, rest) = MultipleResponseSet::parse(input)?;
1841 Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
1845 impl MultipleResponseRecord<RawString, RawString> {
1846 fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord {
1847 let mut sets = Vec::new();
1848 for set in self.0.iter() {
1849 if let Some(set) = set.decode(decoder).warn_on_error(&decoder.warn) {
1853 DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
1857 fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Error> {
1858 let Some(space) = input.iter().position(|&b| b == b' ') else {
1859 return Err(Error::TBD);
1861 let Ok(length) = from_utf8(&input[..space]) else {
1862 return Err(Error::TBD);
1864 let Ok(length): Result<usize, _> = length.parse() else {
1865 return Err(Error::TBD);
1868 let input = &input[space + 1..];
1869 if input.len() < length {
1870 return Err(Error::TBD);
1873 let (string, rest) = input.split_at(length);
1874 Ok((string.into(), rest))
1877 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1885 pub fn default_for_type(var_type: VarType) -> Option<Measure> {
1887 VarType::Numeric => None,
1888 VarType::String => Some(Self::Nominal),
1892 fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
1895 1 => Ok(Some(Measure::Nominal)),
1896 2 => Ok(Some(Measure::Ordinal)),
1897 3 => Ok(Some(Measure::Scale)),
1898 _ => Err(Error::InvalidMeasurement(source)),
1903 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1904 pub enum Alignment {
1911 fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
1914 1 => Ok(Some(Alignment::Left)),
1915 2 => Ok(Some(Alignment::Right)),
1916 3 => Ok(Some(Alignment::Center)),
1917 _ => Err(Error::InvalidAlignment(source)),
1921 pub fn default_for_type(var_type: VarType) -> Self {
1923 VarType::Numeric => Self::Right,
1924 VarType::String => Self::Left,
1929 #[derive(Clone, Debug)]
1930 pub struct VarDisplay {
1931 pub measure: Option<Measure>,
1932 pub width: Option<u32>,
1933 pub alignment: Option<Alignment>,
1936 #[derive(Clone, Debug)]
1937 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
1939 impl VarDisplayRecord {
1940 const SUBTYPE: u32 = 11;
1946 warn: &Box<dyn Fn(Error)>,
1947 ) -> Result<Record, Error> {
1949 return Err(Error::BadRecordSize {
1950 offset: ext.offsets.start,
1951 record: String::from("variable display record"),
1957 let has_width = if ext.count as usize == 3 * n_vars {
1959 } else if ext.count as usize == 2 * n_vars {
1962 return Err(Error::TBD);
1965 let mut var_displays = Vec::new();
1966 let mut input = &ext.data[..];
1967 for _ in 0..n_vars {
1968 let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1969 .warn_on_error(&warn)
1971 let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
1972 let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1973 .warn_on_error(&warn)
1975 var_displays.push(VarDisplay {
1981 Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
1985 #[derive(Clone, Debug)]
1986 pub struct LongStringMissingValues<N, V>
1995 pub missing_values: MissingValues<V>,
1998 impl LongStringMissingValues<RawString, RawStr<8>> {
2002 ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
2003 Ok(LongStringMissingValues {
2004 var_name: decoder.decode_identifier(&self.var_name)?,
2005 missing_values: self.missing_values.decode(decoder),
2010 #[derive(Clone, Debug)]
2011 pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
2016 impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
2017 const SUBTYPE: u32 = 22;
2018 const SIZE: Option<u32> = Some(1);
2019 const COUNT: Option<u32> = None;
2020 const NAME: &'static str = "long string missing values record";
2022 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
2023 ext.check_size::<Self>()?;
2025 let mut input = &ext.data[..];
2026 let mut missing_value_set = Vec::new();
2027 while !input.is_empty() {
2028 let var_name = read_string(&mut input, endian)?;
2029 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
2030 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
2032 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
2033 return Err(Error::BadLongMissingValueLength {
2034 record_offset: ext.offsets.start,
2039 let mut values = Vec::new();
2040 for i in 0..n_missing_values {
2041 let value: [u8; 8] = read_bytes(&mut input)?;
2042 let numeric_value: u64 = endian.parse(value);
2043 let value = if i > 0 && numeric_value == 8 {
2044 // Tolerate files written by old, buggy versions of PSPP
2045 // where we believed that the value_length was repeated
2046 // before each missing value.
2047 read_bytes(&mut input)?
2051 values.push(Value::String(RawStr(value)));
2053 let missing_values = MissingValues {
2057 missing_value_set.push(LongStringMissingValues {
2062 Ok(Record::LongStringMissingValues(
2063 LongStringMissingValueRecord(missing_value_set),
2068 impl LongStringMissingValueRecord<RawString, RawStr<8>> {
2072 ) -> LongStringMissingValueRecord<Identifier, String> {
2073 let mut mvs = Vec::with_capacity(self.0.len());
2074 for mv in self.0.iter() {
2075 if let Some(mv) = mv
2077 .map_err(|err| Error::InvalidLongStringMissingValueVariableName(err))
2078 .warn_on_error(&decoder.warn)
2083 LongStringMissingValueRecord(mvs)
2087 #[derive(Clone, Debug)]
2088 pub struct EncodingRecord(pub String);
2090 impl ExtensionRecord for EncodingRecord {
2091 const SUBTYPE: u32 = 20;
2092 const SIZE: Option<u32> = Some(1);
2093 const COUNT: Option<u32> = None;
2094 const NAME: &'static str = "encoding record";
2096 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
2097 ext.check_size::<Self>()?;
2099 Ok(Record::Encoding(EncodingRecord(
2100 String::from_utf8(ext.data.clone()).map_err(|_| Error::BadEncodingName {
2101 offset: ext.offsets.start,
2107 #[derive(Copy, Clone, Debug)]
2108 pub struct NumberOfCasesRecord {
2109 /// Always observed as 1.
2112 /// Number of cases.
2116 impl ExtensionRecord for NumberOfCasesRecord {
2117 const SUBTYPE: u32 = 16;
2118 const SIZE: Option<u32> = Some(8);
2119 const COUNT: Option<u32> = Some(2);
2120 const NAME: &'static str = "extended number of cases record";
2122 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
2123 ext.check_size::<Self>()?;
2125 let mut input = &ext.data[..];
2126 let one = endian.parse(read_bytes(&mut input)?);
2127 let n_cases = endian.parse(read_bytes(&mut input)?);
2129 Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
2133 #[derive(Clone, Debug)]
2134 pub struct TextRecord {
2135 pub offsets: Range<u64>,
2138 pub rec_type: TextRecordType,
2140 /// The text content of the record.
2141 pub text: RawString,
2144 #[derive(Clone, Copy, Debug)]
2145 pub enum TextRecordType {
2155 fn new(extension: Extension, rec_type: TextRecordType) -> Self {
2157 offsets: extension.offsets,
2159 text: extension.data.into(),
2162 pub fn decode<'a>(&self, decoder: &Decoder) -> Result<DecodedRecord, Error> {
2163 match self.rec_type {
2164 TextRecordType::VariableSets => Ok(DecodedRecord::VariableSets(
2165 VariableSetRecord::decode(self, decoder),
2167 TextRecordType::ProductInfo => Ok(DecodedRecord::ProductInfo(
2168 ProductInfoRecord::decode(self, decoder),
2170 TextRecordType::LongNames => Ok(DecodedRecord::LongNames(LongNamesRecord::decode(
2173 TextRecordType::VeryLongStrings => Ok(DecodedRecord::VeryLongStrings(
2174 VeryLongStringsRecord::decode(self, decoder),
2176 TextRecordType::FileAttributes => Ok(DecodedRecord::FileAttributes(
2177 FileAttributeRecord::decode(self, decoder),
2179 TextRecordType::VariableAttributes => Ok(DecodedRecord::VariableAttributes(
2180 VariableAttributeRecord::decode(self, decoder),
2186 #[derive(Clone, Debug)]
2187 pub struct VeryLongString {
2188 pub short_name: Identifier,
2192 impl VeryLongString {
2193 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
2194 let Some((short_name, length)) = input.split_once('=') else {
2195 return Err(Error::TBD);
2197 let short_name = decoder
2198 .new_identifier(short_name)
2199 .map_err(Error::InvalidLongStringName)?;
2200 let length = length.parse().map_err(|_| Error::TBD)?;
2201 Ok(VeryLongString { short_name, length })
2205 #[derive(Clone, Debug)]
2206 pub struct VeryLongStringsRecord(Vec<VeryLongString>);
2208 impl VeryLongStringsRecord {
2209 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2210 let input = decoder.decode(&source.text);
2211 let mut very_long_strings = Vec::new();
2214 .map(|s| s.trim_end_matches('\t'))
2215 .filter(|s| !s.is_empty())
2217 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&decoder.warn) {
2218 very_long_strings.push(vls)
2221 VeryLongStringsRecord(very_long_strings)
2225 #[derive(Clone, Debug)]
2226 pub struct Attribute {
2227 pub name: Identifier,
2228 pub values: Vec<String>,
2232 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Error> {
2233 let Some((name, mut input)) = input.split_once('(') else {
2234 return Err(Error::TBD);
2237 .new_identifier(name)
2238 .map_err(Error::InvalidAttributeName)?;
2239 let mut values = Vec::new();
2241 let Some((value, rest)) = input.split_once('\n') else {
2242 return Err(Error::TBD);
2244 if let Some(stripped) = value
2246 .and_then(|value| value.strip_suffix('\''))
2248 values.push(stripped.into());
2250 decoder.warn(Error::TBD);
2251 values.push(value.into());
2253 if let Some(rest) = rest.strip_prefix(')') {
2254 let attribute = Attribute { name, values };
2255 return Ok((attribute, rest));
2262 #[derive(Clone, Debug)]
2263 pub struct AttributeSet(pub HashMap<Identifier, Vec<String>>);
2269 sentinel: Option<char>,
2270 ) -> Result<(AttributeSet, &'a str), Error> {
2271 let mut attributes = HashMap::new();
2273 match input.chars().next() {
2274 None => break input,
2275 c if c == sentinel => break &input[1..],
2277 let (attribute, rest) = Attribute::parse(decoder, input)?;
2278 // XXX report duplicate name
2279 attributes.insert(attribute.name, attribute.values);
2284 Ok((AttributeSet(attributes), rest))
2288 impl Default for AttributeSet {
2289 fn default() -> Self {
2290 Self(HashMap::default())
2294 #[derive(Clone, Debug)]
2295 pub struct FileAttributeRecord(AttributeSet);
2297 impl FileAttributeRecord {
2298 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2299 let input = decoder.decode(&source.text);
2300 match AttributeSet::parse(decoder, &input, None).warn_on_error(&decoder.warn) {
2301 Some((set, rest)) => {
2302 if !rest.is_empty() {
2303 decoder.warn(Error::TBD);
2305 FileAttributeRecord(set)
2307 None => FileAttributeRecord::default(),
2312 impl Default for FileAttributeRecord {
2313 fn default() -> Self {
2314 Self(AttributeSet::default())
2318 #[derive(Clone, Debug)]
2319 pub struct VarAttributeSet {
2320 pub long_var_name: Identifier,
2321 pub attributes: AttributeSet,
2324 impl VarAttributeSet {
2325 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Error> {
2326 let Some((long_var_name, rest)) = input.split_once(':') else {
2327 return Err(Error::TBD);
2329 let long_var_name = decoder
2330 .new_identifier(long_var_name)
2331 .map_err(Error::InvalidAttributeVariableName)?;
2332 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
2333 let var_attribute = VarAttributeSet {
2337 Ok((var_attribute, rest))
2341 #[derive(Clone, Debug)]
2342 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
2344 impl VariableAttributeRecord {
2345 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2346 let decoded = decoder.decode(&source.text);
2347 let mut input = decoded.as_ref();
2348 let mut var_attribute_sets = Vec::new();
2349 while !input.is_empty() {
2350 let Some((var_attribute, rest)) =
2351 VarAttributeSet::parse(decoder, &input).warn_on_error(&decoder.warn)
2355 var_attribute_sets.push(var_attribute);
2356 input = rest.into();
2358 VariableAttributeRecord(var_attribute_sets)
2362 #[derive(Clone, Debug)]
2363 pub struct LongName {
2364 pub short_name: Identifier,
2365 pub long_name: Identifier,
2369 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Error> {
2370 let Some((short_name, long_name)) = input.split_once('=') else {
2371 return Err(Error::TBD);
2373 let short_name = decoder
2374 .new_identifier(short_name)
2375 .map_err(Error::InvalidShortName)?;
2376 let long_name = decoder
2377 .new_identifier(long_name)
2378 .map_err(Error::InvalidLongName)?;
2386 #[derive(Clone, Debug)]
2387 pub struct LongNamesRecord(Vec<LongName>);
2389 impl LongNamesRecord {
2390 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2391 let input = decoder.decode(&source.text);
2392 let mut names = Vec::new();
2393 for pair in input.split('\t').filter(|s| !s.is_empty()) {
2394 if let Some(long_name) = LongName::parse(pair, decoder).warn_on_error(&decoder.warn) {
2395 names.push(long_name);
2398 LongNamesRecord(names)
2402 #[derive(Clone, Debug)]
2403 pub struct ProductInfoRecord(pub String);
2405 impl ProductInfoRecord {
2406 const NAME: &'static str = "extra product info";
2407 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2408 Self(decoder.decode(&source.text).into())
2411 #[derive(Clone, Debug)]
2412 pub struct VariableSet {
2414 pub vars: Vec<Identifier>,
2418 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Error> {
2419 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
2420 let mut vars = Vec::new();
2421 for var in input.split_ascii_whitespace() {
2422 if let Some(identifier) = decoder
2423 .new_identifier(var)
2424 .map_err(Error::InvalidVariableSetName)
2425 .warn_on_error(&decoder.warn)
2427 vars.push(identifier);
2437 #[derive(Clone, Debug)]
2438 pub struct VariableSetRecord {
2439 pub offsets: Range<u64>,
2440 pub sets: Vec<VariableSet>,
2443 impl VariableSetRecord {
2444 fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
2445 let mut sets = Vec::new();
2446 let input = decoder.decode(&source.text);
2447 for line in input.lines() {
2448 if let Some(set) = VariableSet::parse(line, decoder).warn_on_error(&decoder.warn) {
2453 offsets: source.offsets.clone(),
2459 trait WarnOnError<T> {
2460 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
2462 impl<T> WarnOnError<T> for Result<T, Error> {
2463 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
2465 Ok(result) => Some(result),
2474 #[derive(Clone, Debug)]
2475 pub struct Extension {
2476 pub offsets: Range<u64>,
2481 /// Size of each data element.
2484 /// Number of data elements.
2487 /// `size * count` bytes of data.
2492 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
2493 if let Some(expected_size) = E::SIZE {
2494 if self.size != expected_size {
2495 return Err(Error::BadRecordSize {
2496 offset: self.offsets.start,
2497 record: E::NAME.into(),
2503 if let Some(expected_count) = E::COUNT {
2504 if self.count != expected_count {
2505 return Err(Error::BadRecordCount {
2506 offset: self.offsets.start,
2507 record: E::NAME.into(),
2516 fn read<R: Read + Seek>(
2520 warn: &Box<dyn Fn(Error)>,
2521 ) -> Result<Option<Record>, Error> {
2522 let subtype = endian.parse(read_bytes(r)?);
2523 let header_offset = r.stream_position()?;
2524 let size: u32 = endian.parse(read_bytes(r)?);
2525 let count = endian.parse(read_bytes(r)?);
2526 let Some(product) = size.checked_mul(count) else {
2527 return Err(Error::ExtensionRecordTooLarge {
2528 offset: header_offset,
2534 let start_offset = r.stream_position()?;
2535 let data = read_vec(r, product as usize)?;
2536 let end_offset = start_offset + product as u64;
2537 let extension = Extension {
2538 offsets: start_offset..end_offset,
2544 let result = match subtype {
2545 IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
2546 FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
2547 VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
2548 MultipleResponseRecord::SUBTYPE | 19 => {
2549 MultipleResponseRecord::parse(&extension, endian)
2551 LongStringValueLabelRecord::SUBTYPE => {
2552 LongStringValueLabelRecord::parse(&extension, endian)
2554 EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
2555 NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
2556 5 => Ok(Record::Text(TextRecord::new(
2558 TextRecordType::VariableSets,
2560 10 => Ok(Record::Text(TextRecord::new(
2562 TextRecordType::ProductInfo,
2564 13 => Ok(Record::Text(TextRecord::new(
2566 TextRecordType::LongNames,
2568 14 => Ok(Record::Text(TextRecord::new(
2570 TextRecordType::VeryLongStrings,
2572 17 => Ok(Record::Text(TextRecord::new(
2574 TextRecordType::FileAttributes,
2576 18 => Ok(Record::Text(TextRecord::new(
2578 TextRecordType::VariableAttributes,
2580 _ => Ok(Record::OtherExtension(extension)),
2583 Ok(result) => Ok(Some(result)),
2592 #[derive(Clone, Debug)]
2593 pub struct ZHeader {
2594 /// File offset to the start of the record.
2597 /// File offset to the ZLIB data header.
2598 pub zheader_offset: u64,
2600 /// File offset to the ZLIB trailer.
2601 pub ztrailer_offset: u64,
2603 /// Length of the ZLIB trailer in bytes.
2604 pub ztrailer_len: u64,
2608 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
2609 let offset = r.stream_position()?;
2610 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
2611 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
2612 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
2623 #[derive(Clone, Debug)]
2624 pub struct ZTrailer {
2625 /// File offset to the start of the record.
2628 /// Compression bias as a negative integer, e.g. -100.
2631 /// Always observed as zero.
2634 /// Uncompressed size of each block, except possibly the last. Only
2635 /// `0x3ff000` has been observed so far.
2636 pub block_size: u32,
2638 /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
2639 pub blocks: Vec<ZBlock>,
2642 #[derive(Clone, Debug)]
2644 /// Offset of block of data if simple compression were used.
2645 pub uncompressed_ofs: u64,
2647 /// Actual offset within the file of the compressed data block.
2648 pub compressed_ofs: u64,
2650 /// The number of bytes in this data block after decompression. This is
2651 /// `block_size` in every data block but the last, which may be smaller.
2652 pub uncompressed_size: u32,
2654 /// The number of bytes in this data block, as stored compressed in this
2656 pub compressed_size: u32,
2660 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
2662 uncompressed_ofs: endian.parse(read_bytes(r)?),
2663 compressed_ofs: endian.parse(read_bytes(r)?),
2664 uncompressed_size: endian.parse(read_bytes(r)?),
2665 compressed_size: endian.parse(read_bytes(r)?),
2671 fn read<R: Read + Seek>(
2676 ) -> Result<Option<ZTrailer>, Error> {
2677 let start_offset = reader.stream_position()?;
2678 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
2681 let int_bias = endian.parse(read_bytes(reader)?);
2682 let zero = endian.parse(read_bytes(reader)?);
2683 let block_size = endian.parse(read_bytes(reader)?);
2684 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
2685 let expected_n_blocks = (ztrailer_len - 24) / 24;
2686 if n_blocks as u64 != expected_n_blocks {
2687 return Err(Error::BadZlibTrailerNBlocks {
2688 offset: ztrailer_ofs,
2694 let blocks = (0..n_blocks)
2695 .map(|_| ZBlock::read(reader, endian))
2696 .collect::<Result<Vec<_>, _>>()?;
2697 reader.seek(SeekFrom::Start(start_offset))?;
2699 offset: ztrailer_ofs,
/// Reads exactly `N` bytes from `r`, or reports a clean end of input.
///
/// Returns `Ok(None)` if the first read yields no bytes at all, and
/// `Ok(Some(bytes))` once all `N` bytes have been read.
///
/// # Errors
///
/// Fails if the input ends after at least one byte but before `N` bytes
/// (`UnexpectedEof`), or on any other I/O error.  An interrupted first read
/// is retried, matching what `read_exact` does for the remainder.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    };
    if n == 0 {
        return Ok(None);
    }
    if n < N {
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// # Errors
///
/// Fails if fewer than `N` bytes are available or on any other I/O error.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    match r.read_exact(&mut bytes) {
        Ok(()) => Ok(bytes),
        Err(error) => Err(error),
    }
}
/// Reads exactly `n` bytes from `r` into a new vector.
///
/// `n` usually comes straight from the file being read, so the buffer grows
/// as data actually arrives instead of being allocated (and zeroed) at the
/// full claimed size up front.
///
/// # Errors
///
/// Fails with `UnexpectedEof` if fewer than `n` bytes are available, or on
/// any other I/O error.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    let limit = u64::try_from(n).unwrap_or(u64::MAX);
    let mut vec = Vec::new();
    r.by_ref().take(limit).read_to_end(&mut vec)?;
    if vec.len() < n {
        return Err(IoError::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to fill whole buffer",
        ));
    }
    Ok(vec)
}
2733 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
2734 let length: u32 = endian.parse(read_bytes(r)?);
2735 Ok(read_vec(r, length as usize)?.into())
2738 #[derive(Clone, Debug)]
2739 pub struct LongStringValueLabels<N, S>
2746 /// `(value, label)` pairs, where each value is `width` bytes.
2747 pub labels: Vec<(S, S)>,
2750 impl LongStringValueLabels<RawString, RawString> {
2754 ) -> Result<LongStringValueLabels<Identifier, Cow<'a, str>>, Error> {
2755 let var_name = decoder.decode(&self.var_name);
2756 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
2757 .map_err(Error::InvalidLongStringValueLabelName)?;
2759 let mut labels = Vec::with_capacity(self.labels.len());
2760 for (value, label) in self.labels.iter() {
2761 let value = decoder.decode_exact_length(&value.0);
2762 let label = decoder.decode(&label);
2763 labels.push((value, label));
2766 Ok(LongStringValueLabels {
2774 #[derive(Clone, Debug)]
2775 pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
2780 impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
2781 const SUBTYPE: u32 = 21;
2782 const SIZE: Option<u32> = Some(1);
2783 const COUNT: Option<u32> = None;
2784 const NAME: &'static str = "long string value labels record";
2786 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
2787 ext.check_size::<Self>()?;
2789 let mut input = &ext.data[..];
2790 let mut label_set = Vec::new();
2791 while !input.is_empty() {
2792 let var_name = read_string(&mut input, endian)?;
2793 let width: u32 = endian.parse(read_bytes(&mut input)?);
2794 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
2795 let mut labels = Vec::new();
2796 for _ in 0..n_labels {
2797 let value = read_string(&mut input, endian)?;
2798 let label = read_string(&mut input, endian)?;
2799 labels.push((value, label));
2801 label_set.push(LongStringValueLabels {
2807 Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
2813 impl LongStringValueLabelRecord<RawString, RawString> {
2817 ) -> Result<LongStringValueLabelRecord<Identifier, Cow<'a, str>>, Error> {
2818 let mut labels = Vec::with_capacity(self.0.len());
2819 for label in &self.0 {
2820 match label.decode(decoder) {
2821 Ok(set) => labels.push(set),
2822 Err(error) => decoder.warn(error),
2825 Ok(LongStringValueLabelRecord(labels))