1 use crate::endian::{Endian, Parse, ToBytes};
3 use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
4 use flate2::read::ZlibDecoder;
10 collections::VecDeque,
11 fmt::{Debug, Display, Formatter, Result as FmtResult},
12 io::{Error as IoError, Read, Seek, SeekFrom},
19 use thiserror::Error as ThisError;
21 #[derive(ThisError, Debug)]
23 #[error("Not an SPSS system file")]
26 #[error("Invalid magic number {0:?}")]
29 #[error("I/O error ({0})")]
32 #[error("Invalid SAV compression code {0}")]
33 InvalidSavCompression(u32),
35 #[error("Invalid ZSAV compression code {0}")]
36 InvalidZsavCompression(u32),
38 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
39 BadVariableWidth { offset: u64, width: i32 },
41 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
42 BadDocumentLength { offset: u64, n: usize, max: usize },
44 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
45 BadRecordType { offset: u64, rec_type: u32 },
47 #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
48 BadVariableLabelCode {
55 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
57 BadNumericMissingValueCode { offset: u64, code: i32 },
59 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
60 BadStringMissingValueCode { offset: u64, code: i32 },
62 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
63 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
65 #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
66 ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
68 #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
69 TooManyVarIndexes { offset: u64, n: u32, max: u32 },
71 #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
72 NoVarIndexes { offset: u64 },
74 #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
78 wrong_types: Vec<u32>,
81 #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
88 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
89 ExtensionRecordTooLarge {
96 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
104 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
106 EofInCompressedCase { offset: u64, case_ofs: u64 },
108 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
109 PartialCompressedCase { offset: u64, case_ofs: u64 },
111 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
112 CompressedNumberExpected { offset: u64, case_ofs: u64 },
114 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
115 CompressedStringExpected { offset: u64, case_ofs: u64 },
117 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
118 BadZlibTrailerNBlocks {
121 expected_n_blocks: u64,
125 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
133 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
141 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
142 BadLongMissingValueLength {
148 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
149 BadEncodingName { offset: u64 },
151 // XXX This is risky because `text` might be arbitrarily long.
152 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
153 MalformedString { encoding: String, text: String },
155 #[error("Invalid variable measurement level value {0}")]
156 InvalidMeasurement(u32),
158 #[error("Invalid variable display alignment value {0}")]
159 InvalidAlignment(u32),
161 #[error("Details TBD")]
165 #[derive(Clone, Debug)]
167 Header(HeaderRecord<RawString>),
168 Variable(VariableRecord<RawString, RawStr<8>>),
169 ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
170 Document(DocumentRecord<RawDocumentLine>),
171 IntegerInfo(IntegerInfoRecord),
172 FloatInfo(FloatInfoRecord),
173 VariableSets(VariableSetRecord),
174 VarDisplay(VarDisplayRecord),
175 MultipleResponse(MultipleResponseRecord<RawString>),
176 LongStringValueLabels(LongStringValueLabelRecord<RawString>),
177 LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
178 Encoding(EncodingRecord),
179 NumberOfCases(NumberOfCasesRecord),
180 ProductInfo(ProductInfoRecord),
181 LongNames(LongNamesRecord),
182 VeryLongStrings(VeryLongStringsRecord),
183 FileAttributes(FileAttributeRecord),
184 VariableAttributes(VariableAttributeRecord),
186 OtherExtension(Extension),
190 Cases(Rc<RefCell<Cases>>),
197 var_types: &[VarType],
198 warn: &Box<dyn Fn(Error)>,
199 ) -> Result<Option<Record>, Error>
203 let rec_type: u32 = endian.parse(read_bytes(reader)?);
205 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
206 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
207 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
208 7 => Extension::read(reader, endian, var_types.len(), warn),
209 999 => Ok(Some(Record::EndOfHeaders(
210 endian.parse(read_bytes(reader)?),
212 _ => Err(Error::BadRecordType {
213 offset: reader.stream_position()?,
220 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
221 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
222 fn default_decode(s: &[u8]) -> Cow<str> {
223 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
226 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
227 pub enum Compression {
233 fn offsets(&self) -> Range<u64>;
237 pub struct HeaderRecord<S>
242 pub offsets: Range<u64>,
247 /// Eye-catcher string, product name, in the file's encoding. Padded
248 /// on the right with spaces.
251 /// Layout code, normally either 2 or 3.
252 pub layout_code: u32,
254 /// Number of variable positions, or `None` if the value in the file is
255 /// questionably trustworthy.
256 pub nominal_case_size: Option<u32>,
258 /// Compression type, if any,
259 pub compression: Option<Compression>,
261 /// 1-based variable index of the weight variable, or `None` if the file is
263 pub weight_index: Option<u32>,
265 /// Claimed number of cases, if known.
266 pub n_cases: Option<u32>,
268 /// Compression bias, usually 100.0.
271 /// `dd mmm yy` in the file's encoding.
272 pub creation_date: S,
274 /// `HH:MM:SS` in the file's encoding.
275 pub creation_time: S,
277 /// File label, in the file's encoding. Padded on the right with spaces.
280 /// Endianness of the data in the file header.
284 impl<S> HeaderRecord<S>
288 fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
292 writeln!(f, "{name:>17}: {:?}", value)
296 impl<S> Debug for HeaderRecord<S>
300 fn fmt(&self, f: &mut Formatter) -> FmtResult {
301 writeln!(f, "File header record:")?;
302 self.debug_field(f, "Magic", self.magic)?;
303 self.debug_field(f, "Product name", &self.eye_catcher)?;
304 self.debug_field(f, "Layout code", self.layout_code)?;
305 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
306 self.debug_field(f, "Compression", self.compression)?;
307 self.debug_field(f, "Weight index", self.weight_index)?;
308 self.debug_field(f, "Number of cases", self.n_cases)?;
309 self.debug_field(f, "Compression bias", self.bias)?;
310 self.debug_field(f, "Creation date", &self.creation_date)?;
311 self.debug_field(f, "Creation time", &self.creation_time)?;
312 self.debug_field(f, "File label", &self.file_label)?;
313 self.debug_field(f, "Endianness", self.endian)
317 impl HeaderRecord<RawString> {
318 fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
319 let start = r.stream_position()?;
321 let magic: [u8; 4] = read_bytes(r)?;
322 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
324 let eye_catcher = RawString(read_vec(r, 60)?);
325 let layout_code: [u8; 4] = read_bytes(r)?;
326 let endian = Endian::identify_u32(2, layout_code)
327 .or_else(|| Endian::identify_u32(2, layout_code))
328 .ok_or_else(|| Error::NotASystemFile)?;
329 let layout_code = endian.parse(layout_code);
331 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
332 let nominal_case_size =
333 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
335 let compression_code: u32 = endian.parse(read_bytes(r)?);
336 let compression = match (magic, compression_code) {
337 (Magic::Zsav, 2) => Some(Compression::ZLib),
338 (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
340 (_, 1) => Some(Compression::Simple),
341 (_, code) => return Err(Error::InvalidSavCompression(code)),
344 let weight_index: u32 = endian.parse(read_bytes(r)?);
345 let weight_index = (weight_index > 0).then_some(weight_index);
347 let n_cases: u32 = endian.parse(read_bytes(r)?);
348 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
350 let bias: f64 = endian.parse(read_bytes(r)?);
352 let creation_date = RawString(read_vec(r, 9)?);
353 let creation_time = RawString(read_vec(r, 8)?);
354 let file_label = RawString(read_vec(r, 64)?);
355 let _: [u8; 3] = read_bytes(r)?;
358 offsets: start..r.stream_position()?,
374 fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord<Cow<'a, str>> {
375 let eye_catcher = decoder.decode(&self.eye_catcher);
376 let file_label = decoder.decode(&self.file_label);
377 let creation_date = decoder.decode(&self.creation_date);
378 let creation_time = decoder.decode(&self.creation_time);
381 weight_index: self.weight_index,
382 n_cases: self.n_cases,
384 offsets: self.offsets.clone(),
386 layout_code: self.layout_code,
387 nominal_case_size: self.nominal_case_size,
388 compression: self.compression,
398 encoding: &'static Encoding,
399 warn: Box<dyn Fn(Error)>,
403 fn warn(&self, error: Error) {
406 fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
407 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
409 self.warn(Error::MalformedString {
410 encoding: self.encoding.name().into(),
411 text: output.clone().into(),
417 fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
418 self.decode_slice(input.0.as_slice())
421 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
422 /// re-encoding the result back into `self.encoding` will have exactly the
423 /// same length in bytes.
425 /// XXX warn about errors?
426 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
427 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
428 // This is the common case. Usually there will be no errors.
431 // Unusual case. Don't bother to optimize it much.
432 let mut decoder = self.encoding.new_decoder_without_bom_handling();
433 let mut output = String::with_capacity(
435 .max_utf8_buffer_length_without_replacement(input.len())
438 let mut rest = input;
439 while !rest.is_empty() {
440 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
441 (DecoderResult::InputEmpty, _) => break,
442 (DecoderResult::OutputFull, _) => unreachable!(),
443 (DecoderResult::Malformed(a, b), consumed) => {
444 let skipped = a as usize + b as usize;
445 output.extend(repeat('?').take(skipped));
446 rest = &rest[consumed..];
450 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
456 impl<S> Header for HeaderRecord<S>
460 fn offsets(&self) -> Range<u64> {
465 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
467 /// Regular system file.
470 /// System file with Zlib-compressed data.
473 /// EBCDIC-encoded system file.
478 /// Magic number for a regular system file.
479 pub const SAV: [u8; 4] = *b"$FL2";
481 /// Magic number for a system file that contains zlib-compressed data.
482 pub const ZSAV: [u8; 4] = *b"$FL3";
484 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
486 pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
489 impl Debug for Magic {
490 fn fmt(&self, f: &mut Formatter) -> FmtResult {
491 let s = match *self {
492 Magic::Sav => "$FL2",
493 Magic::Zsav => "$FL3",
494 Magic::Ebcdic => "($FL2 in EBCDIC)",
500 impl TryFrom<[u8; 4]> for Magic {
503 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
505 Magic::SAV => Ok(Magic::Sav),
506 Magic::ZSAV => Ok(Magic::Zsav),
507 Magic::EBCDIC => Ok(Magic::Ebcdic),
508 _ => Err(Error::BadMagic(value)),
513 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
520 fn from_width(width: i32) -> VarType {
522 0 => VarType::Numeric,
523 _ => VarType::String,
527 fn opposite(self) -> VarType {
529 Self::Numeric => Self::String,
530 Self::String => Self::Numeric,
535 impl Display for VarType {
536 fn fmt(&self, f: &mut Formatter) -> FmtResult {
538 VarType::Numeric => write!(f, "numeric"),
539 VarType::String => write!(f, "string"),
544 #[derive(Copy, Clone)]
553 type RawValue = Value<RawStr<8>>;
555 impl<S> Debug for Value<S>
559 fn fmt(&self, f: &mut Formatter) -> FmtResult {
561 Value::Number(Some(number)) => write!(f, "{number:?}"),
562 Value::Number(None) => write!(f, "SYSMIS"),
563 Value::String(s) => write!(f, "{:?}", s),
569 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
571 &UntypedValue(read_bytes(r)?),
577 pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
579 VarType::String => Value::String(RawStr(raw.0)),
580 VarType::Numeric => {
581 let number: f64 = endian.parse(raw.0);
582 Value::Number((number != -f64::MAX).then_some(number))
587 fn read_case<R: Read + Seek>(
589 var_types: &[VarType],
591 ) -> Result<Option<Vec<Self>>, Error> {
592 let case_start = reader.stream_position()?;
593 let mut values = Vec::with_capacity(var_types.len());
594 for (i, &var_type) in var_types.iter().enumerate() {
595 let Some(raw) = try_read_bytes(reader)? else {
599 let offset = reader.stream_position()?;
600 return Err(Error::EofInCase {
602 case_ofs: offset - case_start,
603 case_len: var_types.len() * 8,
607 values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
612 fn read_compressed_case<R: Read + Seek>(
614 var_types: &[VarType],
615 codes: &mut VecDeque<u8>,
618 ) -> Result<Option<Vec<Self>>, Error> {
619 let case_start = reader.stream_position()?;
620 let mut values = Vec::with_capacity(var_types.len());
621 for (i, &var_type) in var_types.iter().enumerate() {
623 let Some(code) = codes.pop_front() else {
624 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
628 let offset = reader.stream_position()?;
629 return Err(Error::EofInCompressedCase {
631 case_ofs: offset - case_start,
635 codes.extend(new_codes.into_iter());
640 1..=251 => match var_type {
641 VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
643 break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
650 let offset = reader.stream_position()?;
651 return Err(Error::PartialCompressedCase {
653 case_ofs: offset - case_start,
658 break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
660 254 => match var_type {
661 VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
662 VarType::Numeric => {
663 return Err(Error::CompressedStringExpected {
665 case_ofs: reader.stream_position()? - case_start,
669 255 => match var_type {
670 VarType::Numeric => break Self::Number(None),
672 return Err(Error::CompressedNumberExpected {
674 case_ofs: reader.stream_position()? - case_start,
685 fn decode(&self, decoder: &Decoder) -> Value<String> {
687 Self::Number(x) => Value::Number(*x),
688 Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
693 struct ZlibDecodeMultiple<R>
697 reader: Option<ZlibDecoder<R>>,
700 impl<R> ZlibDecodeMultiple<R>
704 fn new(reader: R) -> ZlibDecodeMultiple<R> {
706 reader: Some(ZlibDecoder::new(reader)),
711 impl<R> Read for ZlibDecodeMultiple<R>
715 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
717 match self.reader.as_mut().unwrap().read(buf)? {
719 let inner = self.reader.take().unwrap().into_inner();
720 self.reader = Some(ZlibDecoder::new(inner));
728 impl<R> Seek for ZlibDecodeMultiple<R>
732 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
733 self.reader.as_mut().unwrap().get_mut().seek(pos)
742 ztrailer_offset: u64,
751 R: Read + Seek + 'static,
754 warn: Box<dyn Fn(Error)>,
756 header: HeaderRecord<RawString>,
757 var_types: Vec<VarType>,
764 R: Read + Seek + 'static,
766 pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
768 F: Fn(Error) + 'static,
770 let header = HeaderRecord::read(&mut reader)?;
772 reader: Some(reader),
773 warn: Box::new(warn),
775 var_types: Vec::new(),
776 state: ReaderState::Start,
779 fn cases(&mut self) -> Cases {
780 self.state = ReaderState::End;
782 self.reader.take().unwrap(),
783 take(&mut self.var_types),
789 impl<R> Iterator for Reader<R>
791 R: Read + Seek + 'static,
793 type Item = Result<Record, Error>;
795 fn next(&mut self) -> Option<Self::Item> {
797 ReaderState::Start => {
798 self.state = ReaderState::Headers;
799 Some(Ok(Record::Header(self.header.clone())))
801 ReaderState::Headers => {
804 self.reader.as_mut().unwrap(),
806 self.var_types.as_slice(),
809 Ok(Some(record)) => break record,
811 Err(error) => return Some(Err(error)),
815 Record::Variable(VariableRecord { width, .. }) => {
816 self.var_types.push(VarType::from_width(width));
818 Record::EndOfHeaders(_) => {
819 self.state = if let Some(Compression::ZLib) = self.header.compression {
820 ReaderState::ZlibHeader
829 ReaderState::ZlibHeader => {
830 let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
832 Ok(zheader) => zheader,
833 Err(error) => return Some(Err(error)),
835 self.state = ReaderState::ZlibTrailer {
836 ztrailer_offset: zheader.ztrailer_offset,
837 ztrailer_len: zheader.ztrailer_len,
839 Some(Ok(Record::ZHeader(zheader)))
841 ReaderState::ZlibTrailer {
845 match ZTrailer::read(
846 self.reader.as_mut().unwrap(),
851 Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
852 Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
853 Err(error) => Some(Err(error)),
856 ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
857 ReaderState::End => None,
862 trait ReadSeek: Read + Seek {}
863 impl<T> ReadSeek for T where T: Read + Seek {}
866 reader: Box<dyn ReadSeek>,
867 var_types: Vec<VarType>,
868 compression: Option<Compression>,
875 impl Debug for Cases {
876 fn fmt(&self, f: &mut Formatter) -> FmtResult {
882 fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
884 R: Read + Seek + 'static,
887 reader: if header.compression == Some(Compression::ZLib) {
888 Box::new(ZlibDecodeMultiple::new(reader))
893 compression: header.compression,
895 endian: header.endian,
896 codes: VecDeque::with_capacity(8),
902 impl Iterator for Cases {
903 type Item = Result<Vec<RawValue>, Error>;
905 fn next(&mut self) -> Option<Self::Item> {
910 let retval = if self.compression.is_some() {
911 Value::read_compressed_case(
920 Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
922 self.eof = matches!(retval, None | Some(Err(_)));
927 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
928 pub struct Spec(pub u32);
930 impl Debug for Spec {
931 fn fmt(&self, f: &mut Formatter) -> FmtResult {
932 let type_ = format_name(self.0 >> 16);
933 let w = (self.0 >> 8) & 0xff;
934 let d = self.0 & 0xff;
935 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
939 fn format_name(type_: u32) -> Cow<'static, str> {
978 _ => return format!("<unknown format {type_}>").into(),
984 pub struct MissingValues<S>
988 /// Individual missing values, up to 3 of them.
989 pub values: Vec<Value<S>>,
991 /// Optional range of missing values.
992 pub range: Option<(Value<S>, Value<S>)>,
995 impl<S> Debug for MissingValues<S>
999 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1000 for (i, value) in self.values.iter().enumerate() {
1004 write!(f, "{value:?}")?;
1007 if let Some((low, high)) = &self.range {
1008 if !self.values.is_empty() {
1011 write!(f, "{low:?} THRU {high:?}")?;
1014 if self.is_empty() {
1022 impl<S> MissingValues<S>
1026 fn is_empty(&self) -> bool {
1027 self.values.is_empty() && self.range.is_none()
1031 impl MissingValues<RawStr<8>> {
1032 fn read<R: Read + Seek>(
1038 ) -> Result<Self, Error> {
1039 let (n_values, has_range) = match (width, code) {
1040 (_, 0..=3) => (code, false),
1041 (0, -2) => (0, true),
1042 (0, -3) => (1, true),
1043 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
1044 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
1047 let var_type = VarType::from_width(width);
1049 let mut values = Vec::new();
1050 for _ in 0..n_values {
1051 values.push(RawValue::read(r, var_type, endian)?);
1053 let range = if has_range {
1054 let low = RawValue::read(r, var_type, endian)?;
1055 let high = RawValue::read(r, var_type, endian)?;
1060 Ok(Self { values, range })
1062 fn decode<'a>(&'a self, decoder: &Decoder) -> MissingValues<String> {
1067 .map(|value| value.decode(decoder))
1072 .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
1078 pub struct VariableRecord<S, V>
1083 /// Range of offsets in file.
1084 pub offsets: Range<u64>,
1086 /// Variable width, in the range -1..=255.
1089 /// Variable name, padded on the right with spaces.
1093 pub print_format: Spec,
1096 pub write_format: Spec,
1099 pub missing_values: MissingValues<V>,
1101 /// Optional variable label.
1102 pub label: Option<S>,
1105 impl<S, V> Debug for VariableRecord<S, V>
1110 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1115 match self.width.cmp(&0) {
1116 Ordering::Greater => "string",
1117 Ordering::Equal => "numeric",
1118 Ordering::Less => "long string continuation record",
1121 writeln!(f, "Print format: {:?}", self.print_format)?;
1122 writeln!(f, "Write format: {:?}", self.write_format)?;
1123 writeln!(f, "Name: {:?}", &self.name)?;
1124 writeln!(f, "Variable label: {:?}", self.label)?;
1125 writeln!(f, "Missing values: {:?}", self.missing_values)
1129 impl VariableRecord<RawString, RawStr<8>> {
1130 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1131 let start_offset = r.stream_position()?;
1132 let width: i32 = endian.parse(read_bytes(r)?);
1133 let code_offset = r.stream_position()?;
1134 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
1135 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
1136 let print_format = Spec(endian.parse(read_bytes(r)?));
1137 let write_format = Spec(endian.parse(read_bytes(r)?));
1138 let name = RawString(read_vec(r, 8)?);
1140 let label = match has_variable_label {
1143 let len: u32 = endian.parse(read_bytes(r)?);
1144 let read_len = len.min(65535) as usize;
1145 let label = RawString(read_vec(r, read_len)?);
1147 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
1148 let _ = read_vec(r, padding_bytes as usize)?;
1153 return Err(Error::BadVariableLabelCode {
1156 code: has_variable_label,
1161 let missing_values =
1162 MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
1164 let end_offset = r.stream_position()?;
1166 Ok(Record::Variable(VariableRecord {
1167 offsets: start_offset..end_offset,
1177 fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord<Cow<'a, str>, String> {
1179 offsets: self.offsets.clone(),
1181 name: decoder.decode(&self.name),
1182 print_format: self.print_format,
1183 write_format: self.write_format,
1184 missing_values: self.missing_values.decode(decoder),
1185 label: self.label.as_ref().map(|label| decoder.decode(label)),
1190 #[derive(Copy, Clone)]
1191 pub struct UntypedValue(pub [u8; 8]);
1193 impl Debug for UntypedValue {
1194 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1195 let little: f64 = Endian::Little.parse(self.0);
1196 let little = format!("{:?}", little);
1197 let big: f64 = Endian::Big.parse(self.0);
1198 let big = format!("{:?}", big);
1199 let number = if little.len() <= big.len() {
1204 write!(f, "{number}")?;
1206 let string = default_decode(&self.0);
1208 .split(|c: char| c == '\0' || c.is_control())
1211 write!(f, "{string:?}")?;
1217 pub struct RawString(pub Vec<u8>);
1219 impl From<Vec<u8>> for RawString {
1220 fn from(source: Vec<u8>) -> Self {
1225 impl From<&[u8]> for RawString {
1226 fn from(source: &[u8]) -> Self {
1231 impl Debug for RawString {
1232 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1233 write!(f, "{:?}", default_decode(self.0.as_slice()))
1237 #[derive(Copy, Clone)]
1238 pub struct RawStr<const N: usize>(pub [u8; N]);
1240 impl<const N: usize> From<[u8; N]> for RawStr<N> {
1241 fn from(source: [u8; N]) -> Self {
1246 impl<const N: usize> Debug for RawStr<N> {
1247 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1248 write!(f, "{:?}", default_decode(&self.0))
1252 #[derive(Clone, Debug)]
1253 pub struct ValueLabel<V, S>
1258 pub value: Value<V>,
1263 pub struct ValueLabelRecord<V, S>
1268 /// Range of offsets in file.
1269 pub offsets: Range<u64>,
1272 pub labels: Vec<ValueLabel<V, S>>,
1274 /// The 1-based indexes of the variable indexes.
1275 pub dict_indexes: Vec<u32>,
1277 /// The types of the variables.
1278 pub var_type: VarType,
1281 impl<V, S> Debug for ValueLabelRecord<V, S>
1286 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1287 writeln!(f, "labels: ")?;
1288 for label in self.labels.iter() {
1289 writeln!(f, "{label:?}")?;
1291 write!(f, "apply to {} variables", self.var_type)?;
1292 for dict_index in self.dict_indexes.iter() {
1293 write!(f, " #{dict_index}")?;
1299 impl<V, S> Header for ValueLabelRecord<V, S>
1304 fn offsets(&self) -> Range<u64> {
1305 self.offsets.clone()
1309 impl<V, S> ValueLabelRecord<V, S>
1314 /// Maximum number of value labels in a record.
1315 pub const MAX_LABELS: u32 = u32::MAX / 8;
1317 /// Maximum number of variable indexes in a record.
1318 pub const MAX_INDEXES: u32 = u32::MAX / 8;
1321 impl ValueLabelRecord<RawStr<8>, RawString> {
1322 fn read<R: Read + Seek>(
1325 var_types: &[VarType],
1326 warn: &Box<dyn Fn(Error)>,
1327 ) -> Result<Option<Record>, Error> {
1328 let label_offset = r.stream_position()?;
1329 let n: u32 = endian.parse(read_bytes(r)?);
1330 if n > Self::MAX_LABELS {
1331 return Err(Error::BadNumberOfValueLabels {
1332 offset: label_offset,
1334 max: Self::MAX_LABELS,
1338 let mut labels = Vec::new();
1340 let value = UntypedValue(read_bytes(r)?);
1341 let label_len: u8 = endian.parse(read_bytes(r)?);
1342 let label_len = label_len as usize;
1343 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1345 let mut label = read_vec(r, padded_len - 1)?;
1346 label.truncate(label_len);
1347 labels.push((value, RawString(label)));
1350 let index_offset = r.stream_position()?;
1351 let rec_type: u32 = endian.parse(read_bytes(r)?);
1353 return Err(Error::ExpectedVarIndexRecord {
1354 offset: index_offset,
1359 let n: u32 = endian.parse(read_bytes(r)?);
1360 if n > Self::MAX_INDEXES {
1361 return Err(Error::TooManyVarIndexes {
1362 offset: index_offset,
1364 max: Self::MAX_INDEXES,
1368 let index_offset = r.stream_position()?;
1369 let mut dict_indexes = Vec::with_capacity(n as usize);
1370 let mut invalid_indexes = Vec::new();
1372 let index: u32 = endian.parse(read_bytes(r)?);
1373 if index == 0 || index as usize > var_types.len() {
1374 dict_indexes.push(index);
1376 invalid_indexes.push(index);
1379 if !invalid_indexes.is_empty() {
1380 warn(Error::InvalidVarIndexes {
1381 offset: index_offset,
1382 max: var_types.len(),
1383 invalid: invalid_indexes,
1387 let Some(&first_index) = dict_indexes.first() else {
1388 warn(Error::NoVarIndexes {
1389 offset: index_offset,
1393 let var_type = var_types[first_index as usize - 1];
1394 let mut wrong_type_indexes = Vec::new();
1395 dict_indexes.retain(|&index| {
1396 if var_types[index as usize - 1] != var_type {
1397 wrong_type_indexes.push(index);
1403 if !wrong_type_indexes.is_empty() {
1404 warn(Error::MixedVarTypes {
1405 offset: index_offset,
1407 wrong_types: wrong_type_indexes,
1413 .map(|(value, label)| ValueLabel {
1414 value: Value::from_raw(&value, var_type, endian),
1419 let end_offset = r.stream_position()?;
1420 Ok(Some(Record::ValueLabel(ValueLabelRecord {
1421 offsets: label_offset..end_offset,
1429 #[derive(Clone, Debug)]
1430 pub struct DocumentRecord<S>
1434 pub offsets: Range<u64>,
1436 /// The document, as an array of 80-byte lines.
1440 pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;
1442 /// Length of a line in a document. Document lines are fixed-length and
1443 /// padded on the right with spaces.
1444 pub const DOC_LINE_LEN: usize = 80;
1446 impl DocumentRecord<RawDocumentLine> {
1447 /// Maximum number of lines we will accept in a document. This is simply
1448 /// the maximum number that will fit in a 32-bit space.
1449 pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
1451 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1452 let start_offset = r.stream_position()?;
1453 let n: u32 = endian.parse(read_bytes(r)?);
1455 if n > Self::MAX_LINES {
1456 Err(Error::BadDocumentLength {
1457 offset: start_offset,
1459 max: Self::MAX_LINES,
1462 let mut lines = Vec::with_capacity(n);
1464 lines.push(RawStr(read_bytes(r)?));
1466 let end_offset = r.stream_position()?;
1467 Ok(Record::Document(DocumentRecord {
1468 offsets: start_offset..end_offset,
1474 fn decode<'a>(&'a self, decoder: &Decoder) -> DocumentRecord<Cow<'a, str>> {
1476 offsets: self.offsets.clone(),
1480 .map(|s| decoder.decode_slice(&s.0))
1486 impl<S> Header for DocumentRecord<S>
1490 fn offsets(&self) -> Range<u64> {
1491 self.offsets.clone()
1495 trait ExtensionRecord {
1497 const SIZE: Option<u32>;
1498 const COUNT: Option<u32>;
1499 const NAME: &'static str;
1500 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error>;
1503 #[derive(Clone, Debug)]
1504 pub struct IntegerInfoRecord {
1505 pub offsets: Range<u64>,
1506 pub version: (i32, i32, i32),
1507 pub machine_code: i32,
1508 pub floating_point_rep: i32,
1509 pub compression_code: i32,
1510 pub endianness: i32,
1511 pub character_code: i32,
1514 impl ExtensionRecord for IntegerInfoRecord {
1515 const SUBTYPE: u32 = 3;
1516 const SIZE: Option<u32> = Some(4);
1517 const COUNT: Option<u32> = Some(8);
1518 const NAME: &'static str = "integer record";
1520 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1521 ext.check_size::<Self>()?;
1523 let mut input = &ext.data[..];
1524 let data: Vec<i32> = (0..8)
1525 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1527 Ok(Record::IntegerInfo(IntegerInfoRecord {
1528 offsets: ext.offsets.clone(),
1529 version: (data[0], data[1], data[2]),
1530 machine_code: data[3],
1531 floating_point_rep: data[4],
1532 compression_code: data[5],
1533 endianness: data[6],
1534 character_code: data[7],
1539 #[derive(Clone, Debug)]
1540 pub struct FloatInfoRecord {
1546 impl ExtensionRecord for FloatInfoRecord {
1547 const SUBTYPE: u32 = 4;
1548 const SIZE: Option<u32> = Some(8);
1549 const COUNT: Option<u32> = Some(3);
1550 const NAME: &'static str = "floating point record";
1552 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1553 ext.check_size::<Self>()?;
1555 let mut input = &ext.data[..];
1556 let data: Vec<f64> = (0..3)
1557 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1559 Ok(Record::FloatInfo(FloatInfoRecord {
1567 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1568 pub enum CategoryLabels {
1573 #[derive(Clone, Debug)]
1574 pub enum MultipleResponseType {
1577 labels: CategoryLabels,
1582 impl MultipleResponseType {
1583 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
1584 let (mr_type, input) = match input.split_first() {
1585 Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
1586 Some((b'D', input)) => {
1587 let (value, input) = parse_counted_string(input)?;
1589 MultipleResponseType::MultipleDichotomy {
1591 labels: CategoryLabels::VarLabels,
1596 Some((b'E', input)) => {
1597 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1598 (CategoryLabels::CountedValues, rest)
1599 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1600 (CategoryLabels::VarLabels, rest)
1602 return Err(Error::TBD);
1604 let (value, input) = parse_counted_string(input)?;
1606 MultipleResponseType::MultipleDichotomy { value, labels },
1610 _ => return Err(Error::TBD),
1612 Ok((mr_type, input))
1616 #[derive(Clone, Debug)]
1617 pub struct MultipleResponseSet<S>
1623 pub mr_type: MultipleResponseType,
1624 pub short_names: Vec<S>,
1627 impl MultipleResponseSet<RawString> {
1628 fn parse(input: &[u8]) -> Result<(Self, &[u8]), Error> {
1629 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1630 return Err(Error::TBD);
1632 let (name, input) = input.split_at(equals);
1633 let (mr_type, input) = MultipleResponseType::parse(input)?;
1634 let Some(input) = input.strip_prefix(b" ") else {
1635 return Err(Error::TBD);
1637 let (label, mut input) = parse_counted_string(input)?;
1638 let mut vars = Vec::new();
1639 while input.first() != Some(&b'\n') {
1640 match input.split_first() {
1641 Some((b' ', rest)) => {
1642 let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
1643 return Err(Error::TBD);
1645 let (var, rest) = rest.split_at(length);
1646 if !var.is_empty() {
1647 vars.push(var.into());
1651 _ => return Err(Error::TBD),
1654 while input.first() == Some(&b'\n') {
1655 input = &input[1..];
1658 MultipleResponseSet {
1668 fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseSet<Cow<'a, str>> {
1669 MultipleResponseSet {
1670 name: decoder.decode(&self.name),
1671 label: decoder.decode(&self.label),
1672 mr_type: self.mr_type.clone(),
1673 short_names: self.short_names.iter().map(|s| decoder.decode(s)).collect(),
1678 #[derive(Clone, Debug)]
1679 pub struct MultipleResponseRecord<S>(pub Vec<MultipleResponseSet<S>>)
1683 impl ExtensionRecord for MultipleResponseRecord<RawString> {
1684 const SUBTYPE: u32 = 7;
1685 const SIZE: Option<u32> = Some(1);
1686 const COUNT: Option<u32> = None;
1687 const NAME: &'static str = "multiple response set record";
1689 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
1690 ext.check_size::<Self>()?;
1692 let mut input = &ext.data[..];
1693 let mut sets = Vec::new();
1694 while !input.is_empty() {
1695 let (set, rest) = MultipleResponseSet::parse(input)?;
1699 Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
1703 impl MultipleResponseRecord<RawString> {
1704 fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseRecord<Cow<'a, str>> {
1705 MultipleResponseRecord(self.0.iter().map(|set| set.decode(decoder)).collect())
1709 fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Error> {
1710 let Some(space) = input.iter().position(|&b| b == b' ') else {
1711 return Err(Error::TBD);
1713 let Ok(length) = from_utf8(&input[..space]) else {
1714 return Err(Error::TBD);
1716 let Ok(length): Result<usize, _> = length.parse() else {
1717 return Err(Error::TBD);
1720 let input = &input[space + 1..];
1721 if input.len() < length {
1722 return Err(Error::TBD);
1725 let (string, rest) = input.split_at(length);
1726 Ok((string.into(), rest))
1729 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1737 fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
1740 1 => Ok(Some(Measure::Nominal)),
1741 2 => Ok(Some(Measure::Ordinal)),
1742 3 => Ok(Some(Measure::Scale)),
1743 _ => Err(Error::InvalidMeasurement(source)),
1748 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1749 pub enum Alignment {
1756 fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
1759 1 => Ok(Some(Alignment::Left)),
1760 2 => Ok(Some(Alignment::Right)),
1761 3 => Ok(Some(Alignment::Center)),
1762 _ => Err(Error::InvalidAlignment(source)),
1767 #[derive(Clone, Debug)]
1768 pub struct VarDisplay {
1769 pub measure: Option<Measure>,
1770 pub width: Option<u32>,
1771 pub alignment: Option<Alignment>,
1774 #[derive(Clone, Debug)]
1775 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
1777 impl VarDisplayRecord {
1778 const SUBTYPE: u32 = 11;
1784 warn: &Box<dyn Fn(Error)>,
1785 ) -> Result<Record, Error> {
1787 return Err(Error::BadRecordSize {
1788 offset: ext.offsets.start,
1789 record: String::from("variable display record"),
1795 let has_width = if ext.count as usize == 3 * n_vars {
1797 } else if ext.count as usize == 2 * n_vars {
1800 return Err(Error::TBD);
1803 let mut var_displays = Vec::new();
1804 let mut input = &ext.data[..];
1805 for _ in 0..n_vars {
1806 let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1807 .warn_on_error(&warn)
1809 let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
1810 let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1811 .warn_on_error(&warn)
1813 var_displays.push(VarDisplay {
1819 Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
1823 #[derive(Clone, Debug)]
1824 pub struct LongStringMissingValues<N, V>
1833 pub missing_values: MissingValues<V>,
1836 impl LongStringMissingValues<RawString, RawStr<8>> {
1837 fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValues<String, String> {
1838 LongStringMissingValues {
1839 var_name: decoder.decode(&self.var_name).to_string(),
1840 missing_values: self.missing_values.decode(decoder),
1845 #[derive(Clone, Debug)]
1846 pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
1851 impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
1852 const SUBTYPE: u32 = 22;
1853 const SIZE: Option<u32> = Some(1);
1854 const COUNT: Option<u32> = None;
1855 const NAME: &'static str = "long string missing values record";
1857 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1858 ext.check_size::<Self>()?;
1860 let mut input = &ext.data[..];
1861 let mut missing_value_set = Vec::new();
1862 while !input.is_empty() {
1863 let var_name = read_string(&mut input, endian)?;
1864 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1865 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1867 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
1868 return Err(Error::BadLongMissingValueLength {
1869 record_offset: ext.offsets.start,
1874 let mut values = Vec::new();
1875 for i in 0..n_missing_values {
1876 let value: [u8; 8] = read_bytes(&mut input)?;
1877 let numeric_value: u64 = endian.parse(value);
1878 let value = if i > 0 && numeric_value == 8 {
1879 // Tolerate files written by old, buggy versions of PSPP
1880 // where we believed that the value_length was repeated
1881 // before each missing value.
1882 read_bytes(&mut input)?
1886 values.push(Value::String(RawStr(value)));
1888 let missing_values = MissingValues {
1892 missing_value_set.push(LongStringMissingValues {
1897 Ok(Record::LongStringMissingValues(
1898 LongStringMissingValueRecord(missing_value_set),
1903 impl LongStringMissingValueRecord<RawString, RawStr<8>> {
1904 fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValueRecord<String, String> {
1905 LongStringMissingValueRecord(self.0.iter().map(|mv| mv.decode(decoder)).collect())
1909 #[derive(Clone, Debug)]
1910 pub struct EncodingRecord(pub String);
1912 impl ExtensionRecord for EncodingRecord {
1913 const SUBTYPE: u32 = 20;
1914 const SIZE: Option<u32> = Some(1);
1915 const COUNT: Option<u32> = None;
1916 const NAME: &'static str = "encoding record";
1918 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
1919 ext.check_size::<Self>()?;
1921 Ok(Record::Encoding(EncodingRecord(
1922 String::from_utf8(ext.data.clone()).map_err(|_| Error::BadEncodingName {
1923 offset: ext.offsets.start,
1929 #[derive(Copy, Clone, Debug)]
1930 pub struct NumberOfCasesRecord {
1931 /// Always observed as 1.
1934 /// Number of cases.
1938 impl ExtensionRecord for NumberOfCasesRecord {
1939 const SUBTYPE: u32 = 16;
1940 const SIZE: Option<u32> = Some(8);
1941 const COUNT: Option<u32> = Some(2);
1942 const NAME: &'static str = "extended number of cases record";
1944 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1945 ext.check_size::<Self>()?;
1947 let mut input = &ext.data[..];
1948 let one = endian.parse(read_bytes(&mut input)?);
1949 let n_cases = endian.parse(read_bytes(&mut input)?);
1951 Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
1955 #[derive(Clone, Debug)]
1956 pub struct TextRecord {
1957 pub offsets: Range<u64>,
1960 pub rec_type: TextRecordType,
1962 /// The text content of the record.
1963 pub text: RawString,
1966 #[derive(Clone, Copy, Debug)]
1967 pub enum TextRecordType {
1977 fn new(extension: Extension, rec_type: TextRecordType) -> Self {
1979 offsets: extension.offsets,
1981 text: extension.data.into(),
1984 fn decode<'a>(&self, decoder: &Decoder) -> Result<Option<Record>, Error> {
1985 match self.rec_type {
1986 TextRecordType::VariableSets => Ok(Some(Record::VariableSets(
1987 VariableSetRecord::decode(self, decoder),
1989 TextRecordType::ProductInfo => Ok(Some(Record::ProductInfo(
1990 ProductInfoRecord::decode(self, decoder),
1992 TextRecordType::LongNames => Ok(Some(Record::LongNames(LongNamesRecord::decode(
1995 TextRecordType::VeryLongStrings => Ok(Some(Record::VeryLongStrings(
1996 VeryLongStringsRecord::decode(self, decoder),
1998 TextRecordType::FileAttributes => {
1999 Ok(FileAttributeRecord::decode(self, decoder).map(|fa| Record::FileAttributes(fa)))
2001 TextRecordType::VariableAttributes => {
2002 Ok(Some(Record::VariableAttributes(
2003 VariableAttributeRecord::decode(self, decoder))))
2009 #[derive(Clone, Debug)]
2010 pub struct VeryLongString {
2011 pub short_name: String,
2015 impl VeryLongString {
2016 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
2017 let Some((short_name, length)) = input.split_once('=') else {
2018 return Err(Error::TBD);
2020 let length = length.parse().map_err(|_| Error::TBD)?;
2022 short_name: short_name.into(),
2028 #[derive(Clone, Debug)]
2029 pub struct Attribute {
2031 pub values: Vec<String>,
2035 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Error> {
2036 let Some((name, mut input)) = input.split_once('(') else {
2037 return Err(Error::TBD);
2039 let mut values = Vec::new();
2041 let Some((value, rest)) = input.split_once('\n') else {
2042 return Err(Error::TBD);
2044 if let Some(stripped) = value
2046 .and_then(|value| value.strip_suffix('\''))
2048 values.push(stripped.into());
2050 decoder.warn(Error::TBD);
2051 values.push(value.into());
2053 if let Some(rest) = rest.strip_prefix(')') {
2054 let attribute = Attribute {
2058 return Ok((attribute, rest));
2065 #[derive(Clone, Debug)]
2066 pub struct AttributeSet(pub Vec<Attribute>);
2072 sentinel: Option<char>,
2073 ) -> Result<(AttributeSet, &'a str), Error> {
2074 let mut attributes = Vec::new();
2076 match input.chars().next() {
2077 None => break input,
2078 c if c == sentinel => break &input[1..],
2080 let (attribute, rest) = Attribute::parse(decoder, input)?;
2081 attributes.push(attribute);
2086 Ok((AttributeSet(attributes), rest))
2090 #[derive(Clone, Debug)]
2091 pub struct FileAttributeRecord(AttributeSet);
2093 impl FileAttributeRecord {
2094 fn decode(source: &TextRecord, decoder: &Decoder) -> Option<Self> {
2095 let input = decoder.decode(&source.text);
2096 match AttributeSet::parse(decoder, &input, None).warn_on_error(&decoder.warn) {
2097 Some((set, rest)) => {
2098 if !rest.is_empty() {
2099 decoder.warn(Error::TBD);
2101 Some(FileAttributeRecord(set))
2108 #[derive(Clone, Debug)]
2109 pub struct VarAttributeSet {
2110 pub long_var_name: String,
2111 pub attributes: AttributeSet,
2114 impl VarAttributeSet {
2115 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Error> {
2116 let Some((long_var_name, rest)) = input.split_once(':') else {
2117 return Err(Error::TBD);
2119 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
2120 let var_attribute = VarAttributeSet {
2121 long_var_name: long_var_name.into(),
2124 Ok((var_attribute, rest))
2128 #[derive(Clone, Debug)]
2129 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
2131 impl VariableAttributeRecord {
2132 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2133 let decoded = decoder.decode(&source.text);
2134 let mut input = decoded.as_ref();
2135 let mut var_attribute_sets = Vec::new();
2136 while !input.is_empty() {
2137 let Some((var_attribute, rest)) =
2138 VarAttributeSet::parse(decoder, &input).warn_on_error(&decoder.warn)
2142 var_attribute_sets.push(var_attribute);
2143 input = rest.into();
2145 VariableAttributeRecord(var_attribute_sets)
2149 #[derive(Clone, Debug)]
2150 pub struct VeryLongStringsRecord(Vec<VeryLongString>);
2152 impl VeryLongStringsRecord {
2153 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2154 let input = decoder.decode(&source.text);
2155 let mut very_long_strings = Vec::new();
2158 .map(|s| s.trim_end_matches('\t'))
2159 .filter(|s| !s.is_empty())
2161 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&decoder.warn) {
2162 very_long_strings.push(vls)
2165 VeryLongStringsRecord(very_long_strings)
2169 #[derive(Clone, Debug)]
2170 pub struct LongName {
2171 pub short_name: String,
2172 pub long_name: String,
2175 #[derive(Clone, Debug)]
2176 pub struct LongNamesRecord(Vec<LongName>);
2178 impl LongNamesRecord {
2179 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2180 let input = decoder.decode(&source.text);
2181 let mut names = Vec::new();
2182 for pair in input.split('\t').filter(|s| !s.is_empty()) {
2183 if let Some((short_name, long_name)) = pair.split_once('=') {
2184 names.push(LongName {
2185 short_name: short_name.into(),
2186 long_name: long_name.into(),
2189 decoder.warn(Error::TBD)
2192 LongNamesRecord(names)
2196 #[derive(Clone, Debug)]
2197 pub struct ProductInfoRecord(pub String);
2199 impl ProductInfoRecord {
2200 const NAME: &'static str = "extra product info";
2201 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2202 Self(decoder.decode(&source.text).into())
2205 #[derive(Clone, Debug)]
2206 pub struct VariableSet {
2208 pub vars: Vec<String>,
2212 fn parse(input: &str) -> Result<Self, Error> {
2213 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
2214 let vars = input.split_ascii_whitespace().map(String::from).collect();
2222 #[derive(Clone, Debug)]
2223 pub struct VariableSetRecord {
2224 pub offsets: Range<u64>,
2225 pub sets: Vec<VariableSet>,
2228 impl VariableSetRecord {
2229 fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
2230 let mut sets = Vec::new();
2231 let input = decoder.decode(&source.text);
2232 for line in input.lines() {
2233 if let Some(set) = VariableSet::parse(line).warn_on_error(&decoder.warn) {
2238 offsets: source.offsets.clone(),
2244 trait WarnOnError<T> {
2245 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
2247 impl<T> WarnOnError<T> for Result<T, Error> {
2248 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
2250 Ok(result) => Some(result),
2259 #[derive(Clone, Debug)]
2260 pub struct Extension {
2261 pub offsets: Range<u64>,
2266 /// Size of each data element.
2269 /// Number of data elements.
2272 /// `size * count` bytes of data.
2277 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
2278 if let Some(expected_size) = E::SIZE {
2279 if self.size != expected_size {
2280 return Err(Error::BadRecordSize {
2281 offset: self.offsets.start,
2282 record: E::NAME.into(),
2288 if let Some(expected_count) = E::COUNT {
2289 if self.count != expected_count {
2290 return Err(Error::BadRecordCount {
2291 offset: self.offsets.start,
2292 record: E::NAME.into(),
2301 fn read<R: Read + Seek>(
2305 warn: &Box<dyn Fn(Error)>,
2306 ) -> Result<Option<Record>, Error> {
2307 let subtype = endian.parse(read_bytes(r)?);
2308 let header_offset = r.stream_position()?;
2309 let size: u32 = endian.parse(read_bytes(r)?);
2310 let count = endian.parse(read_bytes(r)?);
2311 let Some(product) = size.checked_mul(count) else {
2312 return Err(Error::ExtensionRecordTooLarge {
2313 offset: header_offset,
2319 let start_offset = r.stream_position()?;
2320 let data = read_vec(r, product as usize)?;
2321 let end_offset = start_offset + product as u64;
2322 let extension = Extension {
2323 offsets: start_offset..end_offset,
2329 let result = match subtype {
2330 IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
2331 FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
2332 VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
2333 MultipleResponseRecord::SUBTYPE | 19 => {
2334 MultipleResponseRecord::parse(&extension, endian)
2336 LongStringValueLabelRecord::SUBTYPE => {
2337 LongStringValueLabelRecord::parse(&extension, endian)
2339 EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
2340 NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
2341 5 => Ok(Record::Text(TextRecord::new(
2343 TextRecordType::VariableSets,
2345 10 => Ok(Record::Text(TextRecord::new(
2347 TextRecordType::ProductInfo,
2349 13 => Ok(Record::Text(TextRecord::new(
2351 TextRecordType::LongNames,
2353 14 => Ok(Record::Text(TextRecord::new(
2355 TextRecordType::VeryLongStrings,
2357 17 => Ok(Record::Text(TextRecord::new(
2359 TextRecordType::FileAttributes,
2361 18 => Ok(Record::Text(TextRecord::new(
2363 TextRecordType::VariableAttributes,
2365 _ => Ok(Record::OtherExtension(extension)),
2368 Ok(result) => Ok(Some(result)),
2377 #[derive(Clone, Debug)]
2378 pub struct ZHeader {
2379 /// File offset to the start of the record.
2382 /// File offset to the ZLIB data header.
2383 pub zheader_offset: u64,
2385 /// File offset to the ZLIB trailer.
2386 pub ztrailer_offset: u64,
2388 /// Length of the ZLIB trailer in bytes.
2389 pub ztrailer_len: u64,
2393 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
2394 let offset = r.stream_position()?;
2395 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
2396 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
2397 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
2408 #[derive(Clone, Debug)]
2409 pub struct ZTrailer {
2410 /// File offset to the start of the record.
2413 /// Compression bias as a negative integer, e.g. -100.
2416 /// Always observed as zero.
2419 /// Uncompressed size of each block, except possibly the last. Only
2420 /// `0x3ff000` has been observed so far.
2421 pub block_size: u32,
2423 /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
2424 pub blocks: Vec<ZBlock>,
2427 #[derive(Clone, Debug)]
2429 /// Offset of block of data if simple compression were used.
2430 pub uncompressed_ofs: u64,
2432 /// Actual offset within the file of the compressed data block.
2433 pub compressed_ofs: u64,
2435 /// The number of bytes in this data block after decompression. This is
2436 /// `block_size` in every data block but the last, which may be smaller.
2437 pub uncompressed_size: u32,
2439 /// The number of bytes in this data block, as stored compressed in this
2441 pub compressed_size: u32,
2445 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
2447 uncompressed_ofs: endian.parse(read_bytes(r)?),
2448 compressed_ofs: endian.parse(read_bytes(r)?),
2449 uncompressed_size: endian.parse(read_bytes(r)?),
2450 compressed_size: endian.parse(read_bytes(r)?),
/// Reads the ZLIB trailer located at `ztrailer_ofs`.
///
/// The reader's current position is recorded first and restored after the
/// block descriptors have been read.
///
/// NOTE(review): every `?` between the initial seek and the final seek returns
/// early without restoring the original position — confirm callers do not
/// rely on the position after an error.
2456 fn read<R: Read + Seek>(
2461 ) -> Result<Option<ZTrailer>, Error> {
// Remember where we are so the position can be restored afterwards.
2462 let start_offset = reader.stream_position()?;
// A failed seek is handled by this branch rather than propagated with `?`.
// NOTE(review): the branch body is not visible here; given the `Option`
// return type it presumably yields `Ok(None)` — confirm.
2463 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
// Fixed-size trailer header: four consecutive values read in this order.
2466 let int_bias = endian.parse(read_bytes(reader)?);
2467 let zero = endian.parse(read_bytes(reader)?);
2468 let block_size = endian.parse(read_bytes(reader)?);
2469 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
// The trailer is expected to be 24 bytes of header plus 24 bytes per block.
// NOTE(review): if `ztrailer_len` is an unsigned integer below 24 (it appears
// to come from the file's ZLIB header — confirm), this subtraction overflows:
// a panic in debug builds, wraparound in release. Consider `checked_sub`.
2470 let expected_n_blocks = (ztrailer_len - 24) / 24;
2471 if n_blocks as u64 != expected_n_blocks {
2472 return Err(Error::BadZlibTrailerNBlocks {
2473 offset: ztrailer_ofs,
// Read `n_blocks` descriptors, stopping at the first one that fails.
2479 let blocks = (0..n_blocks)
2480 .map(|_| ZBlock::read(reader, endian))
2481 .collect::<Result<Vec<_>, _>>()?;
// Put the reader back where it was on entry.
2482 reader.seek(SeekFrom::Start(start_offset))?;
2484 offset: ztrailer_ofs,
2493 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
2494 let mut buf = [0; N];
2495 let n = r.read(&mut buf)?;
2498 r.read_exact(&mut buf[n..])?;
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// Fails with the reader's error (for example `UnexpectedEof`) when fewer
/// than `N` bytes are available.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    match r.read_exact(&mut bytes) {
        Ok(()) => Ok(bytes),
        Err(error) => Err(error),
    }
}
/// Reads exactly `n` bytes from `r` into a newly allocated vector.
///
/// The vector grows as data actually arrives instead of being allocated at
/// its full size up front, so a bogus length taken from a corrupt or hostile
/// file cannot force a huge allocation before the shortfall is noticed.
///
/// # Errors
///
/// Returns any error reported by `r`, or an `UnexpectedEof` error if `r` ends
/// before `n` bytes have been read.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    // Cap the initial reservation; `read_to_end` grows the vector as needed.
    let mut vec = Vec::with_capacity(n.min(1 << 16));
    r.by_ref().take(n as u64).read_to_end(&mut vec)?;
    if vec.len() < n {
        return Err(IoError::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to fill whole buffer",
        ));
    }
    Ok(vec)
}
2518 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
2519 let length: u32 = endian.parse(read_bytes(r)?);
2520 Ok(read_vec(r, length as usize)?.into())
2523 #[derive(Clone, Debug)]
2524 pub struct LongStringValueLabels<S>
2531 /// `(value, label)` pairs, where each value is `width` bytes.
2532 pub labels: Vec<(S, S)>,
2535 #[derive(Clone, Debug)]
2536 pub struct LongStringValueLabelRecord<S>(pub Vec<LongStringValueLabels<S>>)
2540 impl ExtensionRecord for LongStringValueLabelRecord<RawString> {
2541 const SUBTYPE: u32 = 21;
2542 const SIZE: Option<u32> = Some(1);
2543 const COUNT: Option<u32> = None;
2544 const NAME: &'static str = "long string value labels record";
2546 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
2547 ext.check_size::<Self>()?;
2549 let mut input = &ext.data[..];
2550 let mut label_set = Vec::new();
2551 while !input.is_empty() {
2552 let var_name = read_string(&mut input, endian)?;
2553 let width: u32 = endian.parse(read_bytes(&mut input)?);
2554 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
2555 let mut labels = Vec::new();
2556 for _ in 0..n_labels {
2557 let value = read_string(&mut input, endian)?;
2558 let label = read_string(&mut input, endian)?;
2559 labels.push((value, label));
2561 label_set.push(LongStringValueLabels {
2567 Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(