2 endian::{Endian, Parse, ToBytes},
3 identifier::{Error as IdError, Identifier},
6 use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
7 use flate2::read::ZlibDecoder;
13 collections::VecDeque,
14 fmt::{Debug, Display, Formatter, Result as FmtResult},
15 io::{Error as IoError, Read, Seek, SeekFrom},
22 use thiserror::Error as ThisError;
24 #[derive(ThisError, Debug)]
26 #[error("Not an SPSS system file")]
29 #[error("Invalid magic number {0:?}")]
32 #[error("I/O error ({0})")]
35 #[error("Invalid SAV compression code {0}")]
36 InvalidSavCompression(u32),
38 #[error("Invalid ZSAV compression code {0}")]
39 InvalidZsavCompression(u32),
41 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
42 BadVariableWidth { offset: u64, width: i32 },
44 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
45 BadDocumentLength { offset: u64, n: usize, max: usize },
47 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
48 BadRecordType { offset: u64, rec_type: u32 },
50 #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
51 BadVariableLabelCode {
58 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
60 BadNumericMissingValueCode { offset: u64, code: i32 },
62 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
63 BadStringMissingValueCode { offset: u64, code: i32 },
65 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
66 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
68 #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
69 ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
71 #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
72 TooManyVarIndexes { offset: u64, n: u32, max: u32 },
74 #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
75 NoVarIndexes { offset: u64 },
77 #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
81 wrong_types: Vec<u32>,
84 #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
91 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
92 ExtensionRecordTooLarge {
99 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
107 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
109 EofInCompressedCase { offset: u64, case_ofs: u64 },
111 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
112 PartialCompressedCase { offset: u64, case_ofs: u64 },
114 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
115 CompressedNumberExpected { offset: u64, case_ofs: u64 },
117 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
118 CompressedStringExpected { offset: u64, case_ofs: u64 },
120 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
121 BadZlibTrailerNBlocks {
124 expected_n_blocks: u64,
128 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
136 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
144 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
145 BadLongMissingValueLength {
151 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
152 BadEncodingName { offset: u64 },
154 // XXX This is risky because `text` might be arbitrarily long.
155 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
156 MalformedString { encoding: String, text: String },
158 #[error("Invalid variable measurement level value {0}")]
159 InvalidMeasurement(u32),
161 #[error("Invalid variable display alignment value {0}")]
162 InvalidAlignment(u32),
164 #[error("Invalid attribute name. {0}")]
165 InvalidAttributeName(IdError),
167 #[error("Invalid variable name in attribute record. {0}")]
168 InvalidAttributeVariableName(IdError),
170 #[error("Invalid short name in long variable name record. {0}")]
171 InvalidShortName(IdError),
173 #[error("Invalid name in long variable name record. {0}")]
174 InvalidLongName(IdError),
176 #[error("Invalid variable name in very long string record. {0}")]
177 InvalidLongStringName(IdError),
179 #[error("Invalid variable name in variable set record. {0}")]
180 InvalidVariableSetName(IdError),
182 #[error("Details TBD")]
186 #[derive(Clone, Debug)]
188 Header(HeaderRecord<RawString>),
189 Variable(VariableRecord<RawString, RawStr<8>>),
190 ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
191 Document(DocumentRecord<RawDocumentLine>),
192 IntegerInfo(IntegerInfoRecord),
193 FloatInfo(FloatInfoRecord),
194 VariableSets(VariableSetRecord),
195 VarDisplay(VarDisplayRecord),
196 MultipleResponse(MultipleResponseRecord<RawString>),
197 LongStringValueLabels(LongStringValueLabelRecord<RawString>),
198 LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
199 Encoding(EncodingRecord),
200 NumberOfCases(NumberOfCasesRecord),
201 ProductInfo(ProductInfoRecord),
202 LongNames(LongNamesRecord),
203 VeryLongStrings(VeryLongStringsRecord),
204 FileAttributes(FileAttributeRecord),
205 VariableAttributes(VariableAttributeRecord),
207 OtherExtension(Extension),
211 Cases(Rc<RefCell<Cases>>),
218 var_types: &[VarType],
219 warn: &Box<dyn Fn(Error)>,
220 ) -> Result<Option<Record>, Error>
224 let rec_type: u32 = endian.parse(read_bytes(reader)?);
226 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
227 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
228 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
229 7 => Extension::read(reader, endian, var_types.len(), warn),
230 999 => Ok(Some(Record::EndOfHeaders(
231 endian.parse(read_bytes(reader)?),
233 _ => Err(Error::BadRecordType {
234 offset: reader.stream_position()?,
241 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
242 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
243 fn default_decode(s: &[u8]) -> Cow<str> {
244 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
247 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
248 pub enum Compression {
254 fn offsets(&self) -> Range<u64>;
258 pub struct HeaderRecord<S>
263 pub offsets: Range<u64>,
268 /// Eye-catcher string, product name, in the file's encoding. Padded
269 /// on the right with spaces.
272 /// Layout code, normally either 2 or 3.
273 pub layout_code: u32,
275 /// Number of variable positions, or `None` if the value in the file is
276 /// questionably trustworthy.
277 pub nominal_case_size: Option<u32>,
279 /// Compression type, if any.
280 pub compression: Option<Compression>,
282 /// 1-based variable index of the weight variable, or `None` if the file is
284 pub weight_index: Option<u32>,
286 /// Claimed number of cases, if known.
287 pub n_cases: Option<u32>,
289 /// Compression bias, usually 100.0.
292 /// `dd mmm yy` in the file's encoding.
293 pub creation_date: S,
295 /// `HH:MM:SS` in the file's encoding.
296 pub creation_time: S,
298 /// File label, in the file's encoding. Padded on the right with spaces.
301 /// Endianness of the data in the file header.
305 impl<S> HeaderRecord<S>
309 fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
313 writeln!(f, "{name:>17}: {:?}", value)
317 impl<S> Debug for HeaderRecord<S>
321 fn fmt(&self, f: &mut Formatter) -> FmtResult {
322 writeln!(f, "File header record:")?;
323 self.debug_field(f, "Magic", self.magic)?;
324 self.debug_field(f, "Product name", &self.eye_catcher)?;
325 self.debug_field(f, "Layout code", self.layout_code)?;
326 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
327 self.debug_field(f, "Compression", self.compression)?;
328 self.debug_field(f, "Weight index", self.weight_index)?;
329 self.debug_field(f, "Number of cases", self.n_cases)?;
330 self.debug_field(f, "Compression bias", self.bias)?;
331 self.debug_field(f, "Creation date", &self.creation_date)?;
332 self.debug_field(f, "Creation time", &self.creation_time)?;
333 self.debug_field(f, "File label", &self.file_label)?;
334 self.debug_field(f, "Endianness", self.endian)
338 impl HeaderRecord<RawString> {
339 fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
340 let start = r.stream_position()?;
342 let magic: [u8; 4] = read_bytes(r)?;
343 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
345 let eye_catcher = RawString(read_vec(r, 60)?);
346 let layout_code: [u8; 4] = read_bytes(r)?;
347 let endian = Endian::identify_u32(2, layout_code)
348 .or_else(|| Endian::identify_u32(2, layout_code))
349 .ok_or_else(|| Error::NotASystemFile)?;
350 let layout_code = endian.parse(layout_code);
352 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
353 let nominal_case_size =
354 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
356 let compression_code: u32 = endian.parse(read_bytes(r)?);
357 let compression = match (magic, compression_code) {
358 (Magic::Zsav, 2) => Some(Compression::ZLib),
359 (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
361 (_, 1) => Some(Compression::Simple),
362 (_, code) => return Err(Error::InvalidSavCompression(code)),
365 let weight_index: u32 = endian.parse(read_bytes(r)?);
366 let weight_index = (weight_index > 0).then_some(weight_index);
368 let n_cases: u32 = endian.parse(read_bytes(r)?);
369 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
371 let bias: f64 = endian.parse(read_bytes(r)?);
373 let creation_date = RawString(read_vec(r, 9)?);
374 let creation_time = RawString(read_vec(r, 8)?);
375 let file_label = RawString(read_vec(r, 64)?);
376 let _: [u8; 3] = read_bytes(r)?;
379 offsets: start..r.stream_position()?,
395 fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord<Cow<'a, str>> {
396 let eye_catcher = decoder.decode(&self.eye_catcher);
397 let file_label = decoder.decode(&self.file_label);
398 let creation_date = decoder.decode(&self.creation_date);
399 let creation_time = decoder.decode(&self.creation_time);
402 weight_index: self.weight_index,
403 n_cases: self.n_cases,
405 offsets: self.offsets.clone(),
407 layout_code: self.layout_code,
408 nominal_case_size: self.nominal_case_size,
409 compression: self.compression,
419 encoding: &'static Encoding,
420 warn: Box<dyn Fn(Error)>,
424 fn warn(&self, error: Error) {
427 fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
428 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
430 self.warn(Error::MalformedString {
431 encoding: self.encoding.name().into(),
432 text: output.clone().into(),
438 fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
439 self.decode_slice(input.0.as_slice())
442 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
443 /// re-encoding the result back into `self.encoding` will have exactly the
444 /// same length in bytes.
446 /// XXX warn about errors?
447 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
448 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
449 // This is the common case. Usually there will be no errors.
452 // Unusual case. Don't bother to optimize it much.
453 let mut decoder = self.encoding.new_decoder_without_bom_handling();
454 let mut output = String::with_capacity(
456 .max_utf8_buffer_length_without_replacement(input.len())
459 let mut rest = input;
460 while !rest.is_empty() {
461 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
462 (DecoderResult::InputEmpty, _) => break,
463 (DecoderResult::OutputFull, _) => unreachable!(),
464 (DecoderResult::Malformed(a, b), consumed) => {
465 let skipped = a as usize + b as usize;
466 output.extend(repeat('?').take(skipped));
467 rest = &rest[consumed..];
471 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
476 pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
477 self.new_identifier(&self.decode(input))
480 pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
481 Identifier::new(name, self.encoding)
485 impl<S> Header for HeaderRecord<S>
489 fn offsets(&self) -> Range<u64> {
494 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
496 /// Regular system file.
499 /// System file with Zlib-compressed data.
502 /// EBCDIC-encoded system file.
507 /// Magic number for a regular system file.
508 pub const SAV: [u8; 4] = *b"$FL2";
510 /// Magic number for a system file that contains zlib-compressed data.
511 pub const ZSAV: [u8; 4] = *b"$FL3";
513 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
515 pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
518 impl Debug for Magic {
519 fn fmt(&self, f: &mut Formatter) -> FmtResult {
520 let s = match *self {
521 Magic::Sav => "$FL2",
522 Magic::Zsav => "$FL3",
523 Magic::Ebcdic => "($FL2 in EBCDIC)",
529 impl TryFrom<[u8; 4]> for Magic {
532 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
534 Magic::SAV => Ok(Magic::Sav),
535 Magic::ZSAV => Ok(Magic::Zsav),
536 Magic::EBCDIC => Ok(Magic::Ebcdic),
537 _ => Err(Error::BadMagic(value)),
542 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
549 fn from_width(width: i32) -> VarType {
551 0 => VarType::Numeric,
552 _ => VarType::String,
556 fn opposite(self) -> VarType {
558 Self::Numeric => Self::String,
559 Self::String => Self::Numeric,
564 impl Display for VarType {
565 fn fmt(&self, f: &mut Formatter) -> FmtResult {
567 VarType::Numeric => write!(f, "numeric"),
568 VarType::String => write!(f, "string"),
573 #[derive(Copy, Clone)]
582 type RawValue = Value<RawStr<8>>;
584 impl<S> Debug for Value<S>
588 fn fmt(&self, f: &mut Formatter) -> FmtResult {
590 Value::Number(Some(number)) => write!(f, "{number:?}"),
591 Value::Number(None) => write!(f, "SYSMIS"),
592 Value::String(s) => write!(f, "{:?}", s),
598 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
600 &UntypedValue(read_bytes(r)?),
606 pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
608 VarType::String => Value::String(RawStr(raw.0)),
609 VarType::Numeric => {
610 let number: f64 = endian.parse(raw.0);
611 Value::Number((number != -f64::MAX).then_some(number))
616 fn read_case<R: Read + Seek>(
618 var_types: &[VarType],
620 ) -> Result<Option<Vec<Self>>, Error> {
621 let case_start = reader.stream_position()?;
622 let mut values = Vec::with_capacity(var_types.len());
623 for (i, &var_type) in var_types.iter().enumerate() {
624 let Some(raw) = try_read_bytes(reader)? else {
628 let offset = reader.stream_position()?;
629 return Err(Error::EofInCase {
631 case_ofs: offset - case_start,
632 case_len: var_types.len() * 8,
636 values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
641 fn read_compressed_case<R: Read + Seek>(
643 var_types: &[VarType],
644 codes: &mut VecDeque<u8>,
647 ) -> Result<Option<Vec<Self>>, Error> {
648 let case_start = reader.stream_position()?;
649 let mut values = Vec::with_capacity(var_types.len());
650 for (i, &var_type) in var_types.iter().enumerate() {
652 let Some(code) = codes.pop_front() else {
653 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
657 let offset = reader.stream_position()?;
658 return Err(Error::EofInCompressedCase {
660 case_ofs: offset - case_start,
664 codes.extend(new_codes.into_iter());
669 1..=251 => match var_type {
670 VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
672 break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
679 let offset = reader.stream_position()?;
680 return Err(Error::PartialCompressedCase {
682 case_ofs: offset - case_start,
687 break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
689 254 => match var_type {
690 VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
691 VarType::Numeric => {
692 return Err(Error::CompressedStringExpected {
694 case_ofs: reader.stream_position()? - case_start,
698 255 => match var_type {
699 VarType::Numeric => break Self::Number(None),
701 return Err(Error::CompressedNumberExpected {
703 case_ofs: reader.stream_position()? - case_start,
714 fn decode(&self, decoder: &Decoder) -> Value<String> {
716 Self::Number(x) => Value::Number(*x),
717 Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
722 struct ZlibDecodeMultiple<R>
726 reader: Option<ZlibDecoder<R>>,
729 impl<R> ZlibDecodeMultiple<R>
733 fn new(reader: R) -> ZlibDecodeMultiple<R> {
735 reader: Some(ZlibDecoder::new(reader)),
740 impl<R> Read for ZlibDecodeMultiple<R>
744 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
746 match self.reader.as_mut().unwrap().read(buf)? {
748 let inner = self.reader.take().unwrap().into_inner();
749 self.reader = Some(ZlibDecoder::new(inner));
757 impl<R> Seek for ZlibDecodeMultiple<R>
761 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
762 self.reader.as_mut().unwrap().get_mut().seek(pos)
771 ztrailer_offset: u64,
780 R: Read + Seek + 'static,
783 warn: Box<dyn Fn(Error)>,
785 header: HeaderRecord<RawString>,
786 var_types: Vec<VarType>,
793 R: Read + Seek + 'static,
795 pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
797 F: Fn(Error) + 'static,
799 let header = HeaderRecord::read(&mut reader)?;
801 reader: Some(reader),
802 warn: Box::new(warn),
804 var_types: Vec::new(),
805 state: ReaderState::Start,
808 fn cases(&mut self) -> Cases {
809 self.state = ReaderState::End;
811 self.reader.take().unwrap(),
812 take(&mut self.var_types),
818 impl<R> Iterator for Reader<R>
820 R: Read + Seek + 'static,
822 type Item = Result<Record, Error>;
824 fn next(&mut self) -> Option<Self::Item> {
826 ReaderState::Start => {
827 self.state = ReaderState::Headers;
828 Some(Ok(Record::Header(self.header.clone())))
830 ReaderState::Headers => {
833 self.reader.as_mut().unwrap(),
835 self.var_types.as_slice(),
838 Ok(Some(record)) => break record,
840 Err(error) => return Some(Err(error)),
844 Record::Variable(VariableRecord { width, .. }) => {
845 self.var_types.push(VarType::from_width(width));
847 Record::EndOfHeaders(_) => {
848 self.state = if let Some(Compression::ZLib) = self.header.compression {
849 ReaderState::ZlibHeader
858 ReaderState::ZlibHeader => {
859 let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
861 Ok(zheader) => zheader,
862 Err(error) => return Some(Err(error)),
864 self.state = ReaderState::ZlibTrailer {
865 ztrailer_offset: zheader.ztrailer_offset,
866 ztrailer_len: zheader.ztrailer_len,
868 Some(Ok(Record::ZHeader(zheader)))
870 ReaderState::ZlibTrailer {
874 match ZTrailer::read(
875 self.reader.as_mut().unwrap(),
880 Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
881 Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
882 Err(error) => Some(Err(error)),
885 ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
886 ReaderState::End => None,
891 trait ReadSeek: Read + Seek {}
892 impl<T> ReadSeek for T where T: Read + Seek {}
895 reader: Box<dyn ReadSeek>,
896 var_types: Vec<VarType>,
897 compression: Option<Compression>,
904 impl Debug for Cases {
905 fn fmt(&self, f: &mut Formatter) -> FmtResult {
911 fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
913 R: Read + Seek + 'static,
916 reader: if header.compression == Some(Compression::ZLib) {
917 Box::new(ZlibDecodeMultiple::new(reader))
922 compression: header.compression,
924 endian: header.endian,
925 codes: VecDeque::with_capacity(8),
931 impl Iterator for Cases {
932 type Item = Result<Vec<RawValue>, Error>;
934 fn next(&mut self) -> Option<Self::Item> {
939 let retval = if self.compression.is_some() {
940 Value::read_compressed_case(
949 Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
951 self.eof = matches!(retval, None | Some(Err(_)));
956 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
957 pub struct Spec(pub u32);
959 impl Debug for Spec {
960 fn fmt(&self, f: &mut Formatter) -> FmtResult {
961 let type_ = format_name(self.0 >> 16);
962 let w = (self.0 >> 8) & 0xff;
963 let d = self.0 & 0xff;
964 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
968 fn format_name(type_: u32) -> Cow<'static, str> {
1007 _ => return format!("<unknown format {type_}>").into(),
1013 pub struct MissingValues<S>
1017 /// Individual missing values, up to 3 of them.
1018 pub values: Vec<Value<S>>,
1020 /// Optional range of missing values.
1021 pub range: Option<(Value<S>, Value<S>)>,
1024 impl<S> Debug for MissingValues<S>
1028 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1029 for (i, value) in self.values.iter().enumerate() {
1033 write!(f, "{value:?}")?;
1036 if let Some((low, high)) = &self.range {
1037 if !self.values.is_empty() {
1040 write!(f, "{low:?} THRU {high:?}")?;
1043 if self.is_empty() {
1051 impl<S> MissingValues<S>
1055 fn is_empty(&self) -> bool {
1056 self.values.is_empty() && self.range.is_none()
1060 impl MissingValues<RawStr<8>> {
1061 fn read<R: Read + Seek>(
1067 ) -> Result<Self, Error> {
1068 let (n_values, has_range) = match (width, code) {
1069 (_, 0..=3) => (code, false),
1070 (0, -2) => (0, true),
1071 (0, -3) => (1, true),
1072 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
1073 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
1076 let var_type = VarType::from_width(width);
1078 let mut values = Vec::new();
1079 for _ in 0..n_values {
1080 values.push(RawValue::read(r, var_type, endian)?);
1082 let range = if has_range {
1083 let low = RawValue::read(r, var_type, endian)?;
1084 let high = RawValue::read(r, var_type, endian)?;
1089 Ok(Self { values, range })
1091 fn decode<'a>(&'a self, decoder: &Decoder) -> MissingValues<String> {
1096 .map(|value| value.decode(decoder))
1101 .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
1107 pub struct VariableRecord<S, V>
1112 /// Range of offsets in file.
1113 pub offsets: Range<u64>,
1115 /// Variable width, in the range -1..=255.
1118 /// Variable name, padded on the right with spaces.
1122 pub print_format: Spec,
1125 pub write_format: Spec,
1128 pub missing_values: MissingValues<V>,
1130 /// Optional variable label.
1131 pub label: Option<S>,
1134 impl<S, V> Debug for VariableRecord<S, V>
1139 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1144 match self.width.cmp(&0) {
1145 Ordering::Greater => "string",
1146 Ordering::Equal => "numeric",
1147 Ordering::Less => "long string continuation record",
1150 writeln!(f, "Print format: {:?}", self.print_format)?;
1151 writeln!(f, "Write format: {:?}", self.write_format)?;
1152 writeln!(f, "Name: {:?}", &self.name)?;
1153 writeln!(f, "Variable label: {:?}", self.label)?;
1154 writeln!(f, "Missing values: {:?}", self.missing_values)
1158 impl VariableRecord<RawString, RawStr<8>> {
1159 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1160 let start_offset = r.stream_position()?;
1161 let width: i32 = endian.parse(read_bytes(r)?);
1162 let code_offset = r.stream_position()?;
1163 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
1164 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
1165 let print_format = Spec(endian.parse(read_bytes(r)?));
1166 let write_format = Spec(endian.parse(read_bytes(r)?));
1167 let name = RawString(read_vec(r, 8)?);
1169 let label = match has_variable_label {
1172 let len: u32 = endian.parse(read_bytes(r)?);
1173 let read_len = len.min(65535) as usize;
1174 let label = RawString(read_vec(r, read_len)?);
1176 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
1177 let _ = read_vec(r, padding_bytes as usize)?;
1182 return Err(Error::BadVariableLabelCode {
1185 code: has_variable_label,
1190 let missing_values =
1191 MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
1193 let end_offset = r.stream_position()?;
1195 Ok(Record::Variable(VariableRecord {
1196 offsets: start_offset..end_offset,
1206 fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord<Cow<'a, str>, String> {
1208 offsets: self.offsets.clone(),
1210 name: decoder.decode(&self.name),
1211 print_format: self.print_format,
1212 write_format: self.write_format,
1213 missing_values: self.missing_values.decode(decoder),
1214 label: self.label.as_ref().map(|label| decoder.decode(label)),
1219 #[derive(Copy, Clone)]
1220 pub struct UntypedValue(pub [u8; 8]);
1222 impl Debug for UntypedValue {
1223 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1224 let little: f64 = Endian::Little.parse(self.0);
1225 let little = format!("{:?}", little);
1226 let big: f64 = Endian::Big.parse(self.0);
1227 let big = format!("{:?}", big);
1228 let number = if little.len() <= big.len() {
1233 write!(f, "{number}")?;
1235 let string = default_decode(&self.0);
1237 .split(|c: char| c == '\0' || c.is_control())
1240 write!(f, "{string:?}")?;
1246 pub struct RawString(pub Vec<u8>);
1248 impl From<Vec<u8>> for RawString {
1249 fn from(source: Vec<u8>) -> Self {
1254 impl From<&[u8]> for RawString {
1255 fn from(source: &[u8]) -> Self {
1260 impl Debug for RawString {
1261 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1262 write!(f, "{:?}", default_decode(self.0.as_slice()))
1266 #[derive(Copy, Clone)]
1267 pub struct RawStr<const N: usize>(pub [u8; N]);
1269 impl<const N: usize> From<[u8; N]> for RawStr<N> {
1270 fn from(source: [u8; N]) -> Self {
1275 impl<const N: usize> Debug for RawStr<N> {
1276 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1277 write!(f, "{:?}", default_decode(&self.0))
1281 #[derive(Clone, Debug)]
1282 pub struct ValueLabel<V, S>
1287 pub value: Value<V>,
1292 pub struct ValueLabelRecord<V, S>
1297 /// Range of offsets in file.
1298 pub offsets: Range<u64>,
1301 pub labels: Vec<ValueLabel<V, S>>,
1303 /// The 1-based indexes of the variables that the labels apply to.
1304 pub dict_indexes: Vec<u32>,
1306 /// The types of the variables.
1307 pub var_type: VarType,
1310 impl<V, S> Debug for ValueLabelRecord<V, S>
1315 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1316 writeln!(f, "labels: ")?;
1317 for label in self.labels.iter() {
1318 writeln!(f, "{label:?}")?;
1320 write!(f, "apply to {} variables", self.var_type)?;
1321 for dict_index in self.dict_indexes.iter() {
1322 write!(f, " #{dict_index}")?;
1328 impl<V, S> Header for ValueLabelRecord<V, S>
1333 fn offsets(&self) -> Range<u64> {
1334 self.offsets.clone()
1338 impl<V, S> ValueLabelRecord<V, S>
1343 /// Maximum number of value labels in a record.
1344 pub const MAX_LABELS: u32 = u32::MAX / 8;
1346 /// Maximum number of variable indexes in a record.
1347 pub const MAX_INDEXES: u32 = u32::MAX / 8;
1350 impl ValueLabelRecord<RawStr<8>, RawString> {
1351 fn read<R: Read + Seek>(
1354 var_types: &[VarType],
1355 warn: &Box<dyn Fn(Error)>,
1356 ) -> Result<Option<Record>, Error> {
1357 let label_offset = r.stream_position()?;
1358 let n: u32 = endian.parse(read_bytes(r)?);
1359 if n > Self::MAX_LABELS {
1360 return Err(Error::BadNumberOfValueLabels {
1361 offset: label_offset,
1363 max: Self::MAX_LABELS,
1367 let mut labels = Vec::new();
1369 let value = UntypedValue(read_bytes(r)?);
1370 let label_len: u8 = endian.parse(read_bytes(r)?);
1371 let label_len = label_len as usize;
1372 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1374 let mut label = read_vec(r, padded_len - 1)?;
1375 label.truncate(label_len);
1376 labels.push((value, RawString(label)));
1379 let index_offset = r.stream_position()?;
1380 let rec_type: u32 = endian.parse(read_bytes(r)?);
1382 return Err(Error::ExpectedVarIndexRecord {
1383 offset: index_offset,
1388 let n: u32 = endian.parse(read_bytes(r)?);
1389 if n > Self::MAX_INDEXES {
1390 return Err(Error::TooManyVarIndexes {
1391 offset: index_offset,
1393 max: Self::MAX_INDEXES,
1397 let index_offset = r.stream_position()?;
1398 let mut dict_indexes = Vec::with_capacity(n as usize);
1399 let mut invalid_indexes = Vec::new();
1401 let index: u32 = endian.parse(read_bytes(r)?);
1402 if index == 0 || index as usize > var_types.len() {
1403 dict_indexes.push(index);
1405 invalid_indexes.push(index);
1408 if !invalid_indexes.is_empty() {
1409 warn(Error::InvalidVarIndexes {
1410 offset: index_offset,
1411 max: var_types.len(),
1412 invalid: invalid_indexes,
1416 let Some(&first_index) = dict_indexes.first() else {
1417 warn(Error::NoVarIndexes {
1418 offset: index_offset,
1422 let var_type = var_types[first_index as usize - 1];
1423 let mut wrong_type_indexes = Vec::new();
1424 dict_indexes.retain(|&index| {
1425 if var_types[index as usize - 1] != var_type {
1426 wrong_type_indexes.push(index);
1432 if !wrong_type_indexes.is_empty() {
1433 warn(Error::MixedVarTypes {
1434 offset: index_offset,
1436 wrong_types: wrong_type_indexes,
1442 .map(|(value, label)| ValueLabel {
1443 value: Value::from_raw(&value, var_type, endian),
1448 let end_offset = r.stream_position()?;
1449 Ok(Some(Record::ValueLabel(ValueLabelRecord {
1450 offsets: label_offset..end_offset,
1458 #[derive(Clone, Debug)]
1459 pub struct DocumentRecord<S>
1463 pub offsets: Range<u64>,
1465 /// The document, as an array of 80-byte lines.
1469 pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;
1471 /// Length of a line in a document. Document lines are fixed-length and
1472 /// padded on the right with spaces.
1473 pub const DOC_LINE_LEN: usize = 80;
1475 impl DocumentRecord<RawDocumentLine> {
1476 /// Maximum number of lines we will accept in a document. This is simply
1477 /// the maximum number that will fit in a 32-bit space.
1478 pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
1480 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1481 let start_offset = r.stream_position()?;
1482 let n: u32 = endian.parse(read_bytes(r)?);
1484 if n > Self::MAX_LINES {
1485 Err(Error::BadDocumentLength {
1486 offset: start_offset,
1488 max: Self::MAX_LINES,
1491 let mut lines = Vec::with_capacity(n);
1493 lines.push(RawStr(read_bytes(r)?));
1495 let end_offset = r.stream_position()?;
1496 Ok(Record::Document(DocumentRecord {
1497 offsets: start_offset..end_offset,
1503 fn decode<'a>(&'a self, decoder: &Decoder) -> DocumentRecord<Cow<'a, str>> {
1505 offsets: self.offsets.clone(),
1509 .map(|s| decoder.decode_slice(&s.0))
1515 impl<S> Header for DocumentRecord<S>
1519 fn offsets(&self) -> Range<u64> {
1520 self.offsets.clone()
1524 trait ExtensionRecord {
1526 const SIZE: Option<u32>;
1527 const COUNT: Option<u32>;
1528 const NAME: &'static str;
1529 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error>;
1532 #[derive(Clone, Debug)]
1533 pub struct IntegerInfoRecord {
1534 pub offsets: Range<u64>,
1535 pub version: (i32, i32, i32),
1536 pub machine_code: i32,
1537 pub floating_point_rep: i32,
1538 pub compression_code: i32,
1539 pub endianness: i32,
1540 pub character_code: i32,
1543 impl ExtensionRecord for IntegerInfoRecord {
1544 const SUBTYPE: u32 = 3;
1545 const SIZE: Option<u32> = Some(4);
1546 const COUNT: Option<u32> = Some(8);
1547 const NAME: &'static str = "integer record";
1549 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1550 ext.check_size::<Self>()?;
1552 let mut input = &ext.data[..];
1553 let data: Vec<i32> = (0..8)
1554 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1556 Ok(Record::IntegerInfo(IntegerInfoRecord {
1557 offsets: ext.offsets.clone(),
1558 version: (data[0], data[1], data[2]),
1559 machine_code: data[3],
1560 floating_point_rep: data[4],
1561 compression_code: data[5],
1562 endianness: data[6],
1563 character_code: data[7],
1568 #[derive(Clone, Debug)]
1569 pub struct FloatInfoRecord {
1575 impl ExtensionRecord for FloatInfoRecord {
1576 const SUBTYPE: u32 = 4;
1577 const SIZE: Option<u32> = Some(8);
1578 const COUNT: Option<u32> = Some(3);
1579 const NAME: &'static str = "floating point record";
1581 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1582 ext.check_size::<Self>()?;
1584 let mut input = &ext.data[..];
1585 let data: Vec<f64> = (0..3)
1586 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1588 Ok(Record::FloatInfo(FloatInfoRecord {
1596 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1597 pub enum CategoryLabels {
1602 #[derive(Clone, Debug)]
1603 pub enum MultipleResponseType {
1606 labels: CategoryLabels,
1611 impl MultipleResponseType {
1612 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
1613 let (mr_type, input) = match input.split_first() {
1614 Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
1615 Some((b'D', input)) => {
1616 let (value, input) = parse_counted_string(input)?;
1618 MultipleResponseType::MultipleDichotomy {
1620 labels: CategoryLabels::VarLabels,
1625 Some((b'E', input)) => {
1626 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1627 (CategoryLabels::CountedValues, rest)
1628 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1629 (CategoryLabels::VarLabels, rest)
1631 return Err(Error::TBD);
1633 let (value, input) = parse_counted_string(input)?;
1635 MultipleResponseType::MultipleDichotomy { value, labels },
1639 _ => return Err(Error::TBD),
1641 Ok((mr_type, input))
1645 #[derive(Clone, Debug)]
1646 pub struct MultipleResponseSet<S>
1652 pub mr_type: MultipleResponseType,
1653 pub short_names: Vec<S>,
1656 impl MultipleResponseSet<RawString> {
1657 fn parse(input: &[u8]) -> Result<(Self, &[u8]), Error> {
1658 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1659 return Err(Error::TBD);
1661 let (name, input) = input.split_at(equals);
1662 let (mr_type, input) = MultipleResponseType::parse(input)?;
1663 let Some(input) = input.strip_prefix(b" ") else {
1664 return Err(Error::TBD);
1666 let (label, mut input) = parse_counted_string(input)?;
1667 let mut vars = Vec::new();
1668 while input.first() != Some(&b'\n') {
1669 match input.split_first() {
1670 Some((b' ', rest)) => {
1671 let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
1672 return Err(Error::TBD);
1674 let (var, rest) = rest.split_at(length);
1675 if !var.is_empty() {
1676 vars.push(var.into());
1680 _ => return Err(Error::TBD),
1683 while input.first() == Some(&b'\n') {
1684 input = &input[1..];
1687 MultipleResponseSet {
1697 fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseSet<Cow<'a, str>> {
1698 MultipleResponseSet {
1699 name: decoder.decode(&self.name),
1700 label: decoder.decode(&self.label),
1701 mr_type: self.mr_type.clone(),
1702 short_names: self.short_names.iter().map(|s| decoder.decode(s)).collect(),
1707 #[derive(Clone, Debug)]
1708 pub struct MultipleResponseRecord<S>(pub Vec<MultipleResponseSet<S>>)
1712 impl ExtensionRecord for MultipleResponseRecord<RawString> {
1713 const SUBTYPE: u32 = 7;
1714 const SIZE: Option<u32> = Some(1);
1715 const COUNT: Option<u32> = None;
1716 const NAME: &'static str = "multiple response set record";
1718 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
1719 ext.check_size::<Self>()?;
1721 let mut input = &ext.data[..];
1722 let mut sets = Vec::new();
1723 while !input.is_empty() {
1724 let (set, rest) = MultipleResponseSet::parse(input)?;
1728 Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
1732 impl MultipleResponseRecord<RawString> {
1733 fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseRecord<Cow<'a, str>> {
1734 MultipleResponseRecord(self.0.iter().map(|set| set.decode(decoder)).collect())
1738 fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Error> {
1739 let Some(space) = input.iter().position(|&b| b == b' ') else {
1740 return Err(Error::TBD);
1742 let Ok(length) = from_utf8(&input[..space]) else {
1743 return Err(Error::TBD);
1745 let Ok(length): Result<usize, _> = length.parse() else {
1746 return Err(Error::TBD);
1749 let input = &input[space + 1..];
1750 if input.len() < length {
1751 return Err(Error::TBD);
1754 let (string, rest) = input.split_at(length);
1755 Ok((string.into(), rest))
1758 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1766 fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
1769 1 => Ok(Some(Measure::Nominal)),
1770 2 => Ok(Some(Measure::Ordinal)),
1771 3 => Ok(Some(Measure::Scale)),
1772 _ => Err(Error::InvalidMeasurement(source)),
1777 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1778 pub enum Alignment {
1785 fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
1788 1 => Ok(Some(Alignment::Left)),
1789 2 => Ok(Some(Alignment::Right)),
1790 3 => Ok(Some(Alignment::Center)),
1791 _ => Err(Error::InvalidAlignment(source)),
1796 #[derive(Clone, Debug)]
1797 pub struct VarDisplay {
1798 pub measure: Option<Measure>,
1799 pub width: Option<u32>,
1800 pub alignment: Option<Alignment>,
1803 #[derive(Clone, Debug)]
1804 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
1806 impl VarDisplayRecord {
1807 const SUBTYPE: u32 = 11;
1813 warn: &Box<dyn Fn(Error)>,
1814 ) -> Result<Record, Error> {
1816 return Err(Error::BadRecordSize {
1817 offset: ext.offsets.start,
1818 record: String::from("variable display record"),
1824 let has_width = if ext.count as usize == 3 * n_vars {
1826 } else if ext.count as usize == 2 * n_vars {
1829 return Err(Error::TBD);
1832 let mut var_displays = Vec::new();
1833 let mut input = &ext.data[..];
1834 for _ in 0..n_vars {
1835 let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1836 .warn_on_error(&warn)
1838 let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
1839 let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1840 .warn_on_error(&warn)
1842 var_displays.push(VarDisplay {
1848 Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
1852 #[derive(Clone, Debug)]
1853 pub struct LongStringMissingValues<N, V>
1862 pub missing_values: MissingValues<V>,
1865 impl LongStringMissingValues<RawString, RawStr<8>> {
1866 fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValues<String, String> {
1867 LongStringMissingValues {
1868 var_name: decoder.decode(&self.var_name).to_string(),
1869 missing_values: self.missing_values.decode(decoder),
1874 #[derive(Clone, Debug)]
1875 pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
1880 impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
1881 const SUBTYPE: u32 = 22;
1882 const SIZE: Option<u32> = Some(1);
1883 const COUNT: Option<u32> = None;
1884 const NAME: &'static str = "long string missing values record";
1886 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1887 ext.check_size::<Self>()?;
1889 let mut input = &ext.data[..];
1890 let mut missing_value_set = Vec::new();
1891 while !input.is_empty() {
1892 let var_name = read_string(&mut input, endian)?;
1893 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1894 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1896 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
1897 return Err(Error::BadLongMissingValueLength {
1898 record_offset: ext.offsets.start,
1903 let mut values = Vec::new();
1904 for i in 0..n_missing_values {
1905 let value: [u8; 8] = read_bytes(&mut input)?;
1906 let numeric_value: u64 = endian.parse(value);
1907 let value = if i > 0 && numeric_value == 8 {
1908 // Tolerate files written by old, buggy versions of PSPP
1909 // where we believed that the value_length was repeated
1910 // before each missing value.
1911 read_bytes(&mut input)?
1915 values.push(Value::String(RawStr(value)));
1917 let missing_values = MissingValues {
1921 missing_value_set.push(LongStringMissingValues {
1926 Ok(Record::LongStringMissingValues(
1927 LongStringMissingValueRecord(missing_value_set),
1932 impl LongStringMissingValueRecord<RawString, RawStr<8>> {
1933 fn decode<'a>(&self, decoder: &Decoder) -> LongStringMissingValueRecord<String, String> {
1934 LongStringMissingValueRecord(self.0.iter().map(|mv| mv.decode(decoder)).collect())
1938 #[derive(Clone, Debug)]
1939 pub struct EncodingRecord(pub String);
1941 impl ExtensionRecord for EncodingRecord {
1942 const SUBTYPE: u32 = 20;
1943 const SIZE: Option<u32> = Some(1);
1944 const COUNT: Option<u32> = None;
1945 const NAME: &'static str = "encoding record";
1947 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
1948 ext.check_size::<Self>()?;
1950 Ok(Record::Encoding(EncodingRecord(
1951 String::from_utf8(ext.data.clone()).map_err(|_| Error::BadEncodingName {
1952 offset: ext.offsets.start,
1958 #[derive(Copy, Clone, Debug)]
1959 pub struct NumberOfCasesRecord {
1960 /// Always observed as 1.
1963 /// Number of cases.
1967 impl ExtensionRecord for NumberOfCasesRecord {
1968 const SUBTYPE: u32 = 16;
1969 const SIZE: Option<u32> = Some(8);
1970 const COUNT: Option<u32> = Some(2);
1971 const NAME: &'static str = "extended number of cases record";
1973 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1974 ext.check_size::<Self>()?;
1976 let mut input = &ext.data[..];
1977 let one = endian.parse(read_bytes(&mut input)?);
1978 let n_cases = endian.parse(read_bytes(&mut input)?);
1980 Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
1984 #[derive(Clone, Debug)]
1985 pub struct TextRecord {
1986 pub offsets: Range<u64>,
1989 pub rec_type: TextRecordType,
1991 /// The text content of the record.
1992 pub text: RawString,
1995 #[derive(Clone, Copy, Debug)]
1996 pub enum TextRecordType {
2006 fn new(extension: Extension, rec_type: TextRecordType) -> Self {
2008 offsets: extension.offsets,
2010 text: extension.data.into(),
2013 fn decode<'a>(&self, decoder: &Decoder) -> Result<Option<Record>, Error> {
2014 match self.rec_type {
2015 TextRecordType::VariableSets => Ok(Some(Record::VariableSets(
2016 VariableSetRecord::decode(self, decoder),
2018 TextRecordType::ProductInfo => Ok(Some(Record::ProductInfo(
2019 ProductInfoRecord::decode(self, decoder),
2021 TextRecordType::LongNames => Ok(Some(Record::LongNames(LongNamesRecord::decode(
2024 TextRecordType::VeryLongStrings => Ok(Some(Record::VeryLongStrings(
2025 VeryLongStringsRecord::decode(self, decoder),
2027 TextRecordType::FileAttributes => {
2028 Ok(FileAttributeRecord::decode(self, decoder).map(|fa| Record::FileAttributes(fa)))
2030 TextRecordType::VariableAttributes => Ok(Some(Record::VariableAttributes(
2031 VariableAttributeRecord::decode(self, decoder),
2037 #[derive(Clone, Debug)]
2038 pub struct VeryLongString {
2039 pub short_name: Identifier,
2043 impl VeryLongString {
2044 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
2045 let Some((short_name, length)) = input.split_once('=') else {
2046 return Err(Error::TBD);
2048 let short_name = decoder
2049 .new_identifier(short_name)
2050 .map_err(Error::InvalidLongStringName)?;
2051 let length = length.parse().map_err(|_| Error::TBD)?;
2052 Ok(VeryLongString { short_name, length })
2056 #[derive(Clone, Debug)]
2057 pub struct VeryLongStringsRecord(Vec<VeryLongString>);
2059 impl VeryLongStringsRecord {
2060 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2061 let input = decoder.decode(&source.text);
2062 let mut very_long_strings = Vec::new();
2065 .map(|s| s.trim_end_matches('\t'))
2066 .filter(|s| !s.is_empty())
2068 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&decoder.warn) {
2069 very_long_strings.push(vls)
2072 VeryLongStringsRecord(very_long_strings)
2076 #[derive(Clone, Debug)]
2077 pub struct Attribute {
2078 pub name: Identifier,
2079 pub values: Vec<String>,
2083 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Error> {
2084 let Some((name, mut input)) = input.split_once('(') else {
2085 return Err(Error::TBD);
2088 .new_identifier(name)
2089 .map_err(Error::InvalidAttributeName)?;
2090 let mut values = Vec::new();
2092 let Some((value, rest)) = input.split_once('\n') else {
2093 return Err(Error::TBD);
2095 if let Some(stripped) = value
2097 .and_then(|value| value.strip_suffix('\''))
2099 values.push(stripped.into());
2101 decoder.warn(Error::TBD);
2102 values.push(value.into());
2104 if let Some(rest) = rest.strip_prefix(')') {
2105 let attribute = Attribute { name, values };
2106 return Ok((attribute, rest));
2113 #[derive(Clone, Debug)]
2114 pub struct AttributeSet(pub Vec<Attribute>);
2120 sentinel: Option<char>,
2121 ) -> Result<(AttributeSet, &'a str), Error> {
2122 let mut attributes = Vec::new();
2124 match input.chars().next() {
2125 None => break input,
2126 c if c == sentinel => break &input[1..],
2128 let (attribute, rest) = Attribute::parse(decoder, input)?;
2129 attributes.push(attribute);
2134 Ok((AttributeSet(attributes), rest))
2138 #[derive(Clone, Debug)]
2139 pub struct FileAttributeRecord(AttributeSet);
2141 impl FileAttributeRecord {
2142 fn decode(source: &TextRecord, decoder: &Decoder) -> Option<Self> {
2143 let input = decoder.decode(&source.text);
2144 match AttributeSet::parse(decoder, &input, None).warn_on_error(&decoder.warn) {
2145 Some((set, rest)) => {
2146 if !rest.is_empty() {
2147 decoder.warn(Error::TBD);
2149 Some(FileAttributeRecord(set))
2156 #[derive(Clone, Debug)]
2157 pub struct VarAttributeSet {
2158 pub long_var_name: Identifier,
2159 pub attributes: AttributeSet,
2162 impl VarAttributeSet {
2163 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Error> {
2164 let Some((long_var_name, rest)) = input.split_once(':') else {
2165 return Err(Error::TBD);
2167 let long_var_name = decoder
2168 .new_identifier(long_var_name)
2169 .map_err(Error::InvalidAttributeVariableName)?;
2170 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
2171 let var_attribute = VarAttributeSet {
2175 Ok((var_attribute, rest))
2179 #[derive(Clone, Debug)]
2180 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
2182 impl VariableAttributeRecord {
2183 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2184 let decoded = decoder.decode(&source.text);
2185 let mut input = decoded.as_ref();
2186 let mut var_attribute_sets = Vec::new();
2187 while !input.is_empty() {
2188 let Some((var_attribute, rest)) =
2189 VarAttributeSet::parse(decoder, &input).warn_on_error(&decoder.warn)
2193 var_attribute_sets.push(var_attribute);
2194 input = rest.into();
2196 VariableAttributeRecord(var_attribute_sets)
2200 #[derive(Clone, Debug)]
2201 pub struct LongName {
2202 pub short_name: Identifier,
2203 pub long_name: Identifier,
2207 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Error> {
2208 let Some((short_name, long_name)) = input.split_once('=') else {
2209 return Err(Error::TBD);
2211 let short_name = decoder
2212 .new_identifier(short_name)
2213 .map_err(Error::InvalidShortName)?;
2214 let long_name = decoder
2215 .new_identifier(long_name)
2216 .map_err(Error::InvalidLongName)?;
2224 #[derive(Clone, Debug)]
2225 pub struct LongNamesRecord(Vec<LongName>);
2227 impl LongNamesRecord {
2228 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2229 let input = decoder.decode(&source.text);
2230 let mut names = Vec::new();
2231 for pair in input.split('\t').filter(|s| !s.is_empty()) {
2232 if let Some(long_name) = LongName::parse(pair, decoder).warn_on_error(&decoder.warn) {
2233 names.push(long_name);
2236 LongNamesRecord(names)
2240 #[derive(Clone, Debug)]
2241 pub struct ProductInfoRecord(pub String);
2243 impl ProductInfoRecord {
2244 const NAME: &'static str = "extra product info";
2245 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2246 Self(decoder.decode(&source.text).into())
2249 #[derive(Clone, Debug)]
2250 pub struct VariableSet {
2252 pub vars: Vec<Identifier>,
2256 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Error> {
2257 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
2258 let mut vars = Vec::new();
2259 for var in input.split_ascii_whitespace() {
2260 if let Some(identifier) = decoder
2261 .new_identifier(var)
2262 .map_err(Error::InvalidVariableSetName)
2263 .warn_on_error(&decoder.warn)
2265 vars.push(identifier);
2275 #[derive(Clone, Debug)]
2276 pub struct VariableSetRecord {
2277 pub offsets: Range<u64>,
2278 pub sets: Vec<VariableSet>,
2281 impl VariableSetRecord {
2282 fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
2283 let mut sets = Vec::new();
2284 let input = decoder.decode(&source.text);
2285 for line in input.lines() {
2286 if let Some(set) = VariableSet::parse(line, decoder).warn_on_error(&decoder.warn) {
2291 offsets: source.offsets.clone(),
2297 trait WarnOnError<T> {
2298 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
2300 impl<T> WarnOnError<T> for Result<T, Error> {
2301 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
2303 Ok(result) => Some(result),
2312 #[derive(Clone, Debug)]
2313 pub struct Extension {
2314 pub offsets: Range<u64>,
2319 /// Size of each data element.
2322 /// Number of data elements.
2325 /// `size * count` bytes of data.
2330 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
2331 if let Some(expected_size) = E::SIZE {
2332 if self.size != expected_size {
2333 return Err(Error::BadRecordSize {
2334 offset: self.offsets.start,
2335 record: E::NAME.into(),
2341 if let Some(expected_count) = E::COUNT {
2342 if self.count != expected_count {
2343 return Err(Error::BadRecordCount {
2344 offset: self.offsets.start,
2345 record: E::NAME.into(),
2354 fn read<R: Read + Seek>(
2358 warn: &Box<dyn Fn(Error)>,
2359 ) -> Result<Option<Record>, Error> {
2360 let subtype = endian.parse(read_bytes(r)?);
2361 let header_offset = r.stream_position()?;
2362 let size: u32 = endian.parse(read_bytes(r)?);
2363 let count = endian.parse(read_bytes(r)?);
2364 let Some(product) = size.checked_mul(count) else {
2365 return Err(Error::ExtensionRecordTooLarge {
2366 offset: header_offset,
2372 let start_offset = r.stream_position()?;
2373 let data = read_vec(r, product as usize)?;
2374 let end_offset = start_offset + product as u64;
2375 let extension = Extension {
2376 offsets: start_offset..end_offset,
2382 let result = match subtype {
2383 IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
2384 FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
2385 VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
2386 MultipleResponseRecord::SUBTYPE | 19 => {
2387 MultipleResponseRecord::parse(&extension, endian)
2389 LongStringValueLabelRecord::SUBTYPE => {
2390 LongStringValueLabelRecord::parse(&extension, endian)
2392 EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
2393 NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
2394 5 => Ok(Record::Text(TextRecord::new(
2396 TextRecordType::VariableSets,
2398 10 => Ok(Record::Text(TextRecord::new(
2400 TextRecordType::ProductInfo,
2402 13 => Ok(Record::Text(TextRecord::new(
2404 TextRecordType::LongNames,
2406 14 => Ok(Record::Text(TextRecord::new(
2408 TextRecordType::VeryLongStrings,
2410 17 => Ok(Record::Text(TextRecord::new(
2412 TextRecordType::FileAttributes,
2414 18 => Ok(Record::Text(TextRecord::new(
2416 TextRecordType::VariableAttributes,
2418 _ => Ok(Record::OtherExtension(extension)),
2421 Ok(result) => Ok(Some(result)),
2430 #[derive(Clone, Debug)]
2431 pub struct ZHeader {
2432 /// File offset to the start of the record.
2435 /// File offset to the ZLIB data header.
2436 pub zheader_offset: u64,
2438 /// File offset to the ZLIB trailer.
2439 pub ztrailer_offset: u64,
2441 /// Length of the ZLIB trailer in bytes.
2442 pub ztrailer_len: u64,
2446 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
2447 let offset = r.stream_position()?;
2448 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
2449 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
2450 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
2461 #[derive(Clone, Debug)]
2462 pub struct ZTrailer {
2463 /// File offset to the start of the record.
2466 /// Compression bias as a negative integer, e.g. -100.
2469 /// Always observed as zero.
2472 /// Uncompressed size of each block, except possibly the last. Only
2473 /// `0x3ff000` has been observed so far.
2474 pub block_size: u32,
2476 /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
2477 pub blocks: Vec<ZBlock>,
2480 #[derive(Clone, Debug)]
2482 /// Offset of block of data if simple compression were used.
2483 pub uncompressed_ofs: u64,
2485 /// Actual offset within the file of the compressed data block.
2486 pub compressed_ofs: u64,
2488 /// The number of bytes in this data block after decompression. This is
2489 /// `block_size` in every data block but the last, which may be smaller.
2490 pub uncompressed_size: u32,
2492 /// The number of bytes in this data block, as stored compressed in this
2494 pub compressed_size: u32,
2498 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
2500 uncompressed_ofs: endian.parse(read_bytes(r)?),
2501 compressed_ofs: endian.parse(read_bytes(r)?),
2502 uncompressed_size: endian.parse(read_bytes(r)?),
2503 compressed_size: endian.parse(read_bytes(r)?),
2509 fn read<R: Read + Seek>(
2514 ) -> Result<Option<ZTrailer>, Error> {
2515 let start_offset = reader.stream_position()?;
2516 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
2519 let int_bias = endian.parse(read_bytes(reader)?);
2520 let zero = endian.parse(read_bytes(reader)?);
2521 let block_size = endian.parse(read_bytes(reader)?);
2522 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
2523 let expected_n_blocks = (ztrailer_len - 24) / 24;
2524 if n_blocks as u64 != expected_n_blocks {
2525 return Err(Error::BadZlibTrailerNBlocks {
2526 offset: ztrailer_ofs,
2532 let blocks = (0..n_blocks)
2533 .map(|_| ZBlock::read(reader, endian))
2534 .collect::<Result<Vec<_>, _>>()?;
2535 reader.seek(SeekFrom::Start(start_offset))?;
2537 offset: ztrailer_ofs,
/// Reads exactly `N` bytes from `r`, unless `r` has nothing to offer.
///
/// Returns `Ok(None)` if the first read yields zero bytes, and
/// `Ok(Some(bytes))` once all `N` bytes have been read.
///
/// # Errors
///
/// Propagates any I/O error from `r`, including the error `read_exact`
/// reports when the input ends after some, but not all, of the bytes.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];
    match r.read(&mut buf)? {
        0 => Ok(None),
        filled => {
            // A short first read is not an error; finish filling the buffer.
            if filled < N {
                r.read_exact(&mut buf[filled..])?;
            }
            Ok(Some(buf))
        }
    }
}
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// # Errors
///
/// Propagates the error from `read_exact` if `r` fails or ends before `N`
/// bytes have been read.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
/// Reads exactly `n` bytes from `r` into a newly allocated vector.
///
/// `n` is typically taken from a length field in the file being parsed, so
/// it cannot be trusted.  The buffer therefore grows only as data actually
/// arrives: a corrupt length in a short file produces an error instead of a
/// huge up-front allocation.
///
/// # Errors
///
/// Propagates any I/O error from `r`.  If `r` ends before `n` bytes have
/// been read, returns an error of kind `UnexpectedEof`.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    // Reserve eagerly only for modest sizes; larger buffers grow on demand.
    const MAX_EAGER_CAPACITY: usize = 64 * 1024;

    let mut vec = Vec::with_capacity(n.min(MAX_EAGER_CAPACITY));
    let n_read = r.by_ref().take(n as u64).read_to_end(&mut vec)?;
    if n_read != n {
        // Same kind and message that `read_exact` reports on a short read.
        return Err(IoError::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to fill whole buffer",
        ));
    }
    Ok(vec)
}
2571 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
2572 let length: u32 = endian.parse(read_bytes(r)?);
2573 Ok(read_vec(r, length as usize)?.into())
2576 #[derive(Clone, Debug)]
2577 pub struct LongStringValueLabels<S>
2584 /// `(value, label)` pairs, where each value is `width` bytes.
2585 pub labels: Vec<(S, S)>,
2588 #[derive(Clone, Debug)]
2589 pub struct LongStringValueLabelRecord<S>(pub Vec<LongStringValueLabels<S>>)
2593 impl ExtensionRecord for LongStringValueLabelRecord<RawString> {
2594 const SUBTYPE: u32 = 21;
2595 const SIZE: Option<u32> = Some(1);
2596 const COUNT: Option<u32> = None;
2597 const NAME: &'static str = "long string value labels record";
2599 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
2600 ext.check_size::<Self>()?;
2602 let mut input = &ext.data[..];
2603 let mut label_set = Vec::new();
2604 while !input.is_empty() {
2605 let var_name = read_string(&mut input, endian)?;
2606 let width: u32 = endian.parse(read_bytes(&mut input)?);
2607 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
2608 let mut labels = Vec::new();
2609 for _ in 0..n_labels {
2610 let value = read_string(&mut input, endian)?;
2611 let label = read_string(&mut input, endian)?;
2612 labels.push((value, label));
2614 label_set.push(LongStringValueLabels {
2620 Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(