3 endian::{Endian, Parse, ToBytes},
4 identifier::{Error as IdError, Identifier},
7 use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
8 use flate2::read::ZlibDecoder;
14 collections::{VecDeque, HashMap},
15 fmt::{Debug, Display, Formatter, Result as FmtResult},
16 io::{Error as IoError, Read, Seek, SeekFrom},
23 use thiserror::Error as ThisError;
25 #[derive(ThisError, Debug)]
27 #[error("Not an SPSS system file")]
30 #[error("Invalid magic number {0:?}")]
33 #[error("I/O error ({0})")]
36 #[error("Invalid SAV compression code {0}")]
37 InvalidSavCompression(u32),
39 #[error("Invalid ZSAV compression code {0}")]
40 InvalidZsavCompression(u32),
42 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
43 BadVariableWidth { offset: u64, width: i32 },
45 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
46 BadDocumentLength { offset: u64, n: usize, max: usize },
48 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
49 BadRecordType { offset: u64, rec_type: u32 },
51 #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
52 BadVariableLabelCode {
59 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
61 BadNumericMissingValueCode { offset: u64, code: i32 },
63 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
64 BadStringMissingValueCode { offset: u64, code: i32 },
66 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
67 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
69 #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
70 ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
72 #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
73 TooManyVarIndexes { offset: u64, n: u32, max: u32 },
75 #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
76 NoVarIndexes { offset: u64 },
78 #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
82 wrong_types: Vec<u32>,
85 #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
92 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
93 ExtensionRecordTooLarge {
100 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
108 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
110 EofInCompressedCase { offset: u64, case_ofs: u64 },
112 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
113 PartialCompressedCase { offset: u64, case_ofs: u64 },
115 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
116 CompressedNumberExpected { offset: u64, case_ofs: u64 },
118 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
119 CompressedStringExpected { offset: u64, case_ofs: u64 },
121 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
122 BadZlibTrailerNBlocks {
125 expected_n_blocks: u64,
129 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
137 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
145 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
146 BadLongMissingValueLength {
152 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
153 BadEncodingName { offset: u64 },
155 // XXX This is risky because `text` might be arbitrarily long.
156 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
157 MalformedString { encoding: String, text: String },
159 #[error("Invalid variable measurement level value {0}")]
160 InvalidMeasurement(u32),
162 #[error("Invalid variable display alignment value {0}")]
163 InvalidAlignment(u32),
165 #[error("Invalid attribute name. {0}")]
166 InvalidAttributeName(IdError),
168 #[error("Invalid variable name in attribute record. {0}")]
169 InvalidAttributeVariableName(IdError),
171 #[error("Invalid short name in long variable name record. {0}")]
172 InvalidShortName(IdError),
174 #[error("Invalid name in long variable name record. {0}")]
175 InvalidLongName(IdError),
177 #[error("Invalid variable name in very long string record. {0}")]
178 InvalidLongStringName(IdError),
180 #[error("Invalid variable name in variable set record. {0}")]
181 InvalidVariableSetName(IdError),
183 #[error("Invalid multiple response set name. {0}")]
184 InvalidMrSetName(IdError),
186 #[error("Invalid multiple response set variable name. {0}")]
187 InvalidMrSetVariableName(IdError),
189 #[error("Invalid variable name in long string missing values record. {0}")]
190 InvalidLongStringMissingValueVariableName(IdError),
192 #[error("Details TBD")]
196 #[derive(Clone, Debug)]
198 Header(HeaderRecord<RawString>),
199 Variable(VariableRecord<RawString, RawStr<8>>),
200 ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
201 Document(DocumentRecord<RawDocumentLine>),
202 IntegerInfo(IntegerInfoRecord),
203 FloatInfo(FloatInfoRecord),
204 VariableSets(VariableSetRecord),
205 VarDisplay(VarDisplayRecord),
206 MultipleResponse(MultipleResponseRecord<RawString, RawString>),
207 LongStringValueLabels(LongStringValueLabelRecord<RawString>),
208 LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
209 Encoding(EncodingRecord),
210 NumberOfCases(NumberOfCasesRecord),
211 ProductInfo(ProductInfoRecord),
212 LongNames(LongNamesRecord),
213 VeryLongStrings(VeryLongStringsRecord),
214 FileAttributes(FileAttributeRecord),
215 VariableAttributes(VariableAttributeRecord),
217 OtherExtension(Extension),
221 Cases(Rc<RefCell<Cases>>),
228 var_types: &[VarType],
229 warn: &Box<dyn Fn(Error)>,
230 ) -> Result<Option<Record>, Error>
234 let rec_type: u32 = endian.parse(read_bytes(reader)?);
236 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
237 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
238 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
239 7 => Extension::read(reader, endian, var_types.len(), warn),
240 999 => Ok(Some(Record::EndOfHeaders(
241 endian.parse(read_bytes(reader)?),
243 _ => Err(Error::BadRecordType {
244 offset: reader.stream_position()?,
253 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
254 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
255 fn default_decode(s: &[u8]) -> Cow<str> {
256 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
259 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
260 pub enum Compression {
266 fn offsets(&self) -> Range<u64>;
270 pub struct HeaderRecord<S>
275 pub offsets: Range<u64>,
280 /// Eye-catcher string, product name, in the file's encoding. Padded
281 /// on the right with spaces.
284 /// Layout code, normally either 2 or 3.
285 pub layout_code: u32,
287 /// Number of variable positions, or `None` if the value in the file is
288 /// questionably trustworthy.
289 pub nominal_case_size: Option<u32>,
291 /// Compression type, if any.
292 pub compression: Option<Compression>,
294 /// 1-based variable index of the weight variable, or `None` if the file is
296 pub weight_index: Option<u32>,
298 /// Claimed number of cases, if known.
299 pub n_cases: Option<u32>,
301 /// Compression bias, usually 100.0.
304 /// `dd mmm yy` in the file's encoding.
305 pub creation_date: S,
307 /// `HH:MM:SS` in the file's encoding.
308 pub creation_time: S,
310 /// File label, in the file's encoding. Padded on the right with spaces.
313 /// Endianness of the data in the file header.
317 impl<S> HeaderRecord<S>
321 fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
325 writeln!(f, "{name:>17}: {:?}", value)
329 impl<S> Debug for HeaderRecord<S>
333 fn fmt(&self, f: &mut Formatter) -> FmtResult {
334 writeln!(f, "File header record:")?;
335 self.debug_field(f, "Magic", self.magic)?;
336 self.debug_field(f, "Product name", &self.eye_catcher)?;
337 self.debug_field(f, "Layout code", self.layout_code)?;
338 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
339 self.debug_field(f, "Compression", self.compression)?;
340 self.debug_field(f, "Weight index", self.weight_index)?;
341 self.debug_field(f, "Number of cases", self.n_cases)?;
342 self.debug_field(f, "Compression bias", self.bias)?;
343 self.debug_field(f, "Creation date", &self.creation_date)?;
344 self.debug_field(f, "Creation time", &self.creation_time)?;
345 self.debug_field(f, "File label", &self.file_label)?;
346 self.debug_field(f, "Endianness", self.endian)
350 impl HeaderRecord<RawString> {
/// Reads the file header record from `r`, starting at the current stream
/// position.
///
/// # Errors
///
/// Returns `Error::NotASystemFile` if the magic number is not recognized or
/// if the layout code does not identify a byte order, and
/// `Error::InvalidZsavCompression` / `Error::InvalidSavCompression` for a
/// compression code that is not accepted for the file's magic number.  Read
/// failures are propagated with `?`.
351 fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
352 let start = r.stream_position()?;
354 let magic: [u8; 4] = read_bytes(r)?;
355 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
357 let eye_catcher = RawString(read_vec(r, 60)?);
358 let layout_code: [u8; 4] = read_bytes(r)?;
// The layout code is normally either 2 or 3, so probe for both values when
// detecting the byte order.  (Previously the fallback probed 2 a second
// time, so a file with layout code 3 was rejected as not a system file.)
359 let endian = Endian::identify_u32(2, layout_code)
360 .or_else(|| Endian::identify_u32(3, layout_code))
361 .ok_or_else(|| Error::NotASystemFile)?;
362 let layout_code = endian.parse(layout_code);
// Values above `i32::MAX / 16` are not trusted and are recorded as `None`.
364 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
365 let nominal_case_size =
366 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
// Which compression codes are acceptable depends on the magic number.
368 let compression_code: u32 = endian.parse(read_bytes(r)?);
369 let compression = match (magic, compression_code) {
370 (Magic::Zsav, 2) => Some(Compression::ZLib),
371 (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
373 (_, 1) => Some(Compression::Simple),
374 (_, code) => return Err(Error::InvalidSavCompression(code)),
// A weight index of 0 is recorded as `None`.
377 let weight_index: u32 = endian.parse(read_bytes(r)?);
378 let weight_index = (weight_index > 0).then_some(weight_index);
// Case counts of `i32::MAX / 2` or more are recorded as `None`.
380 let n_cases: u32 = endian.parse(read_bytes(r)?);
381 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
383 let bias: f64 = endian.parse(read_bytes(r)?);
385 let creation_date = RawString(read_vec(r, 9)?);
386 let creation_time = RawString(read_vec(r, 8)?);
387 let file_label = RawString(read_vec(r, 64)?);
// Three more bytes are read and discarded.
388 let _: [u8; 3] = read_bytes(r)?;
391 offsets: start..r.stream_position()?,
407 pub fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord<Cow<'a, str>> {
408 let eye_catcher = decoder.decode(&self.eye_catcher);
409 let file_label = decoder.decode(&self.file_label);
410 let creation_date = decoder.decode(&self.creation_date);
411 let creation_time = decoder.decode(&self.creation_time);
414 weight_index: self.weight_index,
415 n_cases: self.n_cases,
417 offsets: self.offsets.clone(),
419 layout_code: self.layout_code,
420 nominal_case_size: self.nominal_case_size,
421 compression: self.compression,
431 pub encoding: &'static Encoding,
432 pub warn: Box<dyn Fn(Error)>,
436 fn warn(&self, error: Error) {
439 fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
440 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
442 self.warn(Error::MalformedString {
443 encoding: self.encoding.name().into(),
444 text: output.clone().into(),
450 fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
451 self.decode_slice(input.0.as_slice())
454 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
455 /// re-encoding the result back into `self.encoding` will have exactly the
456 /// same length in bytes.
///
/// Malformed input is replaced by `?` characters, one per skipped byte,
/// instead of a replacement character.
///
/// NOTE(review): the closing `assert_eq!` panics if the re-encoded length
/// differs from `input.len()`; confirm that a panic is acceptable for
/// malformed file contents.
458 /// XXX warn about errors?
459 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
460 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
461 // This is the common case. Usually there will be no errors.
464 // Unusual case. Don't bother to optimize it much.
465 let mut decoder = self.encoding.new_decoder_without_bom_handling();
// Size the output from the decoder's own worst-case estimate for
// `input.len()` bytes; presumably this is why `OutputFull` is treated as
// unreachable below.
466 let mut output = String::with_capacity(
468 .max_utf8_buffer_length_without_replacement(input.len())
471 let mut rest = input;
472 while !rest.is_empty() {
473 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
474 (DecoderResult::InputEmpty, _) => break,
475 (DecoderResult::OutputFull, _) => unreachable!(),
476 (DecoderResult::Malformed(a, b), consumed) => {
// Emit one `?` for each of the `a + b` bytes reported with the malformed
// sequence, then resume decoding after the bytes consumed so far.
477 let skipped = a as usize + b as usize;
478 output.extend(repeat('?').take(skipped));
479 rest = &rest[consumed..];
// Check the length-preservation contract stated in the doc comment.
483 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
488 pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
489 self.new_identifier(&self.decode(input))
492 pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
493 Identifier::new(name, self.encoding)
497 impl<S> Header for HeaderRecord<S>
501 fn offsets(&self) -> Range<u64> {
506 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
508 /// Regular system file.
511 /// System file with Zlib-compressed data.
514 /// EBCDIC-encoded system file.
519 /// Magic number for a regular system file.
520 pub const SAV: [u8; 4] = *b"$FL2";
522 /// Magic number for a system file that contains zlib-compressed data.
523 pub const ZSAV: [u8; 4] = *b"$FL3";
525 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
527 pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
530 impl Debug for Magic {
531 fn fmt(&self, f: &mut Formatter) -> FmtResult {
532 let s = match *self {
533 Magic::Sav => "$FL2",
534 Magic::Zsav => "$FL3",
535 Magic::Ebcdic => "($FL2 in EBCDIC)",
541 impl TryFrom<[u8; 4]> for Magic {
544 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
546 Magic::SAV => Ok(Magic::Sav),
547 Magic::ZSAV => Ok(Magic::Zsav),
548 Magic::EBCDIC => Ok(Magic::Ebcdic),
549 _ => Err(Error::BadMagic(value)),
554 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
561 pub fn from_width(width: VarWidth) -> VarType {
563 VarWidth::Numeric => Self::Numeric,
564 VarWidth::String(_) => Self::String,
568 pub fn opposite(self) -> VarType {
570 Self::Numeric => Self::String,
571 Self::String => Self::Numeric,
576 impl Display for VarType {
577 fn fmt(&self, f: &mut Formatter) -> FmtResult {
579 VarType::Numeric => write!(f, "numeric"),
580 VarType::String => write!(f, "string"),
585 #[derive(Copy, Clone)]
594 type RawValue = Value<RawStr<8>>;
596 impl<S> Debug for Value<S>
600 fn fmt(&self, f: &mut Formatter) -> FmtResult {
602 Value::Number(Some(number)) => write!(f, "{number:?}"),
603 Value::Number(None) => write!(f, "SYSMIS"),
604 Value::String(s) => write!(f, "{:?}", s),
610 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
612 &UntypedValue(read_bytes(r)?),
/// Interprets the 8 raw bytes in `raw` according to `var_type`.
///
/// For a string variable the bytes are kept verbatim.  For a numeric
/// variable they are parsed as an `f64` using `endian`; the value
/// `-f64::MAX` becomes `Value::Number(None)`, which the `Debug` impl prints
/// as `SYSMIS`.
618 pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
620 VarType::String => Value::String(RawStr(raw.0)),
621 VarType::Numeric => {
622 let number: f64 = endian.parse(raw.0);
// `-f64::MAX` is the in-file marker for a missing number.
623 Value::Number((number != -f64::MAX).then_some(number))
628 fn read_case<R: Read + Seek>(
630 var_types: &[VarType],
632 ) -> Result<Option<Vec<Self>>, Error> {
633 let case_start = reader.stream_position()?;
634 let mut values = Vec::with_capacity(var_types.len());
635 for (i, &var_type) in var_types.iter().enumerate() {
636 let Some(raw) = try_read_bytes(reader)? else {
640 let offset = reader.stream_position()?;
641 return Err(Error::EofInCase {
643 case_ofs: offset - case_start,
644 case_len: var_types.len() * 8,
648 values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
/// Reads one case from a compressed data stream, producing one value per
/// entry in `var_types`.
///
/// Compression codes are taken from `codes`; when it runs empty, the next 8
/// code bytes are read from `reader` and appended to it.  Codes `1..=251`
/// decode to `code - bias`, code 254 stands for a string of spaces, and
/// code 255 stands for a missing number.
///
/// # Errors
///
/// Returns `Error::EofInCompressedCase` or `Error::PartialCompressedCase`
/// when the data ends inside a case, `Error::CompressedNumberExpected` when a
/// string code is found for a numeric variable, and
/// `Error::CompressedStringExpected` when a numeric code is found for a
/// string variable.  Read failures are propagated with `?`.
653 fn read_compressed_case<R: Read + Seek>(
655 var_types: &[VarType],
656 codes: &mut VecDeque<u8>,
659 ) -> Result<Option<Vec<Self>>, Error> {
// Remember where the case starts so errors can report an offset within it.
660 let case_start = reader.stream_position()?;
661 let mut values = Vec::with_capacity(var_types.len());
662 for (i, &var_type) in var_types.iter().enumerate() {
// Take the next compression code, refilling `codes` 8 bytes at a time.
664 let Some(code) = codes.pop_front() else {
665 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
669 let offset = reader.stream_position()?;
670 return Err(Error::EofInCompressedCase {
672 case_ofs: offset - case_start,
676 codes.extend(new_codes.into_iter());
// Codes 1..=251 encode the value `code - bias` directly.
681 1..=251 => match var_type {
682 VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
684 break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
691 let offset = reader.stream_position()?;
692 return Err(Error::PartialCompressedCase {
694 case_ofs: offset - case_start,
// The value is stored uncompressed: read its 8 raw bytes.
699 break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
// Code 254 is a string of spaces, so it is only valid for a string variable.
// NOTE(review): the literal appears as a single space in this listing; a
// `RawStr<8>` needs eight bytes, so confirm it is eight spaces in the file.
701 254 => match var_type {
702 VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
703 VarType::Numeric => {
// A string code was found where a number was expected.  (Previously this
// reported `CompressedStringExpected`, whose message says the opposite.)
704 return Err(Error::CompressedNumberExpected {
706 case_ofs: reader.stream_position()? - case_start,
// Code 255 is a missing number, so it is only valid for a numeric variable.
710 255 => match var_type {
711 VarType::Numeric => break Self::Number(None),
// A numeric code was found where a string was expected.  (Previously this
// reported `CompressedNumberExpected`, whose message says the opposite.)
713 return Err(Error::CompressedStringExpected {
715 case_ofs: reader.stream_position()? - case_start,
726 fn decode(&self, decoder: &Decoder) -> Value<String> {
728 Self::Number(x) => Value::Number(*x),
729 Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
734 struct ZlibDecodeMultiple<R>
738 reader: Option<ZlibDecoder<R>>,
741 impl<R> ZlibDecodeMultiple<R>
745 fn new(reader: R) -> ZlibDecodeMultiple<R> {
747 reader: Some(ZlibDecoder::new(reader)),
752 impl<R> Read for ZlibDecodeMultiple<R>
756 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
758 match self.reader.as_mut().unwrap().read(buf)? {
760 let inner = self.reader.take().unwrap().into_inner();
761 self.reader = Some(ZlibDecoder::new(inner));
769 impl<R> Seek for ZlibDecodeMultiple<R>
773 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
774 self.reader.as_mut().unwrap().get_mut().seek(pos)
783 ztrailer_offset: u64,
792 R: Read + Seek + 'static,
795 warn: Box<dyn Fn(Error)>,
797 header: HeaderRecord<RawString>,
798 var_types: Vec<VarType>,
805 R: Read + Seek + 'static,
807 pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
809 F: Fn(Error) + 'static,
811 let header = HeaderRecord::read(&mut reader)?;
813 reader: Some(reader),
814 warn: Box::new(warn),
816 var_types: Vec::new(),
817 state: ReaderState::Start,
820 fn cases(&mut self) -> Cases {
821 self.state = ReaderState::End;
823 self.reader.take().unwrap(),
824 take(&mut self.var_types),
830 impl<R> Iterator for Reader<R>
832 R: Read + Seek + 'static,
834 type Item = Result<Record, Error>;
836 fn next(&mut self) -> Option<Self::Item> {
838 ReaderState::Start => {
839 self.state = ReaderState::Headers;
840 Some(Ok(Record::Header(self.header.clone())))
842 ReaderState::Headers => {
845 self.reader.as_mut().unwrap(),
847 self.var_types.as_slice(),
850 Ok(Some(record)) => break record,
852 Err(error) => return Some(Err(error)),
856 Record::Variable(VariableRecord { width, .. }) => {
857 self.var_types.push(if width == 0 {
863 Record::EndOfHeaders(_) => {
864 self.state = if let Some(Compression::ZLib) = self.header.compression {
865 ReaderState::ZlibHeader
874 ReaderState::ZlibHeader => {
875 let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
877 Ok(zheader) => zheader,
878 Err(error) => return Some(Err(error)),
880 self.state = ReaderState::ZlibTrailer {
881 ztrailer_offset: zheader.ztrailer_offset,
882 ztrailer_len: zheader.ztrailer_len,
884 Some(Ok(Record::ZHeader(zheader)))
886 ReaderState::ZlibTrailer {
890 match ZTrailer::read(
891 self.reader.as_mut().unwrap(),
896 Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
897 Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
898 Err(error) => Some(Err(error)),
901 ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
902 ReaderState::End => None,
// Combines `Read` and `Seek` into a single trait so that such a reader can
// be named as one trait object, as in `Box<dyn ReadSeek>`.
907 trait ReadSeek: Read + Seek {}
// Blanket implementation: every type that is `Read + Seek` is a `ReadSeek`.
908 impl<T> ReadSeek for T where T: Read + Seek {}
911 reader: Box<dyn ReadSeek>,
912 var_types: Vec<VarType>,
913 compression: Option<Compression>,
920 impl Debug for Cases {
921 fn fmt(&self, f: &mut Formatter) -> FmtResult {
927 fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
929 R: Read + Seek + 'static,
932 reader: if header.compression == Some(Compression::ZLib) {
933 Box::new(ZlibDecodeMultiple::new(reader))
938 compression: header.compression,
940 endian: header.endian,
941 codes: VecDeque::with_capacity(8),
947 impl Iterator for Cases {
948 type Item = Result<Vec<RawValue>, Error>;
950 fn next(&mut self) -> Option<Self::Item> {
955 let retval = if self.compression.is_some() {
956 Value::read_compressed_case(
965 Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
967 self.eof = matches!(retval, None | Some(Err(_)));
/// A raw 32-bit format specification, as read for the `print_format` and
/// `write_format` of a variable record.
///
/// The `Debug` impl splits the value into a type code (bits 16 and up), `w`
/// (bits 8..16) and `d` (bits 0..8) — presumably width and decimals; TODO
/// confirm.
972 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
973 pub struct Spec(pub u32);
// Formats a `Spec` as its raw value in six hex digits followed by
// `(<type name><w>.<d>)`.
975 impl Debug for Spec {
976 fn fmt(&self, f: &mut Formatter) -> FmtResult {
// Bits 16 and up: type code, turned into a name by `format_name`.
977 let type_ = format_name(self.0 >> 16);
// Bits 8..16.
978 let w = (self.0 >> 8) & 0xff;
// Bits 0..8.
979 let d = self.0 & 0xff;
980 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
984 fn format_name(type_: u32) -> Cow<'static, str> {
1023 _ => return format!("<unknown format {type_}>").into(),
1029 pub struct MissingValues<S = String>
1033 /// Individual missing values, up to 3 of them.
1034 pub values: Vec<Value<S>>,
1036 /// Optional range of missing values.
1037 pub range: Option<(Value<S>, Value<S>)>,
1040 impl<S> Debug for MissingValues<S>
1044 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1045 for (i, value) in self.values.iter().enumerate() {
1049 write!(f, "{value:?}")?;
1052 if let Some((low, high)) = &self.range {
1053 if !self.values.is_empty() {
1056 write!(f, "{low:?} THRU {high:?}")?;
1059 if self.is_empty() {
1067 impl<S> MissingValues<S>
1071 fn is_empty(&self) -> bool {
1072 self.values.is_empty() && self.range.is_none()
1076 impl<S> Default for MissingValues<S>
1080 fn default() -> Self {
1088 impl MissingValues<RawStr<8>> {
1089 fn read<R: Read + Seek>(
1095 ) -> Result<Self, Error> {
1096 let (n_values, has_range) = match (width, code) {
1097 (_, 0..=3) => (code, false),
1098 (0, -2) => (0, true),
1099 (0, -3) => (1, true),
1100 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
1101 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
1104 let var_type = if width == 0 {
1110 let mut values = Vec::new();
1111 for _ in 0..n_values {
1112 values.push(RawValue::read(r, var_type, endian)?);
1114 let range = if has_range {
1115 let low = RawValue::read(r, var_type, endian)?;
1116 let high = RawValue::read(r, var_type, endian)?;
1121 Ok(Self { values, range })
1123 fn decode<'a>(&'a self, decoder: &Decoder) -> MissingValues<String> {
1128 .map(|value| value.decode(decoder))
1133 .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
1139 pub struct VariableRecord<S, V>
1144 /// Range of offsets in file.
1145 pub offsets: Range<u64>,
1147 /// Variable width, in the range -1..=255.
1150 /// Variable name, padded on the right with spaces.
1154 pub print_format: Spec,
1157 pub write_format: Spec,
1160 pub missing_values: MissingValues<V>,
1162 /// Optional variable label.
1163 pub label: Option<S>,
1166 impl<S, V> Debug for VariableRecord<S, V>
1171 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1176 match self.width.cmp(&0) {
1177 Ordering::Greater => "string",
1178 Ordering::Equal => "numeric",
1179 Ordering::Less => "long string continuation record",
1182 writeln!(f, "Print format: {:?}", self.print_format)?;
1183 writeln!(f, "Write format: {:?}", self.write_format)?;
1184 writeln!(f, "Name: {:?}", &self.name)?;
1185 writeln!(f, "Variable label: {:?}", self.label)?;
1186 writeln!(f, "Missing values: {:?}", self.missing_values)
1190 impl VariableRecord<RawString, RawStr<8>> {
/// Reads a variable record from `r`, starting at the current stream
/// position, and returns it as `Record::Variable`.
///
/// The fields are read in this order: width, variable-label flag, missing
/// value code, print format, write format, 8-byte name, then the optional
/// label and the missing values.
///
/// # Errors
///
/// Returns `Error::BadVariableLabelCode` for an unaccepted label flag, plus
/// whatever `MissingValues::read` and the underlying reads report.
1191 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1192 let start_offset = r.stream_position()?;
1193 let width: i32 = endian.parse(read_bytes(r)?);
// Offset of the label flag, kept for the `BadVariableLabelCode` error.
1194 let code_offset = r.stream_position()?;
1195 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
1196 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
1197 let print_format = Spec(endian.parse(read_bytes(r)?));
1198 let write_format = Spec(endian.parse(read_bytes(r)?));
1199 let name = RawString(read_vec(r, 8)?);
1201 let label = match has_variable_label {
1204 let len: u32 = endian.parse(read_bytes(r)?);
// At most 65535 bytes of the label are kept.
1205 let read_len = len.min(65535) as usize;
1206 let label = RawString(read_vec(r, read_len)?);
// Skip the padding that rounds the label up to a multiple of 4 bytes.
// NOTE(review): the padding is computed from `len`, not `read_len`.  When
// `len` exceeds 65535 the visible code does not skip the remaining
// `len - read_len` label bytes, which would leave the stream misaligned;
// `next_multiple_of` may also overflow for `len` close to `u32::MAX`.
// Confirm against the full function.
1208 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
1209 let _ = read_vec(r, padding_bytes as usize)?;
1214 return Err(Error::BadVariableLabelCode {
1217 code: has_variable_label,
1222 let missing_values =
1223 MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
1225 let end_offset = r.stream_position()?;
1227 Ok(Record::Variable(VariableRecord {
1228 offsets: start_offset..end_offset,
1238 pub fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord<Cow<'a, str>, String> {
1240 offsets: self.offsets.clone(),
1242 name: decoder.decode(&self.name),
1243 print_format: self.print_format,
1244 write_format: self.write_format,
1245 missing_values: self.missing_values.decode(decoder),
1246 label: self.label.as_ref().map(|label| decoder.decode(label)),
/// Eight raw bytes of a value whose type is not yet known.
///
/// `Value::from_raw` turns these bytes into a number or a string once the
/// variable type and byte order are supplied.
1251 #[derive(Copy, Clone)]
1252 pub struct UntypedValue(pub [u8; 8]);
1254 impl Debug for UntypedValue {
1255 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1256 let little: f64 = Endian::Little.parse(self.0);
1257 let little = format!("{:?}", little);
1258 let big: f64 = Endian::Big.parse(self.0);
1259 let big = format!("{:?}", big);
1260 let number = if little.len() <= big.len() {
1265 write!(f, "{number}")?;
1267 let string = default_decode(&self.0);
1269 .split(|c: char| c == '\0' || c.is_control())
1272 write!(f, "{string:?}")?;
1278 pub struct RawString(pub Vec<u8>);
1280 impl From<Vec<u8>> for RawString {
1281 fn from(source: Vec<u8>) -> Self {
1286 impl From<&[u8]> for RawString {
1287 fn from(source: &[u8]) -> Self {
1292 impl Debug for RawString {
1293 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1294 write!(f, "{:?}", default_decode(self.0.as_slice()))
/// A fixed-length string of `N` raw bytes that has not been decoded from the
/// file's encoding.
///
/// Its `Debug` output goes through `default_decode`, which shows the bytes as
/// UTF-8 when they are valid UTF-8 and as Latin-1 otherwise.
1298 #[derive(Copy, Clone)]
1299 pub struct RawStr<const N: usize>(pub [u8; N]);
1301 impl<const N: usize> From<[u8; N]> for RawStr<N> {
1302 fn from(source: [u8; N]) -> Self {
1307 impl<const N: usize> Debug for RawStr<N> {
1308 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1309 write!(f, "{:?}", default_decode(&self.0))
1313 #[derive(Clone, Debug)]
1314 pub struct ValueLabel<V, S>
1319 pub value: Value<V>,
1324 pub struct ValueLabelRecord<V, S>
1329 /// Range of offsets in file.
1330 pub offsets: Range<u64>,
1333 pub labels: Vec<ValueLabel<V, S>>,
1335 /// The 1-based indexes of the variables that the labels apply to.
1336 pub dict_indexes: Vec<u32>,
1338 /// The types of the variables.
1339 pub var_type: VarType,
1342 impl<V, S> Debug for ValueLabelRecord<V, S>
1347 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1348 writeln!(f, "labels: ")?;
1349 for label in self.labels.iter() {
1350 writeln!(f, "{label:?}")?;
1352 write!(f, "apply to {} variables", self.var_type)?;
1353 for dict_index in self.dict_indexes.iter() {
1354 write!(f, " #{dict_index}")?;
1360 impl<V, S> Header for ValueLabelRecord<V, S>
1365 fn offsets(&self) -> Range<u64> {
1366 self.offsets.clone()
1370 impl<V, S> ValueLabelRecord<V, S>
1375 /// Maximum number of value labels in a record.
1376 pub const MAX_LABELS: u32 = u32::MAX / 8;
1378 /// Maximum number of variable indexes in a record.
1379 pub const MAX_INDEXES: u32 = u32::MAX / 8;
1382 impl ValueLabelRecord<RawStr<8>, RawString> {
/// Reads a value label record and the variable index record that follows it.
///
/// `var_types` gives the type of each variable position seen so far; it is
/// used to validate the 1-based variable indexes and to decide how the raw
/// label values are interpreted.  Recoverable problems (invalid indexes, no
/// usable index, mixed variable types) are reported through `warn` rather
/// than as an `Err`.
///
/// # Errors
///
/// Returns `Error::BadNumberOfValueLabels`, `Error::ExpectedVarIndexRecord`
/// or `Error::TooManyVarIndexes` for structural problems; read failures are
/// propagated with `?`.
1383 fn read<R: Read + Seek>(
1386 var_types: &[VarType],
1387 warn: &Box<dyn Fn(Error)>,
1388 ) -> Result<Option<Record>, Error> {
1389 let label_offset = r.stream_position()?;
1390 let n: u32 = endian.parse(read_bytes(r)?);
1391 if n > Self::MAX_LABELS {
1392 return Err(Error::BadNumberOfValueLabels {
1393 offset: label_offset,
1395 max: Self::MAX_LABELS,
// The variable type is not known yet, so keep each value untyped for now.
1399 let mut labels = Vec::new();
1401 let value = UntypedValue(read_bytes(r)?);
1402 let label_len: u8 = endian.parse(read_bytes(r)?);
1403 let label_len = label_len as usize;
// The length byte plus the label text are padded to a multiple of 8 bytes.
1404 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1406 let mut label = read_vec(r, padded_len - 1)?;
1407 label.truncate(label_len);
1408 labels.push((value, RawString(label)));
1411 let index_offset = r.stream_position()?;
1412 let rec_type: u32 = endian.parse(read_bytes(r)?);
1414 return Err(Error::ExpectedVarIndexRecord {
1415 offset: index_offset,
1420 let n: u32 = endian.parse(read_bytes(r)?);
1421 if n > Self::MAX_INDEXES {
1422 return Err(Error::TooManyVarIndexes {
1423 offset: index_offset,
1425 max: Self::MAX_INDEXES,
1429 let index_offset = r.stream_position()?;
1430 let mut dict_indexes = Vec::with_capacity(n as usize);
1431 let mut invalid_indexes = Vec::new();
1433 let index: u32 = endian.parse(read_bytes(r)?);
// Keep only indexes in the valid range [1, var_types.len()].  (Previously
// the test was inverted, so out-of-range indexes were kept and later used
// as `var_types[index - 1]`, while valid ones were reported as invalid.)
1434 if index != 0 && index as usize <= var_types.len() {
1435 dict_indexes.push(index);
1437 invalid_indexes.push(index);
1440 if !invalid_indexes.is_empty() {
1441 warn(Error::InvalidVarIndexes {
1442 offset: index_offset,
1443 max: var_types.len(),
1444 invalid: invalid_indexes,
1448 let Some(&first_index) = dict_indexes.first() else {
1449 warn(Error::NoVarIndexes {
1450 offset: index_offset,
// Every retained index is in [1, var_types.len()], so `index - 1` is a
// valid position in `var_types`.  The first index decides the record's type.
1454 let var_type = var_types[first_index as usize - 1];
1455 let mut wrong_type_indexes = Vec::new();
1456 dict_indexes.retain(|&index| {
1457 if var_types[index as usize - 1] != var_type {
1458 wrong_type_indexes.push(index);
1464 if !wrong_type_indexes.is_empty() {
1465 warn(Error::MixedVarTypes {
1466 offset: index_offset,
1468 wrong_types: wrong_type_indexes,
// Now that the variable type is known, give each raw value its type.
1474 .map(|(value, label)| ValueLabel {
1475 value: Value::from_raw(&value, var_type, endian),
1480 let end_offset = r.stream_position()?;
1481 Ok(Some(Record::ValueLabel(ValueLabelRecord {
1482 offsets: label_offset..end_offset,
1490 #[derive(Clone, Debug)]
1491 pub struct DocumentRecord<S>
1495 pub offsets: Range<u64>,
1497 /// The document, as an array of 80-byte lines.
1501 pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;
1503 /// Length of a line in a document. Document lines are fixed-length and
1504 /// padded on the right with spaces.
1505 pub const DOC_LINE_LEN: usize = 80;
1507 impl DocumentRecord<RawDocumentLine> {
1508 /// Maximum number of lines we will accept in a document. This is simply
1509 /// the maximum number that will fit in a 32-bit space.
1510 pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
1512 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1513 let start_offset = r.stream_position()?;
1514 let n: u32 = endian.parse(read_bytes(r)?);
1516 if n > Self::MAX_LINES {
1517 Err(Error::BadDocumentLength {
1518 offset: start_offset,
1520 max: Self::MAX_LINES,
1523 let mut lines = Vec::with_capacity(n);
1525 lines.push(RawStr(read_bytes(r)?));
1527 let end_offset = r.stream_position()?;
1528 Ok(Record::Document(DocumentRecord {
1529 offsets: start_offset..end_offset,
1535 pub fn decode<'a>(&'a self, decoder: &Decoder) -> DocumentRecord<Cow<'a, str>> {
1537 offsets: self.offsets.clone(),
1541 .map(|s| decoder.decode_slice(&s.0))
1547 impl<S> Header for DocumentRecord<S>
1551 fn offsets(&self) -> Range<u64> {
1552 self.offsets.clone()
1556 trait ExtensionRecord {
1558 const SIZE: Option<u32>;
1559 const COUNT: Option<u32>;
1560 const NAME: &'static str;
1561 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error>;
1564 #[derive(Clone, Debug)]
1565 pub struct IntegerInfoRecord {
1566 pub offsets: Range<u64>,
1567 pub version: (i32, i32, i32),
1568 pub machine_code: i32,
1569 pub floating_point_rep: i32,
1570 pub compression_code: i32,
1571 pub endianness: i32,
1572 pub character_code: i32,
1575 impl ExtensionRecord for IntegerInfoRecord {
1576 const SUBTYPE: u32 = 3;
1577 const SIZE: Option<u32> = Some(4);
1578 const COUNT: Option<u32> = Some(8);
1579 const NAME: &'static str = "integer record";
1581 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1582 ext.check_size::<Self>()?;
1584 let mut input = &ext.data[..];
1585 let data: Vec<i32> = (0..8)
1586 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1588 Ok(Record::IntegerInfo(IntegerInfoRecord {
1589 offsets: ext.offsets.clone(),
1590 version: (data[0], data[1], data[2]),
1591 machine_code: data[3],
1592 floating_point_rep: data[4],
1593 compression_code: data[5],
1594 endianness: data[6],
1595 character_code: data[7],
1600 #[derive(Clone, Debug)]
1601 pub struct FloatInfoRecord {
1607 impl ExtensionRecord for FloatInfoRecord {
1608 const SUBTYPE: u32 = 4;
1609 const SIZE: Option<u32> = Some(8);
1610 const COUNT: Option<u32> = Some(3);
1611 const NAME: &'static str = "floating point record";
1613 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1614 ext.check_size::<Self>()?;
1616 let mut input = &ext.data[..];
1617 let data: Vec<f64> = (0..3)
1618 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1620 Ok(Record::FloatInfo(FloatInfoRecord {
1628 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1629 pub enum CategoryLabels {
1634 #[derive(Clone, Debug)]
1635 pub enum MultipleResponseType {
1638 labels: CategoryLabels,
1643 impl MultipleResponseType {
1644 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
1645 let (mr_type, input) = match input.split_first() {
1646 Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
1647 Some((b'D', input)) => {
1648 let (value, input) = parse_counted_string(input)?;
1650 MultipleResponseType::MultipleDichotomy {
1652 labels: CategoryLabels::VarLabels,
1657 Some((b'E', input)) => {
1658 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1659 (CategoryLabels::CountedValues, rest)
1660 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1661 (CategoryLabels::VarLabels, rest)
1663 return Err(Error::TBD);
1665 let (value, input) = parse_counted_string(input)?;
1667 MultipleResponseType::MultipleDichotomy { value, labels },
1671 _ => return Err(Error::TBD),
1673 Ok((mr_type, input))
1677 #[derive(Clone, Debug)]
1678 pub struct MultipleResponseSet<I, S>
1685 pub mr_type: MultipleResponseType,
1686 pub short_names: Vec<I>,
1689 impl MultipleResponseSet<RawString, RawString> {
1690 fn parse(input: &[u8]) -> Result<(Self, &[u8]), Error> {
1691 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1692 return Err(Error::TBD);
1694 let (name, input) = input.split_at(equals);
1695 let (mr_type, input) = MultipleResponseType::parse(input)?;
1696 let Some(input) = input.strip_prefix(b" ") else {
1697 return Err(Error::TBD);
1699 let (label, mut input) = parse_counted_string(input)?;
1700 let mut vars = Vec::new();
1701 while input.first() != Some(&b'\n') {
1702 match input.split_first() {
1703 Some((b' ', rest)) => {
1704 let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
1705 return Err(Error::TBD);
1707 let (var, rest) = rest.split_at(length);
1708 if !var.is_empty() {
1709 vars.push(var.into());
1713 _ => return Err(Error::TBD),
1716 while input.first() == Some(&b'\n') {
1717 input = &input[1..];
1720 MultipleResponseSet {
1733 ) -> Result<MultipleResponseSet<Identifier, Cow<'a, str>>, Error> {
1734 let mut short_names = Vec::with_capacity(self.short_names.len());
1735 for short_name in self.short_names.iter() {
1736 if let Some(short_name) = decoder
1737 .decode_identifier(short_name)
1738 .map_err(|err| Error::InvalidMrSetName(err))
1739 .warn_on_error(&decoder.warn)
1741 short_names.push(short_name);
1744 Ok(MultipleResponseSet {
1746 .decode_identifier(&self.name)
1747 .map_err(|err| Error::InvalidMrSetVariableName(err))?,
1748 label: decoder.decode(&self.label),
1749 mr_type: self.mr_type.clone(),
1750 short_names: short_names,
1755 #[derive(Clone, Debug)]
1756 pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
1761 impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
1762 const SUBTYPE: u32 = 7;
1763 const SIZE: Option<u32> = Some(1);
1764 const COUNT: Option<u32> = None;
1765 const NAME: &'static str = "multiple response set record";
1767 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
1768 ext.check_size::<Self>()?;
1770 let mut input = &ext.data[..];
1771 let mut sets = Vec::new();
1772 while !input.is_empty() {
1773 let (set, rest) = MultipleResponseSet::parse(input)?;
1777 Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
1781 impl MultipleResponseRecord<RawString, RawString> {
1782 fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseRecord<Identifier, Cow<'a, str>> {
1783 let mut sets = Vec::new();
1784 for set in self.0.iter() {
1785 if let Some(set) = set.decode(decoder).warn_on_error(&decoder.warn) {
1789 MultipleResponseRecord(sets)
1793 fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Error> {
1794 let Some(space) = input.iter().position(|&b| b == b' ') else {
1795 return Err(Error::TBD);
1797 let Ok(length) = from_utf8(&input[..space]) else {
1798 return Err(Error::TBD);
1800 let Ok(length): Result<usize, _> = length.parse() else {
1801 return Err(Error::TBD);
1804 let input = &input[space + 1..];
1805 if input.len() < length {
1806 return Err(Error::TBD);
1809 let (string, rest) = input.split_at(length);
1810 Ok((string.into(), rest))
1813 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1821 pub fn default_for_type(var_type: VarType) -> Option<Measure> {
1823 VarType::Numeric => None,
1824 VarType::String => Some(Self::Nominal),
1828 fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
1831 1 => Ok(Some(Measure::Nominal)),
1832 2 => Ok(Some(Measure::Ordinal)),
1833 3 => Ok(Some(Measure::Scale)),
1834 _ => Err(Error::InvalidMeasurement(source)),
1839 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1840 pub enum Alignment {
1847 fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
1850 1 => Ok(Some(Alignment::Left)),
1851 2 => Ok(Some(Alignment::Right)),
1852 3 => Ok(Some(Alignment::Center)),
1853 _ => Err(Error::InvalidAlignment(source)),
1857 pub fn default_for_type(var_type: VarType) -> Self {
1859 VarType::Numeric => Self::Right,
1860 VarType::String => Self::Left,
1865 #[derive(Clone, Debug)]
1866 pub struct VarDisplay {
1867 pub measure: Option<Measure>,
1868 pub width: Option<u32>,
1869 pub alignment: Option<Alignment>,
1872 #[derive(Clone, Debug)]
1873 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
1875 impl VarDisplayRecord {
1876 const SUBTYPE: u32 = 11;
1882 warn: &Box<dyn Fn(Error)>,
1883 ) -> Result<Record, Error> {
1885 return Err(Error::BadRecordSize {
1886 offset: ext.offsets.start,
1887 record: String::from("variable display record"),
1893 let has_width = if ext.count as usize == 3 * n_vars {
1895 } else if ext.count as usize == 2 * n_vars {
1898 return Err(Error::TBD);
1901 let mut var_displays = Vec::new();
1902 let mut input = &ext.data[..];
1903 for _ in 0..n_vars {
1904 let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1905 .warn_on_error(&warn)
1907 let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
1908 let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1909 .warn_on_error(&warn)
1911 var_displays.push(VarDisplay {
1917 Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
1921 #[derive(Clone, Debug)]
1922 pub struct LongStringMissingValues<N, V>
1931 pub missing_values: MissingValues<V>,
1934 impl LongStringMissingValues<RawString, RawStr<8>> {
1938 ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
1939 Ok(LongStringMissingValues {
1940 var_name: decoder.decode_identifier(&self.var_name)?,
1941 missing_values: self.missing_values.decode(decoder),
1946 #[derive(Clone, Debug)]
1947 pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
1952 impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
1953 const SUBTYPE: u32 = 22;
1954 const SIZE: Option<u32> = Some(1);
1955 const COUNT: Option<u32> = None;
1956 const NAME: &'static str = "long string missing values record";
1958 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1959 ext.check_size::<Self>()?;
1961 let mut input = &ext.data[..];
1962 let mut missing_value_set = Vec::new();
1963 while !input.is_empty() {
1964 let var_name = read_string(&mut input, endian)?;
1965 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1966 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1968 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
1969 return Err(Error::BadLongMissingValueLength {
1970 record_offset: ext.offsets.start,
1975 let mut values = Vec::new();
1976 for i in 0..n_missing_values {
1977 let value: [u8; 8] = read_bytes(&mut input)?;
1978 let numeric_value: u64 = endian.parse(value);
1979 let value = if i > 0 && numeric_value == 8 {
1980 // Tolerate files written by old, buggy versions of PSPP
1981 // where we believed that the value_length was repeated
1982 // before each missing value.
1983 read_bytes(&mut input)?
1987 values.push(Value::String(RawStr(value)));
1989 let missing_values = MissingValues {
1993 missing_value_set.push(LongStringMissingValues {
1998 Ok(Record::LongStringMissingValues(
1999 LongStringMissingValueRecord(missing_value_set),
2004 impl LongStringMissingValueRecord<RawString, RawStr<8>> {
2008 ) -> LongStringMissingValueRecord<Identifier, String> {
2009 let mut mvs = Vec::with_capacity(self.0.len());
2010 for mv in self.0.iter() {
2011 if let Some(mv) = mv
2013 .map_err(|err| Error::InvalidLongStringMissingValueVariableName(err))
2014 .warn_on_error(&decoder.warn)
2019 LongStringMissingValueRecord(mvs)
2023 #[derive(Clone, Debug)]
2024 pub struct EncodingRecord(pub String);
2026 impl ExtensionRecord for EncodingRecord {
2027 const SUBTYPE: u32 = 20;
2028 const SIZE: Option<u32> = Some(1);
2029 const COUNT: Option<u32> = None;
2030 const NAME: &'static str = "encoding record";
2032 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
2033 ext.check_size::<Self>()?;
2035 Ok(Record::Encoding(EncodingRecord(
2036 String::from_utf8(ext.data.clone()).map_err(|_| Error::BadEncodingName {
2037 offset: ext.offsets.start,
2043 #[derive(Copy, Clone, Debug)]
2044 pub struct NumberOfCasesRecord {
2045 /// Always observed as 1.
2048 /// Number of cases.
2052 impl ExtensionRecord for NumberOfCasesRecord {
2053 const SUBTYPE: u32 = 16;
2054 const SIZE: Option<u32> = Some(8);
2055 const COUNT: Option<u32> = Some(2);
2056 const NAME: &'static str = "extended number of cases record";
2058 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
2059 ext.check_size::<Self>()?;
2061 let mut input = &ext.data[..];
2062 let one = endian.parse(read_bytes(&mut input)?);
2063 let n_cases = endian.parse(read_bytes(&mut input)?);
2065 Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
2069 #[derive(Clone, Debug)]
2070 pub struct TextRecord {
2071 pub offsets: Range<u64>,
2074 pub rec_type: TextRecordType,
2076 /// The text content of the record.
2077 pub text: RawString,
2080 #[derive(Clone, Copy, Debug)]
2081 pub enum TextRecordType {
2091 fn new(extension: Extension, rec_type: TextRecordType) -> Self {
2093 offsets: extension.offsets,
2095 text: extension.data.into(),
2098 pub fn decode<'a>(&self, decoder: &Decoder) -> Result<Option<Record>, Error> {
2099 match self.rec_type {
2100 TextRecordType::VariableSets => Ok(Some(Record::VariableSets(
2101 VariableSetRecord::decode(self, decoder),
2103 TextRecordType::ProductInfo => Ok(Some(Record::ProductInfo(
2104 ProductInfoRecord::decode(self, decoder),
2106 TextRecordType::LongNames => Ok(Some(Record::LongNames(LongNamesRecord::decode(
2109 TextRecordType::VeryLongStrings => Ok(Some(Record::VeryLongStrings(
2110 VeryLongStringsRecord::decode(self, decoder),
2112 TextRecordType::FileAttributes => {
2113 Ok(FileAttributeRecord::decode(self, decoder).map(|fa| Record::FileAttributes(fa)))
2115 TextRecordType::VariableAttributes => Ok(Some(Record::VariableAttributes(
2116 VariableAttributeRecord::decode(self, decoder),
2122 #[derive(Clone, Debug)]
2123 pub struct VeryLongString {
2124 pub short_name: Identifier,
2128 impl VeryLongString {
2129 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
2130 let Some((short_name, length)) = input.split_once('=') else {
2131 return Err(Error::TBD);
2133 let short_name = decoder
2134 .new_identifier(short_name)
2135 .map_err(Error::InvalidLongStringName)?;
2136 let length = length.parse().map_err(|_| Error::TBD)?;
2137 Ok(VeryLongString { short_name, length })
2141 #[derive(Clone, Debug)]
2142 pub struct VeryLongStringsRecord(Vec<VeryLongString>);
2144 impl VeryLongStringsRecord {
2145 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2146 let input = decoder.decode(&source.text);
2147 let mut very_long_strings = Vec::new();
2150 .map(|s| s.trim_end_matches('\t'))
2151 .filter(|s| !s.is_empty())
2153 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&decoder.warn) {
2154 very_long_strings.push(vls)
2157 VeryLongStringsRecord(very_long_strings)
2161 #[derive(Clone, Debug)]
2162 pub struct Attribute {
2163 pub name: Identifier,
2164 pub values: Vec<String>,
2168 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Error> {
2169 let Some((name, mut input)) = input.split_once('(') else {
2170 return Err(Error::TBD);
2173 .new_identifier(name)
2174 .map_err(Error::InvalidAttributeName)?;
2175 let mut values = Vec::new();
2177 let Some((value, rest)) = input.split_once('\n') else {
2178 return Err(Error::TBD);
2180 if let Some(stripped) = value
2182 .and_then(|value| value.strip_suffix('\''))
2184 values.push(stripped.into());
2186 decoder.warn(Error::TBD);
2187 values.push(value.into());
2189 if let Some(rest) = rest.strip_prefix(')') {
2190 let attribute = Attribute { name, values };
2191 return Ok((attribute, rest));
2198 #[derive(Clone, Debug)]
2199 pub struct AttributeSet(pub HashMap<Identifier, Vec<String>>);
2205 sentinel: Option<char>,
2206 ) -> Result<(AttributeSet, &'a str), Error> {
2207 let mut attributes = HashMap::new();
2209 match input.chars().next() {
2210 None => break input,
2211 c if c == sentinel => break &input[1..],
2213 let (attribute, rest) = Attribute::parse(decoder, input)?;
2214 // XXX report duplicate name
2215 attributes.insert(attribute.name, attribute.values);
2220 Ok((AttributeSet(attributes), rest))
2224 #[derive(Clone, Debug)]
2225 pub struct FileAttributeRecord(AttributeSet);
2227 impl FileAttributeRecord {
2228 fn decode(source: &TextRecord, decoder: &Decoder) -> Option<Self> {
2229 let input = decoder.decode(&source.text);
2230 match AttributeSet::parse(decoder, &input, None).warn_on_error(&decoder.warn) {
2231 Some((set, rest)) => {
2232 if !rest.is_empty() {
2233 decoder.warn(Error::TBD);
2235 Some(FileAttributeRecord(set))
2242 #[derive(Clone, Debug)]
2243 pub struct VarAttributeSet {
2244 pub long_var_name: Identifier,
2245 pub attributes: AttributeSet,
2248 impl VarAttributeSet {
2249 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Error> {
2250 let Some((long_var_name, rest)) = input.split_once(':') else {
2251 return Err(Error::TBD);
2253 let long_var_name = decoder
2254 .new_identifier(long_var_name)
2255 .map_err(Error::InvalidAttributeVariableName)?;
2256 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
2257 let var_attribute = VarAttributeSet {
2261 Ok((var_attribute, rest))
2265 #[derive(Clone, Debug)]
2266 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
2268 impl VariableAttributeRecord {
2269 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2270 let decoded = decoder.decode(&source.text);
2271 let mut input = decoded.as_ref();
2272 let mut var_attribute_sets = Vec::new();
2273 while !input.is_empty() {
2274 let Some((var_attribute, rest)) =
2275 VarAttributeSet::parse(decoder, &input).warn_on_error(&decoder.warn)
2279 var_attribute_sets.push(var_attribute);
2280 input = rest.into();
2282 VariableAttributeRecord(var_attribute_sets)
2286 #[derive(Clone, Debug)]
2287 pub struct LongName {
2288 pub short_name: Identifier,
2289 pub long_name: Identifier,
2293 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Error> {
2294 let Some((short_name, long_name)) = input.split_once('=') else {
2295 return Err(Error::TBD);
2297 let short_name = decoder
2298 .new_identifier(short_name)
2299 .map_err(Error::InvalidShortName)?;
2300 let long_name = decoder
2301 .new_identifier(long_name)
2302 .map_err(Error::InvalidLongName)?;
2310 #[derive(Clone, Debug)]
2311 pub struct LongNamesRecord(Vec<LongName>);
2313 impl LongNamesRecord {
2314 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2315 let input = decoder.decode(&source.text);
2316 let mut names = Vec::new();
2317 for pair in input.split('\t').filter(|s| !s.is_empty()) {
2318 if let Some(long_name) = LongName::parse(pair, decoder).warn_on_error(&decoder.warn) {
2319 names.push(long_name);
2322 LongNamesRecord(names)
2326 #[derive(Clone, Debug)]
2327 pub struct ProductInfoRecord(pub String);
2329 impl ProductInfoRecord {
2330 const NAME: &'static str = "extra product info";
2331 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2332 Self(decoder.decode(&source.text).into())
2335 #[derive(Clone, Debug)]
2336 pub struct VariableSet {
2338 pub vars: Vec<Identifier>,
2342 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Error> {
2343 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
2344 let mut vars = Vec::new();
2345 for var in input.split_ascii_whitespace() {
2346 if let Some(identifier) = decoder
2347 .new_identifier(var)
2348 .map_err(Error::InvalidVariableSetName)
2349 .warn_on_error(&decoder.warn)
2351 vars.push(identifier);
2361 #[derive(Clone, Debug)]
2362 pub struct VariableSetRecord {
2363 pub offsets: Range<u64>,
2364 pub sets: Vec<VariableSet>,
2367 impl VariableSetRecord {
2368 fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
2369 let mut sets = Vec::new();
2370 let input = decoder.decode(&source.text);
2371 for line in input.lines() {
2372 if let Some(set) = VariableSet::parse(line, decoder).warn_on_error(&decoder.warn) {
2377 offsets: source.offsets.clone(),
2383 trait WarnOnError<T> {
2384 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
2386 impl<T> WarnOnError<T> for Result<T, Error> {
2387 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
2389 Ok(result) => Some(result),
2398 #[derive(Clone, Debug)]
2399 pub struct Extension {
2400 pub offsets: Range<u64>,
2405 /// Size of each data element.
2408 /// Number of data elements.
2411 /// `size * count` bytes of data.
2416 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
2417 if let Some(expected_size) = E::SIZE {
2418 if self.size != expected_size {
2419 return Err(Error::BadRecordSize {
2420 offset: self.offsets.start,
2421 record: E::NAME.into(),
2427 if let Some(expected_count) = E::COUNT {
2428 if self.count != expected_count {
2429 return Err(Error::BadRecordCount {
2430 offset: self.offsets.start,
2431 record: E::NAME.into(),
2440 fn read<R: Read + Seek>(
2444 warn: &Box<dyn Fn(Error)>,
2445 ) -> Result<Option<Record>, Error> {
2446 let subtype = endian.parse(read_bytes(r)?);
2447 let header_offset = r.stream_position()?;
2448 let size: u32 = endian.parse(read_bytes(r)?);
2449 let count = endian.parse(read_bytes(r)?);
2450 let Some(product) = size.checked_mul(count) else {
2451 return Err(Error::ExtensionRecordTooLarge {
2452 offset: header_offset,
2458 let start_offset = r.stream_position()?;
2459 let data = read_vec(r, product as usize)?;
2460 let end_offset = start_offset + product as u64;
2461 let extension = Extension {
2462 offsets: start_offset..end_offset,
2468 let result = match subtype {
2469 IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
2470 FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
2471 VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
2472 MultipleResponseRecord::SUBTYPE | 19 => {
2473 MultipleResponseRecord::parse(&extension, endian)
2475 LongStringValueLabelRecord::SUBTYPE => {
2476 LongStringValueLabelRecord::parse(&extension, endian)
2478 EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
2479 NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
2480 5 => Ok(Record::Text(TextRecord::new(
2482 TextRecordType::VariableSets,
2484 10 => Ok(Record::Text(TextRecord::new(
2486 TextRecordType::ProductInfo,
2488 13 => Ok(Record::Text(TextRecord::new(
2490 TextRecordType::LongNames,
2492 14 => Ok(Record::Text(TextRecord::new(
2494 TextRecordType::VeryLongStrings,
2496 17 => Ok(Record::Text(TextRecord::new(
2498 TextRecordType::FileAttributes,
2500 18 => Ok(Record::Text(TextRecord::new(
2502 TextRecordType::VariableAttributes,
2504 _ => Ok(Record::OtherExtension(extension)),
2507 Ok(result) => Ok(Some(result)),
2516 #[derive(Clone, Debug)]
2517 pub struct ZHeader {
2518 /// File offset to the start of the record.
2521 /// File offset to the ZLIB data header.
2522 pub zheader_offset: u64,
2524 /// File offset to the ZLIB trailer.
2525 pub ztrailer_offset: u64,
2527 /// Length of the ZLIB trailer in bytes.
2528 pub ztrailer_len: u64,
2532 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
2533 let offset = r.stream_position()?;
2534 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
2535 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
2536 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
2547 #[derive(Clone, Debug)]
2548 pub struct ZTrailer {
2549 /// File offset to the start of the record.
2552 /// Compression bias as a negative integer, e.g. -100.
2555 /// Always observed as zero.
2558 /// Uncompressed size of each block, except possibly the last. Only
2559 /// `0x3ff000` has been observed so far.
2560 pub block_size: u32,
/// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
2563 pub blocks: Vec<ZBlock>,
2566 #[derive(Clone, Debug)]
2568 /// Offset of block of data if simple compression were used.
2569 pub uncompressed_ofs: u64,
2571 /// Actual offset within the file of the compressed data block.
2572 pub compressed_ofs: u64,
2574 /// The number of bytes in this data block after decompression. This is
2575 /// `block_size` in every data block but the last, which may be smaller.
2576 pub uncompressed_size: u32,
2578 /// The number of bytes in this data block, as stored compressed in this
2580 pub compressed_size: u32,
2584 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
2586 uncompressed_ofs: endian.parse(read_bytes(r)?),
2587 compressed_ofs: endian.parse(read_bytes(r)?),
2588 uncompressed_size: endian.parse(read_bytes(r)?),
2589 compressed_size: endian.parse(read_bytes(r)?),
2595 fn read<R: Read + Seek>(
2600 ) -> Result<Option<ZTrailer>, Error> {
2601 let start_offset = reader.stream_position()?;
2602 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
2605 let int_bias = endian.parse(read_bytes(reader)?);
2606 let zero = endian.parse(read_bytes(reader)?);
2607 let block_size = endian.parse(read_bytes(reader)?);
2608 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
2609 let expected_n_blocks = (ztrailer_len - 24) / 24;
2610 if n_blocks as u64 != expected_n_blocks {
2611 return Err(Error::BadZlibTrailerNBlocks {
2612 offset: ztrailer_ofs,
2618 let blocks = (0..n_blocks)
2619 .map(|_| ZBlock::read(reader, endian))
2620 .collect::<Result<Vec<_>, _>>()?;
2621 reader.seek(SeekFrom::Start(start_offset))?;
2623 offset: ztrailer_ofs,
2632 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
2633 let mut buf = [0; N];
2634 let n = r.read(&mut buf)?;
2637 r.read_exact(&mut buf[n..])?;
/// Reads exactly `N` bytes from `r` and returns them as an array.
///
/// # Errors
///
/// Returns the underlying I/O error if `r` cannot supply `N` bytes.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    match r.read_exact(&mut bytes) {
        Ok(()) => Ok(bytes),
        Err(error) => Err(error),
    }
}
/// Reads exactly `n` bytes from `r` into a new vector.
///
/// `n` usually comes straight from the file being parsed, so the buffer is
/// not allocated at full size up front.  A bounded amount is reserved and the
/// vector grows only as data actually arrives, which keeps a corrupt length
/// field from forcing a huge allocation.
///
/// # Errors
///
/// Returns an error of kind `UnexpectedEof` if `r` ends before `n` bytes
/// have been read, or the underlying I/O error if reading fails.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    /// Most bytes reserved before any data has been read.
    const MAX_PREALLOCATION: usize = 64 * 1024;

    let mut vec = Vec::with_capacity(n.min(MAX_PREALLOCATION));
    r.by_ref().take(n as u64).read_to_end(&mut vec)?;
    if vec.len() != n {
        return Err(IoError::from(std::io::ErrorKind::UnexpectedEof));
    }
    Ok(vec)
}
2657 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
2658 let length: u32 = endian.parse(read_bytes(r)?);
2659 Ok(read_vec(r, length as usize)?.into())
2662 #[derive(Clone, Debug)]
2663 pub struct LongStringValueLabels<S>
2670 /// `(value, label)` pairs, where each value is `width` bytes.
2671 pub labels: Vec<(S, S)>,
2674 #[derive(Clone, Debug)]
2675 pub struct LongStringValueLabelRecord<S>(pub Vec<LongStringValueLabels<S>>)
2679 impl ExtensionRecord for LongStringValueLabelRecord<RawString> {
2680 const SUBTYPE: u32 = 21;
2681 const SIZE: Option<u32> = Some(1);
2682 const COUNT: Option<u32> = None;
2683 const NAME: &'static str = "long string value labels record";
2685 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
2686 ext.check_size::<Self>()?;
2688 let mut input = &ext.data[..];
2689 let mut label_set = Vec::new();
2690 while !input.is_empty() {
2691 let var_name = read_string(&mut input, endian)?;
2692 let width: u32 = endian.parse(read_bytes(&mut input)?);
2693 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
2694 let mut labels = Vec::new();
2695 for _ in 0..n_labels {
2696 let value = read_string(&mut input, endian)?;
2697 let label = read_string(&mut input, endian)?;
2698 labels.push((value, label));
2700 label_set.push(LongStringValueLabels {
2706 Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(