3 endian::{Endian, Parse, ToBytes},
4 identifier::{Error as IdError, Identifier},
7 use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
8 use flate2::read::ZlibDecoder;
14 collections::VecDeque,
15 fmt::{Debug, Display, Formatter, Result as FmtResult},
16 io::{Error as IoError, Read, Seek, SeekFrom},
23 use thiserror::Error as ThisError;
25 #[derive(ThisError, Debug)]
27 #[error("Not an SPSS system file")]
30 #[error("Invalid magic number {0:?}")]
33 #[error("I/O error ({0})")]
36 #[error("Invalid SAV compression code {0}")]
37 InvalidSavCompression(u32),
39 #[error("Invalid ZSAV compression code {0}")]
40 InvalidZsavCompression(u32),
42 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
43 BadVariableWidth { offset: u64, width: i32 },
45 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
46 BadDocumentLength { offset: u64, n: usize, max: usize },
48 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
49 BadRecordType { offset: u64, rec_type: u32 },
51 #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
52 BadVariableLabelCode {
59 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
61 BadNumericMissingValueCode { offset: u64, code: i32 },
63 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
64 BadStringMissingValueCode { offset: u64, code: i32 },
66 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
67 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
69 #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
70 ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
72 #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
73 TooManyVarIndexes { offset: u64, n: u32, max: u32 },
75 #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
76 NoVarIndexes { offset: u64 },
78 #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
82 wrong_types: Vec<u32>,
85 #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
92 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
93 ExtensionRecordTooLarge {
100 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
108 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
110 EofInCompressedCase { offset: u64, case_ofs: u64 },
112 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
113 PartialCompressedCase { offset: u64, case_ofs: u64 },
115 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
116 CompressedNumberExpected { offset: u64, case_ofs: u64 },
118 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
119 CompressedStringExpected { offset: u64, case_ofs: u64 },
121 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
122 BadZlibTrailerNBlocks {
125 expected_n_blocks: u64,
129 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
137 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
145 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
146 BadLongMissingValueLength {
152 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
153 BadEncodingName { offset: u64 },
155 // XXX This is risky because `text` might be arbitrarily long.
156 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
157 MalformedString { encoding: String, text: String },
159 #[error("Invalid variable measurement level value {0}")]
160 InvalidMeasurement(u32),
162 #[error("Invalid variable display alignment value {0}")]
163 InvalidAlignment(u32),
165 #[error("Invalid attribute name. {0}")]
166 InvalidAttributeName(IdError),
168 #[error("Invalid variable name in attribute record. {0}")]
169 InvalidAttributeVariableName(IdError),
171 #[error("Invalid short name in long variable name record. {0}")]
172 InvalidShortName(IdError),
174 #[error("Invalid name in long variable name record. {0}")]
175 InvalidLongName(IdError),
177 #[error("Invalid variable name in very long string record. {0}")]
178 InvalidLongStringName(IdError),
180 #[error("Invalid variable name in variable set record. {0}")]
181 InvalidVariableSetName(IdError),
183 #[error("Invalid multiple response set name. {0}")]
184 InvalidMrSetName(IdError),
186 #[error("Invalid multiple response set variable name. {0}")]
187 InvalidMrSetVariableName(IdError),
189 #[error("Invalid variable name in long string missing values record. {0}")]
190 InvalidLongStringMissingValueVariableName(IdError),
192 #[error("Details TBD")]
196 #[derive(Clone, Debug)]
198 Header(HeaderRecord<RawString>),
199 Variable(VariableRecord<RawString, RawStr<8>>),
200 ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
201 Document(DocumentRecord<RawDocumentLine>),
202 IntegerInfo(IntegerInfoRecord),
203 FloatInfo(FloatInfoRecord),
204 VariableSets(VariableSetRecord),
205 VarDisplay(VarDisplayRecord),
206 MultipleResponse(MultipleResponseRecord<RawString, RawString>),
207 LongStringValueLabels(LongStringValueLabelRecord<RawString>),
208 LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
209 Encoding(EncodingRecord),
210 NumberOfCases(NumberOfCasesRecord),
211 ProductInfo(ProductInfoRecord),
212 LongNames(LongNamesRecord),
213 VeryLongStrings(VeryLongStringsRecord),
214 FileAttributes(FileAttributeRecord),
215 VariableAttributes(VariableAttributeRecord),
217 OtherExtension(Extension),
221 Cases(Rc<RefCell<Cases>>),
228 var_types: &[VarType],
229 warn: &Box<dyn Fn(Error)>,
230 ) -> Result<Option<Record>, Error>
234 let rec_type: u32 = endian.parse(read_bytes(reader)?);
236 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
237 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
238 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
239 7 => Extension::read(reader, endian, var_types.len(), warn),
240 999 => Ok(Some(Record::EndOfHeaders(
241 endian.parse(read_bytes(reader)?),
243 _ => Err(Error::BadRecordType {
244 offset: reader.stream_position()?,
251 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
252 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
253 fn default_decode(s: &[u8]) -> Cow<str> {
254 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
257 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
258 pub enum Compression {
264 fn offsets(&self) -> Range<u64>;
268 pub struct HeaderRecord<S>
273 pub offsets: Range<u64>,
278 /// Eye-catcher string, product name, in the file's encoding. Padded
279 /// on the right with spaces.
282 /// Layout code, normally either 2 or 3.
283 pub layout_code: u32,
285 /// Number of variable positions, or `None` if the value in the file is
286 /// questionably trustworthy.
287 pub nominal_case_size: Option<u32>,
289 /// Compression type, if any,
290 pub compression: Option<Compression>,
292 /// 1-based variable index of the weight variable, or `None` if the file is
294 pub weight_index: Option<u32>,
296 /// Claimed number of cases, if known.
297 pub n_cases: Option<u32>,
299 /// Compression bias, usually 100.0.
302 /// `dd mmm yy` in the file's encoding.
303 pub creation_date: S,
305 /// `HH:MM:SS` in the file's encoding.
306 pub creation_time: S,
308 /// File label, in the file's encoding. Padded on the right with spaces.
311 /// Endianness of the data in the file header.
315 impl<S> HeaderRecord<S>
319 fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
323 writeln!(f, "{name:>17}: {:?}", value)
327 impl<S> Debug for HeaderRecord<S>
331 fn fmt(&self, f: &mut Formatter) -> FmtResult {
332 writeln!(f, "File header record:")?;
333 self.debug_field(f, "Magic", self.magic)?;
334 self.debug_field(f, "Product name", &self.eye_catcher)?;
335 self.debug_field(f, "Layout code", self.layout_code)?;
336 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
337 self.debug_field(f, "Compression", self.compression)?;
338 self.debug_field(f, "Weight index", self.weight_index)?;
339 self.debug_field(f, "Number of cases", self.n_cases)?;
340 self.debug_field(f, "Compression bias", self.bias)?;
341 self.debug_field(f, "Creation date", &self.creation_date)?;
342 self.debug_field(f, "Creation time", &self.creation_time)?;
343 self.debug_field(f, "File label", &self.file_label)?;
344 self.debug_field(f, "Endianness", self.endian)
348 impl HeaderRecord<RawString> {
349 fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
350 let start = r.stream_position()?;
352 let magic: [u8; 4] = read_bytes(r)?;
353 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
355 let eye_catcher = RawString(read_vec(r, 60)?);
356 let layout_code: [u8; 4] = read_bytes(r)?;
357 let endian = Endian::identify_u32(2, layout_code)
358 .or_else(|| Endian::identify_u32(2, layout_code))
359 .ok_or_else(|| Error::NotASystemFile)?;
360 let layout_code = endian.parse(layout_code);
362 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
363 let nominal_case_size =
364 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
366 let compression_code: u32 = endian.parse(read_bytes(r)?);
367 let compression = match (magic, compression_code) {
368 (Magic::Zsav, 2) => Some(Compression::ZLib),
369 (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
371 (_, 1) => Some(Compression::Simple),
372 (_, code) => return Err(Error::InvalidSavCompression(code)),
375 let weight_index: u32 = endian.parse(read_bytes(r)?);
376 let weight_index = (weight_index > 0).then_some(weight_index);
378 let n_cases: u32 = endian.parse(read_bytes(r)?);
379 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
381 let bias: f64 = endian.parse(read_bytes(r)?);
383 let creation_date = RawString(read_vec(r, 9)?);
384 let creation_time = RawString(read_vec(r, 8)?);
385 let file_label = RawString(read_vec(r, 64)?);
386 let _: [u8; 3] = read_bytes(r)?;
389 offsets: start..r.stream_position()?,
405 pub fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord<Cow<'a, str>> {
406 let eye_catcher = decoder.decode(&self.eye_catcher);
407 let file_label = decoder.decode(&self.file_label);
408 let creation_date = decoder.decode(&self.creation_date);
409 let creation_time = decoder.decode(&self.creation_time);
412 weight_index: self.weight_index,
413 n_cases: self.n_cases,
415 offsets: self.offsets.clone(),
417 layout_code: self.layout_code,
418 nominal_case_size: self.nominal_case_size,
419 compression: self.compression,
429 pub encoding: &'static Encoding,
430 pub warn: Box<dyn Fn(Error)>,
434 fn warn(&self, error: Error) {
437 fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
438 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
440 self.warn(Error::MalformedString {
441 encoding: self.encoding.name().into(),
442 text: output.clone().into(),
448 fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
449 self.decode_slice(input.0.as_slice())
452 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
453 /// re-encoding the result back into `self.encoding` will have exactly the
454 /// same length in bytes.
456 /// XXX warn about errors?
457 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
458 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
459 // This is the common case. Usually there will be no errors.
462 // Unusual case. Don't bother to optimize it much.
463 let mut decoder = self.encoding.new_decoder_without_bom_handling();
464 let mut output = String::with_capacity(
466 .max_utf8_buffer_length_without_replacement(input.len())
469 let mut rest = input;
470 while !rest.is_empty() {
471 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
472 (DecoderResult::InputEmpty, _) => break,
473 (DecoderResult::OutputFull, _) => unreachable!(),
474 (DecoderResult::Malformed(a, b), consumed) => {
475 let skipped = a as usize + b as usize;
476 output.extend(repeat('?').take(skipped));
477 rest = &rest[consumed..];
481 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
486 pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
487 self.new_identifier(&self.decode(input))
490 pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
491 Identifier::new(name, self.encoding)
495 impl<S> Header for HeaderRecord<S>
499 fn offsets(&self) -> Range<u64> {
504 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
506 /// Regular system file.
509 /// System file with Zlib-compressed data.
512 /// EBCDIC-encoded system file.
517 /// Magic number for a regular system file.
518 pub const SAV: [u8; 4] = *b"$FL2";
520 /// Magic number for a system file that contains zlib-compressed data.
521 pub const ZSAV: [u8; 4] = *b"$FL3";
523 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
525 pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
528 impl Debug for Magic {
529 fn fmt(&self, f: &mut Formatter) -> FmtResult {
530 let s = match *self {
531 Magic::Sav => "$FL2",
532 Magic::Zsav => "$FL3",
533 Magic::Ebcdic => "($FL2 in EBCDIC)",
539 impl TryFrom<[u8; 4]> for Magic {
542 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
544 Magic::SAV => Ok(Magic::Sav),
545 Magic::ZSAV => Ok(Magic::Zsav),
546 Magic::EBCDIC => Ok(Magic::Ebcdic),
547 _ => Err(Error::BadMagic(value)),
552 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
559 pub fn from_width(width: VarWidth) -> VarType {
561 VarWidth::Numeric => Self::Numeric,
562 VarWidth::String(_) => Self::String,
566 pub fn opposite(self) -> VarType {
568 Self::Numeric => Self::String,
569 Self::String => Self::Numeric,
574 impl Display for VarType {
575 fn fmt(&self, f: &mut Formatter) -> FmtResult {
577 VarType::Numeric => write!(f, "numeric"),
578 VarType::String => write!(f, "string"),
583 #[derive(Copy, Clone)]
592 type RawValue = Value<RawStr<8>>;
594 impl<S> Debug for Value<S>
598 fn fmt(&self, f: &mut Formatter) -> FmtResult {
600 Value::Number(Some(number)) => write!(f, "{number:?}"),
601 Value::Number(None) => write!(f, "SYSMIS"),
602 Value::String(s) => write!(f, "{:?}", s),
608 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
610 &UntypedValue(read_bytes(r)?),
616 pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
618 VarType::String => Value::String(RawStr(raw.0)),
619 VarType::Numeric => {
620 let number: f64 = endian.parse(raw.0);
621 Value::Number((number != -f64::MAX).then_some(number))
626 fn read_case<R: Read + Seek>(
628 var_types: &[VarType],
630 ) -> Result<Option<Vec<Self>>, Error> {
631 let case_start = reader.stream_position()?;
632 let mut values = Vec::with_capacity(var_types.len());
633 for (i, &var_type) in var_types.iter().enumerate() {
634 let Some(raw) = try_read_bytes(reader)? else {
638 let offset = reader.stream_position()?;
639 return Err(Error::EofInCase {
641 case_ofs: offset - case_start,
642 case_len: var_types.len() * 8,
646 values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
651 fn read_compressed_case<R: Read + Seek>(
653 var_types: &[VarType],
654 codes: &mut VecDeque<u8>,
657 ) -> Result<Option<Vec<Self>>, Error> {
658 let case_start = reader.stream_position()?;
659 let mut values = Vec::with_capacity(var_types.len());
660 for (i, &var_type) in var_types.iter().enumerate() {
662 let Some(code) = codes.pop_front() else {
663 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
667 let offset = reader.stream_position()?;
668 return Err(Error::EofInCompressedCase {
670 case_ofs: offset - case_start,
674 codes.extend(new_codes.into_iter());
679 1..=251 => match var_type {
680 VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
682 break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
689 let offset = reader.stream_position()?;
690 return Err(Error::PartialCompressedCase {
692 case_ofs: offset - case_start,
697 break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
699 254 => match var_type {
700 VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
701 VarType::Numeric => {
702 return Err(Error::CompressedStringExpected {
704 case_ofs: reader.stream_position()? - case_start,
708 255 => match var_type {
709 VarType::Numeric => break Self::Number(None),
711 return Err(Error::CompressedNumberExpected {
713 case_ofs: reader.stream_position()? - case_start,
724 fn decode(&self, decoder: &Decoder) -> Value<String> {
726 Self::Number(x) => Value::Number(*x),
727 Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
732 struct ZlibDecodeMultiple<R>
736 reader: Option<ZlibDecoder<R>>,
739 impl<R> ZlibDecodeMultiple<R>
743 fn new(reader: R) -> ZlibDecodeMultiple<R> {
745 reader: Some(ZlibDecoder::new(reader)),
750 impl<R> Read for ZlibDecodeMultiple<R>
754 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
756 match self.reader.as_mut().unwrap().read(buf)? {
758 let inner = self.reader.take().unwrap().into_inner();
759 self.reader = Some(ZlibDecoder::new(inner));
767 impl<R> Seek for ZlibDecodeMultiple<R>
771 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
772 self.reader.as_mut().unwrap().get_mut().seek(pos)
781 ztrailer_offset: u64,
790 R: Read + Seek + 'static,
793 warn: Box<dyn Fn(Error)>,
795 header: HeaderRecord<RawString>,
796 var_types: Vec<VarType>,
803 R: Read + Seek + 'static,
805 pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
807 F: Fn(Error) + 'static,
809 let header = HeaderRecord::read(&mut reader)?;
811 reader: Some(reader),
812 warn: Box::new(warn),
814 var_types: Vec::new(),
815 state: ReaderState::Start,
818 fn cases(&mut self) -> Cases {
819 self.state = ReaderState::End;
821 self.reader.take().unwrap(),
822 take(&mut self.var_types),
828 impl<R> Iterator for Reader<R>
830 R: Read + Seek + 'static,
832 type Item = Result<Record, Error>;
834 fn next(&mut self) -> Option<Self::Item> {
836 ReaderState::Start => {
837 self.state = ReaderState::Headers;
838 Some(Ok(Record::Header(self.header.clone())))
840 ReaderState::Headers => {
843 self.reader.as_mut().unwrap(),
845 self.var_types.as_slice(),
848 Ok(Some(record)) => break record,
850 Err(error) => return Some(Err(error)),
854 Record::Variable(VariableRecord { width, .. }) => {
855 self.var_types.push(if width == 0 {
861 Record::EndOfHeaders(_) => {
862 self.state = if let Some(Compression::ZLib) = self.header.compression {
863 ReaderState::ZlibHeader
872 ReaderState::ZlibHeader => {
873 let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
875 Ok(zheader) => zheader,
876 Err(error) => return Some(Err(error)),
878 self.state = ReaderState::ZlibTrailer {
879 ztrailer_offset: zheader.ztrailer_offset,
880 ztrailer_len: zheader.ztrailer_len,
882 Some(Ok(Record::ZHeader(zheader)))
884 ReaderState::ZlibTrailer {
888 match ZTrailer::read(
889 self.reader.as_mut().unwrap(),
894 Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
895 Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
896 Err(error) => Some(Err(error)),
899 ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
900 ReaderState::End => None,
905 trait ReadSeek: Read + Seek {}
906 impl<T> ReadSeek for T where T: Read + Seek {}
909 reader: Box<dyn ReadSeek>,
910 var_types: Vec<VarType>,
911 compression: Option<Compression>,
918 impl Debug for Cases {
919 fn fmt(&self, f: &mut Formatter) -> FmtResult {
925 fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
927 R: Read + Seek + 'static,
930 reader: if header.compression == Some(Compression::ZLib) {
931 Box::new(ZlibDecodeMultiple::new(reader))
936 compression: header.compression,
938 endian: header.endian,
939 codes: VecDeque::with_capacity(8),
945 impl Iterator for Cases {
946 type Item = Result<Vec<RawValue>, Error>;
948 fn next(&mut self) -> Option<Self::Item> {
953 let retval = if self.compression.is_some() {
954 Value::read_compressed_case(
963 Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
965 self.eof = matches!(retval, None | Some(Err(_)));
970 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
971 pub struct Spec(pub u32);
973 impl Debug for Spec {
974 fn fmt(&self, f: &mut Formatter) -> FmtResult {
975 let type_ = format_name(self.0 >> 16);
976 let w = (self.0 >> 8) & 0xff;
977 let d = self.0 & 0xff;
978 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
982 fn format_name(type_: u32) -> Cow<'static, str> {
1021 _ => return format!("<unknown format {type_}>").into(),
1027 pub struct MissingValues<S = String>
1031 /// Individual missing values, up to 3 of them.
1032 pub values: Vec<Value<S>>,
1034 /// Optional range of missing values.
1035 pub range: Option<(Value<S>, Value<S>)>,
1038 impl<S> Debug for MissingValues<S>
1042 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1043 for (i, value) in self.values.iter().enumerate() {
1047 write!(f, "{value:?}")?;
1050 if let Some((low, high)) = &self.range {
1051 if !self.values.is_empty() {
1054 write!(f, "{low:?} THRU {high:?}")?;
1057 if self.is_empty() {
1065 impl<S> MissingValues<S>
1069 fn is_empty(&self) -> bool {
1070 self.values.is_empty() && self.range.is_none()
1074 impl<S> Default for MissingValues<S>
1078 fn default() -> Self {
1086 impl MissingValues<RawStr<8>> {
1087 fn read<R: Read + Seek>(
1093 ) -> Result<Self, Error> {
1094 let (n_values, has_range) = match (width, code) {
1095 (_, 0..=3) => (code, false),
1096 (0, -2) => (0, true),
1097 (0, -3) => (1, true),
1098 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
1099 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
1102 let var_type = if width == 0 {
1108 let mut values = Vec::new();
1109 for _ in 0..n_values {
1110 values.push(RawValue::read(r, var_type, endian)?);
1112 let range = if has_range {
1113 let low = RawValue::read(r, var_type, endian)?;
1114 let high = RawValue::read(r, var_type, endian)?;
1119 Ok(Self { values, range })
1121 fn decode<'a>(&'a self, decoder: &Decoder) -> MissingValues<String> {
1126 .map(|value| value.decode(decoder))
1131 .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
1137 pub struct VariableRecord<S, V>
1142 /// Range of offsets in file.
1143 pub offsets: Range<u64>,
1145 /// Variable width, in the range -1..=255.
1148 /// Variable name, padded on the right with spaces.
1152 pub print_format: Spec,
1155 pub write_format: Spec,
1158 pub missing_values: MissingValues<V>,
1160 /// Optional variable label.
1161 pub label: Option<S>,
1164 impl<S, V> Debug for VariableRecord<S, V>
1169 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1174 match self.width.cmp(&0) {
1175 Ordering::Greater => "string",
1176 Ordering::Equal => "numeric",
1177 Ordering::Less => "long string continuation record",
1180 writeln!(f, "Print format: {:?}", self.print_format)?;
1181 writeln!(f, "Write format: {:?}", self.write_format)?;
1182 writeln!(f, "Name: {:?}", &self.name)?;
1183 writeln!(f, "Variable label: {:?}", self.label)?;
1184 writeln!(f, "Missing values: {:?}", self.missing_values)
1188 impl VariableRecord<RawString, RawStr<8>> {
1189 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1190 let start_offset = r.stream_position()?;
1191 let width: i32 = endian.parse(read_bytes(r)?);
1192 let code_offset = r.stream_position()?;
1193 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
1194 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
1195 let print_format = Spec(endian.parse(read_bytes(r)?));
1196 let write_format = Spec(endian.parse(read_bytes(r)?));
1197 let name = RawString(read_vec(r, 8)?);
1199 let label = match has_variable_label {
1202 let len: u32 = endian.parse(read_bytes(r)?);
1203 let read_len = len.min(65535) as usize;
1204 let label = RawString(read_vec(r, read_len)?);
1206 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
1207 let _ = read_vec(r, padding_bytes as usize)?;
1212 return Err(Error::BadVariableLabelCode {
1215 code: has_variable_label,
1220 let missing_values =
1221 MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
1223 let end_offset = r.stream_position()?;
1225 Ok(Record::Variable(VariableRecord {
1226 offsets: start_offset..end_offset,
1236 pub fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord<Cow<'a, str>, String> {
1238 offsets: self.offsets.clone(),
1240 name: decoder.decode(&self.name),
1241 print_format: self.print_format,
1242 write_format: self.write_format,
1243 missing_values: self.missing_values.decode(decoder),
1244 label: self.label.as_ref().map(|label| decoder.decode(label)),
1249 #[derive(Copy, Clone)]
1250 pub struct UntypedValue(pub [u8; 8]);
1252 impl Debug for UntypedValue {
1253 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1254 let little: f64 = Endian::Little.parse(self.0);
1255 let little = format!("{:?}", little);
1256 let big: f64 = Endian::Big.parse(self.0);
1257 let big = format!("{:?}", big);
1258 let number = if little.len() <= big.len() {
1263 write!(f, "{number}")?;
1265 let string = default_decode(&self.0);
1267 .split(|c: char| c == '\0' || c.is_control())
1270 write!(f, "{string:?}")?;
1276 pub struct RawString(pub Vec<u8>);
1278 impl From<Vec<u8>> for RawString {
1279 fn from(source: Vec<u8>) -> Self {
1284 impl From<&[u8]> for RawString {
1285 fn from(source: &[u8]) -> Self {
1290 impl Debug for RawString {
1291 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1292 write!(f, "{:?}", default_decode(self.0.as_slice()))
1296 #[derive(Copy, Clone)]
1297 pub struct RawStr<const N: usize>(pub [u8; N]);
1299 impl<const N: usize> From<[u8; N]> for RawStr<N> {
1300 fn from(source: [u8; N]) -> Self {
1305 impl<const N: usize> Debug for RawStr<N> {
1306 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1307 write!(f, "{:?}", default_decode(&self.0))
1311 #[derive(Clone, Debug)]
1312 pub struct ValueLabel<V, S>
1317 pub value: Value<V>,
1322 pub struct ValueLabelRecord<V, S>
1327 /// Range of offsets in file.
1328 pub offsets: Range<u64>,
1331 pub labels: Vec<ValueLabel<V, S>>,
1333 /// The 1-based indexes of the variable indexes.
1334 pub dict_indexes: Vec<u32>,
1336 /// The types of the variables.
1337 pub var_type: VarType,
1340 impl<V, S> Debug for ValueLabelRecord<V, S>
1345 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1346 writeln!(f, "labels: ")?;
1347 for label in self.labels.iter() {
1348 writeln!(f, "{label:?}")?;
1350 write!(f, "apply to {} variables", self.var_type)?;
1351 for dict_index in self.dict_indexes.iter() {
1352 write!(f, " #{dict_index}")?;
1358 impl<V, S> Header for ValueLabelRecord<V, S>
1363 fn offsets(&self) -> Range<u64> {
1364 self.offsets.clone()
1368 impl<V, S> ValueLabelRecord<V, S>
1373 /// Maximum number of value labels in a record.
1374 pub const MAX_LABELS: u32 = u32::MAX / 8;
1376 /// Maximum number of variable indexes in a record.
1377 pub const MAX_INDEXES: u32 = u32::MAX / 8;
1380 impl ValueLabelRecord<RawStr<8>, RawString> {
1381 fn read<R: Read + Seek>(
1384 var_types: &[VarType],
1385 warn: &Box<dyn Fn(Error)>,
1386 ) -> Result<Option<Record>, Error> {
1387 let label_offset = r.stream_position()?;
1388 let n: u32 = endian.parse(read_bytes(r)?);
1389 if n > Self::MAX_LABELS {
1390 return Err(Error::BadNumberOfValueLabels {
1391 offset: label_offset,
1393 max: Self::MAX_LABELS,
1397 let mut labels = Vec::new();
1399 let value = UntypedValue(read_bytes(r)?);
1400 let label_len: u8 = endian.parse(read_bytes(r)?);
1401 let label_len = label_len as usize;
1402 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1404 let mut label = read_vec(r, padded_len - 1)?;
1405 label.truncate(label_len);
1406 labels.push((value, RawString(label)));
1409 let index_offset = r.stream_position()?;
1410 let rec_type: u32 = endian.parse(read_bytes(r)?);
1412 return Err(Error::ExpectedVarIndexRecord {
1413 offset: index_offset,
1418 let n: u32 = endian.parse(read_bytes(r)?);
1419 if n > Self::MAX_INDEXES {
1420 return Err(Error::TooManyVarIndexes {
1421 offset: index_offset,
1423 max: Self::MAX_INDEXES,
1427 let index_offset = r.stream_position()?;
1428 let mut dict_indexes = Vec::with_capacity(n as usize);
1429 let mut invalid_indexes = Vec::new();
1431 let index: u32 = endian.parse(read_bytes(r)?);
1432 if index == 0 || index as usize > var_types.len() {
1433 dict_indexes.push(index);
1435 invalid_indexes.push(index);
1438 if !invalid_indexes.is_empty() {
1439 warn(Error::InvalidVarIndexes {
1440 offset: index_offset,
1441 max: var_types.len(),
1442 invalid: invalid_indexes,
1446 let Some(&first_index) = dict_indexes.first() else {
1447 warn(Error::NoVarIndexes {
1448 offset: index_offset,
1452 let var_type = var_types[first_index as usize - 1];
1453 let mut wrong_type_indexes = Vec::new();
1454 dict_indexes.retain(|&index| {
1455 if var_types[index as usize - 1] != var_type {
1456 wrong_type_indexes.push(index);
1462 if !wrong_type_indexes.is_empty() {
1463 warn(Error::MixedVarTypes {
1464 offset: index_offset,
1466 wrong_types: wrong_type_indexes,
1472 .map(|(value, label)| ValueLabel {
1473 value: Value::from_raw(&value, var_type, endian),
1478 let end_offset = r.stream_position()?;
1479 Ok(Some(Record::ValueLabel(ValueLabelRecord {
1480 offsets: label_offset..end_offset,
1488 #[derive(Clone, Debug)]
1489 pub struct DocumentRecord<S>
1493 pub offsets: Range<u64>,
1495 /// The document, as an array of 80-byte lines.
1499 pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;
1501 /// Length of a line in a document. Document lines are fixed-length and
1502 /// padded on the right with spaces.
1503 pub const DOC_LINE_LEN: usize = 80;
1505 impl DocumentRecord<RawDocumentLine> {
1506 /// Maximum number of lines we will accept in a document. This is simply
1507 /// the maximum number that will fit in a 32-bit space.
1508 pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
1510 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1511 let start_offset = r.stream_position()?;
1512 let n: u32 = endian.parse(read_bytes(r)?);
1514 if n > Self::MAX_LINES {
1515 Err(Error::BadDocumentLength {
1516 offset: start_offset,
1518 max: Self::MAX_LINES,
1521 let mut lines = Vec::with_capacity(n);
1523 lines.push(RawStr(read_bytes(r)?));
1525 let end_offset = r.stream_position()?;
1526 Ok(Record::Document(DocumentRecord {
1527 offsets: start_offset..end_offset,
1533 pub fn decode<'a>(&'a self, decoder: &Decoder) -> DocumentRecord<Cow<'a, str>> {
1535 offsets: self.offsets.clone(),
1539 .map(|s| decoder.decode_slice(&s.0))
1545 impl<S> Header for DocumentRecord<S>
1549 fn offsets(&self) -> Range<u64> {
1550 self.offsets.clone()
1554 trait ExtensionRecord {
1556 const SIZE: Option<u32>;
1557 const COUNT: Option<u32>;
1558 const NAME: &'static str;
1559 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error>;
1562 #[derive(Clone, Debug)]
1563 pub struct IntegerInfoRecord {
1564 pub offsets: Range<u64>,
1565 pub version: (i32, i32, i32),
1566 pub machine_code: i32,
1567 pub floating_point_rep: i32,
1568 pub compression_code: i32,
1569 pub endianness: i32,
1570 pub character_code: i32,
1573 impl ExtensionRecord for IntegerInfoRecord {
1574 const SUBTYPE: u32 = 3;
1575 const SIZE: Option<u32> = Some(4);
1576 const COUNT: Option<u32> = Some(8);
1577 const NAME: &'static str = "integer record";
1579 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1580 ext.check_size::<Self>()?;
1582 let mut input = &ext.data[..];
1583 let data: Vec<i32> = (0..8)
1584 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1586 Ok(Record::IntegerInfo(IntegerInfoRecord {
1587 offsets: ext.offsets.clone(),
1588 version: (data[0], data[1], data[2]),
1589 machine_code: data[3],
1590 floating_point_rep: data[4],
1591 compression_code: data[5],
1592 endianness: data[6],
1593 character_code: data[7],
1598 #[derive(Clone, Debug)]
1599 pub struct FloatInfoRecord {
1605 impl ExtensionRecord for FloatInfoRecord {
1606 const SUBTYPE: u32 = 4;
1607 const SIZE: Option<u32> = Some(8);
1608 const COUNT: Option<u32> = Some(3);
1609 const NAME: &'static str = "floating point record";
1611 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1612 ext.check_size::<Self>()?;
1614 let mut input = &ext.data[..];
1615 let data: Vec<f64> = (0..3)
1616 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1618 Ok(Record::FloatInfo(FloatInfoRecord {
1626 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1627 pub enum CategoryLabels {
1632 #[derive(Clone, Debug)]
1633 pub enum MultipleResponseType {
1636 labels: CategoryLabels,
1641 impl MultipleResponseType {
1642 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
1643 let (mr_type, input) = match input.split_first() {
1644 Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
1645 Some((b'D', input)) => {
1646 let (value, input) = parse_counted_string(input)?;
1648 MultipleResponseType::MultipleDichotomy {
1650 labels: CategoryLabels::VarLabels,
1655 Some((b'E', input)) => {
1656 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1657 (CategoryLabels::CountedValues, rest)
1658 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1659 (CategoryLabels::VarLabels, rest)
1661 return Err(Error::TBD);
1663 let (value, input) = parse_counted_string(input)?;
1665 MultipleResponseType::MultipleDichotomy { value, labels },
1669 _ => return Err(Error::TBD),
1671 Ok((mr_type, input))
1675 #[derive(Clone, Debug)]
1676 pub struct MultipleResponseSet<I, S>
1683 pub mr_type: MultipleResponseType,
1684 pub short_names: Vec<I>,
1687 impl MultipleResponseSet<RawString, RawString> {
1688 fn parse(input: &[u8]) -> Result<(Self, &[u8]), Error> {
1689 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1690 return Err(Error::TBD);
1692 let (name, input) = input.split_at(equals);
1693 let (mr_type, input) = MultipleResponseType::parse(input)?;
1694 let Some(input) = input.strip_prefix(b" ") else {
1695 return Err(Error::TBD);
1697 let (label, mut input) = parse_counted_string(input)?;
1698 let mut vars = Vec::new();
1699 while input.first() != Some(&b'\n') {
1700 match input.split_first() {
1701 Some((b' ', rest)) => {
1702 let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
1703 return Err(Error::TBD);
1705 let (var, rest) = rest.split_at(length);
1706 if !var.is_empty() {
1707 vars.push(var.into());
1711 _ => return Err(Error::TBD),
1714 while input.first() == Some(&b'\n') {
1715 input = &input[1..];
1718 MultipleResponseSet {
1731 ) -> Result<MultipleResponseSet<Identifier, Cow<'a, str>>, Error> {
1732 let mut short_names = Vec::with_capacity(self.short_names.len());
1733 for short_name in self.short_names.iter() {
1734 if let Some(short_name) = decoder
1735 .decode_identifier(short_name)
1736 .map_err(|err| Error::InvalidMrSetName(err))
1737 .warn_on_error(&decoder.warn)
1739 short_names.push(short_name);
1742 Ok(MultipleResponseSet {
1744 .decode_identifier(&self.name)
1745 .map_err(|err| Error::InvalidMrSetVariableName(err))?,
1746 label: decoder.decode(&self.label),
1747 mr_type: self.mr_type.clone(),
1748 short_names: short_names,
1753 #[derive(Clone, Debug)]
1754 pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
1759 impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
1760 const SUBTYPE: u32 = 7;
1761 const SIZE: Option<u32> = Some(1);
1762 const COUNT: Option<u32> = None;
1763 const NAME: &'static str = "multiple response set record";
1765 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
1766 ext.check_size::<Self>()?;
1768 let mut input = &ext.data[..];
1769 let mut sets = Vec::new();
1770 while !input.is_empty() {
1771 let (set, rest) = MultipleResponseSet::parse(input)?;
1775 Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
1779 impl MultipleResponseRecord<RawString, RawString> {
1780 fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseRecord<Identifier, Cow<'a, str>> {
1781 let mut sets = Vec::new();
1782 for set in self.0.iter() {
1783 if let Some(set) = set.decode(decoder).warn_on_error(&decoder.warn) {
1787 MultipleResponseRecord(sets)
1791 fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Error> {
1792 let Some(space) = input.iter().position(|&b| b == b' ') else {
1793 return Err(Error::TBD);
1795 let Ok(length) = from_utf8(&input[..space]) else {
1796 return Err(Error::TBD);
1798 let Ok(length): Result<usize, _> = length.parse() else {
1799 return Err(Error::TBD);
1802 let input = &input[space + 1..];
1803 if input.len() < length {
1804 return Err(Error::TBD);
1807 let (string, rest) = input.split_at(length);
1808 Ok((string.into(), rest))
1811 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1819 pub fn default_for_type(var_type: VarType) -> Option<Measure> {
1821 VarType::Numeric => None,
1822 VarType::String => Some(Self::Nominal),
1826 fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
1829 1 => Ok(Some(Measure::Nominal)),
1830 2 => Ok(Some(Measure::Ordinal)),
1831 3 => Ok(Some(Measure::Scale)),
1832 _ => Err(Error::InvalidMeasurement(source)),
1837 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1838 pub enum Alignment {
1845 fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
1848 1 => Ok(Some(Alignment::Left)),
1849 2 => Ok(Some(Alignment::Right)),
1850 3 => Ok(Some(Alignment::Center)),
1851 _ => Err(Error::InvalidAlignment(source)),
1855 pub fn default_for_type(var_type: VarType) -> Self {
1857 VarType::Numeric => Self::Right,
1858 VarType::String => Self::Left,
1863 #[derive(Clone, Debug)]
1864 pub struct VarDisplay {
1865 pub measure: Option<Measure>,
1866 pub width: Option<u32>,
1867 pub alignment: Option<Alignment>,
1870 #[derive(Clone, Debug)]
1871 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
1873 impl VarDisplayRecord {
1874 const SUBTYPE: u32 = 11;
1880 warn: &Box<dyn Fn(Error)>,
1881 ) -> Result<Record, Error> {
1883 return Err(Error::BadRecordSize {
1884 offset: ext.offsets.start,
1885 record: String::from("variable display record"),
1891 let has_width = if ext.count as usize == 3 * n_vars {
1893 } else if ext.count as usize == 2 * n_vars {
1896 return Err(Error::TBD);
1899 let mut var_displays = Vec::new();
1900 let mut input = &ext.data[..];
1901 for _ in 0..n_vars {
1902 let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1903 .warn_on_error(&warn)
1905 let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
1906 let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1907 .warn_on_error(&warn)
1909 var_displays.push(VarDisplay {
1915 Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
1919 #[derive(Clone, Debug)]
1920 pub struct LongStringMissingValues<N, V>
1929 pub missing_values: MissingValues<V>,
1932 impl LongStringMissingValues<RawString, RawStr<8>> {
1936 ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
1937 Ok(LongStringMissingValues {
1938 var_name: decoder.decode_identifier(&self.var_name)?,
1939 missing_values: self.missing_values.decode(decoder),
1944 #[derive(Clone, Debug)]
1945 pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
1950 impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
1951 const SUBTYPE: u32 = 22;
1952 const SIZE: Option<u32> = Some(1);
1953 const COUNT: Option<u32> = None;
1954 const NAME: &'static str = "long string missing values record";
1956 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
1957 ext.check_size::<Self>()?;
1959 let mut input = &ext.data[..];
1960 let mut missing_value_set = Vec::new();
1961 while !input.is_empty() {
1962 let var_name = read_string(&mut input, endian)?;
1963 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1964 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1966 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
1967 return Err(Error::BadLongMissingValueLength {
1968 record_offset: ext.offsets.start,
1973 let mut values = Vec::new();
1974 for i in 0..n_missing_values {
1975 let value: [u8; 8] = read_bytes(&mut input)?;
1976 let numeric_value: u64 = endian.parse(value);
1977 let value = if i > 0 && numeric_value == 8 {
1978 // Tolerate files written by old, buggy versions of PSPP
1979 // where we believed that the value_length was repeated
1980 // before each missing value.
1981 read_bytes(&mut input)?
1985 values.push(Value::String(RawStr(value)));
1987 let missing_values = MissingValues {
1991 missing_value_set.push(LongStringMissingValues {
1996 Ok(Record::LongStringMissingValues(
1997 LongStringMissingValueRecord(missing_value_set),
2002 impl LongStringMissingValueRecord<RawString, RawStr<8>> {
2006 ) -> LongStringMissingValueRecord<Identifier, String> {
2007 let mut mvs = Vec::with_capacity(self.0.len());
2008 for mv in self.0.iter() {
2009 if let Some(mv) = mv
2011 .map_err(|err| Error::InvalidLongStringMissingValueVariableName(err))
2012 .warn_on_error(&decoder.warn)
2017 LongStringMissingValueRecord(mvs)
2021 #[derive(Clone, Debug)]
2022 pub struct EncodingRecord(pub String);
2024 impl ExtensionRecord for EncodingRecord {
2025 const SUBTYPE: u32 = 20;
2026 const SIZE: Option<u32> = Some(1);
2027 const COUNT: Option<u32> = None;
2028 const NAME: &'static str = "encoding record";
2030 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
2031 ext.check_size::<Self>()?;
2033 Ok(Record::Encoding(EncodingRecord(
2034 String::from_utf8(ext.data.clone()).map_err(|_| Error::BadEncodingName {
2035 offset: ext.offsets.start,
2041 #[derive(Copy, Clone, Debug)]
2042 pub struct NumberOfCasesRecord {
2043 /// Always observed as 1.
2046 /// Number of cases.
2050 impl ExtensionRecord for NumberOfCasesRecord {
2051 const SUBTYPE: u32 = 16;
2052 const SIZE: Option<u32> = Some(8);
2053 const COUNT: Option<u32> = Some(2);
2054 const NAME: &'static str = "extended number of cases record";
2056 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
2057 ext.check_size::<Self>()?;
2059 let mut input = &ext.data[..];
2060 let one = endian.parse(read_bytes(&mut input)?);
2061 let n_cases = endian.parse(read_bytes(&mut input)?);
2063 Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
2067 #[derive(Clone, Debug)]
2068 pub struct TextRecord {
2069 pub offsets: Range<u64>,
2072 pub rec_type: TextRecordType,
2074 /// The text content of the record.
2075 pub text: RawString,
2078 #[derive(Clone, Copy, Debug)]
2079 pub enum TextRecordType {
2089 fn new(extension: Extension, rec_type: TextRecordType) -> Self {
2091 offsets: extension.offsets,
2093 text: extension.data.into(),
2096 fn decode<'a>(&self, decoder: &Decoder) -> Result<Option<Record>, Error> {
2097 match self.rec_type {
2098 TextRecordType::VariableSets => Ok(Some(Record::VariableSets(
2099 VariableSetRecord::decode(self, decoder),
2101 TextRecordType::ProductInfo => Ok(Some(Record::ProductInfo(
2102 ProductInfoRecord::decode(self, decoder),
2104 TextRecordType::LongNames => Ok(Some(Record::LongNames(LongNamesRecord::decode(
2107 TextRecordType::VeryLongStrings => Ok(Some(Record::VeryLongStrings(
2108 VeryLongStringsRecord::decode(self, decoder),
2110 TextRecordType::FileAttributes => {
2111 Ok(FileAttributeRecord::decode(self, decoder).map(|fa| Record::FileAttributes(fa)))
2113 TextRecordType::VariableAttributes => Ok(Some(Record::VariableAttributes(
2114 VariableAttributeRecord::decode(self, decoder),
2120 #[derive(Clone, Debug)]
2121 pub struct VeryLongString {
2122 pub short_name: Identifier,
2126 impl VeryLongString {
2127 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
2128 let Some((short_name, length)) = input.split_once('=') else {
2129 return Err(Error::TBD);
2131 let short_name = decoder
2132 .new_identifier(short_name)
2133 .map_err(Error::InvalidLongStringName)?;
2134 let length = length.parse().map_err(|_| Error::TBD)?;
2135 Ok(VeryLongString { short_name, length })
2139 #[derive(Clone, Debug)]
2140 pub struct VeryLongStringsRecord(Vec<VeryLongString>);
2142 impl VeryLongStringsRecord {
2143 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2144 let input = decoder.decode(&source.text);
2145 let mut very_long_strings = Vec::new();
2148 .map(|s| s.trim_end_matches('\t'))
2149 .filter(|s| !s.is_empty())
2151 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&decoder.warn) {
2152 very_long_strings.push(vls)
2155 VeryLongStringsRecord(very_long_strings)
2159 #[derive(Clone, Debug)]
2160 pub struct Attribute {
2161 pub name: Identifier,
2162 pub values: Vec<String>,
2166 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Error> {
2167 let Some((name, mut input)) = input.split_once('(') else {
2168 return Err(Error::TBD);
2171 .new_identifier(name)
2172 .map_err(Error::InvalidAttributeName)?;
2173 let mut values = Vec::new();
2175 let Some((value, rest)) = input.split_once('\n') else {
2176 return Err(Error::TBD);
2178 if let Some(stripped) = value
2180 .and_then(|value| value.strip_suffix('\''))
2182 values.push(stripped.into());
2184 decoder.warn(Error::TBD);
2185 values.push(value.into());
2187 if let Some(rest) = rest.strip_prefix(')') {
2188 let attribute = Attribute { name, values };
2189 return Ok((attribute, rest));
2196 #[derive(Clone, Debug)]
2197 pub struct AttributeSet(pub Vec<Attribute>);
2203 sentinel: Option<char>,
2204 ) -> Result<(AttributeSet, &'a str), Error> {
2205 let mut attributes = Vec::new();
2207 match input.chars().next() {
2208 None => break input,
2209 c if c == sentinel => break &input[1..],
2211 let (attribute, rest) = Attribute::parse(decoder, input)?;
2212 attributes.push(attribute);
2217 Ok((AttributeSet(attributes), rest))
2221 #[derive(Clone, Debug)]
2222 pub struct FileAttributeRecord(AttributeSet);
2224 impl FileAttributeRecord {
2225 fn decode(source: &TextRecord, decoder: &Decoder) -> Option<Self> {
2226 let input = decoder.decode(&source.text);
2227 match AttributeSet::parse(decoder, &input, None).warn_on_error(&decoder.warn) {
2228 Some((set, rest)) => {
2229 if !rest.is_empty() {
2230 decoder.warn(Error::TBD);
2232 Some(FileAttributeRecord(set))
2239 #[derive(Clone, Debug)]
2240 pub struct VarAttributeSet {
2241 pub long_var_name: Identifier,
2242 pub attributes: AttributeSet,
2245 impl VarAttributeSet {
2246 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Error> {
2247 let Some((long_var_name, rest)) = input.split_once(':') else {
2248 return Err(Error::TBD);
2250 let long_var_name = decoder
2251 .new_identifier(long_var_name)
2252 .map_err(Error::InvalidAttributeVariableName)?;
2253 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
2254 let var_attribute = VarAttributeSet {
2258 Ok((var_attribute, rest))
2262 #[derive(Clone, Debug)]
2263 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
2265 impl VariableAttributeRecord {
2266 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2267 let decoded = decoder.decode(&source.text);
2268 let mut input = decoded.as_ref();
2269 let mut var_attribute_sets = Vec::new();
2270 while !input.is_empty() {
2271 let Some((var_attribute, rest)) =
2272 VarAttributeSet::parse(decoder, &input).warn_on_error(&decoder.warn)
2276 var_attribute_sets.push(var_attribute);
2277 input = rest.into();
2279 VariableAttributeRecord(var_attribute_sets)
2283 #[derive(Clone, Debug)]
2284 pub struct LongName {
2285 pub short_name: Identifier,
2286 pub long_name: Identifier,
2290 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Error> {
2291 let Some((short_name, long_name)) = input.split_once('=') else {
2292 return Err(Error::TBD);
2294 let short_name = decoder
2295 .new_identifier(short_name)
2296 .map_err(Error::InvalidShortName)?;
2297 let long_name = decoder
2298 .new_identifier(long_name)
2299 .map_err(Error::InvalidLongName)?;
2307 #[derive(Clone, Debug)]
2308 pub struct LongNamesRecord(Vec<LongName>);
2310 impl LongNamesRecord {
2311 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2312 let input = decoder.decode(&source.text);
2313 let mut names = Vec::new();
2314 for pair in input.split('\t').filter(|s| !s.is_empty()) {
2315 if let Some(long_name) = LongName::parse(pair, decoder).warn_on_error(&decoder.warn) {
2316 names.push(long_name);
2319 LongNamesRecord(names)
2323 #[derive(Clone, Debug)]
2324 pub struct ProductInfoRecord(pub String);
2326 impl ProductInfoRecord {
2327 const NAME: &'static str = "extra product info";
2328 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2329 Self(decoder.decode(&source.text).into())
2332 #[derive(Clone, Debug)]
2333 pub struct VariableSet {
2335 pub vars: Vec<Identifier>,
2339 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Error> {
2340 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
2341 let mut vars = Vec::new();
2342 for var in input.split_ascii_whitespace() {
2343 if let Some(identifier) = decoder
2344 .new_identifier(var)
2345 .map_err(Error::InvalidVariableSetName)
2346 .warn_on_error(&decoder.warn)
2348 vars.push(identifier);
2358 #[derive(Clone, Debug)]
2359 pub struct VariableSetRecord {
2360 pub offsets: Range<u64>,
2361 pub sets: Vec<VariableSet>,
2364 impl VariableSetRecord {
2365 fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
2366 let mut sets = Vec::new();
2367 let input = decoder.decode(&source.text);
2368 for line in input.lines() {
2369 if let Some(set) = VariableSet::parse(line, decoder).warn_on_error(&decoder.warn) {
2374 offsets: source.offsets.clone(),
2380 trait WarnOnError<T> {
2381 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
2383 impl<T> WarnOnError<T> for Result<T, Error> {
2384 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
2386 Ok(result) => Some(result),
2395 #[derive(Clone, Debug)]
2396 pub struct Extension {
2397 pub offsets: Range<u64>,
2402 /// Size of each data element.
2405 /// Number of data elements.
2408 /// `size * count` bytes of data.
2413 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
2414 if let Some(expected_size) = E::SIZE {
2415 if self.size != expected_size {
2416 return Err(Error::BadRecordSize {
2417 offset: self.offsets.start,
2418 record: E::NAME.into(),
2424 if let Some(expected_count) = E::COUNT {
2425 if self.count != expected_count {
2426 return Err(Error::BadRecordCount {
2427 offset: self.offsets.start,
2428 record: E::NAME.into(),
2437 fn read<R: Read + Seek>(
2441 warn: &Box<dyn Fn(Error)>,
2442 ) -> Result<Option<Record>, Error> {
2443 let subtype = endian.parse(read_bytes(r)?);
2444 let header_offset = r.stream_position()?;
2445 let size: u32 = endian.parse(read_bytes(r)?);
2446 let count = endian.parse(read_bytes(r)?);
2447 let Some(product) = size.checked_mul(count) else {
2448 return Err(Error::ExtensionRecordTooLarge {
2449 offset: header_offset,
2455 let start_offset = r.stream_position()?;
2456 let data = read_vec(r, product as usize)?;
2457 let end_offset = start_offset + product as u64;
2458 let extension = Extension {
2459 offsets: start_offset..end_offset,
2465 let result = match subtype {
2466 IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
2467 FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
2468 VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
2469 MultipleResponseRecord::SUBTYPE | 19 => {
2470 MultipleResponseRecord::parse(&extension, endian)
2472 LongStringValueLabelRecord::SUBTYPE => {
2473 LongStringValueLabelRecord::parse(&extension, endian)
2475 EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
2476 NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
2477 5 => Ok(Record::Text(TextRecord::new(
2479 TextRecordType::VariableSets,
2481 10 => Ok(Record::Text(TextRecord::new(
2483 TextRecordType::ProductInfo,
2485 13 => Ok(Record::Text(TextRecord::new(
2487 TextRecordType::LongNames,
2489 14 => Ok(Record::Text(TextRecord::new(
2491 TextRecordType::VeryLongStrings,
2493 17 => Ok(Record::Text(TextRecord::new(
2495 TextRecordType::FileAttributes,
2497 18 => Ok(Record::Text(TextRecord::new(
2499 TextRecordType::VariableAttributes,
2501 _ => Ok(Record::OtherExtension(extension)),
2504 Ok(result) => Ok(Some(result)),
2513 #[derive(Clone, Debug)]
2514 pub struct ZHeader {
2515 /// File offset to the start of the record.
2518 /// File offset to the ZLIB data header.
2519 pub zheader_offset: u64,
2521 /// File offset to the ZLIB trailer.
2522 pub ztrailer_offset: u64,
2524 /// Length of the ZLIB trailer in bytes.
2525 pub ztrailer_len: u64,
2529 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
2530 let offset = r.stream_position()?;
2531 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
2532 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
2533 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
2544 #[derive(Clone, Debug)]
2545 pub struct ZTrailer {
2546 /// File offset to the start of the record.
2549 /// Compression bias as a negative integer, e.g. -100.
2552 /// Always observed as zero.
2555 /// Uncompressed size of each block, except possibly the last. Only
2556 /// `0x3ff000` has been observed so far.
2557 pub block_size: u32,
2559 /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
2560 pub blocks: Vec<ZBlock>,
2563 #[derive(Clone, Debug)]
2565 /// Offset of block of data if simple compression were used.
2566 pub uncompressed_ofs: u64,
2568 /// Actual offset within the file of the compressed data block.
2569 pub compressed_ofs: u64,
2571 /// The number of bytes in this data block after decompression. This is
2572 /// `block_size` in every data block but the last, which may be smaller.
2573 pub uncompressed_size: u32,
2575 /// The number of bytes in this data block, as stored compressed in this
2577 pub compressed_size: u32,
2581 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
2583 uncompressed_ofs: endian.parse(read_bytes(r)?),
2584 compressed_ofs: endian.parse(read_bytes(r)?),
2585 uncompressed_size: endian.parse(read_bytes(r)?),
2586 compressed_size: endian.parse(read_bytes(r)?),
2592 fn read<R: Read + Seek>(
2597 ) -> Result<Option<ZTrailer>, Error> {
2598 let start_offset = reader.stream_position()?;
2599 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
2602 let int_bias = endian.parse(read_bytes(reader)?);
2603 let zero = endian.parse(read_bytes(reader)?);
2604 let block_size = endian.parse(read_bytes(reader)?);
2605 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
2606 let expected_n_blocks = (ztrailer_len - 24) / 24;
2607 if n_blocks as u64 != expected_n_blocks {
2608 return Err(Error::BadZlibTrailerNBlocks {
2609 offset: ztrailer_ofs,
2615 let blocks = (0..n_blocks)
2616 .map(|_| ZBlock::read(reader, endian))
2617 .collect::<Result<Vec<_>, _>>()?;
2618 reader.seek(SeekFrom::Start(start_offset))?;
2620 offset: ztrailer_ofs,
/// Reads exactly `N` bytes from `r`, treating a clean end of input as "no
/// more data" rather than as an error.
///
/// Returns `Ok(None)` if the reader is already at end of input (the first
/// read yields zero bytes), `Ok(Some(bytes))` if all `N` bytes were read, and
/// an error if the input ends part-way through or the reader fails.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];
    // `Read::read` may fail with `Interrupted`, which only means "try again".
    // `read_exact` below already retries on it, so the first read must too.
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    };
    if n == 0 {
        return Ok(None);
    }
    if n < N {
        // A short first read is allowed; fill the rest or fail at end of input.
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` into a fixed-size array, failing if the
/// reader ends or errors before all of them are available.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    match r.read_exact(&mut bytes) {
        Ok(()) => Ok(bytes),
        Err(error) => Err(error),
    }
}
/// Reads exactly `n` bytes from `r` into a new vector.
///
/// `n` typically comes from a length field in the file, so it is not trusted
/// for allocation: at most a fixed amount is reserved up front and the vector
/// grows only as data actually arrives.  A truncated or corrupt file can
/// therefore no longer force a multi-gigabyte allocation.
///
/// # Errors
///
/// Returns an `UnexpectedEof` error if the reader ends before `n` bytes have
/// been read, or whatever error the reader itself reports.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    /// Largest capacity reserved before any data has been read.
    const MAX_PREALLOCATION: usize = 64 * 1024;

    let mut vec = Vec::with_capacity(n.min(MAX_PREALLOCATION));
    // `take` bounds the read at `n` bytes; `read_to_end` stops early only at
    // end of input, which is detected by the length check below.
    let n_read = r.by_ref().take(n as u64).read_to_end(&mut vec)?;
    if n_read < n {
        // Same kind and message as the `read_exact` failure callers saw before.
        return Err(IoError::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to fill whole buffer",
        ));
    }
    Ok(vec)
}
2654 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
2655 let length: u32 = endian.parse(read_bytes(r)?);
2656 Ok(read_vec(r, length as usize)?.into())
2659 #[derive(Clone, Debug)]
2660 pub struct LongStringValueLabels<S>
2667 /// `(value, label)` pairs, where each value is `width` bytes.
2668 pub labels: Vec<(S, S)>,
2671 #[derive(Clone, Debug)]
2672 pub struct LongStringValueLabelRecord<S>(pub Vec<LongStringValueLabels<S>>)
2676 impl ExtensionRecord for LongStringValueLabelRecord<RawString> {
2677 const SUBTYPE: u32 = 21;
2678 const SIZE: Option<u32> = Some(1);
2679 const COUNT: Option<u32> = None;
2680 const NAME: &'static str = "long string value labels record";
2682 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
2683 ext.check_size::<Self>()?;
2685 let mut input = &ext.data[..];
2686 let mut label_set = Vec::new();
2687 while !input.is_empty() {
2688 let var_name = read_string(&mut input, endian)?;
2689 let width: u32 = endian.parse(read_bytes(&mut input)?);
2690 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
2691 let mut labels = Vec::new();
2692 for _ in 0..n_labels {
2693 let value = read_string(&mut input, endian)?;
2694 let label = read_string(&mut input, endian)?;
2695 labels.push((value, label));
2697 label_set.push(LongStringValueLabels {
2703 Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(