3 endian::{Endian, Parse, ToBytes},
4 identifier::{Error as IdError, Identifier},
7 use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
8 use flate2::read::ZlibDecoder;
14 collections::{HashMap, VecDeque},
15 fmt::{Debug, Display, Formatter, Result as FmtResult},
16 io::{Error as IoError, Read, Seek, SeekFrom},
23 use thiserror::Error as ThisError;
25 #[derive(ThisError, Debug)]
27 #[error("Not an SPSS system file")]
30 #[error("Invalid magic number {0:?}")]
33 #[error("I/O error ({0})")]
36 #[error("Invalid SAV compression code {0}")]
37 InvalidSavCompression(u32),
39 #[error("Invalid ZSAV compression code {0}")]
40 InvalidZsavCompression(u32),
42 #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
43 BadDocumentLength { offset: u64, n: usize, max: usize },
45 #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
46 BadRecordType { offset: u64, rec_type: u32 },
48 #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
49 BadVariableLabelCode {
56 "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
58 BadNumericMissingValueCode { offset: u64, code: i32 },
60 #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
61 BadStringMissingValueCode { offset: u64, code: i32 },
63 #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
64 BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
66 #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
67 ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
69 #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
70 TooManyVarIndexes { offset: u64, n: u32, max: u32 },
72 #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
73 ExtensionRecordTooLarge {
80 #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
88 "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
90 EofInCompressedCase { offset: u64, case_ofs: u64 },
92 #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
93 PartialCompressedCase { offset: u64, case_ofs: u64 },
95 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
96 CompressedNumberExpected { offset: u64, case_ofs: u64 },
98 #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
99 CompressedStringExpected { offset: u64, case_ofs: u64 },
101 #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
102 BadZlibTrailerNBlocks {
105 expected_n_blocks: u64,
110 #[derive(ThisError, Debug)]
112 #[error("Unexpected end of data inside extension record.")]
115 #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
116 NoVarIndexes { offset: u64 },
118 #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
122 wrong_types: Vec<u32>,
125 #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
132 #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
140 #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
148 #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
149 BadLongMissingValueLength {
155 #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
156 BadEncodingName { offset: u64 },
// XXX This is risky because `text` might be arbitrarily long.
159 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
160 MalformedString { encoding: String, text: String },
162 #[error("Invalid variable measurement level value {0}")]
163 InvalidMeasurement(u32),
165 #[error("Invalid variable display alignment value {0}")]
166 InvalidAlignment(u32),
168 #[error("Invalid attribute name. {0}")]
169 InvalidAttributeName(IdError),
171 #[error("Invalid variable name in attribute record. {0}")]
172 InvalidAttributeVariableName(IdError),
174 #[error("Invalid short name in long variable name record. {0}")]
175 InvalidShortName(IdError),
177 #[error("Invalid name in long variable name record. {0}")]
178 InvalidLongName(IdError),
180 #[error("Invalid variable name in very long string record. {0}")]
181 InvalidLongStringName(IdError),
183 #[error("Invalid variable name in variable set record. {0}")]
184 InvalidVariableSetName(IdError),
186 #[error("Invalid multiple response set name. {0}")]
187 InvalidMrSetName(IdError),
189 #[error("Invalid multiple response set variable name. {0}")]
190 InvalidMrSetVariableName(IdError),
192 #[error("Invalid variable name in long string missing values record. {0}")]
193 InvalidLongStringMissingValueVariableName(IdError),
195 #[error("Invalid variable name in long string value label record. {0}")]
196 InvalidLongStringValueLabelName(IdError),
198 #[error("Details TBD")]
202 impl From<IoError> for Warning {
203 fn from(_source: IoError) -> Self {
204 Self::UnexpectedEndOfData
208 #[derive(Clone, Debug)]
210 Header(HeaderRecord<RawString>),
211 Variable(VariableRecord<RawString, RawStr<8>>),
212 ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
213 Document(DocumentRecord<RawDocumentLine>),
214 IntegerInfo(IntegerInfoRecord),
215 FloatInfo(FloatInfoRecord),
216 VarDisplay(VarDisplayRecord),
217 MultipleResponse(MultipleResponseRecord<RawString, RawString>),
218 LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
219 LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
220 Encoding(EncodingRecord),
221 NumberOfCases(NumberOfCasesRecord),
223 OtherExtension(Extension),
227 Cases(Rc<RefCell<Cases>>),
230 pub enum DecodedRecord<'a> {
231 Header(HeaderRecord<Cow<'a, str>>),
232 Variable(VariableRecord<Cow<'a, str>, String>),
233 ValueLabel(ValueLabelRecord<RawStr<8>, Cow<'a, str>>),
234 Document(DocumentRecord<Cow<'a, str>>),
235 IntegerInfo(IntegerInfoRecord),
236 FloatInfo(FloatInfoRecord),
237 VarDisplay(VarDisplayRecord),
238 MultipleResponse(MultipleResponseRecord<Identifier, Cow<'a, str>>),
239 LongStringValueLabels(LongStringValueLabelRecord<Identifier, Cow<'a, str>>),
240 LongStringMissingValues(LongStringMissingValueRecord<Identifier, String>),
241 Encoding(EncodingRecord),
242 NumberOfCases(NumberOfCasesRecord),
243 VariableSets(VariableSetRecord),
244 ProductInfo(ProductInfoRecord),
245 LongNames(LongNamesRecord),
246 VeryLongStrings(VeryLongStringsRecord),
247 FileAttributes(FileAttributeRecord),
248 VariableAttributes(VariableAttributeRecord),
249 OtherExtension(Extension),
259 var_types: &[VarType],
260 warn: &Box<dyn Fn(Warning)>,
261 ) -> Result<Option<Record>, Error>
265 let rec_type: u32 = endian.parse(read_bytes(reader)?);
267 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
268 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
269 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
270 7 => Extension::read(reader, endian, var_types.len(), warn),
271 999 => Ok(Some(Record::EndOfHeaders(
272 endian.parse(read_bytes(reader)?),
274 _ => Err(Error::BadRecordType {
275 offset: reader.stream_position()?,
281 pub fn decode<'a>(&'a self, decoder: &Decoder) -> Result<DecodedRecord<'a>, Error> {
283 Record::Header(record) => record.decode(decoder),
284 Record::Variable(record) => record.decode(decoder),
285 Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
286 Record::Document(record) => record.decode(decoder),
287 Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()),
288 Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()),
289 Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()),
290 Record::MultipleResponse(record) => record.decode(decoder),
291 Record::LongStringValueLabels(record) => {
292 DecodedRecord::LongStringValueLabels(record.decode(decoder))
294 Record::LongStringMissingValues(record) => {
295 DecodedRecord::LongStringMissingValues(record.decode(decoder))
297 Record::Encoding(record) => DecodedRecord::Encoding(record.clone()),
298 Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()),
299 Record::Text(record) => record.decode(decoder),
300 Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()),
301 Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record.clone()),
302 Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()),
303 Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()),
304 Record::Cases(_) => todo!(),
309 // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
310 // decoded as Latin-1 (actually bytes interpreted as Unicode code points).
311 fn default_decode(s: &[u8]) -> Cow<str> {
312 from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
315 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
316 pub enum Compression {
322 fn offsets(&self) -> Range<u64>;
326 pub struct HeaderRecord<S>
331 pub offsets: Range<u64>,
336 /// Eye-catcher string, product name, in the file's encoding. Padded
337 /// on the right with spaces.
340 /// Layout code, normally either 2 or 3.
341 pub layout_code: u32,
343 /// Number of variable positions, or `None` if the value in the file is
344 /// questionably trustworthy.
345 pub nominal_case_size: Option<u32>,
/// Compression type, if any.
348 pub compression: Option<Compression>,
350 /// 1-based variable index of the weight variable, or `None` if the file is
352 pub weight_index: Option<u32>,
354 /// Claimed number of cases, if known.
355 pub n_cases: Option<u32>,
357 /// Compression bias, usually 100.0.
360 /// `dd mmm yy` in the file's encoding.
361 pub creation_date: S,
363 /// `HH:MM:SS` in the file's encoding.
364 pub creation_time: S,
366 /// File label, in the file's encoding. Padded on the right with spaces.
369 /// Endianness of the data in the file header.
373 impl<S> HeaderRecord<S>
377 fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
381 writeln!(f, "{name:>17}: {:?}", value)
385 impl<S> Debug for HeaderRecord<S>
389 fn fmt(&self, f: &mut Formatter) -> FmtResult {
390 writeln!(f, "File header record:")?;
391 self.debug_field(f, "Magic", self.magic)?;
392 self.debug_field(f, "Product name", &self.eye_catcher)?;
393 self.debug_field(f, "Layout code", self.layout_code)?;
394 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
395 self.debug_field(f, "Compression", self.compression)?;
396 self.debug_field(f, "Weight index", self.weight_index)?;
397 self.debug_field(f, "Number of cases", self.n_cases)?;
398 self.debug_field(f, "Compression bias", self.bias)?;
399 self.debug_field(f, "Creation date", &self.creation_date)?;
400 self.debug_field(f, "Creation time", &self.creation_time)?;
401 self.debug_field(f, "File label", &self.file_label)?;
402 self.debug_field(f, "Endianness", self.endian)
406 impl HeaderRecord<RawString> {
407 fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
408 let start = r.stream_position()?;
410 let magic: [u8; 4] = read_bytes(r)?;
411 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
413 let eye_catcher = RawString(read_vec(r, 60)?);
414 let layout_code: [u8; 4] = read_bytes(r)?;
415 let endian = Endian::identify_u32(2, layout_code)
416 .or_else(|| Endian::identify_u32(2, layout_code))
417 .ok_or_else(|| Error::NotASystemFile)?;
418 let layout_code = endian.parse(layout_code);
420 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
421 let nominal_case_size =
422 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
424 let compression_code: u32 = endian.parse(read_bytes(r)?);
425 let compression = match (magic, compression_code) {
426 (Magic::Zsav, 2) => Some(Compression::ZLib),
427 (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
429 (_, 1) => Some(Compression::Simple),
430 (_, code) => return Err(Error::InvalidSavCompression(code)),
433 let weight_index: u32 = endian.parse(read_bytes(r)?);
434 let weight_index = (weight_index > 0).then_some(weight_index);
436 let n_cases: u32 = endian.parse(read_bytes(r)?);
437 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
439 let bias: f64 = endian.parse(read_bytes(r)?);
441 let creation_date = RawString(read_vec(r, 9)?);
442 let creation_time = RawString(read_vec(r, 8)?);
443 let file_label = RawString(read_vec(r, 64)?);
444 let _: [u8; 3] = read_bytes(r)?;
447 offsets: start..r.stream_position()?,
463 pub fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord<'a> {
464 let eye_catcher = decoder.decode(&self.eye_catcher);
465 let file_label = decoder.decode(&self.file_label);
466 let creation_date = decoder.decode(&self.creation_date);
467 let creation_time = decoder.decode(&self.creation_time);
468 DecodedRecord::Header(HeaderRecord {
470 weight_index: self.weight_index,
471 n_cases: self.n_cases,
473 offsets: self.offsets.clone(),
475 layout_code: self.layout_code,
476 nominal_case_size: self.nominal_case_size,
477 compression: self.compression,
487 pub encoding: &'static Encoding,
488 pub warn: Box<dyn Fn(Warning)>,
492 fn warn(&self, warning: Warning) {
495 fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
496 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
498 self.warn(Warning::MalformedString {
499 encoding: self.encoding.name().into(),
500 text: output.clone().into(),
506 fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
507 self.decode_slice(input.0.as_slice())
510 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
511 /// re-encoding the result back into `self.encoding` will have exactly the
512 /// same length in bytes.
514 /// XXX warn about errors?
515 pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
516 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
517 // This is the common case. Usually there will be no errors.
520 // Unusual case. Don't bother to optimize it much.
521 let mut decoder = self.encoding.new_decoder_without_bom_handling();
522 let mut output = String::with_capacity(
524 .max_utf8_buffer_length_without_replacement(input.len())
527 let mut rest = input;
528 while !rest.is_empty() {
529 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
530 (DecoderResult::InputEmpty, _) => break,
531 (DecoderResult::OutputFull, _) => unreachable!(),
532 (DecoderResult::Malformed(a, b), consumed) => {
533 let skipped = a as usize + b as usize;
534 output.extend(repeat('?').take(skipped));
535 rest = &rest[consumed..];
539 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
544 pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
545 self.new_identifier(&self.decode(input))
548 pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
549 Identifier::new(name, self.encoding)
553 impl<S> Header for HeaderRecord<S>
557 fn offsets(&self) -> Range<u64> {
562 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
564 /// Regular system file.
567 /// System file with Zlib-compressed data.
570 /// EBCDIC-encoded system file.
575 /// Magic number for a regular system file.
576 pub const SAV: [u8; 4] = *b"$FL2";
578 /// Magic number for a system file that contains zlib-compressed data.
579 pub const ZSAV: [u8; 4] = *b"$FL3";
581 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
583 pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
586 impl Debug for Magic {
587 fn fmt(&self, f: &mut Formatter) -> FmtResult {
588 let s = match *self {
589 Magic::Sav => "$FL2",
590 Magic::Zsav => "$FL3",
591 Magic::Ebcdic => "($FL2 in EBCDIC)",
597 impl TryFrom<[u8; 4]> for Magic {
600 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
602 Magic::SAV => Ok(Magic::Sav),
603 Magic::ZSAV => Ok(Magic::Zsav),
604 Magic::EBCDIC => Ok(Magic::Ebcdic),
605 _ => Err(Error::BadMagic(value)),
610 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
617 pub fn from_width(width: VarWidth) -> VarType {
619 VarWidth::Numeric => Self::Numeric,
620 VarWidth::String(_) => Self::String,
624 pub fn opposite(self) -> VarType {
626 Self::Numeric => Self::String,
627 Self::String => Self::Numeric,
632 impl Display for VarType {
633 fn fmt(&self, f: &mut Formatter) -> FmtResult {
635 VarType::Numeric => write!(f, "numeric"),
636 VarType::String => write!(f, "string"),
641 #[derive(Copy, Clone)]
650 type RawValue = Value<RawStr<8>>;
652 impl<S> Debug for Value<S>
656 fn fmt(&self, f: &mut Formatter) -> FmtResult {
658 Value::Number(Some(number)) => write!(f, "{number:?}"),
659 Value::Number(None) => write!(f, "SYSMIS"),
660 Value::String(s) => write!(f, "{:?}", s),
666 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
668 &UntypedValue(read_bytes(r)?),
674 pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
676 VarType::String => Value::String(RawStr(raw.0)),
677 VarType::Numeric => {
678 let number: f64 = endian.parse(raw.0);
679 Value::Number((number != -f64::MAX).then_some(number))
684 fn read_case<R: Read + Seek>(
686 var_types: &[VarType],
688 ) -> Result<Option<Vec<Self>>, Error> {
689 let case_start = reader.stream_position()?;
690 let mut values = Vec::with_capacity(var_types.len());
691 for (i, &var_type) in var_types.iter().enumerate() {
692 let Some(raw) = try_read_bytes(reader)? else {
696 let offset = reader.stream_position()?;
697 return Err(Error::EofInCase {
699 case_ofs: offset - case_start,
700 case_len: var_types.len() * 8,
704 values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
709 fn read_compressed_case<R: Read + Seek>(
711 var_types: &[VarType],
712 codes: &mut VecDeque<u8>,
715 ) -> Result<Option<Vec<Self>>, Error> {
716 let case_start = reader.stream_position()?;
717 let mut values = Vec::with_capacity(var_types.len());
718 for (i, &var_type) in var_types.iter().enumerate() {
720 let Some(code) = codes.pop_front() else {
721 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
725 let offset = reader.stream_position()?;
726 return Err(Error::EofInCompressedCase {
728 case_ofs: offset - case_start,
732 codes.extend(new_codes.into_iter());
737 1..=251 => match var_type {
738 VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
740 break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
747 let offset = reader.stream_position()?;
748 return Err(Error::PartialCompressedCase {
750 case_ofs: offset - case_start,
755 break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
757 254 => match var_type {
758 VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
759 VarType::Numeric => {
760 return Err(Error::CompressedStringExpected {
762 case_ofs: reader.stream_position()? - case_start,
766 255 => match var_type {
767 VarType::Numeric => break Self::Number(None),
769 return Err(Error::CompressedNumberExpected {
771 case_ofs: reader.stream_position()? - case_start,
782 fn decode(&self, decoder: &Decoder) -> Value<String> {
784 Self::Number(x) => Value::Number(*x),
785 Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
790 struct ZlibDecodeMultiple<R>
794 reader: Option<ZlibDecoder<R>>,
797 impl<R> ZlibDecodeMultiple<R>
801 fn new(reader: R) -> ZlibDecodeMultiple<R> {
803 reader: Some(ZlibDecoder::new(reader)),
808 impl<R> Read for ZlibDecodeMultiple<R>
812 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
814 match self.reader.as_mut().unwrap().read(buf)? {
816 let inner = self.reader.take().unwrap().into_inner();
817 self.reader = Some(ZlibDecoder::new(inner));
825 impl<R> Seek for ZlibDecodeMultiple<R>
829 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
830 self.reader.as_mut().unwrap().get_mut().seek(pos)
839 ztrailer_offset: u64,
848 R: Read + Seek + 'static,
851 warn: Box<dyn Fn(Warning)>,
853 header: HeaderRecord<RawString>,
854 var_types: Vec<VarType>,
861 R: Read + Seek + 'static,
863 pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
865 F: Fn(Warning) + 'static,
867 let header = HeaderRecord::read(&mut reader)?;
869 reader: Some(reader),
870 warn: Box::new(warn),
872 var_types: Vec::new(),
873 state: ReaderState::Start,
876 fn cases(&mut self) -> Cases {
877 self.state = ReaderState::End;
879 self.reader.take().unwrap(),
880 take(&mut self.var_types),
886 impl<R> Iterator for Reader<R>
888 R: Read + Seek + 'static,
890 type Item = Result<Record, Error>;
892 fn next(&mut self) -> Option<Self::Item> {
894 ReaderState::Start => {
895 self.state = ReaderState::Headers;
896 Some(Ok(Record::Header(self.header.clone())))
898 ReaderState::Headers => {
901 self.reader.as_mut().unwrap(),
903 self.var_types.as_slice(),
906 Ok(Some(record)) => break record,
908 Err(error) => return Some(Err(error)),
912 Record::Variable(VariableRecord { width, .. }) => {
913 self.var_types.push(if width == 0 {
919 Record::EndOfHeaders(_) => {
920 self.state = if let Some(Compression::ZLib) = self.header.compression {
921 ReaderState::ZlibHeader
930 ReaderState::ZlibHeader => {
931 let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
933 Ok(zheader) => zheader,
934 Err(error) => return Some(Err(error)),
936 self.state = ReaderState::ZlibTrailer {
937 ztrailer_offset: zheader.ztrailer_offset,
938 ztrailer_len: zheader.ztrailer_len,
940 Some(Ok(Record::ZHeader(zheader)))
942 ReaderState::ZlibTrailer {
946 match ZTrailer::read(
947 self.reader.as_mut().unwrap(),
952 Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
953 Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
954 Err(error) => Some(Err(error)),
957 ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
958 ReaderState::End => None,
/// Combination of [`Read`] and [`Seek`] under a single name, so that a reader
/// supporting both can be stored as one trait object (`dyn ReadSeek`).
trait ReadSeek: Read + Seek {}

/// Every sized type that implements both `Read` and `Seek` is a `ReadSeek`.
impl<T: Read + Seek> ReadSeek for T {}
967 reader: Box<dyn ReadSeek>,
968 var_types: Vec<VarType>,
969 compression: Option<Compression>,
976 impl Debug for Cases {
977 fn fmt(&self, f: &mut Formatter) -> FmtResult {
983 fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
985 R: Read + Seek + 'static,
988 reader: if header.compression == Some(Compression::ZLib) {
989 Box::new(ZlibDecodeMultiple::new(reader))
994 compression: header.compression,
996 endian: header.endian,
997 codes: VecDeque::with_capacity(8),
1003 impl Iterator for Cases {
1004 type Item = Result<Vec<RawValue>, Error>;
1006 fn next(&mut self) -> Option<Self::Item> {
1011 let retval = if self.compression.is_some() {
1012 Value::read_compressed_case(
1021 Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
1023 self.eof = matches!(retval, None | Some(Err(_)));
1028 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
1029 pub struct Spec(pub u32);
1031 impl Debug for Spec {
1032 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1033 let type_ = format_name(self.0 >> 16);
1034 let w = (self.0 >> 8) & 0xff;
1035 let d = self.0 & 0xff;
1036 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
1040 fn format_name(type_: u32) -> Cow<'static, str> {
1079 _ => return format!("<unknown format {type_}>").into(),
1085 pub struct MissingValues<S = String>
1089 /// Individual missing values, up to 3 of them.
1090 pub values: Vec<Value<S>>,
1092 /// Optional range of missing values.
1093 pub range: Option<(Value<S>, Value<S>)>,
1096 impl<S> Debug for MissingValues<S>
1100 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1101 for (i, value) in self.values.iter().enumerate() {
1105 write!(f, "{value:?}")?;
1108 if let Some((low, high)) = &self.range {
1109 if !self.values.is_empty() {
1112 write!(f, "{low:?} THRU {high:?}")?;
1115 if self.is_empty() {
1123 impl<S> MissingValues<S>
1127 fn is_empty(&self) -> bool {
1128 self.values.is_empty() && self.range.is_none()
1132 impl<S> Default for MissingValues<S>
1136 fn default() -> Self {
1144 impl MissingValues<RawStr<8>> {
1145 fn read<R: Read + Seek>(
1151 ) -> Result<Self, Error> {
1152 let (n_values, has_range) = match (width, code) {
1153 (_, 0..=3) => (code, false),
1154 (0, -2) => (0, true),
1155 (0, -3) => (1, true),
1156 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
1157 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
1160 let var_type = if width == 0 {
1166 let mut values = Vec::new();
1167 for _ in 0..n_values {
1168 values.push(RawValue::read(r, var_type, endian)?);
1170 let range = if has_range {
1171 let low = RawValue::read(r, var_type, endian)?;
1172 let high = RawValue::read(r, var_type, endian)?;
1177 Ok(Self { values, range })
1179 fn decode<'a>(&'a self, decoder: &Decoder) -> MissingValues<String> {
1184 .map(|value| value.decode(decoder))
1189 .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
1195 pub struct VariableRecord<S, V>
1200 /// Range of offsets in file.
1201 pub offsets: Range<u64>,
1203 /// Variable width, in the range -1..=255.
1206 /// Variable name, padded on the right with spaces.
1210 pub print_format: Spec,
1213 pub write_format: Spec,
1216 pub missing_values: MissingValues<V>,
1218 /// Optional variable label.
1219 pub label: Option<S>,
1222 impl<S, V> Debug for VariableRecord<S, V>
1227 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1232 match self.width.cmp(&0) {
1233 Ordering::Greater => "string",
1234 Ordering::Equal => "numeric",
1235 Ordering::Less => "long string continuation record",
1238 writeln!(f, "Print format: {:?}", self.print_format)?;
1239 writeln!(f, "Write format: {:?}", self.write_format)?;
1240 writeln!(f, "Name: {:?}", &self.name)?;
1241 writeln!(f, "Variable label: {:?}", self.label)?;
1242 writeln!(f, "Missing values: {:?}", self.missing_values)
1246 impl VariableRecord<RawString, RawStr<8>> {
1247 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1248 let start_offset = r.stream_position()?;
1249 let width: i32 = endian.parse(read_bytes(r)?);
1250 let code_offset = r.stream_position()?;
1251 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
1252 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
1253 let print_format = Spec(endian.parse(read_bytes(r)?));
1254 let write_format = Spec(endian.parse(read_bytes(r)?));
1255 let name = RawString(read_vec(r, 8)?);
1257 let label = match has_variable_label {
1260 let len: u32 = endian.parse(read_bytes(r)?);
1261 let read_len = len.min(65535) as usize;
1262 let label = RawString(read_vec(r, read_len)?);
1264 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
1265 let _ = read_vec(r, padding_bytes as usize)?;
1270 return Err(Error::BadVariableLabelCode {
1273 code: has_variable_label,
1278 let missing_values =
1279 MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
1281 let end_offset = r.stream_position()?;
1283 Ok(Record::Variable(VariableRecord {
1284 offsets: start_offset..end_offset,
1294 pub fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord {
1295 DecodedRecord::Variable(VariableRecord {
1296 offsets: self.offsets.clone(),
1298 name: decoder.decode(&self.name),
1299 print_format: self.print_format,
1300 write_format: self.write_format,
1301 missing_values: self.missing_values.decode(decoder),
1302 label: self.label.as_ref().map(|label| decoder.decode(label)),
1307 #[derive(Copy, Clone)]
1308 pub struct UntypedValue(pub [u8; 8]);
1310 impl Debug for UntypedValue {
1311 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1312 let little: f64 = Endian::Little.parse(self.0);
1313 let little = format!("{:?}", little);
1314 let big: f64 = Endian::Big.parse(self.0);
1315 let big = format!("{:?}", big);
1316 let number = if little.len() <= big.len() {
1321 write!(f, "{number}")?;
1323 let string = default_decode(&self.0);
1325 .split(|c: char| c == '\0' || c.is_control())
1328 write!(f, "{string:?}")?;
1334 pub struct RawString(pub Vec<u8>);
1336 impl From<Vec<u8>> for RawString {
1337 fn from(source: Vec<u8>) -> Self {
1342 impl From<&[u8]> for RawString {
1343 fn from(source: &[u8]) -> Self {
1348 impl Debug for RawString {
1349 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1350 write!(f, "{:?}", default_decode(self.0.as_slice()))
1354 #[derive(Copy, Clone)]
1355 pub struct RawStr<const N: usize>(pub [u8; N]);
1357 impl<const N: usize> From<[u8; N]> for RawStr<N> {
1358 fn from(source: [u8; N]) -> Self {
1363 impl<const N: usize> Debug for RawStr<N> {
1364 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1365 write!(f, "{:?}", default_decode(&self.0))
1369 #[derive(Clone, Debug)]
1370 pub struct ValueLabel<V, S>
1375 pub value: Value<V>,
1380 pub struct ValueLabelRecord<V, S>
1385 /// Range of offsets in file.
1386 pub offsets: Range<u64>,
1389 pub labels: Vec<ValueLabel<V, S>>,
/// The 1-based indexes of the variables that the labels apply to.
1392 pub dict_indexes: Vec<u32>,
1394 /// The types of the variables.
1395 pub var_type: VarType,
1398 impl<V, S> Debug for ValueLabelRecord<V, S>
1403 fn fmt(&self, f: &mut Formatter) -> FmtResult {
1404 writeln!(f, "labels: ")?;
1405 for label in self.labels.iter() {
1406 writeln!(f, "{label:?}")?;
1408 write!(f, "apply to {} variables", self.var_type)?;
1409 for dict_index in self.dict_indexes.iter() {
1410 write!(f, " #{dict_index}")?;
1416 impl<V, S> Header for ValueLabelRecord<V, S>
1421 fn offsets(&self) -> Range<u64> {
1422 self.offsets.clone()
1426 impl<V, S> ValueLabelRecord<V, S>
1431 /// Maximum number of value labels in a record.
1432 pub const MAX_LABELS: u32 = u32::MAX / 8;
1434 /// Maximum number of variable indexes in a record.
1435 pub const MAX_INDEXES: u32 = u32::MAX / 8;
1438 impl ValueLabelRecord<RawStr<8>, RawString> {
1439 fn read<R: Read + Seek>(
1442 var_types: &[VarType],
1443 warn: &Box<dyn Fn(Warning)>,
1444 ) -> Result<Option<Record>, Error> {
1445 let label_offset = r.stream_position()?;
1446 let n: u32 = endian.parse(read_bytes(r)?);
1447 if n > Self::MAX_LABELS {
1448 return Err(Error::BadNumberOfValueLabels {
1449 offset: label_offset,
1451 max: Self::MAX_LABELS,
1455 let mut labels = Vec::new();
1457 let value = UntypedValue(read_bytes(r)?);
1458 let label_len: u8 = endian.parse(read_bytes(r)?);
1459 let label_len = label_len as usize;
1460 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
1462 let mut label = read_vec(r, padded_len - 1)?;
1463 label.truncate(label_len);
1464 labels.push((value, RawString(label)));
1467 let index_offset = r.stream_position()?;
1468 let rec_type: u32 = endian.parse(read_bytes(r)?);
1470 return Err(Error::ExpectedVarIndexRecord {
1471 offset: index_offset,
1476 let n: u32 = endian.parse(read_bytes(r)?);
1477 if n > Self::MAX_INDEXES {
1478 return Err(Error::TooManyVarIndexes {
1479 offset: index_offset,
1481 max: Self::MAX_INDEXES,
1485 let index_offset = r.stream_position()?;
1486 let mut dict_indexes = Vec::with_capacity(n as usize);
1487 let mut invalid_indexes = Vec::new();
1489 let index: u32 = endian.parse(read_bytes(r)?);
1490 if index == 0 || index as usize > var_types.len() {
1491 dict_indexes.push(index);
1493 invalid_indexes.push(index);
1496 if !invalid_indexes.is_empty() {
1497 warn(Warning::InvalidVarIndexes {
1498 offset: index_offset,
1499 max: var_types.len(),
1500 invalid: invalid_indexes,
1504 let Some(&first_index) = dict_indexes.first() else {
1505 warn(Warning::NoVarIndexes {
1506 offset: index_offset,
1510 let var_type = var_types[first_index as usize - 1];
1511 let mut wrong_type_indexes = Vec::new();
1512 dict_indexes.retain(|&index| {
1513 if var_types[index as usize - 1] != var_type {
1514 wrong_type_indexes.push(index);
1520 if !wrong_type_indexes.is_empty() {
1521 warn(Warning::MixedVarTypes {
1522 offset: index_offset,
1524 wrong_types: wrong_type_indexes,
1530 .map(|(value, label)| ValueLabel {
1531 value: Value::from_raw(&value, var_type, endian),
1536 let end_offset = r.stream_position()?;
1537 Ok(Some(Record::ValueLabel(ValueLabelRecord {
1538 offsets: label_offset..end_offset,
1545 fn decode<'a>(&'a self, decoder: &Decoder) -> ValueLabelRecord<RawStr<8>, Cow<'a, str>> {
1549 .map(|ValueLabel { value, label }| ValueLabel {
1550 value: value.clone(),
1551 label: decoder.decode(label),
1555 offsets: self.offsets.clone(),
1557 dict_indexes: self.dict_indexes.clone(),
1558 var_type: self.var_type,
1563 #[derive(Clone, Debug)]
1564 pub struct DocumentRecord<S>
1568 pub offsets: Range<u64>,
1570 /// The document, as an array of 80-byte lines.
1574 pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;
1576 /// Length of a line in a document. Document lines are fixed-length and
1577 /// padded on the right with spaces.
1578 pub const DOC_LINE_LEN: usize = 80;
1580 impl DocumentRecord<RawDocumentLine> {
1581 /// Maximum number of lines we will accept in a document. This is simply
1582 /// the maximum number that will fit in a 32-bit space.
1583 pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
1585 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
1586 let start_offset = r.stream_position()?;
1587 let n: u32 = endian.parse(read_bytes(r)?);
1589 if n > Self::MAX_LINES {
1590 Err(Error::BadDocumentLength {
1591 offset: start_offset,
1593 max: Self::MAX_LINES,
1596 let mut lines = Vec::with_capacity(n);
1598 lines.push(RawStr(read_bytes(r)?));
1600 let end_offset = r.stream_position()?;
1601 Ok(Record::Document(DocumentRecord {
1602 offsets: start_offset..end_offset,
1608 pub fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord {
1609 DecodedRecord::Document(DocumentRecord {
1610 offsets: self.offsets.clone(),
1614 .map(|s| decoder.decode_slice(&s.0))
1620 impl<S> Header for DocumentRecord<S>
1624 fn offsets(&self) -> Range<u64> {
1625 self.offsets.clone()
1629 trait ExtensionRecord {
1631 const SIZE: Option<u32>;
1632 const COUNT: Option<u32>;
1633 const NAME: &'static str;
1634 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning>;
1637 #[derive(Clone, Debug)]
1638 pub struct IntegerInfoRecord {
1639 pub offsets: Range<u64>,
1640 pub version: (i32, i32, i32),
1641 pub machine_code: i32,
1642 pub floating_point_rep: i32,
1643 pub compression_code: i32,
1644 pub endianness: i32,
1645 pub character_code: i32,
1648 impl ExtensionRecord for IntegerInfoRecord {
1649 const SUBTYPE: u32 = 3;
1650 const SIZE: Option<u32> = Some(4);
1651 const COUNT: Option<u32> = Some(8);
1652 const NAME: &'static str = "integer record";
1654 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
1655 ext.check_size::<Self>()?;
1657 let mut input = &ext.data[..];
1658 let data: Vec<i32> = (0..8)
1659 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1661 Ok(Record::IntegerInfo(IntegerInfoRecord {
1662 offsets: ext.offsets.clone(),
1663 version: (data[0], data[1], data[2]),
1664 machine_code: data[3],
1665 floating_point_rep: data[4],
1666 compression_code: data[5],
1667 endianness: data[6],
1668 character_code: data[7],
1673 #[derive(Clone, Debug)]
1674 pub struct FloatInfoRecord {
1680 impl ExtensionRecord for FloatInfoRecord {
1681 const SUBTYPE: u32 = 4;
1682 const SIZE: Option<u32> = Some(8);
1683 const COUNT: Option<u32> = Some(3);
1684 const NAME: &'static str = "floating point record";
1686 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
1687 ext.check_size::<Self>()?;
1689 let mut input = &ext.data[..];
1690 let data: Vec<f64> = (0..3)
1691 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1693 Ok(Record::FloatInfo(FloatInfoRecord {
1701 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1702 pub enum CategoryLabels {
1707 #[derive(Clone, Debug)]
1708 pub enum MultipleResponseType {
1711 labels: CategoryLabels,
1716 impl MultipleResponseType {
1717 fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
1718 let (mr_type, input) = match input.split_first() {
1719 Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
1720 Some((b'D', input)) => {
1721 let (value, input) = parse_counted_string(input)?;
1723 MultipleResponseType::MultipleDichotomy {
1725 labels: CategoryLabels::VarLabels,
1730 Some((b'E', input)) => {
1731 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1732 (CategoryLabels::CountedValues, rest)
1733 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1734 (CategoryLabels::VarLabels, rest)
1736 return Err(Warning::TBD);
1738 let (value, input) = parse_counted_string(input)?;
1740 MultipleResponseType::MultipleDichotomy { value, labels },
1744 _ => return Err(Warning::TBD),
1746 Ok((mr_type, input))
1750 #[derive(Clone, Debug)]
1751 pub struct MultipleResponseSet<I, S>
1758 pub mr_type: MultipleResponseType,
1759 pub short_names: Vec<I>,
1762 impl MultipleResponseSet<RawString, RawString> {
1763 fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
1764 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1765 return Err(Warning::TBD);
1767 let (name, input) = input.split_at(equals);
1768 let (mr_type, input) = MultipleResponseType::parse(input)?;
1769 let Some(input) = input.strip_prefix(b" ") else {
1770 return Err(Warning::TBD);
1772 let (label, mut input) = parse_counted_string(input)?;
1773 let mut vars = Vec::new();
1774 while input.first() != Some(&b'\n') {
1775 match input.split_first() {
1776 Some((b' ', rest)) => {
1777 let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
1778 return Err(Warning::TBD);
1780 let (var, rest) = rest.split_at(length);
1781 if !var.is_empty() {
1782 vars.push(var.into());
1786 _ => return Err(Warning::TBD),
1789 while input.first() == Some(&b'\n') {
1790 input = &input[1..];
1793 MultipleResponseSet {
1806 ) -> Result<MultipleResponseSet<Identifier, Cow<'a, str>>, Warning> {
1807 let mut short_names = Vec::with_capacity(self.short_names.len());
1808 for short_name in self.short_names.iter() {
1809 if let Some(short_name) = decoder
1810 .decode_identifier(short_name)
1811 .map_err(|err| Warning::InvalidMrSetName(err))
1812 .issue_warning(&decoder.warn)
1814 short_names.push(short_name);
1817 Ok(MultipleResponseSet {
1819 .decode_identifier(&self.name)
1820 .map_err(|err| Warning::InvalidMrSetVariableName(err))?,
1821 label: decoder.decode(&self.label),
1822 mr_type: self.mr_type.clone(),
1823 short_names: short_names,
1828 #[derive(Clone, Debug)]
1829 pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
1834 impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
1835 const SUBTYPE: u32 = 7;
1836 const SIZE: Option<u32> = Some(1);
1837 const COUNT: Option<u32> = None;
1838 const NAME: &'static str = "multiple response set record";
1840 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
1841 ext.check_size::<Self>()?;
1843 let mut input = &ext.data[..];
1844 let mut sets = Vec::new();
1845 while !input.is_empty() {
1846 let (set, rest) = MultipleResponseSet::parse(input)?;
1850 Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
1854 impl MultipleResponseRecord<RawString, RawString> {
1855 fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord {
1856 let mut sets = Vec::new();
1857 for set in self.0.iter() {
1858 if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) {
1862 DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
1866 fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
1867 let Some(space) = input.iter().position(|&b| b == b' ') else {
1868 return Err(Warning::TBD);
1870 let Ok(length) = from_utf8(&input[..space]) else {
1871 return Err(Warning::TBD);
1873 let Ok(length): Result<usize, _> = length.parse() else {
1874 return Err(Warning::TBD);
1877 let input = &input[space + 1..];
1878 if input.len() < length {
1879 return Err(Warning::TBD);
1882 let (string, rest) = input.split_at(length);
1883 Ok((string.into(), rest))
1886 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1894 pub fn default_for_type(var_type: VarType) -> Option<Measure> {
1896 VarType::Numeric => None,
1897 VarType::String => Some(Self::Nominal),
1901 fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
1904 1 => Ok(Some(Measure::Nominal)),
1905 2 => Ok(Some(Measure::Ordinal)),
1906 3 => Ok(Some(Measure::Scale)),
1907 _ => Err(Warning::InvalidMeasurement(source)),
1912 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1913 pub enum Alignment {
1920 fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
1923 1 => Ok(Some(Alignment::Left)),
1924 2 => Ok(Some(Alignment::Right)),
1925 3 => Ok(Some(Alignment::Center)),
1926 _ => Err(Warning::InvalidAlignment(source)),
1930 pub fn default_for_type(var_type: VarType) -> Self {
1932 VarType::Numeric => Self::Right,
1933 VarType::String => Self::Left,
1938 #[derive(Clone, Debug)]
1939 pub struct VarDisplay {
1940 pub measure: Option<Measure>,
1941 pub width: Option<u32>,
1942 pub alignment: Option<Alignment>,
1945 #[derive(Clone, Debug)]
1946 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
1948 impl VarDisplayRecord {
1949 const SUBTYPE: u32 = 11;
1955 warn: &Box<dyn Fn(Warning)>,
1956 ) -> Result<Record, Warning> {
1958 return Err(Warning::BadRecordSize {
1959 offset: ext.offsets.start,
1960 record: String::from("variable display record"),
1966 let has_width = if ext.count as usize == 3 * n_vars {
1968 } else if ext.count as usize == 2 * n_vars {
1971 return Err(Warning::TBD);
1974 let mut var_displays = Vec::new();
1975 let mut input = &ext.data[..];
1976 for _ in 0..n_vars {
1977 let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1978 .issue_warning(&warn)
1980 let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
1981 let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
1982 .issue_warning(&warn)
1984 var_displays.push(VarDisplay {
1990 Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
1994 #[derive(Clone, Debug)]
1995 pub struct LongStringMissingValues<N, V>
2004 pub missing_values: MissingValues<V>,
2007 impl LongStringMissingValues<RawString, RawStr<8>> {
2011 ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
2012 Ok(LongStringMissingValues {
2013 var_name: decoder.decode_identifier(&self.var_name)?,
2014 missing_values: self.missing_values.decode(decoder),
2019 #[derive(Clone, Debug)]
2020 pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
2025 impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
2026 const SUBTYPE: u32 = 22;
2027 const SIZE: Option<u32> = Some(1);
2028 const COUNT: Option<u32> = None;
2029 const NAME: &'static str = "long string missing values record";
2031 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
2032 ext.check_size::<Self>()?;
2034 let mut input = &ext.data[..];
2035 let mut missing_value_set = Vec::new();
2036 while !input.is_empty() {
2037 let var_name = read_string(&mut input, endian)?;
2038 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
2039 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
2041 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
2042 return Err(Warning::BadLongMissingValueLength {
2043 record_offset: ext.offsets.start,
2048 let mut values = Vec::new();
2049 for i in 0..n_missing_values {
2050 let value: [u8; 8] = read_bytes(&mut input)?;
2051 let numeric_value: u64 = endian.parse(value);
2052 let value = if i > 0 && numeric_value == 8 {
2053 // Tolerate files written by old, buggy versions of PSPP
2054 // where we believed that the value_length was repeated
2055 // before each missing value.
2056 read_bytes(&mut input)?
2060 values.push(Value::String(RawStr(value)));
2062 let missing_values = MissingValues {
2066 missing_value_set.push(LongStringMissingValues {
2071 Ok(Record::LongStringMissingValues(
2072 LongStringMissingValueRecord(missing_value_set),
2077 impl LongStringMissingValueRecord<RawString, RawStr<8>> {
2081 ) -> LongStringMissingValueRecord<Identifier, String> {
2082 let mut mvs = Vec::with_capacity(self.0.len());
2083 for mv in self.0.iter() {
2084 if let Some(mv) = mv
2086 .map_err(|err| Warning::InvalidLongStringMissingValueVariableName(err))
2087 .issue_warning(&decoder.warn)
2092 LongStringMissingValueRecord(mvs)
2096 #[derive(Clone, Debug)]
2097 pub struct EncodingRecord(pub String);
2099 impl ExtensionRecord for EncodingRecord {
2100 const SUBTYPE: u32 = 20;
2101 const SIZE: Option<u32> = Some(1);
2102 const COUNT: Option<u32> = None;
2103 const NAME: &'static str = "encoding record";
2105 fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
2106 ext.check_size::<Self>()?;
2108 Ok(Record::Encoding(EncodingRecord(
2109 String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
2110 offset: ext.offsets.start,
2116 #[derive(Copy, Clone, Debug)]
2117 pub struct NumberOfCasesRecord {
2118 /// Always observed as 1.
2121 /// Number of cases.
2125 impl ExtensionRecord for NumberOfCasesRecord {
2126 const SUBTYPE: u32 = 16;
2127 const SIZE: Option<u32> = Some(8);
2128 const COUNT: Option<u32> = Some(2);
2129 const NAME: &'static str = "extended number of cases record";
2131 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
2132 ext.check_size::<Self>()?;
2134 let mut input = &ext.data[..];
2135 let one = endian.parse(read_bytes(&mut input)?);
2136 let n_cases = endian.parse(read_bytes(&mut input)?);
2138 Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
2142 #[derive(Clone, Debug)]
2143 pub struct TextRecord {
2144 pub offsets: Range<u64>,
2147 pub rec_type: TextRecordType,
2149 /// The text content of the record.
2150 pub text: RawString,
2153 #[derive(Clone, Copy, Debug)]
2154 pub enum TextRecordType {
2164 fn new(extension: Extension, rec_type: TextRecordType) -> Self {
2166 offsets: extension.offsets,
2168 text: extension.data.into(),
2171 pub fn decode<'a>(&self, decoder: &Decoder) -> DecodedRecord {
2172 match self.rec_type {
2173 TextRecordType::VariableSets => {
2174 DecodedRecord::VariableSets(VariableSetRecord::decode(self, decoder))
2176 TextRecordType::ProductInfo => {
2177 DecodedRecord::ProductInfo(ProductInfoRecord::decode(self, decoder))
2179 TextRecordType::LongNames => {
2180 DecodedRecord::LongNames(LongNamesRecord::decode(self, decoder))
2182 TextRecordType::VeryLongStrings => {
2183 DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(self, decoder))
2185 TextRecordType::FileAttributes => {
2186 DecodedRecord::FileAttributes(FileAttributeRecord::decode(self, decoder))
2188 TextRecordType::VariableAttributes => {
2189 DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(self, decoder))
2195 #[derive(Clone, Debug)]
2196 pub struct VeryLongString {
2197 pub short_name: Identifier,
2201 impl VeryLongString {
2202 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
2203 let Some((short_name, length)) = input.split_once('=') else {
2204 return Err(Warning::TBD);
2206 let short_name = decoder
2207 .new_identifier(short_name)
2208 .map_err(Warning::InvalidLongStringName)?;
2209 let length = length.parse().map_err(|_| Warning::TBD)?;
2210 Ok(VeryLongString { short_name, length })
2214 #[derive(Clone, Debug)]
2215 pub struct VeryLongStringsRecord(Vec<VeryLongString>);
2217 impl VeryLongStringsRecord {
2218 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2219 let input = decoder.decode(&source.text);
2220 let mut very_long_strings = Vec::new();
2223 .map(|s| s.trim_end_matches('\t'))
2224 .filter(|s| !s.is_empty())
2226 if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) {
2227 very_long_strings.push(vls)
2230 VeryLongStringsRecord(very_long_strings)
2234 #[derive(Clone, Debug)]
2235 pub struct Attribute {
2236 pub name: Identifier,
2237 pub values: Vec<String>,
2241 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
2242 let Some((name, mut input)) = input.split_once('(') else {
2243 return Err(Warning::TBD);
2246 .new_identifier(name)
2247 .map_err(Warning::InvalidAttributeName)?;
2248 let mut values = Vec::new();
2250 let Some((value, rest)) = input.split_once('\n') else {
2251 return Err(Warning::TBD);
2253 if let Some(stripped) = value
2255 .and_then(|value| value.strip_suffix('\''))
2257 values.push(stripped.into());
2259 decoder.warn(Warning::TBD);
2260 values.push(value.into());
2262 if let Some(rest) = rest.strip_prefix(')') {
2263 let attribute = Attribute { name, values };
2264 return Ok((attribute, rest));
2271 #[derive(Clone, Debug)]
2272 pub struct AttributeSet(pub HashMap<Identifier, Vec<String>>);
2278 sentinel: Option<char>,
2279 ) -> Result<(AttributeSet, &'a str), Warning> {
2280 let mut attributes = HashMap::new();
2282 match input.chars().next() {
2283 None => break input,
2284 c if c == sentinel => break &input[1..],
2286 let (attribute, rest) = Attribute::parse(decoder, input)?;
2287 // XXX report duplicate name
2288 attributes.insert(attribute.name, attribute.values);
2293 Ok((AttributeSet(attributes), rest))
2297 impl Default for AttributeSet {
2298 fn default() -> Self {
2299 Self(HashMap::default())
2303 #[derive(Clone, Debug)]
2304 pub struct FileAttributeRecord(AttributeSet);
2306 impl FileAttributeRecord {
2307 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2308 let input = decoder.decode(&source.text);
2309 match AttributeSet::parse(decoder, &input, None).issue_warning(&decoder.warn) {
2310 Some((set, rest)) => {
2311 if !rest.is_empty() {
2312 decoder.warn(Warning::TBD);
2314 FileAttributeRecord(set)
2316 None => FileAttributeRecord::default(),
2321 impl Default for FileAttributeRecord {
2322 fn default() -> Self {
2323 Self(AttributeSet::default())
2327 #[derive(Clone, Debug)]
2328 pub struct VarAttributeSet {
2329 pub long_var_name: Identifier,
2330 pub attributes: AttributeSet,
2333 impl VarAttributeSet {
2334 fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> {
2335 let Some((long_var_name, rest)) = input.split_once(':') else {
2336 return Err(Warning::TBD);
2338 let long_var_name = decoder
2339 .new_identifier(long_var_name)
2340 .map_err(Warning::InvalidAttributeVariableName)?;
2341 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
2342 let var_attribute = VarAttributeSet {
2346 Ok((var_attribute, rest))
2350 #[derive(Clone, Debug)]
2351 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
2353 impl VariableAttributeRecord {
2354 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2355 let decoded = decoder.decode(&source.text);
2356 let mut input = decoded.as_ref();
2357 let mut var_attribute_sets = Vec::new();
2358 while !input.is_empty() {
2359 let Some((var_attribute, rest)) =
2360 VarAttributeSet::parse(decoder, &input).issue_warning(&decoder.warn)
2364 var_attribute_sets.push(var_attribute);
2365 input = rest.into();
2367 VariableAttributeRecord(var_attribute_sets)
2371 #[derive(Clone, Debug)]
2372 pub struct LongName {
2373 pub short_name: Identifier,
2374 pub long_name: Identifier,
2378 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
2379 let Some((short_name, long_name)) = input.split_once('=') else {
2380 return Err(Warning::TBD);
2382 let short_name = decoder
2383 .new_identifier(short_name)
2384 .map_err(Warning::InvalidShortName)?;
2385 let long_name = decoder
2386 .new_identifier(long_name)
2387 .map_err(Warning::InvalidLongName)?;
2395 #[derive(Clone, Debug)]
2396 pub struct LongNamesRecord(Vec<LongName>);
2398 impl LongNamesRecord {
2399 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2400 let input = decoder.decode(&source.text);
2401 let mut names = Vec::new();
2402 for pair in input.split('\t').filter(|s| !s.is_empty()) {
2403 if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) {
2404 names.push(long_name);
2407 LongNamesRecord(names)
2411 #[derive(Clone, Debug)]
2412 pub struct ProductInfoRecord(pub String);
2414 impl ProductInfoRecord {
2415 fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
2416 Self(decoder.decode(&source.text).into())
2419 #[derive(Clone, Debug)]
2420 pub struct VariableSet {
2422 pub vars: Vec<Identifier>,
2426 fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
2427 let (name, input) = input.split_once('=').ok_or(Warning::TBD)?;
2428 let mut vars = Vec::new();
2429 for var in input.split_ascii_whitespace() {
2430 if let Some(identifier) = decoder
2431 .new_identifier(var)
2432 .map_err(Warning::InvalidVariableSetName)
2433 .issue_warning(&decoder.warn)
2435 vars.push(identifier);
2445 #[derive(Clone, Debug)]
2446 pub struct VariableSetRecord {
2447 pub offsets: Range<u64>,
2448 pub sets: Vec<VariableSet>,
2451 impl VariableSetRecord {
2452 fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
2453 let mut sets = Vec::new();
2454 let input = decoder.decode(&source.text);
2455 for line in input.lines() {
2456 if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) {
2461 offsets: source.offsets.clone(),
2467 trait IssueWarning<T> {
2468 fn issue_warning<F>(self, warn: &F) -> Option<T>
2472 impl<T> IssueWarning<T> for Result<T, Warning> {
2473 fn issue_warning<F>(self, warn: &F) -> Option<T>
2478 Ok(result) => Some(result),
2487 #[derive(Clone, Debug)]
2488 pub struct Extension {
2489 pub offsets: Range<u64>,
2494 /// Size of each data element.
2497 /// Number of data elements.
2500 /// `size * count` bytes of data.
2505 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Warning> {
2506 if let Some(expected_size) = E::SIZE {
2507 if self.size != expected_size {
2508 return Err(Warning::BadRecordSize {
2509 offset: self.offsets.start,
2510 record: E::NAME.into(),
2516 if let Some(expected_count) = E::COUNT {
2517 if self.count != expected_count {
2518 return Err(Warning::BadRecordCount {
2519 offset: self.offsets.start,
2520 record: E::NAME.into(),
2529 fn read<R: Read + Seek>(
2533 warn: &Box<dyn Fn(Warning)>,
2534 ) -> Result<Option<Record>, Error> {
2535 let subtype = endian.parse(read_bytes(r)?);
2536 let header_offset = r.stream_position()?;
2537 let size: u32 = endian.parse(read_bytes(r)?);
2538 let count = endian.parse(read_bytes(r)?);
2539 let Some(product) = size.checked_mul(count) else {
2540 return Err(Error::ExtensionRecordTooLarge {
2541 offset: header_offset,
2547 let start_offset = r.stream_position()?;
2548 let data = read_vec(r, product as usize)?;
2549 let end_offset = start_offset + product as u64;
2550 let extension = Extension {
2551 offsets: start_offset..end_offset,
2557 let result = match subtype {
2558 IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
2559 FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
2560 VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
2561 MultipleResponseRecord::SUBTYPE | 19 => {
2562 MultipleResponseRecord::parse(&extension, endian)
2564 LongStringValueLabelRecord::SUBTYPE => {
2565 LongStringValueLabelRecord::parse(&extension, endian)
2567 EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
2568 NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
2569 5 => Ok(Record::Text(TextRecord::new(
2571 TextRecordType::VariableSets,
2573 10 => Ok(Record::Text(TextRecord::new(
2575 TextRecordType::ProductInfo,
2577 13 => Ok(Record::Text(TextRecord::new(
2579 TextRecordType::LongNames,
2581 14 => Ok(Record::Text(TextRecord::new(
2583 TextRecordType::VeryLongStrings,
2585 17 => Ok(Record::Text(TextRecord::new(
2587 TextRecordType::FileAttributes,
2589 18 => Ok(Record::Text(TextRecord::new(
2591 TextRecordType::VariableAttributes,
2593 _ => Ok(Record::OtherExtension(extension)),
2596 Ok(result) => Ok(Some(result)),
2605 #[derive(Clone, Debug)]
2606 pub struct ZHeader {
2607 /// File offset to the start of the record.
2610 /// File offset to the ZLIB data header.
2611 pub zheader_offset: u64,
2613 /// File offset to the ZLIB trailer.
2614 pub ztrailer_offset: u64,
2616 /// Length of the ZLIB trailer in bytes.
2617 pub ztrailer_len: u64,
2621 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
2622 let offset = r.stream_position()?;
2623 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
2624 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
2625 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
2636 #[derive(Clone, Debug)]
2637 pub struct ZTrailer {
2638 /// File offset to the start of the record.
2641 /// Compression bias as a negative integer, e.g. -100.
2644 /// Always observed as zero.
2647 /// Uncompressed size of each block, except possibly the last. Only
2648 /// `0x3ff000` has been observed so far.
2649 pub block_size: u32,
/// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
2652 pub blocks: Vec<ZBlock>,
2655 #[derive(Clone, Debug)]
2657 /// Offset of block of data if simple compression were used.
2658 pub uncompressed_ofs: u64,
2660 /// Actual offset within the file of the compressed data block.
2661 pub compressed_ofs: u64,
2663 /// The number of bytes in this data block after decompression. This is
2664 /// `block_size` in every data block but the last, which may be smaller.
2665 pub uncompressed_size: u32,
2667 /// The number of bytes in this data block, as stored compressed in this
2669 pub compressed_size: u32,
2673 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
2675 uncompressed_ofs: endian.parse(read_bytes(r)?),
2676 compressed_ofs: endian.parse(read_bytes(r)?),
2677 uncompressed_size: endian.parse(read_bytes(r)?),
2678 compressed_size: endian.parse(read_bytes(r)?),
2684 fn read<R: Read + Seek>(
2689 ) -> Result<Option<ZTrailer>, Error> {
2690 let start_offset = reader.stream_position()?;
2691 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
2694 let int_bias = endian.parse(read_bytes(reader)?);
2695 let zero = endian.parse(read_bytes(reader)?);
2696 let block_size = endian.parse(read_bytes(reader)?);
2697 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
2698 let expected_n_blocks = (ztrailer_len - 24) / 24;
2699 if n_blocks as u64 != expected_n_blocks {
2700 return Err(Error::BadZlibTrailerNBlocks {
2701 offset: ztrailer_ofs,
2707 let blocks = (0..n_blocks)
2708 .map(|_| ZBlock::read(reader, endian))
2709 .collect::<Result<Vec<_>, _>>()?;
2710 reader.seek(SeekFrom::Start(start_offset))?;
2712 offset: ztrailer_ofs,
/// Reads exactly `N` bytes from `r`, treating end of input before the first
/// byte as a normal outcome.
///
/// Returns `Ok(None)` if the first read yields no bytes, and
/// `Ok(Some(bytes))` once all `N` bytes have been read.
///
/// # Errors
///
/// Fails if the underlying reader fails, or with an unexpected-EOF error if
/// input ends after at least one byte but before `N` bytes were read.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];
    // `read_exact` retries interrupted reads on its own; do the same for
    // this first read so that a signal does not surface as an I/O failure.
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    };
    if n == 0 {
        return Ok(None);
    }
    if n < N {
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// # Errors
///
/// Fails if the reader fails or input ends before `N` bytes were read.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    match r.read_exact(&mut bytes) {
        Ok(()) => Ok(bytes),
        Err(error) => Err(error),
    }
}
/// Reads exactly `n` bytes from `r` into a new vector.
///
/// `n` usually comes from a length field in the file being read, so the
/// vector is grown as data arrives rather than being allocated in full up
/// front.  A corrupt or hostile length then fails with an error instead of
/// attempting a huge allocation.
///
/// # Errors
///
/// Fails if the reader fails, or with an unexpected-EOF error if input ends
/// before `n` bytes were read.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    // Upper bound on the space reserved before any data has been seen.
    const MAX_PREALLOC: usize = 64 * 1024;

    let mut vec = Vec::with_capacity(n.min(MAX_PREALLOC));
    let n_read = r.by_ref().take(n as u64).read_to_end(&mut vec)?;
    if n_read < n {
        return Err(IoError::new(
            std::io::ErrorKind::UnexpectedEof,
            "failed to fill whole buffer",
        ));
    }
    Ok(vec)
}
2746 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
2747 let length: u32 = endian.parse(read_bytes(r)?);
2748 Ok(read_vec(r, length as usize)?.into())
2751 #[derive(Clone, Debug)]
2752 pub struct LongStringValueLabels<N, S>
2759 /// `(value, label)` pairs, where each value is `width` bytes.
2760 pub labels: Vec<(S, S)>,
2763 impl LongStringValueLabels<RawString, RawString> {
2767 ) -> Result<LongStringValueLabels<Identifier, Cow<'a, str>>, Warning> {
2768 let var_name = decoder.decode(&self.var_name);
2769 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
2770 .map_err(Warning::InvalidLongStringValueLabelName)?;
2772 let mut labels = Vec::with_capacity(self.labels.len());
2773 for (value, label) in self.labels.iter() {
2774 let value = decoder.decode_exact_length(&value.0);
2775 let label = decoder.decode(&label);
2776 labels.push((value, label));
2779 Ok(LongStringValueLabels {
2787 #[derive(Clone, Debug)]
2788 pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
2793 impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
2794 const SUBTYPE: u32 = 21;
2795 const SIZE: Option<u32> = Some(1);
2796 const COUNT: Option<u32> = None;
2797 const NAME: &'static str = "long string value labels record";
2799 fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
2800 ext.check_size::<Self>()?;
2802 let mut input = &ext.data[..];
2803 let mut label_set = Vec::new();
2804 while !input.is_empty() {
2805 let var_name = read_string(&mut input, endian)?;
2806 let width: u32 = endian.parse(read_bytes(&mut input)?);
2807 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
2808 let mut labels = Vec::new();
2809 for _ in 0..n_labels {
2810 let value = read_string(&mut input, endian)?;
2811 let label = read_string(&mut input, endian)?;
2812 labels.push((value, label));
2814 label_set.push(LongStringValueLabels {
2820 Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
2826 impl LongStringValueLabelRecord<RawString, RawString> {
2830 ) -> LongStringValueLabelRecord<Identifier, Cow<'a, str>> {
2831 let mut labels = Vec::with_capacity(self.0.len());
2832 for label in &self.0 {
2833 match label.decode(decoder) {
2834 Ok(set) => labels.push(set),
2835 Err(error) => decoder.warn(error),
2838 LongStringValueLabelRecord(labels)