1 use crate::endian::{Endian, Parse, ToBytes};
4 use flate2::read::ZlibDecoder;
6 use std::fmt::{Debug, Formatter, Result as FmtResult};
7 use std::str::from_utf8;
10 io::{Error as IoError, Read, Seek, SeekFrom},
14 use self::state::State;
16 #[derive(Copy, Clone, Debug)]
17 pub enum Compression {
22 #[derive(Clone, Debug)]
27 ValueLabel(ValueLabel),
28 VarIndexes(VarIndexes),
37 fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
38 let rec_type: u32 = endian.parse(read_bytes(reader)?);
40 2 => Ok(Record::Variable(Variable::read(reader, endian)?)),
41 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)),
42 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)),
43 6 => Ok(Record::Document(Document::read(reader, endian)?)),
44 7 => Ok(Record::Extension(Extension::read(reader, endian)?)),
45 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
46 _ => Err(Error::BadRecordType {
47 offset: reader.stream_position()?,
54 pub struct FallbackEncoding<'a>(&'a [u8]);
56 impl<'a> Debug for FallbackEncoding<'a> {
57 fn fmt(&self, f: &mut Formatter) -> FmtResult {
58 if let Ok(s) = from_utf8(self.0) {
65 .map(|c| char::from(*c).escape_default())
79 /// Eye-catcher string, product name, in the file's encoding. Padded
80 /// on the right with spaces.
81 pub eye_catcher: [u8; 60],
83 /// Layout code, normally either 2 or 3.
86 /// Number of variable positions, or `None` if the value in the file is
87 /// questionably trustworthy.
88 pub nominal_case_size: Option<u32>,
/// Compression type, if any.
91 pub compression: Option<Compression>,
93 /// 0-based variable index of the weight variable, or `None` if the file is
95 pub weight_index: Option<u32>,
97 /// Claimed number of cases, if known.
98 pub n_cases: Option<u32>,
100 /// Compression bias, usually 100.0.
103 /// `dd mmm yy` in the file's encoding.
104 pub creation_date: [u8; 9],
106 /// `HH:MM:SS` in the file's encoding.
107 pub creation_time: [u8; 8],
109 /// File label, in the file's encoding. Padded on the right with spaces.
110 pub file_label: [u8; 64],
112 /// Endianness of the data in the file header.
117 fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
118 writeln!(f, "{name:>17}: {:?}", value)
122 impl Debug for Header {
123 fn fmt(&self, f: &mut Formatter) -> FmtResult {
124 writeln!(f, "File header record:")?;
125 self.debug_field(f, "Magic", self.magic)?;
126 self.debug_field(f, "Product name", FallbackEncoding(&self.eye_catcher))?;
127 self.debug_field(f, "Layout code", self.layout_code)?;
128 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
129 self.debug_field(f, "Compression", self.compression)?;
130 self.debug_field(f, "Weight index", self.weight_index)?;
131 self.debug_field(f, "Number of cases", self.n_cases)?;
132 self.debug_field(f, "Compression bias", self.bias)?;
133 self.debug_field(f, "Creation date", FallbackEncoding(&self.creation_date))?;
134 self.debug_field(f, "Creation time", FallbackEncoding(&self.creation_time))?;
135 self.debug_field(f, "File label", FallbackEncoding(&self.file_label))?;
136 self.debug_field(f, "Endianness", self.endian)
141 fn read<R: Read>(r: &mut R) -> Result<Header, Error> {
142 let magic: [u8; 4] = read_bytes(r)?;
143 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
145 let eye_catcher: [u8; 60] = read_bytes(r)?;
146 let layout_code: [u8; 4] = read_bytes(r)?;
147 let endian = Endian::identify_u32(2, layout_code)
148 .or_else(|| Endian::identify_u32(2, layout_code))
149 .ok_or_else(|| Error::NotASystemFile)?;
150 let layout_code = endian.parse(layout_code);
152 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
153 let nominal_case_size =
154 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
156 let compression_code: u32 = endian.parse(read_bytes(r)?);
157 let compression = match (magic, compression_code) {
158 (Magic::ZSAV, 2) => Some(Compression::ZLib),
159 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
161 (_, 1) => Some(Compression::Simple),
162 (_, code) => return Err(Error::InvalidSavCompression(code)),
165 let weight_index: u32 = endian.parse(read_bytes(r)?);
166 let weight_index = (weight_index > 0).then(|| weight_index - 1);
168 let n_cases: u32 = endian.parse(read_bytes(r)?);
169 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
171 let bias: f64 = endian.parse(read_bytes(r)?);
173 let creation_date: [u8; 9] = read_bytes(r)?;
174 let creation_time: [u8; 8] = read_bytes(r)?;
175 let file_label: [u8; 64] = read_bytes(r)?;
176 let _: [u8; 3] = read_bytes(r)?;
195 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
196 pub struct Magic([u8; 4]);
199 /// Magic number for a regular system file.
200 pub const SAV: Magic = Magic(*b"$FL2");
202 /// Magic number for a system file that contains zlib-compressed data.
203 pub const ZSAV: Magic = Magic(*b"$FL3");
/// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
207 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
210 impl Debug for Magic {
211 fn fmt(&self, f: &mut Formatter) -> FmtResult {
213 &Magic::SAV => "$FL2",
214 &Magic::ZSAV => "$FL3",
215 &Magic::EBCDIC => "($FL2 in EBCDIC)",
216 _ => return write!(f, "{:?}", self.0),
222 impl TryFrom<[u8; 4]> for Magic {
225 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
226 let magic = Magic(value);
228 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
229 _ => Err(Error::BadMagic(value)),
234 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
241 fn from_width(width: i32) -> VarType {
243 0 => VarType::Number,
244 _ => VarType::String,
251 Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer,
254 use crate::endian::Endian;
256 collections::VecDeque,
261 #[allow(clippy::type_complexity)]
262 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
265 struct Start<R: Read + Seek> {
269 pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
270 Box::new(Start { reader })
273 struct CommonState<R: Read + Seek> {
277 compression: Option<Compression>,
278 var_types: Vec<VarType>,
281 impl<R: Read + Seek + 'static> State for Start<R> {
282 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
283 let header = Header::read(&mut self.reader)?;
284 let next_state = Headers(CommonState {
286 endian: header.endian,
288 compression: header.compression,
289 var_types: Vec::new(),
291 Ok(Some((Record::Header(header), Box::new(next_state))))
295 struct Headers<R: Read + Seek>(CommonState<R>);
297 impl<R: Read + Seek + 'static> State for Headers<R> {
298 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
299 let record = Record::read(&mut self.0.reader, self.0.endian)?;
301 Record::Variable(Variable { width, .. }) => {
302 self.0.var_types.push(VarType::from_width(width));
304 Record::EndOfHeaders(_) => {
305 let next_state: Box<dyn State> = match self.0.compression {
306 None => Box::new(Data(self.0)),
307 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
308 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
310 return Ok(Some((record, next_state)));
314 Ok(Some((record, self)))
318 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
320 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
321 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
322 let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
323 Ok(Some((Record::ZHeader(zheader), self)))
327 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
329 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
330 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
331 let retval = ZTrailer::read(
334 self.1.ztrailer_offset,
337 let next_state = Box::new(CompressedData::new(CommonState {
338 reader: ZlibDecodeMultiple::new(self.0.reader),
339 endian: self.0.endian,
341 compression: self.0.compression,
342 var_types: self.0.var_types,
345 None => next_state.read(),
346 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
351 struct Data<R: Read + Seek>(CommonState<R>);
353 impl<R: Read + Seek + 'static> State for Data<R> {
354 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
355 match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
357 Some(values) => Ok(Some((Record::Case(values), self))),
362 struct CompressedData<R: Read + Seek> {
363 common: CommonState<R>,
367 impl<R: Read + Seek + 'static> CompressedData<R> {
368 fn new(common: CommonState<R>) -> CompressedData<R> {
371 codes: VecDeque::new(),
376 impl<R: Read + Seek + 'static> State for CompressedData<R> {
377 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
378 match Value::read_compressed_case(
379 &mut self.common.reader,
380 &self.common.var_types,
386 Some(values) => Ok(Some((Record::Case(values), self))),
392 #[derive(Copy, Clone)]
398 impl Debug for Value {
399 fn fmt(&self, f: &mut Formatter) -> FmtResult {
401 Value::Number(Some(number)) => write!(f, "{number:?}"),
402 Value::Number(None) => write!(f, "SYSMIS"),
403 Value::String(bytes) => write!(f, "{:?}", FallbackEncoding(bytes)),
409 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
410 Ok(Self::from_raw(var_type, read_bytes(r)?, endian))
413 pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
415 VarType::String => Value::String(raw),
417 let number: f64 = endian.parse(raw);
418 Value::Number((number != -f64::MAX).then_some(number))
423 fn read_case<R: Read + Seek>(
425 var_types: &[VarType],
427 ) -> Result<Option<Vec<Value>>, Error> {
428 let case_start = reader.stream_position()?;
429 let mut values = Vec::with_capacity(var_types.len());
430 for (i, &var_type) in var_types.iter().enumerate() {
431 let Some(raw) = try_read_bytes(reader)? else {
435 let offset = reader.stream_position()?;
436 return Err(Error::EofInCase {
438 case_ofs: offset - case_start,
439 case_len: var_types.len() * 8,
443 values.push(Value::from_raw(var_type, raw, endian));
448 fn read_compressed_case<R: Read + Seek>(
450 var_types: &[VarType],
451 codes: &mut VecDeque<u8>,
454 ) -> Result<Option<Vec<Value>>, Error> {
455 let case_start = reader.stream_position()?;
456 let mut values = Vec::with_capacity(var_types.len());
457 for (i, &var_type) in var_types.iter().enumerate() {
459 let Some(code) = codes.pop_front() else {
460 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
464 let offset = reader.stream_position()?;
465 return Err(Error::EofInCompressedCase {
467 case_ofs: offset - case_start,
471 codes.extend(new_codes.into_iter());
476 1..=251 => match var_type {
477 VarType::Number => break Value::Number(Some(code as f64 - bias)),
479 break Value::String(endian.to_bytes(code as f64 - bias))
486 let offset = reader.stream_position()?;
487 return Err(Error::PartialCompressedCase {
489 case_ofs: offset - case_start,
493 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian),
494 254 => match var_type {
495 VarType::String => break Value::String(*b" "), // XXX EBCDIC
497 return Err(Error::CompressedStringExpected {
499 case_ofs: reader.stream_position()? - case_start,
503 255 => match var_type {
504 VarType::Number => break Value::Number(None),
506 return Err(Error::CompressedNumberExpected {
508 case_ofs: reader.stream_position()? - case_start,
520 struct ZlibDecodeMultiple<R>
524 reader: Option<ZlibDecoder<R>>,
527 impl<R> ZlibDecodeMultiple<R>
531 fn new(reader: R) -> ZlibDecodeMultiple<R> {
533 reader: Some(ZlibDecoder::new(reader)),
538 impl<R> Read for ZlibDecodeMultiple<R>
542 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
544 match self.reader.as_mut().unwrap().read(buf)? {
546 let inner = self.reader.take().unwrap().into_inner();
547 self.reader = Some(ZlibDecoder::new(inner));
555 impl<R> Seek for ZlibDecodeMultiple<R>
559 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
560 self.reader.as_mut().unwrap().get_mut().seek(pos)
565 state: Option<Box<dyn State>>,
569 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
571 state: Some(state::new(reader)),
576 impl Iterator for Reader {
577 type Item = Result<Record, Error>;
579 fn next(&mut self) -> Option<Self::Item> {
580 match self.state.take()?.read() {
581 Ok(Some((record, next_state))) => {
582 self.state = Some(next_state);
586 Err(error) => Some(Err(error)),
591 impl FusedIterator for Reader {}
593 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
594 pub struct Format(pub u32);
596 impl Debug for Format {
597 fn fmt(&self, f: &mut Formatter) -> FmtResult {
598 let type_ = format_name(self.0 >> 16);
599 let w = (self.0 >> 8) & 0xff;
600 let d = self.0 & 0xff;
601 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
605 fn format_name(type_: u32) -> &'static str {
649 pub struct MissingValues {
650 /// Individual missing values, up to 3 of them.
651 pub values: Vec<Value>,
653 /// Optional range of missing values.
654 pub range: Option<(Value, Value)>,
657 impl Debug for MissingValues {
658 fn fmt(&self, f: &mut Formatter) -> FmtResult {
659 for (i, value) in self.values.iter().enumerate() {
663 write!(f, "{value:?}")?;
666 if let Some((low, high)) = self.range {
667 if !self.values.is_empty() {
670 write!(f, "{low:?} THRU {high:?}")?;
682 fn is_empty(&self) -> bool {
683 self.values.is_empty() && self.range.is_none()
686 fn read<R: Read + Seek>(
692 ) -> Result<MissingValues, Error> {
693 let (n_values, has_range) = match (width, code) {
694 (_, 0..=3) => (code, false),
695 (0, -2) => (0, true),
696 (0, -3) => (1, true),
697 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
698 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
701 let var_type = VarType::from_width(width);
703 let mut values = Vec::new();
704 for _ in 0..n_values {
705 values.push(Value::read(r, var_type, endian)?);
707 let range = if has_range {
708 let low = Value::read(r, var_type, endian)?;
709 let high = Value::read(r, var_type, endian)?;
714 Ok(MissingValues { values, range })
719 pub struct Variable {
720 /// Offset from the start of the file to the start of the record.
723 /// Variable width, in the range -1..=255.
726 /// Variable name, padded on the right with spaces.
730 pub print_format: u32,
733 pub write_format: u32,
736 pub missing_values: MissingValues,
738 /// Optional variable label.
739 pub label: Option<Vec<u8>>,
742 impl Debug for Variable {
743 fn fmt(&self, f: &mut Formatter) -> FmtResult {
750 } else if self.width == 0 {
753 "long string continuation record"
756 writeln!(f, "Print format: {:?}", Format(self.print_format))?;
757 writeln!(f, "Write format: {:?}", Format(self.write_format))?;
758 writeln!(f, "Name: {:?}", FallbackEncoding(&self.name))?;
761 "Variable label: {:?}",
764 .map(|label| FallbackEncoding(&label[..]))
766 writeln!(f, "Missing values: {:?}", self.missing_values)
771 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
772 let offset = r.stream_position()?;
773 let width: i32 = endian.parse(read_bytes(r)?);
774 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
775 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
776 let print_format: u32 = endian.parse(read_bytes(r)?);
777 let write_format: u32 = endian.parse(read_bytes(r)?);
778 let name: [u8; 8] = read_bytes(r)?;
780 let label = match has_variable_label {
783 let len: u32 = endian.parse(read_bytes(r)?);
784 let read_len = len.min(65535) as usize;
785 let label = Some(read_vec(r, read_len)?);
787 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
788 let _ = read_vec(r, padding_bytes as usize)?;
793 return Err(Error::BadVariableLabelCode {
795 code: has_variable_label,
800 let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
814 #[derive(Clone, Debug)]
815 pub struct ValueLabel {
816 /// Offset from the start of the file to the start of the record.
820 pub labels: Vec<([u8; 8], Vec<u8>)>,
824 /// Maximum number of value labels in a record.
825 pub const MAX: u32 = u32::MAX / 8;
827 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
828 let offset = r.stream_position()?;
829 let n: u32 = endian.parse(read_bytes(r)?);
830 if n > ValueLabel::MAX {
831 return Err(Error::BadNumberOfValueLabels {
834 max: ValueLabel::MAX,
838 let mut labels = Vec::new();
840 let value: [u8; 8] = read_bytes(r)?;
841 let label_len: u8 = endian.parse(read_bytes(r)?);
842 let label_len = label_len as usize;
843 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
845 let mut label = read_vec(r, padded_len)?;
846 label.truncate(label_len);
847 labels.push((value, label));
849 Ok(ValueLabel { offset, labels })
853 #[derive(Clone, Debug)]
854 pub struct VarIndexes {
855 /// Offset from the start of the file to the start of the record.
858 /// The 0-based indexes of the variable indexes.
859 pub var_indexes: Vec<u32>,
863 /// Maximum number of variable indexes in a record.
864 pub const MAX: u32 = u32::MAX / 8;
866 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
867 let offset = r.stream_position()?;
868 let n: u32 = endian.parse(read_bytes(r)?);
869 if n > VarIndexes::MAX {
870 return Err(Error::BadNumberOfVarIndexes {
873 max: VarIndexes::MAX,
876 let mut var_indexes = Vec::with_capacity(n as usize);
878 var_indexes.push(endian.parse(read_bytes(r)?));
888 #[derive(Clone, Debug)]
889 pub struct Document {
890 /// Offset from the start of the file to the start of the record.
893 /// The document, as an array of 80-byte lines.
894 pub lines: Vec<[u8; Document::LINE_LEN as usize]>,
898 /// Length of a line in a document. Document lines are fixed-length and
899 /// padded on the right with spaces.
900 pub const LINE_LEN: u32 = 80;
902 /// Maximum number of lines we will accept in a document. This is simply
903 /// the maximum number that will fit in a 32-bit space.
904 pub const MAX_LINES: u32 = i32::MAX as u32 / Self::LINE_LEN;
906 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
907 let offset = r.stream_position()?;
908 let n: u32 = endian.parse(read_bytes(r)?);
910 0..=Self::MAX_LINES => Ok(Document {
911 pos: r.stream_position()?,
913 .map(|_| read_bytes(r))
914 .collect::<Result<Vec<_>, _>>()?,
916 _ => Err(Error::BadDocumentLength {
919 max: Self::MAX_LINES,
926 #[derive(FromPrimitive)]
928 /// Machine integer info.
930 /// Machine floating-point info.
936 /// Multiple response sets.
940 /// Extra product info text.
942 /// Variable display parameters.
944 /// Long variable names.
948 /// Extended number of cases.
950 /// Data file attributes.
952 /// Variable attributes.
954 /// Multiple response sets (extended).
956 /// Character encoding.
958 /// Value labels for long strings.
960 /// Missing values for long strings.
962 /// "Format properties in dataview table".
971 const NAME: &'static str;
972 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
975 trait ExtensionRecord
979 const SIZE: Option<u32>;
980 const COUNT: Option<u32>;
981 const NAME: &'static str;
982 fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
985 pub struct IntegerInfo {
986 pub version: (i32, i32, i32),
987 pub machine_code: i32,
988 pub floating_point_rep: i32,
989 pub compression_code: i32,
991 pub character_code: i32,
994 impl ExtensionRecord for IntegerInfo {
995 const SIZE: Option<u32> = Some(4);
996 const COUNT: Option<u32> = Some(8);
997 const NAME: &'static str = "integer record";
999 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1000 ext.check_size::<Self>()?;
1002 let mut input = &ext.data[..];
1003 let data: Vec<i32> = (0..8)
1004 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1007 version: (data[0], data[1], data[2]),
1008 machine_code: data[3],
1009 floating_point_rep: data[4],
1010 compression_code: data[5],
1011 endianness: data[6],
1012 character_code: data[7],
1017 pub struct FloatInfo {
1023 impl ExtensionRecord for FloatInfo {
1024 const SIZE: Option<u32> = Some(8);
1025 const COUNT: Option<u32> = Some(3);
1026 const NAME: &'static str = "floating point record";
1028 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1029 ext.check_size::<Self>()?;
1031 let mut input = &ext.data[..];
1032 let data: Vec<f64> = (0..3)
1033 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1043 pub enum CategoryLabels {
1047 pub enum MultipleResponseType {
1050 labels: CategoryLabels,
1054 pub struct MultipleResponseSet {
1057 pub mr_type: MultipleResponseType,
1058 pub vars: Vec<Vec<u8>>,
1061 impl MultipleResponseSet {
1062 fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
1063 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1064 return Err(Error::TBD);
1066 let (name, input) = input.split_at(equals);
1067 let (mr_type, input) = match input.get(0) {
1068 Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
1070 let (value, input) = parse_counted_string(&input[1..])?;
1072 MultipleResponseType::MultipleDichotomy {
1073 value: value.into(),
1074 labels: CategoryLabels::VarLabels,
1080 let Some(b' ') = input.get(1) else {
1081 return Err(Error::TBD);
1083 let input = &input[2..];
1084 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1085 (CategoryLabels::CountedValues, rest)
1086 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1087 (CategoryLabels::VarLabels, rest)
1089 return Err(Error::TBD);
1091 let (value, input) = parse_counted_string(input)?;
1093 MultipleResponseType::MultipleDichotomy {
1094 value: value.into(),
1100 _ => return Err(Error::TBD),
1102 let Some(b' ') = input.get(0) else {
1103 return Err(Error::TBD);
1105 let (label, mut input) = parse_counted_string(&input[1..])?;
1106 let mut vars = Vec::new();
1107 while input.get(0) == Some(&b' ') {
1108 input = &input[1..];
1109 let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
1110 return Err(Error::TBD);
1113 vars.push(input[..length].into());
1115 input = &input[length..];
1117 if input.get(0) != Some(&b'\n') {
1118 return Err(Error::TBD);
1120 while input.get(0) == Some(&b'\n') {
1121 input = &input[1..];
1124 MultipleResponseSet {
1126 label: label.into(),
1135 pub struct MultipleResponseSets(Vec<MultipleResponseSet>);
1137 impl ExtensionRecord for MultipleResponseSets {
1138 const SIZE: Option<u32> = Some(1);
1139 const COUNT: Option<u32> = None;
1140 const NAME: &'static str = "multiple response set record";
1142 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1143 ext.check_size::<Self>()?;
1145 let mut input = &ext.data[..];
1146 let mut sets = Vec::new();
1147 while !input.is_empty() {
1148 let (set, rest) = MultipleResponseSet::parse(input)?;
1152 Ok(MultipleResponseSets(sets))
1156 fn parse_counted_string(input: &[u8]) -> Result<(&[u8], &[u8]), Error> {
1157 let Some(space) = input.iter().position(|&b| b == b' ') else {
1158 return Err(Error::TBD);
1160 let Ok(length) = from_utf8(&input[..space]) else {
1161 return Err(Error::TBD);
1163 let Ok(length): Result<usize, _> = length.parse() else {
1164 return Err(Error::TBD);
1167 let input = &input[space + 1..];
1168 if input.len() < length {
1169 return Err(Error::TBD);
1172 let (string, rest) = input.split_at(length);
1176 pub struct ExtraProductInfo(String);
1178 impl TextRecord for ExtraProductInfo {
1179 const NAME: &'static str = "extra product info";
1180 fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
1181 Ok(ExtraProductInfo(input.into()))
1185 pub struct VarDisplayRecord(Vec<u32>);
1187 impl ExtensionRecord for VarDisplayRecord {
1188 const SIZE: Option<u32> = Some(4);
1189 const COUNT: Option<u32> = None;
1190 const NAME: &'static str = "variable display record";
1192 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1193 ext.check_size::<Self>()?;
1195 let mut input = &ext.data[..];
1196 let display = (0..ext.count)
1197 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1199 Ok(VarDisplayRecord(display))
1203 pub struct VariableSet {
1205 pub vars: Vec<String>,
1209 fn parse(input: &str) -> Result<Self, Error> {
1210 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
1211 let vars = input.split_ascii_whitespace().map(String::from).collect();
1219 pub struct VariableSetRecord(Vec<VariableSet>);
1221 impl TextRecord for VariableSetRecord {
1222 const NAME: &'static str = "variable set";
1223 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1224 let mut sets = Vec::new();
1225 for line in input.lines() {
1226 match VariableSet::parse(line) {
1227 Ok(set) => sets.push(set),
1228 Err(error) => warn(error),
1231 Ok(VariableSetRecord(sets))
1235 pub struct LongVariableName {
1236 pub short_name: String,
1237 pub long_name: String,
1240 pub struct LongVariableNameRecord(Vec<LongVariableName>);
1242 impl TextRecord for LongVariableNameRecord {
1243 const NAME: &'static str = "long variable names";
1244 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1245 let mut names = Vec::new();
1246 for pair in input.split('\t').filter(|s| !s.is_empty()) {
1247 if let Some((short_name, long_name)) = pair.split_once('=') {
1248 let name = LongVariableName {
1249 short_name: short_name.into(),
1250 long_name: long_name.into(),
1257 Ok(LongVariableNameRecord(names))
1261 pub struct VeryLongString {
1262 pub short_name: String,
1266 impl VeryLongString {
1267 fn parse(input: &str) -> Result<VeryLongString, Error> {
1268 let Some((short_name, length)) = input.split_once('=') else {
1269 return Err(Error::TBD);
1271 let length: usize = length.parse().map_err(|_| Error::TBD)?;
1273 short_name: short_name.into(),
1279 pub struct VeryLongStringRecord(Vec<VeryLongString>);
1281 impl TextRecord for VeryLongStringRecord {
1282 const NAME: &'static str = "very long strings";
1283 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1284 let mut very_long_strings = Vec::new();
1287 .map(|s| s.trim_end_matches('\t'))
1288 .filter(|s| !s.is_empty())
1290 match VeryLongString::parse(tuple) {
1291 Ok(vls) => very_long_strings.push(vls),
1292 Err(error) => warn(error),
1295 Ok(VeryLongStringRecord(very_long_strings))
1299 pub struct LongStringValueLabels {
1300 pub var_name: Vec<u8>,
1303 /// `(value, label)` pairs, where each value is `width` bytes.
1304 pub labels: Vec<(Vec<u8>, Vec<u8>)>,
1307 pub struct LongStringValueLabelSet(Vec<LongStringValueLabels>);
1309 impl ExtensionRecord for LongStringValueLabelSet {
1310 const SIZE: Option<u32> = Some(1);
1311 const COUNT: Option<u32> = None;
1312 const NAME: &'static str = "long string value labels record";
1314 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1315 ext.check_size::<Self>()?;
1317 let mut input = &ext.data[..];
1318 let mut label_set = Vec::new();
1319 while !input.is_empty() {
1320 let var_name = read_string(&mut input, endian)?;
1321 let width: u32 = endian.parse(read_bytes(&mut input)?);
1322 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
1323 let mut labels = Vec::new();
1324 for _ in 0..n_labels {
1325 let value = read_string(&mut input, endian)?;
1326 let label = read_string(&mut input, endian)?;
1327 labels.push((value, label));
1329 label_set.push(LongStringValueLabels {
1335 Ok(LongStringValueLabelSet(label_set))
1339 pub struct LongStringMissingValues {
1341 pub var_name: Vec<u8>,
1343 /// Up to three missing values.
1344 pub missing_values: Vec<[u8; 8]>,
1347 pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
1349 impl ExtensionRecord for LongStringMissingValueSet {
1350 const SIZE: Option<u32> = Some(1);
1351 const COUNT: Option<u32> = None;
1352 const NAME: &'static str = "long string missing values record";
1354 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1355 ext.check_size::<Self>()?;
1357 let mut input = &ext.data[..];
1358 let mut missing_value_set = Vec::new();
1359 while !input.is_empty() {
1360 let var_name = read_string(&mut input, endian)?;
1361 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1362 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1364 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset;
1365 return Err(Error::BadLongMissingValueLength {
1366 record_offset: ext.offset,
1371 let mut missing_values = Vec::new();
1372 for i in 0..n_missing_values {
1373 let value: [u8; 8] = read_bytes(&mut input)?;
1374 let numeric_value: u64 = endian.parse(value);
1375 let value = if i > 0 && numeric_value == 8 {
1376 // Tolerate files written by old, buggy versions of PSPP
1377 // where we believed that the value_length was repeated
1378 // before each missing value.
1379 read_bytes(&mut input)?
1383 missing_values.push(value);
1385 missing_value_set.push(LongStringMissingValues {
1390 Ok(LongStringMissingValueSet(missing_value_set))
1394 pub struct Encoding(pub String);
1396 impl ExtensionRecord for Encoding {
1397 const SIZE: Option<u32> = Some(1);
1398 const COUNT: Option<u32> = None;
1399 const NAME: &'static str = "encoding record";
1401 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1402 ext.check_size::<Self>()?;
1404 Ok(Encoding(String::from_utf8(ext.data.clone()).map_err(
1405 |_| Error::BadEncodingName { offset: ext.offset },
1410 pub struct Attribute {
1412 pub values: Vec<String>,
1416 fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> {
1417 let Some((name, mut input)) = input.split_once('(') else {
1418 return Err(Error::TBD);
1420 let mut values = Vec::new();
1422 let Some((value, rest)) = input.split_once('\n') else {
1423 return Err(Error::TBD);
1425 if let Some(stripped) = value
1427 .and_then(|value| value.strip_suffix('\''))
1429 values.push(stripped.into());
1432 values.push(value.into());
1434 if let Some(rest) = rest.strip_prefix(')') {
1448 pub struct AttributeSet(pub Vec<Attribute>);
1453 sentinel: Option<char>,
1454 warn: &impl Fn(Error),
1455 ) -> Result<(AttributeSet, &'a str), Error> {
1456 let mut attributes = Vec::new();
1458 match input.chars().next() {
1459 None => break input,
1460 c if c == sentinel => break &input[1..],
1462 let (attribute, rest) = Attribute::parse(input, &warn)?;
1463 attributes.push(attribute);
1468 Ok((AttributeSet(attributes), rest))
1472 pub struct FileAttributeRecord(AttributeSet);
1474 impl TextRecord for FileAttributeRecord {
1475 const NAME: &'static str = "data file attributes";
1476 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1477 let (set, rest) = AttributeSet::parse(input, None, &warn)?;
1478 if !rest.is_empty() {
1481 Ok(FileAttributeRecord(set))
1485 pub struct VarAttributeSet {
1486 pub long_var_name: String,
1487 pub attributes: AttributeSet,
1490 impl VarAttributeSet {
1493 warn: &impl Fn(Error),
1494 ) -> Result<(VarAttributeSet, &'a str), Error> {
1495 let Some((long_var_name, rest)) = input.split_once(':') else {
1496 return Err(Error::TBD);
1498 let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?;
1501 long_var_name: long_var_name.into(),
/// Text record that holds a list of [`VarAttributeSet`]s; its `NAME` is
/// "variable attributes".
1509 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1511 impl TextRecord for VariableAttributeRecord {
1512 const NAME: &'static str = "variable attributes";
/// Parses `input` as a sequence of per-variable attribute sets.
1513 fn parse(mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1514 let mut var_attribute_sets = Vec::new();
// Keep parsing entries for as long as any input is left.
1515 while !input.is_empty() {
1516 match VarAttributeSet::parse(input, &warn) {
// A well-formed entry is collected.
// NOTE(review): presumably `input` then advances to `rest`, and a failed
// entry is handled by a separate arm — confirm.
1517 Ok((var_attribute, rest)) => {
1518 var_attribute_sets.push(var_attribute);
// Wrap every collected set in the record type.
1527 Ok(VariableAttributeRecord(var_attribute_sets))
/// Extension record that carries the number of cases.
1531 pub struct NumberOfCasesRecord {
1532 /// Always observed as 1.
1535 /// Number of cases.
1539 impl ExtensionRecord for NumberOfCasesRecord {
// Required layout, enforced by `Extension::check_size`: two elements of
// eight bytes each.
1540 const SIZE: Option<u32> = Some(8);
1541 const COUNT: Option<u32> = Some(2);
1542 const NAME: &'static str = "extended number of cases record";
/// Decodes a number-of-cases record from the raw extension `ext`.
///
/// # Errors
///
/// Fails if the element size or count of `ext` differs from `SIZE` or
/// `COUNT`, or if the record data cannot be read in full.
1544 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1545 ext.check_size::<Self>()?;
// Read the two values one after the other from the record's data, using
// the file's byte order.
1547 let mut input = &ext.data[..];
1548 let one = endian.parse(read_bytes(&mut input)?);
1549 let n_cases = endian.parse(read_bytes(&mut input)?);
1551 Ok(NumberOfCasesRecord { one, n_cases })
/// An extension record as read from the file: a subtype, an element size and
/// count, and the raw data bytes.
1555 #[derive(Clone, Debug)]
1556 pub struct Extension {
1557 /// Offset from the start of the file to the start of the record.
1563 /// Size of each data element.
1566 /// Number of data elements.
1569 /// `size * count` bytes of data.
/// Maps each extension record type to a pair of `u32` requirements.
///
/// NOTE(review): judging by `ExtensionType::Ncases => (8, 2)`, which matches
/// `NumberOfCasesRecord::SIZE` and `COUNT`, the pair appears to be
/// (element size, element count), with 0 presumably meaning "no requirement"
/// — confirm.
1574 fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
1576 /* Implemented record types. */
1577 ExtensionType::Integer => (4, 8),
1578 ExtensionType::Float => (8, 3),
1579 ExtensionType::VarSets => (1, 0),
1580 ExtensionType::Mrsets => (1, 0),
1581 ExtensionType::ProductInfo => (1, 0),
1582 ExtensionType::Display => (4, 0),
1583 ExtensionType::LongNames => (1, 0),
1584 ExtensionType::LongStrings => (1, 0),
1585 ExtensionType::Ncases => (8, 2),
1586 ExtensionType::FileAttrs => (1, 0),
1587 ExtensionType::VarAttrs => (1, 0),
1588 ExtensionType::Mrsets2 => (1, 0),
1589 ExtensionType::Encoding => (1, 0),
1590 ExtensionType::LongLabels => (1, 0),
1591 ExtensionType::LongMissing => (1, 0),
1593 /* Ignored record types. */
1594 ExtensionType::Date => (0, 0),
1595 ExtensionType::DataEntry => (0, 0),
1596 ExtensionType::Dataview => (0, 0),
/// Checks this record's element size and count against the values that
/// record type `E` requires.
///
/// A constant that is `None` disables the corresponding check.
///
/// # Errors
///
/// Returns `Error::BadRecordSize` if `E::SIZE` is set and differs from
/// `self.size`, or `Error::BadRecordCount` if `E::COUNT` is set and differs
/// from `self.count`.
1602 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
// Element size, when the record type fixes one.
1603 if let Some(expected_size) = E::SIZE {
1604 if self.size != expected_size {
1605 return Err(Error::BadRecordSize {
1606 offset: self.offset,
1607 record: E::NAME.into(),
// Element count, when the record type fixes one.
1613 if let Some(expected_count) = E::COUNT {
1614 if self.count != expected_count {
1615 return Err(Error::BadRecordCount {
1616 offset: self.offset,
1617 record: E::NAME.into(),
/// Reads an extension record from `r`: the subtype, the element size, the
/// element count, and then `size * count` bytes of data.
///
/// # Errors
///
/// Returns `Error::ExtensionRecordTooLarge` if `size * count` overflows
/// `u32`; I/O failures are propagated.
1626 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Extension, Error> {
1627 let subtype = endian.parse(read_bytes(r)?);
// Position just after the subtype field.
1628 let offset = r.stream_position()?;
1629 let size: u32 = endian.parse(read_bytes(r)?);
1630 let count = endian.parse(read_bytes(r)?);
// Reject a total length that does not fit in a `u32`.
1631 let Some(product) = size.checked_mul(count) else {
1632 return Err(Error::ExtensionRecordTooLarge {
// NOTE(review): this shadows the earlier `offset`.  The position taken here
// is just before the data, after the subtype, size, and count fields —
// confirm that this is the intended meaning of the record's offset.
1639 let offset = r.stream_position()?;
// NOTE(review): `read_vec` allocates the whole buffer up front, so the
// allocation size (up to `u32::MAX` bytes) is dictated by the input.
1640 let data = read_vec(r, product as usize)?;
/// Header record that locates the ZLIB data header and the ZLIB trailer
/// within the file.
1651 #[derive(Clone, Debug)]
1652 pub struct ZHeader {
1653 /// File offset to the start of the record.
1656 /// File offset to the ZLIB data header.
1657 pub zheader_offset: u64,
1659 /// File offset to the ZLIB trailer.
1660 pub ztrailer_offset: u64,
1662 /// Length of the ZLIB trailer in bytes.
1663 pub ztrailer_len: u64,
/// Reads a `ZHeader` from `r` at its current position.
///
/// The three 64-bit fields are read in the order `zheader_offset`,
/// `ztrailer_offset`, `ztrailer_len`, using byte order `endian`.  I/O
/// failures are propagated.
1667 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
// Remember where the record begins before consuming any of it.
1668 let offset = r.stream_position()?;
1669 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
1670 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
1671 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
/// Trailer record for ZLIB-compressed data: a few fixed fields followed by
/// one descriptor per compressed block.
1682 #[derive(Clone, Debug)]
1683 pub struct ZTrailer {
1684 /// File offset to the start of the record.
1687 /// Compression bias as a negative integer, e.g. -100.
1690 /// Always observed as zero.
1693 /// Uncompressed size of each block, except possibly the last. Only
1694 /// `0x3ff000` has been observed so far.
1695 pub block_size: u32,
1697 /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
1698 pub blocks: Vec<ZBlock>,
/// Descriptor for one block of compressed data, as listed in the ZLIB
/// trailer.
1701 #[derive(Clone, Debug)]
1703 /// Offset of block of data if simple compression were used.
1704 pub uncompressed_ofs: u64,
1706 /// Actual offset within the file of the compressed data block.
1707 pub compressed_ofs: u64,
1709 /// The number of bytes in this data block after decompression. This is
1710 /// `block_size` in every data block but the last, which may be smaller.
1711 pub uncompressed_size: u32,
1713 /// The number of bytes in this data block, as stored compressed in this
1715 pub compressed_size: u32,
/// Reads one block descriptor from `r` at its current position.
///
/// The fields are read in declaration order (two `u64` offsets, then two
/// `u32` sizes) using byte order `endian`.  I/O failures are propagated.
1719 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
1721 uncompressed_ofs: endian.parse(read_bytes(r)?),
1722 compressed_ofs: endian.parse(read_bytes(r)?),
1723 uncompressed_size: endian.parse(read_bytes(r)?),
1724 compressed_size: endian.parse(read_bytes(r)?),
/// Reads the ZLIB trailer located at `ztrailer_ofs`, then seeks `reader`
/// back to the position it had on entry.
///
/// # Errors
///
/// Returns `Error::BadZlibTrailerNBlocks` if the block count stored in the
/// trailer differs from the count implied by `ztrailer_len`; I/O failures
/// while reading are propagated.
///
/// NOTE(review): every early return through `?` or the block-count error
/// happens before the final seek, so on those paths the reader is left
/// inside the trailer rather than at its entry position — confirm callers
/// do not rely on the position after a failure.
1730 fn read<R: Read + Seek>(
1735 ) -> Result<Option<ZTrailer>, Error> {
// Remember the current position so that it can be restored afterwards.
1736 let start_offset = reader.stream_position()?;
// A failed seek to the trailer is handled here instead of being propagated.
// NOTE(review): presumably this branch yields `Ok(None)` — confirm.
1737 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
// Fixed fields at the start of the trailer, in file order.
1740 let int_bias = endian.parse(read_bytes(reader)?);
1741 let zero = endian.parse(read_bytes(reader)?);
1742 let block_size = endian.parse(read_bytes(reader)?);
1743 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
// The trailer length implies the block count: 24 bytes are subtracted, and
// each descriptor accounts for a further 24 bytes.
// NOTE(review): `ztrailer_len - 24` is an unsigned subtraction (the result
// is compared as `u64` below); a `ztrailer_len` below 24 would panic in
// builds with overflow checks and wrap otherwise — confirm that the value
// is validated before this point, or use `checked_sub`.
1744 let expected_n_blocks = (ztrailer_len - 24) / 24;
1745 if n_blocks as u64 != expected_n_blocks {
1746 return Err(Error::BadZlibTrailerNBlocks {
1747 offset: ztrailer_ofs,
// Read `n_blocks` descriptors, stopping at the first failure.
1753 let blocks = (0..n_blocks)
1754 .map(|_| ZBlock::read(reader, endian))
1755 .collect::<Result<Vec<_>, _>>()?;
// Put the reader back where it was on entry.
1756 reader.seek(SeekFrom::Start(start_offset))?;
1758 offset: ztrailer_ofs,
/// Attempts to read `N` bytes from `r` into a fixed-size array.
///
/// A single `read` call is made first; bytes still missing after it are
/// obtained with `read_exact`.
///
/// NOTE(review): presumably yields `Ok(None)` when the first `read` returns
/// zero bytes (end of input) and `Ok(Some(buf))` otherwise — confirm.
1767 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
1768 let mut buf = [0; N];
// `n` is the number of bytes that the first read supplied.
1769 let n = r.read(&mut buf)?;
// Fill the part of the buffer that the first read left empty.
1772 r.read_exact(&mut buf[n..])?;
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// # Errors
///
/// Propagates the error from `read_exact`, which includes the case where the
/// input ends before `N` bytes are available.
1780 fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
1781 let mut buf = [0; N];
1782 r.read_exact(&mut buf)?;
/// Reads exactly `n` bytes from `r` into a newly allocated vector.
///
/// The vector is allocated and zero-filled at its full length before any
/// byte is read.
///
/// # Errors
///
/// Propagates the error from `read_exact`, which includes the case where the
/// input ends before `n` bytes are available.
1786 fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
1787 let mut vec = vec![0; n];
1788 r.read_exact(&mut vec)?;
/// Reads a length-prefixed byte string from `r`.
///
/// The prefix is a `u32` in byte order `endian`; that many bytes follow and
/// are returned as raw bytes, without any text decoding.
///
/// # Errors
///
/// Propagates I/O errors, including the input ending before the announced
/// number of bytes is available.
1792 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<Vec<u8>, IoError> {
1793 let length: u32 = endian.parse(read_bytes(r)?);
// NOTE(review): the allocation made by `read_vec` is sized by this
// input-supplied length.
1794 read_vec(r, length as usize)