1 use crate::endian::{Endian, Parse, ToBytes};
4 use flate2::read::ZlibDecoder;
7 use std::fmt::{Debug, Formatter, Result as FmtResult};
8 use std::str::from_utf8;
10 collections::VecDeque,
11 io::{Error as IoError, Read, Seek, SeekFrom},
15 use self::state::State;
17 #[derive(Copy, Clone, Debug)]
18 pub enum Compression {
23 #[derive(Clone, Debug)]
28 ValueLabel(ValueLabel),
29 VarIndexes(VarIndexes),
38 fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
39 let rec_type: u32 = endian.parse(read_bytes(reader)?);
41 2 => Ok(Record::Variable(Variable::read(reader, endian)?)),
42 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)),
43 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)),
44 6 => Ok(Record::Document(Document::read(reader, endian)?)),
45 7 => Ok(Record::Extension(Extension::read(reader, endian)?)),
46 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
47 _ => Err(Error::BadRecordType {
48 offset: reader.stream_position()?,
55 pub struct FallbackEncoding<'a>(&'a [u8]);
57 fn fallback_encode<'a>(s: &'a [u8]) -> Cow<'a, str> {
58 if let Ok(s) = from_utf8(s) {
63 .map(|c| char::from(*c))
69 impl<'a> Debug for FallbackEncoding<'a> {
70 fn fmt(&self, f: &mut Formatter) -> FmtResult {
71 if let Ok(s) = from_utf8(self.0) {
78 .map(|c| char::from(*c).escape_default())
92 /// Eye-catcher string, product name, in the file's encoding. Padded
93 /// on the right with spaces.
94 pub eye_catcher: [u8; 60],
96 /// Layout code, normally either 2 or 3.
99 /// Number of variable positions, or `None` if the value in the file is
100 /// questionably trustworthy.
101 pub nominal_case_size: Option<u32>,
103 /// Compression type, if any.
104 pub compression: Option<Compression>,
106 /// 0-based variable index of the weight variable, or `None` if the file is
108 pub weight_index: Option<u32>,
110 /// Claimed number of cases, if known.
111 pub n_cases: Option<u32>,
113 /// Compression bias, usually 100.0.
116 /// `dd mmm yy` in the file's encoding.
117 pub creation_date: [u8; 9],
119 /// `HH:MM:SS` in the file's encoding.
120 pub creation_time: [u8; 8],
122 /// File label, in the file's encoding. Padded on the right with spaces.
123 pub file_label: [u8; 64],
125 /// Endianness of the data in the file header.
130 fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
131 writeln!(f, "{name:>17}: {:?}", value)
135 impl Debug for Header {
136 fn fmt(&self, f: &mut Formatter) -> FmtResult {
137 writeln!(f, "File header record:")?;
138 self.debug_field(f, "Magic", self.magic)?;
139 self.debug_field(f, "Product name", FallbackEncoding(&self.eye_catcher))?;
140 self.debug_field(f, "Layout code", self.layout_code)?;
141 self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
142 self.debug_field(f, "Compression", self.compression)?;
143 self.debug_field(f, "Weight index", self.weight_index)?;
144 self.debug_field(f, "Number of cases", self.n_cases)?;
145 self.debug_field(f, "Compression bias", self.bias)?;
146 self.debug_field(f, "Creation date", FallbackEncoding(&self.creation_date))?;
147 self.debug_field(f, "Creation time", FallbackEncoding(&self.creation_time))?;
148 self.debug_field(f, "File label", FallbackEncoding(&self.file_label))?;
149 self.debug_field(f, "Endianness", self.endian)
154 fn read<R: Read>(r: &mut R) -> Result<Header, Error> {
155 let magic: [u8; 4] = read_bytes(r)?;
156 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
158 let eye_catcher: [u8; 60] = read_bytes(r)?;
159 let layout_code: [u8; 4] = read_bytes(r)?;
160 let endian = Endian::identify_u32(2, layout_code)
161 .or_else(|| Endian::identify_u32(2, layout_code))
162 .ok_or_else(|| Error::NotASystemFile)?;
163 let layout_code = endian.parse(layout_code);
165 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
166 let nominal_case_size =
167 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
169 let compression_code: u32 = endian.parse(read_bytes(r)?);
170 let compression = match (magic, compression_code) {
171 (Magic::ZSAV, 2) => Some(Compression::ZLib),
172 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
174 (_, 1) => Some(Compression::Simple),
175 (_, code) => return Err(Error::InvalidSavCompression(code)),
178 let weight_index: u32 = endian.parse(read_bytes(r)?);
179 let weight_index = (weight_index > 0).then(|| weight_index - 1);
181 let n_cases: u32 = endian.parse(read_bytes(r)?);
182 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
184 let bias: f64 = endian.parse(read_bytes(r)?);
186 let creation_date: [u8; 9] = read_bytes(r)?;
187 let creation_time: [u8; 8] = read_bytes(r)?;
188 let file_label: [u8; 64] = read_bytes(r)?;
189 let _: [u8; 3] = read_bytes(r)?;
/// The four-byte magic number that `Header::read` reads first from a system
/// file.
///
/// The values accepted by the `TryFrom<[u8; 4]>` conversion are
/// `Magic::SAV`, `Magic::ZSAV`, and `Magic::EBCDIC`.
208 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
209 pub struct Magic([u8; 4]);
212 /// Magic number for a regular system file.
213 pub const SAV: Magic = Magic(*b"$FL2");
215 /// Magic number for a system file that contains zlib-compressed data.
216 pub const ZSAV: Magic = Magic(*b"$FL3");
218 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
220 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
223 impl Debug for Magic {
224 fn fmt(&self, f: &mut Formatter) -> FmtResult {
226 &Magic::SAV => "$FL2",
227 &Magic::ZSAV => "$FL3",
228 &Magic::EBCDIC => "($FL2 in EBCDIC)",
229 _ => return write!(f, "{:?}", self.0),
235 impl TryFrom<[u8; 4]> for Magic {
238 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
239 let magic = Magic(value);
241 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
242 _ => Err(Error::BadMagic(value)),
247 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
254 fn from_width(width: i32) -> VarType {
256 0 => VarType::Number,
257 _ => VarType::String,
264 Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer,
267 use crate::endian::Endian;
269 collections::VecDeque,
274 #[allow(clippy::type_complexity)]
275 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
278 struct Start<R: Read + Seek> {
282 pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
283 Box::new(Start { reader })
286 struct CommonState<R: Read + Seek> {
290 compression: Option<Compression>,
291 var_types: Vec<VarType>,
294 impl<R: Read + Seek + 'static> State for Start<R> {
295 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
296 let header = Header::read(&mut self.reader)?;
297 let next_state = Headers(CommonState {
299 endian: header.endian,
301 compression: header.compression,
302 var_types: Vec::new(),
304 Ok(Some((Record::Header(header), Box::new(next_state))))
308 struct Headers<R: Read + Seek>(CommonState<R>);
310 impl<R: Read + Seek + 'static> State for Headers<R> {
311 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
312 let record = Record::read(&mut self.0.reader, self.0.endian)?;
314 Record::Variable(Variable { width, .. }) => {
315 self.0.var_types.push(VarType::from_width(width));
317 Record::EndOfHeaders(_) => {
318 let next_state: Box<dyn State> = match self.0.compression {
319 None => Box::new(Data(self.0)),
320 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
321 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
323 return Ok(Some((record, next_state)));
327 Ok(Some((record, self)))
331 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
333 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
334 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
335 let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
336 Ok(Some((Record::ZHeader(zheader), self)))
340 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
342 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
343 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
344 let retval = ZTrailer::read(
347 self.1.ztrailer_offset,
350 let next_state = Box::new(CompressedData::new(CommonState {
351 reader: ZlibDecodeMultiple::new(self.0.reader),
352 endian: self.0.endian,
354 compression: self.0.compression,
355 var_types: self.0.var_types,
358 None => next_state.read(),
359 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
364 struct Data<R: Read + Seek>(CommonState<R>);
366 impl<R: Read + Seek + 'static> State for Data<R> {
367 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
368 match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
370 Some(values) => Ok(Some((Record::Case(values), self))),
375 struct CompressedData<R: Read + Seek> {
376 common: CommonState<R>,
380 impl<R: Read + Seek + 'static> CompressedData<R> {
381 fn new(common: CommonState<R>) -> CompressedData<R> {
384 codes: VecDeque::new(),
389 impl<R: Read + Seek + 'static> State for CompressedData<R> {
390 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
391 match Value::read_compressed_case(
392 &mut self.common.reader,
393 &self.common.var_types,
399 Some(values) => Ok(Some((Record::Case(values), self))),
405 #[derive(Copy, Clone)]
411 impl Debug for Value {
412 fn fmt(&self, f: &mut Formatter) -> FmtResult {
414 Value::Number(Some(number)) => write!(f, "{number:?}"),
415 Value::Number(None) => write!(f, "SYSMIS"),
416 Value::String(bytes) => write!(f, "{:?}", FallbackEncoding(bytes)),
422 fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
423 Ok(Self::from_raw(var_type, read_bytes(r)?, endian))
426 pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
428 VarType::String => Value::String(raw),
430 let number: f64 = endian.parse(raw);
431 Value::Number((number != -f64::MAX).then_some(number))
436 fn read_case<R: Read + Seek>(
438 var_types: &[VarType],
440 ) -> Result<Option<Vec<Value>>, Error> {
441 let case_start = reader.stream_position()?;
442 let mut values = Vec::with_capacity(var_types.len());
443 for (i, &var_type) in var_types.iter().enumerate() {
444 let Some(raw) = try_read_bytes(reader)? else {
448 let offset = reader.stream_position()?;
449 return Err(Error::EofInCase {
451 case_ofs: offset - case_start,
452 case_len: var_types.len() * 8,
456 values.push(Value::from_raw(var_type, raw, endian));
461 fn read_compressed_case<R: Read + Seek>(
463 var_types: &[VarType],
464 codes: &mut VecDeque<u8>,
467 ) -> Result<Option<Vec<Value>>, Error> {
468 let case_start = reader.stream_position()?;
469 let mut values = Vec::with_capacity(var_types.len());
470 for (i, &var_type) in var_types.iter().enumerate() {
472 let Some(code) = codes.pop_front() else {
473 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
477 let offset = reader.stream_position()?;
478 return Err(Error::EofInCompressedCase {
480 case_ofs: offset - case_start,
484 codes.extend(new_codes.into_iter());
489 1..=251 => match var_type {
490 VarType::Number => break Value::Number(Some(code as f64 - bias)),
492 break Value::String(endian.to_bytes(code as f64 - bias))
499 let offset = reader.stream_position()?;
500 return Err(Error::PartialCompressedCase {
502 case_ofs: offset - case_start,
506 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian),
507 254 => match var_type {
508 VarType::String => break Value::String(*b" "), // XXX EBCDIC
510 return Err(Error::CompressedStringExpected {
512 case_ofs: reader.stream_position()? - case_start,
516 255 => match var_type {
517 VarType::Number => break Value::Number(None),
519 return Err(Error::CompressedNumberExpected {
521 case_ofs: reader.stream_position()? - case_start,
533 struct ZlibDecodeMultiple<R>
537 reader: Option<ZlibDecoder<R>>,
540 impl<R> ZlibDecodeMultiple<R>
544 fn new(reader: R) -> ZlibDecodeMultiple<R> {
546 reader: Some(ZlibDecoder::new(reader)),
551 impl<R> Read for ZlibDecodeMultiple<R>
555 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
557 match self.reader.as_mut().unwrap().read(buf)? {
559 let inner = self.reader.take().unwrap().into_inner();
560 self.reader = Some(ZlibDecoder::new(inner));
568 impl<R> Seek for ZlibDecodeMultiple<R>
572 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
573 self.reader.as_mut().unwrap().get_mut().seek(pos)
578 state: Option<Box<dyn State>>,
582 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
584 state: Some(state::new(reader)),
589 impl Iterator for Reader {
590 type Item = Result<Record, Error>;
592 fn next(&mut self) -> Option<Self::Item> {
593 match self.state.take()?.read() {
594 Ok(Some((record, next_state))) => {
595 self.state = Some(next_state);
599 Err(error) => Some(Err(error)),
604 impl FusedIterator for Reader {}
/// A raw 32-bit format specification as stored in a variable record.
///
/// The `Debug` implementation prints the value in hexadecimal followed by a
/// decoded `{type}{w}.{d}` form, where the type name is looked up from the
/// bits above bit 15, `w` is bits 8 through 15, and `d` is the low 8 bits.
606 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
607 pub struct Format(pub u32);
609 impl Debug for Format {
610 fn fmt(&self, f: &mut Formatter) -> FmtResult {
611 let type_ = format_name(self.0 >> 16);
612 let w = (self.0 >> 8) & 0xff;
613 let d = self.0 & 0xff;
614 write!(f, "{:06x} ({type_}{w}.{d})", self.0)
618 fn format_name(type_: u32) -> &'static str {
662 pub struct MissingValues {
663 /// Individual missing values, up to 3 of them.
664 pub values: Vec<Value>,
666 /// Optional range of missing values.
667 pub range: Option<(Value, Value)>,
670 impl Debug for MissingValues {
671 fn fmt(&self, f: &mut Formatter) -> FmtResult {
672 for (i, value) in self.values.iter().enumerate() {
676 write!(f, "{value:?}")?;
679 if let Some((low, high)) = self.range {
680 if !self.values.is_empty() {
683 write!(f, "{low:?} THRU {high:?}")?;
695 fn is_empty(&self) -> bool {
696 self.values.is_empty() && self.range.is_none()
699 fn read<R: Read + Seek>(
705 ) -> Result<MissingValues, Error> {
706 let (n_values, has_range) = match (width, code) {
707 (_, 0..=3) => (code, false),
708 (0, -2) => (0, true),
709 (0, -3) => (1, true),
710 (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
711 (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
714 let var_type = VarType::from_width(width);
716 let mut values = Vec::new();
717 for _ in 0..n_values {
718 values.push(Value::read(r, var_type, endian)?);
720 let range = if has_range {
721 let low = Value::read(r, var_type, endian)?;
722 let high = Value::read(r, var_type, endian)?;
727 Ok(MissingValues { values, range })
732 pub struct Variable {
733 /// Offset from the start of the file to the start of the record.
736 /// Variable width, in the range -1..=255.
739 /// Variable name, padded on the right with spaces.
743 pub print_format: u32,
746 pub write_format: u32,
749 pub missing_values: MissingValues,
751 /// Optional variable label.
752 pub label: Option<UnencodedString>,
755 impl Debug for Variable {
756 fn fmt(&self, f: &mut Formatter) -> FmtResult {
763 } else if self.width == 0 {
766 "long string continuation record"
769 writeln!(f, "Print format: {:?}", Format(self.print_format))?;
770 writeln!(f, "Write format: {:?}", Format(self.write_format))?;
771 writeln!(f, "Name: {:?}", FallbackEncoding(&self.name))?;
774 "Variable label: {:?}",
777 writeln!(f, "Missing values: {:?}", self.missing_values)
782 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
783 let offset = r.stream_position()?;
784 let width: i32 = endian.parse(read_bytes(r)?);
785 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
786 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
787 let print_format: u32 = endian.parse(read_bytes(r)?);
788 let write_format: u32 = endian.parse(read_bytes(r)?);
789 let name: [u8; 8] = read_bytes(r)?;
791 let label = match has_variable_label {
794 let len: u32 = endian.parse(read_bytes(r)?);
795 let read_len = len.min(65535) as usize;
796 let label = UnencodedString(read_vec(r, read_len)?);
798 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
799 let _ = read_vec(r, padding_bytes as usize)?;
804 return Err(Error::BadVariableLabelCode {
806 code: has_variable_label,
811 let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
/// Eight raw bytes of a value whose type (number or string) is not known.
///
/// The `Debug` implementation shows both interpretations: the bytes parsed as
/// an `f64` (little- or big-endian, whichever formats to the shorter text),
/// then `/` and the bytes as a quoted string cut at the first NUL or control
/// character.
825 #[derive(Copy, Clone)]
826 pub struct UntypedValue(pub [u8; 8]);
828 impl Debug for UntypedValue {
829 fn fmt(&self, f: &mut Formatter) -> FmtResult {
830 let little: f64 = Endian::Little.parse(self.0);
831 let little = format!("{:?}", little);
832 let big: f64 = Endian::Big.parse(self.0);
833 let big = format!("{:?}", big);
834 let number = if little.len() <= big.len() { little } else { big };
835 write!(f, "{number}")?;
837 let string = fallback_encode(&self.0);
838 let string = string.split(|c: char| c == '\0' || c.is_control()).next().unwrap();
839 write!(f, "/\"{string}\"")?;
/// The raw bytes of a string as read from the file, with no character
/// encoding applied yet.
///
/// The `Debug` implementation renders the bytes through `FallbackEncoding`.
845 pub struct UnencodedString(Vec<u8>);
847 impl From<Vec<u8>> for UnencodedString {
848 fn from(source: Vec<u8>) -> Self {
853 impl From<&[u8]> for UnencodedString {
854 fn from(source: &[u8]) -> Self {
859 impl Debug for UnencodedString {
860 fn fmt(&self, f: &mut Formatter) -> FmtResult {
861 write!(f, "{:?}", FallbackEncoding(self.0.as_slice()))
866 pub struct ValueLabel {
867 /// Offset from the start of the file to the start of the record.
871 pub labels: Vec<(UntypedValue, UnencodedString)>,
874 impl Debug for ValueLabel {
875 fn fmt(&self, f: &mut Formatter) -> FmtResult {
876 for (value, label) in self.labels.iter() {
877 writeln!(f, "{value:?}: {label:?}")?;
884 /// Maximum number of value labels in a record.
885 pub const MAX: u32 = u32::MAX / 8;
887 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
888 let offset = r.stream_position()?;
889 let n: u32 = endian.parse(read_bytes(r)?);
890 if n > ValueLabel::MAX {
891 return Err(Error::BadNumberOfValueLabels {
894 max: ValueLabel::MAX,
898 let mut labels = Vec::new();
900 let value = UntypedValue(read_bytes(r)?);
901 let label_len: u8 = endian.parse(read_bytes(r)?);
902 let label_len = label_len as usize;
903 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
905 let mut label = read_vec(r, padded_len - 1)?;
906 label.truncate(label_len);
907 labels.push((value, UnencodedString(label)));
909 Ok(ValueLabel { offset, labels })
914 pub struct VarIndexes {
915 /// Offset from the start of the file to the start of the record.
918 /// The 0-based indexes of the variables.
919 pub var_indexes: Vec<u32>,
922 impl Debug for VarIndexes {
923 fn fmt(&self, f: &mut Formatter) -> FmtResult {
924 write!(f, "apply to variables")?;
925 for var_index in self.var_indexes.iter() {
926 write!(f, " #{var_index}")?;
933 /// Maximum number of variable indexes in a record.
934 pub const MAX: u32 = u32::MAX / 8;
936 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
937 let offset = r.stream_position()?;
938 let n: u32 = endian.parse(read_bytes(r)?);
939 if n > VarIndexes::MAX {
940 return Err(Error::BadNumberOfVarIndexes {
943 max: VarIndexes::MAX,
946 let mut var_indexes = Vec::with_capacity(n as usize);
948 var_indexes.push(endian.parse(read_bytes(r)?));
958 #[derive(Clone, Debug)]
959 pub struct Document {
960 /// Offset from the start of the file to the start of the record.
963 /// The document, as an array of 80-byte lines.
964 pub lines: Vec<[u8; Document::LINE_LEN as usize]>,
968 /// Length of a line in a document. Document lines are fixed-length and
969 /// padded on the right with spaces.
970 pub const LINE_LEN: u32 = 80;
972 /// Maximum number of lines we will accept in a document. This is simply
973 /// the maximum number that will fit in a 32-bit space.
974 pub const MAX_LINES: u32 = i32::MAX as u32 / Self::LINE_LEN;
976 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
977 let offset = r.stream_position()?;
978 let n: u32 = endian.parse(read_bytes(r)?);
980 0..=Self::MAX_LINES => Ok(Document {
981 pos: r.stream_position()?,
983 .map(|_| read_bytes(r))
984 .collect::<Result<Vec<_>, _>>()?,
986 _ => Err(Error::BadDocumentLength {
989 max: Self::MAX_LINES,
996 #[derive(FromPrimitive)]
998 /// Machine integer info.
1000 /// Machine floating-point info.
1006 /// Multiple response sets.
1008 /// SPSS Data Entry.
1010 /// Extra product info text.
1012 /// Variable display parameters.
1014 /// Long variable names.
1018 /// Extended number of cases.
1020 /// Data file attributes.
1022 /// Variable attributes.
1024 /// Multiple response sets (extended).
1026 /// Character encoding.
1028 /// Value labels for long strings.
1030 /// Missing values for long strings.
1032 /// "Format properties in dataview table".
1041 const NAME: &'static str;
1042 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
1045 trait ExtensionRecord
1049 const SIZE: Option<u32>;
1050 const COUNT: Option<u32>;
1051 const NAME: &'static str;
1052 fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
1055 pub struct IntegerInfo {
1056 pub version: (i32, i32, i32),
1057 pub machine_code: i32,
1058 pub floating_point_rep: i32,
1059 pub compression_code: i32,
1060 pub endianness: i32,
1061 pub character_code: i32,
1064 impl ExtensionRecord for IntegerInfo {
1065 const SIZE: Option<u32> = Some(4);
1066 const COUNT: Option<u32> = Some(8);
1067 const NAME: &'static str = "integer record";
1069 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1070 ext.check_size::<Self>()?;
1072 let mut input = &ext.data[..];
1073 let data: Vec<i32> = (0..8)
1074 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1077 version: (data[0], data[1], data[2]),
1078 machine_code: data[3],
1079 floating_point_rep: data[4],
1080 compression_code: data[5],
1081 endianness: data[6],
1082 character_code: data[7],
1087 pub struct FloatInfo {
1093 impl ExtensionRecord for FloatInfo {
1094 const SIZE: Option<u32> = Some(8);
1095 const COUNT: Option<u32> = Some(3);
1096 const NAME: &'static str = "floating point record";
1098 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1099 ext.check_size::<Self>()?;
1101 let mut input = &ext.data[..];
1102 let data: Vec<f64> = (0..3)
1103 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1113 pub enum CategoryLabels {
1117 pub enum MultipleResponseType {
1119 value: UnencodedString,
1120 labels: CategoryLabels,
1124 pub struct MultipleResponseSet {
1125 pub name: UnencodedString,
1126 pub label: UnencodedString,
1127 pub mr_type: MultipleResponseType,
1128 pub vars: Vec<UnencodedString>,
1131 impl MultipleResponseSet {
1132 fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
1133 let Some(equals) = input.iter().position(|&b| b == b'=') else {
1134 return Err(Error::TBD);
1136 let (name, input) = input.split_at(equals);
1137 let (mr_type, input) = match input.get(0) {
1138 Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
1140 let (value, input) = parse_counted_string(&input[1..])?;
1142 MultipleResponseType::MultipleDichotomy {
1143 value: value.into(),
1144 labels: CategoryLabels::VarLabels,
1150 let Some(b' ') = input.get(1) else {
1151 return Err(Error::TBD);
1153 let input = &input[2..];
1154 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
1155 (CategoryLabels::CountedValues, rest)
1156 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
1157 (CategoryLabels::VarLabels, rest)
1159 return Err(Error::TBD);
1161 let (value, input) = parse_counted_string(input)?;
1163 MultipleResponseType::MultipleDichotomy {
1164 value: value.into(),
1170 _ => return Err(Error::TBD),
1172 let Some(b' ') = input.get(0) else {
1173 return Err(Error::TBD);
1175 let (label, mut input) = parse_counted_string(&input[1..])?;
1176 let mut vars = Vec::new();
1177 while input.get(0) == Some(&b' ') {
1178 input = &input[1..];
1179 let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
1180 return Err(Error::TBD);
1183 vars.push(input[..length].into());
1185 input = &input[length..];
1187 if input.get(0) != Some(&b'\n') {
1188 return Err(Error::TBD);
1190 while input.get(0) == Some(&b'\n') {
1191 input = &input[1..];
1194 MultipleResponseSet {
1196 label: label.into(),
/// The multiple response sets parsed from a "multiple response set record"
/// extension.
1205 pub struct MultipleResponseSets(Vec<MultipleResponseSet>);
1207 impl ExtensionRecord for MultipleResponseSets {
1208 const SIZE: Option<u32> = Some(1);
1209 const COUNT: Option<u32> = None;
1210 const NAME: &'static str = "multiple response set record";
1212 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1213 ext.check_size::<Self>()?;
1215 let mut input = &ext.data[..];
1216 let mut sets = Vec::new();
1217 while !input.is_empty() {
1218 let (set, rest) = MultipleResponseSet::parse(input)?;
1222 Ok(MultipleResponseSets(sets))
1226 fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
1227 let Some(space) = input.iter().position(|&b| b == b' ') else {
1228 return Err(Error::TBD);
1230 let Ok(length) = from_utf8(&input[..space]) else {
1231 return Err(Error::TBD);
1233 let Ok(length): Result<usize, _> = length.parse() else {
1234 return Err(Error::TBD);
1237 let input = &input[space + 1..];
1238 if input.len() < length {
1239 return Err(Error::TBD);
1242 let (string, rest) = input.split_at(length);
1243 Ok((string.into(), rest))
/// The text of an "extra product info" record, stored exactly as it was
/// passed to the parser.
1246 pub struct ExtraProductInfo(String);
1248 impl TextRecord for ExtraProductInfo {
1249 const NAME: &'static str = "extra product info";
1250 fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
1251 Ok(ExtraProductInfo(input.into()))
/// The contents of a "variable display record" extension: one 32-bit value
/// per element of the record, parsed using the file's endianness.
1255 pub struct VarDisplayRecord(Vec<u32>);
1257 impl ExtensionRecord for VarDisplayRecord {
1258 const SIZE: Option<u32> = Some(4);
1259 const COUNT: Option<u32> = None;
1260 const NAME: &'static str = "variable display record";
1262 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1263 ext.check_size::<Self>()?;
1265 let mut input = &ext.data[..];
1266 let display = (0..ext.count)
1267 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
1269 Ok(VarDisplayRecord(display))
1273 pub struct VariableSet {
1275 pub vars: Vec<String>,
1279 fn parse(input: &str) -> Result<Self, Error> {
1280 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
1281 let vars = input.split_ascii_whitespace().map(String::from).collect();
/// The variable sets parsed from a "variable set" text record, one per line
/// of the record.
///
/// Lines that fail to parse are reported through the parser's `warn`
/// callback and do not appear here.
1289 pub struct VariableSetRecord(Vec<VariableSet>);
1291 impl TextRecord for VariableSetRecord {
1292 const NAME: &'static str = "variable set";
1293 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1294 let mut sets = Vec::new();
1295 for line in input.lines() {
1296 match VariableSet::parse(line) {
1297 Ok(set) => sets.push(set),
1298 Err(error) => warn(error),
1301 Ok(VariableSetRecord(sets))
1305 pub struct LongVariableName {
1306 pub short_name: String,
1307 pub long_name: String,
/// The short-name/long-name pairs parsed from a "long variable names" text
/// record, whose entries are tab-separated `short=long` pairs.
1310 pub struct LongVariableNameRecord(Vec<LongVariableName>);
1312 impl TextRecord for LongVariableNameRecord {
1313 const NAME: &'static str = "long variable names";
1314 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1315 let mut names = Vec::new();
1316 for pair in input.split('\t').filter(|s| !s.is_empty()) {
1317 if let Some((short_name, long_name)) = pair.split_once('=') {
1318 let name = LongVariableName {
1319 short_name: short_name.into(),
1320 long_name: long_name.into(),
1327 Ok(LongVariableNameRecord(names))
1331 pub struct VeryLongString {
1332 pub short_name: String,
1336 impl VeryLongString {
1337 fn parse(input: &str) -> Result<VeryLongString, Error> {
1338 let Some((short_name, length)) = input.split_once('=') else {
1339 return Err(Error::TBD);
1341 let length: usize = length.parse().map_err(|_| Error::TBD)?;
1343 short_name: short_name.into(),
/// The entries parsed from a "very long strings" text record.
///
/// Entries that fail to parse are reported through the parser's `warn`
/// callback and do not appear here.
1349 pub struct VeryLongStringRecord(Vec<VeryLongString>);
1351 impl TextRecord for VeryLongStringRecord {
1352 const NAME: &'static str = "very long strings";
1353 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1354 let mut very_long_strings = Vec::new();
1357 .map(|s| s.trim_end_matches('\t'))
1358 .filter(|s| !s.is_empty())
1360 match VeryLongString::parse(tuple) {
1361 Ok(vls) => very_long_strings.push(vls),
1362 Err(error) => warn(error),
1365 Ok(VeryLongStringRecord(very_long_strings))
1369 pub struct LongStringValueLabels {
1370 pub var_name: UnencodedString,
1373 /// `(value, label)` pairs, where each value is `width` bytes.
1374 pub labels: Vec<(UnencodedString, UnencodedString)>,
/// The per-variable label groups parsed from a "long string value labels
/// record" extension.
1377 pub struct LongStringValueLabelSet(Vec<LongStringValueLabels>);
1379 impl ExtensionRecord for LongStringValueLabelSet {
1380 const SIZE: Option<u32> = Some(1);
1381 const COUNT: Option<u32> = None;
1382 const NAME: &'static str = "long string value labels record";
1384 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1385 ext.check_size::<Self>()?;
1387 let mut input = &ext.data[..];
1388 let mut label_set = Vec::new();
1389 while !input.is_empty() {
1390 let var_name = read_string(&mut input, endian)?;
1391 let width: u32 = endian.parse(read_bytes(&mut input)?);
1392 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
1393 let mut labels = Vec::new();
1394 for _ in 0..n_labels {
1395 let value = read_string(&mut input, endian)?;
1396 let label = read_string(&mut input, endian)?;
1397 labels.push((value, label));
1399 label_set.push(LongStringValueLabels {
1405 Ok(LongStringValueLabelSet(label_set))
1409 pub struct LongStringMissingValues {
1411 pub var_name: UnencodedString,
1414 pub missing_values: MissingValues,
/// The per-variable missing-value groups parsed from a "long string missing
/// values record" extension.
1417 pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
1419 impl ExtensionRecord for LongStringMissingValueSet {
1420 const SIZE: Option<u32> = Some(1);
1421 const COUNT: Option<u32> = None;
1422 const NAME: &'static str = "long string missing values record";
1424 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1425 ext.check_size::<Self>()?;
1427 let mut input = &ext.data[..];
1428 let mut missing_value_set = Vec::new();
1429 while !input.is_empty() {
1430 let var_name = read_string(&mut input, endian)?;
1431 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1432 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1434 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset;
1435 return Err(Error::BadLongMissingValueLength {
1436 record_offset: ext.offset,
1441 let mut values = Vec::new();
1442 for i in 0..n_missing_values {
1443 let value: [u8; 8] = read_bytes(&mut input)?;
1444 let numeric_value: u64 = endian.parse(value);
1445 let value = if i > 0 && numeric_value == 8 {
1446 // Tolerate files written by old, buggy versions of PSPP
1447 // where we believed that the value_length was repeated
1448 // before each missing value.
1449 read_bytes(&mut input)?
1453 values.push(Value::String(value));
1455 let missing_values = MissingValues { values, range: None };
1456 missing_value_set.push(LongStringMissingValues {
1461 Ok(LongStringMissingValueSet(missing_value_set))
/// The character encoding name carried by an "encoding record" extension.
///
/// Parsing requires the record's data to be valid UTF-8; otherwise it fails
/// with `Error::BadEncodingName`.
1465 pub struct Encoding(pub String);
1467 impl ExtensionRecord for Encoding {
1468 const SIZE: Option<u32> = Some(1);
1469 const COUNT: Option<u32> = None;
1470 const NAME: &'static str = "encoding record";
1472 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1473 ext.check_size::<Self>()?;
1475 Ok(Encoding(String::from_utf8(ext.data.clone()).map_err(
1476 |_| Error::BadEncodingName { offset: ext.offset },
/// One attribute and its values, as produced by `Attribute::parse`.
1481 pub struct Attribute {
/// Values of the attribute, in the order they appeared in the input.
1483 pub values: Vec<String>,
/// Parses one attribute from the front of `input` and returns it together
/// with the unparsed remainder.
///
/// The text before the first `(` is split off as the name; each value then
/// runs up to the next `\n`.
///
/// # Errors
///
/// Returns `Error::TBD` if `input` has no `(`, or if a value is not
/// terminated by `\n`.
1487 fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> {
1488 let Some((name, mut input)) = input.split_once('(') else {
1489 return Err(Error::TBD);
1491 let mut values = Vec::new();
// Each value is everything up to the next newline.
1493 let Some((value, rest)) = input.split_once('\n') else {
1494 return Err(Error::TBD);
// A value wrapped in single quotes is stored without the quotes.
1496 if let Some(stripped) = value
1498 .and_then(|value| value.strip_suffix('\''))
1500 values.push(stripped.into());
// A value without the surrounding quotes is stored unchanged.
1503 values.push(value.into());
// A `)` immediately after a value's newline marks the end of the value list.
1505 if let Some(rest) = rest.strip_prefix(')') {
/// A list of attributes, in the order they were parsed.
1519 pub struct AttributeSet(pub Vec<Attribute>);
// `sentinel`: character that ends the set when it appears where an attribute
// would otherwise start; `None` means the set only ends at end of input.
1524 sentinel: Option<char>,
1525 warn: &impl Fn(Error),
1526 ) -> Result<(AttributeSet, &'a str), Error> {
1527 let mut attributes = Vec::new();
// Look at the next character to decide whether the set is finished.
1529 match input.chars().next() {
// End of input: nothing is left over.
1530 None => break input,
// Sentinel found: skip it and hand back what follows.  Slicing at byte 1
// assumes the sentinel is a single-byte character; the only sentinel passed
// in this file is `'/'`.
1531 c if c == sentinel => break &input[1..],
// Anything else starts another attribute.
1533 let (attribute, rest) = Attribute::parse(input, &warn)?;
1534 attributes.push(attribute);
1539 Ok((AttributeSet(attributes), rest))
/// The attribute set parsed from a "data file attributes" text record.
1543 pub struct FileAttributeRecord(AttributeSet);
1545 impl TextRecord for FileAttributeRecord {
1546 const NAME: &'static str = "data file attributes";
/// Parses the whole record text as a single attribute set with no sentinel.
///
/// # Errors
///
/// Propagates any error from `AttributeSet::parse`.
1547 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1548 let (set, rest) = AttributeSet::parse(input, None, &warn)?;
// NOTE(review): with a `None` sentinel, `AttributeSet::parse` appears to stop
// only at end of input, so `rest` looks like it is always empty here and this
// branch may be unreachable -- confirm.
1549 if !rest.is_empty() {
1552 Ok(FileAttributeRecord(set))
/// The attributes attached to one variable.
1556 pub struct VarAttributeSet {
/// Text that precedes the `:` in the input, stored as the variable's name.
1557 pub long_var_name: String,
/// Attributes parsed from the text that follows the `:`.
1558 pub attributes: AttributeSet,
1561 impl VarAttributeSet {
1564 warn: &impl Fn(Error),
1565 ) -> Result<(VarAttributeSet, &'a str), Error> {
// The variable name is everything before the first `:`; without a `:` the
// input is rejected with `Error::TBD`.
1566 let Some((long_var_name, rest)) = input.split_once(':') else {
1567 return Err(Error::TBD);
// This variable's attributes run until a `/` sentinel or the end of input.
1569 let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?;
1572 long_var_name: long_var_name.into(),
/// The per-variable attribute sets parsed from a "variable attributes" text
/// record, in input order.
1580 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1582 impl TextRecord for VariableAttributeRecord {
1583 const NAME: &'static str = "variable attributes";
/// Parses `input` as a sequence of `VarAttributeSet`s, one after another,
/// until the input is used up.
1584 fn parse(mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1585 let mut var_attribute_sets = Vec::new();
// Each iteration tries to parse one variable's attribute set from the front
// of the remaining input.
1586 while !input.is_empty() {
1587 match VarAttributeSet::parse(input, &warn) {
1588 Ok((var_attribute, rest)) => {
1589 var_attribute_sets.push(var_attribute);
1598 Ok(VariableAttributeRecord(var_attribute_sets))
/// Decoded contents of the "extended number of cases record" extension,
/// which holds two 8-byte elements.
1602 pub struct NumberOfCasesRecord {
1603 /// Always observed as 1.
1606 /// Number of cases.
1610 impl ExtensionRecord for NumberOfCasesRecord {
1611 const SIZE: Option<u32> = Some(8);
1612 const COUNT: Option<u32> = Some(2);
1613 const NAME: &'static str = "extended number of cases record";
1615 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1616 ext.check_size::<Self>()?;
1618 let mut input = &ext.data[..];
1619 let one = endian.parse(read_bytes(&mut input)?);
1620 let n_cases = endian.parse(read_bytes(&mut input)?);
1622 Ok(NumberOfCasesRecord { one, n_cases })
/// A raw extension record: its header values plus the undecoded data, which
/// the `ExtensionRecord` implementations interpret.
1626 #[derive(Clone, Debug)]
1627 pub struct Extension {
1628 /// Offset from the start of the file to the start of the record.
1634 /// Size of each data element.
1637 /// Number of data elements.
1640 /// `size * count` bytes of data.
/// Maps each extension record type to a `(size, count)` pair of `u32`s.
///
/// NOTE(review): a count of 0 appears to mean "any count" -- for example
/// `Encoding` is `(1, 0)` here while its `ExtensionRecord` impl declares
/// `COUNT = None`, and `Ncases` is `(8, 2)` matching `SIZE = Some(8)`,
/// `COUNT = Some(2)` -- confirm before relying on it.
1645 fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
1647 /* Implemented record types. */
1648 ExtensionType::Integer => (4, 8),
1649 ExtensionType::Float => (8, 3),
1650 ExtensionType::VarSets => (1, 0),
1651 ExtensionType::Mrsets => (1, 0),
1652 ExtensionType::ProductInfo => (1, 0),
1653 ExtensionType::Display => (4, 0),
1654 ExtensionType::LongNames => (1, 0),
1655 ExtensionType::LongStrings => (1, 0),
1656 ExtensionType::Ncases => (8, 2),
1657 ExtensionType::FileAttrs => (1, 0),
1658 ExtensionType::VarAttrs => (1, 0),
1659 ExtensionType::Mrsets2 => (1, 0),
1660 ExtensionType::Encoding => (1, 0),
1661 ExtensionType::LongLabels => (1, 0),
1662 ExtensionType::LongMissing => (1, 0),
1664 /* Ignored record types. */
1665 ExtensionType::Date => (0, 0),
1666 ExtensionType::DataEntry => (0, 0),
1667 ExtensionType::Dataview => (0, 0),
/// Checks this record's `size` and `count` against the requirements that
/// record type `E` declares.
///
/// A requirement of `None` in `E::SIZE` or `E::COUNT` skips that check.
///
/// # Errors
///
/// Returns `Error::BadRecordSize` if the element size differs from
/// `E::SIZE`, or `Error::BadRecordCount` if the element count differs from
/// `E::COUNT`.  Both errors carry this record's offset and `E::NAME`.
1673 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
1674 if let Some(expected_size) = E::SIZE {
1675 if self.size != expected_size {
1676 return Err(Error::BadRecordSize {
1677 offset: self.offset,
1678 record: E::NAME.into(),
1684 if let Some(expected_count) = E::COUNT {
1685 if self.count != expected_count {
1686 return Err(Error::BadRecordCount {
1687 offset: self.offset,
1688 record: E::NAME.into(),
/// Reads an extension record from `r`: the subtype, the element size, and
/// the element count (each decoded with `endian`), followed by
/// `size * count` bytes of data.
///
/// # Errors
///
/// Returns `Error::ExtensionRecordTooLarge` if `size * count` does not fit
/// in a `u32`.  I/O errors from `r` are propagated.
1697 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Extension, Error> {
1698 let subtype = endian.parse(read_bytes(r)?);
// Stream position just after the subtype field.
1699 let offset = r.stream_position()?;
1700 let size: u32 = endian.parse(read_bytes(r)?);
1701 let count = endian.parse(read_bytes(r)?);
// The multiplication is checked so that a corrupt header cannot wrap around
// to a small data length.
1702 let Some(product) = size.checked_mul(count) else {
1703 return Err(Error::ExtensionRecordTooLarge {
// Rebinds `offset` to the stream position where the data begins, shadowing
// the earlier value.
1710 let offset = r.stream_position()?;
1711 let data = read_vec(r, product as usize)?;
/// Three 64-bit values, read by `ZHeader::read`, that locate the ZLIB data
/// header and the ZLIB trailer within the file.
1722 #[derive(Clone, Debug)]
1723 pub struct ZHeader {
1724 /// File offset to the start of the record.
1727 /// File offset to the ZLIB data header.
1728 pub zheader_offset: u64,
1730 /// File offset to the ZLIB trailer.
1731 pub ztrailer_offset: u64,
1733 /// Length of the ZLIB trailer in bytes.
1734 pub ztrailer_len: u64,
/// Reads a `ZHeader` from the current position of `r`, decoding each value
/// with `endian`.
///
/// The stream position at entry is captured first; then `zheader_offset`,
/// `ztrailer_offset`, and `ztrailer_len` are read in that order as 64-bit
/// values.  I/O errors from `r` are propagated.
1738 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
1739 let offset = r.stream_position()?;
1740 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
1741 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
1742 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
/// The ZLIB trailer: a few header values followed by one descriptor per
/// block of compressed data.
1753 #[derive(Clone, Debug)]
1754 pub struct ZTrailer {
1755 /// File offset to the start of the record.
1758 /// Compression bias as a negative integer, e.g. -100.
1761 /// Always observed as zero.
1764 /// Uncompressed size of each block, except possibly the last. Only
1765 /// `0x3ff000` has been observed so far.
1766 pub block_size: u32,
1768 /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
1769 pub blocks: Vec<ZBlock>,
/// Descriptor for one block of compressed data, as listed in the ZLIB
/// trailer.
1772 #[derive(Clone, Debug)]
1774 /// Offset of block of data if simple compression were used.
1775 pub uncompressed_ofs: u64,
1777 /// Actual offset within the file of the compressed data block.
1778 pub compressed_ofs: u64,
1780 /// The number of bytes in this data block after decompression. This is
1781 /// `block_size` in every data block but the last, which may be smaller.
1782 pub uncompressed_size: u32,
1784 /// The number of bytes in this data block, as stored compressed in this
1786 pub compressed_size: u32,
1790 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
1792 uncompressed_ofs: endian.parse(read_bytes(r)?),
1793 compressed_ofs: endian.parse(read_bytes(r)?),
1794 uncompressed_size: endian.parse(read_bytes(r)?),
1795 compressed_size: endian.parse(read_bytes(r)?),
/// Reads the ZLIB trailer located at file offset `ztrailer_ofs`, then puts
/// the reader back at the position it had on entry.
///
/// A failed seek to `ztrailer_ofs` is detected with `is_err()` and handled
/// as an early exit rather than being propagated with `?`.
///
/// # Errors
///
/// Returns `Error::BadZlibTrailerNBlocks` if the block count stored in the
/// trailer differs from the count implied by `ztrailer_len`.  I/O errors
/// while reading are propagated.
1801 fn read<R: Read + Seek>(
1806 ) -> Result<Option<ZTrailer>, Error> {
// Remember where the caller was so the position can be restored afterwards.
1807 let start_offset = reader.stream_position()?;
1808 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
1811 let int_bias = endian.parse(read_bytes(reader)?);
1812 let zero = endian.parse(read_bytes(reader)?);
1813 let block_size = endian.parse(read_bytes(reader)?);
1814 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
// NOTE(review): `ztrailer_len - 24` underflows when `ztrailer_len < 24`
// (panic in debug builds, wraparound in release).  Consider `checked_sub`
// unless callers are known to validate the length first -- confirm.
1815 let expected_n_blocks = (ztrailer_len - 24) / 24;
1816 if n_blocks as u64 != expected_n_blocks {
1817 return Err(Error::BadZlibTrailerNBlocks {
1818 offset: ztrailer_ofs,
// Read `n_blocks` descriptors, stopping at the first error.
1824 let blocks = (0..n_blocks)
1825 .map(|_| ZBlock::read(reader, endian))
1826 .collect::<Result<Vec<_>, _>>()?;
// Restore the position the reader had on entry.
1827 reader.seek(SeekFrom::Start(start_offset))?;
1829 offset: ztrailer_ofs,
/// Reads exactly `N` bytes from `r`, or reports a clean end of input.
///
/// Returns `Ok(None)` if the first read yields no bytes, meaning the reader
/// was already at end of input, and `Ok(Some(bytes))` once all `N` bytes
/// have been read.
///
/// # Errors
///
/// Returns an `UnexpectedEof` error if the input ends after at least one
/// byte but before `N` bytes were read.  Other I/O errors are propagated,
/// except `ErrorKind::Interrupted`, which is retried.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];

    // `Read::read` may fail spuriously with `Interrupted`; that is not a real
    // error and must be retried, exactly as `read_exact` does internally.
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    };

    if n == 0 {
        return Ok(None);
    }
    if n < N {
        // A short first read is legal; insist on the remainder.
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// # Errors
///
/// Propagates the error from `read_exact`, which is `UnexpectedEof` if the
/// input ends before `N` bytes are available.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
/// Reads exactly `n` bytes from `r` into a new vector.
///
/// `n` is typically a length taken from the file being parsed, so it cannot
/// be trusted.  The data is read through a `take(n)` adapter rather than
/// into a pre-sized buffer, so memory use follows the bytes actually present
/// and a corrupt length fails with an error instead of a huge allocation.
///
/// # Errors
///
/// Returns an `UnexpectedEof` error if the input ends before `n` bytes were
/// read.  Other I/O errors from `r` are propagated.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    // `usize` never exceeds 64 bits on supported targets; saturate anyway
    // rather than cast.
    let limit = u64::try_from(n).unwrap_or(u64::MAX);

    let mut vec = Vec::new();
    r.by_ref().take(limit).read_to_end(&mut vec)?;
    if vec.len() != n {
        return Err(IoError::from(std::io::ErrorKind::UnexpectedEof));
    }
    Ok(vec)
}
1863 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
1864 let length: u32 = endian.parse(read_bytes(r)?);
1865 Ok(read_vec(r, length as usize)?.into())