1 use crate::endian::{Endian, Parse, ToBytes};
4 use flate2::read::ZlibDecoder;
6 use std::str::from_utf8;
9 io::{Error as IoError, Read, Seek, SeekFrom},
13 use self::state::State;
15 #[derive(Copy, Clone, Debug)]
16 pub enum Compression {
25 ValueLabel(ValueLabel),
26 VarIndexes(VarIndexes),
35 fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
36 let rec_type: u32 = endian.parse(read_bytes(reader)?);
38 2 => Ok(Record::Variable(Variable::read(reader, endian)?)),
39 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)),
40 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)),
41 6 => Ok(Record::Document(Document::read(reader, endian)?)),
42 7 => Ok(Record::Extension(Extension::read(reader, endian)?)),
43 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
44 _ => Err(Error::BadRecordType {
45 offset: reader.stream_position()?,
56 /// Eye-catcher string, product name, in the file's encoding. Padded
57 /// on the right with spaces.
58 pub eye_catcher: [u8; 60],
60 /// Layout code, normally either 2 or 3.
63 /// Number of variable positions, or `None` if the value in the file is
64 /// questionably trustworthy.
65 pub nominal_case_size: Option<u32>,
67 /// Compression type, if any,
68 pub compression: Option<Compression>,
70 /// 0-based variable index of the weight variable, or `None` if the file is
72 pub weight_index: Option<u32>,
74 /// Claimed number of cases, if known.
75 pub n_cases: Option<u32>,
77 /// Compression bias, usually 100.0.
80 /// `dd mmm yy` in the file's encoding.
81 pub creation_date: [u8; 9],
83 /// `HH:MM:SS` in the file's encoding.
84 pub creation_time: [u8; 8],
86 /// File label, in the file's encoding. Padded on the right with spaces.
87 pub file_label: [u8; 64],
89 /// Endianness of the data in the file header.
94 fn read<R: Read>(r: &mut R) -> Result<Header, Error> {
95 let magic: [u8; 4] = read_bytes(r)?;
96 let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
98 let eye_catcher: [u8; 60] = read_bytes(r)?;
99 let layout_code: [u8; 4] = read_bytes(r)?;
100 let endian = Endian::identify_u32(2, layout_code)
101 .or_else(|| Endian::identify_u32(2, layout_code))
102 .ok_or_else(|| Error::NotASystemFile)?;
103 let layout_code = endian.parse(layout_code);
105 let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
106 let nominal_case_size =
107 (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
109 let compression_code: u32 = endian.parse(read_bytes(r)?);
110 let compression = match (magic, compression_code) {
111 (Magic::ZSAV, 2) => Some(Compression::ZLib),
112 (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
114 (_, 1) => Some(Compression::Simple),
115 (_, code) => return Err(Error::InvalidSavCompression(code)),
118 let weight_index: u32 = endian.parse(read_bytes(r)?);
119 let weight_index = (weight_index > 0).then_some(weight_index - 1);
121 let n_cases: u32 = endian.parse(read_bytes(r)?);
122 let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
124 let bias: f64 = endian.parse(read_bytes(r)?);
126 let creation_date: [u8; 9] = read_bytes(r)?;
127 let creation_time: [u8; 8] = read_bytes(r)?;
128 let file_label: [u8; 64] = read_bytes(r)?;
129 let _: [u8; 3] = read_bytes(r)?;
148 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
149 pub struct Magic([u8; 4]);
152 /// Magic number for a regular system file.
153 pub const SAV: Magic = Magic(*b"$FL2");
155 /// Magic number for a system file that contains zlib-compressed data.
156 pub const ZSAV: Magic = Magic(*b"$FL3");
158 /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
160 pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
163 impl TryFrom<[u8; 4]> for Magic {
166 fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
167 let magic = Magic(value);
169 Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
170 _ => Err(Error::BadMagic(value)),
175 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
182 fn from_width(width: i32) -> VarType {
184 0 => VarType::Number,
185 _ => VarType::String,
192 Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer,
195 use crate::endian::Endian;
197 collections::VecDeque,
202 #[allow(clippy::type_complexity)]
203 fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
206 struct Start<R: Read + Seek> {
210 pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
211 Box::new(Start { reader })
214 struct CommonState<R: Read + Seek> {
218 compression: Option<Compression>,
219 var_types: Vec<VarType>,
222 impl<R: Read + Seek + 'static> State for Start<R> {
223 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
224 let header = Header::read(&mut self.reader)?;
225 let next_state = Headers(CommonState {
227 endian: header.endian,
229 compression: header.compression,
230 var_types: Vec::new(),
232 Ok(Some((Record::Header(header), Box::new(next_state))))
236 struct Headers<R: Read + Seek>(CommonState<R>);
238 impl<R: Read + Seek + 'static> State for Headers<R> {
239 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
240 let record = Record::read(&mut self.0.reader, self.0.endian)?;
242 Record::Variable(Variable { width, .. }) => {
243 self.0.var_types.push(VarType::from_width(width));
245 Record::EndOfHeaders(_) => {
246 let next_state: Box<dyn State> = match self.0.compression {
247 None => Box::new(Data(self.0)),
248 Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
249 Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
251 return Ok(Some((record, next_state)));
255 Ok(Some((record, self)))
259 struct ZlibHeader<R: Read + Seek>(CommonState<R>);
261 impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
262 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
263 let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
264 Ok(Some((Record::ZHeader(zheader), self)))
268 struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
270 impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
271 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
272 let retval = ZTrailer::read(
275 self.1.ztrailer_offset,
278 let next_state = Box::new(CompressedData::new(CommonState {
279 reader: ZlibDecodeMultiple::new(self.0.reader),
280 endian: self.0.endian,
282 compression: self.0.compression,
283 var_types: self.0.var_types,
286 None => next_state.read(),
287 Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
292 struct Data<R: Read + Seek>(CommonState<R>);
294 impl<R: Read + Seek + 'static> State for Data<R> {
295 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
296 match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
298 Some(values) => Ok(Some((Record::Case(values), self))),
303 struct CompressedData<R: Read + Seek> {
304 common: CommonState<R>,
308 impl<R: Read + Seek + 'static> CompressedData<R> {
309 fn new(common: CommonState<R>) -> CompressedData<R> {
312 codes: VecDeque::new(),
317 impl<R: Read + Seek + 'static> State for CompressedData<R> {
318 fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
319 match Value::read_compressed_case(
320 &mut self.common.reader,
321 &self.common.var_types,
327 Some(values) => Ok(Some((Record::Case(values), self))),
333 #[derive(Copy, Clone)]
340 pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
342 VarType::String => Value::String(raw),
344 let number: f64 = endian.parse(raw);
345 Value::Number((number != -f64::MAX).then_some(number))
350 fn read_case<R: Read + Seek>(
352 var_types: &[VarType],
354 ) -> Result<Option<Vec<Value>>, Error> {
355 let case_start = reader.stream_position()?;
356 let mut values = Vec::with_capacity(var_types.len());
357 for (i, &var_type) in var_types.iter().enumerate() {
358 let Some(raw) = try_read_bytes(reader)? else {
362 let offset = reader.stream_position()?;
363 return Err(Error::EofInCase {
365 case_ofs: offset - case_start,
366 case_len: var_types.len() * 8,
370 values.push(Value::from_raw(var_type, raw, endian));
375 fn read_compressed_case<R: Read + Seek>(
377 var_types: &[VarType],
378 codes: &mut VecDeque<u8>,
381 ) -> Result<Option<Vec<Value>>, Error> {
382 let case_start = reader.stream_position()?;
383 let mut values = Vec::with_capacity(var_types.len());
384 for (i, &var_type) in var_types.iter().enumerate() {
386 let Some(code) = codes.pop_front() else {
387 let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
391 let offset = reader.stream_position()?;
392 return Err(Error::EofInCompressedCase {
394 case_ofs: offset - case_start,
398 codes.extend(new_codes.into_iter());
403 1..=251 => match var_type {
404 VarType::Number => break Value::Number(Some(code as f64 - bias)),
406 break Value::String(endian.to_bytes(code as f64 - bias))
413 let offset = reader.stream_position()?;
414 return Err(Error::PartialCompressedCase {
416 case_ofs: offset - case_start,
420 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian),
421 254 => match var_type {
422 VarType::String => break Value::String(*b" "), // XXX EBCDIC
424 return Err(Error::CompressedStringExpected {
426 case_ofs: reader.stream_position()? - case_start,
430 255 => match var_type {
431 VarType::Number => break Value::Number(None),
433 return Err(Error::CompressedNumberExpected {
435 case_ofs: reader.stream_position()? - case_start,
447 struct ZlibDecodeMultiple<R>
451 reader: Option<ZlibDecoder<R>>,
454 impl<R> ZlibDecodeMultiple<R>
458 fn new(reader: R) -> ZlibDecodeMultiple<R> {
460 reader: Some(ZlibDecoder::new(reader)),
465 impl<R> Read for ZlibDecodeMultiple<R>
469 fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
471 match self.reader.as_mut().unwrap().read(buf)? {
473 let inner = self.reader.take().unwrap().into_inner();
474 self.reader = Some(ZlibDecoder::new(inner));
482 impl<R> Seek for ZlibDecodeMultiple<R>
486 fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
487 self.reader.as_mut().unwrap().get_mut().seek(pos)
492 state: Option<Box<dyn State>>,
496 pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
498 state: Some(state::new(reader)),
503 impl Iterator for Reader {
504 type Item = Result<Record, Error>;
506 fn next(&mut self) -> Option<Self::Item> {
507 match self.state.take()?.read() {
508 Ok(Some((record, next_state))) => {
509 self.state = Some(next_state);
513 Err(error) => Some(Err(error)),
518 impl FusedIterator for Reader {}
520 pub struct Variable {
521 /// Offset from the start of the file to the start of the record.
524 /// Variable width, in the range -1..=255.
527 /// Variable name, padded on the right with spaces.
531 pub print_format: u32,
534 pub write_format: u32,
536 /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
537 pub missing_value_code: i32,
539 /// Raw missing values, up to 3 of them.
540 pub missing: Vec<[u8; 8]>,
542 /// Optional variable label.
543 pub label: Option<Vec<u8>>,
// Reads one variable record from `r`, decoding integer fields with `endian`.
// The fixed fields are read in file order: width, label flag, missing-value
// code, print format, write format, then the 8-byte name.
547 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
// NOTE(review): the position is sampled on entry to this function; the
// visible caller reads the 4-byte record type before dispatching here, so
// this may point just past the record type rather than at the record
// start -- confirm against the field documentation.
548 let offset = r.stream_position()?;
549 let width: i32 = endian.parse(read_bytes(r)?);
550 let has_variable_label: u32 = endian.parse(read_bytes(r)?);
551 let missing_value_code: i32 = endian.parse(read_bytes(r)?);
552 let print_format: u32 = endian.parse(read_bytes(r)?);
553 let write_format: u32 = endian.parse(read_bytes(r)?);
554 let name: [u8; 8] = read_bytes(r)?;
556 let label = match has_variable_label {
559 let len: u32 = endian.parse(read_bytes(r)?);
// At most 65535 bytes of the label are read.
560 let read_len = len.min(65535) as usize;
561 let label = Some(read_vec(r, read_len)?);
// NOTE(review): the padding is derived from the full `len`, not from
// `read_len`.  When `len > 65535`, the `len - read_len` label bytes that
// were not read are not skipped by any visible line, which would leave the
// stream misaligned -- confirm.  Rounding `len` up to a multiple of 4 may
// also overflow for values near `u32::MAX`.
563 let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
564 let _ = read_vec(r, padding_bytes as usize)?;
569 return Err(Error::BadVariableLabelCode {
571 code: has_variable_label,
576 let mut missing = Vec::new();
577 if missing_value_code != 0 {
// For a width of 0, only these missing-value codes are accepted.
578 match (width, missing_value_code) {
579 (0, -3 | -2 | 1 | 2 | 3) => (),
581 return Err(Error::BadNumericMissingValueCode {
583 code: missing_value_code,
588 return Err(Error::BadStringMissingValueCode {
590 code: missing_value_code,
// One raw value is read per unit of the code's absolute value.
595 for _ in 0..missing_value_code.abs() {
596 missing.push(read_bytes(r)?);
613 pub struct ValueLabel {
614 /// Offset from the start of the file to the start of the record.
618 pub labels: Vec<([u8; 8], Vec<u8>)>,
622 /// Maximum number of value labels in a record.
623 pub const MAX: u32 = u32::MAX / 8;
625 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
626 let offset = r.stream_position()?;
627 let n: u32 = endian.parse(read_bytes(r)?);
628 if n > ValueLabel::MAX {
629 return Err(Error::BadNumberOfValueLabels {
632 max: ValueLabel::MAX,
636 let mut labels = Vec::new();
638 let value: [u8; 8] = read_bytes(r)?;
639 let label_len: u8 = endian.parse(read_bytes(r)?);
640 let label_len = label_len as usize;
641 let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
643 let mut label = read_vec(r, padded_len)?;
644 label.truncate(label_len);
645 labels.push((value, label));
647 Ok(ValueLabel { offset, labels })
651 pub struct VarIndexes {
652 /// Offset from the start of the file to the start of the record.
655 /// The 0-based indexes of the variable indexes.
656 pub var_indexes: Vec<u32>,
660 /// Maximum number of variable indexes in a record.
661 pub const MAX: u32 = u32::MAX / 8;
663 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
664 let offset = r.stream_position()?;
665 let n: u32 = endian.parse(read_bytes(r)?);
666 if n > VarIndexes::MAX {
667 return Err(Error::BadNumberOfVarIndexes {
670 max: VarIndexes::MAX,
673 let mut var_indexes = Vec::with_capacity(n as usize);
675 var_indexes.push(endian.parse(read_bytes(r)?));
685 pub struct Document {
686 /// Offset from the start of the file to the start of the record.
689 /// The document, as an array of 80-byte lines.
690 pub lines: Vec<[u8; Document::LINE_LEN as usize]>,
694 /// Length of a line in a document. Document lines are fixed-length and
695 /// padded on the right with spaces.
696 pub const LINE_LEN: u32 = 80;
698 /// Maximum number of lines we will accept in a document. This is simply
699 /// the maximum number that will fit in a 32-bit space.
700 pub const MAX_LINES: u32 = i32::MAX as u32 / Self::LINE_LEN;
702 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
703 let offset = r.stream_position()?;
704 let n: u32 = endian.parse(read_bytes(r)?);
706 0..=Self::MAX_LINES => Ok(Document {
707 pos: r.stream_position()?,
709 .map(|_| read_bytes(r))
710 .collect::<Result<Vec<_>, _>>()?,
712 _ => Err(Error::BadDocumentLength {
715 max: Self::MAX_LINES,
722 #[derive(FromPrimitive)]
724 /// Machine integer info.
726 /// Machine floating-point info.
732 /// Multiple response sets.
736 /// Extra product info text.
738 /// Variable display parameters.
740 /// Long variable names.
744 /// Extended number of cases.
746 /// Data file attributes.
748 /// Variable attributes.
750 /// Multiple response sets (extended).
752 /// Character encoding.
754 /// Value labels for long strings.
756 /// Missing values for long strings.
758 /// "Format properties in dataview table".
767 const NAME: &'static str;
768 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
771 trait ExtensionRecord
775 const SIZE: Option<u32>;
776 const COUNT: Option<u32>;
777 const NAME: &'static str;
778 fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
781 pub struct IntegerInfo {
782 pub version: (i32, i32, i32),
783 pub machine_code: i32,
784 pub floating_point_rep: i32,
785 pub compression_code: i32,
787 pub character_code: i32,
790 impl ExtensionRecord for IntegerInfo {
791 const SIZE: Option<u32> = Some(4);
792 const COUNT: Option<u32> = Some(8);
793 const NAME: &'static str = "integer record";
795 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
796 ext.check_size::<Self>()?;
798 let mut input = &ext.data[..];
799 let data: Vec<i32> = (0..8)
800 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
803 version: (data[0], data[1], data[2]),
804 machine_code: data[3],
805 floating_point_rep: data[4],
806 compression_code: data[5],
808 character_code: data[7],
813 pub struct FloatInfo {
819 impl ExtensionRecord for FloatInfo {
820 const SIZE: Option<u32> = Some(8);
821 const COUNT: Option<u32> = Some(3);
822 const NAME: &'static str = "floating point record";
824 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
825 ext.check_size::<Self>()?;
827 let mut input = &ext.data[..];
828 let data: Vec<f64> = (0..3)
829 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
839 pub enum CategoryLabels {
843 pub enum MultipleResponseType {
846 labels: CategoryLabels,
850 pub struct MultipleResponseSet {
853 pub mr_type: MultipleResponseType,
854 pub vars: Vec<Vec<u8>>,
857 impl MultipleResponseSet {
// Parses one multiple response set from the front of `input`, returning the
// set together with the unconsumed remainder.  Every malformed-input path
// visible here reports `Error::TBD`.
858 fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
// The set name is everything before the first `=`.
859 let Some(equals) = input.iter().position(|&b| b == b'=') else {
860 return Err(Error::TBD);
// NOTE(review): `split_at(equals)` leaves the `=` byte as the first byte of
// the second slice, so `input.get(0)` on the next line sees `b'='` and the
// `b'C'` arm below cannot match as written; no visible line skips the `=`
// first.  Presumably the remainder should start at `equals + 1` -- confirm.
862 let (name, input) = input.split_at(equals);
863 let (mr_type, input) = match input.get(0) {
864 Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
866 let (value, input) = parse_counted_string(&input[1..])?;
868 MultipleResponseType::MultipleDichotomy {
870 labels: CategoryLabels::VarLabels,
876 let Some(b' ') = input.get(1) else {
877 return Err(Error::TBD);
// NOTE(review): two bytes are dropped here, yet both prefixes tested below
// begin with a space; check whether that leading space is still present in
// the input at this point.
879 let input = &input[2..];
880 let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
881 (CategoryLabels::CountedValues, rest)
882 } else if let Some(rest) = input.strip_prefix(b" 11 ") {
883 (CategoryLabels::VarLabels, rest)
885 return Err(Error::TBD);
887 let (value, input) = parse_counted_string(input)?;
889 MultipleResponseType::MultipleDichotomy {
896 _ => return Err(Error::TBD),
// A single space separates the type section from the counted label.
898 let Some(b' ') = input.get(0) else {
899 return Err(Error::TBD);
901 let (label, mut input) = parse_counted_string(&input[1..])?;
// Variable names follow; each is terminated by a space or a newline.
902 let mut vars = Vec::new();
903 while input.get(0) == Some(&b' ') {
905 let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
906 return Err(Error::TBD);
909 vars.push(input[..length].into());
911 input = &input[length..];
// The variable list must end with a newline; further newlines are consumed
// by the loop that follows.
913 if input.get(0) != Some(&b'\n') {
914 return Err(Error::TBD);
916 while input.get(0) == Some(&b'\n') {
920 MultipleResponseSet {
931 pub struct MultipleResponseSets(Vec<MultipleResponseSet>);
933 impl ExtensionRecord for MultipleResponseSets {
934 const SIZE: Option<u32> = Some(1);
935 const COUNT: Option<u32> = None;
936 const NAME: &'static str = "multiple response set record";
938 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
939 ext.check_size::<Self>()?;
941 let mut input = &ext.data[..];
942 let mut sets = Vec::new();
943 while !input.is_empty() {
944 let (set, rest) = MultipleResponseSet::parse(input)?;
948 Ok(MultipleResponseSets(sets))
952 fn parse_counted_string(input: &[u8]) -> Result<(&[u8], &[u8]), Error> {
953 let Some(space) = input.iter().position(|&b| b == b' ') else {
954 return Err(Error::TBD);
956 let Ok(length) = from_utf8(&input[..space]) else {
957 return Err(Error::TBD);
959 let Ok(length): Result<usize, _> = length.parse() else {
960 return Err(Error::TBD);
963 let input = &input[space + 1..];
964 if input.len() < length {
965 return Err(Error::TBD);
968 let (string, rest) = input.split_at(length);
972 pub struct ExtraProductInfo(String);
974 impl TextRecord for ExtraProductInfo {
975 const NAME: &'static str = "extra product info";
976 fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
977 Ok(ExtraProductInfo(input.into()))
981 pub struct VarDisplayRecord(Vec<u32>);
983 impl ExtensionRecord for VarDisplayRecord {
984 const SIZE: Option<u32> = Some(4);
985 const COUNT: Option<u32> = None;
986 const NAME: &'static str = "variable display record";
988 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
989 ext.check_size::<Self>()?;
991 let mut input = &ext.data[..];
992 let display = (0..ext.count)
993 .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
995 Ok(VarDisplayRecord(display))
999 pub struct VariableSet {
1001 pub vars: Vec<String>,
1005 fn parse(input: &str) -> Result<Self, Error> {
1006 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
1007 let vars = input.split_ascii_whitespace().map(String::from).collect();
1015 pub struct VariableSetRecord(Vec<VariableSet>);
1017 impl TextRecord for VariableSetRecord {
1018 const NAME: &'static str = "variable set";
1019 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1020 let mut sets = Vec::new();
1021 for line in input.lines() {
1022 match VariableSet::parse(line) {
1023 Ok(set) => sets.push(set),
1024 Err(error) => warn(error),
1027 Ok(VariableSetRecord(sets))
1031 pub struct LongVariableName {
1032 pub short_name: String,
1033 pub long_name: String,
1036 pub struct LongVariableNameRecord(Vec<LongVariableName>);
1038 impl TextRecord for LongVariableNameRecord {
1039 const NAME: &'static str = "long variable names";
1040 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1041 let mut names = Vec::new();
1042 for pair in input.split('\t').filter(|s| !s.is_empty()) {
1043 if let Some((short_name, long_name)) = pair.split_once('=') {
1044 let name = LongVariableName {
1045 short_name: short_name.into(),
1046 long_name: long_name.into(),
1053 Ok(LongVariableNameRecord(names))
1057 pub struct VeryLongString {
1058 pub short_name: String,
1062 impl VeryLongString {
1063 fn parse(input: &str) -> Result<VeryLongString, Error> {
1064 let Some((short_name, length)) = input.split_once('=') else {
1065 return Err(Error::TBD);
1067 let length: usize = length.parse().map_err(|_| Error::TBD)?;
1069 short_name: short_name.into(),
1075 pub struct VeryLongStringRecord(Vec<VeryLongString>);
1077 impl TextRecord for VeryLongStringRecord {
1078 const NAME: &'static str = "very long strings";
1079 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1080 let mut very_long_strings = Vec::new();
1083 .map(|s| s.trim_end_matches('\t'))
1084 .filter(|s| !s.is_empty())
1086 match VeryLongString::parse(tuple) {
1087 Ok(vls) => very_long_strings.push(vls),
1088 Err(error) => warn(error),
1091 Ok(VeryLongStringRecord(very_long_strings))
1095 pub struct LongStringValueLabels {
1096 pub var_name: Vec<u8>,
1099 /// `(value, label)` pairs, where each value is `width` bytes.
1100 pub labels: Vec<(Vec<u8>, Vec<u8>)>,
1103 pub struct LongStringValueLabelSet(Vec<LongStringValueLabels>);
1105 impl ExtensionRecord for LongStringValueLabelSet {
1106 const SIZE: Option<u32> = Some(1);
1107 const COUNT: Option<u32> = None;
1108 const NAME: &'static str = "long string value labels record";
1110 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1111 ext.check_size::<Self>()?;
1113 let mut input = &ext.data[..];
1114 let mut label_set = Vec::new();
1115 while !input.is_empty() {
1116 let var_name = read_string(&mut input, endian)?;
1117 let width: u32 = endian.parse(read_bytes(&mut input)?);
1118 let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
1119 let mut labels = Vec::new();
1120 for _ in 0..n_labels {
1121 let value = read_string(&mut input, endian)?;
1122 let label = read_string(&mut input, endian)?;
1123 labels.push((value, label));
1125 label_set.push(LongStringValueLabels {
1131 Ok(LongStringValueLabelSet(label_set))
1135 pub struct LongStringMissingValues {
1137 pub var_name: Vec<u8>,
1139 /// Up to three missing values.
1140 pub missing_values: Vec<[u8; 8]>,
1143 pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
1145 impl ExtensionRecord for LongStringMissingValueSet {
1146 const SIZE: Option<u32> = Some(1);
1147 const COUNT: Option<u32> = None;
1148 const NAME: &'static str = "long string missing values record";
1150 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1151 ext.check_size::<Self>()?;
1153 let mut input = &ext.data[..];
1154 let mut missing_value_set = Vec::new();
1155 while !input.is_empty() {
1156 let var_name = read_string(&mut input, endian)?;
1157 let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
1158 let value_len: u32 = endian.parse(read_bytes(&mut input)?);
1160 let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset;
1161 return Err(Error::BadLongMissingValueLength {
1162 record_offset: ext.offset,
1167 let mut missing_values = Vec::new();
1168 for i in 0..n_missing_values {
1169 let value: [u8; 8] = read_bytes(&mut input)?;
1170 let numeric_value: u64 = endian.parse(value);
1171 let value = if i > 0 && numeric_value == 8 {
1172 // Tolerate files written by old, buggy versions of PSPP
1173 // where we believed that the value_length was repeated
1174 // before each missing value.
1175 read_bytes(&mut input)?
1179 missing_values.push(value);
1181 missing_value_set.push(LongStringMissingValues {
1186 Ok(LongStringMissingValueSet(missing_value_set))
1190 pub struct Encoding(pub String);
1192 impl ExtensionRecord for Encoding {
1193 const SIZE: Option<u32> = Some(1);
1194 const COUNT: Option<u32> = None;
1195 const NAME: &'static str = "encoding record";
1197 fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1198 ext.check_size::<Self>()?;
1200 Ok(Encoding(String::from_utf8(ext.data.clone()).map_err(
1201 |_| Error::BadEncodingName { offset: ext.offset },
1206 pub struct Attribute {
1208 pub values: Vec<String>,
1212 fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> {
1213 let Some((name, mut input)) = input.split_once('(') else {
1214 return Err(Error::TBD);
1216 let mut values = Vec::new();
1218 let Some((value, rest)) = input.split_once('\n') else {
1219 return Err(Error::TBD);
1221 if let Some(stripped) = value
1223 .and_then(|value| value.strip_suffix('\''))
1225 values.push(stripped.into());
1228 values.push(value.into());
1230 if let Some(rest) = rest.strip_prefix(')') {
1244 pub struct AttributeSet(pub Vec<Attribute>);
1249 sentinel: Option<char>,
1250 warn: &impl Fn(Error),
1251 ) -> Result<(AttributeSet, &'a str), Error> {
1252 let mut attributes = Vec::new();
1254 match input.chars().next() {
1255 None => break input,
1256 c if c == sentinel => break &input[1..],
1258 let (attribute, rest) = Attribute::parse(input, &warn)?;
1259 attributes.push(attribute);
1264 Ok((AttributeSet(attributes), rest))
1268 pub struct FileAttributeRecord(AttributeSet);
1270 impl TextRecord for FileAttributeRecord {
1271 const NAME: &'static str = "data file attributes";
1272 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1273 let (set, rest) = AttributeSet::parse(input, None, &warn)?;
1274 if !rest.is_empty() {
1277 Ok(FileAttributeRecord(set))
1281 pub struct VarAttributeSet {
1282 pub long_var_name: String,
1283 pub attributes: AttributeSet,
1286 impl VarAttributeSet {
1289 warn: &impl Fn(Error),
1290 ) -> Result<(VarAttributeSet, &'a str), Error> {
1291 let Some((long_var_name, rest)) = input.split_once(':') else {
1292 return Err(Error::TBD);
1294 let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?;
1297 long_var_name: long_var_name.into(),
1305 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1307 impl TextRecord for VariableAttributeRecord {
1308 const NAME: &'static str = "variable attributes";
1309 fn parse(mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1310 let mut var_attribute_sets = Vec::new();
1311 while !input.is_empty() {
1312 match VarAttributeSet::parse(input, &warn) {
1313 Ok((var_attribute, rest)) => {
1314 var_attribute_sets.push(var_attribute);
1323 Ok(VariableAttributeRecord(var_attribute_sets))
1327 pub struct NumberOfCasesRecord {
1328 /// Always observed as 1.
1331 /// Number of cases.
1335 impl ExtensionRecord for NumberOfCasesRecord {
1336 const SIZE: Option<u32> = Some(8);
1337 const COUNT: Option<u32> = Some(2);
1338 const NAME: &'static str = "extended number of cases record";
1340 fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
1341 ext.check_size::<Self>()?;
1343 let mut input = &ext.data[..];
1344 let one = endian.parse(read_bytes(&mut input)?);
1345 let n_cases = endian.parse(read_bytes(&mut input)?);
1347 Ok(NumberOfCasesRecord { one, n_cases })
1351 pub struct Extension {
1352 /// Offset from the start of the file to the start of the record.
1358 /// Size of each data element.
1361 /// Number of data elements.
1364 /// `size * count` bytes of data.
1369 fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
1371 /* Implemented record types. */
1372 ExtensionType::Integer => (4, 8),
1373 ExtensionType::Float => (8, 3),
1374 ExtensionType::VarSets => (1, 0),
1375 ExtensionType::Mrsets => (1, 0),
1376 ExtensionType::ProductInfo => (1, 0),
1377 ExtensionType::Display => (4, 0),
1378 ExtensionType::LongNames => (1, 0),
1379 ExtensionType::LongStrings => (1, 0),
1380 ExtensionType::Ncases => (8, 2),
1381 ExtensionType::FileAttrs => (1, 0),
1382 ExtensionType::VarAttrs => (1, 0),
1383 ExtensionType::Mrsets2 => (1, 0),
1384 ExtensionType::Encoding => (1, 0),
1385 ExtensionType::LongLabels => (1, 0),
1386 ExtensionType::LongMissing => (1, 0),
1388 /* Ignored record types. */
1389 ExtensionType::Date => (0, 0),
1390 ExtensionType::DataEntry => (0, 0),
1391 ExtensionType::Dataview => (0, 0),
1397 fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
1398 if let Some(expected_size) = E::SIZE {
1399 if self.size != expected_size {
1400 return Err(Error::BadRecordSize {
1401 offset: self.offset,
1402 record: E::NAME.into(),
1408 if let Some(expected_count) = E::COUNT {
1409 if self.count != expected_count {
1410 return Err(Error::BadRecordCount {
1411 offset: self.offset,
1412 record: E::NAME.into(),
1421 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Extension, Error> {
1422 let subtype = endian.parse(read_bytes(r)?);
1423 let offset = r.stream_position()?;
1424 let size: u32 = endian.parse(read_bytes(r)?);
1425 let count = endian.parse(read_bytes(r)?);
1426 let Some(product) = size.checked_mul(count) else {
1427 return Err(Error::ExtensionRecordTooLarge {
1434 let offset = r.stream_position()?;
1435 let data = read_vec(r, product as usize)?;
1446 pub struct ZHeader {
1447 /// File offset to the start of the record.
1450 /// File offset to the ZLIB data header.
1451 pub zheader_offset: u64,
1453 /// File offset to the ZLIB trailer.
1454 pub ztrailer_offset: u64,
1456 /// Length of the ZLIB trailer in bytes.
1457 pub ztrailer_len: u64,
1461 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
1462 let offset = r.stream_position()?;
1463 let zheader_offset: u64 = endian.parse(read_bytes(r)?);
1464 let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
1465 let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
1476 pub struct ZTrailer {
1477 /// File offset to the start of the record.
1480 /// Compression bias as a negative integer, e.g. -100.
1483 /// Always observed as zero.
1486 /// Uncompressed size of each block, except possibly the last. Only
1487 /// `0x3ff000` has been observed so far.
1488 pub block_size: u32,
1490 /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
1491 pub blocks: Vec<ZBlock>,
1495 /// Offset of block of data if simple compression were used.
1496 pub uncompressed_ofs: u64,
1498 /// Actual offset within the file of the compressed data block.
1499 pub compressed_ofs: u64,
1501 /// The number of bytes in this data block after decompression. This is
1502 /// `block_size` in every data block but the last, which may be smaller.
1503 pub uncompressed_size: u32,
1505 /// The number of bytes in this data block, as stored compressed in this
1507 pub compressed_size: u32,
1511 fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
1513 uncompressed_ofs: endian.parse(read_bytes(r)?),
1514 compressed_ofs: endian.parse(read_bytes(r)?),
1515 uncompressed_size: endian.parse(read_bytes(r)?),
1516 compressed_size: endian.parse(read_bytes(r)?),
// Reads the ZLIB trailer located at `ztrailer_ofs`: remembers the current
// stream position, seeks to the trailer, decodes its fixed fields and block
// descriptors, then seeks back to the remembered position.
1522 fn read<R: Read + Seek>(
1527 ) -> Result<Option<ZTrailer>, Error> {
1528 let start_offset = reader.stream_position()?;
// A failed seek is tested with `is_err()` rather than propagated with `?`.
// NOTE(review): the body of this branch is not visible; given the
// `Option` return type it presumably yields `Ok(None)` -- confirm.
1529 if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
1532 let int_bias = endian.parse(read_bytes(reader)?);
1533 let zero = endian.parse(read_bytes(reader)?);
1534 let block_size = endian.parse(read_bytes(reader)?);
1535 let n_blocks: u32 = endian.parse(read_bytes(reader)?);
// NOTE(review): `ztrailer_len - 24` underflows when `ztrailer_len < 24`
// (panic in debug builds, wraparound otherwise); no guard is visible in
// this excerpt.  Consider `checked_sub` and reporting the bad length.
1536 let expected_n_blocks = (ztrailer_len - 24) / 24;
1537 if n_blocks as u64 != expected_n_blocks {
1538 return Err(Error::BadZlibTrailerNBlocks {
1539 offset: ztrailer_ofs,
1545 let blocks = (0..n_blocks)
1546 .map(|_| ZBlock::read(reader, endian))
1547 .collect::<Result<Vec<_>, _>>()?;
// The original position is restored only on this success path.
// NOTE(review): every early return above (the `?` operators and the
// block-count error) leaves the reader positioned inside the trailer.
1548 reader.seek(SeekFrom::Start(start_offset))?;
1550 offset: ztrailer_ofs,
/// Attempts to read exactly `N` bytes from `r`, distinguishing a clean end
/// of input from a truncated read.
///
/// Returns `Ok(None)` if the first read yields no bytes at all (end of
/// input), and `Ok(Some(bytes))` once all `N` bytes have been read.
///
/// # Errors
///
/// Returns an error if the input ends after some but not all of the `N`
/// bytes were read (`ErrorKind::UnexpectedEof` from `read_exact`), or if the
/// reader reports any other I/O error.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];

    // `Read::read` may fail with `ErrorKind::Interrupted`, which means no
    // data was transferred and the call should simply be retried;
    // `read_exact` below already does this internally, so do the same for
    // the first read instead of surfacing a spurious failure.
    let n = loop {
        match r.read(&mut buf) {
            Ok(n) => break n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    };

    if n == 0 {
        return Ok(None);
    }
    if n < N {
        // A short first read is not end of input: fetch the remainder.
        r.read_exact(&mut buf[n..])?;
    }
    Ok(Some(buf))
}
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// # Errors
///
/// Propagates any error from `Read::read_exact`, including
/// `ErrorKind::UnexpectedEof` when fewer than `N` bytes remain.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
/// Reads exactly `n` bytes from `r` into a newly allocated vector.
///
/// # Errors
///
/// Propagates any error from `Read::read_exact`, including
/// `ErrorKind::UnexpectedEof` when fewer than `n` bytes remain.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    let mut bytes = vec![0u8; n];
    r.read_exact(bytes.as_mut_slice()).map(|()| bytes)
}
1584 fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<Vec<u8>, IoError> {
1585 let length: u32 = endian.parse(read_bytes(r)?);
1586 read_vec(r, length as usize)