use crate::endian::{Endian, Parse, ToBytes};
-use crate::Error;
+use encoding_rs::mem::decode_latin1;
use flate2::read::ZlibDecoder;
use num::Integer;
+use std::borrow::Cow;
+use std::fmt::{Debug, Formatter, Result as FmtResult};
+use std::str::from_utf8;
use std::{
collections::VecDeque,
io::{Error as IoError, Read, Seek, SeekFrom},
iter::FusedIterator,
};
+use thiserror::Error as ThisError;
use self::state::State;
-#[derive(Copy, Clone, Debug)]
-pub enum Compression {
- Simple,
- ZLib,
+#[derive(ThisError, Debug)]
+pub enum Error {
+ #[error("Not an SPSS system file")]
+ NotASystemFile,
+
+ #[error("Invalid magic number {0:?}")]
+ BadMagic([u8; 4]),
+
+ #[error("I/O error ({0})")]
+ Io(#[from] IoError),
+
+ #[error("Invalid SAV compression code {0}")]
+ InvalidSavCompression(u32),
+
+ #[error("Invalid ZSAV compression code {0}")]
+ InvalidZsavCompression(u32),
+
+ #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
+ BadVariableWidth { offset: u64, width: i32 },
+
+ #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
+ BadDocumentLength { offset: u64, n: usize, max: usize },
+
+ #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
+ BadRecordType { offset: u64, rec_type: u32 },
+
+ #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
+ BadVariableLabelCode { offset: u64, code: u32 },
+
+ #[error(
+ "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
+ )]
+ BadNumericMissingValueCode { offset: u64, code: i32 },
+
+ #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
+ BadStringMissingValueCode { offset: u64, code: i32 },
+
+ #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
+ BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
+
+ #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
+ BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
+
+ #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
+ ExtensionRecordTooLarge {
+ offset: u64,
+ subtype: u32,
+ size: u32,
+ count: u32,
+ },
+
+ #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
+ EofInCase {
+ offset: u64,
+ case_ofs: u64,
+ case_len: usize,
+ },
+
+ #[error(
+ "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
+ )]
+ EofInCompressedCase { offset: u64, case_ofs: u64 },
+
+ #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
+ PartialCompressedCase { offset: u64, case_ofs: u64 },
+
+ #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
+ CompressedNumberExpected { offset: u64, case_ofs: u64 },
+
+ #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
+ CompressedStringExpected { offset: u64, case_ofs: u64 },
+
+ #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
+ BadZlibTrailerNBlocks {
+ offset: u64,
+ n_blocks: u32,
+ expected_n_blocks: u64,
+ ztrailer_len: u64,
+ },
+
+ #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
+ BadRecordSize {
+ offset: u64,
+ record: String,
+ size: u32,
+ expected_size: u32,
+ },
+
+ #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
+ BadRecordCount {
+ offset: u64,
+ record: String,
+ count: u32,
+ expected_count: u32,
+ },
+
+ #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
+ BadLongMissingValueLength {
+ record_offset: u64,
+ offset: u64,
+ value_len: u32,
+ },
+
+ #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
+ BadEncodingName { offset: u64 },
+
+ #[error("Details TBD")]
+ TBD,
}
+#[derive(Clone, Debug)]
pub enum Record {
- Header(Header),
- Document(Document),
- Variable(Variable),
- ValueLabel(ValueLabel),
- VarIndexes(VarIndexes),
- Extension(Extension),
+ Header(HeaderRecord),
+ Variable(VariableRecord),
+ ValueLabel(ValueLabelRecord),
+ VarIndexes(VarIndexRecord),
+ Document(DocumentRecord),
+ IntegerInfo(IntegerInfoRecord),
+ FloatInfo(FloatInfoRecord),
+ VariableSets(TextRecord),
+ VarDisplay(VarDisplayRecord),
+ MultipleResponse(MultipleResponseRecord),
+ LongStringValueLabels(LongStringValueLabelRecord),
+ Encoding(EncodingRecord),
+ NumberOfCases(NumberOfCasesRecord),
+ ProductInfo(TextRecord),
+ LongNames(TextRecord),
+ VeryLongStrings(TextRecord),
+ FileAttributes(TextRecord),
+ VariableAttributes(TextRecord),
+ OtherExtension(Extension),
EndOfHeaders(u32),
ZHeader(ZHeader),
ZTrailer(ZTrailer),
fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
let rec_type: u32 = endian.parse(read_bytes(reader)?);
match rec_type {
- 2 => Ok(Record::Variable(Variable::read(reader, endian)?)),
- 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)),
- 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)),
- 6 => Ok(Record::Document(Document::read(reader, endian)?)),
- 7 => Ok(Record::Extension(Extension::read(reader, endian)?)),
+ 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)),
+ 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)),
+ 4 => Ok(Record::VarIndexes(VarIndexRecord::read(reader, endian)?)),
+ 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)),
+ 7 => Ok(Extension::read(reader, endian)?),
999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
_ => Err(Error::BadRecordType {
offset: reader.stream_position()?,
}
}
-pub struct Header {
+/// Decodes `s` for display when the file's real encoding is not known.
+///
+/// Valid UTF-8 is borrowed as-is; anything else is decoded as Latin-1, i.e.
+/// each byte is mapped to the Unicode code point with the same value.
+fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> {
+    match from_utf8(s) {
+        Ok(text) => Cow::Borrowed(text),
+        Err(_) => decode_latin1(s),
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum Compression {
+ Simple,
+ ZLib,
+}
+
+#[derive(Clone)]
+pub struct HeaderRecord {
/// Magic number.
pub magic: Magic,
/// Eye-catcher string, product name, in the file's encoding. Padded
/// on the right with spaces.
- pub eye_catcher: [u8; 60],
+ pub eye_catcher: UnencodedStr<60>,
/// Layout code, normally either 2 or 3.
pub layout_code: u32,
/// Compression type, if any,
pub compression: Option<Compression>,
- /// 0-based variable index of the weight variable, or `None` if the file is
+ /// 1-based variable index of the weight variable, or `None` if the file is
/// unweighted.
pub weight_index: Option<u32>,
pub bias: f64,
/// `dd mmm yy` in the file's encoding.
- pub creation_date: [u8; 9],
+ pub creation_date: UnencodedStr<9>,
/// `HH:MM:SS` in the file's encoding.
- pub creation_time: [u8; 8],
+ pub creation_time: UnencodedStr<8>,
/// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: [u8; 64],
+ pub file_label: UnencodedStr<64>,
/// Endianness of the data in the file header.
pub endian: Endian,
}
-impl Header {
- fn read<R: Read>(r: &mut R) -> Result<Header, Error> {
+impl HeaderRecord {
+    /// Writes one `name: value` line to `f`, with `name` right-aligned in a
+    /// 17-character column and `value` rendered with its `Debug` form.
+    fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
+        writeln!(f, "{name:>17}: {:?}", value)
+    }
+}
+
+impl Debug for HeaderRecord {
+    /// Dumps the header as a title line followed by one labelled line per
+    /// field, in a fixed order.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        writeln!(f, "File header record:")?;
+
+        // Label/value pairs in output order.  Values are borrowed as trait
+        // objects so that fields of different types can share one loop.
+        let fields: [(&str, &dyn Debug); 12] = [
+            ("Magic", &self.magic),
+            ("Product name", &self.eye_catcher),
+            ("Layout code", &self.layout_code),
+            ("Nominal case size", &self.nominal_case_size),
+            ("Compression", &self.compression),
+            ("Weight index", &self.weight_index),
+            ("Number of cases", &self.n_cases),
+            ("Compression bias", &self.bias),
+            ("Creation date", &self.creation_date),
+            ("Creation time", &self.creation_time),
+            ("File label", &self.file_label),
+            ("Endianness", &self.endian),
+        ];
+        for &(name, value) in fields.iter() {
+            self.debug_field(f, name, value)?;
+        }
+        Ok(())
+    }
+}
+
+impl HeaderRecord {
+ fn read<R: Read>(r: &mut R) -> Result<HeaderRecord, Error> {
let magic: [u8; 4] = read_bytes(r)?;
let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
- let eye_catcher: [u8; 60] = read_bytes(r)?;
+ let eye_catcher = UnencodedStr::<60>(read_bytes(r)?);
let layout_code: [u8; 4] = read_bytes(r)?;
let endian = Endian::identify_u32(2, layout_code)
.or_else(|| Endian::identify_u32(2, layout_code))
};
let weight_index: u32 = endian.parse(read_bytes(r)?);
- let weight_index = (weight_index > 0).then_some(weight_index - 1);
+ let weight_index = (weight_index > 0).then_some(weight_index);
let n_cases: u32 = endian.parse(read_bytes(r)?);
let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
let bias: f64 = endian.parse(read_bytes(r)?);
- let creation_date: [u8; 9] = read_bytes(r)?;
- let creation_time: [u8; 8] = read_bytes(r)?;
- let file_label: [u8; 64] = read_bytes(r)?;
+ let creation_date = UnencodedStr::<9>(read_bytes(r)?);
+ let creation_time = UnencodedStr::<8>(read_bytes(r)?);
+ let file_label = UnencodedStr::<64>(read_bytes(r)?);
let _: [u8; 3] = read_bytes(r)?;
- Ok(Header {
+ Ok(HeaderRecord {
magic,
layout_code,
nominal_case_size,
pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
}
+impl Debug for Magic {
+    /// Shows the well-known magic numbers by name and any other value as its
+    /// raw byte array.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let name = match *self {
+            Magic::SAV => "$FL2",
+            Magic::ZSAV => "$FL3",
+            Magic::EBCDIC => "($FL2 in EBCDIC)",
+            _ => return write!(f, "{:?}", self.0),
+        };
+        f.write_str(name)
+    }
+}
+
impl TryFrom<[u8; 4]> for Magic {
type Error = Error;
}
}
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum VarType {
- Number,
+ Numeric,
String,
}
impl VarType {
fn from_width(width: i32) -> VarType {
match width {
- 0 => VarType::Number,
+ 0 => VarType::Numeric,
_ => VarType::String,
}
}
mod state {
use super::{
- Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer,
- ZlibDecodeMultiple,
+ Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader,
+ ZTrailer, ZlibDecodeMultiple,
};
use crate::endian::Endian;
use std::{
impl<R: Read + Seek + 'static> State for Start<R> {
fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let header = Header::read(&mut self.reader)?;
+ let header = HeaderRecord::read(&mut self.reader)?;
let next_state = Headers(CommonState {
reader: self.reader,
endian: header.endian,
fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
let record = Record::read(&mut self.0.reader, self.0.endian)?;
match record {
- Record::Variable(Variable { width, .. }) => {
+ Record::Variable(VariableRecord { width, .. }) => {
self.0.var_types.push(VarType::from_width(width));
}
Record::EndOfHeaders(_) => {
#[derive(Copy, Clone)]
pub enum Value {
Number(Option<f64>),
- String([u8; 8]),
+ String(UnencodedStr<8>),
+}
+
+impl Debug for Value {
+    /// Numbers print in their `Debug` form, the system-missing value prints as
+    /// `SYSMIS`, and strings print via the `Debug` form of their raw bytes.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        match self {
+            Value::String(bytes) => write!(f, "{:?}", bytes),
+            Value::Number(None) => f.write_str("SYSMIS"),
+            Value::Number(Some(number)) => write!(f, "{number:?}"),
+        }
+    }
}
impl Value {
- pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
+ fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
+ Ok(Self::from_raw(
+ UntypedValue(read_bytes(r)?),
+ var_type,
+ endian,
+ ))
+ }
+
+ pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value {
match var_type {
- VarType::String => Value::String(raw),
- VarType::Number => {
- let number: f64 = endian.parse(raw);
+ VarType::String => Value::String(UnencodedStr(raw.0)),
+ VarType::Numeric => {
+ let number: f64 = endian.parse(raw.0);
Value::Number((number != -f64::MAX).then_some(number))
}
}
});
}
};
- values.push(Value::from_raw(var_type, raw, endian));
+ values.push(Value::from_raw(UntypedValue(raw), var_type, endian));
}
Ok(Some(values))
}
match code {
0 => (),
1..=251 => match var_type {
- VarType::Number => break Value::Number(Some(code as f64 - bias)),
+ VarType::Numeric => break Value::Number(Some(code as f64 - bias)),
VarType::String => {
- break Value::String(endian.to_bytes(code as f64 - bias))
+ break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias)))
}
},
252 => {
});
}
}
- 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian),
+ 253 => {
+ break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian)
+ }
254 => match var_type {
- VarType::String => break Value::String(*b" "), // XXX EBCDIC
- VarType::Number => {
+ VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
+ VarType::Numeric => {
return Err(Error::CompressedStringExpected {
offset: case_start,
case_ofs: reader.stream_position()? - case_start,
}
},
255 => match var_type {
- VarType::Number => break Value::Number(None),
+ VarType::Numeric => break Value::Number(None),
VarType::String => {
return Err(Error::CompressedNumberExpected {
offset: case_start,
state: Some(state::new(reader)),
})
}
+ /// Reads records from this reader until the end-of-headers record, which is
+ /// consumed but not returned, or until the reader runs out of records.
+ ///
+ /// Returns the records read, in file order.  The first error produced by
+ /// the reader is returned immediately.
+ pub fn collect_headers(&mut self) -> Result<Vec<Record>, Error> {
+     let mut headers = Vec::new();
+     for record in self {
+         let record = record?;
+         if matches!(record, Record::EndOfHeaders(_)) {
+             break;
+         }
+         headers.push(record);
+     }
+     Ok(headers)
+ }
}
impl Iterator for Reader {
impl FusedIterator for Reader {}
-pub struct Variable {
+/// A raw print or write format specification as stored in a variable record.
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub struct Spec(pub u32);
+
+impl Debug for Spec {
+    /// Shows the raw value in hexadecimal followed by its decoded form: the
+    /// format name from the upper 16 bits, then the next byte as the width and
+    /// the low byte as the number of decimals.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        // Big-endian byte order puts the most significant byte first.
+        let [type_hi, type_lo, w, d] = self.0.to_be_bytes();
+        let type_ = format_name(u32::from(u16::from_be_bytes([type_hi, type_lo])));
+        write!(f, "{:06x} ({type_}{w}.{d})", self.0)
+    }
+}
+
+/// Returns the name of the format with numeric code `type_`, or a
+/// `<unknown format N>` placeholder for codes that have no name.
+fn format_name(type_: u32) -> Cow<'static, str> {
+    /// Known format codes and their names, in ascending code order.
+    const NAMES: [(u32, &str); 37] = [
+        (1, "A"),
+        (2, "AHEX"),
+        (3, "COMMA"),
+        (4, "DOLLAR"),
+        (5, "F"),
+        (6, "IB"),
+        (7, "PIBHEX"),
+        (8, "P"),
+        (9, "PIB"),
+        (10, "PK"),
+        (11, "RB"),
+        (12, "RBHEX"),
+        (15, "Z"),
+        (16, "N"),
+        (17, "E"),
+        (20, "DATE"),
+        (21, "TIME"),
+        (22, "DATETIME"),
+        (23, "ADATE"),
+        (24, "JDATE"),
+        (25, "DTIME"),
+        (26, "WKDAY"),
+        (27, "MONTH"),
+        (28, "MOYR"),
+        (29, "QYR"),
+        (30, "WKYR"),
+        (31, "PCT"),
+        (32, "DOT"),
+        (33, "CCA"),
+        (34, "CCB"),
+        (35, "CCC"),
+        (36, "CCD"),
+        (37, "CCE"),
+        (38, "EDATE"),
+        (39, "SDATE"),
+        (40, "MTIME"),
+        (41, "YMDHMS"),
+    ];
+
+    match NAMES.iter().find(|(code, _)| *code == type_) {
+        Some(&(_, name)) => Cow::Borrowed(name),
+        None => Cow::Owned(format!("<unknown format {type_}>")),
+    }
+}
+
+/// The user-missing values declared for one variable.
+#[derive(Clone)]
+pub struct MissingValues {
+    /// Individual missing values, up to 3 of them.
+    pub values: Vec<Value>,
+
+    /// Optional range of missing values.
+    pub range: Option<(Value, Value)>,
+}
+
+impl Debug for MissingValues {
+    /// Prints the discrete values and then the `low THRU high` range, all
+    /// separated by `", "`, or `none` when there is nothing to print.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let mut parts: Vec<String> = self
+            .values
+            .iter()
+            .map(|value| format!("{value:?}"))
+            .collect();
+        if let Some((low, high)) = &self.range {
+            parts.push(format!("{low:?} THRU {high:?}"));
+        }
+
+        if parts.is_empty() {
+            f.write_str("none")
+        } else {
+            f.write_str(&parts.join(", "))
+        }
+    }
+}
+
+impl MissingValues {
+    /// Returns true if there are neither discrete missing values nor a range.
+    fn is_empty(&self) -> bool {
+        self.range.is_none() && self.values.is_empty()
+    }
+
+    /// Reads the missing values that follow a variable record.
+    ///
+    /// `offset` is only used for error reporting.  `width` selects the value
+    /// type (0 is numeric, anything else is string) and `code` is the missing
+    /// value code from the record: 0 through 3 mean that many discrete values;
+    /// for numeric variables only, -2 means a range and -3 means one discrete
+    /// value followed by a range.
+    ///
+    /// # Errors
+    ///
+    /// Returns `BadNumericMissingValueCode` or `BadStringMissingValueCode` for
+    /// any other `code`, and propagates I/O errors from `r`.
+    fn read<R: Read + Seek>(
+        r: &mut R,
+        offset: u64,
+        width: i32,
+        code: i32,
+        endian: Endian,
+    ) -> Result<MissingValues, Error> {
+        let numeric = width == 0;
+        let (n_values, has_range) = match code {
+            0..=3 => (code, false),
+            -2 if numeric => (0, true),
+            -3 if numeric => (1, true),
+            _ if numeric => return Err(Error::BadNumericMissingValueCode { offset, code }),
+            _ => return Err(Error::BadStringMissingValueCode { offset, code }),
+        };
+
+        let var_type = VarType::from_width(width);
+
+        // Discrete values come first in the file, then the range if any.
+        let values = (0..n_values)
+            .map(|_| Value::read(r, var_type, endian))
+            .collect::<Result<Vec<_>, IoError>>()?;
+        let range = has_range
+            .then(|| -> Result<(Value, Value), IoError> {
+                let low = Value::read(r, var_type, endian)?;
+                let high = Value::read(r, var_type, endian)?;
+                Ok((low, high))
+            })
+            .transpose()?;
+        Ok(MissingValues { values, range })
+    }
+}
+
+#[derive(Clone)]
+pub struct VariableRecord {
/// Offset from the start of the file to the start of the record.
pub offset: u64,
pub width: i32,
/// Variable name, padded on the right with spaces.
- pub name: [u8; 8],
+ pub name: UnencodedStr<8>,
/// Print format.
- pub print_format: u32,
+ pub print_format: Spec,
/// Write format.
- pub write_format: u32,
+ pub write_format: Spec,
- /// Missing value code, one of -3, -2, 0, 1, 2, or 3.
- pub missing_value_code: i32,
-
- /// Raw missing values, up to 3 of them.
- pub missing: Vec<[u8; 8]>,
+ /// Missing values.
+ pub missing_values: MissingValues,
/// Optional variable label.
- pub label: Option<Vec<u8>>,
+ pub label: Option<UnencodedString>,
+}
+
+impl Debug for VariableRecord {
+    /// Dumps the record one field per line, annotating the width with the
+    /// kind of record it implies.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let kind = match self.width {
+            0 => "numeric",
+            width if width > 0 => "string",
+            _ => "long string continuation record",
+        };
+        writeln!(f, "Width: {} ({})", self.width, kind)?;
+        writeln!(f, "Print format: {:?}", self.print_format)?;
+        writeln!(f, "Write format: {:?}", self.write_format)?;
+        writeln!(f, "Name: {:?}", &self.name)?;
+        writeln!(f, "Variable label: {:?}", self.label)?;
+        writeln!(f, "Missing values: {:?}", self.missing_values)
+    }
}
-impl Variable {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
+impl VariableRecord {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VariableRecord, Error> {
let offset = r.stream_position()?;
let width: i32 = endian.parse(read_bytes(r)?);
let has_variable_label: u32 = endian.parse(read_bytes(r)?);
let missing_value_code: i32 = endian.parse(read_bytes(r)?);
- let print_format: u32 = endian.parse(read_bytes(r)?);
- let write_format: u32 = endian.parse(read_bytes(r)?);
- let name: [u8; 8] = read_bytes(r)?;
+ let print_format = Spec(endian.parse(read_bytes(r)?));
+ let write_format = Spec(endian.parse(read_bytes(r)?));
+ let name = UnencodedStr::<8>(read_bytes(r)?);
let label = match has_variable_label {
0 => None,
1 => {
let len: u32 = endian.parse(read_bytes(r)?);
let read_len = len.min(65535) as usize;
- let label = Some(read_vec(r, read_len)?);
+ let label = UnencodedString(read_vec(r, read_len)?);
let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
let _ = read_vec(r, padding_bytes as usize)?;
- label
+ Some(label)
}
_ => {
return Err(Error::BadVariableLabelCode {
}
};
- let mut missing = Vec::new();
- if missing_value_code != 0 {
- match (width, missing_value_code) {
- (0, -3 | -2 | 1 | 2 | 3) => (),
- (0, _) => {
- return Err(Error::BadNumericMissingValueCode {
- offset,
- code: missing_value_code,
- })
- }
- (_, 0..=3) => (),
- (_, _) => {
- return Err(Error::BadStringMissingValueCode {
- offset,
- code: missing_value_code,
- })
- }
- }
-
- for _ in 0..missing_value_code.abs() {
- missing.push(read_bytes(r)?);
- }
- }
+ let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
- Ok(Variable {
+ Ok(VariableRecord {
offset,
width,
name,
print_format,
write_format,
- missing_value_code,
- missing,
+ missing_values,
label,
})
}
}
-pub struct ValueLabel {
+/// Eight raw bytes of a value whose type (number or string) is not known.
+#[derive(Copy, Clone)]
+pub struct UntypedValue(pub [u8; 8]);
+
+impl Debug for UntypedValue {
+    /// Prints a best guess at the number followed by a best guess at the
+    /// string.
+    ///
+    /// The bytes are parsed as a float in both byte orders and whichever one
+    /// prints shorter is shown (little-endian on a tie).  The string guess is
+    /// the decoded text up to the first control character.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let render = |endian: Endian| -> String {
+            let value: f64 = endian.parse(self.0);
+            format!("{:?}", value)
+        };
+        let little = render(Endian::Little);
+        let big = render(Endian::Big);
+        let number = if big.len() < little.len() { big } else { little };
+
+        // NUL is itself a control character, so one predicate covers both.
+        let decoded = default_decode(&self.0);
+        let string = decoded.split(char::is_control).next().unwrap_or("");
+        write!(f, "{number}{string:?}")
+    }
+}
+
+/// A variable-length byte string whose character encoding is not yet known.
+#[derive(Clone)]
+pub struct UnencodedString(pub Vec<u8>);
+
+impl From<Vec<u8>> for UnencodedString {
+    /// Wraps `source` without copying it.
+    fn from(source: Vec<u8>) -> Self {
+        UnencodedString(source)
+    }
+}
+
+impl From<&[u8]> for UnencodedString {
+    /// Copies `source` into a new owned string.
+    fn from(source: &[u8]) -> Self {
+        UnencodedString(source.to_vec())
+    }
+}
+
+impl Debug for UnencodedString {
+    /// Prints the bytes as a quoted string, decoded with `default_decode`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let text = default_decode(&self.0);
+        write!(f, "{:?}", text)
+    }
+}
+
+/// A fixed-length, `N`-byte string whose character encoding is not yet known.
+#[derive(Copy, Clone)]
+pub struct UnencodedStr<const N: usize>(pub [u8; N]);
+
+impl<const N: usize> From<[u8; N]> for UnencodedStr<N> {
+    /// Wraps the byte array `source`.
+    fn from(source: [u8; N]) -> Self {
+        UnencodedStr(source)
+    }
+}
+
+impl<const N: usize> Debug for UnencodedStr<N> {
+    /// Prints the bytes as a quoted string, decoded with `default_decode`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let text = default_decode(&self.0[..]);
+        write!(f, "{:?}", text)
+    }
+}
+
+#[derive(Clone)]
+pub struct ValueLabelRecord {
/// Offset from the start of the file to the start of the record.
pub offset: u64,
/// The labels.
- pub labels: Vec<([u8; 8], Vec<u8>)>,
+ pub labels: Vec<(UntypedValue, UnencodedString)>,
+}
+
+impl Debug for ValueLabelRecord {
+    /// Prints one `value: label` line per label, in record order.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        self.labels
+            .iter()
+            .try_for_each(|(value, label)| writeln!(f, "{value:?}: {label:?}"))
+    }
}
-impl ValueLabel {
+impl ValueLabelRecord {
/// Maximum number of value labels in a record.
pub const MAX: u32 = u32::MAX / 8;
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabelRecord, Error> {
let offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
- if n > ValueLabel::MAX {
+ if n > ValueLabelRecord::MAX {
return Err(Error::BadNumberOfValueLabels {
offset,
n,
- max: ValueLabel::MAX,
+ max: ValueLabelRecord::MAX,
});
}
let mut labels = Vec::new();
for _ in 0..n {
- let value: [u8; 8] = read_bytes(r)?;
+ let value = UntypedValue(read_bytes(r)?);
let label_len: u8 = endian.parse(read_bytes(r)?);
let label_len = label_len as usize;
let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
- let mut label = read_vec(r, padded_len)?;
+ let mut label = read_vec(r, padded_len - 1)?;
label.truncate(label_len);
- labels.push((value, label));
+ labels.push((value, UnencodedString(label)));
}
- Ok(ValueLabel { offset, labels })
+ Ok(ValueLabelRecord { offset, labels })
}
}
-pub struct VarIndexes {
+#[derive(Clone)]
+pub struct VarIndexRecord {
/// Offset from the start of the file to the start of the record.
pub offset: u64,
- /// The 0-based indexes of the variable indexes.
- pub var_indexes: Vec<u32>,
+ /// The 1-based indexes of the variable indexes.
+ pub dict_indexes: Vec<u32>,
+}
+
+impl Debug for VarIndexRecord {
+    /// Prints `apply to variables` followed by ` #N` for each index.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        f.write_str("apply to variables")?;
+        self.dict_indexes
+            .iter()
+            .try_for_each(|dict_index| write!(f, " #{dict_index}"))
+    }
}
-impl VarIndexes {
+impl VarIndexRecord {
/// Maximum number of variable indexes in a record.
pub const MAX: u32 = u32::MAX / 8;
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexRecord, Error> {
let offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
- if n > VarIndexes::MAX {
+ if n > VarIndexRecord::MAX {
return Err(Error::BadNumberOfVarIndexes {
offset,
n,
- max: VarIndexes::MAX,
+ max: VarIndexRecord::MAX,
});
}
- let mut var_indexes = Vec::with_capacity(n as usize);
+ let mut dict_indexes = Vec::with_capacity(n as usize);
for _ in 0..n {
- var_indexes.push(endian.parse(read_bytes(r)?));
+ dict_indexes.push(endian.parse(read_bytes(r)?));
}
- Ok(VarIndexes {
+ Ok(VarIndexRecord {
offset,
- var_indexes,
+ dict_indexes,
})
}
}
-pub struct Document {
+#[derive(Clone, Debug)]
+pub struct DocumentRecord {
/// Offset from the start of the file to the start of the record.
pub pos: u64,
/// The document, as an array of 80-byte lines.
- pub lines: Vec<[u8; DOC_LINE_LEN as usize]>,
+ pub lines: Vec<DocumentLine>,
}
-impl Document {
+pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>;
+
+impl DocumentRecord {
/// Length of a line in a document. Document lines are fixed-length and
/// padded on the right with spaces.
- pub const LINE_LEN: u32 = 80;
+ pub const LINE_LEN: usize = 80;
/// Maximum number of lines we will accept in a document. This is simply
/// the maximum number that will fit in a 32-bit space.
- pub const MAX_LINES: u32 = i32::MAX as u32 / DOC_LINE_LEN;
+ pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN;
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<DocumentRecord, Error> {
let offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
- match n {
- 0..=DOC_MAX_LINES => {
- let pos = r.stream_position()?;
- let mut lines = Vec::with_capacity(n as usize);
- for _ in 0..n {
- let line: [u8; 80] = read_bytes(r)?;
- lines.push(line);
- }
- Ok(Document { pos, lines })
- }
- _ => Err(Error::BadDocumentLength {
+ let n = n as usize;
+ if n > Self::MAX_LINES {
+ Err(Error::BadDocumentLength {
offset,
n,
- max: DOC_MAX_LINES,
- }),
+ max: Self::MAX_LINES,
+ })
+ } else {
+ let pos = r.stream_position()?;
+ let mut lines = Vec::with_capacity(n);
+ for _ in 0..n {
+ lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
+ }
+ Ok(DocumentRecord { pos, lines })
}
}
}
-/*
-#[derive(FromPrimitive)]
-enum ExtensionType {
- /// Machine integer info.
- Integer = 3,
- /// Machine floating-point info.
- Float = 4,
- /// Variable sets.
- VarSets = 5,
- /// DATE.
- Date = 6,
- /// Multiple response sets.
- Mrsets = 7,
- /// SPSS Data Entry.
- DataEntry = 8,
- /// Extra product info text.
- ProductInfo = 10,
- /// Variable display parameters.
- Display = 11,
- /// Long variable names.
- LongNames = 13,
- /// Long strings.
- LongStrings = 14,
- /// Extended number of cases.
- Ncases = 16,
- /// Data file attributes.
- FileAttrs = 17,
- /// Variable attributes.
- VarAttrs = 18,
- /// Multiple response sets (extended).
- Mrsets2 = 19,
- /// Character encoding.
- Encoding = 20,
- /// Value labels for long strings.
- LongLabels = 21,
- /// Missing values for long strings.
- LongMissing = 22,
- /// "Format properties in dataview table".
- Dataview = 24,
-}
- */
+/// A typed view of one subtype of extension record.
+trait ExtensionRecord: Sized {
+    /// Extension record subtype number that this type decodes.
+    const SUBTYPE: u32;
+    /// Required element size, if the subtype fixes one.
+    const SIZE: Option<u32>;
+    /// Required element count, if the subtype fixes one.
+    const COUNT: Option<u32>;
+    /// Human-readable record name, for use in messages.
+    const NAME: &'static str;
+    /// Decodes `ext` using byte order `endian`; `warn` receives a callback
+    /// for errors (presumably non-fatal ones — confirm against callers).
+    fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
+}
+
+/// Contents of the integer info extension record: eight consecutive 32-bit
+/// integers.
+#[derive(Clone, Debug)]
+pub struct IntegerInfoRecord {
+    /// Integers 0 through 2 of the record.
+    pub version: (i32, i32, i32),
+    /// Integer 3 of the record.
+    pub machine_code: i32,
+    /// Integer 4 of the record.
+    pub floating_point_rep: i32,
+    /// Integer 5 of the record.
+    pub compression_code: i32,
+    /// Integer 6 of the record.
+    pub endianness: i32,
+    /// Integer 7 of the record.
+    pub character_code: i32,
+}
+
+impl ExtensionRecord for IntegerInfoRecord {
+    const SUBTYPE: u32 = 3;
+    const SIZE: Option<u32> = Some(4);
+    const COUNT: Option<u32> = Some(8);
+    const NAME: &'static str = "integer record";
+
+    /// Decodes the record after checking its size and count.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `ext.data` holds fewer than eight 4-byte integers; the
+    /// `check_size` call is expected to have ruled that out.
+    fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        let mut next = || -> i32 { endian.parse(read_bytes(&mut input).unwrap()) };
+        // Field initializers run in source order, which is the file order.
+        Ok(IntegerInfoRecord {
+            version: (next(), next(), next()),
+            machine_code: next(),
+            floating_point_rep: next(),
+            compression_code: next(),
+            endianness: next(),
+            character_code: next(),
+        })
+    }
+}
+
+/// Contents of the floating point info extension record: three consecutive
+/// 64-bit floats.
+#[derive(Clone, Debug)]
+pub struct FloatInfoRecord {
+    /// Float 0 of the record.
+    pub sysmis: f64,
+    /// Float 1 of the record.
+    pub highest: f64,
+    /// Float 2 of the record.
+    pub lowest: f64,
+}
+
+impl ExtensionRecord for FloatInfoRecord {
+    const SUBTYPE: u32 = 4;
+    const SIZE: Option<u32> = Some(8);
+    const COUNT: Option<u32> = Some(3);
+    const NAME: &'static str = "floating point record";
+
+    /// Decodes the record after checking its size and count.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `ext.data` holds fewer than three 8-byte floats; the
+    /// `check_size` call is expected to have ruled that out.
+    fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        let mut next = || -> f64 { endian.parse(read_bytes(&mut input).unwrap()) };
+        // Field initializers run in source order, which is the file order.
+        Ok(FloatInfoRecord {
+            sysmis: next(),
+            highest: next(),
+            lowest: next(),
+        })
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum CategoryLabels {
+ VarLabels,
+ CountedValues,
+}
+
+/// The kind of a multiple response set.
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+    /// Multiple dichotomy set, with its counted value and the source of its
+    /// category labels.
+    MultipleDichotomy {
+        value: UnencodedString,
+        labels: CategoryLabels,
+    },
+    /// Multiple category set.
+    MultipleCategory,
+}
+
+impl MultipleResponseType {
+    /// Parses the type portion of a multiple response set from the start of
+    /// `input` and returns it along with the unparsed remainder.
+    ///
+    /// Accepted forms:
+    ///
+    /// - `C`: multiple category set.
+    /// - `D` directly followed by a counted string: multiple dichotomy set
+    ///   with that counted value.
+    /// - `E`, a space, then `1` or `11`, a space, then a counted string:
+    ///   multiple dichotomy set; `1` selects `CategoryLabels::CountedValues`
+    ///   and `11` selects `CategoryLabels::VarLabels`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::TBD` if `input` matches none of these forms.
+    fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
+        let (mr_type, input) = match input.first() {
+            Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
+            Some(b'D') => {
+                let (value, input) = parse_counted_string(&input[1..])?;
+                (
+                    MultipleResponseType::MultipleDichotomy {
+                        value,
+                        labels: CategoryLabels::VarLabels,
+                    },
+                    input,
+                )
+            }
+            Some(b'E') => {
+                let Some(input) = input[1..].strip_prefix(b" ") else {
+                    return Err(Error::TBD);
+                };
+                // The space after `E` is already consumed, so the flag must be
+                // matched without a leading space.  Requiring one here would
+                // reject every well-formed `E 1 ...` or `E 11 ...` set.
+                let (labels, input) = if let Some(rest) = input.strip_prefix(b"1 ") {
+                    (CategoryLabels::CountedValues, rest)
+                } else if let Some(rest) = input.strip_prefix(b"11 ") {
+                    (CategoryLabels::VarLabels, rest)
+                } else {
+                    return Err(Error::TBD);
+                };
+                let (value, input) = parse_counted_string(input)?;
+                (
+                    MultipleResponseType::MultipleDichotomy { value, labels },
+                    input,
+                )
+            }
+            _ => return Err(Error::TBD),
+        };
+        Ok((mr_type, input))
+    }
+}
+
+/// One multiple response set from a multiple response set record.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+    /// Name of the set: the text before the `=`.
+    pub name: UnencodedString,
+    /// Label of the set.
+    pub label: UnencodedString,
+    /// Type of the set.
+    pub mr_type: MultipleResponseType,
+    /// Space-separated names listed after the label.
+    pub short_names: Vec<UnencodedString>,
+}
+
+impl MultipleResponseSet {
+    /// Parses one set from the start of `input` and returns it along with the
+    /// unparsed remainder.
+    ///
+    /// A set has the form `NAME=TYPE LABEL VAR VAR...` terminated by one or
+    /// more newlines, where `TYPE` is parsed by `MultipleResponseType::parse`
+    /// and `LABEL` is a counted string.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::TBD` if `input` does not have this form.
+    fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
+        let Some(equals) = input.iter().position(|&b| b == b'=') else {
+            return Err(Error::TBD);
+        };
+        let (name, rest) = input.split_at(equals);
+        // `rest` still begins with the `=` located above.  Step over it so
+        // that the type parser sees the type letter, not the delimiter.
+        let (mr_type, input) = MultipleResponseType::parse(&rest[1..])?;
+        let Some(input) = input.strip_prefix(b" ") else {
+            return Err(Error::TBD);
+        };
+        let (label, mut input) = parse_counted_string(input)?;
+
+        // Each name is preceded by a space and must be followed by a space or
+        // a newline.  Empty names, from doubled spaces, are skipped.
+        let mut vars = Vec::new();
+        while let Some(rest) = input.strip_prefix(b" ") {
+            let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
+                return Err(Error::TBD);
+            };
+            if length > 0 {
+                vars.push(UnencodedString::from(&rest[..length]));
+            }
+            input = &rest[length..];
+        }
+
+        // At least one newline must end the set; any further ones are skipped.
+        if input.first() != Some(&b'\n') {
+            return Err(Error::TBD);
+        }
+        while let Some(rest) = input.strip_prefix(b"\n") {
+            input = rest;
+        }
+        Ok((
+            MultipleResponseSet {
+                name: name.into(),
+                label,
+                mr_type,
+                short_names: vars,
+            },
+            input,
+        ))
+    }
+}
+
+/// All the multiple response sets in one extension record, in file order.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
+
+impl ExtensionRecord for MultipleResponseRecord {
+    const SUBTYPE: u32 = 7;
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "multiple response set record";
+
+    /// Parses sets from the record data one after another until the data is
+    /// used up.  The first set that fails to parse aborts the whole record.
+    fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+
+        let mut sets = Vec::new();
+        let mut input = &ext.data[..];
+        while !input.is_empty() {
+            let set;
+            (set, input) = MultipleResponseSet::parse(input)?;
+            sets.push(set);
+        }
+        Ok(MultipleResponseRecord(sets))
+    }
+}
+
+/// Parses a counted string from the start of `input`: a decimal byte count, a
+/// single space, and then that many bytes of string.
+///
+/// Returns the string and the input that follows it.
+///
+/// # Errors
+///
+/// Returns `Error::TBD` if there is no space, if the text before it is not a
+/// valid count, or if fewer bytes than the count follow the space.
+fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
+    let space = input
+        .iter()
+        .position(|&b| b == b' ')
+        .ok_or(Error::TBD)?;
+    let length: usize = from_utf8(&input[..space])
+        .map_err(|_| Error::TBD)?
+        .parse()
+        .map_err(|_| Error::TBD)?;
+
+    let input = &input[space + 1..];
+    if length > input.len() {
+        return Err(Error::TBD);
+    }
+
+    let (string, rest) = input.split_at(length);
+    Ok((string.into(), rest))
+}
+
+/// Raw 32-bit values from a variable display record, in file order.
+#[derive(Clone, Debug)]
+pub struct VarDisplayRecord(pub Vec<u32>);
+
+impl ExtensionRecord for VarDisplayRecord {
+    const SUBTYPE: u32 = 11;
+    const SIZE: Option<u32> = Some(4);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "variable display record";
+
+    /// Parses `ext` as `ext.count` consecutive 32-bit integers in the given
+    /// byte order.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the record's element size is not 4, or if the record's data
+    /// is shorter than `ext.count` elements.  The latter is reported as an
+    /// error rather than a panic, matching the other extension parsers.
+    fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        let display = (0..ext.count)
+            .map(|_| -> Result<u32, Error> { Ok(endian.parse(read_bytes(&mut input)?)) })
+            .collect::<Result<Vec<_>, _>>()?;
+        Ok(VarDisplayRecord(display))
+    }
+}
+/// Missing values for one variable, as listed in a long string missing
+/// values record.
+pub struct LongStringMissingValues {
+ /// Variable name.
+ pub var_name: UnencodedString,
+
+ /// Missing values.
+ pub missing_values: MissingValues,
+}
+
+/// Parsed contents of a long string missing values record: one entry per
+/// variable, in the order they appear in the record.
+pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
+
+impl ExtensionRecord for LongStringMissingValueSet {
+    const SUBTYPE: u32 = 22;
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "long string missing values record";
+
+    /// Parses `ext` as a sequence of per-variable entries, each consisting of
+    /// a length-prefixed variable name, a one-byte count of missing values, a
+    /// 32-bit value length (which must be 8), and then that many 8-byte
+    /// string values.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the record's element size is wrong, if the data ends in the
+    /// middle of an entry, or with `Error::BadLongMissingValueLength` if the
+    /// value length is not 8.
+    fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        let mut missing_value_set = Vec::new();
+        while !input.is_empty() {
+            let var_name = read_string(&mut input, endian)?;
+            let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
+            let value_len: u32 = endian.parse(read_bytes(&mut input)?);
+            if value_len != 8 {
+                // The 4-byte `value_len` field is the last thing consumed, so
+                // it starts 4 bytes before the current position.
+                let offset = (ext.data.len() - input.len() - 4) as u64 + ext.offset;
+                return Err(Error::BadLongMissingValueLength {
+                    record_offset: ext.offset,
+                    offset,
+                    value_len,
+                });
+            }
+            let mut values = Vec::with_capacity(usize::from(n_missing_values));
+            for i in 0..n_missing_values {
+                if i > 0 {
+                    // Tolerate files written by old, buggy versions of PSPP
+                    // where we believed that the value_length was repeated
+                    // before each missing value.  The repeated length is a
+                    // 4-byte integer like `value_len` itself, so peek at 4
+                    // bytes and skip them only if they hold the value 8.
+                    let mut peek = input;
+                    let repeated_len: u32 = endian.parse(read_bytes(&mut peek)?);
+                    if repeated_len == 8 {
+                        input = peek;
+                    }
+                }
+                let value: [u8; 8] = read_bytes(&mut input)?;
+                values.push(Value::String(UnencodedStr(value)));
+            }
+            let missing_values = MissingValues {
+                values,
+                range: None,
+            };
+            missing_value_set.push(LongStringMissingValues {
+                var_name,
+                missing_values,
+            });
+        }
+        Ok(LongStringMissingValueSet(missing_value_set))
+    }
+}
+
+/// The character encoding name carried by an encoding record.
+#[derive(Clone, Debug)]
+pub struct EncodingRecord(pub String);
+
+impl ExtensionRecord for EncodingRecord {
+    const SUBTYPE: u32 = 20;
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "encoding record";
+
+    /// Interprets the whole of `ext`'s data as a UTF-8 encoding name.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the record's element size is wrong, or with
+    /// `Error::BadEncodingName` if the data is not valid UTF-8.
+    fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+
+        let name =
+            from_utf8(&ext.data).map_err(|_| Error::BadEncodingName { offset: ext.offset })?;
+        Ok(EncodingRecord(name.to_owned()))
+    }
+}
+
+/// Contents of an extended number of cases record.
+#[derive(Clone, Debug)]
+pub struct NumberOfCasesRecord {
+    /// Always observed as 1.
+    pub one: u64,
+
+    /// Number of cases.
+    pub n_cases: u64,
+}
+
+impl ExtensionRecord for NumberOfCasesRecord {
+    const SUBTYPE: u32 = 16;
+    const SIZE: Option<u32> = Some(8);
+    const COUNT: Option<u32> = Some(2);
+    const NAME: &'static str = "extended number of cases record";
+
+    /// Reads the record's two 64-bit fields, in file order, using the given
+    /// byte order.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the record does not consist of exactly two 8-byte elements.
+    fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+
+        let mut input = ext.data.as_slice();
+        // Field initialisers run in the order written, so `one` is read
+        // before `n_cases`.
+        Ok(NumberOfCasesRecord {
+            one: endian.parse(read_bytes(&mut input)?),
+            n_cases: endian.parse(read_bytes(&mut input)?),
+        })
+    }
+}
+
+/// An extension record whose payload is kept as uninterpreted text.
+#[derive(Clone, Debug)]
+pub struct TextRecord {
+    /// Offset from the start of the file to the start of the record.
+    pub offset: u64,
+
+    /// The text content of the record.
+    pub text: UnencodedString,
+}
+
+impl From<Extension> for TextRecord {
+    /// Takes over the extension's offset and raw data, discarding the rest.
+    fn from(source: Extension) -> Self {
+        let Extension { offset, data, .. } = source;
+        TextRecord {
+            offset,
+            text: data.into(),
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
pub struct Extension {
/// Offset from the start of the file to the start of the record.
pub offset: u64,
pub data: Vec<u8>,
}
-/*
-fn extension_record_size_requirements(extension: ExtensionType) -> (u32, u32) {
- match extension {
- /* Implemented record types. */
- ExtensionType::Integer => (4, 8),
- ExtensionType::Float => (8, 3),
- ExtensionType::VarSets => (1, 0),
- ExtensionType::Mrsets => (1, 0),
- ExtensionType::ProductInfo => (1, 0),
- ExtensionType::Display => (4, 0),
- ExtensionType::LongNames => (1, 0),
- ExtensionType::LongStrings => (1, 0),
- ExtensionType::Ncases => (8, 2),
- ExtensionType::FileAttrs => (1, 0),
- ExtensionType::VarAttrs => (1, 0),
- ExtensionType::Mrsets2 => (1, 0),
- ExtensionType::Encoding => (1, 0),
- ExtensionType::LongLabels => (1, 0),
- ExtensionType::LongMissing => (1, 0),
-
- /* Ignored record types. */
- ExtensionType::Date => (0, 0),
- ExtensionType::DataEntry => (0, 0),
- ExtensionType::Dataview => (0, 0),
- }
-}
- */
-
impl Extension {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Extension, Error> {
+ /// Verifies that this extension's element size and element count match
+ /// whatever record type `E` requires.
+ ///
+ /// A requirement of `None` for either `E::SIZE` or `E::COUNT` means that
+ /// any value is accepted.  The size is checked before the count.
+ ///
+ /// # Errors
+ ///
+ /// Returns `Error::BadRecordSize` or `Error::BadRecordCount` on mismatch.
+ fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
+     match E::SIZE {
+         Some(expected_size) if expected_size != self.size => {
+             return Err(Error::BadRecordSize {
+                 offset: self.offset,
+                 record: E::NAME.into(),
+                 size: self.size,
+                 expected_size,
+             });
+         }
+         _ => {}
+     }
+     match E::COUNT {
+         Some(expected_count) if expected_count != self.count => {
+             Err(Error::BadRecordCount {
+                 offset: self.offset,
+                 record: E::NAME.into(),
+                 count: self.count,
+                 expected_count,
+             })
+         }
+         _ => Ok(()),
+     }
+ }
+
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
let subtype = endian.parse(read_bytes(r)?);
let offset = r.stream_position()?;
let size: u32 = endian.parse(read_bytes(r)?);
};
let offset = r.stream_position()?;
let data = read_vec(r, product as usize)?;
- Ok(Extension {
+ let extension = Extension {
offset,
subtype,
size,
count,
data,
- })
+ };
+ match subtype {
+ IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse(
+ MultipleResponseRecord::parse(&extension, endian, |_| ())?,
+ )),
+ LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels(
+ LongStringValueLabelRecord::parse(&extension, endian, |_| ())?,
+ )),
+ EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ 5 => Ok(Record::VariableSets(extension.into())),
+ 10 => Ok(Record::ProductInfo(extension.into())),
+ 13 => Ok(Record::LongNames(extension.into())),
+ 14 => Ok(Record::VeryLongStrings(extension.into())),
+ 17 => Ok(Record::FileAttributes(extension.into())),
+ 18 => Ok(Record::VariableAttributes(extension.into())),
+ _ => Ok(Record::OtherExtension(extension)),
+ }
}
}
+#[derive(Clone, Debug)]
pub struct ZHeader {
/// File offset to the start of the record.
pub offset: u64,
}
}
+#[derive(Clone, Debug)]
pub struct ZTrailer {
/// File offset to the start of the record.
pub offset: u64,
pub blocks: Vec<ZBlock>,
}
+#[derive(Clone, Debug)]
pub struct ZBlock {
/// Offset of block of data if simple compression were used.
pub uncompressed_ofs: u64,
pub compressed_size: u32,
}
+impl ZBlock {
+    /// Reads one block descriptor from `r` in the given byte order.
+    ///
+    /// The four fields are read in this order: uncompressed offset,
+    /// compressed offset, uncompressed size, compressed size.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any I/O error from the underlying reader.
+    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
+        let uncompressed_ofs = endian.parse(read_bytes(r)?);
+        let compressed_ofs = endian.parse(read_bytes(r)?);
+        let uncompressed_size = endian.parse(read_bytes(r)?);
+        let compressed_size = endian.parse(read_bytes(r)?);
+        Ok(ZBlock {
+            uncompressed_ofs,
+            compressed_ofs,
+            uncompressed_size,
+            compressed_size,
+        })
+    }
+}
+
impl ZTrailer {
fn read<R: Read + Seek>(
- r: &mut R,
+ reader: &mut R,
endian: Endian,
ztrailer_ofs: u64,
ztrailer_len: u64,
) -> Result<Option<ZTrailer>, Error> {
- let start_offset = r.stream_position()?;
- if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
+ let start_offset = reader.stream_position()?;
+ if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
return Ok(None);
}
- let int_bias = endian.parse(read_bytes(r)?);
- let zero = endian.parse(read_bytes(r)?);
- let block_size = endian.parse(read_bytes(r)?);
- let n_blocks: u32 = endian.parse(read_bytes(r)?);
+ let int_bias = endian.parse(read_bytes(reader)?);
+ let zero = endian.parse(read_bytes(reader)?);
+ let block_size = endian.parse(read_bytes(reader)?);
+ let n_blocks: u32 = endian.parse(read_bytes(reader)?);
let expected_n_blocks = (ztrailer_len - 24) / 24;
if n_blocks as u64 != expected_n_blocks {
return Err(Error::BadZlibTrailerNBlocks {
ztrailer_len,
});
}
- let mut blocks = Vec::with_capacity(n_blocks as usize);
- for _ in 0..n_blocks {
- let uncompressed_ofs = endian.parse(read_bytes(r)?);
- let compressed_ofs = endian.parse(read_bytes(r)?);
- let uncompressed_size = endian.parse(read_bytes(r)?);
- let compressed_size = endian.parse(read_bytes(r)?);
- blocks.push(ZBlock {
- uncompressed_ofs,
- compressed_ofs,
- uncompressed_size,
- compressed_size,
- });
- }
- r.seek(SeekFrom::Start(start_offset))?;
+ let blocks = (0..n_blocks)
+ .map(|_| ZBlock::read(reader, endian))
+ .collect::<Result<Vec<_>, _>>()?;
+ reader.seek(SeekFrom::Start(start_offset))?;
Ok(Some(ZTrailer {
offset: ztrailer_ofs,
int_bias,
r.read_exact(&mut vec)?;
Ok(vec)
}
+
+/// Reads a length-prefixed string from `r`: a 32-bit length in the given byte
+/// order, followed by that many bytes of string data.
+///
+/// # Errors
+///
+/// Propagates any I/O error from reading the length or the data.
+fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
+    let length: u32 = endian.parse(read_bytes(r)?);
+    let bytes = read_vec(r, length as usize)?;
+    Ok(bytes.into())
+}
+
+/// Value labels for one variable, as listed in a long string value labels
+/// record.
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabels {
+ /// Variable name, as read from the record.
+ pub var_name: UnencodedString,
+ /// Width field read from the record for this variable.
+ pub width: u32,
+
+ /// `(value, label)` pairs, where each value is `width` bytes.
+ pub labels: Vec<(UnencodedString, UnencodedString)>,
+}
+
+/// All per-variable label sets found in one long string value labels record.
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
+
+impl ExtensionRecord for LongStringValueLabelRecord {
+    const SUBTYPE: u32 = 21;
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "long string value labels record";
+
+    /// Parses `ext` as a sequence of per-variable entries, each consisting of
+    /// a length-prefixed variable name, a 32-bit width, a 32-bit label count,
+    /// and then that many length-prefixed `(value, label)` pairs.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the record's element size is wrong or if the data ends in the
+    /// middle of an entry.
+    fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+        ext.check_size::<Self>()?;
+
+        let mut input = ext.data.as_slice();
+        let mut label_set = Vec::new();
+        while !input.is_empty() {
+            let var_name = read_string(&mut input, endian)?;
+            let width: u32 = endian.parse(read_bytes(&mut input)?);
+            let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
+
+            // Tuple elements are evaluated left to right, so each value is
+            // read before its label.
+            let labels = (0..n_labels)
+                .map(|_| -> Result<_, Error> {
+                    Ok((
+                        read_string(&mut input, endian)?,
+                        read_string(&mut input, endian)?,
+                    ))
+                })
+                .collect::<Result<Vec<_>, _>>()?;
+
+            label_set.push(LongStringValueLabels {
+                var_name,
+                width,
+                labels,
+            });
+        }
+        Ok(LongStringValueLabelRecord(label_set))
+    }
+}