use crate::endian::{Endian, Parse, ToBytes};
-use crate::{CategoryLabels, Compression};
use encoding_rs::mem::decode_latin1;
use flate2::read::ZlibDecoder;
#[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
+ #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
+ ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
+
#[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
},
#[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
- BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 },
+ BadRecordSize {
+ offset: u64,
+ record: String,
+ size: u32,
+ expected_size: u32,
+ },
#[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
- BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 },
+ BadRecordCount {
+ offset: u64,
+ record: String,
+ count: u32,
+ expected_count: u32,
+ },
#[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
- BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 },
+ BadLongMissingValueLength {
+ record_offset: u64,
+ offset: u64,
+ value_len: u32,
+ },
#[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
BadEncodingName { offset: u64 },
#[derive(Clone, Debug)]
pub enum Record {
- Header(Header),
- Variable(Variable),
- ValueLabel(ValueLabel),
- VarIndexes(VarIndexes),
- Document(Document),
- IntegerInfo(IntegerInfo),
- FloatInfo(FloatInfo),
- VariableSets(UnencodedString),
+ Header(HeaderRecord),
+ Variable(VariableRecord),
+ ValueLabel(ValueLabelRecord),
+ Document(DocumentRecord),
+ IntegerInfo(IntegerInfoRecord),
+ FloatInfo(FloatInfoRecord),
+ VariableSets(TextRecord),
VarDisplay(VarDisplayRecord),
MultipleResponse(MultipleResponseRecord),
LongStringValueLabels(LongStringValueLabelRecord),
Encoding(EncodingRecord),
NumberOfCases(NumberOfCasesRecord),
- ProductInfo(UnencodedString),
- LongNames(UnencodedString),
- LongStrings(UnencodedString),
- FileAttributes(UnencodedString),
- VariableAttributes(UnencodedString),
- TextExtension(TextExtension),
+ ProductInfo(TextRecord),
+ LongNames(TextRecord),
+ VeryLongStrings(TextRecord),
+ FileAttributes(TextRecord),
+ VariableAttributes(TextRecord),
OtherExtension(Extension),
EndOfHeaders(u32),
ZHeader(ZHeader),
fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
let rec_type: u32 = endian.parse(read_bytes(reader)?);
match rec_type {
- 2 => Ok(Record::Variable(Variable::read(reader, endian)?)),
- 3 => Ok(Record::ValueLabel(ValueLabel::read(reader, endian)?)),
- 4 => Ok(Record::VarIndexes(VarIndexes::read(reader, endian)?)),
- 6 => Ok(Record::Document(Document::read(reader, endian)?)),
+ 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)),
+ 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)),
+ 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)),
7 => Ok(Extension::read(reader, endian)?),
999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
_ => Err(Error::BadRecordType {
from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
}
+/// Compression type recorded for a system file; this is the type held by
+/// `HeaderRecord::compression`.
+#[derive(Copy, Clone, Debug)]
+pub enum Compression {
+ /// NOTE(review): presumably the simple (bytecode) compression scheme --
+ /// confirm.
+ Simple,
+ /// NOTE(review): presumably zlib compression (this file imports
+ /// `flate2::read::ZlibDecoder`) -- confirm.
+ ZLib,
+}
+
#[derive(Clone)]
-pub struct Header {
+pub struct HeaderRecord {
/// Magic number.
pub magic: Magic,
/// Compression type, if any,
pub compression: Option<Compression>,
- /// 0-based variable index of the weight variable, or `None` if the file is
+ /// 1-based variable index of the weight variable, or `None` if the file is
/// unweighted.
pub weight_index: Option<u32>,
pub endian: Endian,
}
-impl Header {
+impl HeaderRecord {
+ /// Writes one `name: value` line to `f`. `name` is right-aligned in a
+ /// 17-column field and `value` is formatted with its `Debug`
+ /// implementation.
fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
writeln!(f, "{name:>17}: {:?}", value)
}
}
-impl Debug for Header {
+impl Debug for HeaderRecord {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
writeln!(f, "File header record:")?;
self.debug_field(f, "Magic", self.magic)?;
}
}
-impl Header {
- fn read<R: Read>(r: &mut R) -> Result<Header, Error> {
+impl HeaderRecord {
+ fn read<R: Read>(r: &mut R) -> Result<HeaderRecord, Error> {
let magic: [u8; 4] = read_bytes(r)?;
let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
};
let weight_index: u32 = endian.parse(read_bytes(r)?);
- let weight_index = (weight_index > 0).then(|| weight_index - 1);
+ let weight_index = (weight_index > 0).then_some(weight_index);
let n_cases: u32 = endian.parse(read_bytes(r)?);
let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
let file_label = UnencodedStr::<64>(read_bytes(r)?);
let _: [u8; 3] = read_bytes(r)?;
- Ok(Header {
+ Ok(HeaderRecord {
magic,
layout_code,
nominal_case_size,
}
}
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum VarType {
Numeric,
String,
mod state {
use super::{
- Compression, Error, Header, Record, Value, VarType, Variable, ZHeader, ZTrailer,
- ZlibDecodeMultiple,
+ Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader,
+ ZTrailer, ZlibDecodeMultiple,
};
use crate::endian::Endian;
use std::{
impl<R: Read + Seek + 'static> State for Start<R> {
fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let header = Header::read(&mut self.reader)?;
+ let header = HeaderRecord::read(&mut self.reader)?;
let next_state = Headers(CommonState {
reader: self.reader,
endian: header.endian,
fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
let record = Record::read(&mut self.0.reader, self.0.endian)?;
match record {
- Record::Variable(Variable { width, .. }) => {
+ Record::Variable(VariableRecord { width, .. }) => {
self.0.var_types.push(VarType::from_width(width));
}
Record::EndOfHeaders(_) => {
impl Value {
+ /// Reads one raw value from `r` with `read_bytes` and interprets it as
+ /// `var_type` in byte order `endian` (see `from_raw`). A failed read is
+ /// returned as an `IoError`.
fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
- Ok(Self::from_raw(var_type, read_bytes(r)?, endian))
+ Ok(Self::from_raw(
+ UntypedValue(read_bytes(r)?),
+ var_type,
+ endian,
+ ))
}
- pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value {
+ pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value {
match var_type {
- VarType::String => Value::String(UnencodedStr(raw)),
+ VarType::String => Value::String(UnencodedStr(raw.0)),
VarType::Numeric => {
- let number: f64 = endian.parse(raw);
+ let number: f64 = endian.parse(raw.0);
Value::Number((number != -f64::MAX).then_some(number))
}
}
});
}
};
- values.push(Value::from_raw(var_type, raw, endian));
+ values.push(Value::from_raw(UntypedValue(raw), var_type, endian));
}
Ok(Some(values))
}
});
}
}
- 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian),
+ 253 => {
+ break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian)
+ }
254 => match var_type {
VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
VarType::Numeric => {
39 => "SDATE",
40 => "MTIME",
41 => "YMDHMS",
- _ => return format!("<unknown format {type_}>").into()
- }.into()
+ _ => return format!("<unknown format {type_}>").into(),
+ }
+ .into()
}
#[derive(Clone)]
}
#[derive(Clone)]
-pub struct Variable {
+pub struct VariableRecord {
/// Offset from the start of the file to the start of the record.
pub offset: u64,
pub label: Option<UnencodedString>,
}
-impl Debug for Variable {
+impl Debug for VariableRecord {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
writeln!(
f,
}
}
-impl Variable {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Variable, Error> {
+impl VariableRecord {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VariableRecord, Error> {
let offset = r.stream_position()?;
let width: i32 = endian.parse(read_bytes(r)?);
let has_variable_label: u32 = endian.parse(read_bytes(r)?);
let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
- Ok(Variable {
+ Ok(VariableRecord {
offset,
width,
name,
}
#[derive(Clone)]
-pub struct ValueLabel {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
+/// A value label record together with the variable index record (record
+/// type 4) that is required to follow it.
+pub struct ValueLabelRecord {
+ /// Offset from the start of the file to the start of the value label
+ /// record.
+ pub label_offset: u64,
/// The labels.
pub labels: Vec<(UntypedValue, UnencodedString)>,
+
+ /// Offset from the start of the file to the start of the variable index
+ /// record.
+ pub index_offset: u64,
+
+ /// The 1-based indexes of the variables that the labels apply to.
+ pub dict_indexes: Vec<u32>,
}
-impl Debug for ValueLabel {
+impl Debug for ValueLabelRecord {
+ /// Writes a `labels: ` heading, then one `value: label` line for each
+ /// entry in `labels`, then `apply to variables` followed by ` #N` for
+ /// each entry in `dict_indexes`. No newline is written after the
+ /// variable list.
fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ writeln!(f, "labels: ")?;
for (value, label) in self.labels.iter() {
writeln!(f, "{value:?}: {label:?}")?;
}
+ write!(f, "apply to variables")?;
+ for dict_index in self.dict_indexes.iter() {
+ write!(f, " #{dict_index}")?;
+ }
Ok(())
}
}
-impl ValueLabel {
+impl ValueLabelRecord {
/// Maximum number of value labels in a record.
- pub const MAX: u32 = u32::MAX / 8;
+ pub const MAX_LABELS: u32 = u32::MAX / 8;
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabel, Error> {
- let offset = r.stream_position()?;
+ /// Maximum number of variable indexes in a record.
+ pub const MAX_INDEXES: u32 = u32::MAX / 8;
+
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabelRecord, Error> {
+ let label_offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
- if n > ValueLabel::MAX {
+ if n > Self::MAX_LABELS {
return Err(Error::BadNumberOfValueLabels {
- offset,
+ offset: label_offset,
n,
- max: ValueLabel::MAX,
+ max: Self::MAX_LABELS,
});
}
label.truncate(label_len);
labels.push((value, UnencodedString(label)));
}
- Ok(ValueLabel { offset, labels })
- }
-}
-
-#[derive(Clone)]
-pub struct VarIndexes {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
- /// The 0-based indexes of the variable indexes.
- pub var_indexes: Vec<u32>,
-}
-
-impl Debug for VarIndexes {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "apply to variables")?;
- for var_index in self.var_indexes.iter() {
- write!(f, " #{var_index}")?;
+ let index_offset = r.stream_position()?;
+ let rec_type: u32 = endian.parse(read_bytes(r)?);
+ if rec_type != 4 {
+ return Err(Error::ExpectedVarIndexRecord {
+ offset: index_offset,
+ rec_type,
+ });
}
- Ok(())
- }
-}
-impl VarIndexes {
- /// Maximum number of variable indexes in a record.
- pub const MAX: u32 = u32::MAX / 8;
-
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexes, Error> {
- let offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
- if n > VarIndexes::MAX {
+ if n > Self::MAX_INDEXES {
return Err(Error::BadNumberOfVarIndexes {
- offset,
+ offset: index_offset,
n,
- max: VarIndexes::MAX,
+ max: Self::MAX_INDEXES,
});
}
- let mut var_indexes = Vec::with_capacity(n as usize);
+ let mut dict_indexes = Vec::with_capacity(n as usize);
for _ in 0..n {
- var_indexes.push(endian.parse(read_bytes(r)?));
+ dict_indexes.push(endian.parse(read_bytes(r)?));
}
- Ok(VarIndexes {
- offset,
- var_indexes,
+ Ok(ValueLabelRecord {
+ label_offset,
+ labels,
+ index_offset,
+ dict_indexes,
})
}
}
#[derive(Clone, Debug)]
-pub struct Document {
+/// A document record: the document text as a list of fixed-length lines.
+pub struct DocumentRecord {
+ // NOTE(review): `DocumentRecord::read` captures `pos` after the line
+ // count has been read, so this may be the offset of the first line
+ // rather than of the record itself -- confirm.
/// Offset from the start of the file to the start of the record.
pub pos: u64,
/// The document, as an array of 80-byte lines.
- pub lines: Vec<DocumentLine>
+ pub lines: Vec<DocumentLine>,
}
-pub type DocumentLine = UnencodedStr<{Document::LINE_LEN}>;
+pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>;
-impl Document {
+impl DocumentRecord {
/// Length of a line in a document. Document lines are fixed-length and
/// padded on the right with spaces.
pub const LINE_LEN: usize = 80;
/// the maximum number that will fit in a 32-bit space.
pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN;
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Document, Error> {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<DocumentRecord, Error> {
let offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
let n = n as usize;
let pos = r.stream_position()?;
let mut lines = Vec::with_capacity(n);
for _ in 0..n {
- lines.push(UnencodedStr::<{Document::LINE_LEN}>(read_bytes(r)?));
+ lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
}
- Ok(Document { pos, lines })
+ Ok(DocumentRecord { pos, lines })
}
}
}
}
#[derive(Clone, Debug)]
-pub struct IntegerInfo {
+pub struct IntegerInfoRecord {
pub version: (i32, i32, i32),
pub machine_code: i32,
pub floating_point_rep: i32,
pub character_code: i32,
}
-impl ExtensionRecord for IntegerInfo {
+impl ExtensionRecord for IntegerInfoRecord {
const SUBTYPE: u32 = 3;
const SIZE: Option<u32> = Some(4);
const COUNT: Option<u32> = Some(8);
let data: Vec<i32> = (0..8)
.map(|_| endian.parse(read_bytes(&mut input).unwrap()))
.collect();
- Ok(IntegerInfo {
+ Ok(IntegerInfoRecord {
version: (data[0], data[1], data[2]),
machine_code: data[3],
floating_point_rep: data[4],
}
#[derive(Clone, Debug)]
-pub struct FloatInfo {
+/// Floating-point info extension record (subtype 4), made up of three
+/// 8-byte values parsed as `f64` in the order `sysmis`, `highest`,
+/// `lowest`.
+pub struct FloatInfoRecord {
pub sysmis: f64,
pub highest: f64,
pub lowest: f64,
}
-impl ExtensionRecord for FloatInfo {
+impl ExtensionRecord for FloatInfoRecord {
const SUBTYPE: u32 = 4;
const SIZE: Option<u32> = Some(8);
const COUNT: Option<u32> = Some(3);
let data: Vec<f64> = (0..3)
.map(|_| endian.parse(read_bytes(&mut input).unwrap()))
.collect();
- Ok(FloatInfo {
+ Ok(FloatInfoRecord {
sysmis: data[0],
highest: data[1],
lowest: data[2],
}
}
+/// Where category labels come from.
+///
+/// NOTE(review): presumably used by multiple dichotomy sets in the multiple
+/// response record -- the use site is not visible here; confirm.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum CategoryLabels {
+ VarLabels,
+ CountedValues,
+}
+
#[derive(Clone, Debug)]
pub enum MultipleResponseType {
MultipleDichotomy {
},
MultipleCategory,
}
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet {
- pub name: UnencodedString,
- pub label: UnencodedString,
- pub mr_type: MultipleResponseType,
- pub vars: Vec<UnencodedString>,
-}
-impl MultipleResponseSet {
- fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
- let Some(equals) = input.iter().position(|&b| b == b'=') else {
- return Err(Error::TBD);
- };
- let (name, input) = input.split_at(equals);
+impl MultipleResponseType {
+ fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
let (mr_type, input) = match input.get(0) {
Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
Some(b'D') => {
}
_ => return Err(Error::TBD),
};
+ Ok((mr_type, input))
+ }
+}
+
+/// One multiple response set, as produced by `MultipleResponseSet::parse`.
+/// All text is kept undecoded.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+ /// The set's name: the bytes that precede the first `=` in the input.
+ pub name: UnencodedString,
+ pub label: UnencodedString,
+ pub mr_type: MultipleResponseType,
+ /// NOTE(review): presumably the short names of the variables in the
+ /// set -- confirm.
+ pub short_names: Vec<UnencodedString>,
+}
+
+impl MultipleResponseSet {
+ fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
+ let Some(equals) = input.iter().position(|&b| b == b'=') else {
+ return Err(Error::TBD);
+ };
+ let (name, input) = input.split_at(equals);
+ let (mr_type, input) = MultipleResponseType::parse(input)?;
let Some(b' ') = input.get(0) else {
return Err(Error::TBD);
};
name: name.into(),
label: label.into(),
mr_type,
- vars,
+ short_names: vars,
},
input,
))
}
#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord(Vec<MultipleResponseSet>);
+/// Extension record (subtype 7) holding a list of multiple response sets.
+pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
impl ExtensionRecord for MultipleResponseRecord {
const SUBTYPE: u32 = 7;
}
}
-
#[derive(Clone, Debug)]
pub struct NumberOfCasesRecord {
/// Always observed as 1.
}
}
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub enum TextExtensionSubtype {
- VariableSets = 5,
- ProductInfo = 10,
- LongNames = 13,
- LongStrings = 14,
- FileAttributes = 17,
- VariableAttributes = 18,
+/// An extension record whose payload is text, kept undecoded.
+///
+/// Built from an [`Extension`] through its `From` implementation.
+#[derive(Clone, Debug)]
+pub struct TextRecord {
+ /// Offset from the start of the file to the start of the record.
+ pub offset: u64,
+
+ /// The text content of the record.
+ pub text: UnencodedString,
}
-#[derive(Clone, Debug)]
-pub struct TextExtension {
- pub subtype: TextExtensionSubtype,
- pub string: UnencodedString,
+impl From<Extension> for TextRecord {
+ /// Converts an extension record into a text record: the offset is
+ /// carried over unchanged and the record's data becomes the undecoded
+ /// text.
+ fn from(source: Extension) -> Self {
+ TextRecord {
+ offset: source.offset,
+ text: source.data.into(),
+ }
+ }
}
#[derive(Clone, Debug)]
data,
};
match subtype {
- IntegerInfo::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfo::parse(
+ IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse(
&extension,
endian,
|_| (),
)?)),
- FloatInfo::SUBTYPE => Ok(Record::FloatInfo(FloatInfo::parse(
+ FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse(
&extension,
endian,
|_| (),
endian,
|_| (),
)?)),
- x if x == TextExtensionSubtype::VariableSets as u32 => {
- Ok(Record::VariableSets(UnencodedString(extension.data)))
- }
- x if x == TextExtensionSubtype::ProductInfo as u32 => {
- Ok(Record::ProductInfo(UnencodedString(extension.data)))
- }
- x if x == TextExtensionSubtype::LongNames as u32 => {
- Ok(Record::LongNames(UnencodedString(extension.data)))
- }
- x if x == TextExtensionSubtype::LongStrings as u32 => {
- Ok(Record::LongStrings(UnencodedString(extension.data)))
- }
- x if x == TextExtensionSubtype::FileAttributes as u32 => {
- Ok(Record::FileAttributes(UnencodedString(extension.data)))
- }
- x if x == TextExtensionSubtype::VariableAttributes as u32 => {
- Ok(Record::VariableAttributes(UnencodedString(extension.data)))
- }
+ 5 => Ok(Record::VariableSets(extension.into())),
+ 10 => Ok(Record::ProductInfo(extension.into())),
+ 13 => Ok(Record::LongNames(extension.into())),
+ 14 => Ok(Record::VeryLongStrings(extension.into())),
+ 17 => Ok(Record::FileAttributes(extension.into())),
+ 18 => Ok(Record::VariableAttributes(extension.into())),
_ => Ok(Record::OtherExtension(extension)),
}
}
}
#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord(Vec<LongStringValueLabels>);
+/// Extension record (subtype 21) holding a list of long string value label
+/// sets.
+pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
impl ExtensionRecord for LongStringValueLabelRecord {
const SUBTYPE: u32 = 21;
Ok(LongStringValueLabelRecord(label_set))
}
}
-