-use crate::endian::{Endian, Parse, ToBytes};
+use crate::{
+ dictionary::VarWidth,
+ encoding::{default_encoding, get_encoding, Error as EncodingError},
+ endian::{Endian, Parse, ToBytes},
+ identifier::{Error as IdError, Identifier},
+};
use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
use flate2::read::ZlibDecoder;
borrow::Cow,
cell::RefCell,
cmp::Ordering,
- collections::VecDeque,
+ collections::{HashMap, VecDeque},
fmt::{Debug, Display, Formatter, Result as FmtResult},
io::{Error as IoError, Read, Seek, SeekFrom},
iter::repeat,
#[error("Invalid ZSAV compression code {0}")]
InvalidZsavCompression(u32),
- #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
- BadVariableWidth { offset: u64, width: i32 },
-
#[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
BadDocumentLength { offset: u64, n: usize, max: usize },
#[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
BadRecordType { offset: u64, rec_type: u32 },
+ #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
+ BadVariableWidth {
+ start_offset: u64,
+ width: i32,
+ },
+
#[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
BadVariableLabelCode {
start_offset: u64,
#[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
TooManyVarIndexes { offset: u64, n: u32, max: u32 },
- #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
- NoVarIndexes { offset: u64 },
-
- #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
- MixedVarTypes {
- offset: u64,
- var_type: VarType,
- wrong_types: Vec<u32>,
- },
-
- #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
- InvalidVarIndexes {
- offset: u64,
- max: usize,
- invalid: Vec<u32>,
- },
-
#[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
ExtensionRecordTooLarge {
offset: u64,
ztrailer_len: u64,
},
+ #[error("{0}")]
+ EncodingError(EncodingError),
+}
+
+#[derive(ThisError, Debug)]
+pub enum Warning {
+ #[error("Unexpected end of data inside extension record.")]
+ UnexpectedEndOfData,
+
+ #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
+ NoVarIndexes { offset: u64 },
+
+ #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
+ MixedVarTypes {
+ offset: u64,
+ var_type: VarType,
+ wrong_types: Vec<u32>,
+ },
+
+ #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
+ InvalidVarIndexes {
+ offset: u64,
+ max: usize,
+ invalid: Vec<u32>,
+ },
+
#[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
BadRecordSize {
offset: u64,
#[error("Invalid variable display alignment value {0}")]
InvalidAlignment(u32),
+ #[error("Invalid attribute name. {0}")]
+ InvalidAttributeName(IdError),
+
+ #[error("Invalid variable name in attribute record. {0}")]
+ InvalidAttributeVariableName(IdError),
+
+ #[error("Invalid short name in long variable name record. {0}")]
+ InvalidShortName(IdError),
+
+ #[error("Invalid name in long variable name record. {0}")]
+ InvalidLongName(IdError),
+
+ #[error("Invalid variable name in very long string record. {0}")]
+ InvalidLongStringName(IdError),
+
+ #[error("Invalid variable name in variable set record. {0}")]
+ InvalidVariableSetName(IdError),
+
+ #[error("Invalid multiple response set name. {0}")]
+ InvalidMrSetName(IdError),
+
+ #[error("Invalid multiple response set variable name. {0}")]
+ InvalidMrSetVariableName(IdError),
+
+ #[error("Invalid variable name in long string missing values record. {0}")]
+ InvalidLongStringMissingValueVariableName(IdError),
+
+ #[error("Invalid variable name in long string value label record. {0}")]
+ InvalidLongStringValueLabelName(IdError),
+
+ #[error("{0}")]
+ EncodingError(EncodingError),
+
#[error("Details TBD")]
TBD,
}
+/// Converts an I/O error into [`Warning::UnexpectedEndOfData`].
+///
+/// This lets `?` be used on read failures inside functions that return
+/// `Result<_, Warning>`.  The details of the underlying error are discarded:
+/// whatever went wrong, the warning only says that the data ended early.
+impl From<IoError> for Warning {
+    fn from(_: IoError) -> Self {
+        Warning::UnexpectedEndOfData
+    }
+}
+
#[derive(Clone, Debug)]
pub enum Record {
Header(HeaderRecord<RawString>),
Document(DocumentRecord<RawDocumentLine>),
IntegerInfo(IntegerInfoRecord),
FloatInfo(FloatInfoRecord),
- VariableSets(TextRecord),
VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord<RawString>),
- LongStringValueLabels(LongStringValueLabelRecord),
- LongStringMissingValues(LongStringMissingValueRecord),
+ MultipleResponse(MultipleResponseRecord<RawString, RawString>),
+ LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
+ LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
+ Encoding(EncodingRecord),
+ NumberOfCases(NumberOfCasesRecord),
+ Text(TextRecord),
+ OtherExtension(Extension),
+ EndOfHeaders(u32),
+ ZHeader(ZHeader),
+ ZTrailer(ZTrailer),
+ Cases(Rc<RefCell<Cases>>),
+}
+
+#[derive(Clone, Debug)]
+pub enum DecodedRecord {
+ Header(HeaderRecord<String>),
+ Variable(VariableRecord<String, String>),
+ ValueLabel(ValueLabelRecord<RawStr<8>, String>),
+ Document(DocumentRecord<String>),
+ IntegerInfo(IntegerInfoRecord),
+ FloatInfo(FloatInfoRecord),
+ VarDisplay(VarDisplayRecord),
+ MultipleResponse(MultipleResponseRecord<Identifier, String>),
+ LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
+ LongStringMissingValues(LongStringMissingValueRecord<Identifier, String>),
Encoding(EncodingRecord),
NumberOfCases(NumberOfCasesRecord),
- ProductInfo(TextRecord),
- LongNames(TextRecord),
- VeryLongStrings(TextRecord),
- FileAttributes(TextRecord),
- VariableAttributes(TextRecord),
+ VariableSets(VariableSetRecord),
+ ProductInfo(ProductInfoRecord),
+ LongNames(LongNamesRecord),
+ VeryLongStrings(VeryLongStringsRecord),
+ FileAttributes(FileAttributeRecord),
+ VariableAttributes(VariableAttributeRecord),
OtherExtension(Extension),
EndOfHeaders(u32),
ZHeader(ZHeader),
reader: &mut R,
endian: Endian,
var_types: &[VarType],
- warn: &Box<dyn Fn(Error)>,
+ warn: &dyn Fn(Warning),
) -> Result<Option<Record>, Error>
where
R: Read + Seek,
}),
}
}
+
+    /// Converts this raw record into a [`DecodedRecord`] using `decoder`.
+    ///
+    /// Records that carry text are handed to their own `decode` method.
+    /// Records with nothing to decode are moved, unchanged, into the
+    /// `DecodedRecord` variant of the same name.
+    ///
+    /// Every arm currently produces `Ok`.
+    pub fn decode(self, decoder: &Decoder) -> Result<DecodedRecord, Error> {
+        // `self` is owned, so each payload is moved rather than cloned.
+        Ok(match self {
+            Record::Header(record) => record.decode(decoder),
+            Record::Variable(record) => record.decode(decoder),
+            Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
+            Record::Document(record) => record.decode(decoder),
+            Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record),
+            Record::FloatInfo(record) => DecodedRecord::FloatInfo(record),
+            Record::VarDisplay(record) => DecodedRecord::VarDisplay(record),
+            Record::MultipleResponse(record) => record.decode(decoder),
+            Record::LongStringValueLabels(record) => {
+                DecodedRecord::LongStringValueLabels(record.decode(decoder))
+            }
+            Record::LongStringMissingValues(record) => {
+                DecodedRecord::LongStringMissingValues(record.decode(decoder))
+            }
+            Record::Encoding(record) => DecodedRecord::Encoding(record),
+            Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record),
+            Record::Text(record) => record.decode(decoder),
+            Record::OtherExtension(record) => DecodedRecord::OtherExtension(record),
+            Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
+            Record::ZHeader(record) => DecodedRecord::ZHeader(record),
+            Record::ZTrailer(record) => DecodedRecord::ZTrailer(record),
+            Record::Cases(record) => DecodedRecord::Cases(record),
+        })
+    }
+}
+
+/// Chooses the character encoding to use for the text in `headers`.
+///
+/// The name from the encoding record and the character code from the integer
+/// info record, when those records are present, are passed to `get_encoding`.
+/// If either kind of record occurs more than once, the last one is used.
+///
+/// # Errors
+///
+/// An `EncodingError::Ebcdic` failure is returned as
+/// [`Error::EncodingError`].  Any other failure is passed to `warn` and the
+/// default encoding is returned instead.
+pub fn encoding_from_headers(
+    headers: &[Record],
+    warn: &impl Fn(Warning),
+) -> Result<&'static Encoding, Error> {
+    let mut encoding_record = None;
+    let mut integer_info_record = None;
+    for record in headers {
+        match record {
+            Record::Encoding(record) => encoding_record = Some(record),
+            Record::IntegerInfo(record) => integer_info_record = Some(record),
+            _ => (),
+        }
+    }
+    let encoding = encoding_record.map(|record| record.0.as_str());
+    let character_code = integer_info_record.map(|record| record.character_code);
+    match get_encoding(encoding, character_code) {
+        Ok(encoding) => Ok(encoding),
+        Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
+        Err(err) => {
+            // Not fatal: report the problem, then fall back to the default
+            // encoding.
+            warn(Warning::EncodingError(err));
+            Ok(default_encoding())
+        }
+    }
}
// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
})
}
- fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord<Cow<'a, str>> {
- let eye_catcher = decoder.decode(&self.eye_catcher);
- let file_label = decoder.decode(&self.file_label);
- let creation_date = decoder.decode(&self.creation_date);
- let creation_time = decoder.decode(&self.creation_time);
- HeaderRecord {
+ pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+ let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
+ let file_label = decoder.decode(&self.file_label).to_string();
+ let creation_date = decoder.decode(&self.creation_date).to_string();
+ let creation_time = decoder.decode(&self.creation_time).to_string();
+ DecodedRecord::Header(HeaderRecord {
eye_catcher,
weight_index: self.weight_index,
n_cases: self.n_cases,
creation_date,
creation_time,
endian: self.endian,
- }
+ })
}
}
-struct Decoder {
- encoding: &'static Encoding,
- warn: Box<dyn Fn(Error)>,
+pub struct Decoder {
+ pub encoding: &'static Encoding,
+ pub warn: Box<dyn Fn(Warning)>,
}
impl Decoder {
+    /// Creates a decoder that converts text from `encoding` and reports
+    /// problems by calling `warn`.
+    ///
+    /// The callback is boxed and stored in the decoder, which is why it has
+    /// to be `'static`.
+    pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
+    where
+        F: Fn(Warning) + 'static,
+    {
+        let warn: Box<dyn Fn(Warning)> = Box::new(warn);
+        Decoder { encoding, warn }
+    }
+    /// Reports `warning` through the callback stored in this decoder.
+    fn warn(&self, warning: Warning) {
+        let report = &self.warn;
+        report(warning)
+    }
fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
let (output, malformed) = self.encoding.decode_without_bom_handling(input);
if malformed {
- (self.warn)(Error::MalformedString {
+ self.warn(Warning::MalformedString {
encoding: self.encoding.name().into(),
text: output.clone().into(),
});
/// same length in bytes.
///
/// XXX warn about errors?
- fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+ pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
if let (s, false) = self.encoding.decode_without_bom_handling(input) {
// This is the common case. Usually there will be no errors.
s
output.into()
}
}
+
+    /// Decodes `input` to text and then validates that text as an identifier.
+    ///
+    /// # Errors
+    ///
+    /// Returns whatever error `new_identifier` reports for the decoded text.
+    pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
+        let name = self.decode(input);
+        self.new_identifier(&name)
+    }
+
+    /// Builds an [`Identifier`] from already-decoded text, passing along this
+    /// decoder's encoding.
+    pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
+        let encoding = self.encoding;
+        Identifier::new(name, encoding)
+    }
}
impl<S> Header for HeaderRecord<S>
}
impl VarType {
- fn from_width(width: i32) -> VarType {
+ pub fn from_width(width: VarWidth) -> VarType {
match width {
- 0 => VarType::Numeric,
- _ => VarType::String,
+ VarWidth::Numeric => Self::Numeric,
+ VarWidth::String(_) => Self::String,
}
}
- fn opposite(self) -> VarType {
+ pub fn opposite(self) -> VarType {
match self {
Self::Numeric => Self::String,
Self::String => Self::Numeric,
Ok(Some(values))
}
- fn decode(&self, decoder: &Decoder) -> Value<String> {
+ fn decode(self, decoder: &Decoder) -> Value<String> {
match self {
- Self::Number(x) => Value::Number(*x),
+ Self::Number(x) => Value::Number(x),
Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
}
}
R: Read + Seek + 'static,
{
reader: Option<R>,
- warn: Box<dyn Fn(Error)>,
+ warn: Box<dyn Fn(Warning)>,
header: HeaderRecord<RawString>,
var_types: Vec<VarType>,
{
pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
where
- F: Fn(Error) + 'static,
+ F: Fn(Warning) + 'static,
{
let header = HeaderRecord::read(&mut reader)?;
Ok(Self {
&self.header,
)
}
-}
-
-impl<R> Iterator for Reader<R>
-where
- R: Read + Seek + 'static,
-{
- type Item = Result<Record, Error>;
-
- fn next(&mut self) -> Option<Self::Item> {
+ fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
match self.state {
ReaderState::Start => {
self.state = ReaderState::Headers;
};
match record {
Record::Variable(VariableRecord { width, .. }) => {
- self.var_types.push(VarType::from_width(width));
+ self.var_types.push(if width == 0 {
+ VarType::Numeric
+ } else {
+ VarType::String
+ });
}
Record::EndOfHeaders(_) => {
self.state = if let Some(Compression::ZLib) = self.header.compression {
}
}
+/// Iterates over the records produced by `Reader::_next`.
+///
+/// As soon as `_next` yields an error, the reader's state is set to
+/// `ReaderState::End` (presumably so that later calls yield nothing more;
+/// confirm against `_next`).
+impl<R> Iterator for Reader<R>
+where
+    R: Read + Seek + 'static,
+{
+    type Item = Result<Record, Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let item = self._next();
+        if let Some(Err(_)) = &item {
+            self.state = ReaderState::End;
+        }
+        item
+    }
+}
+
trait ReadSeek: Read + Seek {}
impl<T> ReadSeek for T where T: Read + Seek {}
}
#[derive(Clone)]
-pub struct MissingValues<S>
+pub struct MissingValues<S = String>
where
S: Debug,
{
}
}
+/// The default set of missing values is empty: no individual values and no
+/// range.
+impl<S> Default for MissingValues<S>
+where
+    S: Debug,
+{
+    fn default() -> Self {
+        let values = Vec::new();
+        let range = None;
+        MissingValues { values, range }
+    }
+}
+
impl MissingValues<RawStr<8>> {
fn read<R: Read + Seek>(
r: &mut R,
(_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
};
- let var_type = VarType::from_width(width);
+ let var_type = if width == 0 {
+ VarType::Numeric
+ } else {
+ VarType::String
+ };
let mut values = Vec::new();
for _ in 0..n_values {
};
Ok(Self { values, range })
}
- fn decode<'a>(&'a self, decoder: &Decoder) -> MissingValues<String> {
+ fn decode(&self, decoder: &Decoder) -> MissingValues<String> {
MissingValues {
values: self
.values
fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
let start_offset = r.stream_position()?;
let width: i32 = endian.parse(read_bytes(r)?);
+ if !(-1..=255).contains(&width) {
+ return Err(Error::BadVariableWidth { start_offset, width });
+ }
let code_offset = r.stream_position()?;
let has_variable_label: u32 = endian.parse(read_bytes(r)?);
let missing_value_code: i32 = endian.parse(read_bytes(r)?);
}))
}
- fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord<Cow<'a, str>, String> {
- VariableRecord {
+ pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+ DecodedRecord::Variable(VariableRecord {
offsets: self.offsets.clone(),
width: self.width,
- name: decoder.decode(&self.name),
+ name: decoder.decode(&self.name).to_string(),
print_format: self.print_format,
write_format: self.write_format,
missing_values: self.missing_values.decode(decoder),
- label: self.label.as_ref().map(|label| decoder.decode(label)),
- }
+ label: self
+ .label
+ .as_ref()
+ .map(|label| decoder.decode(label).to_string()),
+ })
}
}
r: &mut R,
endian: Endian,
var_types: &[VarType],
- warn: &Box<dyn Fn(Error)>,
+ warn: &dyn Fn(Warning),
) -> Result<Option<Record>, Error> {
let label_offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
}
}
if !invalid_indexes.is_empty() {
- warn(Error::InvalidVarIndexes {
+ warn(Warning::InvalidVarIndexes {
offset: index_offset,
max: var_types.len(),
invalid: invalid_indexes,
}
let Some(&first_index) = dict_indexes.first() else {
- warn(Error::NoVarIndexes {
+ warn(Warning::NoVarIndexes {
offset: index_offset,
});
return Ok(None);
}
});
if !wrong_type_indexes.is_empty() {
- warn(Error::MixedVarTypes {
+ warn(Warning::MixedVarTypes {
offset: index_offset,
var_type,
wrong_types: wrong_type_indexes,
var_type,
})))
}
+
+    /// Decodes the text of every label with `decoder`.
+    ///
+    /// Values, offsets, dictionary indexes, and the variable type are carried
+    /// over unchanged.  The record is consumed, so those parts are moved into
+    /// the result instead of being cloned.
+    fn decode(self, decoder: &Decoder) -> ValueLabelRecord<RawStr<8>, String> {
+        let labels = self
+            .labels
+            .into_iter()
+            .map(|ValueLabel { value, label }| ValueLabel {
+                value,
+                label: decoder.decode(&label).to_string(),
+            })
+            .collect();
+        ValueLabelRecord {
+            offsets: self.offsets,
+            labels,
+            dict_indexes: self.dict_indexes,
+            var_type: self.var_type,
+        }
+    }
}
#[derive(Clone, Debug)]
{
pub offsets: Range<u64>,
- /// The document, as an array of 80-byte lines.
+ /// The document, as an array of lines. Raw lines are exactly 80 bytes long
+ /// and are right-padded with spaces without any new-line termination.
pub lines: Vec<S>,
}
}
}
- fn decode<'a>(&'a self, decoder: &Decoder) -> DocumentRecord<Cow<'a, str>> {
- DocumentRecord {
+ pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+ DecodedRecord::Document(DocumentRecord {
offsets: self.offsets.clone(),
lines: self
.lines
.iter()
- .map(|s| decoder.decode_slice(&s.0))
+ .map(|s| decoder.decode_slice(&s.0).to_string())
.collect(),
- }
+ })
}
}
const SIZE: Option<u32>;
const COUNT: Option<u32>;
const NAME: &'static str;
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error>;
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning>;
}
#[derive(Clone, Debug)]
const COUNT: Option<u32> = Some(8);
const NAME: &'static str = "integer record";
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
const COUNT: Option<u32> = Some(3);
const NAME: &'static str = "floating point record";
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
}
impl MultipleResponseType {
- fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> {
+ fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
let (mr_type, input) = match input.split_first() {
Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
Some((b'D', input)) => {
} else if let Some(rest) = input.strip_prefix(b" 11 ") {
(CategoryLabels::VarLabels, rest)
} else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let (value, input) = parse_counted_string(input)?;
(
input,
)
}
- _ => return Err(Error::TBD),
+ _ => return Err(Warning::TBD),
};
Ok((mr_type, input))
}
}
#[derive(Clone, Debug)]
-pub struct MultipleResponseSet<S>
+pub struct MultipleResponseSet<I, S>
where
+ I: Debug,
S: Debug,
{
- pub name: S,
+ pub name: I,
pub label: S,
pub mr_type: MultipleResponseType,
- pub short_names: Vec<S>,
+ pub short_names: Vec<I>,
}
-impl MultipleResponseSet<RawString> {
- fn parse(input: &[u8]) -> Result<(Self, &[u8]), Error> {
+impl MultipleResponseSet<RawString, RawString> {
+ fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
let Some(equals) = input.iter().position(|&b| b == b'=') else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let (name, input) = input.split_at(equals);
let (mr_type, input) = MultipleResponseType::parse(input)?;
let Some(input) = input.strip_prefix(b" ") else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let (label, mut input) = parse_counted_string(input)?;
let mut vars = Vec::new();
match input.split_first() {
Some((b' ', rest)) => {
let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let (var, rest) = rest.split_at(length);
if !var.is_empty() {
}
input = rest;
}
- _ => return Err(Error::TBD),
+ _ => return Err(Warning::TBD),
}
}
while input.first() == Some(&b'\n') {
))
}
- fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseSet<Cow<'a, str>> {
- MultipleResponseSet {
- name: decoder.decode(&self.name),
- label: decoder.decode(&self.label),
- mr_type: self.mr_type.clone(),
- short_names: self.short_names.iter().map(|s| decoder.decode(s)).collect(),
+    /// Decodes this set's name, label, and variable short names.
+    ///
+    /// A short name that is not a valid identifier is reported through the
+    /// decoder's warning callback and left out of the result.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Warning::InvalidMrSetName`, without producing a set, if the
+    /// set's own name is not a valid identifier.
+    fn decode(
+        &self,
+        decoder: &Decoder,
+    ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
+        let mut short_names = Vec::with_capacity(self.short_names.len());
+        for short_name in self.short_names.iter() {
+            // These are the names of the set's variables, so a bad one is an
+            // invalid *variable* name, not an invalid set name.
+            if let Some(short_name) = decoder
+                .decode_identifier(short_name)
+                .map_err(Warning::InvalidMrSetVariableName)
+                .issue_warning(&decoder.warn)
+            {
+                short_names.push(short_name);
+            }
}
+        Ok(MultipleResponseSet {
+            name: decoder
+                .decode_identifier(&self.name)
+                .map_err(Warning::InvalidMrSetName)?,
+            label: decoder.decode(&self.label).to_string(),
+            mr_type: self.mr_type.clone(),
+            short_names,
+        })
}
}
#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord<S>(pub Vec<MultipleResponseSet<S>>)
+pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
where
+ I: Debug,
S: Debug;
-impl ExtensionRecord for MultipleResponseRecord<RawString> {
+impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
const SUBTYPE: u32 = 7;
const SIZE: Option<u32> = Some(1);
const COUNT: Option<u32> = None;
const NAME: &'static str = "multiple response set record";
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
+ fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
}
}
-impl MultipleResponseRecord<RawString> {
- fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseRecord<Cow<'a, str>> {
- MultipleResponseRecord(self.0.iter().map(|set| set.decode(decoder)).collect())
+impl MultipleResponseRecord<RawString, RawString> {
+    /// Decodes every set in the record.
+    ///
+    /// A set that fails to decode is reported through the decoder's warning
+    /// callback and dropped; the remaining sets keep their original order.
+    fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        let sets: Vec<_> = self
+            .0
+            .iter()
+            .filter_map(|set| set.decode(decoder).issue_warning(&decoder.warn))
+            .collect();
+        DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
}
}
-fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Error> {
+fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
let Some(space) = input.iter().position(|&b| b == b' ') else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let Ok(length) = from_utf8(&input[..space]) else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let Ok(length): Result<usize, _> = length.parse() else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let input = &input[space + 1..];
if input.len() < length {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let (string, rest) = input.split_at(length);
}
impl Measure {
- fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
+    /// Returns the default measurement level for a variable of `var_type`:
+    /// nominal for string variables, none for numeric variables.
+    pub fn default_for_type(var_type: VarType) -> Option<Measure> {
+        match var_type {
+            VarType::String => Some(Measure::Nominal),
+            VarType::Numeric => None,
+        }
+    }
+
+ fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
match source {
0 => Ok(None),
1 => Ok(Some(Measure::Nominal)),
2 => Ok(Some(Measure::Ordinal)),
3 => Ok(Some(Measure::Scale)),
- _ => Err(Error::InvalidMeasurement(source)),
+ _ => Err(Warning::InvalidMeasurement(source)),
}
}
}
}
impl Alignment {
- fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
+ fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
match source {
0 => Ok(None),
1 => Ok(Some(Alignment::Left)),
2 => Ok(Some(Alignment::Right)),
3 => Ok(Some(Alignment::Center)),
- _ => Err(Error::InvalidAlignment(source)),
+ _ => Err(Warning::InvalidAlignment(source)),
+ }
+ }
+
+    /// Returns the default alignment for a variable of `var_type`: left for
+    /// string variables, right for numeric variables.
+    pub fn default_for_type(var_type: VarType) -> Self {
+        match var_type {
+            VarType::String => Alignment::Left,
+            VarType::Numeric => Alignment::Right,
}
}
}
ext: &Extension,
n_vars: usize,
endian: Endian,
- warn: &Box<dyn Fn(Error)>,
- ) -> Result<Record, Error> {
+ warn: &dyn Fn(Warning),
+ ) -> Result<Record, Warning> {
if ext.size != 4 {
- return Err(Error::BadRecordSize {
+ return Err(Warning::BadRecordSize {
offset: ext.offsets.start,
record: String::from("variable display record"),
size: ext.size,
} else if ext.count as usize == 2 * n_vars {
false
} else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let mut var_displays = Vec::new();
let mut input = &ext.data[..];
for _ in 0..n_vars {
let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .warn_on_error(&warn)
+ .issue_warning(&warn)
.flatten();
let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .warn_on_error(&warn)
+ .issue_warning(&warn)
.flatten();
var_displays.push(VarDisplay {
measure,
}
#[derive(Clone, Debug)]
-pub struct LongStringMissingValues {
+pub struct LongStringMissingValues<N, V>
+where
+ N: Debug,
+ V: Debug,
+{
/// Variable name.
- pub var_name: RawString,
+ pub var_name: N,
/// Missing values.
- pub missing_values: MissingValues<RawStr<8>>,
+ pub missing_values: MissingValues<V>,
+}
+
+impl LongStringMissingValues<RawString, RawStr<8>> {
+    /// Decodes the variable name into an identifier and the missing values
+    /// into strings.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the decoded variable name is not a valid identifier; the
+    /// missing values are not decoded in that case.
+    fn decode(
+        &self,
+        decoder: &Decoder,
+    ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
+        let var_name = decoder.decode_identifier(&self.var_name)?;
+        let missing_values = self.missing_values.decode(decoder);
+        Ok(LongStringMissingValues {
+            var_name,
+            missing_values,
+        })
+    }
}
#[derive(Clone, Debug)]
-pub struct LongStringMissingValueRecord(pub Vec<LongStringMissingValues>);
+pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
+where
+ N: Debug,
+ V: Debug;
-impl ExtensionRecord for LongStringMissingValueRecord {
+impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
const SUBTYPE: u32 = 22;
const SIZE: Option<u32> = Some(1);
const COUNT: Option<u32> = None;
const NAME: &'static str = "long string missing values record";
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
let value_len: u32 = endian.parse(read_bytes(&mut input)?);
if value_len != 8 {
let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
- return Err(Error::BadLongMissingValueLength {
+ return Err(Warning::BadLongMissingValueLength {
record_offset: ext.offsets.start,
offset,
value_len,
}
}
+impl LongStringMissingValueRecord<RawString, RawStr<8>> {
+    /// Decodes each variable's long string missing values.
+    ///
+    /// An entry whose variable name is not a valid identifier is reported as
+    /// `Warning::InvalidLongStringMissingValueVariableName` through the
+    /// decoder's warning callback and left out of the result.
+    pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord<Identifier, String> {
+        let mut mvs = Vec::with_capacity(self.0.len());
+        mvs.extend(self.0.iter().filter_map(|mv| {
+            mv.decode(decoder)
+                .map_err(Warning::InvalidLongStringMissingValueVariableName)
+                .issue_warning(&decoder.warn)
+        }));
+        LongStringMissingValueRecord(mvs)
+    }
+}
+
#[derive(Clone, Debug)]
pub struct EncodingRecord(pub String);
const COUNT: Option<u32> = None;
const NAME: &'static str = "encoding record";
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Error> {
+ fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
Ok(Record::Encoding(EncodingRecord(
- String::from_utf8(ext.data.clone()).map_err(|_| Error::BadEncodingName {
+ String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
offset: ext.offsets.start,
})?,
)))
}
}
-#[derive(Copy, Clone, Debug)]
+#[derive(Clone, Debug)]
pub struct NumberOfCasesRecord {
/// Always observed as 1.
pub one: u64,
const COUNT: Option<u32> = Some(2);
const NAME: &'static str = "extended number of cases record";
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
pub struct TextRecord {
pub offsets: Range<u64>,
+ /// Type of record.
+ pub rec_type: TextRecordType,
+
/// The text content of the record.
pub text: RawString,
}
-impl From<Extension> for TextRecord {
- fn from(source: Extension) -> Self {
- TextRecord {
- offsets: source.offsets,
- text: source.data.into(),
+/// Identifies which kind of text-based record a [`TextRecord`] holds, and
+/// therefore which decoder `TextRecord::decode` applies to its text.
+#[derive(Clone, Copy, Debug)]
+pub enum TextRecordType {
+ /// Decoded as a `VariableSetRecord`.
+ VariableSets,
+ /// Decoded as a `ProductInfoRecord`.
+ ProductInfo,
+ /// Decoded as a `LongNamesRecord`.
+ LongNames,
+ /// Decoded as a `VeryLongStringsRecord`.
+ VeryLongStrings,
+ /// Decoded as a `FileAttributeRecord`.
+ FileAttributes,
+ /// Decoded as a `VariableAttributeRecord`.
+ VariableAttributes,
+}
+
+impl TextRecord {
+    /// Wraps the offsets and data of `extension` as a text record of kind
+    /// `rec_type`.
+    fn new(extension: Extension, rec_type: TextRecordType) -> Self {
+        let offsets = extension.offsets;
+        let text = extension.data.into();
+        Self {
+            offsets,
+            rec_type,
+            text,
+        }
+    }
+
+    /// Decodes the record's text with the decoder that matches its
+    /// `rec_type` and wraps the result in the corresponding
+    /// [`DecodedRecord`] variant.
+    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        let source = &self;
+        match source.rec_type {
+            TextRecordType::VariableSets => {
+                let record = VariableSetRecord::decode(source, decoder);
+                DecodedRecord::VariableSets(record)
+            }
+            TextRecordType::ProductInfo => {
+                let record = ProductInfoRecord::decode(source, decoder);
+                DecodedRecord::ProductInfo(record)
+            }
+            TextRecordType::LongNames => {
+                let record = LongNamesRecord::decode(source, decoder);
+                DecodedRecord::LongNames(record)
+            }
+            TextRecordType::VeryLongStrings => {
+                let record = VeryLongStringsRecord::decode(source, decoder);
+                DecodedRecord::VeryLongStrings(record)
+            }
+            TextRecordType::FileAttributes => {
+                let record = FileAttributeRecord::decode(source, decoder);
+                DecodedRecord::FileAttributes(record)
+            }
+            TextRecordType::VariableAttributes => {
+                let record = VariableAttributeRecord::decode(source, decoder);
+                DecodedRecord::VariableAttributes(record)
+            }
+        }
+    }
+}
+
+/// One `name=length` entry from a very long string record.
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+    /// Short name of the variable.
+    pub short_name: Identifier,
+    /// The length given for the variable.
+    pub length: u16,
+}
+
+impl VeryLongString {
+    /// Parses a single `name=length` tuple.
+    ///
+    /// # Errors
+    ///
+    /// Fails if there is no `=`, if the name is not a valid identifier, or if
+    /// the length does not parse as a `u16`.
+    fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
+        let (name, length) = input.split_once('=').ok_or(Warning::TBD)?;
+        let short_name = decoder
+            .new_identifier(name)
+            .map_err(Warning::InvalidLongStringName)?;
+        let length = match length.parse() {
+            Ok(length) => length,
+            Err(_) => return Err(Warning::TBD),
+        };
+        Ok(VeryLongString { short_name, length })
+    }
+}
+
+/// The decoded contents of a very long string record: one entry per
+/// successfully parsed `name=length` tuple.
+#[derive(Clone, Debug)]
+pub struct VeryLongStringsRecord(Vec<VeryLongString>);
+
+impl VeryLongStringsRecord {
+    /// Decodes the record text and parses each tuple in it.
+    ///
+    /// Tuples that fail to parse are reported through the decoder's warning
+    /// callback and skipped.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let input = decoder.decode(&source.text);
+        let mut very_long_strings = Vec::new();
+        // Tuples are delimited by NUL followed by a tab, so after splitting
+        // on NUL the tab sits at the *start* of the next tuple.  Trim tabs on
+        // both sides so that it does not end up in the variable name.
+        for tuple in input
+            .split('\0')
+            .map(|s| s.trim_matches('\t'))
+            .filter(|s| !s.is_empty())
+        {
+            if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) {
+                very_long_strings.push(vls)
+            }
}
+        VeryLongStringsRecord(very_long_strings)
}
}
+/// A named attribute together with its list of values.
+#[derive(Clone, Debug)]
+pub struct Attribute {
+    /// The attribute's name.
+    pub name: Identifier,
+    /// The attribute's values, in the order they were written.
+    pub values: Vec<String>,
+}
+
+impl Attribute {
+    /// Parses one attribute of the form `name(` followed by one or more
+    /// newline-terminated values and a closing `)`.
+    ///
+    /// Each value is expected to be wrapped in single quotes, which are
+    /// removed.  A value without them is kept as is after a warning is sent
+    /// through `decoder`.
+    ///
+    /// Returns the attribute and the input that follows the closing `)`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if there is no `(`, if the name is not a valid identifier, or if
+    /// the input ends before a value's newline is found.
+    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
+        let (name, mut remaining) = input.split_once('(').ok_or(Warning::TBD)?;
+        let name = decoder
+            .new_identifier(name)
+            .map_err(Warning::InvalidAttributeName)?;
+        let mut values = Vec::new();
+        loop {
+            let (value, rest) = remaining.split_once('\n').ok_or(Warning::TBD)?;
+            let unquoted = value
+                .strip_prefix('\'')
+                .and_then(|value| value.strip_suffix('\''));
+            match unquoted {
+                Some(stripped) => values.push(stripped.into()),
+                None => {
+                    decoder.warn(Warning::TBD);
+                    values.push(value.into());
+                }
+            }
+            match rest.strip_prefix(')') {
+                Some(rest) => return Ok((Attribute { name, values }, rest)),
+                None => remaining = rest,
+            }
+        }
+    }
+}
+
+/// A set of attributes, keyed by attribute name.
+#[derive(Clone, Debug, Default)]
+pub struct AttributeSet(pub HashMap<Identifier, Vec<String>>);
+
+impl AttributeSet {
+    /// Parses attributes from `input` until the input is exhausted or, when
+    /// `sentinel` is given, until an attribute would start with that
+    /// character.
+    ///
+    /// Returns the set and the remaining input, with the sentinel itself
+    /// consumed.  If a name occurs more than once, the last occurrence
+    /// replaces the earlier ones.
+    ///
+    /// # Errors
+    ///
+    /// Fails with the first error from `Attribute::parse`; attributes parsed
+    /// before that point are discarded.
+    fn parse<'a>(
+        decoder: &Decoder,
+        mut input: &'a str,
+        sentinel: Option<char>,
+    ) -> Result<(AttributeSet, &'a str), Warning> {
+        let mut attributes = HashMap::new();
+        let rest = loop {
+            if input.is_empty() {
+                break input;
+            }
+            if let Some(sentinel) = sentinel {
+                // `strip_prefix` skips the sentinel by its full encoded
+                // length, so a multi-byte sentinel cannot split a character.
+                if let Some(rest) = input.strip_prefix(sentinel) {
+                    break rest;
+                }
+            }
+            let (attribute, rest) = Attribute::parse(decoder, input)?;
+            // XXX report duplicate name
+            attributes.insert(attribute.name, attribute.values);
+            input = rest;
+        };
+        Ok((AttributeSet(attributes), rest))
+    }
+}
+
+/// The attribute set decoded from a file attributes record.
+#[derive(Clone, Debug, Default)]
+pub struct FileAttributeRecord(pub AttributeSet);
+
+impl FileAttributeRecord {
+    /// Decodes the record text and parses it as a single attribute set.
+    ///
+    /// If parsing fails, the failure is reported through the decoder's
+    /// warning callback and an empty record is returned.  Input left over
+    /// after the set also produces a warning.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let input = decoder.decode(&source.text);
+        let parsed = AttributeSet::parse(decoder, &input, None).issue_warning(&decoder.warn);
+        let Some((set, rest)) = parsed else {
+            return FileAttributeRecord::default();
+        };
+        if !rest.is_empty() {
+            decoder.warn(Warning::TBD);
+        }
+        FileAttributeRecord(set)
+    }
+}
+
+/// The attributes attached to one variable.
+#[derive(Clone, Debug)]
+pub struct VarAttributeSet {
+    /// Long name of the variable.
+    pub long_var_name: Identifier,
+    /// The variable's attributes.
+    pub attributes: AttributeSet,
+}
+
+impl VarAttributeSet {
+    /// Parses `name:` followed by an attribute set that ends at a `/` or at
+    /// the end of the input.
+    ///
+    /// Returns the parsed set and the input that follows it.
+    ///
+    /// # Errors
+    ///
+    /// Fails if there is no `:`, if the name is not a valid identifier, or if
+    /// the attribute set cannot be parsed.
+    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> {
+        let (name, attribute_text) = input.split_once(':').ok_or(Warning::TBD)?;
+        let long_var_name = decoder
+            .new_identifier(name)
+            .map_err(Warning::InvalidAttributeVariableName)?;
+        let (attributes, rest) = AttributeSet::parse(decoder, attribute_text, Some('/'))?;
+        Ok((
+            VarAttributeSet {
+                long_var_name,
+                attributes,
+            },
+            rest,
+        ))
+    }
+}
+
+/// The per-variable attribute sets decoded from a variable attributes record.
+#[derive(Clone, Debug)]
+pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
+
+impl VariableAttributeRecord {
+    /// Decodes the record text and parses variable attribute sets from it
+    /// one after another.
+    ///
+    /// Parsing stops at the first set that fails; the failure is reported
+    /// through the decoder's warning callback and the sets parsed so far are
+    /// kept.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let decoded = decoder.decode(&source.text);
+        let mut input = decoded.as_ref();
+        let mut var_attribute_sets = Vec::new();
+        while !input.is_empty() {
+            match VarAttributeSet::parse(decoder, input).issue_warning(&decoder.warn) {
+                Some((var_attribute, rest)) => {
+                    var_attribute_sets.push(var_attribute);
+                    input = rest;
+                }
+                None => break,
+            }
+        }
+        VariableAttributeRecord(var_attribute_sets)
+    }
+}
+
+/// A mapping from a variable's short name to its long name.
+#[derive(Clone, Debug)]
+pub struct LongName {
+    /// The variable's short name.
+    pub short_name: Identifier,
+    /// The variable's long name.
+    pub long_name: Identifier,
+}
+
+impl LongName {
+    /// Parses a single `short=long` pair.
+    ///
+    /// # Errors
+    ///
+    /// Fails if there is no `=` or if either name is not a valid identifier.
+    /// The short name is checked first.
+    fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+        let (short, long) = input.split_once('=').ok_or(Warning::TBD)?;
+        let short_name = decoder
+            .new_identifier(short)
+            .map_err(Warning::InvalidShortName)?;
+        let long_name = decoder
+            .new_identifier(long)
+            .map_err(Warning::InvalidLongName)?;
+        Ok(Self {
+            short_name,
+            long_name,
+        })
+    }
+}
+
+/// The short-name-to-long-name pairs decoded from a long names record.
+#[derive(Clone, Debug)]
+pub struct LongNamesRecord(Vec<LongName>);
+
+impl LongNamesRecord {
+    /// Decodes the record text and parses its tab-separated `short=long`
+    /// pairs.
+    ///
+    /// Empty pairs are ignored.  Pairs that fail to parse are reported
+    /// through the decoder's warning callback and skipped.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let input = decoder.decode(&source.text);
+        let names = input
+            .split('\t')
+            .filter(|pair| !pair.is_empty())
+            .filter_map(|pair| LongName::parse(pair, decoder).issue_warning(&decoder.warn))
+            .collect();
+        LongNamesRecord(names)
+    }
+}
+
+/// The decoded text of a product info record.
+#[derive(Clone, Debug)]
+pub struct ProductInfoRecord(pub String);
+
+impl ProductInfoRecord {
+    /// Decodes the record's text and stores it as an owned string.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let text = decoder.decode(&source.text);
+        ProductInfoRecord(text.into())
+    }
+}
#[derive(Clone, Debug)]
pub struct VariableSet {
pub name: String,
- pub vars: Vec<String>,
+ pub vars: Vec<Identifier>,
}
impl VariableSet {
- fn parse(input: &str) -> Result<Self, Error> {
- let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
- let vars = input.split_ascii_whitespace().map(String::from).collect();
+ fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+ let (name, input) = input.split_once('=').ok_or(Warning::TBD)?;
+ let mut vars = Vec::new();
+ for var in input.split_ascii_whitespace() {
+ if let Some(identifier) = decoder
+ .new_identifier(var)
+ .map_err(Warning::InvalidVariableSetName)
+ .issue_warning(&decoder.warn)
+ {
+ vars.push(identifier);
+ }
+ }
Ok(VariableSet {
name: name.into(),
vars,
}
impl VariableSetRecord {
- fn decode<'a>(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
+ fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
let mut sets = Vec::new();
let input = decoder.decode(&source.text);
for line in input.lines() {
- if let Some(set) = VariableSet::parse(line).warn_on_error(&decoder.warn) {
+ if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) {
sets.push(set)
}
}
}
}
-trait WarnOnError<T> {
- fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
+/// Turns a fallible value into an `Option<T>`, given a `warn` callback that
+/// accepts a [`Warning`].
+///
+/// Implemented for `Result<T, Warning>`, where `Ok(value)` becomes
+/// `Some(value)`.
+///
+/// NOTE(review): the `Err` arm of that implementation is not visible in this
+/// hunk; presumably it passes the warning to `warn` and yields `None` —
+/// confirm against the full file.
+trait IssueWarning<T> {
+ fn issue_warning<F>(self, warn: &F) -> Option<T>
+ where
+ F: Fn(Warning);
}
-impl<T> WarnOnError<T> for Result<T, Error> {
- fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
+impl<T> IssueWarning<T> for Result<T, Warning> {
+ fn issue_warning<F>(self, warn: &F) -> Option<T>
+ where
+ F: Fn(Warning),
+ {
match self {
Ok(result) => Some(result),
Err(error) => {
}
impl Extension {
- fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
+ fn check_size<E: ExtensionRecord>(&self) -> Result<(), Warning> {
if let Some(expected_size) = E::SIZE {
if self.size != expected_size {
- return Err(Error::BadRecordSize {
+ return Err(Warning::BadRecordSize {
offset: self.offsets.start,
record: E::NAME.into(),
size: self.size,
}
if let Some(expected_count) = E::COUNT {
if self.count != expected_count {
- return Err(Error::BadRecordCount {
+ return Err(Warning::BadRecordCount {
offset: self.offsets.start,
record: E::NAME.into(),
count: self.count,
r: &mut R,
endian: Endian,
n_vars: usize,
- warn: &Box<dyn Fn(Error)>,
+ warn: &dyn Fn(Warning),
) -> Result<Option<Record>, Error> {
let subtype = endian.parse(read_bytes(r)?);
let header_offset = r.stream_position()?;
}
EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
- 5 => Ok(Record::VariableSets(extension.into())),
- 10 => Ok(Record::ProductInfo(extension.into())),
- 13 => Ok(Record::LongNames(extension.into())),
- 14 => Ok(Record::VeryLongStrings(extension.into())),
- 17 => Ok(Record::FileAttributes(extension.into())),
- 18 => Ok(Record::VariableAttributes(extension.into())),
+ 5 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::VariableSets,
+ ))),
+ 10 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::ProductInfo,
+ ))),
+ 13 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::LongNames,
+ ))),
+ 14 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::VeryLongStrings,
+ ))),
+ 17 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::FileAttributes,
+ ))),
+ 18 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::VariableAttributes,
+ ))),
_ => Ok(Record::OtherExtension(extension)),
};
match result {
}
+/// The value labels attached to one long string variable.
+///
+/// `N` is the type of the variable name and `S` the type of each value and
+/// label. The raw form uses `RawString` for both; decoding it produces
+/// `Identifier` for the name and `String` for the values and labels.
#[derive(Clone, Debug)]
-pub struct LongStringValueLabels {
- pub var_name: RawString,
+pub struct LongStringValueLabels<N, S>
+where
+ S: Debug,
+{
+ pub var_name: N,
pub width: u32,
/// `(value, label)` pairs, where each value is `width` bytes.
- pub labels: Vec<(RawString, RawString)>,
+ pub labels: Vec<(S, S)>,
+}
+
+impl LongStringValueLabels<RawString, RawString> {
+    /// Converts this raw label set into its decoded form.
+    ///
+    /// The variable name is decoded, stripped of trailing whitespace, and
+    /// validated as an [`Identifier`] in `decoder.encoding`. Each value is
+    /// decoded with `decode_exact_length` and each label with `decode`;
+    /// `width` is carried over unchanged.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Warning::InvalidLongStringValueLabelName` when the decoded
+    /// variable name is not accepted by `Identifier::new`.
+    fn decode(
+        &self,
+        decoder: &Decoder,
+    ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
+        // Validate the name first so a bad name returns before any label is
+        // decoded.
+        let decoded_name = decoder.decode(&self.var_name);
+        let var_name = Identifier::new(decoded_name.trim_end(), decoder.encoding)
+            .map_err(Warning::InvalidLongStringValueLabelName)?;
+
+        let labels: Vec<(String, String)> = self
+            .labels
+            .iter()
+            .map(|(value, label)| {
+                let value = decoder.decode_exact_length(&value.0).to_string();
+                let label = decoder.decode(label).to_string();
+                (value, label)
+            })
+            .collect();
+
+        Ok(LongStringValueLabels {
+            var_name,
+            width: self.width,
+            labels,
+        })
+    }
}
+/// A list of [`LongStringValueLabels`] sets, generic over the same name type
+/// `N` and string type `S`.
+///
+/// The raw `<RawString, RawString>` form is the extension record with
+/// subtype 21.
#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
+pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
+where
+ N: Debug,
+ S: Debug;
-impl ExtensionRecord for LongStringValueLabelRecord {
+impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
const SUBTYPE: u32 = 21;
const SIZE: Option<u32> = Some(1);
const COUNT: Option<u32> = None;
const NAME: &'static str = "long string value labels record";
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
)))
}
}
+
+impl LongStringValueLabelRecord<RawString, RawString> {
+    /// Decodes every label set in this raw record with `decoder`.
+    ///
+    /// Sets that fail to decode are reported through `decoder.warn` and left
+    /// out; the remaining sets keep their original order.
+    fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord<Identifier, String> {
+        let mut decoded = Vec::with_capacity(self.0.len());
+        decoded.extend(self.0.iter().filter_map(|raw| match raw.decode(decoder) {
+            Ok(set) => Some(set),
+            Err(warning) => {
+                decoder.warn(warning);
+                None
+            }
+        }));
+        LongStringValueLabelRecord(decoded)
+    }
+}