use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat};
use crate::{
- encoding::{get_encoding, Error as EncodingError, default_encoding},
+ encoding::{default_encoding, get_encoding, Error as EncodingError},
endian::Endian,
format::{Error as FormatError, Spec, UncheckedSpec},
identifier::{Error as IdError, Identifier},
#[error("Invalid variable name in attribute record. {0}")]
InvalidAttributeVariableName(IdError),
+ // XXX This is risky because `text` might be arbitrarily long.
+ #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+ MalformedString { encoding: String, text: String },
+
+ #[error("Invalid variable measurement level value {0}")]
+ InvalidMeasurement(u32),
+
+ #[error("Invalid variable display alignment value {0}")]
+ InvalidAlignment(u32),
+
#[error("Details TBD")]
TBD,
}
n_generated_names: usize,
}
-pub fn decode<T>(headers: Vec<raw::Record>, warn: &impl Fn(Error)) -> Result<Vec<Record>, Error> {
+pub fn decode(
+ headers: Vec<raw::Record>,
+ encoding: Option<&'static Encoding>,
+ warn: &impl Fn(Error),
+) -> Result<Vec<Record>, Error> {
let Some(header_record) = headers.iter().find_map(|rec| {
if let raw::Record::Header(header) = rec {
Some(header)
}) else {
return Err(Error::MissingHeaderRecord);
};
- let encoding = headers.iter().find_map(|rec| {
- if let raw::Record::Encoding(ref e) = rec {
- Some(e.0.as_str())
- } else {
- None
- }
- });
- let character_code = headers.iter().find_map(|rec| {
- if let raw::Record::IntegerInfo(ref r) = rec {
- Some(r.character_code)
- } else {
- None
- }
- });
- let encoding = match get_encoding(encoding, character_code) {
- Ok(encoding) => encoding,
- Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)),
- Err(err) => {
- warn(Error::EncodingError(err));
- // Warn that we're using the default encoding.
- default_encoding()
+ let encoding = match encoding {
+ Some(encoding) => encoding,
+ None => {
+ let encoding = headers.iter().find_map(|rec| {
+ if let raw::Record::Encoding(ref e) = rec {
+ Some(e.0.as_str())
+ } else {
+ None
+ }
+ });
+ let character_code = headers.iter().find_map(|rec| {
+ if let raw::Record::IntegerInfo(ref r) = rec {
+ Some(r.character_code)
+ } else {
+ None
+ }
+ });
+ match get_encoding(encoding, character_code) {
+ Ok(encoding) => encoding,
+ Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)),
+ Err(err) => {
+ warn(Error::EncodingError(err));
+ // Warn that we're using the default encoding.
+ default_encoding()
+ }
+ }
}
};
- let decoder = Decoder {
+ let mut decoder = Decoder {
compression: header_record.compression,
endian: header_record.endian,
encoding,
n_generated_names: 0,
};
- unreachable!()
+ let mut output = Vec::with_capacity(headers.len());
+ for header in &headers {
+ match header {
+ raw::Record::Header(ref input) => {
+ if let Some(header) = HeaderRecord::try_decode(&mut decoder, input, warn)? {
+ output.push(Record::Header(header))
+ }
+ }
+ raw::Record::Variable(ref input) => {
+ if let Some(variable) = VariableRecord::try_decode(&mut decoder, input, warn)? {
+ output.push(Record::Variable(variable));
+ }
+ }
+ raw::Record::ValueLabel(ref input) => {
+ if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, input, warn)?
+ {
+ output.push(Record::ValueLabel(value_label));
+ }
+ }
+ raw::Record::Document(ref input) => {
+ if let Some(document) = DocumentRecord::try_decode(&mut decoder, input, warn)? {
+ output.push(Record::Document(document))
+ }
+ }
+ raw::Record::IntegerInfo(ref input) => output.push(Record::IntegerInfo(input.clone())),
+ raw::Record::FloatInfo(ref input) => output.push(Record::FloatInfo(input.clone())),
+ raw::Record::VariableSets(ref input) => {
+ let s = decoder.decode_string_cow(&input.text.0, warn);
+ output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
+ }
+ raw::Record::VarDisplay(ref input) => {
+ if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, input, warn)? {
+ output.push(Record::VarDisplay(vdr))
+ }
+ }
+ raw::Record::MultipleResponse(ref input) => {
+ if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, input, warn)? {
+ output.push(Record::MultipleResponse(mrr))
+ }
+ }
+ raw::Record::LongStringValueLabels(ref input) => {
+ if let Some(mrr) =
+ LongStringValueLabelRecord::try_decode(&mut decoder, input, warn)?
+ {
+ output.push(Record::LongStringValueLabels(mrr))
+ }
+ }
+ raw::Record::Encoding(ref input) => output.push(Record::Encoding(input.clone())),
+ raw::Record::NumberOfCases(ref input) => {
+ output.push(Record::NumberOfCases(input.clone()))
+ }
+ raw::Record::ProductInfo(ref input) => {
+ let s = decoder.decode_string_cow(&input.text.0, warn);
+ output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?));
+ }
+ raw::Record::LongNames(ref input) => {
+ let s = decoder.decode_string_cow(&input.text.0, warn);
+ output.push(Record::LongNames(LongNameRecord::parse(
+ &mut decoder,
+ &s,
+ warn,
+ )?));
+ }
+ raw::Record::VeryLongStrings(ref input) => {
+ let s = decoder.decode_string_cow(&input.text.0, warn);
+ output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
+ &mut decoder,
+ &s,
+ warn,
+ )?));
+ }
+ raw::Record::FileAttributes(ref input) => {
+ let s = decoder.decode_string_cow(&input.text.0, warn);
+ output.push(Record::FileAttributes(FileAttributeRecord::parse(
+ &decoder, &s, warn,
+ )?));
+ }
+ raw::Record::VariableAttributes(ref input) => {
+ let s = decoder.decode_string_cow(&input.text.0, warn);
+ output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
+ &decoder, &s, warn,
+ )?));
+ }
+ raw::Record::OtherExtension(ref input) => {
+ output.push(Record::OtherExtension(input.clone()))
+ }
+ raw::Record::EndOfHeaders(_) => (),
+ raw::Record::ZHeader(_) => (),
+ raw::Record::ZTrailer(_) => (),
+ raw::Record::Case(_) => (),
+ };
+ }
+ Ok(output)
}
impl Decoder {
+ /// Decodes `input` into text using this decoder's encoding, without any
+ /// BOM handling.
+ ///
+ /// If the encoding reports that `input` was malformed, `warn` is called with
+ /// [`Error::MalformedString`], which carries the encoding's name and a copy
+ /// of the decoded text. The decoded text is returned either way.
fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
let (output, malformed) = self.encoding.decode_without_bom_handling(input);
if malformed {
- warn(Error::TBD);
+ warn(Error::MalformedString {
+ encoding: self.encoding.name().into(),
+ text: output.clone().into(),
+ });
}
output
}
Identifier::new(&s, self.encoding)
}
fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
- let max_index = self.n_dict_indexes - 1;
- if dict_index == 0 || dict_index as usize > max_index {
+ let max_index = self.n_dict_indexes;
+ if dict_index == 0 || dict_index > max_index {
return Err(Error::InvalidDictIndex {
dict_index,
max_index,
});
}
- let Some(variable) = self.variables.get(&dict_index) else {
+ let Some(variable) = self.variables.get(&(dict_index - 1)) else {
return Err(Error::DictIndexIsContinuation(dict_index));
};
Ok(variable)
+/// Decoding of one raw record into its cooked form.
+///
+/// Implementors return `Ok(Some(_))` with the decoded record, `Ok(None)` when
+/// the raw record produces no decoded record (the caller then emits nothing
+/// for it), or `Err(_)` when decoding cannot continue.
pub trait TryDecode: Sized {
+ /// The raw record type that `Self` is decoded from.
type Input;
+ /// Decodes `input`, using and possibly updating the state in `decoder`.
+ /// `warn` is the callback for problems that do not abort decoding.
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error>;
+ ) -> Result<Option<Self>, Error>;
}
pub trait Decode<Input>: Sized {
pub file_label: String,
}
+/// Returns `s` with all trailing space characters (`' '`) removed.
+///
+/// Only the space character is stripped, not other kinds of whitespace. The
+/// string is shortened in place with `truncate` rather than copied.
+fn trim_end_spaces(mut s: String) -> String {
+ s.truncate(s.trim_end_matches(' ').len());
+ s
+}
+
impl TryDecode for HeaderRecord {
type Input = crate::raw::HeaderRecord;
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error> {
- let eye_catcher = decoder.decode_string(&input.eye_catcher.0, &warn);
- let file_label = decoder.decode_string(&input.file_label.0, &warn);
+ ) -> Result<Option<Self>, Error> {
+ let eye_catcher = trim_end_spaces(decoder.decode_string(&input.eye_catcher.0, &warn));
+ let file_label = trim_end_spaces(decoder.decode_string(&input.file_label.0, &warn));
let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn);
- let creation_date = NaiveDate::parse_from_str(&creation_date, "%v").unwrap_or_else(|_| {
- warn(Error::InvalidCreationDate {
- creation_date: creation_date.into(),
+ let creation_date =
+ NaiveDate::parse_from_str(&creation_date, "%e %b %Y").unwrap_or_else(|_| {
+ warn(Error::InvalidCreationDate {
+ creation_date: creation_date.into(),
+ });
+ Default::default()
});
- Default::default()
- });
let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn);
let creation_time =
NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
});
Default::default()
});
- Ok(HeaderRecord {
+ Ok(Some(HeaderRecord {
eye_catcher,
weight_index: input.weight_index.map(|n| n as usize),
n_cases: input.n_cases.map(|n| n as u64),
creation: NaiveDateTime::new(creation_date, creation_time),
file_label,
- })
+ }))
}
}
})
}
-impl VariableRecord {
- pub fn decode(
+impl TryDecode for VariableRecord {
+ type Input = raw::VariableRecord;
+
+ fn try_decode(
decoder: &mut Decoder,
input: &crate::raw::VariableRecord,
warn: impl Fn(Error),
})
}
};
- let name = match decoder.decode_identifier(&input.name.0, &warn) {
+ let name = trim_end_spaces(decoder.decode_string(&input.name.0, &warn));
+ let name = match Identifier::new(&name, decoder.encoding) {
Ok(name) => {
if !decoder.var_names.contains_key(&name) {
name
type Input = crate::raw::DocumentRecord;
+ /// Builds a `DocumentRecord` from the raw record's lines.
+ ///
+ /// Each line is passed through `Decoder::decode_string` and then has its
+ /// trailing spaces removed. This implementation never fails and never
+ /// skips the record: it always returns `Ok(Some(_))`.
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error> {
- Ok(DocumentRecord(
+ ) -> Result<Option<Self>, Error> {
+ Ok(Some(DocumentRecord(
input
.lines
.iter()
- .map(|s| decoder.decode_string(&s.0, &warn))
+ .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
.collect(),
- ))
+ )))
}
}
pub variables: Vec<Identifier>,
}
-impl ValueLabelRecord {
- pub fn decode(
+impl TryDecode for ValueLabelRecord {
+ type Input = crate::raw::ValueLabelRecord;
+ fn try_decode(
decoder: &mut Decoder,
- raw_value_label: &crate::raw::ValueLabelRecord,
- dict_indexes: &crate::raw::VarIndexRecord,
+ input: &Self::Input,
warn: impl Fn(Error),
) -> Result<Option<ValueLabelRecord>, Error> {
- let variables: Vec<&Variable> = dict_indexes
+ let variables: Vec<&Variable> = input
.dict_indexes
.iter()
.filter_map(|&dict_index| {
return Ok(None);
}
}
- let labels = raw_value_label
+ let labels = input
.labels
.iter()
.map(|(value, label)| {
Scale,
}
+impl Measure {
+ /// Converts a raw measurement-level code into a `Measure`.
+ ///
+ /// Code 0 yields `Ok(None)` (no measurement level), and codes 1, 2 and 3
+ /// yield nominal, ordinal and scale respectively. Any other code is
+ /// rejected with `Error::InvalidMeasurement` carrying the offending value.
+ fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
+ match source {
+ 0 => Ok(None),
+ 1 => Ok(Some(Measure::Nominal)),
+ 2 => Ok(Some(Measure::Ordinal)),
+ 3 => Ok(Some(Measure::Scale)),
+ _ => Err(Error::InvalidMeasurement(source)),
+ }
+ }
+}
+
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Alignment {
Left,
Center,
}
+impl Alignment {
+    /// Converts the raw alignment code from a variable display record into an
+    /// `Alignment`.
+    ///
+    /// The system file format numbers alignments from zero: 0 is left, 1 is
+    /// right and 2 is centered. Unlike the measurement level, 0 is a real
+    /// alignment here rather than "unspecified", so every valid code yields
+    /// `Ok(Some(_))`. The `Option` in the return type is kept so that callers
+    /// can treat this the same way as `Measure::try_decode`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::InvalidAlignment` carrying the code for any value
+    /// greater than 2.
+    fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
+        match source {
+            0 => Ok(Some(Alignment::Left)),
+            1 => Ok(Some(Alignment::Right)),
+            2 => Ok(Some(Alignment::Center)),
+            _ => Err(Error::InvalidAlignment(source)),
+        }
+    }
+}
+
+/// Display settings for one variable, as decoded from a variable display
+/// record.
#[derive(Clone, Debug)]
pub struct VarDisplay {
+ /// Measurement level, when one could be decoded.
pub measure: Option<Measure>,
- pub width: u32,
- pub align: Option<Alignment>,
+ /// Display width. `None` when the record holds only two values per
+ /// variable and therefore has no width field.
+ pub width: Option<u32>,
+ /// Display alignment, when one could be decoded.
+ pub alignment: Option<Alignment>,
}
+/// A decoded variable display record: a list of per-variable [`VarDisplay`]
+/// settings, in the order they appear in the raw record.
#[derive(Clone, Debug)]
pub struct VarDisplayRecord(pub Vec<VarDisplay>);
+impl TryDecode for VarDisplayRecord {
+ type Input = raw::VarDisplayRecord;
+ /// Splits the raw record's flat list of values into one [`VarDisplay`] per
+ /// variable.
+ ///
+ /// The raw list must hold exactly three values (measure, width,
+ /// alignment) or exactly two values (measure, alignment) for each entry in
+ /// `decoder.variables`; any other length is rejected with `Error::TBD`.
+ /// In the two-value layout every `width` is `None`.
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error> {
+ let n_vars = decoder.variables.len();
+ // Infer the layout from the total length. With no variables and an
+ // empty record the first branch is taken, so `n_per_var` is never 0 and
+ // `chunks` below cannot panic.
+ let n_per_var = if input.0.len() == 3 * n_vars {
+ 3
+ } else if input.0.len() == 2 * n_vars {
+ 2
+ } else {
+ return Err(Error::TBD);
+ };
+
+ let var_displays = input
+ .0
+ .chunks(n_per_var)
+ .map(|chunk| {
+ // The length check above guarantees every chunk is full, so the
+ // indexing here stays in bounds.
+ let (measure, width, alignment) = match n_per_var == 3 {
+ true => (chunk[0], Some(chunk[1]), chunk[2]),
+ false => (chunk[0], None, chunk[1]),
+ };
+ // NOTE(review): `warn_on_error` is defined elsewhere; presumably it
+ // reports an `Err` through `warn` and turns it into `None`, so a bad
+ // code degrades to "unset" instead of failing the record. Confirm.
+ let measure = Measure::try_decode(measure).warn_on_error(&warn).flatten();
+ let alignment = Alignment::try_decode(alignment)
+ .warn_on_error(&warn)
+ .flatten();
+ VarDisplay {
+ measure,
+ width,
+ alignment,
+ }
+ })
+ .collect();
+ Ok(Some(VarDisplayRecord(var_displays)))
+ }
+}
+
#[derive(Clone, Debug)]
pub enum MultipleResponseType {
MultipleDichotomy {
type Input = raw::MultipleResponseRecord;
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error> {
+ ) -> Result<Option<Self>, Error> {
let mut sets = Vec::with_capacity(input.0.len());
for set in &input.0 {
match MultipleResponseSet::decode(decoder, set, &warn) {
Err(error) => warn(error),
}
}
- Ok(MultipleResponseRecord(sets))
+ Ok(Some(MultipleResponseRecord(sets)))
}
}
input: &raw::LongStringValueLabels,
warn: &impl Fn(Error),
) -> Result<Self, Error> {
- let var_name = decoder
- .decode_identifier(&input.var_name.0, warn)
+ let var_name = decoder.decode_string(&input.var_name.0, warn);
+ let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
.map_err(|e| Error::InvalidLongStringValueLabelName(e))?;
let min_width = 9;
type Input = raw::LongStringValueLabelRecord;
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error> {
+ ) -> Result<Option<Self>, Error> {
let mut labels = Vec::with_capacity(input.0.len());
for label in &input.0 {
match LongStringValueLabels::decode(decoder, label, &warn) {
Err(error) => warn(error),
}
}
- Ok(LongStringValueLabelRecord(labels))
+ Ok(Some(LongStringValueLabelRecord(labels)))
}
}
#[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
+ #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
+ ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
+
#[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
Header(HeaderRecord),
Variable(VariableRecord),
ValueLabel(ValueLabelRecord),
- VarIndexes(VarIndexRecord),
Document(DocumentRecord),
IntegerInfo(IntegerInfoRecord),
FloatInfo(FloatInfoRecord),
match rec_type {
2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)),
3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)),
- 4 => Ok(Record::VarIndexes(VarIndexRecord::read(reader, endian)?)),
6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)),
7 => Ok(Extension::read(reader, endian)?),
999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
+/// A value label record together with the variable index record that
+/// follows it: the labels, and the variables they apply to.
#[derive(Clone)]
pub struct ValueLabelRecord {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
+ /// Offset from the start of the file to the start of the value label
+ /// record.
+ pub label_offset: u64,
/// The labels.
pub labels: Vec<(UntypedValue, UnencodedString)>,
+
+ /// Offset from the start of the file to the start of the variable index
+ /// record.
+ pub index_offset: u64,
+
+ /// The 1-based dictionary indexes of the variables that the labels apply to.
+ pub dict_indexes: Vec<u32>,
}
+// Debug output: a `labels:` heading, one `value: label` line per label, then a
+// single `apply to variables #i #j ...` line listing the dictionary indexes.
impl Debug for ValueLabelRecord {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ writeln!(f, "labels: ")?;
for (value, label) in self.labels.iter() {
writeln!(f, "{value:?}: {label:?}")?;
}
+ // No trailing newline is written after the index list.
+ write!(f, "apply to variables")?;
+ for dict_index in self.dict_indexes.iter() {
+ write!(f, " #{dict_index}")?;
+ }
Ok(())
}
}
impl ValueLabelRecord {
/// Maximum number of value labels in a record.
- pub const MAX: u32 = u32::MAX / 8;
+ pub const MAX_LABELS: u32 = u32::MAX / 8;
+
+ /// Maximum number of variable indexes in a record.
+ pub const MAX_INDEXES: u32 = u32::MAX / 8;
fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabelRecord, Error> {
- let offset = r.stream_position()?;
+ let label_offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
- if n > ValueLabelRecord::MAX {
+ if n > Self::MAX_LABELS {
return Err(Error::BadNumberOfValueLabels {
- offset,
+ offset: label_offset,
n,
- max: ValueLabelRecord::MAX,
+ max: Self::MAX_LABELS,
});
}
label.truncate(label_len);
labels.push((value, UnencodedString(label)));
}
- Ok(ValueLabelRecord { offset, labels })
- }
-}
-#[derive(Clone)]
-pub struct VarIndexRecord {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
-
- /// The 1-based indexes of the variable indexes.
- pub dict_indexes: Vec<u32>,
-}
-
-impl Debug for VarIndexRecord {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "apply to variables")?;
- for dict_index in self.dict_indexes.iter() {
- write!(f, " #{dict_index}")?;
+ let index_offset = r.stream_position()?;
+ let rec_type: u32 = endian.parse(read_bytes(r)?);
+ if rec_type != 4 {
+ return Err(Error::ExpectedVarIndexRecord {
+ offset: index_offset,
+ rec_type,
+ });
}
- Ok(())
- }
-}
-impl VarIndexRecord {
- /// Maximum number of variable indexes in a record.
- pub const MAX: u32 = u32::MAX / 8;
-
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexRecord, Error> {
- let offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
- if n > VarIndexRecord::MAX {
+ if n > Self::MAX_INDEXES {
return Err(Error::BadNumberOfVarIndexes {
- offset,
+ offset: index_offset,
n,
- max: VarIndexRecord::MAX,
+ max: Self::MAX_INDEXES,
});
}
let mut dict_indexes = Vec::with_capacity(n as usize);
dict_indexes.push(endian.parse(read_bytes(r)?));
}
- Ok(VarIndexRecord {
- offset,
+ Ok(ValueLabelRecord {
+ label_offset,
+ labels,
+ index_offset,
dict_indexes,
})
}