-use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat};
+use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range};
use crate::{
+ encoding::{default_encoding, get_encoding, Error as EncodingError},
endian::Endian,
format::{Error as FormatError, Spec, UncheckedSpec},
identifier::{Error as IdError, Identifier},
- raw::{self, MissingValues, UnencodedStr, VarType},
+ raw::{self, UnencodedStr, VarType},
};
use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
use encoding_rs::{DecoderResult, Encoding};
#[derive(ThisError, Debug)]
pub enum Error {
- #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
- InvalidVariableWidth { offset: u64, width: i32 },
+ // XXX this is really an internal error and maybe we should change the
+ // interfaces to make it impossible
+ #[error("Missing header record")]
+ MissingHeaderRecord,
+
+ #[error("{0}")]
+ EncodingError(EncodingError),
+
+ #[error("Using default encoding {0}.")]
+ UsingDefaultEncoding(String),
+
+ #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
+ InvalidVariableWidth { offsets: Range<u64>, width: i32 },
#[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
InvalidLongMissingValueFormat,
#[error("Invalid variable name in attribute record. {0}")]
InvalidAttributeVariableName(IdError),
+ // XXX This is risky because `text` might be arbitrarily long.
+ #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+ MalformedString { encoding: String, text: String },
+
+ #[error("Invalid variable measurement level value {0}")]
+ InvalidMeasurement(u32),
+
+ #[error("Invalid variable display alignment value {0}")]
+ InvalidAlignment(u32),
+
#[error("Details TBD")]
TBD,
}
VariableSets(VariableSetRecord),
VarDisplay(VarDisplayRecord),
MultipleResponse(MultipleResponseRecord),
+ LongStringMissingValues(LongStringMissingValuesRecord),
LongStringValueLabels(LongStringValueLabelRecord),
Encoding(EncodingRecord),
NumberOfCases(NumberOfCasesRecord),
VeryLongStrings(VeryLongStringRecord),
FileAttributes(FileAttributeRecord),
VariableAttributes(VariableAttributeRecord),
- //OtherExtension(Extension),
+ OtherExtension(Extension),
//EndOfHeaders(u32),
//ZHeader(ZHeader),
//ZTrailer(ZTrailer),
}
pub use crate::raw::EncodingRecord;
+pub use crate::raw::Extension;
pub use crate::raw::FloatInfoRecord;
pub use crate::raw::IntegerInfoRecord;
pub use crate::raw::NumberOfCasesRecord;
n_generated_names: usize,
}
+/// Decodes the raw records read from a system file's headers into cooked
+/// [`Record`]s.
+///
+/// `headers` must contain a [`raw::Record::Header`]; if it does not, this
+/// returns [`Error::MissingHeaderRecord`].
+///
+/// When `encoding` is `None`, the character encoding is looked up with
+/// `get_encoding` from the first encoding record and the character code of
+/// the first integer-info record, whichever of them are present.  An EBCDIC
+/// result is a hard error.  Any other lookup failure is reported through
+/// `warn`, followed by an [`Error::UsingDefaultEncoding`] warning, and
+/// decoding continues with `default_encoding()`.
+///
+/// Non-fatal problems found while decoding individual records are reported
+/// through `warn`; an `Err` from a record decoder aborts the whole decode.
+/// End-of-headers, ZLIB header/trailer and case records produce no output.
+pub fn decode(
+    headers: Vec<raw::Record>,
+    encoding: Option<&'static Encoding>,
+    warn: &impl Fn(Error),
+) -> Result<Vec<Record>, Error> {
+    let Some(header_record) = headers.iter().find_map(|rec| match rec {
+        raw::Record::Header(header) => Some(header),
+        _ => None,
+    }) else {
+        return Err(Error::MissingHeaderRecord);
+    };
+    let encoding = match encoding {
+        Some(encoding) => encoding,
+        None => {
+            let encoding = headers.iter().find_map(|rec| match rec {
+                raw::Record::Encoding(e) => Some(e.0.as_str()),
+                _ => None,
+            });
+            let character_code = headers.iter().find_map(|rec| match rec {
+                raw::Record::IntegerInfo(r) => Some(r.character_code),
+                _ => None,
+            });
+            match get_encoding(encoding, character_code) {
+                Ok(encoding) => encoding,
+                Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)),
+                Err(err) => {
+                    warn(Error::EncodingError(err));
+                    // Tell the caller which encoding we fell back to, so the
+                    // lookup failure above is not the only diagnostic.
+                    let fallback = default_encoding();
+                    warn(Error::UsingDefaultEncoding(fallback.name().into()));
+                    fallback
+                }
+            }
+        }
+    };
+
+    let mut decoder = Decoder {
+        compression: header_record.compression,
+        endian: header_record.endian,
+        encoding,
+        variables: HashMap::new(),
+        var_names: HashMap::new(),
+        n_dict_indexes: 0,
+        n_generated_names: 0,
+    };
+
+    let mut output = Vec::with_capacity(headers.len());
+    for record in &headers {
+        match record {
+            raw::Record::Header(input) => {
+                if let Some(header) = HeaderRecord::try_decode(&mut decoder, input, warn)? {
+                    output.push(Record::Header(header))
+                }
+            }
+            raw::Record::Variable(input) => {
+                if let Some(variable) = VariableRecord::try_decode(&mut decoder, input, warn)? {
+                    output.push(Record::Variable(variable));
+                }
+            }
+            raw::Record::ValueLabel(input) => {
+                if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, input, warn)?
+                {
+                    output.push(Record::ValueLabel(value_label));
+                }
+            }
+            raw::Record::Document(input) => {
+                if let Some(document) = DocumentRecord::try_decode(&mut decoder, input, warn)? {
+                    output.push(Record::Document(document))
+                }
+            }
+            raw::Record::IntegerInfo(input) => output.push(Record::IntegerInfo(input.clone())),
+            raw::Record::FloatInfo(input) => output.push(Record::FloatInfo(input.clone())),
+            raw::Record::VariableSets(input) => {
+                let s = decoder.decode_string_cow(&input.text.0, warn);
+                output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
+            }
+            raw::Record::VarDisplay(input) => {
+                if let Some(var_display) = VarDisplayRecord::try_decode(&mut decoder, input, warn)?
+                {
+                    output.push(Record::VarDisplay(var_display))
+                }
+            }
+            raw::Record::MultipleResponse(input) => {
+                if let Some(mr_sets) =
+                    MultipleResponseRecord::try_decode(&mut decoder, input, warn)?
+                {
+                    output.push(Record::MultipleResponse(mr_sets))
+                }
+            }
+            raw::Record::LongStringMissingValues(input) => {
+                if let Some(missing_values) =
+                    LongStringMissingValuesRecord::try_decode(&mut decoder, input, warn)?
+                {
+                    output.push(Record::LongStringMissingValues(missing_values))
+                }
+            }
+            raw::Record::LongStringValueLabels(input) => {
+                if let Some(value_labels) =
+                    LongStringValueLabelRecord::try_decode(&mut decoder, input, warn)?
+                {
+                    output.push(Record::LongStringValueLabels(value_labels))
+                }
+            }
+            raw::Record::Encoding(input) => output.push(Record::Encoding(input.clone())),
+            raw::Record::NumberOfCases(input) => {
+                output.push(Record::NumberOfCases(input.clone()))
+            }
+            raw::Record::ProductInfo(input) => {
+                let s = decoder.decode_string_cow(&input.text.0, warn);
+                output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?));
+            }
+            raw::Record::LongNames(input) => {
+                let s = decoder.decode_string_cow(&input.text.0, warn);
+                output.push(Record::LongNames(LongNameRecord::parse(
+                    &mut decoder,
+                    &s,
+                    warn,
+                )?));
+            }
+            raw::Record::VeryLongStrings(input) => {
+                let s = decoder.decode_string_cow(&input.text.0, warn);
+                output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
+                    &decoder, &s, warn,
+                )?));
+            }
+            raw::Record::FileAttributes(input) => {
+                let s = decoder.decode_string_cow(&input.text.0, warn);
+                output.push(Record::FileAttributes(FileAttributeRecord::parse(
+                    &decoder, &s, warn,
+                )?));
+            }
+            raw::Record::VariableAttributes(input) => {
+                let s = decoder.decode_string_cow(&input.text.0, warn);
+                output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
+                    &decoder, &s, warn,
+                )?));
+            }
+            raw::Record::OtherExtension(input) => {
+                output.push(Record::OtherExtension(input.clone()))
+            }
+            // These records carry nothing that belongs in the decoded
+            // header list.
+            raw::Record::EndOfHeaders(_) => (),
+            raw::Record::ZHeader(_) => (),
+            raw::Record::ZTrailer(_) => (),
+            raw::Record::Case(_) => (),
+        };
+    }
+    Ok(output)
+}
+
+
impl Decoder {
fn generate_name(&mut self) -> Identifier {
loop {
+ /// Decodes `input` using this decoder's encoding, without BOM handling,
+ /// and returns the decoded text.
+ ///
+ /// If the encoding reports `input` as malformed, an
+ /// `Error::MalformedString` carrying the encoding name and a copy of the
+ /// decoded text is passed to `warn`; the decoded text is returned either
+ /// way.
fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
let (output, malformed) = self.encoding.decode_without_bom_handling(input);
if malformed {
- warn(Error::TBD);
+ warn(Error::MalformedString {
+ encoding: self.encoding.name().into(),
+ text: output.clone().into(),
+ });
}
output
}
Identifier::new(&s, self.encoding)
}
fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
- let max_index = self.n_dict_indexes - 1;
- if dict_index == 0 || dict_index as usize > max_index {
+ let max_index = self.n_dict_indexes;
+ if dict_index == 0 || dict_index > max_index {
return Err(Error::InvalidDictIndex {
dict_index,
max_index,
});
}
- let Some(variable) = self.variables.get(&dict_index) else {
+ let Some(variable) = self.variables.get(&(dict_index - 1)) else {
return Err(Error::DictIndexIsContinuation(dict_index));
};
Ok(variable)
fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
if let (s, false) = self.encoding.decode_without_bom_handling(input) {
// This is the common case. Usually there will be no errors.
- s.into()
+ s
} else {
// Unusual case. Don't bother to optimize it much.
let mut decoder = self.encoding.new_decoder_without_bom_handling();
+/// Fallible conversion of a raw record (`Input`) into its decoded form.
+///
+/// `try_decode` receives the shared `Decoder` state mutably, the raw input,
+/// and a `warn` callback for problems that do not abort decoding.  It
+/// returns `Ok(Some(_))` with the decoded record, `Ok(None)` when the input
+/// yields no decoded record, or `Err(_)` when decoding cannot proceed.
pub trait TryDecode: Sized {
type Input;
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error>;
+ ) -> Result<Option<Self>, Error>;
}
pub trait Decode<Input>: Sized {
pub file_label: String,
}
+/// Returns `s` with every trailing ASCII space (`' '`) removed.
+///
+/// Only the space character is stripped; other trailing whitespace such as
+/// tabs or newlines is kept.  The string's buffer is shortened in place.
+fn trim_end_spaces(mut s: String) -> String {
+    let kept_len = s.trim_end_matches(' ').len();
+    s.truncate(kept_len);
+    s
+}
+
impl TryDecode for HeaderRecord {
type Input = crate::raw::HeaderRecord;
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error> {
- let eye_catcher = decoder.decode_string(&input.eye_catcher.0, &warn);
- let file_label = decoder.decode_string(&input.file_label.0, &warn);
+ ) -> Result<Option<Self>, Error> {
+ let eye_catcher = trim_end_spaces(decoder.decode_string(&input.eye_catcher.0, &warn));
+ let file_label = trim_end_spaces(decoder.decode_string(&input.file_label.0, &warn));
let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn);
- let creation_date = NaiveDate::parse_from_str(&creation_date, "%v").unwrap_or_else(|_| {
- warn(Error::InvalidCreationDate {
- creation_date: creation_date.into(),
+ let creation_date =
+ NaiveDate::parse_from_str(&creation_date, "%e %b %Y").unwrap_or_else(|_| {
+ warn(Error::InvalidCreationDate {
+ creation_date: creation_date.into(),
+ });
+ Default::default()
});
- Default::default()
- });
let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn);
let creation_time =
NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
});
Default::default()
});
- Ok(HeaderRecord {
+ Ok(Some(HeaderRecord {
eye_catcher,
weight_index: input.weight_index.map(|n| n as usize),
n_cases: input.n_cases.map(|n| n as u64),
creation: NaiveDateTime::new(creation_date, creation_time),
file_label,
- })
+ }))
}
}
pub label: Option<String>,
}
+/// Decoded missing values for a variable: the cooked counterpart of
+/// `raw::MissingValues`, with each raw value converted to a [`Value`].
+#[derive(Clone, Debug)]
+pub struct MissingValues {
+ /// Individual missing values, up to 3 of them.
+ pub values: Vec<Value>,
+
+ /// Optional range of missing values, as a `(low, high)` pair.
+ pub range: Option<(Value, Value)>,
+}
+
+impl Decode<raw::MissingValues> for MissingValues {
+    /// Converts raw missing values into decoded ones by running every
+    /// individual value, and both ends of the optional range, through
+    /// [`Value::decode`].  The `_warn` callback is not used.
+    fn decode(decoder: &Decoder, input: &raw::MissingValues, _warn: impl Fn(Error)) -> Self {
+        let values: Vec<Value> = input
+            .values
+            .iter()
+            .map(|raw_value| Value::decode(raw_value, decoder))
+            .collect();
+        let range = input
+            .range
+            .as_ref()
+            .map(|(low, high)| (Value::decode(low, decoder), Value::decode(high, decoder)));
+        MissingValues { values, range }
+    }
+}
+
fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
UncheckedSpec::try_from(raw)
.and_then(Spec::try_from)
})
}
-impl VariableRecord {
- pub fn decode(
+impl TryDecode for VariableRecord {
+ type Input = raw::VariableRecord;
+
+ fn try_decode(
decoder: &mut Decoder,
input: &crate::raw::VariableRecord,
warn: impl Fn(Error),
-1 => return Ok(None),
_ => {
return Err(Error::InvalidVariableWidth {
- offset: input.offset,
+ offsets: input.offsets.clone(),
width: input.width,
})
}
};
- let name = match decoder.decode_identifier(&input.name.0, &warn) {
+ let name = trim_end_spaces(decoder.decode_string(&input.name.0, &warn));
+ let name = match Identifier::new(&name, decoder.encoding) {
Ok(name) => {
if !decoder.var_names.contains_key(&name) {
name
name,
print_format,
write_format,
- missing_values: input.missing_values.clone(),
+ missing_values: MissingValues::decode(decoder, &input.missing_values, warn),
label,
}))
}
type Input = crate::raw::DocumentRecord;
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error> {
- Ok(DocumentRecord(
+ ) -> Result<Option<Self>, Error> {
+ Ok(Some(DocumentRecord(
input
.lines
.iter()
- .map(|s| decoder.decode_string(&s.0, &warn))
+ .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
.collect(),
- ))
+ )))
}
}
}
impl Value {
- pub fn decode(raw: raw::Value, decoder: &Decoder) -> Self {
+ pub fn decode(raw: &raw::Value, decoder: &Decoder) -> Self {
match raw {
raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
pub variables: Vec<Identifier>,
}
-impl ValueLabelRecord {
- pub fn decode(
+impl TryDecode for ValueLabelRecord {
+ type Input = crate::raw::ValueLabelRecord;
+ fn try_decode(
decoder: &mut Decoder,
- raw_value_label: &crate::raw::ValueLabelRecord,
- dict_indexes: &crate::raw::VarIndexRecord,
+ input: &Self::Input,
warn: impl Fn(Error),
) -> Result<Option<ValueLabelRecord>, Error> {
- let variables: Vec<&Variable> = dict_indexes
+ let variables: Vec<&Variable> = input
.dict_indexes
.iter()
.filter_map(|&dict_index| {
return Ok(None);
}
}
- let labels = raw_value_label
+ let labels = input
.labels
.iter()
.map(|(value, label)| {
let label = decoder.decode_string(&label.0, &warn);
let value = Value::decode(
- raw::Value::from_raw(*value, var_type, decoder.endian),
- &decoder,
+ &raw::Value::from_raw(value, var_type, decoder.endian),
+ decoder,
);
ValueLabel { value, label }
})
impl LongName {
fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
- let short_name = Identifier::new(short_name, decoder.encoding)
- .map_err(|e| Error::InvalidShortName(e))?;
+ let short_name =
+ Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
let long_name =
- Identifier::new(long_name, decoder.encoding).map_err(|e| Error::InvalidLongName(e))?;
+ Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
Ok(LongName {
short_name,
long_name,
let Some((short_name, length)) = input.split_once('=') else {
return Err(Error::TBD);
};
- let short_name = Identifier::new(short_name, decoder.encoding)
- .map_err(|e| Error::InvalidLongStringName(e))?;
+ let short_name =
+ Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
let length: u16 = length.parse().map_err(|_| Error::TBD)?;
if length > VarWidth::MAX_STRING {
return Err(Error::TBD);
}
- Ok(VeryLongString {
- short_name: short_name.into(),
- length,
- })
+ Ok(VeryLongString { short_name, length })
}
}
}
if let Some(rest) = rest.strip_prefix(')') {
let attribute = Identifier::new(name, decoder.encoding)
- .map_err(|e| Error::InvalidAttributeName(e))
+ .map_err(Error::InvalidAttributeName)
.warn_on_error(warn)
.map(|name| Attribute { name, values });
return Ok((attribute, rest));
};
let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
let var_attribute = Identifier::new(long_var_name, decoder.encoding)
- .map_err(|e| Error::InvalidAttributeVariableName(e))
+ .map_err(Error::InvalidAttributeVariableName)
.warn_on_error(warn)
.map(|name| VarAttributeSet {
long_var_name: name,
Scale,
}
+impl Measure {
+    /// Converts the raw measurement-level code `source` into a [`Measure`].
+    ///
+    /// Code 0 yields `Ok(None)`; codes 1, 2 and 3 yield nominal, ordinal and
+    /// scale respectively; any other code is `Error::InvalidMeasurement`.
+    fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
+        let measure = match source {
+            0 => return Ok(None),
+            1 => Measure::Nominal,
+            2 => Measure::Ordinal,
+            3 => Measure::Scale,
+            _ => return Err(Error::InvalidMeasurement(source)),
+        };
+        Ok(Some(measure))
+    }
+}
+
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Alignment {
Left,
Center,
}
+impl Alignment {
+    /// Converts the raw display-alignment code `source` into an
+    /// [`Alignment`].
+    ///
+    /// Code 0 yields `Ok(None)`; codes 1, 2 and 3 yield left, right and
+    /// center respectively; any other code is `Error::InvalidAlignment`.
+    fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
+        let alignment = match source {
+            0 => return Ok(None),
+            1 => Alignment::Left,
+            2 => Alignment::Right,
+            3 => Alignment::Center,
+            _ => return Err(Error::InvalidAlignment(source)),
+        };
+        Ok(Some(alignment))
+    }
+}
+
+/// Display settings for one variable, as decoded from a variable display
+/// record.
#[derive(Clone, Debug)]
pub struct VarDisplay {
+ /// Measurement level; `None` when the raw code is 0.
+ // NOTE(review): an invalid code is reported via `warn` and presumably
+ // also ends up as `None` -- confirm against `warn_on_error`.
pub measure: Option<Measure>,
- pub width: u32,
- pub align: Option<Alignment>,
+ /// Display width; `None` when the record stores only two values per
+ /// variable and therefore carries no width.
+ pub width: Option<u32>,
+ /// Display alignment; `None` when the raw code is 0.
+ pub alignment: Option<Alignment>,
}
+/// Decoded variable display record: one [`VarDisplay`] per group of raw
+/// values in the record, in record order.
#[derive(Clone, Debug)]
pub struct VarDisplayRecord(pub Vec<VarDisplay>);
+impl TryDecode for VarDisplayRecord {
+    type Input = raw::VarDisplayRecord;
+
+    /// Decodes a raw variable display record.
+    ///
+    /// The record must hold either three values (measure, width, alignment)
+    /// or two values (measure, alignment) for each variable currently known
+    /// to `decoder`; any other length is rejected with `Error::TBD`.
+    /// Measure and alignment codes go through `Measure::try_decode` and
+    /// `Alignment::try_decode`, with their errors handed to `warn` rather
+    /// than aborting.
+    fn try_decode(
+        decoder: &mut Decoder,
+        input: &Self::Input,
+        warn: impl Fn(Error),
+    ) -> Result<Option<Self>, Error> {
+        let raw_values = &input.0;
+        let n_vars = decoder.variables.len();
+        let has_width = if raw_values.len() == 3 * n_vars {
+            true
+        } else if raw_values.len() == 2 * n_vars {
+            false
+        } else {
+            return Err(Error::TBD);
+        };
+        let n_per_var = if has_width { 3 } else { 2 };
+
+        let mut var_displays = Vec::new();
+        for chunk in raw_values.chunks(n_per_var) {
+            let (raw_measure, width, raw_alignment) = if has_width {
+                (chunk[0], Some(chunk[1]), chunk[2])
+            } else {
+                (chunk[0], None, chunk[1])
+            };
+            let measure = Measure::try_decode(raw_measure)
+                .warn_on_error(&warn)
+                .flatten();
+            let alignment = Alignment::try_decode(raw_alignment)
+                .warn_on_error(&warn)
+                .flatten();
+            var_displays.push(VarDisplay {
+                measure,
+                width,
+                alignment,
+            });
+        }
+        Ok(Some(VarDisplayRecord(var_displays)))
+    }
+}
+
#[derive(Clone, Debug)]
pub enum MultipleResponseType {
MultipleDichotomy {
) -> Result<Self, Error> {
let mr_set_name = decoder
.decode_identifier(&input.name.0, warn)
- .map_err(|error| Error::InvalidMrSetName(error))?;
+ .map_err(Error::InvalidMrSetName)?;
let label = decoder.decode_string(&input.label.0, warn);
type Input = raw::MultipleResponseRecord;
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error> {
+ ) -> Result<Option<Self>, Error> {
let mut sets = Vec::with_capacity(input.0.len());
for set in &input.0 {
match MultipleResponseSet::decode(decoder, set, &warn) {
Err(error) => warn(error),
}
}
- Ok(MultipleResponseRecord(sets))
+ Ok(Some(MultipleResponseRecord(sets)))
+ }
+}
+
+/// Decoded missing values for one variable, taken from a long string
+/// missing values record.
+#[derive(Clone, Debug)]
+pub struct LongStringMissingValues {
+ /// Name of the variable the missing values apply to.
+ pub var_name: Identifier,
+
+ /// The variable's decoded missing values.
+ pub missing_values: MissingValues,
+}
+
+impl LongStringMissingValues {
+    /// Decodes the missing values recorded for a single variable.
+    ///
+    /// The raw variable name is decoded, stripped of trailing whitespace and
+    /// validated as an [`Identifier`]; an invalid name fails the whole entry.
+    /// The missing values themselves are converted with
+    /// [`MissingValues::decode`].
+    fn decode(
+        decoder: &Decoder,
+        input: &raw::LongStringMissingValues,
+        warn: &impl Fn(Error),
+    ) -> Result<Self, Error> {
+        let raw_name = decoder.decode_string(&input.var_name.0, warn);
+        // NOTE(review): this reuses the long string *value label* error
+        // variant for a missing-values name -- confirm that is intended.
+        let var_name = match Identifier::new(raw_name.trim_end(), decoder.encoding) {
+            Ok(name) => name,
+            Err(error) => return Err(Error::InvalidLongStringValueLabelName(error)),
+        };
+        let missing_values = MissingValues::decode(decoder, &input.missing_values, warn);
+        Ok(LongStringMissingValues {
+            var_name,
+            missing_values,
+        })
+    }
+}
+
+/// Decoded long string missing values record: the missing values of every
+/// variable in the record whose entry decoded successfully.
+///
+/// The inner vector is public, like that of `VarDisplayRecord`, so that
+/// code receiving a `Record::LongStringMissingValues` can read the decoded
+/// entries.
+#[derive(Clone, Debug)]
+pub struct LongStringMissingValuesRecord(pub Vec<LongStringMissingValues>);
+
+impl TryDecode for LongStringMissingValuesRecord {
+    type Input = raw::LongStringMissingValueSet;
+
+    /// Decodes every per-variable entry of a raw long string missing values
+    /// record.
+    ///
+    /// An entry that fails to decode is reported through `warn` and left
+    /// out; the remaining entries are kept in their original order.  This
+    /// never returns `Err` or `Ok(None)`.
+    fn try_decode(
+        decoder: &mut Decoder,
+        input: &Self::Input,
+        warn: impl Fn(Error),
+    ) -> Result<Option<Self>, Error> {
+        // Only shared access is needed from here on.
+        let decoder: &Decoder = decoder;
+        let missing_values = input
+            .0
+            .iter()
+            .filter_map(
+                |raw_entry| match LongStringMissingValues::decode(decoder, raw_entry, &warn) {
+                    Ok(entry) => Some(entry),
+                    Err(error) => {
+                        warn(error);
+                        None
+                    }
+                },
+            )
+            .collect();
+        Ok(Some(LongStringMissingValuesRecord(missing_values)))
}
}
input: &raw::LongStringValueLabels,
warn: &impl Fn(Error),
) -> Result<Self, Error> {
- let var_name = decoder
- .decode_identifier(&input.var_name.0, warn)
- .map_err(|e| Error::InvalidLongStringValueLabelName(e))?;
+ let var_name = decoder.decode_string(&input.var_name.0, warn);
+ let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
+ .map_err(Error::InvalidLongStringValueLabelName)?;
let min_width = 9;
let max_width = VarWidth::MAX_STRING;
if input.width < 9 || input.width > max_width as u32 {
return Err(Error::InvalidLongValueLabelWidth {
- name: var_name.into(),
+ name: var_name,
width: input.width,
min_width,
max_width,
type Input = raw::LongStringValueLabelRecord;
fn try_decode(
- decoder: &Decoder,
+ decoder: &mut Decoder,
input: &Self::Input,
warn: impl Fn(Error),
- ) -> Result<Self, Error> {
+ ) -> Result<Option<Self>, Error> {
let mut labels = Vec::with_capacity(input.0.len());
for label in &input.0 {
match LongStringValueLabels::decode(decoder, label, &warn) {
Err(error) => warn(error),
}
}
- Ok(LongStringValueLabelRecord(labels))
+ Ok(Some(LongStringValueLabelRecord(labels)))
}
}