-use std::{borrow::Cow, collections::{HashSet, HashMap}, cmp::Ordering};
+use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat};
-use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
-use encoding_rs::Encoding;
-use num::integer::div_ceil;
use crate::{
- format::{Spec, UncheckedSpec},
+ format::{Error as FormatError, Spec, UncheckedSpec},
identifier::{Error as IdError, Identifier},
raw::{self, MissingValues, VarType},
{endian::Endian, Compression},
};
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use encoding_rs::{DecoderResult, Encoding};
+use num::integer::div_ceil;
use thiserror::Error as ThisError;
#[derive(ThisError, Debug)]
pub enum Error {
#[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
- BadVariableWidth { offset: u64, width: i32 },
+ InvalidVariableWidth { offset: u64, width: i32 },
#[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
- BadLongMissingValueFormat,
+ InvalidLongMissingValueFormat,
#[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
InvalidCreationDate { creation_date: String },
#[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
InvalidCreationTime { creation_time: String },
- #[error("Invalid variable name: {0}")]
- BadIdentifier(#[from] IdError),
+ #[error("{id_error} Renaming variable to {new_name}.")]
+ InvalidVariableName {
+ id_error: IdError,
+ new_name: Identifier,
+ },
+
+ #[error(
+ "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
+ )]
+ InvalidPrintFormat {
+ new_spec: Spec,
+ variable: Identifier,
+ format_error: FormatError,
+ },
+
+ #[error(
+ "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
+ )]
+ InvalidWriteFormat {
+ new_spec: Spec,
+ variable: Identifier,
+ format_error: FormatError,
+ },
+
+ #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
+ DuplicateVariableName {
+ duplicate_name: Identifier,
+ new_name: Identifier,
+ },
+
+ #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
+ InvalidDictIndex { dict_index: usize, max_index: usize },
+
+ #[error("Dictionary index {0} refers to a long string continuation.")]
+ DictIndexIsContinuation(usize),
+
+ #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
+ ValueLabelsDifferentTypes {
+ numeric_var: Identifier,
+ string_var: Identifier,
+ },
+
+ #[error(
+ "Value labels may not be added to long string variable {0} using record types 3 or 4."
+ )]
+ InvalidLongStringValueLabel(Identifier),
#[error("Details TBD")]
TBD,
}
+type DictIndex = usize;
+
+pub struct Variable {
+ pub dict_index: DictIndex,
+ pub short_name: Identifier,
+ pub long_name: Option<Identifier>,
+ pub width: VarWidth,
+}
+
pub struct Decoder {
pub compression: Option<Compression>,
pub endian: Endian,
pub encoding: &'static Encoding,
- pub var_names: HashSet<Identifier>,
- pub dict_indexes: HashMap<usize, Identifier>,
+ pub variables: HashMap<DictIndex, Variable>,
+ pub var_names: HashMap<Identifier, DictIndex>,
n_dict_indexes: usize,
n_generated_names: usize,
}
impl Decoder {
- fn take_name(&mut self, id: &Identifier) -> bool {
- self.var_names.insert(id.clone())
- }
fn generate_name(&mut self) -> Identifier {
loop {
self.n_generated_names += 1;
let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
.unwrap();
- if self.take_name(&name) {
+ if !self.var_names.contains_key(&name) {
return name;
}
assert!(self.n_generated_names < usize::MAX);
}
}
- fn take_dict_indexes(&mut self, id: &Identifier, width: VarWidth) -> usize {
- let n = match width {
- VarWidth::Numeric => 1,
- VarWidth::String(w) => div_ceil(w as usize, 8),
- };
- let dict_index = self.n_dict_indexes;
- self.dict_indexes.insert(self.n_dict_indexes, id.clone());
- self.n_dict_indexes += n;
- dict_index
-
- }
fn decode_string<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
let (output, malformed) = self.encoding.decode_without_bom_handling(input);
if malformed {
}
output
}
+ fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
+ let max_index = self.n_dict_indexes - 1;
+ if dict_index == 0 || dict_index as usize > max_index {
+ return Err(Error::InvalidDictIndex {
+ dict_index,
+ max_index,
+ });
+ }
+ let Some(variable) = self.variables.get(&dict_index) else {
+ return Err(Error::DictIndexIsContinuation(dict_index));
+ };
+ Ok(variable)
+ }
+
+ /// Returns `input` decoded from `self.encoding` into UTF-8 such that
+ /// re-encoding the result back into `self.encoding` will have exactly the
+ /// same length in bytes.
+ ///
+ /// XXX warn about errors?
+ fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+ if let (s, false) = self.encoding.decode_without_bom_handling(input) {
+ // This is the common case. Usually there will be no errors.
+ s.into()
+ } else {
+ // Unusual case. Don't bother to optimize it much.
+ let mut decoder = self.encoding.new_decoder_without_bom_handling();
+ let mut output = String::with_capacity(
+ decoder
+ .max_utf8_buffer_length_without_replacement(input.len())
+ .unwrap(),
+ );
+ let mut rest = input;
+ while !rest.is_empty() {
+ match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
+ (DecoderResult::InputEmpty, _) => break,
+ (DecoderResult::OutputFull, _) => unreachable!(),
+ (DecoderResult::Malformed(a, b), consumed) => {
+ let skipped = a as usize + b as usize;
+ output.extend(repeat('?').take(skipped));
+ rest = &rest[consumed..];
+ }
+ }
+ }
+ assert_eq!(self.encoding.encode(&output).0.len(), input.len());
+ output.into()
+ }
+ }
}
pub trait Decode: Sized {
}
}
+impl VarWidth {
+ fn n_dict_indexes(self) -> usize {
+ match self {
+ VarWidth::Numeric => 1,
+ VarWidth::String(w) => div_ceil(w as usize, 8),
+ }
+ }
+}
+
impl From<VarWidth> for VarType {
fn from(source: VarWidth) -> Self {
match source {
}
}
-pub struct Variable {
+pub struct VariableRecord {
pub width: VarWidth,
pub name: Identifier,
pub print_format: Spec,
pub label: Option<String>,
}
-fn decode_format(raw: raw::Spec, name: &str, width: VarWidth) -> Spec {
+fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
UncheckedSpec::try_from(raw)
.and_then(Spec::try_from)
- .and_then(|x| x.check_width_compatibility(Some(name), width))
- .unwrap_or_else(|_warning| {
- /*warn(warning);*/
- Spec::default_for_width(width)
+ .and_then(|x| x.check_width_compatibility(width))
+ .unwrap_or_else(|error| {
+ let new_format = Spec::default_for_width(width);
+ warn(new_format, error);
+ new_format
})
}
-impl Variable {
+impl VariableRecord {
pub fn decode(
decoder: &mut Decoder,
input: &crate::raw::Variable,
warn: impl Fn(Error),
- ) -> Result<Option<Variable>, Error> {
+ ) -> Result<Option<VariableRecord>, Error> {
let width = match input.width {
0 => VarWidth::Numeric,
w @ 1..=255 => VarWidth::String(w as u16),
-1 => return Ok(None),
_ => {
- return Err(Error::BadVariableWidth {
+ return Err(Error::InvalidVariableWidth {
offset: input.offset,
width: input.width,
})
let name = decoder.decode_string(&input.name.0, &warn);
let name = match Identifier::new(&name, decoder.encoding) {
Ok(name) => {
- if !decoder.take_name(&name) {
- decoder.generate_name()
- } else {
+ if !decoder.var_names.contains_key(&name) {
name
+ } else {
+ let new_name = decoder.generate_name();
+ warn(Error::DuplicateVariableName {
+ duplicate_name: name.clone(),
+ new_name: new_name.clone(),
+ });
+ new_name
}
}
- Err(error) => {
- warn(error.into());
- decoder.generate_name()
+ Err(id_error) => {
+ let new_name = decoder.generate_name();
+ warn(Error::InvalidVariableName {
+ id_error,
+ new_name: new_name.clone(),
+ });
+ new_name
}
};
- let print_format = decode_format(input.print_format, &name.0, width);
- let write_format = decode_format(input.write_format, &name.0, width);
+ let variable = Variable {
+ dict_index: decoder.n_dict_indexes,
+ short_name: name.clone(),
+ long_name: None,
+ width,
+ };
+ decoder.n_dict_indexes += width.n_dict_indexes();
+ assert!(decoder
+ .var_names
+ .insert(name.clone(), variable.dict_index)
+ .is_none());
+ assert!(decoder
+ .variables
+ .insert(variable.dict_index, variable)
+ .is_none());
+
+ let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
+ warn(Error::InvalidPrintFormat {
+ new_spec,
+ variable: name.clone(),
+ format_error,
+ })
+ });
+ let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
+ warn(Error::InvalidWriteFormat {
+ new_spec,
+ variable: name.clone(),
+ format_error,
+ })
+ });
let label = input
.label
.as_ref()
.map(|label| decoder.decode_string(&label.0, &warn).into());
- decoder.take_dict_indexes(&name, width);
- Ok(Some(Variable {
+ Ok(Some(VariableRecord {
width,
name,
print_format,
}
}
-/*
+#[derive(Clone)]
+pub enum Value {
+ Number(Option<f64>),
+ String(String),
+}
+
+impl Value {
+ pub fn decode(raw: raw::Value, decoder: &Decoder) -> Self {
+ match raw {
+ raw::Value::Number(x) => Value::Number(x),
+ raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
+ }
+ }
+}
+
pub struct ValueLabelRecord {
- pub labels: Vec<(
+ pub var_type: VarType,
+ pub labels: Vec<(Value, String)>,
+ pub variables: Vec<Identifier>,
+}
+
+trait WarnOnError<T> {
+ fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
+}
+impl<T> WarnOnError<T> for Result<T, Error> {
+ fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
+ match self {
+ Ok(result) => Some(result),
+ Err(error) => {
+ warn(error);
+ None
+ }
+ }
+ }
+}
+
+impl ValueLabelRecord {
+ pub fn decode(
+ decoder: &mut Decoder,
+ raw_value_label: &crate::raw::ValueLabel,
+ dict_indexes: &crate::raw::VarIndexes,
+ warn: impl Fn(Error),
+ ) -> Result<Option<ValueLabelRecord>, Error> {
+ let variables: Vec<&Variable> = dict_indexes
+ .dict_indexes
+ .iter()
+ .filter_map(|&dict_index| {
+ decoder
+ .get_var_by_index(dict_index as usize)
+ .warn_on_error(&warn)
+ })
+ .filter(|&variable| match variable.width {
+ VarWidth::String(width) if width > 8 => {
+ warn(Error::InvalidLongStringValueLabel(
+ variable.short_name.clone(),
+ ));
+ false
+ }
+ _ => true,
+ })
+ .collect();
+ let mut i = variables.iter();
+ let Some(&first_var) = i.next() else {
+ return Ok(None);
+ };
+ let var_type: VarType = first_var.width.into();
+ for &variable in i {
+ let this_type: VarType = variable.width.into();
+ if var_type != this_type {
+ let (numeric_var, string_var) = match var_type {
+ VarType::Numeric => (first_var, variable),
+ VarType::String => (variable, first_var),
+ };
+ warn(Error::ValueLabelsDifferentTypes {
+ numeric_var: numeric_var.short_name.clone(),
+ string_var: string_var.short_name.clone(),
+ });
+ return Ok(None);
+ }
+ }
+ let labels = raw_value_label
+ .labels
+ .iter()
+ .map(|(value, label)| {
+ let label = decoder.decode_string(&label.0, &warn);
+ let value = Value::decode(raw::Value::from_raw(*value, var_type, decoder.endian), &decoder);
+ (value, label.into())
+ })
+ .collect();
+ let variables = variables
+ .iter()
+ .map(|&variable| variable.short_name.clone())
+ .collect();
+ Ok(Some(ValueLabelRecord {
+ var_type,
+ labels,
+ variables,
+ }))
+ }
}
-*/
+
pub struct VariableSetRecord(Vec<VariableSet>);
impl TextRecord for VariableSetRecord {
fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
let mut sets = Vec::new();
for line in input.lines() {
- match VariableSet::parse(line) {
- Ok(set) => sets.push(set),
- Err(error) => warn(error),
+ if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
+ sets.push(set)
}
}
Ok(VariableSetRecord(sets))
.map(|s| s.trim_end_matches('\t'))
.filter(|s| !s.is_empty())
{
- match VeryLongString::parse(tuple) {
- Ok(vls) => very_long_strings.push(vls),
- Err(error) => warn(error),
+ if let Some(vls) = VeryLongString::parse(tuple).warn_on_error(&warn) {
+ very_long_strings.push(vls)
}
}
Ok(VeryLongStringRecord(very_long_strings))
fn parse(mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
let mut var_attribute_sets = Vec::new();
while !input.is_empty() {
- match VarAttributeSet::parse(input, &warn) {
- Ok((var_attribute, rest)) => {
- var_attribute_sets.push(var_attribute);
- input = rest;
- }
- Err(error) => {
- warn(error);
- break;
- }
- }
+ let Some((var_attribute, rest)) =
+ VarAttributeSet::parse(input, &warn).warn_on_error(&warn)
+ else {
+ break;
+ };
+ var_attribute_sets.push(var_attribute);
+ input = rest;
}
Ok(VariableAttributeRecord(var_attribute_sets))
}
}
pub struct VarDisplayRecord(pub Vec<VarDisplay>);
+
+#[cfg(test)]
+mod test {
+ use encoding_rs::WINDOWS_1252;
+
+ #[test]
+ fn test() {
+ let mut s = String::new();
+ s.push(char::REPLACEMENT_CHARACTER);
+ let encoded = WINDOWS_1252.encode(&s).0;
+ let decoded = WINDOWS_1252.decode(&encoded[..]).0;
+ println!("{:?}", decoded);
+ }
+
+ #[test]
+ fn test2() {
+ let charset: Vec<u8> = (0..=255).collect();
+ println!("{}", charset.len());
+ let decoded = WINDOWS_1252.decode(&charset[..]).0;
+ println!("{}", decoded.len());
+ let encoded = WINDOWS_1252.encode(&decoded[..]).0;
+ println!("{}", encoded.len());
+ assert_eq!(&charset[..], &encoded[..]);
+ }
+}
#[derive(ThisError, Debug)]
pub enum Error {
- #[error("Unknown format type {value}")]
+ #[error("Unknown format type {value}.")]
UnknownFormat { value: u16 },
#[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.format)]
#[error("Numeric variable is not compatible with string format {0}.")]
UnnamedVariableNotCompatibleWithStringFormat(Format),
- #[error("String variable {variable} is not compatible with numeric format {format}.")]
- NamedVariableNotCompatibleWithNumericFormat { variable: String, format: Format },
-
- #[error("Numeric variable {variable} is not compatible with string format {format}.")]
- NamedVariableNotCompatibleWithStringFormat { variable: String, format: Format },
-
#[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
NamedStringVariableBadSpecWidth {
variable: String,
/// Checks whether this format is valid for a variable with the given
/// `var_type`.
- pub fn check_type_compatibility(
- self,
- variable: Option<&str>,
- var_type: VarType,
- ) -> Result<(), Error> {
+ pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> {
let my_type = self.var_type();
match (my_type, var_type) {
(VarType::Numeric, VarType::String) => {
- if let Some(variable) = variable {
- Err(Error::NamedVariableNotCompatibleWithNumericFormat {
- variable: variable.into(),
- format: self,
- })
- } else {
- Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self))
- }
+ Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self))
}
(VarType::String, VarType::Numeric) => {
- if let Some(variable) = variable {
- Err(Error::NamedVariableNotCompatibleWithStringFormat {
- variable: variable.into(),
- format: self,
- })
- } else {
- Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self))
- }
+ Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self))
}
_ => Ok(()),
}
/// Checks whether this format specification is valid for a variable with
/// width `var_width`.
- pub fn check_width_compatibility(
- self,
- variable: Option<&str>,
- var_width: VarWidth,
- ) -> Result<Self, Error> {
+ pub fn check_width_compatibility(self, var_width: VarWidth) -> Result<Self, Error> {
// Verify that the format is right for the variable's type.
self.format
- .check_type_compatibility(variable, var_width.into())?;
+ .check_type_compatibility(var_width.into())?;
if let VarWidth::String(w) = var_width {
if var_width != self.var_width() {
} else {
Spec { w: w * 2, ..self }
};
- if let Some(variable) = variable {
- return Err(Error::NamedStringVariableBadSpecWidth {
- variable: variable.into(),
- width: w,
- bad_spec,
- good_spec,
- });
- } else {
- return Err(Error::UnnamedStringVariableBadSpecWidth {
- width: w,
- bad_spec,
- good_spec,
- });
- }
+ return Err(Error::UnnamedStringVariableBadSpecWidth {
+ width: w,
+ bad_spec,
+ good_spec,
+ });
}
}