1 use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat};
4 format::{Error as FormatError, Spec, UncheckedSpec},
5 identifier::{Error as IdError, Identifier},
6 raw::{self, MissingValues, VarType},
8 {endian::Endian, Compression},
10 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
11 use encoding_rs::{DecoderResult, Encoding};
12 use num::integer::div_ceil;
13 use ordered_float::OrderedFloat;
14 use thiserror::Error as ThisError;
16 #[derive(ThisError, Debug)]
18 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
19 InvalidVariableWidth { offset: u64, width: i32 },
21 #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
22 InvalidLongMissingValueFormat,
24 #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
25 InvalidCreationDate { creation_date: String },
27 #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
28 InvalidCreationTime { creation_time: String },
30 #[error("{id_error} Renaming variable to {new_name}.")]
37 "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
42 format_error: FormatError,
46 "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
51 format_error: FormatError,
54 #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
55 DuplicateVariableName {
56 duplicate_name: Identifier,
60 #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
61 InvalidDictIndex { dict_index: usize, max_index: usize },
63 #[error("Dictionary index {0} refers to a long string continuation.")]
64 DictIndexIsContinuation(usize),
66 #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
67 ValueLabelsDifferentTypes {
68 numeric_var: Identifier,
69 string_var: Identifier,
73 "Value labels may not be added to long string variable {0} using record types 3 or 4."
75 InvalidLongStringValueLabel(Identifier),
77 #[error("Invalid multiple response set name. {0}")]
78 InvalidMrSetName(#[from] IdError),
80 #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
81 UnknownMrSetVariable {
83 short_name: Identifier,
86 #[error("Multiple response set {0} has no variables.")]
87 EmptyMrSet(Identifier),
89 #[error("Multiple response set {0} has only one variable.")]
90 OneVarMrSet(Identifier),
92 #[error("Multiple response set {0} contains both string and numeric variables.")]
93 MixedMrSet(Identifier),
95 #[error("Details TBD")]
99 #[derive(Clone, Debug)]
101 Header(HeaderRecord),
102 Variable(VariableRecord),
103 ValueLabel(ValueLabelRecord),
104 Document(DocumentRecord),
105 IntegerInfo(IntegerInfoRecord),
106 FloatInfo(FloatInfoRecord),
107 VariableSets(VariableSetRecord),
108 VarDisplay(VarDisplayRecord),
109 MultipleResponse(MultipleResponseRecord),
110 //LongStringValueLabels(LongStringValueLabelRecord),
111 Encoding(EncodingRecord),
112 NumberOfCases(NumberOfCasesRecord),
113 ProductInfo(ProductInfoRecord),
114 //LongNames(UnencodedString),
115 //LongStrings(UnencodedString),
116 //FileAttributes(UnencodedString),
117 //VariableAttributes(UnencodedString),
118 //OtherExtension(Extension),
121 //ZTrailer(ZTrailer),
125 pub use crate::raw::EncodingRecord;
126 pub use crate::raw::FloatInfoRecord;
127 pub use crate::raw::IntegerInfoRecord;
128 pub use crate::raw::NumberOfCasesRecord;
130 type DictIndex = usize;
132 pub struct Variable {
133 pub dict_index: DictIndex,
134 pub short_name: Identifier,
135 pub long_name: Option<Identifier>,
140 pub compression: Option<Compression>,
142 pub encoding: &'static Encoding,
143 pub variables: HashMap<DictIndex, Variable>,
144 pub var_names: HashMap<Identifier, DictIndex>,
145 n_dict_indexes: usize,
146 n_generated_names: usize,
150 fn generate_name(&mut self) -> Identifier {
152 self.n_generated_names += 1;
153 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
155 if !self.var_names.contains_key(&name) {
158 assert!(self.n_generated_names < usize::MAX);
161 fn decode_string<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
162 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
168 pub fn decode_identifier(
171 warn: &impl Fn(Error),
172 ) -> Result<Identifier, IdError> {
173 let s = self.decode_string(input, warn);
174 Identifier::new(&s, self.encoding)
176 fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
177 let max_index = self.n_dict_indexes - 1;
178 if dict_index == 0 || dict_index as usize > max_index {
179 return Err(Error::InvalidDictIndex {
184 let Some(variable) = self.variables.get(&dict_index) else {
185 return Err(Error::DictIndexIsContinuation(dict_index));
190 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
191 /// re-encoding the result back into `self.encoding` will have exactly the
192 /// same length in bytes.
194 /// XXX warn about errors?
195 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
196 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
197 // This is the common case. Usually there will be no errors.
200 // Unusual case. Don't bother to optimize it much.
201 let mut decoder = self.encoding.new_decoder_without_bom_handling();
202 let mut output = String::with_capacity(
204 .max_utf8_buffer_length_without_replacement(input.len())
207 let mut rest = input;
208 while !rest.is_empty() {
209 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
210 (DecoderResult::InputEmpty, _) => break,
211 (DecoderResult::OutputFull, _) => unreachable!(),
212 (DecoderResult::Malformed(a, b), consumed) => {
213 let skipped = a as usize + b as usize;
214 output.extend(repeat('?').take(skipped));
215 rest = &rest[consumed..];
219 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
225 pub trait Decode: Sized {
227 fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error>;
230 #[derive(Clone, Debug)]
231 pub struct HeaderRecord {
232 pub eye_catcher: String,
233 pub weight_index: Option<usize>,
234 pub n_cases: Option<u64>,
235 pub creation: NaiveDateTime,
236 pub file_label: String,
239 impl Decode for HeaderRecord {
240 type Input = crate::raw::HeaderRecord;
242 fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error> {
243 let eye_catcher = decoder.decode_string(&input.eye_catcher.0, &warn);
244 let file_label = decoder.decode_string(&input.file_label.0, &warn);
245 let creation_date = decoder.decode_string(&input.creation_date.0, &warn);
246 let creation_date = NaiveDate::parse_from_str(&creation_date, "%v").unwrap_or_else(|_| {
247 warn(Error::InvalidCreationDate {
248 creation_date: creation_date.into(),
252 let creation_time = decoder.decode_string(&input.creation_time.0, &warn);
254 NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
255 warn(Error::InvalidCreationTime {
256 creation_time: creation_time.into(),
261 eye_catcher: eye_catcher.into(),
262 weight_index: input.weight_index.map(|n| n as usize),
263 n_cases: input.n_cases.map(|n| n as u64),
264 creation: NaiveDateTime::new(creation_date, creation_time),
265 file_label: file_label.into(),
/// The width of a variable: numeric, or a string of a given byte width.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VarWidth {
    Numeric,
    String(u16),
}

impl PartialOrd for VarWidth {
    /// Numeric widths compare equal, string widths compare by length, and a
    /// numeric width is incomparable with a string width.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        match (self, other) {
            (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
            (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
            _ => None,
        }
    }
}

impl VarWidth {
    /// Number of dictionary indexes a variable of this width occupies: one for
    /// a numeric variable, one per 8 bytes (rounded up) for a string.
    fn n_dict_indexes(self) -> usize {
        match self {
            VarWidth::Numeric => 1,
            VarWidth::String(w) => usize::from(w).div_ceil(8),
        }
    }

    /// Returns the wider of `self` and `other`:
    /// - Numerical variable widths are equally wide.
    /// - Longer strings are wider than shorter strings.
    /// - Numerical and string types are incomparable, so result in `None`.
    /// - Any `None` in the input yields `None` in the output.
    pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
        match (a, b) {
            (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
            (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
                Some(VarWidth::String(a.max(b)))
            }
            _ => None,
        }
    }
}
310 impl From<VarWidth> for VarType {
311 fn from(source: VarWidth) -> Self {
313 VarWidth::Numeric => VarType::Numeric,
314 VarWidth::String(_) => VarType::String,
319 #[derive(Clone, Debug)]
320 pub struct VariableRecord {
322 pub name: Identifier,
323 pub print_format: Spec,
324 pub write_format: Spec,
325 pub missing_values: MissingValues,
326 pub label: Option<String>,
329 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
330 UncheckedSpec::try_from(raw)
331 .and_then(Spec::try_from)
332 .and_then(|x| x.check_width_compatibility(width))
333 .unwrap_or_else(|error| {
334 let new_format = Spec::default_for_width(width);
335 warn(new_format, error);
340 impl VariableRecord {
342 decoder: &mut Decoder,
343 input: &crate::raw::VariableRecord,
344 warn: impl Fn(Error),
345 ) -> Result<Option<VariableRecord>, Error> {
346 let width = match input.width {
347 0 => VarWidth::Numeric,
348 w @ 1..=255 => VarWidth::String(w as u16),
349 -1 => return Ok(None),
351 return Err(Error::InvalidVariableWidth {
352 offset: input.offset,
357 let name = match decoder.decode_identifier(&input.name.0, &warn) {
359 if !decoder.var_names.contains_key(&name) {
362 let new_name = decoder.generate_name();
363 warn(Error::DuplicateVariableName {
364 duplicate_name: name.clone(),
365 new_name: new_name.clone(),
371 let new_name = decoder.generate_name();
372 warn(Error::InvalidVariableName {
374 new_name: new_name.clone(),
379 let variable = Variable {
380 dict_index: decoder.n_dict_indexes,
381 short_name: name.clone(),
385 decoder.n_dict_indexes += width.n_dict_indexes();
388 .insert(name.clone(), variable.dict_index)
392 .insert(variable.dict_index, variable)
395 let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
396 warn(Error::InvalidPrintFormat {
398 variable: name.clone(),
402 let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
403 warn(Error::InvalidWriteFormat {
405 variable: name.clone(),
412 .map(|label| decoder.decode_string(&label.0, &warn).into());
413 Ok(Some(VariableRecord {
418 missing_values: input.missing_values.clone(),
424 #[derive(Clone, Debug)]
425 pub struct DocumentRecord(Vec<String>);
427 impl Decode for DocumentRecord {
428 type Input = crate::raw::DocumentRecord;
430 fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error> {
435 .map(|s| decoder.decode_string(&s.0, &warn).into())
445 const NAME: &'static str;
446 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
449 #[derive(Clone, Debug)]
450 pub struct VariableSet {
452 pub vars: Vec<String>,
456 fn parse(input: &str) -> Result<Self, Error> {
457 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
458 let vars = input.split_ascii_whitespace().map(String::from).collect();
466 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
468 Number(Option<OrderedFloat<f64>>),
473 pub fn decode(raw: raw::Value, decoder: &Decoder) -> Self {
475 raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
476 raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
481 #[derive(Clone, Debug)]
482 pub struct ValueLabelRecord {
483 pub var_type: VarType,
484 pub labels: Vec<(Value, String)>,
485 pub variables: Vec<Identifier>,
488 trait WarnOnError<T> {
489 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
491 impl<T> WarnOnError<T> for Result<T, Error> {
492 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
494 Ok(result) => Some(result),
503 impl ValueLabelRecord {
505 decoder: &mut Decoder,
506 raw_value_label: &crate::raw::ValueLabelRecord,
507 dict_indexes: &crate::raw::VarIndexRecord,
508 warn: impl Fn(Error),
509 ) -> Result<Option<ValueLabelRecord>, Error> {
510 let variables: Vec<&Variable> = dict_indexes
513 .filter_map(|&dict_index| {
515 .get_var_by_index(dict_index as usize)
516 .warn_on_error(&warn)
518 .filter(|&variable| match variable.width {
519 VarWidth::String(width) if width > 8 => {
520 warn(Error::InvalidLongStringValueLabel(
521 variable.short_name.clone(),
528 let mut i = variables.iter();
529 let Some(&first_var) = i.next() else {
532 let var_type: VarType = first_var.width.into();
534 let this_type: VarType = variable.width.into();
535 if var_type != this_type {
536 let (numeric_var, string_var) = match var_type {
537 VarType::Numeric => (first_var, variable),
538 VarType::String => (variable, first_var),
540 warn(Error::ValueLabelsDifferentTypes {
541 numeric_var: numeric_var.short_name.clone(),
542 string_var: string_var.short_name.clone(),
547 let labels = raw_value_label
550 .map(|(value, label)| {
551 let label = decoder.decode_string(&label.0, &warn);
552 let value = Value::decode(
553 raw::Value::from_raw(*value, var_type, decoder.endian),
556 (value, label.into())
559 let variables = variables
561 .map(|&variable| variable.short_name.clone())
563 Ok(Some(ValueLabelRecord {
571 #[derive(Clone, Debug)]
572 pub struct VariableSetRecord(Vec<VariableSet>);
574 impl TextRecord for VariableSetRecord {
575 const NAME: &'static str = "variable set";
576 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
577 let mut sets = Vec::new();
578 for line in input.lines() {
579 if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
583 Ok(VariableSetRecord(sets))
587 #[derive(Clone, Debug)]
588 pub struct ProductInfoRecord(pub String);
590 impl TextRecord for ProductInfoRecord {
591 const NAME: &'static str = "extra product info";
592 fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
593 Ok(ProductInfoRecord(input.into()))
597 pub struct LongVariableName {
598 pub short_name: String,
599 pub long_name: String,
602 pub struct LongVariableNameRecord(Vec<LongVariableName>);
604 impl TextRecord for LongVariableNameRecord {
605 const NAME: &'static str = "long variable names";
606 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
607 let mut names = Vec::new();
608 for pair in input.split('\t').filter(|s| !s.is_empty()) {
609 if let Some((short_name, long_name)) = pair.split_once('=') {
610 let name = LongVariableName {
611 short_name: short_name.into(),
612 long_name: long_name.into(),
619 Ok(LongVariableNameRecord(names))
623 pub struct VeryLongString {
624 pub short_name: String,
628 impl VeryLongString {
629 fn parse(input: &str) -> Result<VeryLongString, Error> {
630 let Some((short_name, length)) = input.split_once('=') else {
631 return Err(Error::TBD);
633 let length: usize = length.parse().map_err(|_| Error::TBD)?;
635 short_name: short_name.into(),
641 pub struct VeryLongStringRecord(Vec<VeryLongString>);
643 impl TextRecord for VeryLongStringRecord {
644 const NAME: &'static str = "very long strings";
645 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
646 let mut very_long_strings = Vec::new();
649 .map(|s| s.trim_end_matches('\t'))
650 .filter(|s| !s.is_empty())
652 if let Some(vls) = VeryLongString::parse(tuple).warn_on_error(&warn) {
653 very_long_strings.push(vls)
656 Ok(VeryLongStringRecord(very_long_strings))
660 pub struct Attribute {
662 pub values: Vec<String>,
666 fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> {
667 let Some((name, mut input)) = input.split_once('(') else {
668 return Err(Error::TBD);
670 let mut values = Vec::new();
672 let Some((value, rest)) = input.split_once('\n') else {
673 return Err(Error::TBD);
675 if let Some(stripped) = value
677 .and_then(|value| value.strip_suffix('\''))
679 values.push(stripped.into());
682 values.push(value.into());
684 if let Some(rest) = rest.strip_prefix(')') {
698 pub struct AttributeSet(pub Vec<Attribute>);
703 sentinel: Option<char>,
704 warn: &impl Fn(Error),
705 ) -> Result<(AttributeSet, &'a str), Error> {
706 let mut attributes = Vec::new();
708 match input.chars().next() {
710 c if c == sentinel => break &input[1..],
712 let (attribute, rest) = Attribute::parse(input, &warn)?;
713 attributes.push(attribute);
718 Ok((AttributeSet(attributes), rest))
722 pub struct FileAttributeRecord(AttributeSet);
724 impl TextRecord for FileAttributeRecord {
725 const NAME: &'static str = "data file attributes";
726 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
727 let (set, rest) = AttributeSet::parse(input, None, &warn)?;
728 if !rest.is_empty() {
731 Ok(FileAttributeRecord(set))
735 pub struct VarAttributeSet {
736 pub long_var_name: String,
737 pub attributes: AttributeSet,
740 impl VarAttributeSet {
743 warn: &impl Fn(Error),
744 ) -> Result<(VarAttributeSet, &'a str), Error> {
745 let Some((long_var_name, rest)) = input.split_once(':') else {
746 return Err(Error::TBD);
748 let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?;
751 long_var_name: long_var_name.into(),
759 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
761 impl TextRecord for VariableAttributeRecord {
762 const NAME: &'static str = "variable attributes";
763 fn parse(mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
764 let mut var_attribute_sets = Vec::new();
765 while !input.is_empty() {
766 let Some((var_attribute, rest)) =
767 VarAttributeSet::parse(input, &warn).warn_on_error(&warn)
771 var_attribute_sets.push(var_attribute);
774 Ok(VariableAttributeRecord(var_attribute_sets))
778 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
785 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
792 #[derive(Clone, Debug)]
793 pub struct VarDisplay {
794 pub measure: Option<Measure>,
796 pub align: Option<Alignment>,
799 #[derive(Clone, Debug)]
800 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
802 #[derive(Clone, Debug)]
803 pub enum MultipleResponseType {
806 labels: CategoryLabels,
811 #[derive(Clone, Debug)]
812 pub struct MultipleResponseSet {
813 pub name: Identifier,
815 pub mr_type: MultipleResponseType,
816 pub dict_indexes: Vec<DictIndex>,
819 impl MultipleResponseSet {
822 input: &raw::MultipleResponseSet,
823 warn: &impl Fn(Error),
824 ) -> Result<Option<Self>, Error> {
825 let mr_set_name = decoder
826 .decode_identifier(&input.name.0, warn)
827 .map_err(|error| Error::InvalidMrSetName(error))?;
829 let label = decoder.decode_string(&input.label.0, warn).into();
831 let dict_indexes = Vec::with_capacity(input.short_names.len());
832 for &short_name in input.short_names.iter() {
833 let short_name = match decoder.decode_identifier(&short_name.0, warn) {
836 warn(Error::InvalidMrSetName(error));
840 let Some(dict_index) = decoder.var_names.get(&short_name) else {
841 warn(Error::UnknownMrSetVariable {
842 mr_set: mr_set_name.clone(),
843 short_name: short_name.clone(),
847 dict_indexes.push(dict_index);
850 match dict_indexes.len() {
851 0 => return Err(Error::EmptyMrSet(mr_set_name)),
852 1 => return Err(Error::OneVarMrSet(mr_set_name)),
856 let Some(var_width) = dict_indexes
858 .map(|&dict_index| Some(decoder.variables[dict_index].width))
859 .reduce(|a, b| VarWidth::wider(a, b))
862 return Err(Error::MixedMrSet(mr_set_name));
867 #[derive(Clone, Debug)]
868 pub struct MultipleResponseRecord(Vec<MultipleResponseSet>);
870 impl Decode for MultipleResponseRecord {
871 type Input = raw::MultipleResponseRecord;
873 fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error> {
879 use encoding_rs::WINDOWS_1252;
883 let mut s = String::new();
884 s.push(char::REPLACEMENT_CHARACTER);
885 let encoded = WINDOWS_1252.encode(&s).0;
886 let decoded = WINDOWS_1252.decode(&encoded[..]).0;
887 println!("{:?}", decoded);
892 let charset: Vec<u8> = (0..=255).collect();
893 println!("{}", charset.len());
894 let decoded = WINDOWS_1252.decode(&charset[..]).0;
895 println!("{}", decoded.len());
896 let encoded = WINDOWS_1252.encode(&decoded[..]).0;
897 println!("{}", encoded.len());
898 assert_eq!(&charset[..], &encoded[..]);