1 use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat};
4 format::{Error as FormatError, Spec, UncheckedSpec},
5 identifier::{Error as IdError, Identifier},
6 raw::{self, MissingValues, VarType},
7 {endian::Endian, Compression},
9 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
10 use encoding_rs::{DecoderResult, Encoding};
11 use num::integer::div_ceil;
12 use thiserror::Error as ThisError;
14 #[derive(ThisError, Debug)]
16 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
17 InvalidVariableWidth { offset: u64, width: i32 },
19 #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
20 InvalidLongMissingValueFormat,
22 #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
23 InvalidCreationDate { creation_date: String },
25 #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
26 InvalidCreationTime { creation_time: String },
28 #[error("{id_error} Renaming variable to {new_name}.")]
35 "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
40 format_error: FormatError,
44 "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
49 format_error: FormatError,
52 #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
53 DuplicateVariableName {
54 duplicate_name: Identifier,
58 #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
59 InvalidDictIndex { dict_index: usize, max_index: usize },
61 #[error("Dictionary index {0} refers to a long string continuation.")]
62 DictIndexIsContinuation(usize),
64 #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
65 ValueLabelsDifferentTypes {
66 numeric_var: Identifier,
67 string_var: Identifier,
71 "Value labels may not be added to long string variable {0} using record types 3 or 4."
73 InvalidLongStringValueLabel(Identifier),
75 #[error("Details TBD")]
79 type DictIndex = usize;
82 pub dict_index: DictIndex,
83 pub short_name: Identifier,
84 pub long_name: Option<Identifier>,
89 pub compression: Option<Compression>,
91 pub encoding: &'static Encoding,
92 pub variables: HashMap<DictIndex, Variable>,
93 pub var_names: HashMap<Identifier, DictIndex>,
94 n_dict_indexes: usize,
95 n_generated_names: usize,
99 fn generate_name(&mut self) -> Identifier {
101 self.n_generated_names += 1;
102 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
104 if !self.var_names.contains_key(&name) {
107 assert!(self.n_generated_names < usize::MAX);
110 fn decode_string<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
111 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
117 fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
118 let max_index = self.n_dict_indexes - 1;
119 if dict_index == 0 || dict_index as usize > max_index {
120 return Err(Error::InvalidDictIndex {
125 let Some(variable) = self.variables.get(&dict_index) else {
126 return Err(Error::DictIndexIsContinuation(dict_index));
131 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
132 /// re-encoding the result back into `self.encoding` will have exactly the
133 /// same length in bytes.
135 /// XXX warn about errors?
136 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
137 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
138 // This is the common case. Usually there will be no errors.
141 // Unusual case. Don't bother to optimize it much.
142 let mut decoder = self.encoding.new_decoder_without_bom_handling();
143 let mut output = String::with_capacity(
145 .max_utf8_buffer_length_without_replacement(input.len())
148 let mut rest = input;
149 while !rest.is_empty() {
150 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
151 (DecoderResult::InputEmpty, _) => break,
152 (DecoderResult::OutputFull, _) => unreachable!(),
153 (DecoderResult::Malformed(a, b), consumed) => {
154 let skipped = a as usize + b as usize;
155 output.extend(repeat('?').take(skipped));
156 rest = &rest[consumed..];
160 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
166 pub trait Decode: Sized {
168 fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error>;
173 pub eye_catcher: String,
174 pub weight_index: Option<usize>,
175 pub n_cases: Option<u64>,
176 pub creation: NaiveDateTime,
177 pub file_label: String,
180 impl Decode for Header {
181 type Input = crate::raw::HeaderRecord;
183 fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error> {
184 let eye_catcher = decoder.decode_string(&input.eye_catcher.0, &warn);
185 let file_label = decoder.decode_string(&input.file_label.0, &warn);
186 let creation_date = decoder.decode_string(&input.creation_date.0, &warn);
187 let creation_date = NaiveDate::parse_from_str(&creation_date, "%v").unwrap_or_else(|_| {
188 warn(Error::InvalidCreationDate {
189 creation_date: creation_date.into(),
193 let creation_time = decoder.decode_string(&input.creation_time.0, &warn);
195 NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
196 warn(Error::InvalidCreationTime {
197 creation_time: creation_time.into(),
202 eye_catcher: eye_catcher.into(),
203 weight_index: input.weight_index.map(|n| n as usize),
204 n_cases: input.n_cases.map(|n| n as u64),
205 creation: NaiveDateTime::new(creation_date, creation_time),
206 file_label: file_label.into(),
211 #[derive(Copy, Clone, PartialEq, Eq)]
217 impl PartialOrd for VarWidth {
218 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
219 match (self, other) {
220 (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
221 (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
228 fn n_dict_indexes(self) -> usize {
230 VarWidth::Numeric => 1,
231 VarWidth::String(w) => div_ceil(w as usize, 8),
236 impl From<VarWidth> for VarType {
237 fn from(source: VarWidth) -> Self {
239 VarWidth::Numeric => VarType::Numeric,
240 VarWidth::String(_) => VarType::String,
245 pub struct VariableRecord {
247 pub name: Identifier,
248 pub print_format: Spec,
249 pub write_format: Spec,
250 pub missing_values: MissingValues,
251 pub label: Option<String>,
254 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
255 UncheckedSpec::try_from(raw)
256 .and_then(Spec::try_from)
257 .and_then(|x| x.check_width_compatibility(width))
258 .unwrap_or_else(|error| {
259 let new_format = Spec::default_for_width(width);
260 warn(new_format, error);
265 impl VariableRecord {
267 decoder: &mut Decoder,
268 input: &crate::raw::VariableRecord,
269 warn: impl Fn(Error),
270 ) -> Result<Option<VariableRecord>, Error> {
271 let width = match input.width {
272 0 => VarWidth::Numeric,
273 w @ 1..=255 => VarWidth::String(w as u16),
274 -1 => return Ok(None),
276 return Err(Error::InvalidVariableWidth {
277 offset: input.offset,
282 let name = decoder.decode_string(&input.name.0, &warn);
283 let name = match Identifier::new(&name, decoder.encoding) {
285 if !decoder.var_names.contains_key(&name) {
288 let new_name = decoder.generate_name();
289 warn(Error::DuplicateVariableName {
290 duplicate_name: name.clone(),
291 new_name: new_name.clone(),
297 let new_name = decoder.generate_name();
298 warn(Error::InvalidVariableName {
300 new_name: new_name.clone(),
305 let variable = Variable {
306 dict_index: decoder.n_dict_indexes,
307 short_name: name.clone(),
311 decoder.n_dict_indexes += width.n_dict_indexes();
314 .insert(name.clone(), variable.dict_index)
318 .insert(variable.dict_index, variable)
321 let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
322 warn(Error::InvalidPrintFormat {
324 variable: name.clone(),
328 let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
329 warn(Error::InvalidWriteFormat {
331 variable: name.clone(),
338 .map(|label| decoder.decode_string(&label.0, &warn).into());
339 Ok(Some(VariableRecord {
344 missing_values: input.missing_values.clone(),
351 pub struct Document(Vec<String>);
353 impl Decode for Document {
354 type Input = crate::raw::DocumentRecord;
356 fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error> {
361 .map(|s| decoder.decode_string(&s.0, &warn).into())
371 const NAME: &'static str;
372 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
375 pub struct VariableSet {
377 pub vars: Vec<String>,
381 fn parse(input: &str) -> Result<Self, Error> {
382 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
383 let vars = input.split_ascii_whitespace().map(String::from).collect();
398 pub fn decode(raw: raw::Value, decoder: &Decoder) -> Self {
400 raw::Value::Number(x) => Value::Number(x),
401 raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
406 pub struct ValueLabelRecord {
407 pub var_type: VarType,
408 pub labels: Vec<(Value, String)>,
409 pub variables: Vec<Identifier>,
412 trait WarnOnError<T> {
413 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
415 impl<T> WarnOnError<T> for Result<T, Error> {
416 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
418 Ok(result) => Some(result),
427 impl ValueLabelRecord {
429 decoder: &mut Decoder,
430 raw_value_label: &crate::raw::ValueLabelRecord,
431 dict_indexes: &crate::raw::VarIndexRecord,
432 warn: impl Fn(Error),
433 ) -> Result<Option<ValueLabelRecord>, Error> {
434 let variables: Vec<&Variable> = dict_indexes
437 .filter_map(|&dict_index| {
439 .get_var_by_index(dict_index as usize)
440 .warn_on_error(&warn)
442 .filter(|&variable| match variable.width {
443 VarWidth::String(width) if width > 8 => {
444 warn(Error::InvalidLongStringValueLabel(
445 variable.short_name.clone(),
452 let mut i = variables.iter();
453 let Some(&first_var) = i.next() else {
456 let var_type: VarType = first_var.width.into();
458 let this_type: VarType = variable.width.into();
459 if var_type != this_type {
460 let (numeric_var, string_var) = match var_type {
461 VarType::Numeric => (first_var, variable),
462 VarType::String => (variable, first_var),
464 warn(Error::ValueLabelsDifferentTypes {
465 numeric_var: numeric_var.short_name.clone(),
466 string_var: string_var.short_name.clone(),
471 let labels = raw_value_label
474 .map(|(value, label)| {
475 let label = decoder.decode_string(&label.0, &warn);
476 let value = Value::decode(raw::Value::from_raw(*value, var_type, decoder.endian), &decoder);
477 (value, label.into())
480 let variables = variables
482 .map(|&variable| variable.short_name.clone())
484 Ok(Some(ValueLabelRecord {
492 pub struct VariableSetRecord(Vec<VariableSet>);
494 impl TextRecord for VariableSetRecord {
495 const NAME: &'static str = "variable set";
496 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
497 let mut sets = Vec::new();
498 for line in input.lines() {
499 if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
503 Ok(VariableSetRecord(sets))
507 pub struct ProductInfo(pub String);
509 impl TextRecord for ProductInfo {
510 const NAME: &'static str = "extra product info";
511 fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
512 Ok(ProductInfo(input.into()))
516 pub struct LongVariableName {
517 pub short_name: String,
518 pub long_name: String,
521 pub struct LongVariableNameRecord(Vec<LongVariableName>);
523 impl TextRecord for LongVariableNameRecord {
524 const NAME: &'static str = "long variable names";
525 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
526 let mut names = Vec::new();
527 for pair in input.split('\t').filter(|s| !s.is_empty()) {
528 if let Some((short_name, long_name)) = pair.split_once('=') {
529 let name = LongVariableName {
530 short_name: short_name.into(),
531 long_name: long_name.into(),
538 Ok(LongVariableNameRecord(names))
542 pub struct VeryLongString {
543 pub short_name: String,
547 impl VeryLongString {
548 fn parse(input: &str) -> Result<VeryLongString, Error> {
549 let Some((short_name, length)) = input.split_once('=') else {
550 return Err(Error::TBD);
552 let length: usize = length.parse().map_err(|_| Error::TBD)?;
554 short_name: short_name.into(),
560 pub struct VeryLongStringRecord(Vec<VeryLongString>);
562 impl TextRecord for VeryLongStringRecord {
563 const NAME: &'static str = "very long strings";
564 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
565 let mut very_long_strings = Vec::new();
568 .map(|s| s.trim_end_matches('\t'))
569 .filter(|s| !s.is_empty())
571 if let Some(vls) = VeryLongString::parse(tuple).warn_on_error(&warn) {
572 very_long_strings.push(vls)
575 Ok(VeryLongStringRecord(very_long_strings))
579 pub struct Attribute {
581 pub values: Vec<String>,
585 fn parse<'a>(input: &'a str, warn: &impl Fn(Error)) -> Result<(Attribute, &'a str), Error> {
586 let Some((name, mut input)) = input.split_once('(') else {
587 return Err(Error::TBD);
589 let mut values = Vec::new();
591 let Some((value, rest)) = input.split_once('\n') else {
592 return Err(Error::TBD);
594 if let Some(stripped) = value
596 .and_then(|value| value.strip_suffix('\''))
598 values.push(stripped.into());
601 values.push(value.into());
603 if let Some(rest) = rest.strip_prefix(')') {
617 pub struct AttributeSet(pub Vec<Attribute>);
622 sentinel: Option<char>,
623 warn: &impl Fn(Error),
624 ) -> Result<(AttributeSet, &'a str), Error> {
625 let mut attributes = Vec::new();
627 match input.chars().next() {
629 c if c == sentinel => break &input[1..],
631 let (attribute, rest) = Attribute::parse(input, &warn)?;
632 attributes.push(attribute);
637 Ok((AttributeSet(attributes), rest))
641 pub struct FileAttributeRecord(AttributeSet);
643 impl TextRecord for FileAttributeRecord {
644 const NAME: &'static str = "data file attributes";
645 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
646 let (set, rest) = AttributeSet::parse(input, None, &warn)?;
647 if !rest.is_empty() {
650 Ok(FileAttributeRecord(set))
654 pub struct VarAttributeSet {
655 pub long_var_name: String,
656 pub attributes: AttributeSet,
659 impl VarAttributeSet {
662 warn: &impl Fn(Error),
663 ) -> Result<(VarAttributeSet, &'a str), Error> {
664 let Some((long_var_name, rest)) = input.split_once(':') else {
665 return Err(Error::TBD);
667 let (attributes, rest) = AttributeSet::parse(rest, Some('/'), warn)?;
670 long_var_name: long_var_name.into(),
678 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
680 impl TextRecord for VariableAttributeRecord {
681 const NAME: &'static str = "variable attributes";
682 fn parse(mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
683 let mut var_attribute_sets = Vec::new();
684 while !input.is_empty() {
685 let Some((var_attribute, rest)) =
686 VarAttributeSet::parse(input, &warn).warn_on_error(&warn)
690 var_attribute_sets.push(var_attribute);
693 Ok(VariableAttributeRecord(var_attribute_sets))
709 pub struct VarDisplay {
710 pub measure: Option<Measure>,
712 pub align: Option<Alignment>,
715 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
719 use encoding_rs::WINDOWS_1252;
723 let mut s = String::new();
724 s.push(char::REPLACEMENT_CHARACTER);
725 let encoded = WINDOWS_1252.encode(&s).0;
726 let decoded = WINDOWS_1252.decode(&encoded[..]).0;
727 println!("{:?}", decoded);
732 let charset: Vec<u8> = (0..=255).collect();
733 println!("{}", charset.len());
734 let decoded = WINDOWS_1252.decode(&charset[..]).0;
735 println!("{}", decoded.len());
736 let encoded = WINDOWS_1252.encode(&decoded[..]).0;
737 println!("{}", encoded.len());
738 assert_eq!(&charset[..], &encoded[..]);