2 borrow::Cow, cell::RefCell, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range,
7 encoding::{default_encoding, get_encoding, Error as EncodingError},
9 format::{Error as FormatError, Spec, UncheckedSpec},
10 identifier::{Error as IdError, Identifier},
11 raw::{self, UnencodedStr, VarType},
13 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
14 use encoding_rs::{DecoderResult, Encoding};
15 use num::integer::div_ceil;
16 use ordered_float::OrderedFloat;
17 use thiserror::Error as ThisError;
19 pub use crate::raw::{CategoryLabels, Compression};
21 #[derive(ThisError, Debug)]
23 // XXX this is really an internal error and maybe we should change the
24 // interfaces to make it impossible
25 #[error("Missing header record")]
29 EncodingError(EncodingError),
31 #[error("Using default encoding {0}.")]
32 UsingDefaultEncoding(String),
34 #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
35 InvalidVariableWidth { offsets: Range<u64>, width: i32 },
37 #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
38 InvalidLongMissingValueFormat,
40 #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
41 InvalidCreationDate { creation_date: String },
43 #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
44 InvalidCreationTime { creation_time: String },
46 #[error("{id_error} Renaming variable to {new_name}.")]
53 "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
58 format_error: FormatError,
62 "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
67 format_error: FormatError,
70 #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
71 DuplicateVariableName {
72 duplicate_name: Identifier,
76 #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
77 InvalidDictIndex { dict_index: usize, max_index: usize },
79 #[error("Dictionary index {0} refers to a long string continuation.")]
80 DictIndexIsContinuation(usize),
82 #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
83 ValueLabelsDifferentTypes {
84 numeric_var: Identifier,
85 string_var: Identifier,
89 "Value labels may not be added to long string variable {0} using record types 3 or 4."
91 InvalidLongStringValueLabel(Identifier),
93 #[error("Invalid multiple response set name. {0}")]
94 InvalidMrSetName(IdError),
96 #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
97 UnknownMrSetVariable {
99 short_name: Identifier,
102 #[error("Multiple response set {0} has no variables.")]
103 EmptyMrSet(Identifier),
105 #[error("Multiple response set {0} has only one variable.")]
106 OneVarMrSet(Identifier),
108 #[error("Multiple response set {0} contains both string and numeric variables.")]
109 MixedMrSet(Identifier),
112 "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
114 InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
116 #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
117 TooWideMDGroupCountedValue {
124 #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
125 InvalidLongValueLabelWidth {
132 #[error("Invalid attribute name. {0}")]
133 InvalidAttributeName(IdError),
135 #[error("Invalid short name in long variable name record. {0}")]
136 InvalidShortName(IdError),
138 #[error("Invalid name in long variable name record. {0}")]
139 InvalidLongName(IdError),
141 #[error("Invalid variable name in very long string record. {0}")]
142 InvalidLongStringName(IdError),
144 #[error("Invalid variable name in long string value label record. {0}")]
145 InvalidLongStringValueLabelName(IdError),
147 #[error("Invalid variable name in attribute record. {0}")]
148 InvalidAttributeVariableName(IdError),
150 // XXX This is risky because `text` might be arbitrarily long.
151 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
152 MalformedString { encoding: String, text: String },
154 #[error("Invalid variable measurement level value {0}")]
155 InvalidMeasurement(u32),
157 #[error("Invalid variable display alignment value {0}")]
158 InvalidAlignment(u32),
160 #[error("Details TBD")]
164 #[derive(Clone, Debug)]
166 Header(HeaderRecord),
167 Variable(VariableRecord),
168 ValueLabel(ValueLabelRecord),
169 Document(DocumentRecord),
170 IntegerInfo(IntegerInfoRecord),
171 FloatInfo(FloatInfoRecord),
172 VariableSets(VariableSetRecord),
173 VarDisplay(VarDisplayRecord),
174 MultipleResponse(MultipleResponseRecord),
175 LongStringMissingValues(LongStringMissingValuesRecord),
176 LongStringValueLabels(LongStringValueLabelRecord),
177 Encoding(EncodingRecord),
178 NumberOfCases(NumberOfCasesRecord),
179 ProductInfo(ProductInfoRecord),
180 LongNames(LongNameRecord),
181 VeryLongStrings(VeryLongStringRecord),
182 FileAttributes(FileAttributeRecord),
183 VariableAttributes(VariableAttributeRecord),
184 OtherExtension(Extension),
188 pub use crate::raw::EncodingRecord;
189 pub use crate::raw::Extension;
190 pub use crate::raw::FloatInfoRecord;
191 pub use crate::raw::IntegerInfoRecord;
192 pub use crate::raw::NumberOfCasesRecord;
194 type DictIndex = usize;
196 pub struct Variable {
197 pub dict_index: DictIndex,
198 pub short_name: Identifier,
199 pub long_name: Option<Identifier>,
204 pub compression: Option<Compression>,
206 pub encoding: &'static Encoding,
207 pub variables: HashMap<DictIndex, Variable>,
208 pub var_names: HashMap<Identifier, DictIndex>,
209 n_dict_indexes: usize,
210 n_generated_names: usize,
215 header: Option<&'a raw::HeaderRecord>,
216 variables: Vec<&'a raw::VariableRecord>,
217 value_labels: Vec<&'a raw::ValueLabelRecord>,
218 document: Option<&'a raw::DocumentRecord>,
219 integer_info: Option<&'a raw::IntegerInfoRecord>,
220 float_info: Option<&'a raw::FloatInfoRecord>,
221 variable_sets: Vec<&'a raw::TextRecord>,
222 var_display: Option<&'a raw::VarDisplayRecord>,
223 multiple_response: Vec<&'a raw::MultipleResponseRecord>,
224 long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord>,
225 long_string_missing_values: Vec<&'a raw::LongStringMissingValueRecord>,
226 encoding: Option<&'a raw::EncodingRecord>,
227 number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
228 product_info: Option<&'a raw::TextRecord>,
229 long_names: Option<&'a raw::TextRecord>,
230 very_long_strings: Vec<&'a raw::TextRecord>,
231 file_attributes: Vec<&'a raw::TextRecord>,
232 variable_attributes: Vec<&'a raw::TextRecord>,
233 other_extensions: Vec<&'a raw::Extension>,
234 cases: Option<&'a Rc<RefCell<raw::Cases>>>,
/// Stores `value` in `option` if `option` is currently `None`.
///
/// NOTE(review): the branch taken when `option` is already populated is not
/// visible in this view; judging by the name it presumably reports through
/// `warn` -- confirm against the full file.
237 fn set_or_warn<T>(option: &mut Option<T>, value: T, warn: &impl Fn(Error)) {
238 if option.is_none() {
// `Option::insert` hands back a reference to the stored value; it is not needed here.
239 let _ = option.insert(value);
245 impl<'a> Headers<'a> {
/// Sorts the raw records in `headers` into the per-kind slots of a
/// default-initialized `Headers` value, borrowing each record rather than
/// copying it.
///
/// Record kinds kept in a single slot are stored through `set_or_warn`;
/// record kinds kept in a list are appended in the order they are seen.
246 fn new(headers: &'a Vec<raw::Record>, warn: &impl Fn(Error)) -> Headers<'a> {
247 let mut h = Headers::default();
248 for header in headers {
250 raw::Record::Header(record) => set_or_warn(&mut h.header, record, warn),
251 raw::Record::Variable(record) => h.variables.push(record),
252 raw::Record::ValueLabel(record) => h.value_labels.push(record),
253 raw::Record::Document(record) => set_or_warn(&mut h.document, record, warn),
254 raw::Record::IntegerInfo(record) => set_or_warn(&mut h.integer_info, record, warn),
255 raw::Record::FloatInfo(record) => set_or_warn(&mut h.float_info, record, warn),
256 raw::Record::VariableSets(record) => h.variable_sets.push(record),
257 raw::Record::VarDisplay(record) => set_or_warn(&mut h.var_display, record, warn),
258 raw::Record::MultipleResponse(record) => h.multiple_response.push(record),
259 raw::Record::LongStringValueLabels(record) => {
260 h.long_string_value_labels.push(record)
262 raw::Record::LongStringMissingValues(record) => {
263 h.long_string_missing_values.push(record)
265 raw::Record::Encoding(record) => set_or_warn(&mut h.encoding, record, warn),
266 raw::Record::NumberOfCases(record) => {
267 set_or_warn(&mut h.number_of_cases, record, warn)
269 raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn),
270 raw::Record::LongNames(record) => set_or_warn(&mut h.long_names, record, warn),
271 raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record),
272 raw::Record::FileAttributes(record) => h.file_attributes.push(record),
273 raw::Record::VariableAttributes(record) => h.variable_attributes.push(record),
274 raw::Record::OtherExtension(record) => h.other_extensions.push(record),
// The next three record kinds are matched only to be ignored: nothing from
// them is collected into `h`.
275 raw::Record::EndOfHeaders(_) => (),
276 raw::Record::ZHeader(_) => (),
277 raw::Record::ZTrailer(_) => (),
278 raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn),
286 headers: Vec<raw::Record>,
287 encoding: Option<&'static Encoding>,
288 warn: &impl Fn(Error),
289 ) -> Result<Vec<Record>, Error> {
290 let h = Headers::new(&headers, warn);
291 let Some(header) = h.header else {
292 return Err(Error::MissingHeaderRecord);
294 let encoding = match encoding {
295 Some(encoding) => encoding,
297 let encoding = h.encoding.map(|record| record.0.as_str());
298 let character_code = h.integer_info.map(|record| record.character_code);
299 match get_encoding(encoding, character_code) {
300 Ok(encoding) => encoding,
301 Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)),
303 warn(Error::EncodingError(err));
304 // Warn that we're using the default encoding.
311 //let mut dictionary = Dictionary::new(encoding);
313 let mut decoder = Decoder {
314 compression: header.compression,
315 endian: header.endian,
317 variables: HashMap::new(),
318 var_names: HashMap::new(),
320 n_generated_names: 0,
323 let mut output = Vec::with_capacity(headers.len());
325 // Decode the records that don't use variables at all.
326 if let Some(header) = HeaderRecord::try_decode(&mut decoder, header, warn)? {
327 output.push(Record::Header(header))
329 if let Some(raw) = h.document {
330 if let Some(document) = DocumentRecord::try_decode(&mut decoder, raw, warn)? {
331 output.push(Record::Document(document))
334 if let Some(raw) = h.integer_info {
335 output.push(Record::IntegerInfo(raw.clone()));
337 if let Some(raw) = h.float_info {
338 output.push(Record::FloatInfo(raw.clone()));
340 if let Some(raw) = h.product_info {
341 let s = decoder.decode_string_cow(&raw.text.0, warn);
342 output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?));
344 if let Some(raw) = h.number_of_cases {
345 output.push(Record::NumberOfCases(raw.clone()))
347 for &raw in &h.file_attributes {
348 let s = decoder.decode_string_cow(&raw.text.0, warn);
349 output.push(Record::FileAttributes(FileAttributeRecord::parse(
353 for &raw in &h.other_extensions {
354 output.push(Record::OtherExtension(raw.clone()));
357 // Decode the variable records, which are the basis of almost everything
359 for &raw in &h.variables {
360 if let Some(variable) = VariableRecord::try_decode(&mut decoder, raw, warn)? {
361 output.push(Record::Variable(variable));
365 // Decode value labels and weight variable. These use indexes into the
366 // variable records, so we need to parse them before those indexes become
367 // invalidated by very long string variables.
368 for &raw in &h.value_labels {
369 if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
370 output.push(Record::ValueLabel(value_label));
374 if let Some(raw) = h.var_display {
375 if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, raw, warn)? {
376 output.push(Record::VarDisplay(vdr))
380 // Decode records that use short names.
381 for &raw in &h.multiple_response {
382 if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, raw, warn)? {
383 output.push(Record::MultipleResponse(mrr))
386 for &raw in &h.very_long_strings {
387 let s = decoder.decode_string_cow(&raw.text.0, warn);
388 output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
393 // Rename variables to their long names.
394 for &raw in &h.long_names {
395 let s = decoder.decode_string_cow(&raw.text.0, warn);
396 output.push(Record::LongNames(LongNameRecord::parse(
403 // Decode records that use long names.
404 for &raw in &h.variable_attributes {
405 let s = decoder.decode_string_cow(&raw.text.0, warn);
406 output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
410 for &raw in &h.long_string_value_labels {
411 if let Some(mrr) = LongStringValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
412 output.push(Record::LongStringValueLabels(mrr))
415 for &raw in &h.long_string_missing_values {
416 if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, raw, warn)? {
417 output.push(Record::LongStringMissingValues(mrr))
420 for &raw in &h.variable_sets {
421 let s = decoder.decode_string_cow(&raw.text.0, warn);
422 output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
/// Produces a candidate variable name of the form `VAR` followed by the
/// generated-name counter, zero-padded to at least three digits.
///
/// The counter `n_generated_names` is advanced before each candidate is
/// formatted, and the candidate is checked against `var_names`.
/// NOTE(review): the enclosing loop and the return statement are not visible
/// in this view -- presumably the first candidate that is absent from
/// `var_names` is returned; confirm.
428 fn generate_name(&mut self) -> Identifier {
430 self.n_generated_names += 1;
431 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
433 if !self.var_names.contains_key(&name) {
// Panics rather than letting the counter overflow on a later increment.
436 assert!(self.n_generated_names < usize::MAX);
/// Decodes `input` from `self.encoding` (without BOM handling) and returns
/// the text as a `Cow` tied to the lifetime of `input`.
///
/// A `MalformedString` warning carrying the encoding name and a copy of the
/// decoded text can be sent to `warn`.
/// NOTE(review): the condition guarding the warning is not visible in this
/// view -- presumably it fires only when `malformed` is true; confirm.
439 fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
440 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
442 warn(Error::MalformedString {
443 encoding: self.encoding.name().into(),
// The whole decoded text is copied into the warning (see the XXX note on
// `MalformedString` about its unbounded length).
444 text: output.clone().into(),
/// Same as `decode_string_cow`, but always returns an owned `String`.
449 fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
450 self.decode_string_cow(input, warn).into()
/// Decodes `input` with `decode_string_cow` and then builds an `Identifier`
/// from the decoded text, using `self.encoding`.
///
/// Decoding problems are reported through `warn`; a failure from
/// `Identifier::new` is returned as the `IdError`.
452 pub fn decode_identifier(
455 warn: &impl Fn(Error),
456 ) -> Result<Identifier, IdError> {
457 let s = self.decode_string_cow(input, warn);
458 Identifier::new(&s, self.encoding)
/// Looks up the variable that starts at the 1-based dictionary index
/// `dict_index`.
///
/// # Errors
///
/// - `Error::InvalidDictIndex` if `dict_index` is 0 or greater than
///   `n_dict_indexes`.
/// - `Error::DictIndexIsContinuation` if the index is in range but no
///   variable is registered at it.
460 fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
461 let max_index = self.n_dict_indexes;
462 if dict_index == 0 || dict_index > max_index {
463 return Err(Error::InvalidDictIndex {
// The map lookup uses `dict_index - 1`; the subtraction cannot underflow
// because 0 was rejected above.
468 let Some(variable) = self.variables.get(&(dict_index - 1)) else {
469 return Err(Error::DictIndexIsContinuation(dict_index));
474 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
475 /// re-encoding the result back into `self.encoding` will have exactly the
476 /// same length in bytes.
///
/// When the input contains malformed sequences, the slow path below inserts
/// one `?` for every byte counted by the decoder's `Malformed` result instead
/// of a replacement character.
///
/// # Panics
///
/// Panics if, after the slow path, re-encoding the output does not give
/// `input.len()` bytes.
478 /// XXX warn about errors?
479 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
480 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
481 // This is the common case. Usually there will be no errors.
484 // Unusual case. Don't bother to optimize it much.
485 let mut decoder = self.encoding.new_decoder_without_bom_handling();
// The output buffer is sized from the decoder's own upper bound for
// `input.len()` bytes, which is why `OutputFull` below is treated as
// unreachable.
486 let mut output = String::with_capacity(
488 .max_utf8_buffer_length_without_replacement(input.len())
491 let mut rest = input;
492 while !rest.is_empty() {
493 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
494 (DecoderResult::InputEmpty, _) => break,
495 (DecoderResult::OutputFull, _) => unreachable!(),
496 (DecoderResult::Malformed(a, b), consumed) => {
// Substitute one `?` per skipped byte, then resume after the bytes the
// decoder reported as consumed.
497 let skipped = a as usize + b as usize;
498 output.extend(repeat('?').take(skipped));
499 rest = &rest[consumed..];
// Checks the length-preservation promise made in the doc comment.
503 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
/// Fallible conversion of a raw record into its decoded form, with mutable
/// access to the shared `Decoder` state.
///
/// The return type distinguishes three outcomes: `Ok(Some(_))` for a decoded
/// record, `Ok(None)` when no record is produced, and `Err(_)` for a failure.
/// Problems that are not failures are passed to `warn`.
509 pub trait TryDecode: Sized {
512 decoder: &mut Decoder,
514 warn: impl Fn(Error),
515 ) -> Result<Option<Self>, Error>;
/// Infallible conversion of a raw `Input` into `Self` using read-only access
/// to the `Decoder`; problems can only be reported through `warn`.
518 pub trait Decode<Input>: Sized {
519 fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
/// Decodes a fixed-size `UnencodedStr<N>` into an owned `String` by passing
/// its bytes to `Decoder::decode_string`.
522 impl<const N: usize> Decode<UnencodedStr<N>> for String {
523 fn decode(decoder: &Decoder, input: &UnencodedStr<N>, warn: impl Fn(Error)) -> Self {
524 decoder.decode_string(&input.0, &warn)
/// Decoded form of `crate::raw::HeaderRecord`.
528 #[derive(Clone, Debug)]
529 pub struct HeaderRecord {
/// Eye-catcher text from the raw header.
530 pub eye_catcher: String,
/// Weight index from the raw header, if one is present.
531 pub weight_index: Option<usize>,
/// Number of cases from the raw header, if one is present.
532 pub n_cases: Option<u64>,
/// Creation date and creation time combined into a single timestamp.
533 pub creation: NaiveDateTime,
/// File label text from the raw header.
534 pub file_label: String,
/// Shortens `s` in place so that it no longer ends in space characters
/// (`' '` only; other whitespace is kept).
537 fn trim_end_spaces(mut s: String) -> String {
// Truncating to the trimmed length reuses the existing allocation.
538 s.truncate(s.trim_end_matches(' ').len());
542 impl TryDecode for HeaderRecord {
543 type Input = crate::raw::HeaderRecord;
// Decodes the text fields of the raw header and parses its creation date and
// time. A date or time that cannot be parsed is reported through `warn`
// instead of failing the decode; the substitute values are chosen on lines
// that are not visible in this view.
546 decoder: &mut Decoder,
548 warn: impl Fn(Error),
549 ) -> Result<Option<Self>, Error> {
550 let eye_catcher = trim_end_spaces(decoder.decode_string(&input.eye_catcher.0, &warn));
551 let file_label = trim_end_spaces(decoder.decode_string(&input.file_label.0, &warn));
552 let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn);
// NOTE(review): `%Y` is chrono's full-year specifier, while the
// `InvalidCreationDate` message describes the field as "DD MMM YY"; a
// two-digit year would correspond to `%y`. Confirm which one is intended.
554 NaiveDate::parse_from_str(&creation_date, "%e %b %Y").unwrap_or_else(|_| {
555 warn(Error::InvalidCreationDate {
556 creation_date: creation_date.into(),
560 let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn);
562 NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
563 warn(Error::InvalidCreationTime {
564 creation_time: creation_time.into(),
568 Ok(Some(HeaderRecord {
// Plain `as` conversions; the source integer types are declared in `raw`.
570 weight_index: input.weight_index.map(|n| n as usize),
571 n_cases: input.n_cases.map(|n| n as u64),
572 creation: NaiveDateTime::new(creation_date, creation_time),
578 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// Partial ordering of variable widths: two numeric widths compare equal and
/// two string widths compare by their length.
/// NOTE(review): the arm for mixed numeric/string pairs is not visible in
/// this view -- presumably it yields `None`; confirm.
584 impl PartialOrd for VarWidth {
585 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
586 match (self, other) {
587 (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
588 (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
/// Largest string width accepted; `VeryLongString::parse` rejects any length
/// above this value.
595 const MAX_STRING: u16 = 32767;
/// Number of dictionary index slots a variable of this width occupies: one
/// for a numeric variable, and one per started group of 8 bytes for a string.
597 fn n_dict_indexes(self) -> usize {
599 VarWidth::Numeric => 1,
600 VarWidth::String(w) => div_ceil(w as usize, 8),
607 f: impl Fn(u16, u16) -> u16,
608 ) -> Option<VarWidth> {
610 (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
611 (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
612 Some(VarWidth::String(f(a, b)))
618 /// Returns the wider of `a` and `b`:
619 /// - Numerical variable widths are equally wide.
620 /// - Longer strings are wider than shorter strings.
621 /// - Numerical and string types are incomparable, so result in `None`.
622 /// - Any `None` in the input yields `None` in the output.
623 pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
624 Self::width_predicate(a, b, |a, b| a.max(b))
627 /// Returns the narrower of `a` and `b` (see [`Self::wider`]).
628 pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
629 Self::width_predicate(a, b, |a, b| a.min(b))
/// Reduces a width to its variable type by dropping the string length.
633 impl From<VarWidth> for VarType {
634 fn from(source: VarWidth) -> Self {
636 VarWidth::Numeric => VarType::Numeric,
637 VarWidth::String(_) => VarType::String,
642 #[derive(Clone, Debug)]
643 pub struct VariableRecord {
645 pub name: Identifier,
646 pub print_format: Spec,
647 pub write_format: Spec,
648 pub missing_values: MissingValues,
649 pub label: Option<String>,
652 #[derive(Clone, Debug)]
653 pub struct MissingValues {
654 /// Individual missing values, up to 3 of them.
655 pub values: Vec<Value>,
657 /// Optional range of missing values.
658 pub range: Option<(Value, Value)>,
/// Decodes raw missing values by running every individual value, and both
/// ends of the optional range, through `Value::decode`.
///
/// The warning callback is accepted to satisfy the trait signature but is
/// not used (hence the `_warn` name).
661 impl Decode<raw::MissingValues> for MissingValues {
662 fn decode(decoder: &Decoder, input: &raw::MissingValues, _warn: impl Fn(Error)) -> Self {
667 .map(|value| Value::decode(value, decoder))
672 .map(|(low, high)| (Value::decode(low, decoder), Value::decode(high, decoder))),
/// Turns a raw format specification into a checked `Spec` that is compatible
/// with `width`.
///
/// The raw value goes through three fallible steps in order: conversion to
/// `UncheckedSpec`, conversion to `Spec`, and the width-compatibility check.
/// If any step fails, the default format for `width` is computed and `warn`
/// is called with that default and the error.
/// NOTE(review): the value produced by the fallback closure is on a line not
/// visible here -- presumably it is `new_format`; confirm.
677 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
678 UncheckedSpec::try_from(raw)
679 .and_then(Spec::try_from)
680 .and_then(|x| x.check_width_compatibility(width))
681 .unwrap_or_else(|error| {
682 let new_format = Spec::default_for_width(width);
683 warn(new_format, error);
688 impl TryDecode for VariableRecord {
689 type Input = raw::VariableRecord;
// Decodes one raw variable record and registers the variable with the
// decoder (both in `variables`, keyed by dictionary index, and in
// `var_names`, keyed by name). Invalid or duplicate names and invalid
// formats are repaired and reported through `warn`; only an out-of-range
// width is returned as an error.
692 decoder: &mut Decoder,
693 input: &crate::raw::VariableRecord,
694 warn: impl Fn(Error),
695 ) -> Result<Option<VariableRecord>, Error> {
696 let width = match input.width {
697 0 => VarWidth::Numeric,
// NOTE(review): 255 is accepted here, but the `InvalidVariableWidth`
// message gives the valid range as `[-1,255)`, which excludes 255. One of
// the two looks off by one -- confirm.
698 w @ 1..=255 => VarWidth::String(w as u16),
// A width of -1 yields no decoded record at all.
699 -1 => return Ok(None),
701 return Err(Error::InvalidVariableWidth {
702 offsets: input.offsets.clone(),
707 let name = trim_end_spaces(decoder.decode_string(&input.name.0, &warn));
708 let name = match Identifier::new(&name, decoder.encoding) {
// A valid name is checked against the names already registered; a duplicate
// gets a generated replacement and a warning.
710 if !decoder.var_names.contains_key(&name) {
713 let new_name = decoder.generate_name();
714 warn(Error::DuplicateVariableName {
715 duplicate_name: name.clone(),
716 new_name: new_name.clone(),
// An invalid name also gets a generated replacement and a warning.
722 let new_name = decoder.generate_name();
723 warn(Error::InvalidVariableName {
725 new_name: new_name.clone(),
730 let variable = Variable {
731 dict_index: decoder.n_dict_indexes,
732 short_name: name.clone(),
// Advance the running dictionary index by the number of slots this
// variable occupies (see `VarWidth::n_dict_indexes`).
736 decoder.n_dict_indexes += width.n_dict_indexes();
739 .insert(name.clone(), variable.dict_index)
743 .insert(variable.dict_index, variable)
746 let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
747 warn(Error::InvalidPrintFormat {
749 variable: name.clone(),
753 let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
754 warn(Error::InvalidWriteFormat {
756 variable: name.clone(),
763 .map(|label| decoder.decode_string(&label.0, &warn));
764 Ok(Some(VariableRecord {
769 missing_values: MissingValues::decode(decoder, &input.missing_values, warn),
775 #[derive(Clone, Debug)]
776 pub struct DocumentRecord(Vec<String>);
778 impl TryDecode for DocumentRecord {
779 type Input = crate::raw::DocumentRecord;
// Decodes the raw document text piece by piece, removing trailing spaces
// from each decoded string. The visible code has no error path: the result
// is always `Ok(Some(_))`.
782 decoder: &mut Decoder,
784 warn: impl Fn(Error),
785 ) -> Result<Option<Self>, Error> {
786 Ok(Some(DocumentRecord(
790 .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
800 const NAME: &'static str;
801 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
804 #[derive(Clone, Debug)]
805 pub struct VariableSet {
807 pub vars: Vec<String>,
811 fn parse(input: &str) -> Result<Self, Error> {
812 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
813 let vars = input.split_ascii_whitespace().map(String::from).collect();
/// Helper for turning a fallible result into an `Option`, with access to a
/// warning callback for the failure case.
821 trait WarnOnError<T> {
822 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
/// `Ok(value)` becomes `Some(value)`.
/// NOTE(review): the `Err` arm is not visible in this view -- presumably the
/// error is passed to `warn` and `None` is returned; confirm.
824 impl<T> WarnOnError<T> for Result<T, Error> {
825 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
827 Ok(result) => Some(result),
836 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
838 Number(Option<OrderedFloat<f64>>),
/// Converts a raw value into a decoded `Value`.
///
/// Numbers are carried over unchanged apart from the type conversion.
/// Strings are decoded with `Decoder::decode_exact_length`, so no warning is
/// issued for malformed bytes on this path.
843 pub fn decode(raw: &raw::Value, decoder: &Decoder) -> Self {
845 raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
846 raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
851 #[derive(Clone, Debug)]
852 pub struct ValueLabel {
857 #[derive(Clone, Debug)]
858 pub struct ValueLabelRecord {
859 pub var_type: VarType,
860 pub labels: Vec<ValueLabel>,
861 pub variables: Vec<Identifier>,
864 impl TryDecode for ValueLabelRecord {
865 type Input = crate::raw::ValueLabelRecord;
867 decoder: &mut Decoder,
869 warn: impl Fn(Error),
870 ) -> Result<Option<ValueLabelRecord>, Error> {
871 let variables: Vec<&Variable> = input
874 .filter_map(|&dict_index| {
876 .get_var_by_index(dict_index as usize)
877 .warn_on_error(&warn)
879 .filter(|&variable| match variable.width {
880 VarWidth::String(width) if width > 8 => {
881 warn(Error::InvalidLongStringValueLabel(
882 variable.short_name.clone(),
889 let mut i = variables.iter();
890 let Some(&first_var) = i.next() else {
893 let var_type: VarType = first_var.width.into();
895 let this_type: VarType = variable.width.into();
896 if var_type != this_type {
897 let (numeric_var, string_var) = match var_type {
898 VarType::Numeric => (first_var, variable),
899 VarType::String => (variable, first_var),
901 warn(Error::ValueLabelsDifferentTypes {
902 numeric_var: numeric_var.short_name.clone(),
903 string_var: string_var.short_name.clone(),
911 .map(|(value, label)| {
912 let label = decoder.decode_string(&label.0, &warn);
913 let value = Value::decode(
914 &raw::Value::from_raw(value, var_type, decoder.endian),
917 ValueLabel { value, label }
920 let variables = variables
922 .map(|&variable| variable.short_name.clone())
924 Ok(Some(ValueLabelRecord {
932 #[derive(Clone, Debug)]
933 pub struct VariableSetRecord(Vec<VariableSet>);
935 impl TextRecord for VariableSetRecord {
936 const NAME: &'static str = "variable set";
937 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
938 let mut sets = Vec::new();
939 for line in input.lines() {
940 if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
944 Ok(VariableSetRecord(sets))
948 #[derive(Clone, Debug)]
949 pub struct ProductInfoRecord(pub String);
951 impl TextRecord for ProductInfoRecord {
952 const NAME: &'static str = "extra product info";
953 fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
954 Ok(ProductInfoRecord(input.into()))
958 #[derive(Clone, Debug)]
959 pub struct LongName {
960 pub short_name: Identifier,
961 pub long_name: Identifier,
965 fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
967 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
969 Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
977 #[derive(Clone, Debug)]
978 pub struct LongNameRecord(Vec<LongName>);
980 impl LongNameRecord {
/// Parses the text of a long variable name record.
///
/// The input is split on tab characters and empty pieces are ignored. Each
/// remaining piece is split at its first `=` into a short name and a long
/// name. A pair that `LongName::new` rejects is reported through `warn` and
/// left out of the result, so the visible code never returns `Err`.
/// NOTE(review): handling of a piece without `=` is not visible in this view.
981 pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
982 let mut names = Vec::new();
983 for pair in input.split('\t').filter(|s| !s.is_empty()) {
984 if let Some((short_name, long_name)) = pair.split_once('=') {
985 if let Some(long_name) =
986 LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
988 names.push(long_name);
994 Ok(LongNameRecord(names))
998 #[derive(Clone, Debug)]
999 pub struct VeryLongString {
1000 pub short_name: Identifier,
1004 impl VeryLongString {
/// Parses one `NAME=LENGTH` entry of a very long string record.
///
/// # Errors
///
/// - `Error::TBD` if there is no `=`, if the length does not parse as a
///   `u16`, or if it exceeds `VarWidth::MAX_STRING`.
/// - `Error::InvalidLongStringName` if the name is not a valid identifier.
1005 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
1006 let Some((short_name, length)) = input.split_once('=') else {
1007 return Err(Error::TBD);
1010 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
1011 let length: u16 = length.parse().map_err(|_| Error::TBD)?;
1012 if length > VarWidth::MAX_STRING {
1013 return Err(Error::TBD);
1015 Ok(VeryLongString { short_name, length })
1019 #[derive(Clone, Debug)]
1020 pub struct VeryLongStringRecord(Vec<VeryLongString>);
1022 impl VeryLongStringRecord {
/// Parses the text of a very long string record into its entries.
///
/// Trailing tab characters are removed from each entry and empty entries
/// are ignored. An entry that `VeryLongString::parse` rejects is reported
/// through `warn` and left out, so the visible code never returns `Err`.
/// NOTE(review): the separator used to split `input` into entries is on a
/// line not visible in this view.
1023 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1024 let mut very_long_strings = Vec::new();
1027 .map(|s| s.trim_end_matches('\t'))
1028 .filter(|s| !s.is_empty())
1030 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
1031 very_long_strings.push(vls)
1034 Ok(VeryLongStringRecord(very_long_strings))
1038 #[derive(Clone, Debug)]
1039 pub struct Attribute {
1040 pub name: Identifier,
1041 pub values: Vec<String>,
1048 warn: &impl Fn(Error),
1049 ) -> Result<(Option<Attribute>, &'a str), Error> {
1050 let Some((name, mut input)) = input.split_once('(') else {
1051 return Err(Error::TBD);
1053 let mut values = Vec::new();
1055 let Some((value, rest)) = input.split_once('\n') else {
1056 return Err(Error::TBD);
1058 if let Some(stripped) = value
1060 .and_then(|value| value.strip_suffix('\''))
1062 values.push(stripped.into());
1065 values.push(value.into());
1067 if let Some(rest) = rest.strip_prefix(')') {
1068 let attribute = Identifier::new(name, decoder.encoding)
1069 .map_err(Error::InvalidAttributeName)
1070 .warn_on_error(warn)
1071 .map(|name| Attribute { name, values });
1072 return Ok((attribute, rest));
1079 #[derive(Clone, Debug)]
1080 pub struct AttributeSet(pub Vec<Attribute>);
1086 sentinel: Option<char>,
1087 warn: &impl Fn(Error),
1088 ) -> Result<(AttributeSet, &'a str), Error> {
1089 let mut attributes = Vec::new();
1091 match input.chars().next() {
1092 None => break input,
1093 c if c == sentinel => break &input[1..],
1095 let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
1096 if let Some(attribute) = attribute {
1097 attributes.push(attribute);
1103 Ok((AttributeSet(attributes), rest))
1107 #[derive(Clone, Debug)]
1108 pub struct FileAttributeRecord(AttributeSet);
1110 impl FileAttributeRecord {
/// Parses the text of a file attribute record as a single attribute set
/// with no terminating sentinel character.
///
/// Any error from `AttributeSet::parse` is propagated.
/// NOTE(review): what happens when unparsed input is left over is on a line
/// not visible in this view.
1111 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1112 let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
1113 if !rest.is_empty() {
1116 Ok(FileAttributeRecord(set))
1120 #[derive(Clone, Debug)]
1121 pub struct VarAttributeSet {
1122 pub long_var_name: Identifier,
1123 pub attributes: AttributeSet,
1126 impl VarAttributeSet {
1130 warn: &impl Fn(Error),
1131 ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
1132 let Some((long_var_name, rest)) = input.split_once(':') else {
1133 return Err(Error::TBD);
1135 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
1136 let var_attribute = Identifier::new(long_var_name, decoder.encoding)
1137 .map_err(Error::InvalidAttributeVariableName)
1138 .warn_on_error(warn)
1139 .map(|name| VarAttributeSet {
1140 long_var_name: name,
1143 Ok((var_attribute, rest))
1147 #[derive(Clone, Debug)]
1148 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1150 impl VariableAttributeRecord {
/// Parses the text of a variable attribute record into one attribute set
/// per variable, consuming `input` until it is empty.
///
/// A failure from `VarAttributeSet::parse` is turned into a warning rather
/// than an error, so the visible code never returns `Err`.
/// NOTE(review): the `else` branch taken after such a failure, and the line
/// that advances `input`, are not visible in this view.
1151 pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1152 let mut var_attribute_sets = Vec::new();
1153 while !input.is_empty() {
1154 let Some((var_attribute, rest)) =
1155 VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
// `var_attribute` is `None` when the variable name was invalid; such a set
// is skipped.
1159 if let Some(var_attribute) = var_attribute {
1160 var_attribute_sets.push(var_attribute);
1164 Ok(VariableAttributeRecord(var_attribute_sets))
1168 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
/// Maps a raw measurement-level code to a `Measure`: 1 is nominal, 2 is
/// ordinal and 3 is scale. A code not matched by any arm is rejected with
/// `Error::InvalidMeasurement`.
/// NOTE(review): an earlier arm is not visible in this view -- presumably it
/// maps 0 to `Ok(None)`; confirm.
1176 fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
1179 1 => Ok(Some(Measure::Nominal)),
1180 2 => Ok(Some(Measure::Ordinal)),
1181 3 => Ok(Some(Measure::Scale)),
1182 _ => Err(Error::InvalidMeasurement(source)),
1187 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1188 pub enum Alignment {
/// Maps a raw display-alignment code to an `Alignment`: 1 is left, 2 is
/// right and 3 is center. A code not matched by any arm is rejected with
/// `Error::InvalidAlignment`.
/// NOTE(review): an earlier arm is not visible in this view -- presumably it
/// maps 0 to `Ok(None)`; confirm.
1195 fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
1198 1 => Ok(Some(Alignment::Left)),
1199 2 => Ok(Some(Alignment::Right)),
1200 3 => Ok(Some(Alignment::Center)),
1201 _ => Err(Error::InvalidAlignment(source)),
1206 #[derive(Clone, Debug)]
1207 pub struct VarDisplay {
1208 pub measure: Option<Measure>,
1209 pub width: Option<u32>,
1210 pub alignment: Option<Alignment>,
1213 #[derive(Clone, Debug)]
1214 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
1216 impl TryDecode for VarDisplayRecord {
1217 type Input = raw::VarDisplayRecord;
// Decodes the variable display record. The record must hold exactly three
// or exactly two values per variable known to the decoder; any other length
// is rejected with `Error::TBD`. Invalid measure or alignment codes are
// downgraded to warnings.
1219 decoder: &mut Decoder,
1220 input: &Self::Input,
1221 warn: impl Fn(Error),
1222 ) -> Result<Option<Self>, Error> {
// The expected length is derived from the variables registered so far, so
// variable records have to be decoded before this record.
1223 let n_vars = decoder.variables.len();
1224 let n_per_var = if input.0.len() == 3 * n_vars {
1226 } else if input.0.len() == 2 * n_vars {
1229 return Err(Error::TBD);
1232 let var_displays = input
// Three values per variable are (measure, width, alignment); with two
// values per variable the width is absent.
1236 let (measure, width, alignment) = match n_per_var == 3 {
1237 true => (chunk[0], Some(chunk[1]), chunk[2]),
1238 false => (chunk[0], None, chunk[1]),
1240 let measure = Measure::try_decode(measure).warn_on_error(&warn).flatten();
1241 let alignment = Alignment::try_decode(alignment)
1242 .warn_on_error(&warn)
1251 Ok(Some(VarDisplayRecord(var_displays)))
1255 #[derive(Clone, Debug)]
1256 pub enum MultipleResponseType {
1259 labels: CategoryLabels,
1264 impl MultipleResponseType {
// Decodes a raw multiple-response type for the set named `mr_set`.
// (The `fn` header and the `decoder` parameter are elided from this excerpt.)
//
// - `mr_set`: name of the enclosing set, used only in error values.
// - `input`: raw type to convert.
// - `min_width`: width the counted value is validated against.
// - `warn`: callback for non-fatal string-decoding problems.
//
// Returns an error when a multiple-dichotomy counted value is not a number
// (numeric width) or is longer than the permitted string width.
1267 mr_set: &Identifier,
1268 input: &raw::MultipleResponseType,
1269 min_width: VarWidth,
1270 warn: &impl Fn(Error),
1271 ) -> Result<Self, Error> {
1272 let mr_type = match input {
1273 raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
1274 let value = decoder.decode_string_cow(&value.0, warn);
1275 let value = match min_width {
// Numeric width: the counted value must parse as a floating-point number
// after surrounding whitespace is trimmed.
1276 VarWidth::Numeric => {
1277 let number: f64 = value.trim().parse().map_err(|_| {
1278 Error::InvalidMDGroupCountedValue {
1279 mr_set: mr_set.clone(),
1280 number: value.into(),
1283 Value::Number(Some(number.into()))
// String width: trailing spaces are dropped before the length check.
1285 VarWidth::String(max_width) => {
1286 let value = value.trim_end_matches(' ');
// NOTE(review): `len()` is the byte length of the decoded string, which can
// differ from the length in the file's encoding -- confirm this is the
// intended comparison.
1287 let width = value.len();
1288 if width > max_width as usize {
1289 return Err(Error::TooWideMDGroupCountedValue {
1290 mr_set: mr_set.clone(),
1291 value: value.into(),
1296 Value::String(value.into())
1299 MultipleResponseType::MultipleDichotomy {
// Multiple-category sets carry no extra data.
1304 raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
/// A decoded multiple response set.
// NOTE(review): at least one field line (between `max_width` and `mr_type`)
// is elided from this excerpt.
1310 #[derive(Clone, Debug)]
1311 pub struct MultipleResponseSet {
/// Name of the set.
1312 pub name: Identifier,
/// Narrowest width among the set's member variables.
1313 pub min_width: VarWidth,
/// Widest width among the set's member variables.
1314 pub max_width: VarWidth,
/// Kind of set.
1316 pub mr_type: MultipleResponseType,
/// Dictionary indexes of the member variables that were resolved by name.
1317 pub dict_indexes: Vec<DictIndex>,
1320 impl MultipleResponseSet {
// Decodes one raw multiple response set.
// (The `fn` header and the `decoder` parameter are elided from this excerpt.)
//
// Fails when the set name is not a valid identifier, when fewer than two
// member variables resolve, or when no common narrowest/widest width can be
// computed for the members. Problems with individual member names are only
// reported through `warn`.
1323 input: &raw::MultipleResponseSet,
1324 warn: &impl Fn(Error),
1325 ) -> Result<Self, Error> {
1326 let mr_set_name = decoder
1327 .decode_identifier(&input.name.0, warn)
1328 .map_err(Error::InvalidMrSetName)?;
1330 let label = decoder.decode_string(&input.label.0, warn);
// Resolve each member's short name to a dictionary index.
1332 let mut dict_indexes = Vec::with_capacity(input.short_names.len());
1333 for short_name in input.short_names.iter() {
1334 let short_name = match decoder.decode_identifier(&short_name.0, warn) {
// NOTE(review): this reports a bad *member* name with the variant used above
// for a bad *set* name -- confirm that is intended.
1337 warn(Error::InvalidMrSetName(error));
// Names missing from the dictionary are warned about; the `else` body
// presumably skips to the next name (its tail is elided here).
1341 let Some(&dict_index) = decoder.var_names.get(&short_name) else {
1342 warn(Error::UnknownMrSetVariable {
1343 mr_set: mr_set_name.clone(),
1344 short_name: short_name.clone(),
1348 dict_indexes.push(dict_index);
// A set needs at least two resolved member variables.
1351 match dict_indexes.len() {
1352 0 => return Err(Error::EmptyMrSet(mr_set_name)),
1353 1 => return Err(Error::OneVarMrSet(mr_set_name)),
// Fold the member widths into (narrowest, widest). If either side ends up
// `None` the set is rejected as mixed -- presumably `narrower`/`wider`
// return `None` for incompatible widths; confirm against `VarWidth`.
1357 let Some((Some(min_width), Some(max_width))) = dict_indexes
1359 .map(|dict_index| decoder.variables[dict_index].width)
1360 .map(|w| (Some(w), Some(w)))
1361 .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
1363 return Err(Error::MixedMrSet(mr_set_name));
// The counted value is validated against the narrowest member width.
1367 MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
1369 Ok(MultipleResponseSet {
/// Decoded multiple-response record: the sets that decoded successfully.
1380 #[derive(Clone, Debug)]
1381 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
// Decodes every set in a raw multiple-response record.
// (The `fn` header and closing braces are elided from this excerpt.)
1383 impl TryDecode for MultipleResponseRecord {
1384 type Input = raw::MultipleResponseRecord;
1387 decoder: &mut Decoder,
1388 input: &Self::Input,
1389 warn: impl Fn(Error),
1390 ) -> Result<Option<Self>, Error> {
1391 let mut sets = Vec::with_capacity(input.0.len());
// A set that fails to decode is reported via `warn` and left out; the other
// sets are still decoded.
1392 for set in &input.0 {
1393 match MultipleResponseSet::decode(decoder, set, &warn) {
1394 Ok(set) => sets.push(set),
1395 Err(error) => warn(error),
1398 Ok(Some(MultipleResponseRecord(sets)))
/// Missing values for one long string variable.
// NOTE(review): lines between the visible fields are elided from this excerpt.
1402 #[derive(Clone, Debug)]
1403 pub struct LongStringMissingValues {
/// Name of the variable the missing values belong to.
1405 pub var_name: Identifier,
/// The decoded missing values.
1408 pub missing_values: MissingValues,
1411 impl LongStringMissingValues {
// Decodes one raw long-string missing-values entry.
// (The `fn` header and the `decoder` parameter are elided from this excerpt.)
//
// Fails only when the variable name is not a valid identifier.
1414 input: &raw::LongStringMissingValues,
1415 warn: &impl Fn(Error),
1416 ) -> Result<Self, Error> {
// The name is decoded as text, stripped of trailing whitespace, then
// validated as an identifier in the decoder's encoding.
1417 let var_name = decoder.decode_string(&input.var_name.0, warn);
1418 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
// NOTE(review): this error variant is named for value *labels*, not missing
// values -- looks copied from `LongStringValueLabels::decode`; confirm.
1419 .map_err(Error::InvalidLongStringValueLabelName)?;
1421 let missing_values = MissingValues::decode(decoder, &input.missing_values, warn);
1423 Ok(LongStringMissingValues {
/// Decoded long-string missing-values record: the entries that decoded
/// successfully.
// NOTE(review): unlike the sibling record newtypes in this file, the inner
// `Vec` is not `pub` -- confirm whether that is deliberate.
1430 #[derive(Clone, Debug)]
1431 pub struct LongStringMissingValuesRecord(Vec<LongStringMissingValues>);
// Decodes every entry in a raw long-string missing-values record.
// (The `fn` header and closing braces are elided from this excerpt.)
1433 impl TryDecode for LongStringMissingValuesRecord {
1434 type Input = raw::LongStringMissingValueRecord;
1437 decoder: &mut Decoder,
1438 input: &Self::Input,
1439 warn: impl Fn(Error),
1440 ) -> Result<Option<Self>, Error> {
// The locals are named `labels`/`label`, but they hold missing-value
// entries.
1441 let mut labels = Vec::with_capacity(input.0.len());
// An entry that fails to decode is reported via `warn` and left out.
1442 for label in &input.0 {
1443 match LongStringMissingValues::decode(decoder, label, &warn) {
1444 Ok(set) => labels.push(set),
1445 Err(error) => warn(error),
1448 Ok(Some(LongStringMissingValuesRecord(labels)))
/// Value labels for one long string variable.
1452 #[derive(Clone, Debug)]
1453 pub struct LongStringValueLabels {
/// Name of the variable the labels belong to.
1454 pub var_name: Identifier,
/// Width declared in the record; the decoder always stores a
/// `VarWidth::String` here.
1455 pub width: VarWidth,
/// The decoded (value, label) pairs.
1456 pub labels: Vec<ValueLabel>,
1459 impl LongStringValueLabels {
// Decodes the value labels of one long string variable.
// (The `fn` header and the `decoder` parameter are elided from this excerpt.)
//
// Fails when the variable name is not a valid identifier or when the
// declared width is below 9 or above `VarWidth::MAX_STRING`.
1462 input: &raw::LongStringValueLabels,
1463 warn: &impl Fn(Error),
1464 ) -> Result<Self, Error> {
// The name is decoded as text, stripped of trailing whitespace, then
// validated as an identifier in the decoder's encoding.
1465 let var_name = decoder.decode_string(&input.var_name.0, warn);
1466 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
1467 .map_err(Error::InvalidLongStringValueLabelName)?;
// Accept only widths in 9..=MAX_STRING.
1470 let max_width = VarWidth::MAX_STRING;
1471 if input.width < 9 || input.width > max_width as u32 {
1472 return Err(Error::InvalidLongValueLabelWidth {
// The range check above bounds the value before narrowing; this assumes
// `MAX_STRING` fits in `u16` -- TODO confirm.
1479 let width = input.width as u16;
1481 let mut labels = Vec::with_capacity(input.labels.len());
1482 for (value, label) in input.labels.iter() {
// Values go through `decode_exact_length`, while labels use the ordinary
// `decode_string` path with warnings.
1483 let value = Value::String(decoder.decode_exact_length(&value.0).into());
1484 let label = decoder.decode_string(&label.0, warn);
1485 labels.push(ValueLabel { value, label });
1488 Ok(LongStringValueLabels {
1490 width: VarWidth::String(width),
/// Decoded long-string value-label record: the per-variable label sets that
/// decoded successfully.
1496 #[derive(Clone, Debug)]
1497 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
// Decodes every per-variable label set in a raw long-string value-label
// record. (The `fn` header and closing braces are elided from this excerpt.)
1499 impl TryDecode for LongStringValueLabelRecord {
1500 type Input = raw::LongStringValueLabelRecord;
1503 decoder: &mut Decoder,
1504 input: &Self::Input,
1505 warn: impl Fn(Error),
1506 ) -> Result<Option<Self>, Error> {
1507 let mut labels = Vec::with_capacity(input.0.len());
// A label set that fails to decode is reported via `warn` and left out.
1508 for label in &input.0 {
1509 match LongStringValueLabels::decode(decoder, label, &warn) {
1510 Ok(set) => labels.push(set),
1511 Err(error) => warn(error),
1514 Ok(Some(LongStringValueLabelRecord(labels)))
// Exploratory checks of windows-1252 behaviour in `encoding_rs`.
// NOTE(review): the enclosing module and test-function headers are elided
// from this excerpt.
1520 use encoding_rs::WINDOWS_1252;
// First check: encode U+FFFD (not representable in windows-1252), decode the
// bytes back, and print the result. Nothing is asserted here.
1524 let mut s = String::new();
1525 s.push(char::REPLACEMENT_CHARACTER);
1526 let encoded = WINDOWS_1252.encode(&s).0;
1527 let decoded = WINDOWS_1252.decode(&encoded[..]).0;
1528 println!("{:?}", decoded);
// Second check: every byte value 0..=255 must survive a decode followed by
// an encode unchanged.
1533 let charset: Vec<u8> = (0..=255).collect();
1534 println!("{}", charset.len());
1535 let decoded = WINDOWS_1252.decode(&charset[..]).0;
1536 println!("{}", decoded.len());
1537 let encoded = WINDOWS_1252.encode(&decoded[..]).0;
1538 println!("{}", encoded.len());
1539 assert_eq!(&charset[..], &encoded[..]);