1 use std::{cell::RefCell, ops::Range, rc::Rc, collections::HashMap};
4 dictionary::{Dictionary, VarWidth, Variable},
5 encoding::Error as EncodingError,
7 format::{Error as FormatError, Spec, UncheckedSpec},
8 identifier::{Error as IdError, Identifier},
10 self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
11 FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
12 LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
13 NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabelRecord, VarDisplayRecord,
14 VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader,
18 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
19 use encoding_rs::Encoding;
21 use thiserror::Error as ThisError;
23 pub use crate::raw::{CategoryLabels, Compression};
25 #[derive(ThisError, Debug)]
27 #[error("Missing header record")]
30 // XXX this is an internal error
31 #[error("More than one file header record")]
32 DuplicateHeaderRecord,
35 EncodingError(EncodingError),
37 #[error("Using default encoding {0}.")]
38 UsingDefaultEncoding(String),
40 #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
41 InvalidVariableWidth { offsets: Range<u64>, width: i32 },
43 #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
44 InvalidLongMissingValueFormat,
46 #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
47 InvalidCreationDate { creation_date: String },
49 #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
50 InvalidCreationTime { creation_time: String },
52 #[error("{id_error} Renaming variable to {new_name}.")]
59 "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
64 format_error: FormatError,
68 "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
73 format_error: FormatError,
76 #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
77 DuplicateVariableName {
78 duplicate_name: Identifier,
82 #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
83 InvalidDictIndex { dict_index: usize, max_index: usize },
85 #[error("Dictionary index {0} refers to a long string continuation.")]
86 DictIndexIsContinuation(usize),
88 #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
89 ValueLabelsDifferentTypes {
90 numeric_var: Identifier,
91 string_var: Identifier,
95 "Value labels may not be added to long string variable {0} using record types 3 or 4."
97 InvalidLongStringValueLabel(Identifier),
99 #[error("Invalid multiple response set name. {0}")]
100 InvalidMrSetName(IdError),
102 #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
103 UnknownMrSetVariable {
105 short_name: Identifier,
108 #[error("Multiple response set {0} has no variables.")]
109 EmptyMrSet(Identifier),
111 #[error("Multiple response set {0} has only one variable.")]
112 OneVarMrSet(Identifier),
114 #[error("Multiple response set {0} contains both string and numeric variables.")]
115 MixedMrSet(Identifier),
118 "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
120 InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
122 #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
123 TooWideMDGroupCountedValue {
130 #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
131 InvalidLongValueLabelWidth {
138 #[error("Invalid attribute name. {0}")]
139 InvalidAttributeName(IdError),
141 #[error("Invalid short name in long variable name record. {0}")]
142 InvalidShortName(IdError),
144 #[error("Invalid name in long variable name record. {0}")]
145 InvalidLongName(IdError),
147 #[error("Invalid variable name in very long string record. {0}")]
148 InvalidLongStringName(IdError),
150 #[error("Invalid variable name in long string value label record. {0}")]
151 InvalidLongStringValueLabelName(IdError),
153 #[error("Invalid variable name in attribute record. {0}")]
154 InvalidAttributeVariableName(IdError),
156 // XXX This is risky because `text` might be arbitrarily long.
157 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
158 MalformedString { encoding: String, text: String },
160 #[error("Details TBD")]
/// Integer type used for dictionary indexes, e.g. in
/// `MultipleResponseSet::dict_indexes`.
164 type DictIndex = usize;
166 #[derive(Clone, Debug)]
168 pub header: HeaderRecord<String>,
169 pub variable: Vec<VariableRecord<String, String>>,
170 pub value_label: Vec<ValueLabelRecord<RawStr<8>, String>>,
171 pub document: Vec<DocumentRecord<String>>,
172 pub integer_info: Option<IntegerInfoRecord>,
173 pub float_info: Option<FloatInfoRecord>,
174 pub var_display: Option<VarDisplayRecord>,
175 pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
176 pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
177 pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier, String>>,
178 pub encoding: Option<EncodingRecord>,
179 pub number_of_cases: Option<NumberOfCasesRecord>,
180 pub variable_sets: Vec<VariableSetRecord>,
181 pub product_info: Option<ProductInfoRecord>,
182 pub long_names: Vec<LongNamesRecord>,
183 pub very_long_strings: Vec<VeryLongStringsRecord>,
184 pub file_attributes: Vec<FileAttributeRecord>,
185 pub variable_attributes: Vec<VariableAttributeRecord>,
186 pub other_extension: Vec<Extension>,
187 pub end_of_headers: Option<u32>,
188 pub z_header: Option<ZHeader>,
189 pub z_trailer: Option<ZTrailer>,
190 pub cases: Option<Rc<RefCell<Cases>>>,
193 fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
204 pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
205 let mut file_header = Vec::new();
206 let mut variable = Vec::new();
207 let mut value_label = Vec::new();
208 let mut document = Vec::new();
209 let mut integer_info = Vec::new();
210 let mut float_info = Vec::new();
211 let mut var_display = Vec::new();
212 let mut multiple_response = Vec::new();
213 let mut long_string_value_labels = Vec::new();
214 let mut long_string_missing_values = Vec::new();
215 let mut encoding = Vec::new();
216 let mut number_of_cases = Vec::new();
217 let mut variable_sets = Vec::new();
218 let mut product_info = Vec::new();
219 let mut long_names = Vec::new();
220 let mut very_long_strings = Vec::new();
221 let mut file_attributes = Vec::new();
222 let mut variable_attributes = Vec::new();
223 let mut other_extension = Vec::new();
224 let mut end_of_headers = Vec::new();
225 let mut z_header = Vec::new();
226 let mut z_trailer = Vec::new();
227 let mut cases = Vec::new();
229 for header in headers {
231 DecodedRecord::Header(record) => {
232 file_header.push(record);
234 DecodedRecord::Variable(record) => {
235 variable.push(record);
237 DecodedRecord::ValueLabel(record) => {
238 value_label.push(record);
240 DecodedRecord::Document(record) => {
241 document.push(record);
243 DecodedRecord::IntegerInfo(record) => {
244 integer_info.push(record);
246 DecodedRecord::FloatInfo(record) => {
247 float_info.push(record);
249 DecodedRecord::VariableSets(record) => {
250 variable_sets.push(record);
252 DecodedRecord::VarDisplay(record) => {
253 var_display.push(record);
255 DecodedRecord::MultipleResponse(record) => {
256 multiple_response.push(record);
258 DecodedRecord::LongStringValueLabels(record) => {
259 long_string_value_labels.push(record)
261 DecodedRecord::LongStringMissingValues(record) => {
262 long_string_missing_values.push(record);
264 DecodedRecord::Encoding(record) => {
265 encoding.push(record);
267 DecodedRecord::NumberOfCases(record) => {
268 number_of_cases.push(record);
270 DecodedRecord::ProductInfo(record) => {
271 product_info.push(record);
273 DecodedRecord::LongNames(record) => {
274 long_names.push(record);
276 DecodedRecord::VeryLongStrings(record) => {
277 very_long_strings.push(record);
279 DecodedRecord::FileAttributes(record) => {
280 file_attributes.push(record);
282 DecodedRecord::VariableAttributes(record) => {
283 variable_attributes.push(record);
285 DecodedRecord::OtherExtension(record) => {
286 other_extension.push(record);
288 DecodedRecord::EndOfHeaders(record) => {
289 end_of_headers.push(record);
291 DecodedRecord::ZHeader(record) => {
292 z_header.push(record);
294 DecodedRecord::ZTrailer(record) => {
295 z_trailer.push(record);
297 DecodedRecord::Cases(record) => {
303 let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
305 return Err(Error::MissingHeaderRecord);
313 integer_info: take_first(integer_info, || warn(Error::TBD)),
314 float_info: take_first(float_info, || warn(Error::TBD)),
315 var_display: take_first(var_display, || warn(Error::TBD)),
317 long_string_value_labels,
318 long_string_missing_values,
319 encoding: take_first(encoding, || warn(Error::TBD)),
320 number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
322 product_info: take_first(product_info, || warn(Error::TBD)),
328 end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
329 z_header: take_first(z_header, || warn(Error::TBD)),
330 z_trailer: take_first(z_trailer, || warn(Error::TBD)),
331 cases: take_first(cases, || warn(Error::TBD)),
336 pub struct Metadata {
337 creation: NaiveDateTime,
339 compression: Option<Compression>,
340 n_cases: Option<u64>,
342 product_ext: Option<String>,
343 version: Option<(i32, i32, i32)>,
347 fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
348 let header = &headers.header;
349 let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
350 .unwrap_or_else(|_| {
351 warn(Error::InvalidCreationDate {
352 creation_date: header.creation_date.to_string(),
356 let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
357 .unwrap_or_else(|_| {
358 warn(Error::InvalidCreationTime {
359 creation_time: header.creation_time.to_string(),
363 let creation = NaiveDateTime::new(creation_date, creation_time);
367 .trim_start_matches("@(#) SPSS DATA FILE")
373 endian: header.endian,
374 compression: header.compression,
375 n_cases: header.n_cases.map(|n| n as u64),
377 product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
378 version: headers.integer_info.as_ref().map(|ii| ii.version),
384 //pub raw: raw::Decoder,
385 pub encoding: &'static Encoding,
386 //pub variables: HashMap<DictIndex, Variable>,
387 //pub var_names: HashMap<Identifier, DictIndex>,
388 //pub dictionary: Dictionary,
389 //n_dict_indexes: usize,
390 n_generated_names: usize,
394 fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
396 self.n_generated_names += 1;
397 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
399 if !dictionary.variables.contains(&name) {
402 assert!(self.n_generated_names < usize::MAX);
408 mut headers: Headers,
409 encoding: &'static Encoding,
410 warn: impl Fn(Error),
411 ) -> Result<(Dictionary, Metadata), Error> {
412 let mut dictionary = Dictionary::new(encoding);
414 let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
415 if !file_label.is_empty() {
416 dictionary.file_label = Some(file_label);
419 for attributes in headers.file_attributes.drain(..) {
420 dictionary.attributes.extend(attributes.0 .0.into_iter())
423 // Concatenate all the document records (really there should only be one)
424 // and trim off the trailing spaces that pad them to 80 bytes.
425 dictionary.documents = headers
428 .flat_map(|record| record.lines)
429 .map(trim_end_spaces)
432 // XXX warn for weird integer format
433 // XXX warn for weird floating-point format, etc.
435 let mut decoder = Decoder {
437 n_generated_names: 0,
440 let mut header_vars = headers.variable.iter().enumerate();
441 let mut var_index_map = HashMap::new();
442 while let Some((value_index, input)) = header_vars.next() {
443 let name = trim_end_spaces(input.name.to_string());
444 let name = match Identifier::new(&name, encoding) {
446 if !dictionary.variables.contains(&name) {
449 let new_name = decoder.generate_name(&dictionary);
450 warn(Error::DuplicateVariableName {
451 duplicate_name: name.clone(),
452 new_name: new_name.clone(),
458 let new_name = decoder.generate_name(&dictionary);
459 warn(Error::InvalidVariableName {
461 new_name: new_name.clone(),
466 let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap());
468 // Set the short name the same as the long name (even if we renamed it).
469 variable.short_names = vec![name];
471 variable.label = input.label.clone();
473 variable.missing_values = input.missing_values.clone();
475 variable.print_format = decode_format(
478 |new_spec, format_error| {
479 warn(Error::InvalidPrintFormat {
481 variable: variable.name.clone(),
486 variable.write_format = decode_format(
489 |new_spec, format_error| {
490 warn(Error::InvalidWriteFormat {
492 variable: variable.name.clone(),
498 // Skip long string continuation records.
500 #[allow(unstable_name_collisions)]
501 for _ in 1..input.width.div_ceil(&8) {
502 if let Some((_, continuation)) = header_vars.next() {
503 if continuation.width == -1 {
507 return Err(Error::TBD);
511 let dict_index = dictionary.add_var(variable).unwrap();
512 assert_eq!(var_index_map.insert(value_index, dict_index), None);
515 let metadata = Metadata::decode(&headers, warn);
516 Ok((dictionary, metadata))
519 fn trim_end_spaces(mut s: String) -> String {
520 s.truncate(s.trim_end_matches(' ').len());
524 /// Returns a copy of `s` in which all lone CR and CR LF pairs have been
527 /// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
528 /// files that use CR-only line ends in the file label and extra product
530 fn fix_line_ends(s: &str) -> String {
531 let mut out = String::with_capacity(s.len());
532 let mut s = s.chars().peekable();
533 while let Some(c) = s.next() {
545 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
546 UncheckedSpec::try_from(raw)
547 .and_then(Spec::try_from)
548 .and_then(|x| x.check_width_compatibility(width))
549 .unwrap_or_else(|error| {
550 let new_format = Spec::default_for_width(width);
551 warn(new_format, error);
558 fn generate_name(&mut self) -> Identifier {
560 self.n_generated_names += 1;
561 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
563 if !self.var_names.contains_key(&name) {
566 assert!(self.n_generated_names < usize::MAX);
569 fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
570 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
572 warn(Error::MalformedString {
573 encoding: self.encoding.name().into(),
574 text: output.clone().into(),
579 fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
580 self.decode_string_cow(input, warn).into()
582 pub fn decode_identifier(
585 warn: &impl Fn(Error),
586 ) -> Result<Identifier, IdError> {
587 let s = self.decode_string_cow(input, warn);
588 Identifier::new(&s, self.encoding)
590 fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
591 let max_index = self.n_dict_indexes;
592 if dict_index == 0 || dict_index > max_index {
593 return Err(Error::InvalidDictIndex {
598 let Some(variable) = self.variables.get(&(dict_index - 1)) else {
599 return Err(Error::DictIndexIsContinuation(dict_index));
604 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
605 /// re-encoding the result back into `self.encoding` will have exactly the
606 /// same length in bytes.
608 /// XXX warn about errors?
609 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
610 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
611 // This is the common case. Usually there will be no errors.
614 // Unusual case. Don't bother to optimize it much.
615 let mut decoder = self.encoding.new_decoder_without_bom_handling();
616 let mut output = String::with_capacity(
618 .max_utf8_buffer_length_without_replacement(input.len())
621 let mut rest = input;
622 while !rest.is_empty() {
623 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
624 (DecoderResult::InputEmpty, _) => break,
625 (DecoderResult::OutputFull, _) => unreachable!(),
626 (DecoderResult::Malformed(a, b), consumed) => {
627 let skipped = a as usize + b as usize;
628 output.extend(repeat('?').take(skipped));
629 rest = &rest[consumed..];
633 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
639 pub trait TryDecode: Sized {
642 decoder: &mut Decoder,
643 input: &Self::Input<'_>,
644 warn: impl Fn(Error),
645 ) -> Result<Option<Self>, Error>;
648 pub trait Decode<Input>: Sized {
649 fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
652 impl<const N: usize> Decode<RawStr<N>> for String {
653 fn decode(decoder: &Decoder, input: &RawStr<N>, warn: impl Fn(Error)) -> Self {
654 decoder.decode_string(&input.0, &warn)
659 #[derive(Clone, Debug)]
660 pub struct HeaderRecord {
661 pub eye_catcher: String,
662 pub weight_index: Option<usize>,
663 pub n_cases: Option<u64>,
664 pub creation: NaiveDateTime,
665 pub file_label: String,
668 fn trim_end_spaces(mut s: String) -> String {
669 s.truncate(s.trim_end_matches(' ').len());
673 /// Data file info that doesn't fit in [Dictionary].
674 pub struct Metadata {
675 creation: NaiveDateTime,
677 compression: Option<Compression>,
678 n_cases: Option<u64>,
680 product_ext: Option<String>,
681 version: Option<(i32, i32, i32)>,
686 header: &crate::raw::HeaderRecord<Cow<str>>,
687 integer_info: Option<&IntegerInfoRecord>,
688 product_ext: Option<&ProductInfoRecord>,
689 warn: impl Fn(Error),
691 let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
692 .unwrap_or_else(|_| {
693 warn(Error::InvalidCreationDate {
694 creation_date: header.creation_date.to_string(),
698 let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
699 .unwrap_or_else(|_| {
700 warn(Error::InvalidCreationTime {
701 creation_time: header.creation_time.to_string(),
705 let creation = NaiveDateTime::new(creation_date, creation_time);
709 .trim_start_matches("@(#) SPSS DATA FILE")
715 endian: header.endian,
716 compression: header.compression,
717 n_cases: header.n_cases.map(|n| n as u64),
719 product_ext: product_ext.map(|pe| pe.0.clone()),
720 version: integer_info.map(|ii| ii.version),
725 impl TryDecode for HeaderRecord {
726 type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
729 _decoder: &mut Decoder,
730 input: &Self::Input<'_>,
731 warn: impl Fn(Error),
732 ) -> Result<Option<Self>, Error> {
733 let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
734 let file_label = trim_end_spaces(input.file_label.to_string());
735 let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
736 .unwrap_or_else(|_| {
737 warn(Error::InvalidCreationDate {
738 creation_date: input.creation_date.to_string(),
742 let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
743 .unwrap_or_else(|_| {
744 warn(Error::InvalidCreationTime {
745 creation_time: input.creation_time.to_string(),
749 Ok(Some(HeaderRecord {
751 weight_index: input.weight_index.map(|n| n as usize),
752 n_cases: input.n_cases.map(|n| n as u64),
753 creation: NaiveDateTime::new(creation_date, creation_time),
759 #[derive(Clone, Debug)]
760 pub struct VariableRecord {
762 pub name: Identifier,
763 pub print_format: Spec,
764 pub write_format: Spec,
765 pub missing_values: MissingValues<String>,
766 pub label: Option<String>,
770 fn parse_variable_record(
771 decoder: &mut Decoder,
772 input: &raw::VariableRecord<Cow<str>, String>,
773 warn: impl Fn(Error),
774 ) -> Result<(), Error> {
775 let width = match input.width {
776 0 => VarWidth::Numeric,
777 w @ 1..=255 => VarWidth::String(w as u16),
780 return Err(Error::InvalidVariableWidth {
781 offsets: input.offsets.clone(),
786 let name = trim_end_spaces(input.name.to_string());
787 let name = match Identifier::new(&name, decoder.encoding) {
789 if !decoder.var_names.contains_key(&name) {
792 let new_name = decoder.generate_name();
793 warn(Error::DuplicateVariableName {
794 duplicate_name: name.clone(),
795 new_name: new_name.clone(),
801 let new_name = decoder.generate_name();
802 warn(Error::InvalidVariableName {
804 new_name: new_name.clone(),
809 let variable = Variable {
810 dict_index: decoder.n_dict_indexes,
811 short_name: name.clone(),
815 decoder.n_dict_indexes += width.n_dict_indexes();
818 .insert(name.clone(), variable.dict_index)
822 .insert(variable.dict_index, variable)
825 let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
826 warn(Error::InvalidPrintFormat {
828 variable: name.clone(),
832 let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
833 warn(Error::InvalidWriteFormat {
835 variable: name.clone(),
839 let mut variable = dictionary::Variable::new(name, width);
840 variable.print_format = print_format;
841 variable.write_format = write_format;
842 variable.missing_values = input.missing_values.clone();
843 if let Some(ref label) = input.label {
844 variable.label = Some(label.to_string());
846 decoder.dictionary.add_var(variable).unwrap();
/// A decoded document record, holding its lines as strings.
///
/// The `TryDecode` impl decodes each raw line with the decoder and trims
/// trailing spaces from it.
850 #[derive(Clone, Debug)]
851 pub struct DocumentRecord(Vec<String>);
853 impl TryDecode for DocumentRecord {
854 type Input<'a> = crate::raw::DocumentRecord<RawDocumentLine>;
857 decoder: &mut Decoder,
858 input: &Self::Input<'_>,
859 warn: impl Fn(Error),
860 ) -> Result<Option<Self>, Error> {
861 Ok(Some(DocumentRecord(
865 .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
875 const NAME: &'static str;
876 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
879 #[derive(Clone, Debug)]
880 pub struct VariableSet {
882 pub vars: Vec<String>,
886 fn parse(input: &str) -> Result<Self, Error> {
887 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
888 let vars = input.split_ascii_whitespace().map(String::from).collect();
896 trait WarnOnError<T> {
897 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
899 impl<T> WarnOnError<T> for Result<T, Error> {
900 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
902 Ok(result) => Some(result),
911 #[derive(Clone, Debug)]
912 pub struct ValueLabel {
917 #[derive(Clone, Debug)]
918 pub struct ValueLabelRecord {
919 pub var_type: VarType,
920 pub labels: Vec<ValueLabel>,
921 pub variables: Vec<Identifier>,
924 impl TryDecode for ValueLabelRecord {
925 type Input<'a> = crate::raw::ValueLabelRecord<RawStr<8>, RawString>;
927 decoder: &mut Decoder,
928 input: &Self::Input<'_>,
929 warn: impl Fn(Error),
930 ) -> Result<Option<ValueLabelRecord>, Error> {
931 let variables: Vec<&Variable> = input
934 .filter_map(|&dict_index| {
936 .get_var_by_index(dict_index as usize)
937 .warn_on_error(&warn)
939 .filter(|&variable| match variable.width {
940 VarWidth::String(width) if width > 8 => {
941 warn(Error::InvalidLongStringValueLabel(
942 variable.short_name.clone(),
949 let mut i = variables.iter();
950 let Some(&first_var) = i.next() else {
953 let var_type: VarType = first_var.width.into();
955 let this_type: VarType = variable.width.into();
956 if var_type != this_type {
957 let (numeric_var, string_var) = match var_type {
958 VarType::Numeric => (first_var, variable),
959 VarType::String => (variable, first_var),
961 warn(Error::ValueLabelsDifferentTypes {
962 numeric_var: numeric_var.short_name.clone(),
963 string_var: string_var.short_name.clone(),
971 .map(|raw::ValueLabel { value, label }| {
972 let label = decoder.decode_string(&label.0, &warn);
973 let value = Value::decode(value, decoder);
974 ValueLabel { value, label }
977 let variables = variables
979 .map(|&variable| variable.short_name.clone())
981 Ok(Some(ValueLabelRecord {
/// The variable sets from a "variable set" text record.
///
/// Built by `parse`, which attempts [`VariableSet::parse`] on each line of
/// the record text.
989 #[derive(Clone, Debug)]
990 pub struct VariableSetRecord(Vec<VariableSet>);
992 impl TextRecord for VariableSetRecord {
993 const NAME: &'static str = "variable set";
994 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
995 let mut sets = Vec::new();
996 for line in input.lines() {
997 if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
1001 Ok(VariableSetRecord(sets))
1005 #[derive(Clone, Debug)]
1006 pub struct LongName {
1007 pub short_name: Identifier,
1008 pub long_name: Identifier,
1012 fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
1014 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
1016 Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
/// Short-name/long-name pairs from a long variable name record.
///
/// `parse` splits the record text on tabs, skips empty items, and splits
/// each remaining item at its first `=`.
1024 #[derive(Clone, Debug)]
1025 pub struct LongNameRecord(Vec<LongName>);
1027 impl LongNameRecord {
1028 pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1029 let mut names = Vec::new();
1030 for pair in input.split('\t').filter(|s| !s.is_empty()) {
1031 if let Some((short_name, long_name)) = pair.split_once('=') {
1032 if let Some(long_name) =
1033 LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
1035 names.push(long_name);
1041 Ok(LongNameRecord(names))
1045 #[derive(Clone, Debug)]
1046 pub struct VeryLongString {
1047 pub short_name: Identifier,
1051 impl VeryLongString {
1052 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
1053 let Some((short_name, length)) = input.split_once('=') else {
1054 return Err(Error::TBD);
1057 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
1058 let length: u16 = length.parse().map_err(|_| Error::TBD)?;
1059 if length > VarWidth::MAX_STRING {
1060 return Err(Error::TBD);
1062 Ok(VeryLongString { short_name, length })
/// Entries from a very long string record, each a [`VeryLongString`]
/// pairing a variable's short name with a length.
1066 #[derive(Clone, Debug)]
1067 pub struct VeryLongStringRecord(Vec<VeryLongString>);
1069 impl VeryLongStringRecord {
1070 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1071 let mut very_long_strings = Vec::new();
1074 .map(|s| s.trim_end_matches('\t'))
1075 .filter(|s| !s.is_empty())
1077 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
1078 very_long_strings.push(vls)
1081 Ok(VeryLongStringRecord(very_long_strings))
1085 #[derive(Clone, Debug)]
1086 pub struct Attribute {
1087 pub name: Identifier,
1088 pub values: Vec<String>,
1095 warn: &impl Fn(Error),
1096 ) -> Result<(Option<Attribute>, &'a str), Error> {
1097 let Some((name, mut input)) = input.split_once('(') else {
1098 return Err(Error::TBD);
1100 let mut values = Vec::new();
1102 let Some((value, rest)) = input.split_once('\n') else {
1103 return Err(Error::TBD);
1105 if let Some(stripped) = value
1107 .and_then(|value| value.strip_suffix('\''))
1109 values.push(stripped.into());
1112 values.push(value.into());
1114 if let Some(rest) = rest.strip_prefix(')') {
1115 let attribute = Identifier::new(name, decoder.encoding)
1116 .map_err(Error::InvalidAttributeName)
1117 .warn_on_error(warn)
1118 .map(|name| Attribute { name, values });
1119 return Ok((attribute, rest));
/// A list of [`Attribute`]s parsed together from one run of attribute text.
1126 #[derive(Clone, Debug)]
1127 pub struct AttributeSet(pub Vec<Attribute>);
1133 sentinel: Option<char>,
1134 warn: &impl Fn(Error),
1135 ) -> Result<(AttributeSet, &'a str), Error> {
1136 let mut attributes = Vec::new();
1138 match input.chars().next() {
1139 None => break input,
1140 c if c == sentinel => break &input[1..],
1142 let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
1143 if let Some(attribute) = attribute {
1144 attributes.push(attribute);
1150 Ok((AttributeSet(attributes), rest))
/// The [`AttributeSet`] from a file attribute record, parsed from the
/// record text with no sentinel character.
1154 #[derive(Clone, Debug)]
1155 pub struct FileAttributeRecord(AttributeSet);
1157 impl FileAttributeRecord {
1158 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1159 let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
1160 if !rest.is_empty() {
1163 Ok(FileAttributeRecord(set))
1167 #[derive(Clone, Debug)]
1168 pub struct VarAttributeSet {
1169 pub long_var_name: Identifier,
1170 pub attributes: AttributeSet,
1173 impl VarAttributeSet {
1177 warn: &impl Fn(Error),
1178 ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
1179 let Some((long_var_name, rest)) = input.split_once(':') else {
1180 return Err(Error::TBD);
1182 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
1183 let var_attribute = Identifier::new(long_var_name, decoder.encoding)
1184 .map_err(Error::InvalidAttributeVariableName)
1185 .warn_on_error(warn)
1186 .map(|name| VarAttributeSet {
1187 long_var_name: name,
1190 Ok((var_attribute, rest))
/// The [`VarAttributeSet`]s from a variable attribute record; `parse`
/// reads them one after another until the record text is exhausted.
1194 #[derive(Clone, Debug)]
1195 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1197 impl VariableAttributeRecord {
1198 pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1199 let mut var_attribute_sets = Vec::new();
1200 while !input.is_empty() {
1201 let Some((var_attribute, rest)) =
1202 VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
1206 if let Some(var_attribute) = var_attribute {
1207 var_attribute_sets.push(var_attribute);
1211 Ok(VariableAttributeRecord(var_attribute_sets))
1215 #[derive(Clone, Debug)]
1216 pub enum MultipleResponseType {
1219 labels: CategoryLabels,
1224 impl MultipleResponseType {
1227 mr_set: &Identifier,
1228 input: &raw::MultipleResponseType,
1229 min_width: VarWidth,
1230 warn: &impl Fn(Error),
1231 ) -> Result<Self, Error> {
1232 let mr_type = match input {
1233 raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
1234 let value = decoder.decode_string_cow(&value.0, warn);
1235 let value = match min_width {
1236 VarWidth::Numeric => {
1237 let number: f64 = value.trim().parse().map_err(|_| {
1238 Error::InvalidMDGroupCountedValue {
1239 mr_set: mr_set.clone(),
1240 number: value.into(),
1243 Value::Number(Some(number.into()))
1245 VarWidth::String(max_width) => {
1246 let value = value.trim_end_matches(' ');
1247 let width = value.len();
1248 if width > max_width as usize {
1249 return Err(Error::TooWideMDGroupCountedValue {
1250 mr_set: mr_set.clone(),
1251 value: value.into(),
1256 Value::String(value.into())
1259 MultipleResponseType::MultipleDichotomy {
1264 raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
1270 #[derive(Clone, Debug)]
1271 pub struct MultipleResponseSet {
1272 pub name: Identifier,
1273 pub min_width: VarWidth,
1274 pub max_width: VarWidth,
1276 pub mr_type: MultipleResponseType,
1277 pub dict_indexes: Vec<DictIndex>,
1280 impl MultipleResponseSet {
1283 input: &raw::MultipleResponseSet<Identifier, Cow<str>>,
1284 warn: &impl Fn(Error),
1285 ) -> Result<Self, Error> {
1286 let mr_set_name = input.name.clone();
1287 let mut dict_indexes = Vec::with_capacity(input.short_names.len());
1288 for short_name in input.short_names.iter() {
1289 let Some(&dict_index) = decoder.var_names.get(&short_name) else {
1290 warn(Error::UnknownMrSetVariable {
1291 mr_set: mr_set_name.clone(),
1292 short_name: short_name.clone(),
1296 dict_indexes.push(dict_index);
1299 match dict_indexes.len() {
1300 0 => return Err(Error::EmptyMrSet(mr_set_name)),
1301 1 => return Err(Error::OneVarMrSet(mr_set_name)),
1305 let Some((Some(min_width), Some(max_width))) = dict_indexes
1307 .map(|dict_index| decoder.variables[dict_index].width)
1308 .map(|w| (Some(w), Some(w)))
1309 .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
1311 return Err(Error::MixedMrSet(mr_set_name));
1315 MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
1317 Ok(MultipleResponseSet {
1321 label: input.label.to_string(),
/// The multiple response sets decoded from a raw multiple response record.
///
/// Sets that fail to decode are passed to the `warn` callback and left out
/// of the list.
1328 #[derive(Clone, Debug)]
1329 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
1331 impl TryDecode for MultipleResponseRecord {
1332 type Input<'a> = raw::MultipleResponseRecord<Identifier, Cow<'a, str>>;
1335 decoder: &mut Decoder,
1336 input: &Self::Input<'_>,
1337 warn: impl Fn(Error),
1338 ) -> Result<Option<Self>, Error> {
1339 let mut sets = Vec::with_capacity(input.0.len());
1340 for set in &input.0 {
1341 match MultipleResponseSet::decode(decoder, set, &warn) {
1342 Ok(set) => sets.push(set),
1343 Err(error) => warn(error),
1346 Ok(Some(MultipleResponseRecord(sets)))
1350 #[derive(Clone, Debug)]
1351 pub struct LongStringValueLabels {
1352 pub var_name: Identifier,
1353 pub width: VarWidth,
1354 pub labels: Vec<ValueLabel>,
1357 impl LongStringValueLabels {
1360 input: &raw::LongStringValueLabels<RawString>,
1361 warn: &impl Fn(Error),
1362 ) -> Result<Self, Error> {
1363 let var_name = decoder.decode_string(&input.var_name.0, warn);
1364 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
1365 .map_err(Error::InvalidLongStringValueLabelName)?;
1368 let max_width = VarWidth::MAX_STRING;
1369 if input.width < 9 || input.width > max_width as u32 {
1370 return Err(Error::InvalidLongValueLabelWidth {
1377 let width = input.width as u16;
1379 let mut labels = Vec::with_capacity(input.labels.len());
1380 for (value, label) in input.labels.iter() {
1381 let value = Value::String(decoder.decode_exact_length(&value.0).into());
1382 let label = decoder.decode_string(&label.0, warn);
1383 labels.push(ValueLabel { value, label });
1386 Ok(LongStringValueLabels {
1388 width: VarWidth::String(width),
/// The per-variable label sets decoded from a raw long string value label
/// record.
///
/// Entries that fail to decode are passed to the `warn` callback and left
/// out of the list.
1394 #[derive(Clone, Debug)]
1395 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
1397 impl TryDecode for LongStringValueLabelRecord {
1398 type Input<'a> = raw::LongStringValueLabelRecord<RawString>;
1401 decoder: &mut Decoder,
1402 input: &Self::Input<'_>,
1403 warn: impl Fn(Error),
1404 ) -> Result<Option<Self>, Error> {
1405 let mut labels = Vec::with_capacity(input.0.len());
1406 for label in &input.0 {
1407 match LongStringValueLabels::decode(decoder, label, &warn) {
1408 Ok(set) => labels.push(set),
1409 Err(error) => warn(error),
1412 Ok(Some(LongStringValueLabelRecord(labels)))
1418 use encoding_rs::WINDOWS_1252;
1422 let mut s = String::new();
1423 s.push(char::REPLACEMENT_CHARACTER);
1424 let encoded = WINDOWS_1252.encode(&s).0;
1425 let decoded = WINDOWS_1252.decode(&encoded[..]).0;
1426 println!("{:?}", decoded);
1431 let charset: Vec<u8> = (0..=255).collect();
1432 println!("{}", charset.len());
1433 let decoded = WINDOWS_1252.decode(&charset[..]).0;
1434 println!("{}", decoded.len());
1435 let encoded = WINDOWS_1252.encode(&decoded[..]).0;
1436 println!("{}", encoded.len());
1437 assert_eq!(&charset[..], &encoded[..]);