1 use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
4 dictionary::{Dictionary, VarWidth, Variable},
5 encoding::Error as EncodingError,
7 format::{Error as FormatError, Spec, UncheckedSpec},
8 identifier::{Error as IdError, Identifier},
10 self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
11 FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
12 LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
13 NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabelRecord, VarDisplayRecord,
14 VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader,
18 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
19 use encoding_rs::Encoding;
21 use thiserror::Error as ThisError;
23 pub use crate::raw::{CategoryLabels, Compression};
25 #[derive(ThisError, Debug)]
27 #[error("Missing header record")]
30 // XXX this is an internal error
31 #[error("More than one file header record")]
32 DuplicateHeaderRecord,
35 EncodingError(EncodingError),
37 #[error("Using default encoding {0}.")]
38 UsingDefaultEncoding(String),
40 #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
41 InvalidVariableWidth { offsets: Range<u64>, width: i32 },
43 #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
44 InvalidLongMissingValueFormat,
46 #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
47 InvalidCreationDate { creation_date: String },
49 #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
50 InvalidCreationTime { creation_time: String },
52 #[error("{id_error} Renaming variable to {new_name}.")]
59 "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
64 format_error: FormatError,
68 "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
73 format_error: FormatError,
76 #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
77 DuplicateVariableName {
78 duplicate_name: Identifier,
82 #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
83 InvalidDictIndex { dict_index: usize, max_index: usize },
85 #[error("Dictionary index {0} refers to a long string continuation.")]
86 DictIndexIsContinuation(usize),
88 #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")]
89 LongStringContinuationIndexes { offset: u64, indexes: Vec<u32> },
92 "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end
94 InvalidLongStringValueLabels {
96 variables: Vec<Identifier>,
99 #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
100 ValueLabelsDifferentTypes {
101 numeric_var: Identifier,
102 string_var: Identifier,
105 #[error("Invalid multiple response set name. {0}")]
106 InvalidMrSetName(IdError),
108 #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
109 UnknownMrSetVariable {
111 short_name: Identifier,
114 #[error("Multiple response set {0} has no variables.")]
115 EmptyMrSet(Identifier),
117 #[error("Multiple response set {0} has only one variable.")]
118 OneVarMrSet(Identifier),
120 #[error("Multiple response set {0} contains both string and numeric variables.")]
121 MixedMrSet(Identifier),
124 "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
126 InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
128 #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
129 TooWideMDGroupCountedValue {
136 #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
137 InvalidLongValueLabelWidth {
144 #[error("Invalid attribute name. {0}")]
145 InvalidAttributeName(IdError),
147 #[error("Invalid short name in long variable name record. {0}")]
148 InvalidShortName(IdError),
150 #[error("Invalid name in long variable name record. {0}")]
151 InvalidLongName(IdError),
153 #[error("Invalid variable name in very long string record. {0}")]
154 InvalidLongStringName(IdError),
156 #[error("Invalid variable name in long string value label record. {0}")]
157 InvalidLongStringValueLabelName(IdError),
159 #[error("Invalid variable name in attribute record. {0}")]
160 InvalidAttributeVariableName(IdError),
162 // XXX This is risky because `text` might be arbitrarily long.
163 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
164 MalformedString { encoding: String, text: String },
166 #[error("Details TBD")]
/// Index used to identify a variable; it is the element type of
/// `MultipleResponseSet::dict_indexes`.
// NOTE(review): presumably a 0-based position within the dictionary — confirm
// against the code that produces these indexes.
170 type DictIndex = usize;
172 #[derive(Clone, Debug)]
174 pub header: HeaderRecord<String>,
175 pub variable: Vec<VariableRecord<String, String>>,
176 pub value_label: Vec<ValueLabelRecord<RawStr<8>, String>>,
177 pub document: Vec<DocumentRecord<String>>,
178 pub integer_info: Option<IntegerInfoRecord>,
179 pub float_info: Option<FloatInfoRecord>,
180 pub var_display: Option<VarDisplayRecord>,
181 pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
182 pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
183 pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier, String>>,
184 pub encoding: Option<EncodingRecord>,
185 pub number_of_cases: Option<NumberOfCasesRecord>,
186 pub variable_sets: Vec<VariableSetRecord>,
187 pub product_info: Option<ProductInfoRecord>,
188 pub long_names: Vec<LongNamesRecord>,
189 pub very_long_strings: Vec<VeryLongStringsRecord>,
190 pub file_attributes: Vec<FileAttributeRecord>,
191 pub variable_attributes: Vec<VariableAttributeRecord>,
192 pub other_extension: Vec<Extension>,
193 pub end_of_headers: Option<u32>,
194 pub z_header: Option<ZHeader>,
195 pub z_trailer: Option<ZTrailer>,
196 pub cases: Option<Rc<RefCell<Cases>>>,
199 fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
210 pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
211 let mut file_header = Vec::new();
212 let mut variable = Vec::new();
213 let mut value_label = Vec::new();
214 let mut document = Vec::new();
215 let mut integer_info = Vec::new();
216 let mut float_info = Vec::new();
217 let mut var_display = Vec::new();
218 let mut multiple_response = Vec::new();
219 let mut long_string_value_labels = Vec::new();
220 let mut long_string_missing_values = Vec::new();
221 let mut encoding = Vec::new();
222 let mut number_of_cases = Vec::new();
223 let mut variable_sets = Vec::new();
224 let mut product_info = Vec::new();
225 let mut long_names = Vec::new();
226 let mut very_long_strings = Vec::new();
227 let mut file_attributes = Vec::new();
228 let mut variable_attributes = Vec::new();
229 let mut other_extension = Vec::new();
230 let mut end_of_headers = Vec::new();
231 let mut z_header = Vec::new();
232 let mut z_trailer = Vec::new();
233 let mut cases = Vec::new();
235 for header in headers {
237 DecodedRecord::Header(record) => {
238 file_header.push(record);
240 DecodedRecord::Variable(record) => {
241 variable.push(record);
243 DecodedRecord::ValueLabel(record) => {
244 value_label.push(record);
246 DecodedRecord::Document(record) => {
247 document.push(record);
249 DecodedRecord::IntegerInfo(record) => {
250 integer_info.push(record);
252 DecodedRecord::FloatInfo(record) => {
253 float_info.push(record);
255 DecodedRecord::VariableSets(record) => {
256 variable_sets.push(record);
258 DecodedRecord::VarDisplay(record) => {
259 var_display.push(record);
261 DecodedRecord::MultipleResponse(record) => {
262 multiple_response.push(record);
264 DecodedRecord::LongStringValueLabels(record) => {
265 long_string_value_labels.push(record)
267 DecodedRecord::LongStringMissingValues(record) => {
268 long_string_missing_values.push(record);
270 DecodedRecord::Encoding(record) => {
271 encoding.push(record);
273 DecodedRecord::NumberOfCases(record) => {
274 number_of_cases.push(record);
276 DecodedRecord::ProductInfo(record) => {
277 product_info.push(record);
279 DecodedRecord::LongNames(record) => {
280 long_names.push(record);
282 DecodedRecord::VeryLongStrings(record) => {
283 very_long_strings.push(record);
285 DecodedRecord::FileAttributes(record) => {
286 file_attributes.push(record);
288 DecodedRecord::VariableAttributes(record) => {
289 variable_attributes.push(record);
291 DecodedRecord::OtherExtension(record) => {
292 other_extension.push(record);
294 DecodedRecord::EndOfHeaders(record) => {
295 end_of_headers.push(record);
297 DecodedRecord::ZHeader(record) => {
298 z_header.push(record);
300 DecodedRecord::ZTrailer(record) => {
301 z_trailer.push(record);
303 DecodedRecord::Cases(record) => {
309 let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
311 return Err(Error::MissingHeaderRecord);
319 integer_info: take_first(integer_info, || warn(Error::TBD)),
320 float_info: take_first(float_info, || warn(Error::TBD)),
321 var_display: take_first(var_display, || warn(Error::TBD)),
323 long_string_value_labels,
324 long_string_missing_values,
325 encoding: take_first(encoding, || warn(Error::TBD)),
326 number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
328 product_info: take_first(product_info, || warn(Error::TBD)),
334 end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
335 z_header: take_first(z_header, || warn(Error::TBD)),
336 z_trailer: take_first(z_trailer, || warn(Error::TBD)),
337 cases: take_first(cases, || warn(Error::TBD)),
342 pub struct Metadata {
343 creation: NaiveDateTime,
345 compression: Option<Compression>,
346 n_cases: Option<u64>,
348 product_ext: Option<String>,
349 version: Option<(i32, i32, i32)>,
/// Builds the file-level metadata from the parsed `headers`.
///
/// The creation date and time strings of the file header are parsed with
/// chrono. When either one fails to parse, the problem is reported through
/// `warn` and a fallback is used instead of failing (the warning texts name
/// 01 Jan 1970 and midnight as the fallbacks).
353 fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
354 let header = &headers.header;
// NOTE(review): `%Y` is chrono's full-year specifier, whereas the
// `InvalidCreationDate` warning text describes the expected input as
// "DD MMM YY". Confirm whether the two-digit `%y` was intended here.
355 let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
356 .unwrap_or_else(|_| {
357 warn(Error::InvalidCreationDate {
358 creation_date: header.creation_date.to_string(),
// The creation time is expected in 24-hour "HH:MM:SS" form.
362 let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
363 .unwrap_or_else(|_| {
364 warn(Error::InvalidCreationTime {
365 creation_time: header.creation_time.to_string(),
// Combine the separately parsed date and time into a single timestamp.
369 let creation = NaiveDateTime::new(creation_date, creation_time);
373 .trim_start_matches("@(#) SPSS DATA FILE")
379 endian: header.endian,
380 compression: header.compression,
381 n_cases: header.n_cases.map(|n| n as u64),
// `product_info` and `integer_info` are optional records, so the fields
// derived from them are `None` when the record is absent.
383 product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
384 version: headers.integer_info.as_ref().map(|ii| ii.version),
390 //pub raw: raw::Decoder,
391 pub encoding: &'static Encoding,
392 //pub variables: HashMap<DictIndex, Variable>,
393 //pub var_names: HashMap<Identifier, DictIndex>,
394 //pub dictionary: Dictionary,
395 //n_dict_indexes: usize,
396 n_generated_names: usize,
400 fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
402 self.n_generated_names += 1;
403 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
405 if !dictionary.variables.contains(&name) {
408 assert!(self.n_generated_names < usize::MAX);
414 mut headers: Headers,
415 encoding: &'static Encoding,
416 warn: impl Fn(Error),
417 ) -> Result<(Dictionary, Metadata), Error> {
418 let mut dictionary = Dictionary::new(encoding);
420 let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
421 if !file_label.is_empty() {
422 dictionary.file_label = Some(file_label);
425 for attributes in headers.file_attributes.drain(..) {
426 dictionary.attributes.extend(attributes.0 .0.into_iter())
429 // Concatenate all the document records (really there should only be one)
430 // and trim off the trailing spaces that pad them to 80 bytes.
431 dictionary.documents = headers
434 .flat_map(|record| record.lines)
435 .map(trim_end_spaces)
438 // XXX warn for weird integer format
439 // XXX warn for weird floating-point format, etc.
441 let mut decoder = Decoder {
443 n_generated_names: 0,
446 let mut header_vars = headers.variable.iter().enumerate();
447 let mut var_index_map = HashMap::new();
448 while let Some((value_index, input)) = header_vars.next() {
449 let name = trim_end_spaces(input.name.to_string());
450 let name = match Identifier::new(&name, encoding) {
452 if !dictionary.variables.contains(&name) {
455 let new_name = decoder.generate_name(&dictionary);
456 warn(Error::DuplicateVariableName {
457 duplicate_name: name.clone(),
458 new_name: new_name.clone(),
464 let new_name = decoder.generate_name(&dictionary);
465 warn(Error::InvalidVariableName {
467 new_name: new_name.clone(),
472 let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap());
474 // Set the short name the same as the long name (even if we renamed it).
475 variable.short_names = vec![name];
477 variable.label = input.label.clone();
479 variable.missing_values = input.missing_values.clone();
481 variable.print_format = decode_format(
484 |new_spec, format_error| {
485 warn(Error::InvalidPrintFormat {
487 variable: variable.name.clone(),
492 variable.write_format = decode_format(
495 |new_spec, format_error| {
496 warn(Error::InvalidWriteFormat {
498 variable: variable.name.clone(),
504 // Skip long string continuation records.
506 #[allow(unstable_name_collisions)]
507 for _ in 1..input.width.div_ceil(&8) {
508 if let Some((_, continuation)) = header_vars.next() {
509 if continuation.width == -1 {
513 return Err(Error::TBD);
517 let dict_index = dictionary.add_var(variable).unwrap();
518 assert_eq!(var_index_map.insert(value_index, dict_index), None);
521 for record in headers.value_label.drain(..) {
522 let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len());
523 let mut continuation_indexes = Vec::new();
524 let mut long_string_variables = Vec::new();
525 for value_index in record.dict_indexes.iter() {
526 if let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) {
527 let variable = &dictionary.variables[*dict_index];
528 if variable.width.is_long_string() {
529 long_string_variables.push(variable.name.clone());
531 dict_indexes.push(*dict_index);
534 continuation_indexes.push(*value_index);
537 if !continuation_indexes.is_empty() {
538 warn(Error::LongStringContinuationIndexes {
539 offset: record.offsets.start,
540 indexes: continuation_indexes,
543 if !long_string_variables.is_empty() {
544 warn(Error::InvalidLongStringValueLabels {
545 offsets: record.offsets.clone(),
546 variables: long_string_variables,
551 let metadata = Metadata::decode(&headers, warn);
552 Ok((dictionary, metadata))
555 fn trim_end_spaces(mut s: String) -> String {
556 s.truncate(s.trim_end_matches(' ').len());
560 /// Returns a copy of `s` in which all lone CR and CR LF pairs have been
563 /// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
564 /// files that use CR-only line ends in the file label and extra product
566 fn fix_line_ends(s: &str) -> String {
567 let mut out = String::with_capacity(s.len());
568 let mut s = s.chars().peekable();
569 while let Some(c) = s.next() {
581 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
582 UncheckedSpec::try_from(raw)
583 .and_then(Spec::try_from)
584 .and_then(|x| x.check_width_compatibility(width))
585 .unwrap_or_else(|error| {
586 let new_format = Spec::default_for_width(width);
587 warn(new_format, error);
594 fn generate_name(&mut self) -> Identifier {
596 self.n_generated_names += 1;
597 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
599 if !self.var_names.contains_key(&name) {
602 assert!(self.n_generated_names < usize::MAX);
605 fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
606 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
608 warn(Error::MalformedString {
609 encoding: self.encoding.name().into(),
610 text: output.clone().into(),
615 fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
616 self.decode_string_cow(input, warn).into()
618 pub fn decode_identifier(
621 warn: &impl Fn(Error),
622 ) -> Result<Identifier, IdError> {
623 let s = self.decode_string_cow(input, warn);
624 Identifier::new(&s, self.encoding)
626 fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
627 let max_index = self.n_dict_indexes;
628 if dict_index == 0 || dict_index > max_index {
629 return Err(Error::InvalidDictIndex {
634 let Some(variable) = self.variables.get(&(dict_index - 1)) else {
635 return Err(Error::DictIndexIsContinuation(dict_index));
640 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
641 /// re-encoding the result back into `self.encoding` will have exactly the
642 /// same length in bytes.
644 /// XXX warn about errors?
645 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
646 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
647 // This is the common case. Usually there will be no errors.
650 // Unusual case. Don't bother to optimize it much.
651 let mut decoder = self.encoding.new_decoder_without_bom_handling();
652 let mut output = String::with_capacity(
654 .max_utf8_buffer_length_without_replacement(input.len())
657 let mut rest = input;
658 while !rest.is_empty() {
659 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
660 (DecoderResult::InputEmpty, _) => break,
661 (DecoderResult::OutputFull, _) => unreachable!(),
662 (DecoderResult::Malformed(a, b), consumed) => {
663 let skipped = a as usize + b as usize;
664 output.extend(repeat('?').take(skipped));
665 rest = &rest[consumed..];
669 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
675 pub trait TryDecode: Sized {
678 decoder: &mut Decoder,
679 input: &Self::Input<'_>,
680 warn: impl Fn(Error),
681 ) -> Result<Option<Self>, Error>;
684 pub trait Decode<Input>: Sized {
685 fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
688 impl<const N: usize> Decode<RawStr<N>> for String {
689 fn decode(decoder: &Decoder, input: &RawStr<N>, warn: impl Fn(Error)) -> Self {
690 decoder.decode_string(&input.0, &warn)
695 #[derive(Clone, Debug)]
696 pub struct HeaderRecord {
697 pub eye_catcher: String,
698 pub weight_index: Option<usize>,
699 pub n_cases: Option<u64>,
700 pub creation: NaiveDateTime,
701 pub file_label: String,
704 fn trim_end_spaces(mut s: String) -> String {
705 s.truncate(s.trim_end_matches(' ').len());
709 /// Data file info that doesn't fit in [Dictionary].
710 pub struct Metadata {
711 creation: NaiveDateTime,
713 compression: Option<Compression>,
714 n_cases: Option<u64>,
716 product_ext: Option<String>,
717 version: Option<(i32, i32, i32)>,
722 header: &crate::raw::HeaderRecord<Cow<str>>,
723 integer_info: Option<&IntegerInfoRecord>,
724 product_ext: Option<&ProductInfoRecord>,
725 warn: impl Fn(Error),
727 let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
728 .unwrap_or_else(|_| {
729 warn(Error::InvalidCreationDate {
730 creation_date: header.creation_date.to_string(),
734 let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
735 .unwrap_or_else(|_| {
736 warn(Error::InvalidCreationTime {
737 creation_time: header.creation_time.to_string(),
741 let creation = NaiveDateTime::new(creation_date, creation_time);
745 .trim_start_matches("@(#) SPSS DATA FILE")
751 endian: header.endian,
752 compression: header.compression,
753 n_cases: header.n_cases.map(|n| n as u64),
755 product_ext: product_ext.map(|pe| pe.0.clone()),
756 version: integer_info.map(|ii| ii.version),
761 impl TryDecode for HeaderRecord {
762 type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
765 _decoder: &mut Decoder,
766 input: &Self::Input<'_>,
767 warn: impl Fn(Error),
768 ) -> Result<Option<Self>, Error> {
769 let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
770 let file_label = trim_end_spaces(input.file_label.to_string());
771 let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
772 .unwrap_or_else(|_| {
773 warn(Error::InvalidCreationDate {
774 creation_date: input.creation_date.to_string(),
778 let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
779 .unwrap_or_else(|_| {
780 warn(Error::InvalidCreationTime {
781 creation_time: input.creation_time.to_string(),
785 Ok(Some(HeaderRecord {
787 weight_index: input.weight_index.map(|n| n as usize),
788 n_cases: input.n_cases.map(|n| n as u64),
789 creation: NaiveDateTime::new(creation_date, creation_time),
795 #[derive(Clone, Debug)]
796 pub struct VariableRecord {
798 pub name: Identifier,
799 pub print_format: Spec,
800 pub write_format: Spec,
801 pub missing_values: MissingValues<String>,
802 pub label: Option<String>,
806 fn parse_variable_record(
807 decoder: &mut Decoder,
808 input: &raw::VariableRecord<Cow<str>, String>,
809 warn: impl Fn(Error),
810 ) -> Result<(), Error> {
811 let width = match input.width {
812 0 => VarWidth::Numeric,
813 w @ 1..=255 => VarWidth::String(w as u16),
816 return Err(Error::InvalidVariableWidth {
817 offsets: input.offsets.clone(),
822 let name = trim_end_spaces(input.name.to_string());
823 let name = match Identifier::new(&name, decoder.encoding) {
825 if !decoder.var_names.contains_key(&name) {
828 let new_name = decoder.generate_name();
829 warn(Error::DuplicateVariableName {
830 duplicate_name: name.clone(),
831 new_name: new_name.clone(),
837 let new_name = decoder.generate_name();
838 warn(Error::InvalidVariableName {
840 new_name: new_name.clone(),
845 let variable = Variable {
846 dict_index: decoder.n_dict_indexes,
847 short_name: name.clone(),
851 decoder.n_dict_indexes += width.n_dict_indexes();
854 .insert(name.clone(), variable.dict_index)
858 .insert(variable.dict_index, variable)
861 let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
862 warn(Error::InvalidPrintFormat {
864 variable: name.clone(),
868 let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
869 warn(Error::InvalidWriteFormat {
871 variable: name.clone(),
875 let mut variable = dictionary::Variable::new(name, width);
876 variable.print_format = print_format;
877 variable.write_format = write_format;
878 variable.missing_values = input.missing_values.clone();
879 if let Some(ref label) = input.label {
880 variable.label = Some(label.to_string());
882 decoder.dictionary.add_var(variable).unwrap();
/// Decoded form of a document record, holding its text as `String`s.
///
/// The `TryDecode` implementation fills this in by decoding each raw entry
/// with the decoder's string decoding and then trimming trailing spaces.
886 #[derive(Clone, Debug)]
887 pub struct DocumentRecord(Vec<String>);
889 impl TryDecode for DocumentRecord {
890 type Input<'a> = crate::raw::DocumentRecord<RawDocumentLine>;
893 decoder: &mut Decoder,
894 input: &Self::Input<'_>,
895 warn: impl Fn(Error),
896 ) -> Result<Option<Self>, Error> {
897 Ok(Some(DocumentRecord(
901 .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
911 const NAME: &'static str;
912 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
915 #[derive(Clone, Debug)]
916 pub struct VariableSet {
918 pub vars: Vec<String>,
922 fn parse(input: &str) -> Result<Self, Error> {
923 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
924 let vars = input.split_ascii_whitespace().map(String::from).collect();
932 trait WarnOnError<T> {
933 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
935 impl<T> WarnOnError<T> for Result<T, Error> {
936 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
938 Ok(result) => Some(result),
947 #[derive(Clone, Debug)]
948 pub struct ValueLabel {
953 #[derive(Clone, Debug)]
954 pub struct ValueLabelRecord {
955 pub var_type: VarType,
956 pub labels: Vec<ValueLabel>,
957 pub variables: Vec<Identifier>,
960 impl TryDecode for ValueLabelRecord {
961 type Input<'a> = crate::raw::ValueLabelRecord<RawStr<8>, RawString>;
963 decoder: &mut Decoder,
964 input: &Self::Input<'_>,
965 warn: impl Fn(Error),
966 ) -> Result<Option<ValueLabelRecord>, Error> {
967 let variables: Vec<&Variable> = input
970 .filter_map(|&dict_index| {
972 .get_var_by_index(dict_index as usize)
973 .warn_on_error(&warn)
975 .filter(|&variable| match variable.width {
976 VarWidth::String(width) if width > 8 => {
977 warn(Error::InvalidLongStringValueLabel(
978 variable.short_name.clone(),
985 let mut i = variables.iter();
986 let Some(&first_var) = i.next() else {
989 let var_type: VarType = first_var.width.into();
991 let this_type: VarType = variable.width.into();
992 if var_type != this_type {
993 let (numeric_var, string_var) = match var_type {
994 VarType::Numeric => (first_var, variable),
995 VarType::String => (variable, first_var),
997 warn(Error::ValueLabelsDifferentTypes {
998 numeric_var: numeric_var.short_name.clone(),
999 string_var: string_var.short_name.clone(),
1007 .map(|raw::ValueLabel { value, label }| {
1008 let label = decoder.decode_string(&label.0, &warn);
1009 let value = Value::decode(value, decoder);
1010 ValueLabel { value, label }
1013 let variables = variables
1015 .map(|&variable| variable.short_name.clone())
1017 Ok(Some(ValueLabelRecord {
/// The variable sets parsed from a "variable set" text record.
///
/// The `TextRecord::parse` implementation processes the input one line at a
/// time with `VariableSet::parse`, routing parse failures through
/// `warn_on_error`.
1025 #[derive(Clone, Debug)]
1026 pub struct VariableSetRecord(Vec<VariableSet>);
1028 impl TextRecord for VariableSetRecord {
1029 const NAME: &'static str = "variable set";
1030 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1031 let mut sets = Vec::new();
1032 for line in input.lines() {
1033 if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
1037 Ok(VariableSetRecord(sets))
1041 #[derive(Clone, Debug)]
1042 pub struct LongName {
1043 pub short_name: Identifier,
1044 pub long_name: Identifier,
1048 fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
1050 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
1052 Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
/// The short-name/long-name pairs parsed from a long variable name record.
///
/// `parse` splits the input on tab characters, skips empty pieces, and splits
/// each remaining piece at its first `=` into a short name and a long name.
1060 #[derive(Clone, Debug)]
1061 pub struct LongNameRecord(Vec<LongName>);
1063 impl LongNameRecord {
1064 pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1065 let mut names = Vec::new();
1066 for pair in input.split('\t').filter(|s| !s.is_empty()) {
1067 if let Some((short_name, long_name)) = pair.split_once('=') {
1068 if let Some(long_name) =
1069 LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
1071 names.push(long_name);
1077 Ok(LongNameRecord(names))
1081 #[derive(Clone, Debug)]
1082 pub struct VeryLongString {
1083 pub short_name: Identifier,
1087 impl VeryLongString {
1088 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
1089 let Some((short_name, length)) = input.split_once('=') else {
1090 return Err(Error::TBD);
1093 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
1094 let length: u16 = length.parse().map_err(|_| Error::TBD)?;
1095 if length > VarWidth::MAX_STRING {
1096 return Err(Error::TBD);
1098 Ok(VeryLongString { short_name, length })
/// The entries parsed from a very long string record.
///
/// `parse` keeps every `VeryLongString` that parses successfully; entries that
/// fail to parse are routed through `warn_on_error`.
1102 #[derive(Clone, Debug)]
1103 pub struct VeryLongStringRecord(Vec<VeryLongString>);
1105 impl VeryLongStringRecord {
1106 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1107 let mut very_long_strings = Vec::new();
1110 .map(|s| s.trim_end_matches('\t'))
1111 .filter(|s| !s.is_empty())
1113 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
1114 very_long_strings.push(vls)
1117 Ok(VeryLongStringRecord(very_long_strings))
1121 #[derive(Clone, Debug)]
1122 pub struct Attribute {
1123 pub name: Identifier,
1124 pub values: Vec<String>,
1131 warn: &impl Fn(Error),
1132 ) -> Result<(Option<Attribute>, &'a str), Error> {
1133 let Some((name, mut input)) = input.split_once('(') else {
1134 return Err(Error::TBD);
1136 let mut values = Vec::new();
1138 let Some((value, rest)) = input.split_once('\n') else {
1139 return Err(Error::TBD);
1141 if let Some(stripped) = value
1143 .and_then(|value| value.strip_suffix('\''))
1145 values.push(stripped.into());
1148 values.push(value.into());
1150 if let Some(rest) = rest.strip_prefix(')') {
1151 let attribute = Identifier::new(name, decoder.encoding)
1152 .map_err(Error::InvalidAttributeName)
1153 .warn_on_error(warn)
1154 .map(|name| Attribute { name, values });
1155 return Ok((attribute, rest));
/// A list of parsed attributes, in the order they were read.
///
/// Built by `AttributeSet::parse`, which keeps reading attributes until the
/// input is exhausted or the optional sentinel character is reached.
1162 #[derive(Clone, Debug)]
1163 pub struct AttributeSet(pub Vec<Attribute>);
1169 sentinel: Option<char>,
1170 warn: &impl Fn(Error),
1171 ) -> Result<(AttributeSet, &'a str), Error> {
1172 let mut attributes = Vec::new();
1174 match input.chars().next() {
1175 None => break input,
1176 c if c == sentinel => break &input[1..],
1178 let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
1179 if let Some(attribute) = attribute {
1180 attributes.push(attribute);
1186 Ok((AttributeSet(attributes), rest))
/// The attribute set parsed from a file attribute record.
///
/// `parse` reads the set with `AttributeSet::parse`, passing `None` as the
/// sentinel character.
1190 #[derive(Clone, Debug)]
1191 pub struct FileAttributeRecord(AttributeSet);
1193 impl FileAttributeRecord {
1194 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1195 let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
1196 if !rest.is_empty() {
1199 Ok(FileAttributeRecord(set))
1203 #[derive(Clone, Debug)]
1204 pub struct VarAttributeSet {
1205 pub long_var_name: Identifier,
1206 pub attributes: AttributeSet,
1209 impl VarAttributeSet {
1213 warn: &impl Fn(Error),
1214 ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
1215 let Some((long_var_name, rest)) = input.split_once(':') else {
1216 return Err(Error::TBD);
1218 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
1219 let var_attribute = Identifier::new(long_var_name, decoder.encoding)
1220 .map_err(Error::InvalidAttributeVariableName)
1221 .warn_on_error(warn)
1222 .map(|name| VarAttributeSet {
1223 long_var_name: name,
1226 Ok((var_attribute, rest))
/// The per-variable attribute sets parsed from a variable attribute record.
///
/// `parse` calls `VarAttributeSet::parse` repeatedly while input remains and
/// keeps each attribute set that it yields.
1230 #[derive(Clone, Debug)]
1231 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1233 impl VariableAttributeRecord {
1234 pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1235 let mut var_attribute_sets = Vec::new();
1236 while !input.is_empty() {
1237 let Some((var_attribute, rest)) =
1238 VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
1242 if let Some(var_attribute) = var_attribute {
1243 var_attribute_sets.push(var_attribute);
1247 Ok(VariableAttributeRecord(var_attribute_sets))
1251 #[derive(Clone, Debug)]
1252 pub enum MultipleResponseType {
1255 labels: CategoryLabels,
1260 impl MultipleResponseType {
// Converts a raw multiple response type into its decoded counterpart.
//
// Parameters visible here:
// * `mr_set` - name of the owning set; it is only cloned into error values.
// * `input` - the raw type being converted.
// * `min_width` - decides how a dichotomy's counted value is interpreted:
//   `VarWidth::Numeric` parses it as a number, `VarWidth::String(..)` keeps
//   it as text and checks its length.
// * `warn` - handed to `decode_string_cow`.
//
// Errors:
// * `Error::InvalidMDGroupCountedValue` when a numeric counted value does not
//   parse as `f64`.
// * `Error::TooWideMDGroupCountedValue` when a string counted value is longer
//   than the width carried by `min_width`.
//
// NOTE(review): the `fn` line and any parameter before `mr_set` are not
// visible here; the call in `MultipleResponseSet::decode` passes `decoder`
// first - confirm.
1263 mr_set: &Identifier,
1264 input: &raw::MultipleResponseType,
1265 min_width: VarWidth,
1266 warn: &impl Fn(Error),
1267 ) -> Result<Self, Error> {
1268 let mr_type = match input {
1269 raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
// Decode the raw counted value to text before interpreting it.
1270 let value = decoder.decode_string_cow(&value.0, warn);
1271 let value = match min_width {
1272 VarWidth::Numeric => {
// Surrounding whitespace is ignored when parsing the number.
1273 let number: f64 = value.trim().parse().map_err(|_| {
1274 Error::InvalidMDGroupCountedValue {
1275 mr_set: mr_set.clone(),
1276 number: value.into(),
1279 Value::Number(Some(number.into()))
// The width bound here is named `max_width`, but it is the payload of `min_width`.
1281 VarWidth::String(max_width) => {
// Only trailing ASCII spaces are stripped; other whitespace is kept.
1282 let value = value.trim_end_matches(' ');
// `len()` on a `str` counts UTF-8 bytes of the decoded text.
// NOTE(review): that can differ from the value's width in the file's own
// encoding - confirm that comparing it with the variable width is intended.
1283 let width = value.len();
1284 if width > max_width as usize {
1285 return Err(Error::TooWideMDGroupCountedValue {
1286 mr_set: mr_set.clone(),
1287 value: value.into(),
1292 Value::String(value.into())
1295 MultipleResponseType::MultipleDichotomy {
// The multiple-category case carries no data and maps across directly.
1300 raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
/// A decoded multiple response set, as built by `MultipleResponseSet::decode`.
///
/// NOTE(review): `decode` also initializes a `label` field whose declaration
/// is not visible here - confirm against the full definition.
1306 #[derive(Clone, Debug)]
1307 pub struct MultipleResponseSet {
// Name of the set.
1308 pub name: Identifier,
// `decode` derives a `min_width`/`max_width` pair by reducing the member
// variables' widths with `VarWidth::narrower` and `VarWidth::wider`.
1309 pub min_width: VarWidth,
1310 pub max_width: VarWidth,
// Kind of set; see `MultipleResponseType`.
1312 pub mr_type: MultipleResponseType,
// `decode` collects these by looking each short name up in `decoder.var_names`.
1313 pub dict_indexes: Vec<DictIndex>,
1316 impl MultipleResponseSet {
// Decodes one raw multiple response set.
//
// Steps visible in the body:
// 1. Look up every short name in `decoder.var_names`; a name that is not
//    found is reported through `warn` as `Error::UnknownMrSetVariable`.
// 2. Reject the set with `Error::EmptyMrSet` or `Error::OneVarMrSet` when
//    zero or one variable was resolved.
// 3. Reduce the resolved variables' widths to a narrowest/widest pair and
//    return `Error::MixedMrSet` when that pair is not available.
// 4. Decode the set's type with `MultipleResponseType::decode`, passing the
//    narrowest width, and assemble the result.
//
// NOTE(review): the `fn` line and any parameter before `input` are not
// visible here; the body reads `decoder`, so presumably it is the first
// parameter - confirm.
1319 input: &raw::MultipleResponseSet<Identifier, Cow<str>>,
1320 warn: &impl Fn(Error),
1321 ) -> Result<Self, Error> {
1322 let mr_set_name = input.name.clone();
// At most one index per short name, so this capacity is an upper bound.
1323 let mut dict_indexes = Vec::with_capacity(input.short_names.len());
1324 for short_name in input.short_names.iter() {
1325 let Some(&dict_index) = decoder.var_names.get(&short_name) else {
// NOTE(review): what follows the warning is not visible here; presumably
// the unknown name is skipped - confirm.
1326 warn(Error::UnknownMrSetVariable {
1327 mr_set: mr_set_name.clone(),
1328 short_name: short_name.clone(),
1332 dict_indexes.push(dict_index);
// The count is checked after unknown names were left out.
1335 match dict_indexes.len() {
1336 0 => return Err(Error::EmptyMrSet(mr_set_name)),
1337 1 => return Err(Error::OneVarMrSet(mr_set_name)),
// Each width starts as `(Some(w), Some(w))`, so the reduction tracks the
// narrowest and the widest width side by side.
// NOTE(review): presumably `narrower`/`wider` return `None` when numeric and
// string widths are mixed, which is what triggers `MixedMrSet` - confirm.
1341 let Some((Some(min_width), Some(max_width))) = dict_indexes
1343 .map(|dict_index| decoder.variables[dict_index].width)
1344 .map(|w| (Some(w), Some(w)))
1345 .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
1347 return Err(Error::MixedMrSet(mr_set_name));
// The counted value is interpreted against the narrowest member width.
1351 MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
1353 Ok(MultipleResponseSet {
1357 label: input.label.to_string(),
/// Decoded multiple response record: the sets that decoded successfully.
///
/// The `TryDecode` implementation for this type reports a set that fails to
/// decode through `warn` and does not store it.
1364 #[derive(Clone, Debug)]
1365 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
// Decodes a raw multiple response record one set at a time.
//
// A set that fails `MultipleResponseSet::decode` is passed to `warn` and
// left out; the visible code returns `Ok(Some(..))` with the sets that
// decoded.
//
// NOTE(review): the method's `fn` line is not visible here.
1367 impl TryDecode for MultipleResponseRecord {
// The raw record is parameterized with `Identifier` names and `Cow<str>` text.
1368 type Input<'a> = raw::MultipleResponseRecord<Identifier, Cow<'a, str>>;
1371 decoder: &mut Decoder,
1372 input: &Self::Input<'_>,
1373 warn: impl Fn(Error),
1374 ) -> Result<Option<Self>, Error> {
// Capacity is the number of raw sets, an upper bound on the decoded ones.
1375 let mut sets = Vec::with_capacity(input.0.len());
1376 for set in &input.0 {
1377 match MultipleResponseSet::decode(decoder, set, &warn) {
1378 Ok(set) => sets.push(set),
// A failing set becomes a warning instead of aborting the whole record.
1379 Err(error) => warn(error),
1382 Ok(Some(MultipleResponseRecord(sets)))
/// Value labels attached to one long string variable, as built by
/// `LongStringValueLabels::decode`.
1386 #[derive(Clone, Debug)]
1387 pub struct LongStringValueLabels {
// Name of the variable the labels belong to.
1388 pub var_name: Identifier,
// `decode` stores `VarWidth::String(..)` here after range-checking the raw width.
1389 pub width: VarWidth,
// One entry per raw (value, label) pair, in input order.
1390 pub labels: Vec<ValueLabel>,
1393 impl LongStringValueLabels {
// Decodes the value labels for one long string variable.
//
// Errors:
// * `Error::InvalidLongStringValueLabelName` when the decoded variable name
//   is rejected by `Identifier::new`.
// * `Error::InvalidLongValueLabelWidth` when the raw width is below 9 or
//   above `VarWidth::MAX_STRING`.
//
// NOTE(review): the `fn` line and any parameter before `input` are not
// visible here; the body reads `decoder`, so presumably it is the first
// parameter - confirm.
1396 input: &raw::LongStringValueLabels<RawString>,
1397 warn: &impl Fn(Error),
1398 ) -> Result<Self, Error> {
1399 let var_name = decoder.decode_string(&input.var_name.0, warn);
// Trailing whitespace is dropped from the decoded name before validation.
1400 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
1401 .map_err(Error::InvalidLongStringValueLabelName)?;
1404 let max_width = VarWidth::MAX_STRING;
// Accepted widths are 9 through `VarWidth::MAX_STRING`, inclusive.
1405 if input.width < 9 || input.width > max_width as u32 {
1406 return Err(Error::InvalidLongValueLabelWidth {
// The range check above bounds `input.width` by `VarWidth::MAX_STRING`.
// NOTE(review): assumes `MAX_STRING` fits in `u16`, so this cast cannot
// truncate - confirm.
1413 let width = input.width as u16;
1415 let mut labels = Vec::with_capacity(input.labels.len());
1416 for (value, label) in input.labels.iter() {
// Values go through `decode_exact_length`, which takes no `warn`; labels go
// through `decode_string`, which does.
1417 let value = Value::String(decoder.decode_exact_length(&value.0).into());
1418 let label = decoder.decode_string(&label.0, warn);
1419 labels.push(ValueLabel { value, label });
1422 Ok(LongStringValueLabels {
1424 width: VarWidth::String(width),
/// Decoded long string value label record: the per-variable label sets that
/// decoded successfully.
///
/// The `TryDecode` implementation for this type reports an entry that fails
/// to decode through `warn` and does not store it.
1430 #[derive(Clone, Debug)]
1431 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
// Decodes a raw long string value label record one variable at a time.
//
// An entry that fails `LongStringValueLabels::decode` is passed to `warn`
// and left out; the visible code returns `Ok(Some(..))` with the entries
// that decoded.
//
// NOTE(review): the method's `fn` line is not visible here.
1433 impl TryDecode for LongStringValueLabelRecord {
1434 type Input<'a> = raw::LongStringValueLabelRecord<RawString>;
1437 decoder: &mut Decoder,
1438 input: &Self::Input<'_>,
1439 warn: impl Fn(Error),
1440 ) -> Result<Option<Self>, Error> {
// Capacity is the number of raw entries, an upper bound on the decoded ones.
1441 let mut labels = Vec::with_capacity(input.0.len());
1442 for label in &input.0 {
1443 match LongStringValueLabels::decode(decoder, label, &warn) {
1444 Ok(set) => labels.push(set),
// A failing entry becomes a warning instead of aborting the whole record.
1445 Err(error) => warn(error),
1448 Ok(Some(LongStringValueLabelRecord(labels)))
// Experiments with `encoding_rs::WINDOWS_1252`.
// NOTE(review): the enclosing module and the function headers are not
// visible here; presumably these are `#[test]` functions - confirm.
1454 use encoding_rs::WINDOWS_1252;
// First experiment: encode U+FFFD (the replacement character) with
// WINDOWS_1252, decode the resulting bytes again, and print what comes back.
// Nothing is asserted here.
1458 let mut s = String::new();
1459 s.push(char::REPLACEMENT_CHARACTER);
1460 let encoded = WINDOWS_1252.encode(&s).0;
1461 let decoded = WINDOWS_1252.decode(&encoded[..]).0;
1462 println!("{:?}", decoded);
// Second experiment: every byte value 0..=255 is decoded and re-encoded, and
// the final assertion requires the re-encoded bytes to equal the input.
1467 let charset: Vec<u8> = (0..=255).collect();
1468 println!("{}", charset.len());
1469 let decoded = WINDOWS_1252.decode(&charset[..]).0;
// `decoded` is text, so this prints its length in UTF-8 bytes, not in characters.
1470 println!("{}", decoded.len());
1471 let encoded = WINDOWS_1252.encode(&decoded[..]).0;
1472 println!("{}", encoded.len());
1473 assert_eq!(&charset[..], &encoded[..]);