1 use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
4 dictionary::{Dictionary, VarWidth},
5 encoding::Error as EncodingError,
7 format::{Error as FormatError, Spec},
8 identifier::{Error as IdError, Identifier},
10 self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
11 FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
12 LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
13 NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabelRecord, VarDisplayRecord,
14 VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader,
18 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
19 use encoding_rs::Encoding;
20 use thiserror::Error as ThisError;
22 pub use crate::raw::{CategoryLabels, Compression};
24 #[derive(ThisError, Debug)]
26 #[error("Missing header record")]
29 // XXX this is an internal error
30 #[error("More than one file header record")]
31 DuplicateHeaderRecord,
34 EncodingError(EncodingError),
36 #[error("Using default encoding {0}.")]
37 UsingDefaultEncoding(String),
39 #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
40 InvalidVariableWidth { offsets: Range<u64>, width: i32 },
42 #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
43 InvalidLongMissingValueFormat,
45 #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
46 InvalidCreationDate { creation_date: String },
48 #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
49 InvalidCreationTime { creation_time: String },
51 #[error("{id_error} Renaming variable to {new_name}.")]
58 "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
63 format_error: FormatError,
67 "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
72 format_error: FormatError,
75 #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
76 DuplicateVariableName {
77 duplicate_name: Identifier,
81 #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
82 InvalidDictIndex { dict_index: usize, max_index: usize },
84 #[error("Dictionary index {0} refers to a long string continuation.")]
85 DictIndexIsContinuation(usize),
87 #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
88 ValueLabelsDifferentTypes {
89 numeric_var: Identifier,
90 string_var: Identifier,
94 "Value labels may not be added to long string variable {0} using record types 3 or 4."
96 InvalidLongStringValueLabel(Identifier),
98 #[error("Invalid multiple response set name. {0}")]
99 InvalidMrSetName(IdError),
101 #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
102 UnknownMrSetVariable {
104 short_name: Identifier,
107 #[error("Multiple response set {0} has no variables.")]
108 EmptyMrSet(Identifier),
110 #[error("Multiple response set {0} has only one variable.")]
111 OneVarMrSet(Identifier),
113 #[error("Multiple response set {0} contains both string and numeric variables.")]
114 MixedMrSet(Identifier),
117 "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
119 InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
121 #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
122 TooWideMDGroupCountedValue {
129 #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
130 InvalidLongValueLabelWidth {
137 #[error("Invalid attribute name. {0}")]
138 InvalidAttributeName(IdError),
140 #[error("Invalid short name in long variable name record. {0}")]
141 InvalidShortName(IdError),
143 #[error("Invalid name in long variable name record. {0}")]
144 InvalidLongName(IdError),
146 #[error("Invalid variable name in very long string record. {0}")]
147 InvalidLongStringName(IdError),
149 #[error("Invalid variable name in long string value label record. {0}")]
150 InvalidLongStringValueLabelName(IdError),
152 #[error("Invalid variable name in attribute record. {0}")]
153 InvalidAttributeVariableName(IdError),
155 // XXX This is risky because `text` might be arbitrarily long.
156 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
157 MalformedString { encoding: String, text: String },
159 #[error("Details TBD")]
/// Index under which a variable is stored: the value of
/// `Variable::dict_index`, the key type of the `variables` map, and the value
/// type of the `var_names` map.
///
/// NOTE(review): `get_var_by_index` rejects 0 and subtracts 1 from its
/// argument before looking it up in `variables`, so stored indexes appear to
/// be 0-based while indexes taken from records are 1-based -- confirm.
163 type DictIndex = usize;
165 pub struct Variable {
166 pub dict_index: DictIndex,
167 pub short_name: Identifier,
168 pub long_name: Option<Identifier>,
173 pub raw: raw::Decoder,
174 pub encoding: &'static Encoding,
175 pub variables: HashMap<DictIndex, Variable>,
176 pub var_names: HashMap<Identifier, DictIndex>,
177 pub dictionary: Dictionary,
178 n_dict_indexes: usize,
179 n_generated_names: usize,
182 #[derive(Clone, Debug)]
184 pub header: HeaderRecord<String>,
185 pub variable: Vec<VariableRecord<String, String>>,
186 pub value_label: Vec<ValueLabelRecord<RawStr<8>, String>>,
187 pub document: Vec<DocumentRecord<String>>,
188 pub integer_info: Option<IntegerInfoRecord>,
189 pub float_info: Option<FloatInfoRecord>,
190 pub var_display: Option<VarDisplayRecord>,
191 pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
192 pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
193 pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier, String>>,
194 pub encoding: Option<EncodingRecord>,
195 pub number_of_cases: Option<NumberOfCasesRecord>,
196 pub variable_sets: Vec<VariableSetRecord>,
197 pub product_info: Option<ProductInfoRecord>,
198 pub long_names: Vec<LongNamesRecord>,
199 pub very_long_strings: Vec<VeryLongStringsRecord>,
200 pub file_attributes: Vec<FileAttributeRecord>,
201 pub variable_attributes: Vec<VariableAttributeRecord>,
202 pub other_extension: Vec<Extension>,
203 pub end_of_headers: Option<u32>,
204 pub z_header: Option<ZHeader>,
205 pub z_trailer: Option<ZTrailer>,
206 pub cases: Option<Rc<RefCell<Cases>>>,
209 fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
220 pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
221 let mut file_header = Vec::new();
222 let mut variable = Vec::new();
223 let mut value_label = Vec::new();
224 let mut document = Vec::new();
225 let mut integer_info = Vec::new();
226 let mut float_info = Vec::new();
227 let mut var_display = Vec::new();
228 let mut multiple_response = Vec::new();
229 let mut long_string_value_labels = Vec::new();
230 let mut long_string_missing_values = Vec::new();
231 let mut encoding = Vec::new();
232 let mut number_of_cases = Vec::new();
233 let mut variable_sets = Vec::new();
234 let mut product_info = Vec::new();
235 let mut long_names = Vec::new();
236 let mut very_long_strings = Vec::new();
237 let mut file_attributes = Vec::new();
238 let mut variable_attributes = Vec::new();
239 let mut other_extension = Vec::new();
240 let mut end_of_headers = Vec::new();
241 let mut z_header = Vec::new();
242 let mut z_trailer = Vec::new();
243 let mut cases = Vec::new();
245 for header in headers {
247 DecodedRecord::Header(record) => {
248 file_header.push(record);
250 DecodedRecord::Variable(record) => {
251 variable.push(record);
253 DecodedRecord::ValueLabel(record) => {
254 value_label.push(record);
256 DecodedRecord::Document(record) => {
257 document.push(record);
259 DecodedRecord::IntegerInfo(record) => {
260 integer_info.push(record);
262 DecodedRecord::FloatInfo(record) => {
263 float_info.push(record);
265 DecodedRecord::VariableSets(record) => {
266 variable_sets.push(record);
268 DecodedRecord::VarDisplay(record) => {
269 var_display.push(record);
271 DecodedRecord::MultipleResponse(record) => {
272 multiple_response.push(record);
274 DecodedRecord::LongStringValueLabels(record) => {
275 long_string_value_labels.push(record)
277 DecodedRecord::LongStringMissingValues(record) => {
278 long_string_missing_values.push(record);
280 DecodedRecord::Encoding(record) => {
281 encoding.push(record);
283 DecodedRecord::NumberOfCases(record) => {
284 number_of_cases.push(record);
286 DecodedRecord::ProductInfo(record) => {
287 product_info.push(record);
289 DecodedRecord::LongNames(record) => {
290 long_names.push(record);
292 DecodedRecord::VeryLongStrings(record) => {
293 very_long_strings.push(record);
295 DecodedRecord::FileAttributes(record) => {
296 file_attributes.push(record);
298 DecodedRecord::VariableAttributes(record) => {
299 variable_attributes.push(record);
301 DecodedRecord::OtherExtension(record) => {
302 other_extension.push(record);
304 DecodedRecord::EndOfHeaders(record) => {
305 end_of_headers.push(record);
307 DecodedRecord::ZHeader(record) => {
308 z_header.push(record);
310 DecodedRecord::ZTrailer(record) => {
311 z_trailer.push(record);
313 DecodedRecord::Cases(record) => {
319 let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
321 return Err(Error::MissingHeaderRecord);
329 integer_info: take_first(integer_info, || warn(Error::TBD)),
330 float_info: take_first(float_info, || warn(Error::TBD)),
331 var_display: take_first(var_display, || warn(Error::TBD)),
333 long_string_value_labels,
334 long_string_missing_values,
335 encoding: take_first(encoding, || warn(Error::TBD)),
336 number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
338 product_info: take_first(product_info, || warn(Error::TBD)),
344 end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
345 z_header: take_first(z_header, || warn(Error::TBD)),
346 z_trailer: take_first(z_trailer, || warn(Error::TBD)),
347 cases: take_first(cases, || warn(Error::TBD)),
352 pub struct Metadata {
353 creation: NaiveDateTime,
355 compression: Option<Compression>,
356 n_cases: Option<u64>,
358 product_ext: Option<String>,
359 version: Option<(i32, i32, i32)>,
363 fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
364 let header = &headers.header;
365 let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
366 .unwrap_or_else(|_| {
367 warn(Error::InvalidCreationDate {
368 creation_date: header.creation_date.to_string(),
372 let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
373 .unwrap_or_else(|_| {
374 warn(Error::InvalidCreationTime {
375 creation_time: header.creation_time.to_string(),
379 let creation = NaiveDateTime::new(creation_date, creation_time);
383 .trim_start_matches("@(#) SPSS DATA FILE")
389 endian: header.endian,
390 compression: header.compression,
391 n_cases: header.n_cases.map(|n| n as u64),
393 product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
394 version: headers.integer_info.as_ref().map(|ii| ii.version),
400 mut headers: Headers,
401 encoding: &'static Encoding,
402 warn: impl Fn(Error),
403 ) -> Result<(Dictionary, Metadata), Error> {
404 let mut dictionary = Dictionary::new(encoding);
406 let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
407 if !file_label.is_empty() {
408 dictionary.file_label = Some(file_label);
411 for attributes in headers.file_attributes.drain(..) {
412 dictionary.attributes.extend(attributes.0.0.into_iter())
415 // Concatenate all the document records (really there should only be one)
416 // and trim off the trailing spaces that pad them to 80 bytes.
417 dictionary.documents = headers
420 .flat_map(|record| record.lines)
421 .map(trim_end_spaces)
424 // XXX warn for weird integer format
425 // XXX warn for weird floating-point format, etc.
428 let mut decoder = Decoder {
430 variables: HashMap::new(),
431 var_names: HashMap::new(),
434 n_generated_names: 0,
437 let metadata = Metadata::decode(&headers, warn);
438 Ok((dictionary, metadata))
441 fn trim_end_spaces(mut s: String) -> String {
442 s.truncate(s.trim_end_matches(' ').len());
446 /// Returns a copy of `s` in which all lone CR and CR LF pairs have been
449 /// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
450 /// files that use CR-only line ends in the file label and extra product
452 fn fix_line_ends(s: &str) -> String {
453 let mut out = String::with_capacity(s.len());
454 let mut s = s.chars().peekable();
455 while let Some(c) = s.next() {
469 fn generate_name(&mut self) -> Identifier {
471 self.n_generated_names += 1;
472 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
474 if !self.var_names.contains_key(&name) {
477 assert!(self.n_generated_names < usize::MAX);
480 fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
481 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
483 warn(Error::MalformedString {
484 encoding: self.encoding.name().into(),
485 text: output.clone().into(),
490 fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
491 self.decode_string_cow(input, warn).into()
493 pub fn decode_identifier(
496 warn: &impl Fn(Error),
497 ) -> Result<Identifier, IdError> {
498 let s = self.decode_string_cow(input, warn);
499 Identifier::new(&s, self.encoding)
501 fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
502 let max_index = self.n_dict_indexes;
503 if dict_index == 0 || dict_index > max_index {
504 return Err(Error::InvalidDictIndex {
509 let Some(variable) = self.variables.get(&(dict_index - 1)) else {
510 return Err(Error::DictIndexIsContinuation(dict_index));
515 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
516 /// re-encoding the result back into `self.encoding` will have exactly the
517 /// same length in bytes.
519 /// XXX warn about errors?
520 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
521 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
522 // This is the common case. Usually there will be no errors.
525 // Unusual case. Don't bother to optimize it much.
526 let mut decoder = self.encoding.new_decoder_without_bom_handling();
527 let mut output = String::with_capacity(
529 .max_utf8_buffer_length_without_replacement(input.len())
532 let mut rest = input;
533 while !rest.is_empty() {
534 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
535 (DecoderResult::InputEmpty, _) => break,
536 (DecoderResult::OutputFull, _) => unreachable!(),
537 (DecoderResult::Malformed(a, b), consumed) => {
538 let skipped = a as usize + b as usize;
539 output.extend(repeat('?').take(skipped));
540 rest = &rest[consumed..];
544 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
550 pub trait TryDecode: Sized {
553 decoder: &mut Decoder,
554 input: &Self::Input<'_>,
555 warn: impl Fn(Error),
556 ) -> Result<Option<Self>, Error>;
559 pub trait Decode<Input>: Sized {
560 fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
563 impl<const N: usize> Decode<RawStr<N>> for String {
564 fn decode(decoder: &Decoder, input: &RawStr<N>, warn: impl Fn(Error)) -> Self {
565 decoder.decode_string(&input.0, &warn)
570 #[derive(Clone, Debug)]
571 pub struct HeaderRecord {
572 pub eye_catcher: String,
573 pub weight_index: Option<usize>,
574 pub n_cases: Option<u64>,
575 pub creation: NaiveDateTime,
576 pub file_label: String,
579 fn trim_end_spaces(mut s: String) -> String {
580 s.truncate(s.trim_end_matches(' ').len());
584 /// Data file info that doesn't fit in [Dictionary].
585 pub struct Metadata {
586 creation: NaiveDateTime,
588 compression: Option<Compression>,
589 n_cases: Option<u64>,
591 product_ext: Option<String>,
592 version: Option<(i32, i32, i32)>,
597 header: &crate::raw::HeaderRecord<Cow<str>>,
598 integer_info: Option<&IntegerInfoRecord>,
599 product_ext: Option<&ProductInfoRecord>,
600 warn: impl Fn(Error),
602 let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
603 .unwrap_or_else(|_| {
604 warn(Error::InvalidCreationDate {
605 creation_date: header.creation_date.to_string(),
609 let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
610 .unwrap_or_else(|_| {
611 warn(Error::InvalidCreationTime {
612 creation_time: header.creation_time.to_string(),
616 let creation = NaiveDateTime::new(creation_date, creation_time);
620 .trim_start_matches("@(#) SPSS DATA FILE")
626 endian: header.endian,
627 compression: header.compression,
628 n_cases: header.n_cases.map(|n| n as u64),
630 product_ext: product_ext.map(|pe| pe.0.clone()),
631 version: integer_info.map(|ii| ii.version),
636 impl TryDecode for HeaderRecord {
637 type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
640 _decoder: &mut Decoder,
641 input: &Self::Input<'_>,
642 warn: impl Fn(Error),
643 ) -> Result<Option<Self>, Error> {
644 let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
645 let file_label = trim_end_spaces(input.file_label.to_string());
646 let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
647 .unwrap_or_else(|_| {
648 warn(Error::InvalidCreationDate {
649 creation_date: input.creation_date.to_string(),
653 let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
654 .unwrap_or_else(|_| {
655 warn(Error::InvalidCreationTime {
656 creation_time: input.creation_time.to_string(),
660 Ok(Some(HeaderRecord {
662 weight_index: input.weight_index.map(|n| n as usize),
663 n_cases: input.n_cases.map(|n| n as u64),
664 creation: NaiveDateTime::new(creation_date, creation_time),
670 #[derive(Clone, Debug)]
671 pub struct VariableRecord {
673 pub name: Identifier,
674 pub print_format: Spec,
675 pub write_format: Spec,
676 pub missing_values: MissingValues<String>,
677 pub label: Option<String>,
680 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
681 UncheckedSpec::try_from(raw)
682 .and_then(Spec::try_from)
683 .and_then(|x| x.check_width_compatibility(width))
684 .unwrap_or_else(|error| {
685 let new_format = Spec::default_for_width(width);
686 warn(new_format, error);
691 fn parse_variable_record(
692 decoder: &mut Decoder,
693 input: &raw::VariableRecord<Cow<str>, String>,
694 warn: impl Fn(Error),
695 ) -> Result<(), Error> {
696 let width = match input.width {
697 0 => VarWidth::Numeric,
698 w @ 1..=255 => VarWidth::String(w as u16),
701 return Err(Error::InvalidVariableWidth {
702 offsets: input.offsets.clone(),
707 let name = trim_end_spaces(input.name.to_string());
708 let name = match Identifier::new(&name, decoder.encoding) {
710 if !decoder.var_names.contains_key(&name) {
713 let new_name = decoder.generate_name();
714 warn(Error::DuplicateVariableName {
715 duplicate_name: name.clone(),
716 new_name: new_name.clone(),
722 let new_name = decoder.generate_name();
723 warn(Error::InvalidVariableName {
725 new_name: new_name.clone(),
730 let variable = Variable {
731 dict_index: decoder.n_dict_indexes,
732 short_name: name.clone(),
736 decoder.n_dict_indexes += width.n_dict_indexes();
739 .insert(name.clone(), variable.dict_index)
743 .insert(variable.dict_index, variable)
746 let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
747 warn(Error::InvalidPrintFormat {
749 variable: name.clone(),
753 let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
754 warn(Error::InvalidWriteFormat {
756 variable: name.clone(),
760 let mut variable = dictionary::Variable::new(name, width);
761 variable.print_format = print_format;
762 variable.write_format = write_format;
763 variable.missing_values = input.missing_values.clone();
764 if let Some(ref label) = input.label {
765 variable.label = Some(label.to_string());
767 decoder.dictionary.add_var(variable).unwrap();
/// Decoded document record: the document's lines as owned strings.
///
/// The `TryDecode` implementation builds each line by decoding the raw bytes
/// with the decoder's encoding and then trimming trailing spaces.
771 #[derive(Clone, Debug)]
772 pub struct DocumentRecord(Vec<String>);
774 impl TryDecode for DocumentRecord {
775 type Input<'a> = crate::raw::DocumentRecord<RawDocumentLine>;
778 decoder: &mut Decoder,
779 input: &Self::Input<'_>,
780 warn: impl Fn(Error),
781 ) -> Result<Option<Self>, Error> {
782 Ok(Some(DocumentRecord(
786 .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
796 const NAME: &'static str;
797 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
800 #[derive(Clone, Debug)]
801 pub struct VariableSet {
803 pub vars: Vec<String>,
807 fn parse(input: &str) -> Result<Self, Error> {
808 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
809 let vars = input.split_ascii_whitespace().map(String::from).collect();
817 trait WarnOnError<T> {
818 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
820 impl<T> WarnOnError<T> for Result<T, Error> {
821 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
823 Ok(result) => Some(result),
832 #[derive(Clone, Debug)]
833 pub struct ValueLabel {
838 #[derive(Clone, Debug)]
839 pub struct ValueLabelRecord {
840 pub var_type: VarType,
841 pub labels: Vec<ValueLabel>,
842 pub variables: Vec<Identifier>,
845 impl TryDecode for ValueLabelRecord {
846 type Input<'a> = crate::raw::ValueLabelRecord<RawStr<8>, RawString>;
848 decoder: &mut Decoder,
849 input: &Self::Input<'_>,
850 warn: impl Fn(Error),
851 ) -> Result<Option<ValueLabelRecord>, Error> {
852 let variables: Vec<&Variable> = input
855 .filter_map(|&dict_index| {
857 .get_var_by_index(dict_index as usize)
858 .warn_on_error(&warn)
860 .filter(|&variable| match variable.width {
861 VarWidth::String(width) if width > 8 => {
862 warn(Error::InvalidLongStringValueLabel(
863 variable.short_name.clone(),
870 let mut i = variables.iter();
871 let Some(&first_var) = i.next() else {
874 let var_type: VarType = first_var.width.into();
876 let this_type: VarType = variable.width.into();
877 if var_type != this_type {
878 let (numeric_var, string_var) = match var_type {
879 VarType::Numeric => (first_var, variable),
880 VarType::String => (variable, first_var),
882 warn(Error::ValueLabelsDifferentTypes {
883 numeric_var: numeric_var.short_name.clone(),
884 string_var: string_var.short_name.clone(),
892 .map(|raw::ValueLabel { value, label }| {
893 let label = decoder.decode_string(&label.0, &warn);
894 let value = Value::decode(value, decoder);
895 ValueLabel { value, label }
898 let variables = variables
900 .map(|&variable| variable.short_name.clone())
902 Ok(Some(ValueLabelRecord {
/// The [`VariableSet`]s parsed from one "variable set" text record.
///
/// Its `TextRecord::parse` implementation handles the input one line at a
/// time, passing each line to `VariableSet::parse` and routing the result
/// through `warn_on_error`.
910 #[derive(Clone, Debug)]
911 pub struct VariableSetRecord(Vec<VariableSet>);
913 impl TextRecord for VariableSetRecord {
914 const NAME: &'static str = "variable set";
915 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
916 let mut sets = Vec::new();
917 for line in input.lines() {
918 if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
922 Ok(VariableSetRecord(sets))
926 #[derive(Clone, Debug)]
927 pub struct LongName {
928 pub short_name: Identifier,
929 pub long_name: Identifier,
933 fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
935 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
937 Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
/// The [`LongName`] entries (short name / long name pairs) parsed from a long
/// variable name record.
///
/// `LongNameRecord::parse` splits the input on tab characters, ignores empty
/// pieces, and splits each remaining piece at its first `=` into the short
/// and long name.
945 #[derive(Clone, Debug)]
946 pub struct LongNameRecord(Vec<LongName>);
948 impl LongNameRecord {
949 pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
950 let mut names = Vec::new();
951 for pair in input.split('\t').filter(|s| !s.is_empty()) {
952 if let Some((short_name, long_name)) = pair.split_once('=') {
953 if let Some(long_name) =
954 LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
956 names.push(long_name);
962 Ok(LongNameRecord(names))
966 #[derive(Clone, Debug)]
967 pub struct VeryLongString {
968 pub short_name: Identifier,
972 impl VeryLongString {
973 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
974 let Some((short_name, length)) = input.split_once('=') else {
975 return Err(Error::TBD);
978 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
979 let length: u16 = length.parse().map_err(|_| Error::TBD)?;
980 if length > VarWidth::MAX_STRING {
981 return Err(Error::TBD);
983 Ok(VeryLongString { short_name, length })
/// The [`VeryLongString`] entries parsed from a very long string record.
///
/// `VeryLongStringRecord::parse` trims trailing tabs from each tuple, skips
/// empty ones, and hands the rest to `VeryLongString::parse`, which expects
/// the form `short_name=length`.
987 #[derive(Clone, Debug)]
988 pub struct VeryLongStringRecord(Vec<VeryLongString>);
990 impl VeryLongStringRecord {
991 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
992 let mut very_long_strings = Vec::new();
995 .map(|s| s.trim_end_matches('\t'))
996 .filter(|s| !s.is_empty())
998 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
999 very_long_strings.push(vls)
1002 Ok(VeryLongStringRecord(very_long_strings))
1006 #[derive(Clone, Debug)]
1007 pub struct Attribute {
1008 pub name: Identifier,
1009 pub values: Vec<String>,
1016 warn: &impl Fn(Error),
1017 ) -> Result<(Option<Attribute>, &'a str), Error> {
1018 let Some((name, mut input)) = input.split_once('(') else {
1019 return Err(Error::TBD);
1021 let mut values = Vec::new();
1023 let Some((value, rest)) = input.split_once('\n') else {
1024 return Err(Error::TBD);
1026 if let Some(stripped) = value
1028 .and_then(|value| value.strip_suffix('\''))
1030 values.push(stripped.into());
1033 values.push(value.into());
1035 if let Some(rest) = rest.strip_prefix(')') {
1036 let attribute = Identifier::new(name, decoder.encoding)
1037 .map_err(Error::InvalidAttributeName)
1038 .warn_on_error(warn)
1039 .map(|name| Attribute { name, values });
1040 return Ok((attribute, rest));
/// A list of [`Attribute`]s, in the order they were parsed.
///
/// `AttributeSet::parse` keeps reading attributes until the input is empty or
/// its first character equals the optional `sentinel`. Attributes for which
/// `Attribute::parse` yields `None` are not added to the set.
1047 #[derive(Clone, Debug)]
1048 pub struct AttributeSet(pub Vec<Attribute>);
1054 sentinel: Option<char>,
1055 warn: &impl Fn(Error),
1056 ) -> Result<(AttributeSet, &'a str), Error> {
1057 let mut attributes = Vec::new();
1059 match input.chars().next() {
1060 None => break input,
1061 c if c == sentinel => break &input[1..],
1063 let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
1064 if let Some(attribute) = attribute {
1065 attributes.push(attribute);
1071 Ok((AttributeSet(attributes), rest))
/// The attribute set carried by a file attribute record.
///
/// `FileAttributeRecord::parse` obtains the set from `AttributeSet::parse`
/// with no sentinel character.
1075 #[derive(Clone, Debug)]
1076 pub struct FileAttributeRecord(AttributeSet);
1078 impl FileAttributeRecord {
1079 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1080 let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
1081 if !rest.is_empty() {
1084 Ok(FileAttributeRecord(set))
1088 #[derive(Clone, Debug)]
1089 pub struct VarAttributeSet {
1090 pub long_var_name: Identifier,
1091 pub attributes: AttributeSet,
1094 impl VarAttributeSet {
1098 warn: &impl Fn(Error),
1099 ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
1100 let Some((long_var_name, rest)) = input.split_once(':') else {
1101 return Err(Error::TBD);
1103 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
1104 let var_attribute = Identifier::new(long_var_name, decoder.encoding)
1105 .map_err(Error::InvalidAttributeVariableName)
1106 .warn_on_error(warn)
1107 .map(|name| VarAttributeSet {
1108 long_var_name: name,
1111 Ok((var_attribute, rest))
/// The per-variable attribute sets ([`VarAttributeSet`]) parsed from a
/// variable attribute record.
///
/// `VariableAttributeRecord::parse` calls `VarAttributeSet::parse` repeatedly
/// while input remains, collecting every set that was produced.
1115 #[derive(Clone, Debug)]
1116 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1118 impl VariableAttributeRecord {
1119 pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1120 let mut var_attribute_sets = Vec::new();
1121 while !input.is_empty() {
1122 let Some((var_attribute, rest)) =
1123 VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
1127 if let Some(var_attribute) = var_attribute {
1128 var_attribute_sets.push(var_attribute);
1132 Ok(VariableAttributeRecord(var_attribute_sets))
1136 #[derive(Clone, Debug)]
1137 pub enum MultipleResponseType {
1140 labels: CategoryLabels,
1145 impl MultipleResponseType {
1148 mr_set: &Identifier,
1149 input: &raw::MultipleResponseType,
1150 min_width: VarWidth,
1151 warn: &impl Fn(Error),
1152 ) -> Result<Self, Error> {
1153 let mr_type = match input {
1154 raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
1155 let value = decoder.decode_string_cow(&value.0, warn);
1156 let value = match min_width {
1157 VarWidth::Numeric => {
1158 let number: f64 = value.trim().parse().map_err(|_| {
1159 Error::InvalidMDGroupCountedValue {
1160 mr_set: mr_set.clone(),
1161 number: value.into(),
1164 Value::Number(Some(number.into()))
1166 VarWidth::String(max_width) => {
1167 let value = value.trim_end_matches(' ');
1168 let width = value.len();
1169 if width > max_width as usize {
1170 return Err(Error::TooWideMDGroupCountedValue {
1171 mr_set: mr_set.clone(),
1172 value: value.into(),
1177 Value::String(value.into())
1180 MultipleResponseType::MultipleDichotomy {
1185 raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
1191 #[derive(Clone, Debug)]
1192 pub struct MultipleResponseSet {
1193 pub name: Identifier,
1194 pub min_width: VarWidth,
1195 pub max_width: VarWidth,
1197 pub mr_type: MultipleResponseType,
1198 pub dict_indexes: Vec<DictIndex>,
1201 impl MultipleResponseSet {
1204 input: &raw::MultipleResponseSet<Identifier, Cow<str>>,
1205 warn: &impl Fn(Error),
1206 ) -> Result<Self, Error> {
1207 let mr_set_name = input.name.clone();
1208 let mut dict_indexes = Vec::with_capacity(input.short_names.len());
1209 for short_name in input.short_names.iter() {
1210 let Some(&dict_index) = decoder.var_names.get(&short_name) else {
1211 warn(Error::UnknownMrSetVariable {
1212 mr_set: mr_set_name.clone(),
1213 short_name: short_name.clone(),
1217 dict_indexes.push(dict_index);
1220 match dict_indexes.len() {
1221 0 => return Err(Error::EmptyMrSet(mr_set_name)),
1222 1 => return Err(Error::OneVarMrSet(mr_set_name)),
1226 let Some((Some(min_width), Some(max_width))) = dict_indexes
1228 .map(|dict_index| decoder.variables[dict_index].width)
1229 .map(|w| (Some(w), Some(w)))
1230 .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
1232 return Err(Error::MixedMrSet(mr_set_name));
1236 MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
1238 Ok(MultipleResponseSet {
1242 label: input.label.to_string(),
/// The successfully decoded [`MultipleResponseSet`]s of a multiple response
/// record.
///
/// In the `TryDecode` implementation, a set whose decoding fails is reported
/// through `warn` and left out rather than failing the whole record.
1249 #[derive(Clone, Debug)]
1250 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
1252 impl TryDecode for MultipleResponseRecord {
1253 type Input<'a> = raw::MultipleResponseRecord<Identifier, Cow<'a, str>>;
1256 decoder: &mut Decoder,
1257 input: &Self::Input<'_>,
1258 warn: impl Fn(Error),
1259 ) -> Result<Option<Self>, Error> {
1260 let mut sets = Vec::with_capacity(input.0.len());
1261 for set in &input.0 {
1262 match MultipleResponseSet::decode(decoder, set, &warn) {
1263 Ok(set) => sets.push(set),
1264 Err(error) => warn(error),
1267 Ok(Some(MultipleResponseRecord(sets)))
1271 #[derive(Clone, Debug)]
1272 pub struct LongStringValueLabels {
1273 pub var_name: Identifier,
1274 pub width: VarWidth,
1275 pub labels: Vec<ValueLabel>,
1278 impl LongStringValueLabels {
1281 input: &raw::LongStringValueLabels<RawString>,
1282 warn: &impl Fn(Error),
1283 ) -> Result<Self, Error> {
1284 let var_name = decoder.decode_string(&input.var_name.0, warn);
1285 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
1286 .map_err(Error::InvalidLongStringValueLabelName)?;
1289 let max_width = VarWidth::MAX_STRING;
1290 if input.width < 9 || input.width > max_width as u32 {
1291 return Err(Error::InvalidLongValueLabelWidth {
1298 let width = input.width as u16;
1300 let mut labels = Vec::with_capacity(input.labels.len());
1301 for (value, label) in input.labels.iter() {
1302 let value = Value::String(decoder.decode_exact_length(&value.0).into());
1303 let label = decoder.decode_string(&label.0, warn);
1304 labels.push(ValueLabel { value, label });
1307 Ok(LongStringValueLabels {
1309 width: VarWidth::String(width),
/// The successfully decoded [`LongStringValueLabels`] entries of a long
/// string value label record.
///
/// In the `TryDecode` implementation, an entry whose decoding fails is
/// reported through `warn` and left out rather than failing the whole record.
1315 #[derive(Clone, Debug)]
1316 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
1318 impl TryDecode for LongStringValueLabelRecord {
1319 type Input<'a> = raw::LongStringValueLabelRecord<RawString>;
1322 decoder: &mut Decoder,
1323 input: &Self::Input<'_>,
1324 warn: impl Fn(Error),
1325 ) -> Result<Option<Self>, Error> {
1326 let mut labels = Vec::with_capacity(input.0.len());
1327 for label in &input.0 {
1328 match LongStringValueLabels::decode(decoder, label, &warn) {
1329 Ok(set) => labels.push(set),
1330 Err(error) => warn(error),
1333 Ok(Some(LongStringValueLabelRecord(labels)))
1339 use encoding_rs::WINDOWS_1252;
1343 let mut s = String::new();
1344 s.push(char::REPLACEMENT_CHARACTER);
1345 let encoded = WINDOWS_1252.encode(&s).0;
1346 let decoded = WINDOWS_1252.decode(&encoded[..]).0;
1347 println!("{:?}", decoded);
1352 let charset: Vec<u8> = (0..=255).collect();
1353 println!("{}", charset.len());
1354 let decoded = WINDOWS_1252.decode(&charset[..]).0;
1355 println!("{}", decoded.len());
1356 let encoded = WINDOWS_1252.encode(&decoded[..]).0;
1357 println!("{}", encoded.len());
1358 assert_eq!(&charset[..], &encoded[..]);