2 borrow::Cow, cell::RefCell, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range,
7 dictionary::{self, Dictionary},
8 encoding::{default_encoding, get_encoding, Error as EncodingError},
10 format::{Error as FormatError, Spec, UncheckedSpec},
11 identifier::{Error as IdError, Identifier},
13 self, LongStringMissingValueRecord, MissingValues, ProductInfoRecord, RawDocumentLine,
14 RawStr, RawString, VarDisplayRecord, VarType,
17 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
18 use encoding_rs::{DecoderResult, Encoding};
19 use num::integer::div_ceil;
20 use ordered_float::OrderedFloat;
21 use thiserror::Error as ThisError;
23 pub use crate::raw::{CategoryLabels, Compression};
25 #[derive(ThisError, Debug)]
27 // XXX this is really an internal error and maybe we should change the
28 // interfaces to make it impossible
29 #[error("Missing header record")]
33 EncodingError(EncodingError),
35 #[error("Using default encoding {0}.")]
36 UsingDefaultEncoding(String),
38 #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
39 InvalidVariableWidth { offsets: Range<u64>, width: i32 },
41 #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
42 InvalidLongMissingValueFormat,
44 #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
45 InvalidCreationDate { creation_date: String },
47 #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
48 InvalidCreationTime { creation_time: String },
50 #[error("{id_error} Renaming variable to {new_name}.")]
57 "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
62 format_error: FormatError,
66 "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
71 format_error: FormatError,
74 #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
75 DuplicateVariableName {
76 duplicate_name: Identifier,
80 #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
81 InvalidDictIndex { dict_index: usize, max_index: usize },
83 #[error("Dictionary index {0} refers to a long string continuation.")]
84 DictIndexIsContinuation(usize),
86 #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
87 ValueLabelsDifferentTypes {
88 numeric_var: Identifier,
89 string_var: Identifier,
93 "Value labels may not be added to long string variable {0} using record types 3 or 4."
95 InvalidLongStringValueLabel(Identifier),
97 #[error("Invalid multiple response set name. {0}")]
98 InvalidMrSetName(IdError),
100 #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
101 UnknownMrSetVariable {
103 short_name: Identifier,
106 #[error("Multiple response set {0} has no variables.")]
107 EmptyMrSet(Identifier),
109 #[error("Multiple response set {0} has only one variable.")]
110 OneVarMrSet(Identifier),
112 #[error("Multiple response set {0} contains both string and numeric variables.")]
113 MixedMrSet(Identifier),
116 "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
118 InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
120 #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
121 TooWideMDGroupCountedValue {
128 #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
129 InvalidLongValueLabelWidth {
136 #[error("Invalid attribute name. {0}")]
137 InvalidAttributeName(IdError),
139 #[error("Invalid short name in long variable name record. {0}")]
140 InvalidShortName(IdError),
142 #[error("Invalid name in long variable name record. {0}")]
143 InvalidLongName(IdError),
145 #[error("Invalid variable name in very long string record. {0}")]
146 InvalidLongStringName(IdError),
148 #[error("Invalid variable name in long string value label record. {0}")]
149 InvalidLongStringValueLabelName(IdError),
151 #[error("Invalid variable name in attribute record. {0}")]
152 InvalidAttributeVariableName(IdError),
154 // XXX This is risky because `text` might be arbitrarily long.
155 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
156 MalformedString { encoding: String, text: String },
158 #[error("Details TBD")]
162 #[derive(Clone, Debug)]
164 Header(HeaderRecord),
165 Variable(VariableRecord),
166 ValueLabel(ValueLabelRecord),
167 Document(DocumentRecord),
168 IntegerInfo(IntegerInfoRecord),
169 FloatInfo(FloatInfoRecord),
170 VariableSets(VariableSetRecord),
171 VarDisplay(VarDisplayRecord),
172 MultipleResponse(MultipleResponseRecord),
173 LongStringMissingValues(LongStringMissingValueRecord<String, String>),
174 LongStringValueLabels(LongStringValueLabelRecord),
175 Encoding(EncodingRecord),
176 NumberOfCases(NumberOfCasesRecord),
177 ProductInfo(ProductInfoRecord),
178 LongNames(LongNameRecord),
179 VeryLongStrings(VeryLongStringRecord),
180 FileAttributes(FileAttributeRecord),
181 VariableAttributes(VariableAttributeRecord),
182 OtherExtension(Extension),
186 pub use crate::raw::EncodingRecord;
187 pub use crate::raw::Extension;
188 pub use crate::raw::FloatInfoRecord;
189 pub use crate::raw::IntegerInfoRecord;
190 pub use crate::raw::NumberOfCasesRecord;
192 type DictIndex = usize;
194 pub struct Variable {
195 pub dict_index: DictIndex,
196 pub short_name: Identifier,
197 pub long_name: Option<Identifier>,
202 pub raw: raw::Decoder,
203 pub encoding: &'static Encoding,
204 pub variables: HashMap<DictIndex, Variable>,
205 pub var_names: HashMap<Identifier, DictIndex>,
206 pub dictionary: Dictionary,
207 n_dict_indexes: usize,
208 n_generated_names: usize,
213 header: Option<raw::HeaderRecord<Cow<'a, str>>>,
214 variables: Vec<raw::VariableRecord<Cow<'a, str>, String>>,
215 value_labels: Vec<&'a raw::ValueLabelRecord<RawStr<8>, RawString>>,
216 documents: Vec<raw::DocumentRecord<Cow<'a, str>>>,
217 integer_info: Option<&'a raw::IntegerInfoRecord>,
218 float_info: Option<&'a raw::FloatInfoRecord>,
219 variable_sets: Vec<&'a raw::VariableSetRecord>,
220 var_display: Option<&'a raw::VarDisplayRecord>,
221 multiple_response: Vec<&'a raw::MultipleResponseRecord<RawString, RawString>>,
222 long_string_value_labels: Vec<&'a raw::LongStringValueLabelRecord<RawString>>,
223 long_string_missing_values: Vec<raw::LongStringMissingValueRecord<Identifier, String>>,
224 encoding: Option<&'a raw::EncodingRecord>,
225 number_of_cases: Option<&'a raw::NumberOfCasesRecord>,
226 product_info: Option<&'a raw::ProductInfoRecord>,
227 long_names: Option<&'a raw::LongNamesRecord>,
228 very_long_strings: Vec<&'a raw::VeryLongStringsRecord>,
229 file_attributes: Vec<&'a raw::FileAttributeRecord>,
230 variable_attributes: Vec<&'a raw::VariableAttributeRecord>,
231 other_extensions: Vec<&'a raw::Extension>,
232 cases: Option<&'a Rc<RefCell<raw::Cases>>>,
235 fn set_or_warn<T>(option: &mut Option<T>, value: T, warn: &impl Fn(Error)) {
236 if option.is_none() {
237 let _ = option.insert(value);
243 impl<'a> Headers<'a> {
244 fn new(headers: &'a Vec<raw::Record>, decoder: &Decoder, warn: &impl Fn(Error)) -> Headers<'a> {
245 let mut h = Headers::default();
246 for header in headers {
248 raw::Record::Header(record) => {
249 set_or_warn(&mut h.header, record.decode(&decoder.raw), warn)
251 raw::Record::Variable(record) => h.variables.push(record.decode(&decoder.raw)),
252 raw::Record::ValueLabel(record) => h.value_labels.push(record),
253 raw::Record::Document(record) => h.documents.push(record.decode(&decoder.raw)),
254 raw::Record::IntegerInfo(record) => set_or_warn(&mut h.integer_info, record, warn),
255 raw::Record::FloatInfo(record) => set_or_warn(&mut h.float_info, record, warn),
256 raw::Record::VariableSets(record) => h.variable_sets.push(record),
257 raw::Record::VarDisplay(record) => set_or_warn(&mut h.var_display, record, warn),
258 raw::Record::MultipleResponse(record) => h.multiple_response.push(record),
259 raw::Record::LongStringValueLabels(record) => {
260 h.long_string_value_labels.push(record)
262 raw::Record::LongStringMissingValues(record) => h
263 .long_string_missing_values
264 .push(record.decode(&decoder.raw)),
265 raw::Record::Encoding(record) => set_or_warn(&mut h.encoding, record, warn),
266 raw::Record::NumberOfCases(record) => {
267 set_or_warn(&mut h.number_of_cases, record, warn)
269 raw::Record::ProductInfo(record) => set_or_warn(&mut h.product_info, record, warn),
270 raw::Record::LongNames(record) => set_or_warn(&mut h.long_names, record, warn),
271 raw::Record::VeryLongStrings(record) => h.very_long_strings.push(record),
272 raw::Record::FileAttributes(record) => h.file_attributes.push(record),
273 raw::Record::VariableAttributes(record) => h.variable_attributes.push(record),
274 raw::Record::OtherExtension(record) => h.other_extensions.push(record),
275 raw::Record::EndOfHeaders(_) => (),
276 raw::Record::ZHeader(_) => (),
277 raw::Record::ZTrailer(_) => (),
278 raw::Record::Cases(record) => set_or_warn(&mut h.cases, record, warn),
279 raw::Record::Text(_) => todo!(),
286 pub fn encoding_from_headers(
287 headers: &Vec<raw::Record>,
288 warn: &impl Fn(Error),
289 ) -> Result<&'static Encoding, Error> {
290 let mut encoding_record = None;
291 let mut integer_info_record = None;
292 for record in headers {
294 raw::Record::Encoding(record) => encoding_record = Some(record),
295 raw::Record::IntegerInfo(record) => integer_info_record = Some(record),
299 let encoding = encoding_record.map(|record| record.0.as_str());
300 let character_code = integer_info_record.map(|record| record.character_code);
301 match get_encoding(encoding, character_code) {
302 Ok(encoding) => Ok(encoding),
303 Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
305 warn(Error::EncodingError(err));
306 // Warn that we're using the default encoding.
307 Ok(default_encoding())
313 headers: Vec<raw::Record>,
314 encoding: &'static Encoding,
315 warn: &impl Fn(Error),
316 ) -> Result<(Vec<Record>, Metadata), Error> {
317 let mut decoder = Decoder {
320 warn: Box::new(|error| println!("{error}")),
323 variables: HashMap::new(),
324 var_names: HashMap::new(),
325 dictionary: Dictionary::new(encoding),
327 n_generated_names: 0,
330 let h = Headers::new(&headers, &decoder, warn);
331 let Some(header) = h.header else {
332 return Err(Error::MissingHeaderRecord);
335 let mut output = Vec::with_capacity(headers.len());
337 // Decode the records that don't use variables at all.
338 if let Some(header) = HeaderRecord::try_decode(&mut decoder, &header, warn)? {
339 output.push(Record::Header(header))
341 for document in h.documents {
342 for line in &document.lines {
343 decoder.dictionary.documents.push(line.to_string())
347 for &raw in &h.file_attributes {
348 let s = decoder.decode_string_cow(&raw.text.0, warn);
349 output.push(Record::FileAttributes(FileAttributeRecord::parse(
353 for &raw in &h.other_extensions {
354 output.push(Record::OtherExtension(raw.clone()));
357 // Decode the variable records, which are the basis of almost everything
359 for raw in &h.variables {
360 parse_variable_record(&mut decoder, raw, warn)?;
363 // Decode value labels and weight variable. These use indexes into the
364 // variable records, so we need to parse them before those indexes become
365 // invalidated by very long string variables.
366 for &raw in &h.value_labels {
367 if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
368 output.push(Record::ValueLabel(value_label));
372 if let Some(raw) = h.var_display {
373 output.push(Record::VarDisplay(raw.clone()));
376 // Decode records that use short names.
377 for &raw in &h.multiple_response {
378 if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, raw, warn)? {
379 output.push(Record::MultipleResponse(mrr))
382 for &raw in &h.very_long_strings {
383 let s = decoder.decode_string_cow(&raw.text.0, warn);
384 output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
389 // Rename variables to their long names.
390 for &raw in &h.long_names {
391 let s = decoder.decode_string_cow(&raw.text.0, warn);
392 output.push(Record::LongNames(LongNameRecord::parse(
399 // Decode records that use long names.
400 for &raw in &h.variable_attributes {
401 let s = decoder.decode_string_cow(&raw.text.0, warn);
402 output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
406 for &raw in &h.long_string_value_labels {
407 if let Some(mrr) = LongStringValueLabelRecord::try_decode(&mut decoder, raw, warn)? {
408 output.push(Record::LongStringValueLabels(mrr))
411 for &raw in &h.long_string_missing_values {
412 if let Some(mrr) = LongStringMissingValuesRecord::try_decode(&mut decoder, raw, warn)? {
413 output.push(Record::LongStringMissingValues(mrr))
416 for &raw in &h.variable_sets {
417 let s = decoder.decode_string_cow(&raw.text.0, warn);
418 output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
421 let metadata = Metadata::decode(&header, h.integer_info, h.product_info, warn);
422 Ok((output, metadata))
426 fn generate_name(&mut self) -> Identifier {
428 self.n_generated_names += 1;
429 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
431 if !self.var_names.contains_key(&name) {
434 assert!(self.n_generated_names < usize::MAX);
437 fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
438 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
440 warn(Error::MalformedString {
441 encoding: self.encoding.name().into(),
442 text: output.clone().into(),
447 fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
448 self.decode_string_cow(input, warn).into()
450 pub fn decode_identifier(
453 warn: &impl Fn(Error),
454 ) -> Result<Identifier, IdError> {
455 let s = self.decode_string_cow(input, warn);
456 Identifier::new(&s, self.encoding)
458 fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
459 let max_index = self.n_dict_indexes;
460 if dict_index == 0 || dict_index > max_index {
461 return Err(Error::InvalidDictIndex {
466 let Some(variable) = self.variables.get(&(dict_index - 1)) else {
467 return Err(Error::DictIndexIsContinuation(dict_index));
472 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
473 /// re-encoding the result back into `self.encoding` will have exactly the
474 /// same length in bytes.
476 /// XXX warn about errors?
477 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
478 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
479 // This is the common case. Usually there will be no errors.
482 // Unusual case. Don't bother to optimize it much.
483 let mut decoder = self.encoding.new_decoder_without_bom_handling();
484 let mut output = String::with_capacity(
486 .max_utf8_buffer_length_without_replacement(input.len())
489 let mut rest = input;
490 while !rest.is_empty() {
491 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
492 (DecoderResult::InputEmpty, _) => break,
493 (DecoderResult::OutputFull, _) => unreachable!(),
494 (DecoderResult::Malformed(a, b), consumed) => {
495 let skipped = a as usize + b as usize;
496 output.extend(repeat('?').take(skipped));
497 rest = &rest[consumed..];
501 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
507 pub trait TryDecode: Sized {
510 decoder: &mut Decoder,
511 input: &Self::Input<'_>,
512 warn: impl Fn(Error),
513 ) -> Result<Option<Self>, Error>;
516 pub trait Decode<Input>: Sized {
517 fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
520 impl<const N: usize> Decode<RawStr<N>> for String {
521 fn decode(decoder: &Decoder, input: &RawStr<N>, warn: impl Fn(Error)) -> Self {
522 decoder.decode_string(&input.0, &warn)
526 #[derive(Clone, Debug)]
527 pub struct HeaderRecord {
528 pub eye_catcher: String,
529 pub weight_index: Option<usize>,
530 pub n_cases: Option<u64>,
531 pub creation: NaiveDateTime,
532 pub file_label: String,
535 fn trim_end_spaces(mut s: String) -> String {
536 s.truncate(s.trim_end_matches(' ').len());
540 /// Data file info that doesn't fit in [Dictionary].
541 pub struct Metadata {
542 creation: NaiveDateTime,
544 compression: Option<Compression>,
545 n_cases: Option<u64>,
547 product_ext: Option<String>,
548 version: Option<(i32, i32, i32)>,
553 header: &crate::raw::HeaderRecord<Cow<str>>,
554 integer_info: Option<&IntegerInfoRecord>,
555 product_ext: Option<&ProductInfoRecord>,
556 warn: impl Fn(Error),
558 let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
559 .unwrap_or_else(|_| {
560 warn(Error::InvalidCreationDate {
561 creation_date: header.creation_date.to_string(),
565 let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
566 .unwrap_or_else(|_| {
567 warn(Error::InvalidCreationTime {
568 creation_time: header.creation_time.to_string(),
572 let creation = NaiveDateTime::new(creation_date, creation_time);
576 .trim_start_matches("@(#) SPSS DATA FILE")
582 endian: header.endian,
583 compression: header.compression,
584 n_cases: header.n_cases.map(|n| n as u64),
586 product_ext: product_ext.map(|pe| pe.0.clone()),
587 version: integer_info.map(|ii| ii.version),
592 impl TryDecode for HeaderRecord {
593 type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
596 _decoder: &mut Decoder,
597 input: &Self::Input<'_>,
598 warn: impl Fn(Error),
599 ) -> Result<Option<Self>, Error> {
600 let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
601 let file_label = trim_end_spaces(input.file_label.to_string());
602 let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
603 .unwrap_or_else(|_| {
604 warn(Error::InvalidCreationDate {
605 creation_date: input.creation_date.to_string(),
609 let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
610 .unwrap_or_else(|_| {
611 warn(Error::InvalidCreationTime {
612 creation_time: input.creation_time.to_string(),
616 Ok(Some(HeaderRecord {
618 weight_index: input.weight_index.map(|n| n as usize),
619 n_cases: input.n_cases.map(|n| n as u64),
620 creation: NaiveDateTime::new(creation_date, creation_time),
626 #[derive(Clone, Debug)]
627 pub struct VariableRecord {
629 pub name: Identifier,
630 pub print_format: Spec,
631 pub write_format: Spec,
632 pub missing_values: MissingValues<String>,
633 pub label: Option<String>,
636 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
637 UncheckedSpec::try_from(raw)
638 .and_then(Spec::try_from)
639 .and_then(|x| x.check_width_compatibility(width))
640 .unwrap_or_else(|error| {
641 let new_format = Spec::default_for_width(width);
642 warn(new_format, error);
647 fn parse_variable_record(
648 decoder: &mut Decoder,
649 input: &raw::VariableRecord<Cow<str>, String>,
650 warn: impl Fn(Error),
651 ) -> Result<(), Error> {
652 let width = match input.width {
653 0 => VarWidth::Numeric,
654 w @ 1..=255 => VarWidth::String(w as u16),
657 return Err(Error::InvalidVariableWidth {
658 offsets: input.offsets.clone(),
663 let name = trim_end_spaces(input.name.to_string());
664 let name = match Identifier::new(&name, decoder.encoding) {
666 if !decoder.var_names.contains_key(&name) {
669 let new_name = decoder.generate_name();
670 warn(Error::DuplicateVariableName {
671 duplicate_name: name.clone(),
672 new_name: new_name.clone(),
678 let new_name = decoder.generate_name();
679 warn(Error::InvalidVariableName {
681 new_name: new_name.clone(),
686 let variable = Variable {
687 dict_index: decoder.n_dict_indexes,
688 short_name: name.clone(),
692 decoder.n_dict_indexes += width.n_dict_indexes();
695 .insert(name.clone(), variable.dict_index)
699 .insert(variable.dict_index, variable)
702 let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
703 warn(Error::InvalidPrintFormat {
705 variable: name.clone(),
709 let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
710 warn(Error::InvalidWriteFormat {
712 variable: name.clone(),
716 let mut variable = dictionary::Variable::new(name, width);
717 variable.print_format = print_format;
718 variable.write_format = write_format;
719 variable.missing_values = input.missing_values.clone();
720 if let Some(ref label) = input.label {
721 variable.label = Some(label.to_string());
723 decoder.dictionary.add_var(variable).unwrap();
727 #[derive(Clone, Debug)]
728 pub struct DocumentRecord(Vec<String>);
730 impl TryDecode for DocumentRecord {
731 type Input<'a> = crate::raw::DocumentRecord<RawDocumentLine>;
734 decoder: &mut Decoder,
735 input: &Self::Input<'_>,
736 warn: impl Fn(Error),
737 ) -> Result<Option<Self>, Error> {
738 Ok(Some(DocumentRecord(
742 .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
752 const NAME: &'static str;
753 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
756 #[derive(Clone, Debug)]
757 pub struct VariableSet {
759 pub vars: Vec<String>,
763 fn parse(input: &str) -> Result<Self, Error> {
764 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
765 let vars = input.split_ascii_whitespace().map(String::from).collect();
773 trait WarnOnError<T> {
774 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
776 impl<T> WarnOnError<T> for Result<T, Error> {
777 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
779 Ok(result) => Some(result),
788 #[derive(Clone, Debug)]
789 pub struct ValueLabel {
794 #[derive(Clone, Debug)]
795 pub struct ValueLabelRecord {
796 pub var_type: VarType,
797 pub labels: Vec<ValueLabel>,
798 pub variables: Vec<Identifier>,
801 impl TryDecode for ValueLabelRecord {
802 type Input<'a> = crate::raw::ValueLabelRecord<RawStr<8>, RawString>;
804 decoder: &mut Decoder,
805 input: &Self::Input<'_>,
806 warn: impl Fn(Error),
807 ) -> Result<Option<ValueLabelRecord>, Error> {
808 let variables: Vec<&Variable> = input
811 .filter_map(|&dict_index| {
813 .get_var_by_index(dict_index as usize)
814 .warn_on_error(&warn)
816 .filter(|&variable| match variable.width {
817 VarWidth::String(width) if width > 8 => {
818 warn(Error::InvalidLongStringValueLabel(
819 variable.short_name.clone(),
826 let mut i = variables.iter();
827 let Some(&first_var) = i.next() else {
830 let var_type: VarType = first_var.width.into();
832 let this_type: VarType = variable.width.into();
833 if var_type != this_type {
834 let (numeric_var, string_var) = match var_type {
835 VarType::Numeric => (first_var, variable),
836 VarType::String => (variable, first_var),
838 warn(Error::ValueLabelsDifferentTypes {
839 numeric_var: numeric_var.short_name.clone(),
840 string_var: string_var.short_name.clone(),
848 .map(|raw::ValueLabel { value, label }| {
849 let label = decoder.decode_string(&label.0, &warn);
850 let value = Value::decode(value, decoder);
851 ValueLabel { value, label }
854 let variables = variables
856 .map(|&variable| variable.short_name.clone())
858 Ok(Some(ValueLabelRecord {
866 #[derive(Clone, Debug)]
867 pub struct VariableSetRecord(Vec<VariableSet>);
869 impl TextRecord for VariableSetRecord {
870 const NAME: &'static str = "variable set";
871 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
872 let mut sets = Vec::new();
873 for line in input.lines() {
874 if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
878 Ok(VariableSetRecord(sets))
882 #[derive(Clone, Debug)]
883 pub struct LongName {
884 pub short_name: Identifier,
885 pub long_name: Identifier,
889 fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
891 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
893 Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
901 #[derive(Clone, Debug)]
902 pub struct LongNameRecord(Vec<LongName>);
904 impl LongNameRecord {
905 pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
906 let mut names = Vec::new();
907 for pair in input.split('\t').filter(|s| !s.is_empty()) {
908 if let Some((short_name, long_name)) = pair.split_once('=') {
909 if let Some(long_name) =
910 LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
912 names.push(long_name);
918 Ok(LongNameRecord(names))
922 #[derive(Clone, Debug)]
923 pub struct VeryLongString {
924 pub short_name: Identifier,
928 impl VeryLongString {
929 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
930 let Some((short_name, length)) = input.split_once('=') else {
931 return Err(Error::TBD);
934 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
935 let length: u16 = length.parse().map_err(|_| Error::TBD)?;
936 if length > VarWidth::MAX_STRING {
937 return Err(Error::TBD);
939 Ok(VeryLongString { short_name, length })
943 #[derive(Clone, Debug)]
944 pub struct VeryLongStringRecord(Vec<VeryLongString>);
946 impl VeryLongStringRecord {
947 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
948 let mut very_long_strings = Vec::new();
951 .map(|s| s.trim_end_matches('\t'))
952 .filter(|s| !s.is_empty())
954 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
955 very_long_strings.push(vls)
958 Ok(VeryLongStringRecord(very_long_strings))
962 #[derive(Clone, Debug)]
963 pub struct Attribute {
964 pub name: Identifier,
965 pub values: Vec<String>,
972 warn: &impl Fn(Error),
973 ) -> Result<(Option<Attribute>, &'a str), Error> {
974 let Some((name, mut input)) = input.split_once('(') else {
975 return Err(Error::TBD);
977 let mut values = Vec::new();
979 let Some((value, rest)) = input.split_once('\n') else {
980 return Err(Error::TBD);
982 if let Some(stripped) = value
984 .and_then(|value| value.strip_suffix('\''))
986 values.push(stripped.into());
989 values.push(value.into());
991 if let Some(rest) = rest.strip_prefix(')') {
992 let attribute = Identifier::new(name, decoder.encoding)
993 .map_err(Error::InvalidAttributeName)
995 .map(|name| Attribute { name, values });
996 return Ok((attribute, rest));
1003 #[derive(Clone, Debug)]
1004 pub struct AttributeSet(pub Vec<Attribute>);
1010 sentinel: Option<char>,
1011 warn: &impl Fn(Error),
1012 ) -> Result<(AttributeSet, &'a str), Error> {
1013 let mut attributes = Vec::new();
1015 match input.chars().next() {
1016 None => break input,
1017 c if c == sentinel => break &input[1..],
1019 let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
1020 if let Some(attribute) = attribute {
1021 attributes.push(attribute);
1027 Ok((AttributeSet(attributes), rest))
1031 #[derive(Clone, Debug)]
1032 pub struct FileAttributeRecord(AttributeSet);
1034 impl FileAttributeRecord {
1035 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1036 let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
1037 if !rest.is_empty() {
1040 Ok(FileAttributeRecord(set))
1044 #[derive(Clone, Debug)]
1045 pub struct VarAttributeSet {
1046 pub long_var_name: Identifier,
1047 pub attributes: AttributeSet,
1050 impl VarAttributeSet {
1054 warn: &impl Fn(Error),
1055 ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
1056 let Some((long_var_name, rest)) = input.split_once(':') else {
1057 return Err(Error::TBD);
1059 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
1060 let var_attribute = Identifier::new(long_var_name, decoder.encoding)
1061 .map_err(Error::InvalidAttributeVariableName)
1062 .warn_on_error(warn)
1063 .map(|name| VarAttributeSet {
1064 long_var_name: name,
1067 Ok((var_attribute, rest))
1071 #[derive(Clone, Debug)]
1072 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1074 impl VariableAttributeRecord {
1075 pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1076 let mut var_attribute_sets = Vec::new();
1077 while !input.is_empty() {
1078 let Some((var_attribute, rest)) =
1079 VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
1083 if let Some(var_attribute) = var_attribute {
1084 var_attribute_sets.push(var_attribute);
1088 Ok(VariableAttributeRecord(var_attribute_sets))
1092 #[derive(Clone, Debug)]
1093 pub enum MultipleResponseType {
1096 labels: CategoryLabels,
1101 impl MultipleResponseType {
1104 mr_set: &Identifier,
1105 input: &raw::MultipleResponseType,
1106 min_width: VarWidth,
1107 warn: &impl Fn(Error),
1108 ) -> Result<Self, Error> {
1109 let mr_type = match input {
1110 raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
1111 let value = decoder.decode_string_cow(&value.0, warn);
1112 let value = match min_width {
1113 VarWidth::Numeric => {
1114 let number: f64 = value.trim().parse().map_err(|_| {
1115 Error::InvalidMDGroupCountedValue {
1116 mr_set: mr_set.clone(),
1117 number: value.into(),
1120 Value::Number(Some(number.into()))
1122 VarWidth::String(max_width) => {
1123 let value = value.trim_end_matches(' ');
1124 let width = value.len();
1125 if width > max_width as usize {
1126 return Err(Error::TooWideMDGroupCountedValue {
1127 mr_set: mr_set.clone(),
1128 value: value.into(),
1133 Value::String(value.into())
1136 MultipleResponseType::MultipleDichotomy {
1141 raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
1147 #[derive(Clone, Debug)]
1148 pub struct MultipleResponseSet {
1149 pub name: Identifier,
1150 pub min_width: VarWidth,
1151 pub max_width: VarWidth,
1153 pub mr_type: MultipleResponseType,
1154 pub dict_indexes: Vec<DictIndex>,
1157 impl MultipleResponseSet {
1160 input: &raw::MultipleResponseSet<Identifier, Cow<str>>,
1161 warn: &impl Fn(Error),
1162 ) -> Result<Self, Error> {
1163 let mr_set_name = input.name.clone();
1164 let mut dict_indexes = Vec::with_capacity(input.short_names.len());
1165 for short_name in input.short_names.iter() {
1166 let Some(&dict_index) = decoder.var_names.get(&short_name) else {
1167 warn(Error::UnknownMrSetVariable {
1168 mr_set: mr_set_name.clone(),
1169 short_name: short_name.clone(),
1173 dict_indexes.push(dict_index);
1176 match dict_indexes.len() {
1177 0 => return Err(Error::EmptyMrSet(mr_set_name)),
1178 1 => return Err(Error::OneVarMrSet(mr_set_name)),
1182 let Some((Some(min_width), Some(max_width))) = dict_indexes
1184 .map(|dict_index| decoder.variables[dict_index].width)
1185 .map(|w| (Some(w), Some(w)))
1186 .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
1188 return Err(Error::MixedMrSet(mr_set_name));
1192 MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
1194 Ok(MultipleResponseSet {
1198 label: input.label.to_string(),
1205 #[derive(Clone, Debug)]
1206 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
1208 impl TryDecode for MultipleResponseRecord {
1209 type Input<'a> = raw::MultipleResponseRecord<Identifier, Cow<'a, str>>;
1212 decoder: &mut Decoder,
1213 input: &Self::Input<'_>,
1214 warn: impl Fn(Error),
1215 ) -> Result<Option<Self>, Error> {
1216 let mut sets = Vec::with_capacity(input.0.len());
1217 for set in &input.0 {
1218 match MultipleResponseSet::decode(decoder, set, &warn) {
1219 Ok(set) => sets.push(set),
1220 Err(error) => warn(error),
1223 Ok(Some(MultipleResponseRecord(sets)))
1227 #[derive(Clone, Debug)]
1228 pub struct LongStringValueLabels {
1229 pub var_name: Identifier,
1230 pub width: VarWidth,
1231 pub labels: Vec<ValueLabel>,
1234 impl LongStringValueLabels {
1237 input: &raw::LongStringValueLabels<RawString>,
1238 warn: &impl Fn(Error),
1239 ) -> Result<Self, Error> {
1240 let var_name = decoder.decode_string(&input.var_name.0, warn);
1241 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
1242 .map_err(Error::InvalidLongStringValueLabelName)?;
1245 let max_width = VarWidth::MAX_STRING;
1246 if input.width < 9 || input.width > max_width as u32 {
1247 return Err(Error::InvalidLongValueLabelWidth {
1254 let width = input.width as u16;
1256 let mut labels = Vec::with_capacity(input.labels.len());
1257 for (value, label) in input.labels.iter() {
1258 let value = Value::String(decoder.decode_exact_length(&value.0).into());
1259 let label = decoder.decode_string(&label.0, warn);
1260 labels.push(ValueLabel { value, label });
1263 Ok(LongStringValueLabels {
1265 width: VarWidth::String(width),
1271 #[derive(Clone, Debug)]
1272 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
1274 impl TryDecode for LongStringValueLabelRecord {
1275 type Input<'a> = raw::LongStringValueLabelRecord<RawString>;
1278 decoder: &mut Decoder,
1279 input: &Self::Input<'_>,
1280 warn: impl Fn(Error),
1281 ) -> Result<Option<Self>, Error> {
1282 let mut labels = Vec::with_capacity(input.0.len());
1283 for label in &input.0 {
1284 match LongStringValueLabels::decode(decoder, label, &warn) {
1285 Ok(set) => labels.push(set),
1286 Err(error) => warn(error),
1289 Ok(Some(LongStringValueLabelRecord(labels)))
1295 use encoding_rs::WINDOWS_1252;
1299 let mut s = String::new();
1300 s.push(char::REPLACEMENT_CHARACTER);
1301 let encoded = WINDOWS_1252.encode(&s).0;
1302 let decoded = WINDOWS_1252.decode(&encoded[..]).0;
1303 println!("{:?}", decoded);
1308 let charset: Vec<u8> = (0..=255).collect();
1309 println!("{}", charset.len());
1310 let decoded = WINDOWS_1252.decode(&charset[..]).0;
1311 println!("{}", decoded.len());
1312 let encoded = WINDOWS_1252.encode(&decoded[..]).0;
1313 println!("{}", encoded.len());
1314 assert_eq!(&charset[..], &encoded[..]);