1 use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat, ops::Range};
4 encoding::{default_encoding, get_encoding, Error as EncodingError},
6 format::{Error as FormatError, Spec, UncheckedSpec},
7 identifier::{Error as IdError, Identifier},
8 raw::{self, MissingValues, UnencodedStr, VarType},
10 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
11 use encoding_rs::{DecoderResult, Encoding};
12 use num::integer::div_ceil;
13 use ordered_float::OrderedFloat;
14 use thiserror::Error as ThisError;
16 pub use crate::raw::{CategoryLabels, Compression};
// Errors and warnings produced while decoding raw records.  Each variant's
// `#[error(...)]` attribute is its user-facing message; several variants are
// passed to the `warn` callbacks used throughout this file rather than being
// returned as hard errors.
18 #[derive(ThisError, Debug)]
20 // XXX this is really an internal error and maybe we should change the
21 // interfaces to make it impossible
22 #[error("Missing header record")]
26 EncodingError(EncodingError),
28 #[error("Using default encoding {0}.")]
29 UsingDefaultEncoding(String),
// NOTE(review): the message prints the range as "[-1,255)", but the width
// match in the `VariableRecord` decoder accepts `1..=255`; confirm which
// bound is intended.
31 #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
32 InvalidVariableWidth { offsets: Range<u64>, width: i32 },
34 #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
35 InvalidLongMissingValueFormat,
// NOTE(review): this message and the creation-time one below repeat the
// word "format".
37 #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
38 InvalidCreationDate { creation_date: String },
40 #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
41 InvalidCreationTime { creation_time: String },
43 #[error("{id_error} Renaming variable to {new_name}.")]
50 "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
55 format_error: FormatError,
59 "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
64 format_error: FormatError,
67 #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
68 DuplicateVariableName {
69 duplicate_name: Identifier,
// Dictionary indexes in messages are 1-based, as the "[1,{max_index}]" range shows.
73 #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
74 InvalidDictIndex { dict_index: usize, max_index: usize },
76 #[error("Dictionary index {0} refers to a long string continuation.")]
77 DictIndexIsContinuation(usize),
79 #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
80 ValueLabelsDifferentTypes {
81 numeric_var: Identifier,
82 string_var: Identifier,
86 "Value labels may not be added to long string variable {0} using record types 3 or 4."
88 InvalidLongStringValueLabel(Identifier),
// Multiple response set problems.
90 #[error("Invalid multiple response set name. {0}")]
91 InvalidMrSetName(IdError),
93 #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
94 UnknownMrSetVariable {
96 short_name: Identifier,
99 #[error("Multiple response set {0} has no variables.")]
100 EmptyMrSet(Identifier),
102 #[error("Multiple response set {0} has only one variable.")]
103 OneVarMrSet(Identifier),
105 #[error("Multiple response set {0} contains both string and numeric variables.")]
106 MixedMrSet(Identifier),
109 "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
111 InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
113 #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
114 TooWideMDGroupCountedValue {
121 #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
122 InvalidLongValueLabelWidth {
// Identifier failures; each wraps the underlying `IdError` and names the
// record kind in which the bad name appeared.
129 #[error("Invalid attribute name. {0}")]
130 InvalidAttributeName(IdError),
132 #[error("Invalid short name in long variable name record. {0}")]
133 InvalidShortName(IdError),
135 #[error("Invalid name in long variable name record. {0}")]
136 InvalidLongName(IdError),
138 #[error("Invalid variable name in very long string record. {0}")]
139 InvalidLongStringName(IdError),
141 #[error("Invalid variable name in long string value label record. {0}")]
142 InvalidLongStringValueLabelName(IdError),
144 #[error("Invalid variable name in attribute record. {0}")]
145 InvalidAttributeVariableName(IdError),
147 // XXX This is risky because `text` might be arbitrarily long.
148 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
149 MalformedString { encoding: String, text: String },
151 #[error("Invalid variable measurement level value {0}")]
152 InvalidMeasurement(u32),
154 #[error("Invalid variable display alignment value {0}")]
155 InvalidAlignment(u32),
// Placeholder message; the parsers in this file return the matching variant
// (`Error::TBD`) wherever no specific error has been defined yet.
157 #[error("Details TBD")]
// One decoded record.  Each variant wraps the decoded form of the
// like-named `raw::Record` variant handled in the decoding loop.
161 #[derive(Clone, Debug)]
163 Header(HeaderRecord),
164 Variable(VariableRecord),
165 ValueLabel(ValueLabelRecord),
166 Document(DocumentRecord),
// The payload types of `IntegerInfo`, `FloatInfo`, `Encoding`,
// `NumberOfCases` and `OtherExtension` are re-exported from `crate::raw`;
// the decoder clones those records through without transformation.
167 IntegerInfo(IntegerInfoRecord),
168 FloatInfo(FloatInfoRecord),
169 VariableSets(VariableSetRecord),
170 VarDisplay(VarDisplayRecord),
171 MultipleResponse(MultipleResponseRecord),
172 LongStringValueLabels(LongStringValueLabelRecord),
173 Encoding(EncodingRecord),
174 NumberOfCases(NumberOfCasesRecord),
175 ProductInfo(ProductInfoRecord),
176 LongNames(LongNameRecord),
177 VeryLongStrings(VeryLongStringRecord),
178 FileAttributes(FileAttributeRecord),
179 VariableAttributes(VariableAttributeRecord),
180 OtherExtension(Extension),
183 //ZTrailer(ZTrailer),
187 pub use crate::raw::EncodingRecord;
188 pub use crate::raw::Extension;
189 pub use crate::raw::FloatInfoRecord;
190 pub use crate::raw::IntegerInfoRecord;
191 pub use crate::raw::NumberOfCasesRecord;
// Position of a variable in the dictionary; used as the key of
// `Decoder::variables` and as the value type of `Decoder::var_names`.
193 type DictIndex = usize;
// Per-variable bookkeeping kept by the decoder while it walks the records.
195 pub struct Variable {
// Value of the decoder's `n_dict_indexes` counter when the variable was added.
196 pub dict_index: DictIndex,
// Name taken from the variable record (possibly a generated replacement).
197 pub short_name: Identifier,
198 pub long_name: Option<Identifier>,
// Fields of the decoder state shared by all record decoders.
// Copied from the header record when the decoder is built.
203 pub compression: Option<Compression>,
// Character encoding used to decode every string in the file.
205 pub encoding: &'static Encoding,
// Variables seen so far, keyed by dictionary index.
206 pub variables: HashMap<DictIndex, Variable>,
// Reverse map from short name to dictionary index.
207 pub var_names: HashMap<Identifier, DictIndex>,
// Running total of dictionary slots consumed by the variables seen so far.
208 n_dict_indexes: usize,
// Counter behind the `VARnnn` names produced by `generate_name`.
209 n_generated_names: usize,
// Parameters and body of the record-decoding entry point (its `fn` line is
// not among the visible lines).  Takes the raw records, an optional
// caller-chosen encoding and a warning callback, and returns the decoded
// records or a fatal `Error`.
213 headers: Vec<raw::Record>,
214 encoding: Option<&'static Encoding>,
215 warn: &impl Fn(Error),
216 ) -> Result<Vec<Record>, Error> {
// A header record is mandatory; its absence is a hard error.
217 let Some(header_record) = headers.iter().find_map(|rec| {
218 if let raw::Record::Header(header) = rec {
224 return Err(Error::MissingHeaderRecord);
// Use the caller's encoding when given; otherwise derive one from the
// encoding record and the integer-info character code, if present.
226 let encoding = match encoding {
227 Some(encoding) => encoding,
229 let encoding = headers.iter().find_map(|rec| {
230 if let raw::Record::Encoding(ref e) = rec {
236 let character_code = headers.iter().find_map(|rec| {
237 if let raw::Record::IntegerInfo(ref r) = rec {
238 Some(r.character_code)
243 match get_encoding(encoding, character_code) {
244 Ok(encoding) => encoding,
// An EBCDIC result is fatal; any other lookup failure is only warned about.
245 Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)),
247 warn(Error::EncodingError(err));
248 // Warn that we're using the default encoding.
255 let mut decoder = Decoder {
256 compression: header_record.compression,
257 endian: header_record.endian,
259 variables: HashMap::new(),
260 var_names: HashMap::new(),
262 n_generated_names: 0,
// Decode every raw record in order.  `try_decode` may return `Ok(None)`, in
// which case nothing is pushed for that record.
265 let mut output = Vec::with_capacity(headers.len());
266 for header in &headers {
268 raw::Record::Header(ref input) => {
269 if let Some(header) = HeaderRecord::try_decode(&mut decoder, input, warn)? {
270 output.push(Record::Header(header))
273 raw::Record::Variable(ref input) => {
274 if let Some(variable) = VariableRecord::try_decode(&mut decoder, input, warn)? {
275 output.push(Record::Variable(variable));
278 raw::Record::ValueLabel(ref input) => {
279 if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, input, warn)?
281 output.push(Record::ValueLabel(value_label));
284 raw::Record::Document(ref input) => {
285 if let Some(document) = DocumentRecord::try_decode(&mut decoder, input, warn)? {
286 output.push(Record::Document(document))
// These records need no decoding and are cloned through unchanged.
289 raw::Record::IntegerInfo(ref input) => output.push(Record::IntegerInfo(input.clone())),
290 raw::Record::FloatInfo(ref input) => output.push(Record::FloatInfo(input.clone())),
// Text records: decode the whole payload to a string, then parse it.
291 raw::Record::VariableSets(ref input) => {
292 let s = decoder.decode_string_cow(&input.text.0, warn);
293 output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
295 raw::Record::VarDisplay(ref input) => {
296 if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, input, warn)? {
297 output.push(Record::VarDisplay(vdr))
300 raw::Record::MultipleResponse(ref input) => {
301 if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, input, warn)? {
302 output.push(Record::MultipleResponse(mrr))
305 raw::Record::LongStringValueLabels(ref input) => {
307 LongStringValueLabelRecord::try_decode(&mut decoder, input, warn)?
309 output.push(Record::LongStringValueLabels(mrr))
312 raw::Record::Encoding(ref input) => output.push(Record::Encoding(input.clone())),
313 raw::Record::NumberOfCases(ref input) => {
314 output.push(Record::NumberOfCases(input.clone()))
316 raw::Record::ProductInfo(ref input) => {
317 let s = decoder.decode_string_cow(&input.text.0, warn);
318 output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?));
320 raw::Record::LongNames(ref input) => {
321 let s = decoder.decode_string_cow(&input.text.0, warn);
322 output.push(Record::LongNames(LongNameRecord::parse(
328 raw::Record::VeryLongStrings(ref input) => {
329 let s = decoder.decode_string_cow(&input.text.0, warn);
330 output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
334 raw::Record::FileAttributes(ref input) => {
335 let s = decoder.decode_string_cow(&input.text.0, warn);
336 output.push(Record::FileAttributes(FileAttributeRecord::parse(
340 raw::Record::VariableAttributes(ref input) => {
341 let s = decoder.decode_string_cow(&input.text.0, warn);
342 output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
346 raw::Record::OtherExtension(ref input) => {
347 output.push(Record::OtherExtension(input.clone()))
// These raw record kinds produce no decoded output.
349 raw::Record::EndOfHeaders(_) => (),
350 raw::Record::ZHeader(_) => (),
351 raw::Record::ZTrailer(_) => (),
352 raw::Record::Case(_) => (),
359 fn generate_name(&mut self) -> Identifier {
361 self.n_generated_names += 1;
362 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
364 if !self.var_names.contains_key(&name) {
367 assert!(self.n_generated_names < usize::MAX);
370 fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
371 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
373 warn(Error::MalformedString {
374 encoding: self.encoding.name().into(),
375 text: output.clone().into(),
380 fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
381 self.decode_string_cow(input, warn).into()
383 pub fn decode_identifier(
386 warn: &impl Fn(Error),
387 ) -> Result<Identifier, IdError> {
388 let s = self.decode_string_cow(input, warn);
389 Identifier::new(&s, self.encoding)
391 fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
392 let max_index = self.n_dict_indexes;
393 if dict_index == 0 || dict_index > max_index {
394 return Err(Error::InvalidDictIndex {
399 let Some(variable) = self.variables.get(&(dict_index - 1)) else {
400 return Err(Error::DictIndexIsContinuation(dict_index));
405 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
406 /// re-encoding the result back into `self.encoding` will have exactly the
407 /// same length in bytes.
409 /// XXX warn about errors?
410 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
411 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
412 // This is the common case. Usually there will be no errors.
415 // Unusual case. Don't bother to optimize it much.
416 let mut decoder = self.encoding.new_decoder_without_bom_handling();
417 let mut output = String::with_capacity(
419 .max_utf8_buffer_length_without_replacement(input.len())
422 let mut rest = input;
423 while !rest.is_empty() {
424 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
425 (DecoderResult::InputEmpty, _) => break,
426 (DecoderResult::OutputFull, _) => unreachable!(),
427 (DecoderResult::Malformed(a, b), consumed) => {
428 let skipped = a as usize + b as usize;
429 output.extend(repeat('?').take(skipped));
430 rest = &rest[consumed..];
434 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
// Fallible decoding of a record.  Implementations in this file return
// `Ok(None)` when the input yields no record, `Ok(Some(_))` on success and
// `Err(_)` for fatal problems; non-fatal problems go to `warn`.
440 pub trait TryDecode: Sized {
// The decoder is mutable because some implementations update its state
// (for example, registering variables).
443 decoder: &mut Decoder,
445 warn: impl Fn(Error),
446 ) -> Result<Option<Self>, Error>;
// Infallible decoding of `Input` into `Self`; problems can only be
// reported through `warn`.
449 pub trait Decode<Input>: Sized {
450 fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
// Fixed-size unencoded strings decode to `String` using the decoder's encoding.
453 impl<const N: usize> Decode<UnencodedStr<N>> for String {
454 fn decode(decoder: &Decoder, input: &UnencodedStr<N>, warn: impl Fn(Error)) -> Self {
455 decoder.decode_string(&input.0, &warn)
// Decoded form of the file header record.
459 #[derive(Clone, Debug)]
460 pub struct HeaderRecord {
// Decoded with trailing spaces removed.
461 pub eye_catcher: String,
462 pub weight_index: Option<usize>,
463 pub n_cases: Option<u64>,
// Creation date and time combined; each part falls back to a default when
// it cannot be parsed.
464 pub creation: NaiveDateTime,
// Decoded with trailing spaces removed.
465 pub file_label: String,
/// Returns `s` with every trailing ASCII space (`' '`) removed.
///
/// Only the space character is stripped; tabs, newlines and other
/// whitespace at the end are kept.  The string is shortened in place, so no
/// new allocation is made.
fn trim_end_spaces(mut s: String) -> String {
    let kept_len = s.trim_end_matches(' ').len();
    s.truncate(kept_len);
    s
}
473 impl TryDecode for HeaderRecord {
474 type Input = crate::raw::HeaderRecord;
477 decoder: &mut Decoder,
479 warn: impl Fn(Error),
480 ) -> Result<Option<Self>, Error> {
481 let eye_catcher = trim_end_spaces(decoder.decode_string(&input.eye_catcher.0, &warn));
482 let file_label = trim_end_spaces(decoder.decode_string(&input.file_label.0, &warn));
483 let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn);
485 NaiveDate::parse_from_str(&creation_date, "%e %b %Y").unwrap_or_else(|_| {
486 warn(Error::InvalidCreationDate {
487 creation_date: creation_date.into(),
491 let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn);
493 NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
494 warn(Error::InvalidCreationTime {
495 creation_time: creation_time.into(),
499 Ok(Some(HeaderRecord {
501 weight_index: input.weight_index.map(|n| n as usize),
502 n_cases: input.n_cases.map(|n| n as u64),
503 creation: NaiveDateTime::new(creation_date, creation_time),
509 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
515 impl PartialOrd for VarWidth {
516 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
517 match (self, other) {
518 (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
519 (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
526 const MAX_STRING: u16 = 32767;
528 fn n_dict_indexes(self) -> usize {
530 VarWidth::Numeric => 1,
531 VarWidth::String(w) => div_ceil(w as usize, 8),
538 f: impl Fn(u16, u16) -> u16,
539 ) -> Option<VarWidth> {
541 (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
542 (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
543 Some(VarWidth::String(f(a, b)))
549 /// Returns the wider of `self` and `other`:
550 /// - Numerical variable widths are equally wide.
551 /// - Longer strings are wider than shorter strings.
552 /// - Numerical and string types are incomparable, so result in `None`.
553 /// - Any `None` in the input yields `None` in the output.
554 pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
555 Self::width_predicate(a, b, |a, b| a.max(b))
558 /// Returns the narrower of `self` and `other` (see [`Self::wider`]).
559 pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
560 Self::width_predicate(a, b, |a, b| a.min(b))
564 impl From<VarWidth> for VarType {
565 fn from(source: VarWidth) -> Self {
567 VarWidth::Numeric => VarType::Numeric,
568 VarWidth::String(_) => VarType::String,
// Decoded form of a variable record.
573 #[derive(Clone, Debug)]
574 pub struct VariableRecord {
// Validated name; replaced by a generated one when the raw name is invalid
// or duplicates an earlier variable.
576 pub name: Identifier,
// Print and write formats; an invalid raw format is replaced by the default
// format for the variable's width.
577 pub print_format: Spec,
578 pub write_format: Spec,
// Copied unchanged from the raw record.
579 pub missing_values: MissingValues,
580 pub label: Option<String>,
583 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
584 UncheckedSpec::try_from(raw)
585 .and_then(Spec::try_from)
586 .and_then(|x| x.check_width_compatibility(width))
587 .unwrap_or_else(|error| {
588 let new_format = Spec::default_for_width(width);
589 warn(new_format, error);
// Decodes a raw variable record and registers the variable with the decoder.
594 impl TryDecode for VariableRecord {
595 type Input = raw::VariableRecord;
598 decoder: &mut Decoder,
599 input: &crate::raw::VariableRecord,
600 warn: impl Fn(Error),
601 ) -> Result<Option<VariableRecord>, Error> {
// Raw width: 0 is numeric, 1..=255 is a string of that width, and -1 yields
// no record at all (presumably a long-string continuation segment -- confirm).
// Any other value is a fatal `InvalidVariableWidth` error.
602 let width = match input.width {
603 0 => VarWidth::Numeric,
604 w @ 1..=255 => VarWidth::String(w as u16),
605 -1 => return Ok(None),
607 return Err(Error::InvalidVariableWidth {
608 offsets: input.offsets.clone(),
// Validate the name.  A duplicate or invalid name is replaced by a
// generated one, with a warning.
613 let name = trim_end_spaces(decoder.decode_string(&input.name.0, &warn));
614 let name = match Identifier::new(&name, decoder.encoding) {
616 if !decoder.var_names.contains_key(&name) {
619 let new_name = decoder.generate_name();
620 warn(Error::DuplicateVariableName {
621 duplicate_name: name.clone(),
622 new_name: new_name.clone(),
628 let new_name = decoder.generate_name();
629 warn(Error::InvalidVariableName {
631 new_name: new_name.clone(),
// The variable takes the current dictionary position; the counter then
// advances by the number of slots its width occupies.
636 let variable = Variable {
637 dict_index: decoder.n_dict_indexes,
638 short_name: name.clone(),
642 decoder.n_dict_indexes += width.n_dict_indexes();
645 .insert(name.clone(), variable.dict_index)
649 .insert(variable.dict_index, variable)
// Invalid formats are replaced by `decode_format`; these closures turn the
// substitution into a warning naming the variable.
652 let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
653 warn(Error::InvalidPrintFormat {
655 variable: name.clone(),
659 let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
660 warn(Error::InvalidWriteFormat {
662 variable: name.clone(),
669 .map(|label| decoder.decode_string(&label.0, &warn));
670 Ok(Some(VariableRecord {
675 missing_values: input.missing_values.clone(),
// Decoded document record: a list of text lines.
681 #[derive(Clone, Debug)]
682 pub struct DocumentRecord(Vec<String>);
684 impl TryDecode for DocumentRecord {
685 type Input = crate::raw::DocumentRecord;
688 decoder: &mut Decoder,
690 warn: impl Fn(Error),
691 ) -> Result<Option<Self>, Error> {
692 Ok(Some(DocumentRecord(
// Each raw line is decoded and stripped of trailing spaces.
696 .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
// Items of the text-record trait (its header line is not visible here).
// Human-readable name of the record kind.
706 const NAME: &'static str;
// Parses the already-decoded text of the record; non-fatal problems go to `warn`.
707 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
// One named set of variables from a variable-set record.
710 #[derive(Clone, Debug)]
711 pub struct VariableSet {
// Names of the set's members, as whitespace-separated tokens from the input.
713 pub vars: Vec<String>,
// Parses one `NAME=var var ...` line; a line without '=' is `Error::TBD`.
717 fn parse(input: &str) -> Result<Self, Error> {
718 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
719 let vars = input.split_ascii_whitespace().map(String::from).collect();
727 trait WarnOnError<T> {
728 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
730 impl<T> WarnOnError<T> for Result<T, Error> {
731 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
733 Ok(result) => Some(result),
// A decoded data value.
742 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
// `OrderedFloat` supplies the `Eq`/`Ord`/`Hash` impls the derive above needs.
744 Number(Option<OrderedFloat<f64>>),
// Converts a raw value: numbers are wrapped as-is, strings are decoded with
// `decode_exact_length`.
749 pub fn decode(raw: raw::Value, decoder: &Decoder) -> Self {
751 raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
752 raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
// A value paired with its label (see the `ValueLabel { value, label }`
// constructions elsewhere in this file).
757 #[derive(Clone, Debug)]
758 pub struct ValueLabel {
// Decoded value-label record: a set of labels and the variables they apply to.
763 #[derive(Clone, Debug)]
764 pub struct ValueLabelRecord {
// Type shared by the variables; taken from the first one.
765 pub var_type: VarType,
766 pub labels: Vec<ValueLabel>,
// Short names of the variables the labels apply to.
767 pub variables: Vec<Identifier>,
// Decodes a value-label record, resolving its dictionary indexes to variables.
770 impl TryDecode for ValueLabelRecord {
771 type Input = crate::raw::ValueLabelRecord;
773 decoder: &mut Decoder,
775 warn: impl Fn(Error),
776 ) -> Result<Option<ValueLabelRecord>, Error> {
// Resolve each index; indexes that fail to resolve are warned about and dropped.
777 let variables: Vec<&Variable> = input
780 .filter_map(|&dict_index| {
782 .get_var_by_index(dict_index as usize)
783 .warn_on_error(&warn)
// String variables wider than 8 trigger an `InvalidLongStringValueLabel` warning.
785 .filter(|&variable| match variable.width {
786 VarWidth::String(width) if width > 8 => {
787 warn(Error::InvalidLongStringValueLabel(
788 variable.short_name.clone(),
// The first remaining variable fixes the record's type; a later variable of
// the other type triggers a `ValueLabelsDifferentTypes` warning.
795 let mut i = variables.iter();
796 let Some(&first_var) = i.next() else {
799 let var_type: VarType = first_var.width.into();
801 let this_type: VarType = variable.width.into();
802 if var_type != this_type {
803 let (numeric_var, string_var) = match var_type {
804 VarType::Numeric => (first_var, variable),
805 VarType::String => (variable, first_var),
807 warn(Error::ValueLabelsDifferentTypes {
808 numeric_var: numeric_var.short_name.clone(),
809 string_var: string_var.short_name.clone(),
// Decode each (value, label) pair; the raw value is interpreted according
// to `var_type` and the decoder's endianness.
817 .map(|(value, label)| {
818 let label = decoder.decode_string(&label.0, &warn);
819 let value = Value::decode(
820 raw::Value::from_raw(*value, var_type, decoder.endian),
823 ValueLabel { value, label }
826 let variables = variables
828 .map(|&variable| variable.short_name.clone())
830 Ok(Some(ValueLabelRecord {
// Decoded variable-set record: the sets that parsed successfully.
838 #[derive(Clone, Debug)]
839 pub struct VariableSetRecord(Vec<VariableSet>);
841 impl TextRecord for VariableSetRecord {
842 const NAME: &'static str = "variable set";
// One set per input line; a line that fails to parse is reported through
// `warn` instead of failing the whole record.
843 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
844 let mut sets = Vec::new();
845 for line in input.lines() {
846 if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
850 Ok(VariableSetRecord(sets))
// Decoded extra-product-info record; holds the text unchanged.
854 #[derive(Clone, Debug)]
855 pub struct ProductInfoRecord(pub String);
857 impl TextRecord for ProductInfoRecord {
858 const NAME: &'static str = "extra product info";
// Never fails and never warns: the input is stored as-is.
859 fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
860 Ok(ProductInfoRecord(input.into()))
// One short-name / long-name pair from a long variable name record.
864 #[derive(Clone, Debug)]
865 pub struct LongName {
866 pub short_name: Identifier,
867 pub long_name: Identifier,
// Validates both names as identifiers in the decoder's encoding; failures
// map to `InvalidShortName` and `InvalidLongName` respectively.
871 fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
873 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
875 Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
// Decoded long variable name record.
883 #[derive(Clone, Debug)]
884 pub struct LongNameRecord(Vec<LongName>);
886 impl LongNameRecord {
// Parses tab-separated `SHORT=LONG` pairs, skipping empty fields.  A pair
// whose names fail validation is reported through `warn` and skipped.
887 pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
888 let mut names = Vec::new();
889 for pair in input.split('\t').filter(|s| !s.is_empty()) {
890 if let Some((short_name, long_name)) = pair.split_once('=') {
891 if let Some(long_name) =
892 LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
894 names.push(long_name);
900 Ok(LongNameRecord(names))
// One entry of a very long string record: a variable and its length.
904 #[derive(Clone, Debug)]
905 pub struct VeryLongString {
906 pub short_name: Identifier,
910 impl VeryLongString {
// Parses one `NAME=LENGTH` entry.  A missing '=', an unparsable length, or
// a length above `VarWidth::MAX_STRING` each give `Error::TBD`; an invalid
// name gives `InvalidLongStringName`.
911 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
912 let Some((short_name, length)) = input.split_once('=') else {
913 return Err(Error::TBD);
916 Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
917 let length: u16 = length.parse().map_err(|_| Error::TBD)?;
918 if length > VarWidth::MAX_STRING {
919 return Err(Error::TBD);
921 Ok(VeryLongString { short_name, length })
// Decoded very long string record.
925 #[derive(Clone, Debug)]
926 pub struct VeryLongStringRecord(Vec<VeryLongString>);
928 impl VeryLongStringRecord {
929 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
930 let mut very_long_strings = Vec::new();
// Trailing tabs are stripped from each entry and empty entries are skipped.
933 .map(|s| s.trim_end_matches('\t'))
934 .filter(|s| !s.is_empty())
// Entries that fail to parse are reported through `warn` and dropped.
936 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
937 very_long_strings.push(vls)
940 Ok(VeryLongStringRecord(very_long_strings))
// A named attribute with one or more string values.
944 #[derive(Clone, Debug)]
945 pub struct Attribute {
946 pub name: Identifier,
947 pub values: Vec<String>,
// Parses one `NAME(value\nvalue\n...)` attribute and returns it together
// with the unparsed remainder.  The attribute is `None` when its name is
// not a valid identifier.
954 warn: &impl Fn(Error),
955 ) -> Result<(Option<Attribute>, &'a str), Error> {
// A missing '(' or a value without a terminating newline is `Error::TBD`.
956 let Some((name, mut input)) = input.split_once('(') else {
957 return Err(Error::TBD);
959 let mut values = Vec::new();
961 let Some((value, rest)) = input.split_once('\n') else {
962 return Err(Error::TBD);
// If the value is wrapped in single quotes, the quotes are removed;
// otherwise it is stored as-is.
964 if let Some(stripped) = value
966 .and_then(|value| value.strip_suffix('\''))
968 values.push(stripped.into());
971 values.push(value.into());
// A ')' right after a value ends the attribute.
973 if let Some(rest) = rest.strip_prefix(')') {
974 let attribute = Identifier::new(name, decoder.encoding)
975 .map_err(Error::InvalidAttributeName)
977 .map(|name| Attribute { name, values });
978 return Ok((attribute, rest));
// A sequence of attributes.
985 #[derive(Clone, Debug)]
986 pub struct AttributeSet(pub Vec<Attribute>);
// Parses attributes until the next character equals `sentinel`, returning
// the set and the input after the sentinel.
992 sentinel: Option<char>,
993 warn: &impl Fn(Error),
994 ) -> Result<(AttributeSet, &'a str), Error> {
995 let mut attributes = Vec::new();
997 match input.chars().next() {
// Skipping the sentinel with `[1..]` assumes it is one byte long.
999 c if c == sentinel => break &input[1..],
// Attributes with an invalid name come back as `None` and are skipped.
1001 let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
1002 if let Some(attribute) = attribute {
1003 attributes.push(attribute);
1009 Ok((AttributeSet(attributes), rest))
// Decoded file attribute record.
1013 #[derive(Clone, Debug)]
1014 pub struct FileAttributeRecord(AttributeSet);
1016 impl FileAttributeRecord {
// Parses the whole input as one attribute set with no sentinel character.
1017 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1018 let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
// Leftover input is handled here; the handling itself is not visible.
1019 if !rest.is_empty() {
1022 Ok(FileAttributeRecord(set))
// The attributes attached to one variable.
1026 #[derive(Clone, Debug)]
1027 pub struct VarAttributeSet {
1028 pub long_var_name: Identifier,
1029 pub attributes: AttributeSet,
1032 impl VarAttributeSet {
// Parses one `VARNAME:attributes/` group and returns it with the remaining
// input.  The group is `None` (after a warning) when the variable name is
// not a valid identifier.
1036 warn: &impl Fn(Error),
1037 ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
// A group without ':' is `Error::TBD`.
1038 let Some((long_var_name, rest)) = input.split_once(':') else {
1039 return Err(Error::TBD);
// Within a group, '/' is the sentinel that ends the attribute list.
1041 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
1042 let var_attribute = Identifier::new(long_var_name, decoder.encoding)
1043 .map_err(Error::InvalidAttributeVariableName)
1044 .warn_on_error(warn)
1045 .map(|name| VarAttributeSet {
1046 long_var_name: name,
1049 Ok((var_attribute, rest))
// Decoded variable attribute record: one attribute set per variable.
1053 #[derive(Clone, Debug)]
1054 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1056 impl VariableAttributeRecord {
// Parses per-variable groups until the input is used up.  A group-level
// parse error is downgraded to a warning.
1057 pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1058 let mut var_attribute_sets = Vec::new();
1059 while !input.is_empty() {
1060 let Some((var_attribute, rest)) =
1061 VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
1065 if let Some(var_attribute) = var_attribute {
1066 var_attribute_sets.push(var_attribute);
1070 Ok(VariableAttributeRecord(var_attribute_sets))
// Measurement level of a variable.
1074 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
// Maps the raw code to a level: 1 is nominal, 2 is ordinal, 3 is scale.
// Codes with no arm here are rejected with `InvalidMeasurement`.
1082 fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
1085 1 => Ok(Some(Measure::Nominal)),
1086 2 => Ok(Some(Measure::Ordinal)),
1087 3 => Ok(Some(Measure::Scale)),
1088 _ => Err(Error::InvalidMeasurement(source)),
// Display alignment of a variable.
1093 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1094 pub enum Alignment {
// Maps the raw code to an alignment: 1 is left, 2 is right, 3 is center.
// Codes with no arm here are rejected with `InvalidAlignment`.
1101 fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
1104 1 => Ok(Some(Alignment::Left)),
1105 2 => Ok(Some(Alignment::Right)),
1106 3 => Ok(Some(Alignment::Center)),
1107 _ => Err(Error::InvalidAlignment(source)),
// Display settings for one variable.
1112 #[derive(Clone, Debug)]
1113 pub struct VarDisplay {
1114 pub measure: Option<Measure>,
// `None` when the record carries only two values per variable.
1115 pub width: Option<u32>,
1116 pub alignment: Option<Alignment>,
// Decoded variable display record: one entry per variable.
1119 #[derive(Clone, Debug)]
1120 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
1122 impl TryDecode for VarDisplayRecord {
1123 type Input = raw::VarDisplayRecord;
1125 decoder: &mut Decoder,
1126 input: &Self::Input,
1127 warn: impl Fn(Error),
1128 ) -> Result<Option<Self>, Error> {
// The record holds either three or two values per known variable; any
// other length is `Error::TBD`.
1129 let n_vars = decoder.variables.len();
1130 let n_per_var = if input.0.len() == 3 * n_vars {
1132 } else if input.0.len() == 2 * n_vars {
1135 return Err(Error::TBD);
1138 let var_displays = input
// With three values per variable the middle one is the width; with two
// there is no width.
1142 let (measure, width, alignment) = match n_per_var == 3 {
1143 true => (chunk[0], Some(chunk[1]), chunk[2]),
1144 false => (chunk[0], None, chunk[1]),
// Invalid codes are reported through `warn`, leaving the field `None`.
1146 let measure = Measure::try_decode(measure).warn_on_error(&warn).flatten();
1147 let alignment = Alignment::try_decode(alignment)
1148 .warn_on_error(&warn)
1157 Ok(Some(VarDisplayRecord(var_displays)))
// Kind of a multiple response set.
1161 #[derive(Clone, Debug)]
1162 pub enum MultipleResponseType {
1165 labels: CategoryLabels,
1170 impl MultipleResponseType {
// Decodes the raw set type.  For a multiple-dichotomy set the counted value
// is interpreted according to `min_width`, the narrowest variable's width.
1173 mr_set: &Identifier,
1174 input: &raw::MultipleResponseType,
1175 min_width: VarWidth,
1176 warn: &impl Fn(Error),
1177 ) -> Result<Self, Error> {
1178 let mr_type = match input {
1179 raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
1180 let value = decoder.decode_string_cow(&value.0, warn);
1181 let value = match min_width {
// Numeric variables: the counted value must parse as a number.
1182 VarWidth::Numeric => {
1183 let number: f64 = value.trim().parse().map_err(|_| {
1184 Error::InvalidMDGroupCountedValue {
1185 mr_set: mr_set.clone(),
1186 number: value.into(),
1189 Value::Number(Some(number.into()))
// String variables: the counted value, without trailing spaces, must be no
// longer than the narrowest variable.  Length is measured on the decoded
// text in bytes.
1191 VarWidth::String(max_width) => {
1192 let value = value.trim_end_matches(' ');
1193 let width = value.len();
1194 if width > max_width as usize {
1195 return Err(Error::TooWideMDGroupCountedValue {
1196 mr_set: mr_set.clone(),
1197 value: value.into(),
1202 Value::String(value.into())
1205 MultipleResponseType::MultipleDichotomy {
1210 raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
// One decoded multiple response set.
1216 #[derive(Clone, Debug)]
1217 pub struct MultipleResponseSet {
1218 pub name: Identifier,
// Narrowest and widest widths among the set's variables.
1219 pub min_width: VarWidth,
1220 pub max_width: VarWidth,
1222 pub mr_type: MultipleResponseType,
// Dictionary indexes of the variables that were resolved.
1223 pub dict_indexes: Vec<DictIndex>,
1226 impl MultipleResponseSet {
1229 input: &raw::MultipleResponseSet,
1230 warn: &impl Fn(Error),
1231 ) -> Result<Self, Error> {
// An invalid set name fails the whole set.
1232 let mr_set_name = decoder
1233 .decode_identifier(&input.name.0, warn)
1234 .map_err(Error::InvalidMrSetName)?;
1236 let label = decoder.decode_string(&input.label.0, warn);
// Resolve each member by short name; invalid or unknown names are warned about.
1238 let mut dict_indexes = Vec::with_capacity(input.short_names.len());
1239 for short_name in input.short_names.iter() {
1240 let short_name = match decoder.decode_identifier(&short_name.0, warn) {
1243 warn(Error::InvalidMrSetName(error));
1247 let Some(&dict_index) = decoder.var_names.get(&short_name) else {
1248 warn(Error::UnknownMrSetVariable {
1249 mr_set: mr_set_name.clone(),
1250 short_name: short_name.clone(),
1254 dict_indexes.push(dict_index);
// A set needs at least two resolved variables.
1257 match dict_indexes.len() {
1258 0 => return Err(Error::EmptyMrSet(mr_set_name)),
1259 1 => return Err(Error::OneVarMrSet(mr_set_name)),
// Fold the widths into (narrowest, widest).  Mixing numeric and string
// variables makes the fold produce `None`, reported as `MixedMrSet`.
1263 let Some((Some(min_width), Some(max_width))) = dict_indexes
1265 .map(|dict_index| decoder.variables[dict_index].width)
1266 .map(|w| (Some(w), Some(w)))
1267 .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
1269 return Err(Error::MixedMrSet(mr_set_name));
1273 MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
1275 Ok(MultipleResponseSet {
// Decoded multiple response record: the sets that decoded successfully.
1286 #[derive(Clone, Debug)]
1287 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
1289 impl TryDecode for MultipleResponseRecord {
1290 type Input = raw::MultipleResponseRecord;
1293 decoder: &mut Decoder,
1294 input: &Self::Input,
1295 warn: impl Fn(Error),
1296 ) -> Result<Option<Self>, Error> {
// A set that fails to decode is reported through `warn` and skipped; the
// record itself always decodes.
1297 let mut sets = Vec::with_capacity(input.0.len());
1298 for set in &input.0 {
1299 match MultipleResponseSet::decode(decoder, set, &warn) {
1300 Ok(set) => sets.push(set),
1301 Err(error) => warn(error),
1304 Ok(Some(MultipleResponseRecord(sets)))
// Value labels for one long string variable.
1308 #[derive(Clone, Debug)]
1309 pub struct LongStringValueLabels {
1310 pub var_name: Identifier,
1311 pub width: VarWidth,
1312 pub labels: Vec<ValueLabel>,
1315 impl LongStringValueLabels {
1318 input: &raw::LongStringValueLabels,
1319 warn: &impl Fn(Error),
1320 ) -> Result<Self, Error> {
// The variable name is decoded, stripped of trailing whitespace, and validated.
1321 let var_name = decoder.decode_string(&input.var_name.0, warn);
1322 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
1323 .map_err(Error::InvalidLongStringValueLabelName)?;
// The width must lie in 9..=MAX_STRING; otherwise the label set is rejected
// with `InvalidLongValueLabelWidth`.
1326 let max_width = VarWidth::MAX_STRING;
1327 if input.width < 9 || input.width > max_width as u32 {
1328 return Err(Error::InvalidLongValueLabelWidth {
// This cast cannot truncate: the range check above bounds the width by a `u16`.
1335 let width = input.width as u16;
// Values use `decode_exact_length`; labels use the ordinary decoder.
1337 let mut labels = Vec::with_capacity(input.labels.len());
1338 for (value, label) in input.labels.iter() {
1339 let value = Value::String(decoder.decode_exact_length(&value.0).into());
1340 let label = decoder.decode_string(&label.0, warn);
1341 labels.push(ValueLabel { value, label });
1344 Ok(LongStringValueLabels {
1346 width: VarWidth::String(width),
// Decoded long string value label record: the label sets that decoded
// successfully.
1352 #[derive(Clone, Debug)]
1353 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
1355 impl TryDecode for LongStringValueLabelRecord {
1356 type Input = raw::LongStringValueLabelRecord;
1359 decoder: &mut Decoder,
1360 input: &Self::Input,
1361 warn: impl Fn(Error),
1362 ) -> Result<Option<Self>, Error> {
// A label set that fails to decode is reported through `warn` and skipped;
// the record itself always decodes.
1363 let mut labels = Vec::with_capacity(input.0.len());
1364 for label in &input.0 {
1365 match LongStringValueLabels::decode(decoder, label, &warn) {
1366 Ok(set) => labels.push(set),
1367 Err(error) => warn(error),
1370 Ok(Some(LongStringValueLabelRecord(labels)))
// Exploratory tests of `encoding_rs` behavior with windows-1252.
1376 use encoding_rs::WINDOWS_1252;
// Encodes U+FFFD into windows-1252, decodes the bytes again, and prints the
// result.  Nothing is asserted.
1380 let mut s = String::new();
1381 s.push(char::REPLACEMENT_CHARACTER);
1382 let encoded = WINDOWS_1252.encode(&s).0;
1383 let decoded = WINDOWS_1252.decode(&encoded[..]).0;
1384 println!("{:?}", decoded);
// Round-trips all 256 byte values through decode then encode and asserts
// that the bytes come back unchanged.
1389 let charset: Vec<u8> = (0..=255).collect();
1390 println!("{}", charset.len());
1391 let decoded = WINDOWS_1252.decode(&charset[..]).0;
1392 println!("{}", decoded.len());
1393 let encoded = WINDOWS_1252.encode(&decoded[..]).0;
1394 println!("{}", encoded.len());
1395 assert_eq!(&charset[..], &encoded[..]);