1 use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat};
4 encoding::{default_encoding, get_encoding, Error as EncodingError},
6 format::{Error as FormatError, Spec, UncheckedSpec},
7 identifier::{Error as IdError, Identifier},
8 raw::{self, MissingValues, UnencodedStr, VarType},
10 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
11 use encoding_rs::{DecoderResult, Encoding};
12 use num::integer::div_ceil;
13 use ordered_float::OrderedFloat;
14 use thiserror::Error as ThisError;
16 pub use crate::raw::{CategoryLabels, Compression};
18 #[derive(ThisError, Debug)]
20 // XXX this is really an internal error and maybe we should change the
21 // interfaces to make it impossible
22 #[error("Missing header record")]
26 EncodingError(EncodingError),
28 #[error("Using default encoding {0}.")]
29 UsingDefaultEncoding(String),
31 #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
32 InvalidVariableWidth { offset: u64, width: i32 },
34 #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
35 InvalidLongMissingValueFormat,
37 #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
38 InvalidCreationDate { creation_date: String },
40 #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
41 InvalidCreationTime { creation_time: String },
43 #[error("{id_error} Renaming variable to {new_name}.")]
50 "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
55 format_error: FormatError,
59 "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
64 format_error: FormatError,
67 #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
68 DuplicateVariableName {
69 duplicate_name: Identifier,
73 #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
74 InvalidDictIndex { dict_index: usize, max_index: usize },
76 #[error("Dictionary index {0} refers to a long string continuation.")]
77 DictIndexIsContinuation(usize),
79 #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
80 ValueLabelsDifferentTypes {
81 numeric_var: Identifier,
82 string_var: Identifier,
86 "Value labels may not be added to long string variable {0} using record types 3 or 4."
88 InvalidLongStringValueLabel(Identifier),
90 #[error("Invalid multiple response set name. {0}")]
91 InvalidMrSetName(IdError),
93 #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
94 UnknownMrSetVariable {
96 short_name: Identifier,
99 #[error("Multiple response set {0} has no variables.")]
100 EmptyMrSet(Identifier),
102 #[error("Multiple response set {0} has only one variable.")]
103 OneVarMrSet(Identifier),
105 #[error("Multiple response set {0} contains both string and numeric variables.")]
106 MixedMrSet(Identifier),
109 "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
111 InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
113 #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
114 TooWideMDGroupCountedValue {
121 #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
122 InvalidLongValueLabelWidth {
129 #[error("Invalid attribute name. {0}")]
130 InvalidAttributeName(IdError),
132 #[error("Invalid short name in long variable name record. {0}")]
133 InvalidShortName(IdError),
135 #[error("Invalid name in long variable name record. {0}")]
136 InvalidLongName(IdError),
138 #[error("Invalid variable name in very long string record. {0}")]
139 InvalidLongStringName(IdError),
141 #[error("Invalid variable name in long string value label record. {0}")]
142 InvalidLongStringValueLabelName(IdError),
144 #[error("Invalid variable name in attribute record. {0}")]
145 InvalidAttributeVariableName(IdError),
147 // XXX This is risky because `text` might be arbitrarily long.
148 #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
149 MalformedString { encoding: String, text: String },
151 #[error("Invalid variable measurement level value {0}")]
152 InvalidMeasurement(u32),
154 #[error("Invalid variable display alignment value {0}")]
155 InvalidAlignment(u32),
157 #[error("Details TBD")]
161 #[derive(Clone, Debug)]
163 Header(HeaderRecord),
164 Variable(VariableRecord),
165 ValueLabel(ValueLabelRecord),
166 Document(DocumentRecord),
167 IntegerInfo(IntegerInfoRecord),
168 FloatInfo(FloatInfoRecord),
169 VariableSets(VariableSetRecord),
170 VarDisplay(VarDisplayRecord),
171 MultipleResponse(MultipleResponseRecord),
172 LongStringValueLabels(LongStringValueLabelRecord),
173 Encoding(EncodingRecord),
174 NumberOfCases(NumberOfCasesRecord),
175 ProductInfo(ProductInfoRecord),
176 LongNames(LongNameRecord),
177 VeryLongStrings(VeryLongStringRecord),
178 FileAttributes(FileAttributeRecord),
179 VariableAttributes(VariableAttributeRecord),
180 OtherExtension(Extension),
183 //ZTrailer(ZTrailer),
187 pub use crate::raw::EncodingRecord;
188 pub use crate::raw::Extension;
189 pub use crate::raw::FloatInfoRecord;
190 pub use crate::raw::IntegerInfoRecord;
191 pub use crate::raw::NumberOfCasesRecord;
193 type DictIndex = usize;
195 pub struct Variable {
196 pub dict_index: DictIndex,
197 pub short_name: Identifier,
198 pub long_name: Option<Identifier>,
203 pub compression: Option<Compression>,
205 pub encoding: &'static Encoding,
206 pub variables: HashMap<DictIndex, Variable>,
207 pub var_names: HashMap<Identifier, DictIndex>,
208 n_dict_indexes: usize,
209 n_generated_names: usize,
213 headers: Vec<raw::Record>,
214 encoding: Option<&'static Encoding>,
215 warn: &impl Fn(Error),
216 ) -> Result<Vec<Record>, Error> {
217 let Some(header_record) = headers.iter().find_map(|rec| {
218 if let raw::Record::Header(header) = rec {
224 return Err(Error::MissingHeaderRecord);
226 let encoding = match encoding {
227 Some(encoding) => encoding,
229 let encoding = headers.iter().find_map(|rec| {
230 if let raw::Record::Encoding(ref e) = rec {
236 let character_code = headers.iter().find_map(|rec| {
237 if let raw::Record::IntegerInfo(ref r) = rec {
238 Some(r.character_code)
243 match get_encoding(encoding, character_code) {
244 Ok(encoding) => encoding,
245 Err(err @ EncodingError::Ebcdic) => return Err(Error::EncodingError(err)),
247 warn(Error::EncodingError(err));
248 // Warn that we're using the default encoding.
255 let mut decoder = Decoder {
256 compression: header_record.compression,
257 endian: header_record.endian,
259 variables: HashMap::new(),
260 var_names: HashMap::new(),
262 n_generated_names: 0,
265 let mut output = Vec::with_capacity(headers.len());
266 for header in &headers {
268 raw::Record::Header(ref input) => {
269 if let Some(header) = HeaderRecord::try_decode(&mut decoder, input, warn)? {
270 output.push(Record::Header(header))
273 raw::Record::Variable(ref input) => {
274 if let Some(variable) = VariableRecord::try_decode(&mut decoder, input, warn)? {
275 output.push(Record::Variable(variable));
278 raw::Record::ValueLabel(ref input) => {
279 if let Some(value_label) = ValueLabelRecord::try_decode(&mut decoder, input, warn)?
281 output.push(Record::ValueLabel(value_label));
284 raw::Record::Document(ref input) => {
285 if let Some(document) = DocumentRecord::try_decode(&mut decoder, input, warn)? {
286 output.push(Record::Document(document))
289 raw::Record::IntegerInfo(ref input) => output.push(Record::IntegerInfo(input.clone())),
290 raw::Record::FloatInfo(ref input) => output.push(Record::FloatInfo(input.clone())),
291 raw::Record::VariableSets(ref input) => {
292 let s = decoder.decode_string_cow(&input.text.0, warn);
293 output.push(Record::VariableSets(VariableSetRecord::parse(&s, warn)?));
295 raw::Record::VarDisplay(ref input) => {
296 if let Some(vdr) = VarDisplayRecord::try_decode(&mut decoder, input, warn)? {
297 output.push(Record::VarDisplay(vdr))
300 raw::Record::MultipleResponse(ref input) => {
301 if let Some(mrr) = MultipleResponseRecord::try_decode(&mut decoder, input, warn)? {
302 output.push(Record::MultipleResponse(mrr))
305 raw::Record::LongStringValueLabels(ref input) => {
307 LongStringValueLabelRecord::try_decode(&mut decoder, input, warn)?
309 output.push(Record::LongStringValueLabels(mrr))
312 raw::Record::Encoding(ref input) => output.push(Record::Encoding(input.clone())),
313 raw::Record::NumberOfCases(ref input) => {
314 output.push(Record::NumberOfCases(input.clone()))
316 raw::Record::ProductInfo(ref input) => {
317 let s = decoder.decode_string_cow(&input.text.0, warn);
318 output.push(Record::ProductInfo(ProductInfoRecord::parse(&s, warn)?));
320 raw::Record::LongNames(ref input) => {
321 let s = decoder.decode_string_cow(&input.text.0, warn);
322 output.push(Record::LongNames(LongNameRecord::parse(
328 raw::Record::VeryLongStrings(ref input) => {
329 let s = decoder.decode_string_cow(&input.text.0, warn);
330 output.push(Record::VeryLongStrings(VeryLongStringRecord::parse(
336 raw::Record::FileAttributes(ref input) => {
337 let s = decoder.decode_string_cow(&input.text.0, warn);
338 output.push(Record::FileAttributes(FileAttributeRecord::parse(
342 raw::Record::VariableAttributes(ref input) => {
343 let s = decoder.decode_string_cow(&input.text.0, warn);
344 output.push(Record::VariableAttributes(VariableAttributeRecord::parse(
348 raw::Record::OtherExtension(ref input) => {
349 output.push(Record::OtherExtension(input.clone()))
351 raw::Record::EndOfHeaders(_) => (),
352 raw::Record::ZHeader(_) => (),
353 raw::Record::ZTrailer(_) => (),
354 raw::Record::Case(_) => (),
361 fn generate_name(&mut self) -> Identifier {
363 self.n_generated_names += 1;
364 let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
366 if !self.var_names.contains_key(&name) {
369 assert!(self.n_generated_names < usize::MAX);
372 fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
373 let (output, malformed) = self.encoding.decode_without_bom_handling(input);
375 warn(Error::MalformedString {
376 encoding: self.encoding.name().into(),
377 text: output.clone().into(),
382 fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
383 self.decode_string_cow(input, warn).into()
385 pub fn decode_identifier(
388 warn: &impl Fn(Error),
389 ) -> Result<Identifier, IdError> {
390 let s = self.decode_string_cow(input, warn);
391 Identifier::new(&s, self.encoding)
393 fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
394 let max_index = self.n_dict_indexes;
395 if dict_index == 0 || dict_index > max_index {
396 return Err(Error::InvalidDictIndex {
401 let Some(variable) = self.variables.get(&(dict_index - 1)) else {
402 return Err(Error::DictIndexIsContinuation(dict_index));
407 /// Returns `input` decoded from `self.encoding` into UTF-8 such that
408 /// re-encoding the result back into `self.encoding` will have exactly the
409 /// same length in bytes.
411 /// XXX warn about errors?
412 fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
413 if let (s, false) = self.encoding.decode_without_bom_handling(input) {
414 // This is the common case. Usually there will be no errors.
417 // Unusual case. Don't bother to optimize it much.
418 let mut decoder = self.encoding.new_decoder_without_bom_handling();
419 let mut output = String::with_capacity(
421 .max_utf8_buffer_length_without_replacement(input.len())
424 let mut rest = input;
425 while !rest.is_empty() {
426 match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
427 (DecoderResult::InputEmpty, _) => break,
428 (DecoderResult::OutputFull, _) => unreachable!(),
429 (DecoderResult::Malformed(a, b), consumed) => {
430 let skipped = a as usize + b as usize;
431 output.extend(repeat('?').take(skipped));
432 rest = &rest[consumed..];
436 assert_eq!(self.encoding.encode(&output).0.len(), input.len());
442 pub trait TryDecode: Sized {
445 decoder: &mut Decoder,
447 warn: impl Fn(Error),
448 ) -> Result<Option<Self>, Error>;
451 pub trait Decode<Input>: Sized {
452 fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
455 impl<const N: usize> Decode<UnencodedStr<N>> for String {
456 fn decode(decoder: &Decoder, input: &UnencodedStr<N>, warn: impl Fn(Error)) -> Self {
457 decoder.decode_string(&input.0, &warn)
461 #[derive(Clone, Debug)]
462 pub struct HeaderRecord {
463 pub eye_catcher: String,
464 pub weight_index: Option<usize>,
465 pub n_cases: Option<u64>,
466 pub creation: NaiveDateTime,
467 pub file_label: String,
/// Returns `s` with all trailing ASCII space characters (`' '`) removed.
///
/// Only spaces are stripped; other trailing whitespace such as tabs is kept.
/// The string is truncated in place, so no new allocation is made.
fn trim_end_spaces(mut s: String) -> String {
    s.truncate(s.trim_end_matches(' ').len());
    s
}
475 impl TryDecode for HeaderRecord {
476 type Input = crate::raw::HeaderRecord;
479 decoder: &mut Decoder,
481 warn: impl Fn(Error),
482 ) -> Result<Option<Self>, Error> {
483 let eye_catcher = trim_end_spaces(decoder.decode_string(&input.eye_catcher.0, &warn));
484 let file_label = trim_end_spaces(decoder.decode_string(&input.file_label.0, &warn));
485 let creation_date = decoder.decode_string_cow(&input.creation_date.0, &warn);
487 NaiveDate::parse_from_str(&creation_date, "%e %b %Y").unwrap_or_else(|_| {
488 warn(Error::InvalidCreationDate {
489 creation_date: creation_date.into(),
493 let creation_time = decoder.decode_string_cow(&input.creation_time.0, &warn);
495 NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
496 warn(Error::InvalidCreationTime {
497 creation_time: creation_time.into(),
501 Ok(Some(HeaderRecord {
503 weight_index: input.weight_index.map(|n| n as usize),
504 n_cases: input.n_cases.map(|n| n as u64),
505 creation: NaiveDateTime::new(creation_date, creation_time),
/// The width of a variable: numeric, or a string of a given width in bytes.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VarWidth {
    /// A numeric variable.
    Numeric,
    /// A string variable with the given width.
    String(u16),
}

/// Widths are only partially ordered: two numeric widths compare equal, two
/// string widths compare by length, and a numeric width is incomparable with
/// a string width.
impl PartialOrd for VarWidth {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        match (self, other) {
            (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
            (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
            _ => None,
        }
    }
}
528 const MAX_STRING: u16 = 32767;
530 fn n_dict_indexes(self) -> usize {
532 VarWidth::Numeric => 1,
533 VarWidth::String(w) => div_ceil(w as usize, 8),
540 f: impl Fn(u16, u16) -> u16,
541 ) -> Option<VarWidth> {
543 (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
544 (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
545 Some(VarWidth::String(f(a, b)))
551 /// Returns the wider of `self` and `other`:
552 /// - Numerical variable widths are equally wide.
553 /// - Longer strings are wider than shorter strings.
554 /// - Numerical and string types are incomparable, so result in `None`.
555 /// - Any `None` in the input yields `None` in the output.
556 pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
557 Self::width_predicate(a, b, |a, b| a.max(b))
560 /// Returns the narrower of `self` and `other` (see [`Self::wider`]).
561 pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
562 Self::width_predicate(a, b, |a, b| a.min(b))
566 impl From<VarWidth> for VarType {
567 fn from(source: VarWidth) -> Self {
569 VarWidth::Numeric => VarType::Numeric,
570 VarWidth::String(_) => VarType::String,
575 #[derive(Clone, Debug)]
576 pub struct VariableRecord {
578 pub name: Identifier,
579 pub print_format: Spec,
580 pub write_format: Spec,
581 pub missing_values: MissingValues,
582 pub label: Option<String>,
585 fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec {
586 UncheckedSpec::try_from(raw)
587 .and_then(Spec::try_from)
588 .and_then(|x| x.check_width_compatibility(width))
589 .unwrap_or_else(|error| {
590 let new_format = Spec::default_for_width(width);
591 warn(new_format, error);
596 impl TryDecode for VariableRecord {
597 type Input = raw::VariableRecord;
600 decoder: &mut Decoder,
601 input: &crate::raw::VariableRecord,
602 warn: impl Fn(Error),
603 ) -> Result<Option<VariableRecord>, Error> {
604 let width = match input.width {
605 0 => VarWidth::Numeric,
606 w @ 1..=255 => VarWidth::String(w as u16),
607 -1 => return Ok(None),
609 return Err(Error::InvalidVariableWidth {
610 offset: input.offset,
615 let name = trim_end_spaces(decoder.decode_string(&input.name.0, &warn));
616 let name = match Identifier::new(&name, decoder.encoding) {
618 if !decoder.var_names.contains_key(&name) {
621 let new_name = decoder.generate_name();
622 warn(Error::DuplicateVariableName {
623 duplicate_name: name.clone(),
624 new_name: new_name.clone(),
630 let new_name = decoder.generate_name();
631 warn(Error::InvalidVariableName {
633 new_name: new_name.clone(),
638 let variable = Variable {
639 dict_index: decoder.n_dict_indexes,
640 short_name: name.clone(),
644 decoder.n_dict_indexes += width.n_dict_indexes();
647 .insert(name.clone(), variable.dict_index)
651 .insert(variable.dict_index, variable)
654 let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
655 warn(Error::InvalidPrintFormat {
657 variable: name.clone(),
661 let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
662 warn(Error::InvalidWriteFormat {
664 variable: name.clone(),
671 .map(|label| decoder.decode_string(&label.0, &warn));
672 Ok(Some(VariableRecord {
677 missing_values: input.missing_values.clone(),
683 #[derive(Clone, Debug)]
684 pub struct DocumentRecord(Vec<String>);
686 impl TryDecode for DocumentRecord {
687 type Input = crate::raw::DocumentRecord;
690 decoder: &mut Decoder,
692 warn: impl Fn(Error),
693 ) -> Result<Option<Self>, Error> {
694 Ok(Some(DocumentRecord(
698 .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
708 const NAME: &'static str;
709 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
712 #[derive(Clone, Debug)]
713 pub struct VariableSet {
715 pub vars: Vec<String>,
719 fn parse(input: &str) -> Result<Self, Error> {
720 let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
721 let vars = input.split_ascii_whitespace().map(String::from).collect();
729 trait WarnOnError<T> {
730 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
732 impl<T> WarnOnError<T> for Result<T, Error> {
733 fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
735 Ok(result) => Some(result),
744 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
746 Number(Option<OrderedFloat<f64>>),
751 pub fn decode(raw: raw::Value, decoder: &Decoder) -> Self {
753 raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
754 raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
759 #[derive(Clone, Debug)]
760 pub struct ValueLabel {
765 #[derive(Clone, Debug)]
766 pub struct ValueLabelRecord {
767 pub var_type: VarType,
768 pub labels: Vec<ValueLabel>,
769 pub variables: Vec<Identifier>,
772 impl TryDecode for ValueLabelRecord {
773 type Input = crate::raw::ValueLabelRecord;
775 decoder: &mut Decoder,
777 warn: impl Fn(Error),
778 ) -> Result<Option<ValueLabelRecord>, Error> {
779 let variables: Vec<&Variable> = input
782 .filter_map(|&dict_index| {
784 .get_var_by_index(dict_index as usize)
785 .warn_on_error(&warn)
787 .filter(|&variable| match variable.width {
788 VarWidth::String(width) if width > 8 => {
789 warn(Error::InvalidLongStringValueLabel(
790 variable.short_name.clone(),
797 let mut i = variables.iter();
798 let Some(&first_var) = i.next() else {
801 let var_type: VarType = first_var.width.into();
803 let this_type: VarType = variable.width.into();
804 if var_type != this_type {
805 let (numeric_var, string_var) = match var_type {
806 VarType::Numeric => (first_var, variable),
807 VarType::String => (variable, first_var),
809 warn(Error::ValueLabelsDifferentTypes {
810 numeric_var: numeric_var.short_name.clone(),
811 string_var: string_var.short_name.clone(),
819 .map(|(value, label)| {
820 let label = decoder.decode_string(&label.0, &warn);
821 let value = Value::decode(
822 raw::Value::from_raw(*value, var_type, decoder.endian),
825 ValueLabel { value, label }
828 let variables = variables
830 .map(|&variable| variable.short_name.clone())
832 Ok(Some(ValueLabelRecord {
840 #[derive(Clone, Debug)]
841 pub struct VariableSetRecord(Vec<VariableSet>);
843 impl TextRecord for VariableSetRecord {
844 const NAME: &'static str = "variable set";
845 fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
846 let mut sets = Vec::new();
847 for line in input.lines() {
848 if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
852 Ok(VariableSetRecord(sets))
856 #[derive(Clone, Debug)]
857 pub struct ProductInfoRecord(pub String);
859 impl TextRecord for ProductInfoRecord {
860 const NAME: &'static str = "extra product info";
861 fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
862 Ok(ProductInfoRecord(input.into()))
866 #[derive(Clone, Debug)]
867 pub struct LongName {
868 pub short_name: Identifier,
869 pub long_name: Identifier,
873 fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
874 let short_name = Identifier::new(short_name, decoder.encoding)
875 .map_err(|e| Error::InvalidShortName(e))?;
877 Identifier::new(long_name, decoder.encoding).map_err(|e| Error::InvalidLongName(e))?;
885 #[derive(Clone, Debug)]
886 pub struct LongNameRecord(Vec<LongName>);
888 impl LongNameRecord {
889 pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
890 let mut names = Vec::new();
891 for pair in input.split('\t').filter(|s| !s.is_empty()) {
892 if let Some((short_name, long_name)) = pair.split_once('=') {
893 if let Some(long_name) =
894 LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
896 names.push(long_name);
902 Ok(LongNameRecord(names))
906 #[derive(Clone, Debug)]
907 pub struct VeryLongString {
908 pub short_name: Identifier,
912 impl VeryLongString {
913 fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
914 let Some((short_name, length)) = input.split_once('=') else {
915 return Err(Error::TBD);
917 let short_name = Identifier::new(short_name, decoder.encoding)
918 .map_err(|e| Error::InvalidLongStringName(e))?;
919 let length: u16 = length.parse().map_err(|_| Error::TBD)?;
920 if length > VarWidth::MAX_STRING {
921 return Err(Error::TBD);
924 short_name: short_name.into(),
930 #[derive(Clone, Debug)]
931 pub struct VeryLongStringRecord(Vec<VeryLongString>);
933 impl VeryLongStringRecord {
934 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
935 let mut very_long_strings = Vec::new();
938 .map(|s| s.trim_end_matches('\t'))
939 .filter(|s| !s.is_empty())
941 if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
942 very_long_strings.push(vls)
945 Ok(VeryLongStringRecord(very_long_strings))
949 #[derive(Clone, Debug)]
950 pub struct Attribute {
951 pub name: Identifier,
952 pub values: Vec<String>,
959 warn: &impl Fn(Error),
960 ) -> Result<(Option<Attribute>, &'a str), Error> {
961 let Some((name, mut input)) = input.split_once('(') else {
962 return Err(Error::TBD);
964 let mut values = Vec::new();
966 let Some((value, rest)) = input.split_once('\n') else {
967 return Err(Error::TBD);
969 if let Some(stripped) = value
971 .and_then(|value| value.strip_suffix('\''))
973 values.push(stripped.into());
976 values.push(value.into());
978 if let Some(rest) = rest.strip_prefix(')') {
979 let attribute = Identifier::new(name, decoder.encoding)
980 .map_err(|e| Error::InvalidAttributeName(e))
982 .map(|name| Attribute { name, values });
983 return Ok((attribute, rest));
990 #[derive(Clone, Debug)]
991 pub struct AttributeSet(pub Vec<Attribute>);
997 sentinel: Option<char>,
998 warn: &impl Fn(Error),
999 ) -> Result<(AttributeSet, &'a str), Error> {
1000 let mut attributes = Vec::new();
1002 match input.chars().next() {
1003 None => break input,
1004 c if c == sentinel => break &input[1..],
1006 let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
1007 if let Some(attribute) = attribute {
1008 attributes.push(attribute);
1014 Ok((AttributeSet(attributes), rest))
1018 #[derive(Clone, Debug)]
1019 pub struct FileAttributeRecord(AttributeSet);
1021 impl FileAttributeRecord {
1022 pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1023 let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
1024 if !rest.is_empty() {
1027 Ok(FileAttributeRecord(set))
1031 #[derive(Clone, Debug)]
1032 pub struct VarAttributeSet {
1033 pub long_var_name: Identifier,
1034 pub attributes: AttributeSet,
1037 impl VarAttributeSet {
1041 warn: &impl Fn(Error),
1042 ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
1043 let Some((long_var_name, rest)) = input.split_once(':') else {
1044 return Err(Error::TBD);
1046 let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
1047 let var_attribute = Identifier::new(long_var_name, decoder.encoding)
1048 .map_err(|e| Error::InvalidAttributeVariableName(e))
1049 .warn_on_error(warn)
1050 .map(|name| VarAttributeSet {
1051 long_var_name: name,
1054 Ok((var_attribute, rest))
1058 #[derive(Clone, Debug)]
1059 pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
1061 impl VariableAttributeRecord {
1062 pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
1063 let mut var_attribute_sets = Vec::new();
1064 while !input.is_empty() {
1065 let Some((var_attribute, rest)) =
1066 VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
1070 if let Some(var_attribute) = var_attribute {
1071 var_attribute_sets.push(var_attribute);
1075 Ok(VariableAttributeRecord(var_attribute_sets))
1079 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1087 fn try_decode(source: u32) -> Result<Option<Measure>, Error> {
1090 1 => Ok(Some(Measure::Nominal)),
1091 2 => Ok(Some(Measure::Ordinal)),
1092 3 => Ok(Some(Measure::Scale)),
1093 _ => Err(Error::InvalidMeasurement(source)),
1098 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1099 pub enum Alignment {
1106 fn try_decode(source: u32) -> Result<Option<Alignment>, Error> {
1109 1 => Ok(Some(Alignment::Left)),
1110 2 => Ok(Some(Alignment::Right)),
1111 3 => Ok(Some(Alignment::Center)),
1112 _ => Err(Error::InvalidAlignment(source)),
1117 #[derive(Clone, Debug)]
1118 pub struct VarDisplay {
1119 pub measure: Option<Measure>,
1120 pub width: Option<u32>,
1121 pub alignment: Option<Alignment>,
1124 #[derive(Clone, Debug)]
1125 pub struct VarDisplayRecord(pub Vec<VarDisplay>);
1127 impl TryDecode for VarDisplayRecord {
1128 type Input = raw::VarDisplayRecord;
1130 decoder: &mut Decoder,
1131 input: &Self::Input,
1132 warn: impl Fn(Error),
1133 ) -> Result<Option<Self>, Error> {
1134 let n_vars = decoder.variables.len();
1135 let n_per_var = if input.0.len() == 3 * n_vars {
1137 } else if input.0.len() == 2 * n_vars {
1140 return Err(Error::TBD);
1143 let var_displays = input
1147 let (measure, width, alignment) = match n_per_var == 3 {
1148 true => (chunk[0], Some(chunk[1]), chunk[2]),
1149 false => (chunk[0], None, chunk[1]),
1151 let measure = Measure::try_decode(measure).warn_on_error(&warn).flatten();
1152 let alignment = Alignment::try_decode(alignment)
1153 .warn_on_error(&warn)
1162 Ok(Some(VarDisplayRecord(var_displays)))
1166 #[derive(Clone, Debug)]
1167 pub enum MultipleResponseType {
1170 labels: CategoryLabels,
1175 impl MultipleResponseType {
1178 mr_set: &Identifier,
1179 input: &raw::MultipleResponseType,
1180 min_width: VarWidth,
1181 warn: &impl Fn(Error),
1182 ) -> Result<Self, Error> {
1183 let mr_type = match input {
1184 raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
1185 let value = decoder.decode_string_cow(&value.0, warn);
1186 let value = match min_width {
1187 VarWidth::Numeric => {
1188 let number: f64 = value.trim().parse().map_err(|_| {
1189 Error::InvalidMDGroupCountedValue {
1190 mr_set: mr_set.clone(),
1191 number: value.into(),
1194 Value::Number(Some(number.into()))
1196 VarWidth::String(max_width) => {
1197 let value = value.trim_end_matches(' ');
1198 let width = value.len();
1199 if width > max_width as usize {
1200 return Err(Error::TooWideMDGroupCountedValue {
1201 mr_set: mr_set.clone(),
1202 value: value.into(),
1207 Value::String(value.into())
1210 MultipleResponseType::MultipleDichotomy {
1215 raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
1221 #[derive(Clone, Debug)]
1222 pub struct MultipleResponseSet {
1223 pub name: Identifier,
1224 pub min_width: VarWidth,
1225 pub max_width: VarWidth,
1227 pub mr_type: MultipleResponseType,
1228 pub dict_indexes: Vec<DictIndex>,
1231 impl MultipleResponseSet {
1234 input: &raw::MultipleResponseSet,
1235 warn: &impl Fn(Error),
1236 ) -> Result<Self, Error> {
1237 let mr_set_name = decoder
1238 .decode_identifier(&input.name.0, warn)
1239 .map_err(|error| Error::InvalidMrSetName(error))?;
1241 let label = decoder.decode_string(&input.label.0, warn);
1243 let mut dict_indexes = Vec::with_capacity(input.short_names.len());
1244 for short_name in input.short_names.iter() {
1245 let short_name = match decoder.decode_identifier(&short_name.0, warn) {
1248 warn(Error::InvalidMrSetName(error));
1252 let Some(&dict_index) = decoder.var_names.get(&short_name) else {
1253 warn(Error::UnknownMrSetVariable {
1254 mr_set: mr_set_name.clone(),
1255 short_name: short_name.clone(),
1259 dict_indexes.push(dict_index);
1262 match dict_indexes.len() {
1263 0 => return Err(Error::EmptyMrSet(mr_set_name)),
1264 1 => return Err(Error::OneVarMrSet(mr_set_name)),
1268 let Some((Some(min_width), Some(max_width))) = dict_indexes
1270 .map(|dict_index| decoder.variables[dict_index].width)
1271 .map(|w| (Some(w), Some(w)))
1272 .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
1274 return Err(Error::MixedMrSet(mr_set_name));
1278 MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
1280 Ok(MultipleResponseSet {
1291 #[derive(Clone, Debug)]
1292 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
1294 impl TryDecode for MultipleResponseRecord {
1295 type Input = raw::MultipleResponseRecord;
1298 decoder: &mut Decoder,
1299 input: &Self::Input,
1300 warn: impl Fn(Error),
1301 ) -> Result<Option<Self>, Error> {
1302 let mut sets = Vec::with_capacity(input.0.len());
1303 for set in &input.0 {
1304 match MultipleResponseSet::decode(decoder, set, &warn) {
1305 Ok(set) => sets.push(set),
1306 Err(error) => warn(error),
1309 Ok(Some(MultipleResponseRecord(sets)))
1313 #[derive(Clone, Debug)]
1314 pub struct LongStringValueLabels {
1315 pub var_name: Identifier,
1316 pub width: VarWidth,
1317 pub labels: Vec<ValueLabel>,
1320 impl LongStringValueLabels {
1323 input: &raw::LongStringValueLabels,
1324 warn: &impl Fn(Error),
1325 ) -> Result<Self, Error> {
1326 let var_name = decoder.decode_string(&input.var_name.0, warn);
1327 let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
1328 .map_err(|e| Error::InvalidLongStringValueLabelName(e))?;
1331 let max_width = VarWidth::MAX_STRING;
1332 if input.width < 9 || input.width > max_width as u32 {
1333 return Err(Error::InvalidLongValueLabelWidth {
1334 name: var_name.into(),
1340 let width = input.width as u16;
1342 let mut labels = Vec::with_capacity(input.labels.len());
1343 for (value, label) in input.labels.iter() {
1344 let value = Value::String(decoder.decode_exact_length(&value.0).into());
1345 let label = decoder.decode_string(&label.0, warn);
1346 labels.push(ValueLabel { value, label });
1349 Ok(LongStringValueLabels {
1351 width: VarWidth::String(width),
1357 #[derive(Clone, Debug)]
1358 pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
1360 impl TryDecode for LongStringValueLabelRecord {
1361 type Input = raw::LongStringValueLabelRecord;
1364 decoder: &mut Decoder,
1365 input: &Self::Input,
1366 warn: impl Fn(Error),
1367 ) -> Result<Option<Self>, Error> {
1368 let mut labels = Vec::with_capacity(input.0.len());
1369 for label in &input.0 {
1370 match LongStringValueLabels::decode(decoder, label, &warn) {
1371 Ok(set) => labels.push(set),
1372 Err(error) => warn(error),
1375 Ok(Some(LongStringValueLabelRecord(labels)))
1381 use encoding_rs::WINDOWS_1252;
1385 let mut s = String::new();
1386 s.push(char::REPLACEMENT_CHARACTER);
1387 let encoded = WINDOWS_1252.encode(&s).0;
1388 let decoded = WINDOWS_1252.decode(&encoded[..]).0;
1389 println!("{:?}", decoded);
1394 let charset: Vec<u8> = (0..=255).collect();
1395 println!("{}", charset.len());
1396 let decoded = WINDOWS_1252.decode(&charset[..]).0;
1397 println!("{}", decoded.len());
1398 let encoded = WINDOWS_1252.encode(&decoded[..]).0;
1399 println!("{}", encoded.len());
1400 assert_eq!(&charset[..], &encoded[..]);