+++ /dev/null
-use core::str;
-use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
-
-use crate::{
- dictionary::{
- Dictionary, InvalidRole, MultipleResponseSet, MultipleResponseType, Value, VarWidth,
- Variable, VariableSet,
- },
- encoding::Error as EncodingError,
- endian::Endian,
- format::{Error as FormatError, Format, UncheckedFormat},
- identifier::{ByIdentifier, Error as IdError, Identifier},
- raw::{
- self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
- FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord,
- LongStringMissingValueRecord, LongStringValueLabelRecord, MissingValues,
- MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawStrArray, RawWidth,
- ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributeRecord, VariableRecord,
- VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer,
- },
-};
-use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
-use encoding_rs::Encoding;
-use indexmap::set::MutableValues;
-use thiserror::Error as ThisError;
-
-pub use crate::raw::{CategoryLabels, Compression};
-
-/// Errors and warnings produced while collecting raw records into [`Headers`]
-/// and decoding them into a dictionary.
-///
-/// Many of these are not fatal: the decoding code in this file reports them
-/// through a caller-supplied `warn` callback and carries on.
-#[derive(ThisError, Debug)]
-pub enum Error {
-    /// No file header record was present among the decoded records.
-    #[error("Missing header record")]
-    MissingHeaderRecord,
-
-    /// More than one file header record was present.
-    // XXX this is an internal error
-    #[error("More than one file header record")]
-    DuplicateHeaderRecord,
-
-    /// Wrapper for an error from the encoding module.
-    #[error("{0}")]
-    EncodingError(EncodingError),
-
-    /// A default encoding (named by the payload) is being used.
-    #[error("Using default encoding {0}.")]
-    UsingDefaultEncoding(String),
-
-    /// A variable record has a width outside the valid range.
-    #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
-    InvalidVariableWidth { offsets: Range<u64>, width: i32 },
-
-    /// Long string missing value metadata is corrupted.
-    #[error("This file has corrupted metadata written by a buggy version of PSPP.  To ensure that other software can read it correctly, save a new copy of the file.")]
-    InvalidLongMissingValueFormat,
-
-    /// The creation date in the file header could not be parsed.
-    #[error("File creation date {creation_date} is not in the expected \"DD MMM YY\" format.  Using 01 Jan 1970.")]
-    InvalidCreationDate { creation_date: String },
-
-    /// The creation time in the file header could not be parsed.
-    #[error("File creation time {creation_time} is not in the expected \"HH:MM:SS\" format.  Using midnight.")]
-    InvalidCreationTime { creation_time: String },
-
-    /// A variable name was not a valid identifier, so the variable was given
-    /// the generated name `new_name`.
-    #[error("{id_error}  Renaming variable to {new_name}.")]
-    InvalidVariableName {
-        id_error: IdError,
-        new_name: Identifier,
-    },
-
-    /// A variable's print format was invalid and `new_spec` was substituted.
-    #[error(
-        "Substituting {new_spec} for invalid print format on variable {variable}.  {format_error}"
-    )]
-    InvalidPrintFormat {
-        new_spec: Format,
-        variable: Identifier,
-        format_error: FormatError,
-    },
-
-    /// A variable's write format was invalid and `new_spec` was substituted.
-    #[error(
-        "Substituting {new_spec} for invalid write format on variable {variable}.  {format_error}"
-    )]
-    InvalidWriteFormat {
-        new_spec: Format,
-        variable: Identifier,
-        format_error: FormatError,
-    },
-
-    /// A variable name duplicated an earlier one, so the variable was given
-    /// the generated name `new_name`.
-    #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
-    DuplicateVariableName {
-        duplicate_name: Identifier,
-        new_name: Identifier,
-    },
-
-    /// A dictionary index was outside the valid range.
-    #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
-    InvalidDictIndex { dict_index: usize, max_index: usize },
-
-    /// A dictionary index referred to a long string continuation record.
-    #[error("Dictionary index {0} refers to a long string continuation.")]
-    DictIndexIsContinuation(usize),
-
-    /// Value label variable indexes referred to long string continuation
-    /// records.
-    #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")]
-    LongStringContinuationIndexes { offset: u64, indexes: Vec<u32> },
-
-    /// An ordinary value label record tried to label long string variables.
-    #[error(
-        "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end
-    )]
-    InvalidLongStringValueLabels {
-        offsets: Range<u64>,
-        variables: Vec<Identifier>,
-    },
-
-    /// A value label record mixed numeric and string variables.
-    #[error("Variables associated with value label are not all of identical type.  Variable {numeric_var} is numeric, but variable {string_var} is string.")]
-    ValueLabelsDifferentTypes {
-        numeric_var: Identifier,
-        string_var: Identifier,
-    },
-
-    /// A multiple response set name was not a valid identifier.
-    #[error("Invalid multiple response set name.  {0}")]
-    InvalidMrSetName(IdError),
-
-    /// A multiple response set named a variable that is not in the dictionary.
-    #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
-    UnknownMrSetVariable {
-        mr_set: Identifier,
-        short_name: Identifier,
-    },
-
-    /// A multiple response set ended up with no variables.
-    #[error("Multiple response set {0} has no variables.")]
-    EmptyMrSet(Identifier),
-
-    /// A multiple response set ended up with a single variable.
-    #[error("Multiple response set {0} has only one variable.")]
-    OneVarMrSet(Identifier),
-
-    /// A multiple response set mixed string and numeric variables.
-    #[error("Multiple response set {0} contains both string and numeric variables.")]
-    MixedMrSet(Identifier),
-
-    /// The counted value of a numeric multiple dichotomy set did not parse as
-    /// a number.
-    #[error(
-        "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
-    )]
-    InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
-
-    /// The counted value of a string multiple dichotomy set is wider than the
-    /// set's narrowest variable.
-    #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
-    TooWideMDGroupCountedValue {
-        mr_set: Identifier,
-        value: String,
-        width: usize,
-        max_width: u16,
-    },
-
-    /// A long string value label record gave a width outside the valid range.
-    #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
-    InvalidLongValueLabelWidth {
-        name: Identifier,
-        width: u32,
-        min_width: u16,
-        max_width: u16,
-    },
-
-    /// An attribute name was not a valid identifier.
-    #[error("Invalid attribute name.  {0}")]
-    InvalidAttributeName(IdError),
-
-    /// A short name in a long variable name record was not a valid identifier.
-    #[error("Invalid short name in long variable name record.  {0}")]
-    InvalidShortName(IdError),
-
-    /// A long name in a long variable name record was not a valid identifier.
-    #[error("Invalid name in long variable name record.  {0}")]
-    InvalidLongName(IdError),
-
-    /// A variable name in a very long string record was not a valid
-    /// identifier.
-    #[error("Invalid variable name in very long string record.  {0}")]
-    InvalidLongStringName(IdError),
-
-    /// A variable name in a long string value label record was not a valid
-    /// identifier.
-    #[error("Invalid variable name in long string value label record.  {0}")]
-    InvalidLongStringValueLabelName(IdError),
-
-    /// A variable name in a variable attribute record was not a valid
-    /// identifier.
-    #[error("Invalid variable name in attribute record.  {0}")]
-    InvalidAttributeVariableName(IdError),
-
-    /// A text string held bytes that are invalid in the file's encoding.
-    // XXX This is risky because `text` might be arbitrarily long.
-    #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
-    MalformedString { encoding: String, text: String },
-
-    /// Placeholder for diagnostics whose wording has not been written yet.
-    #[error("Details TBD")]
-    TBD,
-}
-
-/// The decoded records of a system file, sorted by record type.
-///
-/// Built by [`Headers::new`].  Record types stored as `Vec` keep every record
-/// of that type; record types stored as `Option` keep only the first one seen
-/// (`Headers::new` warns when there were more).
-#[derive(Clone, Debug)]
-pub struct Headers {
- // The file header record; `Headers::new` fails without one.
- pub header: HeaderRecord<String>,
- pub variable: Vec<VariableRecord<String>>,
- pub value_label: Vec<ValueLabelRecord<RawStrArray<8>, String>>,
- pub document: Vec<DocumentRecord<String>>,
- pub integer_info: Option<IntegerInfoRecord>,
- pub float_info: Option<FloatInfoRecord>,
- pub var_display: Option<VarDisplayRecord>,
- pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
- pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
- pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier>>,
- pub encoding: Option<EncodingRecord>,
- pub number_of_cases: Option<NumberOfCasesRecord>,
- pub variable_sets: Vec<VariableSetRecord>,
- pub product_info: Option<ProductInfoRecord>,
- pub long_names: Vec<LongNamesRecord>,
- pub very_long_strings: Vec<VeryLongStringsRecord>,
- pub file_attributes: Vec<FileAttributeRecord>,
- pub variable_attributes: Vec<VariableAttributeRecord>,
- // Extension records that matched none of the specific kinds above.
- pub other_extension: Vec<Extension>,
- pub end_of_headers: Option<u32>,
- pub z_header: Option<ZHeader>,
- pub z_trailer: Option<ZTrailer>,
- pub cases: Option<Rc<RefCell<Cases>>>,
-}
-
-/// Returns the first element of `vec`, or `None` if `vec` is empty.
-///
-/// If `vec` holds two or more elements, `more_than_one` is called once before
-/// the surplus elements are dropped, so the caller can report them.
-fn take_first<T, F>(vec: Vec<T>, more_than_one: F) -> Option<T>
-where
-    F: FnOnce(),
-{
-    let has_surplus = vec.len() >= 2;
-    if has_surplus {
-        more_than_one();
-    }
-    vec.into_iter().next()
-}
-
-impl Headers {
-    /// Sorts `headers` by record type into a new `Headers`.
-    ///
-    /// Record types that may appear only once keep their first occurrence;
-    /// `warn` is called once for each such type that appeared more than once.
-    ///
-    /// # Errors
-    ///
-    /// Returns [`Error::MissingHeaderRecord`] if `headers` contains no file
-    /// header record.
-    pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
-        /// Keeps the first of `records`, warning if there was more than one.
-        fn at_most_one<T>(records: Vec<T>, warn: &impl Fn(Error)) -> Option<T> {
-            take_first(records, || warn(Error::TBD))
-        }
-
-        let mut file_header = Vec::new();
-        let mut variable = Vec::new();
-        let mut value_label = Vec::new();
-        let mut document = Vec::new();
-        let mut integer_info = Vec::new();
-        let mut float_info = Vec::new();
-        let mut var_display = Vec::new();
-        let mut multiple_response = Vec::new();
-        let mut long_string_value_labels = Vec::new();
-        let mut long_string_missing_values = Vec::new();
-        let mut encoding = Vec::new();
-        let mut number_of_cases = Vec::new();
-        let mut variable_sets = Vec::new();
-        let mut product_info = Vec::new();
-        let mut long_names = Vec::new();
-        let mut very_long_strings = Vec::new();
-        let mut file_attributes = Vec::new();
-        let mut variable_attributes = Vec::new();
-        let mut other_extension = Vec::new();
-        let mut end_of_headers = Vec::new();
-        let mut z_header = Vec::new();
-        let mut z_trailer = Vec::new();
-        let mut cases = Vec::new();
-
-        // Bucket every record by its type, preserving input order per bucket.
-        for record in headers {
-            match record {
-                DecodedRecord::Header(r) => file_header.push(r),
-                DecodedRecord::Variable(r) => variable.push(r),
-                DecodedRecord::ValueLabel(r) => value_label.push(r),
-                DecodedRecord::Document(r) => document.push(r),
-                DecodedRecord::IntegerInfo(r) => integer_info.push(r),
-                DecodedRecord::FloatInfo(r) => float_info.push(r),
-                DecodedRecord::VariableSets(r) => variable_sets.push(r),
-                DecodedRecord::VarDisplay(r) => var_display.push(r),
-                DecodedRecord::MultipleResponse(r) => multiple_response.push(r),
-                DecodedRecord::LongStringValueLabels(r) => long_string_value_labels.push(r),
-                DecodedRecord::LongStringMissingValues(r) => long_string_missing_values.push(r),
-                DecodedRecord::Encoding(r) => encoding.push(r),
-                DecodedRecord::NumberOfCases(r) => number_of_cases.push(r),
-                DecodedRecord::ProductInfo(r) => product_info.push(r),
-                DecodedRecord::LongNames(r) => long_names.push(r),
-                DecodedRecord::VeryLongStrings(r) => very_long_strings.push(r),
-                DecodedRecord::FileAttributes(r) => file_attributes.push(r),
-                DecodedRecord::VariableAttributes(r) => variable_attributes.push(r),
-                DecodedRecord::OtherExtension(r) => other_extension.push(r),
-                DecodedRecord::EndOfHeaders(r) => end_of_headers.push(r),
-                DecodedRecord::ZHeader(r) => z_header.push(r),
-                DecodedRecord::ZTrailer(r) => z_trailer.push(r),
-                DecodedRecord::Cases(r) => cases.push(r),
-            }
-        }
-
-        // The file header is the only record that is mandatory.
-        let header = match take_first(file_header, || warn(Error::DuplicateHeaderRecord)) {
-            Some(header) => header,
-            None => return Err(Error::MissingHeaderRecord),
-        };
-
-        Ok(Headers {
-            header,
-            variable,
-            value_label,
-            document,
-            integer_info: at_most_one(integer_info, warn),
-            float_info: at_most_one(float_info, warn),
-            var_display: at_most_one(var_display, warn),
-            multiple_response,
-            long_string_value_labels,
-            long_string_missing_values,
-            encoding: at_most_one(encoding, warn),
-            number_of_cases: at_most_one(number_of_cases, warn),
-            variable_sets,
-            product_info: at_most_one(product_info, warn),
-            long_names,
-            very_long_strings,
-            file_attributes,
-            variable_attributes,
-            other_extension,
-            end_of_headers: at_most_one(end_of_headers, warn),
-            z_header: at_most_one(z_header, warn),
-            z_trailer: at_most_one(z_trailer, warn),
-            cases: at_most_one(cases, warn),
-        })
-    }
-}
-
-/// File-level information about a system file that is not part of its
-/// dictionary.  Produced by `Metadata::decode` from [`Headers`].
-#[derive(Debug)]
-pub struct Metadata {
- // Creation date and time, parsed from the file header's text fields.
- pub creation: NaiveDateTime,
- pub endian: Endian,
- pub compression: Option<Compression>,
- pub n_cases: Option<u64>,
- // The header's eye-catcher string with the "@(#) SPSS DATA FILE" prefix
- // and trailing whitespace removed.
- pub product: String,
- // Text of the product info record, if any, with line ends normalized.
- pub product_ext: Option<String>,
- // Version triple from the integer info record, if any.
- pub version: Option<(i32, i32, i32)>,
-}
-
-impl Metadata {
-    /// Extracts file-level metadata from `headers`.
-    ///
-    /// The creation date is expected in "DD MMM YY" form and the creation time
-    /// in "HH:MM:SS" form.  If either fails to parse, `warn` is called and the
-    /// default for that part is used instead (01 Jan 1970 and midnight,
-    /// according to the warning messages).
-    fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
-        let header = &headers.header;
-        // The year in the header has only two digits, so it must be parsed
-        // with `%y`.  `%Y` would accept "11" as the year 0011.
-        let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %y")
-            .unwrap_or_else(|_| {
-                warn(Error::InvalidCreationDate {
-                    creation_date: header.creation_date.to_string(),
-                });
-                Default::default()
-            });
-        let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
-            .unwrap_or_else(|_| {
-                warn(Error::InvalidCreationTime {
-                    creation_time: header.creation_time.to_string(),
-                });
-                Default::default()
-            });
-        let creation = NaiveDateTime::new(creation_date, creation_time);
-
-        // Drop the fixed eye-catcher prefix and the padding at the end.
-        let product = header
-            .eye_catcher
-            .trim_start_matches("@(#) SPSS DATA FILE")
-            .trim_end()
-            .to_string();
-
-        Self {
-            creation,
-            endian: header.endian,
-            compression: header.compression,
-            n_cases: header.n_cases.map(|n| n as u64),
-            product,
-            product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
-            version: headers.integer_info.as_ref().map(|ii| ii.version),
-        }
-    }
-}
-
-/// State used by [`decode`] for making up replacement variable names.
-struct Decoder {
- // Encoding that generated names must be valid in.
- pub encoding: &'static Encoding,
- // Counter behind the most recently generated `VARnnn` name.
- n_generated_names: usize,
-}
-
-impl Decoder {
-    /// Returns a fresh name of the form `VARnnn` that no variable in
-    /// `dictionary` uses yet.
-    ///
-    /// The counter advances on every attempt, so names are never reused even
-    /// when a candidate turns out to be taken.
-    fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
-        loop {
-            self.n_generated_names += 1;
-            let candidate = format!("VAR{:03}", self.n_generated_names);
-            let candidate = Identifier::from_encoding(candidate, self.encoding).unwrap();
-            if dictionary.variables.contains(&candidate.0) {
-                // Taken: make sure the counter can still advance, then retry.
-                assert!(self.n_generated_names < usize::MAX);
-                continue;
-            }
-            return candidate;
-        }
-    }
-}
-
-/// Decodes `headers` into a [`Dictionary`] and file-level [`Metadata`].
-///
-/// `encoding` is the character encoding given to the new dictionary and used
-/// to validate variable names.  Problems that do not prevent decoding are
-/// reported through `warn`, and the offending item is repaired or skipped.
-///
-/// # Panics
-///
-/// Panics if `headers` violates invariants that this function does not check
-/// itself, e.g. a non-continuation variable record whose width cannot be
-/// converted to a `VarWidth`, or a value label record whose index matches no
-/// variable.  NOTE(review): presumably the raw-record layer rejects such
-/// input before it gets here; confirm.
-pub fn decode(
-    mut headers: Headers,
-    encoding: &'static Encoding,
-    warn: impl Fn(Error),
-) -> Result<(Dictionary, Metadata), Error> {
-    let mut dictionary = Dictionary::new(encoding);
-
-    let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
-    if !file_label.is_empty() {
-        dictionary.file_label = Some(file_label);
-    }
-
-    for mut attributes in headers.file_attributes.drain(..) {
-        dictionary.attributes.append(&mut attributes.0)
-    }
-
-    // Concatenate all the document records (really there should only be one)
-    // and trim off the trailing spaces that pad them to 80 bytes.
-    dictionary.documents = headers
-        .document
-        .drain(..)
-        .flat_map(|record| record.lines)
-        .map(trim_end_spaces)
-        .collect();
-
-    // XXX warn for weird integer format
-    // XXX warn for weird floating-point format, etc.
-
-    let mut decoder = Decoder {
-        encoding,
-        n_generated_names: 0,
-    };
-
-    // Maps the 0-based value index at which each variable starts to the
-    // variable's index in the dictionary.
-    let mut var_index_map = HashMap::new();
-    let mut value_index = 0;
-    for (index, input) in headers
-        .variable
-        .iter()
-        .enumerate()
-        .filter(|(_index, record)| record.width != RawWidth::Continuation)
-    {
-        let name = trim_end_spaces(input.name.to_string());
-        let name = match Identifier::from_encoding(name, encoding) {
-            Ok(name) => {
-                if !dictionary.variables.contains(&name.0) {
-                    name
-                } else {
-                    let new_name = decoder.generate_name(&dictionary);
-                    warn(Error::DuplicateVariableName {
-                        duplicate_name: name.clone(),
-                        new_name: new_name.clone(),
-                    });
-                    new_name
-                }
-            }
-            Err(id_error) => {
-                let new_name = decoder.generate_name(&dictionary);
-                warn(Error::InvalidVariableName {
-                    id_error,
-                    new_name: new_name.clone(),
-                });
-                new_name
-            }
-        };
-        let mut variable = Variable::new(name.clone(), VarWidth::try_from(input.width).unwrap());
-
-        // Set the short name the same as the long name (even if we renamed it).
-        variable.short_names = vec![name];
-
-        variable.label = input.label.clone();
-
-        variable.missing_values = input.missing_values.clone();
-
-        variable.print_format = decode_format(
-            input.print_format,
-            variable.width,
-            |new_spec, format_error| {
-                warn(Error::InvalidPrintFormat {
-                    new_spec,
-                    variable: variable.name.clone(),
-                    format_error,
-                })
-            },
-        );
-        variable.write_format = decode_format(
-            input.write_format,
-            variable.width,
-            |new_spec, format_error| {
-                warn(Error::InvalidWriteFormat {
-                    new_spec,
-                    variable: variable.name.clone(),
-                    format_error,
-                })
-            },
-        );
-
-        // Check for long string continuation records.
-        let n_values = input.width.n_values().unwrap();
-        for offset in 1..n_values {
-            if headers
-                .variable
-                .get(index + offset)
-                .is_none_or(|record| record.width != RawWidth::Continuation)
-            {
-                warn(Error::TBD);
-                break;
-            }
-        }
-
-        let dict_index = dictionary.add_var(variable).unwrap();
-        assert_eq!(var_index_map.insert(value_index, dict_index), None);
-        value_index += n_values;
-    }
-
-    // Weight variable.  The index in the header is 1-based.
-    if let Some(weight_index) = headers.header.weight_index {
-        if let Some(dict_index) = var_index_map.get(&(weight_index as usize - 1)) {
-            let variable = &dictionary.variables[*dict_index];
-            if variable.is_numeric() {
-                dictionary.weight = Some(*dict_index);
-            } else {
-                warn(Error::TBD);
-            }
-        } else {
-            warn(Error::TBD);
-        }
-    }
-
-    // Value labels for numeric and short string variables.
-    for record in headers.value_label.drain(..) {
-        let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len());
-        let mut long_string_variables = Vec::new();
-        for value_index in record.dict_indexes.iter() {
-            let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) else {
-                unreachable!()
-            };
-            let variable = &dictionary.variables[*dict_index];
-            if variable.width.is_long_string() {
-                long_string_variables.push(variable.name.clone());
-            } else {
-                dict_indexes.push(*dict_index);
-            }
-        }
-        if !long_string_variables.is_empty() {
-            warn(Error::InvalidLongStringValueLabels {
-                offsets: record.offsets.clone(),
-                variables: long_string_variables,
-            });
-        }
-
-        for dict_index in dict_indexes {
-            let variable = dictionary.variables.get_index_mut2(dict_index).unwrap();
-            for ValueLabel { value, label } in record.labels.iter().cloned() {
-                let value = value.decode(variable.width);
-                variable.value_labels.insert(value, label);
-            }
-        }
-    }
-
-    // Display parameters, matched to variables by position.
-    if let Some(display) = &headers.var_display {
-        for (index, display) in display.0.iter().enumerate() {
-            if let Some(variable) = dictionary.variables.get_index_mut2(index) {
-                if let Some(width) = display.width {
-                    variable.display_width = width;
-                }
-                if let Some(alignment) = display.alignment {
-                    variable.alignment = alignment;
-                }
-                if let Some(measure) = display.measure {
-                    variable.measure = Some(measure);
-                }
-            } else {
-                warn(Error::TBD);
-            }
-        }
-    }
-
-    // Multiple response sets.
-    for record in headers
-        .multiple_response
-        .iter()
-        .flat_map(|record| record.0.iter())
-    {
-        match MultipleResponseSet::decode(&dictionary, record, &warn) {
-            Ok(mrset) => {
-                dictionary.mrsets.insert(ByIdentifier::new(mrset));
-            }
-            Err(error) => warn(error),
-        }
-    }
-
-    // Very long strings: merge the segment variables of each one back into a
-    // single variable.
-    'outer: for record in headers
-        .very_long_strings
-        .drain(..)
-        .flat_map(|record| record.0.into_iter())
-    {
-        let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else {
-            warn(Error::TBD);
-            continue;
-        };
-        let width = VarWidth::String(record.length);
-        let n_segments = width.n_segments();
-        if n_segments == 1 {
-            warn(Error::TBD);
-            continue;
-        }
-        if index + n_segments > dictionary.variables.len() {
-            warn(Error::TBD);
-            continue;
-        }
-        let mut short_names = Vec::with_capacity(n_segments);
-        for i in 0..n_segments {
-            let alloc_width = width.segment_alloc_width(i);
-            let segment = &dictionary.variables[index + i];
-            short_names.push(segment.short_names[0].clone());
-            let segment_width = segment.width.as_string_width().unwrap_or(0);
-            if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) {
-                warn(Error::TBD);
-                continue 'outer;
-            }
-        }
-        dictionary.delete_vars(index + 1..index + n_segments);
-        let variable = dictionary.variables.get_index_mut2(index).unwrap();
-        variable.short_names = short_names;
-        variable.width = width;
-    }
-
-    if headers.long_names.is_empty() {
-        // There are no long variable names.  Use the short variable names,
-        // converted to lowercase, as the long variable names.
-        for index in 0..dictionary.variables.len() {
-            let lower = dictionary.variables[index].name.0.as_ref().to_lowercase();
-            if let Ok(new_name) = Identifier::from_encoding(lower, dictionary.encoding) {
-                dictionary.try_rename_var(index, new_name);
-            }
-        }
-    } else {
-        // Rename each of the variables, one by one.  (In a correctly
-        // constructed system file, this cannot create any intermediate
-        // duplicate variable names, because all of the new variable names are
-        // longer than any of the old variable names and thus there cannot be
-        // any overlaps.)
-        for renaming in headers
-            .long_names
-            .iter()
-            .flat_map(|record| record.0.iter().cloned())
-        {
-            let LongName {
-                short_name,
-                long_name,
-            } = renaming;
-            if let Some(index) = dictionary.variables.get_index_of(&short_name.0) {
-                dictionary.try_rename_var(index, long_name);
-                dictionary
-                    .variables
-                    .get_index_mut2(index)
-                    .unwrap()
-                    .short_names = vec![short_name];
-            } else {
-                warn(Error::TBD);
-            }
-        }
-    }
-
-    // Variable attributes, matched to variables by long name.
-    for mut attr_set in headers
-        .variable_attributes
-        .drain(..)
-        .flat_map(|record| record.0.into_iter())
-    {
-        if let Some((_, variable)) = dictionary
-            .variables
-            .get_full_mut2(&attr_set.long_var_name.0)
-        {
-            variable.attributes.append(&mut attr_set.attributes);
-        } else {
-            warn(Error::TBD);
-        }
-    }
-
-    // Assign variable roles.
-    for index in 0..dictionary.variables.len() {
-        let variable = dictionary.variables.get_index_mut2(index).unwrap();
-        match variable.attributes.role() {
-            Ok(role) => variable.role = role,
-            Err(InvalidRole) => warn(Error::TBD),
-        }
-    }
-
-    // Long string value labels.
-    for record in headers
-        .long_string_value_labels
-        .drain(..)
-        .flat_map(|record| record.0.into_iter())
-    {
-        let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else {
-            warn(Error::TBD);
-            continue;
-        };
-        let Some(width) = variable.width.as_string_width() else {
-            warn(Error::TBD);
-            continue;
-        };
-        for (mut value, label) in record.labels.into_iter() {
-            // XXX warn about too-long value?
-            value.0.resize(width, b' ');
-            // XXX warn about duplicate value labels?
-            variable.value_labels.insert(Value::String(value), label);
-        }
-    }
-
-    // Long string missing values.  `value` is a scratch buffer reused for
-    // every missing value.
-    let mut value = Vec::new();
-    for record in headers
-        .long_string_missing_values
-        .drain(..)
-        .flat_map(|record| record.0.into_iter())
-    {
-        let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else {
-            warn(Error::TBD);
-            continue;
-        };
-        // A malformed file can name a numeric variable here.  Warn and skip
-        // the record, as is done for long string value labels, instead of
-        // panicking.
-        let Some(width) = variable.width.as_string_width() else {
-            warn(Error::TBD);
-            continue;
-        };
-        let values = record
-            .missing_values
-            .into_iter()
-            .map(|v| {
-                value.clear();
-                value.extend_from_slice(v.0.as_slice());
-                value.resize(width, b' ');
-                Value::String(Box::from(value.as_slice()))
-            })
-            .collect::<Vec<_>>();
-        variable.missing_values = MissingValues {
-            values,
-            range: None,
-        };
-    }
-
-    // Variable sets.  Unknown variable names are dropped with a warning, and a
-    // set that ends up empty is not added at all.
-    for record in headers
-        .variable_sets
-        .drain(..)
-        .flat_map(|record| record.sets.into_iter())
-    {
-        let mut variables = Vec::with_capacity(record.variable_names.len());
-        for variable_name in record.variable_names {
-            let Some((dict_index, _)) = dictionary.variables.get_full_mut2(&variable_name.0) else {
-                warn(Error::TBD);
-                continue;
-            };
-            variables.push(dict_index);
-        }
-        if !variables.is_empty() {
-            let variable_set = VariableSet {
-                name: record.name,
-                variables,
-            };
-            dictionary
-                .variable_sets
-                .insert(ByIdentifier::new(variable_set));
-        }
-    }
-
-    let metadata = Metadata::decode(&headers, warn);
-    Ok((dictionary, metadata))
-}
-
-impl MultipleResponseSet {
-    /// Builds a multiple response set from the raw record `input`, resolving
-    /// its variable names against `dictionary`.
-    ///
-    /// Names that are not in `dictionary` are reported through `warn` and left
-    /// out of the set.
-    ///
-    /// # Errors
-    ///
-    /// Fails if fewer than two variables remain, if the variables mix string
-    /// and numeric types, or if the set's type cannot be decoded.
-    fn decode(
-        dictionary: &Dictionary,
-        input: &raw::MultipleResponseSet<Identifier, String>,
-        warn: &impl Fn(Error),
-    ) -> Result<Self, Error> {
-        let mr_set_name = input.name.clone();
-
-        // Resolve each short name to a dictionary index, skipping unknown ones.
-        let mut variables = Vec::with_capacity(input.short_names.len());
-        variables.extend(input.short_names.iter().filter_map(|short_name| {
-            let found = dictionary.variables.get_index_of(&short_name.0);
-            if found.is_none() {
-                warn(Error::UnknownMrSetVariable {
-                    mr_set: mr_set_name.clone(),
-                    short_name: short_name.clone(),
-                });
-            }
-            found
-        }));
-
-        if variables.is_empty() {
-            return Err(Error::EmptyMrSet(mr_set_name));
-        }
-        if variables.len() == 1 {
-            return Err(Error::OneVarMrSet(mr_set_name));
-        }
-
-        // Narrowest and widest member width; either side becomes `None` when
-        // the widths cannot be combined.
-        let widths = variables
-            .iter()
-            .map(|&dict_index| {
-                let w = dictionary.variables[dict_index].width;
-                (Some(w), Some(w))
-            })
-            .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)));
-        let (min_width, max_width) = match widths {
-            Some((Some(min_width), Some(max_width))) => (min_width, max_width),
-            _ => return Err(Error::MixedMrSet(mr_set_name)),
-        };
-
-        let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, min_width)?;
-
-        Ok(MultipleResponseSet {
-            name: mr_set_name,
-            width: min_width..=max_width,
-            label: input.label.to_string(),
-            mr_type,
-            variables,
-        })
-    }
-}
-
-/// Returns `s` with all trailing ASCII spaces removed.  Other trailing
-/// whitespace is kept.  The string is shortened in place.
-fn trim_end_spaces(mut s: String) -> String {
-    while s.ends_with(' ') {
-        s.pop();
-    }
-    s
-}
-
-/// Returns a copy of `s` with every CR LF pair and every lone CR turned into
-/// a single LF.
-///
-/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
-/// files that use CR-only line ends in the file label and extra product info.)
-fn fix_line_ends(s: &str) -> String {
-    let mut out = String::with_capacity(s.len());
-    // True when the previous character was a CR, whose LF is already written.
-    let mut after_cr = false;
-    for c in s.chars() {
-        match c {
-            // Second half of a CR LF pair: already emitted.
-            '\n' if after_cr => after_cr = false,
-            '\r' => {
-                out.push('\n');
-                after_cr = true;
-            }
-            _ => {
-                out.push(c);
-                after_cr = false;
-            }
-        }
-    }
-    out
-}
-
-/// Converts the raw format `raw` into a [`Format`] usable with a variable of
-/// the given `width`.
-///
-/// If `raw` is invalid or does not suit `width`, the default format for
-/// `width` is returned instead, after `warn` has been called with that
-/// substitute and the error.
-fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format {
-    let checked = UncheckedFormat::try_from(raw)
-        .and_then(Format::try_from)
-        .and_then(|format| format.check_width_compatibility(width));
-    match checked {
-        Ok(format) => format,
-        Err(error) => {
-            let fallback = Format::default_for_width(width);
-            warn(fallback, error);
-            fallback
-        }
-    }
-}
-
-impl MultipleResponseType {
-    /// Decodes the raw multiple response type `input` for the set `mr_set`.
-    ///
-    /// `min_width` is the width of the narrowest variable in the set.  It
-    /// decides whether a multiple dichotomy set's counted value is read as a
-    /// number or as a string with trailing spaces removed.
-    ///
-    /// # Errors
-    ///
-    /// Fails if a numeric counted value does not parse as a number, or if a
-    /// string counted value is wider than `min_width`.
-    fn decode(
-        mr_set: &Identifier,
-        input: &raw::MultipleResponseType,
-        min_width: VarWidth,
-    ) -> Result<Self, Error> {
-        let (value, labels) = match input {
-            raw::MultipleResponseType::MultipleCategory => {
-                return Ok(MultipleResponseType::MultipleCategory);
-            }
-            raw::MultipleResponseType::MultipleDichotomy { value, labels } => (value, labels),
-        };
-
-        let counted = match min_width {
-            VarWidth::Numeric => {
-                let text = String::from_utf8_lossy(&value.0);
-                match text.trim().parse::<f64>() {
-                    Ok(number) => Value::Number(Some(number)),
-                    Err(_) => {
-                        return Err(Error::InvalidMDGroupCountedValue {
-                            mr_set: mr_set.clone(),
-                            number: text.into(),
-                        });
-                    }
-                }
-            }
-            VarWidth::String(max_width) => {
-                let bytes = value.0.as_slice();
-                // One past the last byte that is not a space.
-                let end = bytes
-                    .iter()
-                    .rposition(|&byte| byte != b' ')
-                    .map_or(0, |last| last + 1);
-                let trimmed = &bytes[..end];
-                let width = trimmed.len();
-                if width > max_width as usize {
-                    return Err(Error::TooWideMDGroupCountedValue {
-                        mr_set: mr_set.clone(),
-                        value: String::from_utf8_lossy(trimmed).into(),
-                        width,
-                        max_width,
-                    });
-                }
-                Value::String(trimmed.into())
-            }
-        };
-
-        Ok(MultipleResponseType::MultipleDichotomy {
-            value: counted,
-            labels: *labels,
-        })
-    }
-}
use crate::{
format::Format,
identifier::{ByIdentifier, HasIdentifier, Identifier},
- raw::{Alignment, CategoryLabels, Measure, MissingValues, RawString, VarType},
+ sys::raw::{Alignment, CategoryLabels, Measure, MissingValues, RawString, VarType},
};
/// An index within [Dictionary::variables].
+++ /dev/null
-use crate::locale_charset::locale_charset;
-use encoding_rs::{Encoding, UTF_8};
-
-include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
-
-/// Looks up the code page number for the encoding named `encoding`.
-///
-/// The name is matched without regard to ASCII case.  Returns `None` if the
-/// name is not in the generated table.
-pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
-    let key = encoding.to_ascii_lowercase();
-    CODEPAGE_NAME_TO_NUMBER.get(key.as_str()).copied()
-}
-
-use thiserror::Error as ThisError;
-
-/// Reasons why [`get_encoding`] cannot determine a system file's character
-/// encoding.
-#[derive(ThisError, Debug)]
-pub enum Error {
- // Neither an encoding name nor a usable character code was available.
- #[error("This system file does not indicate its own character encoding.  For best results, specify an encoding explicitly.  Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
- NoEncoding,
-
- // The character code is not in the code page table.
- #[error("This system file encodes text strings with unknown code page {0}.")]
- UnknownCodepage(i32),
-
- // The encoding label is not one that `encoding_rs` recognizes.
- #[error("This system file encodes text strings with unknown encoding {0}.")]
- UnknownEncoding(String),
-
- // The character code is 1, which `get_encoding` reports as EBCDIC.
- #[error("This system file is encoded in EBCDIC, which is not supported.")]
- Ebcdic,
-}
-
-/// Returns the encoding named by the locale's character set, or UTF-8 if
-/// `encoding_rs` does not recognize that name.
-///
-/// The lookup is done once, on first use, and the result is cached.
-pub fn default_encoding() -> &'static Encoding {
-    lazy_static! {
-        static ref DEFAULT_ENCODING: &'static Encoding =
-            match Encoding::for_label(locale_charset().as_bytes()) {
-                Some(encoding) => encoding,
-                None => UTF_8,
-            };
-    }
-    &DEFAULT_ENCODING
-}
-
-/// Determines a system file's character encoding from its encoding name
-/// and/or its numeric character code.
-///
-/// `encoding`, when present, takes precedence over `character_code`.
-///
-/// # Errors
-///
-/// * [`Error::Ebcdic`] if only a character code is given and it is 1.
-/// * [`Error::NoEncoding`] if neither argument is given, or the character
-///   code is 2 or 3.
-/// * [`Error::UnknownCodepage`] if the character code is not in the code
-///   page table.
-/// * [`Error::UnknownEncoding`] if the resulting label is not recognized.
-pub fn get_encoding(
-    encoding: Option<&str>,
-    character_code: Option<i32>,
-) -> Result<&'static Encoding, Error> {
-    let label = match (encoding, character_code) {
-        (Some(name), _) => name,
-        (None, Some(1)) => return Err(Error::Ebcdic),
-        // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
-        // respectively.  However, many files have character code 2 but data
-        // which are clearly not ASCII.  Therefore, ignore these values.
-        (None, Some(2 | 3)) => return Err(Error::NoEncoding),
-        (None, Some(4)) => "MS_KANJI",
-        (None, Some(codepage)) => match CODEPAGE_NUMBER_TO_NAME.get(&codepage) {
-            Some(name) => *name,
-            None => return Err(Error::UnknownCodepage(codepage)),
-        },
-        (None, None) => return Err(Error::NoEncoding),
-    };
-
-    match Encoding::for_label(label.as_bytes()) {
-        Some(found) => Ok(found),
-        None => Err(Error::UnknownEncoding(label.into())),
-    }
-}
-
-/*
-#[cfg(test)]
-mod tests {
- use std::thread::spawn;
-
- use encoding_rs::{EUC_JP, UTF_8, WINDOWS_1252};
-
- #[test]
- fn round_trip() {
- let mut threads = Vec::new();
- for thread in 0..128 {
- let start: u32 = thread << 25;
- let end = start + ((1 << 25) - 1);
- threads.push(spawn(move || {
- for i in start..=end {
- let s = i.to_le_bytes();
- let (utf8, replacement) = EUC_JP.decode_without_bom_handling(&s);
- if !replacement {
- let s2 = UTF_8.encode(&utf8).0;
- assert_eq!(s.as_slice(), &*s2);
- }
- }
- }));
- }
- for thread in threads {
- thread.join().unwrap();
- }
- }
-}
-*/
use crate::{
dictionary::{Value, VarWidth},
- raw::{self, RawString, VarType},
+ sys::raw::{self, RawString, VarType},
};
mod display;
pub fn default_value(&self) -> Value {
match self.var_type() {
VarType::Numeric => Value::sysmis(),
- VarType::String => Value::String(RawString::default())
+ VarType::String => Value::String(RawString::default()),
}
}
}
dictionary::Value,
endian::{Endian, Parse},
format::{DateTemplate, Decimals, Settings, TemplateItem, Type},
- raw::{EncodedStr, EncodedString},
settings::{EndianSettings, Settings as PsppSettings},
+ sys::raw::{EncodedStr, EncodedString},
};
use encoding_rs::Encoding;
use smallstr::SmallString;
parse::{ParseError, ParseErrorKind, Sign},
Epoch, Format, Settings as FormatSettings, Type,
},
- raw::EncodedStr,
settings::EndianSettings,
+ sys::raw::EncodedStr,
};
fn test(name: &str, type_: Type) {
pub mod calendar;
pub mod command;
-pub mod cooked;
pub mod dictionary;
-pub mod encoding;
pub mod endian;
pub mod engine;
pub mod format;
pub mod message;
pub mod output;
pub mod prompt;
-pub mod raw;
-pub mod sack;
pub mod settings;
+pub mod sys;
use anyhow::Result;
use clap::{Parser, ValueEnum};
use encoding_rs::Encoding;
-use pspp::cooked::{decode, Headers};
-use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record};
+use pspp::sys::cooked::{decode, Headers};
+use pspp::sys::raw::{encoding_from_headers, Decoder, Magic, Reader, Record};
use std::fs::File;
use std::io::BufReader;
use std::path::{Path, PathBuf};
use crate::{
dictionary::Value as DataValue,
format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat},
- raw::VarType,
settings::{Settings, Show},
+ sys::raw::VarType,
};
pub mod output;
+++ /dev/null
-use crate::{
- dictionary::{Attributes, Value, VarWidth},
- encoding::{default_encoding, get_encoding, Error as EncodingError},
- endian::{Endian, Parse, ToBytes},
- identifier::{Error as IdError, Identifier},
-};
-
-use encoding_rs::{mem::decode_latin1, Encoding};
-use flate2::read::ZlibDecoder;
-use num::Integer;
-use std::{
- borrow::Cow,
- cell::RefCell,
- collections::{HashMap, VecDeque},
- fmt::{Debug, Display, Formatter, Result as FmtResult},
- io::{Error as IoError, Read, Seek, SeekFrom},
- mem::take,
- num::NonZeroU8,
- ops::Range,
- rc::Rc,
- str::from_utf8,
-};
-use thiserror::Error as ThisError;
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("Not an SPSS system file")]
- NotASystemFile,
-
- #[error("Invalid magic number {0:?}")]
- BadMagic([u8; 4]),
-
- #[error("I/O error ({0})")]
- Io(#[from] IoError),
-
- #[error("Invalid SAV compression code {0}")]
- InvalidSavCompression(u32),
-
- #[error("Invalid ZSAV compression code {0}")]
- InvalidZsavCompression(u32),
-
- #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
- BadDocumentLength { offset: u64, n: usize, max: usize },
-
- #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
- BadRecordType { offset: u64, rec_type: u32 },
-
- #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
- BadVariableWidth { start_offset: u64, width: i32 },
-
- #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
- BadVariableLabelCode {
- start_offset: u64,
- code_offset: u64,
- code: u32,
- },
-
- #[error("At offset {offset:#x}, missing value code ({code}) is not -3, -2, 0, 1, 2, or 3.")]
- BadMissingValueCode { offset: u64, code: i32 },
-
- #[error(
- "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
- )]
- BadNumericMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
- BadStringMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
- BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
- ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
-
- #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
- TooManyVarIndexes { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
- ExtensionRecordTooLarge {
- offset: u64,
- subtype: u32,
- size: u32,
- count: u32,
- },
-
- #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
- EofInCase {
- offset: u64,
- case_ofs: u64,
- case_len: usize,
- },
-
- #[error(
- "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
- )]
- EofInCompressedCase { offset: u64, case_ofs: u64 },
-
- #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
- PartialCompressedCase { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
- CompressedNumberExpected { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
- CompressedStringExpected { offset: u64, case_ofs: u64 },
-
- #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
- BadZlibTrailerNBlocks {
- offset: u64,
- n_blocks: u32,
- expected_n_blocks: u64,
- ztrailer_len: u64,
- },
-
- #[error("{0}")]
- EncodingError(EncodingError),
-}
-
-#[derive(ThisError, Debug)]
-pub enum Warning {
- #[error("Unexpected end of data inside extension record.")]
- UnexpectedEndOfData,
-
- #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
- NoVarIndexes { offset: u64 },
-
- #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
- MixedVarTypes {
- offset: u64,
- var_type: VarType,
- wrong_types: Vec<u32>,
- },
-
- #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}] or referred to string continuations: {invalid:?}")]
- InvalidVarIndexes {
- offset: u64,
- max: usize,
- invalid: Vec<u32>,
- },
-
- #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
- BadRecordSize {
- offset: u64,
- record: String,
- size: u32,
- expected_size: u32,
- },
-
- #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
- BadRecordCount {
- offset: u64,
- record: String,
- count: u32,
- expected_count: u32,
- },
-
- #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
- BadLongMissingValueLength {
- record_offset: u64,
- offset: u64,
- value_len: u32,
- },
-
- #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
- BadEncodingName { offset: u64 },
-
- // XXX This is risky because `text` might be arbitarily long.
- #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
- MalformedString { encoding: String, text: String },
-
- #[error("Invalid variable measurement level value {0}")]
- InvalidMeasurement(u32),
-
- #[error("Invalid variable display alignment value {0}")]
- InvalidAlignment(u32),
-
- #[error("Invalid attribute name. {0}")]
- InvalidAttributeName(IdError),
-
- #[error("Invalid variable name in attribute record. {0}")]
- InvalidAttributeVariableName(IdError),
-
- #[error("Invalid short name in long variable name record. {0}")]
- InvalidShortName(IdError),
-
- #[error("Invalid name in long variable name record. {0}")]
- InvalidLongName(IdError),
-
- #[error("Invalid variable name in very long string record. {0}")]
- InvalidLongStringName(IdError),
-
- #[error("Invalid variable name in variable set record. {0}")]
- InvalidVariableSetName(IdError),
-
- #[error("Invalid multiple response set name. {0}")]
- InvalidMrSetName(IdError),
-
- #[error("Invalid multiple response set variable name. {0}")]
- InvalidMrSetVariableName(IdError),
-
- #[error("Invalid variable name in long string missing values record. {0}")]
- InvalidLongStringMissingValueVariableName(IdError),
-
- #[error("Invalid variable name in long string value label record. {0}")]
- InvalidLongStringValueLabelName(IdError),
-
- #[error("{0}")]
- EncodingError(EncodingError),
-
- #[error("Details TBD")]
- TBD,
-}
-
-impl From<IoError> for Warning {
- fn from(_source: IoError) -> Self {
- Self::UnexpectedEndOfData
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum Record {
- Header(HeaderRecord<RawString>),
- Variable(VariableRecord<RawString>),
- ValueLabel(ValueLabelRecord<RawStrArray<8>, RawString>),
- Document(DocumentRecord<RawDocumentLine>),
- IntegerInfo(IntegerInfoRecord),
- FloatInfo(FloatInfoRecord),
- VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord<RawString, RawString>),
- LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
- LongStringMissingValues(LongStringMissingValueRecord<RawString>),
- Encoding(EncodingRecord),
- NumberOfCases(NumberOfCasesRecord),
- Text(TextRecord),
- OtherExtension(Extension),
- EndOfHeaders(u32),
- ZHeader(ZHeader),
- ZTrailer(ZTrailer),
- Cases(Rc<RefCell<Cases>>),
-}
-
-#[derive(Clone, Debug)]
-pub enum DecodedRecord {
- Header(HeaderRecord<String>),
- Variable(VariableRecord<String>),
- ValueLabel(ValueLabelRecord<RawStrArray<8>, String>),
- Document(DocumentRecord<String>),
- IntegerInfo(IntegerInfoRecord),
- FloatInfo(FloatInfoRecord),
- VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord<Identifier, String>),
- LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
- LongStringMissingValues(LongStringMissingValueRecord<Identifier>),
- Encoding(EncodingRecord),
- NumberOfCases(NumberOfCasesRecord),
- VariableSets(VariableSetRecord),
- ProductInfo(ProductInfoRecord),
- LongNames(LongNamesRecord),
- VeryLongStrings(VeryLongStringsRecord),
- FileAttributes(FileAttributeRecord),
- VariableAttributes(VariableAttributeRecord),
- OtherExtension(Extension),
- EndOfHeaders(u32),
- ZHeader(ZHeader),
- ZTrailer(ZTrailer),
- Cases(Rc<RefCell<Cases>>),
-}
-
-impl Record {
- fn read<R>(
- reader: &mut R,
- endian: Endian,
- var_types: &VarTypes,
- warn: &dyn Fn(Warning),
- ) -> Result<Option<Record>, Error>
- where
- R: Read + Seek,
- {
- let rec_type: u32 = endian.parse(read_bytes(reader)?);
- match rec_type {
- 2 => Ok(Some(VariableRecord::read(reader, endian, warn)?)),
- 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
- 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
- 7 => Extension::read(reader, endian, var_types.n_values(), warn),
- 999 => Ok(Some(Record::EndOfHeaders(
- endian.parse(read_bytes(reader)?),
- ))),
- _ => Err(Error::BadRecordType {
- offset: reader.stream_position()?,
- rec_type,
- }),
- }
- }
-
- pub fn decode(self, decoder: &Decoder) -> Result<DecodedRecord, Error> {
- Ok(match self {
- Record::Header(record) => record.decode(decoder),
- Record::Variable(record) => record.decode(decoder),
- Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
- Record::Document(record) => record.decode(decoder),
- Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()),
- Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()),
- Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()),
- Record::MultipleResponse(record) => record.decode(decoder),
- Record::LongStringValueLabels(record) => {
- DecodedRecord::LongStringValueLabels(record.decode(decoder))
- }
- Record::LongStringMissingValues(record) => {
- DecodedRecord::LongStringMissingValues(record.decode(decoder))
- }
- Record::Encoding(record) => DecodedRecord::Encoding(record.clone()),
- Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()),
- Record::Text(record) => record.decode(decoder),
- Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()),
- Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
- Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()),
- Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()),
- Record::Cases(record) => DecodedRecord::Cases(record.clone()),
- })
- }
-}
-
-pub fn encoding_from_headers(
- headers: &Vec<Record>,
- warn: &impl Fn(Warning),
-) -> Result<&'static Encoding, Error> {
- let mut encoding_record = None;
- let mut integer_info_record = None;
- for record in headers {
- match record {
- Record::Encoding(record) => encoding_record = Some(record),
- Record::IntegerInfo(record) => integer_info_record = Some(record),
- _ => (),
- }
- }
- let encoding = encoding_record.map(|record| record.0.as_str());
- let character_code = integer_info_record.map(|record| record.character_code);
- match get_encoding(encoding, character_code) {
- Ok(encoding) => Ok(encoding),
- Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
- Err(err) => {
- warn(Warning::EncodingError(err));
- // Warn that we're using the default encoding.
- Ok(default_encoding())
- }
- }
-}
-
-// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
-// decoded as Latin-1 (actually bytes interpreted as Unicode code points).
-fn default_decode(s: &[u8]) -> Cow<str> {
- from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Compression {
- Simple,
- ZLib,
-}
-
-#[derive(Clone)]
-pub struct HeaderRecord<S>
-where
- S: Debug,
-{
- /// Offset in file.
- pub offsets: Range<u64>,
-
- /// Magic number.
- pub magic: Magic,
-
- /// Eye-catcher string, product name, in the file's encoding. Padded
- /// on the right with spaces.
- pub eye_catcher: S,
-
- /// Layout code, normally either 2 or 3.
- pub layout_code: u32,
-
- /// Number of variable positions, or `None` if the value in the file is
- /// questionably trustworthy.
- pub nominal_case_size: Option<u32>,
-
- /// Compression type, if any,
- pub compression: Option<Compression>,
-
- /// 1-based variable index of the weight variable, or `None` if the file is
- /// unweighted.
- pub weight_index: Option<u32>,
-
- /// Claimed number of cases, if known.
- pub n_cases: Option<u32>,
-
- /// Compression bias, usually 100.0.
- pub bias: f64,
-
- /// `dd mmm yy` in the file's encoding.
- pub creation_date: S,
-
- /// `HH:MM:SS` in the file's encoding.
- pub creation_time: S,
-
- /// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: S,
-
- /// Endianness of the data in the file header.
- pub endian: Endian,
-}
-
-impl<S> HeaderRecord<S>
-where
- S: Debug,
-{
- fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
- where
- T: Debug,
- {
- writeln!(f, "{name:>17}: {:?}", value)
- }
-}
-
-impl<S> Debug for HeaderRecord<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "File header record:")?;
- self.debug_field(f, "Magic", self.magic)?;
- self.debug_field(f, "Product name", &self.eye_catcher)?;
- self.debug_field(f, "Layout code", self.layout_code)?;
- self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
- self.debug_field(f, "Compression", self.compression)?;
- self.debug_field(f, "Weight index", self.weight_index)?;
- self.debug_field(f, "Number of cases", self.n_cases)?;
- self.debug_field(f, "Compression bias", self.bias)?;
- self.debug_field(f, "Creation date", &self.creation_date)?;
- self.debug_field(f, "Creation time", &self.creation_time)?;
- self.debug_field(f, "File label", &self.file_label)?;
- self.debug_field(f, "Endianness", self.endian)
- }
-}
-
-impl HeaderRecord<RawString> {
- fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
- let start = r.stream_position()?;
-
- let magic: [u8; 4] = read_bytes(r)?;
- let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
-
- let eye_catcher = RawString(read_vec(r, 60)?);
- let layout_code: [u8; 4] = read_bytes(r)?;
- let endian = Endian::identify_u32(2, layout_code)
- .or_else(|| Endian::identify_u32(2, layout_code))
- .ok_or(Error::NotASystemFile)?;
- let layout_code = endian.parse(layout_code);
-
- let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
- let nominal_case_size =
- (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
-
- let compression_code: u32 = endian.parse(read_bytes(r)?);
- let compression = match (magic, compression_code) {
- (Magic::Zsav, 2) => Some(Compression::ZLib),
- (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
- (_, 0) => None,
- (_, 1) => Some(Compression::Simple),
- (_, code) => return Err(Error::InvalidSavCompression(code)),
- };
-
- let weight_index: u32 = endian.parse(read_bytes(r)?);
- let weight_index = (weight_index > 0).then_some(weight_index);
-
- let n_cases: u32 = endian.parse(read_bytes(r)?);
- let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
-
- let bias: f64 = endian.parse(read_bytes(r)?);
-
- let creation_date = RawString(read_vec(r, 9)?);
- let creation_time = RawString(read_vec(r, 8)?);
- let file_label = RawString(read_vec(r, 64)?);
- let _: [u8; 3] = read_bytes(r)?;
-
- Ok(HeaderRecord {
- offsets: start..r.stream_position()?,
- magic,
- layout_code,
- nominal_case_size,
- compression,
- weight_index,
- n_cases,
- bias,
- creation_date,
- creation_time,
- eye_catcher,
- file_label,
- endian,
- })
- }
-
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
- let file_label = decoder.decode(&self.file_label).to_string();
- let creation_date = decoder.decode(&self.creation_date).to_string();
- let creation_time = decoder.decode(&self.creation_time).to_string();
- DecodedRecord::Header(HeaderRecord {
- eye_catcher,
- weight_index: self.weight_index,
- n_cases: self.n_cases,
- file_label,
- offsets: self.offsets.clone(),
- magic: self.magic,
- layout_code: self.layout_code,
- nominal_case_size: self.nominal_case_size,
- compression: self.compression,
- bias: self.bias,
- creation_date,
- creation_time,
- endian: self.endian,
- })
- }
-}
-
-pub struct Decoder {
- pub encoding: &'static Encoding,
- pub warn: Box<dyn Fn(Warning)>,
-}
-
-impl Decoder {
- pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
- where
- F: Fn(Warning) + 'static,
- {
- Self {
- encoding,
- warn: Box::new(warn),
- }
- }
- fn warn(&self, warning: Warning) {
- (self.warn)(warning)
- }
- fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
- let (output, malformed) = self.encoding.decode_without_bom_handling(input);
- if malformed {
- self.warn(Warning::MalformedString {
- encoding: self.encoding.name().into(),
- text: output.clone().into(),
- });
- }
- output
- }
-
- fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
- self.decode_slice(input.0.as_slice())
- }
-
- pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
- self.new_identifier(&self.decode(input))
- }
-
- pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
- Identifier::from_encoding(name, self.encoding)
- }
-}
-
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub enum Magic {
- /// Regular system file.
- Sav,
-
- /// System file with Zlib-compressed data.
- Zsav,
-
- /// EBCDIC-encoded system file.
- Ebcdic,
-}
-
-impl Magic {
- /// Magic number for a regular system file.
- pub const SAV: [u8; 4] = *b"$FL2";
-
- /// Magic number for a system file that contains zlib-compressed data.
- pub const ZSAV: [u8; 4] = *b"$FL3";
-
- /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
- /// in EBCDIC.
- pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
-}
-
-impl Debug for Magic {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let s = match *self {
- Magic::Sav => "$FL2",
- Magic::Zsav => "$FL3",
- Magic::Ebcdic => "($FL2 in EBCDIC)",
- };
- write!(f, "{s}")
- }
-}
-
-impl TryFrom<[u8; 4]> for Magic {
- type Error = Error;
-
- fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
- match value {
- Magic::SAV => Ok(Magic::Sav),
- Magic::ZSAV => Ok(Magic::Zsav),
- Magic::EBCDIC => Ok(Magic::Ebcdic),
- _ => Err(Error::BadMagic(value)),
- }
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum VarType {
- Numeric,
- String,
-}
-
-impl VarType {
- pub fn opposite(self) -> VarType {
- match self {
- Self::Numeric => Self::String,
- Self::String => Self::Numeric,
- }
- }
-}
-
-impl Display for VarType {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match self {
- VarType::Numeric => write!(f, "numeric"),
- VarType::String => write!(f, "string"),
- }
- }
-}
-
-impl TryFrom<RawWidth> for VarType {
- type Error = ();
-
- fn try_from(value: RawWidth) -> Result<Self, Self::Error> {
- match value {
- RawWidth::Continuation => Err(()),
- RawWidth::Numeric => Ok(VarType::Numeric),
- RawWidth::String(_) => Ok(VarType::String),
- }
- }
-}
-
-impl TryFrom<RawWidth> for VarWidth {
- type Error = ();
-
- fn try_from(value: RawWidth) -> Result<Self, Self::Error> {
- match value {
- RawWidth::Continuation => Err(()),
- RawWidth::Numeric => Ok(Self::Numeric),
- RawWidth::String(width) => Ok(Self::String(width.get() as u16)),
- }
- }
-}
-
-type RawValue = Value<RawStrArray<8>>;
-
-impl RawValue {
- pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
- match var_type {
- VarType::String => Value::String(RawStrArray(raw.0)),
- VarType::Numeric => Value::Number(endian.parse(raw.0)),
- }
- }
-
- fn read_case<R: Read + Seek>(
- reader: &mut R,
- var_types: &VarTypes,
- endian: Endian,
- ) -> Result<Option<Vec<Self>>, Error> {
- let case_start = reader.stream_position()?;
- let mut values = Vec::with_capacity(var_types.n_values());
- for (i, var_type) in var_types.iter().enumerate() {
- let Some(raw) = try_read_bytes(reader)? else {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = reader.stream_position()?;
- return Err(Error::EofInCase {
- offset,
- case_ofs: offset - case_start,
- case_len: var_types.n_values() * 8,
- });
- }
- };
- values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
- }
- Ok(Some(values))
- }
-
- fn read_compressed_case<R: Read + Seek>(
- reader: &mut R,
- var_types: &VarTypes,
- codes: &mut VecDeque<u8>,
- endian: Endian,
- bias: f64,
- ) -> Result<Option<Vec<Self>>, Error> {
- let case_start = reader.stream_position()?;
- let mut values = Vec::with_capacity(var_types.n_values());
- for (i, var_type) in var_types.iter().enumerate() {
- let value = loop {
- let Some(code) = codes.pop_front() else {
- let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = reader.stream_position()?;
- return Err(Error::EofInCompressedCase {
- offset,
- case_ofs: offset - case_start,
- });
- }
- };
- codes.extend(new_codes.into_iter());
- continue;
- };
- match code {
- 0 => (),
- 1..=251 => match var_type {
- VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
- VarType::String => {
- break Self::String(RawStrArray(endian.to_bytes(code as f64 - bias)))
- }
- },
- 252 => {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = reader.stream_position()?;
- return Err(Error::PartialCompressedCase {
- offset,
- case_ofs: offset - case_start,
- });
- }
- }
- 253 => {
- break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
- }
- 254 => match var_type {
- VarType::String => break Self::String(RawStrArray(*b" ")), // XXX EBCDIC
- VarType::Numeric => {
- return Err(Error::CompressedStringExpected {
- offset: case_start,
- case_ofs: reader.stream_position()? - case_start,
- })
- }
- },
- 255 => match var_type {
- VarType::Numeric => break Self::Number(None),
- VarType::String => {
- return Err(Error::CompressedNumberExpected {
- offset: case_start,
- case_ofs: reader.stream_position()? - case_start,
- })
- }
- },
- }
- };
- values.push(value);
- }
- Ok(Some(values))
- }
-
- pub fn decode(&self, width: VarWidth) -> Value {
- match self {
- Self::Number(x) => Value::Number(*x),
- Self::String(s) => {
- let width = width.as_string_width().unwrap();
- Value::String(RawString::from(&s.0[..width]))
- }
- }
- }
-}
-
-struct ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- reader: Option<ZlibDecoder<R>>,
-}
-
-impl<R> ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn new(reader: R) -> ZlibDecodeMultiple<R> {
- ZlibDecodeMultiple {
- reader: Some(ZlibDecoder::new(reader)),
- }
- }
-}
-
-impl<R> Read for ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
- loop {
- match self.reader.as_mut().unwrap().read(buf)? {
- 0 => {
- let inner = self.reader.take().unwrap().into_inner();
- self.reader = Some(ZlibDecoder::new(inner));
- }
- n => return Ok(n),
- };
- }
- }
-}
-
-impl<R> Seek for ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
- self.reader.as_mut().unwrap().get_mut().seek(pos)
- }
-}
-
-enum ReaderState {
- Start,
- Headers,
- ZlibHeader,
- ZlibTrailer {
- ztrailer_offset: u64,
- ztrailer_len: u64,
- },
- Cases,
- End,
-}
-
-pub struct Reader<R>
-where
- R: Read + Seek + 'static,
-{
- reader: Option<R>,
- warn: Box<dyn Fn(Warning)>,
-
- header: HeaderRecord<RawString>,
- var_types: VarTypes,
-
- state: ReaderState,
-}
-
-impl<R> Reader<R>
-where
- R: Read + Seek + 'static,
-{
- pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
- where
- F: Fn(Warning) + 'static,
- {
- let header = HeaderRecord::read(&mut reader)?;
- Ok(Self {
- reader: Some(reader),
- warn: Box::new(warn),
- header,
- var_types: VarTypes::new(),
- state: ReaderState::Start,
- })
- }
- fn cases(&mut self) -> Cases {
- self.state = ReaderState::End;
- Cases::new(
- self.reader.take().unwrap(),
- take(&mut self.var_types),
- &self.header,
- )
- }
- fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
- match self.state {
- ReaderState::Start => {
- self.state = ReaderState::Headers;
- Some(Ok(Record::Header(self.header.clone())))
- }
- ReaderState::Headers => {
- let record = loop {
- match Record::read(
- self.reader.as_mut().unwrap(),
- self.header.endian,
- &self.var_types,
- &self.warn,
- ) {
- Ok(Some(record)) => break record,
- Ok(None) => (),
- Err(error) => return Some(Err(error)),
- }
- };
- match record {
- Record::Variable(VariableRecord { width, .. }) => self.var_types.push(width),
- Record::EndOfHeaders(_) => {
- self.state = if let Some(Compression::ZLib) = self.header.compression {
- ReaderState::ZlibHeader
- } else {
- ReaderState::Cases
- };
- }
- _ => (),
- };
- Some(Ok(record))
- }
- ReaderState::ZlibHeader => {
- let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
- {
- Ok(zheader) => zheader,
- Err(error) => return Some(Err(error)),
- };
- self.state = ReaderState::ZlibTrailer {
- ztrailer_offset: zheader.ztrailer_offset,
- ztrailer_len: zheader.ztrailer_len,
- };
- Some(Ok(Record::ZHeader(zheader)))
- }
- ReaderState::ZlibTrailer {
- ztrailer_offset,
- ztrailer_len,
- } => {
- match ZTrailer::read(
- self.reader.as_mut().unwrap(),
- self.header.endian,
- ztrailer_offset,
- ztrailer_len,
- ) {
- Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
- Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
- Err(error) => Some(Err(error)),
- }
- }
- ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
- ReaderState::End => None,
- }
- }
-}
-
-impl<R> Iterator for Reader<R>
-where
- R: Read + Seek + 'static,
-{
- type Item = Result<Record, Error>;
-
- fn next(&mut self) -> Option<Self::Item> {
- let retval = self._next();
- if matches!(retval, Some(Err(_))) {
- self.state = ReaderState::End;
- }
- retval
- }
-}
-
-trait ReadSeek: Read + Seek {}
-impl<T> ReadSeek for T where T: Read + Seek {}
-
-pub struct Cases {
- reader: Box<dyn ReadSeek>,
- var_types: VarTypes,
- compression: Option<Compression>,
- bias: f64,
- endian: Endian,
- codes: VecDeque<u8>,
- eof: bool,
-}
-
-impl Debug for Cases {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "Cases")
- }
-}
-
-impl Cases {
- fn new<R>(reader: R, var_types: VarTypes, header: &HeaderRecord<RawString>) -> Self
- where
- R: Read + Seek + 'static,
- {
- Self {
- reader: if header.compression == Some(Compression::ZLib) {
- Box::new(ZlibDecodeMultiple::new(reader))
- } else {
- Box::new(reader)
- },
- var_types,
- compression: header.compression,
- bias: header.bias,
- endian: header.endian,
- codes: VecDeque::with_capacity(8),
- eof: false,
- }
- }
-}
-
-impl Iterator for Cases {
- type Item = Result<Vec<RawValue>, Error>;
-
- fn next(&mut self) -> Option<Self::Item> {
- if self.eof {
- return None;
- }
-
- let retval = if self.compression.is_some() {
- Value::read_compressed_case(
- &mut self.reader,
- &self.var_types,
- &mut self.codes,
- self.endian,
- self.bias,
- )
- .transpose()
- } else {
- Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
- };
- self.eof = matches!(retval, None | Some(Err(_)));
- retval
- }
-}
-
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub struct Spec(pub u32);
-
-impl Debug for Spec {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let type_ = format_name(self.0 >> 16);
- let w = (self.0 >> 8) & 0xff;
- let d = self.0 & 0xff;
- write!(f, "{:06x} ({type_}{w}.{d})", self.0)
- }
-}
-
-fn format_name(type_: u32) -> Cow<'static, str> {
- match type_ {
- 1 => "A",
- 2 => "AHEX",
- 3 => "COMMA",
- 4 => "DOLLAR",
- 5 => "F",
- 6 => "IB",
- 7 => "PIBHEX",
- 8 => "P",
- 9 => "PIB",
- 10 => "PK",
- 11 => "RB",
- 12 => "RBHEX",
- 15 => "Z",
- 16 => "N",
- 17 => "E",
- 20 => "DATE",
- 21 => "TIME",
- 22 => "DATETIME",
- 23 => "ADATE",
- 24 => "JDATE",
- 25 => "DTIME",
- 26 => "WKDAY",
- 27 => "MONTH",
- 28 => "MOYR",
- 29 => "QYR",
- 30 => "WKYR",
- 31 => "PCT",
- 32 => "DOT",
- 33 => "CCA",
- 34 => "CCB",
- 35 => "CCC",
- 36 => "CCD",
- 37 => "CCE",
- 38 => "EDATE",
- 39 => "SDATE",
- 40 => "MTIME",
- 41 => "YMDHMS",
- _ => return format!("<unknown format {type_}>").into(),
- }
- .into()
-}
-
-#[derive(Clone)]
-pub struct MissingValues<S = Box<[u8]>>
-where
- S: Debug,
-{
- /// Individual missing values, up to 3 of them.
- pub values: Vec<Value<S>>,
-
- /// Optional range of missing values.
- pub range: Option<(Value<S>, Value<S>)>,
-}
-
-impl<S> Debug for MissingValues<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- for (i, value) in self.values.iter().enumerate() {
- if i > 0 {
- write!(f, ", ")?;
- }
- write!(f, "{value:?}")?;
- }
-
- if let Some((low, high)) = &self.range {
- if !self.values.is_empty() {
- write!(f, ", ")?;
- }
- write!(f, "{low:?} THRU {high:?}")?;
- }
-
- if self.is_empty() {
- write!(f, "none")?;
- }
-
- Ok(())
- }
-}
-
-impl<S> MissingValues<S>
-where
- S: Debug,
-{
- fn is_empty(&self) -> bool {
- self.values.is_empty() && self.range.is_none()
- }
-}
-
-impl<S> Default for MissingValues<S>
-where
- S: Debug,
-{
- fn default() -> Self {
- Self {
- values: Vec::new(),
- range: None,
- }
- }
-}
-
-impl MissingValues {
- fn read<R: Read + Seek>(
- r: &mut R,
- offset: u64,
- width: RawWidth,
- code: i32,
- endian: Endian,
- warn: &dyn Fn(Warning),
- ) -> Result<Self, Error> {
- let (individual_values, has_range) = match code {
- 0..=3 => (code as usize, false),
- -2 => (0, true),
- -3 => (1, true),
- _ => return Err(Error::BadMissingValueCode { offset, code }),
- };
-
- let mut values = Vec::with_capacity(individual_values);
- for _ in 0..individual_values {
- values.push(read_bytes::<8, _>(r)?);
- }
- let range = if has_range {
- let low = read_bytes::<8, _>(r)?;
- let high = read_bytes::<8, _>(r)?;
- Some((low, high))
- } else {
- None
- };
-
- match VarWidth::try_from(width) {
- Ok(VarWidth::Numeric) => {
- let values = values
- .into_iter()
- .map(|v| Value::Number(endian.parse(v)))
- .collect();
- let range = range.map(|(low, high)| {
- (
- Value::Number(endian.parse(low)),
- Value::Number(endian.parse(high)),
- )
- });
- return Ok(Self { values, range });
- }
- Ok(VarWidth::String(width)) if width <= 8 && range.is_none() => {
- let values = values
- .into_iter()
- .map(|value| Value::String(Box::from(&value[..width as usize])))
- .collect();
- return Ok(Self {
- values,
- range: None,
- });
- }
- Ok(VarWidth::String(width)) if width > 8 => warn(Warning::TBD),
- Ok(VarWidth::String(_)) => warn(Warning::TBD),
- Err(()) => warn(Warning::TBD),
- }
- Ok(Self::default())
- }
-}
-
-#[derive(Clone)]
-pub struct VariableRecord<S>
-where
- S: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// Variable width, in the range -1..=255.
- pub width: RawWidth,
-
- /// Variable name, padded on the right with spaces.
- pub name: S,
-
- /// Print format.
- pub print_format: Spec,
-
- /// Write format.
- pub write_format: Spec,
-
- /// Missing values.
- pub missing_values: MissingValues,
-
- /// Optional variable label.
- pub label: Option<S>,
-}
-
-#[derive(Copy, Clone, PartialEq, Eq)]
-pub enum RawWidth {
- Continuation,
- Numeric,
- String(NonZeroU8),
-}
-
-impl RawWidth {
- pub fn n_values(&self) -> Option<usize> {
- match self {
- RawWidth::Numeric => Some(1),
- RawWidth::String(width) => Some((width.get() as usize).div_ceil(8)),
- _ => None,
- }
- }
-}
-
-impl TryFrom<i32> for RawWidth {
- type Error = ();
-
- fn try_from(value: i32) -> Result<Self, Self::Error> {
- match value {
- -1 => Ok(Self::Continuation),
- 0 => Ok(Self::Numeric),
- 1..=255 => Ok(Self::String(NonZeroU8::new(value as u8).unwrap())),
- _ => Err(()),
- }
- }
-}
-
-impl Display for RawWidth {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- match self {
- RawWidth::Continuation => write!(f, "long string continuation"),
- RawWidth::Numeric => write!(f, "numeric"),
- RawWidth::String(width) => write!(f, "{width}-byte string"),
- }
- }
-}
-
-impl<S> Debug for VariableRecord<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "Width: {}", self.width,)?;
- writeln!(f, "Print format: {:?}", self.print_format)?;
- writeln!(f, "Write format: {:?}", self.write_format)?;
- writeln!(f, "Name: {:?}", &self.name)?;
- writeln!(f, "Variable label: {:?}", self.label)?;
- writeln!(f, "Missing values: {:?}", self.missing_values)
- }
-}
-
-impl VariableRecord<RawString> {
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- warn: &dyn Fn(Warning),
- ) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let width: i32 = endian.parse(read_bytes(r)?);
- let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth {
- start_offset,
- width,
- })?;
- let code_offset = r.stream_position()?;
- let has_variable_label: u32 = endian.parse(read_bytes(r)?);
- let missing_value_code: i32 = endian.parse(read_bytes(r)?);
- let print_format = Spec(endian.parse(read_bytes(r)?));
- let write_format = Spec(endian.parse(read_bytes(r)?));
- let name = RawString(read_vec(r, 8)?);
-
- let label = match has_variable_label {
- 0 => None,
- 1 => {
- let len: u32 = endian.parse(read_bytes(r)?);
- let read_len = len.min(65535) as usize;
- let label = RawString(read_vec(r, read_len)?);
-
- let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
- let _ = read_vec(r, padding_bytes as usize)?;
-
- Some(label)
- }
- _ => {
- return Err(Error::BadVariableLabelCode {
- start_offset,
- code_offset,
- code: has_variable_label,
- })
- }
- };
-
- let missing_values =
- MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?;
-
- let end_offset = r.stream_position()?;
-
- Ok(Record::Variable(VariableRecord {
- offsets: start_offset..end_offset,
- width,
- name,
- print_format,
- write_format,
- missing_values,
- label,
- }))
- }
-
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- DecodedRecord::Variable(VariableRecord {
- offsets: self.offsets.clone(),
- width: self.width,
- name: decoder.decode(&self.name).to_string(),
- print_format: self.print_format,
- write_format: self.write_format,
- missing_values: self.missing_values,
- label: self
- .label
- .as_ref()
- .map(|label| decoder.decode(label).to_string()),
- })
- }
-}
-
-#[derive(Copy, Clone)]
-pub struct UntypedValue(pub [u8; 8]);
-
-impl Debug for UntypedValue {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let little: f64 = Endian::Little.parse(self.0);
- let little = format!("{:?}", little);
- let big: f64 = Endian::Big.parse(self.0);
- let big = format!("{:?}", big);
- let number = if little.len() <= big.len() {
- little
- } else {
- big
- };
- write!(f, "{number}")?;
-
- let string = default_decode(&self.0);
- let string = string
- .split(|c: char| c == '\0' || c.is_control())
- .next()
- .unwrap();
- write!(f, "{string:?}")?;
- Ok(())
- }
-}
-
-#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)]
-pub struct RawString(pub Vec<u8>);
-
-impl RawString {
- pub fn spaces(n: usize) -> Self {
- Self(std::iter::repeat_n(b' ', n).collect())
- }
- pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> {
- EncodedStr::new(&self.0, encoding)
- }
-}
-
-impl From<Cow<'_, [u8]>> for RawString {
- fn from(value: Cow<'_, [u8]>) -> Self {
- Self(value.into_owned())
- }
-}
-
-impl From<Vec<u8>> for RawString {
- fn from(source: Vec<u8>) -> Self {
- Self(source)
- }
-}
-
-impl From<&[u8]> for RawString {
- fn from(source: &[u8]) -> Self {
- Self(source.into())
- }
-}
-
-impl Debug for RawString {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(self.0.as_slice()))
- }
-}
-
-#[derive(Copy, Clone)]
-pub struct RawStrArray<const N: usize>(pub [u8; N]);
-
-impl<const N: usize> From<[u8; N]> for RawStrArray<N> {
- fn from(source: [u8; N]) -> Self {
- Self(source)
- }
-}
-
-impl<const N: usize> Debug for RawStrArray<N> {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(&self.0))
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum EncodedString {
- Encoded {
- bytes: Vec<u8>,
- encoding: &'static Encoding,
- },
- Utf8 {
- s: String,
- },
-}
-
-impl EncodedString {
- pub fn borrowed(&self) -> EncodedStr<'_> {
- match self {
- EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding },
- EncodedString::Utf8 { s } => EncodedStr::Utf8 { s },
- }
- }
-}
-
-impl<'a> From<EncodedStr<'a>> for EncodedString {
- fn from(value: EncodedStr<'a>) -> Self {
- match value {
- EncodedStr::Encoded { bytes, encoding } => Self::Encoded {
- bytes: bytes.into(),
- encoding,
- },
- EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() },
- }
- }
-}
-
-pub enum EncodedStr<'a> {
- Encoded {
- bytes: &'a [u8],
- encoding: &'static Encoding,
- },
- Utf8 {
- s: &'a str,
- },
-}
-
-impl<'a> EncodedStr<'a> {
- pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
- Self::Encoded { bytes, encoding }
- }
- pub fn as_str(&self) -> Cow<'_, str> {
- match self {
- EncodedStr::Encoded { bytes, encoding } => {
- encoding.decode_without_bom_handling(bytes).0
- }
- EncodedStr::Utf8 { s } => Cow::from(*s),
- }
- }
- pub fn as_bytes(&self) -> &[u8] {
- match self {
- EncodedStr::Encoded { bytes, .. } => bytes,
- EncodedStr::Utf8 { s } => s.as_bytes(),
- }
- }
- pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
- match self {
- EncodedStr::Encoded { bytes, encoding } => {
- let utf8 = encoding.decode_without_bom_handling(bytes).0;
- match encoding.encode(&utf8).0 {
- Cow::Borrowed(_) => {
- // Recoding into UTF-8 and then back did not change anything.
- Cow::from(*bytes)
- }
- Cow::Owned(owned) => Cow::Owned(owned),
- }
- }
- EncodedStr::Utf8 { s } => encoding.encode(s).0,
- }
- }
- pub fn is_empty(&self) -> bool {
- match self {
- EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
- EncodedStr::Utf8 { s } => s.is_empty(),
- }
- }
- pub fn quoted(&self) -> QuotedEncodedStr {
- QuotedEncodedStr(self)
- }
-}
-
-impl<'a> From<&'a str> for EncodedStr<'a> {
- fn from(s: &'a str) -> Self {
- Self::Utf8 { s }
- }
-}
-
-impl<'a> From<&'a String> for EncodedStr<'a> {
- fn from(s: &'a String) -> Self {
- Self::Utf8 { s: s.as_str() }
- }
-}
-
-pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>);
-
-impl Display for QuotedEncodedStr<'_> {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- write!(f, "{:?}", self.0.as_str())
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ValueLabel<V, S>
-where
- V: Debug,
- S: Debug,
-{
- pub value: Value<V>,
- pub label: S,
-}
-
-#[derive(Clone)]
-pub struct ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// The labels.
- pub labels: Vec<ValueLabel<V, S>>,
-
- /// The 1-based indexes of the variable indexes.
- pub dict_indexes: Vec<u32>,
-
- /// The types of the variables.
- pub var_type: VarType,
-}
-
-impl<V, S> Debug for ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "labels: ")?;
- for label in self.labels.iter() {
- writeln!(f, "{label:?}")?;
- }
- write!(f, "apply to {} variables", self.var_type)?;
- for dict_index in self.dict_indexes.iter() {
- write!(f, " #{dict_index}")?;
- }
- Ok(())
- }
-}
-
-impl<V, S> ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- /// Maximum number of value labels in a record.
- pub const MAX_LABELS: u32 = u32::MAX / 8;
-
- /// Maximum number of variable indexes in a record.
- pub const MAX_INDEXES: u32 = u32::MAX / 8;
-}
-
-impl ValueLabelRecord<RawStrArray<8>, RawString> {
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- var_types: &VarTypes,
- warn: &dyn Fn(Warning),
- ) -> Result<Option<Record>, Error> {
- let label_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_LABELS {
- return Err(Error::BadNumberOfValueLabels {
- offset: label_offset,
- n,
- max: Self::MAX_LABELS,
- });
- }
-
- let mut labels = Vec::new();
- for _ in 0..n {
- let value = UntypedValue(read_bytes(r)?);
- let label_len: u8 = endian.parse(read_bytes(r)?);
- let label_len = label_len as usize;
- let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
-
- let mut label = read_vec(r, padded_len - 1)?;
- label.truncate(label_len);
- labels.push((value, RawString(label)));
- }
-
- let index_offset = r.stream_position()?;
- let rec_type: u32 = endian.parse(read_bytes(r)?);
- if rec_type != 4 {
- return Err(Error::ExpectedVarIndexRecord {
- offset: index_offset,
- rec_type,
- });
- }
-
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_INDEXES {
- return Err(Error::TooManyVarIndexes {
- offset: index_offset,
- n,
- max: Self::MAX_INDEXES,
- });
- } else if n == 0 {
- warn(Warning::NoVarIndexes {
- offset: index_offset,
- });
- return Ok(None);
- }
-
- let index_offset = r.stream_position()?;
- let mut dict_indexes = Vec::with_capacity(n as usize);
- let mut invalid_indexes = Vec::new();
- for _ in 0..n {
- let index: u32 = endian.parse(read_bytes(r)?);
- if var_types.is_valid_index(index as usize) {
- dict_indexes.push(index);
- } else {
- invalid_indexes.push(index);
- }
- }
- if !invalid_indexes.is_empty() {
- warn(Warning::InvalidVarIndexes {
- offset: index_offset,
- max: var_types.n_values(),
- invalid: invalid_indexes,
- });
- }
-
- let Some(&first_index) = dict_indexes.first() else {
- return Ok(None);
- };
- let var_type = var_types.types[first_index as usize - 1].unwrap();
- let mut wrong_type_indexes = Vec::new();
- dict_indexes.retain(|&index| {
- if var_types.types[index as usize - 1] != Some(var_type) {
- wrong_type_indexes.push(index);
- false
- } else {
- true
- }
- });
- if !wrong_type_indexes.is_empty() {
- warn(Warning::MixedVarTypes {
- offset: index_offset,
- var_type,
- wrong_types: wrong_type_indexes,
- });
- }
-
- let labels = labels
- .into_iter()
- .map(|(value, label)| ValueLabel {
- value: Value::from_raw(&value, var_type, endian),
- label,
- })
- .collect();
-
- let end_offset = r.stream_position()?;
- Ok(Some(Record::ValueLabel(ValueLabelRecord {
- offsets: label_offset..end_offset,
- labels,
- dict_indexes,
- var_type,
- })))
- }
-
- fn decode(self, decoder: &Decoder) -> ValueLabelRecord<RawStrArray<8>, String> {
- let labels = self
- .labels
- .iter()
- .map(|ValueLabel { value, label }| ValueLabel {
- value: value.clone(),
- label: decoder.decode(label).to_string(),
- })
- .collect();
- ValueLabelRecord {
- offsets: self.offsets.clone(),
- labels,
- dict_indexes: self.dict_indexes.clone(),
- var_type: self.var_type,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct DocumentRecord<S>
-where
- S: Debug,
-{
- pub offsets: Range<u64>,
-
- /// The document, as an array of lines. Raw lines are exactly 80 bytes long
- /// and are right-padded with spaces without any new-line termination.
- pub lines: Vec<S>,
-}
-
-pub type RawDocumentLine = RawStrArray<DOC_LINE_LEN>;
-
-/// Length of a line in a document. Document lines are fixed-length and
-/// padded on the right with spaces.
-pub const DOC_LINE_LEN: usize = 80;
-
-impl DocumentRecord<RawDocumentLine> {
- /// Maximum number of lines we will accept in a document. This is simply
- /// the maximum number that will fit in a 32-bit space.
- pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
-
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- let n = n as usize;
- if n > Self::MAX_LINES {
- Err(Error::BadDocumentLength {
- offset: start_offset,
- n,
- max: Self::MAX_LINES,
- })
- } else {
- let mut lines = Vec::with_capacity(n);
- for _ in 0..n {
- lines.push(RawStrArray(read_bytes(r)?));
- }
- let end_offset = r.stream_position()?;
- Ok(Record::Document(DocumentRecord {
- offsets: start_offset..end_offset,
- lines,
- }))
- }
- }
-
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- DecodedRecord::Document(DocumentRecord {
- offsets: self.offsets.clone(),
- lines: self
- .lines
- .iter()
- .map(|s| decoder.decode_slice(&s.0).to_string())
- .collect(),
- })
- }
-}
-
-trait ExtensionRecord {
- const SUBTYPE: u32;
- const SIZE: Option<u32>;
- const COUNT: Option<u32>;
- const NAME: &'static str;
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning>;
-}
-
-#[derive(Clone, Debug)]
-pub struct IntegerInfoRecord {
- pub offsets: Range<u64>,
- pub version: (i32, i32, i32),
- pub machine_code: i32,
- pub floating_point_rep: i32,
- pub compression_code: i32,
- pub endianness: i32,
- pub character_code: i32,
-}
-
-impl ExtensionRecord for IntegerInfoRecord {
- const SUBTYPE: u32 = 3;
- const SIZE: Option<u32> = Some(4);
- const COUNT: Option<u32> = Some(8);
- const NAME: &'static str = "integer record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let data: Vec<i32> = (0..8)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::IntegerInfo(IntegerInfoRecord {
- offsets: ext.offsets.clone(),
- version: (data[0], data[1], data[2]),
- machine_code: data[3],
- floating_point_rep: data[4],
- compression_code: data[5],
- endianness: data[6],
- character_code: data[7],
- }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct FloatInfoRecord {
- pub sysmis: f64,
- pub highest: f64,
- pub lowest: f64,
-}
-
-impl ExtensionRecord for FloatInfoRecord {
- const SUBTYPE: u32 = 4;
- const SIZE: Option<u32> = Some(8);
- const COUNT: Option<u32> = Some(3);
- const NAME: &'static str = "floating point record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let data: Vec<f64> = (0..3)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::FloatInfo(FloatInfoRecord {
- sysmis: data[0],
- highest: data[1],
- lowest: data[2],
- }))
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum CategoryLabels {
- VarLabels,
- CountedValues,
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: RawString,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-impl MultipleResponseType {
- fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
- let (mr_type, input) = match input.split_first() {
- Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
- Some((b'D', input)) => {
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy {
- value,
- labels: CategoryLabels::VarLabels,
- },
- input,
- )
- }
- Some((b'E', input)) => {
- let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
- (CategoryLabels::CountedValues, rest)
- } else if let Some(rest) = input.strip_prefix(b" 11 ") {
- (CategoryLabels::VarLabels, rest)
- } else {
- return Err(Warning::TBD);
- };
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy { value, labels },
- input,
- )
- }
- _ => return Err(Warning::TBD),
- };
- Ok((mr_type, input))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet<I, S>
-where
- I: Debug,
- S: Debug,
-{
- pub name: I,
- pub label: S,
- pub mr_type: MultipleResponseType,
- pub short_names: Vec<I>,
-}
-
-impl MultipleResponseSet<RawString, RawString> {
- fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
- let Some(equals) = input.iter().position(|&b| b == b'=') else {
- return Err(Warning::TBD);
- };
- let (name, input) = input.split_at(equals);
- let (mr_type, input) = MultipleResponseType::parse(input)?;
- let Some(input) = input.strip_prefix(b" ") else {
- return Err(Warning::TBD);
- };
- let (label, mut input) = parse_counted_string(input)?;
- let mut vars = Vec::new();
- while input.first() != Some(&b'\n') {
- match input.split_first() {
- Some((b' ', rest)) => {
- let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
- return Err(Warning::TBD);
- };
- let (var, rest) = rest.split_at(length);
- if !var.is_empty() {
- vars.push(var.into());
- }
- input = rest;
- }
- _ => return Err(Warning::TBD),
- }
- }
- while input.first() == Some(&b'\n') {
- input = &input[1..];
- }
- Ok((
- MultipleResponseSet {
- name: name.into(),
- label,
- mr_type,
- short_names: vars,
- },
- input,
- ))
- }
-
- fn decode(
- &self,
- decoder: &Decoder,
- ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
- let mut short_names = Vec::with_capacity(self.short_names.len());
- for short_name in self.short_names.iter() {
- if let Some(short_name) = decoder
- .decode_identifier(short_name)
- .map_err(Warning::InvalidMrSetName)
- .issue_warning(&decoder.warn)
- {
- short_names.push(short_name);
- }
- }
- Ok(MultipleResponseSet {
- name: decoder
- .decode_identifier(&self.name)
- .map_err(Warning::InvalidMrSetVariableName)?,
- label: decoder.decode(&self.label).to_string(),
- mr_type: self.mr_type.clone(),
- short_names,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
-where
- I: Debug,
- S: Debug;
-
-impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
- const SUBTYPE: u32 = 7;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "multiple response set record";
-
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let mut sets = Vec::new();
- while !input.is_empty() {
- let (set, rest) = MultipleResponseSet::parse(input)?;
- sets.push(set);
- input = rest;
- }
- Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
- }
-}
-
-impl MultipleResponseRecord<RawString, RawString> {
- fn decode(self, decoder: &Decoder) -> DecodedRecord {
- let mut sets = Vec::new();
- for set in self.0.iter() {
- if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) {
- sets.push(set);
- }
- }
- DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
- }
-}
-
-fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
- let Some(space) = input.iter().position(|&b| b == b' ') else {
- return Err(Warning::TBD);
- };
- let Ok(length) = from_utf8(&input[..space]) else {
- return Err(Warning::TBD);
- };
- let Ok(length): Result<usize, _> = length.parse() else {
- return Err(Warning::TBD);
- };
-
- let input = &input[space + 1..];
- if input.len() < length {
- return Err(Warning::TBD);
- };
-
- let (string, rest) = input.split_at(length);
- Ok((string.into(), rest))
-}
-
-/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement).
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Measure {
- /// Nominal values can only be compared for equality.
- Nominal,
-
- /// Ordinal values can be meaningfully ordered.
- Ordinal,
-
- /// Scale values can be meaningfully compared for the degree of difference.
- Scale,
-}
-
-impl Measure {
- pub fn default_for_type(var_type: VarType) -> Option<Measure> {
- match var_type {
- VarType::Numeric => None,
- VarType::String => Some(Self::Nominal),
- }
- }
-
- fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
- match source {
- 0 => Ok(None),
- 1 => Ok(Some(Measure::Nominal)),
- 2 => Ok(Some(Measure::Ordinal)),
- 3 => Ok(Some(Measure::Scale)),
- _ => Err(Warning::InvalidMeasurement(source)),
- }
- }
-}
-
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Alignment {
- Left,
- Right,
- Center,
-}
-
-impl Alignment {
- fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
- match source {
- 0 => Ok(None),
- 1 => Ok(Some(Alignment::Left)),
- 2 => Ok(Some(Alignment::Right)),
- 3 => Ok(Some(Alignment::Center)),
- _ => Err(Warning::InvalidAlignment(source)),
- }
- }
-
- pub fn default_for_type(var_type: VarType) -> Self {
- match var_type {
- VarType::Numeric => Self::Right,
- VarType::String => Self::Left,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplay {
- pub measure: Option<Measure>,
- pub width: Option<u32>,
- pub alignment: Option<Alignment>,
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplayRecord(pub Vec<VarDisplay>);
-
-impl VarDisplayRecord {
- const SUBTYPE: u32 = 11;
-
- fn parse(
- ext: &Extension,
- n_vars: usize,
- endian: Endian,
- warn: &dyn Fn(Warning),
- ) -> Result<Record, Warning> {
- if ext.size != 4 {
- return Err(Warning::BadRecordSize {
- offset: ext.offsets.start,
- record: String::from("variable display record"),
- size: ext.size,
- expected_size: 4,
- });
- }
-
- let has_width = if ext.count as usize == 3 * n_vars {
- true
- } else if ext.count as usize == 2 * n_vars {
- false
- } else {
- return Err(Warning::TBD);
- };
-
- let mut var_displays = Vec::new();
- let mut input = &ext.data[..];
- for _ in 0..n_vars {
- let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(&warn)
- .flatten();
- let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
- let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(&warn)
- .flatten();
- var_displays.push(VarDisplay {
- measure,
- width,
- alignment,
- });
- }
- Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValues<N>
-where
- N: Debug,
-{
- /// Variable name.
- pub var_name: N,
-
- /// Missing values.
- pub missing_values: Vec<RawStrArray<8>>,
-}
-
-impl LongStringMissingValues<RawString> {
- fn decode(&self, decoder: &Decoder) -> Result<LongStringMissingValues<Identifier>, IdError> {
- Ok(LongStringMissingValues {
- var_name: decoder.decode_identifier(&self.var_name)?,
- missing_values: self.missing_values.clone(),
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValueRecord<N>(pub Vec<LongStringMissingValues<N>>)
-where
- N: Debug;
-
-impl ExtensionRecord for LongStringMissingValueRecord<RawString> {
- const SUBTYPE: u32 = 22;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "long string missing values record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let mut missing_value_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
- let value_len: u32 = endian.parse(read_bytes(&mut input)?);
- if value_len != 8 {
- let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
- return Err(Warning::BadLongMissingValueLength {
- record_offset: ext.offsets.start,
- offset,
- value_len,
- });
- }
- let mut missing_values = Vec::new();
- for i in 0..n_missing_values {
- let value: [u8; 8] = read_bytes(&mut input)?;
- let numeric_value: u64 = endian.parse(value);
- let value = if i > 0 && numeric_value == 8 {
- // Tolerate files written by old, buggy versions of PSPP
- // where we believed that the value_length was repeated
- // before each missing value.
- read_bytes(&mut input)?
- } else {
- value
- };
- missing_values.push(RawStrArray(value));
- }
- missing_value_set.push(LongStringMissingValues {
- var_name,
- missing_values,
- });
- }
- Ok(Record::LongStringMissingValues(
- LongStringMissingValueRecord(missing_value_set),
- ))
- }
-}
-
-impl LongStringMissingValueRecord<RawString> {
- pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord<Identifier> {
- let mut mvs = Vec::with_capacity(self.0.len());
- for mv in self.0.iter() {
- if let Some(mv) = mv
- .decode(decoder)
- .map_err(Warning::InvalidLongStringMissingValueVariableName)
- .issue_warning(&decoder.warn)
- {
- mvs.push(mv);
- }
- }
- LongStringMissingValueRecord(mvs)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct EncodingRecord(pub String);
-
-impl ExtensionRecord for EncodingRecord {
- const SUBTYPE: u32 = 20;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "encoding record";
-
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- Ok(Record::Encoding(EncodingRecord(
- String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
- offset: ext.offsets.start,
- })?,
- )))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct NumberOfCasesRecord {
- /// Always observed as 1.
- pub one: u64,
-
- /// Number of cases.
- pub n_cases: u64,
-}
-
-impl ExtensionRecord for NumberOfCasesRecord {
- const SUBTYPE: u32 = 16;
- const SIZE: Option<u32> = Some(8);
- const COUNT: Option<u32> = Some(2);
- const NAME: &'static str = "extended number of cases record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let one = endian.parse(read_bytes(&mut input)?);
- let n_cases = endian.parse(read_bytes(&mut input)?);
-
- Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct TextRecord {
- pub offsets: Range<u64>,
-
- /// Type of record.
- pub rec_type: TextRecordType,
-
- /// The text content of the record.
- pub text: RawString,
-}
-
-#[derive(Clone, Copy, Debug)]
-pub enum TextRecordType {
- VariableSets,
- ProductInfo,
- LongNames,
- VeryLongStrings,
- FileAttributes,
- VariableAttributes,
-}
-
-impl TextRecord {
- fn new(extension: Extension, rec_type: TextRecordType) -> Self {
- Self {
- offsets: extension.offsets,
- rec_type,
- text: extension.data.into(),
- }
- }
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- match self.rec_type {
- TextRecordType::VariableSets => {
- DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder))
- }
- TextRecordType::ProductInfo => {
- DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder))
- }
- TextRecordType::LongNames => {
- DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder))
- }
- TextRecordType::VeryLongStrings => {
- DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder))
- }
- TextRecordType::FileAttributes => {
- DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder))
- }
- TextRecordType::VariableAttributes => {
- DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder))
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongString {
- pub short_name: Identifier,
- pub length: u16,
-}
-
-impl VeryLongString {
- fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
- let Some((short_name, length)) = input.split_once('=') else {
- return Err(Warning::TBD);
- };
- let short_name = decoder
- .new_identifier(short_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidLongStringName)?;
- let length = length.parse().map_err(|_| Warning::TBD)?;
- Ok(VeryLongString { short_name, length })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongStringsRecord(pub Vec<VeryLongString>);
-
-impl VeryLongStringsRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let input = decoder.decode(&source.text);
- let mut very_long_strings = Vec::new();
- for tuple in input
- .split('\0')
- .map(|s| s.trim_end_matches('\t'))
- .filter(|s| !s.is_empty())
- {
- if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) {
- very_long_strings.push(vls)
- }
- }
- VeryLongStringsRecord(very_long_strings)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl Attribute {
- fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
- let Some((name, mut input)) = input.split_once('(') else {
- return Err(Warning::TBD);
- };
- let name = decoder
- .new_identifier(name)
- .map_err(Warning::InvalidAttributeName)?;
- let mut values = Vec::new();
- loop {
- let Some((value, rest)) = input.split_once('\n') else {
- return Err(Warning::TBD);
- };
- if let Some(stripped) = value
- .strip_prefix('\'')
- .and_then(|value| value.strip_suffix('\''))
- {
- values.push(stripped.into());
- } else {
- decoder.warn(Warning::TBD);
- values.push(value.into());
- }
- if let Some(rest) = rest.strip_prefix(')') {
- let attribute = Attribute { name, values };
- return Ok((attribute, rest));
- };
- input = rest;
- }
- }
-}
-
-impl Attributes {
- fn parse<'a>(
- decoder: &Decoder,
- mut input: &'a str,
- sentinel: Option<char>,
- ) -> Result<(Attributes, &'a str), Warning> {
- let mut attributes = HashMap::new();
- let rest = loop {
- match input.chars().next() {
- None => break input,
- c if c == sentinel => break &input[1..],
- _ => {
- let (attribute, rest) = Attribute::parse(decoder, input)?;
- // XXX report duplicate name
- attributes.insert(attribute.name, attribute.values);
- input = rest;
- }
- }
- };
- Ok((Attributes(attributes), rest))
- }
-}
-
-#[derive(Clone, Debug, Default)]
-pub struct FileAttributeRecord(pub Attributes);
-
-impl FileAttributeRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let input = decoder.decode(&source.text);
- match Attributes::parse(decoder, &input, None).issue_warning(&decoder.warn) {
- Some((set, rest)) => {
- if !rest.is_empty() {
- decoder.warn(Warning::TBD);
- }
- FileAttributeRecord(set)
- }
- None => FileAttributeRecord::default(),
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarAttributes {
- pub long_var_name: Identifier,
- pub attributes: Attributes,
-}
-
-impl VarAttributes {
- fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributes, &'a str), Warning> {
- let Some((long_var_name, rest)) = input.split_once(':') else {
- return Err(Warning::TBD);
- };
- let long_var_name = decoder
- .new_identifier(long_var_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidAttributeVariableName)?;
- let (attributes, rest) = Attributes::parse(decoder, rest, Some('/'))?;
- let var_attribute = VarAttributes {
- long_var_name,
- attributes,
- };
- Ok((var_attribute, rest))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableAttributeRecord(pub Vec<VarAttributes>);
-
-impl VariableAttributeRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let decoded = decoder.decode(&source.text);
- let mut input = decoded.as_ref();
- let mut var_attribute_sets = Vec::new();
- while !input.is_empty() {
- let Some((var_attribute, rest)) =
- VarAttributes::parse(decoder, input).issue_warning(&decoder.warn)
- else {
- break;
- };
- var_attribute_sets.push(var_attribute);
- input = rest;
- }
- VariableAttributeRecord(var_attribute_sets)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongName {
- pub short_name: Identifier,
- pub long_name: Identifier,
-}
-
-impl LongName {
- fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
- let Some((short_name, long_name)) = input.split_once('=') else {
- return Err(Warning::TBD);
- };
- let short_name = decoder
- .new_identifier(short_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidShortName)?;
- let long_name = decoder
- .new_identifier(long_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidLongName)?;
- Ok(LongName {
- short_name,
- long_name,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongNamesRecord(pub Vec<LongName>);
-
-impl LongNamesRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let input = decoder.decode(&source.text);
- let mut names = Vec::new();
- for pair in input.split('\t').filter(|s| !s.is_empty()) {
- if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) {
- names.push(long_name);
- }
- }
- LongNamesRecord(names)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ProductInfoRecord(pub String);
-
-impl ProductInfoRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- Self(decoder.decode(&source.text).into())
- }
-}
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: Identifier,
- pub variable_names: Vec<Identifier>,
-}
-
-impl VariableSet {
- fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
- let (name, input) = input.split_once('=').ok_or(Warning::TBD)?;
- let name = decoder.new_identifier(name).map_err(|_| Warning::TBD)?;
- let mut vars = Vec::new();
- for var in input.split_ascii_whitespace() {
- if let Some(identifier) = decoder
- .new_identifier(var)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidVariableSetName)
- .issue_warning(&decoder.warn)
- {
- vars.push(identifier);
- }
- }
- Ok(VariableSet {
- name,
- variable_names: vars,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSetRecord {
- pub offsets: Range<u64>,
- pub sets: Vec<VariableSet>,
-}
-
-impl VariableSetRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
- let mut sets = Vec::new();
- let input = decoder.decode(&source.text);
- for line in input.lines() {
- if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) {
- sets.push(set)
- }
- }
- VariableSetRecord {
- offsets: source.offsets.clone(),
- sets,
- }
- }
-}
-
-trait IssueWarning<T> {
- fn issue_warning<F>(self, warn: &F) -> Option<T>
- where
- F: Fn(Warning);
-}
-impl<T> IssueWarning<T> for Result<T, Warning> {
- fn issue_warning<F>(self, warn: &F) -> Option<T>
- where
- F: Fn(Warning),
- {
- match self {
- Ok(result) => Some(result),
- Err(error) => {
- warn(error);
- None
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Extension {
- pub offsets: Range<u64>,
-
- /// Record subtype.
- pub subtype: u32,
-
- /// Size of each data element.
- pub size: u32,
-
- /// Number of data elements.
- pub count: u32,
-
- /// `size * count` bytes of data.
- pub data: Vec<u8>,
-}
-
-impl Extension {
- fn check_size<E: ExtensionRecord>(&self) -> Result<(), Warning> {
- if let Some(expected_size) = E::SIZE {
- if self.size != expected_size {
- return Err(Warning::BadRecordSize {
- offset: self.offsets.start,
- record: E::NAME.into(),
- size: self.size,
- expected_size,
- });
- }
- }
- if let Some(expected_count) = E::COUNT {
- if self.count != expected_count {
- return Err(Warning::BadRecordCount {
- offset: self.offsets.start,
- record: E::NAME.into(),
- count: self.count,
- expected_count,
- });
- }
- }
- Ok(())
- }
-
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- n_vars: usize,
- warn: &dyn Fn(Warning),
- ) -> Result<Option<Record>, Error> {
- let subtype = endian.parse(read_bytes(r)?);
- let header_offset = r.stream_position()?;
- let size: u32 = endian.parse(read_bytes(r)?);
- let count = endian.parse(read_bytes(r)?);
- let Some(product) = size.checked_mul(count) else {
- return Err(Error::ExtensionRecordTooLarge {
- offset: header_offset,
- subtype,
- size,
- count,
- });
- };
- let start_offset = r.stream_position()?;
- let data = read_vec(r, product as usize)?;
- let end_offset = start_offset + product as u64;
- let extension = Extension {
- offsets: start_offset..end_offset,
- subtype,
- size,
- count,
- data,
- };
- let result = match subtype {
- IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
- FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
- VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
- MultipleResponseRecord::SUBTYPE | 19 => {
- MultipleResponseRecord::parse(&extension, endian)
- }
- LongStringValueLabelRecord::SUBTYPE => {
- LongStringValueLabelRecord::parse(&extension, endian)
- }
- EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
- NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
- 5 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::VariableSets,
- ))),
- 10 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::ProductInfo,
- ))),
- 13 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::LongNames,
- ))),
- 14 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::VeryLongStrings,
- ))),
- 17 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::FileAttributes,
- ))),
- 18 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::VariableAttributes,
- ))),
- _ => Ok(Record::OtherExtension(extension)),
- };
- match result {
- Ok(result) => Ok(Some(result)),
- Err(error) => {
- warn(error);
- Ok(None)
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZHeader {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// File offset to the ZLIB data header.
- pub zheader_offset: u64,
-
- /// File offset to the ZLIB trailer.
- pub ztrailer_offset: u64,
-
- /// Length of the ZLIB trailer in bytes.
- pub ztrailer_len: u64,
-}
-
-impl ZHeader {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
- let offset = r.stream_position()?;
- let zheader_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
-
- Ok(ZHeader {
- offset,
- zheader_offset,
- ztrailer_offset,
- ztrailer_len,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZTrailer {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// Compression bias as a negative integer, e.g. -100.
- pub int_bias: i64,
-
- /// Always observed as zero.
- pub zero: u64,
-
- /// Uncompressed size of each block, except possibly the last. Only
- /// `0x3ff000` has been observed so far.
- pub block_size: u32,
-
- /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
- pub blocks: Vec<ZBlock>,
-}
-
-#[derive(Clone, Debug)]
-pub struct ZBlock {
- /// Offset of block of data if simple compression were used.
- pub uncompressed_ofs: u64,
-
- /// Actual offset within the file of the compressed data block.
- pub compressed_ofs: u64,
-
- /// The number of bytes in this data block after decompression. This is
- /// `block_size` in every data block but the last, which may be smaller.
- pub uncompressed_size: u32,
-
- /// The number of bytes in this data block, as stored compressed in this
- /// file.
- pub compressed_size: u32,
-}
-
-impl ZBlock {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
- Ok(ZBlock {
- uncompressed_ofs: endian.parse(read_bytes(r)?),
- compressed_ofs: endian.parse(read_bytes(r)?),
- uncompressed_size: endian.parse(read_bytes(r)?),
- compressed_size: endian.parse(read_bytes(r)?),
- })
- }
-}
-
-impl ZTrailer {
- fn read<R: Read + Seek>(
- reader: &mut R,
- endian: Endian,
- ztrailer_ofs: u64,
- ztrailer_len: u64,
- ) -> Result<Option<ZTrailer>, Error> {
- let start_offset = reader.stream_position()?;
- if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
- return Ok(None);
- }
- let int_bias = endian.parse(read_bytes(reader)?);
- let zero = endian.parse(read_bytes(reader)?);
- let block_size = endian.parse(read_bytes(reader)?);
- let n_blocks: u32 = endian.parse(read_bytes(reader)?);
- let expected_n_blocks = (ztrailer_len - 24) / 24;
- if n_blocks as u64 != expected_n_blocks {
- return Err(Error::BadZlibTrailerNBlocks {
- offset: ztrailer_ofs,
- n_blocks,
- expected_n_blocks,
- ztrailer_len,
- });
- }
- let blocks = (0..n_blocks)
- .map(|_| ZBlock::read(reader, endian))
- .collect::<Result<Vec<_>, _>>()?;
- reader.seek(SeekFrom::Start(start_offset))?;
- Ok(Some(ZTrailer {
- offset: ztrailer_ofs,
- int_bias,
- zero,
- block_size,
- blocks,
- }))
- }
-}
-
/// Reads exactly `N` bytes from `r`, or returns `Ok(None)` if the very first
/// read yields no bytes (for example, at end of input).
///
/// If some but not all of the `N` bytes are available, this is an error, as
/// reported by `read_exact`.
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut bytes = [0; N];
    match r.read(&mut bytes)? {
        0 => Ok(None),
        filled => {
            if filled < N {
                r.read_exact(&mut bytes[filled..])?;
            }
            Ok(Some(bytes))
        }
    }
}
-
/// Reads exactly `N` bytes from `r` into a fixed-size array.
///
/// Fails with the error from `read_exact` if fewer than `N` bytes are
/// available.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut bytes = [0u8; N];
    r.read_exact(&mut bytes).map(|()| bytes)
}
-
/// Reads exactly `n` bytes from `r` into a new vector.
///
/// `n` usually comes from a length field in the file being read, so the
/// buffer is grown as data actually arrives rather than being allocated up
/// front: a corrupt length then produces an `UnexpectedEof` error instead of
/// a huge allocation.
///
/// # Errors
///
/// Returns an error of kind `UnexpectedEof` if `r` ends before `n` bytes have
/// been read, or any other error reported by `r`.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    // Cap the initial reservation so an untrusted `n` cannot force a large
    // allocation before any data has been seen.
    const MAX_INITIAL_CAPACITY: usize = 64 * 1024;

    let limit = u64::try_from(n).unwrap_or(u64::MAX);
    let mut vec = Vec::with_capacity(n.min(MAX_INITIAL_CAPACITY));
    let n_read = r.by_ref().take(limit).read_to_end(&mut vec)?;
    if n_read == n {
        Ok(vec)
    } else {
        Err(IoError::from(std::io::ErrorKind::UnexpectedEof))
    }
}
-
-fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
- let length: u32 = endian.parse(read_bytes(r)?);
- Ok(read_vec(r, length as usize)?.into())
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabels<N, S>
-where
- S: Debug,
-{
- pub var_name: N,
- pub width: u32,
-
- /// `(value, label)` pairs, where each value is `width` bytes.
- pub labels: Vec<(RawString, S)>,
-}
-
-impl LongStringValueLabels<RawString, RawString> {
- fn decode(
- &self,
- decoder: &Decoder,
- ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
- let var_name = decoder.decode(&self.var_name);
- let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
- .map_err(Warning::InvalidLongStringValueLabelName)?;
-
- let mut labels = Vec::with_capacity(self.labels.len());
- for (value, label) in self.labels.iter() {
- let label = decoder.decode(label).to_string();
- labels.push((value.clone(), label));
- }
-
- Ok(LongStringValueLabels {
- var_name,
- width: self.width,
- labels,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
-where
- N: Debug,
- S: Debug;
-
-impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
- const SUBTYPE: u32 = 21;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "long string value labels record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let mut label_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- let width: u32 = endian.parse(read_bytes(&mut input)?);
- let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
- let mut labels = Vec::new();
- for _ in 0..n_labels {
- let value = read_string(&mut input, endian)?;
- let label = read_string(&mut input, endian)?;
- labels.push((value, label));
- }
- label_set.push(LongStringValueLabels {
- var_name,
- width,
- labels,
- })
- }
- Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
- label_set,
- )))
- }
-}
-
-impl LongStringValueLabelRecord<RawString, RawString> {
- fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord<Identifier, String> {
- let mut labels = Vec::with_capacity(self.0.len());
- for label in &self.0 {
- match label.decode(decoder) {
- Ok(set) => labels.push(set),
- Err(error) => decoder.warn(error),
- }
- }
- LongStringValueLabelRecord(labels)
- }
-}
-
-#[derive(Default)]
-pub struct VarTypes {
- pub types: Vec<Option<VarType>>,
-}
-
-impl VarTypes {
- pub fn new() -> Self {
- Self::default()
- }
-
- pub fn push(&mut self, width: RawWidth) {
- if let Ok(var_type) = VarType::try_from(width) {
- self.types.push(Some(var_type));
- for _ in 1..width.n_values().unwrap() {
- self.types.push(None);
- }
- }
- }
-
- pub fn n_values(&self) -> usize {
- self.types.len()
- }
-
- pub fn is_valid_index(&self, index: usize) -> bool {
- self.var_type_at(index).is_some()
- }
-
- pub fn var_type_at(&self, index: usize) -> Option<VarType> {
- if index >= 1 && index <= self.types.len() {
- self.types[index - 1]
- } else {
- None
- }
- }
-
- pub fn iter(&self) -> impl Iterator<Item = VarType> + use<'_> {
- self.types
- .iter()
- .map(|var_type| var_type.unwrap_or(VarType::String))
- }
-}
+++ /dev/null
-use float_next_after::NextAfter;
-use num::{Bounded, Zero};
-use ordered_float::OrderedFloat;
-use std::{
- collections::{hash_map::Entry, HashMap},
- error::Error as StdError,
- fmt::{Display, Formatter, Result as FmtResult},
- iter::repeat_n,
-};
-
-use crate::endian::{Endian, ToBytes};
-
/// Result type for this module, defaulting to the module's own [`Error`].
pub type Result<T, F = Error> = std::result::Result<T, F>;

/// An error found while assembling input, with as much location information
/// as was available.
#[derive(Debug)]
pub struct Error {
    /// Name of the input file, if known.
    pub file_name: Option<String>,
    /// Line number in the input, if known.
    pub line_number: Option<usize>,
    /// Text of the token at which the error was detected, if any.
    pub token: Option<String>,
    /// Description of the problem.
    pub message: String,
}

impl Error {
    /// Builds an error, taking owned copies of the borrowed strings.
    fn new(
        file_name: Option<&str>,
        line_number: Option<usize>,
        token: Option<&str>,
        message: String,
    ) -> Error {
        let file_name = file_name.map(String::from);
        let token = token.map(String::from);
        Error {
            file_name,
            line_number,
            token,
            message,
        }
    }
}

impl StdError for Error {}

impl Display for Error {
    /// Formats as `FILE:LINE: at 'TOKEN': MESSAGE`, leaving out whichever
    /// location parts are unknown.  A line number without a file name is
    /// written as `line LINE: `.
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        if let Some(file_name) = &self.file_name {
            write!(f, "{file_name}")?;
            if let Some(line_number) = self.line_number {
                write!(f, ":{line_number}")?;
            }
            write!(f, ": ")?;
        } else if let Some(line_number) = self.line_number {
            write!(f, "line {line_number}: ")?;
        }
        if let Some(token) = &self.token {
            write!(f, "at '{token}': ")?;
        }
        write!(f, "{}", self.message)
    }
}
-
-pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result<Vec<u8>> {
- let mut symbol_table = HashMap::new();
- let output = _sack(input, input_file_name, endian, &mut symbol_table)?;
- let output = if !symbol_table.is_empty() {
- for (k, v) in symbol_table.iter() {
- println!("{k} => {v:?}");
- }
- for (k, v) in symbol_table.iter() {
- if v.is_none() {
- Err(Error::new(
- input_file_name,
- None,
- None,
- format!("label {k} used but never defined"),
- ))?
- }
- }
- _sack(input, input_file_name, endian, &mut symbol_table)?
- } else {
- output
- };
- Ok(output)
-}
-
-fn _sack(
- input: &str,
- input_file_name: Option<&str>,
- endian: Endian,
- symbol_table: &mut HashMap<String, Option<u32>>,
-) -> Result<Vec<u8>> {
- let mut lexer = Lexer::new(input, input_file_name, endian)?;
- let mut output = Vec::new();
- while parse_data_item(&mut lexer, &mut output, symbol_table)? {}
- Ok(output)
-}
-
-fn parse_data_item(
- lexer: &mut Lexer,
- output: &mut Vec<u8>,
- symbol_table: &mut HashMap<String, Option<u32>>,
-) -> Result<bool> {
- if lexer.token.is_none() {
- return Ok(false);
- };
-
- let initial_len = output.len();
- match lexer.take()? {
- Token::Integer(integer) => {
- if let Ok(integer) = TryInto::<i32>::try_into(integer) {
- output.extend_from_slice(&lexer.endian.to_bytes(integer));
- } else if let Ok(integer) = TryInto::<u32>::try_into(integer) {
- output.extend_from_slice(&lexer.endian.to_bytes(integer));
- } else {
- Err(lexer.error(format!(
- "{integer} is not in the valid range [{},{}]",
- i32::MIN,
- u32::MAX
- )))?;
- };
- }
- Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)),
- Token::PcSysmis => {
- output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff])
- }
- Token::I8 => put_integers::<u8, 1>(lexer, "i8", output)?,
- Token::I16 => put_integers::<u16, 2>(lexer, "i16", output)?,
- Token::I64 => put_integers::<i64, 8>(lexer, "i64", output)?,
- Token::String(string) => output.extend_from_slice(string.as_bytes()),
- Token::S(size) => {
- let Some((Token::String(ref string), _)) = lexer.token else {
- Err(lexer.error(format!("string expected after 's{size}'")))?
- };
- let len = string.len();
- if len > size {
- Err(lexer.error(format!(
- "{len}-byte string is longer than pad length {size}"
- )))?
- }
- output.extend_from_slice(string.as_bytes());
- output.extend(repeat_n(b' ', size - len));
- lexer.get()?;
- }
- Token::LParen => {
- while !matches!(lexer.token, Some((Token::RParen, _))) {
- parse_data_item(lexer, output, symbol_table)?;
- }
- lexer.get()?;
- }
- Token::Count => put_counted_items::<u32, 4>(lexer, "COUNT", output, symbol_table)?,
- Token::Count8 => put_counted_items::<u8, 1>(lexer, "COUNT8", output, symbol_table)?,
- Token::Hex => {
- let Some((Token::String(ref string), _)) = lexer.token else {
- Err(lexer.error(String::from("string expected after 'hex'")))?
- };
- let mut string = &string[..];
- loop {
- string = string.trim_start();
- if string.is_empty() {
- break;
- };
-
- let mut i = string.chars();
- let Some(c0) = i.next() else { return Ok(true) };
- let Some(c1) = i.next() else {
- Err(lexer.error(String::from("hex string has odd number of characters")))?
- };
-
- let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else {
- Err(lexer.error(String::from("invalid digit in hex string")))?
- };
- let byte = digit0 * 16 + digit1;
- output.push(byte as u8);
-
- string = i.as_str();
- }
- lexer.get()?;
- }
- Token::Label(name) => {
- println!("define {name}");
- let value = output.len() as u32;
- match symbol_table.entry(name.clone()) {
- Entry::Vacant(v) => {
- v.insert(Some(value));
- }
- Entry::Occupied(mut o) => {
- match o.get() {
- Some(v) => {
- if *v != value {
- Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))?
- }
- }
- None => drop(o.insert(Some(value))),
- }
- }
- };
- return Ok(true);
- }
- Token::At(name) => {
- let mut value = *symbol_table.entry(name.clone()).or_insert(None);
- loop {
- let plus = match lexer.token {
- Some((Token::Plus, _)) => true,
- Some((Token::Minus, _)) => false,
- _ => break,
- };
- lexer.get()?;
-
- let operand = match lexer.token {
- Some((Token::At(ref name), _)) => {
- *symbol_table.entry(name.clone()).or_insert(None)
- }
- Some((Token::Integer(integer), _)) => Some(
- integer
- .try_into()
- .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?,
- ),
- _ => Err(lexer.error(String::from("expecting @label or integer literal")))?,
- };
- lexer.get()?;
-
- value = match (value, operand) {
- (Some(a), Some(b)) => Some(
- if plus {
- a.checked_add(b)
- } else {
- a.checked_sub(b)
- }
- .ok_or_else(|| {
- lexer.error(String::from("overflow in offset arithmetic"))
- })?,
- ),
- _ => None,
- };
- }
- let value = value.unwrap_or(0);
- output.extend_from_slice(&lexer.endian.to_bytes(value));
- }
- _ => (),
- };
- if let Some((Token::Asterisk, _)) = lexer.token {
- lexer.get()?;
- let Token::Integer(count) = lexer.take()? else {
- Err(lexer.error(String::from("positive integer expected after '*'")))?
- };
- if count < 1 {
- Err(lexer.error(String::from("positive integer expected after '*'")))?
- };
- let final_len = output.len();
- for _ in 1..count {
- output.extend_from_within(initial_len..final_len);
- }
- }
- match lexer.token {
- Some((Token::Semicolon, _)) => {
- lexer.get()?;
- }
- Some((Token::RParen, _)) => (),
- _ => Err(lexer.error(String::from("';' expected")))?,
- }
- Ok(true)
-}
-
-fn put_counted_items<T, const N: usize>(
- lexer: &mut Lexer,
- name: &str,
- output: &mut Vec<u8>,
- symbol_table: &mut HashMap<String, Option<u32>>,
-) -> Result<()>
-where
- T: Zero + TryFrom<usize>,
- Endian: ToBytes<T, N>,
-{
- let old_size = output.len();
- output.extend_from_slice(&lexer.endian.to_bytes(T::zero()));
- let start = output.len();
- if !matches!(lexer.token, Some((Token::LParen, _))) {
- Err(lexer.error(format!("'(' expected after '{name}'")))?
- }
- lexer.get()?;
- while !matches!(lexer.token, Some((Token::RParen, _))) {
- parse_data_item(lexer, output, symbol_table)?;
- }
- lexer.get()?;
- let delta = output.len() - start;
- let Ok(delta): Result<T, _> = delta.try_into() else {
- Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))?
- };
- let dest = &mut output[old_size..old_size + N];
- dest.copy_from_slice(&lexer.endian.to_bytes(delta));
- Ok(())
-}
-
-fn put_integers<T, const N: usize>(
- lexer: &mut Lexer,
- name: &str,
- output: &mut Vec<u8>,
-) -> Result<()>
-where
- T: Bounded + Display + TryFrom<i64> + Copy,
- Endian: ToBytes<T, N>,
-{
- println!("put_integers {:?}", lexer.token);
- let mut n = 0;
- while let Some(integer) = lexer.take_if(|t| match t {
- Token::Integer(integer) => Some(*integer),
- _ => None,
- })? {
- println!("got integer {integer}");
- let Ok(integer) = integer.try_into() else {
- Err(lexer.error(format!(
- "{integer} is not in the valid range [{},{}]",
- T::min_value(),
- T::max_value()
- )))?
- };
- output.extend_from_slice(&lexer.endian.to_bytes(integer));
- n += 1;
- }
- println!("put_integers {:?} {n}", lexer.token);
- if n == 0 {
- Err(lexer.error(format!("integer expected after '{name}'")))?
- }
- Ok(())
-}
-
/// A lexical token of the assembler's input language.
#[derive(PartialEq, Eq, Clone, Debug)]
enum Token {
    /// Integer literal, decimal or `0x`-prefixed hexadecimal.  Also produced
    /// by the `ENDIAN` keyword.
    Integer(i64),
    /// Floating-point literal (a number containing `.`).  Also produced by
    /// the `SYSMIS`, `LOWEST`, and `HIGHEST` keywords.
    Float(OrderedFloat<f64>),
    /// The `PCSYSMIS` keyword.
    PcSysmis,
    /// Double-quoted string, without the quotes.
    String(String),
    /// `;`
    Semicolon,
    /// `*`
    Asterisk,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// The `i8` keyword.
    I8,
    /// The `i16` keyword.
    I16,
    /// The `i64` keyword.
    I64,
    /// `s` immediately followed by a number, e.g. `s60`; carries the number.
    S(usize),
    /// The `COUNT` keyword.
    Count,
    /// The `COUNT8` keyword.
    Count8,
    /// The `hex` keyword.
    Hex,
    /// Label definition: a name immediately followed by `:`.
    Label(String),
    /// Label reference: `@` followed by the label name; carries the name
    /// without the `@`.
    At(String),
    /// A `-` that stands alone rather than starting a number.
    Minus,
    /// `+`
    Plus,
}
-
/// Tokenizer with one token of lookahead.
struct Lexer<'a> {
    /// Input that has not been tokenized yet.
    input: &'a str,
    /// The lookahead token along with the source text it was lexed from, or
    /// `None` at end of input.
    token: Option<(Token, &'a str)>,
    /// Name of the input file, used in error messages.
    input_file_name: Option<&'a str>,
    /// Current line number, starting from 1, used in error messages.
    line_number: usize,
    /// Byte order for emitted values; also determines the value of the
    /// `ENDIAN` keyword.
    endian: Endian,
}
-
/// Skips white space and `#` comments at the start of `s`.
///
/// Spaces, tabs, carriage returns, `<`, and `>` are all skipped as white
/// space.  A comment runs from `#` through the end of its line.
///
/// Returns the remaining input and the number of new-lines that were skipped.
/// A comment that reaches the end of input without a new-line consumes the
/// rest of the input and is not counted.
fn skip_comments(mut s: &str) -> (&str, usize) {
    const IGNORED: [char; 5] = [' ', '\t', '\r', '<', '>'];

    let mut n_newlines = 0;
    loop {
        s = s.trim_start_matches(IGNORED);
        let after_line = if let Some(comment) = s.strip_prefix('#') {
            match comment.find('\n') {
                Some(newline) => &comment[newline + 1..],
                None => return ("", n_newlines),
            }
        } else if let Some(rest) = s.strip_prefix('\n') {
            rest
        } else {
            return (s, n_newlines);
        };
        s = after_line;
        n_newlines += 1;
    }
}
-
-impl<'a> Lexer<'a> {
- fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result<Lexer<'a>> {
- let mut lexer = Lexer {
- input,
- token: None,
- input_file_name,
- line_number: 1,
- endian,
- };
- lexer.token = lexer.next()?;
- Ok(lexer)
- }
- fn error(&self, message: String) -> Error {
- let repr = self.token.as_ref().map(|(_, repr)| *repr);
- Error::new(self.input_file_name, Some(self.line_number), repr, message)
- }
- fn take(&mut self) -> Result<Token> {
- let Some(token) = self.token.take() else {
- Err(self.error(String::from("unexpected end of input")))?
- };
- self.token = self.next()?;
- Ok(token.0)
- }
- fn take_if<F, T>(&mut self, condition: F) -> Result<Option<T>>
- where
- F: FnOnce(&Token) -> Option<T>,
- {
- let Some(ref token) = self.token else {
- return Ok(None);
- };
- match condition(&token.0) {
- Some(value) => {
- self.token = self.next()?;
- Ok(Some(value))
- }
- None => Ok(None),
- }
- }
- fn get(&mut self) -> Result<Option<&Token>> {
- if self.token.is_none() {
- Err(self.error(String::from("unexpected end of input")))?
- } else {
- self.token = self.next()?;
- match self.token {
- Some((ref token, _)) => Ok(Some(token)),
- None => Ok(None),
- }
- }
- }
-
- fn next(&mut self) -> Result<Option<(Token, &'a str)>> {
- // Get the first character of the token, skipping past white space and
- // comments.
- let (s, n_newlines) = skip_comments(self.input);
- self.line_number += n_newlines;
- self.input = s;
-
- let start = s;
- let mut iter = s.chars();
- let Some(c) = iter.next() else {
- return Ok(None);
- };
- let (token, rest) = match c {
- c if c.is_ascii_digit() || c == '-' => {
- let len = s
- .find(|c: char| {
- !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-')
- })
- .unwrap_or(s.len());
- let (number, rest) = s.split_at(len);
- let token = if number == "-" {
- Token::Minus
- } else if let Some(digits) = number.strip_prefix("0x") {
- Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| {
- self.error(format!("bad integer literal '{number}' ({msg})"))
- })?)
- } else if !number.contains('.') {
- Token::Integer(number.parse().map_err(|msg| {
- self.error(format!("bad integer literal '{number}' ({msg})"))
- })?)
- } else {
- Token::Float(number.parse().map_err(|msg| {
- self.error(format!("bad float literal '{number}' ({msg})"))
- })?)
- };
- (token, rest)
- }
- '"' => {
- let s = iter.as_str();
- let Some(len) = s.find(['\n', '"']) else {
- Err(self.error(String::from("end-of-file inside string")))?
- };
- let (string, rest) = s.split_at(len);
- let Some(rest) = rest.strip_prefix('"') else {
- Err(self.error(format!("new-line inside string ({string}...{rest})")))?
- };
- (Token::String(string.into()), rest)
- }
- ';' => (Token::Semicolon, iter.as_str()),
- '*' => (Token::Asterisk, iter.as_str()),
- '+' => (Token::Plus, iter.as_str()),
- '(' => (Token::LParen, iter.as_str()),
- ')' => (Token::RParen, iter.as_str()),
- c if c.is_alphabetic() || c == '@' || c == '_' => {
- let len = s
- .find(|c: char| {
- !(c.is_ascii_digit()
- || c.is_alphabetic()
- || c == '@'
- || c == '.'
- || c == '_')
- })
- .unwrap_or(s.len());
- let (s, rest) = s.split_at(len);
- if let Some(rest) = rest.strip_prefix(':') {
- (Token::Label(s.into()), rest)
- } else if let Some(name) = s.strip_prefix('@') {
- (Token::At(name.into()), rest)
- } else if let Some(count) = s.strip_prefix('s') {
- let token =
- Token::S(count.parse().map_err(|msg| {
- self.error(format!("bad counted string '{s}' ({msg})"))
- })?);
- (token, rest)
- } else {
- let token = match s {
- "i8" => Token::I8,
- "i16" => Token::I16,
- "i64" => Token::I64,
- "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)),
- "PCSYSMIS" => Token::PcSysmis,
- "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()),
- "HIGHEST" => Token::Float(f64::MAX.into()),
- "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
- "COUNT" => Token::Count,
- "COUNT8" => Token::Count8,
- "hex" => Token::Hex,
- _ => Err(self.error(format!("invalid token '{s}'")))?,
- };
- (token, rest)
- }
- }
- _ => Err(self.error(format!("invalid input byte '{c}'")))?,
- };
- self.input = rest;
- let repr = &start[..start.len() - rest.len()];
- println!("{token:?} {repr}");
- Ok(Some((token, repr)))
- }
-}
-
// Smoke tests: each one assembles a sample input and prints a hex dump of the
// result.  They check only that assembly succeeds, not the bytes produced.
#[cfg(test)]
mod test {
    use crate::endian::Endian;
    use crate::sack::sack;
    use anyhow::Result;
    use hexplay::HexView;

    // Exercises strings, padded strings, integers, floats, `i8` runs, and
    // the `*` repeat count.
    #[test]
    fn basic_sack() -> Result<()> {
        let input = r#"
"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
2; # Layout code
28; # Nominal case size
0; # Not compressed
0; # Not weighted
1; # 1 case.
100.0; # Bias.
"01 Jan 11"; "20:53:52";
"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
i8 0 *3;
"#;
        let output = sack(input, None, Endian::Big)?;
        HexView::new(&output).print()?;
        Ok(())
    }

    // Exercises labels, forward `@label` references with offset arithmetic,
    // parenthesized groups, `COUNT8`, and `PCSYSMIS`.
    #[test]
    fn pcp_sack() -> Result<()> {
        let input = r#"
# File header.
2; 0;
@MAIN; @MAIN_END - @MAIN;
@VARS; @VARS_END - @VARS;
@LABELS; @LABELS_END - @LABELS;
@DATA; @DATA_END - @DATA;
(0; 0) * 11;
i8 0 * 128;

MAIN:
 i16 1; # Fixed.
 s62 "PCSPSS PSPP synthetic test product";
 PCSYSMIS;
 0; 0; i16 1; # Fixed.
 i16 0;
 i16 15;
 1;
 i16 0; # Fixed.
 1;
 s8 "11/28/14";
 s8 "15:11:00";
 s64 "PSPP synthetic test file";
MAIN_END:

VARS:
 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS;
 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS;
 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS;

 # Numeric variable, no label or missing values.
 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS;

 # Numeric variable, variable label.
 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS;

 # Numeric variable with missing value.
 0; 0; 0; 0x050800; s8 "NUM3"; 1.0;

 # Numeric variable, variable label and missing value.
 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0;

 # String variable, no label or missing values.
 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS;

 # String variable, variable label.
 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS;

 # String variable with missing value.
 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS";

 # String variable, variable label and missing value.
 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR";

 # Long string variable
 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS;
 0 * 8;

 # Long string variable with variable label
 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS;
 0 * 8;
VARS_END:

LABELS:
 3; i8 0 0 0; LABELS_OFS: i8 0;
 NUM2_LABEL: COUNT8("Numeric variable 2's label");
 NUM4_LABEL: COUNT8("Another numeric variable label");
 STR2_LABEL: COUNT8("STR2's variable label");
 STR4_LABEL: COUNT8("STR4's variable label");
 STR6_LABEL: COUNT8("Another string variable's label");
LABELS_END:

DATA:
 0.0; "11/28/14"; 1.0;
 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r";
 s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM";
DATA_END:
"#;
        let output = sack(input, None, Endian::Big)?;
        HexView::new(&output).print()?;
        Ok(())
    }
}
impl Settings {
    /// Returns the shared global settings, initializing them with
    /// [`Settings::default`] on first use.
    pub fn global() -> &'static Settings {
        static GLOBAL: OnceLock<Settings> = OnceLock::new();
        GLOBAL.get_or_init(Settings::default)
    }
}
--- /dev/null
+use core::str;
+use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
+
+use crate::{
+ dictionary::{
+ Dictionary, InvalidRole, MultipleResponseSet, MultipleResponseType, Value, VarWidth,
+ Variable, VariableSet,
+ },
+ endian::Endian,
+ format::{Error as FormatError, Format, UncheckedFormat},
+ identifier::{ByIdentifier, Error as IdError, Identifier},
+ sys::encoding::Error as EncodingError,
+ sys::raw::{
+ self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
+ FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord,
+ LongStringMissingValueRecord, LongStringValueLabelRecord, MissingValues,
+ MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawStrArray, RawWidth,
+ ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributeRecord, VariableRecord,
+ VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer,
+ },
+};
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use encoding_rs::Encoding;
+use indexmap::set::MutableValues;
+use thiserror::Error as ThisError;
+
+pub use crate::sys::raw::{CategoryLabels, Compression};
+
+/// Problems detected while turning decoded system-file records into a
+/// dictionary.
+///
+/// In this file most variants are handed to a caller-supplied `warn` callback
+/// and decoding continues; `MissingHeaderRecord` is returned as `Err` by
+/// `Headers::new`, and the multiple-response variants are returned as `Err`
+/// by the multiple response decoders and then forwarded to `warn` by
+/// `decode`.
+#[derive(ThisError, Debug)]
+pub enum Error {
+ #[error("Missing header record")]
+ MissingHeaderRecord,
+
+ // XXX this is an internal error
+ #[error("More than one file header record")]
+ DuplicateHeaderRecord,
+
+ #[error("{0}")]
+ EncodingError(EncodingError),
+
+ #[error("Using default encoding {0}.")]
+ UsingDefaultEncoding(String),
+
+ #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
+ InvalidVariableWidth { offsets: Range<u64>, width: i32 },
+
+ #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
+ InvalidLongMissingValueFormat,
+
+ #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
+ InvalidCreationDate { creation_date: String },
+
+ #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
+ InvalidCreationTime { creation_time: String },
+
+ #[error("{id_error} Renaming variable to {new_name}.")]
+ InvalidVariableName {
+ id_error: IdError,
+ new_name: Identifier,
+ },
+
+ #[error(
+ "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
+ )]
+ InvalidPrintFormat {
+ new_spec: Format,
+ variable: Identifier,
+ format_error: FormatError,
+ },
+
+ #[error(
+ "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
+ )]
+ InvalidWriteFormat {
+ new_spec: Format,
+ variable: Identifier,
+ format_error: FormatError,
+ },
+
+ #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
+ DuplicateVariableName {
+ duplicate_name: Identifier,
+ new_name: Identifier,
+ },
+
+ #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
+ InvalidDictIndex { dict_index: usize, max_index: usize },
+
+ #[error("Dictionary index {0} refers to a long string continuation.")]
+ DictIndexIsContinuation(usize),
+
+ #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")]
+ LongStringContinuationIndexes { offset: u64, indexes: Vec<u32> },
+
+ #[error(
+ "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end
+ )]
+ InvalidLongStringValueLabels {
+ offsets: Range<u64>,
+ variables: Vec<Identifier>,
+ },
+
+ #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
+ ValueLabelsDifferentTypes {
+ numeric_var: Identifier,
+ string_var: Identifier,
+ },
+
+ #[error("Invalid multiple response set name. {0}")]
+ InvalidMrSetName(IdError),
+
+ #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
+ UnknownMrSetVariable {
+ mr_set: Identifier,
+ short_name: Identifier,
+ },
+
+ #[error("Multiple response set {0} has no variables.")]
+ EmptyMrSet(Identifier),
+
+ #[error("Multiple response set {0} has only one variable.")]
+ OneVarMrSet(Identifier),
+
+ #[error("Multiple response set {0} contains both string and numeric variables.")]
+ MixedMrSet(Identifier),
+
+ #[error(
+ "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
+ )]
+ InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
+
+ #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
+ TooWideMDGroupCountedValue {
+ mr_set: Identifier,
+ value: String,
+ width: usize,
+ max_width: u16,
+ },
+
+ #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
+ InvalidLongValueLabelWidth {
+ name: Identifier,
+ width: u32,
+ min_width: u16,
+ max_width: u16,
+ },
+
+ #[error("Invalid attribute name. {0}")]
+ InvalidAttributeName(IdError),
+
+ #[error("Invalid short name in long variable name record. {0}")]
+ InvalidShortName(IdError),
+
+ #[error("Invalid name in long variable name record. {0}")]
+ InvalidLongName(IdError),
+
+ #[error("Invalid variable name in very long string record. {0}")]
+ InvalidLongStringName(IdError),
+
+ #[error("Invalid variable name in long string value label record. {0}")]
+ InvalidLongStringValueLabelName(IdError),
+
+ #[error("Invalid variable name in attribute record. {0}")]
+ InvalidAttributeVariableName(IdError),
+
+ // XXX This is risky because `text` might be arbitrarily long.
+ #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+ MalformedString { encoding: String, text: String },
+
+ // Placeholder reported wherever a specific diagnostic has not been written
+ // yet; it carries no detail about what went wrong.
+ #[error("Details TBD")]
+ TBD,
+}
+
+/// Decoded system-file records grouped by record type.
+///
+/// Built by `Headers::new`. Record types for which `Headers::new` keeps only
+/// the first occurrence are stored as `Option` (or, for `header`, as a plain
+/// value because it is mandatory); record types that are accumulated are
+/// stored as `Vec`s in the order they were supplied.
+#[derive(Clone, Debug)]
+pub struct Headers {
+ pub header: HeaderRecord<String>,
+ pub variable: Vec<VariableRecord<String>>,
+ pub value_label: Vec<ValueLabelRecord<RawStrArray<8>, String>>,
+ pub document: Vec<DocumentRecord<String>>,
+ pub integer_info: Option<IntegerInfoRecord>,
+ pub float_info: Option<FloatInfoRecord>,
+ pub var_display: Option<VarDisplayRecord>,
+ pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
+ pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
+ pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier>>,
+ pub encoding: Option<EncodingRecord>,
+ pub number_of_cases: Option<NumberOfCasesRecord>,
+ pub variable_sets: Vec<VariableSetRecord>,
+ pub product_info: Option<ProductInfoRecord>,
+ pub long_names: Vec<LongNamesRecord>,
+ pub very_long_strings: Vec<VeryLongStringsRecord>,
+ pub file_attributes: Vec<FileAttributeRecord>,
+ pub variable_attributes: Vec<VariableAttributeRecord>,
+ pub other_extension: Vec<Extension>,
+ pub end_of_headers: Option<u32>,
+ pub z_header: Option<ZHeader>,
+ pub z_trailer: Option<ZTrailer>,
+ pub cases: Option<Rc<RefCell<Cases>>>,
+}
+
+/// Returns the first element of `vec`, or `None` if `vec` is empty.
+///
+/// When `vec` holds more than one element, `more_than_one` is invoked once
+/// (before anything is removed) so the caller can report the surplus; the
+/// extra elements are then discarded.
+fn take_first<T, F>(vec: Vec<T>, more_than_one: F) -> Option<T>
+where
+    F: FnOnce(),
+{
+    let has_surplus = vec.len() > 1;
+    if has_surplus {
+        more_than_one();
+    }
+    vec.into_iter().next()
+}
+
+impl Headers {
+ /// Sorts `headers` into per-record-type buckets and builds a `Headers`.
+ ///
+ /// Returns `Err(Error::MissingHeaderRecord)` when no file header record is
+ /// present. For record types stored as a single value, only the first
+ /// occurrence is kept; if more than one was supplied, `warn` is called
+ /// (`Error::DuplicateHeaderRecord` for the file header, the placeholder
+ /// `Error::TBD` for the others).
+ pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
+ // Every record type is first collected into a `Vec`, even those that
+ // end up as a single value, so duplicates can be detected below.
+ let mut file_header = Vec::new();
+ let mut variable = Vec::new();
+ let mut value_label = Vec::new();
+ let mut document = Vec::new();
+ let mut integer_info = Vec::new();
+ let mut float_info = Vec::new();
+ let mut var_display = Vec::new();
+ let mut multiple_response = Vec::new();
+ let mut long_string_value_labels = Vec::new();
+ let mut long_string_missing_values = Vec::new();
+ let mut encoding = Vec::new();
+ let mut number_of_cases = Vec::new();
+ let mut variable_sets = Vec::new();
+ let mut product_info = Vec::new();
+ let mut long_names = Vec::new();
+ let mut very_long_strings = Vec::new();
+ let mut file_attributes = Vec::new();
+ let mut variable_attributes = Vec::new();
+ let mut other_extension = Vec::new();
+ let mut end_of_headers = Vec::new();
+ let mut z_header = Vec::new();
+ let mut z_trailer = Vec::new();
+ let mut cases = Vec::new();
+
+ // The match has no catch-all arm, so it must name every
+ // `DecodedRecord` variant.
+ for header in headers {
+ match header {
+ DecodedRecord::Header(record) => {
+ file_header.push(record);
+ }
+ DecodedRecord::Variable(record) => {
+ variable.push(record);
+ }
+ DecodedRecord::ValueLabel(record) => {
+ value_label.push(record);
+ }
+ DecodedRecord::Document(record) => {
+ document.push(record);
+ }
+ DecodedRecord::IntegerInfo(record) => {
+ integer_info.push(record);
+ }
+ DecodedRecord::FloatInfo(record) => {
+ float_info.push(record);
+ }
+ DecodedRecord::VariableSets(record) => {
+ variable_sets.push(record);
+ }
+ DecodedRecord::VarDisplay(record) => {
+ var_display.push(record);
+ }
+ DecodedRecord::MultipleResponse(record) => {
+ multiple_response.push(record);
+ }
+ DecodedRecord::LongStringValueLabels(record) => {
+ long_string_value_labels.push(record)
+ }
+ DecodedRecord::LongStringMissingValues(record) => {
+ long_string_missing_values.push(record);
+ }
+ DecodedRecord::Encoding(record) => {
+ encoding.push(record);
+ }
+ DecodedRecord::NumberOfCases(record) => {
+ number_of_cases.push(record);
+ }
+ DecodedRecord::ProductInfo(record) => {
+ product_info.push(record);
+ }
+ DecodedRecord::LongNames(record) => {
+ long_names.push(record);
+ }
+ DecodedRecord::VeryLongStrings(record) => {
+ very_long_strings.push(record);
+ }
+ DecodedRecord::FileAttributes(record) => {
+ file_attributes.push(record);
+ }
+ DecodedRecord::VariableAttributes(record) => {
+ variable_attributes.push(record);
+ }
+ DecodedRecord::OtherExtension(record) => {
+ other_extension.push(record);
+ }
+ DecodedRecord::EndOfHeaders(record) => {
+ end_of_headers.push(record);
+ }
+ DecodedRecord::ZHeader(record) => {
+ z_header.push(record);
+ }
+ DecodedRecord::ZTrailer(record) => {
+ z_trailer.push(record);
+ }
+ DecodedRecord::Cases(record) => {
+ cases.push(record);
+ }
+ }
+ }
+
+ // The file header is the only mandatory record.
+ let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
+ else {
+ return Err(Error::MissingHeaderRecord);
+ };
+
+ Ok(Headers {
+ header: file_header,
+ variable,
+ value_label,
+ document,
+ integer_info: take_first(integer_info, || warn(Error::TBD)),
+ float_info: take_first(float_info, || warn(Error::TBD)),
+ var_display: take_first(var_display, || warn(Error::TBD)),
+ multiple_response,
+ long_string_value_labels,
+ long_string_missing_values,
+ encoding: take_first(encoding, || warn(Error::TBD)),
+ number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
+ variable_sets,
+ product_info: take_first(product_info, || warn(Error::TBD)),
+ long_names,
+ very_long_strings,
+ file_attributes,
+ variable_attributes,
+ other_extension,
+ end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
+ z_header: take_first(z_header, || warn(Error::TBD)),
+ z_trailer: take_first(z_trailer, || warn(Error::TBD)),
+ cases: take_first(cases, || warn(Error::TBD)),
+ })
+ }
+}
+
+/// File-level information extracted from the headers by `Metadata::decode`,
+/// returned alongside the dictionary by `decode`.
+#[derive(Debug)]
+pub struct Metadata {
+ // Creation date and time combined from the file header's two text fields.
+ pub creation: NaiveDateTime,
+ pub endian: Endian,
+ pub compression: Option<Compression>,
+ pub n_cases: Option<u64>,
+ // The header's eye-catcher text with the leading "@(#) SPSS DATA FILE" and
+ // trailing whitespace removed.
+ pub product: String,
+ // Text of the product info record, if the file had one.
+ pub product_ext: Option<String>,
+ // Version triple taken from the integer info record, if the file had one.
+ pub version: Option<(i32, i32, i32)>,
+}
+
+impl Metadata {
+    /// Extracts file-level metadata from `headers`.
+    ///
+    /// A creation date or time that cannot be parsed is reported through
+    /// `warn` and replaced by the type's default value (the accompanying
+    /// messages describe these as 01 Jan 1970 and midnight).
+    fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
+        let header = &headers.header;
+        // The date field carries a two-digit year ("DD MMM YY", as the
+        // warning text says), so it must be parsed with `%y`. `%Y` accepts
+        // one to four digits and would silently read "14" as the year 0014.
+        let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %y")
+            .unwrap_or_else(|_| {
+                warn(Error::InvalidCreationDate {
+                    creation_date: header.creation_date.to_string(),
+                });
+                Default::default()
+            });
+        let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
+            .unwrap_or_else(|_| {
+                warn(Error::InvalidCreationTime {
+                    creation_time: header.creation_time.to_string(),
+                });
+                Default::default()
+            });
+        let creation = NaiveDateTime::new(creation_date, creation_time);
+
+        // Strip the fixed prefix and the padding from the eye-catcher; what
+        // remains is used as the product name.
+        let product = header
+            .eye_catcher
+            .trim_start_matches("@(#) SPSS DATA FILE")
+            .trim_end()
+            .to_string();
+
+        Self {
+            creation,
+            endian: header.endian,
+            compression: header.compression,
+            n_cases: header.n_cases.map(|n| n as u64),
+            product,
+            product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
+            version: headers.integer_info.as_ref().map(|ii| ii.version),
+        }
+    }
+}
+
+/// State used by `decode` for inventing replacement variable names.
+struct Decoder {
+ // Encoding passed to `Identifier::from_encoding` for generated names.
+ pub encoding: &'static Encoding,
+ // Serial number of the most recently generated name; 0 before the first.
+ n_generated_names: usize,
+}
+
+impl Decoder {
+    /// Produces a variable name of the form `VARnnn` that is not yet used in
+    /// `dictionary`, bumping the internal serial number until a free name is
+    /// found.
+    ///
+    /// Panics if a generated name is rejected by `Identifier::from_encoding`
+    /// or if the serial number reaches `usize::MAX` without finding a free
+    /// name.
+    fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
+        loop {
+            self.n_generated_names += 1;
+            let serial = self.n_generated_names;
+            let candidate =
+                Identifier::from_encoding(format!("VAR{serial:03}"), self.encoding).unwrap();
+            if dictionary.variables.contains(&candidate.0) {
+                // Name is taken; make sure the next increment cannot overflow.
+                assert!(self.n_generated_names < usize::MAX);
+                continue;
+            }
+            break candidate;
+        }
+    }
+}
+
+/// Builds a `Dictionary` and `Metadata` from `headers`, using `encoding` for
+/// the dictionary and for identifiers created along the way.
+///
+/// Recoverable problems are reported through `warn` and decoding continues.
+/// No path in this function returns `Err`: malformed input is either warned
+/// about or runs into one of the `unwrap`/`unreachable!`/`assert` assumptions
+/// noted in the body, which panic.
+pub fn decode(
+ mut headers: Headers,
+ encoding: &'static Encoding,
+ warn: impl Fn(Error),
+) -> Result<(Dictionary, Metadata), Error> {
+ let mut dictionary = Dictionary::new(encoding);
+
+ let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
+ if !file_label.is_empty() {
+ dictionary.file_label = Some(file_label);
+ }
+
+ for mut attributes in headers.file_attributes.drain(..) {
+ dictionary.attributes.append(&mut attributes.0)
+ }
+
+ // Concatenate all the document records (really there should only be one)
+ // and trim off the trailing spaces that pad them to 80 bytes.
+ dictionary.documents = headers
+ .document
+ .drain(..)
+ .flat_map(|record| record.lines)
+ .map(trim_end_spaces)
+ .collect();
+
+ // XXX warn for weird integer format
+ // XXX warn for weird floating-point format, etc.
+
+ let mut decoder = Decoder {
+ encoding,
+ n_generated_names: 0,
+ };
+
+ // Maps a variable's 0-based "value index" (its position counting every
+ // slot reported by `n_values()`, continuations included) to its index in
+ // the dictionary. Continuation records themselves get no entry.
+ let mut var_index_map = HashMap::new();
+ let mut value_index = 0;
+ for (index, input) in headers
+ .variable
+ .iter()
+ .enumerate()
+ .filter(|(_index, record)| record.width != RawWidth::Continuation)
+ {
+ // Pick the variable's name, substituting a generated one if the name
+ // in the record is invalid or already taken.
+ let name = trim_end_spaces(input.name.to_string());
+ let name = match Identifier::from_encoding(name, encoding) {
+ Ok(name) => {
+ if !dictionary.variables.contains(&name.0) {
+ name
+ } else {
+ let new_name = decoder.generate_name(&dictionary);
+ warn(Error::DuplicateVariableName {
+ duplicate_name: name.clone(),
+ new_name: new_name.clone(),
+ });
+ new_name
+ }
+ }
+ Err(id_error) => {
+ let new_name = decoder.generate_name(&dictionary);
+ warn(Error::InvalidVariableName {
+ id_error,
+ new_name: new_name.clone(),
+ });
+ new_name
+ }
+ };
+ // NOTE(review): the `unwrap` presumably cannot fail because
+ // continuation records were filtered out above -- confirm against
+ // `VarWidth::try_from`.
+ let mut variable = Variable::new(name.clone(), VarWidth::try_from(input.width).unwrap());
+
+ // Set the short name the same as the long name (even if we renamed it).
+ variable.short_names = vec![name];
+
+ variable.label = input.label.clone();
+
+ variable.missing_values = input.missing_values.clone();
+
+ // An unusable print or write format is replaced by the result of
+ // `decode_format`, which also triggers the warning closure.
+ variable.print_format = decode_format(
+ input.print_format,
+ variable.width,
+ |new_spec, format_error| {
+ warn(Error::InvalidPrintFormat {
+ new_spec,
+ variable: variable.name.clone(),
+ format_error,
+ })
+ },
+ );
+ variable.write_format = decode_format(
+ input.write_format,
+ variable.width,
+ |new_spec, format_error| {
+ warn(Error::InvalidWriteFormat {
+ new_spec,
+ variable: variable.name.clone(),
+ format_error,
+ })
+ },
+ );
+
+ // Check for long string continuation records.
+ // Each of the `n_values - 1` records following this one must be a
+ // continuation; warn once and stop checking at the first that is not.
+ let n_values = input.width.n_values().unwrap();
+ for offset in 1..n_values {
+ if headers
+ .variable
+ .get(index + offset)
+ .is_none_or(|record| record.width != RawWidth::Continuation)
+ {
+ warn(Error::TBD);
+ break;
+ }
+ }
+
+ let dict_index = dictionary.add_var(variable).unwrap();
+ assert_eq!(var_index_map.insert(value_index, dict_index), None);
+ value_index += n_values;
+ }
+
+ // The header's weight index is converted to a 0-based value index with
+ // `- 1` before the lookup.
+ // NOTE(review): the subtraction would underflow for a weight index of 0;
+ // presumably that is excluded when the header is read -- confirm.
+ if let Some(weight_index) = headers.header.weight_index {
+ if let Some(dict_index) = var_index_map.get(&(weight_index as usize - 1)) {
+ let variable = &dictionary.variables[*dict_index];
+ if variable.is_numeric() {
+ dictionary.weight = Some(*dict_index);
+ } else {
+ warn(Error::TBD);
+ }
+ } else {
+ warn(Error::TBD);
+ }
+ }
+
+ // Value labels from record types 3 and 4. Long string variables are
+ // reported and skipped; every other listed variable receives the labels.
+ for record in headers.value_label.drain(..) {
+ let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len());
+ let mut long_string_variables = Vec::new();
+ for value_index in record.dict_indexes.iter() {
+ // NOTE(review): `unreachable!()` assumes every index here was
+ // already validated before reaching this function (and is
+ // nonzero, given the `- 1`) -- confirm against the raw reader.
+ let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) else {
+ unreachable!()
+ };
+ let variable = &dictionary.variables[*dict_index];
+ if variable.width.is_long_string() {
+ long_string_variables.push(variable.name.clone());
+ } else {
+ dict_indexes.push(*dict_index);
+ }
+ }
+ if !long_string_variables.is_empty() {
+ warn(Error::InvalidLongStringValueLabels {
+ offsets: record.offsets.clone(),
+ variables: long_string_variables,
+ });
+ }
+
+ for dict_index in dict_indexes {
+ let variable = dictionary.variables.get_index_mut2(dict_index).unwrap();
+ for ValueLabel { value, label } in record.labels.iter().cloned() {
+ let value = value.decode(variable.width);
+ variable.value_labels.insert(value, label);
+ }
+ }
+ }
+
+ // Display settings are matched to variables by position: the n-th entry
+ // applies to the variable at dictionary index n. Entries beyond the last
+ // variable produce a warning each.
+ if let Some(display) = &headers.var_display {
+ for (index, display) in display.0.iter().enumerate() {
+ if let Some(variable) = dictionary.variables.get_index_mut2(index) {
+ if let Some(width) = display.width {
+ variable.display_width = width;
+ }
+ if let Some(alignment) = display.alignment {
+ variable.alignment = alignment;
+ }
+ if let Some(measure) = display.measure {
+ variable.measure = Some(measure);
+ }
+ } else {
+ warn(Error::TBD);
+ }
+ }
+ }
+
+ // Multiple response sets; a set that fails to decode is reported and
+ // dropped.
+ for record in headers
+ .multiple_response
+ .iter()
+ .flat_map(|record| record.0.iter())
+ {
+ match MultipleResponseSet::decode(&dictionary, record, &warn) {
+ Ok(mrset) => {
+ dictionary.mrsets.insert(ByIdentifier::new(mrset));
+ }
+ Err(error) => warn(error),
+ }
+ }
+
+ // Very long strings: the named variable and the segment variables that
+ // follow it are merged into a single variable of the recorded length.
+ 'outer: for record in headers
+ .very_long_strings
+ .drain(..)
+ .flat_map(|record| record.0.into_iter())
+ {
+ let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else {
+ warn(Error::TBD);
+ continue;
+ };
+ let width = VarWidth::String(record.length);
+ let n_segments = width.n_segments();
+ if n_segments == 1 {
+ warn(Error::TBD);
+ continue;
+ }
+ if index + n_segments > dictionary.variables.len() {
+ warn(Error::TBD);
+ continue;
+ }
+ // Collect each segment's short name and verify that its width,
+ // rounded up to a multiple of 8, matches what the merged width
+ // expects; a mismatch abandons this record.
+ let mut short_names = Vec::with_capacity(n_segments);
+ for i in 0..n_segments {
+ let alloc_width = width.segment_alloc_width(i);
+ let segment = &dictionary.variables[index + i];
+ short_names.push(segment.short_names[0].clone());
+ let segment_width = segment.width.as_string_width().unwrap_or(0);
+ if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) {
+ warn(Error::TBD);
+ continue 'outer;
+ }
+ }
+ // Drop the trailing segments and widen the first one.
+ dictionary.delete_vars(index + 1..index + n_segments);
+ let variable = dictionary.variables.get_index_mut2(index).unwrap();
+ variable.short_names = short_names;
+ variable.width = width;
+ }
+
+ if headers.long_names.is_empty() {
+ // There are no long variable names. Use the short variable names,
+ // converted to lowercase, as the long variable names.
+ for index in 0..dictionary.variables.len() {
+ let lower = dictionary.variables[index].name.0.as_ref().to_lowercase();
+ if let Ok(new_name) = Identifier::from_encoding(lower, dictionary.encoding) {
+ dictionary.try_rename_var(index, new_name);
+ }
+ }
+ } else {
+ // Rename each of the variables, one by one. (In a correctly
+ // constructed system file, this cannot create any intermediate
+ // duplicate variable names, because all of the new variable names are
+ // longer than any of the old variable names and thus there cannot be
+ // any overlaps.)
+ for renaming in headers
+ .long_names
+ .iter()
+ .flat_map(|record| record.0.iter().cloned())
+ {
+ let LongName {
+ short_name,
+ long_name,
+ } = renaming;
+ if let Some(index) = dictionary.variables.get_index_of(&short_name.0) {
+ dictionary.try_rename_var(index, long_name);
+ dictionary
+ .variables
+ .get_index_mut2(index)
+ .unwrap()
+ .short_names = vec![short_name];
+ } else {
+ warn(Error::TBD);
+ }
+ }
+ }
+
+ // Variable attributes are looked up by long variable name, so this must
+ // run after the renaming above.
+ for mut attr_set in headers
+ .variable_attributes
+ .drain(..)
+ .flat_map(|record| record.0.into_iter())
+ {
+ if let Some((_, variable)) = dictionary
+ .variables
+ .get_full_mut2(&attr_set.long_var_name.0)
+ {
+ variable.attributes.append(&mut attr_set.attributes);
+ } else {
+ warn(Error::TBD);
+ }
+ }
+
+ // Assign variable roles.
+ for index in 0..dictionary.variables.len() {
+ let variable = dictionary.variables.get_index_mut2(index).unwrap();
+ match variable.attributes.role() {
+ Ok(role) => variable.role = role,
+ Err(InvalidRole) => warn(Error::TBD),
+ }
+ }
+
+ // Long string value labels.
+ for record in headers
+ .long_string_value_labels
+ .drain(..)
+ .flat_map(|record| record.0.into_iter())
+ {
+ let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else {
+ warn(Error::TBD);
+ continue;
+ };
+ let Some(width) = variable.width.as_string_width() else {
+ warn(Error::TBD);
+ continue;
+ };
+ for (mut value, label) in record.labels.into_iter() {
+ // XXX warn about too-long value?
+ // Pad with spaces (or truncate) to exactly the variable's width.
+ value.0.resize(width, b' ');
+ // XXX warn about duplicate value labels?
+ variable.value_labels.insert(Value::String(value), label);
+ }
+ }
+
+ // Long string missing values. `value` is a scratch buffer reused for each
+ // missing value so that it can be resized to the variable's width.
+ let mut value = Vec::new();
+ for record in headers
+ .long_string_missing_values
+ .drain(..)
+ .flat_map(|record| record.0.into_iter())
+ {
+ let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else {
+ warn(Error::TBD);
+ continue;
+ };
+ // NOTE(review): the `unwrap` below panics if the named variable is
+ // numeric; unlike the value-label loop above there is no warning path
+ // for that case -- confirm it is ruled out earlier.
+ let values = record
+ .missing_values
+ .into_iter()
+ .map(|v| {
+ value.clear();
+ value.extend_from_slice(v.0.as_slice());
+ value.resize(variable.width.as_string_width().unwrap(), b' ');
+ Value::String(Box::from(value.as_slice()))
+ })
+ .collect::<Vec<_>>();
+ variable.missing_values = MissingValues {
+ values,
+ range: None,
+ };
+ }
+
+ // Variable sets: unknown member names are warned about and skipped, and a
+ // set left with no members is not added.
+ for record in headers
+ .variable_sets
+ .drain(..)
+ .flat_map(|record| record.sets.into_iter())
+ {
+ let mut variables = Vec::with_capacity(record.variable_names.len());
+ for variable_name in record.variable_names {
+ let Some((dict_index, _)) = dictionary.variables.get_full_mut2(&variable_name.0) else {
+ warn(Error::TBD);
+ continue;
+ };
+ variables.push(dict_index);
+ }
+ if !variables.is_empty() {
+ let variable_set = VariableSet {
+ name: record.name,
+ variables,
+ };
+ dictionary
+ .variable_sets
+ .insert(ByIdentifier::new(variable_set));
+ }
+ }
+
+ let metadata = Metadata::decode(&headers, warn);
+ Ok((dictionary, metadata))
+}
+
+impl MultipleResponseSet {
+ /// Converts the raw multiple response set `input` into a dictionary-level
+ /// set whose variables are dictionary indexes.
+ ///
+ /// Member names not found in `dictionary` are reported through `warn` and
+ /// left out. Returns `Err` when fewer than two variables remain, when the
+ /// width reduction below yields no common width (reported as
+ /// `Error::MixedMrSet`), or when the set's type fails to decode.
+ fn decode(
+ dictionary: &Dictionary,
+ input: &raw::MultipleResponseSet<Identifier, String>,
+ warn: &impl Fn(Error),
+ ) -> Result<Self, Error> {
+ let mr_set_name = input.name.clone();
+ let mut variables = Vec::with_capacity(input.short_names.len());
+ for short_name in input.short_names.iter() {
+ let Some(dict_index) = dictionary.variables.get_index_of(&short_name.0) else {
+ warn(Error::UnknownMrSetVariable {
+ mr_set: mr_set_name.clone(),
+ short_name: short_name.clone(),
+ });
+ continue;
+ };
+ variables.push(dict_index);
+ }
+
+ match variables.len() {
+ 0 => return Err(Error::EmptyMrSet(mr_set_name)),
+ 1 => return Err(Error::OneVarMrSet(mr_set_name)),
+ _ => (),
+ }
+
+ // Fold the member widths into a (narrowest, widest) pair.
+ // NOTE(review): presumably `VarWidth::narrower`/`wider` return `None`
+ // when numeric and string widths are mixed, which is what routes to
+ // `MixedMrSet` -- confirm against their definitions.
+ let Some((Some(min_width), Some(max_width))) = variables
+ .iter()
+ .copied()
+ .map(|dict_index| dictionary.variables[dict_index].width)
+ .map(|w| (Some(w), Some(w)))
+ .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
+ else {
+ return Err(Error::MixedMrSet(mr_set_name));
+ };
+
+ let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, min_width)?;
+
+ Ok(MultipleResponseSet {
+ name: mr_set_name,
+ width: min_width..=max_width,
+ label: input.label.to_string(),
+ mr_type,
+ variables,
+ })
+ }
+}
+
+/// Removes trailing ASCII spaces (and only spaces) from `s` in place and
+/// returns it, reusing the original allocation.
+fn trim_end_spaces(mut s: String) -> String {
+    let kept_len = s.trim_end_matches(' ').len();
+    s.truncate(kept_len);
+    s
+}
+
+/// Returns a copy of `s` with every CR LF pair and every lone CR turned into
+/// a single LF. Existing lone LFs are kept unchanged.
+///
+/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
+/// files that use CR-only line ends in the file label and extra product info.)
+fn fix_line_ends(s: &str) -> String {
+    let mut fixed = String::with_capacity(s.len());
+    let mut chars = s.chars().peekable();
+    while let Some(ch) = chars.next() {
+        if ch != '\r' {
+            fixed.push(ch);
+            continue;
+        }
+        // A CR becomes LF; if an LF follows immediately, swallow it so the
+        // pair collapses into one line end.
+        if chars.peek() == Some(&'\n') {
+            chars.next();
+        }
+        fixed.push('\n');
+    }
+    fixed
+}
+
+/// Converts the raw format `raw` into a `Format` compatible with `width`.
+///
+/// If any conversion or compatibility step fails, `warn` is called with the
+/// substitute format and the error, and the substitute
+/// (`Format::default_for_width(width)`) is returned instead.
+fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format {
+    let checked = UncheckedFormat::try_from(raw)
+        .and_then(Format::try_from)
+        .and_then(|format| format.check_width_compatibility(width));
+    match checked {
+        Ok(format) => format,
+        Err(error) => {
+            let fallback = Format::default_for_width(width);
+            warn(fallback, error);
+            fallback
+        }
+    }
+}
+
+impl MultipleResponseType {
+    /// Converts the raw multiple response type `input` for the set named
+    /// `mr_set`, whose narrowest variable has width `min_width`.
+    ///
+    /// For a multiple dichotomy set the counted value is interpreted
+    /// according to `min_width`: parsed as a number for numeric sets, or
+    /// taken as bytes with trailing spaces removed for string sets. Returns
+    /// `Err` if the number does not parse or the trimmed string is wider than
+    /// `min_width`.
+    fn decode(
+        mr_set: &Identifier,
+        input: &raw::MultipleResponseType,
+        min_width: VarWidth,
+    ) -> Result<Self, Error> {
+        let (value, labels) = match input {
+            raw::MultipleResponseType::MultipleCategory => {
+                return Ok(MultipleResponseType::MultipleCategory);
+            }
+            raw::MultipleResponseType::MultipleDichotomy { value, labels } => (value, labels),
+        };
+
+        let counted = match min_width {
+            VarWidth::Numeric => {
+                let text = String::from_utf8_lossy(&value.0);
+                let parsed = text.trim().parse::<f64>();
+                match parsed {
+                    Ok(number) => Value::Number(Some(number)),
+                    Err(_) => {
+                        return Err(Error::InvalidMDGroupCountedValue {
+                            mr_set: mr_set.clone(),
+                            number: text.into(),
+                        });
+                    }
+                }
+            }
+            VarWidth::String(max_width) => {
+                // Drop trailing spaces: keep everything up to and including
+                // the last byte that is not a space.
+                let bytes = value.0.as_slice();
+                let trimmed_len = bytes
+                    .iter()
+                    .rposition(|&byte| byte != b' ')
+                    .map_or(0, |last| last + 1);
+                let bytes = &bytes[..trimmed_len];
+                let width = bytes.len();
+                if width > max_width as usize {
+                    return Err(Error::TooWideMDGroupCountedValue {
+                        mr_set: mr_set.clone(),
+                        value: String::from_utf8_lossy(bytes).into(),
+                        width,
+                        max_width,
+                    });
+                }
+                Value::String(bytes.into())
+            }
+        };
+
+        Ok(MultipleResponseType::MultipleDichotomy {
+            value: counted,
+            labels: *labels,
+        })
+    }
+}
--- /dev/null
+use crate::locale_charset::locale_charset;
+use encoding_rs::{Encoding, UTF_8};
+
+include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
+
+/// Looks up the code page number for the encoding name `encoding` in
+/// `CODEPAGE_NAME_TO_NUMBER`, ignoring ASCII case. Returns `None` when the
+/// name is not in the table.
+pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
+    let key = encoding.to_ascii_lowercase();
+    CODEPAGE_NAME_TO_NUMBER.get(key.as_str()).copied()
+}
+
+use thiserror::Error as ThisError;
+
+/// Reasons `get_encoding` can fail to determine a system file's character
+/// encoding.
+#[derive(ThisError, Debug)]
+pub enum Error {
+ // Returned when neither an encoding name nor a usable character code is
+ // available (character codes 2 and 3 are deliberately treated this way).
+ #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
+ NoEncoding,
+
+ #[error("This system file encodes text strings with unknown code page {0}.")]
+ UnknownCodepage(i32),
+
+ #[error("This system file encodes text strings with unknown encoding {0}.")]
+ UnknownEncoding(String),
+
+ // Returned for character code 1.
+ #[error("This system file is encoded in EBCDIC, which is not supported.")]
+ Ebcdic,
+}
+
+/// Returns the process-wide default encoding: the one named by
+/// `locale_charset()`, or UTF-8 if that name is not a label `encoding_rs`
+/// recognizes.
+///
+/// The lookup runs once, on first use; later calls return the cached result.
+pub fn default_encoding() -> &'static Encoding {
+    // `OnceLock` from the standard library provides the same lazy,
+    // initialize-once behavior as `lazy_static!` without needing the macro.
+    static DEFAULT_ENCODING: std::sync::OnceLock<&'static Encoding> = std::sync::OnceLock::new();
+    DEFAULT_ENCODING
+        .get_or_init(|| Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8))
+}
+
+/// Determines the text encoding of a system file.
+///
+/// `encoding` is an encoding name and takes precedence when present;
+/// otherwise `character_code` is translated to a name. The resulting name is
+/// then resolved with `Encoding::for_label`.
+///
+/// # Errors
+///
+/// - `Error::Ebcdic` for character code 1.
+/// - `Error::NoEncoding` for character codes 2 and 3, or when both arguments
+///   are `None`.
+/// - `Error::UnknownCodepage` when the character code has no entry in
+///   `CODEPAGE_NUMBER_TO_NAME`.
+/// - `Error::UnknownEncoding` when the name is not a recognized label.
+pub fn get_encoding(
+    encoding: Option<&str>,
+    character_code: Option<i32>,
+) -> Result<&'static Encoding, Error> {
+    let label = if let Some(encoding) = encoding {
+        encoding
+    } else if let Some(codepage) = character_code {
+        match codepage {
+            1 => return Err(Error::Ebcdic),
+            2 | 3 => {
+                // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
+                // respectively. However, many files have character code 2 but
+                // data which are clearly not ASCII. Therefore, ignore these
+                // values.
+                return Err(Error::NoEncoding);
+            }
+            4 => "MS_KANJI",
+            _ => CODEPAGE_NUMBER_TO_NAME
+                .get(&codepage)
+                .copied()
+                .ok_or_else(|| Error::UnknownCodepage(codepage))?,
+        }
+    } else {
+        return Err(Error::NoEncoding);
+    };
+
+    // Build the error lazily so the label is only copied into a `String` when
+    // the lookup actually fails.
+    Encoding::for_label(label.as_bytes()).ok_or_else(|| Error::UnknownEncoding(label.into()))
+}
+
+/*
+#[cfg(test)]
+mod tests {
+ use std::thread::spawn;
+
+ use encoding_rs::{EUC_JP, UTF_8, WINDOWS_1252};
+
+ #[test]
+ fn round_trip() {
+ let mut threads = Vec::new();
+ for thread in 0..128 {
+ let start: u32 = thread << 25;
+ let end = start + ((1 << 25) - 1);
+ threads.push(spawn(move || {
+ for i in start..=end {
+ let s = i.to_le_bytes();
+ let (utf8, replacement) = EUC_JP.decode_without_bom_handling(&s);
+ if !replacement {
+ let s2 = UTF_8.encode(&utf8).0;
+ assert_eq!(s.as_slice(), &*s2);
+ }
+ }
+ }));
+ }
+ for thread in threads {
+ thread.join().unwrap();
+ }
+ }
+}
+*/
--- /dev/null
+pub mod cooked;
+pub mod encoding;
+pub mod raw;
+pub mod sack;
--- /dev/null
+use crate::{
+ dictionary::{Attributes, Value, VarWidth},
+ endian::{Endian, Parse, ToBytes},
+ identifier::{Error as IdError, Identifier},
+ sys::encoding::{default_encoding, get_encoding, Error as EncodingError},
+};
+
+use encoding_rs::{mem::decode_latin1, Encoding};
+use flate2::read::ZlibDecoder;
+use num::Integer;
+use std::{
+ borrow::Cow,
+ cell::RefCell,
+ collections::{HashMap, VecDeque},
+ fmt::{Debug, Display, Formatter, Result as FmtResult},
+ io::{Error as IoError, Read, Seek, SeekFrom},
+ mem::take,
+ num::NonZeroU8,
+ ops::Range,
+ rc::Rc,
+ str::from_utf8,
+};
+use thiserror::Error as ThisError;
+
+/// Errors for the raw system-file reader.
+///
+/// Most variants carry the file offset (and other details) that their message
+/// reports. `Io` wraps an underlying I/O error and can be created from one
+/// with `?` via `#[from]`.
+#[derive(ThisError, Debug)]
+pub enum Error {
+ #[error("Not an SPSS system file")]
+ NotASystemFile,
+
+ #[error("Invalid magic number {0:?}")]
+ BadMagic([u8; 4]),
+
+ #[error("I/O error ({0})")]
+ Io(#[from] IoError),
+
+ #[error("Invalid SAV compression code {0}")]
+ InvalidSavCompression(u32),
+
+ #[error("Invalid ZSAV compression code {0}")]
+ InvalidZsavCompression(u32),
+
+ #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
+ BadDocumentLength { offset: u64, n: usize, max: usize },
+
+ #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
+ BadRecordType { offset: u64, rec_type: u32 },
+
+ // Note: the message does not include the offending `width` value.
+ #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
+ BadVariableWidth { start_offset: u64, width: i32 },
+
+ #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
+ BadVariableLabelCode {
+ start_offset: u64,
+ code_offset: u64,
+ code: u32,
+ },
+
+ #[error("At offset {offset:#x}, missing value code ({code}) is not -3, -2, 0, 1, 2, or 3.")]
+ BadMissingValueCode { offset: u64, code: i32 },
+
+ #[error(
+ "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
+ )]
+ BadNumericMissingValueCode { offset: u64, code: i32 },
+
+ #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
+ BadStringMissingValueCode { offset: u64, code: i32 },
+
+ #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
+ BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
+
+ #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
+ ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
+
+ #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
+ TooManyVarIndexes { offset: u64, n: u32, max: u32 },
+
+ #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
+ ExtensionRecordTooLarge {
+ offset: u64,
+ subtype: u32,
+ size: u32,
+ count: u32,
+ },
+
+ #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
+ EofInCase {
+ offset: u64,
+ case_ofs: u64,
+ case_len: usize,
+ },
+
+ #[error(
+ "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
+ )]
+ EofInCompressedCase { offset: u64, case_ofs: u64 },
+
+ #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
+ PartialCompressedCase { offset: u64, case_ofs: u64 },
+
+ #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
+ CompressedNumberExpected { offset: u64, case_ofs: u64 },
+
+ #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
+ CompressedStringExpected { offset: u64, case_ofs: u64 },
+
+ #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
+ BadZlibTrailerNBlocks {
+ offset: u64,
+ n_blocks: u32,
+ expected_n_blocks: u64,
+ ztrailer_len: u64,
+ },
+
+ // Wraps an error from the `sys::encoding` module; displayed unchanged.
+ #[error("{0}")]
+ EncodingError(EncodingError),
+}
+
+ /// Non-fatal problems found while reading or decoding records.
+ ///
+ /// Values of this type are handed to the `warn` callbacks used throughout this
+ /// file instead of aborting the read.
+ #[derive(ThisError, Debug)]
+ pub enum Warning {
+ #[error("Unexpected end of data inside extension record.")]
+ UnexpectedEndOfData,
+
+ #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
+ NoVarIndexes { offset: u64 },
+
+ #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
+ MixedVarTypes {
+ offset: u64,
+ var_type: VarType,
+ wrong_types: Vec<u32>,
+ },
+
+ #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}] or referred to string continuations: {invalid:?}")]
+ InvalidVarIndexes {
+ offset: u64,
+ max: usize,
+ invalid: Vec<u32>,
+ },
+
+ #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
+ BadRecordSize {
+ offset: u64,
+ record: String,
+ size: u32,
+ expected_size: u32,
+ },
+
+ #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
+ BadRecordCount {
+ offset: u64,
+ record: String,
+ count: u32,
+ expected_count: u32,
+ },
+
+ #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
+ BadLongMissingValueLength {
+ record_offset: u64,
+ offset: u64,
+ value_len: u32,
+ },
+
+ #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
+ BadEncodingName { offset: u64 },
+
+ // XXX This is risky because `text` might be arbitrarily long.
+ #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+ MalformedString { encoding: String, text: String },
+
+ #[error("Invalid variable measurement level value {0}")]
+ InvalidMeasurement(u32),
+
+ #[error("Invalid variable display alignment value {0}")]
+ InvalidAlignment(u32),
+
+ #[error("Invalid attribute name. {0}")]
+ InvalidAttributeName(IdError),
+
+ #[error("Invalid variable name in attribute record. {0}")]
+ InvalidAttributeVariableName(IdError),
+
+ #[error("Invalid short name in long variable name record. {0}")]
+ InvalidShortName(IdError),
+
+ #[error("Invalid name in long variable name record. {0}")]
+ InvalidLongName(IdError),
+
+ #[error("Invalid variable name in very long string record. {0}")]
+ InvalidLongStringName(IdError),
+
+ #[error("Invalid variable name in variable set record. {0}")]
+ InvalidVariableSetName(IdError),
+
+ #[error("Invalid multiple response set name. {0}")]
+ InvalidMrSetName(IdError),
+
+ #[error("Invalid multiple response set variable name. {0}")]
+ InvalidMrSetVariableName(IdError),
+
+ #[error("Invalid variable name in long string missing values record. {0}")]
+ InvalidLongStringMissingValueVariableName(IdError),
+
+ #[error("Invalid variable name in long string value label record. {0}")]
+ InvalidLongStringValueLabelName(IdError),
+
+ #[error("{0}")]
+ EncodingError(EncodingError),
+
+ // Placeholder used where a specific warning has not been designed yet.
+ #[error("Details TBD")]
+ TBD,
+ }
+
+impl From<IoError> for Warning {
+ fn from(_source: IoError) -> Self {
+ Self::UnexpectedEndOfData
+ }
+}
+
+ /// A record as read from the file, before any text has been decoded.
+ ///
+ /// String payloads are still raw bytes (`RawString`, `RawStrArray`);
+ /// [`Record::decode`] turns a `Record` into a [`DecodedRecord`].
+ #[derive(Clone, Debug)]
+ pub enum Record {
+ Header(HeaderRecord<RawString>),
+ Variable(VariableRecord<RawString>),
+ ValueLabel(ValueLabelRecord<RawStrArray<8>, RawString>),
+ Document(DocumentRecord<RawDocumentLine>),
+ IntegerInfo(IntegerInfoRecord),
+ FloatInfo(FloatInfoRecord),
+ VarDisplay(VarDisplayRecord),
+ MultipleResponse(MultipleResponseRecord<RawString, RawString>),
+ LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
+ LongStringMissingValues(LongStringMissingValueRecord<RawString>),
+ Encoding(EncodingRecord),
+ NumberOfCases(NumberOfCasesRecord),
+ // Text-based record that is only split into a specific record type by
+ // `TextRecord::decode`.
+ Text(TextRecord),
+ OtherExtension(Extension),
+ // Payload is the 32-bit value that follows record type 999.
+ EndOfHeaders(u32),
+ ZHeader(ZHeader),
+ ZTrailer(ZTrailer),
+ // Shared handle to the case reader.
+ Cases(Rc<RefCell<Cases>>),
+ }
+
+ /// A record whose text has been decoded from the file's encoding.
+ ///
+ /// This is the result of [`Record::decode`]: raw byte strings have become
+ /// `String`s or `Identifier`s. Records that carry no encoded text are passed
+ /// through unchanged.
+ #[derive(Clone, Debug)]
+ pub enum DecodedRecord {
+ Header(HeaderRecord<String>),
+ Variable(VariableRecord<String>),
+ ValueLabel(ValueLabelRecord<RawStrArray<8>, String>),
+ Document(DocumentRecord<String>),
+ IntegerInfo(IntegerInfoRecord),
+ FloatInfo(FloatInfoRecord),
+ VarDisplay(VarDisplayRecord),
+ MultipleResponse(MultipleResponseRecord<Identifier, String>),
+ LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
+ LongStringMissingValues(LongStringMissingValueRecord<Identifier>),
+ Encoding(EncodingRecord),
+ NumberOfCases(NumberOfCasesRecord),
+ VariableSets(VariableSetRecord),
+ ProductInfo(ProductInfoRecord),
+ LongNames(LongNamesRecord),
+ VeryLongStrings(VeryLongStringsRecord),
+ FileAttributes(FileAttributeRecord),
+ VariableAttributes(VariableAttributeRecord),
+ OtherExtension(Extension),
+ EndOfHeaders(u32),
+ ZHeader(ZHeader),
+ ZTrailer(ZTrailer),
+ Cases(Rc<RefCell<Cases>>),
+ }
+
+ impl Record {
+     /// Reads the next header record from `reader`.
+     ///
+     /// The 32-bit record type selects the record reader: 2 (variable),
+     /// 3 (value labels), 6 (document), 7 (extension) or 999 (end of headers).
+     ///
+     /// Returns `Ok(None)` when the selected record reader produced no record
+     /// (value-label and extension readers may do so); the caller is expected
+     /// to simply read the next record.
+     ///
+     /// # Errors
+     ///
+     /// Returns [`Error::BadRecordType`] for any other record type, or
+     /// whatever error the selected record reader or the underlying I/O
+     /// reports.
+     fn read<R>(
+         reader: &mut R,
+         endian: Endian,
+         var_types: &VarTypes,
+         warn: &dyn Fn(Warning),
+     ) -> Result<Option<Record>, Error>
+     where
+         R: Read + Seek,
+     {
+         let rec_type: u32 = endian.parse(read_bytes(reader)?);
+         match rec_type {
+             2 => Ok(Some(VariableRecord::read(reader, endian, warn)?)),
+             3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
+             6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
+             7 => Extension::read(reader, endian, var_types.n_values(), warn),
+             999 => Ok(Some(Record::EndOfHeaders(
+                 endian.parse(read_bytes(reader)?),
+             ))),
+             _ => Err(Error::BadRecordType {
+                 // Position just past the record type field.
+                 offset: reader.stream_position()?,
+                 rec_type,
+             }),
+         }
+     }
+
+     /// Decodes the text in this record using `decoder`, consuming the record.
+     ///
+     /// Records without encoded text are moved into the corresponding
+     /// [`DecodedRecord`] variant as-is. Because `self` is owned, no clones
+     /// are needed.
+     pub fn decode(self, decoder: &Decoder) -> Result<DecodedRecord, Error> {
+         Ok(match self {
+             Record::Header(record) => record.decode(decoder),
+             Record::Variable(record) => record.decode(decoder),
+             Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
+             Record::Document(record) => record.decode(decoder),
+             Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record),
+             Record::FloatInfo(record) => DecodedRecord::FloatInfo(record),
+             Record::VarDisplay(record) => DecodedRecord::VarDisplay(record),
+             Record::MultipleResponse(record) => record.decode(decoder),
+             Record::LongStringValueLabels(record) => {
+                 DecodedRecord::LongStringValueLabels(record.decode(decoder))
+             }
+             Record::LongStringMissingValues(record) => {
+                 DecodedRecord::LongStringMissingValues(record.decode(decoder))
+             }
+             Record::Encoding(record) => DecodedRecord::Encoding(record),
+             Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record),
+             Record::Text(record) => record.decode(decoder),
+             Record::OtherExtension(record) => DecodedRecord::OtherExtension(record),
+             Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
+             Record::ZHeader(record) => DecodedRecord::ZHeader(record),
+             Record::ZTrailer(record) => DecodedRecord::ZTrailer(record),
+             Record::Cases(record) => DecodedRecord::Cases(record),
+         })
+     }
+ }
+
+ /// Determines the character encoding to use for a file from its header
+ /// records.
+ ///
+ /// The last encoding record and the last integer info record in `headers`
+ /// (if any) are passed to `get_encoding`.
+ ///
+ /// # Errors
+ ///
+ /// An EBCDIC encoding is reported as a hard error. Any other encoding
+ /// problem is passed to `warn` and the default encoding is used instead.
+ pub fn encoding_from_headers(
+     headers: &[Record],
+     warn: &impl Fn(Warning),
+ ) -> Result<&'static Encoding, Error> {
+     let mut encoding_record = None;
+     let mut integer_info_record = None;
+     for record in headers {
+         match record {
+             Record::Encoding(record) => encoding_record = Some(record),
+             Record::IntegerInfo(record) => integer_info_record = Some(record),
+             _ => (),
+         }
+     }
+     let encoding = encoding_record.map(|record| record.0.as_str());
+     let character_code = integer_info_record.map(|record| record.character_code);
+     match get_encoding(encoding, character_code) {
+         Ok(encoding) => Ok(encoding),
+         Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
+         Err(err) => {
+             warn(Warning::EncodingError(err));
+             // Fall back to the default encoding. Only the encoding problem
+             // itself is reported; no separate "using default encoding"
+             // warning is emitted here.
+             Ok(default_encoding())
+         }
+     }
+ }
+
+ // Decodes `s` as UTF-8 when it is valid UTF-8.  Otherwise falls back to
+ // Latin-1, that is, each byte is taken as the Unicode code point with the
+ // same value.
+ fn default_decode(s: &[u8]) -> Cow<str> {
+     match from_utf8(s) {
+         Ok(text) => Cow::from(text),
+         Err(_) => decode_latin1(s),
+     }
+ }
+
+ /// How case data is compressed, as selected by the compression code in the
+ /// file header.
+ #[derive(Copy, Clone, Debug, PartialEq, Eq)]
+ pub enum Compression {
+ /// Compression code 1 in the header.
+ Simple,
+ /// Compression code 2 in a file with the `$FL3` magic number.
+ ZLib,
+ }
+
+ /// The file header record.
+ ///
+ /// `S` is the string representation: `RawString` as read from the file, or
+ /// `String` after decoding.
+ #[derive(Clone)]
+ pub struct HeaderRecord<S>
+ where
+ S: Debug,
+ {
+ /// Offset in file.
+ pub offsets: Range<u64>,
+
+ /// Magic number.
+ pub magic: Magic,
+
+ /// Eye-catcher string, product name, in the file's encoding. Padded
+ /// on the right with spaces.
+ pub eye_catcher: S,
+
+ /// Layout code, normally either 2 or 3.
+ pub layout_code: u32,
+
+ /// Number of variable positions, or `None` if the value in the file is
+ /// questionably trustworthy.
+ pub nominal_case_size: Option<u32>,
+
+ /// Compression type, if any.
+ pub compression: Option<Compression>,
+
+ /// 1-based variable index of the weight variable, or `None` if the file is
+ /// unweighted.
+ pub weight_index: Option<u32>,
+
+ /// Claimed number of cases, if known.
+ pub n_cases: Option<u32>,
+
+ /// Compression bias, usually 100.0.
+ pub bias: f64,
+
+ /// `dd mmm yy` in the file's encoding.
+ pub creation_date: S,
+
+ /// `HH:MM:SS` in the file's encoding.
+ pub creation_time: S,
+
+ /// File label, in the file's encoding. Padded on the right with spaces.
+ pub file_label: S,
+
+ /// Endianness of the data in the file header.
+ pub endian: Endian,
+ }
+
+ impl<S> HeaderRecord<S>
+ where
+     S: Debug,
+ {
+     /// Writes one `name: value` line to `f`, with `name` right-aligned in a
+     /// 17-character column and `value` shown in its `Debug` form.
+     fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
+         writeln!(f, "{name:>17}: {value:?}")
+     }
+ }
+
+ /// Multi-line, human-readable dump of the header: a title line followed by
+ /// one aligned `name: value` line per field.
+ impl<S> Debug for HeaderRecord<S>
+ where
+ S: Debug,
+ {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ writeln!(f, "File header record:")?;
+ self.debug_field(f, "Magic", self.magic)?;
+ self.debug_field(f, "Product name", &self.eye_catcher)?;
+ self.debug_field(f, "Layout code", self.layout_code)?;
+ self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
+ self.debug_field(f, "Compression", self.compression)?;
+ self.debug_field(f, "Weight index", self.weight_index)?;
+ self.debug_field(f, "Number of cases", self.n_cases)?;
+ self.debug_field(f, "Compression bias", self.bias)?;
+ self.debug_field(f, "Creation date", &self.creation_date)?;
+ self.debug_field(f, "Creation time", &self.creation_time)?;
+ self.debug_field(f, "File label", &self.file_label)?;
+ self.debug_field(f, "Endianness", self.endian)
+ }
+ }
+
+ impl HeaderRecord<RawString> {
+     /// Reads the file header record from the current position of `r`.
+     ///
+     /// The byte order of the file is detected from the layout code field.
+     ///
+     /// # Errors
+     ///
+     /// Returns [`Error::NotASystemFile`] if the magic number is not
+     /// recognized or the layout code is neither 2 nor 3 in either byte
+     /// order, [`Error::InvalidZsavCompression`] or
+     /// [`Error::InvalidSavCompression`] for a compression code that does not
+     /// fit the magic number, and any I/O error from `r`.
+     fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
+         let start = r.stream_position()?;
+
+         let magic: [u8; 4] = read_bytes(r)?;
+         let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
+
+         let eye_catcher = RawString(read_vec(r, 60)?);
+         let layout_code: [u8; 4] = read_bytes(r)?;
+         // The layout code is normally 2 or 3, so both values must be tried
+         // when identifying the byte order.
+         let endian = Endian::identify_u32(2, layout_code)
+             .or_else(|| Endian::identify_u32(3, layout_code))
+             .ok_or(Error::NotASystemFile)?;
+         let layout_code = endian.parse(layout_code);
+
+         // Implausibly large values are treated as unknown rather than trusted.
+         let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
+         let nominal_case_size =
+             (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
+
+         let compression_code: u32 = endian.parse(read_bytes(r)?);
+         let compression = match (magic, compression_code) {
+             (Magic::Zsav, 2) => Some(Compression::ZLib),
+             (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
+             (_, 0) => None,
+             (_, 1) => Some(Compression::Simple),
+             (_, code) => return Err(Error::InvalidSavCompression(code)),
+         };
+
+         // A weight index of 0 means the file is unweighted.
+         let weight_index: u32 = endian.parse(read_bytes(r)?);
+         let weight_index = (weight_index > 0).then_some(weight_index);
+
+         let n_cases: u32 = endian.parse(read_bytes(r)?);
+         let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
+
+         let bias: f64 = endian.parse(read_bytes(r)?);
+
+         let creation_date = RawString(read_vec(r, 9)?);
+         let creation_time = RawString(read_vec(r, 8)?);
+         let file_label = RawString(read_vec(r, 64)?);
+         // Three trailing bytes are read and ignored.
+         let _: [u8; 3] = read_bytes(r)?;
+
+         Ok(HeaderRecord {
+             offsets: start..r.stream_position()?,
+             magic,
+             layout_code,
+             nominal_case_size,
+             compression,
+             weight_index,
+             n_cases,
+             bias,
+             creation_date,
+             creation_time,
+             eye_catcher,
+             file_label,
+             endian,
+         })
+     }
+
+     /// Decodes the header's text fields with `decoder` and wraps the result
+     /// in [`DecodedRecord::Header`]. All other fields are carried over
+     /// unchanged.
+     pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+         let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
+         let file_label = decoder.decode(&self.file_label).to_string();
+         let creation_date = decoder.decode(&self.creation_date).to_string();
+         let creation_time = decoder.decode(&self.creation_time).to_string();
+         DecodedRecord::Header(HeaderRecord {
+             eye_catcher,
+             weight_index: self.weight_index,
+             n_cases: self.n_cases,
+             file_label,
+             offsets: self.offsets,
+             magic: self.magic,
+             layout_code: self.layout_code,
+             nominal_case_size: self.nominal_case_size,
+             compression: self.compression,
+             bias: self.bias,
+             creation_date,
+             creation_time,
+             endian: self.endian,
+         })
+     }
+ }
+
+ /// Decodes raw strings using a fixed character encoding, reporting problems
+ /// through a warning callback.
+ pub struct Decoder {
+ // Encoding used for every string decoded through this decoder.
+ pub encoding: &'static Encoding,
+ // Callback that receives each warning produced while decoding.
+ pub warn: Box<dyn Fn(Warning)>,
+ }
+
+ impl Decoder {
+     /// Creates a decoder for `encoding` that reports warnings to `warn`.
+     pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
+     where
+         F: Fn(Warning) + 'static,
+     {
+         let warn: Box<dyn Fn(Warning)> = Box::new(warn);
+         Self { encoding, warn }
+     }
+
+     /// Forwards `warning` to the callback.
+     fn warn(&self, warning: Warning) {
+         (self.warn)(warning)
+     }
+
+     /// Decodes `input` without byte-order-mark handling. If the input
+     /// contains malformed sequences, a [`Warning::MalformedString`] carrying
+     /// the decoded text is issued.
+     fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+         let (decoded, had_errors) = self.encoding.decode_without_bom_handling(input);
+         if had_errors {
+             let warning = Warning::MalformedString {
+                 encoding: self.encoding.name().into(),
+                 text: decoded.to_string(),
+             };
+             self.warn(warning);
+         }
+         decoded
+     }
+
+     /// Decodes the bytes of `input`; see [`Decoder::decode_slice`].
+     fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
+         self.decode_slice(&input.0)
+     }
+
+     /// Decodes `input` and converts the result into an identifier.
+     pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
+         let name = self.decode(input);
+         self.new_identifier(&name)
+     }
+
+     /// Builds an identifier from `name` for this decoder's encoding.
+     pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
+         Identifier::from_encoding(name, self.encoding)
+     }
+ }
+
+ /// The kind of system file, as identified by the 4-byte magic number at the
+ /// start of the file header.
+ #[derive(Copy, Clone, PartialEq, Eq, Hash)]
+ pub enum Magic {
+ /// Regular system file.
+ Sav,
+
+ /// System file with Zlib-compressed data.
+ Zsav,
+
+ /// EBCDIC-encoded system file.
+ Ebcdic,
+ }
+
+ // Byte patterns that `TryFrom<[u8; 4]>` recognizes for each `Magic` variant.
+ impl Magic {
+ /// Magic number for a regular system file.
+ pub const SAV: [u8; 4] = *b"$FL2";
+
+ /// Magic number for a system file that contains zlib-compressed data.
+ pub const ZSAV: [u8; 4] = *b"$FL3";
+
+ /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
+ /// in EBCDIC.
+ pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
+ }
+
+ /// Shows the magic number as the text it stands for.
+ impl Debug for Magic {
+     fn fmt(&self, f: &mut Formatter) -> FmtResult {
+         f.write_str(match self {
+             Magic::Sav => "$FL2",
+             Magic::Zsav => "$FL3",
+             Magic::Ebcdic => "($FL2 in EBCDIC)",
+         })
+     }
+ }
+
+ /// Recognizes the three known magic numbers; any other byte pattern yields
+ /// [`Error::BadMagic`] carrying the offending bytes.
+ impl TryFrom<[u8; 4]> for Magic {
+     type Error = Error;
+
+     fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
+         if value == Magic::SAV {
+             Ok(Magic::Sav)
+         } else if value == Magic::ZSAV {
+             Ok(Magic::Zsav)
+         } else if value == Magic::EBCDIC {
+             Ok(Magic::Ebcdic)
+         } else {
+             Err(Error::BadMagic(value))
+         }
+     }
+ }
+
+ /// Basic type of a variable: numeric or string.
+ #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+ pub enum VarType {
+ Numeric,
+ String,
+ }
+
+ impl VarType {
+     /// Returns the other variable type: `String` for `Numeric` and vice
+     /// versa.
+     pub fn opposite(self) -> VarType {
+         if self == VarType::Numeric {
+             VarType::String
+         } else {
+             VarType::Numeric
+         }
+     }
+ }
+
+ /// Lower-case name of the type, suitable for use in messages.
+ impl Display for VarType {
+     fn fmt(&self, f: &mut Formatter) -> FmtResult {
+         let name = match self {
+             VarType::Numeric => "numeric",
+             VarType::String => "string",
+         };
+         f.write_str(name)
+     }
+ }
+
+ /// Maps a raw width to a variable type. A continuation record has no type
+ /// of its own, so it yields `Err(())`.
+ impl TryFrom<RawWidth> for VarType {
+     type Error = ();
+
+     fn try_from(value: RawWidth) -> Result<Self, Self::Error> {
+         let var_type = match value {
+             RawWidth::Numeric => VarType::Numeric,
+             RawWidth::String(_) => VarType::String,
+             RawWidth::Continuation => return Err(()),
+         };
+         Ok(var_type)
+     }
+ }
+
+ /// Maps a raw width to a variable width. A continuation record has no
+ /// width of its own, so it yields `Err(())`.
+ impl TryFrom<RawWidth> for VarWidth {
+     type Error = ();
+
+     fn try_from(value: RawWidth) -> Result<Self, Self::Error> {
+         let var_width = match value {
+             RawWidth::Numeric => VarWidth::Numeric,
+             RawWidth::String(width) => VarWidth::String(u16::from(width.get())),
+             RawWidth::Continuation => return Err(()),
+         };
+         Ok(var_width)
+     }
+ }
+
+ /// A single 8-byte value as stored in a case: a number, or 8 raw string
+ /// bytes.
+ type RawValue = Value<RawStrArray<8>>;
+
+ impl RawValue {
+ /// Interprets the 8 bytes in `raw` according to `var_type`: as 8 string
+ /// bytes, or as a number parsed with byte order `endian`.
+ pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
+ match var_type {
+ VarType::String => Value::String(RawStrArray(raw.0)),
+ VarType::Numeric => Value::Number(endian.parse(raw.0)),
+ }
+ }
+
+ /// Reads one uncompressed case: one 8-byte value per entry in `var_types`.
+ ///
+ /// Returns `Ok(None)` if no bytes are available for the first value
+ /// (presumably a clean end of file; `try_read_bytes` is defined elsewhere).
+ /// Running out of data after the first value is an [`Error::EofInCase`].
+ fn read_case<R: Read + Seek>(
+ reader: &mut R,
+ var_types: &VarTypes,
+ endian: Endian,
+ ) -> Result<Option<Vec<Self>>, Error> {
+ let case_start = reader.stream_position()?;
+ let mut values = Vec::with_capacity(var_types.n_values());
+ for (i, var_type) in var_types.iter().enumerate() {
+ let Some(raw) = try_read_bytes(reader)? else {
+ if i == 0 {
+ return Ok(None);
+ } else {
+ let offset = reader.stream_position()?;
+ return Err(Error::EofInCase {
+ offset,
+ case_ofs: offset - case_start,
+ case_len: var_types.n_values() * 8,
+ });
+ }
+ };
+ values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
+ }
+ Ok(Some(values))
+ }
+
+ /// Reads one case from simple-compressed data.
+ ///
+ /// `codes` holds compression codes that have been read but not yet used;
+ /// it is refilled 8 codes at a time and persists across calls. Each value
+ /// is produced from one code:
+ ///
+ /// - 0: ignored.
+ /// - 1 to 251: the value `code - bias`.
+ /// - 252: end of data.
+ /// - 253: the value is the next 8 bytes read from `reader`.
+ /// - 254: an all-spaces string (an error for a numeric variable).
+ /// - 255: a number with no value, `Number(None)` (an error for a string
+ /// variable).
+ ///
+ /// Returns `Ok(None)` if the data ends before the first value of a case.
+ fn read_compressed_case<R: Read + Seek>(
+ reader: &mut R,
+ var_types: &VarTypes,
+ codes: &mut VecDeque<u8>,
+ endian: Endian,
+ bias: f64,
+ ) -> Result<Option<Vec<Self>>, Error> {
+ let case_start = reader.stream_position()?;
+ let mut values = Vec::with_capacity(var_types.n_values());
+ for (i, var_type) in var_types.iter().enumerate() {
+ let value = loop {
+ let Some(code) = codes.pop_front() else {
+ // Out of codes: fetch the next group of 8, or stop at end of file.
+ let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
+ if i == 0 {
+ return Ok(None);
+ } else {
+ let offset = reader.stream_position()?;
+ return Err(Error::EofInCompressedCase {
+ offset,
+ case_ofs: offset - case_start,
+ });
+ }
+ };
+ codes.extend(new_codes.into_iter());
+ continue;
+ };
+ match code {
+ 0 => (),
+ 1..=251 => match var_type {
+ VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
+ VarType::String => {
+ break Self::String(RawStrArray(endian.to_bytes(code as f64 - bias)))
+ }
+ },
+ 252 => {
+ if i == 0 {
+ return Ok(None);
+ } else {
+ let offset = reader.stream_position()?;
+ return Err(Error::PartialCompressedCase {
+ offset,
+ case_ofs: offset - case_start,
+ });
+ }
+ }
+ 253 => {
+ break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
+ }
+ 254 => match var_type {
+ VarType::String => break Self::String(RawStrArray(*b" ")), // XXX EBCDIC
+ VarType::Numeric => {
+ return Err(Error::CompressedStringExpected {
+ offset: case_start,
+ case_ofs: reader.stream_position()? - case_start,
+ })
+ }
+ },
+ 255 => match var_type {
+ VarType::Numeric => break Self::Number(None),
+ VarType::String => {
+ return Err(Error::CompressedNumberExpected {
+ offset: case_start,
+ case_ofs: reader.stream_position()? - case_start,
+ })
+ }
+ },
+ }
+ };
+ values.push(value);
+ }
+ Ok(Some(values))
+ }
+
+ /// Converts this raw value into a `Value`, truncating a string to `width`
+ /// bytes.
+ ///
+ /// Panics if this is a string value and `width` is not a string width.
+ // NOTE(review): the slice `s.0[..width]` also panics if `width` exceeds 8;
+ // callers presumably only pass widths of at most 8 here. Confirm.
+ pub fn decode(&self, width: VarWidth) -> Value {
+ match self {
+ Self::Number(x) => Value::Number(*x),
+ Self::String(s) => {
+ let width = width.as_string_width().unwrap();
+ Value::String(RawString::from(&s.0[..width]))
+ }
+ }
+ }
+ }
+
+ /// Reader that decompresses a sequence of zlib streams stored back to back
+ /// in `R`, presenting them as one continuous byte stream.
+ struct ZlibDecodeMultiple<R>
+ where
+ R: Read + Seek,
+ {
+ // Decoder for the current zlib stream. Kept in an `Option` so that it can
+ // be taken apart and rebuilt around the same inner reader.
+ reader: Option<ZlibDecoder<R>>,
+ }
+
+ impl<R> ZlibDecodeMultiple<R>
+ where
+     R: Read + Seek,
+ {
+     /// Wraps `reader`, starting with a decoder for the first zlib stream.
+     fn new(reader: R) -> ZlibDecodeMultiple<R> {
+         let decoder = ZlibDecoder::new(reader);
+         Self {
+             reader: Some(decoder),
+         }
+     }
+ }
+
+ impl<R> Read for ZlibDecodeMultiple<R>
+ where
+     R: Read + Seek,
+ {
+     /// Reads decompressed bytes into `buf`.
+     ///
+     /// When the current zlib stream ends, a new decoder is started on the
+     /// same inner reader and reading continues with the next stream.
+     /// Returns `Ok(0)` when `buf` is empty or when a decoder reports the end
+     /// of its stream without having consumed any input, which means the
+     /// inner reader has no more data.
+     fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
+         // A zero-length read returns 0 from the decoder too; without this
+         // guard it would be mistaken for the end of a stream and restart the
+         // decoder forever.
+         if buf.is_empty() {
+             return Ok(0);
+         }
+         loop {
+             let decoder = self.reader.as_mut().unwrap();
+             match decoder.read(buf)? {
+                 0 => {
+                     // A decoder that ended without consuming a single byte
+                     // has nothing left to read from; starting yet another
+                     // one would loop forever.
+                     if decoder.total_in() == 0 {
+                         return Ok(0);
+                     }
+                     let inner = self.reader.take().unwrap().into_inner();
+                     self.reader = Some(ZlibDecoder::new(inner));
+                 }
+                 n => return Ok(n),
+             };
+         }
+     }
+ }
+
+ impl<R> Seek for ZlibDecodeMultiple<R>
+ where
+     R: Read + Seek,
+ {
+     /// Seeks the inner (compressed) reader directly; the decoder itself is
+     /// left as it is.
+     fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
+         let decoder = self.reader.as_mut().unwrap();
+         decoder.get_mut().seek(pos)
+     }
+ }
+
+ /// Position of a [`Reader`] within the sequence of records it yields.
+ enum ReaderState {
+ // Nothing yielded yet; the file header comes next.
+ Start,
+ // Yielding header records until the end-of-headers record.
+ Headers,
+ // The zlib header comes next (zlib-compressed files only).
+ ZlibHeader,
+ // The zlib trailer comes next, at the location given by the zlib header.
+ ZlibTrailer {
+ ztrailer_offset: u64,
+ ztrailer_len: u64,
+ },
+ // The case reader comes next.
+ Cases,
+ // Iteration is finished, or an error was returned.
+ End,
+ }
+
+ /// Iterator over the records of a system file, ending with a record that
+ /// gives access to the cases.
+ pub struct Reader<R>
+ where
+ R: Read + Seek + 'static,
+ {
+ // Underlying reader; taken (left as `None`) when the case reader is created.
+ reader: Option<R>,
+ // Callback for non-fatal problems.
+ warn: Box<dyn Fn(Warning)>,
+
+ // File header, read when the `Reader` is constructed.
+ header: HeaderRecord<RawString>,
+ // Widths of the variable records seen so far.
+ var_types: VarTypes,
+
+ state: ReaderState,
+ }
+
+ impl<R> Reader<R>
+ where
+ R: Read + Seek + 'static,
+ {
+ /// Creates a reader, immediately reading the file header from `reader`.
+ /// Non-fatal problems found later are passed to `warn`.
+ pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
+ where
+ F: Fn(Warning) + 'static,
+ {
+ let header = HeaderRecord::read(&mut reader)?;
+ Ok(Self {
+ reader: Some(reader),
+ warn: Box::new(warn),
+ header,
+ var_types: VarTypes::new(),
+ state: ReaderState::Start,
+ })
+ }
+ /// Builds the case reader, handing over the underlying reader and the
+ /// collected variable types, and moves to the `End` state.
+ ///
+ /// Panics if called a second time, because the reader has already been
+ /// taken.
+ fn cases(&mut self) -> Cases {
+ self.state = ReaderState::End;
+ Cases::new(
+ self.reader.take().unwrap(),
+ take(&mut self.var_types),
+ &self.header,
+ )
+ }
+ /// Produces the next record according to the current state and advances
+ /// the state. Error handling (moving to `End`) is done by `next`.
+ fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
+ match self.state {
+ ReaderState::Start => {
+ self.state = ReaderState::Headers;
+ Some(Ok(Record::Header(self.header.clone())))
+ }
+ ReaderState::Headers => {
+ // Keep reading until a record is actually produced; `Ok(None)`
+ // means the record was consumed without yielding anything.
+ let record = loop {
+ match Record::read(
+ self.reader.as_mut().unwrap(),
+ self.header.endian,
+ &self.var_types,
+ &self.warn,
+ ) {
+ Ok(Some(record)) => break record,
+ Ok(None) => (),
+ Err(error) => return Some(Err(error)),
+ }
+ };
+ match record {
+ // Remember each variable's width for the records and cases that follow.
+ Record::Variable(VariableRecord { width, .. }) => self.var_types.push(width),
+ Record::EndOfHeaders(_) => {
+ self.state = if let Some(Compression::ZLib) = self.header.compression {
+ ReaderState::ZlibHeader
+ } else {
+ ReaderState::Cases
+ };
+ }
+ _ => (),
+ };
+ Some(Ok(record))
+ }
+ ReaderState::ZlibHeader => {
+ let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
+ {
+ Ok(zheader) => zheader,
+ Err(error) => return Some(Err(error)),
+ };
+ self.state = ReaderState::ZlibTrailer {
+ ztrailer_offset: zheader.ztrailer_offset,
+ ztrailer_len: zheader.ztrailer_len,
+ };
+ Some(Ok(Record::ZHeader(zheader)))
+ }
+ ReaderState::ZlibTrailer {
+ ztrailer_offset,
+ ztrailer_len,
+ } => {
+ // NOTE(review): the state is not changed when a trailer is
+ // returned, so the next call reads again; presumably
+ // `ZTrailer::read` then returns `Ok(None)`. Confirm.
+ match ZTrailer::read(
+ self.reader.as_mut().unwrap(),
+ self.header.endian,
+ ztrailer_offset,
+ ztrailer_len,
+ ) {
+ Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
+ Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
+ Err(error) => Some(Err(error)),
+ }
+ }
+ ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
+ ReaderState::End => None,
+ }
+ }
+ }
+
+ impl<R> Iterator for Reader<R>
+ where
+     R: Read + Seek + 'static,
+ {
+     type Item = Result<Record, Error>;
+
+     /// Yields the next record. After an error has been yielded, the
+     /// iterator is finished and yields nothing more.
+     fn next(&mut self) -> Option<Self::Item> {
+         let item = self._next();
+         if let Some(Err(_)) = &item {
+             self.state = ReaderState::End;
+         }
+         item
+     }
+ }
+
+ // Combines `Read` and `Seek` in a single trait so that a reader supporting
+ // both can be stored as `Box<dyn ReadSeek>`. Implemented for every such type.
+ trait ReadSeek: Read + Seek {}
+ impl<T> ReadSeek for T where T: Read + Seek {}
+
+ /// Iterator over the cases (rows of values) in a system file.
+ pub struct Cases {
+ // Source of case data; decompresses zlib streams when the file uses them.
+ reader: Box<dyn ReadSeek>,
+ // Type of each 8-byte value position in a case.
+ var_types: VarTypes,
+ // Compression declared in the file header.
+ compression: Option<Compression>,
+ // Compression bias from the file header.
+ bias: f64,
+ // Byte order from the file header.
+ endian: Endian,
+ // Compression codes read but not yet consumed.
+ codes: VecDeque<u8>,
+ // Set once the data has ended or an error has been returned.
+ eof: bool,
+ }
+
+ /// Prints just the type name; the reader state is not shown.
+ impl Debug for Cases {
+     fn fmt(&self, f: &mut Formatter) -> FmtResult {
+         f.write_str("Cases")
+     }
+ }
+
+ impl Cases {
+     /// Creates a case reader over `reader`, taking compression, bias and
+     /// byte order from `header`. For zlib-compressed files the reader is
+     /// wrapped in a `ZlibDecodeMultiple`.
+     fn new<R>(reader: R, var_types: VarTypes, header: &HeaderRecord<RawString>) -> Self
+     where
+         R: Read + Seek + 'static,
+     {
+         let reader: Box<dyn ReadSeek> = match header.compression {
+             Some(Compression::ZLib) => Box::new(ZlibDecodeMultiple::new(reader)),
+             _ => Box::new(reader),
+         };
+         Self {
+             reader,
+             var_types,
+             compression: header.compression,
+             bias: header.bias,
+             endian: header.endian,
+             codes: VecDeque::with_capacity(8),
+             eof: false,
+         }
+     }
+ }
+
+ impl Iterator for Cases {
+     type Item = Result<Vec<RawValue>, Error>;
+
+     /// Yields the next case. Once the data has ended or an error has been
+     /// yielded, every later call returns `None`.
+     fn next(&mut self) -> Option<Self::Item> {
+         if self.eof {
+             return None;
+         }
+
+         let outcome = match self.compression {
+             Some(_) => Value::read_compressed_case(
+                 &mut self.reader,
+                 &self.var_types,
+                 &mut self.codes,
+                 self.endian,
+                 self.bias,
+             ),
+             None => Value::read_case(&mut self.reader, &self.var_types, self.endian),
+         };
+         let item = outcome.transpose();
+         // Anything other than a successfully read case ends the iteration.
+         self.eof = !matches!(item, Some(Ok(_)));
+         item
+     }
+ }
+
+ /// A raw format specification packed into 32 bits, as interpreted by the
+ /// `Debug` impl: the format type in the bits above the low 16, the width in
+ /// bits 8 to 15, and the number of decimals in the low 8 bits.
+ #[derive(Copy, Clone, PartialEq, Eq, Hash)]
+ pub struct Spec(pub u32);
+
+ /// Shows the raw value in hexadecimal followed by the unpacked
+ /// `<type><width>.<decimals>` form.
+ impl Debug for Spec {
+     fn fmt(&self, f: &mut Formatter) -> FmtResult {
+         let Spec(raw) = *self;
+         let name = format_name(raw >> 16);
+         let width = (raw >> 8) & 0xff;
+         let decimals = raw & 0xff;
+         write!(f, "{raw:06x} ({name}{width}.{decimals})")
+     }
+ }
+
+ /// Returns the name of the format with numeric code `type_`, or a
+ /// placeholder of the form `<unknown format N>` for an unrecognized code.
+ fn format_name(type_: u32) -> Cow<'static, str> {
+     // Known format codes and their names.
+     const NAMES: &[(u32, &str)] = &[
+         (1, "A"),
+         (2, "AHEX"),
+         (3, "COMMA"),
+         (4, "DOLLAR"),
+         (5, "F"),
+         (6, "IB"),
+         (7, "PIBHEX"),
+         (8, "P"),
+         (9, "PIB"),
+         (10, "PK"),
+         (11, "RB"),
+         (12, "RBHEX"),
+         (15, "Z"),
+         (16, "N"),
+         (17, "E"),
+         (20, "DATE"),
+         (21, "TIME"),
+         (22, "DATETIME"),
+         (23, "ADATE"),
+         (24, "JDATE"),
+         (25, "DTIME"),
+         (26, "WKDAY"),
+         (27, "MONTH"),
+         (28, "MOYR"),
+         (29, "QYR"),
+         (30, "WKYR"),
+         (31, "PCT"),
+         (32, "DOT"),
+         (33, "CCA"),
+         (34, "CCB"),
+         (35, "CCC"),
+         (36, "CCD"),
+         (37, "CCE"),
+         (38, "EDATE"),
+         (39, "SDATE"),
+         (40, "MTIME"),
+         (41, "YMDHMS"),
+     ];
+
+     NAMES
+         .iter()
+         .find(|&&(code, _)| code == type_)
+         .map_or_else(
+             || Cow::from(format!("<unknown format {type_}>")),
+             |&(_, name)| Cow::from(name),
+         )
+ }
+
+ /// The user-missing values declared for a variable: individual values, a
+ /// range, or both. `S` is the representation of string values.
+ #[derive(Clone)]
+ pub struct MissingValues<S = Box<[u8]>>
+ where
+ S: Debug,
+ {
+ /// Individual missing values, up to 3 of them.
+ pub values: Vec<Value<S>>,
+
+ /// Optional range of missing values.
+ pub range: Option<(Value<S>, Value<S>)>,
+ }
+
+ /// Formats the missing values as a comma-separated list, with the range (if
+ /// any) last as `low THRU high`, or as `none` when there are none.
+ impl<S> Debug for MissingValues<S>
+ where
+     S: Debug,
+ {
+     fn fmt(&self, f: &mut Formatter) -> FmtResult {
+         if self.is_empty() {
+             return write!(f, "none");
+         }
+
+         // Empty before the first item, ", " before every later one.
+         let mut separator = "";
+         for value in &self.values {
+             write!(f, "{separator}{value:?}")?;
+             separator = ", ";
+         }
+         if let Some((low, high)) = &self.range {
+             write!(f, "{separator}{low:?} THRU {high:?}")?;
+         }
+         Ok(())
+     }
+ }
+
+ impl<S> MissingValues<S>
+ where
+     S: Debug,
+ {
+     /// Returns true if there are neither individual missing values nor a
+     /// range.
+     fn is_empty(&self) -> bool {
+         self.range.is_none() && self.values.is_empty()
+     }
+ }
+
+ /// The default is no missing values at all.
+ impl<S> Default for MissingValues<S>
+ where
+     S: Debug,
+ {
+     fn default() -> Self {
+         MissingValues {
+             values: vec![],
+             range: None,
+         }
+     }
+ }
+
+ impl MissingValues {
+ /// Reads the missing values that follow a variable record.
+ ///
+ /// `code` is the record's missing value code: 0 to 3 individual values,
+ /// -2 for a range, or -3 for a range plus one individual value. Any other
+ /// code is an [`Error::BadMissingValueCode`] reported at `offset`.
+ ///
+ /// All values are read as 8 bytes first and then interpreted according
+ /// to `width`. Combinations that cannot be represented (a string wider
+ /// than 8 bytes, a string with a range, or a continuation record) produce
+ /// a warning and an empty result.
+ fn read<R: Read + Seek>(
+ r: &mut R,
+ offset: u64,
+ width: RawWidth,
+ code: i32,
+ endian: Endian,
+ warn: &dyn Fn(Warning),
+ ) -> Result<Self, Error> {
+ let (individual_values, has_range) = match code {
+ 0..=3 => (code as usize, false),
+ -2 => (0, true),
+ -3 => (1, true),
+ _ => return Err(Error::BadMissingValueCode { offset, code }),
+ };
+
+ // The individual values come first in the file, then the range.
+ let mut values = Vec::with_capacity(individual_values);
+ for _ in 0..individual_values {
+ values.push(read_bytes::<8, _>(r)?);
+ }
+ let range = if has_range {
+ let low = read_bytes::<8, _>(r)?;
+ let high = read_bytes::<8, _>(r)?;
+ Some((low, high))
+ } else {
+ None
+ };
+
+ match VarWidth::try_from(width) {
+ Ok(VarWidth::Numeric) => {
+ let values = values
+ .into_iter()
+ .map(|v| Value::Number(endian.parse(v)))
+ .collect();
+ let range = range.map(|(low, high)| {
+ (
+ Value::Number(endian.parse(low)),
+ Value::Number(endian.parse(high)),
+ )
+ });
+ return Ok(Self { values, range });
+ }
+ Ok(VarWidth::String(width)) if width <= 8 && range.is_none() => {
+ // Keep only the first `width` bytes of each 8-byte value.
+ let values = values
+ .into_iter()
+ .map(|value| Value::String(Box::from(&value[..width as usize])))
+ .collect();
+ return Ok(Self {
+ values,
+ range: None,
+ });
+ }
+ // The remaining cases are all reported with the placeholder warning.
+ Ok(VarWidth::String(width)) if width > 8 => warn(Warning::TBD),
+ Ok(VarWidth::String(_)) => warn(Warning::TBD),
+ Err(()) => warn(Warning::TBD),
+ }
+ Ok(Self::default())
+ }
+ }
+
+ /// A variable record. `S` is the string representation: `RawString` as read
+ /// from the file, or `String` after decoding.
+ #[derive(Clone)]
+ pub struct VariableRecord<S>
+ where
+ S: Debug,
+ {
+ /// Range of offsets in file.
+ pub offsets: Range<u64>,
+
+ /// Variable width: numeric, a string of 1 to 255 bytes, or a long string
+ /// continuation (stored in the file as 0, 1..=255, and -1 respectively).
+ pub width: RawWidth,
+
+ /// Variable name, padded on the right with spaces.
+ pub name: S,
+
+ /// Print format.
+ pub print_format: Spec,
+
+ /// Write format.
+ pub write_format: Spec,
+
+ /// Missing values.
+ pub missing_values: MissingValues,
+
+ /// Optional variable label.
+ pub label: Option<S>,
+ }
+
+ /// The width field of a variable record, as stored in the file.
+ #[derive(Copy, Clone, PartialEq, Eq)]
+ pub enum RawWidth {
+ /// Continuation of a long string variable (stored as -1).
+ Continuation,
+ /// Numeric variable (stored as 0).
+ Numeric,
+ /// String variable of the given width in bytes (stored as 1..=255).
+ String(NonZeroU8),
+ }
+
+ impl RawWidth {
+     /// Returns the number of 8-byte values needed for a variable of this
+     /// width: 1 for a numeric variable, the width divided by 8 and rounded
+     /// up for a string, and `None` for a continuation.
+     pub fn n_values(&self) -> Option<usize> {
+         match *self {
+             RawWidth::Continuation => None,
+             RawWidth::Numeric => Some(1),
+             RawWidth::String(width) => Some(usize::from(width.get()).div_ceil(8)),
+         }
+     }
+ }
+
+ /// Interprets the width stored in a variable record: -1 is a continuation,
+ /// 0 is numeric, and 1 through 255 is a string width. Anything else yields
+ /// `Err(())`.
+ impl TryFrom<i32> for RawWidth {
+     type Error = ();
+
+     fn try_from(value: i32) -> Result<Self, Self::Error> {
+         match value {
+             -1 => Ok(Self::Continuation),
+             0 => Ok(Self::Numeric),
+             // Only 1..=255 survives both conversions.
+             _ => u8::try_from(value)
+                 .ok()
+                 .and_then(NonZeroU8::new)
+                 .map(Self::String)
+                 .ok_or(()),
+         }
+     }
+ }
+
+ /// Human-readable description of the width, for use in messages.
+ impl Display for RawWidth {
+     fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+         match *self {
+             RawWidth::Continuation => f.write_str("long string continuation"),
+             RawWidth::Numeric => f.write_str("numeric"),
+             RawWidth::String(width) => write!(f, "{width}-byte string"),
+         }
+     }
+ }
+
+ /// Multi-line dump of the record, one `Field: value` line per field. The
+ /// file offsets are not shown.
+ impl<S> Debug for VariableRecord<S>
+ where
+     S: Debug,
+ {
+     fn fmt(&self, f: &mut Formatter) -> FmtResult {
+         let Self {
+             width,
+             name,
+             print_format,
+             write_format,
+             missing_values,
+             label,
+             ..
+         } = self;
+         writeln!(f, "Width: {width}")?;
+         writeln!(f, "Print format: {print_format:?}")?;
+         writeln!(f, "Write format: {write_format:?}")?;
+         writeln!(f, "Name: {name:?}")?;
+         writeln!(f, "Variable label: {label:?}")?;
+         writeln!(f, "Missing values: {missing_values:?}")
+     }
+ }
+
+ impl VariableRecord<RawString> {
+     /// Reads a variable record, starting just after its record type.
+     ///
+     /// # Errors
+     ///
+     /// Returns [`Error::BadVariableWidth`] if the width is outside -1..=255,
+     /// [`Error::BadVariableLabelCode`] if the label flag is not 0 or 1, any
+     /// error from reading the missing values, and any I/O error from `r`.
+     fn read<R: Read + Seek>(
+         r: &mut R,
+         endian: Endian,
+         warn: &dyn Fn(Warning),
+     ) -> Result<Record, Error> {
+         let start_offset = r.stream_position()?;
+         let width: i32 = endian.parse(read_bytes(r)?);
+         let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth {
+             start_offset,
+             width,
+         })?;
+         let code_offset = r.stream_position()?;
+         let has_variable_label: u32 = endian.parse(read_bytes(r)?);
+         let missing_value_code: i32 = endian.parse(read_bytes(r)?);
+         let print_format = Spec(endian.parse(read_bytes(r)?));
+         let write_format = Spec(endian.parse(read_bytes(r)?));
+         let name = RawString(read_vec(r, 8)?);
+
+         let label = match has_variable_label {
+             0 => None,
+             1 => {
+                 let len: u32 = endian.parse(read_bytes(r)?);
+                 // At most 65535 bytes of the label are kept.
+                 let read_len = len.min(65535);
+                 let label = RawString(read_vec(r, read_len as usize)?);
+
+                 // Skip the part of an over-long label that was not read, so
+                 // that the following fields are read from the right place.
+                 let excess = len - read_len;
+                 if excess > 0 {
+                     r.seek(std::io::SeekFrom::Current(i64::from(excess)))?;
+                 }
+
+                 // The label is padded to a multiple of 4 bytes. Computed
+                 // with `%` so that a huge `len` cannot overflow.
+                 let padding_bytes = (4 - len % 4) % 4;
+                 let _ = read_vec(r, padding_bytes as usize)?;
+
+                 Some(label)
+             }
+             _ => {
+                 return Err(Error::BadVariableLabelCode {
+                     start_offset,
+                     code_offset,
+                     code: has_variable_label,
+                 })
+             }
+         };
+
+         let missing_values =
+             MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?;
+
+         let end_offset = r.stream_position()?;
+
+         Ok(Record::Variable(VariableRecord {
+             offsets: start_offset..end_offset,
+             width,
+             name,
+             print_format,
+             write_format,
+             missing_values,
+             label,
+         }))
+     }
+
+     /// Decodes the name and label with `decoder` and wraps the result in
+     /// [`DecodedRecord::Variable`]. All other fields are carried over
+     /// unchanged.
+     pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+         let name = decoder.decode(&self.name).to_string();
+         let label = self
+             .label
+             .as_ref()
+             .map(|label| decoder.decode(label).to_string());
+         DecodedRecord::Variable(VariableRecord {
+             offsets: self.offsets,
+             width: self.width,
+             name,
+             print_format: self.print_format,
+             write_format: self.write_format,
+             missing_values: self.missing_values,
+             label,
+         })
+     }
+ }
+
+ /// Eight raw bytes of a value whose type (number or string) is not known.
+ #[derive(Copy, Clone)]
+ pub struct UntypedValue(pub [u8; 8]);
+
+ /// Shows the bytes both as a number and as a string, since the actual type
+ /// is unknown.
+ ///
+ /// The number is parsed in both byte orders and whichever prints shorter is
+ /// shown (little-endian on a tie). The string is cut at the first control
+ /// character.
+ impl Debug for UntypedValue {
+     fn fmt(&self, f: &mut Formatter) -> FmtResult {
+         let as_little: f64 = Endian::Little.parse(self.0);
+         let as_big: f64 = Endian::Big.parse(self.0);
+         let (little, big) = (format!("{as_little:?}"), format!("{as_big:?}"));
+         let number = if big.len() < little.len() { big } else { little };
+         write!(f, "{number}")?;
+
+         let decoded = default_decode(&self.0);
+         // NUL is itself a control character, so one test covers both.
+         let printable = decoded.split(char::is_control).next().unwrap();
+         write!(f, "{printable:?}")
+     }
+ }
+
+/// An owned byte string that carries no character encoding of its own.
+#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)]
+pub struct RawString(pub Vec<u8>);
+
+impl RawString {
+    /// Returns a string consisting of `n` ASCII space bytes.
+    pub fn spaces(n: usize) -> Self {
+        Self(vec![b' '; n])
+    }
+
+    /// Borrows the bytes as an [`EncodedStr`] tagged with `encoding`.
+    pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> {
+        EncodedStr::new(self.0.as_slice(), encoding)
+    }
+}
+
+impl From<Cow<'_, [u8]>> for RawString {
+    /// Takes ownership of the bytes, copying them only if they were borrowed.
+    fn from(value: Cow<'_, [u8]>) -> Self {
+        match value {
+            Cow::Borrowed(bytes) => Self(bytes.to_vec()),
+            Cow::Owned(bytes) => Self(bytes),
+        }
+    }
+}
+
+impl From<Vec<u8>> for RawString {
+    /// Wraps `source` without copying.
+    fn from(source: Vec<u8>) -> Self {
+        RawString(source)
+    }
+}
+
+impl From<&[u8]> for RawString {
+    /// Copies `source` into a new owned string.
+    fn from(source: &[u8]) -> Self {
+        RawString(source.to_vec())
+    }
+}
+
+impl Debug for RawString {
+    /// Formats the bytes as the quoted result of `default_decode`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let text = default_decode(self.0.as_slice());
+        write!(f, "{text:?}")
+    }
+}
+
+/// A fixed-size byte string of exactly `N` bytes with no associated encoding.
+#[derive(Copy, Clone)]
+pub struct RawStrArray<const N: usize>(pub [u8; N]);
+
+impl<const N: usize> From<[u8; N]> for RawStrArray<N> {
+    /// Wraps the array as-is.
+    fn from(source: [u8; N]) -> Self {
+        RawStrArray(source)
+    }
+}
+
+impl<const N: usize> Debug for RawStrArray<N> {
+    /// Formats the bytes as the quoted result of `default_decode`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let text = default_decode(&self.0);
+        write!(f, "{text:?}")
+    }
+}
+
+/// An owned string that is either bytes tagged with their encoding or
+/// UTF-8 text.
+#[derive(Clone, Debug)]
+pub enum EncodedString {
+    /// Bytes in `encoding`.
+    Encoded {
+        bytes: Vec<u8>,
+        encoding: &'static Encoding,
+    },
+    /// UTF-8 text.
+    Utf8 {
+        s: String,
+    },
+}
+
+impl EncodedString {
+    /// Returns a borrowed [`EncodedStr`] view of the same variant.
+    pub fn borrowed(&self) -> EncodedStr<'_> {
+        match self {
+            Self::Encoded { bytes, encoding } => EncodedStr::Encoded {
+                bytes: bytes.as_slice(),
+                encoding: *encoding,
+            },
+            Self::Utf8 { s } => EncodedStr::Utf8 { s: s.as_str() },
+        }
+    }
+}
+
+impl<'a> From<EncodedStr<'a>> for EncodedString {
+    /// Copies the borrowed data into an owned string of the same variant.
+    fn from(value: EncodedStr<'a>) -> Self {
+        match value {
+            EncodedStr::Encoded { bytes, encoding } => EncodedString::Encoded {
+                bytes: bytes.to_vec(),
+                encoding,
+            },
+            EncodedStr::Utf8 { s } => EncodedString::Utf8 { s: s.to_owned() },
+        }
+    }
+}
+
+/// A borrowed string that is either bytes tagged with their encoding or
+/// UTF-8 text.
+pub enum EncodedStr<'a> {
+    /// Bytes in `encoding`.
+    Encoded {
+        bytes: &'a [u8],
+        encoding: &'static Encoding,
+    },
+    /// UTF-8 text.
+    Utf8 {
+        s: &'a str,
+    },
+}
+
+impl<'a> EncodedStr<'a> {
+    /// Creates an [`EncodedStr::Encoded`] from `bytes` in `encoding`.
+    pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
+        Self::Encoded { bytes, encoding }
+    }
+
+    /// Returns the string as UTF-8, decoding from the stored encoding if
+    /// necessary.
+    pub fn as_str(&self) -> Cow<'_, str> {
+        match self {
+            EncodedStr::Encoded { bytes, encoding } => {
+                encoding.decode_without_bom_handling(bytes).0
+            }
+            EncodedStr::Utf8 { s } => Cow::from(*s),
+        }
+    }
+
+    /// Returns the underlying bytes without any recoding.
+    pub fn as_bytes(&self) -> &[u8] {
+        match self {
+            EncodedStr::Encoded { bytes, .. } => bytes,
+            EncodedStr::Utf8 { s } => s.as_bytes(),
+        }
+    }
+
+    /// Returns the string's bytes recoded into the target `encoding`.
+    ///
+    /// The original bytes are borrowed when neither decoding from the source
+    /// encoding nor encoding into the target changed them; otherwise a new
+    /// buffer is returned.
+    pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
+        match self {
+            // Bind the stored encoding as `source` so that it does not shadow
+            // the target `encoding` parameter.
+            EncodedStr::Encoded {
+                bytes,
+                encoding: source,
+            } => {
+                let utf8 = source.decode_without_bom_handling(bytes).0;
+                // `encode` borrows from `utf8`, so reduce its result to owned
+                // data (or `None` when it was a no-op) before moving `utf8`.
+                let recoded = match encoding.encode(&utf8).0 {
+                    Cow::Borrowed(_) => None,
+                    Cow::Owned(owned) => Some(owned),
+                };
+                match (recoded, utf8) {
+                    (Some(owned), _) => Cow::Owned(owned),
+                    // Neither step changed anything: the input bytes are
+                    // already the answer.
+                    (None, Cow::Borrowed(_)) => Cow::Borrowed(*bytes),
+                    // Decoding changed the bytes but encoding was a no-op,
+                    // so the UTF-8 text itself is the answer.
+                    (None, Cow::Owned(text)) => Cow::Owned(text.into_bytes()),
+                }
+            }
+            EncodedStr::Utf8 { s } => encoding.encode(s).0,
+        }
+    }
+
+    /// Returns true if the string contains no bytes.
+    pub fn is_empty(&self) -> bool {
+        match self {
+            EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
+            EncodedStr::Utf8 { s } => s.is_empty(),
+        }
+    }
+
+    /// Returns a wrapper whose `Display` output is the quoted, escaped string.
+    pub fn quoted(&self) -> QuotedEncodedStr {
+        QuotedEncodedStr(self)
+    }
+}
+
+impl<'a> From<&'a str> for EncodedStr<'a> {
+    fn from(s: &'a str) -> Self {
+        Self::Utf8 { s }
+    }
+}
+
+impl<'a> From<&'a String> for EncodedStr<'a> {
+    fn from(s: &'a String) -> Self {
+        Self::Utf8 { s: s.as_str() }
+    }
+}
+
+/// Displays an [`EncodedStr`] in quoted (`Debug`-style) form.
+pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>);
+
+impl Display for QuotedEncodedStr<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.0.as_str())
+    }
+}
+
+/// A value paired with its label.
+#[derive(Clone, Debug)]
+pub struct ValueLabel<V, S>
+where
+ V: Debug,
+ S: Debug,
+{
+ /// The value being labeled.
+ pub value: Value<V>,
+ /// The label attached to `value`.
+ pub label: S,
+}
+
+/// A set of value labels together with the variables they apply to.
+#[derive(Clone)]
+pub struct ValueLabelRecord<V, S>
+where
+    V: Debug,
+    S: Debug,
+{
+    /// Range of offsets in file.
+    pub offsets: Range<u64>,
+
+    /// The labels.
+    pub labels: Vec<ValueLabel<V, S>>,
+
+    /// The 1-based dictionary indexes of the variables.
+    pub dict_indexes: Vec<u32>,
+
+    /// The type of the variables.
+    pub var_type: VarType,
+}
+
+impl<V, S> Debug for ValueLabelRecord<V, S>
+where
+    V: Debug,
+    S: Debug,
+{
+    /// Lists each label on its own line, then the variable type and the
+    /// dictionary indexes the labels apply to.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        writeln!(f, "labels: ")?;
+        self.labels
+            .iter()
+            .try_for_each(|label| writeln!(f, "{label:?}"))?;
+        write!(f, "apply to {} variables", self.var_type)?;
+        self.dict_indexes
+            .iter()
+            .try_for_each(|dict_index| write!(f, " #{dict_index}"))
+    }
+}
+
+impl<V, S> ValueLabelRecord<V, S>
+where
+    V: Debug,
+    S: Debug,
+{
+    /// Maximum number of value labels in a record.
+    pub const MAX_LABELS: u32 = u32::MAX / 8;
+
+    /// Maximum number of variable indexes in a record.
+    pub const MAX_INDEXES: u32 = u32::MAX / 8;
+}
+
+impl ValueLabelRecord<RawStrArray<8>, RawString> {
+    /// Reads a value label record and the variable index record that must
+    /// follow it.
+    ///
+    /// Returns `Ok(None)`, after issuing a warning through `warn` where one
+    /// applies, when no usable variable index remains.  Indexes rejected by
+    /// `var_types` and indexes whose variable type differs from that of the
+    /// first valid index are dropped with a warning.
+    ///
+    /// # Errors
+    ///
+    /// Fails on I/O errors, when either count exceeds its maximum, or when
+    /// the record following the labels does not have type 4.
+    fn read<R: Read + Seek>(
+        r: &mut R,
+        endian: Endian,
+        var_types: &VarTypes,
+        warn: &dyn Fn(Warning),
+    ) -> Result<Option<Record>, Error> {
+        // The index count comes straight from the file and may be as large as
+        // `MAX_INDEXES`, so never reserve more than this up front; the vector
+        // still grows as real data is read.
+        const MAX_PREALLOC: usize = 1024;
+
+        let label_offset = r.stream_position()?;
+        let n: u32 = endian.parse(read_bytes(r)?);
+        if n > Self::MAX_LABELS {
+            return Err(Error::BadNumberOfValueLabels {
+                offset: label_offset,
+                n,
+                max: Self::MAX_LABELS,
+            });
+        }
+
+        let mut labels = Vec::new();
+        for _ in 0..n {
+            let value = UntypedValue(read_bytes(r)?);
+            let label_len: u8 = endian.parse(read_bytes(r)?);
+            let label_len = label_len as usize;
+            // The length byte plus the label are padded to a multiple of 8.
+            let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
+
+            let mut label = read_vec(r, padded_len - 1)?;
+            label.truncate(label_len);
+            labels.push((value, RawString(label)));
+        }
+
+        let index_offset = r.stream_position()?;
+        let rec_type: u32 = endian.parse(read_bytes(r)?);
+        if rec_type != 4 {
+            return Err(Error::ExpectedVarIndexRecord {
+                offset: index_offset,
+                rec_type,
+            });
+        }
+
+        let n: u32 = endian.parse(read_bytes(r)?);
+        if n > Self::MAX_INDEXES {
+            return Err(Error::TooManyVarIndexes {
+                offset: index_offset,
+                n,
+                max: Self::MAX_INDEXES,
+            });
+        } else if n == 0 {
+            warn(Warning::NoVarIndexes {
+                offset: index_offset,
+            });
+            return Ok(None);
+        }
+
+        let index_offset = r.stream_position()?;
+        let mut dict_indexes = Vec::with_capacity((n as usize).min(MAX_PREALLOC));
+        let mut invalid_indexes = Vec::new();
+        for _ in 0..n {
+            let index: u32 = endian.parse(read_bytes(r)?);
+            if var_types.is_valid_index(index as usize) {
+                dict_indexes.push(index);
+            } else {
+                invalid_indexes.push(index);
+            }
+        }
+        if !invalid_indexes.is_empty() {
+            warn(Warning::InvalidVarIndexes {
+                offset: index_offset,
+                max: var_types.n_values(),
+                invalid: invalid_indexes,
+            });
+        }
+
+        let Some(&first_index) = dict_indexes.first() else {
+            return Ok(None);
+        };
+        // NOTE(review): assumes `is_valid_index` implies the type entry is
+        // `Some`; otherwise this `unwrap` panics.
+        let var_type = var_types.types[first_index as usize - 1].unwrap();
+        let mut wrong_type_indexes = Vec::new();
+        dict_indexes.retain(|&index| {
+            if var_types.types[index as usize - 1] != Some(var_type) {
+                wrong_type_indexes.push(index);
+                false
+            } else {
+                true
+            }
+        });
+        if !wrong_type_indexes.is_empty() {
+            warn(Warning::MixedVarTypes {
+                offset: index_offset,
+                var_type,
+                wrong_types: wrong_type_indexes,
+            });
+        }
+
+        // The values can only be interpreted now that the type is known.
+        let labels = labels
+            .into_iter()
+            .map(|(value, label)| ValueLabel {
+                value: Value::from_raw(&value, var_type, endian),
+                label,
+            })
+            .collect();
+
+        let end_offset = r.stream_position()?;
+        Ok(Some(Record::ValueLabel(ValueLabelRecord {
+            offsets: label_offset..end_offset,
+            labels,
+            dict_indexes,
+            var_type,
+        })))
+    }
+
+    /// Decodes every label with `decoder`, leaving values, offsets, indexes
+    /// and variable type unchanged.
+    fn decode(self, decoder: &Decoder) -> ValueLabelRecord<RawStrArray<8>, String> {
+        // `self` is owned, so move its fields instead of cloning them.
+        let labels = self
+            .labels
+            .into_iter()
+            .map(|ValueLabel { value, label }| ValueLabel {
+                value,
+                label: decoder.decode(&label).to_string(),
+            })
+            .collect();
+        ValueLabelRecord {
+            offsets: self.offsets,
+            labels,
+            dict_indexes: self.dict_indexes,
+            var_type: self.var_type,
+        }
+    }
+}
+
+/// A document record: free-form text attached to the file, stored as lines.
+#[derive(Clone, Debug)]
+pub struct DocumentRecord<S>
+where
+    S: Debug,
+{
+    /// Range of offsets in file.
+    pub offsets: Range<u64>,
+
+    /// The document, as an array of lines.  Raw lines are exactly 80 bytes long
+    /// and are right-padded with spaces without any new-line termination.
+    pub lines: Vec<S>,
+}
+
+/// One undecoded document line.
+pub type RawDocumentLine = RawStrArray<DOC_LINE_LEN>;
+
+/// Length of a line in a document.  Document lines are fixed-length and
+/// padded on the right with spaces.
+pub const DOC_LINE_LEN: usize = 80;
+
+impl DocumentRecord<RawDocumentLine> {
+    /// Maximum number of lines we will accept in a document.  This is simply
+    /// the maximum number that will fit in a 32-bit space.
+    pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
+
+    /// Reads a document record: a 32-bit line count followed by that many
+    /// fixed-length lines.
+    ///
+    /// # Errors
+    ///
+    /// Fails on I/O errors or when the line count exceeds
+    /// [`Self::MAX_LINES`].
+    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
+        // The line count is untrusted and may be close to `MAX_LINES`, so
+        // never reserve more than this many lines up front; the vector still
+        // grows as lines are actually read.
+        const MAX_PREALLOC: usize = 1024;
+
+        let start_offset = r.stream_position()?;
+        let n: u32 = endian.parse(read_bytes(r)?);
+        let n = n as usize;
+        if n > Self::MAX_LINES {
+            return Err(Error::BadDocumentLength {
+                offset: start_offset,
+                n,
+                max: Self::MAX_LINES,
+            });
+        }
+
+        let mut lines = Vec::with_capacity(n.min(MAX_PREALLOC));
+        for _ in 0..n {
+            lines.push(RawStrArray(read_bytes(r)?));
+        }
+        let end_offset = r.stream_position()?;
+        Ok(Record::Document(DocumentRecord {
+            offsets: start_offset..end_offset,
+            lines,
+        }))
+    }
+
+    /// Decodes every line with `decoder`, producing a
+    /// [`DecodedRecord::Document`].
+    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        let lines = self
+            .lines
+            .iter()
+            .map(|line| decoder.decode_slice(&line.0).to_string())
+            .collect();
+        DecodedRecord::Document(DocumentRecord {
+            // `self` is owned, so the range can be moved rather than cloned.
+            offsets: self.offsets,
+            lines,
+        })
+    }
+}
+
+/// Describes one kind of extension record and how to parse it from a
+/// generic [`Extension`].
+trait ExtensionRecord {
+ /// Extension subtype that selects this record kind.
+ const SUBTYPE: u32;
+ /// Required size of each data element, or `None` if any size is accepted.
+ const SIZE: Option<u32>;
+ /// Required number of data elements, or `None` if any count is accepted.
+ const COUNT: Option<u32>;
+ /// Human-readable record name, used in size/count warnings.
+ const NAME: &'static str;
+ /// Parses `ext` into a [`Record`], reporting malformed input as a `Warning`.
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning>;
+}
+
+/// The integer info extension record: eight 32-bit integers describing the
+/// file.
+#[derive(Clone, Debug)]
+pub struct IntegerInfoRecord {
+    /// Range of file offsets of the extension's data.
+    pub offsets: Range<u64>,
+    /// The first three integers of the record.
+    pub version: (i32, i32, i32),
+    pub machine_code: i32,
+    pub floating_point_rep: i32,
+    pub compression_code: i32,
+    pub endianness: i32,
+    pub character_code: i32,
+}
+
+impl ExtensionRecord for IntegerInfoRecord {
+    const SUBTYPE: u32 = 3;
+    const SIZE: Option<u32> = Some(4);
+    const COUNT: Option<u32> = Some(8);
+    const NAME: &'static str = "integer record";
+
+    /// Parses the eight integers of the record, in file order.
+    ///
+    /// A wrong element size or count, or data shorter than expected, is
+    /// returned as a `Warning` rather than causing a panic.
+    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        let mut data = [0i32; 8];
+        for field in &mut data {
+            // Propagate short reads like the other extension parsers do,
+            // instead of unwrapping.
+            *field = endian.parse(read_bytes(&mut input)?);
+        }
+        Ok(Record::IntegerInfo(IntegerInfoRecord {
+            offsets: ext.offsets.clone(),
+            version: (data[0], data[1], data[2]),
+            machine_code: data[3],
+            floating_point_rep: data[4],
+            compression_code: data[5],
+            endianness: data[6],
+            character_code: data[7],
+        }))
+    }
+}
+
+/// The floating point info extension record: three 64-bit floats.
+#[derive(Clone, Debug)]
+pub struct FloatInfoRecord {
+    /// First value in the record.
+    pub sysmis: f64,
+    /// Second value in the record.
+    pub highest: f64,
+    /// Third value in the record.
+    pub lowest: f64,
+}
+
+impl ExtensionRecord for FloatInfoRecord {
+    const SUBTYPE: u32 = 4;
+    const SIZE: Option<u32> = Some(8);
+    const COUNT: Option<u32> = Some(3);
+    const NAME: &'static str = "floating point record";
+
+    /// Parses the three floating-point values of the record, in file order.
+    ///
+    /// A wrong element size or count, or data shorter than expected, is
+    /// returned as a `Warning` rather than causing a panic.
+    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        let mut data = [0f64; 3];
+        for field in &mut data {
+            // Propagate short reads like the other extension parsers do,
+            // instead of unwrapping.
+            *field = endian.parse(read_bytes(&mut input)?);
+        }
+        Ok(Record::FloatInfo(FloatInfoRecord {
+            sysmis: data[0],
+            highest: data[1],
+            lowest: data[2],
+        }))
+    }
+}
+
+/// Category-label mode carried by a multiple dichotomy set.
+///
+/// NOTE(review): presumably this selects where the set's category labels
+/// come from; confirm against the file format documentation.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum CategoryLabels {
+ /// Used for `D` sets and for `E` sets flagged with ` 11 `.
+ VarLabels,
+ /// Used for `E` sets flagged with ` 1 `.
+ CountedValues,
+}
+
+/// The kind of a multiple response set.
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+    /// A multiple dichotomy set with its counted `value` and label mode.
+    MultipleDichotomy {
+        value: RawString,
+        labels: CategoryLabels,
+    },
+    /// A multiple category set.
+    MultipleCategory,
+}
+
+impl MultipleResponseType {
+    /// Parses the type at the start of `input`, returning it along with the
+    /// unparsed remainder.
+    ///
+    /// The first byte selects the kind: `C` (category), `D` (dichotomy), or
+    /// `E` (dichotomy with a ` 1 ` or ` 11 ` flag before the counted value).
+    fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
+        let Some((&tag, rest)) = input.split_first() else {
+            return Err(Warning::TBD);
+        };
+        match tag {
+            b'C' => Ok((Self::MultipleCategory, rest)),
+            b'D' => {
+                let (value, rest) = parse_counted_string(rest)?;
+                let labels = CategoryLabels::VarLabels;
+                Ok((Self::MultipleDichotomy { value, labels }, rest))
+            }
+            b'E' => {
+                let (labels, rest) = match rest.strip_prefix(b" 1 ") {
+                    Some(rest) => (CategoryLabels::CountedValues, rest),
+                    None => match rest.strip_prefix(b" 11 ") {
+                        Some(rest) => (CategoryLabels::VarLabels, rest),
+                        None => return Err(Warning::TBD),
+                    },
+                };
+                let (value, rest) = parse_counted_string(rest)?;
+                Ok((Self::MultipleDichotomy { value, labels }, rest))
+            }
+            _ => Err(Warning::TBD),
+        }
+    }
+}
+
+/// One multiple response set: its name, label, type, and member variables.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet<I, S>
+where
+    I: Debug,
+    S: Debug,
+{
+    /// Name of the set.
+    pub name: I,
+    /// Label of the set.
+    pub label: S,
+    /// Kind of set.
+    pub mr_type: MultipleResponseType,
+    /// Short names of the variables in the set.
+    pub short_names: Vec<I>,
+}
+
+impl MultipleResponseSet<RawString, RawString> {
+    /// Parses one set from the start of `input`, returning it and the
+    /// remaining input with the terminating newlines consumed.
+    ///
+    /// The syntax is `NAME=TYPE LABEL VAR...\n`, where `LABEL` is a counted
+    /// string and each variable name is preceded by a space.
+    fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
+        let Some(equals) = input.iter().position(|&b| b == b'=') else {
+            return Err(Warning::TBD);
+        };
+        // Split around the `=` and drop it: the type parser expects its input
+        // to start with the type byte, not with the separator.
+        let (name, input) = (&input[..equals], &input[equals + 1..]);
+        let (mr_type, input) = MultipleResponseType::parse(input)?;
+        let Some(input) = input.strip_prefix(b" ") else {
+            return Err(Warning::TBD);
+        };
+        let (label, mut input) = parse_counted_string(input)?;
+        let mut vars = Vec::new();
+        while input.first() != Some(&b'\n') {
+            match input.split_first() {
+                Some((b' ', rest)) => {
+                    // A name runs up to the next space or newline; running
+                    // out of input before either is an error.
+                    let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
+                        return Err(Warning::TBD);
+                    };
+                    let (var, rest) = rest.split_at(length);
+                    // Consecutive spaces yield empty names, which are skipped.
+                    if !var.is_empty() {
+                        vars.push(var.into());
+                    }
+                    input = rest;
+                }
+                _ => return Err(Warning::TBD),
+            }
+        }
+        while input.first() == Some(&b'\n') {
+            input = &input[1..];
+        }
+        Ok((
+            MultipleResponseSet {
+                name: name.into(),
+                label,
+                mr_type,
+                short_names: vars,
+            },
+            input,
+        ))
+    }
+
+    /// Decodes the set's name, label, and variable names with `decoder`.
+    ///
+    /// Variable names that are not valid identifiers are dropped with a
+    /// warning; an invalid set name makes the whole set fail.
+    fn decode(
+        &self,
+        decoder: &Decoder,
+    ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
+        // NOTE(review): going by the variant names, member variables report
+        // `InvalidMrSetVariableName` and the set's own name reports
+        // `InvalidMrSetName`; confirm against the warning messages.
+        let mut short_names = Vec::with_capacity(self.short_names.len());
+        for short_name in self.short_names.iter() {
+            if let Some(short_name) = decoder
+                .decode_identifier(short_name)
+                .map_err(Warning::InvalidMrSetVariableName)
+                .issue_warning(&decoder.warn)
+            {
+                short_names.push(short_name);
+            }
+        }
+        Ok(MultipleResponseSet {
+            name: decoder
+                .decode_identifier(&self.name)
+                .map_err(Warning::InvalidMrSetName)?,
+            label: decoder.decode(&self.label).to_string(),
+            mr_type: self.mr_type.clone(),
+            short_names,
+        })
+    }
+}
+
+/// A multiple response set record: a list of sets.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
+where
+    I: Debug,
+    S: Debug;
+
+impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
+    const SUBTYPE: u32 = 7;
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "multiple response set record";
+
+    /// Parses sets back to back until the data is exhausted; the first set
+    /// that fails to parse fails the whole record.
+    fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
+        ext.check_size::<Self>()?;
+
+        let mut sets = Vec::new();
+        let mut remaining = ext.data.as_slice();
+        loop {
+            if remaining.is_empty() {
+                break;
+            }
+            let (set, rest) = MultipleResponseSet::parse(remaining)?;
+            sets.push(set);
+            remaining = rest;
+        }
+        Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
+    }
+}
+
+impl MultipleResponseRecord<RawString, RawString> {
+    /// Decodes every set with `decoder`; sets that fail to decode are
+    /// reported as warnings and left out.
+    fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        let sets = self
+            .0
+            .iter()
+            .filter_map(|set| set.decode(decoder).issue_warning(&decoder.warn))
+            .collect();
+        DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
+    }
+}
+
+/// Parses a counted string of the form `LEN STRING`, where `LEN` is a decimal
+/// byte count followed by a single space.
+///
+/// Returns the `LEN` bytes that follow the space and the input left after
+/// them.  Fails if there is no space, `LEN` is not a valid number, or fewer
+/// than `LEN` bytes remain.
+fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
+    let space = input
+        .iter()
+        .position(|&b| b == b' ')
+        .ok_or(Warning::TBD)?;
+    let length: usize = from_utf8(&input[..space])
+        .ok()
+        .and_then(|digits| digits.parse().ok())
+        .ok_or(Warning::TBD)?;
+
+    let payload = &input[space + 1..];
+    if length > payload.len() {
+        return Err(Warning::TBD);
+    }
+
+    let (string, rest) = payload.split_at(length);
+    Ok((RawString::from(string), rest))
+}
+
+/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement).
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Measure {
+    /// Nominal values can only be compared for equality.
+    Nominal,
+
+    /// Ordinal values can be meaningfully ordered.
+    Ordinal,
+
+    /// Scale values can be meaningfully compared for the degree of difference.
+    Scale,
+}
+
+impl Measure {
+    /// Returns the default measure for `var_type`: nominal for strings, and
+    /// none for numeric variables.
+    pub fn default_for_type(var_type: VarType) -> Option<Measure> {
+        match var_type {
+            VarType::String => Some(Measure::Nominal),
+            VarType::Numeric => None,
+        }
+    }
+
+    /// Decodes a raw measure code: 0 means unspecified, 1 through 3 map to
+    /// the variants in declaration order, anything else is a warning.
+    fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
+        let measure = match source {
+            0 => None,
+            1 => Some(Self::Nominal),
+            2 => Some(Self::Ordinal),
+            3 => Some(Self::Scale),
+            _ => return Err(Warning::InvalidMeasurement(source)),
+        };
+        Ok(measure)
+    }
+}
+
+/// Display alignment of a variable.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Alignment {
+    Left,
+    Right,
+    Center,
+}
+
+impl Alignment {
+    /// Decodes a raw alignment code: 0 means unspecified, 1 through 3 map to
+    /// the variants in declaration order, anything else is a warning.
+    fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
+        let alignment = match source {
+            0 => None,
+            1 => Some(Self::Left),
+            2 => Some(Self::Right),
+            3 => Some(Self::Center),
+            _ => return Err(Warning::InvalidAlignment(source)),
+        };
+        Ok(alignment)
+    }
+
+    /// Returns the default alignment for `var_type`: right for numeric
+    /// variables, left for strings.
+    pub fn default_for_type(var_type: VarType) -> Self {
+        match var_type {
+            VarType::String => Alignment::Left,
+            VarType::Numeric => Alignment::Right,
+        }
+    }
+}
+
+/// Display settings for one variable.
+#[derive(Clone, Debug)]
+pub struct VarDisplay {
+    /// Measurement level, if specified.
+    pub measure: Option<Measure>,
+    /// Display width, if the record includes widths.
+    pub width: Option<u32>,
+    /// Alignment, if specified.
+    pub alignment: Option<Alignment>,
+}
+
+/// A variable display record: one [`VarDisplay`] per variable.
+#[derive(Clone, Debug)]
+pub struct VarDisplayRecord(pub Vec<VarDisplay>);
+
+impl VarDisplayRecord {
+    const SUBTYPE: u32 = 11;
+
+    /// Parses display settings for `n_vars` variables.
+    ///
+    /// The record holds either three values per variable (measure, width,
+    /// alignment) or two (measure, alignment); which one is inferred from the
+    /// element count.  Invalid measure or alignment codes are reported
+    /// through `warn` and treated as unspecified.
+    ///
+    /// A wrong element size or count, or data shorter than expected, is
+    /// returned as a `Warning` rather than causing a panic.
+    fn parse(
+        ext: &Extension,
+        n_vars: usize,
+        endian: Endian,
+        warn: &dyn Fn(Warning),
+    ) -> Result<Record, Warning> {
+        if ext.size != 4 {
+            return Err(Warning::BadRecordSize {
+                offset: ext.offsets.start,
+                record: String::from("variable display record"),
+                size: ext.size,
+                expected_size: 4,
+            });
+        }
+
+        let has_width = if ext.count as usize == 3 * n_vars {
+            true
+        } else if ext.count as usize == 2 * n_vars {
+            false
+        } else {
+            return Err(Warning::TBD);
+        };
+
+        let mut var_displays = Vec::new();
+        let mut input = &ext.data[..];
+        for _ in 0..n_vars {
+            // Short reads are propagated with `?`, like the other extension
+            // parsers do, instead of unwrapping.
+            let measure = Measure::try_decode(endian.parse(read_bytes(&mut input)?))
+                .issue_warning(&warn)
+                .flatten();
+            let width = if has_width {
+                Some(endian.parse(read_bytes(&mut input)?))
+            } else {
+                None
+            };
+            let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input)?))
+                .issue_warning(&warn)
+                .flatten();
+            var_displays.push(VarDisplay {
+                measure,
+                width,
+                alignment,
+            });
+        }
+        Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
+    }
+}
+
+/// The missing values of one long string variable.
+#[derive(Clone, Debug)]
+pub struct LongStringMissingValues<N>
+where
+    N: Debug,
+{
+    /// Variable name.
+    pub var_name: N,
+
+    /// Missing values.
+    pub missing_values: Vec<RawStrArray<8>>,
+}
+
+impl LongStringMissingValues<RawString> {
+    /// Decodes the variable name into an identifier; the missing values are
+    /// copied unchanged.
+    fn decode(&self, decoder: &Decoder) -> Result<LongStringMissingValues<Identifier>, IdError> {
+        let var_name = decoder.decode_identifier(&self.var_name)?;
+        let missing_values = self.missing_values.clone();
+        Ok(LongStringMissingValues {
+            var_name,
+            missing_values,
+        })
+    }
+}
+
+/// A long string missing values record: missing values for any number of
+/// long string variables.
+#[derive(Clone, Debug)]
+pub struct LongStringMissingValueRecord<N>(pub Vec<LongStringMissingValues<N>>)
+where
+    N: Debug;
+
+impl ExtensionRecord for LongStringMissingValueRecord<RawString> {
+    const SUBTYPE: u32 = 22;
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "long string missing values record";
+
+    /// Parses entries until the data is exhausted.
+    ///
+    /// Each entry is a variable name, a one-byte count of missing values, a
+    /// 32-bit value length that must be 8, and then that many 8-byte values.
+    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+        ext.check_size::<Self>()?;
+
+        let mut input = &ext.data[..];
+        let mut missing_value_set = Vec::new();
+        while !input.is_empty() {
+            let var_name = read_string(&mut input, endian)?;
+            let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
+            let value_len: u32 = endian.parse(read_bytes(&mut input)?);
+            if value_len != 8 {
+                // Report where the length field starts: it is the 4 bytes
+                // that were just consumed.
+                let offset = (ext.data.len() - input.len() - 4) as u64 + ext.offsets.start;
+                return Err(Warning::BadLongMissingValueLength {
+                    record_offset: ext.offsets.start,
+                    offset,
+                    value_len,
+                });
+            }
+            let mut missing_values = Vec::with_capacity(usize::from(n_missing_values));
+            for i in 0..n_missing_values {
+                if i > 0 {
+                    // Tolerate files written by old, buggy versions of PSPP
+                    // where we believed that the value_length was repeated
+                    // before each missing value.  The length is a 4-byte
+                    // field, so peek at 4 bytes and skip only those.
+                    if let Some(prefix) = input.first_chunk::<4>() {
+                        let repeated_len: u32 = endian.parse(*prefix);
+                        if repeated_len == 8 {
+                            input = &input[4..];
+                        }
+                    }
+                }
+                missing_values.push(RawStrArray(read_bytes(&mut input)?));
+            }
+            missing_value_set.push(LongStringMissingValues {
+                var_name,
+                missing_values,
+            });
+        }
+        Ok(Record::LongStringMissingValues(
+            LongStringMissingValueRecord(missing_value_set),
+        ))
+    }
+}
+
+impl LongStringMissingValueRecord<RawString> {
+    /// Decodes every entry's variable name with `decoder`; entries whose name
+    /// is not a valid identifier are reported as warnings and left out.
+    pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord<Identifier> {
+        let mut mvs = Vec::with_capacity(self.0.len());
+        for mv in self.0.iter() {
+            if let Some(mv) = mv
+                .decode(decoder)
+                .map_err(Warning::InvalidLongStringMissingValueVariableName)
+                .issue_warning(&decoder.warn)
+            {
+                mvs.push(mv);
+            }
+        }
+        LongStringMissingValueRecord(mvs)
+    }
+}
+
+/// The character encoding record, holding the encoding's name.
+#[derive(Clone, Debug)]
+pub struct EncodingRecord(pub String);
+
+impl ExtensionRecord for EncodingRecord {
+    const SUBTYPE: u32 = 20;
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "encoding record";
+
+    /// Interprets the record's data as a UTF-8 encoding name, warning if the
+    /// data is not valid UTF-8.
+    fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
+        ext.check_size::<Self>()?;
+
+        let name = match String::from_utf8(ext.data.clone()) {
+            Ok(name) => name,
+            Err(_) => {
+                return Err(Warning::BadEncodingName {
+                    offset: ext.offsets.start,
+                })
+            }
+        };
+        Ok(Record::Encoding(EncodingRecord(name)))
+    }
+}
+
+/// The extended number of cases record.
+#[derive(Clone, Debug)]
+pub struct NumberOfCasesRecord {
+    /// Always observed as 1.
+    pub one: u64,
+
+    /// Number of cases.
+    pub n_cases: u64,
+}
+
+impl ExtensionRecord for NumberOfCasesRecord {
+    const SUBTYPE: u32 = 16;
+    const SIZE: Option<u32> = Some(8);
+    const COUNT: Option<u32> = Some(2);
+    const NAME: &'static str = "extended number of cases record";
+
+    /// Parses the record's two 64-bit values, `one` followed by `n_cases`.
+    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+        ext.check_size::<Self>()?;
+
+        let mut remaining = ext.data.as_slice();
+        let one: u64 = endian.parse(read_bytes(&mut remaining)?);
+        let n_cases: u64 = endian.parse(read_bytes(&mut remaining)?);
+        let record = NumberOfCasesRecord { one, n_cases };
+
+        Ok(Record::NumberOfCases(record))
+    }
+}
+
+/// An extension record whose content is text that is interpreted only once
+/// it is decoded.
+#[derive(Clone, Debug)]
+pub struct TextRecord {
+    /// Range of file offsets of the record's data.
+    pub offsets: Range<u64>,
+
+    /// Type of record.
+    pub rec_type: TextRecordType,
+
+    /// The text content of the record.
+    pub text: RawString,
+}
+
+/// The kinds of text record; each maps to one decoded record type.
+#[derive(Clone, Copy, Debug)]
+pub enum TextRecordType {
+    VariableSets,
+    ProductInfo,
+    LongNames,
+    VeryLongStrings,
+    FileAttributes,
+    VariableAttributes,
+}
+
+impl TextRecord {
+    /// Wraps the data of `extension` as a text record of type `rec_type`.
+    fn new(extension: Extension, rec_type: TextRecordType) -> Self {
+        let Extension { offsets, data, .. } = extension;
+        Self {
+            offsets,
+            rec_type,
+            text: RawString(data),
+        }
+    }
+
+    /// Decodes the text and parses it according to the record's type.
+    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        use TextRecordType as Kind;
+
+        let source = &self;
+        match self.rec_type {
+            Kind::VariableSets => {
+                DecodedRecord::VariableSets(VariableSetRecord::decode(source, decoder))
+            }
+            Kind::ProductInfo => {
+                DecodedRecord::ProductInfo(ProductInfoRecord::decode(source, decoder))
+            }
+            Kind::LongNames => DecodedRecord::LongNames(LongNamesRecord::decode(source, decoder)),
+            Kind::VeryLongStrings => {
+                DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(source, decoder))
+            }
+            Kind::FileAttributes => {
+                DecodedRecord::FileAttributes(FileAttributeRecord::decode(source, decoder))
+            }
+            Kind::VariableAttributes => {
+                DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(source, decoder))
+            }
+        }
+    }
+}
+
+/// One entry of a very long strings record: a short variable name and a
+/// length.
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+    pub short_name: Identifier,
+    pub length: u16,
+}
+
+impl VeryLongString {
+    /// Parses a `NAME=LENGTH` pair, where `NAME` must be an ordinary
+    /// identifier and `LENGTH` must fit in a `u16`.
+    fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
+        let (raw_name, raw_length) = input.split_once('=').ok_or(Warning::TBD)?;
+        let short_name = decoder
+            .new_identifier(raw_name)
+            .and_then(Identifier::must_be_ordinary)
+            .map_err(Warning::InvalidLongStringName)?;
+        let length: u16 = match raw_length.parse() {
+            Ok(length) => length,
+            Err(_) => return Err(Warning::TBD),
+        };
+        Ok(VeryLongString { short_name, length })
+    }
+}
+
+/// A very long strings record: a list of [`VeryLongString`] entries.
+#[derive(Clone, Debug)]
+pub struct VeryLongStringsRecord(pub Vec<VeryLongString>);
+
+impl VeryLongStringsRecord {
+    /// Decodes the text of `source` and parses its entries.
+    ///
+    /// Entries are separated by NUL characters, with trailing tabs stripped
+    /// and empty entries skipped.  Entries that fail to parse are reported as
+    /// warnings and left out.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let input = decoder.decode(&source.text);
+        let very_long_strings = input
+            .split('\0')
+            .map(|entry| entry.trim_end_matches('\t'))
+            .filter(|entry| !entry.is_empty())
+            .filter_map(|entry| {
+                VeryLongString::parse(decoder, entry).issue_warning(&decoder.warn)
+            })
+            .collect();
+        VeryLongStringsRecord(very_long_strings)
+    }
+}
+
+/// A named attribute with one or more values.
+#[derive(Clone, Debug)]
+pub struct Attribute {
+    pub name: Identifier,
+    pub values: Vec<String>,
+}
+
+impl Attribute {
+    /// Parses one attribute of the form `NAME('VALUE'\n'VALUE'\n...)` from the
+    /// start of `input`, returning it and the text after the closing `)`.
+    ///
+    /// A value that is not wrapped in single quotes draws a warning and is
+    /// kept verbatim.
+    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
+        let (raw_name, mut remaining) = input.split_once('(').ok_or(Warning::TBD)?;
+        let name = decoder
+            .new_identifier(raw_name)
+            .map_err(Warning::InvalidAttributeName)?;
+
+        let mut values = Vec::new();
+        loop {
+            // Every value is terminated by a newline.
+            let (line, after) = remaining.split_once('\n').ok_or(Warning::TBD)?;
+            let unquoted = line
+                .strip_prefix('\'')
+                .and_then(|tail| tail.strip_suffix('\''));
+            match unquoted {
+                Some(inner) => values.push(inner.into()),
+                None => {
+                    decoder.warn(Warning::TBD);
+                    values.push(line.into());
+                }
+            }
+            // A `)` right after the newline ends the attribute.
+            match after.strip_prefix(')') {
+                Some(after) => return Ok((Attribute { name, values }, after)),
+                None => remaining = after,
+            }
+        }
+    }
+}
+
+impl Attributes {
+    /// Parses consecutive attributes from `input` until the input is
+    /// exhausted or, if `sentinel` is given, until `sentinel` is found where
+    /// the next attribute would start.
+    ///
+    /// Returns the attributes and the remaining input; a matched sentinel is
+    /// consumed.  If a name occurs more than once, the last occurrence wins.
+    fn parse<'a>(
+        decoder: &Decoder,
+        mut input: &'a str,
+        sentinel: Option<char>,
+    ) -> Result<(Attributes, &'a str), Warning> {
+        let mut attributes = HashMap::new();
+        while let Some(first) = input.chars().next() {
+            if Some(first) == sentinel {
+                // Skip the sentinel by its encoded length so that a
+                // multi-byte sentinel cannot split a character.
+                input = &input[first.len_utf8()..];
+                break;
+            }
+            let (attribute, rest) = Attribute::parse(decoder, input)?;
+            // XXX report duplicate name
+            attributes.insert(attribute.name, attribute.values);
+            input = rest;
+        }
+        Ok((Attributes(attributes), input))
+    }
+}
+
+/// A file attributes record: the attributes of the file as a whole.
+#[derive(Clone, Debug, Default)]
+pub struct FileAttributeRecord(pub Attributes);
+
+impl FileAttributeRecord {
+    /// Decodes and parses the attributes in `source`.
+    ///
+    /// A parse failure is reported as a warning and yields an empty record;
+    /// text left over after the attributes also draws a warning.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let input = decoder.decode(&source.text);
+        let Some((set, rest)) =
+            Attributes::parse(decoder, &input, None).issue_warning(&decoder.warn)
+        else {
+            return Self::default();
+        };
+        if !rest.is_empty() {
+            decoder.warn(Warning::TBD);
+        }
+        Self(set)
+    }
+}
+
+/// The attributes of one variable, identified by its long name.
+#[derive(Clone, Debug)]
+pub struct VarAttributes {
+    pub long_var_name: Identifier,
+    pub attributes: Attributes,
+}
+
+impl VarAttributes {
+    /// Parses `NAME:ATTRIBUTES` from the start of `input`, where the
+    /// attributes end at a `/` or at the end of the input.
+    ///
+    /// Returns the parsed attributes and the remaining input.
+    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributes, &'a str), Warning> {
+        let (raw_name, after_name) = input.split_once(':').ok_or(Warning::TBD)?;
+        let long_var_name = decoder
+            .new_identifier(raw_name)
+            .and_then(Identifier::must_be_ordinary)
+            .map_err(Warning::InvalidAttributeVariableName)?;
+        let (attributes, rest) = Attributes::parse(decoder, after_name, Some('/'))?;
+        Ok((
+            VarAttributes {
+                long_var_name,
+                attributes,
+            },
+            rest,
+        ))
+    }
+}
+
+/// A variable attributes record: attribute sets for any number of variables.
+#[derive(Clone, Debug)]
+pub struct VariableAttributeRecord(pub Vec<VarAttributes>);
+
+impl VariableAttributeRecord {
+    /// Decodes `source` and parses attribute sets until the text is
+    /// exhausted.
+    ///
+    /// The first set that fails to parse is reported as a warning and ends
+    /// parsing; the sets parsed before it are kept.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let decoded = decoder.decode(&source.text);
+        let mut remaining: &str = decoded.as_ref();
+        let mut var_attribute_sets = Vec::new();
+        loop {
+            if remaining.is_empty() {
+                break;
+            }
+            match VarAttributes::parse(decoder, remaining).issue_warning(&decoder.warn) {
+                Some((var_attribute, rest)) => {
+                    var_attribute_sets.push(var_attribute);
+                    remaining = rest;
+                }
+                None => break,
+            }
+        }
+        VariableAttributeRecord(var_attribute_sets)
+    }
+}
+
+/// A mapping from a variable's short name to its long name.
+#[derive(Clone, Debug)]
+pub struct LongName {
+    pub short_name: Identifier,
+    pub long_name: Identifier,
+}
+
+impl LongName {
+    /// Parses a `SHORT=LONG` pair; both names must be ordinary identifiers.
+    fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+        let (raw_short, raw_long) = input.split_once('=').ok_or(Warning::TBD)?;
+        // Validate the short name first so that its warning takes precedence.
+        let short_name = decoder
+            .new_identifier(raw_short)
+            .and_then(Identifier::must_be_ordinary)
+            .map_err(Warning::InvalidShortName)?;
+        let long_name = decoder
+            .new_identifier(raw_long)
+            .and_then(Identifier::must_be_ordinary)
+            .map_err(Warning::InvalidLongName)?;
+        Ok(Self {
+            short_name,
+            long_name,
+        })
+    }
+}
+
+/// A long variable names record: a list of short-to-long name mappings.
+#[derive(Clone, Debug)]
+pub struct LongNamesRecord(pub Vec<LongName>);
+
+impl LongNamesRecord {
+    /// Decodes `source` and parses its tab-separated `SHORT=LONG` pairs.
+    ///
+    /// Empty pairs are skipped; pairs that fail to parse are reported as
+    /// warnings and left out.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let input = decoder.decode(&source.text);
+        let names = input
+            .split('\t')
+            .filter(|pair| !pair.is_empty())
+            .filter_map(|pair| LongName::parse(pair, decoder).issue_warning(&decoder.warn))
+            .collect();
+        LongNamesRecord(names)
+    }
+}
+
+/// A product info record, holding its decoded text.
+#[derive(Clone, Debug)]
+pub struct ProductInfoRecord(pub String);
+
+impl ProductInfoRecord {
+    /// Decodes the text of `source` without further parsing.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let text = decoder.decode(&source.text);
+        ProductInfoRecord(String::from(text))
+    }
+}
+/// A named set of variables.
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+    pub name: Identifier,
+    pub variable_names: Vec<Identifier>,
+}
+
+impl VariableSet {
+    /// Parses `NAME=VAR VAR ...`, with variables separated by ASCII
+    /// whitespace.
+    ///
+    /// An invalid set name fails the whole set.  Variable names that are not
+    /// ordinary identifiers are reported as warnings and left out.
+    fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+        let Some((raw_name, members)) = input.split_once('=') else {
+            return Err(Warning::TBD);
+        };
+        let Ok(name) = decoder.new_identifier(raw_name) else {
+            return Err(Warning::TBD);
+        };
+        let variable_names = members
+            .split_ascii_whitespace()
+            .filter_map(|var| {
+                decoder
+                    .new_identifier(var)
+                    .and_then(Identifier::must_be_ordinary)
+                    .map_err(Warning::InvalidVariableSetName)
+                    .issue_warning(&decoder.warn)
+            })
+            .collect();
+        Ok(VariableSet {
+            name,
+            variable_names,
+        })
+    }
+}
+
+/// A variable sets record: a list of named variable sets.
+#[derive(Clone, Debug)]
+pub struct VariableSetRecord {
+    /// Range of file offsets of the record's data.
+    pub offsets: Range<u64>,
+    /// The sets, in file order.
+    pub sets: Vec<VariableSet>,
+}
+
+impl VariableSetRecord {
+    /// Decodes `source` and parses one variable set per line.
+    ///
+    /// Lines that fail to parse are reported as warnings and left out.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
+        let input = decoder.decode(&source.text);
+        let sets = input
+            .lines()
+            .filter_map(|line| VariableSet::parse(line, decoder).issue_warning(&decoder.warn))
+            .collect();
+        VariableSetRecord {
+            offsets: source.offsets.clone(),
+            sets,
+        }
+    }
+}
+
+trait IssueWarning<T> {
+ fn issue_warning<F>(self, warn: &F) -> Option<T>
+ where
+ F: Fn(Warning);
+}
+impl<T> IssueWarning<T> for Result<T, Warning> {
+ fn issue_warning<F>(self, warn: &F) -> Option<T>
+ where
+ F: Fn(Warning),
+ {
+ match self {
+ Ok(result) => Some(result),
+ Err(error) => {
+ warn(error);
+ None
+ }
+ }
+ }
+}
+
+/// A generic extension record: a subtype plus `count` elements of `size`
+/// bytes each, not yet interpreted.
+#[derive(Clone, Debug)]
+pub struct Extension {
+ /// Range of file offsets covered by `data`.
+ pub offsets: Range<u64>,
+
+ /// Record subtype.
+ pub subtype: u32,
+
+ /// Size of each data element.
+ pub size: u32,
+
+ /// Number of data elements.
+ pub count: u32,
+
+ /// `size * count` bytes of data.
+ pub data: Vec<u8>,
+}
+
+impl Extension {
+    /// Checks this extension's element size and count against the values
+    /// that record type `E` requires, if it requires any.
+    ///
+    /// The size is checked first; a mismatch is returned as a warning that
+    /// names the record type.
+    fn check_size<E: ExtensionRecord>(&self) -> Result<(), Warning> {
+        match E::SIZE {
+            Some(expected_size) if expected_size != self.size => {
+                return Err(Warning::BadRecordSize {
+                    offset: self.offsets.start,
+                    record: E::NAME.into(),
+                    size: self.size,
+                    expected_size,
+                });
+            }
+            _ => (),
+        }
+        match E::COUNT {
+            Some(expected_count) if expected_count != self.count => {
+                return Err(Warning::BadRecordCount {
+                    offset: self.offsets.start,
+                    record: E::NAME.into(),
+                    count: self.count,
+                    expected_count,
+                });
+            }
+            _ => (),
+        }
+        Ok(())
+    }
+
+ fn read<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ n_vars: usize,
+ warn: &dyn Fn(Warning),
+ ) -> Result<Option<Record>, Error> {
+ let subtype = endian.parse(read_bytes(r)?);
+ let header_offset = r.stream_position()?;
+ let size: u32 = endian.parse(read_bytes(r)?);
+ let count = endian.parse(read_bytes(r)?);
+ let Some(product) = size.checked_mul(count) else {
+ return Err(Error::ExtensionRecordTooLarge {
+ offset: header_offset,
+ subtype,
+ size,
+ count,
+ });
+ };
+ let start_offset = r.stream_position()?;
+ let data = read_vec(r, product as usize)?;
+ let end_offset = start_offset + product as u64;
+ let extension = Extension {
+ offsets: start_offset..end_offset,
+ subtype,
+ size,
+ count,
+ data,
+ };
+ let result = match subtype {
+ IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
+ FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
+ VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
+ MultipleResponseRecord::SUBTYPE | 19 => {
+ MultipleResponseRecord::parse(&extension, endian)
+ }
+ LongStringValueLabelRecord::SUBTYPE => {
+ LongStringValueLabelRecord::parse(&extension, endian)
+ }
+ EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
+ NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
+ 5 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::VariableSets,
+ ))),
+ 10 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::ProductInfo,
+ ))),
+ 13 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::LongNames,
+ ))),
+ 14 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::VeryLongStrings,
+ ))),
+ 17 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::FileAttributes,
+ ))),
+ 18 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::VariableAttributes,
+ ))),
+ _ => Ok(Record::OtherExtension(extension)),
+ };
+ match result {
+ Ok(result) => Ok(Some(result)),
+ Err(error) => {
+ warn(error);
+ Ok(None)
+ }
+ }
+ }
+}
+
+/// The ZLIB data header: three 64-bit fields read in file order, plus the
+/// offset at which they were found.
+#[derive(Clone, Debug)]
+pub struct ZHeader {
+ /// File offset to the start of the record.
+ pub offset: u64,
+
+ /// File offset to the ZLIB data header.
+ pub zheader_offset: u64,
+
+ /// File offset to the ZLIB trailer.
+ pub ztrailer_offset: u64,
+
+ /// Length of the ZLIB trailer in bytes.
+ pub ztrailer_len: u64,
+}
+
+impl ZHeader {
+    /// Reads a ZLIB header from the current position of `r`.
+    ///
+    /// Records the current stream position as `offset`, then reads three
+    /// consecutive 64-bit values in `endian` byte order.
+    ///
+    /// # Errors
+    ///
+    /// Fails if querying the stream position or reading from `r` fails.
+    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
+        // Struct fields are evaluated in source order, which is also the
+        // on-disk order.
+        Ok(ZHeader {
+            offset: r.stream_position()?,
+            zheader_offset: endian.parse(read_bytes(r)?),
+            ztrailer_offset: endian.parse(read_bytes(r)?),
+            ztrailer_len: endian.parse(read_bytes(r)?),
+        })
+    }
+}
+
+/// The ZLIB trailer: fixed header fields followed by one descriptor per
+/// compressed block.
+#[derive(Clone, Debug)]
+pub struct ZTrailer {
+ /// File offset to the start of the record.
+ pub offset: u64,
+
+ /// Compression bias as a negative integer, e.g. -100.
+ pub int_bias: i64,
+
+ /// Always observed as zero.
+ pub zero: u64,
+
+ /// Uncompressed size of each block, except possibly the last. Only
+ /// `0x3ff000` has been observed so far.
+ pub block_size: u32,
+
+ /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
+ pub blocks: Vec<ZBlock>,
+}
+
+/// Descriptor for one compressed data block, as listed in the ZLIB trailer.
+#[derive(Clone, Debug)]
+pub struct ZBlock {
+ /// Offset of block of data if simple compression were used.
+ pub uncompressed_ofs: u64,
+
+ /// Actual offset within the file of the compressed data block.
+ pub compressed_ofs: u64,
+
+ /// The number of bytes in this data block after decompression. This is
+ /// `block_size` in every data block but the last, which may be smaller.
+ pub uncompressed_size: u32,
+
+ /// The number of bytes in this data block, as stored compressed in this
+ /// file.
+ pub compressed_size: u32,
+}
+
+impl ZBlock {
+    /// Reads one block descriptor from `r`: two 64-bit offsets followed by
+    /// two 32-bit sizes, all in `endian` byte order.
+    ///
+    /// # Errors
+    ///
+    /// Fails if any read from `r` fails.
+    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
+        let uncompressed_ofs = endian.parse(read_bytes(r)?);
+        let compressed_ofs = endian.parse(read_bytes(r)?);
+        let uncompressed_size = endian.parse(read_bytes(r)?);
+        let compressed_size = endian.parse(read_bytes(r)?);
+        Ok(ZBlock {
+            uncompressed_ofs,
+            compressed_ofs,
+            uncompressed_size,
+            compressed_size,
+        })
+    }
+}
+
+impl ZTrailer {
+    /// Reads the ZLIB trailer located at `ztrailer_ofs`, then seeks `reader`
+    /// back to where it was before the call.
+    ///
+    /// `ztrailer_len` is the trailer length claimed by the ZLIB header; it
+    /// determines how many block descriptors are expected.
+    ///
+    /// Returns `Ok(None)` if seeking to `ztrailer_ofs` fails.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::BadZlibTrailerNBlocks` if the block count stored in the
+    /// trailer disagrees with the count implied by `ztrailer_len`, or an I/O
+    /// error if reading fails.  On error the reader is not repositioned.
+    fn read<R: Read + Seek>(
+        reader: &mut R,
+        endian: Endian,
+        ztrailer_ofs: u64,
+        ztrailer_len: u64,
+    ) -> Result<Option<ZTrailer>, Error> {
+        let start_offset = reader.stream_position()?;
+        if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
+            return Ok(None);
+        }
+        let int_bias = endian.parse(read_bytes(reader)?);
+        let zero = endian.parse(read_bytes(reader)?);
+        let block_size = endian.parse(read_bytes(reader)?);
+        let n_blocks: u32 = endian.parse(read_bytes(reader)?);
+
+        // `ztrailer_len` comes straight from the file, so it may be smaller
+        // than the 24-byte fixed part.  Saturate instead of underflowing; a
+        // too-short trailer then expects zero blocks and is rejected below
+        // unless it really claims none.
+        let expected_n_blocks = ztrailer_len.saturating_sub(24) / 24;
+        if u64::from(n_blocks) != expected_n_blocks {
+            return Err(Error::BadZlibTrailerNBlocks {
+                offset: ztrailer_ofs,
+                n_blocks,
+                expected_n_blocks,
+                ztrailer_len,
+            });
+        }
+        let blocks = (0..n_blocks)
+            .map(|_| ZBlock::read(reader, endian))
+            .collect::<Result<Vec<_>, _>>()?;
+        reader.seek(SeekFrom::Start(start_offset))?;
+        Ok(Some(ZTrailer {
+            offset: ztrailer_ofs,
+            int_bias,
+            zero,
+            block_size,
+            blocks,
+        }))
+    }
+}
+
+/// Reads exactly `N` bytes from `r`, or reports a clean end of input.
+///
+/// Returns `Ok(None)` if the very first read yields zero bytes (end of input
+/// before any byte of the value), and `Ok(Some(bytes))` once all `N` bytes
+/// have been read.
+///
+/// # Errors
+///
+/// Returns any I/O error from `r`, including `UnexpectedEof` if the input
+/// ends after at least one byte but before `N` bytes.
+fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
+    let mut buf = [0; N];
+    // `Read::read` may fail spuriously with `Interrupted`; retry as
+    // `read_exact` does, rather than surfacing it as a hard error.
+    let n = loop {
+        match r.read(&mut buf) {
+            Ok(n) => break n,
+            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
+            Err(error) => return Err(error),
+        }
+    };
+    if n == 0 {
+        return Ok(None);
+    }
+    if n < N {
+        r.read_exact(&mut buf[n..])?;
+    }
+    Ok(Some(buf))
+}
+
+/// Reads exactly `N` bytes from `r` into a fixed-size array.
+///
+/// # Errors
+///
+/// Returns the I/O error from `read_exact` if `N` bytes cannot be read.
+fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
+    let mut bytes = [0u8; N];
+    match r.read_exact(&mut bytes) {
+        Ok(()) => Ok(bytes),
+        Err(error) => Err(error),
+    }
+}
+
+/// Reads exactly `n` bytes from `r` into a new vector.
+///
+/// The vector grows as data arrives rather than being allocated up front, so
+/// a corrupt length field cannot force a huge allocation when the input is
+/// actually short.
+///
+/// # Errors
+///
+/// Returns any I/O error from `r`, or an `UnexpectedEof` error if the input
+/// ends before `n` bytes have been read.
+fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
+    let mut vec = Vec::new();
+    let n_read = r.by_ref().take(n as u64).read_to_end(&mut vec)?;
+    if n_read != n {
+        return Err(IoError::from(std::io::ErrorKind::UnexpectedEof));
+    }
+    Ok(vec)
+}
+
+/// Reads a length-prefixed byte string from `r`: a 32-bit length in `endian`
+/// byte order followed by that many bytes.
+///
+/// # Errors
+///
+/// Returns the I/O error if either the length or the payload cannot be read.
+fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
+    let length: u32 = endian.parse(read_bytes(r)?);
+    let bytes = read_vec(r, length as usize)?;
+    Ok(bytes.into())
+}
+
+/// Value labels for one long string variable.
+///
+/// `N` is the type of the variable name and `S` the type of each label, so
+/// the same structure serves both raw and decoded forms.
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabels<N, S>
+where
+ S: Debug,
+{
+ /// Name of the variable the labels apply to.
+ pub var_name: N,
+ /// Width field as read from the record.
+ pub width: u32,
+
+ /// `(value, label)` pairs, where each value is `width` bytes.
+ pub labels: Vec<(RawString, S)>,
+}
+
+impl LongStringValueLabels<RawString, RawString> {
+    /// Decodes the variable name and every label text with `decoder`.
+    ///
+    /// Values are copied unchanged; only the name and the labels are
+    /// converted.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Warning::InvalidLongStringValueLabelName` if the decoded name,
+    /// with trailing whitespace trimmed, is not a valid identifier.
+    fn decode(
+        &self,
+        decoder: &Decoder,
+    ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
+        let decoded_name = decoder.decode(&self.var_name);
+        let var_name = Identifier::from_encoding(decoded_name.trim_end(), decoder.encoding)
+            .map_err(Warning::InvalidLongStringValueLabelName)?;
+
+        let labels = self
+            .labels
+            .iter()
+            .map(|(value, label)| (value.clone(), decoder.decode(label).to_string()))
+            .collect();
+
+        Ok(LongStringValueLabels {
+            var_name,
+            width: self.width,
+            labels,
+        })
+    }
+}
+
+/// A long string value labels record: one [`LongStringValueLabels`] entry per
+/// variable listed in the record.
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
+where
+ N: Debug,
+ S: Debug;
+
+impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
+    const SUBTYPE: u32 = 21;
+    const SIZE: Option<u32> = Some(1);
+    const COUNT: Option<u32> = None;
+    const NAME: &'static str = "long string value labels record";
+
+    /// Parses the payload of `ext` as a sequence of label sets.
+    ///
+    /// Each set is a length-prefixed variable name, a 32-bit width, a 32-bit
+    /// label count, and then that many pairs of length-prefixed value and
+    /// label strings.  Sets are read until the payload is exhausted.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the element size is wrong or the payload ends mid-item.
+    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
+        ext.check_size::<Self>()?;
+
+        let mut remaining = &ext.data[..];
+        let mut sets = Vec::new();
+        while !remaining.is_empty() {
+            let var_name = read_string(&mut remaining, endian)?;
+            let width: u32 = endian.parse(read_bytes(&mut remaining)?);
+            let n_labels: u32 = endian.parse(read_bytes(&mut remaining)?);
+            let mut labels = Vec::new();
+            let mut n_read = 0;
+            while n_read < n_labels {
+                let value = read_string(&mut remaining, endian)?;
+                let label = read_string(&mut remaining, endian)?;
+                labels.push((value, label));
+                n_read += 1;
+            }
+            sets.push(LongStringValueLabels {
+                var_name,
+                width,
+                labels,
+            });
+        }
+        let record = LongStringValueLabelRecord(sets);
+        Ok(Record::LongStringValueLabels(record))
+    }
+}
+
+impl LongStringValueLabelRecord<RawString, RawString> {
+    /// Decodes every label set in the record with `decoder`.
+    ///
+    /// Sets that fail to decode are reported via `decoder.warn` and dropped.
+    fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord<Identifier, String> {
+        let mut decoded = Vec::with_capacity(self.0.len());
+        decoded.extend(self.0.iter().filter_map(|raw| match raw.decode(decoder) {
+            Ok(set) => Some(set),
+            Err(error) => {
+                decoder.warn(error);
+                None
+            }
+        }));
+        LongStringValueLabelRecord(decoded)
+    }
+}
+
+/// Per-value-slot variable types.
+///
+/// Each pushed variable contributes one `Some(type)` entry followed by `None`
+/// entries for any additional value slots it occupies.
+#[derive(Default)]
+pub struct VarTypes {
+ /// One entry per value slot; `None` marks a continuation slot.
+ pub types: Vec<Option<VarType>>,
+}
+
+impl VarTypes {
+    /// Creates an empty set of variable types.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Appends a variable of the given raw `width`.
+    ///
+    /// Widths that do not convert to a [`VarType`] are ignored.  Otherwise
+    /// one typed slot is added, followed by an untyped slot for each further
+    /// value the variable occupies.
+    pub fn push(&mut self, width: RawWidth) {
+        let Ok(var_type) = VarType::try_from(width) else {
+            return;
+        };
+        self.types.push(Some(var_type));
+        let n_values = width.n_values().unwrap();
+        self.types.extend((1..n_values).map(|_| None));
+    }
+
+    /// Returns the total number of value slots.
+    pub fn n_values(&self) -> usize {
+        self.types.len()
+    }
+
+    /// Returns true if the 1-based `index` names the first slot of a
+    /// variable.
+    pub fn is_valid_index(&self, index: usize) -> bool {
+        self.var_type_at(index).is_some()
+    }
+
+    /// Returns the type stored at the 1-based `index`, or `None` if the index
+    /// is zero, out of range, or names a continuation slot.
+    pub fn var_type_at(&self, index: usize) -> Option<VarType> {
+        let slot = index.checked_sub(1)?;
+        self.types.get(slot).copied().flatten()
+    }
+
+    /// Iterates over every value slot, reporting continuation slots as
+    /// [`VarType::String`].
+    pub fn iter(&self) -> impl Iterator<Item = VarType> + use<'_> {
+        self.types
+            .iter()
+            .map(|&slot| slot.unwrap_or(VarType::String))
+    }
+}
--- /dev/null
+use float_next_after::NextAfter;
+use num::{Bounded, Zero};
+use ordered_float::OrderedFloat;
+use std::{
+ collections::{hash_map::Entry, HashMap},
+ error::Error as StdError,
+ fmt::{Display, Formatter, Result as FmtResult},
+ iter::repeat_n,
+};
+
+use crate::endian::{Endian, ToBytes};
+
+/// Result type for this module, defaulting the error type to [`Error`].
+pub type Result<T, F = Error> = std::result::Result<T, F>;
+
+/// An error produced while assembling output from sack input, with optional
+/// location information.
+#[derive(Debug)]
+pub struct Error {
+ /// Name of the input file, if known.
+ pub file_name: Option<String>,
+ /// Line number the error was detected on, if known.
+ pub line_number: Option<usize>,
+ /// Text of the token at which the error was detected, if any.
+ pub token: Option<String>,
+ /// Description of the problem.
+ pub message: String,
+}
+
+impl Error {
+    /// Builds an [`Error`], copying the borrowed file name and token into
+    /// owned strings.
+    fn new(
+        file_name: Option<&str>,
+        line_number: Option<usize>,
+        token: Option<&str>,
+        message: String,
+    ) -> Error {
+        let file_name = file_name.map(str::to_owned);
+        let token = token.map(str::to_owned);
+        Error {
+            file_name,
+            line_number,
+            token,
+            message,
+        }
+    }
+}
+
+// No source error is wrapped, so the trait's default methods suffice.
+impl StdError for Error {}
+
+impl Display for Error {
+    /// Formats as `[<file>[:<line>]: | line <line>: ][at '<token>': ]<message>`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        if let Some(file_name) = &self.file_name {
+            if let Some(line_number) = self.line_number {
+                write!(f, "{file_name}:{line_number}: ")?;
+            } else {
+                write!(f, "{file_name}: ")?;
+            }
+        } else if let Some(line_number) = self.line_number {
+            write!(f, "line {line_number}: ")?;
+        }
+        if let Some(token) = &self.token {
+            write!(f, "at '{token}': ")?;
+        }
+        f.write_fmt(format_args!("{}", self.message))
+    }
+}
+
+/// Assembles the binary output described by `input`.
+///
+/// `input_file_name` is used only in error messages.  `endian` selects the
+/// byte order of multi-byte values.
+///
+/// If the input uses labels, it is processed twice: the first pass collects
+/// label offsets and the second pass emits the final output with every
+/// `@label` reference resolved.
+///
+/// # Errors
+///
+/// Returns an error for any lexical or syntax error, or if a label is
+/// referenced but never defined.  When several labels are undefined, the one
+/// that sorts first by name is reported, so the message is deterministic.
+pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result<Vec<u8>> {
+    let mut symbol_table = HashMap::new();
+    let output = _sack(input, input_file_name, endian, &mut symbol_table)?;
+    if symbol_table.is_empty() {
+        return Ok(output);
+    }
+
+    let undefined = symbol_table
+        .iter()
+        .filter(|(_, value)| value.is_none())
+        .map(|(name, _)| name)
+        .min();
+    if let Some(name) = undefined {
+        return Err(Error::new(
+            input_file_name,
+            None,
+            None,
+            format!("label {name} used but never defined"),
+        ));
+    }
+
+    // Second pass: every label now has a known offset.
+    _sack(input, input_file_name, endian, &mut symbol_table)
+}
+
+/// Runs a single pass over `input`, appending each data item's bytes to a
+/// fresh output buffer and recording label offsets in `symbol_table`.
+fn _sack(
+    input: &str,
+    input_file_name: Option<&str>,
+    endian: Endian,
+    symbol_table: &mut HashMap<String, Option<u32>>,
+) -> Result<Vec<u8>> {
+    let mut lexer = Lexer::new(input, input_file_name, endian)?;
+    let mut bytes = Vec::new();
+    loop {
+        let more = parse_data_item(&mut lexer, &mut bytes, symbol_table)?;
+        if !more {
+            break;
+        }
+    }
+    Ok(bytes)
+}
+
+/// Parses one data item from `lexer` and appends its encoding to `output`.
+///
+/// Label definitions and `@label` references are recorded in and resolved
+/// against `symbol_table`; a reference to a label whose offset is not yet
+/// known is emitted as zero.
+///
+/// An item may be followed by `* <count>` to repeat its bytes, and must be
+/// terminated by `;` unless the next token is `)`.  Label definitions need no
+/// terminator.
+///
+/// Returns `Ok(false)` at end of input without consuming anything, and
+/// `Ok(true)` after parsing an item.
+///
+/// # Errors
+///
+/// Returns an error on any syntax or range problem, including end of input
+/// inside a parenthesized group.
+fn parse_data_item(
+    lexer: &mut Lexer,
+    output: &mut Vec<u8>,
+    symbol_table: &mut HashMap<String, Option<u32>>,
+) -> Result<bool> {
+    if lexer.token.is_none() {
+        return Ok(false);
+    };
+
+    // Start of this item's bytes, needed for `* count` repetition.
+    let initial_len = output.len();
+    match lexer.take()? {
+        Token::Integer(integer) => {
+            // Integers are emitted as 32 bits; accept both the signed and the
+            // unsigned 32-bit range.
+            if let Ok(integer) = TryInto::<i32>::try_into(integer) {
+                output.extend_from_slice(&lexer.endian.to_bytes(integer));
+            } else if let Ok(integer) = TryInto::<u32>::try_into(integer) {
+                output.extend_from_slice(&lexer.endian.to_bytes(integer));
+            } else {
+                Err(lexer.error(format!(
+                    "{integer} is not in the valid range [{},{}]",
+                    i32::MIN,
+                    u32::MAX
+                )))?;
+            };
+        }
+        Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)),
+        Token::PcSysmis => {
+            output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff])
+        }
+        Token::I8 => put_integers::<u8, 1>(lexer, "i8", output)?,
+        Token::I16 => put_integers::<u16, 2>(lexer, "i16", output)?,
+        Token::I64 => put_integers::<i64, 8>(lexer, "i64", output)?,
+        Token::String(string) => output.extend_from_slice(string.as_bytes()),
+        Token::S(size) => {
+            let Some((Token::String(ref string), _)) = lexer.token else {
+                Err(lexer.error(format!("string expected after 's{size}'")))?
+            };
+            let len = string.len();
+            if len > size {
+                Err(lexer.error(format!(
+                    "{len}-byte string is longer than pad length {size}"
+                )))?
+            }
+            // Pad with spaces to exactly `size` bytes.
+            output.extend_from_slice(string.as_bytes());
+            output.extend(repeat_n(b' ', size - len));
+            lexer.get()?;
+        }
+        Token::LParen => {
+            while !matches!(lexer.token, Some((Token::RParen, _))) {
+                // `Ok(false)` means end of input: without this check an
+                // unterminated group would loop forever.
+                if !parse_data_item(lexer, output, symbol_table)? {
+                    Err(lexer.error(String::from("')' expected before end of input")))?
+                }
+            }
+            lexer.get()?;
+        }
+        Token::Count => put_counted_items::<u32, 4>(lexer, "COUNT", output, symbol_table)?,
+        Token::Count8 => put_counted_items::<u8, 1>(lexer, "COUNT8", output, symbol_table)?,
+        Token::Hex => {
+            let Some((Token::String(ref string), _)) = lexer.token else {
+                Err(lexer.error(String::from("string expected after 'hex'")))?
+            };
+            // Consume two hex digits per byte; white space may separate bytes.
+            let mut string = &string[..];
+            loop {
+                string = string.trim_start();
+                if string.is_empty() {
+                    break;
+                };
+
+                let mut i = string.chars();
+                let Some(c0) = i.next() else { return Ok(true) };
+                let Some(c1) = i.next() else {
+                    Err(lexer.error(String::from("hex string has odd number of characters")))?
+                };
+
+                let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else {
+                    Err(lexer.error(String::from("invalid digit in hex string")))?
+                };
+                let byte = digit0 * 16 + digit1;
+                output.push(byte as u8);
+
+                string = i.as_str();
+            }
+            lexer.get()?;
+        }
+        Token::Label(name) => {
+            // A label names the current output offset.  It may already be
+            // present, either as an unresolved forward reference (`None`) or
+            // with an offset recorded earlier, which must then agree.
+            let value = output.len() as u32;
+            match symbol_table.entry(name.clone()) {
+                Entry::Vacant(v) => {
+                    v.insert(Some(value));
+                }
+                Entry::Occupied(mut o) => {
+                    match o.get() {
+                        Some(v) => {
+                            if *v != value {
+                                Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))?
+                            }
+                        }
+                        None => drop(o.insert(Some(value))),
+                    }
+                }
+            };
+            return Ok(true);
+        }
+        Token::At(name) => {
+            // `@label` optionally followed by `+`/`-` terms.  The value stays
+            // `None` while any label involved is still unresolved.
+            let mut value = *symbol_table.entry(name.clone()).or_insert(None);
+            loop {
+                let plus = match lexer.token {
+                    Some((Token::Plus, _)) => true,
+                    Some((Token::Minus, _)) => false,
+                    _ => break,
+                };
+                lexer.get()?;
+
+                let operand = match lexer.token {
+                    Some((Token::At(ref name), _)) => {
+                        *symbol_table.entry(name.clone()).or_insert(None)
+                    }
+                    Some((Token::Integer(integer), _)) => Some(
+                        integer
+                            .try_into()
+                            .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?,
+                    ),
+                    _ => Err(lexer.error(String::from("expecting @label or integer literal")))?,
+                };
+                lexer.get()?;
+
+                value = match (value, operand) {
+                    (Some(a), Some(b)) => Some(
+                        if plus {
+                            a.checked_add(b)
+                        } else {
+                            a.checked_sub(b)
+                        }
+                        .ok_or_else(|| {
+                            lexer.error(String::from("overflow in offset arithmetic"))
+                        })?,
+                    ),
+                    _ => None,
+                };
+            }
+            // Unresolved references are emitted as zero.
+            let value = value.unwrap_or(0);
+            output.extend_from_slice(&lexer.endian.to_bytes(value));
+        }
+        _ => (),
+    };
+    if let Some((Token::Asterisk, _)) = lexer.token {
+        lexer.get()?;
+        let Token::Integer(count) = lexer.take()? else {
+            Err(lexer.error(String::from("positive integer expected after '*'")))?
+        };
+        if count < 1 {
+            Err(lexer.error(String::from("positive integer expected after '*'")))?
+        };
+        // The item is already present once; append `count - 1` more copies.
+        let final_len = output.len();
+        for _ in 1..count {
+            output.extend_from_within(initial_len..final_len);
+        }
+    }
+    match lexer.token {
+        Some((Token::Semicolon, _)) => {
+            lexer.get()?;
+        }
+        Some((Token::RParen, _)) => (),
+        _ => Err(lexer.error(String::from("';' expected")))?,
+    }
+    Ok(true)
+}
+
+/// Parses a parenthesized group of data items after `name` and emits it
+/// preceded by its byte length.
+///
+/// The length is an `N`-byte integer of type `T` in the lexer's byte order.
+/// A zero placeholder is written first and patched once the group's size is
+/// known.
+///
+/// # Errors
+///
+/// Returns an error if `(` does not follow, if the input ends before the
+/// closing `)`, if any nested item fails to parse, or if the group's length
+/// does not fit in `T`.
+fn put_counted_items<T, const N: usize>(
+    lexer: &mut Lexer,
+    name: &str,
+    output: &mut Vec<u8>,
+    symbol_table: &mut HashMap<String, Option<u32>>,
+) -> Result<()>
+where
+    T: Zero + TryFrom<usize>,
+    Endian: ToBytes<T, N>,
+{
+    let old_size = output.len();
+    output.extend_from_slice(&lexer.endian.to_bytes(T::zero()));
+    let start = output.len();
+    if !matches!(lexer.token, Some((Token::LParen, _))) {
+        Err(lexer.error(format!("'(' expected after '{name}'")))?
+    }
+    lexer.get()?;
+    while !matches!(lexer.token, Some((Token::RParen, _))) {
+        // `Ok(false)` means end of input: without this check an unterminated
+        // group would loop forever.
+        if !parse_data_item(lexer, output, symbol_table)? {
+            Err(lexer.error(format!(
+                "')' expected before end of input in '{name}'"
+            )))?
+        }
+    }
+    lexer.get()?;
+    let delta = output.len() - start;
+    let Ok(delta): Result<T, _> = delta.try_into() else {
+        Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))?
+    };
+    // Patch the placeholder with the real length.
+    let dest = &mut output[old_size..old_size + N];
+    dest.copy_from_slice(&lexer.endian.to_bytes(delta));
+    Ok(())
+}
+
+/// Consumes one or more consecutive integer tokens and emits each as an
+/// `N`-byte value of type `T` in the lexer's byte order.
+///
+/// `name` is the keyword that introduced the integers and appears only in
+/// error messages.
+///
+/// # Errors
+///
+/// Returns an error if an integer is outside the range of `T` or if no
+/// integer follows `name`.
+fn put_integers<T, const N: usize>(
+    lexer: &mut Lexer,
+    name: &str,
+    output: &mut Vec<u8>,
+) -> Result<()>
+where
+    T: Bounded + Display + TryFrom<i64> + Copy,
+    Endian: ToBytes<T, N>,
+{
+    let mut n = 0;
+    while let Some(integer) = lexer.take_if(|t| match t {
+        Token::Integer(integer) => Some(*integer),
+        _ => None,
+    })? {
+        let Ok(integer) = integer.try_into() else {
+            Err(lexer.error(format!(
+                "{integer} is not in the valid range [{},{}]",
+                T::min_value(),
+                T::max_value()
+            )))?
+        };
+        output.extend_from_slice(&lexer.endian.to_bytes(integer));
+        n += 1;
+    }
+    if n == 0 {
+        Err(lexer.error(format!("integer expected after '{name}'")))?
+    }
+    Ok(())
+}
+
+/// A lexical token of the sack input language.
+#[derive(PartialEq, Eq, Clone, Debug)]
+enum Token {
+ /// Integer literal (decimal, or hexadecimal with a `0x` prefix).
+ Integer(i64),
+ /// Floating-point literal, or one of the keywords `SYSMIS`, `LOWEST`,
+ /// `HIGHEST`.
+ Float(OrderedFloat<f64>),
+ /// The `PCSYSMIS` keyword.
+ PcSysmis,
+ /// Double-quoted string literal, without the quotes.
+ String(String),
+ /// `;`
+ Semicolon,
+ /// `*`
+ Asterisk,
+ /// `(`
+ LParen,
+ /// `)`
+ RParen,
+ /// The `i8` keyword.
+ I8,
+ /// The `i16` keyword.
+ I16,
+ /// The `i64` keyword.
+ I64,
+ /// `s<N>`: a string padded to `N` bytes follows.
+ S(usize),
+ /// The `COUNT` keyword.
+ Count,
+ /// The `COUNT8` keyword.
+ Count8,
+ /// The `hex` keyword.
+ Hex,
+ /// `name:` label definition.
+ Label(String),
+ /// `@name` label reference.
+ At(String),
+ /// A lone `-`.
+ Minus,
+ /// `+`
+ Plus,
+}
+
+/// Tokenizer for sack input with one token of lookahead.
+struct Lexer<'a> {
+ /// Input not yet tokenized (everything after the lookahead token).
+ input: &'a str,
+ /// The lookahead token and its source text, or `None` at end of input.
+ token: Option<(Token, &'a str)>,
+ /// Input file name for error messages, if any.
+ input_file_name: Option<&'a str>,
+ /// 1-based line number, advanced as newlines are skipped.
+ line_number: usize,
+ /// Byte order for emitted values; also determines the `ENDIAN` keyword.
+ endian: Endian,
+}
+
+/// Skips leading white space and `#` comments in `s`.
+///
+/// Spaces, tabs, carriage returns, `<`, and `>` are all treated as white
+/// space.  A comment runs from `#` through the end of its line.
+///
+/// Returns the remaining input and the number of newlines skipped.  A comment
+/// that reaches the end of input without a newline consumes everything.
+fn skip_comments(mut s: &str) -> (&str, usize) {
+    let mut n_newlines = 0;
+    loop {
+        s = s.trim_start_matches(|c: char| matches!(c, ' ' | '\t' | '\r' | '<' | '>'));
+        match s.chars().next() {
+            Some('#') => match s.find('\n') {
+                Some(newline) => s = &s[newline + 1..],
+                None => return ("", n_newlines),
+            },
+            Some('\n') => s = &s[1..],
+            _ => return (s, n_newlines),
+        }
+        n_newlines += 1;
+    }
+}
+
+impl<'a> Lexer<'a> {
+    /// Creates a lexer over `input` and reads the first token into the
+    /// lookahead slot.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the first token is malformed.
+    fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result<Lexer<'a>> {
+        let mut lexer = Lexer {
+            input,
+            token: None,
+            input_file_name,
+            line_number: 1,
+            endian,
+        };
+        lexer.token = lexer.next()?;
+        Ok(lexer)
+    }
+
+    /// Builds an [`Error`] with `message`, tagged with the file name, the
+    /// current line number, and the text of the lookahead token (if any).
+    fn error(&self, message: String) -> Error {
+        let repr = self.token.as_ref().map(|(_, repr)| *repr);
+        Error::new(self.input_file_name, Some(self.line_number), repr, message)
+    }
+
+    /// Removes and returns the lookahead token, then reads the next one.
+    ///
+    /// # Errors
+    ///
+    /// Fails at end of input or if the next token is malformed.
+    fn take(&mut self) -> Result<Token> {
+        let Some(token) = self.token.take() else {
+            Err(self.error(String::from("unexpected end of input")))?
+        };
+        self.token = self.next()?;
+        Ok(token.0)
+    }
+
+    /// Applies `condition` to the lookahead token.  If it returns
+    /// `Some(value)`, advances past the token and returns the value;
+    /// otherwise leaves the lexer untouched and returns `None`.
+    ///
+    /// Returns `None` at end of input.
+    fn take_if<F, T>(&mut self, condition: F) -> Result<Option<T>>
+    where
+        F: FnOnce(&Token) -> Option<T>,
+    {
+        let Some(ref token) = self.token else {
+            return Ok(None);
+        };
+        match condition(&token.0) {
+            Some(value) => {
+                self.token = self.next()?;
+                Ok(Some(value))
+            }
+            None => Ok(None),
+        }
+    }
+
+    /// Discards the lookahead token, reads the next one, and returns a
+    /// reference to it (`None` at end of input).
+    ///
+    /// # Errors
+    ///
+    /// Fails if there is no lookahead token to discard or if the next token
+    /// is malformed.
+    fn get(&mut self) -> Result<Option<&Token>> {
+        if self.token.is_none() {
+            Err(self.error(String::from("unexpected end of input")))?
+        } else {
+            self.token = self.next()?;
+            match self.token {
+                Some((ref token, _)) => Ok(Some(token)),
+                None => Ok(None),
+            }
+        }
+    }
+
+    /// Scans the next token from the remaining input.
+    ///
+    /// Returns the token with the source text it was scanned from, or `None`
+    /// at end of input.  The lookahead slot is not updated; callers store the
+    /// result themselves.
+    ///
+    /// # Errors
+    ///
+    /// Fails on malformed literals, unterminated strings, unknown keywords,
+    /// and unexpected characters.
+    fn next(&mut self) -> Result<Option<(Token, &'a str)>> {
+        // Get the first character of the token, skipping past white space and
+        // comments.
+        let (s, n_newlines) = skip_comments(self.input);
+        self.line_number += n_newlines;
+        self.input = s;
+
+        let start = s;
+        let mut iter = s.chars();
+        let Some(c) = iter.next() else {
+            return Ok(None);
+        };
+        let (token, rest) = match c {
+            c if c.is_ascii_digit() || c == '-' => {
+                // A number extends over digits, letters, '.', and '-'; a lone
+                // '-' is the minus operator.
+                let len = s
+                    .find(|c: char| {
+                        !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-')
+                    })
+                    .unwrap_or(s.len());
+                let (number, rest) = s.split_at(len);
+                let token = if number == "-" {
+                    Token::Minus
+                } else if let Some(digits) = number.strip_prefix("0x") {
+                    Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| {
+                        self.error(format!("bad integer literal '{number}' ({msg})"))
+                    })?)
+                } else if !number.contains('.') {
+                    Token::Integer(number.parse().map_err(|msg| {
+                        self.error(format!("bad integer literal '{number}' ({msg})"))
+                    })?)
+                } else {
+                    Token::Float(number.parse().map_err(|msg| {
+                        self.error(format!("bad float literal '{number}' ({msg})"))
+                    })?)
+                };
+                (token, rest)
+            }
+            '"' => {
+                // Strings end at the next '"' and may not span lines.
+                let s = iter.as_str();
+                let Some(len) = s.find(['\n', '"']) else {
+                    Err(self.error(String::from("end-of-file inside string")))?
+                };
+                let (string, rest) = s.split_at(len);
+                let Some(rest) = rest.strip_prefix('"') else {
+                    Err(self.error(format!("new-line inside string ({string}...{rest})")))?
+                };
+                (Token::String(string.into()), rest)
+            }
+            ';' => (Token::Semicolon, iter.as_str()),
+            '*' => (Token::Asterisk, iter.as_str()),
+            '+' => (Token::Plus, iter.as_str()),
+            '(' => (Token::LParen, iter.as_str()),
+            ')' => (Token::RParen, iter.as_str()),
+            c if c.is_alphabetic() || c == '@' || c == '_' => {
+                let len = s
+                    .find(|c: char| {
+                        !(c.is_ascii_digit()
+                            || c.is_alphabetic()
+                            || c == '@'
+                            || c == '.'
+                            || c == '_')
+                    })
+                    .unwrap_or(s.len());
+                let (s, rest) = s.split_at(len);
+                // Order matters: a trailing ':' makes a label definition, a
+                // leading '@' a label reference, a leading 's' a padded
+                // string width; anything else must be a keyword.
+                if let Some(rest) = rest.strip_prefix(':') {
+                    (Token::Label(s.into()), rest)
+                } else if let Some(name) = s.strip_prefix('@') {
+                    (Token::At(name.into()), rest)
+                } else if let Some(count) = s.strip_prefix('s') {
+                    let token =
+                        Token::S(count.parse().map_err(|msg| {
+                            self.error(format!("bad counted string '{s}' ({msg})"))
+                        })?);
+                    (token, rest)
+                } else {
+                    let token = match s {
+                        "i8" => Token::I8,
+                        "i16" => Token::I16,
+                        "i64" => Token::I64,
+                        "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)),
+                        "PCSYSMIS" => Token::PcSysmis,
+                        "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()),
+                        "HIGHEST" => Token::Float(f64::MAX.into()),
+                        "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
+                        "COUNT" => Token::Count,
+                        "COUNT8" => Token::Count8,
+                        "hex" => Token::Hex,
+                        _ => Err(self.error(format!("invalid token '{s}'")))?,
+                    };
+                    (token, rest)
+                }
+            }
+            _ => Err(self.error(format!("invalid input byte '{c}'")))?,
+        };
+        self.input = rest;
+        // The token's source text is whatever was consumed from `start`.
+        let repr = &start[..start.len() - rest.len()];
+        Ok(Some((token, repr)))
+    }
+}
+
+// Smoke tests: each assembles a sample input and prints a hex dump of the
+// result.  They check only that `sack` succeeds, not the bytes produced.
+#[cfg(test)]
+mod test {
+ use crate::endian::Endian;
+ use crate::sys::sack::sack;
+ use anyhow::Result;
+ use hexplay::HexView;
+
+ /// Assembles an input that uses strings, padded strings, integers, a
+ /// float, `i8` items, and `*` repetition.
+ #[test]
+ fn basic_sack() -> Result<()> {
+ let input = r#"
+"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
+2; # Layout code
+28; # Nominal case size
+0; # Not compressed
+0; # Not weighted
+1; # 1 case.
+100.0; # Bias.
+"01 Jan 11"; "20:53:52";
+"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
+i8 0 *3;
+"#;
+ let output = sack(input, None, Endian::Big)?;
+ HexView::new(&output).print()?;
+ Ok(())
+ }
+
+ /// Assembles an input that uses labels, `@label` arithmetic (including
+ /// forward references), parenthesized groups, `COUNT8`, and `PCSYSMIS`.
+ #[test]
+ fn pcp_sack() -> Result<()> {
+ let input = r#"
+# File header.
+2; 0;
+@MAIN; @MAIN_END - @MAIN;
+@VARS; @VARS_END - @VARS;
+@LABELS; @LABELS_END - @LABELS;
+@DATA; @DATA_END - @DATA;
+(0; 0) * 11;
+i8 0 * 128;
+
+MAIN:
+ i16 1; # Fixed.
+ s62 "PCSPSS PSPP synthetic test product";
+ PCSYSMIS;
+ 0; 0; i16 1; # Fixed.
+ i16 0;
+ i16 15;
+ 1;
+ i16 0; # Fixed.
+ 1;
+ s8 "11/28/14";
+ s8 "15:11:00";
+ s64 "PSPP synthetic test file";
+MAIN_END:
+
+VARS:
+ 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS;
+ 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS;
+ 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS;
+
+ # Numeric variable, no label or missing values.
+ 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS;
+
+ # Numeric variable, variable label.
+ 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS;
+
+ # Numeric variable with missing value.
+ 0; 0; 0; 0x050800; s8 "NUM3"; 1.0;
+
+ # Numeric variable, variable label and missing value.
+ 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0;
+
+ # String variable, no label or missing values.
+ 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS;
+
+ # String variable, variable label.
+ 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS;
+
+ # String variable with missing value.
+ 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS";
+
+ # String variable, variable label and missing value.
+ 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR";
+
+ # Long string variable
+ 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS;
+ 0 * 8;
+
+ # Long string variable with variable label
+ 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS;
+ 0 * 8;
+VARS_END:
+
+LABELS:
+ 3; i8 0 0 0; LABELS_OFS: i8 0;
+ NUM2_LABEL: COUNT8("Numeric variable 2's label");
+ NUM4_LABEL: COUNT8("Another numeric variable label");
+ STR2_LABEL: COUNT8("STR2's variable label");
+ STR4_LABEL: COUNT8("STR4's variable label");
+ STR6_LABEL: COUNT8("Another string variable's label");
+LABELS_END:
+
+DATA:
+ 0.0; "11/28/14"; 1.0;
+ 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r";
+ s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM";
+DATA_END:
+"#;
+ let output = sack(input, None, Endian::Big)?;
+ HexView::new(&output).print()?;
+ Ok(())
+ }
+}
use anyhow::{anyhow, Result};
use clap::Parser;
use pspp::endian::Endian;
-use pspp::sack::sack;
+use pspp::sys::sack::sack;
/// SAv Construction Kit
///
NOT_REACHED ();
}
-/* Appends syntax for the tokens in MTS to S. */
+/* Appends syntax for the tokens in MTS to S. If OFS and LEN are nonnull, sets
+ OFS[i] to the offset within S of the start of token 'i' in MTS and LEN[i] to
+ its length. OFS[i] + LEN[i] is not necessarily OFS[i + 1] because some
+ tokens are separated by white space. */
void
-macro_tokens_to_syntax (struct macro_tokens *mts, struct string *s)
+macro_tokens_to_syntax (struct macro_tokens *mts, struct string *s,
+ size_t *ofs, size_t *len)
{
+ assert ((ofs != NULL) == (len != NULL));
+
+ if (!mts->n)
+ return;
+
for (size_t i = 0; i < mts->n; i++)
{
if (i > 0)
}
}
+ if (ofs)
+ ofs[i] = s->ss.length;
macro_token_to_syntax (&mts->mts[i], s);
+ if (len)
+ len[i] = s->ss.length - ofs[i];
}
}
if (param)
{
size_t param_idx = param - me->macro->params;
- macro_tokens_to_syntax (me->args[param_idx], farg);
+ macro_tokens_to_syntax (me->args[param_idx], farg, NULL, NULL);
return 1;
}
break;
if (i)
ds_put_byte (farg, ' ');
- macro_tokens_to_syntax (me->args[i], farg);
+ macro_tokens_to_syntax (me->args[i], farg, NULL, NULL);
}
return 1;
}
if (mts.n > 1)
{
struct macro_tokens tail = { .mts = mts.mts + 1, .n = mts.n - 1 };
- macro_tokens_to_syntax (&tail, output);
+ macro_tokens_to_syntax (&tail, output, NULL, NULL);
}
macro_tokens_uninit (&mts);
ds_destroy (&tmp);
subme.stack = &stack;
macro_expand (mts.mts, mts.n, &subme, &exp);
- macro_tokens_to_syntax (&exp, output);
+ macro_tokens_to_syntax (&exp, output, NULL, NULL);
macro_tokens_uninit (&exp);
macro_tokens_uninit (&mts);
}
case '~':
return T_NE;
+
+ case '&':
+ return T_AND;
+
+ case '|':
+ return T_OR;
}
NOT_REACHED ();
xr_pager_destroy (struct xr_pager *p)
{
if (p)
- {x
+ {
free (p->nodes);
xr_page_style_unref (p->page_style);
unsigned char options; /* TABLE_CELL_*. */
const struct pivot_value *value;
+ const struct font_style *font_style;
const struct cell_style *cell_style;
};