From: Ben Pfaff Date: Thu, 1 May 2025 22:49:31 +0000 (-0700) Subject: Move sources related to sys files into submodule. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5692ddcf67eb5e12a28e4ad6506f5681e1f313f6;p=pspp Move sources related to sys files into submodule. --- diff --git a/rust/pspp/src/cooked.rs b/rust/pspp/src/cooked.rs deleted file mode 100644 index 8de34d5ac6..0000000000 --- a/rust/pspp/src/cooked.rs +++ /dev/null @@ -1,904 +0,0 @@ -use core::str; -use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; - -use crate::{ - dictionary::{ - Dictionary, InvalidRole, MultipleResponseSet, MultipleResponseType, Value, VarWidth, - Variable, VariableSet, - }, - encoding::Error as EncodingError, - endian::Endian, - format::{Error as FormatError, Format, UncheckedFormat}, - identifier::{ByIdentifier, Error as IdError, Identifier}, - raw::{ - self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, - FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord, - LongStringMissingValueRecord, LongStringValueLabelRecord, MissingValues, - MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawStrArray, RawWidth, - ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributeRecord, VariableRecord, - VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer, - }, -}; -use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; -use encoding_rs::Encoding; -use indexmap::set::MutableValues; -use thiserror::Error as ThisError; - -pub use crate::raw::{CategoryLabels, Compression}; - -#[derive(ThisError, Debug)] -pub enum Error { - #[error("Missing header record")] - MissingHeaderRecord, - - // XXX this is an internal error - #[error("More than one file header record")] - DuplicateHeaderRecord, - - #[error("{0}")] - EncodingError(EncodingError), - - #[error("Using default encoding {0}.")] - UsingDefaultEncoding(String), - - #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)] - InvalidVariableWidth { offsets: Range, width: i32 }, - - #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] - InvalidLongMissingValueFormat, - - #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")] - InvalidCreationDate { creation_date: String }, - - #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")] - InvalidCreationTime { creation_time: String }, - - #[error("{id_error} Renaming variable to {new_name}.")] - InvalidVariableName { - id_error: IdError, - new_name: Identifier, - }, - - #[error( - "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}" - )] - InvalidPrintFormat { - new_spec: Format, - variable: Identifier, - format_error: FormatError, - }, - - #[error( - "Substituting {new_spec} for invalid write format on variable {variable}. 
{format_error}" - )] - InvalidWriteFormat { - new_spec: Format, - variable: Identifier, - format_error: FormatError, - }, - - #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")] - DuplicateVariableName { - duplicate_name: Identifier, - new_name: Identifier, - }, - - #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")] - InvalidDictIndex { dict_index: usize, max_index: usize }, - - #[error("Dictionary index {0} refers to a long string continuation.")] - DictIndexIsContinuation(usize), - - #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")] - LongStringContinuationIndexes { offset: u64, indexes: Vec }, - - #[error( - "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end - )] - InvalidLongStringValueLabels { - offsets: Range, - variables: Vec, - }, - - #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")] - ValueLabelsDifferentTypes { - numeric_var: Identifier, - string_var: Identifier, - }, - - #[error("Invalid multiple response set name. {0}")] - InvalidMrSetName(IdError), - - #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")] - UnknownMrSetVariable { - mr_set: Identifier, - short_name: Identifier, - }, - - #[error("Multiple response set {0} has no variables.")] - EmptyMrSet(Identifier), - - #[error("Multiple response set {0} has only one variable.")] - OneVarMrSet(Identifier), - - #[error("Multiple response set {0} contains both string and numeric variables.")] - MixedMrSet(Identifier), - - #[error( - "Invalid numeric format for counted value {number} in multiple response set {mr_set}." - )] - InvalidMDGroupCountedValue { mr_set: Identifier, number: String }, - - #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] - TooWideMDGroupCountedValue { - mr_set: Identifier, - value: String, - width: usize, - max_width: u16, - }, - - #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")] - InvalidLongValueLabelWidth { - name: Identifier, - width: u32, - min_width: u16, - max_width: u16, - }, - - #[error("Invalid attribute name. {0}")] - InvalidAttributeName(IdError), - - #[error("Invalid short name in long variable name record. {0}")] - InvalidShortName(IdError), - - #[error("Invalid name in long variable name record. {0}")] - InvalidLongName(IdError), - - #[error("Invalid variable name in very long string record. {0}")] - InvalidLongStringName(IdError), - - #[error("Invalid variable name in long string value label record. {0}")] - InvalidLongStringValueLabelName(IdError), - - #[error("Invalid variable name in attribute record. {0}")] - InvalidAttributeVariableName(IdError), - - // XXX This is risky because `text` might be arbitarily long. 
- #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] - MalformedString { encoding: String, text: String }, - - #[error("Details TBD")] - TBD, -} - -#[derive(Clone, Debug)] -pub struct Headers { - pub header: HeaderRecord, - pub variable: Vec>, - pub value_label: Vec, String>>, - pub document: Vec>, - pub integer_info: Option, - pub float_info: Option, - pub var_display: Option, - pub multiple_response: Vec>, - pub long_string_value_labels: Vec>, - pub long_string_missing_values: Vec>, - pub encoding: Option, - pub number_of_cases: Option, - pub variable_sets: Vec, - pub product_info: Option, - pub long_names: Vec, - pub very_long_strings: Vec, - pub file_attributes: Vec, - pub variable_attributes: Vec, - pub other_extension: Vec, - pub end_of_headers: Option, - pub z_header: Option, - pub z_trailer: Option, - pub cases: Option>>, -} - -fn take_first(mut vec: Vec, more_than_one: F) -> Option -where - F: FnOnce(), -{ - if vec.len() > 1 { - more_than_one(); - } - vec.drain(..).next() -} - -impl Headers { - pub fn new(headers: Vec, warn: &impl Fn(Error)) -> Result { - let mut file_header = Vec::new(); - let mut variable = Vec::new(); - let mut value_label = Vec::new(); - let mut document = Vec::new(); - let mut integer_info = Vec::new(); - let mut float_info = Vec::new(); - let mut var_display = Vec::new(); - let mut multiple_response = Vec::new(); - let mut long_string_value_labels = Vec::new(); - let mut long_string_missing_values = Vec::new(); - let mut encoding = Vec::new(); - let mut number_of_cases = Vec::new(); - let mut variable_sets = Vec::new(); - let mut product_info = Vec::new(); - let mut long_names = Vec::new(); - let mut very_long_strings = Vec::new(); - let mut file_attributes = Vec::new(); - let mut variable_attributes = Vec::new(); - let mut other_extension = Vec::new(); - let mut end_of_headers = Vec::new(); - let mut z_header = Vec::new(); - let mut z_trailer = Vec::new(); - let mut cases = Vec::new(); - - for header in headers { - match header { - DecodedRecord::Header(record) => { - file_header.push(record); - } - DecodedRecord::Variable(record) => { - variable.push(record); - } - DecodedRecord::ValueLabel(record) => { - value_label.push(record); - } - DecodedRecord::Document(record) => { - document.push(record); - } - DecodedRecord::IntegerInfo(record) => { - integer_info.push(record); - } - DecodedRecord::FloatInfo(record) => { - float_info.push(record); - } - DecodedRecord::VariableSets(record) => { - variable_sets.push(record); - } - DecodedRecord::VarDisplay(record) => { - var_display.push(record); - } - DecodedRecord::MultipleResponse(record) => { - multiple_response.push(record); - } - DecodedRecord::LongStringValueLabels(record) => { - long_string_value_labels.push(record) - } - DecodedRecord::LongStringMissingValues(record) => { - long_string_missing_values.push(record); - } - DecodedRecord::Encoding(record) => { - encoding.push(record); - } - DecodedRecord::NumberOfCases(record) => { - number_of_cases.push(record); - } - DecodedRecord::ProductInfo(record) => { - product_info.push(record); - } - DecodedRecord::LongNames(record) => { - long_names.push(record); - } - DecodedRecord::VeryLongStrings(record) => { - very_long_strings.push(record); - } - DecodedRecord::FileAttributes(record) => { - file_attributes.push(record); - } - DecodedRecord::VariableAttributes(record) => { - variable_attributes.push(record); - } - DecodedRecord::OtherExtension(record) => { - other_extension.push(record); - } - DecodedRecord::EndOfHeaders(record) => { - 
end_of_headers.push(record); - } - DecodedRecord::ZHeader(record) => { - z_header.push(record); - } - DecodedRecord::ZTrailer(record) => { - z_trailer.push(record); - } - DecodedRecord::Cases(record) => { - cases.push(record); - } - } - } - - let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord)) - else { - return Err(Error::MissingHeaderRecord); - }; - - Ok(Headers { - header: file_header, - variable, - value_label, - document, - integer_info: take_first(integer_info, || warn(Error::TBD)), - float_info: take_first(float_info, || warn(Error::TBD)), - var_display: take_first(var_display, || warn(Error::TBD)), - multiple_response, - long_string_value_labels, - long_string_missing_values, - encoding: take_first(encoding, || warn(Error::TBD)), - number_of_cases: take_first(number_of_cases, || warn(Error::TBD)), - variable_sets, - product_info: take_first(product_info, || warn(Error::TBD)), - long_names, - very_long_strings, - file_attributes, - variable_attributes, - other_extension, - end_of_headers: take_first(end_of_headers, || warn(Error::TBD)), - z_header: take_first(z_header, || warn(Error::TBD)), - z_trailer: take_first(z_trailer, || warn(Error::TBD)), - cases: take_first(cases, || warn(Error::TBD)), - }) - } -} - -#[derive(Debug)] -pub struct Metadata { - pub creation: NaiveDateTime, - pub endian: Endian, - pub compression: Option, - pub n_cases: Option, - pub product: String, - pub product_ext: Option, - pub version: Option<(i32, i32, i32)>, -} - -impl Metadata { - fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self { - let header = &headers.header; - let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y") - .unwrap_or_else(|_| { - warn(Error::InvalidCreationDate { - creation_date: header.creation_date.to_string(), - }); - Default::default() - }); - let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S") - .unwrap_or_else(|_| { - warn(Error::InvalidCreationTime { - creation_time: header.creation_time.to_string(), - }); - Default::default() - }); - let creation = NaiveDateTime::new(creation_date, creation_time); - - let product = header - .eye_catcher - .trim_start_matches("@(#) SPSS DATA FILE") - .trim_end() - .to_string(); - - Self { - creation, - endian: header.endian, - compression: header.compression, - n_cases: header.n_cases.map(|n| n as u64), - product, - product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)), - version: headers.integer_info.as_ref().map(|ii| ii.version), - } - } -} - -struct Decoder { - pub encoding: &'static Encoding, - n_generated_names: usize, -} - -impl Decoder { - fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier { - loop { - self.n_generated_names += 1; - let name = Identifier::from_encoding( - format!("VAR{:03}", self.n_generated_names), - self.encoding, - ) - .unwrap(); - if !dictionary.variables.contains(&name.0) { - return name; - } - assert!(self.n_generated_names < usize::MAX); - } - } -} - -pub fn decode( - mut headers: Headers, - encoding: &'static Encoding, - warn: impl Fn(Error), -) -> Result<(Dictionary, Metadata), Error> { - let mut dictionary = Dictionary::new(encoding); - - let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' ')); - if !file_label.is_empty() { - dictionary.file_label = Some(file_label); - } - - for mut attributes in headers.file_attributes.drain(..) 
{ - dictionary.attributes.append(&mut attributes.0) - } - - // Concatenate all the document records (really there should only be one) - // and trim off the trailing spaces that pad them to 80 bytes. - dictionary.documents = headers - .document - .drain(..) - .flat_map(|record| record.lines) - .map(trim_end_spaces) - .collect(); - - // XXX warn for weird integer format - // XXX warn for weird floating-point format, etc. - - let mut decoder = Decoder { - encoding, - n_generated_names: 0, - }; - - let mut var_index_map = HashMap::new(); - let mut value_index = 0; - for (index, input) in headers - .variable - .iter() - .enumerate() - .filter(|(_index, record)| record.width != RawWidth::Continuation) - { - let name = trim_end_spaces(input.name.to_string()); - let name = match Identifier::from_encoding(name, encoding) { - Ok(name) => { - if !dictionary.variables.contains(&name.0) { - name - } else { - let new_name = decoder.generate_name(&dictionary); - warn(Error::DuplicateVariableName { - duplicate_name: name.clone(), - new_name: new_name.clone(), - }); - new_name - } - } - Err(id_error) => { - let new_name = decoder.generate_name(&dictionary); - warn(Error::InvalidVariableName { - id_error, - new_name: new_name.clone(), - }); - new_name - } - }; - let mut variable = Variable::new(name.clone(), VarWidth::try_from(input.width).unwrap()); - - // Set the short name the same as the long name (even if we renamed it). - variable.short_names = vec![name]; - - variable.label = input.label.clone(); - - variable.missing_values = input.missing_values.clone(); - - variable.print_format = decode_format( - input.print_format, - variable.width, - |new_spec, format_error| { - warn(Error::InvalidPrintFormat { - new_spec, - variable: variable.name.clone(), - format_error, - }) - }, - ); - variable.write_format = decode_format( - input.write_format, - variable.width, - |new_spec, format_error| { - warn(Error::InvalidWriteFormat { - new_spec, - variable: variable.name.clone(), - format_error, - }) - }, - ); - - // Check for long string continuation records. - let n_values = input.width.n_values().unwrap(); - for offset in 1..n_values { - if headers - .variable - .get(index + offset) - .is_none_or(|record| record.width != RawWidth::Continuation) - { - warn(Error::TBD); - break; - } - } - - let dict_index = dictionary.add_var(variable).unwrap(); - assert_eq!(var_index_map.insert(value_index, dict_index), None); - value_index += n_values; - } - - if let Some(weight_index) = headers.header.weight_index { - if let Some(dict_index) = var_index_map.get(&(weight_index as usize - 1)) { - let variable = &dictionary.variables[*dict_index]; - if variable.is_numeric() { - dictionary.weight = Some(*dict_index); - } else { - warn(Error::TBD); - } - } else { - warn(Error::TBD); - } - } - - for record in headers.value_label.drain(..) 
{ - let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len()); - let mut long_string_variables = Vec::new(); - for value_index in record.dict_indexes.iter() { - let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) else { - unreachable!() - }; - let variable = &dictionary.variables[*dict_index]; - if variable.width.is_long_string() { - long_string_variables.push(variable.name.clone()); - } else { - dict_indexes.push(*dict_index); - } - } - if !long_string_variables.is_empty() { - warn(Error::InvalidLongStringValueLabels { - offsets: record.offsets.clone(), - variables: long_string_variables, - }); - } - - for dict_index in dict_indexes { - let variable = dictionary.variables.get_index_mut2(dict_index).unwrap(); - for ValueLabel { value, label } in record.labels.iter().cloned() { - let value = value.decode(variable.width); - variable.value_labels.insert(value, label); - } - } - } - - if let Some(display) = &headers.var_display { - for (index, display) in display.0.iter().enumerate() { - if let Some(variable) = dictionary.variables.get_index_mut2(index) { - if let Some(width) = display.width { - variable.display_width = width; - } - if let Some(alignment) = display.alignment { - variable.alignment = alignment; - } - if let Some(measure) = display.measure { - variable.measure = Some(measure); - } - } else { - warn(Error::TBD); - } - } - } - - for record in headers - .multiple_response - .iter() - .flat_map(|record| record.0.iter()) - { - match MultipleResponseSet::decode(&dictionary, record, &warn) { - Ok(mrset) => { - dictionary.mrsets.insert(ByIdentifier::new(mrset)); - } - Err(error) => warn(error), - } - } - - 'outer: for record in headers - .very_long_strings - .drain(..) - .flat_map(|record| record.0.into_iter()) - { - let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else { - warn(Error::TBD); - continue; - }; - let width = VarWidth::String(record.length); - let n_segments = width.n_segments(); - if n_segments == 1 { - warn(Error::TBD); - continue; - } - if index + n_segments > dictionary.variables.len() { - warn(Error::TBD); - continue; - } - let mut short_names = Vec::with_capacity(n_segments); - for i in 0..n_segments { - let alloc_width = width.segment_alloc_width(i); - let segment = &dictionary.variables[index + i]; - short_names.push(segment.short_names[0].clone()); - let segment_width = segment.width.as_string_width().unwrap_or(0); - if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) { - warn(Error::TBD); - continue 'outer; - } - } - dictionary.delete_vars(index + 1..index + n_segments); - let variable = dictionary.variables.get_index_mut2(index).unwrap(); - variable.short_names = short_names; - variable.width = width; - } - - if headers.long_names.is_empty() { - // There are no long variable names. Use the short variable names, - // converted to lowercase, as the long variable names. - for index in 0..dictionary.variables.len() { - let lower = dictionary.variables[index].name.0.as_ref().to_lowercase(); - if let Ok(new_name) = Identifier::from_encoding(lower, dictionary.encoding) { - dictionary.try_rename_var(index, new_name); - } - } - } else { - // Rename each of the variables, one by one. (In a correctly - // constructed system file, this cannot create any intermediate - // duplicate variable names, because all of the new variable names are - // longer than any of the old variable names and thus there cannot be - // any overlaps.) 
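    // Editorial illustration (not from the original source): short names in a
    // system file are at most 8 bytes, and the comment above notes that every
    // name in a long-name record is longer than any short name, so renaming in
    // file order cannot collide. For example, renaming INCOME_A to
    // income_annual can never clash with a still-unrenamed INCOME_B, because
    // no remaining short name exceeds 8 bytes.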
- for renaming in headers - .long_names - .iter() - .flat_map(|record| record.0.iter().cloned()) - { - let LongName { - short_name, - long_name, - } = renaming; - if let Some(index) = dictionary.variables.get_index_of(&short_name.0) { - dictionary.try_rename_var(index, long_name); - dictionary - .variables - .get_index_mut2(index) - .unwrap() - .short_names = vec![short_name]; - } else { - warn(Error::TBD); - } - } - } - - for mut attr_set in headers - .variable_attributes - .drain(..) - .flat_map(|record| record.0.into_iter()) - { - if let Some((_, variable)) = dictionary - .variables - .get_full_mut2(&attr_set.long_var_name.0) - { - variable.attributes.append(&mut attr_set.attributes); - } else { - warn(Error::TBD); - } - } - - // Assign variable roles. - for index in 0..dictionary.variables.len() { - let variable = dictionary.variables.get_index_mut2(index).unwrap(); - match variable.attributes.role() { - Ok(role) => variable.role = role, - Err(InvalidRole) => warn(Error::TBD), - } - } - - // Long string value labels. - for record in headers - .long_string_value_labels - .drain(..) - .flat_map(|record| record.0.into_iter()) - { - let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else { - warn(Error::TBD); - continue; - }; - let Some(width) = variable.width.as_string_width() else { - warn(Error::TBD); - continue; - }; - for (mut value, label) in record.labels.into_iter() { - // XXX warn about too-long value? - value.0.resize(width, b' '); - // XXX warn abouat duplicate value labels? - variable.value_labels.insert(Value::String(value), label); - } - } - - let mut value = Vec::new(); - for record in headers - .long_string_missing_values - .drain(..) - .flat_map(|record| record.0.into_iter()) - { - let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else { - warn(Error::TBD); - continue; - }; - let values = record - .missing_values - .into_iter() - .map(|v| { - value.clear(); - value.extend_from_slice(v.0.as_slice()); - value.resize(variable.width.as_string_width().unwrap(), b' '); - Value::String(Box::from(value.as_slice())) - }) - .collect::>(); - variable.missing_values = MissingValues { - values, - range: None, - }; - } - - for record in headers - .variable_sets - .drain(..) 
- .flat_map(|record| record.sets.into_iter()) - { - let mut variables = Vec::with_capacity(record.variable_names.len()); - for variable_name in record.variable_names { - let Some((dict_index, _)) = dictionary.variables.get_full_mut2(&variable_name.0) else { - warn(Error::TBD); - continue; - }; - variables.push(dict_index); - } - if !variables.is_empty() { - let variable_set = VariableSet { - name: record.name, - variables, - }; - dictionary - .variable_sets - .insert(ByIdentifier::new(variable_set)); - } - } - - let metadata = Metadata::decode(&headers, warn); - Ok((dictionary, metadata)) -} - -impl MultipleResponseSet { - fn decode( - dictionary: &Dictionary, - input: &raw::MultipleResponseSet, - warn: &impl Fn(Error), - ) -> Result { - let mr_set_name = input.name.clone(); - let mut variables = Vec::with_capacity(input.short_names.len()); - for short_name in input.short_names.iter() { - let Some(dict_index) = dictionary.variables.get_index_of(&short_name.0) else { - warn(Error::UnknownMrSetVariable { - mr_set: mr_set_name.clone(), - short_name: short_name.clone(), - }); - continue; - }; - variables.push(dict_index); - } - - match variables.len() { - 0 => return Err(Error::EmptyMrSet(mr_set_name)), - 1 => return Err(Error::OneVarMrSet(mr_set_name)), - _ => (), - } - - let Some((Some(min_width), Some(max_width))) = variables - .iter() - .copied() - .map(|dict_index| dictionary.variables[dict_index].width) - .map(|w| (Some(w), Some(w))) - .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) - else { - return Err(Error::MixedMrSet(mr_set_name)); - }; - - let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, min_width)?; - - Ok(MultipleResponseSet { - name: mr_set_name, - width: min_width..=max_width, - label: input.label.to_string(), - mr_type, - variables, - }) - } -} - -fn trim_end_spaces(mut s: String) -> String { - s.truncate(s.trim_end_matches(' ').len()); - s -} - -/// Returns a copy of `s` in which all lone CR and CR LF pairs have been -/// replaced by LF. -/// -/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system -/// files that use CR-only line ends in the file label and extra product info.) 
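/// Illustrative example (editor's addition, not in the original file):
///
///     assert_eq!(fix_line_ends("line1\rline2\r\nline3"), "line1\nline2\nline3");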
-fn fix_line_ends(s: &str) -> String { - let mut out = String::with_capacity(s.len()); - let mut s = s.chars().peekable(); - while let Some(c) = s.next() { - match c { - '\r' => { - s.next_if_eq(&'\n'); - out.push('\n') - } - c => out.push(c), - } - } - out -} - -fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format { - UncheckedFormat::try_from(raw) - .and_then(Format::try_from) - .and_then(|x| x.check_width_compatibility(width)) - .unwrap_or_else(|error| { - let new_format = Format::default_for_width(width); - warn(new_format, error); - new_format - }) -} - -impl MultipleResponseType { - fn decode( - mr_set: &Identifier, - input: &raw::MultipleResponseType, - min_width: VarWidth, - ) -> Result { - match input { - raw::MultipleResponseType::MultipleDichotomy { value, labels } => { - let value = match min_width { - VarWidth::Numeric => { - let string = String::from_utf8_lossy(&value.0); - let number: f64 = string.trim().parse().map_err(|_| { - Error::InvalidMDGroupCountedValue { - mr_set: mr_set.clone(), - number: string.into(), - } - })?; - Value::Number(Some(number)) - } - VarWidth::String(max_width) => { - let mut value = value.0.as_slice(); - while value.ends_with(b" ") { - value = &value[..value.len() - 1]; - } - let width = value.len(); - if width > max_width as usize { - return Err(Error::TooWideMDGroupCountedValue { - mr_set: mr_set.clone(), - value: String::from_utf8_lossy(value).into(), - width, - max_width, - }); - }; - Value::String(value.into()) - } - }; - Ok(MultipleResponseType::MultipleDichotomy { - value, - labels: *labels, - }) - } - raw::MultipleResponseType::MultipleCategory => { - Ok(MultipleResponseType::MultipleCategory) - } - } - } -} diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index d0941be061..6a0a1843ba 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -18,7 +18,7 @@ use unicase::UniCase; use crate::{ format::Format, identifier::{ByIdentifier, HasIdentifier, Identifier}, - raw::{Alignment, CategoryLabels, Measure, MissingValues, RawString, VarType}, + sys::raw::{Alignment, CategoryLabels, Measure, MissingValues, RawString, VarType}, }; /// An index within [Dictionary::variables]. diff --git a/rust/pspp/src/encoding.rs b/rust/pspp/src/encoding.rs deleted file mode 100644 index c408bf56fa..0000000000 --- a/rust/pspp/src/encoding.rs +++ /dev/null @@ -1,95 +0,0 @@ -use crate::locale_charset::locale_charset; -use encoding_rs::{Encoding, UTF_8}; - -include!(concat!(env!("OUT_DIR"), "/encodings.rs")); - -pub fn codepage_from_encoding(encoding: &str) -> Option { - CODEPAGE_NAME_TO_NUMBER - .get(encoding.to_ascii_lowercase().as_str()) - .copied() -} - -use thiserror::Error as ThisError; - -#[derive(ThisError, Debug)] -pub enum Error { - #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")] - NoEncoding, - - #[error("This system file encodes text strings with unknown code page {0}.")] - UnknownCodepage(i32), - - #[error("This system file encodes text strings with unknown encoding {0}.")] - UnknownEncoding(String), - - #[error("This system file is encoded in EBCDIC, which is not supported.")] - Ebcdic, -} - -pub fn default_encoding() -> &'static Encoding { - lazy_static! 
{ - static ref DEFAULT_ENCODING: &'static Encoding = - Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8); - } - &DEFAULT_ENCODING -} - -pub fn get_encoding( - encoding: Option<&str>, - character_code: Option, -) -> Result<&'static Encoding, Error> { - let label = if let Some(encoding) = encoding { - encoding - } else if let Some(codepage) = character_code { - match codepage { - 1 => return Err(Error::Ebcdic), - 2 | 3 => { - // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] - // respectively. However, many files have character code 2 but - // data which are clearly not ASCII. Therefore, ignore these - // values. - return Err(Error::NoEncoding); - } - 4 => "MS_KANJI", - _ => CODEPAGE_NUMBER_TO_NAME - .get(&codepage) - .copied() - .ok_or(Error::UnknownCodepage(codepage))?, - } - } else { - return Err(Error::NoEncoding); - }; - - Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into())) -} - -/* -#[cfg(test)] -mod tests { - use std::thread::spawn; - - use encoding_rs::{EUC_JP, UTF_8, WINDOWS_1252}; - - #[test] - fn round_trip() { - let mut threads = Vec::new(); - for thread in 0..128 { - let start: u32 = thread << 25; - let end = start + ((1 << 25) - 1); - threads.push(spawn(move || { - for i in start..=end { - let s = i.to_le_bytes(); - let (utf8, replacement) = EUC_JP.decode_without_bom_handling(&s); - if !replacement { - let s2 = UTF_8.encode(&utf8).0; - assert_eq!(s.as_slice(), &*s2); - } - } - })); - } - for thread in threads { - thread.join().unwrap(); - } - } -} -*/ diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index 3078fffb43..2d17270b84 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -13,7 +13,7 @@ use unicode_width::UnicodeWidthStr; use crate::{ dictionary::{Value, VarWidth}, - raw::{self, RawString, VarType}, + sys::raw::{self, RawString, VarType}, }; mod display; @@ -377,7 +377,7 @@ impl Type { pub fn default_value(&self) -> Value { match self.var_type() { VarType::Numeric => Value::sysmis(), - VarType::String => Value::String(RawString::default()) + VarType::String => Value::String(RawString::default()), } } } diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 222d61de50..2f5887370f 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -3,8 +3,8 @@ use crate::{ dictionary::Value, endian::{Endian, Parse}, format::{DateTemplate, Decimals, Settings, TemplateItem, Type}, - raw::{EncodedStr, EncodedString}, settings::{EndianSettings, Settings as PsppSettings}, + sys::raw::{EncodedStr, EncodedString}, }; use encoding_rs::Encoding; use smallstr::SmallString; @@ -911,8 +911,8 @@ mod test { parse::{ParseError, ParseErrorKind, Sign}, Epoch, Format, Settings as FormatSettings, Type, }, - raw::EncodedStr, settings::EndianSettings, + sys::raw::EncodedStr, }; fn test(name: &str, type_: Type) { diff --git a/rust/pspp/src/lib.rs b/rust/pspp/src/lib.rs index 3540125c81..b78b711bb4 100644 --- a/rust/pspp/src/lib.rs +++ b/rust/pspp/src/lib.rs @@ -1,8 +1,6 @@ pub mod calendar; pub mod command; -pub mod cooked; pub mod dictionary; -pub mod encoding; pub mod endian; pub mod engine; pub mod format; @@ -14,6 +12,5 @@ pub mod macros; pub mod message; pub mod output; pub mod prompt; -pub mod raw; -pub mod sack; pub mod settings; +pub mod sys; diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index 62ab24337b..35c057a341 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -17,8 +17,8 @@ use anyhow::Result; use 
clap::{Parser, ValueEnum}; use encoding_rs::Encoding; -use pspp::cooked::{decode, Headers}; -use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record}; +use pspp::sys::cooked::{decode, Headers}; +use pspp::sys::raw::{encoding_from_headers, Decoder, Magic, Reader, Record}; use std::fs::File; use std::io::BufReader; use std::path::{Path, PathBuf}; diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index d79e01a3bd..14a6e51055 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -54,8 +54,8 @@ use tlo::parse_tlo; use crate::{ dictionary::Value as DataValue, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, - raw::VarType, settings::{Settings, Show}, + sys::raw::VarType, }; pub mod output; diff --git a/rust/pspp/src/raw.rs b/rust/pspp/src/raw.rs deleted file mode 100644 index 5c3eb85472..0000000000 --- a/rust/pspp/src/raw.rs +++ /dev/null @@ -1,3008 +0,0 @@ -use crate::{ - dictionary::{Attributes, Value, VarWidth}, - encoding::{default_encoding, get_encoding, Error as EncodingError}, - endian::{Endian, Parse, ToBytes}, - identifier::{Error as IdError, Identifier}, -}; - -use encoding_rs::{mem::decode_latin1, Encoding}; -use flate2::read::ZlibDecoder; -use num::Integer; -use std::{ - borrow::Cow, - cell::RefCell, - collections::{HashMap, VecDeque}, - fmt::{Debug, Display, Formatter, Result as FmtResult}, - io::{Error as IoError, Read, Seek, SeekFrom}, - mem::take, - num::NonZeroU8, - ops::Range, - rc::Rc, - str::from_utf8, -}; -use thiserror::Error as ThisError; - -#[derive(ThisError, Debug)] -pub enum Error { - #[error("Not an SPSS system file")] - NotASystemFile, - - #[error("Invalid magic number {0:?}")] - BadMagic([u8; 4]), - - #[error("I/O error ({0})")] - Io(#[from] IoError), - - #[error("Invalid SAV compression code {0}")] - InvalidSavCompression(u32), - - #[error("Invalid ZSAV compression code {0}")] - InvalidZsavCompression(u32), - - #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] - BadDocumentLength { offset: u64, n: usize, max: usize }, - - #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] - BadRecordType { offset: u64, rec_type: u32 }, - - #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")] - BadVariableWidth { start_offset: u64, width: i32 }, - - #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")] - BadVariableLabelCode { - start_offset: u64, - code_offset: u64, - code: u32, - }, - - #[error("At offset {offset:#x}, missing value code ({code}) is not -3, -2, 0, 1, 2, or 3.")] - BadMissingValueCode { offset: u64, code: i32 }, - - #[error( - "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." 
- )] - BadNumericMissingValueCode { offset: u64, code: i32 }, - - #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] - BadStringMissingValueCode { offset: u64, code: i32 }, - - #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] - BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")] - ExpectedVarIndexRecord { offset: u64, rec_type: u32 }, - - #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")] - TooManyVarIndexes { offset: u64, n: u32, max: u32 }, - - #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] - ExtensionRecordTooLarge { - offset: u64, - subtype: u32, - size: u32, - count: u32, - }, - - #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] - EofInCase { - offset: u64, - case_ofs: u64, - case_len: usize, - }, - - #[error( - "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." - )] - EofInCompressedCase { offset: u64, case_ofs: u64 }, - - #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] - PartialCompressedCase { offset: u64, case_ofs: u64 }, - - #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] - CompressedNumberExpected { offset: u64, case_ofs: u64 }, - - #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] - CompressedStringExpected { offset: u64, case_ofs: u64 }, - - #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] - BadZlibTrailerNBlocks { - offset: u64, - n_blocks: u32, - expected_n_blocks: u64, - ztrailer_len: u64, - }, - - #[error("{0}")] - EncodingError(EncodingError), -} - -#[derive(ThisError, Debug)] -pub enum Warning { - #[error("Unexpected end of data inside extension record.")] - UnexpectedEndOfData, - - #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")] - NoVarIndexes { offset: u64 }, - - #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())] - MixedVarTypes { - offset: u64, - var_type: VarType, - wrong_types: Vec, - }, - - #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}] or referred to string continuations: {invalid:?}")] - InvalidVarIndexes { - offset: u64, - max: usize, - invalid: Vec, - }, - - #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] - BadRecordSize { - offset: u64, - record: String, - size: u32, - expected_size: u32, - }, - - #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] - BadRecordCount { - offset: u64, - record: String, - count: u32, - expected_count: u32, - }, - - #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset 
{offset:#x} is {value_len} instead of the expected 8.")] - BadLongMissingValueLength { - record_offset: u64, - offset: u64, - value_len: u32, - }, - - #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] - BadEncodingName { offset: u64 }, - - // XXX This is risky because `text` might be arbitarily long. - #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] - MalformedString { encoding: String, text: String }, - - #[error("Invalid variable measurement level value {0}")] - InvalidMeasurement(u32), - - #[error("Invalid variable display alignment value {0}")] - InvalidAlignment(u32), - - #[error("Invalid attribute name. {0}")] - InvalidAttributeName(IdError), - - #[error("Invalid variable name in attribute record. {0}")] - InvalidAttributeVariableName(IdError), - - #[error("Invalid short name in long variable name record. {0}")] - InvalidShortName(IdError), - - #[error("Invalid name in long variable name record. {0}")] - InvalidLongName(IdError), - - #[error("Invalid variable name in very long string record. {0}")] - InvalidLongStringName(IdError), - - #[error("Invalid variable name in variable set record. {0}")] - InvalidVariableSetName(IdError), - - #[error("Invalid multiple response set name. {0}")] - InvalidMrSetName(IdError), - - #[error("Invalid multiple response set variable name. {0}")] - InvalidMrSetVariableName(IdError), - - #[error("Invalid variable name in long string missing values record. {0}")] - InvalidLongStringMissingValueVariableName(IdError), - - #[error("Invalid variable name in long string value label record. {0}")] - InvalidLongStringValueLabelName(IdError), - - #[error("{0}")] - EncodingError(EncodingError), - - #[error("Details TBD")] - TBD, -} - -impl From for Warning { - fn from(_source: IoError) -> Self { - Self::UnexpectedEndOfData - } -} - -#[derive(Clone, Debug)] -pub enum Record { - Header(HeaderRecord), - Variable(VariableRecord), - ValueLabel(ValueLabelRecord, RawString>), - Document(DocumentRecord), - IntegerInfo(IntegerInfoRecord), - FloatInfo(FloatInfoRecord), - VarDisplay(VarDisplayRecord), - MultipleResponse(MultipleResponseRecord), - LongStringValueLabels(LongStringValueLabelRecord), - LongStringMissingValues(LongStringMissingValueRecord), - Encoding(EncodingRecord), - NumberOfCases(NumberOfCasesRecord), - Text(TextRecord), - OtherExtension(Extension), - EndOfHeaders(u32), - ZHeader(ZHeader), - ZTrailer(ZTrailer), - Cases(Rc>), -} - -#[derive(Clone, Debug)] -pub enum DecodedRecord { - Header(HeaderRecord), - Variable(VariableRecord), - ValueLabel(ValueLabelRecord, String>), - Document(DocumentRecord), - IntegerInfo(IntegerInfoRecord), - FloatInfo(FloatInfoRecord), - VarDisplay(VarDisplayRecord), - MultipleResponse(MultipleResponseRecord), - LongStringValueLabels(LongStringValueLabelRecord), - LongStringMissingValues(LongStringMissingValueRecord), - Encoding(EncodingRecord), - NumberOfCases(NumberOfCasesRecord), - VariableSets(VariableSetRecord), - ProductInfo(ProductInfoRecord), - LongNames(LongNamesRecord), - VeryLongStrings(VeryLongStringsRecord), - FileAttributes(FileAttributeRecord), - VariableAttributes(VariableAttributeRecord), - OtherExtension(Extension), - EndOfHeaders(u32), - ZHeader(ZHeader), - ZTrailer(ZTrailer), - Cases(Rc>), -} - -impl Record { - fn read( - reader: &mut R, - endian: Endian, - var_types: &VarTypes, - warn: &dyn Fn(Warning), - ) -> Result, Error> - where - R: Read + Seek, - { - let rec_type: u32 = endian.parse(read_bytes(reader)?); - match rec_type 
{ - 2 => Ok(Some(VariableRecord::read(reader, endian, warn)?)), - 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?), - 6 => Ok(Some(DocumentRecord::read(reader, endian)?)), - 7 => Extension::read(reader, endian, var_types.n_values(), warn), - 999 => Ok(Some(Record::EndOfHeaders( - endian.parse(read_bytes(reader)?), - ))), - _ => Err(Error::BadRecordType { - offset: reader.stream_position()?, - rec_type, - }), - } - } - - pub fn decode(self, decoder: &Decoder) -> Result { - Ok(match self { - Record::Header(record) => record.decode(decoder), - Record::Variable(record) => record.decode(decoder), - Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)), - Record::Document(record) => record.decode(decoder), - Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()), - Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()), - Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()), - Record::MultipleResponse(record) => record.decode(decoder), - Record::LongStringValueLabels(record) => { - DecodedRecord::LongStringValueLabels(record.decode(decoder)) - } - Record::LongStringMissingValues(record) => { - DecodedRecord::LongStringMissingValues(record.decode(decoder)) - } - Record::Encoding(record) => DecodedRecord::Encoding(record.clone()), - Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()), - Record::Text(record) => record.decode(decoder), - Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()), - Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record), - Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()), - Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()), - Record::Cases(record) => DecodedRecord::Cases(record.clone()), - }) - } -} - -pub fn encoding_from_headers( - headers: &Vec, - warn: &impl Fn(Warning), -) -> Result<&'static Encoding, Error> { - let mut encoding_record = None; - let mut integer_info_record = None; - for record in headers { - match record { - Record::Encoding(record) => encoding_record = Some(record), - Record::IntegerInfo(record) => integer_info_record = Some(record), - _ => (), - } - } - let encoding = encoding_record.map(|record| record.0.as_str()); - let character_code = integer_info_record.map(|record| record.character_code); - match get_encoding(encoding, character_code) { - Ok(encoding) => Ok(encoding), - Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)), - Err(err) => { - warn(Warning::EncodingError(err)); - // Warn that we're using the default encoding. - Ok(default_encoding()) - } - } -} - -// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it -// decoded as Latin-1 (actually bytes interpreted as Unicode code points). -fn default_decode(s: &[u8]) -> Cow { - from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Compression { - Simple, - ZLib, -} - -#[derive(Clone)] -pub struct HeaderRecord -where - S: Debug, -{ - /// Offset in file. - pub offsets: Range, - - /// Magic number. - pub magic: Magic, - - /// Eye-catcher string, product name, in the file's encoding. Padded - /// on the right with spaces. - pub eye_catcher: S, - - /// Layout code, normally either 2 or 3. - pub layout_code: u32, - - /// Number of variable positions, or `None` if the value in the file is - /// questionably trustworthy. 
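    /// (Editor's note: `HeaderRecord::read` below treats a claimed case size
    /// larger than `i32::MAX / 16` as untrustworthy and stores `None` here.)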
- pub nominal_case_size: Option, - - /// Compression type, if any, - pub compression: Option, - - /// 1-based variable index of the weight variable, or `None` if the file is - /// unweighted. - pub weight_index: Option, - - /// Claimed number of cases, if known. - pub n_cases: Option, - - /// Compression bias, usually 100.0. - pub bias: f64, - - /// `dd mmm yy` in the file's encoding. - pub creation_date: S, - - /// `HH:MM:SS` in the file's encoding. - pub creation_time: S, - - /// File label, in the file's encoding. Padded on the right with spaces. - pub file_label: S, - - /// Endianness of the data in the file header. - pub endian: Endian, -} - -impl HeaderRecord -where - S: Debug, -{ - fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult - where - T: Debug, - { - writeln!(f, "{name:>17}: {:?}", value) - } -} - -impl Debug for HeaderRecord -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!(f, "File header record:")?; - self.debug_field(f, "Magic", self.magic)?; - self.debug_field(f, "Product name", &self.eye_catcher)?; - self.debug_field(f, "Layout code", self.layout_code)?; - self.debug_field(f, "Nominal case size", self.nominal_case_size)?; - self.debug_field(f, "Compression", self.compression)?; - self.debug_field(f, "Weight index", self.weight_index)?; - self.debug_field(f, "Number of cases", self.n_cases)?; - self.debug_field(f, "Compression bias", self.bias)?; - self.debug_field(f, "Creation date", &self.creation_date)?; - self.debug_field(f, "Creation time", &self.creation_time)?; - self.debug_field(f, "File label", &self.file_label)?; - self.debug_field(f, "Endianness", self.endian) - } -} - -impl HeaderRecord { - fn read(r: &mut R) -> Result { - let start = r.stream_position()?; - - let magic: [u8; 4] = read_bytes(r)?; - let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; - - let eye_catcher = RawString(read_vec(r, 60)?); - let layout_code: [u8; 4] = read_bytes(r)?; - let endian = Endian::identify_u32(2, layout_code) - .or_else(|| Endian::identify_u32(2, layout_code)) - .ok_or(Error::NotASystemFile)?; - let layout_code = endian.parse(layout_code); - - let nominal_case_size: u32 = endian.parse(read_bytes(r)?); - let nominal_case_size = - (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size); - - let compression_code: u32 = endian.parse(read_bytes(r)?); - let compression = match (magic, compression_code) { - (Magic::Zsav, 2) => Some(Compression::ZLib), - (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)), - (_, 0) => None, - (_, 1) => Some(Compression::Simple), - (_, code) => return Err(Error::InvalidSavCompression(code)), - }; - - let weight_index: u32 = endian.parse(read_bytes(r)?); - let weight_index = (weight_index > 0).then_some(weight_index); - - let n_cases: u32 = endian.parse(read_bytes(r)?); - let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); - - let bias: f64 = endian.parse(read_bytes(r)?); - - let creation_date = RawString(read_vec(r, 9)?); - let creation_time = RawString(read_vec(r, 8)?); - let file_label = RawString(read_vec(r, 64)?); - let _: [u8; 3] = read_bytes(r)?; - - Ok(HeaderRecord { - offsets: start..r.stream_position()?, - magic, - layout_code, - nominal_case_size, - compression, - weight_index, - n_cases, - bias, - creation_date, - creation_time, - eye_catcher, - file_label, - endian, - }) - } - - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { - let eye_catcher = decoder.decode(&self.eye_catcher).to_string(); - let 
file_label = decoder.decode(&self.file_label).to_string(); - let creation_date = decoder.decode(&self.creation_date).to_string(); - let creation_time = decoder.decode(&self.creation_time).to_string(); - DecodedRecord::Header(HeaderRecord { - eye_catcher, - weight_index: self.weight_index, - n_cases: self.n_cases, - file_label, - offsets: self.offsets.clone(), - magic: self.magic, - layout_code: self.layout_code, - nominal_case_size: self.nominal_case_size, - compression: self.compression, - bias: self.bias, - creation_date, - creation_time, - endian: self.endian, - }) - } -} - -pub struct Decoder { - pub encoding: &'static Encoding, - pub warn: Box, -} - -impl Decoder { - pub fn new(encoding: &'static Encoding, warn: F) -> Self - where - F: Fn(Warning) + 'static, - { - Self { - encoding, - warn: Box::new(warn), - } - } - fn warn(&self, warning: Warning) { - (self.warn)(warning) - } - fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { - let (output, malformed) = self.encoding.decode_without_bom_handling(input); - if malformed { - self.warn(Warning::MalformedString { - encoding: self.encoding.name().into(), - text: output.clone().into(), - }); - } - output - } - - fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> { - self.decode_slice(input.0.as_slice()) - } - - pub fn decode_identifier(&self, input: &RawString) -> Result { - self.new_identifier(&self.decode(input)) - } - - pub fn new_identifier(&self, name: &str) -> Result { - Identifier::from_encoding(name, self.encoding) - } -} - -#[derive(Copy, Clone, PartialEq, Eq, Hash)] -pub enum Magic { - /// Regular system file. - Sav, - - /// System file with Zlib-compressed data. - Zsav, - - /// EBCDIC-encoded system file. - Ebcdic, -} - -impl Magic { - /// Magic number for a regular system file. - pub const SAV: [u8; 4] = *b"$FL2"; - - /// Magic number for a system file that contains zlib-compressed data. - pub const ZSAV: [u8; 4] = *b"$FL3"; - - /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded - /// in EBCDIC. 
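    /// (Editor's note: in EBCDIC code page 037, `$` is 0x5B, `F` is 0xC6,
    /// `L` is 0xD3, and `2` is 0xF2, which is how the bytes below spell `$FL2`.)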
- pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2]; -} - -impl Debug for Magic { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let s = match *self { - Magic::Sav => "$FL2", - Magic::Zsav => "$FL3", - Magic::Ebcdic => "($FL2 in EBCDIC)", - }; - write!(f, "{s}") - } -} - -impl TryFrom<[u8; 4]> for Magic { - type Error = Error; - - fn try_from(value: [u8; 4]) -> Result { - match value { - Magic::SAV => Ok(Magic::Sav), - Magic::ZSAV => Ok(Magic::Zsav), - Magic::EBCDIC => Ok(Magic::Ebcdic), - _ => Err(Error::BadMagic(value)), - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum VarType { - Numeric, - String, -} - -impl VarType { - pub fn opposite(self) -> VarType { - match self { - Self::Numeric => Self::String, - Self::String => Self::Numeric, - } - } -} - -impl Display for VarType { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - VarType::Numeric => write!(f, "numeric"), - VarType::String => write!(f, "string"), - } - } -} - -impl TryFrom for VarType { - type Error = (); - - fn try_from(value: RawWidth) -> Result { - match value { - RawWidth::Continuation => Err(()), - RawWidth::Numeric => Ok(VarType::Numeric), - RawWidth::String(_) => Ok(VarType::String), - } - } -} - -impl TryFrom for VarWidth { - type Error = (); - - fn try_from(value: RawWidth) -> Result { - match value { - RawWidth::Continuation => Err(()), - RawWidth::Numeric => Ok(Self::Numeric), - RawWidth::String(width) => Ok(Self::String(width.get() as u16)), - } - } -} - -type RawValue = Value>; - -impl RawValue { - pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self { - match var_type { - VarType::String => Value::String(RawStrArray(raw.0)), - VarType::Numeric => Value::Number(endian.parse(raw.0)), - } - } - - fn read_case( - reader: &mut R, - var_types: &VarTypes, - endian: Endian, - ) -> Result>, Error> { - let case_start = reader.stream_position()?; - let mut values = Vec::with_capacity(var_types.n_values()); - for (i, var_type) in var_types.iter().enumerate() { - let Some(raw) = try_read_bytes(reader)? else { - if i == 0 { - return Ok(None); - } else { - let offset = reader.stream_position()?; - return Err(Error::EofInCase { - offset, - case_ofs: offset - case_start, - case_len: var_types.n_values() * 8, - }); - } - }; - values.push(Value::from_raw(&UntypedValue(raw), var_type, endian)); - } - Ok(Some(values)) - } - - fn read_compressed_case( - reader: &mut R, - var_types: &VarTypes, - codes: &mut VecDeque, - endian: Endian, - bias: f64, - ) -> Result>, Error> { - let case_start = reader.stream_position()?; - let mut values = Vec::with_capacity(var_types.n_values()); - for (i, var_type) in var_types.iter().enumerate() { - let value = loop { - let Some(code) = codes.pop_front() else { - let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? 
else { - if i == 0 { - return Ok(None); - } else { - let offset = reader.stream_position()?; - return Err(Error::EofInCompressedCase { - offset, - case_ofs: offset - case_start, - }); - } - }; - codes.extend(new_codes.into_iter()); - continue; - }; - match code { - 0 => (), - 1..=251 => match var_type { - VarType::Numeric => break Self::Number(Some(code as f64 - bias)), - VarType::String => { - break Self::String(RawStrArray(endian.to_bytes(code as f64 - bias))) - } - }, - 252 => { - if i == 0 { - return Ok(None); - } else { - let offset = reader.stream_position()?; - return Err(Error::PartialCompressedCase { - offset, - case_ofs: offset - case_start, - }); - } - } - 253 => { - break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian) - } - 254 => match var_type { - VarType::String => break Self::String(RawStrArray(*b" ")), // XXX EBCDIC - VarType::Numeric => { - return Err(Error::CompressedStringExpected { - offset: case_start, - case_ofs: reader.stream_position()? - case_start, - }) - } - }, - 255 => match var_type { - VarType::Numeric => break Self::Number(None), - VarType::String => { - return Err(Error::CompressedNumberExpected { - offset: case_start, - case_ofs: reader.stream_position()? - case_start, - }) - } - }, - } - }; - values.push(value); - } - Ok(Some(values)) - } - - pub fn decode(&self, width: VarWidth) -> Value { - match self { - Self::Number(x) => Value::Number(*x), - Self::String(s) => { - let width = width.as_string_width().unwrap(); - Value::String(RawString::from(&s.0[..width])) - } - } - } -} - -struct ZlibDecodeMultiple -where - R: Read + Seek, -{ - reader: Option>, -} - -impl ZlibDecodeMultiple -where - R: Read + Seek, -{ - fn new(reader: R) -> ZlibDecodeMultiple { - ZlibDecodeMultiple { - reader: Some(ZlibDecoder::new(reader)), - } - } -} - -impl Read for ZlibDecodeMultiple -where - R: Read + Seek, -{ - fn read(&mut self, buf: &mut [u8]) -> Result { - loop { - match self.reader.as_mut().unwrap().read(buf)? 
{ - 0 => { - let inner = self.reader.take().unwrap().into_inner(); - self.reader = Some(ZlibDecoder::new(inner)); - } - n => return Ok(n), - }; - } - } -} - -impl Seek for ZlibDecodeMultiple -where - R: Read + Seek, -{ - fn seek(&mut self, pos: SeekFrom) -> Result { - self.reader.as_mut().unwrap().get_mut().seek(pos) - } -} - -enum ReaderState { - Start, - Headers, - ZlibHeader, - ZlibTrailer { - ztrailer_offset: u64, - ztrailer_len: u64, - }, - Cases, - End, -} - -pub struct Reader -where - R: Read + Seek + 'static, -{ - reader: Option, - warn: Box, - - header: HeaderRecord, - var_types: VarTypes, - - state: ReaderState, -} - -impl Reader -where - R: Read + Seek + 'static, -{ - pub fn new(mut reader: R, warn: F) -> Result - where - F: Fn(Warning) + 'static, - { - let header = HeaderRecord::read(&mut reader)?; - Ok(Self { - reader: Some(reader), - warn: Box::new(warn), - header, - var_types: VarTypes::new(), - state: ReaderState::Start, - }) - } - fn cases(&mut self) -> Cases { - self.state = ReaderState::End; - Cases::new( - self.reader.take().unwrap(), - take(&mut self.var_types), - &self.header, - ) - } - fn _next(&mut self) -> Option<::Item> { - match self.state { - ReaderState::Start => { - self.state = ReaderState::Headers; - Some(Ok(Record::Header(self.header.clone()))) - } - ReaderState::Headers => { - let record = loop { - match Record::read( - self.reader.as_mut().unwrap(), - self.header.endian, - &self.var_types, - &self.warn, - ) { - Ok(Some(record)) => break record, - Ok(None) => (), - Err(error) => return Some(Err(error)), - } - }; - match record { - Record::Variable(VariableRecord { width, .. }) => self.var_types.push(width), - Record::EndOfHeaders(_) => { - self.state = if let Some(Compression::ZLib) = self.header.compression { - ReaderState::ZlibHeader - } else { - ReaderState::Cases - }; - } - _ => (), - }; - Some(Ok(record)) - } - ReaderState::ZlibHeader => { - let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian) - { - Ok(zheader) => zheader, - Err(error) => return Some(Err(error)), - }; - self.state = ReaderState::ZlibTrailer { - ztrailer_offset: zheader.ztrailer_offset, - ztrailer_len: zheader.ztrailer_len, - }; - Some(Ok(Record::ZHeader(zheader))) - } - ReaderState::ZlibTrailer { - ztrailer_offset, - ztrailer_len, - } => { - match ZTrailer::read( - self.reader.as_mut().unwrap(), - self.header.endian, - ztrailer_offset, - ztrailer_len, - ) { - Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), - Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))), - Err(error) => Some(Err(error)), - } - } - ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), - ReaderState::End => None, - } - } -} - -impl Iterator for Reader -where - R: Read + Seek + 'static, -{ - type Item = Result; - - fn next(&mut self) -> Option { - let retval = self._next(); - if matches!(retval, Some(Err(_))) { - self.state = ReaderState::End; - } - retval - } -} - -trait ReadSeek: Read + Seek {} -impl ReadSeek for T where T: Read + Seek {} - -pub struct Cases { - reader: Box, - var_types: VarTypes, - compression: Option, - bias: f64, - endian: Endian, - codes: VecDeque, - eof: bool, -} - -impl Debug for Cases { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "Cases") - } -} - -impl Cases { - fn new(reader: R, var_types: VarTypes, header: &HeaderRecord) -> Self - where - R: Read + Seek + 'static, - { - Self { - reader: if header.compression == Some(Compression::ZLib) { - 
Box::new(ZlibDecodeMultiple::new(reader)) - } else { - Box::new(reader) - }, - var_types, - compression: header.compression, - bias: header.bias, - endian: header.endian, - codes: VecDeque::with_capacity(8), - eof: false, - } - } -} - -impl Iterator for Cases { - type Item = Result, Error>; - - fn next(&mut self) -> Option { - if self.eof { - return None; - } - - let retval = if self.compression.is_some() { - Value::read_compressed_case( - &mut self.reader, - &self.var_types, - &mut self.codes, - self.endian, - self.bias, - ) - .transpose() - } else { - Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose() - }; - self.eof = matches!(retval, None | Some(Err(_))); - retval - } -} - -#[derive(Copy, Clone, PartialEq, Eq, Hash)] -pub struct Spec(pub u32); - -impl Debug for Spec { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let type_ = format_name(self.0 >> 16); - let w = (self.0 >> 8) & 0xff; - let d = self.0 & 0xff; - write!(f, "{:06x} ({type_}{w}.{d})", self.0) - } -} - -fn format_name(type_: u32) -> Cow<'static, str> { - match type_ { - 1 => "A", - 2 => "AHEX", - 3 => "COMMA", - 4 => "DOLLAR", - 5 => "F", - 6 => "IB", - 7 => "PIBHEX", - 8 => "P", - 9 => "PIB", - 10 => "PK", - 11 => "RB", - 12 => "RBHEX", - 15 => "Z", - 16 => "N", - 17 => "E", - 20 => "DATE", - 21 => "TIME", - 22 => "DATETIME", - 23 => "ADATE", - 24 => "JDATE", - 25 => "DTIME", - 26 => "WKDAY", - 27 => "MONTH", - 28 => "MOYR", - 29 => "QYR", - 30 => "WKYR", - 31 => "PCT", - 32 => "DOT", - 33 => "CCA", - 34 => "CCB", - 35 => "CCC", - 36 => "CCD", - 37 => "CCE", - 38 => "EDATE", - 39 => "SDATE", - 40 => "MTIME", - 41 => "YMDHMS", - _ => return format!("").into(), - } - .into() -} - -#[derive(Clone)] -pub struct MissingValues> -where - S: Debug, -{ - /// Individual missing values, up to 3 of them. - pub values: Vec>, - - /// Optional range of missing values. 
- pub range: Option<(Value, Value)>, -} - -impl Debug for MissingValues -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - for (i, value) in self.values.iter().enumerate() { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{value:?}")?; - } - - if let Some((low, high)) = &self.range { - if !self.values.is_empty() { - write!(f, ", ")?; - } - write!(f, "{low:?} THRU {high:?}")?; - } - - if self.is_empty() { - write!(f, "none")?; - } - - Ok(()) - } -} - -impl MissingValues -where - S: Debug, -{ - fn is_empty(&self) -> bool { - self.values.is_empty() && self.range.is_none() - } -} - -impl Default for MissingValues -where - S: Debug, -{ - fn default() -> Self { - Self { - values: Vec::new(), - range: None, - } - } -} - -impl MissingValues { - fn read( - r: &mut R, - offset: u64, - width: RawWidth, - code: i32, - endian: Endian, - warn: &dyn Fn(Warning), - ) -> Result { - let (individual_values, has_range) = match code { - 0..=3 => (code as usize, false), - -2 => (0, true), - -3 => (1, true), - _ => return Err(Error::BadMissingValueCode { offset, code }), - }; - - let mut values = Vec::with_capacity(individual_values); - for _ in 0..individual_values { - values.push(read_bytes::<8, _>(r)?); - } - let range = if has_range { - let low = read_bytes::<8, _>(r)?; - let high = read_bytes::<8, _>(r)?; - Some((low, high)) - } else { - None - }; - - match VarWidth::try_from(width) { - Ok(VarWidth::Numeric) => { - let values = values - .into_iter() - .map(|v| Value::Number(endian.parse(v))) - .collect(); - let range = range.map(|(low, high)| { - ( - Value::Number(endian.parse(low)), - Value::Number(endian.parse(high)), - ) - }); - return Ok(Self { values, range }); - } - Ok(VarWidth::String(width)) if width <= 8 && range.is_none() => { - let values = values - .into_iter() - .map(|value| Value::String(Box::from(&value[..width as usize]))) - .collect(); - return Ok(Self { - values, - range: None, - }); - } - Ok(VarWidth::String(width)) if width > 8 => warn(Warning::TBD), - Ok(VarWidth::String(_)) => warn(Warning::TBD), - Err(()) => warn(Warning::TBD), - } - Ok(Self::default()) - } -} - -#[derive(Clone)] -pub struct VariableRecord -where - S: Debug, -{ - /// Range of offsets in file. - pub offsets: Range, - - /// Variable width, in the range -1..=255. - pub width: RawWidth, - - /// Variable name, padded on the right with spaces. - pub name: S, - - /// Print format. - pub print_format: Spec, - - /// Write format. - pub write_format: Spec, - - /// Missing values. - pub missing_values: MissingValues, - - /// Optional variable label. 
- pub label: Option, -} - -#[derive(Copy, Clone, PartialEq, Eq)] -pub enum RawWidth { - Continuation, - Numeric, - String(NonZeroU8), -} - -impl RawWidth { - pub fn n_values(&self) -> Option { - match self { - RawWidth::Numeric => Some(1), - RawWidth::String(width) => Some((width.get() as usize).div_ceil(8)), - _ => None, - } - } -} - -impl TryFrom for RawWidth { - type Error = (); - - fn try_from(value: i32) -> Result { - match value { - -1 => Ok(Self::Continuation), - 0 => Ok(Self::Numeric), - 1..=255 => Ok(Self::String(NonZeroU8::new(value as u8).unwrap())), - _ => Err(()), - } - } -} - -impl Display for RawWidth { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self { - RawWidth::Continuation => write!(f, "long string continuation"), - RawWidth::Numeric => write!(f, "numeric"), - RawWidth::String(width) => write!(f, "{width}-byte string"), - } - } -} - -impl Debug for VariableRecord -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!(f, "Width: {}", self.width,)?; - writeln!(f, "Print format: {:?}", self.print_format)?; - writeln!(f, "Write format: {:?}", self.write_format)?; - writeln!(f, "Name: {:?}", &self.name)?; - writeln!(f, "Variable label: {:?}", self.label)?; - writeln!(f, "Missing values: {:?}", self.missing_values) - } -} - -impl VariableRecord { - fn read( - r: &mut R, - endian: Endian, - warn: &dyn Fn(Warning), - ) -> Result { - let start_offset = r.stream_position()?; - let width: i32 = endian.parse(read_bytes(r)?); - let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth { - start_offset, - width, - })?; - let code_offset = r.stream_position()?; - let has_variable_label: u32 = endian.parse(read_bytes(r)?); - let missing_value_code: i32 = endian.parse(read_bytes(r)?); - let print_format = Spec(endian.parse(read_bytes(r)?)); - let write_format = Spec(endian.parse(read_bytes(r)?)); - let name = RawString(read_vec(r, 8)?); - - let label = match has_variable_label { - 0 => None, - 1 => { - let len: u32 = endian.parse(read_bytes(r)?); - let read_len = len.min(65535) as usize; - let label = RawString(read_vec(r, read_len)?); - - let padding_bytes = Integer::next_multiple_of(&len, &4) - len; - let _ = read_vec(r, padding_bytes as usize)?; - - Some(label) - } - _ => { - return Err(Error::BadVariableLabelCode { - start_offset, - code_offset, - code: has_variable_label, - }) - } - }; - - let missing_values = - MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?; - - let end_offset = r.stream_position()?; - - Ok(Record::Variable(VariableRecord { - offsets: start_offset..end_offset, - width, - name, - print_format, - write_format, - missing_values, - label, - })) - } - - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { - DecodedRecord::Variable(VariableRecord { - offsets: self.offsets.clone(), - width: self.width, - name: decoder.decode(&self.name).to_string(), - print_format: self.print_format, - write_format: self.write_format, - missing_values: self.missing_values, - label: self - .label - .as_ref() - .map(|label| decoder.decode(label).to_string()), - }) - } -} - -#[derive(Copy, Clone)] -pub struct UntypedValue(pub [u8; 8]); - -impl Debug for UntypedValue { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let little: f64 = Endian::Little.parse(self.0); - let little = format!("{:?}", little); - let big: f64 = Endian::Big.parse(self.0); - let big = format!("{:?}", big); - let number = if little.len() <= big.len() { - little - } else { - big - }; - write!(f, "{number}")?; - - let 
string = default_decode(&self.0); - let string = string - .split(|c: char| c == '\0' || c.is_control()) - .next() - .unwrap(); - write!(f, "{string:?}")?; - Ok(()) - } -} - -#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)] -pub struct RawString(pub Vec); - -impl RawString { - pub fn spaces(n: usize) -> Self { - Self(std::iter::repeat_n(b' ', n).collect()) - } - pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { - EncodedStr::new(&self.0, encoding) - } -} - -impl From> for RawString { - fn from(value: Cow<'_, [u8]>) -> Self { - Self(value.into_owned()) - } -} - -impl From> for RawString { - fn from(source: Vec) -> Self { - Self(source) - } -} - -impl From<&[u8]> for RawString { - fn from(source: &[u8]) -> Self { - Self(source.into()) - } -} - -impl Debug for RawString { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", default_decode(self.0.as_slice())) - } -} - -#[derive(Copy, Clone)] -pub struct RawStrArray(pub [u8; N]); - -impl From<[u8; N]> for RawStrArray { - fn from(source: [u8; N]) -> Self { - Self(source) - } -} - -impl Debug for RawStrArray { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", default_decode(&self.0)) - } -} - -#[derive(Clone, Debug)] -pub enum EncodedString { - Encoded { - bytes: Vec, - encoding: &'static Encoding, - }, - Utf8 { - s: String, - }, -} - -impl EncodedString { - pub fn borrowed(&self) -> EncodedStr<'_> { - match self { - EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding }, - EncodedString::Utf8 { s } => EncodedStr::Utf8 { s }, - } - } -} - -impl<'a> From> for EncodedString { - fn from(value: EncodedStr<'a>) -> Self { - match value { - EncodedStr::Encoded { bytes, encoding } => Self::Encoded { - bytes: bytes.into(), - encoding, - }, - EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, - } - } -} - -pub enum EncodedStr<'a> { - Encoded { - bytes: &'a [u8], - encoding: &'static Encoding, - }, - Utf8 { - s: &'a str, - }, -} - -impl<'a> EncodedStr<'a> { - pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { - Self::Encoded { bytes, encoding } - } - pub fn as_str(&self) -> Cow<'_, str> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - encoding.decode_without_bom_handling(bytes).0 - } - EncodedStr::Utf8 { s } => Cow::from(*s), - } - } - pub fn as_bytes(&self) -> &[u8] { - match self { - EncodedStr::Encoded { bytes, .. } => bytes, - EncodedStr::Utf8 { s } => s.as_bytes(), - } - } - pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - let utf8 = encoding.decode_without_bom_handling(bytes).0; - match encoding.encode(&utf8).0 { - Cow::Borrowed(_) => { - // Recoding into UTF-8 and then back did not change anything. - Cow::from(*bytes) - } - Cow::Owned(owned) => Cow::Owned(owned), - } - } - EncodedStr::Utf8 { s } => encoding.encode(s).0, - } - } - pub fn is_empty(&self) -> bool { - match self { - EncodedStr::Encoded { bytes, .. 
} => bytes.is_empty(), - EncodedStr::Utf8 { s } => s.is_empty(), - } - } - pub fn quoted(&self) -> QuotedEncodedStr { - QuotedEncodedStr(self) - } -} - -impl<'a> From<&'a str> for EncodedStr<'a> { - fn from(s: &'a str) -> Self { - Self::Utf8 { s } - } -} - -impl<'a> From<&'a String> for EncodedStr<'a> { - fn from(s: &'a String) -> Self { - Self::Utf8 { s: s.as_str() } - } -} - -pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>); - -impl Display for QuotedEncodedStr<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0.as_str()) - } -} - -#[derive(Clone, Debug)] -pub struct ValueLabel -where - V: Debug, - S: Debug, -{ - pub value: Value, - pub label: S, -} - -#[derive(Clone)] -pub struct ValueLabelRecord -where - V: Debug, - S: Debug, -{ - /// Range of offsets in file. - pub offsets: Range, - - /// The labels. - pub labels: Vec>, - - /// The 1-based indexes of the variable indexes. - pub dict_indexes: Vec, - - /// The types of the variables. - pub var_type: VarType, -} - -impl Debug for ValueLabelRecord -where - V: Debug, - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!(f, "labels: ")?; - for label in self.labels.iter() { - writeln!(f, "{label:?}")?; - } - write!(f, "apply to {} variables", self.var_type)?; - for dict_index in self.dict_indexes.iter() { - write!(f, " #{dict_index}")?; - } - Ok(()) - } -} - -impl ValueLabelRecord -where - V: Debug, - S: Debug, -{ - /// Maximum number of value labels in a record. - pub const MAX_LABELS: u32 = u32::MAX / 8; - - /// Maximum number of variable indexes in a record. - pub const MAX_INDEXES: u32 = u32::MAX / 8; -} - -impl ValueLabelRecord, RawString> { - fn read( - r: &mut R, - endian: Endian, - var_types: &VarTypes, - warn: &dyn Fn(Warning), - ) -> Result, Error> { - let label_offset = r.stream_position()?; - let n: u32 = endian.parse(read_bytes(r)?); - if n > Self::MAX_LABELS { - return Err(Error::BadNumberOfValueLabels { - offset: label_offset, - n, - max: Self::MAX_LABELS, - }); - } - - let mut labels = Vec::new(); - for _ in 0..n { - let value = UntypedValue(read_bytes(r)?); - let label_len: u8 = endian.parse(read_bytes(r)?); - let label_len = label_len as usize; - let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); - - let mut label = read_vec(r, padded_len - 1)?; - label.truncate(label_len); - labels.push((value, RawString(label))); - } - - let index_offset = r.stream_position()?; - let rec_type: u32 = endian.parse(read_bytes(r)?); - if rec_type != 4 { - return Err(Error::ExpectedVarIndexRecord { - offset: index_offset, - rec_type, - }); - } - - let n: u32 = endian.parse(read_bytes(r)?); - if n > Self::MAX_INDEXES { - return Err(Error::TooManyVarIndexes { - offset: index_offset, - n, - max: Self::MAX_INDEXES, - }); - } else if n == 0 { - warn(Warning::NoVarIndexes { - offset: index_offset, - }); - return Ok(None); - } - - let index_offset = r.stream_position()?; - let mut dict_indexes = Vec::with_capacity(n as usize); - let mut invalid_indexes = Vec::new(); - for _ in 0..n { - let index: u32 = endian.parse(read_bytes(r)?); - if var_types.is_valid_index(index as usize) { - dict_indexes.push(index); - } else { - invalid_indexes.push(index); - } - } - if !invalid_indexes.is_empty() { - warn(Warning::InvalidVarIndexes { - offset: index_offset, - max: var_types.n_values(), - invalid: invalid_indexes, - }); - } - - let Some(&first_index) = dict_indexes.first() else { - return Ok(None); - }; - let var_type = var_types.types[first_index as usize - 
1].unwrap(); - let mut wrong_type_indexes = Vec::new(); - dict_indexes.retain(|&index| { - if var_types.types[index as usize - 1] != Some(var_type) { - wrong_type_indexes.push(index); - false - } else { - true - } - }); - if !wrong_type_indexes.is_empty() { - warn(Warning::MixedVarTypes { - offset: index_offset, - var_type, - wrong_types: wrong_type_indexes, - }); - } - - let labels = labels - .into_iter() - .map(|(value, label)| ValueLabel { - value: Value::from_raw(&value, var_type, endian), - label, - }) - .collect(); - - let end_offset = r.stream_position()?; - Ok(Some(Record::ValueLabel(ValueLabelRecord { - offsets: label_offset..end_offset, - labels, - dict_indexes, - var_type, - }))) - } - - fn decode(self, decoder: &Decoder) -> ValueLabelRecord, String> { - let labels = self - .labels - .iter() - .map(|ValueLabel { value, label }| ValueLabel { - value: value.clone(), - label: decoder.decode(label).to_string(), - }) - .collect(); - ValueLabelRecord { - offsets: self.offsets.clone(), - labels, - dict_indexes: self.dict_indexes.clone(), - var_type: self.var_type, - } - } -} - -#[derive(Clone, Debug)] -pub struct DocumentRecord -where - S: Debug, -{ - pub offsets: Range, - - /// The document, as an array of lines. Raw lines are exactly 80 bytes long - /// and are right-padded with spaces without any new-line termination. - pub lines: Vec, -} - -pub type RawDocumentLine = RawStrArray; - -/// Length of a line in a document. Document lines are fixed-length and -/// padded on the right with spaces. -pub const DOC_LINE_LEN: usize = 80; - -impl DocumentRecord { - /// Maximum number of lines we will accept in a document. This is simply - /// the maximum number that will fit in a 32-bit space. - pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN; - - fn read(r: &mut R, endian: Endian) -> Result { - let start_offset = r.stream_position()?; - let n: u32 = endian.parse(read_bytes(r)?); - let n = n as usize; - if n > Self::MAX_LINES { - Err(Error::BadDocumentLength { - offset: start_offset, - n, - max: Self::MAX_LINES, - }) - } else { - let mut lines = Vec::with_capacity(n); - for _ in 0..n { - lines.push(RawStrArray(read_bytes(r)?)); - } - let end_offset = r.stream_position()?; - Ok(Record::Document(DocumentRecord { - offsets: start_offset..end_offset, - lines, - })) - } - } - - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { - DecodedRecord::Document(DocumentRecord { - offsets: self.offsets.clone(), - lines: self - .lines - .iter() - .map(|s| decoder.decode_slice(&s.0).to_string()) - .collect(), - }) - } -} - -trait ExtensionRecord { - const SUBTYPE: u32; - const SIZE: Option; - const COUNT: Option; - const NAME: &'static str; - fn parse(ext: &Extension, endian: Endian) -> Result; -} - -#[derive(Clone, Debug)] -pub struct IntegerInfoRecord { - pub offsets: Range, - pub version: (i32, i32, i32), - pub machine_code: i32, - pub floating_point_rep: i32, - pub compression_code: i32, - pub endianness: i32, - pub character_code: i32, -} - -impl ExtensionRecord for IntegerInfoRecord { - const SUBTYPE: u32 = 3; - const SIZE: Option = Some(4); - const COUNT: Option = Some(8); - const NAME: &'static str = "integer record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let data: Vec = (0..8) - .map(|_| endian.parse(read_bytes(&mut input).unwrap())) - .collect(); - Ok(Record::IntegerInfo(IntegerInfoRecord { - offsets: ext.offsets.clone(), - version: (data[0], data[1], data[2]), - machine_code: data[3], - 
floating_point_rep: data[4], - compression_code: data[5], - endianness: data[6], - character_code: data[7], - })) - } -} - -#[derive(Clone, Debug)] -pub struct FloatInfoRecord { - pub sysmis: f64, - pub highest: f64, - pub lowest: f64, -} - -impl ExtensionRecord for FloatInfoRecord { - const SUBTYPE: u32 = 4; - const SIZE: Option = Some(8); - const COUNT: Option = Some(3); - const NAME: &'static str = "floating point record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let data: Vec = (0..3) - .map(|_| endian.parse(read_bytes(&mut input).unwrap())) - .collect(); - Ok(Record::FloatInfo(FloatInfoRecord { - sysmis: data[0], - highest: data[1], - lowest: data[2], - })) - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum CategoryLabels { - VarLabels, - CountedValues, -} - -#[derive(Clone, Debug)] -pub enum MultipleResponseType { - MultipleDichotomy { - value: RawString, - labels: CategoryLabels, - }, - MultipleCategory, -} - -impl MultipleResponseType { - fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> { - let (mr_type, input) = match input.split_first() { - Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input), - Some((b'D', input)) => { - let (value, input) = parse_counted_string(input)?; - ( - MultipleResponseType::MultipleDichotomy { - value, - labels: CategoryLabels::VarLabels, - }, - input, - ) - } - Some((b'E', input)) => { - let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") { - (CategoryLabels::CountedValues, rest) - } else if let Some(rest) = input.strip_prefix(b" 11 ") { - (CategoryLabels::VarLabels, rest) - } else { - return Err(Warning::TBD); - }; - let (value, input) = parse_counted_string(input)?; - ( - MultipleResponseType::MultipleDichotomy { value, labels }, - input, - ) - } - _ => return Err(Warning::TBD), - }; - Ok((mr_type, input)) - } -} - -#[derive(Clone, Debug)] -pub struct MultipleResponseSet -where - I: Debug, - S: Debug, -{ - pub name: I, - pub label: S, - pub mr_type: MultipleResponseType, - pub short_names: Vec, -} - -impl MultipleResponseSet { - fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> { - let Some(equals) = input.iter().position(|&b| b == b'=') else { - return Err(Warning::TBD); - }; - let (name, input) = input.split_at(equals); - let (mr_type, input) = MultipleResponseType::parse(input)?; - let Some(input) = input.strip_prefix(b" ") else { - return Err(Warning::TBD); - }; - let (label, mut input) = parse_counted_string(input)?; - let mut vars = Vec::new(); - while input.first() != Some(&b'\n') { - match input.split_first() { - Some((b' ', rest)) => { - let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else { - return Err(Warning::TBD); - }; - let (var, rest) = rest.split_at(length); - if !var.is_empty() { - vars.push(var.into()); - } - input = rest; - } - _ => return Err(Warning::TBD), - } - } - while input.first() == Some(&b'\n') { - input = &input[1..]; - } - Ok(( - MultipleResponseSet { - name: name.into(), - label, - mr_type, - short_names: vars, - }, - input, - )) - } - - fn decode( - &self, - decoder: &Decoder, - ) -> Result, Warning> { - let mut short_names = Vec::with_capacity(self.short_names.len()); - for short_name in self.short_names.iter() { - if let Some(short_name) = decoder - .decode_identifier(short_name) - .map_err(Warning::InvalidMrSetName) - .issue_warning(&decoder.warn) - { - short_names.push(short_name); - } - } - Ok(MultipleResponseSet { - 
name: decoder - .decode_identifier(&self.name) - .map_err(Warning::InvalidMrSetVariableName)?, - label: decoder.decode(&self.label).to_string(), - mr_type: self.mr_type.clone(), - short_names, - }) - } -} - -#[derive(Clone, Debug)] -pub struct MultipleResponseRecord(pub Vec>) -where - I: Debug, - S: Debug; - -impl ExtensionRecord for MultipleResponseRecord { - const SUBTYPE: u32 = 7; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "multiple response set record"; - - fn parse(ext: &Extension, _endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let mut sets = Vec::new(); - while !input.is_empty() { - let (set, rest) = MultipleResponseSet::parse(input)?; - sets.push(set); - input = rest; - } - Ok(Record::MultipleResponse(MultipleResponseRecord(sets))) - } -} - -impl MultipleResponseRecord { - fn decode(self, decoder: &Decoder) -> DecodedRecord { - let mut sets = Vec::new(); - for set in self.0.iter() { - if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) { - sets.push(set); - } - } - DecodedRecord::MultipleResponse(MultipleResponseRecord(sets)) - } -} - -fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> { - let Some(space) = input.iter().position(|&b| b == b' ') else { - return Err(Warning::TBD); - }; - let Ok(length) = from_utf8(&input[..space]) else { - return Err(Warning::TBD); - }; - let Ok(length): Result = length.parse() else { - return Err(Warning::TBD); - }; - - let input = &input[space + 1..]; - if input.len() < length { - return Err(Warning::TBD); - }; - - let (string, rest) = input.split_at(length); - Ok((string.into(), rest)) -} - -/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement). -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum Measure { - /// Nominal values can only be compared for equality. - Nominal, - - /// Ordinal values can be meaningfully ordered. - Ordinal, - - /// Scale values can be meaningfully compared for the degree of difference. 
- Scale, -} - -impl Measure { - pub fn default_for_type(var_type: VarType) -> Option { - match var_type { - VarType::Numeric => None, - VarType::String => Some(Self::Nominal), - } - } - - fn try_decode(source: u32) -> Result, Warning> { - match source { - 0 => Ok(None), - 1 => Ok(Some(Measure::Nominal)), - 2 => Ok(Some(Measure::Ordinal)), - 3 => Ok(Some(Measure::Scale)), - _ => Err(Warning::InvalidMeasurement(source)), - } - } -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum Alignment { - Left, - Right, - Center, -} - -impl Alignment { - fn try_decode(source: u32) -> Result, Warning> { - match source { - 0 => Ok(None), - 1 => Ok(Some(Alignment::Left)), - 2 => Ok(Some(Alignment::Right)), - 3 => Ok(Some(Alignment::Center)), - _ => Err(Warning::InvalidAlignment(source)), - } - } - - pub fn default_for_type(var_type: VarType) -> Self { - match var_type { - VarType::Numeric => Self::Right, - VarType::String => Self::Left, - } - } -} - -#[derive(Clone, Debug)] -pub struct VarDisplay { - pub measure: Option, - pub width: Option, - pub alignment: Option, -} - -#[derive(Clone, Debug)] -pub struct VarDisplayRecord(pub Vec); - -impl VarDisplayRecord { - const SUBTYPE: u32 = 11; - - fn parse( - ext: &Extension, - n_vars: usize, - endian: Endian, - warn: &dyn Fn(Warning), - ) -> Result { - if ext.size != 4 { - return Err(Warning::BadRecordSize { - offset: ext.offsets.start, - record: String::from("variable display record"), - size: ext.size, - expected_size: 4, - }); - } - - let has_width = if ext.count as usize == 3 * n_vars { - true - } else if ext.count as usize == 2 * n_vars { - false - } else { - return Err(Warning::TBD); - }; - - let mut var_displays = Vec::new(); - let mut input = &ext.data[..]; - for _ in 0..n_vars { - let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .issue_warning(&warn) - .flatten(); - let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap())); - let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap())) - .issue_warning(&warn) - .flatten(); - var_displays.push(VarDisplay { - measure, - width, - alignment, - }); - } - Ok(Record::VarDisplay(VarDisplayRecord(var_displays))) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringMissingValues -where - N: Debug, -{ - /// Variable name. - pub var_name: N, - - /// Missing values. 
- pub missing_values: Vec>, -} - -impl LongStringMissingValues { - fn decode(&self, decoder: &Decoder) -> Result, IdError> { - Ok(LongStringMissingValues { - var_name: decoder.decode_identifier(&self.var_name)?, - missing_values: self.missing_values.clone(), - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringMissingValueRecord(pub Vec>) -where - N: Debug; - -impl ExtensionRecord for LongStringMissingValueRecord { - const SUBTYPE: u32 = 22; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "long string missing values record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let mut missing_value_set = Vec::new(); - while !input.is_empty() { - let var_name = read_string(&mut input, endian)?; - let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?); - let value_len: u32 = endian.parse(read_bytes(&mut input)?); - if value_len != 8 { - let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start; - return Err(Warning::BadLongMissingValueLength { - record_offset: ext.offsets.start, - offset, - value_len, - }); - } - let mut missing_values = Vec::new(); - for i in 0..n_missing_values { - let value: [u8; 8] = read_bytes(&mut input)?; - let numeric_value: u64 = endian.parse(value); - let value = if i > 0 && numeric_value == 8 { - // Tolerate files written by old, buggy versions of PSPP - // where we believed that the value_length was repeated - // before each missing value. - read_bytes(&mut input)? - } else { - value - }; - missing_values.push(RawStrArray(value)); - } - missing_value_set.push(LongStringMissingValues { - var_name, - missing_values, - }); - } - Ok(Record::LongStringMissingValues( - LongStringMissingValueRecord(missing_value_set), - )) - } -} - -impl LongStringMissingValueRecord { - pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord { - let mut mvs = Vec::with_capacity(self.0.len()); - for mv in self.0.iter() { - if let Some(mv) = mv - .decode(decoder) - .map_err(Warning::InvalidLongStringMissingValueVariableName) - .issue_warning(&decoder.warn) - { - mvs.push(mv); - } - } - LongStringMissingValueRecord(mvs) - } -} - -#[derive(Clone, Debug)] -pub struct EncodingRecord(pub String); - -impl ExtensionRecord for EncodingRecord { - const SUBTYPE: u32 = 20; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "encoding record"; - - fn parse(ext: &Extension, _endian: Endian) -> Result { - ext.check_size::()?; - - Ok(Record::Encoding(EncodingRecord( - String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName { - offset: ext.offsets.start, - })?, - ))) - } -} - -#[derive(Clone, Debug)] -pub struct NumberOfCasesRecord { - /// Always observed as 1. - pub one: u64, - - /// Number of cases. - pub n_cases: u64, -} - -impl ExtensionRecord for NumberOfCasesRecord { - const SUBTYPE: u32 = 16; - const SIZE: Option = Some(8); - const COUNT: Option = Some(2); - const NAME: &'static str = "extended number of cases record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let one = endian.parse(read_bytes(&mut input)?); - let n_cases = endian.parse(read_bytes(&mut input)?); - - Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases })) - } -} - -#[derive(Clone, Debug)] -pub struct TextRecord { - pub offsets: Range, - - /// Type of record. - pub rec_type: TextRecordType, - - /// The text content of the record. 
- pub text: RawString, -} - -#[derive(Clone, Copy, Debug)] -pub enum TextRecordType { - VariableSets, - ProductInfo, - LongNames, - VeryLongStrings, - FileAttributes, - VariableAttributes, -} - -impl TextRecord { - fn new(extension: Extension, rec_type: TextRecordType) -> Self { - Self { - offsets: extension.offsets, - rec_type, - text: extension.data.into(), - } - } - pub fn decode(self, decoder: &Decoder) -> DecodedRecord { - match self.rec_type { - TextRecordType::VariableSets => { - DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder)) - } - TextRecordType::ProductInfo => { - DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder)) - } - TextRecordType::LongNames => { - DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder)) - } - TextRecordType::VeryLongStrings => { - DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder)) - } - TextRecordType::FileAttributes => { - DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder)) - } - TextRecordType::VariableAttributes => { - DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder)) - } - } - } -} - -#[derive(Clone, Debug)] -pub struct VeryLongString { - pub short_name: Identifier, - pub length: u16, -} - -impl VeryLongString { - fn parse(decoder: &Decoder, input: &str) -> Result { - let Some((short_name, length)) = input.split_once('=') else { - return Err(Warning::TBD); - }; - let short_name = decoder - .new_identifier(short_name) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidLongStringName)?; - let length = length.parse().map_err(|_| Warning::TBD)?; - Ok(VeryLongString { short_name, length }) - } -} - -#[derive(Clone, Debug)] -pub struct VeryLongStringsRecord(pub Vec); - -impl VeryLongStringsRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - let input = decoder.decode(&source.text); - let mut very_long_strings = Vec::new(); - for tuple in input - .split('\0') - .map(|s| s.trim_end_matches('\t')) - .filter(|s| !s.is_empty()) - { - if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) { - very_long_strings.push(vls) - } - } - VeryLongStringsRecord(very_long_strings) - } -} - -#[derive(Clone, Debug)] -pub struct Attribute { - pub name: Identifier, - pub values: Vec, -} - -impl Attribute { - fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> { - let Some((name, mut input)) = input.split_once('(') else { - return Err(Warning::TBD); - }; - let name = decoder - .new_identifier(name) - .map_err(Warning::InvalidAttributeName)?; - let mut values = Vec::new(); - loop { - let Some((value, rest)) = input.split_once('\n') else { - return Err(Warning::TBD); - }; - if let Some(stripped) = value - .strip_prefix('\'') - .and_then(|value| value.strip_suffix('\'')) - { - values.push(stripped.into()); - } else { - decoder.warn(Warning::TBD); - values.push(value.into()); - } - if let Some(rest) = rest.strip_prefix(')') { - let attribute = Attribute { name, values }; - return Ok((attribute, rest)); - }; - input = rest; - } - } -} - -impl Attributes { - fn parse<'a>( - decoder: &Decoder, - mut input: &'a str, - sentinel: Option, - ) -> Result<(Attributes, &'a str), Warning> { - let mut attributes = HashMap::new(); - let rest = loop { - match input.chars().next() { - None => break input, - c if c == sentinel => break &input[1..], - _ => { - let (attribute, rest) = Attribute::parse(decoder, input)?; - // XXX report duplicate name - 
attributes.insert(attribute.name, attribute.values); - input = rest; - } - } - }; - Ok((Attributes(attributes), rest)) - } -} - -#[derive(Clone, Debug, Default)] -pub struct FileAttributeRecord(pub Attributes); - -impl FileAttributeRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - let input = decoder.decode(&source.text); - match Attributes::parse(decoder, &input, None).issue_warning(&decoder.warn) { - Some((set, rest)) => { - if !rest.is_empty() { - decoder.warn(Warning::TBD); - } - FileAttributeRecord(set) - } - None => FileAttributeRecord::default(), - } - } -} - -#[derive(Clone, Debug)] -pub struct VarAttributes { - pub long_var_name: Identifier, - pub attributes: Attributes, -} - -impl VarAttributes { - fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributes, &'a str), Warning> { - let Some((long_var_name, rest)) = input.split_once(':') else { - return Err(Warning::TBD); - }; - let long_var_name = decoder - .new_identifier(long_var_name) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidAttributeVariableName)?; - let (attributes, rest) = Attributes::parse(decoder, rest, Some('/'))?; - let var_attribute = VarAttributes { - long_var_name, - attributes, - }; - Ok((var_attribute, rest)) - } -} - -#[derive(Clone, Debug)] -pub struct VariableAttributeRecord(pub Vec); - -impl VariableAttributeRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - let decoded = decoder.decode(&source.text); - let mut input = decoded.as_ref(); - let mut var_attribute_sets = Vec::new(); - while !input.is_empty() { - let Some((var_attribute, rest)) = - VarAttributes::parse(decoder, input).issue_warning(&decoder.warn) - else { - break; - }; - var_attribute_sets.push(var_attribute); - input = rest; - } - VariableAttributeRecord(var_attribute_sets) - } -} - -#[derive(Clone, Debug)] -pub struct LongName { - pub short_name: Identifier, - pub long_name: Identifier, -} - -impl LongName { - fn parse(input: &str, decoder: &Decoder) -> Result { - let Some((short_name, long_name)) = input.split_once('=') else { - return Err(Warning::TBD); - }; - let short_name = decoder - .new_identifier(short_name) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidShortName)?; - let long_name = decoder - .new_identifier(long_name) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidLongName)?; - Ok(LongName { - short_name, - long_name, - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongNamesRecord(pub Vec); - -impl LongNamesRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - let input = decoder.decode(&source.text); - let mut names = Vec::new(); - for pair in input.split('\t').filter(|s| !s.is_empty()) { - if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) { - names.push(long_name); - } - } - LongNamesRecord(names) - } -} - -#[derive(Clone, Debug)] -pub struct ProductInfoRecord(pub String); - -impl ProductInfoRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Self { - Self(decoder.decode(&source.text).into()) - } -} -#[derive(Clone, Debug)] -pub struct VariableSet { - pub name: Identifier, - pub variable_names: Vec, -} - -impl VariableSet { - fn parse(input: &str, decoder: &Decoder) -> Result { - let (name, input) = input.split_once('=').ok_or(Warning::TBD)?; - let name = decoder.new_identifier(name).map_err(|_| Warning::TBD)?; - let mut vars = Vec::new(); - for var in input.split_ascii_whitespace() { - if let Some(identifier) = decoder - 
.new_identifier(var) - .and_then(Identifier::must_be_ordinary) - .map_err(Warning::InvalidVariableSetName) - .issue_warning(&decoder.warn) - { - vars.push(identifier); - } - } - Ok(VariableSet { - name, - variable_names: vars, - }) - } -} - -#[derive(Clone, Debug)] -pub struct VariableSetRecord { - pub offsets: Range, - pub sets: Vec, -} - -impl VariableSetRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord { - let mut sets = Vec::new(); - let input = decoder.decode(&source.text); - for line in input.lines() { - if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) { - sets.push(set) - } - } - VariableSetRecord { - offsets: source.offsets.clone(), - sets, - } - } -} - -trait IssueWarning { - fn issue_warning(self, warn: &F) -> Option - where - F: Fn(Warning); -} -impl IssueWarning for Result { - fn issue_warning(self, warn: &F) -> Option - where - F: Fn(Warning), - { - match self { - Ok(result) => Some(result), - Err(error) => { - warn(error); - None - } - } - } -} - -#[derive(Clone, Debug)] -pub struct Extension { - pub offsets: Range, - - /// Record subtype. - pub subtype: u32, - - /// Size of each data element. - pub size: u32, - - /// Number of data elements. - pub count: u32, - - /// `size * count` bytes of data. - pub data: Vec, -} - -impl Extension { - fn check_size(&self) -> Result<(), Warning> { - if let Some(expected_size) = E::SIZE { - if self.size != expected_size { - return Err(Warning::BadRecordSize { - offset: self.offsets.start, - record: E::NAME.into(), - size: self.size, - expected_size, - }); - } - } - if let Some(expected_count) = E::COUNT { - if self.count != expected_count { - return Err(Warning::BadRecordCount { - offset: self.offsets.start, - record: E::NAME.into(), - count: self.count, - expected_count, - }); - } - } - Ok(()) - } - - fn read( - r: &mut R, - endian: Endian, - n_vars: usize, - warn: &dyn Fn(Warning), - ) -> Result, Error> { - let subtype = endian.parse(read_bytes(r)?); - let header_offset = r.stream_position()?; - let size: u32 = endian.parse(read_bytes(r)?); - let count = endian.parse(read_bytes(r)?); - let Some(product) = size.checked_mul(count) else { - return Err(Error::ExtensionRecordTooLarge { - offset: header_offset, - subtype, - size, - count, - }); - }; - let start_offset = r.stream_position()?; - let data = read_vec(r, product as usize)?; - let end_offset = start_offset + product as u64; - let extension = Extension { - offsets: start_offset..end_offset, - subtype, - size, - count, - data, - }; - let result = match subtype { - IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian), - FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian), - VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn), - MultipleResponseRecord::SUBTYPE | 19 => { - MultipleResponseRecord::parse(&extension, endian) - } - LongStringValueLabelRecord::SUBTYPE => { - LongStringValueLabelRecord::parse(&extension, endian) - } - EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian), - NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian), - 5 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::VariableSets, - ))), - 10 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::ProductInfo, - ))), - 13 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::LongNames, - ))), - 14 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::VeryLongStrings, - ))), - 17 => 
Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::FileAttributes, - ))), - 18 => Ok(Record::Text(TextRecord::new( - extension, - TextRecordType::VariableAttributes, - ))), - _ => Ok(Record::OtherExtension(extension)), - }; - match result { - Ok(result) => Ok(Some(result)), - Err(error) => { - warn(error); - Ok(None) - } - } - } -} - -#[derive(Clone, Debug)] -pub struct ZHeader { - /// File offset to the start of the record. - pub offset: u64, - - /// File offset to the ZLIB data header. - pub zheader_offset: u64, - - /// File offset to the ZLIB trailer. - pub ztrailer_offset: u64, - - /// Length of the ZLIB trailer in bytes. - pub ztrailer_len: u64, -} - -impl ZHeader { - fn read(r: &mut R, endian: Endian) -> Result { - let offset = r.stream_position()?; - let zheader_offset: u64 = endian.parse(read_bytes(r)?); - let ztrailer_offset: u64 = endian.parse(read_bytes(r)?); - let ztrailer_len: u64 = endian.parse(read_bytes(r)?); - - Ok(ZHeader { - offset, - zheader_offset, - ztrailer_offset, - ztrailer_len, - }) - } -} - -#[derive(Clone, Debug)] -pub struct ZTrailer { - /// File offset to the start of the record. - pub offset: u64, - - /// Compression bias as a negative integer, e.g. -100. - pub int_bias: i64, - - /// Always observed as zero. - pub zero: u64, - - /// Uncompressed size of each block, except possibly the last. Only - /// `0x3ff000` has been observed so far. - pub block_size: u32, - - /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them. - pub blocks: Vec, -} - -#[derive(Clone, Debug)] -pub struct ZBlock { - /// Offset of block of data if simple compression were used. - pub uncompressed_ofs: u64, - - /// Actual offset within the file of the compressed data block. - pub compressed_ofs: u64, - - /// The number of bytes in this data block after decompression. This is - /// `block_size` in every data block but the last, which may be smaller. - pub uncompressed_size: u32, - - /// The number of bytes in this data block, as stored compressed in this - /// file. 
- pub compressed_size: u32, -} - -impl ZBlock { - fn read(r: &mut R, endian: Endian) -> Result { - Ok(ZBlock { - uncompressed_ofs: endian.parse(read_bytes(r)?), - compressed_ofs: endian.parse(read_bytes(r)?), - uncompressed_size: endian.parse(read_bytes(r)?), - compressed_size: endian.parse(read_bytes(r)?), - }) - } -} - -impl ZTrailer { - fn read( - reader: &mut R, - endian: Endian, - ztrailer_ofs: u64, - ztrailer_len: u64, - ) -> Result, Error> { - let start_offset = reader.stream_position()?; - if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() { - return Ok(None); - } - let int_bias = endian.parse(read_bytes(reader)?); - let zero = endian.parse(read_bytes(reader)?); - let block_size = endian.parse(read_bytes(reader)?); - let n_blocks: u32 = endian.parse(read_bytes(reader)?); - let expected_n_blocks = (ztrailer_len - 24) / 24; - if n_blocks as u64 != expected_n_blocks { - return Err(Error::BadZlibTrailerNBlocks { - offset: ztrailer_ofs, - n_blocks, - expected_n_blocks, - ztrailer_len, - }); - } - let blocks = (0..n_blocks) - .map(|_| ZBlock::read(reader, endian)) - .collect::, _>>()?; - reader.seek(SeekFrom::Start(start_offset))?; - Ok(Some(ZTrailer { - offset: ztrailer_ofs, - int_bias, - zero, - block_size, - blocks, - })) - } -} - -fn try_read_bytes(r: &mut R) -> Result, IoError> { - let mut buf = [0; N]; - let n = r.read(&mut buf)?; - if n > 0 { - if n < N { - r.read_exact(&mut buf[n..])?; - } - Ok(Some(buf)) - } else { - Ok(None) - } -} - -fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { - let mut buf = [0; N]; - r.read_exact(&mut buf)?; - Ok(buf) -} - -fn read_vec(r: &mut R, n: usize) -> Result, IoError> { - let mut vec = vec![0; n]; - r.read_exact(&mut vec)?; - Ok(vec) -} - -fn read_string(r: &mut R, endian: Endian) -> Result { - let length: u32 = endian.parse(read_bytes(r)?); - Ok(read_vec(r, length as usize)?.into()) -} - -#[derive(Clone, Debug)] -pub struct LongStringValueLabels -where - S: Debug, -{ - pub var_name: N, - pub width: u32, - - /// `(value, label)` pairs, where each value is `width` bytes. 
- pub labels: Vec<(RawString, S)>, -} - -impl LongStringValueLabels { - fn decode( - &self, - decoder: &Decoder, - ) -> Result, Warning> { - let var_name = decoder.decode(&self.var_name); - let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding) - .map_err(Warning::InvalidLongStringValueLabelName)?; - - let mut labels = Vec::with_capacity(self.labels.len()); - for (value, label) in self.labels.iter() { - let label = decoder.decode(label).to_string(); - labels.push((value.clone(), label)); - } - - Ok(LongStringValueLabels { - var_name, - width: self.width, - labels, - }) - } -} - -#[derive(Clone, Debug)] -pub struct LongStringValueLabelRecord(pub Vec>) -where - N: Debug, - S: Debug; - -impl ExtensionRecord for LongStringValueLabelRecord { - const SUBTYPE: u32 = 21; - const SIZE: Option = Some(1); - const COUNT: Option = None; - const NAME: &'static str = "long string value labels record"; - - fn parse(ext: &Extension, endian: Endian) -> Result { - ext.check_size::()?; - - let mut input = &ext.data[..]; - let mut label_set = Vec::new(); - while !input.is_empty() { - let var_name = read_string(&mut input, endian)?; - let width: u32 = endian.parse(read_bytes(&mut input)?); - let n_labels: u32 = endian.parse(read_bytes(&mut input)?); - let mut labels = Vec::new(); - for _ in 0..n_labels { - let value = read_string(&mut input, endian)?; - let label = read_string(&mut input, endian)?; - labels.push((value, label)); - } - label_set.push(LongStringValueLabels { - var_name, - width, - labels, - }) - } - Ok(Record::LongStringValueLabels(LongStringValueLabelRecord( - label_set, - ))) - } -} - -impl LongStringValueLabelRecord { - fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord { - let mut labels = Vec::with_capacity(self.0.len()); - for label in &self.0 { - match label.decode(decoder) { - Ok(set) => labels.push(set), - Err(error) => decoder.warn(error), - } - } - LongStringValueLabelRecord(labels) - } -} - -#[derive(Default)] -pub struct VarTypes { - pub types: Vec>, -} - -impl VarTypes { - pub fn new() -> Self { - Self::default() - } - - pub fn push(&mut self, width: RawWidth) { - if let Ok(var_type) = VarType::try_from(width) { - self.types.push(Some(var_type)); - for _ in 1..width.n_values().unwrap() { - self.types.push(None); - } - } - } - - pub fn n_values(&self) -> usize { - self.types.len() - } - - pub fn is_valid_index(&self, index: usize) -> bool { - self.var_type_at(index).is_some() - } - - pub fn var_type_at(&self, index: usize) -> Option { - if index >= 1 && index <= self.types.len() { - self.types[index - 1] - } else { - None - } - } - - pub fn iter(&self) -> impl Iterator + use<'_> { - self.types - .iter() - .map(|var_type| var_type.unwrap_or(VarType::String)) - } -} diff --git a/rust/pspp/src/sack.rs b/rust/pspp/src/sack.rs deleted file mode 100644 index 8eec1eb1fd..0000000000 --- a/rust/pspp/src/sack.rs +++ /dev/null @@ -1,633 +0,0 @@ -use float_next_after::NextAfter; -use num::{Bounded, Zero}; -use ordered_float::OrderedFloat; -use std::{ - collections::{hash_map::Entry, HashMap}, - error::Error as StdError, - fmt::{Display, Formatter, Result as FmtResult}, - iter::repeat_n, -}; - -use crate::endian::{Endian, ToBytes}; - -pub type Result = std::result::Result; - -#[derive(Debug)] -pub struct Error { - pub file_name: Option, - pub line_number: Option, - pub token: Option, - pub message: String, -} - -impl Error { - fn new( - file_name: Option<&str>, - line_number: Option, - token: Option<&str>, - message: String, - ) -> Error { - Error { - 
file_name: file_name.map(String::from), - line_number, - token: token.map(String::from), - message, - } - } -} - -impl StdError for Error {} - -impl Display for Error { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match (self.file_name.as_ref(), self.line_number) { - (Some(ref file_name), Some(line_number)) => write!(f, "{file_name}:{line_number}: ")?, - (Some(ref file_name), None) => write!(f, "{file_name}: ")?, - (None, Some(line_number)) => write!(f, "line {line_number}: ")?, - (None, None) => (), - } - if let Some(ref token) = self.token { - write!(f, "at '{token}': ")?; - } - write!(f, "{}", self.message) - } -} - -pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result> { - let mut symbol_table = HashMap::new(); - let output = _sack(input, input_file_name, endian, &mut symbol_table)?; - let output = if !symbol_table.is_empty() { - for (k, v) in symbol_table.iter() { - println!("{k} => {v:?}"); - } - for (k, v) in symbol_table.iter() { - if v.is_none() { - Err(Error::new( - input_file_name, - None, - None, - format!("label {k} used but never defined"), - ))? - } - } - _sack(input, input_file_name, endian, &mut symbol_table)? - } else { - output - }; - Ok(output) -} - -fn _sack( - input: &str, - input_file_name: Option<&str>, - endian: Endian, - symbol_table: &mut HashMap>, -) -> Result> { - let mut lexer = Lexer::new(input, input_file_name, endian)?; - let mut output = Vec::new(); - while parse_data_item(&mut lexer, &mut output, symbol_table)? {} - Ok(output) -} - -fn parse_data_item( - lexer: &mut Lexer, - output: &mut Vec, - symbol_table: &mut HashMap>, -) -> Result { - if lexer.token.is_none() { - return Ok(false); - }; - - let initial_len = output.len(); - match lexer.take()? { - Token::Integer(integer) => { - if let Ok(integer) = TryInto::::try_into(integer) { - output.extend_from_slice(&lexer.endian.to_bytes(integer)); - } else if let Ok(integer) = TryInto::::try_into(integer) { - output.extend_from_slice(&lexer.endian.to_bytes(integer)); - } else { - Err(lexer.error(format!( - "{integer} is not in the valid range [{},{}]", - i32::MIN, - u32::MAX - )))?; - }; - } - Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)), - Token::PcSysmis => { - output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff]) - } - Token::I8 => put_integers::(lexer, "i8", output)?, - Token::I16 => put_integers::(lexer, "i16", output)?, - Token::I64 => put_integers::(lexer, "i64", output)?, - Token::String(string) => output.extend_from_slice(string.as_bytes()), - Token::S(size) => { - let Some((Token::String(ref string), _)) = lexer.token else { - Err(lexer.error(format!("string expected after 's{size}'")))? - }; - let len = string.len(); - if len > size { - Err(lexer.error(format!( - "{len}-byte string is longer than pad length {size}" - )))? - } - output.extend_from_slice(string.as_bytes()); - output.extend(repeat_n(b' ', size - len)); - lexer.get()?; - } - Token::LParen => { - while !matches!(lexer.token, Some((Token::RParen, _))) { - parse_data_item(lexer, output, symbol_table)?; - } - lexer.get()?; - } - Token::Count => put_counted_items::(lexer, "COUNT", output, symbol_table)?, - Token::Count8 => put_counted_items::(lexer, "COUNT8", output, symbol_table)?, - Token::Hex => { - let Some((Token::String(ref string), _)) = lexer.token else { - Err(lexer.error(String::from("string expected after 'hex'")))? 
- }; - let mut string = &string[..]; - loop { - string = string.trim_start(); - if string.is_empty() { - break; - }; - - let mut i = string.chars(); - let Some(c0) = i.next() else { return Ok(true) }; - let Some(c1) = i.next() else { - Err(lexer.error(String::from("hex string has odd number of characters")))? - }; - - let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else { - Err(lexer.error(String::from("invalid digit in hex string")))? - }; - let byte = digit0 * 16 + digit1; - output.push(byte as u8); - - string = i.as_str(); - } - lexer.get()?; - } - Token::Label(name) => { - println!("define {name}"); - let value = output.len() as u32; - match symbol_table.entry(name.clone()) { - Entry::Vacant(v) => { - v.insert(Some(value)); - } - Entry::Occupied(mut o) => { - match o.get() { - Some(v) => { - if *v != value { - Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))? - } - } - None => drop(o.insert(Some(value))), - } - } - }; - return Ok(true); - } - Token::At(name) => { - let mut value = *symbol_table.entry(name.clone()).or_insert(None); - loop { - let plus = match lexer.token { - Some((Token::Plus, _)) => true, - Some((Token::Minus, _)) => false, - _ => break, - }; - lexer.get()?; - - let operand = match lexer.token { - Some((Token::At(ref name), _)) => { - *symbol_table.entry(name.clone()).or_insert(None) - } - Some((Token::Integer(integer), _)) => Some( - integer - .try_into() - .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?, - ), - _ => Err(lexer.error(String::from("expecting @label or integer literal")))?, - }; - lexer.get()?; - - value = match (value, operand) { - (Some(a), Some(b)) => Some( - if plus { - a.checked_add(b) - } else { - a.checked_sub(b) - } - .ok_or_else(|| { - lexer.error(String::from("overflow in offset arithmetic")) - })?, - ), - _ => None, - }; - } - let value = value.unwrap_or(0); - output.extend_from_slice(&lexer.endian.to_bytes(value)); - } - _ => (), - }; - if let Some((Token::Asterisk, _)) = lexer.token { - lexer.get()?; - let Token::Integer(count) = lexer.take()? else { - Err(lexer.error(String::from("positive integer expected after '*'")))? - }; - if count < 1 { - Err(lexer.error(String::from("positive integer expected after '*'")))? - }; - let final_len = output.len(); - for _ in 1..count { - output.extend_from_within(initial_len..final_len); - } - } - match lexer.token { - Some((Token::Semicolon, _)) => { - lexer.get()?; - } - Some((Token::RParen, _)) => (), - _ => Err(lexer.error(String::from("';' expected")))?, - } - Ok(true) -} - -fn put_counted_items( - lexer: &mut Lexer, - name: &str, - output: &mut Vec, - symbol_table: &mut HashMap>, -) -> Result<()> -where - T: Zero + TryFrom, - Endian: ToBytes, -{ - let old_size = output.len(); - output.extend_from_slice(&lexer.endian.to_bytes(T::zero())); - let start = output.len(); - if !matches!(lexer.token, Some((Token::LParen, _))) { - Err(lexer.error(format!("'(' expected after '{name}'")))? - } - lexer.get()?; - while !matches!(lexer.token, Some((Token::RParen, _))) { - parse_data_item(lexer, output, symbol_table)?; - } - lexer.get()?; - let delta = output.len() - start; - let Ok(delta): Result = delta.try_into() else { - Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))? 
- }; - let dest = &mut output[old_size..old_size + N]; - dest.copy_from_slice(&lexer.endian.to_bytes(delta)); - Ok(()) -} - -fn put_integers( - lexer: &mut Lexer, - name: &str, - output: &mut Vec, -) -> Result<()> -where - T: Bounded + Display + TryFrom + Copy, - Endian: ToBytes, -{ - println!("put_integers {:?}", lexer.token); - let mut n = 0; - while let Some(integer) = lexer.take_if(|t| match t { - Token::Integer(integer) => Some(*integer), - _ => None, - })? { - println!("got integer {integer}"); - let Ok(integer) = integer.try_into() else { - Err(lexer.error(format!( - "{integer} is not in the valid range [{},{}]", - T::min_value(), - T::max_value() - )))? - }; - output.extend_from_slice(&lexer.endian.to_bytes(integer)); - n += 1; - } - println!("put_integers {:?} {n}", lexer.token); - if n == 0 { - Err(lexer.error(format!("integer expected after '{name}'")))? - } - Ok(()) -} - -#[derive(PartialEq, Eq, Clone, Debug)] -enum Token { - Integer(i64), - Float(OrderedFloat), - PcSysmis, - String(String), - Semicolon, - Asterisk, - LParen, - RParen, - I8, - I16, - I64, - S(usize), - Count, - Count8, - Hex, - Label(String), - At(String), - Minus, - Plus, -} - -struct Lexer<'a> { - input: &'a str, - token: Option<(Token, &'a str)>, - input_file_name: Option<&'a str>, - line_number: usize, - endian: Endian, -} - -fn skip_comments(mut s: &str) -> (&str, usize) { - let mut n_newlines = 0; - let s = loop { - s = s.trim_start_matches([' ', '\t', '\r', '<', '>']); - if let Some(remainder) = s.strip_prefix('#') { - let Some((_, remainder)) = remainder.split_once('\n') else { - break ""; - }; - s = remainder; - n_newlines += 1; - } else if let Some(remainder) = s.strip_prefix('\n') { - s = remainder; - n_newlines += 1; - } else { - break s; - } - }; - (s, n_newlines) -} - -impl<'a> Lexer<'a> { - fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result> { - let mut lexer = Lexer { - input, - token: None, - input_file_name, - line_number: 1, - endian, - }; - lexer.token = lexer.next()?; - Ok(lexer) - } - fn error(&self, message: String) -> Error { - let repr = self.token.as_ref().map(|(_, repr)| *repr); - Error::new(self.input_file_name, Some(self.line_number), repr, message) - } - fn take(&mut self) -> Result { - let Some(token) = self.token.take() else { - Err(self.error(String::from("unexpected end of input")))? - }; - self.token = self.next()?; - Ok(token.0) - } - fn take_if(&mut self, condition: F) -> Result> - where - F: FnOnce(&Token) -> Option, - { - let Some(ref token) = self.token else { - return Ok(None); - }; - match condition(&token.0) { - Some(value) => { - self.token = self.next()?; - Ok(Some(value)) - } - None => Ok(None), - } - } - fn get(&mut self) -> Result> { - if self.token.is_none() { - Err(self.error(String::from("unexpected end of input")))? - } else { - self.token = self.next()?; - match self.token { - Some((ref token, _)) => Ok(Some(token)), - None => Ok(None), - } - } - } - - fn next(&mut self) -> Result> { - // Get the first character of the token, skipping past white space and - // comments. - let (s, n_newlines) = skip_comments(self.input); - self.line_number += n_newlines; - self.input = s; - - let start = s; - let mut iter = s.chars(); - let Some(c) = iter.next() else { - return Ok(None); - }; - let (token, rest) = match c { - c if c.is_ascii_digit() || c == '-' => { - let len = s - .find(|c: char| { - !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' 
|| c == '-') - }) - .unwrap_or(s.len()); - let (number, rest) = s.split_at(len); - let token = if number == "-" { - Token::Minus - } else if let Some(digits) = number.strip_prefix("0x") { - Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| { - self.error(format!("bad integer literal '{number}' ({msg})")) - })?) - } else if !number.contains('.') { - Token::Integer(number.parse().map_err(|msg| { - self.error(format!("bad integer literal '{number}' ({msg})")) - })?) - } else { - Token::Float(number.parse().map_err(|msg| { - self.error(format!("bad float literal '{number}' ({msg})")) - })?) - }; - (token, rest) - } - '"' => { - let s = iter.as_str(); - let Some(len) = s.find(['\n', '"']) else { - Err(self.error(String::from("end-of-file inside string")))? - }; - let (string, rest) = s.split_at(len); - let Some(rest) = rest.strip_prefix('"') else { - Err(self.error(format!("new-line inside string ({string}...{rest})")))? - }; - (Token::String(string.into()), rest) - } - ';' => (Token::Semicolon, iter.as_str()), - '*' => (Token::Asterisk, iter.as_str()), - '+' => (Token::Plus, iter.as_str()), - '(' => (Token::LParen, iter.as_str()), - ')' => (Token::RParen, iter.as_str()), - c if c.is_alphabetic() || c == '@' || c == '_' => { - let len = s - .find(|c: char| { - !(c.is_ascii_digit() - || c.is_alphabetic() - || c == '@' - || c == '.' - || c == '_') - }) - .unwrap_or(s.len()); - let (s, rest) = s.split_at(len); - if let Some(rest) = rest.strip_prefix(':') { - (Token::Label(s.into()), rest) - } else if let Some(name) = s.strip_prefix('@') { - (Token::At(name.into()), rest) - } else if let Some(count) = s.strip_prefix('s') { - let token = - Token::S(count.parse().map_err(|msg| { - self.error(format!("bad counted string '{s}' ({msg})")) - })?); - (token, rest) - } else { - let token = match s { - "i8" => Token::I8, - "i16" => Token::I16, - "i64" => Token::I64, - "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)), - "PCSYSMIS" => Token::PcSysmis, - "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()), - "HIGHEST" => Token::Float(f64::MAX.into()), - "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }), - "COUNT" => Token::Count, - "COUNT8" => Token::Count8, - "hex" => Token::Hex, - _ => Err(self.error(format!("invalid token '{s}'")))?, - }; - (token, rest) - } - } - _ => Err(self.error(format!("invalid input byte '{c}'")))?, - }; - self.input = rest; - let repr = &start[..start.len() - rest.len()]; - println!("{token:?} {repr}"); - Ok(Some((token, repr))) - } -} - -#[cfg(test)] -mod test { - use crate::endian::Endian; - use crate::sack::sack; - use anyhow::Result; - use hexplay::HexView; - - #[test] - fn basic_sack() -> Result<()> { - let input = r#" -"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; -2; # Layout code -28; # Nominal case size -0; # Not compressed -0; # Not weighted -1; # 1 case. -100.0; # Bias. -"01 Jan 11"; "20:53:52"; -"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; -i8 0 *3; -"#; - let output = sack(input, None, Endian::Big)?; - HexView::new(&output).print()?; - Ok(()) - } - - #[test] - fn pcp_sack() -> Result<()> { - let input = r#" -# File header. -2; 0; -@MAIN; @MAIN_END - @MAIN; -@VARS; @VARS_END - @VARS; -@LABELS; @LABELS_END - @LABELS; -@DATA; @DATA_END - @DATA; -(0; 0) * 11; -i8 0 * 128; - -MAIN: - i16 1; # Fixed. - s62 "PCSPSS PSPP synthetic test product"; - PCSYSMIS; - 0; 0; i16 1; # Fixed. - i16 0; - i16 15; - 1; - i16 0; # Fixed. 
- 1; - s8 "11/28/14"; - s8 "15:11:00"; - s64 "PSPP synthetic test file"; -MAIN_END: - -VARS: - 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; - 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; - 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; - - # Numeric variable, no label or missing values. - 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; - - # Numeric variable, variable label. - 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS; - - # Numeric variable with missing value. - 0; 0; 0; 0x050800; s8 "NUM3"; 1.0; - - # Numeric variable, variable label and missing value. - 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0; - - # String variable, no label or missing values. - 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS; - - # String variable, variable label. - 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS; - - # String variable with missing value. - 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS"; - - # String variable, variable label and missing value. - 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR"; - - # Long string variable - 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS; - 0 * 8; - - # Long string variable with variable label - 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS; - 0 * 8; -VARS_END: - -LABELS: - 3; i8 0 0 0; LABELS_OFS: i8 0; - NUM2_LABEL: COUNT8("Numeric variable 2's label"); - NUM4_LABEL: COUNT8("Another numeric variable label"); - STR2_LABEL: COUNT8("STR2's variable label"); - STR4_LABEL: COUNT8("STR4's variable label"); - STR6_LABEL: COUNT8("Another string variable's label"); -LABELS_END: - -DATA: - 0.0; "11/28/14"; 1.0; - 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r"; - s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM"; -DATA_END: -"#; - let output = sack(input, None, Endian::Big)?; - HexView::new(&output).print()?; - Ok(()) - } -} diff --git a/rust/pspp/src/settings.rs b/rust/pspp/src/settings.rs index 6aad340605..3bfb4f0141 100644 --- a/rust/pspp/src/settings.rs +++ b/rust/pspp/src/settings.rs @@ -128,7 +128,7 @@ impl Default for Settings { impl Settings { pub fn global() -> &'static Settings { static GLOBAL: OnceLock = OnceLock::new(); - GLOBAL.get_or_init( Settings::default) + GLOBAL.get_or_init(Settings::default) } } diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs new file mode 100644 index 0000000000..43e4aa3da5 --- /dev/null +++ b/rust/pspp/src/sys/cooked.rs @@ -0,0 +1,904 @@ +use core::str; +use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; + +use crate::{ + dictionary::{ + Dictionary, InvalidRole, MultipleResponseSet, MultipleResponseType, Value, VarWidth, + Variable, VariableSet, + }, + endian::Endian, + format::{Error as FormatError, Format, UncheckedFormat}, + identifier::{ByIdentifier, Error as IdError, Identifier}, + sys::encoding::Error as EncodingError, + sys::raw::{ + self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, + FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord, + LongStringMissingValueRecord, LongStringValueLabelRecord, MissingValues, + MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawStrArray, RawWidth, + ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributeRecord, VariableRecord, + VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer, + }, +}; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use encoding_rs::Encoding; +use indexmap::set::MutableValues; +use thiserror::Error as ThisError; + +pub use crate::sys::raw::{CategoryLabels, Compression}; + 
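+// A rough sketch of how the raw and cooked layers are meant to fit together
+// when reading a system file, based on the types in this module and in
+// `sys::raw`.  The path and the `eprintln!`-based warning closures are
+// illustrative only, and error handling is compressed into `?` for brevity:
+//
+//     let file = std::io::BufReader::new(std::fs::File::open("example.sav")?);
+//     let records: Vec<raw::Record> =
+//         raw::Reader::new(file, |w| eprintln!("{w}"))?.collect::<Result<_, _>>()?;
+//     let encoding = raw::encoding_from_headers(&records, &|w| eprintln!("{w}"))?;
+//     let decoder = raw::Decoder::new(encoding, |w| eprintln!("{w}"));
+//     let decoded = records
+//         .into_iter()
+//         .map(|record| record.decode(&decoder))
+//         .collect::<Result<Vec<_>, _>>()?;
+//     let headers = Headers::new(decoded, &|e| eprintln!("{e}"))?;
+//     let (dictionary, metadata) = decode(headers, encoding, |e| eprintln!("{e}"))?;
+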
+#[derive(ThisError, Debug)] +pub enum Error { + #[error("Missing header record")] + MissingHeaderRecord, + + // XXX this is an internal error + #[error("More than one file header record")] + DuplicateHeaderRecord, + + #[error("{0}")] + EncodingError(EncodingError), + + #[error("Using default encoding {0}.")] + UsingDefaultEncoding(String), + + #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)] + InvalidVariableWidth { offsets: Range, width: i32 }, + + #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] + InvalidLongMissingValueFormat, + + #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")] + InvalidCreationDate { creation_date: String }, + + #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")] + InvalidCreationTime { creation_time: String }, + + #[error("{id_error} Renaming variable to {new_name}.")] + InvalidVariableName { + id_error: IdError, + new_name: Identifier, + }, + + #[error( + "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}" + )] + InvalidPrintFormat { + new_spec: Format, + variable: Identifier, + format_error: FormatError, + }, + + #[error( + "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}" + )] + InvalidWriteFormat { + new_spec: Format, + variable: Identifier, + format_error: FormatError, + }, + + #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")] + DuplicateVariableName { + duplicate_name: Identifier, + new_name: Identifier, + }, + + #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")] + InvalidDictIndex { dict_index: usize, max_index: usize }, + + #[error("Dictionary index {0} refers to a long string continuation.")] + DictIndexIsContinuation(usize), + + #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")] + LongStringContinuationIndexes { offset: u64, indexes: Vec }, + + #[error( + "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end + )] + InvalidLongStringValueLabels { + offsets: Range, + variables: Vec, + }, + + #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")] + ValueLabelsDifferentTypes { + numeric_var: Identifier, + string_var: Identifier, + }, + + #[error("Invalid multiple response set name. {0}")] + InvalidMrSetName(IdError), + + #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")] + UnknownMrSetVariable { + mr_set: Identifier, + short_name: Identifier, + }, + + #[error("Multiple response set {0} has no variables.")] + EmptyMrSet(Identifier), + + #[error("Multiple response set {0} has only one variable.")] + OneVarMrSet(Identifier), + + #[error("Multiple response set {0} contains both string and numeric variables.")] + MixedMrSet(Identifier), + + #[error( + "Invalid numeric format for counted value {number} in multiple response set {mr_set}." 
+ )] + InvalidMDGroupCountedValue { mr_set: Identifier, number: String }, + + #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] + TooWideMDGroupCountedValue { + mr_set: Identifier, + value: String, + width: usize, + max_width: u16, + }, + + #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")] + InvalidLongValueLabelWidth { + name: Identifier, + width: u32, + min_width: u16, + max_width: u16, + }, + + #[error("Invalid attribute name. {0}")] + InvalidAttributeName(IdError), + + #[error("Invalid short name in long variable name record. {0}")] + InvalidShortName(IdError), + + #[error("Invalid name in long variable name record. {0}")] + InvalidLongName(IdError), + + #[error("Invalid variable name in very long string record. {0}")] + InvalidLongStringName(IdError), + + #[error("Invalid variable name in long string value label record. {0}")] + InvalidLongStringValueLabelName(IdError), + + #[error("Invalid variable name in attribute record. {0}")] + InvalidAttributeVariableName(IdError), + + // XXX This is risky because `text` might be arbitarily long. + #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] + MalformedString { encoding: String, text: String }, + + #[error("Details TBD")] + TBD, +} + +#[derive(Clone, Debug)] +pub struct Headers { + pub header: HeaderRecord, + pub variable: Vec>, + pub value_label: Vec, String>>, + pub document: Vec>, + pub integer_info: Option, + pub float_info: Option, + pub var_display: Option, + pub multiple_response: Vec>, + pub long_string_value_labels: Vec>, + pub long_string_missing_values: Vec>, + pub encoding: Option, + pub number_of_cases: Option, + pub variable_sets: Vec, + pub product_info: Option, + pub long_names: Vec, + pub very_long_strings: Vec, + pub file_attributes: Vec, + pub variable_attributes: Vec, + pub other_extension: Vec, + pub end_of_headers: Option, + pub z_header: Option, + pub z_trailer: Option, + pub cases: Option>>, +} + +fn take_first(mut vec: Vec, more_than_one: F) -> Option +where + F: FnOnce(), +{ + if vec.len() > 1 { + more_than_one(); + } + vec.drain(..).next() +} + +impl Headers { + pub fn new(headers: Vec, warn: &impl Fn(Error)) -> Result { + let mut file_header = Vec::new(); + let mut variable = Vec::new(); + let mut value_label = Vec::new(); + let mut document = Vec::new(); + let mut integer_info = Vec::new(); + let mut float_info = Vec::new(); + let mut var_display = Vec::new(); + let mut multiple_response = Vec::new(); + let mut long_string_value_labels = Vec::new(); + let mut long_string_missing_values = Vec::new(); + let mut encoding = Vec::new(); + let mut number_of_cases = Vec::new(); + let mut variable_sets = Vec::new(); + let mut product_info = Vec::new(); + let mut long_names = Vec::new(); + let mut very_long_strings = Vec::new(); + let mut file_attributes = Vec::new(); + let mut variable_attributes = Vec::new(); + let mut other_extension = Vec::new(); + let mut end_of_headers = Vec::new(); + let mut z_header = Vec::new(); + let mut z_trailer = Vec::new(); + let mut cases = Vec::new(); + + for header in headers { + match header { + DecodedRecord::Header(record) => { + file_header.push(record); + } + DecodedRecord::Variable(record) => { + variable.push(record); + } + DecodedRecord::ValueLabel(record) => { + value_label.push(record); + } + DecodedRecord::Document(record) => { + 
document.push(record); + } + DecodedRecord::IntegerInfo(record) => { + integer_info.push(record); + } + DecodedRecord::FloatInfo(record) => { + float_info.push(record); + } + DecodedRecord::VariableSets(record) => { + variable_sets.push(record); + } + DecodedRecord::VarDisplay(record) => { + var_display.push(record); + } + DecodedRecord::MultipleResponse(record) => { + multiple_response.push(record); + } + DecodedRecord::LongStringValueLabels(record) => { + long_string_value_labels.push(record) + } + DecodedRecord::LongStringMissingValues(record) => { + long_string_missing_values.push(record); + } + DecodedRecord::Encoding(record) => { + encoding.push(record); + } + DecodedRecord::NumberOfCases(record) => { + number_of_cases.push(record); + } + DecodedRecord::ProductInfo(record) => { + product_info.push(record); + } + DecodedRecord::LongNames(record) => { + long_names.push(record); + } + DecodedRecord::VeryLongStrings(record) => { + very_long_strings.push(record); + } + DecodedRecord::FileAttributes(record) => { + file_attributes.push(record); + } + DecodedRecord::VariableAttributes(record) => { + variable_attributes.push(record); + } + DecodedRecord::OtherExtension(record) => { + other_extension.push(record); + } + DecodedRecord::EndOfHeaders(record) => { + end_of_headers.push(record); + } + DecodedRecord::ZHeader(record) => { + z_header.push(record); + } + DecodedRecord::ZTrailer(record) => { + z_trailer.push(record); + } + DecodedRecord::Cases(record) => { + cases.push(record); + } + } + } + + let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord)) + else { + return Err(Error::MissingHeaderRecord); + }; + + Ok(Headers { + header: file_header, + variable, + value_label, + document, + integer_info: take_first(integer_info, || warn(Error::TBD)), + float_info: take_first(float_info, || warn(Error::TBD)), + var_display: take_first(var_display, || warn(Error::TBD)), + multiple_response, + long_string_value_labels, + long_string_missing_values, + encoding: take_first(encoding, || warn(Error::TBD)), + number_of_cases: take_first(number_of_cases, || warn(Error::TBD)), + variable_sets, + product_info: take_first(product_info, || warn(Error::TBD)), + long_names, + very_long_strings, + file_attributes, + variable_attributes, + other_extension, + end_of_headers: take_first(end_of_headers, || warn(Error::TBD)), + z_header: take_first(z_header, || warn(Error::TBD)), + z_trailer: take_first(z_trailer, || warn(Error::TBD)), + cases: take_first(cases, || warn(Error::TBD)), + }) + } +} + +#[derive(Debug)] +pub struct Metadata { + pub creation: NaiveDateTime, + pub endian: Endian, + pub compression: Option, + pub n_cases: Option, + pub product: String, + pub product_ext: Option, + pub version: Option<(i32, i32, i32)>, +} + +impl Metadata { + fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self { + let header = &headers.header; + let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationDate { + creation_date: header.creation_date.to_string(), + }); + Default::default() + }); + let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S") + .unwrap_or_else(|_| { + warn(Error::InvalidCreationTime { + creation_time: header.creation_time.to_string(), + }); + Default::default() + }); + let creation = NaiveDateTime::new(creation_date, creation_time); + + let product = header + .eye_catcher + .trim_start_matches("@(#) SPSS DATA FILE") + .trim_end() + .to_string(); + + Self { + 
creation, + endian: header.endian, + compression: header.compression, + n_cases: header.n_cases.map(|n| n as u64), + product, + product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)), + version: headers.integer_info.as_ref().map(|ii| ii.version), + } + } +} + +struct Decoder { + pub encoding: &'static Encoding, + n_generated_names: usize, +} + +impl Decoder { + fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier { + loop { + self.n_generated_names += 1; + let name = Identifier::from_encoding( + format!("VAR{:03}", self.n_generated_names), + self.encoding, + ) + .unwrap(); + if !dictionary.variables.contains(&name.0) { + return name; + } + assert!(self.n_generated_names < usize::MAX); + } + } +} + +pub fn decode( + mut headers: Headers, + encoding: &'static Encoding, + warn: impl Fn(Error), +) -> Result<(Dictionary, Metadata), Error> { + let mut dictionary = Dictionary::new(encoding); + + let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' ')); + if !file_label.is_empty() { + dictionary.file_label = Some(file_label); + } + + for mut attributes in headers.file_attributes.drain(..) { + dictionary.attributes.append(&mut attributes.0) + } + + // Concatenate all the document records (really there should only be one) + // and trim off the trailing spaces that pad them to 80 bytes. + dictionary.documents = headers + .document + .drain(..) + .flat_map(|record| record.lines) + .map(trim_end_spaces) + .collect(); + + // XXX warn for weird integer format + // XXX warn for weird floating-point format, etc. + + let mut decoder = Decoder { + encoding, + n_generated_names: 0, + }; + + let mut var_index_map = HashMap::new(); + let mut value_index = 0; + for (index, input) in headers + .variable + .iter() + .enumerate() + .filter(|(_index, record)| record.width != RawWidth::Continuation) + { + let name = trim_end_spaces(input.name.to_string()); + let name = match Identifier::from_encoding(name, encoding) { + Ok(name) => { + if !dictionary.variables.contains(&name.0) { + name + } else { + let new_name = decoder.generate_name(&dictionary); + warn(Error::DuplicateVariableName { + duplicate_name: name.clone(), + new_name: new_name.clone(), + }); + new_name + } + } + Err(id_error) => { + let new_name = decoder.generate_name(&dictionary); + warn(Error::InvalidVariableName { + id_error, + new_name: new_name.clone(), + }); + new_name + } + }; + let mut variable = Variable::new(name.clone(), VarWidth::try_from(input.width).unwrap()); + + // Set the short name the same as the long name (even if we renamed it). + variable.short_names = vec![name]; + + variable.label = input.label.clone(); + + variable.missing_values = input.missing_values.clone(); + + variable.print_format = decode_format( + input.print_format, + variable.width, + |new_spec, format_error| { + warn(Error::InvalidPrintFormat { + new_spec, + variable: variable.name.clone(), + format_error, + }) + }, + ); + variable.write_format = decode_format( + input.write_format, + variable.width, + |new_spec, format_error| { + warn(Error::InvalidWriteFormat { + new_spec, + variable: variable.name.clone(), + format_error, + }) + }, + ); + + // Check for long string continuation records. 
+ let n_values = input.width.n_values().unwrap(); + for offset in 1..n_values { + if headers + .variable + .get(index + offset) + .is_none_or(|record| record.width != RawWidth::Continuation) + { + warn(Error::TBD); + break; + } + } + + let dict_index = dictionary.add_var(variable).unwrap(); + assert_eq!(var_index_map.insert(value_index, dict_index), None); + value_index += n_values; + } + + if let Some(weight_index) = headers.header.weight_index { + if let Some(dict_index) = var_index_map.get(&(weight_index as usize - 1)) { + let variable = &dictionary.variables[*dict_index]; + if variable.is_numeric() { + dictionary.weight = Some(*dict_index); + } else { + warn(Error::TBD); + } + } else { + warn(Error::TBD); + } + } + + for record in headers.value_label.drain(..) { + let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len()); + let mut long_string_variables = Vec::new(); + for value_index in record.dict_indexes.iter() { + let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) else { + unreachable!() + }; + let variable = &dictionary.variables[*dict_index]; + if variable.width.is_long_string() { + long_string_variables.push(variable.name.clone()); + } else { + dict_indexes.push(*dict_index); + } + } + if !long_string_variables.is_empty() { + warn(Error::InvalidLongStringValueLabels { + offsets: record.offsets.clone(), + variables: long_string_variables, + }); + } + + for dict_index in dict_indexes { + let variable = dictionary.variables.get_index_mut2(dict_index).unwrap(); + for ValueLabel { value, label } in record.labels.iter().cloned() { + let value = value.decode(variable.width); + variable.value_labels.insert(value, label); + } + } + } + + if let Some(display) = &headers.var_display { + for (index, display) in display.0.iter().enumerate() { + if let Some(variable) = dictionary.variables.get_index_mut2(index) { + if let Some(width) = display.width { + variable.display_width = width; + } + if let Some(alignment) = display.alignment { + variable.alignment = alignment; + } + if let Some(measure) = display.measure { + variable.measure = Some(measure); + } + } else { + warn(Error::TBD); + } + } + } + + for record in headers + .multiple_response + .iter() + .flat_map(|record| record.0.iter()) + { + match MultipleResponseSet::decode(&dictionary, record, &warn) { + Ok(mrset) => { + dictionary.mrsets.insert(ByIdentifier::new(mrset)); + } + Err(error) => warn(error), + } + } + + 'outer: for record in headers + .very_long_strings + .drain(..) 
+ .flat_map(|record| record.0.into_iter()) + { + let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else { + warn(Error::TBD); + continue; + }; + let width = VarWidth::String(record.length); + let n_segments = width.n_segments(); + if n_segments == 1 { + warn(Error::TBD); + continue; + } + if index + n_segments > dictionary.variables.len() { + warn(Error::TBD); + continue; + } + let mut short_names = Vec::with_capacity(n_segments); + for i in 0..n_segments { + let alloc_width = width.segment_alloc_width(i); + let segment = &dictionary.variables[index + i]; + short_names.push(segment.short_names[0].clone()); + let segment_width = segment.width.as_string_width().unwrap_or(0); + if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) { + warn(Error::TBD); + continue 'outer; + } + } + dictionary.delete_vars(index + 1..index + n_segments); + let variable = dictionary.variables.get_index_mut2(index).unwrap(); + variable.short_names = short_names; + variable.width = width; + } + + if headers.long_names.is_empty() { + // There are no long variable names. Use the short variable names, + // converted to lowercase, as the long variable names. + for index in 0..dictionary.variables.len() { + let lower = dictionary.variables[index].name.0.as_ref().to_lowercase(); + if let Ok(new_name) = Identifier::from_encoding(lower, dictionary.encoding) { + dictionary.try_rename_var(index, new_name); + } + } + } else { + // Rename each of the variables, one by one. (In a correctly + // constructed system file, this cannot create any intermediate + // duplicate variable names, because all of the new variable names are + // longer than any of the old variable names and thus there cannot be + // any overlaps.) + for renaming in headers + .long_names + .iter() + .flat_map(|record| record.0.iter().cloned()) + { + let LongName { + short_name, + long_name, + } = renaming; + if let Some(index) = dictionary.variables.get_index_of(&short_name.0) { + dictionary.try_rename_var(index, long_name); + dictionary + .variables + .get_index_mut2(index) + .unwrap() + .short_names = vec![short_name]; + } else { + warn(Error::TBD); + } + } + } + + for mut attr_set in headers + .variable_attributes + .drain(..) + .flat_map(|record| record.0.into_iter()) + { + if let Some((_, variable)) = dictionary + .variables + .get_full_mut2(&attr_set.long_var_name.0) + { + variable.attributes.append(&mut attr_set.attributes); + } else { + warn(Error::TBD); + } + } + + // Assign variable roles. + for index in 0..dictionary.variables.len() { + let variable = dictionary.variables.get_index_mut2(index).unwrap(); + match variable.attributes.role() { + Ok(role) => variable.role = role, + Err(InvalidRole) => warn(Error::TBD), + } + } + + // Long string value labels. + for record in headers + .long_string_value_labels + .drain(..) + .flat_map(|record| record.0.into_iter()) + { + let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else { + warn(Error::TBD); + continue; + }; + let Some(width) = variable.width.as_string_width() else { + warn(Error::TBD); + continue; + }; + for (mut value, label) in record.labels.into_iter() { + // XXX warn about too-long value? + value.0.resize(width, b' '); + // XXX warn abouat duplicate value labels? + variable.value_labels.insert(Value::String(value), label); + } + } + + let mut value = Vec::new(); + for record in headers + .long_string_missing_values + .drain(..) 
+ .flat_map(|record| record.0.into_iter()) + { + let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else { + warn(Error::TBD); + continue; + }; + let values = record + .missing_values + .into_iter() + .map(|v| { + value.clear(); + value.extend_from_slice(v.0.as_slice()); + value.resize(variable.width.as_string_width().unwrap(), b' '); + Value::String(Box::from(value.as_slice())) + }) + .collect::>(); + variable.missing_values = MissingValues { + values, + range: None, + }; + } + + for record in headers + .variable_sets + .drain(..) + .flat_map(|record| record.sets.into_iter()) + { + let mut variables = Vec::with_capacity(record.variable_names.len()); + for variable_name in record.variable_names { + let Some((dict_index, _)) = dictionary.variables.get_full_mut2(&variable_name.0) else { + warn(Error::TBD); + continue; + }; + variables.push(dict_index); + } + if !variables.is_empty() { + let variable_set = VariableSet { + name: record.name, + variables, + }; + dictionary + .variable_sets + .insert(ByIdentifier::new(variable_set)); + } + } + + let metadata = Metadata::decode(&headers, warn); + Ok((dictionary, metadata)) +} + +impl MultipleResponseSet { + fn decode( + dictionary: &Dictionary, + input: &raw::MultipleResponseSet, + warn: &impl Fn(Error), + ) -> Result { + let mr_set_name = input.name.clone(); + let mut variables = Vec::with_capacity(input.short_names.len()); + for short_name in input.short_names.iter() { + let Some(dict_index) = dictionary.variables.get_index_of(&short_name.0) else { + warn(Error::UnknownMrSetVariable { + mr_set: mr_set_name.clone(), + short_name: short_name.clone(), + }); + continue; + }; + variables.push(dict_index); + } + + match variables.len() { + 0 => return Err(Error::EmptyMrSet(mr_set_name)), + 1 => return Err(Error::OneVarMrSet(mr_set_name)), + _ => (), + } + + let Some((Some(min_width), Some(max_width))) = variables + .iter() + .copied() + .map(|dict_index| dictionary.variables[dict_index].width) + .map(|w| (Some(w), Some(w))) + .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) + else { + return Err(Error::MixedMrSet(mr_set_name)); + }; + + let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, min_width)?; + + Ok(MultipleResponseSet { + name: mr_set_name, + width: min_width..=max_width, + label: input.label.to_string(), + mr_type, + variables, + }) + } +} + +fn trim_end_spaces(mut s: String) -> String { + s.truncate(s.trim_end_matches(' ').len()); + s +} + +/// Returns a copy of `s` in which all lone CR and CR LF pairs have been +/// replaced by LF. +/// +/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system +/// files that use CR-only line ends in the file label and extra product info.) 
+fn fix_line_ends(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut s = s.chars().peekable(); + while let Some(c) = s.next() { + match c { + '\r' => { + s.next_if_eq(&'\n'); + out.push('\n') + } + c => out.push(c), + } + } + out +} + +fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format { + UncheckedFormat::try_from(raw) + .and_then(Format::try_from) + .and_then(|x| x.check_width_compatibility(width)) + .unwrap_or_else(|error| { + let new_format = Format::default_for_width(width); + warn(new_format, error); + new_format + }) +} + +impl MultipleResponseType { + fn decode( + mr_set: &Identifier, + input: &raw::MultipleResponseType, + min_width: VarWidth, + ) -> Result { + match input { + raw::MultipleResponseType::MultipleDichotomy { value, labels } => { + let value = match min_width { + VarWidth::Numeric => { + let string = String::from_utf8_lossy(&value.0); + let number: f64 = string.trim().parse().map_err(|_| { + Error::InvalidMDGroupCountedValue { + mr_set: mr_set.clone(), + number: string.into(), + } + })?; + Value::Number(Some(number)) + } + VarWidth::String(max_width) => { + let mut value = value.0.as_slice(); + while value.ends_with(b" ") { + value = &value[..value.len() - 1]; + } + let width = value.len(); + if width > max_width as usize { + return Err(Error::TooWideMDGroupCountedValue { + mr_set: mr_set.clone(), + value: String::from_utf8_lossy(value).into(), + width, + max_width, + }); + }; + Value::String(value.into()) + } + }; + Ok(MultipleResponseType::MultipleDichotomy { + value, + labels: *labels, + }) + } + raw::MultipleResponseType::MultipleCategory => { + Ok(MultipleResponseType::MultipleCategory) + } + } + } +} diff --git a/rust/pspp/src/sys/encoding.rs b/rust/pspp/src/sys/encoding.rs new file mode 100644 index 0000000000..c408bf56fa --- /dev/null +++ b/rust/pspp/src/sys/encoding.rs @@ -0,0 +1,95 @@ +use crate::locale_charset::locale_charset; +use encoding_rs::{Encoding, UTF_8}; + +include!(concat!(env!("OUT_DIR"), "/encodings.rs")); + +pub fn codepage_from_encoding(encoding: &str) -> Option { + CODEPAGE_NAME_TO_NUMBER + .get(encoding.to_ascii_lowercase().as_str()) + .copied() +} + +use thiserror::Error as ThisError; + +#[derive(ThisError, Debug)] +pub enum Error { + #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")] + NoEncoding, + + #[error("This system file encodes text strings with unknown code page {0}.")] + UnknownCodepage(i32), + + #[error("This system file encodes text strings with unknown encoding {0}.")] + UnknownEncoding(String), + + #[error("This system file is encoded in EBCDIC, which is not supported.")] + Ebcdic, +} + +pub fn default_encoding() -> &'static Encoding { + lazy_static! { + static ref DEFAULT_ENCODING: &'static Encoding = + Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8); + } + &DEFAULT_ENCODING +} + +pub fn get_encoding( + encoding: Option<&str>, + character_code: Option, +) -> Result<&'static Encoding, Error> { + let label = if let Some(encoding) = encoding { + encoding + } else if let Some(codepage) = character_code { + match codepage { + 1 => return Err(Error::Ebcdic), + 2 | 3 => { + // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] + // respectively. However, many files have character code 2 but + // data which are clearly not ASCII. Therefore, ignore these + // values. 
+ return Err(Error::NoEncoding); + } + 4 => "MS_KANJI", + _ => CODEPAGE_NUMBER_TO_NAME + .get(&codepage) + .copied() + .ok_or(Error::UnknownCodepage(codepage))?, + } + } else { + return Err(Error::NoEncoding); + }; + + Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into())) +} + +/* +#[cfg(test)] +mod tests { + use std::thread::spawn; + + use encoding_rs::{EUC_JP, UTF_8, WINDOWS_1252}; + + #[test] + fn round_trip() { + let mut threads = Vec::new(); + for thread in 0..128 { + let start: u32 = thread << 25; + let end = start + ((1 << 25) - 1); + threads.push(spawn(move || { + for i in start..=end { + let s = i.to_le_bytes(); + let (utf8, replacement) = EUC_JP.decode_without_bom_handling(&s); + if !replacement { + let s2 = UTF_8.encode(&utf8).0; + assert_eq!(s.as_slice(), &*s2); + } + } + })); + } + for thread in threads { + thread.join().unwrap(); + } + } +} +*/ diff --git a/rust/pspp/src/sys/mod.rs b/rust/pspp/src/sys/mod.rs new file mode 100644 index 0000000000..57a1d00e56 --- /dev/null +++ b/rust/pspp/src/sys/mod.rs @@ -0,0 +1,4 @@ +pub mod cooked; +pub mod encoding; +pub mod raw; +pub mod sack; diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs new file mode 100644 index 0000000000..7a0af59e1d --- /dev/null +++ b/rust/pspp/src/sys/raw.rs @@ -0,0 +1,3008 @@ +use crate::{ + dictionary::{Attributes, Value, VarWidth}, + endian::{Endian, Parse, ToBytes}, + identifier::{Error as IdError, Identifier}, + sys::encoding::{default_encoding, get_encoding, Error as EncodingError}, +}; + +use encoding_rs::{mem::decode_latin1, Encoding}; +use flate2::read::ZlibDecoder; +use num::Integer; +use std::{ + borrow::Cow, + cell::RefCell, + collections::{HashMap, VecDeque}, + fmt::{Debug, Display, Formatter, Result as FmtResult}, + io::{Error as IoError, Read, Seek, SeekFrom}, + mem::take, + num::NonZeroU8, + ops::Range, + rc::Rc, + str::from_utf8, +}; +use thiserror::Error as ThisError; + +#[derive(ThisError, Debug)] +pub enum Error { + #[error("Not an SPSS system file")] + NotASystemFile, + + #[error("Invalid magic number {0:?}")] + BadMagic([u8; 4]), + + #[error("I/O error ({0})")] + Io(#[from] IoError), + + #[error("Invalid SAV compression code {0}")] + InvalidSavCompression(u32), + + #[error("Invalid ZSAV compression code {0}")] + InvalidZsavCompression(u32), + + #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")] + BadDocumentLength { offset: u64, n: usize, max: usize }, + + #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")] + BadRecordType { offset: u64, rec_type: u32 }, + + #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")] + BadVariableWidth { start_offset: u64, width: i32 }, + + #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")] + BadVariableLabelCode { + start_offset: u64, + code_offset: u64, + code: u32, + }, + + #[error("At offset {offset:#x}, missing value code ({code}) is not -3, -2, 0, 1, 2, or 3.")] + BadMissingValueCode { offset: u64, code: i32 }, + + #[error( + "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." 
+ )] + BadNumericMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")] + BadStringMissingValueCode { offset: u64, code: i32 }, + + #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")] + BadNumberOfValueLabels { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")] + ExpectedVarIndexRecord { offset: u64, rec_type: u32 }, + + #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")] + TooManyVarIndexes { offset: u64, n: u32, max: u32 }, + + #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")] + ExtensionRecordTooLarge { + offset: u64, + subtype: u32, + size: u32, + count: u32, + }, + + #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")] + EofInCase { + offset: u64, + case_ofs: u64, + case_len: usize, + }, + + #[error( + "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case." + )] + EofInCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")] + PartialCompressedCase { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")] + CompressedNumberExpected { offset: u64, case_ofs: u64 }, + + #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")] + CompressedStringExpected { offset: u64, case_ofs: u64 }, + + #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")] + BadZlibTrailerNBlocks { + offset: u64, + n_blocks: u32, + expected_n_blocks: u64, + ztrailer_len: u64, + }, + + #[error("{0}")] + EncodingError(EncodingError), +} + +#[derive(ThisError, Debug)] +pub enum Warning { + #[error("Unexpected end of data inside extension record.")] + UnexpectedEndOfData, + + #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")] + NoVarIndexes { offset: u64 }, + + #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())] + MixedVarTypes { + offset: u64, + var_type: VarType, + wrong_types: Vec, + }, + + #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}] or referred to string continuations: {invalid:?}")] + InvalidVarIndexes { + offset: u64, + max: usize, + invalid: Vec, + }, + + #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")] + BadRecordSize { + offset: u64, + record: String, + size: u32, + expected_size: u32, + }, + + #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")] + BadRecordCount { + offset: u64, + record: String, + count: u32, + expected_count: u32, + }, + + #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset 
{offset:#x} is {value_len} instead of the expected 8.")] + BadLongMissingValueLength { + record_offset: u64, + offset: u64, + value_len: u32, + }, + + #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")] + BadEncodingName { offset: u64 }, + + // XXX This is risky because `text` might be arbitarily long. + #[error("Text string contains invalid bytes for {encoding} encoding: {text}")] + MalformedString { encoding: String, text: String }, + + #[error("Invalid variable measurement level value {0}")] + InvalidMeasurement(u32), + + #[error("Invalid variable display alignment value {0}")] + InvalidAlignment(u32), + + #[error("Invalid attribute name. {0}")] + InvalidAttributeName(IdError), + + #[error("Invalid variable name in attribute record. {0}")] + InvalidAttributeVariableName(IdError), + + #[error("Invalid short name in long variable name record. {0}")] + InvalidShortName(IdError), + + #[error("Invalid name in long variable name record. {0}")] + InvalidLongName(IdError), + + #[error("Invalid variable name in very long string record. {0}")] + InvalidLongStringName(IdError), + + #[error("Invalid variable name in variable set record. {0}")] + InvalidVariableSetName(IdError), + + #[error("Invalid multiple response set name. {0}")] + InvalidMrSetName(IdError), + + #[error("Invalid multiple response set variable name. {0}")] + InvalidMrSetVariableName(IdError), + + #[error("Invalid variable name in long string missing values record. {0}")] + InvalidLongStringMissingValueVariableName(IdError), + + #[error("Invalid variable name in long string value label record. {0}")] + InvalidLongStringValueLabelName(IdError), + + #[error("{0}")] + EncodingError(EncodingError), + + #[error("Details TBD")] + TBD, +} + +impl From for Warning { + fn from(_source: IoError) -> Self { + Self::UnexpectedEndOfData + } +} + +#[derive(Clone, Debug)] +pub enum Record { + Header(HeaderRecord), + Variable(VariableRecord), + ValueLabel(ValueLabelRecord, RawString>), + Document(DocumentRecord), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VarDisplay(VarDisplayRecord), + MultipleResponse(MultipleResponseRecord), + LongStringValueLabels(LongStringValueLabelRecord), + LongStringMissingValues(LongStringMissingValueRecord), + Encoding(EncodingRecord), + NumberOfCases(NumberOfCasesRecord), + Text(TextRecord), + OtherExtension(Extension), + EndOfHeaders(u32), + ZHeader(ZHeader), + ZTrailer(ZTrailer), + Cases(Rc>), +} + +#[derive(Clone, Debug)] +pub enum DecodedRecord { + Header(HeaderRecord), + Variable(VariableRecord), + ValueLabel(ValueLabelRecord, String>), + Document(DocumentRecord), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VarDisplay(VarDisplayRecord), + MultipleResponse(MultipleResponseRecord), + LongStringValueLabels(LongStringValueLabelRecord), + LongStringMissingValues(LongStringMissingValueRecord), + Encoding(EncodingRecord), + NumberOfCases(NumberOfCasesRecord), + VariableSets(VariableSetRecord), + ProductInfo(ProductInfoRecord), + LongNames(LongNamesRecord), + VeryLongStrings(VeryLongStringsRecord), + FileAttributes(FileAttributeRecord), + VariableAttributes(VariableAttributeRecord), + OtherExtension(Extension), + EndOfHeaders(u32), + ZHeader(ZHeader), + ZTrailer(ZTrailer), + Cases(Rc>), +} + +impl Record { + fn read( + reader: &mut R, + endian: Endian, + var_types: &VarTypes, + warn: &dyn Fn(Warning), + ) -> Result, Error> + where + R: Read + Seek, + { + let rec_type: u32 = endian.parse(read_bytes(reader)?); + match rec_type 
{ + 2 => Ok(Some(VariableRecord::read(reader, endian, warn)?)), + 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?), + 6 => Ok(Some(DocumentRecord::read(reader, endian)?)), + 7 => Extension::read(reader, endian, var_types.n_values(), warn), + 999 => Ok(Some(Record::EndOfHeaders( + endian.parse(read_bytes(reader)?), + ))), + _ => Err(Error::BadRecordType { + offset: reader.stream_position()?, + rec_type, + }), + } + } + + pub fn decode(self, decoder: &Decoder) -> Result { + Ok(match self { + Record::Header(record) => record.decode(decoder), + Record::Variable(record) => record.decode(decoder), + Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)), + Record::Document(record) => record.decode(decoder), + Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()), + Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()), + Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()), + Record::MultipleResponse(record) => record.decode(decoder), + Record::LongStringValueLabels(record) => { + DecodedRecord::LongStringValueLabels(record.decode(decoder)) + } + Record::LongStringMissingValues(record) => { + DecodedRecord::LongStringMissingValues(record.decode(decoder)) + } + Record::Encoding(record) => DecodedRecord::Encoding(record.clone()), + Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()), + Record::Text(record) => record.decode(decoder), + Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()), + Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record), + Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()), + Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()), + Record::Cases(record) => DecodedRecord::Cases(record.clone()), + }) + } +} + +pub fn encoding_from_headers( + headers: &Vec, + warn: &impl Fn(Warning), +) -> Result<&'static Encoding, Error> { + let mut encoding_record = None; + let mut integer_info_record = None; + for record in headers { + match record { + Record::Encoding(record) => encoding_record = Some(record), + Record::IntegerInfo(record) => integer_info_record = Some(record), + _ => (), + } + } + let encoding = encoding_record.map(|record| record.0.as_str()); + let character_code = integer_info_record.map(|record| record.character_code); + match get_encoding(encoding, character_code) { + Ok(encoding) => Ok(encoding), + Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)), + Err(err) => { + warn(Warning::EncodingError(err)); + // Warn that we're using the default encoding. + Ok(default_encoding()) + } + } +} + +// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it +// decoded as Latin-1 (actually bytes interpreted as Unicode code points). +fn default_decode(s: &[u8]) -> Cow { + from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Compression { + Simple, + ZLib, +} + +#[derive(Clone)] +pub struct HeaderRecord +where + S: Debug, +{ + /// Offset in file. + pub offsets: Range, + + /// Magic number. + pub magic: Magic, + + /// Eye-catcher string, product name, in the file's encoding. Padded + /// on the right with spaces. + pub eye_catcher: S, + + /// Layout code, normally either 2 or 3. + pub layout_code: u32, + + /// Number of variable positions, or `None` if the value in the file is + /// questionably trustworthy. 
+ pub nominal_case_size: Option, + + /// Compression type, if any, + pub compression: Option, + + /// 1-based variable index of the weight variable, or `None` if the file is + /// unweighted. + pub weight_index: Option, + + /// Claimed number of cases, if known. + pub n_cases: Option, + + /// Compression bias, usually 100.0. + pub bias: f64, + + /// `dd mmm yy` in the file's encoding. + pub creation_date: S, + + /// `HH:MM:SS` in the file's encoding. + pub creation_time: S, + + /// File label, in the file's encoding. Padded on the right with spaces. + pub file_label: S, + + /// Endianness of the data in the file header. + pub endian: Endian, +} + +impl HeaderRecord +where + S: Debug, +{ + fn debug_field(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult + where + T: Debug, + { + writeln!(f, "{name:>17}: {:?}", value) + } +} + +impl Debug for HeaderRecord +where + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!(f, "File header record:")?; + self.debug_field(f, "Magic", self.magic)?; + self.debug_field(f, "Product name", &self.eye_catcher)?; + self.debug_field(f, "Layout code", self.layout_code)?; + self.debug_field(f, "Nominal case size", self.nominal_case_size)?; + self.debug_field(f, "Compression", self.compression)?; + self.debug_field(f, "Weight index", self.weight_index)?; + self.debug_field(f, "Number of cases", self.n_cases)?; + self.debug_field(f, "Compression bias", self.bias)?; + self.debug_field(f, "Creation date", &self.creation_date)?; + self.debug_field(f, "Creation time", &self.creation_time)?; + self.debug_field(f, "File label", &self.file_label)?; + self.debug_field(f, "Endianness", self.endian) + } +} + +impl HeaderRecord { + fn read(r: &mut R) -> Result { + let start = r.stream_position()?; + + let magic: [u8; 4] = read_bytes(r)?; + let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?; + + let eye_catcher = RawString(read_vec(r, 60)?); + let layout_code: [u8; 4] = read_bytes(r)?; + let endian = Endian::identify_u32(2, layout_code) + .or_else(|| Endian::identify_u32(2, layout_code)) + .ok_or(Error::NotASystemFile)?; + let layout_code = endian.parse(layout_code); + + let nominal_case_size: u32 = endian.parse(read_bytes(r)?); + let nominal_case_size = + (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size); + + let compression_code: u32 = endian.parse(read_bytes(r)?); + let compression = match (magic, compression_code) { + (Magic::Zsav, 2) => Some(Compression::ZLib), + (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)), + (_, 0) => None, + (_, 1) => Some(Compression::Simple), + (_, code) => return Err(Error::InvalidSavCompression(code)), + }; + + let weight_index: u32 = endian.parse(read_bytes(r)?); + let weight_index = (weight_index > 0).then_some(weight_index); + + let n_cases: u32 = endian.parse(read_bytes(r)?); + let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); + + let bias: f64 = endian.parse(read_bytes(r)?); + + let creation_date = RawString(read_vec(r, 9)?); + let creation_time = RawString(read_vec(r, 8)?); + let file_label = RawString(read_vec(r, 64)?); + let _: [u8; 3] = read_bytes(r)?; + + Ok(HeaderRecord { + offsets: start..r.stream_position()?, + magic, + layout_code, + nominal_case_size, + compression, + weight_index, + n_cases, + bias, + creation_date, + creation_time, + eye_catcher, + file_label, + endian, + }) + } + + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + let eye_catcher = decoder.decode(&self.eye_catcher).to_string(); + let 
file_label = decoder.decode(&self.file_label).to_string(); + let creation_date = decoder.decode(&self.creation_date).to_string(); + let creation_time = decoder.decode(&self.creation_time).to_string(); + DecodedRecord::Header(HeaderRecord { + eye_catcher, + weight_index: self.weight_index, + n_cases: self.n_cases, + file_label, + offsets: self.offsets.clone(), + magic: self.magic, + layout_code: self.layout_code, + nominal_case_size: self.nominal_case_size, + compression: self.compression, + bias: self.bias, + creation_date, + creation_time, + endian: self.endian, + }) + } +} + +pub struct Decoder { + pub encoding: &'static Encoding, + pub warn: Box, +} + +impl Decoder { + pub fn new(encoding: &'static Encoding, warn: F) -> Self + where + F: Fn(Warning) + 'static, + { + Self { + encoding, + warn: Box::new(warn), + } + } + fn warn(&self, warning: Warning) { + (self.warn)(warning) + } + fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + let (output, malformed) = self.encoding.decode_without_bom_handling(input); + if malformed { + self.warn(Warning::MalformedString { + encoding: self.encoding.name().into(), + text: output.clone().into(), + }); + } + output + } + + fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> { + self.decode_slice(input.0.as_slice()) + } + + pub fn decode_identifier(&self, input: &RawString) -> Result { + self.new_identifier(&self.decode(input)) + } + + pub fn new_identifier(&self, name: &str) -> Result { + Identifier::from_encoding(name, self.encoding) + } +} + +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub enum Magic { + /// Regular system file. + Sav, + + /// System file with Zlib-compressed data. + Zsav, + + /// EBCDIC-encoded system file. + Ebcdic, +} + +impl Magic { + /// Magic number for a regular system file. + pub const SAV: [u8; 4] = *b"$FL2"; + + /// Magic number for a system file that contains zlib-compressed data. + pub const ZSAV: [u8; 4] = *b"$FL3"; + + /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded + /// in EBCDIC. 
+ pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2]; +} + +impl Debug for Magic { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let s = match *self { + Magic::Sav => "$FL2", + Magic::Zsav => "$FL3", + Magic::Ebcdic => "($FL2 in EBCDIC)", + }; + write!(f, "{s}") + } +} + +impl TryFrom<[u8; 4]> for Magic { + type Error = Error; + + fn try_from(value: [u8; 4]) -> Result { + match value { + Magic::SAV => Ok(Magic::Sav), + Magic::ZSAV => Ok(Magic::Zsav), + Magic::EBCDIC => Ok(Magic::Ebcdic), + _ => Err(Error::BadMagic(value)), + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum VarType { + Numeric, + String, +} + +impl VarType { + pub fn opposite(self) -> VarType { + match self { + Self::Numeric => Self::String, + Self::String => Self::Numeric, + } + } +} + +impl Display for VarType { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match self { + VarType::Numeric => write!(f, "numeric"), + VarType::String => write!(f, "string"), + } + } +} + +impl TryFrom for VarType { + type Error = (); + + fn try_from(value: RawWidth) -> Result { + match value { + RawWidth::Continuation => Err(()), + RawWidth::Numeric => Ok(VarType::Numeric), + RawWidth::String(_) => Ok(VarType::String), + } + } +} + +impl TryFrom for VarWidth { + type Error = (); + + fn try_from(value: RawWidth) -> Result { + match value { + RawWidth::Continuation => Err(()), + RawWidth::Numeric => Ok(Self::Numeric), + RawWidth::String(width) => Ok(Self::String(width.get() as u16)), + } + } +} + +type RawValue = Value>; + +impl RawValue { + pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self { + match var_type { + VarType::String => Value::String(RawStrArray(raw.0)), + VarType::Numeric => Value::Number(endian.parse(raw.0)), + } + } + + fn read_case( + reader: &mut R, + var_types: &VarTypes, + endian: Endian, + ) -> Result>, Error> { + let case_start = reader.stream_position()?; + let mut values = Vec::with_capacity(var_types.n_values()); + for (i, var_type) in var_types.iter().enumerate() { + let Some(raw) = try_read_bytes(reader)? else { + if i == 0 { + return Ok(None); + } else { + let offset = reader.stream_position()?; + return Err(Error::EofInCase { + offset, + case_ofs: offset - case_start, + case_len: var_types.n_values() * 8, + }); + } + }; + values.push(Value::from_raw(&UntypedValue(raw), var_type, endian)); + } + Ok(Some(values)) + } + + fn read_compressed_case( + reader: &mut R, + var_types: &VarTypes, + codes: &mut VecDeque, + endian: Endian, + bias: f64, + ) -> Result>, Error> { + let case_start = reader.stream_position()?; + let mut values = Vec::with_capacity(var_types.n_values()); + for (i, var_type) in var_types.iter().enumerate() { + let value = loop { + let Some(code) = codes.pop_front() else { + let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? 
else { + if i == 0 { + return Ok(None); + } else { + let offset = reader.stream_position()?; + return Err(Error::EofInCompressedCase { + offset, + case_ofs: offset - case_start, + }); + } + }; + codes.extend(new_codes.into_iter()); + continue; + }; + match code { + 0 => (), + 1..=251 => match var_type { + VarType::Numeric => break Self::Number(Some(code as f64 - bias)), + VarType::String => { + break Self::String(RawStrArray(endian.to_bytes(code as f64 - bias))) + } + }, + 252 => { + if i == 0 { + return Ok(None); + } else { + let offset = reader.stream_position()?; + return Err(Error::PartialCompressedCase { + offset, + case_ofs: offset - case_start, + }); + } + } + 253 => { + break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian) + } + 254 => match var_type { + VarType::String => break Self::String(RawStrArray(*b" ")), // XXX EBCDIC + VarType::Numeric => { + return Err(Error::CompressedStringExpected { + offset: case_start, + case_ofs: reader.stream_position()? - case_start, + }) + } + }, + 255 => match var_type { + VarType::Numeric => break Self::Number(None), + VarType::String => { + return Err(Error::CompressedNumberExpected { + offset: case_start, + case_ofs: reader.stream_position()? - case_start, + }) + } + }, + } + }; + values.push(value); + } + Ok(Some(values)) + } + + pub fn decode(&self, width: VarWidth) -> Value { + match self { + Self::Number(x) => Value::Number(*x), + Self::String(s) => { + let width = width.as_string_width().unwrap(); + Value::String(RawString::from(&s.0[..width])) + } + } + } +} + +struct ZlibDecodeMultiple +where + R: Read + Seek, +{ + reader: Option>, +} + +impl ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn new(reader: R) -> ZlibDecodeMultiple { + ZlibDecodeMultiple { + reader: Some(ZlibDecoder::new(reader)), + } + } +} + +impl Read for ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn read(&mut self, buf: &mut [u8]) -> Result { + loop { + match self.reader.as_mut().unwrap().read(buf)? 
{ + 0 => { + let inner = self.reader.take().unwrap().into_inner(); + self.reader = Some(ZlibDecoder::new(inner)); + } + n => return Ok(n), + }; + } + } +} + +impl Seek for ZlibDecodeMultiple +where + R: Read + Seek, +{ + fn seek(&mut self, pos: SeekFrom) -> Result { + self.reader.as_mut().unwrap().get_mut().seek(pos) + } +} + +enum ReaderState { + Start, + Headers, + ZlibHeader, + ZlibTrailer { + ztrailer_offset: u64, + ztrailer_len: u64, + }, + Cases, + End, +} + +pub struct Reader +where + R: Read + Seek + 'static, +{ + reader: Option, + warn: Box, + + header: HeaderRecord, + var_types: VarTypes, + + state: ReaderState, +} + +impl Reader +where + R: Read + Seek + 'static, +{ + pub fn new(mut reader: R, warn: F) -> Result + where + F: Fn(Warning) + 'static, + { + let header = HeaderRecord::read(&mut reader)?; + Ok(Self { + reader: Some(reader), + warn: Box::new(warn), + header, + var_types: VarTypes::new(), + state: ReaderState::Start, + }) + } + fn cases(&mut self) -> Cases { + self.state = ReaderState::End; + Cases::new( + self.reader.take().unwrap(), + take(&mut self.var_types), + &self.header, + ) + } + fn _next(&mut self) -> Option<::Item> { + match self.state { + ReaderState::Start => { + self.state = ReaderState::Headers; + Some(Ok(Record::Header(self.header.clone()))) + } + ReaderState::Headers => { + let record = loop { + match Record::read( + self.reader.as_mut().unwrap(), + self.header.endian, + &self.var_types, + &self.warn, + ) { + Ok(Some(record)) => break record, + Ok(None) => (), + Err(error) => return Some(Err(error)), + } + }; + match record { + Record::Variable(VariableRecord { width, .. }) => self.var_types.push(width), + Record::EndOfHeaders(_) => { + self.state = if let Some(Compression::ZLib) = self.header.compression { + ReaderState::ZlibHeader + } else { + ReaderState::Cases + }; + } + _ => (), + }; + Some(Ok(record)) + } + ReaderState::ZlibHeader => { + let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian) + { + Ok(zheader) => zheader, + Err(error) => return Some(Err(error)), + }; + self.state = ReaderState::ZlibTrailer { + ztrailer_offset: zheader.ztrailer_offset, + ztrailer_len: zheader.ztrailer_len, + }; + Some(Ok(Record::ZHeader(zheader))) + } + ReaderState::ZlibTrailer { + ztrailer_offset, + ztrailer_len, + } => { + match ZTrailer::read( + self.reader.as_mut().unwrap(), + self.header.endian, + ztrailer_offset, + ztrailer_len, + ) { + Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), + Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))), + Err(error) => Some(Err(error)), + } + } + ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))), + ReaderState::End => None, + } + } +} + +impl Iterator for Reader +where + R: Read + Seek + 'static, +{ + type Item = Result; + + fn next(&mut self) -> Option { + let retval = self._next(); + if matches!(retval, Some(Err(_))) { + self.state = ReaderState::End; + } + retval + } +} + +trait ReadSeek: Read + Seek {} +impl ReadSeek for T where T: Read + Seek {} + +pub struct Cases { + reader: Box, + var_types: VarTypes, + compression: Option, + bias: f64, + endian: Endian, + codes: VecDeque, + eof: bool, +} + +impl Debug for Cases { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "Cases") + } +} + +impl Cases { + fn new(reader: R, var_types: VarTypes, header: &HeaderRecord) -> Self + where + R: Read + Seek + 'static, + { + Self { + reader: if header.compression == Some(Compression::ZLib) { + 
Box::new(ZlibDecodeMultiple::new(reader)) + } else { + Box::new(reader) + }, + var_types, + compression: header.compression, + bias: header.bias, + endian: header.endian, + codes: VecDeque::with_capacity(8), + eof: false, + } + } +} + +impl Iterator for Cases { + type Item = Result, Error>; + + fn next(&mut self) -> Option { + if self.eof { + return None; + } + + let retval = if self.compression.is_some() { + Value::read_compressed_case( + &mut self.reader, + &self.var_types, + &mut self.codes, + self.endian, + self.bias, + ) + .transpose() + } else { + Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose() + }; + self.eof = matches!(retval, None | Some(Err(_))); + retval + } +} + +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Spec(pub u32); + +impl Debug for Spec { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let type_ = format_name(self.0 >> 16); + let w = (self.0 >> 8) & 0xff; + let d = self.0 & 0xff; + write!(f, "{:06x} ({type_}{w}.{d})", self.0) + } +} + +fn format_name(type_: u32) -> Cow<'static, str> { + match type_ { + 1 => "A", + 2 => "AHEX", + 3 => "COMMA", + 4 => "DOLLAR", + 5 => "F", + 6 => "IB", + 7 => "PIBHEX", + 8 => "P", + 9 => "PIB", + 10 => "PK", + 11 => "RB", + 12 => "RBHEX", + 15 => "Z", + 16 => "N", + 17 => "E", + 20 => "DATE", + 21 => "TIME", + 22 => "DATETIME", + 23 => "ADATE", + 24 => "JDATE", + 25 => "DTIME", + 26 => "WKDAY", + 27 => "MONTH", + 28 => "MOYR", + 29 => "QYR", + 30 => "WKYR", + 31 => "PCT", + 32 => "DOT", + 33 => "CCA", + 34 => "CCB", + 35 => "CCC", + 36 => "CCD", + 37 => "CCE", + 38 => "EDATE", + 39 => "SDATE", + 40 => "MTIME", + 41 => "YMDHMS", + _ => return format!("").into(), + } + .into() +} + +#[derive(Clone)] +pub struct MissingValues> +where + S: Debug, +{ + /// Individual missing values, up to 3 of them. + pub values: Vec>, + + /// Optional range of missing values. 
+ pub range: Option<(Value, Value)>, +} + +impl Debug for MissingValues +where + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + for (i, value) in self.values.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{value:?}")?; + } + + if let Some((low, high)) = &self.range { + if !self.values.is_empty() { + write!(f, ", ")?; + } + write!(f, "{low:?} THRU {high:?}")?; + } + + if self.is_empty() { + write!(f, "none")?; + } + + Ok(()) + } +} + +impl MissingValues +where + S: Debug, +{ + fn is_empty(&self) -> bool { + self.values.is_empty() && self.range.is_none() + } +} + +impl Default for MissingValues +where + S: Debug, +{ + fn default() -> Self { + Self { + values: Vec::new(), + range: None, + } + } +} + +impl MissingValues { + fn read( + r: &mut R, + offset: u64, + width: RawWidth, + code: i32, + endian: Endian, + warn: &dyn Fn(Warning), + ) -> Result { + let (individual_values, has_range) = match code { + 0..=3 => (code as usize, false), + -2 => (0, true), + -3 => (1, true), + _ => return Err(Error::BadMissingValueCode { offset, code }), + }; + + let mut values = Vec::with_capacity(individual_values); + for _ in 0..individual_values { + values.push(read_bytes::<8, _>(r)?); + } + let range = if has_range { + let low = read_bytes::<8, _>(r)?; + let high = read_bytes::<8, _>(r)?; + Some((low, high)) + } else { + None + }; + + match VarWidth::try_from(width) { + Ok(VarWidth::Numeric) => { + let values = values + .into_iter() + .map(|v| Value::Number(endian.parse(v))) + .collect(); + let range = range.map(|(low, high)| { + ( + Value::Number(endian.parse(low)), + Value::Number(endian.parse(high)), + ) + }); + return Ok(Self { values, range }); + } + Ok(VarWidth::String(width)) if width <= 8 && range.is_none() => { + let values = values + .into_iter() + .map(|value| Value::String(Box::from(&value[..width as usize]))) + .collect(); + return Ok(Self { + values, + range: None, + }); + } + Ok(VarWidth::String(width)) if width > 8 => warn(Warning::TBD), + Ok(VarWidth::String(_)) => warn(Warning::TBD), + Err(()) => warn(Warning::TBD), + } + Ok(Self::default()) + } +} + +#[derive(Clone)] +pub struct VariableRecord +where + S: Debug, +{ + /// Range of offsets in file. + pub offsets: Range, + + /// Variable width, in the range -1..=255. + pub width: RawWidth, + + /// Variable name, padded on the right with spaces. + pub name: S, + + /// Print format. + pub print_format: Spec, + + /// Write format. + pub write_format: Spec, + + /// Missing values. + pub missing_values: MissingValues, + + /// Optional variable label. 
+ pub label: Option, +} + +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum RawWidth { + Continuation, + Numeric, + String(NonZeroU8), +} + +impl RawWidth { + pub fn n_values(&self) -> Option { + match self { + RawWidth::Numeric => Some(1), + RawWidth::String(width) => Some((width.get() as usize).div_ceil(8)), + _ => None, + } + } +} + +impl TryFrom for RawWidth { + type Error = (); + + fn try_from(value: i32) -> Result { + match value { + -1 => Ok(Self::Continuation), + 0 => Ok(Self::Numeric), + 1..=255 => Ok(Self::String(NonZeroU8::new(value as u8).unwrap())), + _ => Err(()), + } + } +} + +impl Display for RawWidth { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self { + RawWidth::Continuation => write!(f, "long string continuation"), + RawWidth::Numeric => write!(f, "numeric"), + RawWidth::String(width) => write!(f, "{width}-byte string"), + } + } +} + +impl Debug for VariableRecord +where + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!(f, "Width: {}", self.width,)?; + writeln!(f, "Print format: {:?}", self.print_format)?; + writeln!(f, "Write format: {:?}", self.write_format)?; + writeln!(f, "Name: {:?}", &self.name)?; + writeln!(f, "Variable label: {:?}", self.label)?; + writeln!(f, "Missing values: {:?}", self.missing_values) + } +} + +impl VariableRecord { + fn read( + r: &mut R, + endian: Endian, + warn: &dyn Fn(Warning), + ) -> Result { + let start_offset = r.stream_position()?; + let width: i32 = endian.parse(read_bytes(r)?); + let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth { + start_offset, + width, + })?; + let code_offset = r.stream_position()?; + let has_variable_label: u32 = endian.parse(read_bytes(r)?); + let missing_value_code: i32 = endian.parse(read_bytes(r)?); + let print_format = Spec(endian.parse(read_bytes(r)?)); + let write_format = Spec(endian.parse(read_bytes(r)?)); + let name = RawString(read_vec(r, 8)?); + + let label = match has_variable_label { + 0 => None, + 1 => { + let len: u32 = endian.parse(read_bytes(r)?); + let read_len = len.min(65535) as usize; + let label = RawString(read_vec(r, read_len)?); + + let padding_bytes = Integer::next_multiple_of(&len, &4) - len; + let _ = read_vec(r, padding_bytes as usize)?; + + Some(label) + } + _ => { + return Err(Error::BadVariableLabelCode { + start_offset, + code_offset, + code: has_variable_label, + }) + } + }; + + let missing_values = + MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?; + + let end_offset = r.stream_position()?; + + Ok(Record::Variable(VariableRecord { + offsets: start_offset..end_offset, + width, + name, + print_format, + write_format, + missing_values, + label, + })) + } + + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + DecodedRecord::Variable(VariableRecord { + offsets: self.offsets.clone(), + width: self.width, + name: decoder.decode(&self.name).to_string(), + print_format: self.print_format, + write_format: self.write_format, + missing_values: self.missing_values, + label: self + .label + .as_ref() + .map(|label| decoder.decode(label).to_string()), + }) + } +} + +#[derive(Copy, Clone)] +pub struct UntypedValue(pub [u8; 8]); + +impl Debug for UntypedValue { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + let little: f64 = Endian::Little.parse(self.0); + let little = format!("{:?}", little); + let big: f64 = Endian::Big.parse(self.0); + let big = format!("{:?}", big); + let number = if little.len() <= big.len() { + little + } else { + big + }; + write!(f, "{number}")?; + + let 
string = default_decode(&self.0); + let string = string + .split(|c: char| c == '\0' || c.is_control()) + .next() + .unwrap(); + write!(f, "{string:?}")?; + Ok(()) + } +} + +#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)] +pub struct RawString(pub Vec); + +impl RawString { + pub fn spaces(n: usize) -> Self { + Self(std::iter::repeat_n(b' ', n).collect()) + } + pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { + EncodedStr::new(&self.0, encoding) + } +} + +impl From> for RawString { + fn from(value: Cow<'_, [u8]>) -> Self { + Self(value.into_owned()) + } +} + +impl From> for RawString { + fn from(source: Vec) -> Self { + Self(source) + } +} + +impl From<&[u8]> for RawString { + fn from(source: &[u8]) -> Self { + Self(source.into()) + } +} + +impl Debug for RawString { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", default_decode(self.0.as_slice())) + } +} + +#[derive(Copy, Clone)] +pub struct RawStrArray(pub [u8; N]); + +impl From<[u8; N]> for RawStrArray { + fn from(source: [u8; N]) -> Self { + Self(source) + } +} + +impl Debug for RawStrArray { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", default_decode(&self.0)) + } +} + +#[derive(Clone, Debug)] +pub enum EncodedString { + Encoded { + bytes: Vec, + encoding: &'static Encoding, + }, + Utf8 { + s: String, + }, +} + +impl EncodedString { + pub fn borrowed(&self) -> EncodedStr<'_> { + match self { + EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding }, + EncodedString::Utf8 { s } => EncodedStr::Utf8 { s }, + } + } +} + +impl<'a> From> for EncodedString { + fn from(value: EncodedStr<'a>) -> Self { + match value { + EncodedStr::Encoded { bytes, encoding } => Self::Encoded { + bytes: bytes.into(), + encoding, + }, + EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, + } + } +} + +pub enum EncodedStr<'a> { + Encoded { + bytes: &'a [u8], + encoding: &'static Encoding, + }, + Utf8 { + s: &'a str, + }, +} + +impl<'a> EncodedStr<'a> { + pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { + Self::Encoded { bytes, encoding } + } + pub fn as_str(&self) -> Cow<'_, str> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + encoding.decode_without_bom_handling(bytes).0 + } + EncodedStr::Utf8 { s } => Cow::from(*s), + } + } + pub fn as_bytes(&self) -> &[u8] { + match self { + EncodedStr::Encoded { bytes, .. } => bytes, + EncodedStr::Utf8 { s } => s.as_bytes(), + } + } + pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + let utf8 = encoding.decode_without_bom_handling(bytes).0; + match encoding.encode(&utf8).0 { + Cow::Borrowed(_) => { + // Recoding into UTF-8 and then back did not change anything. + Cow::from(*bytes) + } + Cow::Owned(owned) => Cow::Owned(owned), + } + } + EncodedStr::Utf8 { s } => encoding.encode(s).0, + } + } + pub fn is_empty(&self) -> bool { + match self { + EncodedStr::Encoded { bytes, .. 
} => bytes.is_empty(), + EncodedStr::Utf8 { s } => s.is_empty(), + } + } + pub fn quoted(&self) -> QuotedEncodedStr { + QuotedEncodedStr(self) + } +} + +impl<'a> From<&'a str> for EncodedStr<'a> { + fn from(s: &'a str) -> Self { + Self::Utf8 { s } + } +} + +impl<'a> From<&'a String> for EncodedStr<'a> { + fn from(s: &'a String) -> Self { + Self::Utf8 { s: s.as_str() } + } +} + +pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>); + +impl Display for QuotedEncodedStr<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.0.as_str()) + } +} + +#[derive(Clone, Debug)] +pub struct ValueLabel +where + V: Debug, + S: Debug, +{ + pub value: Value, + pub label: S, +} + +#[derive(Clone)] +pub struct ValueLabelRecord +where + V: Debug, + S: Debug, +{ + /// Range of offsets in file. + pub offsets: Range, + + /// The labels. + pub labels: Vec>, + + /// The 1-based indexes of the variable indexes. + pub dict_indexes: Vec, + + /// The types of the variables. + pub var_type: VarType, +} + +impl Debug for ValueLabelRecord +where + V: Debug, + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + writeln!(f, "labels: ")?; + for label in self.labels.iter() { + writeln!(f, "{label:?}")?; + } + write!(f, "apply to {} variables", self.var_type)?; + for dict_index in self.dict_indexes.iter() { + write!(f, " #{dict_index}")?; + } + Ok(()) + } +} + +impl ValueLabelRecord +where + V: Debug, + S: Debug, +{ + /// Maximum number of value labels in a record. + pub const MAX_LABELS: u32 = u32::MAX / 8; + + /// Maximum number of variable indexes in a record. + pub const MAX_INDEXES: u32 = u32::MAX / 8; +} + +impl ValueLabelRecord, RawString> { + fn read( + r: &mut R, + endian: Endian, + var_types: &VarTypes, + warn: &dyn Fn(Warning), + ) -> Result, Error> { + let label_offset = r.stream_position()?; + let n: u32 = endian.parse(read_bytes(r)?); + if n > Self::MAX_LABELS { + return Err(Error::BadNumberOfValueLabels { + offset: label_offset, + n, + max: Self::MAX_LABELS, + }); + } + + let mut labels = Vec::new(); + for _ in 0..n { + let value = UntypedValue(read_bytes(r)?); + let label_len: u8 = endian.parse(read_bytes(r)?); + let label_len = label_len as usize; + let padded_len = Integer::next_multiple_of(&(label_len + 1), &8); + + let mut label = read_vec(r, padded_len - 1)?; + label.truncate(label_len); + labels.push((value, RawString(label))); + } + + let index_offset = r.stream_position()?; + let rec_type: u32 = endian.parse(read_bytes(r)?); + if rec_type != 4 { + return Err(Error::ExpectedVarIndexRecord { + offset: index_offset, + rec_type, + }); + } + + let n: u32 = endian.parse(read_bytes(r)?); + if n > Self::MAX_INDEXES { + return Err(Error::TooManyVarIndexes { + offset: index_offset, + n, + max: Self::MAX_INDEXES, + }); + } else if n == 0 { + warn(Warning::NoVarIndexes { + offset: index_offset, + }); + return Ok(None); + } + + let index_offset = r.stream_position()?; + let mut dict_indexes = Vec::with_capacity(n as usize); + let mut invalid_indexes = Vec::new(); + for _ in 0..n { + let index: u32 = endian.parse(read_bytes(r)?); + if var_types.is_valid_index(index as usize) { + dict_indexes.push(index); + } else { + invalid_indexes.push(index); + } + } + if !invalid_indexes.is_empty() { + warn(Warning::InvalidVarIndexes { + offset: index_offset, + max: var_types.n_values(), + invalid: invalid_indexes, + }); + } + + let Some(&first_index) = dict_indexes.first() else { + return Ok(None); + }; + let var_type = var_types.types[first_index as usize - 
1].unwrap(); + let mut wrong_type_indexes = Vec::new(); + dict_indexes.retain(|&index| { + if var_types.types[index as usize - 1] != Some(var_type) { + wrong_type_indexes.push(index); + false + } else { + true + } + }); + if !wrong_type_indexes.is_empty() { + warn(Warning::MixedVarTypes { + offset: index_offset, + var_type, + wrong_types: wrong_type_indexes, + }); + } + + let labels = labels + .into_iter() + .map(|(value, label)| ValueLabel { + value: Value::from_raw(&value, var_type, endian), + label, + }) + .collect(); + + let end_offset = r.stream_position()?; + Ok(Some(Record::ValueLabel(ValueLabelRecord { + offsets: label_offset..end_offset, + labels, + dict_indexes, + var_type, + }))) + } + + fn decode(self, decoder: &Decoder) -> ValueLabelRecord, String> { + let labels = self + .labels + .iter() + .map(|ValueLabel { value, label }| ValueLabel { + value: value.clone(), + label: decoder.decode(label).to_string(), + }) + .collect(); + ValueLabelRecord { + offsets: self.offsets.clone(), + labels, + dict_indexes: self.dict_indexes.clone(), + var_type: self.var_type, + } + } +} + +#[derive(Clone, Debug)] +pub struct DocumentRecord +where + S: Debug, +{ + pub offsets: Range, + + /// The document, as an array of lines. Raw lines are exactly 80 bytes long + /// and are right-padded with spaces without any new-line termination. + pub lines: Vec, +} + +pub type RawDocumentLine = RawStrArray; + +/// Length of a line in a document. Document lines are fixed-length and +/// padded on the right with spaces. +pub const DOC_LINE_LEN: usize = 80; + +impl DocumentRecord { + /// Maximum number of lines we will accept in a document. This is simply + /// the maximum number that will fit in a 32-bit space. + pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN; + + fn read(r: &mut R, endian: Endian) -> Result { + let start_offset = r.stream_position()?; + let n: u32 = endian.parse(read_bytes(r)?); + let n = n as usize; + if n > Self::MAX_LINES { + Err(Error::BadDocumentLength { + offset: start_offset, + n, + max: Self::MAX_LINES, + }) + } else { + let mut lines = Vec::with_capacity(n); + for _ in 0..n { + lines.push(RawStrArray(read_bytes(r)?)); + } + let end_offset = r.stream_position()?; + Ok(Record::Document(DocumentRecord { + offsets: start_offset..end_offset, + lines, + })) + } + } + + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + DecodedRecord::Document(DocumentRecord { + offsets: self.offsets.clone(), + lines: self + .lines + .iter() + .map(|s| decoder.decode_slice(&s.0).to_string()) + .collect(), + }) + } +} + +trait ExtensionRecord { + const SUBTYPE: u32; + const SIZE: Option; + const COUNT: Option; + const NAME: &'static str; + fn parse(ext: &Extension, endian: Endian) -> Result; +} + +#[derive(Clone, Debug)] +pub struct IntegerInfoRecord { + pub offsets: Range, + pub version: (i32, i32, i32), + pub machine_code: i32, + pub floating_point_rep: i32, + pub compression_code: i32, + pub endianness: i32, + pub character_code: i32, +} + +impl ExtensionRecord for IntegerInfoRecord { + const SUBTYPE: u32 = 3; + const SIZE: Option = Some(4); + const COUNT: Option = Some(8); + const NAME: &'static str = "integer record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let data: Vec = (0..8) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(Record::IntegerInfo(IntegerInfoRecord { + offsets: ext.offsets.clone(), + version: (data[0], data[1], data[2]), + machine_code: data[3], + 
floating_point_rep: data[4], + compression_code: data[5], + endianness: data[6], + character_code: data[7], + })) + } +} + +#[derive(Clone, Debug)] +pub struct FloatInfoRecord { + pub sysmis: f64, + pub highest: f64, + pub lowest: f64, +} + +impl ExtensionRecord for FloatInfoRecord { + const SUBTYPE: u32 = 4; + const SIZE: Option = Some(8); + const COUNT: Option = Some(3); + const NAME: &'static str = "floating point record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let data: Vec = (0..3) + .map(|_| endian.parse(read_bytes(&mut input).unwrap())) + .collect(); + Ok(Record::FloatInfo(FloatInfoRecord { + sysmis: data[0], + highest: data[1], + lowest: data[2], + })) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CategoryLabels { + VarLabels, + CountedValues, +} + +#[derive(Clone, Debug)] +pub enum MultipleResponseType { + MultipleDichotomy { + value: RawString, + labels: CategoryLabels, + }, + MultipleCategory, +} + +impl MultipleResponseType { + fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> { + let (mr_type, input) = match input.split_first() { + Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input), + Some((b'D', input)) => { + let (value, input) = parse_counted_string(input)?; + ( + MultipleResponseType::MultipleDichotomy { + value, + labels: CategoryLabels::VarLabels, + }, + input, + ) + } + Some((b'E', input)) => { + let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") { + (CategoryLabels::CountedValues, rest) + } else if let Some(rest) = input.strip_prefix(b" 11 ") { + (CategoryLabels::VarLabels, rest) + } else { + return Err(Warning::TBD); + }; + let (value, input) = parse_counted_string(input)?; + ( + MultipleResponseType::MultipleDichotomy { value, labels }, + input, + ) + } + _ => return Err(Warning::TBD), + }; + Ok((mr_type, input)) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet +where + I: Debug, + S: Debug, +{ + pub name: I, + pub label: S, + pub mr_type: MultipleResponseType, + pub short_names: Vec, +} + +impl MultipleResponseSet { + fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> { + let Some(equals) = input.iter().position(|&b| b == b'=') else { + return Err(Warning::TBD); + }; + let (name, input) = input.split_at(equals); + let (mr_type, input) = MultipleResponseType::parse(input)?; + let Some(input) = input.strip_prefix(b" ") else { + return Err(Warning::TBD); + }; + let (label, mut input) = parse_counted_string(input)?; + let mut vars = Vec::new(); + while input.first() != Some(&b'\n') { + match input.split_first() { + Some((b' ', rest)) => { + let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else { + return Err(Warning::TBD); + }; + let (var, rest) = rest.split_at(length); + if !var.is_empty() { + vars.push(var.into()); + } + input = rest; + } + _ => return Err(Warning::TBD), + } + } + while input.first() == Some(&b'\n') { + input = &input[1..]; + } + Ok(( + MultipleResponseSet { + name: name.into(), + label, + mr_type, + short_names: vars, + }, + input, + )) + } + + fn decode( + &self, + decoder: &Decoder, + ) -> Result, Warning> { + let mut short_names = Vec::with_capacity(self.short_names.len()); + for short_name in self.short_names.iter() { + if let Some(short_name) = decoder + .decode_identifier(short_name) + .map_err(Warning::InvalidMrSetName) + .issue_warning(&decoder.warn) + { + short_names.push(short_name); + } + } + Ok(MultipleResponseSet { + 
name: decoder + .decode_identifier(&self.name) + .map_err(Warning::InvalidMrSetVariableName)?, + label: decoder.decode(&self.label).to_string(), + mr_type: self.mr_type.clone(), + short_names, + }) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseRecord(pub Vec>) +where + I: Debug, + S: Debug; + +impl ExtensionRecord for MultipleResponseRecord { + const SUBTYPE: u32 = 7; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "multiple response set record"; + + fn parse(ext: &Extension, _endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut sets = Vec::new(); + while !input.is_empty() { + let (set, rest) = MultipleResponseSet::parse(input)?; + sets.push(set); + input = rest; + } + Ok(Record::MultipleResponse(MultipleResponseRecord(sets))) + } +} + +impl MultipleResponseRecord { + fn decode(self, decoder: &Decoder) -> DecodedRecord { + let mut sets = Vec::new(); + for set in self.0.iter() { + if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) { + sets.push(set); + } + } + DecodedRecord::MultipleResponse(MultipleResponseRecord(sets)) + } +} + +fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> { + let Some(space) = input.iter().position(|&b| b == b' ') else { + return Err(Warning::TBD); + }; + let Ok(length) = from_utf8(&input[..space]) else { + return Err(Warning::TBD); + }; + let Ok(length): Result = length.parse() else { + return Err(Warning::TBD); + }; + + let input = &input[space + 1..]; + if input.len() < length { + return Err(Warning::TBD); + }; + + let (string, rest) = input.split_at(length); + Ok((string.into(), rest)) +} + +/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement). +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Measure { + /// Nominal values can only be compared for equality. + Nominal, + + /// Ordinal values can be meaningfully ordered. + Ordinal, + + /// Scale values can be meaningfully compared for the degree of difference. 
+ Scale, +} + +impl Measure { + pub fn default_for_type(var_type: VarType) -> Option { + match var_type { + VarType::Numeric => None, + VarType::String => Some(Self::Nominal), + } + } + + fn try_decode(source: u32) -> Result, Warning> { + match source { + 0 => Ok(None), + 1 => Ok(Some(Measure::Nominal)), + 2 => Ok(Some(Measure::Ordinal)), + 3 => Ok(Some(Measure::Scale)), + _ => Err(Warning::InvalidMeasurement(source)), + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Alignment { + Left, + Right, + Center, +} + +impl Alignment { + fn try_decode(source: u32) -> Result, Warning> { + match source { + 0 => Ok(None), + 1 => Ok(Some(Alignment::Left)), + 2 => Ok(Some(Alignment::Right)), + 3 => Ok(Some(Alignment::Center)), + _ => Err(Warning::InvalidAlignment(source)), + } + } + + pub fn default_for_type(var_type: VarType) -> Self { + match var_type { + VarType::Numeric => Self::Right, + VarType::String => Self::Left, + } + } +} + +#[derive(Clone, Debug)] +pub struct VarDisplay { + pub measure: Option, + pub width: Option, + pub alignment: Option, +} + +#[derive(Clone, Debug)] +pub struct VarDisplayRecord(pub Vec); + +impl VarDisplayRecord { + const SUBTYPE: u32 = 11; + + fn parse( + ext: &Extension, + n_vars: usize, + endian: Endian, + warn: &dyn Fn(Warning), + ) -> Result { + if ext.size != 4 { + return Err(Warning::BadRecordSize { + offset: ext.offsets.start, + record: String::from("variable display record"), + size: ext.size, + expected_size: 4, + }); + } + + let has_width = if ext.count as usize == 3 * n_vars { + true + } else if ext.count as usize == 2 * n_vars { + false + } else { + return Err(Warning::TBD); + }; + + let mut var_displays = Vec::new(); + let mut input = &ext.data[..]; + for _ in 0..n_vars { + let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap())) + .issue_warning(&warn) + .flatten(); + let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap())); + let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap())) + .issue_warning(&warn) + .flatten(); + var_displays.push(VarDisplay { + measure, + width, + alignment, + }); + } + Ok(Record::VarDisplay(VarDisplayRecord(var_displays))) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValues +where + N: Debug, +{ + /// Variable name. + pub var_name: N, + + /// Missing values. 
+ pub missing_values: Vec>, +} + +impl LongStringMissingValues { + fn decode(&self, decoder: &Decoder) -> Result, IdError> { + Ok(LongStringMissingValues { + var_name: decoder.decode_identifier(&self.var_name)?, + missing_values: self.missing_values.clone(), + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringMissingValueRecord(pub Vec>) +where + N: Debug; + +impl ExtensionRecord for LongStringMissingValueRecord { + const SUBTYPE: u32 = 22; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string missing values record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut missing_value_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?); + let value_len: u32 = endian.parse(read_bytes(&mut input)?); + if value_len != 8 { + let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start; + return Err(Warning::BadLongMissingValueLength { + record_offset: ext.offsets.start, + offset, + value_len, + }); + } + let mut missing_values = Vec::new(); + for i in 0..n_missing_values { + let value: [u8; 8] = read_bytes(&mut input)?; + let numeric_value: u64 = endian.parse(value); + let value = if i > 0 && numeric_value == 8 { + // Tolerate files written by old, buggy versions of PSPP + // where we believed that the value_length was repeated + // before each missing value. + read_bytes(&mut input)? + } else { + value + }; + missing_values.push(RawStrArray(value)); + } + missing_value_set.push(LongStringMissingValues { + var_name, + missing_values, + }); + } + Ok(Record::LongStringMissingValues( + LongStringMissingValueRecord(missing_value_set), + )) + } +} + +impl LongStringMissingValueRecord { + pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord { + let mut mvs = Vec::with_capacity(self.0.len()); + for mv in self.0.iter() { + if let Some(mv) = mv + .decode(decoder) + .map_err(Warning::InvalidLongStringMissingValueVariableName) + .issue_warning(&decoder.warn) + { + mvs.push(mv); + } + } + LongStringMissingValueRecord(mvs) + } +} + +#[derive(Clone, Debug)] +pub struct EncodingRecord(pub String); + +impl ExtensionRecord for EncodingRecord { + const SUBTYPE: u32 = 20; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "encoding record"; + + fn parse(ext: &Extension, _endian: Endian) -> Result { + ext.check_size::()?; + + Ok(Record::Encoding(EncodingRecord( + String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName { + offset: ext.offsets.start, + })?, + ))) + } +} + +#[derive(Clone, Debug)] +pub struct NumberOfCasesRecord { + /// Always observed as 1. + pub one: u64, + + /// Number of cases. + pub n_cases: u64, +} + +impl ExtensionRecord for NumberOfCasesRecord { + const SUBTYPE: u32 = 16; + const SIZE: Option = Some(8); + const COUNT: Option = Some(2); + const NAME: &'static str = "extended number of cases record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let one = endian.parse(read_bytes(&mut input)?); + let n_cases = endian.parse(read_bytes(&mut input)?); + + Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases })) + } +} + +#[derive(Clone, Debug)] +pub struct TextRecord { + pub offsets: Range, + + /// Type of record. + pub rec_type: TextRecordType, + + /// The text content of the record. 
+ pub text: RawString, +} + +#[derive(Clone, Copy, Debug)] +pub enum TextRecordType { + VariableSets, + ProductInfo, + LongNames, + VeryLongStrings, + FileAttributes, + VariableAttributes, +} + +impl TextRecord { + fn new(extension: Extension, rec_type: TextRecordType) -> Self { + Self { + offsets: extension.offsets, + rec_type, + text: extension.data.into(), + } + } + pub fn decode(self, decoder: &Decoder) -> DecodedRecord { + match self.rec_type { + TextRecordType::VariableSets => { + DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder)) + } + TextRecordType::ProductInfo => { + DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder)) + } + TextRecordType::LongNames => { + DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder)) + } + TextRecordType::VeryLongStrings => { + DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder)) + } + TextRecordType::FileAttributes => { + DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder)) + } + TextRecordType::VariableAttributes => { + DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder)) + } + } + } +} + +#[derive(Clone, Debug)] +pub struct VeryLongString { + pub short_name: Identifier, + pub length: u16, +} + +impl VeryLongString { + fn parse(decoder: &Decoder, input: &str) -> Result { + let Some((short_name, length)) = input.split_once('=') else { + return Err(Warning::TBD); + }; + let short_name = decoder + .new_identifier(short_name) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidLongStringName)?; + let length = length.parse().map_err(|_| Warning::TBD)?; + Ok(VeryLongString { short_name, length }) + } +} + +#[derive(Clone, Debug)] +pub struct VeryLongStringsRecord(pub Vec); + +impl VeryLongStringsRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + let input = decoder.decode(&source.text); + let mut very_long_strings = Vec::new(); + for tuple in input + .split('\0') + .map(|s| s.trim_end_matches('\t')) + .filter(|s| !s.is_empty()) + { + if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) { + very_long_strings.push(vls) + } + } + VeryLongStringsRecord(very_long_strings) + } +} + +#[derive(Clone, Debug)] +pub struct Attribute { + pub name: Identifier, + pub values: Vec, +} + +impl Attribute { + fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> { + let Some((name, mut input)) = input.split_once('(') else { + return Err(Warning::TBD); + }; + let name = decoder + .new_identifier(name) + .map_err(Warning::InvalidAttributeName)?; + let mut values = Vec::new(); + loop { + let Some((value, rest)) = input.split_once('\n') else { + return Err(Warning::TBD); + }; + if let Some(stripped) = value + .strip_prefix('\'') + .and_then(|value| value.strip_suffix('\'')) + { + values.push(stripped.into()); + } else { + decoder.warn(Warning::TBD); + values.push(value.into()); + } + if let Some(rest) = rest.strip_prefix(')') { + let attribute = Attribute { name, values }; + return Ok((attribute, rest)); + }; + input = rest; + } + } +} + +impl Attributes { + fn parse<'a>( + decoder: &Decoder, + mut input: &'a str, + sentinel: Option, + ) -> Result<(Attributes, &'a str), Warning> { + let mut attributes = HashMap::new(); + let rest = loop { + match input.chars().next() { + None => break input, + c if c == sentinel => break &input[1..], + _ => { + let (attribute, rest) = Attribute::parse(decoder, input)?; + // XXX report duplicate name + 
attributes.insert(attribute.name, attribute.values); + input = rest; + } + } + }; + Ok((Attributes(attributes), rest)) + } +} + +#[derive(Clone, Debug, Default)] +pub struct FileAttributeRecord(pub Attributes); + +impl FileAttributeRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + let input = decoder.decode(&source.text); + match Attributes::parse(decoder, &input, None).issue_warning(&decoder.warn) { + Some((set, rest)) => { + if !rest.is_empty() { + decoder.warn(Warning::TBD); + } + FileAttributeRecord(set) + } + None => FileAttributeRecord::default(), + } + } +} + +#[derive(Clone, Debug)] +pub struct VarAttributes { + pub long_var_name: Identifier, + pub attributes: Attributes, +} + +impl VarAttributes { + fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributes, &'a str), Warning> { + let Some((long_var_name, rest)) = input.split_once(':') else { + return Err(Warning::TBD); + }; + let long_var_name = decoder + .new_identifier(long_var_name) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidAttributeVariableName)?; + let (attributes, rest) = Attributes::parse(decoder, rest, Some('/'))?; + let var_attribute = VarAttributes { + long_var_name, + attributes, + }; + Ok((var_attribute, rest)) + } +} + +#[derive(Clone, Debug)] +pub struct VariableAttributeRecord(pub Vec); + +impl VariableAttributeRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + let decoded = decoder.decode(&source.text); + let mut input = decoded.as_ref(); + let mut var_attribute_sets = Vec::new(); + while !input.is_empty() { + let Some((var_attribute, rest)) = + VarAttributes::parse(decoder, input).issue_warning(&decoder.warn) + else { + break; + }; + var_attribute_sets.push(var_attribute); + input = rest; + } + VariableAttributeRecord(var_attribute_sets) + } +} + +#[derive(Clone, Debug)] +pub struct LongName { + pub short_name: Identifier, + pub long_name: Identifier, +} + +impl LongName { + fn parse(input: &str, decoder: &Decoder) -> Result { + let Some((short_name, long_name)) = input.split_once('=') else { + return Err(Warning::TBD); + }; + let short_name = decoder + .new_identifier(short_name) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidShortName)?; + let long_name = decoder + .new_identifier(long_name) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidLongName)?; + Ok(LongName { + short_name, + long_name, + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongNamesRecord(pub Vec); + +impl LongNamesRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + let input = decoder.decode(&source.text); + let mut names = Vec::new(); + for pair in input.split('\t').filter(|s| !s.is_empty()) { + if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) { + names.push(long_name); + } + } + LongNamesRecord(names) + } +} + +#[derive(Clone, Debug)] +pub struct ProductInfoRecord(pub String); + +impl ProductInfoRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { + Self(decoder.decode(&source.text).into()) + } +} +#[derive(Clone, Debug)] +pub struct VariableSet { + pub name: Identifier, + pub variable_names: Vec, +} + +impl VariableSet { + fn parse(input: &str, decoder: &Decoder) -> Result { + let (name, input) = input.split_once('=').ok_or(Warning::TBD)?; + let name = decoder.new_identifier(name).map_err(|_| Warning::TBD)?; + let mut vars = Vec::new(); + for var in input.split_ascii_whitespace() { + if let Some(identifier) = decoder + 
.new_identifier(var) + .and_then(Identifier::must_be_ordinary) + .map_err(Warning::InvalidVariableSetName) + .issue_warning(&decoder.warn) + { + vars.push(identifier); + } + } + Ok(VariableSet { + name, + variable_names: vars, + }) + } +} + +#[derive(Clone, Debug)] +pub struct VariableSetRecord { + pub offsets: Range, + pub sets: Vec, +} + +impl VariableSetRecord { + fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord { + let mut sets = Vec::new(); + let input = decoder.decode(&source.text); + for line in input.lines() { + if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) { + sets.push(set) + } + } + VariableSetRecord { + offsets: source.offsets.clone(), + sets, + } + } +} + +trait IssueWarning { + fn issue_warning(self, warn: &F) -> Option + where + F: Fn(Warning); +} +impl IssueWarning for Result { + fn issue_warning(self, warn: &F) -> Option + where + F: Fn(Warning), + { + match self { + Ok(result) => Some(result), + Err(error) => { + warn(error); + None + } + } + } +} + +#[derive(Clone, Debug)] +pub struct Extension { + pub offsets: Range, + + /// Record subtype. + pub subtype: u32, + + /// Size of each data element. + pub size: u32, + + /// Number of data elements. + pub count: u32, + + /// `size * count` bytes of data. + pub data: Vec, +} + +impl Extension { + fn check_size(&self) -> Result<(), Warning> { + if let Some(expected_size) = E::SIZE { + if self.size != expected_size { + return Err(Warning::BadRecordSize { + offset: self.offsets.start, + record: E::NAME.into(), + size: self.size, + expected_size, + }); + } + } + if let Some(expected_count) = E::COUNT { + if self.count != expected_count { + return Err(Warning::BadRecordCount { + offset: self.offsets.start, + record: E::NAME.into(), + count: self.count, + expected_count, + }); + } + } + Ok(()) + } + + fn read( + r: &mut R, + endian: Endian, + n_vars: usize, + warn: &dyn Fn(Warning), + ) -> Result, Error> { + let subtype = endian.parse(read_bytes(r)?); + let header_offset = r.stream_position()?; + let size: u32 = endian.parse(read_bytes(r)?); + let count = endian.parse(read_bytes(r)?); + let Some(product) = size.checked_mul(count) else { + return Err(Error::ExtensionRecordTooLarge { + offset: header_offset, + subtype, + size, + count, + }); + }; + let start_offset = r.stream_position()?; + let data = read_vec(r, product as usize)?; + let end_offset = start_offset + product as u64; + let extension = Extension { + offsets: start_offset..end_offset, + subtype, + size, + count, + data, + }; + let result = match subtype { + IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian), + FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian), + VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn), + MultipleResponseRecord::SUBTYPE | 19 => { + MultipleResponseRecord::parse(&extension, endian) + } + LongStringValueLabelRecord::SUBTYPE => { + LongStringValueLabelRecord::parse(&extension, endian) + } + EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian), + NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian), + 5 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::VariableSets, + ))), + 10 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::ProductInfo, + ))), + 13 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::LongNames, + ))), + 14 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::VeryLongStrings, + ))), + 17 => 
Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::FileAttributes, + ))), + 18 => Ok(Record::Text(TextRecord::new( + extension, + TextRecordType::VariableAttributes, + ))), + _ => Ok(Record::OtherExtension(extension)), + }; + match result { + Ok(result) => Ok(Some(result)), + Err(error) => { + warn(error); + Ok(None) + } + } + } +} + +#[derive(Clone, Debug)] +pub struct ZHeader { + /// File offset to the start of the record. + pub offset: u64, + + /// File offset to the ZLIB data header. + pub zheader_offset: u64, + + /// File offset to the ZLIB trailer. + pub ztrailer_offset: u64, + + /// Length of the ZLIB trailer in bytes. + pub ztrailer_len: u64, +} + +impl ZHeader { + fn read(r: &mut R, endian: Endian) -> Result { + let offset = r.stream_position()?; + let zheader_offset: u64 = endian.parse(read_bytes(r)?); + let ztrailer_offset: u64 = endian.parse(read_bytes(r)?); + let ztrailer_len: u64 = endian.parse(read_bytes(r)?); + + Ok(ZHeader { + offset, + zheader_offset, + ztrailer_offset, + ztrailer_len, + }) + } +} + +#[derive(Clone, Debug)] +pub struct ZTrailer { + /// File offset to the start of the record. + pub offset: u64, + + /// Compression bias as a negative integer, e.g. -100. + pub int_bias: i64, + + /// Always observed as zero. + pub zero: u64, + + /// Uncompressed size of each block, except possibly the last. Only + /// `0x3ff000` has been observed so far. + pub block_size: u32, + + /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them. + pub blocks: Vec, +} + +#[derive(Clone, Debug)] +pub struct ZBlock { + /// Offset of block of data if simple compression were used. + pub uncompressed_ofs: u64, + + /// Actual offset within the file of the compressed data block. + pub compressed_ofs: u64, + + /// The number of bytes in this data block after decompression. This is + /// `block_size` in every data block but the last, which may be smaller. + pub uncompressed_size: u32, + + /// The number of bytes in this data block, as stored compressed in this + /// file. 
+ pub compressed_size: u32, +} + +impl ZBlock { + fn read(r: &mut R, endian: Endian) -> Result { + Ok(ZBlock { + uncompressed_ofs: endian.parse(read_bytes(r)?), + compressed_ofs: endian.parse(read_bytes(r)?), + uncompressed_size: endian.parse(read_bytes(r)?), + compressed_size: endian.parse(read_bytes(r)?), + }) + } +} + +impl ZTrailer { + fn read( + reader: &mut R, + endian: Endian, + ztrailer_ofs: u64, + ztrailer_len: u64, + ) -> Result, Error> { + let start_offset = reader.stream_position()?; + if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() { + return Ok(None); + } + let int_bias = endian.parse(read_bytes(reader)?); + let zero = endian.parse(read_bytes(reader)?); + let block_size = endian.parse(read_bytes(reader)?); + let n_blocks: u32 = endian.parse(read_bytes(reader)?); + let expected_n_blocks = (ztrailer_len - 24) / 24; + if n_blocks as u64 != expected_n_blocks { + return Err(Error::BadZlibTrailerNBlocks { + offset: ztrailer_ofs, + n_blocks, + expected_n_blocks, + ztrailer_len, + }); + } + let blocks = (0..n_blocks) + .map(|_| ZBlock::read(reader, endian)) + .collect::, _>>()?; + reader.seek(SeekFrom::Start(start_offset))?; + Ok(Some(ZTrailer { + offset: ztrailer_ofs, + int_bias, + zero, + block_size, + blocks, + })) + } +} + +fn try_read_bytes(r: &mut R) -> Result, IoError> { + let mut buf = [0; N]; + let n = r.read(&mut buf)?; + if n > 0 { + if n < N { + r.read_exact(&mut buf[n..])?; + } + Ok(Some(buf)) + } else { + Ok(None) + } +} + +fn read_bytes(r: &mut R) -> Result<[u8; N], IoError> { + let mut buf = [0; N]; + r.read_exact(&mut buf)?; + Ok(buf) +} + +fn read_vec(r: &mut R, n: usize) -> Result, IoError> { + let mut vec = vec![0; n]; + r.read_exact(&mut vec)?; + Ok(vec) +} + +fn read_string(r: &mut R, endian: Endian) -> Result { + let length: u32 = endian.parse(read_bytes(r)?); + Ok(read_vec(r, length as usize)?.into()) +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabels +where + S: Debug, +{ + pub var_name: N, + pub width: u32, + + /// `(value, label)` pairs, where each value is `width` bytes. 
+ pub labels: Vec<(RawString, S)>, +} + +impl LongStringValueLabels { + fn decode( + &self, + decoder: &Decoder, + ) -> Result, Warning> { + let var_name = decoder.decode(&self.var_name); + let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding) + .map_err(Warning::InvalidLongStringValueLabelName)?; + + let mut labels = Vec::with_capacity(self.labels.len()); + for (value, label) in self.labels.iter() { + let label = decoder.decode(label).to_string(); + labels.push((value.clone(), label)); + } + + Ok(LongStringValueLabels { + var_name, + width: self.width, + labels, + }) + } +} + +#[derive(Clone, Debug)] +pub struct LongStringValueLabelRecord(pub Vec>) +where + N: Debug, + S: Debug; + +impl ExtensionRecord for LongStringValueLabelRecord { + const SUBTYPE: u32 = 21; + const SIZE: Option = Some(1); + const COUNT: Option = None; + const NAME: &'static str = "long string value labels record"; + + fn parse(ext: &Extension, endian: Endian) -> Result { + ext.check_size::()?; + + let mut input = &ext.data[..]; + let mut label_set = Vec::new(); + while !input.is_empty() { + let var_name = read_string(&mut input, endian)?; + let width: u32 = endian.parse(read_bytes(&mut input)?); + let n_labels: u32 = endian.parse(read_bytes(&mut input)?); + let mut labels = Vec::new(); + for _ in 0..n_labels { + let value = read_string(&mut input, endian)?; + let label = read_string(&mut input, endian)?; + labels.push((value, label)); + } + label_set.push(LongStringValueLabels { + var_name, + width, + labels, + }) + } + Ok(Record::LongStringValueLabels(LongStringValueLabelRecord( + label_set, + ))) + } +} + +impl LongStringValueLabelRecord { + fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord { + let mut labels = Vec::with_capacity(self.0.len()); + for label in &self.0 { + match label.decode(decoder) { + Ok(set) => labels.push(set), + Err(error) => decoder.warn(error), + } + } + LongStringValueLabelRecord(labels) + } +} + +#[derive(Default)] +pub struct VarTypes { + pub types: Vec>, +} + +impl VarTypes { + pub fn new() -> Self { + Self::default() + } + + pub fn push(&mut self, width: RawWidth) { + if let Ok(var_type) = VarType::try_from(width) { + self.types.push(Some(var_type)); + for _ in 1..width.n_values().unwrap() { + self.types.push(None); + } + } + } + + pub fn n_values(&self) -> usize { + self.types.len() + } + + pub fn is_valid_index(&self, index: usize) -> bool { + self.var_type_at(index).is_some() + } + + pub fn var_type_at(&self, index: usize) -> Option { + if index >= 1 && index <= self.types.len() { + self.types[index - 1] + } else { + None + } + } + + pub fn iter(&self) -> impl Iterator + use<'_> { + self.types + .iter() + .map(|var_type| var_type.unwrap_or(VarType::String)) + } +} diff --git a/rust/pspp/src/sys/sack.rs b/rust/pspp/src/sys/sack.rs new file mode 100644 index 0000000000..103a9be847 --- /dev/null +++ b/rust/pspp/src/sys/sack.rs @@ -0,0 +1,633 @@ +use float_next_after::NextAfter; +use num::{Bounded, Zero}; +use ordered_float::OrderedFloat; +use std::{ + collections::{hash_map::Entry, HashMap}, + error::Error as StdError, + fmt::{Display, Formatter, Result as FmtResult}, + iter::repeat_n, +}; + +use crate::endian::{Endian, ToBytes}; + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct Error { + pub file_name: Option, + pub line_number: Option, + pub token: Option, + pub message: String, +} + +impl Error { + fn new( + file_name: Option<&str>, + line_number: Option, + token: Option<&str>, + message: String, + ) -> Error { + Error { 
+ file_name: file_name.map(String::from), + line_number, + token: token.map(String::from), + message, + } + } +} + +impl StdError for Error {} + +impl Display for Error { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match (self.file_name.as_ref(), self.line_number) { + (Some(ref file_name), Some(line_number)) => write!(f, "{file_name}:{line_number}: ")?, + (Some(ref file_name), None) => write!(f, "{file_name}: ")?, + (None, Some(line_number)) => write!(f, "line {line_number}: ")?, + (None, None) => (), + } + if let Some(ref token) = self.token { + write!(f, "at '{token}': ")?; + } + write!(f, "{}", self.message) + } +} + +pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result> { + let mut symbol_table = HashMap::new(); + let output = _sack(input, input_file_name, endian, &mut symbol_table)?; + let output = if !symbol_table.is_empty() { + for (k, v) in symbol_table.iter() { + println!("{k} => {v:?}"); + } + for (k, v) in symbol_table.iter() { + if v.is_none() { + Err(Error::new( + input_file_name, + None, + None, + format!("label {k} used but never defined"), + ))? + } + } + _sack(input, input_file_name, endian, &mut symbol_table)? + } else { + output + }; + Ok(output) +} + +fn _sack( + input: &str, + input_file_name: Option<&str>, + endian: Endian, + symbol_table: &mut HashMap>, +) -> Result> { + let mut lexer = Lexer::new(input, input_file_name, endian)?; + let mut output = Vec::new(); + while parse_data_item(&mut lexer, &mut output, symbol_table)? {} + Ok(output) +} + +fn parse_data_item( + lexer: &mut Lexer, + output: &mut Vec, + symbol_table: &mut HashMap>, +) -> Result { + if lexer.token.is_none() { + return Ok(false); + }; + + let initial_len = output.len(); + match lexer.take()? { + Token::Integer(integer) => { + if let Ok(integer) = TryInto::::try_into(integer) { + output.extend_from_slice(&lexer.endian.to_bytes(integer)); + } else if let Ok(integer) = TryInto::::try_into(integer) { + output.extend_from_slice(&lexer.endian.to_bytes(integer)); + } else { + Err(lexer.error(format!( + "{integer} is not in the valid range [{},{}]", + i32::MIN, + u32::MAX + )))?; + }; + } + Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)), + Token::PcSysmis => { + output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff]) + } + Token::I8 => put_integers::(lexer, "i8", output)?, + Token::I16 => put_integers::(lexer, "i16", output)?, + Token::I64 => put_integers::(lexer, "i64", output)?, + Token::String(string) => output.extend_from_slice(string.as_bytes()), + Token::S(size) => { + let Some((Token::String(ref string), _)) = lexer.token else { + Err(lexer.error(format!("string expected after 's{size}'")))? + }; + let len = string.len(); + if len > size { + Err(lexer.error(format!( + "{len}-byte string is longer than pad length {size}" + )))? + } + output.extend_from_slice(string.as_bytes()); + output.extend(repeat_n(b' ', size - len)); + lexer.get()?; + } + Token::LParen => { + while !matches!(lexer.token, Some((Token::RParen, _))) { + parse_data_item(lexer, output, symbol_table)?; + } + lexer.get()?; + } + Token::Count => put_counted_items::(lexer, "COUNT", output, symbol_table)?, + Token::Count8 => put_counted_items::(lexer, "COUNT8", output, symbol_table)?, + Token::Hex => { + let Some((Token::String(ref string), _)) = lexer.token else { + Err(lexer.error(String::from("string expected after 'hex'")))? 
+ }; + let mut string = &string[..]; + loop { + string = string.trim_start(); + if string.is_empty() { + break; + }; + + let mut i = string.chars(); + let Some(c0) = i.next() else { return Ok(true) }; + let Some(c1) = i.next() else { + Err(lexer.error(String::from("hex string has odd number of characters")))? + }; + + let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else { + Err(lexer.error(String::from("invalid digit in hex string")))? + }; + let byte = digit0 * 16 + digit1; + output.push(byte as u8); + + string = i.as_str(); + } + lexer.get()?; + } + Token::Label(name) => { + println!("define {name}"); + let value = output.len() as u32; + match symbol_table.entry(name.clone()) { + Entry::Vacant(v) => { + v.insert(Some(value)); + } + Entry::Occupied(mut o) => { + match o.get() { + Some(v) => { + if *v != value { + Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))? + } + } + None => drop(o.insert(Some(value))), + } + } + }; + return Ok(true); + } + Token::At(name) => { + let mut value = *symbol_table.entry(name.clone()).or_insert(None); + loop { + let plus = match lexer.token { + Some((Token::Plus, _)) => true, + Some((Token::Minus, _)) => false, + _ => break, + }; + lexer.get()?; + + let operand = match lexer.token { + Some((Token::At(ref name), _)) => { + *symbol_table.entry(name.clone()).or_insert(None) + } + Some((Token::Integer(integer), _)) => Some( + integer + .try_into() + .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?, + ), + _ => Err(lexer.error(String::from("expecting @label or integer literal")))?, + }; + lexer.get()?; + + value = match (value, operand) { + (Some(a), Some(b)) => Some( + if plus { + a.checked_add(b) + } else { + a.checked_sub(b) + } + .ok_or_else(|| { + lexer.error(String::from("overflow in offset arithmetic")) + })?, + ), + _ => None, + }; + } + let value = value.unwrap_or(0); + output.extend_from_slice(&lexer.endian.to_bytes(value)); + } + _ => (), + }; + if let Some((Token::Asterisk, _)) = lexer.token { + lexer.get()?; + let Token::Integer(count) = lexer.take()? else { + Err(lexer.error(String::from("positive integer expected after '*'")))? + }; + if count < 1 { + Err(lexer.error(String::from("positive integer expected after '*'")))? + }; + let final_len = output.len(); + for _ in 1..count { + output.extend_from_within(initial_len..final_len); + } + } + match lexer.token { + Some((Token::Semicolon, _)) => { + lexer.get()?; + } + Some((Token::RParen, _)) => (), + _ => Err(lexer.error(String::from("';' expected")))?, + } + Ok(true) +} + +fn put_counted_items( + lexer: &mut Lexer, + name: &str, + output: &mut Vec, + symbol_table: &mut HashMap>, +) -> Result<()> +where + T: Zero + TryFrom, + Endian: ToBytes, +{ + let old_size = output.len(); + output.extend_from_slice(&lexer.endian.to_bytes(T::zero())); + let start = output.len(); + if !matches!(lexer.token, Some((Token::LParen, _))) { + Err(lexer.error(format!("'(' expected after '{name}'")))? + } + lexer.get()?; + while !matches!(lexer.token, Some((Token::RParen, _))) { + parse_data_item(lexer, output, symbol_table)?; + } + lexer.get()?; + let delta = output.len() - start; + let Ok(delta): Result = delta.try_into() else { + Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))? 
+ }; + let dest = &mut output[old_size..old_size + N]; + dest.copy_from_slice(&lexer.endian.to_bytes(delta)); + Ok(()) +} + +fn put_integers( + lexer: &mut Lexer, + name: &str, + output: &mut Vec, +) -> Result<()> +where + T: Bounded + Display + TryFrom + Copy, + Endian: ToBytes, +{ + println!("put_integers {:?}", lexer.token); + let mut n = 0; + while let Some(integer) = lexer.take_if(|t| match t { + Token::Integer(integer) => Some(*integer), + _ => None, + })? { + println!("got integer {integer}"); + let Ok(integer) = integer.try_into() else { + Err(lexer.error(format!( + "{integer} is not in the valid range [{},{}]", + T::min_value(), + T::max_value() + )))? + }; + output.extend_from_slice(&lexer.endian.to_bytes(integer)); + n += 1; + } + println!("put_integers {:?} {n}", lexer.token); + if n == 0 { + Err(lexer.error(format!("integer expected after '{name}'")))? + } + Ok(()) +} + +#[derive(PartialEq, Eq, Clone, Debug)] +enum Token { + Integer(i64), + Float(OrderedFloat), + PcSysmis, + String(String), + Semicolon, + Asterisk, + LParen, + RParen, + I8, + I16, + I64, + S(usize), + Count, + Count8, + Hex, + Label(String), + At(String), + Minus, + Plus, +} + +struct Lexer<'a> { + input: &'a str, + token: Option<(Token, &'a str)>, + input_file_name: Option<&'a str>, + line_number: usize, + endian: Endian, +} + +fn skip_comments(mut s: &str) -> (&str, usize) { + let mut n_newlines = 0; + let s = loop { + s = s.trim_start_matches([' ', '\t', '\r', '<', '>']); + if let Some(remainder) = s.strip_prefix('#') { + let Some((_, remainder)) = remainder.split_once('\n') else { + break ""; + }; + s = remainder; + n_newlines += 1; + } else if let Some(remainder) = s.strip_prefix('\n') { + s = remainder; + n_newlines += 1; + } else { + break s; + } + }; + (s, n_newlines) +} + +impl<'a> Lexer<'a> { + fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result> { + let mut lexer = Lexer { + input, + token: None, + input_file_name, + line_number: 1, + endian, + }; + lexer.token = lexer.next()?; + Ok(lexer) + } + fn error(&self, message: String) -> Error { + let repr = self.token.as_ref().map(|(_, repr)| *repr); + Error::new(self.input_file_name, Some(self.line_number), repr, message) + } + fn take(&mut self) -> Result { + let Some(token) = self.token.take() else { + Err(self.error(String::from("unexpected end of input")))? + }; + self.token = self.next()?; + Ok(token.0) + } + fn take_if(&mut self, condition: F) -> Result> + where + F: FnOnce(&Token) -> Option, + { + let Some(ref token) = self.token else { + return Ok(None); + }; + match condition(&token.0) { + Some(value) => { + self.token = self.next()?; + Ok(Some(value)) + } + None => Ok(None), + } + } + fn get(&mut self) -> Result> { + if self.token.is_none() { + Err(self.error(String::from("unexpected end of input")))? + } else { + self.token = self.next()?; + match self.token { + Some((ref token, _)) => Ok(Some(token)), + None => Ok(None), + } + } + } + + fn next(&mut self) -> Result> { + // Get the first character of the token, skipping past white space and + // comments. + let (s, n_newlines) = skip_comments(self.input); + self.line_number += n_newlines; + self.input = s; + + let start = s; + let mut iter = s.chars(); + let Some(c) = iter.next() else { + return Ok(None); + }; + let (token, rest) = match c { + c if c.is_ascii_digit() || c == '-' => { + let len = s + .find(|c: char| { + !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' 
|| c == '-') + }) + .unwrap_or(s.len()); + let (number, rest) = s.split_at(len); + let token = if number == "-" { + Token::Minus + } else if let Some(digits) = number.strip_prefix("0x") { + Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| { + self.error(format!("bad integer literal '{number}' ({msg})")) + })?) + } else if !number.contains('.') { + Token::Integer(number.parse().map_err(|msg| { + self.error(format!("bad integer literal '{number}' ({msg})")) + })?) + } else { + Token::Float(number.parse().map_err(|msg| { + self.error(format!("bad float literal '{number}' ({msg})")) + })?) + }; + (token, rest) + } + '"' => { + let s = iter.as_str(); + let Some(len) = s.find(['\n', '"']) else { + Err(self.error(String::from("end-of-file inside string")))? + }; + let (string, rest) = s.split_at(len); + let Some(rest) = rest.strip_prefix('"') else { + Err(self.error(format!("new-line inside string ({string}...{rest})")))? + }; + (Token::String(string.into()), rest) + } + ';' => (Token::Semicolon, iter.as_str()), + '*' => (Token::Asterisk, iter.as_str()), + '+' => (Token::Plus, iter.as_str()), + '(' => (Token::LParen, iter.as_str()), + ')' => (Token::RParen, iter.as_str()), + c if c.is_alphabetic() || c == '@' || c == '_' => { + let len = s + .find(|c: char| { + !(c.is_ascii_digit() + || c.is_alphabetic() + || c == '@' + || c == '.' + || c == '_') + }) + .unwrap_or(s.len()); + let (s, rest) = s.split_at(len); + if let Some(rest) = rest.strip_prefix(':') { + (Token::Label(s.into()), rest) + } else if let Some(name) = s.strip_prefix('@') { + (Token::At(name.into()), rest) + } else if let Some(count) = s.strip_prefix('s') { + let token = + Token::S(count.parse().map_err(|msg| { + self.error(format!("bad counted string '{s}' ({msg})")) + })?); + (token, rest) + } else { + let token = match s { + "i8" => Token::I8, + "i16" => Token::I16, + "i64" => Token::I64, + "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)), + "PCSYSMIS" => Token::PcSysmis, + "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()), + "HIGHEST" => Token::Float(f64::MAX.into()), + "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }), + "COUNT" => Token::Count, + "COUNT8" => Token::Count8, + "hex" => Token::Hex, + _ => Err(self.error(format!("invalid token '{s}'")))?, + }; + (token, rest) + } + } + _ => Err(self.error(format!("invalid input byte '{c}'")))?, + }; + self.input = rest; + let repr = &start[..start.len() - rest.len()]; + println!("{token:?} {repr}"); + Ok(Some((token, repr))) + } +} + +#[cfg(test)] +mod test { + use crate::endian::Endian; + use crate::sys::sack::sack; + use anyhow::Result; + use hexplay::HexView; + + #[test] + fn basic_sack() -> Result<()> { + let input = r#" +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; # Layout code +28; # Nominal case size +0; # Not compressed +0; # Not weighted +1; # 1 case. +100.0; # Bias. +"01 Jan 11"; "20:53:52"; +"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; +i8 0 *3; +"#; + let output = sack(input, None, Endian::Big)?; + HexView::new(&output).print()?; + Ok(()) + } + + #[test] + fn pcp_sack() -> Result<()> { + let input = r#" +# File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +@LABELS; @LABELS_END - @LABELS; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; # Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; # Fixed. + i16 0; + i16 15; + 1; + i16 0; # Fixed. 
+  1;
+  s8 "11/28/14";
+  s8 "15:11:00";
+  s64 "PSPP synthetic test file";
+MAIN_END:
+
+VARS:
+  0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS;
+  0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS;
+  0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS;
+
+  # Numeric variable, no label or missing values.
+  0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS;
+
+  # Numeric variable, variable label.
+  0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS;
+
+  # Numeric variable with missing value.
+  0; 0; 0; 0x050800; s8 "NUM3"; 1.0;
+
+  # Numeric variable, variable label and missing value.
+  0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0;
+
+  # String variable, no label or missing values.
+  0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS;
+
+  # String variable, variable label.
+  0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS;
+
+  # String variable with missing value.
+  0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS";
+
+  # String variable, variable label and missing value.
+  0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR";
+
+  # Long string variable
+  0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS;
+  0 * 8;
+
+  # Long string variable with variable label
+  0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS;
+  0 * 8;
+VARS_END:
+
+LABELS:
+  3; i8 0 0 0; LABELS_OFS: i8 0;
+  NUM2_LABEL: COUNT8("Numeric variable 2's label");
+  NUM4_LABEL: COUNT8("Another numeric variable label");
+  STR2_LABEL: COUNT8("STR2's variable label");
+  STR4_LABEL: COUNT8("STR4's variable label");
+  STR6_LABEL: COUNT8("Another string variable's label");
+LABELS_END:
+
+DATA:
+  0.0; "11/28/14"; 1.0;
+  0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r";
+  s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM";
+DATA_END:
+"#;
+        let output = sack(input, None, Endian::Big)?;
+        HexView::new(&output).print()?;
+        Ok(())
+    }
+}
diff --git a/rust/pspp/tests/sack.rs b/rust/pspp/tests/sack.rs
index 49b10e77ac..5be80ea45f 100644
--- a/rust/pspp/tests/sack.rs
+++ b/rust/pspp/tests/sack.rs
@@ -4,7 +4,7 @@ use std::path::PathBuf;
 use anyhow::{anyhow, Result};
 use clap::Parser;
 use pspp::endian::Endian;
-use pspp::sack::sack;
+use pspp::sys::sack::sack;
 
 /// SAv Construction Kit
 ///
diff --git a/src/language/lexer/macro.c b/src/language/lexer/macro.c
index d7bc1b611f..9536727a0f 100644
--- a/src/language/lexer/macro.c
+++ b/src/language/lexer/macro.c
@@ -357,10 +357,19 @@ classify_token (enum token_type type)
   NOT_REACHED ();
 }
 
-/* Appends syntax for the tokens in MTS to S. */
+/* Appends syntax for the tokens in MTS to S.  If OFS and LEN are nonnull, sets
+   OFS[i] to the offset within S of the start of token 'i' in MTS and LEN[i] to
+   its length.  OFS[i] + LEN[i] is not necessarily OFS[i + 1] because some
+   tokens are separated by white space. */
 void
-macro_tokens_to_syntax (struct macro_tokens *mts, struct string *s)
+macro_tokens_to_syntax (struct macro_tokens *mts, struct string *s,
+                        size_t *ofs, size_t *len)
 {
+  assert ((ofs != NULL) == (len != NULL));
+
+  if (!mts->n)
+    return;
+
   for (size_t i = 0; i < mts->n; i++)
     {
       if (i > 0)
@@ -379,7 +388,11 @@ macro_tokens_to_syntax (struct macro_tokens *mts, struct string *s)
             }
         }
 
+      if (ofs)
+        ofs[i] = s->ss.length;
       macro_token_to_syntax (&mts->mts[i], s);
+      if (len)
+        len[i] = s->ss.length - ofs[i];
     }
 }
 
@@ -925,7 +938,7 @@ parse_function_arg (const struct macro_expander *me,
   if (param)
     {
       size_t param_idx = param - me->macro->params;
-      macro_tokens_to_syntax (me->args[param_idx], farg);
+      macro_tokens_to_syntax (me->args[param_idx], farg, NULL, NULL);
       return 1;
     }
 
@@ -937,7 +950,7 @@ parse_function_arg (const struct macro_expander *me,
            break;
          if (i)
            ds_put_byte (farg, ' ');
-         macro_tokens_to_syntax (me->args[i], farg);
+         macro_tokens_to_syntax (me->args[i], farg, NULL, NULL);
        }
       return 1;
     }
@@ -1254,7 +1267,7 @@ expand_macro_function (const struct macro_expander *me,
       if (mts.n > 1)
        {
          struct macro_tokens tail = { .mts = mts.mts + 1, .n = mts.n - 1 };
-         macro_tokens_to_syntax (&tail, output);
+         macro_tokens_to_syntax (&tail, output, NULL, NULL);
        }
       macro_tokens_uninit (&mts);
       ds_destroy (&tmp);
@@ -1293,7 +1306,7 @@ expand_macro_function (const struct macro_expander *me,
 
       subme.stack = &stack;
       macro_expand (mts.mts, mts.n, &subme, &exp);
-      macro_tokens_to_syntax (&exp, output);
+      macro_tokens_to_syntax (&exp, output, NULL, NULL);
       macro_tokens_uninit (&exp);
       macro_tokens_uninit (&mts);
     }
diff --git a/src/language/lexer/scan.c b/src/language/lexer/scan.c
index e0ab8f872e..e4fe405d47 100644
--- a/src/language/lexer/scan.c
+++ b/src/language/lexer/scan.c
@@ -222,6 +222,12 @@ scan_punct2__ (char c0, char c1)
 
     case '~':
       return T_NE;
+
+    case '&':
+      return T_AND;
+
+    case '|':
+      return T_OR;
     }
 
   NOT_REACHED ();
diff --git a/src/output/cairo-pager.c b/src/output/cairo-pager.c
index a52d0b095a..1dbffd8270 100644
--- a/src/output/cairo-pager.c
+++ b/src/output/cairo-pager.c
@@ -246,7 +246,7 @@ void
 xr_pager_destroy (struct xr_pager *p)
 {
   if (p)
-    {x
+    {
      free (p->nodes);
 
      xr_page_style_unref (p->page_style);
diff --git a/src/output/table-provider.h b/src/output/table-provider.h
index ecccbffe4e..a3d1fab78f 100644
--- a/src/output/table-provider.h
+++ b/src/output/table-provider.h
@@ -47,6 +47,7 @@ struct table_cell
     unsigned char options;      /* TABLE_CELL_*. */
 
     const struct pivot_value *value;
+    const struct font_style *font_style;
     const struct cell_style *cell_style;
   };
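
The new OFS/LEN arguments to macro_tokens_to_syntax () let a caller recover, for
each macro token, the byte range it occupies in the expanded syntax string.  The
sketch below shows one way such a caller might look.  It is illustrative only and
not part of this patch: it assumes the usual libpspp dynamic-string helpers
(DS_EMPTY_INITIALIZER, ds_cstr, ds_destroy) and gnulib's xnmalloc, and the
function name show_token_positions is made up for the example.

#include <stdio.h>
#include <stdlib.h>

#include "language/lexer/macro.h"
#include "libpspp/str.h"
#include "gl/xalloc.h"

/* Expands MTS into syntax and prints each token together with the offset and
   length that macro_tokens_to_syntax() reported for it.  Illustrative sketch. */
static void
show_token_positions (struct macro_tokens *mts)
{
  struct string s = DS_EMPTY_INITIALIZER;
  size_t *ofs = xnmalloc (mts->n, sizeof *ofs);
  size_t *len = xnmalloc (mts->n, sizeof *len);

  macro_tokens_to_syntax (mts, &s, ofs, len);
  for (size_t i = 0; i < mts->n; i++)
    printf ("token %zu: \"%.*s\" at offset %zu\n",
            i, (int) len[i], ds_cstr (&s) + ofs[i], ofs[i]);

  free (len);
  free (ofs);
  ds_destroy (&s);
}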