From: Ben Pfaff Date: Sun, 20 Aug 2023 20:57:25 +0000 (-0700) Subject: work X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c953078638501b58b8efbdfebb05559103a7a3b6;p=pspp work --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 93fe21f2bd..fc7c44f0cd 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -1,23 +1,23 @@ -use std::{borrow::Cow, collections::{HashSet, HashMap}, cmp::Ordering}; +use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat}; -use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; -use encoding_rs::Encoding; -use num::integer::div_ceil; use crate::{ - format::{Spec, UncheckedSpec}, + format::{Error as FormatError, Spec, UncheckedSpec}, identifier::{Error as IdError, Identifier}, raw::{self, MissingValues, VarType}, {endian::Endian, Compression}, }; +use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; +use encoding_rs::{DecoderResult, Encoding}; +use num::integer::div_ceil; use thiserror::Error as ThisError; #[derive(ThisError, Debug)] pub enum Error { #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] - BadVariableWidth { offset: u64, width: i32 }, + InvalidVariableWidth { offset: u64, width: i32 }, #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")] - BadLongMissingValueFormat, + InvalidLongMissingValueFormat, #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")] InvalidCreationDate { creation_date: String }, @@ -25,49 +25,88 @@ pub enum Error { #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")] InvalidCreationTime { creation_time: String }, - #[error("Invalid variable name: {0}")] - BadIdentifier(#[from] IdError), + #[error("{id_error} Renaming variable to {new_name}.")] + InvalidVariableName { + id_error: IdError, + new_name: Identifier, + }, + + #[error( + "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}" + )] + InvalidPrintFormat { + new_spec: Spec, + variable: Identifier, + format_error: FormatError, + }, + + #[error( + "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}" + )] + InvalidWriteFormat { + new_spec: Spec, + variable: Identifier, + format_error: FormatError, + }, + + #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")] + DuplicateVariableName { + duplicate_name: Identifier, + new_name: Identifier, + }, + + #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")] + InvalidDictIndex { dict_index: usize, max_index: usize }, + + #[error("Dictionary index {0} refers to a long string continuation.")] + DictIndexIsContinuation(usize), + + #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")] + ValueLabelsDifferentTypes { + numeric_var: Identifier, + string_var: Identifier, + }, + + #[error( + "Value labels may not be added to long string variable {0} using record types 3 or 4." + )] + InvalidLongStringValueLabel(Identifier), #[error("Details TBD")] TBD, } +type DictIndex = usize; + +pub struct Variable { + pub dict_index: DictIndex, + pub short_name: Identifier, + pub long_name: Option, + pub width: VarWidth, +} + pub struct Decoder { pub compression: Option, pub endian: Endian, pub encoding: &'static Encoding, - pub var_names: HashSet, - pub dict_indexes: HashMap, + pub variables: HashMap, + pub var_names: HashMap, n_dict_indexes: usize, n_generated_names: usize, } impl Decoder { - fn take_name(&mut self, id: &Identifier) -> bool { - self.var_names.insert(id.clone()) - } fn generate_name(&mut self) -> Identifier { loop { self.n_generated_names += 1; let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding) .unwrap(); - if self.take_name(&name) { + if !self.var_names.contains_key(&name) { return name; } assert!(self.n_generated_names < usize::MAX); } } - fn take_dict_indexes(&mut self, id: &Identifier, width: VarWidth) -> usize { - let n = match width { - VarWidth::Numeric => 1, - VarWidth::String(w) => div_ceil(w as usize, 8), - }; - let dict_index = self.n_dict_indexes; - self.dict_indexes.insert(self.n_dict_indexes, id.clone()); - self.n_dict_indexes += n; - dict_index - - } fn decode_string<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> { let (output, malformed) = self.encoding.decode_without_bom_handling(input); if malformed { @@ -75,6 +114,53 @@ impl Decoder { } output } + fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> { + let max_index = self.n_dict_indexes - 1; + if dict_index == 0 || dict_index as usize > max_index { + return Err(Error::InvalidDictIndex { + dict_index, + max_index, + }); + } + let Some(variable) = self.variables.get(&dict_index) else { + return Err(Error::DictIndexIsContinuation(dict_index)); + }; + Ok(variable) + } + + /// Returns `input` decoded from `self.encoding` into UTF-8 such that + /// re-encoding the result back into `self.encoding` will have exactly the + /// same length in bytes. + /// + /// XXX warn about errors? + fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + if let (s, false) = self.encoding.decode_without_bom_handling(input) { + // This is the common case. Usually there will be no errors. + s.into() + } else { + // Unusual case. Don't bother to optimize it much. + let mut decoder = self.encoding.new_decoder_without_bom_handling(); + let mut output = String::with_capacity( + decoder + .max_utf8_buffer_length_without_replacement(input.len()) + .unwrap(), + ); + let mut rest = input; + while !rest.is_empty() { + match decoder.decode_to_string_without_replacement(rest, &mut output, true) { + (DecoderResult::InputEmpty, _) => break, + (DecoderResult::OutputFull, _) => unreachable!(), + (DecoderResult::Malformed(a, b), consumed) => { + let skipped = a as usize + b as usize; + output.extend(repeat('?').take(skipped)); + rest = &rest[consumed..]; + } + } + } + assert_eq!(self.encoding.encode(&output).0.len(), input.len()); + output.into() + } + } } pub trait Decode: Sized { @@ -138,6 +224,15 @@ impl PartialOrd for VarWidth { } } +impl VarWidth { + fn n_dict_indexes(self) -> usize { + match self { + VarWidth::Numeric => 1, + VarWidth::String(w) => div_ceil(w as usize, 8), + } + } +} + impl From for VarType { fn from(source: VarWidth) -> Self { match source { @@ -147,7 +242,7 @@ impl From for VarType { } } -pub struct Variable { +pub struct VariableRecord { pub width: VarWidth, pub name: Identifier, pub print_format: Spec, @@ -156,28 +251,29 @@ pub struct Variable { pub label: Option, } -fn decode_format(raw: raw::Spec, name: &str, width: VarWidth) -> Spec { +fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Spec, FormatError)) -> Spec { UncheckedSpec::try_from(raw) .and_then(Spec::try_from) - .and_then(|x| x.check_width_compatibility(Some(name), width)) - .unwrap_or_else(|_warning| { - /*warn(warning);*/ - Spec::default_for_width(width) + .and_then(|x| x.check_width_compatibility(width)) + .unwrap_or_else(|error| { + let new_format = Spec::default_for_width(width); + warn(new_format, error); + new_format }) } -impl Variable { +impl VariableRecord { pub fn decode( decoder: &mut Decoder, input: &crate::raw::Variable, warn: impl Fn(Error), - ) -> Result, Error> { + ) -> Result, Error> { let width = match input.width { 0 => VarWidth::Numeric, w @ 1..=255 => VarWidth::String(w as u16), -1 => return Ok(None), _ => { - return Err(Error::BadVariableWidth { + return Err(Error::InvalidVariableWidth { offset: input.offset, width: input.width, }) @@ -186,25 +282,61 @@ impl Variable { let name = decoder.decode_string(&input.name.0, &warn); let name = match Identifier::new(&name, decoder.encoding) { Ok(name) => { - if !decoder.take_name(&name) { - decoder.generate_name() - } else { + if !decoder.var_names.contains_key(&name) { name + } else { + let new_name = decoder.generate_name(); + warn(Error::DuplicateVariableName { + duplicate_name: name.clone(), + new_name: new_name.clone(), + }); + new_name } } - Err(error) => { - warn(error.into()); - decoder.generate_name() + Err(id_error) => { + let new_name = decoder.generate_name(); + warn(Error::InvalidVariableName { + id_error, + new_name: new_name.clone(), + }); + new_name } }; - let print_format = decode_format(input.print_format, &name.0, width); - let write_format = decode_format(input.write_format, &name.0, width); + let variable = Variable { + dict_index: decoder.n_dict_indexes, + short_name: name.clone(), + long_name: None, + width, + }; + decoder.n_dict_indexes += width.n_dict_indexes(); + assert!(decoder + .var_names + .insert(name.clone(), variable.dict_index) + .is_none()); + assert!(decoder + .variables + .insert(variable.dict_index, variable) + .is_none()); + + let print_format = decode_format(input.print_format, width, |new_spec, format_error| { + warn(Error::InvalidPrintFormat { + new_spec, + variable: name.clone(), + format_error, + }) + }); + let write_format = decode_format(input.write_format, width, |new_spec, format_error| { + warn(Error::InvalidWriteFormat { + new_spec, + variable: name.clone(), + format_error, + }) + }); let label = input .label .as_ref() .map(|label| decoder.decode_string(&label.0, &warn).into()); - decoder.take_dict_indexes(&name, width); - Ok(Some(Variable { + Ok(Some(VariableRecord { width, name, print_format, @@ -259,11 +391,107 @@ impl VariableSet { } } -/* +#[derive(Clone)] +pub enum Value { + Number(Option), + String(String), +} + +impl Value { + pub fn decode(raw: raw::Value, decoder: &Decoder) -> Self { + match raw { + raw::Value::Number(x) => Value::Number(x), + raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), + } + } +} + pub struct ValueLabelRecord { - pub labels: Vec<( + pub var_type: VarType, + pub labels: Vec<(Value, String)>, + pub variables: Vec, +} + +trait WarnOnError { + fn warn_on_error(self, warn: &F) -> Option; +} +impl WarnOnError for Result { + fn warn_on_error(self, warn: &F) -> Option { + match self { + Ok(result) => Some(result), + Err(error) => { + warn(error); + None + } + } + } +} + +impl ValueLabelRecord { + pub fn decode( + decoder: &mut Decoder, + raw_value_label: &crate::raw::ValueLabel, + dict_indexes: &crate::raw::VarIndexes, + warn: impl Fn(Error), + ) -> Result, Error> { + let variables: Vec<&Variable> = dict_indexes + .dict_indexes + .iter() + .filter_map(|&dict_index| { + decoder + .get_var_by_index(dict_index as usize) + .warn_on_error(&warn) + }) + .filter(|&variable| match variable.width { + VarWidth::String(width) if width > 8 => { + warn(Error::InvalidLongStringValueLabel( + variable.short_name.clone(), + )); + false + } + _ => true, + }) + .collect(); + let mut i = variables.iter(); + let Some(&first_var) = i.next() else { + return Ok(None); + }; + let var_type: VarType = first_var.width.into(); + for &variable in i { + let this_type: VarType = variable.width.into(); + if var_type != this_type { + let (numeric_var, string_var) = match var_type { + VarType::Numeric => (first_var, variable), + VarType::String => (variable, first_var), + }; + warn(Error::ValueLabelsDifferentTypes { + numeric_var: numeric_var.short_name.clone(), + string_var: string_var.short_name.clone(), + }); + return Ok(None); + } + } + let labels = raw_value_label + .labels + .iter() + .map(|(value, label)| { + let label = decoder.decode_string(&label.0, &warn); + let value = Value::decode(raw::Value::from_raw(*value, var_type, decoder.endian), &decoder); + (value, label.into()) + }) + .collect(); + let variables = variables + .iter() + .map(|&variable| variable.short_name.clone()) + .collect(); + Ok(Some(ValueLabelRecord { + var_type, + labels, + variables, + })) + } } -*/ + pub struct VariableSetRecord(Vec); impl TextRecord for VariableSetRecord { @@ -271,9 +499,8 @@ impl TextRecord for VariableSetRecord { fn parse(input: &str, warn: impl Fn(Error)) -> Result { let mut sets = Vec::new(); for line in input.lines() { - match VariableSet::parse(line) { - Ok(set) => sets.push(set), - Err(error) => warn(error), + if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) { + sets.push(set) } } Ok(VariableSetRecord(sets)) @@ -344,9 +571,8 @@ impl TextRecord for VeryLongStringRecord { .map(|s| s.trim_end_matches('\t')) .filter(|s| !s.is_empty()) { - match VeryLongString::parse(tuple) { - Ok(vls) => very_long_strings.push(vls), - Err(error) => warn(error), + if let Some(vls) = VeryLongString::parse(tuple).warn_on_error(&warn) { + very_long_strings.push(vls) } } Ok(VeryLongStringRecord(very_long_strings)) @@ -459,16 +685,13 @@ impl TextRecord for VariableAttributeRecord { fn parse(mut input: &str, warn: impl Fn(Error)) -> Result { let mut var_attribute_sets = Vec::new(); while !input.is_empty() { - match VarAttributeSet::parse(input, &warn) { - Ok((var_attribute, rest)) => { - var_attribute_sets.push(var_attribute); - input = rest; - } - Err(error) => { - warn(error); - break; - } - } + let Some((var_attribute, rest)) = + VarAttributeSet::parse(input, &warn).warn_on_error(&warn) + else { + break; + }; + var_attribute_sets.push(var_attribute); + input = rest; } Ok(VariableAttributeRecord(var_attribute_sets)) } @@ -493,3 +716,28 @@ pub struct VarDisplay { } pub struct VarDisplayRecord(pub Vec); + +#[cfg(test)] +mod test { + use encoding_rs::WINDOWS_1252; + + #[test] + fn test() { + let mut s = String::new(); + s.push(char::REPLACEMENT_CHARACTER); + let encoded = WINDOWS_1252.encode(&s).0; + let decoded = WINDOWS_1252.decode(&encoded[..]).0; + println!("{:?}", decoded); + } + + #[test] + fn test2() { + let charset: Vec = (0..=255).collect(); + println!("{}", charset.len()); + let decoded = WINDOWS_1252.decode(&charset[..]).0; + println!("{}", decoded.len()); + let encoded = WINDOWS_1252.encode(&decoded[..]).0; + println!("{}", encoded.len()); + assert_eq!(&charset[..], &encoded[..]); + } +} diff --git a/rust/src/format.rs b/rust/src/format.rs index 0fc82a3987..34798ed65a 100644 --- a/rust/src/format.rs +++ b/rust/src/format.rs @@ -12,7 +12,7 @@ use crate::{ #[derive(ThisError, Debug)] pub enum Error { - #[error("Unknown format type {value}")] + #[error("Unknown format type {value}.")] UnknownFormat { value: u16 }, #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.format)] @@ -39,12 +39,6 @@ pub enum Error { #[error("Numeric variable is not compatible with string format {0}.")] UnnamedVariableNotCompatibleWithStringFormat(Format), - #[error("String variable {variable} is not compatible with numeric format {format}.")] - NamedVariableNotCompatibleWithNumericFormat { variable: String, format: Format }, - - #[error("Numeric variable {variable} is not compatible with string format {format}.")] - NamedVariableNotCompatibleWithStringFormat { variable: String, format: Format }, - #[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")] NamedStringVariableBadSpecWidth { variable: String, @@ -312,32 +306,14 @@ impl Format { /// Checks whether this format is valid for a variable with the given /// `var_type`. - pub fn check_type_compatibility( - self, - variable: Option<&str>, - var_type: VarType, - ) -> Result<(), Error> { + pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> { let my_type = self.var_type(); match (my_type, var_type) { (VarType::Numeric, VarType::String) => { - if let Some(variable) = variable { - Err(Error::NamedVariableNotCompatibleWithNumericFormat { - variable: variable.into(), - format: self, - }) - } else { - Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self)) - } + Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self)) } (VarType::String, VarType::Numeric) => { - if let Some(variable) = variable { - Err(Error::NamedVariableNotCompatibleWithStringFormat { - variable: variable.into(), - format: self, - }) - } else { - Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self)) - } + Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self)) } _ => Ok(()), } @@ -450,14 +426,10 @@ impl Spec { /// Checks whether this format specification is valid for a variable with /// width `var_width`. - pub fn check_width_compatibility( - self, - variable: Option<&str>, - var_width: VarWidth, - ) -> Result { + pub fn check_width_compatibility(self, var_width: VarWidth) -> Result { // Verify that the format is right for the variable's type. self.format - .check_type_compatibility(variable, var_width.into())?; + .check_type_compatibility(var_width.into())?; if let VarWidth::String(w) = var_width { if var_width != self.var_width() { @@ -467,20 +439,11 @@ impl Spec { } else { Spec { w: w * 2, ..self } }; - if let Some(variable) = variable { - return Err(Error::NamedStringVariableBadSpecWidth { - variable: variable.into(), - width: w, - bad_spec, - good_spec, - }); - } else { - return Err(Error::UnnamedStringVariableBadSpecWidth { - width: w, - bad_spec, - good_spec, - }); - } + return Err(Error::UnnamedStringVariableBadSpecWidth { + width: w, + bad_spec, + good_spec, + }); } } diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs index 0553b41b11..5b0649f533 100644 --- a/rust/src/identifier.rs +++ b/rust/src/identifier.rs @@ -1,3 +1,5 @@ +use std::fmt::{Display, Formatter, Result as FmtResult}; + use encoding_rs::{EncoderResult, Encoding}; use finl_unicode::categories::{CharacterCategories, MajorCategory}; use thiserror::Error as ThisError; @@ -28,9 +30,6 @@ impl IdentifierChar for char { } } -#[derive(Clone, PartialEq, Eq, Debug, Hash)] -pub struct Identifier(pub UniCase); - #[derive(Clone, Debug, ThisError)] pub enum Error { #[error("Identifier cannot be empty string.")] @@ -72,6 +71,9 @@ fn is_reserved_word(s: &str) -> bool { false } +#[derive(Clone, PartialEq, Eq, Debug, Hash)] +pub struct Identifier(pub UniCase); + impl Identifier { /// Maximum length of an identifier, in bytes. The limit applies in the /// encoding used by the dictionary, not in UTF-8. @@ -118,3 +120,9 @@ impl Identifier { Ok(()) } } + +impl Display for Identifier { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{}", self.0) + } +} diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 9fb92f52b0..8fa0f26bcf 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -183,7 +183,7 @@ pub struct Header { /// Compression type, if any, pub compression: Option, - /// 0-based variable index of the weight variable, or `None` if the file is + /// 1-based variable index of the weight variable, or `None` if the file is /// unweighted. pub weight_index: Option, @@ -256,7 +256,7 @@ impl Header { }; let weight_index: u32 = endian.parse(read_bytes(r)?); - let weight_index = (weight_index > 0).then(|| weight_index - 1); + let weight_index = (weight_index > 0).then_some(weight_index); let n_cases: u32 = endian.parse(read_bytes(r)?); let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases); @@ -500,14 +500,14 @@ impl Debug for Value { impl Value { fn read(r: &mut R, var_type: VarType, endian: Endian) -> Result { - Ok(Self::from_raw(var_type, read_bytes(r)?, endian)) + Ok(Self::from_raw(UntypedValue(read_bytes(r)?), var_type, endian)) } - pub fn from_raw(var_type: VarType, raw: [u8; 8], endian: Endian) -> Value { + pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value { match var_type { - VarType::String => Value::String(UnencodedStr(raw)), + VarType::String => Value::String(UnencodedStr(raw.0)), VarType::Numeric => { - let number: f64 = endian.parse(raw); + let number: f64 = endian.parse(raw.0); Value::Number((number != -f64::MAX).then_some(number)) } } @@ -533,7 +533,7 @@ impl Value { }); } }; - values.push(Value::from_raw(var_type, raw, endian)); + values.push(Value::from_raw(UntypedValue(raw), var_type, endian)); } Ok(Some(values)) } @@ -583,7 +583,7 @@ impl Value { }); } } - 253 => break Value::from_raw(var_type, read_bytes(reader)?, endian), + 253 => break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian), 254 => match var_type { VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC VarType::Numeric => { @@ -1023,15 +1023,15 @@ pub struct VarIndexes { /// Offset from the start of the file to the start of the record. pub offset: u64, - /// The 0-based indexes of the variable indexes. - pub var_indexes: Vec, + /// The 1-based indexes of the variable indexes. + pub dict_indexes: Vec, } impl Debug for VarIndexes { fn fmt(&self, f: &mut Formatter) -> FmtResult { write!(f, "apply to variables")?; - for var_index in self.var_indexes.iter() { - write!(f, " #{var_index}")?; + for dict_index in self.dict_indexes.iter() { + write!(f, " #{dict_index}")?; } Ok(()) } @@ -1051,14 +1051,14 @@ impl VarIndexes { max: VarIndexes::MAX, }); } - let mut var_indexes = Vec::with_capacity(n as usize); + let mut dict_indexes = Vec::with_capacity(n as usize); for _ in 0..n { - var_indexes.push(endian.parse(read_bytes(r)?)); + dict_indexes.push(endian.parse(read_bytes(r)?)); } Ok(VarIndexes { offset, - var_indexes, + dict_indexes, }) } }