From: Ben Pfaff Date: Mon, 29 Jan 2024 00:06:19 +0000 (-0800) Subject: decodedrecord works X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=510eb10a3e9573c41d14e7b02dd1f35f162f92c5;p=pspp decodedrecord works --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 56d6aa1c82..894204b6b3 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -623,77 +623,6 @@ impl TryDecode for HeaderRecord { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum VarWidth { - Numeric, - String(u16), -} - -impl PartialOrd for VarWidth { - fn partial_cmp(&self, other: &Self) -> Option { - match (self, other) { - (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal), - (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)), - _ => None, - } - } -} - -impl VarWidth { - const MAX_STRING: u16 = 32767; - - fn n_dict_indexes(self) -> usize { - match self { - VarWidth::Numeric => 1, - VarWidth::String(w) => div_ceil(w as usize, 8), - } - } - - fn width_predicate( - a: Option, - b: Option, - f: impl Fn(u16, u16) -> u16, - ) -> Option { - match (a, b) { - (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), - (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { - Some(VarWidth::String(f(a, b))) - } - _ => None, - } - } - - /// Returns the wider of `self` and `other`: - /// - Numerical variable widths are equally wide. - /// - Longer strings are wider than shorter strings. - /// - Numerical and string types are incomparable, so result in `None`. - /// - Any `None` in the input yields `None` in the output. - pub fn wider(a: Option, b: Option) -> Option { - Self::width_predicate(a, b, |a, b| a.max(b)) - } - - /// Returns the narrower of `self` and `other` (see [`Self::wider`]). - pub fn narrower(a: Option, b: Option) -> Option { - Self::width_predicate(a, b, |a, b| a.min(b)) - } - - pub fn default_display_width(&self) -> u32 { - match self { - VarWidth::Numeric => 8, - VarWidth::String(width) => *width.min(&32) as u32, - } - } -} - -impl From for VarType { - fn from(source: VarWidth) -> Self { - match source { - VarWidth::Numeric => VarType::Numeric, - VarWidth::String(_) => VarType::String, - } - } -} - #[derive(Clone, Debug)] pub struct VariableRecord { pub width: VarWidth, @@ -856,21 +785,6 @@ impl WarnOnError for Result { } } -#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum Value { - Number(Option>), - String(String), -} - -impl Value { - pub fn decode(raw: &raw::Value>, decoder: &Decoder) -> Self { - match raw { - raw::Value::Number(x) => Value::Number(x.map(|x| x.into())), - raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), - } - } -} - #[derive(Clone, Debug)] pub struct ValueLabel { pub value: Value, diff --git a/rust/src/dictionary.rs b/rust/src/dictionary.rs index 4a8e272116..6662aaa94e 100644 --- a/rust/src/dictionary.rs +++ b/rust/src/dictionary.rs @@ -1,21 +1,108 @@ use std::{ collections::{HashMap, HashSet}, fmt::Debug, - ops::{Bound, RangeBounds}, + ops::{Bound, RangeBounds}, cmp::Ordering, }; use encoding_rs::Encoding; use indexmap::IndexSet; +use num::integer::div_ceil; +use ordered_float::OrderedFloat; use crate::{ - cooked::{Value, VarWidth}, format::Spec, identifier::{ByIdentifier, HasIdentifier, Identifier}, - raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType}, + raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType, self, RawStr, Decoder}, }; pub type DictIndex = usize; +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum VarWidth { + Numeric, + String(u16), +} + +impl PartialOrd for VarWidth { + fn partial_cmp(&self, other: &Self) -> Option { + match (self, other) { + (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal), + (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)), + _ => None, + } + } +} + +impl VarWidth { + pub const MAX_STRING: u16 = 32767; + + fn n_dict_indexes(self) -> usize { + match self { + VarWidth::Numeric => 1, + VarWidth::String(w) => div_ceil(w as usize, 8), + } + } + + fn width_predicate( + a: Option, + b: Option, + f: impl Fn(u16, u16) -> u16, + ) -> Option { + match (a, b) { + (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), + (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { + Some(VarWidth::String(f(a, b))) + } + _ => None, + } + } + + /// Returns the wider of `self` and `other`: + /// - Numerical variable widths are equally wide. + /// - Longer strings are wider than shorter strings. + /// - Numerical and string types are incomparable, so result in `None`. + /// - Any `None` in the input yields `None` in the output. + pub fn wider(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.max(b)) + } + + /// Returns the narrower of `self` and `other` (see [`Self::wider`]). + pub fn narrower(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.min(b)) + } + + pub fn default_display_width(&self) -> u32 { + match self { + VarWidth::Numeric => 8, + VarWidth::String(width) => *width.min(&32) as u32, + } + } +} + +impl From for VarType { + fn from(source: VarWidth) -> Self { + match source { + VarWidth::Numeric => VarType::Numeric, + VarWidth::String(_) => VarType::String, + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Value { + Number(Option>), + String(String), +} + +impl Value { + pub fn decode(raw: &raw::Value>, decoder: &Decoder) -> Self { + match raw { + raw::Value::Number(x) => Value::Number(x.map(|x| x.into())), + raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), + } + } +} + #[derive(Clone, Debug)] pub struct Dictionary { pub variables: IndexSet>, diff --git a/rust/src/format.rs b/rust/src/format.rs index 9f285a9dd6..3612134a24 100644 --- a/rust/src/format.rs +++ b/rust/src/format.rs @@ -6,7 +6,7 @@ use std::{ use thiserror::Error as ThisError; use crate::{ - cooked::VarWidth, + dictionary::VarWidth, raw::{self, VarType}, }; diff --git a/rust/src/lib.rs b/rust/src/lib.rs index f8e880c14e..7f0ec2aadd 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,4 +1,4 @@ -pub mod cooked; +//pub mod cooked; pub mod dictionary; pub mod encoding; pub mod endian; diff --git a/rust/src/main.rs b/rust/src/main.rs index 473062183b..bf6abe9b8f 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -17,7 +17,6 @@ use anyhow::Result; use clap::{Parser, ValueEnum}; use encoding_rs::Encoding; -use pspp::cooked::{decode, encoding_from_headers}; use pspp::raw::{Reader, Record, Magic}; use std::fs::File; use std::io::BufReader; @@ -110,12 +109,14 @@ fn dissect(file_name: &Path, max_cases: u64, mode: Mode, encoding: Option<&'stat } } Mode::Cooked => { +/* let headers: Vec = reader.collect::, _>>()?; let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?; let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?; for header in headers { println!("{header:?}"); - } + } + */ } } diff --git a/rust/src/raw.rs b/rust/src/raw.rs index 65b0f9474c..e75e5a05e8 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,5 +1,5 @@ use crate::{ - cooked::VarWidth, + dictionary::VarWidth, endian::{Endian, Parse, ToBytes}, identifier::{Error as IdError, Identifier}, }; @@ -11,7 +11,7 @@ use std::{ borrow::Cow, cell::RefCell, cmp::Ordering, - collections::{VecDeque, HashMap}, + collections::{HashMap, VecDeque}, fmt::{Debug, Display, Formatter, Result as FmtResult}, io::{Error as IoError, Read, Seek, SeekFrom}, iter::repeat, @@ -189,6 +189,9 @@ pub enum Error { #[error("Invalid variable name in long string missing values record. {0}")] InvalidLongStringMissingValueVariableName(IdError), + #[error("Invalid variable name in long string value label record. {0}")] + InvalidLongStringValueLabelName(IdError), + #[error("Details TBD")] TBD, } @@ -201,24 +204,43 @@ pub enum Record { Document(DocumentRecord), IntegerInfo(IntegerInfoRecord), FloatInfo(FloatInfoRecord), - VariableSets(VariableSetRecord), VarDisplay(VarDisplayRecord), MultipleResponse(MultipleResponseRecord), - LongStringValueLabels(LongStringValueLabelRecord), + LongStringValueLabels(LongStringValueLabelRecord), LongStringMissingValues(LongStringMissingValueRecord>), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), + Text(TextRecord), + OtherExtension(Extension), + EndOfHeaders(u32), + ZHeader(ZHeader), + ZTrailer(ZTrailer), + Cases(Rc>), +} + +pub enum DecodedRecord<'a> { + Header(HeaderRecord>), + Variable(VariableRecord, String>), + ValueLabel(ValueLabelRecord, Cow<'a, str>>), + Document(DocumentRecord>), + IntegerInfo(IntegerInfoRecord), + FloatInfo(FloatInfoRecord), + VarDisplay(VarDisplayRecord), + MultipleResponse(MultipleResponseRecord>), + LongStringValueLabels(LongStringValueLabelRecord>), + LongStringMissingValues(LongStringMissingValueRecord), + Encoding(EncodingRecord), + NumberOfCases(NumberOfCasesRecord), + VariableSets(VariableSetRecord), ProductInfo(ProductInfoRecord), LongNames(LongNamesRecord), VeryLongStrings(VeryLongStringsRecord), FileAttributes(FileAttributeRecord), VariableAttributes(VariableAttributeRecord), - Text(TextRecord), OtherExtension(Extension), EndOfHeaders(u32), ZHeader(ZHeader), ZTrailer(ZTrailer), - Cases(Rc>), } impl Record { @@ -247,7 +269,32 @@ impl Record { } } - + fn decode<'a>(&'a self, decoder: &Decoder) -> Result, Error> { + Ok(match self { + Record::Header(record) => record.decode(decoder), + Record::Variable(record) => record.decode(decoder), + Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)), + Record::Document(record) => record.decode(decoder), + Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()), + Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()), + Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()), + Record::MultipleResponse(record) => record.decode(decoder), + Record::LongStringValueLabels(record) => { + DecodedRecord::LongStringValueLabels(record.decode(decoder)?) + } + Record::LongStringMissingValues(record) => { + DecodedRecord::LongStringMissingValues(record.decode(decoder)) + } + Record::Encoding(record) => DecodedRecord::Encoding(record.clone()), + Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()), + Record::Text(record) => record.decode(decoder)?, + Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()), + Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record.clone()), + Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()), + Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()), + Record::Cases(_) => todo!(), + }) + } } // If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it @@ -404,12 +451,12 @@ impl HeaderRecord { }) } - pub fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord> { + pub fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord<'a> { let eye_catcher = decoder.decode(&self.eye_catcher); let file_label = decoder.decode(&self.file_label); let creation_date = decoder.decode(&self.creation_date); let creation_time = decoder.decode(&self.creation_time); - HeaderRecord { + DecodedRecord::Header(HeaderRecord { eye_catcher, weight_index: self.weight_index, n_cases: self.n_cases, @@ -423,7 +470,7 @@ impl HeaderRecord { creation_date, creation_time, endian: self.endian, - } + }) } } @@ -456,7 +503,7 @@ impl Decoder { /// same length in bytes. /// /// XXX warn about errors? - fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { if let (s, false) = self.encoding.decode_without_bom_handling(input) { // This is the common case. Usually there will be no errors. s @@ -1235,8 +1282,8 @@ impl VariableRecord> { })) } - pub fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord, String> { - VariableRecord { + pub fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord { + DecodedRecord::Variable(VariableRecord { offsets: self.offsets.clone(), width: self.width, name: decoder.decode(&self.name), @@ -1244,7 +1291,7 @@ impl VariableRecord> { write_format: self.write_format, missing_values: self.missing_values.decode(decoder), label: self.label.as_ref().map(|label| decoder.decode(label)), - } + }) } } @@ -1485,6 +1532,23 @@ impl ValueLabelRecord, RawString> { var_type, }))) } + + fn decode<'a>(&'a self, decoder: &Decoder) -> ValueLabelRecord, Cow<'a, str>> { + let labels = self + .labels + .iter() + .map(|ValueLabel { value, label }| ValueLabel { + value: value.clone(), + label: decoder.decode(label), + }) + .collect(); + ValueLabelRecord { + offsets: self.offsets.clone(), + labels, + dict_indexes: self.dict_indexes.clone(), + var_type: self.var_type, + } + } } #[derive(Clone, Debug)] @@ -1532,15 +1596,15 @@ impl DocumentRecord { } } - pub fn decode<'a>(&'a self, decoder: &Decoder) -> DocumentRecord> { - DocumentRecord { + pub fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord { + DecodedRecord::Document(DocumentRecord { offsets: self.offsets.clone(), lines: self .lines .iter() .map(|s| decoder.decode_slice(&s.0)) .collect(), - } + }) } } @@ -1779,14 +1843,14 @@ impl ExtensionRecord for MultipleResponseRecord { } impl MultipleResponseRecord { - fn decode<'a>(&'a self, decoder: &Decoder) -> MultipleResponseRecord> { + fn decode<'a>(&'a self, decoder: &Decoder) -> DecodedRecord { let mut sets = Vec::new(); for set in self.0.iter() { if let Some(set) = set.decode(decoder).warn_on_error(&decoder.warn) { sets.push(set); } } - MultipleResponseRecord(sets) + DecodedRecord::MultipleResponse(MultipleResponseRecord(sets)) } } @@ -2095,26 +2159,26 @@ impl TextRecord { text: extension.data.into(), } } - pub fn decode<'a>(&self, decoder: &Decoder) -> Result, Error> { + pub fn decode<'a>(&self, decoder: &Decoder) -> Result { match self.rec_type { - TextRecordType::VariableSets => Ok(Some(Record::VariableSets( + TextRecordType::VariableSets => Ok(DecodedRecord::VariableSets( VariableSetRecord::decode(self, decoder), - ))), - TextRecordType::ProductInfo => Ok(Some(Record::ProductInfo( + )), + TextRecordType::ProductInfo => Ok(DecodedRecord::ProductInfo( ProductInfoRecord::decode(self, decoder), - ))), - TextRecordType::LongNames => Ok(Some(Record::LongNames(LongNamesRecord::decode( + )), + TextRecordType::LongNames => Ok(DecodedRecord::LongNames(LongNamesRecord::decode( self, decoder, - )))), - TextRecordType::VeryLongStrings => Ok(Some(Record::VeryLongStrings( - VeryLongStringsRecord::decode(self, decoder), ))), - TextRecordType::FileAttributes => { - Ok(FileAttributeRecord::decode(self, decoder).map(|fa| Record::FileAttributes(fa))) - } - TextRecordType::VariableAttributes => Ok(Some(Record::VariableAttributes( + TextRecordType::VeryLongStrings => Ok(DecodedRecord::VeryLongStrings( + VeryLongStringsRecord::decode(self, decoder), + )), + TextRecordType::FileAttributes => Ok(DecodedRecord::FileAttributes( + FileAttributeRecord::decode(self, decoder), + )), + TextRecordType::VariableAttributes => Ok(DecodedRecord::VariableAttributes( VariableAttributeRecord::decode(self, decoder), - ))), + )), } } } @@ -2221,24 +2285,36 @@ impl AttributeSet { } } +impl Default for AttributeSet { + fn default() -> Self { + Self(HashMap::default()) + } +} + #[derive(Clone, Debug)] pub struct FileAttributeRecord(AttributeSet); impl FileAttributeRecord { - fn decode(source: &TextRecord, decoder: &Decoder) -> Option { + fn decode(source: &TextRecord, decoder: &Decoder) -> Self { let input = decoder.decode(&source.text); match AttributeSet::parse(decoder, &input, None).warn_on_error(&decoder.warn) { Some((set, rest)) => { if !rest.is_empty() { decoder.warn(Error::TBD); } - Some(FileAttributeRecord(set)) + FileAttributeRecord(set) } - None => None, + None => FileAttributeRecord::default(), } } } +impl Default for FileAttributeRecord { + fn default() -> Self { + Self(AttributeSet::default()) + } +} + #[derive(Clone, Debug)] pub struct VarAttributeSet { pub long_var_name: Identifier, @@ -2660,23 +2736,48 @@ fn read_string(r: &mut R, endian: Endian) -> Result } #[derive(Clone, Debug)] -pub struct LongStringValueLabels +pub struct LongStringValueLabels where S: Debug, { - pub var_name: S, + pub var_name: N, pub width: u32, /// `(value, label)` pairs, where each value is `width` bytes. pub labels: Vec<(S, S)>, } +impl LongStringValueLabels { + fn decode<'a>( + &'a self, + decoder: &Decoder, + ) -> Result>, Error> { + let var_name = decoder.decode(&self.var_name); + let var_name = Identifier::new(var_name.trim_end(), decoder.encoding) + .map_err(Error::InvalidLongStringValueLabelName)?; + + let mut labels = Vec::with_capacity(self.labels.len()); + for (value, label) in self.labels.iter() { + let value = decoder.decode_exact_length(&value.0); + let label = decoder.decode(&label); + labels.push((value, label)); + } + + Ok(LongStringValueLabels { + var_name, + width: self.width, + labels, + }) + } +} + #[derive(Clone, Debug)] -pub struct LongStringValueLabelRecord(pub Vec>) +pub struct LongStringValueLabelRecord(pub Vec>) where + N: Debug, S: Debug; -impl ExtensionRecord for LongStringValueLabelRecord { +impl ExtensionRecord for LongStringValueLabelRecord { const SUBTYPE: u32 = 21; const SIZE: Option = Some(1); const COUNT: Option = None; @@ -2708,3 +2809,19 @@ impl ExtensionRecord for LongStringValueLabelRecord { ))) } } + +impl LongStringValueLabelRecord { + fn decode<'a>( + &'a self, + decoder: &Decoder, + ) -> Result>, Error> { + let mut labels = Vec::with_capacity(self.0.len()); + for label in &self.0 { + match label.decode(decoder) { + Ok(set) => labels.push(set), + Err(error) => decoder.warn(error), + } + } + Ok(LongStringValueLabelRecord(labels)) + } +}