From 1847205cc5366d9672ae742c037b67f4c0d9609d Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 27 Dec 2024 09:56:34 -0800 Subject: [PATCH] variable sets --- rust/pspp/src/cooked.rs | 74 ++++++++++--- rust/pspp/src/dictionary.rs | 33 +++--- rust/pspp/src/endian.rs | 6 ++ rust/pspp/src/raw.rs | 200 +++++++++++++++++------------------- 4 files changed, 181 insertions(+), 132 deletions(-) diff --git a/rust/pspp/src/cooked.rs b/rust/pspp/src/cooked.rs index cb8dbe6d75..2cde42e32e 100644 --- a/rust/pspp/src/cooked.rs +++ b/rust/pspp/src/cooked.rs @@ -4,6 +4,7 @@ use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; use crate::{ dictionary::{ Dictionary, MultipleResponseSet, MultipleResponseType, Value, VarWidth, Variable, + VariableSet, }, encoding::Error as EncodingError, endian::Endian, @@ -12,10 +13,10 @@ use crate::{ raw::{ self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord, - LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord, - NumberOfCasesRecord, ProductInfoRecord, RawStr, RawWidth, ValueLabel, ValueLabelRecord, - VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord, - VeryLongStringsRecord, ZHeader, ZTrailer, + LongStringMissingValueRecord, LongStringValueLabelRecord, MissingValues, + MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawStr, RawWidth, + ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributeRecord, VariableRecord, + VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer, }, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; @@ -175,7 +176,7 @@ type DictIndex = usize; #[derive(Clone, Debug)] pub struct Headers { pub header: HeaderRecord, - pub variable: Vec>, + pub variable: Vec>, pub value_label: Vec, String>>, pub document: Vec>, pub integer_info: Option, @@ -183,7 +184,7 @@ pub struct Headers { pub var_display: Option, pub 
multiple_response: Vec>, pub long_string_value_labels: Vec>, - pub long_string_missing_values: Vec>, + pub long_string_missing_values: Vec>, pub encoding: Option, pub number_of_cases: Option, pub variable_sets: Vec, @@ -481,7 +482,7 @@ pub fn decode( new_name } }; - let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap()); + let mut variable = Variable::new(name.clone(), VarWidth::try_from(input.width).unwrap()); // Set the short name the same as the long name (even if we renamed it). variable.short_names = vec![name]; @@ -568,12 +569,7 @@ pub fn decode( for dict_index in dict_indexes { let variable = dictionary.variables.get_index_mut2(dict_index).unwrap(); for ValueLabel { value, label } in record.labels.iter().cloned() { - let value = match value { - raw::Value::Number(number) => Value::Number(number.map(|n| n.into())), - raw::Value::String(string) => { - string.0[..variable.width.as_string_width().unwrap()].into() - } - }; + let value = value.decode(variable.width); variable.value_labels.insert(value, label); } } @@ -688,13 +684,63 @@ pub fn decode( for (mut value, label) in record.labels.into_iter() { // XXX warn about too-long value? value.0.resize(width, b' '); - // XXX warn abouat duplicat value labels? + // XXX warn about duplicate value labels? variable .value_labels .insert(Value::String(value.0.into_boxed_slice()), label); } } + let mut value = Vec::new(); + for record in headers + .long_string_missing_values + .drain(..) 
+ .flat_map(|record| record.0.into_iter()) + { + let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else { + warn(Error::TBD); + continue; + }; + let values = record + .missing_values + .into_iter() + .map(|v| { + value.clear(); + value.extend_from_slice(v.0.as_slice()); + value.resize(variable.width.as_string_width().unwrap(), b' '); + Value::String(Box::from(value.as_slice())) + }) + .collect::>(); + variable.missing_values = MissingValues { + values, + range: None, + }; + } + + for record in headers + .variable_sets + .drain(..) + .flat_map(|record| record.sets.into_iter()) + { + let mut variables = Vec::with_capacity(record.variable_names.len()); + for variable_name in record.variable_names { + let Some((dict_index, _)) = dictionary.variables.get_full_mut2(&variable_name.0) else { + warn(Error::TBD); + continue; + }; + variables.push(dict_index); + } + if !variables.is_empty() { + let variable_set = VariableSet { + name: record.name, + variables, + }; + dictionary + .variable_sets + .insert(ByIdentifier::new(variable_set)); + } + } + let metadata = Metadata::decode(&headers, warn); Ok((dictionary, metadata)) } diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index bed1c50bc7..1f4837972a 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -2,7 +2,7 @@ use core::str; use std::{ cmp::Ordering, collections::{HashMap, HashSet}, - fmt::Debug, + fmt::{Debug, Formatter, Result as FmtResult}, hash::Hash, ops::{Bound, RangeBounds}, }; @@ -16,7 +16,7 @@ use unicase::UniCase; use crate::{ format::Format, identifier::{ByIdentifier, HasIdentifier, Identifier}, - raw::{Alignment, CategoryLabels, Measure, MissingValues, RawWidth, VarType}, + raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType}, }; pub type DictIndex = usize; @@ -82,14 +82,6 @@ impl VarWidth { } } - pub fn from_raw(raw: RawWidth) -> Result { - match raw { - RawWidth::Continuation => Err(()), - RawWidth::Numeric => 
Ok(Self::Numeric), - RawWidth::String(width) => Ok(Self::String(width.get() as u16)), - } - } - pub fn is_long_string(&self) -> bool { if let Self::String(width) = self { *width > 8 @@ -123,10 +115,23 @@ impl From for VarType { } } -#[derive(Clone, Debug)] -pub enum Value { +#[derive(Clone)] +pub enum Value> { Number(Option), - String(Box<[u8]>), + String(S), +} + +impl Debug for Value +where + S: Debug, +{ + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match self { + Value::Number(Some(number)) => write!(f, "{number:?}"), + Value::Number(None) => write!(f, "SYSMIS"), + Value::String(s) => write!(f, "{:?}", s), + } + } } impl PartialEq for Value { @@ -486,7 +491,7 @@ pub struct Variable { impl Variable { pub fn new(name: Identifier, width: VarWidth) -> Self { - let var_type = VarType::from_width(width); + let var_type = VarType::from(width); let leave = name.class().must_leave(); Self { name, diff --git a/rust/pspp/src/endian.rs b/rust/pspp/src/endian.rs index dd89a6cc1d..ba47a3001a 100644 --- a/rust/pspp/src/endian.rs +++ b/rust/pspp/src/endian.rs @@ -166,3 +166,9 @@ impl Parse for Endian { } } } +impl Parse, 8> for Endian { + fn parse(self, bytes: [u8; 8]) -> Option { + let number: f64 = self.parse(bytes); + (number != -f64::MAX).then_some(number) + } +} diff --git a/rust/pspp/src/raw.rs b/rust/pspp/src/raw.rs index fc716dbc4f..3f2ef69fbc 100644 --- a/rust/pspp/src/raw.rs +++ b/rust/pspp/src/raw.rs @@ -1,5 +1,5 @@ use crate::{ - dictionary::{Attributes, VarWidth}, + dictionary::{Attributes, Value, VarWidth}, encoding::{default_encoding, get_encoding, Error as EncodingError}, endian::{Endian, Parse, ToBytes}, identifier::{Error as IdError, Identifier}, @@ -56,6 +56,9 @@ pub enum Error { code: u32, }, + #[error("At offset {offset:#x}, missing value code ({code}) is not -3, -2, 0, 1, 2, or 3.")] + BadMissingValueCode { offset: u64, code: i32 }, + #[error( "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3." 
)] @@ -218,7 +221,7 @@ impl From for Warning { #[derive(Clone, Debug)] pub enum Record { Header(HeaderRecord), - Variable(VariableRecord>), + Variable(VariableRecord), ValueLabel(ValueLabelRecord, RawString>), Document(DocumentRecord), IntegerInfo(IntegerInfoRecord), @@ -226,7 +229,7 @@ pub enum Record { VarDisplay(VarDisplayRecord), MultipleResponse(MultipleResponseRecord), LongStringValueLabels(LongStringValueLabelRecord), - LongStringMissingValues(LongStringMissingValueRecord>), + LongStringMissingValues(LongStringMissingValueRecord), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), Text(TextRecord), @@ -240,7 +243,7 @@ pub enum Record { #[derive(Clone, Debug)] pub enum DecodedRecord { Header(HeaderRecord), - Variable(VariableRecord), + Variable(VariableRecord), ValueLabel(ValueLabelRecord, String>), Document(DocumentRecord), IntegerInfo(IntegerInfoRecord), @@ -248,7 +251,7 @@ pub enum DecodedRecord { VarDisplay(VarDisplayRecord), MultipleResponse(MultipleResponseRecord), LongStringValueLabels(LongStringValueLabelRecord), - LongStringMissingValues(LongStringMissingValueRecord), + LongStringMissingValues(LongStringMissingValueRecord), Encoding(EncodingRecord), NumberOfCases(NumberOfCasesRecord), VariableSets(VariableSetRecord), @@ -276,7 +279,7 @@ impl Record { { let rec_type: u32 = endian.parse(read_bytes(reader)?); match rec_type { - 2 => Ok(Some(VariableRecord::read(reader, endian)?)), + 2 => Ok(Some(VariableRecord::read(reader, endian, warn)?)), 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?), 6 => Ok(Some(DocumentRecord::read(reader, endian)?)), 7 => Extension::read(reader, endian, var_types.n_values(), warn), @@ -661,13 +664,6 @@ pub enum VarType { } impl VarType { - pub fn from_width(width: VarWidth) -> VarType { - match width { - VarWidth::Numeric => Self::Numeric, - VarWidth::String(_) => Self::String, - } - } - pub fn opposite(self) -> VarType { match self { Self::Numeric => Self::String, @@ -697,30 +693,20 @@ impl TryFrom 
for VarType { } } -#[derive(Copy, Clone)] -pub enum Value -where - S: Debug, -{ - Number(Option), - String(S), -} - -type RawValue = Value>; +impl TryFrom for VarWidth { + type Error = (); -impl Debug for Value -where - S: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - Value::Number(Some(number)) => write!(f, "{number:?}"), - Value::Number(None) => write!(f, "SYSMIS"), - Value::String(s) => write!(f, "{:?}", s), + fn try_from(value: RawWidth) -> Result { + match value { + RawWidth::Continuation => Err(()), + RawWidth::Numeric => Ok(Self::Numeric), + RawWidth::String(width) => Ok(Self::String(width.get() as u16)), } } } +type RawValue = Value>; + impl RawValue { fn read(r: &mut R, var_type: VarType, endian: Endian) -> Result { Ok(Self::from_raw( @@ -733,10 +719,7 @@ impl RawValue { pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self { match var_type { VarType::String => Value::String(RawStr(raw.0)), - VarType::Numeric => { - let number: f64 = endian.parse(raw.0); - Value::Number((number != -f64::MAX).then_some(number)) - } + VarType::Numeric => Value::Number(endian.parse(raw.0)), } } @@ -838,10 +821,13 @@ impl RawValue { Ok(Some(values)) } - pub fn decode(self, decoder: &Decoder) -> Value { + pub fn decode(&self, width: VarWidth) -> Value { match self { - Self::Number(x) => Value::Number(x), - Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()), + Self::Number(x) => Value::Number(*x), + Self::String(s) => { + let width = width.as_string_width().unwrap(); + Value::String(Box::from(&s.0[..width])) + } } } } @@ -1142,7 +1128,7 @@ fn format_name(type_: u32) -> Cow<'static, str> { } #[derive(Clone)] -pub struct MissingValues +pub struct MissingValues> where S: Debug, { @@ -1201,59 +1187,70 @@ where } } -impl MissingValues> { +impl MissingValues { fn read( r: &mut R, offset: u64, width: RawWidth, code: i32, endian: Endian, + warn: &dyn Fn(Warning), ) -> Result { - let (n_values, has_range) = 
match (width, code) { - (_, 0..=3) => (code, false), - (RawWidth::Numeric, -2) => (0, true), - (RawWidth::Numeric, -3) => (1, true), - (RawWidth::Numeric, _) => { - return Err(Error::BadNumericMissingValueCode { offset, code }) - } - (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }), + let (individual_values, has_range) = match code { + 0..=3 => (code as usize, false), + -2 => (0, true), + -3 => (1, true), + _ => return Err(Error::BadMissingValueCode { offset, code }), }; - let var_type = VarType::try_from(width).unwrap(); - - let mut values = Vec::new(); - for _ in 0..n_values { - values.push(RawValue::read(r, var_type, endian)?); + let mut values = Vec::with_capacity(individual_values); + for _ in 0..individual_values { + values.push(read_bytes::<8, _>(r)?); } let range = if has_range { - let low = RawValue::read(r, var_type, endian)?; - let high = RawValue::read(r, var_type, endian)?; + let low = read_bytes::<8, _>(r)?; + let high = read_bytes::<8, _>(r)?; Some((low, high)) } else { None }; - Ok(Self { values, range }) - } - fn decode(&self, decoder: &Decoder) -> MissingValues { - MissingValues { - values: self - .values - .iter() - .map(|value| value.decode(decoder)) - .collect(), - range: self - .range - .as_ref() - .map(|(low, high)| (low.decode(decoder), high.decode(decoder))), + + match VarWidth::try_from(width) { + Ok(VarWidth::Numeric) => { + let values = values + .into_iter() + .map(|v| Value::Number(endian.parse(v))) + .collect(); + let range = range.map(|(low, high)| { + ( + Value::Number(endian.parse(low)), + Value::Number(endian.parse(high)), + ) + }); + return Ok(Self { values, range }); + } + Ok(VarWidth::String(width)) if width <= 8 && range.is_none() => { + let values = values + .into_iter() + .map(|value| Value::String(Box::from(&value[..width as usize]))) + .collect(); + return Ok(Self { + values, + range: None, + }); + } + Ok(VarWidth::String(width)) if width > 8 => warn(Warning::TBD), + Ok(VarWidth::String(_)) => 
warn(Warning::TBD), + Err(()) => warn(Warning::TBD), } + Ok(Self::default()) } } #[derive(Clone)] -pub struct VariableRecord +pub struct VariableRecord where S: Debug, - V: Debug, { /// Range of offsets in file. pub offsets: Range, @@ -1271,7 +1268,7 @@ where pub write_format: Spec, /// Missing values. - pub missing_values: MissingValues, + pub missing_values: MissingValues, /// Optional variable label. pub label: Option, @@ -1317,10 +1314,9 @@ impl Display for RawWidth { } } -impl Debug for VariableRecord +impl Debug for VariableRecord where S: Debug, - V: Debug, { fn fmt(&self, f: &mut Formatter) -> FmtResult { writeln!(f, "Width: {}", self.width,)?; @@ -1332,8 +1328,12 @@ where } } -impl VariableRecord> { - fn read(r: &mut R, endian: Endian) -> Result { +impl VariableRecord { + fn read( + r: &mut R, + endian: Endian, + warn: &dyn Fn(Warning), + ) -> Result { let start_offset = r.stream_position()?; let width: i32 = endian.parse(read_bytes(r)?); let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth { @@ -1369,7 +1369,7 @@ impl VariableRecord> { }; let missing_values = - MissingValues::read(r, start_offset, width, missing_value_code, endian)?; + MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?; let end_offset = r.stream_position()?; @@ -1391,7 +1391,7 @@ impl VariableRecord> { name: decoder.decode(&self.name).to_string(), print_format: self.print_format, write_format: self.write_format, - missing_values: self.missing_values.decode(decoder), + missing_values: self.missing_values, label: self .label .as_ref() @@ -1645,7 +1645,7 @@ impl ValueLabelRecord, RawString> { .labels .iter() .map(|ValueLabel { value, label }| ValueLabel { - value: *value, + value: value.clone(), label: decoder.decode(label).to_string(), }) .collect(); @@ -2091,37 +2091,32 @@ impl VarDisplayRecord { } #[derive(Clone, Debug)] -pub struct LongStringMissingValues +pub struct LongStringMissingValues where N: Debug, - V: Debug, { /// Variable name. 
pub var_name: N, /// Missing values. - pub missing_values: MissingValues, + pub missing_values: Vec>, } -impl LongStringMissingValues> { - fn decode( - &self, - decoder: &Decoder, - ) -> Result, IdError> { +impl LongStringMissingValues { + fn decode(&self, decoder: &Decoder) -> Result, IdError> { Ok(LongStringMissingValues { var_name: decoder.decode_identifier(&self.var_name)?, - missing_values: self.missing_values.decode(decoder), + missing_values: self.missing_values.clone(), }) } } #[derive(Clone, Debug)] -pub struct LongStringMissingValueRecord(pub Vec>) +pub struct LongStringMissingValueRecord(pub Vec>) where - N: Debug, - V: Debug; + N: Debug; -impl ExtensionRecord for LongStringMissingValueRecord> { +impl ExtensionRecord for LongStringMissingValueRecord { const SUBTYPE: u32 = 22; const SIZE: Option = Some(1); const COUNT: Option = None; @@ -2144,7 +2139,7 @@ impl ExtensionRecord for LongStringMissingValueRecord> { value_len, }); } - let mut values = Vec::new(); + let mut missing_values = Vec::new(); for i in 0..n_missing_values { let value: [u8; 8] = read_bytes(&mut input)?; let numeric_value: u64 = endian.parse(value); @@ -2156,12 +2151,8 @@ impl ExtensionRecord for LongStringMissingValueRecord> { } else { value }; - values.push(Value::String(RawStr(value))); + missing_values.push(RawStr(value)); } - let missing_values = MissingValues { - values, - range: None, - }; missing_value_set.push(LongStringMissingValues { var_name, missing_values, @@ -2173,8 +2164,8 @@ impl ExtensionRecord for LongStringMissingValueRecord> { } } -impl LongStringMissingValueRecord> { - pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord { +impl LongStringMissingValueRecord { + pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord { let mut mvs = Vec::with_capacity(self.0.len()); for mv in self.0.iter() { if let Some(mv) = mv @@ -2503,13 +2494,14 @@ impl ProductInfoRecord { } #[derive(Clone, Debug)] pub struct VariableSet { - pub name: String, - 
pub vars: Vec, + pub name: Identifier, + pub variable_names: Vec, } impl VariableSet { fn parse(input: &str, decoder: &Decoder) -> Result { let (name, input) = input.split_once('=').ok_or(Warning::TBD)?; + let name = decoder.new_identifier(name).map_err(|_| Warning::TBD)?; let mut vars = Vec::new(); for var in input.split_ascii_whitespace() { if let Some(identifier) = decoder @@ -2522,8 +2514,8 @@ impl VariableSet { } } Ok(VariableSet { - name: name.into(), - vars, + name, + variable_names: vars, }) } } -- 2.30.2