From cf9d48788078a8934623c694d6000c02b87e922d Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 22 Dec 2024 10:04:43 -0800 Subject: [PATCH] multiple response sets --- rust/pspp/src/cooked.rs | 190 ++++++++++++++++++++++++++---------- rust/pspp/src/dictionary.rs | 13 +-- rust/pspp/src/raw.rs | 144 ++++++++++++++++----------- 3 files changed, 229 insertions(+), 118 deletions(-) diff --git a/rust/pspp/src/cooked.rs b/rust/pspp/src/cooked.rs index b2ae515f8b..7a3c62e70c 100644 --- a/rust/pspp/src/cooked.rs +++ b/rust/pspp/src/cooked.rs @@ -1,16 +1,19 @@ +use core::str; use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc}; use crate::{ - dictionary::{Dictionary, Value, VarWidth, Variable}, + dictionary::{ + Dictionary, MultipleResponseSet, MultipleResponseType, Value, VarWidth, Variable, + }, encoding::Error as EncodingError, endian::Endian, format::{Error as FormatError, Format, UncheckedFormat}, - identifier::{Error as IdError, Identifier}, + identifier::{ByIdentifier, Error as IdError, Identifier}, raw::{ self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord, LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord, - NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabel, ValueLabelRecord, + NumberOfCasesRecord, ProductInfoRecord, RawStr, RawWidth, ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer, }, @@ -453,7 +456,7 @@ pub fn decode( .variable .iter() .enumerate() - .filter(|(_index, record)| record.width != -1) + .filter(|(_index, record)| record.width != RawWidth::Continuation) { let name = trim_end_spaces(input.name.to_string()); let name = match Identifier::from_encoding(&name, encoding) { @@ -511,12 +514,12 @@ pub fn decode( ); // Check for long string continuation records. - let n_values = input.n_values().unwrap(); + let n_values = input.width.n_values().unwrap(); for offset in 1..n_values { if headers .variable .get(index + offset) - .is_none_or(|record| record.width != -1) + .is_none_or(|record| record.width != RawWidth::Continuation) { warn(Error::TBD); break; @@ -576,10 +579,89 @@ pub fn decode( } } + if let Some(display) = &headers.var_display { + for (index, display) in display.0.iter().enumerate() { + if let Some(variable) = dictionary.variables.get_index_mut2(index) { + if let Some(width) = display.width { + variable.display_width = width; + } + if let Some(alignment) = display.alignment { + variable.alignment = alignment; + } + if let Some(measure) = display.measure { + variable.measure = Some(measure); + } + } else { + warn(Error::TBD); + } + } + } + + for record in headers + .multiple_response + .iter() + .flat_map(|record| record.0.iter()) + { + match MultipleResponseSet::decode(&dictionary, record, &warn) { + Ok(mrset) => { + dictionary.mrsets.insert(ByIdentifier::new(mrset)); + } + Err(error) => warn(error), + } + } + let metadata = Metadata::decode(&headers, warn); Ok((dictionary, metadata)) } +impl MultipleResponseSet { + fn decode( + dictionary: &Dictionary, + input: &raw::MultipleResponseSet, + warn: &impl Fn(Error), + ) -> Result { + let mr_set_name = input.name.clone(); + let mut variables = Vec::with_capacity(input.short_names.len()); + for short_name in input.short_names.iter() { + let Some(dict_index) = dictionary.variables.get_index_of(&short_name.0) else { + warn(Error::UnknownMrSetVariable { + mr_set: mr_set_name.clone(), + short_name: short_name.clone(), + }); + continue; + }; + variables.push(dict_index); + } + + match variables.len() { + 0 => return Err(Error::EmptyMrSet(mr_set_name)), + 1 => return Err(Error::OneVarMrSet(mr_set_name)), + _ => (), + } + + let Some((Some(min_width), Some(max_width))) = variables + .iter() + .copied() + .map(|dict_index| dictionary.variables[dict_index].width) + .map(|w| (Some(w), Some(w))) + .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) + else { + return Err(Error::MixedMrSet(mr_set_name)); + }; + + let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, min_width, warn)?; + + Ok(MultipleResponseSet { + name: mr_set_name, + min_width, + max_width, + label: input.label.to_string(), + mr_type, + variables, + }) + } +} + fn trim_end_spaces(mut s: String) -> String { s.truncate(s.trim_end_matches(' ').len()); s @@ -616,6 +698,55 @@ fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatEr }) } +impl MultipleResponseType { + fn decode( + mr_set: &Identifier, + input: &raw::MultipleResponseType, + min_width: VarWidth, + warn: &impl Fn(Error), + ) -> Result { + match input { + raw::MultipleResponseType::MultipleDichotomy { value, labels } => { + let value = match min_width { + VarWidth::Numeric => { + let string = String::from_utf8_lossy(&value.0); + let number: f64 = string.trim().parse().map_err(|_| { + Error::InvalidMDGroupCountedValue { + mr_set: mr_set.clone(), + number: string.into(), + } + })?; + Value::Number(Some(number)) + } + VarWidth::String(max_width) => { + let mut value = value.0.as_slice(); + while value.ends_with(b" ") { + value = &value[..value.len() - 1]; + } + let width = value.len(); + if width > max_width as usize { + return Err(Error::TooWideMDGroupCountedValue { + mr_set: mr_set.clone(), + value: String::from_utf8_lossy(value).into(), + width, + max_width, + }); + }; + Value::String(value.into()) + } + }; + Ok(MultipleResponseType::MultipleDichotomy { + value, + labels: *labels, + }) + } + raw::MultipleResponseType::MultipleCategory => { + Ok(MultipleResponseType::MultipleCategory) + } + } + } +} + /* impl Decoder { fn generate_name(&mut self) -> Identifier { @@ -1340,53 +1471,6 @@ pub struct MultipleResponseSet { pub dict_indexes: Vec, } -impl MultipleResponseSet { - fn decode( - decoder: &Decoder, - input: &raw::MultipleResponseSet>, - warn: &impl Fn(Error), - ) -> Result { - let mr_set_name = input.name.clone(); - let mut dict_indexes = Vec::with_capacity(input.short_names.len()); - for short_name in input.short_names.iter() { - let Some(&dict_index) = decoder.var_names.get(&short_name) else { - warn(Error::UnknownMrSetVariable { - mr_set: mr_set_name.clone(), - short_name: short_name.clone(), - }); - continue; - }; - dict_indexes.push(dict_index); - } - - match dict_indexes.len() { - 0 => return Err(Error::EmptyMrSet(mr_set_name)), - 1 => return Err(Error::OneVarMrSet(mr_set_name)), - _ => (), - } - - let Some((Some(min_width), Some(max_width))) = dict_indexes - .iter() - .map(|dict_index| decoder.variables[dict_index].width) - .map(|w| (Some(w), Some(w))) - .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) - else { - return Err(Error::MixedMrSet(mr_set_name)); - }; - - let mr_type = - MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?; - - Ok(MultipleResponseSet { - name: mr_set_name, - min_width, - max_width, - label: input.label.to_string(), - mr_type, - dict_indexes, - }) - } -} #[derive(Clone, Debug)] pub struct MultipleResponseRecord(pub Vec); diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 5da085b2e5..5e7f25d208 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -16,7 +16,7 @@ use unicase::UniCase; use crate::{ format::Format, identifier::{ByIdentifier, HasIdentifier, Identifier}, - raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType}, + raw::{Alignment, CategoryLabels, Measure, MissingValues, RawWidth, VarType}, }; pub type DictIndex = usize; @@ -82,12 +82,11 @@ impl VarWidth { } } - pub fn from_raw(raw: impl Into) -> Result { - let raw: i32 = raw.into(); + pub fn from_raw(raw: RawWidth) -> Result { match raw { - 0 => Ok(Self::Numeric), - 1..=255 => Ok(Self::String(raw as u16)), - _ => Err(()), + RawWidth::Continuation => Err(()), + RawWidth::Numeric => Ok(Self::Numeric), + RawWidth::String(width) => Ok(Self::String(width.get() as u16)), } } @@ -526,6 +525,8 @@ impl HasIdentifier for Attribute { pub struct MultipleResponseSet { pub name: Identifier, pub label: String, + pub min_width: VarWidth, + pub max_width: VarWidth, pub mr_type: MultipleResponseType, pub variables: Vec, } diff --git a/rust/pspp/src/raw.rs b/rust/pspp/src/raw.rs index 776677e563..e5933d3756 100644 --- a/rust/pspp/src/raw.rs +++ b/rust/pspp/src/raw.rs @@ -11,12 +11,12 @@ use num::Integer; use std::{ borrow::Cow, cell::RefCell, - cmp::Ordering, collections::{HashMap, VecDeque}, fmt::{Debug, Display, Formatter, Result as FmtResult}, io::{Error as IoError, Read, Seek, SeekFrom}, iter::repeat, mem::take, + num::NonZeroU8, ops::Range, rc::Rc, str::from_utf8, @@ -685,6 +685,18 @@ impl Display for VarType { } } +impl TryFrom for VarType { + type Error = (); + + fn try_from(value: RawWidth) -> Result { + match value { + RawWidth::Continuation => Err(()), + RawWidth::Numeric => Ok(VarType::Numeric), + RawWidth::String(_) => Ok(VarType::String), + } + } +} + #[derive(Copy, Clone)] pub enum Value where @@ -735,7 +747,7 @@ impl RawValue { ) -> Result>, Error> { let case_start = reader.stream_position()?; let mut values = Vec::with_capacity(var_types.n_values()); - for (i, (var_type, _)) in var_types.types.iter().enumerate() { + for (i, var_type) in var_types.iter().enumerate() { let Some(raw) = try_read_bytes(reader)? else { if i == 0 { return Ok(None); @@ -748,7 +760,7 @@ impl RawValue { }); } }; - values.push(Value::from_raw(&UntypedValue(raw), *var_type, endian)); + values.push(Value::from_raw(&UntypedValue(raw), var_type, endian)); } Ok(Some(values)) } @@ -762,7 +774,7 @@ impl RawValue { ) -> Result>, Error> { let case_start = reader.stream_position()?; let mut values = Vec::with_capacity(var_types.n_values()); - for (i, (var_type, _)) in var_types.types.iter().enumerate() { + for (i, var_type) in var_types.iter().enumerate() { let value = loop { let Some(code) = codes.pop_front() else { let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else { @@ -781,7 +793,7 @@ impl RawValue { }; match code { 0 => (), - 1..=251 => match *var_type { + 1..=251 => match var_type { VarType::Numeric => break Self::Number(Some(code as f64 - bias)), VarType::String => { break Self::String(RawStr(endian.to_bytes(code as f64 - bias))) @@ -799,9 +811,9 @@ impl RawValue { } } 253 => { - break Self::from_raw(&UntypedValue(read_bytes(reader)?), *var_type, endian) + break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian) } - 254 => match *var_type { + 254 => match var_type { VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC VarType::Numeric => { return Err(Error::CompressedStringExpected { @@ -810,7 +822,7 @@ impl RawValue { }) } }, - 255 => match *var_type { + 255 => match var_type { VarType::Numeric => break Self::Number(None), VarType::String => { return Err(Error::CompressedNumberExpected { @@ -1193,23 +1205,21 @@ impl MissingValues> { fn read( r: &mut R, offset: u64, - width: i32, + width: RawWidth, code: i32, endian: Endian, ) -> Result { let (n_values, has_range) = match (width, code) { (_, 0..=3) => (code, false), - (0, -2) => (0, true), - (0, -3) => (1, true), - (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }), + (RawWidth::Numeric, -2) => (0, true), + (RawWidth::Numeric, -3) => (1, true), + (RawWidth::Numeric, _) => { + return Err(Error::BadNumericMissingValueCode { offset, code }) + } (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }), }; - let var_type = if width == 0 { - VarType::Numeric - } else { - VarType::String - }; + let var_type = VarType::try_from(width).unwrap(); let mut values = Vec::new(); for _ in 0..n_values { @@ -1249,7 +1259,7 @@ where pub offsets: Range, /// Variable width, in the range -1..=255. - pub width: i32, + pub width: RawWidth, /// Variable name, padded on the right with spaces. pub name: S, @@ -1267,36 +1277,53 @@ where pub label: Option, } -impl VariableRecord -where - S: Debug, - V: Debug, -{ +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum RawWidth { + Continuation, + Numeric, + String(NonZeroU8), +} + +impl RawWidth { pub fn n_values(&self) -> Option { - match self.width { - 0 => Some(1), - 1..=255 => Some((self.width as usize).div_ceil(8)), + match self { + RawWidth::Numeric => Some(1), + RawWidth::String(width) => Some((width.get() as usize).div_ceil(8)), _ => None, } } } +impl TryFrom for RawWidth { + type Error = (); + + fn try_from(value: i32) -> Result { + match value { + -1 => Ok(Self::Continuation), + 0 => Ok(Self::Numeric), + 1..=255 => Ok(Self::String(NonZeroU8::new(value as u8).unwrap())), + _ => Err(()), + } + } +} + +impl Display for RawWidth { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self { + RawWidth::Continuation => write!(f, "long string continuation"), + RawWidth::Numeric => write!(f, "numeric"), + RawWidth::String(width) => write!(f, "{width}-byte string"), + } + } +} + impl Debug for VariableRecord where S: Debug, V: Debug, { fn fmt(&self, f: &mut Formatter) -> FmtResult { - writeln!( - f, - "Width: {} ({})", - self.width, - match self.width.cmp(&0) { - Ordering::Greater => "string", - Ordering::Equal => "numeric", - Ordering::Less => "long string continuation record", - } - )?; + writeln!(f, "Width: {}", self.width,)?; writeln!(f, "Print format: {:?}", self.print_format)?; writeln!(f, "Write format: {:?}", self.write_format)?; writeln!(f, "Name: {:?}", &self.name)?; @@ -1309,12 +1336,10 @@ impl VariableRecord> { fn read(r: &mut R, endian: Endian) -> Result { let start_offset = r.stream_position()?; let width: i32 = endian.parse(read_bytes(r)?); - if !(-1..=255).contains(&width) { - return Err(Error::BadVariableWidth { - start_offset, - width, - }); - } + let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth { + start_offset, + width, + })?; let code_offset = r.stream_position()?; let has_variable_label: u32 = endian.parse(read_bytes(r)?); let missing_value_code: i32 = endian.parse(read_bytes(r)?); @@ -1580,10 +1605,10 @@ impl ValueLabelRecord, RawString> { let Some(&first_index) = dict_indexes.first() else { return Ok(None); }; - let var_type = var_types.types[first_index as usize - 1].0; + let var_type = var_types.types[first_index as usize - 1].unwrap(); let mut wrong_type_indexes = Vec::new(); dict_indexes.retain(|&index| { - if var_types.types[index as usize - 1].0 != var_type { + if var_types.types[index as usize - 1] != Some(var_type) { wrong_type_indexes.push(index); false } else { @@ -2897,7 +2922,7 @@ impl LongStringValueLabelRecord { #[derive(Default)] pub struct VarTypes { - pub types: Vec<(VarType, usize)>, + pub types: Vec>, } impl VarTypes { @@ -2905,16 +2930,12 @@ impl VarTypes { Self::default() } - pub fn push(&mut self, width: i32) { - let var_type = match width { - -1 => return, - 0 => VarType::Numeric, - 1..=255 => VarType::String, - _ => unreachable!(), - }; - let n_values = (width as usize).div_ceil(8).max(1); - for i in 0..n_values { - self.types.push((var_type, i)); + pub fn push(&mut self, width: RawWidth) { + if let Ok(var_type) = VarType::try_from(width) { + self.types.push(Some(var_type)); + for _ in 1..width.n_values().unwrap() { + self.types.push(None); + } } } @@ -2928,10 +2949,15 @@ impl VarTypes { pub fn var_type_at(&self, index: usize) -> Option { if index >= 1 && index <= self.types.len() { - if let (var_type, 0) = self.types[index - 1] { - return Some(var_type); - } + self.types[index - 1] + } else { + None } - None + } + + pub fn iter(&self) -> impl Iterator + use<'_> { + self.types + .iter() + .map(|var_type| var_type.unwrap_or(VarType::String)) } } -- 2.30.2