From: Ben Pfaff Date: Sat, 26 Aug 2023 16:35:28 +0000 (-0700) Subject: multiple response X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1e9454a6a578bebbf9dfd6a0fb6b845b14a67fb5;p=pspp multiple response --- diff --git a/rust/src/cooked.rs b/rust/src/cooked.rs index 8d748778b9..fcc15901fe 100644 --- a/rust/src/cooked.rs +++ b/rust/src/cooked.rs @@ -1,11 +1,10 @@ use std::{borrow::Cow, cmp::Ordering, collections::HashMap, iter::repeat}; use crate::{ + endian::Endian, format::{Error as FormatError, Spec, UncheckedSpec}, identifier::{Error as IdError, Identifier}, raw::{self, MissingValues, VarType}, - CategoryLabels, - {endian::Endian, Compression}, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use encoding_rs::{DecoderResult, Encoding}; @@ -13,6 +12,8 @@ use num::integer::div_ceil; use ordered_float::OrderedFloat; use thiserror::Error as ThisError; +pub use crate::raw::{CategoryLabels, Compression}; + #[derive(ThisError, Debug)] pub enum Error { #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")] @@ -92,6 +93,12 @@ pub enum Error { #[error("Multiple response set {0} contains both string and numeric variables.")] MixedMrSet(Identifier), + #[error("Invalid numeric format for counted value {number} in multiple response set {mr_set}.")] + InvalidMDGroupCountedValue { mr_set: Identifier, number: String }, + + #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")] + TooWideMDGroupCountedValue { mr_set: Identifier, value: String, width: usize, max_width: u16 }, + #[error("Details TBD")] TBD, } @@ -291,20 +298,33 @@ impl VarWidth { } } - /// Returns the wider of `self` and `other`: - /// - Numerical variable widths are equally wide. - /// - Longer strings are wider than shorter strings. - /// - Numerical and string types are incomparable, so result in `None`. - /// - Any `None` in the input yields `None` in the output. - pub fn wider(a: Option, b: Option) -> Option { + fn width_predicate( + a: Option, + b: Option, + f: impl Fn(u16, u16) -> u16, + ) -> Option { match (a, b) { (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { - Some(VarWidth::String(a.max(b))) + Some(VarWidth::String(f(a, b))) } _ => None, } } + + /// Returns the wider of `self` and `other`: + /// - Numerical variable widths are equally wide. + /// - Longer strings are wider than shorter strings. + /// - Numerical and string types are incomparable, so result in `None`. + /// - Any `None` in the input yields `None` in the output. + pub fn wider(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.max(b)) + } + + /// Returns the narrower of `self` and `other` (see [`Self::wider`]). + pub fn narrower(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.min(b)) + } } impl From for VarType { @@ -808,9 +828,45 @@ pub enum MultipleResponseType { MultipleCategory, } +impl MultipleResponseType { + fn decode( + decoder: &Decoder, + mr_set: &Identifier, + input: &raw::MultipleResponseType, + min_width: VarWidth, + warn: &impl Fn(Error), + ) -> Result { + let mr_type = match input { + raw::MultipleResponseType::MultipleDichotomy { value, labels } => { + let value = decoder.decode_string(&value.0, warn); + let value = match min_width { + VarWidth::Numeric => { + let number: f64 = value.trim().parse() + .map_err(|_| Error::InvalidMDGroupCountedValue { mr_set: mr_set.clone(), number: value.into() })?; + Value::Number(Some(number.into())) + }, + VarWidth::String(max_width) => { + let value = value.trim_end_matches(' '); + let width = value.len(); + if width > max_width as usize { + return Err(Error::TooWideMDGroupCountedValue { mr_set: mr_set.clone(), value: value.into(), width, max_width }); + }; + Value::String(value.into()) + } + }; + MultipleResponseType::MultipleDichotomy { value, labels: *labels } + }, + raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory, + }; + Ok(mr_type) + } +} + #[derive(Clone, Debug)] pub struct MultipleResponseSet { pub name: Identifier, + pub min_width: VarWidth, + pub max_width: VarWidth, pub label: String, pub mr_type: MultipleResponseType, pub dict_indexes: Vec, @@ -821,15 +877,15 @@ impl MultipleResponseSet { decoder: &Decoder, input: &raw::MultipleResponseSet, warn: &impl Fn(Error), - ) -> Result, Error> { + ) -> Result { let mr_set_name = decoder .decode_identifier(&input.name.0, warn) .map_err(|error| Error::InvalidMrSetName(error))?; let label = decoder.decode_string(&input.label.0, warn).into(); - let dict_indexes = Vec::with_capacity(input.short_names.len()); - for &short_name in input.short_names.iter() { + let mut dict_indexes = Vec::with_capacity(input.short_names.len()); + for short_name in input.short_names.iter() { let short_name = match decoder.decode_identifier(&short_name.0, warn) { Ok(name) => name, Err(error) => { @@ -837,7 +893,7 @@ impl MultipleResponseSet { continue; } }; - let Some(dict_index) = decoder.var_names.get(&short_name) else { + let Some(&dict_index) = decoder.var_names.get(&short_name) else { warn(Error::UnknownMrSetVariable { mr_set: mr_set_name.clone(), short_name: short_name.clone(), @@ -853,24 +909,43 @@ impl MultipleResponseSet { _ => (), } - let Some(var_width) = dict_indexes + let Some((Some(min_width), Some(max_width))) = dict_indexes .iter() - .map(|&dict_index| Some(decoder.variables[dict_index].width)) - .reduce(|a, b| VarWidth::wider(a, b)) - .flatten() + .map(|dict_index| decoder.variables[dict_index].width) + .map(|w| (Some(w), Some(w))) + .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb))) else { return Err(Error::MixedMrSet(mr_set_name)); }; + + let mr_type = MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?; + + Ok(MultipleResponseSet { + name: mr_set_name, + min_width, + max_width, + label, + mr_type, + dict_indexes, + }) } } #[derive(Clone, Debug)] -pub struct MultipleResponseRecord(Vec); +pub struct MultipleResponseRecord(pub Vec); impl Decode for MultipleResponseRecord { type Input = raw::MultipleResponseRecord; fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result { + let mut sets = Vec::with_capacity(input.0.len()); + for set in &input.0 { + match MultipleResponseSet::decode(decoder, set, &warn) { + Ok(set) => sets.push(set), + Err(error) => warn(error), + } + } + Ok(MultipleResponseRecord(sets)) } } diff --git a/rust/src/lib.rs b/rust/src/lib.rs index c793f44cbb..ebb4033b25 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,19 +1,7 @@ -pub mod endian; -pub mod raw; pub mod cooked; -pub mod sack; pub mod encoding; +pub mod endian; pub mod format; pub mod identifier; - -#[derive(Copy, Clone, Debug)] -pub enum Compression { - Simple, - ZLib, -} - -#[derive(Clone, Debug)] -pub enum CategoryLabels { - VarLabels, - CountedValues, -} +pub mod raw; +pub mod sack; diff --git a/rust/src/raw.rs b/rust/src/raw.rs index a9f463425c..a8c8ff7b46 100644 --- a/rust/src/raw.rs +++ b/rust/src/raw.rs @@ -1,5 +1,4 @@ use crate::endian::{Endian, Parse, ToBytes}; -use crate::{CategoryLabels, Compression}; use encoding_rs::mem::decode_latin1; use flate2::read::ZlibDecoder; @@ -177,6 +176,12 @@ fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> { from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) } +#[derive(Copy, Clone, Debug)] +pub enum Compression { + Simple, + ZLib, +} + #[derive(Clone)] pub struct HeaderRecord { /// Magic number. @@ -1197,6 +1202,12 @@ impl ExtensionRecord for FloatInfoRecord { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CategoryLabels { + VarLabels, + CountedValues, +} + #[derive(Clone, Debug)] pub enum MultipleResponseType { MultipleDichotomy { @@ -1205,20 +1216,9 @@ pub enum MultipleResponseType { }, MultipleCategory, } -#[derive(Clone, Debug)] -pub struct MultipleResponseSet { - pub name: UnencodedString, - pub label: UnencodedString, - pub mr_type: MultipleResponseType, - pub short_names: Vec, -} -impl MultipleResponseSet { - fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> { - let Some(equals) = input.iter().position(|&b| b == b'=') else { - return Err(Error::TBD); - }; - let (name, input) = input.split_at(equals); +impl MultipleResponseType { + fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Error> { let (mr_type, input) = match input.get(0) { Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]), Some(b'D') => { @@ -1254,6 +1254,25 @@ impl MultipleResponseSet { } _ => return Err(Error::TBD), }; + Ok((mr_type, input)) + } +} + +#[derive(Clone, Debug)] +pub struct MultipleResponseSet { + pub name: UnencodedString, + pub label: UnencodedString, + pub mr_type: MultipleResponseType, + pub short_names: Vec, +} + +impl MultipleResponseSet { + fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> { + let Some(equals) = input.iter().position(|&b| b == b'=') else { + return Err(Error::TBD); + }; + let (name, input) = input.split_at(equals); + let (mr_type, input) = MultipleResponseType::parse(input)?; let Some(b' ') = input.get(0) else { return Err(Error::TBD); }; @@ -1288,7 +1307,7 @@ impl MultipleResponseSet { } #[derive(Clone, Debug)] -pub struct MultipleResponseRecord(Vec); +pub struct MultipleResponseRecord(pub Vec); impl ExtensionRecord for MultipleResponseRecord { const SUBTYPE: u32 = 7; @@ -1466,7 +1485,10 @@ pub struct TextRecord { impl From for TextRecord { fn from(source: Extension) -> Self { - TextRecord { offset: source.offset, text: source.data.into() } + TextRecord { + offset: source.offset, + text: source.data.into(), + } } }