multiple response sets
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 22 Dec 2024 18:04:43 +0000 (10:04 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 22 Dec 2024 18:04:43 +0000 (10:04 -0800)
rust/pspp/src/cooked.rs
rust/pspp/src/dictionary.rs
rust/pspp/src/raw.rs

index b2ae515f8bfef6d50fcfbf72b0a547bc1633c5c8..7a3c62e70c711fb2e8cd78b14b2091cdcb77e5db 100644 (file)
@@ -1,16 +1,19 @@
+use core::str;
 use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
 
 use crate::{
-    dictionary::{Dictionary, Value, VarWidth, Variable},
+    dictionary::{
+        Dictionary, MultipleResponseSet, MultipleResponseType, Value, VarWidth, Variable,
+    },
     encoding::Error as EncodingError,
     endian::Endian,
     format::{Error as FormatError, Format, UncheckedFormat},
-    identifier::{Error as IdError, Identifier},
+    identifier::{ByIdentifier, Error as IdError, Identifier},
     raw::{
         self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
         FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
         LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
-        NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabel, ValueLabelRecord,
+        NumberOfCasesRecord, ProductInfoRecord, RawStr, RawWidth, ValueLabel, ValueLabelRecord,
         VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord,
         VeryLongStringsRecord, ZHeader, ZTrailer,
     },
@@ -453,7 +456,7 @@ pub fn decode(
         .variable
         .iter()
         .enumerate()
-        .filter(|(_index, record)| record.width != -1)
+        .filter(|(_index, record)| record.width != RawWidth::Continuation)
     {
         let name = trim_end_spaces(input.name.to_string());
         let name = match Identifier::from_encoding(&name, encoding) {
@@ -511,12 +514,12 @@ pub fn decode(
         );
 
         // Check for long string continuation records.
-        let n_values = input.n_values().unwrap();
+        let n_values = input.width.n_values().unwrap();
         for offset in 1..n_values {
             if headers
                 .variable
                 .get(index + offset)
-                .is_none_or(|record| record.width != -1)
+                .is_none_or(|record| record.width != RawWidth::Continuation)
             {
                 warn(Error::TBD);
                 break;
@@ -576,10 +579,89 @@ pub fn decode(
         }
     }
 
+    if let Some(display) = &headers.var_display {
+        for (index, display) in display.0.iter().enumerate() {
+            if let Some(variable) = dictionary.variables.get_index_mut2(index) {
+                if let Some(width) = display.width {
+                    variable.display_width = width;
+                }
+                if let Some(alignment) = display.alignment {
+                    variable.alignment = alignment;
+                }
+                if let Some(measure) = display.measure {
+                    variable.measure = Some(measure);
+                }
+            } else {
+                warn(Error::TBD);
+            }
+        }
+    }
+
+    for record in headers
+        .multiple_response
+        .iter()
+        .flat_map(|record| record.0.iter())
+    {
+        match MultipleResponseSet::decode(&dictionary, record, &warn) {
+            Ok(mrset) => {
+                dictionary.mrsets.insert(ByIdentifier::new(mrset));
+            }
+            Err(error) => warn(error),
+        }
+    }
+
     let metadata = Metadata::decode(&headers, warn);
     Ok((dictionary, metadata))
 }
 
+impl MultipleResponseSet {
+    fn decode(
+        dictionary: &Dictionary,
+        input: &raw::MultipleResponseSet<Identifier, String>,
+        warn: &impl Fn(Error),
+    ) -> Result<Self, Error> {
+        let mr_set_name = input.name.clone();
+        let mut variables = Vec::with_capacity(input.short_names.len());
+        for short_name in input.short_names.iter() {
+            let Some(dict_index) = dictionary.variables.get_index_of(&short_name.0) else {
+                warn(Error::UnknownMrSetVariable {
+                    mr_set: mr_set_name.clone(),
+                    short_name: short_name.clone(),
+                });
+                continue;
+            };
+            variables.push(dict_index);
+        }
+
+        match variables.len() {
+            0 => return Err(Error::EmptyMrSet(mr_set_name)),
+            1 => return Err(Error::OneVarMrSet(mr_set_name)),
+            _ => (),
+        }
+
+        let Some((Some(min_width), Some(max_width))) = variables
+            .iter()
+            .copied()
+            .map(|dict_index| dictionary.variables[dict_index].width)
+            .map(|w| (Some(w), Some(w)))
+            .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
+        else {
+            return Err(Error::MixedMrSet(mr_set_name));
+        };
+
+        let mr_type = MultipleResponseType::decode(&mr_set_name, &input.mr_type, min_width, warn)?;
+
+        Ok(MultipleResponseSet {
+            name: mr_set_name,
+            min_width,
+            max_width,
+            label: input.label.to_string(),
+            mr_type,
+            variables,
+        })
+    }
+}
+
 fn trim_end_spaces(mut s: String) -> String {
     s.truncate(s.trim_end_matches(' ').len());
     s
@@ -616,6 +698,55 @@ fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatEr
         })
 }
 
+impl MultipleResponseType {
+    fn decode(
+        mr_set: &Identifier,
+        input: &raw::MultipleResponseType,
+        min_width: VarWidth,
+        warn: &impl Fn(Error),
+    ) -> Result<Self, Error> {
+        match input {
+            raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
+                let value = match min_width {
+                    VarWidth::Numeric => {
+                        let string = String::from_utf8_lossy(&value.0);
+                        let number: f64 = string.trim().parse().map_err(|_| {
+                            Error::InvalidMDGroupCountedValue {
+                                mr_set: mr_set.clone(),
+                                number: string.into(),
+                            }
+                        })?;
+                        Value::Number(Some(number))
+                    }
+                    VarWidth::String(max_width) => {
+                        let mut value = value.0.as_slice();
+                        while value.ends_with(b" ") {
+                            value = &value[..value.len() - 1];
+                        }
+                        let width = value.len();
+                        if width > max_width as usize {
+                            return Err(Error::TooWideMDGroupCountedValue {
+                                mr_set: mr_set.clone(),
+                                value: String::from_utf8_lossy(value).into(),
+                                width,
+                                max_width,
+                            });
+                        };
+                        Value::String(value.into())
+                    }
+                };
+                Ok(MultipleResponseType::MultipleDichotomy {
+                    value,
+                    labels: *labels,
+                })
+            }
+            raw::MultipleResponseType::MultipleCategory => {
+                Ok(MultipleResponseType::MultipleCategory)
+            }
+        }
+    }
+}
+
 /*
 impl Decoder {
     fn generate_name(&mut self) -> Identifier {
@@ -1340,53 +1471,6 @@ pub struct MultipleResponseSet {
     pub dict_indexes: Vec<DictIndex>,
 }
 
-impl MultipleResponseSet {
-    fn decode(
-        decoder: &Decoder,
-        input: &raw::MultipleResponseSet<Identifier, Cow<str>>,
-        warn: &impl Fn(Error),
-    ) -> Result<Self, Error> {
-        let mr_set_name = input.name.clone();
-        let mut dict_indexes = Vec::with_capacity(input.short_names.len());
-        for short_name in input.short_names.iter() {
-            let Some(&dict_index) = decoder.var_names.get(&short_name) else {
-                warn(Error::UnknownMrSetVariable {
-                    mr_set: mr_set_name.clone(),
-                    short_name: short_name.clone(),
-                });
-                continue;
-            };
-            dict_indexes.push(dict_index);
-        }
-
-        match dict_indexes.len() {
-            0 => return Err(Error::EmptyMrSet(mr_set_name)),
-            1 => return Err(Error::OneVarMrSet(mr_set_name)),
-            _ => (),
-        }
-
-        let Some((Some(min_width), Some(max_width))) = dict_indexes
-            .iter()
-            .map(|dict_index| decoder.variables[dict_index].width)
-            .map(|w| (Some(w), Some(w)))
-            .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
-        else {
-            return Err(Error::MixedMrSet(mr_set_name));
-        };
-
-        let mr_type =
-            MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
-
-        Ok(MultipleResponseSet {
-            name: mr_set_name,
-            min_width,
-            max_width,
-            label: input.label.to_string(),
-            mr_type,
-            dict_indexes,
-        })
-    }
-}
 
 #[derive(Clone, Debug)]
 pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
index 5da085b2e5e0747087ffb6cec0daf3043cdb2b79..5e7f25d208595424c9e8cdf98ecdf839d225772a 100644 (file)
@@ -16,7 +16,7 @@ use unicase::UniCase;
 use crate::{
     format::Format,
     identifier::{ByIdentifier, HasIdentifier, Identifier},
-    raw::{Alignment, CategoryLabels, Measure, MissingValues, VarType},
+    raw::{Alignment, CategoryLabels, Measure, MissingValues, RawWidth, VarType},
 };
 
 pub type DictIndex = usize;
@@ -82,12 +82,11 @@ impl VarWidth {
         }
     }
 
-    pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
-        let raw: i32 = raw.into();
+    pub fn from_raw(raw: RawWidth) -> Result<Self, ()> {
         match raw {
-            0 => Ok(Self::Numeric),
-            1..=255 => Ok(Self::String(raw as u16)),
-            _ => Err(()),
+            RawWidth::Continuation => Err(()),
+            RawWidth::Numeric => Ok(Self::Numeric),
+            RawWidth::String(width) => Ok(Self::String(width.get() as u16)),
         }
     }
 
@@ -526,6 +525,8 @@ impl HasIdentifier for Attribute {
 pub struct MultipleResponseSet {
     pub name: Identifier,
     pub label: String,
+    pub min_width: VarWidth,
+    pub max_width: VarWidth,
     pub mr_type: MultipleResponseType,
     pub variables: Vec<DictIndex>,
 }
index 776677e56331256cf97113cc2320ae851b216318..e5933d3756ca67b67a1da05d195bcdfee28188ac 100644 (file)
@@ -11,12 +11,12 @@ use num::Integer;
 use std::{
     borrow::Cow,
     cell::RefCell,
-    cmp::Ordering,
     collections::{HashMap, VecDeque},
     fmt::{Debug, Display, Formatter, Result as FmtResult},
     io::{Error as IoError, Read, Seek, SeekFrom},
     iter::repeat,
     mem::take,
+    num::NonZeroU8,
     ops::Range,
     rc::Rc,
     str::from_utf8,
@@ -685,6 +685,18 @@ impl Display for VarType {
     }
 }
 
+impl TryFrom<RawWidth> for VarType {
+    type Error = ();
+
+    fn try_from(value: RawWidth) -> Result<Self, Self::Error> {
+        match value {
+            RawWidth::Continuation => Err(()),
+            RawWidth::Numeric => Ok(VarType::Numeric),
+            RawWidth::String(_) => Ok(VarType::String),
+        }
+    }
+}
+
 #[derive(Copy, Clone)]
 pub enum Value<S>
 where
@@ -735,7 +747,7 @@ impl RawValue {
     ) -> Result<Option<Vec<Self>>, Error> {
         let case_start = reader.stream_position()?;
         let mut values = Vec::with_capacity(var_types.n_values());
-        for (i, (var_type, _)) in var_types.types.iter().enumerate() {
+        for (i, var_type) in var_types.iter().enumerate() {
             let Some(raw) = try_read_bytes(reader)? else {
                 if i == 0 {
                     return Ok(None);
@@ -748,7 +760,7 @@ impl RawValue {
                     });
                 }
             };
-            values.push(Value::from_raw(&UntypedValue(raw), *var_type, endian));
+            values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
         }
         Ok(Some(values))
     }
@@ -762,7 +774,7 @@ impl RawValue {
     ) -> Result<Option<Vec<Self>>, Error> {
         let case_start = reader.stream_position()?;
         let mut values = Vec::with_capacity(var_types.n_values());
-        for (i, (var_type, _)) in var_types.types.iter().enumerate() {
+        for (i, var_type) in var_types.iter().enumerate() {
             let value = loop {
                 let Some(code) = codes.pop_front() else {
                     let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
@@ -781,7 +793,7 @@ impl RawValue {
                 };
                 match code {
                     0 => (),
-                    1..=251 => match *var_type {
+                    1..=251 => match var_type {
                         VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
                         VarType::String => {
                             break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
@@ -799,9 +811,9 @@ impl RawValue {
                         }
                     }
                     253 => {
-                        break Self::from_raw(&UntypedValue(read_bytes(reader)?), *var_type, endian)
+                        break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
                     }
-                    254 => match *var_type {
+                    254 => match var_type {
                         VarType::String => break Self::String(RawStr(*b"        ")), // XXX EBCDIC
                         VarType::Numeric => {
                             return Err(Error::CompressedStringExpected {
@@ -810,7 +822,7 @@ impl RawValue {
                             })
                         }
                     },
-                    255 => match *var_type {
+                    255 => match var_type {
                         VarType::Numeric => break Self::Number(None),
                         VarType::String => {
                             return Err(Error::CompressedNumberExpected {
@@ -1193,23 +1205,21 @@ impl MissingValues<RawStr<8>> {
     fn read<R: Read + Seek>(
         r: &mut R,
         offset: u64,
-        width: i32,
+        width: RawWidth,
         code: i32,
         endian: Endian,
     ) -> Result<Self, Error> {
         let (n_values, has_range) = match (width, code) {
             (_, 0..=3) => (code, false),
-            (0, -2) => (0, true),
-            (0, -3) => (1, true),
-            (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
+            (RawWidth::Numeric, -2) => (0, true),
+            (RawWidth::Numeric, -3) => (1, true),
+            (RawWidth::Numeric, _) => {
+                return Err(Error::BadNumericMissingValueCode { offset, code })
+            }
             (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
         };
 
-        let var_type = if width == 0 {
-            VarType::Numeric
-        } else {
-            VarType::String
-        };
+        let var_type = VarType::try_from(width).unwrap();
 
         let mut values = Vec::new();
         for _ in 0..n_values {
@@ -1249,7 +1259,7 @@ where
     pub offsets: Range<u64>,
 
     /// Variable width, in the range -1..=255.
-    pub width: i32,
+    pub width: RawWidth,
 
     /// Variable name, padded on the right with spaces.
     pub name: S,
@@ -1267,36 +1277,53 @@ where
     pub label: Option<S>,
 }
 
-impl<S, V> VariableRecord<S, V>
-where
-    S: Debug,
-    V: Debug,
-{
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub enum RawWidth {
+    Continuation,
+    Numeric,
+    String(NonZeroU8),
+}
+
+impl RawWidth {
     pub fn n_values(&self) -> Option<usize> {
-        match self.width {
-            0 => Some(1),
-            1..=255 => Some((self.width as usize).div_ceil(8)),
+        match self {
+            RawWidth::Numeric => Some(1),
+            RawWidth::String(width) => Some((width.get() as usize).div_ceil(8)),
             _ => None,
         }
     }
 }
 
+impl TryFrom<i32> for RawWidth {
+    type Error = ();
+
+    fn try_from(value: i32) -> Result<Self, Self::Error> {
+        match value {
+            -1 => Ok(Self::Continuation),
+            0 => Ok(Self::Numeric),
+            1..=255 => Ok(Self::String(NonZeroU8::new(value as u8).unwrap())),
+            _ => Err(()),
+        }
+    }
+}
+
+impl Display for RawWidth {
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        match self {
+            RawWidth::Continuation => write!(f, "long string continuation"),
+            RawWidth::Numeric => write!(f, "numeric"),
+            RawWidth::String(width) => write!(f, "{width}-byte string"),
+        }
+    }
+}
+
 impl<S, V> Debug for VariableRecord<S, V>
 where
     S: Debug,
     V: Debug,
 {
     fn fmt(&self, f: &mut Formatter) -> FmtResult {
-        writeln!(
-            f,
-            "Width: {} ({})",
-            self.width,
-            match self.width.cmp(&0) {
-                Ordering::Greater => "string",
-                Ordering::Equal => "numeric",
-                Ordering::Less => "long string continuation record",
-            }
-        )?;
+        writeln!(f, "Width: {}", self.width,)?;
         writeln!(f, "Print format: {:?}", self.print_format)?;
         writeln!(f, "Write format: {:?}", self.write_format)?;
         writeln!(f, "Name: {:?}", &self.name)?;
@@ -1309,12 +1336,10 @@ impl VariableRecord<RawString, RawStr<8>> {
     fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
         let start_offset = r.stream_position()?;
         let width: i32 = endian.parse(read_bytes(r)?);
-        if !(-1..=255).contains(&width) {
-            return Err(Error::BadVariableWidth {
-                start_offset,
-                width,
-            });
-        }
+        let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth {
+            start_offset,
+            width,
+        })?;
         let code_offset = r.stream_position()?;
         let has_variable_label: u32 = endian.parse(read_bytes(r)?);
         let missing_value_code: i32 = endian.parse(read_bytes(r)?);
@@ -1580,10 +1605,10 @@ impl ValueLabelRecord<RawStr<8>, RawString> {
         let Some(&first_index) = dict_indexes.first() else {
             return Ok(None);
         };
-        let var_type = var_types.types[first_index as usize - 1].0;
+        let var_type = var_types.types[first_index as usize - 1].unwrap();
         let mut wrong_type_indexes = Vec::new();
         dict_indexes.retain(|&index| {
-            if var_types.types[index as usize - 1].0 != var_type {
+            if var_types.types[index as usize - 1] != Some(var_type) {
                 wrong_type_indexes.push(index);
                 false
             } else {
@@ -2897,7 +2922,7 @@ impl LongStringValueLabelRecord<RawString, RawString> {
 
 #[derive(Default)]
 pub struct VarTypes {
-    pub types: Vec<(VarType, usize)>,
+    pub types: Vec<Option<VarType>>,
 }
 
 impl VarTypes {
@@ -2905,16 +2930,12 @@ impl VarTypes {
         Self::default()
     }
 
-    pub fn push(&mut self, width: i32) {
-        let var_type = match width {
-            -1 => return,
-            0 => VarType::Numeric,
-            1..=255 => VarType::String,
-            _ => unreachable!(),
-        };
-        let n_values = (width as usize).div_ceil(8).max(1);
-        for i in 0..n_values {
-            self.types.push((var_type, i));
+    pub fn push(&mut self, width: RawWidth) {
+        if let Ok(var_type) = VarType::try_from(width) {
+            self.types.push(Some(var_type));
+            for _ in 1..width.n_values().unwrap() {
+                self.types.push(None);
+            }
         }
     }
 
@@ -2928,10 +2949,15 @@ impl VarTypes {
 
     pub fn var_type_at(&self, index: usize) -> Option<VarType> {
         if index >= 1 && index <= self.types.len() {
-            if let (var_type, 0) = self.types[index - 1] {
-                return Some(var_type);
-            }
+            self.types[index - 1]
+        } else {
+            None
         }
-        None
+    }
+
+    pub fn iter(&self) -> impl Iterator<Item = VarType> + use<'_> {
+        self.types
+            .iter()
+            .map(|var_type| var_type.unwrap_or(VarType::String))
     }
 }