From 8b4440aeb0ea5b9828a6d680c5be3ce6b1d2fbc7 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 19 May 2025 20:02:38 -0700 Subject: [PATCH] work --- rust/pspp/src/dictionary.rs | 129 +++++++++++++++ rust/pspp/src/format/display.rs | 42 +++++ rust/pspp/src/identifier.rs | 4 + rust/pspp/src/output/pivot/mod.rs | 23 ++- rust/pspp/src/output/pivot/test.rs | 2 +- rust/pspp/src/sys/cooked.rs | 27 ++-- rust/pspp/src/sys/raw.rs | 248 ++++++++++++++++++++--------- rust/pspp/src/sys/test.rs | 3 + 8 files changed, 387 insertions(+), 91 deletions(-) diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index f8abe996d9..bbf97749f8 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -10,6 +10,7 @@ use std::{ }; use encoding_rs::Encoding; +use enum_map::{Enum, EnumMap}; use indexmap::IndexSet; use num::integer::div_ceil; use ordered_float::OrderedFloat; @@ -19,6 +20,7 @@ use unicase::UniCase; use crate::{ format::Format, identifier::{ByIdentifier, HasIdentifier, Identifier}, + output::pivot::{Axis3, Dimension, Group, PivotTable, Value as PivotValue}, sys::raw::{Alignment, CategoryLabels, Measure, MissingValues, RawString, VarType}, }; @@ -517,6 +519,119 @@ impl Dictionary { pub fn rename_var(&mut self, index: usize, new_name: Identifier) { assert!(self.try_rename_var(index, new_name)); } + + pub fn display_variables(&self) -> DisplayVariables { + DisplayVariables::new(self) + } +} + +pub struct DisplayVariables<'a> { + dictionary: &'a Dictionary, + fields: EnumMap, +} + +impl<'a> DisplayVariables<'a> { + fn new(dictionary: &'a Dictionary) -> Self { + Self { + dictionary, + fields: EnumMap::from_fn(|_field: VariableField| true), + } + } + pub fn to_pivot_table(&self) -> PivotTable { + let mut names = Group::new("Name"); + for variable in &self.dictionary.variables { + names.push(PivotValue::new_variable(variable)); + } + + let mut attributes = Group::new("Attributes"); + let mut columns = Vec::new(); + for field in self + .fields + .iter() + .filter_map(|(field, include)| include.then_some(field)) + { + columns.push((field, attributes.len())); + attributes.push(field.as_str()); + } + + let mut pt = PivotTable::new(vec![ + (Axis3::Y, Dimension::new(names)), + (Axis3::X, Dimension::new(attributes)), + ]); + for (var_index, variable) in self.dictionary.variables.iter().enumerate() { + for (field, field_index) in &columns { + if let Some(value) = + Self::get_field_value(var_index, variable, *field, self.dictionary.encoding) + { + pt.insert(&[var_index, *field_index], value); + } + } + } + + pt + } + + fn get_field_value( + index: usize, + variable: &Variable, + field: VariableField, + encoding: &'static Encoding, + ) -> Option { + match field { + VariableField::Position => Some(PivotValue::new_integer(Some(index as f64 + 1.0))), + VariableField::Label => variable + .label() + .map(|label| PivotValue::new_user_text(label)), + VariableField::Measure => variable + .measure + .map(|measure| PivotValue::new_text(measure.as_str())), + VariableField::Role => variable + .role + .map(|role| PivotValue::new_text(role.as_str())), + VariableField::Width => { + Some(PivotValue::new_integer(Some(variable.display_width as f64))) + } + VariableField::Alignment => Some(PivotValue::new_text(variable.alignment.as_str())), + VariableField::PrintFormat => { + Some(PivotValue::new_user_text(variable.print_format.to_string())) + } + VariableField::WriteFormat => { + Some(PivotValue::new_user_text(variable.write_format.to_string())) + } + VariableField::MissingValues => Some(PivotValue::new_user_text( + variable.missing_values.display(encoding).to_string(), + )), + } + } +} + +#[derive(Copy, Clone, Debug, Enum)] +enum VariableField { + Position, + Label, + Measure, + Role, + Width, + Alignment, + PrintFormat, + WriteFormat, + MissingValues, +} + +impl VariableField { + pub fn as_str(&self) -> &'static str { + match self { + VariableField::Position => "Position", + VariableField::Label => "Label", + VariableField::Measure => "Measurement Level", + VariableField::Role => "Role", + VariableField::Width => "Width", + VariableField::Alignment => "Alignment", + VariableField::PrintFormat => "Print Format", + VariableField::WriteFormat => "Write Format", + VariableField::MissingValues => "Missing Values", + } + } } fn update_dict_index_vec(dict_indexes: &mut Vec, f: F) @@ -579,6 +694,16 @@ impl Role { _ => Err(InvalidRole::UnknownRole(integer.to_string())), } } + + fn as_str(&self) -> &'static str { + match self { + Role::Input => "Input", + Role::Target => "Target", + Role::Both => "Both", + Role::Partition => "Partition", + Role::Split => "Split", + } + } } #[derive(Clone, Debug, Default, PartialEq, Eq)] @@ -710,6 +835,10 @@ impl Variable { pub fn is_string(&self) -> bool { self.width.is_string() } + + pub fn label(&self) -> Option<&String> { + self.label.as_ref() + } } impl HasIdentifier for Variable { diff --git a/rust/pspp/src/format/display.rs b/rust/pspp/src/format/display.rs index 7b430821ad..f82a14cec5 100644 --- a/rust/pspp/src/format/display.rs +++ b/rust/pspp/src/format/display.rs @@ -36,6 +36,48 @@ impl Value { pub fn display(&self, format: Format, encoding: &'static Encoding) -> DisplayValue { DisplayValue::new(format, self, encoding) } + + pub fn display_plain(&self, encoding: &'static Encoding) -> DisplayValuePlain { + DisplayValuePlain { + value: self, + encoding, + quote_strings: true, + } + } +} + +pub struct DisplayValuePlain<'a> { + value: &'a Value, + encoding: &'static Encoding, + quote_strings: bool, +} + +impl DisplayValuePlain<'_> { + pub fn without_quotes(self) -> Self { + Self { + quote_strings: false, + ..self + } + } +} + +impl Display for DisplayValuePlain<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self.value { + Value::Number(None) => write!(f, "SYSMIS"), + Value::Number(Some(number)) if number.abs() < 0.0005 || number.abs() > 1e15 => { + write!(f, "{number:.}") + } + Value::Number(Some(number)) => write!(f, "{number:.e}"), + Value::String(string) => { + if self.quote_strings { + write!(f, "\"{}\"", string.display(self.encoding)) + } else { + string.display(self.encoding).fmt(f) + } + } + } + } } impl Display for DisplayValue<'_, '_> { diff --git a/rust/pspp/src/identifier.rs b/rust/pspp/src/identifier.rs index b3ce546bc9..fba54d7ef5 100644 --- a/rust/pspp/src/identifier.rs +++ b/rust/pspp/src/identifier.rs @@ -292,6 +292,10 @@ impl Identifier { pub fn class(&self) -> Class { self.into() } + + pub fn as_str(&self) -> &str { + self.0.as_ref() + } } impl PartialEq for Identifier { diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index 0160885211..b83574acd8 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -52,7 +52,7 @@ use thiserror::Error as ThisError; use tlo::parse_tlo; use crate::{ - dictionary::Value as DataValue, + dictionary::{Value as DataValue, Variable}, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, settings::{Settings, Show}, sys::raw::VarType, @@ -62,7 +62,7 @@ pub mod output; mod look_xml; #[cfg(test)] -mod test; +pub mod test; mod tlo; /// Areas of a pivot table for styling purposes. @@ -573,9 +573,15 @@ impl From for Category { } } +impl From for Category { + fn from(group: Leaf) -> Self { + Self::Leaf(group) + } +} + impl From for Category { fn from(name: Value) -> Self { - Self::Leaf(Leaf::new(name)) + Leaf::new(name).into() } } @@ -1457,11 +1463,11 @@ impl PivotTable { cell_index(data_indexes, self.dimensions.iter().map(|d| d.len())) } - fn insert(&mut self, data_indexes: &[usize], value: Value) { + pub fn insert(&mut self, data_indexes: &[usize], value: Value) { self.cells.insert(self.cell_index(data_indexes), value); } - fn get(&self, data_indexes: &[usize]) -> Option<&Value> { + pub fn get(&self, data_indexes: &[usize]) -> Option<&Value> { self.cells.get(&self.cell_index(data_indexes)) } @@ -1714,6 +1720,13 @@ impl Value { value_label: None, })) } + pub fn new_variable(variable: &Variable) -> Self { + Self::new(ValueInner::Variable(VariableValue { + show: None, + var_name: String::from(variable.name.as_str()), + variable_label: variable.label.clone(), + })) + } pub fn new_number(x: Option) -> Self { Self::new_number_with_format(x, Format::F8_2) } diff --git a/rust/pspp/src/output/pivot/test.rs b/rust/pspp/src/output/pivot/test.rs index 5788d03288..84c8117ac9 100644 --- a/rust/pspp/src/output/pivot/test.rs +++ b/rust/pspp/src/output/pivot/test.rs @@ -123,7 +123,7 @@ fn d2(title: &str, axes: [Axis3; 2], dimension_labels: Option) -> } #[track_caller] -fn assert_rendering(name: &str, pivot_table: &PivotTable, expected: &str) { +pub fn assert_rendering(name: &str, pivot_table: &PivotTable, expected: &str) { let actual = pivot_table.to_string(); if actual != expected { eprintln!("Unexpected pivot table rendering:\n--- expected\n+++ actual"); diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index ac7272dab0..13be160ca5 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -9,14 +9,17 @@ use crate::{ endian::Endian, format::{Error as FormatError, Format, UncheckedFormat}, identifier::{ByIdentifier, Error as IdError, Identifier}, - sys::encoding::Error as EncodingError, - sys::raw::{ - self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, - FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord, - LongStringMissingValueRecord, LongStringValueLabelRecord, MissingValues, - MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawStrArray, RawWidth, - ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributeRecord, VariableRecord, - VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer, + sys::{ + encoding::Error as EncodingError, + raw::{ + self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, + FileAttributeRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, + LongNamesRecord, LongStringMissingValueRecord, LongStringValueLabelRecord, + MissingValues, MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, + RawStrArray, RawString, RawWidth, ValueLabel, ValueLabelRecord, VarDisplayRecord, + VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, + ZHeader, ZTrailer, + }, }, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; @@ -750,7 +753,6 @@ pub fn decode( } } - let mut value = Vec::new(); for record in headers .long_string_missing_values .drain(..) @@ -764,10 +766,9 @@ pub fn decode( .missing_values .into_iter() .map(|v| { - value.clear(); - value.extend_from_slice(v.0.as_slice()); - value.resize(variable.width.as_string_width().unwrap(), b' '); - Value::String(Box::from(value.as_slice())) + let mut value = RawString::from(v.0.as_slice()); + value.resize(variable.width.as_string_width().unwrap()); + Value::String(value) }) .collect::>(); variable.missing_values = MissingValues { diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 2a9113a736..dbb0d3de9e 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -9,14 +9,14 @@ use encoding_rs::{mem::decode_latin1, Encoding}; use flate2::read::ZlibDecoder; use num::Integer; use std::{ - borrow::Cow, + borrow::{Borrow, Cow}, cell::RefCell, collections::{HashMap, VecDeque}, fmt::{Debug, Display, Formatter, Result as FmtResult}, io::{Error as IoError, Read, Seek, SeekFrom}, mem::take, num::NonZeroU8, - ops::Range, + ops::{Deref, Range}, rc::Rc, str::from_utf8, }; @@ -1121,90 +1121,29 @@ fn format_name(type_: u32) -> Cow<'static, str> { } #[derive(Clone, Default)] -pub struct MissingValues> -where - S: Debug, -{ +pub struct MissingValues { /// Individual missing values, up to 3 of them. - pub values: Vec>, + pub values: Vec, /// Optional range of missing values. pub range: Option, } -#[derive(Copy, Clone)] -pub enum MissingValueRange { - In { low: f64, high: f64 }, - From { low: f64 }, - To { high: f64 }, -} - -impl MissingValueRange { - pub fn new(low: f64, high: f64) -> Self { - const LOWEST: f64 = f64::MIN.next_up(); - match (low, high) { - (f64::MIN | LOWEST, _) => Self::To { high }, - (_, f64::MAX) => Self::From { low }, - (_, _) => Self::In { low, high }, - } - } - - pub fn low(&self) -> Option { - match self { - MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low), - MissingValueRange::To { .. } => None, - } - } - - pub fn high(&self) -> Option { - match self { - MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high), - MissingValueRange::From { .. } => None, - } - } -} - -impl Debug for MissingValues -where - S: Debug, -{ +impl Debug for MissingValues { fn fmt(&self, f: &mut Formatter) -> FmtResult { - for (i, value) in self.values.iter().enumerate() { - if i > 0 { - write!(f, ", ")?; - } - write!(f, "{value:?}")?; - } - - if let Some(range) = &self.range { - if !self.values.is_empty() { - write!(f, ", ")?; - } - match range { - MissingValueRange::In { low, high } => write!(f, "{low:?} THRU {high:?}")?, - MissingValueRange::From { low } => write!(f, "{low:?} THRU HI")?, - MissingValueRange::To { high } => write!(f, "LOW THRU {high:?}")?, - } + DisplayMissingValues { + mv: self, + encoding: None, } - - if self.is_empty() { - write!(f, "none")?; - } - - Ok(()) + .fmt(f) } } -impl MissingValues -where - S: Debug, -{ +impl MissingValues { fn is_empty(&self) -> bool { self.values.is_empty() && self.range.is_none() } -} -impl MissingValues { fn read( r: &mut R, offset: u64, @@ -1250,7 +1189,7 @@ impl MissingValues { let width = width.min(8) as usize; let values = values .into_iter() - .map(|value| Value::String(Box::from(&value[..width]))) + .map(|value| Value::String(RawString::from(&value[..width]))) .collect(); return Ok(Self { values, @@ -1261,6 +1200,86 @@ impl MissingValues { } Ok(Self::default()) } + + pub fn display(&self, encoding: &'static Encoding) -> DisplayMissingValues<'_> { + DisplayMissingValues { + mv: self, + encoding: Some(encoding), + } + } +} + +pub struct DisplayMissingValues<'a> { + mv: &'a MissingValues, + encoding: Option<&'static Encoding>, +} + +impl<'a> Display for DisplayMissingValues<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + for (i, value) in self.mv.values.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + match self.encoding { + Some(encoding) => value.display_plain(encoding).fmt(f)?, + None => value.fmt(f)?, + } + } + + if let Some(range) = &self.mv.range { + if !self.mv.values.is_empty() { + write!(f, ", ")?; + } + write!(f, "{range}")?; + } + + if self.mv.is_empty() { + write!(f, "none")?; + } + Ok(()) + } +} + +#[derive(Copy, Clone)] +pub enum MissingValueRange { + In { low: f64, high: f64 }, + From { low: f64 }, + To { high: f64 }, +} + +impl MissingValueRange { + pub fn new(low: f64, high: f64) -> Self { + const LOWEST: f64 = f64::MIN.next_up(); + match (low, high) { + (f64::MIN | LOWEST, _) => Self::To { high }, + (_, f64::MAX) => Self::From { low }, + (_, _) => Self::In { low, high }, + } + } + + pub fn low(&self) -> Option { + match self { + MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low), + MissingValueRange::To { .. } => None, + } + } + + pub fn high(&self) -> Option { + match self { + MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high), + MissingValueRange::From { .. } => None, + } + } +} + +impl Display for MissingValueRange { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self { + MissingValueRange::In { low, high } => write!(f, "{low:?} THRU {high:?}"), + MissingValueRange::From { low } => write!(f, "{low:?} THRU HI"), + MissingValueRange::To { high } => write!(f, "LOW THRU {high:?}"), + } + } } #[derive(Clone)] @@ -1442,6 +1461,14 @@ impl Debug for UntypedValue { } } +/// An owned string in an unspecified encoding. +/// +/// We assume that the encoding is one supported by [encoding_rs] with byte +/// units (that is, not a `UTF-16` encoding). All of these encodings have some +/// basic ASCII compatibility. +/// +/// A [RawString] owns its contents and can grow and shrink, like a [Vec] or +/// [String]. For a borrowed raw string, see [RawStr]. #[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)] pub struct RawString(pub Vec); @@ -1452,6 +1479,23 @@ impl RawString { pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { EncodedStr::new(&self.0, encoding) } + pub fn resize(&mut self, len: usize) { + self.0.resize(len, b' '); + } +} + +impl Borrow for RawString { + fn borrow(&self) -> &RawStr { + RawStr::from_bytes(&self.0) + } +} + +impl Deref for RawString { + type Target = RawStr; + + fn deref(&self) -> &Self::Target { + self.borrow() + } } impl From> for RawString { @@ -1478,6 +1522,50 @@ impl Debug for RawString { } } +/// A borrowed string in an unspecified encoding. +/// +/// We assume that the encoding is one supported by [encoding_rs] with byte +/// units (that is, not a `UTF-16` encoding). All of these encodings have some +/// basic ASCII compatibility. +/// +/// For an owned raw string, see [RawString]. +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct RawStr(pub [u8]); + +impl RawStr { + pub fn from_bytes(bytes: &[u8]) -> &Self { + // SAFETY: `RawStr` is a transparent wrapper around `[u8]`, so we can + // turn a reference to the wrapped type into a reference to the wrapper + // type. + unsafe { &*(bytes as *const [u8] as *const Self) } + } + + pub fn as_bytes(&self) -> &[u8] { + &self.0 + } + + /// Returns an object that implements [Display] for printing this [RawStr], + /// given that it is encoded in `encoding`. + pub fn display(&self, encoding: &'static Encoding) -> DisplayRawString { + DisplayRawString(encoding.decode_without_bom_handling(&self.0).0) + } +} + +pub struct DisplayRawString<'a>(Cow<'a, str>); + +impl<'a> Display for DisplayRawString<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{}", &self.0) + } +} + +impl Debug for RawStr { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", default_decode(self.as_bytes())) + } +} + #[derive(Copy, Clone)] pub struct RawStrArray(pub [u8; N]); @@ -2127,6 +2215,14 @@ impl Measure { _ => Err(Warning::InvalidMeasurement(source)), } } + + pub fn as_str(&self) -> &'static str { + match self { + Measure::Nominal => "Nominal", + Measure::Ordinal => "Ordinal", + Measure::Scale => "Scale", + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -2153,6 +2249,14 @@ impl Alignment { VarType::String => Self::Left, } } + + pub fn as_str(&self) -> &'static str { + match self { + Alignment::Left => "Left", + Alignment::Right => "Right", + Alignment::Center => "Center", + } + } } #[derive(Clone, Debug)] diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 2238194093..895ba496bd 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -2,6 +2,7 @@ use std::io::Cursor; use crate::{ endian::Endian, + output::pivot::test::assert_rendering, sys::{ cooked::{decode, Headers}, raw::{encoding_from_headers, Decoder, Reader, Record}, @@ -151,4 +152,6 @@ s16 "23456789abc"; s32 "defghijklmnopqstuvwxyzABC"; assert_eq!(metadata.n_cases, Some(1)); assert_eq!(metadata.version, Some((1, 2, 3))); println!("{metadata:#?}"); + let pt = dictionary.display_variables().to_pivot_table(); + assert_rendering("variable_labels_and_missing_values", &pt, ""); } -- 2.30.2