From: Ben Pfaff Date: Tue, 5 Aug 2025 15:16:42 +0000 (-0700) Subject: move variable into its own module X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=781a07a75ae5e8b43427996a65979c79a8912319;p=pspp move variable into its own module --- diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index fe77681998..f17441070e 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -44,7 +44,7 @@ use serde::{ }; use crate::{ - dictionary::{VarType, VarWidth}, + variable::{VarType, VarWidth}, format::DisplayPlain, }; diff --git a/rust/pspp/src/data/encoded.rs b/rust/pspp/src/data/encoded.rs index 3584d05070..0bb0041cac 100644 --- a/rust/pspp/src/data/encoded.rs +++ b/rust/pspp/src/data/encoded.rs @@ -10,7 +10,7 @@ use serde::Serialize; use crate::{ data::{ByteCow, ByteStr, ByteString, MutRawString, Quoted, RawString, ResizeError}, - dictionary::VarWidth, + variable::VarWidth, }; pub trait Encoded { diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index 4608d2dda3..ae74dfd1da 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -14,22 +14,18 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . -//! Dictionaries and variables. +//! Dictionaries. use core::str; use std::{ borrow::Cow, - collections::{btree_set, BTreeMap, BTreeSet, HashMap, HashSet}, - fmt::{Debug, Display, Formatter, Result as FmtResult}, - hash::{DefaultHasher, Hash, Hasher}, - ops::{Bound, Deref, Index, Not, RangeBounds, RangeInclusive}, - str::FromStr, + collections::{btree_set, BTreeSet, HashSet}, + ops::{Bound, Index, RangeBounds, RangeInclusive}, }; use encoding_rs::{Encoding, UTF_8}; use enum_map::{Enum, EnumMap}; use indexmap::IndexSet; -use num::integer::div_ceil; use serde::{ ser::{SerializeMap, SerializeSeq, SerializeStruct}, Serialize, @@ -39,263 +35,18 @@ use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ - data::{ByteString, Datum, Encoded, EncodedString, ResizeError, WithEncoding}, - format::{DisplayPlain, Format}, + data::{ByteString, Datum}, identifier::{ByIdentifier, HasIdentifier, Identifier}, output::pivot::{ Axis3, Dimension, Display26Adic, Footnote, Footnotes, Group, PivotTable, Value, }, settings::Show, + variable::{Attributes, VarWidth, Variable}, }; /// An index within [Dictionary::variables]. pub type DictIndex = usize; -/// Variable type. -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] -pub enum VarType { - /// A numeric variable. - Numeric, - - /// A string variable. - String, -} - -impl Not for VarType { - type Output = Self; - - fn not(self) -> Self::Output { - match self { - Self::Numeric => Self::String, - Self::String => Self::Numeric, - } - } -} - -impl Not for &VarType { - type Output = VarType; - - fn not(self) -> Self::Output { - !*self - } -} - -impl Display for VarType { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - VarType::Numeric => write!(f, "numeric"), - VarType::String => write!(f, "string"), - } - } -} - -/// [VarType], plus a width for [VarType::String]. -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)] -pub enum VarWidth { - Numeric, - String(u16), -} - -impl VarWidth { - pub const MAX_STRING: u16 = 32767; - - pub fn n_dict_indexes(self) -> usize { - match self { - VarWidth::Numeric => 1, - VarWidth::String(w) => div_ceil(w as usize, 8), - } - } - - fn width_predicate( - a: Option, - b: Option, - f: impl Fn(u16, u16) -> u16, - ) -> Option { - match (a, b) { - (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), - (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { - Some(VarWidth::String(f(a, b))) - } - _ => None, - } - } - - /// Returns the wider of `self` and `other`: - /// - Numerical variable widths are equally wide. - /// - Longer strings are wider than shorter strings. - /// - Numerical and string types are incomparable, so result in `None`. - /// - Any `None` in the input yields `None` in the output. - pub fn wider(a: Option, b: Option) -> Option { - Self::width_predicate(a, b, |a, b| a.max(b)) - } - - /// Returns the narrower of `self` and `other` (see [`Self::wider`]). - pub fn narrower(a: Option, b: Option) -> Option { - Self::width_predicate(a, b, |a, b| a.min(b)) - } - - pub fn default_display_width(&self) -> u32 { - match self { - VarWidth::Numeric => 8, - VarWidth::String(width) => *width.min(&32) as u32, - } - } - - pub fn is_long_string(&self) -> bool { - if let Self::String(width) = self { - *width > 8 - } else { - false - } - } - - pub fn as_string_width(&self) -> Option { - match self { - VarWidth::Numeric => None, - VarWidth::String(width) => Some(*width as usize), - } - } - - pub fn is_numeric(&self) -> bool { - *self == Self::Numeric - } - - pub fn is_string(&self) -> bool { - !self.is_numeric() - } - - /// Returns true if this is a very long string width, meaning wider than 255 - /// bytes, which was the limit for old versions of SPSS. - pub fn is_very_long_string(&self) -> bool { - match *self { - VarWidth::Numeric => false, - VarWidth::String(width) => width > 255, - } - } - - /// Number of bytes per segment by which the amount of space for very long - /// string variables is allocated. - pub const SEGMENT_SIZE: usize = 252; - - /// Returns an iterator over the "segments" used for writing case data for a - /// variable with this width. A segment is a physical variable in the - /// system file that represents some piece of a logical variable as seen by - /// a PSPP user. Most variables have one segment whose width is their own - /// width, but very long string variables, with width greater than 255, have - /// multiple segments each with width 255 or less. - pub fn segments(&self) -> Segments { - Segments::new(*self) - } - - /// Returns the number of 8-byte chunks used for writing case data for a - /// variable with this width. Very long string variables (wider than 255 - /// bytes) cannot directly be divided into chunks: they must first be - /// divided into multiple [segments](Self::segments), which can then be - /// divided into chunks. - pub fn n_chunks(&self) -> Option { - match *self { - VarWidth::Numeric => Some(1), - VarWidth::String(w) if w <= 255 => Some(w.div_ceil(8) as usize), - VarWidth::String(_) => None, - } - } - - /// Returns the width to allocate to the segment with the given - /// `segment_idx` within this variable. A segment is a physical variable in - /// the system file that represents some piece of a logical variable as seen - /// by a PSPP user. - pub fn segment_alloc_width(&self, segment_idx: usize) -> usize { - debug_assert!(segment_idx < self.segments().len()); - debug_assert!(self.is_very_long_string()); - - if segment_idx < self.segments().len() - 1 { - 255 - } else { - self.as_string_width().unwrap() - segment_idx * Self::SEGMENT_SIZE - } - } - - pub fn display_adjective(&self) -> VarWidthAdjective { - VarWidthAdjective(*self) - } - - pub fn codepage_to_unicode(&mut self) { - match self { - VarWidth::Numeric => (), - VarWidth::String(width) => *width = width.saturating_mul(3).min(Self::MAX_STRING), - } - } -} - -pub struct Segments { - width: VarWidth, - i: usize, - n: usize, -} -impl Segments { - pub fn new(width: VarWidth) -> Self { - Self { - width, - i: 0, - n: if width.is_very_long_string() { - width - .as_string_width() - .unwrap() - .div_ceil(VarWidth::SEGMENT_SIZE) - } else { - 1 - }, - } - } -} - -impl Iterator for Segments { - type Item = VarWidth; - - fn next(&mut self) -> Option { - let i = self.i; - if i >= self.n { - None - } else { - self.i += 1; - match self.width { - VarWidth::Numeric => Some(VarWidth::Numeric), - VarWidth::String(_) if i < self.n - 1 => Some(VarWidth::String(255)), - VarWidth::String(width) => Some(VarWidth::String( - width - (self.n as u16 - 1) * VarWidth::SEGMENT_SIZE as u16, - )), - } - } - } - - fn size_hint(&self) -> (usize, Option) { - let n = self.n - self.i; - (n, Some(n)) - } -} - -impl ExactSizeIterator for Segments {} - -impl From for VarType { - fn from(source: VarWidth) -> Self { - match source { - VarWidth::Numeric => VarType::Numeric, - VarWidth::String(_) => VarType::String, - } - } -} - -pub struct VarWidthAdjective(VarWidth); - -impl Display for VarWidthAdjective { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self.0 { - VarWidth::Numeric => write!(f, "numeric"), - VarWidth::String(width) => write!(f, "{width}-byte string"), - } - } -} - /// A collection of variables, plus additional metadata. #[derive(Clone, Debug)] pub struct Dictionary { @@ -506,9 +257,9 @@ impl Dictionary { /// the same name (or a variant with different case), or if `variable`'s /// encoding differs from the dictionary's. pub fn add_var(&mut self, variable: Variable) -> Result { - if variable.encoding != self.encoding { + if variable.encoding() != self.encoding { Err(AddVarError::WrongEncoding { - var_encoding: variable.encoding, + var_encoding: variable.encoding(), dict_encoding: self.encoding, }) } else { @@ -948,8 +699,8 @@ impl<'a> OutputVariables<'a> { VariableField::WriteFormat => { Some(Value::new_user_text(variable.write_format.to_string())) } - VariableField::MissingValues if !variable.missing_values.is_empty() => { - Some(Value::new_user_text(variable.missing_values.to_string())) + VariableField::MissingValues if !variable.missing_values().is_empty() => { + Some(Value::new_user_text(variable.missing_values().to_string())) } VariableField::MissingValues => None, } @@ -992,7 +743,7 @@ impl<'a> OutputValueLabels<'a> { let mut value = Value::new_variable_value(variable, datum) .with_show_value_label(Some(Show::Value)); if variable - .missing_values + .missing_values() .contains(&datum.as_encoded(variable.encoding())) { value.add_footnote(&missing_footnote); @@ -1232,226 +983,6 @@ where }); } -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize)] -pub enum Role { - #[default] - Input, - Target, - Both, - None, - Partition, - Split, -} - -impl Role { - fn as_str(&self) -> &'static str { - match self { - Role::Input => "Input", - Role::Target => "Target", - Role::Both => "Both", - Role::None => "None", - Role::Partition => "Partition", - Role::Split => "Split", - } - } -} - -impl FromStr for Role { - type Err = InvalidRole; - - fn from_str(s: &str) -> Result { - for (string, value) in [ - ("input", Role::Input), - ("target", Role::Target), - ("both", Role::Both), - ("none", Role::None), - ("partition", Role::Partition), - ("split", Role::Split), - ] { - if string.eq_ignore_ascii_case(s) { - return Ok(value); - } - } - Err(InvalidRole::UnknownRole(s.into())) - } -} - -impl TryFrom for Role { - type Error = InvalidRole; - - fn try_from(value: i32) -> Result { - match value { - 0 => Ok(Role::Input), - 1 => Ok(Role::Target), - 2 => Ok(Role::Both), - 3 => Ok(Role::None), - 4 => Ok(Role::Partition), - 5 => Ok(Role::Split), - _ => Err(InvalidRole::UnknownRole(value.to_string())), - } - } -} - -impl From for i32 { - fn from(value: Role) -> Self { - match value { - Role::Input => 0, - Role::Target => 1, - Role::Both => 2, - Role::None => 3, - Role::Partition => 4, - Role::Split => 5, - } - } -} - -#[derive(Clone, Default, PartialEq, Eq, Serialize)] -pub struct Attributes(pub BTreeMap>); - -impl Attributes { - pub fn new() -> Self { - Self(BTreeMap::new()) - } - - pub fn contains_name(&self, name: &Identifier) -> bool { - self.0.contains_key(name) - } - - pub fn insert(&mut self, name: Identifier, values: Vec) { - self.0.insert(name, values); - } - - pub fn with(mut self, name: Identifier, values: Vec) -> Self { - self.insert(name, values); - self - } - - pub fn append(&mut self, other: &mut Self) { - self.0.append(&mut other.0) - } - - pub fn role(&self) -> Result, InvalidRole> { - self.try_into() - } - - pub fn iter(&self, include_at: bool) -> impl Iterator { - self.0.iter().filter_map(move |(name, values)| { - if include_at || !name.0.starts_with('@') { - Some((name, values.as_slice())) - } else { - None - } - }) - } - - pub fn has_any(&self, include_at: bool) -> bool { - self.iter(include_at).next().is_some() - } - - pub fn codepage_to_unicode(&mut self) { - let mut new = BTreeMap::new(); - while let Some((mut name, value)) = self.0.pop_first() { - name.codepage_to_unicode(); - new.insert(name, value); - } - self.0 = new; - } -} - -impl Debug for Attributes { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - self.0.fmt(f) - } -} - -#[derive(Clone, Debug, ThisError, PartialEq, Eq)] -pub enum InvalidRole { - #[error("Unknown role {0:?}.")] - UnknownRole(String), - - #[error("Role attribute $@Role must have exactly one value (not {0}).")] - InvalidValues(usize), -} - -impl TryFrom<&Attributes> for Option { - type Error = InvalidRole; - - fn try_from(value: &Attributes) -> Result { - let role = Identifier::new("$@Role").unwrap(); - value.0.get(&role).map_or(Ok(None), |attribute| { - if let Ok([string]) = <&[String; 1]>::try_from(attribute.as_slice()) { - match string.parse::() { - Ok(integer) => Ok(Some(Role::try_from(integer)?)), - Err(_) => Err(InvalidRole::UnknownRole(string.clone())), - } - } else { - Err(InvalidRole::InvalidValues(attribute.len())) - } - }) - } -} - -/// A variable, usually inside a [Dictionary]. -#[derive(Clone, Debug, Serialize)] -pub struct Variable { - /// The variable's name. - /// - /// PSPP variable names are case-insensitive. - pub name: Identifier, - - /// Variable width. - pub width: VarWidth, - - /// User-missing values. - /// - /// Numeric variables also have a system-missing value (represented as - /// `None`). - /// - /// Both kinds of missing values are excluded from most analyses. - missing_values: MissingValues, - - /// Output format used in most contexts. - pub print_format: Format, - - /// Output format used on the `WRITE` command. - pub write_format: Format, - - /// Value labels. - pub value_labels: ValueLabels, - - /// Variable label, an optional meaningful description for the variable - /// itself. - pub label: Option, - - /// Measurement level for the variable's data. - pub measure: Option, - - /// Role in data analysis. - pub role: Role, - - /// Width of data column in GUI. - pub display_width: u32, - - /// Data alignment in GUI. - pub alignment: Alignment, - - /// Whether to retain values of the variable from one case to the next. - pub leave: bool, - - /// For compatibility with old software that supported at most 8-character - /// variable names. - pub short_names: Vec, - - /// Variable attributes. - pub attributes: Attributes, - - /// Encoding for [Value]s inside this variable. - /// - /// The variables in a [Dictionary] must all use the same encoding as the - /// dictionary. - encoding: &'static Encoding, -} - pub fn escape_value_label(unescaped: &str) -> Cow<'_, str> { if unescaped.contains("\n") { unescaped.replace("\n", "\\n").into() @@ -1468,88 +999,6 @@ pub fn unescape_value_label(escaped: &str) -> Cow<'_, str> { } } -impl Variable { - pub fn new(name: Identifier, width: VarWidth, encoding: &'static Encoding) -> Self { - let var_type = VarType::from(width); - let leave = name.class().must_leave(); - Self { - name, - width, - missing_values: MissingValues::default(), - print_format: Format::default_for_width(width), - write_format: Format::default_for_width(width), - value_labels: ValueLabels::new(), - label: None, - measure: Measure::default_for_type(var_type), - role: Role::default(), - display_width: width.default_display_width(), - alignment: Alignment::default_for_type(var_type), - leave, - short_names: Vec::new(), - attributes: Attributes::new(), - encoding, - } - } - - pub fn encoding(&self) -> &'static Encoding { - self.encoding - } - - pub fn is_numeric(&self) -> bool { - self.width.is_numeric() - } - - pub fn is_string(&self) -> bool { - self.width.is_string() - } - - pub fn label(&self) -> Option<&String> { - self.label.as_ref() - } - - pub fn resize(&mut self, width: VarWidth) { - let _ = self.missing_values.resize(width); - - self.value_labels.resize(width); - - self.print_format.resize(width); - self.write_format.resize(width); - - self.width = width; - } - - pub fn missing_values(&self) -> &MissingValues { - &self.missing_values - } - - pub fn missing_values_mut(&mut self) -> MissingValuesMut<'_> { - MissingValuesMut { - inner: &mut self.missing_values, - width: self.width, - } - } - - pub fn codepage_to_unicode(&mut self) { - self.name.codepage_to_unicode(); - self.width.codepage_to_unicode(); - self.missing_values.codepage_to_unicode(); - self.print_format.codepage_to_unicode(); - self.write_format.codepage_to_unicode(); - self.attributes.codepage_to_unicode(); - self.encoding = UTF_8; - - // Anything old enough to not support long names is old enough not to - // support Unicode. - self.short_names.clear(); - } -} - -impl HasIdentifier for Variable { - fn identifier(&self) -> &UniCase { - &self.name.0 - } -} - #[derive(Clone, Debug)] pub struct DictIndexVector { pub name: Identifier, @@ -2058,419 +1507,6 @@ impl DictIndexVariableSet { } } -/// Associates values of a variable with meaningful labels. -/// -/// For example, 1 => strongly disagree, 2 => disagree, 3 => neither agree nor -/// disagree, ... -#[derive(Clone, Default, PartialEq, Eq, Serialize)] -pub struct ValueLabels(pub HashMap, String>); - -impl ValueLabels { - pub fn new() -> Self { - Self::default() - } - - pub fn is_empty(&self) -> bool { - self.0.is_empty() - } - - pub fn get(&self, value: &Datum) -> Option<&str> { - self.0.get(value).map(|s| s.as_str()) - } - - pub fn insert(&mut self, value: Datum, label: String) -> Option { - self.0.insert(value, label) - } - - pub fn is_resizable(&self, width: VarWidth) -> bool { - self.0.keys().all(|datum| datum.is_resizable(width)) - } - - pub fn resize(&mut self, width: VarWidth) { - self.0 = self - .0 - .drain() - .filter_map(|(mut datum, string)| datum.resize(width).is_ok().then(|| (datum, string))) - .collect(); - } - - pub fn codepage_to_unicode(&mut self, encoding: &'static Encoding) { - self.0 = self - .0 - .drain() - .map(|(key, value)| { - let mut key = key.with_encoding(encoding); - key.codepage_to_unicode(); - (key.without_encoding(), value) - }) - .collect(); - } -} - -impl Debug for ValueLabels { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - self.0.fmt(f) - } -} - -impl Hash for ValueLabels { - fn hash(&self, state: &mut H) { - let mut hash = 0; - for (k, v) in &self.0 { - let mut hasher = DefaultHasher::new(); - k.hash(&mut hasher); - v.hash(&mut hasher); - hash ^= hasher.finish(); - } - state.write_u64(hash); - } -} - -pub struct MissingValuesMut<'a> { - inner: &'a mut MissingValues, - width: VarWidth, -} - -impl<'a> Deref for MissingValuesMut<'a> { - type Target = MissingValues; - - fn deref(&self) -> &Self::Target { - self.inner - } -} - -impl<'a> MissingValuesMut<'a> { - pub fn replace(&mut self, mut new: MissingValues) -> Result<(), MissingValuesError> { - new.resize(self.width)?; - *self.inner = new; - Ok(()) - } - - pub fn add_value( - &mut self, - mut value: Datum>, - ) -> Result<(), MissingValuesError> { - if self.inner.values.len() > 2 - || (self.inner.range().is_some() && self.inner.values.len() > 1) - { - Err(MissingValuesError::TooMany) - } else if value.var_type() != VarType::from(self.width) { - Err(MissingValuesError::MixedTypes) - } else if value == Datum::Number(None) { - Err(MissingValuesError::SystemMissing) - } else if value.resize(self.width.min(VarWidth::String(8))).is_err() { - Err(MissingValuesError::TooWide) - } else { - value.trim_end(); - self.inner.values.push(value); - Ok(()) - } - } - - pub fn add_values( - &mut self, - values: impl IntoIterator>>, - ) -> Result<(), MissingValuesError> { - let n = self.inner.values.len(); - for value in values { - self.add_value(value) - .inspect_err(|_| self.inner.values.truncate(n))?; - } - Ok(()) - } - - pub fn add_range(&mut self, range: MissingValueRange) -> Result<(), MissingValuesError> { - if self.inner.range.is_some() || self.inner.values().len() > 1 { - Err(MissingValuesError::TooMany) - } else if self.width != VarWidth::Numeric { - Err(MissingValuesError::MixedTypes) - } else { - self.inner.range = Some(range); - Ok(()) - } - } -} - -#[derive(Clone, Default, Serialize, PartialEq)] -pub struct MissingValues { - /// Individual missing values, up to 3 of them. - values: Vec>>, - - /// Optional range of missing values. - range: Option, -} - -impl Debug for MissingValues { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", self) - } -} - -impl Display for MissingValues { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - if let Some(range) = &self.range { - write!(f, "{range}")?; - if !self.values.is_empty() { - write!(f, "; ")?; - } - } - - for (i, value) in self.values.iter().enumerate() { - if i > 0 { - write!(f, "; ")?; - } - write!(f, "{}", value.quoted())?; - } - - if self.is_empty() { - write!(f, "none")?; - } - Ok(()) - } -} - -#[derive(Copy, Clone, Debug)] -pub enum MissingValuesError { - TooMany, - TooWide, - MixedTypes, - SystemMissing, -} - -impl From for MissingValuesError { - fn from(value: ResizeError) -> Self { - match value { - ResizeError::MixedTypes => MissingValuesError::MixedTypes, - ResizeError::TooWide => MissingValuesError::TooWide, - } - } -} - -impl MissingValues { - pub fn clear(&mut self) { - *self = Self::default(); - } - pub fn values(&self) -> &[Datum>] { - &self.values - } - - pub fn range(&self) -> Option<&MissingValueRange> { - self.range.as_ref() - } - - pub fn new( - mut values: Vec>>, - range: Option, - ) -> Result { - if values.len() > 3 { - return Err(MissingValuesError::TooMany); - } - - let mut var_type = None; - for value in values.iter_mut() { - value.trim_end(); - if value.width().is_long_string() { - return Err(MissingValuesError::TooWide); - } - if var_type.is_some_and(|t| t != value.var_type()) { - return Err(MissingValuesError::MixedTypes); - } - var_type = Some(value.var_type()); - } - - if var_type == Some(VarType::String) && range.is_some() { - return Err(MissingValuesError::MixedTypes); - } - - Ok(Self { values, range }) - } - - pub fn is_empty(&self) -> bool { - self.values.is_empty() && self.range.is_none() - } - - pub fn var_type(&self) -> Option { - if let Some(datum) = self.values.first() { - Some(datum.var_type()) - } else if self.range.is_some() { - Some(VarType::Numeric) - } else { - None - } - } - - pub fn contains(&self, value: &Datum) -> bool - where - S: EncodedString, - { - if self - .values - .iter() - .any(|datum| datum.eq_ignore_trailing_spaces(value)) - { - return true; - } - - if let Some(Some(number)) = value.as_number() - && let Some(range) = self.range - { - range.contains(number) - } else { - false - } - } - - pub fn resize(&mut self, width: VarWidth) -> Result<(), MissingValuesError> { - fn inner(this: &mut MissingValues, width: VarWidth) -> Result<(), MissingValuesError> { - for datum in &mut this.values { - datum.resize(width)?; - datum.trim_end(); - } - if let Some(range) = &mut this.range { - range.resize(width)?; - } - Ok(()) - } - inner(self, width).inspect_err(|_| self.clear()) - } - - pub fn codepage_to_unicode(&mut self) { - self.values = self - .values - .drain(..) - .map(|value| match value { - Datum::Number(number) => Datum::Number(number), - Datum::String(s) => Datum::String(if s.encoding() != UTF_8 { - let mut new_s = ByteString::from(s.as_str()); - new_s.0.truncate(8); - WithEncoding::new(new_s, UTF_8) - } else { - s - }), - }) - .collect(); - } -} - -#[derive(Copy, Clone, Debug, Serialize, PartialEq)] -pub enum MissingValueRange { - In { low: f64, high: f64 }, - From { low: f64 }, - To { high: f64 }, -} - -impl MissingValueRange { - pub fn new(low: f64, high: f64) -> Self { - const LOWEST: f64 = f64::MIN.next_up(); - match (low, high) { - (f64::MIN | LOWEST, _) => Self::To { high }, - (_, f64::MAX) => Self::From { low }, - (_, _) => Self::In { low, high }, - } - } - - pub fn low(&self) -> Option { - match self { - MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low), - MissingValueRange::To { .. } => None, - } - } - - pub fn high(&self) -> Option { - match self { - MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high), - MissingValueRange::From { .. } => None, - } - } - - pub fn contains(&self, number: f64) -> bool { - match self { - MissingValueRange::In { low, high } => (*low..*high).contains(&number), - MissingValueRange::From { low } => number >= *low, - MissingValueRange::To { high } => number <= *high, - } - } - - pub fn resize(&self, width: VarWidth) -> Result<(), MissingValuesError> { - if width.is_numeric() { - Ok(()) - } else { - Err(MissingValuesError::MixedTypes) - } - } -} - -impl Display for MissingValueRange { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self.low() { - Some(low) => low.display_plain().fmt(f)?, - None => write!(f, "LOW")?, - } - - write!(f, " THRU ")?; - - match self.high() { - Some(high) => high.display_plain().fmt(f)?, - None => write!(f, "HIGH")?, - } - Ok(()) - } -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] -pub enum Alignment { - Left, - Right, - Center, -} - -impl Alignment { - pub fn default_for_type(var_type: VarType) -> Self { - match var_type { - VarType::Numeric => Self::Right, - VarType::String => Self::Left, - } - } - - pub fn as_str(&self) -> &'static str { - match self { - Alignment::Left => "Left", - Alignment::Right => "Right", - Alignment::Center => "Center", - } - } -} - -/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement). -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] -pub enum Measure { - /// Nominal values can only be compared for equality. - Nominal, - - /// Ordinal values can be meaningfully ordered. - Ordinal, - - /// Scale values can be meaningfully compared for the degree of difference. - Scale, -} - -impl Measure { - pub fn default_for_type(var_type: VarType) -> Option { - match var_type { - VarType::Numeric => None, - VarType::String => Some(Self::Nominal), - } - } - - pub fn as_str(&self) -> &'static str { - match self { - Measure::Nominal => "Nominal", - Measure::Ordinal => "Ordinal", - Measure::Scale => "Scale", - } - } -} - #[cfg(test)] mod test { use std::collections::HashSet; @@ -2480,9 +1516,9 @@ mod test { use unicase::UniCase; use crate::{ - data::{ByteString, Datum, RawString, WithEncoding}, - dictionary::{Dictionary, MissingValues, VarWidth, Variable}, + dictionary::Dictionary, identifier::Identifier, + variable::{VarWidth, Variable}, }; use super::{ByIdentifier, HasIdentifier}; @@ -2619,55 +1655,4 @@ mod test { assert_eq!(expected, dict.short_names()); } } - - #[test] - fn var_width_codepage_to_unicode() { - fn check_unicode(input: VarWidth, expected: VarWidth) { - let mut actual = input; - actual.codepage_to_unicode(); - assert_eq!(actual, expected); - } - - check_unicode(VarWidth::Numeric, VarWidth::Numeric); - check_unicode(VarWidth::String(1), VarWidth::String(3)); - check_unicode(VarWidth::String(2), VarWidth::String(6)); - check_unicode(VarWidth::String(3), VarWidth::String(9)); - check_unicode(VarWidth::String(1000), VarWidth::String(3000)); - check_unicode(VarWidth::String(20000), VarWidth::String(32767)); - check_unicode(VarWidth::String(30000), VarWidth::String(32767)); - } - - #[test] - fn missing_values_codepage_to_unicode() { - fn windows_1252(s: &str) -> WithEncoding { - ByteString::from(WINDOWS_1252.encode(s).0).with_encoding(WINDOWS_1252) - } - - let mut actual = MissingValues::new( - vec![ - Datum::String(windows_1252("abcdefgh")), - Datum::String(windows_1252("éèäî ")), - Datum::String(windows_1252("aaéèäîdf")), - ], - None, - ) - .unwrap(); - actual.codepage_to_unicode(); - - fn utf_8(s: &str) -> WithEncoding { - ByteString::from(s).with_encoding(UTF_8) - } - - let expected = MissingValues::new( - vec![ - Datum::String(utf_8("abcdefgh")), - Datum::String(utf_8("éèäî")), - Datum::String(utf_8("aaéèä")), - ], - None, - ) - .unwrap(); - - assert_eq!(&actual, &expected); - } } diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index b4109cb206..1131e9a827 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -30,7 +30,7 @@ use unicode_width::UnicodeWidthStr; use crate::{ data::{ByteString, Datum}, - dictionary::{VarType, VarWidth}, + variable::{VarType, VarWidth}, sys::raw, }; diff --git a/rust/pspp/src/lib.rs b/rust/pspp/src/lib.rs index 9ae4bdfcd9..cb2ddc4f80 100644 --- a/rust/pspp/src/lib.rs +++ b/rust/pspp/src/lib.rs @@ -35,6 +35,7 @@ pub mod output; pub mod prompt; pub mod settings; pub mod sys; +pub mod variable; /// This is [slice::element_offset] copied out from the standard library so that /// we can use it while it is still experimental. diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index 7dd634f18a..0b91cbaeb6 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -68,7 +68,7 @@ use tlo::parse_tlo; use crate::{ data::{ByteString, Datum, EncodedString, RawString}, - dictionary::{VarType, Variable}, + variable::{VarType, Variable}, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, settings::{Settings, Show}, }; diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index f541ad6fbc..049c152791 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -28,8 +28,7 @@ use crate::{ crypto::EncryptedFile, data::{ByteString, Case, Datum, MutRawString, RawString}, dictionary::{ - DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, InvalidRole, MissingValues, - MissingValuesError, MultipleResponseType, VarWidth, Variable, + DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, MultipleResponseType, }, format::{Error as FormatError, Format, UncheckedFormat}, hexfloat::HexFloat, @@ -50,6 +49,7 @@ use crate::{ }, serialize_endian, }, + variable::{InvalidRole, MissingValues, MissingValuesError, VarWidth, Variable}, }; use anyhow::{anyhow, Error as AnyError}; use binrw::{BinRead, BinWrite, Endian}; diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 3b7334169b..5b39156691 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -21,7 +21,7 @@ use crate::{ data::{ByteStr, ByteString, Datum, RawCase}, - dictionary::{VarType, VarWidth}, + variable::{VarType, VarWidth}, endian::{FromBytes, ToBytes}, identifier::{Error as IdError, Identifier}, sys::{ diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index 32fceed38b..18b76129be 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -13,10 +13,7 @@ use std::{ use crate::{ data::{ByteStrArray, ByteString, Datum}, - dictionary::{ - Alignment, Attributes, CategoryLabels, Measure, MissingValueRange, MissingValues, - MissingValuesError, VarType, VarWidth, - }, + dictionary::CategoryLabels, endian::FromBytes, format::{DisplayPlainF64, Format, Type}, identifier::{Error as IdError, Identifier}, @@ -27,6 +24,10 @@ use crate::{ }, serialize_endian, ProductVersion, }, + variable::{ + Alignment, Attributes, Measure, MissingValueRange, MissingValues, MissingValuesError, + VarType, VarWidth, + }, }; use binrw::{binrw, BinRead, BinWrite, Endian, Error as BinError}; diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index c5eacee7b3..5ddf745f17 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -27,7 +27,7 @@ use encoding_rs::UTF_8; use crate::{ crypto::EncryptedFile, data::Datum, - dictionary::{Dictionary, VarWidth, Variable}, + dictionary::Dictionary, identifier::Identifier, output::{ pivot::{test::assert_lines_eq, Axis3, Dimension, Group, PivotTable, Value}, @@ -39,6 +39,7 @@ use crate::{ sack::sack, WriteOptions, }, + variable::{VarWidth, Variable}, }; #[test] diff --git a/rust/pspp/src/sys/write.rs b/rust/pspp/src/sys/write.rs index 7cd05e8699..320820146d 100644 --- a/rust/pspp/src/sys/write.rs +++ b/rust/pspp/src/sys/write.rs @@ -18,10 +18,7 @@ use smallvec::SmallVec; use crate::{ data::{Datum, RawString}, - dictionary::{ - Alignment, Attributes, CategoryLabels, Dictionary, Measure, MultipleResponseType, - ValueLabels, VarWidth, - }, + dictionary::{CategoryLabels, Dictionary, MultipleResponseType}, format::{DisplayPlainF64, Format}, identifier::Identifier, output::spv::Zeros, @@ -36,6 +33,7 @@ use crate::{ }, ProductVersion, }, + variable::{Alignment, Attributes, Measure, ValueLabels, VarWidth}, }; /// System file format version. @@ -1214,9 +1212,8 @@ mod tests { use crate::{ data::{ByteString, Datum, RawString}, dictionary::{ - Alignment, Attributes, CategoryLabels, DictIndexMultipleResponseSet, - DictIndexVariableSet, Dictionary, Measure, MissingValueRange, MultipleResponseType, - VarWidth, Variable, + CategoryLabels, DictIndexMultipleResponseSet, DictIndexVariableSet, Dictionary, + MultipleResponseType, }, identifier::{ByIdentifier, Identifier}, sys::{ @@ -1229,6 +1226,7 @@ mod tests { write::DictionaryWriter, ReadOptions, WriteOptions, }, + variable::{Alignment, Attributes, Measure, MissingValueRange, VarWidth, Variable}, }; /// Checks that the header record has the right nominal case size and weight diff --git a/rust/pspp/src/variable.rs b/rust/pspp/src/variable.rs new file mode 100644 index 0000000000..ea1a86f885 --- /dev/null +++ b/rust/pspp/src/variable.rs @@ -0,0 +1,1058 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see . + +//! Variables. + +use std::{ + collections::{BTreeMap, HashMap}, + fmt::{Debug, Display}, + hash::{DefaultHasher, Hash, Hasher}, + ops::{Deref, Not}, + str::FromStr, +}; + +use encoding_rs::{Encoding, UTF_8}; +use num::integer::div_ceil; +use serde::Serialize; +use thiserror::Error as ThisError; +use unicase::UniCase; + +use crate::{ + data::{ByteString, Datum, Encoded, EncodedString, ResizeError, WithEncoding}, + format::{DisplayPlain, Format}, + identifier::{HasIdentifier, Identifier}, +}; + +/// Variable type. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] +pub enum VarType { + /// A numeric variable. + Numeric, + + /// A string variable. + String, +} + +impl Not for VarType { + type Output = Self; + + fn not(self) -> Self::Output { + match self { + Self::Numeric => Self::String, + Self::String => Self::Numeric, + } + } +} + +impl Not for &VarType { + type Output = VarType; + + fn not(self) -> Self::Output { + !*self + } +} + +impl Display for VarType { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + VarType::Numeric => write!(f, "numeric"), + VarType::String => write!(f, "string"), + } + } +} + +/// [VarType], plus a width for [VarType::String]. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)] +pub enum VarWidth { + Numeric, + String(u16), +} + +impl VarWidth { + pub const MAX_STRING: u16 = 32767; + + pub fn n_dict_indexes(self) -> usize { + match self { + VarWidth::Numeric => 1, + VarWidth::String(w) => div_ceil(w as usize, 8), + } + } + + fn width_predicate( + a: Option, + b: Option, + f: impl Fn(u16, u16) -> u16, + ) -> Option { + match (a, b) { + (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric), + (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => { + Some(VarWidth::String(f(a, b))) + } + _ => None, + } + } + + /// Returns the wider of `self` and `other`: + /// - Numerical variable widths are equally wide. + /// - Longer strings are wider than shorter strings. + /// - Numerical and string types are incomparable, so result in `None`. + /// - Any `None` in the input yields `None` in the output. + pub fn wider(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.max(b)) + } + + /// Returns the narrower of `self` and `other` (see [`Self::wider`]). + pub fn narrower(a: Option, b: Option) -> Option { + Self::width_predicate(a, b, |a, b| a.min(b)) + } + + pub fn default_display_width(&self) -> u32 { + match self { + VarWidth::Numeric => 8, + VarWidth::String(width) => *width.min(&32) as u32, + } + } + + pub fn is_long_string(&self) -> bool { + if let Self::String(width) = self { + *width > 8 + } else { + false + } + } + + pub fn as_string_width(&self) -> Option { + match self { + VarWidth::Numeric => None, + VarWidth::String(width) => Some(*width as usize), + } + } + + pub fn is_numeric(&self) -> bool { + *self == Self::Numeric + } + + pub fn is_string(&self) -> bool { + !self.is_numeric() + } + + /// Returns true if this is a very long string width, meaning wider than 255 + /// bytes, which was the limit for old versions of SPSS. + pub fn is_very_long_string(&self) -> bool { + match *self { + VarWidth::Numeric => false, + VarWidth::String(width) => width > 255, + } + } + + /// Number of bytes per segment by which the amount of space for very long + /// string variables is allocated. + pub const SEGMENT_SIZE: usize = 252; + + /// Returns an iterator over the "segments" used for writing case data for a + /// variable with this width. A segment is a physical variable in the + /// system file that represents some piece of a logical variable as seen by + /// a PSPP user. Most variables have one segment whose width is their own + /// width, but very long string variables, with width greater than 255, have + /// multiple segments each with width 255 or less. + pub fn segments(&self) -> Segments { + Segments::new(*self) + } + + /// Returns the number of 8-byte chunks used for writing case data for a + /// variable with this width. Very long string variables (wider than 255 + /// bytes) cannot directly be divided into chunks: they must first be + /// divided into multiple [segments](Self::segments), which can then be + /// divided into chunks. + pub fn n_chunks(&self) -> Option { + match *self { + VarWidth::Numeric => Some(1), + VarWidth::String(w) if w <= 255 => Some(w.div_ceil(8) as usize), + VarWidth::String(_) => None, + } + } + + /// Returns the width to allocate to the segment with the given + /// `segment_idx` within this variable. A segment is a physical variable in + /// the system file that represents some piece of a logical variable as seen + /// by a PSPP user. + pub fn segment_alloc_width(&self, segment_idx: usize) -> usize { + debug_assert!(segment_idx < self.segments().len()); + debug_assert!(self.is_very_long_string()); + + if segment_idx < self.segments().len() - 1 { + 255 + } else { + self.as_string_width().unwrap() - segment_idx * Self::SEGMENT_SIZE + } + } + + pub fn display_adjective(&self) -> VarWidthAdjective { + VarWidthAdjective(*self) + } + + pub fn codepage_to_unicode(&mut self) { + match self { + VarWidth::Numeric => (), + VarWidth::String(width) => *width = width.saturating_mul(3).min(Self::MAX_STRING), + } + } +} + +pub struct Segments { + width: VarWidth, + i: usize, + n: usize, +} +impl Segments { + pub fn new(width: VarWidth) -> Self { + Self { + width, + i: 0, + n: if width.is_very_long_string() { + width + .as_string_width() + .unwrap() + .div_ceil(VarWidth::SEGMENT_SIZE) + } else { + 1 + }, + } + } +} + +impl Iterator for Segments { + type Item = VarWidth; + + fn next(&mut self) -> Option { + let i = self.i; + if i >= self.n { + None + } else { + self.i += 1; + match self.width { + VarWidth::Numeric => Some(VarWidth::Numeric), + VarWidth::String(_) if i < self.n - 1 => Some(VarWidth::String(255)), + VarWidth::String(width) => Some(VarWidth::String( + width - (self.n as u16 - 1) * VarWidth::SEGMENT_SIZE as u16, + )), + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let n = self.n - self.i; + (n, Some(n)) + } +} + +impl ExactSizeIterator for Segments {} + +impl From for VarType { + fn from(source: VarWidth) -> Self { + match source { + VarWidth::Numeric => VarType::Numeric, + VarWidth::String(_) => VarType::String, + } + } +} + +pub struct VarWidthAdjective(VarWidth); + +impl Display for VarWidthAdjective { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.0 { + VarWidth::Numeric => write!(f, "numeric"), + VarWidth::String(width) => write!(f, "{width}-byte string"), + } + } +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize)] +pub enum Role { + #[default] + Input, + Target, + Both, + None, + Partition, + Split, +} + +impl Role { + pub fn as_str(&self) -> &'static str { + match self { + Role::Input => "Input", + Role::Target => "Target", + Role::Both => "Both", + Role::None => "None", + Role::Partition => "Partition", + Role::Split => "Split", + } + } +} + +impl FromStr for Role { + type Err = InvalidRole; + + fn from_str(s: &str) -> Result { + for (string, value) in [ + ("input", Role::Input), + ("target", Role::Target), + ("both", Role::Both), + ("none", Role::None), + ("partition", Role::Partition), + ("split", Role::Split), + ] { + if string.eq_ignore_ascii_case(s) { + return Ok(value); + } + } + Err(InvalidRole::UnknownRole(s.into())) + } +} + +impl TryFrom for Role { + type Error = InvalidRole; + + fn try_from(value: i32) -> Result { + match value { + 0 => Ok(Role::Input), + 1 => Ok(Role::Target), + 2 => Ok(Role::Both), + 3 => Ok(Role::None), + 4 => Ok(Role::Partition), + 5 => Ok(Role::Split), + _ => Err(InvalidRole::UnknownRole(value.to_string())), + } + } +} + +impl From for i32 { + fn from(value: Role) -> Self { + match value { + Role::Input => 0, + Role::Target => 1, + Role::Both => 2, + Role::None => 3, + Role::Partition => 4, + Role::Split => 5, + } + } +} + +#[derive(Clone, Default, PartialEq, Eq, Serialize)] +pub struct Attributes(pub BTreeMap>); + +impl Attributes { + pub fn new() -> Self { + Self(BTreeMap::new()) + } + + pub fn contains_name(&self, name: &Identifier) -> bool { + self.0.contains_key(name) + } + + pub fn insert(&mut self, name: Identifier, values: Vec) { + self.0.insert(name, values); + } + + pub fn with(mut self, name: Identifier, values: Vec) -> Self { + self.insert(name, values); + self + } + + pub fn append(&mut self, other: &mut Self) { + self.0.append(&mut other.0) + } + + pub fn role(&self) -> Result, InvalidRole> { + self.try_into() + } + + pub fn iter(&self, include_at: bool) -> impl Iterator { + self.0.iter().filter_map(move |(name, values)| { + if include_at || !name.0.starts_with('@') { + Some((name, values.as_slice())) + } else { + None + } + }) + } + + pub fn has_any(&self, include_at: bool) -> bool { + self.iter(include_at).next().is_some() + } + + pub fn codepage_to_unicode(&mut self) { + let mut new = BTreeMap::new(); + while let Some((mut name, value)) = self.0.pop_first() { + name.codepage_to_unicode(); + new.insert(name, value); + } + self.0 = new; + } +} + +impl Debug for Attributes { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +#[derive(Clone, Debug, ThisError, PartialEq, Eq)] +pub enum InvalidRole { + #[error("Unknown role {0:?}.")] + UnknownRole(String), + + #[error("Role attribute $@Role must have exactly one value (not {0}).")] + InvalidValues(usize), +} + +impl TryFrom<&Attributes> for Option { + type Error = InvalidRole; + + fn try_from(value: &Attributes) -> Result { + let role = Identifier::new("$@Role").unwrap(); + value.0.get(&role).map_or(Ok(None), |attribute| { + if let Ok([string]) = <&[String; 1]>::try_from(attribute.as_slice()) { + match string.parse::() { + Ok(integer) => Ok(Some(Role::try_from(integer)?)), + Err(_) => Err(InvalidRole::UnknownRole(string.clone())), + } + } else { + Err(InvalidRole::InvalidValues(attribute.len())) + } + }) + } +} + +/// A variable, usually inside a [Dictionary]. +#[derive(Clone, Debug, Serialize)] +pub struct Variable { + /// The variable's name. + /// + /// PSPP variable names are case-insensitive. + pub name: Identifier, + + /// Variable width. + pub width: VarWidth, + + /// User-missing values. + /// + /// Numeric variables also have a system-missing value (represented as + /// `None`). + /// + /// Both kinds of missing values are excluded from most analyses. + missing_values: MissingValues, + + /// Output format used in most contexts. + pub print_format: Format, + + /// Output format used on the `WRITE` command. + pub write_format: Format, + + /// Value labels. + pub value_labels: ValueLabels, + + /// Variable label, an optional meaningful description for the variable + /// itself. + pub label: Option, + + /// Measurement level for the variable's data. + pub measure: Option, + + /// Role in data analysis. + pub role: Role, + + /// Width of data column in GUI. + pub display_width: u32, + + /// Data alignment in GUI. + pub alignment: Alignment, + + /// Whether to retain values of the variable from one case to the next. + pub leave: bool, + + /// For compatibility with old software that supported at most 8-character + /// variable names. + pub short_names: Vec, + + /// Variable attributes. + pub attributes: Attributes, + + /// Encoding for [Value]s inside this variable. + /// + /// The variables in a [Dictionary] must all use the same encoding as the + /// dictionary. + encoding: &'static Encoding, +} + +impl Variable { + pub fn new(name: Identifier, width: VarWidth, encoding: &'static Encoding) -> Self { + let var_type = VarType::from(width); + let leave = name.class().must_leave(); + Self { + name, + width, + missing_values: MissingValues::default(), + print_format: Format::default_for_width(width), + write_format: Format::default_for_width(width), + value_labels: ValueLabels::new(), + label: None, + measure: Measure::default_for_type(var_type), + role: Role::default(), + display_width: width.default_display_width(), + alignment: Alignment::default_for_type(var_type), + leave, + short_names: Vec::new(), + attributes: Attributes::new(), + encoding, + } + } + + pub fn encoding(&self) -> &'static Encoding { + self.encoding + } + + pub fn is_numeric(&self) -> bool { + self.width.is_numeric() + } + + pub fn is_string(&self) -> bool { + self.width.is_string() + } + + pub fn label(&self) -> Option<&String> { + self.label.as_ref() + } + + pub fn resize(&mut self, width: VarWidth) { + let _ = self.missing_values.resize(width); + + self.value_labels.resize(width); + + self.print_format.resize(width); + self.write_format.resize(width); + + self.width = width; + } + + pub fn missing_values(&self) -> &MissingValues { + &self.missing_values + } + + pub fn missing_values_mut(&mut self) -> MissingValuesMut<'_> { + MissingValuesMut { + inner: &mut self.missing_values, + width: self.width, + } + } + + pub fn codepage_to_unicode(&mut self) { + self.name.codepage_to_unicode(); + self.width.codepage_to_unicode(); + self.missing_values.codepage_to_unicode(); + self.print_format.codepage_to_unicode(); + self.write_format.codepage_to_unicode(); + self.attributes.codepage_to_unicode(); + self.encoding = UTF_8; + + // Anything old enough to not support long names is old enough not to + // support Unicode. + self.short_names.clear(); + } +} + +impl HasIdentifier for Variable { + fn identifier(&self) -> &UniCase { + &self.name.0 + } +} + +/// Associates values of a variable with meaningful labels. +/// +/// For example, 1 => strongly disagree, 2 => disagree, 3 => neither agree nor +/// disagree, ... +#[derive(Clone, Default, PartialEq, Eq, Serialize)] +pub struct ValueLabels(pub HashMap, String>); + +impl ValueLabels { + pub fn new() -> Self { + Self::default() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn get(&self, value: &Datum) -> Option<&str> { + self.0.get(value).map(|s| s.as_str()) + } + + pub fn insert(&mut self, value: Datum, label: String) -> Option { + self.0.insert(value, label) + } + + pub fn is_resizable(&self, width: VarWidth) -> bool { + self.0.keys().all(|datum| datum.is_resizable(width)) + } + + pub fn resize(&mut self, width: VarWidth) { + self.0 = self + .0 + .drain() + .filter_map(|(mut datum, string)| datum.resize(width).is_ok().then(|| (datum, string))) + .collect(); + } + + pub fn codepage_to_unicode(&mut self, encoding: &'static Encoding) { + self.0 = self + .0 + .drain() + .map(|(key, value)| { + let mut key = key.with_encoding(encoding); + key.codepage_to_unicode(); + (key.without_encoding(), value) + }) + .collect(); + } +} + +impl Debug for ValueLabels { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl Hash for ValueLabels { + fn hash(&self, state: &mut H) { + let mut hash = 0; + for (k, v) in &self.0 { + let mut hasher = DefaultHasher::new(); + k.hash(&mut hasher); + v.hash(&mut hasher); + hash ^= hasher.finish(); + } + state.write_u64(hash); + } +} + +pub struct MissingValuesMut<'a> { + inner: &'a mut MissingValues, + width: VarWidth, +} + +impl<'a> Deref for MissingValuesMut<'a> { + type Target = MissingValues; + + fn deref(&self) -> &Self::Target { + self.inner + } +} + +impl<'a> MissingValuesMut<'a> { + pub fn replace(&mut self, mut new: MissingValues) -> Result<(), MissingValuesError> { + new.resize(self.width)?; + *self.inner = new; + Ok(()) + } + + pub fn add_value( + &mut self, + mut value: Datum>, + ) -> Result<(), MissingValuesError> { + if self.inner.values.len() > 2 + || (self.inner.range().is_some() && self.inner.values.len() > 1) + { + Err(MissingValuesError::TooMany) + } else if value.var_type() != VarType::from(self.width) { + Err(MissingValuesError::MixedTypes) + } else if value == Datum::Number(None) { + Err(MissingValuesError::SystemMissing) + } else if value.resize(self.width.min(VarWidth::String(8))).is_err() { + Err(MissingValuesError::TooWide) + } else { + value.trim_end(); + self.inner.values.push(value); + Ok(()) + } + } + + pub fn add_values( + &mut self, + values: impl IntoIterator>>, + ) -> Result<(), MissingValuesError> { + let n = self.inner.values.len(); + for value in values { + self.add_value(value) + .inspect_err(|_| self.inner.values.truncate(n))?; + } + Ok(()) + } + + pub fn add_range(&mut self, range: MissingValueRange) -> Result<(), MissingValuesError> { + if self.inner.range.is_some() || self.inner.values().len() > 1 { + Err(MissingValuesError::TooMany) + } else if self.width != VarWidth::Numeric { + Err(MissingValuesError::MixedTypes) + } else { + self.inner.range = Some(range); + Ok(()) + } + } +} + +#[derive(Clone, Default, Serialize, PartialEq)] +pub struct MissingValues { + /// Individual missing values, up to 3 of them. + values: Vec>>, + + /// Optional range of missing values. + range: Option, +} + +impl Debug for MissingValues { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self) + } +} + +impl Display for MissingValues { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(range) = &self.range { + write!(f, "{range}")?; + if !self.values.is_empty() { + write!(f, "; ")?; + } + } + + for (i, value) in self.values.iter().enumerate() { + if i > 0 { + write!(f, "; ")?; + } + write!(f, "{}", value.quoted())?; + } + + if self.is_empty() { + write!(f, "none")?; + } + Ok(()) + } +} + +#[derive(Copy, Clone, Debug)] +pub enum MissingValuesError { + TooMany, + TooWide, + MixedTypes, + SystemMissing, +} + +impl From for MissingValuesError { + fn from(value: ResizeError) -> Self { + match value { + ResizeError::MixedTypes => MissingValuesError::MixedTypes, + ResizeError::TooWide => MissingValuesError::TooWide, + } + } +} + +impl MissingValues { + pub fn clear(&mut self) { + *self = Self::default(); + } + pub fn values(&self) -> &[Datum>] { + &self.values + } + + pub fn range(&self) -> Option<&MissingValueRange> { + self.range.as_ref() + } + + pub fn new( + mut values: Vec>>, + range: Option, + ) -> Result { + if values.len() > 3 { + return Err(MissingValuesError::TooMany); + } + + let mut var_type = None; + for value in values.iter_mut() { + value.trim_end(); + if value.width().is_long_string() { + return Err(MissingValuesError::TooWide); + } + if var_type.is_some_and(|t| t != value.var_type()) { + return Err(MissingValuesError::MixedTypes); + } + var_type = Some(value.var_type()); + } + + if var_type == Some(VarType::String) && range.is_some() { + return Err(MissingValuesError::MixedTypes); + } + + Ok(Self { values, range }) + } + + pub fn is_empty(&self) -> bool { + self.values.is_empty() && self.range.is_none() + } + + pub fn var_type(&self) -> Option { + if let Some(datum) = self.values.first() { + Some(datum.var_type()) + } else if self.range.is_some() { + Some(VarType::Numeric) + } else { + None + } + } + + pub fn contains(&self, value: &Datum) -> bool + where + S: EncodedString, + { + if self + .values + .iter() + .any(|datum| datum.eq_ignore_trailing_spaces(value)) + { + return true; + } + + if let Some(Some(number)) = value.as_number() + && let Some(range) = self.range + { + range.contains(number) + } else { + false + } + } + + pub fn resize(&mut self, width: VarWidth) -> Result<(), MissingValuesError> { + fn inner(this: &mut MissingValues, width: VarWidth) -> Result<(), MissingValuesError> { + for datum in &mut this.values { + datum.resize(width)?; + datum.trim_end(); + } + if let Some(range) = &mut this.range { + range.resize(width)?; + } + Ok(()) + } + inner(self, width).inspect_err(|_| self.clear()) + } + + pub fn codepage_to_unicode(&mut self) { + self.values = self + .values + .drain(..) + .map(|value| match value { + Datum::Number(number) => Datum::Number(number), + Datum::String(s) => Datum::String(if s.encoding() != UTF_8 { + let mut new_s = ByteString::from(s.as_str()); + new_s.0.truncate(8); + WithEncoding::new(new_s, UTF_8) + } else { + s + }), + }) + .collect(); + } +} + +#[derive(Copy, Clone, Debug, Serialize, PartialEq)] +pub enum MissingValueRange { + In { low: f64, high: f64 }, + From { low: f64 }, + To { high: f64 }, +} + +impl MissingValueRange { + pub fn new(low: f64, high: f64) -> Self { + const LOWEST: f64 = f64::MIN.next_up(); + match (low, high) { + (f64::MIN | LOWEST, _) => Self::To { high }, + (_, f64::MAX) => Self::From { low }, + (_, _) => Self::In { low, high }, + } + } + + pub fn low(&self) -> Option { + match self { + MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low), + MissingValueRange::To { .. } => None, + } + } + + pub fn high(&self) -> Option { + match self { + MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high), + MissingValueRange::From { .. } => None, + } + } + + pub fn contains(&self, number: f64) -> bool { + match self { + MissingValueRange::In { low, high } => (*low..*high).contains(&number), + MissingValueRange::From { low } => number >= *low, + MissingValueRange::To { high } => number <= *high, + } + } + + pub fn resize(&self, width: VarWidth) -> Result<(), MissingValuesError> { + if width.is_numeric() { + Ok(()) + } else { + Err(MissingValuesError::MixedTypes) + } + } +} + +impl Display for MissingValueRange { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.low() { + Some(low) => low.display_plain().fmt(f)?, + None => write!(f, "LOW")?, + } + + write!(f, " THRU ")?; + + match self.high() { + Some(high) => high.display_plain().fmt(f)?, + None => write!(f, "HIGH")?, + } + Ok(()) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] +pub enum Alignment { + Left, + Right, + Center, +} + +impl Alignment { + pub fn default_for_type(var_type: VarType) -> Self { + match var_type { + VarType::Numeric => Self::Right, + VarType::String => Self::Left, + } + } + + pub fn as_str(&self) -> &'static str { + match self { + Alignment::Left => "Left", + Alignment::Right => "Right", + Alignment::Center => "Center", + } + } +} + +/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement). +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] +pub enum Measure { + /// Nominal values can only be compared for equality. + Nominal, + + /// Ordinal values can be meaningfully ordered. + Ordinal, + + /// Scale values can be meaningfully compared for the degree of difference. + Scale, +} + +impl Measure { + pub fn default_for_type(var_type: VarType) -> Option { + match var_type { + VarType::Numeric => None, + VarType::String => Some(Self::Nominal), + } + } + + pub fn as_str(&self) -> &'static str { + match self { + Measure::Nominal => "Nominal", + Measure::Ordinal => "Ordinal", + Measure::Scale => "Scale", + } + } +} + +#[cfg(test)] +mod tests { + use encoding_rs::{UTF_8, WINDOWS_1252}; + + use crate::{ + data::{ByteString, Datum, RawString, WithEncoding}, + variable::{MissingValues, VarWidth}, + }; + + #[test] + fn var_width_codepage_to_unicode() { + fn check_unicode(input: VarWidth, expected: VarWidth) { + let mut actual = input; + actual.codepage_to_unicode(); + assert_eq!(actual, expected); + } + + check_unicode(VarWidth::Numeric, VarWidth::Numeric); + check_unicode(VarWidth::String(1), VarWidth::String(3)); + check_unicode(VarWidth::String(2), VarWidth::String(6)); + check_unicode(VarWidth::String(3), VarWidth::String(9)); + check_unicode(VarWidth::String(1000), VarWidth::String(3000)); + check_unicode(VarWidth::String(20000), VarWidth::String(32767)); + check_unicode(VarWidth::String(30000), VarWidth::String(32767)); + } + + #[test] + fn missing_values_codepage_to_unicode() { + fn windows_1252(s: &str) -> WithEncoding { + ByteString::from(WINDOWS_1252.encode(s).0).with_encoding(WINDOWS_1252) + } + + let mut actual = MissingValues::new( + vec![ + Datum::String(windows_1252("abcdefgh")), + Datum::String(windows_1252("éèäî ")), + Datum::String(windows_1252("aaéèäîdf")), + ], + None, + ) + .unwrap(); + actual.codepage_to_unicode(); + + fn utf_8(s: &str) -> WithEncoding { + ByteString::from(s).with_encoding(UTF_8) + } + + let expected = MissingValues::new( + vec![ + Datum::String(utf_8("abcdefgh")), + Datum::String(utf_8("éèäî")), + Datum::String(utf_8("aaéèä")), + ], + None, + ) + .unwrap(); + + assert_eq!(&actual, &expected); + } +}