From: Ben Pfaff Date: Sun, 13 Jul 2025 15:29:42 +0000 (-0700) Subject: more cleanup X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3ccb5898d8ed01e116287b948e6bd457e0ffe8df;p=pspp more cleanup --- diff --git a/rust/pspp/src/command/crosstabs.rs b/rust/pspp/src/command/crosstabs.rs index b8e3c2635b..38079857b8 100644 --- a/rust/pspp/src/command/crosstabs.rs +++ b/rust/pspp/src/command/crosstabs.rs @@ -200,9 +200,11 @@ mod tests { fn test(syntax: &str) { let mut engine = Engine::new(); - engine.run(Source::new_default(&Arc::new( - SyntaxFile::new(syntax.to_string(), Some("test.sps".to_string()), UTF_8), - ))); + engine.run(Source::new_default(&Arc::new(SyntaxFile::new( + syntax.to_string(), + Some("test.sps".to_string()), + UTF_8, + )))); } #[test] diff --git a/rust/pspp/src/command/ctables.rs b/rust/pspp/src/command/ctables.rs index a847242372..fd65961523 100644 --- a/rust/pspp/src/command/ctables.rs +++ b/rust/pspp/src/command/ctables.rs @@ -385,9 +385,11 @@ mod tests { fn test(syntax: &str) { let mut engine = Engine::new(); - engine.run(Source::new_default(&Arc::new( - SyntaxFile::new(syntax.to_string(), Some("test.sps".to_string()), UTF_8), - ))); + engine.run(Source::new_default(&Arc::new(SyntaxFile::new( + syntax.to_string(), + Some("test.sps".to_string()), + UTF_8, + )))); } #[test] diff --git a/rust/pspp/src/command/data_list.rs b/rust/pspp/src/command/data_list.rs index 181d8e51e4..ea52f708a6 100644 --- a/rust/pspp/src/command/data_list.rs +++ b/rust/pspp/src/command/data_list.rs @@ -115,9 +115,11 @@ mod tests { fn test(syntax: &str) { let mut engine = Engine::new(); - engine.run(Source::new_default(&Arc::new( - SyntaxFile::new(syntax.to_string(), Some("test.sps".to_string()), UTF_8), - ))); + engine.run(Source::new_default(&Arc::new(SyntaxFile::new( + syntax.to_string(), + Some("test.sps".to_string()), + UTF_8, + )))); } #[test] diff --git a/rust/pspp/src/command/descriptives.rs b/rust/pspp/src/command/descriptives.rs index 
28619d37b5..9f8fb2d8b6 100644 --- a/rust/pspp/src/command/descriptives.rs +++ b/rust/pspp/src/command/descriptives.rs @@ -157,9 +157,11 @@ mod tests { fn test(syntax: &str) { let mut engine = Engine::new(); - engine.run(Source::new_default(&Arc::new( - SyntaxFile::new(syntax.to_string(), Some("test.sps".to_string()), UTF_8), - ))); + engine.run(Source::new_default(&Arc::new(SyntaxFile::new( + syntax.to_string(), + Some("test.sps".to_string()), + UTF_8, + )))); } #[test] diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs new file mode 100644 index 0000000000..9334eaf668 --- /dev/null +++ b/rust/pspp/src/data.rs @@ -0,0 +1,403 @@ +// PSPP - a program for statistical analysis. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This program is free software: you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later +// version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with +// this program. If not, see <https://www.gnu.org/licenses/>. + +//! Individual pieces of data. +//! +//! [Datum] is the value of one [Variable]. String data in a [Datum] is +//! represented as [RawString], whose character encoding is determined by the +//! associated [Variable]. (All the variables in a [Dictionary] have the same +//! character encoding.) +//! +//! [Variable]: crate::dictionary::Variable +//! [Dictionary]: crate::dictionary::Dictionary + +// Warn about missing docs, but not for items declared with `#[cfg(test)]`. 
+#![cfg_attr(not(test), warn(missing_docs))] + +use std::{ + borrow::{Borrow, Cow}, + cmp::Ordering, + fmt::{Debug, Display, Formatter}, + hash::Hash, + ops::Deref, + str::from_utf8, +}; + +use encoding_rs::{mem::decode_latin1, Encoding}; +use ordered_float::OrderedFloat; + +use crate::{ + dictionary::{VarType, VarWidth}, + sys::raw::EncodedStr, +}; + +/// An owned string in an unspecified character encoding. +/// +/// A [RawString] is usually associated with a [Variable] and uses the +/// variable's character encoding. We assume that the encoding is one supported +/// by [encoding_rs] with byte units (that is, not a `UTF-16` encoding). All of +/// these encodings have some basic ASCII compatibility. +/// +/// A [RawString] owns its contents and can grow and shrink, like a [Vec] or +/// [String]. For a borrowed raw string, see [RawStr]. +/// +/// [Variable]: crate::dictionary::Variable +#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)] +pub struct RawString(pub Vec<u8>); + +impl RawString { + /// Creates a new [RawString] that consists of `n` ASCII spaces. + pub fn spaces(n: usize) -> Self { + Self(std::iter::repeat_n(b' ', n).collect()) + } + + /// Creates an [EncodedStr] with `encoding` that borrows this string's + /// contents. + pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { + EncodedStr::new(&self.0, encoding) + } + + /// Extends or shortens this [RawString] to exactly `len` bytes. If the + /// string needs to be extended, does so by appending spaces. + /// + /// If this shortens the string, it can cut off a multibyte character in the + /// middle. + pub fn resize(&mut self, len: usize) { + self.0.resize(len, b' '); + } + + /// Removes any trailing ASCII spaces. 
+ pub fn trim_end(&mut self) { + while self.0.pop_if(|c| *c == b' ').is_some() {} + } +} + +impl Borrow<RawStr> for RawString { + fn borrow(&self) -> &RawStr { + RawStr::from_bytes(&self.0) + } +} + +impl Deref for RawString { + type Target = RawStr; + + fn deref(&self) -> &Self::Target { + self.borrow() + } +} + +impl From<Cow<'_, [u8]>> for RawString { + fn from(value: Cow<'_, [u8]>) -> Self { + Self(value.into_owned()) + } +} + +impl From<Vec<u8>> for RawString { + fn from(source: Vec<u8>) -> Self { + Self(source) + } +} + +impl From<&[u8]> for RawString { + fn from(source: &[u8]) -> Self { + Self(source.into()) + } +} + +impl Debug for RawString { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + // Format the borrowed [RawStr]. `*self` is still a `RawString`, so + // formatting it with `{:?}` would call this function again and recurse + // without end. + write!(f, "{:?}", &**self) + } +} + +/// A borrowed string in an unspecified encoding. +/// +/// A [RawStr] is usually associated with a [Variable] and uses the +/// variable's character encoding. We assume that the encoding is one supported +/// by [encoding_rs] with byte units (that is, not a `UTF-16` encoding). All of +/// these encodings have some basic ASCII compatibility. +/// +/// For an owned raw string, see [RawString]. +/// +/// [Variable]: crate::dictionary::Variable +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct RawStr(pub [u8]); + +impl RawStr { + /// Creates a new [RawStr] that contains `bytes`. + pub fn from_bytes(bytes: &[u8]) -> &Self { + // SAFETY: `RawStr` is a transparent wrapper around `[u8]`, so we can + // turn a reference to the wrapped type into a reference to the wrapper + // type. + unsafe { &*(bytes as *const [u8] as *const Self) } + } + + /// Returns the raw string's contents as a borrowed byte slice. + pub fn as_bytes(&self) -> &[u8] { + &self.0 + } + + /// Returns an object that implements [Display] for printing this [RawStr], + /// given that it is encoded in `encoding`. 
+ pub fn display(&self, encoding: &'static Encoding) -> DisplayRawString { + DisplayRawString(encoding.decode_without_bom_handling(&self.0).0) + } + + /// Interprets the raw string's contents as the specified `encoding` and + /// returns it decoded into UTF-8, replacing any malformed sequences by + /// [REPLACEMENT_CHARACTER]. + /// + /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER + pub fn decode(&self, encoding: &'static Encoding) -> Cow<'_, str> { + encoding.decode_without_bom_handling(&self.0).0 + } + + /// Compares this string and `other` for equality, ignoring trailing ASCII + /// spaces in either string for the purpose of comparison. (This is + /// acceptable because we assume that the encoding is ASCII-compatible.) + pub fn eq_ignore_trailing_spaces(&self, other: &RawStr) -> bool { + let mut this = self.0.iter(); + let mut other = other.0.iter(); + loop { + match (this.next(), other.next()) { + (Some(a), Some(b)) if a == b => (), + (Some(_), Some(_)) => return false, + (None, None) => return true, + (Some(b' '), None) => return this.all(|c| *c == b' '), + (None, Some(b' ')) => return other.all(|c| *c == b' '), + (Some(_), None) | (None, Some(_)) => return false, + } + } + } + + /// Returns the string's length in bytes. + pub fn len(&self) -> usize { + self.0.len() + } +} + +/// Helper struct for printing [RawStr] with [format!]. +/// +/// Created by [RawStr::display]. +pub struct DisplayRawString<'a>(Cow<'a, str>); + +impl<'a> Display for DisplayRawString<'a> { + // Writes the text that [RawStr::display] already decoded; no decoding + // happens here. + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", &self.0) + } +} + +impl Debug for RawStr { + // If the bytes are valid UTF-8, prints them as UTF-8, otherwise as Latin-1 + // (actually bytes interpreted as Unicode code points). + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + let s = from_utf8(&self.0).map_or_else(|_| decode_latin1(&self.0), Cow::from); + write!(f, "{s:?}") + } +} + +/// The value of a [Variable](crate::dictionary::Variable). 
+#[derive(Clone)] +pub enum Datum { + /// A numeric value. + Number( + /// A number, or `None` for the system-missing value. + Option<f64>, + ), + /// A string value. + String( + /// The value, in the variable's encoding. + RawString, + ), +} + +impl Debug for Datum { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + match self { + Datum::Number(Some(number)) => write!(f, "{number:?}"), + Datum::Number(None) => write!(f, "SYSMIS"), + Datum::String(s) => write!(f, "{:?}", s), + } + } +} + +impl PartialEq for Datum { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::Number(Some(l0)), Self::Number(Some(r0))) => { + OrderedFloat(*l0) == OrderedFloat(*r0) + } + (Self::Number(None), Self::Number(None)) => true, + (Self::String(l0), Self::String(r0)) => l0 == r0, + _ => false, + } + } +} + +impl Eq for Datum {} + +impl PartialOrd for Datum { + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +impl Ord for Datum { + fn cmp(&self, other: &Self) -> Ordering { + match (self, other) { + (Datum::Number(a), Datum::Number(b)) => match (a, b) { + (None, None) => Ordering::Equal, + (None, Some(_)) => Ordering::Less, + (Some(_), None) => Ordering::Greater, + // Compare through `OrderedFloat`, as `eq` and `hash` do, so that + // two numbers are `Ordering::Equal` exactly when they are `==`. + // (`f64::total_cmp` would order `-0.0` before `0.0` even though + // `eq` treats them as equal.) + (Some(a), Some(b)) => OrderedFloat(*a).cmp(&OrderedFloat(*b)), + }, + (Datum::Number(_), Datum::String(_)) => Ordering::Less, + (Datum::String(_), Datum::Number(_)) => Ordering::Greater, + (Datum::String(a), Datum::String(b)) => a.cmp(b), + } + } +} + +impl Hash for Datum { + fn hash<H: std::hash::Hasher>(&self, state: &mut H) { + match self { + Datum::Number(number) => number.map(OrderedFloat).hash(state), + Datum::String(string) => string.hash(state), + } + } +} + +impl Datum { + /// Constructs a new numerical [Datum] for the system-missing value. + pub const fn sysmis() -> Self { + Self::Number(None) + } + + /// Returns the number inside this datum, or `None` if this is a string + /// datum. 
+ pub fn as_number(&self) -> Option<Option<f64>> { + match self { + Datum::Number(number) => Some(*number), + Datum::String(_) => None, + } + } + + /// Returns the string inside this datum, or `None` if this is a numeric + /// datum. + pub fn as_string(&self) -> Option<&RawString> { + match self { + Datum::Number(_) => None, + Datum::String(s) => Some(s), + } + } + + /// Returns the string inside this datum as a mutable borrow, or `None` if + /// this is a numeric datum. + pub fn as_string_mut(&mut self) -> Option<&mut RawString> { + match self { + Datum::Number(_) => None, + Datum::String(s) => Some(s), + } + } + + /// Returns true if this datum can be resized to the given `width` without + /// loss, which is true only if this datum and `width` are both string or + /// both numeric and, for string widths, if resizing would not drop any + /// non-space characters. + pub fn is_resizable(&self, width: VarWidth) -> bool { + match (self, width) { + (Datum::Number(_), VarWidth::Numeric) => true, + (Datum::String(s), VarWidth::String(new_width)) => { + let new_len = new_width as usize; + new_len >= s.len() || s.0[new_len..].iter().all(|c| *c == b' ') + } + _ => false, + } + } + + /// Resizes this datum to the given `width`. + /// + /// # Panics + /// + /// Panics if resizing would change the datum from numeric to string or vice + /// versa. + pub fn resize(&mut self, width: VarWidth) { + match (self, width) { + (Datum::Number(_), VarWidth::Numeric) => (), + (Datum::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), + // A caller can reach this arm by passing a mismatched width, so + // report it as a contract violation rather than as unreachable code. + _ => panic!("cannot resize a datum between numeric and string"), + } + } + + /// Returns the [VarType] corresponding to this datum. + pub fn var_type(&self) -> VarType { + match self { + Self::Number(_) => VarType::Numeric, + Self::String(_) => VarType::String, + } + } + + /// Returns the [VarWidth] corresponding to this datum. 
+ pub fn width(&self) -> VarWidth { + match self { + Datum::Number(_) => VarWidth::Numeric, + // Panics if the string is too long to be described by a string width. + Datum::String(s) => VarWidth::String( + s.len() + .try_into() + .expect("string datum length exceeds the VarWidth::String range"), + ), + } + } + + /// Compares this datum and `other` for equality, ignoring trailing ASCII + /// spaces in either, if they are both strings, for the purpose of + /// comparison. + pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool { + match (self, other) { + (Self::String(a), Self::String(b)) => a.eq_ignore_trailing_spaces(b), + _ => self == other, + } + } + + /// Removes trailing ASCII spaces from this datum, if it is a string. + pub fn trim_end(&mut self) { + match self { + Self::Number(_) => (), + Self::String(s) => s.trim_end(), + } + } +} + +impl From<f64> for Datum { + fn from(number: f64) -> Self { + Some(number).into() + } +} + +impl From<Option<f64>> for Datum { + fn from(value: Option<f64>) -> Self { + Self::Number(value) + } +} + +impl From<&str> for Datum { + fn from(value: &str) -> Self { + value.as_bytes().into() + } +} + +impl From<&[u8]> for Datum { + fn from(value: &[u8]) -> Self { + Self::String(value.into()) + } +} diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index e5e8f1df07..151e218987 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -31,16 +31,15 @@ use encoding_rs::Encoding; use enum_map::{Enum, EnumMap}; use indexmap::IndexSet; use num::integer::div_ceil; -use ordered_float::OrderedFloat; use thiserror::Error as ThisError; use unicase::UniCase; use crate::{ + data::Datum, format::{DisplayPlain, Format}, identifier::{ByIdentifier, HasIdentifier, Identifier}, output::pivot::{Axis3, Dimension, Footnote, Footnotes, Group, PivotTable, Value}, settings::Show, - sys::raw::RawString, }; /// An index within [Dictionary::variables]. 
@@ -235,159 +234,6 @@ impl Display for VarWidthAdjective { } } -#[derive(Clone)] -pub enum Datum { - Number(Option), - String(RawString), -} - -impl Debug for Datum { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - Datum::Number(Some(number)) => write!(f, "{number:?}"), - Datum::Number(None) => write!(f, "SYSMIS"), - Datum::String(s) => write!(f, "{:?}", s), - } - } -} - -impl PartialEq for Datum { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (Self::Number(Some(l0)), Self::Number(Some(r0))) => { - OrderedFloat(*l0) == OrderedFloat(*r0) - } - (Self::Number(None), Self::Number(None)) => true, - (Self::String(l0), Self::String(r0)) => l0 == r0, - _ => false, - } - } -} - -impl Eq for Datum {} - -impl PartialOrd for Datum { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for Datum { - fn cmp(&self, other: &Self) -> Ordering { - match (self, other) { - (Datum::Number(a), Datum::Number(b)) => match (a, b) { - (None, None) => Ordering::Equal, - (None, Some(_)) => Ordering::Less, - (Some(_), None) => Ordering::Greater, - (Some(a), Some(b)) => a.total_cmp(b), - }, - (Datum::Number(_), Datum::String(_)) => Ordering::Less, - (Datum::String(_), Datum::Number(_)) => Ordering::Greater, - (Datum::String(a), Datum::String(b)) => a.cmp(b), - } - } -} - -impl Hash for Datum { - fn hash(&self, state: &mut H) { - match self { - Datum::Number(number) => number.map(OrderedFloat).hash(state), - Datum::String(string) => string.hash(state), - } - } -} - -impl Datum { - pub const fn sysmis() -> Self { - Self::Number(None) - } - - pub fn as_number(&self) -> Option> { - match self { - Datum::Number(number) => Some(*number), - Datum::String(_) => None, - } - } - - pub fn as_string(&self) -> Option<&RawString> { - match self { - Datum::Number(_) => None, - Datum::String(s) => Some(s), - } - } - - pub fn is_resizable(&self, width: VarWidth) -> bool { - match (self, width) { - (Datum::Number(_), VarWidth::Numeric) 
=> true, - (Datum::String(s), VarWidth::String(new_width)) => { - let new_len = new_width as usize; - new_len >= s.len() || s.0[new_len..].iter().all(|c| *c == b' ') - } - _ => false, - } - } - - pub fn resize(&mut self, width: VarWidth) { - match (self, width) { - (Datum::Number(_), VarWidth::Numeric) => (), - (Datum::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize), - _ => unreachable!(), - } - } - - pub fn var_type(&self) -> VarType { - match self { - Self::Number(_) => VarType::Numeric, - Self::String(_) => VarType::String, - } - } - - pub fn width(&self) -> VarWidth { - match self { - Datum::Number(_) => VarWidth::Numeric, - Datum::String(s) => VarWidth::String(s.len().try_into().unwrap()), - } - } - - pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool { - match (self, other) { - (Self::String(a), Self::String(b)) => a.eq_ignore_trailing_spaces(b), - _ => self == other, - } - } - - pub fn trim_end(&mut self) { - match self { - Self::Number(_) => (), - Self::String(s) => s.trim_end(), - } - } -} - -impl From for Datum { - fn from(number: f64) -> Self { - Some(number).into() - } -} - -impl From> for Datum { - fn from(value: Option) -> Self { - Self::Number(value) - } -} - -impl From<&str> for Datum { - fn from(value: &str) -> Self { - value.as_bytes().into() - } -} - -impl From<&[u8]> for Datum { - fn from(value: &[u8]) -> Self { - Self::String(value.into()) - } -} - /// A collection of variables, plus additional metadata. 
#[derive(Clone, Debug)] pub struct Dictionary { diff --git a/rust/pspp/src/engine.rs b/rust/pspp/src/engine.rs index 6d9085c214..e8a71266e0 100644 --- a/rust/pspp/src/engine.rs +++ b/rust/pspp/src/engine.rs @@ -54,13 +54,11 @@ mod tests { #[ignore] fn test_echo() { let mut engine = Engine::new(); - engine.run(Source::new_default(&Arc::new( - SyntaxFile::new( - "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(), - Some("test.sps".to_string()), - UTF_8, - ), - ))); + engine.run(Source::new_default(&Arc::new(SyntaxFile::new( + "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(), + Some("test.sps".to_string()), + UTF_8, + )))); } #[test] diff --git a/rust/pspp/src/format/display/mod.rs b/rust/pspp/src/format/display/mod.rs index 92e628998f..51afc68be4 100644 --- a/rust/pspp/src/format/display/mod.rs +++ b/rust/pspp/src/format/display/mod.rs @@ -29,7 +29,7 @@ use smallvec::{Array, SmallVec}; use crate::{ calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name}, - dictionary::Datum, + data::Datum, endian::ToBytes, format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, diff --git a/rust/pspp/src/format/display/test.rs b/rust/pspp/src/format/display/test.rs index c041fa6a2b..d9b4dd5f64 100644 --- a/rust/pspp/src/format/display/test.rs +++ b/rust/pspp/src/format/display/test.rs @@ -23,7 +23,7 @@ use smallstr::SmallString; use smallvec::SmallVec; use crate::{ - dictionary::Datum, + data::Datum, endian::Endian, format::{AbstractFormat, Epoch, Format, Settings, Type, UncheckedFormat, CC}, lex::{scan::StringScanner, segment::Syntax, Punct, Token}, diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index f7abf58757..d46db2d8ee 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -28,8 +28,10 @@ use thiserror::Error as ThisError; use unicode_width::UnicodeWidthStr; use crate::{ - dictionary::{Datum, VarType, 
VarWidth}, - sys::raw::{self, RawString}, + data::RawString, + data::Datum, + dictionary::{VarType, VarWidth}, + sys::raw, }; mod display; diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 39f5f67fcc..b69fbdb48b 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -16,7 +16,7 @@ use crate::{ calendar::{calendar_gregorian_to_offset, DateError}, - dictionary::Datum, + data::Datum, endian::{Endian, Parse}, format::{DateTemplate, Decimals, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, @@ -921,7 +921,7 @@ mod test { use crate::{ calendar::{days_in_month, is_leap_year}, - dictionary::Datum, + data::Datum, endian::Endian, format::{ parse::{ParseError, ParseErrorKind, Sign}, diff --git a/rust/pspp/src/lib.rs b/rust/pspp/src/lib.rs index 8b5d81f400..ee567c5d81 100644 --- a/rust/pspp/src/lib.rs +++ b/rust/pspp/src/lib.rs @@ -17,6 +17,7 @@ pub mod calendar; pub mod command; pub mod crypto; +pub mod data; pub mod dictionary; pub mod endian; pub mod engine; diff --git a/rust/pspp/src/output/pivot/mod.rs b/rust/pspp/src/output/pivot/mod.rs index a085ef4929..f8301aa43c 100644 --- a/rust/pspp/src/output/pivot/mod.rs +++ b/rust/pspp/src/output/pivot/mod.rs @@ -68,7 +68,8 @@ use thiserror::Error as ThisError; use tlo::parse_tlo; use crate::{ - dictionary::{Datum, VarType, Variable}, + data::Datum, + dictionary::{VarType, Variable}, format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat}, settings::{Settings, Show}, }; diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 914f03c10d..8a00f5ddd3 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -18,8 +18,9 @@ use std::{collections::BTreeMap, ops::Range}; use crate::{ calendar::date_time_to_pspp, + data::{Datum, RawString}, dictionary::{ - Datum, Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet, + Dictionary, InvalidRole, MissingValues, 
MissingValuesError, MultipleResponseSet, MultipleResponseType, VarWidth, Variable, VariableSet, }, endian::Endian, @@ -39,7 +40,7 @@ use crate::{ VarDisplayRecord, VariableAttributesRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer, }, - Cases, DecodedRecord, RawDatum, RawString, RawWidth, + Cases, DecodedRecord, RawDatum, RawWidth, }, }, }; diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index f4e7034ca0..b1263be32c 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -20,7 +20,8 @@ //! raw details. Most readers will want to use higher-level interfaces. use crate::{ - dictionary::{Datum, VarType, VarWidth}, + data::{Datum, RawStr, RawString}, + dictionary::{VarType, VarWidth}, endian::{Endian, Parse, ToBytes}, format::DisplayPlainF64, identifier::{Error as IdError, Identifier}, @@ -39,12 +40,12 @@ use crate::{ }, }; -use encoding_rs::{mem::decode_latin1, Encoding}; +use encoding_rs::Encoding; use flate2::read::ZlibDecoder; use itertools::Itertools; use smallvec::SmallVec; use std::{ - borrow::{Borrow, Cow}, + borrow::Cow, cell::RefCell, collections::VecDeque, fmt::{Debug, Display, Formatter, Result as FmtResult}, @@ -52,8 +53,6 @@ use std::{ iter::repeat_n, mem::take, num::NonZeroU8, - ops::Deref, - str::from_utf8, }; use thiserror::Error as ThisError; @@ -713,12 +712,6 @@ pub fn infer_encoding( } } -// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it -// decoded as Latin-1 (actually bytes interpreted as Unicode code points). -fn default_decode(s: &[u8]) -> Cow { - from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from) -} - /// An [Encoding] along with a function to report decoding errors. /// /// This is used by functions that decode raw records. 
@@ -872,7 +865,7 @@ impl Debug for RawDatum { match self { RawDatum::Number(Some(number)) => write!(f, "{number:?}"), RawDatum::Number(None) => write!(f, "SYSMIS"), - RawDatum::String(s) => write!(f, "{:?}", default_decode(s)), + RawDatum::String(s) => write!(f, "{:?}", RawStr::from_bytes(s)), } } } @@ -1534,145 +1527,7 @@ impl Debug for UntypedDatum { } else { big }; - write!(f, "{number}")?; - - let string = default_decode(&self.0); - let string = string - .split(|c: char| c == '\0' || c.is_control()) - .next() - .unwrap(); - write!(f, "{string:?}")?; - Ok(()) - } -} - -/// An owned string in an unspecified encoding. -/// -/// We assume that the encoding is one supported by [encoding_rs] with byte -/// units (that is, not a `UTF-16` encoding). All of these encodings have some -/// basic ASCII compatibility. -/// -/// A [RawString] owns its contents and can grow and shrink, like a [Vec] or -/// [String]. For a borrowed raw string, see [RawStr]. -#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)] -pub struct RawString(pub Vec); - -impl RawString { - pub fn spaces(n: usize) -> Self { - Self(std::iter::repeat_n(b' ', n).collect()) - } - pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { - EncodedStr::new(&self.0, encoding) - } - pub fn resize(&mut self, len: usize) { - self.0.resize(len, b' '); - } - pub fn len(&self) -> usize { - self.0.len() - } - pub fn trim_end(&mut self) { - while self.0.pop_if(|c| *c == b' ').is_some() {} - } -} - -impl Borrow for RawString { - fn borrow(&self) -> &RawStr { - RawStr::from_bytes(&self.0) - } -} - -impl Deref for RawString { - type Target = RawStr; - - fn deref(&self) -> &Self::Target { - self.borrow() - } -} - -impl From> for RawString { - fn from(value: Cow<'_, [u8]>) -> Self { - Self(value.into_owned()) - } -} - -impl From> for RawString { - fn from(source: Vec) -> Self { - Self(source) - } -} - -impl From<&[u8]> for RawString { - fn from(source: &[u8]) -> Self { - Self(source.into()) - } -} 
- -impl Debug for RawString { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", default_decode(self.0.as_slice())) - } -} - -/// A borrowed string in an unspecified encoding. -/// -/// We assume that the encoding is one supported by [encoding_rs] with byte -/// units (that is, not a `UTF-16` encoding). All of these encodings have some -/// basic ASCII compatibility. -/// -/// For an owned raw string, see [RawString]. -#[repr(transparent)] -#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct RawStr(pub [u8]); - -impl RawStr { - pub fn from_bytes(bytes: &[u8]) -> &Self { - // SAFETY: `RawStr` is a transparent wrapper around `[u8]`, so we can - // turn a reference to the wrapped type into a reference to the wrapper - // type. - unsafe { &*(bytes as *const [u8] as *const Self) } - } - - pub fn as_bytes(&self) -> &[u8] { - &self.0 - } - - /// Returns an object that implements [Display] for printing this [RawStr], - /// given that it is encoded in `encoding`. - pub fn display(&self, encoding: &'static Encoding) -> DisplayRawString { - DisplayRawString(encoding.decode_without_bom_handling(&self.0).0) - } - - pub fn decode(&self, encoding: &'static Encoding) -> Cow<'_, str> { - encoding.decode_without_bom_handling(&self.0).0 - } - - pub fn eq_ignore_trailing_spaces(&self, other: &RawStr) -> bool { - let mut this = self.0.iter(); - let mut other = other.0.iter(); - loop { - match (this.next(), other.next()) { - (Some(a), Some(b)) if a == b => (), - (Some(_), Some(_)) => return false, - (None, None) => return true, - (Some(b' '), None) => return this.all(|c| *c == b' '), - (None, Some(b' ')) => return other.all(|c| *c == b' '), - (Some(_), None) | (None, Some(_)) => return false, - } - } - } -} - -pub struct DisplayRawString<'a>(Cow<'a, str>); - -impl<'a> Display for DisplayRawString<'a> { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - write!(f, "{}", &self.0) - } -} - -impl Debug for RawStr { - fn fmt(&self, f: &mut Formatter) -> 
FmtResult { - write!(f, "{:?}", default_decode(self.as_bytes())) + write!(f, "{number}/{:?}", RawStr::from_bytes(&self.0)) } } @@ -1687,7 +1542,7 @@ impl From<[u8; N]> for RawStrArray { impl Debug for RawStrArray { fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", default_decode(&self.0)) + write!(f, "{:?}", RawStr::from_bytes(&self.0)) } } diff --git a/rust/pspp/src/sys/raw/records.rs b/rust/pspp/src/sys/raw/records.rs index 00cbc9ca2c..cc1f2f2b57 100644 --- a/rust/pspp/src/sys/raw/records.rs +++ b/rust/pspp/src/sys/raw/records.rs @@ -12,15 +12,16 @@ use std::{ }; use crate::{ + data::{Datum, RawString}, dictionary::{ - Alignment, Attributes, CategoryLabels, Datum, Measure, MissingValueRange, MissingValues, - VarType, VarWidth, + Alignment, Attributes, CategoryLabels, Measure, MissingValueRange, MissingValues, VarType, + VarWidth, }, endian::{Endian, Parse}, identifier::{Error as IdError, Identifier}, sys::raw::{ read_bytes, read_string, read_vec, DecodedRecord, Decoder, Error, Magic, RawDatum, - RawStrArray, RawString, RawWidth, Record, VarTypes, Warning, + RawStrArray, RawWidth, Record, VarTypes, Warning, }, }; diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 34aba7f282..e0fb088c9e 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -608,8 +608,7 @@ where let output = match reader.headers().collect::, _>>() { Ok(headers) => { let cases = reader.cases(); - let encoding = - infer_encoding(&headers, &mut |warning| warnings.push(warning)).unwrap(); + let encoding = infer_encoding(&headers, &mut |warning| warnings.push(warning)).unwrap(); let mut decoder = Decoder::new(encoding, |warning| warnings.push(warning)); let mut decoded_records = Vec::new(); for header in headers {