From: Ben Pfaff Date: Sun, 18 May 2025 22:15:42 +0000 (-0700) Subject: start using encodedstring in variable X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8c675eec48e1cd5f3dd9e7a04e3aa5f77820f89e;p=pspp start using encodedstring in variable --- diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs index f8abe996d9..d0e2080fd1 100644 --- a/rust/pspp/src/dictionary.rs +++ b/rust/pspp/src/dictionary.rs @@ -19,7 +19,9 @@ use unicase::UniCase; use crate::{ format::Format, identifier::{ByIdentifier, HasIdentifier, Identifier}, - sys::raw::{Alignment, CategoryLabels, Measure, MissingValues, RawString, VarType}, + sys::raw::{ + Alignment, CategoryLabels, EncodedString, Measure, MissingValues, RawString, VarType, + }, }; /// An index within [Dictionary::variables]. @@ -167,6 +169,15 @@ pub enum Value { String(S), } +impl Value { + pub fn with_encoding(&self, encoding: &'static Encoding) -> Value { + match self { + Value::Number(number) => Value::Number(*number), + Value::String(string) => Value::String(string.with_encoding(encoding)), + } + } +} + impl Debug for Value where S: Debug, @@ -642,7 +653,7 @@ pub struct Variable { /// `None`). /// /// Both kinds of missing values are excluded from most analyses. - pub missing_values: MissingValues, + pub missing_values: MissingValues, /// Output format used in most contexts. pub print_format: Format, diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 5d59305def..b9bbfbae2a 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -9,14 +9,17 @@ use crate::{ endian::Endian, format::{Error as FormatError, Format, UncheckedFormat}, identifier::{ByIdentifier, Error as IdError, Identifier}, - sys::encoding::Error as EncodingError, - sys::raw::{ - self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord, - FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord, - LongStringMissingValueRecord, LongStringValueLabelRecord, MissingValues, - MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawStrArray, RawWidth, - ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributeRecord, VariableRecord, - VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer, + sys::{ + encoding::Error as EncodingError, + raw::{ + self, Cases, DecodedRecord, DocumentRecord, EncodedString, EncodingRecord, Extension, + FileAttributeRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, + LongNamesRecord, LongStringMissingValueRecord, LongStringValueLabelRecord, + MissingValues, MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, + RawStrArray, RawWidth, ValueLabel, ValueLabelRecord, VarDisplayRecord, + VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord, + ZHeader, ZTrailer, + }, }, }; use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; @@ -503,7 +506,7 @@ pub fn decode( variable.label = input.label.clone(); - variable.missing_values = input.missing_values.clone(); + variable.missing_values = input.missing_values.with_encoding(encoding); variable.print_format = decode_format( input.print_format, @@ -767,7 +770,7 @@ pub fn decode( value.clear(); value.extend_from_slice(v.0.as_slice()); value.resize(variable.width.as_string_width().unwrap(), b' '); - Value::String(Box::from(value.as_slice())) + Value::String(EncodedString::new(&*value, encoding)) }) .collect::>(); variable.missing_values = MissingValues { diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index c26ae08052..ec4e3643b7 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -9,7 +9,7 @@ use encoding_rs::{mem::decode_latin1, Encoding, UTF_8}; use flate2::read::ZlibDecoder; use num::Integer; use std::{ - borrow::{Borrow, Cow}, + borrow::Cow, cell::RefCell, collections::{HashMap, VecDeque}, fmt::{Debug, Display, Formatter, Result as FmtResult}, @@ -1121,7 +1121,7 @@ fn format_name(type_: u32) -> Cow<'static, str> { } #[derive(Clone, Default)] -pub struct MissingValues> +pub struct MissingValues where S: Debug, { @@ -1129,17 +1129,39 @@ where pub values: Vec>, /// Optional range of missing values. - pub range: Option>, + pub range: Option, } -#[derive(Clone)] -pub enum MissingValueRange> -where - S: Debug, -{ - In { low: Value, high: Value }, - From { low: Value }, - To { high: Value }, +#[derive(Copy, Clone)] +pub enum MissingValueRange { + In { low: f64, high: f64 }, + From { low: f64 }, + To { high: f64 }, +} + +impl MissingValueRange { + pub fn new(low: f64, high: f64) -> Self { + const LOWEST: f64 = f64::MIN.next_up(); + match (low, high) { + (f64::MIN | LOWEST, _) => Self::To { high }, + (_, f64::MAX) => Self::From { low }, + (_, _) => Self::In { low, high }, + } + } + + pub fn low(&self) -> Option { + match self { + MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low), + MissingValueRange::To { .. } => None, + } + } + + pub fn high(&self) -> Option { + match self { + MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high), + MissingValueRange::From { .. } => None, + } + } } impl Debug for MissingValues @@ -1158,10 +1180,14 @@ where if !self.values.is_empty() { write!(f, ", ")?; } - match range { - MissingValueRange::In { low, high } => write!(f, "{low:?} THRU {high:?}")?, - MissingValueRange::From { low } => write!(f, "{low:?} THRU HI")?, - MissingValueRange::To { high } => write!(f, "LOW THRU {high:?}")?, + match range.low() { + Some(low) => write!(f, "{low:?}")?, + None => write!(f, "LOW")?, + } + write!(f, " THRU ")?; + match range.high() { + Some(high) => write!(f, "{high:?}")?, + None => write!(f, "HIGH")?, } } @@ -1218,22 +1244,9 @@ impl MissingValues { .map(|v| Value::Number(endian.parse(v))) .collect(); - const LOWEST: f64 = f64::MIN.next_up(); - let range = - range.map( - |(low, high)| match (endian.parse(low), endian.parse(high)) { - (f64::MIN | LOWEST, high) => MissingValueRange::To { - high: Value::Number(Some(high)), - }, - (low, f64::MAX) => MissingValueRange::From { - low: Value::Number(Some(low)), - }, - (low, high) => MissingValueRange::In { - low: Value::Number(Some(low)), - high: Value::Number(Some(high)), - }, - }, - ); + let range = range.map(|(low, high)| { + MissingValueRange::new(endian.parse(low), endian.parse(high)) + }); return Ok(Self { values, range }); } Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::MissingValueStringRange), @@ -1241,7 +1254,7 @@ impl MissingValues { let width = width.min(8) as usize; let values = values .into_iter() - .map(|value| Value::String(Box::from(&value[..width]))) + .map(|value| Value::String(RawString::from(&value[..width]))) .collect(); return Ok(Self { values, @@ -1252,6 +1265,17 @@ impl MissingValues { } Ok(Self::default()) } + + pub fn with_encoding(&self, encoding: &'static Encoding) -> MissingValues { + MissingValues { + values: self + .values + .iter() + .map(|value| value.with_encoding(encoding)) + .collect(), + range: self.range, + } + } } #[derive(Clone)] @@ -1443,6 +1467,9 @@ impl RawString { pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { EncodedStr::new(&self.0, encoding) } + pub fn with_encoding(&self, encoding: &'static Encoding) -> EncodedString { + EncodedString::new(&*self.0, encoding) + } pub fn as_slice(&self) -> &[u8] { &*self.0 } @@ -1498,39 +1525,85 @@ impl Debug for RawStrArray { } } -#[derive(Clone, Debug)] +#[derive(Clone)] pub enum EncodedString { Encoded { bytes: Vec, encoding: &'static Encoding, }, - Utf8 { - s: String, - }, + Utf8(String), } impl EncodedString { + /// Creates a new `EncodedString` from `bytes` and `encoding`. + /// + /// It's cheaper to use `EncodedString::from(string)` if the input is in a + /// `&str` or `String`. + pub fn new(bytes: impl Into>, encoding: &'static Encoding) -> Self { + let bytes: Vec = bytes.into(); + if encoding == UTF_8 { + match String::from_utf8(bytes) { + Ok(string) => Self::Utf8(string), + Err(error) => Self::Encoded { + bytes: error.into_bytes(), + encoding, + }, + } + } else { + Self::Encoded { bytes, encoding } + } + } + pub fn borrowed(&self) -> EncodedStr<'_> { match self { EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding }, - EncodedString::Utf8 { s } => EncodedStr::Utf8 { s }, + EncodedString::Utf8(s) => EncodedStr::Utf8 { s }, } } pub fn as_utf8_bytes(&self) -> Option<&[u8]> { match self { EncodedString::Encoded { bytes, encoding } if *encoding == UTF_8 => Some(&bytes), - EncodedString::Utf8 { s } => Some(s.as_bytes()), + EncodedString::Utf8(s) => Some(s.as_bytes()), _ => None, } } pub fn as_encoded(&self) -> (&[u8], &'static Encoding) { match self { EncodedString::Encoded { bytes, encoding } => (&bytes, encoding), - EncodedString::Utf8 { s } => (s.as_bytes(), UTF_8), + EncodedString::Utf8(s) => (s.as_bytes(), UTF_8), } } } +impl Debug for EncodedString { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self { + Self::Encoded { bytes, encoding } => { + write!(f, "{:?}({})", self.borrowed().to_utf8(), encoding.name()) + } + Self::Utf8(string) => write!(f, "{string:?}"), + } + } +} + +impl From for EncodedString { + fn from(value: String) -> Self { + Self::Utf8(value) + } +} + +impl From<&'_ str> for EncodedString { + fn from(value: &'_ str) -> Self { + Self::Utf8(value.into()) + } +} + +impl Default for EncodedString { + fn default() -> Self { + Self::Utf8(String::new()) + } +} + impl<'a> From> for EncodedString { fn from(value: EncodedStr<'a>) -> Self { match value { @@ -1538,7 +1611,7 @@ impl<'a> From> for EncodedString { bytes: bytes.into(), encoding, }, - EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, + EncodedStr::Utf8 { s } => Self::Utf8(s.into()), } } }