start using encodedstring in variable
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 18 May 2025 22:15:42 +0000 (15:15 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 18 May 2025 22:15:42 +0000 (15:15 -0700)
rust/pspp/src/dictionary.rs
rust/pspp/src/sys/cooked.rs
rust/pspp/src/sys/raw.rs

diff --git a/rust/pspp/src/dictionary.rs b/rust/pspp/src/dictionary.rs
index f8abe996d9b2fc8d023371dab012ae296850f266..d0e2080fd1fd77310dea8592eb3f074d1061fa3c 100644 (file)
@@ -19,7 +19,9 @@ use unicase::UniCase;
 use crate::{
     format::Format,
     identifier::{ByIdentifier, HasIdentifier, Identifier},
-    sys::raw::{Alignment, CategoryLabels, Measure, MissingValues, RawString, VarType},
+    sys::raw::{
+        Alignment, CategoryLabels, EncodedString, Measure, MissingValues, RawString, VarType,
+    },
 };
 
 /// An index within [Dictionary::variables].
@@ -167,6 +169,15 @@ pub enum Value<S = RawString> {
     String(S),
 }
 
+impl Value {
+    pub fn with_encoding(&self, encoding: &'static Encoding) -> Value<EncodedString> {
+        match self {
+            Value::Number(number) => Value::Number(*number),
+            Value::String(string) => Value::String(string.with_encoding(encoding)),
+        }
+    }
+}
+
 impl<S> Debug for Value<S>
 where
     S: Debug,
@@ -642,7 +653,7 @@ pub struct Variable {
     /// `None`).
     ///
     /// Both kinds of missing values are excluded from most analyses.
-    pub missing_values: MissingValues,
+    pub missing_values: MissingValues<EncodedString>,
 
     /// Output format used in most contexts.
     pub print_format: Format,
diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs
index 5d59305defb5c1cdc063aa72abaa117f8798159b..b9bbfbae2a402219ce5ef427f0bc77d6edda7baa 100644 (file)
@@ -9,14 +9,17 @@ use crate::{
     endian::Endian,
     format::{Error as FormatError, Format, UncheckedFormat},
     identifier::{ByIdentifier, Error as IdError, Identifier},
-    sys::encoding::Error as EncodingError,
-    sys::raw::{
-        self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
-        FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName, LongNamesRecord,
-        LongStringMissingValueRecord, LongStringValueLabelRecord, MissingValues,
-        MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord, RawStrArray, RawWidth,
-        ValueLabel, ValueLabelRecord, VarDisplayRecord, VariableAttributeRecord, VariableRecord,
-        VariableSetRecord, VeryLongStringsRecord, ZHeader, ZTrailer,
+    sys::{
+        encoding::Error as EncodingError,
+        raw::{
+            self, Cases, DecodedRecord, DocumentRecord, EncodedString, EncodingRecord, Extension,
+            FileAttributeRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName,
+            LongNamesRecord, LongStringMissingValueRecord, LongStringValueLabelRecord,
+            MissingValues, MultipleResponseRecord, NumberOfCasesRecord, ProductInfoRecord,
+            RawStrArray, RawWidth, ValueLabel, ValueLabelRecord, VarDisplayRecord,
+            VariableAttributeRecord, VariableRecord, VariableSetRecord, VeryLongStringsRecord,
+            ZHeader, ZTrailer,
+        },
     },
 };
 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
@@ -503,7 +506,7 @@ pub fn decode(
 
         variable.label = input.label.clone();
 
-        variable.missing_values = input.missing_values.clone();
+        variable.missing_values = input.missing_values.with_encoding(encoding);
 
         variable.print_format = decode_format(
             input.print_format,
@@ -767,7 +770,7 @@ pub fn decode(
                 value.clear();
                 value.extend_from_slice(v.0.as_slice());
                 value.resize(variable.width.as_string_width().unwrap(), b' ');
-                Value::String(Box::from(value.as_slice()))
+                Value::String(EncodedString::new(&*value, encoding))
             })
             .collect::<Vec<_>>();
         variable.missing_values = MissingValues {
diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs
index c26ae08052c6722d61dae5ab1c634481f8727d35..ec4e3643b72b5f65fa0b5330ad8521509a179b54 100644 (file)
@@ -9,7 +9,7 @@ use encoding_rs::{mem::decode_latin1, Encoding, UTF_8};
 use flate2::read::ZlibDecoder;
 use num::Integer;
 use std::{
-    borrow::{Borrow, Cow},
+    borrow::Cow,
     cell::RefCell,
     collections::{HashMap, VecDeque},
     fmt::{Debug, Display, Formatter, Result as FmtResult},
@@ -1121,7 +1121,7 @@ fn format_name(type_: u32) -> Cow<'static, str> {
 }
 
 #[derive(Clone, Default)]
-pub struct MissingValues<S = Box<[u8]>>
+pub struct MissingValues<S = RawString>
 where
     S: Debug,
 {
@@ -1129,17 +1129,39 @@ where
     pub values: Vec<Value<S>>,
 
     /// Optional range of missing values.
-    pub range: Option<MissingValueRange<S>>,
+    pub range: Option<MissingValueRange>,
 }
 
-#[derive(Clone)]
-pub enum MissingValueRange<S = Box<[u8]>>
-where
-    S: Debug,
-{
-    In { low: Value<S>, high: Value<S> },
-    From { low: Value<S> },
-    To { high: Value<S> },
+#[derive(Copy, Clone)]
+pub enum MissingValueRange {
+    In { low: f64, high: f64 },
+    From { low: f64 },
+    To { high: f64 },
+}
+
+impl MissingValueRange {
+    pub fn new(low: f64, high: f64) -> Self {
+        const LOWEST: f64 = f64::MIN.next_up();
+        match (low, high) {
+            (f64::MIN | LOWEST, _) => Self::To { high },
+            (_, f64::MAX) => Self::From { low },
+            (_, _) => Self::In { low, high },
+        }
+    }
+
+    pub fn low(&self) -> Option<f64> {
+        match self {
+            MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low),
+            MissingValueRange::To { .. } => None,
+        }
+    }
+
+    pub fn high(&self) -> Option<f64> {
+        match self {
+            MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high),
+            MissingValueRange::From { .. } => None,
+        }
+    }
 }
 
 impl<S> Debug for MissingValues<S>
@@ -1158,10 +1180,14 @@ where
             if !self.values.is_empty() {
                 write!(f, ", ")?;
             }
-            match range {
-                MissingValueRange::In { low, high } => write!(f, "{low:?} THRU {high:?}")?,
-                MissingValueRange::From { low } => write!(f, "{low:?} THRU HI")?,
-                MissingValueRange::To { high } => write!(f, "LOW THRU {high:?}")?,
+            match range.low() {
+                Some(low) => write!(f, "{low:?}")?,
+                None => write!(f, "LOW")?,
+            }
+            write!(f, " THRU ")?;
+            match range.high() {
+                Some(high) => write!(f, "{high:?}")?,
+                None => write!(f, "HIGH")?,
             }
         }
 
@@ -1218,22 +1244,9 @@ impl MissingValues {
                     .map(|v| Value::Number(endian.parse(v)))
                     .collect();
 
-                const LOWEST: f64 = f64::MIN.next_up();
-                let range =
-                    range.map(
-                        |(low, high)| match (endian.parse(low), endian.parse(high)) {
-                            (f64::MIN | LOWEST, high) => MissingValueRange::To {
-                                high: Value::Number(Some(high)),
-                            },
-                            (low, f64::MAX) => MissingValueRange::From {
-                                low: Value::Number(Some(low)),
-                            },
-                            (low, high) => MissingValueRange::In {
-                                low: Value::Number(Some(low)),
-                                high: Value::Number(Some(high)),
-                            },
-                        },
-                    );
+                let range = range.map(|(low, high)| {
+                    MissingValueRange::new(endian.parse(low), endian.parse(high))
+                });
                 return Ok(Self { values, range });
             }
             Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::MissingValueStringRange),
@@ -1241,7 +1254,7 @@ impl MissingValues {
                 let width = width.min(8) as usize;
                 let values = values
                     .into_iter()
-                    .map(|value| Value::String(Box::from(&value[..width])))
+                    .map(|value| Value::String(RawString::from(&value[..width])))
                     .collect();
                 return Ok(Self {
                     values,
@@ -1252,6 +1265,17 @@ impl MissingValues {
         }
         Ok(Self::default())
     }
+
+    pub fn with_encoding(&self, encoding: &'static Encoding) -> MissingValues<EncodedString> {
+        MissingValues {
+            values: self
+                .values
+                .iter()
+                .map(|value| value.with_encoding(encoding))
+                .collect(),
+            range: self.range,
+        }
+    }
 }
 
 #[derive(Clone)]
@@ -1443,6 +1467,9 @@ impl RawString {
     pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> {
         EncodedStr::new(&self.0, encoding)
     }
+    pub fn with_encoding(&self, encoding: &'static Encoding) -> EncodedString {
+        EncodedString::new(&*self.0, encoding)
+    }
     pub fn as_slice(&self) -> &[u8] {
         &*self.0
     }
@@ -1498,39 +1525,85 @@ impl<const N: usize> Debug for RawStrArray<N> {
     }
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 pub enum EncodedString {
     Encoded {
         bytes: Vec<u8>,
         encoding: &'static Encoding,
     },
-    Utf8 {
-        s: String,
-    },
+    Utf8(String),
 }
 
 impl EncodedString {
+    /// Creates a new `EncodedString` from `bytes` and `encoding`.
+    ///
+    /// It's cheaper to use `EncodedString::from(string)` if the input is in a
+    /// `&str` or `String`.
+    pub fn new(bytes: impl Into<Vec<u8>>, encoding: &'static Encoding) -> Self {
+        let bytes: Vec<u8> = bytes.into();
+        if encoding == UTF_8 {
+            match String::from_utf8(bytes) {
+                Ok(string) => Self::Utf8(string),
+                Err(error) => Self::Encoded {
+                    bytes: error.into_bytes(),
+                    encoding,
+                },
+            }
+        } else {
+            Self::Encoded { bytes, encoding }
+        }
+    }
+
     pub fn borrowed(&self) -> EncodedStr<'_> {
         match self {
             EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding },
-            EncodedString::Utf8 { s } => EncodedStr::Utf8 { s },
+            EncodedString::Utf8(s) => EncodedStr::Utf8 { s },
         }
     }
     pub fn as_utf8_bytes(&self) -> Option<&[u8]> {
         match self {
             EncodedString::Encoded { bytes, encoding } if *encoding == UTF_8 => Some(&bytes),
-            EncodedString::Utf8 { s } => Some(s.as_bytes()),
+            EncodedString::Utf8(s) => Some(s.as_bytes()),
             _ => None,
         }
     }
     pub fn as_encoded(&self) -> (&[u8], &'static Encoding) {
         match self {
             EncodedString::Encoded { bytes, encoding } => (&bytes, encoding),
-            EncodedString::Utf8 { s } => (s.as_bytes(), UTF_8),
+            EncodedString::Utf8(s) => (s.as_bytes(), UTF_8),
         }
     }
 }
 
+impl Debug for EncodedString {
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        match self {
+            Self::Encoded { bytes, encoding } => {
+                write!(f, "{:?}({})", self.borrowed().to_utf8(), encoding.name())
+            }
+            Self::Utf8(string) => write!(f, "{string:?}"),
+        }
+    }
+}
+
+impl From<String> for EncodedString {
+    fn from(value: String) -> Self {
+        Self::Utf8(value)
+    }
+}
+
+impl From<&'_ str> for EncodedString {
+    fn from(value: &'_ str) -> Self {
+        Self::Utf8(value.into())
+    }
+}
+
+impl Default for EncodedString {
+    fn default() -> Self {
+        Self::Utf8(String::new())
+    }
+}
+
 impl<'a> From<EncodedStr<'a>> for EncodedString {
     fn from(value: EncodedStr<'a>) -> Self {
         match value {
@@ -1538,7 +1611,7 @@ impl<'a> From<EncodedStr<'a>> for EncodedString {
                 bytes: bytes.into(),
                 encoding,
             },
-            EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() },
+            EncodedStr::Utf8 { s } => Self::Utf8(s.into()),
         }
     }
 }