encodedstring
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 13 Apr 2025 18:06:34 +0000 (11:06 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 13 Apr 2025 18:06:34 +0000 (11:06 -0700)
rust/pspp/src/format/parse.rs

index 54b2c636721ebe3a58937f9098f60740155a80cb..1ad20e7c6ab9d1cd1923e380b1ca7b81727d505d 100644 (file)
@@ -14,6 +14,29 @@ use std::{
 };
 use thiserror::Error as ThisError;
 
+#[derive(Clone, Debug)]
+pub enum EncodedString {
+    Encoded {
+        bytes: Vec<u8>,
+        encoding: &'static Encoding,
+    },
+    Utf8 {
+        s: String,
+    },
+}
+
+impl<'a> From<EncodedStr<'a>> for EncodedString {
+    fn from(value: EncodedStr<'a>) -> Self {
+        match value {
+            EncodedStr::Encoded { bytes, encoding } => Self::Encoded {
+                bytes: bytes.into(),
+                encoding,
+            },
+            EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() },
+        }
+    }
+}
+
 pub enum EncodedStr<'a> {
     Encoded {
         bytes: &'a [u8],
@@ -71,10 +94,16 @@ impl<'a> From<&'a str> for EncodedStr<'a> {
     }
 }
 
+impl<'a> From<&'a String> for EncodedStr<'a> {
+    fn from(s: &'a String) -> Self {
+        Self::Utf8 { s: s.as_str() }
+    }
+}
+
 #[derive(Clone, Debug)]
 pub struct ParseError {
     format: Format,
-    input: String,
+    input: EncodedString,
     kind: ParseErrorKind,
 }
 
@@ -218,53 +247,15 @@ impl<'a> ParseValue<'a> {
         }
     }
 
-    /// Parses `s`.
+    /// Parses `input`.
     ///
-    /// This is only appropriate if `s` was originally encoded in UTF-8
-    /// Otherwise, binary formats will not yield sensible parse results, because
-    /// recoding bytes from (e.g.) windows-1252 into UTF-8, and then
+    /// # Input encoding
+    ///
+    /// Be careful about the encoding of `input`.  It's tempting to recode all
+    /// input into UTF-8, but this will screw up parsing of binary formats,
+    /// because recoding bytes from (e.g.) windows-1252 into UTF-8, and then
     /// interpreting them as a binary number yields nonsense.
-    pub fn parse(&self, s: &str) -> Result<Value, ParseError> {
-        if s.is_empty() {
-            return Ok(self.format.default_value());
-        }
-        match self.format.type_ {
-            Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => {
-                self.parse_number(s, self.format.type_)
-            }
-            Type::CC(_) => self.parse_number(s, Type::F),
-            Type::N => self.parse_n(s),
-            Type::Z => self.parse_z(s),
-            Type::PIBHex => self.parse_pibhex(s),
-            Type::RBHex => self.parse_rbhex(s),
-            Type::Date
-            | Type::ADate
-            | Type::EDate
-            | Type::JDate
-            | Type::SDate
-            | Type::QYr
-            | Type::MoYr
-            | Type::WkYr
-            | Type::DateTime
-            | Type::YmdHms
-            | Type::MTime
-            | Type::Time
-            | Type::DTime => self.parse_date(s),
-            Type::WkDay => self.parse_wkday(s),
-            Type::Month => self.parse_month(s),
-            Type::P | Type::PK | Type::IB | Type::PIB | Type::RB | Type::AHex => {
-                todo!()
-            }
-            Type::A => Ok(Value::String(self.output_encoding.encode(s).0.into())),
-        }
-        .map_err(|details| ParseError {
-            format: self.format,
-            input: s.into(),
-            kind: details,
-        })
-    }
-
-    pub fn parse_all<'b, T>(&self, input: T) -> Result<Value, ParseError>
+    pub fn parse<'b, T>(&self, input: T) -> Result<Value, ParseError>
     where
         T: Into<EncodedStr<'b>>,
     {
@@ -306,10 +297,10 @@ impl<'a> ParseValue<'a> {
             )),
             Type::AHex => todo!(),
         }
-        .map_err(|details| ParseError {
+        .map_err(|kind| ParseError {
             format: self.format,
-            input: todo!(),
-            kind: details,
+            input: input.into(),
+            kind,
         })
     }
 
@@ -1698,7 +1689,7 @@ mod test {
                 .unwrap()
                 .parser(UTF_8)
                 .with_endian(EndianSettings::new(Endian::Big))
-                .parse_all(EncodedStr::new(&raw[..], UTF_8))
+                .parse(EncodedStr::new(&raw[..], UTF_8))
                 .unwrap()
                 .as_number()
                 .unwrap()