From 0b6caa32f54f2bb4b69a62f8af31ffeb7ca45fea Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 13 Apr 2025 11:06:34 -0700 Subject: [PATCH] encodedstring --- rust/pspp/src/format/parse.rs | 91 ++++++++++++++++------------------- 1 file changed, 41 insertions(+), 50 deletions(-) diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 54b2c63672..1ad20e7c6a 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -14,6 +14,29 @@ use std::{ }; use thiserror::Error as ThisError; +#[derive(Clone, Debug)] +pub enum EncodedString { + Encoded { + bytes: Vec<u8>, + encoding: &'static Encoding, + }, + Utf8 { + s: String, + }, +} + +impl<'a> From<EncodedStr<'a>> for EncodedString { + fn from(value: EncodedStr<'a>) -> Self { + match value { + EncodedStr::Encoded { bytes, encoding } => Self::Encoded { + bytes: bytes.into(), + encoding, + }, + EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, + } + } +} + pub enum EncodedStr<'a> { Encoded { bytes: &'a [u8], @@ -71,10 +94,16 @@ impl<'a> From<&'a str> for EncodedStr<'a> { } } +impl<'a> From<&'a String> for EncodedStr<'a> { + fn from(s: &'a String) -> Self { + Self::Utf8 { s: s.as_str() } + } +} + #[derive(Clone, Debug)] pub struct ParseError { format: Format, - input: String, + input: EncodedString, kind: ParseErrorKind, } @@ -218,53 +247,15 @@ impl<'a> ParseValue<'a> { } } - /// Parses `s`. + /// Parses `input`. /// - /// This is only appropriate if `s` was originally encoded in UTF-8 - /// Otherwise, binary formats will not yield sensible parse results, because - /// recoding bytes from (e.g.) windows-1252 into UTF-8, and then + /// # Input encoding + /// + /// Be careful about the encoding of `input`. It's tempting to recode all + /// input into UTF-8, but this will screw up parsing of binary formats, + /// because recoding bytes from (e.g.) windows-1252 into UTF-8, and then /// interpreting them as a binary number yields nonsense.
- pub fn parse(&self, s: &str) -> Result<Value, ParseError> { - if s.is_empty() { - return Ok(self.format.default_value()); - } - match self.format.type_ { - Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => { - self.parse_number(s, self.format.type_) - } - Type::CC(_) => self.parse_number(s, Type::F), - Type::N => self.parse_n(s), - Type::Z => self.parse_z(s), - Type::PIBHex => self.parse_pibhex(s), - Type::RBHex => self.parse_rbhex(s), - Type::Date - | Type::ADate - | Type::EDate - | Type::JDate - | Type::SDate - | Type::QYr - | Type::MoYr - | Type::WkYr - | Type::DateTime - | Type::YmdHms - | Type::MTime - | Type::Time - | Type::DTime => self.parse_date(s), - Type::WkDay => self.parse_wkday(s), - Type::Month => self.parse_month(s), - Type::P | Type::PK | Type::IB | Type::PIB | Type::RB | Type::AHex => { - todo!() - } - Type::A => Ok(Value::String(self.output_encoding.encode(s).0.into())), - } - .map_err(|details| ParseError { - format: self.format, - input: s.into(), - kind: details, - }) - } - - pub fn parse_all<'b, T>(&self, input: T) -> Result<Value, ParseError> + pub fn parse<'b, T>(&self, input: T) -> Result<Value, ParseError> where T: Into<EncodedStr<'b>>, { @@ -306,10 +297,10 @@ impl<'a> ParseValue<'a> { )), Type::AHex => todo!(), } - .map_err(|details| ParseError { + .map_err(|kind| ParseError { format: self.format, - input: todo!(), - kind: details, + input: input.into(), + kind, }) } @@ -1698,7 +1689,7 @@ mod test { .unwrap() .parser(UTF_8) .with_endian(EndianSettings::new(Endian::Big)) - .parse_all(EncodedStr::new(&raw[..], UTF_8)) + .parse(EncodedStr::new(&raw[..], UTF_8)) .unwrap() .as_number() .unwrap() -- 2.30.2