From a7107654235839a754e4bc2ba6b51a4d6176d3e9 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 13 Apr 2025 11:00:08 -0700 Subject: [PATCH] parse_all --- rust/pspp/src/format/display.rs | 8 +- rust/pspp/src/format/parse.rs | 200 ++++++++++++++++++++------------ rust/pspp/src/raw.rs | 6 + 3 files changed, 138 insertions(+), 76 deletions(-) diff --git a/rust/pspp/src/format/display.rs b/rust/pspp/src/format/display.rs index 6edf3dc9f3..698ddf3ab2 100644 --- a/rust/pspp/src/format/display.rs +++ b/rust/pspp/src/format/display.rs @@ -1353,7 +1353,7 @@ mod test { let settings = Settings::default().with_epoch(Epoch(1930)); let parser = Format::new(Type::DateTime, 40, 0) .unwrap() - .parser() + .parser(UTF_8) .with_settings(&settings); static INPUTS: &[&str; 20] = &[ "10-6-1648 0:0:0", @@ -1379,7 +1379,7 @@ mod test { ]; assert_eq!(expect.len(), INPUTS.len()); for (input, expect) in INPUTS.iter().copied().zip_eq(expect.iter().copied()) { - let value = parser.parse(input, UTF_8).unwrap(); + let value = parser.parse(input).unwrap(); let formatted = value .display(format, UTF_8) .with_settings(&settings) @@ -2328,14 +2328,14 @@ mod test { let output_filename = directory.join(name); let output = BufReader::new(File::open(&output_filename).unwrap()); - let parser = Format::new(Type::DTime, 40, 0).unwrap().parser(); + let parser = Format::new(Type::DTime, 40, 0).unwrap().parser(UTF_8); for ((input, expect), line_number) in input .lines() .map(|r| r.unwrap()) .zip_eq(output.lines().map(|r| r.unwrap())) .zip(1..) 
{ - let value = parser.parse(&input, UTF_8).unwrap(); + let value = parser.parse(&input).unwrap(); let formatted = value.display(format, UTF_8).to_string(); assert!( formatted == expect, diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 1344f71343..54b2c63672 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -8,16 +8,68 @@ use crate::{ use encoding_rs::Encoding; use smallstr::SmallString; use std::{ + borrow::Cow, fmt::{Display, Write}, str::FromStr, }; use thiserror::Error as ThisError; -/* -pub enum ParseInput { - String(String), - Bytes(Box<[u8]>), -}*/ +pub enum EncodedStr<'a> { + Encoded { + bytes: &'a [u8], + encoding: &'static Encoding, + }, + Utf8 { + s: &'a str, + }, +} + +impl<'a> EncodedStr<'a> { + pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { + Self::Encoded { bytes, encoding } + } + pub fn as_str(&self) -> Cow<'_, str> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + encoding.decode_without_bom_handling(&bytes).0 + } + EncodedStr::Utf8 { s } => Cow::from(*s), + } + } + pub fn as_bytes(&self) -> &[u8] { + match self { + EncodedStr::Encoded { bytes, .. } => bytes, + EncodedStr::Utf8 { s } => s.as_bytes(), + } + } + pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + let utf8 = encoding.decode_without_bom_handling(bytes).0; + match encoding.encode(&utf8).0 { + Cow::Borrowed(_) => { + // Recoding into UTF-8 and then back did not change anything. + Cow::from(*bytes) + } + Cow::Owned(owned) => Cow::Owned(owned), + } + } + EncodedStr::Utf8 { s } => encoding.encode(s).0, + } + } + pub fn is_empty(&self) -> bool { + match self { + EncodedStr::Encoded { bytes, .. 
} => bytes.is_empty(), + EncodedStr::Utf8 { s } => s.is_empty(), + } + } +} + +impl<'a> From<&'a str> for EncodedStr<'a> { + fn from(s: &'a str) -> Self { + Self::Utf8 { s } + } +} #[derive(Clone, Debug)] pub struct ParseError { @@ -130,22 +182,24 @@ pub struct ParseValue<'a> { settings: &'a Settings, endian: EndianSettings, implied_decimals: bool, + output_encoding: &'static Encoding, } impl Format { - pub fn parser(&self) -> ParseValue<'static> { - ParseValue::new(*self) + pub fn parser(&self, output_encoding: &'static Encoding) -> ParseValue<'static> { + ParseValue::new(*self, output_encoding) } } impl ParseValue<'static> { - pub fn new(format: Format) -> Self { + pub fn new(format: Format, output_encoding: &'static Encoding) -> Self { let settings = PsppSettings::global(); Self { format, settings: &settings.formats, endian: settings.endian, implied_decimals: false, + output_encoding, } } } @@ -164,9 +218,13 @@ impl<'a> ParseValue<'a> { } } - /// Parses `s` as this format. For string formats, `encoding` specifies the - /// output encoding. - pub fn parse(&self, s: &str, _encoding: &'static Encoding) -> Result { + /// Parses `s`. + /// + /// This is only appropriate if `s` was originally encoded in UTF-8. + /// Otherwise, binary formats will not yield sensible parse results, because + /// recoding bytes from (e.g.) windows-1252 into UTF-8, and then + /// interpreting them as a binary number yields nonsense. 
+ pub fn parse(&self, s: &str) -> Result { if s.is_empty() { return Ok(self.format.default_value()); } @@ -194,9 +252,10 @@ impl<'a> ParseValue<'a> { | Type::DTime => self.parse_date(s), Type::WkDay => self.parse_wkday(s), Type::Month => self.parse_month(s), - Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => todo!(), - Type::A => todo!(), - Type::AHex => todo!(), + Type::P | Type::PK | Type::IB | Type::PIB | Type::RB | Type::AHex => { + todo!() + } + Type::A => Ok(Value::String(self.output_encoding.encode(s).0.into())), } .map_err(|details| ParseError { format: self.format, @@ -205,34 +264,24 @@ impl<'a> ParseValue<'a> { }) } - /// Parses `s`, which is encoded in `encoding`. For string formats, - /// `encoding` is also the output encoding. - pub fn parse_encoded( - &self, - input: &[u8], - _encoding: &'static Encoding, - ) -> Result { + pub fn parse_all<'b, T>(&self, input: T) -> Result + where + T: Into>, + { + let input: EncodedStr = input.into(); if input.is_empty() { return Ok(self.format.default_value()); } match self.format.type_ { - Type::P => self.parse_p(input), - Type::PK => self.parse_pk(input), - Type::IB => self.parse_ib(input), - Type::PIB => self.parse_pib(input), - Type::RB => self.parse_rb(input), - Type::F - | Type::Comma - | Type::Dot - | Type::Dollar - | Type::Pct - | Type::E - | Type::CC(_) - | Type::N - | Type::Z - | Type::PIBHex - | Type::RBHex - | Type::Date + Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => { + self.parse_number(&input.as_str(), self.format.type_) + } + Type::CC(_) => self.parse_number(&input.as_str(), Type::F), + Type::N => self.parse_n(&input.as_str()), + Type::Z => self.parse_z(&input.as_str()), + Type::PIBHex => self.parse_pibhex(&input.as_str()), + Type::RBHex => self.parse_rbhex(&input.as_str()), + Type::Date | Type::ADate | Type::EDate | Type::JDate @@ -244,16 +293,23 @@ impl<'a> ParseValue<'a> { | Type::YmdHms | Type::MTime | Type::Time - | Type::DTime - | Type::WkDay - | Type::Month 
=> todo!(), - Type::A => todo!(), + | Type::DTime => self.parse_date(&input.as_str()), + Type::WkDay => self.parse_wkday(&input.as_str()), + Type::Month => self.parse_month(&input.as_str()), + Type::P => self.parse_p(input.as_bytes()), + Type::PK => self.parse_pk(input.as_bytes()), + Type::IB => self.parse_ib(input.as_bytes()), + Type::PIB => self.parse_pib(input.as_bytes()), + Type::RB => self.parse_rb(input.as_bytes()), + Type::A => Ok(Value::String( + input.to_encoding(self.output_encoding).into(), + )), Type::AHex => todo!(), } - .map_err(|kind| ParseError { + .map_err(|details| ParseError { format: self.format, - input: String::new(), - kind, + input: todo!(), + kind: details, }) } @@ -925,7 +981,7 @@ mod test { dictionary::Value, endian::Endian, format::{ - parse::{ParseError, ParseErrorKind, Sign}, + parse::{EncodedStr, ParseError, ParseErrorKind, Sign}, Epoch, Format, Settings as FormatSettings, Type, }, settings::EndianSettings, @@ -942,7 +998,7 @@ mod test { .zip(expected_stream.lines().map(|result| result.unwrap())) .zip(1..) 
{ - let result = format.parser().parse(&input, UTF_8); + let result = format.parser(UTF_8).parse(&input); let error = result.clone().err(); let value = result .unwrap_or(format.default_value()) @@ -1228,9 +1284,9 @@ mod test { let settings = FormatSettings::default().with_epoch(Epoch(1930)); let parsed = Format::new(self.type_, 40, 0) .unwrap() - .parser() + .parser(UTF_8) .with_settings(&settings) - .parse(&formatted, UTF_8) + .parse(&formatted) .unwrap(); assert_eq!(parsed, Value::Number(Some(expected as f64))); } @@ -1424,8 +1480,8 @@ mod test { let parsed = Format::new(self.type_, 40, 0) .unwrap() - .parser() - .parse(&formatted, UTF_8) + .parser(UTF_8) + .parse(&formatted) .unwrap() .as_number() .unwrap() @@ -1503,8 +1559,8 @@ mod test { loop { let parsed = Format::new(Type::WkDay, 40, 0) .unwrap() - .parser() - .parse(input, UTF_8) + .parser(UTF_8) + .parse(input) .unwrap_or(Value::Number(None)) .as_number() .unwrap(); @@ -1577,8 +1633,8 @@ mod test { let input = &input[..length]; let parsed = Format::new(Type::Month, 40, 0) .unwrap() - .parser() - .parse(input, UTF_8) + .parser(UTF_8) + .parse(input) .unwrap_or(Value::Number(None)) .as_number() .unwrap(); @@ -1595,12 +1651,12 @@ mod test { .chain((0xa..=0xf).zip('A'..='F')) .chain(std::iter::once((0, 'x'))) } - let parser = Format::new(Type::PIBHex, 2, 0).unwrap().parser(); + let parser = Format::new(Type::PIBHex, 2, 0).unwrap().parser(UTF_8); for (a, ac) in hex_digits() { for (b, bc) in hex_digits() { let s = [ac, bc].into_iter().collect::(); let parsed = parser - .parse(&s, UTF_8) + .parse(&s) .unwrap_or(Value::Number(None)) .as_number() .unwrap(); @@ -1612,8 +1668,8 @@ mod test { assert_eq!(parsed, expected); } } - assert_eq!(parser.parse(".", UTF_8).unwrap(), Value::Number(None)); - assert_eq!(parser.parse("", UTF_8).unwrap(), Value::Number(None)); + assert_eq!(parser.parse(".").unwrap(), Value::Number(None)); + assert_eq!(parser.parse("",).unwrap(), Value::Number(None)); } #[test] @@ -1623,8 +1679,8 @@ 
mod test { let formatted = format!("{:016x}", number.to_bits()); let parsed = Format::new(Type::RBHex, 16, 0) .unwrap() - .parser() - .parse(&formatted, UTF_8) + .parser(UTF_8) + .parse(&formatted) .unwrap() .as_number() .unwrap() @@ -1640,9 +1696,9 @@ mod test { let raw = number.to_be_bytes(); let parsed = Format::new(Type::RB, 8, 0) .unwrap() - .parser() + .parser(UTF_8) .with_endian(EndianSettings::new(Endian::Big)) - .parse_encoded(&raw[..], UTF_8) + .parse_all(EncodedStr::new(&raw[..], UTF_8)) .unwrap() .as_number() .unwrap() @@ -1653,11 +1709,11 @@ mod test { #[test] fn n() { - let parser = Format::new(Type::N, 2, 0).unwrap().parser(); + let parser = Format::new(Type::N, 2, 0).unwrap().parser(UTF_8); for number in 0..=99 { let formatted = format!("{:02}", number); let parsed = parser - .parse(&formatted, UTF_8) + .parse(&formatted) .unwrap() .as_number() .unwrap() @@ -1665,14 +1721,14 @@ mod test { assert_eq!(parsed, number as f64, "formatted as {formatted:?}"); } assert!(matches!( - parser.parse(" 0", UTF_8), + parser.parse(" 0"), Err(ParseError { kind: ParseErrorKind::Nondigit(' '), .. }) )); assert!(matches!( - parser.parse(".", UTF_8), + parser.parse("."), Err(ParseError { kind: ParseErrorKind::Nondigit('.'), .. 
@@ -1682,7 +1738,7 @@ mod test { #[test] fn z() { - let parser = Format::new(Type::Z, 2, 0).unwrap().parser(); + let parser = Format::new(Type::Z, 2, 0).unwrap().parser(UTF_8); for number in -99i32..=99 { for mut formatted in [ format!("{:02}", number.abs()), @@ -1696,7 +1752,7 @@ mod test { formatted.push(b"}JKLMNOPQR"[digit] as char); } let parsed = parser - .parse(&formatted, UTF_8) + .parse(&formatted) .unwrap() .as_number() .unwrap() @@ -1704,11 +1760,11 @@ mod test { assert_eq!(parsed, number as f64, "formatted as {formatted:?}"); } } - assert_eq!(parser.parse(".", UTF_8).unwrap(), Value::Number(None)); + assert_eq!(parser.parse(".").unwrap(), Value::Number(None)); let parser = Format::new(Type::Z, 4, 1) .unwrap() - .parser() + .parser(UTF_8) .with_implied_decimals(); for number in -999i32..=999 { let tenths = number as f64 / 10.0; @@ -1721,7 +1777,7 @@ mod test { formatted.push(b"}JKLMNOPQR"[digit] as char); } let parsed = parser - .parse(&formatted, UTF_8) + .parse(&formatted) .unwrap() .as_number() .unwrap() diff --git a/rust/pspp/src/raw.rs b/rust/pspp/src/raw.rs index b9f25230ad..5144b6286e 100644 --- a/rust/pspp/src/raw.rs +++ b/rust/pspp/src/raw.rs @@ -1379,6 +1379,12 @@ impl RawString { } } +impl From> for RawString { + fn from(value: Cow<'_, [u8]>) -> Self { + Self(value.into_owned()) + } +} + impl From> for RawString { fn from(source: Vec) -> Self { Self(source) -- 2.30.2