From c498daea7885807eb537de283bdfb1a801ac4ab5 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 13 Apr 2025 14:01:13 -0700 Subject: [PATCH] AHEX parsing. --- rust/pspp/src/calendar.rs | 2 +- rust/pspp/src/format/parse.rs | 192 ++++++++++++++++------------------ rust/pspp/src/raw.rs | 112 ++++++++++++++++++++ 3 files changed, 205 insertions(+), 101 deletions(-) diff --git a/rust/pspp/src/calendar.rs b/rust/pspp/src/calendar.rs index 239d084776..fa09c0ed0f 100644 --- a/rust/pspp/src/calendar.rs +++ b/rust/pspp/src/calendar.rs @@ -64,7 +64,7 @@ pub fn month_name(month: u32) -> Option<&'static str> { Some(name) } -#[derive(Copy, Clone, Debug, ThisError)] +#[derive(Copy, Clone, Debug, ThisError, PartialEq, Eq)] pub enum DateError { /// Date is too early. #[error("Date {y:04}-{m:02}-{d:02} is before the earliest supported date 1582-10-15.")] diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 1ad20e7c6a..e7f26c3a02 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -3,103 +3,17 @@ use crate::{ dictionary::Value, endian::{Endian, Parse}, format::{DateTemplate, Format, Settings, TemplateItem, Type}, + raw::{EncodedStr, EncodedString}, settings::{EndianSettings, Settings as PsppSettings}, }; use encoding_rs::Encoding; use smallstr::SmallString; use std::{ - borrow::Cow, fmt::{Display, Write}, str::FromStr, }; use thiserror::Error as ThisError; -#[derive(Clone, Debug)] -pub enum EncodedString { - Encoded { - bytes: Vec, - encoding: &'static Encoding, - }, - Utf8 { - s: String, - }, -} - -impl<'a> From> for EncodedString { - fn from(value: EncodedStr<'a>) -> Self { - match value { - EncodedStr::Encoded { bytes, encoding } => Self::Encoded { - bytes: bytes.into(), - encoding, - }, - EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, - } - } -} - -pub enum EncodedStr<'a> { - Encoded { - bytes: &'a [u8], - encoding: &'static Encoding, - }, - Utf8 { - s: &'a str, - }, -} - -impl<'a> EncodedStr<'a> { - pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { - Self::Encoded { bytes, encoding } - } - pub fn as_str(&self) -> Cow<'_, str> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - encoding.decode_without_bom_handling(&bytes).0 - } - EncodedStr::Utf8 { s } => Cow::from(*s), - } - } - pub fn as_bytes(&self) -> &[u8] { - match self { - EncodedStr::Encoded { bytes, .. } => bytes, - EncodedStr::Utf8 { s } => s.as_bytes(), - } - } - pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { - match self { - EncodedStr::Encoded { bytes, encoding } => { - let utf8 = encoding.decode_without_bom_handling(bytes).0; - match encoding.encode(&utf8).0 { - Cow::Borrowed(_) => { - // Recoding into UTF-8 and then back did not change anything. - Cow::from(*bytes) - } - Cow::Owned(owned) => Cow::Owned(owned), - } - } - EncodedStr::Utf8 { s } => encoding.encode(s).0, - } - } - pub fn is_empty(&self) -> bool { - match self { - EncodedStr::Encoded { bytes, .. } => bytes.is_empty(), - EncodedStr::Utf8 { s } => s.is_empty(), - } - } -} - -impl<'a> From<&'a str> for EncodedStr<'a> { - fn from(s: &'a str) -> Self { - Self::Utf8 { s } - } -} - -impl<'a> From<&'a String> for EncodedStr<'a> { - fn from(s: &'a String) -> Self { - Self::Utf8 { s: s.as_str() } - } -} - #[derive(Clone, Debug)] pub struct ParseError { format: Format, @@ -113,13 +27,15 @@ impl Display for ParseError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "{:?} cannot be parsed as {}: {}", - &self.input, &self.format, &self.kind + "{} cannot be parsed as {}: {}", + self.input.borrowed().quoted(), + &self.format, + &self.kind ) } } -#[derive(ThisError, Clone, Debug)] +#[derive(ThisError, Clone, Debug, PartialEq, Eq)] enum ParseErrorKind { /// Field contents are not numeric. #[error("Field contents are not numeric.")] @@ -137,10 +53,18 @@ enum ParseErrorKind { #[error("Field contains unexpected non-hex digit {0:?}.")] NonHexDigit(char), + /// Field contains odd number of hex digits. + #[error("Field contains {0:?} hex digits but only an even number is allowed.")] + OddLength(usize), + /// Field contains invalid BCD digit. #[error("Field contains invalid BCD digit ({0:?}).")] NonBDCDigit(u8), + /// Invalid BCD sign. + #[error("Invalid BCD sign. 0x{0:x}.")] + InvalidBCDSign(u8), + /// Day must be between 1 and 31. #[error("Day ({0}) must be between 1 and 31.")] InvalidDay(i32), @@ -200,10 +124,6 @@ enum ParseErrorKind { /// Invalid zoned decimal (Z) syntax. #[error("Invalid zoned decimal (Z) syntax.")] InvalidZ, - - /// Invalid BCD sign. - #[error("Invalid BCD sign. 0x{0:x}.")] - InvalidBCDSign(u8), } pub struct ParseValue<'a> { @@ -295,7 +215,7 @@ impl<'a> ParseValue<'a> { Type::A => Ok(Value::String( input.to_encoding(self.output_encoding).into(), )), - Type::AHex => todo!(), + Type::AHex => self.parse_ahex(&input.as_str()), } .map_err(|kind| ParseError { format: self.format, @@ -518,15 +438,33 @@ impl<'a> ParseValue<'a> { Ok(Value::Number(number)) } + fn parse_ahex(&self, input: &str) -> Result { + let n = self.format.w() / 2; + let mut result = Vec::with_capacity(n); + let mut iter = input.chars(); + while let Some(hi) = iter.next() { + let Some(lo) = iter.next() else { + return Err(ParseErrorKind::OddLength(input.len())); + }; + let Some(hi) = hi.to_digit(16) else { + return Err(ParseErrorKind::NonHexDigit(hi)); + }; + let Some(lo) = lo.to_digit(16) else { + return Err(ParseErrorKind::NonHexDigit(lo)); + }; + result.push((hi * 16 + lo) as u8); + } + result.resize(n, 0); + Ok(Value::String(result.into())) + } + fn parse_hex(&self, input: &str) -> Result, ParseErrorKind> { let input = input.trim(); if input.is_empty() || input == "." { - return Ok(None); - } - if let Ok(value) = u64::from_str_radix(input, 16) { + Ok(None) + } else if let Ok(value) = u64::from_str_radix(input, 16) { Ok(Some(value)) } else { - println!("{input:?} {:?}", u64::from_str_radix(input, 16)); let c = input.chars().find(|c| !c.is_ascii_hexdigit()).unwrap(); Err(ParseErrorKind::NonHexDigit(c)) } @@ -972,9 +910,10 @@ mod test { dictionary::Value, endian::Endian, format::{ - parse::{EncodedStr, ParseError, ParseErrorKind, Sign}, + parse::{ParseError, ParseErrorKind, Sign}, Epoch, Format, Settings as FormatSettings, Type, }, + raw::EncodedStr, settings::EndianSettings, }; @@ -1777,4 +1716,57 @@ mod test { } } } + + #[test] + fn ahex() { + let parser = Format::new(Type::AHex, 16, 0).unwrap().parser(UTF_8); + + // Parse correct number of hex digits. + assert_eq!( + parser + .parse("6162636465666768") + .unwrap() + .as_string() + .unwrap() + .as_encoded(UTF_8) + .as_str(), + "abcdefgh" + ); + + // Parse too few hex digits. + assert_eq!( + parser + .parse("61626364656667") + .unwrap() + .as_string() + .unwrap() + .as_encoded(UTF_8) + .as_str(), + "abcdefg\u{0}" + ); + + // Parse too many hex digits. + assert_eq!( + parser + .parse("616263646566676869") + .unwrap() + .as_string() + .unwrap() + .as_encoded(UTF_8) + .as_str(), + "abcdefgh" + ); + + // Non-hex digit. + assert_eq!( + parser.parse("61626364656667xyzzy").unwrap_err().kind, + ParseErrorKind::NonHexDigit('x') + ); + + // Odd number of hex digits. + assert_eq!( + parser.parse("616263646566676").unwrap_err().kind, + ParseErrorKind::OddLength(15) + ); + } } diff --git a/rust/pspp/src/raw.rs b/rust/pspp/src/raw.rs index 62ba39f0e0..f87bb4d43b 100644 --- a/rust/pspp/src/raw.rs +++ b/rust/pspp/src/raw.rs @@ -1377,6 +1377,9 @@ impl RawString { pub fn spaces(n: usize) -> Self { Self(std::iter::repeat_n(b' ', n).collect()) } + pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> { + EncodedStr::new(&self.0, encoding) + } } impl From> for RawString { @@ -1418,6 +1421,115 @@ impl Debug for RawStrArray { } } +#[derive(Clone, Debug)] +pub enum EncodedString { + Encoded { + bytes: Vec, + encoding: &'static Encoding, + }, + Utf8 { + s: String, + }, +} + +impl EncodedString { + pub fn borrowed(&self) -> EncodedStr<'_> { + match self { + EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { + bytes: &bytes, + encoding, + }, + EncodedString::Utf8 { s } => EncodedStr::Utf8 { s: &s }, + } + } +} + +impl<'a> From> for EncodedString { + fn from(value: EncodedStr<'a>) -> Self { + match value { + EncodedStr::Encoded { bytes, encoding } => Self::Encoded { + bytes: bytes.into(), + encoding, + }, + EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() }, + } + } +} + +pub enum EncodedStr<'a> { + Encoded { + bytes: &'a [u8], + encoding: &'static Encoding, + }, + Utf8 { + s: &'a str, + }, +} + +impl<'a> EncodedStr<'a> { + pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self { + Self::Encoded { bytes, encoding } + } + pub fn as_str(&self) -> Cow<'_, str> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + encoding.decode_without_bom_handling(&bytes).0 + } + EncodedStr::Utf8 { s } => Cow::from(*s), + } + } + pub fn as_bytes(&self) -> &[u8] { + match self { + EncodedStr::Encoded { bytes, .. } => bytes, + EncodedStr::Utf8 { s } => s.as_bytes(), + } + } + pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> { + match self { + EncodedStr::Encoded { bytes, encoding } => { + let utf8 = encoding.decode_without_bom_handling(bytes).0; + match encoding.encode(&utf8).0 { + Cow::Borrowed(_) => { + // Recoding into UTF-8 and then back did not change anything. + Cow::from(*bytes) + } + Cow::Owned(owned) => Cow::Owned(owned), + } + } + EncodedStr::Utf8 { s } => encoding.encode(s).0, + } + } + pub fn is_empty(&self) -> bool { + match self { + EncodedStr::Encoded { bytes, .. } => bytes.is_empty(), + EncodedStr::Utf8 { s } => s.is_empty(), + } + } + pub fn quoted(&self) -> QuotedEncodedStr { + QuotedEncodedStr(self) + } +} + +impl<'a> From<&'a str> for EncodedStr<'a> { + fn from(s: &'a str) -> Self { + Self::Utf8 { s } + } +} + +impl<'a> From<&'a String> for EncodedStr<'a> { + fn from(s: &'a String) -> Self { + Self::Utf8 { s: s.as_str() } + } +} + +pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>); + +impl<'a> Display for QuotedEncodedStr<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.0.as_str()) + } +} + #[derive(Clone, Debug)] pub struct ValueLabel where -- 2.30.2