From 8b905f2fa93a89fe12f4a1446a300e701d6169b6 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 14 Apr 2025 16:15:33 -0700 Subject: [PATCH] parser work --- rust/pspp/src/format/display.rs | 7 +- rust/pspp/src/format/mod.rs | 7 ++ rust/pspp/src/format/parse.rs | 124 ++++++++++++-------------------- rust/pspp/src/raw.rs | 2 +- 4 files changed, 55 insertions(+), 85 deletions(-) diff --git a/rust/pspp/src/format/display.rs b/rust/pspp/src/format/display.rs index 698ddf3ab2..7b430821ad 100644 --- a/rust/pspp/src/format/display.rs +++ b/rust/pspp/src/format/display.rs @@ -1351,10 +1351,7 @@ mod test { fn test_dates(format: Format, expect: &[&str]) { let settings = Settings::default().with_epoch(Epoch(1930)); - let parser = Format::new(Type::DateTime, 40, 0) - .unwrap() - .parser(UTF_8) - .with_settings(&settings); + let parser = Type::DateTime.parser(UTF_8).with_settings(&settings); static INPUTS: &[&str; 20] = &[ "10-6-1648 0:0:0", "30-6-1680 4:50:38.12301", @@ -2328,7 +2325,7 @@ mod test { let output_filename = directory.join(name); let output = BufReader::new(File::open(&output_filename).unwrap()); - let parser = Format::new(Type::DTime, 40, 0).unwrap().parser(UTF_8); + let parser = Type::DTime.parser(UTF_8); for ((input, expect), line_number) in input .lines() .map(|r| r.unwrap()) diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index 8f520266a6..3078fffb43 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -373,6 +373,13 @@ impl Type { Self::AHex => "AHEX", } } + + pub fn default_value(&self) -> Value { + match self.var_type() { + VarType::Numeric => Value::sysmis(), + VarType::String => Value::String(RawString::default()) + } + } } impl Display for Type { diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index acf0357f3f..222d61de50 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -2,7 +2,7 @@ use crate::{ calendar::{calendar_gregorian_to_offset, DateError}, 
dictionary::Value, endian::{Endian, Parse}, - format::{DateTemplate, Format, Settings, TemplateItem, Type}, + format::{DateTemplate, Decimals, Settings, TemplateItem, Type}, raw::{EncodedStr, EncodedString}, settings::{EndianSettings, Settings as PsppSettings}, }; @@ -16,7 +16,7 @@ use thiserror::Error as ThisError; #[derive(Clone, Debug)] pub struct ParseError { - format: Format, + type_: Type, input: EncodedString, kind: ParseErrorKind, } @@ -29,7 +29,7 @@ impl Display for ParseError { f, "{} cannot be parsed as {}: {}", self.input.borrowed().quoted(), - &self.format, + &self.type_, &self.kind ) } @@ -37,8 +37,8 @@ impl Display for ParseError { #[derive(ThisError, Clone, Debug, PartialEq, Eq)] enum ParseErrorKind { - /// Field contents are not numeric. - #[error("Field contents are not numeric.")] + /// Input is not numeric. + #[error("Input is not numeric.")] NotNumeric, /// Invalid numeric systax. @@ -127,27 +127,27 @@ enum ParseErrorKind { } pub struct ParseValue<'a> { - format: Format, + type_: Type, settings: &'a Settings, endian: EndianSettings, - implied_decimals: bool, + implied_decimals: Option<Decimals>, output_encoding: &'static Encoding, } -impl Format { +impl Type { pub fn parser(&self, output_encoding: &'static Encoding) -> ParseValue<'static> { ParseValue::new(*self, output_encoding) } } impl ParseValue<'static> { - pub fn new(format: Format, output_encoding: &'static Encoding) -> Self { + pub fn new(type_: Type, output_encoding: &'static Encoding) -> Self { let settings = PsppSettings::global(); Self { - format, + type_, settings: &settings.formats, endian: settings.endian, - implied_decimals: false, + implied_decimals: None, output_encoding, } } @@ -160,9 +160,9 @@ impl<'a> ParseValue<'a> { pub fn with_endian(self, endian: EndianSettings) -> Self { Self { endian, ..self } } - pub fn with_implied_decimals(self) -> Self { + pub fn with_implied_decimals(self, d: Decimals) -> Self { Self { - implied_decimals: true, + implied_decimals: if d > 0 { Some(d) } else
{ None }, ..self } } @@ -181,11 +181,11 @@ impl<'a> ParseValue<'a> { { let input: EncodedStr = input.into(); if input.is_empty() { - return Ok(self.format.default_value()); + return Ok(self.type_.default_value()); } - match self.format.type_ { + match self.type_ { Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => { - self.parse_number(&input.as_str(), self.format.type_) + self.parse_number(&input.as_str(), self.type_) } Type::CC(_) => self.parse_number(&input.as_str(), Type::F), Type::N => self.parse_n(&input.as_str()), @@ -218,7 +218,7 @@ impl<'a> ParseValue<'a> { Type::AHex => self.parse_ahex(&input.as_str()), } .map_err(|kind| ParseError { - format: self.format, + type_: self.type_, input: input.into(), kind, }) @@ -351,8 +351,9 @@ impl<'a> ParseValue<'a> { _ => return Err(ParseErrorKind::InvalidZ), } } - if self.implied_decimals && !dot && self.format.d() != 0 { - write!(&mut number, "e-{}", self.format.d()).unwrap(); + match self.implied_decimals { + Some(d) if !dot && d > 0 => write!(&mut number, "e-{d}").unwrap(), + _ => (), } let number = number.parse::<f64>().unwrap(); let number = if sign == Some(Sign::Negative) { @@ -374,10 +375,9 @@ impl<'a> ParseValue<'a> { } fn apply_decimals(&self, number: f64) -> f64 { - if self.implied_decimals && self.format.d() > 0 { - number / 10.0f64.powi(self.format.d() as i32) - } else { - number + match self.implied_decimals { + Some(d) if d > 0 => number / 10.0f64.powi(d as i32), + _ => number, } } @@ -439,8 +439,7 @@ impl<'a> ParseValue<'a> { } fn parse_ahex(&self, input: &str) -> Result<Value, ParseErrorKind> { - let n = self.format.w() / 2; - let mut result = Vec::with_capacity(n); + let mut result = Vec::with_capacity(input.len() / 2); let mut iter = input.chars(); while let Some(hi) = iter.next() { let Some(lo) = iter.next() else { @@ -454,7 +453,6 @@ impl<'a> ParseValue<'a> { }; result.push((hi * 16 + lo) as u8); } - result.resize(n, 0); Ok(Value::String(result.into())) } @@ -493,7 +491,7 @@ impl<'a> ParseValue<'a> { let
mut time_sign = None; let mut time = 0.0; - let mut iter = DateTemplate::new(self.format.type_, 0).unwrap(); + let mut iter = DateTemplate::new(self.type_, 0).unwrap(); let template_width = iter.len(); while let Some(TemplateItem { c, n }) = iter.next() { match c { @@ -531,7 +529,7 @@ impl<'a> ParseValue<'a> { time += parse_time(&mut p)? * 60.0 * 60.0; } 'M' => { - if self.format.type_ == Type::MTime { + if self.type_ == Type::MTime { time_sign = Some(parse_sign(&mut p, time_sign)); } time += self.parse_minute_second(&mut p)?; @@ -539,7 +537,7 @@ impl<'a> ParseValue<'a> { '-' | '/' | '.' => parse_date_delimiter(&mut p)?, ':' => parse_time_delimiter(&mut p)?, ' ' => { - if self.format.type_ != Type::MoYr { + if self.type_ != Type::MoYr { p.strip_ws(); } else { parse_date_delimiter(&mut p)? @@ -573,7 +571,7 @@ impl<'a> ParseValue<'a> { fn parse_minute_second(&self, p: &mut StrParser<'_>) -> Result<f64, ParseErrorKind> { let minute = parse_int::(p)?; - if self.format.type_ != Type::MTime && !(0..=59).contains(&minute) { + if self.type_ != Type::MTime && !(0..=59).contains(&minute) { return Err(ParseErrorKind::InvalidMinute(minute)); } let time = minute as f64 * 60.0; @@ -921,22 +919,21 @@ mod test { let base = Path::new(env!("CARGO_MANIFEST_DIR")).join("src/format/testdata/parse"); let input_stream = BufReader::new(File::open(base.join("num-in.txt")).unwrap()); let expected_stream = BufReader::new(File::open(base.join(name)).unwrap()); - let format = Format::new(type_, 40, 1).unwrap(); for ((input, expected), line_number) in input_stream .lines() .map(|result| result.unwrap()) .zip(expected_stream.lines().map(|result| result.unwrap())) .zip(1..)
{ - let result = format.parser(UTF_8).parse(&input); + let result = type_.parser(UTF_8).parse(&input); let error = result.clone().err(); let value = result - .unwrap_or(format.default_value()) + .unwrap_or(type_.default_value()) .display(Format::new(Type::F, 10, 4).unwrap(), UTF_8) .to_string(); if value != expected { panic!( - "parsing {input:?} as {format} failed ({name}:{line_number}):\n got: {value:?}\nexpected: {expected:?}\ndecode error: {error:?}", + "parsing {input:?} as {type_} failed ({name}:{line_number}):\n got: {value:?}\nexpected: {expected:?}\ndecode error: {error:?}", ); } } @@ -1212,8 +1209,8 @@ mod test { expected + time as i64 }; let settings = FormatSettings::default().with_epoch(Epoch(1930)); - let parsed = Format::new(self.type_, 40, 0) - .unwrap() + let parsed = self + .type_ .parser(UTF_8) .with_settings(&settings) .parse(&formatted) @@ -1408,8 +1405,8 @@ mod test { Sign::Negative => -expected, }; - let parsed = Format::new(self.type_, 40, 0) - .unwrap() + let parsed = self + .type_ .parser(UTF_8) .parse(&formatted) .unwrap() @@ -1487,8 +1484,7 @@ mod test { ("sturday", None), ] { loop { - let parsed = Format::new(Type::WkDay, 40, 0) - .unwrap() + let parsed = Type::WkDay .parser(UTF_8) .parse(input) .unwrap_or(Value::Number(None)) @@ -1561,8 +1557,7 @@ mod test { for length in lengths { let input = &input[..length]; - let parsed = Format::new(Type::Month, 40, 0) - .unwrap() + let parsed = Type::Month .parser(UTF_8) .parse(input) .unwrap_or(Value::Number(None)) @@ -1581,7 +1576,7 @@ mod test { .chain((0xa..=0xf).zip('A'..='F')) .chain(std::iter::once((0, 'x'))) } - let parser = Format::new(Type::PIBHex, 2, 0).unwrap().parser(UTF_8); + let parser = Type::PIBHex.parser(UTF_8); for (a, ac) in hex_digits() { for (b, bc) in hex_digits() { let s = [ac, bc].into_iter().collect::<String>(); @@ -1607,8 +1602,7 @@ mod test { for _ in 0..10000 { let number = random::<f64>(); let formatted = format!("{:016x}", number.to_bits()); - let parsed =
Format::new(Type::RBHex, 16, 0) - .unwrap() + let parsed = Type::RBHex .parser(UTF_8) .parse(&formatted) .unwrap() @@ -1624,8 +1618,7 @@ mod test { for _ in 0..10000 { let number = random::<f64>(); let raw = number.to_be_bytes(); - let parsed = Format::new(Type::RB, 8, 0) - .unwrap() + let parsed = Type::RB .parser(UTF_8) .with_endian(EndianSettings::new(Endian::Big)) .parse(EncodedStr::new(&raw[..], UTF_8)) @@ -1639,7 +1632,7 @@ mod test { #[test] fn n() { - let parser = Format::new(Type::N, 2, 0).unwrap().parser(UTF_8); + let parser = Type::N.parser(UTF_8); for number in 0..=99 { let formatted = format!("{:02}", number); let parsed = parser @@ -1668,7 +1661,7 @@ mod test { #[test] fn z() { - let parser = Format::new(Type::Z, 2, 0).unwrap().parser(UTF_8); + let parser = Type::Z.parser(UTF_8); for number in -99i32..=99 { for mut formatted in [ format!("{:02}", number.abs()), @@ -1692,10 +1685,7 @@ mod test { } assert_eq!(parser.parse(".").unwrap(), Value::Number(None)); - let parser = Format::new(Type::Z, 4, 1) - .unwrap() - .parser(UTF_8) - .with_implied_decimals(); + let parser = Type::Z.parser(UTF_8).with_implied_decimals(1); for number in -999i32..=999 { let tenths = number as f64 / 10.0; for mut formatted in [format!("{}", number.abs()), format!("{:.1}", tenths.abs())] { @@ -1719,9 +1709,9 @@ mod test { #[test] fn ahex() { - let parser = Format::new(Type::AHex, 16, 0).unwrap().parser(UTF_8); + let parser = Type::AHex.parser(UTF_8); - // Parse correct number of hex digits. + // Correct. assert_eq!( parser .parse("6162636465666768") @@ -1733,30 +1723,6 @@ mod test { "abcdefgh" ); - // Parse too few hex digits. - assert_eq!( - parser - .parse("61626364656667") - .unwrap() - .as_string() - .unwrap() - .as_encoded(UTF_8) - .as_str(), - "abcdefg\u{0}" - ); - - // Parse too many hex digits. - assert_eq!( - parser - .parse("616263646566676869") - .unwrap() - .as_string() - .unwrap() - .as_encoded(UTF_8) - .as_str(), - "abcdefgh" - ); - // Non-hex digit.
assert_eq!( parser.parse("61626364656667xyzzy").unwrap_err().kind, diff --git a/rust/pspp/src/raw.rs b/rust/pspp/src/raw.rs index f87bb4d43b..1792a7f886 100644 --- a/rust/pspp/src/raw.rs +++ b/rust/pspp/src/raw.rs @@ -1370,7 +1370,7 @@ impl Debug for UntypedValue { } } -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)] pub struct RawString(pub Vec<u8>); impl RawString { -- 2.30.2