From d2a1c269fbefcaa35a275211e373768f8fdf9b11 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 29 Mar 2025 19:43:08 -0700 Subject: [PATCH] work on format parsing --- rust/pspp/src/format/display.rs | 43 ++---- rust/pspp/src/format/mod.rs | 141 ++++++++++------- rust/pspp/src/format/parse.rs | 259 ++++++++++++++++++++++++++++++-- 3 files changed, 352 insertions(+), 91 deletions(-) diff --git a/rust/pspp/src/format/display.rs b/rust/pspp/src/format/display.rs index 3784406285..f73b735295 100644 --- a/rust/pspp/src/format/display.rs +++ b/rust/pspp/src/format/display.rs @@ -15,7 +15,7 @@ use crate::{ calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name}, dictionary::Value, endian::ToBytes, - format::{Category, Decimal, Format, NumberStyle, Settings, Type}, + format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, }; @@ -443,24 +443,13 @@ impl<'a, 'b> DisplayValue<'a, 'b> { }; let mut output = SmallString::<[u8; 40]>::new(); - let mut template = self - .format - .type_ - .date_template(self.format.w()) - .unwrap() - .bytes() - .peekable(); - while let Some(c) = template.next() { - let mut count = 1; - while template.next_if_eq(&c).is_some() { - count += 1; - } + for TemplateItem { c, n } in DateTemplate::for_format(self.format).unwrap() { match c { - b'd' if count < 3 => write!(&mut output, "{:02}", date.day()).unwrap(), - b'd' => write!(&mut output, "{:03}", day_of_year(date).unwrap_or(1)).unwrap(), - b'm' if count < 3 => write!(&mut output, "{:02}", date.month()).unwrap(), - b'm' => write!(&mut output, "{}", short_month_name(date.month()).unwrap()).unwrap(), - b'y' if count >= 4 => { + 'd' if n < 3 => write!(&mut output, "{:02}", date.day()).unwrap(), + 'd' => write!(&mut output, "{:03}", day_of_year(date).unwrap_or(1)).unwrap(), + 'm' if n < 3 => write!(&mut output, "{:02}", date.month()).unwrap(), + 'm' => write!(&mut output, "{}", 
short_month_name(date.month()).unwrap()).unwrap(), + 'y' if n >= 4 => { let year = date.year(); if year <= 9999 { write!(&mut output, "{year:04}").unwrap(); @@ -472,7 +461,7 @@ impl<'a, 'b> DisplayValue<'a, 'b> { return self.overflow(f); } } - b'y' => { + 'y' => { let epoch = self.settings.epoch.0; let offset = date.year() - epoch; if offset < 0 || offset > 99 { @@ -480,30 +469,30 @@ impl<'a, 'b> DisplayValue<'a, 'b> { } write!(&mut output, "{offset:02}").unwrap(); } - b'q' => write!(&mut output, "{}", date.month0() / 3 + 1).unwrap(), - b'w' => write!( + 'q' => write!(&mut output, "{}", date.month0() / 3 + 1).unwrap(), + 'w' => write!( &mut output, "{:2}", (day_of_year(date).unwrap_or(1) - 1) / 7 + 1 ) .unwrap(), - b'D' => { + 'D' => { if time < 0.0 { output.push('-'); } time = time.abs(); - write!(&mut output, "{:1$.0}", (time / DAY).floor(), count).unwrap(); + write!(&mut output, "{:1$.0}", (time / DAY).floor(), n).unwrap(); time %= DAY; } - b'H' => { + 'H' => { if time < 0.0 { output.push('-'); } time = time.abs(); - write!(&mut output, "{:1$.0}", (time / HOUR).floor(), count).unwrap(); + write!(&mut output, "{:1$.0}", (time / HOUR).floor(), n).unwrap(); time %= HOUR; } - b'M' => { + 'M' => { if time < 0.0 { output.push('-'); } @@ -529,7 +518,7 @@ impl<'a, 'b> DisplayValue<'a, 'b> { } } } - c if count == 1 => output.push(c as char), + c if n == 1 => output.push(c as char), _ => unreachable!(), } } diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index 1c4657ebaf..2c8f971a8f 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -332,57 +332,6 @@ impl Type { } } - /// For time and date formats, returns a template used for input and output - /// in a field of the given `width`. 
- /// - /// `width` only affects whether a 2-digit year or a 4-digit year is used, - /// that is, whether the returned string contains `yy` or `yyyy`, and - /// whether seconds are included, that is, whether the returned string - /// contains `:SS`. A caller that doesn't care whether the returned string - /// contains `yy` or `yyyy` or `:SS` can just specify 0 to omit them. - pub fn date_template(self, width: usize) -> Option<&'static str> { - let (short, long) = match self { - Self::F - | Self::Comma - | Self::Dot - | Self::Dollar - | Self::Pct - | Self::E - | Self::CC(_) - | Self::N - | Self::Z - | Self::P - | Self::PK - | Self::IB - | Self::PIB - | Self::PIBHex - | Self::RB - | Self::RBHex - | Self::WkDay - | Self::Month - | Self::A - | Self::AHex => return None, - Self::Date => ("dd-mmm-yy", "dd-mmm-yyyy"), - Self::ADate => ("mm/dd/yy", "mm/dd/yyyy"), - Self::EDate => ("dd.mm.yy", "dd.mm.yyyy"), - Self::JDate => ("yyddd", "yyyyddd"), - Self::SDate => ("yy/mm/dd", "yyyy/mm/dd"), - Self::QYr => ("q Q yy", "q Q yyyy"), - Self::MoYr => ("mmm yy", "mmm yyyy"), - Self::WkYr => ("ww WK yy", "ww WK yyyy"), - Self::DateTime => ("dd-mmm-yyyy HH:MM", "dd-mmm-yyyy HH:MM:SS"), - Self::YMDHMS => ("yyyy-mm-dd HH:MM", "yyyy-mm-dd HH:MM:SS"), - Self::MTime => ("MM", "MM:SS"), - Self::Time => ("HH:MM", "HH:MM:SS"), - Self::DTime => ("D HH:MM", "D HH:MM:SS"), - }; - if width >= long.len() { - Some(long) - } else { - Some(short) - } - } - pub fn as_string(&self) -> &'static str { match self { Self::F => "F", @@ -1103,3 +1052,93 @@ impl FromStr for NumberStyle { }) } } + +/// An item within a [DateTemplate]. +pub struct TemplateItem { + /// Character in the template. + pub c: char, + + /// Number of repetitions of the character. + pub n: usize, +} + +/// A template for date and time formats. 
+#[derive(Clone)] +pub struct DateTemplate(&'static str); + +impl DateTemplate { + /// Returns a [DateTemplate] used for date and time input and output in a + /// field of the given `type_` and `width`. + /// + /// `width` only affects whether a 2-digit year or a 4-digit year is used, + /// that is, whether the returned string contains `yy` or `yyyy`, and + /// whether seconds are included, that is, whether the returned string + /// contains `:SS`. A caller that doesn't care whether the returned string + /// contains `yy` or `yyyy` or `:SS` can just specify 0 to omit them. + pub fn new(type_: Type, width: usize) -> Option { + let (short, long) = match type_ { + Type::F + | Type::Comma + | Type::Dot + | Type::Dollar + | Type::Pct + | Type::E + | Type::CC(_) + | Type::N + | Type::Z + | Type::P + | Type::PK + | Type::IB + | Type::PIB + | Type::PIBHex + | Type::RB + | Type::RBHex + | Type::WkDay + | Type::Month + | Type::A + | Type::AHex => return None, + Type::Date => ("dd-mmm-yy", "dd-mmm-yyyy"), + Type::ADate => ("mm/dd/yy", "mm/dd/yyyy"), + Type::EDate => ("dd.mm.yy", "dd.mm.yyyy"), + Type::JDate => ("yyddd", "yyyyddd"), + Type::SDate => ("yy/mm/dd", "yyyy/mm/dd"), + Type::QYr => ("q Q yy", "q Q yyyy"), + Type::MoYr => ("mmm yy", "mmm yyyy"), + Type::WkYr => ("ww WK yy", "ww WK yyyy"), + Type::DateTime => ("dd-mmm-yyyy HH:MM", "dd-mmm-yyyy HH:MM:SS"), + Type::YMDHMS => ("yyyy-mm-dd HH:MM", "yyyy-mm-dd HH:MM:SS"), + Type::MTime => ("MM", "MM:SS"), + Type::Time => ("HH:MM", "HH:MM:SS"), + Type::DTime => ("D HH:MM", "D HH:MM:SS"), + }; + if width >= long.len() { + Some(DateTemplate(long)) + } else { + Some(DateTemplate(short)) + } + } + + pub fn for_format(format: Format) -> Option { + Self::new(format.type_(), format.w()) + } + + pub fn len(&self) -> usize { + self.0.len() + } +} + +impl Iterator for DateTemplate { + type Item = TemplateItem; + + fn next(&mut self) -> Option { + let mut iter = self.0.chars(); + let c = iter.next()?; + self.0 = iter.as_str(); + let 
mut n = 1; + while iter.next() == Some(c) { + self.0 = iter.as_str(); + n += 1; + } + Some(TemplateItem { c, n }) + } +} diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 7d2fc3f384..6ee2d3bea3 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -1,6 +1,6 @@ use crate::{ dictionary::Value, - format::{Format, Settings, Type}, + format::{DateTemplate, Format, Settings, TemplateItem, Type}, settings::{EndianSettings, Settings as PsppSettings}, }; use encoding_rs::Encoding; @@ -15,7 +15,7 @@ use thiserror::Error as ThisError; pub struct ParseError { format: Format, input: String, - details: ParseErrorDetails, + details: ParseErrorKind, } impl std::error::Error for ParseError {} @@ -31,7 +31,7 @@ impl Display for ParseError { } #[derive(ThisError, Clone, Debug)] -enum ParseErrorDetails { +enum ParseErrorKind { /// Field contents are not numeric. #[error("Field contents are not numeric.")] NotNumeric, @@ -43,6 +43,26 @@ enum ParseErrorDetails { /// Field contains unexpected non-digit. #[error("Field contains unexpected non-digit {0:?}.")] Nondigit(char), + + /// Day must be between 1 and 31. + #[error("Day ({0}) must be between 1 and 31.")] + InvalidDay(i32), + + /// Syntax error in date field. + #[error("Syntax error in date field.")] + DateSyntax, + + /// Julian day must have exactly three digits. + #[error("Julian day must have exactly three digits (not {0}).")] + InvalidYDayLen(usize), + + /// Julian day must be between 1 and 366, inclusive. + #[error("Julian day ({0}) must be between 1 and 366, inclusive.")] + InvalidYDay(i32), + + /// Unrecognized month format. + #[error("Unrecognized month format. 
Months may be specified as Arabic or Roman numerals or as at least 3 letters of their English names.")] + InvalidMonth, } pub struct ParseValue<'a> { @@ -118,7 +138,54 @@ impl<'a> ParseValue<'a> { }) } - fn parse_number(&self, input: &str, type_: Type) -> Result { + /* + /// Parses `s`, which is encoded in `encoding`. For string formats, + /// `encoding` is also the output encoding. + fn parse_encoded(&self, s: &[u8], encoding: &'static Encoding) -> Result { + if s.is_empty() { + return Ok(self.format.default_value()); + } + match self.format.type_ { + Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => { + self.parse_number(s, self.format.type_) + } + Type::CC(_) => self.parse_number(s, Type::F), + Type::N => self.parse_n(s), + Type::Z => todo!(), + Type::P => todo!(), + Type::PK => todo!(), + Type::IB => todo!(), + Type::PIB => todo!(), + Type::PIBHex => todo!(), + Type::RB => todo!(), + Type::RBHex => todo!(), + Type::Date => todo!(), + Type::ADate => todo!(), + Type::EDate => todo!(), + Type::JDate => todo!(), + Type::SDate => todo!(), + Type::QYr => todo!(), + Type::MoYr => todo!(), + Type::WkYr => todo!(), + Type::DateTime => todo!(), + Type::YMDHMS => todo!(), + Type::MTime => todo!(), + Type::Time => todo!(), + Type::DTime => todo!(), + Type::WkDay => todo!(), + Type::Month => todo!(), + Type::A => todo!(), + Type::AHex => todo!(), + } + .map_err(|details| ParseError { + format: self.format, + input: s.into(), + details, + }) + } + */ + + fn parse_number(&self, input: &str, type_: Type) -> Result { let style = self.settings.number_style(type_); let input = input.trim(); @@ -152,9 +219,6 @@ impl<'a> ParseValue<'a> { } input } - fn take<'a>(input: &'a str, rest: &'a str) -> (&'a str, &'a str) { - (&input[..input.len() - rest.len()], rest) - } let (_, input) = strip_prefix(input, &*style.prefix.s); let (sign, input) = strip_one_of(input, &['-', '+']); @@ -181,7 +245,7 @@ impl<'a> ParseValue<'a> { let (_, input) = strip_prefix(input, 
&*style.suffix.s); if !input.is_empty() { - return Err(ParseErrorDetails::NotNumeric); + return Err(ParseErrorKind::NotNumeric); } let mut number = SmallString::<[u8; 64]>::new(); @@ -202,15 +266,177 @@ impl<'a> ParseValue<'a> { match f64::from_str(&number) { Ok(value) => Ok(Value::Number(Some(value))), - Err(_) => Err(ParseErrorDetails::InvalidNumericSyntax), + Err(_) => Err(ParseErrorKind::InvalidNumericSyntax), } } - fn parse_n(&self, input: &str) -> Result { + fn parse_n(&self, input: &str) -> Result { match input.chars().find(|c| !c.is_ascii_digit()) { None => Ok(Value::Number(Some(input.parse().unwrap()))), - Some(nondigit) => Err(ParseErrorDetails::Nondigit(nondigit)), + Some(nondigit) => Err(ParseErrorKind::Nondigit(nondigit)), + } + } + + fn parse_date(&self, input: &str) -> Result { + let orig_input = input; + let mut input = input.trim(); + if input.is_empty() || input == "." { + return Ok(Value::sysmis()); + } + + let mut day = 1; + let mut yday = 1; + let mut month = 1; + let mut year = None; + + let mut iter = DateTemplate::for_format(self.format).unwrap(); + let template_width = iter.len(); + while let Some(TemplateItem { c, n }) = iter.next() { + match c { + 'd' if n < 3 => { + day = parse_day(&mut input)?; + } + 'd' => { + yday = parse_yday(&mut input)?; + } + 'm' => { + month = parse_month(&mut input)?; + } + 'y' => { + let max_digits = if !iter + .clone() + .next() + .is_some_and(|item| item.c.is_ascii_alphabetic()) + { + usize::MAX + } else if orig_input.len() >= template_width + 2 { + 4 + } else { + 2 + }; + //year = Some(parse_year(&mut input, max_digits)?); + year = Some(1); + } + _ => (), + } } + todo!() + } +} + +fn parse_day(s: &mut &str) -> Result { + let day = parse_int::(s)?; + if (1..=31).contains(&day) { + Ok(day) + } else { + Err(ParseErrorKind::InvalidDay(day)) + } +} + +fn parse_yday(input: &mut &str) -> Result { + let mut rest = *input; + let yday = parse_int::(&mut rest)?; + let yday_len = input.len() - rest.len(); + if yday_len 
!= 3 { + return Err(ParseErrorKind::InvalidYDayLen(yday_len)); + } else if !(1..=366).contains(&yday) { + return Err(ParseErrorKind::InvalidYDay(yday)); + } else { + *input = rest; + Ok(yday) + } +} + +fn parse_month(input: &mut &str) -> Result { + if input.starts_with(|c: char| c.is_ascii_digit()) { + let month = parse_int(input)?; + if (1..=12).contains(&month) { + return Ok(month); + } + } else { + let name; + (name, *input) = strip_name(*input); + let name = name.as_bytes(); + + static ENGLISH_NAMES: [&[u8]; 12] = [ + b"jan", b"feb", b"mar", b"apr", b"may", b"jun", b"jul", b"aug", b"sep", b"oct", b"nov", + b"dec", + ]; + if let Some(month) = match_name(&name[..3.min(name.len())], &ENGLISH_NAMES) { + return Ok(month); + } + + static ROMAN_NAMES: [&[u8]; 12] = [ + b"i", b"ii", b"iii", b"iv", b"v", b"vi", b"vii", b"viii", b"ix", b"x", b"xi", b"xii", + ]; + if let Some(month) = match_name(&name, &ROMAN_NAMES) { + return Ok(month); + } + } + Err(ParseErrorKind::InvalidMonth) +} + +fn match_name(name: &[u8], candidates: &[&[u8]]) -> Option { + for (index, candidate) in candidates.iter().enumerate() { + if candidate.eq_ignore_ascii_case(name) { + return Some(index as i32 + 1); + } + } + None +} + +fn strip_name(input: &str) -> (&str, &str) { + take( + input, + input.trim_start_matches(|c: char| c.is_ascii_alphabetic()), + ) +} + +fn take<'a>(input: &'a str, rest: &'a str) -> (&'a str, &'a str) { + (&input[..input.len() - rest.len()], rest) +} + +fn parse_int(input: &mut &str) -> Result +where + T: FromStr, +{ + fn strip_prefix<'a>(input: &'a str, prefix: &str) -> (bool, &'a str) { + if prefix.is_empty() { + (false, input) + } else if let Some(rest) = input.strip_prefix(prefix) { + (true, rest.trim_start()) + } else { + (false, input) + } + } + fn strip_one_of<'a>(input: &'a str, chars: &[char]) -> (Option, &'a str) { + let mut iter = input.chars(); + match iter.next() { + Some(c) if chars.contains(&c) => (Some(c), iter.as_str().trim_start()), + _ => (None, input), + 
} + } + fn strip_integer(mut input: &str, grouping: Option) -> &str { + while let Some(rest) = input.strip_prefix(|c: char| c.is_ascii_digit()) { + let rest = if let Some(grouping) = grouping { + rest.strip_prefix(grouping).unwrap_or(rest) + } else { + rest + }; + input = rest; + } + input + } + + let (_, rest) = strip_one_of(*input, &['+', '-']); + let (_, rest) = take(rest, rest.trim_start_matches(|c: char| c.is_ascii_digit())); + let (number, rest) = take(input, rest); + match number.parse::() { + Ok(value) => { + *input = rest; + Ok(value) + } + Err(_) => Err(ParseErrorKind::DateSyntax), } } @@ -220,13 +446,11 @@ mod test { fs::File, io::{BufRead, BufReader}, path::Path, - str::from_utf8, }; use encoding_rs::UTF_8; use crate::{ - dictionary::Value, format::{Format, Type}, settings::Settings, }; @@ -288,4 +512,13 @@ mod test { fn pct() { test("pct.txt", Type::Pct); } + + /* + #[test] + fn legacy() { + for i in 0..=u16::MAX { + let input = i.to_be_bytes(); + + } + }*/ } -- 2.30.2