From 17e7516f8a23395429e449916023bcf9b88b3ffc Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 30 Mar 2025 10:03:17 -0700 Subject: [PATCH] work on date parsing --- rust/pspp/src/format/mod.rs | 23 +++ rust/pspp/src/format/parse.rs | 289 +++++++++++++++++++++++++--------- 2 files changed, 238 insertions(+), 74 deletions(-) diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index 2c8f971a8f..01f2df2872 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -767,6 +767,29 @@ impl Not for Decimal { #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] pub struct Epoch(pub i32); +impl Epoch { + /// Applies the epoch to `year`: + /// + /// - If `year` is 2 digits (between 0 and 99, inclusive), returns it + /// converted it to the correct year considering the epoch. + /// + /// - Otherwise, returns `year` unchanged. + fn apply(&self, year: i32) -> i32 { + match year { + 0..=99 => { + let century = self.0 / 100 * 100; + let offset = self.0 - century; + if year >= offset { + year + century + } else { + year + century + 100 + } + } + other => other, + } + } +} + impl Default for Epoch { fn default() -> Self { static DEFAULT: LazyLock = LazyLock::new(|| Epoch(Local::now().year() - 69)); diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index c6e05b6f12..f67a739c18 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -53,16 +53,44 @@ enum ParseErrorKind { DateSyntax, /// Julian day must have exactly three digits. - #[error("Julian day must have exactly three digits (not {0}).")] - InvalidYDayLen(usize), + #[error("Julian day must have exactly three digits.")] + InvalidYDayLen, /// Julian day must be between 1 and 366, inclusive. #[error("Julian day ({0}) must be between 1 and 366, inclusive.")] InvalidYDay(i32), + /// Quarter must be between 1 and 4, inclusive. + #[error("Quarter ({0}) must be between 1 and 4, inclusive.")] + InvalidQuarter(i32), + + /// Week must be between 1 and 53, inclusive. + #[error("Week ({0}) must be between 1 and 53, inclusive.")] + InvalidWeek(i32), + /// Unrecognized month format. #[error("Unrecognized month format. Months may be specified as Arabic or Roman numerals or as at least 3 letters of their English names.")] InvalidMonth, + + /// Delimiter expected between fields in time. + #[error("Delimiter expected between fields in time.")] + ExpectedTimeDelimiter, + + /// Delimiter expected between fields in date. + #[error("Delimiter expected between fields in date.")] + ExpectedDateDelimiter, + + /// Minute must be between 0 and 59, inclusive. + #[error("Minute ({0}) must be between 0 and 59, inclusive.")] + InvalidMinute(i32), + + /// Invalid weekday name. + #[error("Unrecognized weekday name. At least the first two letters of an English weekday name must be specified.")] + InvalidWeekdayName, + + /// Expected character. + #[error("{0:?} expected in date field.")] + ExpectedChar(char), } pub struct ParseValue<'a> { @@ -268,28 +296,31 @@ impl<'a> ParseValue<'a> { fn parse_date(&self, input: &str) -> Result { let orig_input = input; - let mut input = input.trim(); + let input = input.trim(); if input.is_empty() || input == "." { return Ok(Value::sysmis()); } + let mut p = StrParser(input); let mut day = 1; let mut yday = 1; let mut month = 1; let mut year = None; + let mut time_sign = None; + let mut time = 0.0; let mut iter = DateTemplate::for_format(self.format).unwrap(); let template_width = iter.len(); while let Some(TemplateItem { c, n }) = iter.next() { match c { 'd' if n < 3 => { - day = parse_day(&mut input)?; + day = parse_day(&mut p)?; } 'd' => { - yday = parse_yday(&mut input)?; + yday = parse_yday(&mut p)?; } 'm' => { - month = parse_month(&mut input)?; + month = parse_month(&mut p)?; } 'y' => { let max_digits = if !iter @@ -303,18 +334,98 @@ impl<'a> ParseValue<'a> { } else { 2 }; - //year = Some(parse_year(&mut input, max_digits)?); - year = Some(1); + year = Some(parse_year(&mut p, self.settings, max_digits)?); + } + 'q' => month = parse_quarter(&mut p)?, + 'w' => yday = parse_week(&mut p)?, + 'D' => { + time_sign = Some(parse_sign(&mut p, time_sign)); + time += parse_time(&mut p)? * 60.0 * 60.0 * 24.0; + } + 'H' => { + time_sign = Some(parse_sign(&mut p, time_sign)); + time += parse_time(&mut p)? * 60.0 * 60.0; + } + 'M' => { + if self.format.type_ == Type::MTime { + time_sign = Some(parse_sign(&mut p, time_sign)); + } + time += self.parse_minute_second(&mut p)?; + } + '-' | '/' | '.' => parse_date_delimiter(&mut p)?, + ':' => parse_time_delimiter(&mut p)?, + ' ' => { + if self.format.type_ != Type::MoYr { + p.strip_ws(); + } else { + parse_date_delimiter(&mut p)? + } + } + c => { + debug_assert_eq!(n, 1); + if p.strip_one_of(&[c.to_ascii_uppercase(), c.to_ascii_lowercase()]) + .is_none() + { + return Err(ParseErrorKind::ExpectedChar(c)); + } } - _ => (), } } todo!() } + + fn parse_minute_second<'b>(&self, p: &mut StrParser<'b>) -> Result { + let minute = parse_int::(p)?; + if self.format.type_ != Type::MTime && !(0..=59).contains(&minute) { + return Err(ParseErrorKind::InvalidMinute(minute)); + } + let time = minute as f64 * 60.0; + + if parse_time_delimiter(p).is_err() || !p.0.starts_with(|c: char| c.is_ascii_digit()) { + return Ok(time); + } + let seconds_start = p.0; + let integer = p.strip_matches(|c| c.is_ascii_digit()); + let fraction = if p.strip_prefix(self.settings.decimal.as_str()) { + p.strip_matches(|c| c.is_ascii_digit()) + } else { + "" + }; + + let mut number = SmallString::<[u8; 40]>::new(); + number.push_str(integer); + number.push('.'); + number.push_str(fraction); + let seconds = number.parse::().unwrap(); + Ok(time + seconds) + } +} + +enum Sign { + Positive, + Negative, +} + +fn parse_sign<'a>(p: &mut StrParser<'a>, sign: Option) -> Sign { + if let Some(sign) = sign { + sign + } else if p.strip_one_of(&['-', '+']) == Some('-') { + Sign::Negative + } else { + Sign::Positive + } } -fn parse_day(s: &mut &str) -> Result { - let day = parse_int::(s)?; +fn parse_time<'a>(p: &mut StrParser<'a>) -> Result { + let number = parse_int::(p)?; + if number < 0 { + return Err(ParseErrorKind::DateSyntax); + } + Ok(number as f64) +} + +fn parse_day<'a>(p: &mut StrParser<'a>) -> Result { + let day = parse_int::(p)?; if (1..=31).contains(&day) { Ok(day) } else { @@ -322,50 +433,87 @@ fn parse_day(s: &mut &str) -> Result { } } -fn parse_yday(input: &mut &str) -> Result { - let mut rest = *input; - let yday = parse_int::(&mut rest)?; - let yday_len = input.len() - rest.len(); - if yday_len != 3 { - return Err(ParseErrorKind::InvalidYDayLen(yday_len)); - } else if !(1..=366).contains(&yday) { +fn parse_yday<'a>(p: &mut StrParser<'a>) -> Result { + let Some(s) = p.0.get(..3) else { + return Err(ParseErrorKind::InvalidYDayLen); + }; + if !s.chars().all(|c| c.is_ascii_digit()) { + return Err(ParseErrorKind::InvalidYDayLen); + } + let yday = s.parse().unwrap(); + if !(1..=366).contains(&yday) { return Err(ParseErrorKind::InvalidYDay(yday)); - } else { - *input = rest; - Ok(yday) } + p.0 = &p.0[..3]; + Ok(yday) } -fn parse_month(input: &mut &str) -> Result { - if input.starts_with(|c: char| c.is_ascii_digit()) { - let month = parse_int(input)?; +fn parse_month<'a>(p: &mut StrParser<'a>) -> Result { + if p.0.starts_with(|c: char| c.is_ascii_digit()) { + let month = parse_int(p)?; if (1..=12).contains(&month) { return Ok(month); } } else { - let name; - (name, *input) = strip_name(*input); - let name = name.as_bytes(); + let name = p.strip_matches(|c| c.is_ascii_alphabetic()); - static ENGLISH_NAMES: [&[u8]; 12] = [ - b"jan", b"feb", b"mar", b"apr", b"may", b"jun", b"jul", b"aug", b"sep", b"oct", b"nov", - b"dec", + static ENGLISH_NAMES: [&str; 12] = [ + "jan", "fe", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec", ]; if let Some(month) = match_name(&name[..3.min(name.len())], &ENGLISH_NAMES) { return Ok(month); } - static ROMAN_NAMES: [&[u8]; 12] = [ - b"i", b"ii", b"iii", b"iv", b"v", b"vi", b"vii", b"viii", b"ix", b"x", b"xi", b"xii", + static ROMAN_NAMES: [&str; 12] = [ + "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii", ]; - if let Some(month) = match_name(&name, &ENGLISH_NAMES) { + if let Some(month) = match_name(name, &ENGLISH_NAMES) { return Ok(month); } } Err(ParseErrorKind::InvalidMonth) } -fn match_name(name: &[u8], candidates: &[&[u8]]) -> Option { +fn parse_weekday<'a>(p: &mut StrParser<'a>) -> Result { + static WEEKDAY_NAMES: [&str; 7] = ["su", "mo", "tu", "we", "th", "fr", "sa"]; + let name = p.strip_matches(|c| c.is_ascii_alphabetic()); + match_name(name, &WEEKDAY_NAMES).ok_or(ParseErrorKind::InvalidWeekdayName) +} + +fn parse_quarter<'a>(p: &mut StrParser<'a>) -> Result { + match parse_int(p)? { + quarter @ 1..=4 => Ok((quarter - 1) * 3 + 1), + other => Err(ParseErrorKind::InvalidQuarter(other)), + } +} + +fn parse_week<'a>(p: &mut StrParser<'a>) -> Result { + match parse_int(p)? { + week @ 1..=53 => Ok((week - 1) * 7 + 1), + other => Err(ParseErrorKind::InvalidWeek(other)), + } +} + +fn parse_time_delimiter<'a>(p: &mut StrParser<'a>) -> Result<(), ParseErrorKind> { + let delimiter = p.strip_matches(|c| c == ':' || c.is_ascii_whitespace()); + if !delimiter.is_empty() { + Ok(()) + } else { + Err(ParseErrorKind::ExpectedTimeDelimiter) + } +} + +fn parse_date_delimiter<'a>(p: &mut StrParser<'a>) -> Result<(), ParseErrorKind> { + let delimiter = p + .strip_matches(|c| c == '-' || c == '/' || c == '.' || c == ',' || c.is_ascii_whitespace()); + if !delimiter.is_empty() { + Ok(()) + } else { + Err(ParseErrorKind::ExpectedDateDelimiter) + } +} + +fn match_name(name: &str, candidates: &[&str]) -> Option { for (index, candidate) in candidates.iter().enumerate() { if candidate.eq_ignore_ascii_case(name) { return Some(index as i32 + 1); @@ -381,52 +529,41 @@ fn strip_name(input: &str) -> (&str, &str) { ) } +fn parse_year<'a>( + p: &mut StrParser<'a>, + settings: &Settings, + max_digits: usize, +) -> Result { + let head = p.0; + let head = if head.len() > max_digits { + head.get(..max_digits).ok_or(ParseErrorKind::DateSyntax)? + } else { + head + }; + let year = head + .parse::() + .map_err(|_| ParseErrorKind::DateSyntax)?; + p.0 = &p.0[head.len()..]; + Ok(settings.epoch.apply(year)) +} + fn take<'a>(input: &'a str, rest: &'a str) -> (&'a str, &'a str) { (&input[..input.len() - rest.len()], rest) } -fn parse_int(input: &mut &str) -> Result +fn parse_int<'a, T>(p: &mut StrParser<'a>) -> Result where T: FromStr, { - fn strip_prefix<'a>(input: &'a str, prefix: &str) -> (bool, &'a str) { - if prefix.is_empty() { - (false, input) - } else if let Some(rest) = input.strip_prefix(prefix) { - (true, rest.trim_start()) - } else { - (false, input) - } - } - fn strip_one_of<'a>(input: &'a str, chars: &[char]) -> (Option, &'a str) { - let mut iter = input.chars(); - match iter.next() { - Some(c) if chars.contains(&c) => (Some(c), iter.as_str().trim_start()), - _ => (None, input), - } - } - fn strip_integer(mut input: &str, grouping: Option) -> &str { - while let Some(rest) = input.strip_prefix(|c: char| c.is_ascii_digit()) { - let rest = if let Some(grouping) = grouping { - rest.strip_prefix(grouping).unwrap_or(rest) - } else { - rest - }; - input = rest; - } - input - } - - let (_, rest) = strip_one_of(*input, &['+', '-']); - let (_, rest) = take(rest, rest.trim_start_matches(|c: char| c.is_ascii_digit())); - let (number, rest) = take(input, rest); - match number.parse::() { - Ok(value) => { - *input = rest; - Ok(value) - } - Err(_) => Err(ParseErrorKind::DateSyntax), - } + let mut tmp = *p; + tmp.strip_one_of(&['+', '-']).inspect(|_| tmp.strip_ws()); + tmp.strip_matches(|c| c.is_ascii_digit()); + let number = p + .up_to(tmp.0) + .parse::() + .map_err(|_| ParseErrorKind::DateSyntax)?; + *p = tmp; + Ok(number) } #[derive(Copy, Clone, Debug)] @@ -468,10 +605,14 @@ impl<'a> StrParser<'a> { } fn advance(&mut self, rest: &'a str) -> &'a str { - let head = &self.0[..self.0.len() - rest.len()]; + let head = self.up_to(rest); self.0 = rest; head } + + fn up_to(&self, rest: &'a str) -> &'a str { + &self.0[..self.0.len() - rest.len()] + } } #[cfg(test)] -- 2.30.2