use crate::{
calendar::{calendar_gregorian_to_offset, DateError},
dictionary::Value,
+ endian::{Endian, Parse},
format::{DateTemplate, Format, Settings, TemplateItem, Type},
settings::{EndianSettings, Settings as PsppSettings},
};
};
use thiserror::Error as ThisError;
+/// A piece of input held either as text or as raw bytes.
+///
+/// NOTE(review): nothing in the visible part of this file constructs or
+/// consumes this type yet; presumably it is meant to describe the input
+/// handed to a parser -- confirm the intended use.
+pub enum ParseInput {
+ /// Input held as an owned string.
+ String(String),
+ /// Input held as an owned, fixed-size byte buffer.
+ Bytes(Box<[u8]>),
+}
+
#[derive(Clone, Debug)]
pub struct ParseError {
format: Format,
input: String,
- details: ParseErrorKind,
+ kind: ParseErrorKind,
}
impl std::error::Error for ParseError {}
write!(
f,
"{:?} cannot be parsed as {}: {}",
- &self.input, &self.format, &self.details
+ &self.input, &self.format, &self.kind
)
}
}
#[error("Field contains unexpected non-digit {0:?}.")]
Nondigit(char),
+ /// Field contains unexpected non-hex digit.
+ #[error("Field contains unexpected non-hex digit {0:?}.")]
+ NonHexDigit(char),
+
+ /// Field contains invalid BCD digit.
+ #[error("Field contains invalid BCD digit ({0:?}).")]
+ NonBDCDigit(u8),
+
/// Day must be between 1 and 31.
#[error("Day ({0}) must be between 1 and 31.")]
InvalidDay(i32),
/// Invalid date.
#[error("{0}")]
InvalidDate(#[from] DateError),
+
+ /// Invalid zoned decimal (Z) syntax.
+ #[error("Invalid zoned decimal (Z) syntax.")]
+ InvalidZ,
+
+ /// Invalid BCD sign.
+ #[error("Invalid BCD sign. 0x{0:x}.")]
+ InvalidBCDSign(u8),
}
pub struct ParseValue<'a> {
format: Format,
settings: &'a Settings,
endian: EndianSettings,
+ implied_decimals: bool,
}
impl Format {
- pub fn parser(&self) -> ParseValue {
+ pub fn parser(&self) -> ParseValue<'static> {
ParseValue::new(*self)
}
}
-impl<'a> ParseValue<'a> {
+impl ParseValue<'static> {
pub fn new(format: Format) -> Self {
let settings = PsppSettings::global();
Self {
format,
settings: &settings.formats,
endian: settings.endian,
+ implied_decimals: false,
}
}
+}
+
+impl<'a> ParseValue<'a> {
pub fn with_settings(self, settings: &'a Settings) -> Self {
Self { settings, ..self }
}
pub fn with_endian(self, endian: EndianSettings) -> Self {
Self { endian, ..self }
}
+    /// Returns this parser with the `implied_decimals` flag switched on and
+    /// every other field left as it was.
+    pub fn with_implied_decimals(mut self) -> Self {
+        self.implied_decimals = true;
+        self
+    }
/// Parses `s` as this format. For string formats, `encoding` specifies the
/// output encoding.
}
Type::CC(_) => self.parse_number(s, Type::F),
Type::N => self.parse_n(s),
- Type::Z => todo!(),
+ Type::Z => self.parse_z(s),
Type::P => todo!(),
Type::PK => todo!(),
Type::IB => todo!(),
Type::PIB => todo!(),
- Type::PIBHex => todo!(),
+ Type::PIBHex => self.parse_pibhex(s),
Type::RB => todo!(),
- Type::RBHex => todo!(),
+ Type::RBHex => self.parse_rbhex(s),
Type::Date
| Type::ADate
| Type::EDate
.map_err(|details| ParseError {
format: self.format,
input: s.into(),
- details,
+ kind: details,
})
}
- /*
- /// Parses `s`, which is encoded in `encoding`. For string formats,
- /// `encoding` is also the output encoding.
- fn parse_encoded(&self, s: &[u8], encoding: &'static Encoding) -> Result<Value, ParseError> {
- if s.is_empty() {
- return Ok(self.format.default_value());
- }
- match self.format.type_ {
- Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => {
- self.parse_number(s, self.format.type_)
- }
- Type::CC(_) => self.parse_number(s, Type::F),
- Type::N => self.parse_n(s),
- Type::Z => todo!(),
- Type::P => todo!(),
- Type::PK => todo!(),
- Type::IB => todo!(),
- Type::PIB => todo!(),
- Type::PIBHex => todo!(),
- Type::RB => todo!(),
- Type::RBHex => todo!(),
- Type::Date => todo!(),
- Type::ADate => todo!(),
- Type::EDate => todo!(),
- Type::JDate => todo!(),
- Type::SDate => todo!(),
- Type::QYr => todo!(),
- Type::MoYr => todo!(),
- Type::WkYr => todo!(),
- Type::DateTime => todo!(),
- Type::YMDHMS => todo!(),
- Type::MTime => todo!(),
- Type::Time => todo!(),
- Type::DTime => todo!(),
- Type::WkDay => todo!(),
- Type::Month => todo!(),
- Type::A => todo!(),
- Type::AHex => todo!(),
- }
- .map_err(|details| ParseError {
- format: self.format,
- input: s.into(),
- details,
- })
- }
- */
+    /// Parses the raw bytes in `input` according to this parser's format.
+    ///
+    /// Empty input yields the format's default value.  Only the binary
+    /// formats (`P`, `PK`, `IB`, `PIB` and `RB`) are implemented so far;
+    /// every other format reaches `todo!()` and panics.  `encoding` is not
+    /// consulted by any of the implemented formats.
+    ///
+    /// # Errors
+    ///
+    /// Returns a [`ParseError`] wrapping the failure kind and this parser's
+    /// format.  Its `input` field is left as an empty string rather than a
+    /// rendering of the bytes.
+    pub fn parse_encoded(
+        &self,
+        input: &[u8],
+        encoding: &'static Encoding,
+    ) -> Result<Value, ParseError> {
+        if input.is_empty() {
+            return Ok(self.format.default_value());
+        }
+        let parsed = match self.format.type_ {
+            // Binary formats.
+            Type::P => self.parse_p(input),
+            Type::PK => self.parse_pk(input),
+            Type::IB => self.parse_ib(input),
+            Type::PIB => self.parse_pib(input),
+            Type::RB => self.parse_rb(input),
+
+            // String formats: not implemented yet.
+            Type::A | Type::AHex => todo!(),
+
+            // Textual numeric, date and time formats: not implemented yet.
+            Type::F
+            | Type::Comma
+            | Type::Dot
+            | Type::Dollar
+            | Type::Pct
+            | Type::E
+            | Type::CC(_)
+            | Type::N
+            | Type::Z
+            | Type::PIBHex
+            | Type::RBHex
+            | Type::Date
+            | Type::ADate
+            | Type::EDate
+            | Type::JDate
+            | Type::SDate
+            | Type::QYr
+            | Type::MoYr
+            | Type::WkYr
+            | Type::DateTime
+            | Type::YmdHms
+            | Type::MTime
+            | Type::Time
+            | Type::DTime
+            | Type::WkDay
+            | Type::Month => todo!(),
+        };
+        parsed.map_err(|kind| ParseError {
+            format: self.format,
+            input: String::new(),
+            kind,
+        })
+    }
fn parse_number(&self, input: &str, type_: Type) -> Result<Value, ParseErrorKind> {
let style = self.settings.number_style(type_);
}
}
+    /// Parses `input` as zoned decimal (`Z` format).
+    ///
+    /// Surrounding whitespace is ignored; blank input or a lone `.` yields
+    /// the system-missing value.  Otherwise the field consists of ASCII
+    /// digits with at most one `.`, and at most one sign-carrying character:
+    /// `{` or `A`..=`I` stands for a positive 0 or 1..=9, and `}` or
+    /// `J`..=`R` for a negative 0 or 1..=9.  No further digit may follow the
+    /// sign-carrying one.  Anything else is reported as
+    /// [`ParseErrorKind::InvalidZ`].
+    ///
+    /// When implied decimals are enabled and the input has no explicit `.`,
+    /// the value is scaled down by the format's number of decimal places.
+    fn parse_z(&self, input: &str) -> Result<Value, ParseErrorKind> {
+        /// What one character of a zoned decimal field stands for.
+        enum ZChar {
+            Digit(u32),
+            SignedDigit(u32, Sign),
+            Dot,
+            Invalid,
+        }
+
+        /// Classifies a single input character.
+        fn classify(c: char) -> ZChar {
+            let offset_from = |base: char| c as u32 - base as u32;
+            match c {
+                '0'..='9' => ZChar::Digit(offset_from('0')),
+                '{' => ZChar::SignedDigit(0, Sign::Positive),
+                'A'..='I' => ZChar::SignedDigit(offset_from('A') + 1, Sign::Positive),
+                '}' => ZChar::SignedDigit(0, Sign::Negative),
+                'J'..='R' => ZChar::SignedDigit(offset_from('J') + 1, Sign::Negative),
+                '.' => ZChar::Dot,
+                _ => ZChar::Invalid,
+            }
+        }
+
+        let field = input.trim();
+        if matches!(field, "" | ".") {
+            return Ok(Value::sysmis());
+        }
+
+        // Rebuild the field as plain decimal text, remembering the sign.
+        let mut text = SmallString::<[u8; 40]>::new();
+        let mut sign = None;
+        let mut seen_dot = false;
+        for c in field.chars() {
+            match classify(c) {
+                ZChar::Digit(digit) if sign.is_none() => {
+                    text.push(char::from_digit(digit, 10).unwrap());
+                }
+                ZChar::SignedDigit(digit, digit_sign) if sign.is_none() => {
+                    assert!(digit < 10, "{digit}");
+                    text.push(char::from_digit(digit, 10).unwrap());
+                    sign = Some(digit_sign);
+                }
+                ZChar::Dot if !seen_dot => {
+                    text.push('.');
+                    seen_dot = true;
+                }
+                _ => return Err(ParseErrorKind::InvalidZ),
+            }
+        }
+
+        // Implied decimals are expressed as a negative exponent suffix.
+        if self.implied_decimals && !seen_dot && self.format.d() != 0 {
+            write!(&mut text, "e-{}", self.format.d()).unwrap();
+        }
+
+        let magnitude = text.parse::<f64>().unwrap();
+        let negative = matches!(sign, Some(Sign::Negative));
+        Ok(Value::Number(Some(if negative {
+            -magnitude
+        } else {
+            magnitude
+        })))
+    }
+
+    /// Decodes `input` as packed binary-coded decimal: two digits per byte,
+    /// the more significant one in the high nibble.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error from [`nibble`] for the first nibble that is not a
+    /// decimal digit.
+    fn parse_bcd(input: &[u8]) -> Result<u128, ParseErrorKind> {
+        input.iter().try_fold(0u128, |value, &byte| {
+            let tens = nibble(byte >> 4)?;
+            let ones = nibble(byte & 0x0f)?;
+            Ok(value * 100 + tens * 10 + ones)
+        })
+    }
+
+    /// Scales `number` down by the format's decimal places.
+    ///
+    /// When implied decimals are enabled and the format has more than zero
+    /// decimal places, returns `number` divided by 10 raised to that count;
+    /// otherwise returns `number` unchanged.
+    fn apply_decimals(&self, number: f64) -> f64 {
+        match (self.implied_decimals, self.format.d()) {
+            (true, decimals) if decimals > 0 => number / 10.0f64.powi(decimals as i32),
+            _ => number,
+        }
+    }
+
+    /// Parses `input` as unsigned packed decimal (`PK` format): every nibble
+    /// is a decimal digit.  Implied decimals are applied to the result.
+    fn parse_pk(&self, input: &[u8]) -> Result<Value, ParseErrorKind> {
+        Self::parse_bcd(input)
+            .map(|digits| Value::Number(Some(self.apply_decimals(digits as f64))))
+    }
+
+    /// Parses `input` as signed packed decimal (`P` format).
+    ///
+    /// Every nibble except the last is a decimal digit.  The final nibble is
+    /// the sign: `0xf` means positive and `0xd` means negative.  Empty input
+    /// yields a missing number.  Implied decimals are applied to the result.
+    ///
+    /// # Errors
+    ///
+    /// Fails on a non-decimal digit nibble, or with
+    /// [`ParseErrorKind::InvalidBCDSign`] for any other sign nibble.
+    fn parse_p(&self, input: &[u8]) -> Result<Value, ParseErrorKind> {
+        let (last, leading) = match input.split_last() {
+            Some((&last, leading)) => (last, leading),
+            None => return Ok(Value::Number(None)),
+        };
+
+        // The last byte contributes one more digit in its high nibble.
+        let magnitude = Self::parse_bcd(leading)? * 10 + nibble(last >> 4)?;
+
+        let sign_nibble = last & 0x0f;
+        let signed = match sign_nibble {
+            0xf => magnitude as f64,
+            0xd => -(magnitude as f64),
+            _ => return Err(ParseErrorKind::InvalidBCDSign(sign_nibble)),
+        };
+        Ok(Value::Number(Some(self.apply_decimals(signed))))
+    }
+
+    /// Assembles the bytes of `input` into an unsigned integer, honoring the
+    /// configured input byte order.
+    ///
+    /// Bytes shifted beyond 128 bits are silently dropped.
+    fn parse_binary(&self, input: &[u8]) -> u128 {
+        // After the shift the low 8 bits are zero, so OR-ing in the byte is
+        // the same as adding it.
+        let push_byte = |acc: u128, byte: &u8| (acc << 8) | u128::from(*byte);
+        match self.endian.input {
+            Endian::Big => input.iter().fold(0, push_byte),
+            Endian::Little => input.iter().rev().fold(0, push_byte),
+        }
+    }
+
+    /// Parses `input` as a signed two's-complement binary integer (`IB`
+    /// format) in the configured input byte order, then applies implied
+    /// decimals.
+    ///
+    /// The most significant bit of the field is the sign bit, so for example
+    /// the single byte `0xff` is -1 and `0x80` is -128.
+    fn parse_ib(&self, input: &[u8]) -> Result<Value, ParseErrorKind> {
+        let raw = self.parse_binary(input);
+        let bits = input.len() * 8;
+
+        // Fields narrower than 128 bits need explicit sign extension: when
+        // the field's top bit is set, the value is `raw - 2^bits`.  The
+        // previous code negated that already-negative result, so negative
+        // inputs came out positive.  A full 128-bit field is reinterpreted
+        // directly by the cast.  Checking the width first also avoids the
+        // shift overflow that empty or over-long input used to trigger.
+        let number = if (1..128).contains(&bits) && (raw >> (bits - 1)) & 1 == 1 {
+            (raw as i128) - (1i128 << bits)
+        } else {
+            raw as i128
+        };
+        Ok(Value::Number(Some(self.apply_decimals(number as f64))))
+    }
+
+    /// Parses `input` as an unsigned binary integer (`PIB` format) in the
+    /// configured input byte order, then applies implied decimals.
+    fn parse_pib(&self, input: &[u8]) -> Result<Value, ParseErrorKind> {
+        let unsigned = self.parse_binary(input) as f64;
+        Ok(Value::Number(Some(self.apply_decimals(unsigned))))
+    }
+
+    /// Parses `input` as a binary floating-point number (`RB` format).
+    ///
+    /// At most the first 8 bytes of `input` are used; shorter input is
+    /// padded with trailing zero bytes.  The bytes are decoded with the
+    /// configured input byte order and reinterpreted as an `f64`.  The value
+    /// `-f64::MAX` is treated as system-missing.
+    fn parse_rb(&self, input: &[u8]) -> Result<Value, ParseErrorKind> {
+        const SYSMIS: f64 = -f64::MAX;
+
+        let mut bytes = [0u8; 8];
+        for (dst, src) in bytes.iter_mut().zip(input) {
+            *dst = *src;
+        }
+        let bits: u64 = self.endian.input.parse(bytes);
+
+        let number = f64::from_bits(bits);
+        Ok(Value::Number(if number == SYSMIS {
+            None
+        } else {
+            Some(number)
+        }))
+    }
+
+    /// Parses `input` as a hexadecimal integer.
+    ///
+    /// Surrounding whitespace is ignored.  Blank input or a lone `.` yields
+    /// `Ok(None)`; otherwise the whole field must parse as a `u64` in base
+    /// 16.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ParseErrorKind::NonHexDigit`] carrying the first character
+    /// that is not a hex digit.  If every character is a hex digit but the
+    /// value does not fit in 64 bits, the first digit that does not fit is
+    /// reported instead.
+    ///
+    /// NOTE(review): no dedicated overflow error kind is visible here, so
+    /// overflow reuses `NonHexDigit`; consider adding one.
+    fn parse_hex(&self, input: &str) -> Result<Option<u64>, ParseErrorKind> {
+        let input = input.trim();
+        if input.is_empty() || input == "." {
+            return Ok(None);
+        }
+        match u64::from_str_radix(input, 16) {
+            Ok(value) => Ok(Some(value)),
+            Err(_) => {
+                let c = input
+                    .chars()
+                    .find(|c| !c.is_ascii_hexdigit())
+                    // All hex digits: the only way left to fail is overflow,
+                    // where the 17th significant digit is the first that
+                    // cannot fit in a `u64`.
+                    .or_else(|| input.trim_start_matches('0').chars().nth(16))
+                    .expect(
+                        "all-hex-digit input only fails to parse on overflow, \
+                         which needs more than 16 significant digits",
+                    );
+                Err(ParseErrorKind::NonHexDigit(c))
+            }
+        }
+    }
+
+    /// Parses `input` as a hexadecimal unsigned integer (`PIBHEX` format),
+    /// converting the integer's value to a number.
+    fn parse_pibhex(&self, input: &str) -> Result<Value, ParseErrorKind> {
+        let value = self.parse_hex(input)?;
+        Ok(Value::Number(value.map(|integer| integer as f64)))
+    }
+
+    /// Parses `input` as the hexadecimal bit pattern of an `f64` (`RBHEX`
+    /// format), reinterpreting the bits as a floating-point number.
+    fn parse_rbhex(&self, input: &str) -> Result<Value, ParseErrorKind> {
+        let bits = self.parse_hex(input)?;
+        Ok(Value::Number(bits.map(f64::from_bits)))
+    }
+
fn parse_date(&self, input: &str) -> Result<Value, ParseErrorKind> {
let mut p = StrParser(input.trim());
if p.0.is_empty() || p.0 == "." {
}
}
+/*
+#[derive(Copy, Clone, Debug)]
+pub struct ByteParser<'a>(pub &'a [u8]);
+
+impl<'a> ByteParser<'a> {
+ pub fn new(s: &'a [u8]) -> Self {
+ Self(s)
+ }
+
+ pub fn strip_prefix(&mut self, prefix: &'a [u8]) -> bool {
+ if prefix.is_empty() {
+ false
+ } else if let Some(rest) = self.0.strip_prefix(prefix) {
+ self.0 = rest;
+ true
+ } else {
+ false
+ }
+ }
+
+ fn strip_one_of(&mut self, chars: &[char]) -> Option<char> {
+ let mut iter = self.0.iter();
+ match iter.next() {
+ Some(c) if chars.contains(&c) => {
+ self.0 = iter.as_str();
+ Some(c)
+ }
+ _ => None,
+ }
+ }
+
+ fn strip_matches(&mut self, f: impl Fn(char) -> bool) -> &'a [u8] {
+ self.advance(self.0.trim_start_matches(f))
+ }
+
+ fn strip_ws(&mut self) {
+ self.0 = self.0.trim_start();
+ }
+
+ fn advance(&mut self, rest: &'a [u8]) -> &'a [u8] {
+ let head = self.up_to(rest);
+ self.0 = rest;
+ head
+ }
+
+ fn up_to(&self, rest: &'a [u8]) -> &'a [u8] {
+ &self.0[..self.0.len() - rest.len()]
+ }
+}*/
+
+/// Converts one BCD nibble to its decimal digit value.
+///
+/// Values 0 through 9 are returned widened to `u128`; anything larger is
+/// rejected with [`ParseErrorKind::NonBDCDigit`].
+fn nibble(b: u8) -> Result<u128, ParseErrorKind> {
+    match b {
+        0..=9 => Ok(u128::from(b)),
+        _ => Err(ParseErrorKind::NonBDCDigit(b)),
+    }
+}
+
#[cfg(test)]
mod test {
use std::{
};
use encoding_rs::UTF_8;
+ use rand::random;
use crate::{
calendar::{days_in_month, is_leap_year},
dictionary::Value,
- format::{parse::Sign, Epoch, Format, Settings as FormatSettings, Type},
+ endian::Endian,
+ format::{
+ parse::{ParseError, ParseErrorKind, Sign},
+ Epoch, Format, Settings as FormatSettings, Type,
+ },
+ settings::EndianSettings,
};
fn test(name: &str, type_: Type) {
let input_stream = BufReader::new(File::open(base.join("num-in.txt")).unwrap());
let expected_stream = BufReader::new(File::open(base.join(name)).unwrap());
let format = Format::new(type_, 40, 1).unwrap();
- for (line_number, (input, expected)) in input_stream
+ for ((input, expected), line_number) in input_stream
.lines()
.map(|result| result.unwrap())
.zip(expected_stream.lines().map(|result| result.unwrap()))
- .enumerate()
+ .zip(1..)
{
- let line_number = line_number + 1;
let result = format.parser().parse(&input, UTF_8);
let error = result.clone().err();
let value = result
}
}
}
+
+    /// Checks every two-character `PIBHEX` field made of hex digits in
+    /// either case, plus the invalid character `x`, and the blank inputs.
+    #[test]
+    fn pibhex() {
+        /// Every hex digit in both cases paired with its value, followed by
+        /// the invalid character `x` (paired with a dummy value of 0).
+        fn hex_digits() -> impl Iterator<Item = (u8, char)> {
+            let decimal = (0u8..=9).zip('0'..='9');
+            let lower = (0xau8..=0xf).zip('a'..='f');
+            let upper = (0xau8..=0xf).zip('A'..='F');
+            decimal
+                .chain(lower)
+                .chain(upper)
+                .chain(std::iter::once((0, 'x')))
+        }
+
+        let parser = Format::new(Type::PIBHex, 2, 0).unwrap().parser();
+        for (hi, hi_char) in hex_digits() {
+            for (lo, lo_char) in hex_digits() {
+                let text = [hi_char, lo_char].iter().collect::<String>();
+                // A parse error is treated like a missing value.
+                let parsed = parser
+                    .parse(&text, UTF_8)
+                    .unwrap_or(Value::Number(None))
+                    .as_number()
+                    .unwrap();
+                let expected = if hi_char == 'x' || lo_char == 'x' {
+                    None
+                } else {
+                    Some((hi * 16 + lo) as f64)
+                };
+                assert_eq!(parsed, expected);
+            }
+        }
+
+        for blank in [".", ""] {
+            assert_eq!(parser.parse(blank, UTF_8).unwrap(), Value::Number(None));
+        }
+    }
+
+    /// Round-trips random `f64` values through their 16-digit hexadecimal
+    /// bit pattern and the `RBHEX` parser.
+    #[test]
+    fn rbhex() {
+        let format = Format::new(Type::RBHex, 16, 0).unwrap();
+        for _ in 0..10000 {
+            let number: f64 = random();
+            let formatted = format!("{:016x}", number.to_bits());
+            let parsed = format
+                .parser()
+                .parse(&formatted, UTF_8)
+                .unwrap()
+                .as_number()
+                .unwrap()
+                .unwrap();
+            assert_eq!(parsed, number, "formatted as {formatted:?}");
+        }
+    }
+
+    /// Round-trips random `f64` values through their big-endian byte
+    /// representation and the `RB` parser.
+    #[test]
+    fn rb() {
+        let format = Format::new(Type::RB, 8, 0).unwrap();
+        for _ in 0..10000 {
+            let number: f64 = random();
+            let raw = number.to_be_bytes();
+            let parsed = format
+                .parser()
+                .with_endian(EndianSettings::new(Endian::Big))
+                .parse_encoded(&raw, UTF_8)
+                .unwrap()
+                .as_number()
+                .unwrap()
+                .unwrap();
+            assert_eq!(parsed, number);
+        }
+    }
+
+    /// Checks that `N` format accepts every zero-padded two-digit number and
+    /// rejects a leading space and a lone `.` as non-digits.
+    #[test]
+    fn n() {
+        let parser = Format::new(Type::N, 2, 0).unwrap().parser();
+        for number in 0..=99 {
+            let formatted = format!("{number:02}");
+            let parsed = parser
+                .parse(&formatted, UTF_8)
+                .unwrap()
+                .as_number()
+                .unwrap()
+                .unwrap();
+            assert_eq!(parsed, number as f64, "formatted as {formatted:?}");
+        }
+
+        // Each bad input must be rejected, naming the offending character.
+        for (input, offender) in [(" 0", ' '), (".", '.')] {
+            assert!(matches!(
+                parser.parse(input, UTF_8),
+                Err(ParseError {
+                    kind: ParseErrorKind::Nondigit(c),
+                    ..
+                }) if c == offender
+            ));
+        }
+    }
+
+    /// Checks `Z` format parsing of signed two-digit fields, the lone `.`,
+    /// and fields parsed with one implied decimal place.
+    #[test]
+    fn z() {
+        /// Replaces the final digit of `text` with the zoned character that
+        /// carries both that digit and the requested sign.
+        fn overpunch(mut text: String, negative: bool) -> String {
+            let last = text.pop().unwrap();
+            let digit = last.to_digit(10).unwrap() as usize;
+            let table: &[u8; 10] = if negative {
+                b"}JKLMNOPQR"
+            } else {
+                b"{ABCDEFGHI"
+            };
+            text.push(table[digit] as char);
+            text
+        }
+
+        // Integers, both zero-padded and space-padded.
+        let parser = Format::new(Type::Z, 2, 0).unwrap().parser();
+        for number in -99i32..=99 {
+            let magnitude = number.abs();
+            for plain in [format!("{magnitude:02}"), format!("{magnitude:2}")] {
+                let formatted = overpunch(plain, number < 0);
+                let parsed = parser
+                    .parse(&formatted, UTF_8)
+                    .unwrap()
+                    .as_number()
+                    .unwrap()
+                    .unwrap();
+                assert_eq!(parsed, number as f64, "formatted as {formatted:?}");
+            }
+        }
+        assert_eq!(parser.parse(".", UTF_8).unwrap(), Value::Number(None));
+
+        // One implied decimal: input without a `.` is scaled by a tenth,
+        // while input with an explicit `.` is taken as written.
+        let parser = Format::new(Type::Z, 4, 1)
+            .unwrap()
+            .parser()
+            .with_implied_decimals();
+        for number in -999i32..=999 {
+            let tenths = number as f64 / 10.0;
+            for plain in [format!("{}", number.abs()), format!("{:.1}", tenths.abs())] {
+                let formatted = overpunch(plain, number < 0);
+                let parsed = parser
+                    .parse(&formatted, UTF_8)
+                    .unwrap()
+                    .as_number()
+                    .unwrap()
+                    .unwrap();
+                assert_eq!(parsed, tenths, "formatted as {formatted:?}");
+            }
+        }
+    }
}