use encoding_rs::Encoding;
use smallstr::SmallString;
use std::{
+ borrow::Cow,
fmt::{Display, Write},
str::FromStr,
};
use thiserror::Error as ThisError;
-/*
-pub enum ParseInput {
- String(String),
- Bytes(Box<[u8]>),
-}*/
+/// A borrowed string paired with the information needed to interpret its
+/// bytes: either raw bytes in a named encoding, or text that is already UTF-8.
+pub enum EncodedStr<'a> {
+ /// Raw `bytes` that are to be interpreted using `encoding`.
+ Encoded {
+ bytes: &'a [u8],
+ encoding: &'static Encoding,
+ },
+ /// Text held as a Rust `&str`, i.e. already UTF-8.
+ Utf8 {
+ s: &'a str,
+ },
+}
+
+impl<'a> EncodedStr<'a> {
+    /// Wraps `bytes`, which are encoded in `encoding`.
+    pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
+        Self::Encoded { bytes, encoding }
+    }
+    /// Returns the contents as UTF-8 text.
+    ///
+    /// `Encoded` input is decoded from its own encoding (without BOM
+    /// handling); whether the decoder reported malformed input is discarded.
+    /// `Utf8` input is returned borrowed.
+    pub fn as_str(&self) -> Cow<'_, str> {
+        match self {
+            EncodedStr::Encoded { bytes, encoding } => {
+                encoding.decode_without_bom_handling(bytes).0
+            }
+            EncodedStr::Utf8 { s } => Cow::from(*s),
+        }
+    }
+    /// Returns the underlying bytes exactly as stored, without any recoding.
+    pub fn as_bytes(&self) -> &[u8] {
+        match self {
+            EncodedStr::Encoded { bytes, .. } => bytes,
+            EncodedStr::Utf8 { s } => s.as_bytes(),
+        }
+    }
+    /// Returns the contents recoded into `encoding`.
+    ///
+    /// `Encoded` input is first decoded from its own (source) encoding and
+    /// then encoded into the requested target `encoding`; `Utf8` input is
+    /// encoded directly. The result borrows from `self` whenever neither step
+    /// had to change any bytes, and is owned otherwise. The encoder's
+    /// "had unmappable characters" flag is discarded.
+    pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<'_, [u8]> {
+        match self {
+            // The source encoding is bound to its own name: binding it as
+            // `encoding` would shadow the target parameter and make the
+            // re-encoding step use the source encoding again.
+            EncodedStr::Encoded {
+                bytes,
+                encoding: source,
+            } => match source.decode_without_bom_handling(bytes).0 {
+                // Decoding borrowed straight from `bytes`, so the encoder's
+                // output may borrow from them as well.
+                Cow::Borrowed(utf8) => encoding.encode(utf8).0,
+                // Decoding had to allocate; the encoder's output may borrow
+                // from that temporary, so it must be made owned before the
+                // temporary is dropped.
+                Cow::Owned(utf8) => Cow::Owned(encoding.encode(&utf8).0.into_owned()),
+            },
+            EncodedStr::Utf8 { s } => encoding.encode(s).0,
+        }
+    }
+    /// Returns true if the string contains no bytes.
+    pub fn is_empty(&self) -> bool {
+        match self {
+            EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
+            EncodedStr::Utf8 { s } => s.is_empty(),
+        }
+    }
+}
+
+/// A `&str` converts into the [`EncodedStr::Utf8`] variant, borrowing the
+/// string rather than copying it.
+impl<'a> From<&'a str> for EncodedStr<'a> {
+    fn from(s: &'a str) -> Self {
+        EncodedStr::Utf8 { s }
+    }
+}
#[derive(Clone, Debug)]
pub struct ParseError {
settings: &'a Settings,
endian: EndianSettings,
implied_decimals: bool,
+ output_encoding: &'static Encoding,
}
impl Format {
- pub fn parser(&self) -> ParseValue<'static> {
- ParseValue::new(*self)
+ /// Returns a [`ParseValue`] for this format, built by passing a copy of
+ /// `self` and `output_encoding` to [`ParseValue::new`].
+ pub fn parser(&self, output_encoding: &'static Encoding) -> ParseValue<'static> {
+ ParseValue::new(*self, output_encoding)
}
}
impl ParseValue<'static> {
- pub fn new(format: Format) -> Self {
+ /// Creates a parser for `format` that stores `output_encoding` alongside
+ /// the format and endian settings taken from `PsppSettings::global()`.
+ /// Implied decimals start out disabled.
+ pub fn new(format: Format, output_encoding: &'static Encoding) -> Self {
let settings = PsppSettings::global();
Self {
format,
settings: &settings.formats,
endian: settings.endian,
implied_decimals: false,
+ output_encoding,
}
}
}
}
}
- /// Parses `s` as this format. For string formats, `encoding` specifies the
- /// output encoding.
- pub fn parse(&self, s: &str, _encoding: &'static Encoding) -> Result<Value, ParseError> {
+ /// Parses `s`.
+ ///
+ /// This is only appropriate if `s` was originally encoded in UTF-8.
+ /// Otherwise, binary formats will not yield sensible parse results, because
+ /// recoding bytes from (e.g.) windows-1252 into UTF-8, and then
+ /// interpreting them as a binary number yields nonsense.
+ pub fn parse(&self, s: &str) -> Result<Value, ParseError> {
if s.is_empty() {
return Ok(self.format.default_value());
}
| Type::DTime => self.parse_date(s),
Type::WkDay => self.parse_wkday(s),
Type::Month => self.parse_month(s),
- Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => todo!(),
- Type::A => todo!(),
- Type::AHex => todo!(),
+ Type::P | Type::PK | Type::IB | Type::PIB | Type::RB | Type::AHex => {
+ todo!()
+ }
+ Type::A => Ok(Value::String(self.output_encoding.encode(s).0.into())),
}
.map_err(|details| ParseError {
format: self.format,
})
}
- /// Parses `s`, which is encoded in `encoding`. For string formats,
- /// `encoding` is also the output encoding.
- pub fn parse_encoded(
- &self,
- input: &[u8],
- _encoding: &'static Encoding,
- ) -> Result<Value, ParseError> {
+ /// Parses `input` as this format.
+ ///
+ /// Empty input yields the format's default value. Textual formats parse
+ /// the input decoded to UTF-8, the binary formats (P, PK, IB, PIB, RB)
+ /// parse the raw bytes unchanged, and A recodes the input into the
+ /// parser's output encoding.
+ ///
+ /// # Errors
+ ///
+ /// Returns a [`ParseError`] carrying the format, the input decoded to
+ /// text, and the kind of failure.
+ ///
+ /// # Panics
+ ///
+ /// Panics for AHex, which is not implemented yet.
+ pub fn parse_all<'b, T>(&self, input: T) -> Result<Value, ParseError>
+ where
+ T: Into<EncodedStr<'b>>,
+ {
+ let input: EncodedStr = input.into();
if input.is_empty() {
return Ok(self.format.default_value());
}
match self.format.type_ {
- Type::P => self.parse_p(input),
- Type::PK => self.parse_pk(input),
- Type::IB => self.parse_ib(input),
- Type::PIB => self.parse_pib(input),
- Type::RB => self.parse_rb(input),
- Type::F
- | Type::Comma
- | Type::Dot
- | Type::Dollar
- | Type::Pct
- | Type::E
- | Type::CC(_)
- | Type::N
- | Type::Z
- | Type::PIBHex
- | Type::RBHex
- | Type::Date
+ Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => {
+ self.parse_number(&input.as_str(), self.format.type_)
+ }
+ Type::CC(_) => self.parse_number(&input.as_str(), Type::F),
+ Type::N => self.parse_n(&input.as_str()),
+ Type::Z => self.parse_z(&input.as_str()),
+ Type::PIBHex => self.parse_pibhex(&input.as_str()),
+ Type::RBHex => self.parse_rbhex(&input.as_str()),
+ Type::Date
| Type::ADate
| Type::EDate
| Type::JDate
| Type::YmdHms
| Type::MTime
| Type::Time
- | Type::DTime
- | Type::WkDay
- | Type::Month => todo!(),
- Type::A => todo!(),
+ | Type::DTime => self.parse_date(&input.as_str()),
+ Type::WkDay => self.parse_wkday(&input.as_str()),
+ Type::Month => self.parse_month(&input.as_str()),
+ Type::P => self.parse_p(input.as_bytes()),
+ Type::PK => self.parse_pk(input.as_bytes()),
+ Type::IB => self.parse_ib(input.as_bytes()),
+ Type::PIB => self.parse_pib(input.as_bytes()),
+ Type::RB => self.parse_rb(input.as_bytes()),
+ Type::A => Ok(Value::String(
+ input.to_encoding(self.output_encoding).into(),
+ )),
Type::AHex => todo!(),
}
- .map_err(|kind| ParseError {
+ .map_err(|details| ParseError {
format: self.format,
- input: String::new(),
- kind,
+ // Record the offending input as text; a `todo!()` here would panic
+ // on every parse failure instead of returning the error.
+ input: input.as_str().into_owned(),
+ kind: details,
})
}
dictionary::Value,
endian::Endian,
format::{
- parse::{ParseError, ParseErrorKind, Sign},
+ parse::{EncodedStr, ParseError, ParseErrorKind, Sign},
Epoch, Format, Settings as FormatSettings, Type,
},
settings::EndianSettings,
.zip(expected_stream.lines().map(|result| result.unwrap()))
.zip(1..)
{
- let result = format.parser().parse(&input, UTF_8);
+ let result = format.parser(UTF_8).parse(&input);
let error = result.clone().err();
let value = result
.unwrap_or(format.default_value())
let settings = FormatSettings::default().with_epoch(Epoch(1930));
let parsed = Format::new(self.type_, 40, 0)
.unwrap()
- .parser()
+ .parser(UTF_8)
.with_settings(&settings)
- .parse(&formatted, UTF_8)
+ .parse(&formatted)
.unwrap();
assert_eq!(parsed, Value::Number(Some(expected as f64)));
}
let parsed = Format::new(self.type_, 40, 0)
.unwrap()
- .parser()
- .parse(&formatted, UTF_8)
+ .parser(UTF_8)
+ .parse(&formatted)
.unwrap()
.as_number()
.unwrap()
loop {
let parsed = Format::new(Type::WkDay, 40, 0)
.unwrap()
- .parser()
- .parse(input, UTF_8)
+ .parser(UTF_8)
+ .parse(input)
.unwrap_or(Value::Number(None))
.as_number()
.unwrap();
let input = &input[..length];
let parsed = Format::new(Type::Month, 40, 0)
.unwrap()
- .parser()
- .parse(input, UTF_8)
+ .parser(UTF_8)
+ .parse(input)
.unwrap_or(Value::Number(None))
.as_number()
.unwrap();
.chain((0xa..=0xf).zip('A'..='F'))
.chain(std::iter::once((0, 'x')))
}
- let parser = Format::new(Type::PIBHex, 2, 0).unwrap().parser();
+ let parser = Format::new(Type::PIBHex, 2, 0).unwrap().parser(UTF_8);
for (a, ac) in hex_digits() {
for (b, bc) in hex_digits() {
let s = [ac, bc].into_iter().collect::<String>();
let parsed = parser
- .parse(&s, UTF_8)
+ .parse(&s)
.unwrap_or(Value::Number(None))
.as_number()
.unwrap();
assert_eq!(parsed, expected);
}
}
- assert_eq!(parser.parse(".", UTF_8).unwrap(), Value::Number(None));
- assert_eq!(parser.parse("", UTF_8).unwrap(), Value::Number(None));
+ assert_eq!(parser.parse(".").unwrap(), Value::Number(None));
+ assert_eq!(parser.parse("",).unwrap(), Value::Number(None));
}
#[test]
let formatted = format!("{:016x}", number.to_bits());
let parsed = Format::new(Type::RBHex, 16, 0)
.unwrap()
- .parser()
- .parse(&formatted, UTF_8)
+ .parser(UTF_8)
+ .parse(&formatted)
.unwrap()
.as_number()
.unwrap()
let raw = number.to_be_bytes();
let parsed = Format::new(Type::RB, 8, 0)
.unwrap()
- .parser()
+ .parser(UTF_8)
.with_endian(EndianSettings::new(Endian::Big))
- .parse_encoded(&raw[..], UTF_8)
+ .parse_all(EncodedStr::new(&raw[..], UTF_8))
.unwrap()
.as_number()
.unwrap()
#[test]
fn n() {
- let parser = Format::new(Type::N, 2, 0).unwrap().parser();
+ let parser = Format::new(Type::N, 2, 0).unwrap().parser(UTF_8);
for number in 0..=99 {
let formatted = format!("{:02}", number);
let parsed = parser
- .parse(&formatted, UTF_8)
+ .parse(&formatted)
.unwrap()
.as_number()
.unwrap()
assert_eq!(parsed, number as f64, "formatted as {formatted:?}");
}
assert!(matches!(
- parser.parse(" 0", UTF_8),
+ parser.parse(" 0"),
Err(ParseError {
kind: ParseErrorKind::Nondigit(' '),
..
})
));
assert!(matches!(
- parser.parse(".", UTF_8),
+ parser.parse("."),
Err(ParseError {
kind: ParseErrorKind::Nondigit('.'),
..
#[test]
fn z() {
- let parser = Format::new(Type::Z, 2, 0).unwrap().parser();
+ let parser = Format::new(Type::Z, 2, 0).unwrap().parser(UTF_8);
for number in -99i32..=99 {
for mut formatted in [
format!("{:02}", number.abs()),
formatted.push(b"}JKLMNOPQR"[digit] as char);
}
let parsed = parser
- .parse(&formatted, UTF_8)
+ .parse(&formatted)
.unwrap()
.as_number()
.unwrap()
assert_eq!(parsed, number as f64, "formatted as {formatted:?}");
}
}
- assert_eq!(parser.parse(".", UTF_8).unwrap(), Value::Number(None));
+ assert_eq!(parser.parse(".").unwrap(), Value::Number(None));
let parser = Format::new(Type::Z, 4, 1)
.unwrap()
- .parser()
+ .parser(UTF_8)
.with_implied_decimals();
for number in -999i32..=999 {
let tenths = number as f64 / 10.0;
formatted.push(b"}JKLMNOPQR"[digit] as char);
}
let parsed = parser
- .parse(&formatted, UTF_8)
+ .parse(&formatted)
.unwrap()
.as_number()
.unwrap()