}
match self.type_ {
Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => {
- self.parse_number(&input.as_str(), self.type_)
+ self.parse_number(&input.to_str(), self.type_)
}
- Type::CC(_) => self.parse_number(&input.as_str(), Type::F),
- Type::N => self.parse_n(&input.as_str()),
- Type::Z => self.parse_z(&input.as_str()),
- Type::PIBHex => self.parse_pibhex(&input.as_str()),
- Type::RBHex => self.parse_rbhex(&input.as_str()),
+ Type::CC(_) => self.parse_number(&input.to_str(), Type::F),
+ Type::N => self.parse_n(&input.to_str()),
+ Type::Z => self.parse_z(&input.to_str()),
+ Type::PIBHex => self.parse_pibhex(&input.to_str()),
+ Type::RBHex => self.parse_rbhex(&input.to_str()),
Type::Date
| Type::ADate
| Type::EDate
| Type::YmdHms
| Type::MTime
| Type::Time
- | Type::DTime => self.parse_date(&input.as_str()),
- Type::WkDay => self.parse_wkday(&input.as_str()),
- Type::Month => self.parse_month(&input.as_str()),
+ | Type::DTime => self.parse_date(&input.to_str()),
+ Type::WkDay => self.parse_wkday(&input.to_str()),
+ Type::Month => self.parse_month(&input.to_str()),
Type::P => self.parse_p(input.as_bytes()),
Type::PK => self.parse_pk(input.as_bytes()),
Type::IB => self.parse_ib(input.as_bytes()),
Type::A => Ok(Value::String(
input.to_encoding(self.output_encoding).into(),
)),
- Type::AHex => self.parse_ahex(&input.as_str()),
+ Type::AHex => self.parse_ahex(&input.to_str()),
}
.map_err(|kind| ParseError {
type_: self.type_,
.as_string()
.unwrap()
.as_encoded(UTF_8)
- .as_str(),
+ .to_str(),
"abcdefgh"
);
}
}
+#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)]
+pub struct RawStr<'a>(pub &'a [u8]);
+
+impl<'a> RawStr<'a> {
+ pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> {
+ EncodedStr::new(&self.0, encoding)
+ }
+ pub fn with_encoding(&self, encoding: &'static Encoding) -> EncodedString {
+ EncodedString::new(&*self.0, encoding)
+ }
+ pub fn as_slice(&self) -> &[u8] {
+ self.0
+ }
+}
+
+impl<'a> From<&'a [u8]> for RawStr<'a> {
+ fn from(source: &'a [u8]) -> Self {
+ Self(source)
+ }
+}
+
+impl Debug for RawStr<'_> {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ write!(f, "{:?}", default_decode(self.as_slice()))
+ }
+}
+
#[derive(Copy, Clone)]
pub struct RawStrArray<const N: usize>(pub [u8; N]);
}
}
+/// Pairs a text string with its [Encoding].
+///
+/// Strings in different encodings are considered to be different, even if they
+/// have the same contents. This is an important optimization for hashing:
+/// otherwise, strings could only be hashed if they were converted to a common
+/// encoding (probably UTF-8), which would be expensive.
#[derive(Clone)]
-pub enum EncodedString {
- Encoded {
- bytes: Vec<u8>,
- encoding: &'static Encoding,
- },
- Utf8(String),
+pub struct EncodedString {
+ /// Raw contents.
+ bytes: RawString,
+
+ /// Encoding.
+ encoding: &'static Encoding,
+
+ /// True if `bytes` can be treated as UTF-8, that is, if `bytes` contains
+ /// valid UTF-8 and that would correctly represent its contents in
+ /// `encoding`.
+ valid_utf8: bool,
}
impl EncodedString {
/// Creates a new `EncodedString` from `bytes` and `encoding`.
///
- /// It's cheaper to use `EncodedString::from(string)` if the input is in a
- /// `&str` or `String`.
+ /// If the input is in a `&str` or `String`, instead use
+ /// `EncodedString::from(string)` because it avoids checking for correct
+ /// UTF-8.
pub fn new(bytes: impl Into<Vec<u8>>, encoding: &'static Encoding) -> Self {
let bytes: Vec<u8> = bytes.into();
- if encoding == UTF_8 {
- match String::from_utf8(bytes) {
- Ok(string) => Self::Utf8(string),
- Err(error) => Self::Encoded {
- bytes: error.into_bytes(),
- encoding,
- },
- }
- } else {
- Self::Encoded { bytes, encoding }
+ Self {
+ valid_utf8: matches!(
+ encoding.decode_without_bom_handling(&bytes).0,
+ Cow::Borrowed(_),
+ ),
+ bytes: RawString::from(bytes),
+ encoding,
}
}
+ /// Returns a borrowed [EncodedStr] view of this string.
+ ///
+ /// The view refers to the same bytes and encoding and copies the cached
+ /// `valid_utf8` flag, so nothing is decoded or re-validated here.
pub fn borrowed(&self) -> EncodedStr<'_> {
- match self {
- EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding },
- EncodedString::Utf8(s) => EncodedStr::Utf8 { s },
- }
+ EncodedStr {
+ bytes: RawStr(self.as_bytes()),
+ encoding: self.encoding,
+ valid_utf8: self.valid_utf8,
+ }
}
- pub fn as_utf8_bytes(&self) -> Option<&[u8]> {
- match self {
- EncodedString::Encoded { bytes, encoding } if *encoding == UTF_8 => Some(&bytes),
- EncodedString::Utf8(s) => Some(s.as_bytes()),
- _ => None,
- }
+
+ pub fn encoding(&self) -> &'static Encoding {
+ self.encoding
}
- pub fn as_encoded(&self) -> (&[u8], &'static Encoding) {
- match self {
- EncodedString::Encoded { bytes, encoding } => (&bytes, encoding),
- EncodedString::Utf8(s) => (s.as_bytes(), UTF_8),
+
+ /// Returns the raw contents as a byte slice.
+ pub fn as_bytes(&self) -> &[u8] {
+ self.bytes.as_slice()
+ }
+
+ /// Returns the contents as a `&str` if `valid_utf8` is set, otherwise
+ /// `None`.
+ pub fn as_str(&self) -> Option<&str> {
+ // SAFETY: in the code visible here, `valid_utf8` is set to true only
+ // when the value was built from a `&str`/`String`, copied from an
+ // `EncodedStr`, or when `decode_without_bom_handling` returned
+ // `Cow::Borrowed` for these bytes. NOTE(review): this relies on a
+ // borrowed decode result always aliasing the (valid UTF-8) input --
+ // confirm against the encoding_rs documentation.
+ self.valid_utf8
+ .then(|| unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) })
+ }
+
+ /// Returns the contents as UTF-8 text.
+ ///
+ /// Borrows from `self` when [Self::as_str] returns `Some`; otherwise
+ /// decodes the bytes from `self.encoding`. Only the first element of the
+ /// decoder's result (the text) is used.
+ pub fn to_utf8<'a>(&'a self) -> Cow<'a, str> {
+ match self.as_str() {
+ Some(str) => Cow::from(str),
+ None => {
+ self.encoding
+ .decode_without_bom_handling(self.bytes.as_slice())
+ .0
+ }
}
}
}
impl Debug for EncodedString {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- match self {
- Self::Encoded { bytes, encoding } => {
- write!(f, "{:?}({})", self.borrowed().to_utf8(), encoding.name())
- }
- Self::Utf8(string) => write!(f, "{string:?}"),
+ write!(f, "{:?}", self.to_utf8())?;
+ if self.encoding != UTF_8 {
+ write!(f, "({})", self.encoding.name())?;
}
+ Ok(())
}
}
impl From<String> for EncodedString {
fn from(value: String) -> Self {
- Self::Utf8(value)
+ Self {
+ bytes: RawString(value.into_bytes().into_boxed_slice()),
+ encoding: UTF_8,
+ valid_utf8: true,
+ }
}
}
impl From<&'_ str> for EncodedString {
fn from(value: &'_ str) -> Self {
- Self::Utf8(value.into())
+ Self {
+ bytes: value.as_bytes().into(),
+ encoding: UTF_8,
+ valid_utf8: true,
+ }
}
}
impl Default for EncodedString {
+ /// Returns an empty string in UTF-8.
fn default() -> Self {
- Self::Utf8(String::new())
+ // The previous implementation defaulted to an empty UTF-8 string. Keep
+ // that behavior instead of panicking, so existing callers of
+ // `EncodedString::default()` continue to work.
+ Self::from(String::new())
}
}
impl<'a> From<EncodedStr<'a>> for EncodedString {
fn from(value: EncodedStr<'a>) -> Self {
- match value {
- EncodedStr::Encoded { bytes, encoding } => Self::Encoded {
- bytes: bytes.into(),
- encoding,
- },
- EncodedStr::Utf8 { s } => Self::Utf8(s.into()),
+ Self {
+ bytes: value.bytes.0.to_vec().into(),
+ encoding: value.encoding,
+ valid_utf8: value.valid_utf8,
}
}
}
impl PartialEq for EncodedString {
fn eq(&self, other: &Self) -> bool {
- if let Some(self_utf8) = self.as_utf8_bytes() {
- if let Some(other_utf8) = other.as_utf8_bytes() {
- return self_utf8 == other_utf8;
- }
- }
-
- let (self_bytes, self_encoding) = self.as_encoded();
- let (other_bytes, other_encoding) = other.as_encoded();
- if self_encoding == other_encoding {
- self_bytes == other_bytes
- } else {
- self.borrowed().to_utf8() == other.borrowed().to_utf8()
- }
+ self.encoding == other.encoding
+ && self.valid_utf8 == other.valid_utf8
+ && self.bytes == other.bytes
}
}
-pub enum EncodedStr<'a> {
- Encoded {
- bytes: &'a [u8],
- encoding: &'static Encoding,
- },
- Utf8 {
- s: &'a str,
- },
+pub struct EncodedStr<'a> {
+ /// Raw contents.
+ bytes: RawStr<'a>,
+
+ /// Encoding.
+ encoding: &'static Encoding,
+
+ /// True if `bytes` can be treated as UTF-8, that is, if `bytes` contains
+ /// valid UTF-8 and that would correctly represent its contents in
+ /// `encoding`.
+ valid_utf8: bool,
}
impl<'a> EncodedStr<'a> {
+ /// Creates a new `EncodedStr` from `bytes` and `encoding`.
+ ///
+ /// If the input is in a `&str` or `String`, instead use
+ /// `EncodedStr::from(string)` because it avoids checking for correct
+ /// UTF-8.
pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
- Self::Encoded { bytes, encoding }
- }
- pub fn as_str(&self) -> Cow<'_, str> {
- match self {
- EncodedStr::Encoded { bytes, encoding } => {
- encoding.decode_without_bom_handling(bytes).0
- }
- EncodedStr::Utf8 { s } => Cow::from(*s),
+ Self {
+ valid_utf8: matches!(
+ encoding.decode_without_bom_handling(bytes).0,
+ Cow::Borrowed(_),
+ ),
+ bytes: RawStr::from(bytes),
+ encoding,
}
}
- pub fn as_bytes(&self) -> &[u8] {
- match self {
- EncodedStr::Encoded { bytes, .. } => bytes,
- EncodedStr::Utf8 { s } => s.as_bytes(),
- }
+
+ /// Returns the contents as a `&str` if `valid_utf8` is set, otherwise
+ /// `None`.
+ pub fn as_str(&self) -> Option<&str> {
+ // SAFETY: in the code visible here, `valid_utf8` is set to true only
+ // when the value was built from a `&str`, or when
+ // `decode_without_bom_handling` returned `Cow::Borrowed` for these
+ // bytes. NOTE(review): this relies on a borrowed decode result always
+ // aliasing the (valid UTF-8) input -- confirm against the encoding_rs
+ // documentation.
+ self.valid_utf8
+ .then(|| unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) })
}
- pub fn to_utf8(&self) -> Cow<'a, str> {
- match self {
- EncodedStr::Encoded { bytes, encoding } => {
- encoding.decode_without_bom_handling(bytes).0
+
+ /// Returns the contents as UTF-8 text.
+ ///
+ /// Borrows when [Self::as_str] returns `Some`; otherwise decodes the bytes
+ /// from `self.encoding`. Only the first element of the decoder's result
+ /// (the text) is used.
+ pub fn to_str(&'a self) -> Cow<'a, str> {
+ match self.as_str() {
+ Some(str) => Cow::from(str),
+ None => {
+ self.encoding
+ .decode_without_bom_handling(self.bytes.as_slice())
+ .0
}
- EncodedStr::Utf8 { s } => Cow::from(*s),
}
}
+
+ pub fn as_bytes(&self) -> &[u8] {
+ self.bytes.as_slice()
+ }
+
+ /// Returns the contents converted to `encoding`.
+ ///
+ /// The result borrows from `self` only when the stored bytes are already
+ /// the UTF-8 text and `encoding` can represent that text unchanged;
+ /// otherwise a newly allocated buffer is returned.
pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
- match encoding.encode(&self.to_utf8()).0 {
- Cow::Borrowed(_) => Cow::Borrowed(self.as_bytes()),
- Cow::Owned(string) => Cow::Owned(string),
+ match encoding.encode(&self.to_str()).0 {
+ // A borrowed result is the UTF-8 text itself. That equals our own
+ // bytes only when they are already valid UTF-8.
+ Cow::Borrowed(_) if self.valid_utf8 => Cow::Borrowed(self.as_bytes()),
+ // Otherwise the text was decoded into a temporary: copy it out rather
+ // than returning the undecoded source bytes.
+ Cow::Borrowed(utf8) => Cow::Owned(utf8.to_vec()),
+ Cow::Owned(encoded) => Cow::Owned(encoded),
}
}
pub fn is_empty(&self) -> bool {
- match self {
- EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
- EncodedStr::Utf8 { s } => s.is_empty(),
- }
+ self.bytes.0.is_empty()
}
pub fn quoted(&self) -> QuotedEncodedStr {
QuotedEncodedStr(self)
impl<'a> From<&'a str> for EncodedStr<'a> {
fn from(s: &'a str) -> Self {
- Self::Utf8 { s }
+ Self {
+ bytes: RawStr(s.as_bytes()),
+ encoding: UTF_8,
+ valid_utf8: true,
+ }
}
}
impl<'a> From<&'a String> for EncodedStr<'a> {
fn from(s: &'a String) -> Self {
- Self::Utf8 { s: s.as_str() }
+ Self::from(s.as_str())
}
}