}
match self.type_ {
Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => {
- self.parse_number(&input.to_str(), self.type_)
+ self.parse_number(&input.as_str(), self.type_)
}
- Type::CC(_) => self.parse_number(&input.to_str(), Type::F),
- Type::N => self.parse_n(&input.to_str()),
- Type::Z => self.parse_z(&input.to_str()),
- Type::PIBHex => self.parse_pibhex(&input.to_str()),
- Type::RBHex => self.parse_rbhex(&input.to_str()),
+ Type::CC(_) => self.parse_number(&input.as_str(), Type::F),
+ Type::N => self.parse_n(&input.as_str()),
+ Type::Z => self.parse_z(&input.as_str()),
+ Type::PIBHex => self.parse_pibhex(&input.as_str()),
+ Type::RBHex => self.parse_rbhex(&input.as_str()),
Type::Date
| Type::ADate
| Type::EDate
| Type::YmdHms
| Type::MTime
| Type::Time
- | Type::DTime => self.parse_date(&input.to_str()),
- Type::WkDay => self.parse_wkday(&input.to_str()),
- Type::Month => self.parse_month(&input.to_str()),
+ | Type::DTime => self.parse_date(&input.as_str()),
+ Type::WkDay => self.parse_wkday(&input.as_str()),
+ Type::Month => self.parse_month(&input.as_str()),
Type::P => self.parse_p(input.as_bytes()),
Type::PK => self.parse_pk(input.as_bytes()),
Type::IB => self.parse_ib(input.as_bytes()),
Type::A => Ok(Value::String(
input.to_encoding(self.output_encoding).into(),
)),
- Type::AHex => self.parse_ahex(&input.to_str()),
+ Type::AHex => self.parse_ahex(&input.as_str()),
}
.map_err(|kind| ParseError {
type_: self.type_,
.as_string()
.unwrap()
.as_encoded(UTF_8)
- .to_str(),
+ .as_str(),
"abcdefgh"
);
}
}
-#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)]
-pub struct RawStr<'a>(pub &'a [u8]);
-
-impl<'a> RawStr<'a> {
- pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> {
- EncodedStr::new(&self.0, encoding)
- }
- pub fn with_encoding(&self, encoding: &'static Encoding) -> EncodedString {
- EncodedString::new(&*self.0, encoding)
- }
- pub fn as_slice(&self) -> &[u8] {
- self.0
- }
-}
-
-impl<'a> From<&'a [u8]> for RawStr<'a> {
- fn from(source: &'a [u8]) -> Self {
- Self(source)
- }
-}
-
-impl Debug for RawStr<'_> {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(self.as_slice()))
- }
-}
-
#[derive(Copy, Clone)]
pub struct RawStrArray<const N: usize>(pub [u8; N]);
}
}
-/// Pairs a text string with its [Encoding].
-///
-/// Strings in different encodings are considered to be different, even if they
-/// have the same contents. This is an important optimization for hashing:
-/// otherwise, strings could only be hashed if they were converted to a common
-/// encoding (probably UTF-8), which would be expensive.
+/// An owned text string together with the encoding of its bytes.
+///
+/// `EncodedString::new` produces `Utf8` for bytes labeled UTF-8 that validate
+/// as UTF-8, and `Encoded` for everything else (including bytes labeled UTF-8
+/// that fail validation).
#[derive(Clone)]
-pub struct EncodedString {
- /// Raw contents.
- bytes: RawString,
-
- /// Encoding.
- encoding: &'static Encoding,
-
- /// True if `bytes` can be treated as UTF-8, that is, if `bytes` contains
- /// valid UTF-8 and that would correctly represent its contents in
- /// `encoding`.
- valid_utf8: bool,
+pub enum EncodedString {
+ /// Raw bytes plus the encoding they are labeled with.
+ Encoded {
+ /// Raw contents; not validated against `encoding`.
+ bytes: Vec<u8>,
+ /// Encoding that `bytes` is labeled with.
+ encoding: &'static Encoding,
+ },
+ /// Text held as a Rust `String`, hence valid UTF-8.
+ Utf8(String),
}
impl EncodedString {
/// Creates a new `EncodedString` from `bytes` and `encoding`.
///
- /// If the input is in a `&str` or `String`, instead use
- /// `EncodedString::from(string)` because it avoids checking for correct
- /// UTF-8.
+ /// It's cheaper to use `EncodedString::from(string)` if the input is in a
+ /// `&str` or `String`.
+ ///
+ /// Bytes labeled UTF-8 are validated: valid input becomes the `Utf8`
+ /// variant, invalid input is kept verbatim in the `Encoded` variant. Bytes
+ /// in any other encoding are stored in `Encoded` without inspection.
pub fn new(bytes: impl Into<Vec<u8>>, encoding: &'static Encoding) -> Self {
let bytes: Vec<u8> = bytes.into();
- Self {
- valid_utf8: matches!(
- encoding.decode_without_bom_handling(&bytes).0,
- Cow::Borrowed(_),
- ),
- bytes: RawString::from(bytes),
- encoding,
+ if encoding == UTF_8 {
+ match String::from_utf8(bytes) {
+ Ok(string) => Self::Utf8(string),
+ // `into_bytes` hands back the original buffer without copying.
+ Err(error) => Self::Encoded {
+ bytes: error.into_bytes(),
+ encoding,
+ },
+ }
+ } else {
+ Self::Encoded { bytes, encoding }
}
}
+ /// Returns a borrowed [`EncodedStr`] view of this string with the same
+ /// variant.
pub fn borrowed(&self) -> EncodedStr<'_> {
- todo!()
- }
-
- pub fn encoding(&self) -> &'static Encoding {
- self.encoding
- }
-
- pub fn as_bytes(&self) -> &[u8] {
- self.bytes.0.iter().as_slice()
+ match self {
+ EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding },
+ EncodedString::Utf8(s) => EncodedStr::Utf8 { s },
+ }
}
-
- pub fn as_str(&self) -> Option<&str> {
- self.valid_utf8
- .then(|| unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) })
+ /// Returns the raw bytes if this string is labeled UTF-8, in either
+ /// variant, and `None` for any other encoding.
+ ///
+ /// For the `Encoded` variant the bytes are only *labeled* UTF-8; `new`
+ /// stores input there when it fails UTF-8 validation.
+ pub fn as_utf8_bytes(&self) -> Option<&[u8]> {
+ match self {
+ EncodedString::Encoded { bytes, encoding } if *encoding == UTF_8 => {
+ Some(bytes.as_slice())
+ }
+ EncodedString::Utf8(s) => Some(s.as_bytes()),
+ EncodedString::Encoded { .. } => None,
+ }
}
-
- pub fn to_utf8<'a>(&'a self) -> Cow<'a, str> {
- match self.as_str() {
- Some(str) => Cow::from(str),
- None => {
- self.encoding
- .decode_without_bom_handling(self.bytes.as_slice())
- .0
- }
+ /// Returns the raw bytes together with the encoding they are labeled
+ /// with; the `Utf8` variant reports `UTF_8`.
+ pub fn as_encoded(&self) -> (&[u8], &'static Encoding) {
+ match self {
+ EncodedString::Encoded { bytes, encoding } => (bytes.as_slice(), *encoding),
+ EncodedString::Utf8(s) => (s.as_bytes(), UTF_8),
}
}
}
impl Debug for EncodedString {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{:?}", self.to_utf8())?;
- if self.encoding != UTF_8 {
- write!(f, "({})", self.encoding.name())?;
+ match self {
+ // Print the decoded text followed by the encoding name. Only
+ // `encoding` is needed here; the bytes are reached via `borrowed()`.
+ Self::Encoded { encoding, .. } => {
+ write!(f, "{:?}({})", self.borrowed().to_utf8(), encoding.name())
+ }
+ // Plain UTF-8 strings print like an ordinary `String`.
+ Self::Utf8(string) => write!(f, "{string:?}"),
}
- Ok(())
}
}
impl From<String> for EncodedString {
+ /// Wraps `value` in the `Utf8` variant, taking ownership of the `String`.
fn from(value: String) -> Self {
- Self {
- bytes: RawString(value.into_bytes().into_boxed_slice()),
- encoding: UTF_8,
- valid_utf8: true,
- }
+ Self::Utf8(value)
}
}
impl From<&'_ str> for EncodedString {
+ /// Copies `value` into a new `String` held in the `Utf8` variant.
fn from(value: &'_ str) -> Self {
- Self {
- bytes: value.as_bytes().into(),
- encoding: UTF_8,
- valid_utf8: true,
- }
+ Self::Utf8(value.into())
}
}
impl Default for EncodedString {
+ /// Returns an empty string in the `Utf8` variant.
fn default() -> Self {
- // XXX what encoding?
- todo!()
+ Self::Utf8(String::new())
}
}
impl<'a> From<EncodedStr<'a>> for EncodedString {
+ /// Copies the borrowed contents into an owned string of the same variant.
fn from(value: EncodedStr<'a>) -> Self {
- Self {
- bytes: value.bytes.0.to_vec().into(),
- encoding: value.encoding,
- valid_utf8: value.valid_utf8,
+ match value {
+ EncodedStr::Encoded { bytes, encoding } => Self::Encoded {
+ bytes: bytes.into(),
+ encoding,
+ },
+ EncodedStr::Utf8 { s } => Self::Utf8(s.into()),
}
}
}
impl PartialEq for EncodedString {
+ /// Compares two strings, decoding only when their encodings differ.
+ ///
+ /// NOTE(review): equality now crosses encodings, so any `Hash` impl for
+ /// this type must agree with it -- confirm.
fn eq(&self, other: &Self) -> bool {
- self.encoding == other.encoding
- && self.valid_utf8 == other.valid_utf8
- && self.bytes == other.bytes
+ // Both sides labeled UTF-8 (either variant): compare raw bytes.
+ if let Some(self_utf8) = self.as_utf8_bytes() {
+ if let Some(other_utf8) = other.as_utf8_bytes() {
+ return self_utf8 == other_utf8;
+ }
+ }
+
+ let (self_bytes, self_encoding) = self.as_encoded();
+ let (other_bytes, other_encoding) = other.as_encoded();
+ if self_encoding == other_encoding {
+ // Same encoding: raw bytes decide.
+ self_bytes == other_bytes
+ } else {
+ // Different encodings: compare the decoded text.
+ self.borrowed().to_utf8() == other.borrowed().to_utf8()
+ }
}
}
-pub struct EncodedStr<'a> {
- /// Raw contents.
- bytes: RawStr<'a>,
-
- /// Encoding.
- encoding: &'static Encoding,
-
- /// True if `bytes` can be treated as UTF-8, that is, if `bytes` contains
- /// valid UTF-8 and that would correctly represent its contents in
- /// `encoding`.
- valid_utf8: bool,
+/// A borrowed text string together with the encoding of its bytes.
+pub enum EncodedStr<'a> {
+ /// Borrowed raw bytes plus the encoding they are labeled with.
+ Encoded {
+ /// Raw contents; not validated against `encoding`.
+ bytes: &'a [u8],
+ /// Encoding that `bytes` is labeled with.
+ encoding: &'static Encoding,
+ },
+ /// Borrowed text that is already a `&str`, hence valid UTF-8.
+ Utf8 {
+ /// The borrowed text.
+ s: &'a str,
+ },
}
impl<'a> EncodedStr<'a> {
- /// Creates a new `EncodedStr` from `bytes` and `encoding`.
- ///
- /// If the input is in a `&str` or `String`, instead use
- /// `EncodedString::from(string)` because it avoids checking for correct
- /// UTF-8.
+ /// Wraps `bytes` and `encoding` in the `Encoded` variant without
+ /// inspecting or validating the bytes.
pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
- Self {
- valid_utf8: matches!(
- encoding.decode_without_bom_handling(bytes).0,
- Cow::Borrowed(_),
- ),
- bytes: RawStr::from(bytes),
- encoding,
- }
- }
-
- pub fn as_str(&self) -> Option<&str> {
- self.valid_utf8
- .then(|| unsafe { std::str::from_utf8_unchecked(self.bytes.as_slice()) })
+ Self::Encoded { bytes, encoding }
}
-
- pub fn to_str(&'a self) -> Cow<'a, str> {
- match self.as_str() {
- Some(str) => Cow::from(str),
- None => {
- self.encoding
- .decode_without_bom_handling(self.bytes.as_slice())
- .0
- }
- }
+ /// Returns the text as UTF-8: `Encoded` contents are decoded from their
+ /// encoding without BOM handling, `Utf8` contents are borrowed as-is.
+ pub fn as_str(&self) -> Cow<'_, str> {
+ // Same decoding as `to_utf8`; only the returned lifetime is narrower.
+ self.to_utf8()
}
-
+ /// Returns the underlying bytes as stored: the raw encoded bytes for
+ /// `Encoded`, the UTF-8 bytes of the string for `Utf8`.
pub fn as_bytes(&self) -> &[u8] {
- self.bytes.as_slice()
+ match self {
+ EncodedStr::Encoded { bytes, .. } => bytes,
+ EncodedStr::Utf8 { s } => s.as_bytes(),
+ }
+ }
+ /// Returns the text as UTF-8, tied to the lifetime of the borrowed data
+ /// rather than of `self`.
+ ///
+ /// `Encoded` contents are decoded from their encoding without BOM
+ /// handling; `Utf8` contents are returned borrowed.
+ pub fn to_utf8(&self) -> Cow<'a, str> {
+ match self {
+ EncodedStr::Encoded { bytes, encoding } => {
+ encoding.decode_without_bom_handling(bytes).0
+ }
+ EncodedStr::Utf8 { s } => Cow::from(*s),
+ }
}
-
+ /// Returns the text re-encoded into `encoding`.
+ ///
+ /// The text is first decoded to UTF-8 and then encoded into the target
+ /// encoding. The result borrows only when neither step had to transcode.
pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
- match encoding.encode(&self.to_str()).0 {
- Cow::Borrowed(_) => Cow::Borrowed(self.as_bytes()),
- Cow::Owned(string) => Cow::Owned(string),
+ let utf8 = self.to_utf8();
+ if let Cow::Owned(encoded) = encoding.encode(&utf8).0 {
+ return Cow::Owned(encoded);
+ }
+ // The encoder produced output identical to the UTF-8 text, so the
+ // answer is the bytes of `utf8` -- not `self.as_bytes()`, which would
+ // still be in the source encoding when decoding had to transcode.
+ match utf8 {
+ Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
+ Cow::Owned(s) => Cow::Owned(s.into_bytes()),
}
}
+ /// Returns true if the string holds no bytes.
pub fn is_empty(&self) -> bool {
- self.bytes.0.is_empty()
+ match self {
+ EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
+ EncodedStr::Utf8 { s } => s.is_empty(),
+ }
}
pub fn quoted(&self) -> QuotedEncodedStr {
QuotedEncodedStr(self)
impl<'a> From<&'a str> for EncodedStr<'a> {
+ /// Borrows `s` as the `Utf8` variant; no copy is made.
fn from(s: &'a str) -> Self {
- Self {
- bytes: RawStr(s.as_bytes()),
- encoding: UTF_8,
- valid_utf8: true,
- }
+ Self::Utf8 { s }
}
}
impl<'a> From<&'a String> for EncodedStr<'a> {
+ /// Borrows the contents of `s` as the `Utf8` variant; no copy is made.
fn from(s: &'a String) -> Self {
- Self::from(s.as_str())
+ Self::Utf8 { s: s.as_str() }
}
}