str::from_utf8,
};
-use encoding_rs::{mem::decode_latin1, Encoding};
+use encoding_rs::{mem::decode_latin1, Encoding, UTF_8};
use ordered_float::OrderedFloat;
-use crate::{
- dictionary::{VarType, VarWidth},
- sys::raw::EncodedStr,
-};
+use crate::dictionary::{VarType, VarWidth};
/// An owned string in an unspecified character encoding.
///
/// [Dictionary]: crate::dictionary::Dictionary
pub Vec<Datum>,
);
+
+/// An owned string and its [Encoding].
+///
+/// The string is not guaranteed to be valid in the encoding.
+///
+/// The borrowed form of such a string is [EncodedStr]; use
+/// [EncodedString::borrowed] to obtain it.
+#[derive(Clone, Debug)]
+pub enum EncodedString {
+ /// A string in an arbitrary encoding.
+ Encoded {
+ /// The bytes of the string.
+ bytes: Vec<u8>,
+
+ /// The string's encoding.
+ ///
+ /// This can be [UTF_8], in which case `bytes` is still not guaranteed
+ /// to be valid UTF-8.
+ encoding: &'static Encoding,
+ },
+
+ /// A string that is in UTF-8 and known to be valid.
+ ///
+ /// [EncodedString::encoding] reports [UTF_8] for this variant.
+ Utf8 {
+ /// The string.
+ s: String,
+ },
+}
+
+impl EncodedString {
+    /// Reports which [Encoding] the string's bytes are expressed in.
+    ///
+    /// The `Utf8` variant always reports [UTF_8].
+    pub fn encoding(&self) -> &'static Encoding {
+        match self {
+            Self::Utf8 { .. } => UTF_8,
+            Self::Encoded { encoding, .. } => *encoding,
+        }
+    }
+
+    /// Produces an [EncodedStr] view of this string without copying it.
+    pub fn borrowed(&self) -> EncodedStr<'_> {
+        match self {
+            Self::Utf8 { s } => EncodedStr::Utf8 { s: s.as_str() },
+            Self::Encoded { bytes, encoding } => EncodedStr::Encoded {
+                bytes: bytes.as_slice(),
+                encoding: *encoding,
+            },
+        }
+    }
+}
+
+/// Converts a borrowed string into its owned form by copying the data,
+/// keeping the same variant (and, for `Encoded`, the same encoding).
+impl<'a> From<EncodedStr<'a>> for EncodedString {
+    fn from(value: EncodedStr<'a>) -> Self {
+        match value {
+            EncodedStr::Utf8 { s } => EncodedString::Utf8 { s: String::from(s) },
+            EncodedStr::Encoded { bytes, encoding } => {
+                let bytes = bytes.to_vec();
+                EncodedString::Encoded { bytes, encoding }
+            }
+        }
+    }
+}
+
+/// A borrowed string and its [Encoding].
+///
+/// The string is not guaranteed to be valid in the encoding.
+///
+/// The owned form of such a string is [EncodedString].
+pub enum EncodedStr<'a> {
+ /// A string in an arbitrary encoding.
+ Encoded {
+ /// The bytes of the string.
+ bytes: &'a [u8],
+
+ /// The string's encoding.
+ ///
+ /// This can be [UTF_8], in which case `bytes` is still not guaranteed
+ /// to be valid UTF-8.
+ encoding: &'static Encoding,
+ },
+
+ /// A string in UTF-8 that is known to be valid.
+ Utf8 {
+ /// The string.
+ s: &'a str,
+ },
+}
+
+impl<'a> EncodedStr<'a> {
+    /// Construct a new string with an arbitrary encoding.
+    ///
+    /// `bytes` is not checked for validity in `encoding`.
+    pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
+        Self::Encoded { bytes, encoding }
+    }
+
+    /// Returns this string recoded in UTF-8.  Invalid characters will be
+    /// replaced by [REPLACEMENT_CHARACTER].
+    ///
+    /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER
+    pub fn as_str(&self) -> Cow<'_, str> {
+        match self {
+            EncodedStr::Encoded { bytes, encoding } => {
+                encoding.decode_without_bom_handling(bytes).0
+            }
+            EncodedStr::Utf8 { s } => Cow::from(*s),
+        }
+    }
+
+    /// Returns the bytes in the string, in its encoding.
+    pub fn as_bytes(&self) -> &[u8] {
+        match self {
+            EncodedStr::Encoded { bytes, .. } => bytes,
+            EncodedStr::Utf8 { s } => s.as_bytes(),
+        }
+    }
+
+    /// Returns this string recoded in `encoding`.  Invalid characters will be
+    /// replaced by [REPLACEMENT_CHARACTER].
+    ///
+    /// The result borrows from this string when recoding leaves the bytes
+    /// unchanged; otherwise it is a newly allocated buffer.
+    ///
+    /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER
+    pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<'_, [u8]> {
+        match self {
+            // Bind the string's own encoding under a different name so that it
+            // does not shadow the target `encoding` parameter.
+            EncodedStr::Encoded {
+                bytes,
+                encoding: from,
+            } => {
+                let utf8 = from.decode_without_bom_handling(bytes).0;
+                // `encode` hands back a borrow of `utf8` when the encoded
+                // form is byte-for-byte the UTF-8 text; record only whether
+                // a new buffer was produced so that `utf8` can be consumed.
+                let recoded = match encoding.encode(&utf8).0 {
+                    Cow::Borrowed(_) => None,
+                    Cow::Owned(owned) => Some(owned),
+                };
+                match (recoded, utf8) {
+                    (Some(owned), _) => Cow::Owned(owned),
+                    // Neither decoding nor encoding changed anything, so the
+                    // original bytes are already the answer.
+                    (None, Cow::Borrowed(_)) => Cow::Borrowed(*bytes),
+                    // Decoding had to rewrite the input, and the target
+                    // encoding represents that text exactly as UTF-8 does.
+                    (None, Cow::Owned(decoded)) => Cow::Owned(decoded.into_bytes()),
+                }
+            }
+            EncodedStr::Utf8 { s } => encoding.encode(s).0,
+        }
+    }
+
+    /// Returns true if this string is empty.
+    pub fn is_empty(&self) -> bool {
+        match self {
+            EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
+            EncodedStr::Utf8 { s } => s.is_empty(),
+        }
+    }
+
+    /// Returns a helper for displaying this string in double quotes.
+    pub fn quoted(&self) -> QuotedEncodedStr<'_> {
+        QuotedEncodedStr(self)
+    }
+}
+
+/// Wraps a string slice as the known-valid UTF-8 variant, without copying.
+impl<'a> From<&'a str> for EncodedStr<'a> {
+    fn from(text: &'a str) -> Self {
+        EncodedStr::Utf8 { s: text }
+    }
+}
+
+/// Borrows an owned `String` as the known-valid UTF-8 variant, without
+/// copying.
+impl<'a> From<&'a String> for EncodedStr<'a> {
+    fn from(owned: &'a String) -> Self {
+        let s: &'a str = owned;
+        EncodedStr::Utf8 { s }
+    }
+}
+
+/// Helper struct for displaying an [EncodedStr] in double quotes.
+///
+/// Obtained from [EncodedStr::quoted].
+pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>);
+
+/// Writes the wrapped string, recoded to UTF-8, using the `{:?}` formatting
+/// of the resulting text.
+impl Display for QuotedEncodedStr<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self(inner) = self;
+        let text = inner.as_str();
+        write!(f, "{:?}", text)
+    }
+}
use crate::{
calendar::{calendar_gregorian_to_offset, DateError},
- data::Datum,
+ data::{Datum, EncodedStr, EncodedString},
endian::{Endian, Parse},
format::{DateTemplate, Decimals, Settings, TemplateItem, Type},
settings::{EndianSettings, Settings as PsppSettings},
- sys::raw::{EncodedStr, EncodedString},
};
use encoding_rs::Encoding;
use smallstr::SmallString;
use crate::{
calendar::{days_in_month, is_leap_year},
- data::Datum,
+ data::{Datum, EncodedStr},
endian::Endian,
format::{
parse::{ParseError, ParseErrorKind, Sign},
Epoch, Format, Settings as FormatSettings, Type,
},
settings::EndianSettings,
- sys::raw::EncodedStr,
};
fn test(name: &str, type_: Type) {
}
}
-#[derive(Clone, Debug)]
-pub enum EncodedString {
- Encoded {
- bytes: Vec<u8>,
- encoding: &'static Encoding,
- },
- Utf8 {
- s: String,
- },
-}
-
-impl EncodedString {
- pub fn borrowed(&self) -> EncodedStr<'_> {
- match self {
- EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding },
- EncodedString::Utf8 { s } => EncodedStr::Utf8 { s },
- }
- }
-}
-
-impl<'a> From<EncodedStr<'a>> for EncodedString {
- fn from(value: EncodedStr<'a>) -> Self {
- match value {
- EncodedStr::Encoded { bytes, encoding } => Self::Encoded {
- bytes: bytes.into(),
- encoding,
- },
- EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() },
- }
- }
-}
-
-pub enum EncodedStr<'a> {
- Encoded {
- bytes: &'a [u8],
- encoding: &'static Encoding,
- },
- Utf8 {
- s: &'a str,
- },
-}
-
-impl<'a> EncodedStr<'a> {
- pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
- Self::Encoded { bytes, encoding }
- }
- pub fn as_str(&self) -> Cow<'_, str> {
- match self {
- EncodedStr::Encoded { bytes, encoding } => {
- encoding.decode_without_bom_handling(bytes).0
- }
- EncodedStr::Utf8 { s } => Cow::from(*s),
- }
- }
- pub fn as_bytes(&self) -> &[u8] {
- match self {
- EncodedStr::Encoded { bytes, .. } => bytes,
- EncodedStr::Utf8 { s } => s.as_bytes(),
- }
- }
- pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
- match self {
- EncodedStr::Encoded { bytes, encoding } => {
- let utf8 = encoding.decode_without_bom_handling(bytes).0;
- match encoding.encode(&utf8).0 {
- Cow::Borrowed(_) => {
- // Recoding into UTF-8 and then back did not change anything.
- Cow::from(*bytes)
- }
- Cow::Owned(owned) => Cow::Owned(owned),
- }
- }
- EncodedStr::Utf8 { s } => encoding.encode(s).0,
- }
- }
- pub fn is_empty(&self) -> bool {
- match self {
- EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
- EncodedStr::Utf8 { s } => s.is_empty(),
- }
- }
- pub fn quoted(&self) -> QuotedEncodedStr {
- QuotedEncodedStr(self)
- }
-}
-
-impl<'a> From<&'a str> for EncodedStr<'a> {
- fn from(s: &'a str) -> Self {
- Self::Utf8 { s }
- }
-}
-
-impl<'a> From<&'a String> for EncodedStr<'a> {
- fn from(s: &'a String) -> Self {
- Self::Utf8 { s: s.as_str() }
- }
-}
-
-pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>);
-
-impl Display for QuotedEncodedStr<'_> {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- write!(f, "{:?}", self.0.as_str())
- }
-}
-
fn skip_bytes<R: Read>(r: &mut R, mut n: usize) -> Result<(), IoError> {
thread_local! {
static BUF: RefCell<[u8; 256]> = RefCell::new([0u8; 256]);