From d01ceb198f395042f55a7c346cc241767067b919 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 24 Aug 2025 13:51:09 -0700 Subject: [PATCH] rust: Change ByteStr from wrapping &[u8] to wrapping [u8]. This allows ByteString to implement Borrow<ByteStr>, which makes lookup in collections easier. Signed-off-by: Ben Pfaff --- rust/pspp/src/data.rs | 67 ++++++++++++++++++----------- rust/pspp/src/data/encoded.rs | 4 +- rust/pspp/src/format/display/mod.rs | 2 +- rust/pspp/src/format/parse.rs | 2 +- rust/pspp/src/sys/raw.rs | 6 +-- 5 files changed, 48 insertions(+), 33 deletions(-) diff --git a/rust/pspp/src/data.rs b/rust/pspp/src/data.rs index 780e0e1c98..45208176ae 100644 --- a/rust/pspp/src/data.rs +++ b/rust/pspp/src/data.rs @@ -89,19 +89,19 @@ pub trait RawString: Debug + PartialEq + Eq + PartialOrd + Ord + Hash { self.raw_string_bytes().len() } - fn as_ref(&self) -> ByteStr<'_> { - ByteStr(self.raw_string_bytes()) + fn as_ref(&self) -> &ByteStr { + ByteStr::new(self.raw_string_bytes()) } - fn without_trailing_spaces(&self) -> ByteStr<'_> { + fn without_trailing_spaces(&self) -> &ByteStr { let mut raw = self.raw_string_bytes(); while let Some(trimmed) = raw.strip_suffix(b" ") { raw = trimmed; } - ByteStr(raw) + ByteStr::new(raw) } - fn as_encoded(&self, encoding: &'static Encoding) -> WithEncoding<ByteStr<'_>> + fn as_encoded(&self, encoding: &'static Encoding) -> WithEncoding<&ByteStr> where Self: Sized, { @@ -139,21 +139,30 @@ impl RawString for &'_ String { } } -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct ByteStr<'a>(pub &'a [u8]); +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(transparent)] +pub struct ByteStr(pub [u8]); -impl RawString for ByteStr<'_> { +impl ByteStr { + pub fn new(s: &[u8]) -> &ByteStr { + // SAFETY: ByteStr is just a wrapper of [u8], + // therefore converting &[u8] to &ByteStr is safe. 
+ unsafe { &*(s as *const [u8] as *const ByteStr) } + } +} + +impl<'a> RawString for &'a ByteStr { fn raw_string_bytes(&self) -> &[u8] { - self.0 + &self.0 } } -impl Serialize for ByteStr<'_> { +impl<'a> Serialize for &'a ByteStr { fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: serde::Serializer, { - if let Ok(s) = str::from_utf8(self.0) { + if let Ok(s) = str::from_utf8(&self.0) { let (variant_index, variant) = if self.0.iter().all(|b| b.is_ascii()) { (0, "Ascii") } else { @@ -165,17 +174,17 @@ impl Serialize for ByteStr<'_> { tuple.end() } else { let mut tuple = serializer.serialize_tuple_variant("RawString", 2, "Windows1252", 1)?; - tuple.serialize_field(&decode_latin1(self.0))?; + tuple.serialize_field(&decode_latin1(&self.0))?; tuple.end() } } } -impl Debug for ByteStr<'_> { +impl Debug for ByteStr { // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1 // (actually bytes interpreted as Unicode code points). fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let s = from_utf8(&self.0).map_or_else(|_| decode_latin1(self.0), Cow::from); + let s = from_utf8(&self.0).map_or_else(|_| decode_latin1(&self.0), Cow::from); write!(f, "{s:?}") } } @@ -194,7 +203,7 @@ impl Serialize for ByteCow<'_> { where S: serde::Serializer, { - ByteStr(&self.0).serialize(serializer) + ByteStr::new(&self.0).serialize(serializer) } } @@ -206,7 +215,7 @@ impl RawString for ByteCow<'_> { impl Debug for ByteCow<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - ByteStr(&self.0).fmt(f) + ByteStr::new(&self.0).fmt(f) } } @@ -218,7 +227,7 @@ impl Serialize for ByteStrArray { where S: serde::Serializer, { - ByteStr(&self.0).serialize(serializer) + ByteStr::new(&self.0).serialize(serializer) } } @@ -230,7 +239,7 @@ impl RawString for ByteStrArray { impl Debug for ByteStrArray { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - ByteStr(&self.0).fmt(f) + ByteStr::new(&self.0).fmt(f) } } @@ -244,6 +253,12 @@ impl ByteString { } } +impl Borrow<ByteStr>
for ByteString { + fn borrow(&self) -> &ByteStr { + ByteStr::new(&self.0) + } +} + impl From<String> for ByteString { fn from(value: String) -> Self { value.into_bytes().into() @@ -409,7 +424,7 @@ impl<T> Datum<T> where T: EncodedString, { - pub fn as_borrowed(&self) -> Datum<WithEncoding<ByteStr<'_>>> { + pub fn as_borrowed(&self) -> Datum<WithEncoding<&ByteStr>> { self.as_ref().map_string(|s| s.as_encoded_byte_str()) } pub fn cloned(&self) -> Datum<WithEncoding<ByteString>> { @@ -610,7 +625,7 @@ where } } - pub fn as_encoded(&self, encoding: &'static Encoding) -> Datum<WithEncoding<ByteStr<'_>>> { + pub fn as_encoded(&self, encoding: &'static Encoding) -> Datum<WithEncoding<&ByteStr>> { self.as_ref().map_string(|s| s.as_encoded(encoding)) } @@ -694,15 +709,15 @@ impl From> for Datum { } } -impl<'a> From<&'a str> for Datum<ByteStr<'a>> { +impl<'a> From<&'a str> for Datum<&'a ByteStr> { fn from(value: &'a str) -> Self { - Datum::String(ByteStr(value.as_bytes())) + Datum::String(ByteStr::new(value.as_bytes())) } } -impl<'a> From<&'a [u8]> for Datum<ByteStr<'a>> { +impl<'a> From<&'a [u8]> for Datum<&'a ByteStr> { fn from(value: &'a [u8]) -> Self { - Self::String(ByteStr(value)) + Self::String(ByteStr::new(value)) } } @@ -796,7 +811,7 @@ pub struct CaseIter<'a> { } impl<'a> Iterator for CaseIter<'a> { - type Item = Datum<WithEncoding<ByteStr<'a>>>; + type Item = Datum<WithEncoding<&'a ByteStr>>; fn next(&mut self) -> Option<Self::Item> { self.iter.next().map(|d| d.as_encoded(self.encoding)) @@ -807,7 +822,7 @@ impl<'a, B> IntoIterator for &'a Case<B> where B: Borrow<[Datum]>, { - type Item = Datum<WithEncoding<ByteStr<'a>>>; + type Item = Datum<WithEncoding<&'a ByteStr>>; type IntoIter = CaseIter<'a>; diff --git a/rust/pspp/src/data/encoded.rs b/rust/pspp/src/data/encoded.rs index 304769d20d..2cc1a22eff 100644 --- a/rust/pspp/src/data/encoded.rs +++ b/rust/pspp/src/data/encoded.rs @@ -112,8 +112,8 @@ pub trait EncodedString: Encoded + RawString + Display + Debug { self.as_str().into_owned() } fn to_encoding(&self, encoding: &'static Encoding) -> WithEncoding>; - fn as_encoded_byte_str(&self) -> WithEncoding<ByteStr<'_>> { - WithEncoding::new(ByteStr(self.raw_string_bytes()), self.encoding()) + fn as_encoded_byte_str(&self) -> WithEncoding<&ByteStr> { 
+ WithEncoding::new(ByteStr::new(self.raw_string_bytes()), self.encoding()) } fn cloned(&self) -> WithEncoding<ByteString> { WithEncoding::new(ByteString::from(self.raw_string_bytes()), self.encoding()) diff --git a/rust/pspp/src/format/display/mod.rs b/rust/pspp/src/format/display/mod.rs index 0347314a76..6f65cee10e 100644 --- a/rust/pspp/src/format/display/mod.rs +++ b/rust/pspp/src/format/display/mod.rs @@ -91,7 +91,7 @@ where /// `format`. /// /// [Display]: std::fmt::Display - pub fn display(&'a self, format: Format) -> DisplayDatum<'a, WithEncoding<ByteStr<'a>>> { + pub fn display(&'a self, format: Format) -> DisplayDatum<'a, WithEncoding<&'a ByteStr>> { DisplayDatum::new(format, self.as_borrowed()) } diff --git a/rust/pspp/src/format/parse.rs b/rust/pspp/src/format/parse.rs index 3e5d257a6e..f6a795ed2a 100644 --- a/rust/pspp/src/format/parse.rs +++ b/rust/pspp/src/format/parse.rs @@ -1637,7 +1637,7 @@ mod test { let parsed = Type::RB .parser(UTF_8) .with_endian(EndianSettings::new(Endian::Big)) - .parse(ByteStr(raw.as_slice()).with_encoding(UTF_8)) + .parse(ByteStr::new(raw.as_slice()).with_encoding(UTF_8)) .unwrap() .as_number() .unwrap() diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index e06babe62d..147fc6a9c4 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -1060,7 +1060,7 @@ impl Debug for RawDatum { match self { RawDatum::Number(Some(number)) => write!(f, "{number:?}"), RawDatum::Number(None) => write!(f, "SYSMIS"), - RawDatum::String(s) => write!(f, "{:?}", ByteStr(s)), + RawDatum::String(s) => write!(f, "{:?}", ByteStr::new(s)), } } } @@ -1072,7 +1072,7 @@ impl Serialize for RawDatum { { match self { RawDatum::Number(number) => number.serialize(serializer), - RawDatum::String(s) => ByteStr(s).serialize(serializer), + RawDatum::String(s) => ByteStr::new(s).serialize(serializer), } } } @@ -1755,7 +1755,7 @@ impl Debug for UntypedDatum { } else { big }; - write!(f, "{number}/{:?}", 
ByteStr::new(&self.0)) } } -- 2.30.2