}
}
+/// Builds a [`ByteString`] from a borrowed-or-owned string.
+///
+/// Borrowed text is copied; owned text is reused without copying.
+impl From<Cow<'_, str>> for ByteString {
+    fn from(value: Cow<'_, str>) -> Self {
+        let owned: String = value.into_owned();
+        Self::from(owned)
+    }
+}
+
+/// Builds a [`ByteString`] from a borrowed-or-owned byte slice.
+///
+/// Borrowed bytes are copied; an owned vector is reused without copying.
+impl From<Cow<'_, [u8]>> for ByteString {
+    fn from(value: Cow<'_, [u8]>) -> Self {
+        let owned: Vec<u8> = value.into_owned();
+        Self::from(owned)
+    }
+}
+
impl From<Vec<u8>> for ByteString {
fn from(value: Vec<u8>) -> Self {
Self(value)
self.0.truncate(new_len);
}
Ordering::Equal => (),
- Ordering::Greater => self.0.extend((self.0.len()..new_len).map(|_| b' ')),
+ Ordering::Greater => self.0.resize(new_len, b' '),
}
Ok(())
}
let s: String = s.into();
Datum::String(ByteString::from(s).with_encoding(UTF_8))
}
+
+    /// Converts a string datum from its code page to UTF-8, in place.
+    ///
+    /// A numeric datum carries no text, so it is left untouched.
+    pub fn codepage_to_unicode(&mut self) {
+        if let Datum::String(string) = self {
+            string.codepage_to_unicode();
+        }
+    }
+
+    /// Drops the encoding attached to a string datum, keeping only the
+    /// underlying [`ByteString`].  Numbers pass through unchanged.
+    pub fn without_encoding(self) -> Datum<ByteString> {
+        match self {
+            Datum::String(encoded) => Datum::String(encoded.inner),
+            Datum::Number(value) => Datum::Number(value),
+        }
+    }
}
impl<'a> Datum<WithEncoding<ByteCow<'a>>> {
use encoding_rs::{Encoding, UTF_8};
use serde::Serialize;
-use crate::data::{ByteCow, ByteStr, ByteString, MutRawString, Quoted, RawString, ResizeError};
+use crate::{
+ data::{ByteCow, ByteStr, ByteString, MutRawString, Quoted, RawString, ResizeError},
+ dictionary::VarWidth,
+};
pub trait Encoded {
fn encoding(&self) -> &'static Encoding;
}
}
+impl WithEncoding<ByteString> {
+    /// Re-encodes this string from its current encoding to UTF-8, in place.
+    ///
+    /// The result is forced to three times the original byte length, capped
+    /// at [`VarWidth::MAX_STRING`]: shorter text is padded with spaces and
+    /// longer text is cut off.  A string that is already UTF-8 is left
+    /// untouched.
+    pub fn codepage_to_unicode(&mut self) {
+        if self.encoding() == UTF_8 {
+            return;
+        }
+
+        // Computed before `self.inner` is replaced, so it is based on the
+        // length in the original encoding.
+        let target_len = (self.inner.len() * 3).min(VarWidth::MAX_STRING as usize);
+
+        // Only an owned decode result is stored; a borrowed one leaves the
+        // existing bytes in place.
+        let decoded = match self
+            .encoding()
+            .decode_without_bom_handling(self.raw_string_bytes())
+            .0
+        {
+            Cow::Owned(text) => Some(text),
+            Cow::Borrowed(_) => None,
+        };
+        if let Some(text) = decoded {
+            self.inner = ByteString::from(text);
+        }
+
+        // Resize the underlying vector directly (not `self.inner.resize()`):
+        // this is a forced resize that is allowed to trim off non-spaces.
+        self.inner.0.resize(target_len, b' ');
+
+        self.encoding = UTF_8;
+    }
+}
+
impl<T> Encoded for WithEncoding<T> {
fn encoding(&self) -> &'static Encoding {
self.encoding
self.inner.hash(state);
}
}
+
+#[cfg(test)]
+mod tests {
+    use std::char::REPLACEMENT_CHARACTER;
+
+    use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};
+
+    use crate::data::{ByteString, EncodedString, RawString};
+
+    /// Checks `WithEncoding::<ByteString>::codepage_to_unicode`.
+    ///
+    /// Converting from a code page forces the length to three times the
+    /// original byte length (capped at 32767), so the expected strings spell
+    /// out the space padding explicitly.
+    #[test]
+    fn codepage_to_unicode() {
+        fn check_unicode(original: &str, encoding: &'static Encoding, expected: &str) {
+            let mut actual =
+                ByteString::from(encoding.encode(original).0).with_encoding(encoding);
+            actual.codepage_to_unicode();
+            assert_eq!(actual.as_str().len(), expected.len());
+            assert_eq!(actual.as_str(), expected);
+        }
+
+        // Already UTF-8: no conversion and no padding.
+        check_unicode("abc", UTF_8, "abc");
+
+        // 3 bytes in windows-1252 become 9 bytes: "abc" plus 6 spaces.
+        check_unicode("abc", WINDOWS_1252, &format!("abc{}", " ".repeat(6)));
+
+        // 4 bytes in windows-1252 become 12 bytes: 8 bytes of text plus 4
+        // spaces.
+        check_unicode("éèäî", WINDOWS_1252, &format!("éèäî{}", " ".repeat(4)));
+
+        // 15000 bytes would triple to 45000, but the cap is 32767: 30000
+        // bytes of text plus 2767 spaces.
+        check_unicode(
+            &"é".repeat(15000),
+            WINDOWS_1252,
+            &format!("{}{}", "é".repeat(15000), " ".repeat(2767)),
+        );
+
+        // 40000 bytes of UTF-8 are cut at 32767 bytes, which falls in the
+        // middle of a 2-byte character, so the tail reads back as U+FFFD.
+        check_unicode(
+            &"é".repeat(20000),
+            WINDOWS_1252,
+            &format!("{}{}", "é".repeat(16383), REPLACEMENT_CHARACTER),
+        );
+    }
+}
str::FromStr,
};
-use encoding_rs::Encoding;
+use encoding_rs::{Encoding, UTF_8};
use enum_map::{Enum, EnumMap};
use indexmap::IndexSet;
use num::integer::div_ceil;
use unicase::UniCase;
use crate::{
- data::{ByteString, Datum, EncodedString, ResizeError, WithEncoding},
+ data::{ByteString, Datum, Encoded, EncodedString, ResizeError, WithEncoding},
format::{DisplayPlain, Format},
identifier::{ByIdentifier, HasIdentifier, Identifier},
output::pivot::{
pub fn display_adjective(&self) -> VarWidthAdjective {
VarWidthAdjective(*self)
}
+
+    /// Widens a string width for conversion from a code page to UTF-8.
+    ///
+    /// String widths are tripled, saturating and then capped at
+    /// [`Self::MAX_STRING`].  Numeric widths are unaffected.
+    pub fn codepage_to_unicode(&mut self) {
+        if let VarWidth::String(width) = self {
+            let tripled = width.saturating_mul(3);
+            *width = tripled.min(Self::MAX_STRING);
+        }
+    }
}
pub struct Segments {
.map(|names| names.into_iter().flatten().collect())
.collect()
}
+
+    /// Converts the dictionary from its code-page encoding to UTF-8.
+    ///
+    /// Every variable, vector, attribute, and multiple response set is
+    /// converted.  Because conversion can change names, an entry whose
+    /// converted name is already taken is renamed to a generated `Var<n>`,
+    /// `Vec<n>`, or `Mr<n>` name until it is unique.  Does nothing if the
+    /// dictionary is already UTF-8.
+    pub fn codepage_to_unicode(&mut self) {
+        if self.encoding == UTF_8 {
+            return;
+        }
+
+        // Variables: rebuilt in their original order.
+        let mut converted_variables = IndexSet::new();
+        let mut var_suffix = 0;
+        for mut variable in self.variables.drain(..) {
+            variable.codepage_to_unicode();
+            while converted_variables.contains(&variable) {
+                var_suffix += 1;
+                variable.name = Identifier::new(format!("Var{var_suffix}")).unwrap();
+            }
+            converted_variables.insert(variable);
+        }
+        self.variables = converted_variables;
+
+        // Vectors.
+        let mut converted_vectors = HashSet::new();
+        let mut vec_suffix = 0;
+        for mut vector in self.vectors.drain() {
+            vector.codepage_to_unicode();
+            while converted_vectors.contains(&vector) {
+                vec_suffix += 1;
+                vector.name = Identifier::new(format!("Vec{vec_suffix}")).unwrap();
+            }
+            converted_vectors.insert(vector);
+        }
+        self.vectors = converted_vectors;
+
+        self.attributes.codepage_to_unicode();
+
+        // Multiple response sets: visited in ascending order.
+        let mut converted_mrsets = BTreeSet::new();
+        let mut mr_suffix = 0;
+        for mut mrset in std::mem::take(&mut self.mrsets) {
+            mrset.codepage_to_unicode();
+            while converted_mrsets.contains(&mrset) {
+                mr_suffix += 1;
+                mrset.name = Identifier::new(format!("Mr{mr_suffix}")).unwrap();
+            }
+            converted_mrsets.insert(mrset);
+        }
+        self.mrsets = converted_mrsets;
+
+        self.encoding = UTF_8;
+    }
}
pub struct OutputVariables<'a> {
pub fn has_any(&self, include_at: bool) -> bool {
self.iter(include_at).next().is_some()
}
+
+    /// Converts every attribute name for the move from a code page to UTF-8.
+    ///
+    /// The map is rebuilt because converting a name can change its key.  If
+    /// two names become equal after conversion, the later value replaces the
+    /// earlier one.
+    pub fn codepage_to_unicode(&mut self) {
+        let mut converted = BTreeMap::new();
+        // Visit entries in ascending key order and insert one at a time.
+        for (mut name, value) in std::mem::take(&mut self.0) {
+            name.codepage_to_unicode();
+            converted.insert(name, value);
+        }
+        self.0 = converted;
+    }
}
impl Debug for Attributes {
width: self.width,
}
}
+
+    /// Converts this variable from its code-page encoding to UTF-8.
+    ///
+    /// Converts the name, width, missing values, value labels, print and
+    /// write formats, and attributes, then marks the variable as UTF-8 and
+    /// discards its short names.
+    pub fn codepage_to_unicode(&mut self) {
+        self.name.codepage_to_unicode();
+        self.width.codepage_to_unicode();
+        self.missing_values.codepage_to_unicode();
+        // Value label keys are raw bytes in the variable's encoding, so they
+        // must be converted too, using the old encoding before it is
+        // overwritten below.  Otherwise they would keep the old encoding and
+        // width while the variable claims UTF-8.
+        // NOTE(review): assumes the field is named `value_labels` — confirm.
+        self.value_labels.codepage_to_unicode(self.encoding);
+        self.print_format.codepage_to_unicode();
+        self.write_format.codepage_to_unicode();
+        self.attributes.codepage_to_unicode();
+        self.encoding = UTF_8;
+
+        // Anything old enough to not support long names is old enough not to
+        // support Unicode.
+        self.short_names.clear();
+    }
}
impl HasIdentifier for Variable {
update_dict_index_vec(&mut self.variables, f);
(!self.variables.is_empty()).then_some(self)
}
+
+    /// Converts this vector's name for the move from a code page to UTF-8.
+    pub fn codepage_to_unicode(&mut self) {
+        Identifier::codepage_to_unicode(&mut self.name);
+    }
}
impl HasIdentifier for DictIndexVector {
update_dict_index_vec(&mut self.variables, f);
(self.variables.len() > 1).then_some(self)
}
+
+    /// Converts this multiple response set's name for the move from a code
+    /// page to UTF-8.
+    pub fn codepage_to_unicode(&mut self) {
+        Identifier::codepage_to_unicode(&mut self.name);
+    }
}
impl HasIdentifier for DictIndexMultipleResponseSet {
.filter_map(|(mut datum, string)| datum.resize(width).is_ok().then(|| (datum, string)))
.collect();
}
+
+    /// Converts every labelled value from `encoding` to UTF-8.
+    ///
+    /// Each key is tagged with `encoding`, converted, and stripped of its
+    /// encoding again; the label text is kept unchanged.
+    pub fn codepage_to_unicode(&mut self, encoding: &'static Encoding) {
+        let converted = self
+            .0
+            .drain()
+            .map(|(datum, label)| {
+                let mut encoded = datum.with_encoding(encoding);
+                encoded.codepage_to_unicode();
+                (encoded.without_encoding(), label)
+            })
+            .collect();
+        self.0 = converted;
+    }
}
impl Debug for ValueLabels {
}
}
-#[derive(Clone, Default, Serialize)]
+#[derive(Clone, Default, Serialize, PartialEq)]
pub struct MissingValues {
/// Individual missing values, up to 3 of them.
values: Vec<Datum<WithEncoding<ByteString>>>,
}
inner(self, width).inspect_err(|_| self.clear())
}
+
+    /// Converts string missing values from their code page to UTF-8.
+    ///
+    /// Each string value that is not already UTF-8 is decoded and limited to
+    /// 8 bytes.  The cut is made at a character boundary, so a multi-byte
+    /// character that does not fit entirely is dropped rather than split,
+    /// which would leave invalid UTF-8 behind.  Numeric values and values
+    /// that are already UTF-8 are left unchanged.
+    pub fn codepage_to_unicode(&mut self) {
+        /// Maximum length, in bytes, of a converted string missing value.
+        const MAX_LEN: usize = 8;
+
+        for value in &mut self.values {
+            if let Datum::String(s) = value {
+                if s.encoding() != UTF_8 {
+                    let converted = {
+                        let decoded = s.as_str();
+                        // Back up to the nearest character boundary at or
+                        // below the limit; offset 0 is always a boundary.
+                        let mut end = decoded.len().min(MAX_LEN);
+                        while !decoded.is_char_boundary(end) {
+                            end -= 1;
+                        }
+                        ByteString::from(&decoded[..end])
+                    };
+                    *s = WithEncoding::new(converted, UTF_8);
+                }
+            }
+        }
+    }
}
#[derive(Copy, Clone, Debug, Serialize, PartialEq)]
use unicase::UniCase;
use crate::{
- dictionary::{Dictionary, VarWidth, Variable},
+ data::{ByteString, Datum, RawString, WithEncoding},
+ dictionary::{Dictionary, MissingValues, VarWidth, Variable},
identifier::Identifier,
};
assert_eq!(expected, dict.short_names());
}
}
+
+    /// String widths triple, capped at 32767; numeric widths are unchanged.
+    #[test]
+    fn var_width_codepage_to_unicode() {
+        let cases = [
+            (VarWidth::Numeric, VarWidth::Numeric),
+            (VarWidth::String(1), VarWidth::String(3)),
+            (VarWidth::String(2), VarWidth::String(6)),
+            (VarWidth::String(3), VarWidth::String(9)),
+            (VarWidth::String(1000), VarWidth::String(3000)),
+            (VarWidth::String(20000), VarWidth::String(32767)),
+            (VarWidth::String(30000), VarWidth::String(32767)),
+        ];
+
+        for (input, expected) in cases {
+            let mut actual = input;
+            actual.codepage_to_unicode();
+            assert_eq!(actual, expected);
+        }
+    }
+
+    /// Missing values are decoded to UTF-8 and limited to 8 bytes.
+    #[test]
+    fn missing_values_codepage_to_unicode() {
+        fn windows_1252(s: &str) -> WithEncoding<ByteString> {
+            ByteString::from(WINDOWS_1252.encode(s).0).with_encoding(WINDOWS_1252)
+        }
+
+        fn utf_8(s: &str) -> WithEncoding<ByteString> {
+            ByteString::from(s).with_encoding(UTF_8)
+        }
+
+        let inputs = ["abcdefgh", "éèäî ", "aaéèäîdf"]
+            .into_iter()
+            .map(|s| Datum::String(windows_1252(s)))
+            .collect::<Vec<_>>();
+        let mut actual = MissingValues::new(inputs, None).unwrap();
+        actual.codepage_to_unicode();
+
+        let outputs = ["abcdefgh", "éèäî", "aaéèä"]
+            .into_iter()
+            .map(|s| Datum::String(utf_8(s)))
+            .collect::<Vec<_>>();
+        let expected = MissingValues::new(outputs, None).unwrap();
+
+        assert_eq!(&actual, &expected);
+    }
}
use unicode_width::UnicodeWidthStr;
use crate::{
- data::{ByteString, Datum, },
+ data::{ByteString, Datum},
dictionary::{VarType, VarWidth},
sys::raw,
};
_ => *self = Self::default_for_width(width),
}
}
+
+    /// Widens a string format to match a variable converted from a code page
+    /// to UTF-8.
+    ///
+    /// The format's variable width is converted with
+    /// `VarWidth::codepage_to_unicode`.  If the result is a string width it
+    /// becomes the new format width, doubled for `AHex`.  Formats whose
+    /// width is not a string width are left unchanged.
+    pub fn codepage_to_unicode(&mut self) {
+        let mut var_width = self.var_width();
+        var_width.codepage_to_unicode();
+        if let Some(string_width) = var_width.as_string_width() {
+            let scale = if self.type_ == Type::AHex { 2 } else { 1 };
+            self.w = string_width as u16 * scale;
+        }
+    }
}
impl Debug for Format {
Some(TemplateItem { c, n })
}
}
+
+#[cfg(test)]
+mod tests {
+    use crate::format::{Format, Type, Width};
+
+    /// Each case is (format type, input width, expected width after
+    /// conversion); the number of decimals is always 0.
+    #[test]
+    fn codepage_to_unicode() {
+        let cases: [(Type, Width, Width); 12] = [
+            (Type::A, 1, 3),
+            (Type::A, 2, 6),
+            (Type::A, 3, 9),
+            (Type::A, 1000, 3000),
+            (Type::A, 20000, 32767),
+            (Type::AHex, 2, 6),
+            (Type::AHex, 4, 12),
+            (Type::AHex, 6, 18),
+            (Type::AHex, 2000, 6000),
+            (Type::AHex, 20000, 60000),
+            (Type::AHex, 30000, 65534),
+            (Type::F, 40, 40),
+        ];
+
+        for (type_, width, expected_width) in cases {
+            let input = Format::new(type_, width, 0).unwrap();
+            let mut output = input;
+            output.codepage_to_unicode();
+            let expected = Format::new(input.type_, expected_width, input.d).unwrap();
+            assert_eq!(output, expected);
+        }
+    }
+}
Self::from_encoding(s, UTF_8)
}
+    /// Shortens this identifier, if necessary, so that it fits in
+    /// [`Self::MAX_LEN`] bytes.
+    ///
+    /// Trailing grapheme clusters are removed one at a time until the
+    /// identifier fits.  If that would leave nothing (a single grapheme
+    /// cluster longer than the limit), the identifier becomes `VAR`.  An
+    /// identifier that already fits is left unchanged.
+    pub fn codepage_to_unicode(&mut self) {
+        while self.len() > Self::MAX_LEN {
+            // Byte offset at which the final grapheme cluster starts.
+            let cut = self
+                .as_str()
+                .grapheme_indices(true)
+                .next_back()
+                .map(|(start, _)| start)
+                .unwrap();
+            self.0.truncate(cut);
+            if self.0.is_empty() {
+                // We had a grapheme cluster longer than 64 bytes!
+                *self = Identifier::new("VAR").unwrap();
+                return;
+            }
+        }
+    }
+
pub fn from_encoding(
s: impl Into<UniCase<String>>,
encoding: &'static Encoding,
#[cfg(test)]
mod tests {
- use encoding_rs::{UTF_8, WINDOWS_1252};
+ use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};
use crate::identifier::Identifier;
assert_eq!(&short, expected_short);
}
}
+
+    /// Each case is (identifier, encoding it came from, expected identifier
+    /// after conversion).
+    #[test]
+    fn codepage_to_unicode() {
+        let cases: [(&str, &'static Encoding, &str); 5] = [
+            ("abc", UTF_8, "abc"),
+            ("éèäî", UTF_8, "éèäî"),
+            // 32 bytes in windows-1252, 64 bytes in UTF-8, no truncation.
+            (
+                "éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
+                WINDOWS_1252,
+                "éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
+            ),
+            // 33 or 34 bytes in windows-1252, 65 or 66 bytes in UTF-8,
+            // truncate last (2-byte) character.
+            (
+                "xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
+                WINDOWS_1252,
+                "xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä",
+            ),
+            (
+                "xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
+                WINDOWS_1252,
+                "xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä",
+            ),
+        ];
+
+        for (source, encoding, expected) in cases {
+            let mut actual = Identifier::from_encoding(String::from(source), encoding).unwrap();
+            actual.codepage_to_unicode();
+            assert_eq!(actual.as_str(), expected);
+        }
+    }
}
use crate::{
crypto::EncryptedFile,
- data::{ByteString, Datum},
+ data::Datum,
dictionary::{Dictionary, VarWidth, Variable},
identifier::Identifier,
output::{