fn test(syntax: &str) {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SyntaxFile::new(syntax.to_string(), Some("test.sps".to_string()), UTF_8),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ syntax.to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
fn test(syntax: &str) {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SyntaxFile::new(syntax.to_string(), Some("test.sps".to_string()), UTF_8),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ syntax.to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
fn test(syntax: &str) {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SyntaxFile::new(syntax.to_string(), Some("test.sps".to_string()), UTF_8),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ syntax.to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
fn test(syntax: &str) {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SyntaxFile::new(syntax.to_string(), Some("test.sps".to_string()), UTF_8),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ syntax.to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Individual pieces of data.
+//!
+//! [Datum] is the value of one [Variable]. String data in a [Datum] is
+//! represented as [RawString], whose character encoding is determined by the
+//! associated [Variable]. (All the variables in a [Dictionary] have the same
+//! character encoding.)
+//!
+//! [Variable]: crate::dictionary::Variable
+//! [Dictionary]: crate::dictionary::Dictionary
+
+// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
+#![cfg_attr(not(test), warn(missing_docs))]
+
+use std::{
+ borrow::{Borrow, Cow},
+ cmp::Ordering,
+ fmt::{Debug, Display, Formatter},
+ hash::Hash,
+ ops::Deref,
+ str::from_utf8,
+};
+
+use encoding_rs::{mem::decode_latin1, Encoding};
+use ordered_float::OrderedFloat;
+
+use crate::{
+ dictionary::{VarType, VarWidth},
+ sys::raw::EncodedStr,
+};
+
+/// An owned string in an unspecified character encoding.
+///
+/// A [RawString] is usually associated with a [Variable] and uses the
+/// variable's character encoding. We assume that the encoding is one supported
+/// by [encoding_rs] with byte units (that is, not a `UTF-16` encoding). All of
+/// these encodings have some basic ASCII compatibility.
+///
+/// A [RawString] owns its contents and can grow and shrink, like a [Vec] or
+/// [String]. For a borrowed raw string, see [RawStr].
+///
+/// Equality, ordering, and hashing are derived from the wrapped bytes, so they
+/// compare the encoded bytes rather than decoded characters.
+///
+/// [Variable]: crate::dictionary::Variable
+#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)]
+pub struct RawString(pub Vec<u8>);
+
+impl RawString {
+    /// Builds a [RawString] made up of exactly `n` ASCII space bytes.
+    pub fn spaces(n: usize) -> Self {
+        Self(vec![b' '; n])
+    }
+
+    /// Borrows this string's bytes as an [EncodedStr] tagged with `encoding`.
+    pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> {
+        let bytes = &self.0;
+        EncodedStr::new(bytes, encoding)
+    }
+
+    /// Makes this [RawString] exactly `len` bytes long, padding with ASCII
+    /// spaces when it has to grow.
+    ///
+    /// Shrinking works on bytes, so it may split a multibyte character.
+    pub fn resize(&mut self, len: usize) {
+        const PAD: u8 = b' ';
+        self.0.resize(len, PAD);
+    }
+
+    /// Drops every ASCII space at the end of the string.
+    pub fn trim_end(&mut self) {
+        // Keep everything up to and including the last non-space byte; if
+        // there is none, the string becomes empty.
+        let kept = self
+            .0
+            .iter()
+            .rposition(|&byte| byte != b' ')
+            .map_or(0, |last| last + 1);
+        self.0.truncate(kept);
+    }
+}
+
+impl Borrow<RawStr> for RawString {
+    // Views the owned bytes as a borrowed `RawStr` without copying.
+    fn borrow(&self) -> &RawStr {
+        RawStr::from_bytes(self.0.as_slice())
+    }
+}
+
+impl Deref for RawString {
+    type Target = RawStr;
+
+    // Lets a `RawString` be used wherever a `&RawStr` is expected, by viewing
+    // its bytes as a `RawStr`.
+    fn deref(&self) -> &RawStr {
+        RawStr::from_bytes(&self.0)
+    }
+}
+
+impl From<Cow<'_, [u8]>> for RawString {
+    // Takes ownership of the bytes, copying them only if `value` was borrowed.
+    fn from(value: Cow<'_, [u8]>) -> Self {
+        let bytes: Vec<u8> = value.into_owned();
+        RawString(bytes)
+    }
+}
+
+impl From<Vec<u8>> for RawString {
+    // Wraps the vector directly; no bytes are copied.
+    fn from(source: Vec<u8>) -> Self {
+        RawString(source)
+    }
+}
+
+impl From<&[u8]> for RawString {
+    // Copies the borrowed bytes into a freshly allocated string.
+    fn from(source: &[u8]) -> Self {
+        RawString(source.to_vec())
+    }
+}
+
+impl Debug for RawString {
+    // Formats the string the same way as the borrowed [RawStr] does.
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        // Formatting `*self` would format a `RawString` again, re-entering
+        // this very function until the stack overflows.  Format the bytes
+        // through the borrowed `RawStr` instead.
+        write!(f, "{:?}", RawStr::from_bytes(&self.0))
+    }
+}
+
+/// A borrowed string in an unspecified encoding.
+///
+/// A [RawStr] is usually associated with a [Variable] and uses the
+/// variable's character encoding. We assume that the encoding is one supported
+/// by [encoding_rs] with byte units (that is, not a `UTF-16` encoding). All of
+/// these encodings have some basic ASCII compatibility.
+///
+/// For an owned raw string, see [RawString].
+///
+/// [Variable]: crate::dictionary::Variable
+#[repr(transparent)]
+#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct RawStr(pub [u8]);
+
+impl RawStr {
+    /// Creates a new [RawStr] that borrows `bytes` without copying them.
+    pub fn from_bytes(bytes: &[u8]) -> &Self {
+        // SAFETY: `RawStr` is a `#[repr(transparent)]` wrapper around `[u8]`,
+        // so a reference to the wrapped type can be reinterpreted as a
+        // reference to the wrapper type.
+        unsafe { &*(bytes as *const [u8] as *const Self) }
+    }
+
+    /// Returns the raw string's contents as a borrowed byte slice.
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+
+    /// Returns an object that implements [Display] for printing this [RawStr],
+    /// given that it is encoded in `encoding`.
+    ///
+    /// The text is decoded exactly as [RawStr::decode] does.
+    pub fn display(&self, encoding: &'static Encoding) -> DisplayRawString<'_> {
+        DisplayRawString(self.decode(encoding))
+    }
+
+    /// Interprets the raw string's contents as the specified `encoding` and
+    /// returns it decoded into UTF-8, replacing any malformed sequences by
+    /// [REPLACEMENT_CHARACTER].
+    ///
+    /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER
+    pub fn decode(&self, encoding: &'static Encoding) -> Cow<'_, str> {
+        encoding.decode_without_bom_handling(&self.0).0
+    }
+
+    /// Compares this string and `other` for equality, ignoring trailing ASCII
+    /// spaces in either string for the purpose of comparison.  (This is
+    /// acceptable because we assume that the encoding is ASCII-compatible.)
+    pub fn eq_ignore_trailing_spaces(&self, other: &RawStr) -> bool {
+        let mut this = self.0.iter();
+        let mut other = other.0.iter();
+        loop {
+            match (this.next(), other.next()) {
+                // Still inside the common prefix.
+                (Some(a), Some(b)) if a == b => (),
+                (Some(_), Some(_)) => return false,
+                (None, None) => return true,
+                // One side ran out: the strings are equal only if whatever
+                // is left of the longer one consists solely of spaces.
+                (Some(b' '), None) => return this.all(|c| *c == b' '),
+                (None, Some(b' ')) => return other.all(|c| *c == b' '),
+                (Some(_), None) | (None, Some(_)) => return false,
+            }
+        }
+    }
+
+    /// Returns the string's length in bytes.
+    pub fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    /// Returns true if the string contains no bytes at all.
+    ///
+    /// A string that consists only of spaces is not empty.
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+}
+
+/// Helper struct for printing [RawStr] with [format!].
+///
+/// Holds the text that [RawStr::display] decoded using the encoding it was
+/// given.
+///
+/// Created by [RawStr::display].
+pub struct DisplayRawString<'a>(Cow<'a, str>);
+
+impl Display for DisplayRawString<'_> {
+    // Emits the text that `RawStr::display` already decoded; no further
+    // decoding or fallback happens here.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.write_str(&self.0)
+    }
+}
+
+impl Debug for RawStr {
+    // The encoding is unknown here, so show the bytes as UTF-8 when they are
+    // valid UTF-8 and otherwise as Latin-1 (each byte taken as the Unicode
+    // code point with the same value).
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        let text = match from_utf8(&self.0) {
+            Ok(utf8) => Cow::from(utf8),
+            Err(_) => decode_latin1(&self.0),
+        };
+        write!(f, "{text:?}")
+    }
+}
+
+/// The value of a [Variable](crate::dictionary::Variable).
+///
+/// Equality and hashing wrap numbers in `OrderedFloat` rather than using
+/// `f64`'s own comparison.
+#[derive(Clone)]
+pub enum Datum {
+    /// A numeric value.
+    Number(
+        /// A number, or `None` for the system-missing value.
+        Option<f64>,
+    ),
+    /// A string value.
+    String(
+        /// The value, in the variable's encoding.
+        RawString,
+    ),
+}
+
+impl Debug for Datum {
+    // Numbers print in `f64` debug form, the system-missing value prints as
+    // `SYSMIS`, and strings print through `RawString`'s debug form.
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        match self {
+            Self::Number(None) => f.write_str("SYSMIS"),
+            Self::Number(Some(number)) => write!(f, "{number:?}"),
+            Self::String(string) => write!(f, "{string:?}"),
+        }
+    }
+}
+
+impl PartialEq for Datum {
+    // Numbers compare through `OrderedFloat` (system-missing equals only
+    // system-missing), strings compare bytewise, and a number never equals a
+    // string.
+    fn eq(&self, other: &Self) -> bool {
+        match (self, other) {
+            (Self::Number(lhs), Self::Number(rhs)) => {
+                lhs.map(OrderedFloat) == rhs.map(OrderedFloat)
+            }
+            (Self::String(lhs), Self::String(rhs)) => lhs == rhs,
+            (Self::Number(_), Self::String(_)) | (Self::String(_), Self::Number(_)) => false,
+        }
+    }
+}
+
+// `PartialEq for Datum` compares numbers through `OrderedFloat`, not through
+// `f64`'s own `==`.
+impl Eq for Datum {}
+
+impl PartialOrd for Datum {
+    // Every pair of data is comparable, so this simply defers to `Ord`.
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+}
+
+impl Ord for Datum {
+    // Orders numbers before strings.  Among numbers, the system-missing value
+    // sorts first and the rest follow `OrderedFloat`'s ordering; strings are
+    // ordered bytewise.
+    fn cmp(&self, other: &Self) -> Ordering {
+        match (self, other) {
+            // Compare through `OrderedFloat`, exactly as `PartialEq` and
+            // `Hash` do.  `f64::total_cmp` distinguishes `0.0` from `-0.0`
+            // and NaNs with different bit patterns, which `eq` treats as
+            // equal; mixing the two would break the requirement that
+            // `a == b` exactly when `a.cmp(b)` is `Equal`.  `None` still
+            // sorts before any `Some`.
+            (Datum::Number(a), Datum::Number(b)) => {
+                a.map(OrderedFloat).cmp(&b.map(OrderedFloat))
+            }
+            (Datum::Number(_), Datum::String(_)) => Ordering::Less,
+            (Datum::String(_), Datum::Number(_)) => Ordering::Greater,
+            (Datum::String(a), Datum::String(b)) => a.cmp(b),
+        }
+    }
+}
+
+impl Hash for Datum {
+    // Hashes numbers through `OrderedFloat`, matching how `PartialEq`
+    // compares them, and strings through `RawString`'s own hash.
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        match self {
+            Self::Number(number) => Hash::hash(&number.map(OrderedFloat), state),
+            Self::String(string) => Hash::hash(string, state),
+        }
+    }
+}
+
+impl Datum {
+    /// Returns the numeric [Datum] that represents the system-missing value.
+    pub const fn sysmis() -> Self {
+        Self::Number(None)
+    }
+
+    /// Returns this datum's number (where an inner `None` is the
+    /// system-missing value), or an outer `None` if the datum is a string.
+    pub fn as_number(&self) -> Option<Option<f64>> {
+        match *self {
+            Self::Number(number) => Some(number),
+            Self::String(_) => None,
+        }
+    }
+
+    /// Borrows this datum's string, or returns `None` if the datum is
+    /// numeric.
+    pub fn as_string(&self) -> Option<&RawString> {
+        if let Self::String(string) = self {
+            Some(string)
+        } else {
+            None
+        }
+    }
+
+    /// Mutably borrows this datum's string, or returns `None` if the datum is
+    /// numeric.
+    pub fn as_string_mut(&mut self) -> Option<&mut RawString> {
+        if let Self::String(string) = self {
+            Some(string)
+        } else {
+            None
+        }
+    }
+
+    /// Reports whether this datum can take on the given `width` without
+    /// losing data.
+    ///
+    /// That requires the datum and `width` to be both numeric or both string
+    /// and, for strings, that every byte beyond the new width is an ASCII
+    /// space.
+    pub fn is_resizable(&self, width: VarWidth) -> bool {
+        match (self, width) {
+            (Self::Number(_), VarWidth::Numeric) => true,
+            (Self::String(string), VarWidth::String(new_width)) => {
+                // Skipping past the end leaves nothing to check, which covers
+                // the case where the string does not need to shrink.
+                string
+                    .0
+                    .iter()
+                    .skip(new_width as usize)
+                    .all(|&byte| byte == b' ')
+            }
+            _ => false,
+        }
+    }
+
+    /// Changes this datum to the given `width`.  Numeric data are left
+    /// untouched; strings are resized with [RawString::resize].
+    ///
+    /// # Panics
+    ///
+    /// Panics if resizing would change the datum from numeric to string or
+    /// vice versa.
+    pub fn resize(&mut self, width: VarWidth) {
+        match (self, width) {
+            (Self::String(string), VarWidth::String(new_width)) => {
+                string.resize(new_width as usize)
+            }
+            (Self::Number(_), VarWidth::Numeric) => (),
+            _ => unreachable!(),
+        }
+    }
+
+    /// Returns the [VarType] that matches this datum's variant.
+    pub fn var_type(&self) -> VarType {
+        match self {
+            Self::String(_) => VarType::String,
+            Self::Number(_) => VarType::Numeric,
+        }
+    }
+
+    /// Returns the [VarWidth] that matches this datum: numeric for a number,
+    /// or a string width equal to the string's length in bytes.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the string's length does not fit in the integer type that
+    /// [VarWidth::String] holds.
+    pub fn width(&self) -> VarWidth {
+        match self {
+            Self::String(string) => {
+                let len = string.len();
+                VarWidth::String(len.try_into().unwrap())
+            }
+            Self::Number(_) => VarWidth::Numeric,
+        }
+    }
+
+    /// Tests this datum and `other` for equality.  Two strings are compared
+    /// with trailing ASCII spaces in either one ignored; any other pairing
+    /// uses ordinary equality.
+    pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool {
+        if let (Self::String(lhs), Self::String(rhs)) = (self, other) {
+            lhs.eq_ignore_trailing_spaces(rhs)
+        } else {
+            self == other
+        }
+    }
+
+    /// Strips trailing ASCII spaces from a string datum; does nothing to a
+    /// numeric one.
+    pub fn trim_end(&mut self) {
+        if let Self::String(string) = self {
+            string.trim_end();
+        }
+    }
+}
+
+impl From<f64> for Datum {
+    // A plain number is never the system-missing value.
+    fn from(number: f64) -> Self {
+        Self::Number(Some(number))
+    }
+}
+
+impl From<Option<f64>> for Datum {
+    // `None` becomes the system-missing value.
+    fn from(value: Option<f64>) -> Self {
+        Datum::Number(value)
+    }
+}
+
+impl From<&str> for Datum {
+    // Stores the string's UTF-8 bytes unchanged as a string datum.
+    fn from(value: &str) -> Self {
+        let bytes: &[u8] = value.as_bytes();
+        Self::from(bytes)
+    }
+}
+
+impl From<&[u8]> for Datum {
+    // Copies the bytes into an owned string datum.
+    fn from(value: &[u8]) -> Self {
+        Self::String(RawString::from(value))
+    }
+}
use enum_map::{Enum, EnumMap};
use indexmap::IndexSet;
use num::integer::div_ceil;
-use ordered_float::OrderedFloat;
use thiserror::Error as ThisError;
use unicase::UniCase;
use crate::{
+ data::Datum,
format::{DisplayPlain, Format},
identifier::{ByIdentifier, HasIdentifier, Identifier},
output::pivot::{Axis3, Dimension, Footnote, Footnotes, Group, PivotTable, Value},
settings::Show,
- sys::raw::RawString,
};
/// An index within [Dictionary::variables].
}
}
-#[derive(Clone)]
-pub enum Datum {
- Number(Option<f64>),
- String(RawString),
-}
-
-impl Debug for Datum {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match self {
- Datum::Number(Some(number)) => write!(f, "{number:?}"),
- Datum::Number(None) => write!(f, "SYSMIS"),
- Datum::String(s) => write!(f, "{:?}", s),
- }
- }
-}
-
-impl PartialEq for Datum {
- fn eq(&self, other: &Self) -> bool {
- match (self, other) {
- (Self::Number(Some(l0)), Self::Number(Some(r0))) => {
- OrderedFloat(*l0) == OrderedFloat(*r0)
- }
- (Self::Number(None), Self::Number(None)) => true,
- (Self::String(l0), Self::String(r0)) => l0 == r0,
- _ => false,
- }
- }
-}
-
-impl Eq for Datum {}
-
-impl PartialOrd for Datum {
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- Some(self.cmp(other))
- }
-}
-
-impl Ord for Datum {
- fn cmp(&self, other: &Self) -> Ordering {
- match (self, other) {
- (Datum::Number(a), Datum::Number(b)) => match (a, b) {
- (None, None) => Ordering::Equal,
- (None, Some(_)) => Ordering::Less,
- (Some(_), None) => Ordering::Greater,
- (Some(a), Some(b)) => a.total_cmp(b),
- },
- (Datum::Number(_), Datum::String(_)) => Ordering::Less,
- (Datum::String(_), Datum::Number(_)) => Ordering::Greater,
- (Datum::String(a), Datum::String(b)) => a.cmp(b),
- }
- }
-}
-
-impl Hash for Datum {
- fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
- match self {
- Datum::Number(number) => number.map(OrderedFloat).hash(state),
- Datum::String(string) => string.hash(state),
- }
- }
-}
-
-impl Datum {
- pub const fn sysmis() -> Self {
- Self::Number(None)
- }
-
- pub fn as_number(&self) -> Option<Option<f64>> {
- match self {
- Datum::Number(number) => Some(*number),
- Datum::String(_) => None,
- }
- }
-
- pub fn as_string(&self) -> Option<&RawString> {
- match self {
- Datum::Number(_) => None,
- Datum::String(s) => Some(s),
- }
- }
-
- pub fn is_resizable(&self, width: VarWidth) -> bool {
- match (self, width) {
- (Datum::Number(_), VarWidth::Numeric) => true,
- (Datum::String(s), VarWidth::String(new_width)) => {
- let new_len = new_width as usize;
- new_len >= s.len() || s.0[new_len..].iter().all(|c| *c == b' ')
- }
- _ => false,
- }
- }
-
- pub fn resize(&mut self, width: VarWidth) {
- match (self, width) {
- (Datum::Number(_), VarWidth::Numeric) => (),
- (Datum::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize),
- _ => unreachable!(),
- }
- }
-
- pub fn var_type(&self) -> VarType {
- match self {
- Self::Number(_) => VarType::Numeric,
- Self::String(_) => VarType::String,
- }
- }
-
- pub fn width(&self) -> VarWidth {
- match self {
- Datum::Number(_) => VarWidth::Numeric,
- Datum::String(s) => VarWidth::String(s.len().try_into().unwrap()),
- }
- }
-
- pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool {
- match (self, other) {
- (Self::String(a), Self::String(b)) => a.eq_ignore_trailing_spaces(b),
- _ => self == other,
- }
- }
-
- pub fn trim_end(&mut self) {
- match self {
- Self::Number(_) => (),
- Self::String(s) => s.trim_end(),
- }
- }
-}
-
-impl From<f64> for Datum {
- fn from(number: f64) -> Self {
- Some(number).into()
- }
-}
-
-impl From<Option<f64>> for Datum {
- fn from(value: Option<f64>) -> Self {
- Self::Number(value)
- }
-}
-
-impl From<&str> for Datum {
- fn from(value: &str) -> Self {
- value.as_bytes().into()
- }
-}
-
-impl From<&[u8]> for Datum {
- fn from(value: &[u8]) -> Self {
- Self::String(value.into())
- }
-}
-
/// A collection of variables, plus additional metadata.
#[derive(Clone, Debug)]
pub struct Dictionary {
#[ignore]
fn test_echo() {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SyntaxFile::new(
- "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(),
- Some("test.sps".to_string()),
- UTF_8,
- ),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
use crate::{
calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name},
- dictionary::Datum,
+ data::Datum,
endian::ToBytes,
format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type},
settings::{EndianSettings, Settings as PsppSettings},
use smallvec::SmallVec;
use crate::{
- dictionary::Datum,
+ data::Datum,
endian::Endian,
format::{AbstractFormat, Epoch, Format, Settings, Type, UncheckedFormat, CC},
lex::{scan::StringScanner, segment::Syntax, Punct, Token},
use unicode_width::UnicodeWidthStr;
use crate::{
- dictionary::{Datum, VarType, VarWidth},
- sys::raw::{self, RawString},
+ data::RawString,
+ data::Datum,
+ dictionary::{VarType, VarWidth},
+ sys::raw,
};
mod display;
use crate::{
calendar::{calendar_gregorian_to_offset, DateError},
- dictionary::Datum,
+ data::Datum,
endian::{Endian, Parse},
format::{DateTemplate, Decimals, Settings, TemplateItem, Type},
settings::{EndianSettings, Settings as PsppSettings},
use crate::{
calendar::{days_in_month, is_leap_year},
- dictionary::Datum,
+ data::Datum,
endian::Endian,
format::{
parse::{ParseError, ParseErrorKind, Sign},
pub mod calendar;
pub mod command;
pub mod crypto;
+pub mod data;
pub mod dictionary;
pub mod endian;
pub mod engine;
use tlo::parse_tlo;
use crate::{
- dictionary::{Datum, VarType, Variable},
+ data::Datum,
+ dictionary::{VarType, Variable},
format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat},
settings::{Settings, Show},
};
use crate::{
calendar::date_time_to_pspp,
+ data::{Datum, RawString},
dictionary::{
- Datum, Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet,
+ Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet,
MultipleResponseType, VarWidth, Variable, VariableSet,
},
endian::Endian,
VarDisplayRecord, VariableAttributesRecord, VariableRecord, VariableSetRecord,
VeryLongStringsRecord, ZHeader, ZTrailer,
},
- Cases, DecodedRecord, RawDatum, RawString, RawWidth,
+ Cases, DecodedRecord, RawDatum, RawWidth,
},
},
};
//! raw details. Most readers will want to use higher-level interfaces.
use crate::{
- dictionary::{Datum, VarType, VarWidth},
+ data::{Datum, RawStr, RawString},
+ dictionary::{VarType, VarWidth},
endian::{Endian, Parse, ToBytes},
format::DisplayPlainF64,
identifier::{Error as IdError, Identifier},
},
};
-use encoding_rs::{mem::decode_latin1, Encoding};
+use encoding_rs::Encoding;
use flate2::read::ZlibDecoder;
use itertools::Itertools;
use smallvec::SmallVec;
use std::{
- borrow::{Borrow, Cow},
+ borrow::Cow,
cell::RefCell,
collections::VecDeque,
fmt::{Debug, Display, Formatter, Result as FmtResult},
iter::repeat_n,
mem::take,
num::NonZeroU8,
- ops::Deref,
- str::from_utf8,
};
use thiserror::Error as ThisError;
}
}
-// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
-// decoded as Latin-1 (actually bytes interpreted as Unicode code points).
-fn default_decode(s: &[u8]) -> Cow<str> {
- from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
-}
-
/// An [Encoding] along with a function to report decoding errors.
///
/// This is used by functions that decode raw records.
match self {
RawDatum::Number(Some(number)) => write!(f, "{number:?}"),
RawDatum::Number(None) => write!(f, "SYSMIS"),
- RawDatum::String(s) => write!(f, "{:?}", default_decode(s)),
+ RawDatum::String(s) => write!(f, "{:?}", RawStr::from_bytes(s)),
}
}
}
} else {
big
};
- write!(f, "{number}")?;
-
- let string = default_decode(&self.0);
- let string = string
- .split(|c: char| c == '\0' || c.is_control())
- .next()
- .unwrap();
- write!(f, "{string:?}")?;
- Ok(())
- }
-}
-
-/// An owned string in an unspecified encoding.
-///
-/// We assume that the encoding is one supported by [encoding_rs] with byte
-/// units (that is, not a `UTF-16` encoding). All of these encodings have some
-/// basic ASCII compatibility.
-///
-/// A [RawString] owns its contents and can grow and shrink, like a [Vec] or
-/// [String]. For a borrowed raw string, see [RawStr].
-#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)]
-pub struct RawString(pub Vec<u8>);
-
-impl RawString {
- pub fn spaces(n: usize) -> Self {
- Self(std::iter::repeat_n(b' ', n).collect())
- }
- pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> {
- EncodedStr::new(&self.0, encoding)
- }
- pub fn resize(&mut self, len: usize) {
- self.0.resize(len, b' ');
- }
- pub fn len(&self) -> usize {
- self.0.len()
- }
- pub fn trim_end(&mut self) {
- while self.0.pop_if(|c| *c == b' ').is_some() {}
- }
-}
-
-impl Borrow<RawStr> for RawString {
- fn borrow(&self) -> &RawStr {
- RawStr::from_bytes(&self.0)
- }
-}
-
-impl Deref for RawString {
- type Target = RawStr;
-
- fn deref(&self) -> &Self::Target {
- self.borrow()
- }
-}
-
-impl From<Cow<'_, [u8]>> for RawString {
- fn from(value: Cow<'_, [u8]>) -> Self {
- Self(value.into_owned())
- }
-}
-
-impl From<Vec<u8>> for RawString {
- fn from(source: Vec<u8>) -> Self {
- Self(source)
- }
-}
-
-impl From<&[u8]> for RawString {
- fn from(source: &[u8]) -> Self {
- Self(source.into())
- }
-}
-
-impl Debug for RawString {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(self.0.as_slice()))
- }
-}
-
-/// A borrowed string in an unspecified encoding.
-///
-/// We assume that the encoding is one supported by [encoding_rs] with byte
-/// units (that is, not a `UTF-16` encoding). All of these encodings have some
-/// basic ASCII compatibility.
-///
-/// For an owned raw string, see [RawString].
-#[repr(transparent)]
-#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct RawStr(pub [u8]);
-
-impl RawStr {
- pub fn from_bytes(bytes: &[u8]) -> &Self {
- // SAFETY: `RawStr` is a transparent wrapper around `[u8]`, so we can
- // turn a reference to the wrapped type into a reference to the wrapper
- // type.
- unsafe { &*(bytes as *const [u8] as *const Self) }
- }
-
- pub fn as_bytes(&self) -> &[u8] {
- &self.0
- }
-
- /// Returns an object that implements [Display] for printing this [RawStr],
- /// given that it is encoded in `encoding`.
- pub fn display(&self, encoding: &'static Encoding) -> DisplayRawString {
- DisplayRawString(encoding.decode_without_bom_handling(&self.0).0)
- }
-
- pub fn decode(&self, encoding: &'static Encoding) -> Cow<'_, str> {
- encoding.decode_without_bom_handling(&self.0).0
- }
-
- pub fn eq_ignore_trailing_spaces(&self, other: &RawStr) -> bool {
- let mut this = self.0.iter();
- let mut other = other.0.iter();
- loop {
- match (this.next(), other.next()) {
- (Some(a), Some(b)) if a == b => (),
- (Some(_), Some(_)) => return false,
- (None, None) => return true,
- (Some(b' '), None) => return this.all(|c| *c == b' '),
- (None, Some(b' ')) => return other.all(|c| *c == b' '),
- (Some(_), None) | (None, Some(_)) => return false,
- }
- }
- }
-}
-
-pub struct DisplayRawString<'a>(Cow<'a, str>);
-
-impl<'a> Display for DisplayRawString<'a> {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{}", &self.0)
- }
-}
-
-impl Debug for RawStr {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(self.as_bytes()))
+ write!(f, "{number}/{:?}", RawStr::from_bytes(&self.0))
}
}
impl<const N: usize> Debug for RawStrArray<N> {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(&self.0))
+ write!(f, "{:?}", RawStr::from_bytes(&self.0))
}
}
};
use crate::{
+ data::{Datum, RawString},
dictionary::{
- Alignment, Attributes, CategoryLabels, Datum, Measure, MissingValueRange, MissingValues,
- VarType, VarWidth,
+ Alignment, Attributes, CategoryLabels, Measure, MissingValueRange, MissingValues, VarType,
+ VarWidth,
},
endian::{Endian, Parse},
identifier::{Error as IdError, Identifier},
sys::raw::{
read_bytes, read_string, read_vec, DecodedRecord, Decoder, Error, Magic, RawDatum,
- RawStrArray, RawString, RawWidth, Record, VarTypes, Warning,
+ RawStrArray, RawWidth, Record, VarTypes, Warning,
},
};
let output = match reader.headers().collect::<Result<Vec<_>, _>>() {
Ok(headers) => {
let cases = reader.cases();
- let encoding =
- infer_encoding(&headers, &mut |warning| warnings.push(warning)).unwrap();
+ let encoding = infer_encoding(&headers, &mut |warning| warnings.push(warning)).unwrap();
let mut decoder = Decoder::new(encoding, |warning| warnings.push(warning));
let mut decoded_records = Vec::new();
for header in headers {