[[package]]
name = "zerocopy"
-version = "0.8.25"
+version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
+checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
-version = "0.8.25"
+version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef"
+checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181"
dependencies = [
"proc-macro2",
"quote",
[build-dependencies]
anyhow = "1.0.69"
-flate2 = "1.0.26"
[[bin]]
name = "pspp"
[lib]
path = "src/lib.rs"
-[[test]]
-name = "sack"
-path = "tests/sack.rs"
-harness = false
-
[dev-dependencies]
diff = "0.1.13"
rand = "0.9.0"
use crate::{
engine::Engine,
- lex::lexer::{Source, SourceFile},
+ lex::lexer::{Source, SyntaxFile},
};
fn test(syntax: &str) {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SourceFile::for_file_contents(syntax.to_string(), Some("test.sps".to_string()), UTF_8),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ syntax.to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
use crate::{
engine::Engine,
- lex::lexer::{Source, SourceFile},
+ lex::lexer::{Source, SyntaxFile},
};
fn test(syntax: &str) {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SourceFile::for_file_contents(syntax.to_string(), Some("test.sps".to_string()), UTF_8),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ syntax.to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
use crate::{
engine::Engine,
- lex::lexer::{Source, SourceFile},
+ lex::lexer::{Source, SyntaxFile},
};
fn test(syntax: &str) {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SourceFile::for_file_contents(syntax.to_string(), Some("test.sps".to_string()), UTF_8),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ syntax.to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
use crate::{
engine::Engine,
- lex::lexer::{Source, SourceFile},
+ lex::lexer::{Source, SyntaxFile},
};
fn test(syntax: &str) {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SourceFile::for_file_contents(syntax.to_string(), Some("test.sps".to_string()), UTF_8),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ syntax.to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
lex::{
command_name::CommandMatcher,
lexer::{LexToken, TokenSlice},
- token::{Punct, Token},
+ Punct, Token,
},
message::{Diagnostic, Diagnostics},
};
/// This reads enough of the file to verify that it is in the expected
/// format and returns an error if it cannot be read or is not the expected
/// format.
+ ///
+ /// `reader` doesn't need to be [BufRead], and probably should not be. The
+ /// [EncryptedReader] returned by [unlock] or [unlock_literal] will be
+ /// [BufRead].
+ ///
+ /// [unlock]: Self::unlock
+ /// [unlock_literal]: Self::unlock_literal
pub fn new(mut reader: R) -> Result<Self, Error> {
let header =
EncryptedHeader::read_le(&mut NoSeek::new(&mut reader)).map_err(
--- /dev/null
+// PSPP - a program for statistical analysis.
+// Copyright (C) 2025 Free Software Foundation, Inc.
+//
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later
+// version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with
+// this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Individual pieces of data.
+//!
+//! [Datum] is the value of one [Variable]. String data in a [Datum] is
+//! represented as [RawString], whose character encoding is determined by the
+//! associated [Variable]. (All the variables in a [Dictionary] have the same
+//! character encoding.)
+//!
+//! [Variable]: crate::dictionary::Variable
+//! [Dictionary]: crate::dictionary::Dictionary
+
+// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
+#![cfg_attr(not(test), warn(missing_docs))]
+
+use std::{
+ borrow::{Borrow, Cow},
+ cmp::Ordering,
+ fmt::{Debug, Display, Formatter},
+ hash::Hash,
+ ops::Deref,
+ str::from_utf8,
+};
+
+use encoding_rs::{mem::decode_latin1, Encoding, UTF_8};
+use ordered_float::OrderedFloat;
+
+use crate::dictionary::{VarType, VarWidth};
+
+/// An owned string in an unspecified character encoding.
+///
+/// A [RawString] is usually associated with a [Variable] and uses the
+/// variable's character encoding. We assume that the encoding is one supported
+/// by [encoding_rs] with byte units (that is, not a `UTF-16` encoding). All of
+/// these encodings have some basic ASCII compatibility.
+///
+/// A [RawString] owns its contents and can grow and shrink, like a [Vec] or
+/// [String]. For a borrowed raw string, see [RawStr].
+///
+/// [Variable]: crate::dictionary::Variable
+#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)]
+pub struct RawString(pub Vec<u8>);
+
+impl RawString {
+ /// Creates a new [RawString] that consists of `n` ASCII spaces.
+ pub fn spaces(n: usize) -> Self {
+ Self(std::iter::repeat_n(b' ', n).collect())
+ }
+
+ /// Creates an [EncodedStr] with `encoding` that borrows this string's
+ /// contents.
+ pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> {
+ EncodedStr::new(&self.0, encoding)
+ }
+
+ /// Extends or shortens this [RawString] to exactly `len` bytes. If the
+ /// string needs to be extended, does so by appending spaces.
+ ///
+ /// If this shortens the string, it can cut off a multibyte character in the
+ /// middle.
+ pub fn resize(&mut self, len: usize) {
+ self.0.resize(len, b' ');
+ }
+
+ /// Removes any trailing ASCII spaces.
+ pub fn trim_end(&mut self) {
+ while self.0.pop_if(|c| *c == b' ').is_some() {}
+ }
+}
+
+impl Borrow<RawStr> for RawString {
+ fn borrow(&self) -> &RawStr {
+ RawStr::from_bytes(&self.0)
+ }
+}
+
+impl Deref for RawString {
+ type Target = RawStr;
+
+ fn deref(&self) -> &Self::Target {
+ self.borrow()
+ }
+}
+
+impl From<Cow<'_, [u8]>> for RawString {
+ fn from(value: Cow<'_, [u8]>) -> Self {
+ Self(value.into_owned())
+ }
+}
+
+impl From<Vec<u8>> for RawString {
+ fn from(source: Vec<u8>) -> Self {
+ Self(source)
+ }
+}
+
+impl From<&[u8]> for RawString {
+ fn from(source: &[u8]) -> Self {
+ Self(source.into())
+ }
+}
+
+impl Debug for RawString {
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ write!(f, "{:?}", *self)
+ }
+}
+
+/// A borrowed string in an unspecified encoding.
+///
+/// A [RawString] is usually associated with a [Variable] and uses the
+/// variable's character encoding. We assume that the encoding is one supported
+/// by [encoding_rs] with byte units (that is, not a `UTF-16` encoding). All of
+/// these encodings have some basic ASCII compatibility.
+///
+/// For an owned raw string, see [RawString].
+///
+/// [Variable]: crate::dictionary::Variable
+#[repr(transparent)]
+#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct RawStr(pub [u8]);
+
+impl RawStr {
+ /// Creates a new [RawStr] that contains `bytes`.
+ pub fn from_bytes(bytes: &[u8]) -> &Self {
+ // SAFETY: `RawStr` is a transparent wrapper around `[u8]`, so we can
+ // turn a reference to the wrapped type into a reference to the wrapper
+ // type.
+ unsafe { &*(bytes as *const [u8] as *const Self) }
+ }
+
+ /// Returns the raw string's contents as a borrowed byte slice.
+ pub fn as_bytes(&self) -> &[u8] {
+ &self.0
+ }
+
+ /// Returns an object that implements [Display] for printing this [RawStr],
+ /// given that it is encoded in `encoding`.
+ pub fn display(&self, encoding: &'static Encoding) -> DisplayRawString {
+ DisplayRawString(encoding.decode_without_bom_handling(&self.0).0)
+ }
+
+ /// Interprets the raw string's contents as the specified `encoding` and
+ /// returns it decoded into UTF-8, replacing any malformed sequences by
+ /// [REPLACEMENT_CHARACTER].
+ ///
+ /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER
+ pub fn decode(&self, encoding: &'static Encoding) -> Cow<'_, str> {
+ encoding.decode_without_bom_handling(&self.0).0
+ }
+
+ /// Compares this string and `other` for equality, ignoring trailing ASCII
+ /// spaces in either string for the purpose of comparison. (This is
+ /// acceptable because we assume that the encoding is ASCII-compatible.)
+ pub fn eq_ignore_trailing_spaces(&self, other: &RawStr) -> bool {
+ let mut this = self.0.iter();
+ let mut other = other.0.iter();
+ loop {
+ match (this.next(), other.next()) {
+ (Some(a), Some(b)) if a == b => (),
+ (Some(_), Some(_)) => return false,
+ (None, None) => return true,
+ (Some(b' '), None) => return this.all(|c| *c == b' '),
+ (None, Some(b' ')) => return other.all(|c| *c == b' '),
+ (Some(_), None) | (None, Some(_)) => return false,
+ }
+ }
+ }
+
+ /// Returns the string's length in bytes.
+ pub fn len(&self) -> usize {
+ self.0.len()
+ }
+}
+
+/// Helper struct for printing [RawStr] with [format!].
+///
+/// Created by [RawStr::display].
+pub struct DisplayRawString<'a>(Cow<'a, str>);
+
+impl<'a> Display for DisplayRawString<'a> {
+ // If `s` is valid UTF-8, displays it as UTF-8, otherwise as Latin-1
+ // (actually bytes interpreted as Unicode code points).
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{}", &self.0)
+ }
+}
+
+impl Debug for RawStr {
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ let s = from_utf8(&self.0).map_or_else(|_| decode_latin1(&self.0), Cow::from);
+ write!(f, "{s:?}")
+ }
+}
+
+/// The value of a [Variable](crate::dictionary::Variable).
+#[derive(Clone)]
+pub enum Datum {
+ /// A numeric value.
+ Number(
+ /// A number, or `None` for the system-missing value.
+ Option<f64>,
+ ),
+ /// A string value.
+ String(
+ /// The value, in the variable's encoding.
+ RawString,
+ ),
+}
+
+impl Debug for Datum {
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ match self {
+ Datum::Number(Some(number)) => write!(f, "{number:?}"),
+ Datum::Number(None) => write!(f, "SYSMIS"),
+ Datum::String(s) => write!(f, "{:?}", s),
+ }
+ }
+}
+
+impl PartialEq for Datum {
+ fn eq(&self, other: &Self) -> bool {
+ match (self, other) {
+ (Self::Number(Some(l0)), Self::Number(Some(r0))) => {
+ OrderedFloat(*l0) == OrderedFloat(*r0)
+ }
+ (Self::Number(None), Self::Number(None)) => true,
+ (Self::String(l0), Self::String(r0)) => l0 == r0,
+ _ => false,
+ }
+ }
+}
+
+impl Eq for Datum {}
+
+impl PartialOrd for Datum {
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl Ord for Datum {
+ fn cmp(&self, other: &Self) -> Ordering {
+ match (self, other) {
+ (Datum::Number(a), Datum::Number(b)) => match (a, b) {
+ (None, None) => Ordering::Equal,
+ (None, Some(_)) => Ordering::Less,
+ (Some(_), None) => Ordering::Greater,
+ (Some(a), Some(b)) => a.total_cmp(b),
+ },
+ (Datum::Number(_), Datum::String(_)) => Ordering::Less,
+ (Datum::String(_), Datum::Number(_)) => Ordering::Greater,
+ (Datum::String(a), Datum::String(b)) => a.cmp(b),
+ }
+ }
+}
+
+impl Hash for Datum {
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+ match self {
+ Datum::Number(number) => number.map(OrderedFloat).hash(state),
+ Datum::String(string) => string.hash(state),
+ }
+ }
+}
+
+impl Datum {
+ /// Constructs a new numerical [Datum] for the system-missing value.
+ pub const fn sysmis() -> Self {
+ Self::Number(None)
+ }
+
+ /// Returns the number inside this datum, or `None` if this is a string
+ /// datum.
+ pub fn as_number(&self) -> Option<Option<f64>> {
+ match self {
+ Datum::Number(number) => Some(*number),
+ Datum::String(_) => None,
+ }
+ }
+
+ /// Returns the string inside this datum, or `None` if this is a numeric
+ /// datum.
+ pub fn as_string(&self) -> Option<&RawString> {
+ match self {
+ Datum::Number(_) => None,
+ Datum::String(s) => Some(s),
+ }
+ }
+
+ /// Returns the string inside this datum as a mutable borrow, or `None` if
+ /// this is a numeric datum.
+ pub fn as_string_mut(&mut self) -> Option<&mut RawString> {
+ match self {
+ Datum::Number(_) => None,
+ Datum::String(s) => Some(s),
+ }
+ }
+
+ /// Returns true if this datum can be resized to the given `width` without
+ /// loss, which is true only if this datum and `width` are both string or
+ /// both numeric and, for string widths, if resizing would not drop any
+ /// non-space characters.
+ pub fn is_resizable(&self, width: VarWidth) -> bool {
+ match (self, width) {
+ (Datum::Number(_), VarWidth::Numeric) => true,
+ (Datum::String(s), VarWidth::String(new_width)) => {
+ let new_len = new_width as usize;
+ new_len >= s.len() || s.0[new_len..].iter().all(|c| *c == b' ')
+ }
+ _ => false,
+ }
+ }
+
+ /// Resizes this datum to the given `width`.
+ ///
+ /// # Panic
+ ///
+ /// Panics if resizing would change the datum from numeric to string or vice
+ /// versa.
+ pub fn resize(&mut self, width: VarWidth) {
+ match (self, width) {
+ (Datum::Number(_), VarWidth::Numeric) => (),
+ (Datum::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize),
+ _ => unreachable!(),
+ }
+ }
+
+ /// Returns the [VarType] corresponding to this datum.
+ pub fn var_type(&self) -> VarType {
+ match self {
+ Self::Number(_) => VarType::Numeric,
+ Self::String(_) => VarType::String,
+ }
+ }
+
+ /// Returns the [VarWidth] corresponding to this datum.
+ pub fn width(&self) -> VarWidth {
+ match self {
+ Datum::Number(_) => VarWidth::Numeric,
+ Datum::String(s) => VarWidth::String(s.len().try_into().unwrap()),
+ }
+ }
+
+ /// Compares this datum and `other` for equality, ignoring trailing ASCII
+ /// spaces in either, if they are both strings, for the purpose of
+ /// comparison.
+ pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool {
+ match (self, other) {
+ (Self::String(a), Self::String(b)) => a.eq_ignore_trailing_spaces(b),
+ _ => self == other,
+ }
+ }
+
+ /// Removes trailing ASCII spaces from this datum, if it is a string.
+ pub fn trim_end(&mut self) {
+ match self {
+ Self::Number(_) => (),
+ Self::String(s) => s.trim_end(),
+ }
+ }
+}
+
+impl From<f64> for Datum {
+ fn from(number: f64) -> Self {
+ Some(number).into()
+ }
+}
+
+impl From<Option<f64>> for Datum {
+ fn from(value: Option<f64>) -> Self {
+ Self::Number(value)
+ }
+}
+
+impl From<&str> for Datum {
+ fn from(value: &str) -> Self {
+ value.as_bytes().into()
+ }
+}
+
+impl From<&[u8]> for Datum {
+ fn from(value: &[u8]) -> Self {
+ Self::String(value.into())
+ }
+}
+
+/// A case in a data set.
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Case(
+ /// One [Datum] per variable in the corresponding [Dictionary], in the same
+ /// order.
+ ///
+ /// [Dictionary]: crate::dictionary::Dictionary
+ pub Vec<Datum>,
+);
+
+/// An owned string and its [Encoding].
+///
+/// The string is not guaranteed to be valid in the encoding.
+///
+/// The borrowed form of such a string is [EncodedStr].
+#[derive(Clone, Debug)]
+pub enum EncodedString {
+ /// A string in arbitrary encoding.
+ Encoded {
+ /// The bytes of the string.
+ bytes: Vec<u8>,
+
+ /// The string's encoding.
+ ///
+ /// This can be [UTF_8].
+ encoding: &'static Encoding,
+ },
+
+ /// A string that is in UTF-8 and known to be valid.
+ Utf8 {
+ /// The string.
+ s: String,
+ },
+}
+
+impl EncodedString {
+ /// Returns the string's [Encoding].
+ pub fn encoding(&self) -> &'static Encoding {
+ match self {
+ EncodedString::Encoded { encoding, .. } => encoding,
+ EncodedString::Utf8 { .. } => UTF_8,
+ }
+ }
+
+ /// Returns a borrowed form of this string.
+ pub fn borrowed(&self) -> EncodedStr<'_> {
+ match self {
+ EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding },
+ EncodedString::Utf8 { s } => EncodedStr::Utf8 { s },
+ }
+ }
+}
+
+impl<'a> From<EncodedStr<'a>> for EncodedString {
+ fn from(value: EncodedStr<'a>) -> Self {
+ match value {
+ EncodedStr::Encoded { bytes, encoding } => Self::Encoded {
+ bytes: bytes.into(),
+ encoding,
+ },
+ EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() },
+ }
+ }
+}
+
+/// A borrowed string and its [Encoding].
+///
+/// The string is not guaranteed to be valid in the encoding.
+///
+/// The owned form of such a string is [EncodedString].
+pub enum EncodedStr<'a> {
+ /// A string in an arbitrary encoding
+ Encoded {
+ /// The bytes of the string.
+ bytes: &'a [u8],
+
+ /// The string's encoding.
+ ///
+ /// THis can be [UTF_8].
+ encoding: &'static Encoding,
+ },
+
+ /// A string in UTF-8 that is known to be valid.
+ Utf8 {
+ /// The string.
+ s: &'a str,
+ },
+}
+
+impl<'a> EncodedStr<'a> {
+ /// Construct a new string with an arbitrary encoding.
+ pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
+ Self::Encoded { bytes, encoding }
+ }
+
+ /// Returns this string recoded in UTF-8. Invalid characters will be
+ /// replaced by [REPLACEMENT_CHARACTER].
+ ///
+ /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER
+ pub fn as_str(&self) -> Cow<'_, str> {
+ match self {
+ EncodedStr::Encoded { bytes, encoding } => {
+ encoding.decode_without_bom_handling(bytes).0
+ }
+ EncodedStr::Utf8 { s } => Cow::from(*s),
+ }
+ }
+
+ /// Returns the bytes in the string, in its encoding.
+ pub fn as_bytes(&self) -> &[u8] {
+ match self {
+ EncodedStr::Encoded { bytes, .. } => bytes,
+ EncodedStr::Utf8 { s } => s.as_bytes(),
+ }
+ }
+
+ /// Returns this string recoded in `encoding`. Invalid characters will be
+ /// replaced by [REPLACEMENT_CHARACTER].
+ ///
+ /// [REPLACEMENT_CHARACTER]: std::char::REPLACEMENT_CHARACTER
+ pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
+ match self {
+ EncodedStr::Encoded { bytes, encoding } => {
+ let utf8 = encoding.decode_without_bom_handling(bytes).0;
+ match encoding.encode(&utf8).0 {
+ Cow::Borrowed(_) => {
+ // Recoding into UTF-8 and then back did not change anything.
+ Cow::from(*bytes)
+ }
+ Cow::Owned(owned) => Cow::Owned(owned),
+ }
+ }
+ EncodedStr::Utf8 { s } => encoding.encode(s).0,
+ }
+ }
+
+ /// Returns true if this string is empty.
+ pub fn is_empty(&self) -> bool {
+ match self {
+ EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
+ EncodedStr::Utf8 { s } => s.is_empty(),
+ }
+ }
+
+ /// Returns a helper for displaying this string in double quotes.
+ pub fn quoted(&self) -> QuotedEncodedStr {
+ QuotedEncodedStr(self)
+ }
+}
+
+impl<'a> From<&'a str> for EncodedStr<'a> {
+ fn from(s: &'a str) -> Self {
+ Self::Utf8 { s }
+ }
+}
+
+impl<'a> From<&'a String> for EncodedStr<'a> {
+ fn from(s: &'a String) -> Self {
+ Self::Utf8 { s: s.as_str() }
+ }
+}
+
+/// Helper struct for displaying a [QuotedEncodedStr] in double quotes.
+pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>);
+
+impl Display for QuotedEncodedStr<'_> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{:?}", self.0.as_str())
+ }
+}
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
fmt::{Debug, Display, Formatter, Result as FmtResult},
hash::Hash,
- ops::{Bound, RangeBounds, RangeInclusive},
+ ops::{Bound, Not, RangeBounds, RangeInclusive},
str::FromStr,
};
use enum_map::{Enum, EnumMap};
use indexmap::IndexSet;
use num::integer::div_ceil;
-use ordered_float::OrderedFloat;
use thiserror::Error as ThisError;
use unicase::UniCase;
use crate::{
- format::Format,
+ data::Datum,
+ format::{DisplayPlain, Format},
identifier::{ByIdentifier, HasIdentifier, Identifier},
output::pivot::{Axis3, Dimension, Footnote, Footnotes, Group, PivotTable, Value},
settings::Show,
- sys::raw::{Alignment, CategoryLabels, Measure, MissingValues, RawString, VarType},
};
/// An index within [Dictionary::variables].
pub type DictIndex = usize;
/// Variable type.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum VarType {
    /// A numeric variable.
    Numeric,

    /// A string variable.
    String,
}

impl Not for VarType {
    type Output = Self;

    /// Returns the opposite variable type.
    fn not(self) -> Self::Output {
        if self == Self::Numeric {
            Self::String
        } else {
            Self::Numeric
        }
    }
}

impl Not for &VarType {
    type Output = VarType;

    fn not(self) -> Self::Output {
        !*self
    }
}

impl Display for VarType {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        let name = match self {
            VarType::Numeric => "numeric",
            VarType::String => "string",
        };
        write!(f, "{name}")
    }
}
+
/// [VarType], plus a width for [VarType::String].
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VarWidth {
}
}
-#[derive(Clone)]
-pub enum Datum<S = RawString> {
- Number(Option<f64>),
- String(S),
-}
-
-impl<S> Debug for Datum<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match self {
- Datum::Number(Some(number)) => write!(f, "{number:?}"),
- Datum::Number(None) => write!(f, "SYSMIS"),
- Datum::String(s) => write!(f, "{:?}", s),
- }
- }
-}
-
-impl PartialEq for Datum {
- fn eq(&self, other: &Self) -> bool {
- match (self, other) {
- (Self::Number(Some(l0)), Self::Number(Some(r0))) => {
- OrderedFloat(*l0) == OrderedFloat(*r0)
- }
- (Self::Number(None), Self::Number(None)) => true,
- (Self::String(l0), Self::String(r0)) => l0 == r0,
- _ => false,
- }
- }
-}
-
-impl Eq for Datum {}
-
-impl PartialOrd for Datum {
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- Some(self.cmp(other))
- }
-}
-
-impl Ord for Datum {
- fn cmp(&self, other: &Self) -> Ordering {
- match (self, other) {
- (Datum::Number(a), Datum::Number(b)) => match (a, b) {
- (None, None) => Ordering::Equal,
- (None, Some(_)) => Ordering::Less,
- (Some(_), None) => Ordering::Greater,
- (Some(a), Some(b)) => a.total_cmp(b),
- },
- (Datum::Number(_), Datum::String(_)) => Ordering::Less,
- (Datum::String(_), Datum::Number(_)) => Ordering::Greater,
- (Datum::String(a), Datum::String(b)) => a.cmp(b),
- }
- }
-}
-
-impl Hash for Datum {
- fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
- match self {
- Datum::Number(number) => number.map(OrderedFloat).hash(state),
- Datum::String(string) => string.hash(state),
- }
- }
-}
-
-impl Datum {
- pub const fn sysmis() -> Self {
- Self::Number(None)
- }
-
- pub fn as_number(&self) -> Option<Option<f64>> {
- match self {
- Datum::Number(number) => Some(*number),
- Datum::String(_) => None,
- }
- }
-
- pub fn as_string(&self) -> Option<&RawString> {
- match self {
- Datum::Number(_) => None,
- Datum::String(s) => Some(s),
- }
- }
-
- pub fn is_resizable(&self, width: VarWidth) -> bool {
- match (self, width) {
- (Datum::Number(_), VarWidth::Numeric) => true,
- (Datum::String(s), VarWidth::String(new_width)) => {
- let new_len = new_width as usize;
- new_len >= s.len() || s.0[new_len..].iter().all(|c| *c == b' ')
- }
- _ => false,
- }
- }
-
- pub fn resize(&mut self, width: VarWidth) {
- match (self, width) {
- (Datum::Number(_), VarWidth::Numeric) => (),
- (Datum::String(s), VarWidth::String(new_width)) => s.resize(new_width as usize),
- _ => unreachable!(),
- }
- }
-
- pub fn var_type(&self) -> VarType {
- match self {
- Self::Number(_) => VarType::Numeric,
- Self::String(_) => VarType::String,
- }
- }
-
- pub fn width(&self) -> VarWidth {
- match self {
- Datum::Number(_) => VarWidth::Numeric,
- Datum::String(s) => VarWidth::String(s.len().try_into().unwrap()),
- }
- }
-
- pub fn eq_ignore_trailing_spaces(&self, other: &Datum) -> bool {
- match (self, other) {
- (Self::String(a), Self::String(b)) => a.eq_ignore_trailing_spaces(b),
- _ => self == other,
- }
- }
-
- pub fn trim_end(&mut self) {
- match self {
- Self::Number(_) => (),
- Self::String(s) => s.trim_end(),
- }
- }
-}
-
-impl From<f64> for Datum {
- fn from(number: f64) -> Self {
- Some(number).into()
- }
-}
-
-impl From<Option<f64>> for Datum {
- fn from(value: Option<f64>) -> Self {
- Self::Number(value)
- }
-}
-
-impl From<&str> for Datum {
- fn from(value: &str) -> Self {
- value.as_bytes().into()
- }
-}
-
-impl From<&[u8]> for Datum {
- fn from(value: &[u8]) -> Self {
- Self::String(value.into())
- }
-}
-
/// A collection of variables, plus additional metadata.
#[derive(Clone, Debug)]
pub struct Dictionary {
}
}
+#[derive(Clone, Default)]
+pub struct MissingValues {
+ /// Individual missing values, up to 3 of them.
+ values: Vec<Datum>,
+
+ /// Optional range of missing values.
+ range: Option<MissingValueRange>,
+}
+
+impl Debug for MissingValues {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ DisplayMissingValues {
+ mv: self,
+ encoding: None,
+ }
+ .fmt(f)
+ }
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum MissingValuesError {
+ TooMany,
+ TooWide,
+ MixedTypes,
+}
+
+impl MissingValues {
+ pub fn new(
+ mut values: Vec<Datum>,
+ range: Option<MissingValueRange>,
+ ) -> Result<Self, MissingValuesError> {
+ if values.len() > 3 {
+ return Err(MissingValuesError::TooMany);
+ }
+
+ let mut var_type = None;
+ for value in values.iter_mut() {
+ value.trim_end();
+ match value.width() {
+ VarWidth::String(w) if w > 8 => return Err(MissingValuesError::TooWide),
+ _ => (),
+ }
+ if var_type.is_some_and(|t| t != value.var_type()) {
+ return Err(MissingValuesError::MixedTypes);
+ }
+ var_type = Some(value.var_type());
+ }
+
+ if var_type == Some(VarType::String) && range.is_some() {
+ return Err(MissingValuesError::MixedTypes);
+ }
+
+ Ok(Self { values, range })
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.values.is_empty() && self.range.is_none()
+ }
+
+ pub fn var_type(&self) -> Option<VarType> {
+ if let Some(datum) = self.values.first() {
+ Some(datum.var_type())
+ } else if self.range.is_some() {
+ Some(VarType::Numeric)
+ } else {
+ None
+ }
+ }
+
+ pub fn contains(&self, value: &Datum) -> bool {
+ if self
+ .values
+ .iter()
+ .any(|datum| datum.eq_ignore_trailing_spaces(value))
+ {
+ return true;
+ }
+
+ match value {
+ Datum::Number(Some(number)) => self.range.is_some_and(|range| range.contains(*number)),
+ _ => false,
+ }
+ }
+
+ pub fn is_resizable(&self, width: VarWidth) -> bool {
+ self.values.iter().all(|datum| datum.is_resizable(width))
+ && self.range.iter().all(|range| range.is_resizable(width))
+ }
+
+ pub fn resize(&mut self, width: VarWidth) {
+ for datum in &mut self.values {
+ datum.resize(width);
+ }
+ if let Some(range) = &mut self.range {
+ range.resize(width);
+ }
+ }
+
+ pub fn display(&self, encoding: &'static Encoding) -> DisplayMissingValues<'_> {
+ DisplayMissingValues {
+ mv: self,
+ encoding: Some(encoding),
+ }
+ }
+}
+
+pub struct DisplayMissingValues<'a> {
+ mv: &'a MissingValues,
+ encoding: Option<&'static Encoding>,
+}
+
+impl<'a> Display for DisplayMissingValues<'a> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ if let Some(range) = &self.mv.range {
+ write!(f, "{range}")?;
+ if !self.mv.values.is_empty() {
+ write!(f, "; ")?;
+ }
+ }
+
+ for (i, value) in self.mv.values.iter().enumerate() {
+ if i > 0 {
+ write!(f, "; ")?;
+ }
+ match self.encoding {
+ Some(encoding) => value.display_plain(encoding).fmt(f)?,
+ None => value.fmt(f)?,
+ }
+ }
+
+ if self.mv.is_empty() {
+ write!(f, "none")?;
+ }
+ Ok(())
+ }
+}
+
+#[derive(Copy, Clone)]
+pub enum MissingValueRange {
+ In { low: f64, high: f64 },
+ From { low: f64 },
+ To { high: f64 },
+}
+
+impl MissingValueRange {
+ pub fn new(low: f64, high: f64) -> Self {
+ const LOWEST: f64 = f64::MIN.next_up();
+ match (low, high) {
+ (f64::MIN | LOWEST, _) => Self::To { high },
+ (_, f64::MAX) => Self::From { low },
+ (_, _) => Self::In { low, high },
+ }
+ }
+
+ pub fn low(&self) -> Option<f64> {
+ match self {
+ MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low),
+ MissingValueRange::To { .. } => None,
+ }
+ }
+
+ pub fn high(&self) -> Option<f64> {
+ match self {
+ MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high),
+ MissingValueRange::From { .. } => None,
+ }
+ }
+
+ pub fn contains(&self, number: f64) -> bool {
+ match self {
+ MissingValueRange::In { low, high } => (*low..*high).contains(&number),
+ MissingValueRange::From { low } => number >= *low,
+ MissingValueRange::To { high } => number <= *high,
+ }
+ }
+
+ pub fn is_resizable(&self, width: VarWidth) -> bool {
+ width.is_numeric()
+ }
+
+ pub fn resize(&self, width: VarWidth) {
+ assert_eq!(width, VarWidth::Numeric);
+ }
+}
+
+impl Display for MissingValueRange {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ match self.low() {
+ Some(low) => low.display_plain().fmt(f)?,
+ None => write!(f, "LOW")?,
+ }
+
+ write!(f, " THRU ")?;
+
+ match self.high() {
+ Some(high) => high.display_plain().fmt(f)?,
+ None => write!(f, "HIGH")?,
+ }
+ Ok(())
+ }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Alignment {
+ Left,
+ Right,
+ Center,
+}
+
+impl Alignment {
+ pub fn default_for_type(var_type: VarType) -> Self {
+ match var_type {
+ VarType::Numeric => Self::Right,
+ VarType::String => Self::Left,
+ }
+ }
+
+ pub fn as_str(&self) -> &'static str {
+ match self {
+ Alignment::Left => "Left",
+ Alignment::Right => "Right",
+ Alignment::Center => "Center",
+ }
+ }
+}
+
+/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement).
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Measure {
+ /// Nominal values can only be compared for equality.
+ Nominal,
+
+ /// Ordinal values can be meaningfully ordered.
+ Ordinal,
+
+ /// Scale values can be meaningfully compared for the degree of difference.
+ Scale,
+}
+
+impl Measure {
+ pub fn default_for_type(var_type: VarType) -> Option<Measure> {
+ match var_type {
+ VarType::Numeric => None,
+ VarType::String => Some(Self::Nominal),
+ }
+ }
+
+ pub fn as_str(&self) -> &'static str {
+ match self {
+ Measure::Nominal => "Nominal",
+ Measure::Ordinal => "Ordinal",
+ Measure::Scale => "Scale",
+ }
+ }
+}
+
/// Source of category labels.
///
/// NOTE(review): semantics presumably follow the SPSS `MRSETS` command's
/// `CATEGORYLABELS` subcommand (variable labels vs. counted-value labels) —
/// confirm against the system-file reader that uses this type.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum CategoryLabels {
    /// Use variable labels as category labels.
    VarLabels,
    /// Use value labels of the counted values as category labels.
    CountedValues,
}
+
#[cfg(test)]
mod test {
use std::collections::HashSet;
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
-use enum_iterator::Sequence;
use smallvec::SmallVec;
-/// The endianness for integer and floating-point numbers in SPSS system files.
-///
-/// SPSS system files can declare IBM 370 and DEC VAX floating-point
-/// representations, but no file that uses either of these has ever been found
-/// in the wild, so this code does not handle them.
-#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Sequence)]
-pub enum Endian {
- /// Big-endian: MSB at lowest address.
- #[cfg_attr(target_endian = "big", default)]
- Big,
+pub use binrw::Endian;
- /// Little-endian: LSB at lowest address.
- #[cfg_attr(target_endian = "little", default)]
- Little,
-}
-
-impl Endian {
- pub fn identify_u32(expected_value: u32, bytes: [u8; 4]) -> Option<Self> {
- let as_big: u32 = Endian::Big.parse(bytes);
- let as_little: u32 = Endian::Little.parse(bytes);
- match (as_big == expected_value, as_little == expected_value) {
- (true, false) => Some(Endian::Big),
- (false, true) => Some(Endian::Little),
- _ => None,
- }
- }
-
- pub fn identify_f64(expected_value: f64, bytes: [u8; 8]) -> Option<Self> {
- let as_big: f64 = Endian::Big.parse(bytes);
- let as_little: f64 = Endian::Little.parse(bytes);
- match (as_big == expected_value, as_little == expected_value) {
- (true, false) => Some(Endian::Big),
- (false, true) => Some(Endian::Little),
- _ => None,
- }
- }
-
- pub fn to_smallvec<const N: usize>(self, mut value: u64, n: usize) -> SmallVec<[u8; N]> {
- debug_assert!(n <= 8);
- let mut vec = SmallVec::new();
- value <<= 8 * (8 - n);
- for _ in 0..n {
- vec.push((value >> 56) as u8);
- value <<= 8;
- }
- if self == Endian::Little {
- vec.reverse();
- }
- vec
- }
+pub fn endian_to_smallvec<const N: usize>(
+ endian: Endian,
+ mut value: u64,
+ n: usize,
+) -> SmallVec<[u8; N]> {
+ debug_assert!(n <= 8);
+ let mut vec = SmallVec::new();
+ value <<= 8 * (8 - n);
+ for _ in 0..n {
+ vec.push((value >> 56) as u8);
+ value <<= 8;
+ }
+ if endian == Endian::Little {
+ vec.reverse();
+ }
+ vec
}
pub trait ToBytes<T, const N: usize> {
use encoding_rs::UTF_8;
- use crate::lex::lexer::{Source, SourceFile};
+ use crate::lex::lexer::{Source, SyntaxFile};
use super::Engine;
#[ignore]
fn test_echo() {
let mut engine = Engine::new();
- engine.run(Source::new_default(&Arc::new(
- SourceFile::for_file_contents(
- "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(),
- Some("test.sps".to_string()),
- UTF_8,
- ),
- )));
+ engine.run(Source::new_default(&Arc::new(SyntaxFile::new(
+ "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(),
+ Some("test.sps".to_string()),
+ UTF_8,
+ ))));
}
#[test]
fn test_descriptives() {
let mut engine = Engine::new();
engine.run(Source::new_default(&Arc::new(
- SourceFile::for_file_contents(
+ SyntaxFile::new(
"DESCRIPTIVES VARIABLES=a (za) b to c/MISSING=x y z/MISSING=VARIABLE INCLUDE/STATISTICS=DEFAULT/SAVE/SORT=SKEWNESS (A)\n".to_string(),
Some("test.sps".to_string()),
UTF_8,
use crate::{
calendar::{calendar_offset_to_gregorian, day_of_year, month_name, short_month_name},
- dictionary::Datum,
- endian::ToBytes,
+ data::Datum,
+ endian::{endian_to_smallvec, ToBytes},
format::{Category, DateTemplate, Decimal, Format, NumberStyle, Settings, TemplateItem, Type},
settings::{EndianSettings, Settings as PsppSettings},
};
} else {
integer
};
- self.endian.output.to_smallvec(integer, self.format.w())
+ endian_to_smallvec(self.endian.output, integer, self.format.w())
}
fn pib(&self, number: Option<f64>) -> SmallVec<[u8; 16]> {
number
};
let integer = number.abs() as u64;
- self.endian.output.to_smallvec(integer, self.format.w())
+ endian_to_smallvec(self.endian.output, integer, self.format.w())
}
fn rb(&self, number: Option<f64>, w: usize) -> SmallVec<[u8; 16]> {
use smallvec::SmallVec;
use crate::{
- dictionary::Datum,
+ data::Datum,
endian::Endian,
format::{AbstractFormat, Epoch, Format, Settings, Type, UncheckedFormat, CC},
- lex::{
- scan::StringScanner,
- segment::Syntax,
- token::{Punct, Token},
- },
+ lex::{scan::StringScanner, segment::Syntax, Punct, Token},
settings::EndianSettings,
};
use unicode_width::UnicodeWidthStr;
use crate::{
- dictionary::{Datum, VarWidth},
- sys::raw::{self, RawString, VarType},
+ data::RawString,
+ data::Datum,
+ dictionary::{VarType, VarWidth},
+ sys::raw,
};
mod display;
}
}
-impl TryFrom<raw::Spec> for UncheckedFormat {
+impl TryFrom<raw::records::RawFormat> for UncheckedFormat {
type Error = Error;
- fn try_from(raw: raw::Spec) -> Result<Self, Self::Error> {
+ fn try_from(raw: raw::records::RawFormat) -> Result<Self, Self::Error> {
let raw = raw.0;
let raw_format = (raw >> 16) as u16;
let format = raw_format.try_into()?;
use crate::{
calendar::{calendar_gregorian_to_offset, DateError},
- dictionary::Datum,
+ data::{Datum, EncodedStr, EncodedString},
endian::{Endian, Parse},
format::{DateTemplate, Decimals, Settings, TemplateItem, Type},
settings::{EndianSettings, Settings as PsppSettings},
- sys::raw::{EncodedStr, EncodedString},
};
use encoding_rs::Encoding;
use smallstr::SmallString;
TrailingGarbage(String),
/// Invalid date.
- #[error("{0}")]
+ #[error(transparent)]
InvalidDate(#[from] DateError),
/// Invalid zoned decimal (Z) syntax.
use crate::{
calendar::{days_in_month, is_leap_year},
- dictionary::Datum,
+ data::{Datum, EncodedStr},
endian::Endian,
format::{
parse::{ParseError, ParseErrorKind, Sign},
Epoch, Format, Settings as FormatSettings, Type,
},
settings::EndianSettings,
- sys::raw::EncodedStr,
};
fn test(name: &str, type_: Type) {
#[error("\"!\" is not a valid identifier.")]
Bang,
- #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character {1:?}.")]
- BadFirstCharacter(String, char),
+ #[error("{string:?} may not be used as an identifier because it begins with disallowed character {c:?}.")]
+ BadFirstCharacter { string: String, c: char },
#[error(
- "\"{0}\" may not be used as an identifier because it contains disallowed character {1:?}."
+ "{string:?} may not be used as an identifier because it contains disallowed character {c:?}."
)]
- BadLaterCharacter(String, char),
+ BadLaterCharacter { string: String, c: char },
#[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")]
TooLong {
let mut i = s.chars();
let first = i.next().unwrap();
if !first.may_start_id() {
- return Err(Error::BadFirstCharacter(s.into(), first));
+ return Err(Error::BadFirstCharacter {
+ string: s.into(),
+ c: first,
+ });
}
for c in i {
if !c.may_continue_id() {
- return Err(Error::BadLaterCharacter(s.into(), c));
+ return Err(Error::BadLaterCharacter {
+ string: s.into(),
+ c,
+ });
}
}
Ok(())
_ => {
let s = self.0.into_inner();
let first = s.chars().next().unwrap();
- Err(Error::BadFirstCharacter(s, first))
+ Err(Error::BadFirstCharacter {
+ string: s,
+ c: first,
+ })
}
}
}
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
+//! Command names.
+//!
+//! PSPP needs to parse command names in a few contexts:
+//!
+//! - For executing command syntax.
+//!
+//! - For lexical analysis in [Auto](crate::lex::segment::Syntax::Auto) syntax
+//! mode. In this syntax mode, a line of syntax begins a new command if the
+//! line has no leading white space and it begins with the name of a known
+//! command.
+//!
+//! This module supports identifying commands for these purposes.
+
use crate::identifier::id_match_n_nonstatic;
+/// How a string matches the name of a command.
pub struct Match {
+ /// Is this an exact match?
+ ///
+ /// Words in command names are allowed to be abbreviated to their first 3
+ /// letters. An exact match means that none of the words were abbreviated.
pub exact: bool,
+
+ /// Number of words omitted from the command name.
+ ///
+ /// It means:
+ ///
+ /// - Positive: Number of words omitted at the end of the command name
+    ///   (command names may be abbreviated to only as many words as are
+    ///   needed to be unambiguous).
+ ///
+ /// - Negative: The absolute value is the number of extra words at the end
+ /// of the string that are not part of the command name.
+ ///
+ /// - Zero: The string and command name match exactly.
pub missing_words: isize,
}
-/// Compares `string` obtained from the user against the full name of a `command`,
-/// using this algorithm:
+/// Compares `string` obtained from the user against the full name of a
+/// `command`.
+///
+/// It uses this algorithm:
///
/// 1. Divide `command` into words `c[0]` through `c[n - 1]`.
///
/// 2. Divide `string` into words `s[0]` through `s[m - 1]`.
///
-/// 3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the keyword
-/// matching algorithm implemented by lex_id_match(). If any of them fail to
-/// match, then `string` does not match `command` and the function returns false.
+/// 3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the
+/// keyword matching algorithm implemented by lex_id_match(). If any of
+/// them fail to match, then `string` does not match `command` and the
+/// function returns `None`.
///
-/// 4. Otherwise, `string` and `command` match. Set *MISSING_WORDS to n - m. Set
-/// *EXACT to false if any of the `S[i]` were found to be abbreviated in the
-/// comparisons done in step 3, or to true if they were all exactly equal
-/// (modulo case). Return true.
+/// 4. Otherwise, `string` and `command` match. Returns a [Match] with
+/// `missing_words` set to `n - m` and `exact` set based on whether any of
+/// the words in the command name were abbreviated.
pub fn command_match(command: &str, string: &str) -> Option<Match> {
let mut command_words = command.split_whitespace();
let mut string_words = string.split_whitespace();
}
impl<'a, T> CommandMatcher<'a, T> {
+ /// Creates a new matcher for `string`.
pub fn new(string: &'a str) -> Self {
Self {
string,
}
/// Consider `command` as a candidate for the command name being parsed. If
- /// `command` is the correct command name, then [Self::get_match] will
- /// return `aux` later.
+ /// `command` is the correct command name, then [get_match](Self::get_match)
+ /// will return `aux` later.
pub fn add(&mut self, command: &str, aux: T) {
if let Some(Match {
missing_words,
}
}
+ /// Returns the best match among the possibilities passed to
+ /// [add](Self::add). Also returns the number of additional words that the
+ /// caller should consider reading, because the full command name might be
+ /// longer (if a command was returned) or because more words might be needed
+ /// for disambiguation (if no command name was returned).
pub fn get_match(self) -> (Option<T>, isize) {
if self.extensible {
(None, 1)
}
}
+/// List of all PSPP command names.
+///
+/// This includes commands that are not yet implemented.
pub const COMMAND_NAMES: &[&str] = &[
"2SLS",
"ACF",
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
+//! High-level lexical analysis.
+
use std::{
borrow::{Borrow, Cow},
collections::VecDeque,
use chardetng::EncodingDetector;
use encoding_rs::{Encoding, UTF_8};
-use thiserror::Error as ThisError;
use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
use crate::{
+ lex::scan::merge_tokens,
macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser},
message::{Category, Diagnostic, Location, Point, Severity},
settings::Settings,
};
use super::{
- scan::{MergeResult, ScanError, ScanToken, StringScanner},
+ scan::{MergeAction, ScanError, StringScanner},
segment::{Segmenter, Syntax},
token::Token,
};
Stop,
}
-pub struct SourceFile {
+/// A syntax file and its contents.
+///
+/// This holds the entire contents of a syntax file, which are always read into
+/// memory in their entirety, recoded into UTF-8 if necessary. It includes the
+/// file name (if any), and an index to make finding lines by line number more
+/// efficient.
+pub struct SyntaxFile {
-    /// `None` if this reader is not associated with a file.
+    /// `None` if this syntax file is not associated with an on-disk file.
file_name: Option<Arc<String>>,
- /// Encoding.
+ /// Original encoding.
#[allow(dead_code)]
encoding: &'static Encoding,
/// Source file contents.
- buffer: String,
+ contents: String,
-    /// Byte offsets into `buffer` of starts of lines. The first element is 0.
+    /// Byte offsets into `contents` of starts of lines. The first element is 0.
lines: Vec<usize>,
}
-impl SourceFile {
- fn new(buffer: String, encoding: &'static Encoding, file_name: Option<String>) -> Self {
- let lines = once(0)
- .chain(buffer.match_indices('\n').map(|(index, _s)| index + 1))
- .filter(|index| *index < buffer.len())
- .collect::<Vec<_>>();
- Self {
- file_name: file_name.map(Arc::new),
- encoding,
- buffer,
- lines,
- }
- }
-
+impl SyntaxFile {
+ /// Returns a `SyntaxFile` by reading `path` and recoding it from
+ /// `encoding`.
pub fn for_file<P>(path: P, encoding: Option<&'static Encoding>) -> IoResult<Self>
where
P: AsRef<Path>,
let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes);
Ok(Self::new(
contents.to_string(),
- encoding,
Some(path.as_ref().to_string_lossy().to_string()),
+ encoding,
))
}
- pub fn for_file_contents(
- contents: String,
- file_name: Option<String>,
- encoding: &'static Encoding,
- ) -> Self {
- Self::new(contents, encoding, file_name)
+ /// Creates a new `SyntaxFile` for `contents`, recording that `contents` was
+ /// originally encoded in `encoding` and that it was read from `file_name`.
+ pub fn new(contents: String, file_name: Option<String>, encoding: &'static Encoding) -> Self {
+ let lines = once(0)
+ .chain(contents.match_indices('\n').map(|(index, _s)| index + 1))
+ .filter(|index| *index < contents.len())
+ .collect::<Vec<_>>();
+ Self {
+ file_name: file_name.map(Arc::new),
+ encoding,
+ contents,
+ lines,
+ }
}
- pub fn for_string(contents: String, encoding: &'static Encoding) -> Self {
- Self::new(contents, encoding, None)
+ /// Returns a `SyntaxFile` for `contents`.
+ pub fn for_string(contents: String) -> Self {
+ Self::new(contents, None, UTF_8)
}
fn offset_to_point(&self, offset: usize) -> Point {
Point {
line: line as i32,
column: Some(
- self.buffer
+ self.contents
.get(self.lines[line - 1]..offset)
.unwrap_or_default()
.width() as i32
let line_number = line_number as usize;
let start = self.lines[line_number - 1];
let end = self.lines.get(line_number).copied().unwrap_or(
- self.buffer[start..]
+ self.contents[start..]
.find('\n')
.map(|ofs| ofs + start)
- .unwrap_or(self.buffer.len()),
+ .unwrap_or(self.contents.len()),
);
- self.buffer[start..end].strip_newline()
+ self.contents[start..end].strip_newline()
} else {
""
}
}
}
-impl Default for SourceFile {
+impl Default for SyntaxFile {
fn default() -> Self {
- Self::new(String::new(), UTF_8, None)
+ Self::new(String::new(), None, UTF_8)
}
}
}
/// A token in a [`Source`].
+///
+/// This relates a token back to where it was read, which allows for better
+/// error reporting.
pub struct LexToken {
- /// The regular token.
+ /// The token.
pub token: Token,
- pub file: Arc<SourceFile>,
+ /// The source file that the token was read from.
+ pub file: Arc<SyntaxFile>,
/// For a token obtained through the lexer in an ordinary way, this is the
/// location of the token in the [`Source`]'s buffer.
impl LexToken {
fn representation(&self) -> &str {
- &self.file.buffer[self.pos.clone()]
+ &self.file.contents[self.pos.clone()]
}
}
pos: RangeInclusive<usize>,
}
-#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
-pub enum Error {
- /// Error forming tokens from the input.
- #[error("{0}")]
- TokenError(#[from] ScanError),
-}
-
+/// A sequence of tokens.
pub struct Tokens {
tokens: Vec<LexToken>,
}
}
}
+/// An iterator for [TokenSlice].
pub struct TokenSliceIter<'a> {
slice: &'a TokenSlice,
rest: Range<usize>,
}
impl<'a> TokenSliceIter<'a> {
+ /// Creates a new iterator for `slice`.
pub fn new(slice: &'a TokenSlice) -> Self {
Self {
slice,
rest: slice.range.clone(),
}
}
+
+ /// Returns the tokens not yet visited by the iterator.
pub fn remainder(&self) -> TokenSlice {
TokenSlice {
backing: self.slice.backing.clone(),
}
}
+/// A subrange of tokens inside [Tokens].
#[derive(Clone)]
pub struct TokenSlice {
backing: Rc<Tokens>,
}
}
+#[allow(missing_docs)]
impl TokenSlice {
+ /// Create a new slice that initially contains all of `backing`.
pub fn new(backing: Rc<Tokens>) -> Self {
let range = 0..backing.tokens.len() - 1;
Self { backing, range }
fn tokens(&self) -> &[LexToken] {
&self.backing.tokens[self.range.clone()]
}
+ /// Returns the token with the given `index`, or `None` if `index` is out of
+ /// range.
pub fn get_token(&self, index: usize) -> Option<&Token> {
self.get(index).map(|token| &token.token)
}
+ /// Returns the [LexToken] with the given `index`, or `None` if `index` is
+ /// out of range.
pub fn get(&self, index: usize) -> Option<&LexToken> {
self.tokens().get(index)
}
+ /// Returns an error with the given `text`, citing these tokens.
pub fn error<S>(&self, text: S) -> Diagnostic
where
S: ToString,
self.diagnostic(Severity::Error, text.to_string())
}
+ /// Returns a warning with the given `text`, citing these tokens.
pub fn warning<S>(&self, text: S) -> Diagnostic
where
S: ToString,
self.subslice(self.len()..self.len())
}
- fn file(&self) -> Option<&Arc<SourceFile>> {
+ fn file(&self) -> Option<&Arc<SyntaxFile>> {
let first = self.first();
let last = self.last();
if Arc::ptr_eq(&first.file, &last.file) {
let start = token0.pos.start;
let end = token1.pos.end;
if start < end {
- return Some(&file.buffer[start..end]);
+ return Some(&file.contents[start..end]);
}
}
}
}
}
+/// A source of tokens read from a [SyntaxFile].
pub struct Source {
- file: Arc<SourceFile>,
+ file: Arc<SyntaxFile>,
segmenter: Segmenter,
seg_pos: usize,
lookahead: VecDeque<LexToken>,
}
impl Source {
- pub fn new_default(file: &Arc<SourceFile>) -> Self {
+ /// Creates a new `Source` reading from `file`, using the default [Syntax].
+ pub fn new_default(file: &Arc<SyntaxFile>) -> Self {
Self::new(file, Syntax::default())
}
- pub fn new(file: &Arc<SourceFile>, syntax: Syntax) -> Self {
+ /// Creates a new `Source` reading from `file` using `syntax`.
+ pub fn new(file: &Arc<SyntaxFile>, syntax: Syntax) -> Self {
Self {
file: file.clone(),
segmenter: Segmenter::new(syntax, false),
}
}
+ /// Reads and returns a whole command from this source, expanding the given
+ /// `macros` as it reads.
pub fn read_command(&mut self, macros: &MacroSet) -> Option<Tokens> {
loop {
if let Some(end) = self
if self.lookahead.is_empty() {
return None;
}
- let len = self.file.buffer.len();
+ let len = self.file.contents.len();
self.lookahead.push_back(LexToken {
token: Token::End,
file: self.file.clone(),
}
}
- pub fn read_lookahead(&mut self, macros: &MacroSet) -> bool {
+ fn read_lookahead(&mut self, macros: &MacroSet) -> bool {
let mut errors = Vec::new();
let mut pp = VecDeque::new();
while let Some((seg_len, seg_type)) = self
.segmenter
- .push(&self.file.buffer[self.seg_pos..], true)
+ .push(&self.file.contents[self.seg_pos..], true)
.unwrap()
{
let pos = self.seg_pos..self.seg_pos + seg_len;
self.seg_pos += seg_len;
- match ScanToken::from_segment(&self.file.buffer[pos.clone()], seg_type) {
+ match seg_type.to_token(&self.file.contents[pos.clone()]) {
None => (),
- Some(ScanToken::Token(token)) => {
+ Some(Ok(token)) => {
let end = token == Token::End;
pp.push_back(LexToken {
file: self.file.clone(),
break;
}
}
- Some(ScanToken::Error(error)) => errors.push(LexError { error, pos }),
+ Some(Err(error)) => errors.push(LexError { error, pos }),
}
}
// XXX report errors
};
while let Ok(Some(result)) =
- ScanToken::merge(|index| Ok(merge.get(index).map(|token| &token.token)))
+ merge_tokens(|index| Ok(merge.get(index).map(|token| &token.token)))
{
match result {
- MergeResult::Copy => self.lookahead.push_back(merge.pop_front().unwrap()),
- MergeResult::Expand { n, token } => {
+ MergeAction::Copy => self.lookahead.push_back(merge.pop_front().unwrap()),
+ MergeAction::Expand { n, token } => {
let first = &merge[0];
let last = &merge[n - 1];
self.lookahead.push_back(LexToken {
return;
};
for token in src.range(1..) {
- if parser.push(&token.token, &self.file.buffer[token.pos.clone()], &|e| {
+ if parser.push(&token.token, &self.file.contents[token.pos.clone()], &|e| {
println!("{e:?}")
}) == ParseStatus::Complete
{
use crate::macros::MacroSet;
- use super::{Source, SourceFile};
+ use super::{Source, SyntaxFile};
#[test]
fn test() {
CROSSTABS VARIABLES X (1,7) Y (1,7) /TABLES X BY Y.
"#;
- let file = Arc::new(SourceFile::for_file_contents(
+ let file = Arc::new(SyntaxFile::new(
String::from(code),
Some(String::from("crosstabs.sps")),
UTF_8,
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
-//! PSPP syntax scanning.
+//! PSPP lexical analysis.
//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". [segment] implements the segmentation phase and
-//! this module the scanning phase.
+//! PSPP divides traditional "lexical analysis" or "tokenization" into three
+//! phases:
//!
-//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
-//! labeled with a segment type. It outputs a stream of "scan tokens", which
-//! are the same as the tokens used by the PSPP parser with a few additional
-//! types.
+//! 1. A low level called "segmentation", implemented in the [segment] module.
+//! This labels syntax strings with [Segment](segment::Segment)s.
+//!
+//! 2. A middle level called "scanning", implemented in the [scan] module.
+//! This transforms and merges segments to form [Token]s.
+//!
+//! 3. A high level called "lexing", implemented in the [lexer] module. Lexing
+//! brings together multiple source files and invokes macro expansion on the
+//! tokens output by the scanner.
+
+// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
+#![cfg_attr(not(test), warn(missing_docs))]
pub mod command_name;
pub mod lexer;
pub mod scan;
pub mod segment;
-pub mod token;
+mod token;
+pub use token::{Punct, Token};
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
-//! PSPP lexical analysis.
+//! Mid-level lexical analysis.
//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". [mod segment] implements the segmentation phase and [mod
-//! scan] the scanning phase.
+//! This module implements mid-level lexical analysis using the segments
+//! output by the lower-level [segmentation phase](super::segment).
//!
-//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
-//! labeled with a segment type. It outputs a stream of "scan tokens", which
-//! are the same as the tokens used by the PSPP parser with a few additional
-//! types.
+//! Scanning accepts as input a stream of segments, which are UTF-8 strings
+//! labeled with a [segment type](super::segment::Segment). It outputs a stream
+//! of [Token]s used by the PSPP parser or an error.
use crate::identifier::{Identifier, ReservedWord};
use std::collections::VecDeque;
use thiserror::Error as ThisError;
+/// Error returned by [merge_tokens].
#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
pub enum ScanError {
/// Unterminated string constant.
/// Incomplete UTF-8 sequence.
#[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
- IncompleteUtf8 { substring: String, offset: usize },
+ IncompleteUtf8 {
+ /// Incomplete sequence.
+ substring: String,
+ /// Offset of start of sequence.
+ offset: usize,
+ },
/// Bad UTF-8 sequence.
#[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
- BadUtf8 { substring: String, offset: usize },
+ BadUtf8 {
+ /// Invalid sequence.
+ substring: String,
+ /// Offset of start of sequence.
+ offset: usize,
+ },
/// Invalid length Unicode string.
#[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
UnexpectedChar(char),
}
-/// The input or output to token merging.
-#[derive(Clone, Debug, PartialEq)]
-pub enum ScanToken {
- Token(Token),
- Error(ScanError),
-}
-
-impl ScanToken {
- pub fn token(self) -> Option<Token> {
- match self {
- ScanToken::Token(token) => Some(token),
- ScanToken::Error(_) => None,
- }
- }
-}
-
-/// The result of merging tokens.
+/// The action returned by [merge_tokens].
#[derive(Clone, Debug)]
-pub enum MergeResult {
+pub enum MergeAction {
/// Copy one token literally from input to output.
Copy,
},
}
+/// Used by [merge_tokens] to indicate that more input is needed.
#[derive(Copy, Clone, Debug)]
pub struct Incomplete;
-impl ScanToken {
- pub fn from_segment(s: &str, segment: Segment) -> Option<Self> {
- match segment {
- Segment::Number => Some(Self::Token(Token::Number(s.parse().unwrap()))),
+impl Segment {
+ /// Tries to transform this segment, which was obtained for `s`, into a
+ /// token. Returns one of:
+ ///
+ /// - `None`: This segment doesn't correspond to any token (because it is a
+ /// comment, white space, etc.) and can be dropped in tokenization.
+ ///
+ /// - `Some(Ok(token))`: This segment corresponds to the given token.
+ ///
+ /// - `Some(Err(error))`: The segment contains an error, which the caller
+ /// should report.
+ ///
+ /// The raw token (or error) that this function returns should ordinarily be
+ /// merged with adjacent tokens with [merge_tokens] or some higher-level
+ /// construct.
+ pub fn to_token(self, s: &str) -> Option<Result<Token, ScanError>> {
+ match self {
+ Segment::Number => Some(Ok(Token::Number(s.parse().unwrap()))),
Segment::QuotedString => {
// Trim quote mark from front and back.
let mut chars = s.chars();
'"' => ("\"", "\"\""),
_ => unreachable!(),
};
- Some(Self::Token(Token::String(
- s.replace(double_quote, single_quote),
- )))
+ Some(Ok(Token::String(s.replace(double_quote, single_quote))))
}
Segment::HexString => {
// Strip `X"` prefix and `"` suffix (or variations).
let s = &s[2..s.len() - 1];
for c in s.chars() {
if !c.is_ascii_hexdigit() {
- return Some(Self::Error(ScanError::BadHexDigit(c)));
+ return Some(Err(ScanError::BadHexDigit(c)));
}
}
if s.len() % 2 != 0 {
- return Some(Self::Error(ScanError::OddLengthHexString(s.len())));
+ return Some(Err(ScanError::OddLengthHexString(s.len())));
}
let bytes = s
.as_bytes()
})
.collect::<Vec<_>>();
match String::from_utf8(bytes) {
- Ok(string) => Some(Self::Token(Token::String(string))),
+ Ok(string) => Some(Ok(Token::String(string))),
Err(error) => {
let details = error.utf8_error();
let offset = details.valid_up_to() * 2;
.map(|len| offset + len * 2)
.unwrap_or(s.len());
let substring = String::from(&s[offset..end]);
- Some(Self::Error(if details.error_len().is_some() {
+ Some(Err(if details.error_len().is_some() {
ScanError::BadUtf8 { substring, offset }
} else {
ScanError::IncompleteUtf8 { substring, offset }
// Strip `U"` prefix and `"` suffix (or variations).
let s = &s[2..s.len() - 1];
if !(1..=8).contains(&s.len()) {
- return Some(Self::Error(ScanError::BadLengthUnicodeString(s.len())));
+ return Some(Err(ScanError::BadLengthUnicodeString(s.len())));
}
let Ok(code_point) = u32::from_str_radix(s, 16) else {
- return Some(Self::Error(ScanError::ExpectedCodePoint));
+ return Some(Err(ScanError::ExpectedCodePoint));
};
let Some(c) = char::from_u32(code_point) else {
- return Some(Self::Error(ScanError::BadCodePoint(code_point)));
+ return Some(Err(ScanError::BadCodePoint(code_point)));
};
- Some(Self::Token(Token::String(String::from(c))))
+ Some(Ok(Token::String(String::from(c))))
}
Segment::UnquotedString
| Segment::InlineData
| Segment::Document
| Segment::MacroBody
- | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))),
+ | Segment::MacroName => Some(Ok(Token::String(String::from(s)))),
Segment::Identifier => {
if let Ok(reserved_word) = ReservedWord::try_from(s) {
match reserved_word {
- ReservedWord::And => Some(Self::Token(Token::Punct(Punct::And))),
- ReservedWord::Or => Some(Self::Token(Token::Punct(Punct::Or))),
- ReservedWord::Not => Some(Self::Token(Token::Punct(Punct::Not))),
- ReservedWord::Eq => Some(Self::Token(Token::Punct(Punct::Eq))),
- ReservedWord::Ge => Some(Self::Token(Token::Punct(Punct::Ge))),
- ReservedWord::Gt => Some(Self::Token(Token::Punct(Punct::Gt))),
- ReservedWord::Le => Some(Self::Token(Token::Punct(Punct::Le))),
- ReservedWord::Lt => Some(Self::Token(Token::Punct(Punct::Lt))),
- ReservedWord::Ne => Some(Self::Token(Token::Punct(Punct::Ne))),
- ReservedWord::All => Some(Self::Token(Token::Punct(Punct::All))),
- ReservedWord::By => Some(Self::Token(Token::Punct(Punct::By))),
- ReservedWord::To => Some(Self::Token(Token::Punct(Punct::To))),
- ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))),
+ ReservedWord::And => Some(Ok(Token::Punct(Punct::And))),
+ ReservedWord::Or => Some(Ok(Token::Punct(Punct::Or))),
+ ReservedWord::Not => Some(Ok(Token::Punct(Punct::Not))),
+ ReservedWord::Eq => Some(Ok(Token::Punct(Punct::Eq))),
+ ReservedWord::Ge => Some(Ok(Token::Punct(Punct::Ge))),
+ ReservedWord::Gt => Some(Ok(Token::Punct(Punct::Gt))),
+ ReservedWord::Le => Some(Ok(Token::Punct(Punct::Le))),
+ ReservedWord::Lt => Some(Ok(Token::Punct(Punct::Lt))),
+ ReservedWord::Ne => Some(Ok(Token::Punct(Punct::Ne))),
+ ReservedWord::All => Some(Ok(Token::Punct(Punct::All))),
+ ReservedWord::By => Some(Ok(Token::Punct(Punct::By))),
+ ReservedWord::To => Some(Ok(Token::Punct(Punct::To))),
+ ReservedWord::With => Some(Ok(Token::Punct(Punct::With))),
}
} else {
- Some(Self::Token(Token::Id(Identifier::new(s).unwrap())))
+ Some(Ok(Token::Id(Identifier::new(s).unwrap())))
}
}
Segment::Punct => match s {
- "(" => Some(Self::Token(Token::Punct(Punct::LParen))),
- ")" => Some(Self::Token(Token::Punct(Punct::RParen))),
- "[" => Some(Self::Token(Token::Punct(Punct::LSquare))),
- "]" => Some(Self::Token(Token::Punct(Punct::RSquare))),
- "{" => Some(Self::Token(Token::Punct(Punct::LCurly))),
- "}" => Some(Self::Token(Token::Punct(Punct::RCurly))),
- "," => Some(Self::Token(Token::Punct(Punct::Comma))),
- "=" => Some(Self::Token(Token::Punct(Punct::Equals))),
- "-" => Some(Self::Token(Token::Punct(Punct::Dash))),
- "&" => Some(Self::Token(Token::Punct(Punct::And))),
- "|" => Some(Self::Token(Token::Punct(Punct::Or))),
- "+" => Some(Self::Token(Token::Punct(Punct::Plus))),
- "/" => Some(Self::Token(Token::Punct(Punct::Slash))),
- "*" => Some(Self::Token(Token::Punct(Punct::Asterisk))),
- "<" => Some(Self::Token(Token::Punct(Punct::Lt))),
- ">" => Some(Self::Token(Token::Punct(Punct::Gt))),
- "~" => Some(Self::Token(Token::Punct(Punct::Not))),
- ":" => Some(Self::Token(Token::Punct(Punct::Colon))),
- ";" => Some(Self::Token(Token::Punct(Punct::Semicolon))),
- "**" => Some(Self::Token(Token::Punct(Punct::Exp))),
- "<=" => Some(Self::Token(Token::Punct(Punct::Le))),
- "<>" => Some(Self::Token(Token::Punct(Punct::Ne))),
- "~=" => Some(Self::Token(Token::Punct(Punct::Ne))),
- ">=" => Some(Self::Token(Token::Punct(Punct::Ge))),
- "!" => Some(Self::Token(Token::Punct(Punct::Bang))),
- "%" => Some(Self::Token(Token::Punct(Punct::Percent))),
- "?" => Some(Self::Token(Token::Punct(Punct::Question))),
- "`" => Some(Self::Token(Token::Punct(Punct::Backtick))),
- "_" => Some(Self::Token(Token::Punct(Punct::Underscore))),
- "." => Some(Self::Token(Token::Punct(Punct::Dot))),
- "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))),
+ "(" => Some(Ok(Token::Punct(Punct::LParen))),
+ ")" => Some(Ok(Token::Punct(Punct::RParen))),
+ "[" => Some(Ok(Token::Punct(Punct::LSquare))),
+ "]" => Some(Ok(Token::Punct(Punct::RSquare))),
+ "{" => Some(Ok(Token::Punct(Punct::LCurly))),
+ "}" => Some(Ok(Token::Punct(Punct::RCurly))),
+ "," => Some(Ok(Token::Punct(Punct::Comma))),
+ "=" => Some(Ok(Token::Punct(Punct::Equals))),
+ "-" => Some(Ok(Token::Punct(Punct::Dash))),
+ "&" => Some(Ok(Token::Punct(Punct::And))),
+ "|" => Some(Ok(Token::Punct(Punct::Or))),
+ "+" => Some(Ok(Token::Punct(Punct::Plus))),
+ "/" => Some(Ok(Token::Punct(Punct::Slash))),
+ "*" => Some(Ok(Token::Punct(Punct::Asterisk))),
+ "<" => Some(Ok(Token::Punct(Punct::Lt))),
+ ">" => Some(Ok(Token::Punct(Punct::Gt))),
+ "~" => Some(Ok(Token::Punct(Punct::Not))),
+ ":" => Some(Ok(Token::Punct(Punct::Colon))),
+ ";" => Some(Ok(Token::Punct(Punct::Semicolon))),
+ "**" => Some(Ok(Token::Punct(Punct::Exp))),
+ "<=" => Some(Ok(Token::Punct(Punct::Le))),
+ "<>" => Some(Ok(Token::Punct(Punct::Ne))),
+ "~=" => Some(Ok(Token::Punct(Punct::Ne))),
+ ">=" => Some(Ok(Token::Punct(Punct::Ge))),
+ "!" => Some(Ok(Token::Punct(Punct::Bang))),
+ "%" => Some(Ok(Token::Punct(Punct::Percent))),
+ "?" => Some(Ok(Token::Punct(Punct::Question))),
+ "`" => Some(Ok(Token::Punct(Punct::Backtick))),
+ "_" => Some(Ok(Token::Punct(Punct::Underscore))),
+ "." => Some(Ok(Token::Punct(Punct::Dot))),
+ "!*" => Some(Ok(Token::Punct(Punct::BangAsterisk))),
_ => unreachable!("bad punctuator {s:?}"),
},
Segment::Shbang
| Segment::Comment
| Segment::Newline
| Segment::CommentCommand => None,
- Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
- Segment::StartDocument => {
- Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())))
- }
+ Segment::DoRepeatOverflow => Some(Err(ScanError::DoRepeatOverflow)),
+ Segment::StartDocument => Some(Ok(Token::Id(Identifier::new("DOCUMENT").unwrap()))),
Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
- Some(Self::Token(Token::End))
+ Some(Ok(Token::End))
}
- Segment::ExpectedQuote => Some(Self::Error(ScanError::ExpectedQuote)),
- Segment::ExpectedExponent => {
- Some(Self::Error(ScanError::ExpectedExponent(String::from(s))))
+ Segment::ExpectedQuote => Some(Err(ScanError::ExpectedQuote)),
+ Segment::ExpectedExponent => Some(Err(ScanError::ExpectedExponent(String::from(s)))),
+ Segment::UnexpectedChar => {
+ Some(Err(ScanError::UnexpectedChar(s.chars().next().unwrap())))
}
- Segment::UnexpectedChar => Some(Self::Error(ScanError::UnexpectedChar(
- s.chars().next().unwrap(),
- ))),
}
}
+}
- /// Attempts to merge a sequence of tokens together into a single token. The
- /// tokens are taken from the beginning of `input`. If successful, removes one
- /// or more token from the beginning of `input` and returnss the merged
- /// token. More input tokens might be needed; if so, leaves `input` alone and
- /// returns `None`. In the latter case, the caller should add more tokens to the
- /// input ([Token::End] or [Token::Punct(Punct::EndCmd)] is always sufficient).
- ///
- /// This performs two different kinds of token merging:
- ///
- /// - String concatenation, where syntax like `"a" + "b"` is converted into a
- /// single string token. This is definitely needed because the parser relies
- /// on it.
- ///
- /// - Negative number merging, where syntax like `-5` is converted from a pair
- /// of tokens (a dash and a positive number) into a single token (a negative
- /// number). This might not be needed anymore because the segmenter
- /// directly treats a dash followed by a number, with optional intervening
- /// white space, as a negative number. It's only needed if we want
- /// intervening comments to be allowed or for part of the negative number
- /// token to be produced by macro expansion.
- pub fn merge<'a, F>(get_token: F) -> Result<Option<MergeResult>, Incomplete>
- where
- F: Fn(usize) -> Result<Option<&'a Token>, Incomplete>,
- {
- let Some(token) = get_token(0)? else {
- return Ok(None);
- };
- match token {
- Token::Punct(Punct::Dash) => match get_token(1)? {
- Some(Token::Number(number)) if number.is_sign_positive() => {
- let number = *number;
- Ok(Some(MergeResult::Expand {
- n: 2,
- token: Token::Number(-number),
- }))
- }
- _ => Ok(Some(MergeResult::Copy)),
- },
- Token::String(_) => {
- let mut i = 0;
- while matches!(get_token(i * 2 + 1)?, Some(Token::Punct(Punct::Plus)))
- && matches!(get_token(i * 2 + 2)?, Some(Token::String(_)))
- {
- i += 1;
- }
- if i == 0 {
- Ok(Some(MergeResult::Copy))
- } else {
- let mut output = String::new();
- for i in 0..=i {
- let Token::String(s) = get_token(i * 2).unwrap().unwrap() else {
- unreachable!()
- };
- output.push_str(s);
- }
- Ok(Some(MergeResult::Expand {
- n: i * 2 + 1,
- token: Token::String(output),
- }))
+/// Attempts to merge a sequence of tokens together into a single token.
+///
+/// The tokens are taken from the beginning of `input`, which, given a
+/// 0-based token index, returns:
+///
+/// * `Ok(Some(token))`: The token with the given index.
+///
+/// * `Ok(None)`: End of input.
+///
+/// * `Err(Incomplete)`: The given token isn't available yet (it may or may not
+/// exist).
+///
+/// This function returns one of:
+///
+/// * `Ok(Some(MergeAction))`: How to transform one or more input tokens into an
+/// output token.
+///
+/// * `Ok(None)`: End of input. (Only returned if `input(0)` is `Ok(None)`.)
+///
+/// * `Err(Incomplete)`: More input tokens are needed. Call again with longer
+/// `input`. ([Token::End] or [Token::Punct(Punct::EndCmd)] is
+/// always sufficient as extra input.)
+///
+/// This performs two different kinds of token merging:
+///
+/// - String concatenation, where syntax like `"a" + "b"` is converted into a
+/// single string token. This is definitely needed because the parser relies
+/// on it.
+///
+/// - Negative number merging, where syntax like `-5` is converted from a pair
+/// of tokens (a dash and a positive number) into a single token (a negative
+/// number). This might not be needed anymore because the segmenter
+/// directly treats a dash followed by a number, with optional intervening
+/// white space, as a negative number. It's only needed if we want
+/// intervening comments to be allowed or for part of the negative number
+/// token to be produced by macro expansion.
+pub fn merge_tokens<'a, F>(input: F) -> Result<Option<MergeAction>, Incomplete>
+where
+ F: Fn(usize) -> Result<Option<&'a Token>, Incomplete>,
+{
+ let Some(token) = input(0)? else {
+ return Ok(None);
+ };
+ match token {
+ Token::Punct(Punct::Dash) => match input(1)? {
+ Some(Token::Number(number)) if number.is_sign_positive() => {
+ let number = *number;
+ Ok(Some(MergeAction::Expand {
+ n: 2,
+ token: Token::Number(-number),
+ }))
+ }
+ _ => Ok(Some(MergeAction::Copy)),
+ },
+ Token::String(_) => {
+ let mut i = 0;
+ while matches!(input(i * 2 + 1)?, Some(Token::Punct(Punct::Plus)))
+ && matches!(input(i * 2 + 2)?, Some(Token::String(_)))
+ {
+ i += 1;
+ }
+ if i == 0 {
+ Ok(Some(MergeAction::Copy))
+ } else {
+ let mut output = String::new();
+ for i in 0..=i {
+ let Token::String(s) = input(i * 2).unwrap().unwrap() else {
+ unreachable!()
+ };
+ output.push_str(s);
}
+ Ok(Some(MergeAction::Expand {
+ n: i * 2 + 1,
+ token: Token::String(output),
+ }))
}
- _ => Ok(Some(MergeResult::Copy)),
}
+ _ => Ok(Some(MergeAction::Copy)),
}
}
+/// Too-simple lexical analyzer for strings.
+///
+/// Given a string, [StringSegmenter] provides iteration over raw tokens.
+/// Unlike [StringScanner], [StringSegmenter] does not merge tokens using
+/// [merge_tokens]. Usually merging is desirable, so [StringScanner] should be
+/// preferred.
+///
+/// This is used as part of macro expansion.
pub struct StringSegmenter<'a> {
input: &'a str,
segmenter: Segmenter,
}
impl<'a> StringSegmenter<'a> {
+ /// Creates a new [StringSegmenter] for `input` using syntax variant `mode`.
+ /// See [Segmenter::new] for an explanation of `is_snippet`.
pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self {
Self {
input,
}
impl<'a> Iterator for StringSegmenter<'a> {
- type Item = (&'a str, ScanToken);
+ type Item = (&'a str, Result<Token, ScanError>);
fn next(&mut self) -> Option<Self::Item> {
loop {
let (s, rest) = self.input.split_at(seg_len);
self.input = rest;
- if let Some(token) = ScanToken::from_segment(s, seg_type) {
+ if let Some(token) = seg_type.to_token(s) {
return Some((s, token));
}
}
}
}
+/// Simple lexical analyzer for strings.
+///
+/// Given a string, [StringScanner] provides iteration over tokens.
pub struct StringScanner<'a> {
input: &'a str,
eof: bool,
}
impl<'a> StringScanner<'a> {
+ /// Creates a new [StringScanner] for `input` using syntax variant `mode`.
+ /// See [Segmenter::new] for an explanation of `is_snippet`.
pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self {
Self {
input,
}
}
- fn merge(&mut self, eof: bool) -> Result<Option<ScanToken>, Incomplete> {
- match ScanToken::merge(|index| {
+ fn merge(&mut self, eof: bool) -> Result<Option<Result<Token, ScanError>>, Incomplete> {
+ match merge_tokens(|index| {
if let Some(token) = self.tokens.get(index) {
Ok(Some(token))
} else if eof {
Err(Incomplete)
}
})? {
- Some(MergeResult::Copy) => Ok(Some(ScanToken::Token(self.tokens.pop_front().unwrap()))),
- Some(MergeResult::Expand { n, token }) => {
+ Some(MergeAction::Copy) => Ok(Some(Ok(self.tokens.pop_front().unwrap()))),
+ Some(MergeAction::Expand { n, token }) => {
self.tokens.drain(..n);
- Ok(Some(ScanToken::Token(token)))
+ Ok(Some(Ok(token)))
}
None => Ok(None),
}
}
+ /// Transforms this [StringScanner] into an iterator that includes only the
+ /// [Token]s, omitting [ScanError]s.
pub fn unwrapped(self) -> impl Iterator<Item = Token> + use<'a> {
- self.map(|scan_token| scan_token.token().unwrap())
+ self.map(|scan_token| scan_token.ok().unwrap())
}
}
impl Iterator for StringScanner<'_> {
- type Item = ScanToken;
+ type Item = Result<Token, ScanError>;
fn next(&mut self) -> Option<Self::Item> {
loop {
};
let (s, rest) = self.input.split_at(seg_len);
- match ScanToken::from_segment(s, seg_type) {
- Some(ScanToken::Error(error)) => {
+ match seg_type.to_token(s) {
+ Some(Err(error)) => {
if let Ok(Some(token)) = self.merge(true) {
return Some(token);
}
self.input = rest;
- return Some(ScanToken::Error(error));
+ return Some(Err(error));
}
- Some(ScanToken::Token(token)) => {
+ Some(Ok(token)) => {
self.tokens.push_back(token);
}
None => (),
},
};
-use super::{ScanError, ScanToken, StringScanner};
+use super::{ScanError, StringScanner};
fn print_token(token: &Token) {
match token {
}
#[track_caller]
-fn check_scan(input: &str, mode: Syntax, expected: &[ScanToken]) {
+fn check_scan(input: &str, mode: Syntax, expected: &[Result<Token, ScanError>]) {
let tokens = StringScanner::new(input, mode, false).collect::<Vec<_>>();
if tokens != expected {
for token in &tokens {
match token {
- ScanToken::Token(token) => {
- print!("ScanToken::Token(");
+ Ok(token) => {
+ print!("Ok(");
print_token(token);
print!(")");
}
- ScanToken::Error(error) => print!("ScanToken::Error(ScanError::{error:?})"),
+ Err(error) => print!("Err(ScanError::{error:?})"),
}
println!(",");
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("aB").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("i5").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("$x").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("@efg").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("@@.").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("!abcd").unwrap())),
- ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
- ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("#.#").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Dot)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Underscore)),
- ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("abcd.").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("abcd").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("QRSTUV").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("QrStUv").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("WXYZ").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Error(ScanError::UnexpectedChar('�')),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("a").unwrap())),
+ Ok(Token::Id(Identifier::new("aB").unwrap())),
+ Ok(Token::Id(Identifier::new("i5").unwrap())),
+ Ok(Token::Id(Identifier::new("$x").unwrap())),
+ Ok(Token::Id(Identifier::new("@efg").unwrap())),
+ Ok(Token::Id(Identifier::new("@@.").unwrap())),
+ Ok(Token::Id(Identifier::new("!abcd").unwrap())),
+ Ok(Token::Punct(Punct::BangAsterisk)),
+ Ok(Token::Punct(Punct::BangAsterisk)),
+ Ok(Token::Id(Identifier::new("a").unwrap())),
+ Ok(Token::Id(Identifier::new("#.#").unwrap())),
+ Ok(Token::Punct(Punct::Dot)),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::Punct(Punct::Underscore)),
+ Ok(Token::Id(Identifier::new("z").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("abcd.").unwrap())),
+ Ok(Token::Id(Identifier::new("abcd").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("QRSTUV").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("QrStUv").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("WXYZ").unwrap())),
+ Ok(Token::End),
+ Err(ScanError::UnexpectedChar('�')),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::Eq)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::All)),
- ScanToken::Token(Token::Punct(Punct::By)),
- ScanToken::Token(Token::Punct(Punct::To)),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::Eq)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::All)),
- ScanToken::Token(Token::Punct(Punct::By)),
- ScanToken::Token(Token::Punct(Punct::To)),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::Id(Identifier::new("andx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("orx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("notx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("eqx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("gex").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("gtx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("lex").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("ltx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("nex").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("allx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("byx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("tox").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("withx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("and.").unwrap())),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::End),
+ Ok(Token::Punct(Punct::And)),
+ Ok(Token::Punct(Punct::Or)),
+ Ok(Token::Punct(Punct::Not)),
+ Ok(Token::Punct(Punct::Eq)),
+ Ok(Token::Punct(Punct::Ge)),
+ Ok(Token::Punct(Punct::Gt)),
+ Ok(Token::Punct(Punct::Le)),
+ Ok(Token::Punct(Punct::Lt)),
+ Ok(Token::Punct(Punct::Ne)),
+ Ok(Token::Punct(Punct::All)),
+ Ok(Token::Punct(Punct::By)),
+ Ok(Token::Punct(Punct::To)),
+ Ok(Token::Punct(Punct::With)),
+ Ok(Token::Punct(Punct::And)),
+ Ok(Token::Punct(Punct::Or)),
+ Ok(Token::Punct(Punct::Not)),
+ Ok(Token::Punct(Punct::Eq)),
+ Ok(Token::Punct(Punct::Ge)),
+ Ok(Token::Punct(Punct::Gt)),
+ Ok(Token::Punct(Punct::Le)),
+ Ok(Token::Punct(Punct::Lt)),
+ Ok(Token::Punct(Punct::Ne)),
+ Ok(Token::Punct(Punct::All)),
+ Ok(Token::Punct(Punct::By)),
+ Ok(Token::Punct(Punct::To)),
+ Ok(Token::Punct(Punct::With)),
+ Ok(Token::Id(Identifier::new("andx").unwrap())),
+ Ok(Token::Id(Identifier::new("orx").unwrap())),
+ Ok(Token::Id(Identifier::new("notx").unwrap())),
+ Ok(Token::Id(Identifier::new("eqx").unwrap())),
+ Ok(Token::Id(Identifier::new("gex").unwrap())),
+ Ok(Token::Id(Identifier::new("gtx").unwrap())),
+ Ok(Token::Id(Identifier::new("lex").unwrap())),
+ Ok(Token::Id(Identifier::new("ltx").unwrap())),
+ Ok(Token::Id(Identifier::new("nex").unwrap())),
+ Ok(Token::Id(Identifier::new("allx").unwrap())),
+ Ok(Token::Id(Identifier::new("byx").unwrap())),
+ Ok(Token::Id(Identifier::new("tox").unwrap())),
+ Ok(Token::Id(Identifier::new("withx").unwrap())),
+ Ok(Token::Id(Identifier::new("and.").unwrap())),
+ Ok(Token::Punct(Punct::With)),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::Punct(Punct::Asterisk)),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Punct(Punct::LSquare)),
- ScanToken::Token(Token::Punct(Punct::RSquare)),
- ScanToken::Token(Token::Punct(Punct::Exp)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::Punct(Punct::Asterisk)),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Punct(Punct::LSquare)),
- ScanToken::Token(Token::Punct(Punct::RSquare)),
- ScanToken::Token(Token::Punct(Punct::Exp)),
- ScanToken::Token(Token::Punct(Punct::Percent)),
- ScanToken::Token(Token::Punct(Punct::Colon)),
- ScanToken::Token(Token::Punct(Punct::Semicolon)),
- ScanToken::Token(Token::Punct(Punct::Question)),
- ScanToken::Token(Token::Punct(Punct::Underscore)),
- ScanToken::Token(Token::Punct(Punct::Backtick)),
- ScanToken::Token(Token::Punct(Punct::LCurly)),
- ScanToken::Token(Token::Punct(Punct::RCurly)),
- ScanToken::Token(Token::Punct(Punct::Not)),
+ Ok(Token::Punct(Punct::Not)),
+ Ok(Token::Punct(Punct::And)),
+ Ok(Token::Punct(Punct::Or)),
+ Ok(Token::Punct(Punct::Equals)),
+ Ok(Token::Punct(Punct::Ge)),
+ Ok(Token::Punct(Punct::Gt)),
+ Ok(Token::Punct(Punct::Le)),
+ Ok(Token::Punct(Punct::Lt)),
+ Ok(Token::Punct(Punct::Ne)),
+ Ok(Token::Punct(Punct::Ne)),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Punct(Punct::Comma)),
+ Ok(Token::Punct(Punct::Dash)),
+ Ok(Token::Punct(Punct::Plus)),
+ Ok(Token::Punct(Punct::Asterisk)),
+ Ok(Token::Punct(Punct::Slash)),
+ Ok(Token::Punct(Punct::LSquare)),
+ Ok(Token::Punct(Punct::RSquare)),
+ Ok(Token::Punct(Punct::Exp)),
+ Ok(Token::Punct(Punct::Not)),
+ Ok(Token::Punct(Punct::And)),
+ Ok(Token::Punct(Punct::Or)),
+ Ok(Token::Punct(Punct::Equals)),
+ Ok(Token::Punct(Punct::Ge)),
+ Ok(Token::Punct(Punct::Gt)),
+ Ok(Token::Punct(Punct::Le)),
+ Ok(Token::Punct(Punct::Lt)),
+ Ok(Token::Punct(Punct::Ne)),
+ Ok(Token::Punct(Punct::Ne)),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Punct(Punct::Comma)),
+ Ok(Token::Punct(Punct::Dash)),
+ Ok(Token::Punct(Punct::Plus)),
+ Ok(Token::Punct(Punct::Asterisk)),
+ Ok(Token::Punct(Punct::Slash)),
+ Ok(Token::Punct(Punct::LSquare)),
+ Ok(Token::Punct(Punct::RSquare)),
+ Ok(Token::Punct(Punct::Exp)),
+ Ok(Token::Punct(Punct::Percent)),
+ Ok(Token::Punct(Punct::Colon)),
+ Ok(Token::Punct(Punct::Semicolon)),
+ Ok(Token::Punct(Punct::Question)),
+ Ok(Token::Punct(Punct::Underscore)),
+ Ok(Token::Punct(Punct::Backtick)),
+ Ok(Token::Punct(Punct::LCurly)),
+ Ok(Token::Punct(Punct::RCurly)),
+ Ok(Token::Punct(Punct::Not)),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Number(0.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Number(123.0)),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(0.1)),
- ScanToken::Token(Token::Number(0.1)),
- ScanToken::Token(Token::Number(0.1)),
- ScanToken::Token(Token::Number(50.0)),
- ScanToken::Token(Token::Number(0.6)),
- ScanToken::Token(Token::Number(70.0)),
- ScanToken::Token(Token::Number(60.0)),
- ScanToken::Token(Token::Number(0.006)),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Number(30.0)),
- ScanToken::Token(Token::Number(0.04)),
- ScanToken::Token(Token::Number(5.0)),
- ScanToken::Token(Token::Number(6.0)),
- ScanToken::Token(Token::Number(0.0007)),
- ScanToken::Token(Token::Number(12.3)),
- ScanToken::Token(Token::Number(4.56)),
- ScanToken::Token(Token::Number(789.0)),
- ScanToken::Token(Token::Number(999.0)),
- ScanToken::Token(Token::Number(0.0112)),
- ScanToken::Token(Token::End),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))),
- ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))),
+ Ok(Token::Number(0.0)),
+ Ok(Token::Number(1.0)),
+ Ok(Token::Number(1.0)),
+ Ok(Token::Number(1.0)),
+ Ok(Token::Number(1.0)),
+ Ok(Token::End),
+ Ok(Token::Number(123.0)),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::Number(1.0)),
+ Ok(Token::Number(0.1)),
+ Ok(Token::Number(0.1)),
+ Ok(Token::Number(0.1)),
+ Ok(Token::Number(50.0)),
+ Ok(Token::Number(0.6)),
+ Ok(Token::Number(70.0)),
+ Ok(Token::Number(60.0)),
+ Ok(Token::Number(0.006)),
+ Ok(Token::End),
+ Ok(Token::Number(30.0)),
+ Ok(Token::Number(0.04)),
+ Ok(Token::Number(5.0)),
+ Ok(Token::Number(6.0)),
+ Ok(Token::Number(0.0007)),
+ Ok(Token::Number(12.3)),
+ Ok(Token::Number(4.56)),
+ Ok(Token::Number(789.0)),
+ Ok(Token::Number(999.0)),
+ Ok(Token::Number(0.0112)),
+ Ok(Token::End),
+ Err(ScanError::ExpectedExponent(String::from("1e"))),
+ Ok(Token::Id(Identifier::new("e1").unwrap())),
+ Err(ScanError::ExpectedExponent(String::from("1e+"))),
+ Err(ScanError::ExpectedExponent(String::from("1e-"))),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Number(-0.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Number(-123.0)),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-50.0)),
- ScanToken::Token(Token::Number(-0.6)),
- ScanToken::Token(Token::Number(-70.0)),
- ScanToken::Token(Token::Number(-60.0)),
- ScanToken::Token(Token::Number(-0.006)),
- ScanToken::Token(Token::Number(-3.0)),
- ScanToken::Token(Token::Number(-0.04)),
- ScanToken::Token(Token::Number(-5.0)),
- ScanToken::Token(Token::Number(-6.0)),
- ScanToken::Token(Token::Number(-0.0007)),
- ScanToken::Token(Token::Number(-12.3)),
- ScanToken::Token(Token::Number(-4.56)),
- ScanToken::Token(Token::Number(-789.0)),
- ScanToken::Token(Token::Number(-999.0)),
- ScanToken::Token(Token::Number(-0.0112)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Punct(Punct::Dot)),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::End),
+ Ok(Token::Number(-0.0)),
+ Ok(Token::Number(-1.0)),
+ Ok(Token::Number(-1.0)),
+ Ok(Token::Number(-1.0)),
+ Ok(Token::Number(-1.0)),
+ Ok(Token::End),
+ Ok(Token::Number(-123.0)),
+ Ok(Token::End),
+ Ok(Token::Number(-0.1)),
+ Ok(Token::Number(-0.1)),
+ Ok(Token::Number(-0.1)),
+ Ok(Token::Number(-0.1)),
+ Ok(Token::Number(-50.0)),
+ Ok(Token::Number(-0.6)),
+ Ok(Token::Number(-70.0)),
+ Ok(Token::Number(-60.0)),
+ Ok(Token::Number(-0.006)),
+ Ok(Token::Number(-3.0)),
+ Ok(Token::Number(-0.04)),
+ Ok(Token::Number(-5.0)),
+ Ok(Token::Number(-6.0)),
+ Ok(Token::Number(-0.0007)),
+ Ok(Token::Number(-12.3)),
+ Ok(Token::Number(-4.56)),
+ Ok(Token::Number(-789.0)),
+ Ok(Token::Number(-999.0)),
+ Ok(Token::Number(-0.0112)),
+ Ok(Token::Number(-1.0)),
+ Ok(Token::Punct(Punct::Dash)),
+ Ok(Token::Punct(Punct::Dot)),
+ Err(ScanError::ExpectedExponent(String::from("-1e"))),
+ Ok(Token::Punct(Punct::Dash)),
+ Ok(Token::Id(Identifier::new("e1").unwrap())),
+ Err(ScanError::ExpectedExponent(String::from("-1e+"))),
+ Err(ScanError::ExpectedExponent(String::from("-1e-"))),
+ Ok(Token::Number(-1.0)),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::String(String::from("x"))),
- ScanToken::Token(Token::String(String::from("y"))),
- ScanToken::Token(Token::String(String::from("abc"))),
- ScanToken::Token(Token::String(String::from("Don't"))),
- ScanToken::Token(Token::String(String::from("Can't"))),
- ScanToken::Token(Token::String(String::from("Won't"))),
- ScanToken::Token(Token::String(String::from("\"quoted\""))),
- ScanToken::Token(Token::String(String::from("\"quoted\""))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from("'"))),
- ScanToken::Token(Token::String(String::from("\""))),
- ScanToken::Error(ScanError::ExpectedQuote),
- ScanToken::Error(ScanError::ExpectedQuote),
- ScanToken::Token(Token::String(String::from("xyzabcde"))),
- ScanToken::Token(Token::String(String::from("foobar"))),
- ScanToken::Token(Token::String(String::from("foobar"))),
- ScanToken::Token(Token::String(String::from("foo"))),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::String(String::from("bar"))),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::String(String::from("AB5152"))),
- ScanToken::Token(Token::String(String::from("4142QR"))),
- ScanToken::Token(Token::String(String::from("ABお"))),
- ScanToken::Token(Token::String(String::from("�あいうえお"))),
- ScanToken::Token(Token::String(String::from("abc�えxyz"))),
+ Ok(Token::String(String::from("x"))),
+ Ok(Token::String(String::from("y"))),
+ Ok(Token::String(String::from("abc"))),
+ Ok(Token::String(String::from("Don't"))),
+ Ok(Token::String(String::from("Can't"))),
+ Ok(Token::String(String::from("Won't"))),
+ Ok(Token::String(String::from("\"quoted\""))),
+ Ok(Token::String(String::from("\"quoted\""))),
+ Ok(Token::String(String::from(""))),
+ Ok(Token::String(String::from(""))),
+ Ok(Token::String(String::from("'"))),
+ Ok(Token::String(String::from("\""))),
+ Err(ScanError::ExpectedQuote),
+ Err(ScanError::ExpectedQuote),
+ Ok(Token::String(String::from("xyzabcde"))),
+ Ok(Token::String(String::from("foobar"))),
+ Ok(Token::String(String::from("foobar"))),
+ Ok(Token::String(String::from("foo"))),
+ Ok(Token::Punct(Punct::Plus)),
+ Ok(Token::End),
+ Ok(Token::String(String::from("bar"))),
+ Ok(Token::End),
+ Ok(Token::Punct(Punct::Plus)),
+ Ok(Token::String(String::from("AB5152"))),
+ Ok(Token::String(String::from("4142QR"))),
+ Ok(Token::String(String::from("ABお"))),
+ Ok(Token::String(String::from("�あいうえお"))),
+ Ok(Token::String(String::from("abc�えxyz"))),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("#").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Bang)),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("usr").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("bin").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("pspp").unwrap())),
+ Ok(Token::Id(Identifier::new("#").unwrap())),
+ Ok(Token::Punct(Punct::Bang)),
+ Ok(Token::Punct(Punct::Slash)),
+ Ok(Token::Id(Identifier::new("usr").unwrap())),
+ Ok(Token::Punct(Punct::Slash)),
+ Ok(Token::Id(Identifier::new("bin").unwrap())),
+ Ok(Token::Punct(Punct::Slash)),
+ Ok(Token::Id(Identifier::new("pspp").unwrap())),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("com").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("is").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("ambiguous").unwrap())),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::Id(Identifier::new("COMPUTE").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("next").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("com").unwrap())),
+ Ok(Token::Id(Identifier::new("is").unwrap())),
+ Ok(Token::Id(Identifier::new("ambiguous").unwrap())),
+ Ok(Token::Punct(Punct::With)),
+ Ok(Token::Id(Identifier::new("COMPUTE").unwrap())),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("next").unwrap())),
+ Ok(Token::Id(Identifier::new("command").unwrap())),
+ Ok(Token::End),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
- ScanToken::Token(Token::String(String::from("DOCUMENT one line."))),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
- ScanToken::Token(Token::String(String::from("DOC more"))),
- ScanToken::Token(Token::String(String::from(" than"))),
- ScanToken::Token(Token::String(String::from(" one"))),
- ScanToken::Token(Token::String(String::from(" line."))),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
- ScanToken::Token(Token::String(String::from("docu"))),
- ScanToken::Token(Token::String(String::from("first.paragraph"))),
- ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from("second paragraph."))),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("DOCUMENT").unwrap())),
+ Ok(Token::String(String::from("DOCUMENT one line."))),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("DOCUMENT").unwrap())),
+ Ok(Token::String(String::from("DOC more"))),
+ Ok(Token::String(String::from(" than"))),
+ Ok(Token::String(String::from(" one"))),
+ Ok(Token::String(String::from(" line."))),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("DOCUMENT").unwrap())),
+ Ok(Token::String(String::from("docu"))),
+ Ok(Token::String(String::from("first.paragraph"))),
+ Ok(Token::String(String::from("isn't parsed as tokens"))),
+ Ok(Token::String(String::from(""))),
+ Ok(Token::String(String::from("second paragraph."))),
+ Ok(Token::End),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("FIL").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("label").unwrap())),
- ScanToken::Token(Token::String(String::from("isn't quoted"))),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
- ScanToken::Token(Token::String(String::from("is quoted"))),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
- ScanToken::Token(Token::String(String::from("not quoted here either"))),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("FIL").unwrap())),
+ Ok(Token::Id(Identifier::new("label").unwrap())),
+ Ok(Token::String(String::from("isn't quoted"))),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("FILE").unwrap())),
+ Ok(Token::Id(Identifier::new("lab").unwrap())),
+ Ok(Token::String(String::from("is quoted"))),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("FILE").unwrap())),
+ Ok(Token::Id(Identifier::new("lab").unwrap())),
+ Ok(Token::String(String::from("not quoted here either"))),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("begin").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::String(String::from("123"))),
- ScanToken::Token(Token::String(String::from("xxx"))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("BEG").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("DAT").unwrap())),
- ScanToken::Token(Token::String(String::from("5 6 7 /* x"))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from("end data"))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("begin").unwrap())),
+ Ok(Token::Id(Identifier::new("data").unwrap())),
+ Ok(Token::End),
+ Ok(Token::String(String::from("123"))),
+ Ok(Token::String(String::from("xxx"))),
+ Ok(Token::Id(Identifier::new("end").unwrap())),
+ Ok(Token::Id(Identifier::new("data").unwrap())),
+ Ok(Token::End),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("BEG").unwrap())),
+ Ok(Token::Id(Identifier::new("DAT").unwrap())),
+ Ok(Token::String(String::from("5 6 7 /* x"))),
+ Ok(Token::String(String::from(""))),
+ Ok(Token::String(String::from("end data"))),
+ Ok(Token::Id(Identifier::new("end").unwrap())),
+ Ok(Token::Id(Identifier::new("data").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::String(String::from(" do repeat a=1 thru 5."))),
- ScanToken::Token(Token::String(String::from("another command."))),
- ScanToken::Token(Token::String(String::from("second command"))),
- ScanToken::Token(Token::String(String::from("+ third command."))),
- ScanToken::Token(Token::String(String::from(
+ Ok(Token::Id(Identifier::new("do").unwrap())),
+ Ok(Token::Id(Identifier::new("repeat").unwrap())),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::Punct(Punct::Equals)),
+ Ok(Token::Id(Identifier::new("a").unwrap())),
+ Ok(Token::Id(Identifier::new("b").unwrap())),
+ Ok(Token::Id(Identifier::new("c").unwrap())),
+ Ok(Token::Id(Identifier::new("y").unwrap())),
+ Ok(Token::Punct(Punct::Equals)),
+ Ok(Token::Id(Identifier::new("d").unwrap())),
+ Ok(Token::Id(Identifier::new("e").unwrap())),
+ Ok(Token::Id(Identifier::new("f").unwrap())),
+ Ok(Token::End),
+ Ok(Token::String(String::from(" do repeat a=1 thru 5."))),
+ Ok(Token::String(String::from("another command."))),
+ Ok(Token::String(String::from("second command"))),
+ Ok(Token::String(String::from("+ third command."))),
+ Ok(Token::String(String::from(
"end /* x */ /* y */ repeat print.",
))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("end").unwrap())),
+ Ok(Token::Id(Identifier::new("repeat").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Batch,
&[
- ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))),
- ScanToken::Token(Token::String(String::from("another command"))),
- ScanToken::Token(Token::String(String::from("second command"))),
- ScanToken::Token(Token::String(String::from("+ third command"))),
- ScanToken::Token(Token::String(String::from(
+ Ok(Token::Id(Identifier::new("do").unwrap())),
+ Ok(Token::Id(Identifier::new("repeat").unwrap())),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::Punct(Punct::Equals)),
+ Ok(Token::Id(Identifier::new("a").unwrap())),
+ Ok(Token::Id(Identifier::new("b").unwrap())),
+ Ok(Token::Id(Identifier::new("c").unwrap())),
+ Ok(Token::Id(Identifier::new("y").unwrap())),
+ Ok(Token::Punct(Punct::Equals)),
+ Ok(Token::Id(Identifier::new("d").unwrap())),
+ Ok(Token::Id(Identifier::new("e").unwrap())),
+ Ok(Token::Id(Identifier::new("f").unwrap())),
+ Ok(Token::End),
+ Ok(Token::String(String::from("do repeat a=1 thru 5"))),
+ Ok(Token::String(String::from("another command"))),
+ Ok(Token::String(String::from("second command"))),
+ Ok(Token::String(String::from("+ third command"))),
+ Ok(Token::String(String::from(
"end /* x */ /* y */ repeat print",
))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("#a").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::String(String::from(" inner command"))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ Ok(Token::Id(Identifier::new("end").unwrap())),
+ Ok(Token::Id(Identifier::new("repeat").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("do").unwrap())),
+ Ok(Token::Id(Identifier::new("repeat").unwrap())),
+ Ok(Token::Id(Identifier::new("#a").unwrap())),
+ Ok(Token::Punct(Punct::Equals)),
+ Ok(Token::Number(1.0)),
+ Ok(Token::End),
+ Ok(Token::String(String::from(" inner command"))),
+ Ok(Token::Id(Identifier::new("end").unwrap())),
+ Ok(Token::Id(Identifier::new("repeat").unwrap())),
],
);
}
"#,
Syntax::Batch,
&[
- ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("another").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("line").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("of").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("second").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("third").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("fourth").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("fifth").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("first").unwrap())),
+ Ok(Token::Id(Identifier::new("command").unwrap())),
+ Ok(Token::Id(Identifier::new("another").unwrap())),
+ Ok(Token::Id(Identifier::new("line").unwrap())),
+ Ok(Token::Id(Identifier::new("of").unwrap())),
+ Ok(Token::Id(Identifier::new("first").unwrap())),
+ Ok(Token::Id(Identifier::new("command").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("second").unwrap())),
+ Ok(Token::Id(Identifier::new("command").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("third").unwrap())),
+ Ok(Token::Id(Identifier::new("command").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("fourth").unwrap())),
+ Ok(Token::Id(Identifier::new("command").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("fifth").unwrap())),
+ Ok(Token::Id(Identifier::new("command").unwrap())),
+ Ok(Token::End),
],
);
}
use crate::{
identifier::Identifier,
lex::{
- scan::ScanToken,
segment::Syntax,
token::{Punct, Token},
},
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::String(String::from("var1 var2 var3"))),
+ Ok(Token::Id(Identifier::new("!enddefine").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from(" var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::String(String::from(" var1 var2 var3"))),
+ Ok(Token::Id(Identifier::new("!enddefine").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::String(String::from("var1 var2 var3"))),
+ Ok(Token::Id(Identifier::new("!enddefine").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::String(String::from("var1 var2 var3"))),
+ Ok(Token::Id(Identifier::new("!enddefine").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Id(Identifier::new("!enddefine").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::String(String::from(""))),
+ Ok(Token::String(String::from(""))),
+ Ok(Token::Id(Identifier::new("!enddefine").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Id(Identifier::new("a").unwrap())),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Punct(Punct::Comma)),
+ Ok(Token::Id(Identifier::new("b").unwrap())),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Punct(Punct::Comma)),
+ Ok(Token::Id(Identifier::new("c").unwrap())),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Id(Identifier::new("!enddefine").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Id(Identifier::new("a").unwrap())),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Punct(Punct::Comma)),
+ Ok(Token::Id(Identifier::new("b").unwrap())),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Punct(Punct::Comma)),
+ Ok(Token::Id(Identifier::new("c").unwrap())),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::Id(Identifier::new("!enddefine").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("content 1"))),
- ScanToken::Token(Token::String(String::from("content 2"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::Punct(Punct::Comma)),
+ Ok(Token::Id(Identifier::new("y").unwrap())),
+ Ok(Token::Punct(Punct::Comma)),
+ Ok(Token::Id(Identifier::new("z").unwrap())),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::String(String::from("content 1"))),
+ Ok(Token::String(String::from("content 2"))),
+ Ok(Token::Id(Identifier::new("!enddefine").unwrap())),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("data").unwrap())),
+ Ok(Token::Id(Identifier::new("list").unwrap())),
+ Ok(Token::Punct(Punct::Slash)),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::Number(1.0)),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("data").unwrap())),
+ Ok(Token::Id(Identifier::new("list").unwrap())),
+ Ok(Token::Punct(Punct::Slash)),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::Number(1.0)),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("data").unwrap())),
+ Ok(Token::Id(Identifier::new("list").unwrap())),
+ Ok(Token::Punct(Punct::Slash)),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::Number(1.0)),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::End),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::End),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::End),
+ Ok(Token::Id(Identifier::new("data").unwrap())),
+ Ok(Token::Id(Identifier::new("list").unwrap())),
+ Ok(Token::Punct(Punct::Slash)),
+ Ok(Token::Id(Identifier::new("x").unwrap())),
+ Ok(Token::Number(1.0)),
+ Ok(Token::End),
],
);
}
"#,
Syntax::Auto,
&[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("content line 1"))),
- ScanToken::Token(Token::String(String::from("content line 2"))),
+ Ok(Token::Id(Identifier::new("define").unwrap())),
+ Ok(Token::String(String::from("!macro1"))),
+ Ok(Token::Punct(Punct::LParen)),
+ Ok(Token::Punct(Punct::RParen)),
+ Ok(Token::String(String::from("content line 1"))),
+ Ok(Token::String(String::from("content line 2"))),
],
);
}
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
-//! Syntax segmentation.
+//! Low-level lexical analysis.
//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". This module implements the segmentation phase.
-//! [`super::scan`] contains declarations for the scanning phase.
+//! PSPP divides traditional "lexical analysis" or "tokenization" into [three
+//! phases](super). This module implements the low-level segmentation phase.
//!
//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
//! (a segment type) for each byte or contiguous sequence of bytes in the input.
//! form a single string token [Token::String]. Still other segments are
//! ignored (e.g. [Segment::Spaces]) or trigger special behavior such as error
//! messages later in tokenization (e.g. [Segment::ExpectedQuote]).
+//!
+//! [Token::Id]: crate::lex::token::Token::Id
+//! [Token::String]: crate::lex::token::Token::String
use std::cmp::Ordering;
-#[cfg(doc)]
-use crate::lex::token::Token;
-
use crate::{
identifier::{id_match, id_match_n, IdentifierChar},
prompt::PromptStyle,
pub enum Syntax {
/// Try to interpret input correctly regardless of whether it is written
/// for interactive or batch syntax.
+ ///
+ /// This is `Syntax::default()`.
#[default]
Auto,
}
/// The type of a segment.
+///
+/// A [Segment] is a label for a string slice and is normally paired with one.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Segment {
+ /// A number.
Number,
+
+ /// A quoted string (`'...'` or `"..."`).
QuotedString,
+
+ /// A hexadecimal string (`X'...'` or `X"..."`).
HexString,
+
+ /// A Unicode string (`U'...'` or `U"..."`).
UnicodeString,
+
+ /// An unquoted string.
+ ///
+ /// Unquoted strings appear only in a few special-case constructs, such as
+ /// the `FILE LABEL` command.
UnquotedString,
+
+ /// An identifier.
Identifier,
+
+ /// A punctuator or operator.
Punct,
+
+ /// `#!` at the beginning of a syntax file only.
Shbang,
+
+ /// Spaces.
Spaces,
+
+ /// A comment (`/* ... */`).
Comment,
+
+ /// New-line.
Newline,
+
+ /// A comment command (`* ...` or `COMMENT ...`).
CommentCommand,
+
+ /// In a `DO REPEAT` command, one of the lines to be repeated.
DoRepeatCommand,
+
+ /// Indicates `DO REPEAT` nested more deeply than supported.
DoRepeatOverflow,
+
+ /// A line of inline data inside `BEGIN DATA`...`END DATA`.
InlineData,
+
+ /// In `!DEFINE`, an identifier for the macro being defined.
+ ///
+ /// Distinguished from [Identifier](Self::Identifier) because a `MacroName`
+ /// must never be macro-expanded.
MacroName,
+
+ /// Contents of `!DEFINE`...`!ENDDEFINE`.
MacroBody,
+
+ /// Represents the `DOCUMENT` beginning a `DOCUMENT` command.
+ ///
+ /// This token is not associated with any text: the actual `DOCUMENT`
+ /// keyword is part of the following [Document](Self::Document) segment.
+ /// This is because documents include the `DOCUMENT` keyword.
StartDocument,
+
+ /// One of the lines of documents in a `DOCUMENT` command.
+ ///
+ /// The first line of a document includes the `DOCUMENT` keyword itself.
Document,
+
+ /// A command separator.
+ ///
+ /// This segment is usually for `+`, `-`, or `.` at the beginning of a line.
StartCommand,
+
+ /// A command separator.
+ ///
+ /// This segment is usually for a blank line. It also appears at the end of
+ /// a file.
SeparateCommands,
+
+ /// A command separator.
+ ///
+ /// This segment is for `.` at the end of a line.
EndCommand,
+
+ /// Missing quote at the end of a line.
+ ///
+ /// This segment contains a partial quoted string. It starts with a quote
+ /// mark (`"` or `'`, possibly preceded by `X` or `U`) but goes to the end
+ /// of the line without the matching end quote mark.
ExpectedQuote,
+
+ /// Missing exponent in number.
+ ///
+ /// This segment contains a number that ends with `E` or `E+` or `E-`
+ /// without a following exponent.
ExpectedExponent,
+
+ /// Unexpected character.
+ ///
+ /// The segment is a single character that isn't valid in syntax.
UnexpectedChar,
}
bitflags! {
#[derive(Copy, Clone, Debug)]
- pub struct Substate: u8 {
+ struct Substate: u8 {
const START_OF_LINE = 1;
const START_OF_COMMAND = 2;
}
}
+/// Used by [Segmenter] to indicate that more input is needed.
+#[derive(Copy, Clone, Debug)]
+pub struct Incomplete;
+
+/// Labels syntax input with [Segment]s.
#[derive(Copy, Clone)]
pub struct Segmenter {
state: (State, Substate),
syntax: Syntax,
}
-#[derive(Copy, Clone, Debug)]
-pub struct Incomplete;
-
impl Segmenter {
/// Returns a segmenter with the given `syntax`.
///
}
}
+ /// Returns the [Syntax] variant passed in to [new](Self::new).
pub fn syntax(&self) -> Syntax {
self.syntax
}
/// Returns the style of command prompt to display to an interactive user
/// for input in the current state. The return value is most accurate
/// with [Syntax::Interactive] syntax and at the beginning of a line (that
- /// is, if [`Segmenter::push`] consumed as much as possible of the input up
- /// to a new-line).
+ /// is, if [Segmenter::push] consumed as much as possible of the input up to
+ /// a new-line).
pub fn prompt(&self) -> PromptStyle {
match self.state.0 {
State::Shbang => PromptStyle::First,
}
}
- /// Attempts to label a prefix of the remaining input with a segment type.
- /// The caller supplies a prefix of the remaining input as `input`. If
- /// `eof` is true, then `input` is the entire (remainder) of the input; if
- /// `eof` is false, then further input is potentially available.
- ///
- /// The input may contain '\n' or '\r\n' line ends in any combination.
- ///
- /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes
- /// in the segment at the beginning of `input` (a number in
- /// `0..=input.len()`) and the type of that segment. The next call should
- /// not include those bytes in `input`, because they have (figuratively)
- /// been consumed by the segmenter.
- ///
- /// Segments can have zero length, including segment types `Type::End`,
- /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and
- /// `Type::Spaces`.
- ///
- /// Failure occurs only if the segment type of the bytes in `input` cannot
- /// yet be determined. In this case, this function returns `Err(Incomplete)`. If
- /// more input is available, the caller should obtain some more, then call
- /// again with a longer `input`. If this is not enough, the process might
- /// need to repeat again and again. If input is exhausted, then the caller
- /// may call again setting `eof` to true. This function will never return
- /// `Err(Incomplete)` when `eof` is true.
- ///
- /// The caller must not, in a sequence of calls, supply contradictory input.
- /// That is, bytes provided as part of `input` in one call, but not
- /// consumed, must not be provided with *different* values on subsequent
- /// calls. This is because the function must often make decisions based on
- /// looking ahead beyond the bytes that it consumes.
fn push_rest<'a>(
&mut self,
input: &'a str,
}
}
+ /// Attempts to label a prefix of the remaining input with a segment type.
+ /// The caller supplies a prefix of the remaining input as `input`. If
+ /// `eof` is true, then `input` is the entire (remainder) of the input; if
+ /// `eof` is false, then further input is potentially available.
+ ///
+ /// The input may contain `\n` or `\r\n` line ends in any combination.
+ ///
+ /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes
+ /// in the segment at the beginning of `input` (a number in
+ /// `0..=input.len()`) and the type of that segment. The next call should
+ /// not include those bytes in `input`, because the segmenter has
+ /// (figuratively) consumed them.
+ ///
+ /// Segments can have zero length, including segment types
+ /// [Segment::SeparateCommands], [Segment::StartDocument],
+ /// [Segment::InlineData], and [Segment::Spaces].
+ ///
+ /// Failure occurs only if the segment type of the bytes in `input` cannot
+ /// yet be determined. In this case, this function returns
+ /// `Err(Incomplete)`. If more input is available, the caller should obtain
+ /// some more, then call again with a longer `input`. If this is still not
+ /// enough, the process might need to repeat again and again. If input is
+ /// exhausted, then the caller may call again setting `eof` to true. This
+ /// function will never return `Err(Incomplete)` when `eof` is true.
+ ///
+ /// The caller must not, in a sequence of calls, supply contradictory input.
+ /// That is, bytes provided as part of `input` in one call, but not
+ /// consumed, must not be provided with *different* values on subsequent
+ /// calls. This is because the function must often make decisions based on
+ /// looking ahead beyond the bytes that it consumes.
pub fn push(&mut self, input: &str, eof: bool) -> Result<Option<(usize, Segment)>, Incomplete> {
Ok(self
.push_rest(input, eof)?
use crate::identifier::Identifier;
+/// A PSPP syntax token.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
/// Identifier.
- Id(Identifier),
+ Id(
+ /// The identifier.
+ Identifier,
+ ),
/// Number.
- Number(f64),
+ Number(
+ /// Numeric value.
+ f64,
+ ),
/// Quoted string.
String(String),
/// Command terminator or separator.
///
- /// Usually this is `.`, but a blank line also separates commands, and in
- /// batch mode any line that begins with a non-blank starts a new command.
+ /// The most common command terminator is `.`. A blank line also separates
+ /// commands. In [Batch](crate::lex::segment::Syntax::Batch) mode, any line
+ /// that begins with a non-blank starts a new command. Other special cases
+ /// exist, too.
End,
/// Operators, punctuators, and reserved words.
- Punct(Punct),
+ Punct(
+ /// The punctuator.
+ Punct,
+ ),
}
impl Token {
+ /// Returns the [Identifier] within this token, or `None` if this is not an
+ /// identifier token.
pub fn id(&self) -> Option<&Identifier> {
match self {
Self::Id(identifier) => Some(identifier),
}
}
+ /// Returns true if this token contains an [Identifier] that matches
+ /// `keyword` as decided by [Identifier::matches_keyword], false otherwise.
pub fn matches_keyword(&self, keyword: &str) -> bool {
self.id().is_some_and(|id| id.matches_keyword(keyword))
}
+ /// Returns the number within this token, or `None` if this is not a number
+ /// token.
pub fn as_number(&self) -> Option<f64> {
if let Self::Number(number) = self {
Some(*number)
}
}
+ /// Returns the integer within this token, or `None` if this is not a number
+ /// token with an integer value.
pub fn as_integer(&self) -> Option<i64> {
match self {
Self::Number(number)
}
}
- pub fn as_id(&self) -> Option<&Identifier> {
- match self {
- Self::Id(id) => Some(id),
- _ => None,
- }
- }
-
+ /// Returns the quoted string within this token, or `None` if this is not a
+ /// [Token::String] token.
pub fn as_string(&self) -> Option<&str> {
match self {
Self::String(string) => Some(string.as_str()),
}
}
+/// An operator, punctuator, or reserved word.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Punct {
/// `+`.
}
impl Punct {
+ /// Returns a syntax representation of this punctuator.
+ ///
+ /// Some punctuators have more than one valid syntax representation (for
+ /// example, [Punct::And] can be written as `AND` or `&`). This returns one
+ /// of the valid representations.
pub fn as_str(&self) -> &'static str {
match self {
Self::Plus => "+",
pub mod calendar;
pub mod command;
pub mod crypto;
+pub mod data;
pub mod dictionary;
pub mod endian;
pub mod engine;
use crate::{
identifier::Identifier,
lex::{
- scan::{ScanError, ScanToken, StringScanner, StringSegmenter},
+ scan::{ScanError, StringScanner, StringSegmenter},
segment::Syntax,
- token::{Punct, Token},
+ Punct, Token,
},
message::Location,
settings::Settings,
) {
for (syntax, token) in StringSegmenter::new(s, mode, true) {
match token {
- ScanToken::Token(token) => output.push(MacroToken {
+ Ok(token) => output.push(MacroToken {
token,
syntax: String::from(syntax),
}),
- ScanToken::Error(scan_error) => error(MacroError::ScanError(scan_error)),
+ Err(scan_error) => error(MacroError::ScanError(scan_error)),
}
}
}
fn try_unquote_string(input: &str, mode: Syntax) -> Option<String> {
let mut scanner = StringScanner::new(input, mode, true);
- let Some(ScanToken::Token(Token::String(unquoted))) = scanner.next() else {
+ let Some(Ok(Token::String(unquoted))) = scanner.next() else {
return None;
};
let None = scanner.next() else { return None };
/// Parses one function argument from `input`. Each argument to a macro
/// function is one of:
///
- /// - A quoted string or other single literal token.
+ /// - A quoted string or other single literal token.
///
- /// - An argument to the macro being expanded, e.g. `!1` or a named
- /// argument.
+ /// - An argument to the macro being expanded, e.g. `!1` or a named
+ /// argument.
///
- /// - `!*`.
+ /// - `!*`.
///
- /// - A function invocation.
+ /// - A function invocation.
///
/// Each function invocation yields a character sequence to be turned into a
/// sequence of tokens. The case where that character sequence is a single
use anyhow::{anyhow, Result};
use clap::{Args, Parser, Subcommand, ValueEnum};
use encoding_rs::Encoding;
-use pspp::crypto::EncryptedFile;
-use pspp::sys::cooked::{Error, Headers};
-use pspp::sys::raw::{encoding_from_headers, Decoder, Magic, Reader, Record, Warning};
-use std::fs::File;
-use std::io::{stdout, BufReader, Write};
-use std::path::{Path, PathBuf};
-use std::str;
+use pspp::{
+ crypto::EncryptedFile,
+ sys::{
+ raw::{infer_encoding, Decoder, Magic, Reader, Record},
+ ReaderOptions, Records,
+ },
+};
+use std::{
+ fs::File,
+ io::{stdout, BufReader, Write},
+ path::{Path, PathBuf},
+ str,
+};
use thiserror::Error as ThisError;
use zeroize::Zeroizing;
#[arg(short = 'e', long, value_parser = parse_encoding)]
encoding: Option<&'static Encoding>,
+ /// Password for decryption, with or without what SPSS calls "password encryption".
+ ///
+ /// Specify only for an encrypted system file.
+ #[clap(short, long)]
+ password: Option<String>,
+
/// Maximum number of cases to print.
#[arg(short = 'c', long = "cases")]
max_cases: Option<u64>,
}
impl Convert {
- fn warn(warning: Warning) {
- eprintln!("warning: {warning}");
- }
-
- fn err(error: Error) {
- eprintln!("error: {error}");
- }
-
fn run(self) -> Result<()> {
- let mut reader = Reader::new(BufReader::new(File::open(&self.input)?), Self::warn)?;
- let headers = reader.headers().collect::<Result<Vec<_>, _>>()?;
- let encoding = encoding_from_headers(&headers, &mut |w| Self::warn(w))?;
- let mut decoder = Decoder::new(encoding, |w| Self::warn(w));
- let mut decoded_records = Vec::new();
- for header in headers {
- decoded_records.push(header.decode(&mut decoder)?);
+ fn warn(warning: anyhow::Error) {
+ eprintln!("warning: {warning}");
}
- let headers = Headers::new(decoded_records, &mut |e| Self::err(e))?;
- let (dictionary, _metadata, cases) =
- headers.decode(reader.cases(), encoding, |e| Self::err(e))?;
+
+ let (dictionary, _, cases) = ReaderOptions::new()
+ .with_encoding(self.encoding)
+ .with_password(self.password.clone())
+ .open_file(&self.input, warn)?
+ .into_parts();
let writer = match self.output {
Some(path) => Box::new(File::create(path)?) as Box<dyn Write>,
None => Box::new(stdout()),
}
for (_case_number, case) in (0..self.max_cases.unwrap_or(u64::MAX)).zip(cases) {
- output.write_record(case?.into_iter().zip(dictionary.variables.iter()).map(
+ output.write_record(case?.0.into_iter().zip(dictionary.variables.iter()).map(
|(datum, variable)| {
datum
.display(variable.print_format, variable.encoding)
match mode {
Mode::Identify => {
- let Record::Header(header) = reader.headers().next().unwrap()? else {
- unreachable!()
- };
- match header.magic {
+ match reader.header().magic {
Magic::Sav => println!("SPSS System File"),
Magic::Zsav => println!("SPSS System File with Zlib compression"),
Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"),
return Ok(());
}
Mode::Raw => {
- for header in reader.headers() {
- let header = header?;
+ for record in reader.records() {
+ let header = record?;
println!("{:?}", header);
}
for (_index, case) in (0..max_cases).zip(reader.cases()) {
}
}
Mode::Decoded => {
- let headers: Vec<Record> = reader.headers().collect::<Result<Vec<_>, _>>()?;
+ let records: Vec<Record> = reader.records().collect::<Result<Vec<_>, _>>()?;
let encoding = match encoding {
Some(encoding) => encoding,
- None => encoding_from_headers(&headers, &mut |e| eprintln!("{e}"))?,
+ None => infer_encoding(&records, &mut |e| eprintln!("{e}"))?,
};
let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
- for header in headers {
+ for header in records {
let header = header.decode(&mut decoder);
println!("{:?}", header);
/*
}
}
Mode::Cooked => {
- let headers: Vec<Record> = reader.headers().collect::<Result<Vec<_>, _>>()?;
+ let records: Vec<Record> = reader.records().collect::<Result<Vec<_>, _>>()?;
let encoding = match encoding {
Some(encoding) => encoding,
- None => encoding_from_headers(&headers, &mut |e| eprintln!("{e}"))?,
+ None => infer_encoding(&records, &mut |e| eprintln!("{e}"))?,
};
let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
- let mut decoded_records = Vec::new();
- for header in headers {
- decoded_records.push(header.decode(&mut decoder)?);
- }
- let headers = Headers::new(decoded_records, &mut |e| eprintln!("{e}"))?;
- let (dictionary, metadata, _cases) =
- headers.decode(reader.cases(), encoding, |e| eprintln!("{e}"))?;
+ let records = Records::from_raw(records, &mut decoder);
+ let (dictionary, metadata, _) = records
+ .decode(
+ reader.header().clone().decode(&mut decoder),
+ reader.cases(),
+ encoding,
+ |e| eprintln!("{e}"),
+ )
+ .into_parts();
println!("{dictionary:#?}");
println!("{metadata:#?}");
}
#[derive(ThisError, Debug, PartialEq, Eq)]
enum DimensionParseError {
/// Invalid number.
- #[error("{0}")]
+ #[error(transparent)]
ParseFloatError(ParseFloatError),
/// Unknown unit.
use tlo::parse_tlo;
use crate::{
- dictionary::{Datum, Variable},
+ data::Datum,
+ dictionary::{VarType, Variable},
format::{Decimal, Format, Settings as FormatSettings, Type, UncheckedFormat},
settings::{Settings, Show},
- sys::raw::VarType,
};
pub mod output;
#[derive(ThisError, Debug)]
pub enum ParseLookError {
- #[error("{0}")]
+ #[error(transparent)]
XmlError(#[from] DeError),
- #[error("{0}")]
+ #[error(transparent)]
Utf8Error(#[from] Utf8Error),
- #[error("{0}")]
+ #[error(transparent)]
BinError(#[from] BinError),
- #[error("{0}")]
+ #[error(transparent)]
IoError(#[from] std::io::Error),
}
}
}
-#[derive(Copy, Clone, Default, PartialEq, Eq)]
+#[derive(Copy, Clone, PartialEq, Eq)]
pub struct EndianSettings {
/// Endianness for reading IB, PIB, and RB formats.
pub input: Endian,
pub output: Endian,
}
+impl Default for EndianSettings {
+ fn default() -> Self {
+ Self {
+ input: Endian::NATIVE,
+ output: Endian::NATIVE,
+ }
+ }
+}
+
impl EndianSettings {
pub const fn new(endian: Endian) -> Self {
Self {
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
-use std::{collections::BTreeMap, ops::Range};
+use std::{
+ collections::BTreeMap,
+ fs::File,
+ io::{Read, Seek},
+ ops::Range,
+ path::Path,
+};
use crate::{
calendar::date_time_to_pspp,
+ crypto::EncryptedFile,
+ data::{Datum, RawString},
dictionary::{
- Datum, Dictionary, InvalidRole, MultipleResponseSet, MultipleResponseType, VarWidth,
- Variable, VariableSet,
+ Dictionary, InvalidRole, MissingValues, MissingValuesError, MultipleResponseSet,
+ MultipleResponseType, VarWidth, Variable, VariableSet,
},
endian::Endian,
format::{Error as FormatError, Format, UncheckedFormat},
hexfloat::HexFloat,
identifier::{ByIdentifier, Error as IdError, Identifier},
output::pivot::{Group, Value},
- sys::{
- encoding::Error as EncodingError,
- raw::{
- self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension,
- FileAttributesRecord, FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongName,
- LongNamesRecord, LongStringMissingValueRecord, LongStringValueLabelRecord,
- MissingValues, MissingValuesError, MultipleResponseRecord, NumberOfCasesRecord,
- ProductInfoRecord, RawStrArray, RawString, RawWidth, ValueLabel, ValueLabelRecord,
+ sys::raw::{
+ self, infer_encoding,
+ records::{
+ Compression, DocumentRecord, EncodingRecord, Extension, FileAttributesRecord,
+ FileHeader, FloatInfoRecord, IntegerInfoRecord, LongName, LongNamesRecord,
+ LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
+ NumberOfCasesRecord, ProductInfoRecord, RawFormat, ValueLabel, ValueLabelRecord,
VarDisplayRecord, VariableAttributesRecord, VariableRecord, VariableSetRecord,
- VeryLongStringsRecord, ZHeader, ZTrailer,
+ VeryLongStringsRecord,
},
+ Cases, DecodedRecord, RawDatum, RawWidth, Reader,
},
};
+use anyhow::{anyhow, Error as AnyError};
+use binrw::io::BufReader;
use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
use encoding_rs::Encoding;
use indexmap::set::MutableValues;
use itertools::Itertools;
use thiserror::Error as ThisError;
-pub use crate::sys::raw::{CategoryLabels, Compression};
-
+/// A warning for decoding [Records] into a [SystemFile].
#[derive(ThisError, Clone, Debug)]
pub enum Error {
- #[error("Missing header record")]
- MissingHeaderRecord,
-
- #[error("{0}")]
- EncodingError(EncodingError),
-
- #[error("Using default encoding {0}.")]
- UsingDefaultEncoding(String),
-
- #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
- InvalidVariableWidth { offsets: Range<u64>, width: i32 },
-
- #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
- InvalidLongMissingValueFormat,
-
- #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
- InvalidCreationDate { creation_date: String },
-
- #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
- InvalidCreationTime { creation_time: String },
-
+    /// File creation date is not in the expected format.
+ #[error("File creation date {0} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
+ InvalidCreationDate(
+ /// Date.
+ String,
+ ),
+
+ /// File creation time is not in the expected format.
+ #[error("File creation time {0} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
+ InvalidCreationTime(
+ /// Time.
+ String,
+ ),
+
+ /// Invalid variable name.
#[error("{id_error} Renaming variable to {new_name}.")]
InvalidVariableName {
+ /// Identifier error.
id_error: IdError,
+ /// New name.
new_name: Identifier,
},
+ /// Invalid print format.
#[error(
- "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
+ "Substituting {new_format} for invalid print format on variable {variable}. {format_error}"
)]
InvalidPrintFormat {
- new_spec: Format,
+ /// New format.
+ new_format: Format,
+ /// Variable.
variable: Identifier,
+ /// Underlying error.
format_error: FormatError,
},
+ /// Invalid write format.
#[error(
- "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
+ "Substituting {new_format} for invalid write format on variable {variable}. {format_error}"
)]
InvalidWriteFormat {
- new_spec: Format,
+ /// New format.
+ new_format: Format,
+ /// Variable.
variable: Identifier,
+ /// Underlying error.
format_error: FormatError,
},
+    /// Renaming variable with duplicate name {duplicate_name} to {new_name}.
#[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
DuplicateVariableName {
+ /// Duplicate name.
duplicate_name: Identifier,
+ /// New name.
new_name: Identifier,
},
- #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
- InvalidDictIndex { dict_index: usize, max_index: usize },
-
- #[error("Dictionary index {0} refers to a long string continuation.")]
- DictIndexIsContinuation(usize),
-
- #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")]
- LongStringContinuationIndexes { offset: u64, indexes: Vec<u32> },
-
+ /// Variable index {start_index} is a {width} that should be followed by
+ /// long string continuation records through index {end_index} (inclusive),
+ /// but index {error_index} is not a continuation.
#[error("Variable index {start_index} is a {width} that should be followed by long string continuation records through index {end_index} (inclusive), but index {error_index} is not a continuation")]
MissingLongStringContinuation {
+ /// Width of variable.
width: RawWidth,
+ /// First variable index.
start_index: usize,
+ /// Last variable index.
end_index: usize,
+ /// Index of error.
error_index: usize,
},
+ /// Invalid long string value labels.
#[error(
"At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {}", .offsets.start, .offsets.end, variables.iter().join(", ")
)]
InvalidLongStringValueLabels {
+ /// Range of file offsets.
offsets: Range<u64>,
+ /// Variables.
variables: Vec<Identifier>,
},
- #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
- ValueLabelsDifferentTypes {
- numeric_var: Identifier,
- string_var: Identifier,
- },
-
+ /// Variable has duplicate value labels.
#[error("{variable} has duplicate value labels for the following value(s): {}", values.iter().join(", "))]
DuplicateValueLabels {
+ /// Variable.
variable: Identifier,
+ /// Duplicate values.
values: Vec<String>,
},
+ /// Invalid multiple response set name.
#[error("Invalid multiple response set name. {0}")]
- InvalidMrSetName(IdError),
+ InvalidMrSetName(
+ /// Identifier error.
+ IdError,
+ ),
+ /// Multiple response set includes unknown variable.
#[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
UnknownMrSetVariable {
+ /// Multiple response set name.
mr_set: Identifier,
+ /// Short name of variable.
short_name: Identifier,
},
+ /// Multiple response set {mr_set} includes variable {variable} more than once.
#[error("Multiple response set {mr_set} includes variable {variable} more than once.")]
DuplicateMrSetVariable {
+ /// Multiple response set name.
mr_set: Identifier,
+ /// Duplicated variable.
variable: Identifier,
},
+ /// Multiple response set {0} has no variables.
#[error("Multiple response set {0} has no variables.")]
- EmptyMrSet(Identifier),
+ EmptyMrSet(
+ /// Multiple response set name.
+ Identifier,
+ ),
+ /// Multiple response set {0} has only one variable.
#[error("Multiple response set {0} has only one variable.")]
- OneVarMrSet(Identifier),
+ OneVarMrSet(
+ /// Multiple response set name.
+ Identifier,
+ ),
+ /// Multiple response set {0} contains both string and numeric variables.
#[error("Multiple response set {0} contains both string and numeric variables.")]
- MixedMrSet(Identifier),
+ MixedMrSet(
+ /// Multiple response set name.
+ Identifier,
+ ),
+ /// Invalid numeric format for counted value {number} in multiple response set {mr_set}.
#[error(
"Invalid numeric format for counted value {number} in multiple response set {mr_set}."
)]
- InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
+ InvalidMDGroupCountedValue {
+ /// Multiple response set name.
+ mr_set: Identifier,
+ /// Value that should be numeric.
+ number: String,
+ },
+ /// Counted value {value} has width {width}, but it must be no wider than
+ /// {max_width}, the width of the narrowest variable in multiple response
+ /// set {mr_set}.
#[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
TooWideMDGroupCountedValue {
+ /// Multiple response set name.
mr_set: Identifier,
+ /// Counted value.
value: String,
+ /// Width of counted value.
width: usize,
+ /// Maximum allowed width of counted value.
max_width: u16,
},
- #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
- InvalidLongValueLabelWidth {
- name: Identifier,
- width: u32,
- min_width: u16,
- max_width: u16,
- },
-
+ /// Ignoring long string value label for unknown variable {0}.
#[error("Ignoring long string value label for unknown variable {0}.")]
- UnknownLongStringValueLabelVariable(Identifier),
+ UnknownLongStringValueLabelVariable(
+ /// Variable name.
+ Identifier,
+ ),
+ /// Ignoring long string value label for numeric variable {0}.
#[error("Ignoring long string value label for numeric variable {0}.")]
- LongStringValueLabelNumericVariable(Identifier),
-
- #[error("Invalid attribute name. {0}")]
- InvalidAttributeName(IdError),
-
- #[error("Invalid short name in long variable name record. {0}")]
- InvalidShortName(IdError),
-
- #[error("Invalid name in long variable name record. {0}")]
- InvalidLongName(IdError),
-
+ LongStringValueLabelNumericVariable(
+ /// Variable name.
+ Identifier,
+ ),
+
+ /// Invalid variable name {0} in variable attribute record.
+ #[error("Invalid variable name {0} in variable attribute record.")]
+ UnknownAttributeVariableName(
+ /// Variable name.
+ Identifier,
+ ),
+
+ /// Unknown short name {0} in long variable name record.
+ #[error("Unknown short name {0} in long variable name record.")]
+ UnknownShortName(
+ /// Short variable name.
+ Identifier,
+ ),
+
+ /// Duplicate long variable name {0}.
#[error("Duplicate long variable name {0}.")]
- DuplicateLongName(Identifier),
-
- #[error("Invalid variable name in very long string record. {0}")]
- InvalidLongStringName(IdError),
-
+ DuplicateLongName(
+ /// Long variable name.
+ Identifier,
+ ),
+
+ /// Very long string entry for unknown variable {0}.
+ #[error("Very long string entry for unknown variable {0}.")]
+ UnknownVeryLongString(
+ /// Variable name.
+ Identifier,
+ ),
+
+ /// Variable with short name {short_name} listed in very long string record
+ /// with width {width}, which requires only one segment.
#[error("Variable with short name {short_name} listed in very long string record with width {width}, which requires only one segment.")]
- ShortVeryLongString { short_name: Identifier, width: u16 },
+ ShortVeryLongString {
+ /// Short variable name.
+ short_name: Identifier,
+ /// Invalid width.
+ width: u16,
+ },
+ /// Variable with short name {short_name} listed in very long string record
+ /// with width {width} requires string segments for {n_segments} dictionary
+ /// indexes starting at index {index}, but the dictionary only contains
+ /// {len} indexes.
#[error("Variable with short name {short_name} listed in very long string record with width {width} requires string segments for {n_segments} dictionary indexes starting at index {index}, but the dictionary only contains {len} indexes.")]
VeryLongStringOverflow {
+ /// Short variable name.
short_name: Identifier,
+ /// Width.
width: u16,
+ /// Starting index.
index: usize,
+ /// Expected number of segments.
n_segments: usize,
+ /// Number of indexes in dictionary.
len: usize,
},
+ /// Variable with short name {short_name} listed in very long string record
+ /// with width {width} has segment {index} of width {actual} (expected
+ /// {expected}).
#[error("Variable with short name {short_name} listed in very long string record with width {width} has segment {index} of width {actual} (expected {expected}).")]
VeryLongStringInvalidSegmentWidth {
+ /// Variable short name.
short_name: Identifier,
+ /// Variable width.
width: u16,
+ /// Variable index.
index: usize,
+ /// Actual width.
actual: usize,
+ /// Expected width.
expected: usize,
},
- #[error("Invalid variable name in long string value label record. {0}")]
- InvalidLongStringValueLabelName(IdError),
-
- #[error("Invalid variable name in attribute record. {0}")]
- InvalidAttributeVariableName(IdError),
-
- // XXX This is risky because `text` might be arbitarily long.
- #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
- MalformedString { encoding: String, text: String },
-
+ /// File contains multiple {0:?} records.
#[error("File contains multiple {0:?} records.")]
- MoreThanOne(&'static str),
+ MoreThanOne(
+ /// Record name.
+ &'static str,
+ ),
+ /// File designates string variable {name} (index {index}) as weight
+ /// variable, but weight variables must be numeric.
#[error("File designates string variable {name} (index {index}) as weight variable, but weight variables must be numeric.")]
- InvalidWeightVar { name: Identifier, index: u32 },
+ InvalidWeightVar {
+ /// Variable name.
+ name: Identifier,
+ /// Variable index.
+ index: u32,
+ },
+ /// File weight variable index {index} is invalid because it exceeds maximum
+ /// variable index {max_index}.
#[error(
"File weight variable index {index} is invalid because it exceeds maximum variable index {max_index}."
)]
- WeightIndexOutOfRange { index: u32, max_index: usize },
+ WeightIndexOutOfRange {
+ /// Variable index.
+ index: u32,
+ /// Maximum variable index.
+ max_index: usize,
+ },
+ /// File weight variable index {index} is invalid because it refers to long
+ /// string continuation for variable {name}.
#[error(
"File weight variable index {index} is invalid because it refers to long string continuation for variable {name}."
)]
- WeightIndexStringContinuation { index: u32, name: Identifier },
+ WeightIndexStringContinuation {
+ /// Variable index.
+ index: u32,
+ /// Variable name.
+ name: Identifier,
+ },
- #[error("{0}")]
- InvalidRole(InvalidRole),
+ /// Invalid role.
+ #[error(transparent)]
+ InvalidRole(
+ /// Role error.
+ InvalidRole,
+ ),
+ /// File header claims {expected} variable positions but {actual} were read
+ /// from file.
#[error("File header claims {expected} variable positions but {actual} were read from file.")]
- WrongVariablePositions { actual: usize, expected: usize },
+ WrongVariablePositions {
+ /// Actual number of variable positions.
+ actual: usize,
+ /// Number of variable positions claimed by file header.
+ expected: usize,
+ },
+    /// Unknown variable name "{name}" in long string missing value record.
#[error("Unknown variable name \"{name}\" in long string missing value record.")]
- LongStringMissingValueUnknownVariable { name: Identifier },
+ LongStringMissingValueUnknownVariable {
+ /// Variable name.
+ name: Identifier,
+ },
+ /// Invalid long string missing value for {} variable {name}.
#[error("Invalid long string missing value for {} variable {name}.", width.display_adjective())]
- LongStringMissingValueBadWdith { name: Identifier, width: VarWidth },
+ LongStringMissingValueBadWdith {
+ /// Variable name.
+ name: Identifier,
+ /// Variable width.
+ width: VarWidth,
+ },
+ /// Long string missing values record says variable {name} has {count}
+ /// missing values, but only 1 to 3 missing values are allowed.
#[error("Long string missing values record says variable {name} has {count} missing values, but only 1 to 3 missing values are allowed.")]
- LongStringMissingValueInvalidCount { name: Identifier, count: usize },
+ LongStringMissingValueInvalidCount {
+ /// Variable name.
+ name: Identifier,
+ /// Claimed number of missing values.
+ count: usize,
+ },
+ /// Long string missing values for variable {0} are too wide.
+ #[error("Long string missing values for variable {0} are too wide.")]
+ MissingValuesTooWide(
+ /// Variable name.
+ Identifier,
+ ),
+
+ /// Unknown extension record with subtype {subtype} at offset {offset:#x},
+ /// consisting of {count} {size}-byte units. Please feel free to report
+ /// this as a bug.
#[error("Unknown extension record with subtype {subtype} at offset {offset:#x}, consisting of {count} {size}-byte units. Please feel free to report this as a bug.")]
UnknownExtensionRecord {
+ /// Extension record file starting offset.
offset: u64,
+ /// Extension record subtype.
subtype: u32,
+ /// Extension record per-element size.
size: u32,
+ /// Number of elements in extension record.
count: u32,
},
+ /// Floating-point representation indicated by system file ({0}) differs from expected (1).
#[error(
"Floating-point representation indicated by system file ({0}) differs from expected (1)."
)]
- UnexpectedFloatFormat(i32),
+ UnexpectedFloatFormat(
+ /// Floating-point format.
+ i32,
+ ),
+    /// Integer format indicated by system file ({actual}) differs from
+    /// expected ({expected}).
#[error(
"Integer format indicated by system file ({actual}) differs from expected ({expected})."
)]
- UnexpectedEndianess { actual: i32, expected: i32 },
+ UnexpectedEndianess {
+ /// Endianness declared by system file.
+ actual: i32,
+ /// Actual endianness used in system file.
+ expected: i32,
+ },
+    /// System file specifies value {actual:?} ({}) as {name} but {expected:?} ({}) was expected.
#[error(
"System file specifies value {actual:?} ({}) as {name} but {expected:?} ({}) was expected.",
HexFloat(*actual),
HexFloat(*expected)
)]
UnexpectedFloatValue {
+ /// Actual floating-point value in system file.
actual: f64,
+ /// Expected floating-point value in system file.
expected: f64,
+ /// Name for this special floating-point value.
name: &'static str,
},
+    /// Variable set "{variable_set}" includes unknown variable {variable}.
#[error("Variable set \"{variable_set}\" includes unknown variable {variable}.")]
UnknownVariableSetVariable {
+ /// Name of variable set.
variable_set: String,
+ /// Variable name.
variable: Identifier,
},
- #[error("Details TBD (cooked)")]
- TBD,
+ /// Dictionary has {expected} variables but {actual} variable display
+ /// entries are present.
+ #[error(
+ "Dictionary has {expected} variables but {actual} variable display entries are present."
+ )]
+ WrongNumberOfVarDisplay {
+ /// Expected number of variable-display entries.
+ expected: usize,
+ /// Number of variable-display entries actually present.
+ actual: usize,
+ },
}
-#[derive(Clone, Debug)]
-pub struct Headers {
- pub header: HeaderRecord<String>,
+/// Options for reading a system file.
+#[derive(Default, Clone, Debug)]
+pub struct ReaderOptions {
+ /// Character encoding for text in the system file.
+ ///
+ /// If not set, the character encoding will be determined from reading the
+ /// file, or a default encoding will be used.
+ pub encoding: Option<&'static Encoding>,
+
+ /// Password to use to unlock an encrypted system file.
+ ///
+ /// For an encrypted system file, this must be set to the (encoded or
+ /// unencoded) password.
+ ///
+ /// For a plaintext system file, this must be `None`.
+ pub password: Option<String>,
+}
+
+impl ReaderOptions {
+ /// Construct a new `ReaderOptions` that initially does not specify an
+ /// encoding or password.
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Causes the file to be read using the specified `encoding`, or with a
+ /// default if `encoding` is None.
+ pub fn with_encoding(self, encoding: Option<&'static Encoding>) -> Self {
+ Self { encoding, ..self }
+ }
+
+    /// Causes the file to be read by decrypting it with the given `password` or
+    /// without decrypting if `password` is None.
+ pub fn with_password(self, password: Option<String>) -> Self {
+ Self { password, ..self }
+ }
+
+ /// Opens the file at `path`, reporting warnings using `warn`.
+ pub fn open_file<P, F>(self, path: P, warn: F) -> Result<SystemFile, AnyError>
+ where
+ P: AsRef<Path>,
+ F: FnMut(AnyError),
+ {
+ let file = File::open(path)?;
+ if self.password.is_some() {
+ // Don't create `BufReader`, because [EncryptedReader] will buffer.
+ self.open_reader(file, warn)
+ } else {
+ self.open_reader(BufReader::new(file), warn)
+ }
+ }
+
+ /// Opens the file read from `reader`, reporting warnings using `warn`.
+ pub fn open_reader<R, F>(self, reader: R, warn: F) -> Result<SystemFile, AnyError>
+ where
+ R: Read + Seek + 'static,
+ F: FnMut(AnyError),
+ {
+ if let Some(password) = &self.password {
+ Self::open_reader_inner(
+ EncryptedFile::new(reader)?
+ .unlock(password.as_bytes())
+ .map_err(|_| anyhow!("Incorrect password."))?,
+ self.encoding,
+ warn,
+ )
+ } else {
+ Self::open_reader_inner(reader, self.encoding, warn)
+ }
+ }
+
+ fn open_reader_inner<R, F>(
+ reader: R,
+ encoding: Option<&'static Encoding>,
+ mut warn: F,
+ ) -> Result<SystemFile, AnyError>
+ where
+ R: Read + Seek + 'static,
+ F: FnMut(AnyError),
+ {
+ let mut reader = Reader::new(reader, |warning| warn(warning.into()))?;
+ let records = reader.records().collect::<Result<Vec<_>, _>>()?;
+ let header = reader.header().clone();
+ let cases = reader.cases();
+ let encoding = if let Some(encoding) = encoding {
+ encoding
+ } else {
+ infer_encoding(&records, |warning| warn(warning.into()))?
+ };
+ let mut decoder = raw::Decoder::new(encoding, |warning| warn(warning.into()));
+ let header = header.decode(&mut decoder);
+ let records = records
+ .into_iter()
+ .map(|record| record.decode(&mut decoder))
+ .collect::<Records>();
+ let encoding = decoder.into_encoding();
+
+ Ok(records.decode(header, cases, encoding, |e| warn(e.into())))
+ }
+}
+
+/// The content of an SPSS system file.
+#[derive(Debug)]
+pub struct SystemFile {
+ /// The system file dictionary.
+ pub dictionary: Dictionary,
+
+ /// System file metadata that is not part of the dictionary.
+ pub metadata: Metadata,
+
+ /// Data in the system file.
+ pub cases: Cases,
+}
+
+impl SystemFile {
+ /// Returns the individual parts of the [SystemFile].
+ pub fn into_parts(self) -> (Dictionary, Metadata, Cases) {
+ (self.dictionary, self.metadata, self.cases)
+ }
+}
+
+/// Decoded records in a system file, arranged by type.
+///
+/// The `Vec` fields are all in order read from the file.
+#[derive(Clone, Debug, Default)]
+pub struct Records {
+ /// Variable records.
pub variable: Vec<VariableRecord<String>>,
- pub value_label: Vec<ValueLabelRecord<RawStrArray<8>, String>>,
+
+ /// Value label records.
+ pub value_label: Vec<ValueLabelRecord<RawDatum, String>>,
+
+ /// Document records.
pub document: Vec<DocumentRecord<String>>,
- pub integer_info: Option<IntegerInfoRecord>,
- pub float_info: Option<FloatInfoRecord>,
- pub var_display: Option<VarDisplayRecord>,
+
+ /// Integer info record.
+ pub integer_info: Vec<IntegerInfoRecord>,
+
+ /// Float info record.
+ pub float_info: Vec<FloatInfoRecord>,
+
+ /// Variable display record.
+ pub var_display: Vec<VarDisplayRecord>,
+
+ /// Multiple response set records.
pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
+
+ /// Long string value label records.
pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
+
+ /// Long string missing value records.
pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier>>,
- pub encoding: Option<EncodingRecord>,
- pub number_of_cases: Option<NumberOfCasesRecord>,
+
+ /// Encoding record.
+ pub encoding: Vec<EncodingRecord>,
+
+ /// Number of cases record.
+ pub number_of_cases: Vec<NumberOfCasesRecord>,
+
+ /// Variable sets records.
pub variable_sets: Vec<VariableSetRecord>,
- pub product_info: Option<ProductInfoRecord>,
+
+ /// Product info record.
+ pub product_info: Vec<ProductInfoRecord>,
+
+    /// Long variable names records.
pub long_names: Vec<LongNamesRecord>,
+
+ /// Very long string variable records.
pub very_long_strings: Vec<VeryLongStringsRecord>,
+
+ /// File attribute records.
pub file_attributes: Vec<FileAttributesRecord>,
+
+ /// Variable attribute records.
pub variable_attributes: Vec<VariableAttributesRecord>,
- pub other_extension: Vec<Extension>,
- pub end_of_headers: Option<u32>,
- pub z_header: Option<ZHeader>,
- pub z_trailer: Option<ZTrailer>,
-}
-fn take_first<T>(
- mut vec: Vec<T>,
- record_name: &'static str,
- warn: &mut impl FnMut(Error),
-) -> Option<T> {
- if vec.len() > 1 {
- warn(Error::MoreThanOne(record_name));
- }
- vec.drain(..).next()
+ /// Other extension records.
+ pub other_extension: Vec<Extension>,
}
-impl Headers {
- pub fn new(
- headers: Vec<raw::DecodedRecord>,
- warn: &mut impl FnMut(Error),
- ) -> Result<Headers, Error> {
- let mut file_header = Vec::new();
- let mut variable = Vec::new();
- let mut value_label = Vec::new();
- let mut document = Vec::new();
- let mut integer_info = Vec::new();
- let mut float_info = Vec::new();
- let mut var_display = Vec::new();
- let mut multiple_response = Vec::new();
- let mut long_string_value_labels = Vec::new();
- let mut long_string_missing_values = Vec::new();
- let mut encoding = Vec::new();
- let mut number_of_cases = Vec::new();
- let mut variable_sets = Vec::new();
- let mut product_info = Vec::new();
- let mut long_names = Vec::new();
- let mut very_long_strings = Vec::new();
- let mut file_attributes = Vec::new();
- let mut variable_attributes = Vec::new();
- let mut other_extension = Vec::new();
- let mut end_of_headers = Vec::new();
- let mut z_header = Vec::new();
- let mut z_trailer = Vec::new();
-
- for header in headers {
- match header {
- DecodedRecord::Header(record) => {
- file_header.push(record);
- }
+impl Extend<raw::DecodedRecord> for Records {
+ fn extend<T>(&mut self, iter: T)
+ where
+ T: IntoIterator<Item = raw::DecodedRecord>,
+ {
+ for record in iter {
+ match record {
DecodedRecord::Variable(record) => {
- variable.push(record);
+ self.variable.push(record);
}
DecodedRecord::ValueLabel(record) => {
- value_label.push(record);
+ self.value_label.push(record);
}
DecodedRecord::Document(record) => {
- document.push(record);
+ self.document.push(record);
}
DecodedRecord::IntegerInfo(record) => {
- integer_info.push(record);
+ self.integer_info.push(record);
}
DecodedRecord::FloatInfo(record) => {
- float_info.push(record);
+ self.float_info.push(record);
}
DecodedRecord::VariableSets(record) => {
- variable_sets.push(record);
+ self.variable_sets.push(record);
}
DecodedRecord::VarDisplay(record) => {
- var_display.push(record);
+ self.var_display.push(record);
}
DecodedRecord::MultipleResponse(record) => {
- multiple_response.push(record);
+ self.multiple_response.push(record);
}
DecodedRecord::LongStringValueLabels(record) => {
- long_string_value_labels.push(record)
+ self.long_string_value_labels.push(record)
}
DecodedRecord::LongStringMissingValues(record) => {
- long_string_missing_values.push(record);
+ self.long_string_missing_values.push(record);
}
DecodedRecord::Encoding(record) => {
- encoding.push(record);
+ self.encoding.push(record);
}
DecodedRecord::NumberOfCases(record) => {
- number_of_cases.push(record);
+ self.number_of_cases.push(record);
}
DecodedRecord::ProductInfo(record) => {
- product_info.push(record);
+ self.product_info.push(record);
}
DecodedRecord::LongNames(record) => {
- long_names.push(record);
+ self.long_names.push(record);
}
DecodedRecord::VeryLongStrings(record) => {
- very_long_strings.push(record);
+ self.very_long_strings.push(record);
}
DecodedRecord::FileAttributes(record) => {
- file_attributes.push(record);
+ self.file_attributes.push(record);
}
DecodedRecord::VariableAttributes(record) => {
- variable_attributes.push(record);
+ self.variable_attributes.push(record);
}
DecodedRecord::OtherExtension(record) => {
- other_extension.push(record);
- }
- DecodedRecord::EndOfHeaders(record) => {
- end_of_headers.push(record);
- }
- DecodedRecord::ZHeader(record) => {
- z_header.push(record);
- }
- DecodedRecord::ZTrailer(record) => {
- z_trailer.push(record);
+ self.other_extension.push(record);
}
+ DecodedRecord::EndOfHeaders(_)
+ | DecodedRecord::ZHeader(_)
+ | DecodedRecord::ZTrailer(_) => (),
}
}
+ }
+}
- let Some(file_header) = take_first(file_header, "file header", warn) else {
- return Err(Error::MissingHeaderRecord);
- };
+impl FromIterator<DecodedRecord> for Records {
+ fn from_iter<T>(iter: T) -> Self
+ where
+ T: IntoIterator<Item = DecodedRecord>,
+ {
+ let mut records = Records::default();
+ records.extend(iter);
+ records
+ }
+}
- Ok(Headers {
- header: file_header,
- variable,
- value_label,
- document,
- integer_info: take_first(integer_info, "integer info", warn),
- float_info: take_first(float_info, "float info", warn),
- var_display: take_first(var_display, "variable display", warn),
- multiple_response,
- long_string_value_labels,
- long_string_missing_values,
- encoding: take_first(encoding, "encoding", warn),
- number_of_cases: take_first(number_of_cases, "number of cases", warn),
- variable_sets,
- product_info: take_first(product_info, "product info", warn),
- long_names,
- very_long_strings,
- file_attributes,
- variable_attributes,
- other_extension,
- end_of_headers: take_first(end_of_headers, "end of headers", warn),
- z_header: take_first(z_header, "z_header", warn),
- z_trailer: take_first(z_trailer, "z_trailer", warn),
- })
+impl Records {
+ /// Constructs `Records` from the raw records in `iter`, decoding them with
+ /// `decoder`.
+ pub fn from_raw<T>(iter: T, decoder: &mut raw::Decoder) -> Self
+ where
+ T: IntoIterator<Item = raw::Record>,
+ {
+ iter.into_iter()
+ .map(|record| record.decode(decoder))
+ .collect()
}
+ /// Decodes this [Records] along with `header` and `cases` into a
+ /// [SystemFile]. `encoding` is the encoding that was used to decode these
+ /// records. Uses `warn` to report warnings.
pub fn decode(
mut self,
+ header: FileHeader<String>,
mut cases: Cases,
encoding: &'static Encoding,
mut warn: impl FnMut(Error),
- ) -> Result<(Dictionary, Metadata, Cases), Error> {
+ ) -> SystemFile {
+ for (count, record_name) in [
+ (self.integer_info.len(), "integer info"),
+ (self.float_info.len(), "float info"),
+ (self.var_display.len(), "variable display"),
+ (self.encoding.len(), "encoding"),
+ (self.number_of_cases.len(), "number of cases"),
+ (self.product_info.len(), "product info"),
+ ] {
+ if count > 1 {
+ warn(Error::MoreThanOne(record_name));
+ }
+ }
+
let mut dictionary = Dictionary::new(encoding);
- let file_label = fix_line_ends(self.header.file_label.trim_end_matches(' '));
+ let file_label = fix_line_ends(header.file_label.trim_end_matches(' '));
if !file_label.is_empty() {
dictionary.file_label = Some(file_label);
}
.map(trim_end_spaces)
.collect();
- if let Some(integer_info) = &self.integer_info {
+ if let Some(integer_info) = self.integer_info.first() {
let floating_point_rep = integer_info.floating_point_rep;
if floating_point_rep != 1 {
warn(Error::UnexpectedFloatFormat(floating_point_rep))
}
- let expected = match self.header.endian {
+ let expected = match header.endian {
Endian::Big => 1,
Endian::Little => 2,
};
}
};
- if let Some(float_info) = &self.float_info {
+ if let Some(float_info) = self.float_info.get(0) {
for (expected, expected2, actual, name) in [
(f64::MIN, None, float_info.sysmis, "SYSMIS"),
(f64::MAX, None, float_info.highest, "HIGHEST"),
}
}
- if let Some(nominal_case_size) = self.header.nominal_case_size {
+ if let Some(nominal_case_size) = header.nominal_case_size {
let n_vars = self.variable.len();
if n_vars != nominal_case_size as usize
&& self
.integer_info
- .as_ref()
+ .get(0)
.is_none_or(|info| info.version.0 != 13)
{
warn(Error::WrongVariablePositions {
variable.width,
|new_spec, format_error| {
warn(Error::InvalidPrintFormat {
- new_spec,
+ new_format: new_spec,
variable: variable.name.clone(),
format_error,
})
variable.width,
|new_spec, format_error| {
warn(Error::InvalidWriteFormat {
- new_spec,
+ new_format: new_spec,
variable: variable.name.clone(),
format_error,
})
value_index += n_values;
}
- if let Some(weight_index) = self.header.weight_index {
+ if let Some(weight_index) = header.weight_index {
let index = weight_index as usize - 1;
if index >= value_index {
warn(Error::WeightIndexOutOfRange {
});
}
- let written_by_readstat = self.header.eye_catcher.contains("ReadStat");
+ let written_by_readstat = header.eye_catcher.contains("ReadStat");
for dict_index in dict_indexes {
let variable = dictionary.variables.get_index_mut2(dict_index).unwrap();
let mut duplicates = Vec::new();
}
}
- if let Some(display) = &self.var_display {
- for (index, display) in display.0.iter().enumerate() {
- if let Some(variable) = dictionary.variables.get_index_mut2(index) {
- if let Some(width) = display.width {
- variable.display_width = width;
- }
- if let Some(alignment) = display.alignment {
- variable.alignment = alignment;
- }
- if let Some(measure) = display.measure {
- variable.measure = Some(measure);
- }
- } else {
- warn(dbg!(Error::TBD));
+ if let Some(display) = self.var_display.first() {
+ if display.0.len() != dictionary.variables.len() {
+ warn(Error::WrongNumberOfVarDisplay {
+ expected: dictionary.variables.len(),
+ actual: display.0.len(),
+ });
+ }
+ for (display, index) in display.0.iter().zip(0..dictionary.variables.len()) {
+ let variable = dictionary.variables.get_index_mut2(index).unwrap();
+ if let Some(width) = display.width {
+ variable.display_width = width;
+ }
+ if let Some(alignment) = display.alignment {
+ variable.alignment = alignment;
+ }
+ if let Some(measure) = display.measure {
+ variable.measure = Some(measure);
}
}
}
for record in self
.multiple_response
.iter()
- .flat_map(|record| record.0.iter())
+ .flat_map(|record| record.sets.iter())
{
match MultipleResponseSet::decode(&dictionary, record, &mut warn) {
Ok(mrset) => {
.flat_map(|record| record.0.into_iter())
{
let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else {
- warn(dbg!(Error::TBD));
+ warn(Error::UnknownVeryLongString(record.short_name.clone()));
continue;
};
let width = VarWidth::String(record.length);
let n_segments = width.n_segments();
if n_segments == 1 {
- warn(dbg!(Error::ShortVeryLongString {
+ warn(Error::ShortVeryLongString {
short_name: record.short_name.clone(),
- width: record.length
- }));
+ width: record.length,
+ });
continue;
}
if index + n_segments > dictionary.variables.len() {
- warn(dbg!(Error::VeryLongStringOverflow {
+ warn(Error::VeryLongStringOverflow {
short_name: record.short_name.clone(),
width: record.length,
index,
n_segments,
- len: dictionary.variables.len()
- }));
+ len: dictionary.variables.len(),
+ });
continue;
}
let mut short_names = Vec::with_capacity(n_segments);
.unwrap()
.short_names = vec![short_name];
} else {
- warn(dbg!(Error::TBD));
+ warn(Error::UnknownShortName(short_name.clone()));
}
}
}
{
variable.attributes.append(&mut attr_set.attributes);
} else {
- warn(dbg!(Error::TBD));
+ warn(Error::UnknownAttributeVariableName(
+ attr_set.long_var_name.clone(),
+ ));
}
}
for record in self
.long_string_value_labels
.drain(..)
- .flat_map(|record| record.0.into_iter())
+ .flat_map(|record| record.labels.into_iter())
{
let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else {
warn(Error::UnknownLongStringValueLabelVariable(
for mut record in self
.long_string_missing_values
.drain(..)
- .flat_map(|record| record.0.into_iter())
+ .flat_map(|record| record.values.into_iter())
{
let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else {
warn(Error::LongStringMissingValueUnknownVariable {
.collect::<Vec<_>>();
match MissingValues::new(values, None) {
Ok(missing_values) => variable.missing_values = missing_values,
- Err(MissingValuesError::TooWide) => warn(dbg!(Error::TBD)),
+ Err(MissingValuesError::TooWide) => {
+ warn(Error::MissingValuesTooWide(record.var_name.clone()))
+ }
Err(MissingValuesError::TooMany) | Err(MissingValuesError::MixedTypes) => {
unreachable!()
}
});
}
- let metadata = Metadata::decode(&self, warn);
+ let metadata = Metadata::decode(&header, &self, warn);
if let Some(n_cases) = metadata.n_cases {
cases = cases.with_expected_cases(n_cases);
}
- Ok((dictionary, metadata, cases))
+ SystemFile {
+ dictionary,
+ metadata,
+ cases,
+ }
}
}
+/// System file metadata that is not part of [Dictionary].
+///
+/// [Dictionary]: crate::dictionary::Dictionary
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Metadata {
+ /// Creation date and time.
+ ///
+ /// This comes from the file header, not from the file system.
pub creation: NaiveDateTime,
+
+ /// Endianness of integers and floating-point numbers in the file.
pub endian: Endian,
+
+ /// Compression type (if any).
pub compression: Option<Compression>,
+
+ /// Number of cases in the file, if it says.
+ ///
+ /// This is not trustworthy: there can be more or fewer.
pub n_cases: Option<u64>,
+
+ /// Name of the product that wrote the file.
pub product: String,
+
+ /// Extended name of the product that wrote the file.
pub product_ext: Option<String>,
+
+ /// Version number of the product that wrote the file.
+ ///
+ /// For example, `(1,2,3)` is version 1.2.3.
pub version: Option<(i32, i32, i32)>,
}
impl Metadata {
+ /// Returns a pivot table [Group] and associated [Value]s that describe this
+ /// `Metadata` if they are put into a [PivotTable].
+ ///
+ /// [PivotTable]: crate::output::pivot::PivotTable
pub fn to_pivot_rows(&self) -> (Group, Vec<Value>) {
let mut group = Group::new("File Information");
let mut values = Vec::new();
(group, values)
}
- fn decode(headers: &Headers, mut warn: impl FnMut(Error)) -> Self {
- let header = &headers.header;
+ fn decode(header: &FileHeader<String>, headers: &Records, mut warn: impl FnMut(Error)) -> Self {
+ let header = &header;
let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %y")
.unwrap_or_else(|_| {
- warn(Error::InvalidCreationDate {
- creation_date: header.creation_date.to_string(),
- });
+ warn(Error::InvalidCreationDate(header.creation_date.to_string()));
Default::default()
});
let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
.unwrap_or_else(|_| {
- warn(Error::InvalidCreationTime {
- creation_time: header.creation_time.to_string(),
- });
+ warn(Error::InvalidCreationTime(header.creation_time.to_string()));
Default::default()
});
let creation = NaiveDateTime::new(creation_date, creation_time);
compression: header.compression,
n_cases: headers
.number_of_cases
- .as_ref()
+ .first()
.map(|record| record.n_cases)
.or_else(|| header.n_cases.map(|n| n as u64)),
product,
- product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
- version: headers.integer_info.as_ref().map(|ii| ii.version),
+ product_ext: headers.product_info.first().map(|pe| fix_line_ends(&pe.0)),
+ version: headers.integer_info.first().map(|ii| ii.version),
}
}
}
impl MultipleResponseSet {
fn decode(
dictionary: &Dictionary,
- input: &raw::MultipleResponseSet<Identifier, String>,
+ input: &raw::records::MultipleResponseSet<Identifier, String>,
warn: &mut impl FnMut(Error),
) -> Result<Self, Error> {
let mr_set_name = input.name.clone();
}
fn decode_format(
- raw: raw::Spec,
+ raw: RawFormat,
width: VarWidth,
mut warn: impl FnMut(Format, FormatError),
) -> Format {
impl MultipleResponseType {
fn decode(
mr_set: &Identifier,
- input: &raw::MultipleResponseType,
+ input: &raw::records::MultipleResponseType,
min_width: VarWidth,
) -> Result<Self, Error> {
match input {
- raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
+ raw::records::MultipleResponseType::MultipleDichotomy { value, labels } => {
let value = match min_width {
VarWidth::Numeric => {
let string = String::from_utf8_lossy(&value.0);
labels: *labels,
})
}
- raw::MultipleResponseType::MultipleCategory => {
+ raw::records::MultipleResponseType::MultipleCategory => {
Ok(MultipleResponseType::MultipleCategory)
}
}
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
+//! Character encodings in system files.
+
use std::sync::LazyLock;
use crate::locale_charset::locale_charset;
include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
+/// Returns the code page number corresponding to `encoding`, or `None` if
+/// unknown.
pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
CODEPAGE_NAME_TO_NUMBER
.get(encoding.to_ascii_lowercase().as_str())
use thiserror::Error as ThisError;
+/// An error or warning related to encodings.
#[derive(Clone, ThisError, Debug, PartialEq, Eq)]
pub enum Error {
+ /// Warning that the system file doesn't indicate its own encoding.
#[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
NoEncoding,
+ /// Unknown code page.
#[error("This system file encodes text strings with unknown code page {0}.")]
- UnknownCodepage(i32),
+ UnknownCodepage(
+ /// The code page number.
+ i32,
+ ),
+ /// Unknown encoding.
#[error("This system file encodes text strings with unknown encoding {0}.")]
- UnknownEncoding(String),
+ UnknownEncoding(
+ /// The encoding name.
+ String,
+ ),
+ /// EBCDIC not supported.
#[error("This system file is encoded in EBCDIC, which is not supported.")]
Ebcdic,
}
+/// Returns the default encoding to use.
+///
+/// The default encoding is taken from the system or user's configured locale.
pub fn default_encoding() -> &'static Encoding {
static DEFAULT_ENCODING: LazyLock<&'static Encoding> =
LazyLock::new(|| Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8));
&DEFAULT_ENCODING
}
+/// Returns the character encoding to use for a system file.
+///
+/// `encoding`, if any, should come from [EncodingRecord], and `character_code`,
+/// if any, should come from [IntegerInfoRecord]. Returns an error if the encoding
+/// to use is unclear or unspecified, or if (for EBCDIC) it is unsupported.
+///
+/// [EncodingRecord]: crate::sys::raw::records::EncodingRecord
+/// [IntegerInfoRecord]: crate::sys::raw::records::IntegerInfoRecord
pub fn get_encoding(
encoding: Option<&str>,
character_code: Option<i32>,
Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
}
-
-/*
-#[cfg(test)]
-mod tests {
- use std::thread::spawn;
-
- use encoding_rs::{EUC_JP, UTF_8, WINDOWS_1252};
-
- #[test]
- fn round_trip() {
- let mut threads = Vec::new();
- for thread in 0..128 {
- let start: u32 = thread << 25;
- let end = start + ((1 << 25) - 1);
- threads.push(spawn(move || {
- for i in start..=end {
- let s = i.to_le_bytes();
- let (utf8, replacement) = EUC_JP.decode_without_bom_handling(&s);
- if !replacement {
- let s2 = UTF_8.encode(&utf8).0;
- assert_eq!(s.as_slice(), &*s2);
- }
- }
- }));
- }
- for thread in threads {
- thread.join().unwrap();
- }
- }
-}
-*/
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
-pub mod cooked;
+//! Reading and writing system files.
+//!
+//! This module enables reading and writing "system files", the binary format
+//! for SPSS data files. The system file format dates back 40+ years and has
+//! evolved greatly over that time to support new features, but in a way to
+//! facilitate interchange between even the oldest and newest versions of
+//! software.
+//!
+//! To read a system file in the simplest way, use [ReaderOptions].
+
+// Warn about missing docs, but not for items declared with `#[cfg(test)]`.
+#![cfg_attr(not(test), warn(missing_docs))]
+
+mod cooked;
+pub use cooked::*;
pub mod encoding;
pub mod raw;
+
+#[cfg(test)]
pub mod sack;
#[cfg(test)]
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
+//! Raw system file record reader.
+//!
+//! This module facilitates reading records from system files in all of their
+//! raw details. Most readers will want to use higher-level interfaces.
+
use crate::{
- dictionary::{Attributes, Datum, VarWidth},
+ data::{Case, Datum, RawStr, RawString},
+ dictionary::{VarType, VarWidth},
endian::{Endian, Parse, ToBytes},
- format::{DisplayPlain, DisplayPlainF64},
+ format::DisplayPlainF64,
identifier::{Error as IdError, Identifier},
- sys::encoding::{default_encoding, get_encoding, Error as EncodingError},
+ sys::{
+ encoding::{default_encoding, get_encoding, Error as EncodingError},
+ raw::records::{
+ AttributeWarning, Compression, DocumentRecord, EncodingRecord, Extension,
+ ExtensionWarning, FileAttributesRecord, FileHeader, FloatInfoRecord, HeaderWarning,
+ IntegerInfoRecord, LongNameWarning, LongNamesRecord, LongStringMissingValueRecord,
+ LongStringMissingValuesWarning, LongStringValueLabelRecord,
+ LongStringValueLabelWarning, MultipleResponseRecord, MultipleResponseWarning,
+ NumberOfCasesRecord, ProductInfoRecord, RawDocumentLine, RawFileAttributesRecord,
+ RawLongNamesRecord, RawProductInfoRecord, RawVariableAttributesRecord,
+ RawVariableSetRecord, RawVeryLongStringsRecord, ValueLabelRecord, ValueLabelWarning,
+ VarDisplayRecord, VariableAttributesRecord, VariableDisplayWarning, VariableRecord,
+ VariableSetRecord, VariableSetWarning, VariableWarning, VeryLongStringWarning,
+ VeryLongStringsRecord, ZHeader, ZTrailer, ZlibTrailerWarning,
+ },
+ },
};
-use encoding_rs::{mem::decode_latin1, Encoding};
+use encoding_rs::Encoding;
use flate2::read::ZlibDecoder;
-use itertools::Itertools;
-use num::Integer;
use smallvec::SmallVec;
use std::{
- borrow::{Borrow, Cow},
+ borrow::Cow,
cell::RefCell,
- collections::{BTreeMap, VecDeque},
+ collections::VecDeque,
fmt::{Debug, Display, Formatter, Result as FmtResult},
io::{empty, Error as IoError, Read, Seek, SeekFrom},
iter::repeat_n,
mem::take,
num::NonZeroU8,
- ops::{Deref, Not, Range},
- str::from_utf8,
+ ops::Range,
};
use thiserror::Error as ThisError;
+pub mod records;
+
+/// An error encountered reading raw system file records.
+///
+/// Any error prevents reading further data from the system file.
+#[derive(Debug)]
+pub struct Error {
+ /// Range of file offsets where the error occurred.
+ pub offsets: Option<Range<u64>>,
+
+ /// Details of the error.
+ pub details: ErrorDetails,
+}
+
+impl std::error::Error for Error {}
+
+impl Error {
+ /// Constructs an error from `offsets` and `details`.
+ pub fn new(offsets: Option<Range<u64>>, details: ErrorDetails) -> Self {
+ Self { offsets, details }
+ }
+}
+
+impl Display for Error {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ if let Some(offsets) = &self.offsets
+ && !offsets.is_empty()
+ {
+ if offsets.end > offsets.start.wrapping_add(1) {
+ write!(
+ f,
+ "Error at file offsets {:#x} to {:#x}: ",
+ offsets.start, offsets.end
+ )?;
+ } else {
+ write!(f, "Error at file offset {:#x}: ", offsets.start)?;
+ }
+ }
+ write!(f, "{}", &self.details)
+ }
+}
+
+impl From<IoError> for Error {
+ fn from(value: IoError) -> Self {
+ Self::new(None, value.into())
+ }
+}
+
+/// Details of an [Error].
#[derive(ThisError, Debug)]
-pub enum Error {
+pub enum ErrorDetails {
+ /// Not an SPSS system file.
#[error("Not an SPSS system file")]
NotASystemFile,
+ /// Encrypted.
+ #[error("File is encrypted but no password was supplied.")]
+ Encrypted,
+
+ /// Bad [Magic].
#[error("Invalid magic number {0:?}")]
BadMagic([u8; 4]),
+ /// I/O error.
#[error("I/O error ({0})")]
Io(#[from] IoError),
+ /// Invalid SAV compression code.
#[error("Invalid SAV compression code {0}")]
InvalidSavCompression(u32),
+ /// Invalid ZSAV compression code {0}.
#[error("Invalid ZSAV compression code {0}")]
InvalidZsavCompression(u32),
- #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
- BadDocumentLength { offset: u64, n: usize, max: usize },
-
- #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
- BadRecordType { offset: u64, rec_type: u32 },
-
- #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
- BadVariableWidth { start_offset: u64, width: i32 },
-
- #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
- BadVariableLabelCode {
- start_offset: u64,
- code_offset: u64,
- code: u32,
+ /// Document record has document line count ({n}) greater than the maximum number {max}.
+ #[error(
+ "Document record has document line count ({n}) greater than the maximum number {max}."
+ )]
+ BadDocumentLength {
+ /// Number of document lines.
+ n: usize,
+ /// Maximum number of document lines.
+ max: usize,
},
- #[error("At offset {offset:#x}, missing value code ({code}) is not -3, -2, 0, 1, 2, or 3.")]
- BadMissingValueCode { offset: u64, code: i32 },
+ /// Unrecognized record type.
+ #[error("Unrecognized record type {0}.")]
+ BadRecordType(u32),
+
+ /// Variable width in variable record is not in the valid range -1 to 255.
+ #[error("Variable width {0} in variable record is not in the valid range -1 to 255.")]
+ BadVariableWidth(i32),
+
+ /// In variable record, variable label code is not 0 or 1.
+ #[error("In variable record, variable label code {0} is not 0 or 1.")]
+ BadVariableLabelCode(u32),
+
+ /// Missing value code is not -3, -2, 0, 1, 2, or 3.
+ #[error("Missing value code ({0}) is not -3, -2, 0, 1, 2, or 3.")]
+ BadMissingValueCode(i32),
+
+ /// Numeric missing value code is not -3, -2, 0, 1, 2, or 3.
+ #[error("Numeric missing value code ({0}) is not -3, -2, 0, 1, 2, or 3.")]
+ BadNumericMissingValueCode(i32),
+
+ /// String missing value code is not 0, 1, 2, or 3.
+ #[error("String missing value code ({0}) is not 0, 1, 2, or 3.")]
+ BadStringMissingValueCode(i32),
+
+ /// Number of value labels ({n}) is greater than the maximum number {max}.
+ #[error("Number of value labels ({n}) is greater than the maximum number {max}.")]
+ BadNumberOfValueLabels {
+ /// Number of value labels.
+ n: u32,
+ /// Maximum number of value labels.
+ max: u32,
+ },
+ /// Following value label record, found record type {0} instead of expected
+ /// type 4 for variable index record.
+ #[
+ error(
+ "Following value label record, found record type {0} instead of expected type 4 for variable index record"
+ )]
+ ExpectedVarIndexRecord(
+ /// Record type.
+ u32,
+ ),
+
+ /// Number of variables indexes for value labels ({n}) is greater than the
+ /// maximum number ({max}).
#[error(
- "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
+ "Number of variables indexes for value labels ({n}) is greater than the maximum number ({max})."
)]
- BadNumericMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
- BadStringMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
- BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
- ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
-
- #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
- TooManyVarIndexes { offset: u64, n: u32, max: u32 },
+ TooManyVarIndexes {
+ /// Number of variable indexes.
+ n: u32,
+ /// Maximum number of variable indexes.
+ max: u32,
+ },
- #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
+ /// Record type 7 subtype {subtype} is too large with element size {size} and {count} elements.
+ #[error(
+ "Record type 7 subtype {subtype} is too large with element size {size} and {count} elements."
+ )]
ExtensionRecordTooLarge {
- offset: u64,
+ /// Subtype.
subtype: u32,
+ /// Element size in bytes.
size: u32,
+ /// Number of elements.
count: u32,
},
- #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
+ /// Unexpected end of file {case_ofs} bytes into a {case_len}-byte case.
+ #[error("Unexpected end of file {case_ofs} bytes into a {case_len}-byte case.")]
EofInCase {
- offset: u64,
+ /// Offset into case in bytes.
case_ofs: u64,
+ /// Expected case length in bytes.
case_len: usize,
},
+ /// Unexpected end of file {case_ofs} bytes and {n_chunks} compression
+ /// chunks into a compressed case.
#[error(
- "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes and {n_chunks} compression chunks into a compressed case."
+ "Unexpected end of file {case_ofs} bytes and {n_chunks} compression chunks into a compressed case."
)]
EofInCompressedCase {
- offset: u64,
+ /// Offset into case in bytes.
case_ofs: u64,
+ /// Number of compression codes consumed.
n_chunks: usize,
},
- #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
- PartialCompressedCase { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
- CompressedNumberExpected { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
- CompressedStringExpected { offset: u64, case_ofs: u64 },
-
+ /// Impossible ztrailer_offset {0:#x}.
#[error("Impossible ztrailer_offset {0:#x}.")]
- ImpossibleZTrailerOffset(u64),
+ ImpossibleZTrailerOffset(
+ /// `ztrailer_offset`
+ u64,
+ ),
+ /// ZLIB header's zlib_offset is {actual:#x} instead of expected
+ /// {expected:#x}.
#[error("ZLIB header's zlib_offset is {actual:#x} instead of expected {expected:#x}.")]
- UnexpectedZHeaderOffset { actual: u64, expected: u64 },
+ UnexpectedZHeaderOffset {
+ /// Actual `zlib_offset`.
+ actual: u64,
+ /// Expected `zlib_offset`.
+ expected: u64,
+ },
+ /// Invalid ZLIB trailer length {0}.
#[error("Invalid ZLIB trailer length {0}.")]
- InvalidZTrailerLength(u64),
-
- #[error(
+ InvalidZTrailerLength(
+ /// ZLIB trailer length.
+ u64,
+ ),
+
+ /// ZLIB trailer bias {actual} is not {} as expected from file header bias.
+ #[
+ error(
"ZLIB trailer bias {actual} is not {} as expected from file header bias.",
DisplayPlainF64(*expected)
)]
- WrongZlibTrailerBias { actual: i64, expected: f64 },
+ WrongZlibTrailerBias {
+ /// ZLIB trailer bias read from file.
+ actual: i64,
+ /// Expected ZLIB trailer bias.
+ expected: f64,
+ },
+ /// ZLIB trailer \"zero\" field has nonzero value {0}.
#[error("ZLIB trailer \"zero\" field has nonzero value {0}.")]
- WrongZlibTrailerZero(u64),
+ WrongZlibTrailerZero(
+ /// Actual value that should have been zero.
+ u64,
+ ),
+ /// ZLIB trailer specifies unexpected {0}-byte block size.
#[error("ZLIB trailer specifies unexpected {0}-byte block size.")]
- WrongZlibTrailerBlockSize(u32),
+ WrongZlibTrailerBlockSize(
+ /// Block size read from file.
+ u32,
+ ),
- #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
+ /// Block count in ZLIB trailer differs from expected block count calculated
+ /// from trailer length.
+ #[error(
+ "Block count {n_blocks} in ZLIB trailer differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}."
+ )]
BadZlibTrailerNBlocks {
- offset: u64,
+ /// Number of blocks.
n_blocks: u32,
+ /// Expected number of blocks.
expected_n_blocks: u64,
+ /// ZLIB trailer length in bytes.
ztrailer_len: u64,
},
- #[error("ZLIB block descriptor {index} reported uncompressed data offset {actual:#x}, when {expected:#x} was expected.")]
+ /// ZLIB block descriptor reported uncompressed data offset different from
+ /// expected.
+ #[error(
+ "ZLIB block descriptor {index} reported uncompressed data offset {actual:#x}, when {expected:#x} was expected."
+ )]
ZlibTrailerBlockWrongUncmpOfs {
+ /// Block descriptor index.
index: usize,
+ /// Actual uncompressed data offset.
actual: u64,
+ /// Expected uncompressed data offset.
expected: u64,
},
- #[error("ZLIB block descriptor {index} reported compressed data offset {actual:#x}, when {expected:#x} was expected.")]
+ /// ZLIB block descriptor {index} reported compressed data offset
+ /// {actual:#x}, when {expected:#x} was expected.
+ #[error(
+ "ZLIB block descriptor {index} reported compressed data offset {actual:#x}, when {expected:#x} was expected."
+ )]
ZlibTrailerBlockWrongCmpOfs {
+ /// Block descriptor index.
index: usize,
+ /// Actual compressed data offset.
actual: u64,
+ /// Expected compressed data offset.
expected: u64,
},
- #[error("ZLIB block descriptor {index} reports compressed size {compressed_size} and uncompressed size {uncompressed_size}.")]
+ /// ZLIB block descriptor {index} reports compressed size {compressed_size}
+ /// and uncompressed size {uncompressed_size}.
+ #[error(
+ "ZLIB block descriptor {index} reports compressed size {compressed_size} and uncompressed size {uncompressed_size}."
+ )]
ZlibExpansion {
+ /// Block descriptor index.
index: usize,
+ /// Compressed size.
compressed_size: u32,
+ /// Uncompressed size.
uncompressed_size: u32,
},
- #[error("ZLIB trailer is at offset {zheader:#x} but {descriptors:#x} would be expected from block descriptors.")]
- ZlibTrailerOffsetInconsistency { descriptors: u64, zheader: u64 },
-
- #[error("File metadata says it contains {expected} cases, but {actual} cases were read.")]
- WrongNumberOfCases { expected: u64, actual: u64 },
-
- #[error("{0}")]
- EncodingError(EncodingError),
-}
-
-#[derive(ThisError, Debug)]
-pub enum Warning {
- #[error("Unexpected end of data inside extension record.")]
- UnexpectedEndOfData,
-
- #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
- NoVarIndexes { offset: u64 },
-
- #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", !var_type)]
- MixedVarTypes {
- offset: u64,
- var_type: VarType,
- wrong_types: Vec<u32>,
- },
-
- #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}] or referred to string continuations: {invalid:?}")]
- InvalidVarIndexes {
- offset: u64,
- max: usize,
- invalid: Vec<u32>,
- },
-
- #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
- BadRecordSize {
- offset: u64,
- record: String,
- size: u32,
- expected_size: u32,
- },
-
- #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
- BadRecordCount {
- offset: u64,
- record: String,
- count: u32,
- expected_count: u32,
+ /// ZLIB trailer at unexpected offset.
+ #[error(
+ "ZLIB trailer is at offset {actual:#x} but {expected:#x} would be expected from block descriptors."
+ )]
+ ZlibTrailerOffsetInconsistency {
+ /// Expected offset.
+ expected: u64,
+ /// Actual offset.
+ actual: u64,
},
- #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
- BadLongMissingValueLength {
- record_offset: u64,
- offset: u64,
- value_len: u32,
+ /// File metadata says it contains {expected} cases, but {actual} cases were read.
+ #[error("File metadata says it contains {expected} cases, but {actual} cases were read.")]
+ WrongNumberOfCases {
+ /// Expected number of cases.
+ expected: u64,
+ /// Actual number of cases.
+ actual: u64,
},
- #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
- BadEncodingName { offset: u64 },
-
- // XXX This is risky because `text` might be arbitarily long.
- #[error("Text string contains invalid bytes for {encoding} encoding: {text:?}")]
- MalformedString { encoding: String, text: String },
-
- #[error("Invalid variable measurement level value {0}")]
- InvalidMeasurement(u32),
-
- #[error("Invalid variable display alignment value {0}")]
- InvalidAlignment(u32),
-
- #[error("Invalid attribute name. {0}")]
- InvalidAttributeName(IdError),
-
- #[error("Invalid variable name in attribute record. {0}")]
- InvalidAttributeVariableName(IdError),
-
- #[error("Missing `=` separator in long variable name record.")]
- LongNameMissingEquals,
-
- #[error("Invalid short name in long variable name record. {0}")]
- InvalidShortName(IdError),
-
- #[error("Invalid name in long variable name record. {0}")]
- InvalidLongName(IdError),
-
- #[error("Invalid variable name in very long string record. {0}")]
- InvalidLongStringName(IdError),
-
- #[error("Invalid variable name in variable set record. {0}")]
- InvalidVariableSetName(IdError),
-
- #[error("Variable set missing name delimiter.")]
- VariableSetMissingEquals,
-
- #[error("Invalid multiple response set name. {0}")]
- InvalidMrSetName(IdError),
-
- #[error("Invalid multiple response set variable name. {0}")]
- InvalidMrSetVariableName(IdError),
-
- #[error("Invalid variable name in long string missing values record. {0}")]
- InvalidLongStringMissingValueVariableName(IdError),
-
- #[error("Invalid variable name in long string value label record. {0}")]
- InvalidLongStringValueLabelName(IdError),
-
- #[error("{0}")]
- EncodingError(EncodingError),
+ /// Encoding error.
+ #[error(transparent)]
+ EncodingError(
+ /// The error.
+ #[from]
+ EncodingError,
+ ),
+}
- #[error("Missing value record with range not allowed for string variable")]
- MissingValueStringRange,
+/// A warning reading a raw system file record.
+///
+/// Warnings indicate that something may be amiss, but they do not prevent
+/// reading further records.
+#[derive(Debug)]
+pub struct Warning {
+ /// Range of file offsets where the warning occurred.
+ pub offsets: Option<Range<u64>>,
- #[error("Missing value record at offset {0:#x} not allowed for long string continuation")]
- MissingValueContinuation(u64),
+ /// Details of the warning.
+ pub details: WarningDetails,
+}
- #[error("Invalid multiple dichotomy label type")]
- InvalidMultipleDichotomyLabelType,
+impl std::error::Error for Warning {}
- #[error("Invalid multiple response type")]
- InvalidMultipleResponseType,
+impl Warning {
+ /// Constructs a new [Warning] from `offsets` and `details`.
+ pub fn new(offsets: Option<Range<u64>>, details: impl Into<WarningDetails>) -> Self {
+ Self {
+ offsets,
+ details: details.into(),
+ }
+ }
+}
- #[error("Syntax error in multiple response record ({0})")]
- MultipleResponseSyntaxError(&'static str),
+impl Display for Warning {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ if let Some(offsets) = &self.offsets
+ && !offsets.is_empty()
+ {
+ if offsets.end > offsets.start.wrapping_add(1) {
+ write!(
+ f,
+ "Warning at file offsets {:#x} to {:#x}: ",
+ offsets.start, offsets.end
+ )?;
+ } else {
+ write!(f, "Warning at file offset {:#x}: ", offsets.start)?;
+ }
+ }
+ write!(f, "{}", &self.details)
+ }
+}
- #[error("Syntax error parsing counted string (missing trailing space)")]
- CountedStringMissingSpace,
+/// Details of a [Warning].
+#[derive(ThisError, Debug)]
+pub enum WarningDetails {
+ /// Warning for file header.
+ #[error("In file header: {0}")]
+ Header(#[from] HeaderWarning),
- #[error("Syntax error parsing counted string (invalid UTF-8)")]
- CountedStringInvalidUTF8,
+ /// Warning for variable records.
+ #[error("In variable record: {0}")]
+ Variable(#[from] VariableWarning),
- #[error("Syntax error parsing counted string (invalid length {0:?})")]
- CountedStringInvalidLength(String),
+ /// Warning for extension records.
+ #[error("In extension record: {0}")]
+ Extension(#[from] ExtensionWarning),
- #[error("Syntax error parsing counted string (length {0:?} goes past end of input)")]
- CountedStringTooLong(usize),
+ /// Warning for value labels.
+ #[error("In value label record: {0}")]
+ ValueLabel(#[from] ValueLabelWarning),
- #[error("Variable display record contains {count} items but should contain either {first} or {second}.")]
- InvalidVariableDisplayCount {
- count: usize,
- first: usize,
- second: usize,
- },
+ /// Warning for long string missing values.
+ #[error("In long string missing values record: {0}")]
+ LongStringMissingValues(#[from] LongStringMissingValuesWarning),
- #[error("Very long string record missing delimiter in {0:?}.")]
- VeryLongStringMissingDelimiter(String),
+ /// Warning for long string value labels.
+ #[error("In long string value label record: {0}")]
+ LongStringValueLabel(#[from] LongStringValueLabelWarning),
- #[error("Very long string record has invalid length in {0:?}.")]
- VeryLongStringInvalidLength(String),
+ /// Warning for long variable names.
+ #[error("In long variable name record: {0}")]
+ LongName(#[from] LongNameWarning),
- #[error("Attribute record missing left parenthesis, in {0:?}.")]
- AttributeMissingLParen(String),
+ /// Warning for very long strings.
+ #[error("In very long string record: {0}")]
+ VeryLongString(#[from] VeryLongStringWarning),
- #[error("Attribute for {name}[{}] lacks value.", index + 1)]
- AttributeMissingValue { name: Identifier, index: usize },
+ /// Warning for multiple response record.
+ #[error("In multiple response set record: {0}")]
+ MultipleResponse(#[from] MultipleResponseWarning),
- #[error("Attribute for {name}[{}] missing quotations.", index + 1)]
- AttributeMissingQuotes { name: Identifier, index: usize },
+ /// Warning for attribute record.
+ #[error("In file or variable attribute record: {0}")]
+ Attribute(#[from] AttributeWarning),
- #[error("Duplicate attributes for variable {variable}: {}.", attributes.iter().join(", "))]
- DuplicateVariableAttributes {
- variable: Identifier,
- attributes: Vec<Identifier>,
- },
+ /// Warning for variable display record.
+ #[error("In variable display record: {0}")]
+ VariableDisplay(#[from] VariableDisplayWarning),
- #[error("Duplicate dataset attributes with names: {}.", attributes.iter().join(", "))]
- DuplicateFileAttributes { attributes: Vec<Identifier> },
+ /// Warning for variable set record.
+ #[error("In variable set record: {0}")]
+ VariableSet(#[from] VariableSetWarning),
- #[error("Compression bias is {0} instead of the usual values of 0 or 100.")]
- UnexpectedBias(f64),
+ /// Warning for ZLIB trailer.
+ #[error("In ZLIB trailer: {0}")]
+ ZlibTrailer(#[from] ZlibTrailerWarning),
- #[error("ZLIB block descriptor {index} reported block size {actual:#x}, when {expected:#x} was expected.")]
- ZlibTrailerBlockWrongSize {
- index: usize,
- actual: u32,
- expected: u32,
- },
+ /// Bad encoding name.
+ #[error("Encoding record contains an encoding name that is not valid UTF-8.")]
+ BadEncodingName,
- #[error("ZLIB block descriptor {index} reported block size {actual:#x}, when at most {max_expected:#x} was expected.")]
- ZlibTrailerBlockTooBig {
- index: usize,
- actual: u32,
- max_expected: u32,
+ /// Mis-encoded bytes in string.
+    // XXX This is risky because `text` might be arbitrarily long.
+ #[error("Text string contains invalid bytes for {encoding} encoding: {text:?}")]
+ MalformedString {
+ /// The encoding.
+ encoding: String,
+ /// The problematic string.
+ text: String,
},
- #[error("Details TBD (raw)")]
- TBD,
+ /// Encoding error.
+ #[error(transparent)]
+ EncodingError(#[from] EncodingError),
}
-impl From<IoError> for Warning {
+impl From<IoError> for WarningDetails {
fn from(_source: IoError) -> Self {
- Self::UnexpectedEndOfData
+ Self::Extension(ExtensionWarning::UnexpectedEndOfData)
}
}
+/// A raw record in a system file.
#[derive(Clone, Debug)]
pub enum Record {
- Header(HeaderRecord<RawString>),
- Variable(VariableRecord<RawString>),
- ValueLabel(ValueLabelRecord<RawStrArray<8>, RawString>),
- Document(DocumentRecord<RawDocumentLine>),
- IntegerInfo(IntegerInfoRecord),
- FloatInfo(FloatInfoRecord),
- VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord<RawString, RawString>),
- LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
- LongStringMissingValues(LongStringMissingValueRecord<RawString>),
- Encoding(EncodingRecord),
- NumberOfCases(NumberOfCasesRecord),
- VariableSets(RawVariableSetRecord),
- ProductInfo(RawProductInfoRecord),
- LongNames(RawLongNamesRecord),
- VeryLongStrings(RawVeryLongStringsRecord),
- FileAttributes(RawFileAttributesRecord),
- VariableAttributes(RawVariableAttributesRecord),
- OtherExtension(Extension),
- EndOfHeaders(u32),
- ZHeader(ZHeader),
- ZTrailer(ZTrailer),
-}
-
+ /// Variable record.
+ ///
+ /// Each numeric variable has one variable record. Each string variable has
+ /// one variable record per 8-byte segment.
+ Variable(
+ /// The record.
+ VariableRecord<RawString>,
+ ),
+
+ /// Value labels for numeric and short string variables.
+ ///
+ /// These appear after the variable records.
+ ValueLabel(
+ /// The record.
+ ValueLabelRecord<RawDatum, RawString>,
+ ),
+
+ /// Document record.
+ Document(
+ /// The record.
+ DocumentRecord<RawDocumentLine>,
+ ),
+
+ /// Integer info record.
+ IntegerInfo(
+ /// The record.
+ IntegerInfoRecord,
+ ),
+
+ /// Floating-point info record.
+ FloatInfo(
+ /// The record.
+ FloatInfoRecord,
+ ),
+
+ /// Variable display record.
+ VarDisplay(
+ /// The record.
+ VarDisplayRecord,
+ ),
+
+ /// Multiple response variable record.
+ MultipleResponse(
+ /// The record.
+ MultipleResponseRecord<RawString, RawString>,
+ ),
+
+ /// Value labels for long string variables.
+ LongStringValueLabels(
+ /// The record.
+ LongStringValueLabelRecord<RawString, RawString>,
+ ),
+
+ /// Missing values for long string variables.
+ ///
+ /// Missing values for numeric and short string variables appear in the
+ /// variable records.
+ LongStringMissingValues(
+ /// The record.
+ LongStringMissingValueRecord<RawString>,
+ ),
+
+ /// Encoding record.
+ ///
+ /// All the strings in the file are encoded in this encoding, even for
+ /// strings that precede this record.
+ Encoding(
+ /// The record.
+ EncodingRecord,
+ ),
+
+ /// Extended number of cases.
+ ///
+ /// The header record records the number of cases but it only uses a 32-bit
+ /// field.
+ NumberOfCases(
+ /// The record.
+ NumberOfCasesRecord,
+ ),
+
+ /// Variable sets.
+ VariableSets(
+ /// The record.
+ RawVariableSetRecord,
+ ),
+
+ /// Product info.
+ ///
+ /// This supplements the product in the header record.
+ ProductInfo(
+ /// The record.
+ RawProductInfoRecord,
+ ),
+
+ /// Long variable names.
+ LongNames(
+ /// The record.
+ RawLongNamesRecord,
+ ),
+
+ /// Very long string variables, for strings longer than 255 bytes.
+ VeryLongStrings(
+ /// The record.
+ RawVeryLongStringsRecord,
+ ),
+
+ /// File attributes.
+ FileAttributes(
+ /// The record.
+ RawFileAttributesRecord,
+ ),
+
+ /// Variable attributes.
+ VariableAttributes(
+ /// The record.
+ RawVariableAttributesRecord,
+ ),
+
+ /// Extension records not otherwise supported.
+ OtherExtension(
+ /// The record.
+ Extension,
+ ),
+
+ /// End of headers.
+ EndOfHeaders(
+ /// The record.
+ u32,
+ ),
+
+ /// Header record for ZLIB-compressed data.
+ ZHeader(
+ /// The record.
+ ZHeader,
+ ),
+
+ /// Trailer record for ZLIB-compressed data.
+ ZTrailer(
+ /// The record.
+ ZTrailer,
+ ),
+}
+
+/// A [Record] that has been decoded to a more usable form.
+///
+/// Some records can be understood raw, but others need to have strings decoded
+/// (and interpreted as identifiers) or raw data interpreted as either numbers
+/// or strings.
#[derive(Clone, Debug)]
pub enum DecodedRecord {
- Header(HeaderRecord<String>),
+ /// Variable record, with strings decoded.
Variable(VariableRecord<String>),
- ValueLabel(ValueLabelRecord<RawStrArray<8>, String>),
+
+ /// Value label, with strings decoded.
+ ValueLabel(ValueLabelRecord<RawDatum, String>),
+
+ /// Documents, with strings decoded.
Document(DocumentRecord<String>),
+
+ /// Integer info.
IntegerInfo(IntegerInfoRecord),
+
+ /// Floating-point info.
FloatInfo(FloatInfoRecord),
+
+ /// Variable display info.
VarDisplay(VarDisplayRecord),
+
+ /// Multiple response sets, with strings decoded.
MultipleResponse(MultipleResponseRecord<Identifier, String>),
+
+ /// Long string value labels, with strings decoded.
LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
+
+ /// Long string missing values, with strings decoded.
LongStringMissingValues(LongStringMissingValueRecord<Identifier>),
+
+ /// Encoding record.
Encoding(EncodingRecord),
+
+ /// Number of cases record.
NumberOfCases(NumberOfCasesRecord),
+
+ /// Variable sets.
VariableSets(VariableSetRecord),
+
+ /// Product info.
ProductInfo(ProductInfoRecord),
+
+ /// Long variable names.
LongNames(LongNamesRecord),
+
+ /// Very long string variables.
VeryLongStrings(VeryLongStringsRecord),
+
+ /// File attributes.
FileAttributes(FileAttributesRecord),
+
+ /// Variable attributes.
VariableAttributes(VariableAttributesRecord),
+
+ /// Extension records not otherwise supported.
OtherExtension(Extension),
+
+ /// End of headers.
EndOfHeaders(u32),
+
+ /// Header record for ZLIB-compressed data.
ZHeader(ZHeader),
+
+ /// Trailer record for ZLIB-compressed data.
ZTrailer(ZTrailer),
}
999 => Ok(Some(Record::EndOfHeaders(
endian.parse(read_bytes(reader)?),
))),
- _ => Err(Error::BadRecordType {
- offset: reader.stream_position()?,
- rec_type,
- }),
+ _ => Err(Error::new(
+ {
+ let offset = reader.stream_position()?;
+ Some(offset - 4..offset)
+ },
+ ErrorDetails::BadRecordType(rec_type),
+ )),
}
}
- pub fn decode(self, decoder: &mut Decoder) -> Result<DecodedRecord, Error> {
- Ok(match self {
- Record::Header(record) => record.decode(decoder),
- Record::Variable(record) => record.decode(decoder),
+ /// Decodes this record into a [DecodedRecord] using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
+ match self {
+ Record::Variable(record) => DecodedRecord::Variable(record.decode(decoder)),
Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
- Record::Document(record) => record.decode(decoder),
+ Record::Document(record) => DecodedRecord::Document(record.decode(decoder)),
Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()),
Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()),
Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()),
- Record::MultipleResponse(record) => record.decode(decoder),
+ Record::MultipleResponse(record) => {
+ DecodedRecord::MultipleResponse(record.decode(decoder))
+ }
Record::LongStringValueLabels(record) => {
DecodedRecord::LongStringValueLabels(record.decode(decoder))
}
Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()),
Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()),
- })
+ }
}
}
-pub fn encoding_from_headers(
- headers: &Vec<Record>,
- warn: &mut impl FnMut(Warning),
+/// Given the raw `records` read from a system file, this tries to figure out
+/// the intended character encoding used in the system file:
+///
+/// - If there is a character encoding record, it uses that encoding.
+///
+/// - If there is an integer info record, it uses that encoding.
+///
+/// - Otherwise, it falls back to a default encoding and issues a warning with
+/// `warn`.
+///
+/// If the records specify an EBCDIC encoding, this fails with an error because
+/// PSPP only supports ASCII-based encodings.
+pub fn infer_encoding(
+ records: &[Record],
+ mut warn: impl FnMut(Warning),
) -> Result<&'static Encoding, Error> {
- let mut encoding_record = None;
- let mut integer_info_record = None;
- for record in headers {
- match record {
- Record::Encoding(record) => encoding_record = Some(record),
- Record::IntegerInfo(record) => integer_info_record = Some(record),
- _ => (),
- }
- }
- let encoding = encoding_record.map(|record| record.0.as_str());
- let character_code = integer_info_record.map(|record| record.character_code);
+ // Get the character encoding from the first (and only) encoding record.
+ let encoding = records
+ .iter()
+ .filter_map(|record| match record {
+ Record::Encoding(record) => Some(record.0.as_str()),
+ _ => None,
+ })
+ .next();
+
+ // Get the character code from the first (only) integer info record.
+ let character_code = records
+ .iter()
+ .filter_map(|record| match record {
+ Record::IntegerInfo(record) => Some(record.character_code),
+ _ => None,
+ })
+ .next();
+
match get_encoding(encoding, character_code) {
Ok(encoding) => Ok(encoding),
- Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
+ Err(err @ EncodingError::Ebcdic) => Err(Error::new(None, err.into())),
Err(err) => {
- warn(Warning::EncodingError(err));
+ warn(Warning::new(None, err));
// Warn that we're using the default encoding.
Ok(default_encoding())
}
}
}
-// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
-// decoded as Latin-1 (actually bytes interpreted as Unicode code points).
-fn default_decode(s: &[u8]) -> Cow<str> {
- from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Compression {
- Simple,
- ZLib,
-}
-
-#[derive(Clone)]
-pub struct HeaderRecord<S>
-where
- S: Debug,
-{
- /// Offset in file.
- pub offsets: Range<u64>,
-
- /// Magic number.
- pub magic: Magic,
-
- /// Eye-catcher string, product name, in the file's encoding. Padded
- /// on the right with spaces.
- pub eye_catcher: S,
-
- /// Layout code, normally either 2 or 3.
- pub layout_code: u32,
-
- /// Number of variable positions, or `None` if the value in the file is
- /// questionably trustworthy.
- pub nominal_case_size: Option<u32>,
-
- /// Compression type, if any,
- pub compression: Option<Compression>,
-
- /// 1-based variable index of the weight variable, or `None` if the file is
- /// unweighted.
- pub weight_index: Option<u32>,
-
- /// Claimed number of cases, if known.
- pub n_cases: Option<u32>,
-
- /// Compression bias, usually 100.0.
- pub bias: f64,
-
- /// `dd mmm yy` in the file's encoding.
- pub creation_date: S,
-
- /// `HH:MM:SS` in the file's encoding.
- pub creation_time: S,
-
- /// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: S,
+/// An [Encoding] along with a function to report decoding errors.
+///
+/// This is used by functions that decode raw records.
+pub struct Decoder<'a> {
+ /// The character encoding to use.
+ pub encoding: &'static Encoding,
- /// Endianness of the data in the file header.
- pub endian: Endian,
+    /// Used to report [Warning]s during decoding.
+ pub warn: Box<dyn FnMut(Warning) + 'a>,
}
-impl<S> HeaderRecord<S>
-where
- S: Debug,
-{
- fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
+impl<'de> Decoder<'de> {
+ /// Constructs a decoder for an encoding read or inferred from `records`
+    /// (using [infer_encoding]). This can fail if the records specify an
+ /// EBCDIC encoding, since this crate only supports ASCII-based encodings.
+ ///
+ /// `warn` will be used to report warnings while decoding records.
+ pub fn with_inferred_encoding<F>(records: &[Record], mut warn: F) -> Result<Self, Error>
where
- T: Debug,
+ F: FnMut(Warning) + 'de,
{
- writeln!(f, "{name:>17}: {:?}", value)
- }
-}
-
-impl<S> Debug for HeaderRecord<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "File header record:")?;
- self.debug_field(f, "Magic", self.magic)?;
- self.debug_field(f, "Product name", &self.eye_catcher)?;
- self.debug_field(f, "Layout code", self.layout_code)?;
- self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
- self.debug_field(f, "Compression", self.compression)?;
- self.debug_field(f, "Weight index", self.weight_index)?;
- self.debug_field(f, "Number of cases", self.n_cases)?;
- self.debug_field(f, "Compression bias", self.bias)?;
- self.debug_field(f, "Creation date", &self.creation_date)?;
- self.debug_field(f, "Creation time", &self.creation_time)?;
- self.debug_field(f, "File label", &self.file_label)?;
- self.debug_field(f, "Endianness", self.endian)
- }
-}
-
-impl HeaderRecord<RawString> {
- fn read<R: Read + Seek>(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result<Self, Error> {
- let start = r.stream_position()?;
-
- let magic: [u8; 4] = read_bytes(r)?;
- let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
-
- let eye_catcher = RawString(read_vec(r, 60)?);
- let layout_code: [u8; 4] = read_bytes(r)?;
- let endian = Endian::identify_u32(2, layout_code)
- .or_else(|| Endian::identify_u32(2, layout_code))
- .ok_or(Error::NotASystemFile)?;
- let layout_code = endian.parse(layout_code);
-
- let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
- let nominal_case_size = (1..i32::MAX as u32 / 16)
- .contains(&nominal_case_size)
- .then_some(nominal_case_size);
-
- let compression_code: u32 = endian.parse(read_bytes(r)?);
- let compression = match (magic, compression_code) {
- (Magic::Zsav, 2) => Some(Compression::ZLib),
- (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
- (_, 0) => None,
- (_, 1) => Some(Compression::Simple),
- (_, code) => return Err(Error::InvalidSavCompression(code)),
- };
-
- let weight_index: u32 = endian.parse(read_bytes(r)?);
- let weight_index = (weight_index > 0).then_some(weight_index);
-
- let n_cases: u32 = endian.parse(read_bytes(r)?);
- let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
-
- let bias: f64 = endian.parse(read_bytes(r)?);
- if bias != 100.0 && bias != 0.0 {
- warn(Warning::UnexpectedBias(bias));
- }
-
- let creation_date = RawString(read_vec(r, 9)?);
- let creation_time = RawString(read_vec(r, 8)?);
- let file_label = RawString(read_vec(r, 64)?);
- let _: [u8; 3] = read_bytes(r)?;
-
- Ok(HeaderRecord {
- offsets: start..r.stream_position()?,
- magic,
- layout_code,
- nominal_case_size,
- compression,
- weight_index,
- n_cases,
- bias,
- creation_date,
- creation_time,
- eye_catcher,
- file_label,
- endian,
- })
- }
-
- pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
- let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
- let file_label = decoder.decode(&self.file_label).to_string();
- let creation_date = decoder.decode(&self.creation_date).to_string();
- let creation_time = decoder.decode(&self.creation_time).to_string();
- DecodedRecord::Header(HeaderRecord {
- eye_catcher,
- weight_index: self.weight_index,
- n_cases: self.n_cases,
- file_label,
- offsets: self.offsets.clone(),
- magic: self.magic,
- layout_code: self.layout_code,
- nominal_case_size: self.nominal_case_size,
- compression: self.compression,
- bias: self.bias,
- creation_date,
- creation_time,
- endian: self.endian,
- })
+ let encoding = infer_encoding(records, &mut warn)?;
+ Ok(Self::new(encoding, warn))
}
-}
-
-pub struct Decoder<'a> {
- pub encoding: &'static Encoding,
- pub warn: Box<dyn FnMut(Warning) + 'a>,
-}
-impl<'de> Decoder<'de> {
+ /// Construct a decoder using `encoding`.
+ ///
+ /// `warn` will be used to report warnings while decoding records.
pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
where
F: FnMut(Warning) + 'de,
warn: Box::new(warn),
}
}
+
+    /// Consumes this decoder, returning its encoding.
+ pub fn into_encoding(self) -> &'static Encoding {
+ self.encoding
+ }
+
fn warn(&mut self, warning: Warning) {
(self.warn)(warning)
}
+
fn decode_slice<'a>(&mut self, input: &'a [u8]) -> Cow<'a, str> {
let (output, malformed) = self.encoding.decode_without_bom_handling(input);
if malformed {
- self.warn(Warning::MalformedString {
- encoding: self.encoding.name().into(),
- text: output.clone().into(),
- });
+ self.warn(Warning::new(
+ None,
+ WarningDetails::MalformedString {
+ encoding: self.encoding.name().into(),
+ text: output.clone().into(),
+ },
+ ));
}
output
}
self.decode_slice(input.0.as_slice())
}
+ /// Decodes `input` to an [Identifier] using our encoding.
pub fn decode_identifier(&mut self, input: &RawString) -> Result<Identifier, IdError> {
let decoded = &self.decode(input);
self.new_identifier(decoded)
}
+ /// Constructs an [Identifier] from `name` using our encoding.
pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
Identifier::from_encoding(name, self.encoding)
}
}
+/// System file type, inferred from its "magic number".
+///
+/// The magic number is the first four bytes of the file.
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub enum Magic {
/// Regular system file.
}
impl TryFrom<[u8; 4]> for Magic {
- type Error = Error;
+ type Error = ErrorDetails;
fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
match value {
Magic::SAV => Ok(Magic::Sav),
Magic::ZSAV => Ok(Magic::Zsav),
Magic::EBCDIC => Ok(Magic::Ebcdic),
- _ => Err(Error::BadMagic(value)),
- }
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum VarType {
- Numeric,
- String,
-}
-
-impl Not for VarType {
- type Output = Self;
-
- fn not(self) -> Self::Output {
- match self {
- Self::Numeric => Self::String,
- Self::String => Self::Numeric,
- }
- }
-}
-
-impl Not for &VarType {
- type Output = VarType;
-
- fn not(self) -> Self::Output {
- !*self
- }
-}
-
-impl Display for VarType {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match self {
- VarType::Numeric => write!(f, "numeric"),
- VarType::String => write!(f, "string"),
+ _ => Err(ErrorDetails::BadMagic(value)),
}
}
}
}
}
-type RawDatum = Datum<RawStrArray<8>>;
+/// An 8-byte [Datum] but we don't know the string width or character encoding.
+#[derive(Copy, Clone)]
+pub enum RawDatum {
+ /// Number.
+ Number(
+ /// Numeric value.
+ ///
+ /// `None` represents the system-missing value.
+ Option<f64>,
+ ),
+ /// String.
+ String(
+        /// String value.
+        ///
+        /// The true string width and character encoding are unknown.
+ [u8; 8],
+ ),
+}
+
+impl Debug for RawDatum {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ match self {
+ RawDatum::Number(Some(number)) => write!(f, "{number:?}"),
+ RawDatum::Number(None) => write!(f, "SYSMIS"),
+ RawDatum::String(s) => write!(f, "{:?}", RawStr::from_bytes(s)),
+ }
+ }
+}
impl RawDatum {
+ /// Constructs a `RawDatum` from `raw` given that we now know the variable
+ /// type and endianness.
pub fn from_raw(raw: &UntypedDatum, var_type: VarType, endian: Endian) -> Self {
match var_type {
- VarType::String => Datum::String(RawStrArray(raw.0)),
- VarType::Numeric => Datum::Number(endian.parse(raw.0)),
+ VarType::String => RawDatum::String(raw.0),
+ VarType::Numeric => RawDatum::Number(endian.parse(raw.0)),
+ }
+ }
+
+ /// Decodes a `RawDatum` into a [Datum] given that we now know the string
+ /// width.
+ pub fn decode(&self, width: VarWidth) -> Datum {
+ match self {
+ Self::Number(x) => Datum::Number(*x),
+ Self::String(s) => {
+ let width = width.as_string_width().unwrap();
+ Datum::String(RawString::from(&s[..width]))
+ }
}
}
}
reader: &mut R,
case_vars: &[CaseVar],
endian: Endian,
- ) -> Result<Option<Vec<Self>>, Error> {
+ ) -> Result<Option<Case>, Error> {
fn eof<R: Seek>(
reader: &mut R,
case_vars: &[CaseVar],
case_start: u64,
- ) -> Result<Option<Vec<Datum>>, Error> {
+ ) -> Result<Option<Case>, Error> {
let offset = reader.stream_position()?;
if offset == case_start {
Ok(None)
} else {
- Err(Error::EofInCase {
- offset,
- case_ofs: offset - case_start,
- case_len: case_vars.iter().map(CaseVar::bytes).sum(),
- })
+ Err(Error::new(
+ Some(case_start..offset),
+ ErrorDetails::EofInCase {
+ case_ofs: offset - case_start,
+ case_len: case_vars.iter().map(CaseVar::bytes).sum(),
+ },
+ ))
}
}
}
}
}
- Ok(Some(values))
+ Ok(Some(Case(values)))
}
fn read_compressed_chunk<R: Read>(
codes: &mut VecDeque<u8>,
endian: Endian,
bias: f64,
- ) -> Result<Option<Vec<Self>>, Error> {
+ ) -> Result<Option<Case>, Error> {
fn eof<R: Seek>(
reader: &mut R,
case_start: u64,
n_chunks: usize,
- ) -> Result<Option<Vec<Datum>>, Error> {
+ ) -> Result<Option<Case>, Error> {
let offset = reader.stream_position()?;
if n_chunks > 0 {
- Err(Error::EofInCompressedCase {
- case_ofs: offset - case_start,
- n_chunks,
- offset,
- })
+ Err(Error::new(
+ Some(case_start..offset),
+ ErrorDetails::EofInCompressedCase {
+ case_ofs: offset - case_start,
+ n_chunks,
+ },
+ ))
} else {
Ok(None)
}
}
}
}
- Ok(Some(values))
- }
-}
-
-impl RawDatum {
- pub fn decode(&self, width: VarWidth) -> Datum {
- match self {
- Self::Number(x) => Datum::Number(*x),
- Self::String(s) => {
- let width = width.as_string_width().unwrap();
- Datum::String(RawString::from(&s.0[..width]))
- }
- }
+ Ok(Some(Case(values)))
}
}
}
enum ReaderState {
- Start,
Headers,
ZlibHeader,
ZlibTrailer(ZHeader),
End,
}
+/// Reads records from a system file in their raw form.
pub struct Reader<'a, R>
where
R: Read + Seek + 'static,
reader: Option<R>,
warn: Box<dyn FnMut(Warning) + 'a>,
- header: HeaderRecord<RawString>,
+ header: FileHeader<RawString>,
var_types: VarTypes,
state: ReaderState,
where
R: Read + Seek + 'static,
{
+ /// Constructs a new [Reader] from the underlying `reader`. Any warnings
+ /// encountered while reading the system file will be reported with `warn`.
+ ///
+ /// To read an encrypted system file, wrap `reader` in
+ /// [EncryptedReader](crate::crypto::EncryptedReader).
pub fn new(mut reader: R, mut warn: impl FnMut(Warning) + 'a) -> Result<Self, Error> {
- let header = HeaderRecord::read(&mut reader, &mut warn)?;
+ let header = FileHeader::read(&mut reader, &mut warn)?;
Ok(Self {
reader: Some(reader),
warn: Box::new(warn),
header,
var_types: VarTypes::new(),
- state: ReaderState::Start,
+ state: ReaderState::Headers,
cases: None,
})
}
- pub fn headers<'b>(&'b mut self) -> ReadHeaders<'a, 'b, R> {
- ReadHeaders(self)
+
+ /// Returns the header in this reader.
+ pub fn header(&self) -> &FileHeader<RawString> {
+ &self.header
+ }
+
+ /// Returns a structure for reading the system file's records.
+ pub fn records<'b>(&'b mut self) -> Records<'a, 'b, R> {
+ Records(self)
}
+
+ /// Returns a structure for reading the system file's cases.
+ ///
+ /// The cases are only available once all the headers have been read. If
+ /// there is an error reading the headers, or if [cases](Self::cases) is
+ /// called before all of the headers have been read, the returned [Cases]
+ /// will be empty.
pub fn cases(self) -> Cases {
self.cases.unwrap_or_default()
}
}
-pub struct ReadHeaders<'a, 'b, R>(&'b mut Reader<'a, R>)
+/// Reads raw records from a system file.
+pub struct Records<'a, 'b, R>(&'b mut Reader<'a, R>)
where
R: Read + Seek + 'static;
-impl<'a, 'b, R> ReadHeaders<'a, 'b, R>
+impl<'a, 'b, R> Records<'a, 'b, R>
where
R: Read + Seek + 'static,
{
));
}
- fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
+ fn next_inner(&mut self) -> Option<<Self as Iterator>::Item> {
match self.0.state {
- ReaderState::Start => {
- self.0.state = ReaderState::Headers;
- Some(Ok(Record::Header(self.0.header.clone())))
- }
ReaderState::Headers => {
let record = loop {
match Record::read(
}
}
-impl<'a, 'b, R> Iterator for ReadHeaders<'a, 'b, R>
+impl<'a, 'b, R> Iterator for Records<'a, 'b, R>
where
R: Read + Seek + 'static,
{
type Item = Result<Record, Error>;
fn next(&mut self) -> Option<Self::Item> {
- let retval = self._next();
- if matches!(retval, Some(Err(_))) {
- self.0.state = ReaderState::End;
- }
- retval
+ self.next_inner().inspect(|retval| {
+ if retval.is_err() {
+ self.0.state = ReaderState::End;
+ }
+ })
}
}
trait ReadSeek: Read + Seek {}
impl<T> ReadSeek for T where T: Read + Seek {}
-pub struct Case(pub Vec<Datum>);
-
#[derive(Debug)]
struct StringSegment {
data_bytes: usize,
}
}
+/// Reader for cases in a system file.
+///
+/// - [Reader::cases] returns [Cases] in which very long string variables (those
+/// over 255 bytes wide) are still in their raw format, which means that they
+/// are divided into multiple, adjacent string variables, approximately one
+/// variable for each 252 bytes.
+///
+/// - In the [Cases] in [SystemFile], each [Dictionary] variable corresponds to
+/// one [Datum], even for long string variables.
+///
+/// [Dictionary]: crate::dictionary::Dictionary
+/// [SystemFile]: crate::sys::cooked::SystemFile
pub struct Cases {
reader: Box<dyn ReadSeek>,
case_vars: Vec<CaseVar>,
}
impl Cases {
- fn new<R>(reader: R, var_types: VarTypes, header: &HeaderRecord<RawString>) -> Self
+ fn new<R>(reader: R, var_types: VarTypes, header: &FileHeader<RawString>) -> Self
where
R: Read + Seek + 'static,
{
}
}
+ /// Returns this [Cases] with its notion of variable widths updated from
+ /// `widths`.
+ ///
+ /// [Records::decode](crate::sys::Records::decode) uses this to properly handle
+ /// very long string variables (see [Cases] for details).
pub fn with_widths(self, widths: impl IntoIterator<Item = VarWidth>) -> Self {
Self {
case_vars: widths.into_iter().map(CaseVar::new).collect::<Vec<_>>(),
}
}
+ /// Returns this [Cases] updated to expect `expected_cases`. If the actual
+ /// number of cases in the file differs, the reader will issue a warning.
pub fn with_expected_cases(self, expected_cases: u64) -> Self {
Self {
expected_cases: Some(expected_cases),
}
impl Iterator for Cases {
- type Item = Result<Vec<Datum>, Error>;
+ type Item = Result<Case, Error>;
fn next(&mut self) -> Option<Self::Item> {
if self.eof {
if let Some(expected_cases) = self.expected_cases
&& expected_cases != self.read_cases
{
- return Some(Err(Error::WrongNumberOfCases {
- expected: expected_cases,
- actual: self.read_cases,
- }));
+ return Some(Err(Error::new(
+ None,
+ ErrorDetails::WrongNumberOfCases {
+ expected: expected_cases,
+ actual: self.read_cases,
+ },
+ )));
} else {
return None;
}
}
}
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub struct Spec(pub u32);
-
-impl Debug for Spec {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let type_ = format_name(self.0 >> 16);
- let w = (self.0 >> 8) & 0xff;
- let d = self.0 & 0xff;
- write!(f, "{:06x} ({type_}{w}.{d})", self.0)
- }
-}
-
-fn format_name(type_: u32) -> Cow<'static, str> {
- match type_ {
- 1 => "A",
- 2 => "AHEX",
- 3 => "COMMA",
- 4 => "DOLLAR",
- 5 => "F",
- 6 => "IB",
- 7 => "PIBHEX",
- 8 => "P",
- 9 => "PIB",
- 10 => "PK",
- 11 => "RB",
- 12 => "RBHEX",
- 15 => "Z",
- 16 => "N",
- 17 => "E",
- 20 => "DATE",
- 21 => "TIME",
- 22 => "DATETIME",
- 23 => "ADATE",
- 24 => "JDATE",
- 25 => "DTIME",
- 26 => "WKDAY",
- 27 => "MONTH",
- 28 => "MOYR",
- 29 => "QYR",
- 30 => "WKYR",
- 31 => "PCT",
- 32 => "DOT",
- 33 => "CCA",
- 34 => "CCB",
- 35 => "CCC",
- 36 => "CCD",
- 37 => "CCE",
- 38 => "EDATE",
- 39 => "SDATE",
- 40 => "MTIME",
- 41 => "YMDHMS",
- _ => return format!("<unknown format {type_}>").into(),
- }
- .into()
-}
-
-#[derive(Clone, Default)]
-pub struct MissingValues {
- /// Individual missing values, up to 3 of them.
- values: Vec<Datum>,
-
- /// Optional range of missing values.
- range: Option<MissingValueRange>,
-}
-
-impl Debug for MissingValues {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- DisplayMissingValues {
- mv: self,
- encoding: None,
- }
- .fmt(f)
- }
-}
-
-#[derive(Copy, Clone, Debug)]
-pub enum MissingValuesError {
- TooMany,
- TooWide,
- MixedTypes,
-}
-
-impl MissingValues {
- pub fn new(
- mut values: Vec<Datum>,
- range: Option<MissingValueRange>,
- ) -> Result<Self, MissingValuesError> {
- if values.len() > 3 {
- return Err(MissingValuesError::TooMany);
- }
-
- let mut var_type = None;
- for value in values.iter_mut() {
- value.trim_end();
- match value.width() {
- VarWidth::String(w) if w > 8 => return Err(MissingValuesError::TooWide),
- _ => (),
- }
- if var_type.is_some_and(|t| t != value.var_type()) {
- return Err(MissingValuesError::MixedTypes);
- }
- var_type = Some(value.var_type());
- }
-
- if var_type == Some(VarType::String) && range.is_some() {
- return Err(MissingValuesError::MixedTypes);
- }
-
- Ok(Self { values, range })
- }
-
- pub fn is_empty(&self) -> bool {
- self.values.is_empty() && self.range.is_none()
- }
-
- pub fn var_type(&self) -> Option<VarType> {
- if let Some(datum) = self.values.first() {
- Some(datum.var_type())
- } else if self.range.is_some() {
- Some(VarType::Numeric)
- } else {
- None
- }
- }
-
- pub fn contains(&self, value: &Datum) -> bool {
- if self
- .values
- .iter()
- .any(|datum| datum.eq_ignore_trailing_spaces(value))
- {
- return true;
- }
-
- match value {
- Datum::Number(Some(number)) => self.range.is_some_and(|range| range.contains(*number)),
- _ => false,
- }
- }
-
- pub fn is_resizable(&self, width: VarWidth) -> bool {
- self.values.iter().all(|datum| datum.is_resizable(width))
- && self.range.iter().all(|range| range.is_resizable(width))
- }
-
- pub fn resize(&mut self, width: VarWidth) {
- for datum in &mut self.values {
- datum.resize(width);
- }
- if let Some(range) = &mut self.range {
- range.resize(width);
- }
- }
-
- fn read<R: Read + Seek>(
- r: &mut R,
- offset: u64,
- raw_width: RawWidth,
- code: i32,
- endian: Endian,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Self, Error> {
- let (individual_values, has_range) = match code {
- 0 => return Ok(Self::default()),
- 1..=3 => (code as usize, false),
- -2 => (0, true),
- -3 => (1, true),
- _ => return Err(Error::BadMissingValueCode { offset, code }),
- };
-
- let mut values = Vec::with_capacity(individual_values);
- let range = if has_range {
- let low = read_bytes::<8, _>(r)?;
- let high = read_bytes::<8, _>(r)?;
- Some((low, high))
- } else {
- None
- };
- for _ in 0..individual_values {
- values.push(read_bytes::<8, _>(r)?);
- }
-
- match VarWidth::try_from(raw_width) {
- Ok(VarWidth::Numeric) => {
- let values = values
- .into_iter()
- .map(|v| Datum::Number(endian.parse(v)))
- .collect();
-
- let range = range.map(|(low, high)| {
- MissingValueRange::new(endian.parse(low), endian.parse(high))
- });
- return Ok(Self::new(values, range).unwrap());
- }
- Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::MissingValueStringRange),
- Ok(VarWidth::String(width)) => {
- let width = width.min(8) as usize;
- let values = values
- .into_iter()
- .map(|value| Datum::String(RawString::from(&value[..width])))
- .collect();
- return Ok(Self::new(values, None).unwrap());
- }
- Err(()) => warn(Warning::MissingValueContinuation(offset)),
- }
- Ok(Self::default())
- }
-
- pub fn display(&self, encoding: &'static Encoding) -> DisplayMissingValues<'_> {
- DisplayMissingValues {
- mv: self,
- encoding: Some(encoding),
- }
- }
-}
-
-pub struct DisplayMissingValues<'a> {
- mv: &'a MissingValues,
- encoding: Option<&'static Encoding>,
-}
-
-impl<'a> Display for DisplayMissingValues<'a> {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- if let Some(range) = &self.mv.range {
- write!(f, "{range}")?;
- if !self.mv.values.is_empty() {
- write!(f, "; ")?;
- }
- }
-
- for (i, value) in self.mv.values.iter().enumerate() {
- if i > 0 {
- write!(f, "; ")?;
- }
- match self.encoding {
- Some(encoding) => value.display_plain(encoding).fmt(f)?,
- None => value.fmt(f)?,
- }
- }
-
- if self.mv.is_empty() {
- write!(f, "none")?;
- }
- Ok(())
- }
-}
-
-#[derive(Copy, Clone)]
-pub enum MissingValueRange {
- In { low: f64, high: f64 },
- From { low: f64 },
- To { high: f64 },
-}
-
-impl MissingValueRange {
- pub fn new(low: f64, high: f64) -> Self {
- const LOWEST: f64 = f64::MIN.next_up();
- match (low, high) {
- (f64::MIN | LOWEST, _) => Self::To { high },
- (_, f64::MAX) => Self::From { low },
- (_, _) => Self::In { low, high },
- }
- }
-
- pub fn low(&self) -> Option<f64> {
- match self {
- MissingValueRange::In { low, .. } | MissingValueRange::From { low } => Some(*low),
- MissingValueRange::To { .. } => None,
- }
- }
-
- pub fn high(&self) -> Option<f64> {
- match self {
- MissingValueRange::In { high, .. } | MissingValueRange::To { high } => Some(*high),
- MissingValueRange::From { .. } => None,
- }
- }
-
- pub fn contains(&self, number: f64) -> bool {
- match self {
- MissingValueRange::In { low, high } => (*low..*high).contains(&number),
- MissingValueRange::From { low } => number >= *low,
- MissingValueRange::To { high } => number <= *high,
- }
- }
-
- pub fn is_resizable(&self, width: VarWidth) -> bool {
- width.is_numeric()
- }
-
- pub fn resize(&self, width: VarWidth) {
- assert_eq!(width, VarWidth::Numeric);
- }
-}
-
-impl Display for MissingValueRange {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- match self.low() {
- Some(low) => low.display_plain().fmt(f)?,
- None => write!(f, "LOW")?,
- }
-
- write!(f, " THRU ")?;
-
- match self.high() {
- Some(high) => high.display_plain().fmt(f)?,
- None => write!(f, "HIGH")?,
- }
- Ok(())
- }
-}
-
-#[derive(Clone)]
-pub struct VariableRecord<S>
-where
- S: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// Variable width, in the range -1..=255.
- pub width: RawWidth,
-
- /// Variable name, padded on the right with spaces.
- pub name: S,
-
- /// Print format.
- pub print_format: Spec,
-
- /// Write format.
- pub write_format: Spec,
-
- /// Missing values.
- pub missing_values: MissingValues,
-
- /// Optional variable label.
- pub label: Option<S>,
-}
-
+/// Width of a variable record.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum RawWidth {
+ /// String continuation.
+ ///
+ /// One variable record of this type is present for each 8 bytes after
+ /// the first 8 bytes of a string variable, as a kind of placeholder.
Continuation,
+
+ /// Numeric.
Numeric,
+
+ /// String.
String(NonZeroU8),
}
impl RawWidth {
+ /// Returns the number of value positions corresponding to a variable with
+ /// this type.
pub fn n_values(&self) -> Option<usize> {
match self {
RawWidth::Numeric => Some(1),
RawWidth::String(width) => Some((width.get() as usize).div_ceil(8)),
- _ => None,
+ RawWidth::Continuation => None,
}
}
}
}
}
-impl<S> Debug for VariableRecord<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "Width: {}", self.width,)?;
- writeln!(f, "Print format: {:?}", self.print_format)?;
- writeln!(f, "Write format: {:?}", self.write_format)?;
- writeln!(f, "Name: {:?}", &self.name)?;
- writeln!(f, "Variable label: {:?}", self.label)?;
- writeln!(f, "Missing values: {:?}", self.missing_values)
- }
-}
-
-impl VariableRecord<RawString> {
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let width: i32 = endian.parse(read_bytes(r)?);
- let width: RawWidth = width.try_into().map_err(|_| Error::BadVariableWidth {
- start_offset,
- width,
- })?;
- let code_offset = r.stream_position()?;
- let has_variable_label: u32 = endian.parse(read_bytes(r)?);
- let missing_value_code: i32 = endian.parse(read_bytes(r)?);
- let print_format = Spec(endian.parse(read_bytes(r)?));
- let write_format = Spec(endian.parse(read_bytes(r)?));
- let name = RawString(read_vec(r, 8)?);
-
- let label = match has_variable_label {
- 0 => None,
- 1 => {
- let len: u32 = endian.parse(read_bytes(r)?);
- let read_len = len.min(65535) as usize;
- let label = RawString(read_vec(r, read_len)?);
-
- let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
- let _ = read_vec(r, padding_bytes as usize)?;
-
- Some(label)
- }
- _ => {
- return Err(Error::BadVariableLabelCode {
- start_offset,
- code_offset,
- code: has_variable_label,
- })
- }
- };
-
- let missing_values =
- MissingValues::read(r, start_offset, width, missing_value_code, endian, warn)?;
-
- let end_offset = r.stream_position()?;
-
- Ok(Record::Variable(VariableRecord {
- offsets: start_offset..end_offset,
- width,
- name,
- print_format,
- write_format,
- missing_values,
- label,
- }))
- }
-
- pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
- DecodedRecord::Variable(VariableRecord {
- offsets: self.offsets.clone(),
- width: self.width,
- name: decoder.decode(&self.name).to_string(),
- print_format: self.print_format,
- write_format: self.write_format,
- missing_values: self.missing_values,
- label: self
- .label
- .as_ref()
- .map(|label| decoder.decode(label).to_string()),
- })
- }
-}
-
+/// 8 bytes that represent a number or a string (but that's all we know).
+///
+/// Used when we don't know whether it's a number or a string, or the numerical
+/// endianness, or the string width, or the character encoding. Really all we
+/// know is that it's 8 bytes that mean something.
#[derive(Copy, Clone)]
pub struct UntypedDatum(pub [u8; 8]);
} else {
big
};
- write!(f, "{number}")?;
-
- let string = default_decode(&self.0);
- let string = string
- .split(|c: char| c == '\0' || c.is_control())
- .next()
- .unwrap();
- write!(f, "{string:?}")?;
- Ok(())
- }
-}
-
-/// An owned string in an unspecified encoding.
-///
-/// We assume that the encoding is one supported by [encoding_rs] with byte
-/// units (that is, not a `UTF-16` encoding). All of these encodings have some
-/// basic ASCII compatibility.
-///
-/// A [RawString] owns its contents and can grow and shrink, like a [Vec] or
-/// [String]. For a borrowed raw string, see [RawStr].
-#[derive(Clone, PartialEq, Default, Eq, PartialOrd, Ord, Hash)]
-pub struct RawString(pub Vec<u8>);
-
-impl RawString {
- pub fn spaces(n: usize) -> Self {
- Self(std::iter::repeat_n(b' ', n).collect())
- }
- pub fn as_encoded(&self, encoding: &'static Encoding) -> EncodedStr<'_> {
- EncodedStr::new(&self.0, encoding)
- }
- pub fn resize(&mut self, len: usize) {
- self.0.resize(len, b' ');
- }
- pub fn len(&self) -> usize {
- self.0.len()
- }
- pub fn trim_end(&mut self) {
- while self.0.pop_if(|c| *c == b' ').is_some() {}
- }
-}
-
-impl Borrow<RawStr> for RawString {
- fn borrow(&self) -> &RawStr {
- RawStr::from_bytes(&self.0)
- }
-}
-
-impl Deref for RawString {
- type Target = RawStr;
-
- fn deref(&self) -> &Self::Target {
- self.borrow()
+ write!(f, "{number}/{:?}", RawStr::from_bytes(&self.0))
}
}
-impl From<Cow<'_, [u8]>> for RawString {
- fn from(value: Cow<'_, [u8]>) -> Self {
- Self(value.into_owned())
- }
-}
+/// An N-byte raw string whose type and encoding are unknown.
+#[derive(Copy, Clone)]
+pub struct RawStrArray<const N: usize>(
+ /// Content.
+ pub [u8; N],
+);
-impl From<Vec<u8>> for RawString {
- fn from(source: Vec<u8>) -> Self {
+impl<const N: usize> From<[u8; N]> for RawStrArray<N> {
+ fn from(source: [u8; N]) -> Self {
Self(source)
}
}
-impl From<&[u8]> for RawString {
- fn from(source: &[u8]) -> Self {
- Self(source.into())
- }
-}
-
-impl Debug for RawString {
+impl<const N: usize> Debug for RawStrArray<N> {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(self.0.as_slice()))
+ write!(f, "{:?}", RawStr::from_bytes(&self.0))
}
}
-/// A borrowed string in an unspecified encoding.
-///
-/// We assume that the encoding is one supported by [encoding_rs] with byte
-/// units (that is, not a `UTF-16` encoding). All of these encodings have some
-/// basic ASCII compatibility.
-///
-/// For an owned raw string, see [RawString].
-#[repr(transparent)]
-#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct RawStr(pub [u8]);
-
-impl RawStr {
- pub fn from_bytes(bytes: &[u8]) -> &Self {
- // SAFETY: `RawStr` is a transparent wrapper around `[u8]`, so we can
- // turn a reference to the wrapped type into a reference to the wrapper
- // type.
- unsafe { &*(bytes as *const [u8] as *const Self) }
- }
-
- pub fn as_bytes(&self) -> &[u8] {
- &self.0
- }
-
- /// Returns an object that implements [Display] for printing this [RawStr],
- /// given that it is encoded in `encoding`.
- pub fn display(&self, encoding: &'static Encoding) -> DisplayRawString {
- DisplayRawString(encoding.decode_without_bom_handling(&self.0).0)
- }
-
- pub fn decode(&self, encoding: &'static Encoding) -> Cow<'_, str> {
- encoding.decode_without_bom_handling(&self.0).0
+fn skip_bytes<R: Read>(r: &mut R, mut n: usize) -> Result<(), IoError> {
+ thread_local! {
+ static BUF: RefCell<[u8; 256]> = RefCell::new([0u8; 256]);
}
-
- pub fn eq_ignore_trailing_spaces(&self, other: &RawStr) -> bool {
- let mut this = self.0.iter();
- let mut other = other.0.iter();
- loop {
- match (this.next(), other.next()) {
- (Some(a), Some(b)) if a == b => (),
- (Some(_), Some(_)) => return false,
- (None, None) => return true,
- (Some(b' '), None) => return this.all(|c| *c == b' '),
- (None, Some(b' ')) => return other.all(|c| *c == b' '),
- (Some(_), None) | (None, Some(_)) => return false,
- }
+ BUF.with_borrow_mut(|buf| {
+ while n > 0 {
+ let chunk = n.min(buf.len());
+ r.read_exact(&mut buf[..n])?;
+ n -= chunk;
}
- }
+ Ok(())
+ })
}
-pub struct DisplayRawString<'a>(Cow<'a, str>);
-
-impl<'a> Display for DisplayRawString<'a> {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{}", &self.0)
+fn try_read_bytes_into<R: Read>(r: &mut R, buf: &mut [u8]) -> Result<bool, IoError> {
+ let n = r.read(buf)?;
+ if n > 0 {
+ if n < buf.len() {
+ r.read_exact(&mut buf[n..])?;
+ }
+ Ok(true)
+ } else {
+ Ok(false)
}
}
-impl Debug for RawStr {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(self.as_bytes()))
+fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
+ let mut buf = [0; N];
+ match try_read_bytes_into(r, &mut buf)? {
+ true => Ok(Some(buf)),
+ false => Ok(None),
}
}
-#[derive(Copy, Clone)]
-pub struct RawStrArray<const N: usize>(pub [u8; N]);
-
-impl<const N: usize> From<[u8; N]> for RawStrArray<N> {
- fn from(source: [u8; N]) -> Self {
- Self(source)
- }
-}
-
-impl<const N: usize> Debug for RawStrArray<N> {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(&self.0))
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum EncodedString {
- Encoded {
- bytes: Vec<u8>,
- encoding: &'static Encoding,
- },
- Utf8 {
- s: String,
- },
-}
-
-impl EncodedString {
- pub fn borrowed(&self) -> EncodedStr<'_> {
- match self {
- EncodedString::Encoded { bytes, encoding } => EncodedStr::Encoded { bytes, encoding },
- EncodedString::Utf8 { s } => EncodedStr::Utf8 { s },
- }
- }
-}
-
-impl<'a> From<EncodedStr<'a>> for EncodedString {
- fn from(value: EncodedStr<'a>) -> Self {
- match value {
- EncodedStr::Encoded { bytes, encoding } => Self::Encoded {
- bytes: bytes.into(),
- encoding,
- },
- EncodedStr::Utf8 { s } => Self::Utf8 { s: s.into() },
- }
- }
-}
-
-pub enum EncodedStr<'a> {
- Encoded {
- bytes: &'a [u8],
- encoding: &'static Encoding,
- },
- Utf8 {
- s: &'a str,
- },
-}
-
-impl<'a> EncodedStr<'a> {
- pub fn new(bytes: &'a [u8], encoding: &'static Encoding) -> Self {
- Self::Encoded { bytes, encoding }
- }
- pub fn as_str(&self) -> Cow<'_, str> {
- match self {
- EncodedStr::Encoded { bytes, encoding } => {
- encoding.decode_without_bom_handling(bytes).0
- }
- EncodedStr::Utf8 { s } => Cow::from(*s),
- }
- }
- pub fn as_bytes(&self) -> &[u8] {
- match self {
- EncodedStr::Encoded { bytes, .. } => bytes,
- EncodedStr::Utf8 { s } => s.as_bytes(),
- }
- }
- pub fn to_encoding(&self, encoding: &'static Encoding) -> Cow<[u8]> {
- match self {
- EncodedStr::Encoded { bytes, encoding } => {
- let utf8 = encoding.decode_without_bom_handling(bytes).0;
- match encoding.encode(&utf8).0 {
- Cow::Borrowed(_) => {
- // Recoding into UTF-8 and then back did not change anything.
- Cow::from(*bytes)
- }
- Cow::Owned(owned) => Cow::Owned(owned),
- }
- }
- EncodedStr::Utf8 { s } => encoding.encode(s).0,
- }
- }
- pub fn is_empty(&self) -> bool {
- match self {
- EncodedStr::Encoded { bytes, .. } => bytes.is_empty(),
- EncodedStr::Utf8 { s } => s.is_empty(),
- }
- }
- pub fn quoted(&self) -> QuotedEncodedStr {
- QuotedEncodedStr(self)
- }
-}
-
-impl<'a> From<&'a str> for EncodedStr<'a> {
- fn from(s: &'a str) -> Self {
- Self::Utf8 { s }
- }
-}
-
-impl<'a> From<&'a String> for EncodedStr<'a> {
- fn from(s: &'a String) -> Self {
- Self::Utf8 { s: s.as_str() }
- }
-}
-
-pub struct QuotedEncodedStr<'a>(&'a EncodedStr<'a>);
-
-impl Display for QuotedEncodedStr<'_> {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- write!(f, "{:?}", self.0.as_str())
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ValueLabel<V, S>
-where
- V: Debug,
- S: Debug,
-{
- pub datum: Datum<V>,
- pub label: S,
-}
-
-#[derive(Clone)]
-pub struct ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// The labels.
- pub labels: Vec<ValueLabel<V, S>>,
-
- /// The 1-based indexes of the variable indexes.
- pub dict_indexes: Vec<u32>,
-
- /// The types of the variables.
- pub var_type: VarType,
-}
-
-impl<V, S> Debug for ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "labels: ")?;
- for label in self.labels.iter() {
- writeln!(f, "{label:?}")?;
- }
- write!(f, "apply to {} variables", self.var_type)?;
- for dict_index in self.dict_indexes.iter() {
- write!(f, " #{dict_index}")?;
- }
- Ok(())
- }
-}
-
-impl<V, S> ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- /// Maximum number of value labels in a record.
- pub const MAX_LABELS: u32 = u32::MAX / 8;
-
- /// Maximum number of variable indexes in a record.
- pub const MAX_INDEXES: u32 = u32::MAX / 8;
-}
-
-impl ValueLabelRecord<RawStrArray<8>, RawString> {
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- var_types: &VarTypes,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Option<Record>, Error> {
- let label_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_LABELS {
- return Err(Error::BadNumberOfValueLabels {
- offset: label_offset,
- n,
- max: Self::MAX_LABELS,
- });
- }
-
- let mut labels = Vec::new();
- for _ in 0..n {
- let value = UntypedDatum(read_bytes(r)?);
- let label_len: u8 = endian.parse(read_bytes(r)?);
- let label_len = label_len as usize;
- let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
-
- let mut label = read_vec(r, padded_len - 1)?;
- label.truncate(label_len);
- labels.push((value, RawString(label)));
- }
-
- let index_offset = r.stream_position()?;
- let rec_type: u32 = endian.parse(read_bytes(r)?);
- if rec_type != 4 {
- return Err(Error::ExpectedVarIndexRecord {
- offset: index_offset,
- rec_type,
- });
- }
-
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_INDEXES {
- return Err(Error::TooManyVarIndexes {
- offset: index_offset,
- n,
- max: Self::MAX_INDEXES,
- });
- } else if n == 0 {
- dbg!();
- warn(Warning::NoVarIndexes {
- offset: index_offset,
- });
- return Ok(None);
- }
-
- let index_offset = r.stream_position()?;
- let mut dict_indexes = Vec::with_capacity(n as usize);
- let mut invalid_indexes = Vec::new();
- for _ in 0..n {
- let index: u32 = endian.parse(read_bytes(r)?);
- if var_types.is_valid_index(index as usize) {
- dict_indexes.push(index);
- } else {
- invalid_indexes.push(index);
- }
- }
- if !invalid_indexes.is_empty() {
- warn(Warning::InvalidVarIndexes {
- offset: index_offset,
- max: var_types.n_values(),
- invalid: invalid_indexes,
- });
- }
-
- let Some(&first_index) = dict_indexes.first() else {
- return Ok(None);
- };
- let var_type = VarType::from(var_types.types[first_index as usize - 1].unwrap());
- let mut wrong_type_indexes = Vec::new();
- dict_indexes.retain(|&index| {
- if var_types.types[index as usize - 1].map(VarType::from) != Some(var_type) {
- wrong_type_indexes.push(index);
- false
- } else {
- true
- }
- });
- if !wrong_type_indexes.is_empty() {
- warn(Warning::MixedVarTypes {
- offset: index_offset,
- var_type,
- wrong_types: wrong_type_indexes,
- });
- }
-
- let labels = labels
- .into_iter()
- .map(|(value, label)| ValueLabel {
- datum: Datum::from_raw(&value, var_type, endian),
- label,
- })
- .collect();
-
- let end_offset = r.stream_position()?;
- Ok(Some(Record::ValueLabel(ValueLabelRecord {
- offsets: label_offset..end_offset,
- labels,
- dict_indexes,
- var_type,
- })))
- }
-
- fn decode(self, decoder: &mut Decoder) -> ValueLabelRecord<RawStrArray<8>, String> {
- let labels = self
- .labels
- .iter()
- .map(
- |ValueLabel {
- datum: value,
- label,
- }| ValueLabel {
- datum: value.clone(),
- label: decoder.decode(label).to_string(),
- },
- )
- .collect();
- ValueLabelRecord {
- offsets: self.offsets.clone(),
- labels,
- dict_indexes: self.dict_indexes.clone(),
- var_type: self.var_type,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct DocumentRecord<S>
-where
- S: Debug,
-{
- pub offsets: Range<u64>,
-
- /// The document, as an array of lines. Raw lines are exactly 80 bytes long
- /// and are right-padded with spaces without any new-line termination.
- pub lines: Vec<S>,
-}
-
-pub type RawDocumentLine = RawStrArray<DOC_LINE_LEN>;
-
-/// Length of a line in a document. Document lines are fixed-length and
-/// padded on the right with spaces.
-pub const DOC_LINE_LEN: usize = 80;
-
-impl DocumentRecord<RawDocumentLine> {
- /// Maximum number of lines we will accept in a document. This is simply
- /// the maximum number that will fit in a 32-bit space.
- pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
-
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- let n = n as usize;
- if n > Self::MAX_LINES {
- Err(Error::BadDocumentLength {
- offset: start_offset,
- n,
- max: Self::MAX_LINES,
- })
- } else {
- let mut lines = Vec::with_capacity(n);
- for _ in 0..n {
- lines.push(RawStrArray(read_bytes(r)?));
- }
- let end_offset = r.stream_position()?;
- Ok(Record::Document(DocumentRecord {
- offsets: start_offset..end_offset,
- lines,
- }))
- }
- }
-
- pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
- DecodedRecord::Document(DocumentRecord {
- offsets: self.offsets.clone(),
- lines: self
- .lines
- .iter()
- .map(|s| decoder.decode_slice(&s.0).to_string())
- .collect(),
- })
- }
-}
-
-struct ExtensionRecord<'a> {
- size: Option<u32>,
- count: Option<u32>,
- name: &'a str,
-}
-
-#[derive(Clone, Debug)]
-pub struct IntegerInfoRecord {
- pub offsets: Range<u64>,
- pub version: (i32, i32, i32),
- pub machine_code: i32,
- pub floating_point_rep: i32,
- pub compression_code: i32,
- pub endianness: i32,
- pub character_code: i32,
-}
-
-static INTEGER_INFO_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(4),
- count: Some(8),
- name: "integer record",
-};
-
-impl IntegerInfoRecord {
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&INTEGER_INFO_RECORD)?;
-
- let mut input = &ext.data[..];
- let data: Vec<i32> = (0..8)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::IntegerInfo(IntegerInfoRecord {
- offsets: ext.offsets.clone(),
- version: (data[0], data[1], data[2]),
- machine_code: data[3],
- floating_point_rep: data[4],
- compression_code: data[5],
- endianness: data[6],
- character_code: data[7],
- }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct FloatInfoRecord {
- pub sysmis: f64,
- pub highest: f64,
- pub lowest: f64,
-}
-
-static FLOAT_INFO_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(8),
- count: Some(3),
- name: "floating point record",
-};
-
-impl FloatInfoRecord {
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&FLOAT_INFO_RECORD)?;
-
- let mut input = &ext.data[..];
- let data: Vec<f64> = (0..3)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::FloatInfo(FloatInfoRecord {
- sysmis: data[0],
- highest: data[1],
- lowest: data[2],
- }))
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum CategoryLabels {
- VarLabels,
- CountedValues,
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: RawString,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-impl MultipleResponseType {
- fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
- let (mr_type, input) = match input.split_first() {
- Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
- Some((b'D', input)) => {
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy {
- value,
- labels: CategoryLabels::VarLabels,
- },
- input,
- )
- }
- Some((b'E', input)) => {
- let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
- (CategoryLabels::CountedValues, rest)
- } else if let Some(rest) = input.strip_prefix(b" 11 ") {
- (CategoryLabels::VarLabels, rest)
- } else {
- return Err(Warning::InvalidMultipleDichotomyLabelType);
- };
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy { value, labels },
- input,
- )
- }
- _ => return Err(Warning::InvalidMultipleResponseType),
- };
- Ok((mr_type, input))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet<I, S>
-where
- I: Debug,
- S: Debug,
-{
- pub name: I,
- pub label: S,
- pub mr_type: MultipleResponseType,
- pub short_names: Vec<I>,
-}
-
-impl MultipleResponseSet<RawString, RawString> {
- fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
- let Some(equals) = input.iter().position(|&b| b == b'=') else {
- return Err(Warning::MultipleResponseSyntaxError("missing `=`"));
- };
- let (name, input) = input.split_at(equals);
- let input = input.strip_prefix(b"=").unwrap();
- let (mr_type, input) = MultipleResponseType::parse(input)?;
- let Some(input) = input.strip_prefix(b" ") else {
- return Err(Warning::MultipleResponseSyntaxError(
- "missing space after multiple response type",
- ));
- };
- let (label, mut input) = parse_counted_string(input)?;
- let mut vars = Vec::new();
- while input.first() != Some(&b'\n') {
- match input.split_first() {
- Some((b' ', rest)) => {
- let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
- return Err(Warning::MultipleResponseSyntaxError(
- "missing variable name delimiter",
- ));
- };
- let (var, rest) = rest.split_at(length);
- if !var.is_empty() {
- vars.push(var.into());
- }
- input = rest;
- }
- _ => {
- return Err(Warning::MultipleResponseSyntaxError(
- "missing space preceding variable name",
- ))
- }
- }
- }
- while input.first() == Some(&b'\n') {
- input = &input[1..];
- }
- Ok((
- MultipleResponseSet {
- name: name.into(),
- label,
- mr_type,
- short_names: vars,
- },
- input,
- ))
- }
-
- fn decode(
- &self,
- decoder: &mut Decoder,
- ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
- let mut short_names = Vec::with_capacity(self.short_names.len());
- for short_name in self.short_names.iter() {
- if let Some(short_name) = decoder
- .decode_identifier(short_name)
- .map_err(Warning::InvalidMrSetName)
- .issue_warning(&mut decoder.warn)
- {
- short_names.push(short_name);
- }
- }
- Ok(MultipleResponseSet {
- name: decoder
- .decode_identifier(&self.name)
- .map_err(Warning::InvalidMrSetVariableName)?,
- label: decoder.decode(&self.label).to_string(),
- mr_type: self.mr_type.clone(),
- short_names,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
-where
- I: Debug,
- S: Debug;
-
-static MULTIPLE_RESPONSE_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(1),
- count: None,
- name: "multiple response set record",
-};
-
-impl MultipleResponseRecord<RawString, RawString> {
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&MULTIPLE_RESPONSE_RECORD)?;
-
- let mut input = &ext.data[..];
- let mut sets = Vec::new();
- loop {
- while let Some(suffix) = input.strip_prefix(b"\n") {
- input = suffix;
- }
- if input.is_empty() {
- break;
- }
- let (set, rest) = MultipleResponseSet::parse(input)?;
- sets.push(set);
- input = rest;
- }
- Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
- }
-}
-
-impl MultipleResponseRecord<RawString, RawString> {
- fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
- let mut sets = Vec::new();
- for set in self.0.iter() {
- if let Some(set) = set.decode(decoder).issue_warning(&mut decoder.warn) {
- sets.push(set);
- }
- }
- DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
- }
-}
-
-fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
- let Some(space) = input.iter().position(|&b| b == b' ') else {
- return Err(Warning::CountedStringMissingSpace);
- };
- let Ok(length) = from_utf8(&input[..space]) else {
- return Err(Warning::CountedStringInvalidUTF8);
- };
- let Ok(length): Result<usize, _> = length.parse() else {
- return Err(Warning::CountedStringInvalidLength(length.into()));
- };
-
- let Some((string, rest)) = input[space + 1..].split_at_checked(length) else {
- return Err(Warning::CountedStringTooLong(length));
- };
- Ok((string.into(), rest))
-}
-
-/// [Level of measurement](https://en.wikipedia.org/wiki/Level_of_measurement).
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Measure {
- /// Nominal values can only be compared for equality.
- Nominal,
-
- /// Ordinal values can be meaningfully ordered.
- Ordinal,
-
- /// Scale values can be meaningfully compared for the degree of difference.
- Scale,
-}
-
-impl Measure {
- pub fn default_for_type(var_type: VarType) -> Option<Measure> {
- match var_type {
- VarType::Numeric => None,
- VarType::String => Some(Self::Nominal),
- }
- }
-
- fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
- match source {
- 0 => Ok(None),
- 1 => Ok(Some(Measure::Nominal)),
- 2 => Ok(Some(Measure::Ordinal)),
- 3 => Ok(Some(Measure::Scale)),
- _ => Err(Warning::InvalidMeasurement(source)),
- }
- }
-
- pub fn as_str(&self) -> &'static str {
- match self {
- Measure::Nominal => "Nominal",
- Measure::Ordinal => "Ordinal",
- Measure::Scale => "Scale",
- }
- }
-}
-
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Alignment {
- Left,
- Right,
- Center,
-}
-
-impl Alignment {
- fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
- match source {
- 0 => Ok(Some(Alignment::Left)),
- 1 => Ok(Some(Alignment::Right)),
- 2 => Ok(Some(Alignment::Center)),
- _ => Err(Warning::InvalidAlignment(source)),
- }
- }
-
- pub fn default_for_type(var_type: VarType) -> Self {
- match var_type {
- VarType::Numeric => Self::Right,
- VarType::String => Self::Left,
- }
- }
-
- pub fn as_str(&self) -> &'static str {
- match self {
- Alignment::Left => "Left",
- Alignment::Right => "Right",
- Alignment::Center => "Center",
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplay {
- pub measure: Option<Measure>,
- pub width: Option<u32>,
- pub alignment: Option<Alignment>,
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplayRecord(pub Vec<VarDisplay>);
-
-impl VarDisplayRecord {
- fn parse(
- ext: &Extension,
- var_types: &VarTypes,
- endian: Endian,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Record, Warning> {
- if ext.size != 4 {
- return Err(Warning::BadRecordSize {
- offset: ext.offsets.start,
- record: String::from("variable display record"),
- size: ext.size,
- expected_size: 4,
- });
- }
-
- let n_vars = var_types.n_vars();
- let has_width = if ext.count as usize == 3 * n_vars {
- true
- } else if ext.count as usize == 2 * n_vars {
- false
- } else {
- return Err(Warning::InvalidVariableDisplayCount {
- count: ext.count as usize,
- first: 2 * n_vars,
- second: 3 * n_vars,
- });
- };
-
- let mut var_displays = Vec::new();
- let mut input = &ext.data[..];
- for _ in 0..n_vars {
- let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(warn)
- .flatten();
- let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
- let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(warn)
- .flatten();
- var_displays.push(VarDisplay {
- measure,
- width,
- alignment,
- });
- }
- Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValues<N>
-where
- N: Debug,
-{
- /// Variable name.
- pub var_name: N,
-
- /// Missing values.
- pub missing_values: Vec<RawStrArray<8>>,
-}
-
-impl LongStringMissingValues<RawString> {
- fn decode(
- &self,
- decoder: &mut Decoder,
- ) -> Result<LongStringMissingValues<Identifier>, IdError> {
- Ok(LongStringMissingValues {
- var_name: decoder.decode_identifier(&self.var_name)?,
- missing_values: self.missing_values.clone(),
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValueRecord<N>(pub Vec<LongStringMissingValues<N>>)
-where
- N: Debug;
-
-static LONG_STRING_MISSING_VALUE_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(1),
- count: None,
- name: "long string missing values record",
-};
-
-impl LongStringMissingValueRecord<RawString> {
- fn parse(
- ext: &Extension,
- endian: Endian,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Record, Warning> {
- ext.check_size(&LONG_STRING_MISSING_VALUE_RECORD)?;
-
- let mut input = &ext.data[..];
- let mut missing_value_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- dbg!(&var_name);
- let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
- let value_len: u32 = endian.parse(read_bytes(&mut input)?);
- if value_len != 8 {
- let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
- warn(Warning::BadLongMissingValueLength {
- record_offset: ext.offsets.start,
- offset,
- value_len,
- });
- read_vec(
- &mut input,
- dbg!(value_len as usize * n_missing_values as usize),
- )?;
- continue;
- }
- let mut missing_values = Vec::new();
- for i in 0..n_missing_values {
- if i > 0 {
- // Tolerate files written by old, buggy versions of PSPP
- // where we believed that the value_length was repeated
- // before each missing value.
- let mut peek = input;
- let number: u32 = endian.parse(read_bytes(&mut peek)?);
- if number == 8 {
- input = peek;
- }
- }
-
- let value: [u8; 8] = read_bytes(&mut input)?;
- missing_values.push(RawStrArray(value));
- }
- missing_value_set.push(LongStringMissingValues {
- var_name,
- missing_values,
- });
- }
- Ok(Record::LongStringMissingValues(
- LongStringMissingValueRecord(missing_value_set),
- ))
- }
-}
-
-impl LongStringMissingValueRecord<RawString> {
- pub fn decode(self, decoder: &mut Decoder) -> LongStringMissingValueRecord<Identifier> {
- let mut mvs = Vec::with_capacity(self.0.len());
- for mv in self.0.iter() {
- if let Some(mv) = mv
- .decode(decoder)
- .map_err(Warning::InvalidLongStringMissingValueVariableName)
- .issue_warning(&mut decoder.warn)
- {
- mvs.push(mv);
- }
- }
- LongStringMissingValueRecord(mvs)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct EncodingRecord(pub String);
-
-static ENCODING_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(1),
- count: None,
- name: "encoding record",
-};
-
-impl EncodingRecord {
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&ENCODING_RECORD)?;
-
- Ok(Record::Encoding(EncodingRecord(
- String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
- offset: ext.offsets.start,
- })?,
- )))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct NumberOfCasesRecord {
- /// Always observed as 1.
- pub one: u64,
-
- /// Number of cases.
- pub n_cases: u64,
-}
-
-static NUMBER_OF_CASES_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(8),
- count: Some(2),
- name: "extended number of cases record",
-};
-
-impl NumberOfCasesRecord {
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&NUMBER_OF_CASES_RECORD)?;
-
- let mut input = &ext.data[..];
- let one = endian.parse(read_bytes(&mut input)?);
- let n_cases = endian.parse(read_bytes(&mut input)?);
-
- Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawVariableSetRecord(TextRecord);
-
-impl RawVariableSetRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::VariableSets(Self(TextRecord::parse(
- extension,
- "variable sets record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> VariableSetRecord {
- let mut sets = Vec::new();
- let input = decoder.decode(&self.0.text);
- for line in input.lines() {
- if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&mut decoder.warn) {
- sets.push(set)
- }
- }
- VariableSetRecord {
- offsets: self.0.offsets,
- sets,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawProductInfoRecord(TextRecord);
-
-impl RawProductInfoRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::ProductInfo(Self(TextRecord::parse(
- extension,
- "product info record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> ProductInfoRecord {
- ProductInfoRecord(decoder.decode(&self.0.text).into())
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawLongNamesRecord(TextRecord);
-
-impl RawLongNamesRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::LongNames(Self(TextRecord::parse(
- extension,
- "long names record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> LongNamesRecord {
- let input = decoder.decode(&self.0.text);
- let mut names = Vec::new();
- for pair in input.split('\t').filter(|s| !s.is_empty()) {
- if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&mut decoder.warn)
- {
- names.push(long_name);
- }
- }
- LongNamesRecord(names)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct TextRecord {
- pub offsets: Range<u64>,
-
- /// The text content of the record.
- pub text: RawString,
-}
-
-impl TextRecord {
- fn parse(extension: Extension, name: &str) -> Result<TextRecord, Warning> {
- extension.check_size(&ExtensionRecord {
- size: Some(1),
- count: None,
- name,
- })?;
- Ok(Self {
- offsets: extension.offsets,
- text: extension.data.into(),
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongString {
- pub short_name: Identifier,
- pub length: u16,
-}
-
-impl VeryLongString {
- fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
- let Some((short_name, length)) = input.split_once('=') else {
- return Err(Warning::VeryLongStringMissingDelimiter(input.into()));
- };
- let short_name = decoder
- .new_identifier(short_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidLongStringName)?;
- let length = length
- .parse()
- .map_err(|_| Warning::VeryLongStringInvalidLength(input.into()))?;
- Ok(VeryLongString { short_name, length })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawVeryLongStringsRecord(TextRecord);
-
-#[derive(Clone, Debug)]
-pub struct VeryLongStringsRecord(pub Vec<VeryLongString>);
-
-impl RawVeryLongStringsRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::VeryLongStrings(Self(TextRecord::parse(
- extension,
- "very long strings record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> VeryLongStringsRecord {
- let input = decoder.decode(&self.0.text);
- let mut very_long_strings = Vec::new();
- for tuple in input
- .split('\0')
- .map(|s| s.trim_start_matches('\t'))
- .filter(|s| !s.is_empty())
- {
- if let Some(vls) =
- VeryLongString::parse(decoder, tuple).issue_warning(&mut decoder.warn)
- {
- very_long_strings.push(vls)
- }
- }
- VeryLongStringsRecord(very_long_strings)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl Attribute {
- fn parse<'a>(decoder: &mut Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
- let Some((name, mut input)) = input.split_once('(') else {
- return Err(Warning::AttributeMissingLParen(input.into()));
- };
- let name = decoder
- .new_identifier(name)
- .map_err(Warning::InvalidAttributeName)?;
- let mut values = Vec::new();
- loop {
- let Some((value, rest)) = input.split_once('\n') else {
- return Err(Warning::AttributeMissingValue {
- name: name.clone(),
- index: values.len(),
- });
- };
- if let Some(stripped) = value
- .strip_prefix('\'')
- .and_then(|value| value.strip_suffix('\''))
- {
- values.push(stripped.into());
- } else {
- decoder.warn(Warning::AttributeMissingQuotes {
- name: name.clone(),
- index: values.len(),
- });
- values.push(value.into());
- }
- if let Some(rest) = rest.strip_prefix(')') {
- let attribute = Attribute { name, values };
- return Ok((attribute, rest));
- };
- input = rest;
- }
- }
-}
-
-impl Attributes {
- fn parse<'a>(
- decoder: &mut Decoder,
- mut input: &'a str,
- sentinel: Option<char>,
- ) -> Result<(Attributes, &'a str, Vec<Identifier>), Warning> {
- let mut attributes = BTreeMap::new();
- let mut duplicates = Vec::new();
- let rest = loop {
- match input.chars().next() {
- None => break input,
- c if c == sentinel => break &input[1..],
- _ => {
- let (attribute, rest) = Attribute::parse(decoder, input)?;
- if attributes.contains_key(&attribute.name) {
- duplicates.push(attribute.name.clone());
- }
- attributes.insert(attribute.name, attribute.values);
- input = rest;
- }
- }
- };
- Ok((Attributes(attributes), rest, duplicates))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawFileAttributesRecord(TextRecord);
-
-#[derive(Clone, Debug, Default)]
-pub struct FileAttributesRecord(pub Attributes);
-
-impl RawFileAttributesRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::FileAttributes(Self(TextRecord::parse(
- extension,
- "file attributes record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> FileAttributesRecord {
- let input = decoder.decode(&self.0.text);
- match Attributes::parse(decoder, &input, None).issue_warning(&mut decoder.warn) {
- Some((set, rest, duplicates)) => {
- if !duplicates.is_empty() {
- decoder.warn(Warning::DuplicateFileAttributes {
- attributes: duplicates,
- });
- }
- if !rest.is_empty() {
- decoder.warn(dbg!(Warning::TBD));
- }
- FileAttributesRecord(set)
- }
- None => FileAttributesRecord::default(),
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarAttributes {
- pub long_var_name: Identifier,
- pub attributes: Attributes,
-}
-
-impl VarAttributes {
- fn parse<'a>(
- decoder: &mut Decoder,
- input: &'a str,
- ) -> Result<(VarAttributes, &'a str), Warning> {
- let Some((long_var_name, rest)) = input.split_once(':') else {
- return Err(dbg!(Warning::TBD));
- };
- let long_var_name = decoder
- .new_identifier(long_var_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidAttributeVariableName)?;
- let (attributes, rest, duplicates) = Attributes::parse(decoder, rest, Some('/'))?;
- if !duplicates.is_empty() {
- decoder.warn(Warning::DuplicateVariableAttributes {
- variable: long_var_name.clone(),
- attributes: duplicates,
- });
- }
- let var_attribute = VarAttributes {
- long_var_name,
- attributes,
- };
- Ok((var_attribute, rest))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct RawVariableAttributesRecord(TextRecord);
-
-#[derive(Clone, Debug)]
-pub struct VariableAttributesRecord(pub Vec<VarAttributes>);
-
-impl RawVariableAttributesRecord {
- fn parse(extension: Extension) -> Result<Record, Warning> {
- Ok(Record::VariableAttributes(Self(TextRecord::parse(
- extension,
- "variable attributes record",
- )?)))
- }
- fn decode(self, decoder: &mut Decoder) -> VariableAttributesRecord {
- let decoded = decoder.decode(&self.0.text);
- let mut input = decoded.as_ref();
- let mut var_attribute_sets = Vec::new();
- while !input.is_empty() {
- let Some((var_attribute, rest)) =
- VarAttributes::parse(decoder, input).issue_warning(&mut decoder.warn)
- else {
- break;
- };
- var_attribute_sets.push(var_attribute);
- input = rest;
- }
- VariableAttributesRecord(var_attribute_sets)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongName {
- pub short_name: Identifier,
- pub long_name: Identifier,
-}
-
-impl LongName {
- fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
- let Some((short_name, long_name)) = input.split_once('=') else {
- return Err(dbg!(Warning::LongNameMissingEquals));
- };
- let short_name = decoder
- .new_identifier(short_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidShortName)?;
- let long_name = decoder
- .new_identifier(long_name)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidLongName)?;
- Ok(LongName {
- short_name,
- long_name,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongNamesRecord(pub Vec<LongName>);
-
-#[derive(Clone, Debug)]
-pub struct ProductInfoRecord(pub String);
-
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: String,
- pub variable_names: Vec<Identifier>,
-}
-
-impl VariableSet {
- fn parse(input: &str, decoder: &mut Decoder) -> Result<Self, Warning> {
- let (name, input) = input
- .split_once('=')
- .ok_or(Warning::VariableSetMissingEquals)?;
- let mut vars = Vec::new();
- for var in input.split_ascii_whitespace() {
- if let Some(identifier) = decoder
- .new_identifier(var)
- .and_then(Identifier::must_be_ordinary)
- .map_err(Warning::InvalidVariableSetName)
- .issue_warning(&mut decoder.warn)
- {
- vars.push(identifier);
- }
- }
- Ok(VariableSet {
- name: name.to_string(),
- variable_names: vars,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSetRecord {
- pub offsets: Range<u64>,
- pub sets: Vec<VariableSet>,
-}
-
-trait IssueWarning<T> {
- fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option<T>;
-}
-impl<T> IssueWarning<T> for Result<T, Warning> {
- fn issue_warning(self, warn: &mut dyn FnMut(Warning)) -> Option<T> {
- match self {
- Ok(result) => Some(result),
- Err(error) => {
- warn(error);
- None
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Extension {
- pub offsets: Range<u64>,
-
- /// Record subtype.
- pub subtype: u32,
-
- /// Size of each data element.
- pub size: u32,
-
- /// Number of data elements.
- pub count: u32,
-
- /// `size * count` bytes of data.
- pub data: Vec<u8>,
-}
-
-impl Extension {
- fn check_size(&self, expected: &ExtensionRecord) -> Result<(), Warning> {
- match expected.size {
- Some(expected_size) if self.size != expected_size => {
- return Err(Warning::BadRecordSize {
- offset: self.offsets.start,
- record: expected.name.into(),
- size: self.size,
- expected_size,
- });
- }
- _ => (),
- }
- match expected.count {
- Some(expected_count) if self.count != expected_count => {
- return Err(Warning::BadRecordCount {
- offset: self.offsets.start,
- record: expected.name.into(),
- count: self.count,
- expected_count,
- });
- }
- _ => (),
- }
- Ok(())
- }
-
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- var_types: &VarTypes,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Option<Record>, Error> {
- let subtype = endian.parse(read_bytes(r)?);
- let header_offset = r.stream_position()?;
- let size: u32 = endian.parse(read_bytes(r)?);
- let count = endian.parse(read_bytes(r)?);
- let Some(product) = size.checked_mul(count) else {
- return Err(Error::ExtensionRecordTooLarge {
- offset: header_offset,
- subtype,
- size,
- count,
- });
- };
- let start_offset = r.stream_position()?;
- let data = read_vec(r, product as usize)?;
- let end_offset = start_offset + product as u64;
- let extension = Extension {
- offsets: start_offset..end_offset,
- subtype,
- size,
- count,
- data,
- };
- let result = match subtype {
- 3 => IntegerInfoRecord::parse(&extension, endian),
- 4 => FloatInfoRecord::parse(&extension, endian),
- 11 => VarDisplayRecord::parse(&extension, var_types, endian, warn),
- 7 | 19 => MultipleResponseRecord::parse(&extension, endian),
- 21 => LongStringValueLabelRecord::parse(&extension, endian),
- 22 => LongStringMissingValueRecord::parse(&extension, endian, warn),
- 20 => EncodingRecord::parse(&extension, endian),
- 16 => NumberOfCasesRecord::parse(&extension, endian),
- 5 => RawVariableSetRecord::parse(extension),
- 10 => RawProductInfoRecord::parse(extension),
- 13 => RawLongNamesRecord::parse(extension),
- 14 => RawVeryLongStringsRecord::parse(extension),
- 17 => RawFileAttributesRecord::parse(extension),
- 18 => RawVariableAttributesRecord::parse(extension),
- _ => Ok(Record::OtherExtension(extension)),
- };
- match result {
- Ok(result) => Ok(Some(result)),
- Err(error) => {
- warn(error);
- Ok(None)
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZHeader {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// File offset to the ZLIB data header.
- pub zheader_offset: u64,
-
- /// File offset to the ZLIB trailer.
- pub ztrailer_offset: u64,
-
- /// Length of the ZLIB trailer in bytes.
- pub ztrailer_len: u64,
-}
-
-impl ZHeader {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
- let offset = r.stream_position()?;
- let zheader_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
-
- if zheader_offset != offset {
- return Err(Error::UnexpectedZHeaderOffset {
- actual: zheader_offset,
- expected: offset,
- });
- }
-
- if ztrailer_offset < offset {
- return Err(Error::ImpossibleZTrailerOffset(ztrailer_offset));
- }
-
- if ztrailer_len < 24 || ztrailer_len % 24 != 0 {
- return Err(Error::InvalidZTrailerLength(ztrailer_len));
- }
-
- Ok(ZHeader {
- offset,
- zheader_offset,
- ztrailer_offset,
- ztrailer_len,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZTrailer {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// Compression bias as a negative integer, e.g. -100.
- pub int_bias: i64,
-
- /// Always observed as zero.
- pub zero: u64,
-
- /// Uncompressed size of each block, except possibly the last. Only
- /// `0x3ff000` has been observed so far.
- pub block_size: u32,
-
- /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
- pub blocks: Vec<ZBlock>,
-}
-
-#[derive(Clone, Debug)]
-pub struct ZBlock {
- /// Offset of block of data if simple compression were used.
- pub uncompressed_ofs: u64,
-
- /// Actual offset within the file of the compressed data block.
- pub compressed_ofs: u64,
-
- /// The number of bytes in this data block after decompression. This is
- /// `block_size` in every data block but the last, which may be smaller.
- pub uncompressed_size: u32,
-
- /// The number of bytes in this data block, as stored compressed in this
- /// file.
- pub compressed_size: u32,
-}
-
-impl ZBlock {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
- Ok(ZBlock {
- uncompressed_ofs: endian.parse(read_bytes(r)?),
- compressed_ofs: endian.parse(read_bytes(r)?),
- uncompressed_size: endian.parse(read_bytes(r)?),
- compressed_size: endian.parse(read_bytes(r)?),
- })
- }
-}
-
-impl ZTrailer {
- fn read<R: Read + Seek>(
- reader: &mut R,
- endian: Endian,
- bias: f64,
- zheader: &ZHeader,
- warn: &mut dyn FnMut(Warning),
- ) -> Result<Option<ZTrailer>, Error> {
- let start_offset = reader.stream_position()?;
- if reader
- .seek(SeekFrom::Start(zheader.ztrailer_offset))
- .is_err()
- {
- return Ok(None);
- }
- let int_bias = endian.parse(read_bytes(reader)?);
- if int_bias as f64 != -bias {
- return Err(Error::WrongZlibTrailerBias {
- actual: int_bias,
- expected: -bias,
- });
- }
- let zero = endian.parse(read_bytes(reader)?);
- if zero != 0 {
- return Err(Error::WrongZlibTrailerZero(zero));
- }
- let block_size = endian.parse(read_bytes(reader)?);
- if block_size != 0x3ff000 {
- return Err(Error::WrongZlibTrailerBlockSize(block_size));
- }
- let n_blocks: u32 = endian.parse(read_bytes(reader)?);
- let expected_n_blocks = (zheader.ztrailer_len - 24) / 24;
- if n_blocks as u64 != expected_n_blocks {
- return Err(Error::BadZlibTrailerNBlocks {
- offset: zheader.ztrailer_offset,
- n_blocks,
- expected_n_blocks,
- ztrailer_len: zheader.ztrailer_len,
- });
- }
- let blocks = (0..n_blocks)
- .map(|_| ZBlock::read(reader, endian))
- .collect::<Result<Vec<_>, _>>()?;
-
- let mut expected_uncmp_ofs = zheader.zheader_offset;
- let mut expected_cmp_ofs = zheader.zheader_offset + 24;
- for (index, block) in blocks.iter().enumerate() {
- if block.uncompressed_ofs != expected_uncmp_ofs {
- return Err(Error::ZlibTrailerBlockWrongUncmpOfs {
- index,
- actual: block.uncompressed_ofs,
- expected: expected_cmp_ofs,
- });
- }
- if block.compressed_ofs != expected_cmp_ofs {
- return Err(Error::ZlibTrailerBlockWrongCmpOfs {
- index,
- actual: block.compressed_ofs,
- expected: expected_cmp_ofs,
- });
- }
- if index < blocks.len() - 1 {
- if block.uncompressed_size != block_size {
- warn(Warning::ZlibTrailerBlockWrongSize {
- index,
- actual: block.uncompressed_size,
- expected: block_size,
- });
- }
- } else {
- if block.uncompressed_size > block_size {
- warn(Warning::ZlibTrailerBlockTooBig {
- index,
- actual: block.uncompressed_size,
- max_expected: block_size,
- });
- }
- }
- // http://www.zlib.net/zlib_tech.html says that the maximum
- // expansion from compression, with worst-case parameters, is 13.5%
- // plus 11 bytes. This code checks for an expansion of more than
- // 14.3% plus 11 bytes.
- if block.compressed_size > block.uncompressed_size + block.uncompressed_size / 7 + 11 {
- return Err(Error::ZlibExpansion {
- index,
- compressed_size: block.compressed_size,
- uncompressed_size: block.uncompressed_size,
- });
- }
-
- expected_cmp_ofs += block.compressed_size as u64;
- expected_uncmp_ofs += block.uncompressed_size as u64;
- }
-
- if expected_cmp_ofs != zheader.ztrailer_offset {
- return Err(Error::ZlibTrailerOffsetInconsistency {
- descriptors: expected_cmp_ofs,
- zheader: zheader.ztrailer_offset,
- });
- }
-
- reader.seek(SeekFrom::Start(start_offset))?;
- Ok(Some(ZTrailer {
- offset: zheader.ztrailer_offset,
- int_bias,
- zero,
- block_size,
- blocks,
- }))
- }
-}
-
-fn skip_bytes<R: Read>(r: &mut R, mut n: usize) -> Result<(), IoError> {
- thread_local! {
- static BUF: RefCell<[u8; 256]> = RefCell::new([0u8; 256]);
- }
- BUF.with_borrow_mut(|buf| {
- while n > 0 {
- let chunk = n.min(buf.len());
- r.read_exact(&mut buf[..n])?;
- n -= chunk;
- }
- Ok(())
- })
-}
-
-fn try_read_bytes_into<R: Read>(r: &mut R, buf: &mut [u8]) -> Result<bool, IoError> {
- let n = r.read(buf)?;
- if n > 0 {
- if n < buf.len() {
- r.read_exact(&mut buf[n..])?;
- }
- Ok(true)
- } else {
- Ok(false)
- }
-}
-
-fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
- let mut buf = [0; N];
- match try_read_bytes_into(r, &mut buf)? {
- true => Ok(Some(buf)),
- false => Ok(None),
- }
-}
-
-fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
- let mut buf = [0; N];
- r.read_exact(&mut buf)?;
- Ok(buf)
-}
+fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
+ let mut buf = [0; N];
+ r.read_exact(&mut buf)?;
+ Ok(buf)
+}
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
let mut vec = vec![0; n];
Ok(read_vec(r, length as usize)?.into())
}
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabels<N, S>
-where
- S: Debug,
-{
- pub var_name: N,
- pub width: u32,
-
- /// `(value, label)` pairs, where each value is `width` bytes.
- pub labels: Vec<(RawString, S)>,
-}
-
-impl LongStringValueLabels<RawString, RawString> {
- fn decode(
- &self,
- decoder: &mut Decoder,
- ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
- let var_name = decoder.decode(&self.var_name);
- let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
- .map_err(Warning::InvalidLongStringValueLabelName)?;
-
- let mut labels = Vec::with_capacity(self.labels.len());
- for (value, label) in self.labels.iter() {
- let label = decoder.decode(label).to_string();
- labels.push((value.clone(), label));
- }
-
- Ok(LongStringValueLabels {
- var_name,
- width: self.width,
- labels,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
-where
- N: Debug,
- S: Debug;
-
-static LONG_STRING_VALUE_LABEL_RECORD: ExtensionRecord = ExtensionRecord {
- size: Some(1),
- count: None,
- name: "long string value labels record",
-};
-
-impl LongStringValueLabelRecord<RawString, RawString> {
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size(&LONG_STRING_VALUE_LABEL_RECORD)?;
-
- let mut input = &ext.data[..];
- let mut label_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- let width: u32 = endian.parse(read_bytes(&mut input)?);
- let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
- let mut labels = Vec::new();
- for _ in 0..n_labels {
- let value = read_string(&mut input, endian)?;
- let label = read_string(&mut input, endian)?;
- labels.push((value, label));
- }
- label_set.push(LongStringValueLabels {
- var_name,
- width,
- labels,
- })
- }
- Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
- label_set,
- )))
- }
-}
-
-impl LongStringValueLabelRecord<RawString, RawString> {
- fn decode(self, decoder: &mut Decoder) -> LongStringValueLabelRecord<Identifier, String> {
- let mut labels = Vec::with_capacity(self.0.len());
- for label in &self.0 {
- match label.decode(decoder) {
- Ok(set) => labels.push(set),
- Err(error) => decoder.warn(error),
- }
- }
- LongStringValueLabelRecord(labels)
- }
-}
-
#[derive(Default)]
-pub struct VarTypes {
- pub types: Vec<Option<VarWidth>>,
+struct VarTypes {
+ types: Vec<Option<VarWidth>>,
}
impl VarTypes {
--- /dev/null
+//! Raw records.
+//!
+//! Separated into a submodule just to reduce clutter.
+
+use std::{
+ borrow::Cow,
+ collections::BTreeMap,
+ fmt::{Debug, Formatter},
+ io::{Cursor, ErrorKind, Read, Seek, SeekFrom},
+ ops::Range,
+ str::from_utf8,
+};
+
+use crate::{
+ data::{Datum, RawString},
+ dictionary::{
+ Alignment, Attributes, CategoryLabels, Measure, MissingValueRange, MissingValues, VarType,
+ VarWidth,
+ },
+ endian::{Endian, Parse},
+ identifier::{Error as IdError, Identifier},
+ sys::raw::{
+ read_bytes, read_string, read_vec, Decoder, Error, ErrorDetails, Magic, RawDatum,
+ RawStrArray, RawWidth, Record, UntypedDatum, VarTypes, Warning, WarningDetails,
+ },
+};
+
+use binrw::BinRead;
+use itertools::Itertools;
+use thiserror::Error as ThisError;
+
+/// Type of compression in a system file.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Compression {
+ /// Simple bytecode-based compression.
+ Simple,
+ /// [ZLIB] compression.
+ ///
+ /// [ZLIB]: https://www.zlib.net/
+ ZLib,
+}
+
+/// A warning for a file header.
+#[derive(ThisError, Debug)]
+pub enum HeaderWarning {
+ /// Unexpected compression bias.
+ #[error("Compression bias is {0} instead of the usual values of 0 or 100.")]
+ UnexpectedBias(f64),
+}
+
+/// A file header record in a system file.
+#[derive(Clone)]
+pub struct FileHeader<S>
+where
+ S: Debug,
+{
+ /// Magic number.
+ pub magic: Magic,
+
+ /// Eye-catcher string, product name, in the file's encoding. Padded
+ /// on the right with spaces.
+ pub eye_catcher: S,
+
+ /// Layout code, normally either 2 or 3.
+ pub layout_code: u32,
+
+ /// Number of variable positions, or `None` if the value in the file is
+ /// questionably trustworthy.
+ pub nominal_case_size: Option<u32>,
+
+ /// Compression type, if any,
+ pub compression: Option<Compression>,
+
+ /// 1-based variable index of the weight variable, or `None` if the file is
+ /// unweighted.
+ pub weight_index: Option<u32>,
+
+ /// Claimed number of cases, if known.
+ pub n_cases: Option<u32>,
+
+ /// Compression bias, usually 100.0.
+ pub bias: f64,
+
+ /// `dd mmm yy` in the file's encoding.
+ pub creation_date: S,
+
+ /// `HH:MM:SS` in the file's encoding.
+ pub creation_time: S,
+
+ /// File label, in the file's encoding. Padded on the right with spaces.
+ pub file_label: S,
+
+ /// Endianness of the data in the file header.
+ pub endian: Endian,
+}
+
+impl<S> FileHeader<S>
+where
+ S: Debug,
+{
+ fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> std::fmt::Result
+ where
+ T: Debug,
+ {
+ writeln!(f, "{name:>17}: {:?}", value)
+ }
+}
+
+impl<S> Debug for FileHeader<S>
+where
+ S: Debug,
+{
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ writeln!(f, "File header record:")?;
+ self.debug_field(f, "Magic", self.magic)?;
+ self.debug_field(f, "Product name", &self.eye_catcher)?;
+ self.debug_field(f, "Layout code", self.layout_code)?;
+ self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
+ self.debug_field(f, "Compression", self.compression)?;
+ self.debug_field(f, "Weight index", self.weight_index)?;
+ self.debug_field(f, "Number of cases", self.n_cases)?;
+ self.debug_field(f, "Compression bias", self.bias)?;
+ self.debug_field(f, "Creation date", &self.creation_date)?;
+ self.debug_field(f, "Creation time", &self.creation_time)?;
+ self.debug_field(f, "File label", &self.file_label)?;
+ self.debug_field(f, "Endianness", self.endian)
+ }
+}
+
+impl FileHeader<RawString> {
+ /// Reads a header record from `r`, reporting any warnings via `warn`.
+ pub fn read<R>(r: &mut R, warn: &mut dyn FnMut(Warning)) -> Result<Self, Error>
+ where
+ R: Read + Seek,
+ {
+ let header_bytes = read_vec(r, 176).map_err(|e| {
+ Error::new(
+ None,
+ if e.kind() == ErrorKind::UnexpectedEof {
+ ErrorDetails::NotASystemFile
+ } else {
+ e.into()
+ },
+ )
+ })?;
+ Self::read_inner(&header_bytes, warn).map_err(|details| Error::new(Some(0..176), details))
+ }
+
+ fn read_inner(
+ header_bytes: &[u8],
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Self, ErrorDetails> {
+ #[derive(BinRead)]
+ struct RawHeader {
+ magic: [u8; 4],
+ eye_catcher: [u8; 60],
+ layout_code: u32,
+ nominal_case_size: u32,
+ compression_code: u32,
+ weight_index: u32,
+ n_cases: u32,
+ bias: f64,
+ creation_date: [u8; 9],
+ creation_time: [u8; 8],
+ file_label: [u8; 64],
+ _padding: [u8; 3],
+ }
+
+ if &header_bytes[8..20] == b"ENCRYPTEDSAV" {
+ return Err(ErrorDetails::Encrypted);
+ }
+
+ let be_header = RawHeader::read_be(&mut Cursor::new(&header_bytes)).unwrap();
+ let le_header = RawHeader::read_le(&mut Cursor::new(&header_bytes)).unwrap();
+
+ let magic: Magic = be_header
+ .magic
+ .try_into()
+ .map_err(|_| ErrorDetails::NotASystemFile)?;
+
+ let (endian, header) = if be_header.layout_code == 2 {
+ (Endian::Big, &be_header)
+ } else if le_header.layout_code == 2 {
+ (Endian::Little, &le_header)
+ } else {
+ return Err(ErrorDetails::NotASystemFile);
+ };
+
+ let nominal_case_size = (1..i32::MAX.cast_unsigned() / 16)
+ .contains(&header.nominal_case_size)
+ .then_some(header.nominal_case_size);
+
+ let compression = match (magic, header.compression_code) {
+ (Magic::Zsav, 2) => Some(Compression::ZLib),
+ (Magic::Zsav, code) => return Err(ErrorDetails::InvalidZsavCompression(code)),
+ (_, 0) => None,
+ (_, 1) => Some(Compression::Simple),
+ (_, code) => return Err(ErrorDetails::InvalidSavCompression(code)),
+ };
+
+ let weight_index = (header.weight_index > 0).then_some(header.weight_index);
+
+ let n_cases = (header.n_cases < i32::MAX as u32 / 2).then_some(header.n_cases);
+
+ if header.bias != 100.0 && header.bias != 0.0 {
+ warn(Warning::new(
+ Some(84..92),
+ HeaderWarning::UnexpectedBias(header.bias),
+ ));
+ }
+
+ let creation_date = RawString(header.creation_date.into());
+ let creation_time = RawString(header.creation_time.into());
+ let file_label = RawString(header.file_label.into());
+
+ Ok(FileHeader {
+ magic,
+ layout_code: header.layout_code,
+ nominal_case_size,
+ compression,
+ weight_index,
+ n_cases,
+ bias: header.bias,
+ creation_date,
+ creation_time,
+ eye_catcher: RawString(header.eye_catcher.into()),
+ file_label,
+ endian,
+ })
+ }
+
+ /// Decodes this record with `decoder` and returns the decoded version.
+ pub fn decode(self, decoder: &mut Decoder) -> FileHeader<String> {
+ let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
+ let file_label = decoder.decode(&self.file_label).to_string();
+ let creation_date = decoder.decode(&self.creation_date).to_string();
+ let creation_time = decoder.decode(&self.creation_time).to_string();
+ FileHeader {
+ eye_catcher,
+ weight_index: self.weight_index,
+ n_cases: self.n_cases,
+ file_label,
+ magic: self.magic,
+ layout_code: self.layout_code,
+ nominal_case_size: self.nominal_case_size,
+ compression: self.compression,
+ bias: self.bias,
+ creation_date,
+ creation_time,
+ endian: self.endian,
+ }
+ }
+}
+
+/// [Format](crate::format::Format) as represented in a system file.
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub struct RawFormat(
+ /// The most-significant 16 bits are the type, the next 8 bytes are the
+ /// width, and the least-significant 8 bits are the number of decimals.
+ pub u32,
+);
+
+impl Debug for RawFormat {
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ let type_ = format_name(self.0 >> 16);
+ let w = (self.0 >> 8) & 0xff;
+ let d = self.0 & 0xff;
+ write!(f, "{:06x} ({type_}{w}.{d})", self.0)
+ }
+}
+
+fn format_name(type_: u32) -> Cow<'static, str> {
+ match type_ {
+ 1 => "A",
+ 2 => "AHEX",
+ 3 => "COMMA",
+ 4 => "DOLLAR",
+ 5 => "F",
+ 6 => "IB",
+ 7 => "PIBHEX",
+ 8 => "P",
+ 9 => "PIB",
+ 10 => "PK",
+ 11 => "RB",
+ 12 => "RBHEX",
+ 15 => "Z",
+ 16 => "N",
+ 17 => "E",
+ 20 => "DATE",
+ 21 => "TIME",
+ 22 => "DATETIME",
+ 23 => "ADATE",
+ 24 => "JDATE",
+ 25 => "DTIME",
+ 26 => "WKDAY",
+ 27 => "MONTH",
+ 28 => "MOYR",
+ 29 => "QYR",
+ 30 => "WKYR",
+ 31 => "PCT",
+ 32 => "DOT",
+ 33 => "CCA",
+ 34 => "CCB",
+ 35 => "CCC",
+ 36 => "CCD",
+ 37 => "CCE",
+ 38 => "EDATE",
+ 39 => "SDATE",
+ 40 => "MTIME",
+ 41 => "YMDHMS",
+ _ => return format!("<unknown format {type_}>").into(),
+ }
+ .into()
+}
+
+impl MissingValues {
+ fn read<R>(
+ r: &mut R,
+ offsets: Range<u64>,
+ raw_width: RawWidth,
+ code: i32,
+ endian: Endian,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Self, Error>
+ where
+ R: Read + Seek,
+ {
+ let (individual_values, has_range) = match code {
+ 0 => return Ok(Self::default()),
+ 1..=3 => (code as usize, false),
+ -2 => (0, true),
+ -3 => (1, true),
+ _ => {
+ return Err(Error::new(
+ Some(offsets),
+ ErrorDetails::BadMissingValueCode(code),
+ ))
+ }
+ };
+
+ Self::read_inner(
+ r,
+ offsets.clone(),
+ raw_width,
+ individual_values,
+ has_range,
+ endian,
+ warn,
+ )
+ .map_err(|details| {
+ Error::new(
+ {
+ let n = individual_values + if has_range { 2 } else { 0 };
+ Some(offsets.start..offsets.end + 8 * n as u64)
+ },
+ details,
+ )
+ })
+ }
+
+ fn read_inner<R>(
+ r: &mut R,
+ offsets: Range<u64>,
+ raw_width: RawWidth,
+ individual_values: usize,
+ has_range: bool,
+ endian: Endian,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Self, ErrorDetails>
+ where
+ R: Read + Seek,
+ {
+ let mut values = Vec::with_capacity(individual_values);
+ let range = if has_range {
+ let low = read_bytes::<8, _>(r)?;
+ let high = read_bytes::<8, _>(r)?;
+ Some((low, high))
+ } else {
+ None
+ };
+ for _ in 0..individual_values {
+ values.push(read_bytes::<8, _>(r)?);
+ }
+
+ match VarWidth::try_from(raw_width) {
+ Ok(VarWidth::Numeric) => {
+ let values = values
+ .into_iter()
+ .map(|v| Datum::Number(endian.parse(v)))
+ .collect();
+
+ let range = range.map(|(low, high)| {
+ MissingValueRange::new(endian.parse(low), endian.parse(high))
+ });
+ return Ok(Self::new(values, range).unwrap());
+ }
+ Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::new(
+ Some(offsets),
+ VariableWarning::MissingValueStringRange,
+ )),
+ Ok(VarWidth::String(width)) => {
+ let width = width.min(8) as usize;
+ let values = values
+ .into_iter()
+ .map(|value| Datum::String(RawString::from(&value[..width])))
+ .collect();
+ return Ok(Self::new(values, None).unwrap());
+ }
+ Err(()) => warn(Warning::new(
+ Some(offsets),
+ VariableWarning::MissingValueContinuation,
+ )),
+ }
+ Ok(Self::default())
+ }
+}
+
+/// Warning for a variable record.
+#[derive(ThisError, Debug)]
+pub enum VariableWarning {
+ /// Missing value record with range not allowed for string variable.
+ #[error("Missing value record with range not allowed for string variable.")]
+ MissingValueStringRange,
+
+ /// Missing value not allowed for long string continuation.
+ #[error("Missing value not allowed for long string continuation")]
+ MissingValueContinuation,
+}
+
+/// A variable record in a system file.
+#[derive(Clone)]
+pub struct VariableRecord<S>
+where
+ S: Debug,
+{
+ /// Range of offsets in file.
+ pub offsets: Range<u64>,
+
+ /// Variable width, in the range -1..=255.
+ pub width: RawWidth,
+
+ /// Variable name, padded on the right with spaces.
+ pub name: S,
+
+ /// Print format.
+ pub print_format: RawFormat,
+
+ /// Write format.
+ pub write_format: RawFormat,
+
+ /// Missing values.
+ pub missing_values: MissingValues,
+
+ /// Optional variable label.
+ pub label: Option<S>,
+}
+
+impl<S> Debug for VariableRecord<S>
+where
+ S: Debug,
+{
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ writeln!(f, "Width: {}", self.width,)?;
+ writeln!(f, "Print format: {:?}", self.print_format)?;
+ writeln!(f, "Write format: {:?}", self.write_format)?;
+ writeln!(f, "Name: {:?}", &self.name)?;
+ writeln!(f, "Variable label: {:?}", self.label)?;
+ writeln!(f, "Missing values: {:?}", self.missing_values)
+ }
+}
+
+impl VariableRecord<RawString> {
+ /// Reads a variable record from `r`.
+ pub fn read<R>(
+ r: &mut R,
+ endian: Endian,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Record, Error>
+ where
+ R: Read + Seek,
+ {
+ #[derive(BinRead)]
+ struct RawVariableRecord {
+ width: i32,
+ has_variable_label: u32,
+ missing_value_code: i32,
+ print_format: u32,
+ write_format: u32,
+ name: [u8; 8],
+ }
+
+ let start_offset = r.stream_position()?;
+ let offsets = start_offset..start_offset + 28;
+ let raw_record =
+ read_vec(r, 28).map_err(|e| Error::new(Some(offsets.clone()), e.into()))?;
+ let raw_record =
+ RawVariableRecord::read_options(&mut Cursor::new(&raw_record), endian, ()).unwrap();
+
+ let width: RawWidth = raw_record.width.try_into().map_err(|_| {
+ Error::new(
+ Some(offsets.clone()),
+ ErrorDetails::BadVariableWidth(raw_record.width),
+ )
+ })?;
+
+ let label = match raw_record.has_variable_label {
+ 0 => None,
+ 1 => {
+ let len: u32 = endian.parse(read_bytes(r)?);
+ let read_len = len.min(65535) as usize;
+ let label = RawString(read_vec(r, read_len)?);
+
+ let padding_bytes = len.next_multiple_of(4) - len;
+ let _ = read_vec(r, padding_bytes as usize)?;
+
+ Some(label)
+ }
+ _ => {
+ return Err(Error::new(
+ Some(offsets),
+ ErrorDetails::BadVariableLabelCode(raw_record.has_variable_label),
+ ));
+ }
+ };
+
+ let missing_values = MissingValues::read(
+ r,
+ offsets,
+ width,
+ raw_record.missing_value_code,
+ endian,
+ warn,
+ )?;
+
+ let end_offset = r.stream_position()?;
+
+ Ok(Record::Variable(VariableRecord {
+ offsets: start_offset..end_offset,
+ width,
+ name: RawString(raw_record.name.into()),
+ print_format: RawFormat(raw_record.print_format),
+ write_format: RawFormat(raw_record.write_format),
+ missing_values,
+ label,
+ }))
+ }
+
+ /// Decodes a variable record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> VariableRecord<String> {
+ VariableRecord {
+ offsets: self.offsets.clone(),
+ width: self.width,
+ name: decoder.decode(&self.name).to_string(),
+ print_format: self.print_format,
+ write_format: self.write_format,
+ missing_values: self.missing_values,
+ label: self
+ .label
+ .as_ref()
+ .map(|label| decoder.decode(label).to_string()),
+ }
+ }
+}
+
+/// Warning for a value label record.
+#[derive(ThisError, Debug)]
+pub enum ValueLabelWarning {
+ /// At least one valid variable index for value labels is required but none were specified.
+ #[error("At least one valid variable index is required but none were specified.")]
+ NoVarIndexes,
+
+ /// Mixed variable types in value label record.
+ #[error("First variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", !var_type)]
+ MixedVarTypes {
+ /// Variable type.
+ var_type: VarType,
+ /// Indexes of variables with the other type.
+ wrong_types: Vec<u32>,
+ },
+
+ /// Value label invalid variable indexes.
+ #[error(
+ "One or more variable indexes were not in the valid range [1,{max}] or referred to string continuations: {invalid:?}"
+ )]
+ InvalidVarIndexes {
+ /// Maximum variable index.
+ max: usize,
+ /// Invalid variable indexes.
+ invalid: Vec<u32>,
+ },
+}
+
+/// A value and label in a system file.
+#[derive(Clone, Debug)]
+pub struct ValueLabel<D, S>
+where
+ D: Debug,
+ S: Debug,
+{
+ /// The value being labeled.
+ pub datum: D,
+ /// The label.
+ pub label: S,
+}
+
+/// A value label record in a system file.
+///
+/// This represents both the type-3 and type-4 records together, since they are
+/// always paired anyway.
+#[derive(Clone)]
+pub struct ValueLabelRecord<D, S>
+where
+ D: Debug,
+ S: Debug,
+{
+ /// Range of offsets in file.
+ pub offsets: Range<u64>,
+
+ /// The labels.
+ pub labels: Vec<ValueLabel<D, S>>,
+
+ /// The 1-based indexes of the variable indexes.
+ pub dict_indexes: Vec<u32>,
+
+ /// The types of the variables.
+ pub var_type: VarType,
+}
+
+impl<D, S> Debug for ValueLabelRecord<D, S>
+where
+ D: Debug,
+ S: Debug,
+{
+ fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+ writeln!(f, "labels: ")?;
+ for label in self.labels.iter() {
+ writeln!(f, "{label:?}")?;
+ }
+ write!(f, "apply to {} variables", self.var_type)?;
+ for dict_index in self.dict_indexes.iter() {
+ write!(f, " #{dict_index}")?;
+ }
+ Ok(())
+ }
+}
+
+impl<D, S> ValueLabelRecord<D, S>
+where
+ D: Debug,
+ S: Debug,
+{
+ /// Maximum number of value labels in a record.
+ pub const MAX_LABELS: u32 = u32::MAX / 8;
+
+ /// Maximum number of variable indexes in a record.
+ pub const MAX_INDEXES: u32 = u32::MAX / 8;
+}
+
+impl ValueLabelRecord<RawDatum, RawString> {
+ pub(super) fn read<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ var_types: &VarTypes,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Option<Record>, Error> {
+ let label_offset = r.stream_position()?;
+ let n: u32 = endian.parse(read_bytes(r)?);
+ if n > Self::MAX_LABELS {
+ return Err(Error::new(
+ Some(label_offset..label_offset + 4),
+ ErrorDetails::BadNumberOfValueLabels {
+ n,
+ max: Self::MAX_LABELS,
+ },
+ ));
+ }
+
+ let mut labels = Vec::new();
+ for _ in 0..n {
+ let value = UntypedDatum(read_bytes(r)?);
+ let label_len: u8 = endian.parse(read_bytes(r)?);
+ let label_len = label_len as usize;
+ let padded_len = (label_len + 1).next_multiple_of(8);
+
+ let mut label = read_vec(r, padded_len - 1)?;
+ label.truncate(label_len);
+ labels.push((value, RawString(label)));
+ }
+
+ let index_offset = r.stream_position()?;
+ let rec_type: u32 = endian.parse(read_bytes(r)?);
+ if rec_type != 4 {
+ return Err(Error::new(
+ Some(index_offset..index_offset + 4),
+ ErrorDetails::ExpectedVarIndexRecord(rec_type),
+ ));
+ }
+
+ let n: u32 = endian.parse(read_bytes(r)?);
+ let n_offsets = index_offset + 4..index_offset + 8;
+ if n > Self::MAX_INDEXES {
+ return Err(Error::new(
+ Some(n_offsets),
+ ErrorDetails::TooManyVarIndexes {
+ n,
+ max: Self::MAX_INDEXES,
+ },
+ ));
+ } else if n == 0 {
+ warn(Warning::new(
+ Some(n_offsets),
+ ValueLabelWarning::NoVarIndexes,
+ ));
+ return Ok(None);
+ }
+
+ let index_offset = r.stream_position()?;
+ let mut dict_indexes = Vec::with_capacity(n as usize);
+ let mut invalid_indexes = Vec::new();
+ for _ in 0..n {
+ let index: u32 = endian.parse(read_bytes(r)?);
+ if var_types.is_valid_index(index as usize) {
+ dict_indexes.push(index);
+ } else {
+ invalid_indexes.push(index);
+ }
+ }
+ let index_offsets = index_offset..r.stream_position()?;
+ if !invalid_indexes.is_empty() {
+ warn(Warning::new(
+ Some(index_offsets.clone()),
+ ValueLabelWarning::InvalidVarIndexes {
+ max: var_types.n_values(),
+ invalid: invalid_indexes,
+ },
+ ));
+ }
+
+ let Some(&first_index) = dict_indexes.first() else {
+ return Ok(None);
+ };
+ let var_type = VarType::from(var_types.types[first_index as usize - 1].unwrap());
+ let mut wrong_type_indexes = Vec::new();
+ dict_indexes.retain(|&index| {
+ if var_types.types[index as usize - 1].map(VarType::from) != Some(var_type) {
+ wrong_type_indexes.push(index);
+ false
+ } else {
+ true
+ }
+ });
+ if !wrong_type_indexes.is_empty() {
+ warn(Warning::new(
+ Some(index_offsets),
+ ValueLabelWarning::MixedVarTypes {
+ var_type,
+ wrong_types: wrong_type_indexes,
+ },
+ ));
+ }
+
+ let labels = labels
+ .into_iter()
+ .map(|(value, label)| ValueLabel {
+ datum: RawDatum::from_raw(&value, var_type, endian),
+ label,
+ })
+ .collect();
+
+ let end_offset = r.stream_position()?;
+ Ok(Some(Record::ValueLabel(ValueLabelRecord {
+ offsets: label_offset..end_offset,
+ labels,
+ dict_indexes,
+ var_type,
+ })))
+ }
+
+ /// Decodes a value label record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> ValueLabelRecord<RawDatum, String> {
+ let labels = self
+ .labels
+ .iter()
+ .map(
+ |ValueLabel {
+ datum: value,
+ label,
+ }| ValueLabel {
+ datum: value.clone(),
+ label: decoder.decode(label).to_string(),
+ },
+ )
+ .collect();
+ ValueLabelRecord {
+ offsets: self.offsets.clone(),
+ labels,
+ dict_indexes: self.dict_indexes.clone(),
+ var_type: self.var_type,
+ }
+ }
+}
+
/// A document record in a system file.
///
/// `S` is the representation of one document line: [RawDocumentLine] as read
/// from disk, or [String] after decoding.
#[derive(Clone, Debug)]
pub struct DocumentRecord<S>
where
    S: Debug,
{
    /// The range of file offsets occupied by the record.
    pub offsets: Range<u64>,

    /// The document, as an array of lines. Raw lines are exactly 80 bytes long
    /// and are right-padded with spaces without any new-line termination.
    pub lines: Vec<S>,
}

/// One line in a document.
pub type RawDocumentLine = RawStrArray<DOC_LINE_LEN>;

/// Length of a line in a document. Document lines are fixed-length and
/// padded on the right with spaces.
pub const DOC_LINE_LEN: usize = 80;
+
+impl DocumentRecord<RawDocumentLine> {
+ /// Maximum number of lines we will accept in a document. This is simply
+ /// the maximum number that will fit in a 32-bit space.
+ pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
+
+ /// Reads a document record from `r`.
+ pub fn read<R>(r: &mut R, endian: Endian) -> Result<Record, Error>
+ where
+ R: Read + Seek,
+ {
+ let start_offset = r.stream_position()?;
+ let n: u32 = endian.parse(read_bytes(r)?);
+ let n = n as usize;
+ if n > Self::MAX_LINES {
+ Err(Error::new(
+ Some(start_offset..start_offset + 4),
+ ErrorDetails::BadDocumentLength {
+ n,
+ max: Self::MAX_LINES,
+ },
+ ))
+ } else {
+ let offsets = start_offset..start_offset.saturating_add((n * DOC_LINE_LEN) as u64);
+ let mut lines = Vec::with_capacity(n);
+ for _ in 0..n {
+ lines.push(RawStrArray(
+ read_bytes(r).map_err(|e| Error::new(Some(offsets.clone()), e.into()))?,
+ ));
+ }
+ Ok(Record::Document(DocumentRecord { offsets, lines }))
+ }
+ }
+
+ /// Decodes the document record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> DocumentRecord<String> {
+ DocumentRecord {
+ offsets: self.offsets.clone(),
+ lines: self
+ .lines
+ .iter()
+ .map(|s| decoder.decode_slice(&s.0).to_string())
+ .collect(),
+ }
+ }
+}
+
/// Constraints on an extension record in a system file.
pub struct ExtensionRecord<'a> {
    /// The allowed size for elements in the extension record, or `None` to not
    /// enforce a particular size.
    pub size: Option<u32>,

    /// The allowed number of elements in the extension record, or `None` to
    /// not enforce a particular count.
    pub count: Option<u32>,

    /// The name of the record, for error messages.
    pub name: &'a str,
}

/// An integer info record in a system file.
///
/// Eight 32-bit integers describing the machine that wrote the file.
#[derive(Clone, Debug)]
pub struct IntegerInfoRecord {
    /// File offsets occupied by the record.
    pub offsets: Range<u64>,

    /// Version number.
    ///
    /// e.g. `(1,2,3)` for version 1.2.3.
    pub version: (i32, i32, i32),

    /// Identifies the type of machine.
    ///
    /// Mostly useless. PSPP uses value -1.
    pub machine_code: i32,

    /// Floating point representation (1 for IEEE 754).
    pub floating_point_rep: i32,

    /// [Compression].
    pub compression_code: i32,

    /// Endianness.
    pub endianness: i32,

    /// Character encoding (usually a code page number).
    pub character_code: i32,
}
+
+impl IntegerInfoRecord {
+ /// Parses this record from `ext`.
+ pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
+ ext.check_size(Some(4), Some(8), "integer record")?;
+
+ let mut input = &ext.data[..];
+ let data: Vec<i32> = (0..8)
+ .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
+ .collect();
+ Ok(Record::IntegerInfo(IntegerInfoRecord {
+ offsets: ext.offsets.clone(),
+ version: (data[0], data[1], data[2]),
+ machine_code: data[3],
+ floating_point_rep: data[4],
+ compression_code: data[5],
+ endianness: data[6],
+ character_code: data[7],
+ }))
+ }
+}
+
+impl FloatInfoRecord {
+ /// Parses this record from `ext`.
+ pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
+ ext.check_size(Some(8), Some(3), "floating point record")?;
+
+ let mut input = &ext.data[..];
+ let data: Vec<f64> = (0..3)
+ .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
+ .collect();
+ Ok(Record::FloatInfo(FloatInfoRecord {
+ sysmis: data[0],
+ highest: data[1],
+ lowest: data[2],
+ }))
+ }
+}
+
/// A floating-point info record.
///
/// Three doubles describing the special values used by the file's writer.
#[derive(Clone, Debug)]
pub struct FloatInfoRecord {
    /// Value used for system-missing values.
    pub sysmis: f64,

    /// Highest numeric value (e.g. [f64::MAX]).
    pub highest: f64,

    /// Smallest numeric value (e.g. -[f64::MAX]).
    pub lowest: f64,
}
+
+/// Long variable names record.
+#[derive(Clone, Debug)]
+pub struct RawLongNamesRecord(
+ /// Text contents of record.
+ TextRecord,
+);
+
+impl RawLongNamesRecord {
+ /// Parses this record from `extension`.
+ pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
+ Ok(Record::LongNames(Self(TextRecord::parse(
+ extension,
+ "long names record",
+ )?)))
+ }
+
+ /// Decodes this record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> LongNamesRecord {
+ let input = decoder.decode(&self.0.text);
+ let mut names = Vec::new();
+ for pair in input.split('\t').filter(|s| !s.is_empty()) {
+ if let Some(long_name) =
+ LongName::parse(pair, decoder).issue_warning(&self.0.offsets, &mut decoder.warn)
+ {
+ names.push(long_name);
+ }
+ }
+ LongNamesRecord(names)
+ }
+}
+
+/// An extension record whose contents are a text string.
+#[derive(Clone, Debug)]
+pub struct TextRecord {
+ /// Range of file offsets for this record in bytes.
+ pub offsets: Range<u64>,
+
+ /// The text content of the record.
+ pub text: RawString,
+}
+
+impl TextRecord {
+ /// Parses this record from `extension`.
+ pub fn parse(extension: Extension, name: &'static str) -> Result<TextRecord, WarningDetails> {
+ extension.check_size(Some(1), None, name)?;
+ Ok(Self {
+ offsets: extension.offsets,
+ text: extension.data.into(),
+ })
+ }
+}
+
/// Warning for a very long string variable record.
///
/// Produced while parsing `SHORT=LENGTH` pairs; see [VeryLongString::parse].
#[derive(ThisError, Debug)]
pub enum VeryLongStringWarning {
    /// Invalid variable name.
    #[error("Invalid variable name. {0}")]
    InvalidLongStringName(
        /// Variable name error.
        IdError,
    ),

    /// Missing `=` delimiter between name and length.
    #[error("Missing delimiter in {0:?}.")]
    VeryLongStringMissingDelimiter(String),

    /// Invalid length.
    #[error("Invalid length in {0:?}.")]
    VeryLongStringInvalidLength(
        /// Length.
        String,
    ),
}
+
+/// A very long string parsed from a [VeryLongStringsRecord].
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+ /// Short name of very long string variable.
+ pub short_name: Identifier,
+
+ /// Length of very long string variable (in `256..=32767`).
+ pub length: u16,
+}
+
+impl VeryLongString {
+ /// Parses a [VeryLongString] from `input` using `decoder`.
+ pub fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, WarningDetails> {
+ let Some((short_name, length)) = input.split_once('=') else {
+ return Err(VeryLongStringWarning::VeryLongStringMissingDelimiter(input.into()).into());
+ };
+ let short_name = decoder
+ .new_identifier(short_name)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(VeryLongStringWarning::InvalidLongStringName)?;
+ let length = length
+ .parse()
+ .map_err(|_| VeryLongStringWarning::VeryLongStringInvalidLength(input.into()))?;
+ Ok(VeryLongString { short_name, length })
+ }
+}
+
+/// A very long string record as text.
+#[derive(Clone, Debug)]
+pub struct RawVeryLongStringsRecord(TextRecord);
+
+/// A parsed very long string record.
+#[derive(Clone, Debug)]
+pub struct VeryLongStringsRecord(
+ /// The very long strings.
+ pub Vec<VeryLongString>,
+);
+
+impl RawVeryLongStringsRecord {
+ /// Parses this record from `extension`.
+ pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
+ Ok(Record::VeryLongStrings(Self(TextRecord::parse(
+ extension,
+ "very long strings record",
+ )?)))
+ }
+
+ /// Decodes this record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> VeryLongStringsRecord {
+ let input = decoder.decode(&self.0.text);
+ let mut very_long_strings = Vec::new();
+ for tuple in input
+ .split('\0')
+ .map(|s| s.trim_start_matches('\t'))
+ .filter(|s| !s.is_empty())
+ {
+ if let Some(vls) = VeryLongString::parse(decoder, tuple)
+ .issue_warning(&self.0.offsets, &mut decoder.warn)
+ {
+ very_long_strings.push(vls)
+ }
+ }
+ VeryLongStringsRecord(very_long_strings)
+ }
+}
+
/// Warning for a multiple response set record.
///
/// Produced while parsing or decoding multiple-response sets and the counted
/// strings embedded in them.
#[derive(ThisError, Debug)]
pub enum MultipleResponseWarning {
    /// Invalid multiple response set name.
    #[error("Invalid multiple response set name. {0}")]
    InvalidMrSetName(
        /// Variable name error.
        IdError,
    ),

    /// Invalid variable name.
    #[error("Invalid variable name. {0}")]
    InvalidMrSetVariableName(
        /// Variable name error.
        IdError,
    ),

    /// Invalid multiple dichotomy label type.
    #[error("Invalid multiple dichotomy label type.")]
    InvalidMultipleDichotomyLabelType,

    /// Invalid multiple response type.
    #[error("Invalid multiple response type.")]
    InvalidMultipleResponseType,

    /// Syntax error.
    #[error("Syntax error ({0}).")]
    MultipleResponseSyntaxError(
        /// Detailed error.
        &'static str,
    ),

    /// Syntax error parsing counted string (missing trailing space).
    #[error("Syntax error parsing counted string (missing trailing space).")]
    CountedStringMissingSpace,

    /// Syntax error parsing counted string (invalid UTF-8).
    #[error("Syntax error parsing counted string (invalid UTF-8).")]
    CountedStringInvalidUTF8,

    /// Syntax error parsing counted string (invalid length).
    #[error("Syntax error parsing counted string (invalid length {0:?}).")]
    CountedStringInvalidLength(
        /// Length.
        String,
    ),

    /// Syntax error parsing counted string (length goes past end of input).
    #[error("Syntax error parsing counted string (length {0:?} goes past end of input).")]
    CountedStringTooLong(
        /// Length.
        usize,
    ),
}
+
/// The type of a multiple-response set.
///
/// Parsed from the type byte (`C`, `D`, or `E`) in the record.
#[derive(Clone, Debug)]
pub enum MultipleResponseType {
    /// Multiple-dichotomy set.
    MultipleDichotomy {
        /// The value that is counted in the set.
        value: RawString,

        /// What categories are labeled.
        labels: CategoryLabels,
    },

    /// Multiple-category set.
    MultipleCategory,
}
+
impl MultipleResponseType {
    /// Parses a [MultipleResponseType] from `input`, returning the type and the
    /// input remaining to be parsed.
    ///
    /// The first byte selects the type: `C` for a multiple-category set, `D`
    /// for a multiple-dichotomy set whose counted value follows directly, and
    /// `E` for a multiple-dichotomy set with an explicit label-source flag
    /// before the counted value.
    fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), WarningDetails> {
        let (mr_type, input) = match input.split_first() {
            Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
            Some((b'D', input)) => {
                // `D` is followed directly by the counted value.
                let (value, input) = parse_counted_string(input)?;
                (
                    MultipleResponseType::MultipleDichotomy {
                        value,
                        labels: CategoryLabels::VarLabels,
                    },
                    input,
                )
            }
            Some((b'E', input)) => {
                // `E` carries a flag choosing where category labels come
                // from: ` 1 ` for counted values, ` 11 ` for variable labels.
                // (The prefixes cannot be confused: their third bytes
                // differ.)
                let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
                    (CategoryLabels::CountedValues, rest)
                } else if let Some(rest) = input.strip_prefix(b" 11 ") {
                    (CategoryLabels::VarLabels, rest)
                } else {
                    return Err(MultipleResponseWarning::InvalidMultipleDichotomyLabelType.into());
                };
                let (value, input) = parse_counted_string(input)?;
                (
                    MultipleResponseType::MultipleDichotomy { value, labels },
                    input,
                )
            }
            _ => return Err(MultipleResponseWarning::InvalidMultipleResponseType.into()),
        };
        Ok((mr_type, input))
    }
}
+
/// A multiple-response set in a system file.
///
/// `I` is the representation of identifiers and `S` of strings:
/// [RawString] before decoding, [Identifier] and [String] after.
#[derive(Clone, Debug)]
pub struct MultipleResponseSet<I, S>
where
    I: Debug,
    S: Debug,
{
    /// The set's name.
    pub name: I,
    /// The set's label.
    pub label: S,
    /// The type of multiple-response set.
    pub mr_type: MultipleResponseType,
    /// Short names of the variables in the set.
    pub short_names: Vec<I>,
}
+
+impl MultipleResponseSet<RawString, RawString> {
+ /// Parses a multiple-response set from `input`. Returns the set and the
+ /// input remaining to be parsed following the set.
+ fn parse(input: &[u8]) -> Result<(Self, &[u8]), WarningDetails> {
+ let Some(equals) = input.iter().position(|&b| b == b'=') else {
+ return Err(MultipleResponseWarning::MultipleResponseSyntaxError("missing `=`").into());
+ };
+ let (name, input) = input.split_at(equals);
+ let input = input.strip_prefix(b"=").unwrap();
+ let (mr_type, input) = MultipleResponseType::parse(input)?;
+ let Some(input) = input.strip_prefix(b" ") else {
+ return Err(MultipleResponseWarning::MultipleResponseSyntaxError(
+ "missing space after multiple response type",
+ )
+ .into());
+ };
+ let (label, mut input) = parse_counted_string(input)?;
+ let mut vars = Vec::new();
+ while input.first() != Some(&b'\n') {
+ match input.split_first() {
+ Some((b' ', rest)) => {
+ let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
+ return Err(MultipleResponseWarning::MultipleResponseSyntaxError(
+ "missing variable name delimiter",
+ )
+ .into());
+ };
+ let (var, rest) = rest.split_at(length);
+ if !var.is_empty() {
+ vars.push(var.into());
+ }
+ input = rest;
+ }
+ _ => {
+ return Err(MultipleResponseWarning::MultipleResponseSyntaxError(
+ "missing space preceding variable name",
+ )
+ .into());
+ }
+ }
+ }
+ while input.first() == Some(&b'\n') {
+ input = &input[1..];
+ }
+ Ok((
+ MultipleResponseSet {
+ name: name.into(),
+ label,
+ mr_type,
+ short_names: vars,
+ },
+ input,
+ ))
+ }
+
+ /// Decodes this multiple-response set using `decoder`. `offsets` is used
+ /// for issuing warnings.
+ fn decode(
+ &self,
+ offsets: &Range<u64>,
+ decoder: &mut Decoder,
+ ) -> Result<MultipleResponseSet<Identifier, String>, WarningDetails> {
+ let mut short_names = Vec::with_capacity(self.short_names.len());
+ for short_name in self.short_names.iter() {
+ if let Some(short_name) = decoder
+ .decode_identifier(short_name)
+ .map_err(MultipleResponseWarning::InvalidMrSetName)
+ .issue_warning(offsets, &mut decoder.warn)
+ {
+ short_names.push(short_name);
+ }
+ }
+ Ok(MultipleResponseSet {
+ name: decoder
+ .decode_identifier(&self.name)
+ .map_err(MultipleResponseWarning::InvalidMrSetVariableName)?,
+ label: decoder.decode(&self.label).to_string(),
+ mr_type: self.mr_type.clone(),
+ short_names,
+ })
+ }
+}
+
/// A multiple-response set record in a system file.
///
/// `I` and `S` are the identifier and string representations, raw before
/// decoding and [Identifier]/[String] after.
#[derive(Clone, Debug)]
pub struct MultipleResponseRecord<I, S>
where
    I: Debug,
    S: Debug,
{
    /// File offsets of the record.
    pub offsets: Range<u64>,

    /// The multiple-response sets.
    pub sets: Vec<MultipleResponseSet<I, S>>,
}
+
+impl MultipleResponseRecord<RawString, RawString> {
+ /// Parses a multiple-response set from `ext`.
+ pub fn parse(ext: &Extension) -> Result<Record, WarningDetails> {
+ ext.check_size(Some(1), None, "multiple response set record")?;
+
+ let mut input = &ext.data[..];
+ let mut sets = Vec::new();
+ loop {
+ while let Some(suffix) = input.strip_prefix(b"\n") {
+ input = suffix;
+ }
+ if input.is_empty() {
+ break;
+ }
+ let (set, rest) = MultipleResponseSet::parse(input)?;
+ sets.push(set);
+ input = rest;
+ }
+ Ok(Record::MultipleResponse(MultipleResponseRecord {
+ offsets: ext.offsets.clone(),
+ sets,
+ }))
+ }
+}
+
+impl MultipleResponseRecord<RawString, RawString> {
+ /// Decodes this record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> MultipleResponseRecord<Identifier, String> {
+ let mut sets = Vec::new();
+ for set in self.sets.iter() {
+ if let Some(set) = set
+ .decode(&self.offsets, decoder)
+ .issue_warning(&self.offsets, &mut decoder.warn)
+ {
+ sets.push(set);
+ }
+ }
+ MultipleResponseRecord {
+ offsets: self.offsets,
+ sets,
+ }
+ }
+}
+
+fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), WarningDetails> {
+ let Some(space) = input.iter().position(|&b| b == b' ') else {
+ return Err(MultipleResponseWarning::CountedStringMissingSpace.into());
+ };
+ let Ok(length) = from_utf8(&input[..space]) else {
+ return Err(MultipleResponseWarning::CountedStringInvalidUTF8.into());
+ };
+ let Ok(length): Result<usize, _> = length.parse() else {
+ return Err(MultipleResponseWarning::CountedStringInvalidLength(length.into()).into());
+ };
+
+ let Some((string, rest)) = input[space + 1..].split_at_checked(length) else {
+ return Err(MultipleResponseWarning::CountedStringTooLong(length).into());
+ };
+ Ok((string.into(), rest))
+}
+
/// Warning for a variable display record.
///
/// Produced while parsing the per-variable display tuples; see
/// [VarDisplayRecord].
#[derive(ThisError, Debug)]
pub enum VariableDisplayWarning {
    /// Wrong number of variable display items.
    #[error("Record contains {count} items but should contain either {first} or {second}.")]
    InvalidVariableDisplayCount {
        /// Actual count.
        count: usize,
        /// First valid count.
        first: usize,
        /// Second valid count.
        second: usize,
    },

    /// Invalid variable measurement level value.
    #[error("Invalid variable measurement level value {0}.")]
    InvalidMeasurement(
        /// Invalid value.
        u32,
    ),

    /// Invalid variable display alignment value.
    #[error("Invalid variable display alignment value {0}.")]
    InvalidAlignment(
        /// Invalid value.
        u32,
    ),
}
+
+impl Measure {
+ fn try_decode(source: u32) -> Result<Option<Measure>, WarningDetails> {
+ match source {
+ 0 => Ok(None),
+ 1 => Ok(Some(Measure::Nominal)),
+ 2 => Ok(Some(Measure::Ordinal)),
+ 3 => Ok(Some(Measure::Scale)),
+ _ => Err(VariableDisplayWarning::InvalidMeasurement(source).into()),
+ }
+ }
+}
+
+impl Alignment {
+ fn try_decode(source: u32) -> Result<Option<Alignment>, WarningDetails> {
+ match source {
+ 0 => Ok(Some(Alignment::Left)),
+ 1 => Ok(Some(Alignment::Right)),
+ 2 => Ok(Some(Alignment::Center)),
+ _ => Err(VariableDisplayWarning::InvalidAlignment(source).into()),
+ }
+ }
+}
+
/// Variable display settings for one variable, in a system file.
///
/// Each field is `None` when the file omitted it or supplied an invalid
/// code.
#[derive(Clone, Debug)]
pub struct VarDisplay {
    /// Measurement level.
    pub measure: Option<Measure>,

    /// Variable display width.
    pub width: Option<u32>,

    /// Variable alignment.
    pub alignment: Option<Alignment>,
}

/// A variable display record in a system file.
#[derive(Clone, Debug)]
pub struct VarDisplayRecord(
    /// Variable display settings for each variable.
    pub Vec<VarDisplay>,
);
+
impl VarDisplayRecord {
    /// Parses a variable display record from `ext` given variable types `var_types`.
    ///
    /// The record holds one tuple per variable, in order: either
    /// `(measure, alignment)` or `(measure, width, alignment)`.  Which form
    /// is present is inferred from the element count; any other count is an
    /// error.  Invalid measure or alignment codes are reported through
    /// `warn` and decoded as `None`.
    fn parse(
        ext: &Extension,
        var_types: &VarTypes,
        endian: Endian,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Record, WarningDetails> {
        ext.check_size(Some(4), None, "variable display record")?;

        // 3 items per variable means the width is included; 2 means it isn't.
        let n_vars = var_types.n_vars();
        let has_width = if ext.count as usize == 3 * n_vars {
            true
        } else if ext.count as usize == 2 * n_vars {
            false
        } else {
            return Err(VariableDisplayWarning::InvalidVariableDisplayCount {
                count: ext.count as usize,
                first: 2 * n_vars,
                second: 3 * n_vars,
            }
            .into());
        };

        // `check_size` plus the count check above guarantee enough data, so
        // these reads cannot fail.  The read order (measure, optional width,
        // alignment) must match the on-disk tuple layout.
        let mut var_displays = Vec::new();
        let mut input = &ext.data[..];
        for _ in 0..n_vars {
            let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
                .issue_warning(&ext.offsets, warn)
                .flatten();
            let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
            let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
                .issue_warning(&ext.offsets, warn)
                .flatten();
            var_displays.push(VarDisplay {
                measure,
                width,
                alignment,
            });
        }
        Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
    }
}
+
/// Warning for a long string missing value record.
#[derive(ThisError, Debug)]
pub enum LongStringMissingValuesWarning {
    /// Invalid value length (the format requires each missing value to be
    /// exactly 8 bytes).
    #[error("Value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
    BadValueLength {
        /// Offset of the value length.
        offset: u64,
        /// Actual value length.
        value_len: u32,
    },

    /// Invalid variable name.
    #[error("Invalid variable name. {0}")]
    InvalidVariableName(
        /// Variable name error.
        IdError,
    ),
}
+
+/// Missing values for one long string variable.
+#[derive(Clone, Debug)]
+pub struct LongStringMissingValues<N>
+where
+ N: Debug,
+{
+ /// Variable name.
+ pub var_name: N,
+
+ /// Missing values.
+ pub missing_values: Vec<RawStrArray<8>>,
+}
+
+impl LongStringMissingValues<RawString> {
+ /// Decodes these settings using `decoder`.
+ fn decode(
+ &self,
+ decoder: &mut Decoder,
+ ) -> Result<LongStringMissingValues<Identifier>, IdError> {
+ Ok(LongStringMissingValues {
+ var_name: decoder.decode_identifier(&self.var_name)?,
+ missing_values: self.missing_values.clone(),
+ })
+ }
+}
+
/// Long string missing values record in a system file.
#[derive(Clone, Debug)]
pub struct LongStringMissingValueRecord<N>
where
    N: Debug,
{
    /// The record's file offsets.
    pub offsets: Range<u64>,

    /// The long string missing values.
    pub values: Vec<LongStringMissingValues<N>>,
}
+
impl LongStringMissingValueRecord<RawString> {
    /// Parses this record from `ext`.
    ///
    /// Each entry is a variable name, a 1-byte missing-value count, a 4-byte
    /// value length (which must be 8), and then the values themselves.
    /// Entries with a bad value length are skipped with a warning.
    pub fn parse(
        ext: &Extension,
        endian: Endian,
        warn: &mut dyn FnMut(Warning),
    ) -> Result<Record, WarningDetails> {
        ext.check_size(Some(1), None, "long string missing values record")?;

        let mut input = &ext.data[..];
        let mut missing_value_set = Vec::new();
        while !input.is_empty() {
            let var_name = read_string(&mut input, endian)?;
            let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
            let value_len: u32 = endian.parse(read_bytes(&mut input)?);
            if value_len != 8 {
                // NOTE(review): the 8-byte rewind looks intended to point the
                // warning at the value-length field — confirm against the
                // on-disk layout.
                let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
                warn(Warning::new(
                    Some(ext.offsets.clone()),
                    LongStringMissingValuesWarning::BadValueLength { offset, value_len },
                ));
                // Skip this entry's values and move on to the next entry.
                read_vec(&mut input, value_len as usize * n_missing_values as usize)?;
                continue;
            }
            let mut missing_values = Vec::new();
            for i in 0..n_missing_values {
                if i > 0 {
                    // Tolerate files written by old, buggy versions of PSPP
                    // where we believed that the value_length was repeated
                    // before each missing value.
                    let mut peek = input;
                    let number: u32 = endian.parse(read_bytes(&mut peek)?);
                    if number == 8 {
                        input = peek;
                    }
                }

                let value: [u8; 8] = read_bytes(&mut input)?;
                missing_values.push(RawStrArray(value));
            }
            missing_value_set.push(LongStringMissingValues {
                var_name,
                missing_values,
            });
        }
        Ok(Record::LongStringMissingValues(
            LongStringMissingValueRecord {
                offsets: ext.offsets.clone(),
                values: missing_value_set,
            },
        ))
    }

    /// Decodes this record using `decoder`.  Entries whose variable name does
    /// not decode are dropped with a warning.
    pub fn decode(self, decoder: &mut Decoder) -> LongStringMissingValueRecord<Identifier> {
        let mut mvs = Vec::with_capacity(self.values.len());
        for mv in self.values.iter() {
            if let Some(mv) = mv
                .decode(decoder)
                .map_err(LongStringMissingValuesWarning::InvalidVariableName)
                .issue_warning(&self.offsets, &mut decoder.warn)
            {
                mvs.push(mv);
            }
        }
        LongStringMissingValueRecord {
            offsets: self.offsets,
            values: mvs,
        }
    }
}
+
+/// A character encoding record in a system file.
+#[derive(Clone, Debug)]
+pub struct EncodingRecord(
+ /// The encoding name.
+ pub String,
+);
+
+impl EncodingRecord {
+ /// Parses this record from `ext`.
+ pub fn parse(ext: &Extension) -> Result<Record, WarningDetails> {
+ ext.check_size(Some(1), None, "encoding record")?;
+
+ Ok(Record::Encoding(EncodingRecord(
+ String::from_utf8(ext.data.clone()).map_err(|_| WarningDetails::BadEncodingName)?,
+ )))
+ }
+}
+
+/// The extended number of cases record in a system file.
+#[derive(Clone, Debug)]
+pub struct NumberOfCasesRecord {
+ /// Always observed as 1.
+ pub one: u64,
+
+ /// Number of cases.
+ pub n_cases: u64,
+}
+
+impl NumberOfCasesRecord {
+ /// Parses a number of cases record from `ext` using `endian`.
+ pub fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
+ ext.check_size(Some(8), Some(2), "extended number of cases record")?;
+
+ let mut input = &ext.data[..];
+ let one = endian.parse(read_bytes(&mut input)?);
+ let n_cases = endian.parse(read_bytes(&mut input)?);
+
+ Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
+ }
+}
+
/// Warning for a variable sets record.
#[derive(ThisError, Debug)]
pub enum VariableSetWarning {
    /// Invalid variable name.
    #[error("Invalid variable name. {0}")]
    InvalidVariableSetName(
        /// Variable name error.
        IdError,
    ),

    /// Missing `=` delimiter between set name and variable list.
    #[error("Missing name delimiter.")]
    VariableSetMissingEquals,
}
+
+/// Raw (text) version of the variable set record in a system file.
+#[derive(Clone, Debug)]
+pub struct RawVariableSetRecord(TextRecord);
+
+impl RawVariableSetRecord {
+ /// Parses the record from `extension`.
+ pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
+ Ok(Record::VariableSets(Self(TextRecord::parse(
+ extension,
+ "variable sets record",
+ )?)))
+ }
+
+ /// Decodes the record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> VariableSetRecord {
+ let mut sets = Vec::new();
+ let input = decoder.decode(&self.0.text);
+ for line in input.lines() {
+ if let Some(set) = VariableSet::parse(line, decoder, &self.0.offsets)
+ .issue_warning(&self.0.offsets, &mut decoder.warn)
+ {
+ sets.push(set)
+ }
+ }
+ VariableSetRecord {
+ offsets: self.0.offsets,
+ sets,
+ }
+ }
+}
+
+/// Raw (text) version of a product info record in a system file.
+#[derive(Clone, Debug)]
+pub struct RawProductInfoRecord(TextRecord);
+
+impl RawProductInfoRecord {
+ /// Parses the record from `extension`.
+ pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
+ Ok(Record::ProductInfo(Self(TextRecord::parse(
+ extension,
+ "product info record",
+ )?)))
+ }
+
+ /// Decodes the record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> ProductInfoRecord {
+ ProductInfoRecord(decoder.decode(&self.0.text).into())
+ }
+}
+
/// Warning for a file or variable attribute record.
///
/// Produced while parsing `name(value\n…)` attribute syntax; see
/// [Attribute::parse] and [Attributes::parse].
#[derive(ThisError, Debug)]
pub enum AttributeWarning {
    /// Invalid attribute name.
    #[error("Invalid attribute name. {0}")]
    InvalidAttributeName(
        /// Attribute name error.
        IdError,
    ),

    /// Invalid variable name in attribute record.
    #[error("Invalid variable name in attribute record. {0}")]
    InvalidAttributeVariableName(
        /// Variable name error.
        IdError,
    ),

    /// Attribute record missing left parenthesis.
    #[error("Attribute record missing left parenthesis, in {0:?}.")]
    AttributeMissingLParen(
        /// Bad syntax.
        String,
    ),

    /// Attribute lacks value.
    #[error("Attribute for {name}[{}] lacks value.", index + 1)]
    AttributeMissingValue {
        /// Attribute name.
        name: Identifier,
        /// 0-based index.
        index: usize,
    },

    /// Attribute missing quotations.
    #[error("Attribute for {name}[{}] missing quotations.", index + 1)]
    AttributeMissingQuotes {
        /// Attribute name.
        name: Identifier,
        /// 0-based index.
        index: usize,
    },

    /// Variable attribute missing `:`.
    #[error("Variable attribute missing `:`.")]
    VariableAttributeMissingColon,

    /// Duplicate attributes for variable.
    #[error("Duplicate attributes for variable {variable}: {}.", attributes.iter().join(", "))]
    DuplicateVariableAttributes {
        /// Variable name.
        variable: Identifier,
        /// Attributes with duplicates.
        attributes: Vec<Identifier>,
    },

    /// Duplicate dataset attributes.
    #[error("Duplicate dataset attributes with names: {}.", attributes.iter().join(", "))]
    DuplicateFileAttributes {
        /// Attributes with duplicates.
        attributes: Vec<Identifier>,
    },

    /// File attributes record contains trailing garbage.
    #[error("File attributes record contains trailing garbage.")]
    FileAttributesTrailingGarbage,
}
+
/// A file or variable attribute in a system file.
///
/// An attribute maps a name to one or more string values.
#[derive(Clone, Debug)]
pub struct Attribute {
    /// The attribute's name.
    pub name: Identifier,

    /// The attribute's values.
    pub values: Vec<String>,
}
+
impl Attribute {
    /// Parses an attribute from the beginning of `input` using `decoder`. Uses
    /// `offsets` to report warnings. Returns the decoded attribute and the
    /// part of `input` that remains to be parsed following the attribute.
    ///
    /// The expected form is `name(value\n…value\n)`, each value ideally
    /// wrapped in single quotes.  Unquoted values are accepted with a
    /// warning.
    fn parse<'a>(
        decoder: &mut Decoder,
        offsets: &Range<u64>,
        input: &'a str,
    ) -> Result<(Attribute, &'a str), WarningDetails> {
        let Some((name, mut input)) = input.split_once('(') else {
            return Err(AttributeWarning::AttributeMissingLParen(input.into()).into());
        };
        let name = decoder
            .new_identifier(name)
            .map_err(AttributeWarning::InvalidAttributeName)?;
        let mut values = Vec::new();
        loop {
            // Each value occupies one line; running out of lines before the
            // closing `)` means the attribute is truncated.
            let Some((value, rest)) = input.split_once('\n') else {
                return Err(AttributeWarning::AttributeMissingValue {
                    name: name.clone(),
                    index: values.len(),
                }
                .into());
            };
            // Strip the surrounding single quotes; keep the value anyway (with
            // a warning) if they are absent.
            if let Some(stripped) = value
                .strip_prefix('\'')
                .and_then(|value| value.strip_suffix('\''))
            {
                values.push(stripped.into());
            } else {
                decoder.warn(Warning::new(
                    Some(offsets.clone()),
                    AttributeWarning::AttributeMissingQuotes {
                        name: name.clone(),
                        index: values.len(),
                    },
                ));
                values.push(value.into());
            }
            // A `)` right after a value's new-line closes the attribute.
            if let Some(rest) = rest.strip_prefix(')') {
                let attribute = Attribute { name, values };
                return Ok((attribute, rest));
            };
            input = rest;
        }
    }
}
+
impl Attributes {
    /// Parses a set of variable or file attributes from `input` using
    /// `decoder`. Uses `offsets` for reporting warnings. If not `None`,
    /// `sentinel` terminates the attributes. Returns the attributes, the
    /// part of `input` that remains after parsing them, and the names of any
    /// attributes that appeared more than once (the last value wins).
    fn parse<'a>(
        decoder: &mut Decoder,
        offsets: &Range<u64>,
        mut input: &'a str,
        sentinel: Option<char>,
    ) -> Result<(Attributes, &'a str, Vec<Identifier>), WarningDetails> {
        let mut attributes = BTreeMap::new();
        let mut duplicates = Vec::new();
        let rest = loop {
            match input.chars().next() {
                None => break input,
                // NOTE(review): `&input[1..]` assumes the sentinel is a
                // single-byte character; a multi-byte sentinel would panic
                // here — confirm callers only pass ASCII sentinels.
                c if c == sentinel => break &input[1..],
                _ => {
                    let (attribute, rest) = Attribute::parse(decoder, offsets, input)?;
                    if attributes.contains_key(&attribute.name) {
                        duplicates.push(attribute.name.clone());
                    }
                    attributes.insert(attribute.name, attribute.values);
                    input = rest;
                }
            }
        };
        Ok((Attributes(attributes), rest, duplicates))
    }
}
+
+/// A raw (text) file attributes record in a system file.
+#[derive(Clone, Debug)]
+pub struct RawFileAttributesRecord(TextRecord);
+
+/// A decoded file attributes record in a system file.
+#[derive(Clone, Debug, Default)]
+pub struct FileAttributesRecord(pub Attributes);
+
+impl RawFileAttributesRecord {
+ /// Parses this record from `extension`.
+ pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
+ Ok(Record::FileAttributes(Self(TextRecord::parse(
+ extension,
+ "file attributes record",
+ )?)))
+ }
+
+ /// Decodes this record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> FileAttributesRecord {
+ let input = decoder.decode(&self.0.text);
+ match Attributes::parse(decoder, &self.0.offsets, &input, None)
+ .issue_warning(&self.0.offsets, &mut decoder.warn)
+ {
+ Some((set, rest, duplicates)) => {
+ if !duplicates.is_empty() {
+ decoder.warn(Warning::new(
+ Some(self.0.offsets.clone()),
+ AttributeWarning::DuplicateFileAttributes {
+ attributes: duplicates,
+ },
+ ));
+ }
+ if !rest.is_empty() {
+ decoder.warn(Warning::new(
+ Some(self.0.offsets.clone()),
+ AttributeWarning::FileAttributesTrailingGarbage,
+ ));
+ }
+ FileAttributesRecord(set)
+ }
+ None => FileAttributesRecord::default(),
+ }
+ }
+}
+
+/// A set of variable attributes in a system file.
+#[derive(Clone, Debug)]
+pub struct VarAttributes {
+ /// The long name of the variable associated with the attributes.
+ pub long_var_name: Identifier,
+
+ /// The attributes.
+ pub attributes: Attributes,
+}
+
+impl VarAttributes {
+ /// Parses a variable attribute set from `input` using `decoder`. Uses
+ /// `offsets` for reporting warnings.
+ fn parse<'a>(
+ decoder: &mut Decoder,
+ offsets: &Range<u64>,
+ input: &'a str,
+ ) -> Result<(VarAttributes, &'a str), WarningDetails> {
+ let Some((long_var_name, rest)) = input.split_once(':') else {
+ return Err(AttributeWarning::VariableAttributeMissingColon.into());
+ };
+ let long_var_name = decoder
+ .new_identifier(long_var_name)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(AttributeWarning::InvalidAttributeVariableName)?;
+ let (attributes, rest, duplicates) = Attributes::parse(decoder, offsets, rest, Some('/'))?;
+ if !duplicates.is_empty() {
+ decoder.warn(Warning::new(
+ Some(offsets.clone()),
+ AttributeWarning::DuplicateVariableAttributes {
+ variable: long_var_name.clone(),
+ attributes: duplicates,
+ },
+ ));
+ }
+ Ok((
+ VarAttributes {
+ long_var_name,
+ attributes,
+ },
+ rest,
+ ))
+ }
+}
+
+/// A raw (text) variable attributes record in a system file.
+#[derive(Clone, Debug)]
+pub struct RawVariableAttributesRecord(TextRecord);
+
+/// A decoded variable attributes record in a system file.
+#[derive(Clone, Debug)]
+pub struct VariableAttributesRecord(pub Vec<VarAttributes>);
+
+impl RawVariableAttributesRecord {
+ /// Parses a variable attributes record.
+ pub fn parse(extension: Extension) -> Result<Record, WarningDetails> {
+ Ok(Record::VariableAttributes(Self(TextRecord::parse(
+ extension,
+ "variable attributes record",
+ )?)))
+ }
+
+ /// Decodes a variable attributes record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> VariableAttributesRecord {
+ let decoded = decoder.decode(&self.0.text);
+ let mut input = decoded.as_ref();
+ let mut var_attribute_sets = Vec::new();
+ while !input.is_empty() {
+ let Some((var_attribute, rest)) = VarAttributes::parse(decoder, &self.0.offsets, input)
+ .issue_warning(&self.0.offsets, &mut decoder.warn)
+ else {
+ break;
+ };
+ var_attribute_sets.push(var_attribute);
+ input = rest;
+ }
+ VariableAttributesRecord(var_attribute_sets)
+ }
+}
+
+/// Warning for a long variable name record.
+#[derive(ThisError, Debug)]
+pub enum LongNameWarning {
+ /// Missing `=`.
+ #[error("Missing `=` separator.")]
+ LongNameMissingEquals,
+
+ /// Invalid short name.
+ #[error("Invalid short name. {0}")]
+ InvalidShortName(
+ /// Short variable name error.
+ IdError,
+ ),
+
+ /// Invalid long name.
+ #[error("Invalid long name. {0}")]
+ InvalidLongName(
+ /// Long variable name error.
+ IdError,
+ ),
+}
+
+/// A long variable name in a system file.
+#[derive(Clone, Debug)]
+pub struct LongName {
+ /// The variable's short name.
+ pub short_name: Identifier,
+
+ /// The variable's long name.
+ pub long_name: Identifier,
+}
+
+impl LongName {
+ /// Parses a long variable name from `input` using `decoder`.
+ pub fn parse(input: &str, decoder: &Decoder) -> Result<Self, WarningDetails> {
+ let Some((short_name, long_name)) = input.split_once('=') else {
+ return Err(LongNameWarning::LongNameMissingEquals.into());
+ };
+ let short_name = decoder
+ .new_identifier(short_name)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(LongNameWarning::InvalidShortName)?;
+ let long_name = decoder
+ .new_identifier(long_name)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(LongNameWarning::InvalidLongName)?;
+ Ok(LongName {
+ short_name,
+ long_name,
+ })
+ }
+}
+
+/// A long variable name record in a system file.
+#[derive(Clone, Debug)]
+pub struct LongNamesRecord(pub Vec<LongName>);
+
+/// A product info record in a system file.
+#[derive(Clone, Debug)]
+pub struct ProductInfoRecord(pub String);
+
+/// A variable set in a system file.
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+ /// Name of the variable set.
+ pub name: String,
+
+ /// The long variable names of the members of the set.
+ pub variable_names: Vec<Identifier>,
+}
+
+impl VariableSet {
+ /// Parses a variable set from `input` using `decoder`. Uses `offsets` to
+ /// report warnings.
+ fn parse(
+ input: &str,
+ decoder: &mut Decoder,
+ offsets: &Range<u64>,
+ ) -> Result<Self, WarningDetails> {
+ let (name, input) = input
+ .split_once('=')
+ .ok_or(VariableSetWarning::VariableSetMissingEquals)?;
+ let mut vars = Vec::new();
+ for var in input.split_ascii_whitespace() {
+ if let Some(identifier) = decoder
+ .new_identifier(var)
+ .and_then(Identifier::must_be_ordinary)
+ .map_err(VariableSetWarning::InvalidVariableSetName)
+ .issue_warning(offsets, &mut decoder.warn)
+ {
+ vars.push(identifier);
+ }
+ }
+ Ok(VariableSet {
+ name: name.to_string(),
+ variable_names: vars,
+ })
+ }
+}
+
+/// A variable set record in a system file.
+#[derive(Clone, Debug)]
+pub struct VariableSetRecord {
+ /// Range of file offsets occupied by the record.
+ pub offsets: Range<u64>,
+
+ /// The variable sets in the record.
+ pub sets: Vec<VariableSet>,
+}
+
+trait IssueWarning<T> {
+ fn issue_warning(self, offsets: &Range<u64>, warn: &mut dyn FnMut(Warning)) -> Option<T>;
+}
+impl<T, W> IssueWarning<T> for Result<T, W>
+where
+ W: Into<WarningDetails>,
+{
+ fn issue_warning(self, offsets: &Range<u64>, warn: &mut dyn FnMut(Warning)) -> Option<T> {
+ match self {
+ Ok(result) => Some(result),
+ Err(error) => {
+ warn(Warning::new(Some(offsets.clone()), error.into()));
+ None
+ }
+ }
+ }
+}
+
+/// Warning for an extension record.
+#[derive(ThisError, Debug)]
+pub enum ExtensionWarning {
+ /// Unexpected end of data.
+ #[error("Unexpected end of data.")]
+ UnexpectedEndOfData,
+
+ /// Invalid record size.
+ #[error("{record} has bad size {size} bytes instead of the expected {expected_size}.")]
+ BadRecordSize {
+ /// Name of the record.
+ record: &'static str,
+ /// Size of the elements in the record, in bytes.
+ size: u32,
+ /// Expected size of the elements in the record, in bytes.
+ expected_size: u32,
+ },
+
+ /// Invalid record count.
+ #[error("{record} has bad count {count} instead of the expected {expected_count}.")]
+ BadRecordCount {
+ /// Name of the record.
+ record: &'static str,
+ /// Number of elements in the record.
+ count: u32,
+ /// Expected number of elements in the record.
+ expected_count: u32,
+ },
+}
+
+/// An extension record in a system file.
+///
+/// Most of the records in system files are "extension records". This structure
+/// collects everything in an extension record for later processing.
+#[derive(Clone, Debug)]
+pub struct Extension {
+ /// File offsets occupied by the extension record.
+ ///
+ /// These are the offsets of the `data` portion of the record, not including
+ /// the header that specifies the subtype, size, and count.
+ pub offsets: Range<u64>,
+
+ /// Record subtype.
+ pub subtype: u32,
+
+ /// Size of each data element.
+ pub size: u32,
+
+ /// Number of data elements.
+ pub count: u32,
+
+ /// `size * count` bytes of data.
+ pub data: Vec<u8>,
+}
+
+impl Extension {
+ /// Checks that this extension has `size`-byte elements and `count` elements
+ /// total. Uses `name` for error reporting.
+ pub fn check_size(
+ &self,
+ size: Option<u32>,
+ count: Option<u32>,
+ name: &'static str,
+ ) -> Result<(), WarningDetails> {
+ if let Some(expected_size) = size
+ && self.size != expected_size
+ {
+ Err(ExtensionWarning::BadRecordSize {
+ record: name,
+ size: self.size,
+ expected_size,
+ }
+ .into())
+ } else if let Some(expected_count) = count
+ && self.count != expected_count
+ {
+ Err(ExtensionWarning::BadRecordCount {
+ record: name,
+ count: self.count,
+ expected_count,
+ }
+ .into())
+ } else {
+ Ok(())
+ }
+ }
+
+ pub(super) fn read<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ var_types: &VarTypes,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Option<Record>, Error> {
+ let subtype = endian.parse(read_bytes(r)?);
+ let header_offset = r.stream_position()?;
+ let size: u32 = endian.parse(read_bytes(r)?);
+ let count = endian.parse(read_bytes(r)?);
+ let Some(product) = size.checked_mul(count) else {
+ return Err(Error::new(
+ Some(header_offset..header_offset + 8),
+ ErrorDetails::ExtensionRecordTooLarge {
+ subtype,
+ size,
+ count,
+ },
+ ));
+ };
+ let start_offset = r.stream_position()?;
+ let data = read_vec(r, product as usize)?;
+ let end_offset = start_offset + product as u64;
+ let offsets = start_offset..end_offset;
+ let extension = Extension {
+ offsets: offsets.clone(),
+ subtype,
+ size,
+ count,
+ data,
+ };
+ let result = match subtype {
+ 3 => IntegerInfoRecord::parse(&extension, endian),
+ 4 => FloatInfoRecord::parse(&extension, endian),
+ 11 => VarDisplayRecord::parse(&extension, var_types, endian, warn),
+ 7 | 19 => MultipleResponseRecord::parse(&extension),
+ 21 => LongStringValueLabelRecord::parse(&extension, endian),
+ 22 => LongStringMissingValueRecord::parse(&extension, endian, warn),
+ 20 => EncodingRecord::parse(&extension),
+ 16 => NumberOfCasesRecord::parse(&extension, endian),
+ 5 => RawVariableSetRecord::parse(extension),
+ 10 => RawProductInfoRecord::parse(extension),
+ 13 => RawLongNamesRecord::parse(extension),
+ 14 => RawVeryLongStringsRecord::parse(extension),
+ 17 => RawFileAttributesRecord::parse(extension),
+ 18 => RawVariableAttributesRecord::parse(extension),
+ _ => Ok(Record::OtherExtension(extension)),
+ };
+ match result {
+ Ok(result) => Ok(Some(result)),
+ Err(details) => {
+ warn(Warning::new(Some(offsets), details));
+ Ok(None)
+ }
+ }
+ }
+}
+
+/// Warning for a long string value label record.
+#[derive(ThisError, Debug)]
+pub enum LongStringValueLabelWarning {
+ /// Invalid variable name.
+ #[error("Invalid variable name. {0}")]
+ InvalidVariableName(
+ /// Variable name error.
+ IdError,
+ ),
+}
+
+/// One set of long string value labels record in a system file.
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabels<N, S>
+where
+ S: Debug,
+{
+ /// The variable being labeled.
+ pub var_name: N,
+
+ /// The variable's width (greater than 8, since it's a long string).
+ pub width: u32,
+
+ /// `(value, label)` pairs, where each value is `width` bytes.
+ pub labels: Vec<(RawString, S)>,
+}
+
+impl LongStringValueLabels<RawString, RawString> {
+ /// Decodes a set of long string value labels using `decoder`.
+ fn decode(
+ &self,
+ decoder: &mut Decoder,
+ ) -> Result<LongStringValueLabels<Identifier, String>, WarningDetails> {
+ let var_name = decoder.decode(&self.var_name);
+ let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
+ .map_err(LongStringValueLabelWarning::InvalidVariableName)?;
+
+ let mut labels = Vec::with_capacity(self.labels.len());
+ for (value, label) in self.labels.iter() {
+ let label = decoder.decode(label).to_string();
+ labels.push((value.clone(), label));
+ }
+
+ Ok(LongStringValueLabels {
+ var_name,
+ width: self.width,
+ labels,
+ })
+ }
+}
+
+/// A long string value labels record in a system file.
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabelRecord<N, S>
+where
+ N: Debug,
+ S: Debug,
+{
+ /// File offsets occupied by the record.
+ pub offsets: Range<u64>,
+
+ /// The labels.
+ pub labels: Vec<LongStringValueLabels<N, S>>,
+}
+
+impl LongStringValueLabelRecord<RawString, RawString> {
+ /// Parses this record from `ext` using `endian`.
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, WarningDetails> {
+ ext.check_size(Some(1), None, "long string value labels record")?;
+
+ let mut input = &ext.data[..];
+ let mut label_set = Vec::new();
+ while !input.is_empty() {
+ let var_name = read_string(&mut input, endian)?;
+ let width: u32 = endian.parse(read_bytes(&mut input)?);
+ let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
+ let mut labels = Vec::new();
+ for _ in 0..n_labels {
+ let value = read_string(&mut input, endian)?;
+ let label = read_string(&mut input, endian)?;
+ labels.push((value, label));
+ }
+ label_set.push(LongStringValueLabels {
+ var_name,
+ width,
+ labels,
+ })
+ }
+ Ok(Record::LongStringValueLabels(LongStringValueLabelRecord {
+ offsets: ext.offsets.clone(),
+ labels: label_set,
+ }))
+ }
+
+ /// Decodes this record using `decoder`.
+ pub fn decode(self, decoder: &mut Decoder) -> LongStringValueLabelRecord<Identifier, String> {
+ let mut labels = Vec::with_capacity(self.labels.len());
+ for label in &self.labels {
+ match label.decode(decoder) {
+ Ok(set) => labels.push(set),
+ Err(error) => decoder.warn(Warning::new(Some(self.offsets.clone()), error)),
+ }
+ }
+ LongStringValueLabelRecord {
+ offsets: self.offsets,
+ labels,
+ }
+ }
+}
+
+/// ZLIB header, for [Compression::ZLib].
+#[derive(Clone, Debug)]
+pub struct ZHeader {
+ /// File offset to the start of the record.
+ pub offset: u64,
+
+ /// File offset to the ZLIB data header.
+ pub zheader_offset: u64,
+
+ /// File offset to the ZLIB trailer.
+ pub ztrailer_offset: u64,
+
+ /// Length of the ZLIB trailer in bytes.
+ pub ztrailer_len: u64,
+}
+
+impl ZHeader {
+ /// Reads a ZLIB header from `r` using `endian`.
+ pub fn read<R>(r: &mut R, endian: Endian) -> Result<ZHeader, Error>
+ where
+ R: Read + Seek,
+ {
+ let offset = r.stream_position()?;
+ let zheader_offset: u64 = endian.parse(read_bytes(r)?);
+ let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
+ let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
+
+ if zheader_offset != offset {
+ Err(ErrorDetails::UnexpectedZHeaderOffset {
+ actual: zheader_offset,
+ expected: offset,
+ })
+ } else if ztrailer_offset < offset {
+ Err(ErrorDetails::ImpossibleZTrailerOffset(ztrailer_offset))
+ } else if ztrailer_len < 24 || ztrailer_len % 24 != 0 {
+ Err(ErrorDetails::InvalidZTrailerLength(ztrailer_len))
+ } else {
+ Ok(ZHeader {
+ offset,
+ zheader_offset,
+ ztrailer_offset,
+ ztrailer_len,
+ })
+ }
+ .map_err(|details| Error::new(Some(offset..offset + 12), details))
+ }
+}
+
+/// A ZLIB trailer in a system file.
+#[derive(Clone, Debug)]
+pub struct ZTrailer {
+ /// File offset to the start of the record.
+ pub offset: u64,
+
+ /// Compression bias as a negative integer, e.g. -100.
+ pub int_bias: i64,
+
+ /// Always observed as zero.
+ pub zero: u64,
+
+ /// Uncompressed size of each block, except possibly the last. Only
+ /// `0x3ff000` has been observed so far.
+ pub block_size: u32,
+
+    /// Block descriptors, always `(ztrailer_len - 24) / 24` of them.
+ pub blocks: Vec<ZBlock>,
+}
+
+/// Warning for a ZLIB trailer record.
+#[derive(ThisError, Debug)]
+pub enum ZlibTrailerWarning {
+ /// Wrong block size.
+ #[error(
+ "ZLIB block descriptor {index} reported block size {actual:#x}, when {expected:#x} was expected."
+ )]
+ ZlibTrailerBlockWrongSize {
+ /// 0-based block descriptor index.
+ index: usize,
+ /// Actual block size.
+ actual: u32,
+ /// Expected block size.
+ expected: u32,
+ },
+
+ /// Block too big.
+ #[error(
+ "ZLIB block descriptor {index} reported block size {actual:#x}, when at most {max_expected:#x} was expected."
+ )]
+ ZlibTrailerBlockTooBig {
+ /// 0-based block descriptor index.
+ index: usize,
+ /// Actual block size.
+ actual: u32,
+ /// Maximum expected block size.
+ max_expected: u32,
+ },
+}
+
+/// A ZLIB block descriptor in a system file.
+#[derive(Clone, Debug)]
+pub struct ZBlock {
+ /// Offset of block of data if simple compression were used.
+ pub uncompressed_ofs: u64,
+
+ /// Actual offset within the file of the compressed data block.
+ pub compressed_ofs: u64,
+
+ /// The number of bytes in this data block after decompression. This is
+ /// `block_size` in every data block but the last, which may be smaller.
+ pub uncompressed_size: u32,
+
+ /// The number of bytes in this data block, as stored compressed in this
+ /// file.
+ pub compressed_size: u32,
+}
+
+impl ZBlock {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
+ Ok(ZBlock {
+ uncompressed_ofs: endian.parse(read_bytes(r)?),
+ compressed_ofs: endian.parse(read_bytes(r)?),
+ uncompressed_size: endian.parse(read_bytes(r)?),
+ compressed_size: endian.parse(read_bytes(r)?),
+ })
+ }
+
+ /// Returns true if the uncompressed and compressed sizes are plausible.
+ ///
+ /// [zlib Technical Details] says that the maximum expansion from
+ /// compression, with worst-case parameters, is 13.5% plus 11 bytes. This
+ /// code checks for an expansion of more than 14.3% plus 11 bytes.
+ ///
+ /// [zlib Technical Details]: http://www.zlib.net/zlib_tech.html
+ fn has_plausible_sizes(&self) -> bool {
+ self.uncompressed_size
+ .checked_add(self.uncompressed_size / 7 + 11)
+ .is_some_and(|max| self.compressed_size <= max)
+ }
+}
+
+impl ZTrailer {
+ /// Reads a ZLIB trailer from `reader` using `endian`. `bias` is the
+ /// floating-point bias for confirmation against the trailer, and `zheader`
+ /// is the previously read ZLIB header. Uses `warn` to report warnings.
+ pub fn read<R>(
+ reader: &mut R,
+ endian: Endian,
+ bias: f64,
+ zheader: &ZHeader,
+ warn: &mut dyn FnMut(Warning),
+ ) -> Result<Option<ZTrailer>, Error>
+ where
+ R: Read + Seek,
+ {
+ let start_offset = reader.stream_position()?;
+ if reader
+ .seek(SeekFrom::Start(zheader.ztrailer_offset))
+ .is_err()
+ {
+ return Ok(None);
+ }
+ let int_bias = endian.parse(read_bytes(reader)?);
+ let zero = endian.parse(read_bytes(reader)?);
+ let block_size = endian.parse(read_bytes(reader)?);
+ let n_blocks: u32 = endian.parse(read_bytes(reader)?);
+ if int_bias as f64 != -bias {
+ Err(ErrorDetails::WrongZlibTrailerBias {
+ actual: int_bias,
+ expected: -bias,
+ })
+ } else if zero != 0 {
+ Err(ErrorDetails::WrongZlibTrailerZero(zero))
+ } else if block_size != 0x3ff000 {
+ Err(ErrorDetails::WrongZlibTrailerBlockSize(block_size))
+ } else if let expected_n_blocks = (zheader.ztrailer_len - 24) / 24
+ && n_blocks as u64 != expected_n_blocks
+ {
+ Err(ErrorDetails::BadZlibTrailerNBlocks {
+ n_blocks,
+ expected_n_blocks,
+ ztrailer_len: zheader.ztrailer_len,
+ })
+ } else {
+ Ok(())
+ }
+ .map_err(|details| Error::new(Some(start_offset..start_offset + 24), details))?;
+
+ let blocks = (0..n_blocks)
+ .map(|_| ZBlock::read(reader, endian))
+ .collect::<Result<Vec<_>, _>>()?;
+
+ let mut expected_uncmp_ofs = zheader.zheader_offset;
+ let mut expected_cmp_ofs = zheader.zheader_offset + 24;
+ for (index, block) in blocks.iter().enumerate() {
+ let block_start = start_offset + 24 + 24 * index as u64;
+ let block_offsets = block_start..block_start + 24;
+
+ if block.uncompressed_ofs != expected_uncmp_ofs {
+ Err(ErrorDetails::ZlibTrailerBlockWrongUncmpOfs {
+ index,
+ actual: block.uncompressed_ofs,
+ expected: expected_cmp_ofs,
+ })
+ } else if block.compressed_ofs != expected_cmp_ofs {
+ Err(ErrorDetails::ZlibTrailerBlockWrongCmpOfs {
+ index,
+ actual: block.compressed_ofs,
+ expected: expected_cmp_ofs,
+ })
+ } else if !block.has_plausible_sizes() {
+ Err(ErrorDetails::ZlibExpansion {
+ index,
+ compressed_size: block.compressed_size,
+ uncompressed_size: block.uncompressed_size,
+ })
+ } else {
+ Ok(())
+ }
+ .map_err(|details| Error::new(Some(block_offsets.clone()), details))?;
+
+ if index < blocks.len() - 1 {
+ if block.uncompressed_size != block_size {
+ warn(Warning::new(
+ Some(block_offsets),
+ ZlibTrailerWarning::ZlibTrailerBlockWrongSize {
+ index,
+ actual: block.uncompressed_size,
+ expected: block_size,
+ },
+ ));
+ }
+ } else {
+ if block.uncompressed_size > block_size {
+ warn(Warning::new(
+ Some(block_offsets),
+ ZlibTrailerWarning::ZlibTrailerBlockTooBig {
+ index,
+ actual: block.uncompressed_size,
+ max_expected: block_size,
+ },
+ ));
+ }
+ }
+
+ expected_cmp_ofs += block.compressed_size as u64;
+ expected_uncmp_ofs += block.uncompressed_size as u64;
+ }
+
+ if expected_cmp_ofs != zheader.ztrailer_offset {
+ return Err(Error::new(
+ Some(start_offset..start_offset + 24 + 24 * n_blocks as u64),
+ ErrorDetails::ZlibTrailerOffsetInconsistency {
+ expected: expected_cmp_ofs,
+ actual: zheader.ztrailer_offset,
+ },
+ ));
+ }
+
+ reader.seek(SeekFrom::Start(start_offset))?;
+ Ok(Some(ZTrailer {
+ offset: zheader.ztrailer_offset,
+ int_bias,
+ zero,
+ block_size,
+ blocks,
+ }))
+ }
+}
}
}
+/// SAv Construction Kit
+///
+/// The input is a sequence of data items, each followed by a semicolon. Each
+/// data item is converted to the output format and written on stdout. A data
+/// item is one of the following:
+///
+/// - An integer in decimal, in hexadecimal prefixed by `0x`, or in octal
+/// prefixed by `0`. Output as a 32-bit binary integer.
+///
+/// - A floating-point number. Output in 64-bit IEEE 754 format.
+///
+/// - A string enclosed in double quotes. Output literally. There is no
+/// syntax for "escapes". Strings may not contain new-lines.
+///
+/// - A literal of the form `s<number>` followed by a quoted string as above.
+/// Output as the string's contents followed by enough spaces to fill up
+/// `<number>` bytes. For example, `s8 "foo"` is output as `foo` followed
+/// by 5 spaces.
+///
+/// - The literal `i8`, `i16`, or `i64` followed by an integer. Output
+/// as a binary integer with the specified number of bits.
+///
+/// - One of the literals `SYSMIS`, `LOWEST`, or `HIGHEST`. Output as a
+/// 64-bit IEEE 754 float of the appropriate PSPP value.
+///
+/// - `PCSYSMIS`. Output as SPSS/PC+ system-missing value.
+///
+/// - The literal `ENDIAN`. Output as a 32-bit binary integer, either with
+/// value 1 if `--be` is in effect or 2 if `--le` is in effect.
+///
+/// - A pair of parentheses enclosing a sequence of data items, each followed
+/// by a semicolon (the last semicolon is optional). Output as the enclosed
+/// data items in sequence.
+///
+/// - The literal `COUNT` or `COUNT8` followed by a sequence of parenthesized
+/// data items, as above. Output as a 32-bit or 8-bit binary integer whose
+/// value is the number of bytes enclosed within the parentheses, followed
+/// by the enclosed data items themselves.
+///
+/// Any of the data items above may optionally be followed by an asterisk and a
+/// positive integer, which specifies a repeat count for the data item.
pub fn sack(input: &str, input_file_name: Option<&Path>, endian: Endian) -> Result<Vec<u8>> {
let mut symbol_table = HashMap::new();
let output = _sack(input, input_file_name, endian, &mut symbol_table)?;
Details, Item, Text,
},
sys::{
- cooked::Headers,
- raw::{encoding_from_headers, Decoder, Reader},
+ cooked::ReaderOptions,
+ raw::{self, ErrorDetails},
sack::sack,
},
};
-use enum_iterator::all;
-
#[test]
fn variable_labels_and_missing_values() {
test_sack_sysfile("variable_labels_and_missing_values");
test_encrypted_sysfile("test-encrypted.sav", "pspp");
}
+#[test]
+fn encrypted_file_without_password() {
+ let error = ReaderOptions::new()
+ .open_file("src/crypto/testdata/test-encrypted.sav", |_| {
+ panic!();
+ })
+ .unwrap_err();
+ assert!(matches!(
+ error.downcast::<raw::Error>().unwrap().details,
+ ErrorDetails::Encrypted
+ ));
+}
+
fn test_raw_sysfile(name: &str) {
let input_filename = Path::new(env!("CARGO_MANIFEST_DIR"))
.join("src/sys/testdata")
let input = String::from_utf8(std::fs::read(&input_filename).unwrap()).unwrap();
let expected_filename = input_filename.with_extension("expected");
let expected = String::from_utf8(std::fs::read(&expected_filename).unwrap()).unwrap();
- for endian in all::<Endian>() {
+ for endian in [Endian::Big, Endian::Little] {
let expected = expected.replace(
"{endian}",
match endian {
R: Read + Seek + 'static,
{
let mut warnings = Vec::new();
- let mut reader = Reader::new(sysfile, |warning| warnings.push(warning)).unwrap();
- let output = match reader.headers().collect() {
- Ok(headers) => {
- let cases = reader.cases();
- let encoding =
- encoding_from_headers(&headers, &mut |warning| warnings.push(warning)).unwrap();
- let mut decoder = Decoder::new(encoding, |warning| warnings.push(warning));
- let mut decoded_records = Vec::new();
- for header in headers {
- decoded_records.push(header.decode(&mut decoder).unwrap());
- }
- drop(decoder);
-
- let mut errors = Vec::new();
- let headers = Headers::new(decoded_records, &mut |e| errors.push(e)).unwrap();
- let (dictionary, metadata, cases) =
- headers.decode(cases, encoding, |e| errors.push(e)).unwrap();
+ let output = match ReaderOptions::new().open_reader(sysfile, |warning| warnings.push(warning)) {
+ Ok(system_file) => {
+ let (dictionary, metadata, cases) = system_file.into_parts();
let (group, data) = metadata.to_pivot_rows();
let metadata_table = PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data(
data.into_iter()
.into_iter()
.map(|warning| Arc::new(Item::from(Text::new_log(warning.to_string())))),
);
- output.extend(
- errors
- .into_iter()
- .map(|error| Arc::new(Item::from(Text::new_log(error.to_string())))),
- );
output.push(Arc::new(metadata_table.into()));
output.push(Arc::new(dictionary_table.into()));
output.push(Arc::new(
case_numbers
.push(Value::new_integer(Some((case_numbers.len() + 1) as f64)));
data.push(
- case.into_iter()
+ case.0
+ .into_iter()
.map(|datum| Value::new_datum(&datum, dictionary.encoding))
.collect::<Vec<_>>(),
);
-At offset 0xe0, floating point record has bad count 4 instead of the expected 3.
+Warning at file offsets 0xe0 to 0x100: In extension record: floating point record has bad count 4 instead of the expected 3.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-At offset 0xe0, integer record has bad count 9 instead of the expected 8.
+Warning at file offsets 0xe0 to 0x104: In extension record: integer record has bad count 9 instead of the expected 8.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-At offset 0xd4, unrecognized record type 8.
+Error at file offsets 0xd0 to 0xd4: Unrecognized record type 8.
-Missing `=` separator in long variable name record.
+Warning at file offsets 0xe0 to 0xe5: In long variable name record: Missing `=` separator.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Compression bias is 50 instead of the usual values of 0 or 100.
+Warning at file offsets 0x54 to 0x5c: In file header: Compression bias is 50 instead of the usual values of 0 or 100.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Duplicate dataset attributes with names: Attr1.
+Warning at file offsets 0xe0 to 0xfe: In file or variable attribute record: Duplicate dataset attributes with names: Attr1.
-Duplicate attributes for variable FIRSTVAR: fred.
+Warning at file offsets 0x10e to 0x12d: In file or variable attribute record: Duplicate attributes for variable FIRSTVAR: fred.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Invalid name in long variable name record. "_Invalid" may not be used as an identifier because it begins with disallowed character '_'.
+Warning at file offsets 0x140 to 0x1aa: In long variable name record: Invalid long name. "_Invalid" may not be used as an identifier because it begins with disallowed character '_'.
-Invalid name in long variable name record. "$Invalid" may not be used as an identifier because it begins with disallowed character '$'.
+Warning at file offsets 0x140 to 0x1aa: In long variable name record: Invalid long name. "$Invalid" may not be used as an identifier because it begins with disallowed character '$'.
-Invalid name in long variable name record. "#Invalid" may not be used as an identifier because it begins with disallowed character '#'.
+Warning at file offsets 0x140 to 0x1aa: In long variable name record: Invalid long name. "#Invalid" may not be used as an identifier because it begins with disallowed character '#'.
Duplicate long variable name LONGVARIABLENAME.
-At offset 0xd8, record type 7 subtype 3 is too large with element size 4294963200 and 4294963200 elements.
+Error at file offsets 0xd8 to 0xe0: Record type 7 subtype 3 is too large with element size 4294963200 and 4294963200 elements.
-In variable record starting at offset 0xb4, variable label code 2 at offset 0xb8 is not 0 or 1.
+Error at file offsets 0xb4 to 0xd0: In variable record, variable label code 2 is not 0 or 1.
-In long string missing values record starting at offset 0x238, value length at offset 0x2a8 is 12 instead of the expected 8.
+Warning at file offsets 0x238 to 0x2f2: In long string missing values record: Value length at offset 0x2a8 is 12 instead of the expected 8.
File header claims 8 variable positions but 9 were read from file.
-At offset 0xb4, missing value code (-1) is not -3, -2, 0, 1, 2, or 3.
+Error at file offsets 0xb4 to 0xd0: Missing value code (-1) is not -3, -2, 0, 1, 2, or 3.
-At offset 0xb4, missing value code (4) is not -3, -2, 0, 1, 2, or 3.
+Error at file offsets 0xb4 to 0xd0: Missing value code (4) is not -3, -2, 0, 1, 2, or 3.
-At offset 0xd4, unrecognized record type 4.
+Error at file offsets 0xd0 to 0xd4: Unrecognized record type 4.
-Attribute for Attr1[1] lacks value.
+Warning at file offsets 0xe0 to 0xe6: In file or variable attribute record: Attribute for Attr1[1] lacks value.
-Attribute for fred[2] lacks value.
+Warning at file offsets 0xf6 to 0x109: In file or variable attribute record: Attribute for fred[2] lacks value.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-At offset 0xe8, following value label record, found record type 7 instead of expected type 4 for variable index record
+Error at file offsets 0xe8 to 0xec: Following value label record, found record type 7 instead of expected type 4 for variable index record
-Syntax error parsing counted string (missing trailing space)
+Warning at file offsets 0xe0 to 0xe5: In multiple response set record: Syntax error parsing counted string (missing trailing space).
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Syntax error parsing counted string (length 4 goes past end of input)
+Warning at file offsets 0xe0 to 0xe9: In multiple response set record: Syntax error parsing counted string (length 4 goes past end of input).
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Syntax error parsing counted string (missing trailing space)
+Warning at file offsets 0xe0 to 0xe6: In multiple response set record: Syntax error parsing counted string (missing trailing space).
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Invalid multiple dichotomy label type
+Warning at file offsets 0xe0 to 0xe5: In multiple response set record: Invalid multiple dichotomy label type.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Syntax error in multiple response record (missing variable name delimiter)
+Warning at file offsets 0xe0 to 0xec: In multiple response set record: Syntax error (missing variable name delimiter).
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Syntax error in multiple response record (missing space after multiple response type)
+Warning at file offsets 0xe0 to 0xe5: In multiple response set record: Syntax error (missing space after multiple response type).
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Syntax error in multiple response record (missing space after multiple response type)
+Warning at file offsets 0xe0 to 0xea: In multiple response set record: Syntax error (missing space after multiple response type).
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Invalid multiple dichotomy label type
+Warning at file offsets 0xe0 to 0xe5: In multiple response set record: Invalid multiple dichotomy label type.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Invalid multiple dichotomy label type
+Warning at file offsets 0xe0 to 0xe6: In multiple response set record: Invalid multiple dichotomy label type.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-At offset 0x1c8, variable attributes record has bad size 4 bytes instead of the expected 1.
+Warning at file offsets 0x1c8 to 0x1e8: In extension record: variable attributes record has bad size 4 bytes instead of the expected 1.
This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING="DETECT" to analyze the possible encodings.
│str15│ 5│ │Nominal │Input│ 15│Left │A15 │A15 │ │
╰─────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯
-Unexpected end of file at offset 0x1ac, 0 bytes and 2 compression chunks into a compressed case.
+Unexpected end of file 0 bytes and 2 compression chunks into a compressed case.
╭────┬──────┬────┬────┬────────┬───────────────╮
│Case│ num1 │num2│str4│ str8 │ str15 │
│num2│ 2│ │ │Input│ 8│Right │F8.0 │F8.0 │ │
╰────┴────────┴─────┴─────────────────┴─────┴─────┴─────────┴────────────┴────────────┴──────────────╯
-Unexpected end of file at offset 0x12c, 8 bytes into a 16-byte case.
+Error at file offsets 0x124 to 0x12c: Unexpected end of file 8 bytes into a 16-byte case.
╭────┬────┬────╮
│Case│num1│num2│
-At offset 0xd4, number of value labels (2147483647) is greater than the maximum number 536870911.
+Error at file offsets 0xd4 to 0xd8: Number of value labels (2147483647) is greater than the maximum number 536870911.
-Attribute for Attr1[1] missing quotations.
+Warning at file offsets 0xe0 to 0xed: In file or variable attribute record: Attribute for Attr1[1] missing quotations.
-Attribute for fred[1] missing quotations.
+Warning at file offsets 0xfd to 0x10f: In file or variable attribute record: Attribute for fred[1] missing quotations.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-At offset 0x110, one or more variable indexes for value labels were not in the valid range [1,2] or referred to string continuations: [3, 4]
+Warning at file offsets 0x110 to 0x118: In value label record: One or more variable indexes were not in the valid range [1,2] or referred to string continuations: [3, 4]
-At offset 0x138, one or more variable indexes for value labels were not in the valid range [1,2] or referred to string continuations: [5, 6]
+Warning at file offsets 0x138 to 0x140: In value label record: One or more variable indexes were not in the valid range [1,2] or referred to string continuations: [5, 6]
-At offset 0x160, one or more variable indexes for value labels were not in the valid range [1,2] or referred to string continuations: [7, 8]
+Warning at file offsets 0x160 to 0x168: In value label record: One or more variable indexes were not in the valid range [1,2] or referred to string continuations: [7, 8]
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-At offset 0x110, one or more variable indexes for value labels were not in the valid range [1,2] or referred to string continuations: [2]
+Warning at file offsets 0x110 to 0x114: In value label record: One or more variable indexes were not in the valid range [1,2] or referred to string continuations: [2]
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-At offset 0xe8, at least one valid variable index for value labels is required but none were specified.
+Warning at file offsets 0xec to 0xf0: In value label record: At least one valid variable index is required but none were specified.
This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING="DETECT" to analyze the possible encodings.
-At offset 0x110, the first variable index is for a string variable but the following variable indexes are for numeric variables: [2]
+Warning at file offsets 0x110 to 0x118: In value label record: First variable index is for a string variable but the following variable indexes are for numeric variables: [2]
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Invalid variable display alignment value 4294967295
+Warning at file offsets 0xe0 to 0xe8: In variable display record: Invalid variable display alignment value 4294967295.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Invalid variable measurement level value 4
+Warning at file offsets 0xe0 to 0xe8: In variable display record: Invalid variable measurement level value 4.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-Variable display record contains 4 items but should contain either 2 or 3.
+Warning at file offsets 0xe0 to 0xf0: In variable display record: Record contains 4 items but should contain either 2 or 3.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-At offset 0xe0, variable display record has bad size 8 bytes instead of the expected 4.
+Warning at file offsets 0xe0 to 0xf0: In extension record: variable display record has bad size 8 bytes instead of the expected 4.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-ZLIB header's zlib_offset is 0x0 instead of expected 0x194.
+Error at file offsets 0x194 to 0x1a0: ZLIB header's zlib_offset is 0x0 instead of expected 0x194.
-Impossible ztrailer_offset 0x0.
+Error at file offsets 0x194 to 0x1a0: Impossible ztrailer_offset 0x0.
-ZLIB trailer is at offset 0x205 but 0x204 would be expected from block descriptors.
+Error at file offsets 0x1ac to 0x1dc: ZLIB trailer is at offset 0x205 but 0x204 would be expected from block descriptors.
-ZLIB block descriptor 1 reported compressed data offset 0x12421, when 0x124f1 was expected.
+Error at file offsets 0x1dc to 0x1f4: ZLIB block descriptor 1 reported compressed data offset 0x12421, when 0x124f1 was expected.
-ZLIB block descriptor 0 reports compressed size 100 and uncompressed size 50.
+Error at file offsets 0x1c4 to 0x1dc: ZLIB block descriptor 0 reports compressed size 100 and uncompressed size 50.
-Invalid ZLIB trailer length 21.
+Error at file offsets 0x194 to 0x1a0: Invalid ZLIB trailer length 21.
-ZLIB block descriptor 0 reported block size 0x400000, when at most 0x3ff000 was expected.
+Warning at file offsets 0x1c4 to 0x1dc: In ZLIB trailer: ZLIB block descriptor 0 reported block size 0x400000, when at most 0x3ff000 was expected.
╭──────────────────────┬────────────────────────╮
│ Created │ 01-JAN-2011 20:53:52│
-ZLIB trailer specifies unexpected 4096-byte block size.
+Error at file offsets 0x1ac to 0x1c4: ZLIB trailer specifies unexpected 4096-byte block size.
-ZLIB block descriptor 0 reported compressed data offset 0x191, when 0x1ac was expected.
+Error at file offsets 0x1c4 to 0x1dc: ZLIB block descriptor 0 reported compressed data offset 0x191, when 0x1ac was expected.
-Block count 2 in ZLIB trailer at offset 0x205 differs from expected block count 1 calculated from trailer length 48.
+Error at file offsets 0x1ac to 0x1c4: Block count 2 in ZLIB trailer differs from expected block count 1 calculated from trailer length 48.
-ZLIB block descriptor 0 reported uncompressed data offset 0x177, when 0x1ac was expected.
+Error at file offsets 0x1c4 to 0x1dc: ZLIB block descriptor 0 reported uncompressed data offset 0x177, when 0x1ac was expected.
-ZLIB trailer bias 0 is not -100 as expected from file header bias.
+Error at file offsets 0x1ac to 0x1c4: ZLIB trailer bias 0 is not -100 as expected from file header bias.
-Block count 1 in ZLIB trailer at offset 0x205 differs from expected block count 2 calculated from trailer length 72.
+Error at file offsets 0x1ac to 0x1c4: Block count 1 in ZLIB trailer differs from expected block count 2 calculated from trailer length 72.
-ZLIB trailer "zero" field has nonzero value 100.
+Error at file offsets 0x1ac to 0x1c4: ZLIB trailer "zero" field has nonzero value 100.
+++ /dev/null
-// PSPP - a program for statistical analysis.
-// Copyright (C) 2025 Free Software Foundation, Inc.
-//
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free Software
-// Foundation, either version 3 of the License, or (at your option) any later
-// version.
-//
-// This program is distributed in the hope that it will be useful, but WITHOUT
-// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-// details.
-//
-// You should have received a copy of the GNU General Public License along with
-// this program. If not, see <http://www.gnu.org/licenses/>.
-
-use std::fs::read_to_string;
-use std::path::PathBuf;
-
-use anyhow::{anyhow, Result};
-use clap::Parser;
-use pspp::endian::Endian;
-use pspp::sys::sack::sack;
-
-/// SAv Construction Kit
-///
-/// The input is a sequence of data items, each followed by a semicolon. Each
-/// data item is converted to the output format and written on stdout. A data
-/// item is one of the following:
-///
-/// - An integer in decimal, in hexadecimal prefixed by `0x`, or in octal
-/// prefixed by `0`. Output as a 32-bit binary integer.
-///
-/// - A floating-point number. Output in 64-bit IEEE 754 format.
-///
-/// - A string enclosed in double quotes. Output literally. There is no
-/// syntax for "escapes". Strings may not contain new-lines.
-///
-/// - A literal of the form `s<number>` followed by a quoted string as above.
-/// Output as the string's contents followed by enough spaces to fill up
-/// `<number>` bytes. For example, `s8 "foo"` is output as `foo` followed
-/// by 5 spaces.
-///
-/// - The literal `i8`, `i16`, or `i64` followed by an integer. Output
-/// as a binary integer with the specified number of bits.
-///
-/// - One of the literals `SYSMIS`, `LOWEST`, or `HIGHEST`. Output as a
-/// 64-bit IEEE 754 float of the appropriate PSPP value.
-///
-/// - `PCSYSMIS`. Output as SPSS/PC+ system-missing value.
-///
-/// - The literal `ENDIAN`. Output as a 32-bit binary integer, either with
-/// value 1 if `--be` is in effect or 2 if `--le` is in effect.
-///
-/// - A pair of parentheses enclosing a sequence of data items, each followed
-/// by a semicolon (the last semicolon is optional). Output as the enclosed
-/// data items in sequence.
-///
-/// - The literal `COUNT` or `COUNT8` followed by a sequence of parenthesized
-/// data items, as above. Output as a 32-bit or 8-bit binary integer whose
-/// value is the number of bytes enclosed within the parentheses, followed
-/// by the enclosed data items themselves.
-///
-/// optionally followed by an asterisk and a positive integer, which specifies a
-/// repeat count for the data item.
-#[derive(Parser, Debug)]
-struct Args {
- /// Big-endian output format (default)
- #[arg(long = "be")]
- be: bool,
-
- /// Little-endian output format
- #[arg(long = "le")]
- le: bool,
-
- /// Input file.
- #[arg(required = true, name = "input")]
- input_file_name: PathBuf,
-
- /// Output file.
- #[arg(required = true, name = "output")]
- output_file_name: PathBuf,
-}
-
-fn main() -> Result<()> {
- let Args {
- be,
- le,
- input_file_name,
- output_file_name,
- } = Args::parse();
- let endian = match (be, le) {
- (false, false) | (true, false) => Endian::Big,
- (false, true) => Endian::Little,
- (true, true) => return Err(anyhow!("can't use both `--be` and `--le`")),
- };
-
- let input_file_str = input_file_name.to_string_lossy();
- let input = read_to_string(&input_file_name)
- .map_err(|err| anyhow!("{input_file_str}: read failed ({err})"))?;
-
- let output = sack(&input, Some(&input_file_name), endian)?;
-
- let output_file_str = output_file_name.to_string_lossy();
- std::fs::write(&output_file_name, output)
- .map_err(|err| anyhow!("{output_file_str}: write failed ({err})"))?;
-
- Ok(())
-}