// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
+//! Raw system file record format.
+//!
+//! This module facilitates reading records from system files in all of their
+//! raw details. Most readers will want to use higher-level interfaces.
+
use crate::{
dictionary::{Attributes, Datum, VarWidth},
endian::{Endian, Parse, ToBytes},
};
use thiserror::Error as ThisError;
+/// An error encountered reading raw system file records.
+///
+/// Any error prevents reading further data from the system file.
#[derive(ThisError, Debug)]
pub enum Error {
#[error("Not an SPSS system file")]
EncodingError(EncodingError),
}
+/// A warning reading a raw system file record.
+///
+/// Warnings indicate that something may be amiss, but they do not prevent
+/// reading further records.
#[derive(ThisError, Debug)]
pub enum Warning {
#[error("Unexpected end of data inside extension record.")]
}
}
+/// A raw record in a system file.
+#[allow(missing_docs)] // Don't warn for missing docs on tuple members.
#[derive(Clone, Debug)]
pub enum Record {
+ /// The file header.
+ ///
+ /// Every system file has exactly one header record, at its very beginning.
Header(HeaderRecord<RawString>),
+
+ /// Variable record.
+ ///
+ /// Each numeric variable has one variable record. Each string variable has
+ /// one variable record per 8-byte segment.
Variable(VariableRecord<RawString>),
+
+ /// Value labels for numeric and short string variables.
+ ///
+ /// These appear after the variable records.
ValueLabel(ValueLabelRecord<RawStrArray<8>, RawString>),
+
+ /// Document record.
Document(DocumentRecord<RawDocumentLine>),
+
+ /// Integer info record.
IntegerInfo(IntegerInfoRecord),
+
+ /// Floating-point info record.
FloatInfo(FloatInfoRecord),
+
+ /// Variable display record.
VarDisplay(VarDisplayRecord),
+
+ /// Multiple response variable record.
MultipleResponse(MultipleResponseRecord<RawString, RawString>),
+
+ /// Value labels for long string variables.
LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
+
+ /// Missing values for long string variables.
+ ///
+ /// Missing values for numeric and short string variables appear in the
+ /// variable records.
LongStringMissingValues(LongStringMissingValueRecord<RawString>),
+
+ /// Encoding record.
+ ///
+ /// All the strings in the file are encoded in this encoding, even for
+ /// strings that precede this record.
Encoding(EncodingRecord),
+
+ /// Extended number of cases.
+ ///
+ /// The header record records the number of cases but it only uses a 32-bit
+ /// field.
NumberOfCases(NumberOfCasesRecord),
+
+ /// Variable sets.
VariableSets(RawVariableSetRecord),
+
+ /// Product info.
+ ///
+ /// This supplements the product in the header record.
ProductInfo(RawProductInfoRecord),
+
+ /// Long variable names.
LongNames(RawLongNamesRecord),
+
+ /// Very long string variables, for strings longer than 255 bytes.
VeryLongStrings(RawVeryLongStringsRecord),
+
+ /// File attributes.
FileAttributes(RawFileAttributesRecord),
+
+ /// Variable attributes.
VariableAttributes(RawVariableAttributesRecord),
+
+ /// Extension records not otherwise supported.
OtherExtension(Extension),
+
+ /// End of headers.
EndOfHeaders(u32),
+
+ /// Header record for ZLIB-compressed data.
ZHeader(ZHeader),
+
+ /// Trailer record for ZLIB-compressed data.
ZTrailer(ZTrailer),
}
+/// A [Record] that has been decoded to a more usable form.
+///
+/// Some records can be understood raw, but others need to have strings decoded
+/// (and interpreted as identifiers) or raw data interpreted as either numbers
+/// or strings.
#[derive(Clone, Debug)]
pub enum DecodedRecord {
+ /// File header, with strings decoded.
Header(HeaderRecord<String>),
+
+ /// Variable record, with strings decoded.
Variable(VariableRecord<String>),
+
+ /// Value label, with strings decoded.
ValueLabel(ValueLabelRecord<RawStrArray<8>, String>),
+
+ /// Documents, with strings decoded.
Document(DocumentRecord<String>),
+
+ /// Integer info.
IntegerInfo(IntegerInfoRecord),
+
+ /// Floating-point info.
FloatInfo(FloatInfoRecord),
+
+ /// Variable display info.
VarDisplay(VarDisplayRecord),
+
+ /// Multiple response sets, with strings decoded.
MultipleResponse(MultipleResponseRecord<Identifier, String>),
+
+ /// Long string value labels, with strings decoded.
LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
+
+ /// Long string missing values, with strings decoded.
LongStringMissingValues(LongStringMissingValueRecord<Identifier>),
+
+ /// Encoding record.
Encoding(EncodingRecord),
+
+ /// Number of cases record.
NumberOfCases(NumberOfCasesRecord),
+
+ /// Variable sets.
VariableSets(VariableSetRecord),
+
+ /// Product info.
ProductInfo(ProductInfoRecord),
+
+ /// Long variable names.
LongNames(LongNamesRecord),
+
+ /// Very long string variables.
VeryLongStrings(VeryLongStringsRecord),
+
+ /// File attributes.
FileAttributes(FileAttributesRecord),
+
+ /// Variable attributes.
VariableAttributes(VariableAttributesRecord),
+
+ /// Extension records not otherwise supported.
OtherExtension(Extension),
+
+ /// End of headers.
EndOfHeaders(u32),
+
+ /// Header record for ZLIB-compressed data.
ZHeader(ZHeader),
+
+ /// Trailer record for ZLIB-compressed data.
ZTrailer(ZTrailer),
}
}
}
+ /// Decodes this record into a [DecodedRecord] using `decoder`.
pub fn decode(self, decoder: &mut Decoder) -> Result<DecodedRecord, Error> {
Ok(match self {
Record::Header(record) => record.decode(decoder),
}
pub fn encoding_from_headers(
- headers: &Vec<Record>,
+ headers: &[Record],
warn: &mut impl FnMut(Warning),
) -> Result<&'static Encoding, Error> {
let mut encoding_record = None;
}
}
+/// A type for decoding a [Record] into a [DecodedRecord].
pub struct Decoder<'a> {
+ /// The character encoding used to decode strings and build identifiers.
pub encoding: &'static Encoding,
+
+ /// Callback invoked with each [Warning] reported during decoding.
pub warn: Box<dyn FnMut(Warning) + 'a>,
}
impl<'de> Decoder<'de> {
+ /// Constructs a decoder for an encoding read or inferred from
+ /// `records` (using [encoding_from_headers]). This can fail if the headers
+ /// specify an EBCDIC encoding, since this crate only supports ASCII-based
+ /// encodings.
+ ///
+ /// `warn` will be used to report warnings while decoding records.
+ pub fn from_headers<F>(records: &[Record], mut warn: F) -> Result<Self, Error>
+ where
+ F: FnMut(Warning) + 'de,
+ {
+ let encoding = encoding_from_headers(records, &mut warn)?;
+ Ok(Self::new(encoding, warn))
+ }
+
+ /// Constructs a decoder using `encoding`.
+ ///
+ /// `warn` will be used to report warnings while decoding records.
pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
where
F: FnMut(Warning) + 'de,
warn: Box::new(warn),
}
}
+
fn warn(&mut self, warning: Warning) {
(self.warn)(warning)
}
+
fn decode_slice<'a>(&mut self, input: &'a [u8]) -> Cow<'a, str> {
let (output, malformed) = self.encoding.decode_without_bom_handling(input);
if malformed {
self.decode_slice(input.0.as_slice())
}
+ /// Decodes `input` to an [Identifier] using our encoding.
pub fn decode_identifier(&mut self, input: &RawString) -> Result<Identifier, IdError> {
let decoded = &self.decode(input);
self.new_identifier(decoded)
}
+ /// Constructs an [Identifier] from `name` using our encoding.
pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
Identifier::from_encoding(name, self.encoding)
}
}
}
+/// Variable type.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum VarType {
+ /// A numeric variable.
Numeric,
+
+ /// A string variable.
String,
}
}
}
-type RawDatum = Datum<RawStrArray<8>>;
+/// A [Datum] for which the character encoding and variable width are not yet known.
+pub type RawDatum = Datum<RawStrArray<8>>;
impl RawDatum {
+ /// Constructs a `RawDatum` from `raw` given that we now know the variable
+ /// type and endianness.
pub fn from_raw(raw: &UntypedDatum, var_type: VarType, endian: Endian) -> Self {
match var_type {
VarType::String => Datum::String(RawStrArray(raw.0)),
VarType::Numeric => Datum::Number(endian.parse(raw.0)),
}
}
+
+ /// Decodes a `RawDatum` into a [Datum] given that we now know the string width. For string
+ /// data, panics if `width.as_string_width()` cannot be unwrapped or exceeds the 8 raw bytes.
+ pub fn decode(&self, width: VarWidth) -> Datum {
+ match self {
+ Self::Number(x) => Datum::Number(*x),
+ Self::String(s) => {
+ let width = width.as_string_width().unwrap();
+ Datum::String(RawString::from(&s.0[..width]))
+ }
+ }
+ }
}
impl Datum {
}
}
-impl RawDatum {
- pub fn decode(&self, width: VarWidth) -> Datum {
- match self {
- Self::Number(x) => Datum::Number(*x),
- Self::String(s) => {
- let width = width.as_string_width().unwrap();
- Datum::String(RawString::from(&s.0[..width]))
- }
- }
- }
-}
-
struct ZlibDecodeMultiple<R>
where
R: Read + Seek,
}
}
+/// 8 bytes that represent a number or a string (but that's all we know).
+///
+/// Used when we don't know whether it's a number or a string, or the string
+/// width, or the character encoding.
#[derive(Copy, Clone)]
pub struct UntypedDatum(pub [u8; 8]);