-use crate::endian::{Endian, Parse, ToBytes};
-use crate::{CategoryLabels, Compression};
+use crate::{
+ dictionary::VarWidth,
+ encoding::{default_encoding, get_encoding, Error as EncodingError},
+ endian::{Endian, Parse, ToBytes},
+ identifier::{Error as IdError, Identifier},
+};
-use encoding_rs::mem::decode_latin1;
+use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
use flate2::read::ZlibDecoder;
use num::Integer;
-use std::borrow::Cow;
-use std::fmt::{Debug, Formatter, Result as FmtResult};
-use std::str::from_utf8;
use std::{
- collections::VecDeque,
+ borrow::Cow,
+ cell::RefCell,
+ cmp::Ordering,
+ collections::{HashMap, VecDeque},
+ fmt::{Debug, Display, Formatter, Result as FmtResult},
io::{Error as IoError, Read, Seek, SeekFrom},
- iter::FusedIterator,
+ iter::repeat,
+ mem::take,
+ ops::Range,
+ rc::Rc,
+ str::from_utf8,
};
use thiserror::Error as ThisError;
-use self::state::State;
-
#[derive(ThisError, Debug)]
pub enum Error {
#[error("Not an SPSS system file")]
#[error("Invalid ZSAV compression code {0}")]
InvalidZsavCompression(u32),
- #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
- BadVariableWidth { offset: u64, width: i32 },
-
#[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
BadDocumentLength { offset: u64, n: usize, max: usize },
#[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
BadRecordType { offset: u64, rec_type: u32 },
- #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
- BadVariableLabelCode { offset: u64, code: u32 },
+ #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
+ BadVariableWidth {
+ start_offset: u64,
+ width: i32,
+ },
+
+ #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
+ BadVariableLabelCode {
+ start_offset: u64,
+ code_offset: u64,
+ code: u32,
+ },
#[error(
"At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
#[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
- #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
- BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
+ #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
+ ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
+
+ #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
+ TooManyVarIndexes { offset: u64, n: u32, max: u32 },
#[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
ExtensionRecordTooLarge {
ztrailer_len: u64,
},
+ #[error("{0}")]
+ EncodingError(EncodingError),
+}
+
+#[derive(ThisError, Debug)]
+pub enum Warning {
+ #[error("Unexpected end of data inside extension record.")]
+ UnexpectedEndOfData,
+
+ #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
+ NoVarIndexes { offset: u64 },
+
+ #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
+ MixedVarTypes {
+ offset: u64,
+ var_type: VarType,
+ wrong_types: Vec<u32>,
+ },
+
+ #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
+ InvalidVarIndexes {
+ offset: u64,
+ max: usize,
+ invalid: Vec<u32>,
+ },
+
#[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
BadRecordSize {
offset: u64,
#[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
BadEncodingName { offset: u64 },
+ // XXX This is risky because `text` might be arbitrarily long.
+ #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+ MalformedString { encoding: String, text: String },
+
+ #[error("Invalid variable measurement level value {0}")]
+ InvalidMeasurement(u32),
+
+ #[error("Invalid variable display alignment value {0}")]
+ InvalidAlignment(u32),
+
+ #[error("Invalid attribute name. {0}")]
+ InvalidAttributeName(IdError),
+
+ #[error("Invalid variable name in attribute record. {0}")]
+ InvalidAttributeVariableName(IdError),
+
+ #[error("Invalid short name in long variable name record. {0}")]
+ InvalidShortName(IdError),
+
+ #[error("Invalid name in long variable name record. {0}")]
+ InvalidLongName(IdError),
+
+ #[error("Invalid variable name in very long string record. {0}")]
+ InvalidLongStringName(IdError),
+
+ #[error("Invalid variable name in variable set record. {0}")]
+ InvalidVariableSetName(IdError),
+
+ #[error("Invalid multiple response set name. {0}")]
+ InvalidMrSetName(IdError),
+
+ #[error("Invalid multiple response set variable name. {0}")]
+ InvalidMrSetVariableName(IdError),
+
+ #[error("Invalid variable name in long string missing values record. {0}")]
+ InvalidLongStringMissingValueVariableName(IdError),
+
+ #[error("Invalid variable name in long string value label record. {0}")]
+ InvalidLongStringValueLabelName(IdError),
+
+ #[error("{0}")]
+ EncodingError(EncodingError),
+
#[error("Details TBD")]
TBD,
}
+/// Every I/O error converts to [`Warning::UnexpectedEndOfData`]; the details of
+/// the underlying error are discarded.
+impl From<IoError> for Warning {
+ fn from(_: IoError) -> Self {
+ Warning::UnexpectedEndOfData
+ }
+}
+
+/// A record as read from the file, before any text decoding.
+///
+/// String payloads are kept in raw byte form (`RawString`, `RawStr<8>`,
+/// `RawDocumentLine`).  [`Record::decode`] turns a `Record` into a
+/// [`DecodedRecord`].
#[derive(Clone, Debug)]
pub enum Record {
- Header(HeaderRecord),
- Variable(VariableRecord),
- ValueLabel(ValueLabelRecord),
- VarIndexes(VarIndexRecord),
- Document(DocumentRecord),
+ Header(HeaderRecord<RawString>),
+ Variable(VariableRecord<RawString, RawStr<8>>),
+ ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
+ Document(DocumentRecord<RawDocumentLine>),
+ IntegerInfo(IntegerInfoRecord),
+ FloatInfo(FloatInfoRecord),
+ VarDisplay(VarDisplayRecord),
+ MultipleResponse(MultipleResponseRecord<RawString, RawString>),
+ LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
+ LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
+ Encoding(EncodingRecord),
+ NumberOfCases(NumberOfCasesRecord),
+ // NOTE(review): presumably covers all text-based extension records, which
+ // only become distinct variants after decoding -- confirm against
+ // `TextRecord::decode`.
+ Text(TextRecord),
+ OtherExtension(Extension),
+ // Payload is the 32-bit word that follows record type 999.
+ EndOfHeaders(u32),
+ ZHeader(ZHeader),
+ ZTrailer(ZTrailer),
+ // Shared, interior-mutable handle to the iterator over the case data.
+ Cases(Rc<RefCell<Cases>>),
+}
+
+/// The result of [`Record::decode`]: a record whose text has been converted
+/// from the file's encoding.
+///
+/// Most string payloads become `String` or `Identifier`; value-label values
+/// remain `RawStr<8>`.
+#[derive(Clone, Debug)]
+pub enum DecodedRecord {
+ Header(HeaderRecord<String>),
+ Variable(VariableRecord<String, String>),
+ ValueLabel(ValueLabelRecord<RawStr<8>, String>),
+ Document(DocumentRecord<String>),
IntegerInfo(IntegerInfoRecord),
FloatInfo(FloatInfoRecord),
- VariableSets(TextRecord),
VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord),
- LongStringValueLabels(LongStringValueLabelRecord),
+ MultipleResponse(MultipleResponseRecord<Identifier, String>),
+ LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
+ LongStringMissingValues(LongStringMissingValueRecord<Identifier, String>),
Encoding(EncodingRecord),
NumberOfCases(NumberOfCasesRecord),
- ProductInfo(TextRecord),
- LongNames(TextRecord),
- LongStrings(TextRecord),
- FileAttributes(TextRecord),
- VariableAttributes(TextRecord),
+ VariableSets(VariableSetRecord),
+ ProductInfo(ProductInfoRecord),
+ LongNames(LongNamesRecord),
+ VeryLongStrings(VeryLongStringsRecord),
+ FileAttributes(FileAttributeRecord),
+ VariableAttributes(VariableAttributeRecord),
OtherExtension(Extension),
EndOfHeaders(u32),
ZHeader(ZHeader),
ZTrailer(ZTrailer),
- Case(Vec<Value>),
+ // Same shared handle as in `Record::Cases`; case values are not decoded here.
+ Cases(Rc<RefCell<Cases>>),
}
impl Record {
- fn read<R: Read + Seek>(reader: &mut R, endian: Endian) -> Result<Record, Error> {
+ /// Reads one header record from `reader`.
+ ///
+ /// The record starts with a 32-bit type code, parsed with `endian`, which
+ /// selects the parser: 2 (variable), 3 (value labels), 6 (document),
+ /// 7 (extension) or 999 (end of headers).  `var_types` holds the types of the
+ /// variables read so far and `warn` receives non-fatal problems.
+ ///
+ /// Returns `Ok(None)` when the value-label or extension parser produced no
+ /// record (NOTE(review): presumably after reporting a warning -- confirm in
+ /// those parsers).
+ ///
+ /// # Errors
+ ///
+ /// Returns `Error::BadRecordType` for any other type code, with `offset` set
+ /// to the stream position just after the type code.  Errors from the
+ /// individual parsers and from I/O are propagated.
+ fn read<R>(
+ reader: &mut R,
+ endian: Endian,
+ var_types: &[VarType],
+ warn: &dyn Fn(Warning),
+ ) -> Result<Option<Record>, Error>
+ where
+ R: Read + Seek,
+ {
let rec_type: u32 = endian.parse(read_bytes(reader)?);
match rec_type {
- 2 => Ok(Record::Variable(VariableRecord::read(reader, endian)?)),
- 3 => Ok(Record::ValueLabel(ValueLabelRecord::read(reader, endian)?)),
- 4 => Ok(Record::VarIndexes(VarIndexRecord::read(reader, endian)?)),
- 6 => Ok(Record::Document(DocumentRecord::read(reader, endian)?)),
- 7 => Ok(Extension::read(reader, endian)?),
- 999 => Ok(Record::EndOfHeaders(endian.parse(read_bytes(reader)?))),
+ 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
+ 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
+ 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
+ 7 => Extension::read(reader, endian, var_types.len(), warn),
+ 999 => Ok(Some(Record::EndOfHeaders(
+ endian.parse(read_bytes(reader)?),
+ ))),
_ => Err(Error::BadRecordType {
offset: reader.stream_position()?,
rec_type,
}),
}
}
+
+ /// Converts this raw record into its decoded form, using `decoder` for all
+ /// text conversion.
+ ///
+ /// Records without text are passed through unchanged.  The visible code
+ /// always returns `Ok`; problems are reported through the decoder's
+ /// warning callback instead.
+ pub fn decode(self, decoder: &Decoder) -> Result<DecodedRecord, Error> {
+ // `self` is consumed, so each payload is moved into the result rather
+ // than cloned.
+ Ok(match self {
+ Record::Header(record) => record.decode(decoder),
+ Record::Variable(record) => record.decode(decoder),
+ Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
+ Record::Document(record) => record.decode(decoder),
+ Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record),
+ Record::FloatInfo(record) => DecodedRecord::FloatInfo(record),
+ Record::VarDisplay(record) => DecodedRecord::VarDisplay(record),
+ Record::MultipleResponse(record) => record.decode(decoder),
+ Record::LongStringValueLabels(record) => {
+ DecodedRecord::LongStringValueLabels(record.decode(decoder))
+ }
+ Record::LongStringMissingValues(record) => {
+ DecodedRecord::LongStringMissingValues(record.decode(decoder))
+ }
+ Record::Encoding(record) => DecodedRecord::Encoding(record),
+ Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record),
+ Record::Text(record) => record.decode(decoder),
+ Record::OtherExtension(record) => DecodedRecord::OtherExtension(record),
+ Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
+ Record::ZHeader(record) => DecodedRecord::ZHeader(record),
+ Record::ZTrailer(record) => DecodedRecord::ZTrailer(record),
+ Record::Cases(record) => DecodedRecord::Cases(record),
+ })
+ }
+}
+
+/// Chooses the character encoding for a file from its header records.
+///
+/// Scans `headers` for an encoding record and an integer-info record (if
+/// either occurs more than once, the last one wins) and passes the encoding
+/// name and character code, where present, to `get_encoding`.
+///
+/// `headers` is a slice so that callers can pass any contiguous collection of
+/// records; a `&Vec<Record>` still coerces to it.
+///
+/// # Errors
+///
+/// Returns `Error::EncodingError` only when `get_encoding` reports
+/// `EncodingError::Ebcdic`.  Any other failure is passed to `warn` as
+/// `Warning::EncodingError` and `default_encoding()` is returned instead.
+pub fn encoding_from_headers(
+ headers: &[Record],
+ warn: &impl Fn(Warning),
+) -> Result<&'static Encoding, Error> {
+ let mut encoding_record = None;
+ let mut integer_info_record = None;
+ for record in headers {
+ match record {
+ Record::Encoding(record) => encoding_record = Some(record),
+ Record::IntegerInfo(record) => integer_info_record = Some(record),
+ _ => (),
+ }
+ }
+ let encoding = encoding_record.map(|record| record.0.as_str());
+ let character_code = integer_info_record.map(|record| record.character_code);
+ match get_encoding(encoding, character_code) {
+ Ok(encoding) => Ok(encoding),
+ Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
+ Err(err) => {
+ warn(Warning::EncodingError(err));
+ // Warn that we're using the default encoding.
+ Ok(default_encoding())
+ }
+ }
}
// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
// decoded as Latin-1 (actually bytes interpreted as Unicode code points).
-fn default_decode<'a>(s: &'a [u8]) -> Cow<'a, str> {
+fn default_decode(s: &[u8]) -> Cow<str> {
- from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
+ match from_utf8(s) {
+ // Valid UTF-8: borrow the input unchanged.
+ Ok(text) => Cow::Borrowed(text),
+ // Anything else gets the Latin-1 interpretation.
+ Err(_) => decode_latin1(s),
+ }
}
+/// How the case data in a file is compressed.
+///
+/// A file header with compression code 0 has no `Compression` at all.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Compression {
+ /// Selected by header compression code 1 (for files without the `Zsav` magic).
+ Simple,
+ /// Selected by header compression code 2, which requires the `Zsav` magic.
+ ZLib,
+}
+
+/// Implemented by records that track where they are located in the file.
+trait Header {
+ /// Returns the range of file offsets associated with the record.
+ fn offsets(&self) -> Range<u64>;
+}
+
#[derive(Clone)]
-pub struct HeaderRecord {
+pub struct HeaderRecord<S>
+where
+ S: Debug,
+{
+ /// Offset in file.
+ pub offsets: Range<u64>,
+
/// Magic number.
pub magic: Magic,
/// Eye-catcher string, product name, in the file's encoding. Padded
/// on the right with spaces.
- pub eye_catcher: UnencodedStr<60>,
+ pub eye_catcher: S,
/// Layout code, normally either 2 or 3.
pub layout_code: u32,
pub bias: f64,
/// `dd mmm yy` in the file's encoding.
- pub creation_date: UnencodedStr<9>,
+ pub creation_date: S,
/// `HH:MM:SS` in the file's encoding.
- pub creation_time: UnencodedStr<8>,
+ pub creation_time: S,
/// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: UnencodedStr<64>,
+ pub file_label: S,
/// Endianness of the data in the file header.
pub endian: Endian,
}
-impl HeaderRecord {
- fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
+impl<S> HeaderRecord<S>
+where
+ S: Debug,
+{
+ /// Writes a single `name: value` line to `f`, with `name` right-aligned in
+ /// a 17-character column and `value` rendered with its `Debug` format.
+ fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
+ where
+ T: Debug,
+ {
writeln!(f, "{name:>17}: {:?}", value)
}
}
-impl Debug for HeaderRecord {
+impl<S> Debug for HeaderRecord<S>
+where
+ S: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
writeln!(f, "File header record:")?;
self.debug_field(f, "Magic", self.magic)?;
}
}
-impl HeaderRecord {
- fn read<R: Read>(r: &mut R) -> Result<HeaderRecord, Error> {
+impl HeaderRecord<RawString> {
+ fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
+ let start = r.stream_position()?;
+
let magic: [u8; 4] = read_bytes(r)?;
let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
- let eye_catcher = UnencodedStr::<60>(read_bytes(r)?);
+ let eye_catcher = RawString(read_vec(r, 60)?);
let layout_code: [u8; 4] = read_bytes(r)?;
let endian = Endian::identify_u32(2, layout_code)
.or_else(|| Endian::identify_u32(2, layout_code))
let compression_code: u32 = endian.parse(read_bytes(r)?);
let compression = match (magic, compression_code) {
- (Magic::ZSAV, 2) => Some(Compression::ZLib),
- (Magic::ZSAV, code) => return Err(Error::InvalidZsavCompression(code)),
+ (Magic::Zsav, 2) => Some(Compression::ZLib),
+ (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
(_, 0) => None,
(_, 1) => Some(Compression::Simple),
(_, code) => return Err(Error::InvalidSavCompression(code)),
let bias: f64 = endian.parse(read_bytes(r)?);
- let creation_date = UnencodedStr::<9>(read_bytes(r)?);
- let creation_time = UnencodedStr::<8>(read_bytes(r)?);
- let file_label = UnencodedStr::<64>(read_bytes(r)?);
+ let creation_date = RawString(read_vec(r, 9)?);
+ let creation_time = RawString(read_vec(r, 8)?);
+ let file_label = RawString(read_vec(r, 64)?);
let _: [u8; 3] = read_bytes(r)?;
Ok(HeaderRecord {
+ offsets: start..r.stream_position()?,
magic,
layout_code,
nominal_case_size,
endian,
})
}
+
+ /// Decodes the header's four text fields with `decoder` and returns the
+ /// result wrapped in [`DecodedRecord::Header`].
+ ///
+ /// All other fields are carried over unchanged.  Malformed text is
+ /// reported through the decoder's warning callback rather than as an
+ /// error.
+ pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+ // `Decoder::decode` yields a `Cow`; `into_owned` keeps an already-owned
+ // buffer instead of copying it a second time as `to_string` would.
+ let eye_catcher = decoder.decode(&self.eye_catcher).into_owned();
+ let file_label = decoder.decode(&self.file_label).into_owned();
+ let creation_date = decoder.decode(&self.creation_date).into_owned();
+ let creation_time = decoder.decode(&self.creation_time).into_owned();
+ DecodedRecord::Header(HeaderRecord {
+ eye_catcher,
+ weight_index: self.weight_index,
+ n_cases: self.n_cases,
+ file_label,
+ // `self` is consumed, so the range is moved, not cloned.
+ offsets: self.offsets,
+ magic: self.magic,
+ layout_code: self.layout_code,
+ nominal_case_size: self.nominal_case_size,
+ compression: self.compression,
+ bias: self.bias,
+ creation_date,
+ creation_time,
+ endian: self.endian,
+ })
+ }
+}
+
+/// Converts raw bytes from a file into UTF-8 text using one fixed encoding,
+/// reporting problems through a warning callback.
+pub struct Decoder {
+ /// Encoding that input bytes are decoded from.
+ pub encoding: &'static Encoding,
+ /// Callback invoked with each warning raised while decoding.
+ pub warn: Box<dyn Fn(Warning)>,
+}
+
+impl Decoder {
+ /// Creates a decoder for `encoding` that passes its warnings to `warn`.
+ pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
+ where
+ F: Fn(Warning) + 'static,
+ {
+ Self {
+ encoding,
+ warn: Box::new(warn),
+ }
+ }
+ /// Forwards `warning` to the callback supplied at construction.
+ fn warn(&self, warning: Warning) {
+ (self.warn)(warning)
+ }
+ /// Decodes `input` from `self.encoding` without BOM handling.
+ ///
+ /// If the input contains malformed sequences, a
+ /// `Warning::MalformedString` carrying a copy of the decoded text is
+ /// emitted; the decoded text is returned either way.
+ fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+ let (output, malformed) = self.encoding.decode_without_bom_handling(input);
+ if malformed {
+ self.warn(Warning::MalformedString {
+ encoding: self.encoding.name().into(),
+ text: output.clone().into(),
+ });
+ }
+ output
+ }
+
+ /// Decodes the bytes of `input`; see `decode_slice` for error handling.
+ fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
+ self.decode_slice(input.0.as_slice())
+ }
+
+ /// Returns `input` decoded from `self.encoding` into UTF-8 such that
+ /// re-encoding the result back into `self.encoding` will have exactly the
+ /// same length in bytes.
+ ///
+ /// XXX warn about errors?
+ pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+ if let (s, false) = self.encoding.decode_without_bom_handling(input) {
+ // This is the common case. Usually there will be no errors.
+ s
+ } else {
+ // Unusual case. Don't bother to optimize it much.
+ let mut decoder = self.encoding.new_decoder_without_bom_handling();
+ let mut output = String::with_capacity(
+ decoder
+ .max_utf8_buffer_length_without_replacement(input.len())
+ .unwrap(),
+ );
+ let mut rest = input;
+ while !rest.is_empty() {
+ match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
+ (DecoderResult::InputEmpty, _) => break,
+ (DecoderResult::OutputFull, _) => unreachable!(),
+ (DecoderResult::Malformed(a, b), consumed) => {
+ // Substitute one `?` per byte reported by the decoder for
+ // this malformed sequence, then resume after the bytes
+ // that were consumed.
+ let skipped = a as usize + b as usize;
+ output.extend(repeat('?').take(skipped));
+ rest = &rest[consumed..];
+ }
+ }
+ }
+ // NOTE(review): this panics if the length guarantee does not hold
+ // for some input/encoding pair -- confirm that a panic is
+ // acceptable here for data read from untrusted files.
+ assert_eq!(self.encoding.encode(&output).0.len(), input.len());
+ output.into()
+ }
+ }
+
+ /// Decodes `input` and validates the result as an identifier in
+ /// `self.encoding`.
+ pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
+ self.new_identifier(&self.decode(input))
+ }
+
+ /// Builds an identifier from already-decoded `name`, checked against
+ /// `self.encoding` by `Identifier::new`.
+ pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
+ Identifier::new(name, self.encoding)
+ }
+}
+
+impl<S> Header for HeaderRecord<S>
+where
+ S: Debug,
+{
+ /// Returns a copy of the offset range recorded when the header was read.
+ fn offsets(&self) -> Range<u64> {
+ self.offsets.clone()
+ }
}
+/// Kind of system file, as identified by the 4-byte magic number at the start
+/// of the file header.
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub struct Magic([u8; 4]);
+pub enum Magic {
+ /// Regular system file.
+ Sav,
+
+ /// System file with Zlib-compressed data.
+ Zsav,
+
+ /// EBCDIC-encoded system file.
+ Ebcdic,
+}
impl Magic {
+ // The constants below are the raw 4-byte patterns, not `Magic` values.
/// Magic number for a regular system file.
- pub const SAV: Magic = Magic(*b"$FL2");
+ pub const SAV: [u8; 4] = *b"$FL2";
/// Magic number for a system file that contains zlib-compressed data.
- pub const ZSAV: Magic = Magic(*b"$FL3");
+ pub const ZSAV: [u8; 4] = *b"$FL3";
- /// Magic number for an EBDIC-encoded system file. This is `$FL2` encoded
+ /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
/// in EBCDIC.
- pub const EBCDIC: Magic = Magic([0x5b, 0xc6, 0xd3, 0xf2]);
+ pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
}
impl Debug for Magic {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let s = match self {
- &Magic::SAV => "$FL2",
- &Magic::ZSAV => "$FL3",
- &Magic::EBCDIC => "($FL2 in EBCDIC)",
- _ => return write!(f, "{:?}", self.0),
+ let s = match *self {
+ Magic::Sav => "$FL2",
+ Magic::Zsav => "$FL3",
+ Magic::Ebcdic => "($FL2 in EBCDIC)",
};
write!(f, "{s}")
}
type Error = Error;
fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
- let magic = Magic(value);
- match magic {
- Magic::SAV | Magic::ZSAV | Magic::EBCDIC => Ok(magic),
+ match value {
+ Magic::SAV => Ok(Magic::Sav),
+ Magic::ZSAV => Ok(Magic::Zsav),
+ Magic::EBCDIC => Ok(Magic::Ebcdic),
_ => Err(Error::BadMagic(value)),
}
}
}
impl VarType {
- fn from_width(width: i32) -> VarType {
+ pub fn from_width(width: VarWidth) -> VarType {
match width {
- 0 => VarType::Numeric,
- _ => VarType::String,
- }
- }
-}
-
-mod state {
- use super::{
- Compression, Error, HeaderRecord, Record, Value, VarType, VariableRecord, ZHeader,
- ZTrailer, ZlibDecodeMultiple,
- };
- use crate::endian::Endian;
- use std::{
- collections::VecDeque,
- io::{Read, Seek},
- };
-
- pub trait State {
- #[allow(clippy::type_complexity)]
- fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
- }
-
- struct Start<R: Read + Seek> {
- reader: R,
- }
-
- pub fn new<R: Read + Seek + 'static>(reader: R) -> Box<dyn State> {
- Box::new(Start { reader })
- }
-
- struct CommonState<R: Read + Seek> {
- reader: R,
- endian: Endian,
- bias: f64,
- compression: Option<Compression>,
- var_types: Vec<VarType>,
- }
-
- impl<R: Read + Seek + 'static> State for Start<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let header = HeaderRecord::read(&mut self.reader)?;
- let next_state = Headers(CommonState {
- reader: self.reader,
- endian: header.endian,
- bias: header.bias,
- compression: header.compression,
- var_types: Vec::new(),
- });
- Ok(Some((Record::Header(header), Box::new(next_state))))
+ VarWidth::Numeric => Self::Numeric,
+ VarWidth::String(_) => Self::String,
}
}
- struct Headers<R: Read + Seek>(CommonState<R>);
-
- impl<R: Read + Seek + 'static> State for Headers<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let record = Record::read(&mut self.0.reader, self.0.endian)?;
- match record {
- Record::Variable(VariableRecord { width, .. }) => {
- self.0.var_types.push(VarType::from_width(width));
- }
- Record::EndOfHeaders(_) => {
- let next_state: Box<dyn State> = match self.0.compression {
- None => Box::new(Data(self.0)),
- Some(Compression::Simple) => Box::new(CompressedData::new(self.0)),
- Some(Compression::ZLib) => Box::new(ZlibHeader(self.0)),
- };
- return Ok(Some((record, next_state)));
- }
- _ => (),
- };
- Ok(Some((record, self)))
- }
- }
-
- struct ZlibHeader<R: Read + Seek>(CommonState<R>);
-
- impl<R: Read + Seek + 'static> State for ZlibHeader<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let zheader = ZHeader::read(&mut self.0.reader, self.0.endian)?;
- Ok(Some((Record::ZHeader(zheader), self)))
- }
- }
-
- struct ZlibTrailer<R: Read + Seek>(CommonState<R>, ZHeader);
-
- impl<R: Read + Seek + 'static> State for ZlibTrailer<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- let retval = ZTrailer::read(
- &mut self.0.reader,
- self.0.endian,
- self.1.ztrailer_offset,
- self.1.ztrailer_len,
- )?;
- let next_state = Box::new(CompressedData::new(CommonState {
- reader: ZlibDecodeMultiple::new(self.0.reader),
- endian: self.0.endian,
- bias: self.0.bias,
- compression: self.0.compression,
- var_types: self.0.var_types,
- }));
- match retval {
- None => next_state.read(),
- Some(ztrailer) => Ok(Some((Record::ZTrailer(ztrailer), next_state))),
- }
- }
- }
-
- struct Data<R: Read + Seek>(CommonState<R>);
-
- impl<R: Read + Seek + 'static> State for Data<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- match Value::read_case(&mut self.0.reader, &self.0.var_types, self.0.endian)? {
- None => Ok(None),
- Some(values) => Ok(Some((Record::Case(values), self))),
- }
- }
- }
-
- struct CompressedData<R: Read + Seek> {
- common: CommonState<R>,
- codes: VecDeque<u8>,
- }
-
- impl<R: Read + Seek + 'static> CompressedData<R> {
- fn new(common: CommonState<R>) -> CompressedData<R> {
- CompressedData {
- common,
- codes: VecDeque::new(),
- }
+ /// Returns the other variable type: `String` for `Numeric` and `Numeric`
+ /// for `String`.
+ pub fn opposite(self) -> VarType {
+ if matches!(self, Self::Numeric) {
+ Self::String
+ } else {
+ Self::Numeric
+ }
}
+}
- impl<R: Read + Seek + 'static> State for CompressedData<R> {
- fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
- match Value::read_compressed_case(
- &mut self.common.reader,
- &self.common.var_types,
- &mut self.codes,
- self.common.endian,
- self.common.bias,
- )? {
- None => Ok(None),
- Some(values) => Ok(Some((Record::Case(values), self))),
- }
+/// Displays the type as the lowercase word `numeric` or `string`.
+impl Display for VarType {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ match *self {
+ Self::Numeric => f.write_str("numeric"),
+ Self::String => f.write_str("string"),
}
}
}
+/// A single data value, either numeric or string.  `S` is the representation
+/// used for string values.
#[derive(Copy, Clone)]
-pub enum Value {
+pub enum Value<S>
+where
+ S: Debug,
+{
+ // `None` is the system-missing value, which `Debug` prints as `SYSMIS`.
Number(Option<f64>),
- String(UnencodedStr<8>),
+ String(S),
}
-impl Debug for Value {
+type RawValue = Value<RawStr<8>>;
+
+/// Formats numbers with their `Debug` representation, a missing number as
+/// `SYSMIS`, and strings with the `Debug` representation of `S`.
+impl<S> Debug for Value<S>
+where
+ S: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
match self {
Value::Number(Some(number)) => write!(f, "{number:?}"),
Value::Number(None) => write!(f, "SYSMIS"),
- Value::String(bytes) => write!(f, "{:?}", bytes),
+ Value::String(s) => write!(f, "{:?}", s),
}
}
}
-impl Value {
- fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
+impl RawValue {
+ fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
Ok(Self::from_raw(
- UntypedValue(read_bytes(r)?),
+ &UntypedValue(read_bytes(r)?),
var_type,
endian,
))
}
- pub fn from_raw(raw: UntypedValue, var_type: VarType, endian: Endian) -> Value {
+ pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
match var_type {
- VarType::String => Value::String(UnencodedStr(raw.0)),
+ VarType::String => Value::String(RawStr(raw.0)),
VarType::Numeric => {
let number: f64 = endian.parse(raw.0);
Value::Number((number != -f64::MAX).then_some(number))
reader: &mut R,
var_types: &[VarType],
endian: Endian,
- ) -> Result<Option<Vec<Value>>, Error> {
+ ) -> Result<Option<Vec<Self>>, Error> {
let case_start = reader.stream_position()?;
let mut values = Vec::with_capacity(var_types.len());
for (i, &var_type) in var_types.iter().enumerate() {
});
}
};
- values.push(Value::from_raw(UntypedValue(raw), var_type, endian));
+ values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
}
Ok(Some(values))
}
codes: &mut VecDeque<u8>,
endian: Endian,
bias: f64,
- ) -> Result<Option<Vec<Value>>, Error> {
+ ) -> Result<Option<Vec<Self>>, Error> {
let case_start = reader.stream_position()?;
let mut values = Vec::with_capacity(var_types.len());
for (i, &var_type) in var_types.iter().enumerate() {
match code {
0 => (),
1..=251 => match var_type {
- VarType::Numeric => break Value::Number(Some(code as f64 - bias)),
+ VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
VarType::String => {
- break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias)))
+ break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
}
},
252 => {
}
}
253 => {
- break Value::from_raw(UntypedValue(read_bytes(reader)?), var_type, endian)
+ break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
}
254 => match var_type {
- VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
+ VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
VarType::Numeric => {
return Err(Error::CompressedStringExpected {
offset: case_start,
}
},
255 => match var_type {
- VarType::Numeric => break Value::Number(None),
+ VarType::Numeric => break Self::Number(None),
VarType::String => {
return Err(Error::CompressedNumberExpected {
offset: case_start,
}
Ok(Some(values))
}
+
+ /// Converts a raw value into one holding a `String`.
+ ///
+ /// Numbers pass through untouched.  String bytes go through
+ /// `Decoder::decode_exact_length`.
+ fn decode(self, decoder: &Decoder) -> Value<String> {
+ match self {
+ Self::Number(number) => Value::Number(number),
+ Self::String(raw) => {
+ let text = decoder.decode_exact_length(&raw.0);
+ Value::String(text.into_owned())
+ }
+ }
+ }
}
struct ZlibDecodeMultiple<R>
}
}
-pub struct Reader {
- state: Option<Box<dyn State>>,
+/// Position of a `Reader` within the file; determines what the next call to
+/// its iterator produces.
+enum ReaderState {
+ /// Nothing yielded yet; the (already parsed) file header comes next.
+ Start,
+ /// Reading header records, up to and including the end-of-headers record.
+ Headers,
+ /// The zlib header is read next.
+ ZlibHeader,
+ /// The zlib trailer is read next, from the location given by the zlib header.
+ ZlibTrailer {
+ ztrailer_offset: u64,
+ ztrailer_len: u64,
+ },
+ /// The `Record::Cases` handle is yielded next.
+ Cases,
+ /// Iteration is over, either normally or because an error was returned.
+ End,
+}
+
+/// Iterator over the records of a system file read from `R`.
+pub struct Reader<R>
+where
+ R: Read + Seek + 'static,
+{
+ // Underlying input; becomes `None` once it has been handed to `Cases`.
+ reader: Option<R>,
+ // Callback that receives non-fatal problems found while reading records.
+ warn: Box<dyn Fn(Warning)>,
+
+ // File header, parsed eagerly by `Reader::new`.
+ header: HeaderRecord<RawString>,
+ // Type of each variable record seen so far, in file order.
+ var_types: Vec<VarType>,
+
+ state: ReaderState,
}
-impl Reader {
- pub fn new<R: Read + Seek + 'static>(reader: R) -> Result<Reader, Error> {
- Ok(Reader {
- state: Some(state::new(reader)),
+impl<R> Reader<R>
+where
+ R: Read + Seek + 'static,
+{
+ /// Creates a reader over `reader`, parsing the file header immediately.
+ ///
+ /// `warn` receives non-fatal problems found while reading later records.
+ ///
+ /// # Errors
+ ///
+ /// Fails if the file header cannot be read or is invalid.
+ pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
+ where
+ F: Fn(Warning) + 'static,
+ {
+ let header = HeaderRecord::read(&mut reader)?;
+ Ok(Self {
+ reader: Some(reader),
+ warn: Box::new(warn),
+ header,
+ var_types: Vec::new(),
+ state: ReaderState::Start,
})
}
- pub fn collect_headers(&mut self) -> Result<Vec<Record>, Error> {
- let mut headers = Vec::new();
- for record in self {
- match record? {
- Record::EndOfHeaders(_) => break,
- r => headers.push(r),
- };
+ /// Moves the underlying reader and the collected variable types into a new
+ /// `Cases` and marks this reader as finished.
+ ///
+ /// Panics if the underlying reader has already been taken.
+ fn cases(&mut self) -> Cases {
+ self.state = ReaderState::End;
+ Cases::new(
+ self.reader.take().unwrap(),
+ take(&mut self.var_types),
+ &self.header,
+ )
+ }
+ /// Produces the next record according to the current state and advances
+ /// the state.  Errors are returned as-is; `Iterator::next` is responsible
+ /// for ending iteration after one.
+ fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
+ match self.state {
+ ReaderState::Start => {
+ self.state = ReaderState::Headers;
+ Some(Ok(Record::Header(self.header.clone())))
+ }
+ ReaderState::Headers => {
+ // `Record::read` may consume input without producing a record;
+ // keep reading until one is produced or an error occurs.
+ let record = loop {
+ match Record::read(
+ self.reader.as_mut().unwrap(),
+ self.header.endian,
+ self.var_types.as_slice(),
+ &self.warn,
+ ) {
+ Ok(Some(record)) => break record,
+ Ok(None) => (),
+ Err(error) => return Some(Err(error)),
+ }
+ };
+ match record {
+ Record::Variable(VariableRecord { width, .. }) => {
+ // Width 0 marks a numeric variable; anything else is a string.
+ self.var_types.push(if width == 0 {
+ VarType::Numeric
+ } else {
+ VarType::String
+ });
+ }
+ Record::EndOfHeaders(_) => {
+ self.state = if let Some(Compression::ZLib) = self.header.compression {
+ ReaderState::ZlibHeader
+ } else {
+ ReaderState::Cases
+ };
+ }
+ _ => (),
+ };
+ Some(Ok(record))
+ }
+ ReaderState::ZlibHeader => {
+ let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
+ {
+ Ok(zheader) => zheader,
+ Err(error) => return Some(Err(error)),
+ };
+ self.state = ReaderState::ZlibTrailer {
+ ztrailer_offset: zheader.ztrailer_offset,
+ ztrailer_len: zheader.ztrailer_len,
+ };
+ Some(Ok(Record::ZHeader(zheader)))
+ }
+ ReaderState::ZlibTrailer {
+ ztrailer_offset,
+ ztrailer_len,
+ } => {
+ match ZTrailer::read(
+ self.reader.as_mut().unwrap(),
+ self.header.endian,
+ ztrailer_offset,
+ ztrailer_len,
+ ) {
+ Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
+ Ok(Some(ztrailer)) => {
+ // The trailer has been delivered, so the cases come next.
+ // Leaving the state unchanged would re-read and re-yield
+ // the trailer on every later call.
+ self.state = ReaderState::Cases;
+ Some(Ok(Record::ZTrailer(ztrailer)))
+ }
+ Err(error) => Some(Err(error)),
+ }
+ }
+ ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
+ ReaderState::End => None,
}
- Ok(headers)
}
}
-impl Iterator for Reader {
+/// Yields the file's records in order and stops for good after the first
+/// error.
+impl<R> Iterator for Reader<R>
+where
+ R: Read + Seek + 'static,
+{
type Item = Result<Record, Error>;
fn next(&mut self) -> Option<Self::Item> {
- match self.state.take()?.read() {
- Ok(Some((record, next_state))) => {
- self.state = Some(next_state);
- Some(Ok(record))
- }
- Ok(None) => None,
- Err(error) => Some(Err(error)),
+ let item = self._next();
+ // An error is terminal: every later call returns `None`.
+ if let Some(Err(_)) = item {
+ self.state = ReaderState::End;
+ }
+ item
+ }
+}
+
+/// Combination of `Read` and `Seek`, so that both can be named in a single
+/// trait object (`Box<dyn ReadSeek>`).
+trait ReadSeek: Read + Seek {}
+// Blanket impl: every type that is `Read + Seek` is automatically `ReadSeek`.
+impl<T> ReadSeek for T where T: Read + Seek {}
+
+/// Iterator over the cases (data rows) of a file, yielding one
+/// `Vec<RawValue>` per case.
+pub struct Cases {
+ // Input positioned at the case data; wrapped in a zlib decoder when the
+ // file uses `Compression::ZLib`.
+ reader: Box<dyn ReadSeek>,
+ // Type of each variable, in order; determines how each value is read.
+ var_types: Vec<VarType>,
+ // Compression from the file header; `None` means cases are read uncompressed.
+ compression: Option<Compression>,
+ // Compression bias from the file header.
+ bias: f64,
+ endian: Endian,
+ // Buffer passed to `read_compressed_case` on every call.
+ // NOTE(review): presumably holds compression codes not yet consumed -- confirm.
+ codes: VecDeque<u8>,
+ // Set once the data is exhausted or an error has been returned.
+ eof: bool,
+}
+
+/// Prints only the type name; the reader and its state are not shown.
+impl Debug for Cases {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ f.write_str("Cases")
+ }
+}
+
+impl Cases {
+ /// Builds a case iterator over `reader`, taking compression, bias and
+ /// endianness from `header`.
+ ///
+ /// For zlib-compressed files the reader is wrapped in
+ /// `ZlibDecodeMultiple`; otherwise it is used directly.
+ fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
+ where
+ R: Read + Seek + 'static,
+ {
+ let reader: Box<dyn ReadSeek> = match header.compression {
+ Some(Compression::ZLib) => Box::new(ZlibDecodeMultiple::new(reader)),
+ _ => Box::new(reader),
+ };
+ Self {
+ reader,
+ var_types,
+ compression: header.compression,
+ bias: header.bias,
+ endian: header.endian,
+ codes: VecDeque::with_capacity(8),
+ eof: false,
}
}
}
-impl FusedIterator for Reader {}
+/// Yields one case per call.  Iteration ends permanently at end of data or
+/// after the first error has been returned.
+impl Iterator for Cases {
+ type Item = Result<Vec<RawValue>, Error>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.eof {
+ return None;
+ }
+
+ // Any compression (simple or zlib) uses the compressed-case reader.
+ let case = match self.compression {
+ Some(_) => Value::read_compressed_case(
+ &mut self.reader,
+ &self.var_types,
+ &mut self.codes,
+ self.endian,
+ self.bias,
+ ),
+ None => Value::read_case(&mut self.reader, &self.var_types, self.endian),
+ };
+ let retval = case.transpose();
+ // Only a successfully read case keeps the iterator alive.
+ self.eof = !matches!(retval, Some(Ok(_)));
+ retval
+ }
+}
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub struct Spec(pub u32);
}
+/// The user-missing values of a variable: a list of individual values and an
+/// optional range.  `S` is the representation used for string values.
#[derive(Clone)]
-pub struct MissingValues {
+pub struct MissingValues<S = String>
+where
+ S: Debug,
+{
/// Individual missing values, up to 3 of them.
- pub values: Vec<Value>,
+ pub values: Vec<Value<S>>,
/// Optional range of missing values.
- pub range: Option<(Value, Value)>,
+ pub range: Option<(Value<S>, Value<S>)>,
}
-impl Debug for MissingValues {
+impl<S> Debug for MissingValues<S>
+where
+ S: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
for (i, value) in self.values.iter().enumerate() {
if i > 0 {
write!(f, "{value:?}")?;
}
- if let Some((low, high)) = self.range {
+ if let Some((low, high)) = &self.range {
if !self.values.is_empty() {
write!(f, ", ")?;
}
}
}
-impl MissingValues {
+impl<S> MissingValues<S>
+where
+ S: Debug,
+{
fn is_empty(&self) -> bool {
self.values.is_empty() && self.range.is_none()
}
+}
+
+impl<S> Default for MissingValues<S>
+where
+    S: Debug,
+{
+    /// Returns an empty set of missing values: no individual values and no
+    /// range.
+    fn default() -> Self {
+        let values = Vec::new();
+        let range = None;
+        Self { values, range }
+    }
+}
+impl MissingValues<RawStr<8>> {
fn read<R: Read + Seek>(
r: &mut R,
offset: u64,
width: i32,
code: i32,
endian: Endian,
- ) -> Result<MissingValues, Error> {
+ ) -> Result<Self, Error> {
let (n_values, has_range) = match (width, code) {
(_, 0..=3) => (code, false),
(0, -2) => (0, true),
(_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
};
- let var_type = VarType::from_width(width);
+ let var_type = if width == 0 {
+ VarType::Numeric
+ } else {
+ VarType::String
+ };
let mut values = Vec::new();
for _ in 0..n_values {
- values.push(Value::read(r, var_type, endian)?);
+ values.push(RawValue::read(r, var_type, endian)?);
}
let range = if has_range {
- let low = Value::read(r, var_type, endian)?;
- let high = Value::read(r, var_type, endian)?;
+ let low = RawValue::read(r, var_type, endian)?;
+ let high = RawValue::read(r, var_type, endian)?;
Some((low, high))
} else {
None
};
- Ok(MissingValues { values, range })
+ Ok(Self { values, range })
+ }
+    /// Decodes every individual missing value, and both ends of the range if
+    /// there is one, with `decoder`.
+    fn decode(&self, decoder: &Decoder) -> MissingValues<String> {
+        let mut values = Vec::with_capacity(self.values.len());
+        for raw in &self.values {
+            values.push(raw.decode(decoder));
+        }
+        let range = match &self.range {
+            Some((low, high)) => Some((low.decode(decoder), high.decode(decoder))),
+            None => None,
+        };
+        MissingValues { values, range }
     }
}
#[derive(Clone)]
-pub struct VariableRecord {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
+pub struct VariableRecord<S, V>
+where
+ S: Debug,
+ V: Debug,
+{
+ /// Range of offsets in file.
+ pub offsets: Range<u64>,
/// Variable width, in the range -1..=255.
pub width: i32,
/// Variable name, padded on the right with spaces.
- pub name: UnencodedStr<8>,
+ pub name: S,
/// Print format.
pub print_format: Spec,
pub write_format: Spec,
/// Missing values.
- pub missing_values: MissingValues,
+ pub missing_values: MissingValues<V>,
/// Optional variable label.
- pub label: Option<UnencodedString>,
+ pub label: Option<S>,
}
-impl Debug for VariableRecord {
+impl<S, V> Debug for VariableRecord<S, V>
+where
+ S: Debug,
+ V: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
writeln!(
f,
"Width: {} ({})",
self.width,
- if self.width > 0 {
- "string"
- } else if self.width == 0 {
- "numeric"
- } else {
- "long string continuation record"
+ match self.width.cmp(&0) {
+ Ordering::Greater => "string",
+ Ordering::Equal => "numeric",
+ Ordering::Less => "long string continuation record",
}
)?;
writeln!(f, "Print format: {:?}", self.print_format)?;
}
}
-impl VariableRecord {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VariableRecord, Error> {
- let offset = r.stream_position()?;
+impl VariableRecord<RawString, RawStr<8>> {
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
+ let start_offset = r.stream_position()?;
let width: i32 = endian.parse(read_bytes(r)?);
+ if !(-1..=255).contains(&width) {
+ return Err(Error::BadVariableWidth { start_offset, width });
+ }
+ let code_offset = r.stream_position()?;
let has_variable_label: u32 = endian.parse(read_bytes(r)?);
let missing_value_code: i32 = endian.parse(read_bytes(r)?);
let print_format = Spec(endian.parse(read_bytes(r)?));
let write_format = Spec(endian.parse(read_bytes(r)?));
- let name = UnencodedStr::<8>(read_bytes(r)?);
+ let name = RawString(read_vec(r, 8)?);
let label = match has_variable_label {
0 => None,
1 => {
let len: u32 = endian.parse(read_bytes(r)?);
let read_len = len.min(65535) as usize;
- let label = UnencodedString(read_vec(r, read_len)?);
+ let label = RawString(read_vec(r, read_len)?);
let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
let _ = read_vec(r, padding_bytes as usize)?;
}
_ => {
return Err(Error::BadVariableLabelCode {
- offset,
+ start_offset,
+ code_offset,
code: has_variable_label,
})
}
};
- let missing_values = MissingValues::read(r, offset, width, missing_value_code, endian)?;
+ let missing_values =
+ MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
- Ok(VariableRecord {
- offset,
+ let end_offset = r.stream_position()?;
+
+ Ok(Record::Variable(VariableRecord {
+ offsets: start_offset..end_offset,
width,
name,
print_format,
write_format,
missing_values,
label,
+ }))
+ }
+
+    /// Converts this raw variable record into a [`DecodedRecord::Variable`],
+    /// decoding the name, the missing values, and the optional label with
+    /// `decoder`.  Offsets, width, and formats are carried over unchanged.
+    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        // Decode in the same order as the fields are listed: name, missing
+        // values, then label.
+        let name = decoder.decode(&self.name).to_string();
+        let missing_values = self.missing_values.decode(decoder);
+        let label = match &self.label {
+            Some(raw) => Some(decoder.decode(raw).to_string()),
+            None => None,
+        };
+        DecodedRecord::Variable(VariableRecord {
+            offsets: self.offsets.clone(),
+            width: self.width,
+            name,
+            print_format: self.print_format,
+            write_format: self.write_format,
+            missing_values,
+            label,
         })
     }
}
}
#[derive(Clone)]
-pub struct UnencodedString(pub Vec<u8>);
+pub struct RawString(pub Vec<u8>);
-impl From<Vec<u8>> for UnencodedString {
+impl From<Vec<u8>> for RawString {
fn from(source: Vec<u8>) -> Self {
Self(source)
}
}
-impl From<&[u8]> for UnencodedString {
+impl From<&[u8]> for RawString {
fn from(source: &[u8]) -> Self {
Self(source.into())
}
}
-impl Debug for UnencodedString {
+impl Debug for RawString {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
write!(f, "{:?}", default_decode(self.0.as_slice()))
}
}
#[derive(Copy, Clone)]
-pub struct UnencodedStr<const N: usize>(pub [u8; N]);
+pub struct RawStr<const N: usize>(pub [u8; N]);
-impl<const N: usize> From<[u8; N]> for UnencodedStr<N> {
+impl<const N: usize> From<[u8; N]> for RawStr<N> {
fn from(source: [u8; N]) -> Self {
Self(source)
}
}
-impl<const N: usize> Debug for UnencodedStr<N> {
+impl<const N: usize> Debug for RawStr<N> {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
write!(f, "{:?}", default_decode(&self.0))
}
}
-#[derive(Clone)]
-pub struct ValueLabelRecord {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
-
- /// The labels.
- pub labels: Vec<(UntypedValue, UnencodedString)>,
+#[derive(Clone, Debug)]
+pub struct ValueLabel<V, S>
+where
+ V: Debug,
+ S: Debug,
+{
+ pub value: Value<V>,
+ pub label: S,
}
-impl Debug for ValueLabelRecord {
+#[derive(Clone)]
+pub struct ValueLabelRecord<V, S>
+where
+ V: Debug,
+ S: Debug,
+{
+ /// Range of offsets in file.
+ pub offsets: Range<u64>,
+
+ /// The labels.
+ pub labels: Vec<ValueLabel<V, S>>,
+
+ /// The 1-based indexes of the variable indexes.
+ pub dict_indexes: Vec<u32>,
+
+ /// The types of the variables.
+ pub var_type: VarType,
+}
+
+impl<V, S> Debug for ValueLabelRecord<V, S>
+where
+ V: Debug,
+ S: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
- for (value, label) in self.labels.iter() {
- writeln!(f, "{value:?}: {label:?}")?;
+ writeln!(f, "labels: ")?;
+ for label in self.labels.iter() {
+ writeln!(f, "{label:?}")?;
+ }
+ write!(f, "apply to {} variables", self.var_type)?;
+ for dict_index in self.dict_indexes.iter() {
+ write!(f, " #{dict_index}")?;
}
Ok(())
}
}
-impl ValueLabelRecord {
+impl<V, S> Header for ValueLabelRecord<V, S>
+where
+    V: Debug,
+    S: Debug,
+{
+    /// Returns a copy of the range of file offsets stored in the record.
+    fn offsets(&self) -> Range<u64> {
+        self.offsets.start..self.offsets.end
+    }
+}
+
+impl<V, S> ValueLabelRecord<V, S>
+where
+ V: Debug,
+ S: Debug,
+{
/// Maximum number of value labels in a record.
- pub const MAX: u32 = u32::MAX / 8;
+ pub const MAX_LABELS: u32 = u32::MAX / 8;
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ValueLabelRecord, Error> {
- let offset = r.stream_position()?;
+ /// Maximum number of variable indexes in a record.
+ pub const MAX_INDEXES: u32 = u32::MAX / 8;
+}
+
+impl ValueLabelRecord<RawStr<8>, RawString> {
+ fn read<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ var_types: &[VarType],
+ warn: &dyn Fn(Warning),
+ ) -> Result<Option<Record>, Error> {
+ let label_offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
- if n > ValueLabelRecord::MAX {
+ if n > Self::MAX_LABELS {
return Err(Error::BadNumberOfValueLabels {
- offset,
+ offset: label_offset,
n,
- max: ValueLabelRecord::MAX,
+ max: Self::MAX_LABELS,
});
}
let mut label = read_vec(r, padded_len - 1)?;
label.truncate(label_len);
- labels.push((value, UnencodedString(label)));
+ labels.push((value, RawString(label)));
}
- Ok(ValueLabelRecord { offset, labels })
- }
-}
-
-#[derive(Clone)]
-pub struct VarIndexRecord {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
-
- /// The 1-based indexes of the variable indexes.
- pub dict_indexes: Vec<u32>,
-}
-impl Debug for VarIndexRecord {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "apply to variables")?;
- for dict_index in self.dict_indexes.iter() {
- write!(f, " #{dict_index}")?;
+ let index_offset = r.stream_position()?;
+ let rec_type: u32 = endian.parse(read_bytes(r)?);
+ if rec_type != 4 {
+ return Err(Error::ExpectedVarIndexRecord {
+ offset: index_offset,
+ rec_type,
+ });
}
- Ok(())
- }
-}
-impl VarIndexRecord {
- /// Maximum number of variable indexes in a record.
- pub const MAX: u32 = u32::MAX / 8;
-
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<VarIndexRecord, Error> {
- let offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
- if n > VarIndexRecord::MAX {
- return Err(Error::BadNumberOfVarIndexes {
- offset,
+ if n > Self::MAX_INDEXES {
+ return Err(Error::TooManyVarIndexes {
+ offset: index_offset,
n,
- max: VarIndexRecord::MAX,
+ max: Self::MAX_INDEXES,
});
}
+
+        // Read the variable indexes, separating the usable ones from those
+        // that cannot refer to any variable.
+        let index_offset = r.stream_position()?;
         let mut dict_indexes = Vec::with_capacity(n as usize);
+        let mut invalid_indexes = Vec::new();
         for _ in 0..n {
-            dict_indexes.push(endian.parse(read_bytes(r)?));
+            let index: u32 = endian.parse(read_bytes(r)?);
+            // Indexes are 1-based, so the valid range is
+            // `1..=var_types.len()`.  Only valid indexes may be kept: the
+            // code that follows evaluates `var_types[index as usize - 1]`,
+            // which would underflow for 0 and go out of bounds for anything
+            // larger than `var_types.len()`.
+            if index == 0 || index as usize > var_types.len() {
+                invalid_indexes.push(index);
+            } else {
+                dict_indexes.push(index);
+            }
+        }
+        if !invalid_indexes.is_empty() {
+            // Invalid indexes are dropped with a warning instead of failing
+            // the whole record.
+            warn(Warning::InvalidVarIndexes {
+                offset: index_offset,
+                max: var_types.len(),
+                invalid: invalid_indexes,
+            });
         }
- Ok(VarIndexRecord {
- offset,
+ let Some(&first_index) = dict_indexes.first() else {
+ warn(Warning::NoVarIndexes {
+ offset: index_offset,
+ });
+ return Ok(None);
+ };
+ let var_type = var_types[first_index as usize - 1];
+ let mut wrong_type_indexes = Vec::new();
+ dict_indexes.retain(|&index| {
+ if var_types[index as usize - 1] != var_type {
+ wrong_type_indexes.push(index);
+ false
+ } else {
+ true
+ }
+ });
+ if !wrong_type_indexes.is_empty() {
+ warn(Warning::MixedVarTypes {
+ offset: index_offset,
+ var_type,
+ wrong_types: wrong_type_indexes,
+ });
+ }
+
+ let labels = labels
+ .into_iter()
+ .map(|(value, label)| ValueLabel {
+ value: Value::from_raw(&value, var_type, endian),
+ label,
+ })
+ .collect();
+
+ let end_offset = r.stream_position()?;
+ Ok(Some(Record::ValueLabel(ValueLabelRecord {
+ offsets: label_offset..end_offset,
+ labels,
dict_indexes,
- })
+ var_type,
+ })))
+ }
+
+    /// Decodes the label text of every value label with `decoder`.  The
+    /// values themselves, the offsets, the variable indexes, and the variable
+    /// type are copied unchanged.
+    fn decode(self, decoder: &Decoder) -> ValueLabelRecord<RawStr<8>, String> {
+        let mut labels = Vec::with_capacity(self.labels.len());
+        for raw in &self.labels {
+            labels.push(ValueLabel {
+                value: raw.value,
+                label: decoder.decode(&raw.label).to_string(),
+            });
+        }
+        ValueLabelRecord {
+            offsets: self.offsets.clone(),
+            labels,
+            dict_indexes: self.dict_indexes.clone(),
+            var_type: self.var_type,
+        }
     }
}
#[derive(Clone, Debug)]
-pub struct DocumentRecord {
- /// Offset from the start of the file to the start of the record.
- pub pos: u64,
+pub struct DocumentRecord<S>
+where
+ S: Debug,
+{
+ pub offsets: Range<u64>,
- /// The document, as an array of 80-byte lines.
- pub lines: Vec<DocumentLine>,
+ /// The document, as an array of lines. Raw lines are exactly 80 bytes long
+ /// and are right-padded with spaces without any new-line termination.
+ pub lines: Vec<S>,
}
-pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>;
+pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;
-impl DocumentRecord {
- /// Length of a line in a document. Document lines are fixed-length and
- /// padded on the right with spaces.
- pub const LINE_LEN: usize = 80;
+/// Length of a line in a document. Document lines are fixed-length and
+/// padded on the right with spaces.
+pub const DOC_LINE_LEN: usize = 80;
+impl DocumentRecord<RawDocumentLine> {
/// Maximum number of lines we will accept in a document. This is simply
/// the maximum number that will fit in a 32-bit space.
- pub const MAX_LINES: usize = i32::MAX as usize / Self::LINE_LEN;
+ pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<DocumentRecord, Error> {
- let offset = r.stream_position()?;
+ fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
+ let start_offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
let n = n as usize;
if n > Self::MAX_LINES {
Err(Error::BadDocumentLength {
- offset,
+ offset: start_offset,
n,
max: Self::MAX_LINES,
})
} else {
- let pos = r.stream_position()?;
let mut lines = Vec::with_capacity(n);
for _ in 0..n {
- lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
+ lines.push(RawStr(read_bytes(r)?));
}
- Ok(DocumentRecord { pos, lines })
+ let end_offset = r.stream_position()?;
+ Ok(Record::Document(DocumentRecord {
+ offsets: start_offset..end_offset,
+ lines,
+ }))
}
}
+
+ pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+ DecodedRecord::Document(DocumentRecord {
+ offsets: self.offsets.clone(),
+ lines: self
+ .lines
+ .iter()
+ .map(|s| decoder.decode_slice(&s.0).to_string())
+ .collect(),
+ })
+ }
}
-trait ExtensionRecord
+impl<S> Header for DocumentRecord<S>
where
- Self: Sized,
+ S: Debug,
{
+ fn offsets(&self) -> Range<u64> {
+ self.offsets.clone()
+ }
+}
+
+trait ExtensionRecord {
const SUBTYPE: u32;
const SIZE: Option<u32>;
const COUNT: Option<u32>;
const NAME: &'static str;
- fn parse(ext: &Extension, endian: Endian, warn: impl Fn(Error)) -> Result<Self, Error>;
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning>;
}
#[derive(Clone, Debug)]
pub struct IntegerInfoRecord {
+ pub offsets: Range<u64>,
pub version: (i32, i32, i32),
pub machine_code: i32,
pub floating_point_rep: i32,
const COUNT: Option<u32> = Some(8);
const NAME: &'static str = "integer record";
- fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
let data: Vec<i32> = (0..8)
.map(|_| endian.parse(read_bytes(&mut input).unwrap()))
.collect();
- Ok(IntegerInfoRecord {
+ Ok(Record::IntegerInfo(IntegerInfoRecord {
+ offsets: ext.offsets.clone(),
version: (data[0], data[1], data[2]),
machine_code: data[3],
floating_point_rep: data[4],
compression_code: data[5],
endianness: data[6],
character_code: data[7],
- })
+ }))
}
}
const COUNT: Option<u32> = Some(3);
const NAME: &'static str = "floating point record";
- fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
let data: Vec<f64> = (0..3)
.map(|_| endian.parse(read_bytes(&mut input).unwrap()))
.collect();
- Ok(FloatInfoRecord {
+ Ok(Record::FloatInfo(FloatInfoRecord {
sysmis: data[0],
highest: data[1],
lowest: data[2],
- })
+ }))
}
}
+/// Value of the `labels` field of
+/// [`MultipleResponseType::MultipleDichotomy`], as chosen by
+/// `MultipleResponseType::parse`.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum CategoryLabels {
+    /// Chosen for `D` sets, and for `E` sets whose flag field is ` 11 `.
+    VarLabels,
+    /// Chosen for `E` sets whose flag field is ` 1 `.
+    CountedValues,
+}
+
#[derive(Clone, Debug)]
pub enum MultipleResponseType {
MultipleDichotomy {
- value: UnencodedString,
+ value: RawString,
labels: CategoryLabels,
},
MultipleCategory,
}
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet {
- pub name: UnencodedString,
- pub label: UnencodedString,
- pub mr_type: MultipleResponseType,
- pub vars: Vec<UnencodedString>,
-}
-impl MultipleResponseSet {
- fn parse(input: &[u8]) -> Result<(MultipleResponseSet, &[u8]), Error> {
- let Some(equals) = input.iter().position(|&b| b == b'=') else {
- return Err(Error::TBD);
- };
- let (name, input) = input.split_at(equals);
- let (mr_type, input) = match input.get(0) {
- Some(b'C') => (MultipleResponseType::MultipleCategory, &input[1..]),
- Some(b'D') => {
- let (value, input) = parse_counted_string(&input[1..])?;
+impl MultipleResponseType {
+ fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
+ let (mr_type, input) = match input.split_first() {
+ Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
+ Some((b'D', input)) => {
+ let (value, input) = parse_counted_string(input)?;
(
MultipleResponseType::MultipleDichotomy {
- value: value.into(),
+ value,
labels: CategoryLabels::VarLabels,
},
input,
)
}
- Some(b'E') => {
- let Some(b' ') = input.get(1) else {
- return Err(Error::TBD);
- };
- let input = &input[2..];
+ Some((b'E', input)) => {
let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
(CategoryLabels::CountedValues, rest)
} else if let Some(rest) = input.strip_prefix(b" 11 ") {
(CategoryLabels::VarLabels, rest)
} else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let (value, input) = parse_counted_string(input)?;
(
- MultipleResponseType::MultipleDichotomy {
- value: value.into(),
- labels,
- },
+ MultipleResponseType::MultipleDichotomy { value, labels },
input,
)
}
- _ => return Err(Error::TBD),
+ _ => return Err(Warning::TBD),
};
- let Some(b' ') = input.get(0) else {
- return Err(Error::TBD);
+ Ok((mr_type, input))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet<I, S>
+where
+ I: Debug,
+ S: Debug,
+{
+ pub name: I,
+ pub label: S,
+ pub mr_type: MultipleResponseType,
+ pub short_names: Vec<I>,
+}
+
+impl MultipleResponseSet<RawString, RawString> {
+    /// Parses one multiple response set from the start of `input`.
+    ///
+    /// The layout consumed is `NAME=TYPE LABEL VAR VAR...` terminated by a
+    /// newline, where `TYPE` is handled by [`MultipleResponseType::parse`],
+    /// `LABEL` is a counted string (see `parse_counted_string`), and each
+    /// variable name is preceded by a single space.  Empty names, produced by
+    /// consecutive spaces, are skipped.
+    ///
+    /// Returns the set and the rest of `input`, with the newlines that follow
+    /// the set skipped.  Any syntax error yields `Warning::TBD`.
+    fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
         let Some(equals) = input.iter().position(|&b| b == b'=') else {
+            return Err(Warning::TBD);
         };
-        let (label, mut input) = parse_counted_string(&input[1..])?;
+        // `equals` indexes the `=` that separates the set name from its
+        // definition.  The `=` itself must be dropped: the type parser
+        // expects the type code (`C`, `D`, or `E`) as the very first byte and
+        // would otherwise reject every set.
+        let (name, input) = (&input[..equals], &input[equals + 1..]);
+        let (mr_type, input) = MultipleResponseType::parse(input)?;
+        let Some(input) = input.strip_prefix(b" ") else {
+            return Err(Warning::TBD);
+        };
+        let (label, mut input) = parse_counted_string(input)?;
         let mut vars = Vec::new();
-        while input.get(0) == Some(&b' ') {
-            input = &input[1..];
-            let Some(length) = input.iter().position(|b| b" \n".contains(b)) else {
-                return Err(Error::TBD);
-            };
-            if length > 0 {
-                vars.push(input[..length].into());
+        // Each variable name is introduced by a space and runs up to the next
+        // space or newline; the list ends at the first newline.
+        while input.first() != Some(&b'\n') {
+            match input.split_first() {
+                Some((b' ', rest)) => {
+                    let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
+                        return Err(Warning::TBD);
+                    };
+                    let (var, rest) = rest.split_at(length);
+                    if !var.is_empty() {
+                        vars.push(var.into());
+                    }
+                    input = rest;
+                }
+                _ => return Err(Warning::TBD),
+            }
-            input = &input[length..];
-        }
-        if input.get(0) != Some(&b'\n') {
-            return Err(Error::TBD);
         }
-        while input.get(0) == Some(&b'\n') {
+        while input.first() == Some(&b'\n') {
             input = &input[1..];
         }
         Ok((
             MultipleResponseSet {
                 name: name.into(),
-                label: label.into(),
+                label,
                 mr_type,
-                vars,
+                short_names: vars,
             },
             input,
         ))
     }
+
+    /// Decodes the set's name, label, and member variable names with
+    /// `decoder`.
+    ///
+    /// A member variable name that is not a valid identifier is reported
+    /// through `decoder.warn` as `Warning::InvalidMrSetVariableName` and left
+    /// out of the result.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Warning::InvalidMrSetName` if the set's own name is not a
+    /// valid identifier.
+    fn decode(
+        &self,
+        decoder: &Decoder,
+    ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
+        let mut short_names = Vec::with_capacity(self.short_names.len());
+        for short_name in self.short_names.iter() {
+            // These are the names of the variables in the set, so a bad one
+            // is a bad *variable* name.
+            if let Some(short_name) = decoder
+                .decode_identifier(short_name)
+                .map_err(Warning::InvalidMrSetVariableName)
+                .issue_warning(&decoder.warn)
+            {
+                short_names.push(short_name);
+            }
+        }
+        Ok(MultipleResponseSet {
+            // This is the name of the set itself.
+            name: decoder
+                .decode_identifier(&self.name)
+                .map_err(Warning::InvalidMrSetName)?,
+            label: decoder.decode(&self.label).to_string(),
+            mr_type: self.mr_type.clone(),
+            short_names,
+        })
+    }
}
#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord(Vec<MultipleResponseSet>);
+pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
+where
+ I: Debug,
+ S: Debug;
-impl ExtensionRecord for MultipleResponseRecord {
+impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
const SUBTYPE: u32 = 7;
const SIZE: Option<u32> = Some(1);
const COUNT: Option<u32> = None;
const NAME: &'static str = "multiple response set record";
- fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+ fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
sets.push(set);
input = rest;
}
- Ok(MultipleResponseRecord(sets))
+ Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
}
}
-fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
+impl MultipleResponseRecord<RawString, RawString> {
+    /// Decodes every set in the record.  A set that fails to decode is
+    /// reported through `decoder.warn` and left out of the result.
+    fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        let sets: Vec<_> = self
+            .0
+            .iter()
+            .filter_map(|raw| raw.decode(decoder).issue_warning(&decoder.warn))
+            .collect();
+        DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
+    }
+}
+
+fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
let Some(space) = input.iter().position(|&b| b == b' ') else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let Ok(length) = from_utf8(&input[..space]) else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let Ok(length): Result<usize, _> = length.parse() else {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let input = &input[space + 1..];
if input.len() < length {
- return Err(Error::TBD);
+ return Err(Warning::TBD);
};
let (string, rest) = input.split_at(length);
Ok((string.into(), rest))
}
+/// Measurement level of a variable, as stored in a variable display record.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Measure {
+    /// Raw code 1.
+    Nominal,
+    /// Raw code 2.
+    Ordinal,
+    /// Raw code 3.
+    Scale,
+}
+
+impl Measure {
+    /// Returns the default measure for a variable of `var_type`:
+    /// [`Measure::Nominal`] for strings, none for numeric variables.
+    pub fn default_for_type(var_type: VarType) -> Option<Measure> {
+        match var_type {
+            VarType::String => Some(Self::Nominal),
+            VarType::Numeric => None,
+        }
+    }
+
+    /// Converts a raw code into a measure.  Code 0 means "unspecified" and
+    /// yields `Ok(None)`; any code above 3 yields
+    /// `Warning::InvalidMeasurement`.
+    fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
+        let measure = match source {
+            0 => None,
+            1 => Some(Measure::Nominal),
+            2 => Some(Measure::Ordinal),
+            3 => Some(Measure::Scale),
+            _ => return Err(Warning::InvalidMeasurement(source)),
+        };
+        Ok(measure)
+    }
+}
+
+/// Display alignment of a variable, as stored in a variable display record.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Alignment {
+    /// Raw code 1.
+    Left,
+    /// Raw code 2.
+    Right,
+    /// Raw code 3.
+    Center,
+}
+
+impl Alignment {
+    /// Converts a raw code into an alignment.  Code 0 means "unspecified" and
+    /// yields `Ok(None)`; any code above 3 yields
+    /// `Warning::InvalidAlignment`.
+    fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
+        let alignment = match source {
+            0 => None,
+            1 => Some(Alignment::Left),
+            2 => Some(Alignment::Right),
+            3 => Some(Alignment::Center),
+            _ => return Err(Warning::InvalidAlignment(source)),
+        };
+        Ok(alignment)
+    }
+
+    /// Returns the default alignment for a variable of `var_type`: left for
+    /// strings, right for numeric variables.
+    pub fn default_for_type(var_type: VarType) -> Self {
+        match var_type {
+            VarType::String => Self::Left,
+            VarType::Numeric => Self::Right,
+        }
+    }
+}
+
+/// Display settings for one variable, as parsed by
+/// `VarDisplayRecord::parse`.
+#[derive(Clone, Debug)]
+pub struct VarDisplay {
+    /// Measurement level; `None` when the raw code is 0 or was rejected with
+    /// a warning.
+    pub measure: Option<Measure>,
+    /// Display width; `None` when the record stores only two values per
+    /// variable instead of three.
+    pub width: Option<u32>,
+    /// Alignment; `None` when the raw code is 0 or was rejected with a
+    /// warning.
+    pub alignment: Option<Alignment>,
+}
+
#[derive(Clone, Debug)]
-pub struct VarDisplayRecord(pub Vec<u32>);
+pub struct VarDisplayRecord(pub Vec<VarDisplay>);
-impl ExtensionRecord for VarDisplayRecord {
+impl VarDisplayRecord {
const SUBTYPE: u32 = 11;
- const SIZE: Option<u32> = Some(4);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "variable display record";
- fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
- ext.check_size::<Self>()?;
+ fn parse(
+ ext: &Extension,
+ n_vars: usize,
+ endian: Endian,
+ warn: &dyn Fn(Warning),
+ ) -> Result<Record, Warning> {
+ if ext.size != 4 {
+ return Err(Warning::BadRecordSize {
+ offset: ext.offsets.start,
+ record: String::from("variable display record"),
+ size: ext.size,
+ expected_size: 4,
+ });
+ }
+ let has_width = if ext.count as usize == 3 * n_vars {
+ true
+ } else if ext.count as usize == 2 * n_vars {
+ false
+ } else {
+ return Err(Warning::TBD);
+ };
+
+ let mut var_displays = Vec::new();
let mut input = &ext.data[..];
- let display = (0..ext.count)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(VarDisplayRecord(display))
+ for _ in 0..n_vars {
+ let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
+ .issue_warning(&warn)
+ .flatten();
+ let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
+ let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
+ .issue_warning(&warn)
+ .flatten();
+ var_displays.push(VarDisplay {
+ measure,
+ width,
+ alignment,
+ });
+ }
+ Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
}
}
-pub struct LongStringMissingValues {
+#[derive(Clone, Debug)]
+pub struct LongStringMissingValues<N, V>
+where
+ N: Debug,
+ V: Debug,
+{
/// Variable name.
- pub var_name: UnencodedString,
+ pub var_name: N,
/// Missing values.
- pub missing_values: MissingValues,
+ pub missing_values: MissingValues<V>,
}
-pub struct LongStringMissingValueSet(Vec<LongStringMissingValues>);
+impl LongStringMissingValues<RawString, RawStr<8>> {
+    /// Decodes the variable name as an identifier and the missing values as
+    /// strings, using `decoder`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the identifier error if the variable name is not a valid
+    /// identifier.
+    fn decode(
+        &self,
+        decoder: &Decoder,
+    ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
+        let var_name = decoder.decode_identifier(&self.var_name)?;
+        let missing_values = self.missing_values.decode(decoder);
+        Ok(LongStringMissingValues {
+            var_name,
+            missing_values,
+        })
+    }
+}
-impl ExtensionRecord for LongStringMissingValueSet {
+#[derive(Clone, Debug)]
+pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
+where
+ N: Debug,
+ V: Debug;
+
+impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
const SUBTYPE: u32 = 22;
const SIZE: Option<u32> = Some(1);
const COUNT: Option<u32> = None;
const NAME: &'static str = "long string missing values record";
- fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
let value_len: u32 = endian.parse(read_bytes(&mut input)?);
if value_len != 8 {
- let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offset;
- return Err(Error::BadLongMissingValueLength {
- record_offset: ext.offset,
+ let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
+ return Err(Warning::BadLongMissingValueLength {
+ record_offset: ext.offsets.start,
offset,
value_len,
});
} else {
value
};
- values.push(Value::String(UnencodedStr(value)));
+ values.push(Value::String(RawStr(value)));
}
let missing_values = MissingValues {
values,
missing_values,
});
}
- Ok(LongStringMissingValueSet(missing_value_set))
+ Ok(Record::LongStringMissingValues(
+ LongStringMissingValueRecord(missing_value_set),
+ ))
+ }
+}
+
+impl LongStringMissingValueRecord<RawString, RawStr<8>> {
+    /// Decodes every entry in the record.  An entry whose variable name is
+    /// not a valid identifier is reported through `decoder.warn` and left out
+    /// of the result.
+    pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord<Identifier, String> {
+        let mvs: Vec<_> = self
+            .0
+            .iter()
+            .filter_map(|raw| {
+                raw.decode(decoder)
+                    .map_err(Warning::InvalidLongStringMissingValueVariableName)
+                    .issue_warning(&decoder.warn)
+            })
+            .collect();
+        LongStringMissingValueRecord(mvs)
     }
 }
const COUNT: Option<u32> = None;
const NAME: &'static str = "encoding record";
- fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+ fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
- Ok(EncodingRecord(
- String::from_utf8(ext.data.clone())
- .map_err(|_| Error::BadEncodingName { offset: ext.offset })?,
- ))
+ Ok(Record::Encoding(EncodingRecord(
+ String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
+ offset: ext.offsets.start,
+ })?,
+ )))
}
}
const COUNT: Option<u32> = Some(2);
const NAME: &'static str = "extended number of cases record";
- fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
let one = endian.parse(read_bytes(&mut input)?);
let n_cases = endian.parse(read_bytes(&mut input)?);
- Ok(NumberOfCasesRecord { one, n_cases })
+ Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
}
}
+/// An extension record whose payload is text, stored as a `RawString` until
+/// it is decoded.
#[derive(Clone, Debug)]
pub struct TextRecord {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
+ /// Range of file offsets taken from the `Extension` this record was built
+ /// from.
+ pub offsets: Range<u64>,
+
+ /// Type of record.
+ pub rec_type: TextRecordType,
/// The text content of the record.
- pub text: UnencodedString,
+ pub text: RawString,
+}
+
+/// Identifies which kind of text extension record a `TextRecord` holds, and
+/// therefore how `TextRecord::decode` interprets its text.
+#[derive(Clone, Copy, Debug)]
+pub enum TextRecordType {
+ /// Built from extension subtype 5.
+ VariableSets,
+ /// Built from extension subtype 10.
+ ProductInfo,
+ /// Built from extension subtype 13.
+ LongNames,
+ /// Built from extension subtype 14.
+ VeryLongStrings,
+ /// Built from extension subtype 17.
+ FileAttributes,
+ /// Built from extension subtype 18.
+ VariableAttributes,
+}
+
+impl TextRecord {
+    /// Wraps `extension` as a text record of kind `rec_type`, taking over its
+    /// offsets and converting its data into the record's raw text.
+    fn new(extension: Extension, rec_type: TextRecordType) -> Self {
+        let Extension { offsets, data, .. } = extension;
+        Self {
+            offsets,
+            rec_type,
+            text: data.into(),
+        }
+    }
+
+    /// Decodes this record with `decoder`, producing the `DecodedRecord`
+    /// variant that matches `rec_type`.
+    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        let record = &self;
+        match record.rec_type {
+            TextRecordType::VariableSets => {
+                let decoded = VariableSetRecord::decode(record, decoder);
+                DecodedRecord::VariableSets(decoded)
+            }
+            TextRecordType::ProductInfo => {
+                let decoded = ProductInfoRecord::decode(record, decoder);
+                DecodedRecord::ProductInfo(decoded)
+            }
+            TextRecordType::LongNames => {
+                let decoded = LongNamesRecord::decode(record, decoder);
+                DecodedRecord::LongNames(decoded)
+            }
+            TextRecordType::VeryLongStrings => {
+                let decoded = VeryLongStringsRecord::decode(record, decoder);
+                DecodedRecord::VeryLongStrings(decoded)
+            }
+            TextRecordType::FileAttributes => {
+                let decoded = FileAttributeRecord::decode(record, decoder);
+                DecodedRecord::FileAttributes(decoded)
+            }
+            TextRecordType::VariableAttributes => {
+                let decoded = VariableAttributeRecord::decode(record, decoder);
+                DecodedRecord::VariableAttributes(decoded)
+            }
+        }
+    }
+}
+
+/// One `short_name=length` entry parsed from a very long strings record.
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+    pub short_name: Identifier,
+    pub length: u16,
+}
+
+impl VeryLongString {
+    /// Parses a single `short_name=length` pair.
+    ///
+    /// Fails with `Warning::TBD` when there is no `=` or the length does not
+    /// parse as a `u16`, and with `Warning::InvalidLongStringName` when the
+    /// name is rejected by `decoder.new_identifier`.
+    fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
+        let (name, digits) = input.split_once('=').ok_or(Warning::TBD)?;
+        let short_name = decoder
+            .new_identifier(name)
+            .map_err(Warning::InvalidLongStringName)?;
+        let length = digits.parse().map_err(|_| Warning::TBD)?;
+        Ok(VeryLongString { short_name, length })
+    }
+}
+
+/// The decoded entries of a very long strings text record.
+#[derive(Clone, Debug)]
+pub struct VeryLongStringsRecord(Vec<VeryLongString>);
+
+impl VeryLongStringsRecord {
+    /// Decodes `source` into its list of entries.
+    ///
+    /// The text is split on NUL characters, trailing tabs are removed from
+    /// each piece, and empty pieces are ignored. Pieces that fail to parse
+    /// are reported through `decoder.warn` and skipped.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let text = decoder.decode(&source.text);
+        let very_long_strings = text
+            .split('\0')
+            .map(|entry| entry.trim_end_matches('\t'))
+            .filter(|entry| !entry.is_empty())
+            .filter_map(|entry| {
+                VeryLongString::parse(decoder, entry).issue_warning(&decoder.warn)
+            })
+            .collect();
+        VeryLongStringsRecord(very_long_strings)
+    }
+}
+
+/// A named attribute together with its list of values.
+#[derive(Clone, Debug)]
+pub struct Attribute {
+    pub name: Identifier,
+    pub values: Vec<String>,
+}
+
+impl Attribute {
+    /// Parses one attribute from the front of `input`.
+    ///
+    /// The expected shape is a name, `(`, one or more newline-terminated
+    /// values, and a closing `)`. A value wrapped in single quotes has the
+    /// quotes removed; any other value triggers a warning and is kept as is.
+    ///
+    /// On success returns the attribute and the text following the `)`.
+    /// Fails with `Warning::TBD` if the `(` or a terminating newline is
+    /// missing, and with `Warning::InvalidAttributeName` for a bad name.
+    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
+        let (name, mut remaining) = input.split_once('(').ok_or(Warning::TBD)?;
+        let name = decoder
+            .new_identifier(name)
+            .map_err(Warning::InvalidAttributeName)?;
+        let mut values = Vec::new();
+        loop {
+            let (raw, after) = remaining.split_once('\n').ok_or(Warning::TBD)?;
+            let unquoted = raw
+                .strip_prefix('\'')
+                .and_then(|inner| inner.strip_suffix('\''));
+            let value = match unquoted {
+                Some(inner) => inner,
+                None => {
+                    // Not properly quoted: warn, but keep the text verbatim.
+                    decoder.warn(Warning::TBD);
+                    raw
+                }
+            };
+            values.push(value.into());
+            match after.strip_prefix(')') {
+                Some(tail) => return Ok((Attribute { name, values }, tail)),
+                None => remaining = after,
+            }
+        }
+    }
+}
+
+/// A collection of attributes, mapping each attribute name to its values.
+#[derive(Clone, Debug, Default)]
+pub struct AttributeSet(pub HashMap<Identifier, Vec<String>>);
+
+impl AttributeSet {
+    /// Parses consecutive attributes from the front of `input`.
+    ///
+    /// Parsing stops at the end of the input or, when `sentinel` is
+    /// `Some(c)`, at the first position where the next character is `c`;
+    /// the sentinel itself is consumed. Returns the set together with the
+    /// unparsed remainder of the input.
+    ///
+    /// If the same attribute name occurs more than once, the later
+    /// occurrence replaces the earlier one.
+    ///
+    /// Any failure from `Attribute::parse` is propagated unchanged.
+    fn parse<'a>(
+        decoder: &Decoder,
+        mut input: &'a str,
+        sentinel: Option<char>,
+    ) -> Result<(AttributeSet, &'a str), Warning> {
+        let mut attributes = HashMap::new();
+        let rest = loop {
+            match input.chars().next() {
+                None => break input,
+                // Skip the sentinel by its encoded length so that slicing
+                // always lands on a character boundary, whatever the
+                // sentinel is.
+                Some(c) if Some(c) == sentinel => break &input[c.len_utf8()..],
+                Some(_) => {
+                    let (attribute, rest) = Attribute::parse(decoder, input)?;
+                    // XXX report duplicate name
+                    attributes.insert(attribute.name, attribute.values);
+                    input = rest;
+                }
+            }
+        };
+        Ok((AttributeSet(attributes), rest))
+    }
+}
+
+/// The attribute set decoded from a file attributes text record.
+#[derive(Clone, Debug, Default)]
+pub struct FileAttributeRecord(pub AttributeSet);
+
+impl FileAttributeRecord {
+    /// Decodes `source` into a set of attributes.
+    ///
+    /// If parsing fails, the warning is reported through `decoder.warn` and
+    /// an empty record is returned. Text left over after the attribute set
+    /// also produces a warning.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let text = decoder.decode(&source.text);
+        let Some((set, leftover)) =
+            AttributeSet::parse(decoder, &text, None).issue_warning(&decoder.warn)
+        else {
+            return FileAttributeRecord::default();
+        };
+        if !leftover.is_empty() {
+            decoder.warn(Warning::TBD);
+        }
+        FileAttributeRecord(set)
+    }
+}
+
+/// The attributes attached to one variable, identified by its long name.
+#[derive(Clone, Debug)]
+pub struct VarAttributeSet {
+    pub long_var_name: Identifier,
+    pub attributes: AttributeSet,
+}
+
+impl VarAttributeSet {
+    /// Parses `name:` followed by an attribute set that ends at a `/` or at
+    /// the end of the input.
+    ///
+    /// Returns the parsed set and the remaining input. Fails with
+    /// `Warning::TBD` when the `:` is missing, with
+    /// `Warning::InvalidAttributeVariableName` for a bad name, or with
+    /// whatever `AttributeSet::parse` reports.
+    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> {
+        let (name, after_name) = input.split_once(':').ok_or(Warning::TBD)?;
+        let long_var_name = decoder
+            .new_identifier(name)
+            .map_err(Warning::InvalidAttributeVariableName)?;
+        let (attributes, rest) = AttributeSet::parse(decoder, after_name, Some('/'))?;
+        Ok((
+            VarAttributeSet {
+                long_var_name,
+                attributes,
+            },
+            rest,
+        ))
+    }
+}
+
+/// The per-variable attribute sets decoded from a variable attributes text
+/// record.
+#[derive(Clone, Debug)]
+pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
+
+impl VariableAttributeRecord {
+    /// Decodes `source` into a list of per-variable attribute sets.
+    ///
+    /// Sets are parsed one after another until the text is exhausted. The
+    /// first parse failure is reported through `decoder.warn` and ends
+    /// parsing; the sets parsed up to that point are kept.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let decoded = decoder.decode(&source.text);
+        let mut remaining = decoded.as_ref();
+        let mut sets = Vec::new();
+        while !remaining.is_empty() {
+            match VarAttributeSet::parse(decoder, remaining).issue_warning(&decoder.warn) {
+                Some((set, rest)) => {
+                    sets.push(set);
+                    remaining = rest;
+                }
+                None => break,
+            }
+        }
+        VariableAttributeRecord(sets)
+    }
+}
+
+/// One `short_name=long_name` pair parsed from a long names record.
+#[derive(Clone, Debug)]
+pub struct LongName {
+    pub short_name: Identifier,
+    pub long_name: Identifier,
+}
+
+impl LongName {
+    /// Parses a single `short_name=long_name` pair.
+    ///
+    /// Fails with `Warning::TBD` when there is no `=`, and with
+    /// `Warning::InvalidShortName` or `Warning::InvalidLongName` when the
+    /// corresponding name is rejected by `decoder.new_identifier`. The
+    /// short name is checked first.
+    fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+        let (short, long) = input.split_once('=').ok_or(Warning::TBD)?;
+        Ok(LongName {
+            short_name: decoder
+                .new_identifier(short)
+                .map_err(Warning::InvalidShortName)?,
+            long_name: decoder
+                .new_identifier(long)
+                .map_err(Warning::InvalidLongName)?,
+        })
+    }
}
-impl From<Extension> for TextRecord {
- fn from(source: Extension) -> Self {
- TextRecord { offset: source.offset, text: source.data.into() }
+/// The name pairs decoded from a long names text record.
+#[derive(Clone, Debug)]
+pub struct LongNamesRecord(Vec<LongName>);
+
+impl LongNamesRecord {
+    /// Decodes `source` into its list of name pairs.
+    ///
+    /// The text is split on tabs and empty pieces are ignored. Pieces that
+    /// fail to parse are reported through `decoder.warn` and skipped.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let text = decoder.decode(&source.text);
+        let names = text
+            .split('\t')
+            .filter(|pair| !pair.is_empty())
+            .filter_map(|pair| LongName::parse(pair, decoder).issue_warning(&decoder.warn))
+            .collect();
+        LongNamesRecord(names)
+    }
+}
+
+/// The decoded text of a product info record.
+#[derive(Clone, Debug)]
+pub struct ProductInfoRecord(pub String);
+
+impl ProductInfoRecord {
+    /// Decodes the whole text of `source` into an owned string.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+        let text = decoder.decode(&source.text);
+        Self(text.into())
+    }
+}
+/// A named set of variables parsed from one line of a variable sets record.
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+    pub name: String,
+    pub vars: Vec<Identifier>,
+}
+
+impl VariableSet {
+    /// Parses a `name=var var ...` line.
+    ///
+    /// Fails with `Warning::TBD` when there is no `=`. Variable names are
+    /// separated by ASCII whitespace; a name rejected by
+    /// `decoder.new_identifier` is reported through `decoder.warn` as
+    /// `Warning::InvalidVariableSetName` and left out of the set.
+    fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+        let Some((name, members)) = input.split_once('=') else {
+            return Err(Warning::TBD);
+        };
+        let vars = members
+            .split_ascii_whitespace()
+            .filter_map(|var| {
+                decoder
+                    .new_identifier(var)
+                    .map_err(Warning::InvalidVariableSetName)
+                    .issue_warning(&decoder.warn)
+            })
+            .collect();
+        Ok(VariableSet {
+            name: name.into(),
+            vars,
+        })
+    }
+}
+
+/// The variable sets decoded from a variable sets text record, along with
+/// the offsets of the record they came from.
+#[derive(Clone, Debug)]
+pub struct VariableSetRecord {
+    pub offsets: Range<u64>,
+    pub sets: Vec<VariableSet>,
+}
+
+impl VariableSetRecord {
+    /// Decodes `source`, treating each line of its text as one variable set.
+    ///
+    /// Lines that fail to parse are reported through `decoder.warn` and
+    /// skipped.
+    fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
+        let text = decoder.decode(&source.text);
+        let sets = text
+            .lines()
+            .filter_map(|line| VariableSet::parse(line, decoder).issue_warning(&decoder.warn))
+            .collect();
+        VariableSetRecord {
+            offsets: source.offsets.clone(),
+            sets,
+        }
+    }
+}
+
+/// Turns a `Result<T, Warning>` into an `Option<T>` by reporting the warning.
+trait IssueWarning<T> {
+    /// Returns the success value, or passes the warning to `warn` and
+    /// returns `None`.
+    fn issue_warning<F>(self, warn: &F) -> Option<T>
+    where
+        F: Fn(Warning);
+}
+impl<T> IssueWarning<T> for Result<T, Warning> {
+    fn issue_warning<F>(self, warn: &F) -> Option<T>
+    where
+        F: Fn(Warning),
+    {
+        // Report the warning (if any), then discard the unit error.
+        self.map_err(|warning| warn(warning)).ok()
}
}
#[derive(Clone, Debug)]
pub struct Extension {
- /// Offset from the start of the file to the start of the record.
- pub offset: u64,
+ /// File offsets of the record's data, from its first byte to just past
+ /// its last byte.
+ pub offsets: Range<u64>,
/// Record subtype.
pub subtype: u32,
}
impl Extension {
- fn check_size<E: ExtensionRecord>(&self) -> Result<(), Error> {
+ fn check_size<E: ExtensionRecord>(&self) -> Result<(), Warning> {
if let Some(expected_size) = E::SIZE {
if self.size != expected_size {
- return Err(Error::BadRecordSize {
- offset: self.offset,
+ return Err(Warning::BadRecordSize {
+ offset: self.offsets.start,
record: E::NAME.into(),
size: self.size,
expected_size,
}
if let Some(expected_count) = E::COUNT {
if self.count != expected_count {
- return Err(Error::BadRecordCount {
- offset: self.offset,
+ return Err(Warning::BadRecordCount {
+ offset: self.offsets.start,
record: E::NAME.into(),
count: self.count,
expected_count,
Ok(())
}
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
+ /// Reads an extension record's subtype, `size`, `count`, and data from `r`
+ /// and parses it according to its subtype.
+ ///
+ /// `n_vars` and `warn` are forwarded to `VarDisplayRecord::parse`.
+ ///
+ /// Returns `Ok(Some(record))` on success. If parsing the record's content
+ /// produces a `Warning`, it is passed to `warn` and `Ok(None)` is returned.
+ /// Returns `Err` if reading from `r` fails or if `size * count` overflows
+ /// a `u32`.
+ fn read<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ n_vars: usize,
+ warn: &dyn Fn(Warning),
+ ) -> Result<Option<Record>, Error> {
let subtype = endian.parse(read_bytes(r)?);
- let offset = r.stream_position()?;
+ // Position just after the subtype field; reported if the record is too
+ // large.
+ let header_offset = r.stream_position()?;
let size: u32 = endian.parse(read_bytes(r)?);
let count = endian.parse(read_bytes(r)?);
let Some(product) = size.checked_mul(count) else {
return Err(Error::ExtensionRecordTooLarge {
- offset,
+ offset: header_offset,
subtype,
size,
count,
});
};
- let offset = r.stream_position()?;
+ // The data occupies `product` bytes starting at the current position.
+ let start_offset = r.stream_position()?;
let data = read_vec(r, product as usize)?;
+ let end_offset = start_offset + product as u64;
let extension = Extension {
- offset,
+ offsets: start_offset..end_offset,
subtype,
size,
count,
data,
};
- match subtype {
- IntegerInfoRecord::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfoRecord::parse(
- &extension,
- endian,
- |_| (),
- )?)),
- FloatInfoRecord::SUBTYPE => Ok(Record::FloatInfo(FloatInfoRecord::parse(
- &extension,
- endian,
- |_| (),
- )?)),
- VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse(
- &extension,
- endian,
- |_| (),
- )?)),
- MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse(
- MultipleResponseRecord::parse(&extension, endian, |_| ())?,
- )),
- LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels(
- LongStringValueLabelRecord::parse(&extension, endian, |_| ())?,
- )),
- EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse(
- &extension,
- endian,
- |_| (),
- )?)),
- NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse(
- &extension,
- endian,
- |_| (),
- )?)),
- 5 => Ok(Record::VariableSets(extension.into())),
- 10 => Ok(Record::ProductInfo(extension.into())),
- 13 => Ok(Record::LongNames(extension.into())),
- 14 => Ok(Record::LongStrings(extension.into())),
- 17 => Ok(Record::FileAttributes(extension.into())),
- 18 => Ok(Record::VariableAttributes(extension.into())),
+ // Each arm yields `Result<Record, Warning>`; text records and unknown
+ // subtypes are wrapped without further parsing.
+ let result = match subtype {
+ IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
+ FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
+ VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
+ MultipleResponseRecord::SUBTYPE | 19 => {
+ MultipleResponseRecord::parse(&extension, endian)
+ }
+ LongStringValueLabelRecord::SUBTYPE => {
+ LongStringValueLabelRecord::parse(&extension, endian)
+ }
+ EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
+ NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
+ 5 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::VariableSets,
+ ))),
+ 10 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::ProductInfo,
+ ))),
+ 13 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::LongNames,
+ ))),
+ 14 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::VeryLongStrings,
+ ))),
+ 17 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::FileAttributes,
+ ))),
+ 18 => Ok(Record::Text(TextRecord::new(
+ extension,
+ TextRecordType::VariableAttributes,
+ ))),
_ => Ok(Record::OtherExtension(extension)),
+ };
+ // A warning drops this record without failing the overall read.
+ match result {
+ Ok(result) => Ok(Some(result)),
+ Err(error) => {
+ warn(error);
+ Ok(None)
+ }
}
}
}
Ok(vec)
}
-fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
+/// Reads a string stored as a 32-bit length (interpreted with `endian`)
+/// followed by that many bytes, and returns the bytes as a `RawString`.
+fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
let length: u32 = endian.parse(read_bytes(r)?);
Ok(read_vec(r, length as usize)?.into())
}
#[derive(Clone, Debug)]
-pub struct LongStringValueLabels {
- pub var_name: UnencodedString,
+/// One set of long string value labels. `N` is the type of `var_name` and
+/// `S` is the type of each value and each label in `labels`.
+pub struct LongStringValueLabels<N, S>
+where
+ S: Debug,
+{
+ pub var_name: N,
pub width: u32,
/// `(value, label)` pairs, where each value is `width` bytes.
- pub labels: Vec<(UnencodedString, UnencodedString)>,
+ pub labels: Vec<(S, S)>,
+}
+
+impl LongStringValueLabels<RawString, RawString> {
+    /// Decodes the variable name, values, and labels with `decoder`.
+    ///
+    /// The decoded variable name has trailing whitespace removed before it
+    /// is turned into an `Identifier`; a rejected name yields
+    /// `Warning::InvalidLongStringValueLabelName`. Values are decoded with
+    /// `decode_exact_length`, labels with `decode`. `width` is copied over
+    /// unchanged.
+    fn decode(
+        &self,
+        decoder: &Decoder,
+    ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
+        let raw_name = decoder.decode(&self.var_name);
+        let var_name = Identifier::new(raw_name.trim_end(), decoder.encoding)
+            .map_err(Warning::InvalidLongStringValueLabelName)?;
+
+        let labels: Vec<(String, String)> = self
+            .labels
+            .iter()
+            .map(|(value, label)| {
+                let value = decoder.decode_exact_length(&value.0).to_string();
+                let label = decoder.decode(label).to_string();
+                (value, label)
+            })
+            .collect();
+
+        Ok(LongStringValueLabels {
+            var_name,
+            width: self.width,
+            labels,
+        })
+    }
}
#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord(Vec<LongStringValueLabels>);
+/// A list of `LongStringValueLabels` sets sharing the same name type `N` and
+/// string type `S`.
+pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
+where
+ N: Debug,
+ S: Debug;
-impl ExtensionRecord for LongStringValueLabelRecord {
+impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
const SUBTYPE: u32 = 21;
const SIZE: Option<u32> = Some(1);
const COUNT: Option<u32> = None;
const NAME: &'static str = "long string value labels record";
- fn parse(ext: &Extension, endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
+ fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
ext.check_size::<Self>()?;
let mut input = &ext.data[..];
labels,
})
}
- Ok(LongStringValueLabelRecord(label_set))
+ Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
+ label_set,
+ )))
+ }
+}
+
+impl LongStringValueLabelRecord<RawString, RawString> {
+    /// Decodes every label set in this record with `decoder`.
+    ///
+    /// A set that fails to decode is reported through `decoder.warn` and
+    /// left out of the result.
+    fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord<Identifier, String> {
+        let labels = self
+            .0
+            .iter()
+            .filter_map(|label| match label.decode(decoder) {
+                Ok(set) => Some(set),
+                Err(error) => {
+                    decoder.warn(error);
+                    None
+                }
+            })
+            .collect();
+        LongStringValueLabelRecord(labels)
}
}