use crate::endian::{Endian, Parse, ToBytes};
-use encoding_rs::mem::decode_latin1;
+use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
use flate2::read::ZlibDecoder;
use num::Integer;
use std::{
borrow::Cow,
+ cell::RefCell,
cmp::Ordering,
collections::VecDeque,
- fmt::{Debug, Formatter, Result as FmtResult},
+ fmt::{Debug, Display, Formatter, Result as FmtResult},
io::{Error as IoError, Read, Seek, SeekFrom},
+ iter::repeat,
mem::take,
ops::Range,
rc::Rc,
- str::from_utf8, cell::RefCell,
+ str::from_utf8,
};
use thiserror::Error as ThisError;
#[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
- #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
- BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
+ #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
+ TooManyVarIndexes { offset: u64, n: u32, max: u32 },
+
+ #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
+ NoVarIndexes { offset: u64 },
+
+ #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
+ MixedVarTypes {
+ offset: u64,
+ var_type: VarType,
+ wrong_types: Vec<u32>,
+ },
+
+ #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
+ InvalidVarIndexes {
+ offset: u64,
+ max: usize,
+ invalid: Vec<u32>,
+ },
#[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
ExtensionRecordTooLarge {
#[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
BadEncodingName { offset: u64 },
+    // XXX This is risky because `text` might be arbitrarily long.
+ #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+ MalformedString { encoding: String, text: String },
+
#[error("Details TBD")]
TBD,
}
#[derive(Clone, Debug)]
pub enum Record {
- Header(HeaderRecord),
- Variable(VariableRecord),
- ValueLabel(ValueLabelRecord),
+ Header(HeaderRecord<RawString>),
+ Variable(VariableRecord<RawString, RawStr<8>>),
+ ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
Document(DocumentRecord),
IntegerInfo(IntegerInfoRecord),
FloatInfo(FloatInfoRecord),
fn read<R>(
reader: &mut R,
endian: Endian,
+ var_types: &[VarType],
warn: &Box<dyn Fn(Error)>,
) -> Result<Option<Record>, Error>
where
let rec_type: u32 = endian.parse(read_bytes(reader)?);
match rec_type {
2 => Ok(Some(VariableRecord::read(reader, endian)?)),
- 3 => Ok(Some(ValueLabelRecord::read(reader, endian)?)),
+ 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
7 => Extension::read(reader, endian, warn),
999 => Ok(Some(Record::EndOfHeaders(
}
#[derive(Clone)]
-pub struct HeaderRecord {
+pub struct HeaderRecord<S>
+where
+ S: Debug,
+{
/// Offset in file.
pub offsets: Range<u64>,
/// Eye-catcher string, product name, in the file's encoding. Padded
/// on the right with spaces.
- pub eye_catcher: UnencodedStr<60>,
+ pub eye_catcher: S,
/// Layout code, normally either 2 or 3.
pub layout_code: u32,
pub bias: f64,
/// `dd mmm yy` in the file's encoding.
- pub creation_date: UnencodedStr<9>,
+ pub creation_date: S,
/// `HH:MM:SS` in the file's encoding.
- pub creation_time: UnencodedStr<8>,
+ pub creation_time: S,
/// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: UnencodedStr<64>,
+ pub file_label: S,
/// Endianness of the data in the file header.
pub endian: Endian,
}
-impl HeaderRecord {
- fn debug_field<T: Debug>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult {
+impl<S> HeaderRecord<S>
+where
+ S: Debug,
+{
+ fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
+ where
+ T: Debug,
+ {
writeln!(f, "{name:>17}: {:?}", value)
}
}
-impl Debug for HeaderRecord {
+impl<S> Debug for HeaderRecord<S>
+where
+ S: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
writeln!(f, "File header record:")?;
self.debug_field(f, "Magic", self.magic)?;
- self.debug_field(f, "Product name", self.eye_catcher)?;
+ self.debug_field(f, "Product name", &self.eye_catcher)?;
self.debug_field(f, "Layout code", self.layout_code)?;
self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
self.debug_field(f, "Compression", self.compression)?;
self.debug_field(f, "Weight index", self.weight_index)?;
self.debug_field(f, "Number of cases", self.n_cases)?;
self.debug_field(f, "Compression bias", self.bias)?;
- self.debug_field(f, "Creation date", self.creation_date)?;
- self.debug_field(f, "Creation time", self.creation_time)?;
- self.debug_field(f, "File label", self.file_label)?;
+ self.debug_field(f, "Creation date", &self.creation_date)?;
+ self.debug_field(f, "Creation time", &self.creation_time)?;
+ self.debug_field(f, "File label", &self.file_label)?;
self.debug_field(f, "Endianness", self.endian)
}
}
-impl HeaderRecord {
- fn read<R: Read + Seek>(r: &mut R) -> Result<HeaderRecord, Error> {
+impl HeaderRecord<RawString> {
+ fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
let start = r.stream_position()?;
let magic: [u8; 4] = read_bytes(r)?;
let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
- let eye_catcher = UnencodedStr::<60>(read_bytes(r)?);
+ let eye_catcher = RawString(read_vec(r, 60)?);
let layout_code: [u8; 4] = read_bytes(r)?;
let endian = Endian::identify_u32(2, layout_code)
.or_else(|| Endian::identify_u32(2, layout_code))
let bias: f64 = endian.parse(read_bytes(r)?);
- let creation_date = UnencodedStr::<9>(read_bytes(r)?);
- let creation_time = UnencodedStr::<8>(read_bytes(r)?);
- let file_label = UnencodedStr::<64>(read_bytes(r)?);
+ let creation_date = RawString(read_vec(r, 9)?);
+ let creation_time = RawString(read_vec(r, 8)?);
+ let file_label = RawString(read_vec(r, 64)?);
let _: [u8; 3] = read_bytes(r)?;
Ok(HeaderRecord {
endian,
})
}
+
+    /// Decodes the text fields of this raw header with `decoder`, producing a
+    /// header whose strings are UTF-8.  Every non-text field is carried over
+    /// unchanged.
+    fn decode<'a>(&'a self, decoder: &Decoder) -> HeaderRecord<Cow<'a, str>> {
+        // Decode in a fixed order (product name, file label, date, time) so
+        // that any warnings `decoder` emits arrive in that order.
+        let product_name = decoder.decode(&self.eye_catcher);
+        let label = decoder.decode(&self.file_label);
+        let date = decoder.decode(&self.creation_date);
+        let time = decoder.decode(&self.creation_time);
+
+        HeaderRecord {
+            offsets: self.offsets.clone(),
+            magic: self.magic,
+            eye_catcher: product_name,
+            layout_code: self.layout_code,
+            nominal_case_size: self.nominal_case_size,
+            compression: self.compression,
+            weight_index: self.weight_index,
+            n_cases: self.n_cases,
+            bias: self.bias,
+            creation_date: date,
+            creation_time: time,
+            file_label: label,
+            endian: self.endian,
+        }
+    }
+}
+
+/// Converts raw byte strings into UTF-8 text using a fixed character encoding.
+struct Decoder {
+    /// The character encoding that raw strings are decoded from.
+    encoding: &'static Encoding,
+    /// Callback that receives an [`Error`] when decoding finds malformed input.
+    warn: Box<dyn Fn(Error)>,
+}
+
+impl Decoder {
+    /// Decodes `input` from `self.encoding` into UTF-8.
+    ///
+    /// If `input` contains bytes that are malformed in `self.encoding`, an
+    /// [`Error::MalformedString`] carrying the encoding name and the decoded
+    /// text is passed to `self.warn`.  The decoded text is returned either
+    /// way.
+    fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
+        let (output, malformed) = self.encoding.decode_without_bom_handling(&input.0);
+        if malformed {
+            (self.warn)(Error::MalformedString {
+                encoding: self.encoding.name().into(),
+                text: output.clone().into(),
+            });
+        }
+        output
+    }
+
+    /// Returns `input` decoded from `self.encoding` into UTF-8 such that
+    /// re-encoding the result back into `self.encoding` will have exactly the
+    /// same length in bytes.
+    ///
+    /// # Panics
+    ///
+    /// Panics if, after malformed input was replaced, re-encoding the result
+    /// does not yield exactly `input.len()` bytes.
+    ///
+    /// XXX warn about errors?
+    fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+        if let (s, false) = self.encoding.decode_without_bom_handling(input) {
+            // This is the common case. Usually there will be no errors.
+            s
+        } else {
+            // Unusual case. Don't bother to optimize it much.
+            let mut decoder = self.encoding.new_decoder_without_bom_handling();
+            // Reserve the decoder's own upper bound for the whole input up
+            // front, since the non-replacing decode call below does not grow
+            // `output` itself.
+            let mut output = String::with_capacity(
+                decoder
+                    .max_utf8_buffer_length_without_replacement(input.len())
+                    .unwrap(),
+            );
+            let mut rest = input;
+            while !rest.is_empty() {
+                match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
+                    (DecoderResult::InputEmpty, _) => break,
+                    (DecoderResult::OutputFull, _) => unreachable!(),
+                    (DecoderResult::Malformed(a, b), consumed) => {
+                        // Per the encoding_rs documentation, `a` is the length
+                        // of the malformed sequence and `b` is the number of
+                        // bytes consumed after it.  Emit one `?` per such
+                        // byte, then resume after everything this call read.
+                        // NOTE(review): this assumes `?` encodes to a single
+                        // byte in `self.encoding`; the assertion below checks
+                        // the overall length.
+                        let skipped = a as usize + b as usize;
+                        output.extend(repeat('?').take(skipped));
+                        rest = &rest[consumed..];
+                    }
+                }
+            }
+            // Verify the exact-length guarantee stated in the doc comment.
+            assert_eq!(self.encoding.encode(&output).0.len(), input.len());
+            output.into()
+        }
+    }
 }
-impl Header for HeaderRecord {
+impl<S> Header for HeaderRecord<S>
+where
+ S: Debug,
+{
fn offsets(&self) -> Range<u64> {
self.offsets.clone()
}
_ => VarType::String,
}
}
+
+    /// Returns the variable type that `self` is not: string for a numeric
+    /// type and numeric for a string type.
+    fn opposite(self) -> VarType {
+        match self {
+            VarType::Numeric => VarType::String,
+            VarType::String => VarType::Numeric,
+        }
+    }
+}
+
+impl Display for VarType {
+    /// Writes the type as lowercase text, either `numeric` or `string`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let name = match self {
+            VarType::Numeric => "numeric",
+            VarType::String => "string",
+        };
+        f.write_str(name)
+    }
 }
#[derive(Copy, Clone)]
-pub enum Value {
+pub enum Value<S>
+where
+ S: Debug,
+{
Number(Option<f64>),
- String(UnencodedStr<8>),
+ String(S),
}
-impl Debug for Value {
+type RawValue = Value<RawStr<8>>;
+
+impl<S> Debug for Value<S>
+where
+ S: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
match self {
Value::Number(Some(number)) => write!(f, "{number:?}"),
Value::Number(None) => write!(f, "SYSMIS"),
- Value::String(bytes) => write!(f, "{:?}", bytes),
+ Value::String(s) => write!(f, "{:?}", s),
}
}
}
-impl Value {
- fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Value, IoError> {
+impl RawValue {
+ fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
Ok(Self::from_raw(
&UntypedValue(read_bytes(r)?),
var_type,
))
}
- pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Value {
+ pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
match var_type {
- VarType::String => Value::String(UnencodedStr(raw.0)),
+ VarType::String => Value::String(RawStr(raw.0)),
VarType::Numeric => {
let number: f64 = endian.parse(raw.0);
Value::Number((number != -f64::MAX).then_some(number))
reader: &mut R,
var_types: &[VarType],
endian: Endian,
- ) -> Result<Option<Vec<Value>>, Error> {
+ ) -> Result<Option<Vec<Self>>, Error> {
let case_start = reader.stream_position()?;
let mut values = Vec::with_capacity(var_types.len());
for (i, &var_type) in var_types.iter().enumerate() {
codes: &mut VecDeque<u8>,
endian: Endian,
bias: f64,
- ) -> Result<Option<Vec<Value>>, Error> {
+ ) -> Result<Option<Vec<Self>>, Error> {
let case_start = reader.stream_position()?;
let mut values = Vec::with_capacity(var_types.len());
for (i, &var_type) in var_types.iter().enumerate() {
match code {
0 => (),
1..=251 => match var_type {
- VarType::Numeric => break Value::Number(Some(code as f64 - bias)),
+ VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
VarType::String => {
- break Value::String(UnencodedStr(endian.to_bytes(code as f64 - bias)))
+ break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
}
},
252 => {
}
}
253 => {
- break Value::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
+ break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
}
254 => match var_type {
- VarType::String => break Value::String(UnencodedStr(*b" ")), // XXX EBCDIC
+ VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
VarType::Numeric => {
return Err(Error::CompressedStringExpected {
offset: case_start,
}
},
255 => match var_type {
- VarType::Numeric => break Value::Number(None),
+ VarType::Numeric => break Self::Number(None),
VarType::String => {
return Err(Error::CompressedNumberExpected {
offset: case_start,
}
Ok(Some(values))
}
+
+    /// Converts this raw value into one with an owned UTF-8 string.
+    ///
+    /// Numbers are copied as they are.  Strings are decoded with
+    /// `Decoder::decode_exact_length`.
+    fn decode(&self, decoder: &Decoder) -> Value<String> {
+        match self {
+            Self::Number(number) => Value::Number(*number),
+            Self::String(raw) => {
+                let text = decoder.decode_exact_length(&raw.0);
+                Value::String(text.into_owned())
+            }
+        }
+    }
}
struct ZlibDecodeMultiple<R>
reader: Option<R>,
warn: Box<dyn Fn(Error)>,
- header: HeaderRecord,
+ header: HeaderRecord<RawString>,
var_types: Vec<VarType>,
state: ReaderState,
match Record::read(
self.reader.as_mut().unwrap(),
self.header.endian,
+ self.var_types.as_slice(),
&self.warn,
) {
Ok(Some(record)) => break record,
bias: f64,
endian: Endian,
codes: VecDeque<u8>,
- eof: bool
+ eof: bool,
}
impl Debug for Cases {
}
impl Cases {
- fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord) -> Self
+ fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
where
R: Read + Seek + 'static,
{
}
impl Iterator for Cases {
- type Item = Result<Vec<Value>, Error>;
+ type Item = Result<Vec<RawValue>, Error>;
fn next(&mut self) -> Option<Self::Item> {
if self.eof {
}
#[derive(Clone)]
-pub struct MissingValues {
+pub struct MissingValues<S>
+where
+ S: Debug,
+{
/// Individual missing values, up to 3 of them.
- pub values: Vec<Value>,
+ pub values: Vec<Value<S>>,
/// Optional range of missing values.
- pub range: Option<(Value, Value)>,
+ pub range: Option<(Value<S>, Value<S>)>,
}
-impl Debug for MissingValues {
+impl<S> Debug for MissingValues<S>
+where
+ S: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
for (i, value) in self.values.iter().enumerate() {
if i > 0 {
write!(f, "{value:?}")?;
}
- if let Some((low, high)) = self.range {
+ if let Some((low, high)) = &self.range {
if !self.values.is_empty() {
write!(f, ", ")?;
}
}
}
-impl MissingValues {
+impl<S> MissingValues<S>
+where
+ S: Debug,
+{
fn is_empty(&self) -> bool {
self.values.is_empty() && self.range.is_none()
}
+}
+impl MissingValues<RawStr<8>> {
fn read<R: Read + Seek>(
r: &mut R,
offset: u64,
width: i32,
code: i32,
endian: Endian,
- ) -> Result<MissingValues, Error> {
+ ) -> Result<Self, Error> {
let (n_values, has_range) = match (width, code) {
(_, 0..=3) => (code, false),
(0, -2) => (0, true),
let mut values = Vec::new();
for _ in 0..n_values {
- values.push(Value::read(r, var_type, endian)?);
+ values.push(RawValue::read(r, var_type, endian)?);
}
let range = if has_range {
- let low = Value::read(r, var_type, endian)?;
- let high = Value::read(r, var_type, endian)?;
+ let low = RawValue::read(r, var_type, endian)?;
+ let high = RawValue::read(r, var_type, endian)?;
Some((low, high))
} else {
None
};
- Ok(MissingValues { values, range })
+ Ok(Self { values, range })
+ }
+    /// Decodes every missing value with `decoder`, returning missing values
+    /// that hold owned UTF-8 strings.
+    ///
+    /// The individual values are decoded first, in order, followed by the low
+    /// and high ends of the optional range.
+    // The result is fully owned, so no explicit lifetime is needed on `self`.
+    fn decode(&self, decoder: &Decoder) -> MissingValues<String> {
+        let values = self
+            .values
+            .iter()
+            .map(|value| value.decode(decoder))
+            .collect();
+        let range = self
+            .range
+            .as_ref()
+            .map(|(low, high)| (low.decode(decoder), high.decode(decoder)));
+        MissingValues { values, range }
     }
}
#[derive(Clone)]
-pub struct VariableRecord {
+pub struct VariableRecord<S, V>
+where
+ S: Debug,
+ V: Debug,
+{
/// Range of offsets in file.
pub offsets: Range<u64>,
pub width: i32,
/// Variable name, padded on the right with spaces.
- pub name: UnencodedStr<8>,
+ pub name: S,
/// Print format.
pub print_format: Spec,
pub write_format: Spec,
/// Missing values.
- pub missing_values: MissingValues,
+ pub missing_values: MissingValues<V>,
/// Optional variable label.
- pub label: Option<UnencodedString>,
+ pub label: Option<S>,
}
-impl Debug for VariableRecord {
+impl<S, V> Debug for VariableRecord<S, V>
+where
+ S: Debug,
+ V: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
writeln!(
f,
}
}
-impl VariableRecord {
+impl VariableRecord<RawString, RawStr<8>> {
fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
let start_offset = r.stream_position()?;
let width: i32 = endian.parse(read_bytes(r)?);
let missing_value_code: i32 = endian.parse(read_bytes(r)?);
let print_format = Spec(endian.parse(read_bytes(r)?));
let write_format = Spec(endian.parse(read_bytes(r)?));
- let name = UnencodedStr::<8>(read_bytes(r)?);
+ let name = RawString(read_vec(r, 8)?);
let label = match has_variable_label {
0 => None,
1 => {
let len: u32 = endian.parse(read_bytes(r)?);
let read_len = len.min(65535) as usize;
- let label = UnencodedString(read_vec(r, read_len)?);
+ let label = RawString(read_vec(r, read_len)?);
let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
let _ = read_vec(r, padding_bytes as usize)?;
label,
}))
}
+
+    /// Decodes the text in this raw variable record with `decoder`.
+    ///
+    /// The name and optional label become UTF-8 strings and the missing
+    /// values are decoded as well.  Offsets, width, and formats are copied
+    /// unchanged.
+    fn decode<'a>(&'a self, decoder: &Decoder) -> VariableRecord<Cow<'a, str>, String> {
+        // Decode in the order name, missing values, label so that any
+        // warnings from `decoder` are emitted in that order.
+        let name = decoder.decode(&self.name);
+        let missing_values = self.missing_values.decode(decoder);
+        let label = self.label.as_ref().map(|raw| decoder.decode(raw));
+
+        VariableRecord {
+            offsets: self.offsets.clone(),
+            width: self.width,
+            name,
+            print_format: self.print_format,
+            write_format: self.write_format,
+            missing_values,
+            label,
+        }
+    }
}
#[derive(Copy, Clone)]
}
#[derive(Clone)]
-pub struct UnencodedString(pub Vec<u8>);
+pub struct RawString(pub Vec<u8>);
-impl From<Vec<u8>> for UnencodedString {
+impl From<Vec<u8>> for RawString {
fn from(source: Vec<u8>) -> Self {
Self(source)
}
}
-impl From<&[u8]> for UnencodedString {
+impl From<&[u8]> for RawString {
fn from(source: &[u8]) -> Self {
Self(source.into())
}
}
-impl Debug for UnencodedString {
+impl Debug for RawString {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
write!(f, "{:?}", default_decode(self.0.as_slice()))
}
}
#[derive(Copy, Clone)]
-pub struct UnencodedStr<const N: usize>(pub [u8; N]);
+pub struct RawStr<const N: usize>(pub [u8; N]);
-impl<const N: usize> From<[u8; N]> for UnencodedStr<N> {
+impl<const N: usize> From<[u8; N]> for RawStr<N> {
fn from(source: [u8; N]) -> Self {
Self(source)
}
}
-impl<const N: usize> Debug for UnencodedStr<N> {
+impl<const N: usize> Debug for RawStr<N> {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
write!(f, "{:?}", default_decode(&self.0))
}
}
+/// A single value paired with its label, as stored in a value label record.
+#[derive(Clone, Debug)]
+pub struct ValueLabel<V, S>
+where
+    V: Debug,
+    S: Debug,
+{
+    /// The value being labeled.
+    pub value: Value<V>,
+    /// The label attached to `value`.
+    pub label: S,
+}
+
#[derive(Clone)]
-pub struct ValueLabelRecord {
+pub struct ValueLabelRecord<V, S>
+where
+ V: Debug,
+ S: Debug,
+{
/// Range of offsets in file.
pub offsets: Range<u64>,
/// The labels.
- pub labels: Vec<(UntypedValue, UnencodedString)>,
+ pub labels: Vec<ValueLabel<V, S>>,
/// The 1-based indexes of the variable indexes.
pub dict_indexes: Vec<u32>,
+
+ /// The types of the variables.
+ pub var_type: VarType,
}
-impl Debug for ValueLabelRecord {
+impl<V, S> Debug for ValueLabelRecord<V, S>
+where
+ V: Debug,
+ S: Debug,
+{
fn fmt(&self, f: &mut Formatter) -> FmtResult {
writeln!(f, "labels: ")?;
- for (value, label) in self.labels.iter() {
- writeln!(f, "{value:?}: {label:?}")?;
+ for label in self.labels.iter() {
+ writeln!(f, "{label:?}")?;
}
- write!(f, "apply to variables")?;
+ write!(f, "apply to {} variables", self.var_type)?;
for dict_index in self.dict_indexes.iter() {
write!(f, " #{dict_index}")?;
}
}
}
-impl Header for ValueLabelRecord {
+impl<V, S> Header for ValueLabelRecord<V, S>
+where
+ V: Debug,
+ S: Debug,
+{
fn offsets(&self) -> Range<u64> {
self.offsets.clone()
}
}
-impl ValueLabelRecord {
+impl<V, S> ValueLabelRecord<V, S>
+where
+ V: Debug,
+ S: Debug,
+{
/// Maximum number of value labels in a record.
pub const MAX_LABELS: u32 = u32::MAX / 8;
/// Maximum number of variable indexes in a record.
pub const MAX_INDEXES: u32 = u32::MAX / 8;
+}
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
+impl ValueLabelRecord<RawStr<8>, RawString> {
+ fn read<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ var_types: &[VarType],
+ warn: &Box<dyn Fn(Error)>,
+ ) -> Result<Option<Record>, Error> {
let label_offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
if n > Self::MAX_LABELS {
let mut label = read_vec(r, padded_len - 1)?;
label.truncate(label_len);
- labels.push((value, UnencodedString(label)));
+ labels.push((value, RawString(label)));
}
let index_offset = r.stream_position()?;
let n: u32 = endian.parse(read_bytes(r)?);
if n > Self::MAX_INDEXES {
- return Err(Error::BadNumberOfVarIndexes {
+ return Err(Error::TooManyVarIndexes {
offset: index_offset,
n,
max: Self::MAX_INDEXES,
});
}
+
+ let index_offset = r.stream_position()?;
let mut dict_indexes = Vec::with_capacity(n as usize);
+ let mut invalid_indexes = Vec::new();
for _ in 0..n {
- dict_indexes.push(endian.parse(read_bytes(r)?));
+            let index: u32 = endian.parse(read_bytes(r)?);
+            // Variable indexes are 1-based, so only `1..=var_types.len()`
+            // refers to a real variable.  Keep those and set the rest aside
+            // to report.  The code that follows indexes `var_types` with
+            // every kept index, so letting an out-of-range index through
+            // would panic.
+            if index == 0 || index as usize > var_types.len() {
+                invalid_indexes.push(index);
+            } else {
+                dict_indexes.push(index);
+            }
+        }
+        // Report all rejected indexes in a single warning rather than one
+        // warning per index.
+        if !invalid_indexes.is_empty() {
+            warn(Error::InvalidVarIndexes {
+                offset: index_offset,
+                max: var_types.len(),
+                invalid: invalid_indexes,
+            });
+        }
+
+ let Some(&first_index) = dict_indexes.first() else {
+ warn(Error::NoVarIndexes {
+ offset: index_offset,
+ });
+ return Ok(None);
+ };
+ let var_type = var_types[first_index as usize - 1];
+ let mut wrong_type_indexes = Vec::new();
+ dict_indexes.retain(|&index| {
+ if var_types[index as usize - 1] != var_type {
+ wrong_type_indexes.push(index);
+ false
+ } else {
+ true
+ }
+ });
+ if !wrong_type_indexes.is_empty() {
+ warn(Error::MixedVarTypes {
+ offset: index_offset,
+ var_type,
+ wrong_types: wrong_type_indexes,
+ });
}
+ let labels = labels
+ .into_iter()
+ .map(|(value, label)| ValueLabel {
+ value: Value::from_raw(&value, var_type, endian),
+ label,
+ })
+ .collect();
+
let end_offset = r.stream_position()?;
- Ok(Record::ValueLabel(ValueLabelRecord {
+ Ok(Some(Record::ValueLabel(ValueLabelRecord {
offsets: label_offset..end_offset,
labels,
dict_indexes,
- }))
+ var_type,
+ })))
}
}
pub lines: Vec<DocumentLine>,
}
-pub type DocumentLine = UnencodedStr<{ DocumentRecord::LINE_LEN }>;
+pub type DocumentLine = RawStr<{ DocumentRecord::LINE_LEN }>;
impl DocumentRecord {
/// Length of a line in a document. Document lines are fixed-length and
} else {
let mut lines = Vec::with_capacity(n);
for _ in 0..n {
- lines.push(UnencodedStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
+ lines.push(RawStr::<{ DocumentRecord::LINE_LEN }>(read_bytes(r)?));
}
let end_offset = r.stream_position()?;
Ok(Record::Document(DocumentRecord {
#[derive(Clone, Debug)]
pub enum MultipleResponseType {
MultipleDichotomy {
- value: UnencodedString,
+ value: RawString,
labels: CategoryLabels,
},
MultipleCategory,
#[derive(Clone, Debug)]
pub struct MultipleResponseSet {
- pub name: UnencodedString,
- pub label: UnencodedString,
+ pub name: RawString,
+ pub label: RawString,
pub mr_type: MultipleResponseType,
- pub short_names: Vec<UnencodedString>,
+ pub short_names: Vec<RawString>,
}
impl MultipleResponseSet {
}
}
-fn parse_counted_string(input: &[u8]) -> Result<(UnencodedString, &[u8]), Error> {
+fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Error> {
let Some(space) = input.iter().position(|&b| b == b' ') else {
return Err(Error::TBD);
};
#[derive(Clone, Debug)]
pub struct LongStringMissingValues {
/// Variable name.
- pub var_name: UnencodedString,
+ pub var_name: RawString,
/// Missing values.
- pub missing_values: MissingValues,
+ pub missing_values: MissingValues<RawStr<8>>,
}
#[derive(Clone, Debug)]
} else {
value
};
- values.push(Value::String(UnencodedStr(value)));
+ values.push(Value::String(RawStr(value)));
}
let missing_values = MissingValues {
values,
missing_values,
});
}
- Ok(Record::LongStringMissingValues(LongStringMissingValueRecord(
- missing_value_set,
- )))
+ Ok(Record::LongStringMissingValues(
+ LongStringMissingValueRecord(missing_value_set),
+ ))
}
}
pub offsets: Range<u64>,
/// The text content of the record.
- pub text: UnencodedString,
+ pub text: RawString,
}
impl From<Extension> for TextRecord {
Ok(vec)
}
-fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<UnencodedString, IoError> {
+fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
let length: u32 = endian.parse(read_bytes(r)?);
Ok(read_vec(r, length as usize)?.into())
}
#[derive(Clone, Debug)]
pub struct LongStringValueLabels {
- pub var_name: UnencodedString,
+ pub var_name: RawString,
pub width: u32,
/// `(value, label)` pairs, where each value is `width` bytes.
- pub labels: Vec<(UnencodedString, UnencodedString)>,
+ pub labels: Vec<(RawString, RawString)>,
}
#[derive(Clone, Debug)]