|Value|Role |
|----:|:----------|
| 0 | Input |
-| 1 | Output |
+| 1 | Target |
| 2 | Both |
| 3 | None |
| 4 | Partition |
variable label on dummy variable records, so readers should take care to
parse dummy variable records in the same way as other variable records.
-
The "<a name="dictionary-index">dictionary index</a>" of a variable is
a 1-based offset in the set of variable records, including dummy
variable records for long string variables. The first variable record
`n_missing_values`. Each element is interpreted as a number for
numeric variables (with `HIGHEST` and `LOWEST` indicated as
described in the [introduction](index.md)). For string variables of
- width less than 8 bytes, elements are right-padded with spaces; for
- string variables wider than 8 bytes, only the first 8 bytes of each
- missing value are specified, with the remainder implicitly all
- spaces.
+ width less than 8 bytes, elements are right-padded with spaces.
For discrete missing values, each element represents one missing
value. When a range is present, the first element denotes the
A few system files have been observed in the wild with invalid
`write` fields, in particular with value 0. Readers should probably
treat invalid `print` or `write` fields as some default format.
+
+## Obsolete Treatment of Long String Missing Values
+
+SPSS and most versions of PSPP write missing values for string
+variables wider than 8 bytes with a [Long String Missing Values
+Record](long-string-missing-values-record.md). Very old versions of
+PSPP instead wrote these missing values in the variable record,
+writing only the first 8 bytes of each missing value, with the
+remainder implicitly all spaces. New software should write the
+[Long String Missing Values
+Record](long-string-missing-values-record.md), but readers may find
+it worthwhile to also accept the old format used by PSPP.
use indexmap::IndexSet;
use num::integer::div_ceil;
use ordered_float::OrderedFloat;
+use thiserror::Error as ThisError;
use unicase::UniCase;
use crate::{
}
impl Role {
+ /// Convert `input` to [Role].
+ ///
+ /// This can't be `FromStr for Option<Role>` because the orphan rule
+ /// forbids implementing a foreign trait for `Option`.
fn try_from_str(input: &str) -> Result<Option<Role>, InvalidRole> {
for (string, value) in [
("input", Some(Role::Input)),
return Ok(value);
}
}
- Err(InvalidRole)
+ Err(InvalidRole::UnknownRole(input.into()))
+ }
+
+ /// Map the numeric role code `integer` to a [Role].
+ ///
+ /// Code 3 means "no role" and yields `Ok(None)`; any code without a
+ /// corresponding role is reported as [InvalidRole::UnknownRole].
+ ///
+ /// This is an inherent function rather than a `TryFrom<i32>` impl for
+ /// `Option<Role>` because the orphan rule forbids implementing a foreign
+ /// trait for `Option`.
+ fn try_from_integer(integer: i32) -> Result<Option<Role>, InvalidRole> {
+ let role = match integer {
+ 0 => Role::Input,
+ 1 => Role::Target,
+ 2 => Role::Both,
+ 3 => return Ok(None),
+ 4 => Role::Partition,
+ 5 => Role::Split,
+ _ => return Err(InvalidRole::UnknownRole(integer.to_string())),
+ };
+ Ok(Some(role))
}
}
}
}
-pub struct InvalidRole;
+/// Error reported when a variable's role cannot be determined.
+#[derive(Clone, Debug, ThisError, PartialEq, Eq)]
+pub enum InvalidRole {
+ /// The role was not recognized; holds the offending text.
+ #[error("Unknown role {0:?}.")]
+ UnknownRole(String),
+
+ /// The `$@Role` attribute did not have exactly one value; holds the
+ /// number of values it actually had.
+ #[error("Role attribute $@Role must have exactly one value (not {0}).")]
+ InvalidValues(usize),
+}
impl TryFrom<&Attributes> for Option<Role> {
type Error = InvalidRole;
let role = Identifier::new("$@Role").unwrap();
value.0.get(&role).map_or(Ok(None), |attribute| {
if let Ok([string]) = <&[String; 1]>::try_from(attribute.as_slice()) {
- Role::try_from_str(string)
+ match string.parse() {
+ Ok(integer) => Role::try_from_integer(integer),
+ Err(_) => Err(InvalidRole::UnknownRole(string.clone())),
+ }
} else {
- Err(InvalidRole)
+ Err(InvalidRole::InvalidValues(attribute.len()))
}
})
}
mod parse;
pub use display::DisplayValue;
-#[derive(ThisError, Debug)]
+#[derive(Clone, ThisError, Debug, PartialEq, Eq)]
pub enum Error {
#[error("Unknown format type {value}.")]
UnknownFormat { value: u16 },
}
}
-#[derive(Clone, Debug, ThisError)]
+#[derive(Clone, Debug, ThisError, PartialEq, Eq)]
pub enum Error {
#[error("Identifier cannot be empty string.")]
Empty,
for header in headers {
decoded_records.push(header.decode(&decoder)?);
}
- let headers = Headers::new(decoded_records, &|e| eprintln!("{e}"))?;
+ let headers = Headers::new(decoded_records, &mut |e| eprintln!("{e}"))?;
let (dictionary, metadata) = decode(headers, encoding, |e| eprintln!("{e}"))?;
println!("{dictionary:#?}");
println!("{metadata:#?}");
pub use crate::sys::raw::{CategoryLabels, Compression};
-#[derive(ThisError, Debug)]
+#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
pub enum Error {
#[error("Missing header record")]
MissingHeaderRecord,
- // XXX this is an internal error
- #[error("More than one file header record")]
- DuplicateHeaderRecord,
-
#[error("{0}")]
EncodingError(EncodingError),
#[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")]
LongStringContinuationIndexes { offset: u64, indexes: Vec<u32> },
+ #[error("Variable index {start_index} is a {width} that should be followed by long string continuation records up to index {end_index}, but index {error_index} is not a continuation")]
+ MissingLongStringContinuation {
+ width: RawWidth,
+ start_index: usize,
+ end_index: usize,
+ error_index: usize,
+ },
+
#[error(
"At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end
)]
#[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
MalformedString { encoding: String, text: String },
- #[error("Details TBD")]
+ #[error("File contains multiple {0:?} records.")]
+ MoreThanOne(&'static str),
+
+ #[error("File designates string variable {name} (index {index}) as weight variable, but weight variables must be numeric.")]
+ InvalidWeightVar { name: Identifier, index: u32 },
+
+ #[error(
+ "File weight variable index {index} is greater than maximum variable index {max_index}."
+ )]
+ InvalidWeightIndex { index: u32, max_index: usize },
+
+ #[error("{0}")]
+ InvalidRole(InvalidRole),
+
+ #[error("Details TBD (cooked)")]
TBD,
}
pub cases: Option<Rc<RefCell<Cases>>>,
}
-fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
-where
- F: FnOnce(),
-{
+/// Returns the first element of `vec`, or `None` if `vec` is empty.
+///
+/// If `vec` holds more than one element, reports [Error::MoreThanOne] for
+/// `record_name` through `warn`; the extra elements are dropped along with
+/// `vec`.
+fn take_first<T>(
+ mut vec: Vec<T>,
+ record_name: &'static str,
+ warn: &mut impl FnMut(Error),
+) -> Option<T> {
if vec.len() > 1 {
- more_than_one();
+ warn(Error::MoreThanOne(record_name));
}
vec.drain(..).next()
}
impl Headers {
- pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
+ pub fn new(
+ headers: Vec<raw::DecodedRecord>,
+ warn: &mut impl FnMut(Error),
+ ) -> Result<Headers, Error> {
let mut file_header = Vec::new();
let mut variable = Vec::new();
let mut value_label = Vec::new();
}
}
- let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
- else {
+ let Some(file_header) = take_first(file_header, "file header", warn) else {
return Err(Error::MissingHeaderRecord);
};
variable,
value_label,
document,
- integer_info: take_first(integer_info, || warn(Error::TBD)),
- float_info: take_first(float_info, || warn(Error::TBD)),
- var_display: take_first(var_display, || warn(Error::TBD)),
+ integer_info: take_first(integer_info, "integer info", warn),
+ float_info: take_first(float_info, "float info", warn),
+ var_display: take_first(var_display, "variable display", warn),
multiple_response,
long_string_value_labels,
long_string_missing_values,
- encoding: take_first(encoding, || warn(Error::TBD)),
- number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
+ encoding: take_first(encoding, "encoding", warn),
+ number_of_cases: take_first(number_of_cases, "number of cases", warn),
variable_sets,
- product_info: take_first(product_info, || warn(Error::TBD)),
+ product_info: take_first(product_info, "product info", warn),
long_names,
very_long_strings,
file_attributes,
variable_attributes,
other_extension,
- end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
- z_header: take_first(z_header, || warn(Error::TBD)),
- z_trailer: take_first(z_trailer, || warn(Error::TBD)),
- cases: take_first(cases, || warn(Error::TBD)),
+ end_of_headers: take_first(end_of_headers, "end of headers", warn),
+ z_header: take_first(z_header, "z_header", warn),
+ z_trailer: take_first(z_trailer, "z_trailer", warn),
+ cases: take_first(cases, "cases", warn),
})
}
}
}
impl Metadata {
- fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
+ fn decode(headers: &Headers, mut warn: impl FnMut(Error)) -> Self {
let header = &headers.header;
let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
.unwrap_or_else(|_| {
pub fn decode(
mut headers: Headers,
encoding: &'static Encoding,
- warn: impl Fn(Error),
+ mut warn: impl FnMut(Error),
) -> Result<(Dictionary, Metadata), Error> {
let mut dictionary = Dictionary::new(encoding);
.get(index + offset)
.is_none_or(|record| record.width != RawWidth::Continuation)
{
- warn(Error::TBD);
+ warn(Error::MissingLongStringContinuation {
+ width: input.width,
+ start_index: index,
+ end_index: index + n_values - 1,
+ error_index: index + offset,
+ });
break;
}
}
if variable.is_numeric() {
dictionary.weight = Some(*dict_index);
} else {
- warn(Error::TBD);
+ warn(Error::InvalidWeightVar {
+ index: weight_index,
+ name: variable.name.clone(),
+ });
}
} else {
- warn(Error::TBD);
+ warn(Error::InvalidWeightIndex {
+ index: weight_index,
+ max_index: var_index_map.len(),
+ });
}
}
variable.measure = Some(measure);
}
} else {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
}
}
}
.iter()
.flat_map(|record| record.0.iter())
{
- match MultipleResponseSet::decode(&dictionary, record, &warn) {
+ match MultipleResponseSet::decode(&dictionary, record, &mut warn) {
Ok(mrset) => {
dictionary.mrsets.insert(ByIdentifier::new(mrset));
}
.flat_map(|record| record.0.into_iter())
{
let Some(index) = dictionary.variables.get_index_of(&record.short_name.0) else {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
continue;
};
let width = VarWidth::String(record.length);
let n_segments = width.n_segments();
if n_segments == 1 {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
continue;
}
if index + n_segments > dictionary.variables.len() {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
continue;
}
let mut short_names = Vec::with_capacity(n_segments);
short_names.push(segment.short_names[0].clone());
let segment_width = segment.width.as_string_width().unwrap_or(0);
if segment_width.next_multiple_of(8) != alloc_width.next_multiple_of(8) {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
continue 'outer;
}
}
.unwrap()
.short_names = vec![short_name];
} else {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
}
}
}
{
variable.attributes.append(&mut attr_set.attributes);
} else {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
}
}
let variable = dictionary.variables.get_index_mut2(index).unwrap();
match variable.attributes.role() {
Ok(role) => variable.role = role,
- Err(InvalidRole) => warn(Error::TBD),
+ Err(error) => warn(Error::InvalidRole(error)),
}
}
.flat_map(|record| record.0.into_iter())
{
let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
continue;
};
let Some(width) = variable.width.as_string_width() else {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
continue;
};
for (mut value, label) in record.labels.into_iter() {
.flat_map(|record| record.0.into_iter())
{
let Some((_, variable)) = dictionary.variables.get_full_mut2(&record.var_name.0) else {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
continue;
};
let values = record
let mut variables = Vec::with_capacity(record.variable_names.len());
for variable_name in record.variable_names {
let Some((dict_index, _)) = dictionary.variables.get_full_mut2(&variable_name.0) else {
- warn(Error::TBD);
+ warn(dbg!(Error::TBD));
continue;
};
variables.push(dict_index);
fn decode(
dictionary: &Dictionary,
input: &raw::MultipleResponseSet<Identifier, String>,
- warn: &impl Fn(Error),
+ warn: &mut impl FnMut(Error),
) -> Result<Self, Error> {
let mr_set_name = input.name.clone();
let mut variables = Vec::with_capacity(input.short_names.len());
out
}
-fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format {
+fn decode_format(
+ raw: raw::Spec,
+ width: VarWidth,
+ mut warn: impl FnMut(Format, FormatError),
+) -> Format {
UncheckedFormat::try_from(raw)
.and_then(Format::try_from)
.and_then(|x| x.check_width_compatibility(width))
use thiserror::Error as ThisError;
-#[derive(ThisError, Debug)]
+#[derive(Clone, ThisError, Debug, PartialEq, Eq)]
pub enum Error {
#[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
NoEncoding,
#[error("{0}")]
EncodingError(EncodingError),
- #[error("Details TBD")]
+ #[error("Missing value record with range not allowed for string variable")]
+ MissingValueStringRange,
+
+ #[error("Missing value record at offset {0:#x} not allowed for long string continuation")]
+ MissingValueContinuation(u64),
+
+ #[error("Invalid multiple dichotomy label type")]
+ InvalidMultipleDichotomyLabelType,
+
+ #[error("Invalid multiple response type")]
+ InvalidMultipleResponseType,
+
+ #[error("Syntax error in multiple response record")]
+ MultipleResponseSyntaxError,
+
+ #[error("Syntax error parsing counted string (missing trailing space)")]
+ CountedStringMissingSpace,
+
+ #[error("Syntax error parsing counted string (invalid UTF-8)")]
+ CountedStringInvalidUTF8,
+
+ #[error("Syntax error parsing counted string (invalid length {0:?})")]
+ CountedStringInvalidLength(String),
+
+ #[error("Syntax error parsing counted string (length {0:?} goes past end of input)")]
+ CountedStringTooLong(usize),
+
+ #[error("Variable display record contains {count} items but should contain either {first} or {second}.")]
+ InvalidVariableDisplayCount {
+ count: usize,
+ first: usize,
+ second: usize,
+ },
+
+ #[error("Very long string record missing delimiter in {0:?}.")]
+ VeryLongStringMissingDelimiter(String),
+
+ #[error("Very long string record has invalid length in {0:?}.")]
+ VeryLongStringInvalidLength(String),
+
+ #[error("Attribute record missing left parenthesis, in {0:?}.")]
+ AttributeMissingLParen(String),
+
+ #[error("Attribute record missing new-line, in {0:?}.")]
+ AttributeMissingNewline(String),
+
+ #[error("Attribute record missing quotations, in {0:?}.")]
+ AttributeMissingQuotes(String),
+
+ #[error("Details TBD (raw)")]
TBD,
}
.into()
}
-#[derive(Clone)]
+#[derive(Clone, Default)]
pub struct MissingValues<S = Box<[u8]>>
where
S: Debug,
}
}
-impl<S> Default for MissingValues<S>
-where
- S: Debug,
-{
- fn default() -> Self {
- Self {
- values: Vec::new(),
- range: None,
- }
- }
-}
-
impl MissingValues {
fn read<R: Read + Seek>(
r: &mut R,
offset: u64,
- width: RawWidth,
+ raw_width: RawWidth,
code: i32,
endian: Endian,
warn: &dyn Fn(Warning),
) -> Result<Self, Error> {
let (individual_values, has_range) = match code {
- 0..=3 => (code as usize, false),
+ 0 => return Ok(Self::default()),
+ 1..=3 => (code as usize, false),
-2 => (0, true),
-3 => (1, true),
_ => return Err(Error::BadMissingValueCode { offset, code }),
values.push(read_bytes::<8, _>(r)?);
}
- match VarWidth::try_from(width) {
+ match VarWidth::try_from(raw_width) {
Ok(VarWidth::Numeric) => {
let values = values
.into_iter()
);
return Ok(Self { values, range });
}
- Ok(VarWidth::String(width)) if width <= 8 && range.is_none() => {
+ Ok(VarWidth::String(_)) if range.is_some() => warn(Warning::MissingValueStringRange),
+ Ok(VarWidth::String(width)) => {
+ let width = width.min(8) as usize;
let values = values
.into_iter()
- .map(|value| Value::String(Box::from(&value[..width as usize])))
+ .map(|value| Value::String(Box::from(&value[..width])))
.collect();
return Ok(Self {
values,
range: None,
});
}
- Ok(VarWidth::String(width)) if width > 8 => warn(Warning::TBD),
- Ok(VarWidth::String(_)) => warn(Warning::TBD),
- Err(()) => warn(Warning::TBD),
+ Err(()) => warn(Warning::MissingValueContinuation(offset)),
}
Ok(Self::default())
}
pub label: Option<S>,
}
-#[derive(Copy, Clone, PartialEq, Eq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum RawWidth {
Continuation,
Numeric,
} else if let Some(rest) = input.strip_prefix(b" 11 ") {
(CategoryLabels::VarLabels, rest)
} else {
- return Err(Warning::TBD);
+ return Err(Warning::InvalidMultipleDichotomyLabelType);
};
let (value, input) = parse_counted_string(input)?;
(
input,
)
}
- _ => return Err(Warning::TBD),
+ _ => return Err(Warning::InvalidMultipleResponseType),
};
Ok((mr_type, input))
}
impl MultipleResponseSet<RawString, RawString> {
fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
let Some(equals) = input.iter().position(|&b| b == b'=') else {
- return Err(Warning::TBD);
+ return Err(Warning::MultipleResponseSyntaxError);
};
let (name, input) = input.split_at(equals);
let (mr_type, input) = MultipleResponseType::parse(input)?;
let Some(input) = input.strip_prefix(b" ") else {
- return Err(Warning::TBD);
+ return Err(Warning::MultipleResponseSyntaxError);
};
let (label, mut input) = parse_counted_string(input)?;
let mut vars = Vec::new();
match input.split_first() {
Some((b' ', rest)) => {
let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
- return Err(Warning::TBD);
+ return Err(Warning::MultipleResponseSyntaxError);
};
let (var, rest) = rest.split_at(length);
if !var.is_empty() {
}
input = rest;
}
- _ => return Err(Warning::TBD),
+ _ => return Err(Warning::MultipleResponseSyntaxError),
}
}
while input.first() == Some(&b'\n') {
+/// Parses a counted string from the start of `input`.
+///
+/// The expected layout is a length (UTF-8 text that parses as a `usize`),
+/// one space, and then exactly that many bytes of string data. On success,
+/// returns the string together with whatever input follows it.
+///
+/// # Errors
+///
+/// Returns a [Warning] if there is no space, if the length is not valid
+/// UTF-8 or does not parse as a `usize`, or if the length extends past the
+/// end of `input`.
fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
let Some(space) = input.iter().position(|&b| b == b' ') else {
- return Err(Warning::TBD);
+ return Err(Warning::CountedStringMissingSpace);
};
let Ok(length) = from_utf8(&input[..space]) else {
- return Err(Warning::TBD);
+ return Err(Warning::CountedStringInvalidUTF8);
};
let Ok(length): Result<usize, _> = length.parse() else {
- return Err(Warning::TBD);
+ return Err(Warning::CountedStringInvalidLength(length.into()));
};
- let input = &input[space + 1..];
- if input.len() < length {
- return Err(Warning::TBD);
+ let Some((string, rest)) = input[space + 1..].split_at_checked(length) else {
+ return Err(Warning::CountedStringTooLong(length));
};
-
- let (string, rest) = input.split_at(length);
Ok((string.into(), rest))
}
} else if ext.count as usize == 2 * n_vars {
false
} else {
- return Err(Warning::TBD);
+ // A valid record holds either 2 or 3 items per variable, so report
+ // both accepted counts rather than repeating the same one twice.
+ return Err(Warning::InvalidVariableDisplayCount {
+ count: ext.count as usize,
+ first: 2 * n_vars,
+ second: 3 * n_vars,
+ });
};
let mut var_displays = Vec::new();
impl VeryLongString {
+ /// Parses `input`, which should have the form `SHORT_NAME=LENGTH`.
+ ///
+ /// The short name is converted to an identifier by `decoder` and must
+ /// pass [Identifier::must_be_ordinary].
+ ///
+ /// # Errors
+ ///
+ /// Returns a [Warning] if `input` has no `=` delimiter, if the short
+ /// name is rejected, or if the length cannot be parsed.
fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
let Some((short_name, length)) = input.split_once('=') else {
- return Err(Warning::TBD);
+ return Err(Warning::VeryLongStringMissingDelimiter(input.into()));
};
let short_name = decoder
.new_identifier(short_name)
.and_then(Identifier::must_be_ordinary)
.map_err(Warning::InvalidLongStringName)?;
- let length = length.parse().map_err(|_| Warning::TBD)?;
+ let length = length
+ .parse()
+ .map_err(|_| Warning::VeryLongStringInvalidLength(input.into()))?;
Ok(VeryLongString { short_name, length })
}
}
impl Attribute {
fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
let Some((name, mut input)) = input.split_once('(') else {
- return Err(Warning::TBD);
+ return Err(Warning::AttributeMissingLParen(input.into()));
};
let name = decoder
.new_identifier(name)
let mut values = Vec::new();
loop {
let Some((value, rest)) = input.split_once('\n') else {
- return Err(Warning::TBD);
+ return Err(Warning::AttributeMissingNewline(input.into()));
};
if let Some(stripped) = value
.strip_prefix('\'')
{
values.push(stripped.into());
} else {
- decoder.warn(Warning::TBD);
+ decoder.warn(Warning::AttributeMissingQuotes(value.into()));
values.push(value.into());
}
if let Some(rest) = rest.strip_prefix(')') {
for header in headers {
decoded_records.push(header.decode(&decoder).unwrap());
}
- let headers = Headers::new(decoded_records, &|e| eprintln!("{e}")).unwrap();
- let (dictionary, metadata) = decode(headers, encoding, |e| eprintln!("{e}")).unwrap();
+
+ let mut errors = Vec::new();
+ let headers = Headers::new(decoded_records, &mut |e| errors.push(e)).unwrap();
+ let (dictionary, metadata) = decode(headers, encoding, |e| errors.push(e)).unwrap();
+ assert_eq!(errors, vec![]);
println!("{dictionary:#?}");
+ assert_eq!(metadata.endian, Endian::Big);
+ assert_eq!(metadata.compression, None);
+ assert_eq!(metadata.n_cases, Some(1));
+ assert_eq!(metadata.version, Some((1, 2, 3)));
println!("{metadata:#?}");
}