"libc",
]
+[[package]]
+name = "finl_unicode"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6"
+
[[package]]
name = "flate2"
version = "1.0.26"
"chrono",
"clap",
"encoding_rs",
+ "finl_unicode",
"flate2",
"float_next_after",
"hexplay",
"num-traits",
"ordered-float",
"thiserror",
+ "unicase",
]
[[package]]
"winapi",
]
+[[package]]
+name = "unicase"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+dependencies = [
+ "version_check",
+]
+
[[package]]
name = "unicode-ident"
version = "1.0.6"
ordered-float = "3.7.0"
thiserror = "1.0"
chrono = "0.4.26"
+finl_unicode = "1.2.0"
+unicase = "2.6.0"
[build-dependencies]
anyhow = "1.0.69"
-use std::borrow::Cow;
+use std::{borrow::Cow, collections::HashSet};
use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
use encoding_rs::Encoding;
use crate::{
- Error,
- {endian::Endian, CategoryLabels, Compression},
format::Spec,
+ identifier::{Identifier, Error as IdError},
+ {endian::Endian, CategoryLabels, Compression},
};
+use thiserror::Error as ThisError;
+
+/// Problems found while decoding raw records.
+///
+/// Values of this type are returned by the decoding functions in this file
+/// and are also handed to their `warn` callbacks.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    // The upper bound is inclusive: `decode_var` accepts widths 0..=255 and -1.
+    #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255].")]
+    BadVariableWidth { offset: u64, width: i32 },
+
+    #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
+    BadLongMissingValueFormat,
+
+    #[error("File creation date {creation_date} is not in the expected \"DD MMM YY\" format. Using 01 Jan 1970.")]
+    InvalidCreationDate { creation_date: String },
+
+    #[error("File creation time {creation_time} is not in the expected \"HH:MM:SS\" format. Using midnight.")]
+    InvalidCreationTime { creation_time: String },
+
+    /// Wraps an identifier validation failure (converted automatically via
+    /// `From`).
+    #[error("Invalid variable name: {0}")]
+    BadIdentifier(#[from] IdError),
+
+    #[error("Details TBD")]
+    TBD,
+}
+/// State used while decoding raw records.
pub struct Decoder {
pub compression: Option<Compression>,
pub endian: Endian,
+ /// Character encoding used to decode strings and to validate identifiers.
pub encoding: &'static Encoding,
+ /// Identifiers that have already been claimed through `take_name`.
+ pub var_names: HashSet<Identifier>,
+ /// Counter behind generated `VARnnn` names; `generate_name` increments it
+ /// for every candidate it tries.
+ n_generated_names: usize,
}
impl Decoder {
+ /// Claims `id` by inserting it into `var_names`.
+ ///
+ /// Returns the result of `HashSet::insert`: `true` if `id` was not yet
+ /// present, `false` if it had already been claimed.
+ fn take_name(&mut self, id: Identifier) -> bool {
+ self.var_names.insert(id)
+ }
+    /// Produces a fresh `VARnnn` identifier that is not yet in `var_names`,
+    /// claims it, and returns it.
+    ///
+    /// Each attempt bumps `n_generated_names` and formats it with at least
+    /// three digits; candidates that are already claimed are skipped.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the candidate is rejected by `Identifier::new`, or if the
+    /// counter reaches `usize::MAX` without finding a free name.
+    fn generate_name(&mut self) -> Identifier {
+        loop {
+            self.n_generated_names += 1;
+            let candidate = format!("VAR{:03}", self.n_generated_names);
+            let id = Identifier::new(&candidate, self.encoding).unwrap();
+            // Claim a copy so the identifier itself can be handed back.
+            if self.var_names.insert(id.clone()) {
+                break id;
+            }
+            assert!(self.n_generated_names < usize::MAX);
+        }
+    }
fn decode_string<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
let (output, malformed) = self.encoding.decode_without_bom_handling(input);
if malformed {
+/// Conversion of a raw record (`Input`) into its decoded form.
pub trait Decode: Sized {
+ /// The raw type that `decode` reads from.
type Input;
- fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Self;
+ /// Decodes `input` with the settings held by `decoder`.
+ ///
+ /// `warn` is passed down to string decoding so that problems can be
+ /// reported while decoding continues; an `Err` return abandons the record.
+ fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error>;
}
#[derive(Clone)]
impl Decode for Header {
type Input = crate::raw::Header;
- fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Self {
+ fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error> {
let eye_catcher = decoder.decode_string(&input.eye_catcher, &warn);
let file_label = decoder.decode_string(&input.file_label, &warn);
let creation_date = decoder.decode_string(&input.creation_date, &warn);
});
Default::default()
});
- Header {
+ Ok(Header {
eye_catcher: eye_catcher.into(),
weight_index: input.weight_index.map(|n| n as usize),
n_cases: input.n_cases.map(|n| n as u64),
creation: NaiveDateTime::new(creation_date, creation_time),
file_label: file_label.into(),
- }
+ })
}
}
+/// A decoded variable record.
pub struct Variable {
pub width: i32,
- pub name: String,
+ /// The variable's name.
+ pub name: Identifier,
pub print_format: Spec,
pub write_format: Spec,
}
+/// Decodes one raw variable record.
+///
+/// Returns `Ok(None)` when the record's width is -1, and
+/// `Err(Error::BadVariableWidth)` when the width is outside `-1..=255`.
+///
+/// The record's name is decoded with the decoder's encoding and validated as
+/// an [`Identifier`].  An invalid name is reported through `warn` and replaced
+/// by a generated one; a name that has already been claimed is also replaced
+/// by a generated one.
+fn decode_var(
+    decoder: &mut Decoder,
+    input: &crate::raw::Variable,
+    warn: impl Fn(Error),
+) -> Result<Option<Variable>, Error> {
+    match input.width {
+        0..=255 => (),
+        -1 => return Ok(None),
+        _ => {
+            return Err(Error::BadVariableWidth {
+                offset: input.offset,
+                width: input.width,
+            })
+        }
+    };
+    let name = decoder.decode_string(&input.name, &warn);
+    let name = match Identifier::new(&name, decoder.encoding) {
+        Ok(name) => {
+            // `take_name` consumes its argument, so claim a clone and keep
+            // the original to use as the variable's name.
+            if decoder.take_name(name.clone()) {
+                name
+            } else {
+                decoder.generate_name()
+            }
+        }
+        Err(error) => {
+            warn(error.into());
+            decoder.generate_name()
+        }
+    };
+    // NOTE(review): the function is unfinished -- nothing is built from `name`
+    // and no `Ok(Some(Variable { .. }))` is returned yet, so this body does not
+    // satisfy the declared return type.  Complete before merging.
+}
+
+
#[derive(Clone)]
pub struct Document(Vec<String>);
impl Decode for Document {
type Input = crate::raw::Document;
- fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Self {
- Document(
+ /// Decodes every line of the raw document with `decoder.decode_string`,
+ /// forwarding `warn` to it.  This implementation always returns `Ok`.
+ fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Result<Self, Error> {
+ Ok(Document(
input
.lines
.iter()
.map(|s| decoder.decode_string(s, &warn).into())
.collect(),
- )
+ ))
}
}
--- /dev/null
+use encoding_rs::{EncoderResult, Encoding};
+use finl_unicode::categories::{CharacterCategories, MajorCategory};
+use thiserror::Error as ThisError;
+use unicase::UniCase;
+
+/// Classifies a character by where it may appear in an identifier.
+pub trait IdentifierChar {
+ /// Returns true if `self` may be the first character in an identifier.
+ fn may_start_id(self) -> bool;
+
+ /// Returns true if `self` may be a second or subsequent character in an
+ /// identifier.
+ fn may_continue_id(self) -> bool;
+}
+
+impl IdentifierChar for char {
+    /// Returns true if `self` may begin an identifier.
+    ///
+    /// ASCII characters qualify only if they are letters or one of `@#$`.
+    /// Non-ASCII characters qualify if their Unicode major category is
+    /// letter, mark, or symbol, except for U+FFFD REPLACEMENT CHARACTER.
+    fn may_start_id(self) -> bool {
+        if self.is_ascii() {
+            // Do not consult the Unicode category here: category S would
+            // also admit ASCII symbols such as `+`, `<`, `=`, `>`, `^`, `|`
+            // and `~`, which are operators rather than identifier characters.
+            self.is_ascii_alphabetic() || "@#$".contains(self)
+        } else {
+            use MajorCategory::*;
+
+            [L, M, S].contains(&self.get_major_category())
+                && self != char::REPLACEMENT_CHARACTER
+        }
+    }
+
+    /// Returns true if `self` may appear after the first character of an
+    /// identifier.
+    ///
+    /// ASCII characters qualify only if they are letters, digits, or one of
+    /// `@#$._`.  Non-ASCII characters qualify if their Unicode major category
+    /// is letter, mark, symbol, or number, except for U+FFFD REPLACEMENT
+    /// CHARACTER.
+    fn may_continue_id(self) -> bool {
+        if self.is_ascii() {
+            // Same reasoning as in `may_start_id`: keep ASCII symbols out.
+            self.is_ascii_alphanumeric() || "@#$._".contains(self)
+        } else {
+            use MajorCategory::*;
+
+            [L, M, S, N].contains(&self.get_major_category())
+                && self != char::REPLACEMENT_CHARACTER
+        }
+    }
+}
+
+/// An identifier, such as a variable name.
+///
+/// The text is stored in a `UniCase`, so the derived equality and hashing
+/// follow `UniCase`'s case-insensitive comparison.
+#[derive(Clone, PartialEq, Eq, Debug, Hash)]
+pub struct Identifier(pub UniCase<String>);
+
+/// Reasons why a string is rejected as an [`Identifier`].
+#[derive(Clone, Debug, ThisError)]
+pub enum Error {
+ #[error("Identifier cannot be empty string.")]
+ Empty,
+
+ #[error("\"{0}\" may not be used as an identifier because it is a reserved word.")]
+ Reserved(String),
+
+ #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")]
+ BadFirstCharacter(String, char),
+
+ #[error("\"{0}\" may not be used as an identifier because it contains disallowed character \"{1}\".")]
+ BadLaterCharacter(String, char),
+
+ // `length` and `max` are byte counts in the named encoding, not in UTF-8.
+ #[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")]
+ TooLong {
+ id: String,
+ length: usize,
+ encoding: &'static str,
+ max: usize,
+ },
+
+ // `c` is the first character that the encoder reported as unmappable.
+ #[error("\"{id}\" may not be used as an identifier because the encoding in use ({encoding}) cannot represent \"{c}\".")]
+ NotEncodable {
+ id: String,
+ encoding: &'static str,
+ c: char,
+ },
+}
+
+/// Reports whether `s` matches one of the reserved words, comparing without
+/// regard to ASCII case.
+fn is_reserved_word(s: &str) -> bool {
+    const RESERVED: [&str; 12] = [
+        "and", "or", "not", "eq", "ge", "gt", "le", "ne", "all", "by", "to", "with",
+    ];
+    RESERVED.iter().any(|word| s.eq_ignore_ascii_case(word))
+}
+
+impl Identifier {
+    /// Longest permitted identifier, in bytes, measured in the dictionary's
+    /// encoding rather than in UTF-8.
+    pub const MAX_LEN: usize = 64;
+
+    /// Builds an identifier from `s`, checking it against the syntax rules
+    /// and against `encoding`.
+    ///
+    /// # Errors
+    ///
+    /// Returns whatever [`Identifier::is_plausible`] reports,
+    /// `Error::NotEncodable` if `encoding` cannot represent some character of
+    /// `s`, or `Error::TooLong` if the encoded form exceeds
+    /// [`Identifier::MAX_LEN`] bytes.
+    pub fn new(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
+        Self::is_plausible(s)?;
+
+        let (encoded, _, unencodable) = encoding.encode(s);
+        if !unencodable {
+            let length = encoded.len();
+            return if length > Self::MAX_LEN {
+                Err(Error::TooLong {
+                    id: s.into(),
+                    length,
+                    encoding: encoding.name(),
+                    max: Self::MAX_LEN,
+                })
+            } else {
+                Ok(Identifier(s.into()))
+            };
+        }
+
+        // Encode again without replacement, purely to learn which character
+        // could not be mapped.
+        let mut encoder = encoding.new_encoder();
+        let capacity = encoder
+            .max_buffer_length_from_utf8_without_replacement(s.len())
+            .unwrap();
+        let mut scratch = Vec::with_capacity(capacity);
+        let (outcome, _) =
+            encoder.encode_from_utf8_to_vec_without_replacement(s, &mut scratch, true);
+        match outcome {
+            EncoderResult::Unmappable(c) => Err(Error::NotEncodable {
+                id: s.into(),
+                encoding: encoding.name(),
+                c,
+            }),
+            _ => unreachable!(),
+        }
+    }
+
+    /// Checks `s` against the encoding-independent identifier rules: it must
+    /// be non-empty, must not be a reserved word, must start with a character
+    /// accepted by `may_start_id`, and must continue only with characters
+    /// accepted by `may_continue_id`.  The checks run in that order and the
+    /// first failure is returned.
+    pub fn is_plausible(s: &str) -> Result<(), Error> {
+        let mut rest = s.chars();
+        let Some(first) = rest.next() else {
+            return Err(Error::Empty);
+        };
+        if is_reserved_word(s) {
+            return Err(Error::Reserved(s.into()));
+        }
+        if !first.may_start_id() {
+            return Err(Error::BadFirstCharacter(s.into(), first));
+        }
+        match rest.find(|&c| !c.may_continue_id()) {
+            Some(bad) => Err(Error::BadLaterCharacter(s.into(), bad)),
+            None => Ok(()),
+        }
+    }
+}
-use std::io::Error as IoError;
-use thiserror::Error as ThisError;
-
pub mod endian;
pub mod raw;
pub mod cooked;
pub mod sack;
pub mod encoding;
pub mod format;
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("Not an SPSS system file")]
- NotASystemFile,
-
- #[error("Invalid magic number {0:?}")]
- BadMagic([u8; 4]),
-
- #[error("I/O error ({0})")]
- Io(#[from] IoError),
-
- #[error("Invalid SAV compression code {0}")]
- InvalidSavCompression(u32),
-
- #[error("Invalid ZSAV compression code {0}")]
- InvalidZsavCompression(u32),
-
- #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
- BadVariableWidth { offset: u64, width: i32 },
-
- #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
- BadDocumentLength { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
- BadRecordType { offset: u64, rec_type: u32 },
-
- #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
- BadVariableLabelCode { offset: u64, code: u32 },
-
- #[error(
- "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
- )]
- BadNumericMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
- BadStringMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
- BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
- BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
- ExtensionRecordTooLarge {
- offset: u64,
- subtype: u32,
- size: u32,
- count: u32,
- },
-
- #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
- EofInCase {
- offset: u64,
- case_ofs: u64,
- case_len: usize,
- },
-
- #[error(
- "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
- )]
- EofInCompressedCase { offset: u64, case_ofs: u64 },
-
- #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
- PartialCompressedCase { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
- CompressedNumberExpected { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
- CompressedStringExpected { offset: u64, case_ofs: u64 },
-
- #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
- BadZlibTrailerNBlocks {
- offset: u64,
- n_blocks: u32,
- expected_n_blocks: u64,
- ztrailer_len: u64,
- },
-
- #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
- BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 },
-
- #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
- BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 },
-
- #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
- BadEncodingName { offset: u64 },
-
- #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
- BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 },
-
- #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
- BadLongMissingValueFormat,
-
- #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
- InvalidCreationDate { creation_date: String },
-
- #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
- InvalidCreationTime { creation_time: String },
-
- #[error("Details TBD")]
- TBD,
-}
+pub mod identifier;
#[derive(Copy, Clone, Debug)]
pub enum Compression {
use crate::endian::{Endian, Parse, ToBytes};
-use crate::{CategoryLabels, Compression, Error};
+use crate::{CategoryLabels, Compression};
use flate2::read::ZlibDecoder;
use num::Integer;
io::{Error as IoError, Read, Seek, SeekFrom},
iter::FusedIterator,
};
+use thiserror::Error as ThisError;
use self::state::State;
+/// Errors describing problems with a system file's raw contents, including
+/// underlying I/O failures (see the `Io` variant).
+///
+/// Most variants carry the file offset at which the problem was detected.
+#[derive(ThisError, Debug)]
+pub enum Error {
+ #[error("Not an SPSS system file")]
+ NotASystemFile,
+
+ #[error("Invalid magic number {0:?}")]
+ BadMagic([u8; 4]),
+
+ #[error("I/O error ({0})")]
+ Io(#[from] IoError),
+
+ #[error("Invalid SAV compression code {0}")]
+ InvalidSavCompression(u32),
+
+ #[error("Invalid ZSAV compression code {0}")]
+ InvalidZsavCompression(u32),
+
+ #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
+ BadVariableWidth { offset: u64, width: i32 },
+
+ #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
+ BadDocumentLength { offset: u64, n: u32, max: u32 },
+
+ #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
+ BadRecordType { offset: u64, rec_type: u32 },
+
+ #[error("At offset {offset:#x}, variable label code ({code}) is not 0 or 1.")]
+ BadVariableLabelCode { offset: u64, code: u32 },
+
+ #[error(
+ "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
+ )]
+ BadNumericMissingValueCode { offset: u64, code: i32 },
+
+ #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
+ BadStringMissingValueCode { offset: u64, code: i32 },
+
+ #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
+ BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
+
+ #[error("At offset {offset:#x}, number of variables indexes ({n}) is greater than the maximum number ({max}).")]
+ BadNumberOfVarIndexes { offset: u64, n: u32, max: u32 },
+
+ #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
+ ExtensionRecordTooLarge {
+ offset: u64,
+ subtype: u32,
+ size: u32,
+ count: u32,
+ },
+
+ #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
+ EofInCase {
+ offset: u64,
+ case_ofs: u64,
+ case_len: usize,
+ },
+
+ #[error(
+ "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
+ )]
+ EofInCompressedCase { offset: u64, case_ofs: u64 },
+
+ #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
+ PartialCompressedCase { offset: u64, case_ofs: u64 },
+
+ #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
+ CompressedNumberExpected { offset: u64, case_ofs: u64 },
+
+ #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
+ CompressedStringExpected { offset: u64, case_ofs: u64 },
+
+ #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
+ BadZlibTrailerNBlocks {
+ offset: u64,
+ n_blocks: u32,
+ expected_n_blocks: u64,
+ ztrailer_len: u64,
+ },
+
+ #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
+ BadRecordSize { offset: u64, record: String, size: u32, expected_size: u32 },
+
+ #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
+ BadRecordCount { offset: u64, record: String, count: u32, expected_count: u32 },
+
+ #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
+ BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 },
+
+ #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
+ BadEncodingName { offset: u64 },
+
+ #[error("Details TBD")]
+ TBD,
+}
+
+
#[derive(Clone, Debug)]
pub enum Record {
Header(Header),