source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
[[package]]
name = "anyhow"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+[[package]]
+name = "bumpalo"
+version = "3.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
+
[[package]]
name = "cc"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+[[package]]
+name = "chrono"
+version = "0.4.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "js-sys",
+ "num-traits",
+ "time",
+ "wasm-bindgen",
+ "winapi",
+]
+
[[package]]
name = "clap"
version = "4.1.7"
"os_str_bytes",
]
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+
[[package]]
name = "crc32fast"
version = "1.3.2"
"cfg-if",
]
+[[package]]
+name = "encoding_rs"
+version = "0.8.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
+dependencies = [
+ "cfg-if",
+]
+
[[package]]
name = "errno"
version = "0.2.8"
"termcolor 0.3.6",
]
+[[package]]
+name = "iana-time-zone"
+version = "0.1.57"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
[[package]]
name = "io-lifetimes"
version = "1.0.5"
"windows-sys 0.45.0",
]
+[[package]]
+name = "js-sys"
+version = "0.3.64"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a"
+dependencies = [
+ "wasm-bindgen",
+]
+
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
+[[package]]
+name = "log"
+version = "0.4.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
+
[[package]]
name = "miniz_oxide"
version = "0.7.1"
version = "1.0.0"
dependencies = [
"anyhow",
+ "chrono",
"clap",
+ "encoding_rs",
"flate2",
"float_next_after",
"hexplay",
"syn 1.0.109",
]
+[[package]]
+name = "time"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
+dependencies = [
+ "libc",
+ "wasi",
+ "winapi",
+]
+
[[package]]
name = "unicode-ident"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+[[package]]
+name = "wasi"
+version = "0.10.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd"
+dependencies = [
+ "bumpalo",
+ "log",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
+
[[package]]
name = "winapi"
version = "0.3.9"
"winapi",
]
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets 0.48.1",
+]
+
[[package]]
name = "windows-sys"
version = "0.45.0"
[dependencies]
anyhow = "1.0.69"
clap = { version = "4.1.7", features = ["derive", "wrap_help"] }
+encoding_rs = "0.8.32"
flate2 = "1.0.26"
float_next_after = "1.0.0"
hexplay = "0.2.1"
num-traits = "0.2.16"
ordered-float = "3.7.0"
thiserror = "1.0"
+chrono = "0.4.26"
[build-dependencies]
anyhow = "1.0.69"
+use std::borrow::Cow;
+
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use encoding_rs::Encoding;
+
+use crate::{
+ Error,
+ {endian::Endian, CategoryLabels, Compression},
+ format::UncheckedFormat,
+};
+
+/// Settings handed to every [`Decode::decode`] call when raw records are
+/// converted into their decoded forms.
+pub struct Decoder {
+ /// Compression scheme, or `None`.
+ /// NOTE(review): not read by any code visible here; presumably `None`
+ /// means the data is uncompressed -- confirm against the reader.
+ pub compression: Option<Compression>,
+ /// Byte order.
+ /// NOTE(review): not read by any code visible here.
+ pub endian: Endian,
+ /// Character encoding that `decode_string` uses to turn raw bytes into
+ /// text.
+ pub encoding: &'static Encoding,
+}
+
+impl Decoder {
+    /// Converts `input` to text using `self.encoding`, without looking for
+    /// a byte-order mark.
+    ///
+    /// When the encoding reports malformed input, `warn` is called once with
+    /// the placeholder [`Error::TBD`]; the decoded text is returned either
+    /// way.
+    fn decode_string<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
+        match self.encoding.decode_without_bom_handling(input) {
+            (text, false) => text,
+            (text, true) => {
+                warn(Error::TBD);
+                text
+            }
+        }
+    }
+}
+
+/// Conversion of a raw record into its decoded counterpart.
+pub trait Decode: Sized {
+ /// The raw record type that this type is decoded from.
+ type Input;
+ /// Builds `Self` from `input` using the settings in `decoder`.
+ ///
+ /// Decoding returns `Self` rather than a `Result`; implementations are
+ /// handed `warn` so that they can report problems through it instead.
+ fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Self;
+}
+
+/// Decoded form of the raw file header (`crate::raw::Header`).
+#[derive(Clone)]
+pub struct Header {
+ /// Decoded text of the raw header's `eye_catcher` field.
+ pub eye_catcher: String,
+ /// The raw header's `weight_index`, converted to `usize`.
+ /// NOTE(review): presumably identifies the weighting variable -- confirm.
+ pub weight_index: Option<usize>,
+ /// The raw header's `n_cases`, converted to `u64`.
+ /// NOTE(review): presumably `None` when the count is not recorded --
+ /// confirm against the raw parser.
+ pub n_cases: Option<u64>,
+ /// Creation timestamp, combined from the raw header's separate
+ /// `creation_date` and `creation_time` fields.
+ pub creation: NaiveDateTime,
+ /// Decoded text of the raw header's `file_label` field.
+ pub file_label: String,
+}
+
+impl Decode for Header {
+    type Input = crate::raw::Header;
+
+    /// Decodes the raw file header.
+    ///
+    /// The text fields are converted with the decoder's encoding.  The
+    /// creation date and time are parsed from their textual forms; if either
+    /// cannot be parsed, `warn` receives [`Error::InvalidCreationDate`] or
+    /// [`Error::InvalidCreationTime`] (carrying the offending text) and the
+    /// type's `Default` value is used instead.
+    fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Self {
+        let eye_catcher = decoder.decode_string(&input.eye_catcher, &warn);
+        let file_label = decoder.decode_string(&input.file_label, &warn);
+
+        // The date is expected as "DD MMM YY" (day, abbreviated month name,
+        // two-digit year, separated by spaces).  chrono's `%v` means
+        // `%e-%b-%Y` (hyphens and a four-digit year), which rejects every
+        // date written in the expected layout, so spell the layout out.
+        let creation_date = decoder.decode_string(&input.creation_date, &warn);
+        let creation_date =
+            NaiveDate::parse_from_str(&creation_date, "%e %b %y").unwrap_or_else(|_| {
+                warn(Error::InvalidCreationDate {
+                    creation_date: creation_date.into(),
+                });
+                Default::default()
+            });
+
+        let creation_time = decoder.decode_string(&input.creation_time, &warn);
+        let creation_time =
+            NaiveTime::parse_from_str(&creation_time, "%H:%M:%S").unwrap_or_else(|_| {
+                warn(Error::InvalidCreationTime {
+                    creation_time: creation_time.into(),
+                });
+                Default::default()
+            });
+
+        Header {
+            eye_catcher: eye_catcher.into(),
+            weight_index: input.weight_index.map(|n| n as usize),
+            n_cases: input.n_cases.map(|n| n as u64),
+            creation: NaiveDateTime::new(creation_date, creation_time),
+            file_label: file_label.into(),
+        }
+    }
+}
+
+/// Decoded form of a variable record.
+/// NOTE(review): no `Decode` implementation for this type is visible in this
+/// chunk, so the field notes below are read off the declarations only.
+pub struct Variable {
+ /// Variable width.
+ /// NOTE(review): the meaning of particular values is not established
+ /// here -- confirm against the raw variable record.
+ pub width: i32,
+ /// Variable name as decoded text.
+ pub name: String,
+ /// Print format, not yet validated (hence `UncheckedFormat`).
+ pub print_format: UncheckedFormat,
+ /// Write format, not yet validated (hence `UncheckedFormat`).
+ pub write_format: UncheckedFormat,
+}
+
+/// Decoded form of a raw document record: one `String` per raw line.
+#[derive(Clone)]
+pub struct Document(Vec<String>);
+
+impl Decode for Document {
+    type Input = crate::raw::Document;
+
+    /// Decodes every line of the raw document record, in order, with the
+    /// decoder's encoding.  Decoding problems are reported through `warn`.
+    fn decode(decoder: &Decoder, input: &Self::Input, warn: impl Fn(Error)) -> Self {
+        let lines: Vec<String> = input
+            .lines
+            .iter()
+            .map(|raw_line| String::from(decoder.decode_string(raw_line, &warn)))
+            .collect();
+        Document(lines)
+    }
+}
+
+pub use crate::raw::FloatInfo;
+pub use crate::raw::IntegerInfo;
+
+/// Kind of a multiple response set (see [`MultipleResponseSet::mr_type`]).
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+ /// Multiple-dichotomy set.
+ /// NOTE(review): `value` is presumably the counted value and `labels`
+ /// the source of category labels -- confirm against the raw parser.
+ MultipleDichotomy {
+ value: String,
+ labels: CategoryLabels,
+ },
+ /// Multiple-category set; carries no extra data.
+ MultipleCategory,
+}
+/// One multiple response set.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+ /// Name of the set.
+ pub name: String,
+ /// Label of the set.
+ pub label: String,
+ /// Kind of set, plus any data specific to that kind.
+ pub mr_type: MultipleResponseType,
+ /// NOTE(review): presumably the names of the member variables -- confirm.
+ pub vars: Vec<String>,
+}
+
+/// A list of [`MultipleResponseSet`]s.
+/// NOTE(review): presumably the decoded form of one multiple response
+/// record -- no `Decode` implementation is visible in this chunk.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseRecord(Vec<MultipleResponseSet>);
+
+/// Product information text held as a single `String`.
+/// NOTE(review): presumably the decoded "extra product info" record --
+/// confirm against the raw module.
+#[derive(Clone, Debug)]
+pub struct ProductInfo(String);
+
+/// Measurement level attached to a variable's display settings.
+/// NOTE(review): the mapping from raw numeric codes to these variants is
+/// not visible here.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Measure {
+    Nominal,
+    Ordinal,
+    Scale,
+}
+
+/// Alignment attached to a variable's display settings.
+/// NOTE(review): the mapping from raw numeric codes to these variants is
+/// not visible here.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Alignment {
+    Left,
+    Right,
+    Center,
+}
+
+/// Display settings for one variable.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct VarDisplay {
+    /// Measurement level, if one is given.
+    pub measure: Option<Measure>,
+    /// Display width.
+    /// NOTE(review): units are not established by the code visible here.
+    pub width: u32,
+    /// Alignment, if one is given.
+    pub align: Option<Alignment>,
+}
+
+/// A list of [`VarDisplay`] entries.
+/// NOTE(review): presumably one entry per variable, in variable order --
+/// confirm against the raw variable display record.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct VarDisplayRecord(pub Vec<VarDisplay>);
pub mod cooked;
pub mod sack;
pub mod encoding;
+pub mod format;
#[derive(ThisError, Debug)]
pub enum Error {
#[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
BadLongMissingValueLength { record_offset: u64, offset: u64, value_len: u32 },
- #[error("This file has corrupted metadata written by a buggy version of PSPP. To fix it, save a new copy of the file.")]
+ #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
BadLongMissingValueFormat,
+ /// The header's creation date text could not be parsed as a date.
+ #[error("File creation date {creation_date} is not in the expected \"DD MMM YY\" format. Using 01 Jan 1970.")]
+ InvalidCreationDate { creation_date: String },
+
+ /// The header's creation time text could not be parsed as a time.
+ #[error("File creation time {creation_time} is not in the expected \"HH:MM:SS\" format. Using midnight.")]
+ InvalidCreationTime { creation_time: String },
+
#[error("Details TBD")]
TBD,
}
+
+/// Compression scheme applied to a file's data.
+#[derive(Copy, Clone, Debug)]
+pub enum Compression {
+ /// NOTE(review): presumably the format's simple bytecode compression --
+ /// confirm against the raw reader.
+ Simple,
+ /// NOTE(review): presumably zlib-compressed data (the raw reader imports
+ /// `flate2::read::ZlibDecoder`) -- confirm.
+ ZLib,
+}
+
+/// Source of category labels for a multiple-dichotomy set (the `labels`
+/// field of `MultipleResponseType::MultipleDichotomy`).
+#[derive(Clone, Debug)]
+pub enum CategoryLabels {
+ /// NOTE(review): presumably labels come from the variables' labels --
+ /// confirm against the raw parser.
+ VarLabels,
+ /// NOTE(review): presumably labels come from the labels of the counted
+ /// values -- confirm against the raw parser.
+ CountedValues,
+}
use anyhow::Result;
use clap::Parser;
-use pspp::{
- raw::{Reader, Record},
-};
+use pspp::raw::{Reader, Record};
use std::fs::File;
use std::io::BufReader;
use std::path::{Path, PathBuf};
let mut reader = Reader::new(reader)?;
let records: Vec<Record> = reader.collect_headers()?;
- let mut n_cases = 0;
for record in records {
println!("{record:?}");
- match record {
- Record::EndOfHeaders(_) if max_cases == 0 => break,
- Record::Case(_) => {
- n_cases += 1;
- if n_cases >= max_cases {
- break;
- }
- }
- _ => (),
- }
+ if let Record::EndOfHeaders(_) = record {
+ break;
+ };
+ }
+
+ for _ in 0..max_cases {
+ let Some(Ok(Record::Case(data))) = reader.next() else {
+ break;
+ };
+ println!("{:?}", data);
}
Ok(())
}
use crate::endian::{Endian, Parse, ToBytes};
-use crate::Error;
+use crate::{CategoryLabels, Compression, Error};
use flate2::read::ZlibDecoder;
use num::Integer;
use self::state::State;
-#[derive(Copy, Clone, Debug)]
-pub enum Compression {
- Simple,
- ZLib,
-}
-
#[derive(Clone, Debug)]
pub enum Record {
Header(Header),
- Document(Document),
Variable(Variable),
ValueLabel(ValueLabel),
VarIndexes(VarIndexes),
+ Document(Document),
IntegerInfo(IntegerInfo),
FloatInfo(FloatInfo),
VariableSets(UnencodedString),
FileAttributes(UnencodedString),
VariableAttributes(UnencodedString),
TextExtension(TextExtension),
- Extension(Extension),
+ OtherExtension(Extension),
EndOfHeaders(u32),
ZHeader(ZHeader),
ZTrailer(ZTrailer),
if let Ok(s) = from_utf8(s) {
s.into()
} else {
- let s: String = s
- .iter()
- .map(|c| char::from(*c))
- .collect();
+ let s: String = s.iter().map(|c| char::from(*c)).collect();
s.into()
}
}
}
}
-fn format_name(type_: u32) -> &'static str {
+fn format_name(type_: u32) -> Cow<'static, str> {
match type_ {
1 => "A",
2 => "AHEX",
39 => "SDATE",
40 => "MTIME",
41 => "YMDHMS",
- _ => "(unknown)",
- }
+ _ => return format!("<unknown format {type_}>").into()
+ }.into()
}
#[derive(Clone)]
pub name: [u8; 8],
/// Print format.
- pub print_format: u32,
+ pub print_format: Format,
/// Write format.
- pub write_format: u32,
+ pub write_format: Format,
/// Missing values.
pub missing_values: MissingValues,
"long string continuation record"
}
)?;
- writeln!(f, "Print format: {:?}", Format(self.print_format))?;
- writeln!(f, "Write format: {:?}", Format(self.write_format))?;
+ writeln!(f, "Print format: {:?}", self.print_format)?;
+ writeln!(f, "Write format: {:?}", self.write_format)?;
writeln!(f, "Name: {:?}", FallbackEncoding(&self.name))?;
- writeln!(
- f,
- "Variable label: {:?}",
- self.label
- )?;
+ writeln!(f, "Variable label: {:?}", self.label)?;
writeln!(f, "Missing values: {:?}", self.missing_values)
}
}
let width: i32 = endian.parse(read_bytes(r)?);
let has_variable_label: u32 = endian.parse(read_bytes(r)?);
let missing_value_code: i32 = endian.parse(read_bytes(r)?);
- let print_format: u32 = endian.parse(read_bytes(r)?);
- let write_format: u32 = endian.parse(read_bytes(r)?);
+ let print_format = Format(endian.parse(read_bytes(r)?));
+ let write_format = Format(endian.parse(read_bytes(r)?));
let name: [u8; 8] = read_bytes(r)?;
let label = match has_variable_label {
let little = format!("{:?}", little);
let big: f64 = Endian::Big.parse(self.0);
let big = format!("{:?}", big);
- let number = if little.len() <= big.len() { little } else { big };
+ let number = if little.len() <= big.len() {
+ little
+ } else {
+ big
+ };
write!(f, "{number}")?;
let string = fallback_encode(&self.0);
- let string = string.split(|c: char| c == '\0' || c.is_control()).next().unwrap();
+ let string = string
+ .split(|c: char| c == '\0' || c.is_control())
+ .next()
+ .unwrap();
write!(f, "/\"{string}\"")?;
Ok(())
}
}
}
-#[derive(Clone, Debug)]
-pub enum CategoryLabels {
- VarLabels,
- CountedValues,
-}
#[derive(Clone, Debug)]
pub enum MultipleResponseType {
MultipleDichotomy {
Ok((string.into(), rest))
}
-pub struct ExtraProductInfo(String);
+pub struct ProductInfo(String);
-impl TextRecord for ExtraProductInfo {
+impl TextRecord for ProductInfo {
const NAME: &'static str = "extra product info";
fn parse(input: &str, _warn: impl Fn(Error)) -> Result<Self, Error> {
- Ok(ExtraProductInfo(input.into()))
+ Ok(ProductInfo(input.into()))
}
}
#[derive(Clone, Debug)]
-pub struct VarDisplayRecord(Vec<u32>);
+pub struct VarDisplayRecord(pub Vec<u32>);
impl ExtensionRecord for VarDisplayRecord {
const SUBTYPE: u32 = 11;
};
values.push(Value::String(value));
}
- let missing_values = MissingValues { values, range: None };
+ let missing_values = MissingValues {
+ values,
+ range: None,
+ };
missing_value_set.push(LongStringMissingValues {
var_name,
- missing_values
+ missing_values,
});
}
Ok(LongStringMissingValueSet(missing_value_set))
fn parse(ext: &Extension, _endian: Endian, _warn: impl Fn(Error)) -> Result<Self, Error> {
ext.check_size::<Self>()?;
- Ok(EncodingRecord(String::from_utf8(ext.data.clone()).map_err(
- |_| Error::BadEncodingName { offset: ext.offset },
- )?))
+ Ok(EncodingRecord(
+ String::from_utf8(ext.data.clone())
+ .map_err(|_| Error::BadEncodingName { offset: ext.offset })?,
+ ))
}
}
data,
};
match subtype {
- IntegerInfo::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfo::parse(&extension, endian, |_| ())?)),
- FloatInfo::SUBTYPE => Ok(Record::FloatInfo(FloatInfo::parse(&extension, endian, |_| ())?)),
- VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse(&extension, endian, |_| ())?)),
- MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse(MultipleResponseRecord::parse(&extension, endian, |_| ())?)),
- LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels(LongStringValueLabelRecord::parse(&extension, endian, |_| ())?)),
- EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse(&extension, endian, |_| ())?)),
- NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse(&extension, endian, |_| ())?)),
- x if x == TextExtensionSubtype::VariableSets as u32 => Ok(Record::VariableSets(UnencodedString(extension.data))),
- x if x == TextExtensionSubtype::ProductInfo as u32 => Ok(Record::ProductInfo(UnencodedString(extension.data))),
- x if x == TextExtensionSubtype::LongNames as u32 => Ok(Record::LongNames(UnencodedString(extension.data))),
- x if x == TextExtensionSubtype::LongStrings as u32 => Ok(Record::LongStrings(UnencodedString(extension.data))),
- x if x == TextExtensionSubtype::FileAttributes as u32 => Ok(Record::FileAttributes(UnencodedString(extension.data))),
- x if x == TextExtensionSubtype::VariableAttributes as u32 => Ok(Record::VariableAttributes(UnencodedString(extension.data))),
- _ => Ok(Record::Extension(extension))
+ IntegerInfo::SUBTYPE => Ok(Record::IntegerInfo(IntegerInfo::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ FloatInfo::SUBTYPE => Ok(Record::FloatInfo(FloatInfo::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ VarDisplayRecord::SUBTYPE => Ok(Record::VarDisplay(VarDisplayRecord::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ MultipleResponseRecord::SUBTYPE | 19 => Ok(Record::MultipleResponse(
+ MultipleResponseRecord::parse(&extension, endian, |_| ())?,
+ )),
+ LongStringValueLabelRecord::SUBTYPE => Ok(Record::LongStringValueLabels(
+ LongStringValueLabelRecord::parse(&extension, endian, |_| ())?,
+ )),
+ EncodingRecord::SUBTYPE => Ok(Record::Encoding(EncodingRecord::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ NumberOfCasesRecord::SUBTYPE => Ok(Record::NumberOfCases(NumberOfCasesRecord::parse(
+ &extension,
+ endian,
+ |_| (),
+ )?)),
+ x if x == TextExtensionSubtype::VariableSets as u32 => {
+ Ok(Record::VariableSets(UnencodedString(extension.data)))
+ }
+ x if x == TextExtensionSubtype::ProductInfo as u32 => {
+ Ok(Record::ProductInfo(UnencodedString(extension.data)))
+ }
+ x if x == TextExtensionSubtype::LongNames as u32 => {
+ Ok(Record::LongNames(UnencodedString(extension.data)))
+ }
+ x if x == TextExtensionSubtype::LongStrings as u32 => {
+ Ok(Record::LongStrings(UnencodedString(extension.data)))
+ }
+ x if x == TextExtensionSubtype::FileAttributes as u32 => {
+ Ok(Record::FileAttributes(UnencodedString(extension.data)))
+ }
+ x if x == TextExtensionSubtype::VariableAttributes as u32 => {
+ Ok(Record::VariableAttributes(UnencodedString(extension.data)))
+ }
+ _ => Ok(Record::OtherExtension(extension)),
}
}
}