use encoding_rs::Encoding;
use pspp::crypto::EncryptedFile;
use pspp::sys::cooked::{Error, Headers};
-use pspp::sys::raw::{encoding_from_headers, Decoder, Magic, Reader, Record, Warning};
+use pspp::sys::raw::{infer_encoding, Decoder, Magic, Reader, Record, Warning};
use std::fs::File;
use std::io::{stdout, BufReader, Write};
use std::path::{Path, PathBuf};
fn run(self) -> Result<()> {
let mut reader = Reader::new(BufReader::new(File::open(&self.input)?), Self::warn)?;
let headers = reader.headers().collect::<Result<Vec<_>, _>>()?;
- let encoding = encoding_from_headers(&headers, &mut |w| Self::warn(w))?;
- let mut decoder = Decoder::new(encoding, |w| Self::warn(w));
+ let mut decoder = Decoder::with_inferred_encoding(&headers, |w| Self::warn(w))?;
let mut decoded_records = Vec::new();
for header in headers {
decoded_records.push(header.decode(&mut decoder)?);
}
let headers = Headers::new(decoded_records, &mut |e| Self::err(e))?;
let (dictionary, _metadata, cases) =
- headers.decode(reader.cases(), encoding, |e| Self::err(e))?;
+ headers.decode(reader.cases(), decoder.encoding, |e| Self::err(e))?;
let writer = match self.output {
Some(path) => Box::new(File::create(path)?) as Box<dyn Write>,
None => Box::new(stdout()),
let headers: Vec<Record> = reader.headers().collect::<Result<Vec<_>, _>>()?;
let encoding = match encoding {
Some(encoding) => encoding,
- None => encoding_from_headers(&headers, &mut |e| eprintln!("{e}"))?,
+ None => infer_encoding(&headers, &mut |e| eprintln!("{e}"))?,
};
let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
for header in headers {
let headers: Vec<Record> = reader.headers().collect::<Result<Vec<_>, _>>()?;
let encoding = match encoding {
Some(encoding) => encoding,
- None => encoding_from_headers(&headers, &mut |e| eprintln!("{e}"))?,
+ None => infer_encoding(&headers, &mut |e| eprintln!("{e}"))?,
};
let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
let mut decoded_records = Vec::new();
// You should have received a copy of the GNU General Public License along with
// this program. If not, see <http://www.gnu.org/licenses/>.
-//! Raw system file record format.
+//! Raw system file record reader.
//!
//! This module facilitates reading records from system files in all of their
//! raw details. Most readers will want to use higher-level interfaces.
}
}
-pub fn encoding_from_headers(
- headers: &[Record],
+/// Given the raw `records` read from a system file, this tries to figure out
+/// the intended character encoding used in the system file:
+///
+/// - If there is a character encoding record, it uses that encoding.
+///
+/// - If there is an integer info record, it uses that encoding.
+///
+/// - Otherwise, it falls back to a default encoding and issuses a warning with
+/// `warn`.
+///
+/// If the records specify an EBCDIC encoding, this fails with an error because
+/// PSPP only supports ASCII-based encodings.
+pub fn infer_encoding(
+ records: &[Record],
warn: &mut impl FnMut(Warning),
) -> Result<&'static Encoding, Error> {
- let mut encoding_record = None;
- let mut integer_info_record = None;
- for record in headers {
- match record {
- Record::Encoding(record) => encoding_record = Some(record),
- Record::IntegerInfo(record) => integer_info_record = Some(record),
- _ => (),
- }
- }
- let encoding = encoding_record.map(|record| record.0.as_str());
- let character_code = integer_info_record.map(|record| record.character_code);
+ // Get the character encoding from the first (and only) encoding record.
+ let encoding = records
+ .iter()
+ .filter_map(|record| match record {
+ Record::Encoding(record) => Some(record.0.as_str()),
+ _ => None,
+ })
+ .next();
+
+ // Get the character code from the first (only) integer info record.
+ let character_code = records
+ .iter()
+ .filter_map(|record| match record {
+ Record::IntegerInfo(record) => Some(record.character_code),
+ _ => None,
+ })
+ .next();
+
match get_encoding(encoding, character_code) {
Ok(encoding) => Ok(encoding),
Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
}
}
-/// A type for decoding a [Record] into a [DecodedRecord].
+/// An [Encoding] along with a function to report decoding errors.
+///
+/// This is used by functions that decode raw records.
pub struct Decoder<'a> {
/// The character encoding to use.
pub encoding: &'static Encoding,
}
impl<'de> Decoder<'de> {
- /// Constructs a decoder for an encoding read or inferred from
- /// `records` (using [encoding_from_headers]). This can fail if the headers
- /// specify an EBCDIC encoding, since this crate only supports ASCII-based
- /// encodings.
+ /// Constructs a decoder for an encoding read or inferred from `records`
+ /// (using [infer_encoding]). This can fail if the headers specify an
+ /// EBCDIC encoding, since this crate only supports ASCII-based encodings.
///
/// `warn` will be used to report warnings while decoding records.
- pub fn from_headers<F>(records: &[Record], mut warn: F) -> Result<Self, Error>
+ pub fn with_inferred_encoding<F>(records: &[Record], mut warn: F) -> Result<Self, Error>
where
F: FnMut(Warning) + 'de,
{
- let encoding = encoding_from_headers(records, &mut warn)?;
+ let encoding = infer_encoding(records, &mut warn)?;
Ok(Self::new(encoding, warn))
}
- /// Construct a decoder using `encooding`.
+ /// Construct a decoder using `encoding`.
///
/// `warn` will be used to report warnings while decoding records.
pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
}
}
-/// A [Datum] with knowledge of string width or character encoding.
+/// An 8-byte [Datum] but we don't know the string width or character encoding.
#[derive(Copy, Clone)]
pub enum RawDatum {
Number(Option<f64>),
End,
}
+/// Reads records from a system file in their raw form.
+///
+/// This is the lowest-level way to read a system file. To read a system file:
+///
+/// 1. Read the raw [Record]s using `Reader`.
+///
+/// 2. Create a [Decoder] with an appropriate character encoding.
+///
+/// 3. Decode the records into [DecodedRecord]s using the [Decoder].
+///
+/// 4. Assemble the decoded records into [Headers](super::cooked::Headers)
+/// and decode them with [Headers::decode](super::cooked::Headers::decode).
pub struct Reader<'a, R>
where
R: Read + Seek + 'static,
where
R: Read + Seek + 'static,
{
+ /// Constructs a new [Reader] from the underlying `reader`. Any warnings
+ /// encountered while reading the system file will be reported with `warn`.
+ ///
+ /// To read an encrypted system file, wrap `reader` in
+ /// [EncryptedReader](crate::crypto::EncryptedReader).
pub fn new(mut reader: R, mut warn: impl FnMut(Warning) + 'a) -> Result<Self, Error> {
let header = HeaderRecord::read(&mut reader, &mut warn)?;
Ok(Self {
cases: None,
})
}
- pub fn headers<'b>(&'b mut self) -> ReadHeaders<'a, 'b, R> {
- ReadHeaders(self)
+
+ /// Returns a structure for reading the system file's header records.
+ pub fn headers<'b>(&'b mut self) -> HeaderReader<'a, 'b, R> {
+ HeaderReader(self)
}
+
+ /// Returns a structure for reading the system file's cases.
+ ///
+ /// The cases are only available once all the headers have been read. If
+ /// there is an error reading the headers, or if [cases](Self::cases) is
+ /// called before all of the headers have been read, the returned [Cases]
+ /// will be empty.
pub fn cases(self) -> Cases {
self.cases.unwrap_or_default()
}
}
-pub struct ReadHeaders<'a, 'b, R>(&'b mut Reader<'a, R>)
+/// Reader for the raw header records in a system file.
+pub struct HeaderReader<'a, 'b, R>(&'b mut Reader<'a, R>)
where
R: Read + Seek + 'static;
-impl<'a, 'b, R> ReadHeaders<'a, 'b, R>
+impl<'a, 'b, R> HeaderReader<'a, 'b, R>
where
R: Read + Seek + 'static,
{
}
}
-impl<'a, 'b, R> Iterator for ReadHeaders<'a, 'b, R>
+impl<'a, 'b, R> Iterator for HeaderReader<'a, 'b, R>
where
R: Read + Seek + 'static,
{
trait ReadSeek: Read + Seek {}
impl<T> ReadSeek for T where T: Read + Seek {}
-pub struct Case(pub Vec<Datum>);
+/// A case in a system file.
+pub struct Case(
+ /// One [Datum] per variable in the system file dictionary, in the same
+ /// order:
+ ///
+ /// - A raw [Reader] returns cases in which very long string variables
+ /// (those over 255 bytes wide) are still in their raw format, which means
+ /// that they are divided into multiple, adjacent string variables,
+ /// approximately one variable for each 252 bytes.
+ ///
+ /// - [Headers::decode](super::cooked::Headers::decode) returns cases in
+ /// which each [Dictionary](crate::dictionary::Dictionary) variable
+ /// corresponds to one [Datum], even for long string variables.
+ pub Vec<Datum>,
+);
#[derive(Debug)]
struct StringSegment {
}
}
+/// Reader for cases in a system file.
pub struct Cases {
reader: Box<dyn ReadSeek>,
case_vars: Vec<CaseVar>,
}
}
+ /// Returns this [Cases] with its notion of variable widths updated from
+ /// `widths`.
+ ///
+ /// [Headers::decode](super::cooked::Headers::decode) uses this to properly
+ /// handle very long string variables (see [Cases] for details).
pub fn with_widths(self, widths: impl IntoIterator<Item = VarWidth>) -> Self {
Self {
case_vars: widths.into_iter().map(CaseVar::new).collect::<Vec<_>>(),
}
}
+ /// Returns this [Cases] updated to expect `expected_cases`. If the actual
+ /// number of cases in the file differs, the reader will issue a warning.
pub fn with_expected_cases(self, expected_cases: u64) -> Self {
Self {
expected_cases: Some(expected_cases),
}
}
+/// [crate::format::Format] as represented in a system file.
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub struct Spec(pub u32);
+pub struct RawFormat(
+ /// The most-significant 16 bits are the type, the next 8 bytes are the
+ /// width, and the least-significant 8 bits are the number of decimals.
+ pub u32,
+);
-impl Debug for Spec {
+impl Debug for RawFormat {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
let type_ = format_name(self.0 >> 16);
let w = (self.0 >> 8) & 0xff;
pub name: S,
/// Print format.
- pub print_format: Spec,
+ pub print_format: RawFormat,
/// Write format.
- pub write_format: Spec,
+ pub write_format: RawFormat,
/// Missing values.
pub missing_values: MissingValues,
let code_offset = r.stream_position()?;
let has_variable_label: u32 = endian.parse(read_bytes(r)?);
let missing_value_code: i32 = endian.parse(read_bytes(r)?);
- let print_format = Spec(endian.parse(read_bytes(r)?));
- let write_format = Spec(endian.parse(read_bytes(r)?));
+ let print_format = RawFormat(endian.parse(read_bytes(r)?));
+ let write_format = RawFormat(endian.parse(read_bytes(r)?));
let name = RawString(read_vec(r, 8)?);
let label = match has_variable_label {