From 748bf2fa8affe952130b7c130baca71ca5908a54 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 12 Jul 2025 16:30:44 -0700 Subject: [PATCH] work on comments and cleanliness --- rust/pspp/src/format/mod.rs | 4 +- rust/pspp/src/main.rs | 11 ++- rust/pspp/src/sys/cooked.rs | 2 +- rust/pspp/src/sys/raw.rs | 144 +++++++++++++++++++++++++++--------- rust/pspp/src/sys/test.rs | 4 +- 5 files changed, 119 insertions(+), 46 deletions(-) diff --git a/rust/pspp/src/format/mod.rs b/rust/pspp/src/format/mod.rs index 15d4f0e190..ff36586fe0 100644 --- a/rust/pspp/src/format/mod.rs +++ b/rust/pspp/src/format/mod.rs @@ -800,10 +800,10 @@ impl UncheckedFormat { } } -impl TryFrom for UncheckedFormat { +impl TryFrom for UncheckedFormat { type Error = Error; - fn try_from(raw: raw::Spec) -> Result { + fn try_from(raw: raw::RawFormat) -> Result { let raw = raw.0; let raw_format = (raw >> 16) as u16; let format = raw_format.try_into()?; diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index 8432f4267d..50eaa8adf1 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -19,7 +19,7 @@ use clap::{Args, Parser, Subcommand, ValueEnum}; use encoding_rs::Encoding; use pspp::crypto::EncryptedFile; use pspp::sys::cooked::{Error, Headers}; -use pspp::sys::raw::{encoding_from_headers, Decoder, Magic, Reader, Record, Warning}; +use pspp::sys::raw::{infer_encoding, Decoder, Magic, Reader, Record, Warning}; use std::fs::File; use std::io::{stdout, BufReader, Write}; use std::path::{Path, PathBuf}; @@ -88,15 +88,14 @@ impl Convert { fn run(self) -> Result<()> { let mut reader = Reader::new(BufReader::new(File::open(&self.input)?), Self::warn)?; let headers = reader.headers().collect::, _>>()?; - let encoding = encoding_from_headers(&headers, &mut |w| Self::warn(w))?; - let mut decoder = Decoder::new(encoding, |w| Self::warn(w)); + let mut decoder = Decoder::with_inferred_encoding(&headers, |w| Self::warn(w))?; let mut decoded_records = Vec::new(); for header in headers { decoded_records.push(header.decode(&mut decoder)?); } let headers = Headers::new(decoded_records, &mut |e| Self::err(e))?; let (dictionary, _metadata, cases) = - headers.decode(reader.cases(), encoding, |e| Self::err(e))?; + headers.decode(reader.cases(), decoder.encoding, |e| Self::err(e))?; let writer = match self.output { Some(path) => Box::new(File::create(path)?) as Box, None => Box::new(stdout()), @@ -260,7 +259,7 @@ fn dissect( let headers: Vec = reader.headers().collect::, _>>()?; let encoding = match encoding { Some(encoding) => encoding, - None => encoding_from_headers(&headers, &mut |e| eprintln!("{e}"))?, + None => infer_encoding(&headers, &mut |e| eprintln!("{e}"))?, }; let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}")); for header in headers { @@ -283,7 +282,7 @@ fn dissect( let headers: Vec = reader.headers().collect::, _>>()?; let encoding = match encoding { Some(encoding) => encoding, - None => encoding_from_headers(&headers, &mut |e| eprintln!("{e}"))?, + None => infer_encoding(&headers, &mut |e| eprintln!("{e}"))?, }; let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}")); let mut decoded_records = Vec::new(); diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index 1db24b683c..5a68a2d43d 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -1203,7 +1203,7 @@ fn fix_line_ends(s: &str) -> String { } fn decode_format( - raw: raw::Spec, + raw: raw::RawFormat, width: VarWidth, mut warn: impl FnMut(Format, FormatError), ) -> Format { diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 969fee56f5..fb65892999 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . -//! Raw system file record format. +//! Raw system file record reader. //! //! This module facilitates reading records from system files in all of their //! raw details. Most readers will want to use higher-level interfaces. @@ -607,21 +607,40 @@ impl Record { } } -pub fn encoding_from_headers( - headers: &[Record], +/// Given the raw `records` read from a system file, this tries to figure out +/// the intended character encoding used in the system file: +/// +/// - If there is a character encoding record, it uses that encoding. +/// +/// - If there is an integer info record, it uses that encoding. +/// +/// - Otherwise, it falls back to a default encoding and issuses a warning with +/// `warn`. +/// +/// If the records specify an EBCDIC encoding, this fails with an error because +/// PSPP only supports ASCII-based encodings. +pub fn infer_encoding( + records: &[Record], warn: &mut impl FnMut(Warning), ) -> Result<&'static Encoding, Error> { - let mut encoding_record = None; - let mut integer_info_record = None; - for record in headers { - match record { - Record::Encoding(record) => encoding_record = Some(record), - Record::IntegerInfo(record) => integer_info_record = Some(record), - _ => (), - } - } - let encoding = encoding_record.map(|record| record.0.as_str()); - let character_code = integer_info_record.map(|record| record.character_code); + // Get the character encoding from the first (and only) encoding record. + let encoding = records + .iter() + .filter_map(|record| match record { + Record::Encoding(record) => Some(record.0.as_str()), + _ => None, + }) + .next(); + + // Get the character code from the first (only) integer info record. + let character_code = records + .iter() + .filter_map(|record| match record { + Record::IntegerInfo(record) => Some(record.character_code), + _ => None, + }) + .next(); + match get_encoding(encoding, character_code) { Ok(encoding) => Ok(encoding), Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)), @@ -810,7 +829,9 @@ impl HeaderRecord { } } -/// A type for decoding a [Record] into a [DecodedRecord]. +/// An [Encoding] along with a function to report decoding errors. +/// +/// This is used by functions that decode raw records. pub struct Decoder<'a> { /// The character encoding to use. pub encoding: &'static Encoding, @@ -820,21 +841,20 @@ pub struct Decoder<'a> { } impl<'de> Decoder<'de> { - /// Constructs a decoder for an encoding read or inferred from - /// `records` (using [encoding_from_headers]). This can fail if the headers - /// specify an EBCDIC encoding, since this crate only supports ASCII-based - /// encodings. + /// Constructs a decoder for an encoding read or inferred from `records` + /// (using [infer_encoding]). This can fail if the headers specify an + /// EBCDIC encoding, since this crate only supports ASCII-based encodings. /// /// `warn` will be used to report warnings while decoding records. - pub fn from_headers(records: &[Record], mut warn: F) -> Result + pub fn with_inferred_encoding(records: &[Record], mut warn: F) -> Result where F: FnMut(Warning) + 'de, { - let encoding = encoding_from_headers(records, &mut warn)?; + let encoding = infer_encoding(records, &mut warn)?; Ok(Self::new(encoding, warn)) } - /// Construct a decoder using `encooding`. + /// Construct a decoder using `encoding`. /// /// `warn` will be used to report warnings while decoding records. pub fn new(encoding: &'static Encoding, warn: F) -> Self @@ -988,7 +1008,7 @@ impl TryFrom for VarWidth { } } -/// A [Datum] with knowledge of string width or character encoding. +/// An 8-byte [Datum] but we don't know the string width or character encoding. #[derive(Copy, Clone)] pub enum RawDatum { Number(Option), @@ -1219,6 +1239,18 @@ enum ReaderState { End, } +/// Reads records from a system file in their raw form. +/// +/// This is the lowest-level way to read a system file. To read a system file: +/// +/// 1. Read the raw [Record]s using `Reader`. +/// +/// 2. Create a [Decoder] with an appropriate character encoding. +/// +/// 3. Decode the records into [DecodedRecord]s using the [Decoder]. +/// +/// 4. Assemble the decoded records into [Headers](super::cooked::Headers) +/// and decode them with [Headers::decode](super::cooked::Headers::decode). pub struct Reader<'a, R> where R: Read + Seek + 'static, @@ -1237,6 +1269,11 @@ impl<'a, R> Reader<'a, R> where R: Read + Seek + 'static, { + /// Constructs a new [Reader] from the underlying `reader`. Any warnings + /// encountered while reading the system file will be reported with `warn`. + /// + /// To read an encrypted system file, wrap `reader` in + /// [EncryptedReader](crate::crypto::EncryptedReader). pub fn new(mut reader: R, mut warn: impl FnMut(Warning) + 'a) -> Result { let header = HeaderRecord::read(&mut reader, &mut warn)?; Ok(Self { @@ -1248,19 +1285,29 @@ where cases: None, }) } - pub fn headers<'b>(&'b mut self) -> ReadHeaders<'a, 'b, R> { - ReadHeaders(self) + + /// Returns a structure for reading the system file's header records. + pub fn headers<'b>(&'b mut self) -> HeaderReader<'a, 'b, R> { + HeaderReader(self) } + + /// Returns a structure for reading the system file's cases. + /// + /// The cases are only available once all the headers have been read. If + /// there is an error reading the headers, or if [cases](Self::cases) is + /// called before all of the headers have been read, the returned [Cases] + /// will be empty. pub fn cases(self) -> Cases { self.cases.unwrap_or_default() } } -pub struct ReadHeaders<'a, 'b, R>(&'b mut Reader<'a, R>) +/// Reader for the raw header records in a system file. +pub struct HeaderReader<'a, 'b, R>(&'b mut Reader<'a, R>) where R: Read + Seek + 'static; -impl<'a, 'b, R> ReadHeaders<'a, 'b, R> +impl<'a, 'b, R> HeaderReader<'a, 'b, R> where R: Read + Seek + 'static, { @@ -1339,7 +1386,7 @@ where } } -impl<'a, 'b, R> Iterator for ReadHeaders<'a, 'b, R> +impl<'a, 'b, R> Iterator for HeaderReader<'a, 'b, R> where R: Read + Seek + 'static, { @@ -1357,7 +1404,21 @@ where trait ReadSeek: Read + Seek {} impl ReadSeek for T where T: Read + Seek {} -pub struct Case(pub Vec); +/// A case in a system file. +pub struct Case( + /// One [Datum] per variable in the system file dictionary, in the same + /// order: + /// + /// - A raw [Reader] returns cases in which very long string variables + /// (those over 255 bytes wide) are still in their raw format, which means + /// that they are divided into multiple, adjacent string variables, + /// approximately one variable for each 252 bytes. + /// + /// - [Headers::decode](super::cooked::Headers::decode) returns cases in + /// which each [Dictionary](crate::dictionary::Dictionary) variable + /// corresponds to one [Datum], even for long string variables. + pub Vec, +); #[derive(Debug)] struct StringSegment { @@ -1421,6 +1482,7 @@ impl CaseVar { } } +/// Reader for cases in a system file. pub struct Cases { reader: Box, case_vars: Vec, @@ -1483,6 +1545,11 @@ impl Cases { } } + /// Returns this [Cases] with its notion of variable widths updated from + /// `widths`. + /// + /// [Headers::decode](super::cooked::Headers::decode) uses this to properly + /// handle very long string variables (see [Cases] for details). pub fn with_widths(self, widths: impl IntoIterator) -> Self { Self { case_vars: widths.into_iter().map(CaseVar::new).collect::>(), @@ -1490,6 +1557,8 @@ impl Cases { } } + /// Returns this [Cases] updated to expect `expected_cases`. If the actual + /// number of cases in the file differs, the reader will issue a warning. pub fn with_expected_cases(self, expected_cases: u64) -> Self { Self { expected_cases: Some(expected_cases), @@ -1543,10 +1612,15 @@ impl Iterator for Cases { } } +/// [crate::format::Format] as represented in a system file. #[derive(Copy, Clone, PartialEq, Eq, Hash)] -pub struct Spec(pub u32); +pub struct RawFormat( + /// The most-significant 16 bits are the type, the next 8 bytes are the + /// width, and the least-significant 8 bits are the number of decimals. + pub u32, +); -impl Debug for Spec { +impl Debug for RawFormat { fn fmt(&self, f: &mut Formatter) -> FmtResult { let type_ = format_name(self.0 >> 16); let w = (self.0 >> 8) & 0xff; @@ -1870,10 +1944,10 @@ where pub name: S, /// Print format. - pub print_format: Spec, + pub print_format: RawFormat, /// Write format. - pub write_format: Spec, + pub write_format: RawFormat, /// Missing values. pub missing_values: MissingValues, @@ -1951,8 +2025,8 @@ impl VariableRecord { let code_offset = r.stream_position()?; let has_variable_label: u32 = endian.parse(read_bytes(r)?); let missing_value_code: i32 = endian.parse(read_bytes(r)?); - let print_format = Spec(endian.parse(read_bytes(r)?)); - let write_format = Spec(endian.parse(read_bytes(r)?)); + let print_format = RawFormat(endian.parse(read_bytes(r)?)); + let write_format = RawFormat(endian.parse(read_bytes(r)?)); let name = RawString(read_vec(r, 8)?); let label = match has_variable_label { diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 3f94d8266a..34aba7f282 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -30,7 +30,7 @@ use crate::{ }, sys::{ cooked::Headers, - raw::{encoding_from_headers, Decoder, Reader}, + raw::{infer_encoding, Decoder, Reader}, sack::sack, }, }; @@ -609,7 +609,7 @@ where Ok(headers) => { let cases = reader.cases(); let encoding = - encoding_from_headers(&headers, &mut |warning| warnings.push(warning)).unwrap(); + infer_encoding(&headers, &mut |warning| warnings.push(warning)).unwrap(); let mut decoder = Decoder::new(encoding, |warning| warnings.push(warning)); let mut decoded_records = Vec::new(); for header in headers { -- 2.30.2