From: Ben Pfaff Date: Sat, 12 Jul 2025 22:31:22 +0000 (-0700) Subject: work on comments X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6fbdfd37296cee6934a8a76ae80b2b38c14d63a3;p=pspp work on comments --- diff --git a/rust/pspp/src/sys/mod.rs b/rust/pspp/src/sys/mod.rs index 02d0801859..10b51ced6d 100644 --- a/rust/pspp/src/sys/mod.rs +++ b/rust/pspp/src/sys/mod.rs @@ -14,9 +14,19 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . +//! Reading and writing system files. +//! +//! This module enables reading and writing "system files", the binary format +//! for SPSS data files. The system file format dates back 40+ years and has +//! evolved greatly over that time to support new features, but in a way to +//! facilitate interchange between even the oldest and newest versions of +//! software. + pub mod cooked; pub mod encoding; pub mod raw; + +#[cfg(test)] pub mod sack; #[cfg(test)] diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 9a8a3b90ee..926a52263e 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -14,6 +14,11 @@ // You should have received a copy of the GNU General Public License along with // this program. If not, see . +//! Raw system file record format. +//! +//! This module facilitates reading records from system files in all of their +//! raw details. Most readers will want to use higher-level interfaces. + use crate::{ dictionary::{Attributes, Datum, VarWidth}, endian::{Endian, Parse, ToBytes}, @@ -41,6 +46,9 @@ use std::{ }; use thiserror::Error as ThisError; +/// An error encountered reading raw system file records. +/// +/// Any error prevents reading further data from the system file. #[derive(ThisError, Debug)] pub enum Error { #[error("Not an SPSS system file")] @@ -187,6 +195,10 @@ pub enum Error { EncodingError(EncodingError), } +/// A warning reading a raw system file record. 
+/// +/// Warnings indicate that something may be amiss, but they do not prevent +/// reading further records. #[derive(ThisError, Debug)] pub enum Warning { #[error("Unexpected end of data inside extension record.")] @@ -369,55 +381,166 @@ impl From for Warning { } } +/// A raw record in a system file. +#[allow(missing_docs)] // Don't warn for missing docs on tuple members. #[derive(Clone, Debug)] pub enum Record { + /// The file header. + /// + /// Every system file has exactly one header record, at its very beginning. Header(HeaderRecord), + + /// Variable record. + /// + /// Each numeric variable has one variable record. Each string variable has + /// one variable record per 8-byte segment. Variable(VariableRecord), + + /// Value labels for numeric and short string variables. + /// + /// These appear after the variable records. ValueLabel(ValueLabelRecord, RawString>), + + /// Document record. Document(DocumentRecord), + + /// Integer info record. IntegerInfo(IntegerInfoRecord), + + /// Floating-point info record. FloatInfo(FloatInfoRecord), + + /// Variable display record. VarDisplay(VarDisplayRecord), + + /// Multiple response variable record. MultipleResponse(MultipleResponseRecord), + + /// Value labels for long string variables. LongStringValueLabels(LongStringValueLabelRecord), + + /// Missing values for long string variables. + /// + /// Missing values for numeric and short string variables appear in the + /// variable records. LongStringMissingValues(LongStringMissingValueRecord), + + /// Encoding record. + /// + /// All the strings in the file are encoded in this encoding, even for + /// strings that precede this record. Encoding(EncodingRecord), + + /// Extended number of cases. + /// + /// The header record records the number of cases but it only uses a 32-bit + /// field. NumberOfCases(NumberOfCasesRecord), + + /// Variable sets. VariableSets(RawVariableSetRecord), + + /// Product info. + /// + /// This supplements the product in the header record. 
ProductInfo(RawProductInfoRecord), + + /// Long variable names. LongNames(RawLongNamesRecord), + + /// Very long string variables, for strings longer than 255 bytes. VeryLongStrings(RawVeryLongStringsRecord), + + /// File attributes. FileAttributes(RawFileAttributesRecord), + + /// Variable attributes. VariableAttributes(RawVariableAttributesRecord), + + /// Extension records not otherwise supported. OtherExtension(Extension), + + /// End of headers. EndOfHeaders(u32), + + /// Header record for ZLIB-compressed data. ZHeader(ZHeader), + + /// Trailer record for ZLIB-compressed data. ZTrailer(ZTrailer), } +/// A [Record] that has been decoded to a more usable form. +/// +/// Some records can be understood raw, but others need to have strings decoded +/// (and interpreted as identifiers) or raw data interpreted as either numbers +/// or strings. #[derive(Clone, Debug)] pub enum DecodedRecord { + /// File header, with strings decoded. Header(HeaderRecord), + + /// Variable record, with strings decoded. Variable(VariableRecord), + + /// Value label, with strings decoded. ValueLabel(ValueLabelRecord, String>), + + /// Documents, with strings decoded. Document(DocumentRecord), + + /// Integer info. IntegerInfo(IntegerInfoRecord), + + /// Floating-point info. FloatInfo(FloatInfoRecord), + + /// Variable display info. VarDisplay(VarDisplayRecord), + + /// Multiple response sets, with strings decoded. MultipleResponse(MultipleResponseRecord), + + /// Long string value labels, with strings decoded. LongStringValueLabels(LongStringValueLabelRecord), + + /// Long string missing values, with strings decoded. LongStringMissingValues(LongStringMissingValueRecord), + + /// Encoding record. Encoding(EncodingRecord), + + /// Number of cases record. NumberOfCases(NumberOfCasesRecord), + + /// Variable sets. VariableSets(VariableSetRecord), + + /// Product info. ProductInfo(ProductInfoRecord), + + /// Long variable names. LongNames(LongNamesRecord), + + /// Very long string variables. 
VeryLongStrings(VeryLongStringsRecord), + + /// File attributes. FileAttributes(FileAttributesRecord), + + /// Variable attributes. VariableAttributes(VariableAttributesRecord), + + /// Extension records not otherwise supported. OtherExtension(Extension), + + /// End of headers. EndOfHeaders(u32), + + /// Header record for ZLIB-compressed data. ZHeader(ZHeader), + + /// Trailer record for ZLIB-compressed data. ZTrailer(ZTrailer), } @@ -447,6 +570,7 @@ impl Record { } } + /// Decodes this record into a [DecodedRecord] using `decoder`. pub fn decode(self, decoder: &mut Decoder) -> Result { Ok(match self { Record::Header(record) => record.decode(decoder), @@ -484,7 +608,7 @@ impl Record { } pub fn encoding_from_headers( - headers: &Vec, + headers: &[Record], warn: &mut impl FnMut(Warning), ) -> Result<&'static Encoding, Error> { let mut encoding_record = None; @@ -686,12 +810,33 @@ impl HeaderRecord { } } +/// A type for decoding a [Record] into a [DecodedRecord]. pub struct Decoder<'a> { + /// The character encoding to use. pub encoding: &'static Encoding, + + /// Used to report [Warning]s during decoding. pub warn: Box, } impl<'de> Decoder<'de> { + /// Constructs a decoder for an encoding read or inferred from + /// `records` (using [encoding_from_headers]). This can fail if the headers + /// specify an EBCDIC encoding, since this crate only supports ASCII-based + /// encodings. + /// + /// `warn` will be used to report warnings while decoding records. + pub fn from_headers(records: &[Record], mut warn: F) -> Result + where + F: FnMut(Warning) + 'de, + { + let encoding = encoding_from_headers(records, &mut warn)?; + Ok(Self::new(encoding, warn)) + } + + /// Constructs a decoder using `encoding`. + /// + /// `warn` will be used to report warnings while decoding records. 
pub fn new(encoding: &'static Encoding, warn: F) -> Self where F: FnMut(Warning) + 'de, @@ -701,9 +846,11 @@ impl<'de> Decoder<'de> { warn: Box::new(warn), } } + fn warn(&mut self, warning: Warning) { (self.warn)(warning) } + fn decode_slice<'a>(&mut self, input: &'a [u8]) -> Cow<'a, str> { let (output, malformed) = self.encoding.decode_without_bom_handling(input); if malformed { @@ -719,11 +866,13 @@ impl<'de> Decoder<'de> { self.decode_slice(input.0.as_slice()) } + /// Decodes `input` to an [Identifier] using our encoding. pub fn decode_identifier(&mut self, input: &RawString) -> Result { let decoded = &self.decode(input); self.new_identifier(decoded) } + /// Constructs an [Identifier] from `name` using our encoding. pub fn new_identifier(&self, name: &str) -> Result { Identifier::from_encoding(name, self.encoding) } @@ -777,9 +926,13 @@ impl TryFrom<[u8; 4]> for Magic { } } +/// Variable type. #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum VarType { + /// A numeric variable. Numeric, + + /// A string variable. String, } @@ -835,15 +988,30 @@ impl TryFrom for VarWidth { } } -type RawDatum = Datum>; +/// A [Datum] for which the character encoding and variable width are not yet known. +pub type RawDatum = Datum>; impl RawDatum { + /// Constructs a `RawDatum` from `raw` given that we now know the variable + /// type and endianness. pub fn from_raw(raw: &UntypedDatum, var_type: VarType, endian: Endian) -> Self { match var_type { VarType::String => Datum::String(RawStrArray(raw.0)), VarType::Numeric => Datum::Number(endian.parse(raw.0)), } } + + /// Decodes a `RawDatum` into a [Datum] given that we now know the string + /// width. 
+ pub fn decode(&self, width: VarWidth) -> Datum { + match self { + Self::Number(x) => Datum::Number(*x), + Self::String(s) => { + let width = width.as_string_width().unwrap(); + Datum::String(RawString::from(&s.0[..width])) + } + } + } } impl Datum { @@ -985,18 +1153,6 @@ impl Datum { } } -impl RawDatum { - pub fn decode(&self, width: VarWidth) -> Datum { - match self { - Self::Number(x) => Datum::Number(*x), - Self::String(s) => { - let width = width.as_string_width().unwrap(); - Datum::String(RawString::from(&s.0[..width])) - } - } - } -} - struct ZlibDecodeMultiple where R: Read + Seek, @@ -1838,6 +1994,10 @@ impl VariableRecord { } } +/// 8 bytes that represent a number or a string (but that's all we know). +/// +/// Used when we don't know whether it's a number or a string, or the string +/// width, or the character encoding. #[derive(Copy, Clone)] pub struct UntypedDatum(pub [u8; 8]); diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index 16ec17551e..3f94d8266a 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -605,7 +605,7 @@ where { let mut warnings = Vec::new(); let mut reader = Reader::new(sysfile, |warning| warnings.push(warning)).unwrap(); - let output = match reader.headers().collect() { + let output = match reader.headers().collect::, _>>() { Ok(headers) => { let cases = reader.cases(); let encoding =