From f652573390c956bbdeeb06d64d28cf0c9df5fa5c Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 14 Jul 2025 15:00:30 -0700 Subject: [PATCH] Separate the file header from other records, since it's special --- rust/pspp/src/main.rs | 45 ++++++++++++++++------------- rust/pspp/src/sys/cooked.rs | 57 +++++++++++++++++-------------------- rust/pspp/src/sys/raw.rs | 53 ++++++++++++++-------------------- rust/pspp/src/sys/test.rs | 20 +++++++------ 4 files changed, 85 insertions(+), 90 deletions(-) diff --git a/rust/pspp/src/main.rs b/rust/pspp/src/main.rs index 484b538485..37a9750158 100644 --- a/rust/pspp/src/main.rs +++ b/rust/pspp/src/main.rs @@ -87,16 +87,21 @@ impl Convert { fn run(self) -> Result<()> { let mut reader = Reader::new(BufReader::new(File::open(&self.input)?), Self::warn)?; - let headers = reader.headers().collect::, _>>()?; - let mut decoder = Decoder::with_inferred_encoding(&headers, |w| Self::warn(w))?; + let records = reader.records().collect::, _>>()?; + let mut decoder = Decoder::with_inferred_encoding(&records, |w| Self::warn(w))?; let mut decoded_records = Vec::new(); - for header in headers { - decoded_records.push(header.decode(&mut decoder)?); + for record in records { + decoded_records.push(record.decode(&mut decoder)); } let headers = Headers::new(decoded_records, &mut |e| Self::err(e))?; let SystemFile { dictionary, cases, .. - } = headers.decode(reader.cases(), decoder.encoding, |e| Self::err(e)); + } = headers.decode( + reader.header().clone().decode(&mut decoder), + reader.cases(), + decoder.encoding, + |e| Self::err(e), + ); let writer = match self.output { Some(path) => Box::new(File::create(path)?) as Box, None => Box::new(stdout()), @@ -237,10 +242,7 @@ fn dissect( match mode { Mode::Identify => { - let Record::Header(header) = reader.headers().next().unwrap()? else { - unreachable!() - }; - match header.magic { + match reader.header().magic { Magic::Sav => println!("SPSS System File"), Magic::Zsav => println!("SPSS System File with Zlib compression"), Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"), @@ -248,8 +250,8 @@ fn dissect( return Ok(()); } Mode::Raw => { - for header in reader.headers() { - let header = header?; + for record in reader.records() { + let header = record?; println!("{:?}", header); } for (_index, case) in (0..max_cases).zip(reader.cases()) { @@ -257,13 +259,13 @@ fn dissect( } } Mode::Decoded => { - let headers: Vec = reader.headers().collect::, _>>()?; + let records: Vec = reader.records().collect::, _>>()?; let encoding = match encoding { Some(encoding) => encoding, - None => infer_encoding(&headers, &mut |e| eprintln!("{e}"))?, + None => infer_encoding(&records, &mut |e| eprintln!("{e}"))?, }; let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}")); - for header in headers { + for header in records { let header = header.decode(&mut decoder); println!("{:?}", header); /* @@ -280,22 +282,27 @@ fn dissect( } } Mode::Cooked => { - let headers: Vec = reader.headers().collect::, _>>()?; + let records: Vec = reader.records().collect::, _>>()?; let encoding = match encoding { Some(encoding) => encoding, - None => infer_encoding(&headers, &mut |e| eprintln!("{e}"))?, + None => infer_encoding(&records, &mut |e| eprintln!("{e}"))?, }; let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}")); let mut decoded_records = Vec::new(); - for header in headers { - decoded_records.push(header.decode(&mut decoder)?); + for record in records { + decoded_records.push(record.decode(&mut decoder)); } let headers = Headers::new(decoded_records, &mut |e| eprintln!("{e}"))?; let SystemFile { dictionary, metadata, cases: _, - } = headers.decode(reader.cases(), encoding, |e| eprintln!("{e}")); + } = headers.decode( + reader.header().clone().decode(&mut decoder), + reader.cases(), + encoding, + |e| eprintln!("{e}"), + ); println!("{dictionary:#?}"); println!("{metadata:#?}"); } diff --git a/rust/pspp/src/sys/cooked.rs b/rust/pspp/src/sys/cooked.rs index ff78c14ea5..d539511411 100644 --- a/rust/pspp/src/sys/cooked.rs +++ b/rust/pspp/src/sys/cooked.rs @@ -335,7 +335,6 @@ pub struct SystemFile { #[derive(Clone, Debug)] pub struct Headers { - pub header: HeaderRecord, pub variable: Vec>, pub value_label: Vec>, pub document: Vec>, @@ -372,10 +371,9 @@ fn take_first( impl Headers { pub fn new( - headers: Vec, - warn: &mut impl FnMut(Error), + records: Vec, + mut warn: impl FnMut(Error), ) -> Result { - let mut file_header = Vec::new(); let mut variable = Vec::new(); let mut value_label = Vec::new(); let mut document = Vec::new(); @@ -398,11 +396,8 @@ impl Headers { let mut z_header = Vec::new(); let mut z_trailer = Vec::new(); - for header in headers { - match header { - DecodedRecord::Header(record) => { - file_header.push(record); - } + for record in records { + match record { DecodedRecord::Variable(record) => { variable.push(record); } @@ -469,45 +464,41 @@ impl Headers { } } - let Some(file_header) = take_first(file_header, "file header", warn) else { - return Err(Error::MissingHeaderRecord); - }; - Ok(Headers { - header: file_header, variable, value_label, document, - integer_info: take_first(integer_info, "integer info", warn), - float_info: take_first(float_info, "float info", warn), - var_display: take_first(var_display, "variable display", warn), + integer_info: take_first(integer_info, "integer info", &mut warn), + float_info: take_first(float_info, "float info", &mut warn), + var_display: take_first(var_display, "variable display", &mut warn), multiple_response, long_string_value_labels, long_string_missing_values, - encoding: take_first(encoding, "encoding", warn), - number_of_cases: take_first(number_of_cases, "number of cases", warn), + encoding: take_first(encoding, "encoding", &mut warn), + number_of_cases: take_first(number_of_cases, "number of cases", &mut warn), variable_sets, - product_info: take_first(product_info, "product info", warn), + product_info: take_first(product_info, "product info", &mut warn), long_names, very_long_strings, file_attributes, variable_attributes, other_extension, - end_of_headers: take_first(end_of_headers, "end of headers", warn), - z_header: take_first(z_header, "z_header", warn), - z_trailer: take_first(z_trailer, "z_trailer", warn), + end_of_headers: take_first(end_of_headers, "end of headers", &mut warn), + z_header: take_first(z_header, "z_header", &mut warn), + z_trailer: take_first(z_trailer, "z_trailer", &mut warn), }) } pub fn decode( mut self, + header: HeaderRecord, mut cases: Cases, encoding: &'static Encoding, mut warn: impl FnMut(Error), ) -> SystemFile { let mut dictionary = Dictionary::new(encoding); - let file_label = fix_line_ends(self.header.file_label.trim_end_matches(' ')); + let file_label = fix_line_ends(header.file_label.trim_end_matches(' ')); if !file_label.is_empty() { dictionary.file_label = Some(file_label); } @@ -531,7 +522,7 @@ impl Headers { warn(Error::UnexpectedFloatFormat(floating_point_rep)) } - let expected = match self.header.endian { + let expected = match header.endian { Endian::Big => 1, Endian::Little => 2, }; @@ -562,7 +553,7 @@ impl Headers { } } - if let Some(nominal_case_size) = self.header.nominal_case_size { + if let Some(nominal_case_size) = header.nominal_case_size { let n_vars = self.variable.len(); if n_vars != nominal_case_size as usize && self @@ -674,7 +665,7 @@ impl Headers { value_index += n_values; } - if let Some(weight_index) = self.header.weight_index { + if let Some(weight_index) = header.weight_index { let index = weight_index as usize - 1; if index >= value_index { warn(Error::WeightIndexOutOfRange { @@ -723,7 +714,7 @@ impl Headers { }); } - let written_by_readstat = self.header.eye_catcher.contains("ReadStat"); + let written_by_readstat = header.eye_catcher.contains("ReadStat"); for dict_index in dict_indexes { let variable = dictionary.variables.get_index_mut2(dict_index).unwrap(); let mut duplicates = Vec::new(); @@ -1020,7 +1011,7 @@ impl Headers { }); } - let metadata = Metadata::decode(&self, warn); + let metadata = Metadata::decode(&header, &self, warn); if let Some(n_cases) = metadata.n_cases { cases = cases.with_expected_cases(n_cases); } @@ -1116,8 +1107,12 @@ impl Metadata { (group, values) } - fn decode(headers: &Headers, mut warn: impl FnMut(Error)) -> Self { - let header = &headers.header; + fn decode( + header: &HeaderRecord, + headers: &Headers, + mut warn: impl FnMut(Error), + ) -> Self { + let header = &header; let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %y") .unwrap_or_else(|_| { warn(Error::InvalidCreationDate { diff --git a/rust/pspp/src/sys/raw.rs b/rust/pspp/src/sys/raw.rs index 97fe776d61..572e6b94ee 100644 --- a/rust/pspp/src/sys/raw.rs +++ b/rust/pspp/src/sys/raw.rs @@ -498,11 +498,6 @@ impl From for WarningDetails { #[allow(missing_docs)] // Don't warn for missing docs on tuple members. #[derive(Clone, Debug)] pub enum Record { - /// The file header. - /// - /// Every system file has exactly one header record, at its very beginning. - Header(HeaderRecord), - /// Variable record. /// /// Each numeric variable has one variable record. Each string variable has @@ -590,9 +585,6 @@ pub enum Record { /// or strings. #[derive(Clone, Debug)] pub enum DecodedRecord { - /// File header, with strings decoded. - Header(HeaderRecord), - /// Variable record, with strings decoded. Variable(VariableRecord), @@ -687,9 +679,8 @@ impl Record { } /// Decodes this record into a [DecodedRecord] using `decoder`. - pub fn decode(self, decoder: &mut Decoder) -> Result { - Ok(match self { - Record::Header(record) => DecodedRecord::Header(record.decode(decoder)), + pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord { + match self { Record::Variable(record) => DecodedRecord::Variable(record.decode(decoder)), Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)), Record::Document(record) => DecodedRecord::Document(record.decode(decoder)), @@ -721,7 +712,7 @@ impl Record { Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record), Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()), Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()), - }) + } } } @@ -739,7 +730,7 @@ impl Record { /// PSPP only supports ASCII-based encodings. pub fn infer_encoding( records: &[Record], - warn: &mut impl FnMut(Warning), + mut warn: impl FnMut(Warning), ) -> Result<&'static Encoding, Error> { // Get the character encoding from the first (and only) encoding record. let encoding = records @@ -1157,7 +1148,6 @@ where } enum ReaderState { - Start, Headers, ZlibHeader, ZlibTrailer(ZHeader), @@ -1206,14 +1196,19 @@ where warn: Box::new(warn), header, var_types: VarTypes::new(), - state: ReaderState::Start, + state: ReaderState::Headers, cases: None, }) } - /// Returns a structure for reading the system file's header records. - pub fn headers<'b>(&'b mut self) -> HeaderReader<'a, 'b, R> { - HeaderReader(self) + /// Returns the header in this reader. + pub fn header(&self) -> &HeaderRecord { + &self.header + } + + /// Returns a structure for reading the system file's records. + pub fn records<'b>(&'b mut self) -> Records<'a, 'b, R> { + Records(self) } /// Returns a structure for reading the system file's cases. @@ -1227,12 +1222,12 @@ where } } -/// Reader for the raw header records in a system file. -pub struct HeaderReader<'a, 'b, R>(&'b mut Reader<'a, R>) +/// Reads raw records from a system file. +pub struct Records<'a, 'b, R>(&'b mut Reader<'a, R>) where R: Read + Seek + 'static; -impl<'a, 'b, R> HeaderReader<'a, 'b, R> +impl<'a, 'b, R> Records<'a, 'b, R> where R: Read + Seek + 'static, { @@ -1247,10 +1242,6 @@ where fn _next(&mut self) -> Option<::Item> { match self.0.state { - ReaderState::Start => { - self.0.state = ReaderState::Headers; - Some(Ok(Record::Header(self.0.header.clone()))) - } ReaderState::Headers => { let record = loop { match Record::read( @@ -1311,18 +1302,18 @@ where } } -impl<'a, 'b, R> Iterator for HeaderReader<'a, 'b, R> +impl<'a, 'b, R> Iterator for Records<'a, 'b, R> where R: Read + Seek + 'static, { type Item = Result; fn next(&mut self) -> Option { - let retval = self._next(); - if matches!(retval, Some(Err(_))) { - self.0.state = ReaderState::End; - } - retval + self._next().inspect(|retval| { + if retval.is_err() { + self.0.state = ReaderState::End; + } + }) } } diff --git a/rust/pspp/src/sys/test.rs b/rust/pspp/src/sys/test.rs index e0d0b90439..3c4e9e97d9 100644 --- a/rust/pspp/src/sys/test.rs +++ b/rust/pspp/src/sys/test.rs @@ -603,24 +603,26 @@ where { let mut warnings = Vec::new(); let mut reader = Reader::new(sysfile, |warning| warnings.push(warning)).unwrap(); - let output = match reader.headers().collect::, _>>() { - Ok(headers) => { + let output = match reader.records().collect::, _>>() { + Ok(records) => { + let header = reader.header().clone(); let cases = reader.cases(); - let encoding = infer_encoding(&headers, &mut |warning| warnings.push(warning)).unwrap(); + let encoding = infer_encoding(&records, |warning| warnings.push(warning)).unwrap(); let mut decoder = Decoder::new(encoding, |warning| warnings.push(warning)); - let mut decoded_records = Vec::new(); - for header in headers { - decoded_records.push(header.decode(&mut decoder).unwrap()); - } + let header = header.decode(&mut decoder); + let decoded_records = records + .into_iter() + .map(|record| record.decode(&mut decoder)) + .collect::>(); drop(decoder); let mut errors = Vec::new(); - let headers = Headers::new(decoded_records, &mut |e| errors.push(e)).unwrap(); + let headers = Headers::new(decoded_records, |e| errors.push(e)).unwrap(); let SystemFile { dictionary, metadata, cases, - } = headers.decode(cases, encoding, |e| errors.push(e)); + } = headers.decode(header, cases, encoding, |e| errors.push(e)); let (group, data) = metadata.to_pivot_rows(); let metadata_table = PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data( data.into_iter() -- 2.30.2