Separate the file header from other records, since it's special
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 14 Jul 2025 22:00:30 +0000 (15:00 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 14 Jul 2025 22:00:30 +0000 (15:00 -0700)
rust/pspp/src/main.rs
rust/pspp/src/sys/cooked.rs
rust/pspp/src/sys/raw.rs
rust/pspp/src/sys/test.rs

index 484b538485c7284cc3ab9aa804be8f615fe1d7fd..37a97501583ac37c0f7da4f665584ea9a5f2a236 100644 (file)
@@ -87,16 +87,21 @@ impl Convert {
 
     fn run(self) -> Result<()> {
         let mut reader = Reader::new(BufReader::new(File::open(&self.input)?), Self::warn)?;
-        let headers = reader.headers().collect::<Result<Vec<_>, _>>()?;
-        let mut decoder = Decoder::with_inferred_encoding(&headers, |w| Self::warn(w))?;
+        let records = reader.records().collect::<Result<Vec<_>, _>>()?;
+        let mut decoder = Decoder::with_inferred_encoding(&records, |w| Self::warn(w))?;
         let mut decoded_records = Vec::new();
-        for header in headers {
-            decoded_records.push(header.decode(&mut decoder)?);
+        for record in records {
+            decoded_records.push(record.decode(&mut decoder));
         }
         let headers = Headers::new(decoded_records, &mut |e| Self::err(e))?;
         let SystemFile {
             dictionary, cases, ..
-        } = headers.decode(reader.cases(), decoder.encoding, |e| Self::err(e));
+        } = headers.decode(
+            reader.header().clone().decode(&mut decoder),
+            reader.cases(),
+            decoder.encoding,
+            |e| Self::err(e),
+        );
         let writer = match self.output {
             Some(path) => Box::new(File::create(path)?) as Box<dyn Write>,
             None => Box::new(stdout()),
@@ -237,10 +242,7 @@ fn dissect(
 
     match mode {
         Mode::Identify => {
-            let Record::Header(header) = reader.headers().next().unwrap()? else {
-                unreachable!()
-            };
-            match header.magic {
+            match reader.header().magic {
                 Magic::Sav => println!("SPSS System File"),
                 Magic::Zsav => println!("SPSS System File with Zlib compression"),
                 Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"),
@@ -248,8 +250,8 @@ fn dissect(
             return Ok(());
         }
         Mode::Raw => {
-            for header in reader.headers() {
-                let header = header?;
+            for record in reader.records() {
+                let header = record?;
                 println!("{:?}", header);
             }
             for (_index, case) in (0..max_cases).zip(reader.cases()) {
@@ -257,13 +259,13 @@ fn dissect(
             }
         }
         Mode::Decoded => {
-            let headers: Vec<Record> = reader.headers().collect::<Result<Vec<_>, _>>()?;
+            let records: Vec<Record> = reader.records().collect::<Result<Vec<_>, _>>()?;
             let encoding = match encoding {
                 Some(encoding) => encoding,
-                None => infer_encoding(&headers, &mut |e| eprintln!("{e}"))?,
+                None => infer_encoding(&records, &mut |e| eprintln!("{e}"))?,
             };
             let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
-            for header in headers {
+            for header in records {
                 let header = header.decode(&mut decoder);
                 println!("{:?}", header);
                 /*
@@ -280,22 +282,27 @@ fn dissect(
             }
         }
         Mode::Cooked => {
-            let headers: Vec<Record> = reader.headers().collect::<Result<Vec<_>, _>>()?;
+            let records: Vec<Record> = reader.records().collect::<Result<Vec<_>, _>>()?;
             let encoding = match encoding {
                 Some(encoding) => encoding,
-                None => infer_encoding(&headers, &mut |e| eprintln!("{e}"))?,
+                None => infer_encoding(&records, &mut |e| eprintln!("{e}"))?,
             };
             let mut decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
             let mut decoded_records = Vec::new();
-            for header in headers {
-                decoded_records.push(header.decode(&mut decoder)?);
+            for record in records {
+                decoded_records.push(record.decode(&mut decoder));
             }
             let headers = Headers::new(decoded_records, &mut |e| eprintln!("{e}"))?;
             let SystemFile {
                 dictionary,
                 metadata,
                 cases: _,
-            } = headers.decode(reader.cases(), encoding, |e| eprintln!("{e}"));
+            } = headers.decode(
+                reader.header().clone().decode(&mut decoder),
+                reader.cases(),
+                encoding,
+                |e| eprintln!("{e}"),
+            );
             println!("{dictionary:#?}");
             println!("{metadata:#?}");
         }
index ff78c14ea5204bce3a48d41a325daa80e07acf41..d53951141143205f340bd78b1a56772826fa7ba5 100644 (file)
@@ -335,7 +335,6 @@ pub struct SystemFile {
 
 #[derive(Clone, Debug)]
 pub struct Headers {
-    pub header: HeaderRecord<String>,
     pub variable: Vec<VariableRecord<String>>,
     pub value_label: Vec<ValueLabelRecord<RawDatum, String>>,
     pub document: Vec<DocumentRecord<String>>,
@@ -372,10 +371,9 @@ fn take_first<T>(
 
 impl Headers {
     pub fn new(
-        headers: Vec<raw::DecodedRecord>,
-        warn: &mut impl FnMut(Error),
+        records: Vec<raw::DecodedRecord>,
+        mut warn: impl FnMut(Error),
     ) -> Result<Headers, Error> {
-        let mut file_header = Vec::new();
         let mut variable = Vec::new();
         let mut value_label = Vec::new();
         let mut document = Vec::new();
@@ -398,11 +396,8 @@ impl Headers {
         let mut z_header = Vec::new();
         let mut z_trailer = Vec::new();
 
-        for header in headers {
-            match header {
-                DecodedRecord::Header(record) => {
-                    file_header.push(record);
-                }
+        for record in records {
+            match record {
                 DecodedRecord::Variable(record) => {
                     variable.push(record);
                 }
@@ -469,45 +464,41 @@ impl Headers {
             }
         }
 
-        let Some(file_header) = take_first(file_header, "file header", warn) else {
-            return Err(Error::MissingHeaderRecord);
-        };
-
         Ok(Headers {
-            header: file_header,
             variable,
             value_label,
             document,
-            integer_info: take_first(integer_info, "integer info", warn),
-            float_info: take_first(float_info, "float info", warn),
-            var_display: take_first(var_display, "variable display", warn),
+            integer_info: take_first(integer_info, "integer info", &mut warn),
+            float_info: take_first(float_info, "float info", &mut warn),
+            var_display: take_first(var_display, "variable display", &mut warn),
             multiple_response,
             long_string_value_labels,
             long_string_missing_values,
-            encoding: take_first(encoding, "encoding", warn),
-            number_of_cases: take_first(number_of_cases, "number of cases", warn),
+            encoding: take_first(encoding, "encoding", &mut warn),
+            number_of_cases: take_first(number_of_cases, "number of cases", &mut warn),
             variable_sets,
-            product_info: take_first(product_info, "product info", warn),
+            product_info: take_first(product_info, "product info", &mut warn),
             long_names,
             very_long_strings,
             file_attributes,
             variable_attributes,
             other_extension,
-            end_of_headers: take_first(end_of_headers, "end of headers", warn),
-            z_header: take_first(z_header, "z_header", warn),
-            z_trailer: take_first(z_trailer, "z_trailer", warn),
+            end_of_headers: take_first(end_of_headers, "end of headers", &mut warn),
+            z_header: take_first(z_header, "z_header", &mut warn),
+            z_trailer: take_first(z_trailer, "z_trailer", &mut warn),
         })
     }
 
     pub fn decode(
         mut self,
+        header: HeaderRecord<String>,
         mut cases: Cases,
         encoding: &'static Encoding,
         mut warn: impl FnMut(Error),
     ) -> SystemFile {
         let mut dictionary = Dictionary::new(encoding);
 
-        let file_label = fix_line_ends(self.header.file_label.trim_end_matches(' '));
+        let file_label = fix_line_ends(header.file_label.trim_end_matches(' '));
         if !file_label.is_empty() {
             dictionary.file_label = Some(file_label);
         }
@@ -531,7 +522,7 @@ impl Headers {
                 warn(Error::UnexpectedFloatFormat(floating_point_rep))
             }
 
-            let expected = match self.header.endian {
+            let expected = match header.endian {
                 Endian::Big => 1,
                 Endian::Little => 2,
             };
@@ -562,7 +553,7 @@ impl Headers {
             }
         }
 
-        if let Some(nominal_case_size) = self.header.nominal_case_size {
+        if let Some(nominal_case_size) = header.nominal_case_size {
             let n_vars = self.variable.len();
             if n_vars != nominal_case_size as usize
                 && self
@@ -674,7 +665,7 @@ impl Headers {
             value_index += n_values;
         }
 
-        if let Some(weight_index) = self.header.weight_index {
+        if let Some(weight_index) = header.weight_index {
             let index = weight_index as usize - 1;
             if index >= value_index {
                 warn(Error::WeightIndexOutOfRange {
@@ -723,7 +714,7 @@ impl Headers {
                 });
             }
 
-            let written_by_readstat = self.header.eye_catcher.contains("ReadStat");
+            let written_by_readstat = header.eye_catcher.contains("ReadStat");
             for dict_index in dict_indexes {
                 let variable = dictionary.variables.get_index_mut2(dict_index).unwrap();
                 let mut duplicates = Vec::new();
@@ -1020,7 +1011,7 @@ impl Headers {
             });
         }
 
-        let metadata = Metadata::decode(&self, warn);
+        let metadata = Metadata::decode(&header, &self, warn);
         if let Some(n_cases) = metadata.n_cases {
             cases = cases.with_expected_cases(n_cases);
         }
@@ -1116,8 +1107,12 @@ impl Metadata {
         (group, values)
     }
 
-    fn decode(headers: &Headers, mut warn: impl FnMut(Error)) -> Self {
-        let header = &headers.header;
+    fn decode(
+        header: &HeaderRecord<String>,
+        headers: &Headers,
+        mut warn: impl FnMut(Error),
+    ) -> Self {
+        let header = &header;
         let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %y")
             .unwrap_or_else(|_| {
                 warn(Error::InvalidCreationDate {
index 97fe776d614e41741378524302d19b6f1e97ff1e..572e6b94ee05f51a8fb97f7e2f8dac5e382d14e4 100644 (file)
@@ -498,11 +498,6 @@ impl From<IoError> for WarningDetails {
 #[allow(missing_docs)] // Don't warn for missing docs on tuple members.
 #[derive(Clone, Debug)]
 pub enum Record {
-    /// The file header.
-    ///
-    /// Every system file has exactly one header record, at its very beginning.
-    Header(HeaderRecord<RawString>),
-
     /// Variable record.
     ///
     /// Each numeric variable has one variable record.  Each string variable has
@@ -590,9 +585,6 @@ pub enum Record {
 /// or strings.
 #[derive(Clone, Debug)]
 pub enum DecodedRecord {
-    /// File header, with strings decoded.
-    Header(HeaderRecord<String>),
-
     /// Variable record, with strings decoded.
     Variable(VariableRecord<String>),
 
@@ -687,9 +679,8 @@ impl Record {
     }
 
     /// Decodes this record into a [DecodedRecord] using `decoder`.
-    pub fn decode(self, decoder: &mut Decoder) -> Result<DecodedRecord, Error> {
-        Ok(match self {
-            Record::Header(record) => DecodedRecord::Header(record.decode(decoder)),
+    pub fn decode(self, decoder: &mut Decoder) -> DecodedRecord {
+        match self {
             Record::Variable(record) => DecodedRecord::Variable(record.decode(decoder)),
             Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
             Record::Document(record) => DecodedRecord::Document(record.decode(decoder)),
@@ -721,7 +712,7 @@ impl Record {
             Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
             Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()),
             Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()),
-        })
+        }
     }
 }
 
@@ -739,7 +730,7 @@ impl Record {
 /// PSPP only supports ASCII-based encodings.
 pub fn infer_encoding(
     records: &[Record],
-    warn: &mut impl FnMut(Warning),
+    mut warn: impl FnMut(Warning),
 ) -> Result<&'static Encoding, Error> {
     // Get the character encoding from the first (and only) encoding record.
     let encoding = records
@@ -1157,7 +1148,6 @@ where
 }
 
 enum ReaderState {
-    Start,
     Headers,
     ZlibHeader,
     ZlibTrailer(ZHeader),
@@ -1206,14 +1196,19 @@ where
             warn: Box::new(warn),
             header,
             var_types: VarTypes::new(),
-            state: ReaderState::Start,
+            state: ReaderState::Headers,
             cases: None,
         })
     }
 
-    /// Returns a structure for reading the system file's header records.
-    pub fn headers<'b>(&'b mut self) -> HeaderReader<'a, 'b, R> {
-        HeaderReader(self)
+    /// Returns the header in this reader.
+    pub fn header(&self) -> &HeaderRecord<RawString> {
+        &self.header
+    }
+
+    /// Returns a structure for reading the system file's records.
+    pub fn records<'b>(&'b mut self) -> Records<'a, 'b, R> {
+        Records(self)
     }
 
     /// Returns a structure for reading the system file's cases.
@@ -1227,12 +1222,12 @@ where
     }
 }
 
-/// Reader for the raw header records in a system file.
-pub struct HeaderReader<'a, 'b, R>(&'b mut Reader<'a, R>)
+/// Reads raw records from a system file.
+pub struct Records<'a, 'b, R>(&'b mut Reader<'a, R>)
 where
     R: Read + Seek + 'static;
 
-impl<'a, 'b, R> HeaderReader<'a, 'b, R>
+impl<'a, 'b, R> Records<'a, 'b, R>
 where
     R: Read + Seek + 'static,
 {
@@ -1247,10 +1242,6 @@ where
 
     fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
         match self.0.state {
-            ReaderState::Start => {
-                self.0.state = ReaderState::Headers;
-                Some(Ok(Record::Header(self.0.header.clone())))
-            }
             ReaderState::Headers => {
                 let record = loop {
                     match Record::read(
@@ -1311,18 +1302,18 @@ where
     }
 }
 
-impl<'a, 'b, R> Iterator for HeaderReader<'a, 'b, R>
+impl<'a, 'b, R> Iterator for Records<'a, 'b, R>
 where
     R: Read + Seek + 'static,
 {
     type Item = Result<Record, Error>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        let retval = self._next();
-        if matches!(retval, Some(Err(_))) {
-            self.0.state = ReaderState::End;
-        }
-        retval
+        self._next().inspect(|retval| {
+            if retval.is_err() {
+                self.0.state = ReaderState::End;
+            }
+        })
     }
 }
 
index e0d0b90439df13d94583c369b18d0b0d3fec15a7..3c4e9e97d9eea6068a40823ffc3a9ba5a95746ee 100644 (file)
@@ -603,24 +603,26 @@ where
 {
     let mut warnings = Vec::new();
     let mut reader = Reader::new(sysfile, |warning| warnings.push(warning)).unwrap();
-    let output = match reader.headers().collect::<Result<Vec<_>, _>>() {
-        Ok(headers) => {
+    let output = match reader.records().collect::<Result<Vec<_>, _>>() {
+        Ok(records) => {
+            let header = reader.header().clone();
             let cases = reader.cases();
-            let encoding = infer_encoding(&headers, &mut |warning| warnings.push(warning)).unwrap();
+            let encoding = infer_encoding(&records, |warning| warnings.push(warning)).unwrap();
             let mut decoder = Decoder::new(encoding, |warning| warnings.push(warning));
-            let mut decoded_records = Vec::new();
-            for header in headers {
-                decoded_records.push(header.decode(&mut decoder).unwrap());
-            }
+            let header = header.decode(&mut decoder);
+            let decoded_records = records
+                .into_iter()
+                .map(|record| record.decode(&mut decoder))
+                .collect::<Vec<_>>();
             drop(decoder);
 
             let mut errors = Vec::new();
-            let headers = Headers::new(decoded_records, &mut |e| errors.push(e)).unwrap();
+            let headers = Headers::new(decoded_records, |e| errors.push(e)).unwrap();
             let SystemFile {
                 dictionary,
                 metadata,
                 cases,
-            } = headers.decode(cases, encoding, |e| errors.push(e));
+            } = headers.decode(header, cases, encoding, |e| errors.push(e));
             let (group, data) = metadata.to_pivot_rows();
             let metadata_table = PivotTable::new([(Axis3::Y, Dimension::new(group))]).with_data(
                 data.into_iter()