work
author Ben Pfaff <blp@cs.stanford.edu>
Wed, 26 Jul 2023 03:38:21 +0000 (20:38 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Wed, 26 Jul 2023 03:38:21 +0000 (20:38 -0700)
rust/src/lib.rs

index b51f1cf209d0408f955bcf6a81741ae04964d9bc..8e8ee6cab4fad89ec9e749960b40f8330cf0cdde 100644 (file)
@@ -27,6 +27,9 @@ pub enum Error {
     #[error("Invalid ZSAV compression code {0}")]
     InvalidZsavCompression(u32),
 
+    #[error("Variable record at offset {offset:#x} specifies width {width} not in valid range [-1,255).")]
+    BadVariableWidth { offset: u64, width: i32 },
+
     #[error("Misplaced type 4 record near offset {0:#x}.")]
     MisplacedType4Record(u64),
 
@@ -103,29 +106,41 @@ pub struct Header {
     /// Magic number.
     pub magic: Magic,
 
-    /// Endianness of the data in the file header.
-    pub endianness: Endian,
+    /// Eye-catcher string, product name, in the file's encoding.  Padded
+    /// on the right with spaces.
+    pub eye_catcher: [u8; 60],
 
-    /// 0-based variable index of the weight variable, or `None` if the file is
-    /// unweighted.
-    pub weight_index: Option<u32>,
+    /// Layout code, normally either 2 or 3.
+    pub layout_code: u32,
 
     /// Number of variable positions, or `None` if the value in the file is
     /// questionably trustworthy.
     pub nominal_case_size: Option<u32>,
 
+    /// Compression type, if any.
+    pub compression: Option<Compression>,
+
+    /// 0-based variable index of the weight variable, or `None` if the file is
+    /// unweighted.
+    pub weight_index: Option<u32>,
+
+    /// Claimed number of cases, if known.
+    pub n_cases: Option<u32>,
+
+    /// Compression bias, usually 100.0.
+    pub bias: f64,
+
     /// `dd mmm yy` in the file's encoding.
     pub creation_date: [u8; 9],
 
     /// `HH:MM:SS` in the file's encoding.
     pub creation_time: [u8; 8],
 
-    /// Eye-catcher string, then product name, in the file's encoding.  Padded
-    /// on the right with spaces.
-    pub eye_catcher: [u8; 60],
-
     /// File label, in the file's encoding.  Padded on the right with spaces.
     pub file_label: [u8; 64],
+
+    /// Endianness of the data in the file header.
+    pub endianness: Endian,
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, Hash)]
@@ -155,15 +170,30 @@ impl TryFrom<[u8; 4]> for Magic {
     }
 }
 
+enum VarType {
+    Number,
+    String,
+}
+
+impl VarType {
+    fn from_width(width: i32) -> VarType {
+        match width {
+            0 => VarType::Number,
+            _ => VarType::String,
+        }
+    }
+}
+
 pub struct Reader<R: Read> {
     r: BufReader<R>,
+    var_types: Vec<VarType>,
     state: ReaderState,
-    endianness: Option<Endian>,
 }
 
 enum ReaderState {
     Start,
-    Headers(Endian),
+    Headers(Endian, Option<Compression>),
+    Data(Endian),
     End,
 }
 
@@ -171,29 +201,37 @@ impl<R: Read + Seek> Reader<R> {
     pub fn new(r: R) -> Result<Reader<R>, Error> {
         Ok(Reader {
             r: BufReader::new(r),
+            var_types: Vec::new(),
             state: ReaderState::Start,
-            endianness: None,
         })
     }
     fn _next(&mut self) -> Result<Option<(Record, ReaderState)>, Error> {
         match self.state {
             ReaderState::Start => {
                 let header = read_header(&mut self.r)?;
-                let endianness = header.endianness;
-                Ok(Some((Record::Header(header), ReaderState::Headers(endianness))))
+                let next_state = ReaderState::Headers(header.endianness, header.compression);
+                Ok(Some((Record::Header(header), next_state)))
             }
-            ReaderState::Headers(e) => {
-                let rec_type: u32 = e.parse(read_bytes(&mut self.r)?);
+            ReaderState::Headers(endian, compression) => {
+                let rec_type: u32 = endian.parse(read_bytes(&mut self.r)?);
                 let record = match rec_type {
-                    2 => Record::Variable(read_variable_record(&mut self.r, e)?),
-                    3 => Record::ValueLabel(read_value_label_record(&mut self.r, e)?),
-                    4 => Record::VarIndexes(read_var_indexes_record(&mut self.r, e)?),
-                    6 => Record::Document(read_document_record(&mut self.r, e)?),
-                    7 => Record::Extension(read_extension_record(&mut self.r, e)?),
+                    2 => {
+                        let variable = read_variable_record(&mut self.r, endian)?;
+                        self.var_types.push(VarType::from_width(variable.width));
+                        Record::Variable(variable)
+                    }
+                    3 => Record::ValueLabel(read_value_label_record(&mut self.r, endian)?),
+                    4 => Record::VarIndexes(read_var_indexes_record(&mut self.r, endian)?),
+                    6 => Record::Document(read_document_record(&mut self.r, endian)?),
+                    7 => Record::Extension(read_extension_record(&mut self.r, endian)?),
                     999 => {
                         let _: [u8; 4] = read_bytes(&mut self.r)?;
-                        return Ok(Some((Record::EndOfHeaders, ReaderState::End)))
-                    },
+                        let next_state = match compression {
+                            None => ReaderState::Data(endian),
+                            _ => ReaderState::End,
+                        };
+                        return Ok(Some((Record::EndOfHeaders, next_state)));
+                    }
                     _ => {
                         return Err(Error::BadRecordType {
                             offset: self.r.stream_position()?,
@@ -201,7 +239,7 @@ impl<R: Read + Seek> Reader<R> {
                         })
                     }
                 };
-                Ok(Some((record, ReaderState::Headers(e))))
+                Ok(Some((record, ReaderState::Headers(endian, compression))))
             }
             ReaderState::End => Ok(None),
         }
@@ -239,6 +277,7 @@ fn read_header<R: Read>(r: &mut R) -> Result<Header, Error> {
     let endianness = Endian::identify_u32(2, layout_code)
         .or_else(|| Endian::identify_u32(2, layout_code))
         .ok_or_else(|| Error::NotASystemFile)?;
+    let layout_code = endianness.parse(layout_code);
 
     let nominal_case_size: u32 = endianness.parse(read_bytes(r)?);
     let nominal_case_size =
@@ -268,13 +307,17 @@ fn read_header<R: Read>(r: &mut R) -> Result<Header, Error> {
 
     Ok(Header {
         magic,
-        endianness,
-        weight_index,
+        layout_code,
         nominal_case_size,
+        compression,
+        weight_index,
+        n_cases,
+        bias,
         creation_date,
         creation_time,
         eye_catcher,
         file_label,
+        endianness,
     })
 }
 
@@ -615,25 +658,6 @@ fn read_zheader<R: Read + Seek>(r: &mut BufReader<R>, e: Endian) -> Result<ZHead
     let ztrailer_offset: u64 = e.parse(read_bytes(r)?);
     let ztrailer_len: u64 = e.parse(read_bytes(r)?);
 
-    if zheader_offset != offset {
-        return Err(Error::BadZlibHeaderOffset {
-            offset,
-            zheader_offset,
-        });
-    }
-    if ztrailer_offset < offset {
-        return Err(Error::BadZlibTrailerOffset {
-            offset,
-            ztrailer_offset,
-        });
-    }
-    if ztrailer_len < 24 || ztrailer_len % 24 != 0 {
-        return Err(Error::BadZlibTrailerLen {
-            offset,
-            ztrailer_len,
-        });
-    }
-
     Ok(ZHeader {
         offset,
         zheader_offset,