work (zlib works?)
author Ben Pfaff <blp@cs.stanford.edu>
Fri, 28 Jul 2023 20:19:45 +0000 (13:19 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Fri, 28 Jul 2023 20:19:45 +0000 (13:19 -0700)
rust/src/lib.rs

index d237f81b801aca3b91b4dce45e9da5ebd2d56015..963e4ef44850db5c57614c5ef20e82e51ea08b87 100644 (file)
@@ -1,6 +1,6 @@
 #![allow(unused_variables)]
 use endian::{Endian, Parse, ToBytes};
-//use flate2::bufread::ZlibDecoder;
+use flate2::bufread::ZlibDecoder;
 use num::Integer;
 use num_derive::FromPrimitive;
 use std::{
@@ -99,7 +99,12 @@ pub enum Error {
     CompressedStringExpected { offset: u64, case_ofs: u64 },
 
     #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
-    BadZlibTrailerNBlocks { offset: u64, n_blocks: u32, expected_n_blocks: u64, ztrailer_len: u64 }
+    BadZlibTrailerNBlocks {
+        offset: u64,
+        n_blocks: u32,
+        expected_n_blocks: u64,
+        ztrailer_len: u64,
+    },
 }
 
 #[derive(Error, Debug)]
@@ -213,19 +218,292 @@ impl VarType {
     }
 }
 
-pub struct Reader<R: Read> {
+pub struct Reader<R: Read + Seek> {
     r: BufReader<R>,
     var_types: Vec<VarType>,
     state: ReaderState,
 }
 
+/// One step of the record reader, expressed as a state machine.
+///
+/// A state is consumed by [`State::read`], which hands back the state to use
+/// for the following call together with the record it produced.
+trait State {
+    /// Consumes this state and tries to read one record.
+    ///
+    /// Returns `Ok(Some((record, next)))` with the record and the state to
+    /// call next, `Ok(None)` when the implementation has nothing more to
+    /// read, or an [`Error`] on failure.
+    fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
+}
+
+/// State whose `read` reads the file header.
+struct Start<R: Read + Seek> {
+    /// Buffered input the header is read from.
+    r: BufReader<R>,
+}
+
+impl<R: Read + Seek + 'static> State for Start<R> {
+    /// Reads the file header with `read_header` and returns it as
+    /// `Record::Header`.
+    ///
+    /// NOTE(review): the next state returned is `self`, so a second call
+    /// reads another header instead of moving on to `Headers`. This looks
+    /// unfinished -- confirm the intended transition.
+    fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+        let header = read_header(&mut self.r)?;
+        Ok(Some((Record::Header(header), self)))
+    }
+}
+
+/// State that reads typed records until the record type 999 terminator.
+struct Headers<R: Read + Seek> {
+    /// Buffered input the records are read from.
+    reader: BufReader<R>,
+    /// Byte order used to parse the record type and passed to the record
+    /// readers.
+    endian: Endian,
+    /// Selects which data state follows record type 999; `None` selects the
+    /// plain `Data` state.
+    compression: Option<Compression>,
+    /// One entry per type-2 (variable) record read so far, in file order.
+    var_types: Vec<VarType>,
+}
+
+impl<R: Read + Seek + 'static> State for Headers<R> {
+    /// Reads the next record, dispatching on its 32-bit record type.
+    ///
+    /// Types 2, 3, 4, 6 and 7 yield the matching `Record` and keep this
+    /// state; a type-2 record additionally appends its `VarType` to
+    /// `var_types`. Type 999 skips four more bytes, yields
+    /// `Record::EndOfHeaders`, and hands over to the data state selected by
+    /// `compression`. Any other type is reported as `Error::BadRecordType`.
+    fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+        let endian = self.endian;
+        let rec_type: u32 = endian.parse(read_bytes(&mut self.reader)?);
+        let record = match rec_type {
+            2 => {
+                let variable = read_variable_record(&mut self.reader, endian)?;
+                self.var_types.push(VarType::from_width(variable.width));
+                Record::Variable(variable)
+            }
+            3 => Record::ValueLabel(read_value_label_record(&mut self.reader, endian)?),
+            4 => Record::VarIndexes(read_var_indexes_record(&mut self.reader, endian)?),
+            6 => Record::Document(read_document_record(&mut self.reader, endian)?),
+            7 => Record::Extension(read_extension_record(&mut self.reader, endian)?),
+            999 => {
+                // The terminator carries four more bytes that are not used.
+                let _filler: [u8; 4] = read_bytes(&mut self.reader)?;
+
+                // This state is finished: take it apart and move its pieces
+                // into whichever data state the compression setting selects.
+                let Headers {
+                    reader,
+                    endian,
+                    compression,
+                    var_types,
+                } = *self;
+                let data_state: Box<dyn State> = match compression {
+                    None => Box::new(Data {
+                        reader,
+                        endian,
+                        var_types,
+                    }),
+                    Some(Compression::Simple) => Box::new(CompressedData {
+                        reader,
+                        endian,
+                        var_types,
+                        codes: VecDeque::new(),
+                    }),
+                    Some(Compression::ZLib) => Box::new(ZlibData {
+                        reader: ZlibDecodeMultiple::new(reader),
+                        endian,
+                        var_types,
+                        codes: VecDeque::new(),
+                    }),
+                };
+                return Ok(Some((Record::EndOfHeaders, data_state)));
+            }
+            unknown => {
+                let offset = self.reader.stream_position()?;
+                return Err(Error::BadRecordType {
+                    offset,
+                    rec_type: unknown,
+                });
+            }
+        };
+        Ok(Some((record, self)))
+    }
+}
+
+/// State that reads uncompressed cases.
+struct Data<R: Read + Seek> {
+    /// Buffered input the case data is read from.
+    reader: BufReader<R>,
+    /// Byte order passed to `Value::from_raw` for each value.
+    endian: Endian,
+    /// Type of each value in a case; its length is the number of values read
+    /// per case.
+    var_types: Vec<VarType>,
+}
+
+impl<R: Read + Seek + 'static> State for Data<R> {
+    /// Reads one case: one raw value per entry in `var_types`.
+    ///
+    /// Returns `Ok(None)` when the input ends before the first value of a
+    /// case, and `Error::EofInCase` when it ends part-way through one.
+    fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+        let case_start = self.reader.stream_position()?;
+        let n_values = self.var_types.len();
+        let mut values = Vec::with_capacity(n_values);
+        for (index, var_type) in self.var_types.iter().copied().enumerate() {
+            match try_read_bytes(&mut self.reader)? {
+                Some(raw) => values.push(Value::from_raw(var_type, raw, self.endian)),
+                // Running out of input exactly between cases is the normal
+                // end of the data.
+                None if index == 0 => return Ok(None),
+                None => {
+                    let offset = self.reader.stream_position()?;
+                    return Err(Error::EofInCase {
+                        offset,
+                        case_ofs: offset - case_start,
+                        case_len: n_values * 8,
+                    });
+                }
+            }
+        }
+        Ok(Some((Record::Case(values), self)))
+    }
+}
+
+/// State that reads cases through `read_compressed_data` directly from the
+/// buffered input.
+struct CompressedData<R: Read + Seek> {
+    /// Buffered input the compressed case data is read from.
+    reader: BufReader<R>,
+    /// Byte order passed to `read_compressed_data`.
+    endian: Endian,
+    /// Type of each value in a case.
+    var_types: Vec<VarType>,
+    /// Command codes already read from `reader` but not yet consumed; kept
+    /// across calls because codes are fetched eight at a time.
+    codes: VecDeque<u8>,
+}
+
+/// Reads one case from command-code compressed data.
+///
+/// `codes` holds command codes that were fetched from `reader` but not yet
+/// consumed; when it runs dry, eight more are read. Each value in the case is
+/// produced by the next non-zero code:
+///
+/// * `0`: ignored.
+/// * `1..=251`: the value `code - bias`.
+/// * `252`: end of data.
+/// * `253`: the value is the next eight raw bytes from `reader`.
+/// * `254`: a string of eight spaces (string variables only).
+/// * `255`: a missing number, `Value::Number(None)` (numeric variables only).
+///
+/// Returns `Ok(None)` when the input ends, or code 252 appears, before the
+/// first value of a case.
+///
+/// # Errors
+///
+/// * `Error::EofInCompressedCase` if the input ends part-way through a case.
+/// * `Error::PartialCompressedCase` if code 252 appears part-way through a
+///   case.
+/// * `Error::CompressedStringExpected` / `Error::CompressedNumberExpected` if
+///   code 254 / 255 is applied to a variable of the other type.
+/// * Any I/O error from `reader`.
+fn read_compressed_data<R>(
+    reader: &mut R,
+    endian: Endian,
+    var_types: &[VarType],
+    codes: &mut VecDeque<u8>,
+) -> Result<Option<Record>, Error>
+where
+    R: Read + Seek,
+{
+    let case_start = reader.stream_position()?;
+    let mut values = Vec::with_capacity(var_types.len());
+    let bias = 100.0; // XXX
+    for (i, &var_type) in var_types.iter().enumerate() {
+        let value = loop {
+            let Some(code) = codes.pop_front() else {
+                // Out of buffered codes: fetch the next group of eight.
+                let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
+                    if i == 0 {
+                        return Ok(None);
+                    } else {
+                        let offset = reader.stream_position()?;
+                        return Err(Error::EofInCompressedCase {
+                            offset,
+                            case_ofs: offset - case_start,
+                        });
+                    }
+                };
+                codes.extend(new_codes);
+                continue;
+            };
+            match code {
+                0 => (),
+                1..=251 => match var_type {
+                    VarType::Number => break Value::Number(Some(code as f64 - bias)),
+                    VarType::String => break Value::String(endian.to_bytes(code as f64 - bias)),
+                },
+                252 => {
+                    if i == 0 {
+                        return Ok(None);
+                    } else {
+                        let offset = reader.stream_position()?;
+                        return Err(Error::PartialCompressedCase {
+                            offset,
+                            case_ofs: offset - case_start,
+                        });
+                    }
+                }
+                253 => break Value::from_raw(var_type, read_bytes(reader)?, endian),
+                254 => match var_type {
+                    VarType::String => break Value::String(*b"        "), // XXX EBCDIC
+                    VarType::Number => {
+                        return Err(Error::CompressedStringExpected {
+                            offset: case_start,
+                            case_ofs: reader.stream_position()? - case_start,
+                        })
+                    }
+                },
+                255 => match var_type {
+                    VarType::Number => break Value::Number(None),
+                    VarType::String => {
+                        return Err(Error::CompressedNumberExpected {
+                            offset: case_start,
+                            case_ofs: reader.stream_position()? - case_start,
+                        })
+                    }
+                },
+            }
+        };
+        values.push(value);
+    }
+    Ok(Some(Record::Case(values)))
+}
+
+impl<R: Read + Seek + 'static> State for CompressedData<R> {
+    /// Reads one case with `read_compressed_data` and stays in this state.
+    ///
+    /// Returns `Ok(None)` when `read_compressed_data` reports the end of the
+    /// data.
+    fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+        let case = read_compressed_data(
+            &mut self.reader,
+            self.endian,
+            &self.var_types,
+            &mut self.codes,
+        )?;
+        Ok(case.map(|record| (record, self as Box<dyn State>)))
+    }
+}
+
+/// State that reads cases through `read_compressed_data` from
+/// zlib-decompressed input.
+struct ZlibData<R: Read + Seek> {
+    /// Decompressing reader wrapped around the buffered input.
+    reader: ZlibDecodeMultiple<R>,
+    /// Byte order passed to `read_compressed_data`.
+    endian: Endian,
+    /// Type of each value in a case.
+    var_types: Vec<VarType>,
+    /// Command codes already read from `reader` but not yet consumed.
+    codes: VecDeque<u8>,
+}
+
+impl<R: Read + Seek + 'static> State for ZlibData<R> {
+    /// Reads one case with `read_compressed_data`, pulling its bytes through
+    /// the zlib decoder, and stays in this state.
+    ///
+    /// Returns `Ok(None)` when `read_compressed_data` reports the end of the
+    /// data.
+    fn read(mut self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error> {
+        let outcome = read_compressed_data(
+            &mut self.reader,
+            self.endian,
+            &self.var_types,
+            &mut self.codes,
+        )?;
+        let Some(record) = outcome else {
+            return Ok(None);
+        };
+        Ok(Some((record, self)))
+    }
+}
+
+/// Reader that presents consecutive zlib streams in one input as a single
+/// stream of decompressed bytes.
+struct ZlibDecodeMultiple<R>
+where
+    R: Read + Seek,
+{
+    /// Decoder for the zlib stream currently being read. Held in an `Option`
+    /// so `read` can take the decoder out by value, recover the input it
+    /// wraps, and build the next decoder around that same input.
+    reader: Option<ZlibDecoder<BufReader<R>>>,
+}
+
+impl<R> ZlibDecodeMultiple<R>
+where
+    R: Read + Seek,
+{
+    /// Creates a reader that starts decoding a zlib stream at `reader`'s
+    /// current position.
+    fn new(reader: BufReader<R>) -> ZlibDecodeMultiple<R> {
+        let first_decoder = ZlibDecoder::new(reader);
+        Self {
+            reader: Some(first_decoder),
+        }
+    }
+}
+
+impl<R> Read for ZlibDecodeMultiple<R>
+where
+    R: Read + Seek,
+{
+    /// Reads decompressed bytes into `buf`.
+    ///
+    /// When the current zlib stream yields no more bytes, a new decoder is
+    /// started on whatever input follows it and the read is retried.
+    ///
+    /// Returns `Ok(0)` when `buf` is empty or when the underlying input is
+    /// exhausted; without these checks the retry loop would never terminate.
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
+        // With an empty destination every decoder reports 0 bytes, which is
+        // indistinguishable from "stream finished" below.
+        if buf.is_empty() {
+            return Ok(0);
+        }
+        loop {
+            let decoder = self
+                .reader
+                .as_mut()
+                .expect("decoder is always present between calls");
+            let n = decoder.read(buf)?;
+            if n > 0 {
+                return Ok(n);
+            }
+
+            // The current stream is finished. Recover the input and check
+            // whether anything is left before starting another decoder.
+            let mut inner = self
+                .reader
+                .take()
+                .expect("decoder is always present between calls")
+                .into_inner();
+            let exhausted = std::io::BufRead::fill_buf(&mut inner).map(|rest| rest.is_empty());
+            // Put a decoder back before propagating any error so that `self`
+            // is never left without one.
+            self.reader = Some(ZlibDecoder::new(inner));
+            if exhausted? {
+                return Ok(0);
+            }
+        }
+    }
+}
+
+impl<R> Seek for ZlibDecodeMultiple<R>
+where
+    R: Read + Seek,
+{
+    /// Reports the current position; does not support moving it.
+    ///
+    /// `SeekFrom::Current(0)`, which is what `stream_position()` issues, is
+    /// answered with the position of the underlying input, i.e. an offset
+    /// into the compressed data rather than into the decompressed bytes.
+    /// `read_compressed_data` relies on this to compute error offsets.
+    ///
+    /// Any other seek returns an `Unsupported` I/O error, because moving the
+    /// underlying input would leave the active decoder out of step with it.
+    fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
+        match pos {
+            SeekFrom::Current(0) => self
+                .reader
+                .as_mut()
+                .expect("decoder is always present between calls")
+                .get_mut()
+                // Unlike `seek`, this does not discard the read buffer.
+                .stream_position(),
+            _ => Err(IoError::new(
+                std::io::ErrorKind::Unsupported,
+                "cannot seek within zlib-compressed data",
+            )),
+        }
+    }
+}
+
+/*
+impl<R> BufRead for ZlibDecodeMultiple<R>
+where
+    R: Read + Seek,
+{
+    fn fill_buf(&mut self) -> Result<&[u8], IoError> {
+        self.reader.as_mut().unwrap().fill_buf()
+    }
+    fn consume(&mut self, amt: usize) {
+        self.reader.as_mut().unwrap().consume(amt)
+    }
+}*/
+
 enum ReaderState {
     Start,
     Headers(Endian, Option<Compression>),
     Data(Endian),
     CompressedData(Endian, VecDeque<u8>),
     ZHeader(Endian),
-    ZTrailer { endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64 },
+    ZTrailer {
+        endian: Endian,
+        ztrailer_ofs: u64,
+        ztrailer_len: u64,
+    },
     //ZData,
     End,
 }
@@ -304,7 +582,11 @@ impl<R: Read + Seek> Reader<R> {
                             return Ok(None);
                         } else {
                             let offset = self.r.stream_position()?;
-                            return Err(Error::EofInCase { offset, case_ofs: offset - case_start, case_len: self.var_types.len() * 8});
+                            return Err(Error::EofInCase {
+                                offset,
+                                case_ofs: offset - case_start,
+                                case_len: self.var_types.len() * 8,
+                            });
                         }
                     };
                     values.push(Value::from_raw(var_type, raw, endian));
@@ -318,12 +600,16 @@ impl<R: Read + Seek> Reader<R> {
                 for (i, &var_type) in self.var_types.iter().enumerate() {
                     let value = loop {
                         let Some(code) = codes.pop_front() else {
-                            let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.r)? else {
+                            let Some(new_codes): Option<[u8; 8]> = try_read_bytes(&mut self.r)?
+                            else {
                                 if i == 0 {
                                     return Ok(None);
                                 } else {
                                     let offset = self.r.stream_position()?;
-                                    return Err(Error::EofInCompressedCase { offset, case_ofs: offset - case_start});
+                                    return Err(Error::EofInCompressedCase {
+                                        offset,
+                                        case_ofs: offset - case_start,
+                                    });
                                 }
                             };
                             codes.extend(new_codes.into_iter());
@@ -377,23 +663,29 @@ impl<R: Read + Seek> Reader<R> {
             }
             ReaderState::ZHeader(endian) => {
                 let zheader = read_zheader(&mut self.r, endian)?;
-                self.state = ReaderState::ZTrailer { endian, ztrailer_ofs: zheader.ztrailer_offset, ztrailer_len: zheader.ztrailer_len};
+                self.state = ReaderState::ZTrailer {
+                    endian,
+                    ztrailer_ofs: zheader.ztrailer_offset,
+                    ztrailer_len: zheader.ztrailer_len,
+                };
                 Ok(Some(Record::ZHeader(zheader)))
             }
-            ReaderState::ZTrailer { endian, ztrailer_ofs, ztrailer_len } => {
+            ReaderState::ZTrailer {
+                endian,
+                ztrailer_ofs,
+                ztrailer_len,
+            } => {
                 //self.state = ReaderState::ZData;
                 match read_ztrailer(&mut self.r, endian, ztrailer_ofs, ztrailer_len)? {
-                    Some(ztrailer) => {
-                        Ok(Some(Record::ZTrailer(ztrailer)))
-                    },
-                    None => self._next()
+                    Some(ztrailer) => Ok(Some(Record::ZTrailer(ztrailer))),
+                    None => self._next(),
                 }
             }
-/*
-            ReaderState::ZData(zlib_decoder) => {
-                let zlib_decoder = zlib_decoder.unwrap_or_else(
-            },
-*/
+            /*
+                        ReaderState::ZData(zlib_decoder) => {
+                            let zlib_decoder = zlib_decoder.unwrap_or_else(
+                        },
+            */
             ReaderState::End => Ok(None),
         }
     }
@@ -850,10 +1142,15 @@ pub struct ZBlock {
     pub compressed_size: u32,
 }
 
-fn read_ztrailer<R: Read + Seek>(r: &mut BufReader<R>, endian: Endian, ztrailer_ofs: u64, ztrailer_len: u64) -> Result<Option<ZTrailer>, Error> {
+fn read_ztrailer<R: Read + Seek>(
+    r: &mut BufReader<R>,
+    endian: Endian,
+    ztrailer_ofs: u64,
+    ztrailer_len: u64,
+) -> Result<Option<ZTrailer>, Error> {
     let start_offset = r.stream_position()?;
     if r.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
-        return Ok(None)
+        return Ok(None);
     }
     let int_bias = endian.parse(read_bytes(r)?);
     let zero = endian.parse(read_bytes(r)?);
@@ -861,7 +1158,12 @@ fn read_ztrailer<R: Read + Seek>(r: &mut BufReader<R>, endian: Endian, ztrailer_
     let n_blocks: u32 = endian.parse(read_bytes(r)?);
     let expected_n_blocks = (ztrailer_len - 24) / 24;
     if n_blocks as u64 != expected_n_blocks {
-        return Err(Error::BadZlibTrailerNBlocks { offset: ztrailer_ofs, n_blocks, expected_n_blocks, ztrailer_len })
+        return Err(Error::BadZlibTrailerNBlocks {
+            offset: ztrailer_ofs,
+            n_blocks,
+            expected_n_blocks,
+            ztrailer_len,
+        });
     }
     let mut blocks = Vec::with_capacity(n_blocks as usize);
     for _ in 0..n_blocks {
@@ -869,10 +1171,21 @@ fn read_ztrailer<R: Read + Seek>(r: &mut BufReader<R>, endian: Endian, ztrailer_
         let compressed_ofs = endian.parse(read_bytes(r)?);
         let uncompressed_size = endian.parse(read_bytes(r)?);
         let compressed_size = endian.parse(read_bytes(r)?);
-        blocks.push(ZBlock { uncompressed_ofs, compressed_ofs, uncompressed_size, compressed_size });
+        blocks.push(ZBlock {
+            uncompressed_ofs,
+            compressed_ofs,
+            uncompressed_size,
+            compressed_size,
+        });
     }
     r.seek(SeekFrom::Start(start_offset))?;
-    Ok(Some(ZTrailer { offset: ztrailer_ofs, int_bias, zero, block_size, blocks }))
+    Ok(Some(ZTrailer {
+        offset: ztrailer_ofs,
+        int_bias,
+        zero,
+        block_size,
+        blocks,
+    }))
 }
 
 fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {