work on testing
author Ben Pfaff <blp@cs.stanford.edu>
Sat, 29 Jul 2023 16:17:48 +0000 (09:17 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sat, 29 Jul 2023 16:17:48 +0000 (09:17 -0700)
rust/Cargo.lock
rust/Cargo.toml
rust/src/endian.rs
rust/src/lib.rs
rust/src/sack.rs [new file with mode: 0644]

index 950e262847199a3c314ecf69ce1b28985e576e11..c37218a301957c392e5b64a9a70f06eca5935568 100644 (file)
@@ -126,6 +126,12 @@ dependencies = [
  "miniz_oxide",
 ]
 
+[[package]]
+name = "float_next_after"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
+
 [[package]]
 name = "heck"
 version = "0.4.1"
@@ -339,6 +345,7 @@ dependencies = [
  "anyhow",
  "clap",
  "flate2",
+ "float_next_after",
  "hexplay",
  "num",
  "num-derive",
index a58a5fafa671aee11a8cabf80068428f192a0d1d..0990ad1be38e6b4476ee79198f2ca9565b87222c 100644 (file)
@@ -8,6 +8,7 @@ authors = [ "Ben Pfaff", "John Darrington" ]
 anyhow = "1.0.69"
 clap = { version = "4.1.7", features = ["derive"] }
 flate2 = "1.0.26"
+float_next_after = "1.0.0"
 hexplay = "0.2.1"
 num = "0.4.0"
 num-derive = "0.4.0"
index 6bd25ab95ac0408793983a34ed18ca8ad68f0db6..dd562e60eff7c9cf8500879f1064c41618ba5ae7 100644 (file)
@@ -1,9 +1,9 @@
-#[derive(Copy, Clone, Debug)]
 /// The endianness for integer and floating-point numbers in SPSS system files.
 ///
 /// SPSS system files can declare IBM 370 and DEC VAX floating-point
 /// representations, but no file that uses either of these has ever been found
 /// in the wild, so this code does not handle them.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum Endian {
     /// Big-endian: MSB at lowest address.
     Big,
index 49bc74ef84e2430245b16d6b5cb9b8704d267755..8bf72fe02c064c8dcb5f30ff44c8cd6208ec4ba2 100644 (file)
@@ -1,4 +1,3 @@
-#![allow(unused_variables)]
 use endian::{Endian, Parse, ToBytes};
 use flate2::read::ZlibDecoder;
 use num::Integer;
@@ -11,6 +10,7 @@ use std::{
 use thiserror::Error;
 
 pub mod endian;
+pub mod sack;
 
 #[derive(Error, Debug)]
 pub enum Error {
@@ -220,6 +220,7 @@ impl VarType {
 }
 
 trait State {
+    #[allow(clippy::type_complexity)] // the nested return type of `read` trips this lint
     fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
 }
 
@@ -463,7 +464,7 @@ where
     R: Read + Seek,
 {
     fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
-        unimplemented!();
+        self.reader.as_mut().unwrap().get_mut().seek(pos)
     }
 }
 
@@ -504,10 +505,10 @@ impl Iterator for Reader {
         match self.state.take()?.read() {
             Ok(Some((record, next_state))) => {
                 self.state = Some(next_state);
-                return Some(Ok(record));
+                Some(Ok(record))
             }
-            Ok(None) => return None,
-            Err(error) => return Some(Err(error)),
+            Ok(None) => None,
+            Err(error) => Some(Err(error)),
         }
     }
 }
diff --git a/rust/src/sack.rs b/rust/src/sack.rs
new file mode 100644 (file)
index 0000000..6be81d8
--- /dev/null
@@ -0,0 +1,168 @@
+use anyhow::{anyhow, Result};
+use float_next_after::NextAfter;
+use std::{iter::Peekable, str::Chars};
+
+use crate::endian::Endian;
+
+pub fn sack(input: &str, endian: Endian) -> Result<Vec<u8>> {
+    let lexer = Lexer::new(input, endian)?;
+    //let mut output = Vec::new();
+    Ok(Vec::new())
+}
+
+enum Token {
+    Integer(i64),
+    Float(f64),
+    PcSysmis,
+    String(String),
+    Semicolon,
+    Asterisk,
+    LParen,
+    RParen,
+    I8,
+    I16,
+    I64,
+    S(usize),
+    Count,
+    Count8,
+    Hex,
+    Label(String),
+    At(String),
+    Minus,
+    Plus,
+}
+
+struct Lexer<'a> {
+    iter: Peekable<Chars<'a>>,
+    token: Option<Token>,
+    line_number: usize,
+    endian: Endian,
+}
+
+impl<'a> Lexer<'a> {
+    fn new(input: &'a str, endian: Endian) -> Result<Lexer<'a>> {
+        let mut lexer = Lexer {
+            iter: input.chars().peekable(),
+            token: None,
+            line_number: 1,
+            endian,
+        };
+        lexer.next()?;
+        Ok(lexer)
+    }
+    fn get(&'a mut self) -> Result<Option<&'a Token>> {
+        if self.token.is_none() {
+            Err(anyhow!("unexpected end of input"))
+        } else {
+            self.token = self.next()?;
+            Ok((&self.token).into())
+        }
+    }
+
+    fn next(&mut self) -> Result<Option<Token>> {
+        // Get the first character of the token, skipping past white space and
+        // comments.
+        let c = loop {
+            let Some(c) = self.iter.next() else {
+                return Ok(None);
+            };
+            let c = if c == '#' {
+                loop {
+                    match self.iter.next() {
+                        None => return Ok(None),
+                        Some('\n') => break,
+                        _ => (),
+                    }
+                }
+                '\n'
+            } else {
+                c
+            };
+            if c == '\n' {
+                self.line_number += 1
+            } else if !c.is_whitespace() && c != '<' && c != '>' {
+                break c;
+            }
+        };
+
+        let token = match c {
+            c if c.is_ascii_digit() || c == '-' => {
+                let mut s = String::from(c);
+                while let Some(c) = self
+                    .iter
+                    .next_if(|&c| c.is_ascii_digit() || c.is_alphabetic() || c == '.')
+                {
+                    s.push(c);
+                }
+
+                if s == "-" {
+                    Token::Minus
+                } else if !s.contains('.') {
+                    Token::Integer(
+                        s.parse()
+                            .map_err(|msg| anyhow!("bad integer literal '{s}' ({msg})"))?,
+                    )
+                } else {
+                    Token::Float(
+                        s.parse()
+                            .map_err(|msg| anyhow!("bad float literal '{s}' ({msg})"))?,
+                    )
+                }
+            }
+            '"' => {
+                let mut s = String::from(c);
+                loop {
+                    match self.iter.next() {
+                        None => return Err(anyhow!("end-of-file inside string")),
+                        Some('\n') => return Err(anyhow!("new-line inside string")),
+                        Some('"') => break,
+                        Some(c) => s.push(c),
+                    }
+                }
+                Token::String(s)
+            }
+            ';' => Token::Semicolon,
+            '*' => Token::Asterisk,
+            '+' => Token::Plus,
+            '(' => Token::LParen,
+            ')' => Token::RParen,
+            c if c.is_alphabetic() || c == '@' || c == '_' => {
+                let mut s = String::from(c);
+                while let Some(c) = self
+                    .iter
+                    .next_if(|&c| c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '_')
+                {
+                    s.push(c);
+                }
+                if self.iter.next_if_eq(&':').is_some() {
+                    Token::Label(s)
+                } else if s.starts_with('@') {
+                    Token::At(s)
+                } else if let Some(count) = s.strip_prefix('s') {
+                    Token::S(
+                        count
+                            .parse()
+                            .map_err(|msg| anyhow!("bad counted string '{s}' ({msg})"))?,
+                    )
+                } else {
+                    match &s[..] {
+                        "i8" => Token::I8,
+                        "i16" => Token::I16,
+                        "i64" => Token::I64,
+                        "SYSMIS" => Token::Float(-f64::MAX),
+                        "PCSYSMIS" => Token::PcSysmis,
+                        "LOWEST" => Token::Float((-f64::MAX).next_after(0.0)),
+                        "HIGHEST" => Token::Float(f64::MAX),
+                        "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
+                        "COUNT" => Token::Count,
+                        "COUNT8" => Token::Count8,
+                        "hex" => Token::Hex,
+                        _ => return Err(anyhow!("invalid token '{s}'")),
+                    }
+                }
+            }
+            _ => return Err(anyhow!("invalid input byte '{c}'")),
+        };
+        Ok(Some(token))
+    }
+}