From b8f1dedca099b32b4b4b4dda5f265cc1e8db9587 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Sat, 29 Jul 2023 09:17:48 -0700
Subject: [PATCH 1/1] work on testing

---
Review notes (this section is ignored when the patch is applied):

* The copy of this patch under review had lost its line breaks and most text
  in angle brackets (the author's e-mail address, generic type arguments).
  Line breaks and indentation are restored below from the hunk line counts.
  Generic arguments are restored where the surrounding code determines them.
  Three are best guesses that must be confirmed against the tree before
  applying: `Vec<u8>` in sack(), `Record` in State::read(), and
  `Result<u64, IoError>` in seek().  The hunk heading
  "impl Iterator for Reader {" may also have lost generic parameters.
* rust/src/sack.rs differs from the original commit:
  - Lexer::new() now stores the first token instead of discarding it.
    Before, the first token was lost and get() failed on every input.
  - Lexer::get() takes `&mut self` rather than `&'a mut self`, which
    borrowed the lexer for the whole lifetime of the input and so allowed
    only a single call.
  - Token::String no longer starts with the opening double quote.
  - The unused `lexer` binding in sack() is renamed `_lexer`, because this
    patch also removes `#![allow(unused_variables)]` from lib.rs.
  - Documentation comments added; Token derives Clone, Debug, PartialEq.
* The commit ID on the first line and the blob IDs on the `index` line for
  rust/src/sack.rs still describe the original commit.

 rust/Cargo.lock    |   7 ++
 rust/Cargo.toml    |   1 +
 rust/src/endian.rs |   2 +-
 rust/src/lib.rs    |  11 +--
 rust/src/sack.rs   | 210 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 225 insertions(+), 6 deletions(-)
 create mode 100644 rust/src/sack.rs

diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index 950e262847..c37218a301 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -126,6 +126,12 @@ dependencies = [
  "miniz_oxide",
 ]
 
+[[package]]
+name = "float_next_after"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
+
 [[package]]
 name = "heck"
 version = "0.4.1"
@@ -339,6 +345,7 @@ dependencies = [
  "anyhow",
  "clap",
  "flate2",
+ "float_next_after",
  "hexplay",
  "num",
  "num-derive",
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index a58a5fafa6..0990ad1be3 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -8,6 +8,7 @@ authors = [ "Ben Pfaff", "John Darrington" ]
 anyhow = "1.0.69"
 clap = { version = "4.1.7", features = ["derive"] }
 flate2 = "1.0.26"
+float_next_after = "1.0.0"
 hexplay = "0.2.1"
 num = "0.4.0"
 num-derive = "0.4.0"
diff --git a/rust/src/endian.rs b/rust/src/endian.rs
index 6bd25ab95a..dd562e60ef 100644
--- a/rust/src/endian.rs
+++ b/rust/src/endian.rs
@@ -1,9 +1,9 @@
-#[derive(Copy, Clone, Debug)]
 /// The endianness for integer and floating-point numbers in SPSS system files.
 ///
 /// SPSS system files can declare IBM 370 and DEC VAX floating-point
 /// representations, but no file that uses either of these has ever been found
 /// in the wild, so this code does not handle them.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum Endian {
     /// Big-endian: MSB at lowest address.
     Big,
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 49bc74ef84..8bf72fe02c 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -1,4 +1,3 @@
-#![allow(unused_variables)]
 use endian::{Endian, Parse, ToBytes};
 use flate2::read::ZlibDecoder;
 use num::Integer;
@@ -11,6 +10,7 @@ use std::{
 use thiserror::Error;
 
 pub mod endian;
+pub mod sack;
 
 #[derive(Error, Debug)]
 pub enum Error {
@@ -220,6 +220,7 @@ impl VarType {
 }
 
 trait State {
+    #[allow(clippy::type_complexity)]
     fn read(self: Box<Self>) -> Result<Option<(Record, Box<dyn State>)>, Error>;
 }
 
@@ -463,7 +464,7 @@ where
     R: Read + Seek,
 {
     fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
-        unimplemented!();
+        self.reader.as_mut().unwrap().get_mut().seek(pos)
     }
 }
 
@@ -504,10 +505,10 @@ impl Iterator for Reader {
         match self.state.take()?.read() {
             Ok(Some((record, next_state))) => {
                 self.state = Some(next_state);
-                return Some(Ok(record));
+                Some(Ok(record))
             }
-            Ok(None) => return None,
-            Err(error) => return Some(Err(error)),
+            Ok(None) => None,
+            Err(error) => Some(Err(error)),
         }
     }
 }
diff --git a/rust/src/sack.rs b/rust/src/sack.rs
new file mode 100644
index 0000000000..6be81d8aff
--- /dev/null
+++ b/rust/src/sack.rs
@@ -0,0 +1,210 @@
+use anyhow::{anyhow, Result};
+use float_next_after::NextAfter;
+use std::{iter::Peekable, str::Chars};
+
+use crate::endian::Endian;
+
+/// Processes `input`, using `endian` as the value of the `ENDIAN` keyword.
+///
+/// Work in progress: so far this only constructs the lexer, which fails if
+/// the first token of `input` is malformed, and then returns an empty vector.
+pub fn sack(input: &str, endian: Endian) -> Result<Vec<u8>> {
+    let _lexer = Lexer::new(input, endian)?;
+    //let mut output = Vec::new();
+    Ok(Vec::new())
+}
+
+/// A token produced by [`Lexer`].
+#[derive(Clone, Debug, PartialEq)]
+enum Token {
+    /// A number without a decimal point, or the `ENDIAN` keyword.
+    Integer(i64),
+    /// A number with a decimal point, or `SYSMIS`, `LOWEST`, or `HIGHEST`.
+    Float(f64),
+    /// The `PCSYSMIS` keyword.
+    PcSysmis,
+    /// A double-quoted string, without the quotes.
+    String(String),
+    /// `;`
+    Semicolon,
+    /// `*`
+    Asterisk,
+    /// `(`
+    LParen,
+    /// `)`
+    RParen,
+    /// The `i8` keyword.
+    I8,
+    /// The `i16` keyword.
+    I16,
+    /// The `i64` keyword.
+    I64,
+    /// `s` followed by a count, e.g. `s10`.
+    S(usize),
+    /// The `COUNT` keyword.
+    Count,
+    /// The `COUNT8` keyword.
+    Count8,
+    /// The `hex` keyword.
+    Hex,
+    /// An identifier followed by `:`, without the colon.
+    Label(String),
+    /// An identifier that starts with `@`, including the `@`.
+    At(String),
+    /// `-` that does not start a number.
+    Minus,
+    /// `+`
+    Plus,
+}
+
+/// Splits `sack` input into [`Token`]s.
+struct Lexer<'a> {
+    /// The input characters not yet consumed.
+    iter: Peekable<Chars<'a>>,
+    /// The current token, or `None` at end of input.
+    token: Option<Token>,
+    /// Number of the input line being read, starting from 1.
+    line_number: usize,
+    /// Determines the value of the `ENDIAN` keyword.
+    endian: Endian,
+}
+
+impl<'a> Lexer<'a> {
+    /// Creates a lexer for `input` and reads its first token, so a malformed
+    /// first token is reported here.
+    fn new(input: &'a str, endian: Endian) -> Result<Lexer<'a>> {
+        let mut lexer = Lexer {
+            iter: input.chars().peekable(),
+            token: None,
+            line_number: 1,
+            endian,
+        };
+        // Store the first token as the current one; dropping it would lose
+        // it and leave `get` failing on every input.
+        lexer.token = lexer.next()?;
+        Ok(lexer)
+    }
+
+    /// Advances to the next token and returns it, or `None` at end of input.
+    ///
+    /// Fails if the lexer was already at end of input.
+    fn get(&mut self) -> Result<Option<&Token>> {
+        if self.token.is_none() {
+            return Err(anyhow!("unexpected end of input"));
+        }
+        self.token = self.next()?;
+        Ok(self.token.as_ref())
+    }
+
+    /// Reads one token from the input, returning `None` at end of input.
+    fn next(&mut self) -> Result<Option<Token>> {
+        // Get the first character of the token, skipping past white space and
+        // comments.
+        let c = loop {
+            let Some(c) = self.iter.next() else {
+                return Ok(None);
+            };
+            let c = if c == '#' {
+                // A comment runs to the end of the line; afterward, treat it
+                // as the new-line that ended it.
+                loop {
+                    match self.iter.next() {
+                        None => return Ok(None),
+                        Some('\n') => break,
+                        _ => (),
+                    }
+                }
+                '\n'
+            } else {
+                c
+            };
+            if c == '\n' {
+                self.line_number += 1
+            } else if !c.is_whitespace() && c != '<' && c != '>' {
+                // `<` and `>` are skipped just like white space.
+                break c;
+            }
+        };
+
+        let token = match c {
+            c if c.is_ascii_digit() || c == '-' => {
+                let mut s = String::from(c);
+                while let Some(c) = self
+                    .iter
+                    .next_if(|&c| c.is_ascii_digit() || c.is_alphabetic() || c == '.')
+                {
+                    s.push(c);
+                }
+
+                if s == "-" {
+                    Token::Minus
+                } else if !s.contains('.') {
+                    Token::Integer(
+                        s.parse()
+                            .map_err(|msg| anyhow!("bad integer literal '{s}' ({msg})"))?,
+                    )
+                } else {
+                    Token::Float(
+                        s.parse()
+                            .map_err(|msg| anyhow!("bad float literal '{s}' ({msg})"))?,
+                    )
+                }
+            }
+            '"' => {
+                // Neither quote is part of the string's contents.
+                let mut s = String::new();
+                loop {
+                    match self.iter.next() {
+                        None => return Err(anyhow!("end-of-file inside string")),
+                        Some('\n') => return Err(anyhow!("new-line inside string")),
+                        Some('"') => break,
+                        Some(c) => s.push(c),
+                    }
+                }
+                Token::String(s)
+            }
+            ';' => Token::Semicolon,
+            '*' => Token::Asterisk,
+            '+' => Token::Plus,
+            '(' => Token::LParen,
+            ')' => Token::RParen,
+            c if c.is_alphabetic() || c == '@' || c == '_' => {
+                let mut s = String::from(c);
+                while let Some(c) = self
+                    .iter
+                    .next_if(|&c| c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '_')
+                {
+                    s.push(c);
+                }
+                if self.iter.next_if_eq(&':').is_some() {
+                    Token::Label(s)
+                } else if s.starts_with('@') {
+                    Token::At(s)
+                } else if let Some(count) = s.strip_prefix('s') {
+                    Token::S(
+                        count
+                            .parse()
+                            .map_err(|msg| anyhow!("bad counted string '{s}' ({msg})"))?,
+                    )
+                } else {
+                    match &s[..] {
+                        "i8" => Token::I8,
+                        "i16" => Token::I16,
+                        "i64" => Token::I64,
+                        "SYSMIS" => Token::Float(-f64::MAX),
+                        "PCSYSMIS" => Token::PcSysmis,
+                        "LOWEST" => Token::Float((-f64::MAX).next_after(0.0)),
+                        "HIGHEST" => Token::Float(f64::MAX),
+                        "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
+                        "COUNT" => Token::Count,
+                        "COUNT8" => Token::Count8,
+                        "hex" => Token::Hex,
+                        _ => return Err(anyhow!("invalid token '{s}'")),
+                    }
+                }
+            }
+            _ => return Err(anyhow!("invalid input byte '{c}'")),
+        };
+        Ok(Some(token))
+    }
+}
-- 
2.30.2