From: Ben Pfaff Date: Tue, 1 Aug 2023 05:05:29 +0000 (-0700) Subject: sack works X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=95cde62bdf5210c1c60dad5598a888b864f93161;p=pspp sack works --- diff --git a/Makefile.am b/Makefile.am index 1283efb019..fcc4fe4856 100644 --- a/Makefile.am +++ b/Makefile.am @@ -160,4 +160,6 @@ mimedir = $(datadir)/mime/packages mime_DATA = org.gnu.pspp.xml EXTRA_DIST += org.gnu.pspp.xml -EXTRA_DIST += rust/Cargo.lock rust/Cargo.toml rust/src/main.rs rust/src/lib.rs rust/src/endian.rs rust/src/hexfloat.rs +EXTRA_DIST += rust/Cargo.lock rust/Cargo.toml rust/src/main.rs \ +rust/src/lib.rs rust/src/endian.rs rust/src/hexfloat.rs \ +rust/src/raw.rs rust/src/sack.rs rust/tests/sack.rs diff --git a/rust/src/endian.rs b/rust/src/endian.rs index bf861af2d1..bb63ec518d 100644 --- a/rust/src/endian.rs +++ b/rust/src/endian.rs @@ -53,6 +53,14 @@ impl ToBytes for Endian { } } } +impl ToBytes for Endian { + fn to_bytes(self, value: i32) -> [u8; 4] { + match self { + Endian::Big => i32::to_be_bytes(value), + Endian::Little => i32::to_le_bytes(value), + } + } +} impl ToBytes for Endian { fn to_bytes(self, value: u16) -> [u8; 2] { match self { diff --git a/rust/src/sack.rs b/rust/src/sack.rs index 1e3b4fff11..4dfcf335ef 100644 --- a/rust/src/sack.rs +++ b/rust/src/sack.rs @@ -5,8 +5,7 @@ use std::{ collections::{hash_map::Entry, HashMap}, error::Error as StdError, fmt::{Display, Formatter, Result as FmtResult}, - iter::{repeat, Peekable}, - str::Chars, + iter::repeat, }; use crate::endian::{Endian, ToBytes}; @@ -17,14 +16,21 @@ pub type Result = std::result::Result; pub struct Error { pub file_name: Option, pub line_number: Option, + pub token: Option, pub message: String, } impl Error { - fn new(file_name: Option<&str>, line_number: Option, message: String) -> Error { + fn new( + file_name: Option<&str>, + line_number: Option, + token: Option<&str>, + message: String, + ) -> Error { Error { file_name: file_name.map(String::from), line_number, + token: token.map(String::from), message, } } @@ -34,14 +40,14 @@ impl StdError for Error {} impl Display for Error { fn fmt(&self, f: &mut Formatter) -> FmtResult { - if let Some(ref file_name) = self.file_name { - write!(f, "{file_name}:")?; - if self.line_number.is_none() { - write!(f, " ")?; - } + match (self.file_name.as_ref(), self.line_number) { + (Some(ref file_name), Some(line_number)) => write!(f, "{file_name}:{line_number}: ")?, + (Some(ref file_name), None) => write!(f, "{file_name}: ")?, + (None, Some(line_number)) => write!(f, "line {line_number}: ")?, + (None, None) => (), } - if let Some(line_number) = self.line_number { - write!(f, "{line_number}: ")?; + if let Some(ref token) = self.token { + write!(f, "at '{token}': ")?; } write!(f, "{}", self.message) } @@ -51,11 +57,15 @@ pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Resul let mut symbol_table = HashMap::new(); let output = _sack(input, input_file_name, endian, &mut symbol_table)?; let output = if !symbol_table.is_empty() { + for (k, v) in symbol_table.iter() { + println!("{k} => {v:?}"); + } for (k, v) in symbol_table.iter() { if v.is_none() { Err(Error::new( input_file_name, None, + None, format!("label {k} used but never defined"), ))? } @@ -90,7 +100,15 @@ fn parse_data_item( let initial_len = output.len(); match lexer.take()? { - Token::Integer(integer) => output.extend_from_slice(&lexer.endian.to_bytes(integer)), + Token::Integer(integer) => { + let Ok(integer): Result = integer.try_into() else { + Err(lexer.error(format!( + "{integer} is not in the valid range [{},{}]", + u32::min_value(), + u32::max_value() + )))? + }; + output.extend_from_slice(&lexer.endian.to_bytes(integer))}, Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)), Token::PcSysmis => { output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff]) @@ -100,7 +118,7 @@ fn parse_data_item( Token::I64 => put_integers::(lexer, "i64", output)?, Token::String(string) => output.extend_from_slice(string.as_bytes()), Token::S(size) => { - let Some(Token::String(ref string)) = lexer.token else { + let Some((Token::String(ref string), _)) = lexer.token else { Err(lexer.error(format!("string expected after 's{size}'")))? }; let len = string.len(); @@ -114,7 +132,7 @@ fn parse_data_item( lexer.get()?; } Token::LParen => { - while lexer.token != Some(Token::RParen) { + while !matches!(lexer.token, Some((Token::RParen, _))) { parse_data_item(lexer, output, symbol_table)?; } lexer.get()?; @@ -122,7 +140,7 @@ fn parse_data_item( Token::Count => put_counted_items::(lexer, "COUNT", output, symbol_table)?, Token::Count8 => put_counted_items::(lexer, "COUNT8", output, symbol_table)?, Token::Hex => { - let Some(Token::String(ref string)) = lexer.token else { + let Some((Token::String(ref string), _)) = lexer.token else { Err(lexer.error(String::from("string expected after 'hex'")))? }; let mut i = string.chars(); @@ -139,40 +157,48 @@ fn parse_data_item( } } Token::Label(name) => { + println!("define {name}"); let value = output.len() as u32; - match symbol_table.entry(name) { + match symbol_table.entry(name.clone()) { Entry::Vacant(v) => { v.insert(Some(value)); } - Entry::Occupied(o) => { - if let Some(v) = o.get() { - if *v != value { - Err(lexer.error(String::from("syntax error")))? + Entry::Occupied(mut o) => { + match o.get() { + Some(v) => { + if *v != value { + Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))? + } } + None => drop(o.insert(Some(value))), } } }; + return Ok(true); } Token::At(name) => { - let mut value = symbol_table.entry(name).or_insert(None).unwrap_or(0); - lexer.get()?; + let mut value = symbol_table + .entry(name.clone()) + .or_insert(None) + .unwrap_or(0); + println!("{name} has value {value}"); loop { let plus = match lexer.token { - Some(Token::Plus) => true, - Some(Token::Minus) => false, + Some((Token::Plus, _)) => true, + Some((Token::Minus, _)) => false, _ => break, }; lexer.get()?; let operand = match lexer.token { - Some(Token::At(ref name)) => if let Some(value) = symbol_table.get(name) { + Some((Token::At(ref name), _)) => if let Some(value) = symbol_table.get(name) { *value } else { symbol_table.insert(name.clone(), None); None } .unwrap_or(0), - Some(Token::Integer(integer)) => integer + Some((Token::Integer(integer), _)) => integer .try_into() .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?, _ => Err(lexer.error(String::from("expecting @label or integer literal")))?, @@ -190,7 +216,7 @@ fn parse_data_item( } _ => (), }; - if lexer.token == Some(Token::Asterisk) { + if let Some((Token::Asterisk, _)) = lexer.token { lexer.get()?; let Token::Integer(count) = lexer.take()? else { Err(lexer.error(String::from("positive integer expected after '*'")))? @@ -204,10 +230,10 @@ fn parse_data_item( } } match lexer.token { - Some(Token::Semicolon) => { + Some((Token::Semicolon, _)) => { lexer.get()?; } - Some(Token::RParen) => (), + Some((Token::RParen, _)) => (), _ => Err(lexer.error(String::from("';' expected")))?, } Ok(true) @@ -225,15 +251,16 @@ where { let old_size = output.len(); output.extend_from_slice(&lexer.endian.to_bytes(T::zero())); - if lexer.token != Some(Token::LParen) { + let start = output.len(); + if !matches!(lexer.token, Some((Token::LParen, _))) { Err(lexer.error(format!("'(' expected after '{name}'")))? } lexer.get()?; - while lexer.token != Some(Token::RParen) { + while !matches!(lexer.token, Some((Token::RParen, _))) { parse_data_item(lexer, output, symbol_table)?; } lexer.get()?; - let delta = output.len() - old_size; + let delta = output.len() - start; let Ok(delta): Result = delta.try_into() else { Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))? }; @@ -251,11 +278,13 @@ where T: Bounded + Display + TryFrom + Copy, Endian: ToBytes, { + println!("put_integers {:?}", lexer.token); let mut n = 0; while let Some(integer) = lexer.take_if(|t| match t { Token::Integer(integer) => Some(*integer), _ => None, })? { + println!("got integer {integer}"); let Ok(integer) = integer.try_into() else { Err(lexer.error(format!( "{integer} is not in the valid range [{},{}]", @@ -266,6 +295,7 @@ where output.extend_from_slice(&lexer.endian.to_bytes(integer)); n += 1; } + println!("put_integers {:?} {n}", lexer.token); if n == 0 { Err(lexer.error(format!("integer expected after '{name}'")))? } @@ -296,17 +326,37 @@ enum Token { } struct Lexer<'a> { - iter: Peekable>, - token: Option, + input: &'a str, + token: Option<(Token, &'a str)>, input_file_name: Option<&'a str>, line_number: usize, endian: Endian, } +fn skip_comments(mut s: &str) -> (&str, usize) { + let mut n_newlines = 0; + let s = loop { + s = s.trim_start_matches([' ', '\t', '\r', '<', '>']); + if let Some(remainder) = s.strip_prefix('#') { + let Some((_, remainder)) = remainder.split_once('\n') else { + break ""; + }; + s = remainder; + n_newlines += 1; + } else if let Some(remainder) = s.strip_prefix('\n') { + s = remainder; + n_newlines += 1; + } else { + break s; + } + }; + (s, n_newlines) +} + impl<'a> Lexer<'a> { fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result> { let mut lexer = Lexer { - iter: input.chars().peekable(), + input, token: None, input_file_name, line_number: 1, @@ -316,14 +366,15 @@ impl<'a> Lexer<'a> { Ok(lexer) } fn error(&self, message: String) -> Error { - Error::new(self.input_file_name, Some(self.line_number), message) + let repr = self.token.as_ref().map(|(_, repr)| *repr); + Error::new(self.input_file_name, Some(self.line_number), repr, message) } fn take(&mut self) -> Result { let Some(token) = self.token.take() else { Err(self.error(String::from("unexpected end of input")))? }; self.token = self.next()?; - Ok(token) + Ok(token.0) } fn take_if(&mut self, condition: F) -> Result> where @@ -332,7 +383,7 @@ impl<'a> Lexer<'a> { let Some(ref token) = self.token else { return Ok(None); }; - match condition(token) { + match condition(&token.0) { Some(value) => { self.token = self.next()?; Ok(Some(value)) @@ -345,113 +396,111 @@ impl<'a> Lexer<'a> { Err(self.error(String::from("unexpected end of input")))? } else { self.token = self.next()?; - Ok((&self.token).into()) + match self.token { + Some((ref token, _)) => Ok(Some(token)), + None => Ok(None), + } } } - fn next(&mut self) -> Result> { + fn next(&mut self) -> Result> { // Get the first character of the token, skipping past white space and // comments. - let c = loop { - let Some(c) = self.iter.next() else { - return Ok(None); - }; - let c = if c == '#' { - loop { - match self.iter.next() { - None => return Ok(None), - Some('\n') => break, - _ => (), - } - } - '\n' - } else { - c - }; - if c == '\n' { - self.line_number += 1 - } else if !c.is_whitespace() && c != '<' && c != '>' { - break c; - } - }; + let (s, n_newlines) = skip_comments(self.input); + self.line_number += n_newlines; + self.input = s; - let token = - match c { - c if c.is_ascii_digit() || c == '-' => { - let mut s = String::from(c); - while let Some(c) = self - .iter - .next_if(|&c| c.is_ascii_digit() || c.is_alphabetic() || c == '.') - { - s.push(c); - } - - if s == "-" { - Token::Minus - } else if !s.contains('.') { - Token::Integer(s.parse().map_err(|msg| { - self.error(format!("bad integer literal '{s}' ({msg})")) - })?) - } else { - Token::Float(s.parse().map_err(|msg| { - self.error(format!("bad float literal '{s}' ({msg})")) - })?) - } - } - '"' => { - let mut s = String::new(); - loop { - match self.iter.next() { - None => Err(self.error(String::from("end-of-file inside string")))?, - Some('\n') => Err(self.error(String::from("new-line inside string")))?, - Some('"') => break, - Some(c) => s.push(c), - } - } - Token::String(s) - } - ';' => Token::Semicolon, - '*' => Token::Asterisk, - '+' => Token::Plus, - '(' => Token::LParen, - ')' => Token::RParen, - c if c.is_alphabetic() || c == '@' || c == '_' => { - let mut s = String::from(c); - while let Some(c) = self.iter.next_if(|&c| { - c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '_' - }) { - s.push(c); - } - if self.iter.next_if_eq(&':').is_some() { - Token::Label(s) - } else if s.starts_with('@') { - Token::At(s) - } else if let Some(count) = s.strip_prefix('s') { + let start = s; + let mut iter = s.chars(); + let Some(c) = iter.next() else { + return Ok(None); + }; + let (token, rest) = match c { + c if c.is_ascii_digit() || c == '-' => { + let len = s + .find(|c: char| { + !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-') + }) + .unwrap_or_else(|| s.len()); + let (number, rest) = s.split_at(len); + let token = if number == "-" { + Token::Minus + } else if let Some(digits) = number.strip_prefix("0x") { + Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| { + self.error(format!("bad integer literal '{number}' ({msg})")) + })?) + } else if !number.contains('.') { + Token::Integer(number.parse().map_err(|msg| { + self.error(format!("bad integer literal '{number}' ({msg})")) + })?) + } else { + Token::Float(number.parse().map_err(|msg| { + self.error(format!("bad float literal '{number}' ({msg})")) + })?) + }; + (token, rest) + } + '"' => { + let s = iter.as_str(); + let Some(len) = s.find(['\n', '"']) else { + Err(self.error(String::from("end-of-file inside string")))? + }; + let (string, rest) = s.split_at(len); + let Some(rest) = rest.strip_prefix('"') else { + Err(self.error(format!("new-line inside string ({string}...{rest})")))? + }; + (Token::String(string.into()), rest) + } + ';' => (Token::Semicolon, iter.as_str()), + '*' => (Token::Asterisk, iter.as_str()), + '+' => (Token::Plus, iter.as_str()), + '(' => (Token::LParen, iter.as_str()), + ')' => (Token::RParen, iter.as_str()), + c if c.is_alphabetic() || c == '@' || c == '_' => { + let len = s + .find(|c: char| { + !(c.is_ascii_digit() + || c.is_alphabetic() + || c == '@' + || c == '.' + || c == '_') + }) + .unwrap_or_else(|| s.len()); + let (s, rest) = s.split_at(len); + if let Some(rest) = rest.strip_prefix(':') { + (Token::Label(s.into()), rest) + } else if let Some(name) = s.strip_prefix('@') { + (Token::At(name.into()), rest) + } else if let Some(count) = s.strip_prefix('s') { + let token = Token::S(count.parse().map_err(|msg| { self.error(format!("bad counted string '{s}' ({msg})")) - })?) - } else { - match &s[..] { - "i8" => Token::I8, - "i16" => Token::I16, - "i64" => Token::I64, - "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)), - "PCSYSMIS" => Token::PcSysmis, - "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()), - "HIGHEST" => Token::Float(f64::MAX.into()), - "ENDIAN" => { - Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }) - } - "COUNT" => Token::Count, - "COUNT8" => Token::Count8, - "hex" => Token::Hex, - _ => Err(self.error(format!("invalid token '{s}'")))?, - } - } + })?); + (token, rest) + } else { + let token = match &s[..] { + "i8" => Token::I8, + "i16" => Token::I16, + "i64" => Token::I64, + "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)), + "PCSYSMIS" => Token::PcSysmis, + "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()), + "HIGHEST" => Token::Float(f64::MAX.into()), + "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }), + "COUNT" => Token::Count, + "COUNT8" => Token::Count8, + "hex" => Token::Hex, + _ => Err(self.error(format!("invalid token '{s}'")))?, + }; + (token, rest) } - _ => Err(self.error(format!("invalid input byte '{c}'")))?, - }; - Ok(Some(token)) + } + _ => Err(self.error(format!("invalid input byte '{c}'")))?, + }; + self.input = rest; + let repr = &start[..start.len() - rest.len()]; + println!("{token:?} {repr}"); + Ok(Some((token, repr))) } } @@ -475,6 +524,91 @@ mod test { "01 Jan 11"; "20:53:52"; "PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; i8 0 *3; +"#; + let output = sack(input, None, Endian::Big)?; + HexView::new(&output).print()?; + Ok(()) + } + + #[test] + fn pcp_sack() -> Result<()> { + let input = r#" +# File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +@LABELS; @LABELS_END - @LABELS; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; # Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; # Fixed. + i16 0; + i16 15; + 1; + i16 0; # Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + # Numeric variable, no label or missing values. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + + # Numeric variable, variable label. + 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS; + + # Numeric variable with missing value. + 0; 0; 0; 0x050800; s8 "NUM3"; 1.0; + + # Numeric variable, variable label and missing value. + 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0; + + # String variable, no label or missing values. + 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS; + + # String variable, variable label. + 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS; + + # String variable with missing value. + 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS"; + + # String variable, variable label and missing value. + 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR"; + + # Long string variable + 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS; + 0 * 8; + + # Long string variable with variable label + 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS; + 0 * 8; +VARS_END: + +LABELS: + 3; i8 0 0 0; LABELS_OFS: i8 0; + NUM2_LABEL: COUNT8("Numeric variable 2's label"); + NUM4_LABEL: COUNT8("Another numeric variable label"); + STR2_LABEL: COUNT8("STR2's variable label"); + STR4_LABEL: COUNT8("STR4's variable label"); + STR6_LABEL: COUNT8("Another string variable's label"); +LABELS_END: + +DATA: + 0.0; "11/28/14"; 1.0; + 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r"; + s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM"; +DATA_END: "#; let output = sack(input, None, Endian::Big)?; HexView::new(&output).print()?; diff --git a/rust/tests/sack.rs b/rust/tests/sack.rs index 4f6e9680a0..abb51abe2a 100644 --- a/rust/tests/sack.rs +++ b/rust/tests/sack.rs @@ -1,5 +1,4 @@ use std::fs::read_to_string; -use std::io::{stdout, IsTerminal, Write}; use std::path::PathBuf; use anyhow::{anyhow, Result}; @@ -61,15 +60,14 @@ struct Args { /// Input file. #[arg(required = true, name = "input")] input_file_name: PathBuf, + + /// Output file. + #[arg(required = true, name = "output")] + output_file_name: PathBuf, } fn main() -> Result<()> { - let Args { be, le, input_file_name } = Args::parse(); - if stdout().is_terminal() { - return Err(anyhow!( - "not writing binary data to a terminal; redirect to a file" - )); - } + let Args { be, le, input_file_name, output_file_name } = Args::parse(); let endian = match (be, le) { (false, false) | (true, false) => Endian::Big, (false, true) => Endian::Little, @@ -78,6 +76,6 @@ fn main() -> Result<()> { let input = read_to_string(&input_file_name)?; let input_file_name = input_file_name.to_string_lossy(); let output = sack(&input, Some(&input_file_name), endian)?; - stdout().write_all(&output)?; + std::fs::write(&output_file_name, &output)?; Ok(()) } diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at index 4057109e1b..beb35adae9 100644 --- a/tests/data/sys-file-reader.at +++ b/tests/data/sys-file-reader.at @@ -16,6 +16,16 @@ dnl along with this program. If not, see . dnl AT_BANNER([system file reader - positive]) +m4_divert_text([PREPARE_TESTS], [dnl +sack () { + local endian=$1 file=$2 dir=$(pwd) + $(which sack) "$1" "$2" > expected.sav || return 1 + (cd $top_srcdir/rust && cargo test --test sack -- "$endian" "$dir/$file" "$dir/actual.sav" >/dev/null 2>&1) || return 1 + diff -u <(hd expected.sav) <(hd actual.sav) || return 1 + cat expected.sav +} +]) + AT_SETUP([variable labels and missing values]) AT_KEYWORDS([sack synthetic system file positive]) AT_DATA([sys-file.sack], [dnl