From 3b441caf454ba932dda1245cabfddc595861b23e Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 14 Jul 2024 11:35:52 -0700 Subject: [PATCH] work on scanning --- rust/src/lex/scan/mod.rs | 113 +++++++++++ rust/src/lex/scan/test.rs | 353 +++++++++++++++++++++++++++++++++++ rust/src/lex/segment/mod.rs | 5 +- rust/src/lex/segment/test.rs | 34 ++++ rust/src/lex/token.rs | 13 +- 5 files changed, 513 insertions(+), 5 deletions(-) create mode 100644 rust/src/lex/scan/test.rs diff --git a/rust/src/lex/scan/mod.rs b/rust/src/lex/scan/mod.rs index 343bde8ca2..2e5f993311 100644 --- a/rust/src/lex/scan/mod.rs +++ b/rust/src/lex/scan/mod.rs @@ -10,3 +10,116 @@ //! are the same as the tokens used by the PSPP parser with a few additional //! types. +use super::{ + segment::{Mode, Segment, Segmenter}, + token::{Punct, Token, TokenError}, +}; +use std::collections::VecDeque; + +/// Attempts to merge a sequence of tokens together into a single token. The +/// tokens are taken from the beginning of `input`. If successful, removes one +/// or more tokens from the beginning of `input` and returns the merged +/// token. More input tokens might be needed; if so, leaves `input` alone and +/// returns `None`. In the latter case, the caller should add more tokens to the +/// input ([Token::End] or [Token::Punct(Punct::EndCmd)] is always sufficient). +/// +/// This performs two different kinds of token merging: +/// +/// - String concatenation, where syntax like `"a" + "b"` is converted into a +/// single string token. This is definitely needed because the parser relies +/// on it. +/// +/// - Negative number merging, where syntax like `-5` is converted from a pair +/// of tokens (a dash and a positive number) into a single token (a negative +/// number). This might not be needed anymore because the segmenter +/// directly treats a dash followed by a number, with optional intervening +/// white space, as a negative number. 
It's only needed if we want +/// intervening comments to be allowed or for part of the negative number +/// token to be produced by macro expansion. +pub fn merge_tokens(input: &mut VecDeque) -> Option { + match input.get(0)? { + Token::Punct(Punct::Dash) => match input.get(1)? { + Token::Number(number) if number.is_sign_positive() => { + let number = *number; + input.pop_front().unwrap(); + input.pop_front().unwrap(); + return Some(Token::Number(-number)); + } + _ => Some(input.pop_front().unwrap()), + }, + Token::String(_) => { + let mut i = 0; + while matches!(input.get(i * 2 + 1)?, Token::Punct(Punct::Plus)) + && matches!(input.get(i * 2 + 2)?, Token::String(_)) + { + i += 1; + } + if i == 0 { + Some(input.pop_front().unwrap()) + } else { + let mut output = String::new(); + for i in 0..=i { + let Token::String(s) = &input[i * 2] else { + unreachable!() + }; + output.push_str(&s); + } + for _ in 0..i * 2 + 1 { + input.pop_front().unwrap(); + } + Some(Token::String(output)) + } + } + _ => Some(input.pop_front().unwrap()), + } +} + +pub struct StringLexer<'a> { + input: &'a str, + segmenter: Segmenter, + tokens: VecDeque, +} + +impl<'a> StringLexer<'a> { + pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self { + Self { + input, + segmenter: Segmenter::new(mode, is_snippet), + tokens: VecDeque::with_capacity(1), + } + } +} + +impl<'a> Iterator for StringLexer<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + if let Some(token) = merge_tokens(&mut self.tokens) { + return Some(Ok(token)); + } + loop { + let (rest, segment) = self.segmenter.push(self.input, true).unwrap(); + if segment == Segment::End && self.tokens.is_empty() { + return None; + } + let s = &self.input[..self.input.len() - rest.len()]; + self.input = rest; + match Token::try_from_segment(s, segment) { + Err(error) => { + println!("{:?}", &self.tokens); + return Some(Err(error)); + } + Ok(Some(token)) => { + self.tokens.push_back(token); + if let Some(token) = 
merge_tokens(&mut self.tokens) { + return Some(Ok(token)); + } + } + Ok(None) => (), + }; + } + } +} + +#[cfg(test)] +mod test; diff --git a/rust/src/lex/scan/test.rs b/rust/src/lex/scan/test.rs new file mode 100644 index 0000000000..1b84e5ffbe --- /dev/null +++ b/rust/src/lex/scan/test.rs @@ -0,0 +1,353 @@ +use crate::lex::{ + segment::Mode, + token::{MacroToken, Punct, Token, TokenError}, +}; + +use super::StringLexer; + +fn print_token(token: &Token) { + match token { + Token::End => print!("Token::End"), + Token::Id(s) => print!("Token::Id(String::from({s:?}))"), + Token::Number(number) => print!("Token::Number({number:?})"), + Token::String(s) => print!("Token::String(String::from({s:?}))"), + Token::EndCommand => print!("Token::EndCommand"), + Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"), + Token::MacroToken(m) => print!("Token::MacroToken(MacroToken::{m:?})"), + } +} + +fn check_scan(input: &str, expected: &[Result]) { + let tokens = StringLexer::new(input, Mode::Auto, false).collect::>(); + + if &tokens != expected { + for token in &tokens { + match token { + Ok(token) => { + print!("Ok("); + print_token(token); + print!(")"); + } + Err(error) => print!("Err(TokenError::{error:?})"), + } + println!(","); + } + + eprintln!("tokens differ from expected:"); + let difference = diff::slice(expected, &tokens); + for result in difference { + match result { + diff::Result::Left(left) => eprintln!("-{left:?}"), + diff::Result::Both(left, _right) => eprintln!(" {left:?}"), + diff::Result::Right(right) => eprintln!("+{right:?}"), + } + } + panic!(); + } +} + +#[test] +fn test_identifiers() { + check_scan( + r#"a aB i5 $x @efg @@. !abcd !* !*a #.# .x _z. +abcd. abcd. +QRSTUV./* end of line comment */ +QrStUv./* end of line comment */ +WXYZ. /* unterminated end of line comment +�. 
/* U+FFFD is not valid in an identifier +"#, + &[ + Ok(Token::Id(String::from("a"))), + Ok(Token::Id(String::from("aB"))), + Ok(Token::Id(String::from("i5"))), + Ok(Token::Id(String::from("$x"))), + Ok(Token::Id(String::from("@efg"))), + Ok(Token::Id(String::from("@@."))), + Ok(Token::MacroToken(MacroToken::MacroId(String::from( + "!abcd", + )))), + Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))), + Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))), + Ok(Token::Id(String::from("a"))), + Ok(Token::Id(String::from("#.#"))), + Ok(Token::MacroToken(MacroToken::Dot)), + Ok(Token::Id(String::from("x"))), + Ok(Token::MacroToken(MacroToken::Underscore)), + Ok(Token::Id(String::from("z"))), + Ok(Token::EndCommand), + Ok(Token::Id(String::from("abcd."))), + Ok(Token::Id(String::from("abcd"))), + Ok(Token::EndCommand), + Ok(Token::Id(String::from("QRSTUV"))), + Ok(Token::EndCommand), + Ok(Token::Id(String::from("QrStUv"))), + Ok(Token::EndCommand), + Ok(Token::Id(String::from("WXYZ"))), + Ok(Token::EndCommand), + Err(TokenError::UnexpectedChar('�')), + Ok(Token::EndCommand), + ], + ); +} + +#[test] +fn test_reserved_words() { + check_scan( + r#"and or not eq ge gt le lt ne all by to with +AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH +andx orx notx eqx gex gtx lex ltx nex allx byx tox withx +and. with. 
+"#, + &[ + Ok(Token::Punct(Punct::And)), + Ok(Token::Punct(Punct::Or)), + Ok(Token::Punct(Punct::Not)), + Ok(Token::Punct(Punct::Eq)), + Ok(Token::Punct(Punct::Ge)), + Ok(Token::Punct(Punct::Gt)), + Ok(Token::Punct(Punct::Le)), + Ok(Token::Punct(Punct::Lt)), + Ok(Token::Punct(Punct::Ne)), + Ok(Token::Punct(Punct::All)), + Ok(Token::Punct(Punct::By)), + Ok(Token::Punct(Punct::To)), + Ok(Token::Punct(Punct::With)), + Ok(Token::Punct(Punct::And)), + Ok(Token::Punct(Punct::Or)), + Ok(Token::Punct(Punct::Not)), + Ok(Token::Punct(Punct::Eq)), + Ok(Token::Punct(Punct::Ge)), + Ok(Token::Punct(Punct::Gt)), + Ok(Token::Punct(Punct::Le)), + Ok(Token::Punct(Punct::Lt)), + Ok(Token::Punct(Punct::Ne)), + Ok(Token::Punct(Punct::All)), + Ok(Token::Punct(Punct::By)), + Ok(Token::Punct(Punct::To)), + Ok(Token::Punct(Punct::With)), + Ok(Token::Id(String::from("andx"))), + Ok(Token::Id(String::from("orx"))), + Ok(Token::Id(String::from("notx"))), + Ok(Token::Id(String::from("eqx"))), + Ok(Token::Id(String::from("gex"))), + Ok(Token::Id(String::from("gtx"))), + Ok(Token::Id(String::from("lex"))), + Ok(Token::Id(String::from("ltx"))), + Ok(Token::Id(String::from("nex"))), + Ok(Token::Id(String::from("allx"))), + Ok(Token::Id(String::from("byx"))), + Ok(Token::Id(String::from("tox"))), + Ok(Token::Id(String::from("withx"))), + Ok(Token::Id(String::from("and."))), + Ok(Token::Punct(Punct::With)), + Ok(Token::EndCommand), + ], + ); +} + +#[test] +fn test_punctuation() { + check_scan( + r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] ** +~&|=>=><=<~=<>(),-+*/[]** +% : ; ? 
_ ` { } ~ +"#, + &[ + Ok(Token::Punct(Punct::Not)), + Ok(Token::Punct(Punct::And)), + Ok(Token::Punct(Punct::Or)), + Ok(Token::Punct(Punct::Equals)), + Ok(Token::Punct(Punct::Ge)), + Ok(Token::Punct(Punct::Gt)), + Ok(Token::Punct(Punct::Le)), + Ok(Token::Punct(Punct::Lt)), + Ok(Token::Punct(Punct::Ne)), + Ok(Token::Punct(Punct::Ne)), + Ok(Token::Punct(Punct::LParen)), + Ok(Token::Punct(Punct::RParen)), + Ok(Token::Punct(Punct::Comma)), + Ok(Token::Punct(Punct::Dash)), + Ok(Token::Punct(Punct::Plus)), + Ok(Token::Punct(Punct::Asterisk)), + Ok(Token::Punct(Punct::Slash)), + Ok(Token::Punct(Punct::LSquare)), + Ok(Token::Punct(Punct::RSquare)), + Ok(Token::Punct(Punct::Exp)), + Ok(Token::Punct(Punct::Not)), + Ok(Token::Punct(Punct::And)), + Ok(Token::Punct(Punct::Or)), + Ok(Token::Punct(Punct::Equals)), + Ok(Token::Punct(Punct::Ge)), + Ok(Token::Punct(Punct::Gt)), + Ok(Token::Punct(Punct::Le)), + Ok(Token::Punct(Punct::Lt)), + Ok(Token::Punct(Punct::Ne)), + Ok(Token::Punct(Punct::Ne)), + Ok(Token::Punct(Punct::LParen)), + Ok(Token::Punct(Punct::RParen)), + Ok(Token::Punct(Punct::Comma)), + Ok(Token::Punct(Punct::Dash)), + Ok(Token::Punct(Punct::Plus)), + Ok(Token::Punct(Punct::Asterisk)), + Ok(Token::Punct(Punct::Slash)), + Ok(Token::Punct(Punct::LSquare)), + Ok(Token::Punct(Punct::RSquare)), + Ok(Token::Punct(Punct::Exp)), + Ok(Token::MacroToken(MacroToken::Percent)), + Ok(Token::Punct(Punct::Colon)), + Ok(Token::Punct(Punct::Semicolon)), + Ok(Token::MacroToken(MacroToken::Question)), + Ok(Token::MacroToken(MacroToken::Underscore)), + Ok(Token::MacroToken(MacroToken::Backtick)), + Ok(Token::Punct(Punct::LCurly)), + Ok(Token::Punct(Punct::RCurly)), + Ok(Token::Punct(Punct::Not)), + ], + ); +} + +#[test] +fn test_positive_numbers() { + check_scan( + r#"0 1 01 001. 1. +123. /* comment 1 */ /* comment 2 */ +.1 0.1 00.1 00.10 +5e1 6E-1 7e+1 6E+01 6e-03 +.3E1 .4e-1 .5E+1 .6e+01 .7E-03 +1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 +. 
1e e1 1e+ 1e- +"#, + &[ + Ok(Token::Number(0.0)), + Ok(Token::Number(1.0)), + Ok(Token::Number(1.0)), + Ok(Token::Number(1.0)), + Ok(Token::Number(1.0)), + Ok(Token::EndCommand), + Ok(Token::Number(123.0)), + Ok(Token::EndCommand), + Ok(Token::EndCommand), + Ok(Token::Number(1.0)), + Ok(Token::Number(0.1)), + Ok(Token::Number(0.1)), + Ok(Token::Number(0.1)), + Ok(Token::Number(50.0)), + Ok(Token::Number(0.6)), + Ok(Token::Number(70.0)), + Ok(Token::Number(60.0)), + Ok(Token::Number(0.006)), + Ok(Token::EndCommand), + Ok(Token::Number(30.0)), + Ok(Token::Number(0.04)), + Ok(Token::Number(5.0)), + Ok(Token::Number(6.0)), + Ok(Token::Number(0.0007)), + Ok(Token::Number(12.3)), + Ok(Token::Number(4.56)), + Ok(Token::Number(789.0)), + Ok(Token::Number(999.0)), + Ok(Token::Number(0.0112)), + Ok(Token::EndCommand), + Err(TokenError::ExpectedExponent(String::from("1e"))), + Ok(Token::Id(String::from("e1"))), + Err(TokenError::ExpectedExponent(String::from("1e+"))), + Err(TokenError::ExpectedExponent(String::from("1e-"))), + ], + ); +} + +#[test] +fn test_negative_numbers() { + check_scan( + r#" -0 -1 -01 -001. -1. + -123. /* comment 1 */ /* comment 2 */ + -.1 -0.1 -00.1 -00.10 + -5e1 -6E-1 -7e+1 -6E+01 -6e-03 + -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03 + -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03 + -/**/1 + -. -1e -e1 -1e+ -1e- -1. 
+"#, + &[ + Ok(Token::Number(-0.0)), + Ok(Token::Number(-1.0)), + Ok(Token::Number(-1.0)), + Ok(Token::Number(-1.0)), + Ok(Token::Number(-1.0)), + Ok(Token::EndCommand), + Ok(Token::Number(-123.0)), + Ok(Token::EndCommand), + Ok(Token::Number(-0.1)), + Ok(Token::Number(-0.1)), + Ok(Token::Number(-0.1)), + Ok(Token::Number(-0.1)), + Ok(Token::Number(-50.0)), + Ok(Token::Number(-0.6)), + Ok(Token::Number(-70.0)), + Ok(Token::Number(-60.0)), + Ok(Token::Number(-0.006)), + Ok(Token::Number(-3.0)), + Ok(Token::Number(-0.04)), + Ok(Token::Number(-5.0)), + Ok(Token::Number(-6.0)), + Ok(Token::Number(-0.0007)), + Ok(Token::Number(-12.3)), + Ok(Token::Number(-4.56)), + Ok(Token::Number(-789.0)), + Ok(Token::Number(-999.0)), + Ok(Token::Number(-0.0112)), + Ok(Token::Number(-1.0)), + Ok(Token::Punct(Punct::Dash)), + Ok(Token::MacroToken(MacroToken::Dot)), + Err(TokenError::ExpectedExponent(String::from("-1e"))), + Ok(Token::Punct(Punct::Dash)), + Ok(Token::Id(String::from("e1"))), + Err(TokenError::ExpectedExponent(String::from("-1e+"))), + Err(TokenError::ExpectedExponent(String::from("-1e-"))), + Ok(Token::Number(-1.0)), + Ok(Token::EndCommand), + ], + ); +} + + +#[test] +fn test_strings() { + check_scan(r#"'x' "y" 'abc' +'Don''t' "Can't" 'Won''t' +"""quoted""" '"quoted"' +'' "" '''' """" +'missing end quote +"missing double quote +'x' + "y" ++ 'z' + +'a' /* abc */ + "b" /* ++ 'c' +/* */"d"/* */+'e' +'foo' ++ /* special case: + in column 0 would ordinarily start a new command +'bar' +'foo' + + +'bar' +'foo' ++ + +'bar' + ++ +x"4142"+'5152' +"4142"+ +x'5152' +x"4142" ++u'304a' +"�あいうえお" +"abc"+U"FFFD"+u'3048'+"xyz" +"#, &[]); +} +#[test] +fn test_strings2() { + check_scan(r#""""" +'error +'b' +"#, &[]); +} diff --git a/rust/src/lex/segment/mod.rs b/rust/src/lex/segment/mod.rs index 401d523825..ca7dfd0686 100644 --- a/rust/src/lex/segment/mod.rs +++ b/rust/src/lex/segment/mod.rs @@ -25,7 +25,9 @@ use crate::{ }; use bitflags::bitflags; -use 
super::command_name::{command_match, COMMAND_NAMES}; +use super::{ + command_name::{command_match, COMMAND_NAMES}, +}; /// Segmentation mode. /// @@ -608,7 +610,6 @@ impl Segmenter { mut input: &'a str, eof: bool, ) -> Result<(&'a str, Segment), Incomplete> { - println!("{quote:?} {input:?}"); while let (Some(c), rest) = take(input, eof)? { match c { _ if c == quote => { diff --git a/rust/src/lex/segment/test.rs b/rust/src/lex/segment/test.rs index d01a80d779..056ac7b031 100644 --- a/rust/src/lex/segment/test.rs +++ b/rust/src/lex/segment/test.rs @@ -2157,3 +2157,37 @@ fourth command. ], ); } + +#[test] +fn test_strings2() { + print_segmentation(r#"'x' "y" 'abc' +'Don''t' "Can't" 'Won''t' +"""quoted""" '"quoted"' +'' "" '''' """" +'missing end quote +"missing double quote +'x' + "y" ++ 'z' + +'a' /* abc */ + "b" /* ++ 'c' +/* */"d"/* */+'e' +'foo' ++ /* special case: + in column 0 would ordinarily start a new command +'bar' +'foo' + + +'bar' +'foo' ++ + +'bar' + ++ +x"4142"+'5152' +"4142"+ +x'5152' +x"4142" ++u'304a' +"�あいうえお" +"abc"+U"FFFD"+u'3048'+"xyz" +"#); +} diff --git a/rust/src/lex/token.rs b/rust/src/lex/token.rs index 0b2021b5c8..2c3489c554 100644 --- a/rust/src/lex/token.rs +++ b/rust/src/lex/token.rs @@ -2,6 +2,7 @@ use thiserror::Error as ThisError; use super::segment::Segment; +#[derive(Clone, Debug, PartialEq)] pub enum Token { /// End of input. End, @@ -28,6 +29,7 @@ pub enum Token { MacroToken(MacroToken), } +#[derive(Clone, Debug, PartialEq, Eq)] pub enum Punct { /// `+`. Plus, @@ -115,6 +117,7 @@ pub enum Punct { } /// Tokens that only appear in macros. +#[derive(Clone, Debug, PartialEq, Eq)] pub enum MacroToken { /// Identifier starting with `!`. MacroId(String), @@ -131,6 +134,9 @@ pub enum MacroToken { /// ````. Backtick, + /// `.` (in the middle of a line by itself, where it does not end a command). + Dot, + /// `_`. 
/// /// Although underscores may appear within identifiers, they can't be the @@ -138,7 +144,7 @@ pub enum MacroToken { Underscore, } -#[derive(ThisError, Debug)] +#[derive(ThisError, Debug, PartialEq, Eq)] pub enum TokenError { /// Unterminated string constant. #[error("Unterminated string constant.")] @@ -178,7 +184,7 @@ pub enum TokenError { } impl Token { - pub fn try_from_segment((segment, s): (Segment, &str)) -> Result, TokenError> { + pub fn try_from_segment(s: &str, segment: Segment) -> Result, TokenError> { match segment { Segment::Number => Ok(Some(Self::Number(s.parse().unwrap()))), Segment::QuotedString => { @@ -287,7 +293,8 @@ impl Token { "?" => Ok(Some(Self::MacroToken(MacroToken::Question))), "`" => Ok(Some(Self::MacroToken(MacroToken::Backtick))), "_" => Ok(Some(Self::MacroToken(MacroToken::Underscore))), - _ => unreachable!(), + "." => Ok(Some(Self::MacroToken(MacroToken::Dot))), + _ => unreachable!("bad punctuator {s:?}"), }, Segment::Shbang | Segment::Spaces -- 2.30.2