From 3b441caf454ba932dda1245cabfddc595861b23e Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@cs.stanford.edu>
Date: Sun, 14 Jul 2024 11:35:52 -0700
Subject: [PATCH] work on scanning

---
 rust/src/lex/scan/mod.rs     | 113 +++++++++++
 rust/src/lex/scan/test.rs    | 353 +++++++++++++++++++++++++++++++++++
 rust/src/lex/segment/mod.rs  |   5 +-
 rust/src/lex/segment/test.rs |  34 ++++
 rust/src/lex/token.rs        |  13 +-
 5 files changed, 513 insertions(+), 5 deletions(-)
 create mode 100644 rust/src/lex/scan/test.rs
diff --git a/rust/src/lex/scan/mod.rs b/rust/src/lex/scan/mod.rs
index 343bde8ca2..2e5f993311 100644
--- a/rust/src/lex/scan/mod.rs
+++ b/rust/src/lex/scan/mod.rs
@@ -10,3 +10,116 @@
 //! are the same as the tokens used by the PSPP parser with a few additional
 //! types.
 
+use super::{
+    segment::{Mode, Segment, Segmenter},
+    token::{Punct, Token, TokenError},
+};
+use std::collections::VecDeque;
+
+/// Attempts to merge a sequence of tokens together into a single token.  The
+/// tokens are taken from the beginning of `input`.  If successful, removes one
+/// or more tokens from the beginning of `input` and returns the merged token.
+/// More input tokens might be needed; if so, leaves `input` alone and returns
+/// `None`.  In the latter case, the caller should add more tokens to the input
+/// ([`Token::End`] or [`Token::EndCommand`] is always sufficient).
+///
+/// This performs two different kinds of token merging:
+///
+///   - String concatenation, where syntax like `"a" + "b"` is converted into a
+///     single string token.  This is definitely needed because the parser
+///     relies on it.
+///
+///   - Negative number merging, where syntax like `-5` is converted from a pair
+///     of tokens (a dash and a positive number) into a single token (a negative
+///     number).  This might not be needed anymore because the segmenter
+///     directly treats a dash followed by a number, with optional intervening
+///     white space, as a negative number.  It's only needed if we want
+///     intervening comments to be allowed or for part of the negative number
+///     token to be produced by macro expansion.
+pub fn merge_tokens(input: &mut VecDeque<Token>) -> Option<Token> {
+    match input.front()? {
+        Token::Punct(Punct::Dash) => match input.get(1)? {
+            Token::Number(number) if number.is_sign_positive() => {
+                let negated = -*number;
+                // Consume both the dash and the number.
+                input.drain(..2);
+                Some(Token::Number(negated))
+            }
+            _ => input.pop_front(),
+        },
+        Token::String(_) => {
+            // Count the `+ "string"` pairs that follow the first string.  `?`
+            // bails out, leaving `input` alone, if the tokens run out first.
+            let mut n_pairs = 0;
+            while matches!(input.get(n_pairs * 2 + 1)?, Token::Punct(Punct::Plus))
+                && matches!(input.get(n_pairs * 2 + 2)?, Token::String(_))
+            {
+                n_pairs += 1;
+            }
+            if n_pairs == 0 {
+                return input.pop_front();
+            }
+            // Remove the strings along with the `+` tokens between them,
+            // concatenating just the strings (every other token).
+            let mut output = String::new();
+            for token in input.drain(..n_pairs * 2 + 1).step_by(2) {
+                let Token::String(s) = token else {
+                    unreachable!()
+                };
+                output.push_str(&s);
+            }
+            Some(Token::String(output))
+        }
+        _ => input.pop_front(),
+    }
+}
+
+/// Iterator that scans the syntax in a string into merged [`Token`]s.
+pub struct StringLexer<'a> {
+    input: &'a str,
+    segmenter: Segmenter,
+    tokens: VecDeque<Token>,
+}
+
+impl<'a> StringLexer<'a> {
+    /// Creates a lexer for `input` whose [`Segmenter`] uses `mode` and `is_snippet`.
+    pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
+        Self {
+            input,
+            segmenter: Segmenter::new(mode, is_snippet),
+            tokens: VecDeque::with_capacity(1),
+        }
+    }
+}
+
+impl<'a> Iterator for StringLexer<'a> {
+    type Item = Result<Token, TokenError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(token) = merge_tokens(&mut self.tokens) {
+            return Some(Ok(token));
+        }
+        loop {
+            let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
+            // At end of input, stop only once no buffered tokens remain.
+            if segment == Segment::End && self.tokens.is_empty() {
+                return None;
+            }
+            let s = &self.input[..self.input.len() - rest.len()];
+            self.input = rest;
+            match Token::try_from_segment(s, segment) {
+                Err(error) => return Some(Err(error)),
+                Ok(Some(token)) => {
+                    self.tokens.push_back(token);
+                    if let Some(token) = merge_tokens(&mut self.tokens) {
+                        return Some(Ok(token));
+                    }
+                }
+                Ok(None) => (),
+            };
+        }
+    }
+}
+
+#[cfg(test)]
+mod test;
diff --git a/rust/src/lex/scan/test.rs b/rust/src/lex/scan/test.rs
new file mode 100644
index 0000000000..1b84e5ffbe
--- /dev/null
+++ b/rust/src/lex/scan/test.rs
@@ -0,0 +1,353 @@
+use crate::lex::{
+    segment::Mode,
+    token::{MacroToken, Punct, Token, TokenError},
+};
+
+use super::StringLexer;
+
+/// Prints `token` to stdout spelled as the Rust expression that constructs it.
+fn print_token(token: &Token) {
+    match token {
+        Token::End => print!("Token::End"),
+        Token::EndCommand => print!("Token::EndCommand"),
+        Token::Id(s) => print!("Token::Id(String::from({s:?}))"),
+        Token::String(s) => print!("Token::String(String::from({s:?}))"),
+        Token::Number(number) => print!("Token::Number({number:?})"),
+        Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"),
+        Token::MacroToken(m) => print!("Token::MacroToken(MacroToken::{m:?})"),
+    }
+}
+
+/// Scans `input` in [`Mode::Auto`] and panics unless the results match `expected`.
+fn check_scan(input: &str, expected: &[Result<Token, TokenError>]) {
+    let tokens: Vec<_> = StringLexer::new(input, Mode::Auto, false).collect();
+    if tokens == expected {
+        return;
+    }
+    for token in &tokens {
+        match token {
+            Ok(token) => {
+                print!("Ok(");
+                print_token(token);
+                print!(")");
+            }
+            Err(error) => print!("Err(TokenError::{error:?})"),
+        }
+        println!(",");
+    }
+    eprintln!("tokens differ from expected:");
+    for result in diff::slice(expected, &tokens) {
+        match result {
+            diff::Result::Left(left) => eprintln!("-{left:?}"),
+            diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
+            diff::Result::Right(right) => eprintln!("+{right:?}"),
+        }
+    }
+    panic!();
+}
+
+#[test]
+fn test_identifiers() {
+    check_scan(
+        r#"a aB i5 $x @efg @@. !abcd !* !*a #.# .x _z.
+abcd. abcd.
+QRSTUV./* end of line comment */
+QrStUv./* end of line comment */ 
+WXYZ. /* unterminated end of line comment
+ï¿½. /* U+FFFD is not valid in an identifier
+"#,
+        &[
+            Ok(Token::Id(String::from("a"))),
+            Ok(Token::Id(String::from("aB"))),
+            Ok(Token::Id(String::from("i5"))),
+            Ok(Token::Id(String::from("$x"))),
+            Ok(Token::Id(String::from("@efg"))),
+            Ok(Token::Id(String::from("@@."))),
+            Ok(Token::MacroToken(MacroToken::MacroId(String::from(
+                "!abcd",
+            )))),
+            Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
+            Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
+            Ok(Token::Id(String::from("a"))),
+            Ok(Token::Id(String::from("#.#"))),
+            Ok(Token::MacroToken(MacroToken::Dot)),
+            Ok(Token::Id(String::from("x"))),
+            Ok(Token::MacroToken(MacroToken::Underscore)),
+            Ok(Token::Id(String::from("z"))),
+            Ok(Token::EndCommand),
+            Ok(Token::Id(String::from("abcd."))),
+            Ok(Token::Id(String::from("abcd"))),
+            Ok(Token::EndCommand),
+            Ok(Token::Id(String::from("QRSTUV"))),
+            Ok(Token::EndCommand),
+            Ok(Token::Id(String::from("QrStUv"))),
+            Ok(Token::EndCommand),
+            Ok(Token::Id(String::from("WXYZ"))),
+            Ok(Token::EndCommand),
+            Err(TokenError::UnexpectedChar('ï¿½')),
+            Ok(Token::EndCommand),
+        ],
+    );
+}
+
+#[test]
+fn test_reserved_words() {
+    check_scan(
+        r#"and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+"#,
+        &[
+            Ok(Token::Punct(Punct::And)),
+            Ok(Token::Punct(Punct::Or)),
+            Ok(Token::Punct(Punct::Not)),
+            Ok(Token::Punct(Punct::Eq)),
+            Ok(Token::Punct(Punct::Ge)),
+            Ok(Token::Punct(Punct::Gt)),
+            Ok(Token::Punct(Punct::Le)),
+            Ok(Token::Punct(Punct::Lt)),
+            Ok(Token::Punct(Punct::Ne)),
+            Ok(Token::Punct(Punct::All)),
+            Ok(Token::Punct(Punct::By)),
+            Ok(Token::Punct(Punct::To)),
+            Ok(Token::Punct(Punct::With)),
+            Ok(Token::Punct(Punct::And)),
+            Ok(Token::Punct(Punct::Or)),
+            Ok(Token::Punct(Punct::Not)),
+            Ok(Token::Punct(Punct::Eq)),
+            Ok(Token::Punct(Punct::Ge)),
+            Ok(Token::Punct(Punct::Gt)),
+            Ok(Token::Punct(Punct::Le)),
+            Ok(Token::Punct(Punct::Lt)),
+            Ok(Token::Punct(Punct::Ne)),
+            Ok(Token::Punct(Punct::All)),
+            Ok(Token::Punct(Punct::By)),
+            Ok(Token::Punct(Punct::To)),
+            Ok(Token::Punct(Punct::With)),
+            Ok(Token::Id(String::from("andx"))),
+            Ok(Token::Id(String::from("orx"))),
+            Ok(Token::Id(String::from("notx"))),
+            Ok(Token::Id(String::from("eqx"))),
+            Ok(Token::Id(String::from("gex"))),
+            Ok(Token::Id(String::from("gtx"))),
+            Ok(Token::Id(String::from("lex"))),
+            Ok(Token::Id(String::from("ltx"))),
+            Ok(Token::Id(String::from("nex"))),
+            Ok(Token::Id(String::from("allx"))),
+            Ok(Token::Id(String::from("byx"))),
+            Ok(Token::Id(String::from("tox"))),
+            Ok(Token::Id(String::from("withx"))),
+            Ok(Token::Id(String::from("and."))),
+            Ok(Token::Punct(Punct::With)),
+            Ok(Token::EndCommand),
+        ],
+    );
+}
+
+#[test]
+fn test_punctuation() {
+    check_scan(
+        r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
+~&|=>=><=<~=<>(),-+*/[]**
+% : ; ? _ ` { } ~
+"#,
+        &[
+            Ok(Token::Punct(Punct::Not)),
+            Ok(Token::Punct(Punct::And)),
+            Ok(Token::Punct(Punct::Or)),
+            Ok(Token::Punct(Punct::Equals)),
+            Ok(Token::Punct(Punct::Ge)),
+            Ok(Token::Punct(Punct::Gt)),
+            Ok(Token::Punct(Punct::Le)),
+            Ok(Token::Punct(Punct::Lt)),
+            Ok(Token::Punct(Punct::Ne)),
+            Ok(Token::Punct(Punct::Ne)),
+            Ok(Token::Punct(Punct::LParen)),
+            Ok(Token::Punct(Punct::RParen)),
+            Ok(Token::Punct(Punct::Comma)),
+            Ok(Token::Punct(Punct::Dash)),
+            Ok(Token::Punct(Punct::Plus)),
+            Ok(Token::Punct(Punct::Asterisk)),
+            Ok(Token::Punct(Punct::Slash)),
+            Ok(Token::Punct(Punct::LSquare)),
+            Ok(Token::Punct(Punct::RSquare)),
+            Ok(Token::Punct(Punct::Exp)),
+            Ok(Token::Punct(Punct::Not)),
+            Ok(Token::Punct(Punct::And)),
+            Ok(Token::Punct(Punct::Or)),
+            Ok(Token::Punct(Punct::Equals)),
+            Ok(Token::Punct(Punct::Ge)),
+            Ok(Token::Punct(Punct::Gt)),
+            Ok(Token::Punct(Punct::Le)),
+            Ok(Token::Punct(Punct::Lt)),
+            Ok(Token::Punct(Punct::Ne)),
+            Ok(Token::Punct(Punct::Ne)),
+            Ok(Token::Punct(Punct::LParen)),
+            Ok(Token::Punct(Punct::RParen)),
+            Ok(Token::Punct(Punct::Comma)),
+            Ok(Token::Punct(Punct::Dash)),
+            Ok(Token::Punct(Punct::Plus)),
+            Ok(Token::Punct(Punct::Asterisk)),
+            Ok(Token::Punct(Punct::Slash)),
+            Ok(Token::Punct(Punct::LSquare)),
+            Ok(Token::Punct(Punct::RSquare)),
+            Ok(Token::Punct(Punct::Exp)),
+            Ok(Token::MacroToken(MacroToken::Percent)),
+            Ok(Token::Punct(Punct::Colon)),
+            Ok(Token::Punct(Punct::Semicolon)),
+            Ok(Token::MacroToken(MacroToken::Question)),
+            Ok(Token::MacroToken(MacroToken::Underscore)),
+            Ok(Token::MacroToken(MacroToken::Backtick)),
+            Ok(Token::Punct(Punct::LCurly)),
+            Ok(Token::Punct(Punct::RCurly)),
+            Ok(Token::Punct(Punct::Not)),
+        ],
+    );
+}
+
+#[test]
+fn test_positive_numbers() {
+    check_scan(
+        r#"0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e-
+"#,
+        &[
+            Ok(Token::Number(0.0)),
+            Ok(Token::Number(1.0)),
+            Ok(Token::Number(1.0)),
+            Ok(Token::Number(1.0)),
+            Ok(Token::Number(1.0)),
+            Ok(Token::EndCommand),
+            Ok(Token::Number(123.0)),
+            Ok(Token::EndCommand),
+            Ok(Token::EndCommand),
+            Ok(Token::Number(1.0)),
+            Ok(Token::Number(0.1)),
+            Ok(Token::Number(0.1)),
+            Ok(Token::Number(0.1)),
+            Ok(Token::Number(50.0)),
+            Ok(Token::Number(0.6)),
+            Ok(Token::Number(70.0)),
+            Ok(Token::Number(60.0)),
+            Ok(Token::Number(0.006)),
+            Ok(Token::EndCommand),
+            Ok(Token::Number(30.0)),
+            Ok(Token::Number(0.04)),
+            Ok(Token::Number(5.0)),
+            Ok(Token::Number(6.0)),
+            Ok(Token::Number(0.0007)),
+            Ok(Token::Number(12.3)),
+            Ok(Token::Number(4.56)),
+            Ok(Token::Number(789.0)),
+            Ok(Token::Number(999.0)),
+            Ok(Token::Number(0.0112)),
+            Ok(Token::EndCommand),
+            Err(TokenError::ExpectedExponent(String::from("1e"))),
+            Ok(Token::Id(String::from("e1"))),
+            Err(TokenError::ExpectedExponent(String::from("1e+"))),
+            Err(TokenError::ExpectedExponent(String::from("1e-"))),
+        ],
+    );
+}
+
+#[test]
+fn test_negative_numbers() {
+    check_scan(
+        r#" -0 -1 -01 -001. -1.
+ -123. /* comment 1 */ /* comment 2 */
+ -.1 -0.1 -00.1 -00.10
+ -5e1 -6E-1 -7e+1 -6E+01 -6e-03
+ -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
+ -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
+ -/**/1
+ -. -1e -e1 -1e+ -1e- -1.
+"#,
+        &[
+            Ok(Token::Number(-0.0)),
+            Ok(Token::Number(-1.0)),
+            Ok(Token::Number(-1.0)),
+            Ok(Token::Number(-1.0)),
+            Ok(Token::Number(-1.0)),
+            Ok(Token::EndCommand),
+            Ok(Token::Number(-123.0)),
+            Ok(Token::EndCommand),
+            Ok(Token::Number(-0.1)),
+            Ok(Token::Number(-0.1)),
+            Ok(Token::Number(-0.1)),
+            Ok(Token::Number(-0.1)),
+            Ok(Token::Number(-50.0)),
+            Ok(Token::Number(-0.6)),
+            Ok(Token::Number(-70.0)),
+            Ok(Token::Number(-60.0)),
+            Ok(Token::Number(-0.006)),
+            Ok(Token::Number(-3.0)),
+            Ok(Token::Number(-0.04)),
+            Ok(Token::Number(-5.0)),
+            Ok(Token::Number(-6.0)),
+            Ok(Token::Number(-0.0007)),
+            Ok(Token::Number(-12.3)),
+            Ok(Token::Number(-4.56)),
+            Ok(Token::Number(-789.0)),
+            Ok(Token::Number(-999.0)),
+            Ok(Token::Number(-0.0112)),
+            Ok(Token::Number(-1.0)),
+            Ok(Token::Punct(Punct::Dash)),
+            Ok(Token::MacroToken(MacroToken::Dot)),
+            Err(TokenError::ExpectedExponent(String::from("-1e"))),
+            Ok(Token::Punct(Punct::Dash)),
+            Ok(Token::Id(String::from("e1"))),
+            Err(TokenError::ExpectedExponent(String::from("-1e+"))),
+            Err(TokenError::ExpectedExponent(String::from("-1e-"))),
+            Ok(Token::Number(-1.0)),
+            Ok(Token::EndCommand),
+        ],
+    );
+}
+
+
+#[test]
+fn test_strings() {
+    check_scan(r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' "" '''' """"
+'missing end quote
+"missing double quote
+'x' + "y"
++ 'z' +
+'a' /* abc */ + "b" /*
++ 'c' +/* */"d"/* */+'e'
+'foo'
++          /* special case: + in column 0 would ordinarily start a new command
+'bar'
+'foo'
+ +
+'bar'
+'foo'
++
+
+'bar'
+
++
+x"4142"+'5152'
+"4142"+
+x'5152'
+x"4142"
++u'304a'
+"ï¿½ããããã"
+"abc"+U"FFFD"+u'3048'+"xyz"
+"#, &[]);
+}
+#[test]
+fn test_strings2() {
+    check_scan(r#"""""
+'error
+'b'
+"#, &[]);
+}
diff --git a/rust/src/lex/segment/mod.rs b/rust/src/lex/segment/mod.rs
index 401d523825..ca7dfd0686 100644
--- a/rust/src/lex/segment/mod.rs
+++ b/rust/src/lex/segment/mod.rs
@@ -25,7 +25,9 @@ use crate::{
 };
 use bitflags::bitflags;
 
-use super::command_name::{command_match, COMMAND_NAMES};
+use super::{
+    command_name::{command_match, COMMAND_NAMES},
+};
 
 /// Segmentation mode.
 ///
@@ -608,7 +610,6 @@ impl Segmenter {
         mut input: &'a str,
         eof: bool,
     ) -> Result<(&'a str, Segment), Incomplete> {
-        println!("{quote:?} {input:?}");
         while let (Some(c), rest) = take(input, eof)? {
             match c {
                 _ if c == quote => {
diff --git a/rust/src/lex/segment/test.rs b/rust/src/lex/segment/test.rs
index d01a80d779..056ac7b031 100644
--- a/rust/src/lex/segment/test.rs
+++ b/rust/src/lex/segment/test.rs
@@ -2157,3 +2157,37 @@ fourth command.
         ],
     );
 }
+
+#[test]
+fn test_strings2() {
+    print_segmentation(r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' "" '''' """"
+'missing end quote
+"missing double quote
+'x' + "y"
++ 'z' +
+'a' /* abc */ + "b" /*
++ 'c' +/* */"d"/* */+'e'
+'foo'
++          /* special case: + in column 0 would ordinarily start a new command
+'bar'
+'foo'
+ +
+'bar'
+'foo'
++
+
+'bar'
+
++
+x"4142"+'5152'
+"4142"+
+x'5152'
+x"4142"
++u'304a'
+"ï¿½ããããã"
+"abc"+U"FFFD"+u'3048'+"xyz"
+"#);
+}
diff --git a/rust/src/lex/token.rs b/rust/src/lex/token.rs
index 0b2021b5c8..2c3489c554 100644
--- a/rust/src/lex/token.rs
+++ b/rust/src/lex/token.rs
@@ -2,6 +2,7 @@ use thiserror::Error as ThisError;
 
 use super::segment::Segment;
 
+#[derive(Clone, Debug, PartialEq)]
 pub enum Token {
     /// End of input.
     End,
@@ -28,6 +29,7 @@ pub enum Token {
     MacroToken(MacroToken),
 }
 
+#[derive(Clone, Debug, PartialEq, Eq)]
 pub enum Punct {
     /// `+`.
     Plus,
@@ -115,6 +117,7 @@ pub enum Punct {
 }
 
 /// Tokens that only appear in macros.
+#[derive(Clone, Debug, PartialEq, Eq)]
 pub enum MacroToken {
     /// Identifier starting with `!`.
     MacroId(String),
@@ -131,6 +134,9 @@ pub enum MacroToken {
     /// ````.
     Backtick,
 
+    /// `.` (in the middle of a line by itself, where it does not end a command).
+    Dot,
+
     /// `_`.
     ///
     /// Although underscores may appear within identifiers, they can't be the
@@ -138,7 +144,7 @@ pub enum MacroToken {
     Underscore,
 }
 
-#[derive(ThisError, Debug)]
+#[derive(ThisError, Debug, PartialEq, Eq)]
 pub enum TokenError {
     /// Unterminated string constant.
     #[error("Unterminated string constant.")]
@@ -178,7 +184,7 @@ pub enum TokenError {
 }
 
 impl Token {
-    pub fn try_from_segment((segment, s): (Segment, &str)) -> Result<Option<Self>, TokenError> {
+    pub fn try_from_segment(s: &str, segment: Segment) -> Result<Option<Self>, TokenError> {
         match segment {
             Segment::Number => Ok(Some(Self::Number(s.parse().unwrap()))),
             Segment::QuotedString => {
@@ -287,7 +293,8 @@ impl Token {
                 "?" => Ok(Some(Self::MacroToken(MacroToken::Question))),
                 "`" => Ok(Some(Self::MacroToken(MacroToken::Backtick))),
                 "_" => Ok(Some(Self::MacroToken(MacroToken::Underscore))),
-                _ => unreachable!(),
+                "." => Ok(Some(Self::MacroToken(MacroToken::Dot))),
+                _ => unreachable!("bad punctuator {s:?}"),
             },
             Segment::Shbang
             | Segment::Spaces
-- 
2.30.2