//! are the same as the tokens used by the PSPP parser with a few additional
//! types.
+use super::{
+ segment::{Mode, Segment, Segmenter},
+ token::{Punct, Token, TokenError},
+};
+use std::collections::VecDeque;
+
+/// Attempts to merge a sequence of tokens together into a single token. The
+/// tokens are taken from the beginning of `input`. If successful, removes one
+/// or more token from the beginning of `input` and returnss the merged
+/// token. More input tokens might be needed; if so, leaves `input` alone and
+/// returns `None`. In the latter case, the caller should add more tokens to the
+/// input ([Token::End] or [Token::Punct(Punct::EndCmd)] is always sufficient).
+///
+/// This performs two different kinds of token merging:
+///
+/// - String concatenation, where syntax like `"a" + "b"` is converted into a
+/// single string token. This is definitely needed because the parser relies
+/// on it.
+///
+/// - Negative number merging, where syntax like `-5` is converted from a pair
+/// of tokens (a dash and a positive number) into a single token (a negative
+/// number). This might not be needed anymore because the segmenter
+/// directly treats a dash followed by a number, with optional intervening
+/// white space, as a negative number. It's only needed if we want
+/// intervening comments to be allowed or for part of the negative number
+/// token to be produced by macro expansion.
+pub fn merge_tokens(input: &mut VecDeque<Token>) -> Option<Token> {
+ match input.get(0)? {
+ Token::Punct(Punct::Dash) => match input.get(1)? {
+ Token::Number(number) if number.is_sign_positive() => {
+ let number = *number;
+ input.pop_front().unwrap();
+ input.pop_front().unwrap();
+ return Some(Token::Number(-number));
+ }
+ _ => Some(input.pop_front().unwrap()),
+ },
+ Token::String(_) => {
+ let mut i = 0;
+ while matches!(input.get(i * 2 + 1)?, Token::Punct(Punct::Plus))
+ && matches!(input.get(i * 2 + 2)?, Token::String(_))
+ {
+ i += 1;
+ }
+ if i == 0 {
+ Some(input.pop_front().unwrap())
+ } else {
+ let mut output = String::new();
+ for i in 0..=i {
+ let Token::String(s) = &input[i * 2] else {
+ unreachable!()
+ };
+ output.push_str(&s);
+ }
+ for _ in 0..i * 2 + 1 {
+ input.pop_front().unwrap();
+ }
+ Some(Token::String(output))
+ }
+ }
+ _ => Some(input.pop_front().unwrap()),
+ }
+}
+
/// A lexer that tokenizes a complete piece of syntax held in memory as a
/// string, yielding merged tokens via its [Iterator] implementation.
pub struct StringLexer<'a> {
    // Remaining input not yet consumed by the segmenter.
    input: &'a str,
    // Splits `input` into segments that are then converted into tokens.
    segmenter: Segmenter,
    // Tokens produced but not yet merged/emitted by `merge_tokens`.
    tokens: VecDeque<Token>,
}
+
+impl<'a> StringLexer<'a> {
+ pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
+ Self {
+ input,
+ segmenter: Segmenter::new(mode, is_snippet),
+ tokens: VecDeque::with_capacity(1),
+ }
+ }
+}
+
+impl<'a> Iterator for StringLexer<'a> {
+ type Item = Result<Token, TokenError>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if let Some(token) = merge_tokens(&mut self.tokens) {
+ return Some(Ok(token));
+ }
+ loop {
+ let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
+ if segment == Segment::End && self.tokens.is_empty() {
+ return None;
+ }
+ let s = &self.input[..self.input.len() - rest.len()];
+ self.input = rest;
+ match Token::try_from_segment(s, segment) {
+ Err(error) => {
+ println!("{:?}", &self.tokens);
+ return Some(Err(error));
+ }
+ Ok(Some(token)) => {
+ self.tokens.push_back(token);
+ if let Some(token) = merge_tokens(&mut self.tokens) {
+ return Some(Ok(token));
+ }
+ }
+ Ok(None) => (),
+ };
+ }
+ }
+}
+
+#[cfg(test)]
+mod test;
--- /dev/null
+use crate::lex::{
+ segment::Mode,
+ token::{MacroToken, Punct, Token, TokenError},
+};
+
+use super::StringLexer;
+
+fn print_token(token: &Token) {
+ match token {
+ Token::End => print!("Token::End"),
+ Token::Id(s) => print!("Token::Id(String::from({s:?}))"),
+ Token::Number(number) => print!("Token::Number({number:?})"),
+ Token::String(s) => print!("Token::String(String::from({s:?}))"),
+ Token::EndCommand => print!("Token::EndCommand"),
+ Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"),
+ Token::MacroToken(m) => print!("Token::MacroToken(MacroToken::{m:?})"),
+ }
+}
+
+fn check_scan(input: &str, expected: &[Result<Token, TokenError>]) {
+ let tokens = StringLexer::new(input, Mode::Auto, false).collect::<Vec<_>>();
+
+ if &tokens != expected {
+ for token in &tokens {
+ match token {
+ Ok(token) => {
+ print!("Ok(");
+ print_token(token);
+ print!(")");
+ }
+ Err(error) => print!("Err(TokenError::{error:?})"),
+ }
+ println!(",");
+ }
+
+ eprintln!("tokens differ from expected:");
+ let difference = diff::slice(expected, &tokens);
+ for result in difference {
+ match result {
+ diff::Result::Left(left) => eprintln!("-{left:?}"),
+ diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
+ diff::Result::Right(right) => eprintln!("+{right:?}"),
+ }
+ }
+ panic!();
+ }
+}
+
/// Checks tokenization of ordinary, `$`/`@`/`#`-prefixed, and macro (`!`)
/// identifiers, plus end-of-line comments and an invalid identifier
/// character (U+FFFD).
#[test]
fn test_identifiers() {
    check_scan(
        r#"a aB i5 $x @efg @@. !abcd !* !*a #.# .x _z.
abcd. abcd.
QRSTUV./* end of line comment */
QrStUv./* end of line comment */
WXYZ. /* unterminated end of line comment
�. /* U+FFFD is not valid in an identifier
"#,
        &[
            Ok(Token::Id(String::from("a"))),
            Ok(Token::Id(String::from("aB"))),
            Ok(Token::Id(String::from("i5"))),
            Ok(Token::Id(String::from("$x"))),
            Ok(Token::Id(String::from("@efg"))),
            Ok(Token::Id(String::from("@@."))),
            Ok(Token::MacroToken(MacroToken::MacroId(String::from(
                "!abcd",
            )))),
            Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
            Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
            Ok(Token::Id(String::from("a"))),
            Ok(Token::Id(String::from("#.#"))),
            Ok(Token::MacroToken(MacroToken::Dot)),
            Ok(Token::Id(String::from("x"))),
            Ok(Token::MacroToken(MacroToken::Underscore)),
            Ok(Token::Id(String::from("z"))),
            Ok(Token::EndCommand),
            Ok(Token::Id(String::from("abcd."))),
            Ok(Token::Id(String::from("abcd"))),
            Ok(Token::EndCommand),
            Ok(Token::Id(String::from("QRSTUV"))),
            Ok(Token::EndCommand),
            Ok(Token::Id(String::from("QrStUv"))),
            Ok(Token::EndCommand),
            Ok(Token::Id(String::from("WXYZ"))),
            Ok(Token::EndCommand),
            Err(TokenError::UnexpectedChar('�')),
            Ok(Token::EndCommand),
        ],
    );
}
+
/// Checks that reserved words scan as punctuators in either case, that
/// identifiers merely *containing* a reserved word stay identifiers, and that
/// a trailing `.` is handled per word (`and.` is an identifier; `with.` ends
/// the command).
#[test]
fn test_reserved_words() {
    check_scan(
        r#"and or not eq ge gt le lt ne all by to with
AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
and. with.
"#,
        &[
            Ok(Token::Punct(Punct::And)),
            Ok(Token::Punct(Punct::Or)),
            Ok(Token::Punct(Punct::Not)),
            Ok(Token::Punct(Punct::Eq)),
            Ok(Token::Punct(Punct::Ge)),
            Ok(Token::Punct(Punct::Gt)),
            Ok(Token::Punct(Punct::Le)),
            Ok(Token::Punct(Punct::Lt)),
            Ok(Token::Punct(Punct::Ne)),
            Ok(Token::Punct(Punct::All)),
            Ok(Token::Punct(Punct::By)),
            Ok(Token::Punct(Punct::To)),
            Ok(Token::Punct(Punct::With)),
            Ok(Token::Punct(Punct::And)),
            Ok(Token::Punct(Punct::Or)),
            Ok(Token::Punct(Punct::Not)),
            Ok(Token::Punct(Punct::Eq)),
            Ok(Token::Punct(Punct::Ge)),
            Ok(Token::Punct(Punct::Gt)),
            Ok(Token::Punct(Punct::Le)),
            Ok(Token::Punct(Punct::Lt)),
            Ok(Token::Punct(Punct::Ne)),
            Ok(Token::Punct(Punct::All)),
            Ok(Token::Punct(Punct::By)),
            Ok(Token::Punct(Punct::To)),
            Ok(Token::Punct(Punct::With)),
            Ok(Token::Id(String::from("andx"))),
            Ok(Token::Id(String::from("orx"))),
            Ok(Token::Id(String::from("notx"))),
            Ok(Token::Id(String::from("eqx"))),
            Ok(Token::Id(String::from("gex"))),
            Ok(Token::Id(String::from("gtx"))),
            Ok(Token::Id(String::from("lex"))),
            Ok(Token::Id(String::from("ltx"))),
            Ok(Token::Id(String::from("nex"))),
            Ok(Token::Id(String::from("allx"))),
            Ok(Token::Id(String::from("byx"))),
            Ok(Token::Id(String::from("tox"))),
            Ok(Token::Id(String::from("withx"))),
            Ok(Token::Id(String::from("and."))),
            Ok(Token::Punct(Punct::With)),
            Ok(Token::EndCommand),
        ],
    );
}
+
/// Checks punctuation tokens both space-separated and run together, plus the
/// characters that only scan as macro tokens (`%`, `?`, `_`, `` ` ``).
#[test]
fn test_punctuation() {
    check_scan(
        r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
~&|=>=><=<~=<>(),-+*/[]**
% : ; ? _ ` { } ~
"#,
        &[
            Ok(Token::Punct(Punct::Not)),
            Ok(Token::Punct(Punct::And)),
            Ok(Token::Punct(Punct::Or)),
            Ok(Token::Punct(Punct::Equals)),
            Ok(Token::Punct(Punct::Ge)),
            Ok(Token::Punct(Punct::Gt)),
            Ok(Token::Punct(Punct::Le)),
            Ok(Token::Punct(Punct::Lt)),
            Ok(Token::Punct(Punct::Ne)),
            Ok(Token::Punct(Punct::Ne)),
            Ok(Token::Punct(Punct::LParen)),
            Ok(Token::Punct(Punct::RParen)),
            Ok(Token::Punct(Punct::Comma)),
            Ok(Token::Punct(Punct::Dash)),
            Ok(Token::Punct(Punct::Plus)),
            Ok(Token::Punct(Punct::Asterisk)),
            Ok(Token::Punct(Punct::Slash)),
            Ok(Token::Punct(Punct::LSquare)),
            Ok(Token::Punct(Punct::RSquare)),
            Ok(Token::Punct(Punct::Exp)),
            Ok(Token::Punct(Punct::Not)),
            Ok(Token::Punct(Punct::And)),
            Ok(Token::Punct(Punct::Or)),
            Ok(Token::Punct(Punct::Equals)),
            Ok(Token::Punct(Punct::Ge)),
            Ok(Token::Punct(Punct::Gt)),
            Ok(Token::Punct(Punct::Le)),
            Ok(Token::Punct(Punct::Lt)),
            Ok(Token::Punct(Punct::Ne)),
            Ok(Token::Punct(Punct::Ne)),
            Ok(Token::Punct(Punct::LParen)),
            Ok(Token::Punct(Punct::RParen)),
            Ok(Token::Punct(Punct::Comma)),
            Ok(Token::Punct(Punct::Dash)),
            Ok(Token::Punct(Punct::Plus)),
            Ok(Token::Punct(Punct::Asterisk)),
            Ok(Token::Punct(Punct::Slash)),
            Ok(Token::Punct(Punct::LSquare)),
            Ok(Token::Punct(Punct::RSquare)),
            Ok(Token::Punct(Punct::Exp)),
            Ok(Token::MacroToken(MacroToken::Percent)),
            Ok(Token::Punct(Punct::Colon)),
            Ok(Token::Punct(Punct::Semicolon)),
            Ok(Token::MacroToken(MacroToken::Question)),
            Ok(Token::MacroToken(MacroToken::Underscore)),
            Ok(Token::MacroToken(MacroToken::Backtick)),
            Ok(Token::Punct(Punct::LCurly)),
            Ok(Token::Punct(Punct::RCurly)),
            Ok(Token::Punct(Punct::Not)),
        ],
    );
}
+
/// Checks positive number formats: integers, decimals, exponent notation, and
/// malformed exponents that must report [TokenError::ExpectedExponent].
#[test]
fn test_positive_numbers() {
    check_scan(
        r#"0 1 01 001. 1.
123. /* comment 1 */ /* comment 2 */
.1 0.1 00.1 00.10
5e1 6E-1 7e+1 6E+01 6e-03
.3E1 .4e-1 .5E+1 .6e+01 .7E-03
1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
. 1e e1 1e+ 1e-
"#,
        &[
            Ok(Token::Number(0.0)),
            Ok(Token::Number(1.0)),
            Ok(Token::Number(1.0)),
            Ok(Token::Number(1.0)),
            Ok(Token::Number(1.0)),
            Ok(Token::EndCommand),
            Ok(Token::Number(123.0)),
            Ok(Token::EndCommand),
            Ok(Token::EndCommand),
            Ok(Token::Number(1.0)),
            Ok(Token::Number(0.1)),
            Ok(Token::Number(0.1)),
            Ok(Token::Number(0.1)),
            Ok(Token::Number(50.0)),
            Ok(Token::Number(0.6)),
            Ok(Token::Number(70.0)),
            Ok(Token::Number(60.0)),
            Ok(Token::Number(0.006)),
            Ok(Token::EndCommand),
            Ok(Token::Number(30.0)),
            Ok(Token::Number(0.04)),
            Ok(Token::Number(5.0)),
            Ok(Token::Number(6.0)),
            Ok(Token::Number(0.0007)),
            Ok(Token::Number(12.3)),
            Ok(Token::Number(4.56)),
            Ok(Token::Number(789.0)),
            Ok(Token::Number(999.0)),
            Ok(Token::Number(0.0112)),
            Ok(Token::EndCommand),
            Err(TokenError::ExpectedExponent(String::from("1e"))),
            Ok(Token::Id(String::from("e1"))),
            Err(TokenError::ExpectedExponent(String::from("1e+"))),
            Err(TokenError::ExpectedExponent(String::from("1e-"))),
        ],
    );
}
+
/// Checks negative number merging (dash + positive number -> one token),
/// including an intervening comment (`-/**/1`), plus dash sequences that must
/// NOT merge (`-.`, `-e1`) and malformed negative exponents.
#[test]
fn test_negative_numbers() {
    check_scan(
        r#" -0 -1 -01 -001. -1.
 -123. /* comment 1 */ /* comment 2 */
 -.1 -0.1 -00.1 -00.10
 -5e1 -6E-1 -7e+1 -6E+01 -6e-03
 -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
 -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
 -/**/1
 -. -1e -e1 -1e+ -1e- -1.
"#,
        &[
            Ok(Token::Number(-0.0)),
            Ok(Token::Number(-1.0)),
            Ok(Token::Number(-1.0)),
            Ok(Token::Number(-1.0)),
            Ok(Token::Number(-1.0)),
            Ok(Token::EndCommand),
            Ok(Token::Number(-123.0)),
            Ok(Token::EndCommand),
            Ok(Token::Number(-0.1)),
            Ok(Token::Number(-0.1)),
            Ok(Token::Number(-0.1)),
            Ok(Token::Number(-0.1)),
            Ok(Token::Number(-50.0)),
            Ok(Token::Number(-0.6)),
            Ok(Token::Number(-70.0)),
            Ok(Token::Number(-60.0)),
            Ok(Token::Number(-0.006)),
            Ok(Token::Number(-3.0)),
            Ok(Token::Number(-0.04)),
            Ok(Token::Number(-5.0)),
            Ok(Token::Number(-6.0)),
            Ok(Token::Number(-0.0007)),
            Ok(Token::Number(-12.3)),
            Ok(Token::Number(-4.56)),
            Ok(Token::Number(-789.0)),
            Ok(Token::Number(-999.0)),
            Ok(Token::Number(-0.0112)),
            Ok(Token::Number(-1.0)),
            Ok(Token::Punct(Punct::Dash)),
            Ok(Token::MacroToken(MacroToken::Dot)),
            Err(TokenError::ExpectedExponent(String::from("-1e"))),
            Ok(Token::Punct(Punct::Dash)),
            Ok(Token::Id(String::from("e1"))),
            Err(TokenError::ExpectedExponent(String::from("-1e+"))),
            Err(TokenError::ExpectedExponent(String::from("-1e-"))),
            Ok(Token::Number(-1.0)),
            Ok(Token::EndCommand),
        ],
    );
}
+
+
/// Exercises string literals: quoting and escaping, `+` concatenation across
/// lines and comments, hex (`x"…"`) and Unicode (`u'…'`/`U"…"`) strings.
///
/// NOTE(review): the expected-token list is `&[]`, so this test can only pass
/// if the input produces no tokens, which looks unlikely given the input —
/// presumably the golden data was never filled in. TODO confirm and populate
/// the expected tokens (a failing run of `check_scan` prints them in
/// paste-ready form).
#[test]
fn test_strings() {
    check_scan(r#"'x' "y" 'abc'
'Don''t' "Can't" 'Won''t'
"""quoted""" '"quoted"'
'' "" '''' """"
'missing end quote
"missing double quote
'x' + "y"
+ 'z' +
'a' /* abc */ + "b" /*
+ 'c' +/* */"d"/* */+'e'
'foo'
+ /* special case: + in column 0 would ordinarily start a new command
'bar'
'foo'
 +
'bar'
'foo'
+

'bar'

+
x"4142"+'5152'
"4142"+
x'5152'
x"4142"
+u'304a'
"�あいうえお"
"abc"+U"FFFD"+u'3048'+"xyz"
"#, &[]);
}
/// Exercises an empty double-quoted string followed by an unterminated string.
///
/// NOTE(review): as with `test_strings`, the expected list is `&[]` —
/// presumably unfinished golden data. TODO confirm and fill in the expected
/// tokens.
#[test]
fn test_strings2() {
    check_scan(r#"""""
'error
'b'
"#, &[]);
}