From fafd5cffb7209116aea5ac24a3fbf4cea991ed13 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Sun, 14 Jul 2024 12:38:40 -0700
Subject: [PATCH] scan code passes all the tests

---
 rust/src/lex/scan/mod.rs     |  312 ++++++++--
 rust/src/lex/scan/test.rs    | 1096 +++++++++++++++++++++++++++-------
 rust/src/lex/segment/mod.rs  |   10 +-
 rust/src/lex/segment/test.rs |   36 +-
 rust/src/lex/token.rs        |  175 ------
 5 files changed, 1145 insertions(+), 484 deletions(-)

diff --git a/rust/src/lex/scan/mod.rs b/rust/src/lex/scan/mod.rs
index 2e5f993311..5503a5bcc0 100644
--- a/rust/src/lex/scan/mod.rs
+++ b/rust/src/lex/scan/mod.rs
@@ -12,72 +12,260 @@
 use super::{
     segment::{Mode, Segment, Segmenter},
-    token::{Punct, Token, TokenError},
+    token::{MacroToken, Punct, Token},
 };
 use std::collections::VecDeque;
+use thiserror::Error as ThisError;
 
-/// Attempts to merge a sequence of tokens together into a single token. The
-/// tokens are taken from the beginning of `input`. If successful, removes one
-/// or more token from the beginning of `input` and returnss the merged
-/// token. More input tokens might be needed; if so, leaves `input` alone and
-/// returns `None`. In the latter case, the caller should add more tokens to the
-/// input ([Token::End] or [Token::Punct(Punct::EndCmd)] is always sufficient).
-///
-/// This performs two different kinds of token merging:
-///
-/// - String concatenation, where syntax like `"a" + "b"` is converted into a
-///   single string token. This is definitely needed because the parser relies
-///   on it.
-///
-/// - Negative number merging, where syntax like `-5` is converted from a pair
-///   of tokens (a dash and a positive number) into a single token (a negative
-///   number). This might not be needed anymore because the segmenter
-///   directly treats a dash followed by a number, with optional intervening
-///   white space, as a negative number. It's only needed if we want
-///   intervening comments to be allowed or for part of the negative number
-///   token to be produced by macro expansion.
-pub fn merge_tokens(input: &mut VecDeque<Token>) -> Option<Token> {
-    match input.get(0)? {
-        Token::Punct(Punct::Dash) => match input.get(1)? {
-            Token::Number(number) if number.is_sign_positive() => {
-                let number = *number;
-                input.pop_front().unwrap();
-                input.pop_front().unwrap();
-                return Some(Token::Number(-number));
+#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
+pub enum ScanError {
+    /// Unterminated string constant.
+    #[error("Unterminated string constant.")]
+    ExpectedQuote,
+
+    /// Missing exponent.
+    #[error("Missing exponent following `{0}`")]
+    ExpectedExponent(String),
+
+    /// Odd length hex string.
+    #[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
+    OddLengthHexString(usize),
+
+    /// Invalid hex digit.
+    #[error("Invalid hex digit {0:?}.")]
+    BadHexDigit(char),
+
+    /// Invalid length Unicode string.
+    #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
+    BadLengthUnicodeString(usize),
+
+    /// Invalid code point.
+    #[error("U+{0:04X} is not a valid Unicode code point.")]
+    BadCodePoint(u32),
+
+    /// Expected hexadecimal Unicode code point.
+    #[error("Expected hexadecimal Unicode code point.")]
+    ExpectedCodePoint,
+
+    /// `DO REPEAT` nested too deeply.
+    #[error("`DO REPEAT` nested too deeply.")]
+    DoRepeatOverflow,
+
+    /// Unexpected character.
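+    /// Produced when the segmenter reports [Segment::UnexpectedChar].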
+ #[error("Unexpected character {0:?} in input.")] + UnexpectedChar(char), +} + +#[derive(Clone, Debug, PartialEq)] +pub enum ScanToken { + Token(Token), + Error(ScanError), +} + +impl ScanToken { + pub fn from_segment(s: &str, segment: Segment) -> Option { + match segment { + Segment::Number => Some(Self::Token(Token::Number(s.parse().unwrap()))), + Segment::QuotedString => { + // Trim quote mark from front and back. + let mut chars = s.chars(); + let quote = chars.next().unwrap(); + let s = chars.as_str().strip_suffix(quote).unwrap(); + + // Replace doubled quotes by single ones. + let (single_quote, double_quote) = match quote { + '\'' => ("'", "''"), + '"' => ("\"", "\"\""), + _ => unreachable!(), + }; + Some(Self::Token(Token::String( + s.replace(double_quote, single_quote), + ))) } - _ => Some(input.pop_front().unwrap()), - }, - Token::String(_) => { - let mut i = 0; - while matches!(input.get(i * 2 + 1)?, Token::Punct(Punct::Plus)) - && matches!(input.get(i * 2 + 2)?, Token::String(_)) - { - i += 1; + Segment::HexString => { + // Strip `X"` prefix and `"` suffix (or variations). + let s = &s[2..s.len() - 1]; + for c in s.chars() { + if !c.is_ascii_hexdigit() { + return Some(Self::Error(ScanError::BadHexDigit(c))); + } + } + if s.len() % 2 != 0 { + return Some(Self::Error(ScanError::OddLengthHexString(s.len()))); + } + let mut out = String::with_capacity(s.len()); + for pair in s.as_bytes().chunks_exact(2) { + let hi = char::from(pair[0]).to_digit(16).unwrap() as u8; + let lo = char::from(pair[1]).to_digit(16).unwrap() as u8; + out.push(char::from(hi * 16 + lo)); + } + Some(Self::Token(Token::String(out))) + } + Segment::UnicodeString => { + // Strip `U"` prefix and `"` suffix (or variations). + let s = &s[2..s.len() - 1]; + if !(1..=8).contains(&s.len()) { + return Some(Self::Error(ScanError::BadLengthUnicodeString(s.len()))); + } + let Ok(code_point) = u32::from_str_radix(s, 16) else { + return Some(Self::Error(ScanError::ExpectedCodePoint)); + }; + let Some(c) = char::from_u32(code_point) else { + return Some(Self::Error(ScanError::BadCodePoint(code_point))); + }; + Some(Self::Token(Token::String(String::from(c)))) } - if i == 0 { - Some(input.pop_front().unwrap()) - } else { - let mut output = String::new(); - for i in 0..=i { - let Token::String(s) = &input[i * 2] else { - unreachable!() - }; - output.push_str(&s); + + Segment::UnquotedString + | Segment::DoRepeatCommand + | Segment::InlineData + | Segment::Document + | Segment::MacroBody + | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))), + + Segment::ReservedWord => { + let c0 = s.as_bytes()[0].to_ascii_uppercase(); + let c1 = s.as_bytes()[1].to_ascii_uppercase(); + match (c0, c1) { + (b'B', _) => Some(Self::Token(Token::Punct(Punct::By))), + (b'E', _) => Some(Self::Token(Token::Punct(Punct::Eq))), + (b'G', b'T') => Some(Self::Token(Token::Punct(Punct::Gt))), + (b'G', _) => Some(Self::Token(Token::Punct(Punct::Ge))), + (b'L', b'T') => Some(Self::Token(Token::Punct(Punct::Lt))), + (b'L', _) => Some(Self::Token(Token::Punct(Punct::Le))), + (b'N', b'E') => Some(Self::Token(Token::Punct(Punct::Ne))), + (b'N', _) => Some(Self::Token(Token::Punct(Punct::Not))), + (b'O', _) => Some(Self::Token(Token::Punct(Punct::Or))), + (b'T', _) => Some(Self::Token(Token::Punct(Punct::To))), + (b'A', b'L') => Some(Self::Token(Token::Punct(Punct::All))), + (b'A', _) => Some(Self::Token(Token::Punct(Punct::And))), + (b'W', _) => Some(Self::Token(Token::Punct(Punct::With))), + _ => unreachable!(), } - for _ in 0..i * 2 + 1 { 
+            }
+            Segment::Identifier => Some(Self::Token(Token::Id(String::from(s)))),
+            Segment::Punct => match s {
+                "(" => Some(Self::Token(Token::Punct(Punct::LParen))),
+                ")" => Some(Self::Token(Token::Punct(Punct::RParen))),
+                "[" => Some(Self::Token(Token::Punct(Punct::LSquare))),
+                "]" => Some(Self::Token(Token::Punct(Punct::RSquare))),
+                "{" => Some(Self::Token(Token::Punct(Punct::LCurly))),
+                "}" => Some(Self::Token(Token::Punct(Punct::RCurly))),
+                "," => Some(Self::Token(Token::Punct(Punct::Comma))),
+                "=" => Some(Self::Token(Token::Punct(Punct::Equals))),
+                "-" => Some(Self::Token(Token::Punct(Punct::Dash))),
+                "&" => Some(Self::Token(Token::Punct(Punct::And))),
+                "|" => Some(Self::Token(Token::Punct(Punct::Or))),
+                "+" => Some(Self::Token(Token::Punct(Punct::Plus))),
+                "/" => Some(Self::Token(Token::Punct(Punct::Slash))),
+                "*" => Some(Self::Token(Token::Punct(Punct::Asterisk))),
+                "<" => Some(Self::Token(Token::Punct(Punct::Lt))),
+                ">" => Some(Self::Token(Token::Punct(Punct::Gt))),
+                "~" => Some(Self::Token(Token::Punct(Punct::Not))),
+                ":" => Some(Self::Token(Token::Punct(Punct::Colon))),
+                ";" => Some(Self::Token(Token::Punct(Punct::Semicolon))),
+                "**" => Some(Self::Token(Token::Punct(Punct::Exp))),
+                "<=" => Some(Self::Token(Token::Punct(Punct::Le))),
+                "<>" => Some(Self::Token(Token::Punct(Punct::Ne))),
+                "~=" => Some(Self::Token(Token::Punct(Punct::Ne))),
+                ">=" => Some(Self::Token(Token::Punct(Punct::Ge))),
+                "!" => Some(Self::Token(Token::MacroToken(MacroToken::Bang))),
+                "%" => Some(Self::Token(Token::MacroToken(MacroToken::Percent))),
+                "?" => Some(Self::Token(Token::MacroToken(MacroToken::Question))),
+                "`" => Some(Self::Token(Token::MacroToken(MacroToken::Backtick))),
+                "_" => Some(Self::Token(Token::MacroToken(MacroToken::Underscore))),
+                "." => Some(Self::Token(Token::MacroToken(MacroToken::Dot))),
+                _ => unreachable!("bad punctuator {s:?}"),
+            },
+            Segment::Shbang
+            | Segment::Spaces
+            | Segment::Comment
+            | Segment::Newline
+            | Segment::CommentCommand => None,
+            Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
+            Segment::MacroId => Some(Self::Token(Token::MacroToken(MacroToken::MacroId(
+                String::from(s),
+            )))),
+            Segment::StartDocument => Some(Self::Token(Token::Id(String::from("DOCUMENT")))),
+            Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
+                Some(Self::Token(Token::EndCommand))
+            }
+            Segment::End => Some(Self::Token(Token::End)),
+            Segment::ExpectedQuote => Some(Self::Error(ScanError::ExpectedQuote)),
+            Segment::ExpectedExponent => {
+                Some(Self::Error(ScanError::ExpectedExponent(String::from(s))))
+            }
+            Segment::UnexpectedChar => Some(Self::Error(ScanError::UnexpectedChar(
+                s.chars().next().unwrap(),
+            ))),
+        }
+    }
+
+    /// Attempts to merge a sequence of tokens together into a single token. The
+    /// tokens are taken from the beginning of `input`. If successful, removes one
+    /// or more tokens from the beginning of `input` and returns the merged
+    /// token. More input tokens might be needed; if so, leaves `input` alone and
+    /// returns `None`. In the latter case, the caller should add more tokens to the
+    /// input ([Token::End] or [Token::EndCommand] is always sufficient).
+    ///
+    /// This performs two different kinds of token merging:
+    ///
+    /// - String concatenation, where syntax like `"a" + "b"` is converted into a
+    ///   single string token. This is definitely needed because the parser relies
+    ///   on it.
+    ///
+    /// - Negative number merging, where syntax like `-5` is converted from a pair
+    ///   of tokens (a dash and a positive number) into a single token (a negative
+    ///   number). This might not be needed anymore because the segmenter
+    ///   directly treats a dash followed by a number, with optional intervening
+    ///   white space, as a negative number. It's only needed if we want
+    ///   intervening comments to be allowed or for part of the negative number
+    ///   token to be produced by macro expansion.
+    pub fn merge(input: &mut VecDeque<ScanToken>) -> Option<ScanToken> {
+        match input.get(0)? {
+            ScanToken::Token(Token::Punct(Punct::Dash)) => match input.get(1)? {
+                ScanToken::Token(Token::Number(number)) if number.is_sign_positive() => {
+                    let number = *number;
+                    input.pop_front().unwrap();
                 input.pop_front().unwrap();
+                    return Some(ScanToken::Token(Token::Number(-number)));
+                }
+                _ => Some(input.pop_front().unwrap()),
+            },
+            ScanToken::Token(Token::String(_)) => {
+                let mut i = 0;
+                while matches!(
+                    input.get(i * 2 + 1)?,
+                    ScanToken::Token(Token::Punct(Punct::Plus))
+                ) && matches!(input.get(i * 2 + 2)?, ScanToken::Token(Token::String(_)))
+                {
+                    i += 1;
+                }
+                if i == 0 {
+                    Some(input.pop_front().unwrap())
+                } else {
+                    let mut output = String::new();
+                    for i in 0..=i {
+                        let ScanToken::Token(Token::String(s)) = &input[i * 2] else {
+                            unreachable!()
+                        };
+                        output.push_str(&s);
+                    }
+                    for _ in 0..i * 2 + 1 {
+                        input.pop_front().unwrap();
+                    }
+                    Some(ScanToken::Token(Token::String(output)))
                 }
-                Some(Token::String(output))
             }
+            _ => Some(input.pop_front().unwrap()),
         }
-        _ => Some(input.pop_front().unwrap()),
     }
 }
 
 pub struct StringLexer<'a> {
     input: &'a str,
     segmenter: Segmenter,
-    tokens: VecDeque<Token>,
+    tokens: VecDeque<ScanToken>,
 }
 
 impl<'a> StringLexer<'a> {
@@ -91,11 +279,11 @@
 }
 
 impl<'a> Iterator for StringLexer<'a> {
-    type Item = Result<Token, TokenError>;
+    type Item = ScanToken;
 
     fn next(&mut self) -> Option<Self::Item> {
-        if let Some(token) = merge_tokens(&mut self.tokens) {
-            return Some(Ok(token));
+        if let Some(token) = ScanToken::merge(&mut self.tokens) {
+            return Some(token);
         }
         loop {
             let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
@@ -104,19 +292,13 @@
             }
             let s = &self.input[..self.input.len() - rest.len()];
             self.input = rest;
-            match Token::try_from_segment(s, segment) {
-                Err(error) => {
-                    println!("{:?}", &self.tokens);
-                    return Some(Err(error));
-                }
-                Ok(Some(token)) => {
-                    self.tokens.push_back(token);
-                    if let Some(token) = merge_tokens(&mut self.tokens) {
-                        return Some(Ok(token));
-                    }
+
+            if let Some(token) = ScanToken::from_segment(s, segment) {
+                self.tokens.push_back(token);
+                if let Some(token) = ScanToken::merge(&mut self.tokens) {
+                    return Some(token);
                 }
-                Ok(None) => (),
-            };
+            }
         }
     }
 }
diff --git a/rust/src/lex/scan/test.rs b/rust/src/lex/scan/test.rs
index 1b84e5ffbe..d009131f63 100644
--- a/rust/src/lex/scan/test.rs
+++ b/rust/src/lex/scan/test.rs
@@ -1,9 +1,9 @@
 use crate::lex::{
     segment::Mode,
-    token::{MacroToken, Punct, Token, TokenError},
+    token::{MacroToken, Punct, Token},
 };
 
-use super::StringLexer;
+use super::{ScanError, ScanToken, StringLexer};
 
 fn print_token(token: &Token) {
     match token {
@@ -13,22 +13,25 @@
         Token::String(s) => print!("Token::String(String::from({s:?}))"),
         Token::EndCommand => print!("Token::EndCommand"),
         Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"),
+        Token::MacroToken(MacroToken::MacroId(id)) => {
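+            // Special-cased so that a failing test's output can be pasted
+            // straight into an `expected` array below: the catch-all arm
+            // would print `MacroId("x")`, which is not valid construction
+            // syntax for the `String` payload.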
print!("Token::MacroToken(MacroToken::MacroId(String::from({id:?})))") + } Token::MacroToken(m) => print!("Token::MacroToken(MacroToken::{m:?})"), } } -fn check_scan(input: &str, expected: &[Result]) { - let tokens = StringLexer::new(input, Mode::Auto, false).collect::>(); +fn check_scan(input: &str, mode: Mode, expected: &[ScanToken]) { + let tokens = StringLexer::new(input, mode, false).collect::>(); if &tokens != expected { for token in &tokens { match token { - Ok(token) => { - print!("Ok("); + ScanToken::Token(token) => { + print!("ScanToken::Token("); print_token(token); print!(")"); } - Err(error) => print!("Err(TokenError::{error:?})"), + ScanToken::Error(error) => print!("ScanToken::Error(ScanError::{error:?})"), } println!(","); } @@ -56,36 +59,37 @@ QrStUv./* end of line comment */ WXYZ. /* unterminated end of line comment �. /* U+FFFD is not valid in an identifier "#, + Mode::Auto, &[ - Ok(Token::Id(String::from("a"))), - Ok(Token::Id(String::from("aB"))), - Ok(Token::Id(String::from("i5"))), - Ok(Token::Id(String::from("$x"))), - Ok(Token::Id(String::from("@efg"))), - Ok(Token::Id(String::from("@@."))), - Ok(Token::MacroToken(MacroToken::MacroId(String::from( + ScanToken::Token(Token::Id(String::from("a"))), + ScanToken::Token(Token::Id(String::from("aB"))), + ScanToken::Token(Token::Id(String::from("i5"))), + ScanToken::Token(Token::Id(String::from("$x"))), + ScanToken::Token(Token::Id(String::from("@efg"))), + ScanToken::Token(Token::Id(String::from("@@."))), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( "!abcd", )))), - Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))), - Ok(Token::MacroToken(MacroToken::MacroId(String::from("!*")))), - Ok(Token::Id(String::from("a"))), - Ok(Token::Id(String::from("#.#"))), - Ok(Token::MacroToken(MacroToken::Dot)), - Ok(Token::Id(String::from("x"))), - Ok(Token::MacroToken(MacroToken::Underscore)), - Ok(Token::Id(String::from("z"))), - Ok(Token::EndCommand), - Ok(Token::Id(String::from("abcd."))), - Ok(Token::Id(String::from("abcd"))), - Ok(Token::EndCommand), - Ok(Token::Id(String::from("QRSTUV"))), - Ok(Token::EndCommand), - Ok(Token::Id(String::from("QrStUv"))), - Ok(Token::EndCommand), - Ok(Token::Id(String::from("WXYZ"))), - Ok(Token::EndCommand), - Err(TokenError::UnexpectedChar('�')), - Ok(Token::EndCommand), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!*")))), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!*")))), + ScanToken::Token(Token::Id(String::from("a"))), + ScanToken::Token(Token::Id(String::from("#.#"))), + ScanToken::Token(Token::MacroToken(MacroToken::Dot)), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::MacroToken(MacroToken::Underscore)), + ScanToken::Token(Token::Id(String::from("z"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("abcd."))), + ScanToken::Token(Token::Id(String::from("abcd"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("QRSTUV"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("QrStUv"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("WXYZ"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Error(ScanError::UnexpectedChar('�')), + ScanToken::Token(Token::EndCommand), ], ); } @@ -98,49 +102,50 @@ AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH andx orx notx eqx gex gtx lex ltx nex allx byx tox withx and. with. 
"#, + Mode::Auto, &[ - Ok(Token::Punct(Punct::And)), - Ok(Token::Punct(Punct::Or)), - Ok(Token::Punct(Punct::Not)), - Ok(Token::Punct(Punct::Eq)), - Ok(Token::Punct(Punct::Ge)), - Ok(Token::Punct(Punct::Gt)), - Ok(Token::Punct(Punct::Le)), - Ok(Token::Punct(Punct::Lt)), - Ok(Token::Punct(Punct::Ne)), - Ok(Token::Punct(Punct::All)), - Ok(Token::Punct(Punct::By)), - Ok(Token::Punct(Punct::To)), - Ok(Token::Punct(Punct::With)), - Ok(Token::Punct(Punct::And)), - Ok(Token::Punct(Punct::Or)), - Ok(Token::Punct(Punct::Not)), - Ok(Token::Punct(Punct::Eq)), - Ok(Token::Punct(Punct::Ge)), - Ok(Token::Punct(Punct::Gt)), - Ok(Token::Punct(Punct::Le)), - Ok(Token::Punct(Punct::Lt)), - Ok(Token::Punct(Punct::Ne)), - Ok(Token::Punct(Punct::All)), - Ok(Token::Punct(Punct::By)), - Ok(Token::Punct(Punct::To)), - Ok(Token::Punct(Punct::With)), - Ok(Token::Id(String::from("andx"))), - Ok(Token::Id(String::from("orx"))), - Ok(Token::Id(String::from("notx"))), - Ok(Token::Id(String::from("eqx"))), - Ok(Token::Id(String::from("gex"))), - Ok(Token::Id(String::from("gtx"))), - Ok(Token::Id(String::from("lex"))), - Ok(Token::Id(String::from("ltx"))), - Ok(Token::Id(String::from("nex"))), - Ok(Token::Id(String::from("allx"))), - Ok(Token::Id(String::from("byx"))), - Ok(Token::Id(String::from("tox"))), - Ok(Token::Id(String::from("withx"))), - Ok(Token::Id(String::from("and."))), - Ok(Token::Punct(Punct::With)), - Ok(Token::EndCommand), + ScanToken::Token(Token::Punct(Punct::And)), + ScanToken::Token(Token::Punct(Punct::Or)), + ScanToken::Token(Token::Punct(Punct::Not)), + ScanToken::Token(Token::Punct(Punct::Eq)), + ScanToken::Token(Token::Punct(Punct::Ge)), + ScanToken::Token(Token::Punct(Punct::Gt)), + ScanToken::Token(Token::Punct(Punct::Le)), + ScanToken::Token(Token::Punct(Punct::Lt)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::All)), + ScanToken::Token(Token::Punct(Punct::By)), + ScanToken::Token(Token::Punct(Punct::To)), + ScanToken::Token(Token::Punct(Punct::With)), + ScanToken::Token(Token::Punct(Punct::And)), + ScanToken::Token(Token::Punct(Punct::Or)), + ScanToken::Token(Token::Punct(Punct::Not)), + ScanToken::Token(Token::Punct(Punct::Eq)), + ScanToken::Token(Token::Punct(Punct::Ge)), + ScanToken::Token(Token::Punct(Punct::Gt)), + ScanToken::Token(Token::Punct(Punct::Le)), + ScanToken::Token(Token::Punct(Punct::Lt)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::All)), + ScanToken::Token(Token::Punct(Punct::By)), + ScanToken::Token(Token::Punct(Punct::To)), + ScanToken::Token(Token::Punct(Punct::With)), + ScanToken::Token(Token::Id(String::from("andx"))), + ScanToken::Token(Token::Id(String::from("orx"))), + ScanToken::Token(Token::Id(String::from("notx"))), + ScanToken::Token(Token::Id(String::from("eqx"))), + ScanToken::Token(Token::Id(String::from("gex"))), + ScanToken::Token(Token::Id(String::from("gtx"))), + ScanToken::Token(Token::Id(String::from("lex"))), + ScanToken::Token(Token::Id(String::from("ltx"))), + ScanToken::Token(Token::Id(String::from("nex"))), + ScanToken::Token(Token::Id(String::from("allx"))), + ScanToken::Token(Token::Id(String::from("byx"))), + ScanToken::Token(Token::Id(String::from("tox"))), + ScanToken::Token(Token::Id(String::from("withx"))), + ScanToken::Token(Token::Id(String::from("and."))), + ScanToken::Token(Token::Punct(Punct::With)), + ScanToken::Token(Token::EndCommand), ], ); } @@ -152,56 +157,57 @@ fn test_punctuation() { ~&|=>=><=<~=<>(),-+*/[]** % : ; ? 
_ ` { } ~ "#, + Mode::Auto, &[ - Ok(Token::Punct(Punct::Not)), - Ok(Token::Punct(Punct::And)), - Ok(Token::Punct(Punct::Or)), - Ok(Token::Punct(Punct::Equals)), - Ok(Token::Punct(Punct::Ge)), - Ok(Token::Punct(Punct::Gt)), - Ok(Token::Punct(Punct::Le)), - Ok(Token::Punct(Punct::Lt)), - Ok(Token::Punct(Punct::Ne)), - Ok(Token::Punct(Punct::Ne)), - Ok(Token::Punct(Punct::LParen)), - Ok(Token::Punct(Punct::RParen)), - Ok(Token::Punct(Punct::Comma)), - Ok(Token::Punct(Punct::Dash)), - Ok(Token::Punct(Punct::Plus)), - Ok(Token::Punct(Punct::Asterisk)), - Ok(Token::Punct(Punct::Slash)), - Ok(Token::Punct(Punct::LSquare)), - Ok(Token::Punct(Punct::RSquare)), - Ok(Token::Punct(Punct::Exp)), - Ok(Token::Punct(Punct::Not)), - Ok(Token::Punct(Punct::And)), - Ok(Token::Punct(Punct::Or)), - Ok(Token::Punct(Punct::Equals)), - Ok(Token::Punct(Punct::Ge)), - Ok(Token::Punct(Punct::Gt)), - Ok(Token::Punct(Punct::Le)), - Ok(Token::Punct(Punct::Lt)), - Ok(Token::Punct(Punct::Ne)), - Ok(Token::Punct(Punct::Ne)), - Ok(Token::Punct(Punct::LParen)), - Ok(Token::Punct(Punct::RParen)), - Ok(Token::Punct(Punct::Comma)), - Ok(Token::Punct(Punct::Dash)), - Ok(Token::Punct(Punct::Plus)), - Ok(Token::Punct(Punct::Asterisk)), - Ok(Token::Punct(Punct::Slash)), - Ok(Token::Punct(Punct::LSquare)), - Ok(Token::Punct(Punct::RSquare)), - Ok(Token::Punct(Punct::Exp)), - Ok(Token::MacroToken(MacroToken::Percent)), - Ok(Token::Punct(Punct::Colon)), - Ok(Token::Punct(Punct::Semicolon)), - Ok(Token::MacroToken(MacroToken::Question)), - Ok(Token::MacroToken(MacroToken::Underscore)), - Ok(Token::MacroToken(MacroToken::Backtick)), - Ok(Token::Punct(Punct::LCurly)), - Ok(Token::Punct(Punct::RCurly)), - Ok(Token::Punct(Punct::Not)), + ScanToken::Token(Token::Punct(Punct::Not)), + ScanToken::Token(Token::Punct(Punct::And)), + ScanToken::Token(Token::Punct(Punct::Or)), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Punct(Punct::Ge)), + ScanToken::Token(Token::Punct(Punct::Gt)), + ScanToken::Token(Token::Punct(Punct::Le)), + ScanToken::Token(Token::Punct(Punct::Lt)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Punct(Punct::Dash)), + ScanToken::Token(Token::Punct(Punct::Plus)), + ScanToken::Token(Token::Punct(Punct::Asterisk)), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Punct(Punct::LSquare)), + ScanToken::Token(Token::Punct(Punct::RSquare)), + ScanToken::Token(Token::Punct(Punct::Exp)), + ScanToken::Token(Token::Punct(Punct::Not)), + ScanToken::Token(Token::Punct(Punct::And)), + ScanToken::Token(Token::Punct(Punct::Or)), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Punct(Punct::Ge)), + ScanToken::Token(Token::Punct(Punct::Gt)), + ScanToken::Token(Token::Punct(Punct::Le)), + ScanToken::Token(Token::Punct(Punct::Lt)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::Ne)), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Punct(Punct::Dash)), + ScanToken::Token(Token::Punct(Punct::Plus)), + ScanToken::Token(Token::Punct(Punct::Asterisk)), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Punct(Punct::LSquare)), + ScanToken::Token(Token::Punct(Punct::RSquare)), + 
ScanToken::Token(Token::Punct(Punct::Exp)), + ScanToken::Token(Token::MacroToken(MacroToken::Percent)), + ScanToken::Token(Token::Punct(Punct::Colon)), + ScanToken::Token(Token::Punct(Punct::Semicolon)), + ScanToken::Token(Token::MacroToken(MacroToken::Question)), + ScanToken::Token(Token::MacroToken(MacroToken::Underscore)), + ScanToken::Token(Token::MacroToken(MacroToken::Backtick)), + ScanToken::Token(Token::Punct(Punct::LCurly)), + ScanToken::Token(Token::Punct(Punct::RCurly)), + ScanToken::Token(Token::Punct(Punct::Not)), ], ); } @@ -217,41 +223,42 @@ fn test_positive_numbers() { 1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 . 1e e1 1e+ 1e- "#, + Mode::Auto, &[ - Ok(Token::Number(0.0)), - Ok(Token::Number(1.0)), - Ok(Token::Number(1.0)), - Ok(Token::Number(1.0)), - Ok(Token::Number(1.0)), - Ok(Token::EndCommand), - Ok(Token::Number(123.0)), - Ok(Token::EndCommand), - Ok(Token::EndCommand), - Ok(Token::Number(1.0)), - Ok(Token::Number(0.1)), - Ok(Token::Number(0.1)), - Ok(Token::Number(0.1)), - Ok(Token::Number(50.0)), - Ok(Token::Number(0.6)), - Ok(Token::Number(70.0)), - Ok(Token::Number(60.0)), - Ok(Token::Number(0.006)), - Ok(Token::EndCommand), - Ok(Token::Number(30.0)), - Ok(Token::Number(0.04)), - Ok(Token::Number(5.0)), - Ok(Token::Number(6.0)), - Ok(Token::Number(0.0007)), - Ok(Token::Number(12.3)), - Ok(Token::Number(4.56)), - Ok(Token::Number(789.0)), - Ok(Token::Number(999.0)), - Ok(Token::Number(0.0112)), - Ok(Token::EndCommand), - Err(TokenError::ExpectedExponent(String::from("1e"))), - Ok(Token::Id(String::from("e1"))), - Err(TokenError::ExpectedExponent(String::from("1e+"))), - Err(TokenError::ExpectedExponent(String::from("1e-"))), + ScanToken::Token(Token::Number(0.0)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(123.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::Number(0.1)), + ScanToken::Token(Token::Number(0.1)), + ScanToken::Token(Token::Number(0.1)), + ScanToken::Token(Token::Number(50.0)), + ScanToken::Token(Token::Number(0.6)), + ScanToken::Token(Token::Number(70.0)), + ScanToken::Token(Token::Number(60.0)), + ScanToken::Token(Token::Number(0.006)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(30.0)), + ScanToken::Token(Token::Number(0.04)), + ScanToken::Token(Token::Number(5.0)), + ScanToken::Token(Token::Number(6.0)), + ScanToken::Token(Token::Number(0.0007)), + ScanToken::Token(Token::Number(12.3)), + ScanToken::Token(Token::Number(4.56)), + ScanToken::Token(Token::Number(789.0)), + ScanToken::Token(Token::Number(999.0)), + ScanToken::Token(Token::Number(0.0112)), + ScanToken::Token(Token::EndCommand), + ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))), + ScanToken::Token(Token::Id(String::from("e1"))), + ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))), + ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))), ], ); } @@ -268,52 +275,53 @@ fn test_negative_numbers() { -/**/1 -. -1e -e1 -1e+ -1e- -1. 
"#, + Mode::Auto, &[ - Ok(Token::Number(-0.0)), - Ok(Token::Number(-1.0)), - Ok(Token::Number(-1.0)), - Ok(Token::Number(-1.0)), - Ok(Token::Number(-1.0)), - Ok(Token::EndCommand), - Ok(Token::Number(-123.0)), - Ok(Token::EndCommand), - Ok(Token::Number(-0.1)), - Ok(Token::Number(-0.1)), - Ok(Token::Number(-0.1)), - Ok(Token::Number(-0.1)), - Ok(Token::Number(-50.0)), - Ok(Token::Number(-0.6)), - Ok(Token::Number(-70.0)), - Ok(Token::Number(-60.0)), - Ok(Token::Number(-0.006)), - Ok(Token::Number(-3.0)), - Ok(Token::Number(-0.04)), - Ok(Token::Number(-5.0)), - Ok(Token::Number(-6.0)), - Ok(Token::Number(-0.0007)), - Ok(Token::Number(-12.3)), - Ok(Token::Number(-4.56)), - Ok(Token::Number(-789.0)), - Ok(Token::Number(-999.0)), - Ok(Token::Number(-0.0112)), - Ok(Token::Number(-1.0)), - Ok(Token::Punct(Punct::Dash)), - Ok(Token::MacroToken(MacroToken::Dot)), - Err(TokenError::ExpectedExponent(String::from("-1e"))), - Ok(Token::Punct(Punct::Dash)), - Ok(Token::Id(String::from("e1"))), - Err(TokenError::ExpectedExponent(String::from("-1e+"))), - Err(TokenError::ExpectedExponent(String::from("-1e-"))), - Ok(Token::Number(-1.0)), - Ok(Token::EndCommand), + ScanToken::Token(Token::Number(-0.0)), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(-123.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Number(-0.1)), + ScanToken::Token(Token::Number(-0.1)), + ScanToken::Token(Token::Number(-0.1)), + ScanToken::Token(Token::Number(-0.1)), + ScanToken::Token(Token::Number(-50.0)), + ScanToken::Token(Token::Number(-0.6)), + ScanToken::Token(Token::Number(-70.0)), + ScanToken::Token(Token::Number(-60.0)), + ScanToken::Token(Token::Number(-0.006)), + ScanToken::Token(Token::Number(-3.0)), + ScanToken::Token(Token::Number(-0.04)), + ScanToken::Token(Token::Number(-5.0)), + ScanToken::Token(Token::Number(-6.0)), + ScanToken::Token(Token::Number(-0.0007)), + ScanToken::Token(Token::Number(-12.3)), + ScanToken::Token(Token::Number(-4.56)), + ScanToken::Token(Token::Number(-789.0)), + ScanToken::Token(Token::Number(-999.0)), + ScanToken::Token(Token::Number(-0.0112)), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::Punct(Punct::Dash)), + ScanToken::Token(Token::MacroToken(MacroToken::Dot)), + ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))), + ScanToken::Token(Token::Punct(Punct::Dash)), + ScanToken::Token(Token::Id(String::from("e1"))), + ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))), + ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))), + ScanToken::Token(Token::Number(-1.0)), + ScanToken::Token(Token::EndCommand), ], ); } - #[test] fn test_strings() { - check_scan(r#"'x' "y" 'abc' + check_scan( + r#"'x' "y" 'abc' 'Don''t' "Can't" 'Won''t' """quoted""" '"quoted"' '' "" '''' """" @@ -342,12 +350,692 @@ x"4142" +u'304a' "�あいうえお" "abc"+U"FFFD"+u'3048'+"xyz" -"#, &[]); +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::String(String::from("x"))), + ScanToken::Token(Token::String(String::from("y"))), + ScanToken::Token(Token::String(String::from("abc"))), + ScanToken::Token(Token::String(String::from("Don't"))), + ScanToken::Token(Token::String(String::from("Can't"))), + ScanToken::Token(Token::String(String::from("Won't"))), + ScanToken::Token(Token::String(String::from("\"quoted\""))), + 
ScanToken::Token(Token::String(String::from("\"quoted\""))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from("'"))), + ScanToken::Token(Token::String(String::from("\""))), + ScanToken::Error(ScanError::ExpectedQuote), + ScanToken::Error(ScanError::ExpectedQuote), + ScanToken::Token(Token::String(String::from("xyzabcde"))), + ScanToken::Token(Token::String(String::from("foobar"))), + ScanToken::Token(Token::String(String::from("foobar"))), + ScanToken::Token(Token::String(String::from("foo"))), + ScanToken::Token(Token::Punct(Punct::Plus)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from("bar"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Punct(Punct::Plus)), + ScanToken::Token(Token::String(String::from("AB5152"))), + ScanToken::Token(Token::String(String::from("4142QR"))), + ScanToken::Token(Token::String(String::from("ABお"))), + ScanToken::Token(Token::String(String::from("�あいうえお"))), + ScanToken::Token(Token::String(String::from("abc�えxyz"))), + ScanToken::Token(Token::End), + ], + ); +} + +#[test] +fn test_shbang() { + check_scan( + r#"#! /usr/bin/pspp +#! /usr/bin/pspp +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("#"))), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!")))), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(String::from("usr"))), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(String::from("bin"))), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(String::from("pspp"))), + ], + ); +} + +#[test] +fn test_comments() { + check_scan( + r#"* Comment commands "don't +have to contain valid tokens. + +** Check ambiguity with ** token. +****************. + +comment keyword works too. +COMM also. +com is ambiguous with COMPUTE. + + * Comment need not start at left margin. + +* Comment ends with blank line + +next command. + +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("com"))), + ScanToken::Token(Token::Id(String::from("is"))), + ScanToken::Token(Token::Id(String::from("ambiguous"))), + ScanToken::Token(Token::Punct(Punct::With)), + ScanToken::Token(Token::Id(String::from("COMPUTE"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("next"))), + ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_document() { + check_scan( + r#"DOCUMENT one line. +DOC more + than + one + line. +docu +first.paragraph +isn't parsed as tokens + +second paragraph. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("DOCUMENT"))), + ScanToken::Token(Token::String(String::from("DOCUMENT one line."))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("DOCUMENT"))), + ScanToken::Token(Token::String(String::from("DOC more"))), + ScanToken::Token(Token::String(String::from(" than"))), + ScanToken::Token(Token::String(String::from(" one"))), + ScanToken::Token(Token::String(String::from(" line."))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("DOCUMENT"))), + ScanToken::Token(Token::String(String::from("docu"))), + ScanToken::Token(Token::String(String::from("first.paragraph"))), + ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from("second paragraph."))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_file_label() { + check_scan( + r#"FIL label isn't quoted. +FILE + lab 'is quoted'. +FILE /* +/**/ lab not quoted here either + +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("FIL"))), + ScanToken::Token(Token::Id(String::from("label"))), + ScanToken::Token(Token::String(String::from("isn't quoted"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("FILE"))), + ScanToken::Token(Token::Id(String::from("lab"))), + ScanToken::Token(Token::String(String::from("is quoted"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("FILE"))), + ScanToken::Token(Token::Id(String::from("lab"))), + ScanToken::Token(Token::String(String::from("not quoted here either"))), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_begin_data() { + check_scan( + r#"begin data. +123 +xxx +end data. + +BEG /**/ DAT /* +5 6 7 /* x + +end data +end data +. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("begin"))), + ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from("123"))), + ScanToken::Token(Token::String(String::from("xxx"))), + ScanToken::Token(Token::Id(String::from("end"))), + ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("BEG"))), + ScanToken::Token(Token::Id(String::from("DAT"))), + ScanToken::Token(Token::String(String::from("5 6 7 /* x"))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from("end data"))), + ScanToken::Token(Token::Id(String::from("end"))), + ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_do_repeat() { + check_scan( + r#"do repeat x=a b c + y=d e f. + do repeat a=1 thru 5. +another command. +second command ++ third command. +end /* x */ /* y */ repeat print. +end + repeat. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("do"))), + ScanToken::Token(Token::Id(String::from("repeat"))), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Id(String::from("a"))), + ScanToken::Token(Token::Id(String::from("b"))), + ScanToken::Token(Token::Id(String::from("c"))), + ScanToken::Token(Token::Id(String::from("y"))), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Id(String::from("d"))), + ScanToken::Token(Token::Id(String::from("e"))), + ScanToken::Token(Token::Id(String::from("f"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from(" do repeat a=1 thru 5."))), + ScanToken::Token(Token::String(String::from("another command."))), + ScanToken::Token(Token::String(String::from("second command"))), + ScanToken::Token(Token::String(String::from("+ third command."))), + ScanToken::Token(Token::String(String::from( + "end /* x */ /* y */ repeat print.", + ))), + ScanToken::Token(Token::Id(String::from("end"))), + ScanToken::Token(Token::Id(String::from("repeat"))), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +#[test] +fn test_do_repeat_batch() { + check_scan( + r#"do repeat x=a b c + y=d e f +do repeat a=1 thru 5 +another command +second command ++ third command +end /* x */ /* y */ repeat print +end + repeat +do + repeat #a=1 + + inner command +end repeat +"#, + Mode::Batch, + &[ + ScanToken::Token(Token::Id(String::from("do"))), + ScanToken::Token(Token::Id(String::from("repeat"))), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Id(String::from("a"))), + ScanToken::Token(Token::Id(String::from("b"))), + ScanToken::Token(Token::Id(String::from("c"))), + ScanToken::Token(Token::Id(String::from("y"))), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Id(String::from("d"))), + ScanToken::Token(Token::Id(String::from("e"))), + ScanToken::Token(Token::Id(String::from("f"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))), + ScanToken::Token(Token::String(String::from("another command"))), + ScanToken::Token(Token::String(String::from("second command"))), + ScanToken::Token(Token::String(String::from("+ third command"))), + ScanToken::Token(Token::String(String::from( + "end /* x */ /* y */ repeat print", + ))), + ScanToken::Token(Token::Id(String::from("end"))), + ScanToken::Token(Token::Id(String::from("repeat"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("do"))), + ScanToken::Token(Token::Id(String::from("repeat"))), + ScanToken::Token(Token::Id(String::from("#a"))), + ScanToken::Token(Token::Punct(Punct::Equals)), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::String(String::from(" inner command"))), + ScanToken::Token(Token::Id(String::from("end"))), + ScanToken::Token(Token::Id(String::from("repeat"))), + ], + ); } + #[test] -fn test_strings2() { - check_scan(r#""""" -'error -'b' -"#, &[]); +fn test_batch_mode() { + check_scan( + r#"first command + another line of first command ++ second command +third command + +fourth command. + fifth command. 
+"#, + Mode::Batch, + &[ + ScanToken::Token(Token::Id(String::from("first"))), + ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::Id(String::from("another"))), + ScanToken::Token(Token::Id(String::from("line"))), + ScanToken::Token(Token::Id(String::from("of"))), + ScanToken::Token(Token::Id(String::from("first"))), + ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("second"))), + ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("third"))), + ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("fourth"))), + ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("fifth"))), + ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::EndCommand), + ], + ); +} + +mod define { + use crate::lex::{ + scan::ScanToken, + segment::Mode, + token::{MacroToken, Punct, Token}, + }; + + use super::check_scan; + + #[test] + fn test_simple() { + check_scan( + r#"define !macro1() +var1 var2 var3 +!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("var1 var2 var3"))), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( + "!enddefine", + )))), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_no_newline_after_parentheses() { + check_scan( + r#"define !macro1() var1 var2 var3 +!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from(" var1 var2 var3"))), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( + "!enddefine", + )))), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_no_newline_before_enddefine() { + check_scan( + r#"define !macro1() +var1 var2 var3!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("var1 var2 var3"))), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( + "!enddefine", + )))), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_all_on_one_line() { + check_scan( + r#"define !macro1()var1 var2 var3!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("var1 var2 var3"))), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( + "!enddefine", + )))), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_empty() { + check_scan( + r#"define !macro1() +!enddefine. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( + "!enddefine", + )))), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_blank_lines() { + check_scan( + r#"define !macro1() + + +!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::String(String::from(""))), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( + "!enddefine", + )))), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_arguments() { + check_scan( + r#"define !macro1(a(), b(), c()) +!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Id(String::from("a"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(String::from("b"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(String::from("c"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( + "!enddefine", + )))), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_multiline_arguments() { + check_scan( + r#"define !macro1( + a(), b( + ), + c() +) +!enddefine. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Id(String::from("a"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(String::from("b"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(String::from("c"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( + "!enddefine", + )))), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_arguments_start_on_second_line() { + check_scan( + r#"define !macro1 +(x,y,z +) +content 1 +content 2 +!enddefine. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(String::from("y"))), + ScanToken::Token(Token::Punct(Punct::Comma)), + ScanToken::Token(Token::Id(String::from("z"))), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("content 1"))), + ScanToken::Token(Token::String(String::from("content 2"))), + ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( + "!enddefine", + )))), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_early_end_of_command_1() { + check_scan( + r#"define !macro1. +data list /x 1. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::Id(String::from("list"))), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_early_end_of_command_2() { + check_scan( + r#"define !macro1 +x. +data list /x 1. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::Id(String::from("list"))), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_early_end_of_command_3() { + check_scan( + r#"define !macro1(. +x. +data list /x 1. +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::Id(String::from("list"))), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_early_end_of_command_4() { + // Notice the command terminator at the end of the DEFINE command, + // which should not be there and ends it early. + check_scan( + r#"define !macro1. +data list /x 1. 
+"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::EndCommand), + ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::Id(String::from("list"))), + ScanToken::Token(Token::Punct(Punct::Slash)), + ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Number(1.0)), + ScanToken::Token(Token::EndCommand), + ], + ); + } + + #[test] + fn test_missing_enddefine() { + check_scan( + r#"define !macro1() +content line 1 +content line 2 +"#, + Mode::Auto, + &[ + ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::String(String::from("!macro1"))), + ScanToken::Token(Token::Punct(Punct::LParen)), + ScanToken::Token(Token::Punct(Punct::RParen)), + ScanToken::Token(Token::String(String::from("content line 1"))), + ScanToken::Token(Token::String(String::from("content line 2"))), + ScanToken::Token(Token::End), + ], + ); + } } diff --git a/rust/src/lex/segment/mod.rs b/rust/src/lex/segment/mod.rs index ca7dfd0686..eae0b4810c 100644 --- a/rust/src/lex/segment/mod.rs +++ b/rust/src/lex/segment/mod.rs @@ -381,14 +381,14 @@ fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomple } fn is_start_of_string(input: &str, eof: bool) -> Result { - let (Some(c), _rest) = take(input, eof)? else { + let (Some(c), rest) = take(input, eof)? else { return Ok(false); }; match c { - 'x' | 'X' | 'u' | 'U' => Ok({ - let (c, _rest) = take(input, eof)?; - c == Some('\'') || c == Some('"') - }), + 'x' | 'X' | 'u' | 'U' => { + let (c, _rest) = take(rest, eof)?; + Ok(c == Some('\'') || c == Some('"')) + }, '\'' | '"' => Ok(true), '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true), _ => Ok(false), diff --git a/rust/src/lex/segment/test.rs b/rust/src/lex/segment/test.rs index 056ac7b031..dd2d50cb62 100644 --- a/rust/src/lex/segment/test.rs +++ b/rust/src/lex/segment/test.rs @@ -111,7 +111,7 @@ fn check_segmentation( #[allow(dead_code)] fn print_segmentation(mut input: &str) { - let mut segmenter = Segmenter::new(Mode::Auto, false); + let mut segmenter = Segmenter::new(Mode::Interactive, false); loop { let (rest, segment) = segmenter.push(input, true).unwrap(); let len = input.len() - rest.len(); @@ -2157,37 +2157,3 @@ fourth command. ], ); } - -#[test] -fn test_strings2() { - print_segmentation(r#"'x' "y" 'abc' -'Don''t' "Can't" 'Won''t' -"""quoted""" '"quoted"' -'' "" '''' """" -'missing end quote -"missing double quote -'x' + "y" -+ 'z' + -'a' /* abc */ + "b" /* -+ 'c' +/* */"d"/* */+'e' -'foo' -+ /* special case: + in column 0 would ordinarily start a new command -'bar' -'foo' - + -'bar' -'foo' -+ - -'bar' - -+ -x"4142"+'5152' -"4142"+ -x'5152' -x"4142" -+u'304a' -"�あいうえお" -"abc"+U"FFFD"+u'3048'+"xyz" -"#); -} diff --git a/rust/src/lex/token.rs b/rust/src/lex/token.rs index 2c3489c554..016b282838 100644 --- a/rust/src/lex/token.rs +++ b/rust/src/lex/token.rs @@ -1,7 +1,3 @@ -use thiserror::Error as ThisError; - -use super::segment::Segment; - #[derive(Clone, Debug, PartialEq)] pub enum Token { /// End of input. @@ -143,174 +139,3 @@ pub enum MacroToken { /// first character, so this represents an underscore found on its own. Underscore, } - -#[derive(ThisError, Debug, PartialEq, Eq)] -pub enum TokenError { - /// Unterminated string constant. - #[error("Unterminated string constant.")] - ExpectedQuote, - - /// Missing exponent. 
- #[error("Missing exponent following `{0}`")] - ExpectedExponent(String), - - /// Odd length hex string. - #[error("String of hex digits has {0} characters, which is not a multiple of 2.")] - OddLengthHexString(usize), - - /// Invalid hex digit. - #[error("Invalid hex digit {0:?}.")] - BadHexDigit(char), - - /// Invalid length Unicode string. - #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")] - BadLengthUnicodeString(usize), - - /// Invalid code point. - #[error("U+{0:04X} is not a valid Unicode code point.")] - BadCodePoint(u32), - - /// Expected hexadecimal Unicode code point - #[error("Expected hexadecimal Unicode code point.")] - ExpectedCodePoint, - - /// `DO REPEAT` nested too deeply. - #[error("`DO REPEAT` nested too deeply.")] - DoRepeatOverflow, - - /// Unexpected character. - #[error("Unexpected character {0:?} in input.")] - UnexpectedChar(char), -} - -impl Token { - pub fn try_from_segment(s: &str, segment: Segment) -> Result, TokenError> { - match segment { - Segment::Number => Ok(Some(Self::Number(s.parse().unwrap()))), - Segment::QuotedString => { - // Trim quote mark from front and back. - let mut chars = s.chars(); - let quote = chars.next().unwrap(); - let s = chars.as_str().strip_suffix(quote).unwrap(); - - // Replace doubled quotes by single ones. - let (single_quote, double_quote) = match quote { - '\'' => ("'", "''"), - '"' => ("\"", "\"\""), - _ => unreachable!(), - }; - Ok(Some(Self::String(s.replace(double_quote, single_quote)))) - } - Segment::HexString => { - // Strip `X"` prefix and `"` suffix (or variations). - let s = &s[2..s.len() - 1]; - for c in s.chars() { - if !c.is_ascii_hexdigit() { - return Err(TokenError::BadHexDigit(c)) - } - } - if s.len() % 2 != 0 { - return Err(TokenError::OddLengthHexString(s.len())) - } - let mut out = String::with_capacity(s.len()); - for pair in s.as_bytes().chunks_exact(2) { - let hi = char::from(pair[0]).to_digit(16).unwrap() as u8; - let lo = char::from(pair[1]).to_digit(16).unwrap() as u8; - out.push(char::from(hi * 16 + lo)); - } - Ok(Some(Self::String(out))) - } - Segment::UnicodeString => { - // Strip `U"` prefix and `"` suffix (or variations). 
- let s = &s[2..s.len() - 1]; - if !(1..=8).contains(&s.len()) { - return Err(TokenError::BadLengthUnicodeString(s.len())); - } - let Ok(code_point) = u32::from_str_radix(s, 16) else { - return Err(TokenError::ExpectedCodePoint); - }; - let Some(c) = char::from_u32(code_point) else { - return Err(TokenError::BadCodePoint(code_point)); - }; - Ok(Some(Self::String(String::from(c)))) - } - - Segment::UnquotedString - | Segment::DoRepeatCommand - | Segment::InlineData - | Segment::Document - | Segment::MacroBody - | Segment::MacroName => Ok(Some(Self::String(String::from(s)))), - - Segment::ReservedWord => { - let c0 = s.as_bytes()[0].to_ascii_uppercase(); - let c1 = s.as_bytes()[1].to_ascii_uppercase(); - match (c0, c1) { - (b'B', _) => Ok(Some(Self::Punct(Punct::By))), - (b'E', _) => Ok(Some(Self::Punct(Punct::Eq))), - (b'G', b'T') => Ok(Some(Self::Punct(Punct::Gt))), - (b'G', _) => Ok(Some(Self::Punct(Punct::Ge))), - (b'L', b'T') => Ok(Some(Self::Punct(Punct::Lt))), - (b'L', _) => Ok(Some(Self::Punct(Punct::Le))), - (b'N', b'E') => Ok(Some(Self::Punct(Punct::Ne))), - (b'N', _) => Ok(Some(Self::Punct(Punct::Not))), - (b'O', _) => Ok(Some(Self::Punct(Punct::Or))), - (b'T', _) => Ok(Some(Self::Punct(Punct::To))), - (b'A', b'L') => Ok(Some(Self::Punct(Punct::All))), - (b'A', _) => Ok(Some(Self::Punct(Punct::And))), - (b'W', _) => Ok(Some(Self::Punct(Punct::With))), - _ => unreachable!(), - } - } - Segment::Identifier => Ok(Some(Self::Id(String::from(s)))), - Segment::Punct => match s { - "(" => Ok(Some(Self::Punct(Punct::LParen))), - ")" => Ok(Some(Self::Punct(Punct::RParen))), - "[" => Ok(Some(Self::Punct(Punct::LSquare))), - "]" => Ok(Some(Self::Punct(Punct::RSquare))), - "{" => Ok(Some(Self::Punct(Punct::LCurly))), - "}" => Ok(Some(Self::Punct(Punct::RCurly))), - "," => Ok(Some(Self::Punct(Punct::Comma))), - "=" => Ok(Some(Self::Punct(Punct::Equals))), - "-" => Ok(Some(Self::Punct(Punct::Dash))), - "&" => Ok(Some(Self::Punct(Punct::And))), - "|" => Ok(Some(Self::Punct(Punct::Or))), - "+" => Ok(Some(Self::Punct(Punct::Plus))), - "/" => Ok(Some(Self::Punct(Punct::Slash))), - "*" => Ok(Some(Self::Punct(Punct::Asterisk))), - "<" => Ok(Some(Self::Punct(Punct::Lt))), - ">" => Ok(Some(Self::Punct(Punct::Gt))), - "~" => Ok(Some(Self::Punct(Punct::Not))), - ":" => Ok(Some(Self::Punct(Punct::Colon))), - ";" => Ok(Some(Self::Punct(Punct::Semicolon))), - "**" => Ok(Some(Self::Punct(Punct::Exp))), - "<=" => Ok(Some(Self::Punct(Punct::Le))), - "<>" => Ok(Some(Self::Punct(Punct::Ne))), - "~=" => Ok(Some(Self::Punct(Punct::Ne))), - ">=" => Ok(Some(Self::Punct(Punct::Ge))), - "!" => Ok(Some(Self::MacroToken(MacroToken::Bang))), - "%" => Ok(Some(Self::MacroToken(MacroToken::Percent))), - "?" => Ok(Some(Self::MacroToken(MacroToken::Question))), - "`" => Ok(Some(Self::MacroToken(MacroToken::Backtick))), - "_" => Ok(Some(Self::MacroToken(MacroToken::Underscore))), - "." 
=> Ok(Some(Self::MacroToken(MacroToken::Dot))), - _ => unreachable!("bad punctuator {s:?}"), - }, - Segment::Shbang - | Segment::Spaces - | Segment::Comment - | Segment::Newline - | Segment::CommentCommand => Ok(None), - Segment::DoRepeatOverflow => Err(TokenError::DoRepeatOverflow), - Segment::MacroId => Ok(Some(Self::MacroToken(MacroToken::MacroId(String::from(s))))), - Segment::StartDocument => Ok(Some(Self::Id(String::from("DOCUMENT")))), - Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => { - Ok(Some(Self::EndCommand)) - } - Segment::End => Ok(Some(Self::End)), - Segment::ExpectedQuote => Err(TokenError::ExpectedQuote), - Segment::ExpectedExponent => Err(TokenError::ExpectedExponent(String::from(s))), - Segment::UnexpectedChar => Err(TokenError::UnexpectedChar(s.chars().next().unwrap())), - } - } -} -- 2.30.2
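
A minimal usage sketch of the scanner introduced above (the `pspp` crate
name and the `main` wrapper are assumptions based on the rust/src/lex
layout; everything else uses only items from this patch):

    use pspp::lex::scan::{ScanToken, StringLexer};
    use pspp::lex::segment::Mode;

    fn main() {
        // The iterator runs ScanToken::merge before yielding, so `-5` comes
        // out as a single Number(-5.0) and `'a' + 'b'` as one String token.
        for scan_token in StringLexer::new("COMPUTE x = -5 + 'a' + 'b'.", Mode::Auto, false) {
            match scan_token {
                ScanToken::Token(token) => println!("token: {token:?}"),
                ScanToken::Error(error) => eprintln!("error: {error}"),
            }
        }
    }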