From 1a3d90b858b16774944af1ced6ecab320101924d Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Sun, 21 Jul 2024 17:34:34 -0700
Subject: [PATCH] work

---
 rust/src/identifier.rs     |  39 +-
 rust/src/lex/scan/mod.rs   |  46 ++-
 rust/src/lex/scan/test.rs  | 310 +++++++--------
 rust/src/lex/token.rs      |  20 +-
 rust/src/lib.rs            |   2 +-
 rust/src/macros.rs         | 793 ++++++++++++++++++++++++++++++++-----
 src/language/lexer/macro.c |  25 +-
 7 files changed, 958 insertions(+), 277 deletions(-)

diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs
index bfb4331991..5bffe2b1c9 100644
--- a/rust/src/identifier.rs
+++ b/rust/src/identifier.rs
@@ -145,7 +145,7 @@ pub fn is_reserved_word(s: &str) -> bool {
     ReservedWord::try_from(s).is_ok()
 }
 
-#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Identifier(pub UniCase<String>);
 
 impl Identifier {
@@ -162,12 +162,13 @@ impl Identifier {
         identifier.check_encoding(encoding)?;
         Ok(identifier)
     }
+
     /// Checks whether this is a valid identifier in the given `encoding`.  An
     /// identifier that is valid in one encoding might be invalid in another
     /// because some characters are unencodable or because it is too long.
     pub fn check_encoding(&self, encoding: &'static Encoding) -> Result<(), Error> {
         let s = self.0.as_str();
-        let (encoded, _, unencodable) = encoding.encode(s);
+        let (_encoded, _, unencodable) = encoding.encode(s);
         if unencodable {
             let mut encoder = encoding.new_encoder();
             let mut buf = Vec::with_capacity(
@@ -187,6 +188,7 @@ impl Identifier {
                 c,
             });
         }
+        /*
         if encoded.len() > Self::MAX_LEN {
             return Err(Error::TooLong {
                 id: s.into(),
@@ -194,7 +196,7 @@ impl Identifier {
                 encoding: encoding.name(),
                 max: Self::MAX_LEN,
             });
-        }
+        }*/
         Ok(())
     }
     pub fn is_plausible(s: &str) -> Result<(), Error> {
@@ -220,6 +222,31 @@ impl Identifier {
         }
         Ok(())
     }
+
+    /// Returns true if this token is a case-insensitive match for `keyword`.
+    ///
+    /// Keywords match if `keyword` and the token are identical, or if the
+    /// token is at least 3 characters long and those characters match the
+    /// beginning of `keyword` except possibly for case.
+    ///
+    /// `keyword` must be ASCII.
+    pub fn matches_keyword(&self, keyword: &str) -> bool {
+        id_match_n_nonstatic(keyword, self.0.as_str(), 3)
+    }
+
+    /// Returns true if this token is a case-insensitive match for at least
+    /// the first `n` characters of `keyword`.
+    ///
+    /// `keyword` must be ASCII.
+    pub fn matches_keyword_n(&self, keyword: &str, n: usize) -> bool {
+        id_match_n_nonstatic(keyword, self.0.as_str(), n)
+    }
+}
+
+impl PartialEq<str> for Identifier {
+    fn eq(&self, other: &str) -> bool {
+        self.0.eq(&UniCase::new(other))
+    }
 }
 
 /// Returns true if `token` is a case-insensitive match for `keyword`.
@@ -267,6 +294,12 @@ impl Display for Identifier {
     }
 }
 
+impl Debug for Identifier {
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        write!(f, "{}", self.0)
+    }
+}
+
 pub trait HasIdentifier {
     fn identifier(&self) -> &UniCase<String>;
 }
diff --git a/rust/src/lex/scan/mod.rs b/rust/src/lex/scan/mod.rs
index e099fcd402..5f67819bfd 100644
--- a/rust/src/lex/scan/mod.rs
+++ b/rust/src/lex/scan/mod.rs
@@ -10,7 +10,7 @@
 //! are the same as the tokens used by the PSPP parser with a few additional
 //! types.
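[Editorial sketch, not part of the patch: the scanner renamed below is an iterator over `ScanToken`s, so driving it looks roughly like the following. The `pspp` crate paths are assumptions based on this tree's `rust/src` layout.]

// Hypothetical driver for the scanner introduced below.
use pspp::lex::scan::{ScanToken, StringScanner};
use pspp::lex::segment::Mode;

fn dump_tokens(input: &str) {
    // `false` here means `input` is a complete file rather than a snippet.
    for scan_token in StringScanner::new(input, Mode::Auto, false) {
        match scan_token {
            ScanToken::Token(token) => println!("{token}"),
            ScanToken::Error(error) => eprintln!("scan error: {error:?}"),
        }
    }
}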
-use crate::identifier::ReservedWord;
+use crate::identifier::{Identifier, ReservedWord};
 
 use super::{
     segment::{Mode, Segment, Segmenter},
@@ -170,7 +170,8 @@ impl ScanToken {
                 ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))),
             }
             } else {
-                Some(Self::Token(Token::Id(String::from(s))))}
+                Some(Self::Token(Token::Id(Identifier::new(s).unwrap())))
+            }
         }
         Segment::Punct => match s {
             "(" => Some(Self::Token(Token::Punct(Punct::LParen))),
@@ -212,7 +213,7 @@ impl ScanToken {
             | Segment::Newline
             | Segment::CommentCommand => None,
             Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
-            Segment::StartDocument => Some(Self::Token(Token::Id(String::from("DOCUMENT")))),
+            Segment::StartDocument => Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap()))),
             Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
                 Some(Self::Token(Token::EndCommand))
             }
@@ -288,13 +289,46 @@ impl ScanToken {
     }
 }
 
-pub struct StringLexer<'a> {
+pub struct StringSegmenter<'a> {
+    input: &'a str,
+    segmenter: Segmenter,
+}
+
+impl<'a> StringSegmenter<'a> {
+    pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
+        Self {
+            input,
+            segmenter: Segmenter::new(mode, is_snippet),
+        }
+    }
+}
+
+impl<'a> Iterator for StringSegmenter<'a> {
+    type Item = (&'a str, ScanToken);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            let (rest, segment) = self.segmenter.push(self.input, true).unwrap();
+            if segment == Segment::End {
+                return None;
+            }
+            let s = &self.input[..self.input.len() - rest.len()];
+            self.input = rest;
+
+            if let Some(token) = ScanToken::from_segment(s, segment) {
+                return Some((s, token));
+            }
+        }
+    }
+}
+
+pub struct StringScanner<'a> {
     input: &'a str,
     segmenter: Segmenter,
     tokens: VecDeque<ScanToken>,
 }
 
-impl<'a> StringLexer<'a> {
+impl<'a> StringScanner<'a> {
     pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
         Self {
             input,
@@ -304,7 +338,7 @@ impl<'a> StringLexer<'a> {
     }
 }
 
-impl<'a> Iterator for StringLexer<'a> {
+impl<'a> Iterator for StringScanner<'a> {
     type Item = ScanToken;
 
     fn next(&mut self) -> Option<Self::Item> {
diff --git a/rust/src/lex/scan/test.rs b/rust/src/lex/scan/test.rs
index 6affeac0e4..0ed9be6555 100644
--- a/rust/src/lex/scan/test.rs
+++ b/rust/src/lex/scan/test.rs
@@ -1,9 +1,9 @@
-use crate::lex::{
+use crate::{identifier::Identifier, lex::{
     segment::Mode,
     token::{Punct, Token},
-};
+}};
 
-use super::{ScanError, ScanToken, StringLexer};
+use super::{ScanError, ScanToken, StringScanner};
 
 fn print_token(token: &Token) {
     match token {
@@ -17,7 +17,7 @@ fn print_token(token: &Token) {
 }
 
 fn check_scan(input: &str, mode: Mode, expected: &[ScanToken]) {
-    let tokens = StringLexer::new(input, mode, false).collect::<Vec<ScanToken>>();
+    let tokens = StringScanner::new(input, mode, false).collect::<Vec<ScanToken>>();
 
     if &tokens != expected {
         for token in &tokens {
@@ -57,30 +57,30 @@ WXYZ. 
/* unterminated end of line comment "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("a"))), - ScanToken::Token(Token::Id(String::from("aB"))), - ScanToken::Token(Token::Id(String::from("i5"))), - ScanToken::Token(Token::Id(String::from("$x"))), - ScanToken::Token(Token::Id(String::from("@efg"))), - ScanToken::Token(Token::Id(String::from("@@."))), - ScanToken::Token(Token::Id(String::from("!abcd"))), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("aB").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("i5").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("$x").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("@efg").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("@@.").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("!abcd").unwrap())), ScanToken::Token(Token::Punct(Punct::BangAsterisk)), ScanToken::Token(Token::Punct(Punct::BangAsterisk)), - ScanToken::Token(Token::Id(String::from("a"))), - ScanToken::Token(Token::Id(String::from("#.#"))), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("#.#").unwrap())), ScanToken::Token(Token::Punct(Punct::Dot)), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Punct(Punct::Underscore)), - ScanToken::Token(Token::Id(String::from("z"))), + ScanToken::Token(Token::Id(Identifier::new("z").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("abcd."))), - ScanToken::Token(Token::Id(String::from("abcd"))), + ScanToken::Token(Token::Id(Identifier::new("abcd.").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("abcd").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("QRSTUV"))), + ScanToken::Token(Token::Id(Identifier::new("QRSTUV").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("QrStUv"))), + ScanToken::Token(Token::Id(Identifier::new("QrStUv").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("WXYZ"))), + ScanToken::Token(Token::Id(Identifier::new("WXYZ").unwrap())), ScanToken::Token(Token::EndCommand), ScanToken::Error(ScanError::UnexpectedChar('�')), ScanToken::Token(Token::EndCommand), @@ -124,20 +124,20 @@ and. with. 
ScanToken::Token(Token::Punct(Punct::By)), ScanToken::Token(Token::Punct(Punct::To)), ScanToken::Token(Token::Punct(Punct::With)), - ScanToken::Token(Token::Id(String::from("andx"))), - ScanToken::Token(Token::Id(String::from("orx"))), - ScanToken::Token(Token::Id(String::from("notx"))), - ScanToken::Token(Token::Id(String::from("eqx"))), - ScanToken::Token(Token::Id(String::from("gex"))), - ScanToken::Token(Token::Id(String::from("gtx"))), - ScanToken::Token(Token::Id(String::from("lex"))), - ScanToken::Token(Token::Id(String::from("ltx"))), - ScanToken::Token(Token::Id(String::from("nex"))), - ScanToken::Token(Token::Id(String::from("allx"))), - ScanToken::Token(Token::Id(String::from("byx"))), - ScanToken::Token(Token::Id(String::from("tox"))), - ScanToken::Token(Token::Id(String::from("withx"))), - ScanToken::Token(Token::Id(String::from("and."))), + ScanToken::Token(Token::Id(Identifier::new("andx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("orx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("notx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("eqx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("gex").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("gtx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("lex").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("ltx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("nex").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("allx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("byx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("tox").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("withx").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("and.").unwrap())), ScanToken::Token(Token::Punct(Punct::With)), ScanToken::Token(Token::EndCommand), ], @@ -250,7 +250,7 @@ fn test_positive_numbers() { ScanToken::Token(Token::Number(0.0112)), ScanToken::Token(Token::EndCommand), ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))), - ScanToken::Token(Token::Id(String::from("e1"))), + ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())), ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))), ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))), ], @@ -303,7 +303,7 @@ fn test_negative_numbers() { ScanToken::Token(Token::Punct(Punct::Dot)), ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))), ScanToken::Token(Token::Punct(Punct::Dash)), - ScanToken::Token(Token::Id(String::from("e1"))), + ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())), ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))), ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))), ScanToken::Token(Token::Number(-1.0)), @@ -388,14 +388,14 @@ fn test_shbang() { "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("#"))), + ScanToken::Token(Token::Id(Identifier::new("#").unwrap())), ScanToken::Token(Token::Punct(Punct::Bang)), ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(String::from("usr"))), + ScanToken::Token(Token::Id(Identifier::new("usr").unwrap())), ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(String::from("bin"))), + ScanToken::Token(Token::Id(Identifier::new("bin").unwrap())), ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(String::from("pspp"))), + ScanToken::Token(Token::Id(Identifier::new("pspp").unwrap())), ], ); } @@ -429,18 +429,18 @@ next command. 
ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("com"))), - ScanToken::Token(Token::Id(String::from("is"))), - ScanToken::Token(Token::Id(String::from("ambiguous"))), + ScanToken::Token(Token::Id(Identifier::new("com").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("is").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("ambiguous").unwrap())), ScanToken::Token(Token::Punct(Punct::With)), - ScanToken::Token(Token::Id(String::from("COMPUTE"))), + ScanToken::Token(Token::Id(Identifier::new("COMPUTE").unwrap())), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("next"))), - ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::Id(Identifier::new("next").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), ], @@ -463,18 +463,18 @@ second paragraph. "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("DOCUMENT"))), + ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())), ScanToken::Token(Token::String(String::from("DOCUMENT one line."))), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("DOCUMENT"))), + ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())), ScanToken::Token(Token::String(String::from("DOC more"))), ScanToken::Token(Token::String(String::from(" than"))), ScanToken::Token(Token::String(String::from(" one"))), ScanToken::Token(Token::String(String::from(" line."))), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("DOCUMENT"))), + ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())), ScanToken::Token(Token::String(String::from("docu"))), ScanToken::Token(Token::String(String::from("first.paragraph"))), ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))), @@ -498,16 +498,16 @@ FILE /* "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("FIL"))), - ScanToken::Token(Token::Id(String::from("label"))), + ScanToken::Token(Token::Id(Identifier::new("FIL").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("label").unwrap())), ScanToken::Token(Token::String(String::from("isn't quoted"))), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("FILE"))), - ScanToken::Token(Token::Id(String::from("lab"))), + ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())), ScanToken::Token(Token::String(String::from("is quoted"))), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("FILE"))), - ScanToken::Token(Token::Id(String::from("lab"))), + ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())), ScanToken::Token(Token::String(String::from("not quoted here either"))), ScanToken::Token(Token::EndCommand), ], @@ -531,22 +531,22 @@ end data "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("begin"))), - ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::Id(Identifier::new("begin").unwrap())), + 
ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::String(String::from("123"))), ScanToken::Token(Token::String(String::from("xxx"))), - ScanToken::Token(Token::Id(String::from("end"))), - ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("BEG"))), - ScanToken::Token(Token::Id(String::from("DAT"))), + ScanToken::Token(Token::Id(Identifier::new("BEG").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("DAT").unwrap())), ScanToken::Token(Token::String(String::from("5 6 7 /* x"))), ScanToken::Token(Token::String(String::from(""))), ScanToken::Token(Token::String(String::from("end data"))), - ScanToken::Token(Token::Id(String::from("end"))), - ScanToken::Token(Token::Id(String::from("data"))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -567,18 +567,18 @@ end "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("do"))), - ScanToken::Token(Token::Id(String::from("repeat"))), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("do").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Id(String::from("a"))), - ScanToken::Token(Token::Id(String::from("b"))), - ScanToken::Token(Token::Id(String::from("c"))), - ScanToken::Token(Token::Id(String::from("y"))), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("y").unwrap())), ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Id(String::from("d"))), - ScanToken::Token(Token::Id(String::from("e"))), - ScanToken::Token(Token::Id(String::from("f"))), + ScanToken::Token(Token::Id(Identifier::new("d").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("e").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("f").unwrap())), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::String(String::from(" do repeat a=1 thru 5."))), ScanToken::Token(Token::String(String::from("another command."))), @@ -587,8 +587,8 @@ end ScanToken::Token(Token::String(String::from( "end /* x */ /* y */ repeat print.", ))), - ScanToken::Token(Token::Id(String::from("end"))), - ScanToken::Token(Token::Id(String::from("repeat"))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -614,18 +614,18 @@ end repeat "#, Mode::Batch, &[ - ScanToken::Token(Token::Id(String::from("do"))), - ScanToken::Token(Token::Id(String::from("repeat"))), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("do").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Id(String::from("a"))), - 
ScanToken::Token(Token::Id(String::from("b"))), - ScanToken::Token(Token::Id(String::from("c"))), - ScanToken::Token(Token::Id(String::from("y"))), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("y").unwrap())), ScanToken::Token(Token::Punct(Punct::Equals)), - ScanToken::Token(Token::Id(String::from("d"))), - ScanToken::Token(Token::Id(String::from("e"))), - ScanToken::Token(Token::Id(String::from("f"))), + ScanToken::Token(Token::Id(Identifier::new("d").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("e").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("f").unwrap())), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))), ScanToken::Token(Token::String(String::from("another command"))), @@ -634,18 +634,18 @@ end repeat ScanToken::Token(Token::String(String::from( "end /* x */ /* y */ repeat print", ))), - ScanToken::Token(Token::Id(String::from("end"))), - ScanToken::Token(Token::Id(String::from("repeat"))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("do"))), - ScanToken::Token(Token::Id(String::from("repeat"))), - ScanToken::Token(Token::Id(String::from("#a"))), + ScanToken::Token(Token::Id(Identifier::new("do").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("#a").unwrap())), ScanToken::Token(Token::Punct(Punct::Equals)), ScanToken::Token(Token::Number(1.0)), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::String(String::from(" inner command"))), - ScanToken::Token(Token::Id(String::from("end"))), - ScanToken::Token(Token::Id(String::from("repeat"))), + ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), ], ); } @@ -663,36 +663,36 @@ fourth command. 
"#, Mode::Batch, &[ - ScanToken::Token(Token::Id(String::from("first"))), - ScanToken::Token(Token::Id(String::from("command"))), - ScanToken::Token(Token::Id(String::from("another"))), - ScanToken::Token(Token::Id(String::from("line"))), - ScanToken::Token(Token::Id(String::from("of"))), - ScanToken::Token(Token::Id(String::from("first"))), - ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::Id(Identifier::new("first").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("another").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("line").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("of").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("first").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("second"))), - ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::Id(Identifier::new("second").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("third"))), - ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::Id(Identifier::new("third").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("fourth"))), - ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::Id(Identifier::new("fourth").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("fifth"))), - ScanToken::Token(Token::Id(String::from("command"))), + ScanToken::Token(Token::Id(Identifier::new("fifth").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), ScanToken::Token(Token::EndCommand), ], ); } mod define { - use crate::lex::{ + use crate::{identifier::Identifier, lex::{ scan::ScanToken, segment::Mode, token::{Punct, Token}, - }; + }}; use super::check_scan; @@ -705,12 +705,12 @@ var1 var2 var3 "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from("var1 var2 var3"))), - ScanToken::Token(Token::Id(String::from("!enddefine"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -724,12 +724,12 @@ var1 var2 var3 "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from(" var1 var2 var3"))), - ScanToken::Token(Token::Id(String::from("!enddefine"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -743,12 +743,12 @@ var1 var2 var3!enddefine. 
"#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from("var1 var2 var3"))), - ScanToken::Token(Token::Id(String::from("!enddefine"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -761,12 +761,12 @@ var1 var2 var3!enddefine. "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from("var1 var2 var3"))), - ScanToken::Token(Token::Id(String::from("!enddefine"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -780,11 +780,11 @@ var1 var2 var3!enddefine. "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Id(String::from("!enddefine"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -800,13 +800,13 @@ var1 var2 var3!enddefine. "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from(""))), ScanToken::Token(Token::String(String::from(""))), - ScanToken::Token(Token::Id(String::from("!enddefine"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -820,22 +820,22 @@ var1 var2 var3!enddefine. "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Id(String::from("a"))), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(String::from("b"))), + ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(String::from("c"))), + ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Id(String::from("!enddefine"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -853,22 +853,22 @@ var1 var2 var3!enddefine. 
"#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Id(String::from("a"))), + ScanToken::Token(Token::Id(Identifier::new("a").unwrap())), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(String::from("b"))), + ScanToken::Token(Token::Id(Identifier::new("b").unwrap())), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(String::from("c"))), + ScanToken::Token(Token::Id(Identifier::new("c").unwrap())), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::Id(String::from("!enddefine"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -886,18 +886,18 @@ content 2 "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(String::from("y"))), + ScanToken::Token(Token::Id(Identifier::new("y").unwrap())), ScanToken::Token(Token::Punct(Punct::Comma)), - ScanToken::Token(Token::Id(String::from("z"))), + ScanToken::Token(Token::Id(Identifier::new("z").unwrap())), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from("content 1"))), ScanToken::Token(Token::String(String::from("content 2"))), - ScanToken::Token(Token::Id(String::from("!enddefine"))), + ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), ], ); @@ -911,13 +911,13 @@ data list /x 1. "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("data"))), - ScanToken::Token(Token::Id(String::from("list"))), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Number(1.0)), ScanToken::Token(Token::EndCommand), ], @@ -933,14 +933,14 @@ data list /x 1. 
"#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("data"))), - ScanToken::Token(Token::Id(String::from("list"))), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Number(1.0)), ScanToken::Token(Token::EndCommand), ], @@ -956,16 +956,16 @@ data list /x 1. "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("data"))), - ScanToken::Token(Token::Id(String::from("list"))), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Number(1.0)), ScanToken::Token(Token::EndCommand), ], @@ -982,13 +982,13 @@ data list /x 1. "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::Id(String::from("data"))), - ScanToken::Token(Token::Id(String::from("list"))), + ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), + ScanToken::Token(Token::Id(Identifier::new("list").unwrap())), ScanToken::Token(Token::Punct(Punct::Slash)), - ScanToken::Token(Token::Id(String::from("x"))), + ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Number(1.0)), ScanToken::Token(Token::EndCommand), ], @@ -1004,7 +1004,7 @@ content line 2 "#, Mode::Auto, &[ - ScanToken::Token(Token::Id(String::from("define"))), + ScanToken::Token(Token::Id(Identifier::new("define").unwrap())), ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), diff --git a/rust/src/lex/token.rs b/rust/src/lex/token.rs index 8467a7e927..2b59423b5f 100644 --- a/rust/src/lex/token.rs +++ b/rust/src/lex/token.rs @@ -1,12 +1,14 @@ use std::fmt::{Display, Formatter, Result as FmtResult}; +use crate::identifier::Identifier; + #[derive(Clone, Debug, PartialEq)] pub enum Token { /// End of input. End, /// Identifier. - Id(String), + Id(Identifier), /// Number. 
     Number(f64),
 
@@ -24,6 +26,15 @@ pub enum Token {
     Punct(Punct),
 }
 
+impl Token {
+    pub fn id(&self) -> Option<&Identifier> {
+        match self {
+            Self::Id(identifier) => Some(identifier),
+            _ => None,
+        }
+    }
+}
+
 fn is_printable(c: char) -> bool {
     !c.is_control() || ['\t', '\r', '\n'].contains(&c)
 }
@@ -88,9 +99,12 @@ mod test {
     #[test]
     fn test_string() {
         assert_eq!(Token::String(String::from("abc")).to_string(), "\"abc\"");
-        assert_eq!(Token::String(String::from("\u{0080}")).to_string(), "X\"C280\"");
+        assert_eq!(
+            Token::String(String::from("\u{0080}")).to_string(),
+            "X\"C280\""
+        );
     }
-    
+
     #[test]
     fn test_neg0() {
         assert_eq!(Token::Number(-0.0).to_string(), "-0");
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 9fb03e9cf6..46fe08622a 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -12,4 +12,4 @@ pub mod sack;
 pub mod lex;
 pub mod prompt;
 pub mod message;
-//pub mod macros;
+pub mod macros;
diff --git a/rust/src/macros.rs b/rust/src/macros.rs
index 33dac11112..883d1c1940 100644
--- a/rust/src/macros.rs
+++ b/rust/src/macros.rs
@@ -1,9 +1,22 @@
-use std::collections::HashMap;
+use std::{
+    cmp::Ordering,
+    collections::{BTreeMap, HashMap},
+    mem::take,
+};
 
+use num::Integer;
 use thiserror::Error as ThisError;
 use unicase::UniCase;
 
-use crate::{lex::token::Token, message::Location};
+use crate::{
+    identifier::Identifier,
+    lex::{
+        scan::{ScanError, ScanToken, StringScanner, StringSegmenter},
+        segment::Mode,
+        token::{Punct, Token},
+    },
+    message::Location,
+};
 
 #[derive(Clone, Debug, ThisError)]
 pub enum MacroError {
     #[error(
         "Reached end of command expecting {n} more tokens in argument {arg} to macro {macro_}."
     )]
     ExpectedMoreTokens {
         n: usize,
-        arg: String,
-        macro_: String,
+        arg: Identifier,
+        macro_: Identifier,
     },
 
     /// Expected a particular token at end of command.
     #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")]
     ExpectedToken {
         token: String,
-        arg: String,
-        macro_: String,
+        arg: Identifier,
+        macro_: Identifier,
     },
 
     /// Expected a particular token, got a different one.
     #[error(
         "Found `{actual}` while expecting `{expected}` reading argument {arg} to macro {macro_}."
     )]
     UnexpectedToken {
         actual: String,
         expected: String,
-        arg: String,
-        macro_: String,
+        arg: Identifier,
+        macro_: Identifier,
     },
+
+    /// Argument specified multiple times.
+    #[error("Argument {arg} specified multiple times in call to macro {macro_}.")]
+    DuplicateArg { arg: Identifier, macro_: Identifier },
+
+    /// Maximum nesting limit exceeded.
+    #[error("Maximum nesting level {limit} exceeded. (Use `SET MNEST` to change the limit.)")]
+    TooDeep { limit: usize },
+
+    /// Invalid `!*`.
+    #[error("`!*` may only be used within the expansion of a macro.")]
+    InvalidBangAsterisk,
+
+    /// Error tokenizing during expansion.
+    #[error(transparent)]
+    ScanError(ScanError),
+
+    /// Expecting `)` in macro expression.
+    #[error("Expecting `)` in macro expression.")]
+    ExpectingRParen,
+
+    /// Expecting literal.
+    #[error("Expecting literal or function invocation in macro expression.")]
+    ExpectingLiteral,
 }
 
 /// A PSPP macro as defined with `!DEFINE`.
 pub struct Macro {
     /// The macro's name.  This is an ordinary identifier except that it is
     /// allowed (but not required) to begin with `!`.
-    pub name: String,
+    pub name: Identifier,
 
     /// Source code location of macro definition, for error reporting.
     pub location: Location,
 
     /// Parameters.
     parameters: Vec<Parameter>,
 
     /// Body.
-    body: Vec<BodyToken>,
+    body: Vec<MacroToken>,
 }
 
 impl Macro {
-    fn initial_state(&self) -> MacroCallState {
+    fn initial_state(&self) -> ParserState {
         if self.parameters.is_empty() {
-            MacroCallState::Finished
+            ParserState::Finished
         } else if self.parameters[0].is_positional() {
-            MacroCallState::Keyword
+            ParserState::Keyword
         } else if let ValueType::Enclose(_, _) = self.parameters[0].arg {
-            MacroCallState::Enclose
+            ParserState::Enclose
         } else {
-            MacroCallState::Arg
+            ParserState::Arg
        }
    }
+
+    fn find_parameter(&self, name: &Identifier) -> Option<usize> {
+        self.parameters.iter().position(|param| &param.name == name)
+    }
 }
 
 struct Parameter {
-    /// `name` or `1`.
-    name: UniCase<String>,
+    /// `!name` or `!1`.
+    name: Identifier,
 
     /// Default value.
     ///
     /// The tokens don't include white space, etc. between them.
-    default: Vec<BodyToken>,
+    default: Vec<MacroToken>,
 
     /// Macro-expand the value?
     expand_value: bool,
@@ -87,7 +128,7 @@ impl Parameter {
     /// Returns true if this is a positional parameter.  Positional parameters
     /// are expanded by index (position) rather than by name.
     fn is_positional(&self) -> bool {
-        self.name.as_bytes()[1].is_ascii_digit()
+        self.name.0.as_bytes()[1].is_ascii_digit()
     }
 }
@@ -108,7 +149,7 @@ enum ValueType {
 /// A token and the syntax that was tokenized to produce it.  The syntax allows
 /// the token to be turned back into syntax accurately.
 #[derive(Clone)]
-struct BodyToken {
+pub struct MacroToken {
     /// The token.
     token: Token,
 
@@ -116,9 +157,224 @@
     syntax: String,
 }
 
+fn tokenize_string(s: &str, mode: Mode, output: &mut Vec<MacroToken>, error: &impl Fn(MacroError)) {
+    for (syntax, token) in StringSegmenter::new(s, mode, true) {
+        match token {
+            ScanToken::Token(token) => output.push(MacroToken {
+                token,
+                syntax: String::from(syntax),
+            }),
+            ScanToken::Error(scan_error) => error(MacroError::ScanError(scan_error)),
+        }
+    }
+}
+
+fn unquote_string(input: String, mode: Mode) -> String {
+    let mut scanner = StringScanner::new(&input, mode, true);
+    let Some(ScanToken::Token(Token::String(unquoted))) = scanner.next() else {
+        return input;
+    };
+    let None = scanner.next() else { return input };
+    return unquoted;
+}
+
+struct MacroTokens<'a>(&'a [MacroToken]);
+
+impl<'a> MacroTokens<'a> {
+    fn match_(&mut self, s: &str) -> bool {
+        if let Some((first, rest)) = self.0.split_first() {
+            if first.syntax.eq_ignore_ascii_case(s) {
+                self.0 = rest;
+                return true;
+            }
+        }
+        false
+    }
+    fn take_relop(&mut self) -> Option<RelOp> {
+        if let Some((first, rest)) = self.0.split_first() {
+            if let Ok(relop) = first.syntax.as_str().try_into() {
+                self.0 = rest;
+                return Some(relop);
+            }
+        }
+        None
+    }
+    fn advance(&mut self) -> &MacroToken {
+        let (first, rest) = self.0.split_first().unwrap();
+        self.0 = rest;
+        first
+    }
+}
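[Editorial sketch, not part of the patch: the intended `unquote_string` contract, assuming `Mode::Auto` and module-internal access to the private function.]

#[cfg(test)]
mod unquote_sketch {
    use super::*;

    #[test]
    fn unquote() {
        // A single quoted-string token is unwrapped to its content...
        assert_eq!(unquote_string(String::from("'a b'"), Mode::Auto), "a b");
        // ...while anything that is not exactly one string passes through.
        assert_eq!(unquote_string(String::from("a b"), Mode::Auto), "a b");
    }
}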
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+enum TokenClass {
+    /// No space before or after (new-line after).
+    EndCommand,
+
+    /// Space on both sides.
+    BinaryOperator,
+
+    /// Space afterward.
+    Comma,
+
+    /// Don't need spaces except sequentially.
+    Id,
+
+    /// Don't need spaces except sequentially.
+    Punct,
+}
+
+impl TokenClass {
+    fn needs_space(prev: Self, next: Self) -> bool {
+        match (prev, next) {
+            // Don't need a space before or after the end of a command.  (A
+            // new-line is needed afterward as a special case.)
+            (Self::EndCommand, _) | (_, Self::EndCommand) => false,
+
+            // Binary operators always have a space on both sides.
+            (Self::BinaryOperator, _) | (_, Self::BinaryOperator) => true,
+
+            // A comma always has a space afterward.
+            (Self::Comma, _) => true,
+
+            // Otherwise, `prev` is `Self::Id` or `Self::Punct`, which need a
+            // space only if there are two of the same class in a row.
+            _ => prev == next,
+        }
+    }
+}
+
+impl From<&Token> for TokenClass {
+    fn from(source: &Token) -> Self {
+        match source {
+            Token::End => Self::Punct,
+            Token::Id(_) | Token::Number(_) | Token::String(_) => Self::Id,
+            Token::EndCommand => Self::EndCommand,
+            Token::Punct(punct) => match punct {
+                Punct::LParen
+                | Punct::RParen
+                | Punct::LSquare
+                | Punct::RSquare
+                | Punct::LCurly
+                | Punct::RCurly => Self::Punct,
+
+                Punct::Plus
+                | Punct::Dash
+                | Punct::Asterisk
+                | Punct::Slash
+                | Punct::Equals
+                | Punct::Colon
+                | Punct::And
+                | Punct::Or
+                | Punct::Not
+                | Punct::Eq
+                | Punct::Ge
+                | Punct::Gt
+                | Punct::Le
+                | Punct::Lt
+                | Punct::Ne
+                | Punct::All
+                | Punct::By
+                | Punct::To
+                | Punct::With
+                | Punct::Exp
+                | Punct::Bang
+                | Punct::Percent
+                | Punct::Question
+                | Punct::Backtick
+                | Punct::Dot
+                | Punct::Underscore
+                | Punct::BangAsterisk => Self::BinaryOperator,
+
+                Punct::Comma | Punct::Semicolon => Self::Comma,
+            },
+        }
+    }
+}
+
+fn macro_tokens_to_syntax(input: &[MacroToken], output: &mut String) {
+    for (i, token) in input.iter().enumerate() {
+        if i > 0 {
+            let prev = &input[i - 1].token;
+            let next = &token.token;
+            if let Token::EndCommand = prev {
+                output.push('\n');
+            } else {
+                let prev_class: TokenClass = prev.into();
+                let next_class: TokenClass = next.into();
+                if TokenClass::needs_space(prev_class, next_class) {
+                    output.push(' ')
+                }
+            }
+        }
+        output.push_str(&token.syntax);
+    }
+}
+
+trait MacroId {
+    fn macro_id(&self) -> Option<&Identifier>;
+}
+
+impl MacroId for Token {
+    fn macro_id(&self) -> Option<&Identifier> {
+        let id = self.id()?;
+        id.0.starts_with('!').then_some(id)
+    }
+}
+
+enum RelOp {
+    Eq,
+    Ne,
+    Lt,
+    Gt,
+    Le,
+    Ge,
+}
+
+impl TryFrom<&str> for RelOp {
+    type Error = ();
+
+    fn try_from(source: &str) -> Result<Self, Self::Error> {
+        match source {
+            "=" => Ok(Self::Eq),
+            "~=" | "<>" => Ok(Self::Ne),
+            "<" => Ok(Self::Lt),
+            ">" => Ok(Self::Gt),
+            "<=" => Ok(Self::Le),
+            ">=" => Ok(Self::Ge),
+            _ if source.len() == 3 && source.as_bytes()[0] == b'!' => match (
+                source.as_bytes()[1].to_ascii_uppercase(),
+                source.as_bytes()[2].to_ascii_uppercase(),
+            ) {
+                (b'E', b'Q') => Ok(Self::Eq),
+                (b'N', b'E') => Ok(Self::Ne),
+                (b'L', b'T') => Ok(Self::Lt),
+                (b'L', b'E') => Ok(Self::Le),
+                (b'G', b'T') => Ok(Self::Gt),
+                (b'G', b'E') => Ok(Self::Ge),
+                _ => Err(()),
+            },
+            _ => Err(()),
+        }
+    }
+}
+
+impl RelOp {
+    fn evaluate(&self, cmp: Ordering) -> bool {
+        match self {
+            RelOp::Eq => cmp == Ordering::Equal,
+            RelOp::Ne => cmp != Ordering::Equal,
+            RelOp::Lt => cmp == Ordering::Less,
+            RelOp::Gt => cmp == Ordering::Greater,
+            RelOp::Le => cmp != Ordering::Greater,
+            RelOp::Ge => cmp != Ordering::Less,
+        }
+    }
+}
+
 type MacroSet = HashMap<UniCase<String>, Macro>;
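[Editorial sketch, not part of the patch: how the relational-operator table above is meant to behave once the `!EQ`/`!GT`-style spellings parse correctly; `RelOp` is module-private, so this assumes module-internal access.]

use std::cmp::Ordering;

fn relop_sketch() {
    // Symbolic and `!`-keyword spellings map to the same operators...
    let le = RelOp::try_from("<=").unwrap();
    let ge = RelOp::try_from("!ge").unwrap(); // keyword spellings are case-insensitive
    // ...and each evaluates against the Ordering of the two operands.
    assert!(le.evaluate(Ordering::Less));
    assert!(ge.evaluate(Ordering::Equal));
}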
-pub enum MacroCallState {
+enum ParserState {
     /// Accumulating tokens toward the end of any type of argument.
     Arg,
 
@@ -136,31 +392,26 @@ }
 
 /// Macro call parser FSM.
-pub struct MacroCall<'a> {
+pub struct Parser<'a> {
     macros: &'a MacroSet,
     macro_: &'a Macro,
-    state: MacroCallState,
-    args: Box<[Option<Vec<BodyToken>>]>,
+    state: ParserState,
+    args: Box<[Option<Vec<MacroToken>>]>,
     arg_index: usize,
 
     /// Length of macro call so far.
     n_tokens: usize,
 }
 
-impl<'a> MacroCall<'a> {
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum ParseStatus {
+    Complete,
+    Incomplete,
+}
+
+impl<'a> Parser<'a> {
     pub fn new(macros: &'a MacroSet, token: &Token) -> Option<Self> {
-        if macros.is_empty() {
-            return None;
-        }
-        let macro_name = match token {
-            Token::Id(s) => s.clone(),
-            _ => return None,
-        };
-        // XXX Unicase::new() is very expensive.  We probably need to define our
-        // own Unicase-alike that has a proper Borrow<> implementation.
-        let Some(macro_) = macros.get(&UniCase::new(macro_name)) else {
-            return None;
-        };
+        let macro_ = macros.get(&token.id()?.0)?;
         Some(Self {
             macros,
             macro_,
@@ -171,17 +422,17 @@ impl<'a> MacroCall<'a> {
         })
     }
 
-    fn finished(&mut self) -> Option<usize> {
-        self.state = MacroCallState::Finished;
+    fn finished(&mut self) {
+        self.state = ParserState::Finished;
         for (i, arg) in self.args.iter_mut().enumerate() {
             if arg.is_none() {
                 *arg = Some(self.macro_.parameters[i].default.clone());
             }
         }
-        Some(self.n_tokens)
     }
 
-    fn next_arg(&mut self) -> Option<usize> {
+    fn next_arg(&mut self) {
         if self.macro_.parameters.is_empty() {
             self.finished()
         } else {
@@ -193,31 +444,24 @@ impl<'a> MacroCall<'a> {
             } else {
                 let param = &self.macro_.parameters[self.arg_index];
                 self.state = if !param.is_positional() {
-                    MacroCallState::Keyword
+                    ParserState::Keyword
                 } else if let ValueType::Enclose(_, _) = param.arg {
-                    MacroCallState::Enclose
+                    ParserState::Enclose
                 } else {
-                    MacroCallState::Arg
+                    ParserState::Arg
                 };
-                None
             }
         } else {
             if self.args.iter().any(|arg| arg.is_none()) {
-                self.state = MacroCallState::Keyword;
-                None
+                self.state = ParserState::Keyword;
             } else {
-                self.finished()
+                self.finished();
             }
         }
     }
 }
 
-    fn push_arg(
-        &mut self,
-        token: &Token,
-        syntax: &str,
-        error: &impl Fn(MacroError),
-    ) -> Option<usize> {
+    fn push_arg(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
         let param = &self.macro_.parameters[self.args.len() - 1];
         if let Token::EndCommand | Token::End = token {
             if let Some(arg) = &self.args[self.arg_index] {
@@ -241,7 +485,7 @@ impl<'a> MacroCall<'a> {
                     }
                 }
             }
-            return self.finished();
+            return self.finished();
         }
 
         self.n_tokens += 1;
@@ -261,7 +505,7 @@ impl<'a> MacroCall<'a> {
             if true
             // !macro_expand_arg (&mt->token, mc->me, *argp)
             {
-                arg.push(BodyToken {
+                arg.push(MacroToken {
                     token: token.clone(),
                     syntax: String::from(syntax),
                 });
             }
         }
         if next_arg {
             self.next_arg()
-        } else {
-            None
         }
     }
 
-    fn push_enclose(
-        &mut self,
-        token: &Token,
-        syntax: &str,
-        error: &impl Fn(MacroError),
-    ) -> Option<usize> {
+    fn push_enclose(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
         let param = &self.macro_.parameters[self.arg_index];
         let ValueType::Enclose(start, _) = &param.arg else {
             unreachable!()
         };
         if token == start {
             self.n_tokens += 1;
             self.args[self.arg_index].get_or_insert(Vec::new());
-            self.state = MacroCallState::Arg;
-            None
+            self.state = ParserState::Arg;
         } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) {
-            self.finished()
+            self.finished();
         } else {
             error(MacroError::UnexpectedToken {
                 actual: String::from(syntax),
@@ -298,35 +534,55 @@ impl<'a> MacroCall<'a> {
                 arg: param.name.clone(),
                 macro_: self.macro_.name.clone(),
             });
-            self.finished()
+            self.finished();
         }
     }
 
-    fn push_keyword(
-        &mut self,
-        token: &Token,
-        syntax: &str,
-        error: &impl Fn(MacroError),
-    ) -> Option<usize> {
+    fn push_keyword(&mut self, token: &Token, _syntax: &str, error: &impl Fn(MacroError)) {
-        let Token::Id(id) = token else {
+        let Some(id) = token.id() else {
             return self.finished();
         };
-        let Some(arg_idx) = self
-            .macro_
-            .parameters
-            .iter()
-            .position(|param| param.name == UniCase::new(id))
-        else {};
+        let Some(arg_index) = self.macro_.find_parameter(id) else {
+            return self.finished();
+        };
+        self.arg_index = arg_index;
+        if self.args[arg_index].is_some() {
+            error(MacroError::DuplicateArg {
+                arg: id.clone(),
+                macro_: self.macro_.name.clone(),
+            });
+        }
+        self.args[arg_index] = Some(Vec::new());
+        self.n_tokens += 1;
+        // A keyword argument's name must be followed by `=`.
+        self.state = ParserState::Equals;
+    }
+
+    fn push_equals(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
+        let param = &self.macro_.parameters[self.arg_index];
+        if let Token::Punct(Punct::Eq) = token {
+            self.n_tokens += 1;
+            self.state = if let ValueType::Enclose(_, _) = param.arg {
+                ParserState::Enclose
+            } else {
+                ParserState::Arg
+            };
+        } else {
+            error(MacroError::UnexpectedToken {
+                actual: syntax.into(),
+                expected: String::from("="),
+                arg: param.name.clone(),
+                macro_: self.macro_.name.clone(),
+            });
+            self.finished()
+        }
+    }
 
     /// Adds `token`, which has the given `syntax`, to the collection of tokens
     /// in `self` that potentially need to be macro expanded.
     ///
     /// Returns [`ParseStatus::Incomplete`] if the macro call parser needs more
     /// tokens, for macro arguments or to decide whether this is actually a
     /// macro invocation.  The caller should call `push` again with the next
     /// token.
     ///
     /// Returns [`ParseStatus::Complete`] if the macro call is complete.  The
     /// caller should then call [`Self::finish`] and [`Call::expand`] to obtain
     /// the expansion.
     pub fn push(
         &mut self,
         token: &Token,
         syntax: &str,
         error: &impl Fn(MacroError),
-    ) -> Option<usize> {
+    ) -> ParseStatus {
         match self.state {
-            MacroCallState::Arg => self.push_arg(token, syntax, error),
-            MacroCallState::Enclose => self.push_enclose(token, syntax, error),
-            MacroCallState::Keyword => todo!(),
-            MacroCallState::Equals => todo!(),
-            MacroCallState::Finished => todo!(),
+            ParserState::Arg => self.push_arg(token, syntax, error),
+            ParserState::Enclose => self.push_enclose(token, syntax, error),
+            ParserState::Keyword => self.push_keyword(token, syntax, error),
+            ParserState::Equals => self.push_equals(token, syntax, error),
+            ParserState::Finished => (),
         }
+        if let ParserState::Finished = self.state {
+            ParseStatus::Complete
+        } else {
+            ParseStatus::Incomplete
+        }
     }
+
+    pub fn finish(self) -> Call<'a> {
+        let ParserState::Finished = self.state else {
+            panic!()
+        };
+        Call(self)
+    }
 }
+
+/// Expansion stack entry.
+struct Frame {
+    /// A macro name or `!IF`, `!DO`, etc.
+    name: Option<Identifier>,
+
+    /// Source location, if available.
+    location: Option<Location>,
+}
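[Editorial sketch, not part of the patch: the parser FSM contract in one place. This is essentially what `Call::for_tokens` below does; it assumes module-internal access to `MacroToken` fields.]

fn parse_call<'a>(
    macros: &'a MacroSet,
    tokens: &[MacroToken],
    on_error: &impl Fn(MacroError),
) -> Option<Call<'a>> {
    // `new` returns `None` unless the first token names a known macro.
    let mut parser = Parser::new(macros, &tokens.first()?.token)?;
    for mt in &tokens[1..] {
        // Feed (token, syntax) pairs until the parser reports `Complete`...
        if parser.push(&mt.token, &mt.syntax, on_error) == ParseStatus::Complete {
            // ...then convert the finished parser into a `Call` for expansion.
            return Some(parser.finish());
        }
    }
    None
}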
+struct Expander<'a> {
+    /// Macros to expand recursively.
+    macros: &'a MacroSet,
+
+    /// Error reporting callback.
+    error: &'a Box<dyn Fn(MacroError) + 'a>,
+
+    /// Tokenization mode.
+    mode: Mode,
+
+    /// Remaining nesting levels.
+    nesting_countdown: usize,
+
+    /// Stack for error reporting.
+    stack: Vec<Frame>,
+
+    /// May macro calls be expanded?
+    expand: Option<&'a bool>,
+
+    /// Variables from `!DO` and `!LET`.
+    vars: &'a BTreeMap<Identifier, String>,
+
+    /// Only set if inside a `!DO` loop.  If true, break out of the loop.
+    break_: Option<&'a mut bool>,
+
+    /// Only set if expanding a macro (and not, say, a macro argument).
+    macro_: Option<&'a Macro>,
+
+    /// Only set if expanding a macro (and not, say, a macro argument).
+    args: Option<&'a [Option<Vec<MacroToken>>]>,
+}
+
+fn bool_to_string(b: bool) -> String {
+    if b {
+        String::from("1")
+    } else {
+        String::from("0")
+    }
+}
+
+impl<'a> Expander<'a> {
+    fn may_expand(&self) -> bool {
+        self.expand.map(|b| *b).unwrap_or(false)
+    }
+
+    fn should_break(&self) -> bool {
+        self.break_.as_ref().map(|b| **b).unwrap_or(false)
+    }
+
+    fn expand(&mut self, input: &[MacroToken], output: &mut Vec<MacroToken>) {
+        if self.nesting_countdown == 0 {
+            (self.error)(MacroError::TooDeep { limit: MNEST });
+            output.extend(input.iter().cloned());
+        } else {
+            let mut i = 0;
+            while i < input.len() && !self.should_break() {
+                let consumed = self.expand__(&input[i..], output);
+                debug_assert!(consumed > 0);
+                i += consumed;
+            }
+        }
+    }
+
+    fn expand_arg(&mut self, param_idx: usize, output: &mut Vec<MacroToken>) {
+        let param = &self.macro_.unwrap().parameters[param_idx];
+        let arg = &self.args.unwrap()[param_idx].as_ref().unwrap();
+        if self.may_expand() && param.expand_value {
+            let vars = BTreeMap::new();
+            let mut stack = take(&mut self.stack);
+            stack.push(Frame {
+                name: Some(param.name.clone()),
+                location: None,
+            });
+            let mut subexpander = Expander {
+                stack,
+                vars: &vars,
+                break_: None,
+                macro_: None,
+                args: None,
+                ..*self
+            };
+            subexpander.expand(&arg, output);
+            self.stack = subexpander.stack;
+            self.stack.pop();
+        } else {
+            output.extend(arg.iter().cloned());
+        }
+    }
+
+    /// Parses one function argument from `input`.  Each argument to a macro
+    /// function is one of:
+    ///
+    /// - A quoted string or other single literal token.
+    ///
+    /// - An argument to the macro being expanded, e.g. `!1` or a named
+    ///   argument.
+    ///
+    /// - `!*`.
+    ///
+    /// - A function invocation.
+    ///
+    /// Each function invocation yields a character sequence to be turned into
+    /// a sequence of tokens.  The case where that character sequence is a
+    /// single quoted string is an important special case.
+    fn parse_function_arg(&mut self, input: &mut MacroTokens) -> Option<String> {
+        if let Some(macro_) = self.macro_ {
+            match &input.0.get(0)?.token {
+                Token::Id(id) if id.0.starts_with('!') => {
+                    if let Some(param_idx) = macro_.find_parameter(id) {
+                        let mut s = String::new();
+                        macro_tokens_to_syntax(
+                            self.args.unwrap()[param_idx].as_ref().unwrap(),
+                            &mut s,
+                        );
+                        input.advance();
+                        return Some(s);
+                    }
+                    if let Some(value) = self.vars.get(id) {
+                        return Some(value.clone());
+                    }
+
+                    todo!() // expand macro function
+                }
+                Token::Punct(Punct::BangAsterisk) => {
+                    let mut arg = String::new();
+                    for i in 0..macro_.parameters.len() {
+                        if !macro_.parameters[i].is_positional() {
+                            break;
+                        }
+                        if i > 0 {
+                            arg.push(' ')
+                        }
+                        macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap(), &mut arg);
+                    }
+                    input.advance();
+                    return Some(arg);
+                }
+                _ => (),
+            }
+        }
+        Some(input.advance().syntax.clone())
+    }
+
+    fn evaluate_literal(&mut self, input: &mut MacroTokens) -> Option<String> {
+        if input.match_("(") {
+            let value = self.evaluate_or(input)?;
+            if input.match_(")") {
+                Some(value)
+            } else {
+                (self.error)(MacroError::ExpectingRParen);
+                None
+            }
+        } else if input.match_(")") {
+            (self.error)(MacroError::ExpectingLiteral);
+            None
+        } else {
+            Some(unquote_string(self.parse_function_arg(input)?, self.mode))
+        }
+    }
+
+    fn evaluate_relational(&mut self, input: &mut MacroTokens) -> Option<String> {
+        let lhs = self.evaluate_literal(input)?;
+        let Some(relop) = input.take_relop() else {
+            return Some(lhs);
+        };
+        let rhs = self.evaluate_literal(input)?;
+        let cmp = unquote_string(lhs, self.mode).cmp(&unquote_string(rhs, self.mode));
+        Some(bool_to_string(relop.evaluate(cmp)))
+    }
+
+    fn evaluate_not(&mut self, input: &mut MacroTokens) -> Option<String> {
+        let mut negations = 0;
+        while input.match_("!NOT") || input.match_("~") {
+            negations += 1;
+        }
+
+        let operand = self.evaluate_relational(input)?;
+        if negations == 0 {
+            return Some(operand);
+        }
+
+        let mut b = operand != "0";
+        if negations.is_odd() {
+            b = !b;
+        }
+        Some(bool_to_string(b))
+    }
+
+    fn evaluate_and(&mut self, input: &mut MacroTokens) -> Option<String> {
+        let mut lhs = self.evaluate_not(input)?;
+        while input.match_("!AND") || input.match_("&") {
+            let rhs = self.evaluate_not(input)?;
+            lhs = bool_to_string(lhs != "0" && rhs != "0");
+        }
+        Some(lhs)
+    }
+
+    fn evaluate_or(&mut self, input: &mut MacroTokens) -> Option<String> {
+        let mut lhs = self.evaluate_and(input)?;
+        while input.match_("!OR") || input.match_("|") {
+            let rhs = self.evaluate_and(input)?;
+            lhs = bool_to_string(lhs != "0" || rhs != "0");
+        }
+        Some(lhs)
+    }
+
+    fn evaluate_expression(&mut self, input: &[MacroToken]) -> Option<String> {
+        let mut tokens = MacroTokens(input);
+        self.evaluate_or(&mut tokens)
+    }
+
+    fn expand_if(&mut self, input: &[MacroToken], output: &mut Vec<MacroToken>) -> usize {
+        self.evaluate_expression(input);
+        todo!()
+    }
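[Editorial sketch, not part of the patch: a standalone model of the value convention the `evaluate_*` chain above relies on. Every subexpression is a `String`, `"0"` is false, and anything else is true, so `!OR`, `!AND`, and `!NOT` reduce to string comparisons against `"0"`. Assumes module-internal access to `bool_to_string` and `RelOp`.]

use std::cmp::Ordering as Ord_;

fn truthiness_sketch() {
    let truthy = |s: &str| s != "0";

    // `1 !GT 2` produces the relational result "0"...
    let relational = bool_to_string(RelOp::Gt.evaluate(1.cmp(&2)));
    assert_eq!(relational, "0");

    // ...`!NOT` flips it, and `!AND` combines the string booleans,
    // mirroring the evaluate_or -> evaluate_and -> evaluate_not ->
    // evaluate_relational precedence chain.
    let negated = bool_to_string(!truthy(&relational));
    let combined = bool_to_string(truthy(&negated) && truthy("1"));
    assert_eq!(combined, "1");
    let _ = Ord_::Equal; // precedence demo only; no other use of Ordering here
}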
+    fn expand__(&mut self, input: &[MacroToken], output: &mut Vec<MacroToken>) -> usize {
+        // Recursive macro calls.
+        if self.may_expand() {
+            if let Some(call) = Call::for_tokens(self.macros, input, &self.error) {
+                let vars = BTreeMap::new();
+                let mut stack = take(&mut self.stack);
+                stack.push(Frame {
+                    name: Some(call.0.macro_.name.clone()),
+                    location: Some(call.0.macro_.location.clone()),
+                });
+                let mut subexpander = Expander {
+                    break_: None,
+                    vars: &vars,
+                    nesting_countdown: self.nesting_countdown.saturating_sub(1),
+                    stack,
+                    macro_: Some(call.0.macro_),
+                    args: Some(&call.0.args),
+                    ..*self
+                };
+                subexpander.expand(&call.0.macro_.body, output);
+                self.stack = subexpander.stack;
+                self.stack.pop();
+                return call.0.n_tokens;
+            }
+        }
+
+        // Only identifiers beginning with `!` receive further processing.
+        let id = match &input[0].token {
+            Token::Id(id) if id.0.starts_with('!') => id,
+            Token::Punct(Punct::BangAsterisk) => {
+                if let Some(macro_) = self.macro_ {
+                    for i in 0..macro_.parameters.len() {
+                        self.expand_arg(i, output);
+                    }
+                } else {
+                    (self.error)(MacroError::InvalidBangAsterisk);
+                }
+                return 1;
+            }
+            _ => {
+                output.push(input[0].clone());
+                return 1;
+            }
+        };
+
+        // Macro arguments.
+        if let Some(macro_) = self.macro_ {
+            if let Some(param_idx) = macro_.find_parameter(id) {
+                self.expand_arg(param_idx, output);
+                return 1;
+            }
+        }
+
+        // Variables set by `!DO` or `!LET`.
+        if let Some(value) = self.vars.get(id) {
+            tokenize_string(value.as_str(), self.mode, output, &self.error);
+            return 1;
+        }
+
+        // XXX Macro functions.
+        if id == "!IF" {
+            let n = self.expand_if(&input[1..], output);
+            if n > 0 {
+                return n;
+            }
+        }
+
+        todo!()
+    }
+}
+
+pub struct Call<'a>(Parser<'a>);
+
+impl<'a> Call<'a> {
+    pub fn for_tokens<F>(macros: &'a MacroSet, tokens: &[MacroToken], error: &F) -> Option<Self>
+    where
+        F: Fn(MacroError),
+    {
+        let mut parser = Parser::new(macros, &tokens.get(0)?.token)?;
+        for token in tokens[1..].iter().chain(&[MacroToken {
+            token: Token::EndCommand,
+            syntax: String::from(""),
+        }]) {
+            if parser.push(&token.token, &token.syntax, error) == ParseStatus::Complete {
+                return Some(parser.finish());
+            }
+        }
+        return None;
+    }
+
+    pub fn expand<F>(&self, mode: Mode, call_loc: Location, output: &mut Vec<MacroToken>, error: F)
+    where
+        F: Fn(MacroError) + 'a,
+    {
+        let error: Box<dyn Fn(MacroError) + 'a> = Box::new(error);
+        let vars = BTreeMap::new();
+        let mut me = Expander {
+            macros: self.0.macros,
+            error: &error,
+            macro_: Some(self.0.macro_),
+            args: Some(&self.0.args),
+            mode,
+            nesting_countdown: MNEST,
+            stack: vec![
+                Frame {
+                    name: None,
+                    location: Some(call_loc),
+                },
+                Frame {
+                    name: Some(self.0.macro_.name.clone()),
+                    location: Some(self.0.macro_.location.clone()),
+                },
+            ],
+            vars: &vars,
+            break_: None,
+            expand: None,
+        };
+        me.expand(&self.0.macro_.body, output);
+    }
+}
+
+const MNEST: usize = 50;
diff --git a/src/language/lexer/macro.c b/src/language/lexer/macro.c
index 9536727a0f..d7bc1b611f 100644
--- a/src/language/lexer/macro.c
+++ b/src/language/lexer/macro.c
@@ -357,19 +357,10 @@ classify_token (enum token_type type)
     NOT_REACHED ();
 }
 
-/* Appends syntax for the tokens in MTS to S.  If OFS and LEN are nonnull, sets
-   OFS[i] to the offset within S of the start of token 'i' in MTS and LEN[i] to
-   its length.  OFS[i] + LEN[i] is not necessarily OFS[i + 1] because some
-   tokens are separated by white space.  */
+/* Appends syntax for the tokens in MTS to S. 
*/ void -macro_tokens_to_syntax (struct macro_tokens *mts, struct string *s, - size_t *ofs, size_t *len) +macro_tokens_to_syntax (struct macro_tokens *mts, struct string *s) { - assert ((ofs != NULL) == (len != NULL)); - - if (!mts->n) - return; - for (size_t i = 0; i < mts->n; i++) { if (i > 0) @@ -388,11 +379,7 @@ macro_tokens_to_syntax (struct macro_tokens *mts, struct string *s, } } - if (ofs) - ofs[i] = s->ss.length; macro_token_to_syntax (&mts->mts[i], s); - if (len) - len[i] = s->ss.length - ofs[i]; } } @@ -938,7 +925,7 @@ parse_function_arg (const struct macro_expander *me, if (param) { size_t param_idx = param - me->macro->params; - macro_tokens_to_syntax (me->args[param_idx], farg, NULL, NULL); + macro_tokens_to_syntax (me->args[param_idx], farg); return 1; } @@ -950,7 +937,7 @@ parse_function_arg (const struct macro_expander *me, break; if (i) ds_put_byte (farg, ' '); - macro_tokens_to_syntax (me->args[i], farg, NULL, NULL); + macro_tokens_to_syntax (me->args[i], farg); } return 1; } @@ -1267,7 +1254,7 @@ expand_macro_function (const struct macro_expander *me, if (mts.n > 1) { struct macro_tokens tail = { .mts = mts.mts + 1, .n = mts.n - 1 }; - macro_tokens_to_syntax (&tail, output, NULL, NULL); + macro_tokens_to_syntax (&tail, output); } macro_tokens_uninit (&mts); ds_destroy (&tmp); @@ -1306,7 +1293,7 @@ expand_macro_function (const struct macro_expander *me, subme.stack = &stack; macro_expand (mts.mts, mts.n, &subme, &exp); - macro_tokens_to_syntax (&exp, output, NULL, NULL); + macro_tokens_to_syntax (&exp, output); macro_tokens_uninit (&exp); macro_tokens_uninit (&mts); } -- 2.30.2
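[Editorial sketch, not part of the patch: an end-to-end use of the new Rust entry points, for symmetry with the C side above. Assumes module-internal access; error reporting via `eprintln!` is illustrative only.]

fn try_expand(
    macros: &MacroSet,
    tokens: &[MacroToken],
    call_loc: Location,
) -> Option<Vec<MacroToken>> {
    // Recognize a macro call at the start of `tokens`...
    let call = Call::for_tokens(macros, tokens, &|e| eprintln!("{e}"))?;
    // ...then expand it, collecting the expansion and reporting errors.
    let mut expansion = Vec::new();
    call.expand(Mode::Auto, call_loc, &mut expansion, |e| eprintln!("{e}"));
    Some(expansion)
}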