From: Ben Pfaff Date: Wed, 17 Jul 2024 20:57:10 +0000 (-0700) Subject: Stop distinguishing macro tokens from other tokens. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bab3aad7fa208aca031e3a475642375c88da9b07;p=pspp Stop distinguishing macro tokens from other tokens. --- diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs index 3be08083ca..bfb4331991 100644 --- a/rust/src/identifier.rs +++ b/rust/src/identifier.rs @@ -30,7 +30,7 @@ pub trait IdentifierChar { impl IdentifierChar for char { fn ascii_may_start_id(self) -> bool { - matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$') + matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$' | '!') } fn may_start_id(self) -> bool { @@ -39,13 +39,12 @@ impl IdentifierChar for char { } else { use MajorCategory::*; - ([L, M, S].contains(&self.get_major_category()) || "@#$".contains(self)) - && self != char::REPLACEMENT_CHARACTER + [L, M, S].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER } } fn ascii_may_continue_id(self) -> bool { - self.ascii_may_start_id() || matches!(self, '0'..='9' | '.' | '_') + matches!(self, 'a'..='z' | 'A'..='Z' | '0'..='9' | '@' | '#' | '$' | '.' | '_') } fn may_continue_id(self) -> bool { @@ -54,8 +53,7 @@ impl IdentifierChar for char { } else { use MajorCategory::*; - ([L, M, S, N].contains(&self.get_major_category()) || "@#$._".contains(self)) - && self != char::REPLACEMENT_CHARACTER + [L, M, S, N].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER } } } @@ -68,6 +66,9 @@ pub enum Error { #[error("\"{0}\" may not be used as an identifier because it is a reserved word.")] Reserved(String), + #[error("\"!\" is not a valid identifier.")] + Bang, + #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")] BadFirstCharacter(String, char), @@ -90,15 +91,58 @@ pub enum Error { }, } -pub fn is_reserved_word(s: &str) -> bool { - for word in [ - "and", "or", "not", "eq", "ge", "gt", "le", "lt", "ne", "all", "by", "to", "with", - ] { - if s.eq_ignore_ascii_case(word) { - return true; +pub enum ReservedWord { + And, + Or, + Not, + Eq, + Ge, + Gt, + Le, + Lt, + Ne, + All, + By, + To, + With, +} + +impl TryFrom<&str> for ReservedWord { + type Error = (); + + fn try_from(source: &str) -> Result { + if !(2..=4).contains(&source.len()) { + Err(()) + } else { + let b = source.as_bytes(); + let c0 = b[0].to_ascii_uppercase(); + let c1 = b[1].to_ascii_uppercase(); + match (source.len(), c0, c1) { + (2, b'B', b'Y') => Ok(Self::By), + (2, b'E', b'Q') => Ok(Self::Eq), + (2, b'G', b'T') => Ok(Self::Gt), + (2, b'G', b'E') => Ok(Self::Ge), + (2, b'L', b'T') => Ok(Self::Lt), + (2, b'L', b'E') => Ok(Self::Le), + (2, b'N', b'E') => Ok(Self::Ne), + (3, b'N', b'O') if b[2].to_ascii_uppercase() == b'T' => Ok(Self::Not), + (2, b'O', b'R') => Ok(Self::Or), + (2, b'T', b'O') => Ok(Self::To), + (3, b'A', b'L') if b[2].to_ascii_uppercase() == b'L' => Ok(Self::All), + (3, b'A', b'N') if b[2].to_ascii_uppercase() == b'D' => Ok(Self::And), + (4, b'W', b'I') + if b[2].to_ascii_uppercase() == b'T' && b[3].to_ascii_uppercase() == b'H' => + { + Ok(Self::With) + } + _ => Err(()), + } } } - false +} + +pub fn is_reserved_word(s: &str) -> bool { + ReservedWord::try_from(s).is_ok() } #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] @@ -160,6 +204,9 @@ impl Identifier { if is_reserved_word(s) { return Err(Error::Reserved(s.into())); } + if s == "!" 
{ + return Err(Error::Bang); + } let mut i = s.chars(); let first = i.next().unwrap(); diff --git a/rust/src/lex/scan/mod.rs b/rust/src/lex/scan/mod.rs index cbeac4095d..e099fcd402 100644 --- a/rust/src/lex/scan/mod.rs +++ b/rust/src/lex/scan/mod.rs @@ -10,9 +10,11 @@ //! are the same as the tokens used by the PSPP parser with a few additional //! types. +use crate::identifier::ReservedWord; + use super::{ segment::{Mode, Segment, Segmenter}, - token::{MacroToken, Punct, Token}, + token::{Punct, Token}, }; use std::collections::VecDeque; use thiserror::Error as ThisError; @@ -150,27 +152,26 @@ impl ScanToken { | Segment::MacroBody | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))), - Segment::ReservedWord => { - let c0 = s.as_bytes()[0].to_ascii_uppercase(); - let c1 = s.as_bytes()[1].to_ascii_uppercase(); - match (c0, c1) { - (b'B', _) => Some(Self::Token(Token::Punct(Punct::By))), - (b'E', _) => Some(Self::Token(Token::Punct(Punct::Eq))), - (b'G', b'T') => Some(Self::Token(Token::Punct(Punct::Gt))), - (b'G', _) => Some(Self::Token(Token::Punct(Punct::Ge))), - (b'L', b'T') => Some(Self::Token(Token::Punct(Punct::Lt))), - (b'L', _) => Some(Self::Token(Token::Punct(Punct::Le))), - (b'N', b'E') => Some(Self::Token(Token::Punct(Punct::Ne))), - (b'N', _) => Some(Self::Token(Token::Punct(Punct::Not))), - (b'O', _) => Some(Self::Token(Token::Punct(Punct::Or))), - (b'T', _) => Some(Self::Token(Token::Punct(Punct::To))), - (b'A', b'L') => Some(Self::Token(Token::Punct(Punct::All))), - (b'A', _) => Some(Self::Token(Token::Punct(Punct::And))), - (b'W', _) => Some(Self::Token(Token::Punct(Punct::With))), - _ => unreachable!(), - } + Segment::Identifier => { + if let Ok(reserved_word) = ReservedWord::try_from(s) { + match reserved_word { + ReservedWord::And => Some(Self::Token(Token::Punct(Punct::And))), + ReservedWord::Or => Some(Self::Token(Token::Punct(Punct::Or))), + ReservedWord::Not => Some(Self::Token(Token::Punct(Punct::Not))), + ReservedWord::Eq => Some(Self::Token(Token::Punct(Punct::Eq))), + ReservedWord::Ge => Some(Self::Token(Token::Punct(Punct::Ge))), + ReservedWord::Gt => Some(Self::Token(Token::Punct(Punct::Gt))), + ReservedWord::Le => Some(Self::Token(Token::Punct(Punct::Le))), + ReservedWord::Lt => Some(Self::Token(Token::Punct(Punct::Lt))), + ReservedWord::Ne => Some(Self::Token(Token::Punct(Punct::Ne))), + ReservedWord::All => Some(Self::Token(Token::Punct(Punct::All))), + ReservedWord::By => Some(Self::Token(Token::Punct(Punct::By))), + ReservedWord::To => Some(Self::Token(Token::Punct(Punct::To))), + ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))), + } + } else { + Some(Self::Token(Token::Id(String::from(s))))} } - Segment::Identifier => Some(Self::Token(Token::Id(String::from(s)))), Segment::Punct => match s { "(" => Some(Self::Token(Token::Punct(Punct::LParen))), ")" => Some(Self::Token(Token::Punct(Punct::RParen))), @@ -196,12 +197,13 @@ impl ScanToken { "<>" => Some(Self::Token(Token::Punct(Punct::Ne))), "~=" => Some(Self::Token(Token::Punct(Punct::Ne))), ">=" => Some(Self::Token(Token::Punct(Punct::Ge))), - "!" => Some(Self::Token(Token::MacroToken(MacroToken::Bang))), - "%" => Some(Self::Token(Token::MacroToken(MacroToken::Percent))), - "?" => Some(Self::Token(Token::MacroToken(MacroToken::Question))), - "`" => Some(Self::Token(Token::MacroToken(MacroToken::Backtick))), - "_" => Some(Self::Token(Token::MacroToken(MacroToken::Underscore))), - "." => Some(Self::Token(Token::MacroToken(MacroToken::Dot))), + "!" 
=> Some(Self::Token(Token::Punct(Punct::Bang))), + "%" => Some(Self::Token(Token::Punct(Punct::Percent))), + "?" => Some(Self::Token(Token::Punct(Punct::Question))), + "`" => Some(Self::Token(Token::Punct(Punct::Backtick))), + "_" =>Some(Self::Token(Token::Punct(Punct::Underscore))), + "." =>Some(Self::Token(Token::Punct(Punct::Dot))), + "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))), _ => unreachable!("bad punctuator {s:?}"), }, Segment::Shbang @@ -210,9 +212,6 @@ impl ScanToken { | Segment::Newline | Segment::CommentCommand => None, Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)), - Segment::MacroId => Some(Self::Token(Token::MacroToken(MacroToken::MacroId( - String::from(s), - )))), Segment::StartDocument => Some(Self::Token(Token::Id(String::from("DOCUMENT")))), Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => { Some(Self::Token(Token::EndCommand)) diff --git a/rust/src/lex/scan/test.rs b/rust/src/lex/scan/test.rs index d009131f63..6affeac0e4 100644 --- a/rust/src/lex/scan/test.rs +++ b/rust/src/lex/scan/test.rs @@ -1,6 +1,6 @@ use crate::lex::{ segment::Mode, - token::{MacroToken, Punct, Token}, + token::{Punct, Token}, }; use super::{ScanError, ScanToken, StringLexer}; @@ -13,10 +13,6 @@ fn print_token(token: &Token) { Token::String(s) => print!("Token::String(String::from({s:?}))"), Token::EndCommand => print!("Token::EndCommand"), Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"), - Token::MacroToken(MacroToken::MacroId(id)) => { - print!("Token::MacroToken(MacroToken::MacroId(String::from({id:?})))") - } - Token::MacroToken(m) => print!("Token::MacroToken(MacroToken::{m:?})"), } } @@ -67,16 +63,14 @@ WXYZ. /* unterminated end of line comment ScanToken::Token(Token::Id(String::from("$x"))), ScanToken::Token(Token::Id(String::from("@efg"))), ScanToken::Token(Token::Id(String::from("@@."))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!abcd", - )))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!*")))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!*")))), + ScanToken::Token(Token::Id(String::from("!abcd"))), + ScanToken::Token(Token::Punct(Punct::BangAsterisk)), + ScanToken::Token(Token::Punct(Punct::BangAsterisk)), ScanToken::Token(Token::Id(String::from("a"))), ScanToken::Token(Token::Id(String::from("#.#"))), - ScanToken::Token(Token::MacroToken(MacroToken::Dot)), + ScanToken::Token(Token::Punct(Punct::Dot)), ScanToken::Token(Token::Id(String::from("x"))), - ScanToken::Token(Token::MacroToken(MacroToken::Underscore)), + ScanToken::Token(Token::Punct(Punct::Underscore)), ScanToken::Token(Token::Id(String::from("z"))), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::Id(String::from("abcd."))), @@ -199,12 +193,12 @@ fn test_punctuation() { ScanToken::Token(Token::Punct(Punct::LSquare)), ScanToken::Token(Token::Punct(Punct::RSquare)), ScanToken::Token(Token::Punct(Punct::Exp)), - ScanToken::Token(Token::MacroToken(MacroToken::Percent)), + ScanToken::Token(Token::Punct(Punct::Percent)), ScanToken::Token(Token::Punct(Punct::Colon)), ScanToken::Token(Token::Punct(Punct::Semicolon)), - ScanToken::Token(Token::MacroToken(MacroToken::Question)), - ScanToken::Token(Token::MacroToken(MacroToken::Underscore)), - ScanToken::Token(Token::MacroToken(MacroToken::Backtick)), + ScanToken::Token(Token::Punct(Punct::Question)), + ScanToken::Token(Token::Punct(Punct::Underscore)), + 
ScanToken::Token(Token::Punct(Punct::Backtick)), ScanToken::Token(Token::Punct(Punct::LCurly)), ScanToken::Token(Token::Punct(Punct::RCurly)), ScanToken::Token(Token::Punct(Punct::Not)), @@ -306,7 +300,7 @@ fn test_negative_numbers() { ScanToken::Token(Token::Number(-0.0112)), ScanToken::Token(Token::Number(-1.0)), ScanToken::Token(Token::Punct(Punct::Dash)), - ScanToken::Token(Token::MacroToken(MacroToken::Dot)), + ScanToken::Token(Token::Punct(Punct::Dot)), ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))), ScanToken::Token(Token::Punct(Punct::Dash)), ScanToken::Token(Token::Id(String::from("e1"))), @@ -395,7 +389,7 @@ fn test_shbang() { Mode::Auto, &[ ScanToken::Token(Token::Id(String::from("#"))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!")))), + ScanToken::Token(Token::Punct(Punct::Bang)), ScanToken::Token(Token::Punct(Punct::Slash)), ScanToken::Token(Token::Id(String::from("usr"))), ScanToken::Token(Token::Punct(Punct::Slash)), @@ -697,7 +691,7 @@ mod define { use crate::lex::{ scan::ScanToken, segment::Mode, - token::{MacroToken, Punct, Token}, + token::{Punct, Token}, }; use super::check_scan; @@ -716,9 +710,7 @@ var1 var2 var3 ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from("var1 var2 var3"))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!enddefine", - )))), + ScanToken::Token(Token::Id(String::from("!enddefine"))), ScanToken::Token(Token::EndCommand), ], ); @@ -737,9 +729,7 @@ var1 var2 var3 ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from(" var1 var2 var3"))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!enddefine", - )))), + ScanToken::Token(Token::Id(String::from("!enddefine"))), ScanToken::Token(Token::EndCommand), ], ); @@ -758,9 +748,7 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from("var1 var2 var3"))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!enddefine", - )))), + ScanToken::Token(Token::Id(String::from("!enddefine"))), ScanToken::Token(Token::EndCommand), ], ); @@ -778,9 +766,7 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from("var1 var2 var3"))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!enddefine", - )))), + ScanToken::Token(Token::Id(String::from("!enddefine"))), ScanToken::Token(Token::EndCommand), ], ); @@ -798,9 +784,7 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::String(String::from("!macro1"))), ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!enddefine", - )))), + ScanToken::Token(Token::Id(String::from("!enddefine"))), ScanToken::Token(Token::EndCommand), ], ); @@ -822,9 +806,7 @@ var1 var2 var3!enddefine. 
ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from(""))), ScanToken::Token(Token::String(String::from(""))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!enddefine", - )))), + ScanToken::Token(Token::Id(String::from("!enddefine"))), ScanToken::Token(Token::EndCommand), ], ); @@ -853,9 +835,7 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!enddefine", - )))), + ScanToken::Token(Token::Id(String::from("!enddefine"))), ScanToken::Token(Token::EndCommand), ], ); @@ -888,9 +868,7 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::Punct(Punct::LParen)), ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Punct(Punct::RParen)), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!enddefine", - )))), + ScanToken::Token(Token::Id(String::from("!enddefine"))), ScanToken::Token(Token::EndCommand), ], ); @@ -919,9 +897,7 @@ content 2 ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from("content 1"))), ScanToken::Token(Token::String(String::from("content 2"))), - ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from( - "!enddefine", - )))), + ScanToken::Token(Token::Id(String::from("!enddefine"))), ScanToken::Token(Token::EndCommand), ], ); diff --git a/rust/src/lex/segment/mod.rs b/rust/src/lex/segment/mod.rs index eae0b4810c..f53b46eb30 100644 --- a/rust/src/lex/segment/mod.rs +++ b/rust/src/lex/segment/mod.rs @@ -20,14 +20,12 @@ //! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE). use crate::{ - identifier::{id_match, id_match_n, is_reserved_word, IdentifierChar}, + identifier::{id_match, id_match_n, IdentifierChar}, prompt::PromptStyle, }; use bitflags::bitflags; -use super::{ - command_name::{command_match, COMMAND_NAMES}, -}; +use super::command_name::{command_match, COMMAND_NAMES}; /// Segmentation mode. /// @@ -64,7 +62,6 @@ pub enum Segment { HexString, UnicodeString, UnquotedString, - ReservedWord, Identifier, Punct, Shbang, @@ -75,7 +72,6 @@ pub enum Segment { DoRepeatCommand, DoRepeatOverflow, InlineData, - MacroId, MacroName, MacroBody, StartDocument, @@ -388,7 +384,7 @@ fn is_start_of_string(input: &str, eof: bool) -> Result { 'x' | 'X' | 'u' | 'U' => { let (c, _rest) = take(rest, eof)?; Ok(c == Some('\'') || c == Some('"')) - }, + } '\'' | '"' => Ok(true), '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true), _ => Ok(false), @@ -586,14 +582,14 @@ impl Segmenter { '!' 
=> { let (c, rest2) = take(rest, eof)?; match c { - Some('*') => Ok((rest2, Segment::MacroId)), + Some('*') => Ok((rest2, Segment::Punct)), Some(_) => self.parse_id(input, eof), None => Ok((rest, Segment::Punct)), } } c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Segment::Spaces)), c if c.may_start_id() => self.parse_id(input, eof), - '!'..='~' if c != '\\' && c != '^' => { + '#'..='~' if c != '\\' && c != '^' => { self.state.1 = Substate::empty(); Ok((rest, Segment::Punct)) } @@ -656,13 +652,11 @@ impl Segmenter { | Segment::HexString | Segment::UnicodeString | Segment::UnquotedString - | Segment::ReservedWord | Segment::Punct | Segment::CommentCommand | Segment::DoRepeatCommand | Segment::DoRepeatOverflow | Segment::InlineData - | Segment::MacroId | Segment::MacroName | Segment::MacroBody | Segment::StartDocument @@ -744,14 +738,14 @@ impl Segmenter { } self.state.1 = Substate::empty(); - let segment = if is_reserved_word(identifier) { - Segment::ReservedWord - } else if identifier.starts_with('!') { - Segment::MacroId - } else { - Segment::Identifier - }; - Ok((rest, segment)) + Ok(( + rest, + if identifier != "!" { + Segment::Identifier + } else { + Segment::Punct + }, + )) } fn parse_digraph<'a>( &mut self, @@ -1105,8 +1099,7 @@ impl Segmenter { /// - The `DEFINE` keyword. /// /// - An identifier. We transform this into `Type::MacroName` instead of - /// `Type::Identifier` or `Type::MacroId` because this identifier must - /// never be macro-expanded. + /// `Type::Identifier` because this identifier must never be macro-expanded. /// /// - Anything but `(`. /// @@ -1125,7 +1118,7 @@ impl Segmenter { ) -> Result<(&'a str, Segment), Incomplete> { let (rest, segment) = self.subparse(input, eof)?; match segment { - Segment::Identifier | Segment::MacroId if self.state.0 == State::Define1 => { + Segment::Identifier if self.state.0 == State::Define1 => { self.state.0 = State::Define2; return Ok((rest, Segment::MacroName)); } diff --git a/rust/src/lex/segment/test.rs b/rust/src/lex/segment/test.rs index dd2d50cb62..05f0a23d6e 100644 --- a/rust/src/lex/segment/test.rs +++ b/rust/src/lex/segment/test.rs @@ -1,8 +1,12 @@ use crate::prompt::PromptStyle; -use super::{Mode, Segmenter, Segment}; +use super::{Mode, Segment, Segmenter}; -fn push_segment<'a>(segmenter: &mut Segmenter, input: &'a str, one_byte: bool) -> (&'a str, Segment) { +fn push_segment<'a>( + segmenter: &mut Segmenter, + input: &'a str, + one_byte: bool, +) -> (&'a str, Segment) { if one_byte { for len in input.char_indices().map(|(pos, _c)| pos) { if let Ok((rest, segment)) = segmenter.push(&input[..len], false) { @@ -93,7 +97,8 @@ fn check_segmentation( let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect(); assert_eq!(expect_segments.pop(), Some((Segment::End, ""))); assert_eq!(expect_segments.pop(), Some((Segment::Newline, "\n"))); - while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) = expect_segments.last() + while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) = + expect_segments.last() { expect_segments.pop(); } @@ -141,6 +146,7 @@ grève Ângstrom poté f@#_.#6 GhIjK .x 1y _z +!abc abc! 
"#, Mode::Auto, &[ @@ -152,7 +158,7 @@ GhIjK (Segment::Spaces, " "), (Segment::Identifier, "abcd"), (Segment::Spaces, " "), - (Segment::MacroId, "!abcd"), + (Segment::Identifier, "!abcd"), (Segment::Newline, "\n"), (Segment::Identifier, "A"), (Segment::Spaces, " "), @@ -162,7 +168,7 @@ GhIjK (Segment::Spaces, " "), (Segment::Identifier, "ABCD"), (Segment::Spaces, " "), - (Segment::MacroId, "!ABCD"), + (Segment::Identifier, "!ABCD"), (Segment::Newline, "\n"), (Segment::Identifier, "aB"), (Segment::Spaces, " "), @@ -170,7 +176,7 @@ GhIjK (Segment::Spaces, " "), (Segment::Identifier, "aBcD"), (Segment::Spaces, " "), - (Segment::MacroId, "!aBcD"), + (Segment::Identifier, "!aBcD"), (Segment::Newline, "\n"), (Segment::Identifier, "$x"), (Segment::Spaces, " "), @@ -178,7 +184,7 @@ GhIjK (Segment::Spaces, " "), (Segment::Identifier, "$z"), (Segment::Spaces, " "), - (Segment::MacroId, "!$z"), + (Segment::Identifier, "!$z"), (Segment::Newline, "\n"), (Segment::Identifier, "grève"), (Segment::Spaces, "\u{00a0}"), @@ -196,7 +202,7 @@ GhIjK (Segment::Spaces, " "), (Segment::Identifier, "#d"), (Segment::Spaces, " "), - (Segment::MacroId, "!#d"), + (Segment::Identifier, "!#d"), (Segment::Newline, "\n"), (Segment::Identifier, "@efg"), (Segment::Spaces, " "), @@ -206,7 +212,7 @@ GhIjK (Segment::Spaces, " "), (Segment::Identifier, "@#@"), (Segment::Spaces, " "), - (Segment::MacroId, "!@"), + (Segment::Identifier, "!@"), (Segment::Spaces, " "), (Segment::Newline, "\n"), (Segment::Identifier, "##"), @@ -230,6 +236,11 @@ GhIjK (Segment::Punct, "_"), (Segment::Identifier, "z"), (Segment::Newline, "\n"), + (Segment::Identifier, "!abc"), + (Segment::Spaces, " "), + (Segment::Identifier, "abc"), + (Segment::Punct, "!"), + (Segment::Newline, "\n"), (Segment::End, ""), ], &[ @@ -244,6 +255,7 @@ GhIjK PromptStyle::Later, PromptStyle::Later, PromptStyle::Later, + PromptStyle::Later, ], ); } @@ -380,57 +392,57 @@ and. with. 
"#, Mode::Auto, &[ - (Segment::ReservedWord, "and"), + (Segment::Identifier, "and"), (Segment::Spaces, " "), - (Segment::ReservedWord, "or"), + (Segment::Identifier, "or"), (Segment::Spaces, " "), - (Segment::ReservedWord, "not"), + (Segment::Identifier, "not"), (Segment::Spaces, " "), - (Segment::ReservedWord, "eq"), + (Segment::Identifier, "eq"), (Segment::Spaces, " "), - (Segment::ReservedWord, "ge"), + (Segment::Identifier, "ge"), (Segment::Spaces, " "), - (Segment::ReservedWord, "gt"), + (Segment::Identifier, "gt"), (Segment::Spaces, " "), - (Segment::ReservedWord, "le"), + (Segment::Identifier, "le"), (Segment::Spaces, " "), - (Segment::ReservedWord, "lt"), + (Segment::Identifier, "lt"), (Segment::Spaces, " "), - (Segment::ReservedWord, "ne"), + (Segment::Identifier, "ne"), (Segment::Spaces, " "), - (Segment::ReservedWord, "all"), + (Segment::Identifier, "all"), (Segment::Spaces, " "), - (Segment::ReservedWord, "by"), + (Segment::Identifier, "by"), (Segment::Spaces, " "), - (Segment::ReservedWord, "to"), + (Segment::Identifier, "to"), (Segment::Spaces, " "), - (Segment::ReservedWord, "with"), + (Segment::Identifier, "with"), (Segment::Newline, "\n"), - (Segment::ReservedWord, "AND"), + (Segment::Identifier, "AND"), (Segment::Spaces, " "), - (Segment::ReservedWord, "OR"), + (Segment::Identifier, "OR"), (Segment::Spaces, " "), - (Segment::ReservedWord, "NOT"), + (Segment::Identifier, "NOT"), (Segment::Spaces, " "), - (Segment::ReservedWord, "EQ"), + (Segment::Identifier, "EQ"), (Segment::Spaces, " "), - (Segment::ReservedWord, "GE"), + (Segment::Identifier, "GE"), (Segment::Spaces, " "), - (Segment::ReservedWord, "GT"), + (Segment::Identifier, "GT"), (Segment::Spaces, " "), - (Segment::ReservedWord, "LE"), + (Segment::Identifier, "LE"), (Segment::Spaces, " "), - (Segment::ReservedWord, "LT"), + (Segment::Identifier, "LT"), (Segment::Spaces, " "), - (Segment::ReservedWord, "NE"), + (Segment::Identifier, "NE"), (Segment::Spaces, " "), - (Segment::ReservedWord, "ALL"), + (Segment::Identifier, "ALL"), (Segment::Spaces, " "), - (Segment::ReservedWord, "BY"), + (Segment::Identifier, "BY"), (Segment::Spaces, " "), - (Segment::ReservedWord, "TO"), + (Segment::Identifier, "TO"), (Segment::Spaces, " "), - (Segment::ReservedWord, "WITH"), + (Segment::Identifier, "WITH"), (Segment::Newline, "\n"), (Segment::Identifier, "andx"), (Segment::Spaces, " "), @@ -460,7 +472,7 @@ and. with. (Segment::Newline, "\n"), (Segment::Identifier, "and."), (Segment::Spaces, " "), - (Segment::ReservedWord, "with"), + (Segment::Identifier, "with"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), @@ -543,7 +555,7 @@ fn test_punctuation() { (Segment::Punct, "["), (Segment::Punct, "]"), (Segment::Punct, "**"), - (Segment::MacroId, "!*"), + (Segment::Punct, "!*"), (Segment::Newline, "\n"), (Segment::Punct, "%"), (Segment::Spaces, " "), @@ -563,7 +575,7 @@ fn test_punctuation() { (Segment::Spaces, " "), (Segment::Punct, "~"), (Segment::Spaces, " "), - (Segment::MacroId, "!*"), + (Segment::Punct, "!*"), (Segment::Newline, "\n"), (Segment::End, ""), ], @@ -889,7 +901,7 @@ title my title. (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::Identifier, "#"), - (Segment::MacroId, "!"), + (Segment::Punct, "!"), (Segment::Spaces, " "), (Segment::Punct, "/"), (Segment::Identifier, "usr"), @@ -953,7 +965,7 @@ next command. 
(Segment::Spaces, " "), (Segment::Identifier, "ambiguous"), (Segment::Spaces, " "), - (Segment::ReservedWord, "with"), + (Segment::Identifier, "with"), (Segment::Spaces, " "), (Segment::Identifier, "COMPUTE"), (Segment::EndCommand, "."), @@ -1227,7 +1239,7 @@ not data (Segment::Number, "123"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), - (Segment::ReservedWord, "not"), + (Segment::Identifier, "not"), (Segment::Spaces, " "), (Segment::Identifier, "data"), (Segment::Newline, "\n"), @@ -1310,7 +1322,10 @@ end repeat. (Segment::Newline, "\n"), (Segment::DoRepeatCommand, "+ third command."), (Segment::Newline, "\n"), - (Segment::DoRepeatCommand, "end /* x */ /* y */ repeat print."), + ( + Segment::DoRepeatCommand, + "end /* x */ /* y */ repeat print.", + ), (Segment::Newline, "\n"), (Segment::Identifier, "end"), (Segment::Newline, "\n"), @@ -1539,7 +1554,7 @@ var1 var2 var3 "!enddefine" (Segment::Newline, "\n"), (Segment::MacroBody, "var1 var2 var3 \"!enddefine\""), (Segment::Newline, "\n"), - (Segment::MacroId, "!enddefine"), + (Segment::Identifier, "!enddefine"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), @@ -1563,7 +1578,7 @@ var1 var2 var3 "!enddefine" (Segment::Punct, ")"), (Segment::MacroBody, " var1 var2 var3 /* !enddefine"), (Segment::Newline, "\n"), - (Segment::MacroId, "!enddefine"), + (Segment::Identifier, "!enddefine"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), @@ -1587,7 +1602,7 @@ var1 var2 var3!enddefine. (Segment::Punct, ")"), (Segment::Newline, "\n"), (Segment::MacroBody, "var1 var2 var3"), - (Segment::MacroId, "!enddefine"), + (Segment::Identifier, "!enddefine"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), @@ -1609,7 +1624,7 @@ var1 var2 var3!enddefine. (Segment::Punct, "("), (Segment::Punct, ")"), (Segment::MacroBody, "var1 var2 var3"), - (Segment::MacroId, "!enddefine"), + (Segment::Identifier, "!enddefine"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), @@ -1632,7 +1647,7 @@ var1 var2 var3!enddefine. (Segment::Punct, "("), (Segment::Punct, ")"), (Segment::Newline, "\n"), - (Segment::MacroId, "!enddefine"), + (Segment::Identifier, "!enddefine"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), @@ -1661,7 +1676,7 @@ var1 var2 var3!enddefine. (Segment::Newline, "\n"), (Segment::MacroBody, ""), (Segment::Newline, "\n"), - (Segment::MacroId, "!enddefine"), + (Segment::Identifier, "!enddefine"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), @@ -1702,7 +1717,7 @@ var1 var2 var3!enddefine. (Segment::Punct, ")"), (Segment::Punct, ")"), (Segment::Newline, "\n"), - (Segment::MacroId, "!enddefine"), + (Segment::Identifier, "!enddefine"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), @@ -1748,7 +1763,7 @@ var1 var2 var3!enddefine. 
(Segment::Newline, "\n"), (Segment::Punct, ")"), (Segment::Newline, "\n"), - (Segment::MacroId, "!enddefine"), + (Segment::Identifier, "!enddefine"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), @@ -1793,7 +1808,7 @@ content 2 (Segment::Newline, "\n"), (Segment::MacroBody, "content 2"), (Segment::Newline, "\n"), - (Segment::MacroId, "!enddefine"), + (Segment::Identifier, "!enddefine"), (Segment::EndCommand, "."), (Segment::Newline, "\n"), (Segment::End, ""), diff --git a/rust/src/lex/token.rs b/rust/src/lex/token.rs index 868a79dac9..8467a7e927 100644 --- a/rust/src/lex/token.rs +++ b/rust/src/lex/token.rs @@ -22,9 +22,6 @@ pub enum Token { /// Operators, punctuators, and reserved words. Punct(Punct), - - /// Tokens that only appear in macros. - MacroToken(MacroToken), } fn is_printable(c: char) -> bool { @@ -79,7 +76,6 @@ impl Display for Token { } Token::EndCommand => write!(f, "."), Token::Punct(punct) => punct.fmt(f), - Token::MacroToken(mt) => mt.fmt(f), } } } @@ -186,6 +182,32 @@ pub enum Punct { /// `**`. Exp, + + /// `!` (only appears in macros). + Bang, + + /// `%` (only appears in macros). + Percent, + + /// `?` (only appears in macros). + Question, + + /// ```` (only appears in macros). + Backtick, + + /// `.`. + /// + /// This represents a dot in the middle of a line by itself, where it does not end a command. + Dot, + + /// `_` (only appears in macros). + /// + /// Although underscores may appear within identifiers, they can't be the + /// first character, so this represents an underscore found on its own. + Underscore, + + /// `!*` (only appears in macros). + BangAsterisk, } impl Punct { @@ -219,6 +241,13 @@ impl Punct { Self::To => "TO", Self::With => "WITH", Self::Exp => "**", + Self::Bang => "!", + Self::Percent => "%", + Self::Question => "?", + Self::Backtick => "`", + Self::Dot => ".", + Self::Underscore => "_", + Self::BangAsterisk => "!*", } } } @@ -227,54 +256,3 @@ impl Display for Punct { write!(f, "{}", self.as_str()) } } - -/// Tokens that only appear in macros. -#[derive(Clone, Debug, PartialEq, Eq)] -pub enum MacroToken { - /// Identifier starting with `!`. - MacroId(String), - - /// `!`. - Bang, - - /// `%`. - Percent, - - /// `?`. - Question, - - /// ````. - Backtick, - - /// `.` (in the middle of a line by itself, where it does not end a command). - Dot, - - /// `_`. - /// - /// Although underscores may appear within identifiers, they can't be the - /// first character, so this represents an underscore found on its own. 
- Underscore, -} - -impl MacroToken { - pub fn as_str(&self) -> &str { - match self { - MacroToken::MacroId(id) => &id, - MacroToken::Bang => "!", - MacroToken::Percent => "%", - MacroToken::Question => "?", - MacroToken::Backtick => "`", - MacroToken::Dot => ".", - MacroToken::Underscore => "_", - } - } -} - -impl Display for MacroToken { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self { - MacroToken::MacroId(id) => write!(f, "{id}"), - _ => write!(f, "{}", self.as_str()), - } - } -} diff --git a/rust/src/macros.rs b/rust/src/macros.rs index 2f9188003a..33dac11112 100644 --- a/rust/src/macros.rs +++ b/rust/src/macros.rs @@ -3,10 +3,7 @@ use std::collections::HashMap; use thiserror::Error as ThisError; use unicase::UniCase; -use crate::{ - lex::token::{MacroToken, Token}, - message::Location, -}; +use crate::{lex::token::Token, message::Location}; #[derive(Clone, Debug, ThisError)] pub enum MacroError { @@ -20,13 +17,24 @@ pub enum MacroError { macro_: String, }, - /// Expected a particular token. + /// Expected a particular token at end of command. #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")] ExpectedToken { token: String, arg: String, macro_: String, }, + + /// Expected a particular token, got a different one. + #[error( + "Found `{actual}` while expecting `{expected}` reading argument {arg} to macro {macro_}." + )] + UnexpectedToken { + actual: String, + expected: String, + arg: String, + macro_: String, + }, } /// A PSPP macro as defined with `!DEFINE`. @@ -54,14 +62,14 @@ impl Macro { } else if let ValueType::Enclose(_, _) = self.parameters[0].arg { MacroCallState::Enclose } else { - MacroCallState::NewArg + MacroCallState::Arg } } } struct Parameter { - /// `!name` or `!1`. - name: String, + /// `name` or `1`. + name: UniCase, /// Default value. /// @@ -99,6 +107,7 @@ enum ValueType { /// A token and the syntax that was tokenized to produce it. The syntax allows /// the token to be turned back into syntax accurately. +#[derive(Clone)] struct BodyToken { /// The token. token: Token, @@ -110,11 +119,8 @@ struct BodyToken { type MacroSet = HashMap, Macro>; pub enum MacroCallState { - /// Starting a new argument. - NewArg, - /// Accumulating tokens toward the end of any type of argument. - ContinueArg, + Arg, /// Expecting the opening delimiter of an ARG_ENCLOSE argument. Enclose, @@ -134,7 +140,8 @@ pub struct MacroCall<'a> { macros: &'a MacroSet, macro_: &'a Macro, state: MacroCallState, - args: Vec>, + args: Box<[Option>]>, + arg_index: usize, /// Length of macro call so far. n_tokens: usize, @@ -146,11 +153,9 @@ impl<'a> MacroCall<'a> { return None; } let macro_name = match token { - Token::Id(s) => s, - Token::MacroToken(MacroToken::MacroId(s)) => s, + Token::Id(s) => s.clone(), _ => return None, - } - .clone(); + }; // XXX Unicase::new() is very expensive. We probably need to define our // own Unicase-alike that has a proper Borrow<> implementation. 
let Some(macro_) = macros.get(&UniCase::new(macro_name)) else { @@ -160,39 +165,180 @@ impl<'a> MacroCall<'a> { macros, macro_, state: macro_.initial_state(), - args: Vec::with_capacity(macro_.parameters.len()), + args: (0..macro_.parameters.len()).map(|_| None).collect(), + arg_index: 0, n_tokens: 1, }) } - fn push_continue_arg(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) { - if let Token::EndCommand | Token::End = token { - let param = &self.macro_.parameters[self.args.len() - 1]; - let arg = self.args.last().unwrap(); - match param.arg { - ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens { - n: n - arg.len(), - arg: param.name.clone(), - macro_: self.macro_.name.clone(), - }), - ValueType::CharEnd(end) | ValueType::Enclose(_, end) => todo!(), - ValueType::CmdEnd => todo!(), + fn finished(&mut self) -> Option { + self.state = MacroCallState::Finished; + for (i, arg) in self.args.iter_mut().enumerate() { + if arg.is_none() { + *arg = Some(self.macro_.parameters[i].default.clone()); } } + Some(self.n_tokens) } - fn push_new_arg(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) { + + fn next_arg(&mut self) -> Option { + if self.macro_.parameters.is_empty() { + self.finished() + } else { + let param = &self.macro_.parameters[self.arg_index]; + if param.is_positional() { + self.arg_index += 1; + if self.arg_index >= self.args.len() { + self.finished() + } else { + let param = &self.macro_.parameters[self.arg_index]; + self.state = if !param.is_positional() { + MacroCallState::Keyword + } else if let ValueType::Enclose(_, _) = param.arg { + MacroCallState::Enclose + } else { + MacroCallState::Arg + }; + None + } + } else { + if self.args.iter().any(|arg| arg.is_none()) { + self.state = MacroCallState::Keyword; + None + } else { + self.finished() + } + } + } + } + + fn push_arg( + &mut self, + token: &Token, + syntax: &str, + error: &impl Fn(MacroError), + ) -> Option { + let param = &self.macro_.parameters[self.args.len() - 1]; if let Token::EndCommand | Token::End = token { - return self.mc_finished(); + if let Some(arg) = &self.args[self.arg_index] { + let param = &self.macro_.parameters[self.args.len() - 1]; + + match ¶m.arg { + ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens { + n: n - arg.len(), + arg: param.name.clone(), + macro_: self.macro_.name.clone(), + }), + ValueType::CharEnd(end) | ValueType::Enclose(_, end) => { + error(MacroError::ExpectedToken { + token: end.to_string(), + arg: param.name.clone(), + macro_: self.macro_.name.clone(), + }) + } + ValueType::CmdEnd => { + // This is OK, it's the expected way to end the argument. + } + } + } + return self.finished(); + } + + self.n_tokens += 1; + let arg = self.args[self.arg_index].get_or_insert(Vec::new()); + let ( + add_token, // Should we add `mt` to the current arg? + next_arg, // Should we advance to the next arg? 
+        ) = match &param.arg {
+            ValueType::NTokens(n) => (arg.len() + 1 >= *n, true),
+            ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
+                let at_end = token == end;
+                (at_end, !at_end)
+            }
+            ValueType::CmdEnd => (false, true),
+        };
+        if add_token {
+            if true
+            // !macro_expand_arg (&mt->token, mc->me, *argp)
+            {
+                arg.push(BodyToken {
+                    token: token.clone(),
+                    syntax: String::from(syntax),
+                });
+            }
+        }
+        if next_arg {
+            self.next_arg()
+        } else {
+            None
+        }
+    }
+
+    fn push_enclose(
+        &mut self,
+        token: &Token,
+        syntax: &str,
+        error: &impl Fn(MacroError),
+    ) -> Option<usize> {
+        let param = &self.macro_.parameters[self.arg_index];
+        let ValueType::Enclose(start, _) = &param.arg else {
+            unreachable!()
+        };
+        if token == start {
+            self.n_tokens += 1;
+            self.args[self.arg_index].get_or_insert(Vec::new());
+            self.state = MacroCallState::Arg;
+            None
+        } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) {
+            self.finished()
+        } else {
+            error(MacroError::UnexpectedToken {
+                actual: String::from(syntax),
+                expected: start.to_string(),
+                arg: param.name.clone(),
+                macro_: self.macro_.name.clone(),
+            });
+            self.finished()
         }
-        self.args.push(Vec::new());
-        self.state = MacroCallState::ContinueArg;
-        self.push_continue_arg(token, syntax, error);
     }
-    pub fn push(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) -> ! {
+
+    fn push_keyword(
+        &mut self,
+        token: &Token,
+        syntax: &str,
+        error: &impl Fn(MacroError),
+    ) -> Option<usize> {
+        let Token::Id(id) = token else {
+            return self.finished();
+        };
+        let Some(arg_idx) = self
+            .macro_
+            .parameters
+            .iter()
+            .position(|param| param.name == UniCase::new(id))
+        else {};
+    }
+
+    /// Adds `token`, which has the given `syntax`, to the collection of tokens
+    /// in `self` that potentially need to be macro expanded.
+    ///
+    /// Returns `None` if the macro expander needs more tokens, for macro
+    /// arguments or to decide whether this is actually a macro invocation. The
+    /// caller should call `push` again with the next token.
+    ///
+    /// Returns `Some(n)` if the macro was complete with `n` tokens. The caller
+    /// should call [`Self::expand`] to obtain the expansion. (If `n == 0`,
+    /// then the tokens did not actually invoke a macro at all and the expansion
+    /// will be empty.)
+    pub fn push(
+        &mut self,
+        token: &Token,
+        syntax: &str,
+        error: &impl Fn(MacroError),
+    ) -> Option<usize> {
         match self.state {
-            MacroCallState::NewArg => self.push_new_arg(token, syntax, error),
-            MacroCallState::ContinueArg => self.push_continue_arg(token, syntax, error),
-            MacroCallState::Enclose => todo!(),
+            MacroCallState::Arg => self.push_arg(token, syntax, error),
+            MacroCallState::Enclose => self.push_enclose(token, syntax, error),
             MacroCallState::Keyword => todo!(),
             MacroCallState::Equals => todo!(),
             MacroCallState::Finished => todo!(),
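
The effect of the identifier and token changes above is that reserved words and the former macro punctuators are now ordinary `Token::Punct` values, and `!`-prefixed names are ordinary identifiers. A minimal sketch of the resulting behavior, assuming `ReservedWord`, `is_reserved_word`, `Token`, and `Punct` from this commit are in scope (this is illustrative, not a test from the tree):

    // Reserved-word lookup is case-insensitive and length-bounded (2..=4 bytes).
    assert!(ReservedWord::try_from("With").is_ok());
    assert!(ReservedWord::try_from("withx").is_err());
    assert!(is_reserved_word("GE"));
    // A `!`-prefixed name is no longer special at this layer.
    assert!(!is_reserved_word("!gt"));

    // The former MacroToken variants are plain punctuators now.
    assert_eq!(Punct::BangAsterisk.as_str(), "!*");
    assert_eq!(Token::Punct(Punct::Bang).to_string(), "!");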
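
The argument-collection state machine added to rust/src/macros.rs is meant to be driven one token at a time through `MacroCall::push`, as its doc comment describes. A rough sketch of that calling convention, assuming the `MacroCall`, `MacroSet`, `Token`, and `MacroError` types from this commit; `MacroCall::new` is only a stand-in for the constructor, whose real name and signature fall outside the hunks shown:

    /// Returns the number of tokens consumed by a completed macro call at the
    /// start of `tokens`, or None if more input would be needed.
    fn try_macro_call(macros: &MacroSet, tokens: &[(Token, String)]) -> Option<usize> {
        // Hypothetical constructor: recognizes `tokens[0]` as a macro name.
        let mut call = MacroCall::new(macros, &tokens.first()?.0)?;
        for (token, syntax) in &tokens[1..] {
            // `push` yields Some(n) once the call is complete after `n` tokens
            // (n == 0 would mean the tokens were not a macro call after all);
            // None means it still needs more tokens.
            if let Some(n) = call.push(token, syntax, &|err: MacroError| eprintln!("{err}")) {
                return Some(n);
            }
        }
        None // end of input before the macro call was complete
    }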