impl IdentifierChar for char {
fn ascii_may_start_id(self) -> bool {
- matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$')
+ matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$' | '!')
}
fn may_start_id(self) -> bool {
} else {
use MajorCategory::*;
- ([L, M, S].contains(&self.get_major_category()) || "@#$".contains(self))
- && self != char::REPLACEMENT_CHARACTER
+ [L, M, S].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
}
}
fn ascii_may_continue_id(self) -> bool {
- self.ascii_may_start_id() || matches!(self, '0'..='9' | '.' | '_')
+ matches!(self, 'a'..='z' | 'A'..='Z' | '0'..='9' | '@' | '#' | '$' | '.' | '_')
}
fn may_continue_id(self) -> bool {
} else {
use MajorCategory::*;
- ([L, M, S, N].contains(&self.get_major_category()) || "@#$._".contains(self))
- && self != char::REPLACEMENT_CHARACTER
+ [L, M, S, N].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
}
}
}
#[error("\"{0}\" may not be used as an identifier because it is a reserved word.")]
Reserved(String),
+ #[error("\"!\" is not a valid identifier.")]
+ Bang,
+
#[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")]
BadFirstCharacter(String, char),
},
}
-pub fn is_reserved_word(s: &str) -> bool {
- for word in [
- "and", "or", "not", "eq", "ge", "gt", "le", "lt", "ne", "all", "by", "to", "with",
- ] {
- if s.eq_ignore_ascii_case(word) {
- return true;
+/// A reserved word.
+///
+/// Each variant corresponds to one word that the `TryFrom<&str>`
+/// implementation for this type recognizes, ignoring ASCII case.
+pub enum ReservedWord {
+ /// `AND`.
+ And,
+ /// `OR`.
+ Or,
+ /// `NOT`.
+ Not,
+ /// `EQ`.
+ Eq,
+ /// `GE`.
+ Ge,
+ /// `GT`.
+ Gt,
+ /// `LE`.
+ Le,
+ /// `LT`.
+ Lt,
+ /// `NE`.
+ Ne,
+ /// `ALL`.
+ All,
+ /// `BY`.
+ By,
+ /// `TO`.
+ To,
+ /// `WITH`.
+ With,
+}
+
+/// Recognizes reserved words by their spelling, ignoring ASCII case.
+impl TryFrom<&str> for ReservedWord {
+ /// No detail is reported: any string that is not a reserved word yields
+ /// `Err(())`.
+ type Error = ();
+
+ /// Returns the reserved word that `source` spells, or `Err(())` if it
+ /// spells none.
+ fn try_from(source: &str) -> Result<Self, Self::Error> {
+ // Every reserved word is 2 to 4 bytes long. Rejecting other lengths
+ // first also guarantees that `b[0]` and `b[1]` below are in bounds.
+ if !(2..=4).contains(&source.len()) {
+ Err(())
+ } else {
+ let b = source.as_bytes();
+ let c0 = b[0].to_ascii_uppercase();
+ let c1 = b[1].to_ascii_uppercase();
+ // The length plus the first two bytes identify every 2-letter
+ // word. The guards compare the remaining bytes of the longer
+ // words; the length in the pattern guarantees those bytes exist.
+ match (source.len(), c0, c1) {
+ (2, b'B', b'Y') => Ok(Self::By),
+ (2, b'E', b'Q') => Ok(Self::Eq),
+ (2, b'G', b'T') => Ok(Self::Gt),
+ (2, b'G', b'E') => Ok(Self::Ge),
+ (2, b'L', b'T') => Ok(Self::Lt),
+ (2, b'L', b'E') => Ok(Self::Le),
+ (2, b'N', b'E') => Ok(Self::Ne),
+ (3, b'N', b'O') if b[2].to_ascii_uppercase() == b'T' => Ok(Self::Not),
+ (2, b'O', b'R') => Ok(Self::Or),
+ (2, b'T', b'O') => Ok(Self::To),
+ (3, b'A', b'L') if b[2].to_ascii_uppercase() == b'L' => Ok(Self::All),
+ (3, b'A', b'N') if b[2].to_ascii_uppercase() == b'D' => Ok(Self::And),
+ (4, b'W', b'I')
+ if b[2].to_ascii_uppercase() == b'T' && b[3].to_ascii_uppercase() == b'H' =>
+ {
+ Ok(Self::With)
+ }
+ _ => Err(()),
+ }
}
}
- false
+}
+
+/// Returns true if `s` spells a reserved word, ignoring ASCII case.
+pub fn is_reserved_word(s: &str) -> bool {
+ ReservedWord::try_from(s).is_ok()
}
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
if is_reserved_word(s) {
return Err(Error::Reserved(s.into()));
}
+ if s == "!" {
+ return Err(Error::Bang);
+ }
let mut i = s.chars();
let first = i.next().unwrap();
//! are the same as the tokens used by the PSPP parser with a few additional
//! types.
+use crate::identifier::ReservedWord;
+
use super::{
segment::{Mode, Segment, Segmenter},
- token::{MacroToken, Punct, Token},
+ token::{Punct, Token},
};
use std::collections::VecDeque;
use thiserror::Error as ThisError;
| Segment::MacroBody
| Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))),
- Segment::ReservedWord => {
- let c0 = s.as_bytes()[0].to_ascii_uppercase();
- let c1 = s.as_bytes()[1].to_ascii_uppercase();
- match (c0, c1) {
- (b'B', _) => Some(Self::Token(Token::Punct(Punct::By))),
- (b'E', _) => Some(Self::Token(Token::Punct(Punct::Eq))),
- (b'G', b'T') => Some(Self::Token(Token::Punct(Punct::Gt))),
- (b'G', _) => Some(Self::Token(Token::Punct(Punct::Ge))),
- (b'L', b'T') => Some(Self::Token(Token::Punct(Punct::Lt))),
- (b'L', _) => Some(Self::Token(Token::Punct(Punct::Le))),
- (b'N', b'E') => Some(Self::Token(Token::Punct(Punct::Ne))),
- (b'N', _) => Some(Self::Token(Token::Punct(Punct::Not))),
- (b'O', _) => Some(Self::Token(Token::Punct(Punct::Or))),
- (b'T', _) => Some(Self::Token(Token::Punct(Punct::To))),
- (b'A', b'L') => Some(Self::Token(Token::Punct(Punct::All))),
- (b'A', _) => Some(Self::Token(Token::Punct(Punct::And))),
- (b'W', _) => Some(Self::Token(Token::Punct(Punct::With))),
- _ => unreachable!(),
- }
+ Segment::Identifier => {
+ if let Ok(reserved_word) = ReservedWord::try_from(s) {
+ match reserved_word {
+ ReservedWord::And => Some(Self::Token(Token::Punct(Punct::And))),
+ ReservedWord::Or => Some(Self::Token(Token::Punct(Punct::Or))),
+ ReservedWord::Not => Some(Self::Token(Token::Punct(Punct::Not))),
+ ReservedWord::Eq => Some(Self::Token(Token::Punct(Punct::Eq))),
+ ReservedWord::Ge => Some(Self::Token(Token::Punct(Punct::Ge))),
+ ReservedWord::Gt => Some(Self::Token(Token::Punct(Punct::Gt))),
+ ReservedWord::Le => Some(Self::Token(Token::Punct(Punct::Le))),
+ ReservedWord::Lt => Some(Self::Token(Token::Punct(Punct::Lt))),
+ ReservedWord::Ne => Some(Self::Token(Token::Punct(Punct::Ne))),
+ ReservedWord::All => Some(Self::Token(Token::Punct(Punct::All))),
+ ReservedWord::By => Some(Self::Token(Token::Punct(Punct::By))),
+ ReservedWord::To => Some(Self::Token(Token::Punct(Punct::To))),
+ ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))),
+ }
+ } else {
+ Some(Self::Token(Token::Id(String::from(s))))}
}
- Segment::Identifier => Some(Self::Token(Token::Id(String::from(s)))),
Segment::Punct => match s {
"(" => Some(Self::Token(Token::Punct(Punct::LParen))),
")" => Some(Self::Token(Token::Punct(Punct::RParen))),
"<>" => Some(Self::Token(Token::Punct(Punct::Ne))),
"~=" => Some(Self::Token(Token::Punct(Punct::Ne))),
">=" => Some(Self::Token(Token::Punct(Punct::Ge))),
- "!" => Some(Self::Token(Token::MacroToken(MacroToken::Bang))),
- "%" => Some(Self::Token(Token::MacroToken(MacroToken::Percent))),
- "?" => Some(Self::Token(Token::MacroToken(MacroToken::Question))),
- "`" => Some(Self::Token(Token::MacroToken(MacroToken::Backtick))),
- "_" => Some(Self::Token(Token::MacroToken(MacroToken::Underscore))),
- "." => Some(Self::Token(Token::MacroToken(MacroToken::Dot))),
+ "!" => Some(Self::Token(Token::Punct(Punct::Bang))),
+ "%" => Some(Self::Token(Token::Punct(Punct::Percent))),
+ "?" => Some(Self::Token(Token::Punct(Punct::Question))),
+ "`" => Some(Self::Token(Token::Punct(Punct::Backtick))),
+ "_" =>Some(Self::Token(Token::Punct(Punct::Underscore))),
+ "." =>Some(Self::Token(Token::Punct(Punct::Dot))),
+ "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))),
_ => unreachable!("bad punctuator {s:?}"),
},
Segment::Shbang
| Segment::Newline
| Segment::CommentCommand => None,
Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
- Segment::MacroId => Some(Self::Token(Token::MacroToken(MacroToken::MacroId(
- String::from(s),
- )))),
Segment::StartDocument => Some(Self::Token(Token::Id(String::from("DOCUMENT")))),
Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
Some(Self::Token(Token::EndCommand))
use crate::lex::{
segment::Mode,
- token::{MacroToken, Punct, Token},
+ token::{Punct, Token},
};
use super::{ScanError, ScanToken, StringLexer};
Token::String(s) => print!("Token::String(String::from({s:?}))"),
Token::EndCommand => print!("Token::EndCommand"),
Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"),
- Token::MacroToken(MacroToken::MacroId(id)) => {
- print!("Token::MacroToken(MacroToken::MacroId(String::from({id:?})))")
- }
- Token::MacroToken(m) => print!("Token::MacroToken(MacroToken::{m:?})"),
}
}
ScanToken::Token(Token::Id(String::from("$x"))),
ScanToken::Token(Token::Id(String::from("@efg"))),
ScanToken::Token(Token::Id(String::from("@@."))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!abcd",
- )))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!*")))),
+ ScanToken::Token(Token::Id(String::from("!abcd"))),
+ ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
+ ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
ScanToken::Token(Token::Id(String::from("a"))),
ScanToken::Token(Token::Id(String::from("#.#"))),
- ScanToken::Token(Token::MacroToken(MacroToken::Dot)),
+ ScanToken::Token(Token::Punct(Punct::Dot)),
ScanToken::Token(Token::Id(String::from("x"))),
- ScanToken::Token(Token::MacroToken(MacroToken::Underscore)),
+ ScanToken::Token(Token::Punct(Punct::Underscore)),
ScanToken::Token(Token::Id(String::from("z"))),
ScanToken::Token(Token::EndCommand),
ScanToken::Token(Token::Id(String::from("abcd."))),
ScanToken::Token(Token::Punct(Punct::LSquare)),
ScanToken::Token(Token::Punct(Punct::RSquare)),
ScanToken::Token(Token::Punct(Punct::Exp)),
- ScanToken::Token(Token::MacroToken(MacroToken::Percent)),
+ ScanToken::Token(Token::Punct(Punct::Percent)),
ScanToken::Token(Token::Punct(Punct::Colon)),
ScanToken::Token(Token::Punct(Punct::Semicolon)),
- ScanToken::Token(Token::MacroToken(MacroToken::Question)),
- ScanToken::Token(Token::MacroToken(MacroToken::Underscore)),
- ScanToken::Token(Token::MacroToken(MacroToken::Backtick)),
+ ScanToken::Token(Token::Punct(Punct::Question)),
+ ScanToken::Token(Token::Punct(Punct::Underscore)),
+ ScanToken::Token(Token::Punct(Punct::Backtick)),
ScanToken::Token(Token::Punct(Punct::LCurly)),
ScanToken::Token(Token::Punct(Punct::RCurly)),
ScanToken::Token(Token::Punct(Punct::Not)),
ScanToken::Token(Token::Number(-0.0112)),
ScanToken::Token(Token::Number(-1.0)),
ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::MacroToken(MacroToken::Dot)),
+ ScanToken::Token(Token::Punct(Punct::Dot)),
ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))),
ScanToken::Token(Token::Punct(Punct::Dash)),
ScanToken::Token(Token::Id(String::from("e1"))),
Mode::Auto,
&[
ScanToken::Token(Token::Id(String::from("#"))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from("!")))),
+ ScanToken::Token(Token::Punct(Punct::Bang)),
ScanToken::Token(Token::Punct(Punct::Slash)),
ScanToken::Token(Token::Id(String::from("usr"))),
ScanToken::Token(Token::Punct(Punct::Slash)),
use crate::lex::{
scan::ScanToken,
segment::Mode,
- token::{MacroToken, Punct, Token},
+ token::{Punct, Token},
};
use super::check_scan;
ScanToken::Token(Token::Punct(Punct::LParen)),
ScanToken::Token(Token::Punct(Punct::RParen)),
ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!enddefine",
- )))),
+ ScanToken::Token(Token::Id(String::from("!enddefine"))),
ScanToken::Token(Token::EndCommand),
],
);
ScanToken::Token(Token::Punct(Punct::LParen)),
ScanToken::Token(Token::Punct(Punct::RParen)),
ScanToken::Token(Token::String(String::from(" var1 var2 var3"))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!enddefine",
- )))),
+ ScanToken::Token(Token::Id(String::from("!enddefine"))),
ScanToken::Token(Token::EndCommand),
],
);
ScanToken::Token(Token::Punct(Punct::LParen)),
ScanToken::Token(Token::Punct(Punct::RParen)),
ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!enddefine",
- )))),
+ ScanToken::Token(Token::Id(String::from("!enddefine"))),
ScanToken::Token(Token::EndCommand),
],
);
ScanToken::Token(Token::Punct(Punct::LParen)),
ScanToken::Token(Token::Punct(Punct::RParen)),
ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!enddefine",
- )))),
+ ScanToken::Token(Token::Id(String::from("!enddefine"))),
ScanToken::Token(Token::EndCommand),
],
);
ScanToken::Token(Token::String(String::from("!macro1"))),
ScanToken::Token(Token::Punct(Punct::LParen)),
ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!enddefine",
- )))),
+ ScanToken::Token(Token::Id(String::from("!enddefine"))),
ScanToken::Token(Token::EndCommand),
],
);
ScanToken::Token(Token::Punct(Punct::RParen)),
ScanToken::Token(Token::String(String::from(""))),
ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!enddefine",
- )))),
+ ScanToken::Token(Token::Id(String::from("!enddefine"))),
ScanToken::Token(Token::EndCommand),
],
);
ScanToken::Token(Token::Punct(Punct::LParen)),
ScanToken::Token(Token::Punct(Punct::RParen)),
ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!enddefine",
- )))),
+ ScanToken::Token(Token::Id(String::from("!enddefine"))),
ScanToken::Token(Token::EndCommand),
],
);
ScanToken::Token(Token::Punct(Punct::LParen)),
ScanToken::Token(Token::Punct(Punct::RParen)),
ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!enddefine",
- )))),
+ ScanToken::Token(Token::Id(String::from("!enddefine"))),
ScanToken::Token(Token::EndCommand),
],
);
ScanToken::Token(Token::Punct(Punct::RParen)),
ScanToken::Token(Token::String(String::from("content 1"))),
ScanToken::Token(Token::String(String::from("content 2"))),
- ScanToken::Token(Token::MacroToken(MacroToken::MacroId(String::from(
- "!enddefine",
- )))),
+ ScanToken::Token(Token::Id(String::from("!enddefine"))),
ScanToken::Token(Token::EndCommand),
],
);
//! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE).
use crate::{
- identifier::{id_match, id_match_n, is_reserved_word, IdentifierChar},
+ identifier::{id_match, id_match_n, IdentifierChar},
prompt::PromptStyle,
};
use bitflags::bitflags;
-use super::{
- command_name::{command_match, COMMAND_NAMES},
-};
+use super::command_name::{command_match, COMMAND_NAMES};
/// Segmentation mode.
///
HexString,
UnicodeString,
UnquotedString,
- ReservedWord,
Identifier,
Punct,
Shbang,
DoRepeatCommand,
DoRepeatOverflow,
InlineData,
- MacroId,
MacroName,
MacroBody,
StartDocument,
'x' | 'X' | 'u' | 'U' => {
let (c, _rest) = take(rest, eof)?;
Ok(c == Some('\'') || c == Some('"'))
- },
+ }
'\'' | '"' => Ok(true),
'\n' | '\r' if is_end_of_line(input, eof)? => Ok(true),
_ => Ok(false),
'!' => {
let (c, rest2) = take(rest, eof)?;
match c {
- Some('*') => Ok((rest2, Segment::MacroId)),
+ Some('*') => Ok((rest2, Segment::Punct)),
Some(_) => self.parse_id(input, eof),
None => Ok((rest, Segment::Punct)),
}
}
c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Segment::Spaces)),
c if c.may_start_id() => self.parse_id(input, eof),
- '!'..='~' if c != '\\' && c != '^' => {
+ '#'..='~' if c != '\\' && c != '^' => {
self.state.1 = Substate::empty();
Ok((rest, Segment::Punct))
}
| Segment::HexString
| Segment::UnicodeString
| Segment::UnquotedString
- | Segment::ReservedWord
| Segment::Punct
| Segment::CommentCommand
| Segment::DoRepeatCommand
| Segment::DoRepeatOverflow
| Segment::InlineData
- | Segment::MacroId
| Segment::MacroName
| Segment::MacroBody
| Segment::StartDocument
}
self.state.1 = Substate::empty();
- let segment = if is_reserved_word(identifier) {
- Segment::ReservedWord
- } else if identifier.starts_with('!') {
- Segment::MacroId
- } else {
- Segment::Identifier
- };
- Ok((rest, segment))
+ Ok((
+ rest,
+ if identifier != "!" {
+ Segment::Identifier
+ } else {
+ Segment::Punct
+ },
+ ))
}
fn parse_digraph<'a>(
&mut self,
/// - The `DEFINE` keyword.
///
/// - An identifier. We transform this into `Type::MacroName` instead of
- /// `Type::Identifier` or `Type::MacroId` because this identifier must
- /// never be macro-expanded.
+ /// `Type::Identifier` because this identifier must never be macro-expanded.
///
/// - Anything but `(`.
///
) -> Result<(&'a str, Segment), Incomplete> {
let (rest, segment) = self.subparse(input, eof)?;
match segment {
- Segment::Identifier | Segment::MacroId if self.state.0 == State::Define1 => {
+ Segment::Identifier if self.state.0 == State::Define1 => {
self.state.0 = State::Define2;
return Ok((rest, Segment::MacroName));
}
use crate::prompt::PromptStyle;
-use super::{Mode, Segmenter, Segment};
+use super::{Mode, Segment, Segmenter};
-fn push_segment<'a>(segmenter: &mut Segmenter, input: &'a str, one_byte: bool) -> (&'a str, Segment) {
+fn push_segment<'a>(
+ segmenter: &mut Segmenter,
+ input: &'a str,
+ one_byte: bool,
+) -> (&'a str, Segment) {
if one_byte {
for len in input.char_indices().map(|(pos, _c)| pos) {
if let Ok((rest, segment)) = segmenter.push(&input[..len], false) {
let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect();
assert_eq!(expect_segments.pop(), Some((Segment::End, "")));
assert_eq!(expect_segments.pop(), Some((Segment::Newline, "\n")));
- while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) = expect_segments.last()
+ while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) =
+ expect_segments.last()
{
expect_segments.pop();
}
f@#_.#6
GhIjK
.x 1y _z
+!abc abc!
"#,
Mode::Auto,
&[
(Segment::Spaces, " "),
(Segment::Identifier, "abcd"),
(Segment::Spaces, " "),
- (Segment::MacroId, "!abcd"),
+ (Segment::Identifier, "!abcd"),
(Segment::Newline, "\n"),
(Segment::Identifier, "A"),
(Segment::Spaces, " "),
(Segment::Spaces, " "),
(Segment::Identifier, "ABCD"),
(Segment::Spaces, " "),
- (Segment::MacroId, "!ABCD"),
+ (Segment::Identifier, "!ABCD"),
(Segment::Newline, "\n"),
(Segment::Identifier, "aB"),
(Segment::Spaces, " "),
(Segment::Spaces, " "),
(Segment::Identifier, "aBcD"),
(Segment::Spaces, " "),
- (Segment::MacroId, "!aBcD"),
+ (Segment::Identifier, "!aBcD"),
(Segment::Newline, "\n"),
(Segment::Identifier, "$x"),
(Segment::Spaces, " "),
(Segment::Spaces, " "),
(Segment::Identifier, "$z"),
(Segment::Spaces, " "),
- (Segment::MacroId, "!$z"),
+ (Segment::Identifier, "!$z"),
(Segment::Newline, "\n"),
(Segment::Identifier, "grève"),
(Segment::Spaces, "\u{00a0}"),
(Segment::Spaces, " "),
(Segment::Identifier, "#d"),
(Segment::Spaces, " "),
- (Segment::MacroId, "!#d"),
+ (Segment::Identifier, "!#d"),
(Segment::Newline, "\n"),
(Segment::Identifier, "@efg"),
(Segment::Spaces, " "),
(Segment::Spaces, " "),
(Segment::Identifier, "@#@"),
(Segment::Spaces, " "),
- (Segment::MacroId, "!@"),
+ (Segment::Identifier, "!@"),
(Segment::Spaces, " "),
(Segment::Newline, "\n"),
(Segment::Identifier, "##"),
(Segment::Punct, "_"),
(Segment::Identifier, "z"),
(Segment::Newline, "\n"),
+ (Segment::Identifier, "!abc"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "abc"),
+ (Segment::Punct, "!"),
+ (Segment::Newline, "\n"),
(Segment::End, ""),
],
&[
PromptStyle::Later,
PromptStyle::Later,
PromptStyle::Later,
+ PromptStyle::Later,
],
);
}
"#,
Mode::Auto,
&[
- (Segment::ReservedWord, "and"),
+ (Segment::Identifier, "and"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "or"),
+ (Segment::Identifier, "or"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "not"),
+ (Segment::Identifier, "not"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "eq"),
+ (Segment::Identifier, "eq"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "ge"),
+ (Segment::Identifier, "ge"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "gt"),
+ (Segment::Identifier, "gt"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "le"),
+ (Segment::Identifier, "le"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "lt"),
+ (Segment::Identifier, "lt"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "ne"),
+ (Segment::Identifier, "ne"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "all"),
+ (Segment::Identifier, "all"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "by"),
+ (Segment::Identifier, "by"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "to"),
+ (Segment::Identifier, "to"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "with"),
+ (Segment::Identifier, "with"),
(Segment::Newline, "\n"),
- (Segment::ReservedWord, "AND"),
+ (Segment::Identifier, "AND"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "OR"),
+ (Segment::Identifier, "OR"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "NOT"),
+ (Segment::Identifier, "NOT"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "EQ"),
+ (Segment::Identifier, "EQ"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "GE"),
+ (Segment::Identifier, "GE"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "GT"),
+ (Segment::Identifier, "GT"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "LE"),
+ (Segment::Identifier, "LE"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "LT"),
+ (Segment::Identifier, "LT"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "NE"),
+ (Segment::Identifier, "NE"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "ALL"),
+ (Segment::Identifier, "ALL"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "BY"),
+ (Segment::Identifier, "BY"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "TO"),
+ (Segment::Identifier, "TO"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "WITH"),
+ (Segment::Identifier, "WITH"),
(Segment::Newline, "\n"),
(Segment::Identifier, "andx"),
(Segment::Spaces, " "),
(Segment::Newline, "\n"),
(Segment::Identifier, "and."),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "with"),
+ (Segment::Identifier, "with"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
(Segment::Punct, "["),
(Segment::Punct, "]"),
(Segment::Punct, "**"),
- (Segment::MacroId, "!*"),
+ (Segment::Punct, "!*"),
(Segment::Newline, "\n"),
(Segment::Punct, "%"),
(Segment::Spaces, " "),
(Segment::Spaces, " "),
(Segment::Punct, "~"),
(Segment::Spaces, " "),
- (Segment::MacroId, "!*"),
+ (Segment::Punct, "!*"),
(Segment::Newline, "\n"),
(Segment::End, ""),
],
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::Identifier, "#"),
- (Segment::MacroId, "!"),
+ (Segment::Punct, "!"),
(Segment::Spaces, " "),
(Segment::Punct, "/"),
(Segment::Identifier, "usr"),
(Segment::Spaces, " "),
(Segment::Identifier, "ambiguous"),
(Segment::Spaces, " "),
- (Segment::ReservedWord, "with"),
+ (Segment::Identifier, "with"),
(Segment::Spaces, " "),
(Segment::Identifier, "COMPUTE"),
(Segment::EndCommand, "."),
(Segment::Number, "123"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
- (Segment::ReservedWord, "not"),
+ (Segment::Identifier, "not"),
(Segment::Spaces, " "),
(Segment::Identifier, "data"),
(Segment::Newline, "\n"),
(Segment::Newline, "\n"),
(Segment::DoRepeatCommand, "+ third command."),
(Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "end /* x */ /* y */ repeat print."),
+ (
+ Segment::DoRepeatCommand,
+ "end /* x */ /* y */ repeat print.",
+ ),
(Segment::Newline, "\n"),
(Segment::Identifier, "end"),
(Segment::Newline, "\n"),
(Segment::Newline, "\n"),
(Segment::MacroBody, "var1 var2 var3 \"!enddefine\""),
(Segment::Newline, "\n"),
- (Segment::MacroId, "!enddefine"),
+ (Segment::Identifier, "!enddefine"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
(Segment::Punct, ")"),
(Segment::MacroBody, " var1 var2 var3 /* !enddefine"),
(Segment::Newline, "\n"),
- (Segment::MacroId, "!enddefine"),
+ (Segment::Identifier, "!enddefine"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
(Segment::Punct, ")"),
(Segment::Newline, "\n"),
(Segment::MacroBody, "var1 var2 var3"),
- (Segment::MacroId, "!enddefine"),
+ (Segment::Identifier, "!enddefine"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
(Segment::Punct, "("),
(Segment::Punct, ")"),
(Segment::MacroBody, "var1 var2 var3"),
- (Segment::MacroId, "!enddefine"),
+ (Segment::Identifier, "!enddefine"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
(Segment::Punct, "("),
(Segment::Punct, ")"),
(Segment::Newline, "\n"),
- (Segment::MacroId, "!enddefine"),
+ (Segment::Identifier, "!enddefine"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
(Segment::Newline, "\n"),
(Segment::MacroBody, ""),
(Segment::Newline, "\n"),
- (Segment::MacroId, "!enddefine"),
+ (Segment::Identifier, "!enddefine"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
(Segment::Punct, ")"),
(Segment::Punct, ")"),
(Segment::Newline, "\n"),
- (Segment::MacroId, "!enddefine"),
+ (Segment::Identifier, "!enddefine"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
(Segment::Newline, "\n"),
(Segment::Punct, ")"),
(Segment::Newline, "\n"),
- (Segment::MacroId, "!enddefine"),
+ (Segment::Identifier, "!enddefine"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
(Segment::Newline, "\n"),
(Segment::MacroBody, "content 2"),
(Segment::Newline, "\n"),
- (Segment::MacroId, "!enddefine"),
+ (Segment::Identifier, "!enddefine"),
(Segment::EndCommand, "."),
(Segment::Newline, "\n"),
(Segment::End, ""),
/// Operators, punctuators, and reserved words.
Punct(Punct),
-
- /// Tokens that only appear in macros.
- MacroToken(MacroToken),
}
fn is_printable(c: char) -> bool {
}
Token::EndCommand => write!(f, "."),
Token::Punct(punct) => punct.fmt(f),
- Token::MacroToken(mt) => mt.fmt(f),
}
}
}
/// `**`.
Exp,
+
+ /// `!` (only appears in macros).
+ Bang,
+
+ /// `%` (only appears in macros).
+ Percent,
+
+ /// `?` (only appears in macros).
+ Question,
+
+ /// `` ` `` (only appears in macros).
+ Backtick,
+
+ /// `.`.
+ ///
+ /// This represents a dot in the middle of a line by itself, where it does not end a command.
+ Dot,
+
+ /// `_` (only appears in macros).
+ ///
+ /// Although underscores may appear within identifiers, they can't be the
+ /// first character, so this represents an underscore found on its own.
+ Underscore,
+
+ /// `!*` (only appears in macros).
+ BangAsterisk,
}
impl Punct {
Self::To => "TO",
Self::With => "WITH",
Self::Exp => "**",
+ Self::Bang => "!",
+ Self::Percent => "%",
+ Self::Question => "?",
+ Self::Backtick => "`",
+ Self::Dot => ".",
+ Self::Underscore => "_",
+ Self::BangAsterisk => "!*",
}
}
}
write!(f, "{}", self.as_str())
}
}
-
-/// Tokens that only appear in macros.
-#[derive(Clone, Debug, PartialEq, Eq)]
-pub enum MacroToken {
- /// Identifier starting with `!`.
- MacroId(String),
-
- /// `!`.
- Bang,
-
- /// `%`.
- Percent,
-
- /// `?`.
- Question,
-
- /// ````.
- Backtick,
-
- /// `.` (in the middle of a line by itself, where it does not end a command).
- Dot,
-
- /// `_`.
- ///
- /// Although underscores may appear within identifiers, they can't be the
- /// first character, so this represents an underscore found on its own.
- Underscore,
-}
-
-impl MacroToken {
- pub fn as_str(&self) -> &str {
- match self {
- MacroToken::MacroId(id) => &id,
- MacroToken::Bang => "!",
- MacroToken::Percent => "%",
- MacroToken::Question => "?",
- MacroToken::Backtick => "`",
- MacroToken::Dot => ".",
- MacroToken::Underscore => "_",
- }
- }
-}
-
-impl Display for MacroToken {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- match self {
- MacroToken::MacroId(id) => write!(f, "{id}"),
- _ => write!(f, "{}", self.as_str()),
- }
- }
-}
use thiserror::Error as ThisError;
use unicase::UniCase;
-use crate::{
- lex::token::{MacroToken, Token},
- message::Location,
-};
+use crate::{lex::token::Token, message::Location};
#[derive(Clone, Debug, ThisError)]
pub enum MacroError {
macro_: String,
},
- /// Expected a particular token.
+ /// Expected a particular token at end of command.
#[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")]
ExpectedToken {
token: String,
arg: String,
macro_: String,
},
+
+ /// Expected a particular token, got a different one.
+ #[error(
+ "Found `{actual}` while expecting `{expected}` reading argument {arg} to macro {macro_}."
+ )]
+ UnexpectedToken {
+ actual: String,
+ expected: String,
+ arg: String,
+ macro_: String,
+ },
}
/// A PSPP macro as defined with `!DEFINE`.
} else if let ValueType::Enclose(_, _) = self.parameters[0].arg {
MacroCallState::Enclose
} else {
- MacroCallState::NewArg
+ MacroCallState::Arg
}
}
}
struct Parameter {
- /// `!name` or `!1`.
- name: String,
+ /// `name` or `1`.
+ name: UniCase<String>,
/// Default value.
///
/// A token and the syntax that was tokenized to produce it. The syntax allows
/// the token to be turned back into syntax accurately.
+#[derive(Clone)]
struct BodyToken {
/// The token.
token: Token,
type MacroSet = HashMap<UniCase<String>, Macro>;
pub enum MacroCallState {
- /// Starting a new argument.
- NewArg,
-
/// Accumulating tokens toward the end of any type of argument.
- ContinueArg,
+ Arg,
/// Expecting the opening delimiter of an ARG_ENCLOSE argument.
Enclose,
macros: &'a MacroSet,
macro_: &'a Macro,
state: MacroCallState,
- args: Vec<Vec<BodyToken>>,
+ args: Box<[Option<Vec<BodyToken>>]>,
+ arg_index: usize,
/// Length of macro call so far.
n_tokens: usize,
return None;
}
let macro_name = match token {
- Token::Id(s) => s,
- Token::MacroToken(MacroToken::MacroId(s)) => s,
+ Token::Id(s) => s.clone(),
_ => return None,
- }
- .clone();
+ };
// XXX Unicase::new() is very expensive. We probably need to define our
// own Unicase-alike that has a proper Borrow<> implementation.
let Some(macro_) = macros.get(&UniCase::new(macro_name)) else {
macros,
macro_,
state: macro_.initial_state(),
- args: Vec::with_capacity(macro_.parameters.len()),
+ args: (0..macro_.parameters.len()).map(|_| None).collect(),
+ arg_index: 0,
n_tokens: 1,
})
}
- fn push_continue_arg(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) {
- if let Token::EndCommand | Token::End = token {
- let param = &self.macro_.parameters[self.args.len() - 1];
- let arg = self.args.last().unwrap();
- match param.arg {
- ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens {
- n: n - arg.len(),
- arg: param.name.clone(),
- macro_: self.macro_.name.clone(),
- }),
- ValueType::CharEnd(end) | ValueType::Enclose(_, end) => todo!(),
- ValueType::CmdEnd => todo!(),
+ /// Marks the macro call as finished and returns its length in tokens.
+ ///
+ /// Every argument that was never supplied is filled in with a clone of
+ /// the corresponding parameter's default value.
+ fn finished(&mut self) -> Option<usize> {
+ self.state = MacroCallState::Finished;
+ for (i, arg) in self.args.iter_mut().enumerate() {
+ if arg.is_none() {
+ *arg = Some(self.macro_.parameters[i].default.clone());
}
}
+ Some(self.n_tokens)
}
- fn push_new_arg(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) {
+
+ /// Moves on once the current argument is complete.
+ ///
+ /// Returns the result of [`Self::finished`] when no further argument can
+ /// follow. Otherwise selects the state for the next argument and returns
+ /// `None` to ask for more tokens.
+ fn next_arg(&mut self) -> Option<usize> {
+ // Copy the shared reference so that looking at parameters never
+ // overlaps with the mutations of `self` below.
+ let macro_ = self.macro_;
+ if macro_.parameters.is_empty() {
+ return self.finished();
+ }
+
+ if !macro_.parameters[self.arg_index].is_positional() {
+ // After a keyword argument, the call is complete only when every
+ // argument has a value; otherwise another keyword may follow.
+ if self.args.iter().all(|arg| arg.is_some()) {
+ return self.finished();
+ }
+ self.state = MacroCallState::Keyword;
+ return None;
+ }
+
+ // After a positional argument, step to the parameter that follows it.
+ self.arg_index += 1;
+ if self.arg_index >= self.args.len() {
+ return self.finished();
+ }
+
+ let upcoming = &macro_.parameters[self.arg_index];
+ self.state = match upcoming.arg {
+ _ if !upcoming.is_positional() => MacroCallState::Keyword,
+ ValueType::Enclose(_, _) => MacroCallState::Enclose,
+ _ => MacroCallState::Arg,
+ };
+ None
+ }
+
+ fn push_arg(
+ &mut self,
+ token: &Token,
+ syntax: &str,
+ error: &impl Fn(MacroError),
+ ) -> Option<usize> {
+ let param = &self.macro_.parameters[self.args.len() - 1];
if let Token::EndCommand | Token::End = token {
- return self.mc_finished();
+ if let Some(arg) = &self.args[self.arg_index] {
+ let param = &self.macro_.parameters[self.args.len() - 1];
+
+ match ¶m.arg {
+ ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens {
+ n: n - arg.len(),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ }),
+ ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
+ error(MacroError::ExpectedToken {
+ token: end.to_string(),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ })
+ }
+ ValueType::CmdEnd => {
+ // This is OK, it's the expected way to end the argument.
+ }
+ }
+ }
+ return self.finished();
+ }
+
+ self.n_tokens += 1;
+ let arg = self.args[self.arg_index].get_or_insert(Vec::new());
+ let (
+ add_token, // Should we add `mt` to the current arg?
+ next_arg, // Should we advance to the next arg?
+ ) = match ¶m.arg {
+ ValueType::NTokens(n) => (arg.len() + 1 >= *n, true),
+ ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
+ let at_end = token == end;
+ (at_end, !at_end)
+ }
+ ValueType::CmdEnd => (false, true),
+ };
+ if add_token {
+ if true
+ // !macro_expand_arg (&mt->token, mc->me, *argp)
+ {
+ arg.push(BodyToken {
+ token: token.clone(),
+ syntax: String::from(syntax),
+ });
+ }
+ }
+ if next_arg {
+ self.next_arg()
+ } else {
+ None
+ }
+ }
+
+ fn push_enclose(
+ &mut self,
+ token: &Token,
+ syntax: &str,
+ error: &impl Fn(MacroError),
+ ) -> Option<usize> {
+ let param = &self.macro_.parameters[self.arg_index];
+ let ValueType::Enclose(start, _) = ¶m.arg else {
+ unreachable!()
+ };
+ if token == start {
+ self.n_tokens += 1;
+ self.args[self.arg_index].get_or_insert(Vec::new());
+ self.state = MacroCallState::Arg;
+ None
+ } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) {
+ self.finished()
+ } else {
+ error(MacroError::UnexpectedToken {
+ actual: String::from(syntax),
+ expected: start.to_string(),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ });
+ self.finished()
}
- self.args.push(Vec::new());
- self.state = MacroCallState::ContinueArg;
- self.push_continue_arg(token, syntax, error);
}
- pub fn push(&mut self, token: &Token, syntax: &String, error: &impl Fn(MacroError)) -> ! {
+
+ /// Handles `token` while expecting the name of a keyword argument.
+ ///
+ /// A token that is not an identifier, or an identifier that does not name
+ /// one of the macro's parameters (compared case-insensitively), is not
+ /// part of the call, so the call finishes. Otherwise the named parameter
+ /// becomes the current one and `None` is returned to ask for more tokens.
+ ///
+ /// NOTE(review): `syntax` and `error` are not used yet. Presumably a
+ /// keyword that was already given a value should be reported through
+ /// `error`, but no `MacroError` variant for that is visible here -- confirm.
+ fn push_keyword(
+ &mut self,
+ token: &Token,
+ syntax: &str,
+ error: &impl Fn(MacroError),
+ ) -> Option<usize> {
+ let Token::Id(id) = token else {
+ return self.finished();
+ };
+ let Some(arg_idx) = self
+ .macro_
+ .parameters
+ .iter()
+ .position(|param| param.name == UniCase::new(id))
+ else {
+ return self.finished();
+ };
+
+ // NOTE(review): assumes `MacroCallState::Equals` is the state that
+ // expects the `=` following a keyword -- confirm against its handler.
+ self.arg_index = arg_idx;
+ self.n_tokens += 1;
+ self.state = MacroCallState::Equals;
+ None
+ }
+
+ /// Adds `token`, which has the given `syntax`, to the collection of tokens
+ /// in `self` that potentially need to be macro expanded.
+ ///
+ /// Returns `None` if the macro expander needs more tokens, for macro
+ /// arguments or to decide whether this is actually a macro invocation. The
+ /// caller should call `push` again with the next token.
+ ///
+ /// Returns `Some(n)` if the macro was complete with `n` tokens. The caller
+ /// should call [`Self::expand`] to obtain the expansion. (If `n == 0`,
+ /// then the tokens did not actually invoke a macro at all and the expansion
+ /// will be empty.)
+ pub fn push(
+ &mut self,
+ token: &Token,
+ syntax: &str,
+ error: &impl Fn(MacroError),
+ ) -> Option<usize> {
match self.state {
- MacroCallState::NewArg => self.push_new_arg(token, syntax, error),
- MacroCallState::ContinueArg => self.push_continue_arg(token, syntax, error),
- MacroCallState::Enclose => todo!(),
+ MacroCallState::Arg => self.push_arg(token, syntax, error),
+ MacroCallState::Enclose => self.push_enclose(token, syntax, error),
MacroCallState::Keyword => todo!(),
MacroCallState::Equals => todo!(),
MacroCallState::Finished => todo!(),