From 0bc508c8f964826c2769f676c99dc4e1c08b2895 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 25 Aug 2024 17:07:09 -0700 Subject: [PATCH] new lexer --- rust/pspp/src/command.rs | 31 +++----- rust/pspp/src/engine.rs | 20 +++-- rust/pspp/src/lex/lexer.rs | 141 ++++++++++++++++++++++++++++----- rust/pspp/src/lex/scan/mod.rs | 5 +- rust/pspp/src/lex/scan/test.rs | 29 ------- rust/pspp/src/lex/token.rs | 4 - rust/pspp/src/macros.rs | 5 +- 7 files changed, 143 insertions(+), 92 deletions(-) diff --git a/rust/pspp/src/command.rs b/rust/pspp/src/command.rs index a4051a2f55..325d3d9502 100644 --- a/rust/pspp/src/command.rs +++ b/rust/pspp/src/command.rs @@ -8,7 +8,7 @@ use crate::{ integer::ToInteger, lex::{ command_name::CommandMatcher, - lexer::Lexer, + lexer::NewLexer, token::{Punct, Token}, }, message::Diagnostic, @@ -126,7 +126,7 @@ fn commands() -> &'static [Command] { COMMANDS.get_or_init(|| new_commands()).as_slice() } -fn parse_command_word(lexer: &mut Lexer, s: &mut String, n: isize) -> bool { +fn parse_command_word(lexer: &mut NewLexer, s: &mut String, n: isize) -> bool { let separator = match s.chars().next_back() { Some(c) if c != '-' => " ", _ => "", }; @@ -162,7 +162,7 @@ fn find_best_match(s: &str) -> (Option<&'static Command>, isize) { } fn parse_command_name( - lexer: &mut Lexer, + lexer: &mut NewLexer, error: &Box<dyn Fn(Diagnostic)>, ) -> Result<(&'static Command, isize), ()> { let mut s = String::new(); @@ -203,7 +203,7 @@ pub enum Success { pub fn end_of_command(context: &Context) -> Result<Success, ()> { match context.lexer.token() { - Token::EndCommand | Token::End => Ok(Success::Success), + Token::EndCommand => Ok(Success::Success), _ => { context.error( context @@ -215,47 +215,34 @@ pub fn end_of_command(context: &Context) -> Result<Success, ()> { } } -fn parse_in_state(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>, _state: State) { +fn parse_in_state(mut lexer: NewLexer, error: &Box<dyn Fn(Diagnostic)>, _state: State) { println!("{}:{}", file!(), line!()); match lexer.token() { - Token::End | Token::EndCommand => (), + Token::EndCommand => (), _ => { - println!("{}:{}", file!(), line!()); - if let Ok((command, n_tokens)) = parse_command_name(lexer, error) { + if let Ok((command, n_tokens)) = parse_command_name(&mut lexer, error) { for _ in 0..n_tokens { lexer.get(); } - println!("{}:{}", file!(), line!()); let mut context = Context { error, lexer, command_name: Some(command.name), }; - println!("{}:{}", file!(), line!()); (command.run)(&mut context); - println!("{}:{}", file!(), line!()); let _ = end_of_command(&context); - println!("{}:{}", file!(), line!()); } - println!("{}:{}", file!(), line!()); - lexer.discard_rest_of_command(); - println!("{}:{}", file!(), line!()); } } - println!("{}:{}", file!(), line!()); - while let Token::EndCommand = lexer.token() { - lexer.get(); - } - println!("{}:{}", file!(), line!()); } -pub fn parse(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>) { +pub fn parse_command(lexer: NewLexer, error: &Box<dyn Fn(Diagnostic)>) { parse_in_state(lexer, error, State::Initial) } pub struct Context<'a> { error: &'a Box<dyn Fn(Diagnostic)>, - lexer: &'a mut Lexer, + lexer: NewLexer<'a>, command_name: Option<&'static str>, } diff --git a/rust/pspp/src/engine.rs b/rust/pspp/src/engine.rs index 851d427ddd..32132c8cdb 100644 --- a/rust/pspp/src/engine.rs +++ b/rust/pspp/src/engine.rs @@ -1,9 +1,9 @@ use crate::{ - command::parse, + command::parse_command, lex::{ - lexer::{Lexer, Source}, - token::Token, + lexer::{Lexer, NewLexer, NewSource}, }, + macros::MacroSet, message::Diagnostic, }; @@ -17,15 +17,13 @@ impl Engine { lexer: Lexer::new(Box::new(|location, error|
println!("{location}: {error}"))), } } - fn run(&mut self, source: Source) { - self.lexer.append(source); - self.lexer.get(); - while self.lexer.token() != &Token::End { - println!("{}:{}", file!(), line!()); + fn run(&mut self, mut source: NewSource) { + let macros = MacroSet::new(); + while let Some(tokens) = source.read_command(&macros) { let error: Box<dyn Fn(Diagnostic)> = Box::new(|diagnostic| { println!("{diagnostic}"); }); - parse(&mut self.lexer, &error); + parse_command(NewLexer::new(&tokens), &error); } } } @@ -34,14 +32,14 @@ mod tests { use encoding_rs::UTF_8; - use crate::lex::lexer::{Source, SourceFile}; + use crate::lex::lexer::{NewSource, SourceFile}; use super::Engine; #[test] fn test_echo() { let mut engine = Engine::new(); - engine.run(Source::new_default(SourceFile::for_file_contents( + engine.run(NewSource::new_default(SourceFile::for_file_contents( "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(), Some("test.sps".to_string()), UTF_8, diff --git a/rust/pspp/src/lex/lexer.rs b/rust/pspp/src/lex/lexer.rs index dac66928df..288aa7e9fa 100644 --- a/rust/pspp/src/lex/lexer.rs +++ b/rust/pspp/src/lex/lexer.rs @@ -285,11 +285,6 @@ impl Source { match scan_token { None => false, Some(ScanToken::Token(token)) => { - let token = if let Token::End = token { - Token::EndCommand - } else { - token - }; self.pp.push_back(LexToken { token, pos, @@ -465,7 +460,6 @@ impl Source { while index >= self.parse.len() { if let Some(token) = self.parse.last() { match token.token { - Token::End => return &Token::End, Token::EndCommand => return &Token::EndCommand, _ => (), } @@ -656,7 +650,7 @@ impl Lexer { }; if !self.source.get_parse(&context) { if !self.pop_stack() { - return &Token::End; + return &Token::EndCommand; } } } @@ -670,7 +664,7 @@ impl Lexer { } else { self.source = Source::default(); self.source.parse.push(LexToken { - token: Token::EndCommand, + token: Token::EndCommand, pos: 0..0, macro_rep: None, }); @@ -758,14 +752,14 @@ impl Lexer { /// Advances past any tokens up to [Token::EndCommand]. pub fn discard_rest_of_command(&mut self) { - while !matches!(self.token(), Token::EndCommand | Token::End) { + while !matches!(self.token(), Token::EndCommand) { self.get(); } } pub fn at_end(&self) -> bool { match self.source.token() { - Token::End | Token::EndCommand => true, + Token::EndCommand => true, _ => false, } } @@ -787,17 +781,18 @@ pub enum Error { TokenError(#[from] ScanError), } +/* #[cfg(test)] mod tests { use encoding_rs::UTF_8; use crate::lex::token::Token; - use super::{Lexer, Source, SourceFile}; + use super::{Lexer, NewLexer, Source, SourceFile}; #[test] fn test() { - let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}"))); + let mut lexer = NewLexer::new(Box::new(|location, error| println!("{location}: {error}"))); lexer.include(Source::new_default(SourceFile::for_string( String::from( r#"#! /usr/local/bin/pspp LIST. ), UTF_8, ))); + while let Some(tokens) = lexer.read_command() { + loop { lexer.get(); let token = lexer.token(); @@ -873,8 +870,8 @@ lis|.\0", } } } - -struct Tokens { +*/ +pub struct Tokens { file: Arc<SourceFile>, tokens: Vec<LexToken>, } @@ -892,9 +889,97 @@ impl Debug for Tokens { } } +impl Tokens { + /// If the tokens in `ofs` contain a macro call, this returns the raw + /// syntax for the macro call (not for the expansion) and for any other + /// tokens included in that range.
The syntax is encoded in UTF-8 and in + /// the original form supplied to the lexer so that, for example, it may + /// include comments, spaces, and new-lines if it spans multiple tokens. + /// + /// Returns `None` if the token range doesn't include a macro call. + fn get_macro_call(&self, ofs: RangeInclusive<usize>) -> Option<&str> { + if self + .tokens + .get(ofs.clone()) + .unwrap_or_default() + .iter() + .all(|token| token.macro_rep.is_none()) + { + return None; + } + + let token0 = &self.tokens[*ofs.start()]; + let token1 = &self.tokens[*ofs.end()]; + Some(&self.file.buffer[token0.pos.start..token1.pos.end]) + } + + fn ofs_location(&self, range: RangeInclusive<usize>) -> Location { + if *range.start() <= *range.end() && *range.end() < self.tokens.len() { + self.file + .token_location(&self.tokens[*range.start()]..=&self.tokens[*range.end()]) + } else { + Location { + file_name: self.file.file_name.clone(), + span: None, + omit_underlines: false, + } + } + } + + pub fn diagnostic( + &self, + severity: Severity, + ofs: RangeInclusive<usize>, + text: String, + ) -> Diagnostic { + let mut s = String::new(); + if let Some(call) = self.get_macro_call(ofs.clone()) { + write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)).unwrap(); + } + + if !text.is_empty() { + s.push_str(&text); + } else { + s.push_str("Syntax error."); + } + + if !s.ends_with('.') { + s.push('.'); + } + + let location = self.ofs_location(ofs); + let mut source = Vec::new(); + if let Some(Range { + start: Point { line: l0, .. }, + end: Point { line: l1, .. }, + }) = location.span + { + let lines = if l1 - l0 > 3 { + vec![l0, l0 + 1, l1] + } else { + (l0..=l1).collect() + }; + for line_number in lines { + source.push((line_number, self.file.get_line(line_number).to_string())); + } + } + + Diagnostic { + category: Category::Syntax, + severity, + location, + source, + stack: Vec::new(), + command_name: None, // XXX + text: s, + } + } +} + -struct NewLexer<'a> { +pub struct NewLexer<'a> { backing: &'a Tokens, tokens: &'a [LexToken], + start: usize, pos: usize, } @@ -903,6 +988,7 @@ impl<'a> NewLexer<'a> { Self { backing, tokens: backing.tokens.as_slice(), + start: 0, pos: 0, } } @@ -929,18 +1015,35 @@ impl<'a> NewLexer<'a> { pub fn token(&self) -> &Token { self.tokens .get(self.pos) - .map_or(&Token::End, |token| &token.token) + .map_or(&Token::EndCommand, |token| &token.token) } pub fn next(&self, ofs: isize) -> &Token { ofs.checked_add(self.pos as isize) .and_then(|index| usize::try_from(index).ok()) .and_then(|index| self.tokens.get(index)) - .map_or(&Token::End, |token| &token.token) + .map_or(&Token::EndCommand, |token| &token.token) + } + +/* + pub fn force_string(&mut self) -> Result { + if let Token::String(s) = self.token() { + let s = s.clone(); + self.
+ } }*/ + pub fn error<S>(&self, text: S) -> Diagnostic + where + S: ToString, { + let abs_pos = self.start + self.pos; + self.backing + .diagnostic(Severity::Error, abs_pos..=abs_pos, text.to_string()) + } } -struct NewSource { +pub struct NewSource { file: Arc<SourceFile>, segmenter: Segmenter, seg_pos: usize, @@ -1121,7 +1224,7 @@ mod new_lexer_tests { use crate::macros::MacroSet; - use super::{NewLexer, NewSource, Source, SourceFile}; + use super::{NewSource, SourceFile}; #[test] fn test() { diff --git a/rust/pspp/src/lex/scan/mod.rs b/rust/pspp/src/lex/scan/mod.rs index e525f4c4b7..af23a77053 100644 --- a/rust/pspp/src/lex/scan/mod.rs +++ b/rust/pspp/src/lex/scan/mod.rs @@ -392,10 +392,7 @@ impl<'a> Iterator for StringScanner<'a> { } let Some((seg_len, seg_type)) = self.segmenter.push(self.input, true).unwrap() else { - if !self.eof { - self.eof = true; - self.tokens.push_back(Token::End); - } + self.eof = true; return self.merge(true).unwrap(); }; let (s, rest) = self.input.split_at(seg_len); diff --git a/rust/pspp/src/lex/scan/test.rs b/rust/pspp/src/lex/scan/test.rs index 2f43f6a74d..f9c28905d6 100644 --- a/rust/pspp/src/lex/scan/test.rs +++ b/rust/pspp/src/lex/scan/test.rs @@ -10,7 +10,6 @@ use super::{ScanError, ScanToken, StringScanner}; fn print_token(token: &Token) { match token { - Token::End => print!("Token::End"), Token::Id(s) => print!("Token::Id(String::from({s:?}))"), Token::Number(number) => print!("Token::Number({number:?})"), Token::String(s) => print!("Token::String(String::from({s:?}))"), @@ -88,7 +87,6 @@ WXYZ. /* unterminated end of line comment ScanToken::Token(Token::EndCommand), ScanToken::Error(ScanError::UnexpectedChar('�')), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -145,7 +143,6 @@ and. with. ScanToken::Token(Token::Id(Identifier::new("and.").unwrap())), ScanToken::Token(Token::Punct(Punct::With)), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -208,7 +205,6 @@ fn test_punctuation() { ScanToken::Token(Token::Punct(Punct::LCurly)), ScanToken::Token(Token::Punct(Punct::RCurly)), ScanToken::Token(Token::Punct(Punct::Not)), - ScanToken::Token(Token::End), ], ); } @@ -260,7 +256,6 @@ fn test_positive_numbers() { ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())), ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))), ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))), - ScanToken::Token(Token::End), ], ); } @@ -316,7 +311,6 @@ fn test_negative_numbers() { ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))), ScanToken::Token(Token::Number(-1.0)), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -384,7 +378,6 @@ x"4142" ScanToken::Token(Token::String(String::from("ABお"))), ScanToken::Token(Token::String(String::from("�あいうえお"))), ScanToken::Token(Token::String(String::from("abc�えxyz"))), - ScanToken::Token(Token::End), ], ); } @@ -405,7 +398,6 @@ fn test_shbang() { ScanToken::Token(Token::Id(Identifier::new("bin").unwrap())), ScanToken::Token(Token::Punct(Punct::Slash)), ScanToken::Token(Token::Id(Identifier::new("pspp").unwrap())), - ScanToken::Token(Token::End), ], ); } @@ -453,7 +445,6 @@ next command. ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -493,7 +484,6 @@ second paragraph.
ScanToken::Token(Token::String(String::from("second paragraph."))), ScanToken::Token(Token::EndCommand), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -522,7 +512,6 @@ FILE /* ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())), ScanToken::Token(Token::String(String::from("not quoted here either"))), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -561,7 +550,6 @@ end data ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), ScanToken::Token(Token::Id(Identifier::new("data").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -604,7 +592,6 @@ end ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -661,7 +648,6 @@ end repeat ScanToken::Token(Token::String(String::from(" inner command"))), ScanToken::Token(Token::Id(Identifier::new("end").unwrap())), ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())), - ScanToken::Token(Token::End), ], ); } @@ -699,7 +685,6 @@ fourth command. ScanToken::Token(Token::Id(Identifier::new("fifth").unwrap())), ScanToken::Token(Token::Id(Identifier::new("command").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -732,7 +717,6 @@ var1 var2 var3 ScanToken::Token(Token::String(String::from("var1 var2 var3"))), ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -752,7 +736,6 @@ var1 var2 var3 ScanToken::Token(Token::String(String::from(" var1 var2 var3"))), ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -772,7 +755,6 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::String(String::from("var1 var2 var3"))), ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -791,7 +773,6 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::String(String::from("var1 var2 var3"))), ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -810,7 +791,6 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -833,7 +813,6 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::String(String::from(""))), ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -863,7 +842,6 @@ var1 var2 var3!enddefine. ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -897,7 +875,6 @@ var1 var2 var3!enddefine. 
ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -927,7 +904,6 @@ content 2 ScanToken::Token(Token::String(String::from("content 2"))), ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -949,7 +925,6 @@ data list /x 1. ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Number(1.0)), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -973,7 +948,6 @@ data list /x 1. ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Number(1.0)), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -999,7 +973,6 @@ data list /x 1. ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Number(1.0)), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -1023,7 +996,6 @@ data list /x 1. ScanToken::Token(Token::Id(Identifier::new("x").unwrap())), ScanToken::Token(Token::Number(1.0)), ScanToken::Token(Token::EndCommand), - ScanToken::Token(Token::End), ], ); } @@ -1043,7 +1015,6 @@ content line 2 ScanToken::Token(Token::Punct(Punct::RParen)), ScanToken::Token(Token::String(String::from("content line 1"))), ScanToken::Token(Token::String(String::from("content line 2"))), - ScanToken::Token(Token::End), ], ); } diff --git a/rust/pspp/src/lex/token.rs b/rust/pspp/src/lex/token.rs index 2b59423b5f..847c1eae06 100644 --- a/rust/pspp/src/lex/token.rs +++ b/rust/pspp/src/lex/token.rs @@ -4,9 +4,6 @@ use crate::identifier::Identifier; #[derive(Clone, Debug, PartialEq)] pub enum Token { - /// End of input. - End, - /// Identifier. Id(Identifier), @@ -54,7 +51,6 @@ fn string_representation(s: &str, quote: char, f: &mut Formatter<'_>) -> FmtResu impl Display for Token { fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { match self { - Token::End => Ok(()), Token::Id(s) => write!(f, "{s}"), Token::Number(number) => { if number.is_sign_negative() { diff --git a/rust/pspp/src/macros.rs b/rust/pspp/src/macros.rs index 7b0a34e223..a0308996ea 100644 --- a/rust/pspp/src/macros.rs +++ b/rust/pspp/src/macros.rs @@ -380,7 +380,6 @@ impl TokenClass { impl From<&Token> for TokenClass { fn from(source: &Token) -> Self { match source { - Token::End => Self::Punct, Token::Id(_) | Token::Number(_) | Token::String(_) => Self::Id, Token::EndCommand => Self::EndCommand, Token::Punct(punct) => match punct { @@ -589,7 +588,7 @@ impl<'a> Parser<'a> { fn push_arg(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) { let param = &self.macro_.parameters[self.args.len() - 1]; - if let Token::EndCommand | Token::End = token { + if let Token::EndCommand = token { if let Some(arg) = &self.args[self.arg_index] { let param = &self.macro_.parameters[self.args.len() - 1]; @@ -651,7 +650,7 @@ impl<'a> Parser<'a> { self.n_tokens += 1; self.args[self.arg_index].get_or_insert(Vec::new()); self.state = ParserState::Arg; - } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) { + } else if param.is_positional() && matches!(token, Token::EndCommand) { self.finished(); } else { error(MacroError::UnexpectedToken { -- 2.30.2
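A sketch of the end-to-end flow this patch establishes, modeled on the new `Engine::run` and the updated `test_echo` above. This is a minimal, hypothetical driver, assuming only the APIs visible in the diff (`SourceFile::for_file_contents`, `NewSource::new_default`, `NewSource::read_command`, `NewLexer::new`, `parse_command`); the file name and syntax string are placeholders:

```rust
use encoding_rs::UTF_8;

use crate::{
    command::parse_command,
    lex::lexer::{NewLexer, NewSource, SourceFile},
    macros::MacroSet,
    message::Diagnostic,
};

fn run_syntax() {
    // Wrap raw syntax in a SourceFile, as the updated test_echo does.
    let mut source = NewSource::new_default(SourceFile::for_file_contents(
        "ECHO 'hi there'.\n".to_string(), // placeholder syntax
        Some("example.sps".to_string()),  // placeholder file name
        UTF_8,
    ));
    // read_command expands macros itself, so it needs a MacroSet even
    // when no macros are defined.
    let macros = MacroSet::new();
    // read_command hands back one command's worth of tokens at a time
    // and returns None at end of input, so the Token::End sentinel that
    // this patch deletes is no longer needed.
    while let Some(tokens) = source.read_command(&macros) {
        let error: Box<dyn Fn(Diagnostic)> =
            Box::new(|diagnostic| println!("{diagnostic}"));
        // Each command is parsed by a fresh NewLexer that borrows only
        // that command's tokens, so a command parser cannot read past
        // its own terminator.
        parse_command(NewLexer::new(&tokens), &error);
    }
}
```

The design point: end-of-input stops being a token (`Token::End`) and becomes the `None` case of `read_command`, which is why the diff can delete the sentinel from `Token`, the scanner, and the macro parser in one sweep.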
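Error reporting follows the same borrowed-token design: `NewLexer::error` resolves the cursor to an absolute token offset (`start + pos`) and delegates to `Tokens::diagnostic`, which attaches the file location, a source excerpt, and the "In syntax expanded from `...`" prefix when the token came from a macro expansion. A hypothetical helper showing the intended call pattern; `expect_number` is not part of the patch, but `token()` and `error()` are the methods added above:

```rust
use crate::{lex::lexer::NewLexer, lex::token::Token, message::Diagnostic};

// Hypothetical convenience function for a command parser: return the
// current token as a number, or build a located Diagnostic through
// NewLexer::error / Tokens::diagnostic.
fn expect_number(lexer: &NewLexer) -> Result<f64, Diagnostic> {
    match lexer.token() {
        Token::Number(number) => Ok(*number),
        // Tokens::diagnostic appends a final period if the message
        // lacks one and substitutes "Syntax error." for empty text,
        // so terse messages are fine here.
        _ => Err(lexer.error("Expected a number")),
    }
}
```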