From: Ben Pfaff Date: Mon, 19 Aug 2024 21:53:31 +0000 (-0700) Subject: basic command parsing wroks X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c3359147412d5d6e4266cd445a58b47ca7f2ecb5;p=pspp basic command parsing wroks --- diff --git a/rust/src/command.rs b/rust/src/command.rs new file mode 100644 index 0000000000..4a5b24d10f --- /dev/null +++ b/rust/src/command.rs @@ -0,0 +1,224 @@ +use std::{fmt::Write, sync::OnceLock}; + +use flagset::{flags, FlagSet}; + +use crate::{ + integer::ToInteger, + lex::{ + command_name::CommandMatcher, + lexer::Lexer, + token::{Punct, Token}, + }, + message::Diagnostic, +}; + +flags! { + enum State: u8 { + /// No active dataset yet defined. + Initial, + + /// Active dataset has been defined. + Data, + + /// Inside `INPUT PROGRAM`. + InputProgram, + + /// Inside `FILE TYPE`. + FileType, + + /// State nested inside `LOOP` or `DO IF`, inside [State::Data]. + NestedData, + + /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram]. + NestedInputProgram, + } +} + +struct Command { + allowed_states: FlagSet, + enhanced_only: bool, + testing_only: bool, + no_abbrev: bool, + name: &'static str, + run: Box Result<(), Failure> + Send + Sync>, +} + +fn commands() -> &'static [Command] { + fn new_commands() -> Vec { + vec![Command { + allowed_states: State::Initial | State::Data, + enhanced_only: false, + testing_only: false, + no_abbrev: false, + name: "ECHO", + run: Box::new(|_context| { + println!("hi"); + Ok(()) + }), + }] + } + + static COMMANDS: OnceLock> = OnceLock::new(); + COMMANDS.get_or_init(|| new_commands()).as_slice() +} + +fn parse_command_word(lexer: &mut Lexer, s: &mut String, n: isize) -> bool { + let separator = match s.chars().next_back() { + Some(c) if c != '-' => " ", + _ => "", + }; + + match lexer.next(n) { + Token::Punct(Punct::Dash) => { + s.push('-'); + true + } + Token::Id(id) => { + write!(s, "{separator}{id}").unwrap(); + true + } + Token::Number(number) if number.is_sign_positive() => { + if let Some(integer) = number.to_exact_usize() { + write!(s, "{separator}{integer}").unwrap(); + true + } else { + false + } + } + _ => false, + } +} + +fn find_best_match(s: &str) -> (Option<&'static Command>, isize) { + let mut cm = CommandMatcher::new(s); + for command in commands() { + cm.add(command.name, command); + } + cm.get_match() +} + +fn parse_command_name( + lexer: &mut Lexer, + error: &Box, +) -> Result<(&'static Command, isize), ()> { + let mut s = String::new(); + let mut word = 0; + let mut missing_words = 0; + let mut command = None; + while parse_command_word(lexer, &mut s, word) { + (command, missing_words) = find_best_match(&s); + if missing_words <= 0 { + break; + } + word += 1; + } + if command.is_none() && missing_words > 0 { + s.push_str(" ."); + (command, missing_words) = find_best_match(&s); + s.truncate(s.len() - 2); + } + + match command { + Some(command) => Ok((command, (word + 1) + missing_words)), + None => { + if s.is_empty() { + error(lexer.error("Syntax error expecting command name")) + } else { + error(lexer.error("Unknown command `{s}`.")) + }; + Err(()) + } + } +} + +pub enum Success { + Success, + Eof, + Finish, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Failure { + Failure, + NotImplemented, + CascadingFailure, +} + +pub fn end_of_command(context: &Context) -> Result { + match context.lexer.token() { + Token::EndCommand | Token::End => Ok(Success::Success), + _ => { + context.error( + context + .lexer + .error("Syntax error expecting end of command."), + 
); + Err(Failure::Failure) + } + } +} + +fn _parse_in_state( + lexer: &mut Lexer, + error: &Box, + _state: State, +) -> Result { + match lexer.token() { + Token::End => Ok(Success::Eof), + Token::EndCommand => Ok(Success::Success), + _ => { + let (command, n_tokens) = + parse_command_name(lexer, error).map_err(|_| Failure::Failure)?; + for _ in 0..n_tokens { + lexer.get(); + } + let context = Context { + error, + lexer, + command_name: Some(command.name), + }; + match (command.run)(&context) { + Ok(()) => end_of_command(&context), + Err(error) => Err(error) + } + } + } +} + +fn parse_in_state( + lexer: &mut Lexer, + error: &Box, + state: State, +) -> Result { + let result = _parse_in_state(lexer, error, state); + if result.is_err() { + lexer.interactive_reset(); + } + lexer.discard_rest_of_command(); + + match result { + Ok(Success::Eof) | Ok(Success::Finish) => (), + _ => { + while let Token::EndCommand = lexer.token() { + lexer.get(); + } + } + }; + result +} + +pub fn parse(lexer: &mut Lexer, error: &Box) -> Result { + parse_in_state(lexer, error, State::Initial) +} + +pub struct Context<'a> { + error: &'a Box, + lexer: &'a mut Lexer, + command_name: Option<&'static str>, +} + +impl<'a> Context<'a> { + pub fn error(&self, diagnostic: Diagnostic) { + (self.error)(diagnostic); + } +} diff --git a/rust/src/engine.rs b/rust/src/engine.rs new file mode 100644 index 0000000000..800626e3f7 --- /dev/null +++ b/rust/src/engine.rs @@ -0,0 +1,69 @@ +use crate::{ + command::{parse, Failure, Success}, + lex::lexer::{Lexer, Source}, + message::Diagnostic, +}; + +pub struct Engine { + lexer: Lexer, +} + +impl Engine { + fn new() -> Self { + Self { + lexer: Lexer::new(Box::new(|location, error| println!("{location}: {error}"))), + } + } + fn run(&mut self, source: Source) { + self.lexer.append(source); + self.lexer.get(); + loop { + let error: Box = Box::new(|diagnostic| { + println!("{diagnostic}"); + }); + match parse(&mut self.lexer, &error) { + Ok(Success::Eof) | Ok(Success::Finish) => break, + Ok(Success::Success) => (), + Err(error) => match self.lexer.error_handling() { + crate::lex::lexer::ErrorHandling::Continue + if error == Failure::CascadingFailure => + { + println!("Stopping syntax file processing here to avoid a cascade of dependent command failures."); + self.lexer.discard_noninteractive(); + break; + } + crate::lex::lexer::ErrorHandling::Stop => { + println!("Error encountered while ERROR=STOP is effective."); + self.lexer.discard_noninteractive(); + break; + } + _ => (), + }, + } + } + } +} + +#[cfg(test)] +mod tests { + use encoding_rs::UTF_8; + + use crate::lex::{ + lexer::{ErrorHandling, Source}, + segment::Mode, + }; + + use super::Engine; + + #[test] + fn test_echo() { + let mut engine = Engine::new(); + engine.run(Source::for_file_contents( + "ECHO 'hi there'.\n".to_string(), + Some("test.sps".to_string()), + UTF_8, + Mode::default(), + ErrorHandling::default(), + )); + } +} diff --git a/rust/src/integer.rs b/rust/src/integer.rs new file mode 100644 index 0000000000..6c76839927 --- /dev/null +++ b/rust/src/integer.rs @@ -0,0 +1,86 @@ +pub trait ToInteger { + fn to_exact_integer(&self) -> Option + where + T: FromFloat; + fn to_exact_usize(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u8(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u16(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u32(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u64(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_u128(&self) -> 
Option { + self.to_exact_integer() + } + fn to_exact_isize(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i8(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i16(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i32(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i64(&self) -> Option { + self.to_exact_integer() + } + fn to_exact_i128(&self) -> Option { + self.to_exact_integer() + } +} + +impl ToInteger for f64 { + fn to_exact_integer(&self) -> Option + where + T: FromFloat, + { + T::from_float(*self) + } +} + +pub trait FromFloat { + fn from_float(x: f64) -> Option + where + Self: Sized; +} + +macro_rules! impl_from_float { + ($T:ident) => { + impl FromFloat for $T { + fn from_float(x: f64) -> Option + where + Self: Sized, + { + if x.trunc() == x && x >= $T::MIN as f64 && x <= $T::MAX as f64 { + Some(x as Self) + } else { + None + } + } + } + }; +} + +impl_from_float!(usize); +impl_from_float!(u8); +impl_from_float!(u16); +impl_from_float!(u32); +impl_from_float!(u64); +impl_from_float!(u128); +impl_from_float!(isize); +impl_from_float!(i8); +impl_from_float!(i16); +impl_from_float!(i32); +impl_from_float!(i64); +impl_from_float!(i128); diff --git a/rust/src/lex/command_name.rs b/rust/src/lex/command_name.rs index 208bd457b9..bccea1483b 100644 --- a/rust/src/lex/command_name.rs +++ b/rust/src/lex/command_name.rs @@ -23,7 +23,7 @@ fn count_words(s: &str) -> isize { /// 4. Otherwise, `string` and `command` match. Set *MISSING_WORDS to n - m. Set /// *EXACT to false if any of the S[i] were found to be abbreviated in the /// comparisons done in step 3, or to true if they were all exactly equal -/// (modulo case). Return true. */ +/// (modulo case). Return true. pub fn command_match(command: &str, string: &str) -> Option { let mut command_words = command.split_whitespace(); let mut string_words = string.split_whitespace(); @@ -32,13 +32,13 @@ pub fn command_match(command: &str, string: &str) -> Option { let Some(cw) = command_words.next() else { return Some(Match { exact, - missing_words: -count_words(string), + missing_words: -(string_words.count() as isize), }); }; let Some(sw) = string_words.next() else { return Some(Match { exact, - missing_words: 1 + count_words(command), + missing_words: 1 + command_words.count() as isize, }); }; if !id_match_n_nonstatic(cw, sw, 3) { @@ -50,6 +50,67 @@ pub fn command_match(command: &str, string: &str) -> Option { } } +/// Matches a string against a collection of command names. +pub struct CommandMatcher<'a, T> { + string: &'a str, + extensible: bool, + exact_match: Option, + n_matches: usize, + match_: Option, + match_missing_words: isize, +} + +impl<'a, T> CommandMatcher<'a, T> { + pub fn new(string: &'a str) -> Self { + Self { + string, + extensible: false, + exact_match: None, + n_matches: 0, + match_: None, + match_missing_words: 0, + } + } + + /// Consider `command` as a candidate for the command name being parsed. If + /// `command` is the correct command name, then [Self::get_match] will + /// return `aux` later. 
+ pub fn add(&mut self, command: &str, aux: T) { + if let Some(Match { + missing_words, + exact, + }) = command_match(command, self.string) + { + if missing_words > 0 { + self.extensible = true; + } else if exact && missing_words == 0 { + self.exact_match = Some(aux); + } else { + if missing_words > self.match_missing_words { + self.n_matches = 0; + } + if missing_words >= self.match_missing_words || self.n_matches == 0 { + self.n_matches += 1; + self.match_ = Some(aux); + self.match_missing_words = missing_words; + } + } + } + } + + pub fn get_match(self) -> (Option, isize) { + if self.extensible { + (None, 1) + } else if let Some(exact_match) = self.exact_match { + (Some(exact_match), 0) + } else if self.n_matches == 1 { + (self.match_, self.match_missing_words) + } else { + (None, self.match_missing_words) + } + } +} + pub const COMMAND_NAMES: &'static [&'static str] = &[ "2SLS", "ACF", diff --git a/rust/src/lex/lexer.rs b/rust/src/lex/lexer.rs index fd2c5cdba7..4bd1335595 100644 --- a/rust/src/lex/lexer.rs +++ b/rust/src/lex/lexer.rs @@ -17,7 +17,7 @@ use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; use crate::{ macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser}, - message::{Diagnostic, Location, Point, Severity}, + message::{Category, Diagnostic, Location, Point, Severity}, prompt::PromptStyle, settings::Settings, }; @@ -436,6 +436,7 @@ impl Source { } unreachable!(); } + fn get_parse(&mut self, context: &Context) -> bool { // XXX deal with accumulated messages self.get_parse__(context) @@ -457,6 +458,24 @@ impl Source { } } + /// Returns the syntax for 1-based line-number `line_number`. + fn get_line(&self, line_number: i32) -> &str { + if (1..=self.lines.len() as i32).contains(&line_number) { + let line_number = line_number as usize; + let start = self.lines[line_number - 1]; + let end = self + .lines + .get(line_number) + .copied() + .unwrap_or(self.buffer.len()); + let line = &self.buffer[start..end]; + line.strip_suffix("\r\n") + .unwrap_or(line.strip_suffix('\n').unwrap_or(line)) + } else { + "" + } + } + fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location { Location { file_name: self.file_name.clone(), @@ -528,20 +547,25 @@ impl Source { Some(&self.buffer[token0.pos.start..token1.pos.end]) } - fn diagnostic(&self, severity: Severity, ofs: RangeInclusive, text: S) -> Diagnostic - where - S: AsRef, - { - let text = text.as_ref(); + fn is_empty(&self) -> bool { + self.buffer.is_empty() && self.eof + } + + fn diagnostic( + &self, + severity: Severity, + ofs: RangeInclusive, + text: String, + ) -> Diagnostic { let mut s = String::with_capacity(text.len() + 16); - if self.buffer.is_empty() && self.eof { - write!(&mut s, "At end of input: "); + if self.is_empty() { + s.push_str("At end of input: "); } else if let Some(call) = self.get_macro_call(ofs.clone()) { - write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)); + write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)).unwrap(); } if !text.is_empty() { - s.push_str(text); + s.push_str(&text); } else { s.push_str("Syntax error."); } @@ -550,14 +574,50 @@ impl Source { s.push('.'); } + let location = self.ofs_location(ofs); + let mut source = Vec::new(); + if let Some(Range { + start: Point { line: l0, .. }, + end: Point { line: l1, .. 
}, + }) = location.span + { + let lines = if l1 - l0 > 3 { + vec![l0, l0 + 1, l1] + } else { + (l0..=l1).collect() + }; + for line_number in lines { + source.push((line_number, self.get_line(line_number).to_string())); + } + } + Diagnostic { + category: Category::Syntax, severity, - location: self.ofs_location(ofs), + location, + source, stack: Vec::new(), command_name: None, // XXX text: s, } } + + fn interactive_reset(&mut self) { + if self.error_handling == ErrorHandling::Terminal { + let Source { + error_handling, + encoding, + read, + .. + } = mem::take(self); + *self = Self { + error_handling, + encoding, + read, + ..Source::default() + }; + } + } } fn ellipsize(s: &str) -> Cow { @@ -649,22 +709,28 @@ impl Lexer { macros: &self.macros, error: &self.error, }; - if !self.source.get_parse(&context) { - let Some(new_source) = self.stack.pop() else { - self.source = Source::default(); - self.source.parse.push(LexToken { - token: Token::End, - pos: 0..0, - macro_rep: None, - }); - return &Token::End; - }; - self.source = new_source; + if !self.source.get_parse(&context) && !self.pop_stack() { + return &Token::End; } } self.source.token() } + fn pop_stack(&mut self) -> bool { + if let Some(new_source) = self.stack.pop() { + self.source = new_source; + true + } else { + self.source = Source::default(); + self.source.parse.push(LexToken { + token: Token::End, + pos: 0..0, + macro_rep: None, + }); + false + } + } + /// Inserts `source` so that the next token comes from it. This is only /// permitted when the lexer is either empty or at `Token::EndCommand`. pub fn include(&mut self, mut source: Source) { @@ -700,7 +766,10 @@ impl Lexer { self.source.next(offset, &context) } - pub fn error(&self, text: String) -> Diagnostic { + pub fn error(&self, text: S) -> Diagnostic + where + S: ToString, + { self.diagnostic( Severity::Error, self.source.parse_ofs..=self.source.parse_ofs, @@ -708,13 +777,55 @@ impl Lexer { ) } - pub fn diagnostic( + pub fn diagnostic( &self, severity: Severity, ofs: RangeInclusive, - text: String, - ) -> Diagnostic { - self.source.diagnostic(severity, ofs, text) + text: S, + ) -> Diagnostic + where + S: ToString, + { + self.source.diagnostic(severity, ofs, text.to_string()) + } + + pub fn error_handling(&self) -> ErrorHandling { + self.source.error_handling + } + + /// Discards all lookahead tokens, then discards all input sources + /// until it encounters one with error mode [ErrorHandling::Terminal] or until it + /// runs out of input sources. + pub fn discard_noninteractive(&mut self) { + while self.source.error_handling != ErrorHandling::Ignore { + self.source.pp.clear(); + self.source.merge.clear(); + self.source.parse.clear(); + self.source.parse_ofs = 0; + + if self.source.error_handling == ErrorHandling::Terminal || !self.pop_stack() { + return; + } + } + } + + /// If the source that the lexer is currently reading has error mode + /// [ErrorHandling::Terminal], discards all buffered input and tokens, so + /// that the next token to be read comes directly from whatever is next read + /// from the stream. + /// + /// It makes sense to call this function after encountering an error in a + /// command entered on the console, because usually the user would prefer + /// not to have cascading errors. + pub fn interactive_reset(&mut self) { + self.source.interactive_reset() + } + + /// Advances past any tokens up to [Token::EndCommand] or [Token::End]. 
+ pub fn discard_rest_of_command(&mut self) { + while !matches!(self.token(), Token::EndCommand | Token::End) { + self.get(); + } } } diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 3841e83cbb..3548e020ee 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -16,3 +16,5 @@ pub mod message; pub mod macros; pub mod settings; pub mod command; +pub mod integer; +pub mod engine; diff --git a/rust/src/message.rs b/rust/src/message.rs index 964649abfc..a3ba1d8e9f 100644 --- a/rust/src/message.rs +++ b/rust/src/message.rs @@ -129,22 +129,124 @@ impl Location { } } -#[derive(Enum)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum)] pub enum Severity { Error, Warning, Note, } +impl Severity { + fn as_str(&self) -> &'static str { + match self { + Severity::Error => "error", + Severity::Warning => "warning", + Severity::Note => "note", + } + } +} + +impl Display for Severity { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "{}", self.as_str()) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum Category { + General, + Syntax, + Data, +} + pub struct Stack { location: Location, description: String, } pub struct Diagnostic { - severity: Severity, - location: Location, - stack: Vec, - command_name: Option<&'static str>, - text: String, + pub severity: Severity, + pub category: Category, + pub location: Location, + pub source: Vec<(i32, String)>, + pub stack: Vec, + pub command_name: Option<&'static str>, + pub text: String, +} + +impl Display for Diagnostic { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + for Stack { + location, + description, + } in &self.stack + { + if !!location.is_empty() { + write!(f, "{location}: ")?; + } + writeln!(f, "{description}")?; + } + if self.category != Category::General && !self.location.is_empty() { + write!(f, "{}: ", self.location)?; + } + + write!(f, "{}: ", self.severity)?; + + match self.command_name { + Some(command_name) if self.category == Category::Syntax => { + write!(f, "{command_name}: ")? + } + _ => (), + } + + write!(f, "{}", self.text)?; + + if let Some(Range { + start: Point { + line: l0, + column: Some(c0), + }, + end: Point { + line: l1, + column: Some(c1), + }, + }) = self.location.span + { + let mut prev_line_number = None; + for (line_number, line) in &self.source { + if let Some(prev_line_number) = prev_line_number { + if *line_number != prev_line_number + 1 { + write!(f, "\n ... |")?; + } + } + prev_line_number = Some(line_number); + + write!(f, "\n{line_number:5} | {line}")?; + + if !self.location.omit_underlines { + let c0 = if *line_number == l0 { c0 } else { 1 }; + let c1 = if *line_number == l1 { + c1 + } else { + line.width() as i32 + }; + write!(f, "\n |")?; + for _ in 0..c0 { + f.write_str(" ")?; + } + if *line_number == l0 { + f.write_str("^")?; + for _ in c0..c1 { + f.write_str("~")?; + } + } else { + for _ in c0..=c1 { + f.write_str("~")?; + } + } + } + } + } + Ok(()) + } }
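A note on the new integer.rs helpers introduced above: parse_command_word() accepts a number token as a command-name word only when it is nonnegative and converts exactly, via ToInteger::to_exact_usize(). The sketch below is illustrative only and assumes the library crate is importable as `pspp` (module paths follow the files added in this commit); it shows the conversion behavior implied by the impl_from_float! range and truncation checks:

    #[cfg(test)]
    mod to_integer_sketch {
        use pspp::integer::ToInteger;

        #[test]
        fn exact_float_to_integer() {
            // Whole-valued floats within the target type's range convert exactly.
            assert_eq!(12.0_f64.to_exact_usize(), Some(12));
            // Fractional values are rejected rather than truncated.
            assert_eq!(12.5_f64.to_exact_usize(), None);
            // Values outside the target type's range are rejected, not wrapped.
            assert_eq!((-1.0_f64).to_exact_usize(), None);
            assert_eq!(300.0_f64.to_exact_u8(), None);
        }
    }

In parse_command_word(), only a nonnegative, exactly representable integer can extend the command name being assembled; any other numeric token ends the name and leaves matching to CommandMatcher.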