From b5876c0136985a0e0c62b0f4977e36cc4743be8e Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 8 Oct 2024 08:17:36 -0700 Subject: [PATCH] work on parser --- rust/pspp-derive/src/lib.rs | 69 ++++++---- rust/pspp/src/command.rs | 203 ++++++++++++++-------------- rust/pspp/src/lex/lexer.rs | 260 ++++++++++++------------------------ 3 files changed, 229 insertions(+), 303 deletions(-) diff --git a/rust/pspp-derive/src/lib.rs b/rust/pspp-derive/src/lib.rs index da1a879a14..d5b344de8b 100644 --- a/rust/pspp-derive/src/lib.rs +++ b/rust/pspp-derive/src/lib.rs @@ -1,6 +1,6 @@ use proc_macro::TokenStream; use proc_macro2::{Literal, TokenStream as TokenStream2}; -use quote::{quote, ToTokens}; +use quote::{format_ident, quote, ToTokens}; use syn::{spanned::Spanned, Attribute, DataEnum, DataStruct, DeriveInput, Error, Fields, Token}; #[proc_macro_derive(FromTokens, attributes(pspp))] @@ -37,28 +37,30 @@ fn derive_enum(ast: &DeriveInput, e: &DataEnum) -> Result { let ident = &variant.ident; let ident_string = ident.to_string(); let match_expr = if let Some(syntax) = field_attrs.syntax { - quote! { cursor.match_syntax(#syntax) } + quote! { input.skip_syntax(#syntax) } } else if ident_string.eq_ignore_ascii_case("all") { - quote! { cursor.match_(&Token::Punct(Punct::All))} + quote! { input.skip(&Token::Punct(Punct::All))} } else { - quote! { cursor.match_keyword(#ident_string)} + quote! { input.skip_keyword(#ident_string)} }; - let construction = construct_fields(&variant.fields, true); + let construction = construct_fields(&variant.fields, quote! { Self::#ident}, true); let check_equals = if struct_attrs.required_equals && !variant.fields.is_empty() { - quote! { cursor.force(&Token::Punct(Punct::Equals)).map_err(ParseError::Error)?; } + quote! { let ((), input) = parse_token(input, &Token::Punct(Punct::Equals)).mismatch_to_error()?; } } else { - quote!{} + quote! {} }; - body.extend(quote! { if #match_expr { #check_equals Self::#ident #construction } }); + body.extend( + quote! { if let Some(input) = #match_expr { #check_equals #construction } }, + ); } - body.extend(quote! { else { return Err(ParseError::Mismatch(cursor.error("Syntax error."))); } }); + body.extend(quote! { else { Err(ParseError::Mismatch(input.error("Syntax error."))) } }); let name = &ast.ident; let lifetime = struct_attrs.lifetime(); let output = quote! { impl<'a> FromTokens<'a> for #name #lifetime { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult { - Ok(#body) + fn from_tokens(input: TokenSlice<'a>) -> ParseResult<'a, Self> { + #body } } }; @@ -66,38 +68,53 @@ fn derive_enum(ast: &DeriveInput, e: &DataEnum) -> Result { Ok(output) } -fn construct_fields(fields: &Fields, mismatch_to_error: bool) -> impl ToTokens { +fn construct_fields( + fields: &Fields, + name: impl ToTokens, + mismatch_to_error: bool, +) -> impl ToTokens { let mut construction = TokenStream2::new(); let convert = if mismatch_to_error { quote! { .mismatch_to_error() } } else { quote! {} }; - for field in fields { - let value = quote! { FromTokens::from_tokens(cursor) #convert ? }; - if let Some(name) = field.ident.as_ref() { - construction.extend(quote! { #name: #value, }); - } else { - construction.extend(quote! { #value, }); - } + for (index, _field) in fields.iter().enumerate() { + let varname = format_ident!("field{index}"); + construction + .extend(quote! { let (#varname, input) = FromTokens::from_tokens(input) #convert ?; }); } - match fields { - Fields::Named(_) => quote! { { #construction } }, - Fields::Unnamed(_) => quote! { ( #construction ) }, - Fields::Unit => quote! {}, + Fields::Named(named) => { + let mut body = TokenStream2::new(); + for (index, field) in named.named.iter().enumerate() { + let varname = format_ident!("field{index}"); + let field_name = &field.ident; + body.extend(quote! { #field_name: #varname, }); + } + quote! { #construction Ok((#name { #body }, input)) } + } + Fields::Unnamed(unnamed) => { + let mut body = TokenStream2::new(); + for (index, _field) in unnamed.unnamed.iter().enumerate() { + let varname = format_ident!("field{index}"); + body.extend(quote! { #varname, }); + } + quote! { #construction Ok((#name ( #body ), input)) } + } + Fields::Unit => quote! { Ok((#name, input)) }, } } fn derive_struct(ast: &DeriveInput, s: &DataStruct) -> Result { let struct_attrs = StructAttrs::parse(&ast.attrs)?; let name = &ast.ident; - let construction = construct_fields(&s.fields, false); + let construction = construct_fields(&s.fields, quote! {#name}, false); let lifetime = struct_attrs.lifetime(); let output = quote! { impl<'a> FromTokens<'a> for #name #lifetime { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult { - Ok(#name #construction) + fn from_tokens(input: TokenSlice<'a>) -> ParseResult<'a, Self> { + #construction } } }; diff --git a/rust/pspp/src/command.rs b/rust/pspp/src/command.rs index fd79e62b0e..802641d853 100644 --- a/rust/pspp/src/command.rs +++ b/rust/pspp/src/command.rs @@ -9,7 +9,7 @@ use crate::{ integer::ToInteger, lex::{ command_name::CommandMatcher, - lexer::{Cursor, TokenSlice}, + lexer::{LexToken, TokenSlice}, token::{Punct, Token}, }, message::Diagnostic, @@ -112,23 +112,23 @@ enum ParseError { Mismatch(Diagnostic), } -type ParseResult = Result; +type ParseResult<'a, T> = Result<(T, TokenSlice<'a>), ParseError>; trait MismatchToError { fn mismatch_to_error(self) -> Self; } -impl MismatchToError for ParseResult { +impl<'a, T> MismatchToError for ParseResult<'a, T> { fn mismatch_to_error(self) -> Self { match self { Err(ParseError::Mismatch(diagnostic)) => Err(ParseError::Error(diagnostic)), - rest => rest + rest => rest, } } } trait FromTokens<'a> { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult + fn from_tokens(input: TokenSlice<'a>) -> ParseResult<'a, Self> where Self: Sized; } @@ -137,17 +137,14 @@ impl<'a, T> FromTokens<'a> for Option where T: FromTokens<'a>, { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult + fn from_tokens(input: TokenSlice<'a>) -> ParseResult<'a, Self> where Self: Sized, { - let saved_position = cursor.get_pos(); - match T::from_tokens(cursor) { - Ok(result) => Ok(Some(result)), - Err(_error) => { - cursor.set_pos(saved_position); - Ok(None) - } + match T::from_tokens(input) { + Ok((value, rest)) => Ok((Some(value), rest)), + Err(ParseError::Mismatch(_)) => Ok((None, input)), + Err(ParseError::Error(error)) => Err(ParseError::Error(error)), } } } @@ -156,24 +153,31 @@ impl<'a, T> FromTokens<'a> for Vec where T: FromTokens<'a>, { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult + fn from_tokens(mut input: TokenSlice<'a>) -> ParseResult<'a, Self> where Self: Sized, { let mut vector = Vec::new(); - while let Ok(result) = cursor.with_pos(|| T::from_tokens(cursor)) { - vector.push(result); + loop { + match T::from_tokens(input) { + Ok((value, rest)) => { + vector.push(value); + input = rest; + } + Err(ParseError::Mismatch(_)) => break, + Err(ParseError::Error(e)) => return Err(ParseError::Error(e)), + } } - Ok(vector) + Ok((vector, input)) } } impl<'a> FromTokens<'a> for TokenSlice<'a> { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult + fn from_tokens(input: TokenSlice<'a>) -> ParseResult<'a, Self> where Self: Sized, { - Ok(cursor.take_remainder()) + Ok((input, input.end())) } } @@ -191,7 +195,7 @@ impl < 'a > FromTokens < 'a > for DescriptivesSubcommand<'a> } }*/ -#[derive(FromTokens, Debug)] +#[derive(Debug, FromTokens)] #[pspp(add_lifetime)] struct Descriptives<'a> { subcommands: Vec>>, @@ -204,30 +208,24 @@ impl<'a, T> FromTokens<'a> for Subcommand where T: FromTokens<'a>, { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult + fn from_tokens(input: TokenSlice<'a>) -> ParseResult<'a, Self> where Self: Sized, { - cursor.advance_until(|token| token != &Token::Punct(Punct::Slash)); - if cursor.at_end() { + let start = input.skip_until(|token| token != &Token::Punct(Punct::Slash)); + if start.is_empty() { return Err(ParseError::Error( - cursor.error("Syntax error at end of input."), + input.error("Syntax error at end of input."), )); } - let start = cursor.get_pos(); - cursor.advance_until(|token| token == &Token::Punct(Punct::Slash)); - let subcommand = cursor.subcursor(start..cursor.get_pos()); - match T::from_tokens(&subcommand) { - Ok(result) => Ok(Self(result)), - Err(error) => { - cursor.set_pos(start); - Err(error) - } - } + let end = start.skip_to(&Token::Punct(Punct::Slash)); + let subcommand = start.subslice(0..start.len() - end.len()); + let (value, rest) = T::from_tokens(subcommand)?; + Ok((Self(value), end)) } } -#[derive(FromTokens, Debug)] +#[derive(Debug, FromTokens)] #[pspp(add_lifetime, required_equals)] enum DescriptivesSubcommand<'a> { Variables(Vec>), @@ -237,40 +235,14 @@ enum DescriptivesSubcommand<'a> { Sort(Sort), } -/* -impl<'a> FromTokens<'a> for DescriptivesSubcommand<'a> { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult { - println!("{}:{}", file!(), line!()); - Ok(if cursor.match_keyword("Variables") { - println!("{}:{}", file!(), line!()); - cursor.force(&Token::Punct(Punct::Equals))?; - println!("{}:{}", file!(), line!()); - Self::Variables(FromTokens::from_tokens(cursor)?) - } else if cursor.match_keyword("Missing") { - cursor.force(&Token::Punct(Punct::Equals))?; - Self::Missing(FromTokens::from_tokens(cursor)?) - } else if cursor.match_keyword("Save") { - Self::Save - } else if cursor.match_keyword("Statistics") { - cursor.force(&Token::Punct(Punct::Equals))?; - Self::Statistics(FromTokens::from_tokens(cursor)?) - } else if cursor.match_keyword("Sort") { - cursor.force(&Token::Punct(Punct::Equals))?; - Self::Sort(FromTokens::from_tokens(cursor)?) - } else { - return Err(cursor.error("Syntax error.")); - }) - } -}*/ - -#[derive(FromTokens, Debug)] +#[derive(Debug, FromTokens)] enum Missing { Variable, Listwise, Include, } -#[derive(FromTokens, Debug)] +#[derive(Debug, FromTokens)] #[pspp(add_lifetime)] struct DescriptivesVarRange<'a> { vars: VarRange<'a>, @@ -284,18 +256,34 @@ impl<'a, T> FromTokens<'a> for InParens where T: FromTokens<'a>, { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult + fn from_tokens(input: TokenSlice<'a>) -> ParseResult<'a, Self> where Self: Sized, { - cursor - .force(&Token::Punct(Punct::LParen)) - .map_err(ParseError::Mismatch)?; - let inner = T::from_tokens(cursor)?; - cursor - .force(&Token::Punct(Punct::RParen)) - .map_err(ParseError::Mismatch)?; - Ok(Self(inner)) + let ((), input) = parse_token(input, &Token::Punct(Punct::LParen))?; + let (inner, input) = T::from_tokens(input)?; + let ((), input) = parse_token(input, &Token::Punct(Punct::RParen))?; + Ok((Self(inner), input)) + } +} + +fn parse_token<'a>(input: TokenSlice<'a>, token: &Token) -> ParseResult<'a, ()> { + if let Some(rest) = input.skip(token) { + Ok(((), rest)) + } else { + Err(ParseError::Mismatch( + input.error(format!("expecting {token}")), + )) + } +} + +fn parse_keyword<'a>(input: TokenSlice<'a>, keyword: &str) -> ParseResult<'a, ()> { + if let Some(rest) = input.skip_if(|token| token.matches_keyword(keyword)) { + Ok(((), rest)) + } else { + Err(ParseError::Mismatch( + input.error(format!("expecting {keyword}")), + )) } } @@ -306,36 +294,51 @@ struct VarRange<'a> { } impl<'a> FromTokens<'a> for VarRange<'a> { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult + fn from_tokens(input: TokenSlice<'a>) -> ParseResult<'a, Self> where Self: Sized, { - Ok(Self { - from: cursor.force_id().map_err(ParseError::Mismatch)?, - to: cursor - .match_(&Token::Punct(Punct::To)) - .then(|| cursor.force_id().map_err(ParseError::Mismatch)) - .transpose()?, - }) + let (from, input) = parse_id(input)?; + if let Ok(((), input)) = parse_token(input, &Token::Punct(Punct::To)) { + if let Ok((to, input)) = parse_id(input) { + return Ok((Self { from, to: Some(to) }, input)); + } + } + Ok((Self { from, to: None }, input)) + } +} + +fn parse_id<'a>(input: TokenSlice<'a>) -> ParseResult<'a, &'a Identifier> { + let mut iter = input.iter(); + if let Some(LexToken { + token: Token::Id(id), + .. + }) = iter.next() + { + Ok((id, iter.remainder())) + } else { + Err(ParseError::Mismatch( + input.error("Syntax error expecting identifier."), + )) } } impl<'a> FromTokens<'a> for &'a Identifier { - fn from_tokens(cursor: &Cursor<'a>) -> ParseResult + fn from_tokens(input: TokenSlice<'a>) -> ParseResult<'a, Self> where Self: Sized, { - cursor.force_id().map_err(ParseError::Mismatch) + parse_id(input) } } -#[derive(FromTokens, Debug)] +#[derive(Debug, FromTokens)] struct Sort { key: SortKey, direction: Option, } -#[derive(FromTokens, Debug)] +#[derive(Debug, FromTokens)] enum SortKey { Mean, SMean, @@ -350,7 +353,7 @@ enum SortKey { Name, } -#[derive(FromTokens, Debug)] +#[derive(Debug, FromTokens)] enum Direction { #[pspp(syntax = "(A)")] Ascending, @@ -358,7 +361,7 @@ enum Direction { Descending, } -#[derive(FromTokens, Debug)] +#[derive(Debug, FromTokens)] enum Statistic { Default, Mean, @@ -390,14 +393,20 @@ fn commands() -> &'static [Command] { no_abbrev: false, name: "DESCRIPTIVES", run: Box::new(|context| { - let cursor = context.lexer.cursor(); println!("{}:{}", file!(), line!()); - while let Ok(subcommand) = >::from_tokens(&cursor) { - println!("{subcommand:?}"); - println!( - "{:?}", - DescriptivesSubcommand::from_tokens(&subcommand.0.cursor()) - ); + let mut input = context.lexer; + loop { + match >::from_tokens(input) { + Ok((subcommand, rest)) => { + println!("{subcommand:?}"); + println!("{:?}", DescriptivesSubcommand::from_tokens(subcommand.0)); + input = rest; + } + Err(error) => { + println!("{error:?}"); + break; + } + } } println!("{}:{}", file!(), line!()); }), @@ -408,13 +417,7 @@ fn commands() -> &'static [Command] { testing_only: false, no_abbrev: false, name: "ECHO", - run: Box::new(|context| { - let cursor = context.lexer.cursor(); - match cursor.force_string() { - Ok(s) => println!("\"{s}\""), - Err(e) => println!("{e}"), - } - }), + run: Box::new(|_context| todo!()), }, /* Command { diff --git a/rust/pspp/src/lex/lexer.rs b/rust/pspp/src/lex/lexer.rs index dd514acc05..f9c1ced0bc 100644 --- a/rust/pspp/src/lex/lexer.rs +++ b/rust/pspp/src/lex/lexer.rs @@ -1,6 +1,5 @@ use std::{ borrow::{Borrow, Cow}, - cell::Cell, collections::VecDeque, fmt::{Debug, Formatter, Result as FmtResult, Write}, fs, @@ -18,15 +17,13 @@ use thiserror::Error as ThisError; use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; use crate::{ - identifier::Identifier, - lex::scan::StringScanner, macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser}, message::{Category, Diagnostic, Location, Point, Severity}, settings::Settings, }; use super::{ - scan::{MergeResult, ScanError, ScanToken}, + scan::{MergeResult, ScanError, ScanToken, StringScanner}, segment::{Segmenter, Syntax}, token::Token, }; @@ -361,7 +358,33 @@ impl Debug for Tokens { } } -#[derive(Clone)] +pub struct TokenSliceIter<'a> { + tokens: &'a [LexToken], +} + +impl<'a> TokenSliceIter<'a> { + pub fn remainder(&self) -> TokenSlice<'a> { + TokenSlice { + tokens: self.tokens, + } + } +} + +impl<'a> Iterator for TokenSliceIter<'a> { + type Item = &'a LexToken; + + fn next(&mut self) -> Option { + let (first, rest) = self.tokens.split_first().unwrap(); + if !rest.is_empty() { + self.tokens = rest; + Some(first) + } else { + None + } + } +} + +#[derive(Copy, Clone)] pub struct TokenSlice<'a> { tokens: &'a [LexToken], } @@ -386,10 +409,6 @@ impl<'a> TokenSlice<'a> { } } - pub fn cursor(&'a self) -> Cursor<'a> { - Cursor::new(self) - } - pub fn get_token(&self, index: usize) -> Option<&'a Token> { //self.get(index).map(|token| &token.token) if index < self.len() { @@ -428,6 +447,9 @@ impl<'a> TokenSlice<'a> { fn last(&self) -> &LexToken { self.tokens.last().unwrap() } + pub fn end(&self) -> Self { + self.subslice(self.len()..self.len()) + } fn file(&self) -> Option<&Arc> { let first = self.first(); @@ -447,8 +469,10 @@ impl<'a> TokenSlice<'a> { self.len() == 0 } - pub fn iter(&self) -> std::slice::Iter { - (&self.tokens[..self.len()]).iter() + pub fn iter(&self) -> TokenSliceIter<'a> { + TokenSliceIter { + tokens: self.tokens, + } } /// If the tokens contains a macro call, this returns the raw @@ -483,6 +507,54 @@ impl<'a> TokenSlice<'a> { } } + pub fn skip_to(&self, token: &Token) -> Self { + self.skip_until(|t| t == token) + } + + pub fn skip_until(&self, f: F) -> Self + where + F: Fn(&Token) -> bool, + { + for (index, token) in self.iter().enumerate() { + if f(&token.token) { + return self.subslice(index..self.len()); + } + } + self.end() + } + + pub fn skip(&self, token: &Token) -> Option { + self.skip_if(|t| t == token) + } + + pub fn skip_if(&self, f: F) -> Option + where + F: Fn(&Token) -> bool, + { + let mut iter = self.iter(); + if iter.next().map_or(false, |token| f(&token.token)) { + Some(iter.remainder()) + } else { + None + } + } + + pub fn skip_keyword(&self, keyword: &str) -> Option { + self.skip_if(|token| token.matches_keyword(keyword)) + } + + pub fn skip_syntax(&self, syntax: &str) -> Option { + let syntax_scanner = StringScanner::new(syntax, Syntax::Interactive, true); + let mut input = *self; + for scan_token in syntax_scanner { + let ScanToken::Token(token) = scan_token else { + unreachable!() + }; + input = self.skip(&token)?; + } + Some(input) + } + pub fn diagnostic(&self, severity: Severity, text: String) -> Diagnostic { let mut s = String::new(); if let Some(call) = self.get_macro_call() { @@ -544,172 +616,6 @@ impl<'a> TokenSlice<'a> { } } -#[derive(Clone)] -pub struct Cursor<'a> { - slice: TokenSlice<'a>, - - /// This allows [Self::force_string] etc. to advance while returning the - /// token without cloning it. - pos: Cell, -} - -impl<'a> Cursor<'a> { - pub fn new(slice: &TokenSlice<'a>) -> Self { - Self { - slice: slice.clone(), - pos: Cell::new(0), - } - } - - pub fn get_pos(&self) -> usize { - self.pos.get() - } - - pub fn set_pos(&self, position: usize) { - self.pos.set(position); - } - - pub fn with_pos(&self, f: F) -> Result - where - F: FnOnce() -> Result, - { - let position = self.get_pos(); - let retval = f(); - if retval.is_err() { - self.set_pos(position); - } - retval - } - - pub fn subcursor(&self, range: Range) -> Cursor<'a> { - Self::new(&self.slice.subslice(range)) - } - - pub fn remainder(&self) -> TokenSlice<'a> { - self.slice.subslice(self.pos.get()..self.slice.len()) - } - - pub fn take_remainder(&self) -> TokenSlice<'a> { - let remainder = self.remainder(); - self.pos.set(self.slice.len()); - remainder - } - - pub fn force_string(&self) -> Result<&str, Diagnostic> { - if let Some(Token::String(s)) = self.token() { - self.next(); - Ok(s.as_str()) - } else { - Err(self.error("Syntax error expecting string.")) - } - } - - pub fn force_id(&self) -> Result<&'a Identifier, Diagnostic> { - if let Some(Token::Id(id)) = self.token() { - self.next(); - Ok(id) - } else { - Err(self.error("Syntax error expecting identifier.")) - } - } - - pub fn force(&self, token: &Token) -> Result<(), Diagnostic> { - match self.token() { - Some(t) if t == token => { - self.next(); - Ok(()) - } - _ => Err(self.error(format!("Syntax error expecting {token}."))), - } - } - - pub fn error(&self, text: S) -> Diagnostic - where - S: ToString, - { - self.remainder().error(text) - } - - pub fn advance_to(&self, token: &Token) -> bool { - self.advance_until(|t| t == token) - } - - pub fn advance_until(&self, f: F) -> bool - where - F: Fn(&Token) -> bool, - { - while let Some(token) = self.token() { - if f(token) { - return true; - } - self.next(); - } - false - } - - pub fn at(&self, token: &Token) -> bool { - if let Some(token2) = self.token() { - token == token2 - } else { - false - } - } - - pub fn match_(&self, token: &Token) -> bool { - let at = self.at(token); - if at { - self.next(); - } - at - } - - pub fn match_keyword(&self, keyword: &str) -> bool { - if let Some(token) = self.token() { - if token.matches_keyword(keyword) { - self.next(); - return true; - } - } - false - } - - pub fn at_end(&self) -> bool { - self.pos.get() >= self.slice.len() - } - - pub fn token(&self) -> Option<&'a Token> { - self.slice.get_token(self.pos.get()) - } - - pub fn next(&self) { - if self.pos.get() < self.slice.len() { - self.pos.set(self.pos.get() + 1) - } - } - - pub fn prev(&self) { - if self.pos.get() > 0 { - self.pos.set(self.pos.get() - 1) - } - } - - pub fn match_syntax(&self, syntax: &str) -> bool { - self.with_pos(|| { - let syntax_scanner = StringScanner::new(syntax, Syntax::Interactive, true); - for scan_token in syntax_scanner { - let ScanToken::Token(token) = scan_token else { - unreachable!() - }; - if !self.match_(&token) { - return Err(()); - }; - } - Ok(()) - }) - .is_ok() - } -} - pub struct Source { file: Arc, segmenter: Segmenter, -- 2.30.2