From fd0957e0c289f6809c32fc9ef3a5a1e82680dd7b Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 25 Jul 2024 08:31:02 -0700 Subject: [PATCH] macro implementation is feature-complete? --- rust/src/identifier.rs | 2 +- rust/src/macros.rs | 339 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 307 insertions(+), 34 deletions(-) diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs index 5bffe2b1c9..2d5c0317ec 100644 --- a/rust/src/identifier.rs +++ b/rust/src/identifier.rs @@ -153,7 +153,7 @@ impl Identifier { /// encoding used by the dictionary, not in UTF-8. pub const MAX_LEN: usize = 64; - pub fn new(s: &str) -> Result { + pub fn new(s: &str) -> Result { Self::from_encoding(s, UTF_8) } pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result { diff --git a/rust/src/macros.rs b/rust/src/macros.rs index 89397db238..9bfaf28d94 100644 --- a/rust/src/macros.rs +++ b/rust/src/macros.rs @@ -5,6 +5,8 @@ use std::{ cmp::Ordering, collections::{BTreeMap, HashMap, HashSet}, mem::take, + num::NonZeroUsize, + ops::RangeInclusive, }; use thiserror::Error as ThisError; use unicase::UniCase; @@ -132,6 +134,38 @@ pub enum MacroError { /// `!BREAK` outside `!DO`. #[error("`!BREAK` outside `!DO`.")] BreakOutsideDo, + + /// `,` or `)` expected in call to macro function. + #[error("`,` or `)` expected in call to macro function `{0}`.")] + ExpectingCommaOrRParen(Identifier), + + /// Macro function takes one argument. + #[error("Macro function `{name}` takes one argument (not {n_args}).")] + ExpectingOneArg { name: Identifier, n_args: usize }, + + /// Macro function takes two arguments. + #[error("Macro function `{name}` takes two arguments (not {n_args}).")] + ExpectingTwoArgs { name: Identifier, n_args: usize }, + + /// Macro function takes two or three arguments. 
+    #[error("Macro function `{name}` takes two or three arguments (not {n_args}).")]
+    ExpectingTwoOrThreeArgs { name: Identifier, n_args: usize },
+
+    /// Macro function needs at least one argument.
+    #[error("Macro function `{name}` needs at least one argument.")]
+    ExpectingOneOrMoreArgs { name: Identifier },
+
+    /// Argument to `!BLANKS` must be a non-negative integer.
+    #[error("Argument to `!BLANKS` must be non-negative integer (not `{0}`).")]
+    InvalidBlanks(String),
+
+    /// Second argument of `!SUBSTR` must be a positive integer.
+    #[error("Second argument of `!SUBSTR` must be positive integer (not `{0}`).")]
+    InvalidSubstr2(String),
+
+    /// Third argument of `!SUBSTR` must be a non-negative integer.
+    #[error("Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).")]
+    InvalidSubstr3(String),
 }
 
 /// A PSPP macro as defined with `!DEFINE`.
@@ -217,7 +251,7 @@ pub struct MacroToken {
     syntax: String,
 }
 
-fn tokenize_string(s: &str, mode: Mode, output: &mut Vec<MacroToken>, error: &impl Fn(MacroError)) {
+fn tokenize_string_into(s: &str, mode: Mode, error: &impl Fn(MacroError), output: &mut Vec<MacroToken>) {
     for (syntax, token) in StringSegmenter::new(s, mode, true) {
         match token {
             ScanToken::Token(token) => output.push(MacroToken {
@@ -229,19 +263,32 @@ fn tokenize_string(s: &str, mode: Mode, output: &mut Vec<MacroToken>, error: &im
     }
 }
 
-fn unquote_string(input: String, mode: Mode) -> String {
-    let mut scanner = StringScanner::new(&input, mode, true);
+fn tokenize_string(s: &str, mode: Mode, error: &impl Fn(MacroError)) -> Vec<MacroToken> {
+    let mut tokens = Vec::new();
+    tokenize_string_into(s, mode, error, &mut tokens);
+    tokens
+}
+
+fn try_unquote_string(input: &String, mode: Mode) -> Option<String> {
+    let mut scanner = StringScanner::new(input, mode, true);
     let Some(ScanToken::Token(Token::String(unquoted))) = scanner.next() else {
-        return input;
+        return None;
     };
-    let None = scanner.next() else { return input };
-    return unquoted;
+    let None = scanner.next()
else { return None }; + return Some(unquoted); +} + +fn unquote_string(input: String, mode: Mode) -> String { + try_unquote_string(&input, mode).unwrap_or(input) } #[derive(Clone)] struct MacroTokens<'a>(&'a [MacroToken]); impl<'a> MacroTokens<'a> { + fn is_empty(&self) -> bool { + self.0.is_empty() + } fn match_(&mut self, s: &str) -> bool { if let Some((first, rest)) = self.0.split_first() { if first.syntax.eq_ignore_ascii_case(s) { @@ -260,6 +307,9 @@ impl<'a> MacroTokens<'a> { } None } + fn macro_id(&self) -> Option<&Identifier> { + self.0.get(0).map(|mt| mt.token.macro_id()).flatten() + } fn take_macro_id(&mut self) -> Option<&Identifier> { let result = self.0.get(0).map(|mt| mt.token.macro_id()).flatten(); if result.is_some() { @@ -267,6 +317,15 @@ impl<'a> MacroTokens<'a> { } result } + fn take(&mut self) -> Option<&MacroToken> { + match self.0.split_first() { + Some((first, rest)) => { + self.0 = rest; + Some(first) + } + None => None, + } + } fn advance(&mut self) -> &MacroToken { let (first, rest) = self.0.split_first().unwrap(); self.0 = rest; @@ -845,7 +904,7 @@ impl<'a> Expander<'a> { (self.error)(MacroError::TooDeep { limit: MNEST }); output.extend(take(&mut input.0).iter().cloned()); } else { - while !input.0.is_empty() && !self.should_break() { + while !input.is_empty() && !self.should_break() { self.expand__(input, output); } } @@ -877,6 +936,214 @@ impl<'a> Expander<'a> { output.extend(arg.iter().cloned()); } } + fn parse_function_args( + &mut self, + function: &Identifier, + input: &mut MacroTokens, + ) -> Option> { + input.advance(); + input.advance(); + let mut args = Vec::new(); + if input.match_(")") { + return Some(args); + } + loop { + args.push(self.parse_function_arg(input)?); + match input.take() { + Some(MacroToken { + token: Token::Punct(Punct::Comma), + .. + }) => (), + Some(MacroToken { + token: Token::Punct(Punct::RParen), + .. 
+ }) => return Some(args), + _ => { + (self.error)(MacroError::ExpectingCommaOrRParen(function.clone())); + return None; + } + } + } + } + + fn expand_blanks(e: &mut Expander, args: Vec) -> Option { + let Ok(n) = args[0].trim().parse::() else { + (e.error)(MacroError::InvalidBlanks(args[0].clone())); + return None; + }; + Some(std::iter::repeat(' ').take(n).collect()) + } + + fn expand_concat(e: &mut Expander, args: Vec) -> Option { + Some( + args.into_iter() + .map(|arg| unquote_string(arg, e.mode)) + .collect(), + ) + } + + fn expand_eval(e: &mut Expander, args: Vec) -> Option { + let tokens = tokenize_string(&args[0], e.mode, e.error); + let mut stack = take(&mut e.stack); + stack.push(Frame { + name: Some(Identifier::new("!EVAL").unwrap()), + location: None, + }); + let mut break_ = false; + let mut subexpander = Expander { + break_: Some(&mut break_), + stack, + vars: e.vars, + ..*e + }; + let mut output = Vec::new(); + subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output); + subexpander.stack.pop(); + e.stack = subexpander.stack; + let mut output_string = String::new(); + macro_tokens_to_syntax(&mut output, &mut output_string); + Some(output_string) + } + + fn expand_head(e: &mut Expander, mut args: Vec) -> Option { + let arg = unquote_string(args.remove(0), e.mode); + let mut output = tokenize_string(&arg, e.mode, e.error); + if output.is_empty() { + Some(String::new()) + } else { + Some(output.swap_remove(0).syntax) + } + } + + fn expand_index(_e: &mut Expander, args: Vec) -> Option { + let haystack = &args[0]; + let needle = &args[1]; + let position = haystack.find(needle); + Some(format!( + "{}", + position.map_or(0, |position| &haystack[0..position].chars().count() + 1) + )) + } + + fn expand_length(_e: &mut Expander, args: Vec) -> Option { + Some(format!("{}", args[0].chars().count())) + } + + fn expand_quote(e: &mut Expander, mut args: Vec) -> Option { + let arg = args.remove(0); + if try_unquote_string(&arg, e.mode).is_some() { + 
Some(arg)
+        } else {
+            let mut output = String::with_capacity(arg.len() + 2);
+            output.push('\'');
+            for c in arg.chars() {
+                if c == '\'' { // Double each embedded `'` so the result is a valid quoted string.
+                    output.push('\'');
+                }
+                output.push(c);
+            }
+            output.push('\'');
+            Some(output)
+        }
+    }
+
+    fn expand_substr(e: &mut Expander, args: Vec<String>) -> Option<String> {
+        let Ok(start) = args[1].trim().parse::<NonZeroUsize>() else {
+            (e.error)(MacroError::InvalidSubstr2(args[1].clone())); // Report the offending 2nd argument.
+            return None;
+        };
+        let start = start.get();
+        // The third argument is optional; without it, take the rest of the string.
+        let Ok(count) = args.get(2).map_or(Ok(usize::MAX), |arg| arg.trim().parse::<usize>()) else {
+            (e.error)(MacroError::InvalidSubstr3(args[2].clone())); // Only reachable when `args[2]` exists.
+            return None;
+        };
+        Some(args[0].chars().skip(start - 1).take(count).collect())
+    }
+
+    fn expand_tail(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
+        let arg = unquote_string(args.remove(0), e.mode);
+        let mut output = tokenize_string(&arg, e.mode, e.error);
+        // `!TAIL` is everything after the first token, not just the last token.
+        let mut rest = output.split_off(output.len().min(1));
+        let mut tail = String::new();
+        macro_tokens_to_syntax(&mut rest, &mut tail);
+        Some(tail)
+    }
+
+    fn expand_unquote(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
+        Some(unquote_string(args.remove(0), e.mode))
+    }
+
+    fn expand_upcase(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
+        Some(unquote_string(args.remove(0), e.mode).to_uppercase())
+    }
+
+    fn expand_macro_function(&mut self, orig_input: &mut MacroTokens) -> Option<String> {
+        let mut input = orig_input.clone();
+        let name = input.macro_id()?;
+        if name == "!NULL" {
+            return Some(String::new());
+        }
+        if input.0.len() < 2 || !matches!(input.0[1].token, Token::Punct(Punct::LParen)) {
+            return None;
+        }
+
+        struct MacroFunction {
+            name: Identifier,
+            args: RangeInclusive<usize>,
+            parser: fn(&mut Expander, Vec<String>) -> Option<String>,
+        }
+        impl MacroFunction {
+            fn new(
+                name: &str,
+                args: RangeInclusive<usize>,
+                parser: fn(&mut Expander, Vec<String>) -> Option<String>,
+            ) -> Self {
+                Self {
+                    name: Identifier::new(name).unwrap(),
+                    args,
+                    parser,
+                }
+            }
+        }
+        lazy_static!
{ + static ref MACRO_FUNCTIONS: [MacroFunction; 11] = [ + MacroFunction::new("!BLANKS", 1..=1, Expander::expand_blanks), + MacroFunction::new("!CONCAT", 1..=usize::MAX, Expander::expand_concat), + MacroFunction::new("!HEAD", 1..=1, Expander::expand_head), + MacroFunction::new("!INDEX", 2..=2, Expander::expand_index), + MacroFunction::new("!LENGTH", 1..=1, Expander::expand_length), + MacroFunction::new("!QUOTE", 1..=1, Expander::expand_quote), + MacroFunction::new("!SUBSTR", 2..=3, Expander::expand_substr), + MacroFunction::new("!TAIL", 1..=1, Expander::expand_tail), + MacroFunction::new("!UNQUOTE", 1..=1, Expander::expand_unquote), + MacroFunction::new("!UPCASE", 1..=1, Expander::expand_upcase), + MacroFunction::new("!EVAL", 1..=1, Expander::expand_eval), + ]; + } + + let function = MACRO_FUNCTIONS.iter().find(|mf| &mf.name == name)?; + + let args = self.parse_function_args(&function.name, &mut input)?; + + let n_args = args.len(); + if !function.args.contains(&n_args) { + let name = function.name.clone(); + let error = match &function.args { + x if x == &(1..=1) => MacroError::ExpectingOneArg { name, n_args }, + x if x == &(2..=2) => MacroError::ExpectingTwoArgs { name, n_args }, + x if x == &(2..=3) => MacroError::ExpectingTwoOrThreeArgs { name, n_args }, + x if x == &(1..=usize::MAX) => MacroError::ExpectingOneOrMoreArgs { name }, + _ => unreachable!(), + }; + (self.error)(error); + return None; + } + + *orig_input = input; + (function.parser)(self, args) + } /// Parses one function argument from `input`. 
Each argument to a macro /// function is one of: @@ -910,7 +1177,9 @@ impl<'a> Expander<'a> { return Some(value.clone()); } - todo!() // expand macro function + if let Some(output) = self.expand_macro_function(input) { + return Some(output); + } } Token::Punct(Punct::BangAsterisk) => { let mut arg = String::new(); @@ -1000,8 +1269,7 @@ impl<'a> Expander<'a> { fn evaluate_number(&mut self, input: &mut MacroTokens) -> Option { let s = self.evaluate_expression(input)?; - let mut tokens = Vec::new(); - tokenize_string(&s, self.mode, &mut tokens, self.error); + let tokens = tokenize_string(&s, self.mode, self.error); let ( Some(MacroToken { token: Token::Number(number), @@ -1022,7 +1290,7 @@ impl<'a> Expander<'a> { ) -> Option<(MacroTokens<'b>, IfEndClause)> { let input_copy = input.clone(); let mut nesting = 0; - while !input.0.is_empty() { + while !input.is_empty() { if input.match_("!IF") { nesting += 1; } else if input.match_("!IFEND") { @@ -1143,7 +1411,7 @@ impl<'a> Expander<'a> { fn find_doend<'b>(&mut self, input: &mut MacroTokens<'b>) -> Option> { let input_copy = input.clone(); let mut nesting = 0; - while !input.0.is_empty() { + while !input.is_empty() { if input.match_("!DO") { nesting += 1; } else if input.match_("!DOEND") { @@ -1161,7 +1429,7 @@ impl<'a> Expander<'a> { return None; } - fn expand_do(&mut self, orig_input: &mut MacroTokens) -> bool { + fn expand_do(&mut self, orig_input: &mut MacroTokens, output: &mut Vec) -> bool { let mut input = orig_input.clone(); if !input.match_("!DO") { return false; @@ -1171,25 +1439,11 @@ impl<'a> Expander<'a> { return false; }; - let mut stack = take(&mut self.stack); - stack.push(Frame { - name: Some(Identifier::new("!DO").unwrap()), - location: None, - }); - let mut break_ = false; - let mut subexpander = Expander { - break_: Some(&mut break_), - stack, - vars: self.vars, - ..*self - }; - let (items, miterate_error) = if input.match_("!IN") { let Some(list) = self.evaluate_expression(&mut input) else { return 
false; }; - let mut items = Vec::new(); - tokenize_string(list.as_str(), self.mode, &mut items, &self.error); + let items = tokenize_string(list.as_str(), self.mode, &self.error); ( DoInput::from_list(items), MacroError::MiterateList(MITERATE), @@ -1230,8 +1484,21 @@ impl<'a> Expander<'a> { return false; }; + let mut stack = take(&mut self.stack); + stack.push(Frame { + name: Some(Identifier::new("!DO").unwrap()), + location: None, + }); + let mut break_ = false; + let mut subexpander = Expander { + break_: Some(&mut break_), + stack, + vars: self.vars, + ..*self + }; + for (i, item) in items.enumerate() { - if break_ { + if subexpander.should_break() { break; } if i >= MITERATE { @@ -1244,6 +1511,7 @@ impl<'a> Expander<'a> { } else { vars.insert(var_name.clone(), item); } + subexpander.expand(&mut body.clone(), output); } *orig_input = input; true @@ -1266,10 +1534,11 @@ impl<'a> Expander<'a> { stack, ..*self }; - subexpander.expand(input, output); + let mut body = MacroTokens(call.0.macro_.body.as_slice()); + subexpander.expand(&mut body, output); self.stack = subexpander.stack; self.stack.pop(); - input.0 = &[]; + input.0 = &input.0[call.len()..]; return; } } @@ -1305,7 +1574,7 @@ impl<'a> Expander<'a> { // Variables set by `!DO` or `!LET`. if let Some(value) = self.vars.borrow().get(id) { - tokenize_string(value.as_str(), self.mode, output, &self.error); + tokenize_string_into(value.as_str(), self.mode, &self.error, output); input.advance(); return; } @@ -1317,7 +1586,7 @@ impl<'a> Expander<'a> { if self.expand_let(input) { return; } - if self.expand_do(input) { + if self.expand_do(input, output) { return; } @@ -1390,6 +1659,10 @@ impl<'a> Call<'a> { let mut body = MacroTokens(&self.0.macro_.body); me.expand(&mut body, output); } + + pub fn len(&self) -> usize { + self.0.n_tokens + } } const MNEST: usize = 50; -- 2.30.2