1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2021 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/macro.h"
25 #include "data/settings.h"
26 #include "language/lexer/segment.h"
27 #include "language/lexer/scan.h"
28 #include "libpspp/assertion.h"
29 #include "libpspp/i18n.h"
30 #include "libpspp/message.h"
31 #include "libpspp/str.h"
32 #include "libpspp/string-array.h"
35 #define _(msgid) gettext (msgid)
38 macro_token_copy (struct macro_token *dst, const struct macro_token *src)
40 token_copy (&dst->token, &src->token);
41 ss_alloc_substring (&dst->representation, src->representation);
45 macro_token_uninit (struct macro_token *mt)
47 token_uninit (&mt->token);
48 ss_dealloc (&mt->representation);
52 macro_token_to_representation (struct macro_token *mt, struct string *s)
54 ds_put_substring (s, mt->representation);
58 macro_tokens_copy (struct macro_tokens *dst, const struct macro_tokens *src)
60 *dst = (struct macro_tokens) {
61 .mts = xmalloc (src->n * sizeof *dst->mts),
65 for (size_t i = 0; i < src->n; i++)
66 macro_token_copy (&dst->mts[i], &src->mts[i]);
70 macro_tokens_uninit (struct macro_tokens *mts)
72 for (size_t i = 0; i < mts->n; i++)
73 macro_token_uninit (&mts->mts[i]);
78 macro_tokens_add_uninit (struct macro_tokens *mts)
80 if (mts->n >= mts->allocated)
81 mts->mts = x2nrealloc (mts->mts, &mts->allocated, sizeof *mts->mts);
82 return &mts->mts[mts->n++];
/* Appends a copy of MT to MTS.  MT itself remains owned by the caller. */
void
macro_tokens_add (struct macro_tokens *mts, const struct macro_token *mt)
{
  struct macro_token *slot = macro_tokens_add_uninit (mts);
  macro_token_copy (slot, mt);
}
92 macro_tokens_from_string (struct macro_tokens *mts, const struct substring src,
93 enum segmenter_mode mode)
97 struct segmenter segmenter;
98 struct substring body;
101 struct state state = {
102 .segmenter = SEGMENTER_INIT (mode),
105 struct state saved = state;
107 while (state.body.length > 0)
109 struct macro_token mt = {
110 .token = { .type = T_STOP },
111 .representation = { .string = state.body.string },
113 struct token *token = &mt.token;
115 struct scanner scanner;
116 scanner_init (&scanner, token);
120 enum segment_type type;
121 int seg_len = segmenter_push (&state.segmenter, state.body.string,
122 state.body.length, true, &type);
123 assert (seg_len >= 0);
125 struct substring segment = ss_head (state.body, seg_len);
126 ss_advance (&state.body, seg_len);
128 enum scan_result result = scanner_push (&scanner, type, segment, token);
129 if (result == SCAN_SAVE)
131 else if (result == SCAN_BACK)
136 else if (result == SCAN_DONE)
140 /* We have a token in 'token'. */
141 if (is_scan_type (token->type))
143 if (token->type != SCAN_SKIP)
145 /* XXX report error */
150 mt.representation.length = state.body.string - mt.representation.string;
151 macro_tokens_add (mts, &mt);
153 token_uninit (token);
158 macro_tokens_print (const struct macro_tokens *mts, FILE *stream)
160 for (size_t i = 0; i < mts->n; i++)
161 token_print (&mts->mts[i].token, stream);
/* Spacing classes for tokens, used to decide whether white space has to be
   emitted between two adjacent tokens when token sequences are turned back
   into text. */
enum token_class
  {
    /* No space before or after (new-line after). */
    TC_ENDCMD,

    /* Space on both sides. */
    TC_BINOP,

    /* Space afterward. */
    TC_COMMA,

    /* Don't need spaces except sequentially. */
    TC_ID,

    /* Don't need spaces except sequentially. */
    TC_PUNCT,
  };
174 needs_space (enum token_class prev, enum token_class next)
176 /* Don't need a space before or after the end of a command.
177 (A new-line is needed afterward as a special case.) */
178 if (prev == TC_ENDCMD || next == TC_ENDCMD)
181 /* Binary operators always have a space on both sides. */
182 if (prev == TC_BINOP || next == TC_BINOP)
185 /* A comma always has a space afterward. */
186 if (prev == TC_COMMA)
189 /* Otherwise, PREV is TC_ID or TC_PUNCT, which only need a space if there are
190 two or them in a row. */
194 static enum token_class
195 classify_token (enum token_type type)
248 macro_tokens_to_representation (struct macro_tokens *mts, struct string *s)
253 macro_token_to_representation (&mts->mts[0], s);
254 for (size_t i = 1; i < mts->n; i++)
256 enum token_type prev = mts->mts[i - 1].token.type;
257 enum token_type next = mts->mts[i].token.type;
259 if (prev == T_ENDCMD)
260 ds_put_byte (s, '\n');
263 enum token_class pc = classify_token (prev);
264 enum token_class nc = classify_token (next);
265 if (needs_space (pc, nc))
266 ds_put_byte (s, ' ');
269 macro_token_to_representation (&mts->mts[i], s);
274 macro_destroy (struct macro *m)
280 for (size_t i = 0; i < m->n_params; i++)
282 struct macro_param *p = &m->params[i];
285 macro_tokens_uninit (&p->def);
293 token_uninit (&p->charend);
297 token_uninit (&p->enclose[0]);
298 token_uninit (&p->enclose[1]);
306 macro_tokens_uninit (&m->body);
311 macro_set_create (void)
313 struct macro_set *set = xmalloc (sizeof *set);
314 *set = (struct macro_set) {
315 .macros = HMAP_INITIALIZER (set->macros),
321 macro_set_destroy (struct macro_set *set)
326 struct macro *macro, *next;
327 HMAP_FOR_EACH_SAFE (macro, next, struct macro, hmap_node, &set->macros)
329 hmap_delete (&set->macros, ¯o->hmap_node);
330 macro_destroy (macro);
332 hmap_destroy (&set->macros);
/* Returns the hash value under which a macro called NAME is stored.  The
   case-insensitive hash function matches the case-insensitive comparison
   that macro lookup uses. */
static unsigned int
hash_macro_name (const char *name)
{
  const unsigned int basis = 0;
  return utf8_hash_case_string (name, basis);
}
342 static struct macro *
343 macro_set_find__ (struct macro_set *set, const char *name)
346 HMAP_FOR_EACH_WITH_HASH (macro, struct macro, hmap_node,
347 hash_macro_name (name), &set->macros)
348 if (!utf8_strcasecmp (macro->name, name))
355 macro_set_find (const struct macro_set *set, const char *name)
357 return macro_set_find__ (CONST_CAST (struct macro_set *, set), name);
360 /* Adds M to SET. M replaces any existing macro with the same name. Takes
363 macro_set_add (struct macro_set *set, struct macro *m)
365 struct macro *victim = macro_set_find__ (set, m->name);
368 hmap_delete (&set->macros, &victim->hmap_node);
369 macro_destroy (victim);
372 hmap_insert (&set->macros, &m->hmap_node, hash_macro_name (m->name));
/* What a macro expander expects to see next while it collects the arguments
   of a macro call. */
enum me_state
  {
    /* A syntax error was found; no further tokens are accepted. */
    ME_ERROR,

    /* Accumulating tokens in me->params toward the end of any type of
       argument. */
    ME_ARG,

    /* Expecting the opening delimiter of an ARG_ENCLOSE argument. */
    ME_ENCLOSE,

    /* Expecting a keyword for a keyword argument. */
    ME_KEYWORD,

    /* Expecting an equal sign for a keyword argument. */
    ME_EQUALS,
  };
395 struct macro_expander
397 const struct macro_set *macros;
402 const struct macro *macro;
403 struct macro_tokens **args;
404 const struct macro_param *param;
408 me_finished (struct macro_expander *me)
410 for (size_t i = 0; i < me->macro->n_params; i++)
413 me->args[i] = xmalloc (sizeof *me->args[i]);
414 macro_tokens_copy (me->args[i], &me->macro->params[i].def);
420 me_next_arg (struct macro_expander *me)
424 assert (!me->macro->n_params);
425 return me_finished (me);
427 else if (me->param->positional)
430 if (me->param >= &me->macro->params[me->macro->n_params])
431 return me_finished (me);
434 me->state = (!me->param->positional ? ME_KEYWORD
435 : me->param->arg_type == ARG_ENCLOSE ? ME_ENCLOSE
442 for (size_t i = 0; i < me->macro->n_params; i++)
445 me->state = ME_KEYWORD;
448 return me_finished (me);
453 me_error (struct macro_expander *me)
455 me->state = ME_ERROR;
460 me_add_arg (struct macro_expander *me, const struct macro_token *mt)
462 const struct macro_param *p = me->param;
464 const struct token *token = &mt->token;
465 if ((token->type == T_ENDCMD || token->type == T_STOP)
466 && p->arg_type != ARG_CMDEND)
468 msg (SE, _("Unexpected end of command reading argument %s "
469 "to macro %s."), me->param->name, me->macro->name);
471 return me_error (me);
476 struct macro_tokens **argp = &me->args[p - me->macro->params];
478 *argp = xzalloc (sizeof **argp);
479 struct macro_tokens *arg = *argp;
480 if (p->arg_type == ARG_N_TOKENS)
482 macro_tokens_add (arg, mt);
483 if (arg->n >= p->n_tokens)
484 return me_next_arg (me);
487 else if (p->arg_type == ARG_CMDEND)
489 if (token->type == T_ENDCMD || token->type == T_STOP)
490 return me_next_arg (me);
491 macro_tokens_add (arg, mt);
496 const struct token *end
497 = p->arg_type == ARG_CHAREND ? &p->charend : &p->enclose[1];
498 if (token_equal (token, end))
499 return me_next_arg (me);
500 macro_tokens_add (arg, mt);
506 me_expected (struct macro_expander *me, const struct macro_token *actual,
507 const struct token *expected)
509 const struct substring actual_s
510 = (actual->representation.length ? actual->representation
511 : ss_cstr (_("<end of input>")));
512 char *expected_s = token_to_string (expected);
513 msg (SE, _("Found `%.*s' while expecting `%s' reading argument %s "
515 (int) actual_s.length, actual_s.string, expected_s,
516 me->param->name, me->macro->name);
519 return me_error (me);
523 me_enclose (struct macro_expander *me, const struct macro_token *mt)
525 const struct token *token = &mt->token;
528 if (token_equal (&me->param->enclose[0], token))
534 return me_expected (me, mt, &me->param->enclose[0]);
537 static const struct macro_param *
538 macro_find_parameter_by_name (const struct macro *m, struct substring name)
540 if (ss_first (name) == '!')
541 ss_advance (&name, 1);
543 for (size_t i = 0; i < m->n_params; i++)
545 const struct macro_param *p = &m->params[i];
546 struct substring p_name = ss_cstr (p->name + 1);
547 if (!utf8_strncasecmp (p_name.string, p_name.length,
548 name.string, name.length))
555 me_keyword (struct macro_expander *me, const struct macro_token *mt)
557 const struct token *token = &mt->token;
558 if (token->type != T_ID)
559 return me_finished (me);
561 const struct macro_param *p = macro_find_parameter_by_name (me->macro,
565 size_t arg_index = p - me->macro->params;
567 if (me->args[arg_index])
570 _("Argument %s multiply specified in call to macro %s."),
571 p->name, me->macro->name);
572 return me_error (me);
576 me->state = ME_EQUALS;
580 return me_finished (me);
584 me_equals (struct macro_expander *me, const struct macro_token *mt)
586 const struct token *token = &mt->token;
589 if (token->type == T_EQUALS)
595 return me_expected (me, mt, &(struct token) { .type = T_EQUALS });
599 macro_expander_create (const struct macro_set *macros,
600 const struct token *token,
601 struct macro_expander **mep)
604 if (macro_set_is_empty (macros))
606 if (token->type != T_ID && token->type != T_MACRO_ID)
609 const struct macro *macro = macro_set_find (macros, token->string.string);
613 struct macro_expander *me = xmalloc (sizeof *me);
614 *me = (struct macro_expander) {
621 if (!macro->n_params)
625 me->state = (!macro->params[0].positional ? ME_KEYWORD
626 : macro->params[0].arg_type == ARG_ENCLOSE ? ME_ENCLOSE
628 me->args = xcalloc (macro->n_params, sizeof *me->args);
629 me->param = macro->params;
635 macro_expander_destroy (struct macro_expander *me)
640 for (size_t i = 0; i < me->macro->n_params; i++)
643 macro_tokens_uninit (me->args[i]);
650 /* Adds TOKEN to the collection of tokens in ME that potentially need to be
653 Returns -1 if the tokens added do not actually invoke a macro. The caller
654 should consume the first token without expanding it.
656 Returns 0 if the macro expander needs more tokens, for macro arguments or to
657 decide whether this is actually a macro invocation. The caller should call
658 macro_expander_add() again with the next token.
660 Returns a positive number to indicate that the returned number of tokens
661 invoke a macro. The number returned might be less than the number of tokens
662 added because it can take a few tokens of lookahead to determine whether the
663 macro invocation is finished. The caller should call
664 macro_expander_get_expansion() to obtain the expansion. */
666 macro_expander_add (struct macro_expander *me, const struct macro_token *mt)
674 return me_add_arg (me, mt);
677 return me_enclose (me, mt);
680 return me_keyword (me, mt);
683 return me_equals (me, mt);
/* Each argument to a macro function is one of:

     - A quoted string or other single literal token.

     - An argument to the macro being expanded, e.g. !1 or a named argument.

     - A function invocation.

   Each function invocation yields a character sequence to be turned into a
   sequence of tokens.  The case where that character sequence is a single
   quoted string is an important special case.
*/

/* Everything needed to parse and evaluate a macro function call. */
struct parse_macro_function_ctx
  {
    /* The tokens to parse, starting at the candidate function name. */
    struct macro_token *input;
    size_t n_input;

    /* Context of the expansion that the function call appears in; these
       mirror the parameters of macro_expand(). */
    int nesting_countdown;
    const struct macro_set *macros;
    const struct macro_expander *me;
    bool *expand;
  };
/* Forward declaration: macro function evaluation and macro expansion refer
   to each other. */
static void
macro_expand (const struct macro_tokens *mts,
              int nesting_countdown, const struct macro_set *macros,
              const struct macro_expander *me, bool *expand,
              struct macro_tokens *exp);
/* Forward declaration: function arguments may themselves be function calls,
   so argument parsing calls back into function evaluation. */
static bool
expand_macro_function (struct parse_macro_function_ctx *ctx,
                       struct string *output, size_t *input_consumed);
724 parse_function_arg (struct parse_macro_function_ctx *ctx,
725 size_t i, struct string *farg)
727 struct macro_token *tokens = ctx->input;
728 const struct token *token = &tokens[i].token;
729 if (token->type == T_MACRO_ID)
731 const struct macro_param *param = macro_find_parameter_by_name (
732 ctx->me->macro, token->string);
735 size_t param_idx = param - ctx->me->macro->params;
736 const struct macro_tokens *marg = ctx->me->args[param_idx];
737 for (size_t i = 0; i < marg->n; i++)
740 ds_put_byte (farg, ' ');
741 ds_put_substring (farg, marg->mts[i].representation);
746 struct parse_macro_function_ctx subctx = {
747 .input = &ctx->input[i],
748 .n_input = ctx->n_input - i,
749 .nesting_countdown = ctx->nesting_countdown,
750 .macros = ctx->macros,
752 .expand = ctx->expand,
754 size_t subinput_consumed;
755 if (expand_macro_function (&subctx, farg, &subinput_consumed))
756 return subinput_consumed;
759 ds_put_substring (farg, tokens[i].representation);
764 parse_macro_function (struct parse_macro_function_ctx *ctx,
765 struct string_array *args,
766 struct substring function,
767 int min_args, int max_args,
768 size_t *input_consumed)
770 struct macro_token *tokens = ctx->input;
771 size_t n_tokens = ctx->n_input;
774 || tokens[0].token.type != T_MACRO_ID
775 || !ss_equals_case (tokens[0].token.string, function))
778 if (n_tokens < 2 || tokens[1].token.type != T_LPAREN)
780 printf ("`(' expected following %s'\n", function.string);
784 string_array_init (args);
786 for (size_t i = 2;; )
790 if (tokens[i].token.type == T_RPAREN)
792 *input_consumed = i + 1;
793 if (args->n < min_args || args->n > max_args)
795 printf ("Wrong number of arguments to %s.\n", function.string);
801 struct string s = DS_EMPTY_INITIALIZER;
802 i += parse_function_arg (ctx, i, &s);
808 string_array_append_nocopy (args, ds_steal_cstr (&s));
810 if (tokens[i].token.type == T_COMMA)
812 else if (tokens[i].token.type != T_RPAREN)
814 printf ("Expecting `,' or `)' in %s invocation.", function.string);
820 printf ("Missing closing parenthesis in arguments to %s.\n",
824 string_array_destroy (args);
829 unquote_string (const char *s, struct string *content)
831 struct string_lexer slex;
832 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE /* XXX */);
835 if (!string_lexer_next (&slex, &token1))
838 if (token1.type != T_STRING)
840 token_uninit (&token1);
845 if (string_lexer_next (&slex, &token2))
847 token_uninit (&token1);
848 token_uninit (&token2);
852 ds_put_substring (content, token1.string);
853 token_uninit (&token1);
858 parse_integer (const char *s, int *np)
863 long int n = strtol (s, &tail, 10);
864 *np = n < INT_MIN ? INT_MIN : n > INT_MAX ? INT_MAX : n;
865 tail += strspn (tail, CC_SPACES);
866 return *tail == '\0' && errno != ERANGE && n == *np;
870 expand_macro_function (struct parse_macro_function_ctx *ctx,
871 struct string *output,
872 size_t *input_consumed)
874 struct string_array args;
876 if (parse_macro_function (ctx, &args, ss_cstr ("!length"), 1, 1,
878 ds_put_format (output, "%zu", strlen (args.strings[0]));
879 else if (parse_macro_function (ctx, &args, ss_cstr ("!blanks"), 1, 1,
883 if (!parse_integer (args.strings[0], &n))
885 printf ("argument to !BLANKS must be non-negative integer (not \"%s\")\n", args.strings[0]);
886 string_array_destroy (&args);
890 ds_put_byte_multiple (output, ' ', n);
892 else if (parse_macro_function (ctx, &args, ss_cstr ("!concat"), 1, INT_MAX,
895 for (size_t i = 0; i < args.n; i++)
896 if (!unquote_string (args.strings[i], output))
897 ds_put_cstr (output, args.strings[i]);
899 else if (parse_macro_function (ctx, &args, ss_cstr ("!head"), 1, 1,
902 struct string content = DS_EMPTY_INITIALIZER;
903 const char *s = (unquote_string (args.strings[0], &content)
904 ? ds_cstr (&content) : args.strings[0]);
906 struct macro_tokens mts = { .n = 0 };
907 macro_tokens_from_string (&mts, ss_cstr (s), SEG_MODE_INTERACTIVE /* XXX */);
909 ds_put_substring (output, mts.mts[0].representation);
910 macro_tokens_uninit (&mts);
911 ds_destroy (&content);
913 else if (parse_macro_function (ctx, &args, ss_cstr ("!index"), 2, 2,
916 const char *haystack = args.strings[0];
917 const char *needle = strstr (haystack, args.strings[1]);
918 ds_put_format (output, "%zu", needle ? needle - haystack + 1 : 0);
920 else if (parse_macro_function (ctx, &args, ss_cstr ("!quote"), 1, 1,
923 if (unquote_string (args.strings[0], NULL))
924 ds_put_cstr (output, args.strings[0]);
927 ds_extend (output, strlen (args.strings[0]) + 2);
928 ds_put_byte (output, '\'');
929 for (const char *p = args.strings[0]; *p; p++)
932 ds_put_byte (output, '\'');
933 ds_put_byte (output, *p);
935 ds_put_byte (output, '\'');
938 else if (parse_macro_function (ctx, &args, ss_cstr ("!substr"), 2, 3,
942 if (!parse_integer (args.strings[1], &start) || start < 1)
944 printf ("second argument to !SUBSTR must be positive integer (not \"%s\")\n", args.strings[1]);
945 string_array_destroy (&args);
950 if (args.n > 2 && (!parse_integer (args.strings[2], &count) || count < 0))
952 printf ("third argument to !SUBSTR must be non-negative integer (not \"%s\")\n", args.strings[1]);
953 string_array_destroy (&args);
957 struct substring s = ss_cstr (args.strings[0]);
958 ds_put_substring (output, ss_substr (s, start - 1, count));
960 else if (parse_macro_function (ctx, &args, ss_cstr ("!tail"), 1, 1,
963 struct string content = DS_EMPTY_INITIALIZER;
964 const char *s = (unquote_string (args.strings[0], &content)
965 ? ds_cstr (&content) : args.strings[0]);
967 struct macro_tokens mts = { .n = 0 };
968 macro_tokens_from_string (&mts, ss_cstr (s), SEG_MODE_INTERACTIVE /* XXX */);
971 struct macro_tokens tail = { .mts = mts.mts + 1, .n = mts.n - 1 };
972 macro_tokens_to_representation (&tail, output);
974 macro_tokens_uninit (&mts);
975 ds_destroy (&content);
977 else if (parse_macro_function (ctx, &args, ss_cstr ("!unquote"), 1, 1,
980 if (!unquote_string (args.strings[0], output))
981 ds_put_cstr (output, args.strings[0]);
983 else if (ctx->n_input > 0
984 && ctx->input[0].token.type == T_MACRO_ID
985 && ss_equals_case (ctx->input[0].token.string, ss_cstr ("!null")))
993 string_array_destroy (&args);
998 macro_expand (const struct macro_tokens *mts,
999 int nesting_countdown, const struct macro_set *macros,
1000 const struct macro_expander *me, bool *expand,
1001 struct macro_tokens *exp)
1005 - Macro names in macro bodies are not expanded by default. !EVAL()
1008 - Macro names in arguments to macro invocations (outside of macro bodies)
1009 are expanded by default, unless !NOEXPAND. */
1010 if (nesting_countdown <= 0)
1012 printf ("maximum nesting level exceeded\n");
1013 for (size_t i = 0; i < mts->n; i++)
1014 macro_tokens_add (exp, &mts->mts[i]);
1018 for (size_t i = 0; i < mts->n; i++)
1020 const struct macro_token *mt = &mts->mts[i];
1021 const struct token *token = &mt->token;
1022 if (token->type == T_MACRO_ID && me)
1024 const struct macro_param *param = macro_find_parameter_by_name (
1025 me->macro, token->string);
1028 const struct macro_tokens *arg = me->args[param - me->macro->params];
1029 //macro_tokens_print (arg, stdout);
1030 if (*expand && param->expand_arg)
1031 macro_expand (arg, nesting_countdown, macros, NULL, expand, exp);
1033 for (size_t i = 0; i < arg->n; i++)
1034 macro_tokens_add (exp, &arg->mts[i]);
1041 struct macro_expander *subme;
1042 int retval = macro_expander_create (macros, token, &subme);
1043 for (size_t j = 1; !retval; j++)
1045 const struct macro_token endcmd = { .token = { .type = T_ENDCMD } };
1046 retval = macro_expander_add (
1047 subme, i + j < mts->n ? &mts->mts[i + j] : &endcmd);
1052 macro_expand (&subme->macro->body, nesting_countdown - 1, macros,
1053 subme, expand, exp);
1054 macro_expander_destroy (subme);
1058 macro_expander_destroy (subme);
1061 if (token->type != T_MACRO_ID)
1063 macro_tokens_add (exp, mt);
1067 /* Maybe each arg should just be a string, either a quoted string or a
1068 non-quoted string containing tokens. */
1069 struct parse_macro_function_ctx ctx = {
1070 .input = &mts->mts[i],
1071 .n_input = mts->n - i,
1072 .nesting_countdown = nesting_countdown,
1077 struct string function_output = DS_EMPTY_INITIALIZER;
1078 size_t function_consumed;
1079 if (expand_macro_function (&ctx, &function_output, &function_consumed))
1081 i += function_consumed - 1;
1083 macro_tokens_from_string (exp, function_output.ss,
1084 SEG_MODE_INTERACTIVE /* XXX */);
1085 ds_destroy (&function_output);
1090 if (ss_equals_case (token->string, ss_cstr ("!onexpand")))
1092 else if (ss_equals_case (token->string, ss_cstr ("!offexpand")))
1095 macro_tokens_add (exp, mt);
1100 macro_expander_get_expansion (struct macro_expander *me, struct macro_tokens *exp)
1103 for (size_t i = 0; i < me->macro->n_params; i++)
1105 printf ("%s:\n", me->macro->params[i].name);
1106 macro_tokens_print (me->args[i], stdout);
1111 macro_expand (&me->macro->body, settings_get_mnest (),
1112 me->macros, me, &expand, exp);
1115 printf ("expansion:\n");
1116 macro_tokens_print (exp, stdout);