From b8f7ae6610f04de0b4325a905cc69beb65bad2ab Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Sat, 10 Sep 2022 18:23:09 -0700
Subject: [PATCH] DEFINE: Improve error messages for parsing macro body.

Without this change, errors always got reported following the macro
body, not for the particular token at issue.
---
 src/language/control/define.c    | 110 +++++++++++++++++++++++++-----
 src/language/lexer/lexer.c       |   9 +++
 src/language/lexer/lexer.h       |   1 +
 src/language/lexer/macro.c       |  44 ++++++------
 src/language/lexer/macro.h       |   3 -
 src/libpspp/message.c            |  22 ++++++
 src/libpspp/message.h            |   3 +
 tests/language/control/define.at |   6 +-
 8 files changed, 149 insertions(+), 49 deletions(-)

diff --git a/src/language/control/define.c b/src/language/control/define.c
index 14eafacb42..7b1807be1e 100644
--- a/src/language/control/define.c
+++ b/src/language/control/define.c
@@ -89,6 +89,96 @@ dup_arg_type (struct lexer *lexer, bool *saw_arg_type)
     }
 }
 
+/* Parses the body of a macro definition from LEXER, consuming T_STRING
+   tokens until match_macro_id() accepts !ENDDEFINE, and stores the body's
+   tokens in *MTS.  *MTS is initialized here, so it must not hold tokens
+   that still need to be freed.
+
+   The string tokens are joined, each followed by a new-line, and the result
+   is segmented and tokenized.  A tokenization error is reported at the
+   offending token itself, by advancing the start point of the token at
+   which parsing began across the body text that precedes the bad token.
+
+   Returns true if successful.  Returns false if a token that is neither a
+   string nor !ENDDEFINE was encountered (leaving *MTS empty) or if any body
+   token failed to tokenize (leaving the valid tokens in *MTS).  An error
+   message has been issued in either case. */
+static bool
+parse_macro_body (struct lexer *lexer, struct macro_tokens *mts)
+{
+  *mts = (struct macro_tokens) { .n = 0 };
+  struct string body = DS_EMPTY_INITIALIZER;
+  struct msg_point start = lex_ofs_start_point (lexer, lex_ofs (lexer));
+  while (!match_macro_id (lexer, "!ENDDEFINE"))
+    {
+      if (lex_token (lexer) != T_STRING)
+        {
+          lex_error (lexer,
+                     _("Syntax error expecting macro body or !ENDDEFINE."));
+          ds_destroy (&body);
+          return false;
+        }
+
+      ds_put_substring (&body, lex_tokss (lexer));
+      ds_put_byte (&body, '\n');
+      lex_get (lexer);
+    }
+
+  struct segmenter segmenter = segmenter_init (lex_get_syntax_mode (lexer),
+                                               true);
+  struct substring p = body.ss;
+  bool ok = true;
+  while (p.length > 0)
+    {
+      enum segment_type type;
+      int seg_len = segmenter_push (&segmenter, p.string,
+                                    p.length, true, &type);
+      assert (seg_len >= 0);
+
+      struct macro_token mt = {
+        .token = { .type = T_STOP },
+        .syntax = ss_head (p, seg_len),
+      };
+      enum tokenize_result result
+        = token_from_segment (type, mt.syntax, &mt.token);
+      ss_advance (&p, seg_len);
+
+      switch (result)
+        {
+        case TOKENIZE_EMPTY:
+          break;
+
+        case TOKENIZE_TOKEN:
+          macro_tokens_add (mts, &mt);
+          break;
+
+        case TOKENIZE_ERROR:
+          {
+            /* The braces are required: before C23, a declaration may not
+               directly follow a case label. */
+            size_t start_offset = mt.syntax.string - body.ss.string;
+            size_t end_offset = start_offset + (mt.syntax.length ? mt.syntax.length - 1 : 0);
+
+            const struct msg_location loc = {
+              .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
+              .start = msg_point_advance (start, ss_buffer (body.ss.string, start_offset)),
+              .end = msg_point_advance (start, ss_buffer (body.ss.string, end_offset)),
+              .src = CONST_CAST (struct lex_source *, lex_source (lexer)),
+            };
+            msg_at (SE, &loc, "%s", mt.token.string.string);
+            intern_unref (loc.file_name);
+
+            ok = false;
+          }
+          break;
+        }
+
+      token_uninit (&mt.token);
+    }
+  ds_destroy (&body);
+  return ok;
+}
+
 int
 cmd_define (struct lexer *lexer, struct dataset *ds UNUSED)
 {
@@ -270,21 +360,8 @@ cmd_define (struct lexer *lexer, struct dataset *ds UNUSED)
         goto error;
     }
 
-  struct string body = DS_EMPTY_INITIALIZER;
-  while (!match_macro_id (lexer, "!ENDDEFINE"))
-    {
-      if (lex_token (lexer) != T_STRING)
-        {
-          lex_error (lexer,
-                     _("Syntax error expecting macro body or !ENDDEFINE."));
-          ds_destroy (&body);
-          goto error;
-        }
-
-      ds_put_substring (&body, lex_tokss (lexer));
-      ds_put_byte (&body, '\n');
-      lex_get (lexer);
-    }
+  if (!parse_macro_body (lexer, &m->body))
+    goto error;
 
   struct msg_point macro_end = lex_ofs_end_point (lexer, lex_ofs (lexer) - 1);
   m->location = xmalloc (sizeof *m->location);
@@ -294,9 +371,6 @@ cmd_define (struct lexer *lexer, struct dataset *ds UNUSED)
     .end = { .line = macro_end.line },
   };
 
-  macro_tokens_from_string (&m->body, body.ss, lex_get_syntax_mode (lexer));
-  ds_destroy (&body);
-
   lex_define_macro (lexer, m);
 
   return CMD_SUCCESS;
diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c
index cdfc848360..7d268647e6 100644
--- a/src/language/lexer/lexer.c
+++ b/src/language/lexer/lexer.c
@@ -1923,6 +1923,15 @@ lex_source__ (const struct lexer *lexer)
           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
 }
 
+/* Returns the lex_source that lex_source__() yields for LEXER.  This
+   wrapper, declared in lexer.h, lets code outside this file, such as DEFINE's
+   macro body parser, fill in the "src" member of a struct msg_location. */
+const struct lex_source *
+lex_source (const struct lexer *lexer)
+{
+  return lex_source__ (lexer);
+}
+
 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
    OFS1 in the current command, inclusive.  (For example, if OFS0 and OFS1 are
    both zero, this requests the syntax for the first token in the current
diff --git a/src/language/lexer/lexer.h b/src/language/lexer/lexer.h
index 027339f96e..a23e4bf693 100644
--- a/src/language/lexer/lexer.h
+++ b/src/language/lexer/lexer.h
@@ -174,6 +174,7 @@ bool lex_next_is_from_macro (const struct lexer *, int n);
 const char *lex_get_file_name (const struct lexer *);
 struct msg_location *lex_get_location (const struct lexer *, int n0, int n1);
 const char *lex_get_encoding (const struct lexer *);
+const struct lex_source *lex_source (const struct lexer *);
 
 /* Issuing errors and warnings. */
 void lex_error (struct lexer *, const char *, ...) PRINTF_FORMAT (2, 3);
diff --git a/src/language/lexer/macro.c b/src/language/lexer/macro.c
index 15ed9f12d2..05b3041bb3 100644
--- a/src/language/lexer/macro.c
+++ b/src/language/lexer/macro.c
@@ -220,12 +220,12 @@ macro_tokens_add (struct macro_tokens *mts, const struct macro_token *mt)
   macro_token_copy (macro_tokens_add_uninit (mts), mt);
 }
 
-/* Tokenizes SRC according to MODE and appends the tokens to MTS.  Uses STACK,
-   if nonull, for error reporting. */
+/* Tokenizes SRC according to MODE and appends the tokens to MTS, using STACK
+   for error reporting. */
 static void
-macro_tokens_from_string__ (struct macro_tokens *mts, const struct substring src,
-                            enum segmenter_mode mode,
-                            const struct macro_expansion_stack *stack)
+macro_tokens_from_string (struct macro_tokens *mts, const struct substring src,
+                          enum segmenter_mode mode,
+                          const struct macro_expansion_stack *stack)
 {
   struct segmenter segmenter = segmenter_init (mode, true);
   struct substring body = src;
@@ -267,14 +267,6 @@ macro_tokens_from_string__ (struct macro_tokens *mts, const struct substring src
     }
 }
 
-/* Tokenizes SRC according to MODE and appends the tokens to MTS. */
-void
-macro_tokens_from_string (struct macro_tokens *mts, const struct substring src,
-                          enum segmenter_mode mode)
-{
-  macro_tokens_from_string__ (mts, src, mode, NULL);
-}
-
 void
 macro_tokens_print (const struct macro_tokens *mts, FILE *stream)
 {
@@ -1204,8 +1196,8 @@ expand_macro_function (const struct macro_expander *me,
                                                  me->segmenter_mode, &tmp);
 
         struct macro_tokens mts = { .n = 0 };
-        macro_tokens_from_string__ (&mts, ss_cstr (s), me->segmenter_mode,
-                                    me->stack);
+        macro_tokens_from_string (&mts, ss_cstr (s), me->segmenter_mode,
+                                  me->stack);
         if (mts.n > 0)
           ds_put_substring (output, mts.mts[0].syntax);
         macro_tokens_uninit (&mts);
@@ -1274,8 +1266,8 @@ expand_macro_function (const struct macro_expander *me,
                                                  me->segmenter_mode, &tmp);
 
         struct macro_tokens mts = { .n = 0 };
-        macro_tokens_from_string__ (&mts, ss_cstr (s), me->segmenter_mode,
-                                    me->stack);
+        macro_tokens_from_string (&mts, ss_cstr (s), me->segmenter_mode,
+                                  me->stack);
         if (mts.n > 1)
           {
             struct macro_tokens tail = { .mts = mts.mts + 1, .n = mts.n - 1 };
@@ -1306,8 +1298,8 @@ expand_macro_function (const struct macro_expander *me,
     case MF_EVAL:
       {
         struct macro_tokens mts = { .n = 0 };
-        macro_tokens_from_string__ (&mts, ss_cstr (args.strings[0]),
-                                    me->segmenter_mode, me->stack);
+        macro_tokens_from_string (&mts, ss_cstr (args.strings[0]),
+                                  me->segmenter_mode, me->stack);
         struct macro_tokens exp = { .n = 0 };
         struct macro_expansion_stack stack = {
           .name = "!EVAL",
@@ -1575,7 +1567,7 @@ macro_evaluate_number (const struct macro_token **tokens, size_t n_tokens,
     return false;
 
   struct macro_tokens mts = { .n = 0 };
-  macro_tokens_from_string__ (&mts, ss_cstr (s), me->segmenter_mode, me->stack);
+  macro_tokens_from_string (&mts, ss_cstr (s), me->segmenter_mode, me->stack);
   if (mts.n != 1 || !token_is_number (&mts.mts[0].token))
     {
       macro_error (me->stack, mts.n > 0 ? &mts.mts[0] : NULL,
@@ -1817,8 +1809,8 @@ macro_expand_do (const struct macro_token *tokens, size_t n_tokens,
         return 0;
 
       struct macro_tokens items = { .n = 0 };
-      macro_tokens_from_string__ (&items, ss_cstr (list), me->segmenter_mode,
-                                  me->stack);
+      macro_tokens_from_string (&items, ss_cstr (list), me->segmenter_mode,
+                                me->stack);
       free (list);
 
       const struct macro_token *do_end = find_doend (subme.stack, p, end);
@@ -1984,8 +1976,8 @@ macro_expand_arg (const struct token *token, const struct macro_expander *me,
                                             token->string.length);
       if (var)
         {
-          macro_tokens_from_string__ (exp, ss_cstr (var),
-                                      me->segmenter_mode, me->stack);
+          macro_tokens_from_string (exp, ss_cstr (var),
+                                    me->segmenter_mode, me->stack);
           return true;
         }
 
@@ -2055,8 +2047,8 @@ macro_expand__ (const struct macro_token *mts, size_t n,
   size_t n_function = expand_macro_function (me, mts, n, &function_output);
   if (n_function)
     {
-      macro_tokens_from_string__ (exp, function_output.ss,
-                                  me->segmenter_mode, me->stack);
+      macro_tokens_from_string (exp, function_output.ss,
+                                me->segmenter_mode, me->stack);
       ds_destroy (&function_output);
 
       return n_function;
diff --git a/src/language/lexer/macro.h b/src/language/lexer/macro.h
index 4a6f73dff9..db6edd83a9 100644
--- a/src/language/lexer/macro.h
+++ b/src/language/lexer/macro.h
@@ -57,9 +57,6 @@ void macro_tokens_uninit (struct macro_tokens *);
 struct macro_token *macro_tokens_add_uninit (struct macro_tokens *);
 void macro_tokens_add (struct macro_tokens *, const struct macro_token *);
 
-void macro_tokens_from_string (struct macro_tokens *, const struct substring,
-                               enum segmenter_mode);
-
 void macro_tokens_to_syntax (struct macro_tokens *, struct string *,
                              size_t *ofs, size_t *len);
 
diff --git a/src/libpspp/message.c b/src/libpspp/message.c
index 38726d9f5b..ac24d55d12 100644
--- a/src/libpspp/message.c
+++ b/src/libpspp/message.c
@@ -112,6 +112,28 @@ msg_set_handler (const struct msg_handler *handler)
   msg_handler = *handler;
 }
 
+/* msg_point. */
+
+/* Takes POINT, adds to it the syntax in SYNTAX, incrementing the line number
+   for each new-line in SYNTAX and the column number for each column, and
+   returns the result. */
+struct msg_point
+msg_point_advance (struct msg_point point, struct substring syntax)
+{
+  for (;;)
+    {
+      size_t newline = ss_find_byte (syntax, '\n');
+      if (newline == SIZE_MAX)
+        break;
+      point.line++;
+      point.column = 1;
+      ss_advance (&syntax, newline + 1);
+    }
+
+  point.column += ss_utf8_count_columns (syntax);
+  return point;
+}
+
 /* msg_location. */
 
 void
diff --git a/src/libpspp/message.h b/src/libpspp/message.h
index d994a369f8..24d2defe46 100644
--- a/src/libpspp/message.h
+++ b/src/libpspp/message.h
@@ -23,6 +23,7 @@
 #include "libpspp/compiler.h"
 
 struct string;
+struct substring;
 
 /* What kind of message is this? */
 enum msg_category
@@ -85,6 +86,8 @@ struct msg_point
     int column;
   };
 
+struct msg_point msg_point_advance (struct msg_point, struct substring);
+
 /* Location of the cause of an error. */
 struct msg_location
   {
diff --git a/tests/language/control/define.at b/tests/language/control/define.at
index e528509b1e..b3896b1aca 100644
--- a/tests/language/control/define.at
+++ b/tests/language/control/define.at
@@ -1790,8 +1790,10 @@ x'123'
 !ENDDEFINE.
 ])
 AT_CHECK([pspp define.sps], [1], [dnl
-define.sps:3: error: DEFINE: String of hex digits has 3 characters, which is
-not a multiple of 2.
+define.sps:2.1-2.6: error: DEFINE: String of hex digits has 3 characters, which
+is not a multiple of 2.
+    2 | x'123'
+      | ^~~~~~
 ])
 AT_CLEANUP
 
-- 
2.30.2