From 8037ba709328b0896cf825f0ca6e4177ee4a512b Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 18 Mar 2023 14:09:38 -0700 Subject: [PATCH] lexer: Avoid reentering the lexer upon error in special cases. Thanks to Youngseok Choi for reporting the bug. --- src/language/lexer/lexer.c | 44 ++++++++++++++++++++++++++++++----- tests/language/lexer/lexer.at | 16 +++++++++++++ 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c index 19404f1ee0..489ade8372 100644 --- a/src/language/lexer/lexer.c +++ b/src/language/lexer/lexer.c @@ -282,6 +282,11 @@ struct lexer { struct ll_list sources; /* Contains "struct lex_source"s. */ struct macro_set *macros; + + /* Temporarily stores errors and warnings to be emitted by the lexer while + lexing is going on, to avoid reentrancy. */ + struct msg **messages; + size_t n_messages, allocated_messages; }; static struct lex_source *lex_source__ (const struct lexer *); @@ -344,6 +349,8 @@ lex_destroy (struct lexer *lexer) { struct lex_source *source, *next; + assert (!lexer->messages); + ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources) { ll_remove (&source->ll); @@ -2102,7 +2109,12 @@ lex_get_error (struct lex_source *src, const struct lex_token *token) .location = lex_token_location_rw (src, token, token), .text = ss_xstrdup (token->token.string), }; - msg_emit (m); + + struct lexer *lexer = src->lexer; + if (lexer->n_messages >= lexer->allocated_messages) + lexer->messages = x2nrealloc (lexer->messages, &lexer->allocated_messages, + sizeof *lexer->messages); + lexer->messages[lexer->n_messages++] = m; } /* Attempts to append an additional token to 'pp' in SRC, reading more from the @@ -2352,12 +2364,8 @@ lex_source_get_merge (struct lex_source *src) return false; } -/* Attempts to obtain at least one new token into 'lookahead' in SRC. - - Returns true if successful, false on failure. In the latter case, SRC is - exhausted and 'src->eof' is now true. */ static bool -lex_source_get_parse (struct lex_source *src) +lex_source_get_parse__ (struct lex_source *src) { struct merger m = MERGER_INIT; struct token out; @@ -2410,6 +2418,30 @@ lex_source_get_parse (struct lex_source *src) } } } + +/* Attempts to obtain at least one new token into 'lookahead' in SRC. + + Returns true if successful, false on failure. In the latter case, SRC is + exhausted and 'src->eof' is now true. */ +static bool +lex_source_get_parse (struct lex_source *src) +{ + bool ok = lex_source_get_parse__ (src); + struct lexer *lexer = src->lexer; + if (lexer->n_messages) + { + struct msg **messages = lexer->messages; + size_t n = lexer->n_messages; + + lexer->messages = NULL; + lexer->n_messages = lexer->allocated_messages = 0; + + for (size_t i = 0; i < n; i++) + msg_emit (messages[i]); + free (messages); + } + return ok; +} static void lex_source_push_endcmd__ (struct lex_source *src) diff --git a/tests/language/lexer/lexer.at b/tests/language/lexer/lexer.at index 5b6b660ecd..7519ffc2fa 100644 --- a/tests/language/lexer/lexer.at +++ b/tests/language/lexer/lexer.at @@ -133,3 +133,19 @@ AT_CHECK([pspp -O format=csv lexer.sps], [1], [dnl ]) AT_CLEANUP + +AT_SETUP([lexer crash due to reentrancy in error processing]) +dnl ^ is an invalid character in input that triggers an error message. +dnl 100 of them, as shown below, exceeds the 100-error limit. The +dnl minus sign causes the lexer to look ahead for a number, and then +dnl the ^ encountered afterward causes an error too, and then the +dnl message emission handler might reenter the lexer looking for a +dnl location, which can then cause the lexer to try to get a token +dnl again. It's a whole mess and the new way of avoiding reentrancy +dnl by keeping a collection of messages to emit until we've almost +dnl returned to the top level is much less prone to error. +AT_DATA([lexer.sps], [dnl +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-^ +]) +AT_CHECK([pspp lexer.sps], [1], [ignore]) +AT_CLEANUP \ No newline at end of file -- 2.30.2