X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Flexer%2Flexer.c;h=562fcc5827d3a4662d237633cb3785dee6e0cf33;hb=37cbc94c3ed75eb0ac345c7ada138ef4662f1433;hp=2823c24ab3524f19d411eaea7b9d82ce4a985665;hpb=edb4c98e1bbb8e0961c5e9f87452ed65ba0f8404;p=pspp diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c index 2823c24ab3..562fcc5827 100644 --- a/src/language/lexer/lexer.c +++ b/src/language/lexer/lexer.c @@ -122,9 +122,24 @@ struct lex_source int n_newlines; /* Number of new-lines up to seg_pos. */ bool suppress_next_newline; - /* Tokens. */ - struct deque deque; /* Indexes into 'tokens'. */ - struct lex_token *tokens; /* Lookahead tokens for parser. */ + /* Tokens. + + This is mostly like a deque, with the conceptual invariant that back <= + middle <= front (modulo SIZE_MAX+1). The tokens available for parsing + lie between 'back' and 'middle': the token at 'back' is the current + token, the token at 'back + 1' is the next token, and so on. There are + usually no tokens between 'middle' and 'front'; if there are, then they + need to go through macro expansion and are not yet available for + parsing. + + 'capacity' is the current number of elements in 'tokens'. It is always + a power of 2. 'front', 'middle', and 'back' refer to indexes in + 'tokens' modulo 'capacity'. */ + size_t front; + size_t middle; + size_t back; + size_t capacity; + struct lex_token *tokens; }; static struct lex_source *lex_source_create (struct lexer *, @@ -144,7 +159,7 @@ static char *lex_source_get_syntax__ (const struct lex_source *, static const struct lex_token *lex_next__ (const struct lexer *, int n); static void lex_source_push_endcmd__ (struct lex_source *); -static void lex_source_pop__ (struct lex_source *); +static void lex_source_pop_back (struct lex_source *); static bool lex_source_get (const struct lex_source *); static void lex_source_error_valist (struct lex_source *, int n0, int n1, const char *format, va_list) @@ -231,31 +246,46 @@ lex_append (struct lexer *lexer, struct lex_reader *reader) /* Advancing. */ +/* Adds a new token at the front of SRC and returns a pointer to it. The + caller should initialize it. Does not advance the middle pointer, so the + token isn't immediately available to the parser. */ static struct lex_token * lex_push_token__ (struct lex_source *src) { - struct lex_token *token; - - if (deque_is_full (&src->deque)) - src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens); + if (src->front - src->back >= src->capacity) + { + /* Expansion works just like a deque, so we reuse the code. */ + struct deque deque = { + .capacity = src->capacity, + .front = src->front, + .back = src->back, + }; + src->tokens = deque_expand (&deque, src->tokens, sizeof *src->tokens); + src->capacity = deque.capacity; + } - token = &src->tokens[deque_push_front (&src->deque)]; + struct lex_token *token = &src->tokens[src->front++ & (src->capacity - 1)]; token->token = (struct token) { .type = T_STOP }; token->macro_rep = NULL; token->ref_cnt = NULL; return token; } +/* Removes the current token from SRC and uninitializes it. */ static void -lex_source_pop__ (struct lex_source *src) +lex_source_pop_back (struct lex_source *src) { - lex_token_uninit (&src->tokens[deque_pop_back (&src->deque)]); + assert (src->middle - src->back > 0); + lex_token_uninit (&src->tokens[src->back++ & (src->capacity - 1)]); } +/* Removes the token at the greatest lookahead from SRC and uninitializes + it. */ static void lex_source_pop_front (struct lex_source *src) { - lex_token_uninit (&src->tokens[deque_pop_front (&src->deque)]); + assert (src->front - src->middle > 0); + lex_token_uninit (&src->tokens[--src->front & (src->capacity - 1)]); } /* Advances LEXER to the next token, consuming the current token. */ @@ -268,10 +298,10 @@ lex_get (struct lexer *lexer) if (src == NULL) return; - if (!deque_is_empty (&src->deque)) - lex_source_pop__ (src); + if (src->middle - src->back > 0) + lex_source_pop_back (src); - while (deque_is_empty (&src->deque)) + while (src->back == src->middle) if (!lex_source_get (src)) { lex_source_destroy (src); @@ -901,28 +931,30 @@ lex_next__ (const struct lexer *lexer_, int n) } } +/* Returns the token in SRC with the greatest lookahead. */ static const struct lex_token * -lex_source_front (const struct lex_source *src) +lex_source_middle (const struct lex_source *src) { - return &src->tokens[deque_front (&src->deque, 0)]; + assert (src->middle - src->back > 0); + return &src->tokens[(src->middle - 1) & (src->capacity - 1)]; } static const struct lex_token * lex_source_next__ (const struct lex_source *src, int n) { - while (deque_count (&src->deque) <= n) + while (src->middle - src->back <= n) { - if (!deque_is_empty (&src->deque)) + if (src->middle - src->back > 0) { - const struct lex_token *front = lex_source_front (src); - if (front->token.type == T_STOP || front->token.type == T_ENDCMD) - return front; + const struct lex_token *middle = lex_source_middle (src); + if (middle->token.type == T_STOP || middle->token.type == T_ENDCMD) + return middle; } lex_source_get (src); } - return &src->tokens[deque_back (&src->deque, n)]; + return &src->tokens[(src->back + n) & (src->capacity - 1)]; } /* Returns the "struct token" of the token N after the current one in LEXER. @@ -1210,6 +1242,34 @@ lex_get_file_name (const struct lexer *lexer) return src == NULL ? NULL : src->reader->file_name; } +/* Returns a newly allocated msg_location for the syntax that represents tokens + with 0-based offsets N0...N1, inclusive, from the current token. The caller + must eventually free the location (with msg_location_destroy()). */ +struct msg_location * +lex_get_location (const struct lexer *lexer, int n0, int n1) +{ + struct msg_location *loc = lex_get_lines (lexer, n0, n1); + loc->first_column = lex_get_first_column (lexer, n0); + loc->last_column = lex_get_last_column (lexer, n1); + return loc; +} + +/* Returns a newly allocated msg_location for the syntax that represents tokens + with 0-based offsets N0...N1, inclusive, from the current token. The + location only covers the tokens' lines, not the columns. The caller must + eventually free the location (with msg_location_destroy()). */ +struct msg_location * +lex_get_lines (const struct lexer *lexer, int n0, int n1) +{ + struct msg_location *loc = xmalloc (sizeof *loc); + *loc = (struct msg_location) { + .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)), + .first_line = lex_get_first_line_number (lexer, n0), + .last_line = lex_get_last_line_number (lexer, n1), + }; + return loc; +} + const char * lex_get_encoding (const struct lexer *lexer) { @@ -1264,8 +1324,10 @@ lex_interactive_reset (struct lexer *lexer) src->suppress_next_newline = false; src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter), false); - while (!deque_is_empty (&src->deque)) - lex_source_pop__ (src); + while (src->middle - src->back > 0) + lex_source_pop_back (src); + while (src->front - src->middle > 0) + lex_source_pop_front (src); lex_source_push_endcmd__ (src); } } @@ -1288,8 +1350,8 @@ lex_discard_noninteractive (struct lexer *lexer) if (src != NULL) { - while (!deque_is_empty (&src->deque)) - lex_source_pop__ (src); + while (src->middle - src->back > 0) + lex_source_pop_back (src); for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL; src = lex_source__ (lexer)) @@ -1309,7 +1371,7 @@ lex_source_max_tail__ (const struct lex_source *src) /* Use the oldest token also. (We know that src->deque cannot be empty because we are in the process of adding a new token, which is already initialized enough to use here.) */ - token = &src->tokens[deque_back (&src->deque, 0)]; + token = &src->tokens[src->back & (src->capacity - 1)]; assert (token->token_pos >= token->line_pos); max_tail = MIN (max_tail, token->line_pos); @@ -1418,8 +1480,8 @@ lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1) return ds_steal_cstr (&s); } -static void -lex_ellipsize__ (struct substring in, char *out, size_t out_size) +void +lex_ellipsize (struct substring in, char *out, size_t out_size) { size_t out_maxlen; size_t out_len; @@ -1493,14 +1555,14 @@ lex_source_error_valist (struct lex_source *src, int n0, int n1, /* Get the syntax that caused the error. */ char *syntax = lex_source_get_syntax__ (src, n0, n1); char syntax_cstr[64]; - lex_ellipsize__ (ss_cstr (syntax), syntax_cstr, sizeof syntax_cstr); + lex_ellipsize (ss_cstr (syntax), syntax_cstr, sizeof syntax_cstr); free (syntax); /* Get the macro call(s) that expanded to the syntax that caused the error. */ char call_cstr[64]; struct substring call = lex_source_get_macro_call (src, n0, n1); - lex_ellipsize__ (call, call_cstr, sizeof call_cstr); + lex_ellipsize (call, call_cstr, sizeof call_cstr); if (syntax_cstr[0]) { @@ -1526,39 +1588,56 @@ lex_source_error_valist (struct lex_source *src, int n0, int n1, if (ds_last (&s) != '.') ds_put_byte (&s, '.'); - struct msg m = { - .category = MSG_C_SYNTAX, - .severity = MSG_S_ERROR, - .file_name = src->reader->file_name, + struct msg_location *location = xmalloc (sizeof *location); + *location = (struct msg_location) { + .file_name = xstrdup_if_nonnull (src->reader->file_name), .first_line = lex_source_get_first_line_number (src, n0), .last_line = lex_source_get_last_line_number (src, n1), .first_column = lex_source_get_first_column (src, n0), .last_column = lex_source_get_last_column (src, n1), + }; + struct msg *m = xmalloc (sizeof *m); + *m = (struct msg) { + .category = MSG_C_SYNTAX, + .severity = MSG_S_ERROR, + .location = location, .text = ds_steal_cstr (&s), }; - msg_emit (&m); + msg_emit (m); } -static void PRINTF_FORMAT (2, 3) -lex_get_error (struct lex_source *src, const char *format, ...) +static void PRINTF_FORMAT (4, 5) +lex_source_error (struct lex_source *src, int n0, int n1, + const char *format, ...) { va_list args; - int n; - va_start (args, format); + lex_source_error_valist (src, n0, n1, format, args); + va_end (args); +} - n = deque_count (&src->deque) - 1; - lex_source_error_valist (src, n, n, format, args); - lex_source_pop_front (src); +static void +lex_get_error (struct lex_source *src, const char *s) +{ + size_t old_middle = src->middle; + src->middle = src->front; + size_t n = src->front - src->back - 1; + lex_source_error (src, n, n, "%s", s); + src->middle = old_middle; - va_end (args); + lex_source_pop_front (src); } -/* Attempts to append an additional token into SRC's deque, reading more from - the underlying lex_reader if necessary. Returns true if a new token was - added to SRC's deque, false otherwise. */ +/* Attempts to append an additional token at the front of SRC, reading more + from the underlying lex_reader if necessary. Returns true if a new token + was added to SRC's deque, false otherwise. The caller should retry failures + unless SRC's 'eof' marker was set to true indicating that there will be no + more tokens from this source. + + Does not make the new token available for lookahead yet; the caller must + adjust SRC's 'middle' pointer to do so. */ static bool -lex_source_try_get (struct lex_source *src) +lex_source_try_get__ (struct lex_source *src) { /* State maintained while scanning tokens. Usually we only need a single state, but scanner_push() can return SCAN_SAVE to indicate that the state @@ -1698,44 +1777,17 @@ lex_source_try_get (struct lex_source *src) return true; case SCAN_BAD_HEX_LENGTH: - lex_get_error (src, _("String of hex digits has %d characters, which " - "is not a multiple of 2"), - (int) token->token.number); - return false; - case SCAN_BAD_HEX_DIGIT: case SCAN_BAD_UNICODE_DIGIT: - lex_get_error (src, _("`%c' is not a valid hex digit"), - (int) token->token.number); - return false; - case SCAN_BAD_UNICODE_LENGTH: - lex_get_error (src, _("Unicode string contains %d bytes, which is " - "not in the valid range of 1 to 8 bytes"), - (int) token->token.number); - return false; - case SCAN_BAD_UNICODE_CODE_POINT: - lex_get_error (src, _("U+%04X is not a valid Unicode code point"), - (int) token->token.number); - return false; - case SCAN_EXPECTED_QUOTE: - lex_get_error (src, _("Unterminated string constant")); - return false; - case SCAN_EXPECTED_EXPONENT: - lex_get_error (src, _("Missing exponent following `%s'"), - token->token.string.string); - return false; - case SCAN_UNEXPECTED_CHAR: - { - char c_name[16]; - lex_get_error (src, _("Bad character %s in input"), - uc_name (token->token.number, c_name)); - return false; - } + char *msg = scan_token_to_error (&token->token); + lex_get_error (src, msg); + free (msg); + return false; case SCAN_SKIP: lex_source_pop_front (src); @@ -1745,16 +1797,19 @@ lex_source_try_get (struct lex_source *src) NOT_REACHED (); } +/* Attempts to add a new token at the front of SRC. Returns true if + successful, false on failure. On failure, the end of SRC has been reached + and no more tokens will be forthcoming from it. + + Does not make the new token available for lookahead yet; the caller must + adjust SRC's 'middle' pointer to do so. */ static bool lex_source_get__ (struct lex_source *src) { - for (;;) - { - if (src->eof) - return false; - else if (lex_source_try_get (src)) - return true; - } + while (!src->eof) + if (lex_source_try_get__ (src)) + return true; + return false; } static bool @@ -1762,60 +1817,88 @@ lex_source_get (const struct lex_source *src_) { struct lex_source *src = CONST_CAST (struct lex_source *, src_); - size_t old_count = deque_count (&src->deque); - if (!lex_source_get__ (src)) - return false; + if (src->front - src->middle == 0) + { + if (!lex_source_get__ (src)) + return false; + } if (!settings_get_mexpand ()) - return true; + { + src->middle++; + return true; + } struct macro_expander *me; - int retval = macro_expander_create (src->lexer->macros, - &lex_source_front (src)->token, - &me); - while (!retval) + int n_call = macro_expander_create ( + src->lexer->macros, &src->tokens[src->middle & (src->capacity - 1)].token, + &me); + for (int middle_ofs = 1; !n_call; middle_ofs++) { - if (!lex_source_get__ (src)) + if (src->front - src->middle <= middle_ofs && !lex_source_get__ (src)) { /* This should not be reachable because we always get a T_ENDCMD at the end of an input file (transformed from T_STOP by - lex_source_try_get()) and the macro_expander should always + lex_source_try_get__()) and the macro_expander should always terminate expansion on T_ENDCMD. */ NOT_REACHED (); } - const struct lex_token *front = lex_source_front (src); - size_t start = front->token_pos; - size_t end = front->token_pos + front->token_len; + const struct lex_token *t = &src->tokens[(src->middle + middle_ofs) + & (src->capacity - 1)]; + size_t start = t->token_pos; + size_t end = t->token_pos + t->token_len; const struct macro_token mt = { - .token = front->token, + .token = t->token, .representation = ss_buffer (&src->buffer[start - src->tail], end - start), }; - retval = macro_expander_add (me, &mt); + src->middle += middle_ofs + 1; + n_call = macro_expander_add (me, &mt); + src->middle -= middle_ofs + 1; } - if (retval < 0) + if (n_call < 0) { - /* XXX handle case where there's a macro invocation starting from some - later token we've already obtained */ + /* False alarm: no macro expansion after all. Use first token as + lookahead. We'll retry macro expansion from the second token next + time around. */ macro_expander_destroy (me); + src->middle++; return true; } - /* XXX handle case where the macro invocation doesn't use all the tokens */ - const struct lex_token *call_first = lex_source_next__ (src, old_count); - const struct lex_token *call_last = lex_source_front (src); + /* The first 'n_call' tokens starting at 'middle' will be replaced by a + macro expansion. There might be more tokens after that, up to 'front'. + + Figure out the boundary of the macro call in the syntax, to go into the + lex_tokens for the expansion so that later error messages can report what + macro was called. */ + const struct lex_token *call_first + = &src->tokens[src->middle & (src->capacity - 1)]; + const struct lex_token *call_last + = &src->tokens[(src->middle + n_call - 1) & (src->capacity - 1)]; size_t call_pos = call_first->token_pos; size_t call_len = (call_last->token_pos + call_last->token_len) - call_pos; size_t line_pos = call_first->line_pos; int first_line = call_first->first_line; - while (deque_count (&src->deque) > old_count) - lex_source_pop_front (src); + /* Destroy the tokens for the call, and save any tokens following the call so + we can add them back later. */ + for (size_t i = src->middle; i != src->middle + n_call; i++) + lex_token_uninit (&src->tokens[i & (src->capacity - 1)]); + size_t n_save = src->front - (src->middle + n_call); + struct lex_token *save_tokens = xnmalloc (n_save, sizeof *save_tokens); + for (size_t i = 0; i < n_save; i++) + save_tokens[i] = src->tokens[(src->middle + n_call + i) + & (src->capacity - 1)]; + src->front = src->middle; + + /* Now expand the macro. */ struct macro_tokens expansion = { .n = 0 }; - macro_expander_get_expansion (me, &expansion); + macro_expander_get_expansion (me, src->reader->syntax, &expansion); macro_expander_destroy (me); + /* Convert the macro expansion into syntax for possible error messages later. */ size_t *ofs = xnmalloc (expansion.n, sizeof *ofs); size_t *len = xnmalloc (expansion.n, sizeof *len); struct string s = DS_EMPTY_INITIALIZER; @@ -1825,6 +1908,7 @@ lex_source_get (const struct lex_source *src_) output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s), _("Macro Expansion"))); + /* Append the macro expansion tokens to the lookahead. */ char *macro_rep = ds_steal_cstr (&s); size_t *ref_cnt = xmalloc (sizeof *ref_cnt); *ref_cnt = expansion.n; @@ -1841,6 +1925,7 @@ lex_source_get (const struct lex_source *src_) .len = len[i], .ref_cnt = ref_cnt, }; + src->middle++; ss_dealloc (&expansion.mts[i].representation); } @@ -1848,25 +1933,32 @@ lex_source_get (const struct lex_source *src_) free (ofs); free (len); + /* Finally, put the saved tokens back. */ + for (size_t i = 0; i < n_save; i++) + *lex_push_token__ (src) = save_tokens[i]; + free (save_tokens); + return true; } static void lex_source_push_endcmd__ (struct lex_source *src) { - *lex_push_token__ (src) = (struct lex_token) { .token = { .type = T_ENDCMD } }; + assert (src->back == src->middle && src->middle == src->front); + *lex_push_token__ (src) = (struct lex_token) { + .token = { .type = T_ENDCMD } }; + src->middle++; } static struct lex_source * lex_source_create (struct lexer *lexer, struct lex_reader *reader) { - struct lex_source *src; - - src = xzalloc (sizeof *src); - src->reader = reader; - src->segmenter = segmenter_init (reader->syntax, false); - src->lexer = lexer; - src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens); + struct lex_source *src = xmalloc (sizeof *src); + *src = (struct lex_source) { + .reader = reader, + .segmenter = segmenter_init (reader->syntax, false), + .lexer = lexer, + }; lex_source_push_endcmd__ (src); @@ -1883,8 +1975,10 @@ lex_source_destroy (struct lex_source *src) free (file_name); free (encoding); free (src->buffer); - while (!deque_is_empty (&src->deque)) - lex_source_pop__ (src); + while (src->middle - src->back > 0) + lex_source_pop_back (src); + while (src->front - src->middle > 0) + lex_source_pop_front (src); free (src->tokens); ll_remove (&src->ll); free (src);