X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Flexer%2Flexer.c;h=805d6fa4965829187087a1574553986f55105707;hb=0fde6afee3c995bf264c24c438f43eeb58b859b5;hp=bad24d3f9baeca4e4be9e5c32c95a3a9c509d130;hpb=61cb03a73ff9f5d38e9728d4bf5a449212d3acdc;p=pspp diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c index bad24d3f9b..805d6fa496 100644 --- a/src/language/lexer/lexer.c +++ b/src/language/lexer/lexer.c @@ -38,6 +38,7 @@ #include "libpspp/cast.h" #include "libpspp/deque.h" #include "libpspp/i18n.h" +#include "libpspp/intern.h" #include "libpspp/ll.h" #include "libpspp/message.h" #include "libpspp/misc.h" @@ -65,12 +66,9 @@ struct lex_token location of the token in terms of the lex_source's buffer. For a token produced through macro expansion, this is the entire macro - call. - - src->tail <= line_pos <= token_pos <= src->head. */ - size_t token_pos; /* Start of token. */ + call. */ + size_t token_pos; /* Offset into src->buffer of token start. */ size_t token_len; /* Length of source for token in bytes. */ - size_t line_pos; /* Start of line containing token_pos. */ int first_line; /* Line number at token_pos. */ /* For a token obtained through macro expansion, this is just this token. @@ -113,7 +111,6 @@ static void lex_stage_uninit (struct lex_stage *); static size_t lex_stage_count (const struct lex_stage *); static bool lex_stage_is_empty (const struct lex_stage *); -static struct lex_token *lex_stage_last (struct lex_stage *); static struct lex_token *lex_stage_first (struct lex_stage *); static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs); @@ -153,14 +150,6 @@ lex_stage_count (const struct lex_stage *stage) return deque_count (&stage->deque); } -/* Returns the last token in STAGE, which must be nonempty. The last token is - the one accessed with the greatest lookahead. */ -static struct lex_token * -lex_stage_last (struct lex_stage *stage) -{ - return stage->tokens[deque_front (&stage->deque, 0)]; -} - /* Returns the first token in STAGE, which must be nonempty. The first token is the one accessed with the least lookahead. */ static struct lex_token * @@ -188,11 +177,18 @@ lex_stage_push_last (struct lex_stage *stage, struct lex_token *token) stage->tokens[deque_push_front (&stage->deque)] = token; } +/* Removes and returns the first token from STAGE. */ +static struct lex_token * +lex_stage_take_first (struct lex_stage *stage) +{ + return stage->tokens[deque_pop_back (&stage->deque)]; +} + /* Removes the first token from STAGE and uninitializes it. */ static void lex_stage_pop_first (struct lex_stage *stage) { - lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]); + lex_token_destroy (lex_stage_take_first (stage)); } /* Removes the first N tokens from SRC, appending them to DST as the last @@ -201,10 +197,7 @@ static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n) { for (size_t i = 0; i < n; i++) - { - lex_stage_push_last (dst, lex_stage_first (src)); - deque_pop_back (&src->deque); - } + lex_stage_push_last (dst, lex_stage_take_first (src)); } /* A source of tokens, corresponding to a syntax file. @@ -220,15 +213,13 @@ struct lex_source bool eof; /* True if T_STOP was read from 'reader'. */ /* Buffer of UTF-8 bytes. */ - char *buffer; + char *buffer; /* Source file contents. */ + size_t length; /* Number of bytes filled. */ size_t allocated; /* Number of bytes allocated. */ - size_t tail; /* &buffer[0] offset into UTF-8 source. 
*/ - size_t head; /* &buffer[head - tail] offset into source. */ - /* Positions in source file, tail <= pos <= head for each member here. */ + /* Offsets into 'buffer'. */ size_t journal_pos; /* First byte not yet output to journal. */ size_t seg_pos; /* First byte not yet scanned as token. */ - size_t line_pos; /* First byte of line containing seg_pos. */ int n_newlines; /* Number of new-lines up to seg_pos. */ bool suppress_next_newline; @@ -245,12 +236,17 @@ struct lex_source in 'merge'. - merge: Tokens that need to pass through scan_merge() to end up in - 'lookahead'. + 'parse'. + + - parse: Tokens available to the client for parsing. - - lookahead: Tokens available to the client for parsing. */ + 'pp' and 'merge' store tokens only temporarily until they pass into + 'parse'. Tokens then live in 'parse' until the command is fully + consumed, at which time they are freed together. */ struct lex_stage pp; struct lex_stage merge; - struct lex_stage lookahead; + struct lex_token **parse; + size_t n_parse, allocated_parse, parse_ofs; }; static struct lex_source *lex_source_create (struct lexer *, @@ -269,8 +265,10 @@ static char *lex_source_get_syntax__ (const struct lex_source *, int n0, int n1); static const struct lex_token *lex_next__ (const struct lexer *, int n); static void lex_source_push_endcmd__ (struct lex_source *); +static void lex_source_push_parse (struct lex_source *, struct lex_token *); +static void lex_source_clear_parse (struct lex_source *); -static bool lex_source_get_lookahead (struct lex_source *); +static bool lex_source_get_parse (struct lex_source *); static void lex_source_error_valist (struct lex_source *, int n0, int n1, const char *format, va_list) PRINTF_FORMAT (4, 0); @@ -366,11 +364,16 @@ lex_get (struct lexer *lexer) if (src == NULL) return; - if (!lex_stage_is_empty (&src->lookahead)) - lex_stage_pop_first (&src->lookahead); + if (src->parse_ofs < src->n_parse) + { + if (src->parse[src->parse_ofs]->token.type == T_ENDCMD) + lex_source_clear_parse (src); + else + src->parse_ofs++; + } - while (lex_stage_is_empty (&src->lookahead)) - if (!lex_source_get_lookahead (src)) + while (src->parse_ofs == src->n_parse) + if (!lex_source_get_parse (src)) { lex_source_destroy (src); src = lex_source__ (lexer); @@ -1025,19 +1028,32 @@ static const struct lex_token * lex_source_next__ (const struct lex_source *src_, int n) { struct lex_source *src = CONST_CAST (struct lex_source *, src_); - while (lex_stage_count (&src->lookahead) <= n) + + if (n < 0) + { + if (-n <= src->parse_ofs) + return src->parse[src->parse_ofs - (-n)]; + else + { + static const struct lex_token endcmd_token + = { .token = { .type = T_ENDCMD } }; + return &endcmd_token; + } + } + + while (src->n_parse - src->parse_ofs <= n) { - if (!lex_stage_is_empty (&src->lookahead)) + if (src->n_parse > 0) { - const struct lex_token *t = lex_stage_last (&src->lookahead); + const struct lex_token *t = src->parse[src->n_parse - 1]; if (t->token.type == T_STOP || t->token.type == T_ENDCMD) return t; } - lex_source_get_lookahead (src); + lex_source_get_parse (src); } - return lex_stage_nth (&src->lookahead, n); + return src->parse[src->parse_ofs + n]; } /* Returns the "struct token" of the token N after the current one in LEXER. 
@@ -1213,31 +1229,31 @@ lex_token_get_last_line_number (const struct lex_source *src, return 0; else { - char *token_str = &src->buffer[token->token_pos - src->tail]; + char *token_str = &src->buffer[token->token_pos]; return token->first_line + count_newlines (token_str, token->token_len) + 1; } } +static int +lex_token_get_column__ (const struct lex_source *src, size_t offset) +{ + const char *newline = memrchr (src->buffer, '\n', offset); + size_t line_ofs = newline ? newline - src->buffer + 1 : 0; + return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1; +} + static int lex_token_get_first_column (const struct lex_source *src, const struct lex_token *token) { - return utf8_count_columns (&src->buffer[token->line_pos - src->tail], - token->token_pos - token->line_pos) + 1; + return lex_token_get_column__ (src, token->token_pos); } static int lex_token_get_last_column (const struct lex_source *src, const struct lex_token *token) { - char *start, *end, *newline; - - start = &src->buffer[token->line_pos - src->tail]; - end = &src->buffer[(token->token_pos + token->token_len) - src->tail]; - newline = memrchr (start, '\n', end - start); - if (newline != NULL) - start = newline + 1; - return utf8_count_columns (start, end - start) + 1; + return lex_token_get_column__ (src, token->token_pos + token->token_len); } static struct msg_location @@ -1246,7 +1262,7 @@ lex_token_location (const struct lex_source *src, const struct lex_token *t1) { return (struct msg_location) { - .file_name = src->reader->file_name, + .file_name = intern_new_if_nonnull (src->reader->file_name), .first_line = t0->first_line, .last_line = lex_token_get_last_line_number (src, t1), .first_column = lex_token_get_first_column (src, t0), @@ -1362,7 +1378,7 @@ lex_get_lines (const struct lexer *lexer, int n0, int n1) { struct msg_location *loc = xmalloc (sizeof *loc); *loc = (struct msg_location) { - .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)), + .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)), .first_line = lex_get_first_line_number (lexer, n0), .last_line = lex_get_last_line_number (lexer, n1), }; @@ -1417,15 +1433,15 @@ lex_interactive_reset (struct lexer *lexer) struct lex_source *src = lex_source__ (lexer); if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL) { - src->head = src->tail = 0; - src->journal_pos = src->seg_pos = src->line_pos = 0; + src->length = 0; + src->journal_pos = src->seg_pos = 0; src->n_newlines = 0; src->suppress_next_newline = false; src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter), false); lex_stage_clear (&src->pp); lex_stage_clear (&src->merge); - lex_stage_clear (&src->lookahead); + lex_source_clear_parse (src); lex_source_push_endcmd__ (src); } } @@ -1450,7 +1466,7 @@ lex_discard_noninteractive (struct lexer *lexer) { lex_stage_clear (&src->pp); lex_stage_clear (&src->merge); - lex_stage_clear (&src->lookahead); + lex_source_clear_parse (src); for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL; src = lex_source__ (lexer)) @@ -1458,50 +1474,11 @@ lex_discard_noninteractive (struct lexer *lexer) } } -static size_t -lex_source_max_tail__ (const struct lex_source *src_) -{ - struct lex_source *src = CONST_CAST (struct lex_source *, src_); - - assert (src->seg_pos >= src->line_pos); - size_t max_tail = MIN (src->journal_pos, src->line_pos); - - /* Use the oldest token also. 
*/ - struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp }; - for (size_t i = 0; i < sizeof stages / sizeof *stages; i++) - if (!lex_stage_is_empty (stages[i])) - { - struct lex_token *first = lex_stage_first (stages[i]); - assert (first->token_pos >= first->line_pos); - return MIN (max_tail, first->line_pos); - } - - return max_tail; -} - static void lex_source_expand__ (struct lex_source *src) { - if (src->head - src->tail >= src->allocated) - { - size_t max_tail = lex_source_max_tail__ (src); - if (max_tail > src->tail) - { - /* Advance the tail, freeing up room at the head. */ - memmove (src->buffer, src->buffer + (max_tail - src->tail), - src->head - max_tail); - src->tail = max_tail; - } - else - { - /* Buffer is completely full. Expand it. */ - src->buffer = x2realloc (src->buffer, &src->allocated); - } - } - else - { - /* There's space available at the head of the buffer. Nothing to do. */ - } + if (src->length >= src->allocated) + src->buffer = x2realloc (src->buffer, &src->allocated); } static void @@ -1511,10 +1488,10 @@ lex_source_read__ (struct lex_source *src) { lex_source_expand__ (src); - size_t head_ofs = src->head - src->tail; - size_t space = src->allocated - head_ofs; + size_t space = src->allocated - src->length; enum prompt_style prompt = segmenter_get_prompt (&src->segmenter); - size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs], + size_t n = src->reader->class->read (src->reader, + &src->buffer[src->length], space, prompt); assert (n <= space); @@ -1522,14 +1499,13 @@ lex_source_read__ (struct lex_source *src) { /* End of input. */ src->reader->eof = true; - lex_source_expand__ (src); return; } - src->head += n; + src->length += n; } - while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n', - src->head - src->seg_pos)); + while (!memchr (&src->buffer[src->seg_pos], '\n', + src->length - src->seg_pos)); } static struct lex_source * @@ -1573,8 +1549,7 @@ lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1) { size_t start = first->token_pos; size_t end = last->token_pos + last->token_len; - ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail], - end - start)); + ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start)); } else { @@ -1618,7 +1593,7 @@ lex_source_get_macro_call (struct lex_source *src, int n0, int n1) size_t start = token0->token_pos; size_t end = token1->token_pos + token1->token_len; - return ss_buffer (&src->buffer[start - src->tail], end - start); + return ss_buffer (&src->buffer[start], end - start); } static void @@ -1688,8 +1663,7 @@ static void lex_get_error (struct lex_source *src, const struct lex_token *token) { char syntax[64]; - str_ellipsize (ss_buffer (&src->buffer[token->token_pos - src->tail], - token->token_len), + str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len), syntax, sizeof syntax); struct string s = DS_EMPTY_INITIALIZER; @@ -1719,7 +1693,6 @@ lex_source_try_get_pp (struct lex_source *src) token->token = (struct token) { .type = T_STOP }; token->macro_rep = NULL; token->ref_cnt = NULL; - token->line_pos = src->line_pos; token->token_pos = src->seg_pos; if (src->reader->line_number > 0) token->first_line = src->reader->line_number + src->n_newlines; @@ -1732,9 +1705,9 @@ lex_source_try_get_pp (struct lex_source *src) int seg_len; for (;;) { - segment = &src->buffer[src->seg_pos - src->tail]; + segment = &src->buffer[src->seg_pos]; seg_len = segmenter_push (&src->segmenter, segment, - src->head - src->seg_pos, + 
src->length - src->seg_pos, src->reader->eof, &seg_type); if (seg_len >= 0) break; @@ -1748,10 +1721,7 @@ lex_source_try_get_pp (struct lex_source *src) token->token_len = seg_len; src->seg_pos += seg_len; if (seg_type == SEG_NEWLINE) - { - src->line_pos = src->seg_pos; - src->n_newlines++; - } + src->n_newlines++; /* Get a token from the segment. */ enum tokenize_result result = token_from_segment ( @@ -1773,15 +1743,15 @@ lex_source_try_get_pp (struct lex_source *src) for (int i = 0; i < n_lines; i++) { /* Beginning of line. */ - const char *line = &src->buffer[src->journal_pos - src->tail]; + const char *line = &src->buffer[src->journal_pos]; /* Calculate line length, including \n or \r\n end-of-line if present. - We use src->head even though that may be beyond what we've actually - converted to tokens (which is only through line_pos). That's because, - if we're emitting the line due to SEG_END_COMMAND, we want to take the - whole line through the newline, not just through the '.'. */ - size_t max_len = src->head - src->journal_pos; + We use src->length even though that may be beyond what we've actually + converted to tokens. That's because, if we're emitting the line due + to SEG_END_COMMAND, we want to take the whole line through the + newline, not just through the '.'. */ + size_t max_len = src->length - src->journal_pos; const char *newline = memchr (line, '\n', max_len); size_t line_len = newline ? newline - line + 1 : max_len; @@ -1873,7 +1843,7 @@ lex_source_try_get_merge (const struct lex_source *src_) size_t end = t->token_pos + t->token_len; const struct macro_token mt = { .token = t->token, - .syntax = ss_buffer (&src->buffer[start - src->tail], end - start), + .syntax = ss_buffer (&src->buffer[start], end - start), }; const struct msg_location loc = lex_token_location (src, t, t); n_call = macro_call_add (mc, &mt, &loc); @@ -1922,7 +1892,6 @@ lex_source_try_get_merge (const struct lex_source *src_) .token = expansion.mts[i].token, .token_pos = c0->token_pos, .token_len = (c1->token_pos + c1->token_len) - c0->token_pos, - .line_pos = c0->line_pos, .first_line = c0->first_line, .macro_rep = macro_rep, .ofs = ofs[i], @@ -1965,7 +1934,7 @@ lex_source_get_merge (struct lex_source *src) Returns true if successful, false on failure. In the latter case, SRC is exhausted and 'src->eof' is now true. 
*/ static bool -lex_source_get_lookahead (struct lex_source *src) +lex_source_get_parse (struct lex_source *src) { struct merger m = MERGER_INIT; struct token out; @@ -1984,7 +1953,7 @@ lex_source_get_lookahead (struct lex_source *src) &out); if (!retval) { - lex_stage_shift (&src->lookahead, &src->merge, 1); + lex_source_push_parse (src, lex_stage_take_first (&src->merge)); return true; } else if (retval > 0) @@ -1999,7 +1968,6 @@ lex_source_get_lookahead (struct lex_source *src) .token = out, .token_pos = first->token_pos, .token_len = (last->token_pos - first->token_pos) + last->token_len, - .line_pos = first->line_pos, .first_line = first->first_line, /* This works well if all the tokens were not expanded from macros, @@ -2012,7 +1980,7 @@ lex_source_get_lookahead (struct lex_source *src) }; if (t->ref_cnt) ++*t->ref_cnt; - lex_stage_push_last (&src->lookahead, t); + lex_source_push_parse (src, t); for (int i = 0; i < retval; i++) lex_stage_pop_first (&src->merge); @@ -2024,10 +1992,28 @@ lex_source_get_lookahead (struct lex_source *src) static void lex_source_push_endcmd__ (struct lex_source *src) { - assert (lex_stage_is_empty (&src->lookahead)); + assert (src->n_parse == 0); + struct lex_token *token = xmalloc (sizeof *token); *token = (struct lex_token) { .token = { .type = T_ENDCMD } }; - lex_stage_push_last (&src->lookahead, token); + lex_source_push_parse (src, token); +} + +static void +lex_source_push_parse (struct lex_source *src, struct lex_token *token) +{ + if (src->n_parse >= src->allocated_parse) + src->parse = x2nrealloc (src->parse, &src->allocated_parse, + sizeof *src->parse); + src->parse[src->n_parse++] = token; +} + +static void +lex_source_clear_parse (struct lex_source *src) +{ + for (size_t i = 0; i < src->n_parse; i++) + lex_token_destroy (src->parse[i]); + src->n_parse = src->parse_ofs = 0; } static struct lex_source * @@ -2057,7 +2043,8 @@ lex_source_destroy (struct lex_source *src) free (src->buffer); lex_stage_uninit (&src->pp); lex_stage_uninit (&src->merge); - lex_stage_uninit (&src->lookahead); + lex_source_clear_parse (src); + free (src->parse); ll_remove (&src->ll); free (src); }
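
The core of this change replaces the deque-based "lookahead" stage with a flat "parse" array plus an offset cursor, so every token of the current command stays in memory until its T_ENDCMD is consumed. The stand-alone sketch below is not part of the patch: it reduces tokens to an enum, uses plain realloc() in place of gnulib's x2nrealloc(), and invents its own names (parse_array, push_parse, peek, advance), but it follows the same pattern as lex_source_push_parse(), lex_source_clear_parse(), lex_source_next__(), and lex_get().

    /* Illustrative sketch (not PSPP code): a growable "parse" array with an
       offset cursor, mirroring how lex_source now keeps every token of the
       current command until T_ENDCMD is consumed.  Names and the plain
       realloc() growth are simplifications of the real lex_source members
       and gnulib's x2nrealloc(). */
    #include <stdio.h>
    #include <stdlib.h>

    enum token_type { T_ID, T_NUMBER, T_ENDCMD, T_STOP };

    struct parse_array
      {
        enum token_type *tokens;    /* All tokens of the current command. */
        size_t n, allocated;        /* Used and allocated token counts. */
        size_t ofs;                 /* Index of the "current" token. */
      };

    /* Appends TYPE to PA, growing the array as needed
       (cf. lex_source_push_parse). */
    static void
    push_parse (struct parse_array *pa, enum token_type type)
    {
      if (pa->n >= pa->allocated)
        {
          pa->allocated = pa->allocated ? 2 * pa->allocated : 8;
          pa->tokens = realloc (pa->tokens, pa->allocated * sizeof *pa->tokens);
          if (pa->tokens == NULL)
            abort ();               /* x2nrealloc() in the real code never returns NULL. */
        }
      pa->tokens[pa->n++] = type;
    }

    /* Releases all tokens of the finished command (cf. lex_source_clear_parse). */
    static void
    clear_parse (struct parse_array *pa)
    {
      pa->n = pa->ofs = 0;
    }

    /* Lookahead relative to the current token.  Negative N looks back at
       tokens already consumed within the same command; out-of-range lookback
       yields T_ENDCMD, as in the new lex_source_next__().  Positive N past
       the buffered tokens yields T_STOP here, where the real lexer would
       read more input. */
    static enum token_type
    peek (const struct parse_array *pa, int n)
    {
      if (n < 0)
        return (size_t) -n <= pa->ofs ? pa->tokens[pa->ofs - (size_t) -n] : T_ENDCMD;
      return pa->ofs + (size_t) n < pa->n ? pa->tokens[pa->ofs + n] : T_STOP;
    }

    /* Advances past the current token, releasing the whole command once its
       terminating T_ENDCMD is consumed (cf. lex_get). */
    static void
    advance (struct parse_array *pa)
    {
      if (pa->ofs < pa->n)
        {
          if (pa->tokens[pa->ofs] == T_ENDCMD)
            clear_parse (pa);
          else
            pa->ofs++;
        }
    }

    int
    main (void)
    {
      struct parse_array pa = { 0 };
      push_parse (&pa, T_ID);
      push_parse (&pa, T_NUMBER);
      push_parse (&pa, T_ENDCMD);

      advance (&pa);   /* Consume T_ID; it stays in the array. */
      printf ("current=%d lookback=%d\n", peek (&pa, 0), peek (&pa, -1));
      advance (&pa);   /* Consume T_NUMBER. */
      advance (&pa);   /* Consume T_ENDCMD: the whole command is released. */
      printf ("n=%zu ofs=%zu\n", pa.n, pa.ofs);

      free (pa.tokens);
      return 0;
    }

Keeping consumed tokens around until the command terminator is itself consumed is what lets the new lex_source_next__() accept negative lookahead offsets into the current command, with everything then freed in one sweep by lex_source_clear_parse().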
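
The patch also stops tracking line_pos per token and per source: lex_token_get_column__() now derives a column on demand by searching backward from a byte offset for the preceding newline with memrchr() and counting display columns with utf8_count_columns(). The sketch below illustrates only that idea and is not PSPP code; it scans backward by hand and counts bytes rather than display columns, which is correct only for ASCII input.

    /* Illustrative sketch (not PSPP code): derive a 1-based column from a
       byte offset by locating the start of its line on demand, instead of
       carrying a separate line_pos with every token. */
    #include <stdio.h>
    #include <string.h>

    static int
    column_at_offset (const char *buffer, size_t offset)
    {
      size_t line_ofs = offset;
      while (line_ofs > 0 && buffer[line_ofs - 1] != '\n')
        line_ofs--;                 /* Find the start of the line containing OFFSET. */
      return (int) (offset - line_ofs) + 1;
    }

    int
    main (void)
    {
      const char buffer[] = "DATA LIST LIST.\nBEGIN DATA.\n";
      size_t token_pos = strcspn (buffer, "\n") + 1 + 6;   /* "DATA." on line 2. */
      printf ("column %d\n", column_at_offset (buffer, token_pos));   /* Prints 7. */
      return 0;
    }

Trading the cached line_pos for an on-demand backward scan is what allows the patch to delete line_pos from both struct lex_token and struct lex_source.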