X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Flexer%2Flexer.c;h=7d40ce1cd50a39ad70baabc9547855d74d5caada;hb=39df27f80745cf9622ac5e916a098c17961c2585;hp=34b697efe2f5593b071ddd322c88a3b9fd4901f9;hpb=29917c4f5908454803e663d2ad78bca4bc35e805;p=pspp diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c index 34b697efe2..7d40ce1cd5 100644 --- a/src/language/lexer/lexer.c +++ b/src/language/lexer/lexer.c @@ -28,9 +28,9 @@ #include #include #include -#include #include "language/command.h" +#include "language/lexer/macro.h" #include "language/lexer/scan.h" #include "language/lexer/segment.h" #include "language/lexer/token.h" @@ -38,6 +38,7 @@ #include "libpspp/cast.h" #include "libpspp/deque.h" #include "libpspp/i18n.h" +#include "libpspp/intern.h" #include "libpspp/ll.h" #include "libpspp/message.h" #include "libpspp/misc.h" @@ -61,14 +62,155 @@ struct lex_token /* The regular token information. */ struct token token; - /* Location of token in terms of the lex_source's buffer. - src->tail <= line_pos <= token_pos <= src->head. */ - size_t token_pos; /* Start of token. */ + /* For a token obtained through the lexer in an ordinary way, this is the + location of the token in terms of the lex_source's buffer. + + For a token produced through macro expansion, this is the entire macro + call. */ + size_t token_pos; /* Offset into src->buffer of token start. */ size_t token_len; /* Length of source for token in bytes. */ - size_t line_pos; /* Start of line containing token_pos. */ - int first_line; /* Line number at token_pos. */ + + /* For a token obtained through macro expansion, this is just this token. + + For a token obtained through the lexer in an ordinary way, these are + nulls and zeros. */ + char *macro_rep; /* The whole macro expansion. */ + size_t ofs; /* Offset of this token in macro_rep. */ + size_t len; /* Length of this token in macro_rep. 
*/ + size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */ + }; + +static struct msg_point lex_token_start_point (const struct lex_source *, + const struct lex_token *); +static struct msg_point lex_token_end_point (const struct lex_source *, + const struct lex_token *); + +/* Source offset of the last byte in TOKEN. */ +static size_t +lex_token_end (const struct lex_token *token) +{ + return token->token_pos + MAX (token->token_len, 1) - 1; +} + +static void +lex_token_destroy (struct lex_token *t) +{ + token_uninit (&t->token); + if (t->ref_cnt) + { + assert (*t->ref_cnt > 0); + if (!--*t->ref_cnt) + { + free (t->macro_rep); + free (t->ref_cnt); + } + } + free (t); +} + +/* A deque of lex_tokens that comprises one stage in the token pipeline in a + lex_source. */ +struct lex_stage + { + struct deque deque; + struct lex_token **tokens; }; +static void lex_stage_clear (struct lex_stage *); +static void lex_stage_uninit (struct lex_stage *); + +static size_t lex_stage_count (const struct lex_stage *); +static bool lex_stage_is_empty (const struct lex_stage *); + +static struct lex_token *lex_stage_first (struct lex_stage *); +static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs); + +static void lex_stage_push_last (struct lex_stage *, struct lex_token *); +static void lex_stage_pop_first (struct lex_stage *); + +static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, + size_t n); + +/* Deletes all the tokens from STAGE. */ +static void +lex_stage_clear (struct lex_stage *stage) +{ + while (!deque_is_empty (&stage->deque)) + lex_stage_pop_first (stage); +} + +/* Deletes all the tokens from STAGE and frees storage for the deque. */ +static void +lex_stage_uninit (struct lex_stage *stage) +{ + lex_stage_clear (stage); + free (stage->tokens); +} + +/* Returns true if STAGE contains no tokens, otherwise false. 
*/ +static bool +lex_stage_is_empty (const struct lex_stage *stage) +{ + return deque_is_empty (&stage->deque); +} + +/* Returns the number of tokens in STAGE. */ +static size_t +lex_stage_count (const struct lex_stage *stage) +{ + return deque_count (&stage->deque); +} + +/* Returns the first token in STAGE, which must be nonempty. + The first token is the one accessed with the least lookahead. */ +static struct lex_token * +lex_stage_first (struct lex_stage *stage) +{ + return lex_stage_nth (stage, 0); +} + +/* Returns the token the given INDEX in STAGE. The first token (with the least + lookahead) is 0, the second token is 1, and so on. There must be at least + INDEX + 1 tokens in STAGE. */ +static struct lex_token * +lex_stage_nth (struct lex_stage *stage, size_t index) +{ + return stage->tokens[deque_back (&stage->deque, index)]; +} + +/* Adds TOKEN so that it becomes the last token in STAGE. */ +static void +lex_stage_push_last (struct lex_stage *stage, struct lex_token *token) +{ + if (deque_is_full (&stage->deque)) + stage->tokens = deque_expand (&stage->deque, stage->tokens, + sizeof *stage->tokens); + stage->tokens[deque_push_front (&stage->deque)] = token; +} + +/* Removes and returns the first token from STAGE. */ +static struct lex_token * +lex_stage_take_first (struct lex_stage *stage) +{ + return stage->tokens[deque_pop_back (&stage->deque)]; +} + +/* Removes the first token from STAGE and uninitializes it. */ +static void +lex_stage_pop_first (struct lex_stage *stage) +{ + lex_token_destroy (lex_stage_take_first (stage)); +} + +/* Removes the first N tokens from SRC, appending them to DST as the last + tokens. */ +static void +lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n) +{ + for (size_t i = 0; i < n; i++) + lex_stage_push_last (dst, lex_stage_take_first (src)); +} + /* A source of tokens, corresponding to a syntax file. 
This is conceptually a lex_reader wrapped with everything needed to convert @@ -76,44 +218,78 @@ struct lex_token struct lex_source { struct ll ll; /* In lexer's list of sources. */ + + /* Reference count: + + - One for struct lexer. + + - One for each struct msg_location that references this source. */ + size_t n_refs; + struct lex_reader *reader; + struct lexer *lexer; struct segmenter segmenter; bool eof; /* True if T_STOP was read from 'reader'. */ /* Buffer of UTF-8 bytes. */ - char *buffer; + char *buffer; /* Source file contents. */ + size_t length; /* Number of bytes filled. */ size_t allocated; /* Number of bytes allocated. */ - size_t tail; /* &buffer[0] offset into UTF-8 source. */ - size_t head; /* &buffer[head - tail] offset into source. */ - /* Positions in source file, tail <= pos <= head for each member here. */ + /* Offsets into 'buffer'. */ size_t journal_pos; /* First byte not yet output to journal. */ size_t seg_pos; /* First byte not yet scanned as token. */ - size_t line_pos; /* First byte of line containing seg_pos. */ - int n_newlines; /* Number of new-lines up to seg_pos. */ + /* Offset into 'buffer' of starts of lines. */ + size_t *lines; + size_t n_lines, allocated_lines; + bool suppress_next_newline; - /* Tokens. */ - struct deque deque; /* Indexes into 'tokens'. */ - struct lex_token *tokens; /* Lookahead tokens for parser. */ + /* Tokens. + + This is a pipeline with the following stages. Each token eventually + made available to the parser passes through of these stages. The stages + are named after the processing that happens in each one. + + Initially, tokens come from the segmenter and scanner to 'pp': + + - pp: Tokens that need to pass through the macro preprocessor to end up + in 'merge'. + + - merge: Tokens that need to pass through scan_merge() to end up in + 'parse'. + + - parse: Tokens available to the client for parsing. + + 'pp' and 'merge' store tokens only temporarily until they pass into + 'parse'. 
Tokens then live in 'parse' until the command is fully + consumed, at which time they are freed together. */ + struct lex_stage pp; + struct lex_stage merge; + struct lex_token **parse; + size_t n_parse, allocated_parse, parse_ofs; }; -static struct lex_source *lex_source_create (struct lex_reader *); -static void lex_source_destroy (struct lex_source *); +static struct lex_source *lex_source_create (struct lexer *, + struct lex_reader *); /* Lexer. */ struct lexer { struct ll_list sources; /* Contains "struct lex_source"s. */ + struct macro_set *macros; }; static struct lex_source *lex_source__ (const struct lexer *); +static char *lex_source_syntax__ (const struct lex_source *, + int ofs0, int ofs1); static const struct lex_token *lex_next__ (const struct lexer *, int n); static void lex_source_push_endcmd__ (struct lex_source *); +static void lex_source_push_parse (struct lex_source *, struct lex_token *); +static void lex_source_clear_parse (struct lex_source *); -static void lex_source_pop__ (struct lex_source *); -static bool lex_source_get__ (const struct lex_source *); +static bool lex_source_get_parse (struct lex_source *); static void lex_source_error_valist (struct lex_source *, int n0, int n1, const char *format, va_list) PRINTF_FORMAT (4, 0); @@ -127,7 +303,7 @@ lex_reader_init (struct lex_reader *reader, const struct lex_reader_class *class) { reader->class = class; - reader->syntax = LEX_SYNTAX_AUTO; + reader->syntax = SEG_MODE_AUTO; reader->error = LEX_ERROR_CONTINUE; reader->file_name = NULL; reader->encoding = NULL; @@ -148,8 +324,11 @@ lex_reader_set_file_name (struct lex_reader *reader, const char *file_name) struct lexer * lex_create (void) { - struct lexer *lexer = xzalloc (sizeof *lexer); - ll_init (&lexer->sources); + struct lexer *lexer = xmalloc (sizeof *lexer); + *lexer = (struct lexer) { + .sources = LL_INITIALIZER (lexer->sources), + .macros = macro_set_create (), + }; return lexer; } @@ -162,11 +341,23 @@ lex_destroy (struct lexer 
*lexer) struct lex_source *source, *next; ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources) - lex_source_destroy (source); + { + ll_remove (&source->ll); + lex_source_unref (source); + } + macro_set_destroy (lexer->macros); free (lexer); } } +/* Adds M to LEXER's set of macros. M replaces any existing macro with the + same name. Takes ownership of M. */ +void +lex_define_macro (struct lexer *lexer, struct macro *m) +{ + macro_set_add (lexer->macros, m); +} + /* Inserts READER into LEXER so that the next token read by LEXER comes from READER. Before the caller, LEXER must either be empty or at a T_ENDCMD token. */ @@ -174,7 +365,7 @@ void lex_include (struct lexer *lexer, struct lex_reader *reader) { assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD); - ll_push_head (&lexer->sources, &lex_source_create (reader)->ll); + ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll); } /* Appends READER to LEXER, so that it will be read after all other current @@ -182,36 +373,11 @@ lex_include (struct lexer *lexer, struct lex_reader *reader) void lex_append (struct lexer *lexer, struct lex_reader *reader) { - ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll); + ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll); } /* Advancing. */ -static struct lex_token * -lex_push_token__ (struct lex_source *src) -{ - struct lex_token *token; - - if (deque_is_full (&src->deque)) - src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens); - - token = &src->tokens[deque_push_front (&src->deque)]; - token_init (&token->token); - return token; -} - -static void -lex_source_pop__ (struct lex_source *src) -{ - token_destroy (&src->tokens[deque_pop_back (&src->deque)].token); -} - -static void -lex_source_pop_front (struct lex_source *src) -{ - token_destroy (&src->tokens[deque_pop_front (&src->deque)].token); -} - /* Advances LEXER to the next token, consuming the current token. 
*/ void lex_get (struct lexer *lexer) @@ -222,18 +388,32 @@ lex_get (struct lexer *lexer) if (src == NULL) return; - if (!deque_is_empty (&src->deque)) - lex_source_pop__ (src); + if (src->parse_ofs < src->n_parse) + { + if (src->parse[src->parse_ofs]->token.type == T_ENDCMD) + lex_source_clear_parse (src); + else + src->parse_ofs++; + } - while (deque_is_empty (&src->deque)) - if (!lex_source_get__ (src)) + while (src->parse_ofs == src->n_parse) + if (!lex_source_get_parse (src)) { - lex_source_destroy (src); + ll_remove (&src->ll); + lex_source_unref (src); src = lex_source__ (lexer); if (src == NULL) return; } } + +/* Advances LEXER by N tokens. */ +void +lex_get_n (struct lexer *lexer, size_t n) +{ + while (n-- > 0) + lex_get (lexer); +} /* Issuing errors. */ @@ -269,23 +449,40 @@ lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...) va_end (args); } -/* Prints a syntax error message saying that OPTION0 or one of the other - strings following it, up to the first NULL, is expected. */ +/* Prints a syntax error message saying that one of the strings provided as + varargs, up to the first NULL, is expected. */ void -(lex_error_expecting) (struct lexer *lexer, const char *option0, ...) +(lex_error_expecting) (struct lexer *lexer, ...) { - enum { MAX_OPTIONS = 8 }; - const char *options[MAX_OPTIONS + 1]; va_list args; - int n; - va_start (args, option0); - options[0] = option0; - n = 0; - while (n + 1 < MAX_OPTIONS && options[n] != NULL) - options[++n] = va_arg (args, const char *); + va_start (args, lexer); + lex_error_expecting_valist (lexer, args); va_end (args); +} + +/* Prints a syntax error message saying that one of the options provided in + ARGS, up to the first NULL, is expected. 
*/ +void +lex_error_expecting_valist (struct lexer *lexer, va_list args) +{ + enum { MAX_OPTIONS = 9 }; + const char *options[MAX_OPTIONS]; + int n = 0; + while (n < MAX_OPTIONS) + { + const char *option = va_arg (args, const char *); + if (!option) + break; + + options[n++] = option; + } + lex_error_expecting_array (lexer, options, n); +} +void +lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n) +{ switch (n) { case 0: @@ -333,8 +530,14 @@ void options[5], options[6], options[7]); break; + case 9: + lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, %s, or %s"), + options[0], options[1], options[2], options[3], options[4], + options[5], options[6], options[7], options[8]); + break; + default: - NOT_REACHED (); + lex_error (lexer, NULL); } } @@ -402,7 +605,8 @@ lex_next_error_valist (struct lexer *lexer, int n0, int n1, ds_put_cstr (&s, ": "); ds_put_vformat (&s, format, args); } - ds_put_byte (&s, '.'); + if (ds_last (&s) != '.') + ds_put_byte (&s, '.'); msg (SE, "%s", ds_cstr (&s)); ds_destroy (&s); } @@ -474,15 +678,14 @@ lex_integer (const struct lexer *lexer) bool lex_next_is_number (const struct lexer *lexer, int n) { - enum token_type next_token = lex_next_token (lexer, n); - return next_token == T_POS_NUM || next_token == T_NEG_NUM; + return token_is_number (lex_next (lexer, n)); } /* Returns true if the token N ahead of the current token is a string. */ bool lex_next_is_string (const struct lexer *lexer, int n) { - return lex_next_token (lexer, n) == T_STRING; + return token_is_string (lex_next (lexer, n)); } /* Returns the value of the token N ahead of the current token, which must be a @@ -490,21 +693,14 @@ lex_next_is_string (const struct lexer *lexer, int n) double lex_next_number (const struct lexer *lexer, int n) { - assert (lex_next_is_number (lexer, n)); - return lex_next_tokval (lexer, n); + return token_number (lex_next (lexer, n)); } /* Returns true if the token N ahead of the current token is an integer. 
*/ bool lex_next_is_integer (const struct lexer *lexer, int n) { - double value; - - if (!lex_next_is_number (lexer, n)) - return false; - - value = lex_next_tokval (lexer, n); - return value > LONG_MIN && value <= LONG_MAX && floor (value) == value; + return token_is_integer (lex_next (lexer, n)); } /* Returns the value of the token N ahead of the current token, which must be @@ -512,8 +708,7 @@ lex_next_is_integer (const struct lexer *lexer, int n) long lex_next_integer (const struct lexer *lexer, int n) { - assert (lex_next_is_integer (lexer, n)); - return lex_next_tokval (lexer, n); + return token_integer (lex_next (lexer, n)); } /* Token matching functions. */ @@ -662,6 +857,109 @@ lex_force_int (struct lexer *lexer) } } +/* If the current token is an integer in the range MIN...MAX (inclusive), does + nothing and returns true. Otherwise, reports an error and returns false. + If NAME is nonnull, then it is used in the error message. */ +bool +lex_force_int_range (struct lexer *lexer, const char *name, long min, long max) +{ + bool is_number = lex_is_number (lexer); + bool is_integer = lex_is_integer (lexer); + bool too_small = (is_integer ? lex_integer (lexer) < min + : is_number ? lex_number (lexer) < min + : false); + bool too_big = (is_integer ? lex_integer (lexer) > max + : is_number ? lex_number (lexer) > max + : false); + if (is_integer && !too_small && !too_big) + return true; + + if (min > max) + { + /* Weird, maybe a bug in the caller. Just report that we needed an + integer. 
*/ + if (name) + lex_error (lexer, _("Integer expected for %s."), name); + else + lex_error (lexer, _("Integer expected.")); + } + else if (min == max) + { + if (name) + lex_error (lexer, _("Expected %ld for %s."), min, name); + else + lex_error (lexer, _("Expected %ld."), min); + } + else if (min + 1 == max) + { + if (name) + lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name); + else + lex_error (lexer, _("Expected %ld or %ld."), min, min + 1); + } + else + { + bool report_lower_bound = (min > INT_MIN / 2) || too_small; + bool report_upper_bound = (max < INT_MAX / 2) || too_big; + + if (report_lower_bound && report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected integer between %ld and %ld for %s."), + min, max, name); + else + lex_error (lexer, _("Expected integer between %ld and %ld."), + min, max); + } + else if (report_lower_bound) + { + if (min == 0) + { + if (name) + lex_error (lexer, _("Expected non-negative integer for %s."), + name); + else + lex_error (lexer, _("Expected non-negative integer.")); + } + else if (min == 1) + { + if (name) + lex_error (lexer, _("Expected positive integer for %s."), + name); + else + lex_error (lexer, _("Expected positive integer.")); + } + else + { + if (name) + lex_error (lexer, _("Expected integer %ld or greater for %s."), + min, name); + else + lex_error (lexer, _("Expected integer %ld or greater."), min); + } + } + else if (report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected integer less than or equal to %ld for %s."), + max, name); + else + lex_error (lexer, _("Expected integer less than or equal to %ld."), + max); + } + else + { + if (name) + lex_error (lexer, _("Integer expected for %s."), name); + else + lex_error (lexer, _("Integer expected.")); + } + } + return false; +} + /* If the current token is a number, does nothing and returns true. Otherwise, reports an error and returns false. 
*/ bool @@ -674,6 +972,236 @@ lex_force_num (struct lexer *lexer) return false; } +/* If the current token is an number in the closed range [MIN,MAX], does + nothing and returns true. Otherwise, reports an error and returns false. + If NAME is nonnull, then it is used in the error message. */ +bool +lex_force_num_range_closed (struct lexer *lexer, const char *name, + double min, double max) +{ + bool is_number = lex_is_number (lexer); + bool too_small = is_number && lex_number (lexer) < min; + bool too_big = is_number && lex_number (lexer) > max; + if (is_number && !too_small && !too_big) + return true; + + if (min > max) + { + /* Weird, maybe a bug in the caller. Just report that we needed an + number. */ + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + else if (min == max) + { + if (name) + lex_error (lexer, _("Expected %g for %s."), min, name); + else + lex_error (lexer, _("Expected %g."), min); + } + else + { + bool report_lower_bound = min > -DBL_MAX || too_small; + bool report_upper_bound = max < DBL_MAX || too_big; + + if (report_lower_bound && report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected number between %g and %g for %s."), + min, max, name); + else + lex_error (lexer, _("Expected number between %g and %g."), + min, max); + } + else if (report_lower_bound) + { + if (min == 0) + { + if (name) + lex_error (lexer, _("Expected non-negative number for %s."), + name); + else + lex_error (lexer, _("Expected non-negative number.")); + } + else + { + if (name) + lex_error (lexer, _("Expected number %g or greater for %s."), + min, name); + else + lex_error (lexer, _("Expected number %g or greater."), min); + } + } + else if (report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected number less than or equal to %g for %s."), + max, name); + else + lex_error (lexer, _("Expected number less than or equal to %g."), + max); + } + else + { + if (name) + 
lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + } + return false; +} + +/* If the current token is an number in the half-open range [MIN,MAX), does + nothing and returns true. Otherwise, reports an error and returns false. + If NAME is nonnull, then it is used in the error message. */ +bool +lex_force_num_range_halfopen (struct lexer *lexer, const char *name, + double min, double max) +{ + bool is_number = lex_is_number (lexer); + bool too_small = is_number && lex_number (lexer) < min; + bool too_big = is_number && lex_number (lexer) >= max; + if (is_number && !too_small && !too_big) + return true; + + if (min >= max) + { + /* Weird, maybe a bug in the caller. Just report that we needed an + number. */ + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + else + { + bool report_lower_bound = min > -DBL_MAX || too_small; + bool report_upper_bound = max < DBL_MAX || too_big; + + if (report_lower_bound && report_upper_bound) + { + if (name) + lex_error (lexer, _("Expected number in [%g,%g) for %s."), + min, max, name); + else + lex_error (lexer, _("Expected number in [%g,%g)."), + min, max); + } + else if (report_lower_bound) + { + if (min == 0) + { + if (name) + lex_error (lexer, _("Expected non-negative number for %s."), + name); + else + lex_error (lexer, _("Expected non-negative number.")); + } + else + { + if (name) + lex_error (lexer, _("Expected number %g or greater for %s."), + min, name); + else + lex_error (lexer, _("Expected number %g or greater."), min); + } + } + else if (report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected number less than %g for %s."), max, name); + else + lex_error (lexer, _("Expected number less than %g."), max); + } + else + { + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + } + return false; +} + +/* If the 
current token is an number in the open range (MIN,MAX], does + nothing and returns true. Otherwise, reports an error and returns false. + If NAME is nonnull, then it is used in the error message. */ +bool +lex_force_num_range_open (struct lexer *lexer, const char *name, + double min, double max) +{ + bool is_number = lex_is_number (lexer); + bool too_small = is_number && lex_number (lexer) <= min; + bool too_big = is_number && lex_number (lexer) >= max; + if (is_number && !too_small && !too_big) + return true; + + if (min >= max) + { + /* Weird, maybe a bug in the caller. Just report that we needed an + number. */ + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + else + { + bool report_lower_bound = min > -DBL_MAX || too_small; + bool report_upper_bound = max < DBL_MAX || too_big; + + if (report_lower_bound && report_upper_bound) + { + if (name) + lex_error (lexer, _("Expected number in (%g,%g) for %s."), + min, max, name); + else + lex_error (lexer, _("Expected number in (%g,%g)."), min, max); + } + else if (report_lower_bound) + { + if (min == 0) + { + if (name) + lex_error (lexer, _("Expected positive number for %s."), name); + else + lex_error (lexer, _("Expected positive number.")); + } + else + { + if (name) + lex_error (lexer, _("Expected number greater than %g for %s."), + min, name); + else + lex_error (lexer, _("Expected number greater than %g."), min); + } + } + else if (report_upper_bound) + { + if (name) + lex_error (lexer, _("Expected number less than %g for %s."), + max, name); + else + lex_error (lexer, _("Expected number less than %g."), max); + } + else + { + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + } + return false; +} + /* If the current token is an identifier, does nothing and returns true. Otherwise, reports an error and returns false. 
*/ bool @@ -752,31 +1280,42 @@ lex_next__ (const struct lexer *lexer_, int n) return lex_source_next__ (src, n); else { - static const struct lex_token stop_token = - { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 }; - + static const struct lex_token stop_token = { .token = { .type = T_STOP } }; return &stop_token; } } static const struct lex_token * -lex_source_next__ (const struct lex_source *src, int n) +lex_source_ofs__ (const struct lex_source *src_, int ofs) { - while (deque_count (&src->deque) <= n) + struct lex_source *src = CONST_CAST (struct lex_source *, src_); + + if (ofs < 0) { - if (!deque_is_empty (&src->deque)) - { - struct lex_token *front; + static const struct lex_token endcmd_token + = { .token = { .type = T_ENDCMD } }; + return &endcmd_token; + } - front = &src->tokens[deque_front (&src->deque, 0)]; - if (front->token.type == T_STOP || front->token.type == T_ENDCMD) - return front; + while (ofs >= src->n_parse) + { + if (src->n_parse > 0) + { + const struct lex_token *t = src->parse[src->n_parse - 1]; + if (t->token.type == T_STOP || t->token.type == T_ENDCMD) + return t; } - lex_source_get__ (src); + lex_source_get_parse (src); } - return &src->tokens[deque_back (&src->deque, n)]; + return src->parse[ofs]; +} + +static const struct lex_token * +lex_source_next__ (const struct lex_source *src, int n) +{ + return lex_source_ofs__ (src, n + src->parse_ofs); } /* Returns the "struct token" of the token N after the current one in LEXER. @@ -803,8 +1342,7 @@ lex_next_token (const struct lexer *lexer, int n) double lex_next_tokval (const struct lexer *lexer, int n) { - const struct token *token = lex_next (lexer, n); - return token->number; + return token_number (lex_next (lexer, n)); } /* Returns the null-terminated string in the token N after the current one, in @@ -826,8 +1364,8 @@ lex_next_tokcstr (const struct lexer *lexer, int n) The string is null-terminated (but the null terminator is not included in the returned substring's 'length'). 
- Only T_ID and T_STRING tokens have meaningful strings. For other tokens - this functions this function will always return NULL. + Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other + tokens this functions this function will always return NULL. The UTF-8 encoding of the returned string is correct for variable names and other identifiers. Use filename_to_utf8() to use it as a filename. Use @@ -838,198 +1376,279 @@ lex_next_tokss (const struct lexer *lexer, int n) return lex_next (lexer, n)->string; } -static bool -lex_tokens_match (const struct token *actual, const struct token *expected) +/* Returns the offset of the current token within the command being parsed in + LEXER. This is 0 for the first token in a command, 1 for the second, and so + on. The return value is useful later for referring to this token in calls + to lex_ofs_*(). */ +int +lex_ofs (const struct lexer *lexer) { - if (actual->type != expected->type) - return false; - - switch (actual->type) - { - case T_POS_NUM: - case T_NEG_NUM: - return actual->number == expected->number; + struct lex_source *src = lex_source__ (lexer); + return src ? src->parse_ofs : 0; +} - case T_ID: - return lex_id_match (expected->string, actual->string); +/* Returns the token within LEXER's current command with offset OFS. Use + lex_ofs() to find out the offset of the current token. 
*/ +const struct token * +lex_ofs_token (const struct lexer *lexer_, int ofs) +{ + struct lexer *lexer = CONST_CAST (struct lexer *, lexer_); + struct lex_source *src = lex_source__ (lexer); - case T_STRING: - return (actual->string.length == expected->string.length - && !memcmp (actual->string.string, expected->string.string, - actual->string.length)); - - default: - return true; + if (src != NULL) + return &lex_source_next__ (src, ofs - src->parse_ofs)->token; + else + { + static const struct token stop_token = { .type = T_STOP }; + return &stop_token; } } -/* If LEXER is positioned at the sequence of tokens that may be parsed from S, - skips it and returns true. Otherwise, returns false. +/* Allocates and returns a new struct msg_location that spans tokens with + offsets OFS0 through OFS1, inclusive, within the current command in + LEXER. See lex_ofs() for an explanation of token offsets. - S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS", - "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their - first three letters. */ -bool -lex_match_phrase (struct lexer *lexer, const char *s) + The caller owns and must eventually free the returned object. */ +struct msg_location * +lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1) { - struct string_lexer slex; - struct token token; - int i; + int ofs = lex_ofs (lexer); + return lex_get_location (lexer, ofs0 - ofs, ofs1 - ofs); +} - i = 0; - string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE); - while (string_lexer_next (&slex, &token)) - if (token.type != SCAN_SKIP) - { - bool match = lex_tokens_match (lex_next (lexer, i++), &token); - token_destroy (&token); - if (!match) - return false; - } +/* Returns a msg_point for the first character in the token with offset OFS, + where offset 0 is the first token in the command currently being parsed, 1 + the second token, and so on. 
These are absolute offsets, not relative to + the token currently being parsed within the command. - while (i-- > 0) - lex_get (lexer); - return true; + Returns zeros for a T_STOP token. + */ +struct msg_point +lex_ofs_start_point (const struct lexer *lexer, int ofs) +{ + const struct lex_source *src = lex_source__ (lexer); + return (src + ? lex_token_start_point (src, lex_source_ofs__ (src, ofs)) + : (struct msg_point) { 0, 0 }); } -static int -lex_source_get_first_line_number (const struct lex_source *src, int n) +/* Returns a msg_point for the last character, inclusive, in the token with + offset OFS, where offset 0 is the first token in the command currently being + parsed, 1 the second token, and so on. These are absolute offsets, not + relative to the token currently being parsed within the command. + + Returns zeros for a T_STOP token. + + Most of the time, a single token is wholly within a single line of syntax, + so that the start and end point for a given offset have the same line + number. There are two exceptions: a T_STRING token can be made up of + multiple segments on adjacent lines connected with "+" punctuators, and a + T_NEG_NUM token can consist of a "-" on one line followed by the number on + the next. + */ +struct msg_point +lex_ofs_end_point (const struct lexer *lexer, int ofs) { - return lex_source_next__ (src, n)->first_line; + const struct lex_source *src = lex_source__ (lexer); + return (src + ? lex_token_end_point (src, lex_source_ofs__ (src, ofs)) + : (struct msg_point) { 0, 0 }); } -static int -count_newlines (char *s, size_t length) +/* Returns the text of the syntax in tokens N0 ahead of the current one, + through N1 ahead of the current one, inclusive. (For example, if N0 and N1 + are both zero, this requests the syntax for the current token.) + + The caller must eventually free the returned string (with free()). 
The + syntax is encoded in UTF-8 and in the original form supplied to the lexer so + that, for example, it may include comments, spaces, and new-lines if it + spans multiple tokens. Macro expansion, however, has already been + performed. */ +char * +lex_next_representation (const struct lexer *lexer, int n0, int n1) { - int n_newlines = 0; - char *newline; + const struct lex_source *src = lex_source__ (lexer); + return (src + ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs) + : xstrdup ("")); +} - while ((newline = memchr (s, '\n', length)) != NULL) - { - n_newlines++; - length -= (newline + 1) - s; - s = newline + 1; - } - return n_newlines; +/* Returns the text of the syntax in tokens with offsets OFS0 to OFS1, + inclusive. (For example, if OFS0 and OFS1 are both zero, this requests the + syntax for the first token in the current command.) + + The caller must eventually free the returned string (with free()). The + syntax is encoded in UTF-8 and in the original form supplied to the lexer so + that, for example, it may include comments, spaces, and new-lines if it + spans multiple tokens. Macro expansion, however, has already been + performed. */ +char * +lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1) +{ + const struct lex_source *src = lex_source__ (lexer); + return src ? lex_source_syntax__ (src, ofs0, ofs1) : xstrdup (""); } -static int -lex_source_get_last_line_number (const struct lex_source *src, int n) +/* Returns true if the token N ahead of the current one was produced by macro + expansion, false otherwise. 
*/ +bool +lex_next_is_from_macro (const struct lexer *lexer, int n) { - const struct lex_token *token = lex_source_next__ (src, n); + return lex_next__ (lexer, n)->macro_rep != NULL; +} - if (token->first_line == 0) - return 0; - else +static bool +lex_tokens_match (const struct token *actual, const struct token *expected) +{ + if (actual->type != expected->type) + return false; + + switch (actual->type) { - char *token_str = &src->buffer[token->token_pos - src->tail]; - return token->first_line + count_newlines (token_str, token->token_len) + 1; + case T_POS_NUM: + case T_NEG_NUM: + return actual->number == expected->number; + + case T_ID: + return lex_id_match (expected->string, actual->string); + + case T_STRING: + return (actual->string.length == expected->string.length + && !memcmp (actual->string.string, expected->string.string, + actual->string.length)); + + default: + return true; } } -static int -count_columns (const char *s_, size_t length) +static size_t +lex_at_phrase__ (struct lexer *lexer, const char *s) { - const uint8_t *s = CHAR_CAST (const uint8_t *, s_); - int columns; - size_t ofs; - int mblen; + struct string_lexer slex; + struct token token; - columns = 0; - for (ofs = 0; ofs < length; ofs += mblen) + size_t i = 0; + string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true); + while (string_lexer_next (&slex, &token)) { - ucs4_t uc; - - mblen = u8_mbtouc (&uc, s + ofs, length - ofs); - if (uc != '\t') - { - int width = uc_width (uc, "UTF-8"); - if (width > 0) - columns += width; - } - else - columns = ROUND_UP (columns + 1, 8); + bool match = lex_tokens_match (lex_next (lexer, i++), &token); + token_uninit (&token); + if (!match) + return 0; } + return i; +} - return columns + 1; +/* If LEXER is positioned at the sequence of tokens that may be parsed from S, + returns true. Otherwise, returns false. + + S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS", + "2SLS", or "END INPUT PROGRAM". 
Identifiers may be abbreviated to their + first three letters. */ +bool +lex_at_phrase (struct lexer *lexer, const char *s) +{ + return lex_at_phrase__ (lexer, s) > 0; } -static int -lex_source_get_first_column (const struct lex_source *src, int n) +/* If LEXER is positioned at the sequence of tokens that may be parsed from S, + skips it and returns true. Otherwise, returns false. + + S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS", + "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their + first three letters. */ +bool +lex_match_phrase (struct lexer *lexer, const char *s) { - const struct lex_token *token = lex_source_next__ (src, n); - return count_columns (&src->buffer[token->line_pos - src->tail], - token->token_pos - token->line_pos); + size_t n = lex_at_phrase__ (lexer, s); + if (n > 0) + lex_get_n (lexer, n); + return n > 0; } +/* Returns the 1-based line number of the source text at the byte OFFSET in + SRC. */ static int -lex_source_get_last_column (const struct lex_source *src, int n) +lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset) { - const struct lex_token *token = lex_source_next__ (src, n); - char *start, *end, *newline; - - start = &src->buffer[token->line_pos - src->tail]; - end = &src->buffer[(token->token_pos + token->token_len) - src->tail]; - newline = memrchr (start, '\n', end - start); - if (newline != NULL) - start = newline + 1; - return count_columns (start, end - start); + size_t lo = 0; + size_t hi = src->n_lines; + for (;;) + { + size_t mid = (lo + hi) / 2; + if (mid + 1 >= src->n_lines) + return src->n_lines; + else if (offset >= src->lines[mid + 1]) + lo = mid; + else if (offset < src->lines[mid]) + hi = mid; + else + return mid + 1; + } } -/* Returns the 1-based line number of the start of the syntax that represents - the token N after the current one in LEXER. Returns 0 for a T_STOP token or - if the token is drawn from a source that does not have line numbers. 
*/ -int -lex_get_first_line_number (const struct lexer *lexer, int n) +/* Returns the 1-based column number of the source text at the byte OFFSET in + SRC. */ +static int +lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset) { - const struct lex_source *src = lex_source__ (lexer); - return src != NULL ? lex_source_get_first_line_number (src, n) : 0; + const char *newline = memrchr (src->buffer, '\n', offset); + size_t line_ofs = newline ? newline - src->buffer + 1 : 0; + return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1; } -/* Returns the 1-based line number of the end of the syntax that represents the - token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP - token or if the token is drawn from a source that does not have line - numbers. +static struct msg_point +lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset) +{ + return (struct msg_point) { + .line = lex_source_ofs_to_line_number (src, offset), + .column = lex_source_ofs_to_column_number (src, offset), + }; +} - Most of the time, a single token is wholly within a single line of syntax, - but there are two exceptions: a T_STRING token can be made up of multiple - segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM - token can consist of a "-" on one line followed by the number on the next. - */ -int -lex_get_last_line_number (const struct lexer *lexer, int n) +static struct msg_point +lex_token_start_point (const struct lex_source *src, + const struct lex_token *token) { - const struct lex_source *src = lex_source__ (lexer); - return src != NULL ? lex_source_get_last_line_number (src, n) : 0; + return lex_source_ofs_to_point__ (src, token->token_pos); } -/* Returns the 1-based column number of the start of the syntax that represents - the token N after the current one in LEXER. Returns 0 for a T_STOP - token. 
+static struct msg_point +lex_token_end_point (const struct lex_source *src, + const struct lex_token *token) +{ + return lex_source_ofs_to_point__ (src, lex_token_end (token)); +} - Column numbers are measured according to the width of characters as shown in - a typical fixed-width font, in which CJK characters have width 2 and - combining characters have width 0. */ -int -lex_get_first_column (const struct lexer *lexer, int n) +static struct msg_location +lex_token_location (const struct lex_source *src, + const struct lex_token *t0, + const struct lex_token *t1) { - const struct lex_source *src = lex_source__ (lexer); - return src != NULL ? lex_source_get_first_column (src, n) : 0; + return (struct msg_location) { + .file_name = intern_new_if_nonnull (src->reader->file_name), + .start = lex_token_start_point (src, t0), + .end = lex_token_end_point (src, t1), + }; } -/* Returns the 1-based column number of the end of the syntax that represents - the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP - token. +static struct msg_location * +lex_token_location_rw (const struct lex_source *src, + const struct lex_token *t0, + const struct lex_token *t1) +{ + struct msg_location location = lex_token_location (src, t0, t1); + return msg_location_dup (&location); +} - Column numbers are measured according to the width of characters as shown in - a typical fixed-width font, in which CJK characters have width 2 and - combining characters have width 0. */ -int -lex_get_last_column (const struct lexer *lexer, int n) +static struct msg_location * +lex_source_get_location (const struct lex_source *src, int n0, int n1) { - const struct lex_source *src = lex_source__ (lexer); - return src != NULL ? lex_source_get_last_column (src, n) : 0; + return lex_token_location_rw (src, + lex_source_next__ (src, n0), + lex_source_next__ (src, n1)); } /* Returns the name of the syntax file from which the current command is drawn. 
@@ -1046,6 +1665,23 @@ lex_get_file_name (const struct lexer *lexer) return src == NULL ? NULL : src->reader->file_name; } +/* Returns a newly allocated msg_location for the syntax that represents tokens + with 0-based offsets N0...N1, inclusive, from the current token. The caller + must eventually free the location (with msg_location_destroy()). */ +struct msg_location * +lex_get_location (const struct lexer *lexer, int n0, int n1) +{ + struct msg_location *loc = xmalloc (sizeof *loc); + *loc = (struct msg_location) { + .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)), + .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)), + .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)), + .src = lex_source__ (lexer), + }; + lex_source_ref (loc->src); + return loc; +} + const char * lex_get_encoding (const struct lexer *lexer) { @@ -1053,19 +1689,18 @@ lex_get_encoding (const struct lexer *lexer) return src == NULL ? NULL : src->reader->encoding; } - /* Returns the syntax mode for the syntax file from which the current drawn is - drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's - source does not have line numbers. + drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source + does not have line numbers. There is no version of this function that takes an N argument because lookahead only works to the end of a command and any given command is always within a single syntax file. */ -enum lex_syntax_mode +enum segmenter_mode lex_get_syntax_mode (const struct lexer *lexer) { struct lex_source *src = lex_source__ (lexer); - return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax; + return src == NULL ? 
SEG_MODE_AUTO : src->reader->syntax; } /* Returns the error mode for the syntax file from which the current drawn is @@ -1095,13 +1730,15 @@ lex_interactive_reset (struct lexer *lexer) struct lex_source *src = lex_source__ (lexer); if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL) { - src->head = src->tail = 0; - src->journal_pos = src->seg_pos = src->line_pos = 0; - src->n_newlines = 0; + src->length = 0; + src->journal_pos = src->seg_pos = 0; + src->n_lines = 0; src->suppress_next_newline = false; - segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter)); - while (!deque_is_empty (&src->deque)) - lex_source_pop__ (src); + src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter), + false); + lex_stage_clear (&src->pp); + lex_stage_clear (&src->merge); + lex_source_clear_parse (src); lex_source_push_endcmd__ (src); } } @@ -1124,57 +1761,24 @@ lex_discard_noninteractive (struct lexer *lexer) if (src != NULL) { - while (!deque_is_empty (&src->deque)) - lex_source_pop__ (src); + lex_stage_clear (&src->pp); + lex_stage_clear (&src->merge); + lex_source_clear_parse (src); for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL; src = lex_source__ (lexer)) - lex_source_destroy (src); + { + ll_remove (&src->ll); + lex_source_unref (src); + } } } -static size_t -lex_source_max_tail__ (const struct lex_source *src) -{ - const struct lex_token *token; - size_t max_tail; - - assert (src->seg_pos >= src->line_pos); - max_tail = MIN (src->journal_pos, src->line_pos); - - /* Use the oldest token also. (We know that src->deque cannot be empty - because we are in the process of adding a new token, which is already - initialized enough to use here.) 
*/ - token = &src->tokens[deque_back (&src->deque, 0)]; - assert (token->token_pos >= token->line_pos); - max_tail = MIN (max_tail, token->line_pos); - - return max_tail; -} - static void lex_source_expand__ (struct lex_source *src) { - if (src->head - src->tail >= src->allocated) - { - size_t max_tail = lex_source_max_tail__ (src); - if (max_tail > src->tail) - { - /* Advance the tail, freeing up room at the head. */ - memmove (src->buffer, src->buffer + (max_tail - src->tail), - src->head - max_tail); - src->tail = max_tail; - } - else - { - /* Buffer is completely full. Expand it. */ - src->buffer = x2realloc (src->buffer, &src->allocated); - } - } - else - { - /* There's space available at the head of the buffer. Nothing to do. */ - } + if (src->length >= src->allocated) + src->buffer = x2realloc (src->buffer, &src->allocated); } static void @@ -1184,10 +1788,10 @@ lex_source_read__ (struct lex_source *src) { lex_source_expand__ (src); - size_t head_ofs = src->head - src->tail; - size_t space = src->allocated - head_ofs; + size_t space = src->allocated - src->length; enum prompt_style prompt = segmenter_get_prompt (&src->segmenter); - size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs], + size_t n = src->reader->class->read (src->reader, + &src->buffer[src->length], space, prompt); assert (n <= space); @@ -1195,14 +1799,13 @@ lex_source_read__ (struct lex_source *src) { /* End of input. 
*/ src->reader->eof = true; - lex_source_expand__ (src); return; } - src->head += n; + src->length += n; } - while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n', - src->head - src->seg_pos)); + while (!memchr (&src->buffer[src->seg_pos], '\n', + src->length - src->seg_pos)); } static struct lex_source * @@ -1212,50 +1815,86 @@ lex_source__ (const struct lexer *lexer) : ll_data (ll_head (&lexer->sources), struct lex_source, ll)); } -static struct substring -lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1) +/* Returns the text of the syntax in SRC for tokens with offsets OFS0 through + OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are + both zero, this requests the syntax for the first token in the current + command.) The caller must eventually free the returned string (with + free()). The syntax is encoded in UTF-8 and in the original form supplied + to the lexer so that, for example, it may include comments, spaces, and + new-lines if it spans multiple tokens. Macro expansion, however, has + already been performed. */ +static char * +lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1) { - const struct lex_token *token0 = lex_source_next__ (src, n0); - const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1)); - size_t start = token0->token_pos; - size_t end = token1->token_pos + token1->token_len; + struct string s = DS_EMPTY_INITIALIZER; + for (size_t i = ofs0; i <= ofs1; ) + { + /* Find [I,J) as the longest sequence of tokens not produced by macro + expansion, or otherwise the longest sequence expanded from a single + macro call. 
*/ + const struct lex_token *first = lex_source_ofs__ (src, i); + size_t j; + for (j = i + 1; j <= ofs1; j++) + { + const struct lex_token *cur = lex_source_ofs__ (src, j); + if ((first->macro_rep != NULL) != (cur->macro_rep != NULL) + || first->macro_rep != cur->macro_rep) + break; + } + const struct lex_token *last = lex_source_ofs__ (src, j - 1); - return ss_buffer (&src->buffer[start - src->tail], end - start); + /* Now add the syntax for this sequence of tokens to SRC. */ + if (!ds_is_empty (&s)) + ds_put_byte (&s, ' '); + if (!first->macro_rep) + { + size_t start = first->token_pos; + size_t end = last->token_pos + last->token_len; + ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start)); + } + else + { + size_t start = first->ofs; + size_t end = last->ofs + last->len; + ds_put_substring (&s, ss_buffer (first->macro_rep + start, + end - start)); + } + + i = j; + } + return ds_steal_cstr (&s); } -static void -lex_ellipsize__ (struct substring in, char *out, size_t out_size) +static bool +lex_source_contains_macro_call (struct lex_source *src, int n0, int n1) { - size_t out_maxlen; - size_t out_len; - int mblen; - - assert (out_size >= 16); - out_maxlen = out_size - 1; - if (in.length > out_maxlen - 3) - out_maxlen -= 3; + for (int i = n0; i <= n1; i++) + if (lex_source_next__ (src, i)->macro_rep) + return true; + return false; +} - for (out_len = 0; out_len < in.length; out_len += mblen) - { - if (in.string[out_len] == '\n' - || in.string[out_len] == '\0' - || (in.string[out_len] == '\r' - && out_len + 1 < in.length - && in.string[out_len + 1] == '\n')) - break; +/* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the + raw UTF-8 syntax for the macro call (not for the expansion) and for any + other tokens included in that range. The syntax is encoded in UTF-8 and in + the original form supplied to the lexer so that, for example, it may include + comments, spaces, and new-lines if it spans multiple tokens. 
- mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len), - in.length - out_len); + Returns an empty string if the token range doesn't include a macro call. - if (mblen < 0) - break; + The caller must not modify or free the returned string. */ +static struct substring +lex_source_get_macro_call (struct lex_source *src, int n0, int n1) +{ + if (!lex_source_contains_macro_call (src, n0, n1)) + return ss_empty (); - if (out_len + mblen > out_maxlen) - break; - } + const struct lex_token *token0 = lex_source_next__ (src, n0); + const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1)); + size_t start = token0->token_pos; + size_t end = token1->token_pos + token1->token_len; - memcpy (out, in.string, out_len); - strcpy (&out[out_len], out_len < in.length ? "..." : ""); + return ss_buffer (&src->buffer[start], end - start); } static void @@ -1272,16 +1911,36 @@ lex_source_error_valist (struct lex_source *src, int n0, int n1, ds_put_cstr (&s, _("Syntax error at end of command")); else { - struct substring syntax = lex_source_get_syntax__ (src, n0, n1); - if (!ss_is_empty (syntax)) + /* Get the syntax that caused the error. */ + char *raw_syntax = lex_source_syntax__ (src, n0 + src->parse_ofs, + n1 + src->parse_ofs); + char syntax[64]; + str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax); + free (raw_syntax); + + /* Get the macro call(s) that expanded to the syntax that caused the + error. 
*/ + char call[64]; + str_ellipsize (lex_source_get_macro_call (src, n0, n1), + call, sizeof call); + + if (syntax[0]) { - char syntax_cstr[64]; - - lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr); - ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr); + if (call[0]) + ds_put_format (&s, + _("Syntax error at `%s' (in expansion of `%s')"), + syntax, call); + else + ds_put_format (&s, _("Syntax error at `%s'"), syntax); } else - ds_put_cstr (&s, _("Syntax error")); + { + if (call[0]) + ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"), + call); + else + ds_put_cstr (&s, _("Syntax error")); + } } if (format) @@ -1289,128 +1948,92 @@ lex_source_error_valist (struct lex_source *src, int n0, int n1, ds_put_cstr (&s, ": "); ds_put_vformat (&s, format, args); } - ds_put_byte (&s, '.'); + if (ds_last (&s) != '.') + ds_put_byte (&s, '.'); - struct msg m = { + struct msg *m = xmalloc (sizeof *m); + *m = (struct msg) { .category = MSG_C_SYNTAX, .severity = MSG_S_ERROR, - .file_name = src->reader->file_name, - .first_line = lex_source_get_first_line_number (src, n0), - .last_line = lex_source_get_last_line_number (src, n1), - .first_column = lex_source_get_first_column (src, n0), - .last_column = lex_source_get_last_column (src, n1), + .location = lex_source_get_location (src, n0, n1), .text = ds_steal_cstr (&s), }; - msg_emit (&m); + msg_emit (m); } -static void PRINTF_FORMAT (2, 3) -lex_get_error (struct lex_source *src, const char *format, ...) 
+static void +lex_get_error (struct lex_source *src, const struct lex_token *token) { - va_list args; - int n; - - va_start (args, format); + char syntax[64]; + str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len), + syntax, sizeof syntax); - n = deque_count (&src->deque) - 1; - lex_source_error_valist (src, n, n, format, args); - lex_source_pop_front (src); + struct string s = DS_EMPTY_INITIALIZER; + ds_put_format (&s, _("Syntax error at `%s'"), syntax); + ds_put_format (&s, ": %s", token->token.string.string); - va_end (args); + struct msg *m = xmalloc (sizeof *m); + *m = (struct msg) { + .category = MSG_C_SYNTAX, + .severity = MSG_S_ERROR, + .location = lex_token_location_rw (src, token, token), + .text = ds_steal_cstr (&s), + }; + msg_emit (m); } -/* Attempts to append an additional token into SRC's deque, reading more from - the underlying lex_reader if necessary.. Returns true if successful, false - if the deque already represents (a suffix of) the whole lex_reader's - contents, */ +/* Attempts to append an additional token to 'pp' in SRC, reading more from the + underlying lex_reader if necessary. Returns true if a new token was added + to SRC's deque, false otherwise. The caller should retry failures unless + SRC's 'eof' marker was set to true indicating that there will be no more + tokens from this source. */ static bool -lex_source_get__ (const struct lex_source *src_) +lex_source_try_get_pp (struct lex_source *src) { - struct lex_source *src = CONST_CAST (struct lex_source *, src_); - if (src->eof) - return false; - - /* State maintained while scanning tokens. Usually we only need a single - state, but scanner_push() can return SCAN_SAVE to indicate that the state - needs to be saved and possibly restored later with SCAN_BACK. */ - struct state - { - struct segmenter segmenter; - enum segment_type last_segment; - int newlines; /* Number of newlines encountered so far. 
*/ - /* Maintained here so we can update lex_source's similar members when we - finish. */ - size_t line_pos; - size_t seg_pos; - }; - - /* Initialize state. */ - struct state state = - { - .segmenter = src->segmenter, - .newlines = 0, - .seg_pos = src->seg_pos, - .line_pos = src->line_pos, - }; - struct state saved = state; - /* Append a new token to SRC and initialize it. */ - struct lex_token *token = lex_push_token__ (src); - struct scanner scanner; - scanner_init (&scanner, &token->token); - token->line_pos = src->line_pos; + struct lex_token *token = xmalloc (sizeof *token); + token->token = (struct token) { .type = T_STOP }; + token->macro_rep = NULL; + token->ref_cnt = NULL; token->token_pos = src->seg_pos; - if (src->reader->line_number > 0) - token->first_line = src->reader->line_number + src->n_newlines; - else - token->first_line = 0; - /* Extract segments and pass them through the scanner until we obtain a - token. */ + /* Extract a segment. */ + const char *segment; + enum segment_type seg_type; + int seg_len; for (;;) { - /* Extract a segment. */ - const char *segment = &src->buffer[state.seg_pos - src->tail]; - size_t seg_maxlen = src->head - state.seg_pos; - enum segment_type type; - int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, - src->reader->eof, &type); - if (seg_len < 0) - { - /* The segmenter needs more input to produce a segment. */ - assert (!src->reader->eof); - lex_source_read__ (src); - continue; - } + segment = &src->buffer[src->seg_pos]; + seg_len = segmenter_push (&src->segmenter, segment, + src->length - src->seg_pos, + src->reader->eof, &seg_type); + if (seg_len >= 0) + break; - /* Update state based on the segment. */ - state.last_segment = type; - state.seg_pos += seg_len; - if (type == SEG_NEWLINE) - { - state.newlines++; - state.line_pos = state.seg_pos; - } + /* The segmenter needs more input to produce a segment. 
*/ + assert (!src->reader->eof); + lex_source_read__ (src); + } - /* Pass the segment into the scanner and try to get a token out. */ - enum scan_result result = scanner_push (&scanner, type, - ss_buffer (segment, seg_len), - &token->token); - if (result == SCAN_SAVE) - saved = state; - else if (result == SCAN_BACK) - { - state = saved; - break; - } - else if (result == SCAN_DONE) - break; + /* Update state based on the segment. */ + token->token_len = seg_len; + src->seg_pos += seg_len; + if (seg_type == SEG_NEWLINE) + { + if (src->n_lines >= src->allocated_lines) + src->lines = x2nrealloc (src->lines, &src->allocated_lines, + sizeof *src->lines); + src->lines[src->n_lines++] = src->seg_pos; } + /* Get a token from the segment. */ + enum tokenize_result result = token_from_segment ( + seg_type, ss_buffer (segment, seg_len), &token->token); + /* If we've reached the end of a line, or the end of a command, then pass the line to the output engine as a syntax text item. */ - int n_lines = state.newlines; - if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline) + int n_lines = seg_type == SEG_NEWLINE; + if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline) { n_lines++; src->suppress_next_newline = true; @@ -1423,15 +2046,15 @@ lex_source_get__ (const struct lex_source *src_) for (int i = 0; i < n_lines; i++) { /* Beginning of line. */ - const char *line = &src->buffer[src->journal_pos - src->tail]; + const char *line = &src->buffer[src->journal_pos]; /* Calculate line length, including \n or \r\n end-of-line if present. - We use src->head even though that may be beyond what we've actually - converted to tokens (which is only through state.line_pos). That's - because, if we're emitting the line due to SEG_END_COMMAND, we want to - take the whole line through the newline, not just through the '.'. */ - size_t max_len = src->head - src->journal_pos; + We use src->length even though that may be beyond what we've actually + converted to tokens. 
That's because, if we're emitting the line due + to SEG_END_COMMAND, we want to take the whole line through the + newline, not just through the '.'. */ + size_t max_len = src->length - src->journal_pos; const char *newline = memchr (line, '\n', max_len); size_t line_len = newline ? newline - line + 1 : max_len; @@ -1450,115 +2073,307 @@ lex_source_get__ (const struct lex_source *src_) src->journal_pos += line_len; } - token->token_len = state.seg_pos - src->seg_pos; + switch (result) + { + case TOKENIZE_ERROR: + lex_get_error (src, token); + /* Fall through. */ + case TOKENIZE_EMPTY: + lex_token_destroy (token); + return false; + + case TOKENIZE_TOKEN: + if (token->token.type == T_STOP) + { + token->token.type = T_ENDCMD; + src->eof = true; + } + lex_stage_push_last (&src->pp, token); + return true; + } + NOT_REACHED (); +} + +/* Attempts to append a new token to SRC. Returns true if successful, false on + failure. On failure, the end of SRC has been reached and no more tokens + will be forthcoming from it. - src->segmenter = state.segmenter; - src->seg_pos = state.seg_pos; - src->line_pos = state.line_pos; - src->n_newlines += state.newlines; + Does not make the new token available for lookahead yet; the caller must + adjust SRC's 'middle' pointer to do so. */ +static bool +lex_source_get_pp (struct lex_source *src) +{ + while (!src->eof) + if (lex_source_try_get_pp (src)) + return true; + return false; +} - switch (token->token.type) +static bool +lex_source_try_get_merge (const struct lex_source *src_) +{ + struct lex_source *src = CONST_CAST (struct lex_source *, src_); + + if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src)) + return false; + + if (!settings_get_mexpand ()) { - default: - break; + lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp)); + return true; + } - case T_STOP: - token->token.type = T_ENDCMD; - src->eof = true; - break; + /* Now pass tokens one-by-one to the macro expander. 
- case SCAN_BAD_HEX_LENGTH: - lex_get_error (src, _("String of hex digits has %d characters, which " - "is not a multiple of 2"), - (int) token->token.number); - break; + In the common case where there is no macro to expand, the loop is not + entered. */ + struct macro_call *mc; + int n_call = macro_call_create (src->lexer->macros, + &lex_stage_first (&src->pp)->token, &mc); + for (int ofs = 1; !n_call; ofs++) + { + if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src)) + { + /* This should not be reachable because we always get a T_ENDCMD at + the end of an input file (transformed from T_STOP by + lex_source_try_get_pp()) and the macro_expander should always + terminate expansion on T_ENDCMD. */ + NOT_REACHED (); + } - case SCAN_BAD_HEX_DIGIT: - case SCAN_BAD_UNICODE_DIGIT: - lex_get_error (src, _("`%c' is not a valid hex digit"), - (int) token->token.number); - break; + const struct lex_token *t = lex_stage_nth (&src->pp, ofs); + const struct macro_token mt = { + .token = t->token, + .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len), + }; + const struct msg_location loc = lex_token_location (src, t, t); + n_call = macro_call_add (mc, &mt, &loc); + } + if (n_call < 0) + { + /* False alarm: no macro expansion after all. Use first token as + lookahead. We'll retry macro expansion from the second token next + time around. */ + macro_call_destroy (mc); + lex_stage_shift (&src->merge, &src->pp, 1); + return true; + } - case SCAN_BAD_UNICODE_LENGTH: - lex_get_error (src, _("Unicode string contains %d bytes, which is " - "not in the valid range of 1 to 8 bytes"), - (int) token->token.number); - break; + /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive, + are a macro call. (These are likely to be the only tokens in 'pp'.) + Expand them. 
*/ + const struct lex_token *c0 = lex_stage_first (&src->pp); + const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1); + struct macro_tokens expansion = { .n = 0 }; + struct msg_location loc = lex_token_location (src, c0, c1); + macro_call_expand (mc, src->reader->syntax, &loc, &expansion); + macro_call_destroy (mc); + + /* Convert the macro expansion into syntax for possible error messages + later. */ + size_t *ofs = xnmalloc (expansion.n, sizeof *ofs); + size_t *len = xnmalloc (expansion.n, sizeof *len); + struct string s = DS_EMPTY_INITIALIZER; + macro_tokens_to_syntax (&expansion, &s, ofs, len); + + if (settings_get_mprint ()) + output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s), + _("Macro Expansion"))); + + /* Append the macro expansion tokens to the lookahead. */ + if (expansion.n > 0) + { + char *macro_rep = ds_steal_cstr (&s); + size_t *ref_cnt = xmalloc (sizeof *ref_cnt); + *ref_cnt = expansion.n; + for (size_t i = 0; i < expansion.n; i++) + { + struct lex_token *token = xmalloc (sizeof *token); + *token = (struct lex_token) { + .token = expansion.mts[i].token, + .token_pos = c0->token_pos, + .token_len = (c1->token_pos + c1->token_len) - c0->token_pos, + .macro_rep = macro_rep, + .ofs = ofs[i], + .len = len[i], + .ref_cnt = ref_cnt, + }; + lex_stage_push_last (&src->merge, token); + + ss_dealloc (&expansion.mts[i].syntax); + } + } + else + ds_destroy (&s); + free (expansion.mts); + free (ofs); + free (len); - case SCAN_BAD_UNICODE_CODE_POINT: - lex_get_error (src, _("U+%04X is not a valid Unicode code point"), - (int) token->token.number); - break; + /* Destroy the tokens for the call. 
*/ + for (size_t i = 0; i < n_call; i++) + lex_stage_pop_first (&src->pp); - case SCAN_EXPECTED_QUOTE: - lex_get_error (src, _("Unterminated string constant")); - break; + return expansion.n > 0; +} - case SCAN_EXPECTED_EXPONENT: - lex_get_error (src, _("Missing exponent following `%s'"), - token->token.string.string); - break; +/* Attempts to obtain at least one new token into 'merge' in SRC. - case SCAN_UNEXPECTED_DOT: - lex_get_error (src, _("Unexpected `.' in middle of command")); - break; + Returns true if successful, false on failure. In the latter case, SRC is + exhausted and 'src->eof' is now true. */ +static bool +lex_source_get_merge (struct lex_source *src) +{ + while (!src->eof) + if (lex_source_try_get_merge (src)) + return true; + return false; +} - case SCAN_UNEXPECTED_CHAR: - { - char c_name[16]; - lex_get_error (src, _("Bad character %s in input"), - uc_name (token->token.number, c_name)); - } - break; +/* Attempts to obtain at least one new token into 'lookahead' in SRC. - case SCAN_SKIP: - lex_source_pop_front (src); - break; - } + Returns true if successful, false on failure. In the latter case, SRC is + exhausted and 'src->eof' is now true. */ +static bool +lex_source_get_parse (struct lex_source *src) +{ + struct merger m = MERGER_INIT; + struct token out; + for (size_t i = 0; ; i++) + { + while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src)) + { + /* We always get a T_ENDCMD at the end of an input file + (transformed from T_STOP by lex_source_try_get_pp()) and + merger_add() should never return -1 on T_ENDCMD. */ + assert (lex_stage_is_empty (&src->merge)); + return false; + } - return true; + int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token, + &out); + if (!retval) + { + lex_source_push_parse (src, lex_stage_take_first (&src->merge)); + return true; + } + else if (retval > 0) + { + /* Add a token that merges all the tokens together. 
*/
+          const struct lex_token *first = lex_stage_first (&src->merge);
+          const struct lex_token *last = lex_stage_nth (&src->merge,
+                                                        retval - 1);
+          bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
+          struct lex_token *t = xmalloc (sizeof *t);
+          *t = (struct lex_token) {
+            .token = out,
+            .token_pos = first->token_pos,
+            .token_len = (last->token_pos - first->token_pos) + last->token_len,
+
+            /* This works well if all the tokens were not expanded from macros,
+               or if they came from the same macro expansion.  It just gives up
+               in the other (corner) cases. */
+            .macro_rep = macro ? first->macro_rep : NULL,
+            .ofs = macro ? first->ofs : 0,
+            .len = macro ? (last->ofs - first->ofs) + last->len : 0,
+            .ref_cnt = macro ? first->ref_cnt : NULL,
+          };
+          if (t->ref_cnt)
+            ++*t->ref_cnt;
+          lex_source_push_parse (src, t);
+
+          for (int i = 0; i < retval; i++)
+            lex_stage_pop_first (&src->merge);
+          return true;
+        }
+    }
 }

 static void
 lex_source_push_endcmd__ (struct lex_source *src)
 {
-  struct lex_token *token = lex_push_token__ (src);
-  token->token.type = T_ENDCMD;
-  token->token_pos = 0;
-  token->token_len = 0;
-  token->line_pos = 0;
-  token->first_line = 0;
+  /* Seed an empty 'parse' array with a single T_ENDCMD token. */
+  assert (src->n_parse == 0);
+
+  struct lex_token *token = xmalloc (sizeof *token);
+  *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
+  lex_source_push_parse (src, token);
 }

-static struct lex_source *
-lex_source_create (struct lex_reader *reader)
+/* Appends TOKEN to SRC's 'parse' array, growing the array as necessary.
+   SRC takes ownership of TOKEN. */
+static void
+lex_source_push_parse (struct lex_source *src, struct lex_token *token)
 {
-  struct lex_source *src;
-  enum segmenter_mode mode;
-
-  src = xzalloc (sizeof *src);
-  src->reader = reader;
+  if (src->n_parse >= src->allocated_parse)
+    src->parse = x2nrealloc (src->parse, &src->allocated_parse,
+                             sizeof *src->parse);
+  src->parse[src->n_parse++] = token;
+}

-  if (reader->syntax == LEX_SYNTAX_AUTO)
-    mode = SEG_MODE_AUTO;
-  else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
-    mode = SEG_MODE_INTERACTIVE;
-  else if (reader->syntax == 
LEX_SYNTAX_BATCH)
-    mode = SEG_MODE_BATCH;
-  else
-    NOT_REACHED ();
-  segmenter_init (&src->segmenter, mode);

+/* Destroys all the tokens in SRC's 'parse' array and resets the array to
+   empty.  (The array's storage itself is retained.) */
+static void
+lex_source_clear_parse (struct lex_source *src)
+{
+  for (size_t i = 0; i < src->n_parse; i++)
+    lex_token_destroy (src->parse[i]);
+  src->n_parse = src->parse_ofs = 0;
+}

-  src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);

+static struct lex_source *
+lex_source_create (struct lexer *lexer, struct lex_reader *reader)
+{
+  size_t allocated_lines = 4;
+  size_t *lines = xmalloc (allocated_lines * sizeof *lines);
+  *lines = 0;
+
+  struct lex_source *src = xmalloc (sizeof *src);
+  *src = (struct lex_source) {
+    .n_refs = 1,
+    .reader = reader,
+    .segmenter = segmenter_init (reader->syntax, false),
+    .lexer = lexer,
+    .lines = lines,
+    .n_lines = 1,
+    .allocated_lines = allocated_lines,
+  };

   lex_source_push_endcmd__ (src);

   return src;
 }

-static void
-lex_source_destroy (struct lex_source *src)
+/* Arranges for OUTPUT_MSG to be called to output each message, with LEXER
+   passed along as its auxiliary data. */
+void
+lex_set_message_handler (struct lexer *lexer,
+                         void (*output_msg) (const struct msg *,
+                                             struct lexer *))
+{
+  struct msg_handler msg_handler = {
+    .output_msg = (void (*)(const struct msg *, void *)) output_msg,
+    .aux = lexer,
+    .lex_source_ref = lex_source_ref,
+    .lex_source_unref = lex_source_unref,
+    .lex_source_get_line = lex_source_get_line,
+  };
+  msg_set_handler (&msg_handler);
+}
+
+/* Increments SRC_'s reference count.  A null SRC_ is a no-op. */
+void
+lex_source_ref (const struct lex_source *src_)
+{
+  struct lex_source *src = CONST_CAST (struct lex_source *, src_);
+  if (src)
+    {
+      assert (src->n_refs > 0);
+      src->n_refs++;
+    }
+}
+
+/* Decrements SRC's reference count, destroying SRC when the count reaches
+   zero.  A null SRC is a no-op. */
+void
+lex_source_unref (struct lex_source *src)
 {
+  if (!src)
+    return;
+
+  assert (src->n_refs > 0);
+  if (--src->n_refs > 0)
+    return;
+
   char *file_name = src->reader->file_name;
   char *encoding = src->reader->encoding;
   if (src->reader->class->destroy != NULL)
@@ -1566,10 +2381,11 @@ lex_source_destroy (struct lex_source *src)
   free (file_name);
   free (encoding);
   free (src->buffer);
-  while (!deque_is_empty (&src->deque))
-    lex_source_pop__ (src);
-  free 
(src->tokens);
-  ll_remove (&src->ll);
+  free (src->lines);
+  lex_stage_uninit (&src->pp);
+  lex_stage_uninit (&src->merge);
+  lex_source_clear_parse (src);
+  free (src->parse);
   free (src);
 }

@@ -1590,7 +2406,7 @@ static struct lex_reader_class lex_file_reader_class;

    Returns a null pointer if FILE_NAME cannot be opened. */
 struct lex_reader *
 lex_reader_for_file (const char *file_name, const char *encoding,
-                     enum lex_syntax_mode syntax,
+                     enum segmenter_mode syntax,
                      enum lex_error_mode error)
 {
   struct lex_file_reader *r;
@@ -1678,7 +2494,7 @@ lex_reader_for_substring_nocopy (struct substring s, const char *encoding)

   r = xmalloc (sizeof *r);
   lex_reader_init (&r->reader, &lex_string_reader_class);
-  r->reader.syntax = LEX_SYNTAX_AUTO;
+  r->reader.syntax = SEG_MODE_AUTO;
   r->reader.encoding = xstrdup_if_nonnull (encoding);
   r->s = s;
   r->offset = 0;
@@ -1745,3 +2561,14 @@ static struct lex_reader_class lex_string_reader_class =
     lex_string_read,
     lex_string_close
   };
+
+/* Returns the text of 1-based line number LINE within SRC's buffer, or an
+   empty substring if LINE is out of range. */
+struct substring
+lex_source_get_line (const struct lex_source *src, int line)
+{
+  if (line < 1 || line > src->n_lines)
+    return ss_empty ();
+
+  size_t ofs = src->lines[line - 1];
+  size_t end = line >= src->n_lines ? src->length : src->lines[line];
+  return ss_buffer (&src->buffer[ofs], end - ofs);
+}