X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Flexer%2Flexer.c;h=7d40ce1cd50a39ad70baabc9547855d74d5caada;hb=39df27f80745cf9622ac5e916a098c17961c2585;hp=8b3f2a48d00474811a09c090f8a82d1b75ed7546;hpb=8f04b0ced35a66cfdebefbcb53c81979add36ca3;p=pspp diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c index 8b3f2a48d0..7d40ce1cd5 100644 --- a/src/language/lexer/lexer.c +++ b/src/language/lexer/lexer.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2009 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,455 +15,601 @@ along with this program. If not, see . */ #include -#include "lexer.h" -#include -#include -#include + +#include "language/lexer/lexer.h" + #include +#include #include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include - -#include "xalloc.h" +#include +#include +#include +#include + +#include "language/command.h" +#include "language/lexer/macro.h" +#include "language/lexer/scan.h" +#include "language/lexer/segment.h" +#include "language/lexer/token.h" +#include "libpspp/assertion.h" +#include "libpspp/cast.h" +#include "libpspp/deque.h" +#include "libpspp/i18n.h" +#include "libpspp/intern.h" +#include "libpspp/ll.h" +#include "libpspp/message.h" +#include "libpspp/misc.h" +#include "libpspp/str.h" +#include "libpspp/u8-istream.h" +#include "output/journal.h" +#include "output/output-item.h" + +#include "gl/c-ctype.h" +#include "gl/minmax.h" +#include "gl/xalloc.h" +#include "gl/xmemdup0.h" #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) msgid +/* A token within a lex_source. */ +struct lex_token + { + /* The regular token information. */ + struct token token; -#define DUMP_TOKENS 0 + /* For a token obtained through the lexer in an ordinary way, this is the + location of the token in terms of the lex_source's buffer. + For a token produced through macro expansion, this is the entire macro + call. */ + size_t token_pos; /* Offset into src->buffer of token start. */ + size_t token_len; /* Length of source for token in bytes. */ + /* For a token obtained through macro expansion, this is just this token. -struct lexer -{ - struct string line_buffer; + For a token obtained through the lexer in an ordinary way, these are + nulls and zeros. */ + char *macro_rep; /* The whole macro expansion. */ + size_t ofs; /* Offset of this token in macro_rep. */ + size_t len; /* Length of this token in macro_rep. */ + size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */ + }; - struct source_stream *ss; +static struct msg_point lex_token_start_point (const struct lex_source *, + const struct lex_token *); +static struct msg_point lex_token_end_point (const struct lex_source *, + const struct lex_token *); - int token; /* Current token. */ - double tokval; /* T_POS_NUM, T_NEG_NUM: the token's value. */ +/* Source offset of the last byte in TOKEN. */ +static size_t +lex_token_end (const struct lex_token *token) +{ + return token->token_pos + MAX (token->token_len, 1) - 1; +} - char tokid [VAR_NAME_LEN + 1]; /* T_ID: the identifier. 
*/ +static void +lex_token_destroy (struct lex_token *t) +{ + token_uninit (&t->token); + if (t->ref_cnt) + { + assert (*t->ref_cnt > 0); + if (!--*t->ref_cnt) + { + free (t->macro_rep); + free (t->ref_cnt); + } + } + free (t); +} + +/* A deque of lex_tokens that comprises one stage in the token pipeline in a + lex_source. */ +struct lex_stage + { + struct deque deque; + struct lex_token **tokens; + }; - struct string tokstr; /* T_ID, T_STRING: token string value. - For T_ID, this is not truncated as is - tokid. */ +static void lex_stage_clear (struct lex_stage *); +static void lex_stage_uninit (struct lex_stage *); - char *prog; /* Pointer to next token in line_buffer. */ - bool dot; /* True only if this line ends with a terminal dot. */ +static size_t lex_stage_count (const struct lex_stage *); +static bool lex_stage_is_empty (const struct lex_stage *); - int put_token ; /* If nonzero, next token returned by lex_get(). - Used only in exceptional circumstances. */ +static struct lex_token *lex_stage_first (struct lex_stage *); +static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs); - struct string put_tokstr; - double put_tokval; -}; +static void lex_stage_push_last (struct lex_stage *, struct lex_token *); +static void lex_stage_pop_first (struct lex_stage *); +static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, + size_t n); -static int parse_id (struct lexer *); +/* Deletes all the tokens from STAGE. */ +static void +lex_stage_clear (struct lex_stage *stage) +{ + while (!deque_is_empty (&stage->deque)) + lex_stage_pop_first (stage); +} -/* How a string represents its contents. */ -enum string_type - { - CHARACTER_STRING, /* Characters. */ - BINARY_STRING, /* Binary digits. */ - OCTAL_STRING, /* Octal digits. */ - HEX_STRING /* Hexadecimal digits. */ - }; +/* Deletes all the tokens from STAGE and frees storage for the deque. */ +static void +lex_stage_uninit (struct lex_stage *stage) +{ + lex_stage_clear (stage); + free (stage->tokens); +} -static int parse_string (struct lexer *, enum string_type); +/* Returns true if STAGE contains no tokens, otherwise false. */ +static bool +lex_stage_is_empty (const struct lex_stage *stage) +{ + return deque_is_empty (&stage->deque); +} -#if DUMP_TOKENS -static void dump_token (struct lexer *); -#endif - -/* Initialization. */ +/* Returns the number of tokens in STAGE. */ +static size_t +lex_stage_count (const struct lex_stage *stage) +{ + return deque_count (&stage->deque); +} -/* Initializes the lexer. */ -struct lexer * -lex_create (struct source_stream *ss) +/* Returns the first token in STAGE, which must be nonempty. + The first token is the one accessed with the least lookahead. */ +static struct lex_token * +lex_stage_first (struct lex_stage *stage) { - struct lexer *lexer = xzalloc (sizeof (*lexer)); + return lex_stage_nth (stage, 0); +} - ds_init_empty (&lexer->tokstr); - ds_init_empty (&lexer->put_tokstr); - ds_init_empty (&lexer->line_buffer); - lexer->ss = ss; +/* Returns the token the given INDEX in STAGE. The first token (with the least + lookahead) is 0, the second token is 1, and so on. There must be at least + INDEX + 1 tokens in STAGE. */ +static struct lex_token * +lex_stage_nth (struct lex_stage *stage, size_t index) +{ + return stage->tokens[deque_back (&stage->deque, index)]; +} - return lexer; +/* Adds TOKEN so that it becomes the last token in STAGE. 
*/ +static void +lex_stage_push_last (struct lex_stage *stage, struct lex_token *token) +{ + if (deque_is_full (&stage->deque)) + stage->tokens = deque_expand (&stage->deque, stage->tokens, + sizeof *stage->tokens); + stage->tokens[deque_push_front (&stage->deque)] = token; +} + +/* Removes and returns the first token from STAGE. */ +static struct lex_token * +lex_stage_take_first (struct lex_stage *stage) +{ + return stage->tokens[deque_pop_back (&stage->deque)]; } -struct source_stream * -lex_get_source_stream (const struct lexer *lex) +/* Removes the first token from STAGE and uninitializes it. */ +static void +lex_stage_pop_first (struct lex_stage *stage) { - return lex->ss; + lex_token_destroy (lex_stage_take_first (stage)); } -enum syntax_mode -lex_current_syntax_mode (const struct lexer *lex) +/* Removes the first N tokens from SRC, appending them to DST as the last + tokens. */ +static void +lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n) { - return source_stream_current_syntax_mode (lex->ss); + for (size_t i = 0; i < n; i++) + lex_stage_push_last (dst, lex_stage_take_first (src)); } -enum error_mode -lex_current_error_mode (const struct lexer *lex) +/* A source of tokens, corresponding to a syntax file. + + This is conceptually a lex_reader wrapped with everything needed to convert + its UTF-8 bytes into tokens. */ +struct lex_source + { + struct ll ll; /* In lexer's list of sources. */ + + /* Reference count: + + - One for struct lexer. + + - One for each struct msg_location that references this source. */ + size_t n_refs; + + struct lex_reader *reader; + struct lexer *lexer; + struct segmenter segmenter; + bool eof; /* True if T_STOP was read from 'reader'. */ + + /* Buffer of UTF-8 bytes. */ + char *buffer; /* Source file contents. */ + size_t length; /* Number of bytes filled. */ + size_t allocated; /* Number of bytes allocated. */ + + /* Offsets into 'buffer'. */ + size_t journal_pos; /* First byte not yet output to journal. */ + size_t seg_pos; /* First byte not yet scanned as token. */ + + /* Offset into 'buffer' of starts of lines. */ + size_t *lines; + size_t n_lines, allocated_lines; + + bool suppress_next_newline; + + /* Tokens. + + This is a pipeline with the following stages. Each token eventually + made available to the parser passes through of these stages. The stages + are named after the processing that happens in each one. + + Initially, tokens come from the segmenter and scanner to 'pp': + + - pp: Tokens that need to pass through the macro preprocessor to end up + in 'merge'. + + - merge: Tokens that need to pass through scan_merge() to end up in + 'parse'. + + - parse: Tokens available to the client for parsing. + + 'pp' and 'merge' store tokens only temporarily until they pass into + 'parse'. Tokens then live in 'parse' until the command is fully + consumed, at which time they are freed together. */ + struct lex_stage pp; + struct lex_stage merge; + struct lex_token **parse; + size_t n_parse, allocated_parse, parse_ofs; + }; + +static struct lex_source *lex_source_create (struct lexer *, + struct lex_reader *); + +/* Lexer. */ +struct lexer + { + struct ll_list sources; /* Contains "struct lex_source"s. 
*/ + struct macro_set *macros; + }; + +static struct lex_source *lex_source__ (const struct lexer *); +static char *lex_source_syntax__ (const struct lex_source *, + int ofs0, int ofs1); +static const struct lex_token *lex_next__ (const struct lexer *, int n); +static void lex_source_push_endcmd__ (struct lex_source *); +static void lex_source_push_parse (struct lex_source *, struct lex_token *); +static void lex_source_clear_parse (struct lex_source *); + +static bool lex_source_get_parse (struct lex_source *); +static void lex_source_error_valist (struct lex_source *, int n0, int n1, + const char *format, va_list) + PRINTF_FORMAT (4, 0); +static const struct lex_token *lex_source_next__ (const struct lex_source *, + int n); + +/* Initializes READER with the specified CLASS and otherwise some reasonable + defaults. The caller should fill in the others members as desired. */ +void +lex_reader_init (struct lex_reader *reader, + const struct lex_reader_class *class) { - return source_stream_current_error_mode (lex->ss); + reader->class = class; + reader->syntax = SEG_MODE_AUTO; + reader->error = LEX_ERROR_CONTINUE; + reader->file_name = NULL; + reader->encoding = NULL; + reader->line_number = 0; + reader->eof = false; } +/* Frees any file name already in READER and replaces it by a copy of + FILE_NAME, or if FILE_NAME is null then clears any existing name. */ +void +lex_reader_set_file_name (struct lex_reader *reader, const char *file_name) +{ + free (reader->file_name); + reader->file_name = xstrdup_if_nonnull (file_name); +} + +/* Creates and returns a new lexer. */ +struct lexer * +lex_create (void) +{ + struct lexer *lexer = xmalloc (sizeof *lexer); + *lexer = (struct lexer) { + .sources = LL_INITIALIZER (lexer->sources), + .macros = macro_set_create (), + }; + return lexer; +} +/* Destroys LEXER. */ void lex_destroy (struct lexer *lexer) { - if ( NULL != lexer ) + if (lexer != NULL) { - ds_destroy (&lexer->put_tokstr); - ds_destroy (&lexer->tokstr); - ds_destroy (&lexer->line_buffer); + struct lex_source *source, *next; + ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources) + { + ll_remove (&source->ll); + lex_source_unref (source); + } + macro_set_destroy (lexer->macros); free (lexer); } } - -/* Common functions. */ +/* Adds M to LEXER's set of macros. M replaces any existing macro with the + same name. Takes ownership of M. */ +void +lex_define_macro (struct lexer *lexer, struct macro *m) +{ + macro_set_add (lexer->macros, m); +} -/* Copies put_token, lexer->put_tokstr, put_tokval into token, tokstr, - tokval, respectively, and sets tokid appropriately. */ -static void -restore_token (struct lexer *lexer) +/* Inserts READER into LEXER so that the next token read by LEXER comes from + READER. Before the caller, LEXER must either be empty or at a T_ENDCMD + token. */ +void +lex_include (struct lexer *lexer, struct lex_reader *reader) { - assert (lexer->put_token != 0); - lexer->token = lexer->put_token; - ds_assign_string (&lexer->tokstr, &lexer->put_tokstr); - str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr)); - lexer->tokval = lexer->put_tokval; - lexer->put_token = 0; + assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD); + ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll); } -/* Copies token, tokstr, lexer->tokval into lexer->put_token, put_tokstr, - put_lexer->tokval respectively. 
*/ -static void -save_token (struct lexer *lexer) +/* Appends READER to LEXER, so that it will be read after all other current + readers have already been read. */ +void +lex_append (struct lexer *lexer, struct lex_reader *reader) { - lexer->put_token = lexer->token; - ds_assign_string (&lexer->put_tokstr, &lexer->tokstr); - lexer->put_tokval = lexer->tokval; + ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll); } + +/* Advancing. */ -/* Parses a single token, setting appropriate global variables to - indicate the token's attributes. */ +/* Advances LEXER to the next token, consuming the current token. */ void lex_get (struct lexer *lexer) { - /* Find a token. */ - for (;;) + struct lex_source *src; + + src = lex_source__ (lexer); + if (src == NULL) + return; + + if (src->parse_ofs < src->n_parse) { - if (NULL == lexer->prog && ! lex_get_line (lexer) ) - { - lexer->token = T_STOP; - return; - } + if (src->parse[src->parse_ofs]->token.type == T_ENDCMD) + lex_source_clear_parse (src); + else + src->parse_ofs++; + } + + while (src->parse_ofs == src->n_parse) + if (!lex_source_get_parse (src)) + { + ll_remove (&src->ll); + lex_source_unref (src); + src = lex_source__ (lexer); + if (src == NULL) + return; + } +} + +/* Advances LEXER by N tokens. */ +void +lex_get_n (struct lexer *lexer, size_t n) +{ + while (n-- > 0) + lex_get (lexer); +} + +/* Issuing errors. */ + +/* Prints a syntax error message containing the current token and + given message MESSAGE (if non-null). */ +void +lex_error (struct lexer *lexer, const char *format, ...) +{ + va_list args; + + va_start (args, format); + lex_next_error_valist (lexer, 0, 0, format, args); + va_end (args); +} + +/* Prints a syntax error message containing the current token and + given message MESSAGE (if non-null). */ +void +lex_error_valist (struct lexer *lexer, const char *format, va_list args) +{ + lex_next_error_valist (lexer, 0, 0, format, args); +} + +/* Prints a syntax error message containing the current token and + given message MESSAGE (if non-null). */ +void +lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...) +{ + va_list args; + + va_start (args, format); + lex_next_error_valist (lexer, n0, n1, format, args); + va_end (args); +} + +/* Prints a syntax error message saying that one of the strings provided as + varargs, up to the first NULL, is expected. */ +void +(lex_error_expecting) (struct lexer *lexer, ...) +{ + va_list args; + + va_start (args, lexer); + lex_error_expecting_valist (lexer, args); + va_end (args); +} - /* If a token was pushed ahead, return it. */ - if (lexer->put_token) +/* Prints a syntax error message saying that one of the options provided in + ARGS, up to the first NULL, is expected. */ +void +lex_error_expecting_valist (struct lexer *lexer, va_list args) +{ + enum { MAX_OPTIONS = 9 }; + const char *options[MAX_OPTIONS]; + int n = 0; + while (n < MAX_OPTIONS) { - restore_token (lexer); -#if DUMP_TOKENS - dump_token (lexer); -#endif - return; + const char *option = va_arg (args, const char *); + if (!option) + break; + + options[n++] = option; } + lex_error_expecting_array (lexer, options, n); +} - for (;;) +void +lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n) +{ + switch (n) { - /* Skip whitespace. 
*/ - while (c_isspace ((unsigned char) *lexer->prog)) - lexer->prog++; - - if (*lexer->prog) - break; - - if (lexer->dot) - { - lexer->dot = 0; - lexer->token = '.'; -#if DUMP_TOKENS - dump_token (lexer); -#endif - return; - } - else if (!lex_get_line (lexer)) - { - lexer->prog = NULL; - lexer->token = T_STOP; -#if DUMP_TOKENS - dump_token (lexer); -#endif - return; - } - - if (lexer->put_token) - { - restore_token (lexer); -#if DUMP_TOKENS - dump_token (lexer); -#endif - return; - } - } + case 0: + lex_error (lexer, NULL); + break; + case 1: + lex_error (lexer, _("expecting %s"), options[0]); + break; - /* Actually parse the token. */ - ds_clear (&lexer->tokstr); + case 2: + lex_error (lexer, _("expecting %s or %s"), options[0], options[1]); + break; - switch (*lexer->prog) - { - case '-': case '.': - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - { - char *tail; - - /* `-' can introduce a negative number, or it can be a - token by itself. If it is not followed by a digit or a - decimal point, it is definitely not a number. - Otherwise, it might be either, but most of the time we - want it as a number. When the syntax calls for a `-' - token, lex_negative_to_dash() must be used to break - negative numbers into two tokens. */ - if (*lexer->prog == '-') - { - ds_put_char (&lexer->tokstr, *lexer->prog++); - while (c_isspace ((unsigned char) *lexer->prog)) - lexer->prog++; - - if (!c_isdigit ((unsigned char) *lexer->prog) && *lexer->prog != '.') - { - lexer->token = '-'; - break; - } - lexer->token = T_NEG_NUM; - } - else - lexer->token = T_POS_NUM; - - /* Parse the number, copying it into tokstr. */ - while (c_isdigit ((unsigned char) *lexer->prog)) - ds_put_char (&lexer->tokstr, *lexer->prog++); - if (*lexer->prog == '.') - { - ds_put_char (&lexer->tokstr, *lexer->prog++); - while (c_isdigit ((unsigned char) *lexer->prog)) - ds_put_char (&lexer->tokstr, *lexer->prog++); - } - if (*lexer->prog == 'e' || *lexer->prog == 'E') - { - ds_put_char (&lexer->tokstr, *lexer->prog++); - if (*lexer->prog == '+' || *lexer->prog == '-') - ds_put_char (&lexer->tokstr, *lexer->prog++); - while (c_isdigit ((unsigned char) *lexer->prog)) - ds_put_char (&lexer->tokstr, *lexer->prog++); - } - - /* Parse as floating point. 
*/ - lexer->tokval = c_strtod (ds_cstr (&lexer->tokstr), &tail); - if (*tail) - { - msg (SE, _("%s does not form a valid number."), - ds_cstr (&lexer->tokstr)); - lexer->tokval = 0.0; - - ds_clear (&lexer->tokstr); - ds_put_char (&lexer->tokstr, '0'); - } - - break; - } - - case '\'': case '"': - lexer->token = parse_string (lexer, CHARACTER_STRING); - break; - - case '(': case ')': case ',': case '=': case '+': case '/': - case '[': case ']': - lexer->token = *lexer->prog++; - break; - - case '*': - if (*++lexer->prog == '*') - { - lexer->prog++; - lexer->token = T_EXP; - } - else - lexer->token = '*'; - break; - - case '<': - if (*++lexer->prog == '=') - { - lexer->prog++; - lexer->token = T_LE; - } - else if (*lexer->prog == '>') - { - lexer->prog++; - lexer->token = T_NE; - } - else - lexer->token = T_LT; - break; - - case '>': - if (*++lexer->prog == '=') - { - lexer->prog++; - lexer->token = T_GE; - } - else - lexer->token = T_GT; - break; - - case '~': - if (*++lexer->prog == '=') - { - lexer->prog++; - lexer->token = T_NE; - } - else - lexer->token = T_NOT; - break; - - case '&': - lexer->prog++; - lexer->token = T_AND; - break; - - case '|': - lexer->prog++; - lexer->token = T_OR; - break; - - case 'b': case 'B': - if (lexer->prog[1] == '\'' || lexer->prog[1] == '"') - lexer->token = parse_string (lexer, BINARY_STRING); - else - lexer->token = parse_id (lexer); - break; + case 3: + lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1], + options[2]); + break; - case 'o': case 'O': - if (lexer->prog[1] == '\'' || lexer->prog[1] == '"') - lexer->token = parse_string (lexer, OCTAL_STRING); - else - lexer->token = parse_id (lexer); - break; + case 4: + lex_error (lexer, _("expecting %s, %s, %s, or %s"), + options[0], options[1], options[2], options[3]); + break; - case 'x': case 'X': - if (lexer->prog[1] == '\'' || lexer->prog[1] == '"') - lexer->token = parse_string (lexer, HEX_STRING); - else - lexer->token = parse_id (lexer); - break; + case 5: + lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"), + options[0], options[1], options[2], options[3], options[4]); + break; - default: - if (lex_is_id1 (*lexer->prog)) - { - lexer->token = parse_id (lexer); - break; - } - else - { - unsigned char c = *lexer->prog++; - char *c_name = xasprintf (c_isgraph (c) ? "%c" : "\\%o", c); - msg (SE, _("Bad character in input: `%s'."), c_name); - free (c_name); - continue; - } - } + case 6: + lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"), + options[0], options[1], options[2], options[3], options[4], + options[5]); break; - } -#if DUMP_TOKENS - dump_token (lexer); -#endif -} + case 7: + lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"), + options[0], options[1], options[2], options[3], options[4], + options[5], options[6]); + break; -/* Parses an identifier at the current position into tokid and - tokstr. - Returns the correct token type. 
*/ -static int -parse_id (struct lexer *lexer) -{ - struct substring rest_of_line - = ss_substr (ds_ss (&lexer->line_buffer), - ds_pointer_to_position (&lexer->line_buffer, lexer->prog), - SIZE_MAX); - struct substring id = ss_head (rest_of_line, - lex_id_get_length (rest_of_line)); - lexer->prog += ss_length (id); + case 8: + lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"), + options[0], options[1], options[2], options[3], options[4], + options[5], options[6], options[7]); + break; + + case 9: + lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, %s, or %s"), + options[0], options[1], options[2], options[3], options[4], + options[5], options[6], options[7], options[8]); + break; - ds_assign_substring (&lexer->tokstr, id); - str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr)); - return lex_id_to_token (id); + default: + lex_error (lexer, NULL); + } } -/* Reports an error to the effect that subcommand SBC may only be - specified once. */ +/* Reports an error to the effect that subcommand SBC may only be specified + once. + + This function does not take a lexer as an argument or use lex_error(), + because the result would ordinarily just be redundant: "Syntax error at + SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does + not help the user find the error. */ void lex_sbc_only_once (const char *sbc) { msg (SE, _("Subcommand %s may only be specified once."), sbc); } -/* Reports an error to the effect that subcommand SBC is - missing. */ +/* Reports an error to the effect that subcommand SBC is missing. + + This function does not take a lexer as an argument or use lex_error(), + because a missing subcommand can normally be detected only after the whole + command has been parsed, and so lex_error() would always report "Syntax + error at end of command", which does not help the user find the error. */ +void +lex_sbc_missing (const char *sbc) +{ + msg (SE, _("Required subcommand %s was not specified."), sbc); +} + +/* Reports an error to the effect that specification SPEC may only be specified + once within subcommand SBC. */ +void +lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec) +{ + lex_error (lexer, _("%s may only be specified once within subcommand %s"), + spec, sbc); +} + +/* Reports an error to the effect that specification SPEC is missing within + subcommand SBC. */ void -lex_sbc_missing (struct lexer *lexer, const char *sbc) +lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec) { - lex_error (lexer, _("missing required subcommand %s"), sbc); + lex_error (lexer, _("Required %s specification missing from %s subcommand"), + sbc, spec); } /* Prints a syntax error message containing the current token and given message MESSAGE (if non-null). */ void -lex_error (struct lexer *lexer, const char *message, ...) 
+lex_next_error_valist (struct lexer *lexer, int n0, int n1, + const char *format, va_list args) { - char *token_rep; - char where[128]; + struct lex_source *src = lex_source__ (lexer); - token_rep = lex_token_representation (lexer); - if (lexer->token == T_STOP) - strcpy (where, "end of file"); - else if (lexer->token == '.') - strcpy (where, "end of command"); + if (src != NULL) + lex_source_error_valist (src, n0, n1, format, args); else - snprintf (where, sizeof where, "`%s'", token_rep); - free (token_rep); - - if (message) { - char buf[1024]; - va_list args; + struct string s; - va_start (args, message); - vsnprintf (buf, 1024, message, args); - va_end (args); - - msg (SE, _("Syntax error %s at %s."), buf, where); + ds_init_empty (&s); + ds_put_format (&s, _("Syntax error at end of input")); + if (format != NULL) + { + ds_put_cstr (&s, ": "); + ds_put_vformat (&s, format, args); + } + if (ds_last (&s) != '.') + ds_put_byte (&s, '.'); + msg (SE, "%s", ds_cstr (&s)); + ds_destroy (&s); } - else - msg (SE, _("Syntax error at %s."), where); } /* Checks that we're at end of command. @@ -473,7 +619,7 @@ lex_error (struct lexer *lexer, const char *message, ...) int lex_end_of_command (struct lexer *lexer) { - if (lexer->token != '.') + if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP) { lex_error (lexer, _("expecting end of command")); return CMD_FAILURE; @@ -486,56 +632,93 @@ lex_end_of_command (struct lexer *lexer) /* Returns true if the current token is a number. */ bool -lex_is_number (struct lexer *lexer) +lex_is_number (const struct lexer *lexer) { - return lexer->token == T_POS_NUM || lexer->token == T_NEG_NUM; + return lex_next_is_number (lexer, 0); } - /* Returns true if the current token is a string. */ bool -lex_is_string (struct lexer *lexer) +lex_is_string (const struct lexer *lexer) { - return lexer->token == T_STRING; + return lex_next_is_string (lexer, 0); } - /* Returns the value of the current token, which must be a floating point number. */ double -lex_number (struct lexer *lexer) +lex_number (const struct lexer *lexer) { - assert (lex_is_number (lexer)); - return lexer->tokval; + return lex_next_number (lexer, 0); } /* Returns true iff the current token is an integer. */ bool -lex_is_integer (struct lexer *lexer) +lex_is_integer (const struct lexer *lexer) { - return (lex_is_number (lexer) - && lexer->tokval > LONG_MIN - && lexer->tokval <= LONG_MAX - && floor (lexer->tokval) == lexer->tokval); + return lex_next_is_integer (lexer, 0); } /* Returns the value of the current token, which must be an integer. */ long -lex_integer (struct lexer *lexer) +lex_integer (const struct lexer *lexer) +{ + return lex_next_integer (lexer, 0); +} + +/* Token testing functions with lookahead. + + A value of 0 for N as an argument to any of these functions refers to the + current token. Lookahead is limited to the current command. Any N greater + than the number of tokens remaining in the current command will be treated + as referring to a T_ENDCMD token. */ + +/* Returns true if the token N ahead of the current token is a number. */ +bool +lex_next_is_number (const struct lexer *lexer, int n) +{ + return token_is_number (lex_next (lexer, n)); +} + +/* Returns true if the token N ahead of the current token is a string. */ +bool +lex_next_is_string (const struct lexer *lexer, int n) +{ + return token_is_string (lex_next (lexer, n)); +} + +/* Returns the value of the token N ahead of the current token, which must be a + floating point number. 
*/ +double +lex_next_number (const struct lexer *lexer, int n) +{ + return token_number (lex_next (lexer, n)); +} + +/* Returns true if the token N ahead of the current token is an integer. */ +bool +lex_next_is_integer (const struct lexer *lexer, int n) +{ + return token_is_integer (lex_next (lexer, n)); +} + +/* Returns the value of the token N ahead of the current token, which must be + an integer. */ +long +lex_next_integer (const struct lexer *lexer, int n) { - assert (lex_is_integer (lexer)); - return lexer->tokval; + return token_integer (lex_next (lexer, n)); } /* Token matching functions. */ -/* If TOK is the current token, skips it and returns true +/* If the current token has the specified TYPE, skips it and returns true. Otherwise, returns false. */ bool -lex_match (struct lexer *lexer, int t) +lex_match (struct lexer *lexer, enum token_type type) { - if (lexer->token == t) + if (lex_token (lexer) == type) { lex_get (lexer); return true; @@ -544,25 +727,26 @@ lex_match (struct lexer *lexer, int t) return false; } -/* If the current token is the identifier S, skips it and returns - true. The identifier may be abbreviated to its first three - letters. - Otherwise, returns false. */ +/* If the current token matches IDENTIFIER, skips it and returns true. + IDENTIFIER may be abbreviated to its first three letters. Otherwise, + returns false. + + IDENTIFIER must be an ASCII string. */ bool -lex_match_id (struct lexer *lexer, const char *s) +lex_match_id (struct lexer *lexer, const char *identifier) { - return lex_match_id_n (lexer, s, 3); + return lex_match_id_n (lexer, identifier, 3); } -/* If the current token is the identifier S, skips it and returns - true. The identifier may be abbreviated to its first N - letters. - Otherwise, returns false. */ +/* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER + may be abbreviated to its first N letters. Otherwise, returns false. + + IDENTIFIER must be an ASCII string. */ bool -lex_match_id_n (struct lexer *lexer, const char *s, size_t n) +lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n) { - if (lexer->token == T_ID - && lex_id_match_n (ss_cstr (s), ss_cstr (lexer->tokid), n)) + if (lex_token (lexer) == T_ID + && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n)) { lex_get (lexer); return true; @@ -571,8 +755,8 @@ lex_match_id_n (struct lexer *lexer, const char *s, size_t n) return false; } -/* If the current token is integer N, skips it and returns true. - Otherwise, returns false. */ +/* If the current token is integer X, skips it and returns true. Otherwise, + returns false. */ bool lex_match_int (struct lexer *lexer, int x) { @@ -587,44 +771,55 @@ lex_match_int (struct lexer *lexer, int x) /* Forced matches. */ -/* If this token is identifier S, fetches the next token and returns - nonzero. - Otherwise, reports an error and returns zero. */ +/* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be + abbreviated to its first 3 letters. Otherwise, reports an error and returns + false. + + IDENTIFIER must be an ASCII string. */ bool -lex_force_match_id (struct lexer *lexer, const char *s) +lex_force_match_id (struct lexer *lexer, const char *identifier) { - if (lex_match_id (lexer, s)) + if (lex_match_id (lexer, identifier)) return true; else { - lex_error (lexer, _("expecting `%s'"), s); + lex_error_expecting (lexer, identifier); return false; } } -/* If the current token is T, skips the token. 
Otherwise, reports an - error and returns from the current function with return value false. */ +/* If the current token has the specified TYPE, skips it and returns true. + Otherwise, reports an error and returns false. */ bool -lex_force_match (struct lexer *lexer, int t) +lex_force_match (struct lexer *lexer, enum token_type type) { - if (lexer->token == t) + if (lex_token (lexer) == type) { lex_get (lexer); return true; } else { - lex_error (lexer, _("expecting `%s'"), lex_token_name (t)); + const char *type_string = token_type_to_string (type); + if (type_string) + { + char *s = xasprintf ("`%s'", type_string); + lex_error_expecting (lexer, s); + free (s); + } + else + lex_error_expecting (lexer, token_type_to_name (type)); + return false; } } -/* If this token is a string, does nothing and returns true. +/* If the current token is a string, does nothing and returns true. Otherwise, reports an error and returns false. */ bool lex_force_string (struct lexer *lexer) { - if (lexer->token == T_STRING) + if (lex_is_string (lexer)) return true; else { @@ -633,7 +828,22 @@ lex_force_string (struct lexer *lexer) } } -/* If this token is an integer, does nothing and returns true. +/* If the current token is a string or an identifier, does nothing and returns + true. Otherwise, reports an error and returns false. + + This is meant for use in syntactic situations where we want to encourage the + user to supply a quoted string, but for compatibility we also accept + identifiers. (One example of such a situation is file names.) Therefore, + the error message issued when the current token is wrong only says that a + string is expected and doesn't mention that an identifier would also be + accepted. */ +bool +lex_force_string_or_id (struct lexer *lexer) +{ + return lex_token (lexer) == T_ID || lex_force_string (lexer); +} + +/* If the current token is an integer, does nothing and returns true. Otherwise, reports an error and returns false. */ bool lex_force_int (struct lexer *lexer) @@ -647,687 +857,1718 @@ lex_force_int (struct lexer *lexer) } } -/* If this token is a number, does nothing and returns true. - Otherwise, reports an error and returns false. */ -bool -lex_force_num (struct lexer *lexer) -{ - if (lex_is_number (lexer)) - return true; - - lex_error (lexer, _("expecting number")); - return false; -} - -/* If this token is an identifier, does nothing and returns true. - Otherwise, reports an error and returns false. */ +/* If the current token is an integer in the range MIN...MAX (inclusive), does + nothing and returns true. Otherwise, reports an error and returns false. + If NAME is nonnull, then it is used in the error message. */ bool -lex_force_id (struct lexer *lexer) +lex_force_int_range (struct lexer *lexer, const char *name, long min, long max) { - if (lexer->token == T_ID) + bool is_number = lex_is_number (lexer); + bool is_integer = lex_is_integer (lexer); + bool too_small = (is_integer ? lex_integer (lexer) < min + : is_number ? lex_number (lexer) < min + : false); + bool too_big = (is_integer ? lex_integer (lexer) > max + : is_number ? lex_number (lexer) > max + : false); + if (is_integer && !too_small && !too_big) return true; - lex_error (lexer, _("expecting identifier")); - return false; -} - -/* Weird token functions. */ - -/* Returns the first character of the next token, except that if the - next token is not an identifier, the character returned will not be - a character that can begin an identifier. 
Specifically, the - hexstring lead-in X' causes lookahead() to return '. Note that an - alphanumeric return value doesn't guarantee an ID token, it could - also be a reserved-word token. */ -int -lex_look_ahead (struct lexer *lexer) -{ - if (lexer->put_token) - return lexer->put_token; - - for (;;) + if (min > max) { - if (NULL == lexer->prog && ! lex_get_line (lexer) ) - return 0; - - for (;;) - { - while (c_isspace ((unsigned char) *lexer->prog)) - lexer->prog++; - if (*lexer->prog) - break; - - if (lexer->dot) - return '.'; - else if (!lex_get_line (lexer)) - return 0; - - if (lexer->put_token) - return lexer->put_token; - } - - if ((toupper ((unsigned char) *lexer->prog) == 'X' - || toupper ((unsigned char) *lexer->prog) == 'B' - || toupper ((unsigned char) *lexer->prog) == 'O') - && (lexer->prog[1] == '\'' || lexer->prog[1] == '"')) - return '\''; - - return *lexer->prog; + /* Weird, maybe a bug in the caller. Just report that we needed an + integer. */ + if (name) + lex_error (lexer, _("Integer expected for %s."), name); + else + lex_error (lexer, _("Integer expected.")); } + else if (min == max) + { + if (name) + lex_error (lexer, _("Expected %ld for %s."), min, name); + else + lex_error (lexer, _("Expected %ld."), min); + } + else if (min + 1 == max) + { + if (name) + lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name); + else + lex_error (lexer, _("Expected %ld or %ld."), min, min + 1); + } + else + { + bool report_lower_bound = (min > INT_MIN / 2) || too_small; + bool report_upper_bound = (max < INT_MAX / 2) || too_big; + + if (report_lower_bound && report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected integer between %ld and %ld for %s."), + min, max, name); + else + lex_error (lexer, _("Expected integer between %ld and %ld."), + min, max); + } + else if (report_lower_bound) + { + if (min == 0) + { + if (name) + lex_error (lexer, _("Expected non-negative integer for %s."), + name); + else + lex_error (lexer, _("Expected non-negative integer.")); + } + else if (min == 1) + { + if (name) + lex_error (lexer, _("Expected positive integer for %s."), + name); + else + lex_error (lexer, _("Expected positive integer.")); + } + else + { + if (name) + lex_error (lexer, _("Expected integer %ld or greater for %s."), + min, name); + else + lex_error (lexer, _("Expected integer %ld or greater."), min); + } + } + else if (report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected integer less than or equal to %ld for %s."), + max, name); + else + lex_error (lexer, _("Expected integer less than or equal to %ld."), + max); + } + else + { + if (name) + lex_error (lexer, _("Integer expected for %s."), name); + else + lex_error (lexer, _("Integer expected.")); + } + } + return false; +} + +/* If the current token is a number, does nothing and returns true. + Otherwise, reports an error and returns false. */ +bool +lex_force_num (struct lexer *lexer) +{ + if (lex_is_number (lexer)) + return true; + + lex_error (lexer, _("expecting number")); + return false; +} + +/* If the current token is an number in the closed range [MIN,MAX], does + nothing and returns true. Otherwise, reports an error and returns false. + If NAME is nonnull, then it is used in the error message. 
*/ +bool +lex_force_num_range_closed (struct lexer *lexer, const char *name, + double min, double max) +{ + bool is_number = lex_is_number (lexer); + bool too_small = is_number && lex_number (lexer) < min; + bool too_big = is_number && lex_number (lexer) > max; + if (is_number && !too_small && !too_big) + return true; + + if (min > max) + { + /* Weird, maybe a bug in the caller. Just report that we needed an + number. */ + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + else if (min == max) + { + if (name) + lex_error (lexer, _("Expected %g for %s."), min, name); + else + lex_error (lexer, _("Expected %g."), min); + } + else + { + bool report_lower_bound = min > -DBL_MAX || too_small; + bool report_upper_bound = max < DBL_MAX || too_big; + + if (report_lower_bound && report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected number between %g and %g for %s."), + min, max, name); + else + lex_error (lexer, _("Expected number between %g and %g."), + min, max); + } + else if (report_lower_bound) + { + if (min == 0) + { + if (name) + lex_error (lexer, _("Expected non-negative number for %s."), + name); + else + lex_error (lexer, _("Expected non-negative number.")); + } + else + { + if (name) + lex_error (lexer, _("Expected number %g or greater for %s."), + min, name); + else + lex_error (lexer, _("Expected number %g or greater."), min); + } + } + else if (report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected number less than or equal to %g for %s."), + max, name); + else + lex_error (lexer, _("Expected number less than or equal to %g."), + max); + } + else + { + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + } + return false; +} + +/* If the current token is an number in the half-open range [MIN,MAX), does + nothing and returns true. Otherwise, reports an error and returns false. + If NAME is nonnull, then it is used in the error message. */ +bool +lex_force_num_range_halfopen (struct lexer *lexer, const char *name, + double min, double max) +{ + bool is_number = lex_is_number (lexer); + bool too_small = is_number && lex_number (lexer) < min; + bool too_big = is_number && lex_number (lexer) >= max; + if (is_number && !too_small && !too_big) + return true; + + if (min >= max) + { + /* Weird, maybe a bug in the caller. Just report that we needed an + number. 
*/ + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + else + { + bool report_lower_bound = min > -DBL_MAX || too_small; + bool report_upper_bound = max < DBL_MAX || too_big; + + if (report_lower_bound && report_upper_bound) + { + if (name) + lex_error (lexer, _("Expected number in [%g,%g) for %s."), + min, max, name); + else + lex_error (lexer, _("Expected number in [%g,%g)."), + min, max); + } + else if (report_lower_bound) + { + if (min == 0) + { + if (name) + lex_error (lexer, _("Expected non-negative number for %s."), + name); + else + lex_error (lexer, _("Expected non-negative number.")); + } + else + { + if (name) + lex_error (lexer, _("Expected number %g or greater for %s."), + min, name); + else + lex_error (lexer, _("Expected number %g or greater."), min); + } + } + else if (report_upper_bound) + { + if (name) + lex_error (lexer, + _("Expected number less than %g for %s."), max, name); + else + lex_error (lexer, _("Expected number less than %g."), max); + } + else + { + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + } + return false; +} + +/* If the current token is an number in the open range (MIN,MAX], does + nothing and returns true. Otherwise, reports an error and returns false. + If NAME is nonnull, then it is used in the error message. */ +bool +lex_force_num_range_open (struct lexer *lexer, const char *name, + double min, double max) +{ + bool is_number = lex_is_number (lexer); + bool too_small = is_number && lex_number (lexer) <= min; + bool too_big = is_number && lex_number (lexer) >= max; + if (is_number && !too_small && !too_big) + return true; + + if (min >= max) + { + /* Weird, maybe a bug in the caller. Just report that we needed an + number. */ + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + else + { + bool report_lower_bound = min > -DBL_MAX || too_small; + bool report_upper_bound = max < DBL_MAX || too_big; + + if (report_lower_bound && report_upper_bound) + { + if (name) + lex_error (lexer, _("Expected number in (%g,%g) for %s."), + min, max, name); + else + lex_error (lexer, _("Expected number in (%g,%g)."), min, max); + } + else if (report_lower_bound) + { + if (min == 0) + { + if (name) + lex_error (lexer, _("Expected positive number for %s."), name); + else + lex_error (lexer, _("Expected positive number.")); + } + else + { + if (name) + lex_error (lexer, _("Expected number greater than %g for %s."), + min, name); + else + lex_error (lexer, _("Expected number greater than %g."), min); + } + } + else if (report_upper_bound) + { + if (name) + lex_error (lexer, _("Expected number less than %g for %s."), + max, name); + else + lex_error (lexer, _("Expected number less than %g."), max); + } + else + { + if (name) + lex_error (lexer, _("Number expected for %s."), name); + else + lex_error (lexer, _("Number expected.")); + } + } + return false; +} + +/* If the current token is an identifier, does nothing and returns true. + Otherwise, reports an error and returns false. */ +bool +lex_force_id (struct lexer *lexer) +{ + if (lex_token (lexer) == T_ID) + return true; + + lex_error (lexer, _("expecting identifier")); + return false; +} + +/* Token accessors. */ + +/* Returns the type of LEXER's current token. 
*/ +enum token_type +lex_token (const struct lexer *lexer) +{ + return lex_next_token (lexer, 0); +} + +/* Returns the number in LEXER's current token. + + Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other + tokens this function will always return zero. */ +double +lex_tokval (const struct lexer *lexer) +{ + return lex_next_tokval (lexer, 0); +} + +/* Returns the null-terminated string in LEXER's current token, UTF-8 encoded. + + Only T_ID and T_STRING tokens have meaningful strings. For other tokens + this functions this function will always return NULL. + + The UTF-8 encoding of the returned string is correct for variable names and + other identifiers. Use filename_to_utf8() to use it as a filename. Use + data_in() to use it in a "union value". */ +const char * +lex_tokcstr (const struct lexer *lexer) +{ + return lex_next_tokcstr (lexer, 0); +} + +/* Returns the string in LEXER's current token, UTF-8 encoded. The string is + null-terminated (but the null terminator is not included in the returned + substring's 'length'). + + Only T_ID and T_STRING tokens have meaningful strings. For other tokens + this functions this function will always return NULL. + + The UTF-8 encoding of the returned string is correct for variable names and + other identifiers. Use filename_to_utf8() to use it as a filename. Use + data_in() to use it in a "union value". */ +struct substring +lex_tokss (const struct lexer *lexer) +{ + return lex_next_tokss (lexer, 0); +} + +/* Looking ahead. + + A value of 0 for N as an argument to any of these functions refers to the + current token. Lookahead is limited to the current command. Any N greater + than the number of tokens remaining in the current command will be treated + as referring to a T_ENDCMD token. */ + +static const struct lex_token * +lex_next__ (const struct lexer *lexer_, int n) +{ + struct lexer *lexer = CONST_CAST (struct lexer *, lexer_); + struct lex_source *src = lex_source__ (lexer); + + if (src != NULL) + return lex_source_next__ (src, n); + else + { + static const struct lex_token stop_token = { .token = { .type = T_STOP } }; + return &stop_token; + } +} + +static const struct lex_token * +lex_source_ofs__ (const struct lex_source *src_, int ofs) +{ + struct lex_source *src = CONST_CAST (struct lex_source *, src_); + + if (ofs < 0) + { + static const struct lex_token endcmd_token + = { .token = { .type = T_ENDCMD } }; + return &endcmd_token; + } + + while (ofs >= src->n_parse) + { + if (src->n_parse > 0) + { + const struct lex_token *t = src->parse[src->n_parse - 1]; + if (t->token.type == T_STOP || t->token.type == T_ENDCMD) + return t; + } + + lex_source_get_parse (src); + } + + return src->parse[ofs]; +} + +static const struct lex_token * +lex_source_next__ (const struct lex_source *src, int n) +{ + return lex_source_ofs__ (src, n + src->parse_ofs); +} + +/* Returns the "struct token" of the token N after the current one in LEXER. + The returned pointer can be invalidated by pretty much any succeeding call + into the lexer, although the string pointer within the returned token is + only invalidated by consuming the token (e.g. with lex_get()). */ +const struct token * +lex_next (const struct lexer *lexer, int n) +{ + return &lex_next__ (lexer, n)->token; +} + +/* Returns the type of the token N after the current one in LEXER. */ +enum token_type +lex_next_token (const struct lexer *lexer, int n) +{ + return lex_next (lexer, n)->type; +} + +/* Returns the number in the tokn N after the current one in LEXER. 
+ + Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other + tokens this function will always return zero. */ +double +lex_next_tokval (const struct lexer *lexer, int n) +{ + return token_number (lex_next (lexer, n)); +} + +/* Returns the null-terminated string in the token N after the current one, in + UTF-8 encoding. + + Only T_ID and T_STRING tokens have meaningful strings. For other tokens + this functions this function will always return NULL. + + The UTF-8 encoding of the returned string is correct for variable names and + other identifiers. Use filename_to_utf8() to use it as a filename. Use + data_in() to use it in a "union value". */ +const char * +lex_next_tokcstr (const struct lexer *lexer, int n) +{ + return lex_next_tokss (lexer, n).string; +} + +/* Returns the string in the token N after the current one, in UTF-8 encoding. + The string is null-terminated (but the null terminator is not included in + the returned substring's 'length'). + + Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other + tokens this functions this function will always return NULL. + + The UTF-8 encoding of the returned string is correct for variable names and + other identifiers. Use filename_to_utf8() to use it as a filename. Use + data_in() to use it in a "union value". */ +struct substring +lex_next_tokss (const struct lexer *lexer, int n) +{ + return lex_next (lexer, n)->string; +} + +/* Returns the offset of the current token within the command being parsed in + LEXER. This is 0 for the first token in a command, 1 for the second, and so + on. The return value is useful later for referring to this token in calls + to lex_ofs_*(). */ +int +lex_ofs (const struct lexer *lexer) +{ + struct lex_source *src = lex_source__ (lexer); + return src ? src->parse_ofs : 0; +} + +/* Returns the token within LEXER's current command with offset OFS. Use + lex_ofs() to find out the offset of the current token. */ +const struct token * +lex_ofs_token (const struct lexer *lexer_, int ofs) +{ + struct lexer *lexer = CONST_CAST (struct lexer *, lexer_); + struct lex_source *src = lex_source__ (lexer); + + if (src != NULL) + return &lex_source_next__ (src, ofs - src->parse_ofs)->token; + else + { + static const struct token stop_token = { .type = T_STOP }; + return &stop_token; + } +} + +/* Allocates and returns a new struct msg_location that spans tokens with + offsets OFS0 through OFS1, inclusive, within the current command in + LEXER. See lex_ofs() for an explanation of token offsets. + + The caller owns and must eventually free the returned object. */ +struct msg_location * +lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1) +{ + int ofs = lex_ofs (lexer); + return lex_get_location (lexer, ofs0 - ofs, ofs1 - ofs); +} + +/* Returns a msg_point for the first character in the token with offset OFS, + where offset 0 is the first token in the command currently being parsed, 1 + the second token, and so on. These are absolute offsets, not relative to + the token currently being parsed within the command. + + Returns zeros for a T_STOP token. + */ +struct msg_point +lex_ofs_start_point (const struct lexer *lexer, int ofs) +{ + const struct lex_source *src = lex_source__ (lexer); + return (src + ? 
lex_token_start_point (src, lex_source_ofs__ (src, ofs)) + : (struct msg_point) { 0, 0 }); +} + +/* Returns a msg_point for the last character, inclusive, in the token with + offset OFS, where offset 0 is the first token in the command currently being + parsed, 1 the second token, and so on. These are absolute offsets, not + relative to the token currently being parsed within the command. + + Returns zeros for a T_STOP token. + + Most of the time, a single token is wholly within a single line of syntax, + so that the start and end point for a given offset have the same line + number. There are two exceptions: a T_STRING token can be made up of + multiple segments on adjacent lines connected with "+" punctuators, and a + T_NEG_NUM token can consist of a "-" on one line followed by the number on + the next. + */ +struct msg_point +lex_ofs_end_point (const struct lexer *lexer, int ofs) +{ + const struct lex_source *src = lex_source__ (lexer); + return (src + ? lex_token_end_point (src, lex_source_ofs__ (src, ofs)) + : (struct msg_point) { 0, 0 }); +} + +/* Returns the text of the syntax in tokens N0 ahead of the current one, + through N1 ahead of the current one, inclusive. (For example, if N0 and N1 + are both zero, this requests the syntax for the current token.) + + The caller must eventually free the returned string (with free()). The + syntax is encoded in UTF-8 and in the original form supplied to the lexer so + that, for example, it may include comments, spaces, and new-lines if it + spans multiple tokens. Macro expansion, however, has already been + performed. */ +char * +lex_next_representation (const struct lexer *lexer, int n0, int n1) +{ + const struct lex_source *src = lex_source__ (lexer); + return (src + ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs) + : xstrdup ("")); +} + + +/* Returns the text of the syntax in tokens with offsets OFS0 to OFS1, + inclusive. (For example, if OFS0 and OFS1 are both zero, this requests the + syntax for the first token in the current command.) + + The caller must eventually free the returned string (with free()). The + syntax is encoded in UTF-8 and in the original form supplied to the lexer so + that, for example, it may include comments, spaces, and new-lines if it + spans multiple tokens. Macro expansion, however, has already been + performed. */ +char * +lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1) +{ + const struct lex_source *src = lex_source__ (lexer); + return src ? lex_source_syntax__ (src, ofs0, ofs1) : xstrdup (""); +} + +/* Returns true if the token N ahead of the current one was produced by macro + expansion, false otherwise. 
*/ +bool +lex_next_is_from_macro (const struct lexer *lexer, int n) +{ + return lex_next__ (lexer, n)->macro_rep != NULL; +} + +static bool +lex_tokens_match (const struct token *actual, const struct token *expected) +{ + if (actual->type != expected->type) + return false; + + switch (actual->type) + { + case T_POS_NUM: + case T_NEG_NUM: + return actual->number == expected->number; + + case T_ID: + return lex_id_match (expected->string, actual->string); + + case T_STRING: + return (actual->string.length == expected->string.length + && !memcmp (actual->string.string, expected->string.string, + actual->string.length)); + + default: + return true; + } +} + +static size_t +lex_at_phrase__ (struct lexer *lexer, const char *s) +{ + struct string_lexer slex; + struct token token; + + size_t i = 0; + string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true); + while (string_lexer_next (&slex, &token)) + { + bool match = lex_tokens_match (lex_next (lexer, i++), &token); + token_uninit (&token); + if (!match) + return 0; + } + return i; +} + +/* If LEXER is positioned at the sequence of tokens that may be parsed from S, + returns true. Otherwise, returns false. + + S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS", + "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their + first three letters. */ +bool +lex_at_phrase (struct lexer *lexer, const char *s) +{ + return lex_at_phrase__ (lexer, s) > 0; +} + +/* If LEXER is positioned at the sequence of tokens that may be parsed from S, + skips it and returns true. Otherwise, returns false. + + S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS", + "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their + first three letters. */ +bool +lex_match_phrase (struct lexer *lexer, const char *s) +{ + size_t n = lex_at_phrase__ (lexer, s); + if (n > 0) + lex_get_n (lexer, n); + return n > 0; +} + +/* Returns the 1-based line number of the source text at the byte OFFSET in + SRC. */ +static int +lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset) +{ + size_t lo = 0; + size_t hi = src->n_lines; + for (;;) + { + size_t mid = (lo + hi) / 2; + if (mid + 1 >= src->n_lines) + return src->n_lines; + else if (offset >= src->lines[mid + 1]) + lo = mid; + else if (offset < src->lines[mid]) + hi = mid; + else + return mid + 1; + } +} + +/* Returns the 1-based column number of the source text at the byte OFFSET in + SRC. */ +static int +lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset) +{ + const char *newline = memrchr (src->buffer, '\n', offset); + size_t line_ofs = newline ? 
newline - src->buffer + 1 : 0; + return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1; +} + +static struct msg_point +lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset) +{ + return (struct msg_point) { + .line = lex_source_ofs_to_line_number (src, offset), + .column = lex_source_ofs_to_column_number (src, offset), + }; +} + +static struct msg_point +lex_token_start_point (const struct lex_source *src, + const struct lex_token *token) +{ + return lex_source_ofs_to_point__ (src, token->token_pos); +} + +static struct msg_point +lex_token_end_point (const struct lex_source *src, + const struct lex_token *token) +{ + return lex_source_ofs_to_point__ (src, lex_token_end (token)); +} + +static struct msg_location +lex_token_location (const struct lex_source *src, + const struct lex_token *t0, + const struct lex_token *t1) +{ + return (struct msg_location) { + .file_name = intern_new_if_nonnull (src->reader->file_name), + .start = lex_token_start_point (src, t0), + .end = lex_token_end_point (src, t1), + }; +} + +static struct msg_location * +lex_token_location_rw (const struct lex_source *src, + const struct lex_token *t0, + const struct lex_token *t1) +{ + struct msg_location location = lex_token_location (src, t0, t1); + return msg_location_dup (&location); +} + +static struct msg_location * +lex_source_get_location (const struct lex_source *src, int n0, int n1) +{ + return lex_token_location_rw (src, + lex_source_next__ (src, n0), + lex_source_next__ (src, n1)); +} + +/* Returns the name of the syntax file from which the current command is drawn. + Returns NULL for a T_STOP token or if the command's source does not have + line numbers. + + There is no version of this function that takes an N argument because + lookahead only works to the end of a command and any given command is always + within a single syntax file. */ +const char * +lex_get_file_name (const struct lexer *lexer) +{ + struct lex_source *src = lex_source__ (lexer); + return src == NULL ? NULL : src->reader->file_name; } -/* Makes the current token become the next token to be read; the - current token is set to T. */ -void -lex_put_back (struct lexer *lexer, int t) -{ - save_token (lexer); - lexer->token = t; -} - -/* Makes the current token become the next token to be read; the - current token is set to the identifier ID. */ -void -lex_put_back_id (struct lexer *lexer, const char *id) +/* Returns a newly allocated msg_location for the syntax that represents tokens + with 0-based offsets N0...N1, inclusive, from the current token. The caller + must eventually free the location (with msg_location_destroy()). */ +struct msg_location * +lex_get_location (const struct lexer *lexer, int n0, int n1) { - assert (lex_id_to_token (ss_cstr (id)) == T_ID); - save_token (lexer); - lexer->token = T_ID; - ds_assign_cstr (&lexer->tokstr, id); - str_copy_trunc (lexer->tokid, sizeof lexer->tokid, ds_cstr (&lexer->tokstr)); + struct msg_location *loc = xmalloc (sizeof *loc); + *loc = (struct msg_location) { + .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)), + .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)), + .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)), + .src = lex_source__ (lexer), + }; + lex_source_ref (loc->src); + return loc; } - -/* Weird line processing functions. */ -/* Returns the entire contents of the current line. 
*/ const char * -lex_entire_line (const struct lexer *lexer) +lex_get_encoding (const struct lexer *lexer) { - return ds_cstr (&lexer->line_buffer); + struct lex_source *src = lex_source__ (lexer); + return src == NULL ? NULL : src->reader->encoding; } -const struct string * -lex_entire_line_ds (const struct lexer *lexer) +/* Returns the syntax mode for the syntax file from which the current drawn is + drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source + does not have line numbers. + + There is no version of this function that takes an N argument because + lookahead only works to the end of a command and any given command is always + within a single syntax file. */ +enum segmenter_mode +lex_get_syntax_mode (const struct lexer *lexer) { - return &lexer->line_buffer; + struct lex_source *src = lex_source__ (lexer); + return src == NULL ? SEG_MODE_AUTO : src->reader->syntax; } -/* As lex_entire_line(), but only returns the part of the current line - that hasn't already been tokenized. */ -const char * -lex_rest_of_line (const struct lexer *lexer) +/* Returns the error mode for the syntax file from which the current drawn is + drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's + source does not have line numbers. + + There is no version of this function that takes an N argument because + lookahead only works to the end of a command and any given command is always + within a single syntax file. */ +enum lex_error_mode +lex_get_error_mode (const struct lexer *lexer) { - return lexer->prog; + struct lex_source *src = lex_source__ (lexer); + return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error; } -/* Returns true if the current line ends in a terminal dot, - false otherwise. */ -bool -lex_end_dot (const struct lexer *lexer) +/* If the source that LEXER is currently reading has error mode + LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next + token to be read comes directly from whatever is next read from the stream. + + It makes sense to call this function after encountering an error in a + command entered on the console, because usually the user would prefer not to + have cascading errors. */ +void +lex_interactive_reset (struct lexer *lexer) { - return lexer->dot; + struct lex_source *src = lex_source__ (lexer); + if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL) + { + src->length = 0; + src->journal_pos = src->seg_pos = 0; + src->n_lines = 0; + src->suppress_next_newline = false; + src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter), + false); + lex_stage_clear (&src->pp); + lex_stage_clear (&src->merge); + lex_source_clear_parse (src); + lex_source_push_endcmd__ (src); + } } -/* Causes the rest of the current input line to be ignored for - tokenization purposes. */ +/* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */ void -lex_discard_line (struct lexer *lexer) +lex_discard_rest_of_command (struct lexer *lexer) { - ds_cstr (&lexer->line_buffer); /* Ensures ds_end points to something valid */ - lexer->prog = ds_end (&lexer->line_buffer); - lexer->dot = false; - lexer->put_token = 0; + while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD) + lex_get (lexer); } - -/* Discards the rest of the current command. - When we're reading commands from a file, we skip tokens until - a terminal dot or EOF. 
- When we're reading commands interactively from the user, - that's just discarding the current line, because presumably - the user doesn't want to finish typing a command that will be - ignored anyway. */ +/* Discards all lookahead tokens in LEXER, then discards all input sources + until it encounters one with error mode LEX_ERROR_TERMINAL or until it + runs out of input sources. */ void -lex_discard_rest_of_command (struct lexer *lexer) +lex_discard_noninteractive (struct lexer *lexer) { - if (!getl_is_interactive (lexer->ss)) + struct lex_source *src = lex_source__ (lexer); + + if (src != NULL) { - while (lexer->token != T_STOP && lexer->token != '.') - lex_get (lexer); + lex_stage_clear (&src->pp); + lex_stage_clear (&src->merge); + lex_source_clear_parse (src); + + for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL; + src = lex_source__ (lexer)) + { + ll_remove (&src->ll); + lex_source_unref (src); + } } - else - lex_discard_line (lexer); } -/* Weird line reading functions. */ +static void +lex_source_expand__ (struct lex_source *src) +{ + if (src->length >= src->allocated) + src->buffer = x2realloc (src->buffer, &src->allocated); +} -/* Remove C-style comments in STRING, begun by slash-star and - terminated by star-slash or newline. */ static void -strip_comments (struct string *string) +lex_source_read__ (struct lex_source *src) +{ + do + { + lex_source_expand__ (src); + + size_t space = src->allocated - src->length; + enum prompt_style prompt = segmenter_get_prompt (&src->segmenter); + size_t n = src->reader->class->read (src->reader, + &src->buffer[src->length], + space, prompt); + assert (n <= space); + + if (n == 0) + { + /* End of input. */ + src->reader->eof = true; + return; + } + + src->length += n; + } + while (!memchr (&src->buffer[src->seg_pos], '\n', + src->length - src->seg_pos)); +} + +static struct lex_source * +lex_source__ (const struct lexer *lexer) { - char *cp; - int quote; - bool in_comment; + return (ll_is_empty (&lexer->sources) ? NULL + : ll_data (ll_head (&lexer->sources), struct lex_source, ll)); +} - in_comment = false; - quote = EOF; - for (cp = ds_cstr (string); *cp; ) +/* Returns the text of the syntax in SRC for tokens with offsets OFS0 through + OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are + both zero, this requests the syntax for the first token in the current + command.) The caller must eventually free the returned string (with + free()). The syntax is encoded in UTF-8 and in the original form supplied + to the lexer so that, for example, it may include comments, spaces, and + new-lines if it spans multiple tokens. Macro expansion, however, has + already been performed. */ +static char * +lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1) +{ + struct string s = DS_EMPTY_INITIALIZER; + for (size_t i = ofs0; i <= ofs1; ) { - /* If we're not in a comment, check for quote marks. */ - if (!in_comment) + /* Find [I,J) as the longest sequence of tokens not produced by macro + expansion, or otherwise the longest sequence expanded from a single + macro call. 
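+
+         Tokens that came from the same macro expansion share the same
+         macro_rep pointer, and tokens read directly from the source have a
+         null macro_rep, so comparing macro_rep pointers in the loop below is
+         enough to find where the sequence ends.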
*/ + const struct lex_token *first = lex_source_ofs__ (src, i); + size_t j; + for (j = i + 1; j <= ofs1; j++) { - if (*cp == quote) - quote = EOF; - else if (*cp == '\'' || *cp == '"') - quote = *cp; + const struct lex_token *cur = lex_source_ofs__ (src, j); + if ((first->macro_rep != NULL) != (cur->macro_rep != NULL) + || first->macro_rep != cur->macro_rep) + break; } + const struct lex_token *last = lex_source_ofs__ (src, j - 1); - /* If we're not inside a quotation, check for comment. */ - if (quote == EOF) + /* Now add the syntax for this sequence of tokens to SRC. */ + if (!ds_is_empty (&s)) + ds_put_byte (&s, ' '); + if (!first->macro_rep) { - if (cp[0] == '/' && cp[1] == '*') - { - in_comment = true; - *cp++ = ' '; - *cp++ = ' '; - continue; - } - else if (in_comment && cp[0] == '*' && cp[1] == '/') - { - in_comment = false; - *cp++ = ' '; - *cp++ = ' '; - continue; - } + size_t start = first->token_pos; + size_t end = last->token_pos + last->token_len; + ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start)); + } + else + { + size_t start = first->ofs; + size_t end = last->ofs + last->len; + ds_put_substring (&s, ss_buffer (first->macro_rep + start, + end - start)); } - /* Check commenting. */ - if (in_comment) - *cp = ' '; - cp++; + i = j; } + return ds_steal_cstr (&s); } -/* Prepares LINE, which is subject to the given SYNTAX rules, for - tokenization by stripping comments and determining whether it - is the beginning or end of a command and storing into - *LINE_STARTS_COMMAND and *LINE_ENDS_COMMAND appropriately. */ -void -lex_preprocess_line (struct string *line, - enum syntax_mode syntax, - bool *line_starts_command, - bool *line_ends_command) -{ - strip_comments (line); - ds_rtrim (line, ss_cstr (CC_SPACES)); - *line_ends_command = (ds_chomp (line, settings_get_endcmd ()) - || (ds_is_empty (line) && settings_get_nulline ())); - *line_starts_command = false; - if (syntax == GETL_BATCH) - { - int first = ds_first (line); - *line_starts_command = !c_isspace (first); - if (first == '+' || first == '-') - *ds_data (line) = ' '; - } +static bool +lex_source_contains_macro_call (struct lex_source *src, int n0, int n1) +{ + for (int i = n0; i <= n1; i++) + if (lex_source_next__ (src, i)->macro_rep) + return true; + return false; } -/* Reads a line, without performing any preprocessing. - Sets *SYNTAX, if SYNTAX is non-null, to the line's syntax - mode. */ -bool -lex_get_line_raw (struct lexer *lexer) +/* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the + raw UTF-8 syntax for the macro call (not for the expansion) and for any + other tokens included in that range. The syntax is encoded in UTF-8 and in + the original form supplied to the lexer so that, for example, it may include + comments, spaces, and new-lines if it spans multiple tokens. + + Returns an empty string if the token range doesn't include a macro call. + + The caller must not modify or free the returned string. 
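+
+   The returned substring points directly into SRC's buffer, so it remains
+   valid only until more input is read into SRC (which may reallocate the
+   buffer) or SRC is destroyed.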
*/ +static struct substring +lex_source_get_macro_call (struct lex_source *src, int n0, int n1) { - bool ok = getl_read_line (lexer->ss, &lexer->line_buffer); - enum syntax_mode mode = lex_current_syntax_mode (lexer); - journal_write (mode == GETL_BATCH, ds_cstr (&lexer->line_buffer)); + if (!lex_source_contains_macro_call (src, n0, n1)) + return ss_empty (); + + const struct lex_token *token0 = lex_source_next__ (src, n0); + const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1)); + size_t start = token0->token_pos; + size_t end = token1->token_pos + token1->token_len; - return ok; + return ss_buffer (&src->buffer[start], end - start); } -/* Reads a line for use by the tokenizer, and preprocesses it by - removing comments, stripping trailing whitespace and the - terminal dot, and removing leading indentors. */ -bool -lex_get_line (struct lexer *lexer) +static void +lex_source_error_valist (struct lex_source *src, int n0, int n1, + const char *format, va_list args) { - bool line_starts_command; + const struct lex_token *token; + struct string s; - if (!lex_get_line_raw (lexer)) + ds_init_empty (&s); + + token = lex_source_next__ (src, n0); + if (token->token.type == T_ENDCMD) + ds_put_cstr (&s, _("Syntax error at end of command")); + else { - lexer->prog = NULL; - return false; + /* Get the syntax that caused the error. */ + char *raw_syntax = lex_source_syntax__ (src, n0 + src->parse_ofs, + n1 + src->parse_ofs); + char syntax[64]; + str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax); + free (raw_syntax); + + /* Get the macro call(s) that expanded to the syntax that caused the + error. */ + char call[64]; + str_ellipsize (lex_source_get_macro_call (src, n0, n1), + call, sizeof call); + + if (syntax[0]) + { + if (call[0]) + ds_put_format (&s, + _("Syntax error at `%s' (in expansion of `%s')"), + syntax, call); + else + ds_put_format (&s, _("Syntax error at `%s'"), syntax); + } + else + { + if (call[0]) + ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"), + call); + else + ds_put_cstr (&s, _("Syntax error")); + } } - lex_preprocess_line (&lexer->line_buffer, - lex_current_syntax_mode (lexer), - &line_starts_command, &lexer->dot); - - if (line_starts_command) - lexer->put_token = '.'; - - lexer->prog = ds_cstr (&lexer->line_buffer); - return true; + if (format) + { + ds_put_cstr (&s, ": "); + ds_put_vformat (&s, format, args); + } + if (ds_last (&s) != '.') + ds_put_byte (&s, '.'); + + struct msg *m = xmalloc (sizeof *m); + *m = (struct msg) { + .category = MSG_C_SYNTAX, + .severity = MSG_S_ERROR, + .location = lex_source_get_location (src, n0, n1), + .text = ds_steal_cstr (&s), + }; + msg_emit (m); } - -/* Token names. */ -/* Returns the name of a token. 
*/ -const char * -lex_token_name (int token) +static void +lex_get_error (struct lex_source *src, const struct lex_token *token) { - if (lex_is_keyword (token)) - return lex_id_name (token); - else if (token < 256) - { - static char t[256][2]; - char *s = t[token]; - s[0] = token; - s[1] = '\0'; - return s; - } - else - NOT_REACHED (); + char syntax[64]; + str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len), + syntax, sizeof syntax); + + struct string s = DS_EMPTY_INITIALIZER; + ds_put_format (&s, _("Syntax error at `%s'"), syntax); + ds_put_format (&s, ": %s", token->token.string.string); + + struct msg *m = xmalloc (sizeof *m); + *m = (struct msg) { + .category = MSG_C_SYNTAX, + .severity = MSG_S_ERROR, + .location = lex_token_location_rw (src, token, token), + .text = ds_steal_cstr (&s), + }; + msg_emit (m); } -/* Returns an ASCII representation of the current token as a - malloc()'d string. */ -char * -lex_token_representation (struct lexer *lexer) +/* Attempts to append an additional token to 'pp' in SRC, reading more from the + underlying lex_reader if necessary. Returns true if a new token was added + to SRC's deque, false otherwise. The caller should retry failures unless + SRC's 'eof' marker was set to true indicating that there will be no more + tokens from this source. */ +static bool +lex_source_try_get_pp (struct lex_source *src) { - char *token_rep; + /* Append a new token to SRC and initialize it. */ + struct lex_token *token = xmalloc (sizeof *token); + token->token = (struct token) { .type = T_STOP }; + token->macro_rep = NULL; + token->ref_cnt = NULL; + token->token_pos = src->seg_pos; + + /* Extract a segment. */ + const char *segment; + enum segment_type seg_type; + int seg_len; + for (;;) + { + segment = &src->buffer[src->seg_pos]; + seg_len = segmenter_push (&src->segmenter, segment, + src->length - src->seg_pos, + src->reader->eof, &seg_type); + if (seg_len >= 0) + break; + + /* The segmenter needs more input to produce a segment. */ + assert (!src->reader->eof); + lex_source_read__ (src); + } - switch (lexer->token) + /* Update state based on the segment. */ + token->token_len = seg_len; + src->seg_pos += seg_len; + if (seg_type == SEG_NEWLINE) { - case T_ID: - case T_POS_NUM: - case T_NEG_NUM: - return ds_xstrdup (&lexer->tokstr); - break; + if (src->n_lines >= src->allocated_lines) + src->lines = x2nrealloc (src->lines, &src->allocated_lines, + sizeof *src->lines); + src->lines[src->n_lines++] = src->seg_pos; + } - case T_STRING: - { - int hexstring = 0; - char *sp, *dp; - - for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++) - if (!c_isprint ((unsigned char) *sp)) - { - hexstring = 1; - break; - } - - token_rep = xmalloc (2 + ds_length (&lexer->tokstr) * 2 + 1 + 1); - - dp = token_rep; - if (hexstring) - *dp++ = 'X'; - *dp++ = '\''; - - if (!hexstring) - for (sp = ds_cstr (&lexer->tokstr); *sp; ) - { - if (*sp == '\'') - *dp++ = '\''; - *dp++ = (unsigned char) *sp++; - } - else - for (sp = ds_cstr (&lexer->tokstr); sp < ds_end (&lexer->tokstr); sp++) - { - *dp++ = (((unsigned char) *sp) >> 4)["0123456789ABCDEF"]; - *dp++ = (((unsigned char) *sp) & 15)["0123456789ABCDEF"]; - } - *dp++ = '\''; - *dp = '\0'; - - return token_rep; - } - break; + /* Get a token from the segment. 
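+     The segment may yield a real token (TOKENIZE_TOKEN), no token at all
+     (TOKENIZE_EMPTY), or an invalid token (TOKENIZE_ERROR); the switch
+     statement further below handles each of these cases.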
*/ + enum tokenize_result result = token_from_segment ( + seg_type, ss_buffer (segment, seg_len), &token->token); - case T_STOP: - token_rep = xmalloc (1); - *token_rep = '\0'; - return token_rep; + /* If we've reached the end of a line, or the end of a command, then pass + the line to the output engine as a syntax text item. */ + int n_lines = seg_type == SEG_NEWLINE; + if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline) + { + n_lines++; + src->suppress_next_newline = true; + } + else if (n_lines > 0 && src->suppress_next_newline) + { + n_lines--; + src->suppress_next_newline = false; + } + for (int i = 0; i < n_lines; i++) + { + /* Beginning of line. */ + const char *line = &src->buffer[src->journal_pos]; + + /* Calculate line length, including \n or \r\n end-of-line if present. + + We use src->length even though that may be beyond what we've actually + converted to tokens. That's because, if we're emitting the line due + to SEG_END_COMMAND, we want to take the whole line through the + newline, not just through the '.'. */ + size_t max_len = src->length - src->journal_pos; + const char *newline = memchr (line, '\n', max_len); + size_t line_len = newline ? newline - line + 1 : max_len; + + /* Calculate line length excluding end-of-line. */ + size_t copy_len = line_len; + if (copy_len > 0 && line[copy_len - 1] == '\n') + copy_len--; + if (copy_len > 0 && line[copy_len - 1] == '\r') + copy_len--; + + /* Submit the line as syntax. */ + output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, + xmemdup0 (line, copy_len), + NULL)); + + src->journal_pos += line_len; + } - case T_EXP: - return xstrdup ("**"); + switch (result) + { + case TOKENIZE_ERROR: + lex_get_error (src, token); + /* Fall through. */ + case TOKENIZE_EMPTY: + lex_token_destroy (token); + return false; - default: - return xstrdup (lex_token_name (lexer->token)); + case TOKENIZE_TOKEN: + if (token->token.type == T_STOP) + { + token->token.type = T_ENDCMD; + src->eof = true; + } + lex_stage_push_last (&src->pp, token); + return true; } - NOT_REACHED (); } - -/* Really weird functions. */ -/* Most of the time, a `-' is a lead-in to a negative number. But - sometimes it's actually part of the syntax. If a dash can be part - of syntax then this function is called to rip it off of a - number. */ -void -lex_negative_to_dash (struct lexer *lexer) +/* Attempts to append a new token to SRC. Returns true if successful, false on + failure. On failure, the end of SRC has been reached and no more tokens + will be forthcoming from it. + + Does not make the new token available for lookahead yet; the caller must + adjust SRC's 'middle' pointer to do so. */ +static bool +lex_source_get_pp (struct lex_source *src) { - if (lexer->token == T_NEG_NUM) - { - lexer->token = T_POS_NUM; - lexer->tokval = -lexer->tokval; - ds_assign_substring (&lexer->tokstr, ds_substr (&lexer->tokstr, 1, SIZE_MAX)); - save_token (lexer); - lexer->token = '-'; - } + while (!src->eof) + if (lex_source_try_get_pp (src)) + return true; + return false; } -/* Skip a COMMENT command. */ -void -lex_skip_comment (struct lexer *lexer) +static bool +lex_source_try_get_merge (const struct lex_source *src_) { - for (;;) + struct lex_source *src = CONST_CAST (struct lex_source *, src_); + + if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src)) + return false; + + if (!settings_get_mexpand ()) + { + lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp)); + return true; + } + + /* Now pass tokens one-by-one to the macro expander. 
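+
+     A return value of zero from macro_call_add() below means that the
+     expander needs more tokens before it can decide, a negative value means
+     that the tokens do not form a macro call after all, and a positive value
+     is the number of tokens that make up a complete macro call.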
+ + In the common case where there is no macro to expand, the loop is not + entered. */ + struct macro_call *mc; + int n_call = macro_call_create (src->lexer->macros, + &lex_stage_first (&src->pp)->token, &mc); + for (int ofs = 1; !n_call; ofs++) { - if (!lex_get_line (lexer)) + if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src)) { - lexer->put_token = T_STOP; - lexer->prog = NULL; - return; + /* This should not be reachable because we always get a T_ENDCMD at + the end of an input file (transformed from T_STOP by + lex_source_try_get_pp()) and the macro_expander should always + terminate expansion on T_ENDCMD. */ + NOT_REACHED (); } - if (lexer->put_token == '.') - break; + const struct lex_token *t = lex_stage_nth (&src->pp, ofs); + const struct macro_token mt = { + .token = t->token, + .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len), + }; + const struct msg_location loc = lex_token_location (src, t, t); + n_call = macro_call_add (mc, &mt, &loc); + } + if (n_call < 0) + { + /* False alarm: no macro expansion after all. Use first token as + lookahead. We'll retry macro expansion from the second token next + time around. */ + macro_call_destroy (mc); + lex_stage_shift (&src->merge, &src->pp, 1); + return true; + } - ds_cstr (&lexer->line_buffer); /* Ensures ds_end will point to a valid char */ - lexer->prog = ds_end (&lexer->line_buffer); - if (lexer->dot) - break; + /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive, + are a macro call. (These are likely to be the only tokens in 'pp'.) + Expand them. */ + const struct lex_token *c0 = lex_stage_first (&src->pp); + const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1); + struct macro_tokens expansion = { .n = 0 }; + struct msg_location loc = lex_token_location (src, c0, c1); + macro_call_expand (mc, src->reader->syntax, &loc, &expansion); + macro_call_destroy (mc); + + /* Convert the macro expansion into syntax for possible error messages + later. */ + size_t *ofs = xnmalloc (expansion.n, sizeof *ofs); + size_t *len = xnmalloc (expansion.n, sizeof *len); + struct string s = DS_EMPTY_INITIALIZER; + macro_tokens_to_syntax (&expansion, &s, ofs, len); + + if (settings_get_mprint ()) + output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s), + _("Macro Expansion"))); + + /* Append the macro expansion tokens to the lookahead. */ + if (expansion.n > 0) + { + char *macro_rep = ds_steal_cstr (&s); + size_t *ref_cnt = xmalloc (sizeof *ref_cnt); + *ref_cnt = expansion.n; + for (size_t i = 0; i < expansion.n; i++) + { + struct lex_token *token = xmalloc (sizeof *token); + *token = (struct lex_token) { + .token = expansion.mts[i].token, + .token_pos = c0->token_pos, + .token_len = (c1->token_pos + c1->token_len) - c0->token_pos, + .macro_rep = macro_rep, + .ofs = ofs[i], + .len = len[i], + .ref_cnt = ref_cnt, + }; + lex_stage_push_last (&src->merge, token); + + ss_dealloc (&expansion.mts[i].syntax); + } } + else + ds_destroy (&s); + free (expansion.mts); + free (ofs); + free (len); + + /* Destroy the tokens for the call. */ + for (size_t i = 0; i < n_call; i++) + lex_stage_pop_first (&src->pp); + + return expansion.n > 0; } - -/* Private functions. */ -/* When invoked, tokstr contains a string of binary, octal, or - hex digits, according to TYPE. The string is converted to - characters having the specified values. 
*/ -static void -convert_numeric_string_to_char_string (struct lexer *lexer, - enum string_type type) -{ - const char *base_name; - int base; - int chars_per_byte; - size_t byte_cnt; - size_t i; - char *p; +/* Attempts to obtain at least one new token into 'merge' in SRC. - switch (type) - { - case BINARY_STRING: - base_name = _("binary"); - base = 2; - chars_per_byte = 8; - break; - case OCTAL_STRING: - base_name = _("octal"); - base = 8; - chars_per_byte = 3; - break; - case HEX_STRING: - base_name = _("hex"); - base = 16; - chars_per_byte = 2; - break; - default: - NOT_REACHED (); - } + Returns true if successful, false on failure. In the latter case, SRC is + exhausted and 'src->eof' is now true. */ +static bool +lex_source_get_merge (struct lex_source *src) +{ + while (!src->eof) + if (lex_source_try_get_merge (src)) + return true; + return false; +} - byte_cnt = ds_length (&lexer->tokstr) / chars_per_byte; - if (ds_length (&lexer->tokstr) % chars_per_byte) - msg (SE, _("String of %s digits has %zu characters, which is not a " - "multiple of %d."), - base_name, ds_length (&lexer->tokstr), chars_per_byte); +/* Attempts to obtain at least one new token into 'lookahead' in SRC. - p = ds_cstr (&lexer->tokstr); - for (i = 0; i < byte_cnt; i++) + Returns true if successful, false on failure. In the latter case, SRC is + exhausted and 'src->eof' is now true. */ +static bool +lex_source_get_parse (struct lex_source *src) +{ + struct merger m = MERGER_INIT; + struct token out; + for (size_t i = 0; ; i++) { - int value; - int j; + while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src)) + { + /* We always get a T_ENDCMD at the end of an input file + (transformed from T_STOP by lex_source_try_get_pp()) and + merger_add() should never return -1 on T_ENDCMD. */ + assert (lex_stage_is_empty (&src->merge)); + return false; + } - value = 0; - for (j = 0; j < chars_per_byte; j++, p++) - { - int v; + int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token, + &out); + if (!retval) + { + lex_source_push_parse (src, lex_stage_take_first (&src->merge)); + return true; + } + else if (retval > 0) + { + /* Add a token that merges all the tokens together. */ + const struct lex_token *first = lex_stage_first (&src->merge); + const struct lex_token *last = lex_stage_nth (&src->merge, + retval - 1); + bool macro = first->macro_rep && first->macro_rep == last->macro_rep; + struct lex_token *t = xmalloc (sizeof *t); + *t = (struct lex_token) { + .token = out, + .token_pos = first->token_pos, + .token_len = (last->token_pos - first->token_pos) + last->token_len, + + /* This works well if all the tokens were not expanded from macros, + or if they came from the same macro expansion. It just gives up + in the other (corner) cases. */ + .macro_rep = macro ? first->macro_rep : NULL, + .ofs = macro ? first->ofs : 0, + .len = macro ? (last->ofs - first->ofs) + last->len : 0, + .ref_cnt = macro ? 
first->ref_cnt : NULL, + }; + if (t->ref_cnt) + ++*t->ref_cnt; + lex_source_push_parse (src, t); + + for (int i = 0; i < retval; i++) + lex_stage_pop_first (&src->merge); + return true; + } + } +} + +static void +lex_source_push_endcmd__ (struct lex_source *src) +{ + assert (src->n_parse == 0); - if (*p >= '0' && *p <= '9') - v = *p - '0'; - else - { - static const char alpha[] = "abcdef"; - const char *q = strchr (alpha, tolower ((unsigned char) *p)); + struct lex_token *token = xmalloc (sizeof *token); + *token = (struct lex_token) { .token = { .type = T_ENDCMD } }; + lex_source_push_parse (src, token); +} - if (q) - v = q - alpha + 10; - else - v = base; - } +static void +lex_source_push_parse (struct lex_source *src, struct lex_token *token) +{ + if (src->n_parse >= src->allocated_parse) + src->parse = x2nrealloc (src->parse, &src->allocated_parse, + sizeof *src->parse); + src->parse[src->n_parse++] = token; +} - if (v >= base) - msg (SE, _("`%c' is not a valid %s digit."), *p, base_name); +static void +lex_source_clear_parse (struct lex_source *src) +{ + for (size_t i = 0; i < src->n_parse; i++) + lex_token_destroy (src->parse[i]); + src->n_parse = src->parse_ofs = 0; +} - value = value * base + v; - } +static struct lex_source * +lex_source_create (struct lexer *lexer, struct lex_reader *reader) +{ + size_t allocated_lines = 4; + size_t *lines = xmalloc (allocated_lines * sizeof *lines); + *lines = 0; + + struct lex_source *src = xmalloc (sizeof *src); + *src = (struct lex_source) { + .n_refs = 1, + .reader = reader, + .segmenter = segmenter_init (reader->syntax, false), + .lexer = lexer, + .lines = lines, + .n_lines = 1, + .allocated_lines = allocated_lines, + }; - ds_cstr (&lexer->tokstr)[i] = (unsigned char) value; - } + lex_source_push_endcmd__ (src); - ds_truncate (&lexer->tokstr, byte_cnt); + return src; } -/* Parses a string from the input buffer into tokstr. The input - buffer pointer lexer->prog must point to the initial single or double - quote. TYPE indicates the type of string to be parsed. - Returns token type. */ -static int -parse_string (struct lexer *lexer, enum string_type type) +void +lex_set_message_handler (struct lexer *lexer, + void (*output_msg) (const struct msg *, + struct lexer *)) { - if (type != CHARACTER_STRING) - lexer->prog++; + struct msg_handler msg_handler = { + .output_msg = (void (*)(const struct msg *, void *)) output_msg, + .aux = lexer, + .lex_source_ref = lex_source_ref, + .lex_source_unref = lex_source_unref, + .lex_source_get_line = lex_source_get_line, + }; + msg_set_handler (&msg_handler); +} - /* Accumulate the entire string, joining sections indicated by + - signs. */ - for (;;) +void +lex_source_ref (const struct lex_source *src_) +{ + struct lex_source *src = CONST_CAST (struct lex_source *, src_); + if (src) { - /* Single or double quote. */ - int c = *lexer->prog++; + assert (src->n_refs > 0); + src->n_refs++; + } +} - /* Accumulate section. */ - for (;;) - { - /* Check end of line. */ - if (*lexer->prog == '\0') - { - msg (SE, _("Unterminated string constant.")); - goto finish; - } - - /* Double quote characters to embed them in strings. 
*/ - if (*lexer->prog == c) - { - if (lexer->prog[1] == c) - lexer->prog++; - else - break; - } - - ds_put_char (&lexer->tokstr, *lexer->prog++); - } - lexer->prog++; +void +lex_source_unref (struct lex_source *src) +{ + if (!src) + return; + + assert (src->n_refs > 0); + if (--src->n_refs > 0) + return; + + char *file_name = src->reader->file_name; + char *encoding = src->reader->encoding; + if (src->reader->class->destroy != NULL) + src->reader->class->destroy (src->reader); + free (file_name); + free (encoding); + free (src->buffer); + free (src->lines); + lex_stage_uninit (&src->pp); + lex_stage_uninit (&src->merge); + lex_source_clear_parse (src); + free (src->parse); + free (src); +} + +struct lex_file_reader + { + struct lex_reader reader; + struct u8_istream *istream; + }; - /* Skip whitespace after final quote mark. */ - if (lexer->prog == NULL) - break; - for (;;) - { - while (c_isspace ((unsigned char) *lexer->prog)) - lexer->prog++; - if (*lexer->prog) - break; +static struct lex_reader_class lex_file_reader_class; - if (lexer->dot) - goto finish; +/* Creates and returns a new lex_reader that will read from file FILE_NAME (or + from stdin if FILE_NAME is "-"). The file is expected to be encoded with + ENCODING, which should take one of the forms accepted by + u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error + mode of the new reader, respectively. - if (!lex_get_line (lexer)) - goto finish; - } + Returns a null pointer if FILE_NAME cannot be opened. */ +struct lex_reader * +lex_reader_for_file (const char *file_name, const char *encoding, + enum segmenter_mode syntax, + enum lex_error_mode error) +{ + struct lex_file_reader *r; + struct u8_istream *istream; - /* Skip plus sign. */ - if (*lexer->prog != '+') - break; - lexer->prog++; + istream = (!strcmp(file_name, "-") + ? u8_istream_for_fd (encoding, STDIN_FILENO) + : u8_istream_for_file (encoding, file_name, O_RDONLY)); + if (istream == NULL) + { + msg (ME, _("Opening `%s': %s."), file_name, strerror (errno)); + return NULL; + } - /* Skip whitespace after plus sign. */ - if (lexer->prog == NULL) - break; - for (;;) - { - while (c_isspace ((unsigned char) *lexer->prog)) - lexer->prog++; - if (*lexer->prog) - break; + r = xmalloc (sizeof *r); + lex_reader_init (&r->reader, &lex_file_reader_class); + r->reader.syntax = syntax; + r->reader.error = error; + r->reader.file_name = xstrdup (file_name); + r->reader.encoding = xstrdup_if_nonnull (encoding); + r->reader.line_number = 1; + r->istream = istream; - if (lexer->dot) - goto finish; + return &r->reader; +} - if (!lex_get_line (lexer)) - { - msg (SE, _("Unexpected end of file in string concatenation.")); - goto finish; - } - } +static struct lex_file_reader * +lex_file_reader_cast (struct lex_reader *r) +{ + return UP_CAST (r, struct lex_file_reader, reader); +} - /* Ensure that a valid string follows. */ - if (*lexer->prog != '\'' && *lexer->prog != '"') - { - msg (SE, _("String expected following `+'.")); - goto finish; - } +static size_t +lex_file_read (struct lex_reader *r_, char *buf, size_t n, + enum prompt_style prompt_style UNUSED) +{ + struct lex_file_reader *r = lex_file_reader_cast (r_); + ssize_t n_read = u8_istream_read (r->istream, buf, n); + if (n_read < 0) + { + msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno)); + return 0; } + return n_read; +} - /* We come here when we've finished concatenating all the string sections - into one large string. 
*/ -finish: - if (type != CHARACTER_STRING) - convert_numeric_string_to_char_string (lexer, type); +static void +lex_file_close (struct lex_reader *r_) +{ + struct lex_file_reader *r = lex_file_reader_cast (r_); - if (ds_length (&lexer->tokstr) > 255) + if (u8_istream_fileno (r->istream) != STDIN_FILENO) { - msg (SE, _("String exceeds 255 characters in length (%zu characters)."), - ds_length (&lexer->tokstr)); - ds_truncate (&lexer->tokstr, 255); + if (u8_istream_close (r->istream) != 0) + msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno)); } + else + u8_istream_free (r->istream); - return T_STRING; + free (r); } + +static struct lex_reader_class lex_file_reader_class = + { + lex_file_read, + lex_file_close + }; -#if DUMP_TOKENS -/* Reads one token from the lexer and writes a textual representation - on stdout for debugging purposes. */ -static void -dump_token (struct lexer *lexer) -{ +struct lex_string_reader { - const char *curfn; - int curln; + struct lex_reader reader; + struct substring s; + size_t offset; + }; - curln = getl_source_location (lexer->ss); - curfn = getl_source_name (lexer->ss); - if (curfn) - fprintf (stderr, "%s:%d\t", curfn, curln); - } +static struct lex_reader_class lex_string_reader_class; - switch (lexer->token) - { - case T_ID: - fprintf (stderr, "ID\t%s\n", lexer->tokid); - break; +/* Creates and returns a new lex_reader for the contents of S, which must be + encoded in the given ENCODING. The new reader takes ownership of S and will free it + with ss_dealloc() when it is closed. */ +struct lex_reader * +lex_reader_for_substring_nocopy (struct substring s, const char *encoding) +{ + struct lex_string_reader *r; - case T_POS_NUM: - case T_NEG_NUM: - fprintf (stderr, "NUM\t%f\n", lexer->tokval); - break; + r = xmalloc (sizeof *r); + lex_reader_init (&r->reader, &lex_string_reader_class); + r->reader.syntax = SEG_MODE_AUTO; + r->reader.encoding = xstrdup_if_nonnull (encoding); + r->s = s; + r->offset = 0; - case T_STRING: - fprintf (stderr, "STRING\t\"%s\"\n", ds_cstr (&lexer->tokstr)); - break; + return &r->reader; +} - case T_STOP: - fprintf (stderr, "STOP\n"); - break; +/* Creates and returns a new lex_reader for a copy of null-terminated string S, + which must be encoded in ENCODING. The caller retains ownership of S. */ +struct lex_reader * +lex_reader_for_string (const char *s, const char *encoding) +{ + struct substring ss; + ss_alloc_substring (&ss, ss_cstr (s)); + return lex_reader_for_substring_nocopy (ss, encoding); +} - case T_EXP: - fprintf (stderr, "MISC\tEXP\""); - break; +/* Formats FORMAT as a printf()-like format string and creates and returns a + new lex_reader for the formatted result. */ +struct lex_reader * +lex_reader_for_format (const char *format, const char *encoding, ...) 
+{ + struct lex_reader *r; + va_list args; - case 0: - fprintf (stderr, "MISC\tEOF\n"); - break; + va_start (args, encoding); + r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding); + va_end (args); - default: - if (lex_is_keyword (lexer->token)) - fprintf (stderr, "KEYWORD\t%s\n", lex_token_name (lexer->token)); - else - fprintf (stderr, "PUNCT\t%c\n", lexer->token); - break; - } + return r; } -#endif /* DUMP_TOKENS */ - -/* Token Accessor Functions */ - -int -lex_token (const struct lexer *lexer) +static struct lex_string_reader * +lex_string_reader_cast (struct lex_reader *r) { - return lexer->token; + return UP_CAST (r, struct lex_string_reader, reader); } -double -lex_tokval (const struct lexer *lexer) +static size_t +lex_string_read (struct lex_reader *r_, char *buf, size_t n, + enum prompt_style prompt_style UNUSED) { - return lexer->tokval; + struct lex_string_reader *r = lex_string_reader_cast (r_); + size_t chunk; + + chunk = MIN (n, r->s.length - r->offset); + memcpy (buf, r->s.string + r->offset, chunk); + r->offset += chunk; + + return chunk; } -const char * -lex_tokid (const struct lexer *lexer) +static void +lex_string_close (struct lex_reader *r_) { - return lexer->tokid; + struct lex_string_reader *r = lex_string_reader_cast (r_); + + ss_dealloc (&r->s); + free (r); } -const struct string * -lex_tokstr (const struct lexer *lexer) +static struct lex_reader_class lex_string_reader_class = + { + lex_string_read, + lex_string_close + }; + +struct substring +lex_source_get_line (const struct lex_source *src, int line) { - return &lexer->tokstr; -} + if (line < 1 || line > src->n_lines) + return ss_empty (); -/* If the lexer is positioned at the (pseudo)identifier S, which - may contain a hyphen ('-'), skips it and returns true. Each - half of the identifier may be abbreviated to its first three - letters. - Otherwise, returns false. */ -bool -lex_match_hyphenated_word (struct lexer *lexer, const char *s) -{ - const char *hyphen = strchr (s, '-'); - if (hyphen == NULL) - return lex_match_id (lexer, s); - else if (lexer->token != T_ID - || !lex_id_match (ss_buffer (s, hyphen - s), ss_cstr (lexer->tokid)) - || lex_look_ahead (lexer) != '-') - return false; - else - { - lex_get (lexer); - lex_force_match (lexer, '-'); - lex_force_match_id (lexer, hyphen + 1); - return true; - } + size_t ofs = src->lines[line - 1]; + size_t end = line >= src->n_lines ? src->length : src->lines[line]; + return ss_buffer (&src->buffer[ofs], end - ofs); } -
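
/* A minimal usage sketch for the reader constructors above: it tokenizes an
   in-memory syntax string by handing a lex_reader to a lexer and pulling
   tokens until the source is exhausted.  It assumes the public entry points
   lex_create(), lex_append(), lex_get(), lex_token(), and lex_destroy()
   declared in "language/lexer/lexer.h"; check that header for the exact
   signatures before relying on this sketch. */
#if 0
static void
tokenize_string_example (void)
{
  struct lexer *lexer = lex_create ();

  /* Append a source that reads from a literal UTF-8 string. */
  lex_append (lexer, lex_reader_for_string ("DATA LIST FREE /x.", "UTF-8"));

  /* Pull tokens until the lexer reports T_STOP, which means that all input
     sources are exhausted.  A real caller would examine each token with
     lex_token() and its relatives instead of just discarding it. */
  do
    lex_get (lexer);
  while (lex_token (lexer) != T_STOP);

  lex_destroy (lexer);
}
#endif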