1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/macro.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
/* Shorthand i18n macros: _() translates MSGID at run time via gettext();
   N_() only marks MSGID for extraction, expanding to the untranslated text. */
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
71 src->tail <= line_pos <= token_pos <= src->head. */
72 size_t token_pos; /* Start of token. */
73 size_t token_len; /* Length of source for token in bytes. */
74 size_t line_pos; /* Start of line containing token_pos. */
75 int first_line; /* Line number at token_pos. */
77 /* For a token obtained through macro expansion, this is just this token.
79 For a token obtained through the lexer in an ordinary way, these are
81 char *macro_rep; /* The whole macro expansion. */
82 size_t ofs; /* Offset of this token in macro_rep. */
83 size_t len; /* Length of this token in macro_rep. */
/* NOTE(review): macro_rep is shared among the tokens of one expansion and is
   reference-counted through ref_cnt (see lex_token_uninit's assertion). */
84 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
/* Releases the resources owned by token T: the embedded token's storage and,
   presumably, T's share of any macro expansion — the ref_cnt decrement/free
   path is not visible in this fragment; confirm against the full source. */
88 lex_token_uninit (struct lex_token *t)
90 token_uninit (&t->token);
/* A token that shares a macro_rep must still hold at least one reference. */
93 assert (*t->ref_cnt > 0);
102 /* A source of tokens, corresponding to a syntax file.
104 This is conceptually a lex_reader wrapped with everything needed to convert
105 its UTF-8 bytes into tokens. */
108 struct ll ll; /* In lexer's list of sources. */
109 struct lex_reader *reader;
111 struct segmenter segmenter;
112 bool eof; /* True if T_STOP was read from 'reader'. */
114 /* Buffer of UTF-8 bytes. */
116 size_t allocated; /* Number of bytes allocated. */
117 size_t tail; /* &buffer[0] offset into UTF-8 source. */
118 size_t head; /* &buffer[head - tail] offset into source. */
120 /* Positions in source file, tail <= pos <= head for each member here. */
121 size_t journal_pos; /* First byte not yet output to journal. */
122 size_t seg_pos; /* First byte not yet scanned as token. */
123 size_t line_pos; /* First byte of line containing seg_pos. */
125 int n_newlines; /* Number of new-lines up to seg_pos. */
126 bool suppress_next_newline;
/* Ring buffer of tokens. */
130 This is mostly like a deque, with the invariant that 'back <= middle <=
131 front' (modulo SIZE_MAX+1). The tokens available for parsing are
132 between 'back' and 'middle': the token at 'back' is the current token,
133 the token at 'back + 1' is the next token, and so on. There are usually
134 no tokens between 'middle' and 'front'; if there are, then they need to
135 go through macro expansion and are not yet available for parsing.
137 'capacity' is the current number of elements in 'tokens'. It is always
138 a power of 2. 'front', 'middle', and 'back' refer to indexes in
139 'tokens' modulo 'capacity'. */
144 struct lex_token *tokens;
/* Constructor/destructor for a lex_source, defined later in the file. */
147 static struct lex_source *lex_source_create (struct lexer *,
148 struct lex_reader *);
149 static void lex_source_destroy (struct lex_source *);
/* The lexer itself: a stack of sources plus the set of defined macros. */
154 struct ll_list sources; /* Contains "struct lex_source"s. */
155 struct macro_set *macros;
/* Internal helpers, defined later in the file. */
158 static struct lex_source *lex_source__ (const struct lexer *);
159 static char *lex_source_get_syntax__ (const struct lex_source *,
161 static const struct lex_token *lex_next__ (const struct lexer *, int n);
162 static void lex_source_push_endcmd__ (struct lex_source *);
164 static void lex_source_pop_back (struct lex_source *);
165 static bool lex_source_get (const struct lex_source *);
166 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
167 const char *format, va_list)
168 PRINTF_FORMAT (4, 0);
169 static const struct lex_token *lex_source_next__ (const struct lex_source *,
172 /* Initializes READER with the specified CLASS and otherwise some reasonable
173 defaults. The caller should fill in the other members as desired. */
175 lex_reader_init (struct lex_reader *reader,
176 const struct lex_reader_class *class)
178 reader->class = class;
179 reader->syntax = SEG_MODE_AUTO;
180 reader->error = LEX_ERROR_CONTINUE;
181 reader->file_name = NULL;
182 reader->encoding = NULL;
183 reader->line_number = 0;
187 /* Frees any file name already in READER and replaces it by a copy of
188 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
190 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
192 free (reader->file_name);
193 reader->file_name = xstrdup_if_nonnull (file_name);
196 /* Creates and returns a new lexer. */
200 struct lexer *lexer = xmalloc (sizeof *lexer);
201 *lexer = (struct lexer) {
202 .sources = LL_INITIALIZER (lexer->sources),
203 .macros = macro_set_create (),
208 /* Destroys LEXER. */
210 lex_destroy (struct lexer *lexer)
214 struct lex_source *source, *next;
/* Safe iteration: each source is destroyed while walking the list. */
216 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
217 lex_source_destroy (source);
218 macro_set_destroy (lexer->macros);
223 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
224 same name. Takes ownership of M. */
226 lex_define_macro (struct lexer *lexer, struct macro *m)
228 macro_set_add (lexer->macros, m);
231 /* Inserts READER into LEXER so that the next token read by LEXER comes from
232 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
235 lex_include (struct lexer *lexer, struct lex_reader *reader)
237 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
238 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
241 /* Appends READER to LEXER, so that it will be read after all other current
242 readers have already been read. */
244 lex_append (struct lexer *lexer, struct lex_reader *reader)
246 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
251 /* Adds a new token at the front of SRC and returns a pointer to it. The
252 caller should initialize it. Does not advance the middle pointer, so the
253 token isn't immediately available to the parser. */
254 static struct lex_token *
255 lex_push_token__ (struct lex_source *src)
/* Grow the ring buffer when full (front - back wraps correctly because the
   indexes are unsigned and the invariant back <= middle <= front holds
   modulo SIZE_MAX+1). */
257 if (src->front - src->back >= src->capacity)
259 /* Expansion works just like a deque, so we reuse the code. */
260 struct deque deque = {
261 .capacity = src->capacity,
265 src->tokens = deque_expand (&deque, src->tokens, sizeof *src->tokens);
266 src->capacity = deque.capacity;
/* capacity is a power of 2, so '& (capacity - 1)' is index modulo capacity. */
269 struct lex_token *token = &src->tokens[src->front++ & (src->capacity - 1)];
270 token->token = (struct token) { .type = T_STOP };
271 token->macro_rep = NULL;
272 token->ref_cnt = NULL;
276 /* Removes the current token from SRC and uninitializes it. */
278 lex_source_pop_back (struct lex_source *src)
280 assert (src->middle - src->back > 0);
281 lex_token_uninit (&src->tokens[src->back++ & (src->capacity - 1)]);
284 /* Removes the token at the greatest lookahead from SRC and uninitializes
287 lex_source_pop_front (struct lex_source *src)
289 assert (src->front - src->middle > 0);
290 lex_token_uninit (&src->tokens[--src->front & (src->capacity - 1)]);
293 /* Advances LEXER to the next token, consuming the current token. */
295 lex_get (struct lexer *lexer)
297 struct lex_source *src;
299 src = lex_source__ (lexer);
/* Drop the current token, if there is one. */
303 if (src->middle - src->back > 0)
304 lex_source_pop_back (src);
/* Refill until a parseable token is available; when a source is exhausted,
   destroy it and fall back to the next source on the stack. */
306 while (src->back == src->middle)
307 if (!lex_source_get (src))
309 lex_source_destroy (src);
310 src = lex_source__ (lexer);
316 /* Issuing errors. */
318 /* Prints a syntax error message containing the current token and
319 given message MESSAGE (if non-null). */
321 lex_error (struct lexer *lexer, const char *format, ...)
325 va_start (args, format);
326 lex_next_error_valist (lexer, 0, 0, format, args);
330 /* Prints a syntax error message containing the current token and
331 given message MESSAGE (if non-null). */
333 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
335 lex_next_error_valist (lexer, 0, 0, format, args);
338 /* Prints a syntax error message containing the current token and
339 given message MESSAGE (if non-null). */
341 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
345 va_start (args, format);
346 lex_next_error_valist (lexer, n0, n1, format, args);
350 /* Prints a syntax error message saying that one of the strings provided as
351 varargs, up to the first NULL, is expected. */
/* Parenthesized name suppresses any same-named function-like macro. */
353 (lex_error_expecting) (struct lexer *lexer, ...)
357 va_start (args, lexer);
358 lex_error_expecting_valist (lexer, args);
362 /* Prints a syntax error message saying that one of the options provided in
363 ARGS, up to the first NULL, is expected. */
365 lex_error_expecting_valist (struct lexer *lexer, va_list args)
/* Collect up to MAX_OPTIONS strings; lex_error_expecting_array() falls back
   to a generic message when the list is too long to enumerate. */
367 enum { MAX_OPTIONS = 9 };
368 const char *options[MAX_OPTIONS];
370 while (n < MAX_OPTIONS)
372 const char *option = va_arg (args, const char *);
376 options[n++] = option;
378 lex_error_expecting_array (lexer, options, n);
382 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
/* One message per count so that each is a single translatable string. */
387 lex_error (lexer, NULL);
391 lex_error (lexer, _("expecting %s"), options[0]);
395 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
399 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
404 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
405 options[0], options[1], options[2], options[3]);
409 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
410 options[0], options[1], options[2], options[3], options[4]);
414 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
415 options[0], options[1], options[2], options[3], options[4],
420 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
421 options[0], options[1], options[2], options[3], options[4],
422 options[5], options[6]);
426 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
427 options[0], options[1], options[2], options[3], options[4],
428 options[5], options[6], options[7]);
432 lex_error (lexer, NULL);
436 /* Reports an error to the effect that subcommand SBC may only be specified
439 This function does not take a lexer as an argument or use lex_error(),
440 because the result would ordinarily just be redundant: "Syntax error at
441 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
442 not help the user find the error. */
444 lex_sbc_only_once (const char *sbc)
446 msg (SE, _("Subcommand %s may only be specified once."), sbc);
449 /* Reports an error to the effect that subcommand SBC is missing.
451 This function does not take a lexer as an argument or use lex_error(),
452 because a missing subcommand can normally be detected only after the whole
453 command has been parsed, and so lex_error() would always report "Syntax
454 error at end of command", which does not help the user find the error. */
456 lex_sbc_missing (const char *sbc)
458 msg (SE, _("Required subcommand %s was not specified."), sbc);
461 /* Reports an error to the effect that specification SPEC may only be specified
462 once within subcommand SBC. */
464 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
466 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
470 /* Reports an error to the effect that specification SPEC is missing within
473 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
475 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
479 /* Prints a syntax error message containing the current token and
480 given message MESSAGE (if non-null). */
482 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
483 const char *format, va_list args)
485 struct lex_source *src = lex_source__ (lexer);
/* With a live source, delegate so the message can cite the offending
   syntax; otherwise build an "end of input" message by hand. */
488 lex_source_error_valist (src, n0, n1, format, args);
494 ds_put_format (&s, _("Syntax error at end of input"));
497 ds_put_cstr (&s, ": ");
498 ds_put_vformat (&s, format, args);
500 ds_put_byte (&s, '.');
501 msg (SE, "%s", ds_cstr (&s));
506 /* Checks that we're at end of command.
507 If so, returns a successful command completion code.
508 If not, flags a syntax error and returns an error command
511 lex_end_of_command (struct lexer *lexer)
513 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
515 lex_error (lexer, _("expecting end of command"));
522 /* Token testing functions. */
/* Each of these zero-argument-lookahead wrappers simply forwards to its
   lex_next_*() counterpart with n == 0 (the current token). */
524 /* Returns true if the current token is a number. */
526 lex_is_number (const struct lexer *lexer)
528 return lex_next_is_number (lexer, 0);
531 /* Returns true if the current token is a string. */
533 lex_is_string (const struct lexer *lexer)
535 return lex_next_is_string (lexer, 0);
538 /* Returns the value of the current token, which must be a
539 floating point number. */
541 lex_number (const struct lexer *lexer)
543 return lex_next_number (lexer, 0);
546 /* Returns true iff the current token is an integer. */
548 lex_is_integer (const struct lexer *lexer)
550 return lex_next_is_integer (lexer, 0);
553 /* Returns the value of the current token, which must be an
556 lex_integer (const struct lexer *lexer)
558 return lex_next_integer (lexer, 0);
561 /* Token testing functions with lookahead.
563 A value of 0 for N as an argument to any of these functions refers to the
564 current token. Lookahead is limited to the current command. Any N greater
565 than the number of tokens remaining in the current command will be treated
566 as referring to a T_ENDCMD token. */
568 /* Returns true if the token N ahead of the current token is a number. */
570 lex_next_is_number (const struct lexer *lexer, int n)
572 return token_is_number (lex_next (lexer, n));
575 /* Returns true if the token N ahead of the current token is a string. */
577 lex_next_is_string (const struct lexer *lexer, int n)
579 return token_is_string (lex_next (lexer, n));
582 /* Returns the value of the token N ahead of the current token, which must be a
583 floating point number. */
585 lex_next_number (const struct lexer *lexer, int n)
587 return token_number (lex_next (lexer, n));
590 /* Returns true if the token N ahead of the current token is an integer. */
592 lex_next_is_integer (const struct lexer *lexer, int n)
594 return token_is_integer (lex_next (lexer, n));
597 /* Returns the value of the token N ahead of the current token, which must be
600 lex_next_integer (const struct lexer *lexer, int n)
602 return token_integer (lex_next (lexer, n));
605 /* Token matching functions. */
607 /* If the current token has the specified TYPE, skips it and returns true.
608 Otherwise, returns false. */
610 lex_match (struct lexer *lexer, enum token_type type)
612 if (lex_token (lexer) == type)
621 /* If the current token matches IDENTIFIER, skips it and returns true.
622 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
625 IDENTIFIER must be an ASCII string. */
627 lex_match_id (struct lexer *lexer, const char *identifier)
629 return lex_match_id_n (lexer, identifier, 3);
632 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
633 may be abbreviated to its first N letters. Otherwise, returns false.
635 IDENTIFIER must be an ASCII string. */
637 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
639 if (lex_token (lexer) == T_ID
640 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
649 /* If the current token is integer X, skips it and returns true. Otherwise,
652 lex_match_int (struct lexer *lexer, int x)
654 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
663 /* Forced matches. */
665 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
666 abbreviated to its first 3 letters. Otherwise, reports an error and returns
669 IDENTIFIER must be an ASCII string. */
671 lex_force_match_id (struct lexer *lexer, const char *identifier)
673 if (lex_match_id (lexer, identifier))
677 lex_error_expecting (lexer, identifier);
682 /* If the current token has the specified TYPE, skips it and returns true.
683 Otherwise, reports an error and returns false. */
685 lex_force_match (struct lexer *lexer, enum token_type type)
687 if (lex_token (lexer) == type)
/* Prefer quoting the punctuator itself (e.g. "`('") in the error message;
   fall back to the token type's name when it has no string form. */
694 const char *type_string = token_type_to_string (type);
697 char *s = xasprintf ("`%s'", type_string);
698 lex_error_expecting (lexer, s);
702 lex_error_expecting (lexer, token_type_to_name (type));
708 /* If the current token is a string, does nothing and returns true.
709 Otherwise, reports an error and returns false. */
711 lex_force_string (struct lexer *lexer)
713 if (lex_is_string (lexer))
717 lex_error (lexer, _("expecting string"))
722 /* If the current token is a string or an identifier, does nothing and returns
723 true. Otherwise, reports an error and returns false.
725 This is meant for use in syntactic situations where we want to encourage the
726 user to supply a quoted string, but for compatibility we also accept
727 identifiers. (One example of such a situation is file names.) Therefore,
728 the error message issued when the current token is wrong only says that a
729 string is expected and doesn't mention that an identifier would also be
732 lex_force_string_or_id (struct lexer *lexer)
734 return lex_token (lexer) == T_ID || lex_force_string (lexer);
737 /* If the current token is an integer, does nothing and returns true.
738 Otherwise, reports an error and returns false. */
740 lex_force_int (struct lexer *lexer)
742 if (lex_is_integer (lexer))
746 lex_error (lexer, _("expecting integer"));
751 /* If the current token is an integer in the range MIN...MAX (inclusive), does
752 nothing and returns true. Otherwise, reports an error and returns false.
753 If NAME is nonnull, then it is used in the error message. */
755 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
757 bool is_integer = lex_is_integer (lexer);
758 bool too_small = is_integer && lex_integer (lexer) < min;
759 bool too_big = is_integer && lex_integer (lexer) > max;
760 if (is_integer && !too_small && !too_big)
/* The remaining cases tailor the error message to the range's shape. */
765 /* Weird, maybe a bug in the caller. Just report that we needed an
768 lex_error (lexer, _("Integer expected for %s."), name);
770 lex_error (lexer, _("Integer expected."));
/* Exactly one acceptable value. */
775 lex_error (lexer, _("Expected %ld for %s."), min, name);
777 lex_error (lexer, _("Expected %ld."), min);
779 else if (min + 1 == max)
/* Exactly two acceptable values. */
782 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
784 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
/* Mention a bound only if it is "interesting" (not near the type's limits)
   or if the token actually violated it. */
788 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
789 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
791 if (report_lower_bound && report_upper_bound)
795 _("Expected integer between %ld and %ld for %s."),
798 lex_error (lexer, _("Expected integer between %ld and %ld."),
801 else if (report_lower_bound)
/* Special-case lower bounds of 0 and 1 for friendlier wording. */
806 lex_error (lexer, _("Expected non-negative integer for %s."),
809 lex_error (lexer, _("Expected non-negative integer."));
814 lex_error (lexer, _("Expected positive integer for %s."),
817 lex_error (lexer, _("Expected positive integer."));
820 else if (report_upper_bound)
824 _("Expected integer less than or equal to %ld for %s."),
827 lex_error (lexer, _("Expected integer less than or equal to %ld."),
833 lex_error (lexer, _("Integer expected for %s."), name);
835 lex_error (lexer, _("Integer expected."));
841 /* If the current token is a number, does nothing and returns true.
842 Otherwise, reports an error and returns false. */
844 lex_force_num (struct lexer *lexer)
846 if (lex_is_number (lexer))
849 lex_error (lexer, _("expecting number"));
853 /* If the current token is an identifier, does nothing and returns true.
854 Otherwise, reports an error and returns false. */
856 lex_force_id (struct lexer *lexer)
858 if (lex_token (lexer) == T_ID)
861 lex_error (lexer, _("expecting identifier"));
865 /* Token accessors. */
867 /* Returns the type of LEXER's current token. */
869 lex_token (const struct lexer *lexer)
871 return lex_next_token (lexer, 0);
874 /* Returns the number in LEXER's current token.
876 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
877 tokens this function will always return zero. */
879 lex_tokval (const struct lexer *lexer)
881 return lex_next_tokval (lexer, 0);
884 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
886 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
887 this function will always return NULL.
889 The UTF-8 encoding of the returned string is correct for variable names and
890 other identifiers. Use filename_to_utf8() to use it as a filename. Use
891 data_in() to use it in a "union value". */
893 lex_tokcstr (const struct lexer *lexer)
895 return lex_next_tokcstr (lexer, 0);
898 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
899 null-terminated (but the null terminator is not included in the returned
900 substring's 'length').
902 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
903 this function will always return NULL.
905 The UTF-8 encoding of the returned string is correct for variable names and
906 other identifiers. Use filename_to_utf8() to use it as a filename. Use
907 data_in() to use it in a "union value". */
909 lex_tokss (const struct lexer *lexer)
911 return lex_next_tokss (lexer, 0);
916 A value of 0 for N as an argument to any of these functions refers to the
917 current token. Lookahead is limited to the current command. Any N greater
918 than the number of tokens remaining in the current command will be treated
919 as referring to a T_ENDCMD token. */
921 static const struct lex_token *
922 lex_next__ (const struct lexer *lexer_, int n)
923 {
924 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
925 struct lex_source *src = lex_source__ (lexer);
/* NOTE(review): the no-source fall-back (returning stop_token) is not
   visible in this fragment. */
928 return lex_source_next__ (src, n);
/* Sentinel returned when there is no source to draw tokens from. */
931 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
936 /* Returns the token in SRC with the greatest lookahead. */
937 static const struct lex_token *
938 lex_source_middle (const struct lex_source *src)
940 assert (src->middle - src->back > 0);
941 return &src->tokens[(src->middle - 1) & (src->capacity - 1)];
944 static const struct lex_token *
945 lex_source_next__ (const struct lex_source *src, int n)
/* Pull tokens from the source until at least N+1 are buffered, stopping
   early once the command (T_ENDCMD) or input (T_STOP) ends so that
   lookahead never crosses a command boundary. */
947 while (src->middle - src->back <= n)
949 if (src->middle - src->back > 0)
951 const struct lex_token *middle = lex_source_middle (src);
952 if (middle->token.type == T_STOP || middle->token.type == T_ENDCMD)
956 lex_source_get (src);
959 return &src->tokens[(src->back + n) & (src->capacity - 1)];
962 /* Returns the "struct token" of the token N after the current one in LEXER.
963 The returned pointer can be invalidated by pretty much any succeeding call
964 into the lexer, although the string pointer within the returned token is
965 only invalidated by consuming the token (e.g. with lex_get()). */
967 lex_next (const struct lexer *lexer, int n)
969 return &lex_next__ (lexer, n)->token;
972 /* Returns the type of the token N after the current one in LEXER. */
974 lex_next_token (const struct lexer *lexer, int n)
976 return lex_next (lexer, n)->type;
979 /* Returns the number in the token N after the current one in LEXER.
981 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
982 tokens this function will always return zero. */
984 lex_next_tokval (const struct lexer *lexer, int n)
986 return token_number (lex_next (lexer, n));
989 /* Returns the null-terminated string in the token N after the current one, in
992 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
993 this function will always return NULL.
995 The UTF-8 encoding of the returned string is correct for variable names and
996 other identifiers. Use filename_to_utf8() to use it as a filename. Use
997 data_in() to use it in a "union value". */
999 lex_next_tokcstr (const struct lexer *lexer, int n)
1001 return lex_next_tokss (lexer, n).string;
1004 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1005 The string is null-terminated (but the null terminator is not included in
1006 the returned substring's 'length').
1008 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1009 tokens this function will always return NULL.
1011 The UTF-8 encoding of the returned string is correct for variable names and
1012 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1013 data_in() to use it in a "union value". */
1015 lex_next_tokss (const struct lexer *lexer, int n)
1017 return lex_next (lexer, n)->string;
1020 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1021 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1022 are both zero, this requests the syntax for the current token.) The caller
1023 must eventually free the returned string (with free()). The syntax is
1024 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1025 example, it may include comments, spaces, and new-lines if it spans multiple
1026 tokens. Macro expansion, however, has already been performed. */
1028 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1030 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
/* Returns true if the token N ahead came from a macro expansion. */
1034 lex_next_is_from_macro (const struct lexer *lexer, int n)
1036 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Returns true if ACTUAL is an acceptable match for EXPECTED: same type, and
   for valued token types, an equal value (identifiers match by PSPP's
   abbreviation rules, strings byte-for-byte). */
1040 lex_tokens_match (const struct token *actual, const struct token *expected)
1042 if (actual->type != expected->type)
1045 switch (actual->type)
1049 return actual->number == expected->number;
1052 return lex_id_match (expected->string, actual->string);
1055 return (actual->string.length == expected->string.length
1056 && !memcmp (actual->string.string, expected->string.string,
1057 actual->string.length));
1064 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1065 skips it and returns true. Otherwise, returns false.
1067 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1068 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1069 first three letters. */
1071 lex_match_phrase (struct lexer *lexer, const char *s)
1073 struct string_lexer slex;
/* Tokenize S on the fly and compare token-by-token against lookahead. */
1078 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1079 while (string_lexer_next (&slex, &token))
1080 if (token.type != SCAN_SKIP)
1082 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1083 token_uninit (&token);
/* Returns the 1-based line number of the start of the token N ahead in SRC. */
1094 lex_source_get_first_line_number (const struct lex_source *src, int n)
1096 return lex_source_next__ (src, n)->first_line;
/* Counts the new-line characters in the LENGTH bytes starting at S. */
1100 count_newlines (char *s, size_t length)
1105 while ((newline = memchr (s, '\n', length)) != NULL)
1108 length -= (newline + 1) - s;
1116 lex_source_get_last_line_number (const struct lex_source *src, int n)
1118 const struct lex_token *token = lex_source_next__ (src, n);
/* first_line == 0 means the token has no line-number information. */
1120 if (token->first_line == 0)
1124 char *token_str = &src->buffer[token->token_pos - src->tail];
1125 return token->first_line + count_newlines (token_str, token->token_len) + 1;
/* Returns the display width (in columns) of the LENGTH-byte UTF-8 text S_,
   counting CJK characters as 2, combining characters as 0, and expanding
   tabs to 8-column stops. */
1130 count_columns (const char *s_, size_t length)
1132 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1138 for (ofs = 0; ofs < length; ofs += mblen)
1142 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1145 int width = uc_width (uc, "UTF-8");
/* Tab: advance to the next multiple of 8. */
1150 columns = ROUND_UP (columns + 1, 8);
/* Returns the 1-based column of the start of the token N ahead in SRC. */
1157 lex_source_get_first_column (const struct lex_source *src, int n)
1159 const struct lex_token *token = lex_source_next__ (src, n);
1160 return count_columns (&src->buffer[token->line_pos - src->tail],
1161 token->token_pos - token->line_pos);
/* Returns the column just past the end of the token N ahead in SRC,
   measured from the start of the last line the token touches. */
1165 lex_source_get_last_column (const struct lex_source *src, int n)
1167 const struct lex_token *token = lex_source_next__ (src, n);
1168 char *start, *end, *newline;
1170 start = &src->buffer[token->line_pos - src->tail];
1171 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1172 newline = memrchr (start, '\n', end - start);
1173 if (newline != NULL)
1174 start = newline + 1;
1175 return count_columns (start, end - start);
1178 /* Returns the 1-based line number of the start of the syntax that represents
1179 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1180 if the token is drawn from a source that does not have line numbers. */
1182 lex_get_first_line_number (const struct lexer *lexer, int n)
1184 const struct lex_source *src = lex_source__ (lexer);
1185 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1188 /* Returns the 1-based line number of the end of the syntax that represents the
1189 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1190 token or if the token is drawn from a source that does not have line
1193 Most of the time, a single token is wholly within a single line of syntax,
1194 but there are two exceptions: a T_STRING token can be made up of multiple
1195 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1196 token can consist of a "-" on one line followed by the number on the next.
1199 lex_get_last_line_number (const struct lexer *lexer, int n)
1201 const struct lex_source *src = lex_source__ (lexer);
1202 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1205 /* Returns the 1-based column number of the start of the syntax that represents
1206 the token N after the current one in LEXER. Returns 0 for a T_STOP
1209 Column numbers are measured according to the width of characters as shown in
1210 a typical fixed-width font, in which CJK characters have width 2 and
1211 combining characters have width 0. */
1213 lex_get_first_column (const struct lexer *lexer, int n)
1215 const struct lex_source *src = lex_source__ (lexer);
1216 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1219 /* Returns the 1-based column number of the end of the syntax that represents
1220 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1223 Column numbers are measured according to the width of characters as shown in
1224 a typical fixed-width font, in which CJK characters have width 2 and
1225 combining characters have width 0. */
1227 lex_get_last_column (const struct lexer *lexer, int n)
1229 const struct lex_source *src = lex_source__ (lexer);
1230 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1233 /* Returns the name of the syntax file from which the current command is drawn.
1234 Returns NULL for a T_STOP token or if the command's source does not have
1237 There is no version of this function that takes an N argument because
1238 lookahead only works to the end of a command and any given command is always
1239 within a single syntax file. */
1241 lex_get_file_name (const struct lexer *lexer)
1243 struct lex_source *src = lex_source__ (lexer);
1244 return src == NULL ? NULL : src->reader->file_name;
1247 /* Returns a newly allocated msg_location for the syntax that represents tokens
1248 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1249 must eventually free the location (with msg_location_destroy()). */
1250 struct msg_location *
1251 lex_get_location (const struct lexer *lexer, int n0, int n1)
/* Start from the lines-only location and refine it with column numbers. */
1253 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1254 loc->first_column = lex_get_first_column (lexer, n0);
1255 loc->last_column = lex_get_last_column (lexer, n1);
1259 /* Returns a newly allocated msg_location for the syntax that represents tokens
1260 with 0-based offsets N0...N1, inclusive, from the current token. The
1261 location only covers the tokens' lines, not the columns. The caller must
1262 eventually free the location (with msg_location_destroy()). */
1263 struct msg_location *
1264 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1266 struct msg_location *loc = xmalloc (sizeof *loc);
1267 *loc = (struct msg_location) {
1268 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1269 .first_line = lex_get_first_line_number (lexer, n0),
1270 .last_line = lex_get_last_line_number (lexer, n1),
1276 lex_get_encoding (const struct lexer *lexer)
1278 struct lex_source *src = lex_source__ (lexer);
1279 return src == NULL ? NULL : src->reader->encoding;
1282 /* Returns the syntax mode for the syntax file from which the current drawn is
1283 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1284 does not have line numbers.
1286 There is no version of this function that takes an N argument because
1287 lookahead only works to the end of a command and any given command is always
1288 within a single syntax file. */
1290 lex_get_syntax_mode (const struct lexer *lexer)
1292 struct lex_source *src = lex_source__ (lexer);
1293 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1296 /* Returns the error mode for the syntax file from which the current drawn is
1297 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1298 source does not have line numbers.
1300 There is no version of this function that takes an N argument because
1301 lookahead only works to the end of a command and any given command is always
1302 within a single syntax file. */
1304 lex_get_error_mode (const struct lexer *lexer)
1306 struct lex_source *src = lex_source__ (lexer);
1307 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1310 /* If the source that LEXER is currently reading has error mode
1311 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1312 token to be read comes directly from whatever is next read from the stream.
1314 It makes sense to call this function after encountering an error in a
1315 command entered on the console, because usually the user would prefer not to
1316 have cascading errors. */
/* NOTE(review): this listing embeds original line numbers and has dropped
   lines (the return type, braces, and the tail of the segmenter_init call
   are missing); code left verbatim pending recovery of the full source. */
1318 lex_interactive_reset (struct lexer *lexer)
1320 struct lex_source *src = lex_source__ (lexer);
1321 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Reset every buffer/position offset so the next read starts fresh. */
1323 src->head = src->tail = 0;
1324 src->journal_pos = src->seg_pos = src->line_pos = 0;
1325 src->n_newlines = 0;
1326 src->suppress_next_newline = false;
/* Re-initialize the segmenter, preserving its current mode. */
1327 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Drop all buffered tokens on both sides of 'middle', then restore the
   sentinel T_ENDCMD token. */
1329 while (src->middle - src->back > 0)
1330 lex_source_pop_back (src);
1331 while (src->front - src->middle > 0)
1332 lex_source_pop_front (src);
1333 lex_source_push_endcmd__ (src);
1337 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1339 lex_discard_rest_of_command (struct lexer *lexer)
1341 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
/* NOTE(review): the loop body (presumably a call that advances the lexer)
   is missing from this listing — confirm against the original file. */
1345 /* Discards all lookahead tokens in LEXER, then discards all input sources
1346 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1347 runs out of input sources. */
1349 lex_discard_noninteractive (struct lexer *lexer)
1351 struct lex_source *src = lex_source__ (lexer);
/* Drop the tokens already consumed behind 'middle'. */
1355 while (src->middle - src->back > 0)
1356 lex_source_pop_back (src);
/* Destroy sources until a terminal-error source (or none) remains;
   lex_source_destroy() unlinks the source, so lex_source__() yields the
   next one each iteration. */
1358 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1359 src = lex_source__ (lexer))
1360 lex_source_destroy (src);
/* Returns the furthest position to which SRC's tail may safely advance:
   the minimum of the journal position, the current line start, and the
   oldest buffered token's line start.  (Return type and the final return
   statement are missing from this listing.) */
1365 lex_source_max_tail__ (const struct lex_source *src)
1367 const struct lex_token *token;
1370 assert (src->seg_pos >= src->line_pos);
1371 max_tail = MIN (src->journal_pos, src->line_pos);
1373 /* Use the oldest token also. (We know that src->deque cannot be empty
1374 because we are in the process of adding a new token, which is already
1375 initialized enough to use here.) */
1376 token = &src->tokens[src->back & (src->capacity - 1)];
1377 assert (token->token_pos >= token->line_pos);
1378 max_tail = MIN (max_tail, token->line_pos);
/* Ensures SRC's buffer has room at the head for more input, first by sliding
   the tail forward, and only if that is impossible by growing the buffer. */
1384 lex_source_expand__ (struct lex_source *src)
1386 if (src->head - src->tail >= src->allocated)
1388 size_t max_tail = lex_source_max_tail__ (src);
1389 if (max_tail > src->tail)
1391 /* Advance the tail, freeing up room at the head. */
1392 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1393 src->head - max_tail);
1394 src->tail = max_tail;
1398 /* Buffer is completely full. Expand it. */
/* x2realloc doubles 'allocated' and updates it in place. */
1399 src->buffer = x2realloc (src->buffer, &src->allocated);
1404 /* There's space available at the head of the buffer. Nothing to do. */
/* Reads more input from SRC's underlying reader until at least one complete
   line (ending in '\n') past seg_pos is buffered, or end of input is reached.
   (Loop structure and some lines are missing from this listing.) */
1409 lex_source_read__ (struct lex_source *src)
1413 lex_source_expand__ (src);
1415 size_t head_ofs = src->head - src->tail;
1416 size_t space = src->allocated - head_ofs;
1417 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1418 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1420 assert (n <= space);
/* A short/zero read marks end of input on the reader. */
1425 src->reader->eof = true;
1426 lex_source_expand__ (src);
/* Keep reading until the unsegmented region contains a newline. */
1432 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1433 src->head - src->seg_pos));
1436 static struct lex_source *
1437 lex_source__ (const struct lexer *lexer)
1439 return (ll_is_empty (&lexer->sources) ? NULL
1440 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns a malloc()'d copy of the syntax for tokens N0...N1 in SRC, joining
   runs of tokens with spaces.  Runs are grouped so that tokens from the source
   buffer and tokens from a given macro expansion are emitted contiguously. */
1444 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1446 struct string s = DS_EMPTY_INITIALIZER;
1447 for (size_t i = n0; i <= n1; )
1449 /* Find [I,J) as the longest sequence of tokens not produced by macro
1450 expansion, or otherwise the longest sequence expanded from a single
1452 const struct lex_token *first = lex_source_next__ (src, i);
1454 for (j = i + 1; j <= n1; j++)
1456 const struct lex_token *cur = lex_source_next__ (src, j);
/* NOTE(review): the first clause is subsumed by the pointer comparison on
   the next line (unequal null-ness implies unequal pointers). */
1457 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1458 || first->macro_rep != cur->macro_rep)
1461 const struct lex_token *last = lex_source_next__ (src, j - 1);
1463 if (!ds_is_empty (&s))
1464 ds_put_byte (&s, ' ')
1465 if (!first->macro_rep)
/* Tokens came straight from the source buffer: copy their bytes. */
1467 size_t start = first->token_pos;
1468 size_t end = last->token_pos + last->token_len;
1469 ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
/* Tokens came from one macro expansion: copy from its representation. */
1474 size_t start = first->ofs;
1475 size_t end = last->ofs + last->len;
1476 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1482 return ds_steal_cstr (&s);
/* Copies IN into OUT (capacity OUT_SIZE, at least 16 bytes), truncating at a
   line ending or when space runs out, appending "..." when truncated.
   Truncation respects UTF-8 character boundaries via u8_mblen(). */
1486 lex_ellipsize (struct substring in, char *out, size_t out_size)
1492 assert (out_size >= 16);
1493 out_maxlen = out_size - 1;
/* Reserve room for the "..." suffix when IN cannot fit whole. */
1494 if (in.length > out_maxlen - 3)
1497 for (out_len = 0; out_len < in.length; out_len += mblen)
/* Stop at '\n', NUL, or a "\r\n" pair. */
1499 if (in.string[out_len] == '\n'
1500 || in.string[out_len] == '\0'
1501 || (in.string[out_len] == '\r'
1502 && out_len + 1 < in.length
1503 && in.string[out_len + 1] == '\n'))
1506 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1507 in.length - out_len);
1512 if (out_len + mblen > out_maxlen)
1516 memcpy (out, in.string, out_len);
/* Append "..." only if we stopped before consuming all of IN. */
1517 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Reports whether any token in the range N0...N1 of SRC was produced by macro
   expansion (i.e. has a nonnull macro_rep). */
1521 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
/* NOTE(review): 'size_t i = n0' with int bounds — a negative n0/n1 would
   convert to a huge unsigned value; confirm callers never pass negatives. */
1523 for (size_t i = n0; i <= n1; i++)
1524 if (lex_source_next__ (src, i)->macro_rep)
/* Returns the raw source text of the macro call(s) whose expansion covers
   tokens N0...N1 of SRC, as a substring into SRC's buffer.  (The early-return
   value for the no-macro case is missing from this listing.) */
1529 static struct substring
1530 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1532 if (!lex_source_contains_macro_call (src, n0, n1))
1535 const struct lex_token *token0 = lex_source_next__ (src, n0);
/* MAX guards against n1 < n0. */
1536 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1537 size_t start = token0->token_pos;
1538 size_t end = token1->token_pos + token1->token_len;
1540 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Emits a syntax-error message for tokens N0...N1 of SRC, formatted from
   FORMAT/ARGS, quoting the offending syntax and, when macro expansion is
   involved, the macro call that produced it.  Ownership of the message passes
   to the message-handling machinery.  (Several lines — braces, free()s, and
   the msg_emit call — are missing from this listing.) */
1544 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1545 const char *format, va_list args)
1547 const struct lex_token *token;
1552 token = lex_source_next__ (src, n0);
1553 if (token->token.type == T_ENDCMD)
1554 ds_put_cstr (&s, _("Syntax error at end of command"));
1557 /* Get the syntax that caused the error. */
1558 char *syntax = lex_source_get_syntax__ (src, n0, n1);
1559 char syntax_cstr[64];
1560 lex_ellipsize (ss_cstr (syntax), syntax_cstr, sizeof syntax_cstr);
1563 /* Get the macro call(s) that expanded to the syntax that caused the
1566 struct substring call = lex_source_get_macro_call (src, n0, n1);
1567 lex_ellipsize (call, call_cstr, sizeof call_cstr);
/* Choose the message form based on which of syntax/call are available. */
1572 ds_put_format (&s, _("Syntax error at `%s' "
1573 "(in expansion of `%s')"),
1574 syntax_cstr, call_cstr);
1576 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1578 else if (call_cstr[0])
1579 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1582 ds_put_cstr (&s, _("Syntax error"));
/* Append the caller's detail text, ensuring a terminating period. */
1587 ds_put_cstr (&s, ": ");
1588 ds_put_vformat (&s, format, args);
1590 if (ds_last (&s) != '.')
1591 ds_put_byte (&s, '.');
/* Build the error's source location from the token range. */
1593 struct msg_location *location = xmalloc (sizeof *location);
1594 *location = (struct msg_location) {
1595 .file_name = xstrdup_if_nonnull (src->reader->file_name),
1596 .first_line = lex_source_get_first_line_number (src, n0),
1597 .last_line = lex_source_get_last_line_number (src, n1),
1598 .first_column = lex_source_get_first_column (src, n0),
1599 .last_column = lex_source_get_last_column (src, n1),
1601 struct msg *m = xmalloc (sizeof *m);
1603 .category = MSG_C_SYNTAX,
1604 .severity = MSG_S_ERROR,
1605 .location = location,
1606 .text = ds_steal_cstr (&s),
/* printf()-style wrapper around lex_source_error_valist() for tokens
   N0...N1 of SRC. */
1611 static void PRINTF_FORMAT (4, 5)
1612 lex_source_error (struct lex_source *src, int n0, int n1,
1613 const char *format, ...)
1616 va_start (args, format);
1617 lex_source_error_valist (src, n0, n1, format, args);
/* NOTE(review): the matching va_end() appears to be missing from this
   listing — confirm it exists in the original file. */
/* Reports error message S for the newest token in SRC, temporarily moving
   'middle' to 'front' so that the not-yet-published token is addressable by
   the error machinery, then discards that token. */
1622 lex_get_error (struct lex_source *src, const char *s)
1624 size_t old_middle = src->middle;
1625 src->middle = src->front;
/* N is the 0-based offset of the newest token. */
1626 size_t n = src->front - src->back - 1;
1627 lex_source_error (src, n, n, "%s", s);
1628 src->middle = old_middle;
1630 lex_source_pop_front (src);
1633 /* Attempts to append an additional token at the front of SRC, reading more
1634 from the underlying lex_reader if necessary. Returns true if a new token
1635 was added to SRC's deque, false otherwise. The caller should retry failures
1636 unless SRC's 'eof' marker was set to true indicating that there will be no
1637 more tokens from this source.
1639 Does not make the new token available for lookahead yet; the caller must
1640 adjust SRC's 'middle' pointer to do so. */
/* NOTE(review): this listing embeds original line numbers and omits many
   lines (return type, braces, loop keywords, and several statements);
   code left verbatim pending recovery of the full source. */
1642 lex_source_try_get__ (struct lex_source *src)
1644 /* State maintained while scanning tokens. Usually we only need a single
1645 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1646 needs to be saved and possibly restored later with SCAN_BACK. */
1649 struct segmenter segmenter;
1650 enum segment_type last_segment;
1651 int newlines; /* Number of newlines encountered so far. */
1652 /* Maintained here so we can update lex_source's similar members when we
1658 /* Initialize state. */
1659 struct state state =
1661 .segmenter = src->segmenter,
1663 .seg_pos = src->seg_pos,
1664 .line_pos = src->line_pos,
1666 struct state saved = state;
1668 /* Append a new token to SRC and initialize it. */
1669 struct lex_token *token = lex_push_token__ (src);
1670 struct scanner scanner;
1671 scanner_init (&scanner, &token->token);
1672 token->line_pos = src->line_pos;
1673 token->token_pos = src->seg_pos;
1674 if (src->reader->line_number > 0)
1675 token->first_line = src->reader->line_number + src->n_newlines;
1677 token->first_line = 0;
1679 /* Extract segments and pass them through the scanner until we obtain a
1683 /* Extract a segment. */
1684 const char *segment = &src->buffer[state.seg_pos - src->tail];
1685 size_t seg_maxlen = src->head - state.seg_pos;
1686 enum segment_type type;
1687 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1688 src->reader->eof, &type);
1691 /* The segmenter needs more input to produce a segment. */
1692 assert (!src->reader->eof);
1693 lex_source_read__ (src);
1697 /* Update state based on the segment. */
1698 state.last_segment = type;
1699 state.seg_pos += seg_len;
1700 if (type == SEG_NEWLINE)
1703 state.line_pos = state.seg_pos;
1706 /* Pass the segment into the scanner and try to get a token out. */
1707 enum scan_result result = scanner_push (&scanner, type,
1708 ss_buffer (segment, seg_len),
1710 if (result == SCAN_SAVE)
1712 else if (result == SCAN_BACK)
1717 else if (result == SCAN_DONE)
1721 /* If we've reached the end of a line, or the end of a command, then pass
1722 the line to the output engine as a syntax text item. */
1723 int n_lines = state.newlines;
1724 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1727 src->suppress_next_newline = true;
1729 else if (n_lines > 0 && src->suppress_next_newline)
1732 src->suppress_next_newline = false;
1734 for (int i = 0; i < n_lines; i++)
1736 /* Beginning of line. */
1737 const char *line = &src->buffer[src->journal_pos - src->tail];
1739 /* Calculate line length, including \n or \r\n end-of-line if present.
1741 We use src->head even though that may be beyond what we've actually
1742 converted to tokens (which is only through state.line_pos). That's
1743 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1744 take the whole line through the newline, not just through the '.'. */
1745 size_t max_len = src->head - src->journal_pos;
1746 const char *newline = memchr (line, '\n', max_len);
1747 size_t line_len = newline ? newline - line + 1 : max_len;
1749 /* Calculate line length excluding end-of-line. */
1750 size_t copy_len = line_len;
1751 if (copy_len > 0 && line[copy_len - 1] == '\n')
1753 if (copy_len > 0 && line[copy_len - 1] == '\r')
1756 /* Submit the line as syntax. */
1757 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1758 xmemdup0 (line, copy_len),
1761 src->journal_pos += line_len;
/* Publish the scanned token's extent and fold the local state back into
   SRC. */
1764 token->token_len = state.seg_pos - src->seg_pos;
1766 src->segmenter = state.segmenter;
1767 src->seg_pos = state.seg_pos;
1768 src->line_pos = state.line_pos;
1769 src->n_newlines += state.newlines;
/* Post-process the token: translate scan errors into error messages and
   drop the bad token. */
1771 switch (token->token.type)
1777 token->token.type = T_ENDCMD;
1781 case SCAN_BAD_HEX_LENGTH:
1782 case SCAN_BAD_HEX_DIGIT:
1783 case SCAN_BAD_UNICODE_DIGIT:
1784 case SCAN_BAD_UNICODE_LENGTH:
1785 case SCAN_BAD_UNICODE_CODE_POINT:
1786 case SCAN_EXPECTED_QUOTE:
1787 case SCAN_EXPECTED_EXPONENT:
1788 case SCAN_UNEXPECTED_CHAR:
1789 char *msg = scan_token_to_error (&token->token);
1790 lex_get_error (src, msg);
1795 lex_source_pop_front (src);
1802 /* Attempts to add a new token at the front of SRC. Returns true if
1803 successful, false on failure. On failure, the end of SRC has been reached
1804 and no more tokens will be forthcoming from it.
1806 Does not make the new token available for lookahead yet; the caller must
1807 adjust SRC's 'middle' pointer to do so. */
/* Retry wrapper around lex_source_try_get__(); the retry loop and return
   statements are missing from this listing. */
1809 lex_source_get__ (struct lex_source *src)
1812 if (lex_source_try_get__ (src))
/* Obtains the next token for SRC, performing macro expansion when the token
   at 'middle' starts a macro call: the call's tokens are replaced in the
   lookahead deque by the expansion's tokens, which share one refcounted
   macro representation string.  (Many lines — braces, returns, and several
   initializers — are missing from this listing; code left verbatim.) */
1818 lex_source_get (const struct lex_source *src_)
1820 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1822 if (src->front - src->middle == 0)
1824 if (!lex_source_get__ (src))
/* Macro expansion disabled: no further work. */
1828 if (!settings_get_mexpand ())
/* Probe whether the token at 'middle' begins a macro call. */
1834 struct macro_expander *me;
1835 int n_call = macro_expander_create (
1836 src->lexer->macros, &src->tokens[src->middle & (src->capacity - 1)].token,
/* Feed subsequent tokens to the expander until it decides. */
1838 for (int middle_ofs = 1; !n_call; middle_ofs++)
1840 if (src->front - src->middle <= middle_ofs && !lex_source_get__ (src))
1842 /* This should not be reachable because we always get a T_ENDCMD at
1843 the end of an input file (transformed from T_STOP by
1844 lex_source_try_get__()) and the macro_expander should always
1845 terminate expansion on T_ENDCMD. */
1849 const struct lex_token *t = &src->tokens[(src->middle + middle_ofs)
1850 & (src->capacity - 1)];
1851 size_t start = t->token_pos;
1852 size_t end = t->token_pos + t->token_len;
1853 const struct macro_token mt = {
1855 .representation = ss_buffer (&src->buffer[start - src->tail],
/* Temporarily expose the token range while the expander examines it. */
1858 src->middle += middle_ofs + 1;
1859 n_call = macro_expander_add (me, &mt);
1860 src->middle -= middle_ofs + 1;
1864 /* False alarm: no macro expansion after all. Use first token as
1865 lookahead. We'll retry macro expansion from the second token next
1867 macro_expander_destroy (me);
1872 /* Now expand the macro.
1874 We temporarily add the macro call's tokens to the source in case the macro
1875 expansion calls msg() to report an error and error processing tries to get
1876 the location of the error with, e.g. lex_get_first_line_number(), which
1877 would re-enter this code. This is a kluge; it might be cleaner to pass
1878 the line number into macro_expander_get_expansion(). */
1879 src->middle += n_call;
1880 struct macro_tokens expansion = { .n = 0 };
1881 macro_expander_get_expansion (me, src->reader->syntax, &expansion);
1882 macro_expander_destroy (me);
1883 src->middle -= n_call;
1885 /* Convert the macro expansion into syntax for possible error messages later. */
1886 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1887 size_t *len = xnmalloc (expansion.n, sizeof *len);
1888 struct string s = DS_EMPTY_INITIALIZER;
1889 macro_tokens_to_representation (&expansion, &s, ofs, len);
1891 if (settings_get_mprint ())
1892 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1893 _("Macro Expansion")));
1895 /* The first 'n_call' tokens starting at 'middle' will be replaced by the
1896 macro expansion. There might be more tokens after that, up to 'front'.
1898 Figure out the boundary of the macro call in the syntax, to go into the
1899 lex_tokens for the expansion so that later error messages can report what
1900 macro was called. */
1901 const struct lex_token *call_first
1902 = &src->tokens[src->middle & (src->capacity - 1)];
1903 const struct lex_token *call_last
1904 = &src->tokens[(src->middle + n_call - 1) & (src->capacity - 1)];
1905 size_t call_pos = call_first->token_pos;
1906 size_t call_len = (call_last->token_pos + call_last->token_len) - call_pos;
1907 size_t line_pos = call_first->line_pos;
1908 int first_line = call_first->first_line;
1910 /* Destroy the tokens for the call, and save any tokens following the call so
1911 we can add them back later. */
1912 for (size_t i = src->middle; i != src->middle + n_call; i++)
1913 lex_token_uninit (&src->tokens[i & (src->capacity - 1)]);
1914 size_t n_save = src->front - (src->middle + n_call);
1915 struct lex_token *save_tokens = xnmalloc (n_save, sizeof *save_tokens);
1916 for (size_t i = 0; i < n_save; i++)
1917 save_tokens[i] = src->tokens[(src->middle + n_call + i)
1918 & (src->capacity - 1)];
1919 src->front = src->middle;
1921 /* Append the macro expansion tokens to the lookahead. */
/* ref_cnt is shared by every expansion token so macro_rep is freed once. */
1922 char *macro_rep = ds_steal_cstr (&s);
1923 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1924 *ref_cnt = expansion.n;
1925 for (size_t i = 0; i < expansion.n; i++)
1927 *lex_push_token__ (src) = (struct lex_token) {
1928 .token = expansion.mts[i].token,
1929 .token_pos = call_pos,
1930 .token_len = call_len,
1931 .line_pos = line_pos,
1932 .first_line = first_line,
1933 .macro_rep = macro_rep,
1940 ss_dealloc (&expansion.mts[i].representation);
1942 free (expansion.mts);
1946 /* Finally, put the saved tokens back. */
1947 for (size_t i = 0; i < n_save; i++)
1948 *lex_push_token__ (src) = save_tokens[i];
/* Pushes a sentinel T_ENDCMD token onto empty SRC (all three deque pointers
   must coincide). */
1955 lex_source_push_endcmd__ (struct lex_source *src)
1957 assert (src->back == src->middle && src->middle == src->front);
1958 *lex_push_token__ (src) = (struct lex_token) {
1959 .token = { .type = T_ENDCMD } };
/* Creates and returns a new lex_source reading from READER on behalf of
   LEXER.  (Several initializer lines and the return statement are missing
   from this listing.) */
1963 static struct lex_source *
1964 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1966 struct lex_source *src = xmalloc (sizeof *src);
1967 *src = (struct lex_source) {
1969 .segmenter = segmenter_init (reader->syntax, false),
1973 lex_source_push_endcmd__ (src);
/* Destroys SRC: runs the reader's destructor, drops all buffered tokens,
   and unlinks SRC from its lexer.  (Lines freeing file_name/encoding/buffer
   and src itself are missing from this listing.) */
1979 lex_source_destroy (struct lex_source *src)
1981 char *file_name = src->reader->file_name;
1982 char *encoding = src->reader->encoding;
1983 if (src->reader->class->destroy != NULL)
1984 src->reader->class->destroy (src->reader);
1988 while (src->middle - src->back > 0)
1989 lex_source_pop_back (src);
1990 while (src->front - src->middle > 0)
1991 lex_source_pop_front (src);
1993 ll_remove (&src->ll);
/* A lex_reader backed by a file (or stdin) via a u8_istream. */
1997 struct lex_file_reader
1999 struct lex_reader reader;
2000 struct u8_istream *istream;
/* Defined below; forward-declared here for lex_reader_for_file(). */
2003 static struct lex_reader_class lex_file_reader_class;
2005 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2006 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2007 ENCODING, which should take one of the forms accepted by
2008 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2009 mode of the new reader, respectively.
2011 Returns a null pointer if FILE_NAME cannot be opened. */
2013 lex_reader_for_file (const char *file_name, const char *encoding,
2014 enum segmenter_mode syntax,
2015 enum lex_error_mode error)
2017 struct lex_file_reader *r;
2018 struct u8_istream *istream;
/* "-" selects stdin; anything else opens a regular file. */
2020 istream = (!strcmp(file_name, "-")
2021 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2022 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2023 if (istream == NULL)
2025 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2029 r = xmalloc (sizeof *r);
2030 lex_reader_init (&r->reader, &lex_file_reader_class);
2031 r->reader.syntax = syntax;
2032 r->reader.error = error;
2033 r->reader.file_name = xstrdup (file_name);
2034 r->reader.encoding = xstrdup_if_nonnull (encoding);
2035 r->reader.line_number = 1;
2036 r->istream = istream;
/* Downcasts generic lex_reader R to its enclosing lex_file_reader. */
2041 static struct lex_file_reader *
2042 lex_file_reader_cast (struct lex_reader *r)
2044 return UP_CAST (r, struct lex_file_reader, reader);
/* lex_reader 'read' callback for file readers: reads up to N bytes into BUF.
   (The return statements and read-error branch structure are missing from
   this listing.) */
2048 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2049 enum prompt_style prompt_style UNUSED)
2051 struct lex_file_reader *r = lex_file_reader_cast (r_);
2052 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2055 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* lex_reader 'close' callback for file readers: closes the stream unless it
   is stdin, in which case only the wrapper is freed. */
2062 lex_file_close (struct lex_reader *r_)
2064 struct lex_file_reader *r = lex_file_reader_cast (r_);
2066 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2068 if (u8_istream_close (r->istream) != 0)
2069 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
/* stdin: release the stream object without closing the fd. */
2072 u8_istream_free (r->istream);
2077 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader backed by an in-memory string.  (Fields holding the string
   and read offset are missing from this listing.) */
2083 struct lex_string_reader
2085 struct lex_reader reader;
2090 static struct lex_reader_class lex_string_reader_class;
2092 /* Creates and returns a new lex_reader for the contents of S, which must be
2093 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2094 with ss_dealloc() when it is closed. */
2096 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2098 struct lex_string_reader *r;
2100 r = xmalloc (sizeof *r);
2101 lex_reader_init (&r->reader, &lex_string_reader_class);
2102 r->reader.syntax = SEG_MODE_AUTO;
2103 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* NOTE(review): the lines storing S into the reader and returning it are
   missing from this listing. */
2110 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2111 which must be encoded in ENCODING. The caller retains ownership of S. */
2113 lex_reader_for_string (const char *s, const char *encoding)
2115 struct substring ss;
/* Copy S so the nocopy reader can own (and later free) the buffer. */
2116 ss_alloc_substring (&ss, ss_cstr (s));
2117 return lex_reader_for_substring_nocopy (ss, encoding);
2120 /* Formats FORMAT as a printf()-like format string and creates and returns a
2121 new lex_reader for the formatted result. */
2123 lex_reader_for_format (const char *format, const char *encoding, ...)
2125 struct lex_reader *r;
2128 va_start (args, encoding);
/* xvasprintf allocates the formatted string; the nocopy reader owns it. */
2129 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* NOTE(review): the matching va_end() and return appear to be missing from
   this listing. */
/* Downcasts generic lex_reader R to its enclosing lex_string_reader. */
2135 static struct lex_string_reader *
2136 lex_string_reader_cast (struct lex_reader *r)
2138 return UP_CAST (r, struct lex_string_reader, reader);
/* lex_reader 'read' callback for string readers: copies up to N remaining
   bytes from the stored string into BUF.  (The offset update and return are
   missing from this listing.) */
2142 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2143 enum prompt_style prompt_style UNUSED)
2145 struct lex_string_reader *r = lex_string_reader_cast (r_);
2148 chunk = MIN (n, r->s.length - r->offset);
2149 memcpy (buf, r->s.string + r->offset, chunk);
/* lex_reader 'close' callback for string readers.  (The lines releasing the
   owned string and the reader struct are missing from this listing.) */
2156 lex_string_close (struct lex_reader *r_)
2158 struct lex_string_reader *r = lex_string_reader_cast (r_);
2164 static struct lex_reader_class lex_string_reader_class =