1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/macro.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
/* NOTE(review): this excerpt is truncated — the "struct lex_token {" opener
   and the leading "struct token token;" member appear to be missing between
   the comments below; the size_t/int/char* lines are struct members, not
   file-scope variables.  Confirm against the full file. */
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
71 src->tail <= line_pos <= token_pos <= src->head. */
72 size_t token_pos; /* Start of token. */
73 size_t token_len; /* Length of source for token in bytes. */
74 size_t line_pos; /* Start of line containing token_pos. */
75 int first_line; /* Line number at token_pos. */
77 /* For a token obtained through macro expansion, this is just this token. */
78 char *macro_rep; /* The whole macro expansion. */
79 size_t ofs; /* Offset of this token in macro_rep. */
80 size_t len; /* Length of this token in macro_rep. */
/* Shared reference count: macro_rep is owned jointly by every lex_token
   expanded from the same macro call. */
81 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
/* Frees the data owned by token T.  NOTE(review): the body is truncated in
   this excerpt — presumably it decrements *t->ref_cnt and frees macro_rep
   and ref_cnt when the count reaches zero; confirm against the full file. */
85 lex_token_uninit (struct lex_token *t)
87 token_uninit (&t->token);
/* Only tokens that came from macro expansion carry a reference count. */
90 assert (*t->ref_cnt > 0);
/* NOTE(review): struct opener, 'buffer' member, and the front/middle/back
   size_t members appear truncated from this excerpt; the fields below are
   members of "struct lex_source". */
99 /* A source of tokens, corresponding to a syntax file.
101 This is conceptually a lex_reader wrapped with everything needed to convert
102 its UTF-8 bytes into tokens. */
105 struct ll ll; /* In lexer's list of sources. */
106 struct lex_reader *reader;
108 struct segmenter segmenter;
109 bool eof; /* True if T_STOP was read from 'reader'. */
111 /* Buffer of UTF-8 bytes. */
113 size_t allocated; /* Number of bytes allocated. */
114 size_t tail; /* &buffer[0] offset into UTF-8 source. */
115 size_t head; /* &buffer[head - tail] offset into source. */
117 /* Positions in source file, tail <= pos <= head for each member here. */
118 size_t journal_pos; /* First byte not yet output to journal. */
119 size_t seg_pos; /* First byte not yet scanned as token. */
120 size_t line_pos; /* First byte of line containing seg_pos. */
122 int n_newlines; /* Number of new-lines up to seg_pos. */
123 bool suppress_next_newline;
/* Token ring buffer: a deque with an extra 'middle' pointer separating
   parsed tokens (back..middle) from tokens still awaiting macro expansion
   (middle..front). */
127 This is mostly like a deque, with the conceptual invariant that back <=
128 middle <= front (modulo SIZE_MAX+1). The tokens available for parsing
129 lie between 'back' and 'middle': the token at 'back' is the current
130 token, the token at 'back + 1' is the next token, and so on. There are
131 usually no tokens between 'middle' and 'front'; if there are, then they
132 need to go through macro expansion and are not yet available for
135 'capacity' is the current number of elements in 'tokens'. It is always
136 a power of 2. 'front', 'middle', and 'back' refer to indexes in
137 'tokens' modulo 'capacity'. */
142 struct lex_token *tokens;
145 static struct lex_source *lex_source_create (struct lexer *,
146 struct lex_reader *);
147 static void lex_source_destroy (struct lex_source *);
152 struct ll_list sources; /* Contains "struct lex_source"s. */
153 struct macro_set *macros;
156 static struct lex_source *lex_source__ (const struct lexer *);
157 static char *lex_source_get_syntax__ (const struct lex_source *,
159 static const struct lex_token *lex_next__ (const struct lexer *, int n);
160 static void lex_source_push_endcmd__ (struct lex_source *);
162 static void lex_source_pop_back (struct lex_source *);
163 static bool lex_source_get (const struct lex_source *);
164 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
165 const char *format, va_list)
166 PRINTF_FORMAT (4, 0);
167 static const struct lex_token *lex_source_next__ (const struct lex_source *,
170 /* Initializes READER with the specified CLASS and otherwise some reasonable
171 defaults. The caller should fill in the others members as desired. */
173 lex_reader_init (struct lex_reader *reader,
174 const struct lex_reader_class *class)
176 reader->class = class;
177 reader->syntax = SEG_MODE_AUTO;
178 reader->error = LEX_ERROR_CONTINUE;
179 reader->file_name = NULL;
180 reader->encoding = NULL;
181 reader->line_number = 0;
185 /* Frees any file name already in READER and replaces it by a copy of
186 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
188 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
190 free (reader->file_name);
191 reader->file_name = xstrdup_if_nonnull (file_name);
194 /* Creates and returns a new lexer. */
198 struct lexer *lexer = xmalloc (sizeof *lexer);
199 *lexer = (struct lexer) {
200 .sources = LL_INITIALIZER (lexer->sources),
201 .macros = macro_set_create (),
206 /* Destroys LEXER. */
208 lex_destroy (struct lexer *lexer)
212 struct lex_source *source, *next;
214 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
215 lex_source_destroy (source);
216 macro_set_destroy (lexer->macros);
221 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
222 same name. Takes ownership of M. */
224 lex_define_macro (struct lexer *lexer, struct macro *m)
226 macro_set_add (lexer->macros, m);
229 /* Inserts READER into LEXER so that the next token read by LEXER comes from
230 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
233 lex_include (struct lexer *lexer, struct lex_reader *reader)
235 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
236 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
239 /* Appends READER to LEXER, so that it will be read after all other current
240 readers have already been read. */
242 lex_append (struct lexer *lexer, struct lex_reader *reader)
244 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
/* NOTE(review): body truncated in this excerpt — the deque initializer below
   is missing its .front/.back members and the function's trailing
   "return token;" is not visible.  Confirm against the full file. */
249 /* Adds a new token at the front of SRC and returns a pointer to it. The
250 caller should initialize it. Does not advance the middle pointer, so the
251 token isn't immediately available to the parser. */
252 static struct lex_token *
253 lex_push_token__ (struct lex_source *src)
/* Grow the ring buffer when it is full (front has wrapped onto back). */
255 if (src->front - src->back >= src->capacity)
257 /* Expansion works just like a deque, so we reuse the code. */
258 struct deque deque = {
259 .capacity = src->capacity,
263 src->tokens = deque_expand (&deque, src->tokens, sizeof *src->tokens);
264 src->capacity = deque.capacity;
/* capacity is a power of 2, so '& (capacity - 1)' is index mod capacity. */
267 struct lex_token *token = &src->tokens[src->front++ & (src->capacity - 1)];
268 token->token = (struct token) { .type = T_STOP };
269 token->macro_rep = NULL;
270 token->ref_cnt = NULL;
274 /* Removes the current token from SRC and uninitializes it. */
276 lex_source_pop_back (struct lex_source *src)
278 assert (src->middle - src->back > 0);
279 lex_token_uninit (&src->tokens[src->back++ & (src->capacity - 1)]);
282 /* Removes the token at the greatest lookahead from SRC and uninitializes
285 lex_source_pop_front (struct lex_source *src)
287 assert (src->front - src->middle > 0);
288 lex_token_uninit (&src->tokens[--src->front & (src->capacity - 1)]);
/* NOTE(review): body truncated in this excerpt — the early return when no
   source exists and the loop's closing logic are not visible.  Confirm
   against the full file. */
291 /* Advances LEXER to the next token, consuming the current token. */
293 lex_get (struct lexer *lexer)
295 struct lex_source *src;
297 src = lex_source__ (lexer);
/* Consume the current token, if there is one. */
301 if (src->middle - src->back > 0)
302 lex_source_pop_back (src);
/* Refill until a parsed token is available, discarding exhausted sources. */
304 while (src->back == src->middle)
305 if (!lex_source_get (src))
307 lex_source_destroy (src);
308 src = lex_source__ (lexer);
/* Issuing errors. */

/* Prints a syntax error message containing the current token and
   given message FORMAT (if non-null).

   Fix: the visible variadic functions lacked their va_list declarations and
   matching va_end calls; every va_start must be paired with va_end. */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, 0, 0, format, args);
  va_end (args);
}

/* Prints a syntax error message containing the current token and
   given message FORMAT (if non-null), taking the arguments as a va_list. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  lex_next_error_valist (lexer, 0, 0, format, args);
}

/* Prints a syntax error message for the tokens N0 through N1 ahead of the
   current token, with the given message FORMAT (if non-null). */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, n0, n1, format, args);
  va_end (args);
}

/* Prints a syntax error message saying that one of the strings provided as
   varargs, up to the first NULL, is expected.  (The parenthesized name
   suppresses any function-like macro of the same name.) */
void
(lex_error_expecting) (struct lexer *lexer, ...)
{
  va_list args;

  va_start (args, lexer);
  lex_error_expecting_valist (lexer, args);
  va_end (args);
}
/* NOTE(review): truncated — the initialization of 'n' to 0 and the break on
   a NULL option are not visible in this excerpt.  Confirm against the full
   file. */
360 /* Prints a syntax error message saying that one of the options provided in
361 ARGS, up to the first NULL, is expected. */
363 lex_error_expecting_valist (struct lexer *lexer, va_list args)
/* Cap the number of alternatives we can format in one message. */
365 enum { MAX_OPTIONS = 9 };
366 const char *options[MAX_OPTIONS];
368 while (n < MAX_OPTIONS)
370 const char *option = va_arg (args, const char *);
374 options[n++] = option;
376 lex_error_expecting_array (lexer, options, n);
/* Prints a syntax error saying one of the N strings in OPTIONS is expected.
   NOTE(review): this is a switch on N whose case labels and braces are
   truncated from this excerpt — each lex_error call below corresponds to one
   value of N (0..8), with the final call the default for N > 8.  Confirm
   against the full file. */
380 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
385 lex_error (lexer, NULL);
389 lex_error (lexer, _("expecting %s"), options[0]);
393 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
397 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
402 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
403 options[0], options[1], options[2], options[3]);
407 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
408 options[0], options[1], options[2], options[3], options[4]);
412 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
413 options[0], options[1], options[2], options[3], options[4],
418 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
419 options[0], options[1], options[2], options[3], options[4],
420 options[5], options[6]);
424 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
425 options[0], options[1], options[2], options[3], options[4],
426 options[5], options[6], options[7]);
430 lex_error (lexer, NULL);
434 /* Reports an error to the effect that subcommand SBC may only be specified
437 This function does not take a lexer as an argument or use lex_error(),
438 because the result would ordinarily just be redundant: "Syntax error at
439 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
440 not help the user find the error. */
442 lex_sbc_only_once (const char *sbc)
444 msg (SE, _("Subcommand %s may only be specified once."), sbc);
447 /* Reports an error to the effect that subcommand SBC is missing.
449 This function does not take a lexer as an argument or use lex_error(),
450 because a missing subcommand can normally be detected only after the whole
451 command has been parsed, and so lex_error() would always report "Syntax
452 error at end of command", which does not help the user find the error. */
454 lex_sbc_missing (const char *sbc)
456 msg (SE, _("Required subcommand %s was not specified."), sbc);
459 /* Reports an error to the effect that specification SPEC may only be specified
460 once within subcommand SBC. */
462 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
464 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
468 /* Reports an error to the effect that specification SPEC is missing within
471 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
473 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
/* NOTE(review): truncated — the branch structure (src != NULL vs. end of
   input), the ds_init/ds_destroy calls for the string 's', and the NULL
   check on 'format' are not visible in this excerpt. */
477 /* Prints a syntax error message containing the current token and
478 given message MESSAGE (if non-null). */
480 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
481 const char *format, va_list args)
483 struct lex_source *src = lex_source__ (lexer);
/* With a live source, delegate so the message carries source location. */
486 lex_source_error_valist (src, n0, n1, format, args);
/* Otherwise build a generic end-of-input message by hand. */
492 ds_put_format (&s, _("Syntax error at end of input"));
495 ds_put_cstr (&s, ": ");
496 ds_put_vformat (&s, format, args);
498 ds_put_byte (&s, '.');
499 msg (SE, "%s", ds_cstr (&s));
504 /* Checks that we're at end of command.
505 If so, returns a successful command completion code.
506 If not, flags a syntax error and returns an error command
509 lex_end_of_command (struct lexer *lexer)
511 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
513 lex_error (lexer, _("expecting end of command"));
/* Token testing functions. */

/* Returns true if the current token is a number. */
bool
lex_is_number (const struct lexer *lexer)
{
  return lex_next_is_number (lexer, 0);
}

/* Returns true if the current token is a string. */
bool
lex_is_string (const struct lexer *lexer)
{
  return lex_next_is_string (lexer, 0);
}

/* Returns the value of the current token, which must be a
   floating point number. */
double
lex_number (const struct lexer *lexer)
{
  return lex_next_number (lexer, 0);
}

/* Returns true iff the current token is an integer. */
bool
lex_is_integer (const struct lexer *lexer)
{
  return lex_next_is_integer (lexer, 0);
}

/* Returns the value of the current token, which must be an integer. */
long
lex_integer (const struct lexer *lexer)
{
  return lex_next_integer (lexer, 0);
}
/* Token testing functions with lookahead.

   A value of 0 for N as an argument to any of these functions refers to the
   current token.  Lookahead is limited to the current command.  Any N greater
   than the number of tokens remaining in the current command will be treated
   as referring to a T_ENDCMD token. */

/* Returns true if the token N ahead of the current token is a number. */
bool
lex_next_is_number (const struct lexer *lexer, int n)
{
  return token_is_number (lex_next (lexer, n));
}

/* Returns true if the token N ahead of the current token is a string. */
bool
lex_next_is_string (const struct lexer *lexer, int n)
{
  return token_is_string (lex_next (lexer, n));
}

/* Returns the value of the token N ahead of the current token, which must be
   a floating point number. */
double
lex_next_number (const struct lexer *lexer, int n)
{
  return token_number (lex_next (lexer, n));
}

/* Returns true if the token N ahead of the current token is an integer. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  return token_is_integer (lex_next (lexer, n));
}

/* Returns the value of the token N ahead of the current token, which must be
   an integer. */
long
lex_next_integer (const struct lexer *lexer, int n)
{
  return token_integer (lex_next (lexer, n));
}
603 /* Token matching functions. */
605 /* If the current token has the specified TYPE, skips it and returns true.
606 Otherwise, returns false. */
608 lex_match (struct lexer *lexer, enum token_type type)
610 if (lex_token (lexer) == type)
619 /* If the current token matches IDENTIFIER, skips it and returns true.
620 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
623 IDENTIFIER must be an ASCII string. */
625 lex_match_id (struct lexer *lexer, const char *identifier)
627 return lex_match_id_n (lexer, identifier, 3);
630 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
631 may be abbreviated to its first N letters. Otherwise, returns false.
633 IDENTIFIER must be an ASCII string. */
635 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
637 if (lex_token (lexer) == T_ID
638 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
647 /* If the current token is integer X, skips it and returns true. Otherwise,
650 lex_match_int (struct lexer *lexer, int x)
652 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
661 /* Forced matches. */
663 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
664 abbreviated to its first 3 letters. Otherwise, reports an error and returns
667 IDENTIFIER must be an ASCII string. */
669 lex_force_match_id (struct lexer *lexer, const char *identifier)
671 if (lex_match_id (lexer, identifier))
675 lex_error_expecting (lexer, identifier);
680 /* If the current token has the specified TYPE, skips it and returns true.
681 Otherwise, reports an error and returns false. */
683 lex_force_match (struct lexer *lexer, enum token_type type)
685 if (lex_token (lexer) == type)
692 const char *type_string = token_type_to_string (type);
695 char *s = xasprintf ("`%s'", type_string);
696 lex_error_expecting (lexer, s);
700 lex_error_expecting (lexer, token_type_to_name (type));
706 /* If the current token is a string, does nothing and returns true.
707 Otherwise, reports an error and returns false. */
709 lex_force_string (struct lexer *lexer)
711 if (lex_is_string (lexer))
715 lex_error (lexer, _("expecting string"));
720 /* If the current token is a string or an identifier, does nothing and returns
721 true. Otherwise, reports an error and returns false.
723 This is meant for use in syntactic situations where we want to encourage the
724 user to supply a quoted string, but for compatibility we also accept
725 identifiers. (One example of such a situation is file names.) Therefore,
726 the error message issued when the current token is wrong only says that a
727 string is expected and doesn't mention that an identifier would also be
730 lex_force_string_or_id (struct lexer *lexer)
732 return lex_token (lexer) == T_ID || lex_force_string (lexer);
735 /* If the current token is an integer, does nothing and returns true.
736 Otherwise, reports an error and returns false. */
738 lex_force_int (struct lexer *lexer)
740 if (lex_is_integer (lexer))
744 lex_error (lexer, _("expecting integer"));
/* NOTE(review): heavily truncated — the if/else ladder's braces, the success
   return, the remaining branch conditions (e.g. "else if (min == max)"), and
   several argument lists are not visible in this excerpt.  The surviving
   lex_error calls show the error-message taxonomy: exact value, two-value,
   bounded-range, non-negative/positive lower bound, upper bound only, and a
   plain "Integer expected" fallback, each in with-NAME and without-NAME
   variants. */
749 /* If the current token is an integer in the range MIN...MAX (inclusive), does
750 nothing and returns true. Otherwise, reports an error and returns false.
751 If NAME is nonnull, then it is used in the error message. */
753 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
755 bool is_integer = lex_is_integer (lexer);
756 bool too_small = is_integer && lex_integer (lexer) < min;
757 bool too_big = is_integer && lex_integer (lexer) > max;
758 if (is_integer && !too_small && !too_big)
763 /* Weird, maybe a bug in the caller. Just report that we needed an
766 lex_error (lexer, _("Integer expected for %s."), name);
768 lex_error (lexer, _("Integer expected."));
773 lex_error (lexer, _("Expected %ld for %s."), min, name);
775 lex_error (lexer, _("Expected %ld."), min);
777 else if (min + 1 == max)
780 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
782 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
/* Report only bounds that are informative: a bound near the extremes of
   'long' is treated as unbounded unless the actual value violated it. */
786 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
787 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
789 if (report_lower_bound && report_upper_bound)
793 _("Expected integer between %ld and %ld for %s."),
796 lex_error (lexer, _("Expected integer between %ld and %ld."),
799 else if (report_lower_bound)
804 lex_error (lexer, _("Expected non-negative integer for %s."),
807 lex_error (lexer, _("Expected non-negative integer."));
812 lex_error (lexer, _("Expected positive integer for %s."),
815 lex_error (lexer, _("Expected positive integer."));
818 else if (report_upper_bound)
822 _("Expected integer less than or equal to %ld for %s."),
825 lex_error (lexer, _("Expected integer less than or equal to %ld."),
831 lex_error (lexer, _("Integer expected for %s."), name);
833 lex_error (lexer, _("Integer expected."));
839 /* If the current token is a number, does nothing and returns true.
840 Otherwise, reports an error and returns false. */
842 lex_force_num (struct lexer *lexer)
844 if (lex_is_number (lexer))
847 lex_error (lexer, _("expecting number"));
851 /* If the current token is an identifier, does nothing and returns true.
852 Otherwise, reports an error and returns false. */
854 lex_force_id (struct lexer *lexer)
856 if (lex_token (lexer) == T_ID)
859 lex_error (lexer, _("expecting identifier"));
863 /* Token accessors. */
865 /* Returns the type of LEXER's current token. */
867 lex_token (const struct lexer *lexer)
869 return lex_next_token (lexer, 0);
872 /* Returns the number in LEXER's current token.
874 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
875 tokens this function will always return zero. */
877 lex_tokval (const struct lexer *lexer)
879 return lex_next_tokval (lexer, 0);
882 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
884 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
885 this functions this function will always return NULL.
887 The UTF-8 encoding of the returned string is correct for variable names and
888 other identifiers. Use filename_to_utf8() to use it as a filename. Use
889 data_in() to use it in a "union value". */
891 lex_tokcstr (const struct lexer *lexer)
893 return lex_next_tokcstr (lexer, 0);
896 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
897 null-terminated (but the null terminator is not included in the returned
898 substring's 'length').
900 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
901 this functions this function will always return NULL.
903 The UTF-8 encoding of the returned string is correct for variable names and
904 other identifiers. Use filename_to_utf8() to use it as a filename. Use
905 data_in() to use it in a "union value". */
907 lex_tokss (const struct lexer *lexer)
909 return lex_next_tokss (lexer, 0);
/* NOTE(review): the opening of this comment block (presumably "/* Token
   lookahead functions.") is truncated from the excerpt, as is lex_next__'s
   handling of a null source — presumably it returns a T_STOP sentinel;
   confirm against the full file. */
914 A value of 0 for N as an argument to any of these functions refers to the
915 current token. Lookahead is limited to the current command. Any N greater
916 than the number of tokens remaining in the current command will be treated
917 as referring to a T_ENDCMD token. */
919 static const struct lex_token *
920 lex_next__ (const struct lexer *lexer_, int n)
/* Cast away const: lookahead may pull new tokens into the source. */
922 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
923 struct lex_source *src = lex_source__ (lexer);
926 return lex_source_next__ (src, n);
/* Shared sentinel returned when there is no source to read from. */
929 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
934 /* Returns the token in SRC with the greatest lookahead. */
935 static const struct lex_token *
936 lex_source_middle (const struct lex_source *src)
938 assert (src->middle - src->back > 0);
939 return &src->tokens[(src->middle - 1) & (src->capacity - 1)];
/* Returns the token N (0-based) past the current one in SRC, pulling more
   tokens from the reader as needed.  NOTE(review): truncated — the loop body
   braces and the early return when the source is pinned at T_STOP/T_ENDCMD
   are not visible in this excerpt. */
942 static const struct lex_token *
943 lex_source_next__ (const struct lex_source *src, int n)
/* Keep fetching until N tokens of lookahead are available. */
945 while (src->middle - src->back <= n)
947 if (src->middle - src->back > 0)
949 const struct lex_token *middle = lex_source_middle (src);
/* Lookahead never crosses a command boundary or end of input. */
950 if (middle->token.type == T_STOP || middle->token.type == T_ENDCMD)
954 lex_source_get (src);
957 return &src->tokens[(src->back + n) & (src->capacity - 1)];
960 /* Returns the "struct token" of the token N after the current one in LEXER.
961 The returned pointer can be invalidated by pretty much any succeeding call
962 into the lexer, although the string pointer within the returned token is
963 only invalidated by consuming the token (e.g. with lex_get()). */
965 lex_next (const struct lexer *lexer, int n)
967 return &lex_next__ (lexer, n)->token;
970 /* Returns the type of the token N after the current one in LEXER. */
972 lex_next_token (const struct lexer *lexer, int n)
974 return lex_next (lexer, n)->type;
977 /* Returns the number in the tokn N after the current one in LEXER.
979 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
980 tokens this function will always return zero. */
982 lex_next_tokval (const struct lexer *lexer, int n)
984 return token_number (lex_next (lexer, n));
987 /* Returns the null-terminated string in the token N after the current one, in
990 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
991 this functions this function will always return NULL.
993 The UTF-8 encoding of the returned string is correct for variable names and
994 other identifiers. Use filename_to_utf8() to use it as a filename. Use
995 data_in() to use it in a "union value". */
997 lex_next_tokcstr (const struct lexer *lexer, int n)
999 return lex_next_tokss (lexer, n).string;
1002 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1003 The string is null-terminated (but the null terminator is not included in
1004 the returned substring's 'length').
1006 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1007 tokens this functions this function will always return NULL.
1009 The UTF-8 encoding of the returned string is correct for variable names and
1010 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1011 data_in() to use it in a "union value". */
1013 lex_next_tokss (const struct lexer *lexer, int n)
1015 return lex_next (lexer, n)->string;
1018 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1019 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1020 are both zero, this requests the syntax for the current token.) The caller
1021 must eventually free the returned string (with free()). The syntax is
1022 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1023 example, it may include comments, spaces, and new-lines if it spans multiple
1024 tokens. Macro expansion, however, has already been performed. */
1026 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1028 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1032 lex_next_is_from_macro (const struct lexer *lexer, int n)
1034 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Returns true if ACTUAL is equivalent to EXPECTED for matching purposes.
   NOTE(review): truncated — the switch's case labels (number, identifier,
   string, and a default for valueless token types) and the early "return
   false" are not visible in this excerpt. */
1038 lex_tokens_match (const struct token *actual, const struct token *expected)
1040 if (actual->type != expected->type)
1043 switch (actual->type)
1047 return actual->number == expected->number;
/* Identifiers match with abbreviation rules, hence lex_id_match. */
1050 return lex_id_match (expected->string, actual->string);
/* Strings must match exactly, byte for byte. */
1053 return (actual->string.length == expected->string.length
1054 && !memcmp (actual->string.string, expected->string.string,
1055 actual->string.length));
/* NOTE(review): truncated — the declarations of 'token' and 'i', the
   mismatch handling inside the loop, and the final consume-and-return logic
   (advancing LEXER past the matched tokens) are not visible in this
   excerpt. */
1062 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1063 skips it and returns true. Otherwise, returns false.
1065 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1066 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1067 first three letters. */
1069 lex_match_phrase (struct lexer *lexer, const char *s)
1071 struct string_lexer slex;
1076 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1077 while (string_lexer_next (&slex, &token))
1078 if (token.type != SCAN_SKIP)
/* Compare each scanned token of S against successive lookahead tokens. */
1080 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1081 token_uninit (&token);
/* Returns the line number of token N's start within SRC. */
1092 lex_source_get_first_line_number (const struct lex_source *src, int n)
1094 return lex_source_next__ (src, n)->first_line;
/* Counts the '\n' bytes in the LENGTH bytes at S.  NOTE(review): the
   counter declaration/increment and the advance of S past each newline are
   truncated from this excerpt. */
1098 count_newlines (char *s, size_t length)
1103 while ((newline = memchr (s, '\n', length)) != NULL)
1106 length -= (newline + 1) - s;
/* Returns the line number just past token N's end within SRC.
   NOTE(review): the branch taken when first_line == 0 (macro-expanded
   token with no source lines) is truncated from this excerpt. */
1114 lex_source_get_last_line_number (const struct lex_source *src, int n)
1116 const struct lex_token *token = lex_source_next__ (src, n);
1118 if (token->first_line == 0)
1122 char *token_str = &src->buffer[token->token_pos - src->tail];
1123 return token->first_line + count_newlines (token_str, token->token_len) + 1;
/* Returns the display width, in columns, of the LENGTH bytes of UTF-8 at S.
   NOTE(review): the declarations of ofs/mblen/columns and the accumulation
   of 'width' are truncated from this excerpt. */
1128 count_columns (const char *s_, size_t length)
1130 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1136 for (ofs = 0; ofs < length; ofs += mblen)
1140 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1143 int width = uc_width (uc, "UTF-8");
/* Tabs advance to the next 8-column stop. */
1148 columns = ROUND_UP (columns + 1, 8);
/* Returns the column of token N's first byte within SRC, measured in display
   columns from the start of its line.  NOTE(review): these functions
   document 1-based columns at the call sites, but no "+ 1" is visible in
   this excerpt — possibly truncated; confirm against the full file. */
1155 lex_source_get_first_column (const struct lex_source *src, int n)
1157 const struct lex_token *token = lex_source_next__ (src, n);
1158 return count_columns (&src->buffer[token->line_pos - src->tail],
1159 token->token_pos - token->line_pos);
/* Returns the column just past token N's last byte within SRC, measured on
   the last line the token touches. */
1163 lex_source_get_last_column (const struct lex_source *src, int n)
1165 const struct lex_token *token = lex_source_next__ (src, n);
1166 char *start, *end, *newline;
1168 start = &src->buffer[token->line_pos - src->tail];
1169 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
/* If the token spans lines, measure from the start of its last line. */
1170 newline = memrchr (start, '\n', end - start);
1171 if (newline != NULL)
1172 start = newline + 1;
1173 return count_columns (start, end - start);
1176 /* Returns the 1-based line number of the start of the syntax that represents
1177 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1178 if the token is drawn from a source that does not have line numbers. */
1180 lex_get_first_line_number (const struct lexer *lexer, int n)
1182 const struct lex_source *src = lex_source__ (lexer);
1183 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1186 /* Returns the 1-based line number of the end of the syntax that represents the
1187 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1188 token or if the token is drawn from a source that does not have line
1191 Most of the time, a single token is wholly within a single line of syntax,
1192 but there are two exceptions: a T_STRING token can be made up of multiple
1193 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1194 token can consist of a "-" on one line followed by the number on the next.
1197 lex_get_last_line_number (const struct lexer *lexer, int n)
1199 const struct lex_source *src = lex_source__ (lexer);
1200 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1203 /* Returns the 1-based column number of the start of the syntax that represents
1204 the token N after the current one in LEXER. Returns 0 for a T_STOP
1207 Column numbers are measured according to the width of characters as shown in
1208 a typical fixed-width font, in which CJK characters have width 2 and
1209 combining characters have width 0. */
1211 lex_get_first_column (const struct lexer *lexer, int n)
1213 const struct lex_source *src = lex_source__ (lexer);
1214 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1217 /* Returns the 1-based column number of the end of the syntax that represents
1218 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1221 Column numbers are measured according to the width of characters as shown in
1222 a typical fixed-width font, in which CJK characters have width 2 and
1223 combining characters have width 0. */
1225 lex_get_last_column (const struct lexer *lexer, int n)
1227 const struct lex_source *src = lex_source__ (lexer);
1228 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1231 /* Returns the name of the syntax file from which the current command is drawn.
1232 Returns NULL for a T_STOP token or if the command's source does not have
1235 There is no version of this function that takes an N argument because
1236 lookahead only works to the end of a command and any given command is always
1237 within a single syntax file. */
1239 lex_get_file_name (const struct lexer *lexer)
1241 struct lex_source *src = lex_source__ (lexer);
1242 return src == NULL ? NULL : src->reader->file_name;
1245 /* Returns a newly allocated msg_location for the syntax that represents tokens
1246 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1247 must eventually free the location (with msg_location_destroy()). */
1248 struct msg_location *
1249 lex_get_location (const struct lexer *lexer, int n0, int n1)
1251 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1252 loc->first_column = lex_get_first_column (lexer, n0);
1253 loc->last_column = lex_get_last_column (lexer, n1);
1257 /* Returns a newly allocated msg_location for the syntax that represents tokens
1258 with 0-based offsets N0...N1, inclusive, from the current token. The
1259 location only covers the tokens' lines, not the columns. The caller must
1260 eventually free the location (with msg_location_destroy()). */
1261 struct msg_location *
1262 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1264 struct msg_location *loc = xmalloc (sizeof *loc);
1265 *loc = (struct msg_location) {
1266 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1267 .first_line = lex_get_first_line_number (lexer, n0),
1268 .last_line = lex_get_last_line_number (lexer, n1),
1274 lex_get_encoding (const struct lexer *lexer)
1276 struct lex_source *src = lex_source__ (lexer);
1277 return src == NULL ? NULL : src->reader->encoding;
1280 /* Returns the syntax mode for the syntax file from which the current drawn is
1281 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1282 does not have line numbers.
1284 There is no version of this function that takes an N argument because
1285 lookahead only works to the end of a command and any given command is always
1286 within a single syntax file. */
1288 lex_get_syntax_mode (const struct lexer *lexer)
1290 struct lex_source *src = lex_source__ (lexer);
1291 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1294 /* Returns the error mode for the syntax file from which the current drawn is
1295 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1296 source does not have line numbers.
1298 There is no version of this function that takes an N argument because
1299 lookahead only works to the end of a command and any given command is always
1300 within a single syntax file. */
1302 lex_get_error_mode (const struct lexer *lexer)
1304 struct lex_source *src = lex_source__ (lexer);
1305 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1308 /* If the source that LEXER is currently reading has error mode
1309 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1310 token to be read comes directly from whatever is next read from the stream.
1312 It makes sense to call this function after encountering an error in a
1313 command entered on the console, because usually the user would prefer not to
1314 have cascading errors. */
1316 lex_interactive_reset (struct lexer *lexer)
1318 struct lex_source *src = lex_source__ (lexer);
1319 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1321 src->head = src->tail = 0;
1322 src->journal_pos = src->seg_pos = src->line_pos = 0;
1323 src->n_newlines = 0;
1324 src->suppress_next_newline = false;
1325 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1327 while (src->middle - src->back > 0)
1328 lex_source_pop_back (src);
1329 while (src->front - src->middle > 0)
1330 lex_source_pop_front (src);
1331 lex_source_push_endcmd__ (src);
1335 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1337 lex_discard_rest_of_command (struct lexer *lexer)
1339 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1343 /* Discards all lookahead tokens in LEXER, then discards all input sources
1344 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1345 runs out of input sources. */
1347 lex_discard_noninteractive (struct lexer *lexer)
1349 struct lex_source *src = lex_source__ (lexer);
1353 while (src->middle - src->back > 0)
1354 lex_source_pop_back (src);
1356 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1357 src = lex_source__ (lexer))
1358 lex_source_destroy (src);
1363 lex_source_max_tail__ (const struct lex_source *src)
1365 const struct lex_token *token;
1368 assert (src->seg_pos >= src->line_pos);
1369 max_tail = MIN (src->journal_pos, src->line_pos);
1371 /* Use the oldest token also. (We know that src->deque cannot be empty
1372 because we are in the process of adding a new token, which is already
1373 initialized enough to use here.) */
1374 token = &src->tokens[src->back & (src->capacity - 1)];
1375 assert (token->token_pos >= token->line_pos);
1376 max_tail = MIN (max_tail, token->line_pos);
1382 lex_source_expand__ (struct lex_source *src)
1384 if (src->head - src->tail >= src->allocated)
1386 size_t max_tail = lex_source_max_tail__ (src);
1387 if (max_tail > src->tail)
1389 /* Advance the tail, freeing up room at the head. */
1390 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1391 src->head - max_tail);
1392 src->tail = max_tail;
1396 /* Buffer is completely full. Expand it. */
1397 src->buffer = x2realloc (src->buffer, &src->allocated);
1402 /* There's space available at the head of the buffer. Nothing to do. */
1407 lex_source_read__ (struct lex_source *src)
1411 lex_source_expand__ (src);
1413 size_t head_ofs = src->head - src->tail;
1414 size_t space = src->allocated - head_ofs;
1415 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1416 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1418 assert (n <= space);
1423 src->reader->eof = true;
1424 lex_source_expand__ (src);
1430 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1431 src->head - src->seg_pos));
1434 static struct lex_source *
1435 lex_source__ (const struct lexer *lexer)
1437 return (ll_is_empty (&lexer->sources) ? NULL
1438 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1442 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1444 struct string s = DS_EMPTY_INITIALIZER;
1445 for (size_t i = n0; i <= n1; )
1447 /* Find [I,J) as the longest sequence of tokens not produced by macro
1448 expansion, or otherwise the longest sequence expanded from a single
1450 const struct lex_token *first = lex_source_next__ (src, i);
1452 for (j = i + 1; j <= n1; j++)
1454 const struct lex_token *cur = lex_source_next__ (src, j);
1455 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1456 || first->macro_rep != cur->macro_rep)
1459 const struct lex_token *last = lex_source_next__ (src, j - 1);
1461 if (!ds_is_empty (&s))
1462 ds_put_byte (&s, ' ');
1463 if (!first->macro_rep)
1465 size_t start = first->token_pos;
1466 size_t end = last->token_pos + last->token_len;
1467 ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1472 size_t start = first->ofs;
1473 size_t end = last->ofs + last->len;
1474 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1480 return ds_steal_cstr (&s);
1484 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1490 assert (out_size >= 16);
1491 out_maxlen = out_size - 1;
1492 if (in.length > out_maxlen - 3)
1495 for (out_len = 0; out_len < in.length; out_len += mblen)
1497 if (in.string[out_len] == '\n'
1498 || in.string[out_len] == '\0'
1499 || (in.string[out_len] == '\r'
1500 && out_len + 1 < in.length
1501 && in.string[out_len + 1] == '\n'))
1504 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1505 in.length - out_len);
1510 if (out_len + mblen > out_maxlen)
1514 memcpy (out, in.string, out_len);
1515 strcpy (&out[out_len], out_len < in.length ? "..." : "");
1519 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1521 for (size_t i = n0; i <= n1; i++)
1522 if (lex_source_next__ (src, i)->macro_rep)
1527 static struct substring
1528 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1530 if (!lex_source_contains_macro_call (src, n0, n1))
1533 const struct lex_token *token0 = lex_source_next__ (src, n0);
1534 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1535 size_t start = token0->token_pos;
1536 size_t end = token1->token_pos + token1->token_len;
1538 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Emits a syntax-error message for tokens N0 through N1 (0-based offsets from
   the current token) in SRC, with FORMAT/ARGS as the printf-style detail text.
   Builds a one-line summary naming the offending syntax (and the macro call it
   expanded from, if any), then submits a MSG_C_SYNTAX/MSG_S_ERROR message with
   a full source location.  NOTE(review): several structural lines (braces,
   returns) are missing from this excerpt; only visible lines are annotated. */
1542 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1543 const char *format, va_list args)
1545 const struct lex_token *token;
/* Special-case an error reported at the command terminator. */
1550 token = lex_source_next__ (src, n0);
1551 if (token->token.type == T_ENDCMD)
1552 ds_put_cstr (&s, _("Syntax error at end of command"));
1555 /* Get the syntax that caused the error. */
1556 char *syntax = lex_source_get_syntax__ (src, n0, n1);
1557 char syntax_cstr[64];
/* Ellipsized copies keep the message a bounded length. */
1558 lex_ellipsize__ (ss_cstr (syntax), syntax_cstr, sizeof syntax_cstr);
1561 /* Get the macro call(s) that expanded to the syntax that caused the
1564 struct substring call = lex_source_get_macro_call (src, n0, n1);
1565 lex_ellipsize__ (call, call_cstr, sizeof call_cstr);
/* Choose a summary phrasing based on what is known: plain syntax, syntax
   inside a macro expansion, only the macro call, or nothing at all. */
1570 ds_put_format (&s, _("Syntax error at `%s' "
1571 "(in expansion of `%s')"),
1572 syntax_cstr, call_cstr);
1574 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1576 else if (call_cstr[0])
1577 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1580 ds_put_cstr (&s, _("Syntax error"));
/* Append the caller-provided detail and ensure terminating punctuation. */
1585 ds_put_cstr (&s, ": ");
1586 ds_put_vformat (&s, format, args);
1588 if (ds_last (&s) != '.')
1589 ds_put_byte (&s, '.');
/* Build the location covering the whole offending token range. */
1591 struct msg_location *location = xmalloc (sizeof *location);
1592 *location = (struct msg_location) {
1593 .file_name = xstrdup_if_nonnull (src->reader->file_name),
1594 .first_line = lex_source_get_first_line_number (src, n0),
1595 .last_line = lex_source_get_last_line_number (src, n1),
1596 .first_column = lex_source_get_first_column (src, n0),
1597 .last_column = lex_source_get_last_column (src, n1),
/* Hand ownership of the text and location to the message system. */
1599 struct msg *m = xmalloc (sizeof *m);
1601 .category = MSG_C_SYNTAX,
1602 .severity = MSG_S_ERROR,
1603 .location = location,
1604 .text = ds_steal_cstr (&s),
1609 static void PRINTF_FORMAT (2, 3)
1610 lex_get_error (struct lex_source *src, const char *format, ...)
1613 va_start (args, format);
1615 size_t old_middle = src->middle;
1616 src->middle = src->front;
1617 size_t n = src->front - src->back - 1;
1618 lex_source_error_valist (src, n, n, format, args);
1619 src->middle = old_middle;
1621 lex_source_pop_front (src);
1626 /* Attempts to append an additional token at the front of SRC, reading more
1627 from the underlying lex_reader if necessary. Returns true if a new token
1628 was added to SRC's deque, false otherwise. The caller should retry failures
1629 unless SRC's 'eof' marker was set to true indicating that there will be no
1630 more tokens from this source.
1632 Does not make the new token available for lookahead yet; the caller must
1633 adjust SRC's 'middle' pointer to do so. */
1635 lex_source_try_get__ (struct lex_source *src)
1637 /* State maintained while scanning tokens. Usually we only need a single
1638 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1639 needs to be saved and possibly restored later with SCAN_BACK. */
1642 struct segmenter segmenter;
1643 enum segment_type last_segment;
1644 int newlines; /* Number of newlines encountered so far. */
1645 /* Maintained here so we can update lex_source's similar members when we
1651 /* Initialize state. */
1652 struct state state =
1654 .segmenter = src->segmenter,
1656 .seg_pos = src->seg_pos,
1657 .line_pos = src->line_pos,
1659 struct state saved = state;
1661 /* Append a new token to SRC and initialize it. */
1662 struct lex_token *token = lex_push_token__ (src);
1663 struct scanner scanner;
1664 scanner_init (&scanner, &token->token);
1665 token->line_pos = src->line_pos;
1666 token->token_pos = src->seg_pos;
/* Record the 1-based first line of the token when the reader tracks line
   numbers; 0 means "no line number information". */
1667 if (src->reader->line_number > 0)
1668 token->first_line = src->reader->line_number + src->n_newlines;
1670 token->first_line = 0;
1672 /* Extract segments and pass them through the scanner until we obtain a
1676 /* Extract a segment. */
1677 const char *segment = &src->buffer[state.seg_pos - src->tail];
1678 size_t seg_maxlen = src->head - state.seg_pos;
1679 enum segment_type type;
1680 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1681 src->reader->eof, &type);
1684 /* The segmenter needs more input to produce a segment. */
1685 assert (!src->reader->eof);
1686 lex_source_read__ (src);
1690 /* Update state based on the segment. */
1691 state.last_segment = type;
1692 state.seg_pos += seg_len;
1693 if (type == SEG_NEWLINE)
1696 state.line_pos = state.seg_pos;
1699 /* Pass the segment into the scanner and try to get a token out. */
1700 enum scan_result result = scanner_push (&scanner, type,
1701 ss_buffer (segment, seg_len),
/* SCAN_SAVE/SCAN_BACK implement the scanner's one-level backtracking via the
   'saved' state snapshot above. */
1703 if (result == SCAN_SAVE)
1705 else if (result == SCAN_BACK)
1710 else if (result == SCAN_DONE)
1714 /* If we've reached the end of a line, or the end of a command, then pass
1715 the line to the output engine as a syntax text item. */
1716 int n_lines = state.newlines;
1717 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1720 src->suppress_next_newline = true;
1722 else if (n_lines > 0 && src->suppress_next_newline)
1725 src->suppress_next_newline = false;
1727 for (int i = 0; i < n_lines; i++)
1729 /* Beginning of line. */
1730 const char *line = &src->buffer[src->journal_pos - src->tail];
1732 /* Calculate line length, including \n or \r\n end-of-line if present.
1734 We use src->head even though that may be beyond what we've actually
1735 converted to tokens (which is only through state.line_pos). That's
1736 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1737 take the whole line through the newline, not just through the '.'. */
1738 size_t max_len = src->head - src->journal_pos;
1739 const char *newline = memchr (line, '\n', max_len);
1740 size_t line_len = newline ? newline - line + 1 : max_len;
1742 /* Calculate line length excluding end-of-line. */
1743 size_t copy_len = line_len;
1744 if (copy_len > 0 && line[copy_len - 1] == '\n')
1746 if (copy_len > 0 && line[copy_len - 1] == '\r')
1749 /* Submit the line as syntax. */
1750 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1751 xmemdup0 (line, copy_len),
1754 src->journal_pos += line_len;
/* Commit the scan: record the token's length and fold the scratch state back
   into the source. */
1757 token->token_len = state.seg_pos - src->seg_pos;
1759 src->segmenter = state.segmenter;
1760 src->seg_pos = state.seg_pos;
1761 src->line_pos = state.line_pos;
1762 src->n_newlines += state.newlines;
/* Post-process the scanned token: translate pseudo-token types and report
   scan-level errors (each error pops the bad token via lex_get_error()). */
1764 switch (token->token.type)
1770 token->token.type = T_ENDCMD;
1774 case SCAN_BAD_HEX_LENGTH:
1775 lex_get_error (src, _("String of hex digits has %d characters, which "
1776 "is not a multiple of 2"),
1777 (int) token->token.number);
1780 case SCAN_BAD_HEX_DIGIT:
1781 case SCAN_BAD_UNICODE_DIGIT:
1782 lex_get_error (src, _("`%c' is not a valid hex digit"),
1783 (int) token->token.number);
1786 case SCAN_BAD_UNICODE_LENGTH:
1787 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1788 "not in the valid range of 1 to 8 bytes"),
1789 (int) token->token.number);
1792 case SCAN_BAD_UNICODE_CODE_POINT:
1793 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1794 (int) token->token.number);
1797 case SCAN_EXPECTED_QUOTE:
1798 lex_get_error (src, _("Unterminated string constant"));
1801 case SCAN_EXPECTED_EXPONENT:
1802 lex_get_error (src, _("Missing exponent following `%s'"),
1803 token->token.string.string);
1806 case SCAN_UNEXPECTED_CHAR:
1809 lex_get_error (src, _("Bad character %s in input"),
1810 uc_name (token->token.number, c_name)));
1815 lex_source_pop_front (src);
1822 /* Attempts to add a new token at the front of SRC. Returns true if
1823 successful, false on failure. On failure, the end of SRC has been reached
1824 and no more tokens will be forthcoming from it.
1826 Does not make the new token available for lookahead yet; the caller must
1827 adjust SRC's 'middle' pointer to do so. */
1829 lex_source_get__ (struct lex_source *src)
1832 if (lex_source_try_get__ (src))
/* Makes at least one new token available for lookahead in SRC, expanding
   macros when enabled.  NOTE(review): structural lines (braces, returns) are
   missing from this excerpt; only visible lines are annotated. */
1838 lex_source_get (const struct lex_source *src_)
1840 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Ensure there is at least one token beyond 'middle' to consider. */
1842 if (src->front - src->middle == 0)
1844 if (!lex_source_get__ (src))
/* With macro expansion disabled, the raw token is the lookahead token. */
1848 if (!settings_get_mexpand ())
/* Try to interpret the token at 'middle' as the start of a macro call. */
1854 struct macro_expander *me;
1855 int n_call = macro_expander_create (
1856 src->lexer->macros, &src->tokens[src->middle & (src->capacity - 1)].token,
/* Feed successive tokens into the expander until it decides whether this is
   a complete macro call (n_call > 0) or not a call at all. */
1858 for (int middle_ofs = 1; !n_call; middle_ofs++)
1860 if (src->front - src->middle <= middle_ofs && !lex_source_get__ (src))
1862 /* This should not be reachable because we always get a T_ENDCMD at
1863 the end of an input file (transformed from T_STOP by
1864 lex_source_try_get__()) and the macro_expander should always
1865 terminate expansion on T_ENDCMD. */
1869 const struct lex_token *t = &src->tokens[(src->middle + middle_ofs)
1870 & (src->capacity - 1)];
1871 size_t start = t->token_pos;
1872 size_t end = t->token_pos + t->token_len;
1873 const struct macro_token mt = {
1875 .representation = ss_buffer (&src->buffer[start - src->tail],
/* 'middle' is bumped around macro_expander_add() so that any error messages
   it emits can see the tokens involved. */
1878 src->middle += middle_ofs + 1;
1879 n_call = macro_expander_add (me, &mt);
1880 src->middle -= middle_ofs + 1;
1884 /* False alarm: no macro expansion after all. Use first token as
1885 lookahead. We'll retry macro expansion from the second token next
1887 macro_expander_destroy (me);
1892 /* The first 'n_call' tokens starting at 'middle' will be replaced by a
1893 macro expansion. There might be more tokens after that, up to 'front'.
1895 Figure out the boundary of the macro call in the syntax, to go into the
1896 lex_tokens for the expansion so that later error messages can report what
1897 macro was called. */
1898 const struct lex_token *call_first
1899 = &src->tokens[src->middle & (src->capacity - 1)];
1900 const struct lex_token *call_last
1901 = &src->tokens[(src->middle + n_call - 1) & (src->capacity - 1)];
1902 size_t call_pos = call_first->token_pos;
1903 size_t call_len = (call_last->token_pos + call_last->token_len) - call_pos;
1904 size_t line_pos = call_first->line_pos;
1905 int first_line = call_first->first_line;
1907 /* Destroy the tokens for the call, and save any tokens following the call so
1908 we can add them back later. */
1909 for (size_t i = src->middle; i != src->middle + n_call; i++)
1910 lex_token_uninit (&src->tokens[i & (src->capacity - 1)]);
1911 size_t n_save = src->front - (src->middle + n_call);
1912 struct lex_token *save_tokens = xnmalloc (n_save, sizeof *save_tokens);
1913 for (size_t i = 0; i < n_save; i++)
1914 save_tokens[i] = src->tokens[(src->middle + n_call + i)
1915 & (src->capacity - 1)];
1916 src->front = src->middle;
1918 /* Now expand the macro. */
1919 struct macro_tokens expansion = { .n = 0 };
1920 macro_expander_get_expansion (me, &expansion);
1921 macro_expander_destroy (me);
1923 /* Convert the macro expansion into syntax for possible error messages later. */
1924 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1925 size_t *len = xnmalloc (expansion.n, sizeof *len);
1926 struct string s = DS_EMPTY_INITIALIZER;
1927 macro_tokens_to_representation (&expansion, &s, ofs, len);
/* With SET MPRINT on, log the expansion text for the user. */
1929 if (settings_get_mprint ())
1930 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1931 _("Macro Expansion")));
1933 /* Append the macro expansion tokens to the lookahead. */
1934 char *macro_rep = ds_steal_cstr (&s);
/* 'macro_rep' is shared by every expansion token; 'ref_cnt' tracks how many
   still reference it so the last one freed can release it. */
1935 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1936 *ref_cnt = expansion.n;
1937 for (size_t i = 0; i < expansion.n; i++)
1939 *lex_push_token__ (src) = (struct lex_token) {
1940 .token = expansion.mts[i].token,
1941 .token_pos = call_pos,
1942 .token_len = call_len,
1943 .line_pos = line_pos,
1944 .first_line = first_line,
1945 .macro_rep = macro_rep,
1952 ss_dealloc (&expansion.mts[i].representation);
1954 free (expansion.mts);
1958 /* Finally, put the saved tokens back. */
1959 for (size_t i = 0; i < n_save; i++)
1960 *lex_push_token__ (src) = save_tokens[i];
/* Pushes a single T_ENDCMD token onto SRC's token deque, which must be
   empty. */
1967 lex_source_push_endcmd__ (struct lex_source *src)
/* All three deque indexes equal means the deque holds no tokens. */
1969 assert (src->back == src->middle && src->middle == src->front);
1970 *lex_push_token__ (src) = (struct lex_token) {
1971 .token = { .type = T_ENDCMD } };
/* Creates and returns a new lex_source that reads from READER on behalf of
   LEXER.  NOTE(review): several initializer fields and the list-insertion
   code fall outside this excerpt; only visible lines are annotated. */
1975 static struct lex_source *
1976 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1978 struct lex_source *src = xmalloc (sizeof *src);
1979 *src = (struct lex_source) {
/* The segmenter starts in the reader's declared syntax mode. */
1981 .segmenter = segmenter_init (reader->syntax, false),
/* Seed the new source's (empty) token deque with an initial T_ENDCMD. */
1985 lex_source_push_endcmd__ (src);
1991 lex_source_destroy (struct lex_source *src)
1993 char *file_name = src->reader->file_name;
1994 char *encoding = src->reader->encoding;
1995 if (src->reader->class->destroy != NULL)
1996 src->reader->class->destroy (src->reader);
2000 while (src->middle - src->back > 0)
2001 lex_source_pop_back (src);
2002 while (src->front - src->middle > 0)
2003 lex_source_pop_front (src);
2005 ll_remove (&src->ll);
/* A lex_reader that reads syntax from a file (or from stdin when the file
   name is "-") through a u8_istream. */
2009 struct lex_file_reader
/* Common lex_reader header; must be first so UP_CAST works. */
2011 struct lex_reader reader;
/* Underlying stream; closed or freed in lex_file_close(). */
2012 struct u8_istream *istream;
2015 static struct lex_reader_class lex_file_reader_class;
2017 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2018 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2019 ENCODING, which should take one of the forms accepted by
2020 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2021 mode of the new reader, respectively.
2023 Returns a null pointer if FILE_NAME cannot be opened. */
2025 lex_reader_for_file (const char *file_name, const char *encoding,
2026 enum segmenter_mode syntax,
2027 enum lex_error_mode error)
2029 struct lex_file_reader *r;
2030 struct u8_istream *istream;
2032 istream = (!strcmp(file_name, "-")
2033 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2034 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2035 if (istream == NULL)
2037 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2041 r = xmalloc (sizeof *r);
2042 lex_reader_init (&r->reader, &lex_file_reader_class);
2043 r->reader.syntax = syntax;
2044 r->reader.error = error;
2045 r->reader.file_name = xstrdup (file_name);
2046 r->reader.encoding = xstrdup_if_nonnull (encoding);
2047 r->reader.line_number = 1;
2048 r->istream = istream;
2053 static struct lex_file_reader *
2054 lex_file_reader_cast (struct lex_reader *r)
2056 return UP_CAST (r, struct lex_file_reader, reader);
2060 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2061 enum prompt_style prompt_style UNUSED)
2063 struct lex_file_reader *r = lex_file_reader_cast (r_);
2064 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2067 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2074 lex_file_close (struct lex_reader *r_)
2076 struct lex_file_reader *r = lex_file_reader_cast (r_);
2078 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2080 if (u8_istream_close (r->istream) != 0)
2081 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2084 u8_istream_free (r->istream);
2089 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that reads syntax from an in-memory substring.
   NOTE(review): the field declarations for the string and the read offset
   fall outside this excerpt; lex_string_read() accesses r->s and
   r->offset. */
2095 struct lex_string_reader
/* Common lex_reader header; must be first so UP_CAST works. */
2097 struct lex_reader reader;
2102 static struct lex_reader_class lex_string_reader_class;
2104 /* Creates and returns a new lex_reader for the contents of S, which must be
2105 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2106 with ss_dealloc() when it is closed. */
2108 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2110 struct lex_string_reader *r;
2112 r = xmalloc (sizeof *r);
2113 lex_reader_init (&r->reader, &lex_string_reader_class);
2114 r->reader.syntax = SEG_MODE_AUTO;
2115 r->reader.encoding = xstrdup_if_nonnull (encoding);
2122 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2123 which must be encoded in ENCODING. The caller retains ownership of S. */
2125 lex_reader_for_string (const char *s, const char *encoding)
2127 struct substring ss;
2128 ss_alloc_substring (&ss, ss_cstr (s));
2129 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string and creates and returns a
   new lex_reader for the formatted result, encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  /* xvasprintf() allocates the formatted string; the nocopy reader takes
     ownership of it. */
  struct lex_reader *r
    = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)),
                                       encoding);
  va_end (args);

  return r;
}
2147 static struct lex_string_reader *
2148 lex_string_reader_cast (struct lex_reader *r)
2150 return UP_CAST (r, struct lex_string_reader, reader);
2154 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2155 enum prompt_style prompt_style UNUSED)
2157 struct lex_string_reader *r = lex_string_reader_cast (r_);
2160 chunk = MIN (n, r->s.length - r->offset);
2161 memcpy (buf, r->s.string + r->offset, chunk);
2168 lex_string_close (struct lex_reader *r_)
2170 struct lex_string_reader *r = lex_string_reader_cast (r_);
2176 static struct lex_reader_class lex_string_reader_class =