1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/macro.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
71 src->tail <= line_pos <= token_pos <= src->head. */
72 size_t token_pos; /* Start of token. */
73 size_t token_len; /* Length of source for token in bytes. */
74 size_t line_pos; /* Start of line containing token_pos. */
75 int first_line; /* Line number at token_pos. */
77 /* For a token obtained through macro expansion, this is just this token.
79 For a token obtained through the lexer in an ordinary way, these are
81 char *macro_rep; /* The whole macro expansion. */
82 size_t ofs; /* Offset of this token in macro_rep. */
83 size_t len; /* Length of this token in macro_rep. */
84 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
88 lex_token_uninit (struct lex_token *t)
90 token_uninit (&t->token);
93 assert (*t->ref_cnt > 0);
102 /* A source of tokens, corresponding to a syntax file.
104 This is conceptually a lex_reader wrapped with everything needed to convert
105 its UTF-8 bytes into tokens. */
108 struct ll ll; /* In lexer's list of sources. */
109 struct lex_reader *reader;
111 struct segmenter segmenter;
112 bool eof; /* True if T_STOP was read from 'reader'. */
114 /* Buffer of UTF-8 bytes. */
116 size_t allocated; /* Number of bytes allocated. */
117 size_t tail; /* &buffer[0] offset into UTF-8 source. */
118 size_t head; /* &buffer[head - tail] offset into source. */
120 /* Positions in source file, tail <= pos <= head for each member here. */
121 size_t journal_pos; /* First byte not yet output to journal. */
122 size_t seg_pos; /* First byte not yet scanned as token. */
123 size_t line_pos; /* First byte of line containing seg_pos. */
125 int n_newlines; /* Number of new-lines up to seg_pos. */
126 bool suppress_next_newline;
130 This is mostly like a deque, with the invariant that 'back <= middle <=
131 front' (modulo SIZE_MAX+1). The tokens available for parsing are
132 between 'back' and 'middle': the token at 'back' is the current token,
133 the token at 'back + 1' is the next token, and so on. There are usually
134 no tokens between 'middle' and 'front'; if there are, then they need to
135 go through macro expansion and are not yet available for parsing.
137 'capacity' is the current number of elements in 'tokens'. It is always
138 a power of 2. 'front', 'middle', and 'back' refer to indexes in
139 'tokens' modulo 'capacity'. */
144 size_t mask; /* capacity - 1 */
145 struct lex_token *tokens;
148 static struct lex_source *lex_source_create (struct lexer *,
149 struct lex_reader *);
150 static void lex_source_destroy (struct lex_source *);
155 struct ll_list sources; /* Contains "struct lex_source"s. */
156 struct macro_set *macros;
159 static struct lex_source *lex_source__ (const struct lexer *);
160 static char *lex_source_get_syntax__ (const struct lex_source *,
162 static const struct lex_token *lex_next__ (const struct lexer *, int n);
163 static void lex_source_push_endcmd__ (struct lex_source *);
165 static void lex_source_pop_back (struct lex_source *);
166 static bool lex_source_get (const struct lex_source *);
167 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
168 const char *format, va_list)
169 PRINTF_FORMAT (4, 0);
170 static const struct lex_token *lex_source_next__ (const struct lex_source *,
173 /* Initializes READER with the specified CLASS and otherwise some reasonable
174 defaults. The caller should fill in the others members as desired. */
176 lex_reader_init (struct lex_reader *reader,
177 const struct lex_reader_class *class)
179 reader->class = class;
180 reader->syntax = SEG_MODE_AUTO;
181 reader->error = LEX_ERROR_CONTINUE;
182 reader->file_name = NULL;
183 reader->encoding = NULL;
184 reader->line_number = 0;
188 /* Frees any file name already in READER and replaces it by a copy of
189 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
191 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
193 free (reader->file_name);
194 reader->file_name = xstrdup_if_nonnull (file_name);
197 /* Creates and returns a new lexer. */
201 struct lexer *lexer = xmalloc (sizeof *lexer);
202 *lexer = (struct lexer) {
203 .sources = LL_INITIALIZER (lexer->sources),
204 .macros = macro_set_create (),
209 /* Destroys LEXER. */
211 lex_destroy (struct lexer *lexer)
215 struct lex_source *source, *next;
217 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
218 lex_source_destroy (source);
219 macro_set_destroy (lexer->macros);
224 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
225 same name. Takes ownership of M. */
227 lex_define_macro (struct lexer *lexer, struct macro *m)
229 macro_set_add (lexer->macros, m);
232 /* Inserts READER into LEXER so that the next token read by LEXER comes from
233 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
236 lex_include (struct lexer *lexer, struct lex_reader *reader)
238 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
239 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
242 /* Appends READER to LEXER, so that it will be read after all other current
243 readers have already been read. */
245 lex_append (struct lexer *lexer, struct lex_reader *reader)
247 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
252 /* Adds a new token at the front of SRC and returns a pointer to it. The
253 caller should initialize it. Does not advance the middle pointer, so the
254 token isn't immediately available to the parser. */
255 static struct lex_token *
256 lex_push_token__ (struct lex_source *src)
258 if (src->front - src->back >= src->capacity)
260 /* Expansion works just like a deque, so we reuse the code. */
261 struct deque deque = {
262 .capacity = src->capacity,
266 src->tokens = deque_expand (&deque, src->tokens, sizeof *src->tokens);
267 src->capacity = deque.capacity;
268 src->mask = src->capacity - 1;
271 struct lex_token *token = &src->tokens[src->front++ & src->mask];
272 token->token = (struct token) { .type = T_STOP };
273 token->macro_rep = NULL;
274 token->ref_cnt = NULL;
278 /* Removes the current token from SRC and uninitializes it. */
280 lex_source_pop_back (struct lex_source *src)
282 assert (src->middle - src->back > 0);
283 lex_token_uninit (&src->tokens[src->back++ & src->mask]);
286 /* Removes the token at the greatest lookahead from SRC and uninitializes
289 lex_source_pop_front (struct lex_source *src)
291 assert (src->front - src->middle > 0);
292 lex_token_uninit (&src->tokens[--src->front & src->mask]);
295 /* Advances LEXER to the next token, consuming the current token. */
297 lex_get (struct lexer *lexer)
299 struct lex_source *src;
301 src = lex_source__ (lexer);
305 if (src->middle - src->back > 0)
306 lex_source_pop_back (src);
308 while (src->back == src->middle)
309 if (!lex_source_get (src))
311 lex_source_destroy (src);
312 src = lex_source__ (lexer);
318 /* Issuing errors. */
/* Prints a syntax error message containing the current token and
   given message MESSAGE (if non-null). */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, 0, 0, format, args);
  va_end (args);
}

/* Prints a syntax error message containing the current token and
   given message MESSAGE (if non-null). */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  lex_next_error_valist (lexer, 0, 0, format, args);
}

/* Prints a syntax error message containing the token N0 through N1 ahead of
   the current one and the given message MESSAGE (if non-null). */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, n0, n1, format, args);
  va_end (args);
}

/* Prints a syntax error message saying that one of the strings provided as
   varargs, up to the first NULL, is expected. */
void
(lex_error_expecting) (struct lexer *lexer, ...)
{
  va_list args;

  va_start (args, lexer);
  lex_error_expecting_valist (lexer, args);
  va_end (args);
}

/* Prints a syntax error message saying that one of the options provided in
   ARGS, up to the first NULL, is expected. */
void
lex_error_expecting_valist (struct lexer *lexer, va_list args)
{
  /* Collect at most MAX_OPTIONS alternatives; the array formatter only
     knows how to phrase up to that many. */
  enum { MAX_OPTIONS = 9 };
  const char *options[MAX_OPTIONS];
  int n = 0;
  while (n < MAX_OPTIONS)
    {
      const char *option = va_arg (args, const char *);
      if (option == NULL)
        break;

      options[n++] = option;
    }
  lex_error_expecting_array (lexer, options, n);
}
/* Prints a syntax error message saying that one of the N strings in OPTIONS
   is expected.  With zero options (or more than this function knows how to
   phrase) it falls back to a plain syntax error. */
void
lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
{
  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      break;

    case 1:
      lex_error (lexer, _("expecting %s"), options[0]);
      break;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
      break;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
                 options[2]);
      break;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3]);
      break;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4]);
      break;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5]);
      break;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6]);
      break;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6], options[7]);
      break;

    default:
      lex_error (lexer, NULL);
    }
}
438 /* Reports an error to the effect that subcommand SBC may only be specified
441 This function does not take a lexer as an argument or use lex_error(),
442 because the result would ordinarily just be redundant: "Syntax error at
443 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
444 not help the user find the error. */
446 lex_sbc_only_once (const char *sbc)
448 msg (SE, _("Subcommand %s may only be specified once."), sbc);
451 /* Reports an error to the effect that subcommand SBC is missing.
453 This function does not take a lexer as an argument or use lex_error(),
454 because a missing subcommand can normally be detected only after the whole
455 command has been parsed, and so lex_error() would always report "Syntax
456 error at end of command", which does not help the user find the error. */
458 lex_sbc_missing (const char *sbc)
460 msg (SE, _("Required subcommand %s was not specified."), sbc);
463 /* Reports an error to the effect that specification SPEC may only be specified
464 once within subcommand SBC. */
466 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
468 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
472 /* Reports an error to the effect that specification SPEC is missing within
475 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
477 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
481 /* Prints a syntax error message containing the current token and
482 given message MESSAGE (if non-null). */
484 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
485 const char *format, va_list args)
487 struct lex_source *src = lex_source__ (lexer);
490 lex_source_error_valist (src, n0, n1, format, args);
496 ds_put_format (&s, _("Syntax error at end of input"));
499 ds_put_cstr (&s, ": ");
500 ds_put_vformat (&s, format, args);
502 ds_put_byte (&s, '.');
503 msg (SE, "%s", ds_cstr (&s));
508 /* Checks that we're at end of command.
509 If so, returns a successful command completion code.
510 If not, flags a syntax error and returns an error command
513 lex_end_of_command (struct lexer *lexer)
515 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
517 lex_error (lexer, _("expecting end of command"));
524 /* Token testing functions. */
526 /* Returns true if the current token is a number. */
528 lex_is_number (const struct lexer *lexer)
530 return lex_next_is_number (lexer, 0);
533 /* Returns true if the current token is a string. */
535 lex_is_string (const struct lexer *lexer)
537 return lex_next_is_string (lexer, 0);
540 /* Returns the value of the current token, which must be a
541 floating point number. */
543 lex_number (const struct lexer *lexer)
545 return lex_next_number (lexer, 0);
548 /* Returns true iff the current token is an integer. */
550 lex_is_integer (const struct lexer *lexer)
552 return lex_next_is_integer (lexer, 0);
555 /* Returns the value of the current token, which must be an
558 lex_integer (const struct lexer *lexer)
560 return lex_next_integer (lexer, 0);
563 /* Token testing functions with lookahead.
565 A value of 0 for N as an argument to any of these functions refers to the
566 current token. Lookahead is limited to the current command. Any N greater
567 than the number of tokens remaining in the current command will be treated
568 as referring to a T_ENDCMD token. */
570 /* Returns true if the token N ahead of the current token is a number. */
572 lex_next_is_number (const struct lexer *lexer, int n)
574 return token_is_number (lex_next (lexer, n));
577 /* Returns true if the token N ahead of the current token is a string. */
579 lex_next_is_string (const struct lexer *lexer, int n)
581 return token_is_string (lex_next (lexer, n));
584 /* Returns the value of the token N ahead of the current token, which must be a
585 floating point number. */
587 lex_next_number (const struct lexer *lexer, int n)
589 return token_number (lex_next (lexer, n));
592 /* Returns true if the token N ahead of the current token is an integer. */
594 lex_next_is_integer (const struct lexer *lexer, int n)
596 return token_is_integer (lex_next (lexer, n));
599 /* Returns the value of the token N ahead of the current token, which must be
602 lex_next_integer (const struct lexer *lexer, int n)
604 return token_integer (lex_next (lexer, n));
607 /* Token matching functions. */
609 /* If the current token has the specified TYPE, skips it and returns true.
610 Otherwise, returns false. */
612 lex_match (struct lexer *lexer, enum token_type type)
614 if (lex_token (lexer) == type)
623 /* If the current token matches IDENTIFIER, skips it and returns true.
624 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
627 IDENTIFIER must be an ASCII string. */
629 lex_match_id (struct lexer *lexer, const char *identifier)
631 return lex_match_id_n (lexer, identifier, 3);
634 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
635 may be abbreviated to its first N letters. Otherwise, returns false.
637 IDENTIFIER must be an ASCII string. */
639 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
641 if (lex_token (lexer) == T_ID
642 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
651 /* If the current token is integer X, skips it and returns true. Otherwise,
654 lex_match_int (struct lexer *lexer, int x)
656 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
665 /* Forced matches. */
667 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
668 abbreviated to its first 3 letters. Otherwise, reports an error and returns
671 IDENTIFIER must be an ASCII string. */
673 lex_force_match_id (struct lexer *lexer, const char *identifier)
675 if (lex_match_id (lexer, identifier))
679 lex_error_expecting (lexer, identifier);
684 /* If the current token has the specified TYPE, skips it and returns true.
685 Otherwise, reports an error and returns false. */
687 lex_force_match (struct lexer *lexer, enum token_type type)
689 if (lex_token (lexer) == type)
696 const char *type_string = token_type_to_string (type);
699 char *s = xasprintf ("`%s'", type_string);
700 lex_error_expecting (lexer, s);
704 lex_error_expecting (lexer, token_type_to_name (type));
710 /* If the current token is a string, does nothing and returns true.
711 Otherwise, reports an error and returns false. */
713 lex_force_string (struct lexer *lexer)
715 if (lex_is_string (lexer))
719 lex_error (lexer, _("expecting string"));
724 /* If the current token is a string or an identifier, does nothing and returns
725 true. Otherwise, reports an error and returns false.
727 This is meant for use in syntactic situations where we want to encourage the
728 user to supply a quoted string, but for compatibility we also accept
729 identifiers. (One example of such a situation is file names.) Therefore,
730 the error message issued when the current token is wrong only says that a
731 string is expected and doesn't mention that an identifier would also be
734 lex_force_string_or_id (struct lexer *lexer)
736 return lex_token (lexer) == T_ID || lex_force_string (lexer);
739 /* If the current token is an integer, does nothing and returns true.
740 Otherwise, reports an error and returns false. */
742 lex_force_int (struct lexer *lexer)
744 if (lex_is_integer (lexer))
748 lex_error (lexer, _("expecting integer"));
/* If the current token is an integer in the range MIN...MAX (inclusive), does
   nothing and returns true.  Otherwise, reports an error and returns false.
   If NAME is nonnull, then it is used in the error message. */
bool
lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
{
  bool is_integer = lex_is_integer (lexer);
  bool too_small = is_integer && lex_integer (lexer) < min;
  bool too_big = is_integer && lex_integer (lexer) > max;
  if (is_integer && !too_small && !too_big)
    return true;

  if (min > max)
    {
      /* Weird, maybe a bug in the caller.  Just report that we needed an
         integer. */
      if (name)
        lex_error (lexer, _("Integer expected for %s."), name);
      else
        lex_error (lexer, _("Integer expected."));
    }
  else if (min == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld for %s."), min, name);
      else
        lex_error (lexer, _("Expected %ld."), min);
    }
  else if (min + 1 == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1,
                   name);
      else
        lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
    }
  else
    {
      /* Only mention a bound if it is meaningfully restrictive or the value
         actually violated it; huge sentinel bounds just add noise. */
      bool report_lower_bound = (min > INT_MIN / 2) || too_small;
      bool report_upper_bound = (max < INT_MAX / 2) || too_big;

      if (report_lower_bound && report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer between %ld and %ld for %s."),
                       min, max, name);
          else
            lex_error (lexer, _("Expected integer between %ld and %ld."),
                       min, max);
        }
      else if (report_lower_bound)
        {
          if (min == 0)
            {
              if (name)
                lex_error (lexer, _("Expected non-negative integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected non-negative integer."));
            }
          else if (min == 1)
            {
              if (name)
                lex_error (lexer, _("Expected positive integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected positive integer."));
            }
        }
      else if (report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer less than or equal to %ld for %s."),
                       max, name);
          else
            lex_error (lexer, _("Expected integer less than or equal to %ld."),
                       max);
        }
      else
        {
          if (name)
            lex_error (lexer, _("Integer expected for %s."), name);
          else
            lex_error (lexer, _("Integer expected."));
        }
    }
  return false;
}
843 /* If the current token is a number, does nothing and returns true.
844 Otherwise, reports an error and returns false. */
846 lex_force_num (struct lexer *lexer)
848 if (lex_is_number (lexer))
851 lex_error (lexer, _("expecting number"));
855 /* If the current token is an identifier, does nothing and returns true.
856 Otherwise, reports an error and returns false. */
858 lex_force_id (struct lexer *lexer)
860 if (lex_token (lexer) == T_ID)
863 lex_error (lexer, _("expecting identifier"));
867 /* Token accessors. */
869 /* Returns the type of LEXER's current token. */
871 lex_token (const struct lexer *lexer)
873 return lex_next_token (lexer, 0);
876 /* Returns the number in LEXER's current token.
878 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
879 tokens this function will always return zero. */
881 lex_tokval (const struct lexer *lexer)
883 return lex_next_tokval (lexer, 0);
886 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
888 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
889 this functions this function will always return NULL.
891 The UTF-8 encoding of the returned string is correct for variable names and
892 other identifiers. Use filename_to_utf8() to use it as a filename. Use
893 data_in() to use it in a "union value". */
895 lex_tokcstr (const struct lexer *lexer)
897 return lex_next_tokcstr (lexer, 0);
900 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
901 null-terminated (but the null terminator is not included in the returned
902 substring's 'length').
904 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
905 this functions this function will always return NULL.
907 The UTF-8 encoding of the returned string is correct for variable names and
908 other identifiers. Use filename_to_utf8() to use it as a filename. Use
909 data_in() to use it in a "union value". */
911 lex_tokss (const struct lexer *lexer)
913 return lex_next_tokss (lexer, 0);
918 A value of 0 for N as an argument to any of these functions refers to the
919 current token. Lookahead is limited to the current command. Any N greater
920 than the number of tokens remaining in the current command will be treated
921 as referring to a T_ENDCMD token. */
923 static const struct lex_token *
924 lex_next__ (const struct lexer *lexer_, int n)
926 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
927 struct lex_source *src = lex_source__ (lexer);
930 return lex_source_next__ (src, n);
933 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
938 /* Returns the token in SRC with the greatest lookahead. */
939 static const struct lex_token *
940 lex_source_middle (const struct lex_source *src)
942 assert (src->middle - src->back > 0);
943 return &src->tokens[(src->middle - 1) & src->mask];
946 static const struct lex_token *
947 lex_source_next__ (const struct lex_source *src, int n)
949 while (src->middle - src->back <= n)
951 if (src->middle - src->back > 0)
953 const struct lex_token *middle = lex_source_middle (src);
954 if (middle->token.type == T_STOP || middle->token.type == T_ENDCMD)
958 lex_source_get (src);
961 return &src->tokens[(src->back + n) & src->mask];
964 /* Returns the "struct token" of the token N after the current one in LEXER.
965 The returned pointer can be invalidated by pretty much any succeeding call
966 into the lexer, although the string pointer within the returned token is
967 only invalidated by consuming the token (e.g. with lex_get()). */
969 lex_next (const struct lexer *lexer, int n)
971 return &lex_next__ (lexer, n)->token;
974 /* Returns the type of the token N after the current one in LEXER. */
976 lex_next_token (const struct lexer *lexer, int n)
978 return lex_next (lexer, n)->type;
981 /* Returns the number in the tokn N after the current one in LEXER.
983 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
984 tokens this function will always return zero. */
986 lex_next_tokval (const struct lexer *lexer, int n)
988 return token_number (lex_next (lexer, n));
991 /* Returns the null-terminated string in the token N after the current one, in
994 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
995 this functions this function will always return NULL.
997 The UTF-8 encoding of the returned string is correct for variable names and
998 other identifiers. Use filename_to_utf8() to use it as a filename. Use
999 data_in() to use it in a "union value". */
1001 lex_next_tokcstr (const struct lexer *lexer, int n)
1003 return lex_next_tokss (lexer, n).string;
1006 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1007 The string is null-terminated (but the null terminator is not included in
1008 the returned substring's 'length').
1010 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1011 tokens this functions this function will always return NULL.
1013 The UTF-8 encoding of the returned string is correct for variable names and
1014 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1015 data_in() to use it in a "union value". */
1017 lex_next_tokss (const struct lexer *lexer, int n)
1019 return lex_next (lexer, n)->string;
1022 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1023 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1024 are both zero, this requests the syntax for the current token.) The caller
1025 must eventually free the returned string (with free()). The syntax is
1026 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1027 example, it may include comments, spaces, and new-lines if it spans multiple
1028 tokens. Macro expansion, however, has already been performed. */
1030 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1032 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1035 /* Returns true if the token N ahead of the current one was produced by macro
1036 expansion, false otherwise. */
1038 lex_next_is_from_macro (const struct lexer *lexer, int n)
1040 return lex_next__ (lexer, n)->macro_rep != NULL;
1044 lex_tokens_match (const struct token *actual, const struct token *expected)
1046 if (actual->type != expected->type)
1049 switch (actual->type)
1053 return actual->number == expected->number;
1056 return lex_id_match (expected->string, actual->string);
1059 return (actual->string.length == expected->string.length
1060 && !memcmp (actual->string.string, expected->string.string,
1061 actual->string.length));
1068 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1069 skips it and returns true. Otherwise, returns false.
1071 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1072 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1073 first three letters. */
1075 lex_match_phrase (struct lexer *lexer, const char *s)
1077 struct string_lexer slex;
1082 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1083 while (string_lexer_next (&slex, &token))
1084 if (token.type != SCAN_SKIP)
1086 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1087 token_uninit (&token);
1098 lex_source_get_first_line_number (const struct lex_source *src, int n)
1100 return lex_source_next__ (src, n)->first_line;
/* Returns the number of '\n' bytes within the LENGTH bytes starting at S. */
static int
count_newlines (char *s, size_t length)
{
  int n = 0;

  for (;;)
    {
      char *p = memchr (s, '\n', length);
      if (p == NULL)
        return n;

      n++;
      length -= (p - s) + 1;
      s = p + 1;
    }
}
1120 lex_source_get_last_line_number (const struct lex_source *src, int n)
1122 const struct lex_token *token = lex_source_next__ (src, n);
1124 if (token->first_line == 0)
1128 char *token_str = &src->buffer[token->token_pos - src->tail];
1129 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1134 count_columns (const char *s_, size_t length)
1136 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1142 for (ofs = 0; ofs < length; ofs += mblen)
1146 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1149 int width = uc_width (uc, "UTF-8");
1154 columns = ROUND_UP (columns + 1, 8);
1161 lex_source_get_first_column (const struct lex_source *src, int n)
1163 const struct lex_token *token = lex_source_next__ (src, n);
1164 return count_columns (&src->buffer[token->line_pos - src->tail],
1165 token->token_pos - token->line_pos);
1169 lex_source_get_last_column (const struct lex_source *src, int n)
1171 const struct lex_token *token = lex_source_next__ (src, n);
1172 char *start, *end, *newline;
1174 start = &src->buffer[token->line_pos - src->tail];
1175 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1176 newline = memrchr (start, '\n', end - start);
1177 if (newline != NULL)
1178 start = newline + 1;
1179 return count_columns (start, end - start);
/* Returns the 1-based line number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP token
   or if the token is drawn from a source that does not have line numbers. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
}

/* Returns the 1-based line number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if the token is drawn from a source that does not have line
   numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the
   next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
}

/* Returns the 1-based column number of the start of the syntax that
   represents the token N after the current one in LEXER.  Returns 0 for a
   T_STOP token.

   Column numbers are measured according to the width of characters as shown
   in a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_first_column (src, n) : 0;
}

/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token.

   Column numbers are measured according to the width of characters as shown
   in a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_last_column (src, n) : 0;
}
1237 /* Returns the name of the syntax file from which the current command is drawn.
1238 Returns NULL for a T_STOP token or if the command's source does not have
1241 There is no version of this function that takes an N argument because
1242 lookahead only works to the end of a command and any given command is always
1243 within a single syntax file. */
1245 lex_get_file_name (const struct lexer *lexer)
1247 struct lex_source *src = lex_source__ (lexer);
1248 return src == NULL ? NULL : src->reader->file_name;
1251 /* Returns a newly allocated msg_location for the syntax that represents tokens
1252 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1253 must eventually free the location (with msg_location_destroy()). */
1254 struct msg_location *
1255 lex_get_location (const struct lexer *lexer, int n0, int n1)
1257 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1258 loc->first_column = lex_get_first_column (lexer, n0);
1259 loc->last_column = lex_get_last_column (lexer, n1);
1263 /* Returns a newly allocated msg_location for the syntax that represents tokens
1264 with 0-based offsets N0...N1, inclusive, from the current token. The
1265 location only covers the tokens' lines, not the columns. The caller must
1266 eventually free the location (with msg_location_destroy()). */
1267 struct msg_location *
1268 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1270 struct msg_location *loc = xmalloc (sizeof *loc);
1271 *loc = (struct msg_location) {
1272 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1273 .first_line = lex_get_first_line_number (lexer, n0),
1274 .last_line = lex_get_last_line_number (lexer, n1),
1280 lex_get_encoding (const struct lexer *lexer)
1282 struct lex_source *src = lex_source__ (lexer);
1283 return src == NULL ? NULL : src->reader->encoding;
1286 /* Returns the syntax mode for the syntax file from which the current drawn is
1287 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1288 does not have line numbers.
1290 There is no version of this function that takes an N argument because
1291 lookahead only works to the end of a command and any given command is always
1292 within a single syntax file. */
1294 lex_get_syntax_mode (const struct lexer *lexer)
1296 struct lex_source *src = lex_source__ (lexer);
1297 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1300 /* Returns the error mode for the syntax file from which the current drawn is
1301 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1302 source does not have line numbers.
1304 There is no version of this function that takes an N argument because
1305 lookahead only works to the end of a command and any given command is always
1306 within a single syntax file. */
1308 lex_get_error_mode (const struct lexer *lexer)
1310 struct lex_source *src = lex_source__ (lexer);
1311 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1314 /* If the source that LEXER is currently reading has error mode
1315 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1316 token to be read comes directly from whatever is next read from the stream.
1318 It makes sense to call this function after encountering an error in a
1319 command entered on the console, because usually the user would prefer not to
1320 have cascading errors. */
1322 lex_interactive_reset (struct lexer *lexer)
1324 struct lex_source *src = lex_source__ (lexer);
1325 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1327 src->head = src->tail = 0;
1328 src->journal_pos = src->seg_pos = src->line_pos = 0;
1329 src->n_newlines = 0;
1330 src->suppress_next_newline = false;
1331 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1333 while (src->middle - src->back > 0)
1334 lex_source_pop_back (src);
1335 while (src->front - src->middle > 0)
1336 lex_source_pop_front (src);
1337 lex_source_push_endcmd__ (src);
1341 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1343 lex_discard_rest_of_command (struct lexer *lexer)
1345 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1349 /* Discards all lookahead tokens in LEXER, then discards all input sources
1350 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1351 runs out of input sources. */
1353 lex_discard_noninteractive (struct lexer *lexer)
1355 struct lex_source *src = lex_source__ (lexer);
1359 while (src->middle - src->back > 0)
1360 lex_source_pop_back (src);
1362 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1363 src = lex_source__ (lexer))
1364 lex_source_destroy (src);
1369 lex_source_max_tail__ (const struct lex_source *src)
1371 const struct lex_token *token;
1374 assert (src->seg_pos >= src->line_pos);
1375 max_tail = MIN (src->journal_pos, src->line_pos);
1377 /* Use the oldest token also. (We know that src->deque cannot be empty
1378 because we are in the process of adding a new token, which is already
1379 initialized enough to use here.) */
1380 token = &src->tokens[src->back & src->mask];
1381 assert (token->token_pos >= token->line_pos);
1382 max_tail = MIN (max_tail, token->line_pos);
1388 lex_source_expand__ (struct lex_source *src)
1390 if (src->head - src->tail >= src->allocated)
1392 size_t max_tail = lex_source_max_tail__ (src);
1393 if (max_tail > src->tail)
1395 /* Advance the tail, freeing up room at the head. */
1396 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1397 src->head - max_tail);
1398 src->tail = max_tail;
1402 /* Buffer is completely full. Expand it. */
1403 src->buffer = x2realloc (src->buffer, &src->allocated);
1408 /* There's space available at the head of the buffer. Nothing to do. */
1413 lex_source_read__ (struct lex_source *src)
1417 lex_source_expand__ (src);
1419 size_t head_ofs = src->head - src->tail;
1420 size_t space = src->allocated - head_ofs;
1421 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1422 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1424 assert (n <= space);
1429 src->reader->eof = true;
1430 lex_source_expand__ (src);
1436 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1437 src->head - src->seg_pos));
1440 static struct lex_source *
1441 lex_source__ (const struct lexer *lexer)
1443 return (ll_is_empty (&lexer->sources) ? NULL
1444 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1447 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1448 one, through N1 ahead of the current one, inclusive. (For example, if N0
1449 and N1 are both zero, this requests the syntax for the current token.) The
1450 caller must eventually free the returned string (with free()). The syntax
1451 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1452 for example, it may include comments, spaces, and new-lines if it spans
1453 multiple tokens. Macro expansion, however, has already been performed. */
1455 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1457 struct string s = DS_EMPTY_INITIALIZER;
1458 for (size_t i = n0; i <= n1; )
1460 /* Find [I,J) as the longest sequence of tokens not produced by macro
1461 expansion, or otherwise the longest sequence expanded from a single
1463 const struct lex_token *first = lex_source_next__ (src, i);
1465 for (j = i + 1; j <= n1; j++)
1467 const struct lex_token *cur = lex_source_next__ (src, j);
1468 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1469 || first->macro_rep != cur->macro_rep)
1472 const struct lex_token *last = lex_source_next__ (src, j - 1);
1474 /* Now add the syntax for this sequence of tokens to SRC. */
1475 if (!ds_is_empty (&s))
1476 ds_put_byte (&s, ' ');
1477 if (!first->macro_rep)
1479 size_t start = first->token_pos;
1480 size_t end = last->token_pos + last->token_len;
1481 ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1486 size_t start = first->ofs;
1487 size_t end = last->ofs + last->len;
1488 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1494 return ds_steal_cstr (&s);
1498 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1500 for (size_t i = n0; i <= n1; i++)
1501 if (lex_source_next__ (src, i)->macro_rep)
1506 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1507 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1508 other tokens included in that range. The syntax is encoded in UTF-8 and in
1509 the original form supplied to the lexer so that, for example, it may include
1510 comments, spaces, and new-lines if it spans multiple tokens.
1512 Returns an empty string if the token range doesn't include a macro call.
1514 The caller must not modify or free the returned string. */
1515 static struct substring
1516 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1518 if (!lex_source_contains_macro_call (src, n0, n1))
1521 const struct lex_token *token0 = lex_source_next__ (src, n0);
1522 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1523 size_t start = token0->token_pos;
1524 size_t end = token1->token_pos + token1->token_len;
1526 return ss_buffer (&src->buffer[start - src->tail], end - start);
1530 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1531 const char *format, va_list args)
1533 const struct lex_token *token;
1538 token = lex_source_next__ (src, n0);
1539 if (token->token.type == T_ENDCMD)
1540 ds_put_cstr (&s, _("Syntax error at end of command"));
1543 /* Get the syntax that caused the error. */
1544 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1546 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1549 /* Get the macro call(s) that expanded to the syntax that caused the
1552 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1559 _("Syntax error at `%s' (in expansion of `%s')"),
1562 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1567 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1570 ds_put_cstr (&s, _("Syntax error"));
1576 ds_put_cstr (&s, ": ");
1577 ds_put_vformat (&s, format, args);
1579 if (ds_last (&s) != '.')
1580 ds_put_byte (&s, '.');
1582 struct msg_location *location = xmalloc (sizeof *location);
1583 *location = (struct msg_location) {
1584 .file_name = xstrdup_if_nonnull (src->reader->file_name),
1585 .first_line = lex_source_get_first_line_number (src, n0),
1586 .last_line = lex_source_get_last_line_number (src, n1),
1587 .first_column = lex_source_get_first_column (src, n0),
1588 .last_column = lex_source_get_last_column (src, n1),
1590 struct msg *m = xmalloc (sizeof *m);
1592 .category = MSG_C_SYNTAX,
1593 .severity = MSG_S_ERROR,
1594 .location = location,
1595 .text = ds_steal_cstr (&s),
1600 static void PRINTF_FORMAT (4, 5)
1601 lex_source_error (struct lex_source *src, int n0, int n1,
1602 const char *format, ...)
1605 va_start (args, format);
1606 lex_source_error_valist (src, n0, n1, format, args);
1611 lex_get_error (struct lex_source *src, const char *s)
1613 size_t old_middle = src->middle;
1614 src->middle = src->front;
1615 size_t n = src->front - src->back - 1;
1616 lex_source_error (src, n, n, "%s", s);
1617 src->middle = old_middle;
1619 lex_source_pop_front (src);
1622 /* Attempts to append an additional token at the front of SRC, reading more
1623 from the underlying lex_reader if necessary. Returns true if a new token
1624 was added to SRC's deque, false otherwise. The caller should retry failures
1625 unless SRC's 'eof' marker was set to true indicating that there will be no
1626 more tokens from this source.
1628 Does not make the new token available for lookahead yet; the caller must
1629 adjust SRC's 'middle' pointer to do so. */
1631 lex_source_try_get__ (struct lex_source *src)
1633 /* State maintained while scanning tokens. Usually we only need a single
1634 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1635 needs to be saved and possibly restored later with SCAN_BACK. */
1638 struct segmenter segmenter;
1639 enum segment_type last_segment;
1640 int newlines; /* Number of newlines encountered so far. */
1641 /* Maintained here so we can update lex_source's similar members when we
1647 /* Initialize state. */
1648 struct state state =
1650 .segmenter = src->segmenter,
1652 .seg_pos = src->seg_pos,
1653 .line_pos = src->line_pos,
1655 struct state saved = state;
1657 /* Append a new token to SRC and initialize it. */
1658 struct lex_token *token = lex_push_token__ (src);
1659 struct scanner scanner;
1660 scanner_init (&scanner, &token->token);
/* The token starts where segmentation left off; 'first_line' is meaningful
   only when the reader tracks line numbers (line_number > 0). */
1661 token->line_pos = src->line_pos;
1662 token->token_pos = src->seg_pos;
1663 if (src->reader->line_number > 0)
1664 token->first_line = src->reader->line_number + src->n_newlines;
1666 token->first_line = 0;
1668 /* Extract segments and pass them through the scanner until we obtain a
1672 /* Extract a segment. */
1673 const char *segment = &src->buffer[state.seg_pos - src->tail];
1674 size_t seg_maxlen = src->head - state.seg_pos;
1675 enum segment_type type;
1676 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1677 src->reader->eof, &type);
1680 /* The segmenter needs more input to produce a segment. */
1681 assert (!src->reader->eof);
1682 lex_source_read__ (src);
1686 /* Update state based on the segment. */
1687 state.last_segment = type;
1688 state.seg_pos += seg_len;
1689 if (type == SEG_NEWLINE)
1692 state.line_pos = state.seg_pos;
1695 /* Pass the segment into the scanner and try to get a token out. */
1696 enum scan_result result = scanner_push (&scanner, type,
1697 ss_buffer (segment, seg_len),
1699 if (result == SCAN_SAVE)
1701 else if (result == SCAN_BACK)
1706 else if (result == SCAN_DONE)
1710 /* If we've reached the end of a line, or the end of a command, then pass
1711 the line to the output engine as a syntax text item. */
1712 int n_lines = state.newlines;
1713 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1716 src->suppress_next_newline = true;
1718 else if (n_lines > 0 && src->suppress_next_newline)
1721 src->suppress_next_newline = false;
1723 for (int i = 0; i < n_lines; i++)
1725 /* Beginning of line. */
1726 const char *line = &src->buffer[src->journal_pos - src->tail];
1728 /* Calculate line length, including \n or \r\n end-of-line if present.
1730 We use src->head even though that may be beyond what we've actually
1731 converted to tokens (which is only through state.line_pos). That's
1732 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1733 take the whole line through the newline, not just through the '.'. */
1734 size_t max_len = src->head - src->journal_pos;
1735 const char *newline = memchr (line, '\n', max_len);
1736 size_t line_len = newline ? newline - line + 1 : max_len;
1738 /* Calculate line length excluding end-of-line. */
1739 size_t copy_len = line_len;
1740 if (copy_len > 0 && line[copy_len - 1] == '\n')
1742 if (copy_len > 0 && line[copy_len - 1] == '\r')
1745 /* Submit the line as syntax. */
1746 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1747 xmemdup0 (line, copy_len),
1750 src->journal_pos += line_len;
/* Commit the token's length and the updated scanning state back into SRC. */
1753 token->token_len = state.seg_pos - src->seg_pos;
1755 src->segmenter = state.segmenter;
1756 src->seg_pos = state.seg_pos;
1757 src->line_pos = state.line_pos;
1758 src->n_newlines += state.newlines;
/* NOTE(review): T_STOP appears to be rewritten to T_ENDCMD below so that
   every source ends in a command terminator -- confirm against the full
   switch, parts of which are elided in this extract. */
1760 switch (token->token.type)
1766 token->token.type = T_ENDCMD;
/* Tokenization errors: convert the bad token into an error message, report
   it via lex_get_error(), and drop the token from the queue. */
1770 case SCAN_BAD_HEX_LENGTH:
1771 case SCAN_BAD_HEX_DIGIT:
1772 case SCAN_BAD_UNICODE_DIGIT:
1773 case SCAN_BAD_UNICODE_LENGTH:
1774 case SCAN_BAD_UNICODE_CODE_POINT:
1775 case SCAN_EXPECTED_QUOTE:
1776 case SCAN_EXPECTED_EXPONENT:
1777 case SCAN_UNEXPECTED_CHAR:
1778 char *msg = scan_token_to_error (&token->token);
1779 lex_get_error (src, msg);
1784 lex_source_pop_front (src);
1791 /* Attempts to add a new token at the front of SRC. Returns true if
1792 successful, false on failure. On failure, the end of SRC has been reached
1793 and no more tokens will be forthcoming from it.
1795 Does not make the new token available for lookahead yet; the caller must
1796 adjust SRC's 'middle' pointer to do so. */
1798 lex_source_get__ (struct lex_source *src)
1801 if (lex_source_try_get__ (src))
1806 /* Attempts to obtain a new token for SRC, in particular expanding the number
1807 of lookahead tokens (the tokens between 'back' and 'middle').
1809 Returns true if successful, false on failure. In the latter case, SRC is
1810 exhausted and 'src->eof' is now true. */
1812 lex_source_get (const struct lex_source *src_)
1814 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1816 /* In the common case, call into the scanner and segmenter to obtain a new
1817 token between 'middle' and 'front'. In the uncommon case, there can be one
1818 or a few tokens there already, leftovers from a macro expansion.
1820 If we call into the scanner and it fails, then we've hit EOF and we're
1822 if (src->front - src->middle == 0 && !lex_source_get__ (src))
1825 /* We have at least one token available between 'middle' and 'front'.
1827 The remaining complication is all about macro expansion. If macro
1828 expansion is disabled, we're done. */
1829 if (!settings_get_mexpand ())
1835 /* Now pass tokens one-by-one to the macro expander.
1837 In the common case where there is no macro to expand, the loop is not
1839 struct macro_expander *me;
1840 int n_call = macro_expander_create (
1841 src->lexer->macros, &src->tokens[src->middle & src->mask].token,
/* n_call == 0 means "feed me more tokens"; the loop below supplies them one
   at a time until the expander decides whether this is a macro call. */
1843 for (int middle_ofs = 1; !n_call; middle_ofs++)
1845 if (src->front - src->middle <= middle_ofs && !lex_source_get__ (src))
1847 /* This should not be reachable because we always get a T_ENDCMD at
1848 the end of an input file (transformed from T_STOP by
1849 lex_source_try_get__()) and the macro_expander should always
1850 terminate expansion on T_ENDCMD. */
1854 const struct lex_token *t = &src->tokens[(src->middle + middle_ofs)
1856 size_t start = t->token_pos;
1857 size_t end = t->token_pos + t->token_len;
1858 const struct macro_token mt = {
1860 .representation = ss_buffer (&src->buffer[start - src->tail],
1864 /* We temporarily add the tokens to the source to avoid re-entry if
1865 macro_expander_add() reports an error and to give better error
1867 src->middle += middle_ofs + 1;
1868 n_call = macro_expander_add (me, &mt);
1869 src->middle -= middle_ofs + 1;
1873 /* False alarm: no macro expansion after all. Use first token as
1874 lookahead. We'll retry macro expansion from the second token next
1876 macro_expander_destroy (me);
1881 /* Now expand the macro.
1883 We temporarily add the macro call's tokens to the source in case the macro
1884 expansion calls msg() to report an error and error processing tries to get
1885 the location of the error with, e.g. lex_get_first_line_number(), which
1886 would re-enter this code. This is a kluge; it might be cleaner to pass
1887 the line number into macro_expander_get_expansion(). */
1888 src->middle += n_call;
1889 struct macro_tokens expansion = { .n = 0 };
1890 macro_expander_get_expansion (me, src->reader->syntax, &expansion);
1891 macro_expander_destroy (me);
1892 src->middle -= n_call;
1894 /* Convert the macro expansion into syntax for possible error messages later. */
1895 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1896 size_t *len = xnmalloc (expansion.n, sizeof *len);
1897 struct string s = DS_EMPTY_INITIALIZER;
1898 macro_tokens_to_representation (&expansion, &s, ofs, len);
1900 if (settings_get_mprint ())
1901 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1902 _("Macro Expansion")));
1904 /* The first 'n_call' tokens starting at 'middle' will be replaced by the
1905 macro expansion. There might be more tokens after that, up to 'front'.
1907 Figure out the boundary of the macro call in the syntax, to go into the
1908 lex_tokens for the expansion so that later error messages can report what
1909 macro was called. */
1910 const struct lex_token *call_first = &src->tokens[src->middle & src->mask];
1911 const struct lex_token *call_last
1912 = &src->tokens[(src->middle + n_call - 1) & src->mask];
1913 size_t call_pos = call_first->token_pos;
1914 size_t call_len = (call_last->token_pos + call_last->token_len) - call_pos;
1915 size_t line_pos = call_first->line_pos;
1916 int first_line = call_first->first_line;
1918 /* Destroy the tokens for the call, and save any tokens following the call so
1919 we can add them back later. */
1920 for (size_t i = src->middle; i != src->middle + n_call; i++)
1921 lex_token_uninit (&src->tokens[i & src->mask]);
1922 size_t n_save = src->front - (src->middle + n_call);
1923 struct lex_token *save_tokens = xnmalloc (n_save, sizeof *save_tokens);
1924 for (size_t i = 0; i < n_save; i++)
1925 save_tokens[i] = src->tokens[(src->middle + n_call + i) & src->mask];
1926 src->front = src->middle;
1928 /* Append the macro expansion tokens to the lookahead. */
1929 char *macro_rep = ds_steal_cstr (&s);
/* All expansion tokens share the single 'macro_rep' string; 'ref_cnt' lets
   the last token freed release it.  NOTE(review): confirm the matching
   release logic in lex_token_uninit(), which is not visible here. */
1930 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1931 *ref_cnt = expansion.n;
1932 for (size_t i = 0; i < expansion.n; i++)
1934 *lex_push_token__ (src) = (struct lex_token) {
1935 .token = expansion.mts[i].token,
1936 .token_pos = call_pos,
1937 .token_len = call_len,
1938 .line_pos = line_pos,
1939 .first_line = first_line,
1940 .macro_rep = macro_rep,
1947 ss_dealloc (&expansion.mts[i].representation);
1949 free (expansion.mts);
1953 /* Finally, put the saved tokens back. */
1954 for (size_t i = 0; i < n_save; i++)
1955 *lex_push_token__ (src) = save_tokens[i];
1962 lex_source_push_endcmd__ (struct lex_source *src)
1964 assert (src->back == src->middle && src->middle == src->front);
1965 *lex_push_token__ (src) = (struct lex_token) {
1966 .token = { .type = T_ENDCMD } };
1970 static struct lex_source *
1971 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1973 struct lex_source *src = xmalloc (sizeof *src);
1974 *src = (struct lex_source) {
1976 .segmenter = segmenter_init (reader->syntax, false),
1980 lex_source_push_endcmd__ (src);
1986 lex_source_destroy (struct lex_source *src)
1988 char *file_name = src->reader->file_name;
1989 char *encoding = src->reader->encoding;
1990 if (src->reader->class->destroy != NULL)
1991 src->reader->class->destroy (src->reader);
1995 while (src->middle - src->back > 0)
1996 lex_source_pop_back (src);
1997 while (src->front - src->middle > 0)
1998 lex_source_pop_front (src);
2000 ll_remove (&src->ll);
2004 struct lex_file_reader
2006 struct lex_reader reader;
2007 struct u8_istream *istream;
2010 static struct lex_reader_class lex_file_reader_class;
2012 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2013 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2014 ENCODING, which should take one of the forms accepted by
2015 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2016 mode of the new reader, respectively.
2018 Returns a null pointer if FILE_NAME cannot be opened. */
2020 lex_reader_for_file (const char *file_name, const char *encoding,
2021 enum segmenter_mode syntax,
2022 enum lex_error_mode error)
2024 struct lex_file_reader *r;
2025 struct u8_istream *istream;
2027 istream = (!strcmp(file_name, "-")
2028 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2029 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2030 if (istream == NULL)
2032 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2036 r = xmalloc (sizeof *r);
2037 lex_reader_init (&r->reader, &lex_file_reader_class);
2038 r->reader.syntax = syntax;
2039 r->reader.error = error;
2040 r->reader.file_name = xstrdup (file_name);
2041 r->reader.encoding = xstrdup_if_nonnull (encoding);
2042 r->reader.line_number = 1;
2043 r->istream = istream;
2048 static struct lex_file_reader *
2049 lex_file_reader_cast (struct lex_reader *r)
2051 return UP_CAST (r, struct lex_file_reader, reader);
2055 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2056 enum prompt_style prompt_style UNUSED)
2058 struct lex_file_reader *r = lex_file_reader_cast (r_);
2059 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2062 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2069 lex_file_close (struct lex_reader *r_)
2071 struct lex_file_reader *r = lex_file_reader_cast (r_);
2073 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2075 if (u8_istream_close (r->istream) != 0)
2076 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2079 u8_istream_free (r->istream);
2084 static struct lex_reader_class lex_file_reader_class =
2090 struct lex_string_reader
2092 struct lex_reader reader;
2097 static struct lex_reader_class lex_string_reader_class;
2099 /* Creates and returns a new lex_reader for the contents of S, which must be
2100 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2101 with ss_dealloc() when it is closed. */
2103 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2105 struct lex_string_reader *r;
2107 r = xmalloc (sizeof *r);
2108 lex_reader_init (&r->reader, &lex_string_reader_class);
2109 r->reader.syntax = SEG_MODE_AUTO;
2110 r->reader.encoding = xstrdup_if_nonnull (encoding);
2117 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2118 which must be encoded in ENCODING. The caller retains ownership of S. */
2120 lex_reader_for_string (const char *s, const char *encoding)
2122 struct substring ss;
2123 ss_alloc_substring (&ss, ss_cstr (s));
2124 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string and creates and returns a
   new lex_reader for the formatted result. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, format);
  /* xvasprintf() allocates; the nocopy reader takes ownership of it. */
  struct lex_reader *r
    = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)),
                                       encoding);
  va_end (args);

  return r;
}
2142 static struct lex_string_reader *
2143 lex_string_reader_cast (struct lex_reader *r)
2145 return UP_CAST (r, struct lex_string_reader, reader);
2149 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2150 enum prompt_style prompt_style UNUSED)
2152 struct lex_string_reader *r = lex_string_reader_cast (r_);
2155 chunk = MIN (n, r->s.length - r->offset);
2156 memcpy (buf, r->s.string + r->offset, chunk);
2163 lex_string_close (struct lex_reader *r_)
2165 struct lex_string_reader *r = lex_string_reader_cast (r_);
2171 static struct lex_reader_class lex_string_reader_class =