1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
72 int first_line; /* Line number at token_pos. */
74 /* For a token obtained through macro expansion, this is just this token.
76 For a token obtained through the lexer in an ordinary way, these are
78 char *macro_rep; /* The whole macro expansion. */
79 size_t ofs; /* Offset of this token in macro_rep. */
80 size_t len; /* Length of this token in macro_rep. */
81 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
/* Destroys lex_token T. The steps visible here uninitialize the embedded
   token and check that the count of lex_tokens sharing macro_rep is still
   positive.
   NOTE(review): the statements that release macro_rep, ref_cnt, and T itself
   are not visible in this view -- confirm against the full function. */
85 lex_token_destroy (struct lex_token *t)
87 token_uninit (&t->token);
90 assert (*t->ref_cnt > 0);
100 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
105 struct lex_token **tokens;
108 static void lex_stage_clear (struct lex_stage *);
109 static void lex_stage_uninit (struct lex_stage *);
111 static size_t lex_stage_count (const struct lex_stage *);
112 static bool lex_stage_is_empty (const struct lex_stage *);
114 static struct lex_token *lex_stage_first (struct lex_stage *);
115 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
117 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
118 static void lex_stage_pop_first (struct lex_stage *);
120 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
123 /* Deletes all the tokens from STAGE. */
125 lex_stage_clear (struct lex_stage *stage)
127 while (!deque_is_empty (&stage->deque))
128 lex_stage_pop_first (stage);
131 /* Deletes all the tokens from STAGE and frees storage for the deque. */
133 lex_stage_uninit (struct lex_stage *stage)
135 lex_stage_clear (stage);
136 free (stage->tokens);
139 /* Returns true if STAGE contains no tokens, otherwise false. */
141 lex_stage_is_empty (const struct lex_stage *stage)
143 return deque_is_empty (&stage->deque);
146 /* Returns the number of tokens in STAGE. */
148 lex_stage_count (const struct lex_stage *stage)
150 return deque_count (&stage->deque);
153 /* Returns the first token in STAGE, which must be nonempty.
154 The first token is the one accessed with the least lookahead. */
155 static struct lex_token *
156 lex_stage_first (struct lex_stage *stage)
158 return lex_stage_nth (stage, 0);
161 /* Returns the token at the given INDEX in STAGE. The first token (with the
162 least lookahead) is 0, the second token is 1, and so on. There must be at
163 least INDEX + 1 tokens in STAGE. */
164 static struct lex_token *
165 lex_stage_nth (struct lex_stage *stage, size_t index)
167 return stage->tokens[deque_back (&stage->deque, index)];
170 /* Adds TOKEN so that it becomes the last token in STAGE. */
172 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
174 if (deque_is_full (&stage->deque))
175 stage->tokens = deque_expand (&stage->deque, stage->tokens,
176 sizeof *stage->tokens);
177 stage->tokens[deque_push_front (&stage->deque)] = token;
180 /* Removes and returns the first token from STAGE. */
181 static struct lex_token *
182 lex_stage_take_first (struct lex_stage *stage)
184 return stage->tokens[deque_pop_back (&stage->deque)];
187 /* Removes the first token from STAGE and uninitializes it. */
189 lex_stage_pop_first (struct lex_stage *stage)
191 lex_token_destroy (lex_stage_take_first (stage));
194 /* Removes the first N tokens from SRC, appending them to DST as the last
197 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
199 for (size_t i = 0; i < n; i++)
200 lex_stage_push_last (dst, lex_stage_take_first (src));
203 /* A source of tokens, corresponding to a syntax file.
205 This is conceptually a lex_reader wrapped with everything needed to convert
206 its UTF-8 bytes into tokens. */
209 struct ll ll; /* In lexer's list of sources. */
210 struct lex_reader *reader;
212 struct segmenter segmenter;
213 bool eof; /* True if T_STOP was read from 'reader'. */
215 /* Buffer of UTF-8 bytes. */
216 char *buffer; /* Source file contents. */
217 size_t length; /* Number of bytes filled. */
218 size_t allocated; /* Number of bytes allocated. */
220 /* Offsets into 'buffer'. */
221 size_t journal_pos; /* First byte not yet output to journal. */
222 size_t seg_pos; /* First byte not yet scanned as token. */
224 int n_newlines; /* Number of new-lines up to seg_pos. */
225 bool suppress_next_newline;
229 This is a pipeline with the following stages. Each token eventually
230 made available to the parser passes through each of these stages. The stages
231 are named after the processing that happens in each one.
233 Initially, tokens come from the segmenter and scanner to 'pp':
235 - pp: Tokens that need to pass through the macro preprocessor to end up
238 - merge: Tokens that need to pass through scan_merge() to end up in
241 - parse: Tokens available to the client for parsing.
243 'pp' and 'merge' store tokens only temporarily until they pass into
244 'parse'. Tokens then live in 'parse' until the command is fully
245 consumed, at which time they are freed together. */
247 struct lex_stage merge;
248 struct lex_token **parse;
249 size_t n_parse, allocated_parse, parse_ofs;
252 static struct lex_source *lex_source_create (struct lexer *,
253 struct lex_reader *);
254 static void lex_source_destroy (struct lex_source *);
259 struct ll_list sources; /* Contains "struct lex_source"s. */
260 struct macro_set *macros;
263 static struct lex_source *lex_source__ (const struct lexer *);
264 static char *lex_source_get_syntax__ (const struct lex_source *,
266 static const struct lex_token *lex_next__ (const struct lexer *, int n);
267 static void lex_source_push_endcmd__ (struct lex_source *);
268 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
269 static void lex_source_clear_parse (struct lex_source *);
271 static bool lex_source_get_parse (struct lex_source *);
272 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
273 const char *format, va_list)
274 PRINTF_FORMAT (4, 0);
275 static const struct lex_token *lex_source_next__ (const struct lex_source *,
278 /* Initializes READER with the specified CLASS and otherwise some reasonable
279 defaults. The caller should fill in the other members as desired. */
281 lex_reader_init (struct lex_reader *reader,
282 const struct lex_reader_class *class)
284 reader->class = class;
/* Defaults: automatic syntax mode, continue after errors, no file name or
   encoding, and line number 0. */
285 reader->syntax = SEG_MODE_AUTO;
286 reader->error = LEX_ERROR_CONTINUE;
287 reader->file_name = NULL;
288 reader->encoding = NULL;
289 reader->line_number = 0;
293 /* Frees any file name already in READER and replaces it by a copy of
294 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
296 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
298 free (reader->file_name);
299 reader->file_name = xstrdup_if_nonnull (file_name);
302 /* Creates and returns a new lexer. */
306 struct lexer *lexer = xmalloc (sizeof *lexer);
307 *lexer = (struct lexer) {
308 .sources = LL_INITIALIZER (lexer->sources),
309 .macros = macro_set_create (),
314 /* Destroys LEXER. */
316 lex_destroy (struct lexer *lexer)
320 struct lex_source *source, *next;
322 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
323 lex_source_destroy (source);
324 macro_set_destroy (lexer->macros);
329 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
330 same name. Takes ownership of M. */
332 lex_define_macro (struct lexer *lexer, struct macro *m)
334 macro_set_add (lexer->macros, m);
337 /* Inserts READER into LEXER so that the next token read by LEXER comes from
338 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
341 lex_include (struct lexer *lexer, struct lex_reader *reader)
343 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
344 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
347 /* Appends READER to LEXER, so that it will be read after all other current
348 readers have already been read. */
350 lex_append (struct lexer *lexer, struct lex_reader *reader)
352 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
357 /* Advances LEXER to the next token, consuming the current token. */
359 lex_get (struct lexer *lexer)
361 struct lex_source *src;
363 src = lex_source__ (lexer);
367 if (src->parse_ofs < src->n_parse)
369 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
370 lex_source_clear_parse (src);
375 while (src->parse_ofs == src->n_parse)
376 if (!lex_source_get_parse (src))
378 lex_source_destroy (src);
379 src = lex_source__ (lexer);
385 /* Advances LEXER by N tokens. */
387 lex_get_n (struct lexer *lexer, size_t n)
393 /* Issuing errors. */
395 /* Prints a syntax error message containing the current token and the message
396 formatted from FORMAT and the following arguments (if FORMAT is non-null). */
398 lex_error (struct lexer *lexer, const char *format, ...)
402 va_start (args, format);
/* N0 = N1 = 0 selects the current token. */
403 lex_next_error_valist (lexer, 0, 0, format, args);
407 /* Prints a syntax error message containing the current token and the message
408 formatted from FORMAT and ARGS (if FORMAT is non-null). */
410 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
412 lex_next_error_valist (lexer, 0, 0, format, args);
415 /* Prints a syntax error message about the tokens N0 through N1 ahead of the
416 current one (0 is the current token), with the message formatted from FORMAT and the following arguments (if FORMAT is non-null). */
418 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
422 va_start (args, format);
423 lex_next_error_valist (lexer, n0, n1, format, args);
427 /* Prints a syntax error message saying that one of the strings provided as
428 varargs, up to the first NULL, is expected. */
430 (lex_error_expecting) (struct lexer *lexer, ...)
434 va_start (args, lexer);
435 lex_error_expecting_valist (lexer, args);
439 /* Prints a syntax error message saying that one of the options provided in
440 ARGS, up to the first NULL, is expected. */
442 lex_error_expecting_valist (struct lexer *lexer, va_list args)
444 enum { MAX_OPTIONS = 9 };
445 const char *options[MAX_OPTIONS];
447 while (n < MAX_OPTIONS)
449 const char *option = va_arg (args, const char *);
453 options[n++] = option;
455 lex_error_expecting_array (lexer, options, n);
/* Reports a syntax error saying that one of the N strings in OPTIONS is
   expected. The visible calls format from one to eight options into a single
   translatable sentence; two further calls pass a null format to lex_error().
   NOTE(review): the lines that select among these calls are not visible in
   this view; presumably the null-format calls cover N == 0 and N greater than
   8 -- confirm against the full function. */
459 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
464 lex_error (lexer, NULL);
468 lex_error (lexer, _("expecting %s"), options[0]);
472 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
476 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
481 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
482 options[0], options[1], options[2], options[3]);
486 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
487 options[0], options[1], options[2], options[3], options[4]);
491 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
492 options[0], options[1], options[2], options[3], options[4],
497 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
498 options[0], options[1], options[2], options[3], options[4],
499 options[5], options[6]);
503 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
504 options[0], options[1], options[2], options[3], options[4],
505 options[5], options[6], options[7]);
509 lex_error (lexer, NULL);
513 /* Reports an error to the effect that subcommand SBC may only be specified
516 This function does not take a lexer as an argument or use lex_error(),
517 because the result would ordinarily just be redundant: "Syntax error at
518 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
519 not help the user find the error. */
521 lex_sbc_only_once (const char *sbc)
523 msg (SE, _("Subcommand %s may only be specified once."), sbc);
526 /* Reports an error to the effect that subcommand SBC is missing.
528 This function does not take a lexer as an argument or use lex_error(),
529 because a missing subcommand can normally be detected only after the whole
530 command has been parsed, and so lex_error() would always report "Syntax
531 error at end of command", which does not help the user find the error. */
533 lex_sbc_missing (const char *sbc)
535 msg (SE, _("Required subcommand %s was not specified."), sbc);
538 /* Reports an error to the effect that specification SPEC may only be specified
539 once within subcommand SBC. */
541 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
543 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
547 /* Reports an error to the effect that specification SPEC is missing within
550 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
552 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
556 /* Prints a syntax error message about the tokens N0 through N1 ahead of the
557 current one, with the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
559 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
560 const char *format, va_list args)
562 struct lex_source *src = lex_source__ (lexer);
565 lex_source_error_valist (src, n0, n1, format, args);
/* NOTE(review): the test that chooses between the call above and the
   "end of input" fallback below is not visible in this view. */
571 ds_put_format (&s, _("Syntax error at end of input"));
574 ds_put_cstr (&s, ": ");
575 ds_put_vformat (&s, format, args);
/* Make sure the message ends with a period. */
577 if (ds_last (&s) != '.')
578 ds_put_byte (&s, '.');
579 msg (SE, "%s", ds_cstr (&s));
584 /* Checks that we're at end of command.
585 If so, returns a successful command completion code.
586 If not, flags a syntax error and returns an error command
589 lex_end_of_command (struct lexer *lexer)
591 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
593 lex_error (lexer, _("expecting end of command"));
600 /* Token testing functions. */
602 /* Returns true if the current token is a number. */
604 lex_is_number (const struct lexer *lexer)
606 return lex_next_is_number (lexer, 0);
609 /* Returns true if the current token is a string. */
611 lex_is_string (const struct lexer *lexer)
613 return lex_next_is_string (lexer, 0);
616 /* Returns the value of the current token, which must be a
617 floating point number. */
619 lex_number (const struct lexer *lexer)
621 return lex_next_number (lexer, 0);
624 /* Returns true iff the current token is an integer. */
626 lex_is_integer (const struct lexer *lexer)
628 return lex_next_is_integer (lexer, 0);
631 /* Returns the value of the current token, which must be an
634 lex_integer (const struct lexer *lexer)
636 return lex_next_integer (lexer, 0);
639 /* Token testing functions with lookahead.
641 A value of 0 for N as an argument to any of these functions refers to the
642 current token. Lookahead is limited to the current command. Any N greater
643 than the number of tokens remaining in the current command will be treated
644 as referring to a T_ENDCMD token. */
646 /* Returns true if the token N ahead of the current token is a number. */
648 lex_next_is_number (const struct lexer *lexer, int n)
650 return token_is_number (lex_next (lexer, n));
653 /* Returns true if the token N ahead of the current token is a string. */
655 lex_next_is_string (const struct lexer *lexer, int n)
657 return token_is_string (lex_next (lexer, n));
660 /* Returns the value of the token N ahead of the current token, which must be a
661 floating point number. */
663 lex_next_number (const struct lexer *lexer, int n)
665 return token_number (lex_next (lexer, n));
668 /* Returns true if the token N ahead of the current token is an integer. */
670 lex_next_is_integer (const struct lexer *lexer, int n)
672 return token_is_integer (lex_next (lexer, n));
675 /* Returns the value of the token N ahead of the current token, which must be
678 lex_next_integer (const struct lexer *lexer, int n)
680 return token_integer (lex_next (lexer, n));
683 /* Token matching functions. */
685 /* If the current token has the specified TYPE, skips it and returns true.
686 Otherwise, returns false. */
688 lex_match (struct lexer *lexer, enum token_type type)
690 if (lex_token (lexer) == type)
699 /* If the current token matches IDENTIFIER, skips it and returns true.
700 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
703 IDENTIFIER must be an ASCII string. */
705 lex_match_id (struct lexer *lexer, const char *identifier)
707 return lex_match_id_n (lexer, identifier, 3);
710 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
711 may be abbreviated to its first N letters. Otherwise, returns false.
713 IDENTIFIER must be an ASCII string. */
715 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
717 if (lex_token (lexer) == T_ID
718 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
727 /* If the current token is integer X, skips it and returns true. Otherwise,
730 lex_match_int (struct lexer *lexer, int x)
732 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
741 /* Forced matches. */
743 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
744 abbreviated to its first 3 letters. Otherwise, reports an error and returns
747 IDENTIFIER must be an ASCII string. */
749 lex_force_match_id (struct lexer *lexer, const char *identifier)
751 if (lex_match_id (lexer, identifier))
755 lex_error_expecting (lexer, identifier);
760 /* If the current token has the specified TYPE, skips it and returns true.
761 Otherwise, reports an error and returns false. */
763 lex_force_match (struct lexer *lexer, enum token_type type)
765 if (lex_token (lexer) == type)
772 const char *type_string = token_type_to_string (type);
775 char *s = xasprintf ("`%s'", type_string);
776 lex_error_expecting (lexer, s);
780 lex_error_expecting (lexer, token_type_to_name (type));
786 /* If the current token is a string, does nothing and returns true.
787 Otherwise, reports an error and returns false. */
789 lex_force_string (struct lexer *lexer)
791 if (lex_is_string (lexer))
795 lex_error (lexer, _("expecting string"));
800 /* If the current token is a string or an identifier, does nothing and returns
801 true. Otherwise, reports an error and returns false.
803 This is meant for use in syntactic situations where we want to encourage the
804 user to supply a quoted string, but for compatibility we also accept
805 identifiers. (One example of such a situation is file names.) Therefore,
806 the error message issued when the current token is wrong only says that a
807 string is expected and doesn't mention that an identifier would also be
810 lex_force_string_or_id (struct lexer *lexer)
812 return lex_token (lexer) == T_ID || lex_force_string (lexer);
815 /* If the current token is an integer, does nothing and returns true.
816 Otherwise, reports an error and returns false. */
818 lex_force_int (struct lexer *lexer)
820 if (lex_is_integer (lexer))
824 lex_error (lexer, _("expecting integer"));
829 /* If the current token is an integer in the range MIN...MAX (inclusive), does
830 nothing and returns true. Otherwise, reports an error and returns false.
831 If NAME is nonnull, then it is used in the error message. */
833 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
835 bool is_number = lex_is_number (lexer);
836 bool is_integer = lex_is_integer (lexer);
837 bool too_small = (is_integer ? lex_integer (lexer) < min
838 : is_number ? lex_number (lexer) < min
840 bool too_big = (is_integer ? lex_integer (lexer) > max
841 : is_number ? lex_number (lexer) > max
843 if (is_integer && !too_small && !too_big)
848 /* Weird, maybe a bug in the caller. Just report that we needed an
851 lex_error (lexer, _("Integer expected for %s."), name);
853 lex_error (lexer, _("Integer expected."));
858 lex_error (lexer, _("Expected %ld for %s."), min, name);
860 lex_error (lexer, _("Expected %ld."), min);
862 else if (min + 1 == max)
865 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
867 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
871 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
872 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
874 if (report_lower_bound && report_upper_bound)
878 _("Expected integer between %ld and %ld for %s."),
881 lex_error (lexer, _("Expected integer between %ld and %ld."),
884 else if (report_lower_bound)
889 lex_error (lexer, _("Expected non-negative integer for %s."),
892 lex_error (lexer, _("Expected non-negative integer."));
897 lex_error (lexer, _("Expected positive integer for %s."),
900 lex_error (lexer, _("Expected positive integer."));
905 lex_error (lexer, _("Expected integer %ld or greater for %s."),
908 lex_error (lexer, _("Expected integer %ld or greater."), min);
911 else if (report_upper_bound)
915 _("Expected integer less than or equal to %ld for %s."),
918 lex_error (lexer, _("Expected integer less than or equal to %ld."),
924 lex_error (lexer, _("Integer expected for %s."), name);
926 lex_error (lexer, _("Integer expected."));
932 /* If the current token is a number, does nothing and returns true.
933 Otherwise, reports an error and returns false. */
935 lex_force_num (struct lexer *lexer)
937 if (lex_is_number (lexer))
940 lex_error (lexer, _("expecting number"));
944 /* If the current token is an identifier, does nothing and returns true.
945 Otherwise, reports an error and returns false. */
947 lex_force_id (struct lexer *lexer)
949 if (lex_token (lexer) == T_ID)
952 lex_error (lexer, _("expecting identifier"));
956 /* Token accessors. */
958 /* Returns the type of LEXER's current token. */
960 lex_token (const struct lexer *lexer)
962 return lex_next_token (lexer, 0);
965 /* Returns the number in LEXER's current token.
967 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
968 tokens this function will always return zero. */
970 lex_tokval (const struct lexer *lexer)
972 return lex_next_tokval (lexer, 0);
975 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
977 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
978 this function will always return NULL.
980 The UTF-8 encoding of the returned string is correct for variable names and
981 other identifiers. Use filename_to_utf8() to use it as a filename. Use
982 data_in() to use it in a "union value". */
984 lex_tokcstr (const struct lexer *lexer)
986 return lex_next_tokcstr (lexer, 0);
989 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
990 null-terminated (but the null terminator is not included in the returned
991 substring's 'length').
993 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
994 this function will always return NULL.
996 The UTF-8 encoding of the returned string is correct for variable names and
997 other identifiers. Use filename_to_utf8() to use it as a filename. Use
998 data_in() to use it in a "union value". */
1000 lex_tokss (const struct lexer *lexer)
1002 return lex_next_tokss (lexer, 0);
1007 A value of 0 for N as an argument to any of these functions refers to the
1008 current token. Lookahead is limited to the current command. Any N greater
1009 than the number of tokens remaining in the current command will be treated
1010 as referring to a T_ENDCMD token. */
/* Returns the lex_token N positions from the current one in LEXER_'s current
   source, as found by lex_source_next__(). The const qualifier on LEXER_ is
   cast away for the lookup.
   NOTE(review): the handling of a lexer that has no source is not visible in
   this view -- confirm against the full function. */
1012 static const struct lex_token *
1013 lex_next__ (const struct lexer *lexer_, int n)
1015 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1016 struct lex_source *src = lex_source__ (lexer);
1019 return lex_source_next__ (src, n);
/* Placeholder token of type T_STOP; every other member is zero-initialized.
   NOTE(review): its users are not visible in this view; presumably it is
   handed out when the lexer has no source to read from -- confirm. */
1022 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
/* Returns the lex_token N positions from the current one in SRC_.

   Looking backward: when at least -N tokens precede the current position in
   the 'parse' array, the token -N positions back is returned; otherwise a
   static T_ENDCMD placeholder is returned.

   Looking forward: more tokens are obtained with lex_source_get_parse() until
   'parse' holds at least N + 1 tokens from the current position onward. The
   loop also checks whether the most recently obtained token is T_STOP or
   T_ENDCMD.
   NOTE(review): the test that separates the backward and forward cases, and
   the statement executed when T_STOP/T_ENDCMD is found, are not visible in
   this view -- confirm against the full function. */
1027 static const struct lex_token *
1028 lex_source_next__ (const struct lex_source *src_, int n)
1030 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1034 if (-n <= src->parse_ofs)
1035 return src->parse[src->parse_ofs - (-n)];
1038 static const struct lex_token endcmd_token
1039 = { .token = { .type = T_ENDCMD } };
1040 return &endcmd_token;
1044 while (src->n_parse - src->parse_ofs <= n)
1046 if (src->n_parse > 0)
1048 const struct lex_token *t = src->parse[src->n_parse - 1];
1049 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1053 lex_source_get_parse (src);
1056 return src->parse[src->parse_ofs + n];
1059 /* Returns the "struct token" of the token N after the current one in LEXER.
1060 The returned pointer can be invalidated by pretty much any succeeding call
1061 into the lexer, although the string pointer within the returned token is
1062 only invalidated by consuming the token (e.g. with lex_get()). */
1063 const struct token *
1064 lex_next (const struct lexer *lexer, int n)
1066 return &lex_next__ (lexer, n)->token;
1069 /* Returns the type of the token N after the current one in LEXER. */
1071 lex_next_token (const struct lexer *lexer, int n)
1073 return lex_next (lexer, n)->type;
1076 /* Returns the number in the token N after the current one in LEXER.
1078 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1079 tokens this function will always return zero. */
1081 lex_next_tokval (const struct lexer *lexer, int n)
1083 return token_number (lex_next (lexer, n));
1086 /* Returns the null-terminated UTF-8 string in the token N after the current one.
1089 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1090 this function will always return NULL.
1092 The UTF-8 encoding of the returned string is correct for variable names and
1093 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1094 data_in() to use it in a "union value". */
1096 lex_next_tokcstr (const struct lexer *lexer, int n)
1098 return lex_next_tokss (lexer, n).string;
1101 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1102 The string is null-terminated (but the null terminator is not included in
1103 the returned substring's 'length').
1105 Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings. For other
1106 tokens this function will always return NULL.
1108 The UTF-8 encoding of the returned string is correct for variable names and
1109 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1110 data_in() to use it in a "union value". */
1112 lex_next_tokss (const struct lexer *lexer, int n)
1114 return lex_next (lexer, n)->string;
1117 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1118 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1119 are both zero, this requests the syntax for the current token.) The caller
1120 must eventually free the returned string (with free()). The syntax is
1121 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1122 example, it may include comments, spaces, and new-lines if it spans multiple
1123 tokens. Macro expansion, however, has already been performed. */
1125 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1127 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1130 /* Returns true if the token N ahead of the current one was produced by macro
1131 expansion, false otherwise. */
1133 lex_next_is_from_macro (const struct lexer *lexer, int n)
1135 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Compares token ACTUAL against token EXPECTED. The two must first have the
   same type; a switch on that type then picks one of three comparisons:
   equality of the 'number' members, lex_id_match() on the strings (with
   EXPECTED's string as the first argument), or equal string length and bytes.
   NOTE(review): the "case" labels and the results for a type mismatch or any
   other token type are not visible in this view; presumably the three
   comparisons serve numeric, identifier, and string tokens respectively. */
1139 lex_tokens_match (const struct token *actual, const struct token *expected)
1141 if (actual->type != expected->type)
1144 switch (actual->type)
1148 return actual->number == expected->number;
1151 return lex_id_match (expected->string, actual->string);
1154 return (actual->string.length == expected->string.length
1155 && !memcmp (actual->string.string, expected->string.string,
1156 actual->string.length));
/* Tokenizes S with a string_lexer (in interactive syntax mode) and compares
   each resulting token, in order, against the corresponding lookahead token in
   LEXER using lex_tokens_match().
   NOTE(review): the return statements are not visible in this view. Callers
   treat a result greater than zero as a match, and lex_match_phrase() passes
   the result to lex_get_n() as the number of tokens to skip. */
1164 lex_at_phrase__ (struct lexer *lexer, const char *s)
1166 struct string_lexer slex;
1170 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1171 while (string_lexer_next (&slex, &token))
1173 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
/* The token taken from S is released as soon as it has been compared. */
1174 token_uninit (&token);
1181 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1182 returns true. Otherwise, returns false.
1184 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1185 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1186 first three letters. */
1188 lex_at_phrase (struct lexer *lexer, const char *s)
1190 return lex_at_phrase__ (lexer, s) > 0;
1193 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1194 skips it and returns true. Otherwise, returns false.
1196 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1197 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1198 first three letters. */
1200 lex_match_phrase (struct lexer *lexer, const char *s)
1202 size_t n = lex_at_phrase__ (lexer, s);
1204 lex_get_n (lexer, n);
/* Scans the LENGTH bytes starting at S for new-line characters, using
   memchr() to find each one in turn and shrinking the remaining length to
   exclude everything up to and including it.
   NOTE(review): the counter and the return statement are not visible in this
   view; going by the name and the caller, the result is the number of
   new-lines found -- confirm. */
1209 count_newlines (char *s, size_t length)
1214 while ((newline = memchr (s, '\n', length)) != NULL)
1217 length -= (newline + 1) - s;
/* Computes the last line number for TOKEN in SRC as TOKEN's first line, plus
   the number of new-lines counted within the token's source text in SRC's
   buffer, plus 1. A first_line of 0 is tested for as a special case.
   NOTE(review): the statement executed for first_line == 0 is not visible in
   this view; presumably it returns 0 -- confirm. */
1225 lex_token_get_last_line_number (const struct lex_source *src,
1226 const struct lex_token *token)
1228 if (token->first_line == 0)
1232 char *token_str = &src->buffer[token->token_pos];
1233 return token->first_line + count_newlines (token_str, token->token_len) + 1;
/* Returns the 1-based column of byte OFFSET within SRC's buffer: the value of
   utf8_count_columns() for the text between the start of OFFSET's line and
   OFFSET, plus 1. The line starts just after the last new-line found before
   OFFSET, or at the start of the buffer if there is none. */
1238 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1240 const char *newline = memrchr (src->buffer, '\n', offset);
1241 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1242 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
/* Returns the 1-based column in SRC of the first byte of TOKEN's syntax. */
1246 lex_token_get_first_column (const struct lex_source *src,
1247 const struct lex_token *token)
1249 return lex_token_get_column__ (src, token->token_pos);
/* Returns the 1-based column in SRC of the position just past the last byte
   of TOKEN's syntax (that is, of offset token_pos + token_len). */
1253 lex_token_get_last_column (const struct lex_source *src,
1254 const struct lex_token *token)
1256 return lex_token_get_column__ (src, token->token_pos + token->token_len);
/* Returns a msg_location spanning tokens T0 through T1 in SRC: the reader's
   file name (passed through intern_new_if_nonnull()), the first line and
   first column of T0, and the last line and last column of T1. */
1259 static struct msg_location
1260 lex_token_location (const struct lex_source *src,
1261 const struct lex_token *t0,
1262 const struct lex_token *t1)
1264 return (struct msg_location) {
1265 .file_name = intern_new_if_nonnull (src->reader->file_name),
1266 .first_line = t0->first_line,
1267 .last_line = lex_token_get_last_line_number (src, t1),
1268 .first_column = lex_token_get_first_column (src, t0),
1269 .last_column = lex_token_get_last_column (src, t1),
/* Like lex_token_location(), but returns a copy of the location made with
   msg_location_dup() instead of a struct by value.
   NOTE(review): presumably the caller owns the returned copy -- confirm. */
1273 static struct msg_location *
1274 lex_token_location_rw (const struct lex_source *src,
1275 const struct lex_token *t0,
1276 const struct lex_token *t1)
1278 struct msg_location location = lex_token_location (src, t0, t1);
1279 return msg_location_dup (&location);
/* Returns a pointer to a msg_location, produced by lex_token_location_rw(),
   that covers the tokens lex_source_next__() yields for N0 and N1 in SRC. */
1282 static struct msg_location *
1283 lex_source_get_location (const struct lex_source *src, int n0, int n1)
1285 return lex_token_location_rw (src,
1286 lex_source_next__ (src, n0),
1287 lex_source_next__ (src, n1));
1290 /* Returns the 1-based line number of the start of the syntax that represents
1291 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1292 if the token is drawn from a source that does not have line numbers. */
1294 lex_get_first_line_number (const struct lexer *lexer, int n)
1296 const struct lex_source *src = lex_source__ (lexer);
/* Without a current source there is no token to inspect, so the result is 0.
   Otherwise it is the first_line recorded in the token itself. */
1297 return src ? lex_source_next__ (src, n)->first_line : 0;
1300 /* Returns the 1-based line number of the end of the syntax that represents the
1301 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1302 token or if the token is drawn from a source that does not have line
1305 Most of the time, a single token is wholly within a single line of syntax,
1306 but there are two exceptions: a T_STRING token can be made up of multiple
1307 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1308 token can consist of a "-" on one line followed by the number on the next.
1311 lex_get_last_line_number (const struct lexer *lexer, int n)
1313 const struct lex_source *src = lex_source__ (lexer);
/* Without a current source the result is 0.  Otherwise the work is done by
   lex_token_get_last_line_number() on the token N ahead. */
1314 return src ? lex_token_get_last_line_number (src,
1315 lex_source_next__ (src, n)) : 0;
1318 /* Returns the 1-based column number of the start of the syntax that represents
1319 the token N after the current one in LEXER. Returns 0 for a T_STOP
1322 Column numbers are measured according to the width of characters as shown in
1323 a typical fixed-width font, in which CJK characters have width 2 and
1324 combining characters have width 0. */
1326 lex_get_first_column (const struct lexer *lexer, int n)
1328 const struct lex_source *src = lex_source__ (lexer);
/* Without a current source the result is 0.  Otherwise the work is done by
   lex_token_get_first_column() on the token N ahead. */
1329 return src ? lex_token_get_first_column (src, lex_source_next__ (src, n)) : 0;
1332 /* Returns the 1-based column number of the end of the syntax that represents
1333 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1336 Column numbers are measured according to the width of characters as shown in
1337 a typical fixed-width font, in which CJK characters have width 2 and
1338 combining characters have width 0. */
1340 lex_get_last_column (const struct lexer *lexer, int n)
1342 const struct lex_source *src = lex_source__ (lexer);
/* Without a current source the result is 0.  Otherwise the work is done by
   lex_token_get_last_column() on the token N ahead. */
1343 return src ? lex_token_get_last_column (src, lex_source_next__ (src, n)) : 0;
1346 /* Returns the name of the syntax file from which the current command is drawn.
1347 Returns NULL for a T_STOP token or if the command's source does not have
1350 There is no version of this function that takes an N argument because
1351 lookahead only works to the end of a command and any given command is always
1352 within a single syntax file. */
1354 lex_get_file_name (const struct lexer *lexer)
1356 struct lex_source *src = lex_source__ (lexer);
/* The result is the reader's own file_name member, not a copy, and it is NULL
   when LEXER has no current source. */
1357 return src == NULL ? NULL : src->reader->file_name;
1360 /* Returns a newly allocated msg_location for the syntax that represents tokens
1361 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1362 must eventually free the location (with msg_location_destroy()). */
1363 struct msg_location *
1364 lex_get_location (const struct lexer *lexer, int n0, int n1)
/* Start from the lines-only location, then fill in the columns: the first
   column comes from token N0 and the last column from token N1. */
1366 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1367 loc->first_column = lex_get_first_column (lexer, n0);
1368 loc->last_column = lex_get_last_column (lexer, n1);
1372 /* Returns a newly allocated msg_location for the syntax that represents tokens
1373 with 0-based offsets N0...N1, inclusive, from the current token. The
1374 location only covers the tokens' lines, not the columns. The caller must
1375 eventually free the location (with msg_location_destroy()). */
1376 struct msg_location *
1377 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1379 struct msg_location *loc = xmalloc (sizeof *loc);
/* Any member not named in this initializer is implicitly zeroed by C, so the
   columns start out as 0.  The first line comes from token N0 and the last
   line from token N1. */
1380 *loc = (struct msg_location) {
1381 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1382 .first_line = lex_get_first_line_number (lexer, n0),
1383 .last_line = lex_get_last_line_number (lexer, n1),
/* Returns the encoding recorded in the reader of LEXER's current source, or
   NULL if LEXER has no current source.  The result is the reader's own
   'encoding' member, not a copy. */
1389 lex_get_encoding (const struct lexer *lexer)
1391 struct lex_source *src = lex_source__ (lexer);
1392 return src == NULL ? NULL : src->reader->encoding;
1395 /* Returns the syntax mode for the syntax file from which the current command is
1396 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1397 does not have line numbers.
1399 There is no version of this function that takes an N argument because
1400 lookahead only works to the end of a command and any given command is always
1401 within a single syntax file. */
1403 lex_get_syntax_mode (const struct lexer *lexer)
1405 struct lex_source *src = lex_source__ (lexer);
/* SEG_MODE_AUTO is the fallback when LEXER has no current source; otherwise
   the mode is read from the source's reader. */
1406 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1409 /* Returns the error mode for the syntax file from which the current command is
1410 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1411 source does not have line numbers.
1413 There is no version of this function that takes an N argument because
1414 lookahead only works to the end of a command and any given command is always
1415 within a single syntax file. */
1417 lex_get_error_mode (const struct lexer *lexer)
1419 struct lex_source *src = lex_source__ (lexer);
/* LEX_ERROR_TERMINAL is the fallback when LEXER has no current source;
   otherwise the mode is read from the source's reader. */
1420 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1423 /* If the source that LEXER is currently reading has error mode
1424 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1425 token to be read comes directly from whatever is next read from the stream.
1427 It makes sense to call this function after encountering an error in a
1428 command entered on the console, because usually the user would prefer not to
1429 have cascading errors. */
1431 lex_interactive_reset (struct lexer *lexer)
1433 struct lex_source *src = lex_source__ (lexer);
/* Only the current source is considered, and only if its error mode is
   LEX_ERROR_TERMINAL. */
1434 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Rewind the buffer positions and clear the per-source newline state. */
1437 src->journal_pos = src->seg_pos = 0;
1438 src->n_newlines = 0;
1439 src->suppress_next_newline = false;
/* Restart the segmenter from scratch, keeping the mode it already had. */
1440 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Empty all three token stages, then seed the (now empty, as
   lex_source_push_endcmd__() requires) parse stage with a T_ENDCMD token. */
1442 lex_stage_clear (&src->pp);
1443 lex_stage_clear (&src->merge);
1444 lex_source_clear_parse (src);
1445 lex_source_push_endcmd__ (src);
1449 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1451 lex_discard_rest_of_command (struct lexer *lexer)
/* The loop stops as soon as the current token is T_STOP or T_ENDCMD, so that
   terminating token is not itself consumed by the loop. */
1453 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1457 /* Discards all lookahead tokens in LEXER, then discards all input sources
1458 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1459 runs out of input sources. */
1461 lex_discard_noninteractive (struct lexer *lexer)
1463 struct lex_source *src = lex_source__ (lexer);
/* Drop every token already buffered in the three stages of the current
   source. */
1467 lex_stage_clear (&src->pp);
1468 lex_stage_clear (&src->merge);
1469 lex_source_clear_parse (src);
/* Destroy sources from the head of LEXER's list until one with error mode
   LEX_ERROR_TERMINAL is found or none remain.  lex_source_destroy() unlinks
   SRC from the list, so lex_source__() yields the next source each time
   around. */
1471 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1472 src = lex_source__ (lexer))
1473 lex_source_destroy (src);
/* Makes sure that SRC's buffer has room beyond the 'length' bytes already in
   it.  When the buffer is full it is grown with x2realloc(), which also
   updates SRC's 'allocated' member through the pointer passed to it. */
1478 lex_source_expand__ (struct lex_source *src)
1480 if (src->length >= src->allocated)
1481 src->buffer = x2realloc (src->buffer, &src->allocated);
/* Reads more input from SRC's reader into the unused space at the end of SRC's
   buffer.  The loop condition at the bottom repeats the read until the part
   of the buffer that has not been segmented yet (from seg_pos onward)
   contains a newline.  The reader's 'eof' flag is also set in here
   (NOTE(review): presumably when the read callback returns 0, which would
   also end the loop -- confirm). */
1485 lex_source_read__ (struct lex_source *src)
/* Guarantee some free space before asking the reader to fill it. */
1489 lex_source_expand__ (src);
1491 size_t space = src->allocated - src->length;
/* The segmenter's current prompt style is looked up for the read callback. */
1492 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1493 size_t n = src->reader->class->read (src->reader,
1494 &src->buffer[src->length],
/* The callback must not report more bytes than it was given room for. */
1496 assert (n <= space);
1501 src->reader->eof = true;
1507 while (!memchr (&src->buffer[src->seg_pos], '\n',
1508 src->length - src->seg_pos));
/* Returns the lex_source at the head of LEXER's list of sources, or NULL if
   that list is empty. */
1511 static struct lex_source *
1512 lex_source__ (const struct lexer *lexer)
1514 return (ll_is_empty (&lexer->sources) ? NULL
1515 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1518 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1519 one, through N1 ahead of the current one, inclusive. (For example, if N0
1520 and N1 are both zero, this requests the syntax for the current token.) The
1521 caller must eventually free the returned string (with free()). The syntax
1522 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1523 for example, it may include comments, spaces, and new-lines if it spans
1524 multiple tokens. Macro expansion, however, has already been performed. */
1526 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1528 struct string s = DS_EMPTY_INITIALIZER;
/* NOTE(review): I is a size_t while N0 and N1 are ints, so 'i <= n1' is
   evaluated as an unsigned comparison; a negative N1 would be converted to a
   huge value.  Confirm that callers never pass one. */
1529 for (size_t i = n0; i <= n1; )
1531 /* Find [I,J) as the longest sequence of tokens not produced by macro
1532 expansion, or otherwise the longest sequence expanded from a single
1534 const struct lex_token *first = lex_source_next__ (src, i);
1536 for (j = i + 1; j <= n1; j++)
1538 const struct lex_token *cur = lex_source_next__ (src, j);
1539 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1540 || first->macro_rep != cur->macro_rep)
1543 const struct lex_token *last = lex_source_next__ (src, j - 1);
1545 /* Now add the syntax for this sequence of tokens to S. */
/* Successive runs are separated by a single space. */
1546 if (!ds_is_empty (&s))
1547 ds_put_byte (&s, ' ');
1548 if (!first->macro_rep)
/* Not from a macro: copy the original bytes from SRC's buffer, from the start
   of FIRST through the end of LAST. */
1550 size_t start = first->token_pos;
1551 size_t end = last->token_pos + last->token_len;
1552 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
/* From a macro expansion: copy the matching slice of the expansion's syntax
   text, which macro_rep points to, using the tokens' 'ofs' and 'len'. */
1556 size_t start = first->ofs;
1557 size_t end = last->ofs + last->len;
1558 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
/* The accumulated text is returned as a C string taken out of S. */
1564 return ds_steal_cstr (&s);
/* Checks whether any of the tokens N0...N1 (inclusive) ahead in SRC has a
   nonnull macro_rep, which marks a token as coming from macro expansion.
   lex_source_get_macro_call() uses the result as a boolean.

   NOTE(review): I is a size_t while N0 and N1 are ints, so 'i <= n1' is an
   unsigned comparison; a negative N1 would be converted to a huge value.
   Confirm that callers never pass one. */
1568 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1570 for (size_t i = n0; i <= n1; i++)
1571 if (lex_source_next__ (src, i)->macro_rep)
1576 /* If tokens N0...N1 (inclusive) in SRC contain a macro call, this returns the
1577 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1578 other tokens included in that range. The syntax is encoded in UTF-8 and in
1579 the original form supplied to the lexer so that, for example, it may include
1580 comments, spaces, and new-lines if it spans multiple tokens.
1582 Returns an empty string if the token range doesn't include a macro call.
1584 The caller must not modify or free the returned string. */
1585 static struct substring
1586 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1588 if (!lex_source_contains_macro_call (src, n0, n1))
1591 const struct lex_token *token0 = lex_source_next__ (src, n0);
/* MAX() covers the case N1 < N0, in which TOKEN1 is the same token as
   TOKEN0. */
1592 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
/* The span runs from the first byte of TOKEN0 through the last byte of
   TOKEN1 in SRC's buffer. */
1593 size_t start = token0->token_pos;
1594 size_t end = token1->token_pos + token1->token_len;
/* The result points directly into SRC's buffer; nothing is copied. */
1596 return ss_buffer (&src->buffer[start], end - start);
/* Builds a syntax error message about tokens N0...N1 (inclusive) ahead in SRC.

   The text begins with one of several "Syntax error ..." prefixes, chosen
   according to the offending tokens and to whether they came from a macro
   expansion.  Then ": " and the text formatted from FORMAT and ARGS are
   appended (NOTE(review): check whether a null FORMAT is allowed and skips
   that step), and a final '.' is added unless the text already ends in one.

   The message has category MSG_C_SYNTAX and severity MSG_S_ERROR, and its
   location comes from lex_source_get_location() for N0...N1. */
1600 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1601 const char *format, va_list args)
1603 const struct lex_token *token;
/* A T_ENDCMD at N0 gets a fixed prefix that quotes no syntax. */
1608 token = lex_source_next__ (src, n0);
1609 if (token->token.type == T_ENDCMD)
1610 ds_put_cstr (&s, _("Syntax error at end of command"));
1613 /* Get the syntax that caused the error. */
1614 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
/* str_ellipsize() fits the text into the SYNTAX buffer. */
1616 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1619 /* Get the macro call(s) that expanded to the syntax that caused the
1622 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1629 _("Syntax error at `%s' (in expansion of `%s')"),
1632 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1637 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1640 ds_put_cstr (&s, _("Syntax error"));
1646 ds_put_cstr (&s, ": ");
1647 ds_put_vformat (&s, format, args);
1649 if (ds_last (&s) != '.')
1650 ds_put_byte (&s, '.');
1652 struct msg *m = xmalloc (sizeof *m);
1654 .category = MSG_C_SYNTAX,
1655 .severity = MSG_S_ERROR,
1656 .location = lex_source_get_location (src, n0, n1),
1657 .text = ds_steal_cstr (&s),
/* Builds a syntax error message for TOKEN in SRC.  lex_source_try_get_pp()
   calls this when tokenizing a segment yields TOKENIZE_ERROR.

   The text quotes TOKEN's source bytes from SRC's buffer (fitted into SYNTAX
   by str_ellipsize()), followed by ": " and the string carried in TOKEN's
   token value.  The message has category MSG_C_SYNTAX and severity
   MSG_S_ERROR, and its location covers TOKEN alone. */
1663 lex_get_error (struct lex_source *src, const struct lex_token *token)
1666 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1667 syntax, sizeof syntax);
1669 struct string s = DS_EMPTY_INITIALIZER;
1670 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1671 ds_put_format (&s, ": %s", token->token.string.string);
1673 struct msg *m = xmalloc (sizeof *m);
1675 .category = MSG_C_SYNTAX,
1676 .severity = MSG_S_ERROR,
1677 .location = lex_token_location_rw (src, token, token),
1678 .text = ds_steal_cstr (&s),
1683 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1684 underlying lex_reader if necessary. Returns true if a new token was added
1685 to SRC's deque, false otherwise. The caller should retry failures unless
1686 SRC's 'eof' marker was set to true indicating that there will be no more
1687 tokens from this source. */
1689 lex_source_try_get_pp (struct lex_source *src)
1691 /* Allocate a new token and initialize it.  It joins 'pp' only at the end. */
1692 struct lex_token *token = xmalloc (sizeof *token);
1693 token->token = (struct token) { .type = T_STOP };
1694 token->macro_rep = NULL;
1695 token->ref_cnt = NULL;
1696 token->token_pos = src->seg_pos;
/* Line numbers are tracked only when the reader has a positive line_number.
   In that case the token's first line is the reader's line number plus the
   newlines counted in this source so far; otherwise it is 0. */
1697 if (src->reader->line_number > 0)
1698 token->first_line = src->reader->line_number + src->n_newlines;
1700 token->first_line = 0;
1702 /* Extract a segment. */
1703 const char *segment;
1704 enum segment_type seg_type;
/* The segmenter is offered everything in the buffer that has not been
   segmented yet, along with whether the reader has reached end of input. */
1708 segment = &src->buffer[src->seg_pos];
1709 seg_len = segmenter_push (&src->segmenter, segment,
1710 src->length - src->seg_pos,
1711 src->reader->eof, &seg_type);
1715 /* The segmenter needs more input to produce a segment. */
1716 assert (!src->reader->eof);
1717 lex_source_read__ (src);
1720 /* Update state based on the segment. */
1721 token->token_len = seg_len;
1722 src->seg_pos += seg_len;
1723 if (seg_type == SEG_NEWLINE)
1726 /* Get a token from the segment. */
1727 enum tokenize_result result = token_from_segment (
1728 seg_type, ss_buffer (segment, seg_len), &token->token);
1730 /* If we've reached the end of a line, or the end of a command, then pass
1731 the line to the output engine as a syntax text item. */
1732 int n_lines = seg_type == SEG_NEWLINE;
1733 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1736 src->suppress_next_newline = true;
1738 else if (n_lines > 0 && src->suppress_next_newline)
1741 src->suppress_next_newline = false;
1743 for (int i = 0; i < n_lines; i++)
1745 /* Beginning of line. */
1746 const char *line = &src->buffer[src->journal_pos];
1748 /* Calculate line length, including \n or \r\n end-of-line if present.
1750 We use src->length even though that may be beyond what we've actually
1751 converted to tokens. That's because, if we're emitting the line due
1752 to SEG_END_COMMAND, we want to take the whole line through the
1753 newline, not just through the '.'. */
1754 size_t max_len = src->length - src->journal_pos;
1755 const char *newline = memchr (line, '\n', max_len);
1756 size_t line_len = newline ? newline - line + 1 : max_len;
1758 /* Calculate line length excluding end-of-line. */
1759 size_t copy_len = line_len;
1760 if (copy_len > 0 && line[copy_len - 1] == '\n')
1762 if (copy_len > 0 && line[copy_len - 1] == '\r')
1765 /* Submit the line as syntax. */
1766 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1767 xmemdup0 (line, copy_len),
/* journal_pos advances by the full line, end-of-line bytes included. */
1770 src->journal_pos += line_len;
/* The segment could not be tokenized: build a diagnostic for it. */
1775 case TOKENIZE_ERROR:
1776 lex_get_error (src, token);
/* The segment produced no token, so the token allocated at the top is
   discarded. */
1778 case TOKENIZE_EMPTY:
1779 lex_token_destroy (token);
/* A real token.  A T_STOP is presented to the later stages as T_ENDCMD. */
1782 case TOKENIZE_TOKEN:
1783 if (token->token.type == T_STOP)
1785 token->token.type = T_ENDCMD;
1788 lex_stage_push_last (&src->pp, token);
1794 /* Attempts to append a new token to SRC. Returns true if successful, false on
1795 failure. On failure, the end of SRC has been reached and no more tokens
1796 will be forthcoming from it.
1798 Does not make the new token available for lookahead yet; the caller must
1799 adjust SRC's 'middle' pointer to do so. */
1801 lex_source_get_pp (struct lex_source *src)
/* The work is delegated to lex_source_try_get_pp().  NOTE(review): that
   function asks its caller to retry failures until the source is exhausted,
   so presumably this call sits in such a retry loop -- confirm. */
1804 if (lex_source_try_get_pp (src))
/* Tries to move tokens from SRC's 'pp' stage into its 'merge' stage, expanding
   a macro call at the front of 'pp' if there is one.  When a macro call is
   expanded, the result is true only if the expansion yielded at least one
   token. */
1810 lex_source_try_get_merge (const struct lex_source *src_)
/* Cast away const: the stages of the source are modified below. */
1812 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Make sure that 'pp' holds at least one token to work with. */
1814 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
/* With macro expansion turned off, everything in 'pp' moves to 'merge'
   unchanged. */
1817 if (!settings_get_mexpand ())
1819 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1823 /* Now pass tokens one-by-one to the macro expander.
1825 In the common case where there is no macro to expand, the loop is not
1827 struct macro_call *mc;
1828 int n_call = macro_call_create (src->lexer->macros,
1829 &lex_stage_first (&src->pp)->token, &mc);
1830 for (int ofs = 1; !n_call; ofs++)
1832 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1834 /* This should not be reachable because we always get a T_ENDCMD at
1835 the end of an input file (transformed from T_STOP by
1836 lex_source_try_get_pp()) and the macro_expander should always
1837 terminate expansion on T_ENDCMD. */
1841 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
/* Hand the macro call the next 'pp' token along with its original source
   text and its location. */
1842 size_t start = t->token_pos;
1843 size_t end = t->token_pos + t->token_len;
1844 const struct macro_token mt = {
1846 .syntax = ss_buffer (&src->buffer[start], end - start),
1848 const struct msg_location loc = lex_token_location (src, t, t);
1849 n_call = macro_call_add (mc, &mt, &loc);
1853 /* False alarm: no macro expansion after all. Use first token as
1854 lookahead. We'll retry macro expansion from the second token next
1856 macro_call_destroy (mc);
1857 lex_stage_shift (&src->merge, &src->pp, 1);
1861 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1862 are a macro call. (These are likely to be the only tokens in 'pp'.)
1864 const struct lex_token *c0 = lex_stage_first (&src->pp);
1865 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1866 struct macro_tokens expansion = { .n = 0 };
1867 struct msg_location loc = lex_token_location (src, c0, c1);
1868 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1869 macro_call_destroy (mc);
1871 /* Convert the macro expansion into syntax for possible error messages
1873 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1874 size_t *len = xnmalloc (expansion.n, sizeof *len);
1875 struct string s = DS_EMPTY_INITIALIZER;
1876 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1878 if (settings_get_mprint ())
1879 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1880 _("Macro Expansion")));
1882 /* Append the macro expansion tokens to the lookahead. */
1883 if (expansion.n > 0)
/* Every token of this expansion shares the same MACRO_REP text and a single
   reference count, which starts at the number of tokens in the expansion. */
1885 char *macro_rep = ds_steal_cstr (&s);
1886 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1887 *ref_cnt = expansion.n;
1888 for (size_t i = 0; i < expansion.n; i++)
/* Each expanded token is positioned at the whole macro call in the source,
   from the start of C0 through the end of C1. */
1890 struct lex_token *token = xmalloc (sizeof *token);
1891 *token = (struct lex_token) {
1892 .token = expansion.mts[i].token,
1893 .token_pos = c0->token_pos,
1894 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1895 .first_line = c0->first_line,
1896 .macro_rep = macro_rep,
1901 lex_stage_push_last (&src->merge, token);
1903 ss_dealloc (&expansion.mts[i].syntax);
1908 free (expansion.mts);
1912 /* Destroy the tokens for the call. */
1913 for (size_t i = 0; i < n_call; i++)
1914 lex_stage_pop_first (&src->pp);
/* An empty expansion added nothing to 'merge', so it does not count as
   success. */
1916 return expansion.n > 0;
1919 /* Attempts to obtain at least one new token into 'merge' in SRC.
1921 Returns true if successful, false on failure. In the latter case, SRC is
1922 exhausted and 'src->eof' is now true. */
1924 lex_source_get_merge (struct lex_source *src)
/* The work is delegated to lex_source_try_get_merge().  NOTE(review):
   presumably that call is retried until it succeeds or the source is
   exhausted -- confirm. */
1927 if (lex_source_try_get_merge (src))
1932 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1934 Returns true if successful, false on failure. In the latter case, SRC is
1935 exhausted and 'src->eof' is now true. */
1937 lex_source_get_parse (struct lex_source *src)
1939 struct merger m = MERGER_INIT;
/* Feed the tokens in 'merge' to the merger one at a time, fetching more into
   'merge' as needed, until merger_add() reaches a verdict. */
1941 for (size_t i = 0; ; i++)
1943 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1945 /* We always get a T_ENDCMD at the end of an input file
1946 (transformed from T_STOP by lex_source_try_get_pp()) and
1947 merger_add() should never return -1 on T_ENDCMD. */
1948 assert (lex_stage_is_empty (&src->merge));
1952 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
/* Here the first token in 'merge' moves to the parse stage as it is. */
1956 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1959 else if (retval > 0)
1961 /* Add a token that merges all the tokens together. */
1962 const struct lex_token *first = lex_stage_first (&src->merge);
1963 const struct lex_token *last = lex_stage_nth (&src->merge,
/* MACRO is true only when the first and last merged tokens come from the same
   macro expansion. */
1965 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1966 struct lex_token *t = xmalloc (sizeof *t);
1967 *t = (struct lex_token) {
1969 .token_pos = first->token_pos,
1970 .token_len = (last->token_pos - first->token_pos) + last->token_len,
1971 .first_line = first->first_line,
1973 /* This works well if all the tokens were not expanded from macros,
1974 or if they came from the same macro expansion. It just gives up
1975 in the other (corner) cases. */
1976 .macro_rep = macro ? first->macro_rep : NULL,
1977 .ofs = macro ? first->ofs : 0,
1978 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1979 .ref_cnt = macro ? first->ref_cnt : NULL,
1983 lex_source_push_parse (src, t);
/* T now stands for the RETVAL tokens that were merged, so drop them from
   'merge'. */
1985 for (int i = 0; i < retval; i++)
1986 lex_stage_pop_first (&src->merge);
/* Pushes a newly allocated T_ENDCMD token onto SRC's parse stage, which must
   be empty.  All of the token's other members are zero. */
1993 lex_source_push_endcmd__ (struct lex_source *src)
1995 assert (src->n_parse == 0);
1997 struct lex_token *token = xmalloc (sizeof *token);
/* Members not named in the initializer are implicitly zeroed by C. */
1998 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1999 lex_source_push_parse (src, token);
/* Appends TOKEN to the end of SRC's 'parse' array, first growing the array
   with x2nrealloc() if it is full.  The array stores the pointer itself, not
   a copy, and lex_source_clear_parse() later destroys the tokens stored
   there, so SRC takes over ownership of TOKEN. */
2003 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2005 if (src->n_parse >= src->allocated_parse)
2006 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2007 sizeof *src->parse);
2008 src->parse[src->n_parse++] = token;
/* Destroys every token in SRC's 'parse' array and marks the array as empty.
   The array's storage is not freed here, so it remains available for
   reuse. */
2012 lex_source_clear_parse (struct lex_source *src)
2014 for (size_t i = 0; i < src->n_parse; i++)
2015 lex_token_destroy (src->parse[i]);
/* Reset both the token count and the parse offset. */
2016 src->n_parse = src->parse_ofs = 0;
/* Allocates and initializes a new lex_source (presumably one that belongs to
   LEXER and reads from READER -- confirm against the full initializer).  Its
   segmenter starts in READER's syntax mode, and its parse stage is seeded
   with a T_ENDCMD token through lex_source_push_endcmd__(). */
2019 static struct lex_source *
2020 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2022 struct lex_source *src = xmalloc (sizeof *src);
2023 *src = (struct lex_source) {
2025 .segmenter = segmenter_init (reader->syntax, false),
2029 lex_source_push_endcmd__ (src);
/* Destroys SRC: invokes its reader's 'destroy' callback (if any), releases
   its token stages, and unlinks it from the list of sources it is on. */
2035 lex_source_destroy (struct lex_source *src)
/* Save these pointers before the reader's destroy callback runs.  The
   callback receives the reader itself and presumably frees it, after which
   src->reader could no longer be dereferenced -- confirm. */
2037 char *file_name = src->reader->file_name;
2038 char *encoding = src->reader->encoding;
2039 if (src->reader->class->destroy != NULL)
2040 src->reader->class->destroy (src->reader);
2044 lex_stage_uninit (&src->pp);
2045 lex_stage_uninit (&src->merge);
2046 lex_source_clear_parse (src);
/* Take SRC off the linked list that lex_source__() reads from. */
2048 ll_remove (&src->ll);
/* A lex_reader that takes its input from a u8_istream. */
2052 struct lex_file_reader
/* Embedded base.  lex_file_reader_cast() converts a pointer to this member
   back into a pointer to the enclosing lex_file_reader. */
2054 struct lex_reader reader;
/* The stream that lex_file_read() reads from. */
2055 struct u8_istream *istream;
/* Declared ahead of its definition so that lex_reader_for_file() can refer to
   it. */
2058 static struct lex_reader_class lex_file_reader_class;
2060 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2061 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2062 ENCODING, which should take one of the forms accepted by
2063 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2064 mode of the new reader, respectively.
2066 Returns a null pointer if FILE_NAME cannot be opened. */
2068 lex_reader_for_file (const char *file_name, const char *encoding,
2069 enum segmenter_mode syntax,
2070 enum lex_error_mode error)
2072 struct lex_file_reader *r;
2073 struct u8_istream *istream;
/* A FILE_NAME of "-" selects standard input instead of a named file. */
2075 istream = (!strcmp(file_name, "-")
2076 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2077 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2078 if (istream == NULL)
/* The message reports errno as left behind by the failed open. */
2080 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2084 r = xmalloc (sizeof *r);
2085 lex_reader_init (&r->reader, &lex_file_reader_class);
2086 r->reader.syntax = syntax;
2087 r->reader.error = error;
/* The reader keeps its own copies of FILE_NAME and (if nonnull) ENCODING. */
2088 r->reader.file_name = xstrdup (file_name);
2089 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* File readers have line numbers, counted from 1. */
2090 r->reader.line_number = 1;
2091 r->istream = istream;
/* Converts R, which must point to the 'reader' member of a lex_file_reader,
   into a pointer to that enclosing lex_file_reader. */
2096 static struct lex_file_reader *
2097 lex_file_reader_cast (struct lex_reader *r)
2099 return UP_CAST (r, struct lex_file_reader, reader);
/* Read function for file readers: reads up to N bytes from the reader's
   u8_istream into BUF.  PROMPT_STYLE is not used.  A read error is reported
   with msg(), naming the file and errno.  NOTE(review): presumably an error
   is then returned to the caller as 0 bytes read -- confirm. */
2103 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2104 enum prompt_style prompt_style UNUSED)
2106 struct lex_file_reader *r = lex_file_reader_cast (r_);
2107 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2110 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Close function for file readers: releases the reader's u8_istream. */
2117 lex_file_close (struct lex_reader *r_)
2119 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* Only a stream that is not on standard input is closed with
   u8_istream_close().  A failure to close is reported but not otherwise
   acted on here. */
2121 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2123 if (u8_istream_close (r->istream) != 0)
2124 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2127 u8_istream_free (r->istream);
/* The reader class that lex_reader_for_file() installs in the readers it
   creates. */
2132 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader whose input is a string held in memory, as created by
   lex_reader_for_substring_nocopy(). */
2138 struct lex_string_reader
/* Embedded base.  lex_string_reader_cast() converts a pointer to this member
   back into a pointer to the enclosing lex_string_reader. */
2140 struct lex_reader reader;
/* Declared ahead of its definition so that lex_reader_for_substring_nocopy()
   can refer to it. */
2145 static struct lex_reader_class lex_string_reader_class;
2147 /* Creates and returns a new lex_reader for the contents of S, which must be
2148 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2149 with ss_dealloc() when it is closed. */
2151 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2153 struct lex_string_reader *r;
2155 r = xmalloc (sizeof *r);
2156 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always start out in SEG_MODE_AUTO. */
2157 r->reader.syntax = SEG_MODE_AUTO;
/* ENCODING is copied (if nonnull), so the caller keeps ownership of its own
   string. */
2158 r->reader.encoding = xstrdup_if_nonnull (encoding);
2165 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2166 which must be encoded in ENCODING. The caller retains ownership of S. */
2168 lex_reader_for_string (const char *s, const char *encoding)
/* Make a private copy of S in SS, then pass ownership of that copy to the
   new reader. */
2170 struct substring ss;
2171 ss_alloc_substring (&ss, ss_cstr (s));
2172 return lex_reader_for_substring_nocopy (ss, encoding);
2175 /* Formats FORMAT as a printf()-like format string and creates and returns a
2176 new lex_reader for the formatted result. */
2178 lex_reader_for_format (const char *format, const char *encoding, ...)
2180 struct lex_reader *r;
/* The variadic arguments that FORMAT consumes come after ENCODING, not
   directly after FORMAT. */
2183 va_start (args, encoding);
/* The string that xvasprintf() allocates goes to the "nocopy" constructor, so
   the new reader takes ownership of it. */
2184 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Converts R, which must point to the 'reader' member of a lex_string_reader,
   into a pointer to that enclosing lex_string_reader. */
2190 static struct lex_string_reader *
2191 lex_string_reader_cast (struct lex_reader *r)
2193 return UP_CAST (r, struct lex_string_reader, reader);
/* Read function for string readers: copies the next part of the reader's
   string into BUF.  PROMPT_STYLE is not used. */
2197 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2198 enum prompt_style prompt_style UNUSED)
2200 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Copy as much as fits in the N bytes of BUF, but no more than what remains
   of the string past the current offset. */
2203 chunk = MIN (n, r->s.length - r->offset);
2204 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string readers.  NOTE(review): going by the contract of
   lex_reader_for_substring_nocopy(), this is expected to release the reader's
   string with ss_dealloc() -- confirm. */
2211 lex_string_close (struct lex_reader *r_)
2213 struct lex_string_reader *r = lex_string_reader_cast (r_);
2219 static struct lex_reader_class lex_string_reader_class =