1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/macro.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
71 src->tail <= line_pos <= token_pos <= src->head. */
72 size_t token_pos; /* Start of token. */
73 size_t token_len; /* Length of source for token in bytes. */
74 size_t line_pos; /* Start of line containing token_pos. */
75 int first_line; /* Line number at token_pos. */
77 /* For a token obtained through macro expansion, this is just this token.
79 For a token obtained through the lexer in an ordinary way, these are
81 char *macro_rep; /* The whole macro expansion. */
82 size_t ofs; /* Offset of this token in macro_rep. */
83 size_t len; /* Length of this token in macro_rep. */
84 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
88 lex_token_destroy (struct lex_token *t)
90 token_uninit (&t->token);
93 assert (*t->ref_cnt > 0);
103 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
108 struct lex_token **tokens;
111 static void lex_stage_clear (struct lex_stage *);
112 static void lex_stage_uninit (struct lex_stage *);
114 static size_t lex_stage_count (const struct lex_stage *);
115 static bool lex_stage_is_empty (const struct lex_stage *);
117 static struct lex_token *lex_stage_last (struct lex_stage *);
118 static struct lex_token *lex_stage_first (struct lex_stage *);
119 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
121 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
122 static void lex_stage_pop_first (struct lex_stage *);
124 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
/* Empties STAGE by removing and destroying its tokens one at a time, starting
   with the first.  The array backing the deque is not freed here. */
static void
lex_stage_clear (struct lex_stage *stage)
{
  while (!lex_stage_is_empty (stage))
    lex_stage_pop_first (stage);
}
135 /* Deletes all the tokens from STAGE and frees storage for the deque. */
137 lex_stage_uninit (struct lex_stage *stage)
139 lex_stage_clear (stage);
140 free (stage->tokens);
143 /* Returns true if STAGE contains no tokens, otherwise false. */
145 lex_stage_is_empty (const struct lex_stage *stage)
147 return deque_is_empty (&stage->deque);
150 /* Returns the number of tokens in STAGE. */
152 lex_stage_count (const struct lex_stage *stage)
154 return deque_count (&stage->deque);
157 /* Returns the last token in STAGE, which must be nonempty. The last token is
158 the one accessed with the greatest lookahead. */
159 static struct lex_token *
160 lex_stage_last (struct lex_stage *stage)
162 return stage->tokens[deque_front (&stage->deque, 0)];
/* Returns the first token in STAGE, that is, the one reached with the least
   lookahead.  STAGE must not be empty. */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  const size_t least_lookahead = 0;
  return lex_stage_nth (stage, least_lookahead);
}
173 /* Returns the token the given INDEX in STAGE. The first token (with the least
174 lookahead) is 0, the second token is 1, and so on. There must be at least
175 INDEX + 1 tokens in STAGE. */
176 static struct lex_token *
177 lex_stage_nth (struct lex_stage *stage, size_t index)
179 return stage->tokens[deque_back (&stage->deque, index)];
182 /* Adds TOKEN so that it becomes the last token in STAGE. */
184 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
186 if (deque_is_full (&stage->deque))
187 stage->tokens = deque_expand (&stage->deque, stage->tokens,
188 sizeof *stage->tokens);
189 stage->tokens[deque_push_front (&stage->deque)] = token;
192 /* Removes the first token from STAGE and uninitializes it. */
194 lex_stage_pop_first (struct lex_stage *stage)
196 lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
199 /* Removes the first N tokens from SRC, appending them to DST as the last
202 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
204 for (size_t i = 0; i < n; i++)
206 lex_stage_push_last (dst, lex_stage_first (src));
207 deque_pop_back (&src->deque);
211 /* A source of tokens, corresponding to a syntax file.
213 This is conceptually a lex_reader wrapped with everything needed to convert
214 its UTF-8 bytes into tokens. */
217 struct ll ll; /* In lexer's list of sources. */
218 struct lex_reader *reader;
220 struct segmenter segmenter;
221 bool eof; /* True if T_STOP was read from 'reader'. */
223 /* Buffer of UTF-8 bytes. */
225 size_t allocated; /* Number of bytes allocated. */
226 size_t tail; /* &buffer[0] offset into UTF-8 source. */
227 size_t head; /* &buffer[head - tail] offset into source. */
229 /* Positions in source file, tail <= pos <= head for each member here. */
230 size_t journal_pos; /* First byte not yet output to journal. */
231 size_t seg_pos; /* First byte not yet scanned as token. */
232 size_t line_pos; /* First byte of line containing seg_pos. */
234 int n_newlines; /* Number of new-lines up to seg_pos. */
235 bool suppress_next_newline;
239 This is a pipeline with the following stages. Each token eventually
240 made available to the parser passes through all of these stages. The stages
241 are named after the processing that happens in each one.
243 Initially, tokens come from the segmenter and scanner to 'pp':
245 - pp: Tokens that need to pass through the macro preprocessor to end up
248 - merge: Tokens that need to pass through scan_merge() to end up in
251 - lookahead: Tokens available to the client for parsing. */
253 struct lex_stage merge;
254 struct lex_stage lookahead;
257 static struct lex_source *lex_source_create (struct lexer *,
258 struct lex_reader *);
259 static void lex_source_destroy (struct lex_source *);
264 struct ll_list sources; /* Contains "struct lex_source"s. */
265 struct macro_set *macros;
268 static struct lex_source *lex_source__ (const struct lexer *);
269 static char *lex_source_get_syntax__ (const struct lex_source *,
271 static const struct lex_token *lex_next__ (const struct lexer *, int n);
272 static void lex_source_push_endcmd__ (struct lex_source *);
274 static bool lex_source_get_lookahead (struct lex_source *);
275 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
276 const char *format, va_list)
277 PRINTF_FORMAT (4, 0);
278 static const struct lex_token *lex_source_next__ (const struct lex_source *,
281 /* Initializes READER with the specified CLASS and otherwise some reasonable
282 defaults. The caller should fill in the other members as desired. */
284 lex_reader_init (struct lex_reader *reader,
285 const struct lex_reader_class *class)
287 reader->class = class;
288 reader->syntax = SEG_MODE_AUTO;
289 reader->error = LEX_ERROR_CONTINUE;
290 reader->file_name = NULL;
291 reader->encoding = NULL;
292 reader->line_number = 0;
296 /* Frees any file name already in READER and replaces it by a copy of
297 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
299 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
301 free (reader->file_name);
302 reader->file_name = xstrdup_if_nonnull (file_name);
305 /* Creates and returns a new lexer. */
309 struct lexer *lexer = xmalloc (sizeof *lexer);
310 *lexer = (struct lexer) {
311 .sources = LL_INITIALIZER (lexer->sources),
312 .macros = macro_set_create (),
317 /* Destroys LEXER. */
319 lex_destroy (struct lexer *lexer)
323 struct lex_source *source, *next;
325 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
326 lex_source_destroy (source);
327 macro_set_destroy (lexer->macros);
332 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
333 same name. Takes ownership of M. */
335 lex_define_macro (struct lexer *lexer, struct macro *m)
337 macro_set_add (lexer->macros, m);
340 /* Inserts READER into LEXER so that the next token read by LEXER comes from
341 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
344 lex_include (struct lexer *lexer, struct lex_reader *reader)
346 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
347 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
350 /* Appends READER to LEXER, so that it will be read after all other current
351 readers have already been read. */
353 lex_append (struct lexer *lexer, struct lex_reader *reader)
355 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
360 /* Advances LEXER to the next token, consuming the current token. */
362 lex_get (struct lexer *lexer)
364 struct lex_source *src;
366 src = lex_source__ (lexer);
370 if (!lex_stage_is_empty (&src->lookahead))
371 lex_stage_pop_first (&src->lookahead);
373 while (lex_stage_is_empty (&src->lookahead))
374 if (!lex_source_get_lookahead (src))
376 lex_source_destroy (src);
377 src = lex_source__ (lexer);
383 /* Issuing errors. */
385 /* Prints a syntax error message containing the current token and the
386 message given by FORMAT and its arguments (if FORMAT is non-null). */
388 lex_error (struct lexer *lexer, const char *format, ...)
392 va_start (args, format);
393 lex_next_error_valist (lexer, 0, 0, format, args);
/* Prints a syntax error message for the current token, followed by the
   message given by FORMAT and ARGS (if FORMAT is non-null).  This is
   lex_next_error_valist() applied to the token range 0...0. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  const int current = 0;
  lex_next_error_valist (lexer, current, current, format, args);
}
405 /* Prints a syntax error message for the tokens N0 through N1 ahead of the
406 current one, with the message given by FORMAT (if FORMAT is non-null). */
408 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
412 va_start (args, format);
413 lex_next_error_valist (lexer, n0, n1, format, args);
417 /* Prints a syntax error message saying that one of the strings provided as
418 varargs, up to the first NULL, is expected. */
420 (lex_error_expecting) (struct lexer *lexer, ...)
424 va_start (args, lexer);
425 lex_error_expecting_valist (lexer, args);
429 /* Prints a syntax error message saying that one of the options provided in
430 ARGS, up to the first NULL, is expected. */
432 lex_error_expecting_valist (struct lexer *lexer, va_list args)
434 enum { MAX_OPTIONS = 9 };
435 const char *options[MAX_OPTIONS];
437 while (n < MAX_OPTIONS)
439 const char *option = va_arg (args, const char *);
443 options[n++] = option;
445 lex_error_expecting_array (lexer, options, n);
449 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
454 lex_error (lexer, NULL);
458 lex_error (lexer, _("expecting %s"), options[0]);
462 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
466 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
471 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
472 options[0], options[1], options[2], options[3]);
476 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
477 options[0], options[1], options[2], options[3], options[4]);
481 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
482 options[0], options[1], options[2], options[3], options[4],
487 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
488 options[0], options[1], options[2], options[3], options[4],
489 options[5], options[6]);
493 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
494 options[0], options[1], options[2], options[3], options[4],
495 options[5], options[6], options[7]);
499 lex_error (lexer, NULL);
503 /* Reports an error to the effect that subcommand SBC may only be specified
506 This function does not take a lexer as an argument or use lex_error(),
507 because the result would ordinarily just be redundant: "Syntax error at
508 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
509 not help the user find the error. */
511 lex_sbc_only_once (const char *sbc)
513 msg (SE, _("Subcommand %s may only be specified once."), sbc);
516 /* Reports an error to the effect that subcommand SBC is missing.
518 This function does not take a lexer as an argument or use lex_error(),
519 because a missing subcommand can normally be detected only after the whole
520 command has been parsed, and so lex_error() would always report "Syntax
521 error at end of command", which does not help the user find the error. */
523 lex_sbc_missing (const char *sbc)
525 msg (SE, _("Required subcommand %s was not specified."), sbc);
528 /* Reports an error to the effect that specification SPEC may only be specified
529 once within subcommand SBC. */
531 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
533 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
537 /* Reports an error to the effect that specification SPEC is missing within
540 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
542 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
546 /* Prints a syntax error message for the tokens N0 through N1 ahead of the
547 current one, with the message given by FORMAT and ARGS (if FORMAT is non-null). */
549 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
550 const char *format, va_list args)
552 struct lex_source *src = lex_source__ (lexer);
555 lex_source_error_valist (src, n0, n1, format, args);
561 ds_put_format (&s, _("Syntax error at end of input"));
564 ds_put_cstr (&s, ": ");
565 ds_put_vformat (&s, format, args);
567 ds_put_byte (&s, '.');
568 msg (SE, "%s", ds_cstr (&s));
573 /* Checks that we're at end of command.
574 If so, returns a successful command completion code.
575 If not, flags a syntax error and returns an error command
578 lex_end_of_command (struct lexer *lexer)
580 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
582 lex_error (lexer, _("expecting end of command"));
589 /* Token testing functions. */
/* Returns true if LEXER's current token is a number, false otherwise. */
bool
lex_is_number (const struct lexer *lexer)
{
  bool is_number = lex_next_is_number (lexer, 0);
  return is_number;
}
/* Returns true if LEXER's current token is a string, false otherwise. */
bool
lex_is_string (const struct lexer *lexer)
{
  bool is_string = lex_next_is_string (lexer, 0);
  return is_string;
}
/* Returns the value of LEXER's current token.  The current token must be a
   floating point number. */
double
lex_number (const struct lexer *lexer)
{
  double value = lex_next_number (lexer, 0);
  return value;
}
/* Returns true if and only if LEXER's current token is an integer. */
bool
lex_is_integer (const struct lexer *lexer)
{
  bool is_integer = lex_next_is_integer (lexer, 0);
  return is_integer;
}
/* Returns the value of LEXER's current token.  The current token must be an
   integer. */
long
lex_integer (const struct lexer *lexer)
{
  long value = lex_next_integer (lexer, 0);
  return value;
}
628 /* Token testing functions with lookahead.
630 A value of 0 for N as an argument to any of these functions refers to the
631 current token. Lookahead is limited to the current command. Any N greater
632 than the number of tokens remaining in the current command will be treated
633 as referring to a T_ENDCMD token. */
/* Returns true if the token N positions ahead of LEXER's current token is a
   number, false otherwise. */
bool
lex_next_is_number (const struct lexer *lexer, int n)
{
  const struct token *ahead = lex_next (lexer, n);
  return token_is_number (ahead);
}
/* Returns true if the token N positions ahead of LEXER's current token is a
   string, false otherwise. */
bool
lex_next_is_string (const struct lexer *lexer, int n)
{
  const struct token *ahead = lex_next (lexer, n);
  return token_is_string (ahead);
}
/* Returns the value of the token N positions ahead of LEXER's current token.
   That token must be a floating point number. */
double
lex_next_number (const struct lexer *lexer, int n)
{
  const struct token *ahead = lex_next (lexer, n);
  return token_number (ahead);
}
/* Returns true if the token N positions ahead of LEXER's current token is an
   integer, false otherwise. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  const struct token *ahead = lex_next (lexer, n);
  return token_is_integer (ahead);
}
/* Returns the value of the token N positions ahead of LEXER's current token.
   That token must be an integer. */
long
lex_next_integer (const struct lexer *lexer, int n)
{
  const struct token *ahead = lex_next (lexer, n);
  return token_integer (ahead);
}
672 /* Token matching functions. */
674 /* If the current token has the specified TYPE, skips it and returns true.
675 Otherwise, returns false. */
677 lex_match (struct lexer *lexer, enum token_type type)
679 if (lex_token (lexer) == type)
/* If LEXER's current token matches IDENTIFIER, skips it and returns true;
   otherwise returns false.  IDENTIFIER may be abbreviated to its first three
   letters.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  const size_t min_abbrev_len = 3;
  return lex_match_id_n (lexer, identifier, min_abbrev_len);
}
699 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
700 may be abbreviated to its first N letters. Otherwise, returns false.
702 IDENTIFIER must be an ASCII string. */
704 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
706 if (lex_token (lexer) == T_ID
707 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
716 /* If the current token is integer X, skips it and returns true. Otherwise,
719 lex_match_int (struct lexer *lexer, int x)
721 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
730 /* Forced matches. */
732 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
733 abbreviated to its first 3 letters. Otherwise, reports an error and returns
736 IDENTIFIER must be an ASCII string. */
738 lex_force_match_id (struct lexer *lexer, const char *identifier)
740 if (lex_match_id (lexer, identifier))
744 lex_error_expecting (lexer, identifier);
749 /* If the current token has the specified TYPE, skips it and returns true.
750 Otherwise, reports an error and returns false. */
752 lex_force_match (struct lexer *lexer, enum token_type type)
754 if (lex_token (lexer) == type)
761 const char *type_string = token_type_to_string (type);
764 char *s = xasprintf ("`%s'", type_string);
765 lex_error_expecting (lexer, s);
769 lex_error_expecting (lexer, token_type_to_name (type));
775 /* If the current token is a string, does nothing and returns true.
776 Otherwise, reports an error and returns false. */
778 lex_force_string (struct lexer *lexer)
780 if (lex_is_string (lexer))
784 lex_error (lexer, _("expecting string"));
789 /* If the current token is a string or an identifier, does nothing and returns
790 true. Otherwise, reports an error and returns false.
792 This is meant for use in syntactic situations where we want to encourage the
793 user to supply a quoted string, but for compatibility we also accept
794 identifiers. (One example of such a situation is file names.) Therefore,
795 the error message issued when the current token is wrong only says that a
796 string is expected and doesn't mention that an identifier would also be
799 lex_force_string_or_id (struct lexer *lexer)
801 return lex_token (lexer) == T_ID || lex_force_string (lexer);
804 /* If the current token is an integer, does nothing and returns true.
805 Otherwise, reports an error and returns false. */
807 lex_force_int (struct lexer *lexer)
809 if (lex_is_integer (lexer))
813 lex_error (lexer, _("expecting integer"));
818 /* If the current token is an integer in the range MIN...MAX (inclusive), does
819 nothing and returns true. Otherwise, reports an error and returns false.
820 If NAME is nonnull, then it is used in the error message. */
822 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
824 bool is_number = lex_is_number (lexer);
825 bool is_integer = lex_is_integer (lexer);
826 bool too_small = (is_integer ? lex_integer (lexer) < min
827 : is_number ? lex_number (lexer) < min
829 bool too_big = (is_integer ? lex_integer (lexer) > max
830 : is_number ? lex_number (lexer) > max
832 if (is_integer && !too_small && !too_big)
837 /* Weird, maybe a bug in the caller. Just report that we needed an
840 lex_error (lexer, _("Integer expected for %s."), name);
842 lex_error (lexer, _("Integer expected."));
847 lex_error (lexer, _("Expected %ld for %s."), min, name);
849 lex_error (lexer, _("Expected %ld."), min);
851 else if (min + 1 == max)
854 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
856 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
860 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
861 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
863 if (report_lower_bound && report_upper_bound)
867 _("Expected integer between %ld and %ld for %s."),
870 lex_error (lexer, _("Expected integer between %ld and %ld."),
873 else if (report_lower_bound)
878 lex_error (lexer, _("Expected non-negative integer for %s."),
881 lex_error (lexer, _("Expected non-negative integer."));
886 lex_error (lexer, _("Expected positive integer for %s."),
889 lex_error (lexer, _("Expected positive integer."));
894 lex_error (lexer, _("Expected integer %ld or greater for %s."),
897 lex_error (lexer, _("Expected integer %ld or greater."), min);
900 else if (report_upper_bound)
904 _("Expected integer less than or equal to %ld for %s."),
907 lex_error (lexer, _("Expected integer less than or equal to %ld."),
913 lex_error (lexer, _("Integer expected for %s."), name);
915 lex_error (lexer, _("Integer expected."));
921 /* If the current token is a number, does nothing and returns true.
922 Otherwise, reports an error and returns false. */
924 lex_force_num (struct lexer *lexer)
926 if (lex_is_number (lexer))
929 lex_error (lexer, _("expecting number"));
933 /* If the current token is an identifier, does nothing and returns true.
934 Otherwise, reports an error and returns false. */
936 lex_force_id (struct lexer *lexer)
938 if (lex_token (lexer) == T_ID)
941 lex_error (lexer, _("expecting identifier"));
945 /* Token accessors. */
947 /* Returns the type of LEXER's current token. */
949 lex_token (const struct lexer *lexer)
951 return lex_next_token (lexer, 0);
/* Returns the numeric value of LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_tokval (const struct lexer *lexer)
{
  double value = lex_next_tokval (lexer, 0);
  return value;
}
/* Returns the null-terminated, UTF-8 encoded string in LEXER's current token.

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value". */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  const char *text = lex_next_tokcstr (lexer, 0);
  return text;
}
978 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
979 null-terminated (but the null terminator is not included in the returned
980 substring's 'length').
982 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
983 this functions this function will always return NULL.
985 The UTF-8 encoding of the returned string is correct for variable names and
986 other identifiers. Use filename_to_utf8() to use it as a filename. Use
987 data_in() to use it in a "union value". */
989 lex_tokss (const struct lexer *lexer)
991 return lex_next_tokss (lexer, 0);
996 A value of 0 for N as an argument to any of these functions refers to the
997 current token. Lookahead is limited to the current command. Any N greater
998 than the number of tokens remaining in the current command will be treated
999 as referring to a T_ENDCMD token. */
1001 static const struct lex_token *
1002 lex_next__ (const struct lexer *lexer_, int n)
1004 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1005 struct lex_source *src = lex_source__ (lexer);
1008 return lex_source_next__ (src, n);
1011 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1016 static const struct lex_token *
1017 lex_source_next__ (const struct lex_source *src_, int n)
1019 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1020 while (lex_stage_count (&src->lookahead) <= n)
1022 if (!lex_stage_is_empty (&src->lookahead))
1024 const struct lex_token *t = lex_stage_last (&src->lookahead);
1025 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1029 lex_source_get_lookahead (src);
1032 return lex_stage_nth (&src->lookahead, n);
1035 /* Returns the "struct token" of the token N after the current one in LEXER.
1036 The returned pointer can be invalidated by pretty much any succeeding call
1037 into the lexer, although the string pointer within the returned token is
1038 only invalidated by consuming the token (e.g. with lex_get()). */
1039 const struct token *
1040 lex_next (const struct lexer *lexer, int n)
1042 return &lex_next__ (lexer, n)->token;
1045 /* Returns the type of the token N after the current one in LEXER. */
1047 lex_next_token (const struct lexer *lexer, int n)
1049 return lex_next (lexer, n)->type;
/* Returns the number in the token N positions after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *ahead = lex_next (lexer, n);
  return token_number (ahead);
}
1062 /* Returns the null-terminated string in the token N after the current one, in
1065 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1066 this functions this function will always return NULL.
1068 The UTF-8 encoding of the returned string is correct for variable names and
1069 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1070 data_in() to use it in a "union value". */
1072 lex_next_tokcstr (const struct lexer *lexer, int n)
1074 return lex_next_tokss (lexer, n).string;
1077 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1078 The string is null-terminated (but the null terminator is not included in
1079 the returned substring's 'length').
1081 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1082 tokens this functions this function will always return NULL.
1084 The UTF-8 encoding of the returned string is correct for variable names and
1085 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1086 data_in() to use it in a "union value". */
1088 lex_next_tokss (const struct lexer *lexer, int n)
1090 return lex_next (lexer, n)->string;
/* Returns the text of the syntax for the tokens from N0 ahead of the current
   one through N1 ahead of the current one, inclusive.  (With N0 and N1 both
   zero, that is the syntax for the current token alone.)  The caller must
   eventually free the returned string with free().

   The syntax is encoded in UTF-8 and is in the original form supplied to the
   lexer, so it may include comments, spaces, and new-lines if it spans
   multiple tokens.  Macro expansion, however, has already been performed. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);
  return lex_source_get_syntax__ (src, n0, n1);
}
1106 /* Returns true if the token N ahead of the current one was produced by macro
1107 expansion, false otherwise. */
1109 lex_next_is_from_macro (const struct lexer *lexer, int n)
1111 return lex_next__ (lexer, n)->macro_rep != NULL;
1115 lex_tokens_match (const struct token *actual, const struct token *expected)
1117 if (actual->type != expected->type)
1120 switch (actual->type)
1124 return actual->number == expected->number;
1127 return lex_id_match (expected->string, actual->string);
1130 return (actual->string.length == expected->string.length
1131 && !memcmp (actual->string.string, expected->string.string,
1132 actual->string.length));
1139 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1140 skips it and returns true. Otherwise, returns false.
1142 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1143 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1144 first three letters. */
1146 lex_match_phrase (struct lexer *lexer, const char *s)
1148 struct string_lexer slex;
1153 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1154 while (string_lexer_next (&slex, &token))
1156 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1157 token_uninit (&token);
1168 count_newlines (char *s, size_t length)
1173 while ((newline = memchr (s, '\n', length)) != NULL)
1176 length -= (newline + 1) - s;
1184 lex_token_get_last_line_number (const struct lex_source *src,
1185 const struct lex_token *token)
1187 if (token->first_line == 0)
1191 char *token_str = &src->buffer[token->token_pos - src->tail];
1192 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1197 count_columns (const char *s_, size_t length)
1199 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1205 for (ofs = 0; ofs < length; ofs += mblen)
1209 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1212 int width = uc_width (uc, "UTF-8");
1217 columns = ROUND_UP (columns + 1, 8);
1224 lex_token_get_first_column (const struct lex_source *src,
1225 const struct lex_token *token)
1227 return count_columns (&src->buffer[token->line_pos - src->tail],
1228 token->token_pos - token->line_pos);
1232 lex_token_get_last_column (const struct lex_source *src,
1233 const struct lex_token *token)
1235 char *start, *end, *newline;
1237 start = &src->buffer[token->line_pos - src->tail];
1238 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1239 newline = memrchr (start, '\n', end - start);
1240 if (newline != NULL)
1241 start = newline + 1;
1242 return count_columns (start, end - start);
1245 static struct msg_location
1246 lex_token_location (const struct lex_source *src,
1247 const struct lex_token *t0,
1248 const struct lex_token *t1)
1250 return (struct msg_location) {
1251 .file_name = src->reader->file_name,
1252 .first_line = t0->first_line,
1253 .last_line = lex_token_get_last_line_number (src, t1),
1254 .first_column = lex_token_get_first_column (src, t0),
1255 .last_column = lex_token_get_last_column (src, t1),
1259 static struct msg_location *
1260 lex_token_location_rw (const struct lex_source *src,
1261 const struct lex_token *t0,
1262 const struct lex_token *t1)
1264 struct msg_location location = lex_token_location (src, t0, t1);
1265 return msg_location_dup (&location);
/* Returns the location, as built by lex_token_location_rw(), that spans the
   tokens from N0 ahead of the current one in SRC through N1 ahead of it. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);

  return lex_token_location_rw (src, first, last);
}
1276 /* Returns the 1-based line number of the start of the syntax that represents
1277 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1278 if the token is drawn from a source that does not have line numbers. */
1280 lex_get_first_line_number (const struct lexer *lexer, int n)
1282 const struct lex_source *src = lex_source__ (lexer);
/* With no current source there is no token to consult, so the result is 0;
   otherwise it is the first_line recorded in the token. */
1283 return src ? lex_source_next__ (src, n)->first_line : 0;
/* lex_get_last_line_number() yields 0 when LEXER has no current source;
   otherwise it defers to lex_token_get_last_line_number() for the token N
   ahead of the current one. */
1286 /* Returns the 1-based line number of the end of the syntax that represents the
1287 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1288 token or if the token is drawn from a source that does not have line
1291 Most of the time, a single token is wholly within a single line of syntax,
1292 but there are two exceptions: a T_STRING token can be made up of multiple
1293 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1294 token can consist of a "-" on one line followed by the number on the next.
1297 lex_get_last_line_number (const struct lexer *lexer, int n)
1299 const struct lex_source *src = lex_source__ (lexer);
1300 return src ? lex_token_get_last_line_number (src,
1301 lex_source_next__ (src, n)) : 0;
1304 /* Returns the 1-based column number of the start of the syntax that represents
1305 the token N after the current one in LEXER. Returns 0 for a T_STOP
1308 Column numbers are measured according to the width of characters as shown in
1309 a typical fixed-width font, in which CJK characters have width 2 and
1310 combining characters have width 0. */
1312 lex_get_first_column (const struct lexer *lexer, int n)
1314 const struct lex_source *src = lex_source__ (lexer);
/* With no current source the result is 0; otherwise the column is computed
   by lex_token_get_first_column(). */
1315 return src ? lex_token_get_first_column (src, lex_source_next__ (src, n)) : 0;
1318 /* Returns the 1-based column number of the end of the syntax that represents
1319 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1322 Column numbers are measured according to the width of characters as shown in
1323 a typical fixed-width font, in which CJK characters have width 2 and
1324 combining characters have width 0. */
1326 lex_get_last_column (const struct lexer *lexer, int n)
1328 const struct lex_source *src = lex_source__ (lexer);
/* With no current source the result is 0; otherwise the column is computed
   by lex_token_get_last_column(). */
1329 return src ? lex_token_get_last_column (src, lex_source_next__ (src, n)) : 0;
1332 /* Returns the name of the syntax file from which the current command is drawn.
1333 Returns NULL for a T_STOP token or if the command's source does not have
1336 There is no version of this function that takes an N argument because
1337 lookahead only works to the end of a command and any given command is always
1338 within a single syntax file. */
1340 lex_get_file_name (const struct lexer *lexer)
1342 struct lex_source *src = lex_source__ (lexer);
/* The result is the reader's own file_name pointer, not a copy; it is NULL
   when LEXER has no current source. */
1343 return src == NULL ? NULL : src->reader->file_name;
1346 /* Returns a newly allocated msg_location for the syntax that represents tokens
1347 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1348 must eventually free the location (with msg_location_destroy()). */
1349 struct msg_location *
1350 lex_get_location (const struct lexer *lexer, int n0, int n1)
/* Start from the lines-only location built by lex_get_lines(), then fill in
   the first column from token N0 and the last column from token N1. */
1352 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1353 loc->first_column = lex_get_first_column (lexer, n0);
1354 loc->last_column = lex_get_last_column (lexer, n1);
1358 /* Returns a newly allocated msg_location for the syntax that represents tokens
1359 with 0-based offsets N0...N1, inclusive, from the current token. The
1360 location only covers the tokens' lines, not the columns. The caller must
1361 eventually free the location (with msg_location_destroy()). */
1362 struct msg_location *
1363 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1365 struct msg_location *loc = xmalloc (sizeof *loc);
/* The file name is passed through xstrdup_if_nonnull() rather than stored
   directly.  Members not named in the initializer, including the columns, are
   zero-initialized by the compound literal. */
1366 *loc = (struct msg_location) {
1367 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1368 .first_line = lex_get_first_line_number (lexer, n0),
1369 .last_line = lex_get_last_line_number (lexer, n1),
/* Returns the encoding recorded in the reader of LEXER's current source, or
   NULL if LEXER has no current source.  The result is the reader's own
   pointer, not a copy. */
1375 lex_get_encoding (const struct lexer *lexer)
1377 struct lex_source *src = lex_source__ (lexer);
1378 return src == NULL ? NULL : src->reader->encoding;
1381 /* Returns the syntax mode for the syntax file from which the current command is
1382 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1383 does not have line numbers.
1385 There is no version of this function that takes an N argument because
1386 lookahead only works to the end of a command and any given command is always
1387 within a single syntax file. */
1389 lex_get_syntax_mode (const struct lexer *lexer)
1391 struct lex_source *src = lex_source__ (lexer);
/* SEG_MODE_AUTO is the fallback when LEXER has no current source; otherwise
   the mode is the one stored in the source's reader. */
1392 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1395 /* Returns the error mode for the syntax file from which the current command is
1396 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1397 source does not have line numbers.
1399 There is no version of this function that takes an N argument because
1400 lookahead only works to the end of a command and any given command is always
1401 within a single syntax file. */
1403 lex_get_error_mode (const struct lexer *lexer)
1405 struct lex_source *src = lex_source__ (lexer);
/* LEX_ERROR_TERMINAL is the fallback when LEXER has no current source;
   otherwise the mode is the one stored in the source's reader. */
1406 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1409 /* If the source that LEXER is currently reading has error mode
1410 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1411 token to be read comes directly from whatever is next read from the stream.
1413 It makes sense to call this function after encountering an error in a
1414 command entered on the console, because usually the user would prefer not to
1415 have cascading errors. */
1417 lex_interactive_reset (struct lexer *lexer)
1419 struct lex_source *src = lex_source__ (lexer);
1420 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Zero every buffer position and the new-line bookkeeping, restart the
   segmenter in the mode it was already using, empty all three token stages,
   and finally seed the lookahead with an end-of-command token. */
1422 src->head = src->tail = 0;
1423 src->journal_pos = src->seg_pos = src->line_pos = 0;
1424 src->n_newlines = 0;
1425 src->suppress_next_newline = false;
1426 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1428 lex_stage_clear (&src->pp);
1429 lex_stage_clear (&src->merge);
1430 lex_stage_clear (&src->lookahead);
1431 lex_source_push_endcmd__ (src);
1435 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1437 lex_discard_rest_of_command (struct lexer *lexer)
/* NOTE(review): the loop body is not visible in this listing; presumably it
   advances LEXER to its next token on each iteration. */
1439 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1443 /* Discards all lookahead tokens in LEXER, then discards all input sources
1444 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1445 runs out of input sources. */
1447 lex_discard_noninteractive (struct lexer *lexer)
1449 struct lex_source *src = lex_source__ (lexer);
/* Empty the current source's three token stages. */
1453 lex_stage_clear (&src->pp);
1454 lex_stage_clear (&src->merge);
1455 lex_stage_clear (&src->lookahead);
/* Destroy sources one at a time, re-reading the current source with
   lex_source__() after each one, until a source with LEX_ERROR_TERMINAL is
   current or none are left. */
1457 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1458 src = lex_source__ (lexer))
1459 lex_source_destroy (src);
/* Computes how far SRC's tail may advance without discarding buffered data
   that is still referenced.  The bound starts as the smaller of journal_pos
   and line_pos and is then limited by the line_pos of the first token in the
   first non-empty stage, checking 'lookahead', then 'merge', then 'pp'.

   NOTE(review): the return for the case where every stage is empty is not
   visible in this listing. */
1464 lex_source_max_tail__ (const struct lex_source *src_)
/* The const qualifier is cast away to obtain non-const pointers to the
   stages; nothing visible here modifies SRC. */
1466 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1468 assert (src->seg_pos >= src->line_pos);
1469 size_t max_tail = MIN (src->journal_pos, src->line_pos);
1471 /* Use the oldest token also. */
1472 struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp };
1473 for (size_t i = 0; i < sizeof stages / sizeof *stages; i++)
1474 if (!lex_stage_is_empty (stages[i]))
1476 struct lex_token *first = lex_stage_first (stages[i]);
1477 assert (first->token_pos >= first->line_pos);
1478 return MIN (max_tail, first->line_pos);
/* Makes room at the head of SRC's buffer when it is full, that is, when
   head - tail >= allocated.  If lex_source_max_tail__() allows the tail to
   advance, the live bytes are moved to the front of the buffer and the tail
   is updated; the other visible path grows the buffer with x2realloc().

   NOTE(review): the 'else' lines joining these paths are not visible in this
   listing; the structure is taken from the existing comments. */
1485 lex_source_expand__ (struct lex_source *src)
1487 if (src->head - src->tail >= src->allocated)
1489 size_t max_tail = lex_source_max_tail__ (src);
1490 if (max_tail > src->tail)
1492 /* Advance the tail, freeing up room at the head. */
1493 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1494 src->head - max_tail);
1495 src->tail = max_tail;
1499 /* Buffer is completely full. Expand it. */
1500 src->buffer = x2realloc (src->buffer, &src->allocated);
1505 /* There's space available at the head of the buffer. Nothing to do. */
/* Reads more input from SRC's reader into SRC's buffer, repeating until the
   data between seg_pos and head contains a new-line.  One path visible below
   marks the reader as having reached end of input. */
1510 lex_source_read__ (struct lex_source *src)
1514 lex_source_expand__ (src);
/* SPACE is the free room after the current head; the assertion after the
   read checks that the reader's result N does not exceed it. */
1516 size_t head_ofs = src->head - src->tail;
1517 size_t space = src->allocated - head_ofs;
1518 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1519 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1521 assert (n <= space);
/* NOTE(review): the condition guarding this end-of-input path, and what it
   does after expanding the buffer, are not visible in this listing. */
1526 src->reader->eof = true;
1527 lex_source_expand__ (src);
/* Keep reading while the not-yet-segmented data lacks a new-line. */
1533 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1534 src->head - src->seg_pos));
/* Returns the lex_source at the head of LEXER's list of sources, or NULL if
   the list is empty. */
1537 static struct lex_source *
1538 lex_source__ (const struct lexer *lexer)
1540 return (ll_is_empty (&lexer->sources) ? NULL
1541 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1544 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1545 one, through N1 ahead of the current one, inclusive. (For example, if N0
1546 and N1 are both zero, this requests the syntax for the current token.) The
1547 caller must eventually free the returned string (with free()). The syntax
1548 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1549 for example, it may include comments, spaces, and new-lines if it spans
1550 multiple tokens. Macro expansion, however, has already been performed. */
1552 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1554 struct string s = DS_EMPTY_INITIALIZER;
/* NOTE(review): I is a size_t while N0 and N1 are ints, so a negative N1
   would be converted to a very large unsigned bound in the comparison.  The
   update of I is not visible in this listing. */
1555 for (size_t i = n0; i <= n1; )
1557 /* Find [I,J) as the longest sequence of tokens not produced by macro
1558 expansion, or otherwise the longest sequence expanded from a single
1560 const struct lex_token *first = lex_source_next__ (src, i);
1562 for (j = i + 1; j <= n1; j++)
1564 const struct lex_token *cur = lex_source_next__ (src, j);
1565 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1566 || first->macro_rep != cur->macro_rep)
1569 const struct lex_token *last = lex_source_next__ (src, j - 1);
1571 /* Now add the syntax for this sequence of tokens to S. */
1572 if (!ds_is_empty (&s))
1573 ds_put_byte (&s, ' ');
/* Tokens read directly from the source are copied out of SRC's buffer using
   their stream positions; tokens from a macro expansion are copied out of the
   expansion text (macro_rep) using their ofs and len members. */
1574 if (!first->macro_rep)
1576 size_t start = first->token_pos;
1577 size_t end = last->token_pos + last->token_len;
1578 ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1583 size_t start = first->ofs;
1584 size_t end = last->ofs + last->len;
1585 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1591 return ds_steal_cstr (&s);
/* Scans tokens N0...N1 (inclusive) ahead of the current one in SRC for a
   token whose macro_rep is non-null, that is, one that came from a macro
   expansion.

   NOTE(review): the return statements are not visible in this listing.  Its
   use in lex_source_get_macro_call() indicates a true/false result.  I is a
   size_t compared against the int N1, so a negative N1 would convert to a
   large unsigned bound. */
1595 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1597 for (size_t i = n0; i <= n1; i++)
1598 if (lex_source_next__ (src, i)->macro_rep)
1603 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1604 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1605 other tokens included in that range. The syntax is encoded in UTF-8 and in
1606 the original form supplied to the lexer so that, for example, it may include
1607 comments, spaces, and new-lines if it spans multiple tokens.
1609 Returns an empty string if the token range doesn't include a macro call.
1611 The caller must not modify or free the returned string. */
1612 static struct substring
1613 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1615 if (!lex_source_contains_macro_call (src, n0, n1))
1618 const struct lex_token *token0 = lex_source_next__ (src, n0);
/* MAX (n0, n1) keeps the end token from preceding the start token when N1 is
   less than N0. */
1619 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1620 size_t start = token0->token_pos;
1621 size_t end = token1->token_pos + token1->token_len;
/* The result points directly into SRC's buffer; nothing is copied. */
1623 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Composes a syntax error message about tokens N0...N1 ahead of the current
   one in SRC.  The text begins with one of several "Syntax error" prefixes,
   which may quote the offending syntax and the macro call it was expanded
   from.  It continues with ": " and FORMAT formatted with ARGS, and a period
   is appended if the text does not already end in one.  The message has
   category MSG_C_SYNTAX, severity MSG_S_ERROR, and the location returned by
   lex_source_get_location().

   NOTE(review): the conditions choosing among the prefixes, and what is done
   with the finished message, are not visible in this listing. */
1627 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1628 const char *format, va_list args)
1630 const struct lex_token *token;
/* An error at a T_ENDCMD token gets a fixed prefix instead of quoted
   syntax. */
1635 token = lex_source_next__ (src, n0);
1636 if (token->token.type == T_ENDCMD)
1637 ds_put_cstr (&s, _("Syntax error at end of command"));
1640 /* Get the syntax that caused the error. */
1641 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1643 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1646 /* Get the macro call(s) that expanded to the syntax that caused the
1649 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1656 _("Syntax error at `%s' (in expansion of `%s')"),
1659 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1664 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1667 ds_put_cstr (&s, _("Syntax error"));
1673 ds_put_cstr (&s, ": ");
1674 ds_put_vformat (&s, format, args);
1676 if (ds_last (&s) != '.')
1677 ds_put_byte (&s, '.');
1679 struct msg *m = xmalloc (sizeof *m);
1681 .category = MSG_C_SYNTAX,
1682 .severity = MSG_S_ERROR,
1683 .location = lex_source_get_location (src, n0, n1),
1684 .text = ds_steal_cstr (&s),
// Builds a syntax error message for TOKEN in SRC.  TOKEN's source text is
// taken from SRC's buffer, shortened with str_ellipsize(), and quoted after
// "Syntax error at"; the message then continues with ": " followed by
// token->token.string.string.  The message has category MSG_C_SYNTAX,
// severity MSG_S_ERROR, and a location that covers TOKEN alone.
//
// NOTE(review): what is done with the finished message is not visible in
// this listing.
1690 lex_get_error (struct lex_source *src, const struct lex_token *token)
1693 str_ellipsize (ss_buffer (&src->buffer[token->token_pos - src->tail],
1695 syntax, sizeof syntax);
1697 struct string s = DS_EMPTY_INITIALIZER;
1698 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1699 ds_put_format (&s, ": %s", token->token.string.string);
1701 struct msg *m = xmalloc (sizeof *m);
1703 .category = MSG_C_SYNTAX,
1704 .severity = MSG_S_ERROR,
1705 .location = lex_token_location_rw (src, token, token),
1706 .text = ds_steal_cstr (&s),
1711 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1712 underlying lex_reader if necessary. Returns true if a new token was added
1713 to SRC's deque, false otherwise. The caller should retry failures unless
1714 SRC's 'eof' marker was set to true indicating that there will be no more
1715 tokens from this source. */
1717 lex_source_try_get_pp (struct lex_source *src)
1719 /* Append a new token to SRC and initialize it. */
1720 struct lex_token *token = xmalloc (sizeof *token);
1721 token->token = (struct token) { .type = T_STOP };
1722 token->macro_rep = NULL;
1723 token->ref_cnt = NULL;
1724 token->line_pos = src->line_pos;
1725 token->token_pos = src->seg_pos;
/* A first line is recorded only when the reader's line_number is positive:
   it is the reader's line_number plus the new-lines counted in n_newlines.
   Otherwise first_line is 0. */
1726 if (src->reader->line_number > 0)
1727 token->first_line = src->reader->line_number + src->n_newlines;
1729 token->first_line = 0;
1731 /* Extract a segment. */
1732 const char *segment;
1733 enum segment_type seg_type;
1737 segment = &src->buffer[src->seg_pos - src->tail];
1738 seg_len = segmenter_push (&src->segmenter, segment,
1739 src->head - src->seg_pos,
1740 src->reader->eof, &seg_type);
1744 /* The segmenter needs more input to produce a segment. */
1745 assert (!src->reader->eof);
1746 lex_source_read__ (src);
1749 /* Update state based on the segment. */
1750 token->token_len = seg_len;
1751 src->seg_pos += seg_len;
1752 if (seg_type == SEG_NEWLINE)
1754 src->line_pos = src->seg_pos;
1758 /* Get a token from the segment. */
1759 enum tokenize_result result = token_from_segment (
1760 seg_type, ss_buffer (segment, seg_len), &token->token);
1762 /* If we've reached the end of a line, or the end of a command, then pass
1763 the line to the output engine as a syntax text item. */
1764 int n_lines = seg_type == SEG_NEWLINE;
/* suppress_next_newline is set when an end-of-command segment is handled and
   cleared by the next new-line segment seen while it is set.
   NOTE(review): any adjustment of n_lines in these two branches is not
   visible in this listing. */
1765 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1768 src->suppress_next_newline = true;
1770 else if (n_lines > 0 && src->suppress_next_newline)
1773 src->suppress_next_newline = false;
1775 for (int i = 0; i < n_lines; i++)
1777 /* Beginning of line. */
1778 const char *line = &src->buffer[src->journal_pos - src->tail];
1780 /* Calculate line length, including \n or \r\n end-of-line if present.
1782 We use src->head even though that may be beyond what we've actually
1783 converted to tokens (which is only through line_pos). That's because,
1784 if we're emitting the line due to SEG_END_COMMAND, we want to take the
1785 whole line through the newline, not just through the '.'. */
1786 size_t max_len = src->head - src->journal_pos;
1787 const char *newline = memchr (line, '\n', max_len);
1788 size_t line_len = newline ? newline - line + 1 : max_len;
1790 /* Calculate line length excluding end-of-line. */
1791 size_t copy_len = line_len;
1792 if (copy_len > 0 && line[copy_len - 1] == '\n')
1794 if (copy_len > 0 && line[copy_len - 1] == '\r')
1797 /* Submit the line as syntax. */
1798 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1799 xmemdup0 (line, copy_len),
/* Advance past the whole line, end-of-line included, so that the next line
   submitted starts after it. */
1802 src->journal_pos += line_len;
/* What follows handles each possible tokenizer result.  An error is passed to
   lex_get_error(), an empty result destroys the token, and a real token is
   queued in 'pp'. */
1807 case TOKENIZE_ERROR:
1808 lex_get_error (src, token);
1810 case TOKENIZE_EMPTY:
1811 lex_token_destroy (token);
1814 case TOKENIZE_TOKEN:
/* A T_STOP token is converted to T_ENDCMD before it is queued. */
1815 if (token->token.type == T_STOP)
1817 token->token.type = T_ENDCMD;
1820 lex_stage_push_last (&src->pp, token);
1826 /* Attempts to append a new token to SRC. Returns true if successful, false on
1827 failure. On failure, the end of SRC has been reached and no more tokens
1828 will be forthcoming from it.
1830 Does not make the new token available for lookahead yet; the caller must
1831 adjust SRC's 'middle' pointer to do so. */
/* NOTE(review): no 'middle' member is referenced by the code visible in this
   listing, which queues tokens in the 'pp', 'merge' and 'lookahead' stages;
   the last sentence above may be stale.  The loop around the call below is
   also not visible; presumably lex_source_try_get_pp() is retried until it
   succeeds or the source is exhausted. */
1833 lex_source_get_pp (struct lex_source *src)
1836 if (lex_source_try_get_pp (src))
/* Attempts to move tokens from 'pp' to 'merge' in SRC, expanding a macro call
   at the front of 'pp' if there is one.  With macro expansion disabled, every
   token in 'pp' is shifted to 'merge'.  Without a macro call, only the first
   token is shifted.  With a macro call, the tokens of its expansion are pushed
   onto 'merge' and the call's own tokens are removed from 'pp'.

   On the macro-call path the result is whether the expansion produced any
   tokens.  NOTE(review): the return statements on the other paths are not
   visible in this listing. */
1842 lex_source_try_get_merge (const struct lex_source *src_)
1844 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Make sure 'pp' holds at least one token before going on. */
1846 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1849 if (!settings_get_mexpand ())
1851 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1855 /* Now pass tokens one-by-one to the macro expander.
1857 In the common case where there is no macro to expand, the loop is not
1859 struct macro_call *mc;
1860 int n_call = macro_call_create (src->lexer->macros,
1861 &lex_stage_first (&src->pp)->token, &mc);
1862 for (int ofs = 1; !n_call; ofs++)
1864 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1866 /* This should not be reachable because we always get a T_ENDCMD at
1867 the end of an input file (transformed from T_STOP by
1868 lex_source_try_get_pp()) and the macro_expander should always
1869 terminate expansion on T_ENDCMD. */
/* Hand the macro expander token number OFS in 'pp' together with its raw
   source text and its location. */
1873 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1874 size_t start = t->token_pos;
1875 size_t end = t->token_pos + t->token_len;
1876 const struct macro_token mt = {
1878 .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
1880 const struct msg_location loc = lex_token_location (src, t, t);
1881 n_call = macro_call_add (mc, &mt, &loc);
1885 /* False alarm: no macro expansion after all. Use first token as
1886 lookahead. We'll retry macro expansion from the second token next
1888 macro_call_destroy (mc);
1889 lex_stage_shift (&src->merge, &src->pp, 1);
1893 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1894 are a macro call. (These are likely to be the only tokens in 'pp'.)
1896 const struct lex_token *c0 = lex_stage_first (&src->pp);
1897 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1898 struct macro_tokens expansion = { .n = 0 };
1899 struct msg_location loc = lex_token_location (src, c0, c1);
1900 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1901 macro_call_destroy (mc);
1903 /* Convert the macro expansion into syntax for possible error messages
1905 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1906 size_t *len = xnmalloc (expansion.n, sizeof *len);
1907 struct string s = DS_EMPTY_INITIALIZER;
1908 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1910 if (settings_get_mprint ())
1911 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1912 _("Macro Expansion")));
1914 /* Append the macro expansion tokens to the lookahead. */
1915 if (expansion.n > 0)
/* Every token pushed below refers to the same MACRO_REP text, and *REF_CNT
   starts out as the number of tokens in the expansion.  Each token also
   takes the position, length, line start, and first line of the whole macro
   call C0...C1. */
1917 char *macro_rep = ds_steal_cstr (&s);
1918 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1919 *ref_cnt = expansion.n;
1920 for (size_t i = 0; i < expansion.n; i++)
1922 struct lex_token *token = xmalloc (sizeof *token);
1923 *token = (struct lex_token) {
1924 .token = expansion.mts[i].token,
1925 .token_pos = c0->token_pos,
1926 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1927 .line_pos = c0->line_pos,
1928 .first_line = c0->first_line,
1929 .macro_rep = macro_rep,
1934 lex_stage_push_last (&src->merge, token);
1936 ss_dealloc (&expansion.mts[i].syntax);
1941 free (expansion.mts);
1945 /* Destroy the tokens for the call. */
1946 for (size_t i = 0; i < n_call; i++)
1947 lex_stage_pop_first (&src->pp);
/* An empty expansion added nothing to 'merge', so the result is false. */
1949 return expansion.n > 0;
1952 /* Attempts to obtain at least one new token into 'merge' in SRC.
1954 Returns true if successful, false on failure. In the latter case, SRC is
1955 exhausted and 'src->eof' is now true. */
/* NOTE(review): the loop around the call below and its exit condition are
   not visible in this listing. */
1957 lex_source_get_merge (struct lex_source *src)
1960 if (lex_source_try_get_merge (src))
1965 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1967 Returns true if successful, false on failure. In the latter case, SRC is
1968 exhausted and 'src->eof' is now true. */
1970 lex_source_get_lookahead (struct lex_source *src)
1972 struct merger m = MERGER_INIT;
/* Feed the tokens in 'merge' to the merger one at a time, fetching more
   tokens into 'merge' whenever token number I is not there yet. */
1974 for (size_t i = 0; ; i++)
1976 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1978 /* We always get a T_ENDCMD at the end of an input file
1979 (transformed from T_STOP by lex_source_try_get_pp()) and
1980 merger_add() should never return -1 on T_ENDCMD. */
1981 assert (lex_stage_is_empty (&src->merge));
1985 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
/* NOTE(review): the condition guarding the shift below is not visible in this
   listing; it moves a single token from 'merge' to 'lookahead' unchanged. */
1989 lex_stage_shift (&src->lookahead, &src->merge, 1);
1992 else if (retval > 0)
1994 /* Add a token that merges all the tokens together. */
1995 const struct lex_token *first = lex_stage_first (&src->merge);
1996 const struct lex_token *last = lex_stage_nth (&src->merge,
/* The macro-related members are kept only if the first and last merged
   tokens came from the same macro expansion. */
1998 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1999 struct lex_token *t = xmalloc (sizeof *t);
2000 *t = (struct lex_token) {
2002 .token_pos = first->token_pos,
2003 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2004 .line_pos = first->line_pos,
2005 .first_line = first->first_line,
2007 /* This works well if all the tokens were not expanded from macros,
2008 or if they came from the same macro expansion. It just gives up
2009 in the other (corner) cases. */
2010 .macro_rep = macro ? first->macro_rep : NULL,
2011 .ofs = macro ? first->ofs : 0,
2012 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2013 .ref_cnt = macro ? first->ref_cnt : NULL,
2017 lex_stage_push_last (&src->lookahead, t);
/* T now stands for the RETVAL tokens that were merged, so remove them from
   'merge'. */
2019 for (int i = 0; i < retval; i++)
2020 lex_stage_pop_first (&src->merge);
/* Appends a T_ENDCMD token to SRC's 'lookahead' stage, which must be empty.
   All of the token's other members are zero-initialized by the compound
   literal. */
2027 lex_source_push_endcmd__ (struct lex_source *src)
2029 assert (lex_stage_is_empty (&src->lookahead));
2030 struct lex_token *token = xmalloc (sizeof *token);
2031 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2032 lex_stage_push_last (&src->lookahead, token);
/* Allocates a new lex_source whose segmenter is initialized in READER's
   syntax mode, then seeds its lookahead with a T_ENDCMD token.

   NOTE(review): the other member initializers and the return statement are
   not visible in this listing. */
2035 static struct lex_source *
2036 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2038 struct lex_source *src = xmalloc (sizeof *src);
2039 *src = (struct lex_source) {
2041 .segmenter = segmenter_init (reader->syntax, false),
2045 lex_source_push_endcmd__ (src);
/* Destroys SRC.  It invokes the reader class's destroy callback if there is
   one, uninitializes the three token stages, and unlinks SRC from the list it
   is on.

   NOTE(review): the lines that use FILE_NAME and ENCODING and that release
   SRC's own storage are not visible in this listing. */
2051 lex_source_destroy (struct lex_source *src)
/* Save these before the reader may be destroyed below. */
2053 char *file_name = src->reader->file_name;
2054 char *encoding = src->reader->encoding;
2055 if (src->reader->class->destroy != NULL)
2056 src->reader->class->destroy (src->reader);
2060 lex_stage_uninit (&src->pp);
2061 lex_stage_uninit (&src->merge);
2062 lex_stage_uninit (&src->lookahead);
2063 ll_remove (&src->ll);
/* A lex_reader that reads from a u8_istream.  lex_file_reader_cast() converts
   a pointer to the embedded 'reader' member back to the enclosing struct. */
2067 struct lex_file_reader
2069 struct lex_reader reader;
2070 struct u8_istream *istream;
/* Declared here so that lex_reader_for_file() can refer to it; the definition
   appears after the functions that implement the class. */
2073 static struct lex_reader_class lex_file_reader_class;
2075 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2076 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2077 ENCODING, which should take one of the forms accepted by
2078 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2079 mode of the new reader, respectively.
2081 Returns a null pointer if FILE_NAME cannot be opened. */
2083 lex_reader_for_file (const char *file_name, const char *encoding,
2084 enum segmenter_mode syntax,
2085 enum lex_error_mode error)
2087 struct lex_file_reader *r;
2088 struct u8_istream *istream;
/* The name "-" wraps the existing standard input descriptor; any other name
   is opened read-only. */
2090 istream = (!strcmp(file_name, "-")
2091 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2092 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2093 if (istream == NULL)
2095 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
/* The reader keeps its own copies of FILE_NAME and (if non-null) ENCODING,
   and its line numbering starts at 1. */
2099 r = xmalloc (sizeof *r);
2100 lex_reader_init (&r->reader, &lex_file_reader_class);
2101 r->reader.syntax = syntax;
2102 r->reader.error = error;
2103 r->reader.file_name = xstrdup (file_name);
2104 r->reader.encoding = xstrdup_if_nonnull (encoding);
2105 r->reader.line_number = 1;
2106 r->istream = istream;
/* Converts R, which must point to the 'reader' member of a lex_file_reader,
   to a pointer to that enclosing lex_file_reader. */
2111 static struct lex_file_reader *
2112 lex_file_reader_cast (struct lex_reader *r)
2114 return UP_CAST (r, struct lex_file_reader, reader);
/* Read function for file readers.  It reads into BUF from R_'s u8_istream
   with a limit of N and reports a read error through msg() with the file name
   and strerror (errno).  PROMPT_STYLE is unused.

   NOTE(review): the test that detects a read error and the return statement
   are not visible in this listing. */
2118 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2119 enum prompt_style prompt_style UNUSED)
2121 struct lex_file_reader *r = lex_file_reader_cast (r_);
2122 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2125 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Close function for file readers.  A stream that is not standard input is
   closed with u8_istream_close(), and a failure to close is reported through
   msg().

   NOTE(review): the 'else' line is not visible in this listing; the
   u8_istream_free() call appears to be the alternative taken for standard
   input.  Any release of R itself is also not visible. */
2132 lex_file_close (struct lex_reader *r_)
2134 struct lex_file_reader *r = lex_file_reader_cast (r_);
2136 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2138 if (u8_istream_close (r->istream) != 0)
2139 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2142 u8_istream_free (r->istream);
/* The lex_reader_class that lex_reader_for_file() installs in its readers.
   NOTE(review): the initializer is not visible in this listing; presumably it
   lists lex_file_read and lex_file_close. */
2147 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that reads from an in-memory string.  lex_string_read() uses
   members named 's' and 'offset', whose declarations are not visible in this
   listing. */
2153 struct lex_string_reader
2155 struct lex_reader reader;
/* Declared here so that the constructor below can refer to it before its
   definition. */
2160 static struct lex_reader_class lex_string_reader_class;
2162 /* Creates and returns a new lex_reader for the contents of S, which must be
2163 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2164 with ss_dealloc() when it is closed. */
2166 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2168 struct lex_string_reader *r;
/* String readers are given syntax mode SEG_MODE_AUTO and their own copy of
   ENCODING (if non-null).  NOTE(review): the lines that store S and return
   the reader are not visible in this listing. */
2170 r = xmalloc (sizeof *r);
2171 lex_reader_init (&r->reader, &lex_string_reader_class);
2172 r->reader.syntax = SEG_MODE_AUTO;
2173 r->reader.encoding = xstrdup_if_nonnull (encoding);
2180 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2181 which must be encoded in ENCODING. The caller retains ownership of S. */
2183 lex_reader_for_string (const char *s, const char *encoding)
/* Copy S first, so that the "nocopy" constructor takes ownership of the copy
   rather than of the caller's string. */
2185 struct substring ss;
2186 ss_alloc_substring (&ss, ss_cstr (s));
2187 return lex_reader_for_substring_nocopy (ss, encoding);
2190 /* Formats FORMAT as a printf()-like format string and creates and returns a
2191 new lex_reader for the formatted result. */
2193 lex_reader_for_format (const char *format, const char *encoding, ...)
2195 struct lex_reader *r;
/* The variable arguments follow ENCODING, the last named parameter, not
   FORMAT.  The string returned by xvasprintf() is handed to the "nocopy"
   constructor, which takes ownership of it. */
2198 va_start (args, encoding);
2199 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Converts R, which must point to the 'reader' member of a lex_string_reader,
   to a pointer to that enclosing lex_string_reader. */
2205 static struct lex_string_reader *
2206 lex_string_reader_cast (struct lex_reader *r)
2208 return UP_CAST (r, struct lex_string_reader, reader);
/* Read function for string readers.  It copies into BUF the next CHUNK bytes
   of the reader's string starting at its current offset, where CHUNK is the
   smaller of N and the number of bytes remaining.  PROMPT_STYLE is unused.

   NOTE(review): the update of the offset and the return statement are not
   visible in this listing. */
2212 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2213 enum prompt_style prompt_style UNUSED)
2215 struct lex_string_reader *r = lex_string_reader_cast (r_);
2218 chunk = MIN (n, r->s.length - r->offset);
2219 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string readers.
   NOTE(review): only the cast is visible in this listing.  According to the
   comment on lex_reader_for_substring_nocopy(), the string is freed with
   ss_dealloc() when the reader is closed. */
2226 lex_string_close (struct lex_reader *r_)
2228 struct lex_string_reader *r = lex_string_reader_cast (r_);
2234 static struct lex_reader_class lex_string_reader_class =