1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/macro.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
71 src->tail <= line_pos <= token_pos <= src->head. */
72 size_t token_pos; /* Start of token. */
73 size_t token_len; /* Length of source for token in bytes. */
74 size_t line_pos; /* Start of line containing token_pos. */
75 int first_line; /* Line number at token_pos. */
77 /* For a token obtained through macro expansion, this is just this token.
79 For a token obtained through the lexer in an ordinary way, these are
81 char *macro_rep; /* The whole macro expansion. */
82 size_t ofs; /* Offset of this token in macro_rep. */
83 size_t len; /* Length of this token in macro_rep. */
84 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
88 lex_token_destroy (struct lex_token *t)
90 token_uninit (&t->token);
93 assert (*t->ref_cnt > 0);
103 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
108 struct lex_token **tokens;
111 static void lex_stage_clear (struct lex_stage *);
112 static void lex_stage_uninit (struct lex_stage *);
114 static size_t lex_stage_count (const struct lex_stage *);
115 static bool lex_stage_is_empty (const struct lex_stage *);
117 static struct lex_token *lex_stage_last (struct lex_stage *);
118 static struct lex_token *lex_stage_first (struct lex_stage *);
119 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
121 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
122 static void lex_stage_pop_first (struct lex_stage *);
124 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
127 /* Deletes all the tokens from STAGE. */
129 lex_stage_clear (struct lex_stage *stage)
131 while (!deque_is_empty (&stage->deque))
132 lex_stage_pop_first (stage);
135 /* Deletes all the tokens from STAGE and frees storage for the deque. */
137 lex_stage_uninit (struct lex_stage *stage)
139 lex_stage_clear (stage);
140 free (stage->tokens);
143 /* Returns true if STAGE contains no tokens, otherwise false. */
145 lex_stage_is_empty (const struct lex_stage *stage)
147 return deque_is_empty (&stage->deque);
150 /* Returns the number of tokens in STAGE. */
152 lex_stage_count (const struct lex_stage *stage)
154 return deque_count (&stage->deque);
157 /* Returns the last token in STAGE, which must be nonempty. The last token is
158 the one accessed with the greatest lookahead. */
159 static struct lex_token *
160 lex_stage_last (struct lex_stage *stage)
162 return stage->tokens[deque_front (&stage->deque, 0)];
165 /* Returns the first token in STAGE, which must be nonempty.
166 The first token is the one accessed with the least lookahead. */
167 static struct lex_token *
168 lex_stage_first (struct lex_stage *stage)
170 return lex_stage_nth (stage, 0);
173 /* Returns the token at the given INDEX in STAGE. The first token (with the least
174 lookahead) is 0, the second token is 1, and so on. There must be at least
175 INDEX + 1 tokens in STAGE. */
176 static struct lex_token *
177 lex_stage_nth (struct lex_stage *stage, size_t index)
179 return stage->tokens[deque_back (&stage->deque, index)];
182 /* Adds TOKEN so that it becomes the last token in STAGE. */
184 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
186 if (deque_is_full (&stage->deque))
187 stage->tokens = deque_expand (&stage->deque, stage->tokens,
188 sizeof *stage->tokens);
189 stage->tokens[deque_push_front (&stage->deque)] = token;
192 /* Removes the first token from STAGE and uninitializes it. */
194 lex_stage_pop_first (struct lex_stage *stage)
196 lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
199 /* Removes the first N tokens from SRC, appending them to DST as the last
202 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
204 for (size_t i = 0; i < n; i++)
206 lex_stage_push_last (dst, lex_stage_first (src));
207 deque_pop_back (&src->deque);
211 /* A source of tokens, corresponding to a syntax file.
213 This is conceptually a lex_reader wrapped with everything needed to convert
214 its UTF-8 bytes into tokens. */
217 struct ll ll; /* In lexer's list of sources. */
218 struct lex_reader *reader;
220 struct segmenter segmenter;
221 bool eof; /* True if T_STOP was read from 'reader'. */
223 /* Buffer of UTF-8 bytes. */
225 size_t allocated; /* Number of bytes allocated. */
226 size_t tail; /* &buffer[0] offset into UTF-8 source. */
227 size_t head; /* &buffer[head - tail] offset into source. */
229 /* Positions in source file, tail <= pos <= head for each member here. */
230 size_t journal_pos; /* First byte not yet output to journal. */
231 size_t seg_pos; /* First byte not yet scanned as token. */
232 size_t line_pos; /* First byte of line containing seg_pos. */
234 int n_newlines; /* Number of new-lines up to seg_pos. */
235 bool suppress_next_newline;
239 This is a pipeline with the following stages. Each token eventually
240 made available to the parser passes through each of these stages. The stages
241 are named after the processing that happens in each one.
243 Initially, tokens come from the segmenter and scanner to 'pp':
245 - pp: Tokens that need to pass through the macro preprocessor to end up
248 - merge: Tokens that need to pass through scan_merge() to end up in
251 - lookahead: Tokens available to the client for parsing. */
253 struct lex_stage merge;
254 struct lex_stage lookahead;
257 static struct lex_source *lex_source_create (struct lexer *,
258 struct lex_reader *);
259 static void lex_source_destroy (struct lex_source *);
264 struct ll_list sources; /* Contains "struct lex_source"s. */
265 struct macro_set *macros;
268 static struct lex_source *lex_source__ (const struct lexer *);
269 static char *lex_source_get_syntax__ (const struct lex_source *,
271 static const struct lex_token *lex_next__ (const struct lexer *, int n);
272 static void lex_source_push_endcmd__ (struct lex_source *);
274 static bool lex_source_get_lookahead (struct lex_source *);
275 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
276 const char *format, va_list)
277 PRINTF_FORMAT (4, 0);
278 static const struct lex_token *lex_source_next__ (const struct lex_source *,
281 /* Initializes READER with the specified CLASS and otherwise some reasonable
282 defaults. The caller should fill in the other members as desired. */
284 lex_reader_init (struct lex_reader *reader,
285 const struct lex_reader_class *class)
287 reader->class = class;
288 reader->syntax = SEG_MODE_AUTO;
289 reader->error = LEX_ERROR_CONTINUE;
290 reader->file_name = NULL;
291 reader->encoding = NULL;
292 reader->line_number = 0;
296 /* Frees any file name already in READER and replaces it by a copy of
297 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
299 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
301 free (reader->file_name);
302 reader->file_name = xstrdup_if_nonnull (file_name);
305 /* Creates and returns a new lexer. */
309 struct lexer *lexer = xmalloc (sizeof *lexer);
310 *lexer = (struct lexer) {
311 .sources = LL_INITIALIZER (lexer->sources),
312 .macros = macro_set_create (),
317 /* Destroys LEXER. */
319 lex_destroy (struct lexer *lexer)
323 struct lex_source *source, *next;
325 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
326 lex_source_destroy (source);
327 macro_set_destroy (lexer->macros);
332 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
333 same name. Takes ownership of M. */
335 lex_define_macro (struct lexer *lexer, struct macro *m)
337 macro_set_add (lexer->macros, m);
340 /* Inserts READER into LEXER so that the next token read by LEXER comes from
341 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
344 lex_include (struct lexer *lexer, struct lex_reader *reader)
346 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
347 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
350 /* Appends READER to LEXER, so that it will be read after all other current
351 readers have already been read. */
353 lex_append (struct lexer *lexer, struct lex_reader *reader)
355 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
360 /* Advances LEXER to the next token, consuming the current token. */
362 lex_get (struct lexer *lexer)
364 struct lex_source *src;
366 src = lex_source__ (lexer);
370 if (!lex_stage_is_empty (&src->lookahead))
371 lex_stage_pop_first (&src->lookahead);
373 while (lex_stage_is_empty (&src->lookahead))
374 if (!lex_source_get_lookahead (src))
376 lex_source_destroy (src);
377 src = lex_source__ (lexer);
383 /* Issuing errors. */
385 /* Prints a syntax error message containing the current token and the
386 message formatted from FORMAT and its arguments (if FORMAT is non-null). */
388 lex_error (struct lexer *lexer, const char *format, ...)
392 va_start (args, format);
393 lex_next_error_valist (lexer, 0, 0, format, args);
397 /* Prints a syntax error message containing the current token and the
398 message formatted from FORMAT and ARGS (if FORMAT is non-null). */
400 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
402 lex_next_error_valist (lexer, 0, 0, format, args);
405 /* Like lex_error(), but the message refers to tokens N0 through N1 ahead of
406 the current token; lex_error() is the special case N0 = N1 = 0. */
408 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
412 va_start (args, format);
413 lex_next_error_valist (lexer, n0, n1, format, args);
417 /* Prints a syntax error message saying that one of the strings provided as
418 varargs, up to the first NULL, is expected. */
420 (lex_error_expecting) (struct lexer *lexer, ...)
424 va_start (args, lexer);
425 lex_error_expecting_valist (lexer, args);
429 /* Prints a syntax error message saying that one of the options provided in
430 ARGS, up to the first NULL, is expected. */
432 lex_error_expecting_valist (struct lexer *lexer, va_list args)
434 enum { MAX_OPTIONS = 9 };
435 const char *options[MAX_OPTIONS];
437 while (n < MAX_OPTIONS)
439 const char *option = va_arg (args, const char *);
443 options[n++] = option;
445 lex_error_expecting_array (lexer, options, n);
449 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
454 lex_error (lexer, NULL);
458 lex_error (lexer, _("expecting %s"), options[0]);
462 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
466 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
471 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
472 options[0], options[1], options[2], options[3]);
476 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
477 options[0], options[1], options[2], options[3], options[4]);
481 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
482 options[0], options[1], options[2], options[3], options[4],
487 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
488 options[0], options[1], options[2], options[3], options[4],
489 options[5], options[6]);
493 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
494 options[0], options[1], options[2], options[3], options[4],
495 options[5], options[6], options[7]);
499 lex_error (lexer, NULL);
503 /* Reports an error to the effect that subcommand SBC may only be specified
506 This function does not take a lexer as an argument or use lex_error(),
507 because the result would ordinarily just be redundant: "Syntax error at
508 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
509 not help the user find the error. */
511 lex_sbc_only_once (const char *sbc)
513 msg (SE, _("Subcommand %s may only be specified once."), sbc);
516 /* Reports an error to the effect that subcommand SBC is missing.
518 This function does not take a lexer as an argument or use lex_error(),
519 because a missing subcommand can normally be detected only after the whole
520 command has been parsed, and so lex_error() would always report "Syntax
521 error at end of command", which does not help the user find the error. */
523 lex_sbc_missing (const char *sbc)
525 msg (SE, _("Required subcommand %s was not specified."), sbc);
528 /* Reports an error to the effect that specification SPEC may only be specified
529 once within subcommand SBC. */
531 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
533 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
537 /* Reports an error to the effect that specification SPEC is missing within
540 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
542 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
546 /* Prints a syntax error message for tokens N0 through N1 ahead of the current
547 one, with the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
549 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
550 const char *format, va_list args)
552 struct lex_source *src = lex_source__ (lexer);
555 lex_source_error_valist (src, n0, n1, format, args);
561 ds_put_format (&s, _("Syntax error at end of input"));
564 ds_put_cstr (&s, ": ");
565 ds_put_vformat (&s, format, args);
567 ds_put_byte (&s, '.');
568 msg (SE, "%s", ds_cstr (&s));
573 /* Checks that we're at end of command.
574 If so, returns a successful command completion code.
575 If not, flags a syntax error and returns an error command
578 lex_end_of_command (struct lexer *lexer)
580 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
582 lex_error (lexer, _("expecting end of command"));
589 /* Token testing functions. */
591 /* Returns true if the current token is a number. */
593 lex_is_number (const struct lexer *lexer)
595 return lex_next_is_number (lexer, 0);
598 /* Returns true if the current token is a string. */
600 lex_is_string (const struct lexer *lexer)
602 return lex_next_is_string (lexer, 0);
605 /* Returns the value of the current token, which must be a
606 floating point number. */
608 lex_number (const struct lexer *lexer)
610 return lex_next_number (lexer, 0);
613 /* Returns true iff the current token is an integer. */
615 lex_is_integer (const struct lexer *lexer)
617 return lex_next_is_integer (lexer, 0);
620 /* Returns the value of the current token, which must be an
623 lex_integer (const struct lexer *lexer)
625 return lex_next_integer (lexer, 0);
628 /* Token testing functions with lookahead.
630 A value of 0 for N as an argument to any of these functions refers to the
631 current token. Lookahead is limited to the current command. Any N greater
632 than the number of tokens remaining in the current command will be treated
633 as referring to a T_ENDCMD token. */
635 /* Returns true if the token N ahead of the current token is a number. */
637 lex_next_is_number (const struct lexer *lexer, int n)
639 return token_is_number (lex_next (lexer, n));
642 /* Returns true if the token N ahead of the current token is a string. */
644 lex_next_is_string (const struct lexer *lexer, int n)
646 return token_is_string (lex_next (lexer, n));
649 /* Returns the value of the token N ahead of the current token, which must be a
650 floating point number. */
652 lex_next_number (const struct lexer *lexer, int n)
654 return token_number (lex_next (lexer, n));
657 /* Returns true if the token N ahead of the current token is an integer. */
659 lex_next_is_integer (const struct lexer *lexer, int n)
661 return token_is_integer (lex_next (lexer, n));
664 /* Returns the value of the token N ahead of the current token, which must be
667 lex_next_integer (const struct lexer *lexer, int n)
669 return token_integer (lex_next (lexer, n));
672 /* Token matching functions. */
674 /* If the current token has the specified TYPE, skips it and returns true.
675 Otherwise, returns false. */
677 lex_match (struct lexer *lexer, enum token_type type)
679 if (lex_token (lexer) == type)
688 /* If the current token matches IDENTIFIER, skips it and returns true.
689 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
692 IDENTIFIER must be an ASCII string. */
694 lex_match_id (struct lexer *lexer, const char *identifier)
696 return lex_match_id_n (lexer, identifier, 3);
699 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
700 may be abbreviated to its first N letters. Otherwise, returns false.
702 IDENTIFIER must be an ASCII string. */
704 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
706 if (lex_token (lexer) == T_ID
707 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
716 /* If the current token is integer X, skips it and returns true. Otherwise,
719 lex_match_int (struct lexer *lexer, int x)
721 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
730 /* Forced matches. */
732 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
733 abbreviated to its first 3 letters. Otherwise, reports an error and returns
736 IDENTIFIER must be an ASCII string. */
738 lex_force_match_id (struct lexer *lexer, const char *identifier)
740 if (lex_match_id (lexer, identifier))
744 lex_error_expecting (lexer, identifier);
749 /* If the current token has the specified TYPE, skips it and returns true.
750 Otherwise, reports an error and returns false. */
752 lex_force_match (struct lexer *lexer, enum token_type type)
754 if (lex_token (lexer) == type)
761 const char *type_string = token_type_to_string (type);
764 char *s = xasprintf ("`%s'", type_string);
765 lex_error_expecting (lexer, s);
769 lex_error_expecting (lexer, token_type_to_name (type));
775 /* If the current token is a string, does nothing and returns true.
776 Otherwise, reports an error and returns false. */
778 lex_force_string (struct lexer *lexer)
780 if (lex_is_string (lexer))
784 lex_error (lexer, _("expecting string"));
789 /* If the current token is a string or an identifier, does nothing and returns
790 true. Otherwise, reports an error and returns false.
792 This is meant for use in syntactic situations where we want to encourage the
793 user to supply a quoted string, but for compatibility we also accept
794 identifiers. (One example of such a situation is file names.) Therefore,
795 the error message issued when the current token is wrong only says that a
796 string is expected and doesn't mention that an identifier would also be
799 lex_force_string_or_id (struct lexer *lexer)
801 return lex_token (lexer) == T_ID || lex_force_string (lexer);
804 /* If the current token is an integer, does nothing and returns true.
805 Otherwise, reports an error and returns false. */
807 lex_force_int (struct lexer *lexer)
809 if (lex_is_integer (lexer))
813 lex_error (lexer, _("expecting integer"));
818 /* If the current token is an integer in the range MIN...MAX (inclusive), does
819 nothing and returns true. Otherwise, reports an error and returns false.
820 If NAME is nonnull, then it is used in the error message. */
822 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
824 bool is_integer = lex_is_integer (lexer);
825 bool too_small = is_integer && lex_integer (lexer) < min;
826 bool too_big = is_integer && lex_integer (lexer) > max;
827 if (is_integer && !too_small && !too_big)
832 /* Weird, maybe a bug in the caller. Just report that we needed an
835 lex_error (lexer, _("Integer expected for %s."), name);
837 lex_error (lexer, _("Integer expected."));
842 lex_error (lexer, _("Expected %ld for %s."), min, name);
844 lex_error (lexer, _("Expected %ld."), min);
846 else if (min + 1 == max)
849 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
851 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
855 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
856 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
858 if (report_lower_bound && report_upper_bound)
862 _("Expected integer between %ld and %ld for %s."),
865 lex_error (lexer, _("Expected integer between %ld and %ld."),
868 else if (report_lower_bound)
873 lex_error (lexer, _("Expected non-negative integer for %s."),
876 lex_error (lexer, _("Expected non-negative integer."));
881 lex_error (lexer, _("Expected positive integer for %s."),
884 lex_error (lexer, _("Expected positive integer."));
887 else if (report_upper_bound)
891 _("Expected integer less than or equal to %ld for %s."),
894 lex_error (lexer, _("Expected integer less than or equal to %ld."),
900 lex_error (lexer, _("Integer expected for %s."), name);
902 lex_error (lexer, _("Integer expected."));
908 /* If the current token is a number, does nothing and returns true.
909 Otherwise, reports an error and returns false. */
911 lex_force_num (struct lexer *lexer)
913 if (lex_is_number (lexer))
916 lex_error (lexer, _("expecting number"));
920 /* If the current token is an identifier, does nothing and returns true.
921 Otherwise, reports an error and returns false. */
923 lex_force_id (struct lexer *lexer)
925 if (lex_token (lexer) == T_ID)
928 lex_error (lexer, _("expecting identifier"));
932 /* Token accessors. */
934 /* Returns the type of LEXER's current token. */
936 lex_token (const struct lexer *lexer)
938 return lex_next_token (lexer, 0);
941 /* Returns the number in LEXER's current token.
943 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
944 tokens this function will always return zero. */
946 lex_tokval (const struct lexer *lexer)
948 return lex_next_tokval (lexer, 0);
951 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
953 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
954 this function will always return NULL.
956 The UTF-8 encoding of the returned string is correct for variable names and
957 other identifiers. Use filename_to_utf8() to use it as a filename. Use
958 data_in() to use it in a "union value". */
960 lex_tokcstr (const struct lexer *lexer)
962 return lex_next_tokcstr (lexer, 0);
965 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
966 null-terminated (but the null terminator is not included in the returned
967 substring's 'length').
969 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
970 this function will always return NULL.
972 The UTF-8 encoding of the returned string is correct for variable names and
973 other identifiers. Use filename_to_utf8() to use it as a filename. Use
974 data_in() to use it in a "union value". */
976 lex_tokss (const struct lexer *lexer)
978 return lex_next_tokss (lexer, 0);
983 A value of 0 for N as an argument to any of these functions refers to the
984 current token. Lookahead is limited to the current command. Any N greater
985 than the number of tokens remaining in the current command will be treated
986 as referring to a T_ENDCMD token. */
988 static const struct lex_token *
989 lex_next__ (const struct lexer *lexer_, int n)
991 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
992 struct lex_source *src = lex_source__ (lexer);
995 return lex_source_next__ (src, n);
998 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1003 static const struct lex_token *
1004 lex_source_next__ (const struct lex_source *src_, int n)
1006 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1007 while (lex_stage_count (&src->lookahead) <= n)
1009 if (!lex_stage_is_empty (&src->lookahead))
1011 const struct lex_token *t = lex_stage_last (&src->lookahead);
1012 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1016 lex_source_get_lookahead (src);
1019 return lex_stage_nth (&src->lookahead, n);
1022 /* Returns the "struct token" of the token N after the current one in LEXER.
1023 The returned pointer can be invalidated by pretty much any succeeding call
1024 into the lexer, although the string pointer within the returned token is
1025 only invalidated by consuming the token (e.g. with lex_get()). */
1026 const struct token *
1027 lex_next (const struct lexer *lexer, int n)
1029 return &lex_next__ (lexer, n)->token;
1032 /* Returns the type of the token N after the current one in LEXER. */
1034 lex_next_token (const struct lexer *lexer, int n)
1036 return lex_next (lexer, n)->type;
1039 /* Returns the number in the token N after the current one in LEXER.
1041 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1042 tokens this function will always return zero. */
1044 lex_next_tokval (const struct lexer *lexer, int n)
1046 return token_number (lex_next (lexer, n));
1049 /* Returns the null-terminated string in the token N after the current one, in
1052 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1053 this function will always return NULL.
1055 The UTF-8 encoding of the returned string is correct for variable names and
1056 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1057 data_in() to use it in a "union value". */
1059 lex_next_tokcstr (const struct lexer *lexer, int n)
1061 return lex_next_tokss (lexer, n).string;
1064 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1065 The string is null-terminated (but the null terminator is not included in
1066 the returned substring's 'length').
1068 Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings. For other
1069 tokens this function will always return NULL.
1071 The UTF-8 encoding of the returned string is correct for variable names and
1072 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1073 data_in() to use it in a "union value". */
1075 lex_next_tokss (const struct lexer *lexer, int n)
1077 return lex_next (lexer, n)->string;
1080 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1081 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1082 are both zero, this requests the syntax for the current token.) The caller
1083 must eventually free the returned string (with free()). The syntax is
1084 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1085 example, it may include comments, spaces, and new-lines if it spans multiple
1086 tokens. Macro expansion, however, has already been performed. */
1088 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1090 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1093 /* Returns true if the token N ahead of the current one was produced by macro
1094 expansion, false otherwise. */
1096 lex_next_is_from_macro (const struct lexer *lexer, int n)
1098 return lex_next__ (lexer, n)->macro_rep != NULL;
1102 lex_tokens_match (const struct token *actual, const struct token *expected)
1104 if (actual->type != expected->type)
1107 switch (actual->type)
1111 return actual->number == expected->number;
1114 return lex_id_match (expected->string, actual->string);
1117 return (actual->string.length == expected->string.length
1118 && !memcmp (actual->string.string, expected->string.string,
1119 actual->string.length));
1126 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1127 skips it and returns true. Otherwise, returns false.
1129 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1130 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1131 first three letters. */
1133 lex_match_phrase (struct lexer *lexer, const char *s)
1135 struct string_lexer slex;
1140 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1141 while (string_lexer_next (&slex, &token))
1143 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1144 token_uninit (&token);
1155 count_newlines (char *s, size_t length)
1160 while ((newline = memchr (s, '\n', length)) != NULL)
1163 length -= (newline + 1) - s;
1171 lex_token_get_last_line_number (const struct lex_source *src,
1172 const struct lex_token *token)
1174 if (token->first_line == 0)
1178 char *token_str = &src->buffer[token->token_pos - src->tail];
1179 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1184 count_columns (const char *s_, size_t length)
1186 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1192 for (ofs = 0; ofs < length; ofs += mblen)
1196 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1199 int width = uc_width (uc, "UTF-8");
1204 columns = ROUND_UP (columns + 1, 8);
1211 lex_token_get_first_column (const struct lex_source *src,
1212 const struct lex_token *token)
1214 return count_columns (&src->buffer[token->line_pos - src->tail],
1215 token->token_pos - token->line_pos);
1219 lex_token_get_last_column (const struct lex_source *src,
1220 const struct lex_token *token)
1222 char *start, *end, *newline;
1224 start = &src->buffer[token->line_pos - src->tail];
1225 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1226 newline = memrchr (start, '\n', end - start);
1227 if (newline != NULL)
1228 start = newline + 1;
1229 return count_columns (start, end - start);
1232 static struct msg_location
1233 lex_token_location (const struct lex_source *src,
1234 const struct lex_token *t0,
1235 const struct lex_token *t1)
1237 return (struct msg_location) {
1238 .file_name = src->reader->file_name,
1239 .first_line = t0->first_line,
1240 .last_line = lex_token_get_last_line_number (src, t1),
1241 .first_column = lex_token_get_first_column (src, t0),
1242 .last_column = lex_token_get_last_column (src, t1),
1246 static struct msg_location *
1247 lex_token_location_rw (const struct lex_source *src,
1248 const struct lex_token *t0,
1249 const struct lex_token *t1)
1251 struct msg_location location = lex_token_location (src, t0, t1);
1252 return msg_location_dup (&location);
1255 static struct msg_location *
1256 lex_source_get_location (const struct lex_source *src, int n0, int n1)
1258 return lex_token_location_rw (src,
1259 lex_source_next__ (src, n0),
1260 lex_source_next__ (src, n1));
1263 /* Returns the 1-based line number of the start of the syntax that represents
1264 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1265 if the token is drawn from a source that does not have line numbers. */
1267 lex_get_first_line_number (const struct lexer *lexer, int n)
1269 const struct lex_source *src = lex_source__ (lexer);
1270 return src ? lex_source_next__ (src, n)->first_line : 0;
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token, if the token is drawn from a source that does not have line numbers,
   or if LEXER has no current source.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP token
   or if LEXER has no current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if LEXER has no current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1319 /* Returns the name of the syntax file from which the current command is drawn.
1320 Returns NULL for a T_STOP token or if the command's source does not have
1323 There is no version of this function that takes an N argument because
1324 lookahead only works to the end of a command and any given command is always
1325 within a single syntax file. */
1327 lex_get_file_name (const struct lexer *lexer)
1329 struct lex_source *src = lex_source__ (lexer);
1330 return src == NULL ? NULL : src->reader->file_name;
1333 /* Returns a newly allocated msg_location for the syntax that represents tokens
1334 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1335 must eventually free the location (with msg_location_destroy()). */
1336 struct msg_location *
1337 lex_get_location (const struct lexer *lexer, int n0, int n1)
1339 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1340 loc->first_column = lex_get_first_column (lexer, n0);
1341 loc->last_column = lex_get_last_column (lexer, n1);
1345 /* Returns a newly allocated msg_location for the syntax that represents tokens
1346 with 0-based offsets N0...N1, inclusive, from the current token. The
1347 location only covers the tokens' lines, not the columns. The caller must
1348 eventually free the location (with msg_location_destroy()). */
1349 struct msg_location *
1350 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1352 struct msg_location *loc = xmalloc (sizeof *loc);
1353 *loc = (struct msg_location) {
1354 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1355 .first_line = lex_get_first_line_number (lexer, n0),
1356 .last_line = lex_get_last_line_number (lexer, n1),
1362 lex_get_encoding (const struct lexer *lexer)
1364 struct lex_source *src = lex_source__ (lexer);
1365 return src == NULL ? NULL : src->reader->encoding;
1368 /* Returns the syntax mode for the syntax file from which the current drawn is
1369 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1370 does not have line numbers.
1372 There is no version of this function that takes an N argument because
1373 lookahead only works to the end of a command and any given command is always
1374 within a single syntax file. */
1376 lex_get_syntax_mode (const struct lexer *lexer)
1378 struct lex_source *src = lex_source__ (lexer);
1379 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1382 /* Returns the error mode for the syntax file from which the current drawn is
1383 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1384 source does not have line numbers.
1386 There is no version of this function that takes an N argument because
1387 lookahead only works to the end of a command and any given command is always
1388 within a single syntax file. */
1390 lex_get_error_mode (const struct lexer *lexer)
1392 struct lex_source *src = lex_source__ (lexer);
1393 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1396 /* If the source that LEXER is currently reading has error mode
1397 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1398 token to be read comes directly from whatever is next read from the stream.
1400 It makes sense to call this function after encountering an error in a
1401 command entered on the console, because usually the user would prefer not to
1402 have cascading errors. */
1404 lex_interactive_reset (struct lexer *lexer)
1406 struct lex_source *src = lex_source__ (lexer);
1407 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1409 src->head = src->tail = 0;
1410 src->journal_pos = src->seg_pos = src->line_pos = 0;
1411 src->n_newlines = 0;
1412 src->suppress_next_newline = false;
1413 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1415 lex_stage_clear (&src->pp);
1416 lex_stage_clear (&src->merge);
1417 lex_stage_clear (&src->lookahead);
1418 lex_source_push_endcmd__ (src);
1422 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1424 lex_discard_rest_of_command (struct lexer *lexer)
1426 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1430 /* Discards all lookahead tokens in LEXER, then discards all input sources
1431 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1432 runs out of input sources. */
1434 lex_discard_noninteractive (struct lexer *lexer)
1436 struct lex_source *src = lex_source__ (lexer);
1440 lex_stage_clear (&src->pp);
1441 lex_stage_clear (&src->merge);
1442 lex_stage_clear (&src->lookahead);
1444 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1445 src = lex_source__ (lexer))
1446 lex_source_destroy (src);
1451 lex_source_max_tail__ (const struct lex_source *src_)
1453 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1455 assert (src->seg_pos >= src->line_pos);
1456 size_t max_tail = MIN (src->journal_pos, src->line_pos);
1458 /* Use the oldest token also. */
1459 struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp };
1460 for (size_t i = 0; i < sizeof stages / sizeof *stages; i++)
1461 if (!lex_stage_is_empty (stages[i]))
1463 struct lex_token *first = lex_stage_first (stages[i]);
1464 assert (first->token_pos >= first->line_pos);
1465 return MIN (max_tail, first->line_pos);
1472 lex_source_expand__ (struct lex_source *src)
1474 if (src->head - src->tail >= src->allocated)
1476 size_t max_tail = lex_source_max_tail__ (src);
1477 if (max_tail > src->tail)
1479 /* Advance the tail, freeing up room at the head. */
1480 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1481 src->head - max_tail);
1482 src->tail = max_tail;
1486 /* Buffer is completely full. Expand it. */
1487 src->buffer = x2realloc (src->buffer, &src->allocated);
1492 /* There's space available at the head of the buffer. Nothing to do. */
1497 lex_source_read__ (struct lex_source *src)
1501 lex_source_expand__ (src);
1503 size_t head_ofs = src->head - src->tail;
1504 size_t space = src->allocated - head_ofs;
1505 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1506 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1508 assert (n <= space);
1513 src->reader->eof = true;
1514 lex_source_expand__ (src);
1520 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1521 src->head - src->seg_pos));
1524 static struct lex_source *
1525 lex_source__ (const struct lexer *lexer)
1527 return (ll_is_empty (&lexer->sources) ? NULL
1528 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1531 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1532 one, through N1 ahead of the current one, inclusive. (For example, if N0
1533 and N1 are both zero, this requests the syntax for the current token.) The
1534 caller must eventually free the returned string (with free()). The syntax
1535 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1536 for example, it may include comments, spaces, and new-lines if it spans
1537 multiple tokens. Macro expansion, however, has already been performed. */
1539 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1541 struct string s = DS_EMPTY_INITIALIZER;
1542 for (size_t i = n0; i <= n1; )
1544 /* Find [I,J) as the longest sequence of tokens not produced by macro
1545 expansion, or otherwise the longest sequence expanded from a single
1547 const struct lex_token *first = lex_source_next__ (src, i);
1549 for (j = i + 1; j <= n1; j++)
1551 const struct lex_token *cur = lex_source_next__ (src, j);
1552 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1553 || first->macro_rep != cur->macro_rep)
1556 const struct lex_token *last = lex_source_next__ (src, j - 1);
1558 /* Now add the syntax for this sequence of tokens to SRC. */
1559 if (!ds_is_empty (&s))
1560 ds_put_byte (&s, ' ');
1561 if (!first->macro_rep)
1563 size_t start = first->token_pos;
1564 size_t end = last->token_pos + last->token_len;
1565 ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1570 size_t start = first->ofs;
1571 size_t end = last->ofs + last->len;
1572 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1578 return ds_steal_cstr (&s);
1582 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1584 for (size_t i = n0; i <= n1; i++)
1585 if (lex_source_next__ (src, i)->macro_rep)
1590 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1591 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1592 other tokens included in that range. The syntax is encoded in UTF-8 and in
1593 the original form supplied to the lexer so that, for example, it may include
1594 comments, spaces, and new-lines if it spans multiple tokens.
1596 Returns an empty string if the token range doesn't include a macro call.
1598 The caller must not modify or free the returned string. */
1599 static struct substring
1600 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1602 if (!lex_source_contains_macro_call (src, n0, n1))
1605 const struct lex_token *token0 = lex_source_next__ (src, n0);
1606 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1607 size_t start = token0->token_pos;
1608 size_t end = token1->token_pos + token1->token_len;
1610 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Composes a syntax error message about the tokens at offsets N0...N1 from
   the current token in SRC.

   The message begins with a "Syntax error..." phrase.  If token N0 is
   T_ENDCMD, the phrase says that the error is at the end of the command.
   Otherwise the phrase is one of four variants that may quote the offending
   syntax (from lex_source_get_syntax__(), shortened by str_ellipsize()) and
   the macro call that it was expanded from (from
   lex_source_get_macro_call()).  NOTE(review): the conditions that choose
   among the four variants are not visible in this excerpt.

   The text produced from FORMAT and ARGS follows after ": ", and a '.' is
   appended unless the message already ends with one.  The message is stored
   in a newly allocated struct msg with category MSG_C_SYNTAX, severity
   MSG_S_ERROR, and a location that covers tokens N0...N1.  NOTE(review): what
   is done with the finished struct msg is not visible in this excerpt. */
1614 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1615 const char *format, va_list args)
1617 const struct lex_token *token;
/* Only the type of the first token in the range decides between the
   end-of-command wording and the wordings that quote syntax. */
1622 token = lex_source_next__ (src, n0);
1623 if (token->token.type == T_ENDCMD)
1624 ds_put_cstr (&s, _("Syntax error at end of command"));
1627 /* Get the syntax that caused the error. */
1628 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1630 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1633 /* Get the macro call(s) that expanded to the syntax that caused the
1636 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1643 _("Syntax error at `%s' (in expansion of `%s')"),
1646 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1651 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1654 ds_put_cstr (&s, _("Syntax error"));
1660 ds_put_cstr (&s, ": ");
1661 ds_put_vformat (&s, format, args);
1663 if (ds_last (&s) != '.')
1664 ds_put_byte (&s, '.');
1666 struct msg *m = xmalloc (sizeof *m);
1668 .category = MSG_C_SYNTAX,
1669 .severity = MSG_S_ERROR,
1670 .location = lex_source_get_location (src, n0, n1),
1671 .text = ds_steal_cstr (&s),
/* Composes a syntax error message for TOKEN, a token in SRC whose
   'token.string' holds a description of the problem.

   The message quotes TOKEN's original text from SRC's buffer (shortened by
   str_ellipsize()), followed by ": " and the description.  It is stored in a
   newly allocated struct msg with category MSG_C_SYNTAX, severity
   MSG_S_ERROR, and a location that covers just TOKEN.  NOTE(review): what is
   done with the finished struct msg is not visible in this excerpt. */
1677 lex_get_error (struct lex_source *src, const struct lex_token *token)
/* Quote the token's own source text, not the description. */
1680 str_ellipsize (ss_buffer (&src->buffer[token->token_pos - src->tail],
1682 syntax, sizeof syntax);
1684 struct string s = DS_EMPTY_INITIALIZER;
1685 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
/* The description is appended as-is and is not passed through gettext
   here. */
1686 ds_put_format (&s, ": %s", token->token.string.string);
1688 struct msg *m = xmalloc (sizeof *m);
1690 .category = MSG_C_SYNTAX,
1691 .severity = MSG_S_ERROR,
1692 .location = lex_token_location_rw (src, token, token),
1693 .text = ds_steal_cstr (&s),
1698 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1699 underlying lex_reader if necessary. Returns true if a new token was added
1700 to SRC's deque, false otherwise. The caller should retry failures unless
1701 SRC's 'eof' marker was set to true indicating that there will be no more
1702 tokens from this source. */
/* Outline: allocate a token positioned at the current segmentation point,
   pull one segment from the segmenter (reading more input as needed), convert
   the segment into a token, submit any completed syntax lines to the output
   engine, and finally act on the result of the conversion. */
1704 lex_source_try_get_pp (struct lex_source *src)
1706 /* Append a new token to SRC and initialize it. */
1707 struct lex_token *token = xmalloc (sizeof *token);
1708 token->token = (struct token) { .type = T_STOP };
1709 token->macro_rep = NULL;
1710 token->ref_cnt = NULL;
1711 token->line_pos = src->line_pos;
1712 token->token_pos = src->seg_pos;
/* The token's first line is the reader's starting line number plus the
   new-lines counted so far in this source.  A reader line number that is not
   positive yields a first line of 0. */
1713 if (src->reader->line_number > 0)
1714 token->first_line = src->reader->line_number + src->n_newlines;
1716 token->first_line = 0;
1718 /* Extract a segment. */
1719 const char *segment;
1720 enum segment_type seg_type;
/* The segmenter is offered everything buffered from 'seg_pos' up to 'head',
   along with whether the reader has reached end of input. */
1724 segment = &src->buffer[src->seg_pos - src->tail];
1725 seg_len = segmenter_push (&src->segmenter, segment,
1726 src->head - src->seg_pos,
1727 src->reader->eof, &seg_type);
1731 /* The segmenter needs more input to produce a segment. */
1732 assert (!src->reader->eof);
1733 lex_source_read__ (src);
1736 /* Update state based on the segment. */
1737 token->token_len = seg_len;
1738 src->seg_pos += seg_len;
/* After a new-line segment, the next line starts at the new segmentation
   point. */
1739 if (seg_type == SEG_NEWLINE)
1741 src->line_pos = src->seg_pos;
1745 /* Get a token from the segment. */
1746 enum tokenize_result result = token_from_segment (
1747 seg_type, ss_buffer (segment, seg_len), &token->token);
1749 /* If we've reached the end of a line, or the end of a command, then pass
1750 the line to the output engine as a syntax text item. */
/* 'suppress_next_newline' is set when an end-of-command segment is seen and
   cleared at the next new-line segment.  NOTE(review): the statements that
   adjust N_LINES in these two branches are not visible in this excerpt. */
1751 int n_lines = seg_type == SEG_NEWLINE;
1752 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1755 src->suppress_next_newline = true;
1757 else if (n_lines > 0 && src->suppress_next_newline)
1760 src->suppress_next_newline = false;
1762 for (int i = 0; i < n_lines; i++)
1764 /* Beginning of line. */
1765 const char *line = &src->buffer[src->journal_pos - src->tail];
1767 /* Calculate line length, including \n or \r\n end-of-line if present.
1769 We use src->head even though that may be beyond what we've actually
1770 converted to tokens (which is only through line_pos). That's because,
1771 if we're emitting the line due to SEG_END_COMMAND, we want to take the
1772 whole line through the newline, not just through the '.'. */
1773 size_t max_len = src->head - src->journal_pos;
1774 const char *newline = memchr (line, '\n', max_len);
1775 size_t line_len = newline ? newline - line + 1 : max_len;
1777 /* Calculate line length excluding end-of-line. */
1778 size_t copy_len = line_len;
1779 if (copy_len > 0 && line[copy_len - 1] == '\n')
1781 if (copy_len > 0 && line[copy_len - 1] == '\r')
1784 /* Submit the line as syntax. */
1785 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1786 xmemdup0 (line, copy_len),
/* The journal position advances past the whole line, end-of-line included. */
1789 src->journal_pos += line_len;
/* Act on the conversion result: an error is reported through
   lex_get_error(), an empty result destroys the token, and a real token is
   pushed onto 'pp', with a T_STOP token first rewritten as T_ENDCMD.
   NOTE(review): the flow between these cases and the values returned are not
   visible in this excerpt. */
1794 case TOKENIZE_ERROR:
1795 lex_get_error (src, token);
1797 case TOKENIZE_EMPTY:
1798 lex_token_destroy (token);
1801 case TOKENIZE_TOKEN:
1802 if (token->token.type == T_STOP)
1804 token->token.type = T_ENDCMD;
1807 lex_stage_push_last (&src->pp, token);
1813 /* Attempts to append a new token to SRC. Returns true if successful, false on
1814 failure. On failure, the end of SRC has been reached and no more tokens
1815 will be forthcoming from it.
1817 Does not make the new token available for lookahead yet; the caller must
1818 adjust SRC's 'middle' pointer to do so. */
1820 lex_source_get_pp (struct lex_source *src)
1823 if (lex_source_try_get_pp (src))
/* Attempts to move tokens from 'pp' into 'merge' in SRC, expanding a macro
   call if the tokens at the front of 'pp' form one.

   First makes sure that 'pp' holds at least one token.  If macro expansion is
   disabled (settings_get_mexpand() returns false), every token in 'pp' is
   shifted to 'merge'.  Otherwise the tokens in 'pp' are offered to a
   macro_call one at a time: if they turn out not to be a macro call, just the
   first token is shifted to 'merge'; if they are, the tokens of the call are
   removed from 'pp' and one new token per token of the expansion is pushed
   onto 'merge'.

   In the macro call case the return value is true only if the expansion
   produced at least one token.  NOTE(review): the values returned on the
   other paths are not visible in this excerpt.

   SRC_ is const-qualified but SRC is modified through CONST_CAST. */
1829 lex_source_try_get_merge (const struct lex_source *src_)
1831 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1833 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1836 if (!settings_get_mexpand ())
1838 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1842 /* Now pass tokens one-by-one to the macro expander.
1844 In the common case where there is no macro to expand, the loop is not
1846 struct macro_call *mc;
1847 int n_call = macro_call_create (src->lexer->macros,
1848 &lex_stage_first (&src->pp)->token, &mc);
1849 for (int ofs = 1; !n_call; ofs++)
1851 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1853 /* This should not be reachable because we always get a T_ENDCMD at
1854 the end of an input file (transformed from T_STOP by
1855 lex_source_try_get_pp()) and the macro_expander should always
1856 terminate expansion on T_ENDCMD. */
/* Offer the token at offset OFS in 'pp' to the macro call, together with its
   original text from SRC's buffer and its location. */
1860 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1861 size_t start = t->token_pos;
1862 size_t end = t->token_pos + t->token_len;
1863 const struct macro_token mt = {
1865 .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
1867 const struct msg_location loc = lex_token_location (src, t, t);
1868 n_call = macro_call_add (mc, &mt, &loc);
1872 /* False alarm: no macro expansion after all. Use first token as
1873 lookahead. We'll retry macro expansion from the second token next
1875 macro_call_destroy (mc);
1876 lex_stage_shift (&src->merge, &src->pp, 1);
1880 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1881 are a macro call. (These are likely to be the only tokens in 'pp'.)
1883 const struct lex_token *c0 = lex_stage_first (&src->pp);
1884 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1885 struct macro_tokens expansion = { .n = 0 };
1886 struct msg_location loc = lex_token_location (src, c0, c1);
1887 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1888 macro_call_destroy (mc);
1890 /* Convert the macro expansion into syntax for possible error messages
1892 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1893 size_t *len = xnmalloc (expansion.n, sizeof *len);
1894 struct string s = DS_EMPTY_INITIALIZER;
1895 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1897 if (settings_get_mprint ())
1898 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1899 _("Macro Expansion")));
1901 /* Append the macro expansion tokens to the lookahead. */
1902 if (expansion.n > 0)
/* Every token of the expansion shares one copy of the expansion's text
   ('macro_rep').  The counter allocated here starts at the number of tokens
   in the expansion. */
1904 char *macro_rep = ds_steal_cstr (&s);
1905 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1906 *ref_cnt = expansion.n;
1907 for (size_t i = 0; i < expansion.n; i++)
/* Each expanded token reports the position, length, and first line of the
   whole macro call C0...C1. */
1909 struct lex_token *token = xmalloc (sizeof *token);
1910 *token = (struct lex_token) {
1911 .token = expansion.mts[i].token,
1912 .token_pos = c0->token_pos,
1913 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1914 .line_pos = c0->line_pos,
1915 .first_line = c0->first_line,
1916 .macro_rep = macro_rep,
1921 lex_stage_push_last (&src->merge, token);
1923 ss_dealloc (&expansion.mts[i].syntax);
1928 free (expansion.mts);
1932 /* Destroy the tokens for the call. */
1933 for (size_t i = 0; i < n_call; i++)
1934 lex_stage_pop_first (&src->pp);
1936 return expansion.n > 0;
1939 /* Attempts to obtain at least one new token into 'merge' in SRC.
1941 Returns true if successful, false on failure. In the latter case, SRC is
1942 exhausted and 'src->eof' is now true. */
1944 lex_source_get_merge (struct lex_source *src)
1947 if (lex_source_try_get_merge (src))
1952 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1954 Returns true if successful, false on failure. In the latter case, SRC is
1955 exhausted and 'src->eof' is now true. */
/* Tokens from 'merge' are offered to a merger one at a time, starting from
   the front.  Depending on the merger's verdict, either the first token is
   shifted to 'lookahead' unchanged or several tokens from the front of
   'merge' are replaced by a single combined token. */
1957 lex_source_get_lookahead (struct lex_source *src)
1959 struct merger m = MERGER_INIT;
1961 for (size_t i = 0; ; i++)
/* Make sure that 'merge' holds a token at index I, fetching more tokens as
   needed. */
1963 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1965 /* We always get a T_ENDCMD at the end of an input file
1966 (transformed from T_STOP by lex_source_try_get_pp()) and
1967 merger_add() should never return -1 on T_ENDCMD. */
1968 assert (lex_stage_is_empty (&src->merge));
/* NOTE(review): the test on RETVAL that guards the single-token shift below
   is not visible in this excerpt. */
1972 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1976 lex_stage_shift (&src->lookahead, &src->merge, 1);
/* A positive RETVAL is the number of tokens at the front of 'merge' that the
   combined token replaces; that many are popped below. */
1979 else if (retval > 0)
1981 /* Add a token that merges all the tokens together. */
1982 const struct lex_token *first = lex_stage_first (&src->merge);
1983 const struct lex_token *last = lex_stage_nth (&src->merge,
/* The combined token keeps its macro provenance only if the first and last
   tokens came from the same macro expansion. */
1985 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1986 struct lex_token *t = xmalloc (sizeof *t);
1987 *t = (struct lex_token) {
/* The combined token covers the source text from the start of FIRST through
   the end of LAST. */
1989 .token_pos = first->token_pos,
1990 .token_len = (last->token_pos - first->token_pos) + last->token_len,
1991 .line_pos = first->line_pos,
1992 .first_line = first->first_line,
1994 /* This works well if all the tokens were not expanded from macros,
1995 or if they came from the same macro expansion. It just gives up
1996 in the other (corner) cases. */
1997 .macro_rep = macro ? first->macro_rep : NULL,
1998 .ofs = macro ? first->ofs : 0,
1999 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2000 .ref_cnt = macro ? first->ref_cnt : NULL,
2004 lex_stage_push_last (&src->lookahead, t);
/* Remove the tokens that the combined token replaces. */
2006 for (int i = 0; i < retval; i++)
2007 lex_stage_pop_first (&src->merge);
2014 lex_source_push_endcmd__ (struct lex_source *src)
2016 assert (lex_stage_is_empty (&src->lookahead));
2017 struct lex_token *token = xmalloc (sizeof *token);
2018 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2019 lex_stage_push_last (&src->lookahead, token);
2022 static struct lex_source *
2023 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2025 struct lex_source *src = xmalloc (sizeof *src);
2026 *src = (struct lex_source) {
2028 .segmenter = segmenter_init (reader->syntax, false),
2032 lex_source_push_endcmd__ (src);
/* Destroys SRC: invokes its reader's 'destroy' callback (if the reader's
   class has one), uninitializes SRC's three token stages, and removes SRC
   from the list that it is linked into through its 'll' member.

   The reader's file name and encoding pointers are fetched before the
   'destroy' callback runs, so they remain usable even if the callback frees
   the reader itself.  NOTE(review): the statements that release those
   strings, SRC's buffer, and SRC itself are not visible in this excerpt. */
2038 lex_source_destroy (struct lex_source *src)
2040 char *file_name = src->reader->file_name;
2041 char *encoding = src->reader->encoding;
2042 if (src->reader->class->destroy != NULL)
2043 src->reader->class->destroy (src->reader);
2047 lex_stage_uninit (&src->pp);
2048 lex_stage_uninit (&src->merge);
2049 lex_stage_uninit (&src->lookahead);
2050 ll_remove (&src->ll);
/* A lex_reader that obtains its input from a u8_istream opened on a file or
   on stdin (see lex_reader_for_file()). */
2054 struct lex_file_reader
/* Generic reader; lex_file_reader_cast() recovers the containing structure
   from a pointer to this member. */
2056 struct lex_reader reader;
/* Stream that supplies the input. */
2057 struct u8_istream *istream;
/* Forward declaration, so that lex_reader_for_file() can refer to the class
   before it is defined. */
2060 static struct lex_reader_class lex_file_reader_class;
2062 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2063 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2064 ENCODING, which should take one of the forms accepted by
2065 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2066 mode of the new reader, respectively.
2068 Returns a null pointer if FILE_NAME cannot be opened. */
2070 lex_reader_for_file (const char *file_name, const char *encoding,
2071 enum segmenter_mode syntax,
2072 enum lex_error_mode error)
2074 struct lex_file_reader *r;
2075 struct u8_istream *istream;
2077 istream = (!strcmp(file_name, "-")
2078 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2079 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2080 if (istream == NULL)
2082 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2086 r = xmalloc (sizeof *r);
2087 lex_reader_init (&r->reader, &lex_file_reader_class);
2088 r->reader.syntax = syntax;
2089 r->reader.error = error;
2090 r->reader.file_name = xstrdup (file_name);
2091 r->reader.encoding = xstrdup_if_nonnull (encoding);
2092 r->reader.line_number = 1;
2093 r->istream = istream;
2098 static struct lex_file_reader *
2099 lex_file_reader_cast (struct lex_reader *r)
2101 return UP_CAST (r, struct lex_file_reader, reader);
2105 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2106 enum prompt_style prompt_style UNUSED)
2108 struct lex_file_reader *r = lex_file_reader_cast (r_);
2109 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2112 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2119 lex_file_close (struct lex_reader *r_)
2121 struct lex_file_reader *r = lex_file_reader_cast (r_);
2123 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2125 if (u8_istream_close (r->istream) != 0)
2126 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2129 u8_istream_free (r->istream);
/* Class for readers created by lex_reader_for_file().  NOTE(review): the
   members of the initializer are not visible in this excerpt; presumably they
   are lex_file_read() and lex_file_close(). */
2134 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that obtains its input from a string held in memory (see
   lex_reader_for_substring_nocopy()).  NOTE(review): members other than
   'reader' are not visible in this excerpt; lex_string_read() uses an 's'
   substring and an 'offset' into it. */
2140 struct lex_string_reader
/* Generic reader; lex_string_reader_cast() recovers the containing structure
   from a pointer to this member. */
2142 struct lex_reader reader;
/* Forward declaration, so that lex_reader_for_substring_nocopy() can refer to
   the class before it is defined. */
2147 static struct lex_reader_class lex_string_reader_class;
2149 /* Creates and returns a new lex_reader for the contents of S, which must be
2150 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2151 with ss_dealloc() when it is closed. */
2153 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2155 struct lex_string_reader *r;
2157 r = xmalloc (sizeof *r);
2158 lex_reader_init (&r->reader, &lex_string_reader_class);
2159 r->reader.syntax = SEG_MODE_AUTO;
2160 r->reader.encoding = xstrdup_if_nonnull (encoding);
2167 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2168 which must be encoded in ENCODING. The caller retains ownership of S. */
2170 lex_reader_for_string (const char *s, const char *encoding)
2172 struct substring ss;
2173 ss_alloc_substring (&ss, ss_cstr (s));
2174 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string, using the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   result, which is treated as being encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes ownership of the formatted text. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2192 static struct lex_string_reader *
2193 lex_string_reader_cast (struct lex_reader *r)
2195 return UP_CAST (r, struct lex_string_reader, reader);
2199 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2200 enum prompt_style prompt_style UNUSED)
2202 struct lex_string_reader *r = lex_string_reader_cast (r_);
2205 chunk = MIN (n, r->s.length - r->offset);
2206 memcpy (buf, r->s.string + r->offset, chunk);
2213 lex_string_close (struct lex_reader *r_)
2215 struct lex_string_reader *r = lex_string_reader_cast (r_);
2221 static struct lex_reader_class lex_string_reader_class =