1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/output-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* For a token obtained through the lexer in an ordinary way, this is the
65 location of the token in terms of the lex_source's buffer.
67 For a token produced through macro expansion, this is the entire macro
70 src->tail <= line_pos <= token_pos <= src->head. */
71 size_t token_pos; /* Start of token. */
72 size_t token_len; /* Length of source for token in bytes. */
73 size_t line_pos; /* Start of line containing token_pos. */
74 int first_line; /* Line number at token_pos. */
76 /* For a token obtained through macro expansion, this is just this token.
78 For a token obtained through the lexer in an ordinary way, these are
80 char *macro_rep; /* The whole macro expansion. */
81 size_t ofs; /* Offset of this token in macro_rep. */
82 size_t len; /* Length of this token in macro_rep. */
83 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
87 lex_token_destroy (struct lex_token *t)
89 token_uninit (&t->token);
92 assert (*t->ref_cnt > 0);
102 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
107 struct lex_token **tokens;
110 static void lex_stage_clear (struct lex_stage *);
111 static void lex_stage_uninit (struct lex_stage *);
113 static size_t lex_stage_count (const struct lex_stage *);
114 static bool lex_stage_is_empty (const struct lex_stage *);
116 static struct lex_token *lex_stage_last (struct lex_stage *);
117 static struct lex_token *lex_stage_first (struct lex_stage *);
118 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
120 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
121 static void lex_stage_pop_first (struct lex_stage *);
123 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
126 /* Deletes all the tokens from STAGE. */
128 lex_stage_clear (struct lex_stage *stage)
130 while (!deque_is_empty (&stage->deque))
131 lex_stage_pop_first (stage);
134 /* Deletes all the tokens from STAGE and frees storage for the deque. */
136 lex_stage_uninit (struct lex_stage *stage)
138 lex_stage_clear (stage);
139 free (stage->tokens);
142 /* Returns true if STAGE contains no tokens, otherwise false. */
144 lex_stage_is_empty (const struct lex_stage *stage)
146 return deque_is_empty (&stage->deque);
149 /* Returns the number of tokens in STAGE. */
151 lex_stage_count (const struct lex_stage *stage)
153 return deque_count (&stage->deque);
156 /* Returns the last token in STAGE, which must be nonempty. The last token is
157 the one accessed with the greatest lookahead. */
158 static struct lex_token *
159 lex_stage_last (struct lex_stage *stage)
161 return stage->tokens[deque_front (&stage->deque, 0)];
164 /* Returns the first token in STAGE, which must be nonempty.
165 The first token is the one accessed with the least lookahead. */
166 static struct lex_token *
167 lex_stage_first (struct lex_stage *stage)
169 return lex_stage_nth (stage, 0);
172 /* Returns the token at the given INDEX in STAGE. The first token (with the least
173 lookahead) is 0, the second token is 1, and so on. There must be at least
174 INDEX + 1 tokens in STAGE. */
175 static struct lex_token *
176 lex_stage_nth (struct lex_stage *stage, size_t index)
178 return stage->tokens[deque_back (&stage->deque, index)];
181 /* Adds TOKEN so that it becomes the last token in STAGE. */
183 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
185 if (deque_is_full (&stage->deque))
186 stage->tokens = deque_expand (&stage->deque, stage->tokens,
187 sizeof *stage->tokens);
188 stage->tokens[deque_push_front (&stage->deque)] = token;
191 /* Removes the first token from STAGE and uninitializes it. */
193 lex_stage_pop_first (struct lex_stage *stage)
195 lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
198 /* Removes the first N tokens from SRC, appending them to DST as the last
201 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
203 for (size_t i = 0; i < n; i++)
205 lex_stage_push_last (dst, lex_stage_first (src));
206 deque_pop_back (&src->deque);
210 /* A source of tokens, corresponding to a syntax file.
212 This is conceptually a lex_reader wrapped with everything needed to convert
213 its UTF-8 bytes into tokens. */
216 struct ll ll; /* In lexer's list of sources. */
217 struct lex_reader *reader;
219 struct segmenter segmenter;
220 bool eof; /* True if T_STOP was read from 'reader'. */
222 /* Buffer of UTF-8 bytes. */
224 size_t allocated; /* Number of bytes allocated. */
225 size_t tail; /* &buffer[0] offset into UTF-8 source. */
226 size_t head; /* &buffer[head - tail] offset into source. */
228 /* Positions in source file, tail <= pos <= head for each member here. */
229 size_t journal_pos; /* First byte not yet output to journal. */
230 size_t seg_pos; /* First byte not yet scanned as token. */
231 size_t line_pos; /* First byte of line containing seg_pos. */
233 int n_newlines; /* Number of new-lines up to seg_pos. */
234 bool suppress_next_newline;
238 This is a pipeline with the following stages. Each token eventually
239 made available to the parser passes through each of these stages. The stages
240 are named after the processing that happens in each one.
242 Initially, tokens come from the segmenter and scanner to 'pp':
244 - pp: Tokens that need to pass through the macro preprocessor to end up
247 - merge: Tokens that need to pass through scan_merge() to end up in
250 - lookahead: Tokens available to the client for parsing. */
252 struct lex_stage merge;
253 struct lex_stage lookahead;
256 static struct lex_source *lex_source_create (struct lexer *,
257 struct lex_reader *);
258 static void lex_source_destroy (struct lex_source *);
263 struct ll_list sources; /* Contains "struct lex_source"s. */
264 struct macro_set *macros;
267 static struct lex_source *lex_source__ (const struct lexer *);
268 static char *lex_source_get_syntax__ (const struct lex_source *,
270 static const struct lex_token *lex_next__ (const struct lexer *, int n);
271 static void lex_source_push_endcmd__ (struct lex_source *);
273 static bool lex_source_get_lookahead (struct lex_source *);
274 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
275 const char *format, va_list)
276 PRINTF_FORMAT (4, 0);
277 static const struct lex_token *lex_source_next__ (const struct lex_source *,
280 /* Initializes READER with the specified CLASS and otherwise some reasonable
281 defaults. The caller should fill in the other members as desired. */
283 lex_reader_init (struct lex_reader *reader,
284 const struct lex_reader_class *class)
286 reader->class = class;
287 reader->syntax = SEG_MODE_AUTO;
288 reader->error = LEX_ERROR_CONTINUE;
289 reader->file_name = NULL;
290 reader->encoding = NULL;
291 reader->line_number = 0;
295 /* Frees any file name already in READER and replaces it by a copy of
296 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
298 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
300 free (reader->file_name);
301 reader->file_name = xstrdup_if_nonnull (file_name);
304 /* Creates and returns a new lexer. */
308 struct lexer *lexer = xmalloc (sizeof *lexer);
309 *lexer = (struct lexer) {
310 .sources = LL_INITIALIZER (lexer->sources),
311 .macros = macro_set_create (),
316 /* Destroys LEXER. */
318 lex_destroy (struct lexer *lexer)
322 struct lex_source *source, *next;
324 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
325 lex_source_destroy (source);
326 macro_set_destroy (lexer->macros);
331 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
332 same name. Takes ownership of M. */
334 lex_define_macro (struct lexer *lexer, struct macro *m)
336 macro_set_add (lexer->macros, m);
339 /* Inserts READER into LEXER so that the next token read by LEXER comes from
340 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
343 lex_include (struct lexer *lexer, struct lex_reader *reader)
345 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
346 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
349 /* Appends READER to LEXER, so that it will be read after all other current
350 readers have already been read. */
352 lex_append (struct lexer *lexer, struct lex_reader *reader)
354 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
359 /* Advances LEXER to the next token, consuming the current token. */
361 lex_get (struct lexer *lexer)
363 struct lex_source *src;
365 src = lex_source__ (lexer);
369 if (!lex_stage_is_empty (&src->lookahead))
370 lex_stage_pop_first (&src->lookahead);
372 while (lex_stage_is_empty (&src->lookahead))
373 if (!lex_source_get_lookahead (src))
375 lex_source_destroy (src);
376 src = lex_source__ (lexer);
382 /* Advances LEXER by N tokens. */
384 lex_get_n (struct lexer *lexer, size_t n)
390 /* Issuing errors. */
392 /* Prints a syntax error message containing the current token and
393 given message MESSAGE (if non-null). */
395 lex_error (struct lexer *lexer, const char *format, ...)
399 va_start (args, format);
400 lex_next_error_valist (lexer, 0, 0, format, args);
404 /* Prints a syntax error message containing the current token and
405 given message MESSAGE (if non-null). */
407 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
409 lex_next_error_valist (lexer, 0, 0, format, args);
412 /* Prints a syntax error message for tokens N0 through N1 ahead of the
413 current one, with the message formatted from FORMAT (if non-null). */
415 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
419 va_start (args, format);
420 lex_next_error_valist (lexer, n0, n1, format, args);
424 /* Prints a syntax error message saying that one of the strings provided as
425 varargs, up to the first NULL, is expected. */
427 (lex_error_expecting) (struct lexer *lexer, ...)
431 va_start (args, lexer);
432 lex_error_expecting_valist (lexer, args);
436 /* Prints a syntax error message saying that one of the options provided in
437 ARGS, up to the first NULL, is expected. */
439 lex_error_expecting_valist (struct lexer *lexer, va_list args)
441 enum { MAX_OPTIONS = 9 };
442 const char *options[MAX_OPTIONS];
444 while (n < MAX_OPTIONS)
446 const char *option = va_arg (args, const char *);
450 options[n++] = option;
452 lex_error_expecting_array (lexer, options, n);
456 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
461 lex_error (lexer, NULL);
465 lex_error (lexer, _("expecting %s"), options[0]);
469 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
473 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
478 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
479 options[0], options[1], options[2], options[3]);
483 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
484 options[0], options[1], options[2], options[3], options[4]);
488 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
489 options[0], options[1], options[2], options[3], options[4],
494 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
495 options[0], options[1], options[2], options[3], options[4],
496 options[5], options[6]);
500 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
501 options[0], options[1], options[2], options[3], options[4],
502 options[5], options[6], options[7]);
506 lex_error (lexer, NULL);
510 /* Reports an error to the effect that subcommand SBC may only be specified
513 This function does not take a lexer as an argument or use lex_error(),
514 because the result would ordinarily just be redundant: "Syntax error at
515 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
516 not help the user find the error. */
518 lex_sbc_only_once (const char *sbc)
520 msg (SE, _("Subcommand %s may only be specified once."), sbc);
523 /* Reports an error to the effect that subcommand SBC is missing.
525 This function does not take a lexer as an argument or use lex_error(),
526 because a missing subcommand can normally be detected only after the whole
527 command has been parsed, and so lex_error() would always report "Syntax
528 error at end of command", which does not help the user find the error. */
530 lex_sbc_missing (const char *sbc)
532 msg (SE, _("Required subcommand %s was not specified."), sbc);
535 /* Reports an error to the effect that specification SPEC may only be specified
536 once within subcommand SBC. */
538 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
540 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
544 /* Reports an error to the effect that specification SPEC is missing within
547 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
549 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
553 /* Prints a syntax error message containing the current token and
554 given message MESSAGE (if non-null). */
556 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
557 const char *format, va_list args)
559 struct lex_source *src = lex_source__ (lexer);
562 lex_source_error_valist (src, n0, n1, format, args);
568 ds_put_format (&s, _("Syntax error at end of input"));
571 ds_put_cstr (&s, ": ");
572 ds_put_vformat (&s, format, args);
574 if (ds_last (&s) != '.')
575 ds_put_byte (&s, '.');
576 msg (SE, "%s", ds_cstr (&s));
581 /* Checks that we're at end of command.
582 If so, returns a successful command completion code.
583 If not, flags a syntax error and returns an error command
586 lex_end_of_command (struct lexer *lexer)
588 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
590 lex_error (lexer, _("expecting end of command"));
597 /* Token testing functions. */
599 /* Returns true if the current token is a number. */
601 lex_is_number (const struct lexer *lexer)
603 return lex_next_is_number (lexer, 0);
606 /* Returns true if the current token is a string. */
608 lex_is_string (const struct lexer *lexer)
610 return lex_next_is_string (lexer, 0);
613 /* Returns the value of the current token, which must be a
614 floating point number. */
616 lex_number (const struct lexer *lexer)
618 return lex_next_number (lexer, 0);
621 /* Returns true iff the current token is an integer. */
623 lex_is_integer (const struct lexer *lexer)
625 return lex_next_is_integer (lexer, 0);
628 /* Returns the value of the current token, which must be an
631 lex_integer (const struct lexer *lexer)
633 return lex_next_integer (lexer, 0);
636 /* Token testing functions with lookahead.
638 A value of 0 for N as an argument to any of these functions refers to the
639 current token. Lookahead is limited to the current command. Any N greater
640 than the number of tokens remaining in the current command will be treated
641 as referring to a T_ENDCMD token. */
643 /* Returns true if the token N ahead of the current token is a number. */
645 lex_next_is_number (const struct lexer *lexer, int n)
647 return token_is_number (lex_next (lexer, n));
650 /* Returns true if the token N ahead of the current token is a string. */
652 lex_next_is_string (const struct lexer *lexer, int n)
654 return token_is_string (lex_next (lexer, n));
657 /* Returns the value of the token N ahead of the current token, which must be a
658 floating point number. */
660 lex_next_number (const struct lexer *lexer, int n)
662 return token_number (lex_next (lexer, n));
665 /* Returns true if the token N ahead of the current token is an integer. */
667 lex_next_is_integer (const struct lexer *lexer, int n)
669 return token_is_integer (lex_next (lexer, n));
672 /* Returns the value of the token N ahead of the current token, which must be
675 lex_next_integer (const struct lexer *lexer, int n)
677 return token_integer (lex_next (lexer, n));
680 /* Token matching functions. */
682 /* If the current token has the specified TYPE, skips it and returns true.
683 Otherwise, returns false. */
685 lex_match (struct lexer *lexer, enum token_type type)
687 if (lex_token (lexer) == type)
696 /* If the current token matches IDENTIFIER, skips it and returns true.
697 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
700 IDENTIFIER must be an ASCII string. */
702 lex_match_id (struct lexer *lexer, const char *identifier)
704 return lex_match_id_n (lexer, identifier, 3);
707 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
708 may be abbreviated to its first N letters. Otherwise, returns false.
710 IDENTIFIER must be an ASCII string. */
712 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
714 if (lex_token (lexer) == T_ID
715 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
724 /* If the current token is integer X, skips it and returns true. Otherwise,
727 lex_match_int (struct lexer *lexer, int x)
729 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
738 /* Forced matches. */
740 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
741 abbreviated to its first 3 letters. Otherwise, reports an error and returns
744 IDENTIFIER must be an ASCII string. */
746 lex_force_match_id (struct lexer *lexer, const char *identifier)
748 if (lex_match_id (lexer, identifier))
752 lex_error_expecting (lexer, identifier);
757 /* If the current token has the specified TYPE, skips it and returns true.
758 Otherwise, reports an error and returns false. */
760 lex_force_match (struct lexer *lexer, enum token_type type)
762 if (lex_token (lexer) == type)
769 const char *type_string = token_type_to_string (type);
772 char *s = xasprintf ("`%s'", type_string);
773 lex_error_expecting (lexer, s);
777 lex_error_expecting (lexer, token_type_to_name (type));
783 /* If the current token is a string, does nothing and returns true.
784 Otherwise, reports an error and returns false. */
786 lex_force_string (struct lexer *lexer)
788 if (lex_is_string (lexer))
792 lex_error (lexer, _("expecting string"));
797 /* If the current token is a string or an identifier, does nothing and returns
798 true. Otherwise, reports an error and returns false.
800 This is meant for use in syntactic situations where we want to encourage the
801 user to supply a quoted string, but for compatibility we also accept
802 identifiers. (One example of such a situation is file names.) Therefore,
803 the error message issued when the current token is wrong only says that a
804 string is expected and doesn't mention that an identifier would also be
807 lex_force_string_or_id (struct lexer *lexer)
809 return lex_token (lexer) == T_ID || lex_force_string (lexer);
812 /* If the current token is an integer, does nothing and returns true.
813 Otherwise, reports an error and returns false. */
815 lex_force_int (struct lexer *lexer)
817 if (lex_is_integer (lexer))
821 lex_error (lexer, _("expecting integer"));
826 /* If the current token is an integer in the range MIN...MAX (inclusive), does
827 nothing and returns true. Otherwise, reports an error and returns false.
828 If NAME is nonnull, then it is used in the error message. */
830 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
832 bool is_number = lex_is_number (lexer);
833 bool is_integer = lex_is_integer (lexer);
834 bool too_small = (is_integer ? lex_integer (lexer) < min
835 : is_number ? lex_number (lexer) < min
837 bool too_big = (is_integer ? lex_integer (lexer) > max
838 : is_number ? lex_number (lexer) > max
840 if (is_integer && !too_small && !too_big)
845 /* Weird, maybe a bug in the caller. Just report that we needed an
848 lex_error (lexer, _("Integer expected for %s."), name);
850 lex_error (lexer, _("Integer expected."));
855 lex_error (lexer, _("Expected %ld for %s."), min, name);
857 lex_error (lexer, _("Expected %ld."), min);
859 else if (min + 1 == max)
862 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
864 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
868 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
869 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
871 if (report_lower_bound && report_upper_bound)
875 _("Expected integer between %ld and %ld for %s."),
878 lex_error (lexer, _("Expected integer between %ld and %ld."),
881 else if (report_lower_bound)
886 lex_error (lexer, _("Expected non-negative integer for %s."),
889 lex_error (lexer, _("Expected non-negative integer."));
894 lex_error (lexer, _("Expected positive integer for %s."),
897 lex_error (lexer, _("Expected positive integer."));
902 lex_error (lexer, _("Expected integer %ld or greater for %s."),
905 lex_error (lexer, _("Expected integer %ld or greater."), min);
908 else if (report_upper_bound)
912 _("Expected integer less than or equal to %ld for %s."),
915 lex_error (lexer, _("Expected integer less than or equal to %ld."),
921 lex_error (lexer, _("Integer expected for %s."), name);
923 lex_error (lexer, _("Integer expected."));
929 /* If the current token is a number, does nothing and returns true.
930 Otherwise, reports an error and returns false. */
932 lex_force_num (struct lexer *lexer)
934 if (lex_is_number (lexer))
937 lex_error (lexer, _("expecting number"));
941 /* If the current token is an identifier, does nothing and returns true.
942 Otherwise, reports an error and returns false. */
944 lex_force_id (struct lexer *lexer)
946 if (lex_token (lexer) == T_ID)
949 lex_error (lexer, _("expecting identifier"));
953 /* Token accessors. */
955 /* Returns the type of LEXER's current token. */
957 lex_token (const struct lexer *lexer)
959 return lex_next_token (lexer, 0);
962 /* Returns the number in LEXER's current token.
964 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
965 tokens this function will always return zero. */
967 lex_tokval (const struct lexer *lexer)
969 return lex_next_tokval (lexer, 0);
972 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
974 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
975 this function will always return NULL.
977 The UTF-8 encoding of the returned string is correct for variable names and
978 other identifiers. Use filename_to_utf8() to use it as a filename. Use
979 data_in() to use it in a "union value". */
981 lex_tokcstr (const struct lexer *lexer)
983 return lex_next_tokcstr (lexer, 0);
986 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
987 null-terminated (but the null terminator is not included in the returned
988 substring's 'length').
990 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
991 this function will always return NULL.
993 The UTF-8 encoding of the returned string is correct for variable names and
994 other identifiers. Use filename_to_utf8() to use it as a filename. Use
995 data_in() to use it in a "union value". */
997 lex_tokss (const struct lexer *lexer)
999 return lex_next_tokss (lexer, 0);
1004 A value of 0 for N as an argument to any of these functions refers to the
1005 current token. Lookahead is limited to the current command. Any N greater
1006 than the number of tokens remaining in the current command will be treated
1007 as referring to a T_ENDCMD token. */
1009 static const struct lex_token *
1010 lex_next__ (const struct lexer *lexer_, int n)
1012 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1013 struct lex_source *src = lex_source__ (lexer);
1016 return lex_source_next__ (src, n);
1019 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1024 static const struct lex_token *
1025 lex_source_next__ (const struct lex_source *src_, int n)
1027 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1028 while (lex_stage_count (&src->lookahead) <= n)
1030 if (!lex_stage_is_empty (&src->lookahead))
1032 const struct lex_token *t = lex_stage_last (&src->lookahead);
1033 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1037 lex_source_get_lookahead (src);
1040 return lex_stage_nth (&src->lookahead, n);
1043 /* Returns the "struct token" of the token N after the current one in LEXER.
1044 The returned pointer can be invalidated by pretty much any succeeding call
1045 into the lexer, although the string pointer within the returned token is
1046 only invalidated by consuming the token (e.g. with lex_get()). */
1047 const struct token *
1048 lex_next (const struct lexer *lexer, int n)
1050 return &lex_next__ (lexer, n)->token;
1053 /* Returns the type of the token N after the current one in LEXER. */
1055 lex_next_token (const struct lexer *lexer, int n)
1057 return lex_next (lexer, n)->type;
1060 /* Returns the number in the token N after the current one in LEXER.
1062 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1063 tokens this function will always return zero. */
1065 lex_next_tokval (const struct lexer *lexer, int n)
1067 return token_number (lex_next (lexer, n));
1070 /* Returns the null-terminated string in the token N after the current one, in
1073 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1074 this function will always return NULL.
1076 The UTF-8 encoding of the returned string is correct for variable names and
1077 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1078 data_in() to use it in a "union value". */
1080 lex_next_tokcstr (const struct lexer *lexer, int n)
1082 return lex_next_tokss (lexer, n).string;
1085 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1086 The string is null-terminated (but the null terminator is not included in
1087 the returned substring's 'length').
1089 Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings. For other
1090 tokens this function will always return NULL.
1092 The UTF-8 encoding of the returned string is correct for variable names and
1093 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1094 data_in() to use it in a "union value". */
1096 lex_next_tokss (const struct lexer *lexer, int n)
1098 return lex_next (lexer, n)->string;
1101 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1102 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1103 are both zero, this requests the syntax for the current token.) The caller
1104 must eventually free the returned string (with free()). The syntax is
1105 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1106 example, it may include comments, spaces, and new-lines if it spans multiple
1107 tokens. Macro expansion, however, has already been performed. */
1109 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1111 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1114 /* Returns true if the token N ahead of the current one was produced by macro
1115 expansion, false otherwise. */
1117 lex_next_is_from_macro (const struct lexer *lexer, int n)
1119 return lex_next__ (lexer, n)->macro_rep != NULL;
1123 lex_tokens_match (const struct token *actual, const struct token *expected)
1125 if (actual->type != expected->type)
1128 switch (actual->type)
1132 return actual->number == expected->number;
1135 return lex_id_match (expected->string, actual->string);
1138 return (actual->string.length == expected->string.length
1139 && !memcmp (actual->string.string, expected->string.string,
1140 actual->string.length));
1148 lex_at_phrase__ (struct lexer *lexer, const char *s)
1150 struct string_lexer slex;
1154 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1155 while (string_lexer_next (&slex, &token))
1157 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1158 token_uninit (&token);
1165 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1166 returns true. Otherwise, returns false.
1168 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1169 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1170 first three letters. */
1172 lex_at_phrase (struct lexer *lexer, const char *s)
1174 return lex_at_phrase__ (lexer, s) > 0;
1177 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1178 skips it and returns true. Otherwise, returns false.
1180 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1181 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1182 first three letters. */
1184 lex_match_phrase (struct lexer *lexer, const char *s)
1186 size_t n = lex_at_phrase__ (lexer, s);
1188 lex_get_n (lexer, n);
1193 count_newlines (char *s, size_t length)
1198 while ((newline = memchr (s, '\n', length)) != NULL)
1201 length -= (newline + 1) - s;
1209 lex_token_get_last_line_number (const struct lex_source *src,
1210 const struct lex_token *token)
1212 if (token->first_line == 0)
1216 char *token_str = &src->buffer[token->token_pos - src->tail];
1217 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1222 lex_token_get_first_column (const struct lex_source *src,
1223 const struct lex_token *token)
1225 return utf8_count_columns (&src->buffer[token->line_pos - src->tail],
1226 token->token_pos - token->line_pos) + 1;
1230 lex_token_get_last_column (const struct lex_source *src,
1231 const struct lex_token *token)
1233 char *start, *end, *newline;
1235 start = &src->buffer[token->line_pos - src->tail];
1236 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1237 newline = memrchr (start, '\n', end - start);
1238 if (newline != NULL)
1239 start = newline + 1;
1240 return utf8_count_columns (start, end - start) + 1;
1243 static struct msg_location
1244 lex_token_location (const struct lex_source *src,
1245 const struct lex_token *t0,
1246 const struct lex_token *t1)
1248 return (struct msg_location) {
1249 .file_name = src->reader->file_name,
1250 .first_line = t0->first_line,
1251 .last_line = lex_token_get_last_line_number (src, t1),
1252 .first_column = lex_token_get_first_column (src, t0),
1253 .last_column = lex_token_get_last_column (src, t1),
/* Like lex_token_location(), but hands the location to msg_location_dup() and
   returns the resulting pointer.  Presumably that is a separately allocated
   copy that the caller owns -- confirm against msg_location_dup(). */
1257 static struct msg_location *
1258 lex_token_location_rw (const struct lex_source *src,
1259 const struct lex_token *t0,
1260 const struct lex_token *t1)
1262 struct msg_location location = lex_token_location (src, t0, t1);
1263 return msg_location_dup (&location);
/* Returns the result of lex_token_location_rw() for the span that runs from
   the token lex_source_next__() yields for N0 through the token it yields for
   N1. */
1266 static struct msg_location *
1267 lex_source_get_location (const struct lex_source *src, int n0, int n1)
1269 return lex_token_location_rw (src,
1270 lex_source_next__ (src, n0),
1271 lex_source_next__ (src, n1));
1274 /* Returns the 1-based line number of the start of the syntax that represents
1275 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1276 if the token is drawn from a source that does not have line numbers. */
1278 lex_get_first_line_number (const struct lexer *lexer, int n)
/* lex_source__() yields NULL when LEXER has no sources; report 0 then. */
1280 const struct lex_source *src = lex_source__ (lexer);
1281 return src ? lex_source_next__ (src, n)->first_line : 0;
1284 /* Returns the 1-based line number of the end of the syntax that represents the
1285 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1286 token or if the token is drawn from a source that does not have line numbers.
1289 Most of the time, a single token is wholly within a single line of syntax,
1290 but there are two exceptions: a T_STRING token can be made up of multiple
1291 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1292 token can consist of a "-" on one line followed by the number on the next. */
1295 lex_get_last_line_number (const struct lexer *lexer, int n)
1297 const struct lex_source *src = lex_source__ (lexer);
// With no source there is no token to measure, so the result is 0.
1298 return src ? lex_token_get_last_line_number (src,
1299 lex_source_next__ (src, n)) : 0;
1302 /* Returns the 1-based column number of the start of the syntax that represents
1303 the token N after the current one in LEXER. Returns 0 for a T_STOP
   token or if LEXER has no source to read from.

1306 Column numbers are measured according to the width of characters as shown in
1307 a typical fixed-width font, in which CJK characters have width 2 and
1308 combining characters have width 0. */
1310 lex_get_first_column (const struct lexer *lexer, int n)
1312 const struct lex_source *src = lex_source__ (lexer);
1313 return src ? lex_token_get_first_column (src, lex_source_next__ (src, n)) : 0;
1316 /* Returns the 1-based column number of the end of the syntax that represents
1317 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
   token or if LEXER has no source to read from.

1320 Column numbers are measured according to the width of characters as shown in
1321 a typical fixed-width font, in which CJK characters have width 2 and
1322 combining characters have width 0. */
1324 lex_get_last_column (const struct lexer *lexer, int n)
1326 const struct lex_source *src = lex_source__ (lexer);
1327 return src ? lex_token_get_last_column (src, lex_source_next__ (src, n)) : 0;
1330 /* Returns the name of the syntax file from which the current command is drawn.
1331 Returns NULL for a T_STOP token or if the command's source does not have
   a file name.  The string returned is the reader's own, not a copy.

1334 There is no version of this function that takes an N argument because
1335 lookahead only works to the end of a command and any given command is always
1336 within a single syntax file. */
1338 lex_get_file_name (const struct lexer *lexer)
1340 struct lex_source *src = lex_source__ (lexer);
1341 return src == NULL ? NULL : src->reader->file_name;
1344 /* Returns a newly allocated msg_location for the syntax that represents tokens
1345 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1346 must eventually free the location (with msg_location_destroy()). */
1347 struct msg_location *
1348 lex_get_location (const struct lexer *lexer, int n0, int n1)
/* Start from the lines-only location, then fill in the column range: the
   first column of token N0 and the last column of token N1. */
1350 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1351 loc->first_column = lex_get_first_column (lexer, n0);
1352 loc->last_column = lex_get_last_column (lexer, n1);
1356 /* Returns a newly allocated msg_location for the syntax that represents tokens
1357 with 0-based offsets N0...N1, inclusive, from the current token. The
1358 location only covers the tokens' lines, not the columns. The caller must
1359 eventually free the location (with msg_location_destroy()). */
1360 struct msg_location *
1361 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1363 struct msg_location *loc = xmalloc (sizeof *loc);
/* Members not named in the initializer (the columns among them) are left
   zero.  The file name is passed through xstrdup_if_nonnull(), so the
   location presumably gets its own copy when there is a name. */
1364 *loc = (struct msg_location) {
1365 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1366 .first_line = lex_get_first_line_number (lexer, n0),
1367 .last_line = lex_get_last_line_number (lexer, n1),
/* Returns the encoding recorded in the reader of LEXER's current source, or
   NULL if LEXER has no source.  The string returned is the reader's own, not
   a copy. */
1373 lex_get_encoding (const struct lexer *lexer)
1375 struct lex_source *src = lex_source__ (lexer);
1376 return src == NULL ? NULL : src->reader->encoding;
1379 /* Returns the syntax mode for the syntax file from which the current command
1380 is drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1381 does not have line numbers.
1383 There is no version of this function that takes an N argument because
1384 lookahead only works to the end of a command and any given command is always
1385 within a single syntax file. */
1387 lex_get_syntax_mode (const struct lexer *lexer)
/* SEG_MODE_AUTO is also what is reported when LEXER has no source at all. */
1389 struct lex_source *src = lex_source__ (lexer);
1390 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1393 /* Returns the error mode for the syntax file from which the current command
1394 is drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1395 source does not have line numbers.
1397 There is no version of this function that takes an N argument because
1398 lookahead only works to the end of a command and any given command is always
1399 within a single syntax file. */
1401 lex_get_error_mode (const struct lexer *lexer)
/* LEX_ERROR_TERMINAL is also what is reported when LEXER has no source at
   all. */
1403 struct lex_source *src = lex_source__ (lexer);
1404 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1407 /* If the source that LEXER is currently reading has error mode
1408 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1409 token to be read comes directly from whatever is next read from the stream.
1411 It makes sense to call this function after encountering an error in a
1412 command entered on the console, because usually the user would prefer not to
1413 have cascading errors. */
1415 lex_interactive_reset (struct lexer *lexer)
1417 struct lex_source *src = lex_source__ (lexer);
1418 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Forget the buffered input by rewinding every buffer position to 0. */
1420 src->head = src->tail = 0;
1421 src->journal_pos = src->seg_pos = src->line_pos = 0;
1422 src->n_newlines = 0;
1423 src->suppress_next_newline = false;
/* Restart the segmenter, keeping the mode it already had. */
1424 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Drop the tokens queued in every stage, then queue a fresh T_ENDCMD in the
   lookahead stage. */
1426 lex_stage_clear (&src->pp);
1427 lex_stage_clear (&src->merge);
1428 lex_stage_clear (&src->lookahead);
1429 lex_source_push_endcmd__ (src);
1433 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1435 lex_discard_rest_of_command (struct lexer *lexer)
/* NOTE(review): the loop body (line 1438) is absent from this listing;
   presumably it advances the lexer by one token -- confirm. */
1437 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1441 /* Discards all lookahead tokens in LEXER, then discards all input sources
1442 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1443 runs out of input sources. */
1445 lex_discard_noninteractive (struct lexer *lexer)
1447 struct lex_source *src = lex_source__ (lexer);
/* NOTE(review): lines 1448-1450 are absent from this listing.  The clears
   below dereference SRC, so a null check presumably precedes them --
   confirm. */
1451 lex_stage_clear (&src->pp);
1452 lex_stage_clear (&src->merge);
1453 lex_stage_clear (&src->lookahead);
/* lex_source_destroy() unlinks the source from the lexer's list, so each
   lex_source__() call in the loop header yields the next remaining source. */
1455 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1456 src = lex_source__ (lexer))
1457 lex_source_destroy (src);
/* Computes the greatest position to which SRC's tail could be advanced (see
   lex_source_expand__()): the minimum of journal_pos, line_pos and, when some
   stage holds tokens, the line_pos of the first token in the first non-empty
   stage among 'lookahead', 'merge' and 'pp', checked in that order.

   NOTE(review): the return used when every stage is empty is on a line
   missing from this listing; presumably it returns max_tail -- confirm. */
1462 lex_source_max_tail__ (const struct lex_source *src_)
1464 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1466 assert (src->seg_pos >= src->line_pos);
1467 size_t max_tail = MIN (src->journal_pos, src->line_pos);
1469 /* Use the oldest token also. */
/* Elsewhere in this file tokens are shifted from 'pp' to 'merge' and from
   'merge' to 'lookahead', so the stages are listed here oldest first. */
1470 struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp };
1471 for (size_t i = 0; i < sizeof stages / sizeof *stages; i++)
1472 if (!lex_stage_is_empty (stages[i]))
1474 struct lex_token *first = lex_stage_first (stages[i]);
1475 assert (first->token_pos >= first->line_pos);
1476 return MIN (max_tail, first->line_pos);
/* Makes room for more input in SRC's buffer.  Nothing happens unless the
   buffered span (head - tail) fills the whole allocation.  In that case the
   bytes from lex_source_max_tail__() onward are slid to the front of the
   buffer and the tail is advanced; if the tail cannot advance, the buffer is
   grown with x2realloc() instead. */
1483 lex_source_expand__ (struct lex_source *src)
1485 if (src->head - src->tail >= src->allocated)
1487 size_t max_tail = lex_source_max_tail__ (src);
1488 if (max_tail > src->tail)
1490 /* Advance the tail, freeing up room at the head. */
1491 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1492 src->head - max_tail);
1493 src->tail = max_tail;
1497 /* Buffer is completely full. Expand it. */
1498 src->buffer = x2realloc (src->buffer, &src->allocated);
1503 /* There's space available at the head of the buffer. Nothing to do. */
/* Reads more input from SRC's reader into the free space at the head of the
   buffer.  Judging by the trailing `while', this is a do-while loop that
   repeats until the bytes from seg_pos to head contain a newline.

   NOTE(review): the loop opener, the test that selects the end-of-input path
   and the update of src->head are on lines missing from this listing. */
1508 lex_source_read__ (struct lex_source *src)
1512 lex_source_expand__ (src);
/* The free space runs from the head's offset in the buffer to the end of the
   allocation. */
1514 size_t head_ofs = src->head - src->tail;
1515 size_t space = src->allocated - head_ofs;
1516 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1517 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1519 assert (n <= space);
/* NOTE(review): the condition guarding the next two statements (lines
   1520-1523) is not visible; presumably it is N == 0 -- confirm. */
1524 src->reader->eof = true;
1525 lex_source_expand__ (src);
1531 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1532 src->head - src->seg_pos));
/* Returns the lex_source at the head of LEXER's list of sources, or NULL if
   the list is empty. */
1535 static struct lex_source *
1536 lex_source__ (const struct lexer *lexer)
1538 return (ll_is_empty (&lexer->sources) ? NULL
1539 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1542 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1543 one, through N1 ahead of the current one, inclusive. (For example, if N0
1544 and N1 are both zero, this requests the syntax for the current token.) The
1545 caller must eventually free the returned string (with free()). The syntax
1546 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1547 for example, it may include comments, spaces, and new-lines if it spans
1548 multiple tokens. Macro expansion, however, has already been performed. */
1550 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1552 struct string s = DS_EMPTY_INITIALIZER;
1553 for (size_t i = n0; i <= n1; )
1555 /* Find [I,J) as the longest sequence of tokens not produced by macro
1556 expansion, or otherwise the longest sequence expanded from a single
1558 const struct lex_token *first = lex_source_next__ (src, i);
1560 for (j = i + 1; j <= n1; j++)
1562 const struct lex_token *cur = lex_source_next__ (src, j);
1563 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1564 || first->macro_rep != cur->macro_rep)
1567 const struct lex_token *last = lex_source_next__ (src, j - 1);
1569 /* Now add the syntax for this sequence of tokens to S. */
/* Consecutive runs are separated by a single space. */
1570 if (!ds_is_empty (&s))
1571 ds_put_byte (&s, ' ');
/* Ordinary tokens: take the bytes from SRC's buffer, from the start of the
   first token to the end of the last. */
1572 if (!first->macro_rep)
1574 size_t start = first->token_pos;
1575 size_t end = last->token_pos + last->token_len;
1576 ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
/* Macro-expanded tokens: take the bytes from the expansion text macro_rep,
   located by each token's ofs and len. */
1581 size_t start = first->ofs;
1582 size_t end = last->ofs + last->len;
1583 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1589 return ds_steal_cstr (&s);
/* Checks each of tokens N0...N1 (inclusive) in SRC for a nonnull macro_rep,
   which is what tokens produced by macro expansion carry.

   NOTE(review): both return statements are on lines missing from this
   listing; presumably the result is true at the first such token and false
   if there is none -- confirm. */
1593 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1595 for (size_t i = n0; i <= n1; i++)
1596 if (lex_source_next__ (src, i)->macro_rep)
1601 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1602 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1603 other tokens included in that range. The syntax is encoded in UTF-8 and in
1604 the original form supplied to the lexer so that, for example, it may include
1605 comments, spaces, and new-lines if it spans multiple tokens.
1607 Returns an empty string if the token range doesn't include a macro call.
1609 The caller must not modify or free the returned string. */
1610 static struct substring
1611 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
/* NOTE(review): the value returned on this early exit (line 1614) is not
   visible; per the comment above it should be an empty substring. */
1613 if (!lex_source_contains_macro_call (src, n0, n1))
/* MAX() keeps the end token from preceding the start token if N1 < N0. */
1616 const struct lex_token *token0 = lex_source_next__ (src, n0);
1617 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1618 size_t start = token0->token_pos;
1619 size_t end = token1->token_pos + token1->token_len;
/* The result refers to bytes inside SRC's buffer; presumably nothing is
   copied, which is why the caller must not modify or free it. */
1621 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Composes a syntax error message (category MSG_C_SYNTAX, severity
   MSG_S_ERROR) about tokens N0...N1 in SRC, located via
   lex_source_get_location().

   The text begins with one of several "Syntax error ..." prefixes: at end of
   command when token N0 is T_ENDCMD, or otherwise naming the offending
   syntax and/or the macro call it was expanded from, each ellipsized.  The
   result of formatting FORMAT with ARGS follows after ": ", and a '.' is
   appended unless the text already ends with one.

   NOTE(review): the tests that choose among the prefixes, any check for a
   null FORMAT, and the code that emits the message are on lines missing from
   this listing -- confirm against the pristine file. */
1625 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1626 const char *format, va_list args)
1628 const struct lex_token *token;
1633 token = lex_source_next__ (src, n0);
1634 if (token->token.type == T_ENDCMD)
1635 ds_put_cstr (&s, _("Syntax error at end of command"));
1638 /* Get the syntax that caused the error. */
1639 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1641 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1644 /* Get the macro call(s) that expanded to the syntax that caused the
1647 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1654 _("Syntax error at `%s' (in expansion of `%s')"),
1657 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1662 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1665 ds_put_cstr (&s, _("Syntax error"));
1671 ds_put_cstr (&s, ": ");
1672 ds_put_vformat (&s, format, args);
1674 if (ds_last (&s) != '.')
1675 ds_put_byte (&s, '.');
1677 struct msg *m = xmalloc (sizeof *m);
1679 .category = MSG_C_SYNTAX,
1680 .severity = MSG_S_ERROR,
1681 .location = lex_source_get_location (src, n0, n1),
1682 .text = ds_steal_cstr (&s),
/* Composes a syntax error message (category MSG_C_SYNTAX, severity
   MSG_S_ERROR) for the single token TOKEN in SRC; it is called when
   tokenizing a segment yields TOKENIZE_ERROR.  The text has the form
   "Syntax error at `SYNTAX': DETAIL", where SYNTAX is the token's ellipsized
   source text and DETAIL is the token's string value, which presumably holds
   the tokenizer's explanation -- confirm.

   NOTE(review): the code that emits the message is on lines missing from
   this listing. */
1688 lex_get_error (struct lex_source *src, const struct lex_token *token)
1691 str_ellipsize (ss_buffer (&src->buffer[token->token_pos - src->tail],
1693 syntax, sizeof syntax);
1695 struct string s = DS_EMPTY_INITIALIZER;
1696 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1697 ds_put_format (&s, ": %s", token->token.string.string);
1699 struct msg *m = xmalloc (sizeof *m);
1701 .category = MSG_C_SYNTAX,
1702 .severity = MSG_S_ERROR,
1703 .location = lex_token_location_rw (src, token, token),
1704 .text = ds_steal_cstr (&s),
1709 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1710 underlying lex_reader if necessary. Returns true if a new token was added
1711 to SRC's deque, false otherwise. The caller should retry failures unless
1712 the reader's 'eof' marker was set to true indicating that there will be no
1713 more tokens from this source. */
1715 lex_source_try_get_pp (struct lex_source *src)
1717 /* Append a new token to SRC and initialize it. */
1718 struct lex_token *token = xmalloc (sizeof *token);
1719 token->token = (struct token) { .type = T_STOP };
1720 token->macro_rep = NULL;
1721 token->ref_cnt = NULL;
/* The token starts where segmentation left off, on the current line. */
1722 token->line_pos = src->line_pos;
1723 token->token_pos = src->seg_pos;
/* A reader whose line_number is not positive has no line numbers; the
   token's first_line is 0 in that case. */
1724 if (src->reader->line_number > 0)
1725 token->first_line = src->reader->line_number + src->n_newlines;
1727 token->first_line = 0;
1729 /* Extract a segment. */
1730 const char *segment;
1731 enum segment_type seg_type;
1735 segment = &src->buffer[src->seg_pos - src->tail];
1736 seg_len = segmenter_push (&src->segmenter, segment,
1737 src->head - src->seg_pos,
1738 src->reader->eof, &seg_type);
1742 /* The segmenter needs more input to produce a segment. */
1743 assert (!src->reader->eof);
1744 lex_source_read__ (src);
1747 /* Update state based on the segment. */
1748 token->token_len = seg_len;
1749 src->seg_pos += seg_len;
/* A newline segment means the next line starts at the updated seg_pos. */
1750 if (seg_type == SEG_NEWLINE)
1752 src->line_pos = src->seg_pos;
1756 /* Get a token from the segment. */
1757 enum tokenize_result result = token_from_segment (
1758 seg_type, ss_buffer (segment, seg_len), &token->token);
1760 /* If we've reached the end of a line, or the end of a command, then pass
1761 the line to the output engine as a syntax text item. */
1762 int n_lines = seg_type == SEG_NEWLINE;
/* NOTE(review): the statements that adjust n_lines inside the two branches
   below are on lines missing from this listing. */
1763 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1766 src->suppress_next_newline = true;
1768 else if (n_lines > 0 && src->suppress_next_newline)
1771 src->suppress_next_newline = false;
1773 for (int i = 0; i < n_lines; i++)
1775 /* Beginning of line. */
1776 const char *line = &src->buffer[src->journal_pos - src->tail];
1778 /* Calculate line length, including \n or \r\n end-of-line if present.
1780 We use src->head even though that may be beyond what we've actually
1781 converted to tokens (which is only through line_pos). That's because,
1782 if we're emitting the line due to SEG_END_COMMAND, we want to take the
1783 whole line through the newline, not just through the '.'. */
1784 size_t max_len = src->head - src->journal_pos;
1785 const char *newline = memchr (line, '\n', max_len);
1786 size_t line_len = newline ? newline - line + 1 : max_len;
1788 /* Calculate line length excluding end-of-line. */
1789 size_t copy_len = line_len;
1790 if (copy_len > 0 && line[copy_len - 1] == '\n')
1792 if (copy_len > 0 && line[copy_len - 1] == '\r')
1795 /* Submit the line as syntax. */
1796 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1797 xmemdup0 (line, copy_len),
1800 src->journal_pos += line_len;
/* NOTE(review): the switch header and the return statement of each case are
   on lines missing from this listing. */
1805 case TOKENIZE_ERROR:
1806 lex_get_error (src, token);
1808 case TOKENIZE_EMPTY:
1809 lex_token_destroy (token);
1812 case TOKENIZE_TOKEN:
/* A T_STOP token is turned into T_ENDCMD before it is queued. */
1813 if (token->token.type == T_STOP)
1815 token->token.type = T_ENDCMD;
1818 lex_stage_push_last (&src->pp, token);
1824 /* Attempts to append a new token to SRC. Returns true if successful, false on
1825 failure. On failure, the end of SRC has been reached and no more tokens
1826 will be forthcoming from it.
1828 Does not make the new token available for lookahead yet; the caller must
1829 adjust SRC's 'middle' pointer to do so. */
1831 lex_source_get_pp (struct lex_source *src)
/* NOTE(review): no 'middle' member appears in the visible code, which moves
   tokens between the 'pp', 'merge' and 'lookahead' stages instead, so the
   last sentence of the comment above may be stale.  The loop around the call
   below is on lines missing from this listing; presumably it retries until
   success or until the reader reaches end of input -- confirm. */
1834 if (lex_source_try_get_pp (src))
/* Tries to move tokens from SRC's 'pp' stage into its 'merge' stage,
   expanding a macro call if 'pp' starts with one.

   If 'pp' is empty it is first refilled with lex_source_get_pp().  When
   settings_get_mexpand() is off, every token in 'pp' is shifted to 'merge'
   unexpanded.  Otherwise the tokens in 'pp' are offered one at a time to a
   macro_call until it yields a nonzero N_CALL.  If there turns out to be no
   call, one token is shifted to 'merge'.  If there is a call, the tokens of
   its expansion are appended to 'merge' and the N_CALL tokens of the call are
   removed from 'pp'.

   The final return reports whether the expansion contained any tokens.
   NOTE(review): the other return statements, and the test that separates the
   "false alarm" path from a real call, are on lines missing from this
   listing. */
1840 lex_source_try_get_merge (const struct lex_source *src_)
1842 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1844 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1847 if (!settings_get_mexpand ())
1849 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1853 /* Now pass tokens one-by-one to the macro expander.
1855 In the common case where there is no macro to expand, the loop is not
1857 struct macro_call *mc;
1858 int n_call = macro_call_create (src->lexer->macros,
1859 &lex_stage_first (&src->pp)->token, &mc);
1860 for (int ofs = 1; !n_call; ofs++)
1862 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1864 /* This should not be reachable because we always get a T_ENDCMD at
1865 the end of an input file (transformed from T_STOP by
1866 lex_source_try_get_pp()) and the macro_expander should always
1867 terminate expansion on T_ENDCMD. */
1871 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1872 size_t start = t->token_pos;
1873 size_t end = t->token_pos + t->token_len;
/* The macro call receives the token's original source text and its
   location along with the token. */
1874 const struct macro_token mt = {
1876 .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
1878 const struct msg_location loc = lex_token_location (src, t, t);
1879 n_call = macro_call_add (mc, &mt, &loc);
1883 /* False alarm: no macro expansion after all. Use first token as
1884 lookahead. We'll retry macro expansion from the second token next
1886 macro_call_destroy (mc);
1887 lex_stage_shift (&src->merge, &src->pp, 1);
1891 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1892 are a macro call. (These are likely to be the only tokens in 'pp'.)
1894 const struct lex_token *c0 = lex_stage_first (&src->pp);
1895 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1896 struct macro_tokens expansion = { .n = 0 };
1897 struct msg_location loc = lex_token_location (src, c0, c1);
1898 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1899 macro_call_destroy (mc);
1901 /* Convert the macro expansion into syntax for possible error messages
1903 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1904 size_t *len = xnmalloc (expansion.n, sizeof *len);
1905 struct string s = DS_EMPTY_INITIALIZER;
1906 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1908 if (settings_get_mprint ())
1909 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1910 _("Macro Expansion")));
1912 /* Append the macro expansion tokens to the lookahead. */
1913 if (expansion.n > 0)
/* Every expansion token shares the one MACRO_REP string; REF_CNT starts at
   the number of tokens created here. */
1915 char *macro_rep = ds_steal_cstr (&s);
1916 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1917 *ref_cnt = expansion.n;
1918 for (size_t i = 0; i < expansion.n; i++)
1920 struct lex_token *token = xmalloc (sizeof *token);
/* Each expansion token takes its position, length and first line from the
   whole call C0...C1. */
1921 *token = (struct lex_token) {
1922 .token = expansion.mts[i].token,
1923 .token_pos = c0->token_pos,
1924 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1925 .line_pos = c0->line_pos,
1926 .first_line = c0->first_line,
1927 .macro_rep = macro_rep,
1932 lex_stage_push_last (&src->merge, token);
1934 ss_dealloc (&expansion.mts[i].syntax);
1939 free (expansion.mts);
1943 /* Destroy the tokens for the call. */
1944 for (size_t i = 0; i < n_call; i++)
1945 lex_stage_pop_first (&src->pp);
1947 return expansion.n > 0;
1950 /* Attempts to obtain at least one new token into 'merge' in SRC.
1952 Returns true if successful, false on failure. In the latter case, SRC is
1953 exhausted and 'src->reader->eof' is now true. */
1955 lex_source_get_merge (struct lex_source *src)
/* NOTE(review): the loop around the call below is on lines missing from this
   listing; presumably it retries until success or until the reader reaches
   end of input -- confirm. */
1958 if (lex_source_try_get_merge (src))
1963 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1965 Returns true if successful, false on failure. In the latter case, SRC is
1966 exhausted and 'src->reader->eof' is now true. */
1968 lex_source_get_lookahead (struct lex_source *src)
1970 struct merger m = MERGER_INIT;
1972 for (size_t i = 0; ; i++)
/* Make sure that 'merge' holds a token at index I before consulting the
   merger. */
1974 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1976 /* We always get a T_ENDCMD at the end of an input file
1977 (transformed from T_STOP by lex_source_try_get_pp()) and
1978 merger_add() should never return -1 on T_ENDCMD. */
1979 assert (lex_stage_is_empty (&src->merge));
/* NOTE(review): the test on RETVAL that guards the single-token shift below
   is on a line missing from this listing; presumably it is RETVAL == 0,
   meaning nothing is to be merged -- confirm. */
1983 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1987 lex_stage_shift (&src->lookahead, &src->merge, 1);
1990 else if (retval > 0)
1992 /* Add a token that merges all the tokens together. */
1993 const struct lex_token *first = lex_stage_first (&src->merge);
1994 const struct lex_token *last = lex_stage_nth (&src->merge,
1996 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1997 struct lex_token *t = xmalloc (sizeof *t);
1998 *t = (struct lex_token) {
2000 .token_pos = first->token_pos,
2001 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2002 .line_pos = first->line_pos,
2003 .first_line = first->first_line,
2005 /* This works well if all the tokens were not expanded from macros,
2006 or if they came from the same macro expansion. It just gives up
2007 in the other (corner) cases. */
2008 .macro_rep = macro ? first->macro_rep : NULL,
2009 .ofs = macro ? first->ofs : 0,
2010 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2011 .ref_cnt = macro ? first->ref_cnt : NULL,
2015 lex_stage_push_last (&src->lookahead, t);
/* RETVAL tokens went into the merged token; drop them from 'merge'. */
2017 for (int i = 0; i < retval; i++)
2018 lex_stage_pop_first (&src->merge);
/* Appends a T_ENDCMD token, with every other member zero, to SRC's
   'lookahead' stage.  That stage must be empty on entry. */
2025 lex_source_push_endcmd__ (struct lex_source *src)
2027 assert (lex_stage_is_empty (&src->lookahead));
2028 struct lex_token *token = xmalloc (sizeof *token);
2029 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2030 lex_stage_push_last (&src->lookahead, token);
/* Allocates a lex_source whose segmenter is initialized for READER's syntax
   mode, and queues an initial T_ENDCMD token in its 'lookahead' stage.

   NOTE(review): the remaining member initializers (presumably including the
   links to LEXER and READER), any list insertion and the return statement are
   on lines missing from this listing -- confirm. */
2033 static struct lex_source *
2034 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2036 struct lex_source *src = xmalloc (sizeof *src);
2037 *src = (struct lex_source) {
2039 .segmenter = segmenter_init (reader->syntax, false),
2043 lex_source_push_endcmd__ (src);
/* Tears down SRC: runs the reader class's destroy hook if there is one,
   uninitializes the three token stages, and unlinks SRC from the list it is
   on.

   NOTE(review): lines 2055-2057 and everything after line 2061 are missing
   from this listing; presumably they free the saved strings, the buffer and
   SRC itself -- confirm. */
2049 lex_source_destroy (struct lex_source *src)
/* The reader's strings are fetched before the destroy hook runs, presumably
   because the reader may no longer be valid afterward. */
2051 char *file_name = src->reader->file_name;
2052 char *encoding = src->reader->encoding;
2053 if (src->reader->class->destroy != NULL)
2054 src->reader->class->destroy (src->reader);
2058 lex_stage_uninit (&src->pp);
2059 lex_stage_uninit (&src->merge);
2060 lex_stage_uninit (&src->lookahead);
2061 ll_remove (&src->ll);
/* A lex_reader that takes its input from a u8_istream.  'reader' is the
   embedded generic reader from which lex_file_reader_cast() recovers the
   enclosing structure; 'istream' is the stream that is read. */
2065 struct lex_file_reader
2067 struct lex_reader reader;
2068 struct u8_istream *istream;
/* Forward declaration; lex_reader_for_file() refers to the class before it
   is defined. */
2071 static struct lex_reader_class lex_file_reader_class;
2073 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2074 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2075 ENCODING, which should take one of the forms accepted by
2076 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2077 mode of the new reader, respectively.
2079 Returns a null pointer if FILE_NAME cannot be opened. */
2081 lex_reader_for_file (const char *file_name, const char *encoding,
2082 enum segmenter_mode syntax,
2083 enum lex_error_mode error)
2085 struct lex_file_reader *r;
2086 struct u8_istream *istream;
2088 istream = (!strcmp(file_name, "-")
2089 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2090 : u8_istream_for_file (encoding, file_name, O_RDONLY));
/* On failure the reason is taken from errno, which the failed open is
   presumably expected to have set. */
2091 if (istream == NULL)
2093 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2097 r = xmalloc (sizeof *r);
2098 lex_reader_init (&r->reader, &lex_file_reader_class);
2099 r->reader.syntax = syntax;
2100 r->reader.error = error;
/* FILE_NAME is duplicated, as is ENCODING when it is nonnull. */
2101 r->reader.file_name = xstrdup (file_name);
2102 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* Line numbering for a file starts at 1. */
2103 r->reader.line_number = 1;
2104 r->istream = istream;
/* Converts R, which must be the 'reader' member of a struct lex_file_reader,
   into a pointer to that enclosing structure. */
2109 static struct lex_file_reader *
2110 lex_file_reader_cast (struct lex_reader *r)
2112 return UP_CAST (r, struct lex_file_reader, reader);
/* Read function for file readers: asks u8_istream_read() for up to N bytes
   into BUF.  The prompt style is ignored.

   NOTE(review): the test that guards the error message and the return
   statements are on lines missing from this listing; presumably a negative
   read count is reported and turned into 0 -- confirm. */
2116 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2117 enum prompt_style prompt_style UNUSED)
2119 struct lex_file_reader *r = lex_file_reader_cast (r_);
2120 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2123 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Close function for file readers.  A stream on any descriptor other than
   standard input is closed with u8_istream_close(), and a failure to close is
   reported.

   NOTE(review): line 2139 is missing from this listing; presumably it is an
   `else', so that u8_istream_free() below handles only the standard input
   case and leaves the descriptor open.  The code that frees the reader itself
   is not visible either -- confirm. */
2130 lex_file_close (struct lex_reader *r_)
2132 struct lex_file_reader *r = lex_file_reader_cast (r_);
2134 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2136 if (u8_istream_close (r->istream) != 0)
2137 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2140 u8_istream_free (r->istream);
/* The reader class for file readers.  NOTE(review): the initializer is on
   lines missing from this listing; presumably it names lex_file_read and
   lex_file_close -- confirm. */
2145 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that takes its input from a string held in memory.  'reader'
   is the embedded generic reader from which lex_string_reader_cast() recovers
   the enclosing structure.

   NOTE(review): the remaining members are on lines missing from this listing;
   lex_string_read() uses a substring member 's' and a position member
   'offset'. */
2151 struct lex_string_reader
2153 struct lex_reader reader;
/* Forward declaration; lex_reader_for_substring_nocopy() refers to the class
   before it is defined. */
2158 static struct lex_reader_class lex_string_reader_class;
2160 /* Creates and returns a new lex_reader for the contents of S, which must be
2161 encoded in the given ENCODING. The new reader takes ownership of S and will
2162 free it with ss_dealloc() when it is closed. */
2164 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2166 struct lex_string_reader *r;
2168 r = xmalloc (sizeof *r);
2169 lex_reader_init (&r->reader, &lex_string_reader_class);
/* A string reader always gets SEG_MODE_AUTO as its syntax mode.  ENCODING is
   duplicated when it is nonnull. */
2170 r->reader.syntax = SEG_MODE_AUTO;
2171 r->reader.encoding = xstrdup_if_nonnull (encoding);
2178 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2179 which must be encoded in ENCODING. The caller retains ownership of S. */
2181 lex_reader_for_string (const char *s, const char *encoding)
/* Make the copy here, then hand it over to the no-copy constructor. */
2183 struct substring ss;
2184 ss_alloc_substring (&ss, ss_cstr (s));
2185 return lex_reader_for_substring_nocopy (ss, encoding);
2188 /* Formats FORMAT as a printf()-like format string and creates and returns a
2189 new lex_reader for the formatted result, which must be encoded in ENCODING.
   The arguments for FORMAT come after ENCODING. */
2191 lex_reader_for_format (const char *format, const char *encoding, ...)
2193 struct lex_reader *r;
/* The variable arguments start after ENCODING, the last named parameter, not
   after FORMAT. */
2196 va_start (args, encoding);
/* The formatted string is handed to the no-copy constructor, so no second
   copy of it is made. */
2197 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Converts R, which must be the 'reader' member of a struct
   lex_string_reader, into a pointer to that enclosing structure. */
2203 static struct lex_string_reader *
2204 lex_string_reader_cast (struct lex_reader *r)
2206 return UP_CAST (r, struct lex_string_reader, reader);
/* Read function for string readers: copies into BUF at most N bytes of the
   reader's string, starting at the reader's current offset and never going
   past the end of the string.  The prompt style is ignored.

   NOTE(review): the declaration of `chunk', the update of the offset and the
   return statement are on lines missing from this listing; presumably the
   offset advances by the chunk size, which is also returned -- confirm. */
2210 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2211 enum prompt_style prompt_style UNUSED)
2213 struct lex_string_reader *r = lex_string_reader_cast (r_);
2216 chunk = MIN (n, r->s.length - r->offset);
2217 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string readers.  NOTE(review): everything after the
   cast is on lines missing from this listing; according to the comment on
   lex_reader_for_substring_nocopy() the string is released with ss_dealloc()
   when the reader is closed -- confirm. */
2224 lex_string_close (struct lex_reader *r_)
2226 struct lex_string_reader *r = lex_string_reader_cast (r_);
2232 static struct lex_reader_class lex_string_reader_class =