1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/output-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* For a token obtained through the lexer in an ordinary way, this is the
65 location of the token in terms of the lex_source's buffer.
67 For a token produced through macro expansion, this is the entire macro
69 size_t token_pos; /* Offset into src->buffer of token start. */
70 size_t token_len; /* Length of source for token in bytes. */
71 size_t line_pos; /* Start of line containing token_pos. */
72 int first_line; /* Line number at token_pos. */
74 /* For a token obtained through macro expansion, this is just this token.
76 For a token obtained through the lexer in an ordinary way, these are
78 char *macro_rep; /* The whole macro expansion. */
79 size_t ofs; /* Offset of this token in macro_rep. */
80 size_t len; /* Length of this token in macro_rep. */
81 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
85 lex_token_destroy (struct lex_token *t)
87 token_uninit (&t->token);
90 assert (*t->ref_cnt > 0);
100 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
105 struct lex_token **tokens;
108 static void lex_stage_clear (struct lex_stage *);
109 static void lex_stage_uninit (struct lex_stage *);
111 static size_t lex_stage_count (const struct lex_stage *);
112 static bool lex_stage_is_empty (const struct lex_stage *);
114 static struct lex_token *lex_stage_last (struct lex_stage *);
115 static struct lex_token *lex_stage_first (struct lex_stage *);
116 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
118 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
119 static void lex_stage_pop_first (struct lex_stage *);
121 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
124 /* Deletes all the tokens from STAGE. */
126 lex_stage_clear (struct lex_stage *stage)
128 while (!deque_is_empty (&stage->deque))
129 lex_stage_pop_first (stage);
132 /* Deletes all the tokens from STAGE and frees storage for the deque. */
134 lex_stage_uninit (struct lex_stage *stage)
136 lex_stage_clear (stage);
137 free (stage->tokens);
140 /* Returns true if STAGE contains no tokens, otherwise false. */
142 lex_stage_is_empty (const struct lex_stage *stage)
144 return deque_is_empty (&stage->deque);
147 /* Returns the number of tokens in STAGE. */
149 lex_stage_count (const struct lex_stage *stage)
151 return deque_count (&stage->deque);
154 /* Returns the last token in STAGE, which must be nonempty. The last token is
155 the one accessed with the greatest lookahead. */
156 static struct lex_token *
157 lex_stage_last (struct lex_stage *stage)
159 return stage->tokens[deque_front (&stage->deque, 0)];
162 /* Returns the first token in STAGE, which must be nonempty.
163 The first token is the one accessed with the least lookahead. */
164 static struct lex_token *
165 lex_stage_first (struct lex_stage *stage)
167 return lex_stage_nth (stage, 0);
170 /* Returns the token at the given INDEX in STAGE. The first token (with the least
171 lookahead) is 0, the second token is 1, and so on. There must be at least
172 INDEX + 1 tokens in STAGE. */
173 static struct lex_token *
174 lex_stage_nth (struct lex_stage *stage, size_t index)
176 return stage->tokens[deque_back (&stage->deque, index)];
179 /* Adds TOKEN so that it becomes the last token in STAGE. */
181 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
183 if (deque_is_full (&stage->deque))
184 stage->tokens = deque_expand (&stage->deque, stage->tokens,
185 sizeof *stage->tokens);
186 stage->tokens[deque_push_front (&stage->deque)] = token;
189 /* Removes the first token from STAGE and uninitializes it. */
191 lex_stage_pop_first (struct lex_stage *stage)
193 lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
196 /* Removes the first N tokens from SRC, appending them to DST as the last
199 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
201 for (size_t i = 0; i < n; i++)
203 lex_stage_push_last (dst, lex_stage_first (src));
204 deque_pop_back (&src->deque);
208 /* A source of tokens, corresponding to a syntax file.
210 This is conceptually a lex_reader wrapped with everything needed to convert
211 its UTF-8 bytes into tokens. */
214 struct ll ll; /* In lexer's list of sources. */
215 struct lex_reader *reader;
217 struct segmenter segmenter;
218 bool eof; /* True if T_STOP was read from 'reader'. */
220 /* Buffer of UTF-8 bytes. */
221 char *buffer; /* Source file contents. */
222 size_t length; /* Number of bytes filled. */
223 size_t allocated; /* Number of bytes allocated. */
225 /* Offsets into 'buffer'. */
226 size_t journal_pos; /* First byte not yet output to journal. */
227 size_t seg_pos; /* First byte not yet scanned as token. */
228 size_t line_pos; /* First byte of line containing seg_pos. */
230 int n_newlines; /* Number of new-lines up to seg_pos. */
231 bool suppress_next_newline;
235 This is a pipeline with the following stages. Each token eventually
236 made available to the parser passes through each of these stages. The stages
237 are named after the processing that happens in each one.
239 Initially, tokens come from the segmenter and scanner to 'pp':
241 - pp: Tokens that need to pass through the macro preprocessor to end up
244 - merge: Tokens that need to pass through scan_merge() to end up in
247 - lookahead: Tokens available to the client for parsing. */
249 struct lex_stage merge;
250 struct lex_stage lookahead;
253 static struct lex_source *lex_source_create (struct lexer *,
254 struct lex_reader *);
255 static void lex_source_destroy (struct lex_source *);
260 struct ll_list sources; /* Contains "struct lex_source"s. */
261 struct macro_set *macros;
264 static struct lex_source *lex_source__ (const struct lexer *);
265 static char *lex_source_get_syntax__ (const struct lex_source *,
267 static const struct lex_token *lex_next__ (const struct lexer *, int n);
268 static void lex_source_push_endcmd__ (struct lex_source *);
270 static bool lex_source_get_lookahead (struct lex_source *);
271 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
272 const char *format, va_list)
273 PRINTF_FORMAT (4, 0);
274 static const struct lex_token *lex_source_next__ (const struct lex_source *,
277 /* Initializes READER with the specified CLASS and otherwise some reasonable
278 defaults. The caller should fill in the other members as desired. */
280 lex_reader_init (struct lex_reader *reader,
281 const struct lex_reader_class *class)
283 reader->class = class;
284 reader->syntax = SEG_MODE_AUTO;
285 reader->error = LEX_ERROR_CONTINUE;
286 reader->file_name = NULL;
287 reader->encoding = NULL;
288 reader->line_number = 0;
292 /* Frees any file name already in READER and replaces it by a copy of
293 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
295 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
297 free (reader->file_name);
298 reader->file_name = xstrdup_if_nonnull (file_name);
301 /* Creates and returns a new lexer. */
305 struct lexer *lexer = xmalloc (sizeof *lexer);
306 *lexer = (struct lexer) {
307 .sources = LL_INITIALIZER (lexer->sources),
308 .macros = macro_set_create (),
313 /* Destroys LEXER. */
315 lex_destroy (struct lexer *lexer)
319 struct lex_source *source, *next;
321 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
322 lex_source_destroy (source);
323 macro_set_destroy (lexer->macros);
328 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
329 same name. Takes ownership of M. */
331 lex_define_macro (struct lexer *lexer, struct macro *m)
333 macro_set_add (lexer->macros, m);
336 /* Inserts READER into LEXER so that the next token read by LEXER comes from
337 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
340 lex_include (struct lexer *lexer, struct lex_reader *reader)
342 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
343 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
346 /* Appends READER to LEXER, so that it will be read after all other current
347 readers have already been read. */
349 lex_append (struct lexer *lexer, struct lex_reader *reader)
351 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
356 /* Advances LEXER to the next token, consuming the current token. */
358 lex_get (struct lexer *lexer)
360 struct lex_source *src;
362 src = lex_source__ (lexer);
366 if (!lex_stage_is_empty (&src->lookahead))
367 lex_stage_pop_first (&src->lookahead);
369 while (lex_stage_is_empty (&src->lookahead))
370 if (!lex_source_get_lookahead (src))
372 lex_source_destroy (src);
373 src = lex_source__ (lexer);
379 /* Advances LEXER by N tokens. */
381 lex_get_n (struct lexer *lexer, size_t n)
387 /* Issuing errors. */
389 /* Prints a syntax error message containing the current token and the
390 message formatted from FORMAT and the following arguments (if FORMAT is non-null). */
392 lex_error (struct lexer *lexer, const char *format, ...)
396 va_start (args, format);
397 lex_next_error_valist (lexer, 0, 0, format, args);
401 /* Prints a syntax error message containing the current token and the
402 message formatted from FORMAT and ARGS (if FORMAT is non-null). */
404 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
406 lex_next_error_valist (lexer, 0, 0, format, args);
409 /* Prints a syntax error message covering tokens N0 through N1 ahead of the
410 current one, with the message formatted from FORMAT and the following arguments (if FORMAT is non-null). */
412 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
416 va_start (args, format);
417 lex_next_error_valist (lexer, n0, n1, format, args);
421 /* Prints a syntax error message saying that one of the strings provided as
422 varargs, up to the first NULL, is expected. */
424 (lex_error_expecting) (struct lexer *lexer, ...)
428 va_start (args, lexer);
429 lex_error_expecting_valist (lexer, args);
433 /* Prints a syntax error message saying that one of the options provided in
434 ARGS, up to the first NULL, is expected. */
436 lex_error_expecting_valist (struct lexer *lexer, va_list args)
438 enum { MAX_OPTIONS = 9 };
439 const char *options[MAX_OPTIONS];
441 while (n < MAX_OPTIONS)
443 const char *option = va_arg (args, const char *);
447 options[n++] = option;
449 lex_error_expecting_array (lexer, options, n);
453 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
458 lex_error (lexer, NULL);
462 lex_error (lexer, _("expecting %s"), options[0]);
466 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
470 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
475 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
476 options[0], options[1], options[2], options[3]);
480 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
481 options[0], options[1], options[2], options[3], options[4]);
485 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
486 options[0], options[1], options[2], options[3], options[4],
491 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
492 options[0], options[1], options[2], options[3], options[4],
493 options[5], options[6]);
497 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
498 options[0], options[1], options[2], options[3], options[4],
499 options[5], options[6], options[7]);
503 lex_error (lexer, NULL);
507 /* Reports an error to the effect that subcommand SBC may only be specified
510 This function does not take a lexer as an argument or use lex_error(),
511 because the result would ordinarily just be redundant: "Syntax error at
512 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
513 not help the user find the error. */
515 lex_sbc_only_once (const char *sbc)
517 msg (SE, _("Subcommand %s may only be specified once."), sbc);
520 /* Reports an error to the effect that subcommand SBC is missing.
522 This function does not take a lexer as an argument or use lex_error(),
523 because a missing subcommand can normally be detected only after the whole
524 command has been parsed, and so lex_error() would always report "Syntax
525 error at end of command", which does not help the user find the error. */
527 lex_sbc_missing (const char *sbc)
529 msg (SE, _("Required subcommand %s was not specified."), sbc);
532 /* Reports an error to the effect that specification SPEC may only be specified
533 once within subcommand SBC. */
535 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
537 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
541 /* Reports an error to the effect that specification SPEC is missing within
544 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
546 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
550 /* Prints a syntax error message covering tokens N0 through N1 ahead of the
551 current one, with the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
553 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
554 const char *format, va_list args)
556 struct lex_source *src = lex_source__ (lexer);
559 lex_source_error_valist (src, n0, n1, format, args);
565 ds_put_format (&s, _("Syntax error at end of input"));
568 ds_put_cstr (&s, ": ");
569 ds_put_vformat (&s, format, args);
571 if (ds_last (&s) != '.')
572 ds_put_byte (&s, '.');
573 msg (SE, "%s", ds_cstr (&s));
578 /* Checks that we're at end of command.
579 If so, returns a successful command completion code.
580 If not, flags a syntax error and returns an error command
583 lex_end_of_command (struct lexer *lexer)
585 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
587 lex_error (lexer, _("expecting end of command"));
594 /* Token testing functions. */
596 /* Returns true if the current token is a number. */
598 lex_is_number (const struct lexer *lexer)
600 return lex_next_is_number (lexer, 0);
603 /* Returns true if the current token is a string. */
605 lex_is_string (const struct lexer *lexer)
607 return lex_next_is_string (lexer, 0);
610 /* Returns the value of the current token, which must be a
611 floating point number. */
613 lex_number (const struct lexer *lexer)
615 return lex_next_number (lexer, 0);
618 /* Returns true iff the current token is an integer. */
620 lex_is_integer (const struct lexer *lexer)
622 return lex_next_is_integer (lexer, 0);
625 /* Returns the value of the current token, which must be an
628 lex_integer (const struct lexer *lexer)
630 return lex_next_integer (lexer, 0);
633 /* Token testing functions with lookahead.
635 A value of 0 for N as an argument to any of these functions refers to the
636 current token. Lookahead is limited to the current command. Any N greater
637 than the number of tokens remaining in the current command will be treated
638 as referring to a T_ENDCMD token. */
640 /* Returns true if the token N ahead of the current token is a number. */
642 lex_next_is_number (const struct lexer *lexer, int n)
644 return token_is_number (lex_next (lexer, n));
647 /* Returns true if the token N ahead of the current token is a string. */
649 lex_next_is_string (const struct lexer *lexer, int n)
651 return token_is_string (lex_next (lexer, n));
654 /* Returns the value of the token N ahead of the current token, which must be a
655 floating point number. */
657 lex_next_number (const struct lexer *lexer, int n)
659 return token_number (lex_next (lexer, n));
662 /* Returns true if the token N ahead of the current token is an integer. */
664 lex_next_is_integer (const struct lexer *lexer, int n)
666 return token_is_integer (lex_next (lexer, n));
669 /* Returns the value of the token N ahead of the current token, which must be
672 lex_next_integer (const struct lexer *lexer, int n)
674 return token_integer (lex_next (lexer, n));
677 /* Token matching functions. */
679 /* If the current token has the specified TYPE, skips it and returns true.
680 Otherwise, returns false. */
682 lex_match (struct lexer *lexer, enum token_type type)
684 if (lex_token (lexer) == type)
693 /* If the current token matches IDENTIFIER, skips it and returns true.
694 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
697 IDENTIFIER must be an ASCII string. */
699 lex_match_id (struct lexer *lexer, const char *identifier)
701 return lex_match_id_n (lexer, identifier, 3);
704 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
705 may be abbreviated to its first N letters. Otherwise, returns false.
707 IDENTIFIER must be an ASCII string. */
709 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
711 if (lex_token (lexer) == T_ID
712 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
721 /* If the current token is integer X, skips it and returns true. Otherwise,
724 lex_match_int (struct lexer *lexer, int x)
726 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
735 /* Forced matches. */
737 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
738 abbreviated to its first 3 letters. Otherwise, reports an error and returns
741 IDENTIFIER must be an ASCII string. */
743 lex_force_match_id (struct lexer *lexer, const char *identifier)
745 if (lex_match_id (lexer, identifier))
749 lex_error_expecting (lexer, identifier);
754 /* If the current token has the specified TYPE, skips it and returns true.
755 Otherwise, reports an error and returns false. */
757 lex_force_match (struct lexer *lexer, enum token_type type)
759 if (lex_token (lexer) == type)
766 const char *type_string = token_type_to_string (type);
769 char *s = xasprintf ("`%s'", type_string);
770 lex_error_expecting (lexer, s);
774 lex_error_expecting (lexer, token_type_to_name (type));
780 /* If the current token is a string, does nothing and returns true.
781 Otherwise, reports an error and returns false. */
783 lex_force_string (struct lexer *lexer)
785 if (lex_is_string (lexer))
789 lex_error (lexer, _("expecting string"));
794 /* If the current token is a string or an identifier, does nothing and returns
795 true. Otherwise, reports an error and returns false.
797 This is meant for use in syntactic situations where we want to encourage the
798 user to supply a quoted string, but for compatibility we also accept
799 identifiers. (One example of such a situation is file names.) Therefore,
800 the error message issued when the current token is wrong only says that a
801 string is expected and doesn't mention that an identifier would also be
804 lex_force_string_or_id (struct lexer *lexer)
806 return lex_token (lexer) == T_ID || lex_force_string (lexer);
809 /* If the current token is an integer, does nothing and returns true.
810 Otherwise, reports an error and returns false. */
812 lex_force_int (struct lexer *lexer)
814 if (lex_is_integer (lexer))
818 lex_error (lexer, _("expecting integer"));
823 /* If the current token is an integer in the range MIN...MAX (inclusive), does
824 nothing and returns true. Otherwise, reports an error and returns false.
825 If NAME is nonnull, then it is used in the error message. */
827 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
829 bool is_number = lex_is_number (lexer);
830 bool is_integer = lex_is_integer (lexer);
831 bool too_small = (is_integer ? lex_integer (lexer) < min
832 : is_number ? lex_number (lexer) < min
834 bool too_big = (is_integer ? lex_integer (lexer) > max
835 : is_number ? lex_number (lexer) > max
837 if (is_integer && !too_small && !too_big)
842 /* Weird, maybe a bug in the caller. Just report that we needed an
845 lex_error (lexer, _("Integer expected for %s."), name);
847 lex_error (lexer, _("Integer expected."));
852 lex_error (lexer, _("Expected %ld for %s."), min, name);
854 lex_error (lexer, _("Expected %ld."), min);
856 else if (min + 1 == max)
859 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
861 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
865 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
866 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
868 if (report_lower_bound && report_upper_bound)
872 _("Expected integer between %ld and %ld for %s."),
875 lex_error (lexer, _("Expected integer between %ld and %ld."),
878 else if (report_lower_bound)
883 lex_error (lexer, _("Expected non-negative integer for %s."),
886 lex_error (lexer, _("Expected non-negative integer."));
891 lex_error (lexer, _("Expected positive integer for %s."),
894 lex_error (lexer, _("Expected positive integer."));
899 lex_error (lexer, _("Expected integer %ld or greater for %s."),
902 lex_error (lexer, _("Expected integer %ld or greater."), min);
905 else if (report_upper_bound)
909 _("Expected integer less than or equal to %ld for %s."),
912 lex_error (lexer, _("Expected integer less than or equal to %ld."),
918 lex_error (lexer, _("Integer expected for %s."), name);
920 lex_error (lexer, _("Integer expected."));
926 /* If the current token is a number, does nothing and returns true.
927 Otherwise, reports an error and returns false. */
929 lex_force_num (struct lexer *lexer)
931 if (lex_is_number (lexer))
934 lex_error (lexer, _("expecting number"));
938 /* If the current token is an identifier, does nothing and returns true.
939 Otherwise, reports an error and returns false. */
941 lex_force_id (struct lexer *lexer)
943 if (lex_token (lexer) == T_ID)
946 lex_error (lexer, _("expecting identifier"));
950 /* Token accessors. */
952 /* Returns the type of LEXER's current token. */
954 lex_token (const struct lexer *lexer)
956 return lex_next_token (lexer, 0);
959 /* Returns the number in LEXER's current token.
961 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
962 tokens this function will always return zero. */
964 lex_tokval (const struct lexer *lexer)
966 return lex_next_tokval (lexer, 0);
969 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
971 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
972 this function will always return NULL.
974 The UTF-8 encoding of the returned string is correct for variable names and
975 other identifiers. Use filename_to_utf8() to use it as a filename. Use
976 data_in() to use it in a "union value". */
978 lex_tokcstr (const struct lexer *lexer)
980 return lex_next_tokcstr (lexer, 0);
983 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
984 null-terminated (but the null terminator is not included in the returned
985 substring's 'length').
987 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
988 this function will always return NULL.
990 The UTF-8 encoding of the returned string is correct for variable names and
991 other identifiers. Use filename_to_utf8() to use it as a filename. Use
992 data_in() to use it in a "union value". */
994 lex_tokss (const struct lexer *lexer)
996 return lex_next_tokss (lexer, 0);
1001 A value of 0 for N as an argument to any of these functions refers to the
1002 current token. Lookahead is limited to the current command. Any N greater
1003 than the number of tokens remaining in the current command will be treated
1004 as referring to a T_ENDCMD token. */
1006 static const struct lex_token *
1007 lex_next__ (const struct lexer *lexer_, int n)
1009 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1010 struct lex_source *src = lex_source__ (lexer);
1013 return lex_source_next__ (src, n);
1016 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1021 static const struct lex_token *
1022 lex_source_next__ (const struct lex_source *src_, int n)
1024 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1025 while (lex_stage_count (&src->lookahead) <= n)
1027 if (!lex_stage_is_empty (&src->lookahead))
1029 const struct lex_token *t = lex_stage_last (&src->lookahead);
1030 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1034 lex_source_get_lookahead (src);
1037 return lex_stage_nth (&src->lookahead, n);
1040 /* Returns the "struct token" of the token N after the current one in LEXER.
1041 The returned pointer can be invalidated by pretty much any succeeding call
1042 into the lexer, although the string pointer within the returned token is
1043 only invalidated by consuming the token (e.g. with lex_get()). */
1044 const struct token *
1045 lex_next (const struct lexer *lexer, int n)
1047 return &lex_next__ (lexer, n)->token;
1050 /* Returns the type of the token N after the current one in LEXER. */
1052 lex_next_token (const struct lexer *lexer, int n)
1054 return lex_next (lexer, n)->type;
1057 /* Returns the number in the token N after the current one in LEXER.
1059 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1060 tokens this function will always return zero. */
1062 lex_next_tokval (const struct lexer *lexer, int n)
1064 return token_number (lex_next (lexer, n));
1067 /* Returns the null-terminated string in the token N after the current one, in
1070 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1071 this function will always return NULL.
1073 The UTF-8 encoding of the returned string is correct for variable names and
1074 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1075 data_in() to use it in a "union value". */
1077 lex_next_tokcstr (const struct lexer *lexer, int n)
1079 return lex_next_tokss (lexer, n).string;
1082 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1083 The string is null-terminated (but the null terminator is not included in
1084 the returned substring's 'length').
1086 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1087 tokens this function will always return NULL.
1089 The UTF-8 encoding of the returned string is correct for variable names and
1090 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1091 data_in() to use it in a "union value". */
1093 lex_next_tokss (const struct lexer *lexer, int n)
1095 return lex_next (lexer, n)->string;
1098 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1099 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1100 are both zero, this requests the syntax for the current token.) The caller
1101 must eventually free the returned string (with free()). The syntax is
1102 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1103 example, it may include comments, spaces, and new-lines if it spans multiple
1104 tokens. Macro expansion, however, has already been performed. */
1106 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1108 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1111 /* Returns true if the token N ahead of the current one was produced by macro
1112 expansion, false otherwise. */
1114 lex_next_is_from_macro (const struct lexer *lexer, int n)
1116 return lex_next__ (lexer, n)->macro_rep != NULL;
1120 lex_tokens_match (const struct token *actual, const struct token *expected)
1122 if (actual->type != expected->type)
1125 switch (actual->type)
1129 return actual->number == expected->number;
1132 return lex_id_match (expected->string, actual->string);
1135 return (actual->string.length == expected->string.length
1136 && !memcmp (actual->string.string, expected->string.string,
1137 actual->string.length));
1145 lex_at_phrase__ (struct lexer *lexer, const char *s)
1147 struct string_lexer slex;
1151 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1152 while (string_lexer_next (&slex, &token))
1154 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1155 token_uninit (&token);
1162 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1163 returns true. Otherwise, returns false.
1165 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1166 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1167 first three letters. */
1169 lex_at_phrase (struct lexer *lexer, const char *s)
1171 return lex_at_phrase__ (lexer, s) > 0;
1174 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1175 skips it and returns true. Otherwise, returns false.
1177 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1178 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1179 first three letters. */
1181 lex_match_phrase (struct lexer *lexer, const char *s)
1183 size_t n = lex_at_phrase__ (lexer, s);
1185 lex_get_n (lexer, n);
1190 count_newlines (char *s, size_t length)
1195 while ((newline = memchr (s, '\n', length)) != NULL)
1198 length -= (newline + 1) - s;
1206 lex_token_get_last_line_number (const struct lex_source *src,
1207 const struct lex_token *token)
1209 if (token->first_line == 0)
1213 char *token_str = &src->buffer[token->token_pos];
1214 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1219 lex_token_get_first_column (const struct lex_source *src,
1220 const struct lex_token *token)
1222 return utf8_count_columns (&src->buffer[token->line_pos],
1223 token->token_pos - token->line_pos) + 1;
1227 lex_token_get_last_column (const struct lex_source *src,
1228 const struct lex_token *token)
1230 char *start, *end, *newline;
1232 start = &src->buffer[token->line_pos];
1233 end = &src->buffer[token->token_pos + token->token_len];
1234 newline = memrchr (start, '\n', end - start);
1235 if (newline != NULL)
1236 start = newline + 1;
1237 return utf8_count_columns (start, end - start) + 1;
1240 static struct msg_location
1241 lex_token_location (const struct lex_source *src,
1242 const struct lex_token *t0,
1243 const struct lex_token *t1)
1245 return (struct msg_location) {
1246 .file_name = src->reader->file_name,
1247 .first_line = t0->first_line,
1248 .last_line = lex_token_get_last_line_number (src, t1),
1249 .first_column = lex_token_get_first_column (src, t0),
1250 .last_column = lex_token_get_last_column (src, t1),
1254 static struct msg_location *
1255 lex_token_location_rw (const struct lex_source *src,
1256 const struct lex_token *t0,
1257 const struct lex_token *t1)
1259 struct msg_location location = lex_token_location (src, t0, t1);
1260 return msg_location_dup (&location);
/* Returns the result of lex_token_location_rw() for the two tokens that
   lex_source_next__() yields for offsets N0 and N1 in SRC. */
1263 static struct msg_location *
1264 lex_source_get_location (const struct lex_source *src, int n0, int n1)
1266 return lex_token_location_rw (src,
1267 lex_source_next__ (src, n0),
1268 lex_source_next__ (src, n1));
1271 /* Returns the 1-based line number of the start of the syntax that represents
1272 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1273 if the token is drawn from a source that does not have line numbers.

   The value is the first_line member of the token that lex_source_next__()
   returns for N.  0 is also returned when LEXER has no current source. */
1275 lex_get_first_line_number (const struct lexer *lexer, int n)
1277 const struct lex_source *src = lex_source__ (lexer);
1278 return src ? lex_source_next__ (src, n)->first_line : 0;
/* lex_get_last_line_number(): forwards to lex_token_get_last_line_number() for
   the token that lex_source_next__() returns for N, or yields 0 when LEXER has
   no current source.
   NOTE(review): the original comment that follows has lost its closing
   delimiter and part of its first paragraph in this listing (the embedded
   numbering skips 1284-1285 and 1290-1291). */
1281 /* Returns the 1-based line number of the end of the syntax that represents the
1282 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1283 token or if the token is drawn from a source that does not have line
1286 Most of the time, a single token is wholly within a single line of syntax,
1287 but there are two exceptions: a T_STRING token can be made up of multiple
1288 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1289 token can consist of a "-" on one line followed by the number on the next.
1292 lex_get_last_line_number (const struct lexer *lexer, int n)
1294 const struct lex_source *src = lex_source__ (lexer);
1295 return src ? lex_token_get_last_line_number (src,
1296 lex_source_next__ (src, n)) : 0;
1299 /* Returns the 1-based column number of the start of the syntax that represents
1300 the token N after the current one in LEXER. Returns 0 for a T_STOP
1303 Column numbers are measured according to the width of characters as shown in
1304 a typical fixed-width font, in which CJK characters have width 2 and
1305 combining characters have width 0.

   The column is computed by lex_token_get_first_column() on the token that
   lex_source_next__() returns for N; 0 is returned when LEXER has no current
   source.
   NOTE(review): the sentence above that begins with Returns 0 is cut off in
   this listing (the embedded numbering skips 1301-1302). */
1307 lex_get_first_column (const struct lexer *lexer, int n)
1309 const struct lex_source *src = lex_source__ (lexer);
1310 return src ? lex_token_get_first_column (src, lex_source_next__ (src, n)) : 0;
1313 /* Returns the 1-based column number of the end of the syntax that represents
1314 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1317 Column numbers are measured according to the width of characters as shown in
1318 a typical fixed-width font, in which CJK characters have width 2 and
1319 combining characters have width 0.

   The column is computed by lex_token_get_last_column() on the token that
   lex_source_next__() returns for N; 0 is returned when LEXER has no current
   source.
   NOTE(review): the sentence above that begins with Returns 0 is cut off in
   this listing (the embedded numbering skips 1315-1316). */
1321 lex_get_last_column (const struct lexer *lexer, int n)
1323 const struct lex_source *src = lex_source__ (lexer);
1324 return src ? lex_token_get_last_column (src, lex_source_next__ (src, n)) : 0;
1327 /* Returns the name of the syntax file from which the current command is drawn.
1328 Returns NULL for a T_STOP token or if the command's source does not have
1331 There is no version of this function that takes an N argument because
1332 lookahead only works to the end of a command and any given command is always
1333 within a single syntax file.

   As written, NULL is returned when LEXER has no current source; otherwise
   the result is the file_name member of the reader of that source, returned
   without copying.
   NOTE(review): the second sentence above is cut off in this listing (the
   embedded numbering skips 1329-1330). */
1335 lex_get_file_name (const struct lexer *lexer)
1337 struct lex_source *src = lex_source__ (lexer);
1338 return src == NULL ? NULL : src->reader->file_name;
1341 /* Returns a newly allocated msg_location for the syntax that represents tokens
1342 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1343 must eventually free the location (with msg_location_destroy()).

   The location starts out as the result of lex_get_lines() for the same
   range; its first_column is then set from lex_get_first_column() for N0 and
   its last_column from lex_get_last_column() for N1.  The return statement
   itself is on a line missing from this listing. */
1344 struct msg_location *
1345 lex_get_location (const struct lexer *lexer, int n0, int n1)
1347 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1348 loc->first_column = lex_get_first_column (lexer, n0);
1349 loc->last_column = lex_get_last_column (lexer, n1);
1353 /* Returns a newly allocated msg_location for the syntax that represents tokens
1354 with 0-based offsets N0...N1, inclusive, from the current token. The
1355 location only covers the tokens' lines, not the columns. The caller must
1356 eventually free the location (with msg_location_destroy()).

   The structure comes from xmalloc().  Its file_name is a copy, made with
   xstrdup_if_nonnull(), of what lex_get_file_name() returns; first_line comes
   from lex_get_first_line_number() for N0 and last_line from
   lex_get_last_line_number() for N1.  Members that are not named in the
   initializer below are left zero by the compound literal. */
1357 struct msg_location *
1358 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1360 struct msg_location *loc = xmalloc (sizeof *loc);
1361 *loc = (struct msg_location) {
1362 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1363 .first_line = lex_get_first_line_number (lexer, n0),
1364 .last_line = lex_get_last_line_number (lexer, n1),
/* Returns the encoding member of the reader behind the current source of
   LEXER, or NULL if LEXER has no current source.  The string is returned
   without copying. */
1370 lex_get_encoding (const struct lexer *lexer)
1372 struct lex_source *src = lex_source__ (lexer);
1373 return src == NULL ? NULL : src->reader->encoding;
1376 /* Returns the syntax mode for the syntax file from which the current command is
1377 drawn. Returns SEG_MODE_AUTO if LEXER has no current source, which is the
1378 only case that the code below tests for.
1380 There is no version of this function that takes an N argument because
1381 lookahead only works to the end of a command and any given command is always
1382 within a single syntax file. */
1384 lex_get_syntax_mode (const struct lexer *lexer)
1386 struct lex_source *src = lex_source__ (lexer);
1387 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1390 /* Returns the error mode for the syntax file from which the current command is
1391 drawn. Returns LEX_ERROR_TERMINAL if LEXER has no current source, which is
1392 the only case that the code below tests for.
1394 There is no version of this function that takes an N argument because
1395 lookahead only works to the end of a command and any given command is always
1396 within a single syntax file. */
1398 lex_get_error_mode (const struct lexer *lexer)
1400 struct lex_source *src = lex_source__ (lexer);
1401 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1404 /* If the source that LEXER is currently reading has error mode
1405 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1406 token to be read comes directly from whatever is next read from the stream.
1408 It makes sense to call this function after encountering an error in a
1409 command entered on the console, because usually the user would prefer not to
1410 have cascading errors. */
1412 lex_interactive_reset (struct lexer *lexer)
1414 struct lex_source *src = lex_source__ (lexer);
1415 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Rewind every position that indexes the buffer and forget pending newline
   bookkeeping.  (Embedded lines 1416-1417 are missing from this listing.) */
1418 src->journal_pos = src->seg_pos = src->line_pos = 0;
1419 src->n_newlines = 0;
1420 src->suppress_next_newline = false;
/* Restart the segmenter in the mode it already had; the remaining argument of
   this call is on a line missing from this listing. */
1421 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Drop the tokens queued in all three stages, then seed the lookahead stage
   with a fresh T_ENDCMD token. */
1423 lex_stage_clear (&src->pp);
1424 lex_stage_clear (&src->merge);
1425 lex_stage_clear (&src->lookahead);
1426 lex_source_push_endcmd__ (src);
1430 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP.

   The loop re-tests lex_token() on every iteration; the loop body, which
   presumably advances to the next token, is on a line missing from this
   listing. */
1432 lex_discard_rest_of_command (struct lexer *lexer)
1434 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1438 /* Discards all lookahead tokens in LEXER, then discards all input sources
1439 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1440 runs out of input sources.

   Clearing covers the pp, merge and lookahead stages of the current source.
   The loop then destroys the current source and re-reads lex_source__()
   until that yields NULL or a source whose reader has error mode
   LEX_ERROR_TERMINAL.
   NOTE(review): embedded lines 1445-1447 are missing from this listing;
   presumably they guard the stage clearing against a NULL source. */
1442 lex_discard_noninteractive (struct lexer *lexer)
1444 struct lex_source *src = lex_source__ (lexer);
1448 lex_stage_clear (&src->pp);
1449 lex_stage_clear (&src->merge);
1450 lex_stage_clear (&src->lookahead);
1452 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1453 src = lex_source__ (lexer))
1454 lex_source_destroy (src);
/* Grows the buffer of SRC with x2realloc(), which also updates
   SRC->allocated, once SRC->length has reached SRC->allocated.  Does nothing
   while there is still unused room in the buffer. */
1459 lex_source_expand__ (struct lex_source *src)
1461 if (src->length >= src->allocated)
1462 src->buffer = x2realloc (src->buffer, &src->allocated);
/* Reads more input for SRC from its reader into the unused tail of the
   buffer of SRC.

   Each round first makes room with lex_source_expand__(), then calls the read
   callback of the reader class with the free SPACE and the prompt style that
   the segmenter currently asks for.  The callback must not report more than
   SPACE bytes (asserted).  The trailing while condition repeats the round
   until the not-yet-segmented part of the buffer, from seg_pos on, contains a
   newline.
   NOTE(review): several lines are missing from this listing (embedded numbers
   1467-1469, 1476, 1478-1481, 1483-1487), among them the start of the loop
   and the condition under which the reader is marked as being at end of
   file. */
1466 lex_source_read__ (struct lex_source *src)
1470 lex_source_expand__ (src);
1472 size_t space = src->allocated - src->length;
1473 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1474 size_t n = src->reader->class->read (src->reader,
1475 &src->buffer[src->length],
1477 assert (n <= space);
1482 src->reader->eof = true;
1488 while (!memchr (&src->buffer[src->seg_pos], '\n',
1489 src->length - src->seg_pos));
/* Returns the lex_source at the head of the sources list of LEXER, or NULL if
   that list is empty.  Other functions in this file treat the result as the
   source that LEXER is currently reading. */
1492 static struct lex_source *
1493 lex_source__ (const struct lexer *lexer)
1495 return (ll_is_empty (&lexer->sources) ? NULL
1496 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1499 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1500 one, through N1 ahead of the current one, inclusive. (For example, if N0
1501 and N1 are both zero, this requests the syntax for the current token.) The
1502 caller must eventually free the returned string (with free()). The syntax
1503 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1504 for example, it may include comments, spaces, and new-lines if it spans
1505 multiple tokens. Macro expansion, however, has already been performed.

   The range is processed in runs.  A run is either a sequence of tokens none
   of which has a macro_rep, or a sequence of tokens that share one macro_rep.
   Runs after the first are separated by a single space in the result.

   NOTE(review): the loop index I is a size_t while N1 is an int, so the test
   I <= N1 is evaluated in unsigned arithmetic.  A negative N1 would therefore
   not stop the loop the way a signed comparison would; confirm that callers
   never pass N1 below N0.
   NOTE(review): lines are missing from this listing inside the body (for
   example embedded numbers 1514, 1516, 1522-1523, 1535, 1540, 1543), including
   the end of the first inner comment. */
1507 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1509 struct string s = DS_EMPTY_INITIALIZER;
1510 for (size_t i = n0; i <= n1; )
1512 /* Find [I,J) as the longest sequence of tokens not produced by macro
1513 expansion, or otherwise the longest sequence expanded from a single
1515 const struct lex_token *first = lex_source_next__ (src, i);
1517 for (j = i + 1; j <= n1; j++)
1519 const struct lex_token *cur = lex_source_next__ (src, j);
1520 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1521 || first->macro_rep != cur->macro_rep)
1524 const struct lex_token *last = lex_source_next__ (src, j - 1);
1526 /* Now add the syntax for this sequence of tokens to SRC. */
1527 if (!ds_is_empty (&s))
1528 ds_put_byte (&s, ' ');
/* A run read directly from the source is copied out of the buffer of SRC. */
1529 if (!first->macro_rep)
1531 size_t start = first->token_pos;
1532 size_t end = last->token_pos + last->token_len;
1533 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
/* The lines below use the ofs and len members and copy from the macro_rep
   string instead; presumably this is the branch for a run that came from a
   macro expansion (the else keyword is not visible in this listing). */
1537 size_t start = first->ofs;
1538 size_t end = last->ofs + last->len;
1539 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1545 return ds_steal_cstr (&s);
/* Tests whether any of tokens N0...N1 (inclusive) in SRC, as returned by
   lex_source_next__(), has a nonnull macro_rep.  The values returned for the
   two outcomes are on lines missing from this listing; judging from the caller
   lex_source_get_macro_call(), a true result means that such a token
   exists. */
1549 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1551 for (size_t i = n0; i <= n1; i++)
1552 if (lex_source_next__ (src, i)->macro_rep)
1557 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1558 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1559 other tokens included in that range. The syntax is encoded in UTF-8 and in
1560 the original form supplied to the lexer so that, for example, it may include
1561 comments, spaces, and new-lines if it spans multiple tokens.
1563 Returns an empty string if the token range doesn't include a macro call.
1565 The caller must not modify or free the returned string.

   The returned substring points directly into the buffer of SRC.  It runs
   from the token_pos of token N0 to the end (token_pos plus token_len) of
   token MAX (N0, N1). */
1566 static struct substring
1567 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1569 if (!lex_source_contains_macro_call (src, n0, n1))
1572 const struct lex_token *token0 = lex_source_next__ (src, n0);
1573 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1574 size_t start = token0->token_pos;
1575 size_t end = token1->token_pos + token1->token_len;
1577 return ss_buffer (&src->buffer[start], end - start);
/* Composes a syntax error message about tokens N0...N1 in SRC and wraps it in
   a newly allocated struct msg.

   The text opens with one of several Syntax error prefixes: a fixed phrase
   when token N0 is T_ENDCMD, otherwise a phrase that may quote the offending
   syntax (from lex_source_get_syntax__(), shortened by str_ellipsize()) and
   the macro call it was expanded from (from lex_source_get_macro_call()).
   When text for FORMAT is appended, it follows a colon and a space and is
   formatted with ARGS; a period is added if the text does not already end
   with one.

   The message has category MSG_C_SYNTAX, severity MSG_S_ERROR and a location
   from lex_source_get_location() for N0 and N1.

   NOTE(review): many lines are missing from this listing, including the
   conditions that choose between the prefixes, the declarations of S and
   SYNTAX, and whatever is finally done with the message M.  The last inner
   comment below has also lost its closing delimiter. */
1581 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1582 const char *format, va_list args)
1584 const struct lex_token *token;
1589 token = lex_source_next__ (src, n0);
1590 if (token->token.type == T_ENDCMD)
1591 ds_put_cstr (&s, _("Syntax error at end of command"));
1594 /* Get the syntax that caused the error. */
1595 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1597 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1600 /* Get the macro call(s) that expanded to the syntax that caused the
1603 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1610 _("Syntax error at `%s' (in expansion of `%s')"),
1613 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1618 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1621 ds_put_cstr (&s, _("Syntax error"));
1627 ds_put_cstr (&s, ": ");
1628 ds_put_vformat (&s, format, args);
1630 if (ds_last (&s) != '.')
1631 ds_put_byte (&s, '.');
1633 struct msg *m = xmalloc (sizeof *m);
1635 .category = MSG_C_SYNTAX,
1636 .severity = MSG_S_ERROR,
1637 .location = lex_source_get_location (src, n0, n1),
1638 .text = ds_steal_cstr (&s),
/* Builds a syntax error message for TOKEN in SRC.  lex_source_try_get_pp()
   calls this when token_from_segment() reports TOKENIZE_ERROR.

   The text quotes the source bytes of TOKEN (shortened by str_ellipsize()),
   followed by a colon and the string held in TOKEN->token.string.string,
   which in this situation presumably carries the description of the error.
   The message has category MSG_C_SYNTAX, severity MSG_S_ERROR and a location
   that covers just TOKEN.

   NOTE(review): the declaration of SYNTAX and whatever is finally done with
   the message M are on lines missing from this listing. */
1644 lex_get_error (struct lex_source *src, const struct lex_token *token)
1647 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1648 syntax, sizeof syntax);
1650 struct string s = DS_EMPTY_INITIALIZER;
1651 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1652 ds_put_format (&s, ": %s", token->token.string.string);
1654 struct msg *m = xmalloc (sizeof *m);
1656 .category = MSG_C_SYNTAX,
1657 .severity = MSG_S_ERROR,
1658 .location = lex_token_location_rw (src, token, token),
1659 .text = ds_steal_cstr (&s),
1664 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1665 underlying lex_reader if necessary. Returns true if a new token was added
1666 to SRC's deque, false otherwise. The caller should retry failures unless
1667 SRC's 'eof' marker was set to true indicating that there will be no more
1668 tokens from this source.

   Besides producing the token, this function submits completed lines of
   syntax to the output engine as TEXT_ITEM_SYNTAX items, tracking how far it
   has got with SRC->journal_pos.

   NOTE(review): many short lines are missing from this listing (the embedded
   numbering has gaps), including the loop around segmenter_push(), the
   statements between several of the conditions below, and the switch header
   and return statements at the end. */
1670 lex_source_try_get_pp (struct lex_source *src)
1672 /* Append a new token to SRC and initialize it. */
1673 struct lex_token *token = xmalloc (sizeof *token);
1674 token->token = (struct token) { .type = T_STOP };
1675 token->macro_rep = NULL;
1676 token->ref_cnt = NULL;
1677 token->line_pos = src->line_pos;
1678 token->token_pos = src->seg_pos;
/* A reader with a positive line_number gives the token a line number that
   accounts for the newlines counted so far; the assignment of 0 below is
   presumably the alternative branch. */
1679 if (src->reader->line_number > 0)
1680 token->first_line = src->reader->line_number + src->n_newlines;
1682 token->first_line = 0;
1684 /* Extract a segment. */
1685 const char *segment;
1686 enum segment_type seg_type;
1690 segment = &src->buffer[src->seg_pos];
1691 seg_len = segmenter_push (&src->segmenter, segment,
1692 src->length - src->seg_pos,
1693 src->reader->eof, &seg_type);
1697 /* The segmenter needs more input to produce a segment. */
1698 assert (!src->reader->eof);
1699 lex_source_read__ (src);
1702 /* Update state based on the segment. */
1703 token->token_len = seg_len;
1704 src->seg_pos += seg_len;
1705 if (seg_type == SEG_NEWLINE)
1707 src->line_pos = src->seg_pos;
1711 /* Get a token from the segment. */
1712 enum tokenize_result result = token_from_segment (
1713 seg_type, ss_buffer (segment, seg_len), &token->token);
1715 /* If we've reached the end of a line, or the end of a command, then pass
1716 the line to the output engine as a syntax text item. */
1717 int n_lines = seg_type == SEG_NEWLINE;
1718 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1721 src->suppress_next_newline = true;
1723 else if (n_lines > 0 && src->suppress_next_newline)
1726 src->suppress_next_newline = false;
1728 for (int i = 0; i < n_lines; i++)
1730 /* Beginning of line. */
1731 const char *line = &src->buffer[src->journal_pos];
1733 /* Calculate line length, including \n or \r\n end-of-line if present.
1735 We use src->length even though that may be beyond what we've actually
1736 converted to tokens (which is only through line_pos). That's because,
1737 if we're emitting the line due to SEG_END_COMMAND, we want to take the
1738 whole line through the newline, not just through the '.'. */
1739 size_t max_len = src->length - src->journal_pos;
1740 const char *newline = memchr (line, '\n', max_len);
1741 size_t line_len = newline ? newline - line + 1 : max_len;
1743 /* Calculate line length excluding end-of-line. */
1744 size_t copy_len = line_len;
1745 if (copy_len > 0 && line[copy_len - 1] == '\n')
1747 if (copy_len > 0 && line[copy_len - 1] == '\r')
1750 /* Submit the line as syntax. */
1751 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1752 xmemdup0 (line, copy_len),
1755 src->journal_pos += line_len;
/* Dispatch on RESULT from token_from_segment().  An error is reported through
   lex_get_error(); an empty result destroys the token again; a real token is
   pushed onto the pp stage, with a T_STOP type first rewritten to
   T_ENDCMD. */
1760 case TOKENIZE_ERROR:
1761 lex_get_error (src, token);
1763 case TOKENIZE_EMPTY:
1764 lex_token_destroy (token);
1767 case TOKENIZE_TOKEN:
1768 if (token->token.type == T_STOP)
1770 token->token.type = T_ENDCMD;
1773 lex_stage_push_last (&src->pp, token);
1779 /* Attempts to append a new token to SRC. Returns true if successful, false on
1780 failure. On failure, the end of SRC has been reached and no more tokens
1781 will be forthcoming from it.
1783 Does not make the new token available for lookahead yet; the caller must
1784 adjust SRC's 'middle' pointer to do so.

   Each attempt is made with lex_source_try_get_pp().  The loop around the
   attempt and both return statements are on lines missing from this
   listing. */
1786 lex_source_get_pp (struct lex_source *src)
1789 if (lex_source_try_get_pp (src))
/* Moves tokens from the pp stage of SRC toward its merge stage, expanding a
   macro call found at the front of pp.  SRC_ is declared const, but the
   function immediately casts the qualifier away with CONST_CAST and modifies
   the source.

   Outline of what the visible code does:

   - If pp is empty, lex_source_get_pp() is asked for a token first.

   - If settings_get_mexpand() is false, everything in pp is shifted to merge
     with no macro processing.

   - Otherwise macro_call_create() is tried on the first token in pp.  While
     it reports 0, each further pp token (fetched on demand) is handed to
     macro_call_add() together with its source text and location.

   - On the path labelled False alarm below, the macro call is destroyed and
     only the first pp token is shifted to merge.

   - For a call spanning N_CALL tokens, macro_call_expand() produces the
     expansion and macro_tokens_to_syntax() renders it as text, which is also
     logged when settings_get_mprint() is true.  Every expansion token is then
     pushed onto merge as a new lex_token that carries the position of the
     whole call and the shared macro_rep string.  Finally the N_CALL call
     tokens are popped from pp.  This path returns whether the expansion was
     nonempty.

   NOTE(review): many lines are missing from this listing, including the
   return type, most early returns, the conditions that select the paths
   above, part of the lex_token initializer and the cleanup of OFS and LEN.
   Several inner comments have also lost their closing delimiters. */
1795 lex_source_try_get_merge (const struct lex_source *src_)
1797 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1799 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1802 if (!settings_get_mexpand ())
1804 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1808 /* Now pass tokens one-by-one to the macro expander.
1810 In the common case where there is no macro to expand, the loop is not
1812 struct macro_call *mc;
1813 int n_call = macro_call_create (src->lexer->macros,
1814 &lex_stage_first (&src->pp)->token, &mc);
1815 for (int ofs = 1; !n_call; ofs++)
1817 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1819 /* This should not be reachable because we always get a T_ENDCMD at
1820 the end of an input file (transformed from T_STOP by
1821 lex_source_try_get_pp()) and the macro_expander should always
1822 terminate expansion on T_ENDCMD. */
1826 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1827 size_t start = t->token_pos;
1828 size_t end = t->token_pos + t->token_len;
1829 const struct macro_token mt = {
1831 .syntax = ss_buffer (&src->buffer[start], end - start),
1833 const struct msg_location loc = lex_token_location (src, t, t);
1834 n_call = macro_call_add (mc, &mt, &loc);
1838 /* False alarm: no macro expansion after all. Use first token as
1839 lookahead. We'll retry macro expansion from the second token next
1841 macro_call_destroy (mc);
1842 lex_stage_shift (&src->merge, &src->pp, 1);
1846 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1847 are a macro call. (These are likely to be the only tokens in 'pp'.)
1849 const struct lex_token *c0 = lex_stage_first (&src->pp);
1850 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1851 struct macro_tokens expansion = { .n = 0 };
1852 struct msg_location loc = lex_token_location (src, c0, c1);
1853 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1854 macro_call_destroy (mc);
1856 /* Convert the macro expansion into syntax for possible error messages
1858 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1859 size_t *len = xnmalloc (expansion.n, sizeof *len);
1860 struct string s = DS_EMPTY_INITIALIZER;
1861 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1863 if (settings_get_mprint ())
1864 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1865 _("Macro Expansion")));
1867 /* Append the macro expansion tokens to the lookahead. */
1868 if (expansion.n > 0)
1870 char *macro_rep = ds_steal_cstr (&s);
1871 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1872 *ref_cnt = expansion.n;
1873 for (size_t i = 0; i < expansion.n; i++)
1875 struct lex_token *token = xmalloc (sizeof *token);
1876 *token = (struct lex_token) {
1877 .token = expansion.mts[i].token,
1878 .token_pos = c0->token_pos,
1879 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1880 .line_pos = c0->line_pos,
1881 .first_line = c0->first_line,
1882 .macro_rep = macro_rep,
1887 lex_stage_push_last (&src->merge, token);
1889 ss_dealloc (&expansion.mts[i].syntax);
1894 free (expansion.mts);
1898 /* Destroy the tokens for the call. */
1899 for (size_t i = 0; i < n_call; i++)
1900 lex_stage_pop_first (&src->pp);
1902 return expansion.n > 0;
1905 /* Attempts to obtain at least one new token into 'merge' in SRC.
1907 Returns true if successful, false on failure. In the latter case, SRC is
1908 exhausted and 'src->eof' is now true.

   Each attempt is made with lex_source_try_get_merge().  The loop around the
   attempt and both return statements are on lines missing from this
   listing. */
1910 lex_source_get_merge (struct lex_source *src)
1913 if (lex_source_try_get_merge (src))
1918 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1920 Returns true if successful, false on failure. In the latter case, SRC is
1921 exhausted and 'src->eof' is now true.

   Tokens in the merge stage are fed one at a time to merger_add().  Depending
   on its result, either the first merge token is shifted to lookahead
   unchanged, or a positive result RETVAL causes one combined token to be
   pushed onto lookahead, after which RETVAL tokens are popped from merge.

   NOTE(review): lines are missing from this listing, including the second
   argument of merger_add(), the condition that guards the plain shift, the
   token member of the combined token and all return statements. */
1923 lex_source_get_lookahead (struct lex_source *src)
1925 struct merger m = MERGER_INIT;
1927 for (size_t i = 0; ; i++)
1929 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1931 /* We always get a T_ENDCMD at the end of an input file
1932 (transformed from T_STOP by lex_source_try_get_pp()) and
1933 merger_add() should never return -1 on T_ENDCMD. */
1934 assert (lex_stage_is_empty (&src->merge));
1938 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1942 lex_stage_shift (&src->lookahead, &src->merge, 1);
1945 else if (retval > 0)
1947 /* Add a token that merges all the tokens together. */
1948 const struct lex_token *first = lex_stage_first (&src->merge);
1949 const struct lex_token *last = lex_stage_nth (&src->merge,
/* MACRO is true only if both ends of the merged range came from the same
   macro expansion. */
1951 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1952 struct lex_token *t = xmalloc (sizeof *t);
1953 *t = (struct lex_token) {
1955 .token_pos = first->token_pos,
1956 .token_len = (last->token_pos - first->token_pos) + last->token_len,
1957 .line_pos = first->line_pos,
1958 .first_line = first->first_line,
1960 /* This works well if all the tokens were not expanded from macros,
1961 or if they came from the same macro expansion. It just gives up
1962 in the other (corner) cases. */
1963 .macro_rep = macro ? first->macro_rep : NULL,
1964 .ofs = macro ? first->ofs : 0,
1965 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1966 .ref_cnt = macro ? first->ref_cnt : NULL,
1970 lex_stage_push_last (&src->lookahead, t);
1972 for (int i = 0; i < retval; i++)
1973 lex_stage_pop_first (&src->merge);
/* Asserts that the lookahead stage of SRC is empty, then pushes onto it a
   newly allocated lex_token whose only explicitly set member is the token
   type T_ENDCMD; every other member is zero from the compound literal.
   Called from lex_source_create() and lex_interactive_reset(). */
1980 lex_source_push_endcmd__ (struct lex_source *src)
1982 assert (lex_stage_is_empty (&src->lookahead));
1983 struct lex_token *token = xmalloc (sizeof *token);
1984 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1985 lex_stage_push_last (&src->lookahead, token);
/* Allocates a lex_source for LEXER that reads from READER.  Its segmenter is
   initialized in the syntax mode of READER, and its lookahead stage is seeded
   with a T_ENDCMD token by lex_source_push_endcmd__().
   NOTE(review): the other members of the initializer and the return statement
   are on lines missing from this listing. */
1988 static struct lex_source *
1989 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1991 struct lex_source *src = xmalloc (sizeof *src);
1992 *src = (struct lex_source) {
1994 .segmenter = segmenter_init (reader->syntax, false),
1998 lex_source_push_endcmd__ (src);
/* Tears down SRC: invokes the destroy callback of the reader class if there
   is one, uninitializes the pp, merge and lookahead stages, and unlinks SRC
   from the list that it is on.

   The file name and encoding pointers are saved before the reader is
   destroyed, presumably so that they can still be freed afterward; the
   statements that free them, the buffer and SRC itself are on lines missing
   from this listing. */
2004 lex_source_destroy (struct lex_source *src)
2006 char *file_name = src->reader->file_name;
2007 char *encoding = src->reader->encoding;
2008 if (src->reader->class->destroy != NULL)
2009 src->reader->class->destroy (src->reader);
2013 lex_stage_uninit (&src->pp);
2014 lex_stage_uninit (&src->merge);
2015 lex_stage_uninit (&src->lookahead);
2016 ll_remove (&src->ll);
/* A lex_reader that takes its input from a u8_istream.  READER is the
   embedded base object from which lex_file_reader_cast() recovers the
   enclosing structure; ISTREAM is the stream that lex_file_read() reads and
   lex_file_close() releases. */
2020 struct lex_file_reader
2022 struct lex_reader reader;
2023 struct u8_istream *istream;
/* Forward declaration; the definition appears after lex_file_close(). */
2026 static struct lex_reader_class lex_file_reader_class;
2028 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2029 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2030 ENCODING, which should take one of the forms accepted by
2031 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2032 mode of the new reader, respectively.
2034 Returns a null pointer if FILE_NAME cannot be opened.

   On failure to open, an error message that includes strerror (errno) is
   issued with msg().  The new reader keeps its own copies of FILE_NAME and,
   if nonnull, ENCODING, and starts counting lines at 1. */
2036 lex_reader_for_file (const char *file_name, const char *encoding,
2037 enum segmenter_mode syntax,
2038 enum lex_error_mode error)
2040 struct lex_file_reader *r;
2041 struct u8_istream *istream;
2043 istream = (!strcmp(file_name, "-")
2044 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2045 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2046 if (istream == NULL)
2048 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2052 r = xmalloc (sizeof *r);
2053 lex_reader_init (&r->reader, &lex_file_reader_class);
2054 r->reader.syntax = syntax;
2055 r->reader.error = error;
2056 r->reader.file_name = xstrdup (file_name);
2057 r->reader.encoding = xstrdup_if_nonnull (encoding);
2058 r->reader.line_number = 1;
2059 r->istream = istream;
/* Converts R, which must be the reader member embedded in a struct
   lex_file_reader, into a pointer to that enclosing structure. */
2064 static struct lex_file_reader *
2065 lex_file_reader_cast (struct lex_reader *r)
2067 return UP_CAST (r, struct lex_file_reader, reader);
/* Read function for a lex_file_reader: reads up to N bytes into BUF from the
   stream of the reader with u8_istream_read().  PROMPT_STYLE is unused.  A
   read error is reported with msg(), including strerror (errno).
   NOTE(review): the test that detects the error and the return statements are
   on lines missing from this listing. */
2071 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2072 enum prompt_style prompt_style UNUSED)
2074 struct lex_file_reader *r = lex_file_reader_cast (r_);
2075 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2078 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Close function for a lex_file_reader.  A stream that is not on
   STDIN_FILENO is closed with u8_istream_close(), and a failure to close is
   reported with msg().  u8_istream_free() appears further down, presumably as
   the branch for a stream on stdin, which releases the stream without
   closing the descriptor; the else keyword and any further cleanup are on
   lines missing from this listing. */
2085 lex_file_close (struct lex_reader *r_)
2087 struct lex_file_reader *r = lex_file_reader_cast (r_);
2089 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2091 if (u8_istream_close (r->istream) != 0)
2092 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2095 u8_istream_free (r->istream);
/* Definition of the reader class that lex_reader_for_file() passes to
   lex_reader_init().
   NOTE(review): the initializer is on lines missing from this listing;
   presumably it names lex_file_read() and lex_file_close() -- confirm. */
2100 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that takes its input from a string held in memory.  READER is
   the embedded base object from which lex_string_reader_cast() recovers the
   enclosing structure.
   NOTE(review): lex_string_read() also uses members named s and offset; their
   declarations are on lines missing from this listing. */
2106 struct lex_string_reader
2108 struct lex_reader reader;
/* Forward declaration; the definition appears after lex_string_close(). */
2113 static struct lex_reader_class lex_string_reader_class;
2115 /* Creates and returns a new lex_reader for the contents of S, which must be
2116 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2117 with ss_dealloc() when it is closed.

   The reader uses syntax mode SEG_MODE_AUTO and keeps its own copy of
   ENCODING if that is nonnull.
   NOTE(review): the statements that store S in the reader and return it are
   on lines missing from this listing. */
2119 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2121 struct lex_string_reader *r;
2123 r = xmalloc (sizeof *r);
2124 lex_reader_init (&r->reader, &lex_string_reader_class);
2125 r->reader.syntax = SEG_MODE_AUTO;
2126 r->reader.encoding = xstrdup_if_nonnull (encoding);
2133 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2134 which must be encoded in ENCODING. The caller retains ownership of S.

   The copy is made with ss_alloc_substring() and handed over to
   lex_reader_for_substring_nocopy(). */
2136 lex_reader_for_string (const char *s, const char *encoding)
2138 struct substring ss;
2139 ss_alloc_substring (&ss, ss_cstr (s));
2140 return lex_reader_for_substring_nocopy (ss, encoding);
2143 /* Formats FORMAT as a printf()-like format string and creates and returns a
2144 new lex_reader for the formatted result.

   The variable arguments that FORMAT consumes follow ENCODING, not FORMAT, in
   the parameter list.  The text is formatted with xvasprintf() and handed over
   to lex_reader_for_substring_nocopy() together with ENCODING.
   NOTE(review): the declaration of ARGS, the end of the argument processing
   and the return statement are on lines missing from this listing. */
2146 lex_reader_for_format (const char *format, const char *encoding, ...)
2148 struct lex_reader *r;
2151 va_start (args, encoding);
2152 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Converts R, which must be the reader member embedded in a struct
   lex_string_reader, into a pointer to that enclosing structure. */
2158 static struct lex_string_reader *
2159 lex_string_reader_cast (struct lex_reader *r)
2161 return UP_CAST (r, struct lex_string_reader, reader);
/* Read function for a lex_string_reader: copies into BUF the next CHUNK bytes
   of the string of the reader, where CHUNK is the smaller of N and the number
   of bytes that remain past the current offset.  PROMPT_STYLE is unused.
   NOTE(review): the declaration of CHUNK, the update of the offset and the
   return statement are on lines missing from this listing. */
2165 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2166 enum prompt_style prompt_style UNUSED)
2168 struct lex_string_reader *r = lex_string_reader_cast (r_);
2171 chunk = MIN (n, r->s.length - r->offset);
2172 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for a lex_string_reader.  Only the conversion of R_ to the
   enclosing lex_string_reader is visible here; the cleanup statements are on
   lines missing from this listing.  The comment on
   lex_reader_for_substring_nocopy() says that the string is freed with
   ss_dealloc() on close -- TODO confirm against the full source. */
2179 lex_string_close (struct lex_reader *r_)
2181 struct lex_string_reader *r = lex_string_reader_cast (r_);
2187 static struct lex_reader_class lex_string_reader_class =