1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/output-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* For a token obtained through the lexer in an ordinary way, this is the
65 location of the token in terms of the lex_source's buffer.
67 For a token produced through macro expansion, this is the entire macro
69 size_t token_pos; /* Offset into src->buffer of token start. */
70 size_t token_len; /* Length of source for token in bytes. */
71 size_t line_pos; /* Start of line containing token_pos. */
72 int first_line; /* Line number at token_pos. */
74 /* For a token obtained through macro expansion, this is just this token.
76 For a token obtained through the lexer in an ordinary way, these are
78 char *macro_rep; /* The whole macro expansion. */
79 size_t ofs; /* Offset of this token in macro_rep. */
80 size_t len; /* Length of this token in macro_rep. */
81 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
/* Destroys lex_token T.  The embedded struct token is uninitialized first.
   The assertion then checks that the counter shared by every lex_token that
   refers to the same macro expansion text (ref_cnt) is still positive.
   NOTE(review): the statements that guard the assertion, drop the reference,
   and release T itself are not visible in this excerpt -- confirm them
   against the full source. */
85 lex_token_destroy (struct lex_token *t)
87 token_uninit (&t->token);
90 assert (*t->ref_cnt > 0);
100 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
105 struct lex_token **tokens;
108 static void lex_stage_clear (struct lex_stage *);
109 static void lex_stage_uninit (struct lex_stage *);
111 static size_t lex_stage_count (const struct lex_stage *);
112 static bool lex_stage_is_empty (const struct lex_stage *);
114 static struct lex_token *lex_stage_last (struct lex_stage *);
115 static struct lex_token *lex_stage_first (struct lex_stage *);
116 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
118 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
119 static void lex_stage_pop_first (struct lex_stage *);
121 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
124 /* Deletes all the tokens from STAGE. */
126 lex_stage_clear (struct lex_stage *stage)
128 while (!deque_is_empty (&stage->deque))
129 lex_stage_pop_first (stage);
132 /* Deletes all the tokens from STAGE and frees storage for the deque. */
134 lex_stage_uninit (struct lex_stage *stage)
136 lex_stage_clear (stage);
137 free (stage->tokens);
140 /* Returns true if STAGE contains no tokens, otherwise false. */
142 lex_stage_is_empty (const struct lex_stage *stage)
144 return deque_is_empty (&stage->deque);
147 /* Returns the number of tokens in STAGE. */
149 lex_stage_count (const struct lex_stage *stage)
151 return deque_count (&stage->deque);
154 /* Returns the last token in STAGE, which must be nonempty. The last token is
155 the one accessed with the greatest lookahead. */
156 static struct lex_token *
157 lex_stage_last (struct lex_stage *stage)
159 return stage->tokens[deque_front (&stage->deque, 0)];
162 /* Returns the first token in STAGE, which must be nonempty.
163 The first token is the one accessed with the least lookahead. */
164 static struct lex_token *
165 lex_stage_first (struct lex_stage *stage)
167 return lex_stage_nth (stage, 0);
170 /* Returns the token at the given INDEX in STAGE. The first token (with the least
171 lookahead) is 0, the second token is 1, and so on. There must be at least
172 INDEX + 1 tokens in STAGE. */
173 static struct lex_token *
174 lex_stage_nth (struct lex_stage *stage, size_t index)
176 return stage->tokens[deque_back (&stage->deque, index)];
179 /* Adds TOKEN so that it becomes the last token in STAGE. */
181 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
183 if (deque_is_full (&stage->deque))
184 stage->tokens = deque_expand (&stage->deque, stage->tokens,
185 sizeof *stage->tokens);
186 stage->tokens[deque_push_front (&stage->deque)] = token;
189 /* Removes the first token from STAGE and uninitializes it. */
191 lex_stage_pop_first (struct lex_stage *stage)
193 lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
196 /* Removes the first N tokens from SRC, appending them to DST as the last
199 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
201 for (size_t i = 0; i < n; i++)
203 lex_stage_push_last (dst, lex_stage_first (src));
204 deque_pop_back (&src->deque);
208 /* A source of tokens, corresponding to a syntax file.
210 This is conceptually a lex_reader wrapped with everything needed to convert
211 its UTF-8 bytes into tokens. */
214 struct ll ll; /* In lexer's list of sources. */
215 struct lex_reader *reader;
217 struct segmenter segmenter;
218 bool eof; /* True if T_STOP was read from 'reader'. */
220 /* Buffer of UTF-8 bytes. */
221 char *buffer; /* Source file contents. */
222 size_t length; /* Number of bytes filled. */
223 size_t allocated; /* Number of bytes allocated. */
225 /* Offsets into 'buffer'. */
226 size_t journal_pos; /* First byte not yet output to journal. */
227 size_t seg_pos; /* First byte not yet scanned as token. */
228 size_t line_pos; /* First byte of line containing seg_pos. */
230 int n_newlines; /* Number of new-lines up to seg_pos. */
231 bool suppress_next_newline;
235 This is a pipeline with the following stages. Each token eventually
236 made available to the parser passes through each of these stages. The stages
237 are named after the processing that happens in each one.
239 Initially, tokens come from the segmenter and scanner to 'pp':
241 - pp: Tokens that need to pass through the macro preprocessor to end up
244 - merge: Tokens that need to pass through scan_merge() to end up in
247 - lookahead: Tokens available to the client for parsing. */
249 struct lex_stage merge;
250 struct lex_stage lookahead;
253 static struct lex_source *lex_source_create (struct lexer *,
254 struct lex_reader *);
255 static void lex_source_destroy (struct lex_source *);
260 struct ll_list sources; /* Contains "struct lex_source"s. */
261 struct macro_set *macros;
264 static struct lex_source *lex_source__ (const struct lexer *);
265 static char *lex_source_get_syntax__ (const struct lex_source *,
267 static const struct lex_token *lex_next__ (const struct lexer *, int n);
268 static void lex_source_push_endcmd__ (struct lex_source *);
270 static bool lex_source_get_lookahead (struct lex_source *);
271 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
272 const char *format, va_list)
273 PRINTF_FORMAT (4, 0);
274 static const struct lex_token *lex_source_next__ (const struct lex_source *,
277 /* Initializes READER with the specified CLASS and otherwise some reasonable
278 defaults. The caller should fill in the other members as desired. */
280 lex_reader_init (struct lex_reader *reader,
281 const struct lex_reader_class *class)
283 reader->class = class;
284 reader->syntax = SEG_MODE_AUTO;
285 reader->error = LEX_ERROR_CONTINUE;
286 reader->file_name = NULL;
287 reader->encoding = NULL;
288 reader->line_number = 0;
292 /* Frees any file name already in READER and replaces it by a copy of
293 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
295 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
297 free (reader->file_name);
298 reader->file_name = xstrdup_if_nonnull (file_name);
301 /* Creates and returns a new lexer. */
305 struct lexer *lexer = xmalloc (sizeof *lexer);
306 *lexer = (struct lexer) {
307 .sources = LL_INITIALIZER (lexer->sources),
308 .macros = macro_set_create (),
313 /* Destroys LEXER. */
315 lex_destroy (struct lexer *lexer)
319 struct lex_source *source, *next;
321 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
322 lex_source_destroy (source);
323 macro_set_destroy (lexer->macros);
328 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
329 same name. Takes ownership of M. */
331 lex_define_macro (struct lexer *lexer, struct macro *m)
333 macro_set_add (lexer->macros, m);
336 /* Inserts READER into LEXER so that the next token read by LEXER comes from
337 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
340 lex_include (struct lexer *lexer, struct lex_reader *reader)
342 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
343 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
346 /* Appends READER to LEXER, so that it will be read after all other current
347 readers have already been read. */
349 lex_append (struct lexer *lexer, struct lex_reader *reader)
351 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
356 /* Advances LEXER to the next token, consuming the current token. */
358 lex_get (struct lexer *lexer)
360 struct lex_source *src;
362 src = lex_source__ (lexer);
366 if (!lex_stage_is_empty (&src->lookahead))
367 lex_stage_pop_first (&src->lookahead);
369 while (lex_stage_is_empty (&src->lookahead))
370 if (!lex_source_get_lookahead (src))
372 lex_source_destroy (src);
373 src = lex_source__ (lexer);
379 /* Advances LEXER by N tokens. */
381 lex_get_n (struct lexer *lexer, size_t n)
387 /* Issuing errors. */
389 /* Prints a syntax error message containing the current token and the
390 message formatted from FORMAT and its arguments (if FORMAT is non-null). */
392 lex_error (struct lexer *lexer, const char *format, ...)
396 va_start (args, format);
397 lex_next_error_valist (lexer, 0, 0, format, args);
401 /* Prints a syntax error message containing the current token and the
402 message formatted from FORMAT and ARGS (if FORMAT is non-null). */
404 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
406 lex_next_error_valist (lexer, 0, 0, format, args);
409 /* Prints a syntax error message covering the tokens N0 through N1 ahead of
410 the current one and the message formatted from FORMAT (if non-null). */
412 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
416 va_start (args, format);
417 lex_next_error_valist (lexer, n0, n1, format, args);
421 /* Prints a syntax error message saying that one of the strings provided as
422 varargs, up to the first NULL, is expected. */
424 (lex_error_expecting) (struct lexer *lexer, ...)
428 va_start (args, lexer);
429 lex_error_expecting_valist (lexer, args);
433 /* Prints a syntax error message saying that one of the options provided in
434 ARGS, up to the first NULL, is expected. */
436 lex_error_expecting_valist (struct lexer *lexer, va_list args)
438 enum { MAX_OPTIONS = 9 };
439 const char *options[MAX_OPTIONS];
441 while (n < MAX_OPTIONS)
443 const char *option = va_arg (args, const char *);
447 options[n++] = option;
449 lex_error_expecting_array (lexer, options, n);
/* Reports a syntax error at the current token saying that one of the N
   strings in OPTIONS is expected.  The visible calls cover messages that list
   from one up to eight alternatives, each joined with commas and a final
   or.  The two calls that pass a null format report the error without any
   list of alternatives.
   NOTE(review): the statements that choose among these calls based on N are
   not visible in this excerpt; presumably the null-format calls handle N == 0
   and N greater than 8 -- confirm against the full source. */
453 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
458 lex_error (lexer, NULL);
462 lex_error (lexer, _("expecting %s"), options[0]);
466 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
470 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
475 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
476 options[0], options[1], options[2], options[3]);
480 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
481 options[0], options[1], options[2], options[3], options[4]);
485 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
486 options[0], options[1], options[2], options[3], options[4],
491 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
492 options[0], options[1], options[2], options[3], options[4],
493 options[5], options[6]);
497 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
498 options[0], options[1], options[2], options[3], options[4],
499 options[5], options[6], options[7]);
503 lex_error (lexer, NULL);
507 /* Reports an error to the effect that subcommand SBC may only be specified
510 This function does not take a lexer as an argument or use lex_error(),
511 because the result would ordinarily just be redundant: "Syntax error at
512 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
513 not help the user find the error. */
515 lex_sbc_only_once (const char *sbc)
517 msg (SE, _("Subcommand %s may only be specified once."), sbc);
520 /* Reports an error to the effect that subcommand SBC is missing.
522 This function does not take a lexer as an argument or use lex_error(),
523 because a missing subcommand can normally be detected only after the whole
524 command has been parsed, and so lex_error() would always report "Syntax
525 error at end of command", which does not help the user find the error. */
527 lex_sbc_missing (const char *sbc)
529 msg (SE, _("Required subcommand %s was not specified."), sbc);
532 /* Reports an error to the effect that specification SPEC may only be specified
533 once within subcommand SBC. */
535 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
537 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
541 /* Reports an error to the effect that specification SPEC is missing within
544 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
546 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
550 /* Prints a syntax error message covering the tokens N0 through N1 ahead of
551 the current one and the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
553 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
554 const char *format, va_list args)
556 struct lex_source *src = lex_source__ (lexer);
559 lex_source_error_valist (src, n0, n1, format, args);
565 ds_put_format (&s, _("Syntax error at end of input"));
568 ds_put_cstr (&s, ": ");
569 ds_put_vformat (&s, format, args);
571 if (ds_last (&s) != '.')
572 ds_put_byte (&s, '.');
573 msg (SE, "%s", ds_cstr (&s));
578 /* Checks that we're at end of command.
579 If so, returns a successful command completion code.
580 If not, flags a syntax error and returns an error command
583 lex_end_of_command (struct lexer *lexer)
585 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
587 lex_error (lexer, _("expecting end of command"));
594 /* Token testing functions. */
596 /* Returns true if the current token is a number. */
598 lex_is_number (const struct lexer *lexer)
600 return lex_next_is_number (lexer, 0);
603 /* Returns true if the current token is a string. */
605 lex_is_string (const struct lexer *lexer)
607 return lex_next_is_string (lexer, 0);
610 /* Returns the value of the current token, which must be a
611 floating point number. */
613 lex_number (const struct lexer *lexer)
615 return lex_next_number (lexer, 0);
618 /* Returns true iff the current token is an integer. */
620 lex_is_integer (const struct lexer *lexer)
622 return lex_next_is_integer (lexer, 0);
625 /* Returns the value of the current token, which must be an
628 lex_integer (const struct lexer *lexer)
630 return lex_next_integer (lexer, 0);
633 /* Token testing functions with lookahead.
635 A value of 0 for N as an argument to any of these functions refers to the
636 current token. Lookahead is limited to the current command. Any N greater
637 than the number of tokens remaining in the current command will be treated
638 as referring to a T_ENDCMD token. */
640 /* Returns true if the token N ahead of the current token is a number. */
642 lex_next_is_number (const struct lexer *lexer, int n)
644 return token_is_number (lex_next (lexer, n));
647 /* Returns true if the token N ahead of the current token is a string. */
649 lex_next_is_string (const struct lexer *lexer, int n)
651 return token_is_string (lex_next (lexer, n));
654 /* Returns the value of the token N ahead of the current token, which must be a
655 floating point number. */
657 lex_next_number (const struct lexer *lexer, int n)
659 return token_number (lex_next (lexer, n));
662 /* Returns true if the token N ahead of the current token is an integer. */
664 lex_next_is_integer (const struct lexer *lexer, int n)
666 return token_is_integer (lex_next (lexer, n));
669 /* Returns the value of the token N ahead of the current token, which must be
672 lex_next_integer (const struct lexer *lexer, int n)
674 return token_integer (lex_next (lexer, n));
677 /* Token matching functions. */
679 /* If the current token has the specified TYPE, skips it and returns true.
680 Otherwise, returns false. */
682 lex_match (struct lexer *lexer, enum token_type type)
684 if (lex_token (lexer) == type)
693 /* If the current token matches IDENTIFIER, skips it and returns true.
694 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
697 IDENTIFIER must be an ASCII string. */
699 lex_match_id (struct lexer *lexer, const char *identifier)
701 return lex_match_id_n (lexer, identifier, 3);
704 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
705 may be abbreviated to its first N letters. Otherwise, returns false.
707 IDENTIFIER must be an ASCII string. */
709 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
711 if (lex_token (lexer) == T_ID
712 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
721 /* If the current token is integer X, skips it and returns true. Otherwise,
724 lex_match_int (struct lexer *lexer, int x)
726 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
735 /* Forced matches. */
737 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
738 abbreviated to its first 3 letters. Otherwise, reports an error and returns
741 IDENTIFIER must be an ASCII string. */
743 lex_force_match_id (struct lexer *lexer, const char *identifier)
745 if (lex_match_id (lexer, identifier))
749 lex_error_expecting (lexer, identifier);
754 /* If the current token has the specified TYPE, skips it and returns true.
755 Otherwise, reports an error and returns false. */
757 lex_force_match (struct lexer *lexer, enum token_type type)
759 if (lex_token (lexer) == type)
766 const char *type_string = token_type_to_string (type);
769 char *s = xasprintf ("`%s'", type_string);
770 lex_error_expecting (lexer, s);
774 lex_error_expecting (lexer, token_type_to_name (type));
780 /* If the current token is a string, does nothing and returns true.
781 Otherwise, reports an error and returns false. */
783 lex_force_string (struct lexer *lexer)
785 if (lex_is_string (lexer))
789 lex_error (lexer, _("expecting string"));
794 /* If the current token is a string or an identifier, does nothing and returns
795 true. Otherwise, reports an error and returns false.
797 This is meant for use in syntactic situations where we want to encourage the
798 user to supply a quoted string, but for compatibility we also accept
799 identifiers. (One example of such a situation is file names.) Therefore,
800 the error message issued when the current token is wrong only says that a
801 string is expected and doesn't mention that an identifier would also be
804 lex_force_string_or_id (struct lexer *lexer)
806 return lex_token (lexer) == T_ID || lex_force_string (lexer);
809 /* If the current token is an integer, does nothing and returns true.
810 Otherwise, reports an error and returns false. */
812 lex_force_int (struct lexer *lexer)
814 if (lex_is_integer (lexer))
818 lex_error (lexer, _("expecting integer"));
823 /* If the current token is an integer in the range MIN...MAX (inclusive), does
824 nothing and returns true. Otherwise, reports an error and returns false.
825 If NAME is nonnull, then it is used in the error message. */
/* NOTE(review): MIN and MAX are long, but whether a bound is mentioned in the
   error message is decided by comparing it against INT_MIN / 2 and
   INT_MAX / 2 rather than the limits of long.  Presumably this is a
   deliberate heuristic: a bound of very large magnitude is treated as no
   practical limit and is left out of the message unless the value actually
   violated it (too_small or too_big) -- confirm.
   The messages for the MIN + 1 == MAX case print MIN and MIN + 1. */
827 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
829 bool is_integer = lex_is_integer (lexer);
/* The range tests below only apply when the current token is an integer. */
830 bool too_small = is_integer && lex_integer (lexer) < min;
831 bool too_big = is_integer && lex_integer (lexer) > max;
832 if (is_integer && !too_small && !too_big)
837 /* Weird, maybe a bug in the caller. Just report that we needed an
840 lex_error (lexer, _("Integer expected for %s."), name);
842 lex_error (lexer, _("Integer expected."));
847 lex_error (lexer, _("Expected %ld for %s."), min, name);
849 lex_error (lexer, _("Expected %ld."), min);
851 else if (min + 1 == max)
854 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
856 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
860 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
861 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
863 if (report_lower_bound && report_upper_bound)
867 _("Expected integer between %ld and %ld for %s."),
870 lex_error (lexer, _("Expected integer between %ld and %ld."),
873 else if (report_lower_bound)
878 lex_error (lexer, _("Expected non-negative integer for %s."),
881 lex_error (lexer, _("Expected non-negative integer."));
886 lex_error (lexer, _("Expected positive integer for %s."),
889 lex_error (lexer, _("Expected positive integer."));
892 else if (report_upper_bound)
896 _("Expected integer less than or equal to %ld for %s."),
899 lex_error (lexer, _("Expected integer less than or equal to %ld."),
905 lex_error (lexer, _("Integer expected for %s."), name);
907 lex_error (lexer, _("Integer expected."));
913 /* If the current token is a number, does nothing and returns true.
914 Otherwise, reports an error and returns false. */
916 lex_force_num (struct lexer *lexer)
918 if (lex_is_number (lexer))
921 lex_error (lexer, _("expecting number"));
925 /* If the current token is an identifier, does nothing and returns true.
926 Otherwise, reports an error and returns false. */
928 lex_force_id (struct lexer *lexer)
930 if (lex_token (lexer) == T_ID)
933 lex_error (lexer, _("expecting identifier"));
937 /* Token accessors. */
939 /* Returns the type of LEXER's current token. */
941 lex_token (const struct lexer *lexer)
943 return lex_next_token (lexer, 0);
946 /* Returns the number in LEXER's current token.
948 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
949 tokens this function will always return zero. */
951 lex_tokval (const struct lexer *lexer)
953 return lex_next_tokval (lexer, 0);
956 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
958 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
959 this function will always return NULL.
961 The UTF-8 encoding of the returned string is correct for variable names and
962 other identifiers. Use filename_to_utf8() to use it as a filename. Use
963 data_in() to use it in a "union value". */
965 lex_tokcstr (const struct lexer *lexer)
967 return lex_next_tokcstr (lexer, 0);
970 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
971 null-terminated (but the null terminator is not included in the returned
972 substring's 'length').
974 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
975 this function will always return NULL.
977 The UTF-8 encoding of the returned string is correct for variable names and
978 other identifiers. Use filename_to_utf8() to use it as a filename. Use
979 data_in() to use it in a "union value". */
981 lex_tokss (const struct lexer *lexer)
983 return lex_next_tokss (lexer, 0);
988 A value of 0 for N as an argument to any of these functions refers to the
989 current token. Lookahead is limited to the current command. Any N greater
990 than the number of tokens remaining in the current command will be treated
991 as referring to a T_ENDCMD token. */
/* Returns the lex_token N ahead of the current one, taken from the source
   that lex_source__() reports for LEXER and fetched with
   lex_source_next__().  The const qualifier on LEXER_ is cast away before
   the lookup.
   NOTE(review): the statements between the lex_source__() call and the
   return are not visible in this excerpt; presumably they handle the case
   where there is no source -- confirm against the full source. */
993 static const struct lex_token *
994 lex_next__ (const struct lexer *lexer_, int n)
996 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
997 struct lex_source *src = lex_source__ (lexer);
1000 return lex_source_next__ (src, n);
1003 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
/* Returns the token N ahead of the current one in the lookahead stage of
   SRC_.  While the stage holds N or fewer tokens, more are requested with
   lex_source_get_lookahead(), which is why the const qualifier on SRC_ is
   cast away.  Before each request, the last token already in the stage is
   checked for T_STOP or T_ENDCMD.
   NOTE(review): the statement executed when that check succeeds is not
   visible in this excerpt; presumably that token is returned so lookahead
   never runs past the end of the command -- confirm.
   NOTE(review): N is an int compared against a size_t count, so a negative N
   would convert to a very large unsigned value; callers presumably always
   pass N >= 0 -- confirm. */
1008 static const struct lex_token *
1009 lex_source_next__ (const struct lex_source *src_, int n)
1011 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1012 while (lex_stage_count (&src->lookahead) <= n)
1014 if (!lex_stage_is_empty (&src->lookahead))
/* The last token is the one with the greatest lookahead so far. */
1016 const struct lex_token *t = lex_stage_last (&src->lookahead);
1017 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1021 lex_source_get_lookahead (src);
1024 return lex_stage_nth (&src->lookahead, n);
1027 /* Returns the "struct token" of the token N after the current one in LEXER.
1028 The returned pointer can be invalidated by pretty much any succeeding call
1029 into the lexer, although the string pointer within the returned token is
1030 only invalidated by consuming the token (e.g. with lex_get()). */
1031 const struct token *
1032 lex_next (const struct lexer *lexer, int n)
1034 return &lex_next__ (lexer, n)->token;
1037 /* Returns the type of the token N after the current one in LEXER. */
1039 lex_next_token (const struct lexer *lexer, int n)
1041 return lex_next (lexer, n)->type;
1044 /* Returns the number in the token N after the current one in LEXER.
1046 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1047 tokens this function will always return zero. */
1049 lex_next_tokval (const struct lexer *lexer, int n)
1051 return token_number (lex_next (lexer, n));
1054 /* Returns the null-terminated string in the token N after the current one, in
1057 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1058 this function will always return NULL.
1060 The UTF-8 encoding of the returned string is correct for variable names and
1061 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1062 data_in() to use it in a "union value". */
1064 lex_next_tokcstr (const struct lexer *lexer, int n)
1066 return lex_next_tokss (lexer, n).string;
1069 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1070 The string is null-terminated (but the null terminator is not included in
1071 the returned substring's 'length').
1073 Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings. For other
1074 tokens this function will always return NULL.
1076 The UTF-8 encoding of the returned string is correct for variable names and
1077 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1078 data_in() to use it in a "union value". */
1080 lex_next_tokss (const struct lexer *lexer, int n)
1082 return lex_next (lexer, n)->string;
1085 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1086 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1087 are both zero, this requests the syntax for the current token.) The caller
1088 must eventually free the returned string (with free()). The syntax is
1089 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1090 example, it may include comments, spaces, and new-lines if it spans multiple
1091 tokens. Macro expansion, however, has already been performed. */
1093 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1095 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1098 /* Returns true if the token N ahead of the current one was produced by macro
1099 expansion, false otherwise. */
1101 lex_next_is_from_macro (const struct lexer *lexer, int n)
1103 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Tests whether token ACTUAL matches token EXPECTED.  The token types are
   compared first.  After that the comparison depends on the type of ACTUAL:
   one branch compares the numeric values for equality, one branch passes the
   two strings to lex_id_match() with the string of EXPECTED as the first
   argument, and one branch requires the two strings to have the same length
   and the same bytes.
   NOTE(review): the case labels and the remaining return statements are not
   visible in this excerpt, so which token types take which branch must be
   confirmed against the full source. */
1107 lex_tokens_match (const struct token *actual, const struct token *expected)
1109 if (actual->type != expected->type)
1112 switch (actual->type)
1116 return actual->number == expected->number;
1119 return lex_id_match (expected->string, actual->string);
/* Byte-for-byte comparison: equal lengths and equal contents. */
1122 return (actual->string.length == expected->string.length
1123 && !memcmp (actual->string.string, expected->string.string,
1124 actual->string.length));
/* Helper for lex_at_phrase() and lex_match_phrase().  Tokenizes the string S
   with a string_lexer and compares each token it yields, in order, against
   the tokens of LEXER at lookahead 0, 1, 2, and so on, using
   lex_tokens_match().  Each token taken from S is uninitialized after it has
   been compared.
   The callers treat the result as a token count: lex_at_phrase() tests
   whether it is greater than zero, and lex_match_phrase() passes it to
   lex_get_n().
   NOTE(review): the return statements are not visible in this excerpt;
   presumably the result is 0 on the first mismatch and the number of tokens
   in S otherwise -- confirm against the full source. */
1132 lex_at_phrase__ (struct lexer *lexer, const char *s)
1134 struct string_lexer slex;
1138 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1139 while (string_lexer_next (&slex, &token))
1141 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1142 token_uninit (&token);
1149 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1150 returns true. Otherwise, returns false.
1152 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1153 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1154 first three letters. */
1156 lex_at_phrase (struct lexer *lexer, const char *s)
1158 return lex_at_phrase__ (lexer, s) > 0;
1161 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1162 skips it and returns true. Otherwise, returns false.
1164 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1165 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1166 first three letters. */
1168 lex_match_phrase (struct lexer *lexer, const char *s)
1170 size_t n = lex_at_phrase__ (lexer, s);
1172 lex_get_n (lexer, n);
/* Scans the LENGTH bytes starting at S for new-line characters using
   memchr().  Each time one is found, LENGTH is reduced by the number of
   bytes up to and including that new-line.
   NOTE(review): the counter, the statement that advances S, and the return
   statement are not visible in this excerpt; presumably the function returns
   the number of new-lines found -- confirm against the full source. */
1177 count_newlines (char *s, size_t length)
1182 while ((newline = memchr (s, '\n', length)) != NULL)
1185 length -= (newline + 1) - s;
/* Returns the line number of the end of TOKEN, plus 1.  This is computed as
   the first line of TOKEN, plus the result of count_newlines() over the
   token_len bytes of the token text in the buffer of SRC, plus 1.
   A first_line of 0 is handled separately.
   NOTE(review): the statement for the first_line == 0 case is not visible in
   this excerpt; presumably it returns 0 for tokens without line numbers --
   confirm against the full source. */
1193 lex_token_get_last_line_number (const struct lex_source *src,
1194 const struct lex_token *token)
1196 if (token->first_line == 0)
/* The text of the token starts at offset token_pos in the source buffer. */
1200 char *token_str = &src->buffer[token->token_pos];
1201 return token->first_line + count_newlines (token_str, token->token_len) + 1;
/* Returns the 1-based column at which TOKEN starts.  The result is the value
   utf8_count_columns() reports for the bytes of the buffer of SRC from
   line_pos (start of the line containing the token) up to token_pos (start
   of the token), plus 1. */
1206 lex_token_get_first_column (const struct lex_source *src,
1207 const struct lex_token *token)
1209 return utf8_count_columns (&src->buffer[token->line_pos],
1210 token->token_pos - token->line_pos) + 1;
/* Returns the column just past the end of TOKEN: the value
   utf8_count_columns() reports for the text that precedes the end of the
   token on its final line, plus 1. */
1214 lex_token_get_last_column (const struct lex_source *src,
1215 const struct lex_token *token)
1217 char *start, *end, *newline;
/* Begin with the span from the start of the line on which the token starts
   to the byte just past the end of the token. */
1219 start = &src->buffer[token->line_pos];
1220 end = &src->buffer[token->token_pos + token->token_len];
/* If that span contains a new-line, measure only from the byte after the
   last one, so that the column is relative to the final line. */
1221 newline = memrchr (start, '\n', end - start);
1222 if (newline != NULL)
1223 start = newline + 1;
1224 return utf8_count_columns (start, end - start) + 1;
/* Builds and returns, by value, a msg_location that spans from token T0
   through token T1 in SRC.  The first line and first column come from T0,
   and the last line and last column come from T1.  The file_name member is
   set to the pointer held by the reader of SRC; this function does not copy
   the string. */
1227 static struct msg_location
1228 lex_token_location (const struct lex_source *src,
1229 const struct lex_token *t0,
1230 const struct lex_token *t1)
1232 return (struct msg_location) {
1233 .file_name = src->reader->file_name,
1234 .first_line = t0->first_line,
1235 .last_line = lex_token_get_last_line_number (src, t1),
1236 .first_column = lex_token_get_first_column (src, t0),
1237 .last_column = lex_token_get_last_column (src, t1),
/* Like lex_token_location(), but returns a pointer to a copy of the location
   produced by msg_location_dup() rather than a struct by value.
   NOTE(review): presumably the caller owns the returned copy and must
   release it; msg_location_dup() is defined elsewhere -- confirm. */
1241 static struct msg_location *
1242 lex_token_location_rw (const struct lex_source *src,
1243 const struct lex_token *t0,
1244 const struct lex_token *t1)
1246 struct msg_location location = lex_token_location (src, t0, t1);
1247 return msg_location_dup (&location);
/* Returns the location, as produced by lex_token_location_rw(), that spans
   from the token N0 ahead of the current one through the token N1 ahead of
   the current one in SRC.  Both tokens are looked up with
   lex_source_next__(). */
1250 static struct msg_location *
1251 lex_source_get_location (const struct lex_source *src, int n0, int n1)
1253 return lex_token_location_rw (src,
1254 lex_source_next__ (src, n0),
1255 lex_source_next__ (src, n1));
1258 /* Returns the 1-based line number of the start of the syntax that represents
1259 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1260 if the token is drawn from a source that does not have line numbers. */
1262 lex_get_first_line_number (const struct lexer *lexer, int n)
1264 const struct lex_source *src = lex_source__ (lexer);
1265 return src ? lex_source_next__ (src, n)->first_line : 0;
/* Returns the 1-based number of the line on which the syntax for the token N
   after the current one in LEXER ends, plus 1.  Returns 0 if LEXER has no
   current source or if the token carries no line number.

   A token usually lies within one line of syntax, with two exceptions: a
   T_STRING token can be built from several segments on adjacent lines joined
   by "+" punctuators, and a T_NEG_NUM token can consist of a "-" on one line
   followed by the number on the next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
/* Returns the 1-based column at which the syntax for the token N after the
   current one in LEXER begins.  Returns 0 if LEXER has no current source.

   Columns are counted by character width as displayed in a typical
   fixed-width font: CJK characters have width 2 and combining characters
   have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
/* Returns the 1-based column at which the syntax for the token N after the
   current one in LEXER ends, plus 1.  Returns 0 if LEXER has no current
   source.

   Columns are counted by character width as displayed in a typical
   fixed-width font: CJK characters have width 2 and combining characters
   have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1314 /* Returns the name of the syntax file from which the current command is drawn.
1315 Returns NULL for a T_STOP token or if the command's source does not have
1318 There is no version of this function that takes an N argument because
1319 lookahead only works to the end of a command and any given command is always
1320 within a single syntax file. */
1322 lex_get_file_name (const struct lexer *lexer)
1324 struct lex_source *src = lex_source__ (lexer);
1325 return src == NULL ? NULL : src->reader->file_name;
1328 /* Returns a newly allocated msg_location for the syntax that represents tokens
1329 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1330 must eventually free the location (with msg_location_destroy()). */
1331 struct msg_location *
1332 lex_get_location (const struct lexer *lexer, int n0, int n1)
1334 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1335 loc->first_column = lex_get_first_column (lexer, n0);
1336 loc->last_column = lex_get_last_column (lexer, n1);
1340 /* Returns a newly allocated msg_location for the syntax that represents tokens
1341 with 0-based offsets N0...N1, inclusive, from the current token. The
1342 location only covers the tokens' lines, not the columns. The caller must
1343 eventually free the location (with msg_location_destroy()). */
1344 struct msg_location *
1345 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1347 struct msg_location *loc = xmalloc (sizeof *loc);
1348 *loc = (struct msg_location) {
1349 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1350 .first_line = lex_get_first_line_number (lexer, n0),
1351 .last_line = lex_get_last_line_number (lexer, n1),
1357 lex_get_encoding (const struct lexer *lexer)
1359 struct lex_source *src = lex_source__ (lexer);
1360 return src == NULL ? NULL : src->reader->encoding;
1363 /* Returns the syntax mode for the syntax file from which the current drawn is
1364 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1365 does not have line numbers.
1367 There is no version of this function that takes an N argument because
1368 lookahead only works to the end of a command and any given command is always
1369 within a single syntax file. */
1371 lex_get_syntax_mode (const struct lexer *lexer)
1373 struct lex_source *src = lex_source__ (lexer);
1374 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1377 /* Returns the error mode for the syntax file from which the current drawn is
1378 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1379 source does not have line numbers.
1381 There is no version of this function that takes an N argument because
1382 lookahead only works to the end of a command and any given command is always
1383 within a single syntax file. */
1385 lex_get_error_mode (const struct lexer *lexer)
1387 struct lex_source *src = lex_source__ (lexer);
1388 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1391 /* If the source that LEXER is currently reading has error mode
1392 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1393 token to be read comes directly from whatever is next read from the stream.
1395 It makes sense to call this function after encountering an error in a
1396 command entered on the console, because usually the user would prefer not to
1397 have cascading errors. */
/* Does nothing when LEXER has no source or when the source has any other
   error mode. */
1399 lex_interactive_reset (struct lexer *lexer)
1401 struct lex_source *src = lex_source__ (lexer);
1402 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Rewind the journal, segment and line offsets to the start of the buffer and
   forget any pending new-line bookkeeping. */
1405 src->journal_pos = src->seg_pos = src->line_pos = 0;
1406 src->n_newlines = 0;
1407 src->suppress_next_newline = false;
/* Restart segmentation from scratch in the mode the segmenter already had. */
1408 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Empty all three token stages, then leave a single T_ENDCMD in the
   lookahead stage. */
1410 lex_stage_clear (&src->pp);
1411 lex_stage_clear (&src->merge);
1412 lex_stage_clear (&src->lookahead);
1413 lex_source_push_endcmd__ (src);
1417 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1419 lex_discard_rest_of_command (struct lexer *lexer)
/* The loop runs while the current token is neither T_STOP nor T_ENDCMD, so on
   exit the lexer is positioned on one of those two token types. */
1421 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1425 /* Discards all lookahead tokens in LEXER, then discards all input sources
1426 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1427 runs out of input sources. */
1429 lex_discard_noninteractive (struct lexer *lexer)
1431 struct lex_source *src = lex_source__ (lexer);
/* Drop every token buffered in the three stages of the current source. */
1435 lex_stage_clear (&src->pp);
1436 lex_stage_clear (&src->merge);
1437 lex_stage_clear (&src->lookahead);
/* lex_source_destroy() unlinks the source from the list of sources, so each
   call to lex_source__() in the loop yields the next remaining source. */
1439 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1440 src = lex_source__ (lexer))
1441 lex_source_destroy (src);
1446 lex_source_expand__ (struct lex_source *src)
1448 if (src->length >= src->allocated)
1449 src->buffer = x2realloc (src->buffer, &src->allocated);
/* Appends more input from SRC's reader to SRC's buffer.  Each pass makes sure
   the buffer has room, then asks the reader's read callback for up to as many
   bytes as are free.  Passes repeat until the part of the buffer that has not
   been segmented yet (from seg_pos onward) contains a new-line. */
1453 lex_source_read__ (struct lex_source *src)
1457 lex_source_expand__ (src);
/* Offer the reader all of the free space after the current contents.  The
   prompt style passed along comes from the segmenter's current state. */
1459 size_t space = src->allocated - src->length;
1460 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1461 size_t n = src->reader->class->read (src->reader,
1462 &src->buffer[src->length],
1464 assert (n <= space);
/* NOTE(review): the condition guarding the next two lines is not visible in
   this extract; presumably they run when the reader returned no data --
   confirm against the full file. */
1469 src->reader->eof = true;
1470 lex_source_expand__ (src);
1476 while (!memchr (&src->buffer[src->seg_pos], '\n',
1477 src->length - src->seg_pos));
1480 static struct lex_source *
1481 lex_source__ (const struct lexer *lexer)
1483 return (ll_is_empty (&lexer->sources) ? NULL
1484 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1487 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1488 one, through N1 ahead of the current one, inclusive. (For example, if N0
1489 and N1 are both zero, this requests the syntax for the current token.) The
1490 caller must eventually free the returned string (with free()). The syntax
1491 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1492 for example, it may include comments, spaces, and new-lines if it spans
1493 multiple tokens. Macro expansion, however, has already been performed. */
1495 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1497 struct string s = DS_EMPTY_INITIALIZER;
1498 for (size_t i = n0; i <= n1; )
1500 /* Find [I,J) as the longest sequence of tokens not produced by macro
1501 expansion, or otherwise the longest sequence expanded from a single
1503 const struct lex_token *first = lex_source_next__ (src, i);
1505 for (j = i + 1; j <= n1; j++)
1507 const struct lex_token *cur = lex_source_next__ (src, j);
1508 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1509 || first->macro_rep != cur->macro_rep)
1512 const struct lex_token *last = lex_source_next__ (src, j - 1);
1514 /* Now add the syntax for this sequence of tokens to SRC. */
1515 if (!ds_is_empty (&s))
1516 ds_put_byte (&s, ' ');
1517 if (!first->macro_rep)
1519 size_t start = first->token_pos;
1520 size_t end = last->token_pos + last->token_len;
1521 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1525 size_t start = first->ofs;
1526 size_t end = last->ofs + last->len;
1527 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1533 return ds_steal_cstr (&s);
1537 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1539 for (size_t i = n0; i <= n1; i++)
1540 if (lex_source_next__ (src, i)->macro_rep)
1545 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1546 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1547 other tokens included in that range. The syntax is encoded in UTF-8 and in
1548 the original form supplied to the lexer so that, for example, it may include
1549 comments, spaces, and new-lines if it spans multiple tokens.
1551 Returns an empty string if the token range doesn't include a macro call.
1553 The caller must not modify or free the returned string. */
1554 static struct substring
1555 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1557 if (!lex_source_contains_macro_call (src, n0, n1))
1560 const struct lex_token *token0 = lex_source_next__ (src, n0);
1561 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1562 size_t start = token0->token_pos;
1563 size_t end = token1->token_pos + token1->token_len;
1565 return ss_buffer (&src->buffer[start], end - start);
/* Composes a syntax error message about tokens N0 through N1 ahead of the
   current one in SRC and wraps it in a newly allocated struct msg with
   category MSG_C_SYNTAX, severity MSG_S_ERROR, and a location that covers
   those tokens.

   The message starts with a "Syntax error ..." prefix chosen from the
   variants below (end of command, offending syntax, enclosing macro call).
   FORMAT and ARGS are then formatted printf-style and appended after ": ",
   and a period is added if the text does not already end with one.

   NOTE(review): several lines of this function are missing from this
   extract, including the conditions that select between the prefixes and
   whatever is done with the message once it is built. */
1569 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1570 const char *format, va_list args)
1572 const struct lex_token *token;
1577 token = lex_source_next__ (src, n0);
1578 if (token->token.type == T_ENDCMD)
1579 ds_put_cstr (&s, _("Syntax error at end of command"));
1582 /* Get the syntax that caused the error. */
1583 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1585 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1588 /* Get the macro call(s) that expanded to the syntax that caused the
1591 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1598 _("Syntax error at `%s' (in expansion of `%s')"),
1601 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1606 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1609 ds_put_cstr (&s, _("Syntax error"));
1615 ds_put_cstr (&s, ": ");
1616 ds_put_vformat (&s, format, args);
1618 if (ds_last (&s) != '.')
1619 ds_put_byte (&s, '.');
1621 struct msg *m = xmalloc (sizeof *m);
1623 .category = MSG_C_SYNTAX,
1624 .severity = MSG_S_ERROR,
1625 .location = lex_source_get_location (src, n0, n1),
1626 .text = ds_steal_cstr (&s),
/* Composes a syntax error message for TOKEN, a token in SRC: the text reads
   "Syntax error at `SYNTAX': DETAIL", where SYNTAX is the source text of the
   token, shortened by str_ellipsize() to fit a fixed-size buffer, and DETAIL
   is the string held in the token itself.  The message is wrapped in a newly
   allocated struct msg with category MSG_C_SYNTAX, severity MSG_S_ERROR, and
   a location covering just TOKEN.

   NOTE(review): presumably the token's string holds the tokenizer's error
   text and the message is then emitted; neither is visible in this
   extract. */
1632 lex_get_error (struct lex_source *src, const struct lex_token *token)
1635 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1636 syntax, sizeof syntax);
1638 struct string s = DS_EMPTY_INITIALIZER;
1639 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1640 ds_put_format (&s, ": %s", token->token.string.string);
1642 struct msg *m = xmalloc (sizeof *m);
1644 .category = MSG_C_SYNTAX,
1645 .severity = MSG_S_ERROR,
1646 .location = lex_token_location_rw (src, token, token),
1647 .text = ds_steal_cstr (&s),
1652 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1653 underlying lex_reader if necessary. Returns true if a new token was added
1654 to SRC's deque, false otherwise. The caller should retry failures unless
1655 SRC's 'eof' marker was set to true indicating that there will be no more
1656 tokens from this source. */
1658 lex_source_try_get_pp (struct lex_source *src)
1660 /* Append a new token to SRC and initialize it. */
1661 struct lex_token *token = xmalloc (sizeof *token);
1662 token->token = (struct token) { .type = T_STOP };
1663 token->macro_rep = NULL;
1664 token->ref_cnt = NULL;
/* The token starts at the current segmentation offset.  line_pos records
   where its line begins so that column numbers can be computed later. */
1665 token->line_pos = src->line_pos;
1666 token->token_pos = src->seg_pos;
/* Line numbers are tracked only for readers with a positive line_number;
   otherwise the token gets first_line 0. */
1667 if (src->reader->line_number > 0)
1668 token->first_line = src->reader->line_number + src->n_newlines;
1670 token->first_line = 0;
1672 /* Extract a segment. */
1673 const char *segment;
1674 enum segment_type seg_type;
1678 segment = &src->buffer[src->seg_pos];
1679 seg_len = segmenter_push (&src->segmenter, segment,
1680 src->length - src->seg_pos,
1681 src->reader->eof, &seg_type);
1685 /* The segmenter needs more input to produce a segment. */
1686 assert (!src->reader->eof);
1687 lex_source_read__ (src);
1690 /* Update state based on the segment. */
1691 token->token_len = seg_len;
1692 src->seg_pos += seg_len;
1693 if (seg_type == SEG_NEWLINE)
/* After a new-line segment, the next line begins at the new segmentation
   offset. */
1695 src->line_pos = src->seg_pos;
1699 /* Get a token from the segment. */
1700 enum tokenize_result result = token_from_segment (
1701 seg_type, ss_buffer (segment, seg_len), &token->token);
1703 /* If we've reached the end of a line, or the end of a command, then pass
1704 the line to the output engine as a syntax text item. */
/* n_lines is the number of lines that the loop below submits; a SEG_NEWLINE
   segment contributes one.  suppress_next_newline links a SEG_END_COMMAND
   segment to the new-line segment that follows it. */
1705 int n_lines = seg_type == SEG_NEWLINE;
1706 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1709 src->suppress_next_newline = true;
1711 else if (n_lines > 0 && src->suppress_next_newline)
1714 src->suppress_next_newline = false;
1716 for (int i = 0; i < n_lines; i++)
1718 /* Beginning of line. */
1719 const char *line = &src->buffer[src->journal_pos];
1721 /* Calculate line length, including \n or \r\n end-of-line if present.
1723 We use src->length even though that may be beyond what we've actually
1724 converted to tokens (which is only through line_pos). That's because,
1725 if we're emitting the line due to SEG_END_COMMAND, we want to take the
1726 whole line through the newline, not just through the '.'. */
1727 size_t max_len = src->length - src->journal_pos;
1728 const char *newline = memchr (line, '\n', max_len);
1729 size_t line_len = newline ? newline - line + 1 : max_len;
1731 /* Calculate line length excluding end-of-line. */
1732 size_t copy_len = line_len;
1733 if (copy_len > 0 && line[copy_len - 1] == '\n')
1735 if (copy_len > 0 && line[copy_len - 1] == '\r')
1738 /* Submit the line as syntax. */
1739 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1740 xmemdup0 (line, copy_len),
/* Advance the journal offset past the whole line, end-of-line included. */
1743 src->journal_pos += line_len;
/* Dispose of the token according to the tokenizer's verdict. */
1748 case TOKENIZE_ERROR:
1749 lex_get_error (src, token);
1751 case TOKENIZE_EMPTY:
1752 lex_token_destroy (token);
1755 case TOKENIZE_TOKEN:
/* A T_STOP from the tokenizer is queued as T_ENDCMD instead. */
1756 if (token->token.type == T_STOP)
1758 token->token.type = T_ENDCMD;
1761 lex_stage_push_last (&src->pp, token);
1767 /* Attempts to append a new token to SRC. Returns true if successful, false on
1768 failure. On failure, the end of SRC has been reached and no more tokens
1769 will be forthcoming from it.
1771 Does not make the new token available for lookahead yet; the caller must
1772 adjust SRC's 'middle' pointer to do so. */
/* Built on lex_source_try_get_pp(), which can fail without SRC being
   exhausted.  NOTE(review): the retry loop and the return statements are not
   visible in this extract. */
1774 lex_source_get_pp (struct lex_source *src)
1777 if (lex_source_try_get_pp (src))
/* Attempts to move tokens from the 'pp' stage of SRC_ to its 'merge' stage,
   expanding a macro call on the way if the tokens at the front of 'pp' form
   one.

   With macro expansion disabled (settings_get_mexpand() is false), every
   token in 'pp' is shifted to 'merge' unchanged.  Otherwise tokens are fed to
   the macro call machinery one at a time.  If they turn out not to be a
   macro call, only the first token is shifted to 'merge'.  If they are, the
   call is expanded, each token of the expansion is pushed onto 'merge'
   carrying the source position of the whole call plus a shared copy of the
   expansion text, and the tokens of the call are popped from 'pp'.  On that
   last path the return value is whether the expansion produced any token.

   SRC_ is declared const but is modified through a cast. */
1783 lex_source_try_get_merge (const struct lex_source *src_)
1785 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Make sure 'pp' holds at least one token to work with. */
1787 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1790 if (!settings_get_mexpand ())
1792 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1796 /* Now pass tokens one-by-one to the macro expander.
1798 In the common case where there is no macro to expand, the loop is not
1800 struct macro_call *mc;
1801 int n_call = macro_call_create (src->lexer->macros,
1802 &lex_stage_first (&src->pp)->token, &mc);
1803 for (int ofs = 1; !n_call; ofs++)
1805 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1807 /* This should not be reachable because we always get a T_ENDCMD at
1808 the end of an input file (transformed from T_STOP by
1809 lex_source_try_get_pp()) and the macro_expander should always
1810 terminate expansion on T_ENDCMD. */
1814 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1815 size_t start = t->token_pos;
1816 size_t end = t->token_pos + t->token_len;
1817 const struct macro_token mt = {
1819 .syntax = ss_buffer (&src->buffer[start], end - start),
1821 const struct msg_location loc = lex_token_location (src, t, t);
1822 n_call = macro_call_add (mc, &mt, &loc);
1826 /* False alarm: no macro expansion after all. Use first token as
1827 lookahead. We'll retry macro expansion from the second token next
1829 macro_call_destroy (mc);
1830 lex_stage_shift (&src->merge, &src->pp, 1);
1834 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1835 are a macro call. (These are likely to be the only tokens in 'pp'.)
1837 const struct lex_token *c0 = lex_stage_first (&src->pp);
1838 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1839 struct macro_tokens expansion = { .n = 0 };
1840 struct msg_location loc = lex_token_location (src, c0, c1);
1841 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1842 macro_call_destroy (mc);
1844 /* Convert the macro expansion into syntax for possible error messages
1846 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1847 size_t *len = xnmalloc (expansion.n, sizeof *len);
1848 struct string s = DS_EMPTY_INITIALIZER;
1849 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1851 if (settings_get_mprint ())
1852 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1853 _("Macro Expansion")));
1855 /* Append the macro expansion tokens to the lookahead. */
1856 if (expansion.n > 0)
1858 char *macro_rep = ds_steal_cstr (&s);
1859 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1860 *ref_cnt = expansion.n;
1861 for (size_t i = 0; i < expansion.n; i++)
1863 struct lex_token *token = xmalloc (sizeof *token);
1864 *token = (struct lex_token) {
1865 .token = expansion.mts[i].token,
1866 .token_pos = c0->token_pos,
1867 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1868 .line_pos = c0->line_pos,
1869 .first_line = c0->first_line,
1870 .macro_rep = macro_rep,
1875 lex_stage_push_last (&src->merge, token);
1877 ss_dealloc (&expansion.mts[i].syntax);
1882 free (expansion.mts);
1886 /* Destroy the tokens for the call. */
1887 for (size_t i = 0; i < n_call; i++)
1888 lex_stage_pop_first (&src->pp);
1890 return expansion.n > 0;
1893 /* Attempts to obtain at least one new token into 'merge' in SRC.
1895 Returns true if successful, false on failure. In the latter case, SRC is
1896 exhausted and 'src->eof' is now true. */
/* Built on lex_source_try_get_merge(), which can come back without adding a
   token (for example, when a macro call expands to nothing).
   NOTE(review): the retry loop and the return statements are not visible in
   this extract. */
1898 lex_source_get_merge (struct lex_source *src)
1901 if (lex_source_try_get_merge (src))
1906 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1908 Returns true if successful, false on failure. In the latter case, SRC is
1909 exhausted and 'src->eof' is now true. */
1911 lex_source_get_lookahead (struct lex_source *src)
1913 struct merger m = MERGER_INIT;
/* Feed the tokens in 'merge' to the merger one at a time, starting from the
   first, fetching more tokens into 'merge' whenever index I is not there
   yet. */
1915 for (size_t i = 0; ; i++)
1917 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1919 /* We always get a T_ENDCMD at the end of an input file
1920 (transformed from T_STOP by lex_source_try_get_pp()) and
1921 merger_add() should never return -1 on T_ENDCMD. */
1922 assert (lex_stage_is_empty (&src->merge));
1926 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1930 lex_stage_shift (&src->lookahead, &src->merge, 1);
1933 else if (retval > 0)
1935 /* Add a token that merges all the tokens together. */
1936 const struct lex_token *first = lex_stage_first (&src->merge);
1937 const struct lex_token *last = lex_stage_nth (&src->merge,
/* The merged token keeps macro information only if its first and last
   constituent tokens come from the same macro expansion. */
1939 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1940 struct lex_token *t = xmalloc (sizeof *t);
1941 *t = (struct lex_token) {
1943 .token_pos = first->token_pos,
1944 .token_len = (last->token_pos - first->token_pos) + last->token_len,
1945 .line_pos = first->line_pos,
1946 .first_line = first->first_line,
1948 /* This works well if all the tokens were not expanded from macros,
1949 or if they came from the same macro expansion. It just gives up
1950 in the other (corner) cases. */
1951 .macro_rep = macro ? first->macro_rep : NULL,
1952 .ofs = macro ? first->ofs : 0,
1953 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1954 .ref_cnt = macro ? first->ref_cnt : NULL,
1958 lex_stage_push_last (&src->lookahead, t);
/* The RETVAL tokens that were merged are no longer needed in 'merge'. */
1960 for (int i = 0; i < retval; i++)
1961 lex_stage_pop_first (&src->merge);
1968 lex_source_push_endcmd__ (struct lex_source *src)
1970 assert (lex_stage_is_empty (&src->lookahead));
1971 struct lex_token *token = xmalloc (sizeof *token);
1972 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1973 lex_stage_push_last (&src->lookahead, token);
/* Allocates a new lex_source for LEXER and READER.  Its segmenter starts out
   in READER's syntax mode, and its lookahead stage is seeded with a single
   T_ENDCMD token.

   NOTE(review): the remaining initializers and the return statement are not
   visible in this extract. */
1976 static struct lex_source *
1977 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1979 struct lex_source *src = xmalloc (sizeof *src);
1980 *src = (struct lex_source) {
1982 .segmenter = segmenter_init (reader->syntax, false),
1986 lex_source_push_endcmd__ (src);
/* Tears down SRC: invokes the reader class's destroy callback, if it has one,
   uninitializes the three token stages, and unlinks SRC from the list that it
   is on. */
1992 lex_source_destroy (struct lex_source *src)
/* Save these pointers first, because the reader may no longer exist once its
   destroy callback has run.  NOTE(review): presumably they are freed on lines
   not visible in this extract. */
1994 char *file_name = src->reader->file_name;
1995 char *encoding = src->reader->encoding;
1996 if (src->reader->class->destroy != NULL)
1997 src->reader->class->destroy (src->reader);
2001 lex_stage_uninit (&src->pp);
2002 lex_stage_uninit (&src->merge);
2003 lex_stage_uninit (&src->lookahead);
2004 ll_remove (&src->ll);
2008 struct lex_file_reader
2010 struct lex_reader reader;
2011 struct u8_istream *istream;
2014 static struct lex_reader_class lex_file_reader_class;
2016 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2017 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2018 ENCODING, which should take one of the forms accepted by
2019 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2020 mode of the new reader, respectively.
2022 Returns a null pointer if FILE_NAME cannot be opened. */
2024 lex_reader_for_file (const char *file_name, const char *encoding,
2025 enum segmenter_mode syntax,
2026 enum lex_error_mode error)
2028 struct lex_file_reader *r;
2029 struct u8_istream *istream;
2031 istream = (!strcmp(file_name, "-")
2032 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2033 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2034 if (istream == NULL)
2036 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2040 r = xmalloc (sizeof *r);
2041 lex_reader_init (&r->reader, &lex_file_reader_class);
2042 r->reader.syntax = syntax;
2043 r->reader.error = error;
2044 r->reader.file_name = xstrdup (file_name);
2045 r->reader.encoding = xstrdup_if_nonnull (encoding);
2046 r->reader.line_number = 1;
2047 r->istream = istream;
2052 static struct lex_file_reader *
2053 lex_file_reader_cast (struct lex_reader *r)
2055 return UP_CAST (r, struct lex_file_reader, reader);
/* Read callback for lex_file_reader: asks the underlying u8_istream for up to
   N bytes into BUF.  PROMPT_STYLE is unused.  A read failure is reported with
   an error message that names the file.

   NOTE(review): the failure test and the return statements are not visible in
   this extract. */
2059 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2060 enum prompt_style prompt_style UNUSED)
2062 struct lex_file_reader *r = lex_file_reader_cast (r_);
2063 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2066 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Destroy callback for lex_file_reader.  A stream on any descriptor other
   than stdin is closed, with an error message if closing fails.

   NOTE(review): presumably the u8_istream_free() call below is the branch
   taken for stdin, so that stdin itself stays open; the line joining the two
   branches is not visible in this extract. */
2073 lex_file_close (struct lex_reader *r_)
2075 struct lex_file_reader *r = lex_file_reader_cast (r_);
2077 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2079 if (u8_istream_close (r->istream) != 0)
2080 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2083 u8_istream_free (r->istream);
/* Reader class for lex_file_reader, the one that lex_reader_for_file() passes
   to lex_reader_init().  NOTE(review): the initializer is not visible in this
   extract. */
2088 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that draws its input from a string held in memory.  Besides
   the generic reader part below, lex_string_read() uses members 's' (the
   string) and 'offset' (how much of it has been handed out so far), which are
   not visible in this extract. */
2094 struct lex_string_reader
2096 struct lex_reader reader;
/* Callbacks for lex_string_reader, defined below. */
2101 static struct lex_reader_class lex_string_reader_class;
2103 /* Creates and returns a new lex_reader for the contents of S, which must be
2104 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2105 with ss_dealloc() when it is closed. */
/* The reader uses syntax mode SEG_MODE_AUTO and keeps its own copy of
   ENCODING (or a null pointer, if ENCODING is null). */
2107 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2109 struct lex_string_reader *r;
2111 r = xmalloc (sizeof *r);
2112 lex_reader_init (&r->reader, &lex_string_reader_class);
2113 r->reader.syntax = SEG_MODE_AUTO;
2114 r->reader.encoding = xstrdup_if_nonnull (encoding);
2121 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2122 which must be encoded in ENCODING. The caller retains ownership of S. */
2124 lex_reader_for_string (const char *s, const char *encoding)
2126 struct substring ss;
2127 ss_alloc_substring (&ss, ss_cstr (s));
2128 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string and creates and returns a
   new lex_reader, with encoding ENCODING, for the formatted result.  The
   arguments for FORMAT come after ENCODING in the argument list. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The reader takes over ownership of the formatted text. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2146 static struct lex_string_reader *
2147 lex_string_reader_cast (struct lex_reader *r)
2149 return UP_CAST (r, struct lex_string_reader, reader);
/* Read callback for lex_string_reader: copies into BUF the next bytes of the
   string, starting at the current offset.  At most N bytes are copied, and
   never more than remain in the string.  PROMPT_STYLE is unused.

   NOTE(review): presumably the offset is then advanced and the number of
   bytes copied is returned; those lines are not visible in this extract. */
2153 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2154 enum prompt_style prompt_style UNUSED)
2156 struct lex_string_reader *r = lex_string_reader_cast (r_);
2159 chunk = MIN (n, r->s.length - r->offset);
2160 memcpy (buf, r->s.string + r->offset, chunk);
/* Destroy callback for lex_string_reader.  NOTE(review): the statements that
   release the string and the reader are not visible in this extract. */
2167 lex_string_close (struct lex_reader *r_)
2169 struct lex_string_reader *r = lex_string_reader_cast (r_);
2175 static struct lex_reader_class lex_string_reader_class =