1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/output-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* For a token obtained through the lexer in an ordinary way, this is the
65 location of the token in terms of the lex_source's buffer.
67 For a token produced through macro expansion, this is the entire macro
69 size_t token_pos; /* Offset into src->buffer of token start. */
70 size_t token_len; /* Length of source for token in bytes. */
71 int first_line; /* Line number at token_pos. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
84 lex_token_destroy (struct lex_token *t)
86 token_uninit (&t->token);
89 assert (*t->ref_cnt > 0);
99 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
104 struct lex_token **tokens;
107 static void lex_stage_clear (struct lex_stage *);
108 static void lex_stage_uninit (struct lex_stage *);
110 static size_t lex_stage_count (const struct lex_stage *);
111 static bool lex_stage_is_empty (const struct lex_stage *);
113 static struct lex_token *lex_stage_last (struct lex_stage *);
114 static struct lex_token *lex_stage_first (struct lex_stage *);
115 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
117 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
118 static void lex_stage_pop_first (struct lex_stage *);
120 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
123 /* Deletes all the tokens from STAGE. */
125 lex_stage_clear (struct lex_stage *stage)
127 while (!deque_is_empty (&stage->deque))
128 lex_stage_pop_first (stage);
131 /* Deletes all the tokens from STAGE and frees storage for the deque. */
133 lex_stage_uninit (struct lex_stage *stage)
135 lex_stage_clear (stage);
136 free (stage->tokens);
139 /* Returns true if STAGE contains no tokens, otherwise false. */
141 lex_stage_is_empty (const struct lex_stage *stage)
143 return deque_is_empty (&stage->deque);
146 /* Returns the number of tokens in STAGE. */
148 lex_stage_count (const struct lex_stage *stage)
150 return deque_count (&stage->deque);
153 /* Returns the last token in STAGE, which must be nonempty. The last token is
154 the one accessed with the greatest lookahead. */
155 static struct lex_token *
156 lex_stage_last (struct lex_stage *stage)
158 return stage->tokens[deque_front (&stage->deque, 0)];
161 /* Returns the first token in STAGE, which must be nonempty.
162 The first token is the one accessed with the least lookahead. */
163 static struct lex_token *
164 lex_stage_first (struct lex_stage *stage)
166 return lex_stage_nth (stage, 0);
169 /* Returns the token at the given INDEX in STAGE. The first token (with the least
170 lookahead) is 0, the second token is 1, and so on. There must be at least
171 INDEX + 1 tokens in STAGE. */
172 static struct lex_token *
173 lex_stage_nth (struct lex_stage *stage, size_t index)
175 return stage->tokens[deque_back (&stage->deque, index)];
178 /* Adds TOKEN so that it becomes the last token in STAGE. */
180 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
182 if (deque_is_full (&stage->deque))
183 stage->tokens = deque_expand (&stage->deque, stage->tokens,
184 sizeof *stage->tokens);
185 stage->tokens[deque_push_front (&stage->deque)] = token;
188 /* Removes the first token from STAGE and uninitializes it. */
190 lex_stage_pop_first (struct lex_stage *stage)
192 lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
195 /* Removes the first N tokens from SRC, appending them to DST as the last
198 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
200 for (size_t i = 0; i < n; i++)
202 lex_stage_push_last (dst, lex_stage_first (src));
203 deque_pop_back (&src->deque);
207 /* A source of tokens, corresponding to a syntax file.
209 This is conceptually a lex_reader wrapped with everything needed to convert
210 its UTF-8 bytes into tokens. */
213 struct ll ll; /* In lexer's list of sources. */
214 struct lex_reader *reader;
216 struct segmenter segmenter;
217 bool eof; /* True if T_STOP was read from 'reader'. */
219 /* Buffer of UTF-8 bytes. */
220 char *buffer; /* Source file contents. */
221 size_t length; /* Number of bytes filled. */
222 size_t allocated; /* Number of bytes allocated. */
224 /* Offsets into 'buffer'. */
225 size_t journal_pos; /* First byte not yet output to journal. */
226 size_t seg_pos; /* First byte not yet scanned as token. */
228 int n_newlines; /* Number of new-lines up to seg_pos. */
229 bool suppress_next_newline;
233 This is a pipeline with the following stages. Each token eventually
234 made available to the parser passes through each of these stages. The stages
235 are named after the processing that happens in each one.
237 Initially, tokens come from the segmenter and scanner to 'pp':
239 - pp: Tokens that need to pass through the macro preprocessor to end up
242 - merge: Tokens that need to pass through scan_merge() to end up in
245 - lookahead: Tokens available to the client for parsing. */
247 struct lex_stage merge;
248 struct lex_stage lookahead;
251 static struct lex_source *lex_source_create (struct lexer *,
252 struct lex_reader *);
253 static void lex_source_destroy (struct lex_source *);
258 struct ll_list sources; /* Contains "struct lex_source"s. */
259 struct macro_set *macros;
262 static struct lex_source *lex_source__ (const struct lexer *);
263 static char *lex_source_get_syntax__ (const struct lex_source *,
265 static const struct lex_token *lex_next__ (const struct lexer *, int n);
266 static void lex_source_push_endcmd__ (struct lex_source *);
268 static bool lex_source_get_lookahead (struct lex_source *);
269 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
270 const char *format, va_list)
271 PRINTF_FORMAT (4, 0);
272 static const struct lex_token *lex_source_next__ (const struct lex_source *,
275 /* Initializes READER with the specified CLASS and otherwise some reasonable
276 defaults. The caller should fill in the other members as desired. */
278 lex_reader_init (struct lex_reader *reader,
279 const struct lex_reader_class *class)
281 reader->class = class;
282 reader->syntax = SEG_MODE_AUTO;
283 reader->error = LEX_ERROR_CONTINUE;
284 reader->file_name = NULL;
285 reader->encoding = NULL;
286 reader->line_number = 0;
290 /* Frees any file name already in READER and replaces it by a copy of
291 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
293 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
295 free (reader->file_name);
296 reader->file_name = xstrdup_if_nonnull (file_name);
299 /* Creates and returns a new lexer. */
303 struct lexer *lexer = xmalloc (sizeof *lexer);
304 *lexer = (struct lexer) {
305 .sources = LL_INITIALIZER (lexer->sources),
306 .macros = macro_set_create (),
311 /* Destroys LEXER. */
313 lex_destroy (struct lexer *lexer)
317 struct lex_source *source, *next;
319 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
320 lex_source_destroy (source);
321 macro_set_destroy (lexer->macros);
326 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
327 same name. Takes ownership of M. */
329 lex_define_macro (struct lexer *lexer, struct macro *m)
331 macro_set_add (lexer->macros, m);
334 /* Inserts READER into LEXER so that the next token read by LEXER comes from
335 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
338 lex_include (struct lexer *lexer, struct lex_reader *reader)
340 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
341 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
344 /* Appends READER to LEXER, so that it will be read after all other current
345 readers have already been read. */
347 lex_append (struct lexer *lexer, struct lex_reader *reader)
349 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
354 /* Advances LEXER to the next token, consuming the current token. */
356 lex_get (struct lexer *lexer)
358 struct lex_source *src;
360 src = lex_source__ (lexer);
364 if (!lex_stage_is_empty (&src->lookahead))
365 lex_stage_pop_first (&src->lookahead);
367 while (lex_stage_is_empty (&src->lookahead))
368 if (!lex_source_get_lookahead (src))
370 lex_source_destroy (src);
371 src = lex_source__ (lexer);
377 /* Advances LEXER by N tokens. */
379 lex_get_n (struct lexer *lexer, size_t n)
385 /* Issuing errors. */
387 /* Prints a syntax error message containing the current token and
388 the message formatted from FORMAT (if non-null). */
390 lex_error (struct lexer *lexer, const char *format, ...)
394 va_start (args, format);
395 lex_next_error_valist (lexer, 0, 0, format, args);
399 /* Prints a syntax error message containing the current token and
400 the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
402 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
404 lex_next_error_valist (lexer, 0, 0, format, args);
407 /* Prints a syntax error message covering tokens N0 through N1 ahead of the
408 current token, with the message formatted from FORMAT (if non-null). */
410 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
414 va_start (args, format);
415 lex_next_error_valist (lexer, n0, n1, format, args);
419 /* Prints a syntax error message saying that one of the strings provided as
420 varargs, up to the first NULL, is expected. */
422 (lex_error_expecting) (struct lexer *lexer, ...)
426 va_start (args, lexer);
427 lex_error_expecting_valist (lexer, args);
431 /* Prints a syntax error message saying that one of the options provided in
432 ARGS, up to the first NULL, is expected. */
434 lex_error_expecting_valist (struct lexer *lexer, va_list args)
436 enum { MAX_OPTIONS = 9 };
437 const char *options[MAX_OPTIONS];
439 while (n < MAX_OPTIONS)
441 const char *option = va_arg (args, const char *);
445 options[n++] = option;
447 lex_error_expecting_array (lexer, options, n);
451 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
456 lex_error (lexer, NULL);
460 lex_error (lexer, _("expecting %s"), options[0]);
464 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
468 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
473 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
474 options[0], options[1], options[2], options[3]);
478 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
479 options[0], options[1], options[2], options[3], options[4]);
483 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
484 options[0], options[1], options[2], options[3], options[4],
489 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
490 options[0], options[1], options[2], options[3], options[4],
491 options[5], options[6]);
495 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
496 options[0], options[1], options[2], options[3], options[4],
497 options[5], options[6], options[7]);
501 lex_error (lexer, NULL);
505 /* Reports an error to the effect that subcommand SBC may only be specified
508 This function does not take a lexer as an argument or use lex_error(),
509 because the result would ordinarily just be redundant: "Syntax error at
510 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
511 not help the user find the error. */
513 lex_sbc_only_once (const char *sbc)
515 msg (SE, _("Subcommand %s may only be specified once."), sbc);
518 /* Reports an error to the effect that subcommand SBC is missing.
520 This function does not take a lexer as an argument or use lex_error(),
521 because a missing subcommand can normally be detected only after the whole
522 command has been parsed, and so lex_error() would always report "Syntax
523 error at end of command", which does not help the user find the error. */
525 lex_sbc_missing (const char *sbc)
527 msg (SE, _("Required subcommand %s was not specified."), sbc);
530 /* Reports an error to the effect that specification SPEC may only be specified
531 once within subcommand SBC. */
533 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
535 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
539 /* Reports an error to the effect that specification SPEC is missing within
542 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
544 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
548 /* Prints a syntax error message covering tokens N0 through N1 ahead of the
549 current token, with the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
551 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
552 const char *format, va_list args)
554 struct lex_source *src = lex_source__ (lexer);
557 lex_source_error_valist (src, n0, n1, format, args);
563 ds_put_format (&s, _("Syntax error at end of input"));
566 ds_put_cstr (&s, ": ");
567 ds_put_vformat (&s, format, args);
569 if (ds_last (&s) != '.')
570 ds_put_byte (&s, '.');
571 msg (SE, "%s", ds_cstr (&s));
576 /* Checks that we're at end of command.
577 If so, returns a successful command completion code.
578 If not, flags a syntax error and returns an error command
581 lex_end_of_command (struct lexer *lexer)
583 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
585 lex_error (lexer, _("expecting end of command"));
592 /* Token testing functions. */
594 /* Returns true if the current token is a number. */
596 lex_is_number (const struct lexer *lexer)
598 return lex_next_is_number (lexer, 0);
601 /* Returns true if the current token is a string. */
603 lex_is_string (const struct lexer *lexer)
605 return lex_next_is_string (lexer, 0);
608 /* Returns the value of the current token, which must be a
609 floating point number. */
611 lex_number (const struct lexer *lexer)
613 return lex_next_number (lexer, 0);
616 /* Returns true iff the current token is an integer. */
618 lex_is_integer (const struct lexer *lexer)
620 return lex_next_is_integer (lexer, 0);
623 /* Returns the value of the current token, which must be an
626 lex_integer (const struct lexer *lexer)
628 return lex_next_integer (lexer, 0);
631 /* Token testing functions with lookahead.
633 A value of 0 for N as an argument to any of these functions refers to the
634 current token. Lookahead is limited to the current command. Any N greater
635 than the number of tokens remaining in the current command will be treated
636 as referring to a T_ENDCMD token. */
638 /* Returns true if the token N ahead of the current token is a number. */
640 lex_next_is_number (const struct lexer *lexer, int n)
642 return token_is_number (lex_next (lexer, n));
645 /* Returns true if the token N ahead of the current token is a string. */
647 lex_next_is_string (const struct lexer *lexer, int n)
649 return token_is_string (lex_next (lexer, n));
652 /* Returns the value of the token N ahead of the current token, which must be a
653 floating point number. */
655 lex_next_number (const struct lexer *lexer, int n)
657 return token_number (lex_next (lexer, n));
660 /* Returns true if the token N ahead of the current token is an integer. */
662 lex_next_is_integer (const struct lexer *lexer, int n)
664 return token_is_integer (lex_next (lexer, n));
667 /* Returns the value of the token N ahead of the current token, which must be
670 lex_next_integer (const struct lexer *lexer, int n)
672 return token_integer (lex_next (lexer, n));
675 /* Token matching functions. */
677 /* If the current token has the specified TYPE, skips it and returns true.
678 Otherwise, returns false. */
680 lex_match (struct lexer *lexer, enum token_type type)
682 if (lex_token (lexer) == type)
691 /* If the current token matches IDENTIFIER, skips it and returns true.
692 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
695 IDENTIFIER must be an ASCII string. */
697 lex_match_id (struct lexer *lexer, const char *identifier)
699 return lex_match_id_n (lexer, identifier, 3);
702 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
703 may be abbreviated to its first N letters. Otherwise, returns false.
705 IDENTIFIER must be an ASCII string. */
707 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
709 if (lex_token (lexer) == T_ID
710 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
719 /* If the current token is integer X, skips it and returns true. Otherwise,
722 lex_match_int (struct lexer *lexer, int x)
724 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
733 /* Forced matches. */
735 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
736 abbreviated to its first 3 letters. Otherwise, reports an error and returns
739 IDENTIFIER must be an ASCII string. */
741 lex_force_match_id (struct lexer *lexer, const char *identifier)
743 if (lex_match_id (lexer, identifier))
747 lex_error_expecting (lexer, identifier);
752 /* If the current token has the specified TYPE, skips it and returns true.
753 Otherwise, reports an error and returns false. */
755 lex_force_match (struct lexer *lexer, enum token_type type)
757 if (lex_token (lexer) == type)
764 const char *type_string = token_type_to_string (type);
767 char *s = xasprintf ("`%s'", type_string);
768 lex_error_expecting (lexer, s);
772 lex_error_expecting (lexer, token_type_to_name (type));
778 /* If the current token is a string, does nothing and returns true.
779 Otherwise, reports an error and returns false. */
781 lex_force_string (struct lexer *lexer)
783 if (lex_is_string (lexer))
787 lex_error (lexer, _("expecting string"));
792 /* If the current token is a string or an identifier, does nothing and returns
793 true. Otherwise, reports an error and returns false.
795 This is meant for use in syntactic situations where we want to encourage the
796 user to supply a quoted string, but for compatibility we also accept
797 identifiers. (One example of such a situation is file names.) Therefore,
798 the error message issued when the current token is wrong only says that a
799 string is expected and doesn't mention that an identifier would also be
802 lex_force_string_or_id (struct lexer *lexer)
804 return lex_token (lexer) == T_ID || lex_force_string (lexer);
807 /* If the current token is an integer, does nothing and returns true.
808 Otherwise, reports an error and returns false. */
810 lex_force_int (struct lexer *lexer)
812 if (lex_is_integer (lexer))
816 lex_error (lexer, _("expecting integer"));
821 /* If the current token is an integer in the range MIN...MAX (inclusive), does
822 nothing and returns true. Otherwise, reports an error and returns false.
823 If NAME is nonnull, then it is used in the error message. */
825 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
827 bool is_number = lex_is_number (lexer);
828 bool is_integer = lex_is_integer (lexer);
829 bool too_small = (is_integer ? lex_integer (lexer) < min
830 : is_number ? lex_number (lexer) < min
832 bool too_big = (is_integer ? lex_integer (lexer) > max
833 : is_number ? lex_number (lexer) > max
835 if (is_integer && !too_small && !too_big)
840 /* Weird, maybe a bug in the caller. Just report that we needed an
843 lex_error (lexer, _("Integer expected for %s."), name);
845 lex_error (lexer, _("Integer expected."));
850 lex_error (lexer, _("Expected %ld for %s."), min, name);
852 lex_error (lexer, _("Expected %ld."), min);
854 else if (min + 1 == max)
857 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
859 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
863 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
864 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
866 if (report_lower_bound && report_upper_bound)
870 _("Expected integer between %ld and %ld for %s."),
873 lex_error (lexer, _("Expected integer between %ld and %ld."),
876 else if (report_lower_bound)
881 lex_error (lexer, _("Expected non-negative integer for %s."),
884 lex_error (lexer, _("Expected non-negative integer."));
889 lex_error (lexer, _("Expected positive integer for %s."),
892 lex_error (lexer, _("Expected positive integer."));
897 lex_error (lexer, _("Expected integer %ld or greater for %s."),
900 lex_error (lexer, _("Expected integer %ld or greater."), min);
903 else if (report_upper_bound)
907 _("Expected integer less than or equal to %ld for %s."),
910 lex_error (lexer, _("Expected integer less than or equal to %ld."),
916 lex_error (lexer, _("Integer expected for %s."), name);
918 lex_error (lexer, _("Integer expected."));
924 /* If the current token is a number, does nothing and returns true.
925 Otherwise, reports an error and returns false. */
927 lex_force_num (struct lexer *lexer)
929 if (lex_is_number (lexer))
932 lex_error (lexer, _("expecting number"));
936 /* If the current token is an identifier, does nothing and returns true.
937 Otherwise, reports an error and returns false. */
939 lex_force_id (struct lexer *lexer)
941 if (lex_token (lexer) == T_ID)
944 lex_error (lexer, _("expecting identifier"));
948 /* Token accessors. */
950 /* Returns the type of LEXER's current token. */
952 lex_token (const struct lexer *lexer)
954 return lex_next_token (lexer, 0);
957 /* Returns the number in LEXER's current token.
959 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
960 tokens this function will always return zero. */
962 lex_tokval (const struct lexer *lexer)
964 return lex_next_tokval (lexer, 0);
967 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
969 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
970 this function will always return NULL.
972 The UTF-8 encoding of the returned string is correct for variable names and
973 other identifiers. Use filename_to_utf8() to use it as a filename. Use
974 data_in() to use it in a "union value". */
976 lex_tokcstr (const struct lexer *lexer)
978 return lex_next_tokcstr (lexer, 0);
981 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
982 null-terminated (but the null terminator is not included in the returned
983 substring's 'length').
985 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
986 this function will always return NULL.
988 The UTF-8 encoding of the returned string is correct for variable names and
989 other identifiers. Use filename_to_utf8() to use it as a filename. Use
990 data_in() to use it in a "union value". */
992 lex_tokss (const struct lexer *lexer)
994 return lex_next_tokss (lexer, 0);
999 A value of 0 for N as an argument to any of these functions refers to the
1000 current token. Lookahead is limited to the current command. Any N greater
1001 than the number of tokens remaining in the current command will be treated
1002 as referring to a T_ENDCMD token. */
1004 static const struct lex_token *
1005 lex_next__ (const struct lexer *lexer_, int n)
1007 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1008 struct lex_source *src = lex_source__ (lexer);
1011 return lex_source_next__ (src, n);
1014 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1019 static const struct lex_token *
1020 lex_source_next__ (const struct lex_source *src_, int n)
1022 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1023 while (lex_stage_count (&src->lookahead) <= n)
1025 if (!lex_stage_is_empty (&src->lookahead))
1027 const struct lex_token *t = lex_stage_last (&src->lookahead);
1028 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1032 lex_source_get_lookahead (src);
1035 return lex_stage_nth (&src->lookahead, n);
1038 /* Returns the "struct token" of the token N after the current one in LEXER.
1039 The returned pointer can be invalidated by pretty much any succeeding call
1040 into the lexer, although the string pointer within the returned token is
1041 only invalidated by consuming the token (e.g. with lex_get()). */
1042 const struct token *
1043 lex_next (const struct lexer *lexer, int n)
1045 return &lex_next__ (lexer, n)->token;
1048 /* Returns the type of the token N after the current one in LEXER. */
1050 lex_next_token (const struct lexer *lexer, int n)
1052 return lex_next (lexer, n)->type;
1055 /* Returns the number in the token N after the current one in LEXER.
1057 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1058 tokens this function will always return zero. */
1060 lex_next_tokval (const struct lexer *lexer, int n)
1062 return token_number (lex_next (lexer, n));
1065 /* Returns the null-terminated string in the token N after the current one, in
1068 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1069 this function will always return NULL.
1071 The UTF-8 encoding of the returned string is correct for variable names and
1072 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1073 data_in() to use it in a "union value". */
1075 lex_next_tokcstr (const struct lexer *lexer, int n)
1077 return lex_next_tokss (lexer, n).string;
1080 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1081 The string is null-terminated (but the null terminator is not included in
1082 the returned substring's 'length').
1084 Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings. For other
1085 tokens this function will always return NULL.
1087 The UTF-8 encoding of the returned string is correct for variable names and
1088 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1089 data_in() to use it in a "union value". */
1091 lex_next_tokss (const struct lexer *lexer, int n)
1093 return lex_next (lexer, n)->string;
1096 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1097 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1098 are both zero, this requests the syntax for the current token.) The caller
1099 must eventually free the returned string (with free()). The syntax is
1100 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1101 example, it may include comments, spaces, and new-lines if it spans multiple
1102 tokens. Macro expansion, however, has already been performed. */
1104 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1106 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1109 /* Returns true if the token N ahead of the current one was produced by macro
1110 expansion, false otherwise. */
1112 lex_next_is_from_macro (const struct lexer *lexer, int n)
1114 return lex_next__ (lexer, n)->macro_rep != NULL;
1118 lex_tokens_match (const struct token *actual, const struct token *expected)
1120 if (actual->type != expected->type)
1123 switch (actual->type)
1127 return actual->number == expected->number;
1130 return lex_id_match (expected->string, actual->string);
1133 return (actual->string.length == expected->string.length
1134 && !memcmp (actual->string.string, expected->string.string,
1135 actual->string.length));
1143 lex_at_phrase__ (struct lexer *lexer, const char *s)
1145 struct string_lexer slex;
1149 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1150 while (string_lexer_next (&slex, &token))
1152 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1153 token_uninit (&token);
1160 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1161 returns true. Otherwise, returns false.
1163 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1164 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1165 first three letters. */
1167 lex_at_phrase (struct lexer *lexer, const char *s)
1169 return lex_at_phrase__ (lexer, s) > 0;
1172 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1173 skips it and returns true. Otherwise, returns false.
1175 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1176 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1177 first three letters. */
1179 lex_match_phrase (struct lexer *lexer, const char *s)
1181 size_t n = lex_at_phrase__ (lexer, s);
1183 lex_get_n (lexer, n);
1188 count_newlines (char *s, size_t length)
1193 while ((newline = memchr (s, '\n', length)) != NULL)
1196 length -= (newline + 1) - s;
1204 lex_token_get_last_line_number (const struct lex_source *src,
1205 const struct lex_token *token)
1207 if (token->first_line == 0)
1211 char *token_str = &src->buffer[token->token_pos];
1212 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1217 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1219 const char *newline = memrchr (src->buffer, '\n', offset);
1220 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1221 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1225 lex_token_get_first_column (const struct lex_source *src,
1226 const struct lex_token *token)
1228 return lex_token_get_column__ (src, token->token_pos);
1232 lex_token_get_last_column (const struct lex_source *src,
1233 const struct lex_token *token)
1235 return lex_token_get_column__ (src, token->token_pos + token->token_len);
1238 static struct msg_location
1239 lex_token_location (const struct lex_source *src,
1240 const struct lex_token *t0,
1241 const struct lex_token *t1)
1243 return (struct msg_location) {
1244 .file_name = src->reader->file_name,
1245 .first_line = t0->first_line,
1246 .last_line = lex_token_get_last_line_number (src, t1),
1247 .first_column = lex_token_get_first_column (src, t0),
1248 .last_column = lex_token_get_last_column (src, t1),
1252 static struct msg_location *
1253 lex_token_location_rw (const struct lex_source *src,
1254 const struct lex_token *t0,
1255 const struct lex_token *t1)
1257 struct msg_location location = lex_token_location (src, t0, t1);
1258 return msg_location_dup (&location);
1261 static struct msg_location *
1262 lex_source_get_location (const struct lex_source *src, int n0, int n1)
1264 return lex_token_location_rw (src,
1265 lex_source_next__ (src, n0),
1266 lex_source_next__ (src, n1));
1269 /* Returns the 1-based line number of the start of the syntax that represents
1270 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1271 if the token is drawn from a source that does not have line numbers. */
1273 lex_get_first_line_number (const struct lexer *lexer, int n)
1275 const struct lex_source *src = lex_source__ (lexer);
1276 return src ? lex_source_next__ (src, n)->first_line : 0;
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token, if the token is drawn from a source that does not have line numbers,
   or if LEXER has no current source.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1325 /* Returns the name of the syntax file from which the current command is drawn.
1326 Returns NULL for a T_STOP token or if the command's source does not have
1329 There is no version of this function that takes an N argument because
1330 lookahead only works to the end of a command and any given command is always
1331 within a single syntax file. */
1333 lex_get_file_name (const struct lexer *lexer)
1335 struct lex_source *src = lex_source__ (lexer);
1336 return src == NULL ? NULL : src->reader->file_name;
1339 /* Returns a newly allocated msg_location for the syntax that represents tokens
1340 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1341 must eventually free the location (with msg_location_destroy()). */
1342 struct msg_location *
1343 lex_get_location (const struct lexer *lexer, int n0, int n1)
1345 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1346 loc->first_column = lex_get_first_column (lexer, n0);
1347 loc->last_column = lex_get_last_column (lexer, n1);
1351 /* Returns a newly allocated msg_location for the syntax that represents tokens
1352 with 0-based offsets N0...N1, inclusive, from the current token. The
1353 location only covers the tokens' lines, not the columns. The caller must
1354 eventually free the location (with msg_location_destroy()). */
1355 struct msg_location *
1356 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1358 struct msg_location *loc = xmalloc (sizeof *loc);
1359 *loc = (struct msg_location) {
1360 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1361 .first_line = lex_get_first_line_number (lexer, n0),
1362 .last_line = lex_get_last_line_number (lexer, n1),
1368 lex_get_encoding (const struct lexer *lexer)
1370 struct lex_source *src = lex_source__ (lexer);
1371 return src == NULL ? NULL : src->reader->encoding;
1374 /* Returns the syntax mode for the syntax file from which the current drawn is
1375 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1376 does not have line numbers.
1378 There is no version of this function that takes an N argument because
1379 lookahead only works to the end of a command and any given command is always
1380 within a single syntax file. */
1382 lex_get_syntax_mode (const struct lexer *lexer)
1384 struct lex_source *src = lex_source__ (lexer);
1385 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1388 /* Returns the error mode for the syntax file from which the current drawn is
1389 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1390 source does not have line numbers.
1392 There is no version of this function that takes an N argument because
1393 lookahead only works to the end of a command and any given command is always
1394 within a single syntax file. */
1396 lex_get_error_mode (const struct lexer *lexer)
1398 struct lex_source *src = lex_source__ (lexer);
1399 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1402 /* If the source that LEXER is currently reading has error mode
1403 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1404 token to be read comes directly from whatever is next read from the stream.
1406 It makes sense to call this function after encountering an error in a
1407 command entered on the console, because usually the user would prefer not to
1408 have cascading errors. */
1410 lex_interactive_reset (struct lexer *lexer)
1412 struct lex_source *src = lex_source__ (lexer);
1413 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1416 src->journal_pos = src->seg_pos = 0;
1417 src->n_newlines = 0;
1418 src->suppress_next_newline = false;
1419 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1421 lex_stage_clear (&src->pp);
1422 lex_stage_clear (&src->merge);
1423 lex_stage_clear (&src->lookahead);
1424 lex_source_push_endcmd__ (src);
1428 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1430 lex_discard_rest_of_command (struct lexer *lexer)
1432 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1436 /* Discards all lookahead tokens in LEXER, then discards all input sources
1437 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1438 runs out of input sources. */
1440 lex_discard_noninteractive (struct lexer *lexer)
1442 struct lex_source *src = lex_source__ (lexer);
1446 lex_stage_clear (&src->pp);
1447 lex_stage_clear (&src->merge);
1448 lex_stage_clear (&src->lookahead);
1450 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1451 src = lex_source__ (lexer))
1452 lex_source_destroy (src);
1457 lex_source_expand__ (struct lex_source *src)
1459 if (src->length >= src->allocated)
1460 src->buffer = x2realloc (src->buffer, &src->allocated);
1464 lex_source_read__ (struct lex_source *src)
1468 lex_source_expand__ (src);
1470 size_t space = src->allocated - src->length;
1471 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1472 size_t n = src->reader->class->read (src->reader,
1473 &src->buffer[src->length],
1475 assert (n <= space);
1480 src->reader->eof = true;
1486 while (!memchr (&src->buffer[src->seg_pos], '\n',
1487 src->length - src->seg_pos));
1490 static struct lex_source *
1491 lex_source__ (const struct lexer *lexer)
1493 return (ll_is_empty (&lexer->sources) ? NULL
1494 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1497 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1498 one, through N1 ahead of the current one, inclusive. (For example, if N0
1499 and N1 are both zero, this requests the syntax for the current token.) The
1500 caller must eventually free the returned string (with free()). The syntax
1501 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1502 for example, it may include comments, spaces, and new-lines if it spans
1503 multiple tokens. Macro expansion, however, has already been performed. */
1505 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1507 struct string s = DS_EMPTY_INITIALIZER;
1508 for (size_t i = n0; i <= n1; )
1510 /* Find [I,J) as the longest sequence of tokens not produced by macro
1511 expansion, or otherwise the longest sequence expanded from a single
1513 const struct lex_token *first = lex_source_next__ (src, i);
1515 for (j = i + 1; j <= n1; j++)
1517 const struct lex_token *cur = lex_source_next__ (src, j);
1518 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1519 || first->macro_rep != cur->macro_rep)
1522 const struct lex_token *last = lex_source_next__ (src, j - 1);
1524 /* Now add the syntax for this sequence of tokens to SRC. */
1525 if (!ds_is_empty (&s))
1526 ds_put_byte (&s, ' ');
1527 if (!first->macro_rep)
1529 size_t start = first->token_pos;
1530 size_t end = last->token_pos + last->token_len;
1531 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1535 size_t start = first->ofs;
1536 size_t end = last->ofs + last->len;
1537 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1543 return ds_steal_cstr (&s);
1547 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1549 for (size_t i = n0; i <= n1; i++)
1550 if (lex_source_next__ (src, i)->macro_rep)
1555 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1556 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1557 other tokens included in that range. The syntax is encoded in UTF-8 and in
1558 the original form supplied to the lexer so that, for example, it may include
1559 comments, spaces, and new-lines if it spans multiple tokens.
1561 Returns an empty string if the token range doesn't include a macro call.
1563 The caller must not modify or free the returned string. */
1564 static struct substring
1565 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1567 if (!lex_source_contains_macro_call (src, n0, n1))
1570 const struct lex_token *token0 = lex_source_next__ (src, n0);
1571 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1572 size_t start = token0->token_pos;
1573 size_t end = token1->token_pos + token1->token_len;
1575 return ss_buffer (&src->buffer[start], end - start);
/* Composes a syntax error message about tokens N0...N1 ahead of the current
   one in SRC.

   The message starts with "Syntax error at end of command" when token N0 is
   T_ENDCMD.  Otherwise it quotes an ellipsized copy of the tokens' syntax
   and, when there is one, of the macro call that expanded to it.  The text
   formatted from FORMAT and ARGS follows after ": ", and a final '.' is added
   if the text does not already end in one.

   The result is stored in a newly allocated struct msg with category
   MSG_C_SYNTAX, severity MSG_S_ERROR, and the location of the tokens.
   NOTE(review): how the message is finally emitted, and whether FORMAT may be
   null, cannot be seen from the statements visible here -- confirm. */
1579 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1580 const char *format, va_list args)
1582 const struct lex_token *token;
1587 token = lex_source_next__ (src, n0);
1588 if (token->token.type == T_ENDCMD)
1589 ds_put_cstr (&s, _("Syntax error at end of command"));
1592 /* Get the syntax that caused the error. */
1593 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1595 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1598 /* Get the macro call(s) that expanded to the syntax that caused the
1601 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1608 _("Syntax error at `%s' (in expansion of `%s')"),
1611 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1616 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1619 ds_put_cstr (&s, _("Syntax error"));
1625 ds_put_cstr (&s, ": ");
1626 ds_put_vformat (&s, format, args);
1628 if (ds_last (&s) != '.')
1629 ds_put_byte (&s, '.');
1631 struct msg *m = xmalloc (sizeof *m);
1633 .category = MSG_C_SYNTAX,
1634 .severity = MSG_S_ERROR,
1635 .location = lex_source_get_location (src, n0, n1),
1636 .text = ds_steal_cstr (&s),
/* Composes a syntax error message for TOKEN in SRC.  The text is "Syntax
   error at `...'", quoting an ellipsized copy of TOKEN's syntax from SRC's
   buffer, followed by ": " and TOKEN's own token string.

   The result is stored in a newly allocated struct msg with category
   MSG_C_SYNTAX, severity MSG_S_ERROR, and TOKEN's location.
   NOTE(review): the token string is presumably the tokenizer's error text,
   since the visible caller runs on TOKENIZE_ERROR -- confirm. */
1642 lex_get_error (struct lex_source *src, const struct lex_token *token)
1645 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1646 syntax, sizeof syntax);
1648 struct string s = DS_EMPTY_INITIALIZER;
1649 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1650 ds_put_format (&s, ": %s", token->token.string.string);
1652 struct msg *m = xmalloc (sizeof *m);
1654 .category = MSG_C_SYNTAX,
1655 .severity = MSG_S_ERROR,
1656 .location = lex_token_location_rw (src, token, token),
1657 .text = ds_steal_cstr (&s),
1662 /* Attempts to turn the next segment of SRC's buffered input into a token
   and append it to 'pp' in SRC, reading more from the underlying lex_reader
   when the segmenter needs more input.  Each completed line of syntax is also
   submitted to the output engine as a syntax text item.

   Returns true if a new token was added to SRC's 'pp' stage, false otherwise.
   The caller should retry failures unless SRC's 'eof' marker was set to true
   indicating that there will be no more tokens from this source. */
1668 lex_source_try_get_pp (struct lex_source *src)
1670 /* Append a new token to SRC and initialize it. */
1671 struct lex_token *token = xmalloc (sizeof *token);
1672 token->token = (struct token) { .type = T_STOP };
1673 token->macro_rep = NULL;
1674 token->ref_cnt = NULL;
1675 token->token_pos = src->seg_pos;
1676 if (src->reader->line_number > 0)
1677 token->first_line = src->reader->line_number + src->n_newlines;
1679 token->first_line = 0;
1681 /* Extract a segment. */
1682 const char *segment;
1683 enum segment_type seg_type;
1687 segment = &src->buffer[src->seg_pos];
1688 seg_len = segmenter_push (&src->segmenter, segment,
1689 src->length - src->seg_pos,
1690 src->reader->eof, &seg_type);
1694 /* The segmenter needs more input to produce a segment. */
1695 assert (!src->reader->eof);
1696 lex_source_read__ (src);
1699 /* Update state based on the segment. */
1700 token->token_len = seg_len;
1701 src->seg_pos += seg_len;
1702 if (seg_type == SEG_NEWLINE)
1705 /* Get a token from the segment. */
1706 enum tokenize_result result = token_from_segment (
1707 seg_type, ss_buffer (segment, seg_len), &token->token);
1709 /* If we've reached the end of a line, or the end of a command, then pass
1710 the line to the output engine as a syntax text item. */
1711 int n_lines = seg_type == SEG_NEWLINE;
1712 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1715 src->suppress_next_newline = true;
1717 else if (n_lines > 0 && src->suppress_next_newline)
1720 src->suppress_next_newline = false;
1722 for (int i = 0; i < n_lines; i++)
1724 /* Beginning of line. */
1725 const char *line = &src->buffer[src->journal_pos];
1727 /* Calculate line length, including \n or \r\n end-of-line if present.
1729 We use src->length even though that may be beyond what we've actually
1730 converted to tokens. That's because, if we're emitting the line due
1731 to SEG_END_COMMAND, we want to take the whole line through the
1732 newline, not just through the '.'. */
1733 size_t max_len = src->length - src->journal_pos;
1734 const char *newline = memchr (line, '\n', max_len);
1735 size_t line_len = newline ? newline - line + 1 : max_len;
1737 /* Calculate line length excluding end-of-line. */
1738 size_t copy_len = line_len;
1739 if (copy_len > 0 && line[copy_len - 1] == '\n')
1741 if (copy_len > 0 && line[copy_len - 1] == '\r')
1744 /* Submit the line as syntax. */
1745 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1746 xmemdup0 (line, copy_len),
1749 src->journal_pos += line_len;
1754 case TOKENIZE_ERROR:
1755 lex_get_error (src, token);
1757 case TOKENIZE_EMPTY:
1758 lex_token_destroy (token);
1761 case TOKENIZE_TOKEN:
1762 if (token->token.type == T_STOP)
1764 token->token.type = T_ENDCMD;
1767 lex_stage_push_last (&src->pp, token);
1773 /* Attempts to append a new token to SRC. Returns true if successful, false on
1774 failure. On failure, the end of SRC has been reached and no more tokens
1775 will be forthcoming from it.
1777 Does not make the new token available for lookahead yet; the caller must
1778 adjust SRC's 'middle' pointer to do so. */
1780 lex_source_get_pp (struct lex_source *src)
1783 if (lex_source_try_get_pp (src))
/* Attempts to move tokens from the 'pp' stage of SRC to its 'merge' stage,
   expanding macro calls on the way unless settings_get_mexpand() reports that
   macro expansion is off, in which case everything in 'pp' is shifted over
   unchanged.

   When the tokens at the front of 'pp' turn out to be a macro call, the call's
   tokens are removed from 'pp' and the tokens of its expansion are appended
   to 'merge' instead.  Each expansion token records the position and length
   of the whole call in SRC's buffer and shares the expansion's syntax string
   through 'macro_rep'.  When they are not a macro call, a single token is
   shifted from 'pp' to 'merge'.

   After a macro call has been expanded, the return value is true only if the
   expansion produced at least one token. */
1789 lex_source_try_get_merge (const struct lex_source *src_)
1791 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1793 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1796 if (!settings_get_mexpand ())
1798 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1802 /* Now pass tokens one-by-one to the macro expander.
1804 In the common case where there is no macro to expand, the loop is not
1806 struct macro_call *mc;
1807 int n_call = macro_call_create (src->lexer->macros,
1808 &lex_stage_first (&src->pp)->token, &mc);
1809 for (int ofs = 1; !n_call; ofs++)
1811 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1813 /* This should not be reachable because we always get a T_ENDCMD at
1814 the end of an input file (transformed from T_STOP by
1815 lex_source_try_get_pp()) and the macro_expander should always
1816 terminate expansion on T_ENDCMD. */
1820 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1821 size_t start = t->token_pos;
1822 size_t end = t->token_pos + t->token_len;
1823 const struct macro_token mt = {
1825 .syntax = ss_buffer (&src->buffer[start], end - start),
1827 const struct msg_location loc = lex_token_location (src, t, t);
1828 n_call = macro_call_add (mc, &mt, &loc);
1832 /* False alarm: no macro expansion after all. Use first token as
1833 lookahead. We'll retry macro expansion from the second token next
1835 macro_call_destroy (mc);
1836 lex_stage_shift (&src->merge, &src->pp, 1);
1840 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1841 are a macro call. (These are likely to be the only tokens in 'pp'.)
1843 const struct lex_token *c0 = lex_stage_first (&src->pp);
1844 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1845 struct macro_tokens expansion = { .n = 0 };
1846 struct msg_location loc = lex_token_location (src, c0, c1);
1847 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1848 macro_call_destroy (mc);
1850 /* Convert the macro expansion into syntax for possible error messages
1852 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1853 size_t *len = xnmalloc (expansion.n, sizeof *len);
1854 struct string s = DS_EMPTY_INITIALIZER;
1855 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1857 if (settings_get_mprint ())
1858 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1859 _("Macro Expansion")));
1861 /* Append the macro expansion tokens to the lookahead. */
1862 if (expansion.n > 0)
1864 char *macro_rep = ds_steal_cstr (&s);
1865 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1866 *ref_cnt = expansion.n;
1867 for (size_t i = 0; i < expansion.n; i++)
1869 struct lex_token *token = xmalloc (sizeof *token);
1870 *token = (struct lex_token) {
1871 .token = expansion.mts[i].token,
1872 .token_pos = c0->token_pos,
1873 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1874 .first_line = c0->first_line,
1875 .macro_rep = macro_rep,
1880 lex_stage_push_last (&src->merge, token);
1882 ss_dealloc (&expansion.mts[i].syntax);
1887 free (expansion.mts);
1891 /* Destroy the tokens for the call. */
1892 for (size_t i = 0; i < n_call; i++)
1893 lex_stage_pop_first (&src->pp);
1895 return expansion.n > 0;
1898 /* Attempts to obtain at least one new token into 'merge' in SRC.
1900 Returns true if successful, false on failure. In the latter case, SRC is
1901 exhausted and 'src->eof' is now true. */
1903 lex_source_get_merge (struct lex_source *src)
1906 if (lex_source_try_get_merge (src))
1911 /* Attempts to obtain at least one new token into 'lookahead' in SRC.

   Tokens are taken from 'merge' and fed one at a time to a merger.  Depending
   on its result, either the first token is shifted to 'lookahead' as it is,
   or a run of tokens at the front of 'merge' is replaced in 'lookahead' by a
   single token that spans all of them.  The combined token keeps the macro
   expansion details only when its first and last tokens share the same
   'macro_rep'.

   Returns true if successful, false on failure.  In the latter case, SRC is
   exhausted and 'src->eof' is now true. */
1916 lex_source_get_lookahead (struct lex_source *src)
1918 struct merger m = MERGER_INIT;
1920 for (size_t i = 0; ; i++)
1922 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1924 /* We always get a T_ENDCMD at the end of an input file
1925 (transformed from T_STOP by lex_source_try_get_pp()) and
1926 merger_add() should never return -1 on T_ENDCMD. */
1927 assert (lex_stage_is_empty (&src->merge));
1931 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1935 lex_stage_shift (&src->lookahead, &src->merge, 1);
1938 else if (retval > 0)
1940 /* Add a token that merges all the tokens together. */
1941 const struct lex_token *first = lex_stage_first (&src->merge);
1942 const struct lex_token *last = lex_stage_nth (&src->merge,
1944 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1945 struct lex_token *t = xmalloc (sizeof *t);
1946 *t = (struct lex_token) {
1948 .token_pos = first->token_pos,
1949 .token_len = (last->token_pos - first->token_pos) + last->token_len,
1950 .first_line = first->first_line,
1952 /* This works well if all the tokens were not expanded from macros,
1953 or if they came from the same macro expansion. It just gives up
1954 in the other (corner) cases. */
1955 .macro_rep = macro ? first->macro_rep : NULL,
1956 .ofs = macro ? first->ofs : 0,
1957 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1958 .ref_cnt = macro ? first->ref_cnt : NULL,
1962 lex_stage_push_last (&src->lookahead, t);
1964 for (int i = 0; i < retval; i++)
1965 lex_stage_pop_first (&src->merge);
1972 lex_source_push_endcmd__ (struct lex_source *src)
1974 assert (lex_stage_is_empty (&src->lookahead));
1975 struct lex_token *token = xmalloc (sizeof *token);
1976 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1977 lex_stage_push_last (&src->lookahead, token);
1980 static struct lex_source *
1981 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1983 struct lex_source *src = xmalloc (sizeof *src);
1984 *src = (struct lex_source) {
1986 .segmenter = segmenter_init (reader->syntax, false),
1990 lex_source_push_endcmd__ (src);
1996 lex_source_destroy (struct lex_source *src)
1998 char *file_name = src->reader->file_name;
1999 char *encoding = src->reader->encoding;
2000 if (src->reader->class->destroy != NULL)
2001 src->reader->class->destroy (src->reader);
2005 lex_stage_uninit (&src->pp);
2006 lex_stage_uninit (&src->merge);
2007 lex_stage_uninit (&src->lookahead);
2008 ll_remove (&src->ll);
2012 struct lex_file_reader
2014 struct lex_reader reader;
2015 struct u8_istream *istream;
2018 static struct lex_reader_class lex_file_reader_class;
2020 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2021 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2022 ENCODING, which should take one of the forms accepted by
2023 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2024 mode of the new reader, respectively.
2026 Returns a null pointer if FILE_NAME cannot be opened. */
2028 lex_reader_for_file (const char *file_name, const char *encoding,
2029 enum segmenter_mode syntax,
2030 enum lex_error_mode error)
2032 struct lex_file_reader *r;
2033 struct u8_istream *istream;
2035 istream = (!strcmp(file_name, "-")
2036 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2037 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2038 if (istream == NULL)
2040 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2044 r = xmalloc (sizeof *r);
2045 lex_reader_init (&r->reader, &lex_file_reader_class);
2046 r->reader.syntax = syntax;
2047 r->reader.error = error;
2048 r->reader.file_name = xstrdup (file_name);
2049 r->reader.encoding = xstrdup_if_nonnull (encoding);
2050 r->reader.line_number = 1;
2051 r->istream = istream;
2056 static struct lex_file_reader *
2057 lex_file_reader_cast (struct lex_reader *r)
2059 return UP_CAST (r, struct lex_file_reader, reader);
2063 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2064 enum prompt_style prompt_style UNUSED)
2066 struct lex_file_reader *r = lex_file_reader_cast (r_);
2067 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2070 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2077 lex_file_close (struct lex_reader *r_)
2079 struct lex_file_reader *r = lex_file_reader_cast (r_);
2081 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2083 if (u8_istream_close (r->istream) != 0)
2084 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2087 u8_istream_free (r->istream);
2092 static struct lex_reader_class lex_file_reader_class =
2098 struct lex_string_reader
2100 struct lex_reader reader;
2105 static struct lex_reader_class lex_string_reader_class;
2107 /* Creates and returns a new lex_reader for the contents of S, which must be
2108 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2109 with ss_dealloc() when it is closed. */
2111 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2113 struct lex_string_reader *r;
2115 r = xmalloc (sizeof *r);
2116 lex_reader_init (&r->reader, &lex_string_reader_class);
2117 r->reader.syntax = SEG_MODE_AUTO;
2118 r->reader.encoding = xstrdup_if_nonnull (encoding);
2125 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2126 which must be encoded in ENCODING. The caller retains ownership of S. */
2128 lex_reader_for_string (const char *s, const char *encoding)
2130 struct substring ss;
2131 ss_alloc_substring (&ss, ss_cstr (s));
2132 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string, taking its arguments from
   the ones that follow ENCODING, and creates and returns a new lex_reader for
   the formatted result, which is taken to be encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes ownership of TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2150 static struct lex_string_reader *
2151 lex_string_reader_cast (struct lex_reader *r)
2153 return UP_CAST (r, struct lex_string_reader, reader);
2157 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2158 enum prompt_style prompt_style UNUSED)
2160 struct lex_string_reader *r = lex_string_reader_cast (r_);
2163 chunk = MIN (n, r->s.length - r->offset);
2164 memcpy (buf, r->s.string + r->offset, chunk);
2171 lex_string_close (struct lex_reader *r_)
2173 struct lex_string_reader *r = lex_string_reader_cast (r_);
2179 static struct lex_reader_class lex_string_reader_class =