1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/macro.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
71 src->tail <= line_pos <= token_pos <= src->head. */
72 size_t token_pos; /* Start of token. */
73 size_t token_len; /* Length of source for token in bytes. */
74 size_t line_pos; /* Start of line containing token_pos. */
75 int first_line; /* Line number at token_pos. */
77 /* For a token obtained through macro expansion, this is just this token.
79 For a token obtained through the lexer in an ordinary way, these are
81 char *macro_rep; /* The whole macro expansion. */
82 size_t ofs; /* Offset of this token in macro_rep. */
83 size_t len; /* Length of this token in macro_rep. */
84 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
88 lex_token_destroy (struct lex_token *t)
90 token_uninit (&t->token);
93 assert (*t->ref_cnt > 0);
103 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
108 struct lex_token **tokens;
111 static void lex_stage_clear (struct lex_stage *);
112 static void lex_stage_uninit (struct lex_stage *);
114 static size_t lex_stage_count (const struct lex_stage *);
115 static bool lex_stage_is_empty (const struct lex_stage *);
117 static struct lex_token *lex_stage_last (struct lex_stage *);
118 static struct lex_token *lex_stage_first (struct lex_stage *);
119 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
121 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
122 static void lex_stage_pop_first (struct lex_stage *);
124 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
127 /* Deletes all the tokens from STAGE. */
129 lex_stage_clear (struct lex_stage *stage)
131 while (!deque_is_empty (&stage->deque))
132 lex_stage_pop_first (stage);
135 /* Deletes all the tokens from STAGE and frees storage for the deque. */
137 lex_stage_uninit (struct lex_stage *stage)
139 lex_stage_clear (stage);
140 free (stage->tokens);
143 /* Returns true if STAGE contains no tokens, otherwise false. */
145 lex_stage_is_empty (const struct lex_stage *stage)
147 return deque_is_empty (&stage->deque);
150 /* Returns the number of tokens in STAGE. */
152 lex_stage_count (const struct lex_stage *stage)
154 return deque_count (&stage->deque);
157 /* Returns the last token in STAGE, which must be nonempty. The last token is
158 the one accessed with the greatest lookahead. */
159 static struct lex_token *
160 lex_stage_last (struct lex_stage *stage)
162 return stage->tokens[deque_front (&stage->deque, 0)];
165 /* Returns the first token in STAGE, which must be nonempty.
166 The first token is the one accessed with the least lookahead. */
167 static struct lex_token *
168 lex_stage_first (struct lex_stage *stage)
170 return lex_stage_nth (stage, 0);
173 /* Returns the token at the given INDEX in STAGE. The first token (with the least
174 lookahead) is 0, the second token is 1, and so on. There must be at least
175 INDEX + 1 tokens in STAGE. */
176 static struct lex_token *
177 lex_stage_nth (struct lex_stage *stage, size_t index)
179 return stage->tokens[deque_back (&stage->deque, index)];
182 /* Adds TOKEN so that it becomes the last token in STAGE. */
184 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
186 if (deque_is_full (&stage->deque))
187 stage->tokens = deque_expand (&stage->deque, stage->tokens,
188 sizeof *stage->tokens);
189 stage->tokens[deque_push_front (&stage->deque)] = token;
192 /* Removes the first token from STAGE and uninitializes it. */
194 lex_stage_pop_first (struct lex_stage *stage)
196 lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
199 /* Removes the first N tokens from SRC, appending them to DST as the last
202 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
204 for (size_t i = 0; i < n; i++)
206 lex_stage_push_last (dst, lex_stage_first (src));
207 deque_pop_back (&src->deque);
211 /* A source of tokens, corresponding to a syntax file.
213 This is conceptually a lex_reader wrapped with everything needed to convert
214 its UTF-8 bytes into tokens. */
217 struct ll ll; /* In lexer's list of sources. */
218 struct lex_reader *reader;
220 struct segmenter segmenter;
221 bool eof; /* True if T_STOP was read from 'reader'. */
223 /* Buffer of UTF-8 bytes. */
225 size_t allocated; /* Number of bytes allocated. */
226 size_t tail; /* &buffer[0] offset into UTF-8 source. */
227 size_t head; /* &buffer[head - tail] offset into source. */
229 /* Positions in source file, tail <= pos <= head for each member here. */
230 size_t journal_pos; /* First byte not yet output to journal. */
231 size_t seg_pos; /* First byte not yet scanned as token. */
232 size_t line_pos; /* First byte of line containing seg_pos. */
234 int n_newlines; /* Number of new-lines up to seg_pos. */
235 bool suppress_next_newline;
239 This is a pipeline with the following stages. Each token eventually
240 made available to the parser passes through each of these stages. The stages
241 are named after the processing that happens in each one.
243 Initially, tokens come from the segmenter and scanner to 'pp':
245 - pp: Tokens that need to pass through the macro preprocessor to end up
248 - merge: Tokens that need to pass through scan_merge() to end up in
251 - lookahead: Tokens available to the client for parsing. */
253 struct lex_stage merge;
254 struct lex_stage lookahead;
257 static struct lex_source *lex_source_create (struct lexer *,
258 struct lex_reader *);
259 static void lex_source_destroy (struct lex_source *);
264 struct ll_list sources; /* Contains "struct lex_source"s. */
265 struct macro_set *macros;
268 static struct lex_source *lex_source__ (const struct lexer *);
269 static char *lex_source_get_syntax__ (const struct lex_source *,
271 static const struct lex_token *lex_next__ (const struct lexer *, int n);
272 static void lex_source_push_endcmd__ (struct lex_source *);
274 static bool lex_source_get_lookahead (struct lex_source *);
275 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
276 const char *format, va_list)
277 PRINTF_FORMAT (4, 0);
278 static const struct lex_token *lex_source_next__ (const struct lex_source *,
281 /* Initializes READER with the specified CLASS and otherwise some reasonable
282 defaults. The caller should fill in the other members as desired. */
284 lex_reader_init (struct lex_reader *reader,
285 const struct lex_reader_class *class)
287 reader->class = class;
288 reader->syntax = SEG_MODE_AUTO;
289 reader->error = LEX_ERROR_CONTINUE;
290 reader->file_name = NULL;
291 reader->encoding = NULL;
292 reader->line_number = 0;
296 /* Frees any file name already in READER and replaces it by a copy of
297 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
299 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
301 free (reader->file_name);
302 reader->file_name = xstrdup_if_nonnull (file_name);
305 /* Creates and returns a new lexer. */
309 struct lexer *lexer = xmalloc (sizeof *lexer);
310 *lexer = (struct lexer) {
311 .sources = LL_INITIALIZER (lexer->sources),
312 .macros = macro_set_create (),
317 /* Destroys LEXER. */
319 lex_destroy (struct lexer *lexer)
323 struct lex_source *source, *next;
325 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
326 lex_source_destroy (source);
327 macro_set_destroy (lexer->macros);
332 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
333 same name. Takes ownership of M. */
335 lex_define_macro (struct lexer *lexer, struct macro *m)
337 macro_set_add (lexer->macros, m);
340 /* Inserts READER into LEXER so that the next token read by LEXER comes from
341 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
344 lex_include (struct lexer *lexer, struct lex_reader *reader)
346 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
347 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
350 /* Appends READER to LEXER, so that it will be read after all other current
351 readers have already been read. */
353 lex_append (struct lexer *lexer, struct lex_reader *reader)
355 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
360 /* Advances LEXER to the next token, consuming the current token. */
362 lex_get (struct lexer *lexer)
364 struct lex_source *src;
366 src = lex_source__ (lexer);
370 if (!lex_stage_is_empty (&src->lookahead))
371 lex_stage_pop_first (&src->lookahead);
373 while (lex_stage_is_empty (&src->lookahead))
374 if (!lex_source_get_lookahead (src))
376 lex_source_destroy (src);
377 src = lex_source__ (lexer);
383 /* Advances LEXER by N tokens. */
385 lex_get_n (struct lexer *lexer, size_t n)
391 /* Issuing errors. */
393 /* Prints a syntax error message containing the current token and the
394 message formatted from printf-style FORMAT (if FORMAT is non-null). */
396 lex_error (struct lexer *lexer, const char *format, ...)
400 va_start (args, format);
401 lex_next_error_valist (lexer, 0, 0, format, args);
405 /* Prints a syntax error message containing the current token and the
406 message formatted from printf-style FORMAT and ARGS (if FORMAT is non-null). */
408 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
410 lex_next_error_valist (lexer, 0, 0, format, args);
413 /* Prints a syntax error message for the tokens N0 through N1 ahead of the
414 current one, with the message formatted from FORMAT (if FORMAT is non-null). */
416 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
420 va_start (args, format);
421 lex_next_error_valist (lexer, n0, n1, format, args);
425 /* Prints a syntax error message saying that one of the strings provided as
426 varargs, up to the first NULL, is expected. */
428 (lex_error_expecting) (struct lexer *lexer, ...)
432 va_start (args, lexer);
433 lex_error_expecting_valist (lexer, args);
437 /* Prints a syntax error message saying that one of the options provided in
438 ARGS, up to the first NULL, is expected. */
440 lex_error_expecting_valist (struct lexer *lexer, va_list args)
442 enum { MAX_OPTIONS = 9 };
443 const char *options[MAX_OPTIONS];
445 while (n < MAX_OPTIONS)
447 const char *option = va_arg (args, const char *);
451 options[n++] = option;
453 lex_error_expecting_array (lexer, options, n);
457 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
462 lex_error (lexer, NULL);
466 lex_error (lexer, _("expecting %s"), options[0]);
470 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
474 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
479 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
480 options[0], options[1], options[2], options[3]);
484 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
485 options[0], options[1], options[2], options[3], options[4]);
489 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
490 options[0], options[1], options[2], options[3], options[4],
495 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
496 options[0], options[1], options[2], options[3], options[4],
497 options[5], options[6]);
501 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
502 options[0], options[1], options[2], options[3], options[4],
503 options[5], options[6], options[7]);
507 lex_error (lexer, NULL);
511 /* Reports an error to the effect that subcommand SBC may only be specified
514 This function does not take a lexer as an argument or use lex_error(),
515 because the result would ordinarily just be redundant: "Syntax error at
516 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
517 not help the user find the error. */
519 lex_sbc_only_once (const char *sbc)
521 msg (SE, _("Subcommand %s may only be specified once."), sbc);
524 /* Reports an error to the effect that subcommand SBC is missing.
526 This function does not take a lexer as an argument or use lex_error(),
527 because a missing subcommand can normally be detected only after the whole
528 command has been parsed, and so lex_error() would always report "Syntax
529 error at end of command", which does not help the user find the error. */
531 lex_sbc_missing (const char *sbc)
533 msg (SE, _("Required subcommand %s was not specified."), sbc);
536 /* Reports an error to the effect that specification SPEC may only be specified
537 once within subcommand SBC. */
539 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
541 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
545 /* Reports an error to the effect that specification SPEC is missing within
548 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
550 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
554 /* Prints a syntax error message for the tokens N0 through N1 ahead of the
555 current one, formatted from FORMAT and ARGS (if FORMAT is non-null). */
557 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
558 const char *format, va_list args)
560 struct lex_source *src = lex_source__ (lexer);
563 lex_source_error_valist (src, n0, n1, format, args);
569 ds_put_format (&s, _("Syntax error at end of input"));
572 ds_put_cstr (&s, ": ");
573 ds_put_vformat (&s, format, args);
575 if (ds_last (&s) != '.')
576 ds_put_byte (&s, '.');
577 msg (SE, "%s", ds_cstr (&s));
582 /* Checks that we're at end of command.
583 If so, returns a successful command completion code.
584 If not, flags a syntax error and returns an error command
587 lex_end_of_command (struct lexer *lexer)
589 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
591 lex_error (lexer, _("expecting end of command"));
598 /* Token testing functions. */
600 /* Returns true if the current token is a number. */
602 lex_is_number (const struct lexer *lexer)
604 return lex_next_is_number (lexer, 0);
607 /* Returns true if the current token is a string. */
609 lex_is_string (const struct lexer *lexer)
611 return lex_next_is_string (lexer, 0);
614 /* Returns the value of the current token, which must be a
615 floating point number. */
617 lex_number (const struct lexer *lexer)
619 return lex_next_number (lexer, 0);
622 /* Returns true iff the current token is an integer. */
624 lex_is_integer (const struct lexer *lexer)
626 return lex_next_is_integer (lexer, 0);
629 /* Returns the value of the current token, which must be an
632 lex_integer (const struct lexer *lexer)
634 return lex_next_integer (lexer, 0);
637 /* Token testing functions with lookahead.
639 A value of 0 for N as an argument to any of these functions refers to the
640 current token. Lookahead is limited to the current command. Any N greater
641 than the number of tokens remaining in the current command will be treated
642 as referring to a T_ENDCMD token. */
644 /* Returns true if the token N ahead of the current token is a number. */
646 lex_next_is_number (const struct lexer *lexer, int n)
648 return token_is_number (lex_next (lexer, n));
651 /* Returns true if the token N ahead of the current token is a string. */
653 lex_next_is_string (const struct lexer *lexer, int n)
655 return token_is_string (lex_next (lexer, n));
658 /* Returns the value of the token N ahead of the current token, which must be a
659 floating point number. */
661 lex_next_number (const struct lexer *lexer, int n)
663 return token_number (lex_next (lexer, n));
666 /* Returns true if the token N ahead of the current token is an integer. */
668 lex_next_is_integer (const struct lexer *lexer, int n)
670 return token_is_integer (lex_next (lexer, n));
673 /* Returns the value of the token N ahead of the current token, which must be
676 lex_next_integer (const struct lexer *lexer, int n)
678 return token_integer (lex_next (lexer, n));
681 /* Token matching functions. */
683 /* If the current token has the specified TYPE, skips it and returns true.
684 Otherwise, returns false. */
686 lex_match (struct lexer *lexer, enum token_type type)
688 if (lex_token (lexer) == type)
697 /* If the current token matches IDENTIFIER, skips it and returns true.
698 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
701 IDENTIFIER must be an ASCII string. */
703 lex_match_id (struct lexer *lexer, const char *identifier)
705 return lex_match_id_n (lexer, identifier, 3);
708 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
709 may be abbreviated to its first N letters. Otherwise, returns false.
711 IDENTIFIER must be an ASCII string. */
713 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
715 if (lex_token (lexer) == T_ID
716 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
725 /* If the current token is integer X, skips it and returns true. Otherwise,
728 lex_match_int (struct lexer *lexer, int x)
730 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
739 /* Forced matches. */
741 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
742 abbreviated to its first 3 letters. Otherwise, reports an error and returns
745 IDENTIFIER must be an ASCII string. */
747 lex_force_match_id (struct lexer *lexer, const char *identifier)
749 if (lex_match_id (lexer, identifier))
753 lex_error_expecting (lexer, identifier);
758 /* If the current token has the specified TYPE, skips it and returns true.
759 Otherwise, reports an error and returns false. */
761 lex_force_match (struct lexer *lexer, enum token_type type)
763 if (lex_token (lexer) == type)
770 const char *type_string = token_type_to_string (type);
773 char *s = xasprintf ("`%s'", type_string);
774 lex_error_expecting (lexer, s);
778 lex_error_expecting (lexer, token_type_to_name (type));
784 /* If the current token is a string, does nothing and returns true.
785 Otherwise, reports an error and returns false. */
787 lex_force_string (struct lexer *lexer)
789 if (lex_is_string (lexer))
793 lex_error (lexer, _("expecting string"));
798 /* If the current token is a string or an identifier, does nothing and returns
799 true. Otherwise, reports an error and returns false.
801 This is meant for use in syntactic situations where we want to encourage the
802 user to supply a quoted string, but for compatibility we also accept
803 identifiers. (One example of such a situation is file names.) Therefore,
804 the error message issued when the current token is wrong only says that a
805 string is expected and doesn't mention that an identifier would also be
808 lex_force_string_or_id (struct lexer *lexer)
810 return lex_token (lexer) == T_ID || lex_force_string (lexer);
813 /* If the current token is an integer, does nothing and returns true.
814 Otherwise, reports an error and returns false. */
816 lex_force_int (struct lexer *lexer)
818 if (lex_is_integer (lexer))
822 lex_error (lexer, _("expecting integer"));
827 /* If the current token is an integer in the range MIN...MAX (inclusive), does
828 nothing and returns true. Otherwise, reports an error and returns false.
829 If NAME is nonnull, then it is used in the error message. */
831 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
833 bool is_number = lex_is_number (lexer);
834 bool is_integer = lex_is_integer (lexer);
835 bool too_small = (is_integer ? lex_integer (lexer) < min
836 : is_number ? lex_number (lexer) < min
838 bool too_big = (is_integer ? lex_integer (lexer) > max
839 : is_number ? lex_number (lexer) > max
841 if (is_integer && !too_small && !too_big)
846 /* Weird, maybe a bug in the caller. Just report that we needed an
849 lex_error (lexer, _("Integer expected for %s."), name);
851 lex_error (lexer, _("Integer expected."));
856 lex_error (lexer, _("Expected %ld for %s."), min, name);
858 lex_error (lexer, _("Expected %ld."), min);
860 else if (min + 1 == max)
863 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
865 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
869 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
870 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
872 if (report_lower_bound && report_upper_bound)
876 _("Expected integer between %ld and %ld for %s."),
879 lex_error (lexer, _("Expected integer between %ld and %ld."),
882 else if (report_lower_bound)
887 lex_error (lexer, _("Expected non-negative integer for %s."),
890 lex_error (lexer, _("Expected non-negative integer."));
895 lex_error (lexer, _("Expected positive integer for %s."),
898 lex_error (lexer, _("Expected positive integer."));
903 lex_error (lexer, _("Expected integer %ld or greater for %s."),
906 lex_error (lexer, _("Expected integer %ld or greater."), min);
909 else if (report_upper_bound)
913 _("Expected integer less than or equal to %ld for %s."),
916 lex_error (lexer, _("Expected integer less than or equal to %ld."),
922 lex_error (lexer, _("Integer expected for %s."), name);
924 lex_error (lexer, _("Integer expected."));
930 /* If the current token is a number, does nothing and returns true.
931 Otherwise, reports an error and returns false. */
933 lex_force_num (struct lexer *lexer)
935 if (lex_is_number (lexer))
938 lex_error (lexer, _("expecting number"));
942 /* If the current token is an identifier, does nothing and returns true.
943 Otherwise, reports an error and returns false. */
945 lex_force_id (struct lexer *lexer)
947 if (lex_token (lexer) == T_ID)
950 lex_error (lexer, _("expecting identifier"));
954 /* Token accessors. */
956 /* Returns the type of LEXER's current token. */
958 lex_token (const struct lexer *lexer)
960 return lex_next_token (lexer, 0);
963 /* Returns the number in LEXER's current token.
965 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
966 tokens this function will always return zero. */
968 lex_tokval (const struct lexer *lexer)
970 return lex_next_tokval (lexer, 0);
973 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
975 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
976 this function will always return NULL.
978 The UTF-8 encoding of the returned string is correct for variable names and
979 other identifiers. Use filename_to_utf8() to use it as a filename. Use
980 data_in() to use it in a "union value". */
982 lex_tokcstr (const struct lexer *lexer)
984 return lex_next_tokcstr (lexer, 0);
987 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
988 null-terminated (but the null terminator is not included in the returned
989 substring's 'length').
991 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
992 this function will always return NULL.
994 The UTF-8 encoding of the returned string is correct for variable names and
995 other identifiers. Use filename_to_utf8() to use it as a filename. Use
996 data_in() to use it in a "union value". */
998 lex_tokss (const struct lexer *lexer)
1000 return lex_next_tokss (lexer, 0);
1005 A value of 0 for N as an argument to any of these functions refers to the
1006 current token. Lookahead is limited to the current command. Any N greater
1007 than the number of tokens remaining in the current command will be treated
1008 as referring to a T_ENDCMD token. */
1010 static const struct lex_token *
1011 lex_next__ (const struct lexer *lexer_, int n)
1013 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1014 struct lex_source *src = lex_source__ (lexer);
1017 return lex_source_next__ (src, n);
1020 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1025 static const struct lex_token *
1026 lex_source_next__ (const struct lex_source *src_, int n)
1028 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1029 while (lex_stage_count (&src->lookahead) <= n)
1031 if (!lex_stage_is_empty (&src->lookahead))
1033 const struct lex_token *t = lex_stage_last (&src->lookahead);
1034 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1038 lex_source_get_lookahead (src);
1041 return lex_stage_nth (&src->lookahead, n);
1044 /* Returns the "struct token" of the token N after the current one in LEXER.
1045 The returned pointer can be invalidated by pretty much any succeeding call
1046 into the lexer, although the string pointer within the returned token is
1047 only invalidated by consuming the token (e.g. with lex_get()). */
1048 const struct token *
1049 lex_next (const struct lexer *lexer, int n)
1051 return &lex_next__ (lexer, n)->token;
1054 /* Returns the type of the token N after the current one in LEXER. */
1056 lex_next_token (const struct lexer *lexer, int n)
1058 return lex_next (lexer, n)->type;
1061 /* Returns the number in the token N after the current one in LEXER.
1063 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1064 tokens this function will always return zero. */
1066 lex_next_tokval (const struct lexer *lexer, int n)
1068 return token_number (lex_next (lexer, n));
1071 /* Returns the null-terminated string in the token N after the current one, in
1074 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1075 this function will always return NULL.
1077 The UTF-8 encoding of the returned string is correct for variable names and
1078 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1079 data_in() to use it in a "union value". */
1081 lex_next_tokcstr (const struct lexer *lexer, int n)
1083 return lex_next_tokss (lexer, n).string;
1086 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1087 The string is null-terminated (but the null terminator is not included in
1088 the returned substring's 'length').
1090 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1091 tokens this function will always return NULL.
1093 The UTF-8 encoding of the returned string is correct for variable names and
1094 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1095 data_in() to use it in a "union value". */
1097 lex_next_tokss (const struct lexer *lexer, int n)
1099 return lex_next (lexer, n)->string;
1102 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1103 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1104 are both zero, this requests the syntax for the current token.) The caller
1105 must eventually free the returned string (with free()). The syntax is
1106 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1107 example, it may include comments, spaces, and new-lines if it spans multiple
1108 tokens. Macro expansion, however, has already been performed. */
1110 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1112 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1115 /* Returns true if the token N ahead of the current one was produced by macro
1116 expansion, false otherwise. */
1118 lex_next_is_from_macro (const struct lexer *lexer, int n)
1120 return lex_next__ (lexer, n)->macro_rep != NULL;
1124 lex_tokens_match (const struct token *actual, const struct token *expected)
1126 if (actual->type != expected->type)
1129 switch (actual->type)
1133 return actual->number == expected->number;
1136 return lex_id_match (expected->string, actual->string);
1139 return (actual->string.length == expected->string.length
1140 && !memcmp (actual->string.string, expected->string.string,
1141 actual->string.length));
1149 lex_at_phrase__ (struct lexer *lexer, const char *s)
1151 struct string_lexer slex;
1155 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1156 while (string_lexer_next (&slex, &token))
1158 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1159 token_uninit (&token);
1166 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1167 returns true. Otherwise, returns false.
1169 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1170 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1171 first three letters. */
1173 lex_at_phrase (struct lexer *lexer, const char *s)
1175 return lex_at_phrase__ (lexer, s) > 0;
1178 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1179 skips it and returns true. Otherwise, returns false.
1181 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1182 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1183 first three letters. */
1185 lex_match_phrase (struct lexer *lexer, const char *s)
1187 size_t n = lex_at_phrase__ (lexer, s);
1189 lex_get_n (lexer, n);
1194 count_newlines (char *s, size_t length)
1199 while ((newline = memchr (s, '\n', length)) != NULL)
1202 length -= (newline + 1) - s;
1210 lex_token_get_last_line_number (const struct lex_source *src,
1211 const struct lex_token *token)
1213 if (token->first_line == 0)
1217 char *token_str = &src->buffer[token->token_pos - src->tail];
1218 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1223 count_columns (const char *s_, size_t length)
1225 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1231 for (ofs = 0; ofs < length; ofs += mblen)
1235 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1238 int width = uc_width (uc, "UTF-8");
1243 columns = ROUND_UP (columns + 1, 8);
1250 lex_token_get_first_column (const struct lex_source *src,
1251 const struct lex_token *token)
1253 return count_columns (&src->buffer[token->line_pos - src->tail],
1254 token->token_pos - token->line_pos);
1258 lex_token_get_last_column (const struct lex_source *src,
1259 const struct lex_token *token)
1261 char *start, *end, *newline;
1263 start = &src->buffer[token->line_pos - src->tail];
1264 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1265 newline = memrchr (start, '\n', end - start);
1266 if (newline != NULL)
1267 start = newline + 1;
1268 return count_columns (start, end - start);
/* Builds, by value, the message location that spans tokens T0 through T1 in
   SRC.  The first line and first column are taken from T0; the last line and
   last column are taken from T1.

   'file_name' is the reader's own pointer, not a copy, so the result borrows
   that string. */
1271 static struct msg_location
1272 lex_token_location (const struct lex_source *src,
1273 const struct lex_token *t0,
1274 const struct lex_token *t1)
1276 return (struct msg_location) {
1277 .file_name = src->reader->file_name,
1278 .first_line = t0->first_line,
1279 .last_line = lex_token_get_last_line_number (src, t1),
1280 .first_column = lex_token_get_first_column (src, t0),
1281 .last_column = lex_token_get_last_column (src, t1),
/* Like lex_token_location(), but instead of a struct by value it returns the
   separate copy that msg_location_dup() makes of that struct.  (Ownership of
   the copy presumably passes to the caller -- confirm against
   msg_location_dup().) */
1285 static struct msg_location *
1286 lex_token_location_rw (const struct lex_source *src,
1287 const struct lex_token *t0,
1288 const struct lex_token *t1)
1290 struct msg_location location = lex_token_location (src, t0, t1);
1291 return msg_location_dup (&location);
/* Returns the location, as produced by lex_token_location_rw(), that spans
   the two tokens that lex_source_next__() yields for N0 and N1 in SRC. */
1294 static struct msg_location *
1295 lex_source_get_location (const struct lex_source *src, int n0, int n1)
1297 return lex_token_location_rw (src,
1298 lex_source_next__ (src, n0),
1299 lex_source_next__ (src, n1));
1302 /* Returns the 1-based line number of the start of the syntax that represents
1303 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1304 if the token is drawn from a source that does not have line numbers. */
1306 lex_get_first_line_number (const struct lexer *lexer, int n)
1308 const struct lex_source *src = lex_source__ (lexer);
/* With no source to read from there is no token to ask, so report 0;
   otherwise the answer is simply the token's recorded 'first_line'. */
1309 return src ? lex_source_next__ (src, n)->first_line : 0;
1312 /* Returns the 1-based line number of the end of the syntax that represents the
1313 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1314 token or if the token is drawn from a source that does not have line numbers.
1317 Most of the time, a single token is wholly within a single line of syntax,
1318 but there are two exceptions: a T_STRING token can be made up of multiple
1319 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1320 token can consist of a "-" on one line followed by the number on the next. */
1323 lex_get_last_line_number (const struct lexer *lexer, int n)
1325 const struct lex_source *src = lex_source__ (lexer);
/* With no source to read from, report 0; otherwise defer to
   lex_token_get_last_line_number() for the token. */
1326 return src ? lex_token_get_last_line_number (src,
1327 lex_source_next__ (src, n)) : 0;
1330 /* Returns the 1-based column number of the start of the syntax that represents
1331 the token N after the current one in LEXER. Returns 0 for a T_STOP token.
1334 Column numbers are measured according to the width of characters as shown in
1335 a typical fixed-width font, in which CJK characters have width 2 and
1336 combining characters have width 0. */
1338 lex_get_first_column (const struct lexer *lexer, int n)
1340 const struct lex_source *src = lex_source__ (lexer);
/* With no source to read from, report 0; otherwise defer to
   lex_token_get_first_column() for the token. */
1341 return src ? lex_token_get_first_column (src, lex_source_next__ (src, n)) : 0;
1344 /* Returns the 1-based column number of the end of the syntax that represents
1345 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP token.
1348 Column numbers are measured according to the width of characters as shown in
1349 a typical fixed-width font, in which CJK characters have width 2 and
1350 combining characters have width 0. */
1352 lex_get_last_column (const struct lexer *lexer, int n)
1354 const struct lex_source *src = lex_source__ (lexer);
/* With no source to read from, report 0; otherwise defer to
   lex_token_get_last_column() for the token. */
1355 return src ? lex_token_get_last_column (src, lex_source_next__ (src, n)) : 0;
1358 /* Returns the name of the syntax file from which the current command is drawn.
1359 Returns NULL for a T_STOP token or if the command's source does not have a file name.
1362 There is no version of this function that takes an N argument because
1363 lookahead only works to the end of a command and any given command is always
1364 within a single syntax file. */
1366 lex_get_file_name (const struct lexer *lexer)
1368 struct lex_source *src = lex_source__ (lexer);
/* The string returned is the reader's own 'file_name' pointer, not a copy,
   so the caller must not free it. */
1369 return src == NULL ? NULL : src->reader->file_name;
1372 /* Returns a newly allocated msg_location for the syntax that represents tokens
1373 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1374 must eventually free the location (with msg_location_destroy()). */
1375 struct msg_location *
1376 lex_get_location (const struct lexer *lexer, int n0, int n1)
/* Start from the lines-only location that lex_get_lines() allocates, then fill
   in the columns: the first column comes from token N0, the last column from
   token N1. */
1378 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1379 loc->first_column = lex_get_first_column (lexer, n0);
1380 loc->last_column = lex_get_last_column (lexer, n1);
1384 /* Returns a newly allocated msg_location for the syntax that represents tokens
1385 with 0-based offsets N0...N1, inclusive, from the current token. The
1386 location only covers the tokens' lines, not the columns. The caller must
1387 eventually free the location (with msg_location_destroy()). */
1388 struct msg_location *
1389 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1391 struct msg_location *loc = xmalloc (sizeof *loc);
/* Members not named in the initializer (the columns) are zero-initialized.
   The file name is duplicated, when there is one, so that the location does
   not borrow the reader's string. */
1392 *loc = (struct msg_location) {
1393 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1394 .first_line = lex_get_first_line_number (lexer, n0),
1395 .last_line = lex_get_last_line_number (lexer, n1),
/* Returns the 'encoding' string recorded in the reader of LEXER's current
   source, or NULL if LEXER has no source.  The string is the reader's own
   pointer, not a copy. */
1401 lex_get_encoding (const struct lexer *lexer)
1403 struct lex_source *src = lex_source__ (lexer);
1404 return src == NULL ? NULL : src->reader->encoding;
1407 /* Returns the syntax mode for the syntax file from which the current command is
1408 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1409 does not have line numbers.
1411 There is no version of this function that takes an N argument because
1412 lookahead only works to the end of a command and any given command is always
1413 within a single syntax file. */
1415 lex_get_syntax_mode (const struct lexer *lexer)
1417 struct lex_source *src = lex_source__ (lexer);
/* SEG_MODE_AUTO is the fallback when LEXER has no source at all; otherwise
   the mode is whatever the source's reader was configured with. */
1418 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1421 /* Returns the error mode for the syntax file from which the current command is
1422 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1423 source does not have line numbers.
1425 There is no version of this function that takes an N argument because
1426 lookahead only works to the end of a command and any given command is always
1427 within a single syntax file. */
1429 lex_get_error_mode (const struct lexer *lexer)
1431 struct lex_source *src = lex_source__ (lexer);
/* LEX_ERROR_TERMINAL is the fallback when LEXER has no source at all;
   otherwise the mode is whatever the source's reader was configured with. */
1432 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1435 /* If the source that LEXER is currently reading has error mode
1436 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1437 token to be read comes directly from whatever is next read from the stream.
1439 It makes sense to call this function after encountering an error in a
1440 command entered on the console, because usually the user would prefer not to
1441 have cascading errors. */
1443 lex_interactive_reset (struct lexer *lexer)
1445 struct lex_source *src = lex_source__ (lexer);
1446 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Forget every buffered byte and rewind all the positions that index into the
   buffer. */
1448 src->head = src->tail = 0;
1449 src->journal_pos = src->seg_pos = src->line_pos = 0;
1450 src->n_newlines = 0;
1451 src->suppress_next_newline = false;
/* Start segmentation afresh, keeping the mode the segmenter already had. */
1452 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Drop the tokens queued in all three stages, then seed 'lookahead' with a
   single T_ENDCMD token (lex_source_push_endcmd__() requires 'lookahead' to be
   empty, which the clears guarantee). */
1454 lex_stage_clear (&src->pp);
1455 lex_stage_clear (&src->merge);
1456 lex_stage_clear (&src->lookahead);
1457 lex_source_push_endcmd__ (src);
1461 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1463 lex_discard_rest_of_command (struct lexer *lexer)
/* The loop ends as soon as the current token is T_STOP or T_ENDCMD, so that
   terminating token is itself left unconsumed.  (The loop body is not visible
   in this excerpt; presumably it advances by one token -- confirm.) */
1465 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1469 /* Discards all lookahead tokens in LEXER, then discards all input sources
1470 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1471 runs out of input sources. */
1473 lex_discard_noninteractive (struct lexer *lexer)
1475 struct lex_source *src = lex_source__ (lexer);
/* First throw away the tokens queued in each stage of the current source.
   (Any guard against SRC being null is not visible in this excerpt.) */
1479 lex_stage_clear (&src->pp);
1480 lex_stage_clear (&src->merge);
1481 lex_stage_clear (&src->lookahead);
/* Then destroy sources from the front of the list.  lex_source_destroy()
   unlinks the source it is given, so re-fetching with lex_source__() yields
   the next one. */
1483 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1484 src = lex_source__ (lexer))
1485 lex_source_destroy (src);
/* Computes how far SRC_'s 'tail' may advance without discarding bytes that
   are still needed: the smaller of 'journal_pos' and 'line_pos', further
   limited by the 'line_pos' of the first token in the first non-empty stage
   among 'lookahead', 'merge', and 'pp' (tried in that order).

   What is returned when all three stages are empty is not visible in this
   excerpt; presumably the first minimum alone -- confirm. */
1490 lex_source_max_tail__ (const struct lex_source *src_)
/* The stage accessors used below take a non-const source. */
1492 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1494 assert (src->seg_pos >= src->line_pos);
1495 size_t max_tail = MIN (src->journal_pos, src->line_pos);
1497 /* Use the oldest token also. */
1498 struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp };
1499 for (size_t i = 0; i < sizeof stages / sizeof *stages; i++)
1500 if (!lex_stage_is_empty (stages[i]))
1502 struct lex_token *first = lex_stage_first (stages[i]);
1503 assert (first->token_pos >= first->line_pos);
/* Only the first non-empty stage is consulted: the function returns here. */
1504 return MIN (max_tail, first->line_pos);
/* Ensures that SRC's buffer has room for more input beyond 'head'.  When the
   buffered bytes fill the whole allocation, it first tries to reclaim space
   by advancing 'tail' as far as lex_source_max_tail__() allows, sliding the
   retained bytes to the front of the buffer; if nothing can be reclaimed, it
   enlarges the buffer with x2realloc(). */
1511 lex_source_expand__ (struct lex_source *src)
/* 'head - tail' is the number of bytes currently held in the buffer. */
1513 if (src->head - src->tail >= src->allocated)
1515 size_t max_tail = lex_source_max_tail__ (src);
1516 if (max_tail > src->tail)
1518 /* Advance the tail, freeing up room at the head. */
/* Source and destination ranges may overlap, hence memmove(). */
1519 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1520 src->head - max_tail);
1521 src->tail = max_tail;
1525 /* Buffer is completely full. Expand it. */
1526 src->buffer = x2realloc (src->buffer, &src->allocated);
1531 /* There's space available at the head of the buffer. Nothing to do. */
/* Pulls more input from SRC's reader into the buffer at 'head'.  The trailing
   'while' condition shows that this repeats until the not-yet-segmented bytes
   between 'seg_pos' and 'head' contain a newline.  (The matching 'do', the
   update of 'head', and the test that decides end of input are not visible in
   this excerpt.) */
1536 lex_source_read__ (struct lex_source *src)
1540 lex_source_expand__ (src);
/* HEAD_OFS is where new bytes go in the buffer; SPACE is how many fit. */
1542 size_t head_ofs = src->head - src->tail;
1543 size_t space = src->allocated - head_ofs;
/* The reader is handed the segmenter's idea of the appropriate prompt. */
1544 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1545 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1547 assert (n <= space);
/* End of input is recorded on the reader.  (Presumably this happens when the
   read returned 0 bytes; the condition is not visible here -- confirm.) */
1552 src->reader->eof = true;
1553 lex_source_expand__ (src);
1559 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1560 src->head - src->seg_pos));
/* Returns the lex_source at the head of LEXER's list of sources, or NULL if
   that list is empty. */
1563 static struct lex_source *
1564 lex_source__ (const struct lexer *lexer)
1566 return (ll_is_empty (&lexer->sources) ? NULL
1567 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1570 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1571 one, through N1 ahead of the current one, inclusive. (For example, if N0
1572 and N1 are both zero, this requests the syntax for the current token.) The
1573 caller must eventually free the returned string (with free()). The syntax
1574 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1575 for example, it may include comments, spaces, and new-lines if it spans
1576 multiple tokens. Macro expansion, however, has already been performed. */
1578 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1580 struct string s = DS_EMPTY_INITIALIZER;
/* The 'for' header has no increment clause, so I must be advanced inside the
   body (that statement is not visible in this excerpt; presumably I jumps to
   J once a run has been handled). */
1581 for (size_t i = n0; i <= n1; )
1583 /* Find [I,J) as the longest sequence of tokens not produced by macro
1584 expansion, or otherwise the longest sequence expanded from a single macro call. */
1586 const struct lex_token *first = lex_source_next__ (src, i);
1588 for (j = i + 1; j <= n1; j++)
1590 const struct lex_token *cur = lex_source_next__ (src, j);
/* A run ends where 'macro_rep' changes.  NOTE(review): the second comparison
   already covers every case that the first one catches. */
1591 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1592 || first->macro_rep != cur->macro_rep)
1595 const struct lex_token *last = lex_source_next__ (src, j - 1);
1597 /* Now add the syntax for this sequence of tokens to S. */
/* Runs are separated from each other by a single space. */
1598 if (!ds_is_empty (&s))
1599 ds_put_byte (&s, ' ');
1600 if (!first->macro_rep)
/* Ordinary tokens: copy the bytes straight out of the source buffer, from
   the start of FIRST through the end of LAST. */
1602 size_t start = first->token_pos;
1603 size_t end = last->token_pos + last->token_len;
1604 ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
/* Macro-expanded tokens: copy from the expansion text instead, using the
   tokens' 'ofs' and 'len' within 'macro_rep'. */
1609 size_t start = first->ofs;
1610 size_t end = last->ofs + last->len;
1611 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
/* Ownership of the accumulated string passes to the caller. */
1617 return ds_steal_cstr (&s);
/* Tests whether any of tokens N0...N1 (inclusive) in SRC has a nonnull
   'macro_rep', that is, whether any of them came out of a macro expansion.
   (The return statements are not visible in this excerpt.) */
1621 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1623 for (size_t i = n0; i <= n1; i++)
1624 if (lex_source_next__ (src, i)->macro_rep)
1629 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1630 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1631 other tokens included in that range. The syntax is encoded in UTF-8 and in
1632 the original form supplied to the lexer so that, for example, it may include
1633 comments, spaces, and new-lines if it spans multiple tokens.
1635 Returns an empty string if the token range doesn't include a macro call.
1637 The caller must not modify or free the returned string. */
1638 static struct substring
1639 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1641 if (!lex_source_contains_macro_call (src, n0, n1))
1644 const struct lex_token *token0 = lex_source_next__ (src, n0);
/* MAX() keeps the end token from preceding the start token when N1 < N0. */
1645 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1646 size_t start = token0->token_pos;
1647 size_t end = token1->token_pos + token1->token_len;
/* The result points directly into SRC's buffer; nothing is copied. */
1649 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Composes a syntax error message about tokens N0...N1 (inclusive) in SRC and
   packages it in a newly allocated struct msg with category MSG_C_SYNTAX and
   severity MSG_S_ERROR, located at those tokens.

   The text begins with one of the "Syntax error..." prefixes below, chosen
   according to whether the offending syntax and/or an enclosing macro call
   could be quoted.  Then ": " and FORMAT, formatted printf()-style with ARGS,
   are appended (possibly only under a condition that is not visible in this
   excerpt), and finally a "." is added if the text does not already end in
   one.

   What is done with the finished message is not visible here (presumably it
   is emitted -- confirm). */
1653 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1654 const char *format, va_list args)
1656 const struct lex_token *token;
1661 token = lex_source_next__ (src, n0);
/* An error at the end-of-command token has no syntax worth quoting. */
1662 if (token->token.type == T_ENDCMD)
1663 ds_put_cstr (&s, _("Syntax error at end of command"));
1666 /* Get the syntax that caused the error. */
1667 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
/* Long syntax is shortened to fit the fixed-size SYNTAX array. */
1669 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1672 /* Get the macro call(s) that expanded to the syntax that caused the error. */
1675 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1682 _("Syntax error at `%s' (in expansion of `%s')"),
1685 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1690 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1693 ds_put_cstr (&s, _("Syntax error"));
1699 ds_put_cstr (&s, ": ");
1700 ds_put_vformat (&s, format, args);
/* Make sure the message ends with a period. */
1702 if (ds_last (&s) != '.')
1703 ds_put_byte (&s, '.');
1705 struct msg *m = xmalloc (sizeof *m);
1707 .category = MSG_C_SYNTAX,
1708 .severity = MSG_S_ERROR,
1709 .location = lex_source_get_location (src, n0, n1),
/* The message takes over the text accumulated in S. */
1710 .text = ds_steal_cstr (&s),
/* Builds a syntax error message for TOKEN in SRC: "Syntax error at `...'",
   quoting (and, if necessary, shortening) TOKEN's own source text, followed
   by ": " and the string carried in TOKEN's 'token.string'.  The message is a
   newly allocated struct msg with category MSG_C_SYNTAX and severity
   MSG_S_ERROR, located at TOKEN.

   lex_source_try_get_pp() calls this for TOKENIZE_ERROR results.  What is
   done with the finished message is not visible in this excerpt.

   NOTE(review): "%s" requires 'token.string.string' to be null-terminated --
   confirm that error tokens guarantee this. */
1716 lex_get_error (struct lex_source *src, const struct lex_token *token)
1719 str_ellipsize (ss_buffer (&src->buffer[token->token_pos - src->tail],
1721 syntax, sizeof syntax);
1723 struct string s = DS_EMPTY_INITIALIZER;
1724 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1725 ds_put_format (&s, ": %s", token->token.string.string);
1727 struct msg *m = xmalloc (sizeof *m);
1729 .category = MSG_C_SYNTAX,
1730 .severity = MSG_S_ERROR,
1731 .location = lex_token_location_rw (src, token, token),
/* The message takes over the text accumulated in S. */
1732 .text = ds_steal_cstr (&s),
1737 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1738 underlying lex_reader if necessary. Returns true if a new token was added
1739 to SRC's 'pp' stage, false otherwise. The caller should retry failures unless
1740 SRC's 'eof' marker was set to true indicating that there will be no more
1741 tokens from this source. */
1743 lex_source_try_get_pp (struct lex_source *src)
1745 /* Append a new token to SRC and initialize it. */
1746 struct lex_token *token = xmalloc (sizeof *token);
/* The type defaults to T_STOP until token_from_segment() fills the token in. */
1747 token->token = (struct token) { .type = T_STOP };
1748 token->macro_rep = NULL;
1749 token->ref_cnt = NULL;
1750 token->line_pos = src->line_pos;
1751 token->token_pos = src->seg_pos;
/* A reader whose 'line_number' is not positive yields tokens without line
   numbers, recorded as 'first_line' 0.  Otherwise the token's line is the
   reader's line number plus SRC's 'n_newlines' counter. */
1752 if (src->reader->line_number > 0)
1753 token->first_line = src->reader->line_number + src->n_newlines;
1755 token->first_line = 0;
1757 /* Extract a segment. */
1758 const char *segment;
1759 enum segment_type seg_type;
/* The segmenter is offered every byte between 'seg_pos' and 'head', and is
   told whether the reader has reached end of input. */
1763 segment = &src->buffer[src->seg_pos - src->tail];
1764 seg_len = segmenter_push (&src->segmenter, segment,
1765 src->head - src->seg_pos,
1766 src->reader->eof, &seg_type);
1770 /* The segmenter needs more input to produce a segment. */
1771 assert (!src->reader->eof);
1772 lex_source_read__ (src);
1775 /* Update state based on the segment. */
1776 token->token_len = seg_len;
1777 src->seg_pos += seg_len;
/* A newline segment means that the next line starts right after it. */
1778 if (seg_type == SEG_NEWLINE)
1780 src->line_pos = src->seg_pos;
1784 /* Get a token from the segment. */
1785 enum tokenize_result result = token_from_segment (
1786 seg_type, ss_buffer (segment, seg_len), &token->token);
1788 /* If we've reached the end of a line, or the end of a command, then pass
1789 the line to the output engine as a syntax text item. */
/* N_LINES starts as 1 for a newline segment and 0 otherwise.  (The branches
   below also adjust it, but those statements are not visible in this
   excerpt.) */
1790 int n_lines = seg_type == SEG_NEWLINE;
1791 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1794 src->suppress_next_newline = true;
1796 else if (n_lines > 0 && src->suppress_next_newline)
1799 src->suppress_next_newline = false;
1801 for (int i = 0; i < n_lines; i++)
1803 /* Beginning of line. */
1804 const char *line = &src->buffer[src->journal_pos - src->tail];
1806 /* Calculate line length, including \n or \r\n end-of-line if present.
1808 We use src->head even though that may be beyond what we've actually
1809 converted to tokens (which is only through line_pos). That's because,
1810 if we're emitting the line due to SEG_END_COMMAND, we want to take the
1811 whole line through the newline, not just through the '.'. */
1812 size_t max_len = src->head - src->journal_pos;
1813 const char *newline = memchr (line, '\n', max_len);
1814 size_t line_len = newline ? newline - line + 1 : max_len;
1816 /* Calculate line length excluding end-of-line. */
1817 size_t copy_len = line_len;
1818 if (copy_len > 0 && line[copy_len - 1] == '\n')
1820 if (copy_len > 0 && line[copy_len - 1] == '\r')
1823 /* Submit the line as syntax. */
1824 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1825 xmemdup0 (line, copy_len),
/* The next line to report starts after this one, end-of-line included. */
1828 src->journal_pos += line_len;
/* What happens next depends on RESULT from token_from_segment() above. */
1833 case TOKENIZE_ERROR:
1834 lex_get_error (src, token);
1836 case TOKENIZE_EMPTY:
1837 lex_token_destroy (token);
1840 case TOKENIZE_TOKEN:
/* A T_STOP token is turned into T_ENDCMD.  (Part of this branch is not
   visible in this excerpt.) */
1841 if (token->token.type == T_STOP)
1843 token->token.type = T_ENDCMD;
1846 lex_stage_push_last (&src->pp, token);
1852 /* Attempts to append a new token to SRC. Returns true if successful, false on
1853 failure. On failure, the end of SRC has been reached and no more tokens
1854 will be forthcoming from it.
1856 Does not make the new token available for lookahead yet; the caller must
1857 adjust SRC's 'middle' pointer to do so.
      NOTE(review): nothing visible in this part of the file refers to a
      'middle' member.  Tokens appended by lex_source_try_get_pp() go into
      'pp' and reach 'lookahead' only by way of 'merge', so this sentence
      looks stale -- confirm and reword. */
1859 lex_source_get_pp (struct lex_source *src)
/* (The loop that retries this call is not visible in this excerpt.) */
1862 if (lex_source_try_get_pp (src))
/* Tries to move tokens from SRC_'s 'pp' stage into its 'merge' stage,
   expanding a macro call at the front of 'pp' if there is one.

   When macro expansion is turned off (settings_get_mexpand() is false), every
   token in 'pp' is shifted to 'merge' unchanged.  Otherwise the tokens are
   fed to the macro expander: if they do not form a macro call, just the first
   token is shifted; if they do, the call's tokens are removed from 'pp' and
   the tokens of the expansion are appended to 'merge' in their place.

   In the macro-call case, the result is true only if the expansion produced
   at least one token.  (The other return statements are not visible in this
   excerpt.) */
1868 lex_source_try_get_merge (const struct lex_source *src_)
1870 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Make sure 'pp' holds at least one token to work with. */
1872 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1875 if (!settings_get_mexpand ())
1877 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1881 /* Now pass tokens one-by-one to the macro expander.
1883 In the common case where there is no macro to expand, the loop is not entered. */
1885 struct macro_call *mc;
1886 int n_call = macro_call_create (src->lexer->macros,
1887 &lex_stage_first (&src->pp)->token, &mc);
/* While N_CALL stays 0, keep feeding further tokens from 'pp', starting with
   the second, fetching more into 'pp' as needed. */
1888 for (int ofs = 1; !n_call; ofs++)
1890 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1892 /* This should not be reachable because we always get a T_ENDCMD at
1893 the end of an input file (transformed from T_STOP by
1894 lex_source_try_get_pp()) and the macro_expander should always
1895 terminate expansion on T_ENDCMD. */
1899 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1900 size_t start = t->token_pos;
1901 size_t end = t->token_pos + t->token_len;
/* The expander gets the token along with its original source text. */
1902 const struct macro_token mt = {
1904 .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
1906 const struct msg_location loc = lex_token_location (src, t, t);
1907 n_call = macro_call_add (mc, &mt, &loc);
1911 /* False alarm: no macro expansion after all. Use first token as
1912 lookahead. We'll retry macro expansion from the second token next time around. */
1914 macro_call_destroy (mc);
1915 lex_stage_shift (&src->merge, &src->pp, 1);
1919 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1920 are a macro call. (These are likely to be the only tokens in 'pp'.) */
1922 const struct lex_token *c0 = lex_stage_first (&src->pp);
1923 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1924 struct macro_tokens expansion = { .n = 0 };
1925 struct msg_location loc = lex_token_location (src, c0, c1);
1926 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1927 macro_call_destroy (mc);
1929 /* Convert the macro expansion into syntax for possible error messages later. */
/* OFS[i] and LEN[i] receive the position and length of expansion token i
   within the text built in S (as filled in by macro_tokens_to_syntax()). */
1931 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1932 size_t *len = xnmalloc (expansion.n, sizeof *len);
1933 struct string s = DS_EMPTY_INITIALIZER;
1934 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1936 if (settings_get_mprint ())
1937 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1938 _("Macro Expansion")));
1940 /* Append the macro expansion tokens to the lookahead. */
1941 if (expansion.n > 0)
/* All tokens of this expansion share one MACRO_REP string, together with a
   counter that starts at the number of tokens sharing it. */
1943 char *macro_rep = ds_steal_cstr (&s);
1944 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1945 *ref_cnt = expansion.n;
1946 for (size_t i = 0; i < expansion.n; i++)
1948 struct lex_token *token = xmalloc (sizeof *token);
/* Every expansion token is positioned at the whole macro call C0...C1. */
1949 *token = (struct lex_token) {
1950 .token = expansion.mts[i].token,
1951 .token_pos = c0->token_pos,
1952 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1953 .line_pos = c0->line_pos,
1954 .first_line = c0->first_line,
1955 .macro_rep = macro_rep,
1960 lex_stage_push_last (&src->merge, token);
1962 ss_dealloc (&expansion.mts[i].syntax);
1967 free (expansion.mts);
1971 /* Destroy the tokens for the call. */
1972 for (size_t i = 0; i < n_call; i++)
1973 lex_stage_pop_first (&src->pp);
/* An expansion that produced no tokens counts as "nothing obtained". */
1975 return expansion.n > 0;
1978 /* Attempts to obtain at least one new token into 'merge' in SRC.
1980 Returns true if successful, false on failure. In the latter case, SRC is
1981 exhausted and 'src->reader->eof' is now true. */
1983 lex_source_get_merge (struct lex_source *src)
/* (The loop that retries this call is not visible in this excerpt.) */
1986 if (lex_source_try_get_merge (src))
1991 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1993 Returns true if successful, false on failure. In the latter case, SRC is
1994 exhausted and 'src->reader->eof' is now true. */
1996 lex_source_get_lookahead (struct lex_source *src)
1998 struct merger m = MERGER_INIT;
/* Feed tokens from 'merge' to the merger one at a time until it reaches a
   verdict; I counts the tokens that have been fed so far. */
2000 for (size_t i = 0; ; i++)
/* Make sure 'merge' holds a token at index I, fetching more if needed. */
2002 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2004 /* We always get a T_ENDCMD at the end of an input file
2005 (transformed from T_STOP by lex_source_try_get_pp()) and
2006 merger_add() should never return -1 on T_ENDCMD. */
2007 assert (lex_stage_is_empty (&src->merge));
2011 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
/* Here the first token moves to 'lookahead' unchanged.  (The condition on
   RETVAL for this case is not visible in this excerpt.) */
2015 lex_stage_shift (&src->lookahead, &src->merge, 1);
/* A positive RETVAL is the number of leading 'merge' tokens that are replaced
   by a single merged token. */
2018 else if (retval > 0)
2020 /* Add a token that merges all the tokens together. */
2021 const struct lex_token *first = lex_stage_first (&src->merge);
2022 const struct lex_token *last = lex_stage_nth (&src->merge,
/* MACRO is true only when both ends came from the same macro expansion. */
2024 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2025 struct lex_token *t = xmalloc (sizeof *t);
2026 *t = (struct lex_token) {
/* The merged token spans from the start of FIRST to the end of LAST. */
2028 .token_pos = first->token_pos,
2029 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2030 .line_pos = first->line_pos,
2031 .first_line = first->first_line,
2033 /* This works well if all the tokens were not expanded from macros,
2034 or if they came from the same macro expansion. It just gives up
2035 in the other (corner) cases. */
2036 .macro_rep = macro ? first->macro_rep : NULL,
2037 .ofs = macro ? first->ofs : 0,
2038 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2039 .ref_cnt = macro ? first->ref_cnt : NULL,
2043 lex_stage_push_last (&src->lookahead, t);
/* The tokens that were merged are no longer needed in 'merge'. */
2045 for (int i = 0; i < retval; i++)
2046 lex_stage_pop_first (&src->merge);
/* Appends a newly allocated T_ENDCMD token to SRC's 'lookahead' stage, which
   must be empty on entry (asserted).  All of the token's other members are
   zero. */
2053 lex_source_push_endcmd__ (struct lex_source *src)
2055 assert (lex_stage_is_empty (&src->lookahead));
2056 struct lex_token *token = xmalloc (sizeof *token);
2057 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2058 lex_stage_push_last (&src->lookahead, token);
/* Allocates and initializes a new lex_source.  Its segmenter is set up for
   READER's syntax mode, and its 'lookahead' stage is seeded with a single
   T_ENDCMD token.  (The other members set in the initializer, and the use
   made of LEXER, are not visible in this excerpt.) */
2061 static struct lex_source *
2062 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2064 struct lex_source *src = xmalloc (sizeof *src);
2065 *src = (struct lex_source) {
2067 .segmenter = segmenter_init (reader->syntax, false),
2071 lex_source_push_endcmd__ (src);
/* Tears down SRC: runs the reader's 'destroy' hook if the reader class has
   one, releases the three token stages, and unlinks SRC from the list it is
   on.  (Freeing of the buffer and of SRC itself is not visible in this
   excerpt.) */
2077 lex_source_destroy (struct lex_source *src)
/* These pointers are copied out before the reader's 'destroy' hook runs,
   since the reader may no longer be usable afterward.  (The statements that
   use them are not visible here; presumably the strings are freed --
   confirm.) */
2079 char *file_name = src->reader->file_name;
2080 char *encoding = src->reader->encoding;
2081 if (src->reader->class->destroy != NULL)
2082 src->reader->class->destroy (src->reader);
2086 lex_stage_uninit (&src->pp);
2087 lex_stage_uninit (&src->merge);
2088 lex_stage_uninit (&src->lookahead);
2089 ll_remove (&src->ll);
/* A lex_reader that takes its input from a u8_istream.  The generic reader
   is embedded as member 'reader', which is what lets lex_file_reader_cast()
   recover the enclosing struct from a pointer to that member. */
2093 struct lex_file_reader
2095 struct lex_reader reader;
2096 struct u8_istream *istream;
/* Declared ahead of its definition so that lex_reader_for_file() can refer to
   it. */
2099 static struct lex_reader_class lex_file_reader_class;
2101 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2102 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2103 ENCODING, which should take one of the forms accepted by
2104 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2105 mode of the new reader, respectively.
2107 Returns a null pointer if FILE_NAME cannot be opened. */
2109 lex_reader_for_file (const char *file_name, const char *encoding,
2110 enum segmenter_mode syntax,
2111 enum lex_error_mode error)
2113 struct lex_file_reader *r;
2114 struct u8_istream *istream;
/* "-" selects the already-open standard input descriptor; any other name is
   opened read-only. */
2116 istream = (!strcmp(file_name, "-")
2117 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2118 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2119 if (istream == NULL)
/* Report why the stream could not be set up, using errno. */
2121 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2125 r = xmalloc (sizeof *r);
2126 lex_reader_init (&r->reader, &lex_file_reader_class);
2127 r->reader.syntax = syntax;
2128 r->reader.error = error;
/* The reader keeps its own copies of the file name and of the encoding (when
   one was given), so the caller's strings need not outlive this call. */
2129 r->reader.file_name = xstrdup (file_name);
2130 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* Line numbering for a file starts at 1. */
2131 r->reader.line_number = 1;
2132 r->istream = istream;
/* Converts R, which must point to the 'reader' member of a struct
   lex_file_reader, into a pointer to that enclosing struct. */
2137 static struct lex_file_reader *
2138 lex_file_reader_cast (struct lex_reader *r)
2140 return UP_CAST (r, struct lex_file_reader, reader);
/* Reads up to N bytes from R_'s input stream into BUF by way of
   u8_istream_read().  PROMPT_STYLE is not used.

   A read error is reported with a message naming the file and quoting errno.
   (The test that detects the error and the values returned are not visible
   in this excerpt.) */
2144 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2145 enum prompt_style prompt_style UNUSED)
2147 struct lex_file_reader *r = lex_file_reader_cast (r_);
2148 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2151 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Releases the input stream that belongs to R_.  A stream on any descriptor
   other than standard input is closed with u8_istream_close(), and a failure
   to close is reported.  (Freeing of R_ itself is not visible in this
   excerpt.) */
2158 lex_file_close (struct lex_reader *r_)
2160 struct lex_file_reader *r = lex_file_reader_cast (r_);
2162 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2164 if (u8_istream_close (r->istream) != 0)
2165 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
/* Presumably the 'else' case: for standard input the stream object is only
   freed, which leaves the descriptor open.  (The 'else' line itself is not
   visible in this excerpt -- confirm.) */
2168 u8_istream_free (r->istream);
/* The reader class that lex_reader_for_file() passes to lex_reader_init().
   (Its initializer is not visible in this excerpt; presumably it names
   lex_file_read() and lex_file_close() -- confirm.) */
2173 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that takes its input from a string held in memory.  The
   generic reader is embedded as member 'reader', which is what lets
   lex_string_reader_cast() recover the enclosing struct.  (lex_string_read()
   also uses members 's' and 'offset', whose declarations are not visible in
   this excerpt.) */
2179 struct lex_string_reader
2181 struct lex_reader reader;
/* Declared ahead of its definition so that lex_reader_for_substring_nocopy()
   can refer to it. */
2186 static struct lex_reader_class lex_string_reader_class;
2188 /* Creates and returns a new lex_reader for the contents of S, which must be
2189 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2190 with ss_dealloc() when it is closed. */
2192 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2194 struct lex_string_reader *r;
2196 r = xmalloc (sizeof *r);
2197 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always use SEG_MODE_AUTO; there is no parameter for the
   syntax mode. */
2198 r->reader.syntax = SEG_MODE_AUTO;
/* The reader keeps its own copy of ENCODING, when one was given. */
2199 r->reader.encoding = xstrdup_if_nonnull (encoding);
2206 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2207 which must be encoded in ENCODING. The caller retains ownership of S. */
2209 lex_reader_for_string (const char *s, const char *encoding)
2211 struct substring ss;
/* Copy S into SS, then give the copy to the reader, which takes ownership of
   it. */
2212 ss_alloc_substring (&ss, ss_cstr (s));
2213 return lex_reader_for_substring_nocopy (ss, encoding);
2216 /* Formats FORMAT as a printf()-like format string and creates and returns a
2217 new lex_reader for the formatted result. */
2219 lex_reader_for_format (const char *format, const char *encoding, ...)
2221 struct lex_reader *r;
/* The variable arguments follow ENCODING, not FORMAT. */
2224 va_start (args, encoding);
/* The string returned by xvasprintf() is handed to the reader, which takes
   ownership of it. */
2225 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Converts R, which must point to the 'reader' member of a struct
   lex_string_reader, into a pointer to that enclosing struct. */
2231 static struct lex_string_reader *
2232 lex_string_reader_cast (struct lex_reader *r)
2234 return UP_CAST (r, struct lex_string_reader, reader);
/* Copies into BUF the next bytes of R_'s string, starting at the reader's
   current 'offset': at most N bytes, and no more than remain in the string.
   PROMPT_STYLE is not used.  (The update of 'offset' and the value returned
   are not visible in this excerpt; presumably 'offset' advances by the number
   of bytes copied and that number is returned -- confirm.) */
2238 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2239 enum prompt_style prompt_style UNUSED)
2241 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* CHUNK is limited both by the caller's buffer and by what is left to read. */
2244 chunk = MIN (n, r->s.length - r->offset);
2245 memcpy (buf, r->s.string + r->offset, chunk);
/* Close hook for string readers.  (Only the cast is visible in this excerpt.
   The comment on lex_reader_for_substring_nocopy() says that the string is
   released with ss_dealloc() when the reader is closed, so that presumably
   happens here -- confirm.) */
2252 lex_string_close (struct lex_reader *r_)
2254 struct lex_string_reader *r = lex_string_reader_cast (r_);
2260 static struct lex_reader_class lex_string_reader_class =