1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
83 static struct msg_point lex_token_start_point (const struct lex_source *,
84 const struct lex_token *);
85 static struct msg_point lex_token_end_point (const struct lex_source *,
86 const struct lex_token *);
88 static size_t lex_ofs_at_phrase__ (struct lexer *, int ofs, const char *s);
90 /* Source offset of the last byte in TOKEN. */
92 lex_token_end (const struct lex_token *token)
94 return token->token_pos + MAX (token->token_len, 1) - 1;
98 lex_token_destroy (struct lex_token *t)
100 token_uninit (&t->token);
103 assert (*t->ref_cnt > 0);
113 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
118 struct lex_token **tokens;
121 static void lex_stage_clear (struct lex_stage *);
122 static void lex_stage_uninit (struct lex_stage *);
124 static size_t lex_stage_count (const struct lex_stage *);
125 static bool lex_stage_is_empty (const struct lex_stage *);
127 static struct lex_token *lex_stage_first (struct lex_stage *);
128 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
130 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
131 static void lex_stage_pop_first (struct lex_stage *);
133 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
136 /* Deletes all the tokens from STAGE. */
138 lex_stage_clear (struct lex_stage *stage)
140 while (!deque_is_empty (&stage->deque))
141 lex_stage_pop_first (stage);
144 /* Deletes all the tokens from STAGE and frees storage for the deque. */
146 lex_stage_uninit (struct lex_stage *stage)
148 lex_stage_clear (stage);
149 free (stage->tokens);
152 /* Returns true if STAGE contains no tokens, otherwise false. */
154 lex_stage_is_empty (const struct lex_stage *stage)
156 return deque_is_empty (&stage->deque);
159 /* Returns the number of tokens in STAGE. */
161 lex_stage_count (const struct lex_stage *stage)
163 return deque_count (&stage->deque);
166 /* Returns the first token in STAGE, which must be nonempty.
167 The first token is the one accessed with the least lookahead. */
168 static struct lex_token *
169 lex_stage_first (struct lex_stage *stage)
171 return lex_stage_nth (stage, 0);
174 /* Returns the token the given INDEX in STAGE. The first token (with the least
175 lookahead) is 0, the second token is 1, and so on. There must be at least
176 INDEX + 1 tokens in STAGE. */
177 static struct lex_token *
178 lex_stage_nth (struct lex_stage *stage, size_t index)
180 return stage->tokens[deque_back (&stage->deque, index)];
183 /* Adds TOKEN so that it becomes the last token in STAGE. */
185 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
187 if (deque_is_full (&stage->deque))
188 stage->tokens = deque_expand (&stage->deque, stage->tokens,
189 sizeof *stage->tokens);
190 stage->tokens[deque_push_front (&stage->deque)] = token;
193 /* Removes and returns the first token from STAGE. */
194 static struct lex_token *
195 lex_stage_take_first (struct lex_stage *stage)
197 return stage->tokens[deque_pop_back (&stage->deque)];
200 /* Removes the first token from STAGE and uninitializes it. */
202 lex_stage_pop_first (struct lex_stage *stage)
204 lex_token_destroy (lex_stage_take_first (stage));
207 /* Removes the first N tokens from SRC, appending them to DST as the last
210 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
212 for (size_t i = 0; i < n; i++)
213 lex_stage_push_last (dst, lex_stage_take_first (src));
216 /* A source of tokens, corresponding to a syntax file.
218 This is conceptually a lex_reader wrapped with everything needed to convert
219 its UTF-8 bytes into tokens. */
222 struct ll ll; /* In lexer's list of sources. */
226 - One for struct lexer.
228 - One for each struct msg_location that references this source. */
231 struct lex_reader *reader;
233 struct segmenter segmenter;
234 bool eof; /* True if T_STOP was read from 'reader'. */
236 /* Buffer of UTF-8 bytes. */
237 char *buffer; /* Source file contents. */
238 size_t length; /* Number of bytes filled. */
239 size_t allocated; /* Number of bytes allocated. */
241 /* Offsets into 'buffer'. */
242 size_t journal_pos; /* First byte not yet output to journal. */
243 size_t seg_pos; /* First byte not yet scanned as token. */
245 /* Offset into 'buffer' of starts of lines. */
247 size_t n_lines, allocated_lines;
249 bool suppress_next_newline;
253 This is a pipeline with the following stages. Each token eventually
254 made available to the parser passes through of these stages. The stages
255 are named after the processing that happens in each one.
257 Initially, tokens come from the segmenter and scanner to 'pp':
259 - pp: Tokens that need to pass through the macro preprocessor to end up
262 - merge: Tokens that need to pass through scan_merge() to end up in
265 - parse: Tokens available to the client for parsing.
267 'pp' and 'merge' store tokens only temporarily until they pass into
268 'parse'. Tokens then live in 'parse' until the command is fully
269 consumed, at which time they are freed together. */
271 struct lex_stage merge;
272 struct lex_token **parse;
273 size_t n_parse, allocated_parse, parse_ofs;
276 static struct lex_source *lex_source_create (struct lexer *,
277 struct lex_reader *);
282 struct ll_list sources; /* Contains "struct lex_source"s. */
283 struct macro_set *macros;
286 static struct lex_source *lex_source__ (const struct lexer *);
287 static char *lex_source_syntax__ (const struct lex_source *,
289 static const struct lex_token *lex_next__ (const struct lexer *, int n);
290 static void lex_source_push_endcmd__ (struct lex_source *);
291 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
292 static void lex_source_clear_parse (struct lex_source *);
294 static bool lex_source_get_parse (struct lex_source *);
295 static void lex_source_msg_valist (struct lex_source *, enum msg_class,
297 const char *format, va_list)
298 PRINTF_FORMAT (5, 0);
299 static const struct lex_token *lex_source_next__ (const struct lex_source *,
302 /* Initializes READER with the specified CLASS and otherwise some reasonable
303 defaults. The caller should fill in the others members as desired. */
305 lex_reader_init (struct lex_reader *reader,
306 const struct lex_reader_class *class)
308 reader->class = class;
309 reader->syntax = SEG_MODE_AUTO;
310 reader->error = LEX_ERROR_CONTINUE;
311 reader->file_name = NULL;
312 reader->encoding = NULL;
313 reader->line_number = 0;
317 /* Frees any file name already in READER and replaces it by a copy of
318 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
320 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
322 free (reader->file_name);
323 reader->file_name = xstrdup_if_nonnull (file_name);
326 /* Creates and returns a new lexer. */
330 struct lexer *lexer = xmalloc (sizeof *lexer);
331 *lexer = (struct lexer) {
332 .sources = LL_INITIALIZER (lexer->sources),
333 .macros = macro_set_create (),
338 /* Destroys LEXER. */
340 lex_destroy (struct lexer *lexer)
344 struct lex_source *source, *next;
346 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
348 ll_remove (&source->ll);
349 lex_source_unref (source);
351 macro_set_destroy (lexer->macros);
356 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
357 same name. Takes ownership of M. */
359 lex_define_macro (struct lexer *lexer, struct macro *m)
361 macro_set_add (lexer->macros, m);
364 /* Inserts READER into LEXER so that the next token read by LEXER comes from
365 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
368 lex_include (struct lexer *lexer, struct lex_reader *reader)
370 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
371 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
374 /* Appends READER to LEXER, so that it will be read after all other current
375 readers have already been read. */
377 lex_append (struct lexer *lexer, struct lex_reader *reader)
379 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
384 /* Advances LEXER to the next token, consuming the current token. */
386 lex_get (struct lexer *lexer)
388 struct lex_source *src;
390 src = lex_source__ (lexer);
394 if (src->parse_ofs < src->n_parse)
396 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
397 lex_source_clear_parse (src);
402 while (src->parse_ofs == src->n_parse)
403 if (!lex_source_get_parse (src))
405 ll_remove (&src->ll);
406 lex_source_unref (src);
407 src = lex_source__ (lexer);
413 /* Advances LEXER by N tokens. */
415 lex_get_n (struct lexer *lexer, size_t n)
421 /* Issuing errors. */
423 /* Prints a syntax error message containing the current token and
424 given message MESSAGE (if non-null). */
426 lex_error (struct lexer *lexer, const char *format, ...)
430 va_start (args, format);
431 lex_ofs_msg_valist (lexer, SE, lex_ofs (lexer), lex_ofs (lexer),
436 /* Prints a syntax error message for the span of tokens N0 through N1,
437 inclusive, from the current token in LEXER, adding message MESSAGE (if
440 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
444 va_start (args, format);
445 int ofs = lex_ofs (lexer);
446 lex_ofs_msg_valist (lexer, SE, n0 + ofs, n1 + ofs, format, args);
450 /* Prints a syntax error message for the span of tokens with offsets OFS0
451 through OFS1, inclusive, within the current command in LEXER, adding message
452 MESSAGE (if non-null). */
454 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
458 va_start (args, format);
459 lex_ofs_msg_valist (lexer, SE, ofs0, ofs1, format, args);
463 /* Prints a message of the given CLASS containing the current token and given
464 message MESSAGE (if non-null). */
466 lex_msg (struct lexer *lexer, enum msg_class class, const char *format, ...)
470 va_start (args, format);
471 lex_ofs_msg_valist (lexer, class, lex_ofs (lexer), lex_ofs (lexer),
476 /* Prints a syntax error message for the span of tokens N0 through N1,
477 inclusive, from the current token in LEXER, adding message MESSAGE (if
480 lex_next_msg (struct lexer *lexer, enum msg_class class, int n0, int n1,
481 const char *format, ...)
485 va_start (args, format);
486 int ofs = lex_ofs (lexer);
487 lex_ofs_msg_valist (lexer, class, n0 + ofs, n1 + ofs, format, args);
491 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
492 through OFS1, inclusive, within the current command in LEXER, adding message
493 MESSAGE (if non-null). */
495 lex_ofs_msg (struct lexer *lexer, enum msg_class class, int ofs0, int ofs1,
496 const char *format, ...)
500 va_start (args, format);
501 lex_ofs_msg_valist (lexer, class, ofs0, ofs1, format, args);
505 /* Prints a syntax error message saying that one of the strings provided as
506 varargs, up to the first NULL, is expected. */
508 (lex_error_expecting) (struct lexer *lexer, ...)
512 va_start (args, lexer);
513 lex_error_expecting_valist (lexer, args);
517 /* Prints a syntax error message saying that one of the options provided in
518 ARGS, up to the first NULL, is expected. */
520 lex_error_expecting_valist (struct lexer *lexer, va_list args)
522 const char **options = NULL;
523 size_t allocated = 0;
528 const char *option = va_arg (args, const char *);
533 options = x2nrealloc (options, &allocated, sizeof *options);
534 options[n++] = option;
536 lex_error_expecting_array (lexer, options, n);
/* Reports a syntax error listing the N strings in OPTIONS as the expected
   alternatives.  Small N gets a dedicated message so translators see complete
   sentences; larger N falls through to a generic comma-joined list.

   NOTE(review): the embedded original line numbers jump here — the extraction
   has elided the switch/case scaffolding and parts of some format strings.
   Consult the complete file before editing this function. */
541 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
546 lex_error (lexer, NULL);
550 lex_error (lexer, _("Syntax error expecting %s."), options[0]);
554 lex_error (lexer, _("Syntax error expecting %s or %s."),
555 options[0], options[1]);
559 lex_error (lexer, _("Syntax error expecting %s, %s, or %s."),
560 options[0], options[1], options[2]);
564 lex_error (lexer, _("Syntax error expecting %s, %s, %s, or %s."),
565 options[0], options[1], options[2], options[3]);
569 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, or %s."),
570 options[0], options[1], options[2], options[3], options[4]);
574 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, or %s."),
575 options[0], options[1], options[2], options[3], options[4],
580 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, "
582 options[0], options[1], options[2], options[3], options[4],
583 options[5], options[6]);
587 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, %s, "
589 options[0], options[1], options[2], options[3], options[4],
590 options[5], options[6], options[7]);
/* Fallback for more than eight options: build a comma-separated list. */
595 struct string s = DS_EMPTY_INITIALIZER;
596 for (size_t i = 0; i < n; i++)
599 ds_put_cstr (&s, ", ");
600 ds_put_cstr (&s, options[i]);
602 lex_error (lexer, _("Syntax error expecting one of the following: %s."),
610 /* Reports an error to the effect that subcommand SBC may only be specified
613 lex_sbc_only_once (struct lexer *lexer, const char *sbc)
615 int ofs = lex_ofs (lexer) - 1;
616 if (lex_ofs_token (lexer, ofs)->type == T_EQUALS)
619 /* lex_ofs_at_phrase__() handles subcommand names that are keywords, such as
621 if (lex_ofs_at_phrase__ (lexer, ofs, sbc))
622 lex_ofs_error (lexer, ofs, ofs,
623 _("Subcommand %s may only be specified once."), sbc);
625 msg (SE, _("Subcommand %s may only be specified once."), sbc);
628 /* Reports an error to the effect that subcommand SBC is missing.
630 This function does not take a lexer as an argument or use lex_error(),
631 because a missing subcommand can normally be detected only after the whole
632 command has been parsed, and so lex_error() would always report "Syntax
633 error at end of command", which does not help the user find the error. */
635 lex_sbc_missing (struct lexer *lexer, const char *sbc)
637 lex_ofs_error (lexer, 0, lex_max_ofs (lexer),
638 _("Required subcommand %s was not specified."), sbc);
641 /* Reports an error to the effect that specification SPEC may only be specified
642 once within subcommand SBC. */
644 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
646 lex_error (lexer, _("%s may only be specified once within subcommand %s."),
650 /* Reports an error to the effect that specification SPEC is missing within
653 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
655 lex_error (lexer, _("Required %s specification missing from %s subcommand."),
659 /* Prints a syntax error message for the span of tokens with offsets OFS0
660 through OFS1, inclusive, within the current command in LEXER, adding message
661 MESSAGE (if non-null) with the given ARGS. */
663 lex_ofs_msg_valist (struct lexer *lexer, enum msg_class class,
664 int ofs0, int ofs1, const char *format, va_list args)
666 lex_source_msg_valist (lex_source__ (lexer), class, ofs0, ofs1, format, args);
669 /* Checks that we're at end of command.
670 If so, returns a successful command completion code.
671 If not, flags a syntax error and returns an error command
674 lex_end_of_command (struct lexer *lexer)
676 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
678 lex_error (lexer, _("Syntax error expecting end of command."));
685 /* Token testing functions. */
/* Returns true if the current token is a number. */
bool
lex_is_number (const struct lexer *lexer)
{
  return lex_next_is_number (lexer, 0);
}

/* Returns true if the current token is a string. */
bool
lex_is_string (const struct lexer *lexer)
{
  return lex_next_is_string (lexer, 0);
}

/* Returns the value of the current token, which must be a
   floating point number. */
double
lex_number (const struct lexer *lexer)
{
  return lex_next_number (lexer, 0);
}

/* Returns true iff the current token is an integer. */
bool
lex_is_integer (const struct lexer *lexer)
{
  return lex_next_is_integer (lexer, 0);
}

/* Returns the value of the current token, which must be an
   integer. */
long
lex_integer (const struct lexer *lexer)
{
  return lex_next_integer (lexer, 0);
}
/* Token testing functions with lookahead.

   A value of 0 for N as an argument to any of these functions refers to the
   current token.  Lookahead is limited to the current command.  Any N greater
   than the number of tokens remaining in the current command will be treated
   as referring to a T_ENDCMD token. */

/* Returns true if the token N ahead of the current token is a number. */
bool
lex_next_is_number (const struct lexer *lexer, int n)
{
  return token_is_number (lex_next (lexer, n));
}

/* Returns true if the token N ahead of the current token is a string. */
bool
lex_next_is_string (const struct lexer *lexer, int n)
{
  return token_is_string (lex_next (lexer, n));
}

/* Returns the value of the token N ahead of the current token, which must be
   a floating point number. */
double
lex_next_number (const struct lexer *lexer, int n)
{
  return token_number (lex_next (lexer, n));
}

/* Returns true if the token N ahead of the current token is an integer. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  return token_is_integer (lex_next (lexer, n));
}

/* Returns the value of the token N ahead of the current token, which must be
   an integer. */
long
lex_next_integer (const struct lexer *lexer, int n)
{
  return token_integer (lex_next (lexer, n));
}
768 /* Token matching functions. */
770 /* If the current token has the specified TYPE, skips it and returns true.
771 Otherwise, returns false. */
773 lex_match (struct lexer *lexer, enum token_type type)
775 if (lex_token (lexer) == type)
784 /* If the current token matches IDENTIFIER, skips it and returns true.
785 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
788 IDENTIFIER must be an ASCII string. */
790 lex_match_id (struct lexer *lexer, const char *identifier)
792 return lex_match_id_n (lexer, identifier, 3);
795 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
796 may be abbreviated to its first N letters. Otherwise, returns false.
798 IDENTIFIER must be an ASCII string. */
800 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
802 if (lex_token (lexer) == T_ID
803 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
812 /* If the current token is integer X, skips it and returns true. Otherwise,
815 lex_match_int (struct lexer *lexer, int x)
817 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
826 /* Forced matches. */
828 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
829 abbreviated to its first 3 letters. Otherwise, reports an error and returns
832 IDENTIFIER must be an ASCII string. */
834 lex_force_match_id (struct lexer *lexer, const char *identifier)
836 if (lex_match_id (lexer, identifier))
840 lex_error_expecting (lexer, identifier);
845 /* If the current token has the specified TYPE, skips it and returns true.
846 Otherwise, reports an error and returns false. */
848 lex_force_match (struct lexer *lexer, enum token_type type)
850 if (lex_token (lexer) == type)
857 const char *type_string = token_type_to_string (type);
860 char *s = xasprintf ("`%s'", type_string);
861 lex_error_expecting (lexer, s);
865 lex_error_expecting (lexer, token_type_to_name (type));
871 /* If the current token is a string, does nothing and returns true.
872 Otherwise, reports an error and returns false. */
874 lex_force_string (struct lexer *lexer)
876 if (lex_is_string (lexer))
880 lex_error (lexer, _("Syntax error expecting string."));
885 /* If the current token is a string or an identifier, does nothing and returns
886 true. Otherwise, reports an error and returns false.
888 This is meant for use in syntactic situations where we want to encourage the
889 user to supply a quoted string, but for compatibility we also accept
890 identifiers. (One example of such a situation is file names.) Therefore,
891 the error message issued when the current token is wrong only says that a
892 string is expected and doesn't mention that an identifier would also be
895 lex_force_string_or_id (struct lexer *lexer)
897 return lex_token (lexer) == T_ID || lex_force_string (lexer);
900 /* If the current token is an integer, does nothing and returns true.
901 Otherwise, reports an error and returns false. */
903 lex_force_int (struct lexer *lexer)
905 if (lex_is_integer (lexer))
909 lex_error (lexer, _("Syntax error expecting integer."));
914 /* If the current token is an integer in the range MIN...MAX (inclusive), does
915 nothing and returns true. Otherwise, reports an error and returns false.
916 If NAME is nonnull, then it is used in the error message. */
/* NOTE(review): the embedded original line numbers jump throughout this
   function — the extraction has elided braces, 'return' statements, and parts
   of several format strings.  Consult the complete file before editing. */
918 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
920 bool is_number = lex_is_number (lexer);
921 bool is_integer = lex_is_integer (lexer);
/* Range checks work on the numeric value even for non-integer numbers, so the
   diagnostics below can distinguish "not an integer" from "out of range". */
922 bool too_small = (is_integer ? lex_integer (lexer) < min
923 : is_number ? lex_number (lexer) < min
925 bool too_big = (is_integer ? lex_integer (lexer) > max
926 : is_number ? lex_number (lexer) > max
928 if (is_integer && !too_small && !too_big)
933 /* Weird, maybe a bug in the caller. Just report that we needed an
936 lex_error (lexer, _("Syntax error expecting integer for %s."), name);
938 lex_error (lexer, _("Syntax error expecting integer."));
943 lex_error (lexer, _("Syntax error expecting %ld for %s."), min, name);
945 lex_error (lexer, _("Syntax error expecting %ld."), min);
947 else if (min + 1 == max)
950 lex_error (lexer, _("Syntax error expecting %ld or %ld for %s."),
953 lex_error (lexer, _("Syntax error expecting %ld or %ld."),
/* Bounds near the extremes of 'long' are presumably treated as unbounded to
   keep messages readable — TODO confirm against the complete file. */
958 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
959 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
961 if (report_lower_bound && report_upper_bound)
965 _("Syntax error expecting integer "
966 "between %ld and %ld for %s."),
969 lex_error (lexer, _("Syntax error expecting integer "
970 "between %ld and %ld."),
973 else if (report_lower_bound)
978 lex_error (lexer, _("Syntax error expecting "
979 "non-negative integer for %s."),
982 lex_error (lexer, _("Syntax error expecting "
983 "non-negative integer."));
988 lex_error (lexer, _("Syntax error expecting "
989 "positive integer for %s."),
992 lex_error (lexer, _("Syntax error expecting "
993 "positive integer."));
998 lex_error (lexer, _("Syntax error expecting "
999 "integer %ld or greater for %s."),
1002 lex_error (lexer, _("Syntax error expecting "
1003 "integer %ld or greater."), min);
1006 else if (report_upper_bound)
1010 _("Syntax error expecting integer less than or equal "
1014 lex_error (lexer, _("Syntax error expecting integer less than or "
1021 lex_error (lexer, _("Syntax error expecting integer for %s."),
1024 lex_error (lexer, _("Syntax error expecting integer."));
/* If the current token is a number, does nothing and returns true.
   Otherwise, reports an error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  if (lex_is_number (lexer))
    return true;

  lex_error (lexer, _("Syntax error expecting number."));
  return false;
}
1042 /* If the current token is a number in the closed range [MIN,MAX], does
1043 nothing and returns true. Otherwise, reports an error and returns false.
1044 If NAME is nonnull, then it is used in the error message. */
/* NOTE(review): lines have been elided by extraction here (embedded line
   numbers jump); the error-reporting branches below are incomplete as shown.
   Consult the complete file before editing. */
1046 lex_force_num_range_closed (struct lexer *lexer, const char *name,
1047 double min, double max)
1049 bool is_number = lex_is_number (lexer);
1050 bool too_small = is_number && lex_number (lexer) < min;
1051 bool too_big = is_number && lex_number (lexer) > max;
1052 if (is_number && !too_small && !too_big)
1057 /* Weird, maybe a bug in the caller. Just report that we needed an
1060 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1062 lex_error (lexer, _("Syntax error expecting number."));
1064 else if (min == max)
1067 lex_error (lexer, _("Syntax error expecting number %g for %s."),
1070 lex_error (lexer, _("Syntax error expecting number %g."), min);
/* A bound at +/-DBL_MAX reads as "unbounded" and is omitted from messages
   unless the token actually violated it. */
1074 bool report_lower_bound = min > -DBL_MAX || too_small;
1075 bool report_upper_bound = max < DBL_MAX || too_big;
1077 if (report_lower_bound && report_upper_bound)
1081 _("Syntax error expecting number "
1082 "between %g and %g for %s."),
1085 lex_error (lexer, _("Syntax error expecting number "
1086 "between %g and %g."),
1089 else if (report_lower_bound)
1094 lex_error (lexer, _("Syntax error expecting "
1095 "non-negative number for %s."),
1098 lex_error (lexer, _("Syntax error expecting "
1099 "non-negative number."));
1104 lex_error (lexer, _("Syntax error expecting number "
1105 "%g or greater for %s."),
1108 lex_error (lexer, _("Syntax error expecting number "
1109 "%g or greater."), min);
1112 else if (report_upper_bound)
1116 _("Syntax error expecting number "
1117 "less than or equal to %g for %s."),
1120 lex_error (lexer, _("Syntax error expecting number "
1121 "less than or equal to %g."),
1127 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1129 lex_error (lexer, _("Syntax error expecting number."));
1135 /* If the current token is a number in the half-open range [MIN,MAX), does
1136 nothing and returns true. Otherwise, reports an error and returns false.
1137 If NAME is nonnull, then it is used in the error message. */
/* NOTE(review): lines have been elided by extraction here (embedded line
   numbers jump); the error-reporting branches below are incomplete as shown.
   Consult the complete file before editing. */
1139 lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
1140 double min, double max)
1142 bool is_number = lex_is_number (lexer);
1143 bool too_small = is_number && lex_number (lexer) < min;
/* '>=' (not '>'): the upper bound is excluded in the half-open range. */
1144 bool too_big = is_number && lex_number (lexer) >= max;
1145 if (is_number && !too_small && !too_big)
1150 /* Weird, maybe a bug in the caller. Just report that we needed an
1153 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1155 lex_error (lexer, _("Syntax error expecting number."));
1159 bool report_lower_bound = min > -DBL_MAX || too_small;
1160 bool report_upper_bound = max < DBL_MAX || too_big;
1162 if (report_lower_bound && report_upper_bound)
1165 lex_error (lexer, _("Syntax error expecting number "
1166 "in [%g,%g) for %s."),
1169 lex_error (lexer, _("Syntax error expecting number in [%g,%g)."),
1172 else if (report_lower_bound)
1177 lex_error (lexer, _("Syntax error expecting "
1178 "non-negative number for %s."),
1181 lex_error (lexer, _("Syntax error expecting "
1182 "non-negative number."));
1187 lex_error (lexer, _("Syntax error expecting "
1188 "number %g or greater for %s."),
1191 lex_error (lexer, _("Syntax error expecting "
1192 "number %g or greater."), min);
1195 else if (report_upper_bound)
1199 _("Syntax error expecting "
1200 "number less than %g for %s."), max, name);
1202 lex_error (lexer, _("Syntax error expecting "
1203 "number less than %g."), max);
1208 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1210 lex_error (lexer, _("Syntax error expecting number."));
1216 /* If the current token is a number in the open range (MIN,MAX), does
1217 nothing and returns true. Otherwise, reports an error and returns false.
1218 If NAME is nonnull, then it is used in the error message. */
/* NOTE(review): lines have been elided by extraction here (embedded line
   numbers jump); the error-reporting branches below are incomplete as shown.
   Consult the complete file before editing. */
1220 lex_force_num_range_open (struct lexer *lexer, const char *name,
1221 double min, double max)
1223 bool is_number = lex_is_number (lexer);
/* '<=' / '>=': both endpoints are excluded in the open range. */
1224 bool too_small = is_number && lex_number (lexer) <= min;
1225 bool too_big = is_number && lex_number (lexer) >= max;
1226 if (is_number && !too_small && !too_big)
1231 /* Weird, maybe a bug in the caller. Just report that we needed an
1234 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1236 lex_error (lexer, _("Syntax error expecting number."));
1240 bool report_lower_bound = min > -DBL_MAX || too_small;
1241 bool report_upper_bound = max < DBL_MAX || too_big;
1243 if (report_lower_bound && report_upper_bound)
1246 lex_error (lexer, _("Syntax error expecting number "
1247 "in (%g,%g) for %s."),
1250 lex_error (lexer, _("Syntax error expecting number "
1251 "in (%g,%g)."), min, max);
1253 else if (report_lower_bound)
1258 lex_error (lexer, _("Syntax error expecting "
1259 "positive number for %s."), name);
1261 lex_error (lexer, _("Syntax error expecting "
1262 "positive number."));
1267 lex_error (lexer, _("Syntax error expecting number "
1268 "greater than %g for %s."),
1271 lex_error (lexer, _("Syntax error expecting number "
1272 "greater than %g."), min);
1275 else if (report_upper_bound)
1278 lex_error (lexer, _("Syntax error expecting number "
1279 "less than %g for %s."),
1282 lex_error (lexer, _("Syntax error expecting number "
1283 "less than %g."), max);
1288 lex_error (lexer, _("Syntax error expecting number "
1291 lex_error (lexer, _("Syntax error expecting number."));
1297 /* If the current token is an identifier, does nothing and returns true.
1298 Otherwise, reports an error and returns false. */
1300 lex_force_id (struct lexer *lexer)
1302 if (lex_token (lexer) == T_ID)
1305 lex_error (lexer, _("Syntax error expecting identifier."));
1309 /* Token accessors. */
1311 /* Returns the type of LEXER's current token. */
1313 lex_token (const struct lexer *lexer)
1315 return lex_next_token (lexer, 0);
1318 /* Returns the number in LEXER's current token.
1320 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1321 tokens this function will always return zero. */
1323 lex_tokval (const struct lexer *lexer)
1325 return lex_next_tokval (lexer, 0);
/* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value". */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  return lex_next_tokcstr (lexer, 0);
}
1342 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1343 null-terminated (but the null terminator is not included in the returned
1344 substring's 'length').
1346 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1347 this functions this function will always return NULL.
1349 The UTF-8 encoding of the returned string is correct for variable names and
1350 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1351 data_in() to use it in a "union value". */
1353 lex_tokss (const struct lexer *lexer)
1355 return lex_next_tokss (lexer, 0);
1360 A value of 0 for N as an argument to any of these functions refers to the
1361 current token. Lookahead is limited to the current command. Any N greater
1362 than the number of tokens remaining in the current command will be treated
1363 as referring to a T_ENDCMD token. */
1365 static const struct lex_token *
1366 lex_next__ (const struct lexer *lexer_, int n)
1368 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1369 struct lex_source *src = lex_source__ (lexer);
1372 return lex_source_next__ (src, n);
1375 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1380 static const struct lex_token *
1381 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1383 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1387 static const struct lex_token endcmd_token
1388 = { .token = { .type = T_ENDCMD } };
1389 return &endcmd_token;
1392 while (ofs >= src->n_parse)
1394 if (src->n_parse > 0)
1396 const struct lex_token *t = src->parse[src->n_parse - 1];
1397 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1401 lex_source_get_parse (src);
1404 return src->parse[ofs];
1407 static const struct lex_token *
1408 lex_source_next__ (const struct lex_source *src, int n)
1410 return lex_source_ofs__ (src, n + src->parse_ofs);
1413 /* Returns the "struct token" of the token N after the current one in LEXER.
1414 The returned pointer can be invalidated by pretty much any succeeding call
1415 into the lexer, although the string pointer within the returned token is
1416 only invalidated by consuming the token (e.g. with lex_get()). */
1417 const struct token *
1418 lex_next (const struct lexer *lexer, int n)
1420 return &lex_next__ (lexer, n)->token;
1423 /* Returns the type of the token N after the current one in LEXER. */
1425 lex_next_token (const struct lexer *lexer, int n)
1427 return lex_next (lexer, n)->type;
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  return token_number (lex_next (lexer, n));
}
1440 /* Returns the null-terminated string in the token N after the current one, in
1443 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1444 this functions this function will always return NULL.
1446 The UTF-8 encoding of the returned string is correct for variable names and
1447 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1448 data_in() to use it in a "union value". */
1450 lex_next_tokcstr (const struct lexer *lexer, int n)
1452 return lex_next_tokss (lexer, n).string;
1455 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1456 The string is null-terminated (but the null terminator is not included in
1457 the returned substring's 'length').
1459 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1460 tokens this functions this function will always return NULL.
1462 The UTF-8 encoding of the returned string is correct for variable names and
1463 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1464 data_in() to use it in a "union value". */
1466 lex_next_tokss (const struct lexer *lexer, int n)
1468 return lex_next (lexer, n)->string;
1471 /* Returns the offset of the current token within the command being parsed in
1472 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1473 on. The return value is useful later for referring to this token in calls
1476 lex_ofs (const struct lexer *lexer)
1478 struct lex_source *src = lex_source__ (lexer);
1479 return src ? src->parse_ofs : 0;
1482 /* Returns the offset of the last token in the current command. */
1484 lex_max_ofs (const struct lexer *lexer)
1486 struct lex_source *src = lex_source__ (lexer);
1490 int ofs = MAX (1, src->n_parse) - 1;
1493 enum token_type type = lex_source_ofs__ (src, ofs)->token.type;
1494 if (type == T_ENDCMD || type == T_STOP)
1501 /* Returns the token within LEXER's current command with offset OFS. Use
1502 lex_ofs() to find out the offset of the current token. */
1503 const struct token *
1504 lex_ofs_token (const struct lexer *lexer_, int ofs)
1506 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1507 struct lex_source *src = lex_source__ (lexer);
1510 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1513 static const struct token stop_token = { .type = T_STOP };
/* Allocates and returns a new struct msg_location that spans tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in
   LEXER.  See lex_ofs() for an explanation of token offsets.

   The caller owns and must eventually free the returned object. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() takes offsets relative to the current token. */
  int ofs = lex_ofs (lexer);
  return lex_get_location (lexer, ofs0 - ofs, ofs1 - ofs);
}
1530 /* Returns a msg_point for the first character in the token with offset OFS,
1531 where offset 0 is the first token in the command currently being parsed, 1
1532 the second token, and so on. These are absolute offsets, not relative to
1533 the token currently being parsed within the command.
1535 Returns zeros for a T_STOP token.
1538 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1540 const struct lex_source *src = lex_source__ (lexer);
1542 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1543 : (struct msg_point) { 0, 0 });
1546 /* Returns a msg_point for the last character, inclusive, in the token with
1547 offset OFS, where offset 0 is the first token in the command currently being
1548 parsed, 1 the second token, and so on. These are absolute offsets, not
1549 relative to the token currently being parsed within the command.
1551 Returns zeros for a T_STOP token.
1553 Most of the time, a single token is wholly within a single line of syntax,
1554 so that the start and end point for a given offset have the same line
1555 number. There are two exceptions: a T_STRING token can be made up of
1556 multiple segments on adjacent lines connected with "+" punctuators, and a
1557 T_NEG_NUM token can consist of a "-" on one line followed by the number on
1561 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1563 const struct lex_source *src = lex_source__ (lexer);
1565 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1566 : (struct msg_point) { 0, 0 });
1569 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1570 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1571 are both zero, this requests the syntax for the current token.)
1573 The caller must eventually free the returned string (with free()). The
1574 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1575 that, for example, it may include comments, spaces, and new-lines if it
1576 spans multiple tokens. Macro expansion, however, has already been
1579 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1581 const struct lex_source *src = lex_source__ (lexer);
1583 ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
/* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
   inclusive.  (For example, if OFS0 and OFS1 are both zero, this requests the
   syntax for the first token in the current command.)

   The caller must eventually free the returned string (with free()).  The
   syntax is encoded in UTF-8 and in the original form supplied to the lexer so
   that, for example, it may include comments, spaces, and new-lines if it
   spans multiple tokens.  Macro expansion, however, has already been
   performed. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src ? lex_source_syntax__ (src, ofs0, ofs1) : xstrdup ("");
}
1604 /* Returns true if the token N ahead of the current one was produced by macro
1605 expansion, false otherwise. */
1607 lex_next_is_from_macro (const struct lexer *lexer, int n)
1609 return lex_next__ (lexer, n)->macro_rep != NULL;
1613 lex_tokens_match (const struct token *actual, const struct token *expected)
1615 if (actual->type != expected->type)
1618 switch (actual->type)
1622 return actual->number == expected->number;
1625 return lex_id_match (expected->string, actual->string);
1628 return (actual->string.length == expected->string.length
1629 && !memcmp (actual->string.string, expected->string.string,
1630 actual->string.length));
1638 lex_ofs_at_phrase__ (struct lexer *lexer, int ofs, const char *s)
1640 struct string_lexer slex;
1644 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1645 while (string_lexer_next (&slex, &token))
1647 bool match = lex_tokens_match (lex_ofs_token (lexer, ofs + i++), &token);
1648 token_uninit (&token);
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   returns true.  Otherwise, returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  return lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s) > 0;
}
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips it and returns true.  Otherwise, returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s);
  if (n > 0)
    lex_get_n (lexer, n);
  return n > 0;
}
1682 /* Returns the 1-based line number of the source text at the byte OFFSET in
1685 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1688 size_t hi = src->n_lines;
1691 size_t mid = (lo + hi) / 2;
1692 if (mid + 1 >= src->n_lines)
1693 return src->n_lines;
1694 else if (offset >= src->lines[mid + 1])
1696 else if (offset < src->lines[mid])
1703 /* Returns the 1-based column number of the source text at the byte OFFSET in
1706 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1708 const char *newline = memrchr (src->buffer, '\n', offset);
1709 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1710 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1713 static struct msg_point
1714 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1716 return (struct msg_point) {
1717 .line = lex_source_ofs_to_line_number (src, offset),
1718 .column = lex_source_ofs_to_column_number (src, offset),
1722 static struct msg_point
1723 lex_token_start_point (const struct lex_source *src,
1724 const struct lex_token *token)
1726 return lex_source_ofs_to_point__ (src, token->token_pos);
1729 static struct msg_point
1730 lex_token_end_point (const struct lex_source *src,
1731 const struct lex_token *token)
1733 return lex_source_ofs_to_point__ (src, lex_token_end (token));
1736 static struct msg_location
1737 lex_token_location (const struct lex_source *src,
1738 const struct lex_token *t0,
1739 const struct lex_token *t1)
1741 return (struct msg_location) {
1742 .file_name = intern_new_if_nonnull (src->reader->file_name),
1743 .start = lex_token_start_point (src, t0),
1744 .end = lex_token_end_point (src, t1),
1745 .src = CONST_CAST (struct lex_source *, src),
1749 static struct msg_location *
1750 lex_token_location_rw (const struct lex_source *src,
1751 const struct lex_token *t0,
1752 const struct lex_token *t1)
1754 struct msg_location location = lex_token_location (src, t0, t1);
1755 return msg_location_dup (&location);
/* Returns a heap-allocated msg_location spanning the tokens with absolute
   offsets OFS0 through OFS1 in SRC's current command. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
{
  return lex_token_location_rw (src,
                                lex_source_ofs__ (src, ofs0),
                                lex_source_ofs__ (src, ofs1));
}
1766 /* Returns the name of the syntax file from which the current command is drawn.
1767 Returns NULL for a T_STOP token or if the command's source does not have
1770 There is no version of this function that takes an N argument because
1771 lookahead only works to the end of a command and any given command is always
1772 within a single syntax file. */
1774 lex_get_file_name (const struct lexer *lexer)
1776 struct lex_source *src = lex_source__ (lexer);
1777 return src == NULL ? NULL : src->reader->file_name;
1780 /* Returns a newly allocated msg_location for the syntax that represents tokens
1781 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1782 must eventually free the location (with msg_location_destroy()). */
1783 struct msg_location *
1784 lex_get_location (const struct lexer *lexer, int n0, int n1)
1786 struct msg_location *loc = xmalloc (sizeof *loc);
1787 *loc = (struct msg_location) {
1788 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1789 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1790 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1791 .src = lex_source__ (lexer),
1793 lex_source_ref (loc->src);
1798 lex_get_encoding (const struct lexer *lexer)
1800 struct lex_source *src = lex_source__ (lexer);
1801 return src == NULL ? NULL : src->reader->encoding;
1804 /* Returns the syntax mode for the syntax file from which the current drawn is
1805 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1806 does not have line numbers.
1808 There is no version of this function that takes an N argument because
1809 lookahead only works to the end of a command and any given command is always
1810 within a single syntax file. */
1812 lex_get_syntax_mode (const struct lexer *lexer)
1814 struct lex_source *src = lex_source__ (lexer);
1815 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1818 /* Returns the error mode for the syntax file from which the current drawn is
1819 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1820 source does not have line numbers.
1822 There is no version of this function that takes an N argument because
1823 lookahead only works to the end of a command and any given command is always
1824 within a single syntax file. */
1826 lex_get_error_mode (const struct lexer *lexer)
1828 struct lex_source *src = lex_source__ (lexer);
1829 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1832 /* If the source that LEXER is currently reading has error mode
1833 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1834 token to be read comes directly from whatever is next read from the stream.
1836 It makes sense to call this function after encountering an error in a
1837 command entered on the console, because usually the user would prefer not to
1838 have cascading errors. */
1840 lex_interactive_reset (struct lexer *lexer)
1842 struct lex_source *src = lex_source__ (lexer);
1843 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1846 src->journal_pos = src->seg_pos = 0;
1848 src->suppress_next_newline = false;
1849 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1851 lex_stage_clear (&src->pp);
1852 lex_stage_clear (&src->merge);
1853 lex_source_clear_parse (src);
1854 lex_source_push_endcmd__ (src);
1858 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1860 lex_discard_rest_of_command (struct lexer *lexer)
1862 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1866 /* Discards all lookahead tokens in LEXER, then discards all input sources
1867 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1868 runs out of input sources. */
1870 lex_discard_noninteractive (struct lexer *lexer)
1872 struct lex_source *src = lex_source__ (lexer);
1875 if (src->reader->error == LEX_ERROR_IGNORE)
1878 lex_stage_clear (&src->pp);
1879 lex_stage_clear (&src->merge);
1880 lex_source_clear_parse (src);
1882 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1883 src = lex_source__ (lexer))
1885 ll_remove (&src->ll);
1886 lex_source_unref (src);
1892 lex_source_expand__ (struct lex_source *src)
1894 if (src->length >= src->allocated)
1895 src->buffer = x2realloc (src->buffer, &src->allocated);
1899 lex_source_read__ (struct lex_source *src)
1903 lex_source_expand__ (src);
1905 size_t space = src->allocated - src->length;
1906 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1907 size_t n = src->reader->class->read (src->reader,
1908 &src->buffer[src->length],
1910 assert (n <= space);
1915 src->reader->eof = true;
1921 while (!memchr (&src->buffer[src->seg_pos], '\n',
1922 src->length - src->seg_pos));
1925 static struct lex_source *
1926 lex_source__ (const struct lexer *lexer)
1928 return (ll_is_empty (&lexer->sources) ? NULL
1929 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns LEXER's current source, or NULL if there is none.  Public,
   read-only wrapper for lex_source__(). */
const struct lex_source *
lex_source (const struct lexer *lexer)
{
  return lex_source__ (lexer);
}
1938 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1939 OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are
1940 both zero, this requests the syntax for the first token in the current
1941 command.) The caller must eventually free the returned string (with
1942 free()). The syntax is encoded in UTF-8 and in the original form supplied
1943 to the lexer so that, for example, it may include comments, spaces, and
1944 new-lines if it spans multiple tokens. Macro expansion, however, has
1945 already been performed. */
1947 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1949 struct string s = DS_EMPTY_INITIALIZER;
1950 for (size_t i = ofs0; i <= ofs1; )
1952 /* Find [I,J) as the longest sequence of tokens not produced by macro
1953 expansion, or otherwise the longest sequence expanded from a single
1955 const struct lex_token *first = lex_source_ofs__ (src, i);
1957 for (j = i + 1; j <= ofs1; j++)
1959 const struct lex_token *cur = lex_source_ofs__ (src, j);
1960 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1961 || first->macro_rep != cur->macro_rep)
1964 const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1966 /* Now add the syntax for this sequence of tokens to SRC. */
1967 if (!ds_is_empty (&s))
1968 ds_put_byte (&s, ' ');
1969 if (!first->macro_rep)
1971 size_t start = first->token_pos;
1972 size_t end = last->token_pos + last->token_len;
1973 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1977 size_t start = first->ofs;
1978 size_t end = last->ofs + last->len;
1979 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1985 return ds_steal_cstr (&s);
1989 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
1991 for (int i = ofs0; i <= ofs1; i++)
1992 if (lex_source_ofs__ (src, i)->macro_rep)
1997 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1998 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1999 other tokens included in that range. The syntax is encoded in UTF-8 and in
2000 the original form supplied to the lexer so that, for example, it may include
2001 comments, spaces, and new-lines if it spans multiple tokens.
2003 Returns an empty string if the token range doesn't include a macro call.
2005 The caller must not modify or free the returned string. */
2006 static struct substring
2007 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
2009 if (!lex_source_contains_macro_call (src, ofs0, ofs1))
2012 const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
2013 const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
2014 size_t start = token0->token_pos;
2015 size_t end = token1->token_pos + token1->token_len;
2017 return ss_buffer (&src->buffer[start], end - start);
2021 lex_source_msg_valist (struct lex_source *src, enum msg_class class,
2022 int ofs0, int ofs1, const char *format, va_list args)
2024 struct string s = DS_EMPTY_INITIALIZER;
2028 /* Get the macro call(s) that expanded to the syntax that caused the
2031 str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
2034 ds_put_format (&s, _("In syntax expanded from `%s'"), call);
2037 ds_put_cstr (&s, _("At end of input"));
2039 if (!ds_is_empty (&s))
2040 ds_put_cstr (&s, ": ");
2042 ds_put_vformat (&s, format, args);
2044 ds_put_cstr (&s, _("Syntax error."));
2046 if (ds_last (&s) != '.')
2047 ds_put_byte (&s, '.');
2049 struct msg *m = xmalloc (sizeof *m);
2051 .category = msg_class_to_category (class),
2052 .severity = msg_class_to_severity (class),
2053 .location = src ? lex_source_get_location (src, ofs0, ofs1) : NULL,
2054 .text = ds_steal_cstr (&s),
2060 lex_get_error (struct lex_source *src, const struct lex_token *token)
2063 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
2064 syntax, sizeof syntax);
2066 struct string s = DS_EMPTY_INITIALIZER;
2067 ds_put_cstr (&s, token->token.string.string);
2069 struct msg *m = xmalloc (sizeof *m);
2071 .category = MSG_C_SYNTAX,
2072 .severity = MSG_S_ERROR,
2073 .location = lex_token_location_rw (src, token, token),
2074 .text = ds_steal_cstr (&s),
2079 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2080 underlying lex_reader if necessary. Returns true if a new token was added
2081 to SRC's deque, false otherwise. The caller should retry failures unless
2082 SRC's 'eof' marker was set to true indicating that there will be no more
2083 tokens from this source. */
2085 lex_source_try_get_pp (struct lex_source *src)
2087 /* Append a new token to SRC and initialize it. */
2088 struct lex_token *token = xmalloc (sizeof *token);
2089 token->token = (struct token) { .type = T_STOP };
2090 token->macro_rep = NULL;
2091 token->ref_cnt = NULL;
2092 token->token_pos = src->seg_pos;
2094 /* Extract a segment. */
2095 const char *segment;
2096 enum segment_type seg_type;
2100 segment = &src->buffer[src->seg_pos];
2101 seg_len = segmenter_push (&src->segmenter, segment,
2102 src->length - src->seg_pos,
2103 src->reader->eof, &seg_type);
2107 /* The segmenter needs more input to produce a segment. */
2108 assert (!src->reader->eof);
2109 lex_source_read__ (src);
2112 /* Update state based on the segment. */
2113 token->token_len = seg_len;
2114 src->seg_pos += seg_len;
2115 if (seg_type == SEG_NEWLINE)
2117 if (src->n_lines >= src->allocated_lines)
2118 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2119 sizeof *src->lines);
2120 src->lines[src->n_lines++] = src->seg_pos;
2123 /* Get a token from the segment. */
2124 enum tokenize_result result = token_from_segment (
2125 seg_type, ss_buffer (segment, seg_len), &token->token);
2127 /* If we've reached the end of a line, or the end of a command, then pass
2128 the line to the output engine as a syntax text item. */
2129 int n_lines = seg_type == SEG_NEWLINE;
2130 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2133 src->suppress_next_newline = true;
2135 else if (n_lines > 0 && src->suppress_next_newline)
2138 src->suppress_next_newline = false;
2140 for (int i = 0; i < n_lines; i++)
2142 /* Beginning of line. */
2143 const char *line = &src->buffer[src->journal_pos];
2145 /* Calculate line length, including \n or \r\n end-of-line if present.
2147 We use src->length even though that may be beyond what we've actually
2148 converted to tokens. That's because, if we're emitting the line due
2149 to SEG_END_COMMAND, we want to take the whole line through the
2150 newline, not just through the '.'. */
2151 size_t max_len = src->length - src->journal_pos;
2152 const char *newline = memchr (line, '\n', max_len);
2153 size_t line_len = newline ? newline - line + 1 : max_len;
2155 /* Calculate line length excluding end-of-line. */
2156 size_t copy_len = line_len;
2157 if (copy_len > 0 && line[copy_len - 1] == '\n')
2159 if (copy_len > 0 && line[copy_len - 1] == '\r')
2162 /* Submit the line as syntax. */
2163 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2164 xmemdup0 (line, copy_len),
2167 src->journal_pos += line_len;
2172 case TOKENIZE_ERROR:
2173 lex_get_error (src, token);
2175 case TOKENIZE_EMPTY:
2176 lex_token_destroy (token);
2179 case TOKENIZE_TOKEN:
2180 if (token->token.type == T_STOP)
2182 token->token.type = T_ENDCMD;
2185 lex_stage_push_last (&src->pp, token);
2191 /* Attempts to append a new token to SRC. Returns true if successful, false on
2192 failure. On failure, the end of SRC has been reached and no more tokens
2193 will be forthcoming from it.
2195 Does not make the new token available for lookahead yet; the caller must
2196 adjust SRC's 'middle' pointer to do so. */
2198 lex_source_get_pp (struct lex_source *src)
2201 if (lex_source_try_get_pp (src))
2207 lex_source_try_get_merge (const struct lex_source *src_)
2209 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2211 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2214 if (!settings_get_mexpand ())
2216 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2220 /* Now pass tokens one-by-one to the macro expander.
2222 In the common case where there is no macro to expand, the loop is not
2224 struct macro_call *mc;
2225 int n_call = macro_call_create (src->lexer->macros,
2226 &lex_stage_first (&src->pp)->token, &mc);
2227 for (int ofs = 1; !n_call; ofs++)
2229 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2231 /* This should not be reachable because we always get a T_ENDCMD at
2232 the end of an input file (transformed from T_STOP by
2233 lex_source_try_get_pp()) and the macro_expander should always
2234 terminate expansion on T_ENDCMD. */
2238 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2239 const struct macro_token mt = {
2241 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2243 const struct msg_location loc = lex_token_location (src, t, t);
2244 n_call = macro_call_add (mc, &mt, &loc);
2248 /* False alarm: no macro expansion after all. Use first token as
2249 lookahead. We'll retry macro expansion from the second token next
2251 macro_call_destroy (mc);
2252 lex_stage_shift (&src->merge, &src->pp, 1);
2256 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2257 are a macro call. (These are likely to be the only tokens in 'pp'.)
2259 const struct lex_token *c0 = lex_stage_first (&src->pp);
2260 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2261 struct macro_tokens expansion = { .n = 0 };
2262 struct msg_location loc = lex_token_location (src, c0, c1);
2263 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2264 macro_call_destroy (mc);
2266 /* Convert the macro expansion into syntax for possible error messages
2268 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2269 size_t *len = xnmalloc (expansion.n, sizeof *len);
2270 struct string s = DS_EMPTY_INITIALIZER;
2271 macro_tokens_to_syntax (&expansion, &s, ofs, len);
2273 if (settings_get_mprint ())
2274 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2275 _("Macro Expansion")));
2277 /* Append the macro expansion tokens to the lookahead. */
2278 if (expansion.n > 0)
2280 char *macro_rep = ds_steal_cstr (&s);
2281 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2282 *ref_cnt = expansion.n;
2283 for (size_t i = 0; i < expansion.n; i++)
2285 struct lex_token *token = xmalloc (sizeof *token);
2286 *token = (struct lex_token) {
2287 .token = expansion.mts[i].token,
2288 .token_pos = c0->token_pos,
2289 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2290 .macro_rep = macro_rep,
2295 lex_stage_push_last (&src->merge, token);
2297 ss_dealloc (&expansion.mts[i].syntax);
2302 free (expansion.mts);
2306 /* Destroy the tokens for the call. */
2307 for (size_t i = 0; i < n_call; i++)
2308 lex_stage_pop_first (&src->pp);
2310 return expansion.n > 0;
2313 /* Attempts to obtain at least one new token into 'merge' in SRC.
2315 Returns true if successful, false on failure. In the latter case, SRC is
2316 exhausted and 'src->eof' is now true. */
2318 lex_source_get_merge (struct lex_source *src)
2321 if (lex_source_try_get_merge (src))
2326 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2328 Returns true if successful, false on failure. In the latter case, SRC is
2329 exhausted and 'src->eof' is now true. */
2331 lex_source_get_parse (struct lex_source *src)
2333 struct merger m = MERGER_INIT;
2335 for (size_t i = 0; ; i++)
2337 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2339 /* We always get a T_ENDCMD at the end of an input file
2340 (transformed from T_STOP by lex_source_try_get_pp()) and
2341 merger_add() should never return -1 on T_ENDCMD. */
2342 assert (lex_stage_is_empty (&src->merge));
2346 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2350 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2353 else if (retval > 0)
2355 /* Add a token that merges all the tokens together. */
2356 const struct lex_token *first = lex_stage_first (&src->merge);
2357 const struct lex_token *last = lex_stage_nth (&src->merge,
2359 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2360 struct lex_token *t = xmalloc (sizeof *t);
2361 *t = (struct lex_token) {
2363 .token_pos = first->token_pos,
2364 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2366 /* This works well if all the tokens were not expanded from macros,
2367 or if they came from the same macro expansion. It just gives up
2368 in the other (corner) cases. */
2369 .macro_rep = macro ? first->macro_rep : NULL,
2370 .ofs = macro ? first->ofs : 0,
2371 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2372 .ref_cnt = macro ? first->ref_cnt : NULL,
2376 lex_source_push_parse (src, t);
2378 for (int i = 0; i < retval; i++)
2379 lex_stage_pop_first (&src->merge);
2386 lex_source_push_endcmd__ (struct lex_source *src)
2388 assert (src->n_parse == 0);
2390 struct lex_token *token = xmalloc (sizeof *token);
2391 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2392 lex_source_push_parse (src, token);
2396 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2398 if (src->n_parse >= src->allocated_parse)
2399 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2400 sizeof *src->parse);
2401 src->parse[src->n_parse++] = token;
2405 lex_source_clear_parse (struct lex_source *src)
2407 for (size_t i = 0; i < src->n_parse; i++)
2408 lex_token_destroy (src->parse[i]);
2409 src->n_parse = src->parse_ofs = 0;
2412 static struct lex_source *
2413 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2415 size_t allocated_lines = 4;
2416 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2419 struct lex_source *src = xmalloc (sizeof *src);
2420 *src = (struct lex_source) {
2423 .segmenter = segmenter_init (reader->syntax, false),
2427 .allocated_lines = allocated_lines,
2430 lex_source_push_endcmd__ (src);
/* Installs OUTPUT_MSG as the global message handler via msg_set_handler(),
   wiring in the lex_source ref/unref/get_line callbacks so the message
   subsystem can keep syntax sources alive and quote source lines.
   NOTE(review): OUTPUT_MSG's second parameter type is elided here; the cast
   below reinterprets it as (const struct msg *, void *).  Calling a
   function through a differently-typed pointer is only valid if the types
   are compatible -- presumably the aux pointer is passed back unchanged;
   confirm against the declaration in lexer.h. */
2436 lex_set_message_handler (struct lexer *lexer,
2437 void (*output_msg) (const struct msg *,
2440 struct msg_handler msg_handler = {
2441 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2443 .lex_source_ref = lex_source_ref,
2444 .lex_source_unref = lex_source_unref,
2445 .lex_source_get_line = lex_source_get_line,
2447 msg_set_handler (&msg_handler);
/* Takes a new reference to SRC_.  The const is cast away because reference
   counting mutates the object even though the caller treats it as
   read-only.  Requires an existing positive reference count (asserted).
   NOTE(review): the increment/return lines are elided from this listing. */
2451 lex_source_ref (const struct lex_source *src_)
2453 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2456 assert (src->n_refs > 0);
/* Releases one reference to SRC.  When the last reference is dropped,
   destroys the reader (saving its file name and encoding first, since
   destroying the reader may free them) and tears down the pp/merge token
   stages and the parse array.
   NOTE(review): the elided lines presumably include a NULL check on SRC,
   the frees of file_name/encoding/buffer, and free (src) itself -- confirm
   in the full file. */
2463 lex_source_unref (struct lex_source *src)
2468 assert (src->n_refs > 0);
2469 if (--src->n_refs > 0)
2472 char *file_name = src->reader->file_name;
2473 char *encoding = src->reader->encoding;
2474 if (src->reader->class->destroy != NULL)
2475 src->reader->class->destroy (src->reader);
2480 lex_stage_uninit (&src->pp);
2481 lex_stage_uninit (&src->merge);
2482 lex_source_clear_parse (src);
/* A lex_reader backed by a file (or stdin), read through a u8_istream that
   performs the encoding conversion.  The embedded struct lex_reader comes
   first so UP_CAST in lex_file_reader_cast() can recover the container. */
2487 struct lex_file_reader
2489 struct lex_reader reader;
2490 struct u8_istream *istream;
/* Vtable for file-backed readers; defined below. */
2493 static struct lex_reader_class lex_file_reader_class;
2495 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2496 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2497 ENCODING, which should take one of the forms accepted by
2498 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2499 mode of the new reader, respectively.
2501 Returns a null pointer if FILE_NAME cannot be opened. */
/* Opens FILE_NAME ("-" means stdin, read via STDIN_FILENO) with the given
   ENCODING and wraps it in a new file-backed lex_reader configured with
   SYNTAX and ERROR modes.  On open failure, reports via msg() with errno
   and (per the comment above) returns NULL.
   NOTE(review): the error-return and final return lines are elided from
   this listing. */
2503 lex_reader_for_file (const char *file_name, const char *encoding,
2504 enum segmenter_mode syntax,
2505 enum lex_error_mode error)
2507 struct lex_file_reader *r;
2508 struct u8_istream *istream;
/* "-" selects stdin; otherwise open the named file read-only. */
2510 istream = (!strcmp(file_name, "-")
2511 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2512 : u8_istream_for_file (encoding, file_name, O_RDONLY))
2513 if (istream == NULL)
2515 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2519 r = xmalloc (sizeof *r);
2520 lex_reader_init (&r->reader, &lex_file_reader_class);
2521 r->reader.syntax = syntax;
2522 r->reader.error = error;
/* The reader owns copies of the file name and encoding strings. */
2523 r->reader.file_name = xstrdup (file_name);
2524 r->reader.encoding = xstrdup_if_nonnull (encoding);
2525 r->reader.line_number = 1;
2526 r->istream = istream;
/* Converts the embedded lex_reader R back into its containing
   struct lex_file_reader. */
2531 static struct lex_file_reader *
2532 lex_file_reader_cast (struct lex_reader *r)
2534 return UP_CAST (r, struct lex_file_reader, reader);
/* "read" method for file-backed readers: reads up to N bytes into BUF from
   the underlying u8_istream.  The prompt style is irrelevant for files.
   On a read error, reports it via msg() with errno.
   NOTE(review): the lines between the read and the error message (the
   n_read < 0 test) and the return are elided from this listing. */
2538 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2539 enum prompt_style prompt_style UNUSED)
2541 struct lex_file_reader *r = lex_file_reader_cast (r_);
2542 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2545 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* "close" method for file-backed readers.  A real file is closed (with a
   diagnostic if close fails, since that can lose buffered data); stdin is
   only freed, not closed, so the process keeps its descriptor 0. */
2552 lex_file_close (struct lex_reader *r_)
2554 struct lex_file_reader *r = lex_file_reader_cast (r_);
2556 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2558 if (u8_istream_close (r->istream) != 0)
2559 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2562 u8_istream_free (r->istream);
2567 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader backed by an in-memory string.  The embedded lex_reader
   must be first so UP_CAST can recover the container.
   NOTE(review): the remaining fields are elided here; lex_string_read()
   below uses r->s (a struct substring) and r->offset. */
2573 struct lex_string_reader
2575 struct lex_reader reader;
/* Vtable for string-backed readers; defined below. */
2580 static struct lex_reader_class lex_string_reader_class;
2582 /* Creates and returns a new lex_reader for the contents of S, which must be
2583 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2584 with ss_dealloc() when it is closed. */
/* Wraps substring S (ownership transferred; freed on close per the comment
   above) in a new string-backed lex_reader.  Syntax mode is always
   SEG_MODE_AUTO for string input.
   NOTE(review): assignment of S, offset initialization, and the return are
   elided from this listing. */
2586 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2588 struct lex_string_reader *r;
2590 r = xmalloc (sizeof *r);
2591 lex_reader_init (&r->reader, &lex_string_reader_class);
2592 r->reader.syntax = SEG_MODE_AUTO;
2593 r->reader.encoding = xstrdup_if_nonnull (encoding);
2600 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2601 which must be encoded in ENCODING. The caller retains ownership of S. */
/* Copies null-terminated S (caller keeps ownership, per the comment above)
   and delegates to the nocopy variant, which owns the clone. */
2603 lex_reader_for_string (const char *s, const char *encoding)
2605 return lex_reader_for_substring_nocopy (ss_clone (ss_cstr (s)), encoding);
2608 /* Formats FORMAT as a printf()-like format string and creates and returns a
2609 new lex_reader for the formatted result. */
/* printf-style constructor: formats FORMAT with the trailing varargs and
   hands the malloc'd result to the nocopy reader, which takes ownership.
   NOTE(review): the va_end call and return are elided from this listing --
   confirm va_end is present in the full file. */
2611 lex_reader_for_format (const char *format, const char *encoding, ...)
2613 struct lex_reader *r;
2616 va_start (args, encoding);
2617 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Converts the embedded lex_reader R back into its containing
   struct lex_string_reader. */
2623 static struct lex_string_reader *
2624 lex_string_reader_cast (struct lex_reader *r)
2626 return UP_CAST (r, struct lex_string_reader, reader);
/* "read" method for string-backed readers: copies the next chunk of the
   stored substring into BUF, bounded by both N and the bytes remaining
   past r->offset.
   NOTE(review): the declaration of `chunk`, the offset advance, and the
   return are elided from this listing. */
2630 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2631 enum prompt_style prompt_style UNUSED)
2633 struct lex_string_reader *r = lex_string_reader_cast (r_);
2636 chunk = MIN (n, r->s.length - r->offset);
2637 memcpy (buf, r->s.string + r->offset, chunk);
/* "close" method for string-backed readers.
   NOTE(review): the body's cleanup lines (presumably ss_dealloc of r->s
   and free of r, matching the ownership comment above) are elided. */
2644 lex_string_close (struct lex_reader *r_)
2646 struct lex_string_reader *r = lex_string_reader_cast (r_);
2652 static struct lex_reader_class lex_string_reader_class =
/* Returns the text of 1-based LINE within SRC as a substring of SRC's
   buffer (no copy, newline excluded).  Out-of-range LINE is rejected by
   the first check; the elided line presumably returns an empty substring.
   For all but the last recorded line, the end offset is the start of the
   next line; for the last line, the end is found by scanning for the next
   '\n' (or the end of the buffer if none). */
2659 lex_source_get_line (const struct lex_source *src, int line)
2661 if (line < 1 || line > src->n_lines)
2664 size_t ofs = src->lines[line - 1];
2666 if (line < src->n_lines)
2667 end = src->lines[line];
2670 const char *newline = memchr (src->buffer + ofs, '\n', src->length - ofs);
2671 end = newline ? newline - src->buffer : src->length;
2673 return ss_buffer (&src->buffer[ofs], end - ofs);