1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
83 static struct msg_point lex_token_start_point (const struct lex_source *,
84 const struct lex_token *);
85 static struct msg_point lex_token_end_point (const struct lex_source *,
86 const struct lex_token *);
88 static size_t lex_ofs_at_phrase__ (struct lexer *, int ofs, const char *s);
90 /* Source offset of the last byte in TOKEN. */
92 lex_token_end (const struct lex_token *token)
94 return token->token_pos + MAX (token->token_len, 1) - 1;
98 lex_token_destroy (struct lex_token *t)
100 token_uninit (&t->token);
103 assert (*t->ref_cnt > 0);
113 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
118 struct lex_token **tokens;
121 static void lex_stage_clear (struct lex_stage *);
122 static void lex_stage_uninit (struct lex_stage *);
124 static size_t lex_stage_count (const struct lex_stage *);
125 static bool lex_stage_is_empty (const struct lex_stage *);
127 static struct lex_token *lex_stage_first (struct lex_stage *);
128 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
130 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
131 static void lex_stage_pop_first (struct lex_stage *);
133 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
/* Empties STAGE by popping, and thereby destroying, each of its tokens in
   turn.  The array that backs the deque is not freed here; see
   lex_stage_uninit() for that. */
static void
lex_stage_clear (struct lex_stage *stage)
{
  for (;;)
    {
      if (lex_stage_is_empty (stage))
        break;
      lex_stage_pop_first (stage);
    }
}
144 /* Deletes all the tokens from STAGE and frees storage for the deque. */
146 lex_stage_uninit (struct lex_stage *stage)
148 lex_stage_clear (stage);
149 free (stage->tokens);
152 /* Returns true if STAGE contains no tokens, otherwise false. */
154 lex_stage_is_empty (const struct lex_stage *stage)
156 return deque_is_empty (&stage->deque);
159 /* Returns the number of tokens in STAGE. */
161 lex_stage_count (const struct lex_stage *stage)
163 return deque_count (&stage->deque);
166 /* Returns the first token in STAGE, which must be nonempty.
167 The first token is the one accessed with the least lookahead. */
168 static struct lex_token *
169 lex_stage_first (struct lex_stage *stage)
171 return lex_stage_nth (stage, 0);
174 /* Returns the token the given INDEX in STAGE. The first token (with the least
175 lookahead) is 0, the second token is 1, and so on. There must be at least
176 INDEX + 1 tokens in STAGE. */
177 static struct lex_token *
178 lex_stage_nth (struct lex_stage *stage, size_t index)
180 return stage->tokens[deque_back (&stage->deque, index)];
183 /* Adds TOKEN so that it becomes the last token in STAGE. */
185 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
187 if (deque_is_full (&stage->deque))
188 stage->tokens = deque_expand (&stage->deque, stage->tokens,
189 sizeof *stage->tokens);
190 stage->tokens[deque_push_front (&stage->deque)] = token;
193 /* Removes and returns the first token from STAGE. */
194 static struct lex_token *
195 lex_stage_take_first (struct lex_stage *stage)
197 return stage->tokens[deque_pop_back (&stage->deque)];
/* Drops the first token from STAGE: the token is taken out of the deque and
   then destroyed with lex_token_destroy(). */
static void
lex_stage_pop_first (struct lex_stage *stage)
{
  struct lex_token *first = lex_stage_take_first (stage);
  lex_token_destroy (first);
}
/* Moves the first N tokens of SRC to the end of DST, one at a time and in
   order, so that they become the last tokens in DST. */
static void
lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
{
  for (size_t left = n; left > 0; left--)
    {
      struct lex_token *token = lex_stage_take_first (src);
      lex_stage_push_last (dst, token);
    }
}
216 /* A source of tokens, corresponding to a syntax file.
218 This is conceptually a lex_reader wrapped with everything needed to convert
219 its UTF-8 bytes into tokens. */
222 struct ll ll; /* In lexer's list of sources. */
226 - One for struct lexer.
228 - One for each struct msg_location that references this source. */
231 struct lex_reader *reader;
233 struct segmenter segmenter;
234 bool eof; /* True if T_STOP was read from 'reader'. */
236 /* Buffer of UTF-8 bytes. */
237 char *buffer; /* Source file contents. */
238 size_t length; /* Number of bytes filled. */
239 size_t allocated; /* Number of bytes allocated. */
241 /* Offsets into 'buffer'. */
242 size_t journal_pos; /* First byte not yet output to journal. */
243 size_t seg_pos; /* First byte not yet scanned as token. */
245 /* Offset into 'buffer' of starts of lines. */
247 size_t n_lines, allocated_lines;
249 bool suppress_next_newline;
253 This is a pipeline with the following stages. Each token eventually
254 made available to the parser passes through all of these stages. The stages
255 are named after the processing that happens in each one.
257 Initially, tokens come from the segmenter and scanner to 'pp':
259 - pp: Tokens that need to pass through the macro preprocessor to end up
262 - merge: Tokens that need to pass through scan_merge() to end up in
265 - parse: Tokens available to the client for parsing.
267 'pp' and 'merge' store tokens only temporarily until they pass into
268 'parse'. Tokens then live in 'parse' until the command is fully
269 consumed, at which time they are freed together. */
271 struct lex_stage merge;
272 struct lex_token **parse;
273 size_t n_parse, allocated_parse, parse_ofs;
276 static struct lex_source *lex_source_create (struct lexer *,
277 struct lex_reader *);
282 struct ll_list sources; /* Contains "struct lex_source"s. */
283 struct macro_set *macros;
286 static struct lex_source *lex_source__ (const struct lexer *);
287 static char *lex_source_syntax__ (const struct lex_source *,
289 static const struct lex_token *lex_next__ (const struct lexer *, int n);
290 static void lex_source_push_endcmd__ (struct lex_source *);
291 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
292 static void lex_source_clear_parse (struct lex_source *);
294 static bool lex_source_get_parse (struct lex_source *);
295 static void lex_source_msg_valist (struct lex_source *, enum msg_class,
297 const char *format, va_list)
298 PRINTF_FORMAT (5, 0);
299 static const struct lex_token *lex_source_next__ (const struct lex_source *,
302 /* Initializes READER with the specified CLASS and otherwise some reasonable
303 defaults. The caller should fill in the other members as desired. */
305 lex_reader_init (struct lex_reader *reader,
306 const struct lex_reader_class *class)
308 reader->class = class;
309 reader->syntax = SEG_MODE_AUTO;
310 reader->error = LEX_ERROR_CONTINUE;
311 reader->file_name = NULL;
312 reader->encoding = NULL;
313 reader->line_number = 0;
317 /* Frees any file name already in READER and replaces it by a copy of
318 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
320 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
322 free (reader->file_name);
323 reader->file_name = xstrdup_if_nonnull (file_name);
326 /* Creates and returns a new lexer. */
330 struct lexer *lexer = xmalloc (sizeof *lexer);
331 *lexer = (struct lexer) {
332 .sources = LL_INITIALIZER (lexer->sources),
333 .macros = macro_set_create (),
338 /* Destroys LEXER. */
340 lex_destroy (struct lexer *lexer)
344 struct lex_source *source, *next;
346 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
348 ll_remove (&source->ll);
349 lex_source_unref (source);
351 macro_set_destroy (lexer->macros);
356 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
357 same name. Takes ownership of M. */
359 lex_define_macro (struct lexer *lexer, struct macro *m)
361 macro_set_add (lexer->macros, m);
364 /* Inserts READER into LEXER so that the next token read by LEXER comes from
365 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
368 lex_include (struct lexer *lexer, struct lex_reader *reader)
370 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
371 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
374 /* Appends READER to LEXER, so that it will be read after all other current
375 readers have already been read. */
377 lex_append (struct lexer *lexer, struct lex_reader *reader)
379 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
384 /* Advances LEXER to the next token, consuming the current token. */
386 lex_get (struct lexer *lexer)
388 struct lex_source *src;
390 src = lex_source__ (lexer);
394 if (src->parse_ofs < src->n_parse)
396 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
397 lex_source_clear_parse (src);
402 while (src->parse_ofs == src->n_parse)
403 if (!lex_source_get_parse (src))
405 ll_remove (&src->ll);
406 lex_source_unref (src);
407 src = lex_source__ (lexer);
413 /* Advances LEXER by N tokens. */
415 lex_get_n (struct lexer *lexer, size_t n)
421 /* Issuing errors. */
423 /* Prints a syntax error message containing the current token and
424 given message MESSAGE (if non-null). */
426 lex_error (struct lexer *lexer, const char *format, ...)
430 va_start (args, format);
431 lex_ofs_msg_valist (lexer, SE, lex_ofs (lexer), lex_ofs (lexer),
436 /* Prints a syntax error message for the span of tokens N0 through N1,
437 inclusive, from the current token in LEXER, adding message MESSAGE (if
440 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
444 va_start (args, format);
445 int ofs = lex_ofs (lexer);
446 lex_ofs_msg_valist (lexer, SE, n0 + ofs, n1 + ofs, format, args);
450 /* Prints a syntax error message for the span of tokens with offsets OFS0
451 through OFS1, inclusive, within the current command in LEXER, adding message
452 MESSAGE (if non-null). */
454 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
458 va_start (args, format);
459 lex_ofs_msg_valist (lexer, SE, ofs0, ofs1, format, args);
463 /* Prints a message of the given CLASS containing the current token and given
464 message MESSAGE (if non-null). */
466 lex_msg (struct lexer *lexer, enum msg_class class, const char *format, ...)
470 va_start (args, format);
471 lex_ofs_msg_valist (lexer, class, lex_ofs (lexer), lex_ofs (lexer),
476 /* Prints a message of the given CLASS for the span of tokens N0 through N1,
477 inclusive, from the current token in LEXER, adding message MESSAGE (if
480 lex_next_msg (struct lexer *lexer, enum msg_class class, int n0, int n1,
481 const char *format, ...)
485 va_start (args, format);
486 int ofs = lex_ofs (lexer);
487 lex_ofs_msg_valist (lexer, class, n0 + ofs, n1 + ofs, format, args);
491 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
492 through OFS1, inclusive, within the current command in LEXER, adding message
493 MESSAGE (if non-null). */
495 lex_ofs_msg (struct lexer *lexer, enum msg_class class, int ofs0, int ofs1,
496 const char *format, ...)
500 va_start (args, format);
501 lex_ofs_msg_valist (lexer, class, ofs0, ofs1, format, args);
505 /* Prints a syntax error message saying that one of the strings provided as
506 varargs, up to the first NULL, is expected. */
508 (lex_error_expecting) (struct lexer *lexer, ...)
512 va_start (args, lexer);
513 lex_error_expecting_valist (lexer, args);
517 /* Prints a syntax error message saying that one of the options provided in
518 ARGS, up to the first NULL, is expected. */
520 lex_error_expecting_valist (struct lexer *lexer, va_list args)
522 enum { MAX_OPTIONS = 9 };
523 const char *options[MAX_OPTIONS];
525 while (n < MAX_OPTIONS)
527 const char *option = va_arg (args, const char *);
531 options[n++] = option;
533 lex_error_expecting_array (lexer, options, n);
537 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
542 lex_error (lexer, NULL);
546 lex_error (lexer, _("Syntax error expecting %s."), options[0]);
550 lex_error (lexer, _("Syntax error expecting %s or %s."),
551 options[0], options[1]);
555 lex_error (lexer, _("Syntax error expecting %s, %s, or %s."),
556 options[0], options[1], options[2]);
560 lex_error (lexer, _("Syntax error expecting %s, %s, %s, or %s."),
561 options[0], options[1], options[2], options[3]);
565 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, or %s."),
566 options[0], options[1], options[2], options[3], options[4]);
570 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, or %s."),
571 options[0], options[1], options[2], options[3], options[4],
576 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, "
578 options[0], options[1], options[2], options[3], options[4],
579 options[5], options[6]);
583 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, %s, "
585 options[0], options[1], options[2], options[3], options[4],
586 options[5], options[6], options[7]);
591 struct string s = DS_EMPTY_INITIALIZER;
592 for (size_t i = 0; i < n; i++)
595 ds_put_cstr (&s, ", ");
596 ds_put_cstr (&s, options[i]);
598 lex_error (lexer, _("Syntax error expecting one of the following: %s."),
606 /* Reports an error to the effect that subcommand SBC may only be specified
609 lex_sbc_only_once (struct lexer *lexer, const char *sbc)
611 int ofs = lex_ofs (lexer) - 1;
612 if (lex_ofs_token (lexer, ofs)->type == T_EQUALS)
615 /* lex_ofs_at_phrase__() handles subcommand names that are keywords, such as
617 if (lex_ofs_at_phrase__ (lexer, ofs, sbc))
618 lex_ofs_error (lexer, ofs, ofs,
619 _("Subcommand %s may only be specified once."), sbc);
621 msg (SE, _("Subcommand %s may only be specified once."), sbc);
/* Reports an error to the effect that required subcommand SBC was not
   specified.

   A missing subcommand can normally be detected only after the whole command
   has been parsed, so an error attributed to the current token via
   lex_error() would always point at the end of the command, which does not
   help the user find the problem.  Instead, the error is attributed to the
   span of tokens from offset 0 through lex_max_ofs (LEXER). */
void
lex_sbc_missing (struct lexer *lexer, const char *sbc)
{
  int first_ofs = 0;
  int last_ofs = lex_max_ofs (lexer);

  lex_ofs_error (lexer, first_ofs, last_ofs,
                 _("Required subcommand %s was not specified."), sbc);
}
637 /* Reports an error to the effect that specification SPEC may only be specified
638 once within subcommand SBC. */
640 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
642 lex_error (lexer, _("%s may only be specified once within subcommand %s."),
646 /* Reports an error to the effect that specification SPEC is missing within
649 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
651 lex_error (lexer, _("Required %s specification missing from %s subcommand."),
655 /* Prints a syntax error message for the span of tokens with offsets OFS0
656 through OFS1, inclusive, within the current command in LEXER, adding message
657 MESSAGE (if non-null) with the given ARGS. */
659 lex_ofs_msg_valist (struct lexer *lexer, enum msg_class class,
660 int ofs0, int ofs1, const char *format, va_list args)
662 lex_source_msg_valist (lex_source__ (lexer), class, ofs0, ofs1, format, args);
665 /* Checks that we're at end of command.
666 If so, returns a successful command completion code.
667 If not, flags a syntax error and returns an error command
670 lex_end_of_command (struct lexer *lexer)
672 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
674 lex_error (lexer, _("Syntax error expecting end of command."));
681 /* Token testing functions. */
/* Reports whether LEXER's current token is a number.  Same as
   lex_next_is_number() with a lookahead of 0. */
bool
lex_is_number (const struct lexer *lexer)
{
  const int lookahead = 0;      /* 0 selects the current token. */
  return lex_next_is_number (lexer, lookahead);
}
/* Reports whether LEXER's current token is a string.  Same as
   lex_next_is_string() with a lookahead of 0. */
bool
lex_is_string (const struct lexer *lexer)
{
  const int lookahead = 0;      /* 0 selects the current token. */
  return lex_next_is_string (lexer, lookahead);
}
697 /* Returns the value of the current token, which must be a
698 floating point number. */
700 lex_number (const struct lexer *lexer)
702 return lex_next_number (lexer, 0);
/* Reports whether LEXER's current token is an integer.  Same as
   lex_next_is_integer() with a lookahead of 0. */
bool
lex_is_integer (const struct lexer *lexer)
{
  const int lookahead = 0;      /* 0 selects the current token. */
  return lex_next_is_integer (lexer, lookahead);
}
712 /* Returns the value of the current token, which must be an
715 lex_integer (const struct lexer *lexer)
717 return lex_next_integer (lexer, 0);
720 /* Token testing functions with lookahead.
722 A value of 0 for N as an argument to any of these functions refers to the
723 current token. Lookahead is limited to the current command. Any N greater
724 than the number of tokens remaining in the current command will be treated
725 as referring to a T_ENDCMD token. */
/* Reports whether the token N positions ahead of LEXER's current token is a
   number.  N == 0 refers to the current token itself. */
bool
lex_next_is_number (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_is_number (token);
}
/* Reports whether the token N positions ahead of LEXER's current token is a
   string.  N == 0 refers to the current token itself. */
bool
lex_next_is_string (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_is_string (token);
}
741 /* Returns the value of the token N ahead of the current token, which must be a
742 floating point number. */
744 lex_next_number (const struct lexer *lexer, int n)
746 return token_number (lex_next (lexer, n));
/* Reports whether the token N positions ahead of LEXER's current token is an
   integer.  N == 0 refers to the current token itself. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_is_integer (token);
}
756 /* Returns the value of the token N ahead of the current token, which must be
759 lex_next_integer (const struct lexer *lexer, int n)
761 return token_integer (lex_next (lexer, n));
764 /* Token matching functions. */
766 /* If the current token has the specified TYPE, skips it and returns true.
767 Otherwise, returns false. */
769 lex_match (struct lexer *lexer, enum token_type type)
771 if (lex_token (lexer) == type)
/* Tries to match the current token against IDENTIFIER, which the user may
   abbreviate to its first three letters.  On a match, skips the token and
   returns true; otherwise returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  /* Shortest abbreviation of IDENTIFIER that is accepted. */
  enum { MIN_ABBREV_LEN = 3 };

  return lex_match_id_n (lexer, identifier, MIN_ABBREV_LEN);
}
791 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
792 may be abbreviated to its first N letters. Otherwise, returns false.
794 IDENTIFIER must be an ASCII string. */
796 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
798 if (lex_token (lexer) == T_ID
799 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
808 /* If the current token is integer X, skips it and returns true. Otherwise,
811 lex_match_int (struct lexer *lexer, int x)
813 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
822 /* Forced matches. */
824 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
825 abbreviated to its first 3 letters. Otherwise, reports an error and returns
828 IDENTIFIER must be an ASCII string. */
830 lex_force_match_id (struct lexer *lexer, const char *identifier)
832 if (lex_match_id (lexer, identifier))
836 lex_error_expecting (lexer, identifier);
841 /* If the current token has the specified TYPE, skips it and returns true.
842 Otherwise, reports an error and returns false. */
844 lex_force_match (struct lexer *lexer, enum token_type type)
846 if (lex_token (lexer) == type)
853 const char *type_string = token_type_to_string (type);
856 char *s = xasprintf ("`%s'", type_string);
857 lex_error_expecting (lexer, s);
861 lex_error_expecting (lexer, token_type_to_name (type));
867 /* If the current token is a string, does nothing and returns true.
868 Otherwise, reports an error and returns false. */
870 lex_force_string (struct lexer *lexer)
872 if (lex_is_string (lexer))
876 lex_error (lexer, _("Syntax error expecting string."));
881 /* If the current token is a string or an identifier, does nothing and returns
882 true. Otherwise, reports an error and returns false.
884 This is meant for use in syntactic situations where we want to encourage the
885 user to supply a quoted string, but for compatibility we also accept
886 identifiers. (One example of such a situation is file names.) Therefore,
887 the error message issued when the current token is wrong only says that a
888 string is expected and doesn't mention that an identifier would also be
891 lex_force_string_or_id (struct lexer *lexer)
893 return lex_token (lexer) == T_ID || lex_force_string (lexer);
896 /* If the current token is an integer, does nothing and returns true.
897 Otherwise, reports an error and returns false. */
899 lex_force_int (struct lexer *lexer)
901 if (lex_is_integer (lexer))
905 lex_error (lexer, _("Syntax error expecting integer."));
910 /* If the current token is an integer in the range MIN...MAX (inclusive), does
911 nothing and returns true. Otherwise, reports an error and returns false.
912 If NAME is nonnull, then it is used in the error message. */
914 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
916 bool is_number = lex_is_number (lexer);
917 bool is_integer = lex_is_integer (lexer);
918 bool too_small = (is_integer ? lex_integer (lexer) < min
919 : is_number ? lex_number (lexer) < min
921 bool too_big = (is_integer ? lex_integer (lexer) > max
922 : is_number ? lex_number (lexer) > max
924 if (is_integer && !too_small && !too_big)
929 /* Weird, maybe a bug in the caller. Just report that we needed an
932 lex_error (lexer, _("Syntax error expecting integer for %s."), name);
934 lex_error (lexer, _("Syntax error expecting integer."));
939 lex_error (lexer, _("Syntax error expecting %ld for %s."), min, name);
941 lex_error (lexer, _("Syntax error expecting %ld."), min);
943 else if (min + 1 == max)
946 lex_error (lexer, _("Syntax error expecting %ld or %ld for %s."),
949 lex_error (lexer, _("Syntax error expecting %ld or %ld."),
954 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
955 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
957 if (report_lower_bound && report_upper_bound)
961 _("Syntax error expecting integer "
962 "between %ld and %ld for %s."),
965 lex_error (lexer, _("Syntax error expecting integer "
966 "between %ld and %ld."),
969 else if (report_lower_bound)
974 lex_error (lexer, _("Syntax error expecting "
975 "non-negative integer for %s."),
978 lex_error (lexer, _("Syntax error expecting "
979 "non-negative integer."));
984 lex_error (lexer, _("Syntax error expecting "
985 "positive integer for %s."),
988 lex_error (lexer, _("Syntax error expecting "
989 "positive integer."));
994 lex_error (lexer, _("Syntax error expecting "
995 "integer %ld or greater for %s."),
998 lex_error (lexer, _("Syntax error expecting "
999 "integer %ld or greater."), min);
1002 else if (report_upper_bound)
1006 _("Syntax error expecting integer less than or equal "
1010 lex_error (lexer, _("Syntax error expecting integer less than or "
1017 lex_error (lexer, _("Syntax error expecting integer for %s."),
1020 lex_error (lexer, _("Syntax error expecting integer."));
1026 /* If the current token is a number, does nothing and returns true.
1027 Otherwise, reports an error and returns false. */
1029 lex_force_num (struct lexer *lexer)
1031 if (lex_is_number (lexer))
1034 lex_error (lexer, _("Syntax error expecting number."));
1038 /* If the current token is a number in the closed range [MIN,MAX], does
1039 nothing and returns true. Otherwise, reports an error and returns false.
1040 If NAME is nonnull, then it is used in the error message. */
1042 lex_force_num_range_closed (struct lexer *lexer, const char *name,
1043 double min, double max)
1045 bool is_number = lex_is_number (lexer);
1046 bool too_small = is_number && lex_number (lexer) < min;
1047 bool too_big = is_number && lex_number (lexer) > max;
1048 if (is_number && !too_small && !too_big)
1053 /* Weird, maybe a bug in the caller. Just report that we needed an
1056 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1058 lex_error (lexer, _("Syntax error expecting number."));
1060 else if (min == max)
1063 lex_error (lexer, _("Syntax error expecting number %g for %s."),
1066 lex_error (lexer, _("Syntax error expecting number %g."), min);
1070 bool report_lower_bound = min > -DBL_MAX || too_small;
1071 bool report_upper_bound = max < DBL_MAX || too_big;
1073 if (report_lower_bound && report_upper_bound)
1077 _("Syntax error expecting number "
1078 "between %g and %g for %s."),
1081 lex_error (lexer, _("Syntax error expecting number "
1082 "between %g and %g."),
1085 else if (report_lower_bound)
1090 lex_error (lexer, _("Syntax error expecting "
1091 "non-negative number for %s."),
1094 lex_error (lexer, _("Syntax error expecting "
1095 "non-negative number."));
1100 lex_error (lexer, _("Syntax error expecting number "
1101 "%g or greater for %s."),
1104 lex_error (lexer, _("Syntax error expecting number "
1105 "%g or greater."), min);
1108 else if (report_upper_bound)
1112 _("Syntax error expecting number "
1113 "less than or equal to %g for %s."),
1116 lex_error (lexer, _("Syntax error expecting number "
1117 "less than or equal to %g."),
1123 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1125 lex_error (lexer, _("Syntax error expecting number."));
1131 /* If the current token is a number in the half-open range [MIN,MAX), does
1132 nothing and returns true. Otherwise, reports an error and returns false.
1133 If NAME is nonnull, then it is used in the error message. */
1135 lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
1136 double min, double max)
1138 bool is_number = lex_is_number (lexer);
1139 bool too_small = is_number && lex_number (lexer) < min;
1140 bool too_big = is_number && lex_number (lexer) >= max;
1141 if (is_number && !too_small && !too_big)
1146 /* Weird, maybe a bug in the caller. Just report that we needed an
1149 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1151 lex_error (lexer, _("Syntax error expecting number."));
1155 bool report_lower_bound = min > -DBL_MAX || too_small;
1156 bool report_upper_bound = max < DBL_MAX || too_big;
1158 if (report_lower_bound && report_upper_bound)
1161 lex_error (lexer, _("Syntax error expecting number "
1162 "in [%g,%g) for %s."),
1165 lex_error (lexer, _("Syntax error expecting number in [%g,%g)."),
1168 else if (report_lower_bound)
1173 lex_error (lexer, _("Syntax error expecting "
1174 "non-negative number for %s."),
1177 lex_error (lexer, _("Syntax error expecting "
1178 "non-negative number."));
1183 lex_error (lexer, _("Syntax error expecting "
1184 "number %g or greater for %s."),
1187 lex_error (lexer, _("Syntax error expecting "
1188 "number %g or greater."), min);
1191 else if (report_upper_bound)
1195 _("Syntax error expecting "
1196 "number less than %g for %s."), max, name);
1198 lex_error (lexer, _("Syntax error expecting "
1199 "number less than %g."), max);
1204 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1206 lex_error (lexer, _("Syntax error expecting number."));
1212 /* If the current token is a number in the open range (MIN,MAX), does
1213 nothing and returns true. Otherwise, reports an error and returns false.
1214 If NAME is nonnull, then it is used in the error message. */
1216 lex_force_num_range_open (struct lexer *lexer, const char *name,
1217 double min, double max)
1219 bool is_number = lex_is_number (lexer);
1220 bool too_small = is_number && lex_number (lexer) <= min;
1221 bool too_big = is_number && lex_number (lexer) >= max;
1222 if (is_number && !too_small && !too_big)
1227 /* Weird, maybe a bug in the caller. Just report that we needed an
1230 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1232 lex_error (lexer, _("Syntax error expecting number."));
1236 bool report_lower_bound = min > -DBL_MAX || too_small;
1237 bool report_upper_bound = max < DBL_MAX || too_big;
1239 if (report_lower_bound && report_upper_bound)
1242 lex_error (lexer, _("Syntax error expecting number "
1243 "in (%g,%g) for %s."),
1246 lex_error (lexer, _("Syntax error expecting number "
1247 "in (%g,%g)."), min, max);
1249 else if (report_lower_bound)
1254 lex_error (lexer, _("Syntax error expecting "
1255 "positive number for %s."), name);
1257 lex_error (lexer, _("Syntax error expecting "
1258 "positive number."));
1263 lex_error (lexer, _("Syntax error expecting number "
1264 "greater than %g for %s."),
1267 lex_error (lexer, _("Syntax error expecting number "
1268 "greater than %g."), min);
1271 else if (report_upper_bound)
1274 lex_error (lexer, _("Syntax error expecting number "
1275 "less than %g for %s."),
1278 lex_error (lexer, _("Syntax error expecting number "
1279 "less than %g."), max);
1284 lex_error (lexer, _("Syntax error expecting number "
1287 lex_error (lexer, _("Syntax error expecting number."));
1293 /* If the current token is an identifier, does nothing and returns true.
1294 Otherwise, reports an error and returns false. */
1296 lex_force_id (struct lexer *lexer)
1298 if (lex_token (lexer) == T_ID)
1301 lex_error (lexer, _("Syntax error expecting identifier."));
1305 /* Token accessors. */
1307 /* Returns the type of LEXER's current token. */
1309 lex_token (const struct lexer *lexer)
1311 return lex_next_token (lexer, 0);
1314 /* Returns the number in LEXER's current token.
1316 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1317 tokens this function will always return zero. */
1319 lex_tokval (const struct lexer *lexer)
1321 return lex_next_tokval (lexer, 0);
1324 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1326 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1327 this function will always return NULL.
1329 The UTF-8 encoding of the returned string is correct for variable names and
1330 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1331 data_in() to use it in a "union value". */
1333 lex_tokcstr (const struct lexer *lexer)
1335 return lex_next_tokcstr (lexer, 0);
1338 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1339 null-terminated (but the null terminator is not included in the returned
1340 substring's 'length').
1342 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1343 this function will always return NULL.
1345 The UTF-8 encoding of the returned string is correct for variable names and
1346 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1347 data_in() to use it in a "union value". */
1349 lex_tokss (const struct lexer *lexer)
1351 return lex_next_tokss (lexer, 0);
1356 A value of 0 for N as an argument to any of these functions refers to the
1357 current token. Lookahead is limited to the current command. Any N greater
1358 than the number of tokens remaining in the current command will be treated
1359 as referring to a T_ENDCMD token. */
1361 static const struct lex_token *
1362 lex_next__ (const struct lexer *lexer_, int n)
1364 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1365 struct lex_source *src = lex_source__ (lexer);
1368 return lex_source_next__ (src, n);
1371 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1376 static const struct lex_token *
1377 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1379 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1383 static const struct lex_token endcmd_token
1384 = { .token = { .type = T_ENDCMD } };
1385 return &endcmd_token;
1388 while (ofs >= src->n_parse)
1390 if (src->n_parse > 0)
1392 const struct lex_token *t = src->parse[src->n_parse - 1];
1393 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1397 lex_source_get_parse (src);
1400 return src->parse[ofs];
1403 static const struct lex_token *
1404 lex_source_next__ (const struct lex_source *src, int n)
1406 return lex_source_ofs__ (src, n + src->parse_ofs);
1409 /* Returns the "struct token" of the token N after the current one in LEXER.
1410 The returned pointer can be invalidated by pretty much any succeeding call
1411 into the lexer, although the string pointer within the returned token is
1412 only invalidated by consuming the token (e.g. with lex_get()). */
1413 const struct token *
1414 lex_next (const struct lexer *lexer, int n)
1416 return &lex_next__ (lexer, n)->token;
1419 /* Returns the type of the token N after the current one in LEXER. */
1421 lex_next_token (const struct lexer *lexer, int n)
1423 return lex_next (lexer, n)->type;
1426 /* Returns the number in the token N after the current one in LEXER.
1428 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1429 tokens this function will always return zero. */
1431 lex_next_tokval (const struct lexer *lexer, int n)
1433 return token_number (lex_next (lexer, n));
1436 /* Returns the null-terminated string in the token N after the current one, in
1439 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1440 this function will always return NULL.
1442 The UTF-8 encoding of the returned string is correct for variable names and
1443 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1444 data_in() to use it in a "union value". */
1446 lex_next_tokcstr (const struct lexer *lexer, int n)
1448 return lex_next_tokss (lexer, n).string;
1451 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1452 The string is null-terminated (but the null terminator is not included in
1453 the returned substring's 'length').
1455 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1456 tokens this function will always return NULL.
1458 The UTF-8 encoding of the returned string is correct for variable names and
1459 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1460 data_in() to use it in a "union value". */
1462 lex_next_tokss (const struct lexer *lexer, int n)
1464 return lex_next (lexer, n)->string;
1467 /* Returns the offset of the current token within the command being parsed in
1468 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1469 on. The return value is useful later for referring to this token in calls
1472 lex_ofs (const struct lexer *lexer)
1474 struct lex_source *src = lex_source__ (lexer);
1475 return src ? src->parse_ofs : 0;
1478 /* Returns the offset of the last token in the current command. */
1480 lex_max_ofs (const struct lexer *lexer)
1482 struct lex_source *src = lex_source__ (lexer);
1486 int ofs = MAX (1, src->n_parse) - 1;
1489 enum token_type type = lex_source_ofs__ (src, ofs)->token.type;
1490 if (type == T_ENDCMD || type == T_STOP)
1497 /* Returns the token within LEXER's current command with offset OFS. Use
1498 lex_ofs() to find out the offset of the current token. */
1499 const struct token *
1500 lex_ofs_token (const struct lexer *lexer_, int ofs)
1502 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1503 struct lex_source *src = lex_source__ (lexer);
1506 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1509 static const struct token stop_token = { .type = T_STOP };
/* Builds a new struct msg_location covering the tokens whose offsets within
   LEXER's current command run from OFS0 through OFS1, inclusive.  Token
   offsets are as explained for lex_ofs().

   Ownership of the returned object passes to the caller, who must eventually
   free it. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() counts from the current token, so rebase both
     command-relative offsets onto the current token's offset. */
  const int base = lex_ofs (lexer);
  const int n0 = ofs0 - base;
  const int n1 = ofs1 - base;
  return lex_get_location (lexer, n0, n1);
}
1526 /* Returns a msg_point for the first character in the token with offset OFS,
1527 where offset 0 is the first token in the command currently being parsed, 1
1528 the second token, and so on. These are absolute offsets, not relative to
1529 the token currently being parsed within the command.
1531 Returns zeros for a T_STOP token.
1534 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1536 const struct lex_source *src = lex_source__ (lexer);
1538 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1539 : (struct msg_point) { 0, 0 });
1542 /* Returns a msg_point for the last character, inclusive, in the token with
1543 offset OFS, where offset 0 is the first token in the command currently being
1544 parsed, 1 the second token, and so on. These are absolute offsets, not
1545 relative to the token currently being parsed within the command.
1547 Returns zeros for a T_STOP token.
1549 Most of the time, a single token is wholly within a single line of syntax,
1550 so that the start and end point for a given offset have the same line
1551 number. There are two exceptions: a T_STRING token can be made up of
1552 multiple segments on adjacent lines connected with "+" punctuators, and a
1553 T_NEG_NUM token can consist of a "-" on one line followed by the number on
1557 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1559 const struct lex_source *src = lex_source__ (lexer);
1561 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1562 : (struct msg_point) { 0, 0 });
1565 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1566 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1567 are both zero, this requests the syntax for the current token.)
1569 The caller must eventually free the returned string (with free()). The
1570 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1571 that, for example, it may include comments, spaces, and new-lines if it
1572 spans multiple tokens. Macro expansion, however, has already been
1575 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1577 const struct lex_source *src = lex_source__ (lexer);
1579 ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
1584 /* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
1585 inclusive. (For example, if OFS0 and OFS1 are both zero, this requests the
1586 syntax for the first token in the current command.)
1588 The caller must eventually free the returned string (with free()). The
1589 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1590 that, for example, it may include comments, spaces, and new-lines if it
1591 spans multiple tokens. Macro expansion, however, has already been
1594 lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
1596 const struct lex_source *src = lex_source__ (lexer);
1597 return src ? lex_source_syntax__ (src, ofs0, ofs1) : xstrdup ("");
1600 /* Returns true if the token N ahead of the current one was produced by macro
1601 expansion, false otherwise. */
1603 lex_next_is_from_macro (const struct lexer *lexer, int n)
1605 return lex_next__ (lexer, n)->macro_rep != NULL;
1609 lex_tokens_match (const struct token *actual, const struct token *expected)
1611 if (actual->type != expected->type)
1614 switch (actual->type)
1618 return actual->number == expected->number;
1621 return lex_id_match (expected->string, actual->string);
1624 return (actual->string.length == expected->string.length
1625 && !memcmp (actual->string.string, expected->string.string,
1626 actual->string.length));
1634 lex_ofs_at_phrase__ (struct lexer *lexer, int ofs, const char *s)
1636 struct string_lexer slex;
1640 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1641 while (string_lexer_next (&slex, &token))
1643 bool match = lex_tokens_match (lex_ofs_token (lexer, ofs + i++), &token);
1644 token_uninit (&token);
/* Tests whether the tokens at LEXER's current position are the ones that may
   be parsed from S, returning true if so and false if not.

   S may hold an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS", "2SLS",
   or "END INPUT PROGRAM".  Identifiers may be abbreviated to their first
   three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  const int start = lex_ofs (lexer);
  return lex_ofs_at_phrase__ (lexer, start, s) > 0;
}
1663 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1664 skips it and returns true. Otherwise, returns false.
1666 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1667 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1668 first three letters. */
1670 lex_match_phrase (struct lexer *lexer, const char *s)
1672 size_t n = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s);
1674 lex_get_n (lexer, n);
1678 /* Returns the 1-based line number of the source text at the byte OFFSET in
1681 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1684 size_t hi = src->n_lines;
1687 size_t mid = (lo + hi) / 2;
1688 if (mid + 1 >= src->n_lines)
1689 return src->n_lines;
1690 else if (offset >= src->lines[mid + 1])
1692 else if (offset < src->lines[mid])
1699 /* Returns the 1-based column number of the source text at the byte OFFSET in
1702 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1704 const char *newline = memrchr (src->buffer, '\n', offset);
1705 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1706 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1709 static struct msg_point
1710 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1712 return (struct msg_point) {
1713 .line = lex_source_ofs_to_line_number (src, offset),
1714 .column = lex_source_ofs_to_column_number (src, offset),
1718 static struct msg_point
1719 lex_token_start_point (const struct lex_source *src,
1720 const struct lex_token *token)
1722 return lex_source_ofs_to_point__ (src, token->token_pos);
1725 static struct msg_point
1726 lex_token_end_point (const struct lex_source *src,
1727 const struct lex_token *token)
1729 return lex_source_ofs_to_point__ (src, lex_token_end (token));
1732 static struct msg_location
1733 lex_token_location (const struct lex_source *src,
1734 const struct lex_token *t0,
1735 const struct lex_token *t1)
1737 return (struct msg_location) {
1738 .file_name = intern_new_if_nonnull (src->reader->file_name),
1739 .start = lex_token_start_point (src, t0),
1740 .end = lex_token_end_point (src, t1),
1741 .src = CONST_CAST (struct lex_source *, src),
1745 static struct msg_location *
1746 lex_token_location_rw (const struct lex_source *src,
1747 const struct lex_token *t0,
1748 const struct lex_token *t1)
1750 struct msg_location location = lex_token_location (src, t0, t1);
1751 return msg_location_dup (&location);
/* Returns a copy (as made by lex_token_location_rw()) of the location in SRC
   that spans the tokens at offsets OFS0 through OFS1. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
{
  const struct lex_token *first = lex_source_ofs__ (src, ofs0);
  const struct lex_token *last = lex_source_ofs__ (src, ofs1);
  return lex_token_location_rw (src, first, last);
}
1762 /* Returns the name of the syntax file from which the current command is drawn.
1763 Returns NULL for a T_STOP token or if the command's source does not have
1766 There is no version of this function that takes an N argument because
1767 lookahead only works to the end of a command and any given command is always
1768 within a single syntax file. */
1770 lex_get_file_name (const struct lexer *lexer)
1772 struct lex_source *src = lex_source__ (lexer);
1773 return src == NULL ? NULL : src->reader->file_name;
1776 /* Returns a newly allocated msg_location for the syntax that represents tokens
1777 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1778 must eventually free the location (with msg_location_destroy()). */
1779 struct msg_location *
1780 lex_get_location (const struct lexer *lexer, int n0, int n1)
1782 struct msg_location *loc = xmalloc (sizeof *loc);
1783 *loc = (struct msg_location) {
1784 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1785 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1786 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1787 .src = lex_source__ (lexer),
1789 lex_source_ref (loc->src);
1794 lex_get_encoding (const struct lexer *lexer)
1796 struct lex_source *src = lex_source__ (lexer);
1797 return src == NULL ? NULL : src->reader->encoding;
1800 /* Returns the syntax mode for the syntax file from which the current drawn is
1801 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1802 does not have line numbers.
1804 There is no version of this function that takes an N argument because
1805 lookahead only works to the end of a command and any given command is always
1806 within a single syntax file. */
1808 lex_get_syntax_mode (const struct lexer *lexer)
1810 struct lex_source *src = lex_source__ (lexer);
1811 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1814 /* Returns the error mode for the syntax file from which the current drawn is
1815 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1816 source does not have line numbers.
1818 There is no version of this function that takes an N argument because
1819 lookahead only works to the end of a command and any given command is always
1820 within a single syntax file. */
1822 lex_get_error_mode (const struct lexer *lexer)
1824 struct lex_source *src = lex_source__ (lexer);
1825 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1828 /* If the source that LEXER is currently reading has error mode
1829 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1830 token to be read comes directly from whatever is next read from the stream.
1832 It makes sense to call this function after encountering an error in a
1833 command entered on the console, because usually the user would prefer not to
1834 have cascading errors. */
1836 lex_interactive_reset (struct lexer *lexer)
1838 struct lex_source *src = lex_source__ (lexer);
1839 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1842 src->journal_pos = src->seg_pos = 0;
1844 src->suppress_next_newline = false;
1845 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1847 lex_stage_clear (&src->pp);
1848 lex_stage_clear (&src->merge);
1849 lex_source_clear_parse (src);
1850 lex_source_push_endcmd__ (src);
1854 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1856 lex_discard_rest_of_command (struct lexer *lexer)
1858 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1862 /* Discards all lookahead tokens in LEXER, then discards all input sources
1863 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1864 runs out of input sources. */
1866 lex_discard_noninteractive (struct lexer *lexer)
1868 struct lex_source *src = lex_source__ (lexer);
1872 lex_stage_clear (&src->pp);
1873 lex_stage_clear (&src->merge);
1874 lex_source_clear_parse (src);
1876 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1877 src = lex_source__ (lexer))
1879 ll_remove (&src->ll);
1880 lex_source_unref (src);
1886 lex_source_expand__ (struct lex_source *src)
1888 if (src->length >= src->allocated)
1889 src->buffer = x2realloc (src->buffer, &src->allocated);
1893 lex_source_read__ (struct lex_source *src)
1897 lex_source_expand__ (src);
1899 size_t space = src->allocated - src->length;
1900 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1901 size_t n = src->reader->class->read (src->reader,
1902 &src->buffer[src->length],
1904 assert (n <= space);
1909 src->reader->eof = true;
1915 while (!memchr (&src->buffer[src->seg_pos], '\n',
1916 src->length - src->seg_pos));
1919 static struct lex_source *
1920 lex_source__ (const struct lexer *lexer)
1922 return (ll_is_empty (&lexer->sources) ? NULL
1923 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1926 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1927 OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are
1928 both zero, this requests the syntax for the first token in the current
1929 command.) The caller must eventually free the returned string (with
1930 free()). The syntax is encoded in UTF-8 and in the original form supplied
1931 to the lexer so that, for example, it may include comments, spaces, and
1932 new-lines if it spans multiple tokens. Macro expansion, however, has
1933 already been performed. */
1935 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1937 struct string s = DS_EMPTY_INITIALIZER;
1938 for (size_t i = ofs0; i <= ofs1; )
1940 /* Find [I,J) as the longest sequence of tokens not produced by macro
1941 expansion, or otherwise the longest sequence expanded from a single
1943 const struct lex_token *first = lex_source_ofs__ (src, i);
1945 for (j = i + 1; j <= ofs1; j++)
1947 const struct lex_token *cur = lex_source_ofs__ (src, j);
1948 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1949 || first->macro_rep != cur->macro_rep)
1952 const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1954 /* Now add the syntax for this sequence of tokens to SRC. */
1955 if (!ds_is_empty (&s))
1956 ds_put_byte (&s, ' ');
1957 if (!first->macro_rep)
1959 size_t start = first->token_pos;
1960 size_t end = last->token_pos + last->token_len;
1961 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1965 size_t start = first->ofs;
1966 size_t end = last->ofs + last->len;
1967 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1973 return ds_steal_cstr (&s);
1977 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
1979 for (int i = ofs0; i <= ofs1; i++)
1980 if (lex_source_ofs__ (src, i)->macro_rep)
1985 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1986 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1987 other tokens included in that range. The syntax is encoded in UTF-8 and in
1988 the original form supplied to the lexer so that, for example, it may include
1989 comments, spaces, and new-lines if it spans multiple tokens.
1991 Returns an empty string if the token range doesn't include a macro call.
1993 The caller must not modify or free the returned string. */
1994 static struct substring
1995 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
1997 if (!lex_source_contains_macro_call (src, ofs0, ofs1))
2000 const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
2001 const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
2002 size_t start = token0->token_pos;
2003 size_t end = token1->token_pos + token1->token_len;
2005 return ss_buffer (&src->buffer[start], end - start);
2009 lex_source_msg_valist (struct lex_source *src, enum msg_class class,
2010 int ofs0, int ofs1, const char *format, va_list args)
2012 struct string s = DS_EMPTY_INITIALIZER;
2016 /* Get the macro call(s) that expanded to the syntax that caused the
2019 str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
2022 ds_put_format (&s, _("In syntax expanded from `%s'"), call);
2025 ds_put_cstr (&s, _("At end of input"));
2027 if (!ds_is_empty (&s))
2028 ds_put_cstr (&s, ": ");
2030 ds_put_vformat (&s, format, args);
2032 ds_put_cstr (&s, _("Syntax error."));
2034 if (ds_last (&s) != '.')
2035 ds_put_byte (&s, '.');
2037 struct msg *m = xmalloc (sizeof *m);
2039 .category = msg_class_to_category (class),
2040 .severity = msg_class_to_severity (class),
2041 .location = src ? lex_source_get_location (src, ofs0, ofs1) : NULL,
2042 .text = ds_steal_cstr (&s),
2048 lex_get_error (struct lex_source *src, const struct lex_token *token)
2051 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
2052 syntax, sizeof syntax);
2054 struct string s = DS_EMPTY_INITIALIZER;
2055 ds_put_cstr (&s, token->token.string.string);
2057 struct msg *m = xmalloc (sizeof *m);
2059 .category = MSG_C_SYNTAX,
2060 .severity = MSG_S_ERROR,
2061 .location = lex_token_location_rw (src, token, token),
2062 .text = ds_steal_cstr (&s),
2067 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2068 underlying lex_reader if necessary. Returns true if a new token was added
2069 to SRC's deque, false otherwise. The caller should retry failures unless
2070 SRC's 'eof' marker was set to true indicating that there will be no more
2071 tokens from this source. */
2073 lex_source_try_get_pp (struct lex_source *src)
2075 /* Append a new token to SRC and initialize it. */
2076 struct lex_token *token = xmalloc (sizeof *token);
2077 token->token = (struct token) { .type = T_STOP };
2078 token->macro_rep = NULL;
2079 token->ref_cnt = NULL;
2080 token->token_pos = src->seg_pos;
2082 /* Extract a segment. */
2083 const char *segment;
2084 enum segment_type seg_type;
2088 segment = &src->buffer[src->seg_pos];
2089 seg_len = segmenter_push (&src->segmenter, segment,
2090 src->length - src->seg_pos,
2091 src->reader->eof, &seg_type);
2095 /* The segmenter needs more input to produce a segment. */
2096 assert (!src->reader->eof);
2097 lex_source_read__ (src);
2100 /* Update state based on the segment. */
2101 token->token_len = seg_len;
2102 src->seg_pos += seg_len;
2103 if (seg_type == SEG_NEWLINE)
2105 if (src->n_lines >= src->allocated_lines)
2106 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2107 sizeof *src->lines);
2108 src->lines[src->n_lines++] = src->seg_pos;
2111 /* Get a token from the segment. */
2112 enum tokenize_result result = token_from_segment (
2113 seg_type, ss_buffer (segment, seg_len), &token->token);
2115 /* If we've reached the end of a line, or the end of a command, then pass
2116 the line to the output engine as a syntax text item. */
2117 int n_lines = seg_type == SEG_NEWLINE;
2118 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2121 src->suppress_next_newline = true;
2123 else if (n_lines > 0 && src->suppress_next_newline)
2126 src->suppress_next_newline = false;
2128 for (int i = 0; i < n_lines; i++)
2130 /* Beginning of line. */
2131 const char *line = &src->buffer[src->journal_pos];
2133 /* Calculate line length, including \n or \r\n end-of-line if present.
2135 We use src->length even though that may be beyond what we've actually
2136 converted to tokens. That's because, if we're emitting the line due
2137 to SEG_END_COMMAND, we want to take the whole line through the
2138 newline, not just through the '.'. */
2139 size_t max_len = src->length - src->journal_pos;
2140 const char *newline = memchr (line, '\n', max_len);
2141 size_t line_len = newline ? newline - line + 1 : max_len;
2143 /* Calculate line length excluding end-of-line. */
2144 size_t copy_len = line_len;
2145 if (copy_len > 0 && line[copy_len - 1] == '\n')
2147 if (copy_len > 0 && line[copy_len - 1] == '\r')
2150 /* Submit the line as syntax. */
2151 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2152 xmemdup0 (line, copy_len),
2155 src->journal_pos += line_len;
2160 case TOKENIZE_ERROR:
2161 lex_get_error (src, token);
2163 case TOKENIZE_EMPTY:
2164 lex_token_destroy (token);
2167 case TOKENIZE_TOKEN:
2168 if (token->token.type == T_STOP)
2170 token->token.type = T_ENDCMD;
2173 lex_stage_push_last (&src->pp, token);
2179 /* Attempts to append a new token to SRC. Returns true if successful, false on
2180 failure. On failure, the end of SRC has been reached and no more tokens
2181 will be forthcoming from it.
2183 Does not make the new token available for lookahead yet; the caller must
2184 adjust SRC's 'middle' pointer to do so. */
2186 lex_source_get_pp (struct lex_source *src)
2189 if (lex_source_try_get_pp (src))
2195 lex_source_try_get_merge (const struct lex_source *src_)
2197 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2199 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2202 if (!settings_get_mexpand ())
2204 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2208 /* Now pass tokens one-by-one to the macro expander.
2210 In the common case where there is no macro to expand, the loop is not
2212 struct macro_call *mc;
2213 int n_call = macro_call_create (src->lexer->macros,
2214 &lex_stage_first (&src->pp)->token, &mc);
2215 for (int ofs = 1; !n_call; ofs++)
2217 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2219 /* This should not be reachable because we always get a T_ENDCMD at
2220 the end of an input file (transformed from T_STOP by
2221 lex_source_try_get_pp()) and the macro_expander should always
2222 terminate expansion on T_ENDCMD. */
2226 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2227 const struct macro_token mt = {
2229 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2231 const struct msg_location loc = lex_token_location (src, t, t);
2232 n_call = macro_call_add (mc, &mt, &loc);
2236 /* False alarm: no macro expansion after all. Use first token as
2237 lookahead. We'll retry macro expansion from the second token next
2239 macro_call_destroy (mc);
2240 lex_stage_shift (&src->merge, &src->pp, 1);
2244 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2245 are a macro call. (These are likely to be the only tokens in 'pp'.)
2247 const struct lex_token *c0 = lex_stage_first (&src->pp);
2248 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2249 struct macro_tokens expansion = { .n = 0 };
2250 struct msg_location loc = lex_token_location (src, c0, c1);
2251 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2252 macro_call_destroy (mc);
2254 /* Convert the macro expansion into syntax for possible error messages
2256 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2257 size_t *len = xnmalloc (expansion.n, sizeof *len);
2258 struct string s = DS_EMPTY_INITIALIZER;
2259 macro_tokens_to_syntax (&expansion, &s, ofs, len);
2261 if (settings_get_mprint ())
2262 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2263 _("Macro Expansion")));
2265 /* Append the macro expansion tokens to the lookahead. */
2266 if (expansion.n > 0)
2268 char *macro_rep = ds_steal_cstr (&s);
2269 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2270 *ref_cnt = expansion.n;
2271 for (size_t i = 0; i < expansion.n; i++)
2273 struct lex_token *token = xmalloc (sizeof *token);
2274 *token = (struct lex_token) {
2275 .token = expansion.mts[i].token,
2276 .token_pos = c0->token_pos,
2277 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2278 .macro_rep = macro_rep,
2283 lex_stage_push_last (&src->merge, token);
2285 ss_dealloc (&expansion.mts[i].syntax);
2290 free (expansion.mts);
2294 /* Destroy the tokens for the call. */
2295 for (size_t i = 0; i < n_call; i++)
2296 lex_stage_pop_first (&src->pp);
2298 return expansion.n > 0;
2301 /* Attempts to obtain at least one new token into 'merge' in SRC.
2303 Returns true if successful, false on failure. In the latter case, SRC is
2304 exhausted and 'src->reader->eof' is now true. */
2306 lex_source_get_merge (struct lex_source *src)
2309 if (lex_source_try_get_merge (src))
2314 /* Attempts to obtain at least one new token into 'parse' in SRC.
2316 Returns true if successful, false on failure. In the latter case, SRC is
2317 exhausted and 'src->reader->eof' is now true. */
2319 lex_source_get_parse (struct lex_source *src)
2321 struct merger m = MERGER_INIT;
2323 for (size_t i = 0; ; i++)
2325 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2327 /* We always get a T_ENDCMD at the end of an input file
2328 (transformed from T_STOP by lex_source_try_get_pp()) and
2329 merger_add() should never return -1 on T_ENDCMD. */
2330 assert (lex_stage_is_empty (&src->merge));
2334 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2338 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2341 else if (retval > 0)
2343 /* Add a token that merges all the tokens together. */
2344 const struct lex_token *first = lex_stage_first (&src->merge);
2345 const struct lex_token *last = lex_stage_nth (&src->merge,
2347 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2348 struct lex_token *t = xmalloc (sizeof *t);
2349 *t = (struct lex_token) {
2351 .token_pos = first->token_pos,
2352 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2354 /* This works well if all the tokens were not expanded from macros,
2355 or if they came from the same macro expansion. It just gives up
2356 in the other (corner) cases. */
2357 .macro_rep = macro ? first->macro_rep : NULL,
2358 .ofs = macro ? first->ofs : 0,
2359 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2360 .ref_cnt = macro ? first->ref_cnt : NULL,
2364 lex_source_push_parse (src, t);
2366 for (int i = 0; i < retval; i++)
2367 lex_stage_pop_first (&src->merge);
2374 lex_source_push_endcmd__ (struct lex_source *src)
2376 assert (src->n_parse == 0);
2378 struct lex_token *token = xmalloc (sizeof *token);
2379 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2380 lex_source_push_parse (src, token);
2384 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2386 if (src->n_parse >= src->allocated_parse)
2387 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2388 sizeof *src->parse);
2389 src->parse[src->n_parse++] = token;
2393 lex_source_clear_parse (struct lex_source *src)
2395 for (size_t i = 0; i < src->n_parse; i++)
2396 lex_token_destroy (src->parse[i]);
2397 src->n_parse = src->parse_ofs = 0;
2400 static struct lex_source *
2401 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2403 size_t allocated_lines = 4;
2404 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2407 struct lex_source *src = xmalloc (sizeof *src);
2408 *src = (struct lex_source) {
2411 .segmenter = segmenter_init (reader->syntax, false),
2415 .allocated_lines = allocated_lines,
2418 lex_source_push_endcmd__ (src);
2424 lex_set_message_handler (struct lexer *lexer,
2425 void (*output_msg) (const struct msg *,
2428 struct msg_handler msg_handler = {
2429 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2431 .lex_source_ref = lex_source_ref,
2432 .lex_source_unref = lex_source_unref,
2433 .lex_source_get_line = lex_source_get_line,
2435 msg_set_handler (&msg_handler);
2439 lex_source_ref (const struct lex_source *src_)
2441 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2444 assert (src->n_refs > 0);
2451 lex_source_unref (struct lex_source *src)
2456 assert (src->n_refs > 0);
2457 if (--src->n_refs > 0)
2460 char *file_name = src->reader->file_name;
2461 char *encoding = src->reader->encoding;
2462 if (src->reader->class->destroy != NULL)
2463 src->reader->class->destroy (src->reader);
2468 lex_stage_uninit (&src->pp);
2469 lex_stage_uninit (&src->merge);
2470 lex_source_clear_parse (src);
2475 struct lex_file_reader
2477 struct lex_reader reader;
2478 struct u8_istream *istream;
2481 static struct lex_reader_class lex_file_reader_class;
2483 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2484 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2485 ENCODING, which should take one of the forms accepted by
2486 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2487 mode of the new reader, respectively.
2489 Returns a null pointer if FILE_NAME cannot be opened. */
2491 lex_reader_for_file (const char *file_name, const char *encoding,
2492 enum segmenter_mode syntax,
2493 enum lex_error_mode error)
2495 struct lex_file_reader *r;
2496 struct u8_istream *istream;
2498 istream = (!strcmp(file_name, "-")
2499 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2500 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2501 if (istream == NULL)
2503 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2507 r = xmalloc (sizeof *r);
2508 lex_reader_init (&r->reader, &lex_file_reader_class);
2509 r->reader.syntax = syntax;
2510 r->reader.error = error;
2511 r->reader.file_name = xstrdup (file_name);
2512 r->reader.encoding = xstrdup_if_nonnull (encoding);
2513 r->reader.line_number = 1;
2514 r->istream = istream;
2519 static struct lex_file_reader *
2520 lex_file_reader_cast (struct lex_reader *r)
2522 return UP_CAST (r, struct lex_file_reader, reader);
2526 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2527 enum prompt_style prompt_style UNUSED)
2529 struct lex_file_reader *r = lex_file_reader_cast (r_);
2530 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2533 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2540 lex_file_close (struct lex_reader *r_)
2542 struct lex_file_reader *r = lex_file_reader_cast (r_);
2544 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2546 if (u8_istream_close (r->istream) != 0)
2547 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2550 u8_istream_free (r->istream);
2555 static struct lex_reader_class lex_file_reader_class =
2561 struct lex_string_reader
2563 struct lex_reader reader;
2568 static struct lex_reader_class lex_string_reader_class;
2570 /* Creates and returns a new lex_reader for the contents of S, which must be
2571 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2572 with ss_dealloc() when it is closed. */
2574 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2576 struct lex_string_reader *r;
2578 r = xmalloc (sizeof *r);
2579 lex_reader_init (&r->reader, &lex_string_reader_class);
2580 r->reader.syntax = SEG_MODE_AUTO;
2581 r->reader.encoding = xstrdup_if_nonnull (encoding);
2588 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2589 which must be encoded in ENCODING. The caller retains ownership of S. */
2591 lex_reader_for_string (const char *s, const char *encoding)
2593 struct substring ss;
2594 ss_alloc_substring (&ss, ss_cstr (s));
2595 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT, a printf()-like format string, with the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the
   formatted result, which is taken to be encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The reader takes ownership of TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2613 static struct lex_string_reader *
2614 lex_string_reader_cast (struct lex_reader *r)
2616 return UP_CAST (r, struct lex_string_reader, reader);
2620 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2621 enum prompt_style prompt_style UNUSED)
2623 struct lex_string_reader *r = lex_string_reader_cast (r_);
2626 chunk = MIN (n, r->s.length - r->offset);
2627 memcpy (buf, r->s.string + r->offset, chunk);
2634 lex_string_close (struct lex_reader *r_)
2636 struct lex_string_reader *r = lex_string_reader_cast (r_);
2642 static struct lex_reader_class lex_string_reader_class =
2649 lex_source_get_line (const struct lex_source *src, int line)
2651 if (line < 1 || line > src->n_lines)
2654 size_t ofs = src->lines[line - 1];
2656 if (line < src->n_lines)
2657 end = src->lines[line];
2660 const char *newline = memchr (src->buffer + ofs, '\n', src->length - ofs);
2661 end = newline ? newline - src->buffer : src->length;
2663 return ss_buffer (&src->buffer[ofs], end - ofs);