1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
83 static struct msg_point lex_token_start_point (const struct lex_source *,
84 const struct lex_token *);
85 static struct msg_point lex_token_end_point (const struct lex_source *,
86 const struct lex_token *);
88 /* Source offset of the last byte in TOKEN. */
90 lex_token_end (const struct lex_token *token)
92 return token->token_pos + MAX (token->token_len, 1) - 1;
96 lex_token_destroy (struct lex_token *t)
98 token_uninit (&t->token);
101 assert (*t->ref_cnt > 0);
111 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
116 struct lex_token **tokens;
119 static void lex_stage_clear (struct lex_stage *);
120 static void lex_stage_uninit (struct lex_stage *);
122 static size_t lex_stage_count (const struct lex_stage *);
123 static bool lex_stage_is_empty (const struct lex_stage *);
125 static struct lex_token *lex_stage_first (struct lex_stage *);
126 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
128 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
129 static void lex_stage_pop_first (struct lex_stage *);
131 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
134 /* Deletes all the tokens from STAGE. */
136 lex_stage_clear (struct lex_stage *stage)
138 while (!deque_is_empty (&stage->deque))
139 lex_stage_pop_first (stage);
142 /* Deletes all the tokens from STAGE and frees storage for the deque. */
144 lex_stage_uninit (struct lex_stage *stage)
146 lex_stage_clear (stage);
147 free (stage->tokens);
150 /* Returns true if STAGE contains no tokens, otherwise false. */
152 lex_stage_is_empty (const struct lex_stage *stage)
154 return deque_is_empty (&stage->deque);
157 /* Returns the number of tokens in STAGE. */
159 lex_stage_count (const struct lex_stage *stage)
161 return deque_count (&stage->deque);
/* Returns the token at the head of STAGE, that is, the one reached with the
   least lookahead.  STAGE must not be empty. */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  const size_t head = 0;
  return lex_stage_nth (stage, head);
}
172 /* Returns the token the given INDEX in STAGE. The first token (with the least
173 lookahead) is 0, the second token is 1, and so on. There must be at least
174 INDEX + 1 tokens in STAGE. */
175 static struct lex_token *
176 lex_stage_nth (struct lex_stage *stage, size_t index)
178 return stage->tokens[deque_back (&stage->deque, index)];
181 /* Adds TOKEN so that it becomes the last token in STAGE. */
183 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
185 if (deque_is_full (&stage->deque))
186 stage->tokens = deque_expand (&stage->deque, stage->tokens,
187 sizeof *stage->tokens);
188 stage->tokens[deque_push_front (&stage->deque)] = token;
191 /* Removes and returns the first token from STAGE. */
192 static struct lex_token *
193 lex_stage_take_first (struct lex_stage *stage)
195 return stage->tokens[deque_pop_back (&stage->deque)];
/* Detaches the first token from STAGE and destroys it. */
static void
lex_stage_pop_first (struct lex_stage *stage)
{
  struct lex_token *first = lex_stage_take_first (stage);
  lex_token_destroy (first);
}
/* Moves the first N tokens of SRC onto the end of DST, one at a time, so that
   their relative order is preserved. */
static void
lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
{
  while (n-- > 0)
    {
      struct lex_token *moved = lex_stage_take_first (src);
      lex_stage_push_last (dst, moved);
    }
}
214 /* A source of tokens, corresponding to a syntax file.
216 This is conceptually a lex_reader wrapped with everything needed to convert
217 its UTF-8 bytes into tokens. */
220 struct ll ll; /* In lexer's list of sources. */
224 - One for struct lexer.
226 - One for each struct msg_location that references this source. */
229 struct lex_reader *reader;
231 struct segmenter segmenter;
232 bool eof; /* True if T_STOP was read from 'reader'. */
234 /* Buffer of UTF-8 bytes. */
235 char *buffer; /* Source file contents. */
236 size_t length; /* Number of bytes filled. */
237 size_t allocated; /* Number of bytes allocated. */
239 /* Offsets into 'buffer'. */
240 size_t journal_pos; /* First byte not yet output to journal. */
241 size_t seg_pos; /* First byte not yet scanned as token. */
243 /* Offset into 'buffer' of starts of lines. */
245 size_t n_lines, allocated_lines;
247 bool suppress_next_newline;
251 This is a pipeline with the following stages. Each token eventually
252 made available to the parser passes through of these stages. The stages
253 are named after the processing that happens in each one.
255 Initially, tokens come from the segmenter and scanner to 'pp':
257 - pp: Tokens that need to pass through the macro preprocessor to end up
260 - merge: Tokens that need to pass through scan_merge() to end up in
263 - parse: Tokens available to the client for parsing.
265 'pp' and 'merge' store tokens only temporarily until they pass into
266 'parse'. Tokens then live in 'parse' until the command is fully
267 consumed, at which time they are freed together. */
269 struct lex_stage merge;
270 struct lex_token **parse;
271 size_t n_parse, allocated_parse, parse_ofs;
274 static struct lex_source *lex_source_create (struct lexer *,
275 struct lex_reader *);
280 struct ll_list sources; /* Contains "struct lex_source"s. */
281 struct macro_set *macros;
284 static struct lex_source *lex_source__ (const struct lexer *);
285 static char *lex_source_syntax__ (const struct lex_source *,
287 static const struct lex_token *lex_next__ (const struct lexer *, int n);
288 static void lex_source_push_endcmd__ (struct lex_source *);
289 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
290 static void lex_source_clear_parse (struct lex_source *);
292 static bool lex_source_get_parse (struct lex_source *);
293 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
294 const char *format, va_list)
295 PRINTF_FORMAT (4, 0);
296 static const struct lex_token *lex_source_next__ (const struct lex_source *,
299 /* Initializes READER with the specified CLASS and otherwise some reasonable
300 defaults. The caller should fill in the others members as desired. */
302 lex_reader_init (struct lex_reader *reader,
303 const struct lex_reader_class *class)
305 reader->class = class;
306 reader->syntax = SEG_MODE_AUTO;
307 reader->error = LEX_ERROR_CONTINUE;
308 reader->file_name = NULL;
309 reader->encoding = NULL;
310 reader->line_number = 0;
314 /* Frees any file name already in READER and replaces it by a copy of
315 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
317 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
319 free (reader->file_name);
320 reader->file_name = xstrdup_if_nonnull (file_name);
323 /* Creates and returns a new lexer. */
327 struct lexer *lexer = xmalloc (sizeof *lexer);
328 *lexer = (struct lexer) {
329 .sources = LL_INITIALIZER (lexer->sources),
330 .macros = macro_set_create (),
335 /* Destroys LEXER. */
337 lex_destroy (struct lexer *lexer)
341 struct lex_source *source, *next;
343 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
345 ll_remove (&source->ll);
346 lex_source_unref (source);
348 macro_set_destroy (lexer->macros);
353 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
354 same name. Takes ownership of M. */
356 lex_define_macro (struct lexer *lexer, struct macro *m)
358 macro_set_add (lexer->macros, m);
361 /* Inserts READER into LEXER so that the next token read by LEXER comes from
362 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
365 lex_include (struct lexer *lexer, struct lex_reader *reader)
367 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
368 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
371 /* Appends READER to LEXER, so that it will be read after all other current
372 readers have already been read. */
374 lex_append (struct lexer *lexer, struct lex_reader *reader)
376 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
381 /* Advances LEXER to the next token, consuming the current token. */
383 lex_get (struct lexer *lexer)
385 struct lex_source *src;
387 src = lex_source__ (lexer);
391 if (src->parse_ofs < src->n_parse)
393 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
394 lex_source_clear_parse (src);
399 while (src->parse_ofs == src->n_parse)
400 if (!lex_source_get_parse (src))
402 ll_remove (&src->ll);
403 lex_source_unref (src);
404 src = lex_source__ (lexer);
/* Advances LEXER by N tokens, as if by calling lex_get() N times. */
void
lex_get_n (struct lexer *lexer, size_t n)
{
  for (size_t i = 0; i < n; i++)
    lex_get (lexer);
}
418 /* Issuing errors. */
/* Reports a syntax error at LEXER's current token.  FORMAT, if nonnull, is a
   printf-style format string that, together with the arguments after it,
   supplies the message text. */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  /* Offsets 0, 0 select just the current token. */
  lex_next_error_valist (lexer, 0, 0, format, ap);
  va_end (ap);
}
/* va_list counterpart of lex_error(): reports a syntax error at LEXER's
   current token, taking the message from FORMAT (if nonnull) and ARGS. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  const int first = 0, last = 0;
  lex_next_error_valist (lexer, first, last, format, args);
}
/* Reports a syntax error for the tokens identified by lookahead offsets N0
   and N1, which are passed unchanged to lex_next_error_valist().  FORMAT, if
   nonnull, is a printf-style format string that, together with the arguments
   after it, supplies the message text. */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, n0, n1, format, ap);
  va_end (ap);
}
/* Reports a syntax error stating that one of the strings passed as variable
   arguments was expected.  The list of strings ends at the first NULL.

   The name is parenthesized in the definition so that a function-like macro
   of the same name is not expanded here (presumably the header wraps this
   function in one that appends the terminating NULL). */
void
(lex_error_expecting) (struct lexer *lexer, ...)
{
  va_list options;

  va_start (options, lexer);
  lex_error_expecting_valist (lexer, options);
  va_end (options);
}
/* Reports a syntax error stating that one of the options in ARGS was
   expected.  ARGS is a sequence of "const char *" values that ends at the
   first NULL.

   Every option is collected, however many there are, so that a list too long
   for lex_error_expecting_array() to spell out produces its generic error
   rather than a truncated list that looks complete. */
void
lex_error_expecting_valist (struct lexer *lexer, va_list args)
{
  const char **options = NULL;
  size_t allocated = 0;
  size_t n = 0;

  for (;;)
    {
      const char *option = va_arg (args, const char *);
      if (option == NULL)
        break;

      if (n >= allocated)
        options = x2nrealloc (options, &allocated, sizeof *options);
      options[n++] = option;
    }

  lex_error_expecting_array (lexer, options, n);
  free (options);
}
/* Reports a syntax error stating that one of the N strings in OPTIONS was
   expected.  Lists of 1 to 9 options are spelled out in full.  An empty list,
   or one with more than 9 options, yields a syntax error without any
   "expecting" text.

   Each list length has its own complete format string so that translators
   can reorder and punctuate the whole sentence. */
void
lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
{
  const char **o = options;

  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      return;

    case 1:
      lex_error (lexer, _("expecting %s"), o[0]);
      return;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), o[0], o[1]);
      return;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), o[0], o[1], o[2]);
      return;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3]);
      return;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3], o[4]);
      return;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3], o[4], o[5]);
      return;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3], o[4], o[5], o[6]);
      return;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3], o[4], o[5], o[6], o[7]);
      return;

    case 9:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, %s, or %s"),
                 o[0], o[1], o[2], o[3], o[4], o[5], o[6], o[7], o[8]);
      return;

    default:
      lex_error (lexer, NULL);
      return;
    }
}
544 /* Reports an error to the effect that subcommand SBC may only be specified
547 This function does not take a lexer as an argument or use lex_error(),
548 because the result would ordinarily just be redundant: "Syntax error at
549 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
550 not help the user find the error. */
552 lex_sbc_only_once (const char *sbc)
554 msg (SE, _("Subcommand %s may only be specified once."), sbc);
557 /* Reports an error to the effect that subcommand SBC is missing.
559 This function does not take a lexer as an argument or use lex_error(),
560 because a missing subcommand can normally be detected only after the whole
561 command has been parsed, and so lex_error() would always report "Syntax
562 error at end of command", which does not help the user find the error. */
564 lex_sbc_missing (const char *sbc)
566 msg (SE, _("Required subcommand %s was not specified."), sbc);
/* Reports, as a syntax error at LEXER's current token, that specification
   SPEC may appear only once within subcommand SBC. */
void
lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
{
  /* The format names the specification first and the subcommand second. */
  lex_error (lexer, _("%s may only be specified once within subcommand %s"),
             spec, sbc);
}
/* Reports, as a syntax error at LEXER's current token, that required
   specification SPEC is missing from subcommand SBC. */
void
lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
{
  /* The format names the specification first and the subcommand second. */
  lex_error (lexer, _("Required %s specification missing from %s subcommand"),
             spec, sbc);
}
587 /* Prints a syntax error message containing the current token and
588 given message MESSAGE (if non-null). */
590 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
591 const char *format, va_list args)
593 struct lex_source *src = lex_source__ (lexer);
596 lex_source_error_valist (src, n0, n1, format, args);
602 ds_put_format (&s, _("Syntax error at end of input"));
605 ds_put_cstr (&s, ": ");
606 ds_put_vformat (&s, format, args);
608 if (ds_last (&s) != '.')
609 ds_put_byte (&s, '.');
610 msg (SE, "%s", ds_cstr (&s));
615 /* Checks that we're at end of command.
616 If so, returns a successful command completion code.
617 If not, flags a syntax error and returns an error command
620 lex_end_of_command (struct lexer *lexer)
622 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
624 lex_error (lexer, _("expecting end of command"));
631 /* Token testing functions. */
/* Reports whether LEXER's current token is a number. */
bool
lex_is_number (const struct lexer *lexer)
{
  const int current = 0;
  return lex_next_is_number (lexer, current);
}
/* Reports whether LEXER's current token is a string. */
bool
lex_is_string (const struct lexer *lexer)
{
  const int current = 0;
  return lex_next_is_string (lexer, current);
}
/* Returns the numeric value of LEXER's current token.  The current token
   must be a floating point number. */
double
lex_number (const struct lexer *lexer)
{
  const int current = 0;
  return lex_next_number (lexer, current);
}
/* Reports whether LEXER's current token is an integer. */
bool
lex_is_integer (const struct lexer *lexer)
{
  const int current = 0;
  return lex_next_is_integer (lexer, current);
}
/* Returns the integer value of LEXER's current token.  The current token
   must be an integer. */
long
lex_integer (const struct lexer *lexer)
{
  const int current = 0;
  return lex_next_integer (lexer, current);
}
670 /* Token testing functions with lookahead.
672 A value of 0 for N as an argument to any of these functions refers to the
673 current token. Lookahead is limited to the current command. Any N greater
674 than the number of tokens remaining in the current command will be treated
675 as referring to a T_ENDCMD token. */
/* Reports whether the token N positions after LEXER's current token is a
   number. */
bool
lex_next_is_number (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_is_number (t);
}
/* Reports whether the token N positions after LEXER's current token is a
   string. */
bool
lex_next_is_string (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_is_string (t);
}
/* Returns the numeric value of the token N positions after LEXER's current
   token.  That token must be a floating point number. */
double
lex_next_number (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
/* Reports whether the token N positions after LEXER's current token is an
   integer. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_is_integer (t);
}
/* Returns the integer value of the token N positions after LEXER's current
   token.  That token must be an integer. */
long
lex_next_integer (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_integer (t);
}
714 /* Token matching functions. */
716 /* If the current token has the specified TYPE, skips it and returns true.
717 Otherwise, returns false. */
719 lex_match (struct lexer *lexer, enum token_type type)
721 if (lex_token (lexer) == type)
/* Consumes LEXER's current token and returns true if it matches IDENTIFIER,
   which the user may abbreviate to its first three letters.  Otherwise
   returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  enum { MIN_ABBREV = 3 };
  return lex_match_id_n (lexer, identifier, MIN_ABBREV);
}
741 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
742 may be abbreviated to its first N letters. Otherwise, returns false.
744 IDENTIFIER must be an ASCII string. */
746 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
748 if (lex_token (lexer) == T_ID
749 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
/* Consumes LEXER's current token and returns true if it is the integer X.
   Otherwise leaves LEXER alone and returns false. */
bool
lex_match_int (struct lexer *lexer, int x)
{
  if (!lex_is_integer (lexer) || lex_integer (lexer) != x)
    return false;

  lex_get (lexer);
  return true;
}
772 /* Forced matches. */
/* Consumes LEXER's current token and returns true if it matches IDENTIFIER,
   which the user may abbreviate to its first 3 letters.  Otherwise reports
   that IDENTIFIER was expected and returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_force_match_id (struct lexer *lexer, const char *identifier)
{
  if (!lex_match_id (lexer, identifier))
    {
      lex_error_expecting (lexer, identifier);
      return false;
    }
  return true;
}
791 /* If the current token has the specified TYPE, skips it and returns true.
792 Otherwise, reports an error and returns false. */
794 lex_force_match (struct lexer *lexer, enum token_type type)
796 if (lex_token (lexer) == type)
803 const char *type_string = token_type_to_string (type);
806 char *s = xasprintf ("`%s'", type_string);
807 lex_error_expecting (lexer, s);
811 lex_error_expecting (lexer, token_type_to_name (type));
/* Returns true if LEXER's current token is a string, without consuming it.
   Otherwise reports a syntax error and returns false. */
bool
lex_force_string (struct lexer *lexer)
{
  if (!lex_is_string (lexer))
    {
      lex_error (lexer, _("expecting string"));
      return false;
    }
  return true;
}
831 /* If the current token is a string or an identifier, does nothing and returns
832 true. Otherwise, reports an error and returns false.
834 This is meant for use in syntactic situations where we want to encourage the
835 user to supply a quoted string, but for compatibility we also accept
836 identifiers. (One example of such a situation is file names.) Therefore,
837 the error message issued when the current token is wrong only says that a
838 string is expected and doesn't mention that an identifier would also be
841 lex_force_string_or_id (struct lexer *lexer)
843 return lex_token (lexer) == T_ID || lex_force_string (lexer);
/* Returns true if LEXER's current token is an integer, without consuming it.
   Otherwise reports a syntax error and returns false. */
bool
lex_force_int (struct lexer *lexer)
{
  if (!lex_is_integer (lexer))
    {
      lex_error (lexer, _("expecting integer"));
      return false;
    }
  return true;
}
/* Returns true if LEXER's current token is an integer in the inclusive range
   MIN...MAX, without consuming it.  Otherwise reports a syntax error and
   returns false.  If NAME is nonnull, the error message mentions it.

   The message is tailored to the range: a single value, a pair of adjacent
   values, a two-sided range, or a one-sided bound.  A bound is mentioned
   only if it is not extreme (more than halfway to INT_MIN or INT_MAX) or if
   it is the one the token actually violated. */
bool
lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
{
  bool is_number = lex_is_number (lexer);
  bool is_integer = lex_is_integer (lexer);

  bool too_small = false;
  bool too_big = false;
  if (is_integer)
    {
      too_small = lex_integer (lexer) < min;
      too_big = lex_integer (lexer) > max;
    }
  else if (is_number)
    {
      too_small = lex_number (lexer) < min;
      too_big = lex_number (lexer) > max;
    }

  if (is_integer && !too_small && !too_big)
    return true;

  if (min > max)
    {
      /* An empty range probably indicates a bug in the caller.  Just say
         that an integer was needed. */
      if (name)
        lex_error (lexer, _("Integer expected for %s."), name);
      else
        lex_error (lexer, _("Integer expected."));
      return false;
    }

  if (min == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld for %s."), min, name);
      else
        lex_error (lexer, _("Expected %ld."), min);
      return false;
    }

  if (min + 1 == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
      else
        lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
      return false;
    }

  bool report_lower_bound = (min > INT_MIN / 2) || too_small;
  bool report_upper_bound = (max < INT_MAX / 2) || too_big;

  if (report_lower_bound && report_upper_bound)
    {
      if (name)
        lex_error (lexer,
                   _("Expected integer between %ld and %ld for %s."),
                   min, max, name);
      else
        lex_error (lexer, _("Expected integer between %ld and %ld."),
                   min, max);
    }
  else if (report_lower_bound)
    {
      if (min == 0)
        {
          if (name)
            lex_error (lexer, _("Expected non-negative integer for %s."),
                       name);
          else
            lex_error (lexer, _("Expected non-negative integer."));
        }
      else if (min == 1)
        {
          if (name)
            lex_error (lexer, _("Expected positive integer for %s."),
                       name);
          else
            lex_error (lexer, _("Expected positive integer."));
        }
      else
        {
          if (name)
            lex_error (lexer, _("Expected integer %ld or greater for %s."),
                       min, name);
          else
            lex_error (lexer, _("Expected integer %ld or greater."), min);
        }
    }
  else if (report_upper_bound)
    {
      if (name)
        lex_error (lexer,
                   _("Expected integer less than or equal to %ld for %s."),
                   max, name);
      else
        lex_error (lexer, _("Expected integer less than or equal to %ld."),
                   max);
    }
  else
    {
      if (name)
        lex_error (lexer, _("Integer expected for %s."), name);
      else
        lex_error (lexer, _("Integer expected."));
    }
  return false;
}
/* Returns true if LEXER's current token is a number, without consuming it.
   Otherwise reports a syntax error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  if (!lex_is_number (lexer))
    {
      lex_error (lexer, _("expecting number"));
      return false;
    }
  return true;
}
/* Returns true if LEXER's current token is a number in the closed range
   [MIN,MAX], without consuming it.  Otherwise reports a syntax error and
   returns false.  If NAME is nonnull, the error message mentions it.

   A bound is mentioned in the message only if it is finite (strictly inside
   -DBL_MAX...DBL_MAX) or if it is the one the token actually violated. */
bool
lex_force_num_range_closed (struct lexer *lexer, const char *name,
                            double min, double max)
{
  bool is_number = lex_is_number (lexer);
  bool too_small = is_number && lex_number (lexer) < min;
  bool too_big = is_number && lex_number (lexer) > max;
  if (is_number && !too_small && !too_big)
    return true;

  if (min > max)
    {
      /* An empty range probably indicates a bug in the caller.  Just say
         that a number was needed. */
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
      return false;
    }

  if (min == max)
    {
      if (name)
        lex_error (lexer, _("Expected %g for %s."), min, name);
      else
        lex_error (lexer, _("Expected %g."), min);
      return false;
    }

  bool report_lower_bound = min > -DBL_MAX || too_small;
  bool report_upper_bound = max < DBL_MAX || too_big;

  if (report_lower_bound && report_upper_bound)
    {
      if (name)
        lex_error (lexer,
                   _("Expected number between %g and %g for %s."),
                   min, max, name);
      else
        lex_error (lexer, _("Expected number between %g and %g."),
                   min, max);
    }
  else if (report_lower_bound)
    {
      if (min == 0)
        {
          if (name)
            lex_error (lexer, _("Expected non-negative number for %s."),
                       name);
          else
            lex_error (lexer, _("Expected non-negative number."));
        }
      else
        {
          if (name)
            lex_error (lexer, _("Expected number %g or greater for %s."),
                       min, name);
          else
            lex_error (lexer, _("Expected number %g or greater."), min);
        }
    }
  else if (report_upper_bound)
    {
      if (name)
        lex_error (lexer,
                   _("Expected number less than or equal to %g for %s."),
                   max, name);
      else
        lex_error (lexer, _("Expected number less than or equal to %g."),
                   max);
    }
  else
    {
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
    }
  return false;
}
/* Returns true if LEXER's current token is a number in the half-open range
   [MIN,MAX), without consuming it.  Otherwise reports a syntax error and
   returns false.  If NAME is nonnull, the error message mentions it.

   A bound is mentioned in the message only if it is finite (strictly inside
   -DBL_MAX...DBL_MAX) or if it is the one the token actually violated. */
bool
lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
                              double min, double max)
{
  bool is_number = lex_is_number (lexer);
  bool too_small = is_number && lex_number (lexer) < min;
  bool too_big = is_number && lex_number (lexer) >= max;
  if (is_number && !too_small && !too_big)
    return true;

  if (min >= max)
    {
      /* An empty range probably indicates a bug in the caller.  Just say
         that a number was needed. */
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
      return false;
    }

  bool report_lower_bound = min > -DBL_MAX || too_small;
  bool report_upper_bound = max < DBL_MAX || too_big;

  if (report_lower_bound && report_upper_bound)
    {
      if (name)
        lex_error (lexer, _("Expected number in [%g,%g) for %s."),
                   min, max, name);
      else
        lex_error (lexer, _("Expected number in [%g,%g)."),
                   min, max);
    }
  else if (report_lower_bound)
    {
      if (min == 0)
        {
          if (name)
            lex_error (lexer, _("Expected non-negative number for %s."),
                       name);
          else
            lex_error (lexer, _("Expected non-negative number."));
        }
      else
        {
          if (name)
            lex_error (lexer, _("Expected number %g or greater for %s."),
                       min, name);
          else
            lex_error (lexer, _("Expected number %g or greater."), min);
        }
    }
  else if (report_upper_bound)
    {
      if (name)
        lex_error (lexer,
                   _("Expected number less than %g for %s."), max, name);
      else
        lex_error (lexer, _("Expected number less than %g."), max);
    }
  else
    {
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
    }
  return false;
}
/* Returns true if LEXER's current token is a number in the open range
   (MIN,MAX), that is, strictly greater than MIN and strictly less than MAX,
   without consuming it.  Otherwise reports a syntax error and returns false.
   If NAME is nonnull, the error message mentions it.

   A bound is mentioned in the message only if it is finite (strictly inside
   -DBL_MAX...DBL_MAX) or if it is the one the token actually violated. */
bool
lex_force_num_range_open (struct lexer *lexer, const char *name,
                          double min, double max)
{
  bool is_number = lex_is_number (lexer);
  bool too_small = is_number && lex_number (lexer) <= min;
  bool too_big = is_number && lex_number (lexer) >= max;
  if (is_number && !too_small && !too_big)
    return true;

  if (min >= max)
    {
      /* An empty range probably indicates a bug in the caller.  Just say
         that a number was needed. */
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
      return false;
    }

  bool report_lower_bound = min > -DBL_MAX || too_small;
  bool report_upper_bound = max < DBL_MAX || too_big;

  if (report_lower_bound && report_upper_bound)
    {
      if (name)
        lex_error (lexer, _("Expected number in (%g,%g) for %s."),
                   min, max, name);
      else
        lex_error (lexer, _("Expected number in (%g,%g)."), min, max);
    }
  else if (report_lower_bound)
    {
      if (min == 0)
        {
          if (name)
            lex_error (lexer, _("Expected positive number for %s."), name);
          else
            lex_error (lexer, _("Expected positive number."));
        }
      else
        {
          if (name)
            lex_error (lexer, _("Expected number greater than %g for %s."),
                       min, name);
          else
            lex_error (lexer, _("Expected number greater than %g."), min);
        }
    }
  else if (report_upper_bound)
    {
      if (name)
        lex_error (lexer, _("Expected number less than %g for %s."),
                   max, name);
      else
        lex_error (lexer, _("Expected number less than %g."), max);
    }
  else
    {
      if (name)
        lex_error (lexer, _("Number expected for %s."), name);
      else
        lex_error (lexer, _("Number expected."));
    }
  return false;
}
1205 /* If the current token is an identifier, does nothing and returns true.
1206 Otherwise, reports an error and returns false. */
1208 lex_force_id (struct lexer *lexer)
1210 if (lex_token (lexer) == T_ID)
1213 lex_error (lexer, _("expecting identifier"));
1217 /* Token accessors. */
1219 /* Returns the type of LEXER's current token. */
1221 lex_token (const struct lexer *lexer)
1223 return lex_next_token (lexer, 0);
/* Returns the numeric value of LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens carry a meaningful value.  For any
   other token this function returns zero. */
double
lex_tokval (const struct lexer *lexer)
{
  const int current = 0;
  return lex_next_tokval (lexer, current);
}
/* Returns the string in LEXER's current token as a null-terminated, UTF-8
   encoded C string.

   Only T_ID and T_STRING tokens carry a meaningful string.  For any other
   token this function returns NULL.

   The UTF-8 form is directly usable for variable names and other
   identifiers.  Pass it through filename_to_utf8() before using it as a file
   name, or through data_in() before storing it in a "union value". */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  const int current = 0;
  return lex_next_tokcstr (lexer, current);
}
1250 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1251 null-terminated (but the null terminator is not included in the returned
1252 substring's 'length').
1254 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1255 this functions this function will always return NULL.
1257 The UTF-8 encoding of the returned string is correct for variable names and
1258 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1259 data_in() to use it in a "union value". */
1261 lex_tokss (const struct lexer *lexer)
1263 return lex_next_tokss (lexer, 0);
/* Token accessors with lookahead.

   A value of 0 for N as an argument to any of these functions refers to the
   current token.  Lookahead is limited to the current command.  Any N greater
   than the number of tokens remaining in the current command will be treated
   as referring to a T_ENDCMD token. */
1273 static const struct lex_token *
1274 lex_next__ (const struct lexer *lexer_, int n)
1276 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1277 struct lex_source *src = lex_source__ (lexer);
1280 return lex_source_next__ (src, n);
1283 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1288 static const struct lex_token *
1289 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1291 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1295 static const struct lex_token endcmd_token
1296 = { .token = { .type = T_ENDCMD } };
1297 return &endcmd_token;
1300 while (ofs >= src->n_parse)
1302 if (src->n_parse > 0)
1304 const struct lex_token *t = src->parse[src->n_parse - 1];
1305 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1309 lex_source_get_parse (src);
1312 return src->parse[ofs];
1315 static const struct lex_token *
1316 lex_source_next__ (const struct lex_source *src, int n)
1318 return lex_source_ofs__ (src, n + src->parse_ofs);
1321 /* Returns the "struct token" of the token N after the current one in LEXER.
1322 The returned pointer can be invalidated by pretty much any succeeding call
1323 into the lexer, although the string pointer within the returned token is
1324 only invalidated by consuming the token (e.g. with lex_get()). */
1325 const struct token *
1326 lex_next (const struct lexer *lexer, int n)
1328 return &lex_next__ (lexer, n)->token;
1331 /* Returns the type of the token N after the current one in LEXER. */
1333 lex_next_token (const struct lexer *lexer, int n)
1335 return lex_next (lexer, n)->type;
/* Returns the numeric value of the token N positions after LEXER's current
   token.

   Only T_NEG_NUM and T_POS_NUM tokens carry a meaningful value.  For any
   other token this function returns zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1348 /* Returns the null-terminated string in the token N after the current one, in
1351 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1352 this function will always return NULL.
1354 The UTF-8 encoding of the returned string is correct for variable names and
1355 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1356 data_in() to use it in a "union value". */
1358 lex_next_tokcstr (const struct lexer *lexer, int n)
1360 return lex_next_tokss (lexer, n).string;
1363 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1364 The string is null-terminated (but the null terminator is not included in
1365 the returned substring's 'length').
1367 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1368 tokens this function will always return NULL.
1370 The UTF-8 encoding of the returned string is correct for variable names and
1371 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1372 data_in() to use it in a "union value". */
1374 lex_next_tokss (const struct lexer *lexer, int n)
1376 return lex_next (lexer, n)->string;
1379 /* Returns the offset of the current token within the command being parsed in
1380 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1381 on. The return value is useful later for referring to this token in calls
1384 lex_ofs (const struct lexer *lexer)
1386 struct lex_source *src = lex_source__ (lexer);
1387 return src ? src->parse_ofs : 0;
1390 /* Returns the token within LEXER's current command with offset OFS. Use
1391 lex_ofs() to find out the offset of the current token. */
1392 const struct token *
1393 lex_ofs_token (const struct lexer *lexer_, int ofs)
1395 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1396 struct lex_source *src = lex_source__ (lexer);
1399 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1402 static const struct token stop_token = { .type = T_STOP };
1407 /* Allocates and returns a new struct msg_location that spans tokens with
1408 offsets OFS0 through OFS1, inclusive, within the current command in
1409 LEXER. See lex_ofs() for an explanation of token offsets.
1411 The caller owns and must eventually free the returned object. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() expects positions relative to the current token. */
  int current = lex_ofs (lexer);
  int n0 = ofs0 - current;
  int n1 = ofs1 - current;
  return lex_get_location (lexer, n0, n1);
}
1419 /* Returns a msg_point for the first character in the token with offset OFS,
1420 where offset 0 is the first token in the command currently being parsed, 1
1421 the second token, and so on. These are absolute offsets, not relative to
1422 the token currently being parsed within the command.
1424 Returns zeros for a T_STOP token.
1427 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1429 const struct lex_source *src = lex_source__ (lexer);
1431 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1432 : (struct msg_point) { 0, 0 });
1435 /* Returns a msg_point for the last character, inclusive, in the token with
1436 offset OFS, where offset 0 is the first token in the command currently being
1437 parsed, 1 the second token, and so on. These are absolute offsets, not
1438 relative to the token currently being parsed within the command.
1440 Returns zeros for a T_STOP token.
1442 Most of the time, a single token is wholly within a single line of syntax,
1443 so that the start and end point for a given offset have the same line
1444 number. There are two exceptions: a T_STRING token can be made up of
1445 multiple segments on adjacent lines connected with "+" punctuators, and a
1446 T_NEG_NUM token can consist of a "-" on one line followed by the number on
1450 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1452 const struct lex_source *src = lex_source__ (lexer);
1454 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1455 : (struct msg_point) { 0, 0 });
1458 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1459 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1460 are both zero, this requests the syntax for the current token.)
1462 The caller must eventually free the returned string (with free()). The
1463 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1464 that, for example, it may include comments, spaces, and new-lines if it
1465 spans multiple tokens. Macro expansion, however, has already been
1468 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1470 const struct lex_source *src = lex_source__ (lexer);
1472 ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
/* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
   inclusive.  (For example, if OFS0 and OFS1 are both zero, this requests the
   syntax for the first token in the current command.)

   The caller must eventually free the returned string (with free()).  The
   syntax is encoded in UTF-8 and in the original form supplied to the lexer so
   that, for example, it may include comments, spaces, and new-lines if it
   spans multiple tokens.  Macro expansion, however, has already been
   performed.

   If LEXER has no source, the result is a newly allocated empty string. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *source = lex_source__ (lexer);
  if (source == NULL)
    return xstrdup ("");

  return lex_source_syntax__ (source, ofs0, ofs1);
}
1493 /* Returns true if the token N ahead of the current one was produced by macro
1494 expansion, false otherwise. */
1496 lex_next_is_from_macro (const struct lexer *lexer, int n)
1498 return lex_next__ (lexer, n)->macro_rep != NULL;
1502 lex_tokens_match (const struct token *actual, const struct token *expected)
1504 if (actual->type != expected->type)
1507 switch (actual->type)
1511 return actual->number == expected->number;
1514 return lex_id_match (expected->string, actual->string);
1517 return (actual->string.length == expected->string.length
1518 && !memcmp (actual->string.string, expected->string.string,
1519 actual->string.length));
1527 lex_at_phrase__ (struct lexer *lexer, const char *s)
1529 struct string_lexer slex;
1533 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1534 while (string_lexer_next (&slex, &token))
1536 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1537 token_uninit (&token);
1544 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1545 returns true. Otherwise, returns false.
1547 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1548 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1549 first three letters. */
1551 lex_at_phrase (struct lexer *lexer, const char *s)
1553 return lex_at_phrase__ (lexer, s) > 0;
1556 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1557 skips it and returns true. Otherwise, returns false.
1559 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1560 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1561 first three letters. */
1563 lex_match_phrase (struct lexer *lexer, const char *s)
1565 size_t n = lex_at_phrase__ (lexer, s);
1567 lex_get_n (lexer, n);
1571 /* Returns the 1-based line number of the source text at the byte OFFSET in
1574 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1577 size_t hi = src->n_lines;
1580 size_t mid = (lo + hi) / 2;
1581 if (mid + 1 >= src->n_lines)
1582 return src->n_lines;
1583 else if (offset >= src->lines[mid + 1])
1585 else if (offset < src->lines[mid])
1592 /* Returns the 1-based column number of the source text at the byte OFFSET in
1595 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1597 const char *newline = memrchr (src->buffer, '\n', offset);
1598 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1599 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1602 static struct msg_point
1603 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1605 return (struct msg_point) {
1606 .line = lex_source_ofs_to_line_number (src, offset),
1607 .column = lex_source_ofs_to_column_number (src, offset),
1611 static struct msg_point
1612 lex_token_start_point (const struct lex_source *src,
1613 const struct lex_token *token)
1615 return lex_source_ofs_to_point__ (src, token->token_pos);
1618 static struct msg_point
1619 lex_token_end_point (const struct lex_source *src,
1620 const struct lex_token *token)
1622 return lex_source_ofs_to_point__ (src, lex_token_end (token));
1625 static struct msg_location
1626 lex_token_location (const struct lex_source *src,
1627 const struct lex_token *t0,
1628 const struct lex_token *t1)
1630 return (struct msg_location) {
1631 .file_name = intern_new_if_nonnull (src->reader->file_name),
1632 .start = lex_token_start_point (src, t0),
1633 .end = lex_token_end_point (src, t1),
1637 static struct msg_location *
1638 lex_token_location_rw (const struct lex_source *src,
1639 const struct lex_token *t0,
1640 const struct lex_token *t1)
1642 struct msg_location location = lex_token_location (src, t0, t1);
1643 return msg_location_dup (&location);
/* Returns a newly allocated location that spans the tokens N0 through N1
   ahead of the current one in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1654 /* Returns the name of the syntax file from which the current command is drawn.
1655 Returns NULL for a T_STOP token or if the command's source does not have
1658 There is no version of this function that takes an N argument because
1659 lookahead only works to the end of a command and any given command is always
1660 within a single syntax file. */
1662 lex_get_file_name (const struct lexer *lexer)
1664 struct lex_source *src = lex_source__ (lexer);
1665 return src == NULL ? NULL : src->reader->file_name;
1668 /* Returns a newly allocated msg_location for the syntax that represents tokens
1669 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1670 must eventually free the location (with msg_location_destroy()). */
1671 struct msg_location *
1672 lex_get_location (const struct lexer *lexer, int n0, int n1)
1674 struct msg_location *loc = xmalloc (sizeof *loc);
1675 *loc = (struct msg_location) {
1676 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1677 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1678 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1679 .src = lex_source__ (lexer),
1681 lex_source_ref (loc->src);
1686 lex_get_encoding (const struct lexer *lexer)
1688 struct lex_source *src = lex_source__ (lexer);
1689 return src == NULL ? NULL : src->reader->encoding;
1692 /* Returns the syntax mode for the syntax file from which the current command
1693 is drawn. Returns SEG_MODE_AUTO if LEXER has no current source (for
1694 example, for a T_STOP token).
1696 There is no version of this function that takes an N argument because
1697 lookahead only works to the end of a command and any given command is always
1698 within a single syntax file. */
1700 lex_get_syntax_mode (const struct lexer *lexer)
1702 struct lex_source *src = lex_source__ (lexer);
1703 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1706 /* Returns the error mode for the syntax file from which the current command
1707 is drawn. Returns LEX_ERROR_TERMINAL if LEXER has no current source (for
1708 example, for a T_STOP token).
1710 There is no version of this function that takes an N argument because
1711 lookahead only works to the end of a command and any given command is always
1712 within a single syntax file. */
1714 lex_get_error_mode (const struct lexer *lexer)
1716 struct lex_source *src = lex_source__ (lexer);
1717 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1720 /* If the source that LEXER is currently reading has error mode
1721 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1722 token to be read comes directly from whatever is next read from the stream.
1724 It makes sense to call this function after encountering an error in a
1725 command entered on the console, because usually the user would prefer not to
1726 have cascading errors. */
1728 lex_interactive_reset (struct lexer *lexer)
1730 struct lex_source *src = lex_source__ (lexer);
1731 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1734 src->journal_pos = src->seg_pos = 0;
1736 src->suppress_next_newline = false;
1737 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1739 lex_stage_clear (&src->pp);
1740 lex_stage_clear (&src->merge);
1741 lex_source_clear_parse (src);
1742 lex_source_push_endcmd__ (src);
1746 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1748 lex_discard_rest_of_command (struct lexer *lexer)
1750 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1754 /* Discards all lookahead tokens in LEXER, then discards all input sources
1755 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1756 runs out of input sources. */
1758 lex_discard_noninteractive (struct lexer *lexer)
1760 struct lex_source *src = lex_source__ (lexer);
1764 lex_stage_clear (&src->pp);
1765 lex_stage_clear (&src->merge);
1766 lex_source_clear_parse (src);
1768 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1769 src = lex_source__ (lexer))
1771 ll_remove (&src->ll);
1772 lex_source_unref (src);
1778 lex_source_expand__ (struct lex_source *src)
1780 if (src->length >= src->allocated)
1781 src->buffer = x2realloc (src->buffer, &src->allocated);
1785 lex_source_read__ (struct lex_source *src)
1789 lex_source_expand__ (src);
1791 size_t space = src->allocated - src->length;
1792 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1793 size_t n = src->reader->class->read (src->reader,
1794 &src->buffer[src->length],
1796 assert (n <= space);
1801 src->reader->eof = true;
1807 while (!memchr (&src->buffer[src->seg_pos], '\n',
1808 src->length - src->seg_pos));
1811 static struct lex_source *
1812 lex_source__ (const struct lexer *lexer)
1814 return (ll_is_empty (&lexer->sources) ? NULL
1815 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1818 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1819 OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are
1820 both zero, this requests the syntax for the first token in the current
1821 command.) The caller must eventually free the returned string (with
1822 free()). The syntax is encoded in UTF-8 and in the original form supplied
1823 to the lexer so that, for example, it may include comments, spaces, and
1824 new-lines if it spans multiple tokens. Macro expansion, however, has
1825 already been performed. */
1827 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1829 struct string s = DS_EMPTY_INITIALIZER;
1830 for (size_t i = ofs0; i <= ofs1; )
1832 /* Find [I,J) as the longest sequence of tokens not produced by macro
1833 expansion, or otherwise the longest sequence expanded from a single
1835 const struct lex_token *first = lex_source_ofs__ (src, i);
1837 for (j = i + 1; j <= ofs1; j++)
1839 const struct lex_token *cur = lex_source_ofs__ (src, j);
1840 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1841 || first->macro_rep != cur->macro_rep)
1844 const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1846 /* Now add the syntax for this sequence of tokens to SRC. */
1847 if (!ds_is_empty (&s))
1848 ds_put_byte (&s, ' ');
1849 if (!first->macro_rep)
1851 size_t start = first->token_pos;
1852 size_t end = last->token_pos + last->token_len;
1853 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1857 size_t start = first->ofs;
1858 size_t end = last->ofs + last->len;
1859 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1865 return ds_steal_cstr (&s);
1869 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1871 for (int i = n0; i <= n1; i++)
1872 if (lex_source_next__ (src, i)->macro_rep)
1877 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1878 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1879 other tokens included in that range. The syntax is encoded in UTF-8 and in
1880 the original form supplied to the lexer so that, for example, it may include
1881 comments, spaces, and new-lines if it spans multiple tokens.
1883 Returns an empty string if the token range doesn't include a macro call.
1885 The caller must not modify or free the returned string. */
1886 static struct substring
1887 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1889 if (!lex_source_contains_macro_call (src, n0, n1))
1892 const struct lex_token *token0 = lex_source_next__ (src, n0);
1893 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1894 size_t start = token0->token_pos;
1895 size_t end = token1->token_pos + token1->token_len;
1897 return ss_buffer (&src->buffer[start], end - start);
1901 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1902 const char *format, va_list args)
1904 const struct lex_token *token;
1909 token = lex_source_next__ (src, n0);
1910 if (token->token.type == T_ENDCMD)
1911 ds_put_cstr (&s, _("Syntax error at end of command"));
1914 /* Get the syntax that caused the error. */
1915 char *raw_syntax = lex_source_syntax__ (src, n0 + src->parse_ofs,
1916 n1 + src->parse_ofs);
1918 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1921 /* Get the macro call(s) that expanded to the syntax that caused the
1924 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1931 _("Syntax error at `%s' (in expansion of `%s')"),
1934 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1939 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1942 ds_put_cstr (&s, _("Syntax error"));
1948 ds_put_cstr (&s, ": ");
1949 ds_put_vformat (&s, format, args);
1951 if (ds_last (&s) != '.')
1952 ds_put_byte (&s, '.');
1954 struct msg *m = xmalloc (sizeof *m);
1956 .category = MSG_C_SYNTAX,
1957 .severity = MSG_S_ERROR,
1958 .location = lex_source_get_location (src, n0, n1),
1959 .text = ds_steal_cstr (&s),
1965 lex_get_error (struct lex_source *src, const struct lex_token *token)
1968 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1969 syntax, sizeof syntax);
1971 struct string s = DS_EMPTY_INITIALIZER;
1972 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1973 ds_put_format (&s, ": %s", token->token.string.string);
1975 struct msg *m = xmalloc (sizeof *m);
1977 .category = MSG_C_SYNTAX,
1978 .severity = MSG_S_ERROR,
1979 .location = lex_token_location_rw (src, token, token),
1980 .text = ds_steal_cstr (&s),
1985 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1986 underlying lex_reader if necessary. Returns true if a new token was added
1987 to SRC's deque, false otherwise. The caller should retry failures unless
1988 SRC's 'eof' marker was set to true indicating that there will be no more
1989 tokens from this source. */
1991 lex_source_try_get_pp (struct lex_source *src)
1993 /* Append a new token to SRC and initialize it. */
1994 struct lex_token *token = xmalloc (sizeof *token);
1995 token->token = (struct token) { .type = T_STOP };
1996 token->macro_rep = NULL;
1997 token->ref_cnt = NULL;
1998 token->token_pos = src->seg_pos;
2000 /* Extract a segment. */
2001 const char *segment;
2002 enum segment_type seg_type;
2006 segment = &src->buffer[src->seg_pos];
2007 seg_len = segmenter_push (&src->segmenter, segment,
2008 src->length - src->seg_pos,
2009 src->reader->eof, &seg_type);
2013 /* The segmenter needs more input to produce a segment. */
2014 assert (!src->reader->eof);
2015 lex_source_read__ (src);
2018 /* Update state based on the segment. */
2019 token->token_len = seg_len;
2020 src->seg_pos += seg_len;
2021 if (seg_type == SEG_NEWLINE)
2023 if (src->n_lines >= src->allocated_lines)
2024 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2025 sizeof *src->lines);
2026 src->lines[src->n_lines++] = src->seg_pos;
2029 /* Get a token from the segment. */
2030 enum tokenize_result result = token_from_segment (
2031 seg_type, ss_buffer (segment, seg_len), &token->token);
2033 /* If we've reached the end of a line, or the end of a command, then pass
2034 the line to the output engine as a syntax text item. */
2035 int n_lines = seg_type == SEG_NEWLINE;
2036 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2039 src->suppress_next_newline = true;
2041 else if (n_lines > 0 && src->suppress_next_newline)
2044 src->suppress_next_newline = false;
2046 for (int i = 0; i < n_lines; i++)
2048 /* Beginning of line. */
2049 const char *line = &src->buffer[src->journal_pos];
2051 /* Calculate line length, including \n or \r\n end-of-line if present.
2053 We use src->length even though that may be beyond what we've actually
2054 converted to tokens. That's because, if we're emitting the line due
2055 to SEG_END_COMMAND, we want to take the whole line through the
2056 newline, not just through the '.'. */
2057 size_t max_len = src->length - src->journal_pos;
2058 const char *newline = memchr (line, '\n', max_len);
2059 size_t line_len = newline ? newline - line + 1 : max_len;
2061 /* Calculate line length excluding end-of-line. */
2062 size_t copy_len = line_len;
2063 if (copy_len > 0 && line[copy_len - 1] == '\n')
2065 if (copy_len > 0 && line[copy_len - 1] == '\r')
2068 /* Submit the line as syntax. */
2069 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2070 xmemdup0 (line, copy_len),
2073 src->journal_pos += line_len;
2078 case TOKENIZE_ERROR:
2079 lex_get_error (src, token);
2081 case TOKENIZE_EMPTY:
2082 lex_token_destroy (token);
2085 case TOKENIZE_TOKEN:
2086 if (token->token.type == T_STOP)
2088 token->token.type = T_ENDCMD;
2091 lex_stage_push_last (&src->pp, token);
2097 /* Attempts to append a new token to SRC. Returns true if successful, false on
2098 failure. On failure, the end of SRC has been reached and no more tokens
2099 will be forthcoming from it.
2101 Does not make the new token available for lookahead yet; the caller must
2102 adjust SRC's 'middle' pointer to do so. */
2104 lex_source_get_pp (struct lex_source *src)
2107 if (lex_source_try_get_pp (src))
2113 lex_source_try_get_merge (const struct lex_source *src_)
2115 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2117 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2120 if (!settings_get_mexpand ())
2122 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2126 /* Now pass tokens one-by-one to the macro expander.
2128 In the common case where there is no macro to expand, the loop is not
2130 struct macro_call *mc;
2131 int n_call = macro_call_create (src->lexer->macros,
2132 &lex_stage_first (&src->pp)->token, &mc);
2133 for (int ofs = 1; !n_call; ofs++)
2135 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2137 /* This should not be reachable because we always get a T_ENDCMD at
2138 the end of an input file (transformed from T_STOP by
2139 lex_source_try_get_pp()) and the macro_expander should always
2140 terminate expansion on T_ENDCMD. */
2144 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2145 const struct macro_token mt = {
2147 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2149 const struct msg_location loc = lex_token_location (src, t, t);
2150 n_call = macro_call_add (mc, &mt, &loc);
2154 /* False alarm: no macro expansion after all. Use first token as
2155 lookahead. We'll retry macro expansion from the second token next
2157 macro_call_destroy (mc);
2158 lex_stage_shift (&src->merge, &src->pp, 1);
2162 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2163 are a macro call. (These are likely to be the only tokens in 'pp'.)
2165 const struct lex_token *c0 = lex_stage_first (&src->pp);
2166 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2167 struct macro_tokens expansion = { .n = 0 };
2168 struct msg_location loc = lex_token_location (src, c0, c1);
2169 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2170 macro_call_destroy (mc);
2172 /* Convert the macro expansion into syntax for possible error messages
2174 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2175 size_t *len = xnmalloc (expansion.n, sizeof *len);
2176 struct string s = DS_EMPTY_INITIALIZER;
2177 macro_tokens_to_syntax (&expansion, &s, ofs, len);
2179 if (settings_get_mprint ())
2180 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2181 _("Macro Expansion")));
2183 /* Append the macro expansion tokens to the lookahead. */
2184 if (expansion.n > 0)
2186 char *macro_rep = ds_steal_cstr (&s);
2187 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2188 *ref_cnt = expansion.n;
2189 for (size_t i = 0; i < expansion.n; i++)
2191 struct lex_token *token = xmalloc (sizeof *token);
2192 *token = (struct lex_token) {
2193 .token = expansion.mts[i].token,
2194 .token_pos = c0->token_pos,
2195 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2196 .macro_rep = macro_rep,
2201 lex_stage_push_last (&src->merge, token);
2203 ss_dealloc (&expansion.mts[i].syntax);
2208 free (expansion.mts);
2212 /* Destroy the tokens for the call. */
2213 for (size_t i = 0; i < n_call; i++)
2214 lex_stage_pop_first (&src->pp);
2216 return expansion.n > 0;
2219 /* Attempts to obtain at least one new token into 'merge' in SRC.
2221 Returns true if successful, false on failure. In the latter case, SRC is
2222 exhausted and 'src->eof' is now true. */
2224 lex_source_get_merge (struct lex_source *src)
2227 if (lex_source_try_get_merge (src))
2232 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2234 Returns true if successful, false on failure. In the latter case, SRC is
2235 exhausted and 'src->eof' is now true. */
2237 lex_source_get_parse (struct lex_source *src)
2239 struct merger m = MERGER_INIT;
2241 for (size_t i = 0; ; i++)
2243 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2245 /* We always get a T_ENDCMD at the end of an input file
2246 (transformed from T_STOP by lex_source_try_get_pp()) and
2247 merger_add() should never return -1 on T_ENDCMD. */
2248 assert (lex_stage_is_empty (&src->merge));
2252 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2256 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2259 else if (retval > 0)
2261 /* Add a token that merges all the tokens together. */
2262 const struct lex_token *first = lex_stage_first (&src->merge);
2263 const struct lex_token *last = lex_stage_nth (&src->merge,
2265 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2266 struct lex_token *t = xmalloc (sizeof *t);
2267 *t = (struct lex_token) {
2269 .token_pos = first->token_pos,
2270 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2272 /* This works well if all the tokens were not expanded from macros,
2273 or if they came from the same macro expansion. It just gives up
2274 in the other (corner) cases. */
2275 .macro_rep = macro ? first->macro_rep : NULL,
2276 .ofs = macro ? first->ofs : 0,
2277 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2278 .ref_cnt = macro ? first->ref_cnt : NULL,
2282 lex_source_push_parse (src, t);
2284 for (int i = 0; i < retval; i++)
2285 lex_stage_pop_first (&src->merge);
2292 lex_source_push_endcmd__ (struct lex_source *src)
2294 assert (src->n_parse == 0);
2296 struct lex_token *token = xmalloc (sizeof *token);
2297 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2298 lex_source_push_parse (src, token);
2302 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2304 if (src->n_parse >= src->allocated_parse)
2305 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2306 sizeof *src->parse);
2307 src->parse[src->n_parse++] = token;
2311 lex_source_clear_parse (struct lex_source *src)
2313 for (size_t i = 0; i < src->n_parse; i++)
2314 lex_token_destroy (src->parse[i]);
2315 src->n_parse = src->parse_ofs = 0;
2318 static struct lex_source *
2319 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2321 size_t allocated_lines = 4;
2322 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2325 struct lex_source *src = xmalloc (sizeof *src);
2326 *src = (struct lex_source) {
2329 .segmenter = segmenter_init (reader->syntax, false),
2333 .allocated_lines = allocated_lines,
2336 lex_source_push_endcmd__ (src);
2342 lex_set_message_handler (struct lexer *lexer,
2343 void (*output_msg) (const struct msg *,
2346 struct msg_handler msg_handler = {
2347 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2349 .lex_source_ref = lex_source_ref,
2350 .lex_source_unref = lex_source_unref,
2351 .lex_source_get_line = lex_source_get_line,
2353 msg_set_handler (&msg_handler);
2357 lex_source_ref (const struct lex_source *src_)
2359 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2362 assert (src->n_refs > 0);
2368 lex_source_unref (struct lex_source *src)
2373 assert (src->n_refs > 0);
2374 if (--src->n_refs > 0)
2377 char *file_name = src->reader->file_name;
2378 char *encoding = src->reader->encoding;
2379 if (src->reader->class->destroy != NULL)
2380 src->reader->class->destroy (src->reader);
2385 lex_stage_uninit (&src->pp);
2386 lex_stage_uninit (&src->merge);
2387 lex_source_clear_parse (src);
2392 struct lex_file_reader
2394 struct lex_reader reader;
2395 struct u8_istream *istream;
2398 static struct lex_reader_class lex_file_reader_class;
2400 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2401 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2402 ENCODING, which should take one of the forms accepted by
2403 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2404 mode of the new reader, respectively.
2406 Returns a null pointer if FILE_NAME cannot be opened. */
2408 lex_reader_for_file (const char *file_name, const char *encoding,
2409 enum segmenter_mode syntax,
2410 enum lex_error_mode error)
2412 struct lex_file_reader *r;
2413 struct u8_istream *istream;
2415 istream = (!strcmp(file_name, "-")
2416 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2417 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2418 if (istream == NULL)
2420 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2424 r = xmalloc (sizeof *r);
2425 lex_reader_init (&r->reader, &lex_file_reader_class);
2426 r->reader.syntax = syntax;
2427 r->reader.error = error;
2428 r->reader.file_name = xstrdup (file_name);
2429 r->reader.encoding = xstrdup_if_nonnull (encoding);
2430 r->reader.line_number = 1;
2431 r->istream = istream;
2436 static struct lex_file_reader *
2437 lex_file_reader_cast (struct lex_reader *r)
2439 return UP_CAST (r, struct lex_file_reader, reader);
2443 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2444 enum prompt_style prompt_style UNUSED)
2446 struct lex_file_reader *r = lex_file_reader_cast (r_);
2447 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2450 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Close function for file readers.  Releases the stream belonging to R_,
   which must be the `reader' member of a struct lex_file_reader. */
2457 lex_file_close (struct lex_reader *r_)
2459 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* A stream on any descriptor other than standard input is closed, and a
   failure to close is reported with the errno description. */
2461 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2463 if (u8_istream_close (r->istream) != 0)
2464 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
/* NOTE(review): presumably the standard-input case, where the stream object
   is freed without closing the descriptor -- confirm. */
2467 u8_istream_free (r->istream);
/* Definition of the reader class that lex_reader_for_file() installs in the
   readers it creates. */
2472 static struct lex_reader_class lex_file_reader_class =
/* State for a lex_reader that reads from an in-memory string.  The generic
   reader is embedded as member `reader' so that lex_string_reader_cast() can
   convert a struct lex_reader pointer back to this structure.
   lex_string_read() additionally uses members `s' (the string) and `offset'
   (the current read position within it). */
2478 struct lex_string_reader
/* Generic reader; initialized with lex_string_reader_class. */
2480 struct lex_reader reader;
/* Forward declaration: lex_reader_for_substring_nocopy() refers to this class
   before its definition, which appears after lex_string_close(). */
2485 static struct lex_reader_class lex_string_reader_class;
2487 /* Creates and returns a new lex_reader for the contents of S, which must be
2488 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2489 with ss_dealloc() when it is closed. */
2491 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2493 struct lex_string_reader *r;
2495 r = xmalloc (sizeof *r);
2496 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always use SEG_MODE_AUTO; unlike lex_reader_for_file(),
   there is no parameter for choosing the syntax mode. */
2497 r->reader.syntax = SEG_MODE_AUTO;
/* ENCODING goes through xstrdup_if_nonnull(); presumably a null ENCODING
   stays null -- confirm against its definition. */
2498 r->reader.encoding = xstrdup_if_nonnull (encoding);
2505 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2506 which must be encoded in ENCODING. The caller retains ownership of S. */
2508 lex_reader_for_string (const char *s, const char *encoding)
2510 struct substring ss;
/* Make a private copy of S, then hand that copy to the no-copy constructor,
   which takes ownership of it. */
2511 ss_alloc_substring (&ss, ss_cstr (s));
2512 return lex_reader_for_substring_nocopy (ss, encoding);
2515 /* Formats FORMAT as a printf()-like format string and creates and returns a
2516 new lex_reader for the formatted result.  The arguments consumed by FORMAT
   follow ENCODING, which is passed on unchanged to
   lex_reader_for_substring_nocopy(). */
2518 lex_reader_for_format (const char *format, const char *encoding, ...)
2520 struct lex_reader *r;
/* ENCODING is the last named parameter, so the variable arguments begin
   after it. */
2523 va_start (args, encoding);
/* The string produced by xvasprintf() is wrapped with ss_cstr() and given to
   the no-copy constructor, so the new reader owns the formatted text. */
2524 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Converts R, which must be the `reader' member of a struct
   lex_string_reader, into a pointer to that lex_string_reader, using
   UP_CAST. */
2530 static struct lex_string_reader *
2531 lex_string_reader_cast (struct lex_reader *r)
2533 return UP_CAST (r, struct lex_string_reader, reader);
/* Read function for string readers.  Copies bytes of the string held by R_
   into BUF, starting at the reader's current offset.  R_ must be the `reader'
   member of a struct lex_string_reader.  PROMPT_STYLE is not used. */
2537 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2538 enum prompt_style prompt_style UNUSED)
2540 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Copy at most N bytes, and never more than remain between the current
   offset and the end of the string. */
2543 chunk = MIN (n, r->s.length - r->offset);
2544 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string readers.  R_ must be the `reader' member of a
   struct lex_string_reader.
   NOTE(review): the comment on lex_reader_for_substring_nocopy() says the
   string is freed with ss_dealloc() at close time -- confirm that this
   function does so. */
2551 lex_string_close (struct lex_reader *r_)
2553 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Definition of the reader class that lex_reader_for_substring_nocopy()
   installs in the readers it creates. */
2559 static struct lex_reader_class lex_string_reader_class =
/* Returns the text of line number LINE (counting from 1) in SRC as a
   substring built with ss_buffer() over SRC's buffer.
   NOTE(review): presumably the result refers to SRC's buffer rather than
   copying it, so it would be valid only while that buffer is -- confirm. */
2566 lex_source_get_line (const struct lex_source *src, int line)
/* Valid line numbers are 1 through src->n_lines; anything else is handled
   separately here.  NOTE(review): confirm what is returned in that case. */
2568 if (line < 1 || line > src->n_lines)
/* src->lines[i] holds the buffer offset at which line i + 1 starts. */
2571 size_t ofs = src->lines[line - 1];
/* A line ends where the next one starts; the last line has no successor and
   so ends at src->length. */
2572 size_t end = line >= src->n_lines ? src->length : src->lines[line];
2573 return ss_buffer (&src->buffer[ofs], end - ofs);