1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
83 static struct msg_point lex_token_start_point (const struct lex_source *,
84 const struct lex_token *);
85 static struct msg_point lex_token_end_point (const struct lex_source *,
86 const struct lex_token *);
88 /* Source offset of the last byte in TOKEN.

   A token whose token_len is 0 is treated as if it were one byte long, so the
   result is never less than TOKEN's token_pos. */
90 lex_token_end (const struct lex_token *token)
92 return token->token_pos + MAX (token->token_len, 1) - 1;
96 lex_token_destroy (struct lex_token *t)
98 token_uninit (&t->token);
101 assert (*t->ref_cnt > 0);
111 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
116 struct lex_token **tokens;
119 static void lex_stage_clear (struct lex_stage *);
120 static void lex_stage_uninit (struct lex_stage *);
122 static size_t lex_stage_count (const struct lex_stage *);
123 static bool lex_stage_is_empty (const struct lex_stage *);
125 static struct lex_token *lex_stage_first (struct lex_stage *);
126 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
128 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
129 static void lex_stage_pop_first (struct lex_stage *);
131 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
134 /* Deletes all the tokens from STAGE. */
136 lex_stage_clear (struct lex_stage *stage)
138 while (!deque_is_empty (&stage->deque))
139 lex_stage_pop_first (stage);
142 /* Deletes all the tokens from STAGE and frees storage for the deque. */
144 lex_stage_uninit (struct lex_stage *stage)
146 lex_stage_clear (stage);
147 free (stage->tokens);
150 /* Returns true if STAGE contains no tokens, otherwise false. */
152 lex_stage_is_empty (const struct lex_stage *stage)
154 return deque_is_empty (&stage->deque);
157 /* Returns the number of tokens in STAGE. */
159 lex_stage_count (const struct lex_stage *stage)
161 return deque_count (&stage->deque);
164 /* Returns the first token in STAGE, which must be nonempty.
165 The first token is the one accessed with the least lookahead. */
166 static struct lex_token *
167 lex_stage_first (struct lex_stage *stage)
169 return lex_stage_nth (stage, 0);
172 /* Returns the token with the given INDEX in STAGE. The first token (with the least
173 lookahead) is 0, the second token is 1, and so on. There must be at least
174 INDEX + 1 tokens in STAGE. */
175 static struct lex_token *
176 lex_stage_nth (struct lex_stage *stage, size_t index)
178 return stage->tokens[deque_back (&stage->deque, index)];
181 /* Adds TOKEN so that it becomes the last token in STAGE. */
183 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
185 if (deque_is_full (&stage->deque))
186 stage->tokens = deque_expand (&stage->deque, stage->tokens,
187 sizeof *stage->tokens);
188 stage->tokens[deque_push_front (&stage->deque)] = token;
191 /* Removes and returns the first token from STAGE. */
192 static struct lex_token *
193 lex_stage_take_first (struct lex_stage *stage)
195 return stage->tokens[deque_pop_back (&stage->deque)];
198 /* Removes the first token from STAGE and uninitializes it. */
200 lex_stage_pop_first (struct lex_stage *stage)
202 lex_token_destroy (lex_stage_take_first (stage));
205 /* Removes the first N tokens from SRC, appending them to DST as the last
208 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
210 for (size_t i = 0; i < n; i++)
211 lex_stage_push_last (dst, lex_stage_take_first (src));
214 /* A source of tokens, corresponding to a syntax file.
216 This is conceptually a lex_reader wrapped with everything needed to convert
217 its UTF-8 bytes into tokens. */
220 struct ll ll; /* In lexer's list of sources. */
224 - One for struct lexer.
226 - One for each struct msg_location that references this source. */
229 struct lex_reader *reader;
231 struct segmenter segmenter;
232 bool eof; /* True if T_STOP was read from 'reader'. */
234 /* Buffer of UTF-8 bytes. */
235 char *buffer; /* Source file contents. */
236 size_t length; /* Number of bytes filled. */
237 size_t allocated; /* Number of bytes allocated. */
239 /* Offsets into 'buffer'. */
240 size_t journal_pos; /* First byte not yet output to journal. */
241 size_t seg_pos; /* First byte not yet scanned as token. */
243 /* Offset into 'buffer' of starts of lines. */
245 size_t n_lines, allocated_lines;
247 bool suppress_next_newline;
251 This is a pipeline with the following stages. Each token eventually
252 made available to the parser passes through all of these stages. The stages
253 are named after the processing that happens in each one.
255 Initially, tokens come from the segmenter and scanner to 'pp':
257 - pp: Tokens that need to pass through the macro preprocessor to end up
260 - merge: Tokens that need to pass through scan_merge() to end up in
263 - parse: Tokens available to the client for parsing.
265 'pp' and 'merge' store tokens only temporarily until they pass into
266 'parse'. Tokens then live in 'parse' until the command is fully
267 consumed, at which time they are freed together. */
269 struct lex_stage merge;
270 struct lex_token **parse;
271 size_t n_parse, allocated_parse, parse_ofs;
274 static struct lex_source *lex_source_create (struct lexer *,
275 struct lex_reader *);
280 struct ll_list sources; /* Contains "struct lex_source"s. */
281 struct macro_set *macros;
284 static struct lex_source *lex_source__ (const struct lexer *);
285 static char *lex_source_syntax__ (const struct lex_source *,
287 static const struct lex_token *lex_next__ (const struct lexer *, int n);
288 static void lex_source_push_endcmd__ (struct lex_source *);
289 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
290 static void lex_source_clear_parse (struct lex_source *);
292 static bool lex_source_get_parse (struct lex_source *);
293 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
294 const char *format, va_list)
295 PRINTF_FORMAT (4, 0);
296 static const struct lex_token *lex_source_next__ (const struct lex_source *,
299 /* Initializes READER with the specified CLASS and otherwise some reasonable
300 defaults. The caller should fill in the other members as desired. */
302 lex_reader_init (struct lex_reader *reader,
303 const struct lex_reader_class *class)
305 reader->class = class;
306 reader->syntax = SEG_MODE_AUTO;
307 reader->error = LEX_ERROR_CONTINUE;
308 reader->file_name = NULL;
309 reader->encoding = NULL;
310 reader->line_number = 0;
314 /* Frees any file name already in READER and replaces it by a copy of
315 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
317 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
319 free (reader->file_name);
320 reader->file_name = xstrdup_if_nonnull (file_name);
323 /* Creates and returns a new lexer. */
327 struct lexer *lexer = xmalloc (sizeof *lexer);
328 *lexer = (struct lexer) {
329 .sources = LL_INITIALIZER (lexer->sources),
330 .macros = macro_set_create (),
335 /* Destroys LEXER. */
337 lex_destroy (struct lexer *lexer)
341 struct lex_source *source, *next;
343 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
345 ll_remove (&source->ll);
346 lex_source_unref (source);
348 macro_set_destroy (lexer->macros);
353 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
354 same name. Takes ownership of M. */
356 lex_define_macro (struct lexer *lexer, struct macro *m)
358 macro_set_add (lexer->macros, m);
361 /* Inserts READER into LEXER so that the next token read by LEXER comes from
362 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
365 lex_include (struct lexer *lexer, struct lex_reader *reader)
367 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
368 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
371 /* Appends READER to LEXER, so that it will be read after all other current
372 readers have already been read. */
374 lex_append (struct lexer *lexer, struct lex_reader *reader)
376 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
381 /* Advances LEXER to the next token, consuming the current token. */
383 lex_get (struct lexer *lexer)
385 struct lex_source *src;
387 src = lex_source__ (lexer);
391 if (src->parse_ofs < src->n_parse)
/* When the token being consumed is the command's T_ENDCMD, everything parsed
   for that command is cleared. */
393 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
394 lex_source_clear_parse (src);
/* Keep asking the source for tokens until one exists at parse_ofs.  If
   lex_source_get_parse() reports failure, the source is unlinked from LEXER's
   list and unreferenced, and LEXER's current source is looked up again. */
399 while (src->parse_ofs == src->n_parse)
400 if (!lex_source_get_parse (src))
402 ll_remove (&src->ll);
403 lex_source_unref (src);
404 src = lex_source__ (lexer);
410 /* Advances LEXER by N tokens. */
412 lex_get_n (struct lexer *lexer, size_t n)
418 /* Issuing errors. */
420 /* Prints a syntax error message containing the current token and the
421 message given by FORMAT and its arguments (if FORMAT is non-null). */
423 lex_error (struct lexer *lexer, const char *format, ...)
427 va_start (args, format);
428 lex_next_error_valist (lexer, 0, 0, format, args);
432 /* Prints a syntax error message containing the current token and the
433 message given by FORMAT and ARGS (if FORMAT is non-null). */
435 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
437 lex_next_error_valist (lexer, 0, 0, format, args);
440 /* Prints a syntax error message for the token range N0...N1 (lex_error() passes 0
441 for both, meaning the current token) and the message given by FORMAT (if non-null). */
443 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
447 va_start (args, format);
448 lex_next_error_valist (lexer, n0, n1, format, args);
452 /* Prints a syntax error message saying that one of the strings provided as
453 varargs, up to the first NULL, is expected. */
455 (lex_error_expecting) (struct lexer *lexer, ...)
459 va_start (args, lexer);
460 lex_error_expecting_valist (lexer, args);
464 /* Prints a syntax error message saying that one of the options provided in
465 ARGS, up to the first NULL, is expected.

   Options are collected into a fixed array of MAX_OPTIONS entries, so
   collection also stops once MAX_OPTIONS options have been read, even if no
   NULL has been seen yet.  Whatever was collected is then passed to
   lex_error_expecting_array(). */
467 lex_error_expecting_valist (struct lexer *lexer, va_list args)
469 enum { MAX_OPTIONS = 9 };
470 const char *options[MAX_OPTIONS];
472 while (n < MAX_OPTIONS)
474 const char *option = va_arg (args, const char *);
478 options[n++] = option;
480 lex_error_expecting_array (lexer, options, n);
/* Prints a syntax error message saying that one of the N strings in OPTIONS is
   expected.

   Each small option count has its own complete translatable template (the
   templates below cover one through eight options), rather than a message
   assembled from fragments.  The last branch instead joins all N options with
   ", " and reports them after "expecting one of the following".  The first
   branch passes a null format to lex_error().

   NOTE(review): the statements that select a branch from N are not visible in
   this excerpt; presumably the null-format branch handles N == 0 and the
   joined list handles N > 8 -- confirm. */
484 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
489 lex_error (lexer, NULL);
493 lex_error (lexer, _("expecting %s"), options[0]);
497 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
501 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
506 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
507 options[0], options[1], options[2], options[3]);
511 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
512 options[0], options[1], options[2], options[3], options[4]);
516 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
517 options[0], options[1], options[2], options[3], options[4],
522 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
523 options[0], options[1], options[2], options[3], options[4],
524 options[5], options[6]);
528 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
529 options[0], options[1], options[2], options[3], options[4],
530 options[5], options[6], options[7]);
535 struct string s = DS_EMPTY_INITIALIZER;
536 for (size_t i = 0; i < n; i++)
539 ds_put_cstr (&s, ", ");
540 ds_put_cstr (&s, options[i]);
542 lex_error (lexer, _("expecting one of the following: %s"),
550 /* Reports an error to the effect that subcommand SBC may only be specified
553 This function does not take a lexer as an argument or use lex_error(),
554 because the result would ordinarily just be redundant: "Syntax error at
555 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
556 not help the user find the error. */
558 lex_sbc_only_once (const char *sbc)
560 msg (SE, _("Subcommand %s may only be specified once."), sbc);
563 /* Reports an error to the effect that subcommand SBC is missing.
565 This function does not take a lexer as an argument or use lex_error(),
566 because a missing subcommand can normally be detected only after the whole
567 command has been parsed, and so lex_error() would always report "Syntax
568 error at end of command", which does not help the user find the error. */
570 lex_sbc_missing (const char *sbc)
572 msg (SE, _("Required subcommand %s was not specified."), sbc);
575 /* Reports an error to the effect that specification SPEC may only be specified
576 once within subcommand SBC. */
578 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
580 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
584 /* Reports an error to the effect that specification SPEC is missing within
587 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
589 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
593 /* Prints a syntax error message for the token range N0...N1 (0 for both means the
594 current token) and the message given by FORMAT and ARGS (if FORMAT is non-null). */
596 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
597 const char *format, va_list args)
599 struct lex_source *src = lex_source__ (lexer);
602 lex_source_error_valist (src, n0, n1, format, args);
608 ds_put_format (&s, _("Syntax error at end of input"));
611 ds_put_cstr (&s, ": ");
612 ds_put_vformat (&s, format, args);
614 if (ds_last (&s) != '.')
615 ds_put_byte (&s, '.');
616 msg (SE, "%s", ds_cstr (&s));
621 /* Checks that we're at end of command.
622 If so, returns a successful command completion code.
623 If not, flags a syntax error and returns an error command
626 lex_end_of_command (struct lexer *lexer)
628 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
630 lex_error (lexer, _("expecting end of command"));
637 /* Token testing functions. */
639 /* Returns true if the current token is a number. */
641 lex_is_number (const struct lexer *lexer)
643 return lex_next_is_number (lexer, 0);
646 /* Returns true if the current token is a string. */
648 lex_is_string (const struct lexer *lexer)
650 return lex_next_is_string (lexer, 0);
653 /* Returns the value of the current token, which must be a
654 floating point number. */
656 lex_number (const struct lexer *lexer)
658 return lex_next_number (lexer, 0);
661 /* Returns true iff the current token is an integer. */
663 lex_is_integer (const struct lexer *lexer)
665 return lex_next_is_integer (lexer, 0);
668 /* Returns the value of the current token, which must be an
671 lex_integer (const struct lexer *lexer)
673 return lex_next_integer (lexer, 0);
676 /* Token testing functions with lookahead.
678 A value of 0 for N as an argument to any of these functions refers to the
679 current token. Lookahead is limited to the current command. Any N greater
680 than the number of tokens remaining in the current command will be treated
681 as referring to a T_ENDCMD token. */
683 /* Returns true if the token N ahead of the current token is a number. */
685 lex_next_is_number (const struct lexer *lexer, int n)
687 return token_is_number (lex_next (lexer, n));
690 /* Returns true if the token N ahead of the current token is a string. */
692 lex_next_is_string (const struct lexer *lexer, int n)
694 return token_is_string (lex_next (lexer, n));
697 /* Returns the value of the token N ahead of the current token, which must be a
698 floating point number. */
700 lex_next_number (const struct lexer *lexer, int n)
702 return token_number (lex_next (lexer, n));
705 /* Returns true if the token N ahead of the current token is an integer. */
707 lex_next_is_integer (const struct lexer *lexer, int n)
709 return token_is_integer (lex_next (lexer, n));
712 /* Returns the value of the token N ahead of the current token, which must be
715 lex_next_integer (const struct lexer *lexer, int n)
717 return token_integer (lex_next (lexer, n));
720 /* Token matching functions. */
722 /* If the current token has the specified TYPE, skips it and returns true.
723 Otherwise, returns false. */
725 lex_match (struct lexer *lexer, enum token_type type)
727 if (lex_token (lexer) == type)
736 /* If the current token matches IDENTIFIER, skips it and returns true.
737 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
740 IDENTIFIER must be an ASCII string. */
742 lex_match_id (struct lexer *lexer, const char *identifier)
744 return lex_match_id_n (lexer, identifier, 3);
747 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
748 may be abbreviated to its first N letters. Otherwise, returns false.
750 IDENTIFIER must be an ASCII string. */
752 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
754 if (lex_token (lexer) == T_ID
755 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
764 /* If the current token is integer X, skips it and returns true. Otherwise,
767 lex_match_int (struct lexer *lexer, int x)
769 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
778 /* Forced matches. */
780 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
781 abbreviated to its first 3 letters. Otherwise, reports an error and returns
784 IDENTIFIER must be an ASCII string. */
786 lex_force_match_id (struct lexer *lexer, const char *identifier)
788 if (lex_match_id (lexer, identifier))
792 lex_error_expecting (lexer, identifier);
797 /* If the current token has the specified TYPE, skips it and returns true.
798 Otherwise, reports an error and returns false. */
800 lex_force_match (struct lexer *lexer, enum token_type type)
802 if (lex_token (lexer) == type)
809 const char *type_string = token_type_to_string (type);
812 char *s = xasprintf ("`%s'", type_string);
813 lex_error_expecting (lexer, s);
817 lex_error_expecting (lexer, token_type_to_name (type));
823 /* If the current token is a string, does nothing and returns true.
824 Otherwise, reports an error and returns false. */
826 lex_force_string (struct lexer *lexer)
828 if (lex_is_string (lexer))
832 lex_error (lexer, _("expecting string"));
837 /* If the current token is a string or an identifier, does nothing and returns
838 true. Otherwise, reports an error and returns false.
840 This is meant for use in syntactic situations where we want to encourage the
841 user to supply a quoted string, but for compatibility we also accept
842 identifiers. (One example of such a situation is file names.) Therefore,
843 the error message issued when the current token is wrong only says that a
844 string is expected and doesn't mention that an identifier would also be
847 lex_force_string_or_id (struct lexer *lexer)
849 return lex_token (lexer) == T_ID || lex_force_string (lexer);
852 /* If the current token is an integer, does nothing and returns true.
853 Otherwise, reports an error and returns false. */
855 lex_force_int (struct lexer *lexer)
857 if (lex_is_integer (lexer))
861 lex_error (lexer, _("expecting integer"));
866 /* If the current token is an integer in the range MIN...MAX (inclusive), does
867 nothing and returns true. Otherwise, reports an error and returns false.
868 If NAME is nonnull, then it is used in the error message. */
870 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
872 bool is_number = lex_is_number (lexer);
873 bool is_integer = lex_is_integer (lexer);
874 bool too_small = (is_integer ? lex_integer (lexer) < min
875 : is_number ? lex_number (lexer) < min
877 bool too_big = (is_integer ? lex_integer (lexer) > max
878 : is_number ? lex_number (lexer) > max
880 if (is_integer && !too_small && !too_big)
885 /* Weird, maybe a bug in the caller. Just report that we needed an
888 lex_error (lexer, _("Integer expected for %s."), name);
890 lex_error (lexer, _("Integer expected."));
895 lex_error (lexer, _("Expected %ld for %s."), min, name);
897 lex_error (lexer, _("Expected %ld."), min);
899 else if (min + 1 == max)
902 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
904 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
908 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
909 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
911 if (report_lower_bound && report_upper_bound)
915 _("Expected integer between %ld and %ld for %s."),
918 lex_error (lexer, _("Expected integer between %ld and %ld."),
921 else if (report_lower_bound)
926 lex_error (lexer, _("Expected non-negative integer for %s."),
929 lex_error (lexer, _("Expected non-negative integer."));
934 lex_error (lexer, _("Expected positive integer for %s."),
937 lex_error (lexer, _("Expected positive integer."));
942 lex_error (lexer, _("Expected integer %ld or greater for %s."),
945 lex_error (lexer, _("Expected integer %ld or greater."), min);
948 else if (report_upper_bound)
952 _("Expected integer less than or equal to %ld for %s."),
955 lex_error (lexer, _("Expected integer less than or equal to %ld."),
961 lex_error (lexer, _("Integer expected for %s."), name);
963 lex_error (lexer, _("Integer expected."));
969 /* If the current token is a number, does nothing and returns true.
970 Otherwise, reports an error and returns false. */
972 lex_force_num (struct lexer *lexer)
974 if (lex_is_number (lexer))
977 lex_error (lexer, _("expecting number"));
981 /* If the current token is a number in the closed range [MIN,MAX], does
982 nothing and returns true. Otherwise, reports an error and returns false.
983 If NAME is nonnull, then it is used in the error message. */
985 lex_force_num_range_closed (struct lexer *lexer, const char *name,
986 double min, double max)
988 bool is_number = lex_is_number (lexer);
989 bool too_small = is_number && lex_number (lexer) < min;
990 bool too_big = is_number && lex_number (lexer) > max;
991 if (is_number && !too_small && !too_big)
996 /* Weird, maybe a bug in the caller. Just report that we needed an
999 lex_error (lexer, _("Number expected for %s."), name);
1001 lex_error (lexer, _("Number expected."));
1003 else if (min == max)
1006 lex_error (lexer, _("Expected %g for %s."), min, name);
1008 lex_error (lexer, _("Expected %g."), min);
1012 bool report_lower_bound = min > -DBL_MAX || too_small;
1013 bool report_upper_bound = max < DBL_MAX || too_big;
1015 if (report_lower_bound && report_upper_bound)
1019 _("Expected number between %g and %g for %s."),
1022 lex_error (lexer, _("Expected number between %g and %g."),
1025 else if (report_lower_bound)
1030 lex_error (lexer, _("Expected non-negative number for %s."),
1033 lex_error (lexer, _("Expected non-negative number."));
1038 lex_error (lexer, _("Expected number %g or greater for %s."),
1041 lex_error (lexer, _("Expected number %g or greater."), min);
1044 else if (report_upper_bound)
1048 _("Expected number less than or equal to %g for %s."),
1051 lex_error (lexer, _("Expected number less than or equal to %g."),
1057 lex_error (lexer, _("Number expected for %s."), name);
1059 lex_error (lexer, _("Number expected."));
1065 /* If the current token is a number in the half-open range [MIN,MAX), does
1066 nothing and returns true. Otherwise, reports an error and returns false.
1067 If NAME is nonnull, then it is used in the error message. */
1069 lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
1070 double min, double max)
1072 bool is_number = lex_is_number (lexer);
1073 bool too_small = is_number && lex_number (lexer) < min;
1074 bool too_big = is_number && lex_number (lexer) >= max;
1075 if (is_number && !too_small && !too_big)
1080 /* Weird, maybe a bug in the caller. Just report that we needed an
1083 lex_error (lexer, _("Number expected for %s."), name);
1085 lex_error (lexer, _("Number expected."));
1089 bool report_lower_bound = min > -DBL_MAX || too_small;
1090 bool report_upper_bound = max < DBL_MAX || too_big;
1092 if (report_lower_bound && report_upper_bound)
1095 lex_error (lexer, _("Expected number in [%g,%g) for %s."),
1098 lex_error (lexer, _("Expected number in [%g,%g)."),
1101 else if (report_lower_bound)
1106 lex_error (lexer, _("Expected non-negative number for %s."),
1109 lex_error (lexer, _("Expected non-negative number."));
1114 lex_error (lexer, _("Expected number %g or greater for %s."),
1117 lex_error (lexer, _("Expected number %g or greater."), min);
1120 else if (report_upper_bound)
1124 _("Expected number less than %g for %s."), max, name);
1126 lex_error (lexer, _("Expected number less than %g."), max);
1131 lex_error (lexer, _("Number expected for %s."), name);
1133 lex_error (lexer, _("Number expected."));
1139 /* If the current token is a number in the open range (MIN,MAX), does
1140 nothing and returns true. Otherwise, reports an error and returns false.
1141 If NAME is nonnull, then it is used in the error message. */
1143 lex_force_num_range_open (struct lexer *lexer, const char *name,
1144 double min, double max)
1146 bool is_number = lex_is_number (lexer);
1147 bool too_small = is_number && lex_number (lexer) <= min;
1148 bool too_big = is_number && lex_number (lexer) >= max;
1149 if (is_number && !too_small && !too_big)
1154 /* Weird, maybe a bug in the caller. Just report that we needed an
1157 lex_error (lexer, _("Number expected for %s."), name);
1159 lex_error (lexer, _("Number expected."));
1163 bool report_lower_bound = min > -DBL_MAX || too_small;
1164 bool report_upper_bound = max < DBL_MAX || too_big;
1166 if (report_lower_bound && report_upper_bound)
1169 lex_error (lexer, _("Expected number in (%g,%g) for %s."),
1172 lex_error (lexer, _("Expected number in (%g,%g)."), min, max);
1174 else if (report_lower_bound)
1179 lex_error (lexer, _("Expected positive number for %s."), name);
1181 lex_error (lexer, _("Expected positive number."));
1186 lex_error (lexer, _("Expected number greater than %g for %s."),
1189 lex_error (lexer, _("Expected number greater than %g."), min);
1192 else if (report_upper_bound)
1195 lex_error (lexer, _("Expected number less than %g for %s."),
1198 lex_error (lexer, _("Expected number less than %g."), max);
1203 lex_error (lexer, _("Number expected for %s."), name);
1205 lex_error (lexer, _("Number expected."));
1211 /* If the current token is an identifier, does nothing and returns true.
1212 Otherwise, reports an error and returns false. */
1214 lex_force_id (struct lexer *lexer)
1216 if (lex_token (lexer) == T_ID)
1219 lex_error (lexer, _("expecting identifier"));
1223 /* Token accessors. */
1225 /* Returns the type of LEXER's current token. */
1227 lex_token (const struct lexer *lexer)
1229 return lex_next_token (lexer, 0);
1232 /* Returns the number in LEXER's current token.
1234 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1235 tokens this function will always return zero. */
1237 lex_tokval (const struct lexer *lexer)
1239 return lex_next_tokval (lexer, 0);
1242 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1244 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1245 this function will always return NULL.
1247 The UTF-8 encoding of the returned string is correct for variable names and
1248 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1249 data_in() to use it in a "union value". */
1251 lex_tokcstr (const struct lexer *lexer)
1253 return lex_next_tokcstr (lexer, 0);
1256 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1257 null-terminated (but the null terminator is not included in the returned
1258 substring's 'length').
1260 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1261 this function will always return NULL.
1263 The UTF-8 encoding of the returned string is correct for variable names and
1264 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1265 data_in() to use it in a "union value". */
1267 lex_tokss (const struct lexer *lexer)
1269 return lex_next_tokss (lexer, 0);
1274 A value of 0 for N as an argument to any of these functions refers to the
1275 current token. Lookahead is limited to the current command. Any N greater
1276 than the number of tokens remaining in the current command will be treated
1277 as referring to a T_ENDCMD token. */
1279 static const struct lex_token *
1280 lex_next__ (const struct lexer *lexer_, int n)
1282 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1283 struct lex_source *src = lex_source__ (lexer);
1286 return lex_source_next__ (src, n);
1289 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
/* Returns the token at offset OFS within SRC_'s 'parse' array, fetching more
   tokens with lex_source_get_parse() until the array holds more than OFS
   tokens.

   Before each fetch, the most recently parsed token is checked for T_STOP or
   T_ENDCMD.  NOTE(review): the statement guarded by that check is not visible
   in this excerpt; presumably it returns that token instead of reading
   further -- confirm.  Likewise, the condition under which the static
   T_ENDCMD token below is returned is not visible (presumably a negative
   OFS -- confirm).

   SRC_ is declared const for the benefit of callers; the qualifier is cast
   away because lex_source_get_parse() takes a non-const source. */
1294 static const struct lex_token *
1295 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1297 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1301 static const struct lex_token endcmd_token
1302 = { .token = { .type = T_ENDCMD } };
1303 return &endcmd_token;
1306 while (ofs >= src->n_parse)
1308 if (src->n_parse > 0)
1310 const struct lex_token *t = src->parse[src->n_parse - 1];
1311 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1315 lex_source_get_parse (src);
1318 return src->parse[ofs];
1321 static const struct lex_token *
1322 lex_source_next__ (const struct lex_source *src, int n)
1324 return lex_source_ofs__ (src, n + src->parse_ofs);
1327 /* Returns the "struct token" of the token N after the current one in LEXER.
1328 The returned pointer can be invalidated by pretty much any succeeding call
1329 into the lexer, although the string pointer within the returned token is
1330 only invalidated by consuming the token (e.g. with lex_get()). */
1331 const struct token *
1332 lex_next (const struct lexer *lexer, int n)
1334 return &lex_next__ (lexer, n)->token;
1337 /* Returns the type of the token N after the current one in LEXER. */
1339 lex_next_token (const struct lexer *lexer, int n)
1341 return lex_next (lexer, n)->type;
1344 /* Returns the number in the token N after the current one in LEXER.
1346 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1347 tokens this function will always return zero. */
1349 lex_next_tokval (const struct lexer *lexer, int n)
1351 return token_number (lex_next (lexer, n));
1354 /* Returns the null-terminated string in the token N after the current one, in
1357 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1358 this functions this function will always return NULL.
1360 The UTF-8 encoding of the returned string is correct for variable names and
1361 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1362 data_in() to use it in a "union value". */
1364 lex_next_tokcstr (const struct lexer *lexer, int n)
1366 return lex_next_tokss (lexer, n).string;
1369 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1370 The string is null-terminated (but the null terminator is not included in
1371 the returned substring's 'length').
1373 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1374 tokens this functions this function will always return NULL.
1376 The UTF-8 encoding of the returned string is correct for variable names and
1377 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1378 data_in() to use it in a "union value". */
1380 lex_next_tokss (const struct lexer *lexer, int n)
1382 return lex_next (lexer, n)->string;
1385 /* Returns the offset of the current token within the command being parsed in
1386 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1387 on. The return value is useful later for referring to this token in calls
1390 lex_ofs (const struct lexer *lexer)
1392 struct lex_source *src = lex_source__ (lexer);
1393 return src ? src->parse_ofs : 0;
1396 /* Returns the token within LEXER's current command with offset OFS. Use
1397 lex_ofs() to find out the offset of the current token. */
1398 const struct token *
1399 lex_ofs_token (const struct lexer *lexer_, int ofs)
1401 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1402 struct lex_source *src = lex_source__ (lexer);
1405 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1408 static const struct token stop_token = { .type = T_STOP };
/* Allocates and returns a new struct msg_location that spans tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in
   LEXER.  See lex_ofs() for an explanation of token offsets.

   The offsets are converted to positions relative to the current token and
   passed to lex_get_location().

   The caller owns and must eventually free the returned object. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  int cur = lex_ofs (lexer);
  int n0 = ofs0 - cur;
  int n1 = ofs1 - cur;
  return lex_get_location (lexer, n0, n1);
}
1425 /* Returns a msg_point for the first character in the token with offset OFS,
1426 where offset 0 is the first token in the command currently being parsed, 1
1427 the second token, and so on. These are absolute offsets, not relative to
1428 the token currently being parsed within the command.
1430 Returns zeros for a T_STOP token.
1433 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1435 const struct lex_source *src = lex_source__ (lexer);
1437 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1438 : (struct msg_point) { 0, 0 });
1441 /* Returns a msg_point for the last character, inclusive, in the token with
1442 offset OFS, where offset 0 is the first token in the command currently being
1443 parsed, 1 the second token, and so on. These are absolute offsets, not
1444 relative to the token currently being parsed within the command.
1446 Returns zeros for a T_STOP token.
1448 Most of the time, a single token is wholly within a single line of syntax,
1449 so that the start and end point for a given offset have the same line
1450 number. There are two exceptions: a T_STRING token can be made up of
1451 multiple segments on adjacent lines connected with "+" punctuators, and a
1452 T_NEG_NUM token can consist of a "-" on one line followed by the number on
1456 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1458 const struct lex_source *src = lex_source__ (lexer);
1460 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1461 : (struct msg_point) { 0, 0 });
1464 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1465 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1466 are both zero, this requests the syntax for the current token.)
1468 The caller must eventually free the returned string (with free()). The
1469 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1470 that, for example, it may include comments, spaces, and new-lines if it
1471 spans multiple tokens. Macro expansion, however, has already been
1474 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1476 const struct lex_source *src = lex_source__ (lexer);
1478 ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
/* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
   inclusive.  (For example, if OFS0 and OFS1 are both zero, this requests the
   syntax for the first token in the current command.)

   The caller must eventually free the returned string (with free()).  The
   syntax is encoded in UTF-8 and in the original form supplied to the lexer so
   that, for example, it may include comments, spaces, and new-lines if it
   spans multiple tokens.  Macro expansion, however, has already been
   performed.

   Returns a newly allocated empty string if LEXER has no current source. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return xstrdup ("");

  return lex_source_syntax__ (src, ofs0, ofs1);
}
1499 /* Returns true if the token N ahead of the current one was produced by macro
1500 expansion, false otherwise. */
1502 lex_next_is_from_macro (const struct lexer *lexer, int n)
1504 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Compares token ACTUAL (taken from the lexer) against token EXPECTED (taken
   from a phrase being matched).

   The types are compared first.  The switch on ACTUAL's type then applies a
   per-type comparison: one arm requires the two 'number' members to be equal,
   one arm passes the strings to lex_id_match() with EXPECTED's string as the
   first argument, and one arm requires the two strings to have the same
   length and identical bytes. */
1508 lex_tokens_match (const struct token *actual, const struct token *expected)
1510 if (actual->type != expected->type)
1513 switch (actual->type)
1517 return actual->number == expected->number;
1520 return lex_id_match (expected->string, actual->string);
1523 return (actual->string.length == expected->string.length
1524 && !memcmp (actual->string.string, expected->string.string,
1525 actual->string.length));
/* Tokenizes S with a string_lexer in SEG_MODE_INTERACTIVE and compares each
   token it yields, using lex_tokens_match(), against successive lookahead
   tokens of LEXER (lex_next() with an increasing index).  Each token obtained
   from S is released with token_uninit() once it has been compared.

   lex_at_phrase() treats a result greater than zero as "LEXER is positioned
   at S"; lex_match_phrase() additionally uses the result as the number of
   tokens to skip. */
1533 lex_at_phrase__ (struct lexer *lexer, const char *s)
1535 struct string_lexer slex;
1539 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1540 while (string_lexer_next (&slex, &token))
1542 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1543 token_uninit (&token);
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   returns true.  Otherwise, returns false.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens > 0;
}
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips it (with lex_get_n()) and returns true.  Otherwise, returns false
   without consuming anything.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1577 /* Returns the 1-based line number of the source text at the byte OFFSET in
/* Searches SRC->lines[], an array of SRC->n_lines byte offsets, for the entry
   that brackets OFFSET.  Each entry is the buffer offset just past a newline
   segment, as recorded by lex_source_try_get_pp().

   The search repeatedly probes the midpoint MID of [lo, hi), with hi starting
   at SRC->n_lines.  When MID + 1 reaches or passes SRC->n_lines the result is
   SRC->n_lines; otherwise OFFSET is compared against SRC->lines[MID + 1] and
   SRC->lines[MID] to decide how to continue. */
1580 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1583 size_t hi = src->n_lines;
1586 size_t mid = (lo + hi) / 2;
1587 if (mid + 1 >= src->n_lines)
1588 return src->n_lines;
1589 else if (offset >= src->lines[mid + 1])
1591 else if (offset < src->lines[mid])
1598 /* Returns the 1-based column number of the source text at the byte OFFSET in
1601 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1603 const char *newline = memrchr (src->buffer, '\n', offset);
1604 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1605 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1608 static struct msg_point
1609 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1611 return (struct msg_point) {
1612 .line = lex_source_ofs_to_line_number (src, offset),
1613 .column = lex_source_ofs_to_column_number (src, offset),
1617 static struct msg_point
1618 lex_token_start_point (const struct lex_source *src,
1619 const struct lex_token *token)
1621 return lex_source_ofs_to_point__ (src, token->token_pos);
1624 static struct msg_point
1625 lex_token_end_point (const struct lex_source *src,
1626 const struct lex_token *token)
1628 return lex_source_ofs_to_point__ (src, lex_token_end (token));
1631 static struct msg_location
1632 lex_token_location (const struct lex_source *src,
1633 const struct lex_token *t0,
1634 const struct lex_token *t1)
1636 return (struct msg_location) {
1637 .file_name = intern_new_if_nonnull (src->reader->file_name),
1638 .start = lex_token_start_point (src, t0),
1639 .end = lex_token_end_point (src, t1),
1643 static struct msg_location *
1644 lex_token_location_rw (const struct lex_source *src,
1645 const struct lex_token *t0,
1646 const struct lex_token *t1)
1648 struct msg_location location = lex_token_location (src, t0, t1);
1649 return msg_location_dup (&location);
/* Returns a newly allocated msg_location that spans the tokens N0 through N1
   ahead of the current token in SRC, inclusive. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1660 /* Returns the name of the syntax file from which the current command is drawn.
1661 Returns NULL for a T_STOP token or if the command's source does not have
1664 There is no version of this function that takes an N argument because
1665 lookahead only works to the end of a command and any given command is always
1666 within a single syntax file. */
1668 lex_get_file_name (const struct lexer *lexer)
1670 struct lex_source *src = lex_source__ (lexer);
1671 return src == NULL ? NULL : src->reader->file_name;
1674 /* Returns a newly allocated msg_location for the syntax that represents tokens
1675 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1676 must eventually free the location (with msg_location_destroy()). */
1677 struct msg_location *
1678 lex_get_location (const struct lexer *lexer, int n0, int n1)
1680 struct msg_location *loc = xmalloc (sizeof *loc);
1681 *loc = (struct msg_location) {
1682 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1683 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1684 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1685 .src = lex_source__ (lexer),
1687 lex_source_ref (loc->src);
1692 lex_get_encoding (const struct lexer *lexer)
1694 struct lex_source *src = lex_source__ (lexer);
1695 return src == NULL ? NULL : src->reader->encoding;
1698 /* Returns the syntax mode for the syntax file from which the current drawn is
1699 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1700 does not have line numbers.
1702 There is no version of this function that takes an N argument because
1703 lookahead only works to the end of a command and any given command is always
1704 within a single syntax file. */
1706 lex_get_syntax_mode (const struct lexer *lexer)
1708 struct lex_source *src = lex_source__ (lexer);
1709 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1712 /* Returns the error mode for the syntax file from which the current drawn is
1713 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1714 source does not have line numbers.
1716 There is no version of this function that takes an N argument because
1717 lookahead only works to the end of a command and any given command is always
1718 within a single syntax file. */
1720 lex_get_error_mode (const struct lexer *lexer)
1722 struct lex_source *src = lex_source__ (lexer);
1723 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1726 /* If the source that LEXER is currently reading has error mode
1727 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1728 token to be read comes directly from whatever is next read from the stream.
1730 It makes sense to call this function after encountering an error in a
1731 command entered on the console, because usually the user would prefer not to
1732 have cascading errors. */
/* Acts only when LEXER has a current source whose reader's error mode is
   LEX_ERROR_TERMINAL.  In that case the source's journal and segmenter
   positions are rewound to 0, the pending newline suppression is cleared, the
   segmenter is reinitialized in the mode it already had, and all three token
   stages ('pp', 'merge', and the parse array) are emptied.  Finally a T_ENDCMD
   token is pushed as the sole parse token, so that the source is in the same
   token state that lex_source_create() establishes. */
1734 lex_interactive_reset (struct lexer *lexer)
1736 struct lex_source *src = lex_source__ (lexer);
1737 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1740 src->journal_pos = src->seg_pos = 0;
1742 src->suppress_next_newline = false;
1743 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1745 lex_stage_clear (&src->pp);
1746 lex_stage_clear (&src->merge);
1747 lex_source_clear_parse (src);
1748 lex_source_push_endcmd__ (src);
1752 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1754 lex_discard_rest_of_command (struct lexer *lexer)
1756 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1760 /* Discards all lookahead tokens in LEXER, then discards all input sources
1761 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1762 runs out of input sources. */
1764 lex_discard_noninteractive (struct lexer *lexer)
1766 struct lex_source *src = lex_source__ (lexer);
1770 lex_stage_clear (&src->pp);
1771 lex_stage_clear (&src->merge);
1772 lex_source_clear_parse (src);
1774 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1775 src = lex_source__ (lexer))
1777 ll_remove (&src->ll);
1778 lex_source_unref (src);
1784 lex_source_expand__ (struct lex_source *src)
1786 if (src->length >= src->allocated)
1787 src->buffer = x2realloc (src->buffer, &src->allocated);
/* Reads more input from SRC's reader into SRC's buffer.

   Each pass makes sure the buffer has free space (lex_source_expand__()),
   then asks the reader class's 'read' function to fill the free space that
   starts at SRC->length, passing the prompt style that the segmenter
   currently calls for.  The reader must not return more bytes than the free
   space it was offered.  The reader's 'eof' flag is set in the end-of-input
   path.  The loop's exit test looks for a new-line anywhere in the part of
   the buffer that has not been segmented yet (from SRC->seg_pos up to
   SRC->length). */
1791 lex_source_read__ (struct lex_source *src)
1795 lex_source_expand__ (src);
1797 size_t space = src->allocated - src->length;
1798 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1799 size_t n = src->reader->class->read (src->reader,
1800 &src->buffer[src->length],
1802 assert (n <= space);
1807 src->reader->eof = true;
1813 while (!memchr (&src->buffer[src->seg_pos], '\n',
1814 src->length - src->seg_pos));
1817 static struct lex_source *
1818 lex_source__ (const struct lexer *lexer)
1820 return (ll_is_empty (&lexer->sources) ? NULL
1821 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1824 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1825 OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are
1826 both zero, this requests the syntax for the first token in the current
1827 command.) The caller must eventually free the returned string (with
1828 free()). The syntax is encoded in UTF-8 and in the original form supplied
1829 to the lexer so that, for example, it may include comments, spaces, and
1830 new-lines if it spans multiple tokens. Macro expansion, however, has
1831 already been performed. */
1833 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1835 struct string s = DS_EMPTY_INITIALIZER;
1836 for (size_t i = ofs0; i <= ofs1; )
1838 /* Find [I,J) as the longest sequence of tokens not produced by macro
1839 expansion, or otherwise the longest sequence expanded from a single
1841 const struct lex_token *first = lex_source_ofs__ (src, i);
1843 for (j = i + 1; j <= ofs1; j++)
1845 const struct lex_token *cur = lex_source_ofs__ (src, j);
1846 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1847 || first->macro_rep != cur->macro_rep)
1850 const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1852 /* Now add the syntax for this sequence of tokens to SRC. */
1853 if (!ds_is_empty (&s))
1854 ds_put_byte (&s, ' ');
1855 if (!first->macro_rep)
1857 size_t start = first->token_pos;
1858 size_t end = last->token_pos + last->token_len;
1859 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1863 size_t start = first->ofs;
1864 size_t end = last->ofs + last->len;
1865 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1871 return ds_steal_cstr (&s);
1875 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1877 for (int i = n0; i <= n1; i++)
1878 if (lex_source_next__ (src, i)->macro_rep)
1883 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1884 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1885 other tokens included in that range. The syntax is encoded in UTF-8 and in
1886 the original form supplied to the lexer so that, for example, it may include
1887 comments, spaces, and new-lines if it spans multiple tokens.
1889 Returns an empty string if the token range doesn't include a macro call.
1891 The caller must not modify or free the returned string. */
1892 static struct substring
1893 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1895 if (!lex_source_contains_macro_call (src, n0, n1))
1898 const struct lex_token *token0 = lex_source_next__ (src, n0);
1899 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1900 size_t start = token0->token_pos;
1901 size_t end = token1->token_pos + token1->token_len;
1903 return ss_buffer (&src->buffer[start], end - start);
/* Composes a syntax error message about tokens N0...N1 (relative to the
   current token) in SRC and packages it as a struct msg.

   The message starts with a description of where the error is.  If token N0
   is T_ENDCMD, that is "Syntax error at end of command".  Otherwise the
   offending syntax and the macro call it was expanded from (if any) are each
   ellipsized with str_ellipsize(), and the wording is chosen from four
   variants that mention the syntax, the macro call, both, or neither.

   FORMAT and ARGS, formatted printf-style, are then appended after ": ", and
   a final '.' is added if the text does not already end with one.

   The resulting message has category MSG_C_SYNTAX and severity MSG_S_ERROR,
   and its location spans tokens N0...N1. */
1907 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1908 const char *format, va_list args)
1910 const struct lex_token *token;
1915 token = lex_source_next__ (src, n0);
1916 if (token->token.type == T_ENDCMD)
1917 ds_put_cstr (&s, _("Syntax error at end of command"));
1920 /* Get the syntax that caused the error. */
1921 char *raw_syntax = lex_source_syntax__ (src, n0 + src->parse_ofs,
1922 n1 + src->parse_ofs);
1924 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1927 /* Get the macro call(s) that expanded to the syntax that caused the
1930 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1937 _("Syntax error at `%s' (in expansion of `%s')"),
1940 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1945 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1948 ds_put_cstr (&s, _("Syntax error"));
1954 ds_put_cstr (&s, ": ");
1955 ds_put_vformat (&s, format, args);
1957 if (ds_last (&s) != '.')
1958 ds_put_byte (&s, '.');
1960 struct msg *m = xmalloc (sizeof *m);
1962 .category = MSG_C_SYNTAX,
1963 .severity = MSG_S_ERROR,
1964 .location = lex_source_get_location (src, n0, n1),
1965 .text = ds_steal_cstr (&s),
/* Builds a syntax error message for TOKEN, a token in SRC that could not be
   tokenized (lex_source_try_get_pp() calls this for TOKENIZE_ERROR).

   The message quotes TOKEN's own source text, ellipsized with
   str_ellipsize(), followed by ": " and TOKEN's string.
   NOTE(review): presumably the tokenizer stores its error description in the
   token's string in this case; confirm against token_from_segment().

   The message has category MSG_C_SYNTAX and severity MSG_S_ERROR, and its
   location covers just TOKEN. */
1971 lex_get_error (struct lex_source *src, const struct lex_token *token)
1974 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1975 syntax, sizeof syntax);
1977 struct string s = DS_EMPTY_INITIALIZER;
1978 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1979 ds_put_format (&s, ": %s", token->token.string.string);
1981 struct msg *m = xmalloc (sizeof *m);
1983 .category = MSG_C_SYNTAX,
1984 .severity = MSG_S_ERROR,
1985 .location = lex_token_location_rw (src, token, token),
1986 .text = ds_steal_cstr (&s),
1991 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1992 underlying lex_reader if necessary. Returns true if a new token was added
1993 to SRC's deque, false otherwise. The caller should retry failures unless
1994 SRC's 'eof' marker was set to true indicating that there will be no more
1995 tokens from this source. */
/* Implementation outline:

   1. Allocates a lex_token that starts at SRC->seg_pos, initially of type
      T_STOP and not associated with any macro expansion.

   2. Asks the segmenter for the next segment of the unsegmented part of the
      buffer, calling lex_source_read__() for more input whenever the
      segmenter cannot produce a segment from what is buffered.

   3. Advances SRC->seg_pos past the segment and, for a SEG_NEWLINE segment,
      appends the new position to SRC->lines[] (growing the array as needed),
      which is what the offset-to-line-number lookup relies on.

   4. Converts the segment to a token with token_from_segment().

   5. Submits finished lines of syntax to the output engine as
      TEXT_ITEM_SYNTAX items, advancing SRC->journal_pos past each one.
      SRC->suppress_next_newline coordinates the SEG_END_COMMAND and
      SEG_NEWLINE cases.

   6. Dispatches on the tokenizer result: TOKENIZE_ERROR reports the error
      with lex_get_error(), TOKENIZE_EMPTY destroys the token, and
      TOKENIZE_TOKEN turns a T_STOP token into T_ENDCMD and appends the token
      to SRC->pp. */
1997 lex_source_try_get_pp (struct lex_source *src)
1999 /* Append a new token to SRC and initialize it. */
2000 struct lex_token *token = xmalloc (sizeof *token);
2001 token->token = (struct token) { .type = T_STOP };
2002 token->macro_rep = NULL;
2003 token->ref_cnt = NULL;
2004 token->token_pos = src->seg_pos;
2006 /* Extract a segment. */
2007 const char *segment;
2008 enum segment_type seg_type;
2012 segment = &src->buffer[src->seg_pos];
2013 seg_len = segmenter_push (&src->segmenter, segment,
2014 src->length - src->seg_pos,
2015 src->reader->eof, &seg_type);
2019 /* The segmenter needs more input to produce a segment. */
2020 assert (!src->reader->eof);
2021 lex_source_read__ (src);
2024 /* Update state based on the segment. */
2025 token->token_len = seg_len;
2026 src->seg_pos += seg_len;
2027 if (seg_type == SEG_NEWLINE)
2029 if (src->n_lines >= src->allocated_lines)
2030 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2031 sizeof *src->lines);
2032 src->lines[src->n_lines++] = src->seg_pos;
2035 /* Get a token from the segment. */
2036 enum tokenize_result result = token_from_segment (
2037 seg_type, ss_buffer (segment, seg_len), &token->token);
2039 /* If we've reached the end of a line, or the end of a command, then pass
2040 the line to the output engine as a syntax text item. */
2041 int n_lines = seg_type == SEG_NEWLINE;
2042 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2045 src->suppress_next_newline = true;
2047 else if (n_lines > 0 && src->suppress_next_newline)
2050 src->suppress_next_newline = false;
2052 for (int i = 0; i < n_lines; i++)
2054 /* Beginning of line. */
2055 const char *line = &src->buffer[src->journal_pos];
2057 /* Calculate line length, including \n or \r\n end-of-line if present.
2059 We use src->length even though that may be beyond what we've actually
2060 converted to tokens. That's because, if we're emitting the line due
2061 to SEG_END_COMMAND, we want to take the whole line through the
2062 newline, not just through the '.'. */
2063 size_t max_len = src->length - src->journal_pos;
2064 const char *newline = memchr (line, '\n', max_len);
2065 size_t line_len = newline ? newline - line + 1 : max_len;
2067 /* Calculate line length excluding end-of-line. */
2068 size_t copy_len = line_len;
2069 if (copy_len > 0 && line[copy_len - 1] == '\n')
2071 if (copy_len > 0 && line[copy_len - 1] == '\r')
2074 /* Submit the line as syntax. */
2075 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2076 xmemdup0 (line, copy_len),
2079 src->journal_pos += line_len;
2084 case TOKENIZE_ERROR:
2085 lex_get_error (src, token);
2087 case TOKENIZE_EMPTY:
2088 lex_token_destroy (token);
2091 case TOKENIZE_TOKEN:
2092 if (token->token.type == T_STOP)
2094 token->token.type = T_ENDCMD;
2097 lex_stage_push_last (&src->pp, token);
2103 /* Attempts to append a new token to SRC. Returns true if successful, false on
2104 failure. On failure, the end of SRC has been reached and no more tokens
2105 will be forthcoming from it.
2107 Does not make the new token available for lookahead yet; the caller must
2108 adjust SRC's 'middle' pointer to do so. */
/* Wrapper around lex_source_try_get_pp(), whose own documentation asks
   callers to retry failed attempts until SRC's 'eof' marker is set.  The
   success of an attempt is what this function tests. */
2110 lex_source_get_pp (struct lex_source *src)
2113 if (lex_source_try_get_pp (src))
/* Tries to move tokens from SRC_'s 'pp' stage into its 'merge' stage,
   performing macro expansion along the way.

   If 'pp' is empty, a token is first obtained with lex_source_get_pp().  When
   macro expansion is disabled (settings_get_mexpand() is false), every token
   in 'pp' is shifted to 'merge' unchanged.

   Otherwise a macro_call is started from the first token in 'pp' and fed
   further tokens from 'pp', each with its source syntax and location, until
   the macro expander reports a result.  If there turns out to be no macro
   call, only the first token is shifted to 'merge'.  If the first N_CALL
   tokens are a macro call, the call is expanded and each token of the
   expansion is appended to 'merge'.  Every such token records the source
   span of the whole call (from the start of the call's first token through
   the end of its last) and shares one 'macro_rep' string holding the
   expansion's syntax.  The tokens of the call itself are then removed from
   'pp'.  In this last case the return value is whether the expansion
   contained any tokens. */
2119 lex_source_try_get_merge (const struct lex_source *src_)
2121 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2123 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2126 if (!settings_get_mexpand ())
2128 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2132 /* Now pass tokens one-by-one to the macro expander.
2134 In the common case where there is no macro to expand, the loop is not
2136 struct macro_call *mc;
2137 int n_call = macro_call_create (src->lexer->macros,
2138 &lex_stage_first (&src->pp)->token, &mc);
2139 for (int ofs = 1; !n_call; ofs++)
2141 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2143 /* This should not be reachable because we always get a T_ENDCMD at
2144 the end of an input file (transformed from T_STOP by
2145 lex_source_try_get_pp()) and the macro_expander should always
2146 terminate expansion on T_ENDCMD. */
2150 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2151 const struct macro_token mt = {
2153 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2155 const struct msg_location loc = lex_token_location (src, t, t);
2156 n_call = macro_call_add (mc, &mt, &loc);
2160 /* False alarm: no macro expansion after all. Use first token as
2161 lookahead. We'll retry macro expansion from the second token next
2163 macro_call_destroy (mc);
2164 lex_stage_shift (&src->merge, &src->pp, 1);
2168 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2169 are a macro call. (These are likely to be the only tokens in 'pp'.)
2171 const struct lex_token *c0 = lex_stage_first (&src->pp);
2172 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2173 struct macro_tokens expansion = { .n = 0 };
2174 struct msg_location loc = lex_token_location (src, c0, c1);
2175 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2176 macro_call_destroy (mc);
2178 /* Convert the macro expansion into syntax for possible error messages
2180 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2181 size_t *len = xnmalloc (expansion.n, sizeof *len);
2182 struct string s = DS_EMPTY_INITIALIZER;
2183 macro_tokens_to_syntax (&expansion, &s, ofs, len);
2185 if (settings_get_mprint ())
2186 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2187 _("Macro Expansion")));
2189 /* Append the macro expansion tokens to the lookahead. */
2190 if (expansion.n > 0)
2192 char *macro_rep = ds_steal_cstr (&s);
2193 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2194 *ref_cnt = expansion.n;
2195 for (size_t i = 0; i < expansion.n; i++)
2197 struct lex_token *token = xmalloc (sizeof *token);
2198 *token = (struct lex_token) {
2199 .token = expansion.mts[i].token,
2200 .token_pos = c0->token_pos,
2201 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2202 .macro_rep = macro_rep,
2207 lex_stage_push_last (&src->merge, token);
2209 ss_dealloc (&expansion.mts[i].syntax);
2214 free (expansion.mts);
2218 /* Destroy the tokens for the call. */
2219 for (size_t i = 0; i < n_call; i++)
2220 lex_stage_pop_first (&src->pp);
2222 return expansion.n > 0;
2225 /* Attempts to obtain at least one new token into 'merge' in SRC.
2227 Returns true if successful, false on failure. In the latter case, SRC is
2228 exhausted and 'src->eof' is now true. */
/* Wrapper around lex_source_try_get_merge().  A single attempt there can
   legitimately add nothing to 'merge' (for example, a macro call with an
   empty expansion), so the success of an attempt is tested here. */
2230 lex_source_get_merge (struct lex_source *src)
2233 if (lex_source_try_get_merge (src))
2238 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2240 Returns true if successful, false on failure. In the latter case, SRC is
2241 exhausted and 'src->eof' is now true. */
/* Feeds tokens from SRC's 'merge' stage, one at a time and in order, to a
   merger (merger_add()), fetching more tokens into 'merge' with
   lex_source_get_merge() whenever the next one needed is not there yet.

   Depending on the merger's verdict, either the first token in 'merge' is
   moved to the parse array unchanged, or, when the result RETVAL is positive,
   a new token is created to replace the merged tokens.  The new token's
   source span runs from the start of the first merged token through the end
   of the last.  It keeps macro information ('macro_rep', 'ofs', 'len',
   'ref_cnt') only if the first and last merged tokens came from the same
   macro expansion.  The new token is pushed onto the parse array and RETVAL
   tokens are then popped from 'merge'. */
2243 lex_source_get_parse (struct lex_source *src)
2245 struct merger m = MERGER_INIT;
2247 for (size_t i = 0; ; i++)
2249 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2251 /* We always get a T_ENDCMD at the end of an input file
2252 (transformed from T_STOP by lex_source_try_get_pp()) and
2253 merger_add() should never return -1 on T_ENDCMD. */
2254 assert (lex_stage_is_empty (&src->merge));
2258 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2262 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2265 else if (retval > 0)
2267 /* Add a token that merges all the tokens together. */
2268 const struct lex_token *first = lex_stage_first (&src->merge);
2269 const struct lex_token *last = lex_stage_nth (&src->merge,
2271 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2272 struct lex_token *t = xmalloc (sizeof *t);
2273 *t = (struct lex_token) {
2275 .token_pos = first->token_pos,
2276 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2278 /* This works well if all the tokens were not expanded from macros,
2279 or if they came from the same macro expansion. It just gives up
2280 in the other (corner) cases. */
2281 .macro_rep = macro ? first->macro_rep : NULL,
2282 .ofs = macro ? first->ofs : 0,
2283 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2284 .ref_cnt = macro ? first->ref_cnt : NULL,
2288 lex_source_push_parse (src, t);
2290 for (int i = 0; i < retval; i++)
2291 lex_stage_pop_first (&src->merge);
2298 lex_source_push_endcmd__ (struct lex_source *src)
2300 assert (src->n_parse == 0);
2302 struct lex_token *token = xmalloc (sizeof *token);
2303 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2304 lex_source_push_parse (src, token);
2308 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2310 if (src->n_parse >= src->allocated_parse)
2311 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2312 sizeof *src->parse);
2313 src->parse[src->n_parse++] = token;
2317 lex_source_clear_parse (struct lex_source *src)
2319 for (size_t i = 0; i < src->n_parse; i++)
2320 lex_token_destroy (src->parse[i]);
2321 src->n_parse = src->parse_ofs = 0;
/* Allocates and initializes a new lex_source for LEXER that reads from
   READER.

   The line-offset array starts out with room for 4 entries, and the
   segmenter is initialized in READER's syntax mode.  The last step pushes an
   initial T_ENDCMD token onto the new source's parse array with
   lex_source_push_endcmd__(). */
2324 static struct lex_source *
2325 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2327 size_t allocated_lines = 4;
2328 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2331 struct lex_source *src = xmalloc (sizeof *src);
2332 *src = (struct lex_source) {
2335 .segmenter = segmenter_init (reader->syntax, false),
2339 .allocated_lines = allocated_lines,
2342 lex_source_push_endcmd__ (src);
/* Installs a message handler with msg_set_handler().

   The handler is built from OUTPUT_MSG as the output callback, together with
   this file's lex_source_ref(), lex_source_unref(), and lex_source_get_line()
   as the callbacks for working with the lex_source attached to a message's
   location. */
2348 lex_set_message_handler (struct lexer *lexer,
2349 void (*output_msg) (const struct msg *,
2352 struct msg_handler msg_handler = {
2353 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2355 .lex_source_ref = lex_source_ref,
2356 .lex_source_unref = lex_source_unref,
2357 .lex_source_get_line = lex_source_get_line,
2359 msg_set_handler (&msg_handler);
/* Reference-counting counterpart of lex_source_unref() for SRC_.

   The parameter is const-qualified so that holders of a const pointer can
   use this function; the qualifier is cast away internally.  A source that
   is still alive must already have at least one reference, which the
   assertion checks. */
2363 lex_source_ref (const struct lex_source *src_)
2365 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2368 assert (src->n_refs > 0);
/* Drops one reference to SRC.  If other references remain after the
   decrement, nothing else is done past that test.

   Otherwise SRC is torn down.  The reader's file name and encoding pointers
   are saved in locals before the reader class's 'destroy' function (if it has
   one) is called on the reader, so they remain available afterward.  The
   'pp' and 'merge' stages are uninitialized and the tokens in the parse array
   are destroyed. */
2374 lex_source_unref (struct lex_source *src)
2379 assert (src->n_refs > 0);
2380 if (--src->n_refs > 0)
2383 char *file_name = src->reader->file_name;
2384 char *encoding = src->reader->encoding;
2385 if (src->reader->class->destroy != NULL)
2386 src->reader->class->destroy (src->reader);
2391 lex_stage_uninit (&src->pp);
2392 lex_stage_uninit (&src->merge);
2393 lex_source_clear_parse (src);
2398 struct lex_file_reader
2400 struct lex_reader reader;
2401 struct u8_istream *istream;
2404 static struct lex_reader_class lex_file_reader_class;
2406 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2407 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2408 ENCODING, which should take one of the forms accepted by
2409 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2410 mode of the new reader, respectively.
2412 Returns a null pointer if FILE_NAME cannot be opened. */
2414 lex_reader_for_file (const char *file_name, const char *encoding,
2415 enum segmenter_mode syntax,
2416 enum lex_error_mode error)
2418 struct lex_file_reader *r;
2419 struct u8_istream *istream;
2421 istream = (!strcmp(file_name, "-")
2422 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2423 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2424 if (istream == NULL)
2426 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2430 r = xmalloc (sizeof *r);
2431 lex_reader_init (&r->reader, &lex_file_reader_class);
2432 r->reader.syntax = syntax;
2433 r->reader.error = error;
2434 r->reader.file_name = xstrdup (file_name);
2435 r->reader.encoding = xstrdup_if_nonnull (encoding);
2436 r->reader.line_number = 1;
2437 r->istream = istream;
2442 static struct lex_file_reader *
2443 lex_file_reader_cast (struct lex_reader *r)
2445 return UP_CAST (r, struct lex_file_reader, reader);
/* 'read' function for a lex_file_reader: reads up to N bytes from the
   reader's u8_istream into BUF.  PROMPT_STYLE is unused.

   A read error is reported with msg(), naming the reader's file and the
   errno description. */
2449 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2450 enum prompt_style prompt_style UNUSED)
2452 struct lex_file_reader *r = lex_file_reader_cast (r_);
2453 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2456 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Releases the u8_istream behind R_, which must be a lex_file_reader.

   A stream on a descriptor other than stdin is closed with
   u8_istream_close(), and a failure to close is reported with msg().  A
   stream on stdin is only freed with u8_istream_free().
   NOTE(review): presumably that leaves the stdin descriptor itself open;
   confirm against u8_istream_free(). */
2463 lex_file_close (struct lex_reader *r_)
2465 struct lex_file_reader *r = lex_file_reader_cast (r_);
2467 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2469 if (u8_istream_close (r->istream) != 0)
2470 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2473 u8_istream_free (r->istream);
/* Definition of the class for file-backed readers, declared earlier in this
   file.  NOTE(review): the initializer is not visible in this excerpt;
   presumably it names lex_file_read and lex_file_close -- confirm. */
2478 static struct lex_reader_class lex_file_reader_class =
/* State for a lex_reader that obtains its input from an in-memory string.
   lex_string_reader_cast() recovers this structure from a pointer to its
   `reader' member.

   NOTE(review): only the `reader' member is visible in this excerpt.
   lex_string_read() also accesses members named `s' (through `s.string' and
   `s.length') and `offset', so they are presumably declared here too. */
2484 struct lex_string_reader
2486 struct lex_reader reader;          /* Generic reader; passed to lex_reader_init(). */
/* Forward declaration of the class for string-backed readers, so that
   lex_reader_for_substring_nocopy() can pass its address to
   lex_reader_init() ahead of the definition that appears later in this
   file. */
2491 static struct lex_reader_class lex_string_reader_class;
2493 /* Creates and returns a new lex_reader for the contents of S, which must
   be encoded in the given ENCODING.  The new reader takes ownership of S and
   will free it with ss_dealloc() when it is closed.

   The reader's syntax mode is set to SEG_MODE_AUTO, and its encoding is set
   to the result of xstrdup_if_nonnull (ENCODING).

   NOTE(review): the statements that store S in the reader and return the
   result are not visible in this excerpt. */
2497 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2499 struct lex_string_reader *r;
2501 r = xmalloc (sizeof *r);
2502 lex_reader_init (&r->reader, &lex_string_reader_class);
2503 r->reader.syntax = SEG_MODE_AUTO;
2504 r->reader.encoding = xstrdup_if_nonnull (encoding);
2511 /* Creates and returns a new lex_reader for a copy of null-terminated
   string S, which must be encoded in ENCODING.  The caller retains ownership
   of S.

   The copy is made with ss_alloc_substring() and then handed to
   lex_reader_for_substring_nocopy(), whose result is returned. */
2514 lex_reader_for_string (const char *s, const char *encoding)
2516 struct substring ss;
2517 ss_alloc_substring (&ss, ss_cstr (s));
2518 return lex_reader_for_substring_nocopy (ss, encoding);
2521 /* Formats FORMAT as a printf()-like format string and creates and returns
   a new lex_reader for the formatted result, which is treated as being
   encoded in ENCODING.  The arguments consumed by FORMAT follow ENCODING in
   the argument list (the variable arguments start after ENCODING).

   The string produced by xvasprintf() is wrapped with ss_cstr() and passed
   to lex_reader_for_substring_nocopy().
   NOTE(review): the declaration of `args', the matching va_end(), and the
   return statement are not visible in this excerpt. */
2524 lex_reader_for_format (const char *format, const char *encoding, ...)
2526 struct lex_reader *r;
2529 va_start (args, encoding);
2530 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Given R, a pointer to the `reader' member of a struct lex_string_reader,
   returns a pointer to the enclosing struct lex_string_reader.
   NOTE(review): this is only meaningful if R really is embedded in a
   lex_string_reader; UP_CAST is defined elsewhere, so any checking it does
   is not visible here. */
2536 static struct lex_string_reader *
2537 lex_string_reader_cast (struct lex_reader *r)
2539 return UP_CAST (r, struct lex_string_reader, reader);
/* Read function for string-backed readers.  Copies into BUF the smaller of
   N bytes and the number of bytes that remain in the reader's string beyond
   its current offset.  PROMPT_STYLE is unused.

   NOTE(review): the return type, the declaration of `chunk', the update of
   the reader's offset, and the return statement are not visible in this
   excerpt.  Presumably the offset advances by the number of bytes copied
   and that count is returned -- confirm. */
2543 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2544 enum prompt_style prompt_style UNUSED)
2546 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Never copy past the end of the string. */
2549 chunk = MIN (n, r->s.length - r->offset);
2550 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string-backed readers.
   NOTE(review): only the cast from R_ to the string reader is visible in
   this excerpt.  The comment on lex_reader_for_substring_nocopy() says the
   string is freed with ss_dealloc() when the reader is closed, so that
   presumably happens here -- confirm. */
2557 lex_string_close (struct lex_reader *r_)
2559 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Definition of the class for string-backed readers, declared earlier in
   this file.  NOTE(review): the initializer is not visible in this excerpt;
   presumably it names lex_string_read and lex_string_close -- confirm. */
2565 static struct lex_reader_class lex_string_reader_class =
/* Returns the text of line number LINE within SRC as a substring that points
   into SRC's buffer (it is built with ss_buffer(), not copied).  LINE is
   1-based: valid values run from 1 to SRC->n_lines.

   The line starts at the offset recorded in SRC->lines[LINE - 1] and extends
   to the start of the following line or, for the last line, to SRC->length.

   NOTE(review): what is returned for an out-of-range LINE is not visible in
   this excerpt; presumably an empty substring -- confirm. */
2572 lex_source_get_line (const struct lex_source *src, int line)
2574 if (line < 1 || line > src->n_lines)
2577 size_t ofs = src->lines[line - 1];
/* The last line has no successor entry in SRC->lines, so it ends at the end
   of the buffered source. */
2578 size_t end = line >= src->n_lines ? src->length : src->lines[line];
2579 return ss_buffer (&src->buffer[ofs], end - ofs);