1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
83 static struct msg_point lex_token_start_point (const struct lex_source *,
84 const struct lex_token *);
85 static struct msg_point lex_token_end_point (const struct lex_source *,
86 const struct lex_token *);
88 /* Source offset of the last byte in TOKEN. */
90 lex_token_end (const struct lex_token *token)
92 return token->token_pos + MAX (token->token_len, 1) - 1;
96 lex_token_destroy (struct lex_token *t)
98 token_uninit (&t->token);
101 assert (*t->ref_cnt > 0);
111 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
116 struct lex_token **tokens;
119 static void lex_stage_clear (struct lex_stage *);
120 static void lex_stage_uninit (struct lex_stage *);
122 static size_t lex_stage_count (const struct lex_stage *);
123 static bool lex_stage_is_empty (const struct lex_stage *);
125 static struct lex_token *lex_stage_first (struct lex_stage *);
126 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
128 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
129 static void lex_stage_pop_first (struct lex_stage *);
131 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
134 /* Deletes all the tokens from STAGE. */
136 lex_stage_clear (struct lex_stage *stage)
138 while (!deque_is_empty (&stage->deque))
139 lex_stage_pop_first (stage);
142 /* Deletes all the tokens from STAGE and frees storage for the deque. */
144 lex_stage_uninit (struct lex_stage *stage)
146 lex_stage_clear (stage);
147 free (stage->tokens);
150 /* Returns true if STAGE contains no tokens, otherwise false. */
152 lex_stage_is_empty (const struct lex_stage *stage)
154 return deque_is_empty (&stage->deque);
157 /* Returns the number of tokens in STAGE. */
159 lex_stage_count (const struct lex_stage *stage)
161 return deque_count (&stage->deque);
164 /* Returns the first token in STAGE, which must be nonempty.
165 The first token is the one accessed with the least lookahead. */
166 static struct lex_token *
167 lex_stage_first (struct lex_stage *stage)
169 return lex_stage_nth (stage, 0);
172 /* Returns the token at the given INDEX in STAGE. The first token (with the least
173 lookahead) is 0, the second token is 1, and so on. There must be at least
174 INDEX + 1 tokens in STAGE. */
175 static struct lex_token *
176 lex_stage_nth (struct lex_stage *stage, size_t index)
178 return stage->tokens[deque_back (&stage->deque, index)];
181 /* Adds TOKEN so that it becomes the last token in STAGE. */
183 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
185 if (deque_is_full (&stage->deque))
186 stage->tokens = deque_expand (&stage->deque, stage->tokens,
187 sizeof *stage->tokens);
188 stage->tokens[deque_push_front (&stage->deque)] = token;
191 /* Removes and returns the first token from STAGE. */
192 static struct lex_token *
193 lex_stage_take_first (struct lex_stage *stage)
195 return stage->tokens[deque_pop_back (&stage->deque)];
198 /* Removes the first token from STAGE and uninitializes it. */
200 lex_stage_pop_first (struct lex_stage *stage)
202 lex_token_destroy (lex_stage_take_first (stage));
205 /* Removes the first N tokens from SRC, appending them to DST as the last
208 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
210 for (size_t i = 0; i < n; i++)
211 lex_stage_push_last (dst, lex_stage_take_first (src));
214 /* A source of tokens, corresponding to a syntax file.
216 This is conceptually a lex_reader wrapped with everything needed to convert
217 its UTF-8 bytes into tokens. */
220 struct ll ll; /* In lexer's list of sources. */
224 - One for struct lexer.
226 - One for each struct msg_location that references this source. */
229 struct lex_reader *reader;
231 struct segmenter segmenter;
232 bool eof; /* True if T_STOP was read from 'reader'. */
234 /* Buffer of UTF-8 bytes. */
235 char *buffer; /* Source file contents. */
236 size_t length; /* Number of bytes filled. */
237 size_t allocated; /* Number of bytes allocated. */
239 /* Offsets into 'buffer'. */
240 size_t journal_pos; /* First byte not yet output to journal. */
241 size_t seg_pos; /* First byte not yet scanned as token. */
243 /* Offset into 'buffer' of starts of lines. */
245 size_t n_lines, allocated_lines;
247 bool suppress_next_newline;
251 This is a pipeline with the following stages. Each token eventually
252 made available to the parser passes through all of these stages. The stages
253 are named after the processing that happens in each one.
255 Initially, tokens come from the segmenter and scanner to 'pp':
257 - pp: Tokens that need to pass through the macro preprocessor to end up
260 - merge: Tokens that need to pass through scan_merge() to end up in
263 - parse: Tokens available to the client for parsing.
265 'pp' and 'merge' store tokens only temporarily until they pass into
266 'parse'. Tokens then live in 'parse' until the command is fully
267 consumed, at which time they are freed together. */
269 struct lex_stage merge;
270 struct lex_token **parse;
271 size_t n_parse, allocated_parse, parse_ofs;
274 static struct lex_source *lex_source_create (struct lexer *,
275 struct lex_reader *);
280 struct ll_list sources; /* Contains "struct lex_source"s. */
281 struct macro_set *macros;
284 static struct lex_source *lex_source__ (const struct lexer *);
285 static char *lex_source_syntax__ (const struct lex_source *,
287 static const struct lex_token *lex_next__ (const struct lexer *, int n);
288 static void lex_source_push_endcmd__ (struct lex_source *);
289 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
290 static void lex_source_clear_parse (struct lex_source *);
292 static bool lex_source_get_parse (struct lex_source *);
293 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
294 const char *format, va_list)
295 PRINTF_FORMAT (4, 0);
296 static const struct lex_token *lex_source_next__ (const struct lex_source *,
299 /* Initializes READER with the specified CLASS and otherwise some reasonable
300 defaults. The caller should fill in the other members as desired. */
302 lex_reader_init (struct lex_reader *reader,
303 const struct lex_reader_class *class)
305 reader->class = class;
306 reader->syntax = SEG_MODE_AUTO;
307 reader->error = LEX_ERROR_CONTINUE;
308 reader->file_name = NULL;
309 reader->encoding = NULL;
310 reader->line_number = 0;
314 /* Frees any file name already in READER and replaces it by a copy of
315 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
317 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
319 free (reader->file_name);
320 reader->file_name = xstrdup_if_nonnull (file_name);
323 /* Creates and returns a new lexer. */
327 struct lexer *lexer = xmalloc (sizeof *lexer);
328 *lexer = (struct lexer) {
329 .sources = LL_INITIALIZER (lexer->sources),
330 .macros = macro_set_create (),
335 /* Destroys LEXER. */
337 lex_destroy (struct lexer *lexer)
341 struct lex_source *source, *next;
343 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
345 ll_remove (&source->ll);
346 lex_source_unref (source);
348 macro_set_destroy (lexer->macros);
353 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
354 same name. Takes ownership of M. */
356 lex_define_macro (struct lexer *lexer, struct macro *m)
358 macro_set_add (lexer->macros, m);
361 /* Inserts READER into LEXER so that the next token read by LEXER comes from
362 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
365 lex_include (struct lexer *lexer, struct lex_reader *reader)
367 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
368 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
371 /* Appends READER to LEXER, so that it will be read after all other current
372 readers have already been read. */
374 lex_append (struct lexer *lexer, struct lex_reader *reader)
376 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
381 /* Advances LEXER to the next token, consuming the current token. */
383 lex_get (struct lexer *lexer)
385 struct lex_source *src;
387 src = lex_source__ (lexer);
391 if (src->parse_ofs < src->n_parse)
393 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
394 lex_source_clear_parse (src);
399 while (src->parse_ofs == src->n_parse)
400 if (!lex_source_get_parse (src))
402 ll_remove (&src->ll);
403 lex_source_unref (src);
404 src = lex_source__ (lexer);
410 /* Advances LEXER by N tokens. */
412 lex_get_n (struct lexer *lexer, size_t n)
418 /* Issuing errors. */
420 /* Prints a syntax error message containing the current token and
421 the message formatted from FORMAT and its arguments (if FORMAT is non-null). */
423 lex_error (struct lexer *lexer, const char *format, ...)
427 va_start (args, format);
428 lex_next_error_valist (lexer, 0, 0, format, args);
432 /* Prints a syntax error message containing the current token and
433 the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
435 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
437 lex_next_error_valist (lexer, 0, 0, format, args);
440 /* Prints a syntax error message containing the current token and
441 given message MESSAGE (if non-null). */
443 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
447 va_start (args, format);
448 lex_next_error_valist (lexer, n0, n1, format, args);
452 /* Prints a syntax error message saying that one of the strings provided as
453 varargs, up to the first NULL, is expected. */
455 (lex_error_expecting) (struct lexer *lexer, ...)
459 va_start (args, lexer);
460 lex_error_expecting_valist (lexer, args);
464 /* Prints a syntax error message saying that one of the options provided in
465 ARGS, up to the first NULL, is expected. */
467 lex_error_expecting_valist (struct lexer *lexer, va_list args)
469 enum { MAX_OPTIONS = 9 };
470 const char *options[MAX_OPTIONS];
472 while (n < MAX_OPTIONS)
474 const char *option = va_arg (args, const char *);
478 options[n++] = option;
480 lex_error_expecting_array (lexer, options, n);
484 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
489 lex_error (lexer, NULL);
493 lex_error (lexer, _("expecting %s"), options[0]);
497 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
501 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
506 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
507 options[0], options[1], options[2], options[3]);
511 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
512 options[0], options[1], options[2], options[3], options[4]);
516 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
517 options[0], options[1], options[2], options[3], options[4],
522 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
523 options[0], options[1], options[2], options[3], options[4],
524 options[5], options[6]);
528 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
529 options[0], options[1], options[2], options[3], options[4],
530 options[5], options[6], options[7]);
534 lex_error (lexer, NULL);
538 /* Reports an error to the effect that subcommand SBC may only be specified
541 This function does not take a lexer as an argument or use lex_error(),
542 because the result would ordinarily just be redundant: "Syntax error at
543 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
544 not help the user find the error. */
546 lex_sbc_only_once (const char *sbc)
548 msg (SE, _("Subcommand %s may only be specified once."), sbc);
551 /* Reports an error to the effect that subcommand SBC is missing.
553 This function does not take a lexer as an argument or use lex_error(),
554 because a missing subcommand can normally be detected only after the whole
555 command has been parsed, and so lex_error() would always report "Syntax
556 error at end of command", which does not help the user find the error. */
558 lex_sbc_missing (const char *sbc)
560 msg (SE, _("Required subcommand %s was not specified."), sbc);
563 /* Reports an error to the effect that specification SPEC may only be specified
564 once within subcommand SBC. */
566 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
568 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
572 /* Reports an error to the effect that specification SPEC is missing within
575 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
577 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
581 /* Prints a syntax error message containing the current token and
582 given message MESSAGE (if non-null). */
584 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
585 const char *format, va_list args)
587 struct lex_source *src = lex_source__ (lexer);
590 lex_source_error_valist (src, n0, n1, format, args);
596 ds_put_format (&s, _("Syntax error at end of input"));
599 ds_put_cstr (&s, ": ");
600 ds_put_vformat (&s, format, args);
602 if (ds_last (&s) != '.')
603 ds_put_byte (&s, '.');
604 msg (SE, "%s", ds_cstr (&s));
609 /* Checks that we're at end of command.
610 If so, returns a successful command completion code.
611 If not, flags a syntax error and returns an error command
614 lex_end_of_command (struct lexer *lexer)
616 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
618 lex_error (lexer, _("expecting end of command"));
625 /* Token testing functions. */
627 /* Returns true if the current token is a number. */
629 lex_is_number (const struct lexer *lexer)
631 return lex_next_is_number (lexer, 0);
634 /* Returns true if the current token is a string. */
636 lex_is_string (const struct lexer *lexer)
638 return lex_next_is_string (lexer, 0);
641 /* Returns the value of the current token, which must be a
642 floating point number. */
644 lex_number (const struct lexer *lexer)
646 return lex_next_number (lexer, 0);
649 /* Returns true iff the current token is an integer. */
651 lex_is_integer (const struct lexer *lexer)
653 return lex_next_is_integer (lexer, 0);
656 /* Returns the value of the current token, which must be an
659 lex_integer (const struct lexer *lexer)
661 return lex_next_integer (lexer, 0);
664 /* Token testing functions with lookahead.
666 A value of 0 for N as an argument to any of these functions refers to the
667 current token. Lookahead is limited to the current command. Any N greater
668 than the number of tokens remaining in the current command will be treated
669 as referring to a T_ENDCMD token. */
671 /* Returns true if the token N ahead of the current token is a number. */
673 lex_next_is_number (const struct lexer *lexer, int n)
675 return token_is_number (lex_next (lexer, n));
678 /* Returns true if the token N ahead of the current token is a string. */
680 lex_next_is_string (const struct lexer *lexer, int n)
682 return token_is_string (lex_next (lexer, n));
685 /* Returns the value of the token N ahead of the current token, which must be a
686 floating point number. */
688 lex_next_number (const struct lexer *lexer, int n)
690 return token_number (lex_next (lexer, n));
693 /* Returns true if the token N ahead of the current token is an integer. */
695 lex_next_is_integer (const struct lexer *lexer, int n)
697 return token_is_integer (lex_next (lexer, n));
700 /* Returns the value of the token N ahead of the current token, which must be
703 lex_next_integer (const struct lexer *lexer, int n)
705 return token_integer (lex_next (lexer, n));
708 /* Token matching functions. */
710 /* If the current token has the specified TYPE, skips it and returns true.
711 Otherwise, returns false. */
713 lex_match (struct lexer *lexer, enum token_type type)
715 if (lex_token (lexer) == type)
724 /* If the current token matches IDENTIFIER, skips it and returns true.
725 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
728 IDENTIFIER must be an ASCII string. */
730 lex_match_id (struct lexer *lexer, const char *identifier)
732 return lex_match_id_n (lexer, identifier, 3);
735 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
736 may be abbreviated to its first N letters. Otherwise, returns false.
738 IDENTIFIER must be an ASCII string. */
740 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
742 if (lex_token (lexer) == T_ID
743 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
752 /* If the current token is integer X, skips it and returns true. Otherwise,
755 lex_match_int (struct lexer *lexer, int x)
757 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
766 /* Forced matches. */
768 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
769 abbreviated to its first 3 letters. Otherwise, reports an error and returns
772 IDENTIFIER must be an ASCII string. */
774 lex_force_match_id (struct lexer *lexer, const char *identifier)
776 if (lex_match_id (lexer, identifier))
780 lex_error_expecting (lexer, identifier);
785 /* If the current token has the specified TYPE, skips it and returns true.
786 Otherwise, reports an error and returns false. */
788 lex_force_match (struct lexer *lexer, enum token_type type)
790 if (lex_token (lexer) == type)
797 const char *type_string = token_type_to_string (type);
800 char *s = xasprintf ("`%s'", type_string);
801 lex_error_expecting (lexer, s);
805 lex_error_expecting (lexer, token_type_to_name (type));
811 /* If the current token is a string, does nothing and returns true.
812 Otherwise, reports an error and returns false. */
814 lex_force_string (struct lexer *lexer)
816 if (lex_is_string (lexer))
820 lex_error (lexer, _("expecting string"));
825 /* If the current token is a string or an identifier, does nothing and returns
826 true. Otherwise, reports an error and returns false.
828 This is meant for use in syntactic situations where we want to encourage the
829 user to supply a quoted string, but for compatibility we also accept
830 identifiers. (One example of such a situation is file names.) Therefore,
831 the error message issued when the current token is wrong only says that a
832 string is expected and doesn't mention that an identifier would also be
835 lex_force_string_or_id (struct lexer *lexer)
837 return lex_token (lexer) == T_ID || lex_force_string (lexer);
840 /* If the current token is an integer, does nothing and returns true.
841 Otherwise, reports an error and returns false. */
843 lex_force_int (struct lexer *lexer)
845 if (lex_is_integer (lexer))
849 lex_error (lexer, _("expecting integer"));
854 /* If the current token is an integer in the range MIN...MAX (inclusive), does
855 nothing and returns true. Otherwise, reports an error and returns false.
856 If NAME is nonnull, then it is used in the error message. */
858 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
860 bool is_number = lex_is_number (lexer);
861 bool is_integer = lex_is_integer (lexer);
862 bool too_small = (is_integer ? lex_integer (lexer) < min
863 : is_number ? lex_number (lexer) < min
865 bool too_big = (is_integer ? lex_integer (lexer) > max
866 : is_number ? lex_number (lexer) > max
868 if (is_integer && !too_small && !too_big)
873 /* Weird, maybe a bug in the caller. Just report that we needed an
876 lex_error (lexer, _("Integer expected for %s."), name);
878 lex_error (lexer, _("Integer expected."));
883 lex_error (lexer, _("Expected %ld for %s."), min, name);
885 lex_error (lexer, _("Expected %ld."), min);
887 else if (min + 1 == max)
890 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
892 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
896 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
897 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
899 if (report_lower_bound && report_upper_bound)
903 _("Expected integer between %ld and %ld for %s."),
906 lex_error (lexer, _("Expected integer between %ld and %ld."),
909 else if (report_lower_bound)
914 lex_error (lexer, _("Expected non-negative integer for %s."),
917 lex_error (lexer, _("Expected non-negative integer."));
922 lex_error (lexer, _("Expected positive integer for %s."),
925 lex_error (lexer, _("Expected positive integer."));
930 lex_error (lexer, _("Expected integer %ld or greater for %s."),
933 lex_error (lexer, _("Expected integer %ld or greater."), min);
936 else if (report_upper_bound)
940 _("Expected integer less than or equal to %ld for %s."),
943 lex_error (lexer, _("Expected integer less than or equal to %ld."),
949 lex_error (lexer, _("Integer expected for %s."), name);
951 lex_error (lexer, _("Integer expected."));
957 /* If the current token is a number, does nothing and returns true.
958 Otherwise, reports an error and returns false. */
960 lex_force_num (struct lexer *lexer)
962 if (lex_is_number (lexer))
965 lex_error (lexer, _("expecting number"));
969 /* If the current token is a number in the closed range [MIN,MAX], does
970 nothing and returns true. Otherwise, reports an error and returns false.
971 If NAME is nonnull, then it is used in the error message. */
973 lex_force_num_range_closed (struct lexer *lexer, const char *name,
974 double min, double max)
976 bool is_number = lex_is_number (lexer);
977 bool too_small = is_number && lex_number (lexer) < min;
978 bool too_big = is_number && lex_number (lexer) > max;
979 if (is_number && !too_small && !too_big)
984 /* Weird, maybe a bug in the caller. Just report that we needed an
987 lex_error (lexer, _("Number expected for %s."), name);
989 lex_error (lexer, _("Number expected."));
994 lex_error (lexer, _("Expected %g for %s."), min, name);
996 lex_error (lexer, _("Expected %g."), min);
1000 bool report_lower_bound = min > -DBL_MAX || too_small;
1001 bool report_upper_bound = max < DBL_MAX || too_big;
1003 if (report_lower_bound && report_upper_bound)
1007 _("Expected number between %g and %g for %s."),
1010 lex_error (lexer, _("Expected number between %g and %g."),
1013 else if (report_lower_bound)
1018 lex_error (lexer, _("Expected non-negative number for %s."),
1021 lex_error (lexer, _("Expected non-negative number."));
1026 lex_error (lexer, _("Expected number %g or greater for %s."),
1029 lex_error (lexer, _("Expected number %g or greater."), min);
1032 else if (report_upper_bound)
1036 _("Expected number less than or equal to %g for %s."),
1039 lex_error (lexer, _("Expected number less than or equal to %g."),
1045 lex_error (lexer, _("Number expected for %s."), name);
1047 lex_error (lexer, _("Number expected."));
1053 /* If the current token is a number in the half-open range [MIN,MAX), does
1054 nothing and returns true. Otherwise, reports an error and returns false.
1055 If NAME is nonnull, then it is used in the error message. */
1057 lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
1058 double min, double max)
1060 bool is_number = lex_is_number (lexer);
1061 bool too_small = is_number && lex_number (lexer) < min;
1062 bool too_big = is_number && lex_number (lexer) >= max;
1063 if (is_number && !too_small && !too_big)
1068 /* Weird, maybe a bug in the caller. Just report that we needed an
1071 lex_error (lexer, _("Number expected for %s."), name);
1073 lex_error (lexer, _("Number expected."));
1077 bool report_lower_bound = min > -DBL_MAX || too_small;
1078 bool report_upper_bound = max < DBL_MAX || too_big;
1080 if (report_lower_bound && report_upper_bound)
1083 lex_error (lexer, _("Expected number in [%g,%g) for %s."),
1086 lex_error (lexer, _("Expected number in [%g,%g)."),
1089 else if (report_lower_bound)
1094 lex_error (lexer, _("Expected non-negative number for %s."),
1097 lex_error (lexer, _("Expected non-negative number."));
1102 lex_error (lexer, _("Expected number %g or greater for %s."),
1105 lex_error (lexer, _("Expected number %g or greater."), min);
1108 else if (report_upper_bound)
1112 _("Expected number less than %g for %s."), max, name);
1114 lex_error (lexer, _("Expected number less than %g."), max);
1119 lex_error (lexer, _("Number expected for %s."), name);
1121 lex_error (lexer, _("Number expected."));
1127 /* If the current token is a number in the open range (MIN,MAX), does
1128 nothing and returns true. Otherwise, reports an error and returns false.
1129 If NAME is nonnull, then it is used in the error message. */
1131 lex_force_num_range_open (struct lexer *lexer, const char *name,
1132 double min, double max)
1134 bool is_number = lex_is_number (lexer);
1135 bool too_small = is_number && lex_number (lexer) <= min;
1136 bool too_big = is_number && lex_number (lexer) >= max;
1137 if (is_number && !too_small && !too_big)
1142 /* Weird, maybe a bug in the caller. Just report that we needed an
1145 lex_error (lexer, _("Number expected for %s."), name);
1147 lex_error (lexer, _("Number expected."));
1151 bool report_lower_bound = min > -DBL_MAX || too_small;
1152 bool report_upper_bound = max < DBL_MAX || too_big;
1154 if (report_lower_bound && report_upper_bound)
1157 lex_error (lexer, _("Expected number in (%g,%g) for %s."),
1160 lex_error (lexer, _("Expected number in (%g,%g)."), min, max);
1162 else if (report_lower_bound)
1167 lex_error (lexer, _("Expected positive number for %s."), name);
1169 lex_error (lexer, _("Expected positive number."));
1174 lex_error (lexer, _("Expected number greater than %g for %s."),
1177 lex_error (lexer, _("Expected number greater than %g."), min);
1180 else if (report_upper_bound)
1183 lex_error (lexer, _("Expected number less than %g for %s."),
1186 lex_error (lexer, _("Expected number less than %g."), max);
1191 lex_error (lexer, _("Number expected for %s."), name);
1193 lex_error (lexer, _("Number expected."));
1199 /* If the current token is an identifier, does nothing and returns true.
1200 Otherwise, reports an error and returns false. */
1202 lex_force_id (struct lexer *lexer)
1204 if (lex_token (lexer) == T_ID)
1207 lex_error (lexer, _("expecting identifier"));
1211 /* Token accessors. */
1213 /* Returns the type of LEXER's current token. */
1215 lex_token (const struct lexer *lexer)
1217 return lex_next_token (lexer, 0);
1220 /* Returns the number in LEXER's current token.
1222 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1223 tokens this function will always return zero. */
1225 lex_tokval (const struct lexer *lexer)
1227 return lex_next_tokval (lexer, 0);
1230 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1232 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1233 this function will always return NULL.
1235 The UTF-8 encoding of the returned string is correct for variable names and
1236 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1237 data_in() to use it in a "union value". */
1239 lex_tokcstr (const struct lexer *lexer)
1241 return lex_next_tokcstr (lexer, 0);
1244 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1245 null-terminated (but the null terminator is not included in the returned
1246 substring's 'length').
1248 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1249 this function will always return NULL.
1251 The UTF-8 encoding of the returned string is correct for variable names and
1252 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1253 data_in() to use it in a "union value". */
1255 lex_tokss (const struct lexer *lexer)
1257 return lex_next_tokss (lexer, 0);
1262 A value of 0 for N as an argument to any of these functions refers to the
1263 current token. Lookahead is limited to the current command. Any N greater
1264 than the number of tokens remaining in the current command will be treated
1265 as referring to a T_ENDCMD token. */
1267 static const struct lex_token *
1268 lex_next__ (const struct lexer *lexer_, int n)
1270 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1271 struct lex_source *src = lex_source__ (lexer);
1274 return lex_source_next__ (src, n);
1277 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1282 static const struct lex_token *
1283 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1285 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1289 static const struct lex_token endcmd_token
1290 = { .token = { .type = T_ENDCMD } };
1291 return &endcmd_token;
1294 while (ofs >= src->n_parse)
1296 if (src->n_parse > 0)
1298 const struct lex_token *t = src->parse[src->n_parse - 1];
1299 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1303 lex_source_get_parse (src);
1306 return src->parse[ofs];
1309 static const struct lex_token *
1310 lex_source_next__ (const struct lex_source *src, int n)
1312 return lex_source_ofs__ (src, n + src->parse_ofs);
1315 /* Returns the "struct token" of the token N after the current one in LEXER.
1316 The returned pointer can be invalidated by pretty much any succeeding call
1317 into the lexer, although the string pointer within the returned token is
1318 only invalidated by consuming the token (e.g. with lex_get()). */
1319 const struct token *
1320 lex_next (const struct lexer *lexer, int n)
1322 return &lex_next__ (lexer, n)->token;
1325 /* Returns the type of the token N after the current one in LEXER. */
1327 lex_next_token (const struct lexer *lexer, int n)
1329 return lex_next (lexer, n)->type;
1332 /* Returns the number in the token N after the current one in LEXER.
1334 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1335 tokens this function will always return zero. */
1337 lex_next_tokval (const struct lexer *lexer, int n)
1339 return token_number (lex_next (lexer, n));
1342 /* Returns the null-terminated string in the token N after the current one, in
1345 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1346 this functions this function will always return NULL.
1348 The UTF-8 encoding of the returned string is correct for variable names and
1349 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1350 data_in() to use it in a "union value". */
1352 lex_next_tokcstr (const struct lexer *lexer, int n)
1354 return lex_next_tokss (lexer, n).string;
1357 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1358 The string is null-terminated (but the null terminator is not included in
1359 the returned substring's 'length').
1361 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1362 tokens this functions this function will always return NULL.
1364 The UTF-8 encoding of the returned string is correct for variable names and
1365 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1366 data_in() to use it in a "union value". */
1368 lex_next_tokss (const struct lexer *lexer, int n)
1370 return lex_next (lexer, n)->string;
1373 /* Returns the offset of the current token within the command being parsed in
1374 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1375 on. The return value is useful later for referring to this token in calls
1378 lex_ofs (const struct lexer *lexer)
1380 struct lex_source *src = lex_source__ (lexer);
1381 return src ? src->parse_ofs : 0;
1384 /* Returns the token within LEXER's current command with offset OFS. Use
1385 lex_ofs() to find out the offset of the current token. */
1386 const struct token *
1387 lex_ofs_token (const struct lexer *lexer_, int ofs)
1389 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1390 struct lex_source *src = lex_source__ (lexer);
1393 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1396 static const struct token stop_token = { .type = T_STOP };
/* Allocates and returns a new struct msg_location that spans tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in
   LEXER.  See lex_ofs() for an explanation of token offsets.

   The caller owns and must eventually free the returned object. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() wants positions relative to the current token. */
  int current = lex_ofs (lexer);
  int n0 = ofs0 - current;
  int n1 = ofs1 - current;
  return lex_get_location (lexer, n0, n1);
}
1413 /* Returns a msg_point for the first character in the token with offset OFS,
1414 where offset 0 is the first token in the command currently being parsed, 1
1415 the second token, and so on. These are absolute offsets, not relative to
1416 the token currently being parsed within the command.
1418 Returns zeros for a T_STOP token.
1421 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1423 const struct lex_source *src = lex_source__ (lexer);
1425 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1426 : (struct msg_point) { 0, 0 });
1429 /* Returns a msg_point for the last character, inclusive, in the token with
1430 offset OFS, where offset 0 is the first token in the command currently being
1431 parsed, 1 the second token, and so on. These are absolute offsets, not
1432 relative to the token currently being parsed within the command.
1434 Returns zeros for a T_STOP token.
1436 Most of the time, a single token is wholly within a single line of syntax,
1437 so that the start and end point for a given offset have the same line
1438 number. There are two exceptions: a T_STRING token can be made up of
1439 multiple segments on adjacent lines connected with "+" punctuators, and a
1440 T_NEG_NUM token can consist of a "-" on one line followed by the number on
1444 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1446 const struct lex_source *src = lex_source__ (lexer);
1448 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1449 : (struct msg_point) { 0, 0 });
1452 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1453 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1454 are both zero, this requests the syntax for the current token.)
1456 The caller must eventually free the returned string (with free()). The
1457 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1458 that, for example, it may include comments, spaces, and new-lines if it
1459 spans multiple tokens. Macro expansion, however, has already been
1462 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1464 const struct lex_source *src = lex_source__ (lexer);
1466 ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
/* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
   inclusive.  (For example, if OFS0 and OFS1 are both zero, this requests the
   syntax for the first token in the current command.)

   The caller must eventually free the returned string (with free()).  The
   syntax is encoded in UTF-8 and in the original form supplied to the lexer so
   that, for example, it may include comments, spaces, and new-lines if it
   spans multiple tokens.  Macro expansion, however, has already been
   performed. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (!src)
    return xstrdup ("");

  return lex_source_syntax__ (src, ofs0, ofs1);
}
1487 /* Returns true if the token N ahead of the current one was produced by macro
1488 expansion, false otherwise. */
1490 lex_next_is_from_macro (const struct lexer *lexer, int n)
1492 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Reports whether token ACTUAL matches token EXPECTED.  Tokens of different
   types never match.  For tokens of the same type, the visible branches
   compare the numeric values for equality, compare the strings with
   lex_id_match(), or compare the strings byte for byte.

   NOTE(review): the return type, the braces, and the "case" labels that select
   among the three comparisons are not visible in this extract, so which token
   types reach which comparison (and what happens for all other types) must be
   confirmed against the complete file. */
1496 lex_tokens_match (const struct token *actual, const struct token *expected)
1498 if (actual->type != expected->type)
1501 switch (actual->type)
/* Numeric comparison. */
1505 return actual->number == expected->number;
/* Comparison via lex_id_match(), with EXPECTED's string passed first. */
1508 return lex_id_match (expected->string, actual->string);
/* Exact comparison: same length and same bytes. */
1511 return (actual->string.length == expected->string.length
1512 && !memcmp (actual->string.string, expected->string.string,
1513 actual->string.length));
/* Tokenizes S with a string_lexer and compares each resulting token, in
   order, against successive lookahead tokens in LEXER using
   lex_tokens_match().

   NOTE(review): the declarations of 'token' and 'i' and the return statements
   are not visible in this extract.  The callers in this file treat a result
   greater than zero as "the phrase matches" and pass that result to
   lex_get_n() as the number of tokens to skip. */
1521 lex_at_phrase__ (struct lexer *lexer, const char *s)
1523 struct string_lexer slex;
1527 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1528 while (string_lexer_next (&slex, &token))
/* Compare against the next lookahead token, then release the token obtained
   from S whether or not it matched. */
1530 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1531 token_uninit (&token);
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_matched = lex_at_phrase__ (lexer, s);
  return n_matched > 0;
}
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips it and returns true.  Otherwise, returns false and consumes nothing.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1565 /* Returns the 1-based line number of the source text at the byte OFFSET in
   SRC.

   The visible code searches SRC's 'lines' array by repeatedly probing the
   midpoint MID of the range LO...HI, where HI starts out as the number of
   entries.  When MID is the last entry, the result is 'n_lines'; otherwise
   OFFSET is compared against the entries at MID and MID + 1.

   NOTE(review): the initialization of LO, the loop header, the statements that
   narrow LO and HI, and the final return are not visible in this extract;
   confirm them against the complete file. */
1568 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1571 size_t hi = src->n_lines;
1574 size_t mid = (lo + hi) / 2;
/* MID is the final entry, so OFFSET is attributed to the last line. */
1575 if (mid + 1 >= src->n_lines)
1576 return src->n_lines;
/* OFFSET is at or beyond the position recorded in the entry after MID. */
1577 else if (offset >= src->lines[mid + 1])
/* OFFSET is before the position recorded in entry MID. */
1579 else if (offset < src->lines[mid])
1586 /* Returns the 1-based column number of the source text at the byte OFFSET in
1589 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1591 const char *newline = memrchr (src->buffer, '\n', offset);
1592 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1593 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1596 static struct msg_point
1597 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1599 return (struct msg_point) {
1600 .line = lex_source_ofs_to_line_number (src, offset),
1601 .column = lex_source_ofs_to_column_number (src, offset),
1605 static struct msg_point
1606 lex_token_start_point (const struct lex_source *src,
1607 const struct lex_token *token)
1609 return lex_source_ofs_to_point__ (src, token->token_pos);
1612 static struct msg_point
1613 lex_token_end_point (const struct lex_source *src,
1614 const struct lex_token *token)
1616 return lex_source_ofs_to_point__ (src, lex_token_end (token));
1619 static struct msg_location
1620 lex_token_location (const struct lex_source *src,
1621 const struct lex_token *t0,
1622 const struct lex_token *t1)
1624 return (struct msg_location) {
1625 .file_name = intern_new_if_nonnull (src->reader->file_name),
1626 .start = lex_token_start_point (src, t0),
1627 .end = lex_token_end_point (src, t1),
1631 static struct msg_location *
1632 lex_token_location_rw (const struct lex_source *src,
1633 const struct lex_token *t0,
1634 const struct lex_token *t1)
1636 struct msg_location location = lex_token_location (src, t0, t1);
1637 return msg_location_dup (&location);
/* Returns a newly allocated msg_location spanning the tokens N0 through N1
   positions ahead of the current token in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1648 /* Returns the name of the syntax file from which the current command is drawn.
1649 Returns NULL for a T_STOP token or if the command's source does not have
1652 There is no version of this function that takes an N argument because
1653 lookahead only works to the end of a command and any given command is always
1654 within a single syntax file. */
1656 lex_get_file_name (const struct lexer *lexer)
1658 struct lex_source *src = lex_source__ (lexer);
1659 return src == NULL ? NULL : src->reader->file_name;
1662 /* Returns a newly allocated msg_location for the syntax that represents tokens
1663 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1664 must eventually free the location (with msg_location_destroy()). */
1665 struct msg_location *
1666 lex_get_location (const struct lexer *lexer, int n0, int n1)
1668 struct msg_location *loc = xmalloc (sizeof *loc);
1669 *loc = (struct msg_location) {
1670 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1671 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1672 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1673 .src = lex_source__ (lexer),
1675 lex_source_ref (loc->src);
1680 lex_get_encoding (const struct lexer *lexer)
1682 struct lex_source *src = lex_source__ (lexer);
1683 return src == NULL ? NULL : src->reader->encoding;
1686 /* Returns the syntax mode for the syntax file from which the current drawn is
1687 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1688 does not have line numbers.
1690 There is no version of this function that takes an N argument because
1691 lookahead only works to the end of a command and any given command is always
1692 within a single syntax file. */
1694 lex_get_syntax_mode (const struct lexer *lexer)
1696 struct lex_source *src = lex_source__ (lexer);
1697 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1700 /* Returns the error mode for the syntax file from which the current drawn is
1701 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1702 source does not have line numbers.
1704 There is no version of this function that takes an N argument because
1705 lookahead only works to the end of a command and any given command is always
1706 within a single syntax file. */
1708 lex_get_error_mode (const struct lexer *lexer)
1710 struct lex_source *src = lex_source__ (lexer);
1711 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1714 /* If the source that LEXER is currently reading has error mode
1715 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1716 token to be read comes directly from whatever is next read from the stream.
1718 It makes sense to call this function after encountering an error in a
1719 command entered on the console, because usually the user would prefer not to
1720 have cascading errors. */
/* NOTE(review): some of the statements that reset SRC (between the visible
   ones) are missing from this extract; confirm against the complete file. */
1722 lex_interactive_reset (struct lexer *lexer)
1724 struct lex_source *src = lex_source__ (lexer);
/* Nothing happens when LEXER has no current source or when that source has
   any other error mode. */
1725 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Rewind the journaling and segmentation positions within the buffer. */
1728 src->journal_pos = src->seg_pos = 0;
1730 src->suppress_next_newline = false;
/* Start segmentation afresh, keeping the current segmenter's mode. */
1731 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Drop the tokens held in every stage, then restore the single T_ENDCMD token
   that lex_source_push_endcmd__() requires an empty parse array for. */
1733 lex_stage_clear (&src->pp);
1734 lex_stage_clear (&src->merge);
1735 lex_source_clear_parse (src);
1736 lex_source_push_endcmd__ (src);
1740 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1742 lex_discard_rest_of_command (struct lexer *lexer)
1744 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1748 /* Discards all lookahead tokens in LEXER, then discards all input sources
1749 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1750 runs out of input sources. */
1752 lex_discard_noninteractive (struct lexer *lexer)
1754 struct lex_source *src = lex_source__ (lexer);
1758 lex_stage_clear (&src->pp);
1759 lex_stage_clear (&src->merge);
1760 lex_source_clear_parse (src);
1762 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1763 src = lex_source__ (lexer))
1765 ll_remove (&src->ll);
1766 lex_source_unref (src);
1772 lex_source_expand__ (struct lex_source *src)
1774 if (src->length >= src->allocated)
1775 src->buffer = x2realloc (src->buffer, &src->allocated);
/* Reads more input from SRC's reader into SRC's buffer, after the 'length'
   bytes already there.  The closing "while" condition repeats the read until a
   new-line appears in the buffer at or after 'seg_pos'.

   NOTE(review): the handling of a read that yields no data is only partly
   visible in this extract (just the assignment that marks the reader as at end
   of file), and the statement that presumably advances 'length' is not visible
   at all; confirm both against the complete file. */
1779 lex_source_read__ (struct lex_source *src)
/* Grow the buffer first if it is already full. */
1783 lex_source_expand__ (src);
/* Offer the reader all of the unused space, along with the prompt style that
   the segmenter currently reports. */
1785 size_t space = src->allocated - src->length;
1786 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1787 size_t n = src->reader->class->read (src->reader,
1788 &src->buffer[src->length],
/* The reader must not report more bytes than it was offered. */
1790 assert (n <= space);
1795 src->reader->eof = true;
1801 while (!memchr (&src->buffer[src->seg_pos], '\n',
1802 src->length - src->seg_pos));
1805 static struct lex_source *
1806 lex_source__ (const struct lexer *lexer)
1808 return (ll_is_empty (&lexer->sources) ? NULL
1809 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1812 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1813 OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are
1814 both zero, this requests the syntax for the first token in the current
1815 command.) The caller must eventually free the returned string (with
1816 free()). The syntax is encoded in UTF-8 and in the original form supplied
1817 to the lexer so that, for example, it may include comments, spaces, and
1818 new-lines if it spans multiple tokens. Macro expansion, however, has
1819 already been performed. */
1821 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1823 struct string s = DS_EMPTY_INITIALIZER;
1824 for (size_t i = ofs0; i <= ofs1; )
1826 /* Find [I,J) as the longest sequence of tokens not produced by macro
1827 expansion, or otherwise the longest sequence expanded from a single
1829 const struct lex_token *first = lex_source_ofs__ (src, i);
1831 for (j = i + 1; j <= ofs1; j++)
1833 const struct lex_token *cur = lex_source_ofs__ (src, j);
1834 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1835 || first->macro_rep != cur->macro_rep)
1838 const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1840 /* Now add the syntax for this sequence of tokens to SRC. */
1841 if (!ds_is_empty (&s))
1842 ds_put_byte (&s, ' ');
1843 if (!first->macro_rep)
1845 size_t start = first->token_pos;
1846 size_t end = last->token_pos + last->token_len;
1847 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1851 size_t start = first->ofs;
1852 size_t end = last->ofs + last->len;
1853 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1859 return ds_steal_cstr (&s);
1863 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1865 for (size_t i = n0; i <= n1; i++)
1866 if (lex_source_next__ (src, i)->macro_rep)
1871 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1872 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1873 other tokens included in that range. The syntax is encoded in UTF-8 and in
1874 the original form supplied to the lexer so that, for example, it may include
1875 comments, spaces, and new-lines if it spans multiple tokens.
1877 Returns an empty string if the token range doesn't include a macro call.
1879 The caller must not modify or free the returned string. */
1880 static struct substring
1881 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1883 if (!lex_source_contains_macro_call (src, n0, n1))
1886 const struct lex_token *token0 = lex_source_next__ (src, n0);
1887 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1888 size_t start = token0->token_pos;
1889 size_t end = token1->token_pos + token1->token_len;
1891 return ss_buffer (&src->buffer[start], end - start);
/* Composes a syntax error message about tokens N0 through N1 ahead of the
   current token in SRC.  The message starts with a description of where the
   error is, continues with ": " followed by FORMAT expanded with ARGS, and is
   given a trailing period if it does not already end in one.  The visible code
   then fills in a struct msg with category MSG_C_SYNTAX, severity MSG_S_ERROR,
   and a location that covers the tokens.

   NOTE(review): several lines are missing from this extract, including the
   declarations of 's' and the ellipsized buffers, the conditions that choose
   among the alternative message prefixes, and whatever is finally done with
   the message; confirm against the complete file. */
1895 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1896 const char *format, va_list args)
1898 const struct lex_token *token;
/* An error at the end-of-command token gets a fixed description. */
1903 token = lex_source_next__ (src, n0);
1904 if (token->token.type == T_ENDCMD)
1905 ds_put_cstr (&s, _("Syntax error at end of command"));
1908 /* Get the syntax that caused the error. */
1909 char *raw_syntax = lex_source_syntax__ (src, n0 + src->parse_ofs,
1910 n1 + src->parse_ofs);
1912 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1915 /* Get the macro call(s) that expanded to the syntax that caused the
   error. */
1918 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1925 _("Syntax error at `%s' (in expansion of `%s')"),
1928 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1933 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1936 ds_put_cstr (&s, _("Syntax error"));
/* Append the caller's own explanation. */
1942 ds_put_cstr (&s, ": ");
1943 ds_put_vformat (&s, format, args);
/* Make sure that the message ends in a period. */
1945 if (ds_last (&s) != '.')
1946 ds_put_byte (&s, '.');
1948 struct msg *m = xmalloc (sizeof *m);
1950 .category = MSG_C_SYNTAX,
1951 .severity = MSG_S_ERROR,
1952 .location = lex_source_get_location (src, n0, n1),
1953 .text = ds_steal_cstr (&s),
/* Composes a syntax error message for TOKEN in SRC.  The message has the form
   "Syntax error at `SYNTAX': TEXT", where SYNTAX is TOKEN's source text,
   shortened with str_ellipsize(), and TEXT is the string carried in TOKEN
   itself.  The visible code then fills in a struct msg with category
   MSG_C_SYNTAX, severity MSG_S_ERROR, and a location that covers just TOKEN.

   NOTE(review): the declaration of 'syntax' and whatever is finally done with
   the message are not visible in this extract. */
1959 lex_get_error (struct lex_source *src, const struct lex_token *token)
1962 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1963 syntax, sizeof syntax);
1965 struct string s = DS_EMPTY_INITIALIZER;
1966 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1967 ds_put_format (&s, ": %s", token->token.string.string);
1969 struct msg *m = xmalloc (sizeof *m);
1971 .category = MSG_C_SYNTAX,
1972 .severity = MSG_S_ERROR,
1973 .location = lex_token_location_rw (src, token, token),
1974 .text = ds_steal_cstr (&s),
1979 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1980 underlying lex_reader if necessary. Returns true if a new token was added
1981 to SRC's deque, false otherwise. The caller should retry failures unless
1982 SRC's 'eof' marker was set to true indicating that there will be no more
1983 tokens from this source. */
/* NOTE(review): a number of short lines (braces, loop headers, the statements
   that adjust 'n_lines' and 'copy_len', the "switch" header, and the return
   statements) are missing from this extract; confirm against the complete
   file. */
1985 lex_source_try_get_pp (struct lex_source *src)
1987 /* Append a new token to SRC and initialize it. */
1988 struct lex_token *token = xmalloc (sizeof *token);
1989 token->token = (struct token) { .type = T_STOP };
1990 token->macro_rep = NULL;
1991 token->ref_cnt = NULL;
1992 token->token_pos = src->seg_pos;
1994 /* Extract a segment. */
1995 const char *segment;
1996 enum segment_type seg_type;
2000 segment = &src->buffer[src->seg_pos];
2001 seg_len = segmenter_push (&src->segmenter, segment,
2002 src->length - src->seg_pos,
2003 src->reader->eof, &seg_type);
2007 /* The segmenter needs more input to produce a segment. */
2008 assert (!src->reader->eof);
2009 lex_source_read__ (src);
2012 /* Update state based on the segment. */
2013 token->token_len = seg_len;
2014 src->seg_pos += seg_len;
/* After a new-line segment, record the buffer position just past it in the
   'lines' array, growing the array first if it is full. */
2015 if (seg_type == SEG_NEWLINE)
2017 if (src->n_lines >= src->allocated_lines)
2018 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2019 sizeof *src->lines);
2020 src->lines[src->n_lines++] = src->seg_pos;
2023 /* Get a token from the segment. */
2024 enum tokenize_result result = token_from_segment (
2025 seg_type, ss_buffer (segment, seg_len), &token->token);
2027 /* If we've reached the end of a line, or the end of a command, then pass
2028 the line to the output engine as a syntax text item. */
2029 int n_lines = seg_type == SEG_NEWLINE;
2030 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2033 src->suppress_next_newline = true;
2035 else if (n_lines > 0 && src->suppress_next_newline)
2038 src->suppress_next_newline = false;
2040 for (int i = 0; i < n_lines; i++)
2042 /* Beginning of line. */
2043 const char *line = &src->buffer[src->journal_pos];
2045 /* Calculate line length, including \n or \r\n end-of-line if present.
2047 We use src->length even though that may be beyond what we've actually
2048 converted to tokens. That's because, if we're emitting the line due
2049 to SEG_END_COMMAND, we want to take the whole line through the
2050 newline, not just through the '.'. */
2051 size_t max_len = src->length - src->journal_pos;
2052 const char *newline = memchr (line, '\n', max_len);
2053 size_t line_len = newline ? newline - line + 1 : max_len;
2055 /* Calculate line length excluding end-of-line. */
2056 size_t copy_len = line_len;
2057 if (copy_len > 0 && line[copy_len - 1] == '\n')
2059 if (copy_len > 0 && line[copy_len - 1] == '\r')
2062 /* Submit the line as syntax. */
2063 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2064 xmemdup0 (line, copy_len),
/* Advance past the whole line, including its end-of-line. */
2067 src->journal_pos += line_len;
/* Dispose of the token according to the tokenization result. */
2072 case TOKENIZE_ERROR:
2073 lex_get_error (src, token);
2075 case TOKENIZE_EMPTY:
2076 lex_token_destroy (token);
2079 case TOKENIZE_TOKEN:
/* A T_STOP token is turned into T_ENDCMD before it is queued. */
2080 if (token->token.type == T_STOP)
2082 token->token.type = T_ENDCMD;
2085 lex_stage_push_last (&src->pp, token);
2091 /* Attempts to append a new token to SRC. Returns true if successful, false on
2092 failure. On failure, the end of SRC has been reached and no more tokens
2093 will be forthcoming from it.
2095 Does not make the new token available for lookahead yet; the caller must
2096 adjust SRC's 'middle' pointer to do so. */
/* NOTE(review): only the call to lex_source_try_get_pp() is visible in this
   extract; the loop around it and the return statements are missing, so the
   retry behavior must be confirmed against the complete file. */
2098 lex_source_get_pp (struct lex_source *src)
2101 if (lex_source_try_get_pp (src))
/* Attempts to move tokens from the 'pp' stage of SRC_ into its 'merge' stage,
   expanding a macro call at the head of 'pp' if there is one.  The final
   statement returns true if a macro call was expanded into at least one token.

   NOTE(review): several short lines are missing from this extract, including
   the other return statements, some conditions, and some initializer fields;
   confirm against the complete file. */
2107 lex_source_try_get_merge (const struct lex_source *src_)
2109 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Make sure that 'pp' holds at least one token. */
2111 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
/* With macro expansion turned off, everything in 'pp' moves directly to
   'merge'. */
2114 if (!settings_get_mexpand ())
2116 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2120 /* Now pass tokens one-by-one to the macro expander.
2122 In the common case where there is no macro to expand, the loop is not
   entered. */
2124 struct macro_call *mc;
2125 int n_call = macro_call_create (src->lexer->macros,
2126 &lex_stage_first (&src->pp)->token, &mc);
/* Keep feeding tokens while the result is still zero. */
2127 for (int ofs = 1; !n_call; ofs++)
2129 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2131 /* This should not be reachable because we always get a T_ENDCMD at
2132 the end of an input file (transformed from T_STOP by
2133 lex_source_try_get_pp()) and the macro_expander should always
2134 terminate expansion on T_ENDCMD. */
2138 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2139 const struct macro_token mt = {
2141 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2143 const struct msg_location loc = lex_token_location (src, t, t);
2144 n_call = macro_call_add (mc, &mt, &loc);
2148 /* False alarm: no macro expansion after all. Use first token as
2149 lookahead. We'll retry macro expansion from the second token next
   time around. */
2151 macro_call_destroy (mc);
2152 lex_stage_shift (&src->merge, &src->pp, 1);
2156 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2157 are a macro call. (These are likely to be the only tokens in 'pp'.)
   */
2159 const struct lex_token *c0 = lex_stage_first (&src->pp);
2160 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2161 struct macro_tokens expansion = { .n = 0 };
2162 struct msg_location loc = lex_token_location (src, c0, c1);
2163 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2164 macro_call_destroy (mc);
2166 /* Convert the macro expansion into syntax for possible error messages
   later. */
2168 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2169 size_t *len = xnmalloc (expansion.n, sizeof *len);
2170 struct string s = DS_EMPTY_INITIALIZER;
2171 macro_tokens_to_syntax (&expansion, &s, ofs, len);
/* Log the expansion if the MPRINT setting asks for it. */
2173 if (settings_get_mprint ())
2174 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2175 _("Macro Expansion")));
2177 /* Append the macro expansion tokens to the lookahead. */
2178 if (expansion.n > 0)
/* All of the expansion's tokens share one copy of the expansion text and one
   reference count, which starts out as the number of tokens. */
2180 char *macro_rep = ds_steal_cstr (&s);
2181 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2182 *ref_cnt = expansion.n;
2183 for (size_t i = 0; i < expansion.n; i++)
2185 struct lex_token *token = xmalloc (sizeof *token);
2186 *token = (struct lex_token) {
2187 .token = expansion.mts[i].token,
/* Each expanded token's source position covers the entire macro call. */
2188 .token_pos = c0->token_pos,
2189 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2190 .macro_rep = macro_rep,
2195 lex_stage_push_last (&src->merge, token);
2197 ss_dealloc (&expansion.mts[i].syntax);
2202 free (expansion.mts);
2206 /* Destroy the tokens for the call. */
2207 for (size_t i = 0; i < n_call; i++)
2208 lex_stage_pop_first (&src->pp);
2210 return expansion.n > 0;
2213 /* Attempts to obtain at least one new token into 'merge' in SRC.
2215 Returns true if successful, false on failure. In the latter case, SRC is
2216 exhausted and 'src->eof' is now true. */
/* NOTE(review): only the call to lex_source_try_get_merge() is visible in
   this extract; the loop around it and the return statements are missing, so
   the retry behavior must be confirmed against the complete file. */
2218 lex_source_get_merge (struct lex_source *src)
2221 if (lex_source_try_get_merge (src))
2226 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2228 Returns true if successful, false on failure. In the latter case, SRC is
2229 exhausted and 'src->eof' is now true. */
/* NOTE(review): several short lines are missing from this extract, including
   the return statements, the remaining arguments to merger_add(), and the
   condition that guards the first lex_source_push_parse() call; confirm
   against the complete file. */
2231 lex_source_get_parse (struct lex_source *src)
2233 struct merger m = MERGER_INIT;
/* Feed the tokens in 'merge' to the merger one at a time, fetching more into
   'merge' whenever token I is not there yet. */
2235 for (size_t i = 0; ; i++)
2237 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2239 /* We always get a T_ENDCMD at the end of an input file
2240 (transformed from T_STOP by lex_source_try_get_pp()) and
2241 merger_add() should never return -1 on T_ENDCMD. */
2242 assert (lex_stage_is_empty (&src->merge));
2246 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
/* Move the first token in 'merge' to the parse array unchanged. */
2250 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2253 else if (retval > 0)
2255 /* Add a token that merges all the tokens together. */
2256 const struct lex_token *first = lex_stage_first (&src->merge);
2257 const struct lex_token *last = lex_stage_nth (&src->merge,
/* True only if FIRST and LAST come from the same macro expansion. */
2259 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2260 struct lex_token *t = xmalloc (sizeof *t);
2261 *t = (struct lex_token) {
/* The merged token's source text runs from the start of FIRST through the end
   of LAST. */
2263 .token_pos = first->token_pos,
2264 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2266 /* This works well if all the tokens were not expanded from macros,
2267 or if they came from the same macro expansion. It just gives up
2268 in the other (corner) cases. */
2269 .macro_rep = macro ? first->macro_rep : NULL,
2270 .ofs = macro ? first->ofs : 0,
2271 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2272 .ref_cnt = macro ? first->ref_cnt : NULL,
2276 lex_source_push_parse (src, t);
/* Discard the RETVAL tokens that the merged token replaces. */
2278 for (int i = 0; i < retval; i++)
2279 lex_stage_pop_first (&src->merge);
2286 lex_source_push_endcmd__ (struct lex_source *src)
2288 assert (src->n_parse == 0);
2290 struct lex_token *token = xmalloc (sizeof *token);
2291 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2292 lex_source_push_parse (src, token);
2296 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2298 if (src->n_parse >= src->allocated_parse)
2299 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2300 sizeof *src->parse);
2301 src->parse[src->n_parse++] = token;
2305 lex_source_clear_parse (struct lex_source *src)
2307 for (size_t i = 0; i < src->n_parse; i++)
2308 lex_token_destroy (src->parse[i]);
2309 src->n_parse = src->parse_ofs = 0;
/* Creates a new lex_source for LEXER that reads from READER.  The visible code
   allocates an initial 'lines' array with room for 4 entries, initializes the
   segmenter from READER's syntax mode, and pushes an initial T_ENDCMD token
   onto the new source's parse array.

   NOTE(review): most of the initializer and the return statement are missing
   from this extract; confirm the remaining fields against the complete
   file. */
2312 static struct lex_source *
2313 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2315 size_t allocated_lines = 4;
2316 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2319 struct lex_source *src = xmalloc (sizeof *src);
2320 *src = (struct lex_source) {
2323 .segmenter = segmenter_init (reader->syntax, false),
2327 .allocated_lines = allocated_lines,
2330 lex_source_push_endcmd__ (src);
/* Installs a message handler, via msg_set_handler(), that delivers messages
   to OUTPUT_MSG and that uses this file's lex_source_ref(),
   lex_source_unref(), and lex_source_get_line() as its lex_source callbacks.

   NOTE(review): part of the parameter list and part of the initializer are
   missing from this extract, so how LEXER is used cannot be seen here. */
2336 lex_set_message_handler (struct lexer *lexer,
2337 void (*output_msg) (const struct msg *,
2340 struct msg_handler msg_handler = {
2341 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2343 .lex_source_ref = lex_source_ref,
2344 .lex_source_unref = lex_source_unref,
2345 .lex_source_get_line = lex_source_get_line,
2347 msg_set_handler (&msg_handler);
/* Takes a reference to SRC_, which lex_source_unref() releases.  The visible
   code asserts that the source already has at least one reference.

   NOTE(review): the increment of 'n_refs' itself, any null check, and the
   return type are not visible in this extract.  lex_get_location() passes a
   pointer that is null when the lexer has no source, so presumably null is
   accepted; confirm against the complete file. */
2351 lex_source_ref (const struct lex_source *src_)
2353 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2356 assert (src->n_refs > 0);
/* Releases a reference to SRC.  If other references remain after the count is
   decremented, the visible code takes the "if" branch; otherwise it goes on to
   destroy the reader through its class's 'destroy' function (if it has one)
   and to release the tokens held in the 'pp', 'merge', and parse stages.

   NOTE(review): the null check, the early return, and the statements that free
   the file name, the encoding, the buffers, and SRC itself are missing from
   this extract; confirm against the complete file. */
2362 lex_source_unref (struct lex_source *src)
2367 assert (src->n_refs > 0);
2368 if (--src->n_refs > 0)
/* Save these pointers before the reader is destroyed. */
2371 char *file_name = src->reader->file_name;
2372 char *encoding = src->reader->encoding;
2373 if (src->reader->class->destroy != NULL)
2374 src->reader->class->destroy (src->reader);
2379 lex_stage_uninit (&src->pp);
2380 lex_stage_uninit (&src->merge);
2381 lex_source_clear_parse (src);
/* A lex_reader that obtains its input from a u8_istream.  The embedded
   'reader' member is what lex_file_reader_cast() converts back from. */
2386 struct lex_file_reader
2388 struct lex_reader reader;
2389 struct u8_istream *istream;
/* Forward declaration of the class for file readers, so that
   lex_reader_for_file() can pass it to lex_reader_init(). */
2392 static struct lex_reader_class lex_file_reader_class;
2394 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2395 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2396 ENCODING, which should take one of the forms accepted by
2397 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2398 mode of the new reader, respectively.
2400 Returns a null pointer if FILE_NAME cannot be opened. */
2402 lex_reader_for_file (const char *file_name, const char *encoding,
2403 enum segmenter_mode syntax,
2404 enum lex_error_mode error)
2406 struct lex_file_reader *r;
2407 struct u8_istream *istream;
2409 istream = (!strcmp(file_name, "-")
2410 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2411 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2412 if (istream == NULL)
2414 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2418 r = xmalloc (sizeof *r);
2419 lex_reader_init (&r->reader, &lex_file_reader_class);
2420 r->reader.syntax = syntax;
2421 r->reader.error = error;
2422 r->reader.file_name = xstrdup (file_name);
2423 r->reader.encoding = xstrdup_if_nonnull (encoding);
2424 r->reader.line_number = 1;
2425 r->istream = istream;
2430 static struct lex_file_reader *
2431 lex_file_reader_cast (struct lex_reader *r)
2433 return UP_CAST (r, struct lex_file_reader, reader);
/* Reads up to N bytes from file reader R_'s stream into BUF, using
   u8_istream_read().  The prompt style is unused.  On the path that reports a
   failure, the message names the file and includes strerror (errno).

   NOTE(review): the condition that detects a failed read and the return
   statements are missing from this extract; confirm against the complete
   file. */
2437 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2438 enum prompt_style prompt_style UNUSED)
2440 struct lex_file_reader *r = lex_file_reader_cast (r_);
2441 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2444 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Closes file reader R_.  A stream on any file descriptor other than stdin is
   closed with u8_istream_close(), and a failure to close it is reported with
   the file's name and strerror (errno).  The visible code also contains a call
   to u8_istream_free().

   NOTE(review): the line that selects the u8_istream_free() call (presumably
   an "else" for the stdin case) and any statement that frees the reader
   itself are missing from this extract; confirm against the complete file. */
2451 lex_file_close (struct lex_reader *r_)
2453 struct lex_file_reader *r = lex_file_reader_cast (r_);
2455 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2457 if (u8_istream_close (r->istream) != 0)
2458 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2461 u8_istream_free (r->istream);
/* The lex_reader_class for file readers.
   NOTE(review): the initializer is missing from this extract; presumably it
   names lex_file_read() and lex_file_close() -- confirm against the complete
   file. */
2466 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that reads from an in-memory string.  Besides the embedded
   generic reader, lex_string_read() accesses members named s (a substring
   with string and length fields) and offset (the read position within s). */
2472 struct lex_string_reader
/* Generic reader; lex_string_reader_cast() maps a pointer to this member back
   to the enclosing structure. */
2474 struct lex_reader reader;
/* Forward declaration of the reader class used for string-backed readers.
   lex_reader_for_substring_nocopy() below takes its address before the
   initialized definition, which appears after lex_string_close(). */
2479 static struct lex_reader_class lex_string_reader_class;
2481 /* Creates and returns a new lex_reader for the contents of S, which must be
2482 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2483 with ss_dealloc() when it is closed. */
2485 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2487 struct lex_string_reader *r;
/* Allocate the reader and attach it to the string reader class. */
2489 r = xmalloc (sizeof *r);
2490 lex_reader_init (&r->reader, &lex_string_reader_class);
/* Unlike lex_reader_for_file(), the syntax mode is not caller-selectable:
   string readers are always created with SEG_MODE_AUTO. */
2491 r->reader.syntax = SEG_MODE_AUTO;
/* ENCODING is duplicated, so the caller keeps ownership of that string.
   NOTE(review): judging by the helper name, a null ENCODING is stored as a
   null pointer -- confirm. */
2492 r->reader.encoding = xstrdup_if_nonnull (encoding);
2499 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2500 which must be encoded in ENCODING. The caller retains ownership of S. */
2502 lex_reader_for_string (const char *s, const char *encoding)
2504 struct substring ss;
/* Make a separate copy of S in SS, so that the reader never refers to the
   storage owned by the caller. */
2505 ss_alloc_substring (&ss, ss_cstr (s));
/* Hand the copy to the no-copy constructor, which is documented as taking
   ownership of the substring it is given. */
2506 return lex_reader_for_substring_nocopy (ss, encoding);
2509 /* Formats FORMAT as a printf()-like format string and creates and returns a
2510 new lex_reader for the formatted result. */
/* ENCODING is passed on as the encoding of the formatted text.  The values
   consumed by FORMAT are the variadic arguments, which come after ENCODING
   rather than directly after FORMAT. */
2512 lex_reader_for_format (const char *format, const char *encoding, ...)
2514 struct lex_reader *r;
/* ENCODING is the last named parameter, so it anchors the argument list. */
2517 va_start (args, encoding);
/* The formatted string is wrapped as a substring and handed to the no-copy
   constructor, which is documented as taking ownership of it.
   NOTE(review): this assumes xvasprintf() returns storage that ss_dealloc()
   may release -- confirm. */
2518 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Given R, a pointer to the reader member embedded in a struct
   lex_string_reader, returns a pointer to the enclosing struct
   lex_string_reader.  The result is only meaningful if R really is embedded in
   such a structure. */
2524 static struct lex_string_reader *
2525 lex_string_reader_cast (struct lex_reader *r)
2527 return UP_CAST (r, struct lex_string_reader, reader);
/* Copies bytes from the string held by the string reader that contains R_
   into BUF, starting at the current offset of the reader.  PROMPT_STYLE is
   marked UNUSED.
   NOTE(review): presumably this is the read callback stored in
   lex_string_reader_class -- confirm against that definition. */
2531 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2532 enum prompt_style prompt_style UNUSED)
/* Recover the string-specific reader from the generic reader pointer. */
2534 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Copy at most N bytes, limited to what remains of the string past the
   current offset; this is 0 once the offset has reached the end. */
2537 chunk = MIN (n, r->s.length - r->offset);
2538 memcpy (buf, r->s.string + r->offset, chunk);
/* Operates on the string reader that contains R_.
   NOTE(review): presumably this is the close callback stored in
   lex_string_reader_class.  The comment on lex_reader_for_substring_nocopy()
   says the string of the reader is freed with ss_dealloc() at close -- confirm
   that this function does so. */
2545 lex_string_close (struct lex_reader *r_)
/* Recover the string-specific reader from the generic reader pointer. */
2547 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Definition of the reader class for string-backed readers, forward-declared
   before lex_reader_for_substring_nocopy().
   NOTE(review): presumably initialized with lex_string_read and
   lex_string_close -- confirm against the initializer. */
2553 static struct lex_reader_class lex_string_reader_class =
/* Returns the text of line number LINE of SRC as a substring of src->buffer.
   LINE is 1-based: src->lines[LINE - 1] is the offset in src->buffer at which
   the line starts.
   NOTE(review): presumably ss_buffer() refers to the bytes in place rather
   than copying them, so the result would be valid only while src->buffer is
   unchanged -- confirm. */
2560 lex_source_get_line (const struct lex_source *src, int line)
/* Line numbers outside 1...src->n_lines are handled separately from the
   lookup below. */
2562 if (line < 1 || line > src->n_lines)
2565 size_t ofs = src->lines[line - 1];
/* A line ends where the next one starts; the last line has no successor in
   src->lines, so it ends at src->length instead. */
2566 size_t end = line >= src->n_lines ? src->length : src->lines[line];
2567 return ss_buffer (&src->buffer[ofs], end - ofs);