1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
83 static struct msg_point lex_token_start_point (const struct lex_source *,
84 const struct lex_token *);
85 static struct msg_point lex_token_end_point (const struct lex_source *,
86 const struct lex_token *);
88 /* Source offset of the last byte in TOKEN. */
90 lex_token_end (const struct lex_token *token)
92 return token->token_pos + MAX (token->token_len, 1) - 1;
96 lex_token_destroy (struct lex_token *t)
98 token_uninit (&t->token);
101 assert (*t->ref_cnt > 0);
111 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
116 struct lex_token **tokens;
119 static void lex_stage_clear (struct lex_stage *);
120 static void lex_stage_uninit (struct lex_stage *);
122 static size_t lex_stage_count (const struct lex_stage *);
123 static bool lex_stage_is_empty (const struct lex_stage *);
125 static struct lex_token *lex_stage_first (struct lex_stage *);
126 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
128 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
129 static void lex_stage_pop_first (struct lex_stage *);
131 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
134 /* Deletes all the tokens from STAGE. */
136 lex_stage_clear (struct lex_stage *stage)
138 while (!deque_is_empty (&stage->deque))
139 lex_stage_pop_first (stage);
142 /* Deletes all the tokens from STAGE and frees storage for the deque. */
144 lex_stage_uninit (struct lex_stage *stage)
146 lex_stage_clear (stage);
147 free (stage->tokens);
150 /* Returns true if STAGE contains no tokens, otherwise false. */
152 lex_stage_is_empty (const struct lex_stage *stage)
154 return deque_is_empty (&stage->deque);
157 /* Returns the number of tokens in STAGE. */
159 lex_stage_count (const struct lex_stage *stage)
161 return deque_count (&stage->deque);
/* Returns the first token in STAGE, which must be nonempty.
   The first token is the one accessed with the least lookahead. */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  return lex_stage_nth (stage, 0);
}
172 /* Returns the token the given INDEX in STAGE. The first token (with the least
173 lookahead) is 0, the second token is 1, and so on. There must be at least
174 INDEX + 1 tokens in STAGE. */
175 static struct lex_token *
176 lex_stage_nth (struct lex_stage *stage, size_t index)
178 return stage->tokens[deque_back (&stage->deque, index)];
181 /* Adds TOKEN so that it becomes the last token in STAGE. */
183 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
185 if (deque_is_full (&stage->deque))
186 stage->tokens = deque_expand (&stage->deque, stage->tokens,
187 sizeof *stage->tokens);
188 stage->tokens[deque_push_front (&stage->deque)] = token;
191 /* Removes and returns the first token from STAGE. */
192 static struct lex_token *
193 lex_stage_take_first (struct lex_stage *stage)
195 return stage->tokens[deque_pop_back (&stage->deque)];
/* Removes the first token from STAGE and destroys it with
   lex_token_destroy(). */
static void
lex_stage_pop_first (struct lex_stage *stage)
{
  lex_token_destroy (lex_stage_take_first (stage));
}
/* Removes the first N tokens from SRC, appending them to DST as the last
   tokens, preserving their order.
   NOTE(review): presumably SRC must hold at least N tokens -- confirm against
   deque_pop_back(). */
static void
lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
{
  for (size_t i = 0; i < n; i++)
    lex_stage_push_last (dst, lex_stage_take_first (src));
}
214 /* A source of tokens, corresponding to a syntax file.
216 This is conceptually a lex_reader wrapped with everything needed to convert
217 its UTF-8 bytes into tokens. */
220 struct ll ll; /* In lexer's list of sources. */
224 - One for struct lexer.
226 - One for each struct msg_location that references this source. */
229 struct lex_reader *reader;
231 struct segmenter segmenter;
232 bool eof; /* True if T_STOP was read from 'reader'. */
234 /* Buffer of UTF-8 bytes. */
235 char *buffer; /* Source file contents. */
236 size_t length; /* Number of bytes filled. */
237 size_t allocated; /* Number of bytes allocated. */
239 /* Offsets into 'buffer'. */
240 size_t journal_pos; /* First byte not yet output to journal. */
241 size_t seg_pos; /* First byte not yet scanned as token. */
243 /* Offset into 'buffer' of starts of lines. */
245 size_t n_lines, allocated_lines;
247 bool suppress_next_newline;
251 This is a pipeline with the following stages. Each token eventually
252 made available to the parser passes through all of these stages. The stages
253 are named after the processing that happens in each one.
255 Initially, tokens come from the segmenter and scanner to 'pp':
257 - pp: Tokens that need to pass through the macro preprocessor to end up
260 - merge: Tokens that need to pass through scan_merge() to end up in
263 - parse: Tokens available to the client for parsing.
265 'pp' and 'merge' store tokens only temporarily until they pass into
266 'parse'. Tokens then live in 'parse' until the command is fully
267 consumed, at which time they are freed together. */
269 struct lex_stage merge;
270 struct lex_token **parse;
271 size_t n_parse, allocated_parse, parse_ofs;
274 static struct lex_source *lex_source_create (struct lexer *,
275 struct lex_reader *);
280 struct ll_list sources; /* Contains "struct lex_source"s. */
281 struct macro_set *macros;
284 static struct lex_source *lex_source__ (const struct lexer *);
285 static char *lex_source_syntax__ (const struct lex_source *,
287 static const struct lex_token *lex_next__ (const struct lexer *, int n);
288 static void lex_source_push_endcmd__ (struct lex_source *);
289 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
290 static void lex_source_clear_parse (struct lex_source *);
292 static bool lex_source_get_parse (struct lex_source *);
293 static void lex_source_error_valist (struct lex_source *, int ofs0, int ofs1,
294 const char *format, va_list)
295 PRINTF_FORMAT (4, 0);
296 static const struct lex_token *lex_source_next__ (const struct lex_source *,
299 /* Initializes READER with the specified CLASS and otherwise some reasonable
300 defaults. The caller should fill in the other members as desired. */
302 lex_reader_init (struct lex_reader *reader,
303 const struct lex_reader_class *class)
305 reader->class = class;
306 reader->syntax = SEG_MODE_AUTO;
307 reader->error = LEX_ERROR_CONTINUE;
308 reader->file_name = NULL;
309 reader->encoding = NULL;
310 reader->line_number = 0;
314 /* Frees any file name already in READER and replaces it by a copy of
315 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
317 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
319 free (reader->file_name);
320 reader->file_name = xstrdup_if_nonnull (file_name);
323 /* Creates and returns a new lexer. */
327 struct lexer *lexer = xmalloc (sizeof *lexer);
328 *lexer = (struct lexer) {
329 .sources = LL_INITIALIZER (lexer->sources),
330 .macros = macro_set_create (),
335 /* Destroys LEXER. */
337 lex_destroy (struct lexer *lexer)
341 struct lex_source *source, *next;
343 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
345 ll_remove (&source->ll);
346 lex_source_unref (source);
348 macro_set_destroy (lexer->macros);
353 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
354 same name. Takes ownership of M. */
356 lex_define_macro (struct lexer *lexer, struct macro *m)
358 macro_set_add (lexer->macros, m);
361 /* Inserts READER into LEXER so that the next token read by LEXER comes from
362 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
365 lex_include (struct lexer *lexer, struct lex_reader *reader)
367 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
368 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
371 /* Appends READER to LEXER, so that it will be read after all other current
372 readers have already been read. */
374 lex_append (struct lexer *lexer, struct lex_reader *reader)
376 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
381 /* Advances LEXER to the next token, consuming the current token. */
383 lex_get (struct lexer *lexer)
385 struct lex_source *src;
387 src = lex_source__ (lexer);
391 if (src->parse_ofs < src->n_parse)
393 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
394 lex_source_clear_parse (src);
399 while (src->parse_ofs == src->n_parse)
400 if (!lex_source_get_parse (src))
402 ll_remove (&src->ll);
403 lex_source_unref (src);
404 src = lex_source__ (lexer);
410 /* Advances LEXER by N tokens. */
412 lex_get_n (struct lexer *lexer, size_t n)
418 /* Issuing errors. */
420 /* Prints a syntax error message containing the current token and
421 given message MESSAGE (if non-null). */
423 lex_error (struct lexer *lexer, const char *format, ...)
427 va_start (args, format);
428 lex_ofs_error_valist (lexer, lex_ofs (lexer), lex_ofs (lexer), format, args);
432 /* Prints a syntax error message containing the current token and
433 given message MESSAGE (if non-null). */
435 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
437 lex_ofs_error_valist (lexer, lex_ofs (lexer), lex_ofs (lexer), format, args);
440 /* Prints a syntax error message for the span of tokens N0 through N1,
441 inclusive, from the current token in LEXER, adding message MESSAGE (if
444 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
448 va_start (args, format);
449 int ofs = lex_ofs (lexer);
450 lex_ofs_error_valist (lexer, n0 + ofs, n1 + ofs, format, args);
454 /* Prints a syntax error message for the span of tokens with offsets OFS0
455 through OFS1, inclusive, within the current command in LEXER, adding message
456 MESSAGE (if non-null). */
458 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
462 va_start (args, format);
463 lex_ofs_error_valist (lexer, ofs0, ofs1, format, args);
467 /* Prints a syntax error message saying that one of the strings provided as
468 varargs, up to the first NULL, is expected. */
470 (lex_error_expecting) (struct lexer *lexer, ...)
474 va_start (args, lexer);
475 lex_error_expecting_valist (lexer, args);
479 /* Prints a syntax error message saying that one of the options provided in
480 ARGS, up to the first NULL, is expected. */
482 lex_error_expecting_valist (struct lexer *lexer, va_list args)
484 enum { MAX_OPTIONS = 9 };
485 const char *options[MAX_OPTIONS];
487 while (n < MAX_OPTIONS)
489 const char *option = va_arg (args, const char *);
493 options[n++] = option;
495 lex_error_expecting_array (lexer, options, n);
499 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
504 lex_error (lexer, NULL);
508 lex_error (lexer, _("expecting %s"), options[0]);
512 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
516 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
521 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
522 options[0], options[1], options[2], options[3]);
526 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
527 options[0], options[1], options[2], options[3], options[4]);
531 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
532 options[0], options[1], options[2], options[3], options[4],
537 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
538 options[0], options[1], options[2], options[3], options[4],
539 options[5], options[6]);
543 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
544 options[0], options[1], options[2], options[3], options[4],
545 options[5], options[6], options[7]);
549 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, %s, or %s"),
550 options[0], options[1], options[2], options[3], options[4],
551 options[5], options[6], options[7], options[8]);
555 lex_error (lexer, NULL);
559 /* Reports an error to the effect that subcommand SBC may only be specified
562 This function does not take a lexer as an argument or use lex_error(),
563 because the result would ordinarily just be redundant: "Syntax error at
564 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
565 not help the user find the error. */
567 lex_sbc_only_once (const char *sbc)
569 msg (SE, _("Subcommand %s may only be specified once."), sbc);
572 /* Reports an error to the effect that subcommand SBC is missing.
574 This function does not take a lexer as an argument or use lex_error(),
575 because a missing subcommand can normally be detected only after the whole
576 command has been parsed, and so lex_error() would always report "Syntax
577 error at end of command", which does not help the user find the error. */
579 lex_sbc_missing (const char *sbc)
581 msg (SE, _("Required subcommand %s was not specified."), sbc);
584 /* Reports an error to the effect that specification SPEC may only be specified
585 once within subcommand SBC. */
587 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
589 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
593 /* Reports an error to the effect that specification SPEC is missing within
596 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
598 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
602 /* Prints a syntax error message for the span of tokens with offsets OFS0
603 through OFS1, inclusive, within the current command in LEXER, adding message
604 MESSAGE (if non-null) with the given ARGS. */
606 lex_ofs_error_valist (struct lexer *lexer, int ofs0, int ofs1,
607 const char *format, va_list args)
609 struct lex_source *src = lex_source__ (lexer);
612 lex_source_error_valist (src, ofs0, ofs1, format, args);
618 ds_put_format (&s, _("Syntax error at end of input"));
621 ds_put_cstr (&s, ": ");
622 ds_put_vformat (&s, format, args);
624 if (ds_last (&s) != '.')
625 ds_put_byte (&s, '.');
626 msg (SE, "%s", ds_cstr (&s));
631 /* Checks that we're at end of command.
632 If so, returns a successful command completion code.
633 If not, flags a syntax error and returns an error command
636 lex_end_of_command (struct lexer *lexer)
638 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
640 lex_error (lexer, _("expecting end of command"));
647 /* Token testing functions. */
649 /* Returns true if the current token is a number. */
651 lex_is_number (const struct lexer *lexer)
653 return lex_next_is_number (lexer, 0);
656 /* Returns true if the current token is a string. */
658 lex_is_string (const struct lexer *lexer)
660 return lex_next_is_string (lexer, 0);
663 /* Returns the value of the current token, which must be a
664 floating point number. */
666 lex_number (const struct lexer *lexer)
668 return lex_next_number (lexer, 0);
671 /* Returns true iff the current token is an integer. */
673 lex_is_integer (const struct lexer *lexer)
675 return lex_next_is_integer (lexer, 0);
678 /* Returns the value of the current token, which must be an
681 lex_integer (const struct lexer *lexer)
683 return lex_next_integer (lexer, 0);
686 /* Token testing functions with lookahead.
688 A value of 0 for N as an argument to any of these functions refers to the
689 current token. Lookahead is limited to the current command. Any N greater
690 than the number of tokens remaining in the current command will be treated
691 as referring to a T_ENDCMD token. */
693 /* Returns true if the token N ahead of the current token is a number. */
695 lex_next_is_number (const struct lexer *lexer, int n)
697 return token_is_number (lex_next (lexer, n));
700 /* Returns true if the token N ahead of the current token is a string. */
702 lex_next_is_string (const struct lexer *lexer, int n)
704 return token_is_string (lex_next (lexer, n));
707 /* Returns the value of the token N ahead of the current token, which must be a
708 floating point number. */
710 lex_next_number (const struct lexer *lexer, int n)
712 return token_number (lex_next (lexer, n));
715 /* Returns true if the token N ahead of the current token is an integer. */
717 lex_next_is_integer (const struct lexer *lexer, int n)
719 return token_is_integer (lex_next (lexer, n));
722 /* Returns the value of the token N ahead of the current token, which must be
725 lex_next_integer (const struct lexer *lexer, int n)
727 return token_integer (lex_next (lexer, n));
730 /* Token matching functions. */
732 /* If the current token has the specified TYPE, skips it and returns true.
733 Otherwise, returns false. */
735 lex_match (struct lexer *lexer, enum token_type type)
737 if (lex_token (lexer) == type)
746 /* If the current token matches IDENTIFIER, skips it and returns true.
747 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
750 IDENTIFIER must be an ASCII string. */
752 lex_match_id (struct lexer *lexer, const char *identifier)
754 return lex_match_id_n (lexer, identifier, 3);
757 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
758 may be abbreviated to its first N letters. Otherwise, returns false.
760 IDENTIFIER must be an ASCII string. */
762 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
764 if (lex_token (lexer) == T_ID
765 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
774 /* If the current token is integer X, skips it and returns true. Otherwise,
777 lex_match_int (struct lexer *lexer, int x)
779 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
788 /* Forced matches. */
790 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
791 abbreviated to its first 3 letters. Otherwise, reports an error and returns
794 IDENTIFIER must be an ASCII string. */
796 lex_force_match_id (struct lexer *lexer, const char *identifier)
798 if (lex_match_id (lexer, identifier))
802 lex_error_expecting (lexer, identifier);
807 /* If the current token has the specified TYPE, skips it and returns true.
808 Otherwise, reports an error and returns false. */
810 lex_force_match (struct lexer *lexer, enum token_type type)
812 if (lex_token (lexer) == type)
819 const char *type_string = token_type_to_string (type);
822 char *s = xasprintf ("`%s'", type_string);
823 lex_error_expecting (lexer, s);
827 lex_error_expecting (lexer, token_type_to_name (type));
833 /* If the current token is a string, does nothing and returns true.
834 Otherwise, reports an error and returns false. */
836 lex_force_string (struct lexer *lexer)
838 if (lex_is_string (lexer))
842 lex_error (lexer, _("expecting string"));
847 /* If the current token is a string or an identifier, does nothing and returns
848 true. Otherwise, reports an error and returns false.
850 This is meant for use in syntactic situations where we want to encourage the
851 user to supply a quoted string, but for compatibility we also accept
852 identifiers. (One example of such a situation is file names.) Therefore,
853 the error message issued when the current token is wrong only says that a
854 string is expected and doesn't mention that an identifier would also be
857 lex_force_string_or_id (struct lexer *lexer)
859 return lex_token (lexer) == T_ID || lex_force_string (lexer);
862 /* If the current token is an integer, does nothing and returns true.
863 Otherwise, reports an error and returns false. */
865 lex_force_int (struct lexer *lexer)
867 if (lex_is_integer (lexer))
871 lex_error (lexer, _("expecting integer"));
876 /* If the current token is an integer in the range MIN...MAX (inclusive), does
877 nothing and returns true. Otherwise, reports an error and returns false.
878 If NAME is nonnull, then it is used in the error message. */
880 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
882 bool is_number = lex_is_number (lexer);
883 bool is_integer = lex_is_integer (lexer);
884 bool too_small = (is_integer ? lex_integer (lexer) < min
885 : is_number ? lex_number (lexer) < min
887 bool too_big = (is_integer ? lex_integer (lexer) > max
888 : is_number ? lex_number (lexer) > max
890 if (is_integer && !too_small && !too_big)
895 /* Weird, maybe a bug in the caller. Just report that we needed an
898 lex_error (lexer, _("Integer expected for %s."), name);
900 lex_error (lexer, _("Integer expected."));
905 lex_error (lexer, _("Expected %ld for %s."), min, name);
907 lex_error (lexer, _("Expected %ld."), min);
909 else if (min + 1 == max)
912 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
914 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
918 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
919 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
921 if (report_lower_bound && report_upper_bound)
925 _("Expected integer between %ld and %ld for %s."),
928 lex_error (lexer, _("Expected integer between %ld and %ld."),
931 else if (report_lower_bound)
936 lex_error (lexer, _("Expected non-negative integer for %s."),
939 lex_error (lexer, _("Expected non-negative integer."));
944 lex_error (lexer, _("Expected positive integer for %s."),
947 lex_error (lexer, _("Expected positive integer."));
952 lex_error (lexer, _("Expected integer %ld or greater for %s."),
955 lex_error (lexer, _("Expected integer %ld or greater."), min);
958 else if (report_upper_bound)
962 _("Expected integer less than or equal to %ld for %s."),
965 lex_error (lexer, _("Expected integer less than or equal to %ld."),
971 lex_error (lexer, _("Integer expected for %s."), name);
973 lex_error (lexer, _("Integer expected."));
979 /* If the current token is a number, does nothing and returns true.
980 Otherwise, reports an error and returns false. */
982 lex_force_num (struct lexer *lexer)
984 if (lex_is_number (lexer))
987 lex_error (lexer, _("expecting number"));
991 /* If the current token is a number in the closed range [MIN,MAX], does
992 nothing and returns true. Otherwise, reports an error and returns false.
993 If NAME is nonnull, then it is used in the error message. */
995 lex_force_num_range_closed (struct lexer *lexer, const char *name,
996 double min, double max)
998 bool is_number = lex_is_number (lexer);
999 bool too_small = is_number && lex_number (lexer) < min;
1000 bool too_big = is_number && lex_number (lexer) > max;
1001 if (is_number && !too_small && !too_big)
1006 /* Weird, maybe a bug in the caller. Just report that we needed an
1009 lex_error (lexer, _("Number expected for %s."), name);
1011 lex_error (lexer, _("Number expected."));
1013 else if (min == max)
1016 lex_error (lexer, _("Expected %g for %s."), min, name);
1018 lex_error (lexer, _("Expected %g."), min);
1022 bool report_lower_bound = min > -DBL_MAX || too_small;
1023 bool report_upper_bound = max < DBL_MAX || too_big;
1025 if (report_lower_bound && report_upper_bound)
1029 _("Expected number between %g and %g for %s."),
1032 lex_error (lexer, _("Expected number between %g and %g."),
1035 else if (report_lower_bound)
1040 lex_error (lexer, _("Expected non-negative number for %s."),
1043 lex_error (lexer, _("Expected non-negative number."));
1048 lex_error (lexer, _("Expected number %g or greater for %s."),
1051 lex_error (lexer, _("Expected number %g or greater."), min);
1054 else if (report_upper_bound)
1058 _("Expected number less than or equal to %g for %s."),
1061 lex_error (lexer, _("Expected number less than or equal to %g."),
1067 lex_error (lexer, _("Number expected for %s."), name);
1069 lex_error (lexer, _("Number expected."));
1075 /* If the current token is a number in the half-open range [MIN,MAX), does
1076 nothing and returns true. Otherwise, reports an error and returns false.
1077 If NAME is nonnull, then it is used in the error message. */
1079 lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
1080 double min, double max)
1082 bool is_number = lex_is_number (lexer);
1083 bool too_small = is_number && lex_number (lexer) < min;
1084 bool too_big = is_number && lex_number (lexer) >= max;
1085 if (is_number && !too_small && !too_big)
1090 /* Weird, maybe a bug in the caller. Just report that we needed an
1093 lex_error (lexer, _("Number expected for %s."), name);
1095 lex_error (lexer, _("Number expected."));
1099 bool report_lower_bound = min > -DBL_MAX || too_small;
1100 bool report_upper_bound = max < DBL_MAX || too_big;
1102 if (report_lower_bound && report_upper_bound)
1105 lex_error (lexer, _("Expected number in [%g,%g) for %s."),
1108 lex_error (lexer, _("Expected number in [%g,%g)."),
1111 else if (report_lower_bound)
1116 lex_error (lexer, _("Expected non-negative number for %s."),
1119 lex_error (lexer, _("Expected non-negative number."));
1124 lex_error (lexer, _("Expected number %g or greater for %s."),
1127 lex_error (lexer, _("Expected number %g or greater."), min);
1130 else if (report_upper_bound)
1134 _("Expected number less than %g for %s."), max, name);
1136 lex_error (lexer, _("Expected number less than %g."), max);
1141 lex_error (lexer, _("Number expected for %s."), name);
1143 lex_error (lexer, _("Number expected."));
1149 /* If the current token is a number in the open range (MIN,MAX), does
1150 nothing and returns true. Otherwise, reports an error and returns false.
1151 If NAME is nonnull, then it is used in the error message. */
1153 lex_force_num_range_open (struct lexer *lexer, const char *name,
1154 double min, double max)
1156 bool is_number = lex_is_number (lexer);
1157 bool too_small = is_number && lex_number (lexer) <= min;
1158 bool too_big = is_number && lex_number (lexer) >= max;
1159 if (is_number && !too_small && !too_big)
1164 /* Weird, maybe a bug in the caller. Just report that we needed an
1167 lex_error (lexer, _("Number expected for %s."), name);
1169 lex_error (lexer, _("Number expected."));
1173 bool report_lower_bound = min > -DBL_MAX || too_small;
1174 bool report_upper_bound = max < DBL_MAX || too_big;
1176 if (report_lower_bound && report_upper_bound)
1179 lex_error (lexer, _("Expected number in (%g,%g) for %s."),
1182 lex_error (lexer, _("Expected number in (%g,%g)."), min, max);
1184 else if (report_lower_bound)
1189 lex_error (lexer, _("Expected positive number for %s."), name);
1191 lex_error (lexer, _("Expected positive number."));
1196 lex_error (lexer, _("Expected number greater than %g for %s."),
1199 lex_error (lexer, _("Expected number greater than %g."), min);
1202 else if (report_upper_bound)
1205 lex_error (lexer, _("Expected number less than %g for %s."),
1208 lex_error (lexer, _("Expected number less than %g."), max);
1213 lex_error (lexer, _("Number expected for %s."), name);
1215 lex_error (lexer, _("Number expected."));
1221 /* If the current token is an identifier, does nothing and returns true.
1222 Otherwise, reports an error and returns false. */
1224 lex_force_id (struct lexer *lexer)
1226 if (lex_token (lexer) == T_ID)
1229 lex_error (lexer, _("expecting identifier"));
1233 /* Token accessors. */
1235 /* Returns the type of LEXER's current token. */
1237 lex_token (const struct lexer *lexer)
1239 return lex_next_token (lexer, 0);
1242 /* Returns the number in LEXER's current token.
1244 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1245 tokens this function will always return zero. */
1247 lex_tokval (const struct lexer *lexer)
1249 return lex_next_tokval (lexer, 0);
1252 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1254 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1255 this function will always return NULL.
1257 The UTF-8 encoding of the returned string is correct for variable names and
1258 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1259 data_in() to use it in a "union value". */
1261 lex_tokcstr (const struct lexer *lexer)
1263 return lex_next_tokcstr (lexer, 0);
1266 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1267 null-terminated (but the null terminator is not included in the returned
1268 substring's 'length').
1270 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1271 this function will always return NULL.
1273 The UTF-8 encoding of the returned string is correct for variable names and
1274 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1275 data_in() to use it in a "union value". */
1277 lex_tokss (const struct lexer *lexer)
1279 return lex_next_tokss (lexer, 0);
1284 A value of 0 for N as an argument to any of these functions refers to the
1285 current token. Lookahead is limited to the current command. Any N greater
1286 than the number of tokens remaining in the current command will be treated
1287 as referring to a T_ENDCMD token. */
1289 static const struct lex_token *
1290 lex_next__ (const struct lexer *lexer_, int n)
1292 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1293 struct lex_source *src = lex_source__ (lexer);
1296 return lex_source_next__ (src, n);
1299 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1304 static const struct lex_token *
1305 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1307 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1311 static const struct lex_token endcmd_token
1312 = { .token = { .type = T_ENDCMD } };
1313 return &endcmd_token;
1316 while (ofs >= src->n_parse)
1318 if (src->n_parse > 0)
1320 const struct lex_token *t = src->parse[src->n_parse - 1];
1321 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1325 lex_source_get_parse (src);
1328 return src->parse[ofs];
1331 static const struct lex_token *
1332 lex_source_next__ (const struct lex_source *src, int n)
1334 return lex_source_ofs__ (src, n + src->parse_ofs);
1337 /* Returns the "struct token" of the token N after the current one in LEXER.
1338 The returned pointer can be invalidated by pretty much any succeeding call
1339 into the lexer, although the string pointer within the returned token is
1340 only invalidated by consuming the token (e.g. with lex_get()). */
1341 const struct token *
1342 lex_next (const struct lexer *lexer, int n)
1344 return &lex_next__ (lexer, n)->token;
1347 /* Returns the type of the token N after the current one in LEXER. */
1349 lex_next_token (const struct lexer *lexer, int n)
1351 return lex_next (lexer, n)->type;
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_number (token);
}
1364 /* Returns the null-terminated string in the token N after the current one, in
1367 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1368 this functions this function will always return NULL.
1370 The UTF-8 encoding of the returned string is correct for variable names and
1371 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1372 data_in() to use it in a "union value". */
1374 lex_next_tokcstr (const struct lexer *lexer, int n)
1376 return lex_next_tokss (lexer, n).string;
1379 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1380 The string is null-terminated (but the null terminator is not included in
1381 the returned substring's 'length').
1383 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1384 tokens this functions this function will always return NULL.
1386 The UTF-8 encoding of the returned string is correct for variable names and
1387 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1388 data_in() to use it in a "union value". */
1390 lex_next_tokss (const struct lexer *lexer, int n)
1392 return lex_next (lexer, n)->string;
1395 /* Returns the offset of the current token within the command being parsed in
1396 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1397 on. The return value is useful later for referring to this token in calls
1400 lex_ofs (const struct lexer *lexer)
1402 struct lex_source *src = lex_source__ (lexer);
1403 return src ? src->parse_ofs : 0;
1406 /* Returns the token within LEXER's current command with offset OFS. Use
1407 lex_ofs() to find out the offset of the current token. */
1408 const struct token *
1409 lex_ofs_token (const struct lexer *lexer_, int ofs)
1411 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1412 struct lex_source *src = lex_source__ (lexer);
1415 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1418 static const struct token stop_token = { .type = T_STOP };
/* Allocates and returns a new struct msg_location that spans the tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in LEXER.
   See lex_ofs() for an explanation of token offsets.

   The caller owns and must eventually free the returned object. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() takes positions relative to the current token. */
  int current = lex_ofs (lexer);
  int n0 = ofs0 - current;
  int n1 = ofs1 - current;
  return lex_get_location (lexer, n0, n1);
}
1435 /* Returns a msg_point for the first character in the token with offset OFS,
1436 where offset 0 is the first token in the command currently being parsed, 1
1437 the second token, and so on. These are absolute offsets, not relative to
1438 the token currently being parsed within the command.
1440 Returns zeros for a T_STOP token.
1443 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1445 const struct lex_source *src = lex_source__ (lexer);
1447 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1448 : (struct msg_point) { 0, 0 });
1451 /* Returns a msg_point for the last character, inclusive, in the token with
1452 offset OFS, where offset 0 is the first token in the command currently being
1453 parsed, 1 the second token, and so on. These are absolute offsets, not
1454 relative to the token currently being parsed within the command.
1456 Returns zeros for a T_STOP token.
1458 Most of the time, a single token is wholly within a single line of syntax,
1459 so that the start and end point for a given offset have the same line
1460 number. There are two exceptions: a T_STRING token can be made up of
1461 multiple segments on adjacent lines connected with "+" punctuators, and a
1462 T_NEG_NUM token can consist of a "-" on one line followed by the number on
1466 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1468 const struct lex_source *src = lex_source__ (lexer);
1470 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1471 : (struct msg_point) { 0, 0 });
1474 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1475 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1476 are both zero, this requests the syntax for the current token.)
1478 The caller must eventually free the returned string (with free()). The
1479 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1480 that, for example, it may include comments, spaces, and new-lines if it
1481 spans multiple tokens. Macro expansion, however, has already been
1484 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1486 const struct lex_source *src = lex_source__ (lexer);
1488 ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
/* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
   inclusive.  (For example, if OFS0 and OFS1 are both zero, this requests the
   syntax for the first token in the current command.)

   The caller must eventually free the returned string (with free()).  The
   syntax is encoded in UTF-8 and in the original form supplied to the lexer so
   that, for example, it may include comments, spaces, and new-lines if it
   spans multiple tokens.  Macro expansion, however, has already been
   performed.

   Returns a newly allocated empty string if LEXER has no current source. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (!src)
    return xstrdup ("");

  return lex_source_syntax__ (src, ofs0, ofs1);
}
1509 /* Returns true if the token N ahead of the current one was produced by macro
1510 expansion, false otherwise. */
1512 lex_next_is_from_macro (const struct lexer *lexer, int n)
1514 return lex_next__ (lexer, n)->macro_rep != NULL;
1518 lex_tokens_match (const struct token *actual, const struct token *expected)
1520 if (actual->type != expected->type)
1523 switch (actual->type)
1527 return actual->number == expected->number;
1530 return lex_id_match (expected->string, actual->string);
1533 return (actual->string.length == expected->string.length
1534 && !memcmp (actual->string.string, expected->string.string,
1535 actual->string.length));
1543 lex_at_phrase__ (struct lexer *lexer, const char *s)
1545 struct string_lexer slex;
1549 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1550 while (string_lexer_next (&slex, &token))
1552 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1553 token_uninit (&token);
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  No tokens are consumed.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens > 0;
}
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips those tokens and returns true.  Otherwise, returns false and leaves
   LEXER where it was.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1587 /* Returns the 1-based line number of the source text at the byte OFFSET in
1590 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1593 size_t hi = src->n_lines;
1596 size_t mid = (lo + hi) / 2;
1597 if (mid + 1 >= src->n_lines)
1598 return src->n_lines;
1599 else if (offset >= src->lines[mid + 1])
1601 else if (offset < src->lines[mid])
1608 /* Returns the 1-based column number of the source text at the byte OFFSET in
1611 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1613 const char *newline = memrchr (src->buffer, '\n', offset);
1614 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1615 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1618 static struct msg_point
1619 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1621 return (struct msg_point) {
1622 .line = lex_source_ofs_to_line_number (src, offset),
1623 .column = lex_source_ofs_to_column_number (src, offset),
1627 static struct msg_point
1628 lex_token_start_point (const struct lex_source *src,
1629 const struct lex_token *token)
1631 return lex_source_ofs_to_point__ (src, token->token_pos);
1634 static struct msg_point
1635 lex_token_end_point (const struct lex_source *src,
1636 const struct lex_token *token)
1638 return lex_source_ofs_to_point__ (src, lex_token_end (token));
1641 static struct msg_location
1642 lex_token_location (const struct lex_source *src,
1643 const struct lex_token *t0,
1644 const struct lex_token *t1)
1646 return (struct msg_location) {
1647 .file_name = intern_new_if_nonnull (src->reader->file_name),
1648 .start = lex_token_start_point (src, t0),
1649 .end = lex_token_end_point (src, t1),
1653 static struct msg_location *
1654 lex_token_location_rw (const struct lex_source *src,
1655 const struct lex_token *t0,
1656 const struct lex_token *t1)
1658 struct msg_location location = lex_token_location (src, t0, t1);
1659 return msg_location_dup (&location);
/* Returns a msg_location, obtained from lex_token_location_rw(), that spans
   the tokens with offsets OFS0 through OFS1 in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
{
  const struct lex_token *first = lex_source_ofs__ (src, ofs0);
  const struct lex_token *last = lex_source_ofs__ (src, ofs1);
  return lex_token_location_rw (src, first, last);
}
1670 /* Returns the name of the syntax file from which the current command is drawn.
1671 Returns NULL for a T_STOP token or if the command's source does not have
1674 There is no version of this function that takes an N argument because
1675 lookahead only works to the end of a command and any given command is always
1676 within a single syntax file. */
1678 lex_get_file_name (const struct lexer *lexer)
1680 struct lex_source *src = lex_source__ (lexer);
1681 return src == NULL ? NULL : src->reader->file_name;
1684 /* Returns a newly allocated msg_location for the syntax that represents tokens
1685 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1686 must eventually free the location (with msg_location_destroy()). */
1687 struct msg_location *
1688 lex_get_location (const struct lexer *lexer, int n0, int n1)
1690 struct msg_location *loc = xmalloc (sizeof *loc);
1691 *loc = (struct msg_location) {
1692 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1693 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1694 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1695 .src = lex_source__ (lexer),
1697 lex_source_ref (loc->src);
1702 lex_get_encoding (const struct lexer *lexer)
1704 struct lex_source *src = lex_source__ (lexer);
1705 return src == NULL ? NULL : src->reader->encoding;
1708 /* Returns the syntax mode for the syntax file from which the current drawn is
1709 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1710 does not have line numbers.
1712 There is no version of this function that takes an N argument because
1713 lookahead only works to the end of a command and any given command is always
1714 within a single syntax file. */
1716 lex_get_syntax_mode (const struct lexer *lexer)
1718 struct lex_source *src = lex_source__ (lexer);
1719 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1722 /* Returns the error mode for the syntax file from which the current drawn is
1723 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1724 source does not have line numbers.
1726 There is no version of this function that takes an N argument because
1727 lookahead only works to the end of a command and any given command is always
1728 within a single syntax file. */
1730 lex_get_error_mode (const struct lexer *lexer)
1732 struct lex_source *src = lex_source__ (lexer);
1733 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1736 /* If the source that LEXER is currently reading has error mode
1737 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1738 token to be read comes directly from whatever is next read from the stream.
1740 It makes sense to call this function after encountering an error in a
1741 command entered on the console, because usually the user would prefer not to
1742 have cascading errors. */
1744 lex_interactive_reset (struct lexer *lexer)
1746 struct lex_source *src = lex_source__ (lexer);
1747 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1750 src->journal_pos = src->seg_pos = 0;
1752 src->suppress_next_newline = false;
1753 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1755 lex_stage_clear (&src->pp);
1756 lex_stage_clear (&src->merge);
1757 lex_source_clear_parse (src);
1758 lex_source_push_endcmd__ (src);
1762 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1764 lex_discard_rest_of_command (struct lexer *lexer)
1766 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1770 /* Discards all lookahead tokens in LEXER, then discards all input sources
1771 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1772 runs out of input sources. */
1774 lex_discard_noninteractive (struct lexer *lexer)
1776 struct lex_source *src = lex_source__ (lexer);
1780 lex_stage_clear (&src->pp);
1781 lex_stage_clear (&src->merge);
1782 lex_source_clear_parse (src);
1784 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1785 src = lex_source__ (lexer))
1787 ll_remove (&src->ll);
1788 lex_source_unref (src);
1794 lex_source_expand__ (struct lex_source *src)
1796 if (src->length >= src->allocated)
1797 src->buffer = x2realloc (src->buffer, &src->allocated);
1801 lex_source_read__ (struct lex_source *src)
1805 lex_source_expand__ (src);
1807 size_t space = src->allocated - src->length;
1808 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1809 size_t n = src->reader->class->read (src->reader,
1810 &src->buffer[src->length],
1812 assert (n <= space);
1817 src->reader->eof = true;
1823 while (!memchr (&src->buffer[src->seg_pos], '\n',
1824 src->length - src->seg_pos));
1827 static struct lex_source *
1828 lex_source__ (const struct lexer *lexer)
1830 return (ll_is_empty (&lexer->sources) ? NULL
1831 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1834 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1835 OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are
1836 both zero, this requests the syntax for the first token in the current
1837 command.) The caller must eventually free the returned string (with
1838 free()). The syntax is encoded in UTF-8 and in the original form supplied
1839 to the lexer so that, for example, it may include comments, spaces, and
1840 new-lines if it spans multiple tokens. Macro expansion, however, has
1841 already been performed. */
1843 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1845 struct string s = DS_EMPTY_INITIALIZER;
1846 for (size_t i = ofs0; i <= ofs1; )
1848 /* Find [I,J) as the longest sequence of tokens not produced by macro
1849 expansion, or otherwise the longest sequence expanded from a single
1851 const struct lex_token *first = lex_source_ofs__ (src, i);
1853 for (j = i + 1; j <= ofs1; j++)
1855 const struct lex_token *cur = lex_source_ofs__ (src, j);
1856 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1857 || first->macro_rep != cur->macro_rep)
1860 const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1862 /* Now add the syntax for this sequence of tokens to SRC. */
1863 if (!ds_is_empty (&s))
1864 ds_put_byte (&s, ' ');
1865 if (!first->macro_rep)
1867 size_t start = first->token_pos;
1868 size_t end = last->token_pos + last->token_len;
1869 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1873 size_t start = first->ofs;
1874 size_t end = last->ofs + last->len;
1875 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1881 return ds_steal_cstr (&s);
1885 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
1887 for (int i = ofs0; i <= ofs1; i++)
1888 if (lex_source_ofs__ (src, i)->macro_rep)
1893 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1894 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1895 other tokens included in that range. The syntax is encoded in UTF-8 and in
1896 the original form supplied to the lexer so that, for example, it may include
1897 comments, spaces, and new-lines if it spans multiple tokens.
1899 Returns an empty string if the token range doesn't include a macro call.
1901 The caller must not modify or free the returned string. */
1902 static struct substring
1903 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
1905 if (!lex_source_contains_macro_call (src, ofs0, ofs1))
1908 const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
1909 const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
1910 size_t start = token0->token_pos;
1911 size_t end = token1->token_pos + token1->token_len;
1913 return ss_buffer (&src->buffer[start], end - start);
1917 lex_source_error_valist (struct lex_source *src, int ofs0, int ofs1,
1918 const char *format, va_list args)
1920 const struct lex_token *token;
1925 token = lex_source_ofs__ (src, ofs0);
1926 if (token->token.type == T_ENDCMD)
1927 ds_put_cstr (&s, _("Syntax error at end of command"));
1930 /* Get the syntax that caused the error. */
1931 char *raw_syntax = lex_source_syntax__ (src, ofs0, ofs1);
1933 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1936 /* Get the macro call(s) that expanded to the syntax that caused the
1939 str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
1946 _("Syntax error at `%s' (in expansion of `%s')"),
1949 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1954 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1957 ds_put_cstr (&s, _("Syntax error"));
1963 ds_put_cstr (&s, ": ");
1964 ds_put_vformat (&s, format, args);
1966 if (ds_last (&s) != '.')
1967 ds_put_byte (&s, '.');
1969 struct msg *m = xmalloc (sizeof *m);
1971 .category = MSG_C_SYNTAX,
1972 .severity = MSG_S_ERROR,
1973 .location = lex_source_get_location (src, ofs0, ofs1),
1974 .text = ds_steal_cstr (&s),
1980 lex_get_error (struct lex_source *src, const struct lex_token *token)
1983 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1984 syntax, sizeof syntax);
1986 struct string s = DS_EMPTY_INITIALIZER;
1987 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1988 ds_put_format (&s, ": %s", token->token.string.string);
1990 struct msg *m = xmalloc (sizeof *m);
1992 .category = MSG_C_SYNTAX,
1993 .severity = MSG_S_ERROR,
1994 .location = lex_token_location_rw (src, token, token),
1995 .text = ds_steal_cstr (&s),
2000 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2001 underlying lex_reader if necessary. Returns true if a new token was added
2002 to SRC's deque, false otherwise. The caller should retry failures unless
2003 SRC's 'eof' marker was set to true indicating that there will be no more
2004 tokens from this source. */
2006 lex_source_try_get_pp (struct lex_source *src)
2008 /* Append a new token to SRC and initialize it. */
2009 struct lex_token *token = xmalloc (sizeof *token);
2010 token->token = (struct token) { .type = T_STOP };
2011 token->macro_rep = NULL;
2012 token->ref_cnt = NULL;
2013 token->token_pos = src->seg_pos;
2015 /* Extract a segment. */
2016 const char *segment;
2017 enum segment_type seg_type;
2021 segment = &src->buffer[src->seg_pos];
2022 seg_len = segmenter_push (&src->segmenter, segment,
2023 src->length - src->seg_pos,
2024 src->reader->eof, &seg_type);
2028 /* The segmenter needs more input to produce a segment. */
2029 assert (!src->reader->eof);
2030 lex_source_read__ (src);
2033 /* Update state based on the segment. */
2034 token->token_len = seg_len;
2035 src->seg_pos += seg_len;
2036 if (seg_type == SEG_NEWLINE)
2038 if (src->n_lines >= src->allocated_lines)
2039 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2040 sizeof *src->lines);
2041 src->lines[src->n_lines++] = src->seg_pos;
2044 /* Get a token from the segment. */
2045 enum tokenize_result result = token_from_segment (
2046 seg_type, ss_buffer (segment, seg_len), &token->token);
2048 /* If we've reached the end of a line, or the end of a command, then pass
2049 the line to the output engine as a syntax text item. */
2050 int n_lines = seg_type == SEG_NEWLINE;
2051 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2054 src->suppress_next_newline = true;
2056 else if (n_lines > 0 && src->suppress_next_newline)
2059 src->suppress_next_newline = false;
2061 for (int i = 0; i < n_lines; i++)
2063 /* Beginning of line. */
2064 const char *line = &src->buffer[src->journal_pos];
2066 /* Calculate line length, including \n or \r\n end-of-line if present.
2068 We use src->length even though that may be beyond what we've actually
2069 converted to tokens. That's because, if we're emitting the line due
2070 to SEG_END_COMMAND, we want to take the whole line through the
2071 newline, not just through the '.'. */
2072 size_t max_len = src->length - src->journal_pos;
2073 const char *newline = memchr (line, '\n', max_len);
2074 size_t line_len = newline ? newline - line + 1 : max_len;
2076 /* Calculate line length excluding end-of-line. */
2077 size_t copy_len = line_len;
2078 if (copy_len > 0 && line[copy_len - 1] == '\n')
2080 if (copy_len > 0 && line[copy_len - 1] == '\r')
2083 /* Submit the line as syntax. */
2084 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2085 xmemdup0 (line, copy_len),
2088 src->journal_pos += line_len;
2093 case TOKENIZE_ERROR:
2094 lex_get_error (src, token);
2096 case TOKENIZE_EMPTY:
2097 lex_token_destroy (token);
2100 case TOKENIZE_TOKEN:
2101 if (token->token.type == T_STOP)
2103 token->token.type = T_ENDCMD;
2106 lex_stage_push_last (&src->pp, token);
2112 /* Attempts to append a new token to SRC. Returns true if successful, false on
2113 failure. On failure, the end of SRC has been reached and no more tokens
2114 will be forthcoming from it.
2116 Does not make the new token available for lookahead yet; the caller must
2117 adjust SRC's 'middle' pointer to do so. */
2119 lex_source_get_pp (struct lex_source *src)
2122 if (lex_source_try_get_pp (src))
2128 lex_source_try_get_merge (const struct lex_source *src_)
2130 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2132 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2135 if (!settings_get_mexpand ())
2137 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2141 /* Now pass tokens one-by-one to the macro expander.
2143 In the common case where there is no macro to expand, the loop is not
2145 struct macro_call *mc;
2146 int n_call = macro_call_create (src->lexer->macros,
2147 &lex_stage_first (&src->pp)->token, &mc);
2148 for (int ofs = 1; !n_call; ofs++)
2150 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2152 /* This should not be reachable because we always get a T_ENDCMD at
2153 the end of an input file (transformed from T_STOP by
2154 lex_source_try_get_pp()) and the macro_expander should always
2155 terminate expansion on T_ENDCMD. */
2159 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2160 const struct macro_token mt = {
2162 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2164 const struct msg_location loc = lex_token_location (src, t, t);
2165 n_call = macro_call_add (mc, &mt, &loc);
2169 /* False alarm: no macro expansion after all. Use first token as
2170 lookahead. We'll retry macro expansion from the second token next
2172 macro_call_destroy (mc);
2173 lex_stage_shift (&src->merge, &src->pp, 1);
2177 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2178 are a macro call. (These are likely to be the only tokens in 'pp'.)
2180 const struct lex_token *c0 = lex_stage_first (&src->pp);
2181 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2182 struct macro_tokens expansion = { .n = 0 };
2183 struct msg_location loc = lex_token_location (src, c0, c1);
2184 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2185 macro_call_destroy (mc);
2187 /* Convert the macro expansion into syntax for possible error messages
2189 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2190 size_t *len = xnmalloc (expansion.n, sizeof *len);
2191 struct string s = DS_EMPTY_INITIALIZER;
2192 macro_tokens_to_syntax (&expansion, &s, ofs, len);
2194 if (settings_get_mprint ())
2195 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2196 _("Macro Expansion")));
2198 /* Append the macro expansion tokens to the lookahead. */
2199 if (expansion.n > 0)
2201 char *macro_rep = ds_steal_cstr (&s);
2202 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2203 *ref_cnt = expansion.n;
2204 for (size_t i = 0; i < expansion.n; i++)
2206 struct lex_token *token = xmalloc (sizeof *token);
2207 *token = (struct lex_token) {
2208 .token = expansion.mts[i].token,
2209 .token_pos = c0->token_pos,
2210 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2211 .macro_rep = macro_rep,
2216 lex_stage_push_last (&src->merge, token);
2218 ss_dealloc (&expansion.mts[i].syntax);
2223 free (expansion.mts);
2227 /* Destroy the tokens for the call. */
2228 for (size_t i = 0; i < n_call; i++)
2229 lex_stage_pop_first (&src->pp);
2231 return expansion.n > 0;
2234 /* Attempts to obtain at least one new token into 'merge' in SRC.
2236 Returns true if successful, false on failure. In the latter case, SRC is
2237 exhausted and 'src->eof' is now true. */
2239 lex_source_get_merge (struct lex_source *src)
2242 if (lex_source_try_get_merge (src))
2247 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2249 Returns true if successful, false on failure. In the latter case, SRC is
2250 exhausted and 'src->eof' is now true. */
2252 lex_source_get_parse (struct lex_source *src)
2254 struct merger m = MERGER_INIT;
2256 for (size_t i = 0; ; i++)
2258 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2260 /* We always get a T_ENDCMD at the end of an input file
2261 (transformed from T_STOP by lex_source_try_get_pp()) and
2262 merger_add() should never return -1 on T_ENDCMD. */
2263 assert (lex_stage_is_empty (&src->merge));
2267 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2271 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2274 else if (retval > 0)
2276 /* Add a token that merges all the tokens together. */
2277 const struct lex_token *first = lex_stage_first (&src->merge);
2278 const struct lex_token *last = lex_stage_nth (&src->merge,
2280 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2281 struct lex_token *t = xmalloc (sizeof *t);
2282 *t = (struct lex_token) {
2284 .token_pos = first->token_pos,
2285 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2287 /* This works well if all the tokens were not expanded from macros,
2288 or if they came from the same macro expansion. It just gives up
2289 in the other (corner) cases. */
2290 .macro_rep = macro ? first->macro_rep : NULL,
2291 .ofs = macro ? first->ofs : 0,
2292 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2293 .ref_cnt = macro ? first->ref_cnt : NULL,
2297 lex_source_push_parse (src, t);
2299 for (int i = 0; i < retval; i++)
2300 lex_stage_pop_first (&src->merge);
2307 lex_source_push_endcmd__ (struct lex_source *src)
2309 assert (src->n_parse == 0);
2311 struct lex_token *token = xmalloc (sizeof *token);
2312 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2313 lex_source_push_parse (src, token);
2317 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2319 if (src->n_parse >= src->allocated_parse)
2320 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2321 sizeof *src->parse);
2322 src->parse[src->n_parse++] = token;
2326 lex_source_clear_parse (struct lex_source *src)
2328 for (size_t i = 0; i < src->n_parse; i++)
2329 lex_token_destroy (src->parse[i]);
2330 src->n_parse = src->parse_ofs = 0;
2333 static struct lex_source *
2334 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2336 size_t allocated_lines = 4;
2337 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2340 struct lex_source *src = xmalloc (sizeof *src);
2341 *src = (struct lex_source) {
2344 .segmenter = segmenter_init (reader->syntax, false),
2348 .allocated_lines = allocated_lines,
2351 lex_source_push_endcmd__ (src);
2357 lex_set_message_handler (struct lexer *lexer,
2358 void (*output_msg) (const struct msg *,
2361 struct msg_handler msg_handler = {
2362 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2364 .lex_source_ref = lex_source_ref,
2365 .lex_source_unref = lex_source_unref,
2366 .lex_source_get_line = lex_source_get_line,
2368 msg_set_handler (&msg_handler);
2372 lex_source_ref (const struct lex_source *src_)
2374 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2377 assert (src->n_refs > 0);
2383 lex_source_unref (struct lex_source *src)
2388 assert (src->n_refs > 0);
2389 if (--src->n_refs > 0)
2392 char *file_name = src->reader->file_name;
2393 char *encoding = src->reader->encoding;
2394 if (src->reader->class->destroy != NULL)
2395 src->reader->class->destroy (src->reader);
2400 lex_stage_uninit (&src->pp);
2401 lex_stage_uninit (&src->merge);
2402 lex_source_clear_parse (src);
2407 struct lex_file_reader
2409 struct lex_reader reader;
2410 struct u8_istream *istream;
2413 static struct lex_reader_class lex_file_reader_class;
2415 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2416 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2417 ENCODING, which should take one of the forms accepted by
2418 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2419 mode of the new reader, respectively.
2421 Returns a null pointer if FILE_NAME cannot be opened. */
2423 lex_reader_for_file (const char *file_name, const char *encoding,
2424 enum segmenter_mode syntax,
2425 enum lex_error_mode error)
2427 struct lex_file_reader *r;
2428 struct u8_istream *istream;
/* A FILE_NAME of "-" selects standard input; anything else is opened
   read-only as a file. */
2430 istream = (!strcmp(file_name, "-")
2431 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2432 : u8_istream_for_file (encoding, file_name, O_RDONLY));
/* If no stream could be obtained, report the failure with errno as the
   reason. */
2433 if (istream == NULL)
2435 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
/* Build the reader around the stream that was just opened. */
2439 r = xmalloc (sizeof *r);
2440 lex_reader_init (&r->reader, &lex_file_reader_class);
2441 r->reader.syntax = syntax;
2442 r->reader.error = error;
/* The reader stores its own duplicates of FILE_NAME and ENCODING rather than
   the caller's pointers. */
2443 r->reader.file_name = xstrdup (file_name);
2444 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* Line numbers are counted from 1. */
2445 r->reader.line_number = 1;
2446 r->istream = istream;
/* Converts R, which is expected to be the `reader' member of a struct
   lex_file_reader, into a pointer to that enclosing struct (via UP_CAST on
   the `reader' member). */
2451 static struct lex_file_reader *
2452 lex_file_reader_cast (struct lex_reader *r)
2454 return UP_CAST (r, struct lex_file_reader, reader);
/* Reads up to N bytes from file reader R_'s input stream into BUF.
   PROMPT_STYLE is accepted but not used. */
2458 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2459 enum prompt_style prompt_style UNUSED)
2461 struct lex_file_reader *r = lex_file_reader_cast (r_);
2462 ssize_t n_read = u8_istream_read (r->istream, buf, n);
/* Report a read failure, with errno as the reason.
   NOTE(review): the condition guarding this message (presumably n_read < 0)
   and the value returned afterward are not visible here -- confirm. */
2465 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Releases the input stream held by file reader R_. */
2472 lex_file_close (struct lex_reader *r_)
2474 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* Only a stream on a descriptor other than standard input is closed here; a
   failure to close it is reported with errno as the reason. */
2476 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2478 if (u8_istream_close (r->istream) != 0)
2479 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
/* NOTE(review): presumably the alternative branch for standard input, which
   frees the stream object without closing the descriptor -- confirm. */
2482 u8_istream_free (r->istream);
/* The reader class that lex_reader_for_file() installs in every file reader
   through lex_reader_init().
   NOTE(review): presumably its initializer wires up lex_file_read() and
   lex_file_close() -- confirm. */
2487 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that serves its input from an in-memory substring.  Besides
   the embedded base reader, lex_string_read() relies on members named `s'
   (the string) and `offset' (the current read position within it). */
2493 struct lex_string_reader
/* Embedded base reader; lex_string_reader_cast() converts a pointer to this
   member back into a pointer to the enclosing struct. */
2495 struct lex_reader reader;
/* Forward declaration, so that lex_reader_for_substring_nocopy() can refer to
   the class before its definition further down in the file. */
2500 static struct lex_reader_class lex_string_reader_class;
2502 /* Creates and returns a new lex_reader for the contents of S, which must be
2503 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2504 with ss_dealloc() when it is closed. */
2506 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2508 struct lex_string_reader *r;
2510 r = xmalloc (sizeof *r);
2511 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers are always created with SEG_MODE_AUTO; unlike
   lex_reader_for_file(), this constructor has no syntax-mode parameter. */
2512 r->reader.syntax = SEG_MODE_AUTO;
/* The reader stores its own duplicate of ENCODING rather than the caller's
   pointer. */
2513 r->reader.encoding = xstrdup_if_nonnull (encoding);
2520 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2521 which must be encoded in ENCODING. The caller retains ownership of S. */
2523 lex_reader_for_string (const char *s, const char *encoding)
2525 struct substring ss;
/* Make a separate copy of S first: the _nocopy constructor takes ownership
   of the substring it is given, while S itself stays with the caller. */
2526 ss_alloc_substring (&ss, ss_cstr (s));
2527 return lex_reader_for_substring_nocopy (ss, encoding);
2530 /* Formats FORMAT as a printf()-like format string and creates and returns a
2531 new lex_reader for the formatted result, which must be encoded in ENCODING. */
2533 lex_reader_for_format (const char *format, const char *encoding, ...)
2535 struct lex_reader *r;
/* The variable arguments follow ENCODING, the last named parameter, so that
   is the parameter va_start() has to name. */
2538 va_start (args, encoding);
/* The string produced by xvasprintf() is handed straight to the _nocopy
   constructor, so the new reader becomes its owner. */
2539 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Converts R, which is expected to be the `reader' member of a struct
   lex_string_reader, into a pointer to that enclosing struct (via UP_CAST on
   the `reader' member). */
2545 static struct lex_string_reader *
2546 lex_string_reader_cast (struct lex_reader *r)
2548 return UP_CAST (r, struct lex_string_reader, reader);
/* Copies up to N bytes of string reader R_'s string, starting at the reader's
   current offset, into BUF.  PROMPT_STYLE is accepted but not used.
   NOTE(review): presumably the offset is then advanced and the number of
   bytes copied is returned; those statements are not visible here --
   confirm. */
2552 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2553 enum prompt_style prompt_style UNUSED)
2555 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Never copy past the end of the string: limit the request to the bytes that
   remain after the current offset. */
2558 chunk = MIN (n, r->s.length - r->offset);
2559 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string readers.
   NOTE(review): the contract documented on lex_reader_for_substring_nocopy()
   says the reader's string is freed with ss_dealloc() when the reader is
   closed, so that is presumably done here -- confirm. */
2566 lex_string_close (struct lex_reader *r_)
2568 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* The reader class that lex_reader_for_substring_nocopy() installs in every
   string reader through lex_reader_init().
   NOTE(review): presumably its initializer wires up lex_string_read() and
   lex_string_close() -- confirm. */
2574 static struct lex_reader_class lex_string_reader_class =
/* Returns the text of line number LINE (counted from 1) of SRC, as a
   substring built from a pointer into SRC's buffer and a byte length.  This
   function is also installed as the `.lex_source_get_line' callback in this
   file. */
2581 lex_source_get_line (const struct lex_source *src, int line)
/* Line numbers outside 1...n_lines get separate handling.
   NOTE(review): the value returned in that case is not visible here --
   confirm. */
2583 if (line < 1 || line > src->n_lines)
/* src->lines[i] is the buffer offset at which line i + 1 starts. */
2586 size_t ofs = src->lines[line - 1];
/* A line ends where the next one starts; the last line has no successor, so
   it runs to src->length instead. */
2587 size_t end = line >= src->n_lines ? src->length : src->lines[line];
2588 return ss_buffer (&src->buffer[ofs], end - ofs);