1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
/* Translates MSGID at run time by passing it to gettext(). */
56 #define _(msgid) gettext (msgid)
/* Expands to MSGID itself; no translation happens at the point of use. */
57 #define N_(msgid) msgid
59 /* A token within a lex_source.

   Besides the token proper, each lex_token records where its text lies in
   the source buffer (token_pos, token_len) and where it lies in a macro
   expansion string (macro_rep, ofs, len) that is shared between tokens and
   counted through ref_cnt. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
/* Forward declarations.  Each returns a struct msg_point for a token of a
   lex_source; presumably the point where the token starts and the point
   where it ends (the definitions are not visible here -- confirm). */
83 static struct msg_point lex_token_start_point (const struct lex_source *,
84 const struct lex_token *);
85 static struct msg_point lex_token_end_point (const struct lex_source *,
86 const struct lex_token *);
88 /* Source offset of the last byte in TOKEN.

   A token whose token_len is 0 is treated as if it were 1 byte long, so the
   result is then token_pos itself rather than token_pos - 1. */
90 lex_token_end (const struct lex_token *token)
92 return token->token_pos + MAX (token->token_len, 1) - 1;
/* Releases T.  The visible statements uninitialize the embedded token and
   assert that the reference count T points to is still positive.
   NOTE(review): the remaining statements are not visible here; presumably
   they drop the reference on macro_rep and free T -- confirm. */
96 lex_token_destroy (struct lex_token *t)
98 token_uninit (&t->token);
101 assert (*t->ref_cnt > 0);
/* struct lex_stage and forward declarations of its helpers.  As the helper
   definitions in this file show, a stage pairs a 'deque' member with the
   'tokens' array: deque operations yield the indexes at which lex_token
   pointers are stored in 'tokens'. */
111 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
116 struct lex_token **tokens;
119 static void lex_stage_clear (struct lex_stage *);
120 static void lex_stage_uninit (struct lex_stage *);
122 static size_t lex_stage_count (const struct lex_stage *);
123 static bool lex_stage_is_empty (const struct lex_stage *);
125 static struct lex_token *lex_stage_first (struct lex_stage *);
126 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
128 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
129 static void lex_stage_pop_first (struct lex_stage *);
131 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
134 /* Deletes all the tokens from STAGE, popping and destroying the first token
   repeatedly until STAGE's deque is empty. */
136 lex_stage_clear (struct lex_stage *stage)
138 while (!deque_is_empty (&stage->deque))
139 lex_stage_pop_first (stage);
142 /* Deletes all the tokens from STAGE and frees storage for the deque.
   Only the 'tokens' array is freed; STAGE itself is not. */
144 lex_stage_uninit (struct lex_stage *stage)
146 lex_stage_clear (stage);
147 free (stage->tokens);
150 /* Returns true if STAGE contains no tokens, otherwise false.  The answer
   comes from deque_is_empty() on STAGE's deque. */
152 lex_stage_is_empty (const struct lex_stage *stage)
154 return deque_is_empty (&stage->deque);
157 /* Returns the number of tokens in STAGE, as reported by deque_count() on
   STAGE's deque. */
159 lex_stage_count (const struct lex_stage *stage)
161 return deque_count (&stage->deque);
164 /* Returns the first token in STAGE, which must be nonempty.
165 The first token is the one accessed with the least lookahead.
   Equivalent to lex_stage_nth (STAGE, 0). */
166 static struct lex_token *
167 lex_stage_first (struct lex_stage *stage)
169 return lex_stage_nth (stage, 0);
172 /* Returns the token at the given INDEX in STAGE. The first token (with the least
173 lookahead) is 0, the second token is 1, and so on. There must be at least
174 INDEX + 1 tokens in STAGE. */
175 static struct lex_token *
176 lex_stage_nth (struct lex_stage *stage, size_t index)
178 return stage->tokens[deque_back (&stage->deque, index)];
181 /* Adds TOKEN so that it becomes the last token in STAGE.  If the deque is
   full, the 'tokens' array is first enlarged with deque_expand(). */
183 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
185 if (deque_is_full (&stage->deque))
186 stage->tokens = deque_expand (&stage->deque, stage->tokens,
187 sizeof *stage->tokens);
188 stage->tokens[deque_push_front (&stage->deque)] = token;
191 /* Removes and returns the first token from STAGE.  The token is handed to
   the caller as is; it is not destroyed here. */
192 static struct lex_token *
193 lex_stage_take_first (struct lex_stage *stage)
195 return stage->tokens[deque_pop_back (&stage->deque)];
198 /* Removes the first token from STAGE and uninitializes it, by passing it to
   lex_token_destroy(). */
200 lex_stage_pop_first (struct lex_stage *stage)
202 lex_token_destroy (lex_stage_take_first (stage));
/* NOTE(review): the closing line of the comment below is not visible here.
   The loop takes the first token from SRC and pushes it as the last token
   of DST, N times, so the moved tokens keep their relative order. */
205 /* Removes the first N tokens from SRC, appending them to DST as the last
208 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
210 for (size_t i = 0; i < n; i++)
211 lex_stage_push_last (dst, lex_stage_take_first (src));
214 /* A source of tokens, corresponding to a syntax file.
216 This is conceptually a lex_reader wrapped with everything needed to convert
217 its UTF-8 bytes into tokens. */
220 struct ll ll; /* In lexer's list of sources. */
224 - One for struct lexer.
226 - One for each struct msg_location that references this source. */
229 struct lex_reader *reader; /* The lex_reader being wrapped. */
231 struct segmenter segmenter;
232 bool eof; /* True if T_STOP was read from 'reader'. */
234 /* Buffer of UTF-8 bytes. */
235 char *buffer; /* Source file contents. */
236 size_t length; /* Number of bytes filled. */
237 size_t allocated; /* Number of bytes allocated. */
239 /* Offsets into 'buffer'. */
240 size_t journal_pos; /* First byte not yet output to journal. */
241 size_t seg_pos; /* First byte not yet scanned as token. */
243 /* Offset into 'buffer' of starts of lines. */
245 size_t n_lines, allocated_lines; /* Presumably used and allocated counts for the line starts -- confirm. */
247 bool suppress_next_newline;
251 This is a pipeline with the following stages. Each token eventually
252 made available to the parser passes through all of these stages. The stages
253 are named after the processing that happens in each one.
255 Initially, tokens come from the segmenter and scanner to 'pp':
257 - pp: Tokens that need to pass through the macro preprocessor to end up
260 - merge: Tokens that need to pass through scan_merge() to end up in
263 - parse: Tokens available to the client for parsing.
265 'pp' and 'merge' store tokens only temporarily until they pass into
266 'parse'. Tokens then live in 'parse' until the command is fully
267 consumed, at which time they are freed together. */
269 struct lex_stage merge; /* Tokens that still need scan_merge(). */
270 struct lex_token **parse; /* Tokens available to the client for parsing. */
271 size_t n_parse, allocated_parse, parse_ofs; /* Tokens in 'parse'; presumably its capacity; index of the current token. */
/* Forward declaration.  lex_include() and lex_append() call this to wrap a
   reader in a new lex_source for a lexer. */
274 static struct lex_source *lex_source_create (struct lexer *,
275 struct lex_reader *);
/* Members of struct lexer (the struct's opening line is not visible here):
   the list of sources being read and the set of defined macros. */
280 struct ll_list sources; /* Contains "struct lex_source"s. */
281 struct macro_set *macros;
/* Forward declarations of static helpers used by the functions in this
   file. */
284 static struct lex_source *lex_source__ (const struct lexer *);
285 static char *lex_source_syntax__ (const struct lex_source *,
287 static const struct lex_token *lex_next__ (const struct lexer *, int n);
288 static void lex_source_push_endcmd__ (struct lex_source *);
289 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
290 static void lex_source_clear_parse (struct lex_source *);
292 static bool lex_source_get_parse (struct lex_source *);
293 static void lex_source_error_valist (struct lex_source *, int ofs0, int ofs1,
294 const char *format, va_list)
295 PRINTF_FORMAT (4, 0);
296 static const struct lex_token *lex_source_next__ (const struct lex_source *,
299 /* Initializes READER with the specified CLASS and otherwise some reasonable
300 defaults. The caller should fill in the other members as desired.

   The defaults visible here are syntax mode SEG_MODE_AUTO, error mode
   LEX_ERROR_CONTINUE, null file name and encoding, and line number 0. */
302 lex_reader_init (struct lex_reader *reader,
303 const struct lex_reader_class *class)
305 reader->class = class;
306 reader->syntax = SEG_MODE_AUTO;
307 reader->error = LEX_ERROR_CONTINUE;
308 reader->file_name = NULL;
309 reader->encoding = NULL;
310 reader->line_number = 0;
314 /* Frees any file name already in READER and replaces it by a copy of
315 FILE_NAME, or if FILE_NAME is null then clears any existing name.
   READER stores its own copy, so the caller keeps ownership of FILE_NAME. */
317 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
319 free (reader->file_name);
320 reader->file_name = xstrdup_if_nonnull (file_name);
323 /* Creates and returns a new lexer.  Its list of sources is set up with
   LL_INITIALIZER and its macro set is newly created with
   macro_set_create(). */
327 struct lexer *lexer = xmalloc (sizeof *lexer);
328 *lexer = (struct lexer) {
329 .sources = LL_INITIALIZER (lexer->sources),
330 .macros = macro_set_create (),
335 /* Destroys LEXER.  Each source is removed from LEXER's list and
   unreferenced, and then the macro set is destroyed.  The safe list
   iterator is used because sources are removed while iterating. */
337 lex_destroy (struct lexer *lexer)
341 struct lex_source *source, *next;
343 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
345 ll_remove (&source->ll);
346 lex_source_unref (source);
348 macro_set_destroy (lexer->macros);
353 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
354 same name. Takes ownership of M.  The work is done by macro_set_add(). */
356 lex_define_macro (struct lexer *lexer, struct macro *m)
358 macro_set_add (lexer->macros, m);
/* NOTE(review): the comment below is cut off before its final line, and
   "Before the caller" presumably means "Before the call" -- confirm.  The
   assertion checks that LEXER has no sources or is at a T_ENDCMD token, and
   the new source is pushed at the head of LEXER's source list. */
361 /* Inserts READER into LEXER so that the next token read by LEXER comes from
362 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
365 lex_include (struct lexer *lexer, struct lex_reader *reader)
367 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
368 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
371 /* Appends READER to LEXER, so that it will be read after all other current
372 readers have already been read.  The new source is pushed at the tail of
   LEXER's source list. */
374 lex_append (struct lexer *lexer, struct lex_reader *reader)
376 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
381 /* Advances LEXER to the next token, consuming the current token.

   When the token at the current parse offset is T_ENDCMD, the source's
   parse tokens are cleared.  While no unconsumed parsed token remains, more
   are requested from the source; a source that cannot supply any is removed
   from LEXER's list and unreferenced, and the lexer's current source is
   looked up again.
   NOTE(review): several short lines of this function are not visible here,
   so this summary covers only the visible statements. */
383 lex_get (struct lexer *lexer)
385 struct lex_source *src;
387 src = lex_source__ (lexer);
391 if (src->parse_ofs < src->n_parse)
393 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
394 lex_source_clear_parse (src);
399 while (src->parse_ofs == src->n_parse)
400 if (!lex_source_get_parse (src))
402 ll_remove (&src->ll);
403 lex_source_unref (src);
404 src = lex_source__ (lexer);
410 /* Advances LEXER by N tokens.
   NOTE(review): the body is not visible here; presumably it calls
   lex_get() N times -- confirm. */
412 lex_get_n (struct lexer *lexer, size_t n)
418 /* Issuing errors. */
420 /* Prints a syntax error message containing the current token and
421 the message produced from printf-style FORMAT and its arguments (if FORMAT
   is non-null).  The error span is just the current token: lex_ofs (LEXER)
   is passed as both ends of the range. */
423 lex_error (struct lexer *lexer, const char *format, ...)
427 va_start (args, format);
428 lex_ofs_error_valist (lexer, lex_ofs (lexer), lex_ofs (lexer), format, args);
432 /* Prints a syntax error message containing the current token and
433 the message produced from printf-style FORMAT with ARGS (if FORMAT is
   non-null).  This is the va_list counterpart of lex_error(). */
435 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
437 lex_ofs_error_valist (lexer, lex_ofs (lexer), lex_ofs (lexer), format, args);
/* N0 and N1 are relative to the current token; they are converted to
   offsets within the command by adding lex_ofs (LEXER) before the error is
   issued through lex_ofs_error_valist(). */
440 /* Prints a syntax error message for the span of tokens N0 through N1,
441 inclusive, from the current token in LEXER, adding message MESSAGE (if
444 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
448 va_start (args, format);
449 int ofs = lex_ofs (lexer);
450 lex_ofs_error_valist (lexer, n0 + ofs, n1 + ofs, format, args);
454 /* Prints a syntax error message for the span of tokens with offsets OFS0
455 through OFS1, inclusive, within the current command in LEXER, adding message
456 MESSAGE (if non-null).  MESSAGE is given as printf-style FORMAT plus
   arguments; this is the variadic front end to lex_ofs_error_valist(). */
458 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
462 va_start (args, format);
463 lex_ofs_error_valist (lexer, ofs0, ofs1, format, args);
467 /* Prints a syntax error message saying that one of the strings provided as
468 varargs, up to the first NULL, is expected.

   NOTE(review): the function name is parenthesized, presumably so that a
   function-like macro of the same name does not expand at this definition
   -- confirm against the header. */
470 (lex_error_expecting) (struct lexer *lexer, ...)
474 va_start (args, lexer);
475 lex_error_expecting_valist (lexer, args);
479 /* Prints a syntax error message saying that one of the options provided in
480 ARGS, up to the first NULL, is expected.

   At most MAX_OPTIONS (9) options are read from ARGS; collection stops there
   even if no NULL has been seen.  The collected options are reported
   through lex_error_expecting_array(). */
482 lex_error_expecting_valist (struct lexer *lexer, va_list args)
484 enum { MAX_OPTIONS = 9 };
485 const char *options[MAX_OPTIONS];
487 while (n < MAX_OPTIONS)
489 const char *option = va_arg (args, const char *);
493 options[n++] = option;
495 lex_error_expecting_array (lexer, options, n);
/* Prints a syntax error message saying that one of the N strings in OPTIONS
   is expected.  Separate translatable formats exist for one through eight
   options, and a comma-separated "expecting one of the following" list is
   built in a string for the remaining case.
   NOTE(review): the case labels are not visible here; presumably N == 0
   selects the lex_error (lexer, NULL) call and N > 8 the list -- confirm. */
499 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
504 lex_error (lexer, NULL);
508 lex_error (lexer, _("expecting %s"), options[0]);
512 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
516 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
521 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
522 options[0], options[1], options[2], options[3]);
526 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
527 options[0], options[1], options[2], options[3], options[4]);
531 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
532 options[0], options[1], options[2], options[3], options[4],
537 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
538 options[0], options[1], options[2], options[3], options[4],
539 options[5], options[6]);
543 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
544 options[0], options[1], options[2], options[3], options[4],
545 options[5], options[6], options[7]);
550 struct string s = DS_EMPTY_INITIALIZER;
551 for (size_t i = 0; i < n; i++)
554 ds_put_cstr (&s, ", ");
555 ds_put_cstr (&s, options[i]);
557 lex_error (lexer, _("expecting one of the following: %s"),
565 /* Reports an error to the effect that subcommand SBC may only be specified
568 This function does not take a lexer as an argument or use lex_error(),
569 because the result would ordinarily just be redundant: "Syntax error at
570 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
571 not help the user find the error.  The message is issued directly with
   msg (SE, ...). */
573 lex_sbc_only_once (const char *sbc)
575 msg (SE, _("Subcommand %s may only be specified once."), sbc);
578 /* Reports an error to the effect that subcommand SBC is missing.
580 This function does not take a lexer as an argument or use lex_error(),
581 because a missing subcommand can normally be detected only after the whole
582 command has been parsed, and so lex_error() would always report "Syntax
583 error at end of command", which does not help the user find the error.
   The message is issued directly with msg (SE, ...). */
585 lex_sbc_missing (const char *sbc)
587 msg (SE, _("Required subcommand %s was not specified."), sbc);
590 /* Reports an error to the effect that specification SPEC may only be specified
591 once within subcommand SBC.  Unlike lex_sbc_only_once(), this goes through
   lex_error(), so the message is tied to LEXER's current token. */
593 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
595 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
/* The error is reported through lex_error(), so the message is tied to
   LEXER's current token. */
599 /* Reports an error to the effect that specification SPEC is missing within
602 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
604 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
608 /* Prints a syntax error message for the span of tokens with offsets OFS0
609 through OFS1, inclusive, within the current command in LEXER, adding message
610 MESSAGE (if non-null) with the given ARGS.

   Two paths are visible: the error is handed to lex_source_error_valist()
   for LEXER's current source, or a "Syntax error at end of input" message
   is built locally, extended with the formatted message, given a trailing
   period if it lacks one, and emitted with msg (SE, ...).
   NOTE(review): the condition choosing between them is not visible here;
   presumably the second path is taken when LEXER has no source. */
612 lex_ofs_error_valist (struct lexer *lexer, int ofs0, int ofs1,
613 const char *format, va_list args)
615 struct lex_source *src = lex_source__ (lexer);
618 lex_source_error_valist (src, ofs0, ofs1, format, args);
624 ds_put_format (&s, _("Syntax error at end of input"));
627 ds_put_cstr (&s, ": ");
628 ds_put_vformat (&s, format, args);
630 if (ds_last (&s) != '.')
631 ds_put_byte (&s, '.');
632 msg (SE, "%s", ds_cstr (&s));
/* Both T_ENDCMD and T_STOP count as end of command here: the error is
   raised only when the current token is neither. */
637 /* Checks that we're at end of command.
638 If so, returns a successful command completion code.
639 If not, flags a syntax error and returns an error command
642 lex_end_of_command (struct lexer *lexer)
644 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
646 lex_error (lexer, _("expecting end of command"));
653 /* Token testing functions. */
655 /* Returns true if the current token is a number.  Shorthand for
   lex_next_is_number (LEXER, 0). */
657 lex_is_number (const struct lexer *lexer)
659 return lex_next_is_number (lexer, 0);
662 /* Returns true if the current token is a string.  Shorthand for
   lex_next_is_string (LEXER, 0). */
664 lex_is_string (const struct lexer *lexer)
666 return lex_next_is_string (lexer, 0);
669 /* Returns the value of the current token, which must be a
670 floating point number.  Shorthand for lex_next_number (LEXER, 0). */
672 lex_number (const struct lexer *lexer)
674 return lex_next_number (lexer, 0);
677 /* Returns true iff the current token is an integer.  Shorthand for
   lex_next_is_integer (LEXER, 0). */
679 lex_is_integer (const struct lexer *lexer)
681 return lex_next_is_integer (lexer, 0);
/* Shorthand for lex_next_integer (LEXER, 0). */
684 /* Returns the value of the current token, which must be an
687 lex_integer (const struct lexer *lexer)
689 return lex_next_integer (lexer, 0);
692 /* Token testing functions with lookahead.
694 A value of 0 for N as an argument to any of these functions refers to the
695 current token. Lookahead is limited to the current command. Any N greater
696 than the number of tokens remaining in the current command will be treated
697 as referring to a T_ENDCMD token. */
699 /* Returns true if the token N ahead of the current token is a number, as
   judged by token_is_number(). */
701 lex_next_is_number (const struct lexer *lexer, int n)
703 return token_is_number (lex_next (lexer, n));
706 /* Returns true if the token N ahead of the current token is a string, as
   judged by token_is_string(). */
708 lex_next_is_string (const struct lexer *lexer, int n)
710 return token_is_string (lex_next (lexer, n));
713 /* Returns the value of the token N ahead of the current token, which must be a
714 floating point number.  The value is obtained with token_number(). */
716 lex_next_number (const struct lexer *lexer, int n)
718 return token_number (lex_next (lexer, n));
721 /* Returns true if the token N ahead of the current token is an integer, as
   judged by token_is_integer(). */
723 lex_next_is_integer (const struct lexer *lexer, int n)
725 return token_is_integer (lex_next (lexer, n));
/* The value is obtained by passing the token to token_integer(). */
728 /* Returns the value of the token N ahead of the current token, which must be
731 lex_next_integer (const struct lexer *lexer, int n)
733 return token_integer (lex_next (lexer, n));
736 /* Token matching functions. */
738 /* If the current token has the specified TYPE, skips it and returns true.
739 Otherwise, returns false.  The current token's type is obtained with
   lex_token(). */
741 lex_match (struct lexer *lexer, enum token_type type)
743 if (lex_token (lexer) == type)
752 /* If the current token matches IDENTIFIER, skips it and returns true.
753 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
756 IDENTIFIER must be an ASCII string.  Equivalent to
   lex_match_id_n (LEXER, IDENTIFIER, 3). */
758 lex_match_id (struct lexer *lexer, const char *identifier)
760 return lex_match_id_n (lexer, identifier, 3);
763 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
764 may be abbreviated to its first N letters. Otherwise, returns false.
766 IDENTIFIER must be an ASCII string.  Only a T_ID token can match; the
   comparison itself is done by lex_id_match_n(). */
768 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
770 if (lex_token (lexer) == T_ID
771 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
/* A match requires the current token to be an integer whose value equals
   X. */
780 /* If the current token is integer X, skips it and returns true. Otherwise,
783 lex_match_int (struct lexer *lexer, int x)
785 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
794 /* Forced matches. */
796 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
797 abbreviated to its first 3 letters. Otherwise, reports an error and returns
800 IDENTIFIER must be an ASCII string.  The error, issued through
   lex_error_expecting(), names IDENTIFIER as what was expected. */
802 lex_force_match_id (struct lexer *lexer, const char *identifier)
804 if (lex_match_id (lexer, identifier))
808 lex_error_expecting (lexer, identifier);
813 /* If the current token has the specified TYPE, skips it and returns true.
814 Otherwise, reports an error and returns false.

   The error names the expected token either by the string from
   token_type_to_string(), wrapped in quotes, or by the name from
   token_type_to_name().
   NOTE(review): the condition choosing between the two is not visible
   here; presumably the quoted form is used when token_type_to_string()
   returns non-null. */
816 lex_force_match (struct lexer *lexer, enum token_type type)
818 if (lex_token (lexer) == type)
825 const char *type_string = token_type_to_string (type);
828 char *s = xasprintf ("`%s'", type_string);
829 lex_error_expecting (lexer, s);
833 lex_error_expecting (lexer, token_type_to_name (type));
839 /* If the current token is a string, does nothing and returns true.
840 Otherwise, reports an error and returns false.  The token is never
   consumed by any visible statement. */
842 lex_force_string (struct lexer *lexer)
844 if (lex_is_string (lexer))
848 lex_error (lexer, _("expecting string"));
/* A T_ID token is accepted without calling lex_force_string(); that
   function, and hence its error message, comes into play only when the
   current token is not an identifier. */
853 /* If the current token is a string or an identifier, does nothing and returns
854 true. Otherwise, reports an error and returns false.
856 This is meant for use in syntactic situations where we want to encourage the
857 user to supply a quoted string, but for compatibility we also accept
858 identifiers. (One example of such a situation is file names.) Therefore,
859 the error message issued when the current token is wrong only says that a
860 string is expected and doesn't mention that an identifier would also be
863 lex_force_string_or_id (struct lexer *lexer)
865 return lex_token (lexer) == T_ID || lex_force_string (lexer);
868 /* If the current token is an integer, does nothing and returns true.
869 Otherwise, reports an error and returns false.  The test is
   lex_is_integer(). */
871 lex_force_int (struct lexer *lexer)
873 if (lex_is_integer (lexer))
877 lex_error (lexer, _("expecting integer"));
882 /* If the current token is an integer in the range MIN...MAX (inclusive), does
883 nothing and returns true. Otherwise, reports an error and returns false.
884 If NAME is nonnull, then it is used in the error message.

   A non-integer number is still compared against MIN and MAX so that the
   message can reflect which bound it violated.  A bound is mentioned in the
   message only if the token violated it or it is not extreme, that is,
   MIN > INT_MIN / 2 or MAX < INT_MAX / 2.
   NOTE(review): MIN and MAX are long while these thresholds use the int
   limits -- confirm that this is intentional. */
886 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
888 bool is_number = lex_is_number (lexer);
889 bool is_integer = lex_is_integer (lexer);
890 bool too_small = (is_integer ? lex_integer (lexer) < min
891 : is_number ? lex_number (lexer) < min
893 bool too_big = (is_integer ? lex_integer (lexer) > max
894 : is_number ? lex_number (lexer) > max
896 if (is_integer && !too_small && !too_big)
901 /* Weird, maybe a bug in the caller. Just report that we needed an
904 lex_error (lexer, _("Integer expected for %s."), name);
906 lex_error (lexer, _("Integer expected."));
911 lex_error (lexer, _("Expected %ld for %s."), min, name);
913 lex_error (lexer, _("Expected %ld."), min);
915 else if (min + 1 == max)
918 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
920 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
924 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
925 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
927 if (report_lower_bound && report_upper_bound)
931 _("Expected integer between %ld and %ld for %s."),
934 lex_error (lexer, _("Expected integer between %ld and %ld."),
937 else if (report_lower_bound)
942 lex_error (lexer, _("Expected non-negative integer for %s."),
945 lex_error (lexer, _("Expected non-negative integer."));
950 lex_error (lexer, _("Expected positive integer for %s."),
953 lex_error (lexer, _("Expected positive integer."));
958 lex_error (lexer, _("Expected integer %ld or greater for %s."),
961 lex_error (lexer, _("Expected integer %ld or greater."), min);
964 else if (report_upper_bound)
968 _("Expected integer less than or equal to %ld for %s."),
971 lex_error (lexer, _("Expected integer less than or equal to %ld."),
977 lex_error (lexer, _("Integer expected for %s."), name);
979 lex_error (lexer, _("Integer expected."));
985 /* If the current token is a number, does nothing and returns true.
986 Otherwise, reports an error and returns false.  The test is
   lex_is_number(). */
988 lex_force_num (struct lexer *lexer)
990 if (lex_is_number (lexer))
993 lex_error (lexer, _("expecting number"));
997 /* If the current token is a number in the closed range [MIN,MAX], does
998 nothing and returns true. Otherwise, reports an error and returns false.
999 If NAME is nonnull, then it is used in the error message.

   A bound is mentioned in the message only if the token violated it or it
   is not the extreme value (-DBL_MAX for MIN, DBL_MAX for MAX). */
1001 lex_force_num_range_closed (struct lexer *lexer, const char *name,
1002 double min, double max)
1004 bool is_number = lex_is_number (lexer);
1005 bool too_small = is_number && lex_number (lexer) < min;
1006 bool too_big = is_number && lex_number (lexer) > max;
1007 if (is_number && !too_small && !too_big)
1012 /* Weird, maybe a bug in the caller. Just report that we needed an
1015 lex_error (lexer, _("Number expected for %s."), name);
1017 lex_error (lexer, _("Number expected."));
1019 else if (min == max)
1022 lex_error (lexer, _("Expected %g for %s."), min, name);
1024 lex_error (lexer, _("Expected %g."), min);
1028 bool report_lower_bound = min > -DBL_MAX || too_small;
1029 bool report_upper_bound = max < DBL_MAX || too_big;
1031 if (report_lower_bound && report_upper_bound)
1035 _("Expected number between %g and %g for %s."),
1038 lex_error (lexer, _("Expected number between %g and %g."),
1041 else if (report_lower_bound)
1046 lex_error (lexer, _("Expected non-negative number for %s."),
1049 lex_error (lexer, _("Expected non-negative number."));
1054 lex_error (lexer, _("Expected number %g or greater for %s."),
1057 lex_error (lexer, _("Expected number %g or greater."), min);
1060 else if (report_upper_bound)
1064 _("Expected number less than or equal to %g for %s."),
1067 lex_error (lexer, _("Expected number less than or equal to %g."),
1073 lex_error (lexer, _("Number expected for %s."), name);
1075 lex_error (lexer, _("Number expected."));
1081 /* If the current token is a number in the half-open range [MIN,MAX), does
1082 nothing and returns true. Otherwise, reports an error and returns false.
1083 If NAME is nonnull, then it is used in the error message.

   A token equal to MAX is rejected as too big.  A bound is mentioned in the
   message only if the token violated it or it is not the extreme value
   (-DBL_MAX for MIN, DBL_MAX for MAX). */
1085 lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
1086 double min, double max)
1088 bool is_number = lex_is_number (lexer);
1089 bool too_small = is_number && lex_number (lexer) < min;
1090 bool too_big = is_number && lex_number (lexer) >= max;
1091 if (is_number && !too_small && !too_big)
1096 /* Weird, maybe a bug in the caller. Just report that we needed an
1099 lex_error (lexer, _("Number expected for %s."), name);
1101 lex_error (lexer, _("Number expected."));
1105 bool report_lower_bound = min > -DBL_MAX || too_small;
1106 bool report_upper_bound = max < DBL_MAX || too_big;
1108 if (report_lower_bound && report_upper_bound)
1111 lex_error (lexer, _("Expected number in [%g,%g) for %s."),
1114 lex_error (lexer, _("Expected number in [%g,%g)."),
1117 else if (report_lower_bound)
1122 lex_error (lexer, _("Expected non-negative number for %s."),
1125 lex_error (lexer, _("Expected non-negative number."));
1130 lex_error (lexer, _("Expected number %g or greater for %s."),
1133 lex_error (lexer, _("Expected number %g or greater."), min);
1136 else if (report_upper_bound)
1140 _("Expected number less than %g for %s."), max, name);
1142 lex_error (lexer, _("Expected number less than %g."), max);
1147 lex_error (lexer, _("Number expected for %s."), name);
1149 lex_error (lexer, _("Number expected."));
1155 /* If the current token is a number in the open range (MIN,MAX), does
1156 nothing and returns true. Otherwise, reports an error and returns false.
1157 If NAME is nonnull, then it is used in the error message.

   Tokens equal to MIN or to MAX are both rejected.  A bound is mentioned in
   the message only if the token violated it or it is not the extreme value
   (-DBL_MAX for MIN, DBL_MAX for MAX). */
1159 lex_force_num_range_open (struct lexer *lexer, const char *name,
1160 double min, double max)
1162 bool is_number = lex_is_number (lexer);
1163 bool too_small = is_number && lex_number (lexer) <= min;
1164 bool too_big = is_number && lex_number (lexer) >= max;
1165 if (is_number && !too_small && !too_big)
1170 /* Weird, maybe a bug in the caller. Just report that we needed an
1173 lex_error (lexer, _("Number expected for %s."), name);
1175 lex_error (lexer, _("Number expected."));
1179 bool report_lower_bound = min > -DBL_MAX || too_small;
1180 bool report_upper_bound = max < DBL_MAX || too_big;
1182 if (report_lower_bound && report_upper_bound)
1185 lex_error (lexer, _("Expected number in (%g,%g) for %s."),
1188 lex_error (lexer, _("Expected number in (%g,%g)."), min, max);
1190 else if (report_lower_bound)
1195 lex_error (lexer, _("Expected positive number for %s."), name);
1197 lex_error (lexer, _("Expected positive number."));
1202 lex_error (lexer, _("Expected number greater than %g for %s."),
1205 lex_error (lexer, _("Expected number greater than %g."), min);
1208 else if (report_upper_bound)
1211 lex_error (lexer, _("Expected number less than %g for %s."),
1214 lex_error (lexer, _("Expected number less than %g."), max);
1219 lex_error (lexer, _("Number expected for %s."), name);
1221 lex_error (lexer, _("Number expected."));
1227 /* If the current token is an identifier, does nothing and returns true.
1228 Otherwise, reports an error and returns false.  Only a T_ID token
   qualifies. */
1230 lex_force_id (struct lexer *lexer)
1232 if (lex_token (lexer) == T_ID)
1235 lex_error (lexer, _("expecting identifier"));
1239 /* Token accessors. */
1241 /* Returns the type of LEXER's current token.  Shorthand for
   lex_next_token (LEXER, 0). */
1243 lex_token (const struct lexer *lexer)
1245 return lex_next_token (lexer, 0);
1248 /* Returns the number in LEXER's current token.
1250 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1251 tokens this function will always return zero.
   Shorthand for lex_next_tokval (LEXER, 0). */
1253 lex_tokval (const struct lexer *lexer)
1255 return lex_next_tokval (lexer, 0);
1258 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1260 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1261 this function will always return NULL.
1263 The UTF-8 encoding of the returned string is correct for variable names and
1264 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1265 data_in() to use it in a "union value".
   Shorthand for lex_next_tokcstr (LEXER, 0). */
1267 lex_tokcstr (const struct lexer *lexer)
1269 return lex_next_tokcstr (lexer, 0);
1272 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1273 null-terminated (but the null terminator is not included in the returned
1274 substring's 'length').
1276 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1277 this function will always return NULL.
1279 The UTF-8 encoding of the returned string is correct for variable names and
1280 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1281 data_in() to use it in a "union value".
   Shorthand for lex_next_tokss (LEXER, 0). */
1283 lex_tokss (const struct lexer *lexer)
1285 return lex_next_tokss (lexer, 0);
1290 A value of 0 for N as an argument to any of these functions refers to the
1291 current token. Lookahead is limited to the current command. Any N greater
1292 than the number of tokens remaining in the current command will be treated
1293 as referring to a T_ENDCMD token. */
/* Returns the lex_token N tokens after the current one in LEXER's current
   source, as found by lex_source_next__().  The const qualifier on LEXER_
   is cast away before the source is looked up.
   NOTE(review): the branch structure is not visible here; presumably the
   static T_STOP token declared at the end is returned when LEXER has no
   source -- confirm. */
1295 static const struct lex_token *
1296 lex_next__ (const struct lexer *lexer_, int n)
1298 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1299 struct lex_source *src = lex_source__ (lexer);
1302 return lex_source_next__ (src, n);
1305 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
/* Returns the token at offset OFS among SRC's parse tokens.  While OFS is
   at or beyond the number of parsed tokens, more are requested with
   lex_source_get_parse(); the last parsed token is checked for T_STOP or
   T_ENDCMD before each request.
   NOTE(review): two conditions are not visible here.  A static T_ENDCMD
   token is returned in one early case (presumably negative OFS), and what
   is returned once the last token is T_STOP or T_ENDCMD cannot be seen --
   confirm both. */
1310 static const struct lex_token *
1311 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1313 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1317 static const struct lex_token endcmd_token
1318 = { .token = { .type = T_ENDCMD } };
1319 return &endcmd_token;
1322 while (ofs >= src->n_parse)
1324 if (src->n_parse > 0)
1326 const struct lex_token *t = src->parse[src->n_parse - 1];
1327 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1331 lex_source_get_parse (src);
1334 return src->parse[ofs];
/* Returns the token N tokens after the current one in SRC, that is, the
   token at offset N + SRC->parse_ofs as found by lex_source_ofs__(). */
1337 static const struct lex_token *
1338 lex_source_next__ (const struct lex_source *src, int n)
1340 return lex_source_ofs__ (src, n + src->parse_ofs);
1343 /* Returns the "struct token" of the token N after the current one in LEXER.
1344 The returned pointer can be invalidated by pretty much any succeeding call
1345 into the lexer, although the string pointer within the returned token is
1346 only invalidated by consuming the token (e.g. with lex_get()). */
1347 const struct token *
1348 lex_next (const struct lexer *lexer, int n)
1350 return &lex_next__ (lexer, n)->token;
1353 /* Returns the type of the token N after the current one in LEXER. */
1355 lex_next_token (const struct lexer *lexer, int n)
1357 return lex_next (lexer, n)->type;
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1370 /* Returns the null-terminated string in the token N after the current one, in
1373 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1374 this functions this function will always return NULL.
1376 The UTF-8 encoding of the returned string is correct for variable names and
1377 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1378 data_in() to use it in a "union value". */
1380 lex_next_tokcstr (const struct lexer *lexer, int n)
1382 return lex_next_tokss (lexer, n).string;
1385 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1386 The string is null-terminated (but the null terminator is not included in
1387 the returned substring's 'length').
1389 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1390 tokens this functions this function will always return NULL.
1392 The UTF-8 encoding of the returned string is correct for variable names and
1393 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1394 data_in() to use it in a "union value". */
1396 lex_next_tokss (const struct lexer *lexer, int n)
1398 return lex_next (lexer, n)->string;
1401 /* Returns the offset of the current token within the command being parsed in
1402 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1403 on. The return value is useful later for referring to this token in calls
1406 lex_ofs (const struct lexer *lexer)
1408 struct lex_source *src = lex_source__ (lexer);
1409 return src ? src->parse_ofs : 0;
1412 /* Returns the token within LEXER's current command with offset OFS. Use
1413 lex_ofs() to find out the offset of the current token. */
1414 const struct token *
1415 lex_ofs_token (const struct lexer *lexer_, int ofs)
1417 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1418 struct lex_source *src = lex_source__ (lexer);
1421 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1424 static const struct token stop_token = { .type = T_STOP };
/* Allocates and returns a new struct msg_location that spans tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in
   LEXER.  See lex_ofs() for an explanation of token offsets.

   The caller owns and must eventually free the returned object. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() takes positions relative to the current token. */
  int cur = lex_ofs (lexer);
  int n0 = ofs0 - cur;
  int n1 = ofs1 - cur;
  return lex_get_location (lexer, n0, n1);
}
1441 /* Returns a msg_point for the first character in the token with offset OFS,
1442 where offset 0 is the first token in the command currently being parsed, 1
1443 the second token, and so on. These are absolute offsets, not relative to
1444 the token currently being parsed within the command.
1446 Returns zeros for a T_STOP token.
1449 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1451 const struct lex_source *src = lex_source__ (lexer);
1453 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1454 : (struct msg_point) { 0, 0 });
1457 /* Returns a msg_point for the last character, inclusive, in the token with
1458 offset OFS, where offset 0 is the first token in the command currently being
1459 parsed, 1 the second token, and so on. These are absolute offsets, not
1460 relative to the token currently being parsed within the command.
1462 Returns zeros for a T_STOP token.
1464 Most of the time, a single token is wholly within a single line of syntax,
1465 so that the start and end point for a given offset have the same line
1466 number. There are two exceptions: a T_STRING token can be made up of
1467 multiple segments on adjacent lines connected with "+" punctuators, and a
1468 T_NEG_NUM token can consist of a "-" on one line followed by the number on
1472 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1474 const struct lex_source *src = lex_source__ (lexer);
1476 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1477 : (struct msg_point) { 0, 0 });
1480 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1481 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1482 are both zero, this requests the syntax for the current token.)
1484 The caller must eventually free the returned string (with free()). The
1485 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1486 that, for example, it may include comments, spaces, and new-lines if it
1487 spans multiple tokens. Macro expansion, however, has already been
1490 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1492 const struct lex_source *src = lex_source__ (lexer);
1494 ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
/* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
   inclusive.  (For example, if OFS0 and OFS1 are both zero, this requests the
   syntax for the first token in the current command.)

   The caller must eventually free the returned string (with free()).  The
   syntax is encoded in UTF-8 and in the original form supplied to the lexer so
   that, for example, it may include comments, spaces, and new-lines if it
   spans multiple tokens.  Macro expansion, however, has already been
   performed.

   Returns an empty string if LEXER has no source to read from. */
char *
lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (!src)
    return xstrdup ("");

  return lex_source_syntax__ (src, ofs0, ofs1);
}
1515 /* Returns true if the token N ahead of the current one was produced by macro
1516 expansion, false otherwise. */
1518 lex_next_is_from_macro (const struct lexer *lexer, int n)
1520 return lex_next__ (lexer, n)->macro_rep != NULL;
1524 lex_tokens_match (const struct token *actual, const struct token *expected)
1526 if (actual->type != expected->type)
1529 switch (actual->type)
1533 return actual->number == expected->number;
1536 return lex_id_match (expected->string, actual->string);
1539 return (actual->string.length == expected->string.length
1540 && !memcmp (actual->string.string, expected->string.string,
1541 actual->string.length));
1549 lex_at_phrase__ (struct lexer *lexer, const char *s)
1551 struct string_lexer slex;
1555 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1556 while (string_lexer_next (&slex, &token))
1558 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1559 token_uninit (&token);
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_matched = lex_at_phrase__ (lexer, s);
  return n_matched > 0;
}
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips it and returns true.  Otherwise, returns false and consumes nothing.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n = lex_at_phrase__ (lexer, s);
  if (n == 0)
    return false;

  lex_get_n (lexer, n);
  return true;
}
1593 /* Returns the 1-based line number of the source text at the byte OFFSET in
1596 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1599 size_t hi = src->n_lines;
1602 size_t mid = (lo + hi) / 2;
1603 if (mid + 1 >= src->n_lines)
1604 return src->n_lines;
1605 else if (offset >= src->lines[mid + 1])
1607 else if (offset < src->lines[mid])
1614 /* Returns the 1-based column number of the source text at the byte OFFSET in
1617 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1619 const char *newline = memrchr (src->buffer, '\n', offset);
1620 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1621 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1624 static struct msg_point
1625 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1627 return (struct msg_point) {
1628 .line = lex_source_ofs_to_line_number (src, offset),
1629 .column = lex_source_ofs_to_column_number (src, offset),
1633 static struct msg_point
1634 lex_token_start_point (const struct lex_source *src,
1635 const struct lex_token *token)
1637 return lex_source_ofs_to_point__ (src, token->token_pos);
1640 static struct msg_point
1641 lex_token_end_point (const struct lex_source *src,
1642 const struct lex_token *token)
1644 return lex_source_ofs_to_point__ (src, lex_token_end (token));
1647 static struct msg_location
1648 lex_token_location (const struct lex_source *src,
1649 const struct lex_token *t0,
1650 const struct lex_token *t1)
1652 return (struct msg_location) {
1653 .file_name = intern_new_if_nonnull (src->reader->file_name),
1654 .start = lex_token_start_point (src, t0),
1655 .end = lex_token_end_point (src, t1),
1659 static struct msg_location *
1660 lex_token_location_rw (const struct lex_source *src,
1661 const struct lex_token *t0,
1662 const struct lex_token *t1)
1664 struct msg_location location = lex_token_location (src, t0, t1);
1665 return msg_location_dup (&location);
/* Returns a copy, made with msg_location_dup(), of the location that spans
   the tokens with offsets OFS0 through OFS1 in the current command in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
{
  const struct lex_token *t0 = lex_source_ofs__ (src, ofs0);
  const struct lex_token *t1 = lex_source_ofs__ (src, ofs1);
  return lex_token_location_rw (src, t0, t1);
}
1676 /* Returns the name of the syntax file from which the current command is drawn.
1677 Returns NULL for a T_STOP token or if the command's source does not have
1680 There is no version of this function that takes an N argument because
1681 lookahead only works to the end of a command and any given command is always
1682 within a single syntax file. */
1684 lex_get_file_name (const struct lexer *lexer)
1686 struct lex_source *src = lex_source__ (lexer);
1687 return src == NULL ? NULL : src->reader->file_name;
1690 /* Returns a newly allocated msg_location for the syntax that represents tokens
1691 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1692 must eventually free the location (with msg_location_destroy()). */
1693 struct msg_location *
1694 lex_get_location (const struct lexer *lexer, int n0, int n1)
1696 struct msg_location *loc = xmalloc (sizeof *loc);
1697 *loc = (struct msg_location) {
1698 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1699 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1700 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1701 .src = lex_source__ (lexer),
1703 lex_source_ref (loc->src);
1708 lex_get_encoding (const struct lexer *lexer)
1710 struct lex_source *src = lex_source__ (lexer);
1711 return src == NULL ? NULL : src->reader->encoding;
1714 /* Returns the syntax mode for the syntax file from which the current drawn is
1715 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1716 does not have line numbers.
1718 There is no version of this function that takes an N argument because
1719 lookahead only works to the end of a command and any given command is always
1720 within a single syntax file. */
1722 lex_get_syntax_mode (const struct lexer *lexer)
1724 struct lex_source *src = lex_source__ (lexer);
1725 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1728 /* Returns the error mode for the syntax file from which the current drawn is
1729 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1730 source does not have line numbers.
1732 There is no version of this function that takes an N argument because
1733 lookahead only works to the end of a command and any given command is always
1734 within a single syntax file. */
1736 lex_get_error_mode (const struct lexer *lexer)
1738 struct lex_source *src = lex_source__ (lexer);
1739 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1742 /* If the source that LEXER is currently reading has error mode
1743 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1744 token to be read comes directly from whatever is next read from the stream.
1746 It makes sense to call this function after encountering an error in a
1747 command entered on the console, because usually the user would prefer not to
1748 have cascading errors. */
/* Implementation note: the reset rewinds the journal and segment positions to
   0, clears the suppress_next_newline flag, reinitializes the segmenter in
   the mode it already had, empties the 'pp' and 'merge' stages and the parse
   array, and finally pushes a fresh T_ENDCMD token with
   lex_source_push_endcmd__().  Nothing happens if LEXER has no source or if
   the source's error mode is not LEX_ERROR_TERMINAL. */
1750 lex_interactive_reset (struct lexer *lexer)
1752 struct lex_source *src = lex_source__ (lexer);
1753 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1756 src->journal_pos = src->seg_pos = 0;
1758 src->suppress_next_newline = false;
1759 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1761 lex_stage_clear (&src->pp);
1762 lex_stage_clear (&src->merge);
1763 lex_source_clear_parse (src);
1764 lex_source_push_endcmd__ (src);
1768 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1770 lex_discard_rest_of_command (struct lexer *lexer)
1772 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1776 /* Discards all lookahead tokens in LEXER, then discards all input sources
1777 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1778 runs out of input sources. */
1780 lex_discard_noninteractive (struct lexer *lexer)
1782 struct lex_source *src = lex_source__ (lexer);
1786 lex_stage_clear (&src->pp);
1787 lex_stage_clear (&src->merge);
1788 lex_source_clear_parse (src);
1790 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1791 src = lex_source__ (lexer))
1793 ll_remove (&src->ll);
1794 lex_source_unref (src);
1800 lex_source_expand__ (struct lex_source *src)
1802 if (src->length >= src->allocated)
1803 src->buffer = x2realloc (src->buffer, &src->allocated);
1807 lex_source_read__ (struct lex_source *src)
1811 lex_source_expand__ (src);
1813 size_t space = src->allocated - src->length;
1814 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1815 size_t n = src->reader->class->read (src->reader,
1816 &src->buffer[src->length],
1818 assert (n <= space);
1823 src->reader->eof = true;
1829 while (!memchr (&src->buffer[src->seg_pos], '\n',
1830 src->length - src->seg_pos));
1833 static struct lex_source *
1834 lex_source__ (const struct lexer *lexer)
1836 return (ll_is_empty (&lexer->sources) ? NULL
1837 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1840 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1841 OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are
1842 both zero, this requests the syntax for the first token in the current
1843 command.) The caller must eventually free the returned string (with
1844 free()). The syntax is encoded in UTF-8 and in the original form supplied
1845 to the lexer so that, for example, it may include comments, spaces, and
1846 new-lines if it spans multiple tokens. Macro expansion, however, has
1847 already been performed. */
1849 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1851 struct string s = DS_EMPTY_INITIALIZER;
1852 for (size_t i = ofs0; i <= ofs1; )
1854 /* Find [I,J) as the longest sequence of tokens not produced by macro
1855 expansion, or otherwise the longest sequence expanded from a single
1857 const struct lex_token *first = lex_source_ofs__ (src, i);
1859 for (j = i + 1; j <= ofs1; j++)
1861 const struct lex_token *cur = lex_source_ofs__ (src, j);
1862 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1863 || first->macro_rep != cur->macro_rep)
1866 const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1868 /* Now add the syntax for this sequence of tokens to SRC. */
1869 if (!ds_is_empty (&s))
1870 ds_put_byte (&s, ' ');
1871 if (!first->macro_rep)
1873 size_t start = first->token_pos;
1874 size_t end = last->token_pos + last->token_len;
1875 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1879 size_t start = first->ofs;
1880 size_t end = last->ofs + last->len;
1881 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1887 return ds_steal_cstr (&s);
1891 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
1893 for (int i = ofs0; i <= ofs1; i++)
1894 if (lex_source_ofs__ (src, i)->macro_rep)
1899 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1900 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1901 other tokens included in that range. The syntax is encoded in UTF-8 and in
1902 the original form supplied to the lexer so that, for example, it may include
1903 comments, spaces, and new-lines if it spans multiple tokens.
1905 Returns an empty string if the token range doesn't include a macro call.
1907 The caller must not modify or free the returned string. */
1908 static struct substring
1909 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
1911 if (!lex_source_contains_macro_call (src, ofs0, ofs1))
1914 const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
1915 const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
1916 size_t start = token0->token_pos;
1917 size_t end = token1->token_pos + token1->token_len;
1919 return ss_buffer (&src->buffer[start], end - start);
/* Composes a syntax error message about the tokens with offsets OFS0 through
   OFS1 in SRC's current command.

   The message begins with "Syntax error at end of command" if the token at
   OFS0 is T_ENDCMD.  Otherwise it begins with one of the other "Syntax
   error..." variants below, built from an ellipsized copy of the offending
   syntax and of the macro call (if any) that expanded to it.  The text
   formatted from FORMAT and ARGS follows after ": ", and a trailing '.' is
   added if the message does not already end in one.

   The result is stored in a newly allocated struct msg with category
   MSG_C_SYNTAX, severity MSG_S_ERROR, and a location obtained from
   lex_source_get_location(). */
1923 lex_source_error_valist (struct lex_source *src, int ofs0, int ofs1,
1924 const char *format, va_list args)
1926 const struct lex_token *token;
1931 token = lex_source_ofs__ (src, ofs0);
1932 if (token->token.type == T_ENDCMD)
1933 ds_put_cstr (&s, _("Syntax error at end of command"));
1936 /* Get the syntax that caused the error. */
1937 char *raw_syntax = lex_source_syntax__ (src, ofs0, ofs1);
1939 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1942 /* Get the macro call(s) that expanded to the syntax that caused the
1945 str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
1952 _("Syntax error at `%s' (in expansion of `%s')"),
1955 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1960 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1963 ds_put_cstr (&s, _("Syntax error"));
1969 ds_put_cstr (&s, ": ");
1970 ds_put_vformat (&s, format, args);
1972 if (ds_last (&s) != '.')
1973 ds_put_byte (&s, '.');
1975 struct msg *m = xmalloc (sizeof *m);
1977 .category = MSG_C_SYNTAX,
1978 .severity = MSG_S_ERROR,
1979 .location = lex_source_get_location (src, ofs0, ofs1),
1980 .text = ds_steal_cstr (&s),
/* Composes a syntax error message for TOKEN, a token in SRC that failed to
   tokenize.  The message quotes an ellipsized copy of TOKEN's source text,
   followed by ": " and TOKEN's string, which is used here as the error
   description.  The result is stored in a newly allocated struct msg with
   category MSG_C_SYNTAX, severity MSG_S_ERROR, and a location that spans just
   TOKEN. */
1986 lex_get_error (struct lex_source *src, const struct lex_token *token)
1989 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1990 syntax, sizeof syntax);
1992 struct string s = DS_EMPTY_INITIALIZER;
1993 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1994 ds_put_format (&s, ": %s", token->token.string.string);
1996 struct msg *m = xmalloc (sizeof *m);
1998 .category = MSG_C_SYNTAX,
1999 .severity = MSG_S_ERROR,
2000 .location = lex_token_location_rw (src, token, token),
2001 .text = ds_steal_cstr (&s),
2006 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2007 underlying lex_reader if necessary. Returns true if a new token was added
2008 to SRC's deque, false otherwise. The caller should retry failures unless
2009 SRC's 'eof' marker was set to true indicating that there will be no more
2010 tokens from this source. */
/* Outline of the steps below:

   1. Allocate a lex_token, initially of type T_STOP, positioned at SRC's
      current segment position.

   2. Push buffered input through the segmenter, calling lex_source_read__()
      for more input whenever the segmenter cannot yet produce a segment.

   3. Advance seg_pos past the segment.  After a SEG_NEWLINE segment, record
      the start offset of the following line in SRC->lines.

   4. Convert the segment into a token with token_from_segment().

   5. Submit each completed line of syntax to the output engine as a
      TEXT_ITEM_SYNTAX item, without its end-of-line, and advance journal_pos
      past it.

   6. Dispatch on the tokenization result: TOKENIZE_ERROR reports the problem
      through lex_get_error(), TOKENIZE_EMPTY destroys the unused token, and
      TOKENIZE_TOKEN pushes the token onto 'pp', first changing a T_STOP token
      into T_ENDCMD. */
2012 lex_source_try_get_pp (struct lex_source *src)
2014 /* Append a new token to SRC and initialize it. */
2015 struct lex_token *token = xmalloc (sizeof *token);
2016 token->token = (struct token) { .type = T_STOP };
2017 token->macro_rep = NULL;
2018 token->ref_cnt = NULL;
2019 token->token_pos = src->seg_pos;
2021 /* Extract a segment. */
2022 const char *segment;
2023 enum segment_type seg_type;
2027 segment = &src->buffer[src->seg_pos];
2028 seg_len = segmenter_push (&src->segmenter, segment,
2029 src->length - src->seg_pos,
2030 src->reader->eof, &seg_type);
2034 /* The segmenter needs more input to produce a segment. */
2035 assert (!src->reader->eof);
2036 lex_source_read__ (src);
2039 /* Update state based on the segment. */
2040 token->token_len = seg_len;
2041 src->seg_pos += seg_len;
2042 if (seg_type == SEG_NEWLINE)
2044 if (src->n_lines >= src->allocated_lines)
2045 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2046 sizeof *src->lines);
2047 src->lines[src->n_lines++] = src->seg_pos;
2050 /* Get a token from the segment. */
2051 enum tokenize_result result = token_from_segment (
2052 seg_type, ss_buffer (segment, seg_len), &token->token);
2054 /* If we've reached the end of a line, or the end of a command, then pass
2055 the line to the output engine as a syntax text item. */
2056 int n_lines = seg_type == SEG_NEWLINE;
2057 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2060 src->suppress_next_newline = true;
2062 else if (n_lines > 0 && src->suppress_next_newline)
2065 src->suppress_next_newline = false;
2067 for (int i = 0; i < n_lines; i++)
2069 /* Beginning of line. */
2070 const char *line = &src->buffer[src->journal_pos];
2072 /* Calculate line length, including \n or \r\n end-of-line if present.
2074 We use src->length even though that may be beyond what we've actually
2075 converted to tokens. That's because, if we're emitting the line due
2076 to SEG_END_COMMAND, we want to take the whole line through the
2077 newline, not just through the '.'. */
2078 size_t max_len = src->length - src->journal_pos;
2079 const char *newline = memchr (line, '\n', max_len);
2080 size_t line_len = newline ? newline - line + 1 : max_len;
2082 /* Calculate line length excluding end-of-line. */
2083 size_t copy_len = line_len;
2084 if (copy_len > 0 && line[copy_len - 1] == '\n')
2086 if (copy_len > 0 && line[copy_len - 1] == '\r')
2089 /* Submit the line as syntax. */
2090 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2091 xmemdup0 (line, copy_len),
2094 src->journal_pos += line_len;
2099 case TOKENIZE_ERROR:
2100 lex_get_error (src, token);
2102 case TOKENIZE_EMPTY:
2103 lex_token_destroy (token);
2106 case TOKENIZE_TOKEN:
2107 if (token->token.type == T_STOP)
2109 token->token.type = T_ENDCMD;
2112 lex_stage_push_last (&src->pp, token);
2118 /* Attempts to append a new token to SRC. Returns true if successful, false on
2119 failure. On failure, the end of SRC has been reached and no more tokens
2120 will be forthcoming from it.
2122 Does not make the new token available for lookahead yet; the caller must
2123 adjust SRC's 'middle' pointer to do so. */
2125 lex_source_get_pp (struct lex_source *src)
2128 if (lex_source_try_get_pp (src))
/* Attempts to move tokens from the 'pp' stage of SRC_ to its 'merge' stage,
   expanding a macro call at the head of 'pp' if there is one.

   If 'pp' is empty, a token is first obtained with lex_source_get_pp().  When
   macro expansion is turned off (settings_get_mexpand() returns false), every
   token currently in 'pp' is shifted to 'merge' unchanged.

   Otherwise the tokens in 'pp' are offered one at a time to a macro_call.  If
   they turn out not to be a macro call, just the first token is shifted to
   'merge'.  If they are a call of N_CALL tokens, the call is expanded, the
   expansion is converted to syntax (and logged as "Macro Expansion" when
   settings_get_mprint() is true), each token of the expansion is appended to
   'merge' carrying the source position of the whole call and a pointer to the
   shared expansion syntax, and the tokens of the call are popped from 'pp'.

   After expanding a macro call, returns true only if the expansion produced
   at least one token. */
2134 lex_source_try_get_merge (const struct lex_source *src_)
2136 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2138 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2141 if (!settings_get_mexpand ())
2143 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2147 /* Now pass tokens one-by-one to the macro expander.
2149 In the common case where there is no macro to expand, the loop is not
2151 struct macro_call *mc;
2152 int n_call = macro_call_create (src->lexer->macros,
2153 &lex_stage_first (&src->pp)->token, &mc);
2154 for (int ofs = 1; !n_call; ofs++)
2156 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2158 /* This should not be reachable because we always get a T_ENDCMD at
2159 the end of an input file (transformed from T_STOP by
2160 lex_source_try_get_pp()) and the macro_expander should always
2161 terminate expansion on T_ENDCMD. */
2165 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2166 const struct macro_token mt = {
2168 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2170 const struct msg_location loc = lex_token_location (src, t, t);
2171 n_call = macro_call_add (mc, &mt, &loc);
2175 /* False alarm: no macro expansion after all. Use first token as
2176 lookahead. We'll retry macro expansion from the second token next
2178 macro_call_destroy (mc);
2179 lex_stage_shift (&src->merge, &src->pp, 1);
2183 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2184 are a macro call. (These are likely to be the only tokens in 'pp'.)
2186 const struct lex_token *c0 = lex_stage_first (&src->pp);
2187 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2188 struct macro_tokens expansion = { .n = 0 };
2189 struct msg_location loc = lex_token_location (src, c0, c1);
2190 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2191 macro_call_destroy (mc);
2193 /* Convert the macro expansion into syntax for possible error messages
2195 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2196 size_t *len = xnmalloc (expansion.n, sizeof *len);
2197 struct string s = DS_EMPTY_INITIALIZER;
2198 macro_tokens_to_syntax (&expansion, &s, ofs, len);
2200 if (settings_get_mprint ())
2201 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2202 _("Macro Expansion")));
2204 /* Append the macro expansion tokens to the lookahead. */
2205 if (expansion.n > 0)
2207 char *macro_rep = ds_steal_cstr (&s);
2208 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2209 *ref_cnt = expansion.n;
2210 for (size_t i = 0; i < expansion.n; i++)
2212 struct lex_token *token = xmalloc (sizeof *token);
2213 *token = (struct lex_token) {
2214 .token = expansion.mts[i].token,
2215 .token_pos = c0->token_pos,
2216 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2217 .macro_rep = macro_rep,
2222 lex_stage_push_last (&src->merge, token);
2224 ss_dealloc (&expansion.mts[i].syntax);
2229 free (expansion.mts);
2233 /* Destroy the tokens for the call. */
2234 for (size_t i = 0; i < n_call; i++)
2235 lex_stage_pop_first (&src->pp);
2237 return expansion.n > 0;
2240 /* Attempts to obtain at least one new token into 'merge' in SRC.
2242 Returns true if successful, false on failure. In the latter case, SRC is
2243 exhausted and 'src->eof' is now true. */
2245 lex_source_get_merge (struct lex_source *src)
2248 if (lex_source_try_get_merge (src))
2253 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2255 Returns true if successful, false on failure. In the latter case, SRC is
2256 exhausted and 'src->eof' is now true. */
/* Implementation note: the tokens in 'merge' are offered one at a time to a
   merger, with more tokens fetched through lex_source_get_merge() as needed.
   Depending on the value returned by merger_add(), either the first token in
   'merge' is moved to the parse array as is, or, for a positive return value,
   a new token is built that spans from the first merged token through the
   last one, that token is pushed onto the parse array, and the merged tokens
   are popped from 'merge'. */
2258 lex_source_get_parse (struct lex_source *src)
2260 struct merger m = MERGER_INIT;
2262 for (size_t i = 0; ; i++)
2264 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2266 /* We always get a T_ENDCMD at the end of an input file
2267 (transformed from T_STOP by lex_source_try_get_pp()) and
2268 merger_add() should never return -1 on T_ENDCMD. */
2269 assert (lex_stage_is_empty (&src->merge));
2273 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2277 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2280 else if (retval > 0)
2282 /* Add a token that merges all the tokens together. */
2283 const struct lex_token *first = lex_stage_first (&src->merge);
2284 const struct lex_token *last = lex_stage_nth (&src->merge,
2286 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2287 struct lex_token *t = xmalloc (sizeof *t);
2288 *t = (struct lex_token) {
2290 .token_pos = first->token_pos,
2291 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2293 /* This works well if all the tokens were not expanded from macros,
2294 or if they came from the same macro expansion. It just gives up
2295 in the other (corner) cases. */
2296 .macro_rep = macro ? first->macro_rep : NULL,
2297 .ofs = macro ? first->ofs : 0,
2298 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2299 .ref_cnt = macro ? first->ref_cnt : NULL,
2303 lex_source_push_parse (src, t);
2305 for (int i = 0; i < retval; i++)
2306 lex_stage_pop_first (&src->merge);
2313 lex_source_push_endcmd__ (struct lex_source *src)
2315 assert (src->n_parse == 0);
2317 struct lex_token *token = xmalloc (sizeof *token);
2318 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2319 lex_source_push_parse (src, token);
2323 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2325 if (src->n_parse >= src->allocated_parse)
2326 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2327 sizeof *src->parse);
2328 src->parse[src->n_parse++] = token;
2332 lex_source_clear_parse (struct lex_source *src)
2334 for (size_t i = 0; i < src->n_parse; i++)
2335 lex_token_destroy (src->parse[i]);
2336 src->n_parse = src->parse_ofs = 0;
/* Allocates and initializes a new lex_source that draws its input from READER,
   on behalf of LEXER.  The array of line start offsets begins with room for 4
   entries, and the segmenter is initialized with READER's syntax mode.  The
   new source starts out holding a single T_ENDCMD token, pushed by
   lex_source_push_endcmd__(). */
2339 static struct lex_source *
2340 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2342 size_t allocated_lines = 4;
2343 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2346 struct lex_source *src = xmalloc (sizeof *src);
2347 *src = (struct lex_source) {
2350 .segmenter = segmenter_init (reader->syntax, false),
2354 .allocated_lines = allocated_lines,
2357 lex_source_push_endcmd__ (src);
2363 lex_set_message_handler (struct lexer *lexer,
2364 void (*output_msg) (const struct msg *,
2367 struct msg_handler msg_handler = {
2368 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2370 .lex_source_ref = lex_source_ref,
2371 .lex_source_unref = lex_source_unref,
2372 .lex_source_get_line = lex_source_get_line,
2374 msg_set_handler (&msg_handler);
2378 lex_source_ref (const struct lex_source *src_)
2380 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2383 assert (src->n_refs > 0);
2389 lex_source_unref (struct lex_source *src)
2394 assert (src->n_refs > 0);
2395 if (--src->n_refs > 0)
2398 char *file_name = src->reader->file_name;
2399 char *encoding = src->reader->encoding;
2400 if (src->reader->class->destroy != NULL)
2401 src->reader->class->destroy (src->reader);
2406 lex_stage_uninit (&src->pp);
2407 lex_stage_uninit (&src->merge);
2408 lex_source_clear_parse (src);
2413 struct lex_file_reader
2415 struct lex_reader reader;
2416 struct u8_istream *istream;
2419 static struct lex_reader_class lex_file_reader_class;
2421 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2422 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2423 ENCODING, which should take one of the forms accepted by
2424 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2425 mode of the new reader, respectively.
2427 Returns a null pointer if FILE_NAME cannot be opened. */
/* NOTE(review): the return-type line, the braces, the error-path return and
   the final return statement are missing from this extract; the header
   comment above is the only visible statement of what is returned. */
2429 lex_reader_for_file (const char *file_name, const char *encoding,
2430 enum segmenter_mode syntax,
2431 enum lex_error_mode error)
2433 struct lex_file_reader *r;
2434 struct u8_istream *istream;
/* "-" wraps the already-open standard input descriptor; any other name is
   opened read-only as a file. */
2436 istream = (!strcmp(file_name, "-")
2437 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2438 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2439 if (istream == NULL)
/* The message uses errno, which is assumed to still hold the error left by
   the failed u8_istream_for_*() call. */
2441 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
/* From here on the stream is open; build the reader around it. */
2445 r = xmalloc (sizeof *r);
2446 lex_reader_init (&r->reader, &lex_file_reader_class);
2447 r->reader.syntax = syntax;
2448 r->reader.error = error;
/* The reader stores its own copies of FILE_NAME and ENCODING, so the caller's
   strings need not outlive this call.  ENCODING goes through
   xstrdup_if_nonnull(), so a null ENCODING is presumably stored as null. */
2449 r->reader.file_name = xstrdup (file_name);
2450 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* Line numbers are 1-based. */
2451 r->reader.line_number = 1;
2452 r->istream = istream;
2457 static struct lex_file_reader *
2458 lex_file_reader_cast (struct lex_reader *r)
2460 return UP_CAST (r, struct lex_file_reader, reader);
/* lex_file_read: read callback for file readers.  Asks the reader's
   u8_istream for up to N bytes, stored into BUF.  PROMPT_STYLE is not used
   (it is marked UNUSED).

   NOTE(review): the return-type line, the condition guarding the error
   message and the return statements are missing from this extract.
   Presumably the message is issued only when n_read is negative and the
   function then reports zero bytes, otherwise n_read -- confirm. */
2464 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2465 enum prompt_style prompt_style UNUSED)
2467 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* ssize_t rather than size_t so that a failure indication can be told apart
   from a byte count. */
2468 ssize_t n_read = u8_istream_read (r->istream, buf, n);
/* Read failure: report it against the reader's file name, using errno. */
2471 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* lex_file_close: close callback for file readers.  Releases the reader's
   u8_istream, treating a stream on standard input differently from one on a
   regular file.

   NOTE(review): the return-type line, braces, the line between the two
   branches (2487) and the end of the function are missing from this extract.
   Presumably 2487 is `else', so that a stdin-backed stream is only freed and
   never closed, and the reader itself is freed at the end -- confirm. */
2478 lex_file_close (struct lex_reader *r_)
2480 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* Only streams that are not on the standard input descriptor are closed. */
2482 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
/* A nonzero result from u8_istream_close() is reported as a close error. */
2484 if (u8_istream_close (r->istream) != 0)
2485 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2488 u8_istream_free (r->istream);
/* Definition of the reader class that lex_reader_for_file() installs in the
   readers it creates.
   NOTE(review): the initializer body is missing from this extract; presumably
   it lists lex_file_read and lex_file_close -- confirm. */
2493 static struct lex_reader_class lex_file_reader_class =
/* State for a lex_reader whose input is a string held in memory; instances
   are created by lex_reader_for_substring_nocopy().

   NOTE(review): two member declarations are missing from this extract.
   lex_string_read() uses `s' (with `.string' and `.length') and `offset', so
   those are presumably the hidden members -- confirm. */
2499 struct lex_string_reader
/* Generic reader part.  lex_string_reader_cast() converts a pointer to this
   member back into a pointer to the enclosing struct. */
2501 struct lex_reader reader;
/* Forward declaration: lex_reader_for_substring_nocopy() needs the class
   before its definition, which appears after the string reader's
   functions. */
2506 static struct lex_reader_class lex_string_reader_class;
2508 /* Creates and returns a new lex_reader for the contents of S, which must be
2509 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2510 with ss_dealloc() when it is closed. */
/* NOTE(review): the return-type line, the braces and the tail of the body are
   missing from this extract; presumably the hidden lines store S in the
   reader, reset its read offset and return the embedded lex_reader --
   confirm. */
2512 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2514 struct lex_string_reader *r;
2516 r = xmalloc (sizeof *r);
2517 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always use SEG_MODE_AUTO; unlike lex_reader_for_file(),
   there is no parameter for the syntax mode. */
2518 r->reader.syntax = SEG_MODE_AUTO;
/* The reader keeps its own copy of ENCODING (copied through
   xstrdup_if_nonnull(), so a null ENCODING is presumably stored as null). */
2519 r->reader.encoding = xstrdup_if_nonnull (encoding);
2526 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2527 which must be encoded in ENCODING. The caller retains ownership of S. */
/* NOTE(review): the return-type line and the braces are missing from this
   extract. */
2529 lex_reader_for_string (const char *s, const char *encoding)
2531 struct substring ss;
/* SS is filled in from S by ss_alloc_substring(); per the header comment it
   is a copy, which is why the caller keeps ownership of S. */
2532 ss_alloc_substring (&ss, ss_cstr (s));
/* SS is handed to the "nocopy" constructor, whose reader takes ownership of
   it. */
2533 return lex_reader_for_substring_nocopy (ss, encoding);
2536 /* Formats FORMAT as a printf()-like format string and creates and returns a
2537 new lex_reader for the formatted result. */
/* ENCODING is passed through to the new reader.  Because ENCODING is the last
   named parameter (see va_start below), the values consumed by FORMAT are
   supplied after ENCODING, not directly after FORMAT.

   NOTE(review): the return-type line, the declaration of `args', and the
   lines after the assignment to `r' are missing from this extract;
   presumably they hold va_end (args) and `return r;' -- confirm. */
2539 lex_reader_for_format (const char *format, const char *encoding, ...)
2541 struct lex_reader *r;
2544 va_start (args, encoding);
/* The string produced by xvasprintf() is wrapped with ss_cstr() and handed to
   the "nocopy" constructor, so the new reader takes ownership of it. */
2545 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
2551 static struct lex_string_reader *
2552 lex_string_reader_cast (struct lex_reader *r)
2554 return UP_CAST (r, struct lex_string_reader, reader);
/* lex_string_read: read callback for string readers.  Copies bytes from the
   reader's string, starting at its current offset, into BUF.  PROMPT_STYLE is
   not used (it is marked UNUSED).

   NOTE(review): the return-type line, the declaration of `chunk', and the
   lines after the memcpy are missing from this extract; presumably they
   advance r->offset by `chunk' and return `chunk' -- confirm. */
2558 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2559 enum prompt_style prompt_style UNUSED)
2561 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Copy at most N bytes, and never more than what remains of the string past
   the current offset. */
2564 chunk = MIN (n, r->s.length - r->offset);
2565 memcpy (buf, r->s.string + r->offset, chunk);
/* lex_string_close: close callback for string readers.

   NOTE(review): only the cast is visible in this extract; the cleanup
   statements are missing.  The comment on lex_reader_for_substring_nocopy()
   says the string is freed with ss_dealloc() on close, so presumably that
   call and the release of the reader itself are on the hidden lines --
   confirm. */
2572 lex_string_close (struct lex_reader *r_)
2574 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Definition of the reader class that lex_reader_for_substring_nocopy()
   installs in the readers it creates.
   NOTE(review): the initializer body is missing from this extract; presumably
   it lists lex_string_read and lex_string_close -- confirm. */
2580 static struct lex_reader_class lex_string_reader_class =
/* lex_source_get_line: returns the part of SRC's buffer that belongs to line
   number LINE, where lines are numbered from 1.  src->lines[] holds the
   buffer offset at which each line starts.

   NOTE(review): the return-type line, the braces and the statement executed
   for an out-of-range LINE are missing from this extract; presumably an empty
   substring is returned in that case -- confirm. */
2587 lex_source_get_line (const struct lex_source *src, int line)
/* Valid line numbers are 1 through src->n_lines inclusive. */
2589 if (line < 1 || line > src->n_lines)
/* LINE is 1-based and the offsets array is 0-based. */
2592 size_t ofs = src->lines[line - 1];
/* A line ends where the next one starts; the last line has no successor and
   ends at src->length instead. */
2593 size_t end = line >= src->n_lines ? src->length : src->lines[line];
/* The result points into SRC's buffer: ss_buffer() is given an address inside
   src->buffer and a byte count. */
2594 return ss_buffer (&src->buffer[ofs], end - ofs);