1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
83 static struct msg_point lex_token_start_point (const struct lex_source *,
84 const struct lex_token *);
85 static struct msg_point lex_token_end_point (const struct lex_source *,
86 const struct lex_token *);
88 static bool lex_ofs_at_phrase__ (struct lexer *, int ofs, const char *s,
91 /* Source offset of the last byte in TOKEN. */
93 lex_token_end (const struct lex_token *token)
95 return token->token_pos + MAX (token->token_len, 1) - 1;
/* Destroys T, uninitializing the regular token information it holds.

   NOTE(review): only part of the reference-count handling is visible here.
   The assertion shows that, where T shares a macro expansion, the count of
   lex_tokens referring to it must still be positive at this point; confirm
   the rest of the release logic against the full definition. */
99 lex_token_destroy (struct lex_token *t)
101 token_uninit (&t->token);
/* A count of zero here would mean the shared count was already exhausted. */
104 assert (*t->ref_cnt > 0);
114 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
119 struct lex_token **tokens;
122 static void lex_stage_clear (struct lex_stage *);
123 static void lex_stage_uninit (struct lex_stage *);
125 static size_t lex_stage_count (const struct lex_stage *);
126 static bool lex_stage_is_empty (const struct lex_stage *);
128 static struct lex_token *lex_stage_first (struct lex_stage *);
129 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
131 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
132 static void lex_stage_pop_first (struct lex_stage *);
134 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
137 /* Deletes all the tokens from STAGE. */
139 lex_stage_clear (struct lex_stage *stage)
141 while (!deque_is_empty (&stage->deque))
142 lex_stage_pop_first (stage);
145 /* Deletes all the tokens from STAGE and frees storage for the deque. */
147 lex_stage_uninit (struct lex_stage *stage)
149 lex_stage_clear (stage);
150 free (stage->tokens);
153 /* Returns true if STAGE contains no tokens, otherwise false. */
155 lex_stage_is_empty (const struct lex_stage *stage)
157 return deque_is_empty (&stage->deque);
160 /* Returns the number of tokens in STAGE. */
162 lex_stage_count (const struct lex_stage *stage)
164 return deque_count (&stage->deque);
167 /* Returns the first token in STAGE, which must be nonempty.
168 The first token is the one accessed with the least lookahead. */
169 static struct lex_token *
170 lex_stage_first (struct lex_stage *stage)
172 return lex_stage_nth (stage, 0);
175 /* Returns the token at the given INDEX in STAGE. The first token (with the
176 least lookahead) is 0, the second token is 1, and so on. There must be at
177 least INDEX + 1 tokens in STAGE. */
178 static struct lex_token *
179 lex_stage_nth (struct lex_stage *stage, size_t index)
181 return stage->tokens[deque_back (&stage->deque, index)];
184 /* Adds TOKEN so that it becomes the last token in STAGE. */
186 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
188 if (deque_is_full (&stage->deque))
189 stage->tokens = deque_expand (&stage->deque, stage->tokens,
190 sizeof *stage->tokens);
191 stage->tokens[deque_push_front (&stage->deque)] = token;
194 /* Removes and returns the first token from STAGE. */
195 static struct lex_token *
196 lex_stage_take_first (struct lex_stage *stage)
198 return stage->tokens[deque_pop_back (&stage->deque)];
201 /* Removes the first token from STAGE and uninitializes it. */
203 lex_stage_pop_first (struct lex_stage *stage)
205 lex_token_destroy (lex_stage_take_first (stage));
208 /* Removes the first N tokens from SRC, appending them to DST as the last
211 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
213 for (size_t i = 0; i < n; i++)
214 lex_stage_push_last (dst, lex_stage_take_first (src));
217 /* A source of tokens, corresponding to a syntax file.
219 This is conceptually a lex_reader wrapped with everything needed to convert
220 its UTF-8 bytes into tokens. */
223 struct ll ll; /* In lexer's list of sources. */
227 - One for struct lexer.
229 - One for each struct msg_location that references this source. */
232 struct lex_reader *reader;
234 struct segmenter segmenter;
235 bool eof; /* True if T_STOP was read from 'reader'. */
237 /* Buffer of UTF-8 bytes. */
238 char *buffer; /* Source file contents. */
239 size_t length; /* Number of bytes filled. */
240 size_t allocated; /* Number of bytes allocated. */
242 /* Offsets into 'buffer'. */
243 size_t journal_pos; /* First byte not yet output to journal. */
244 size_t seg_pos; /* First byte not yet scanned as token. */
246 /* Offset into 'buffer' of starts of lines. */
248 size_t n_lines, allocated_lines;
250 bool suppress_next_newline;
254 This is a pipeline with the following stages. Each token eventually
255 made available to the parser passes through each of these stages. The stages
256 are named after the processing that happens in each one.
258 Initially, tokens come from the segmenter and scanner to 'pp':
260 - pp: Tokens that need to pass through the macro preprocessor to end up
263 - merge: Tokens that need to pass through scan_merge() to end up in
266 - parse: Tokens available to the client for parsing.
268 'pp' and 'merge' store tokens only temporarily until they pass into
269 'parse'. Tokens then live in 'parse' until the command is fully
270 consumed, at which time they are freed together. */
272 struct lex_stage merge;
273 struct lex_token **parse;
274 size_t n_parse, allocated_parse, parse_ofs;
277 static struct lex_source *lex_source_create (struct lexer *,
278 struct lex_reader *);
283 struct ll_list sources; /* Contains "struct lex_source"s. */
284 struct macro_set *macros;
287 static struct lex_source *lex_source__ (const struct lexer *);
288 static char *lex_source_syntax__ (const struct lex_source *,
290 static const struct lex_token *lex_next__ (const struct lexer *, int n);
291 static void lex_source_push_endcmd__ (struct lex_source *);
292 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
293 static void lex_source_clear_parse (struct lex_source *);
295 static bool lex_source_get_parse (struct lex_source *);
296 static void lex_source_msg_valist (struct lex_source *, enum msg_class,
298 const char *format, va_list)
299 PRINTF_FORMAT (5, 0);
300 static const struct lex_token *lex_source_next__ (const struct lex_source *,
303 /* Initializes READER with the specified CLASS and otherwise some reasonable
304 defaults. The caller should fill in the other members as desired. */
306 lex_reader_init (struct lex_reader *reader,
307 const struct lex_reader_class *class)
309 reader->class = class;
310 reader->syntax = SEG_MODE_AUTO;
311 reader->error = LEX_ERROR_CONTINUE;
312 reader->file_name = NULL;
313 reader->encoding = NULL;
314 reader->line_number = 0;
318 /* Frees any file name already in READER and replaces it by a copy of
319 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
321 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
323 free (reader->file_name);
324 reader->file_name = xstrdup_if_nonnull (file_name);
327 /* Creates and returns a new lexer. */
331 struct lexer *lexer = xmalloc (sizeof *lexer);
332 *lexer = (struct lexer) {
333 .sources = LL_INITIALIZER (lexer->sources),
334 .macros = macro_set_create (),
339 /* Destroys LEXER. */
341 lex_destroy (struct lexer *lexer)
345 struct lex_source *source, *next;
347 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
349 ll_remove (&source->ll);
350 lex_source_unref (source);
352 macro_set_destroy (lexer->macros);
357 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
358 same name. Takes ownership of M. */
360 lex_define_macro (struct lexer *lexer, struct macro *m)
362 macro_set_add (lexer->macros, m);
365 /* Returns LEXER's macro set. The caller should not modify it. */
366 const struct macro_set *
367 lex_get_macros (const struct lexer *lexer)
369 return lexer->macros;
372 /* Inserts READER into LEXER so that the next token read by LEXER comes from
373 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
376 lex_include (struct lexer *lexer, struct lex_reader *reader)
378 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
379 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
382 /* Appends READER to LEXER, so that it will be read after all other current
383 readers have already been read. */
385 lex_append (struct lexer *lexer, struct lex_reader *reader)
387 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
392 /* Advances LEXER to the next token, consuming the current token. */
394 lex_get (struct lexer *lexer)
396 struct lex_source *src;
398 src = lex_source__ (lexer);
/* A current token exists only while the parse position has not reached the
   end of the tokens in the parse stage. */
402 if (src->parse_ofs < src->n_parse)
/* The current token is T_ENDCMD, so the parsed tokens are cleared. */
404 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
405 lex_source_clear_parse (src);
/* Refill: while no unconsumed token is available, ask the source for more.
   When lex_source_get_parse() reports failure, the source is unlinked from
   the lexer's list, unreferenced, and the lexer's current source is looked
   up again. */
410 while (src->parse_ofs == src->n_parse)
411 if (!lex_source_get_parse (src))
413 ll_remove (&src->ll);
414 lex_source_unref (src);
415 src = lex_source__ (lexer);
421 /* Advances LEXER by N tokens. */
423 lex_get_n (struct lexer *lexer, size_t n)
429 /* Issuing errors. */
431 /* Prints a syntax error message containing the current token and
432 given message MESSAGE (if non-null). */
434 lex_error (struct lexer *lexer, const char *format, ...)
438 va_start (args, format);
439 lex_ofs_msg_valist (lexer, SE, lex_ofs (lexer), lex_ofs (lexer),
444 /* Prints a syntax error message for the span of tokens N0 through N1,
445 inclusive, from the current token in LEXER, adding message MESSAGE (if
448 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
452 va_start (args, format);
453 int ofs = lex_ofs (lexer);
454 lex_ofs_msg_valist (lexer, SE, n0 + ofs, n1 + ofs, format, args);
458 /* Prints a syntax error message for the span of tokens with offsets OFS0
459 through OFS1, inclusive, within the current command in LEXER, adding message
460 MESSAGE (if non-null). */
462 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
466 va_start (args, format);
467 lex_ofs_msg_valist (lexer, SE, ofs0, ofs1, format, args);
471 /* Prints a message of the given CLASS containing the current token and given
472 message MESSAGE (if non-null). */
474 lex_msg (struct lexer *lexer, enum msg_class class, const char *format, ...)
478 va_start (args, format);
479 lex_ofs_msg_valist (lexer, class, lex_ofs (lexer), lex_ofs (lexer),
484 /* Prints a message of the given CLASS for the span of tokens N0 through N1,
485 inclusive, from the current token in LEXER, adding message MESSAGE (if
488 lex_next_msg (struct lexer *lexer, enum msg_class class, int n0, int n1,
489 const char *format, ...)
493 va_start (args, format);
494 int ofs = lex_ofs (lexer);
495 lex_ofs_msg_valist (lexer, class, n0 + ofs, n1 + ofs, format, args);
499 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
500 through OFS1, inclusive, within the current command in LEXER, adding message
501 MESSAGE (if non-null). */
503 lex_ofs_msg (struct lexer *lexer, enum msg_class class, int ofs0, int ofs1,
504 const char *format, ...)
508 va_start (args, format);
509 lex_ofs_msg_valist (lexer, class, ofs0, ofs1, format, args);
513 /* Prints a syntax error message saying that one of the strings provided as
514 varargs, up to the first NULL, is expected. */
516 (lex_error_expecting) (struct lexer *lexer, ...)
520 va_start (args, lexer);
521 lex_error_expecting_valist (lexer, args);
525 /* Prints a syntax error message saying that one of the options provided in
526 ARGS, up to the first NULL, is expected. */
528 lex_error_expecting_valist (struct lexer *lexer, va_list args)
530 const char **options = NULL;
531 size_t allocated = 0;
536 const char *option = va_arg (args, const char *);
541 options = x2nrealloc (options, &allocated, sizeof *options);
542 options[n++] = option;
544 lex_error_expecting_array (lexer, options, n);
/* Prints a syntax error message saying that one of the N strings in OPTIONS
   is expected.

   Separate, fixed format strings cover one through eight options; beyond
   that, the options are joined with ", " into a single list.  (Presumably the
   fixed strings exist so that each message is a complete sentence for
   translators -- confirm.)  NOTE(review): the lex_error (lexer, NULL) call
   looks like the N == 0 case; the selecting condition is not visible here. */
549 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
554 lex_error (lexer, NULL);
558 lex_error (lexer, _("Syntax error expecting %s."), options[0]);
562 lex_error (lexer, _("Syntax error expecting %s or %s."),
563 options[0], options[1]);
567 lex_error (lexer, _("Syntax error expecting %s, %s, or %s."),
568 options[0], options[1], options[2]);
572 lex_error (lexer, _("Syntax error expecting %s, %s, %s, or %s."),
573 options[0], options[1], options[2], options[3]);
577 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, or %s."),
578 options[0], options[1], options[2], options[3], options[4]);
582 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, or %s."),
583 options[0], options[1], options[2], options[3], options[4],
588 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, "
590 options[0], options[1], options[2], options[3], options[4],
591 options[5], options[6]);
595 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, %s, "
597 options[0], options[1], options[2], options[3], options[4],
598 options[5], options[6], options[7]);
/* General case: build one string listing every option, separated by ", ". */
603 struct string s = DS_EMPTY_INITIALIZER;
604 for (size_t i = 0; i < n; i++)
607 ds_put_cstr (&s, ", ");
608 ds_put_cstr (&s, options[i]);
610 lex_error (lexer, _("Syntax error expecting one of the following: %s."),
618 /* Reports an error to the effect that subcommand SBC may only be specified
621 lex_sbc_only_once (struct lexer *lexer, const char *sbc)
623 int ofs = lex_ofs (lexer) - 1;
624 if (lex_ofs_token (lexer, ofs)->type == T_EQUALS)
627 /* lex_ofs_at_phrase__() handles subcommand names that are keywords, such as
629 if (lex_ofs_at_phrase__ (lexer, ofs, sbc, NULL))
630 lex_ofs_error (lexer, ofs, ofs,
631 _("Subcommand %s may only be specified once."), sbc);
633 msg (SE, _("Subcommand %s may only be specified once."), sbc);
636 /* Reports an error to the effect that subcommand SBC is missing.
638 This function reports the error via lex_ofs_error() rather than lex_error(),
639 because a missing subcommand can normally be detected only after the whole
640 command has been parsed, and so lex_error() would always report "Syntax
641 error at end of command", which does not help the user find the error. */
643 lex_sbc_missing (struct lexer *lexer, const char *sbc)
645 lex_ofs_error (lexer, 0, lex_max_ofs (lexer),
646 _("Required subcommand %s was not specified."), sbc);
649 /* Reports an error to the effect that specification SPEC may only be specified
650 once within subcommand SBC. */
652 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
654 lex_error (lexer, _("%s may only be specified once within subcommand %s."),
658 /* Reports an error to the effect that specification SPEC is missing within
661 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
663 lex_error (lexer, _("Required %s specification missing from %s subcommand."),
667 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
668 through OFS1, inclusive, within the current command in LEXER, adding message
669 MESSAGE (if non-null) with the given ARGS. */
671 lex_ofs_msg_valist (struct lexer *lexer, enum msg_class class,
672 int ofs0, int ofs1, const char *format, va_list args)
674 lex_source_msg_valist (lex_source__ (lexer), class, ofs0, ofs1, format, args);
677 /* Checks that we're at end of command.
678 If so, returns a successful command completion code.
679 If not, flags a syntax error and returns an error command
682 lex_end_of_command (struct lexer *lexer)
684 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
686 lex_error (lexer, _("Syntax error expecting end of command."));
693 /* Token testing functions. */
695 /* Returns true if the current token is a number. */
697 lex_is_number (const struct lexer *lexer)
699 return lex_next_is_number (lexer, 0);
702 /* Returns true if the current token is a string. */
704 lex_is_string (const struct lexer *lexer)
706 return lex_next_is_string (lexer, 0);
709 /* Returns the value of the current token, which must be a
710 floating point number. */
712 lex_number (const struct lexer *lexer)
714 return lex_next_number (lexer, 0);
717 /* Returns true iff the current token is an integer. */
719 lex_is_integer (const struct lexer *lexer)
721 return lex_next_is_integer (lexer, 0);
724 /* Returns the value of the current token, which must be an
727 lex_integer (const struct lexer *lexer)
729 return lex_next_integer (lexer, 0);
732 /* Token testing functions with lookahead.
734 A value of 0 for N as an argument to any of these functions refers to the
735 current token. Lookahead is limited to the current command. Any N greater
736 than the number of tokens remaining in the current command will be treated
737 as referring to a T_ENDCMD token. */
739 /* Returns true if the token N ahead of the current token is a number. */
741 lex_next_is_number (const struct lexer *lexer, int n)
743 return token_is_number (lex_next (lexer, n));
746 /* Returns true if the token N ahead of the current token is a string. */
748 lex_next_is_string (const struct lexer *lexer, int n)
750 return token_is_string (lex_next (lexer, n));
753 /* Returns the value of the token N ahead of the current token, which must be a
754 floating point number. */
756 lex_next_number (const struct lexer *lexer, int n)
758 return token_number (lex_next (lexer, n));
761 /* Returns true if the token N ahead of the current token is an integer. */
763 lex_next_is_integer (const struct lexer *lexer, int n)
765 return token_is_integer (lex_next (lexer, n));
768 /* Returns the value of the token N ahead of the current token, which must be
771 lex_next_integer (const struct lexer *lexer, int n)
773 return token_integer (lex_next (lexer, n));
776 /* Token matching functions. */
778 /* If the current token has the specified TYPE, skips it and returns true.
779 Otherwise, returns false. */
781 lex_match (struct lexer *lexer, enum token_type type)
783 if (lex_token (lexer) == type)
792 /* If the current token matches IDENTIFIER, skips it and returns true.
793 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
796 IDENTIFIER must be an ASCII string. */
798 lex_match_id (struct lexer *lexer, const char *identifier)
800 return lex_match_id_n (lexer, identifier, 3);
803 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
804 may be abbreviated to its first N letters. Otherwise, returns false.
806 IDENTIFIER must be an ASCII string. */
808 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
810 if (lex_token (lexer) == T_ID
811 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
820 /* If the current token is integer X, skips it and returns true. Otherwise,
823 lex_match_int (struct lexer *lexer, int x)
825 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
834 /* Forced matches. */
836 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
837 abbreviated to its first 3 letters. Otherwise, reports an error and returns
840 IDENTIFIER must be an ASCII string. */
842 lex_force_match_id (struct lexer *lexer, const char *identifier)
844 if (lex_match_id (lexer, identifier))
848 lex_error_expecting (lexer, identifier);
853 /* If the current token has the specified TYPE, skips it and returns true.
854 Otherwise, reports an error and returns false. */
856 lex_force_match (struct lexer *lexer, enum token_type type)
858 if (lex_token (lexer) == type)
865 const char *type_string = token_type_to_string (type);
868 char *s = xasprintf ("`%s'", type_string);
869 lex_error_expecting (lexer, s);
873 lex_error_expecting (lexer, token_type_to_name (type));
879 /* If the current token is a string, does nothing and returns true.
880 Otherwise, reports an error and returns false. */
882 lex_force_string (struct lexer *lexer)
884 if (lex_is_string (lexer))
888 lex_error (lexer, _("Syntax error expecting string."));
893 /* If the current token is a string or an identifier, does nothing and returns
894 true. Otherwise, reports an error and returns false.
896 This is meant for use in syntactic situations where we want to encourage the
897 user to supply a quoted string, but for compatibility we also accept
898 identifiers. (One example of such a situation is file names.) Therefore,
899 the error message issued when the current token is wrong only says that a
900 string is expected and doesn't mention that an identifier would also be
903 lex_force_string_or_id (struct lexer *lexer)
905 return lex_token (lexer) == T_ID || lex_force_string (lexer);
908 /* If the current token is an integer, does nothing and returns true.
909 Otherwise, reports an error and returns false. */
911 lex_force_int (struct lexer *lexer)
913 if (lex_is_integer (lexer))
917 lex_error (lexer, _("Syntax error expecting integer."));
922 /* If the current token is an integer in the range MIN...MAX (inclusive), does
923 nothing and returns true. Otherwise, reports an error and returns false.
924 If NAME is nonnull, then it is used in the error message. */
926 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
928 bool is_number = lex_is_number (lexer);
929 bool is_integer = lex_is_integer (lexer);
930 bool too_small = (is_integer ? lex_integer (lexer) < min
931 : is_number ? lex_number (lexer) < min
933 bool too_big = (is_integer ? lex_integer (lexer) > max
934 : is_number ? lex_number (lexer) > max
936 if (is_integer && !too_small && !too_big)
941 /* Weird, maybe a bug in the caller. Just report that we needed an
944 lex_error (lexer, _("Syntax error expecting integer for %s."), name);
946 lex_error (lexer, _("Syntax error expecting integer."));
951 lex_error (lexer, _("Syntax error expecting %ld for %s."), min, name);
953 lex_error (lexer, _("Syntax error expecting %ld."), min);
955 else if (min + 1 == max)
958 lex_error (lexer, _("Syntax error expecting %ld or %ld for %s."),
961 lex_error (lexer, _("Syntax error expecting %ld or %ld."),
966 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
967 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
969 if (report_lower_bound && report_upper_bound)
973 _("Syntax error expecting integer "
974 "between %ld and %ld for %s."),
977 lex_error (lexer, _("Syntax error expecting integer "
978 "between %ld and %ld."),
981 else if (report_lower_bound)
986 lex_error (lexer, _("Syntax error expecting "
987 "non-negative integer for %s."),
990 lex_error (lexer, _("Syntax error expecting "
991 "non-negative integer."));
996 lex_error (lexer, _("Syntax error expecting "
997 "positive integer for %s."),
1000 lex_error (lexer, _("Syntax error expecting "
1001 "positive integer."));
1006 lex_error (lexer, _("Syntax error expecting "
1007 "integer %ld or greater for %s."),
1010 lex_error (lexer, _("Syntax error expecting "
1011 "integer %ld or greater."), min);
1014 else if (report_upper_bound)
1018 _("Syntax error expecting integer less than or equal "
1022 lex_error (lexer, _("Syntax error expecting integer less than or "
1029 lex_error (lexer, _("Syntax error expecting integer for %s."),
1032 lex_error (lexer, _("Syntax error expecting integer."));
1038 /* If the current token is a number, does nothing and returns true.
1039 Otherwise, reports an error and returns false. */
1041 lex_force_num (struct lexer *lexer)
1043 if (lex_is_number (lexer))
1046 lex_error (lexer, _("Syntax error expecting number."));
1050 /* If the current token is a number in the closed range [MIN,MAX], does
1051 nothing and returns true. Otherwise, reports an error and returns false.
1052 If NAME is nonnull, then it is used in the error message. */
1054 lex_force_num_range_closed (struct lexer *lexer, const char *name,
1055 double min, double max)
1057 bool is_number = lex_is_number (lexer);
1058 bool too_small = is_number && lex_number (lexer) < min;
1059 bool too_big = is_number && lex_number (lexer) > max;
1060 if (is_number && !too_small && !too_big)
1065 /* Weird, maybe a bug in the caller. Just report that we needed an
1068 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1070 lex_error (lexer, _("Syntax error expecting number."));
1072 else if (min == max)
1075 lex_error (lexer, _("Syntax error expecting number %g for %s."),
1078 lex_error (lexer, _("Syntax error expecting number %g."), min);
1082 bool report_lower_bound = min > -DBL_MAX || too_small;
1083 bool report_upper_bound = max < DBL_MAX || too_big;
1085 if (report_lower_bound && report_upper_bound)
1089 _("Syntax error expecting number "
1090 "between %g and %g for %s."),
1093 lex_error (lexer, _("Syntax error expecting number "
1094 "between %g and %g."),
1097 else if (report_lower_bound)
1102 lex_error (lexer, _("Syntax error expecting "
1103 "non-negative number for %s."),
1106 lex_error (lexer, _("Syntax error expecting "
1107 "non-negative number."));
1112 lex_error (lexer, _("Syntax error expecting number "
1113 "%g or greater for %s."),
1116 lex_error (lexer, _("Syntax error expecting number "
1117 "%g or greater."), min);
1120 else if (report_upper_bound)
1124 _("Syntax error expecting number "
1125 "less than or equal to %g for %s."),
1128 lex_error (lexer, _("Syntax error expecting number "
1129 "less than or equal to %g."),
1135 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1137 lex_error (lexer, _("Syntax error expecting number."));
1143 /* If the current token is a number in the half-open range [MIN,MAX), does
1144 nothing and returns true. Otherwise, reports an error and returns false.
1145 If NAME is nonnull, then it is used in the error message. */
1147 lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
1148 double min, double max)
1150 bool is_number = lex_is_number (lexer);
1151 bool too_small = is_number && lex_number (lexer) < min;
1152 bool too_big = is_number && lex_number (lexer) >= max;
1153 if (is_number && !too_small && !too_big)
1158 /* Weird, maybe a bug in the caller. Just report that we needed an
1161 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1163 lex_error (lexer, _("Syntax error expecting number."));
1167 bool report_lower_bound = min > -DBL_MAX || too_small;
1168 bool report_upper_bound = max < DBL_MAX || too_big;
1170 if (report_lower_bound && report_upper_bound)
1173 lex_error (lexer, _("Syntax error expecting number "
1174 "in [%g,%g) for %s."),
1177 lex_error (lexer, _("Syntax error expecting number in [%g,%g)."),
1180 else if (report_lower_bound)
1185 lex_error (lexer, _("Syntax error expecting "
1186 "non-negative number for %s."),
1189 lex_error (lexer, _("Syntax error expecting "
1190 "non-negative number."));
1195 lex_error (lexer, _("Syntax error expecting "
1196 "number %g or greater for %s."),
1199 lex_error (lexer, _("Syntax error expecting "
1200 "number %g or greater."), min);
1203 else if (report_upper_bound)
1207 _("Syntax error expecting "
1208 "number less than %g for %s."), max, name);
1210 lex_error (lexer, _("Syntax error expecting "
1211 "number less than %g."), max);
1216 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1218 lex_error (lexer, _("Syntax error expecting number."));
1224 /* If the current token is a number in the open range (MIN,MAX), does
1225 nothing and returns true. Otherwise, reports an error and returns false.
1226 If NAME is nonnull, then it is used in the error message. */
1228 lex_force_num_range_open (struct lexer *lexer, const char *name,
1229 double min, double max)
1231 bool is_number = lex_is_number (lexer);
1232 bool too_small = is_number && lex_number (lexer) <= min;
1233 bool too_big = is_number && lex_number (lexer) >= max;
1234 if (is_number && !too_small && !too_big)
1239 /* Weird, maybe a bug in the caller. Just report that we needed an
1242 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1244 lex_error (lexer, _("Syntax error expecting number."));
1248 bool report_lower_bound = min > -DBL_MAX || too_small;
1249 bool report_upper_bound = max < DBL_MAX || too_big;
1251 if (report_lower_bound && report_upper_bound)
1254 lex_error (lexer, _("Syntax error expecting number "
1255 "in (%g,%g) for %s."),
1258 lex_error (lexer, _("Syntax error expecting number "
1259 "in (%g,%g)."), min, max);
1261 else if (report_lower_bound)
1266 lex_error (lexer, _("Syntax error expecting "
1267 "positive number for %s."), name);
1269 lex_error (lexer, _("Syntax error expecting "
1270 "positive number."));
1275 lex_error (lexer, _("Syntax error expecting number "
1276 "greater than %g for %s."),
1279 lex_error (lexer, _("Syntax error expecting number "
1280 "greater than %g."), min);
1283 else if (report_upper_bound)
1286 lex_error (lexer, _("Syntax error expecting number "
1287 "less than %g for %s."),
1290 lex_error (lexer, _("Syntax error expecting number "
1291 "less than %g."), max);
1296 lex_error (lexer, _("Syntax error expecting number "
1299 lex_error (lexer, _("Syntax error expecting number."));
1305 /* If the current token is an identifier, does nothing and returns true.
1306 Otherwise, reports an error and returns false. */
1308 lex_force_id (struct lexer *lexer)
1310 if (lex_token (lexer) == T_ID)
1313 lex_error (lexer, _("Syntax error expecting identifier."));
1317 /* Token accessors. */
1319 /* Returns the type of LEXER's current token. */
1321 lex_token (const struct lexer *lexer)
1323 return lex_next_token (lexer, 0);
1326 /* Returns the number in LEXER's current token.
1328 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1329 tokens this function will always return zero. */
1331 lex_tokval (const struct lexer *lexer)
1333 return lex_next_tokval (lexer, 0);
1336 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1338 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1339 this function will always return NULL.
1341 The UTF-8 encoding of the returned string is correct for variable names and
1342 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1343 data_in() to use it in a "union value". */
1345 lex_tokcstr (const struct lexer *lexer)
1347 return lex_next_tokcstr (lexer, 0);
1350 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1351 null-terminated (but the null terminator is not included in the returned
1352 substring's 'length').
1354 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1355 this function will always return NULL.
1357 The UTF-8 encoding of the returned string is correct for variable names and
1358 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1359 data_in() to use it in a "union value". */
1361 lex_tokss (const struct lexer *lexer)
1363 return lex_next_tokss (lexer, 0);
1368 A value of 0 for N as an argument to any of these functions refers to the
1369 current token. Lookahead is limited to the current command. Any N greater
1370 than the number of tokens remaining in the current command will be treated
1371 as referring to a T_ENDCMD token. */
1373 static const struct lex_token *
1374 lex_next__ (const struct lexer *lexer_, int n)
1376 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1377 struct lex_source *src = lex_source__ (lexer);
1380 return lex_source_next__ (src, n);
1383 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1388 static const struct lex_token *
1389 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1391 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1395 static const struct lex_token endcmd_token
1396 = { .token = { .type = T_ENDCMD } };
1397 return &endcmd_token;
1400 while (ofs >= src->n_parse)
1402 if (src->n_parse > 0)
1404 const struct lex_token *t = src->parse[src->n_parse - 1];
1405 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1409 lex_source_get_parse (src);
1412 return src->parse[ofs];
1415 static const struct lex_token *
1416 lex_source_next__ (const struct lex_source *src, int n)
1418 return lex_source_ofs__ (src, n + src->parse_ofs);
1421 /* Returns the "struct token" of the token N after the current one in LEXER.
1422 The returned pointer can be invalidated by pretty much any succeeding call
1423 into the lexer, although the string pointer within the returned token is
1424 only invalidated by consuming the token (e.g. with lex_get()). */
1425 const struct token *
1426 lex_next (const struct lexer *lexer, int n)
1428 return &lex_next__ (lexer, n)->token;
1431 /* Returns the type of the token N after the current one in LEXER. */
1433 lex_next_token (const struct lexer *lexer, int n)
1435 return lex_next (lexer, n)->type;
1438 /* Returns the number in the token N after the current one in LEXER.
1440 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1441 tokens this function will always return zero. */
1443 lex_next_tokval (const struct lexer *lexer, int n)
1445 return token_number (lex_next (lexer, n));
1448 /* Returns the null-terminated string in the token N after the current one, in
1451 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1452 this function will always return NULL.
1454 The UTF-8 encoding of the returned string is correct for variable names and
1455 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1456 data_in() to use it in a "union value". */
1458 lex_next_tokcstr (const struct lexer *lexer, int n)
1460 return lex_next_tokss (lexer, n).string;
1463 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1464 The string is null-terminated (but the null terminator is not included in
1465 the returned substring's 'length').
1467 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1468 tokens this function will always return NULL.
1470 The UTF-8 encoding of the returned string is correct for variable names and
1471 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1472 data_in() to use it in a "union value". */
1474 lex_next_tokss (const struct lexer *lexer, int n)
1476 return lex_next (lexer, n)->string;
1479 /* Returns the offset of the current token within the command being parsed in
1480 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1481 on. The return value is useful later for referring to this token in calls
1484 lex_ofs (const struct lexer *lexer)
1486 struct lex_source *src = lex_source__ (lexer);
1487 return src ? src->parse_ofs : 0;
1490 /* Returns the offset of the last token in the current command. */
1492 lex_max_ofs (const struct lexer *lexer)
1494 struct lex_source *src = lex_source__ (lexer);
1498 int ofs = MAX (1, src->n_parse) - 1;
1501 enum token_type type = lex_source_ofs__ (src, ofs)->token.type;
1502 if (type == T_ENDCMD || type == T_STOP)
1509 /* Returns the token within LEXER's current command with offset OFS. Use
1510 lex_ofs() to find out the offset of the current token. */
1511 const struct token *
1512 lex_ofs_token (const struct lexer *lexer_, int ofs)
1514 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1515 struct lex_source *src = lex_source__ (lexer);
1518 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1521 static const struct token stop_token = { .type = T_STOP };
1526 /* Allocates and returns a new struct msg_location that spans tokens with
1527 offsets OFS0 through OFS1, inclusive, within the current command in
1528 LEXER. See lex_ofs() for an explanation of token offsets.
1530 The caller owns and must eventually free the returned object. */
1531 struct msg_location *
1532 lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
1534 int ofs = lex_ofs (lexer);
1535 return lex_get_location (lexer, ofs0 - ofs, ofs1 - ofs);
1538 /* Returns a msg_point for the first character in the token with offset OFS,
1539 where offset 0 is the first token in the command currently being parsed, 1
1540 the second token, and so on. These are absolute offsets, not relative to
1541 the token currently being parsed within the command.
1543 Returns zeros for a T_STOP token.
1546 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1548 const struct lex_source *src = lex_source__ (lexer);
1550 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1551 : (struct msg_point) { 0, 0 });
1554 /* Returns a msg_point for the last character, inclusive, in the token with
1555 offset OFS, where offset 0 is the first token in the command currently being
1556 parsed, 1 the second token, and so on. These are absolute offsets, not
1557 relative to the token currently being parsed within the command.
1559 Returns zeros for a T_STOP token.
1561 Most of the time, a single token is wholly within a single line of syntax,
1562 so that the start and end point for a given offset have the same line
1563 number. There are two exceptions: a T_STRING token can be made up of
1564 multiple segments on adjacent lines connected with "+" punctuators, and a
1565 T_NEG_NUM token can consist of a "-" on one line followed by the number on
1569 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1571 const struct lex_source *src = lex_source__ (lexer);
1573 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1574 : (struct msg_point) { 0, 0 });
1577 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1578 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1579 are both zero, this requests the syntax for the current token.)
1581 The caller must eventually free the returned string (with free()). The
1582 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1583 that, for example, it may include comments, spaces, and new-lines if it
1584 spans multiple tokens. Macro expansion, however, has already been
1587 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1589 const struct lex_source *src = lex_source__ (lexer);
1591 ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
1596 /* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
1597 inclusive. (For example, if OFS0 and OFS1 are both zero, this requests the
1598 syntax for the first token in the current command.)
1600 The caller must eventually free the returned string (with free()). The
1601 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1602 that, for example, it may include comments, spaces, and new-lines if it
1603 spans multiple tokens. Macro expansion, however, has already been
1606 lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
1608 const struct lex_source *src = lex_source__ (lexer);
1609 return src ? lex_source_syntax__ (src, ofs0, ofs1) : xstrdup ("");
1612 /* Returns true if the token N ahead of the current one was produced by macro
1613 expansion, false otherwise. */
1615 lex_next_is_from_macro (const struct lexer *lexer, int n)
1617 return lex_next__ (lexer, n)->macro_rep != NULL;
1621 lex_tokens_match (const struct token *actual, const struct token *expected)
1623 if (actual->type != expected->type)
1626 switch (actual->type)
1630 return actual->number == expected->number;
1633 return lex_id_match (expected->string, actual->string);
1636 return (actual->string.length == expected->string.length
1637 && !memcmp (actual->string.string, expected->string.string,
1638 actual->string.length));
1646 lex_ofs_at_phrase__ (struct lexer *lexer, int ofs, const char *s,
1649 struct string_lexer slex;
1652 size_t n_matched = 0;
1653 bool all_matched = true;
1654 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1655 while (string_lexer_next (&slex, &token))
1657 bool match = lex_tokens_match (lex_ofs_token (lexer, ofs + n_matched),
1659 token_uninit (&token);
1662 all_matched = false;
1668 *n_matchedp = n_matched;
1672 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1673 returns true. Otherwise, returns false.
1675 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1676 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1677 first three letters. */
1679 lex_at_phrase (struct lexer *lexer, const char *s)
1681 return lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, NULL);
1684 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1685 skips it and returns true. Otherwise, returns false.
1687 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1688 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1689 first three letters. */
1691 lex_match_phrase (struct lexer *lexer, const char *s)
1694 if (!lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, &n_matched))
1696 lex_get_n (lexer, n_matched);
1700 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1701 skips it and returns true. Otherwise, issues an error and returns false.
1703 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1704 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1705 first three letters. */
1707 lex_force_match_phrase (struct lexer *lexer, const char *s)
1710 bool ok = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, &n_matched);
1712 lex_get_n (lexer, n_matched);
1714 lex_next_error (lexer, 0, n_matched, _("Syntax error expecting `%s'."), s);
1718 /* Returns the 1-based line number of the source text at the byte OFFSET in
1721 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1724 size_t hi = src->n_lines;
1727 size_t mid = (lo + hi) / 2;
1728 if (mid + 1 >= src->n_lines)
1729 return src->n_lines;
1730 else if (offset >= src->lines[mid + 1])
1732 else if (offset < src->lines[mid])
1739 /* Returns the 1-based column number of the source text at the byte OFFSET in
1742 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1744 const char *newline = memrchr (src->buffer, '\n', offset);
1745 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1746 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1749 static struct msg_point
1750 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1752 return (struct msg_point) {
1753 .line = lex_source_ofs_to_line_number (src, offset),
1754 .column = lex_source_ofs_to_column_number (src, offset),
1758 static struct msg_point
1759 lex_token_start_point (const struct lex_source *src,
1760 const struct lex_token *token)
1762 return lex_source_ofs_to_point__ (src, token->token_pos);
1765 static struct msg_point
1766 lex_token_end_point (const struct lex_source *src,
1767 const struct lex_token *token)
1769 return lex_source_ofs_to_point__ (src, lex_token_end (token));
1772 static struct msg_location
1773 lex_token_location (const struct lex_source *src,
1774 const struct lex_token *t0,
1775 const struct lex_token *t1)
1777 return (struct msg_location) {
1778 .file_name = intern_new_if_nonnull (src->reader->file_name),
1779 .start = lex_token_start_point (src, t0),
1780 .end = lex_token_end_point (src, t1),
1781 .src = CONST_CAST (struct lex_source *, src),
1785 static struct msg_location *
1786 lex_token_location_rw (const struct lex_source *src,
1787 const struct lex_token *t0,
1788 const struct lex_token *t1)
1790 struct msg_location location = lex_token_location (src, t0, t1);
1791 return msg_location_dup (&location);
1794 static struct msg_location *
1795 lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
1797 return lex_token_location_rw (src,
1798 lex_source_ofs__ (src, ofs0),
1799 lex_source_ofs__ (src, ofs1));
1802 /* Returns the name of the syntax file from which the current command is drawn.
1803 Returns NULL for a T_STOP token or if the command's source does not have
1806 There is no version of this function that takes an N argument because
1807 lookahead only works to the end of a command and any given command is always
1808 within a single syntax file. */
1810 lex_get_file_name (const struct lexer *lexer)
1812 struct lex_source *src = lex_source__ (lexer);
1813 return src == NULL ? NULL : src->reader->file_name;
1816 /* Returns a newly allocated msg_location for the syntax that represents tokens
1817 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1818 must eventually free the location (with msg_location_destroy()). */
1819 struct msg_location *
1820 lex_get_location (const struct lexer *lexer, int n0, int n1)
1822 struct msg_location *loc = xmalloc (sizeof *loc);
1823 *loc = (struct msg_location) {
1824 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1825 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1826 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1827 .src = lex_source__ (lexer),
1829 lex_source_ref (loc->src);
1834 lex_get_encoding (const struct lexer *lexer)
1836 struct lex_source *src = lex_source__ (lexer);
1837 return src == NULL ? NULL : src->reader->encoding;
1840 /* Returns the syntax mode for the syntax file from which the current command
1841 is drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's
1842 source does not have line numbers.
1844 There is no version of this function that takes an N argument because
1845 lookahead only works to the end of a command and any given command is always
1846 within a single syntax file. */
1848 lex_get_syntax_mode (const struct lexer *lexer)
1850 struct lex_source *src = lex_source__ (lexer);
1851 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1854 /* Returns the error mode for the syntax file from which the current command
1855 is drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1856 source does not have line numbers.
1858 There is no version of this function that takes an N argument because
1859 lookahead only works to the end of a command and any given command is always
1860 within a single syntax file. */
1862 lex_get_error_mode (const struct lexer *lexer)
1864 struct lex_source *src = lex_source__ (lexer);
1865 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1868 /* If the source that LEXER is currently reading has error mode
1869 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1870 token to be read comes directly from whatever is next read from the stream.
1872 It makes sense to call this function after encountering an error in a
1873 command entered on the console, because usually the user would prefer not to
1874 have cascading errors. */
1876 lex_interactive_reset (struct lexer *lexer)
1878 struct lex_source *src = lex_source__ (lexer);
1879 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1882 src->journal_pos = src->seg_pos = 0;
1884 src->suppress_next_newline = false;
1885 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1887 lex_stage_clear (&src->pp);
1888 lex_stage_clear (&src->merge);
1889 lex_source_clear_parse (src);
1890 lex_source_push_endcmd__ (src);
1894 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1896 lex_discard_rest_of_command (struct lexer *lexer)
1898 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1902 /* Discards all lookahead tokens in LEXER, then discards all input sources
1903 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1904 runs out of input sources. */
1906 lex_discard_noninteractive (struct lexer *lexer)
1908 struct lex_source *src = lex_source__ (lexer);
1911 if (src->reader->error == LEX_ERROR_IGNORE)
1914 lex_stage_clear (&src->pp);
1915 lex_stage_clear (&src->merge);
1916 lex_source_clear_parse (src);
1918 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1919 src = lex_source__ (lexer))
1921 ll_remove (&src->ll);
1922 lex_source_unref (src);
1928 lex_source_expand__ (struct lex_source *src)
1930 if (src->length >= src->allocated)
1931 src->buffer = x2realloc (src->buffer, &src->allocated);
1935 lex_source_read__ (struct lex_source *src)
1939 lex_source_expand__ (src);
1941 size_t space = src->allocated - src->length;
1942 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1943 size_t n = src->reader->class->read (src->reader,
1944 &src->buffer[src->length],
1946 assert (n <= space);
1951 src->reader->eof = true;
1957 while (!memchr (&src->buffer[src->seg_pos], '\n',
1958 src->length - src->seg_pos));
1961 static struct lex_source *
1962 lex_source__ (const struct lexer *lexer)
1964 return (ll_is_empty (&lexer->sources) ? NULL
1965 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1968 const struct lex_source *
1969 lex_source (const struct lexer *lexer)
1971 return lex_source__ (lexer);
1974 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1975 OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are
1976 both zero, this requests the syntax for the first token in the current
1977 command.) The caller must eventually free the returned string (with
1978 free()). The syntax is encoded in UTF-8 and in the original form supplied
1979 to the lexer so that, for example, it may include comments, spaces, and
1980 new-lines if it spans multiple tokens. Macro expansion, however, has
1981 already been performed. */
1983 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1985 struct string s = DS_EMPTY_INITIALIZER;
1986 for (size_t i = ofs0; i <= ofs1; )
1988 /* Find [I,J) as the longest sequence of tokens not produced by macro
1989 expansion, or otherwise the longest sequence expanded from a single
1991 const struct lex_token *first = lex_source_ofs__ (src, i);
1993 for (j = i + 1; j <= ofs1; j++)
1995 const struct lex_token *cur = lex_source_ofs__ (src, j);
1996 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1997 || first->macro_rep != cur->macro_rep)
2000 const struct lex_token *last = lex_source_ofs__ (src, j - 1);
2002 /* Now add the syntax for this sequence of tokens to S. */
2003 if (!ds_is_empty (&s))
2004 ds_put_byte (&s, ' ');
2005 if (!first->macro_rep)
2007 size_t start = first->token_pos;
2008 size_t end = last->token_pos + last->token_len;
2009 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
2013 size_t start = first->ofs;
2014 size_t end = last->ofs + last->len;
2015 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
2021 return ds_steal_cstr (&s);
2025 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
2027 for (int i = ofs0; i <= ofs1; i++)
2028 if (lex_source_ofs__ (src, i)->macro_rep)
2033 /* If tokens OFS0...OFS1 (inclusive) in SRC contain a macro call, this returns
2034 the raw UTF-8 syntax for the macro call (not for the expansion) and for any
2035 other tokens included in that range. The syntax is encoded in UTF-8 and in
2036 the original form supplied to the lexer so that, for example, it may include
2037 comments, spaces, and new-lines if it spans multiple tokens.
2039 Returns an empty string if the token range doesn't include a macro call.
2041 The caller must not modify or free the returned string. */
2042 static struct substring
2043 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
2045 if (!lex_source_contains_macro_call (src, ofs0, ofs1))
2048 const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
2049 const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
2050 size_t start = token0->token_pos;
2051 size_t end = token1->token_pos + token1->token_len;
2053 return ss_buffer (&src->buffer[start], end - start);
2057 lex_source_msg_valist (struct lex_source *src, enum msg_class class,
2058 int ofs0, int ofs1, const char *format, va_list args)
2060 struct string s = DS_EMPTY_INITIALIZER;
2064 /* Get the macro call(s) that expanded to the syntax that caused the
2067 str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
2070 ds_put_format (&s, _("In syntax expanded from `%s'"), call);
2073 ds_put_cstr (&s, _("At end of input"));
2075 if (!ds_is_empty (&s))
2076 ds_put_cstr (&s, ": ");
2078 ds_put_vformat (&s, format, args);
2080 ds_put_cstr (&s, _("Syntax error."));
2082 if (ds_last (&s) != '.')
2083 ds_put_byte (&s, '.');
2085 struct msg *m = xmalloc (sizeof *m);
2087 .category = msg_class_to_category (class),
2088 .severity = msg_class_to_severity (class),
2089 .location = src ? lex_source_get_location (src, ofs0, ofs1) : NULL,
2090 .text = ds_steal_cstr (&s),
2096 lex_get_error (struct lex_source *src, const struct lex_token *token)
2098 struct msg *m = xmalloc (sizeof *m);
2100 .category = MSG_C_SYNTAX,
2101 .severity = MSG_S_ERROR,
2102 .location = lex_token_location_rw (src, token, token),
2103 .text = ss_xstrdup (token->token.string),
2108 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2109 underlying lex_reader if necessary. Returns true if a new token was added
2110 to SRC's deque, false otherwise. The caller should retry failures unless
2111 SRC's 'eof' marker was set to true indicating that there will be no more
2112 tokens from this source. */
2114 lex_source_try_get_pp (struct lex_source *src)
2116 /* Append a new token to SRC and initialize it. */
2117 struct lex_token *token = xmalloc (sizeof *token);
2118 token->token = (struct token) { .type = T_STOP };
2119 token->macro_rep = NULL;
2120 token->ref_cnt = NULL;
2121 token->token_pos = src->seg_pos;
2123 /* Extract a segment. */
2124 const char *segment;
2125 enum segment_type seg_type;
2129 segment = &src->buffer[src->seg_pos];
2130 seg_len = segmenter_push (&src->segmenter, segment,
2131 src->length - src->seg_pos,
2132 src->reader->eof, &seg_type);
2136 /* The segmenter needs more input to produce a segment. */
2137 assert (!src->reader->eof);
2138 lex_source_read__ (src);
2141 /* Update state based on the segment. */
2142 token->token_len = seg_len;
2143 src->seg_pos += seg_len;
2144 if (seg_type == SEG_NEWLINE)
2146 if (src->n_lines >= src->allocated_lines)
2147 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2148 sizeof *src->lines);
2149 src->lines[src->n_lines++] = src->seg_pos;
2152 /* Get a token from the segment. */
2153 enum tokenize_result result = token_from_segment (
2154 seg_type, ss_buffer (segment, seg_len), &token->token);
2156 /* If we've reached the end of a line, or the end of a command, then pass
2157 the line to the output engine as a syntax text item. */
2158 int n_lines = seg_type == SEG_NEWLINE;
2159 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2162 src->suppress_next_newline = true;
2164 else if (n_lines > 0 && src->suppress_next_newline)
2167 src->suppress_next_newline = false;
2169 for (int i = 0; i < n_lines; i++)
2171 /* Beginning of line. */
2172 const char *line = &src->buffer[src->journal_pos];
2174 /* Calculate line length, including \n or \r\n end-of-line if present.
2176 We use src->length even though that may be beyond what we've actually
2177 converted to tokens. That's because, if we're emitting the line due
2178 to SEG_END_COMMAND, we want to take the whole line through the
2179 newline, not just through the '.'. */
2180 size_t max_len = src->length - src->journal_pos;
2181 const char *newline = memchr (line, '\n', max_len);
2182 size_t line_len = newline ? newline - line + 1 : max_len;
2184 /* Calculate line length excluding end-of-line. */
2185 size_t copy_len = line_len;
2186 if (copy_len > 0 && line[copy_len - 1] == '\n')
2188 if (copy_len > 0 && line[copy_len - 1] == '\r')
2191 /* Submit the line as syntax. */
2192 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2193 xmemdup0 (line, copy_len),
2196 src->journal_pos += line_len;
2201 case TOKENIZE_ERROR:
2202 lex_get_error (src, token);
2204 case TOKENIZE_EMPTY:
2205 lex_token_destroy (token);
2208 case TOKENIZE_TOKEN:
2209 if (token->token.type == T_STOP)
2211 token->token.type = T_ENDCMD;
2214 lex_stage_push_last (&src->pp, token);
2220 /* Attempts to append a new token to SRC. Returns true if successful, false on
2221 failure. On failure, the end of SRC has been reached and no more tokens
2222 will be forthcoming from it.
2224 Does not make the new token available for lookahead yet; the caller must
2225 adjust SRC's 'middle' pointer to do so. */
2227 lex_source_get_pp (struct lex_source *src)
2230 if (lex_source_try_get_pp (src))
2236 lex_source_try_get_merge (const struct lex_source *src_)
2238 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2240 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2243 if (!settings_get_mexpand ())
2245 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2249 /* Now pass tokens one-by-one to the macro expander.
2251 In the common case where there is no macro to expand, the loop is not
2253 struct macro_call *mc;
2254 int n_call = macro_call_create (src->lexer->macros,
2255 &lex_stage_first (&src->pp)->token, &mc);
2256 for (int ofs = 1; !n_call; ofs++)
2258 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2260 /* This should not be reachable because we always get a T_ENDCMD at
2261 the end of an input file (transformed from T_STOP by
2262 lex_source_try_get_pp()) and the macro_expander should always
2263 terminate expansion on T_ENDCMD. */
2267 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2268 const struct macro_token mt = {
2270 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2272 const struct msg_location loc = lex_token_location (src, t, t);
2273 n_call = macro_call_add (mc, &mt, &loc);
2277 /* False alarm: no macro expansion after all. Use first token as
2278 lookahead. We'll retry macro expansion from the second token next
2280 macro_call_destroy (mc);
2281 lex_stage_shift (&src->merge, &src->pp, 1);
2285 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2286 are a macro call. (These are likely to be the only tokens in 'pp'.)
2288 const struct lex_token *c0 = lex_stage_first (&src->pp);
2289 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2290 struct macro_tokens expansion = { .n = 0 };
2291 struct msg_location loc = lex_token_location (src, c0, c1);
2292 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2293 macro_call_destroy (mc);
2295 /* Convert the macro expansion into syntax for possible error messages
2297 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2298 size_t *len = xnmalloc (expansion.n, sizeof *len);
2299 struct string s = DS_EMPTY_INITIALIZER;
2300 macro_tokens_to_syntax (&expansion, &s, ofs, len);
2302 if (settings_get_mprint ())
2303 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2304 _("Macro Expansion")));
2306 /* Append the macro expansion tokens to 'merge'. */
2307 if (expansion.n > 0)
2309 char *macro_rep = ds_steal_cstr (&s);
2310 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2311 *ref_cnt = expansion.n;
2312 for (size_t i = 0; i < expansion.n; i++)
2314 struct lex_token *token = xmalloc (sizeof *token);
2315 *token = (struct lex_token) {
2316 .token = expansion.mts[i].token,
2317 .token_pos = c0->token_pos,
2318 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2319 .macro_rep = macro_rep,
2324 lex_stage_push_last (&src->merge, token);
2326 ss_dealloc (&expansion.mts[i].syntax);
2331 free (expansion.mts);
2335 /* Destroy the tokens for the call. */
2336 for (size_t i = 0; i < n_call; i++)
2337 lex_stage_pop_first (&src->pp);
2339 return expansion.n > 0;
2342 /* Attempts to obtain at least one new token into 'merge' in SRC.
2344 Returns true if successful, false on failure. In the latter case, SRC is
2345 exhausted and 'src->eof' is now true. */
2347 lex_source_get_merge (struct lex_source *src)
2350 if (lex_source_try_get_merge (src))
2355 /* Attempts to obtain at least one new token into 'parse' in SRC.
2357 Returns true if successful, false on failure. In the latter case, SRC is
2358 exhausted and 'src->eof' is now true. */
2360 lex_source_get_parse (struct lex_source *src)
2362 struct merger m = MERGER_INIT;
2364 for (size_t i = 0; ; i++)
2366 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2368 /* We always get a T_ENDCMD at the end of an input file
2369 (transformed from T_STOP by lex_source_try_get_pp()) and
2370 merger_add() should never return -1 on T_ENDCMD. */
2371 assert (lex_stage_is_empty (&src->merge));
2375 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2379 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2382 else if (retval > 0)
2384 /* Add a token that merges all the tokens together. */
2385 const struct lex_token *first = lex_stage_first (&src->merge);
2386 const struct lex_token *last = lex_stage_nth (&src->merge,
2388 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2389 struct lex_token *t = xmalloc (sizeof *t);
2390 *t = (struct lex_token) {
2392 .token_pos = first->token_pos,
2393 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2395 /* This works well if none of the tokens were expanded from macros,
2396 or if they all came from the same macro expansion. It just gives up
2397 in the other (corner) cases. */
2398 .macro_rep = macro ? first->macro_rep : NULL,
2399 .ofs = macro ? first->ofs : 0,
2400 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2401 .ref_cnt = macro ? first->ref_cnt : NULL,
2405 lex_source_push_parse (src, t);
2407 for (int i = 0; i < retval; i++)
2408 lex_stage_pop_first (&src->merge);
2415 lex_source_push_endcmd__ (struct lex_source *src)
2417 assert (src->n_parse == 0);
2419 struct lex_token *token = xmalloc (sizeof *token);
2420 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2421 lex_source_push_parse (src, token);
/* Appends TOKEN to the end of the SRC->parse array and increments
   SRC->n_parse.  Only the pointer is stored; TOKEN is not copied. */
2425 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
/* Grow the array with x2nrealloc(), which also updates
   SRC->allocated_parse, when every allocated slot is already in use. */
2427 if (src->n_parse >= src->allocated_parse)
2428 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2429 sizeof *src->parse);
2430 src->parse[src->n_parse++] = token;
/* Calls lex_token_destroy() on each of the SRC->n_parse tokens in SRC->parse,
   then resets both SRC->n_parse and SRC->parse_ofs to 0.  The SRC->parse
   array itself is not freed here. */
2434 lex_source_clear_parse (struct lex_source *src)
2436 for (size_t i = 0; i < src->n_parse; i++)
2437 lex_token_destroy (src->parse[i]);
2438 src->n_parse = src->parse_ofs = 0;
/* Allocates and initializes a new lex_source for LEXER and READER.
   NOTE(review): several members of the initializer, and the statements that
   follow the last line shown, are not visible in this listing. */
2441 static struct lex_source *
2442 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
/* Start the LINES array with room for 4 entries. */
2444 size_t allocated_lines = 4;
2445 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2448 struct lex_source *src = xmalloc (sizeof *src);
2449 *src = (struct lex_source) {
2452 .segmenter = segmenter_init (reader->syntax, false),
2456 .allocated_lines = allocated_lines,
/* Give the new source's parse array its initial T_ENDCMD token. */
2459 lex_source_push_endcmd__ (src);
/* Builds a struct msg_handler around OUTPUT_MSG and installs it with
   msg_set_handler().  The handler also carries pointers to lex_source_ref(),
   lex_source_unref() and lex_source_get_line().
   NOTE(review): the end of the parameter list and one member of the
   initializer are not visible in this listing. */
2465 lex_set_message_handler (struct lexer *lexer,
2466 void (*output_msg) (const struct msg *,
2469 struct msg_handler msg_handler = {
2470 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2472 .lex_source_ref = lex_source_ref,
2473 .lex_source_unref = lex_source_unref,
2474 .lex_source_get_line = lex_source_get_line,
2476 msg_set_handler (&msg_handler);
/* Takes a pointer to a const lex_source but works on SRC, a non-const alias
   of it obtained with CONST_CAST.
   NOTE(review): most of the body, including whatever precedes and follows the
   assertion, is not visible in this listing. */
2480 lex_source_ref (const struct lex_source *src_)
2482 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2485 assert (src->n_refs > 0);
/* Drops one reference to SRC and tears the source down.
   NOTE(review): several statements of this function, including those that
   precede the assertion and those that follow the last line shown, are not
   visible in this listing. */
2492 lex_source_unref (struct lex_source *src)
2497 assert (src->n_refs > 0);
/* Decrement the reference count.  NOTE(review): the statement controlled by
   this test is not visible in this listing. */
2498 if (--src->n_refs > 0)
/* Read these two pointers out of the reader before its destroy hook runs,
   presumably because the hook may free the reader itself. */
2501 char *file_name = src->reader->file_name;
2502 char *encoding = src->reader->encoding;
/* The destroy hook is optional and is called only if the class provides
   one. */
2503 if (src->reader->class->destroy != NULL)
2504 src->reader->class->destroy (src->reader);
/* Release the pp and merge stages, then destroy the tokens held in the parse
   array. */
2509 lex_stage_uninit (&src->pp);
2510 lex_stage_uninit (&src->merge);
2511 lex_source_clear_parse (src);
/* A lex_reader whose input comes from a u8_istream; see
   lex_reader_for_file(). */
2516 struct lex_file_reader
/* Embedded generic reader.  lex_file_reader_cast() converts a pointer to this
   member into a pointer to the enclosing struct. */
2518 struct lex_reader reader;
/* Stream that lex_file_read() reads from. */
2519 struct u8_istream *istream;
/* Forward declaration, so that lex_reader_for_file() can refer to the class
   before it is defined. */
2522 static struct lex_reader_class lex_file_reader_class;
2524 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2525 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2526 ENCODING, which should take one of the forms accepted by
2527 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2528 mode of the new reader, respectively.
2530 Returns a null pointer if FILE_NAME cannot be opened. */
2532 lex_reader_for_file (const char *file_name, const char *encoding,
2533 enum segmenter_mode syntax,
2534 enum lex_error_mode error)
2536 struct lex_file_reader *r;
2537 struct u8_istream *istream;
/* A FILE_NAME of "-" wraps the standard input descriptor; any other name is
   opened read-only. */
2539 istream = (!strcmp(file_name, "-")
2540 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2541 : u8_istream_for_file (encoding, file_name, O_RDONLY));
/* On failure, report the file name together with the errno text. */
2542 if (istream == NULL)
2544 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
/* Allocate the reader and fill it in.  FILE_NAME is stored as a duplicate made
   by xstrdup() and ENCODING as the result of xstrdup_if_nonnull(), so the
   caller's own strings are not retained.  Line numbering starts at 1. */
2548 r = xmalloc (sizeof *r);
2549 lex_reader_init (&r->reader, &lex_file_reader_class);
2550 r->reader.syntax = syntax;
2551 r->reader.error = error;
2552 r->reader.file_name = xstrdup (file_name);
2553 r->reader.encoding = xstrdup_if_nonnull (encoding);
2554 r->reader.line_number = 1;
2555 r->istream = istream;
2560 static struct lex_file_reader *
2561 lex_file_reader_cast (struct lex_reader *r)
2563 return UP_CAST (r, struct lex_file_reader, reader);
/* Reads up to N bytes into BUF from the u8_istream that belongs to file
   reader R_.  PROMPT_STYLE is unused.
   NOTE(review): the test that guards the error message and the return
   statements are not visible in this listing. */
2567 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2568 enum prompt_style prompt_style UNUSED)
2570 struct lex_file_reader *r = lex_file_reader_cast (r_);
2571 ssize_t n_read = u8_istream_read (r->istream, buf, n);
/* Report a read failure, naming the file and giving the errno text. */
2574 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Releases the u8_istream that belongs to file reader R_.
   NOTE(review): the statements after the last line shown are not visible in
   this listing. */
2581 lex_file_close (struct lex_reader *r_)
2583 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* A stream that is not on the standard input descriptor is closed, and a
   failure to close it is reported with the file name and errno text. */
2585 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2587 if (u8_istream_close (r->istream) != 0)
2588 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
/* NOTE(review): presumably the alternative taken for a stream on standard
   input, which is freed rather than closed; the line that selects this
   statement is not visible in this listing. */
2591 u8_istream_free (r->istream);
/* Reader class that lex_reader_for_file() passes to lex_reader_init().
   NOTE(review): the initializer is not visible in this listing. */
2596 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader whose input comes from a string held in memory.
   NOTE(review): only the first member is visible in this listing;
   lex_string_read() also uses members named 's' and 'offset'. */
2602 struct lex_string_reader
/* Embedded generic reader.  lex_string_reader_cast() converts a pointer to
   this member into a pointer to the enclosing struct. */
2604 struct lex_reader reader;
/* Forward declaration, so that lex_reader_for_substring_nocopy() can refer to
   the class before it is defined. */
2609 static struct lex_reader_class lex_string_reader_class;
2611 /* Creates and returns a new lex_reader for the contents of S, which must be
2612 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2613 with ss_dealloc() when it is closed. */
2615 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2617 struct lex_string_reader *r;
2619 r = xmalloc (sizeof *r);
2620 lex_reader_init (&r->reader, &lex_string_reader_class);
/* The syntax mode is fixed at SEG_MODE_AUTO rather than taken from the
   caller. */
2621 r->reader.syntax = SEG_MODE_AUTO;
/* ENCODING is stored as the result of xstrdup_if_nonnull(), not as the
   caller's pointer.  NOTE(review): the rest of the initialization and the
   return statement are not visible in this listing. */
2622 r->reader.encoding = xstrdup_if_nonnull (encoding);
2629 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2630 which must be encoded in ENCODING. The caller retains ownership of S. */
2632 lex_reader_for_string (const char *s, const char *encoding)
/* ss_clone() produces the copy that is handed over to
   lex_reader_for_substring_nocopy(), so S itself stays with the caller. */
2634 return lex_reader_for_substring_nocopy (ss_clone (ss_cstr (s)), encoding);
2637 /* Formats FORMAT as a printf()-like format string and creates and returns a
2638 new lex_reader for the formatted result. */
2640 lex_reader_for_format (const char *format, const char *encoding, ...)
2642 struct lex_reader *r;
/* ENCODING is the last named parameter, so the variable arguments consumed by
   FORMAT begin after it. */
2645 va_start (args, encoding);
/* The string returned by xvasprintf() is wrapped with ss_cstr() and handed to
   the reader without a further copy.  NOTE(review): the declaration of ARGS
   and the statements after this one are not visible in this listing. */
2646 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
2652 static struct lex_string_reader *
2653 lex_string_reader_cast (struct lex_reader *r)
2655 return UP_CAST (r, struct lex_string_reader, reader);
/* Copies bytes of string reader R_'s string, starting at the reader's current
   offset, into BUF.  PROMPT_STYLE is unused.
   NOTE(review): the declaration of CHUNK and the statements after the
   memcpy() call are not visible in this listing. */
2659 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2660 enum prompt_style prompt_style UNUSED)
2662 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Copy at most N bytes, and no more than remain in the string beyond the
   current offset. */
2665 chunk = MIN (n, r->s.length - r->offset);
2666 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string readers.
   NOTE(review): only the cast to the string reader type is visible in this
   listing; the statements that follow it are not shown. */
2673 lex_string_close (struct lex_reader *r_)
2675 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Reader class that lex_reader_for_substring_nocopy() passes to
   lex_reader_init().  NOTE(review): the initializer is not visible in this
   listing. */
2681 static struct lex_reader_class lex_string_reader_class =
/* Returns the text of line number LINE of SRC (lines are numbered from 1) as
   a substring built by ss_buffer() from SRC->buffer.
   NOTE(review): the statement that handles an out-of-range LINE, the
   declaration of END and the line that selects the memchr() alternative are
   not visible in this listing. */
2688 lex_source_get_line (const struct lex_source *src, int line)
/* Valid line numbers run from 1 through SRC->n_lines. */
2690 if (line < 1 || line > src->n_lines)
/* SRC->lines[LINE - 1] is used as the offset in SRC->buffer at which the line
   begins. */
2693 size_t ofs = src->lines[line - 1];
/* When another line is recorded after this one, this line ends at the offset
   where that next line begins. */
2695 if (line < src->n_lines)
2696 end = src->lines[line];
/* Search from OFS for a new-line.  The line ends at that new-line if one is
   found, otherwise at the end of the buffered data (SRC->length). */
2699 const char *newline = memchr (src->buffer + ofs, '\n', src->length - ofs);
2700 end = newline ? newline - src->buffer : src->length;
2702 return ss_buffer (&src->buffer[ofs], end - ofs);