1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
83 static struct msg_point lex_token_start_point (const struct lex_source *,
84 const struct lex_token *);
85 static struct msg_point lex_token_end_point (const struct lex_source *,
86 const struct lex_token *);
88 static bool lex_ofs_at_phrase__ (struct lexer *, int ofs, const char *s,
91 /* Returns the source offset of the last byte in TOKEN, computed from its
   token_pos and token_len.  A token_len of 0 is treated as 1, so that the
   result is then token_pos itself rather than token_pos - 1. */
93 lex_token_end (const struct lex_token *token)
95 return token->token_pos + MAX (token->token_len, 1) - 1;
/* Destroys token T.  The statements visible here uninitialize the token
   embedded in T and assert that the count that T->ref_cnt points to is still
   positive.

   NOTE(review): the statements around the assertion are not visible in this
   copy -- confirm how the reference count, the macro text and T itself are
   released. */
99 lex_token_destroy (struct lex_token *t)
101 token_uninit (&t->token);
104 assert (*t->ref_cnt > 0);
114 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source. */
119 struct lex_token **tokens;
122 static void lex_stage_clear (struct lex_stage *);
123 static void lex_stage_uninit (struct lex_stage *);
125 static size_t lex_stage_count (const struct lex_stage *);
126 static bool lex_stage_is_empty (const struct lex_stage *);
128 static struct lex_token *lex_stage_first (struct lex_stage *);
129 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
131 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
132 static void lex_stage_pop_first (struct lex_stage *);
134 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
137 /* Deletes all the tokens from STAGE. */
139 lex_stage_clear (struct lex_stage *stage)
141 while (!deque_is_empty (&stage->deque))
142 lex_stage_pop_first (stage);
145 /* Deletes all the tokens from STAGE and frees storage for the deque. */
147 lex_stage_uninit (struct lex_stage *stage)
149 lex_stage_clear (stage);
150 free (stage->tokens);
153 /* Returns true if STAGE contains no tokens, otherwise false. */
155 lex_stage_is_empty (const struct lex_stage *stage)
157 return deque_is_empty (&stage->deque);
160 /* Returns the number of tokens in STAGE. */
162 lex_stage_count (const struct lex_stage *stage)
164 return deque_count (&stage->deque);
/* Returns the first token in STAGE, which must be nonempty.
   The first token is the one accessed with the least lookahead.

   The token stays in STAGE; this is the same as lex_stage_nth (STAGE, 0). */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  return lex_stage_nth (stage, 0);
}
175 /* Returns the token the given INDEX in STAGE. The first token (with the least
176 lookahead) is 0, the second token is 1, and so on. There must be at least
177 INDEX + 1 tokens in STAGE. */
178 static struct lex_token *
179 lex_stage_nth (struct lex_stage *stage, size_t index)
181 return stage->tokens[deque_back (&stage->deque, index)];
184 /* Adds TOKEN so that it becomes the last token in STAGE. */
186 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
188 if (deque_is_full (&stage->deque))
189 stage->tokens = deque_expand (&stage->deque, stage->tokens,
190 sizeof *stage->tokens);
191 stage->tokens[deque_push_front (&stage->deque)] = token;
194 /* Removes and returns the first token from STAGE. */
195 static struct lex_token *
196 lex_stage_take_first (struct lex_stage *stage)
198 return stage->tokens[deque_pop_back (&stage->deque)];
/* Removes the first token from STAGE and destroys it with
   lex_token_destroy(). */
static void
lex_stage_pop_first (struct lex_stage *stage)
{
  lex_token_destroy (lex_stage_take_first (stage));
}
/* Removes the first N tokens from SRC, appending them to DST as the last
   N tokens, in the same order.  The tokens are moved, not copied or
   destroyed. */
static void
lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
{
  for (size_t i = 0; i < n; i++)
    lex_stage_push_last (dst, lex_stage_take_first (src));
}
217 /* A source of tokens, corresponding to a syntax file.
219 This is conceptually a lex_reader wrapped with everything needed to convert
220 its UTF-8 bytes into tokens. */
223 struct ll ll; /* In lexer's list of sources. */
227 - One for struct lexer.
229 - One for each struct msg_location that references this source. */
232 struct lex_reader *reader;
234 struct segmenter segmenter;
235 bool eof; /* True if T_STOP was read from 'reader'. */
237 /* Buffer of UTF-8 bytes. */
238 char *buffer; /* Source file contents. */
239 size_t length; /* Number of bytes filled. */
240 size_t allocated; /* Number of bytes allocated. */
242 /* Offsets into 'buffer'. */
243 size_t journal_pos; /* First byte not yet output to journal. */
244 size_t seg_pos; /* First byte not yet scanned as token. */
246 /* Offset into 'buffer' of starts of lines. */
248 size_t n_lines, allocated_lines;
250 bool suppress_next_newline;
254 /* This is a pipeline with the following stages. Each token eventually
255 made available to the parser passes through all of these stages. The stages
256 are named after the processing that happens in each one.
258 Initially, tokens come from the segmenter and scanner to 'pp':
260 - pp: Tokens that need to pass through the macro preprocessor to end up
263 - merge: Tokens that need to pass through scan_merge() to end up in
266 - parse: Tokens available to the client for parsing.
268 'pp' and 'merge' store tokens only temporarily until they pass into
269 'parse'. Tokens then live in 'parse' until the command is fully
270 consumed, at which time they are freed together. */
272 struct lex_stage merge;
273 struct lex_token **parse;
274 size_t n_parse, allocated_parse, parse_ofs;
277 static struct lex_source *lex_source_create (struct lexer *,
278 struct lex_reader *);
283 struct ll_list sources; /* Contains "struct lex_source"s. */
284 struct macro_set *macros;
287 static struct lex_source *lex_source__ (const struct lexer *);
288 static char *lex_source_syntax__ (const struct lex_source *,
290 static const struct lex_token *lex_next__ (const struct lexer *, int n);
291 static void lex_source_push_endcmd__ (struct lex_source *);
292 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
293 static void lex_source_clear_parse (struct lex_source *);
295 static bool lex_source_get_parse (struct lex_source *);
296 static void lex_source_msg_valist (struct lex_source *, enum msg_class,
298 const char *format, va_list)
299 PRINTF_FORMAT (5, 0);
300 static const struct lex_token *lex_source_next__ (const struct lex_source *,
303 /* Initializes READER with the specified CLASS and otherwise some reasonable
304 defaults: syntax mode SEG_MODE_AUTO, error mode LEX_ERROR_CONTINUE, a null
   file name and encoding, and line number 0.  The caller should fill in the
   other members as desired. */
306 lex_reader_init (struct lex_reader *reader,
307 const struct lex_reader_class *class)
309 reader->class = class;
310 reader->syntax = SEG_MODE_AUTO;
311 reader->error = LEX_ERROR_CONTINUE;
312 reader->file_name = NULL;
313 reader->encoding = NULL;
314 reader->line_number = 0;
318 /* Frees any file name already in READER and replaces it by a copy of
319 FILE_NAME, or if FILE_NAME is null then clears any existing name.

   READER stores its own copy, so the caller keeps ownership of FILE_NAME. */
321 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
323 free (reader->file_name);
324 reader->file_name = xstrdup_if_nonnull (file_name);
327 /* Creates and returns a new lexer.  The lexer is allocated with xmalloc(),
   its list of sources is set up with LL_INITIALIZER, and it receives a new
   macro set from macro_set_create(). */
331 struct lexer *lexer = xmalloc (sizeof *lexer);
332 *lexer = (struct lexer) {
333 .sources = LL_INITIALIZER (lexer->sources),
334 .macros = macro_set_create (),
339 /* Destroys LEXER: removes every source from LEXER's list and unreferences
   it, then destroys LEXER's macro set. */
341 lex_destroy (struct lexer *lexer)
345 struct lex_source *source, *next;
/* The "safe" iterator is needed because each source is unlinked inside the
   loop. */
347 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
349 ll_remove (&source->ll);
350 lex_source_unref (source);
352 macro_set_destroy (lexer->macros);
357 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
358 same name. Takes ownership of M.

   The work is done by macro_set_add() on LEXER's macro set. */
360 lex_define_macro (struct lexer *lexer, struct macro *m)
362 macro_set_add (lexer->macros, m);
365 /* Inserts READER into LEXER so that the next token read by LEXER comes from
366 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
   token. */
369 lex_include (struct lexer *lexer, struct lex_reader *reader)
/* Including a source when LEXER has sources but is not at T_ENDCMD is treated
   as a caller bug. */
371 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
/* A new source is created for READER and pushed at the head of the list. */
372 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
375 /* Appends READER to LEXER, so that it will be read after all other current
376 readers have already been read.

   A new source is created for READER and pushed at the tail of LEXER's list
   of sources. */
378 lex_append (struct lexer *lexer, struct lex_reader *reader)
380 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
385 /* Advances LEXER to the next token, consuming the current token.

   When the token being consumed is T_ENDCMD, the tokens parsed so far are
   discarded with lex_source_clear_parse().  Whenever the source in use
   cannot supply another token, it is removed from LEXER's list and
   unreferenced, and reading continues with whatever source lex_source__()
   returns next.

   NOTE(review): several short statements of this function are not visible in
   this copy -- confirm the handling of a null source and of tokens other
   than T_ENDCMD. */
387 lex_get (struct lexer *lexer)
389 struct lex_source *src;
391 src = lex_source__ (lexer);
395 if (src->parse_ofs < src->n_parse)
397 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
398 lex_source_clear_parse (src);
403 while (src->parse_ofs == src->n_parse)
404 if (!lex_source_get_parse (src))
406 ll_remove (&src->ll);
407 lex_source_unref (src);
408 src = lex_source__ (lexer);
414 /* Advances LEXER by N tokens.

   NOTE(review): the body is not visible in this copy; presumably it calls
   lex_get() N times -- confirm. */
416 lex_get_n (struct lexer *lexer, size_t n)
422 /* Issuing errors. */
424 /* Prints a syntax error message containing the current token and the
425 message given by printf-style FORMAT and its arguments (if FORMAT is non-null).

   The message has class SE and covers only the token at lex_ofs (LEXER). */
427 lex_error (struct lexer *lexer, const char *format, ...)
431 va_start (args, format);
432 lex_ofs_msg_valist (lexer, SE, lex_ofs (lexer), lex_ofs (lexer),
437 /* Prints a syntax error message for the span of tokens N0 through N1,
438 inclusive, from the current token in LEXER, adding message MESSAGE (if
   non-null). */
441 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
445 va_start (args, format);
/* N0 and N1 count from the current token, so add the current token's offset
   to obtain the offsets that lex_ofs_msg_valist() expects. */
446 int ofs = lex_ofs (lexer);
447 lex_ofs_msg_valist (lexer, SE, n0 + ofs, n1 + ofs, format, args);
451 /* Prints a syntax error message for the span of tokens with offsets OFS0
452 through OFS1, inclusive, within the current command in LEXER, adding the
453 message given by printf-style FORMAT and its arguments (if FORMAT is non-null).

   The message is issued with class SE. */
455 lex_ofs_error (struct lexer *lexer, int ofs0, int ofs1, const char *format, ...)
459 va_start (args, format);
460 lex_ofs_msg_valist (lexer, SE, ofs0, ofs1, format, args);
464 /* Prints a message of the given CLASS containing the current token and the
465 message given by printf-style FORMAT and its arguments (if FORMAT is non-null).

   The message covers only the token at lex_ofs (LEXER). */
467 lex_msg (struct lexer *lexer, enum msg_class class, const char *format, ...)
471 va_start (args, format);
472 lex_ofs_msg_valist (lexer, class, lex_ofs (lexer), lex_ofs (lexer),
477 /* Prints a message of the given CLASS for the span of tokens N0 through N1,
478 inclusive, from the current token in LEXER, adding message MESSAGE (if
   non-null). */
481 lex_next_msg (struct lexer *lexer, enum msg_class class, int n0, int n1,
482 const char *format, ...)
486 va_start (args, format);
/* N0 and N1 count from the current token, so add the current token's offset
   to obtain the offsets that lex_ofs_msg_valist() expects. */
487 int ofs = lex_ofs (lexer);
488 lex_ofs_msg_valist (lexer, class, n0 + ofs, n1 + ofs, format, args);
492 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
493 through OFS1, inclusive, within the current command in LEXER, adding the
494 message given by printf-style FORMAT and its arguments (if FORMAT is non-null). */
496 lex_ofs_msg (struct lexer *lexer, enum msg_class class, int ofs0, int ofs1,
497 const char *format, ...)
501 va_start (args, format);
502 lex_ofs_msg_valist (lexer, class, ofs0, ofs1, format, args);
506 /* Prints a syntax error message saying that one of the strings provided as
507 varargs, up to the first NULL, is expected.

   The variable arguments are forwarded to lex_error_expecting_valist().

   NOTE(review): the function name is parenthesized in the definition,
   presumably so that a function-like macro of the same name declared
   elsewhere does not expand here -- confirm against the header. */
509 (lex_error_expecting) (struct lexer *lexer, ...)
513 va_start (args, lexer);
514 lex_error_expecting_valist (lexer, args);
518 /* Prints a syntax error message saying that one of the options provided in
519 ARGS, up to the first NULL, is expected.

   Each argument is read as a "const char *" and stored in an array that is
   grown with x2nrealloc(); the array is then reported through
   lex_error_expecting_array(). */
521 lex_error_expecting_valist (struct lexer *lexer, va_list args)
523 const char **options = NULL;
524 size_t allocated = 0;
529 const char *option = va_arg (args, const char *);
534 options = x2nrealloc (options, &allocated, sizeof *options);
535 options[n++] = option;
537 lex_error_expecting_array (lexer, options, n);
/* Prints a syntax error message saying that one of the N strings in OPTIONS
   is expected.

   Separate message templates are visible for one through eight options, so
   that each can be translated as a whole sentence.  The last form instead
   joins all the options with ", " into one list.

   NOTE(review): the statements that select among these forms are not visible
   in this copy; presumably the call with a null format handles N == 0 and
   the joined list handles more than eight options -- confirm. */
542 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
547 lex_error (lexer, NULL);
551 lex_error (lexer, _("Syntax error expecting %s."), options[0]);
555 lex_error (lexer, _("Syntax error expecting %s or %s."),
556 options[0], options[1]);
560 lex_error (lexer, _("Syntax error expecting %s, %s, or %s."),
561 options[0], options[1], options[2]);
565 lex_error (lexer, _("Syntax error expecting %s, %s, %s, or %s."),
566 options[0], options[1], options[2], options[3]);
570 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, or %s."),
571 options[0], options[1], options[2], options[3], options[4]);
575 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, or %s."),
576 options[0], options[1], options[2], options[3], options[4],
581 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, "
583 options[0], options[1], options[2], options[3], options[4],
584 options[5], options[6]);
588 lex_error (lexer, _("Syntax error expecting %s, %s, %s, %s, %s, %s, %s, "
590 options[0], options[1], options[2], options[3], options[4],
591 options[5], options[6], options[7]);
596 struct string s = DS_EMPTY_INITIALIZER;
597 for (size_t i = 0; i < n; i++)
600 ds_put_cstr (&s, ", ");
601 ds_put_cstr (&s, options[i]);
603 lex_error (lexer, _("Syntax error expecting one of the following: %s."),
611 /* Reports an error to the effect that subcommand SBC may only be specified
   once. */
614 lex_sbc_only_once (struct lexer *lexer, const char *sbc)
/* Start from the token just before the current one, and inspect it for
   T_EQUALS. */
616 int ofs = lex_ofs (lexer) - 1;
617 if (lex_ofs_token (lexer, ofs)->type == T_EQUALS)
620 /* lex_ofs_at_phrase__() handles subcommand names that are keywords, such as
622 if (lex_ofs_at_phrase__ (lexer, ofs, sbc, NULL))
623 lex_ofs_error (lexer, ofs, ofs,
624 _("Subcommand %s may only be specified once."), sbc);
626 msg (SE, _("Subcommand %s may only be specified once."), sbc);
629 /* Reports an error to the effect that subcommand SBC is missing.
631 This function attaches the error to the whole command, from token offset 0
632 through lex_max_ofs (LEXER), instead of using lex_error(), because a missing
   subcommand can normally be detected only after the whole
633 command has been parsed, and so lex_error() would always report "Syntax
634 error at end of command", which does not help the user find the error. */
636 lex_sbc_missing (struct lexer *lexer, const char *sbc)
638 lex_ofs_error (lexer, 0, lex_max_ofs (lexer),
639 _("Required subcommand %s was not specified."), sbc);
642 /* Reports an error to the effect that specification SPEC may only be specified
643 once within subcommand SBC.

   The error is reported at the current token through lex_error(). */
645 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
647 lex_error (lexer, _("%s may only be specified once within subcommand %s."),
651 /* Reports an error to the effect that specification SPEC is missing within
   subcommand SBC. */
654 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
/* The error is reported at the current token through lex_error(). */
656 lex_error (lexer, _("Required %s specification missing from %s subcommand."),
660 /* Prints a message of the given CLASS for the span of tokens with offsets OFS0
661 through OFS1, inclusive, within the current command in LEXER, adding the
662 message given by printf-style FORMAT (if non-null) with the given ARGS.

   The work is delegated to lex_source_msg_valist() on the source that
   lex_source__ (LEXER) returns. */
664 lex_ofs_msg_valist (struct lexer *lexer, enum msg_class class,
665 int ofs0, int ofs1, const char *format, va_list args)
667 lex_source_msg_valist (lex_source__ (lexer), class, ofs0, ofs1, format, args);
670 /* Checks that we're at end of command.
671 If so, returns a successful command completion code.
672 If not, flags a syntax error and returns an error command
   completion code. */
675 lex_end_of_command (struct lexer *lexer)
/* Both T_ENDCMD and T_STOP count as being at the end of the command. */
677 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
679 lex_error (lexer, _("Syntax error expecting end of command."));
686 /* Token testing functions. */
688 /* Returns true if the current token is a number.
   Same as lex_next_is_number (LEXER, 0). */
690 lex_is_number (const struct lexer *lexer)
692 return lex_next_is_number (lexer, 0);
695 /* Returns true if the current token is a string.
   Same as lex_next_is_string (LEXER, 0). */
697 lex_is_string (const struct lexer *lexer)
699 return lex_next_is_string (lexer, 0);
702 /* Returns the value of the current token, which must be a
703 floating point number.  Same as lex_next_number (LEXER, 0). */
705 lex_number (const struct lexer *lexer)
707 return lex_next_number (lexer, 0);
710 /* Returns true iff the current token is an integer.
   Same as lex_next_is_integer (LEXER, 0). */
712 lex_is_integer (const struct lexer *lexer)
714 return lex_next_is_integer (lexer, 0);
717 /* Returns the value of the current token, which must be an
   integer. */
720 lex_integer (const struct lexer *lexer)
/* The current token is the token with lookahead 0. */
722 return lex_next_integer (lexer, 0);
725 /* Token testing functions with lookahead.
727 A value of 0 for N as an argument to any of these functions refers to the
728 current token. Lookahead is limited to the current command. Any N greater
729 than the number of tokens remaining in the current command will be treated
730 as referring to a T_ENDCMD token. */
732 /* Returns true if the token N ahead of the current token is a number, as
   judged by token_is_number(). */
734 lex_next_is_number (const struct lexer *lexer, int n)
736 return token_is_number (lex_next (lexer, n));
739 /* Returns true if the token N ahead of the current token is a string, as
   judged by token_is_string(). */
741 lex_next_is_string (const struct lexer *lexer, int n)
743 return token_is_string (lex_next (lexer, n));
746 /* Returns the value of the token N ahead of the current token, which must be a
747 floating point number.  The value is obtained with token_number(). */
749 lex_next_number (const struct lexer *lexer, int n)
751 return token_number (lex_next (lexer, n));
754 /* Returns true if the token N ahead of the current token is an integer, as
   judged by token_is_integer(). */
756 lex_next_is_integer (const struct lexer *lexer, int n)
758 return token_is_integer (lex_next (lexer, n));
761 /* Returns the value of the token N ahead of the current token, which must be
   an integer. */
764 lex_next_integer (const struct lexer *lexer, int n)
/* The conversion of the token to an integer is done by token_integer(). */
766 return token_integer (lex_next (lexer, n));
769 /* Token matching functions. */
771 /* If the current token has the specified TYPE, skips it and returns true.
772 Otherwise, returns false.

   TYPE is compared against lex_token (LEXER). */
774 lex_match (struct lexer *lexer, enum token_type type)
776 if (lex_token (lexer) == type)
785 /* If the current token matches IDENTIFIER, skips it and returns true.
786 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
   returns false.  Same as lex_match_id_n (LEXER, IDENTIFIER, 3).

789 IDENTIFIER must be an ASCII string. */
791 lex_match_id (struct lexer *lexer, const char *identifier)
793 return lex_match_id_n (lexer, identifier, 3);
796 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
797 may be abbreviated to its first N letters. Otherwise, returns false.

   Only a T_ID token can match; the comparison itself is done by
   lex_id_match_n().

799 IDENTIFIER must be an ASCII string. */
801 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
803 if (lex_token (lexer) == T_ID
804 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
813 /* If the current token is integer X, skips it and returns true. Otherwise,
   returns false. */
816 lex_match_int (struct lexer *lexer, int x)
/* Check that the token is an integer before reading its value. */
818 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
827 /* Forced matches. */
829 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
830 abbreviated to its first 3 letters. Otherwise, reports an error and returns
   false.  The error, issued through lex_error_expecting(), names IDENTIFIER
   as the expected token.

833 IDENTIFIER must be an ASCII string. */
835 lex_force_match_id (struct lexer *lexer, const char *identifier)
837 if (lex_match_id (lexer, identifier))
841 lex_error_expecting (lexer, identifier);
846 /* If the current token has the specified TYPE, skips it and returns true.
847 Otherwise, reports an error and returns false.

   The error names the expected token either by the string that
   token_type_to_string() returns for TYPE, wrapped in quotes, or by the name
   that token_type_to_name() returns for TYPE.

   NOTE(review): the test that chooses between these two forms is not visible
   in this copy -- confirm. */
849 lex_force_match (struct lexer *lexer, enum token_type type)
851 if (lex_token (lexer) == type)
858 const char *type_string = token_type_to_string (type);
861 char *s = xasprintf ("`%s'", type_string);
862 lex_error_expecting (lexer, s);
866 lex_error_expecting (lexer, token_type_to_name (type));
872 /* If the current token is a string, does nothing and returns true.
873 Otherwise, reports an error and returns false.

   The token is only tested with lex_is_string(), not consumed. */
875 lex_force_string (struct lexer *lexer)
877 if (lex_is_string (lexer))
881 lex_error (lexer, _("Syntax error expecting string."));
886 /* If the current token is a string or an identifier, does nothing and returns
887 true. Otherwise, reports an error and returns false.
889 This is meant for use in syntactic situations where we want to encourage the
890 user to supply a quoted string, but for compatibility we also accept
891 identifiers. (One example of such a situation is file names.) Therefore,
892 the error message issued when the current token is wrong only says that a
893 string is expected and doesn't mention that an identifier would also be
   accepted. */
896 lex_force_string_or_id (struct lexer *lexer)
/* An identifier is accepted without further checks; for anything else,
   lex_force_string() decides and reports the error. */
898 return lex_token (lexer) == T_ID || lex_force_string (lexer);
901 /* If the current token is an integer, does nothing and returns true.
902 Otherwise, reports an error and returns false.

   The token is only tested with lex_is_integer(), not consumed. */
904 lex_force_int (struct lexer *lexer)
906 if (lex_is_integer (lexer))
910 lex_error (lexer, _("Syntax error expecting integer."));
915 /* If the current token is an integer in the range MIN...MAX (inclusive), does
916 nothing and returns true. Otherwise, reports an error and returns false.
917 If NAME is nonnull, then it is used in the error message.

   A number that is not an integer is still compared against MIN and MAX, so
   that the error can mention the bound that it violates.  The wording of the
   error depends on the bounds: the visible messages cover a plain integer,
   one specific value, two adjacent values, a range, a non-negative or
   positive integer, and a lower or an upper bound alone.  The lower bound is
   mentioned only if MIN is greater than INT_MIN / 2 or the token was too
   small, and the upper bound only if MAX is less than INT_MAX / 2 or the
   token was too big.

   NOTE(review): MIN and MAX are long, but those thresholds are derived from
   the limits of int -- confirm that this is intended. */
919 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
921 bool is_number = lex_is_number (lexer);
922 bool is_integer = lex_is_integer (lexer);
923 bool too_small = (is_integer ? lex_integer (lexer) < min
924 : is_number ? lex_number (lexer) < min
926 bool too_big = (is_integer ? lex_integer (lexer) > max
927 : is_number ? lex_number (lexer) > max
929 if (is_integer && !too_small && !too_big)
934 /* Weird, maybe a bug in the caller. Just report that we needed an
937 lex_error (lexer, _("Syntax error expecting integer for %s."), name);
939 lex_error (lexer, _("Syntax error expecting integer."));
944 lex_error (lexer, _("Syntax error expecting %ld for %s."), min, name);
946 lex_error (lexer, _("Syntax error expecting %ld."), min);
948 else if (min + 1 == max)
951 lex_error (lexer, _("Syntax error expecting %ld or %ld for %s."),
954 lex_error (lexer, _("Syntax error expecting %ld or %ld."),
959 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
960 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
962 if (report_lower_bound && report_upper_bound)
966 _("Syntax error expecting integer "
967 "between %ld and %ld for %s."),
970 lex_error (lexer, _("Syntax error expecting integer "
971 "between %ld and %ld."),
974 else if (report_lower_bound)
979 lex_error (lexer, _("Syntax error expecting "
980 "non-negative integer for %s."),
983 lex_error (lexer, _("Syntax error expecting "
984 "non-negative integer."));
989 lex_error (lexer, _("Syntax error expecting "
990 "positive integer for %s."),
993 lex_error (lexer, _("Syntax error expecting "
994 "positive integer."));
999 lex_error (lexer, _("Syntax error expecting "
1000 "integer %ld or greater for %s."),
1003 lex_error (lexer, _("Syntax error expecting "
1004 "integer %ld or greater."), min);
1007 else if (report_upper_bound)
1011 _("Syntax error expecting integer less than or equal "
1015 lex_error (lexer, _("Syntax error expecting integer less than or "
1022 lex_error (lexer, _("Syntax error expecting integer for %s."),
1025 lex_error (lexer, _("Syntax error expecting integer."));
1031 /* If the current token is a number, does nothing and returns true.
1032 Otherwise, reports an error and returns false.

   The token is only tested with lex_is_number(), not consumed. */
1034 lex_force_num (struct lexer *lexer)
1036 if (lex_is_number (lexer))
1039 lex_error (lexer, _("Syntax error expecting number."));
1043 /* If the current token is a number in the closed range [MIN,MAX], does
1044 nothing and returns true. Otherwise, reports an error and returns false.
1045 If NAME is nonnull, then it is used in the error message.

   The wording of the error depends on the bounds.  When MIN equals MAX, that
   single value is named.  Otherwise the lower bound is mentioned only if MIN
   is greater than -DBL_MAX or the token was too small, and the upper bound
   only if MAX is less than DBL_MAX or the token was too big. */
1047 lex_force_num_range_closed (struct lexer *lexer, const char *name,
1048 double min, double max)
1050 bool is_number = lex_is_number (lexer);
1051 bool too_small = is_number && lex_number (lexer) < min;
1052 bool too_big = is_number && lex_number (lexer) > max;
1053 if (is_number && !too_small && !too_big)
1058 /* Weird, maybe a bug in the caller. Just report that we needed an
1061 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1063 lex_error (lexer, _("Syntax error expecting number."));
1065 else if (min == max)
1068 lex_error (lexer, _("Syntax error expecting number %g for %s."),
1071 lex_error (lexer, _("Syntax error expecting number %g."), min);
1075 bool report_lower_bound = min > -DBL_MAX || too_small;
1076 bool report_upper_bound = max < DBL_MAX || too_big;
1078 if (report_lower_bound && report_upper_bound)
1082 _("Syntax error expecting number "
1083 "between %g and %g for %s."),
1086 lex_error (lexer, _("Syntax error expecting number "
1087 "between %g and %g."),
1090 else if (report_lower_bound)
1095 lex_error (lexer, _("Syntax error expecting "
1096 "non-negative number for %s."),
1099 lex_error (lexer, _("Syntax error expecting "
1100 "non-negative number."));
1105 lex_error (lexer, _("Syntax error expecting number "
1106 "%g or greater for %s."),
1109 lex_error (lexer, _("Syntax error expecting number "
1110 "%g or greater."), min);
1113 else if (report_upper_bound)
1117 _("Syntax error expecting number "
1118 "less than or equal to %g for %s."),
1121 lex_error (lexer, _("Syntax error expecting number "
1122 "less than or equal to %g."),
1128 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1130 lex_error (lexer, _("Syntax error expecting number."));
1136 /* If the current token is a number in the half-open range [MIN,MAX), does
1137 nothing and returns true. Otherwise, reports an error and returns false.
1138 If NAME is nonnull, then it is used in the error message.

   A token equal to MAX counts as too big.  In the error, the lower bound is
   mentioned only if MIN is greater than -DBL_MAX or the token was too small,
   and the upper bound only if MAX is less than DBL_MAX or the token was too
   big. */
1140 lex_force_num_range_halfopen (struct lexer *lexer, const char *name,
1141 double min, double max)
1143 bool is_number = lex_is_number (lexer);
1144 bool too_small = is_number && lex_number (lexer) < min;
1145 bool too_big = is_number && lex_number (lexer) >= max;
1146 if (is_number && !too_small && !too_big)
1151 /* Weird, maybe a bug in the caller. Just report that we needed an
1154 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1156 lex_error (lexer, _("Syntax error expecting number."));
1160 bool report_lower_bound = min > -DBL_MAX || too_small;
1161 bool report_upper_bound = max < DBL_MAX || too_big;
1163 if (report_lower_bound && report_upper_bound)
1166 lex_error (lexer, _("Syntax error expecting number "
1167 "in [%g,%g) for %s."),
1170 lex_error (lexer, _("Syntax error expecting number in [%g,%g)."),
1173 else if (report_lower_bound)
1178 lex_error (lexer, _("Syntax error expecting "
1179 "non-negative number for %s."),
1182 lex_error (lexer, _("Syntax error expecting "
1183 "non-negative number."));
1188 lex_error (lexer, _("Syntax error expecting "
1189 "number %g or greater for %s."),
1192 lex_error (lexer, _("Syntax error expecting "
1193 "number %g or greater."), min);
1196 else if (report_upper_bound)
1200 _("Syntax error expecting "
1201 "number less than %g for %s."), max, name);
1203 lex_error (lexer, _("Syntax error expecting "
1204 "number less than %g."), max);
1209 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1211 lex_error (lexer, _("Syntax error expecting number."));
1217 /* If the current token is a number in the open range (MIN,MAX), does
1218 nothing and returns true. Otherwise, reports an error and returns false.
1219 If NAME is nonnull, then it is used in the error message.

   A token equal to MIN counts as too small and one equal to MAX as too big.
   In the error, the lower bound is mentioned only if MIN is greater than
   -DBL_MAX or the token was too small, and the upper bound only if MAX is
   less than DBL_MAX or the token was too big. */
1221 lex_force_num_range_open (struct lexer *lexer, const char *name,
1222 double min, double max)
1224 bool is_number = lex_is_number (lexer);
1225 bool too_small = is_number && lex_number (lexer) <= min;
1226 bool too_big = is_number && lex_number (lexer) >= max;
1227 if (is_number && !too_small && !too_big)
1232 /* Weird, maybe a bug in the caller. Just report that we needed an
1235 lex_error (lexer, _("Syntax error expecting number for %s."), name);
1237 lex_error (lexer, _("Syntax error expecting number."));
1241 bool report_lower_bound = min > -DBL_MAX || too_small;
1242 bool report_upper_bound = max < DBL_MAX || too_big;
1244 if (report_lower_bound && report_upper_bound)
1247 lex_error (lexer, _("Syntax error expecting number "
1248 "in (%g,%g) for %s."),
1251 lex_error (lexer, _("Syntax error expecting number "
1252 "in (%g,%g)."), min, max);
1254 else if (report_lower_bound)
1259 lex_error (lexer, _("Syntax error expecting "
1260 "positive number for %s."), name);
1262 lex_error (lexer, _("Syntax error expecting "
1263 "positive number."));
1268 lex_error (lexer, _("Syntax error expecting number "
1269 "greater than %g for %s."),
1272 lex_error (lexer, _("Syntax error expecting number "
1273 "greater than %g."), min);
1276 else if (report_upper_bound)
1279 lex_error (lexer, _("Syntax error expecting number "
1280 "less than %g for %s."),
1283 lex_error (lexer, _("Syntax error expecting number "
1284 "less than %g."), max);
1289 lex_error (lexer, _("Syntax error expecting number "
1292 lex_error (lexer, _("Syntax error expecting number."));
1298 /* If the current token is an identifier, does nothing and returns true.
1299 Otherwise, reports an error and returns false.

   The token is only compared with T_ID, not consumed. */
1301 lex_force_id (struct lexer *lexer)
1303 if (lex_token (lexer) == T_ID)
1306 lex_error (lexer, _("Syntax error expecting identifier."));
1310 /* Token accessors. */
1312 /* Returns the type of LEXER's current token.
   Same as lex_next_token (LEXER, 0). */
1314 lex_token (const struct lexer *lexer)
1316 return lex_next_token (lexer, 0);
1319 /* Returns the number in LEXER's current token.
   Same as lex_next_tokval (LEXER, 0).

1321 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1322 tokens this function will always return zero. */
1324 lex_tokval (const struct lexer *lexer)
1326 return lex_next_tokval (lexer, 0);
1329 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
   Same as lex_next_tokcstr (LEXER, 0).

1331 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1332 this function will always return NULL.

1334 The UTF-8 encoding of the returned string is correct for variable names and
1335 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1336 data_in() to use it in a "union value". */
1338 lex_tokcstr (const struct lexer *lexer)
1340 return lex_next_tokcstr (lexer, 0);
1343 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1344 null-terminated (but the null terminator is not included in the returned
1345 substring's 'length').
1347 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1348 this function will always return a substring whose string is NULL.
1350 The UTF-8 encoding of the returned string is correct for variable names and
1351 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1352 data_in() to use it in a "union value".
Shorthand for lex_next_tokss() with a lookahead of 0. */
1354 lex_tokss (const struct lexer *lexer)
1356 return lex_next_tokss (lexer, 0);
1361 A value of 0 for N as an argument to any of these functions refers to the
1362 current token. Lookahead is limited to the current command. Any N greater
1363 than the number of tokens remaining in the current command will be treated
1364 as referring to a T_ENDCMD token. */
/* Returns the internal lex_token N tokens ahead of the current one in LEXER_,
   by delegating to lex_source_next__() on LEXER_'s current source.

   NOTE(review): a static T_STOP token is defined in the body, but the branch
   that selects it is not visible in this excerpt; presumably it is returned
   when lex_source__() yields no source.  Confirm against the full file. */
1366 static const struct lex_token *
1367 lex_next__ (const struct lexer *lexer_, int n)
1369 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1370 struct lex_source *src = lex_source__ (lexer);
1373 return lex_source_next__ (src, n);
1376 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
/* Returns the token in SRC at absolute offset OFS within SRC's 'parse' array
   (offset 0 is the first entry).

   While OFS is at or beyond the number of tokens already in 'parse', more
   tokens are pulled in with lex_source_get_parse().  Before each pull, the
   last token already in 'parse' is checked for T_STOP or T_ENDCMD.

   NOTE(review): two branch bodies are not visible in this excerpt.  The static
   T_ENDCMD token is presumably returned for an out-of-range (negative) OFS,
   and the T_STOP/T_ENDCMD check presumably returns that last token instead of
   reading past the end of the command.  Confirm against the full file. */
1381 static const struct lex_token *
1382 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1384 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1388 static const struct lex_token endcmd_token
1389 = { .token = { .type = T_ENDCMD } };
1390 return &endcmd_token;
1393 while (ofs >= src->n_parse)
1395 if (src->n_parse > 0)
1397 const struct lex_token *t = src->parse[src->n_parse - 1];
1398 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1402 lex_source_get_parse (src);
1405 return src->parse[ofs];
/* Returns the token in SRC that is N tokens after the current one.  This
   converts the relative lookahead N into an absolute offset by adding SRC's
   'parse_ofs' and then defers to lex_source_ofs__(). */
1408 static const struct lex_token *
1409 lex_source_next__ (const struct lex_source *src, int n)
1411 return lex_source_ofs__ (src, n + src->parse_ofs);
1414 /* Returns the "struct token" of the token N after the current one in LEXER.
1415 The returned pointer can be invalidated by pretty much any succeeding call
1416 into the lexer, although the string pointer within the returned token is
1417 only invalidated by consuming the token (e.g. with lex_get()).
The result points at the 'token' member of the internal lex_token that
lex_next__() finds. */
1418 const struct token *
1419 lex_next (const struct lexer *lexer, int n)
1421 return &lex_next__ (lexer, n)->token;
1424 /* Returns the type of the token N after the current one in LEXER.
This is the 'type' member of the token returned by lex_next(). */
1426 lex_next_token (const struct lexer *lexer, int n)
1428 return lex_next (lexer, n)->type;
1431 /* Returns the number in the token N after the current one in LEXER.
1433 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1434 tokens this function will always return zero.
The value is obtained with token_number(). */
1436 lex_next_tokval (const struct lexer *lexer, int n)
1438 return token_number (lex_next (lexer, n));
1441 /* Returns the null-terminated string in the token N after the current one, in
UTF-8 encoding.
1444 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1445 this function will always return NULL.
1447 The UTF-8 encoding of the returned string is correct for variable names and
1448 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1449 data_in() to use it in a "union value".
This is the 'string' pointer of the substring returned by lex_next_tokss(). */
1451 lex_next_tokcstr (const struct lexer *lexer, int n)
1453 return lex_next_tokss (lexer, n).string;
1456 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1457 The string is null-terminated (but the null terminator is not included in
1458 the returned substring's 'length').
1460 Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings. For other
1461 tokens this function will always return a substring whose string is NULL.
1463 The UTF-8 encoding of the returned string is correct for variable names and
1464 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1465 data_in() to use it in a "union value". */
1467 lex_next_tokss (const struct lexer *lexer, int n)
1469 return lex_next (lexer, n)->string;
1472 /* Returns the offset of the current token within the command being parsed in
1473 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1474 on. The return value is useful later for referring to this token in calls
to functions that take a token offset, such as lex_ofs_token().

Returns 0 if LEXER has no current source. */
1477 lex_ofs (const struct lexer *lexer)
1479 struct lex_source *src = lex_source__ (lexer);
1480 return src ? src->parse_ofs : 0;
1483 /* Returns the offset of the last token in the current command.

The search starts at the last token already held in the source's 'parse'
array (or at offset 0 if it is empty) and examines token types, looking for
T_ENDCMD or T_STOP.  Tokens are fetched through lex_source_ofs__().

NOTE(review): the loop structure and return statements are not visible in
this excerpt; presumably the offset of the first T_ENDCMD/T_STOP found is
returned, and 0 when there is no source.  Confirm against the full file. */
1485 lex_max_ofs (const struct lexer *lexer)
1487 struct lex_source *src = lex_source__ (lexer);
1491 int ofs = MAX (1, src->n_parse) - 1;
1494 enum token_type type = lex_source_ofs__ (src, ofs)->token.type;
1495 if (type == T_ENDCMD || type == T_STOP)
1502 /* Returns the token within LEXER's current command with offset OFS. Use
1503 lex_ofs() to find out the offset of the current token.

NOTE(review): a static T_STOP token is defined in the body, but the branch
that uses it is not visible in this excerpt; presumably it is returned when
LEXER has no current source.  Confirm against the full file. */
1504 const struct token *
1505 lex_ofs_token (const struct lexer *lexer_, int ofs)
1507 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1508 struct lex_source *src = lex_source__ (lexer);
/* lex_source_next__() expects an offset relative to the current token, so
   subtract the current parse offset from the absolute OFS. */
1511 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1514 static const struct token stop_token = { .type = T_STOP };
1519 /* Allocates and returns a new struct msg_location that spans tokens with
1520 offsets OFS0 through OFS1, inclusive, within the current command in
1521 LEXER. See lex_ofs() for an explanation of token offsets.
1523 The caller owns and must eventually free the returned object. */
1524 struct msg_location *
1525 lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
/* lex_get_location() takes offsets relative to the current token, so rebase
   the absolute offsets on the current token's offset. */
1527 int ofs = lex_ofs (lexer);
1528 return lex_get_location (lexer, ofs0 - ofs, ofs1 - ofs);
1531 /* Returns a msg_point for the first character in the token with offset OFS,
1532 where offset 0 is the first token in the command currently being parsed, 1
1533 the second token, and so on. These are absolute offsets, not relative to
1534 the token currently being parsed within the command.
1536 Returns zeros for a T_STOP token.
The point is computed by lex_token_start_point() on the token that
lex_source_ofs__() finds for OFS. */
1539 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1541 const struct lex_source *src = lex_source__ (lexer);
1543 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1544 : (struct msg_point) { 0, 0 });
1547 /* Returns a msg_point for the last character, inclusive, in the token with
1548 offset OFS, where offset 0 is the first token in the command currently being
1549 parsed, 1 the second token, and so on. These are absolute offsets, not
1550 relative to the token currently being parsed within the command.
1552 Returns zeros for a T_STOP token.
1554 Most of the time, a single token is wholly within a single line of syntax,
1555 so that the start and end point for a given offset have the same line
1556 number. There are two exceptions: a T_STRING token can be made up of
1557 multiple segments on adjacent lines connected with "+" punctuators, and a
1558 T_NEG_NUM token can consist of a "-" on one line followed by the number on
the next.

The point is computed by lex_token_end_point() on the token that
lex_source_ofs__() finds for OFS. */
1562 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1564 const struct lex_source *src = lex_source__ (lexer);
1566 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1567 : (struct msg_point) { 0, 0 });
1570 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1571 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1572 are both zero, this requests the syntax for the current token.)
1574 The caller must eventually free the returned string (with free()). The
1575 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1576 that, for example, it may include comments, spaces, and new-lines if it
1577 spans multiple tokens. Macro expansion, however, has already been
performed. */
1580 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1582 const struct lex_source *src = lex_source__ (lexer);
/* lex_source_syntax__() takes absolute offsets, so add the current parse
   offset to the relative N0 and N1. */
1584 ? lex_source_syntax__ (src, n0 + src->parse_ofs, n1 + src->parse_ofs)
1589 /* Returns the text of the syntax in tokens with offsets OFS0 to OFS1,
1590 inclusive. (For example, if OFS0 and OFS1 are both zero, this requests the
1591 syntax for the first token in the current command.)
1593 The caller must eventually free the returned string (with free()). The
1594 syntax is encoded in UTF-8 and in the original form supplied to the lexer so
1595 that, for example, it may include comments, spaces, and new-lines if it
1596 spans multiple tokens. Macro expansion, however, has already been
performed.

If LEXER has no current source, the result is a newly allocated empty
string. */
1599 lex_ofs_representation (const struct lexer *lexer, int ofs0, int ofs1)
1601 const struct lex_source *src = lex_source__ (lexer);
1602 return src ? lex_source_syntax__ (src, ofs0, ofs1) : xstrdup ("");
1605 /* Returns true if the token N ahead of the current one was produced by macro
1606 expansion, false otherwise.  A token counts as macro-produced when its
'macro_rep' member is non-null. */
1608 lex_next_is_from_macro (const struct lexer *lexer, int n)
1610 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Compares token ACTUAL (from the input) against token EXPECTED (from a
   phrase to be matched).

   The tokens never match if their types differ.  Otherwise the comparison
   depends on the type and is one of the following:

   - Numeric equality of the 'number' members.
   - lex_id_match() with EXPECTED's string as the first argument and ACTUAL's
     string as the second.
   - Exact comparison of the strings' lengths and bytes.

   NOTE(review): the "case" labels are not visible in this excerpt, so which
   token types select which comparison (and what the default is) must be
   confirmed against the full file. */
1614 lex_tokens_match (const struct token *actual, const struct token *expected)
1616 if (actual->type != expected->type)
1619 switch (actual->type)
1623 return actual->number == expected->number;
1626 return lex_id_match (expected->string, actual->string);
1629 return (actual->string.length == expected->string.length
1630 && !memcmp (actual->string.string, expected->string.string,
1631 actual->string.length));
/* Checks whether the tokens in LEXER starting at absolute offset OFS match the
   sequence of tokens obtained by lexing the string S.

   S is tokenized with a string_lexer in SEG_MODE_INTERACTIVE.  Each token it
   yields is compared, using lex_tokens_match(), against the input token at
   OFS plus the number of tokens matched so far, and is then released with
   token_uninit().  If N_MATCHEDP is used, it receives the count of matched
   tokens.

   NOTE(review): the remaining parameter declaration, the loop's exit on
   mismatch, and the return statement are not visible in this excerpt;
   presumably the function returns whether every token matched.  Confirm
   against the full file. */
1639 lex_ofs_at_phrase__ (struct lexer *lexer, int ofs, const char *s,
1642 struct string_lexer slex;
1645 size_t n_matched = 0;
1646 bool all_matched = true;
1647 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1648 while (string_lexer_next (&slex, &token))
1650 bool match = lex_tokens_match (lex_ofs_token (lexer, ofs + n_matched),
1652 token_uninit (&token);
1655 all_matched = false;
1661 *n_matchedp = n_matched;
1665 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1666 returns true. Otherwise, returns false.  No tokens are consumed either way.
1668 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1669 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1670 first three letters. */
1672 lex_at_phrase (struct lexer *lexer, const char *s)
1674 return lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, NULL);
1677 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1678 skips it and returns true. Otherwise, returns false.
1680 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1681 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1682 first three letters.

On a match, the matched tokens are skipped with lex_get_n(). */
1684 lex_match_phrase (struct lexer *lexer, const char *s)
1687 if (!lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, &n_matched))
1689 lex_get_n (lexer, n_matched);
1693 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1694 skips it and returns true. Otherwise, issues an error and returns false.
1696 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1697 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1698 first three letters.

The error is issued with lex_next_error() for the tokens from the current one
through the one N_MATCHED tokens ahead, where N_MATCHED is the number of
tokens that did match. */
1700 lex_force_match_phrase (struct lexer *lexer, const char *s)
1703 bool ok = lex_ofs_at_phrase__ (lexer, lex_ofs (lexer), s, &n_matched);
1705 lex_get_n (lexer, n_matched);
1707 lex_next_error (lexer, 0, n_matched, _("Syntax error expecting `%s'."), s);
1711 /* Returns the 1-based line number of the source text at the byte OFFSET in
SRC.

The lookup bisects SRC->lines[], comparing OFFSET against the entries at the
midpoint and the one after it.  (lex_source_try_get_pp() appends an entry to
that array, holding the buffer position just past each new-line segment.)
When the midpoint reaches the last entry, the result is SRC->n_lines. */
1714 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1717 size_t hi = src->n_lines;
1720 size_t mid = (lo + hi) / 2;
1721 if (mid + 1 >= src->n_lines)
1722 return src->n_lines;
1723 else if (offset >= src->lines[mid + 1])
1725 else if (offset < src->lines[mid])
1732 /* Returns the 1-based column number of the source text at the byte OFFSET in
SRC.

The line containing OFFSET starts just after the last new-line found in the
first OFFSET bytes of SRC's buffer, or at the start of the buffer if there is
none.  The column is the count that utf8_count_columns() reports for the text
from the start of that line up to OFFSET, plus 1. */
1735 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1737 const char *newline = memrchr (src->buffer, '\n', offset);
1738 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1739 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
/* Returns the msg_point (1-based line and column) in SRC that corresponds to
   byte OFFSET in SRC's buffer. */
1742 static struct msg_point
1743 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1745 return (struct msg_point) {
1746 .line = lex_source_ofs_to_line_number (src, offset),
1747 .column = lex_source_ofs_to_column_number (src, offset),
/* Returns the msg_point in SRC for the first byte of TOKEN, that is, for
   TOKEN's 'token_pos'. */
1751 static struct msg_point
1752 lex_token_start_point (const struct lex_source *src,
1753 const struct lex_token *token)
1755 return lex_source_ofs_to_point__ (src, token->token_pos);
/* Returns the msg_point in SRC for the end of TOKEN, as given by
   lex_token_end(). */
1758 static struct msg_point
1759 lex_token_end_point (const struct lex_source *src,
1760 const struct lex_token *token)
1762 return lex_source_ofs_to_point__ (src, lex_token_end (token));
/* Returns, by value, a msg_location that spans from the start of token T0 to
   the end of token T1 in SRC.  The file name is interned from SRC's reader (if
   it has one) and the location's 'src' member points back at SRC.

   NOTE(review): no reference to SRC is taken in the lines visible here;
   callers that keep the location presumably duplicate it (see
   lex_token_location_rw()). */
1765 static struct msg_location
1766 lex_token_location (const struct lex_source *src,
1767 const struct lex_token *t0,
1768 const struct lex_token *t1)
1770 return (struct msg_location) {
1771 .file_name = intern_new_if_nonnull (src->reader->file_name),
1772 .start = lex_token_start_point (src, t0),
1773 .end = lex_token_end_point (src, t1),
1774 .src = CONST_CAST (struct lex_source *, src),
/* Like lex_token_location(), but returns a copy made with msg_location_dup()
   instead of a by-value struct. */
1778 static struct msg_location *
1779 lex_token_location_rw (const struct lex_source *src,
1780 const struct lex_token *t0,
1781 const struct lex_token *t1)
1783 struct msg_location location = lex_token_location (src, t0, t1);
1784 return msg_location_dup (&location);
/* Returns a msg_location, produced by lex_token_location_rw(), that spans the
   tokens in SRC with absolute offsets OFS0 through OFS1. */
1787 static struct msg_location *
1788 lex_source_get_location (const struct lex_source *src, int ofs0, int ofs1)
1790 return lex_token_location_rw (src,
1791 lex_source_ofs__ (src, ofs0),
1792 lex_source_ofs__ (src, ofs1));
1795 /* Returns the name of the syntax file from which the current command is drawn.
1796 Returns NULL for a T_STOP token (that is, when LEXER has no current source);
otherwise returns whatever file name the source's reader carries.
1799 There is no version of this function that takes an N argument because
1800 lookahead only works to the end of a command and any given command is always
1801 within a single syntax file. */
1803 lex_get_file_name (const struct lexer *lexer)
1805 struct lex_source *src = lex_source__ (lexer);
1806 return src == NULL ? NULL : src->reader->file_name;
1809 /* Returns a newly allocated msg_location for the syntax that represents tokens
1810 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1811 must eventually free the location (with msg_location_destroy()). */
1812 struct msg_location *
1813 lex_get_location (const struct lexer *lexer, int n0, int n1)
1815 struct msg_location *loc = xmalloc (sizeof *loc);
1816 *loc = (struct msg_location) {
1817 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
/* The start/end helpers take absolute offsets, so rebase N0 and N1 on the
   current token's offset. */
1818 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1819 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1820 .src = lex_source__ (lexer),
/* The location points at the source, so lex_source_ref() is called on it. */
1822 lex_source_ref (loc->src);
/* Returns the 'encoding' member of the reader for LEXER's current source, or
   NULL if LEXER has no current source. */
1827 lex_get_encoding (const struct lexer *lexer)
1829 struct lex_source *src = lex_source__ (lexer);
1830 return src == NULL ? NULL : src->reader->encoding;
1833 /* Returns the syntax mode for the syntax file from which the current command
1834 is drawn. Returns SEG_MODE_AUTO for a T_STOP token, that is, when LEXER has
1835 no current source.
1837 There is no version of this function that takes an N argument because
1838 lookahead only works to the end of a command and any given command is always
1839 within a single syntax file. */
1841 lex_get_syntax_mode (const struct lexer *lexer)
1843 struct lex_source *src = lex_source__ (lexer);
1844 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1847 /* Returns the error mode for the syntax file from which the current command
1848 is drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token, that is, when LEXER
1849 has no current source.
1851 There is no version of this function that takes an N argument because
1852 lookahead only works to the end of a command and any given command is always
1853 within a single syntax file. */
1855 lex_get_error_mode (const struct lexer *lexer)
1857 struct lex_source *src = lex_source__ (lexer);
1858 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1861 /* If the source that LEXER is currently reading has error mode
1862 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1863 token to be read comes directly from whatever is next read from the stream.
1865 It makes sense to call this function after encountering an error in a
1866 command entered on the console, because usually the user would prefer not to
1867 have cascading errors.

Does nothing if LEXER has no current source or the source has a different
error mode. */
1869 lex_interactive_reset (struct lexer *lexer)
1871 struct lex_source *src = lex_source__ (lexer);
1872 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Rewind the journaling and segmentation positions to the start of the
   buffer. */
1875 src->journal_pos = src->seg_pos = 0;
1877 src->suppress_next_newline = false;
/* Start a fresh segmenter in the same mode as the old one. */
1878 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Drop the tokens in every stage, then push a T_ENDCMD token onto the
   now-empty 'parse' array, as lex_source_create() also does. */
1880 lex_stage_clear (&src->pp);
1881 lex_stage_clear (&src->merge);
1882 lex_source_clear_parse (src);
1883 lex_source_push_endcmd__ (src);
1887 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP.  The
T_ENDCMD or T_STOP token itself is not consumed: the loop stops as soon as it
becomes the current token. */
1889 lex_discard_rest_of_command (struct lexer *lexer)
1891 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1895 /* Discards all lookahead tokens in LEXER, then discards all input sources
1896 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1897 runs out of input sources.

NOTE(review): the current source's error mode is first compared against
LEX_ERROR_IGNORE, but the action taken is not visible in this excerpt;
presumably the function returns without discarding anything in that case.
Confirm against the full file. */
1899 lex_discard_noninteractive (struct lexer *lexer)
1901 struct lex_source *src = lex_source__ (lexer);
1904 if (src->reader->error == LEX_ERROR_IGNORE)
/* Drop the tokens buffered in every stage of the current source. */
1907 lex_stage_clear (&src->pp);
1908 lex_stage_clear (&src->merge);
1909 lex_source_clear_parse (src);
/* Unlink and release sources from the head of the list until one with error
   mode LEX_ERROR_TERMINAL is at the head or none are left. */
1911 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1912 src = lex_source__ (lexer))
1914 ll_remove (&src->ll);
1915 lex_source_unref (src);
/* Grows SRC's buffer with x2realloc() if it is full, that is, if the number of
   bytes in use has reached the allocated size.  Otherwise does nothing. */
1921 lex_source_expand__ (struct lex_source *src)
1923 if (src->length >= src->allocated)
1924 src->buffer = x2realloc (src->buffer, &src->allocated);
/* Reads more input from SRC's reader into the free space at the end of SRC's
   buffer, first growing the buffer with lex_source_expand__() if it is full.
   The reader's 'read' callback is called with the destination and the prompt
   style obtained from the segmenter.

   The loop condition at the end tests whether the part of the buffer not yet
   segmented (from 'seg_pos' onward) lacks a new-line.

   NOTE(review): the loop header, the condition under which the reader's 'eof'
   flag is set, and how 'length' is updated are not visible in this excerpt.
   Presumably reading repeats until a new-line is buffered or end of input is
   reached (a read of zero bytes).  Confirm against the full file. */
1928 lex_source_read__ (struct lex_source *src)
1932 lex_source_expand__ (src);
1934 size_t space = src->allocated - src->length;
1935 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1936 size_t n = src->reader->class->read (src->reader,
1937 &src->buffer[src->length],
/* The reader must not report more bytes than there was room for. */
1939 assert (n <= space);
1944 src->reader->eof = true;
1950 while (!memchr (&src->buffer[src->seg_pos], '\n',
1951 src->length - src->seg_pos));
/* Returns the lex_source at the head of LEXER's list of sources, or NULL if
   the list is empty. */
1954 static struct lex_source *
1955 lex_source__ (const struct lexer *lexer)
1957 return (ll_is_empty (&lexer->sources) ? NULL
1958 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Public, const-returning wrapper around lex_source__(): returns the
   lex_source at the head of LEXER's list of sources, or NULL if there is
   none. */
1961 const struct lex_source *
1962 lex_source (const struct lexer *lexer)
1964 return lex_source__ (lexer);
1967 /* Returns the text of the syntax in SRC for tokens with offsets OFS0 through
1968 OFS1 in the current command, inclusive. (For example, if OFS0 and OFS1 are
1969 both zero, this requests the syntax for the first token in the current
1970 command.) The caller must eventually free the returned string (with
1971 free()). The syntax is encoded in UTF-8 and in the original form supplied
1972 to the lexer so that, for example, it may include comments, spaces, and
1973 new-lines if it spans multiple tokens. Macro expansion, however, has
1974 already been performed. */
1976 lex_source_syntax__ (const struct lex_source *src, int ofs0, int ofs1)
1978 struct string s = DS_EMPTY_INITIALIZER;
1979 for (size_t i = ofs0; i <= ofs1; )
1981 /* Find [I,J) as the longest sequence of tokens not produced by macro
1982 expansion, or otherwise the longest sequence expanded from a single
macro call. */
1984 const struct lex_token *first = lex_source_ofs__ (src, i);
1986 for (j = i + 1; j <= ofs1; j++)
1988 const struct lex_token *cur = lex_source_ofs__ (src, j);
1989 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1990 || first->macro_rep != cur->macro_rep)
1993 const struct lex_token *last = lex_source_ofs__ (src, j - 1);
1995 /* Now add the syntax for this sequence of tokens to S, separated from any
earlier sequence by a single space. */
1996 if (!ds_is_empty (&s))
1997 ds_put_byte (&s, ' ');
1998 if (!first->macro_rep)
/* Ordinary tokens: copy the span from the start of FIRST to the end of LAST
   out of the source buffer. */
2000 size_t start = first->token_pos;
2001 size_t end = last->token_pos + last->token_len;
2002 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
/* Macro-expanded tokens: copy the span from the shared expansion text, using
   the tokens' 'ofs' and 'len' within it. */
2006 size_t start = first->ofs;
2007 size_t end = last->ofs + last->len;
2008 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
2014 return ds_steal_cstr (&s);
/* Scans the tokens in SRC with absolute offsets OFS0 through OFS1, inclusive,
   checking each for a non-null 'macro_rep', which marks a token produced by
   macro expansion.

   NOTE(review): the return statements are not visible in this excerpt;
   presumably the result is true if any token in the range has a 'macro_rep'
   and false otherwise. */
2018 lex_source_contains_macro_call (struct lex_source *src, int ofs0, int ofs1)
2020 for (int i = ofs0; i <= ofs1; i++)
2021 if (lex_source_ofs__ (src, i)->macro_rep)
2026 /* If tokens OFS0...OFS1 (inclusive) in SRC contain a macro call, this returns
2027 the raw UTF-8 syntax for the macro call (not for the expansion) and for any
2028 other tokens included in that range. The syntax is encoded in UTF-8 and in
2029 the original form supplied to the lexer so that, for example, it may include
2030 comments, spaces, and new-lines if it spans multiple tokens.
2032 Returns an empty string if the token range doesn't include a macro call.
2034 The caller must not modify or free the returned string: it points directly
into SRC's buffer. */
2035 static struct substring
2036 lex_source_get_macro_call (struct lex_source *src, int ofs0, int ofs1)
2038 if (!lex_source_contains_macro_call (src, ofs0, ofs1))
2041 const struct lex_token *token0 = lex_source_ofs__ (src, ofs0);
/* MAX() keeps the end token from preceding the start token when OFS1 is less
   than OFS0. */
2042 const struct lex_token *token1 = lex_source_ofs__ (src, MAX (ofs0, ofs1));
2043 size_t start = token0->token_pos;
2044 size_t end = token1->token_pos + token1->token_len;
2046 return ss_buffer (&src->buffer[start], end - start);
/* Composes a message of class CLASS about the tokens in SRC with absolute
   offsets OFS0 through OFS1.

   The text is assembled in this order:

   - An optional prefix: either "In syntax expanded from `...'", naming the
     ellipsized macro call obtained from lex_source_get_macro_call(), or
     "At end of input".
   - ": " if a prefix was added.
   - Either FORMAT expanded with ARGS, or the generic "Syntax error.".
   - A final period if the text does not already end in one.

   The message's location comes from lex_source_get_location() when SRC is
   nonnull and is NULL otherwise.

   NOTE(review): the conditions that select the prefix and the FORMAT
   alternative, and what is finally done with the message, are not visible in
   this excerpt.  Confirm against the full file. */
2050 lex_source_msg_valist (struct lex_source *src, enum msg_class class,
2051 int ofs0, int ofs1, const char *format, va_list args)
2053 struct string s = DS_EMPTY_INITIALIZER;
2057 /* Get the macro call(s) that expanded to the syntax that caused the
error. */
2060 str_ellipsize (lex_source_get_macro_call (src, ofs0, ofs1),
2063 ds_put_format (&s, _("In syntax expanded from `%s'"), call);
2066 ds_put_cstr (&s, _("At end of input"));
2068 if (!ds_is_empty (&s))
2069 ds_put_cstr (&s, ": ");
2071 ds_put_vformat (&s, format, args);
2073 ds_put_cstr (&s, _("Syntax error."));
2075 if (ds_last (&s) != '.')
2076 ds_put_byte (&s, '.');
2078 struct msg *m = xmalloc (sizeof *m);
2080 .category = msg_class_to_category (class),
2081 .severity = msg_class_to_severity (class),
2082 .location = src ? lex_source_get_location (src, ofs0, ofs1) : NULL,
2083 .text = ds_steal_cstr (&s),
/* Composes a syntax error message (category MSG_C_SYNTAX, severity
   MSG_S_ERROR) for TOKEN in SRC.  The message text is the string stored in
   TOKEN's own token data and the location covers just TOKEN.

   An ellipsized copy of TOKEN's source text is also prepared in 'syntax'.

   NOTE(review): the declaration and any use of 'syntax', and what is finally
   done with the message, are not visible in this excerpt. */
2089 lex_get_error (struct lex_source *src, const struct lex_token *token)
2092 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
2093 syntax, sizeof syntax);
2095 struct string s = DS_EMPTY_INITIALIZER;
2096 ds_put_cstr (&s, token->token.string.string);
2098 struct msg *m = xmalloc (sizeof *m);
2100 .category = MSG_C_SYNTAX,
2101 .severity = MSG_S_ERROR,
2102 .location = lex_token_location_rw (src, token, token),
2103 .text = ds_steal_cstr (&s),
2108 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
2109 underlying lex_reader if necessary. Returns true if a new token was added
2110 to SRC's deque, false otherwise. The caller should retry failures unless
2111 SRC's 'eof' marker was set to true indicating that there will be no more
2112 tokens from this source.

The work proceeds in four steps: extract one segment from the buffer, record
line starts, pass completed lines to the output engine as syntax text items,
and finally convert the segment into a token. */
2114 lex_source_try_get_pp (struct lex_source *src)
2116 /* Append a new token to SRC and initialize it. */
2117 struct lex_token *token = xmalloc (sizeof *token);
2118 token->token = (struct token) { .type = T_STOP };
2119 token->macro_rep = NULL;
2120 token->ref_cnt = NULL;
2121 token->token_pos = src->seg_pos;
2123 /* Extract a segment. */
2124 const char *segment;
2125 enum segment_type seg_type;
2129 segment = &src->buffer[src->seg_pos];
2130 seg_len = segmenter_push (&src->segmenter, segment,
2131 src->length - src->seg_pos,
2132 src->reader->eof, &seg_type);
2136 /* The segmenter needs more input to produce a segment. */
2137 assert (!src->reader->eof);
2138 lex_source_read__ (src);
2141 /* Update state based on the segment. */
2142 token->token_len = seg_len;
2143 src->seg_pos += seg_len;
2144 if (seg_type == SEG_NEWLINE)
/* Record the buffer position just past the new-line, growing the array of
   line positions first if it is full. */
2146 if (src->n_lines >= src->allocated_lines)
2147 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
2148 sizeof *src->lines);
2149 src->lines[src->n_lines++] = src->seg_pos;
2152 /* Get a token from the segment. */
2153 enum tokenize_result result = token_from_segment (
2154 seg_type, ss_buffer (segment, seg_len), &token->token);
2156 /* If we've reached the end of a line, or the end of a command, then pass
2157 the line to the output engine as a syntax text item. */
/* NOTE(review): the adjustments made to 'n_lines' in the two branches below
   are not visible in this excerpt.  Presumably an end-of-command adds a line
   to emit and the new-line that follows it is then skipped, which is what
   'suppress_next_newline' tracks. */
2158 int n_lines = seg_type == SEG_NEWLINE;
2159 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
2162 src->suppress_next_newline = true;
2164 else if (n_lines > 0 && src->suppress_next_newline)
2167 src->suppress_next_newline = false;
2169 for (int i = 0; i < n_lines; i++)
2171 /* Beginning of line. */
2172 const char *line = &src->buffer[src->journal_pos];
2174 /* Calculate line length, including \n or \r\n end-of-line if present.
2176 We use src->length even though that may be beyond what we've actually
2177 converted to tokens. That's because, if we're emitting the line due
2178 to SEG_END_COMMAND, we want to take the whole line through the
2179 newline, not just through the '.'. */
2180 size_t max_len = src->length - src->journal_pos;
2181 const char *newline = memchr (line, '\n', max_len);
2182 size_t line_len = newline ? newline - line + 1 : max_len;
2184 /* Calculate line length excluding end-of-line. */
2185 size_t copy_len = line_len;
2186 if (copy_len > 0 && line[copy_len - 1] == '\n')
2188 if (copy_len > 0 && line[copy_len - 1] == '\r')
2191 /* Submit the line as syntax. */
2192 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
2193 xmemdup0 (line, copy_len),
2196 src->journal_pos += line_len;
/* Dispatch on the tokenizer's result.  NOTE(review): the "switch" line, the
   statements between cases, and the return statements are not visible in this
   excerpt. */
2201 case TOKENIZE_ERROR:
2202 lex_get_error (src, token);
2204 case TOKENIZE_EMPTY:
2205 lex_token_destroy (token);
2208 case TOKENIZE_TOKEN:
/* A T_STOP token from the segment is rewritten as T_ENDCMD before it is
   queued. */
2209 if (token->token.type == T_STOP)
2211 token->token.type = T_ENDCMD;
2214 lex_stage_push_last (&src->pp, token);
2220 /* Attempts to append a new token to SRC. Returns true if successful, false on
2221 failure. On failure, the end of SRC has been reached and no more tokens
2222 will be forthcoming from it.
2224 Does not make the new token available for lookahead yet; the caller must
2225 adjust SRC's 'middle' pointer to do so.

This is a retry wrapper around lex_source_try_get_pp().  NOTE(review): the
loop condition is not visible in this excerpt; presumably it retries until
the reader reaches end of input. */
2227 lex_source_get_pp (struct lex_source *src)
2230 if (lex_source_try_get_pp (src))
/* Attempts to move tokens from the 'pp' stage of SRC_ into its 'merge' stage,
   expanding a macro call on the way if one starts at the first 'pp' token.

   - If 'pp' is empty and no further token can be obtained, nothing is moved.
   - If macro expansion is disabled (settings_get_mexpand()), everything in
     'pp' is shifted to 'merge' unchanged.
   - Otherwise the first token is offered to macro_call_create() and further
     tokens are fed to macro_call_add() until the call is resolved.  If it was
     not a macro call after all, only the first token is shifted to 'merge'.
     If it was, the call's tokens are replaced by the tokens of its expansion.

   The final return value is whether the expansion produced any tokens.

   NOTE(review): the return statements of the earlier exits and several
   conditions are not visible in this excerpt. */
2236 lex_source_try_get_merge (const struct lex_source *src_)
2238 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2240 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
2243 if (!settings_get_mexpand ())
2245 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
2249 /* Now pass tokens one-by-one to the macro expander.
2251 In the common case where there is no macro to expand, the loop is not
entered. */
2253 struct macro_call *mc;
2254 int n_call = macro_call_create (src->lexer->macros,
2255 &lex_stage_first (&src->pp)->token, &mc);
2256 for (int ofs = 1; !n_call; ofs++)
2258 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
2260 /* This should not be reachable because we always get a T_ENDCMD at
2261 the end of an input file (transformed from T_STOP by
2262 lex_source_try_get_pp()) and the macro_expander should always
2263 terminate expansion on T_ENDCMD. */
2267 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
2268 const struct macro_token mt = {
2270 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
2272 const struct msg_location loc = lex_token_location (src, t, t);
2273 n_call = macro_call_add (mc, &mt, &loc);
2277 /* False alarm: no macro expansion after all. Use first token as
2278 lookahead. We'll retry macro expansion from the second token next
time around. */
2280 macro_call_destroy (mc);
2281 lex_stage_shift (&src->merge, &src->pp, 1);
2285 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
2286 are a macro call. (These are likely to be the only tokens in 'pp'.) */
2288 const struct lex_token *c0 = lex_stage_first (&src->pp);
2289 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
2290 struct macro_tokens expansion = { .n = 0 };
2291 struct msg_location loc = lex_token_location (src, c0, c1);
2292 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
2293 macro_call_destroy (mc);
2295 /* Convert the macro expansion into syntax for possible error messages
later. */
2297 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
2298 size_t *len = xnmalloc (expansion.n, sizeof *len);
2299 struct string s = DS_EMPTY_INITIALIZER;
2300 macro_tokens_to_syntax (&expansion, &s, ofs, len);
2302 if (settings_get_mprint ())
2303 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
2304 _("Macro Expansion")));
2306 /* Append the macro expansion tokens to the lookahead. */
2307 if (expansion.n > 0)
/* All tokens from one expansion share a single copy of the expansion text
   ('macro_rep') and one counter, initialized to the number of tokens. */
2309 char *macro_rep = ds_steal_cstr (&s);
2310 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
2311 *ref_cnt = expansion.n;
2312 for (size_t i = 0; i < expansion.n; i++)
2314 struct lex_token *token = xmalloc (sizeof *token);
2315 *token = (struct lex_token) {
2316 .token = expansion.mts[i].token,
/* Each expanded token is given the source span of the whole macro call,
   C0 through C1. */
2317 .token_pos = c0->token_pos,
2318 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
2319 .macro_rep = macro_rep,
2324 lex_stage_push_last (&src->merge, token);
2326 ss_dealloc (&expansion.mts[i].syntax);
2331 free (expansion.mts);
2335 /* Destroy the tokens for the call. */
2336 for (size_t i = 0; i < n_call; i++)
2337 lex_stage_pop_first (&src->pp);
2339 return expansion.n > 0;
2342 /* Attempts to obtain at least one new token into 'merge' in SRC.
2344 Returns true if successful, false on failure. In the latter case, SRC is
2345 exhausted and 'src->eof' is now true.

This is a retry wrapper around lex_source_try_get_merge().  NOTE(review): the
loop condition is not visible in this excerpt; presumably it retries until
the reader reaches end of input. */
2347 lex_source_get_merge (struct lex_source *src)
2350 if (lex_source_try_get_merge (src))
2355 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
2357 Returns true if successful, false on failure. In the latter case, SRC is
2358 exhausted and 'src->eof' is now true.

Tokens from the 'merge' stage are fed one at a time to a merger.  Depending
on merger_add()'s result, either the first 'merge' token is moved to the
'parse' array as-is, or the first RETVAL 'merge' tokens are replaced by one
combined token that is pushed onto 'parse'.

NOTE(review): the condition for the first outcome and the handling of a
negative result are not visible in this excerpt. */
2360 lex_source_get_parse (struct lex_source *src)
2362 struct merger m = MERGER_INIT;
2364 for (size_t i = 0; ; i++)
2366 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2368 /* We always get a T_ENDCMD at the end of an input file
2369 (transformed from T_STOP by lex_source_try_get_pp()) and
2370 merger_add() should never return -1 on T_ENDCMD. */
2371 assert (lex_stage_is_empty (&src->merge));
2375 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2379 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
2382 else if (retval > 0)
2384 /* Add a token that merges all the tokens together. */
2385 const struct lex_token *first = lex_stage_first (&src->merge);
2386 const struct lex_token *last = lex_stage_nth (&src->merge,
/* The merged token keeps macro information only if FIRST comes from a macro
   expansion and LAST comes from the same one. */
2388 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2389 struct lex_token *t = xmalloc (sizeof *t);
2390 *t = (struct lex_token) {
2392 .token_pos = first->token_pos,
2393 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2395 /* This works well if all the tokens were not expanded from macros,
2396 or if they came from the same macro expansion. It just gives up
2397 in the other (corner) cases. */
2398 .macro_rep = macro ? first->macro_rep : NULL,
2399 .ofs = macro ? first->ofs : 0,
2400 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2401 .ref_cnt = macro ? first->ref_cnt : NULL,
2405 lex_source_push_parse (src, t);
/* Remove the RETVAL original tokens that the merged token replaces. */
2407 for (int i = 0; i < retval; i++)
2408 lex_stage_pop_first (&src->merge);
/* Pushes a newly allocated T_ENDCMD token, with all other members zeroed, onto
   SRC's 'parse' array.  The array must be empty when this is called. */
2415 lex_source_push_endcmd__ (struct lex_source *src)
2417 assert (src->n_parse == 0);
2419 struct lex_token *token = xmalloc (sizeof *token);
2420 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2421 lex_source_push_parse (src, token);
/* Appends TOKEN to the end of SRC's 'parse' array, growing the array with
   x2nrealloc() first if it is full.  The array stores the pointer itself, not
   a copy of the token. */
2425 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2427 if (src->n_parse >= src->allocated_parse)
2428 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2429 sizeof *src->parse);
2430 src->parse[src->n_parse++] = token;
/* Destroys every token held in SRC's parse array and marks the array empty.
   The array's storage is not released here, only its contents. */
2434 lex_source_clear_parse (struct lex_source *src)
2436 for (size_t i = 0; i < src->n_parse; i++)
2437 lex_token_destroy (src->parse[i]);
/* Reset both the token count and the parse offset. */
2438 src->n_parse = src->parse_ofs = 0;
/* Allocates and initializes a new lex_source for LEXER and READER.

   NOTE(review): most of the struct initializer and the end of the function
   are not visible in this excerpt; only the statements shown are described
   below. */
2441 static struct lex_source *
2442 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
/* Initial capacity, in elements, of the LINES array allocated next.
   Presumably this array becomes the new source's `lines` member (that part
   of the initializer is elided) -- confirm. */
2444 size_t allocated_lines = 4;
2445 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2448 struct lex_source *src = xmalloc (sizeof *src);
2449 *src = (struct lex_source) {
/* The segmenter is set up with the syntax mode taken from READER. */
2452 .segmenter = segmenter_init (reader->syntax, false),
2456 .allocated_lines = allocated_lines,
/* Start the new source off with a single T_ENDCMD token in its parse
   array. */
2459 lex_source_push_endcmd__ (src);
/* Installs, via msg_set_handler(), a message handler built from OUTPUT_MSG
   and this file's lex_source_ref(), lex_source_unref() and
   lex_source_get_line() functions.

   NOTE(review): part of the parameter list and of the initializer is elided
   in this excerpt. */
2465 lex_set_message_handler (struct lexer *lexer,
2466 void (*output_msg) (const struct msg *,
2469 struct msg_handler msg_handler = {
/* OUTPUT_MSG is cast to the handler's callback type, whose second parameter
   is a plain `void *'. */
2470 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2472 .lex_source_ref = lex_source_ref,
2473 .lex_source_unref = lex_source_unref,
2474 .lex_source_get_line = lex_source_get_line,
2476 msg_set_handler (&msg_handler);
/* Reference-counting helper for a lex_source.

   NOTE(review): only part of the body is visible in this excerpt; the
   statement that changes the count and the return are elided.  Presumably
   this adds one reference to SRC_ and returns it -- confirm. */
2480 lex_source_ref (const struct lex_source *src_)
/* Drop the const qualifier so that the source can be accessed through a
   non-const pointer. */
2482 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* A source whose reference count has already fallen to zero must not be
   passed in. */
2485 assert (src->n_refs > 0);
/* Drops one reference to SRC and tears the source down once the count is no
   longer positive.

   NOTE(review): several lines (including what happens when references
   remain, and the final frees) are elided in this excerpt. */
2492 lex_source_unref (struct lex_source *src)
2497 assert (src->n_refs > 0);
/* Other references remain.  (The statement taken in this case is elided;
   presumably the function returns without destroying anything.) */
2498 if (--src->n_refs > 0)
/* Pick up the reader's strings before the reader's destroy callback runs;
   presumably the reader may no longer be accessed afterward -- confirm. */
2501 char *file_name = src->reader->file_name;
2502 char *encoding = src->reader->encoding;
/* The destroy callback is optional. */
2503 if (src->reader->class->destroy != NULL)
2504 src->reader->class->destroy (src->reader);
/* Release the tokens still held in the `pp' and `merge' stages and in the
   parse array. */
2509 lex_stage_uninit (&src->pp);
2510 lex_stage_uninit (&src->merge);
2511 lex_source_clear_parse (src);
/* A lex_reader that obtains its input from a u8_istream. */
2516 struct lex_file_reader
/* Embedded generic reader; lex_file_reader_cast() converts a pointer to this
   member back into a pointer to the whole structure. */
2518 struct lex_reader reader;
/* Stream that lex_file_read() reads from and lex_file_close() disposes
   of. */
2519 struct u8_istream *istream;
/* Forward declaration: lex_reader_for_file() takes the address of this
   object before its definition appears further down. */
2522 static struct lex_reader_class lex_file_reader_class;
2524 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2525 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2526 ENCODING, which should take one of the forms accepted by
2527 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2528 mode of the new reader, respectively.
2530 Returns a null pointer if FILE_NAME cannot be opened. */
2532 lex_reader_for_file (const char *file_name, const char *encoding,
2533 enum segmenter_mode syntax,
2534 enum lex_error_mode error)
2536 struct lex_file_reader *r;
2537 struct u8_istream *istream;
/* "-" selects the already-open standard input descriptor; any other name is
   opened read-only. */
2539 istream = (!strcmp(file_name, "-")
2540 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2541 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2542 if (istream == NULL)
/* Report the failure, using errno for the reason. */
2544 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2548 r = xmalloc (sizeof *r);
2549 lex_reader_init (&r->reader, &lex_file_reader_class);
2550 r->reader.syntax = syntax;
2551 r->reader.error = error;
/* The reader keeps its own copies of FILE_NAME and (when non-null) ENCODING,
   so the caller's strings are not retained. */
2552 r->reader.file_name = xstrdup (file_name);
2553 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* Line numbering starts at 1. */
2554 r->reader.line_number = 1;
2555 r->istream = istream;
/* Returns the lex_file_reader that contains R as its `reader' member.  R
   must really be embedded in a struct lex_file_reader. */
2560 static struct lex_file_reader *
2561 lex_file_reader_cast (struct lex_reader *r)
2563 return UP_CAST (r, struct lex_file_reader, reader);
/* Read callback for file readers: reads up to N bytes from the reader's
   stream into BUF.  PROMPT_STYLE is not used.

   NOTE(review): the test that guards the error report and the return
   statements are elided in this excerpt. */
2567 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2568 enum prompt_style prompt_style UNUSED)
2570 struct lex_file_reader *r = lex_file_reader_cast (r_);
2571 ssize_t n_read = u8_istream_read (r->istream, buf, n);
/* Report a read error, naming the file and the errno reason.  Presumably
   reached only when N_READ indicates failure -- confirm. */
2574 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Close callback for file readers: disposes of the reader's stream.

   NOTE(review): the braces, the `else' and the end of the function are
   elided in this excerpt. */
2581 lex_file_close (struct lex_reader *r_)
2583 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* Only a stream that is not reading standard input is closed here. */
2585 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
/* A failure to close is reported but not otherwise acted on in the lines
   shown. */
2587 if (u8_istream_close (r->istream) != 0)
2588 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
/* Presumably the alternative taken for standard input: the stream object is
   freed without going through u8_istream_close() -- confirm against the
   elided control flow. */
2591 u8_istream_free (r->istream);
/* Definition of the reader class used by lex_reader_for_file().
   NOTE(review): the initializer is elided in this excerpt. */
2596 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that obtains its input from an in-memory string.

   NOTE(review): only the first member is visible in this excerpt;
   lex_string_read() also uses members named `s' and `offset'. */
2602 struct lex_string_reader
/* Embedded generic reader; lex_string_reader_cast() converts a pointer to
   this member back into a pointer to the whole structure. */
2604 struct lex_reader reader;
/* Forward declaration: lex_reader_for_substring_nocopy() takes the address of
   this object before its definition appears further down. */
2609 static struct lex_reader_class lex_string_reader_class;
2611 /* Creates and returns a new lex_reader for the contents of S, which must be
2612 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2613 with ss_dealloc() when it is closed. */
2615 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2617 struct lex_string_reader *r;
2619 r = xmalloc (sizeof *r);
2620 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers are always created with syntax mode SEG_MODE_AUTO. */
2621 r->reader.syntax = SEG_MODE_AUTO;
/* Unlike S, ENCODING is copied (when non-null), so the caller keeps
   ownership of its string. */
2622 r->reader.encoding = xstrdup_if_nonnull (encoding);
2629 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2630 which must be encoded in ENCODING. The caller retains ownership of S. */
2632 lex_reader_for_string (const char *s, const char *encoding)
/* ss_clone() produces the copy of S; that copy, not S itself, is what the
   "nocopy" constructor takes over. */
2634 return lex_reader_for_substring_nocopy (ss_clone (ss_cstr (s)), encoding);
2637 /* Formats FORMAT as a printf()-like format string and creates and returns a
2638 new lex_reader for the formatted result. */
2640 lex_reader_for_format (const char *format, const char *encoding, ...)
2642 struct lex_reader *r;
/* The variable arguments follow ENCODING, not FORMAT, so ENCODING is the
   parameter named to va_start(). */
2645 va_start (args, encoding);
/* The string returned by xvasprintf() is handed straight to the "nocopy"
   constructor rather than being copied again. */
2646 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Returns the lex_string_reader that contains R as its `reader' member.  R
   must really be embedded in a struct lex_string_reader. */
2652 static struct lex_string_reader *
2653 lex_string_reader_cast (struct lex_reader *r)
2655 return UP_CAST (r, struct lex_string_reader, reader);
/* Read callback for string readers: copies the next part of the reader's
   string into BUF.  PROMPT_STYLE is not used.

   NOTE(review): the declaration of CHUNK and the end of the function (where
   presumably the offset is advanced and the count returned) are elided in
   this excerpt. */
2659 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2660 enum prompt_style prompt_style UNUSED)
2662 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Copy at most N bytes, and never more than what is left of the string past
   the current offset. */
2665 chunk = MIN (n, r->s.length - r->offset);
2666 memcpy (buf, r->s.string + r->offset, chunk);
/* Close callback for string readers.
   NOTE(review): everything after the cast is elided in this excerpt, so the
   actual cleanup is not visible here. */
2673 lex_string_close (struct lex_reader *r_)
2675 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Definition of the reader class used by lex_reader_for_substring_nocopy().
   NOTE(review): the initializer is elided in this excerpt. */
2681 static struct lex_reader_class lex_string_reader_class =
/* Returns line number LINE (counting from 1) of SRC as a substring that
   points directly into SRC's buffer; nothing is copied.

   NOTE(review): the declaration of END, the `else', and the statement taken
   for an out-of-range LINE are elided in this excerpt. */
2688 lex_source_get_line (const struct lex_source *src, int line)
/* Only line numbers in 1...n_lines are accepted. */
2690 if (line < 1 || line > src->n_lines)
/* lines[] is indexed from 0, so entry LINE - 1 holds the buffer offset at
   which line LINE starts. */
2693 size_t ofs = src->lines[line - 1];
/* When the start of the following line has been recorded, this line ends
   there. */
2695 if (line < src->n_lines)
2696 end = src->lines[line];
/* Presumably the alternative for the last recorded line: it ends at the next
   newline, or at the end of the buffered data if no newline is present. */
2699 const char *newline = memchr (src->buffer + ofs, '\n', src->length - ofs);
2700 end = newline ? newline - src->buffer : src->length;
2702 return ss_buffer (&src->buffer[ofs], end - ofs);