1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
/* Map a token to the source position of its first and last character. */
static struct msg_point lex_token_start_point (const struct lex_source *,
                                               const struct lex_token *);
static struct msg_point lex_token_end_point (const struct lex_source *,
                                             const struct lex_token *);
88 /* Source offset of the last byte in TOKEN. */
90 lex_token_end (const struct lex_token *token)
92 return token->token_pos + MAX (token->token_len, 1) - 1;
96 lex_token_destroy (struct lex_token *t)
98 token_uninit (&t->token);
101 assert (*t->ref_cnt > 0);
111 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
116 struct lex_token **tokens;
/* Operations on a lex_stage, defined below. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
134 /* Deletes all the tokens from STAGE. */
136 lex_stage_clear (struct lex_stage *stage)
138 while (!deque_is_empty (&stage->deque))
139 lex_stage_pop_first (stage);
142 /* Deletes all the tokens from STAGE and frees storage for the deque. */
144 lex_stage_uninit (struct lex_stage *stage)
146 lex_stage_clear (stage);
147 free (stage->tokens);
150 /* Returns true if STAGE contains no tokens, otherwise false. */
152 lex_stage_is_empty (const struct lex_stage *stage)
154 return deque_is_empty (&stage->deque);
157 /* Returns the number of tokens in STAGE. */
159 lex_stage_count (const struct lex_stage *stage)
161 return deque_count (&stage->deque);
/* Returns the first token in STAGE, which must be nonempty.
   The first token is the one accessed with the least lookahead. */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  return lex_stage_nth (stage, 0);
}
172 /* Returns the token the given INDEX in STAGE. The first token (with the least
173 lookahead) is 0, the second token is 1, and so on. There must be at least
174 INDEX + 1 tokens in STAGE. */
175 static struct lex_token *
176 lex_stage_nth (struct lex_stage *stage, size_t index)
178 return stage->tokens[deque_back (&stage->deque, index)];
181 /* Adds TOKEN so that it becomes the last token in STAGE. */
183 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
185 if (deque_is_full (&stage->deque))
186 stage->tokens = deque_expand (&stage->deque, stage->tokens,
187 sizeof *stage->tokens);
188 stage->tokens[deque_push_front (&stage->deque)] = token;
191 /* Removes and returns the first token from STAGE. */
192 static struct lex_token *
193 lex_stage_take_first (struct lex_stage *stage)
195 return stage->tokens[deque_pop_back (&stage->deque)];
/* Removes the first token from STAGE and destroys it. */
static void
lex_stage_pop_first (struct lex_stage *stage)
{
  lex_token_destroy (lex_stage_take_first (stage));
}
/* Removes the first N tokens from SRC, appending them to DST as the last
   tokens in DST.  The tokens keep their relative order. */
static void
lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
{
  for (size_t i = 0; i < n; i++)
    lex_stage_push_last (dst, lex_stage_take_first (src));
}
214 /* A source of tokens, corresponding to a syntax file.
216 This is conceptually a lex_reader wrapped with everything needed to convert
217 its UTF-8 bytes into tokens. */
220 struct ll ll; /* In lexer's list of sources. */
224 - One for struct lexer.
226 - One for each struct msg_location that references this source. */
229 struct lex_reader *reader;
231 struct segmenter segmenter;
232 bool eof; /* True if T_STOP was read from 'reader'. */
234 /* Buffer of UTF-8 bytes. */
235 char *buffer; /* Source file contents. */
236 size_t length; /* Number of bytes filled. */
237 size_t allocated; /* Number of bytes allocated. */
239 /* Offsets into 'buffer'. */
240 size_t journal_pos; /* First byte not yet output to journal. */
241 size_t seg_pos; /* First byte not yet scanned as token. */
243 /* Offset into 'buffer' of starts of lines. */
245 size_t n_lines, allocated_lines;
247 bool suppress_next_newline;
251 This is a pipeline with the following stages. Each token eventually
252 made available to the parser passes through of these stages. The stages
253 are named after the processing that happens in each one.
255 Initially, tokens come from the segmenter and scanner to 'pp':
257 - pp: Tokens that need to pass through the macro preprocessor to end up
260 - merge: Tokens that need to pass through scan_merge() to end up in
263 - parse: Tokens available to the client for parsing.
265 'pp' and 'merge' store tokens only temporarily until they pass into
266 'parse'. Tokens then live in 'parse' until the command is fully
267 consumed, at which time they are freed together. */
269 struct lex_stage merge;
270 struct lex_token **parse;
271 size_t n_parse, allocated_parse, parse_ofs;
/* Wraps a lex_reader in a new lex_source; defined later in this file. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
280 struct ll_list sources; /* Contains "struct lex_source"s. */
281 struct macro_set *macros;
284 static struct lex_source *lex_source__ (const struct lexer *);
285 static char *lex_source_get_syntax__ (const struct lex_source *,
287 static const struct lex_token *lex_next__ (const struct lexer *, int n);
288 static void lex_source_push_endcmd__ (struct lex_source *);
289 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
290 static void lex_source_clear_parse (struct lex_source *);
292 static bool lex_source_get_parse (struct lex_source *);
293 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
294 const char *format, va_list)
295 PRINTF_FORMAT (4, 0);
296 static const struct lex_token *lex_source_next__ (const struct lex_source *,
299 /* Initializes READER with the specified CLASS and otherwise some reasonable
300 defaults. The caller should fill in the others members as desired. */
302 lex_reader_init (struct lex_reader *reader,
303 const struct lex_reader_class *class)
305 reader->class = class;
306 reader->syntax = SEG_MODE_AUTO;
307 reader->error = LEX_ERROR_CONTINUE;
308 reader->file_name = NULL;
309 reader->encoding = NULL;
310 reader->line_number = 0;
314 /* Frees any file name already in READER and replaces it by a copy of
315 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
317 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
319 free (reader->file_name);
320 reader->file_name = xstrdup_if_nonnull (file_name);
323 /* Creates and returns a new lexer. */
327 struct lexer *lexer = xmalloc (sizeof *lexer);
328 *lexer = (struct lexer) {
329 .sources = LL_INITIALIZER (lexer->sources),
330 .macros = macro_set_create (),
335 /* Destroys LEXER. */
337 lex_destroy (struct lexer *lexer)
341 struct lex_source *source, *next;
343 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
345 ll_remove (&source->ll);
346 lex_source_unref (source);
348 macro_set_destroy (lexer->macros);
353 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
354 same name. Takes ownership of M. */
356 lex_define_macro (struct lexer *lexer, struct macro *m)
358 macro_set_add (lexer->macros, m);
361 /* Inserts READER into LEXER so that the next token read by LEXER comes from
362 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
365 lex_include (struct lexer *lexer, struct lex_reader *reader)
367 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
368 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
371 /* Appends READER to LEXER, so that it will be read after all other current
372 readers have already been read. */
374 lex_append (struct lexer *lexer, struct lex_reader *reader)
376 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
381 /* Advances LEXER to the next token, consuming the current token. */
383 lex_get (struct lexer *lexer)
385 struct lex_source *src;
387 src = lex_source__ (lexer);
391 if (src->parse_ofs < src->n_parse)
393 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
394 lex_source_clear_parse (src);
399 while (src->parse_ofs == src->n_parse)
400 if (!lex_source_get_parse (src))
402 ll_remove (&src->ll);
403 lex_source_unref (src);
404 src = lex_source__ (lexer);
/* Advances LEXER by N tokens, as if by calling lex_get() N times. */
void
lex_get_n (struct lexer *lexer, size_t n)
{
  while (n-- > 0)
    lex_get (lexer);
}
418 /* Issuing errors. */
/* Prints a syntax error message containing the current token and the message
   given by printf-style FORMAT and the following arguments (if FORMAT is
   non-null). */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, 0, 0, format, args);
  va_end (args);
}
/* Prints a syntax error message containing the current token and the message
   given by printf-style FORMAT and ARGS (if FORMAT is non-null).  This is the
   va_list counterpart of lex_error(). */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  lex_next_error_valist (lexer, 0, 0, format, args);
}
/* Prints a syntax error message for the range of tokens N0 through N1, both
   given relative to the current token (0 is the current token), and the
   message given by printf-style FORMAT and the following arguments (if FORMAT
   is non-null). */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, n0, n1, format, args);
  va_end (args);
}
/* Prints a syntax error message saying that one of the strings provided as
   varargs, up to the first NULL, is expected.

   The parentheses around the function name keep any function-like macro named
   lex_error_expecting from being expanded in this definition. */
void
(lex_error_expecting) (struct lexer *lexer, ...)
{
  va_list args;

  va_start (args, lexer);
  lex_error_expecting_valist (lexer, args);
  va_end (args);
}
/* Prints a syntax error message saying that one of the options provided in
   ARGS, up to the first NULL, is expected.

   At most MAX_OPTIONS options are read from ARGS; reading stops there even if
   no NULL has been seen yet. */
void
lex_error_expecting_valist (struct lexer *lexer, va_list args)
{
  enum { MAX_OPTIONS = 9 };
  const char *options[MAX_OPTIONS];
  int n = 0;
  while (n < MAX_OPTIONS)
    {
      const char *option = va_arg (args, const char *);
      if (!option)
        break;

      options[n++] = option;
    }
  lex_error_expecting_array (lexer, options, n);
}
/* Prints a syntax error message saying that one of the N strings in OPTIONS
   is expected.

   Each supported count has its own complete, translatable sentence.  With no
   options, or with more than 8, only the generic syntax error is reported. */
void
lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
{
  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      break;

    case 1:
      lex_error (lexer, _("expecting %s"), options[0]);
      break;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
      break;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
                 options[2]);
      break;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3]);
      break;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4]);
      break;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5]);
      break;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6]);
      break;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6], options[7]);
      break;

    default:
      lex_error (lexer, NULL);
    }
}
538 /* Reports an error to the effect that subcommand SBC may only be specified
541 This function does not take a lexer as an argument or use lex_error(),
542 because the result would ordinarily just be redundant: "Syntax error at
543 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
544 not help the user find the error. */
546 lex_sbc_only_once (const char *sbc)
548 msg (SE, _("Subcommand %s may only be specified once."), sbc);
551 /* Reports an error to the effect that subcommand SBC is missing.
553 This function does not take a lexer as an argument or use lex_error(),
554 because a missing subcommand can normally be detected only after the whole
555 command has been parsed, and so lex_error() would always report "Syntax
556 error at end of command", which does not help the user find the error. */
558 lex_sbc_missing (const char *sbc)
560 msg (SE, _("Required subcommand %s was not specified."), sbc);
/* Reports an error to the effect that specification SPEC may only be specified
   once within subcommand SBC.  The error is reported at LEXER's current
   token. */
void
lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
{
  lex_error (lexer, _("%s may only be specified once within subcommand %s"),
             spec, sbc);
}
/* Reports an error to the effect that specification SPEC is missing within
   subcommand SBC.  The error is reported at LEXER's current token. */
void
lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
{
  lex_error (lexer, _("Required %s specification missing from %s subcommand"),
             spec, sbc);
}
581 /* Prints a syntax error message containing the current token and
582 given message MESSAGE (if non-null). */
584 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
585 const char *format, va_list args)
587 struct lex_source *src = lex_source__ (lexer);
590 lex_source_error_valist (src, n0, n1, format, args);
596 ds_put_format (&s, _("Syntax error at end of input"));
599 ds_put_cstr (&s, ": ");
600 ds_put_vformat (&s, format, args);
602 if (ds_last (&s) != '.')
603 ds_put_byte (&s, '.');
604 msg (SE, "%s", ds_cstr (&s));
609 /* Checks that we're at end of command.
610 If so, returns a successful command completion code.
611 If not, flags a syntax error and returns an error command
614 lex_end_of_command (struct lexer *lexer)
616 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
618 lex_error (lexer, _("expecting end of command"));
625 /* Token testing functions. */
/* Returns true if the current token is a number. */
bool
lex_is_number (const struct lexer *lexer)
{
  return lex_next_is_number (lexer, 0);
}
/* Returns true if the current token is a string. */
bool
lex_is_string (const struct lexer *lexer)
{
  return lex_next_is_string (lexer, 0);
}
/* Returns the value of the current token, which must be a
   floating point number. */
double
lex_number (const struct lexer *lexer)
{
  return lex_next_number (lexer, 0);
}
/* Returns true iff the current token is an integer. */
bool
lex_is_integer (const struct lexer *lexer)
{
  return lex_next_is_integer (lexer, 0);
}
/* Returns the value of the current token, which must be an
   integer. */
long
lex_integer (const struct lexer *lexer)
{
  return lex_next_integer (lexer, 0);
}
664 /* Token testing functions with lookahead.
666 A value of 0 for N as an argument to any of these functions refers to the
667 current token. Lookahead is limited to the current command. Any N greater
668 than the number of tokens remaining in the current command will be treated
669 as referring to a T_ENDCMD token. */
/* Returns true if the token N ahead of the current token is a number. */
bool
lex_next_is_number (const struct lexer *lexer, int n)
{
  return token_is_number (lex_next (lexer, n));
}
/* Returns true if the token N ahead of the current token is a string. */
bool
lex_next_is_string (const struct lexer *lexer, int n)
{
  return token_is_string (lex_next (lexer, n));
}
/* Returns the value of the token N ahead of the current token, which must be a
   floating point number. */
double
lex_next_number (const struct lexer *lexer, int n)
{
  return token_number (lex_next (lexer, n));
}
/* Returns true if the token N ahead of the current token is an integer. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  return token_is_integer (lex_next (lexer, n));
}
/* Returns the value of the token N ahead of the current token, which must be
   an integer. */
long
lex_next_integer (const struct lexer *lexer, int n)
{
  return token_integer (lex_next (lexer, n));
}
708 /* Token matching functions. */
710 /* If the current token has the specified TYPE, skips it and returns true.
711 Otherwise, returns false. */
713 lex_match (struct lexer *lexer, enum token_type type)
715 if (lex_token (lexer) == type)
/* If the current token matches IDENTIFIER, skips it and returns true.
   IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
   returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  return lex_match_id_n (lexer, identifier, 3);
}
735 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
736 may be abbreviated to its first N letters. Otherwise, returns false.
738 IDENTIFIER must be an ASCII string. */
740 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
742 if (lex_token (lexer) == T_ID
743 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
/* If the current token is integer X, skips it and returns true.  Otherwise,
   returns false. */
bool
lex_match_int (struct lexer *lexer, int x)
{
  if (lex_is_integer (lexer) && lex_integer (lexer) == x)
    {
      lex_get (lexer);
      return true;
    }
  else
    return false;
}
766 /* Forced matches. */
/* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
   abbreviated to its first 3 letters.  Otherwise, reports an error and returns
   false.

   IDENTIFIER must be an ASCII string. */
bool
lex_force_match_id (struct lexer *lexer, const char *identifier)
{
  if (lex_match_id (lexer, identifier))
    return true;
  else
    {
      lex_error_expecting (lexer, identifier);
      return false;
    }
}
785 /* If the current token has the specified TYPE, skips it and returns true.
786 Otherwise, reports an error and returns false. */
788 lex_force_match (struct lexer *lexer, enum token_type type)
790 if (lex_token (lexer) == type)
797 const char *type_string = token_type_to_string (type);
800 char *s = xasprintf ("`%s'", type_string);
801 lex_error_expecting (lexer, s);
805 lex_error_expecting (lexer, token_type_to_name (type));
/* If the current token is a string, does nothing and returns true.
   Otherwise, reports an error and returns false. */
bool
lex_force_string (struct lexer *lexer)
{
  if (lex_is_string (lexer))
    return true;
  else
    {
      lex_error (lexer, _("expecting string"));
      return false;
    }
}
825 /* If the current token is a string or an identifier, does nothing and returns
826 true. Otherwise, reports an error and returns false.
828 This is meant for use in syntactic situations where we want to encourage the
829 user to supply a quoted string, but for compatibility we also accept
830 identifiers. (One example of such a situation is file names.) Therefore,
831 the error message issued when the current token is wrong only says that a
832 string is expected and doesn't mention that an identifier would also be
835 lex_force_string_or_id (struct lexer *lexer)
837 return lex_token (lexer) == T_ID || lex_force_string (lexer);
/* If the current token is an integer, does nothing and returns true.
   Otherwise, reports an error and returns false. */
bool
lex_force_int (struct lexer *lexer)
{
  if (lex_is_integer (lexer))
    return true;
  else
    {
      lex_error (lexer, _("expecting integer"));
      return false;
    }
}
/* If the current token is an integer in the range MIN...MAX (inclusive), does
   nothing and returns true.  Otherwise, reports an error and returns false.
   If NAME is nonnull, then it is used in the error message.

   The error message is tailored to the range.  A bound is mentioned only if
   it looks deliberate (MIN greater than INT_MIN / 2, MAX less than
   INT_MAX / 2) or if the current token is a number that actually violates
   it. */
bool
lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
{
  bool is_number = lex_is_number (lexer);
  bool is_integer = lex_is_integer (lexer);
  bool too_small = (is_integer ? lex_integer (lexer) < min
                    : is_number ? lex_number (lexer) < min
                    : false);
  bool too_big = (is_integer ? lex_integer (lexer) > max
                  : is_number ? lex_number (lexer) > max
                  : false);
  if (is_integer && !too_small && !too_big)
    return true;

  if (min > max)
    {
      /* Weird, maybe a bug in the caller.  Just report that we needed an
         integer. */
      if (name)
        lex_error (lexer, _("Integer expected for %s."), name);
      else
        lex_error (lexer, _("Integer expected."));
    }
  else if (min == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld for %s."), min, name);
      else
        lex_error (lexer, _("Expected %ld."), min);
    }
  else if (min + 1 == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
      else
        lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
    }
  else
    {
      bool report_lower_bound = (min > INT_MIN / 2) || too_small;
      bool report_upper_bound = (max < INT_MAX / 2) || too_big;

      if (report_lower_bound && report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer between %ld and %ld for %s."),
                       min, max, name);
          else
            lex_error (lexer, _("Expected integer between %ld and %ld."),
                       min, max);
        }
      else if (report_lower_bound)
        {
          if (min == 0)
            {
              if (name)
                lex_error (lexer, _("Expected non-negative integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected non-negative integer."));
            }
          else if (min == 1)
            {
              if (name)
                lex_error (lexer, _("Expected positive integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected positive integer."));
            }
          else
            {
              if (name)
                lex_error (lexer, _("Expected integer %ld or greater for %s."),
                           min, name);
              else
                lex_error (lexer, _("Expected integer %ld or greater."), min);
            }
        }
      else if (report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer less than or equal to %ld for %s."),
                       max, name);
          else
            lex_error (lexer, _("Expected integer less than or equal to %ld."),
                       max);
        }
      else
        {
          if (name)
            lex_error (lexer, _("Integer expected for %s."), name);
          else
            lex_error (lexer, _("Integer expected."));
        }
    }
  return false;
}
/* If the current token is a number, does nothing and returns true.
   Otherwise, reports an error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  if (lex_is_number (lexer))
    return true;

  lex_error (lexer, _("expecting number"));
  return false;
}
969 /* If the current token is an identifier, does nothing and returns true.
970 Otherwise, reports an error and returns false. */
972 lex_force_id (struct lexer *lexer)
974 if (lex_token (lexer) == T_ID)
977 lex_error (lexer, _("expecting identifier"));
981 /* Token accessors. */
983 /* Returns the type of LEXER's current token. */
985 lex_token (const struct lexer *lexer)
987 return lex_next_token (lexer, 0);
/* Returns the number in LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. */
double
lex_tokval (const struct lexer *lexer)
{
  return lex_next_tokval (lexer, 0);
}
/* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value".  */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  return lex_next_tokcstr (lexer, 0);
}
1014 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
1015 null-terminated (but the null terminator is not included in the returned
1016 substring's 'length').
1018 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1019 this functions this function will always return NULL.
1021 The UTF-8 encoding of the returned string is correct for variable names and
1022 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1023 data_in() to use it in a "union value". */
1025 lex_tokss (const struct lexer *lexer)
1027 return lex_next_tokss (lexer, 0);
/* Looking ahead.

   A value of 0 for N as an argument to any of these functions refers to the
   current token.  Lookahead is limited to the current command.  Any N greater
   than the number of tokens remaining in the current command will be treated
   as referring to a T_ENDCMD token. */
1037 static const struct lex_token *
1038 lex_next__ (const struct lexer *lexer_, int n)
1040 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1041 struct lex_source *src = lex_source__ (lexer);
1044 return lex_source_next__ (src, n);
1047 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1052 static const struct lex_token *
1053 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1055 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1059 static const struct lex_token endcmd_token
1060 = { .token = { .type = T_ENDCMD } };
1061 return &endcmd_token;
1064 while (ofs >= src->n_parse)
1066 if (src->n_parse > 0)
1068 const struct lex_token *t = src->parse[src->n_parse - 1];
1069 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1073 lex_source_get_parse (src);
1076 return src->parse[ofs];
1079 static const struct lex_token *
1080 lex_source_next__ (const struct lex_source *src, int n)
1082 return lex_source_ofs__ (src, n + src->parse_ofs);
1085 /* Returns the "struct token" of the token N after the current one in LEXER.
1086 The returned pointer can be invalidated by pretty much any succeeding call
1087 into the lexer, although the string pointer within the returned token is
1088 only invalidated by consuming the token (e.g. with lex_get()). */
1089 const struct token *
1090 lex_next (const struct lexer *lexer, int n)
1092 return &lex_next__ (lexer, n)->token;
1095 /* Returns the type of the token N after the current one in LEXER. */
1097 lex_next_token (const struct lexer *lexer, int n)
1099 return lex_next (lexer, n)->type;
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  return token_number (lex_next (lexer, n));
}
1112 /* Returns the null-terminated string in the token N after the current one, in
1115 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1116 this functions this function will always return NULL.
1118 The UTF-8 encoding of the returned string is correct for variable names and
1119 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1120 data_in() to use it in a "union value". */
1122 lex_next_tokcstr (const struct lexer *lexer, int n)
1124 return lex_next_tokss (lexer, n).string;
1127 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1128 The string is null-terminated (but the null terminator is not included in
1129 the returned substring's 'length').
1131 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1132 tokens this functions this function will always return NULL.
1134 The UTF-8 encoding of the returned string is correct for variable names and
1135 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1136 data_in() to use it in a "union value". */
1138 lex_next_tokss (const struct lexer *lexer, int n)
1140 return lex_next (lexer, n)->string;
1143 /* Returns the offset of the current token within the command being parsed in
1144 LEXER. This is 0 for the first token in a command, 1 for the second, and so
1145 on. The return value is useful later for referring to this token in calls
1148 lex_ofs (const struct lexer *lexer)
1150 struct lex_source *src = lex_source__ (lexer);
1151 return src ? src->parse_ofs : 0;
1154 /* Returns the token within LEXER's current command with offset OFS. Use
1155 lex_ofs() to find out the offset of the current token. */
1156 const struct token *
1157 lex_ofs_token (const struct lexer *lexer_, int ofs)
1159 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1160 struct lex_source *src = lex_source__ (lexer);
1163 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1166 static const struct token stop_token = { .type = T_STOP };
/* Allocates and returns a new struct msg_location that spans tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in
   LEXER.  See lex_ofs() for an explanation of token offsets.

   The caller owns and must eventually free the returned object. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() takes offsets relative to the current token. */
  int ofs = lex_ofs (lexer);
  return lex_get_location (lexer, ofs0 - ofs, ofs1 - ofs);
}
1183 /* Returns a msg_point for the first character in the token with offset OFS,
1184 where offset 0 is the first token in the command currently being parsed, 1
1185 the second token, and so on. These are absolute offsets, not relative to
1186 the token currently being parsed within the command.
1188 Returns zeros for a T_STOP token.
1191 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1193 const struct lex_source *src = lex_source__ (lexer);
1195 ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1196 : (struct msg_point) { 0, 0 });
1199 /* Returns a msg_point for the last character, inclusive, in the token with
1200 offset OFS, where offset 0 is the first token in the command currently being
1201 parsed, 1 the second token, and so on. These are absolute offsets, not
1202 relative to the token currently being parsed within the command.
1204 Returns zeros for a T_STOP token.
1206 Most of the time, a single token is wholly within a single line of syntax,
1207 so that the start and end point for a given offset have the same line
1208 number. There are two exceptions: a T_STRING token can be made up of
1209 multiple segments on adjacent lines connected with "+" punctuators, and a
1210 T_NEG_NUM token can consist of a "-" on one line followed by the number on
1214 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1216 const struct lex_source *src = lex_source__ (lexer);
1218 ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1219 : (struct msg_point) { 0, 0 });
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
}
1235 /* Returns true if the token N ahead of the current one was produced by macro
1236 expansion, false otherwise. */
1238 lex_next_is_from_macro (const struct lexer *lexer, int n)
/* Tokens created from a macro expansion carry a non-null 'macro_rep' (set in
   lex_source_try_get_merge()); tokens read directly from the source get a
   null one (set in lex_source_try_get_pp()). */
1240 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Compares token ACTUAL against token EXPECTED.  The types are compared
   first; then, depending on ACTUAL's type, the visible branches compare the
   'number' members with ==, call lex_id_match() with EXPECTED's string as the
   first argument and ACTUAL's as the second, or compare the 'string' members
   byte-for-byte after checking that their lengths agree.
   NOTE(review): the statement controlled by the first `if', the `case' labels
   selecting each branch, and any default branch are not visible here. */
1244 lex_tokens_match (const struct token *actual, const struct token *expected)
1246 if (actual->type != expected->type)
1249 switch (actual->type)
1253 return actual->number == expected->number;
1256 return lex_id_match (expected->string, actual->string);
1259 return (actual->string.length == expected->string.length
1260 && !memcmp (actual->string.string, expected->string.string,
1261 actual->string.length));
/* Tokenizes S with a string_lexer (in SEG_MODE_INTERACTIVE) and compares each
   token obtained that way against successive lookahead tokens of LEXER using
   lex_tokens_match().  Each token taken from S is released with
   token_uninit() after the comparison.
   NOTE(review): the declarations of `token' and `i', what happens on a
   mismatch, and the return statement are not visible here.  The callers below
   treat the result as a token count that is 0 when there is no match. */
1269 lex_at_phrase__ (struct lexer *lexer, const char *s)
1271 struct string_lexer slex;
1275 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1276 while (string_lexer_next (&slex, &token))
1278 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1279 token_uninit (&token);
1286 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1287 returns true. Otherwise, returns false.
1289 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1290 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1291 first three letters. */
1293 lex_at_phrase (struct lexer *lexer, const char *s)
/* A positive result from lex_at_phrase__() is treated as a match.  Nothing
   is skipped here; see lex_match_phrase() for the consuming variant. */
1295 return lex_at_phrase__ (lexer, s) > 0;
1298 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1299 skips it and returns true. Otherwise, returns false.
1301 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1302 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1303 first three letters. */
1305 lex_match_phrase (struct lexer *lexer, const char *s)
1307 size_t n = lex_at_phrase__ (lexer, s);
/* Skip the N tokens reported by lex_at_phrase__().  NOTE(review): any guard
   around this call and the return statement are on lines not visible here. */
1309 lex_get_n (lexer, n);
/* NOTE(review): only part of this function is visible here: the end of the
   comment below, the return-type line, the declaration of `lo', the loop
   header, and the statements controlled by the two `else if' tests are
   missing.  The visible lines bisect SRC->lines[] between `lo' and `hi'.
   That array holds byte offsets: lex_source_try_get_pp() appends SRC->seg_pos
   to it after each SEG_NEWLINE segment.  Once MID + 1 reaches the end of the
   array the result is SRC->n_lines. */
1313 /* Returns the 1-based line number of the source text at the byte OFFSET in
1316 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1319 size_t hi = src->n_lines;
1322 size_t mid = (lo + hi) / 2;
1323 if (mid + 1 >= src->n_lines)
1324 return src->n_lines;
1325 else if (offset >= src->lines[mid + 1])
1327 else if (offset < src->lines[mid])
/* NOTE(review): the end of the comment below and the return-type line are not
   visible here.  The code searches the first OFFSET bytes of SRC->buffer
   backward for a new-line with memrchr(); the line is taken to start just
   past that byte, or at offset 0 if there is none.  The result is
   utf8_count_columns() applied to the bytes from the line start up to OFFSET,
   plus 1. */
1334 /* Returns the 1-based column number of the source text at the byte OFFSET in
1337 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1339 const char *newline = memrchr (src->buffer, '\n', offset);
1340 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1341 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1344 static struct msg_point
1345 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1347 return (struct msg_point) {
1348 .line = lex_source_ofs_to_line_number (src, offset),
1349 .column = lex_source_ofs_to_column_number (src, offset),
1353 static struct msg_point
1354 lex_token_start_point (const struct lex_source *src,
1355 const struct lex_token *token)
1357 return lex_source_ofs_to_point__ (src, token->token_pos);
1360 static struct msg_point
1361 lex_token_end_point (const struct lex_source *src,
1362 const struct lex_token *token)
1364 return lex_source_ofs_to_point__ (src, lex_token_end (token));
1367 static struct msg_location
1368 lex_token_location (const struct lex_source *src,
1369 const struct lex_token *t0,
1370 const struct lex_token *t1)
1372 return (struct msg_location) {
1373 .file_name = intern_new_if_nonnull (src->reader->file_name),
1374 .start = lex_token_start_point (src, t0),
1375 .end = lex_token_end_point (src, t1),
1379 static struct msg_location *
1380 lex_token_location_rw (const struct lex_source *src,
1381 const struct lex_token *t0,
1382 const struct lex_token *t1)
1384 struct msg_location location = lex_token_location (src, t0, t1);
1385 return msg_location_dup (&location);
/* Returns the result of lex_token_location_rw() for the range that starts at
   the token N0 ahead of the current one in SRC and ends at the token N1
   ahead. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1396 /* Returns the name of the syntax file from which the current command is drawn.
1397 Returns NULL if LEXER has no current source; otherwise returns the
file_name recorded in that source's reader, whatever it is.
1400 There is no version of this function that takes an N argument because
1401 lookahead only works to the end of a command and any given command is always
1402 within a single syntax file. */
1404 lex_get_file_name (const struct lexer *lexer)
1406 struct lex_source *src = lex_source__ (lexer);
1407 return src == NULL ? NULL : src->reader->file_name;
1410 /* Returns a newly allocated msg_location for the syntax that represents tokens
1411 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1412 must eventually free the location (with msg_location_destroy()). */
1413 struct msg_location *
1414 lex_get_location (const struct lexer *lexer, int n0, int n1)
1416 struct msg_location *loc = xmalloc (sizeof *loc);
1417 *loc = (struct msg_location) {
1418 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1419 .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1420 .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1421 .src = lex_source__ (lexer),
1423 lex_source_ref (loc->src);
/* Returns the encoding recorded in the reader of LEXER's current source, or
   NULL if LEXER has no current source.  NOTE(review): the return-type line is
   not visible here. */
1428 lex_get_encoding (const struct lexer *lexer)
1430 struct lex_source *src = lex_source__ (lexer);
1431 return src == NULL ? NULL : src->reader->encoding;
1434 /* Returns the syntax mode for the syntax file from which the current command is
1435 drawn. Returns SEG_MODE_AUTO if LEXER has no current source.
1438 There is no version of this function that takes an N argument because
1439 lookahead only works to the end of a command and any given command is always
1440 within a single syntax file. */
1442 lex_get_syntax_mode (const struct lexer *lexer)
1444 struct lex_source *src = lex_source__ (lexer);
1445 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1448 /* Returns the error mode for the syntax file from which the current command is
1449 drawn. Returns LEX_ERROR_TERMINAL if LEXER has no current source.
1452 There is no version of this function that takes an N argument because
1453 lookahead only works to the end of a command and any given command is always
1454 within a single syntax file. */
1456 lex_get_error_mode (const struct lexer *lexer)
1458 struct lex_source *src = lex_source__ (lexer);
1459 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1462 /* If the source that LEXER is currently reading has error mode
1463 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1464 token to be read comes directly from whatever is next read from the stream.
1466 It makes sense to call this function after encountering an error in a
1467 command entered on the console, because usually the user would prefer not to
1468 have cascading errors. */
1470 lex_interactive_reset (struct lexer *lexer)
1472 struct lex_source *src = lex_source__ (lexer);
1473 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Rewind the journal and segmentation positions to the start of the buffer.
   NOTE(review): other resets on neighboring lines are not visible here. */
1476 src->journal_pos = src->seg_pos = 0;
1478 src->suppress_next_newline = false;
/* Restart the segmenter in the mode it already had.  NOTE(review): the
   remaining argument of segmenter_init() is on a line not visible here. */
1479 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1481 lex_stage_clear (&src->pp);
1482 lex_stage_clear (&src->merge);
1483 lex_source_clear_parse (src);
/* lex_source_push_endcmd__() asserts that the parse stage is empty, which
   lex_source_clear_parse() has just ensured, and then adds one T_ENDCMD. */
1484 lex_source_push_endcmd__ (src);
1488 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1490 lex_discard_rest_of_command (struct lexer *lexer)
/* The loop ends with the T_ENDCMD or T_STOP as the current token.
   NOTE(review): the loop body is on a line not visible here. */
1492 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1496 /* Discards all lookahead tokens in LEXER, then discards all input sources
1497 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1498 runs out of input sources. */
1500 lex_discard_noninteractive (struct lexer *lexer)
1502 struct lex_source *src = lex_source__ (lexer);
/* Empty all three token stages of the current source.  NOTE(review): the
   guard around these three calls is on lines not visible here. */
1506 lex_stage_clear (&src->pp);
1507 lex_stage_clear (&src->merge);
1508 lex_source_clear_parse (src);
/* Unlink and unreference sources from the head of LEXER's list until the
   head has error mode LEX_ERROR_TERMINAL or the list is empty
   (lex_source__() then returns NULL). */
1510 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1511 src = lex_source__ (lexer))
1513 ll_remove (&src->ll);
1514 lex_source_unref (src);
/* Grows SRC->buffer with x2realloc(), which also updates SRC->allocated, but
   only when SRC->length has reached SRC->allocated, i.e. when the buffer has
   no free space left. */
1520 lex_source_expand__ (struct lex_source *src)
1522 if (src->length >= src->allocated)
1523 src->buffer = x2realloc (src->buffer, &src->allocated);
/* Reads more input from SRC's reader into the free space at the end of
   SRC->buffer.  Each pass makes room with lex_source_expand__(), asks the
   segmenter which prompt style applies, and calls the reader class's read()
   function, which must not return more than the space offered.  The loop
   repeats while the not-yet-segmented part of the buffer, from SRC->seg_pos
   to SRC->length, contains no new-line.
   NOTE(review): the opening of the loop, the last argument(s) of the read()
   call, the condition under which SRC->reader->eof is set, and the update of
   SRC->length are on lines not visible here. */
1527 lex_source_read__ (struct lex_source *src)
1531 lex_source_expand__ (src);
1533 size_t space = src->allocated - src->length;
1534 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1535 size_t n = src->reader->class->read (src->reader,
1536 &src->buffer[src->length],
1538 assert (n <= space);
1543 src->reader->eof = true;
1549 while (!memchr (&src->buffer[src->seg_pos], '\n',
1550 src->length - src->seg_pos));
1553 static struct lex_source *
1554 lex_source__ (const struct lexer *lexer)
1556 return (ll_is_empty (&lexer->sources) ? NULL
1557 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1560 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1561 one, through N1 ahead of the current one, inclusive. (For example, if N0
1562 and N1 are both zero, this requests the syntax for the current token.) The
1563 caller must eventually free the returned string (with free()). The syntax
1564 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1565 for example, it may include comments, spaces, and new-lines if it spans
1566 multiple tokens. Macro expansion, however, has already been performed. */
1568 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1570 struct string s = DS_EMPTY_INITIALIZER;
1571 for (size_t i = n0; i <= n1; )
1573 /* Find [I,J) as the longest sequence of tokens not produced by macro
1574 expansion, or otherwise the longest sequence expanded from a single
1576 const struct lex_token *first = lex_source_next__ (src, i);
1578 for (j = i + 1; j <= n1; j++)
1580 const struct lex_token *cur = lex_source_next__ (src, j);
1581 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1582 || first->macro_rep != cur->macro_rep)
1585 const struct lex_token *last = lex_source_next__ (src, j - 1);
1587 /* Now add the syntax for this sequence of tokens to SRC. */
1588 if (!ds_is_empty (&s))
1589 ds_put_byte (&s, ' ');
1590 if (!first->macro_rep)
1592 size_t start = first->token_pos;
1593 size_t end = last->token_pos + last->token_len;
1594 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1598 size_t start = first->ofs;
1599 size_t end = last->ofs + last->len;
1600 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1606 return ds_steal_cstr (&s);
1610 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1612 for (size_t i = n0; i <= n1; i++)
1613 if (lex_source_next__ (src, i)->macro_rep)
1618 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1619 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1620 other tokens included in that range. The syntax is encoded in UTF-8 and in
1621 the original form supplied to the lexer so that, for example, it may include
1622 comments, spaces, and new-lines if it spans multiple tokens.
1624 Returns an empty string if the token range doesn't include a macro call.
1626 The caller must not modify or free the returned string. */
1627 static struct substring
1628 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1630 if (!lex_source_contains_macro_call (src, n0, n1))
1633 const struct lex_token *token0 = lex_source_next__ (src, n0);
1634 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1635 size_t start = token0->token_pos;
1636 size_t end = token1->token_pos + token1->token_len;
1638 return ss_buffer (&src->buffer[start], end - start);
/* Composes a syntax error message about tokens N0...N1 ahead of the current
   one in SRC and stores it in a newly allocated struct msg with category
   MSG_C_SYNTAX, severity MSG_S_ERROR, and the location of those tokens.

   The text starts with one of the "Syntax error..." strings below: a fixed
   one if the token at N0 is T_ENDCMD, otherwise one built from the ellipsized
   syntax of the tokens and/or the ellipsized macro call they came from.  Then
   ": " and FORMAT expanded with ARGS are appended, and a final '.' is added
   if the text does not already end in one.

   NOTE(review): several lines are not visible here: the return type, the
   declarations of `s' and `syntax', the tests that choose among the four
   message forms, any condition guarding the ": " and FORMAT part, and what is
   done with the message after it is filled in. */
1642 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1643 const char *format, va_list args)
1645 const struct lex_token *token;
1650 token = lex_source_next__ (src, n0);
1651 if (token->token.type == T_ENDCMD)
1652 ds_put_cstr (&s, _("Syntax error at end of command"));
1655 /* Get the syntax that caused the error. */
1656 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1658 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1661 /* Get the macro call(s) that expanded to the syntax that caused the
1664 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1671 _("Syntax error at `%s' (in expansion of `%s')"),
1674 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1679 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1682 ds_put_cstr (&s, _("Syntax error"));
1688 ds_put_cstr (&s, ": ");
1689 ds_put_vformat (&s, format, args);
1691 if (ds_last (&s) != '.')
1692 ds_put_byte (&s, '.');
1694 struct msg *m = xmalloc (sizeof *m);
1696 .category = MSG_C_SYNTAX,
1697 .severity = MSG_S_ERROR,
1698 .location = lex_source_get_location (src, n0, n1),
1699 .text = ds_steal_cstr (&s),
/* Composes an error message for TOKEN, for which lex_source_try_get_pp() got
   TOKENIZE_ERROR from token_from_segment().  The text consists of the
   translated "Syntax error at" prefix with TOKEN's ellipsized source text
   from SRC's buffer, followed by ": " and TOKEN's own string, which is
   printed with %s and so must be null-terminated.  The message gets category
   MSG_C_SYNTAX, severity MSG_S_ERROR, and TOKEN's location.
   NOTE(review): the return type, the declaration of `syntax', and what is
   done with the message after it is filled in are not visible here. */
1705 lex_get_error (struct lex_source *src, const struct lex_token *token)
1708 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1709 syntax, sizeof syntax);
1711 struct string s = DS_EMPTY_INITIALIZER;
1712 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1713 ds_put_format (&s, ": %s", token->token.string.string);
1715 struct msg *m = xmalloc (sizeof *m);
1717 .category = MSG_C_SYNTAX,
1718 .severity = MSG_S_ERROR,
1719 .location = lex_token_location_rw (src, token, token),
1720 .text = ds_steal_cstr (&s),
1725 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1726 underlying lex_reader if necessary. Returns true if a new token was added
1727 to 'pp', false otherwise. The caller should retry failures unless
1728 the reader's 'eof' marker was set to true indicating that there will be no more
1729 tokens from this source. */
1731 lex_source_try_get_pp (struct lex_source *src)
1733 /* Allocate a new token and initialize it. It is pushed onto 'pp' only at
the end, in the TOKENIZE_TOKEN case. */
1734 struct lex_token *token = xmalloc (sizeof *token);
1735 token->token = (struct token) { .type = T_STOP };
1736 token->macro_rep = NULL;
1737 token->ref_cnt = NULL;
1738 token->token_pos = src->seg_pos;
1740 /* Extract a segment. */
1741 const char *segment;
1742 enum segment_type seg_type;
/* NOTE(review): the loop around the next statements and the test on seg_len
   that decides whether more input is needed are not visible here. */
1746 segment = &src->buffer[src->seg_pos];
1747 seg_len = segmenter_push (&src->segmenter, segment,
1748 src->length - src->seg_pos,
1749 src->reader->eof, &seg_type);
1753 /* The segmenter needs more input to produce a segment. */
1754 assert (!src->reader->eof);
1755 lex_source_read__ (src);
1758 /* Update state based on the segment. */
1759 token->token_len = seg_len;
1760 src->seg_pos += seg_len;
1761 if (seg_type == SEG_NEWLINE)
/* Record the offset just past the new-line as the start of the next line,
   growing SRC->lines first if it is full. */
1763 if (src->n_lines >= src->allocated_lines)
1764 src->lines = x2nrealloc (src->lines, &src->allocated_lines,
1765 sizeof *src->lines);
1766 src->lines[src->n_lines++] = src->seg_pos;
1769 /* Get a token from the segment. */
1770 enum tokenize_result result = token_from_segment (
1771 seg_type, ss_buffer (segment, seg_len), &token->token);
1773 /* If we've reached the end of a line, or the end of a command, then pass
1774 the line to the output engine as a syntax text item. */
1775 int n_lines = seg_type == SEG_NEWLINE;
/* NOTE(review): the statements that adjust n_lines in the two branches below
   are on lines not visible here; only the flag updates are. */
1776 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1779 src->suppress_next_newline = true;
1781 else if (n_lines > 0 && src->suppress_next_newline)
1784 src->suppress_next_newline = false;
1786 for (int i = 0; i < n_lines; i++)
1788 /* Beginning of line. */
1789 const char *line = &src->buffer[src->journal_pos];
1791 /* Calculate line length, including \n or \r\n end-of-line if present.
1793 We use src->length even though that may be beyond what we've actually
1794 converted to tokens. That's because, if we're emitting the line due
1795 to SEG_END_COMMAND, we want to take the whole line through the
1796 newline, not just through the '.'. */
1797 size_t max_len = src->length - src->journal_pos;
1798 const char *newline = memchr (line, '\n', max_len);
1799 size_t line_len = newline ? newline - line + 1 : max_len;
1801 /* Calculate line length excluding end-of-line. */
1802 size_t copy_len = line_len;
1803 if (copy_len > 0 && line[copy_len - 1] == '\n')
1805 if (copy_len > 0 && line[copy_len - 1] == '\r')
1808 /* Submit the line as syntax. */
1809 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1810 xmemdup0 (line, copy_len),
1813 src->journal_pos += line_len;
/* Dispatch on the tokenizer's verdict.  NOTE(review): the switch header, the
   statements after each case (other than those shown), and the return
   statements are on lines not visible here. */
1818 case TOKENIZE_ERROR:
1819 lex_get_error (src, token);
1821 case TOKENIZE_EMPTY:
1822 lex_token_destroy (token);
1825 case TOKENIZE_TOKEN:
/* A T_STOP is turned into T_ENDCMD before it is queued; the comments in
   lex_source_try_get_merge() and lex_source_get_parse() rely on this. */
1826 if (token->token.type == T_STOP)
1828 token->token.type = T_ENDCMD;
1831 lex_stage_push_last (&src->pp, token);
1837 /* Attempts to append a new token to SRC. Returns true if successful, false on
1838 failure. On failure, the end of SRC has been reached and no more tokens
1839 will be forthcoming from it.
1841 Does not make the new token available for lookahead yet; the caller must
1842 adjust SRC's 'middle' pointer to do so. */
1844 lex_source_get_pp (struct lex_source *src)
/* Repeats lex_source_try_get_pp() until it succeeds.  NOTE(review): the loop
   header (and so the condition for giving up) and the return statements are
   on lines not visible here.  No 'middle' pointer appears in the code visible
   in this part of the file, so the last sentence above may be out of date. */
1847 if (lex_source_try_get_pp (src))
/* Tries to move at least one token from the 'pp' stage of SRC_ to its 'merge'
   stage, expanding a macro call on the way if the tokens at the front of 'pp'
   form one.  With macro expansion disabled, everything in 'pp' is shifted to
   'merge' unchanged.  On the macro path the function returns true only if the
   expansion produced at least one token.
   NOTE(review): the return type, the return statements of the other paths,
   and several short statements are on lines not visible here. */
1853 lex_source_try_get_merge (const struct lex_source *src_)
/* SRC_ is const only in the signature; the function modifies the source. */
1855 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Make sure 'pp' holds at least one token. */
1857 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1860 if (!settings_get_mexpand ())
1862 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1866 /* Now pass tokens one-by-one to the macro expander.
1868 In the common case where there is no macro to expand, the loop is not
1870 struct macro_call *mc;
1871 int n_call = macro_call_create (src->lexer->macros,
1872 &lex_stage_first (&src->pp)->token, &mc);
1873 for (int ofs = 1; !n_call; ofs++)
1875 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1877 /* This should not be reachable because we always get a T_ENDCMD at
1878 the end of an input file (transformed from T_STOP by
1879 lex_source_try_get_pp()) and the macro_expander should always
1880 terminate expansion on T_ENDCMD. */
1884 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1885 const struct macro_token mt = {
1887 .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
1889 const struct msg_location loc = lex_token_location (src, t, t);
1890 n_call = macro_call_add (mc, &mt, &loc);
1894 /* False alarm: no macro expansion after all. Use first token as
1895 lookahead. We'll retry macro expansion from the second token next
1897 macro_call_destroy (mc);
1898 lex_stage_shift (&src->merge, &src->pp, 1);
1902 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1903 are a macro call. (These are likely to be the only tokens in 'pp'.)
1905 const struct lex_token *c0 = lex_stage_first (&src->pp);
1906 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1907 struct macro_tokens expansion = { .n = 0 };
1908 struct msg_location loc = lex_token_location (src, c0, c1);
1909 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1910 macro_call_destroy (mc);
1912 /* Convert the macro expansion into syntax for possible error messages
1914 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1915 size_t *len = xnmalloc (expansion.n, sizeof *len);
1916 struct string s = DS_EMPTY_INITIALIZER;
1917 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1919 if (settings_get_mprint ())
1920 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1921 _("Macro Expansion")));
1923 /* Append the macro expansion tokens to the lookahead. */
1924 if (expansion.n > 0)
/* All tokens of one expansion share the same 'macro_rep' text and one
   reference count, which starts at the number of tokens. */
1926 char *macro_rep = ds_steal_cstr (&s);
1927 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1928 *ref_cnt = expansion.n;
1929 for (size_t i = 0; i < expansion.n; i++)
1931 struct lex_token *token = xmalloc (sizeof *token);
/* Every expanded token is given the position and length of the whole macro
   call, C0 through C1, in SRC's buffer.  NOTE(review): the remaining
   initializers are on lines not visible here. */
1932 *token = (struct lex_token) {
1933 .token = expansion.mts[i].token,
1934 .token_pos = c0->token_pos,
1935 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1936 .macro_rep = macro_rep,
1941 lex_stage_push_last (&src->merge, token);
1943 ss_dealloc (&expansion.mts[i].syntax);
1948 free (expansion.mts);
1952 /* Destroy the tokens for the call. */
1953 for (size_t i = 0; i < n_call; i++)
1954 lex_stage_pop_first (&src->pp);
/* An expansion that produced no tokens counts as "nothing obtained". */
1956 return expansion.n > 0;
1959 /* Attempts to obtain at least one new token into 'merge' in SRC.
1961 Returns true if successful, false on failure. In the latter case, SRC is
1962 exhausted and 'src->eof' is now true. */
1964 lex_source_get_merge (struct lex_source *src)
/* Built on lex_source_try_get_merge().  NOTE(review): the loop header and the
   return statements are on lines not visible here.  Elsewhere in this part of
   the file the end-of-input flag is spelled SRC->reader->eof. */
1967 if (lex_source_try_get_merge (src))
1972 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1974 Returns true if successful, false on failure. In the latter case, SRC is
1975 exhausted and 'src->eof' is now true. */
1977 lex_source_get_parse (struct lex_source *src)
1979 struct merger m = MERGER_INIT;
/* Feed tokens from the 'merge' stage to the merger one at a time, fetching
   more into 'merge' whenever index I is beyond what is queued. */
1981 for (size_t i = 0; ; i++)
1983 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1985 /* We always get a T_ENDCMD at the end of an input file
1986 (transformed from T_STOP by lex_source_try_get_pp()) and
1987 merger_add() should never return -1 on T_ENDCMD. */
1988 assert (lex_stage_is_empty (&src->merge));
/* NOTE(review): the last argument of merger_add() and the test that selects
   the branch just below are on lines not visible here. */
1992 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
/* In this branch the first 'merge' token moves to the parse stage as is. */
1996 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1999 else if (retval > 0)
2001 /* Add a token that merges all the tokens together. */
2002 const struct lex_token *first = lex_stage_first (&src->merge);
2003 const struct lex_token *last = lex_stage_nth (&src->merge,
/* The merged token keeps macro information only if the first and last tokens
   came from the same macro expansion. */
2005 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2006 struct lex_token *t = xmalloc (sizeof *t);
2007 *t = (struct lex_token) {
2009 .token_pos = first->token_pos,
2010 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2012 /* This works well if all the tokens were not expanded from macros,
2013 or if they came from the same macro expansion. It just gives up
2014 in the other (corner) cases. */
2015 .macro_rep = macro ? first->macro_rep : NULL,
2016 .ofs = macro ? first->ofs : 0,
2017 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2018 .ref_cnt = macro ? first->ref_cnt : NULL,
2022 lex_source_push_parse (src, t);
/* Discard the RETVAL tokens that the merged token replaces. */
2024 for (int i = 0; i < retval; i++)
2025 lex_stage_pop_first (&src->merge);
/* Adds a newly allocated T_ENDCMD token, with every other member zeroed, to
   SRC's parse stage.  The parse stage must be empty on entry. */
2032 lex_source_push_endcmd__ (struct lex_source *src)
2034 assert (src->n_parse == 0);
2036 struct lex_token *token = xmalloc (sizeof *token);
2037 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2038 lex_source_push_parse (src, token);
/* Appends the pointer TOKEN to SRC->parse, growing the array with
   x2nrealloc() first if it is full.  The array stores the pointer itself, not
   a copy of the token. */
2042 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2044 if (src->n_parse >= src->allocated_parse)
2045 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2046 sizeof *src->parse);
2047 src->parse[src->n_parse++] = token;
/* Passes every token in SRC's parse stage to lex_token_destroy(), then resets
   both the token count and the parse offset to 0.  The SRC->parse array
   itself is kept for reuse. */
2051 lex_source_clear_parse (struct lex_source *src)
2053 for (size_t i = 0; i < src->n_parse; i++)
2054 lex_token_destroy (src->parse[i]);
2055 src->n_parse = src->parse_ofs = 0;
/* Allocates and initializes a lex_source for LEXER that reads from READER.
   The line-offset array starts with room for 4 entries, the segmenter is
   initialized in READER's syntax mode, and the parse stage is seeded with a
   T_ENDCMD token.
   NOTE(review): most of the initializers, the initial contents of `lines',
   and the return statement are on lines not visible here. */
2058 static struct lex_source *
2059 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2061 size_t allocated_lines = 4;
2062 size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2065 struct lex_source *src = xmalloc (sizeof *src);
2066 *src = (struct lex_source) {
2069 .segmenter = segmenter_init (reader->syntax, false),
2073 .allocated_lines = allocated_lines,
2076 lex_source_push_endcmd__ (src);
/* Installs, through msg_set_handler(), a message handler whose output
   callback is OUTPUT_MSG and whose lex_source callbacks are this file's
   lex_source_ref(), lex_source_unref() and lex_source_get_line().
   NOTE(review): the rest of OUTPUT_MSG's parameter list and one initializer
   of the handler are on lines not visible here. */
2082 lex_set_message_handler (struct lexer *lexer,
2083 void (*output_msg) (const struct msg *,
2086 struct msg_handler msg_handler = {
2087 .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2089 .lex_source_ref = lex_source_ref,
2090 .lex_source_unref = lex_source_unref,
2091 .lex_source_get_line = lex_source_get_line,
2093 msg_set_handler (&msg_handler);
/* Reference-counting counterpart of lex_source_unref().  SRC_ is const only
   in the signature; the const is cast away in order to update the source.
   NOTE(review): the return type, any null check, the increment itself, and
   the return statement are on lines not visible here; only the sanity check
   on the existing count is. */
2097 lex_source_ref (const struct lex_source *src_)
2099 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2102 assert (src->n_refs > 0);
/* Drops one reference to SRC.  If references remain after the decrement,
   the teardown code below is not reached (NOTE(review): the statement that
   leaves the function is on a line not visible here).  Otherwise the reader
   is destroyed through its class's destroy() function, if it has one, and
   the 'pp', 'merge' and parse stages are emptied.
   NOTE(review): any null check on SRC and the release of the remaining
   members are on lines not visible here. */
2108 lex_source_unref (struct lex_source *src)
2113 assert (src->n_refs > 0);
2114 if (--src->n_refs > 0)
/* Fetch these while SRC->reader can still be dereferenced; it is handed to
   destroy() just below. */
2117 char *file_name = src->reader->file_name;
2118 char *encoding = src->reader->encoding;
2119 if (src->reader->class->destroy != NULL)
2120 src->reader->class->destroy (src->reader);
2125 lex_stage_uninit (&src->pp);
2126 lex_stage_uninit (&src->merge);
2127 lex_source_clear_parse (src);
/* A lex_reader that reads from a u8_istream. */
2132 struct lex_file_reader
/* Embedded base object; lex_file_reader_cast() recovers the enclosing
   lex_file_reader from a pointer to this member. */
2134 struct lex_reader reader;
/* Stream that lex_file_read() reads from and lex_file_close() releases. */
2135 struct u8_istream *istream;
/* Forward declaration; the definition follows the reader's functions. */
2138 static struct lex_reader_class lex_file_reader_class;
2140 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2141 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2142 ENCODING, which should take one of the forms accepted by
2143 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2144 mode of the new reader, respectively.
2146 Returns a null pointer if FILE_NAME cannot be opened. */
2148 lex_reader_for_file (const char *file_name, const char *encoding,
2149 enum segmenter_mode syntax,
2150 enum lex_error_mode error)
2152 struct lex_file_reader *r;
2153 struct u8_istream *istream;
/* "-" selects the existing stdin descriptor; anything else is opened
   read-only by name. */
2155 istream = (!strcmp(file_name, "-")
2156 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2157 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2158 if (istream == NULL)
2160 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
/* The reader keeps its own copies of FILE_NAME and (if nonnull) ENCODING. */
2164 r = xmalloc (sizeof *r);
2165 lex_reader_init (&r->reader, &lex_file_reader_class);
2166 r->reader.syntax = syntax;
2167 r->reader.error = error;
2168 r->reader.file_name = xstrdup (file_name);
2169 r->reader.encoding = xstrdup_if_nonnull (encoding);
2170 r->reader.line_number = 1;
2171 r->istream = istream;
2176 static struct lex_file_reader *
2177 lex_file_reader_cast (struct lex_reader *r)
2179 return UP_CAST (r, struct lex_file_reader, reader);
/* read() callback of the file reader: reads up to N bytes from the reader's
   u8_istream into BUF.  The prompt style is not used.
   NOTE(review): the return type, the test that leads to the error message,
   and the return statements are on lines not visible here. */
2183 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2184 enum prompt_style prompt_style UNUSED)
2186 struct lex_file_reader *r = lex_file_reader_cast (r_);
2187 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2190 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Callback of the file reader that releases its stream.  A stream that is not
   reading stdin is closed with u8_istream_close(), and a failure to close is
   reported.  u8_istream_free() is the other way the stream is released
   (NOTE(review): the `else' that presumably selects it for stdin, and the
   release of the reader itself, are on lines not visible here). */
2197 lex_file_close (struct lex_reader *r_)
2199 struct lex_file_reader *r = lex_file_reader_cast (r_);
2201 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2203 if (u8_istream_close (r->istream) != 0)
2204 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2207 u8_istream_free (r->istream);
/* Class object for file readers.  NOTE(review): the initializer is on lines
   not visible here; presumably it names lex_file_read() and lex_file_close()
   -- confirm against the full file. */
2212 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that reads from an in-memory string.  NOTE(review): only the
   embedded base object is visible here; lex_string_read() also uses members
   named `s' (a string with a length) and `offset'. */
2218 struct lex_string_reader
/* Embedded base object; lex_string_reader_cast() recovers the enclosing
   lex_string_reader from a pointer to this member. */
2220 struct lex_reader reader;
/* Forward declaration; the definition follows the reader's functions. */
2225 static struct lex_reader_class lex_string_reader_class;
2227 /* Creates and returns a new lex_reader for the contents of S, which must be
2228 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2229 with ss_dealloc() when it is closed. */
2231 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2233 struct lex_string_reader *r;
/* String readers always start in SEG_MODE_AUTO and keep their own copy of
   ENCODING if it is nonnull.  NOTE(review): the remaining member
   assignments and the return statement are on lines not visible here. */
2235 r = xmalloc (sizeof *r);
2236 lex_reader_init (&r->reader, &lex_string_reader_class);
2237 r->reader.syntax = SEG_MODE_AUTO;
2238 r->reader.encoding = xstrdup_if_nonnull (encoding);
2245 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2246 which must be encoded in ENCODING. The caller retains ownership of S. */
2248 lex_reader_for_string (const char *s, const char *encoding)
/* Make a private copy of S; ownership of the copy passes to the reader. */
2250 struct substring ss;
2251 ss_alloc_substring (&ss, ss_cstr (s));
2252 return lex_reader_for_substring_nocopy (ss, encoding);
2255 /* Formats FORMAT as a printf()-like format string and creates and returns a
2256 new lex_reader for the formatted result. */
2258 lex_reader_for_format (const char *format, const char *encoding, ...)
2260 struct lex_reader *r;
/* The variable arguments follow ENCODING, not FORMAT, as the va_start() call
   shows.  The string returned by xvasprintf() is handed over to the reader,
   which takes ownership of it.  NOTE(review): the declaration of `args', the
   matching va_end(), and the return statement are on lines not visible
   here. */
2263 va_start (args, encoding);
2264 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
2270 static struct lex_string_reader *
2271 lex_string_reader_cast (struct lex_reader *r)
2273 return UP_CAST (r, struct lex_string_reader, reader);
/* read() callback of the string reader: copies into BUF as many bytes as fit
   in N, limited to what remains of the reader's string beyond its current
   offset.  The prompt style is not used.
   NOTE(review): the return type, the declaration of `chunk', the update of
   the offset, and the return statement are on lines not visible here. */
2277 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2278 enum prompt_style prompt_style UNUSED)
2280 struct lex_string_reader *r = lex_string_reader_cast (r_);
2283 chunk = MIN (n, r->s.length - r->offset);
2284 memcpy (buf, r->s.string + r->offset, chunk);
/* Callback of the string reader that releases it.  NOTE(review): the
   statements that do the releasing are on lines not visible here; the comment
   on lex_reader_for_substring_nocopy() says that the string is freed with
   ss_dealloc() when the reader is closed. */
2291 lex_string_close (struct lex_reader *r_)
2293 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Class object for string readers.  NOTE(review): the initializer is on lines
   not visible here; presumably it names lex_string_read() and
   lex_string_close() -- confirm against the full file. */
2299 static struct lex_reader_class lex_string_reader_class =
/* Returns the text of 1-based line LINE of SRC as a substring that points
   into SRC's buffer.  The line runs from the offset recorded in
   SRC->lines[LINE - 1] up to the recorded start of the next line or, for the
   last recorded line, up to SRC->length.
   NOTE(review): the return type and the value returned when LINE is out of
   range are on lines not visible here. */
2306 lex_source_get_line (const struct lex_source *src, int line)
2308 if (line < 1 || line > src->n_lines)
2311 size_t ofs = src->lines[line - 1];
2312 size_t end = line >= src->n_lines ? src->length : src->lines[line];
2313 return ss_buffer (&src->buffer[ofs], end - ofs);