1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/output-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* For a token obtained through the lexer in an ordinary way, this is the
65 location of the token in terms of the lex_source's buffer.
67 For a token produced through macro expansion, this is the entire macro
69 size_t token_pos; /* Offset into src->buffer of token start. */
70 size_t token_len; /* Length of source for token in bytes. */
71 int first_line; /* Line number at token_pos. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
/* Tears down T.  The embedded token is uninitialized with token_uninit().
   The assertion guards against a reference count (the count of lex_tokens
   sharing macro_rep) that has already reached zero. */
84 lex_token_destroy (struct lex_token *t)
86 token_uninit (&t->token);
89 assert (*t->ref_cnt > 0);
99 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
104 struct lex_token **tokens;
107 static void lex_stage_clear (struct lex_stage *);
108 static void lex_stage_uninit (struct lex_stage *);
110 static size_t lex_stage_count (const struct lex_stage *);
111 static bool lex_stage_is_empty (const struct lex_stage *);
113 static struct lex_token *lex_stage_first (struct lex_stage *);
114 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
116 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
117 static void lex_stage_pop_first (struct lex_stage *);
119 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
122 /* Deletes all the tokens from STAGE. */
124 lex_stage_clear (struct lex_stage *stage)
126 while (!deque_is_empty (&stage->deque))
127 lex_stage_pop_first (stage);
130 /* Deletes all the tokens from STAGE and frees storage for the deque. */
132 lex_stage_uninit (struct lex_stage *stage)
134 lex_stage_clear (stage);
135 free (stage->tokens);
138 /* Returns true if STAGE contains no tokens, otherwise false. */
140 lex_stage_is_empty (const struct lex_stage *stage)
142 return deque_is_empty (&stage->deque);
145 /* Returns the number of tokens in STAGE. */
147 lex_stage_count (const struct lex_stage *stage)
149 return deque_count (&stage->deque);
152 /* Returns the first token in STAGE, which must be nonempty.
153 The first token is the one accessed with the least lookahead. */
154 static struct lex_token *
155 lex_stage_first (struct lex_stage *stage)
157 return lex_stage_nth (stage, 0);
160 /* Returns the token at the given INDEX in STAGE. The first token (with the least
161 lookahead) is 0, the second token is 1, and so on. There must be at least
162 INDEX + 1 tokens in STAGE. */
163 static struct lex_token *
164 lex_stage_nth (struct lex_stage *stage, size_t index)
166 return stage->tokens[deque_back (&stage->deque, index)];
169 /* Adds TOKEN so that it becomes the last token in STAGE. */
171 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
173 if (deque_is_full (&stage->deque))
174 stage->tokens = deque_expand (&stage->deque, stage->tokens,
175 sizeof *stage->tokens);
176 stage->tokens[deque_push_front (&stage->deque)] = token;
179 /* Removes and returns the first token from STAGE. */
180 static struct lex_token *
181 lex_stage_take_first (struct lex_stage *stage)
183 return stage->tokens[deque_pop_back (&stage->deque)];
186 /* Removes the first token from STAGE and uninitializes it. */
188 lex_stage_pop_first (struct lex_stage *stage)
190 lex_token_destroy (lex_stage_take_first (stage));
193 /* Removes the first N tokens from SRC, appending them to DST as the last
196 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
198 for (size_t i = 0; i < n; i++)
199 lex_stage_push_last (dst, lex_stage_take_first (src));
202 /* A source of tokens, corresponding to a syntax file.
204 This is conceptually a lex_reader wrapped with everything needed to convert
205 its UTF-8 bytes into tokens. */
208 struct ll ll; /* In lexer's list of sources. */
209 struct lex_reader *reader;
211 struct segmenter segmenter;
212 bool eof; /* True if T_STOP was read from 'reader'. */
214 /* Buffer of UTF-8 bytes. */
215 char *buffer; /* Source file contents. */
216 size_t length; /* Number of bytes filled. */
217 size_t allocated; /* Number of bytes allocated. */
219 /* Offsets into 'buffer'. */
220 size_t journal_pos; /* First byte not yet output to journal. */
221 size_t seg_pos; /* First byte not yet scanned as token. */
223 int n_newlines; /* Number of new-lines up to seg_pos. */
224 bool suppress_next_newline;
228 This is a pipeline with the following stages. Each token eventually
229 made available to the parser passes through all of these stages. The stages
230 are named after the processing that happens in each one.
232 Initially, tokens come from the segmenter and scanner to 'pp':
234 - pp: Tokens that need to pass through the macro preprocessor to end up
237 - merge: Tokens that need to pass through scan_merge() to end up in
240 - parse: Tokens available to the client for parsing.
242 'pp' and 'merge' store tokens only temporarily until they pass into
243 'parse'. Tokens then live in 'parse' until the command is fully
244 consumed, at which time they are freed together. */
246 struct lex_stage merge;
247 struct lex_token **parse;
248 size_t n_parse, allocated_parse, parse_ofs;
251 static struct lex_source *lex_source_create (struct lexer *,
252 struct lex_reader *);
253 static void lex_source_destroy (struct lex_source *);
258 struct ll_list sources; /* Contains "struct lex_source"s. */
259 struct macro_set *macros;
262 static struct lex_source *lex_source__ (const struct lexer *);
263 static char *lex_source_get_syntax__ (const struct lex_source *,
265 static const struct lex_token *lex_next__ (const struct lexer *, int n);
266 static void lex_source_push_endcmd__ (struct lex_source *);
267 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
268 static void lex_source_clear_parse (struct lex_source *);
270 static bool lex_source_get_parse (struct lex_source *);
271 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
272 const char *format, va_list)
273 PRINTF_FORMAT (4, 0);
274 static const struct lex_token *lex_source_next__ (const struct lex_source *,
277 /* Initializes READER with the specified CLASS and otherwise some reasonable
278 defaults. The caller should fill in the other members as desired. */
280 lex_reader_init (struct lex_reader *reader,
281 const struct lex_reader_class *class)
283 reader->class = class;
284 reader->syntax = SEG_MODE_AUTO;
285 reader->error = LEX_ERROR_CONTINUE;
286 reader->file_name = NULL;
287 reader->encoding = NULL;
288 reader->line_number = 0;
292 /* Frees any file name already in READER and replaces it by a copy of
293 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
295 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
297 free (reader->file_name);
298 reader->file_name = xstrdup_if_nonnull (file_name);
301 /* Creates and returns a new lexer. */
305 struct lexer *lexer = xmalloc (sizeof *lexer);
306 *lexer = (struct lexer) {
307 .sources = LL_INITIALIZER (lexer->sources),
308 .macros = macro_set_create (),
313 /* Destroys LEXER. */
315 lex_destroy (struct lexer *lexer)
319 struct lex_source *source, *next;
321 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
322 lex_source_destroy (source);
323 macro_set_destroy (lexer->macros);
328 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
329 same name. Takes ownership of M. */
331 lex_define_macro (struct lexer *lexer, struct macro *m)
333 macro_set_add (lexer->macros, m);
336 /* Inserts READER into LEXER so that the next token read by LEXER comes from
337 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
340 lex_include (struct lexer *lexer, struct lex_reader *reader)
342 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
343 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
346 /* Appends READER to LEXER, so that it will be read after all other current
347 readers have already been read. */
349 lex_append (struct lexer *lexer, struct lex_reader *reader)
351 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
356 /* Advances LEXER to the next token, consuming the current token.

   Consuming a T_ENDCMD token clears the whole 'parse' array of the current
   source.  Afterward, while that source has no unconsumed token, more are
   requested with lex_source_get_parse(); a source that cannot supply any is
   destroyed and LEXER's current source is looked up again. */
358 lex_get (struct lexer *lexer)
360 struct lex_source *src;
362 src = lex_source__ (lexer);
/* Consume the current token, if there is one. */
366 if (src->parse_ofs < src->n_parse)
368 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
369 lex_source_clear_parse (src);
/* Refill until some unconsumed token is available. */
374 while (src->parse_ofs == src->n_parse)
375 if (!lex_source_get_parse (src))
377 lex_source_destroy (src);
378 src = lex_source__ (lexer);
384 /* Advances LEXER by N tokens. */
386 lex_get_n (struct lexer *lexer, size_t n)
392 /* Issuing errors. */
394 /* Prints a syntax error message containing the current token and
395 the message formatted from FORMAT (if non-null). */
397 lex_error (struct lexer *lexer, const char *format, ...)
401 va_start (args, format);
402 lex_next_error_valist (lexer, 0, 0, format, args);
406 /* Prints a syntax error message containing the current token and
407 the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
409 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
411 lex_next_error_valist (lexer, 0, 0, format, args);
414 /* Prints a syntax error message for the tokens N0 through N1 ahead of the
415 current one and the message formatted from FORMAT (if non-null). */
417 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
421 va_start (args, format);
422 lex_next_error_valist (lexer, n0, n1, format, args);
426 /* Prints a syntax error message saying that one of the strings provided as
427 varargs, up to the first NULL, is expected. */
429 (lex_error_expecting) (struct lexer *lexer, ...)
433 va_start (args, lexer);
434 lex_error_expecting_valist (lexer, args);
438 /* Prints a syntax error message saying that one of the options provided in
439 ARGS, up to the first NULL, is expected. */
441 lex_error_expecting_valist (struct lexer *lexer, va_list args)
443 enum { MAX_OPTIONS = 9 };
444 const char *options[MAX_OPTIONS];
446 while (n < MAX_OPTIONS)
448 const char *option = va_arg (args, const char *);
452 options[n++] = option;
454 lex_error_expecting_array (lexer, options, n);
/* Reports a syntax error saying that one of the N strings in OPTIONS is
   expected.  A dedicated, separately translatable message exists for each
   count from one through eight options; the remaining calls pass a null
   format to lex_error() and so add no "expecting" text. */
458 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
463 lex_error (lexer, NULL);
467 lex_error (lexer, _("expecting %s"), options[0]);
471 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
475 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
480 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
481 options[0], options[1], options[2], options[3]);
485 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
486 options[0], options[1], options[2], options[3], options[4]);
490 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
491 options[0], options[1], options[2], options[3], options[4],
496 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
497 options[0], options[1], options[2], options[3], options[4],
498 options[5], options[6]);
502 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
503 options[0], options[1], options[2], options[3], options[4],
504 options[5], options[6], options[7]);
508 lex_error (lexer, NULL);
512 /* Reports an error to the effect that subcommand SBC may only be specified
515 This function does not take a lexer as an argument or use lex_error(),
516 because the result would ordinarily just be redundant: "Syntax error at
517 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
518 not help the user find the error. */
520 lex_sbc_only_once (const char *sbc)
522 msg (SE, _("Subcommand %s may only be specified once."), sbc);
525 /* Reports an error to the effect that subcommand SBC is missing.
527 This function does not take a lexer as an argument or use lex_error(),
528 because a missing subcommand can normally be detected only after the whole
529 command has been parsed, and so lex_error() would always report "Syntax
530 error at end of command", which does not help the user find the error. */
532 lex_sbc_missing (const char *sbc)
534 msg (SE, _("Required subcommand %s was not specified."), sbc);
537 /* Reports an error to the effect that specification SPEC may only be specified
538 once within subcommand SBC. */
540 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
542 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
546 /* Reports an error to the effect that specification SPEC is missing within
549 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
551 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
555 /* Prints a syntax error message for the tokens identified by N0 and N1
556 (offsets from the current token) and the message formatted from FORMAT and
   ARGS.  For LEXER's current source the report is delegated to
   lex_source_error_valist().  A separate "Syntax error at end of input"
   message is also built here and issued through msg(). */
558 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
559 const char *format, va_list args)
561 struct lex_source *src = lex_source__ (lexer);
564 lex_source_error_valist (src, n0, n1, format, args);
/* Message used when the error is reported at end of input. */
570 ds_put_format (&s, _("Syntax error at end of input"));
573 ds_put_cstr (&s, ": ");
574 ds_put_vformat (&s, format, args);
/* Terminate the message with a period unless it already ends in one. */
576 if (ds_last (&s) != '.')
577 ds_put_byte (&s, '.');
578 msg (SE, "%s", ds_cstr (&s));
583 /* Checks that we're at end of command.
584 If so, returns a successful command completion code.
585 If not, flags a syntax error and returns an error command
588 lex_end_of_command (struct lexer *lexer)
590 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
592 lex_error (lexer, _("expecting end of command"));
599 /* Token testing functions. */
601 /* Returns true if the current token is a number. */
603 lex_is_number (const struct lexer *lexer)
605 return lex_next_is_number (lexer, 0);
608 /* Returns true if the current token is a string. */
610 lex_is_string (const struct lexer *lexer)
612 return lex_next_is_string (lexer, 0);
615 /* Returns the value of the current token, which must be a
616 floating point number. */
618 lex_number (const struct lexer *lexer)
620 return lex_next_number (lexer, 0);
623 /* Returns true iff the current token is an integer. */
625 lex_is_integer (const struct lexer *lexer)
627 return lex_next_is_integer (lexer, 0);
630 /* Returns the value of the current token, which must be an
633 lex_integer (const struct lexer *lexer)
635 return lex_next_integer (lexer, 0);
638 /* Token testing functions with lookahead.
640 A value of 0 for N as an argument to any of these functions refers to the
641 current token. Lookahead is limited to the current command. Any N greater
642 than the number of tokens remaining in the current command will be treated
643 as referring to a T_ENDCMD token. */
645 /* Returns true if the token N ahead of the current token is a number. */
647 lex_next_is_number (const struct lexer *lexer, int n)
649 return token_is_number (lex_next (lexer, n));
652 /* Returns true if the token N ahead of the current token is a string. */
654 lex_next_is_string (const struct lexer *lexer, int n)
656 return token_is_string (lex_next (lexer, n));
659 /* Returns the value of the token N ahead of the current token, which must be a
660 floating point number. */
662 lex_next_number (const struct lexer *lexer, int n)
664 return token_number (lex_next (lexer, n));
667 /* Returns true if the token N ahead of the current token is an integer. */
669 lex_next_is_integer (const struct lexer *lexer, int n)
671 return token_is_integer (lex_next (lexer, n));
674 /* Returns the value of the token N ahead of the current token, which must be
677 lex_next_integer (const struct lexer *lexer, int n)
679 return token_integer (lex_next (lexer, n));
682 /* Token matching functions. */
684 /* If the current token has the specified TYPE, skips it and returns true.
685 Otherwise, returns false. */
687 lex_match (struct lexer *lexer, enum token_type type)
689 if (lex_token (lexer) == type)
698 /* If the current token matches IDENTIFIER, skips it and returns true.
699 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
702 IDENTIFIER must be an ASCII string. */
704 lex_match_id (struct lexer *lexer, const char *identifier)
706 return lex_match_id_n (lexer, identifier, 3);
709 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
710 may be abbreviated to its first N letters. Otherwise, returns false.
712 IDENTIFIER must be an ASCII string. */
714 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
716 if (lex_token (lexer) == T_ID
717 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
726 /* If the current token is integer X, skips it and returns true. Otherwise,
729 lex_match_int (struct lexer *lexer, int x)
731 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
740 /* Forced matches. */
742 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
743 abbreviated to its first 3 letters. Otherwise, reports an error and returns
746 IDENTIFIER must be an ASCII string. */
748 lex_force_match_id (struct lexer *lexer, const char *identifier)
750 if (lex_match_id (lexer, identifier))
754 lex_error_expecting (lexer, identifier);
759 /* If the current token has the specified TYPE, skips it and returns true.
760 Otherwise, reports an error and returns false. */
762 lex_force_match (struct lexer *lexer, enum token_type type)
764 if (lex_token (lexer) == type)
771 const char *type_string = token_type_to_string (type);
774 char *s = xasprintf ("`%s'", type_string);
775 lex_error_expecting (lexer, s);
779 lex_error_expecting (lexer, token_type_to_name (type));
785 /* If the current token is a string, does nothing and returns true.
786 Otherwise, reports an error and returns false. */
788 lex_force_string (struct lexer *lexer)
790 if (lex_is_string (lexer))
794 lex_error (lexer, _("expecting string"));
799 /* If the current token is a string or an identifier, does nothing and returns
800 true. Otherwise, reports an error and returns false.
802 This is meant for use in syntactic situations where we want to encourage the
803 user to supply a quoted string, but for compatibility we also accept
804 identifiers. (One example of such a situation is file names.) Therefore,
805 the error message issued when the current token is wrong only says that a
806 string is expected and doesn't mention that an identifier would also be
809 lex_force_string_or_id (struct lexer *lexer)
811 return lex_token (lexer) == T_ID || lex_force_string (lexer);
814 /* If the current token is an integer, does nothing and returns true.
815 Otherwise, reports an error and returns false. */
817 lex_force_int (struct lexer *lexer)
819 if (lex_is_integer (lexer))
823 lex_error (lexer, _("expecting integer"));
828 /* If the current token is an integer in the range MIN...MAX (inclusive), does
829 nothing and returns true. Otherwise, reports an error and returns false.
830 If NAME is nonnull, then it is used in the error message. */
832 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
834 bool is_integer = lex_is_integer (lexer);
835 bool too_small = is_integer && lex_integer (lexer) < min;
836 bool too_big = is_integer && lex_integer (lexer) > max;
837 if (is_integer && !too_small && !too_big)
842 /* Weird, maybe a bug in the caller. Just report that we needed an
845 lex_error (lexer, _("Integer expected for %s."), name);
847 lex_error (lexer, _("Integer expected."));
852 lex_error (lexer, _("Expected %ld for %s."), min, name);
854 lex_error (lexer, _("Expected %ld."), min);
856 else if (min + 1 == max)
859 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
861 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
865 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
866 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
868 if (report_lower_bound && report_upper_bound)
872 _("Expected integer between %ld and %ld for %s."),
875 lex_error (lexer, _("Expected integer between %ld and %ld."),
878 else if (report_lower_bound)
883 lex_error (lexer, _("Expected non-negative integer for %s."),
886 lex_error (lexer, _("Expected non-negative integer."));
891 lex_error (lexer, _("Expected positive integer for %s."),
894 lex_error (lexer, _("Expected positive integer."));
897 else if (report_upper_bound)
901 _("Expected integer less than or equal to %ld for %s."),
904 lex_error (lexer, _("Expected integer less than or equal to %ld."),
910 lex_error (lexer, _("Integer expected for %s."), name);
912 lex_error (lexer, _("Integer expected."));
918 /* If the current token is a number, does nothing and returns true.
919 Otherwise, reports an error and returns false. */
921 lex_force_num (struct lexer *lexer)
923 if (lex_is_number (lexer))
926 lex_error (lexer, _("expecting number"));
930 /* If the current token is an identifier, does nothing and returns true.
931 Otherwise, reports an error and returns false. */
933 lex_force_id (struct lexer *lexer)
935 if (lex_token (lexer) == T_ID)
938 lex_error (lexer, _("expecting identifier"));
942 /* Token accessors. */
944 /* Returns the type of LEXER's current token. */
946 lex_token (const struct lexer *lexer)
948 return lex_next_token (lexer, 0);
951 /* Returns the number in LEXER's current token.
953 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
954 tokens this function will always return zero. */
956 lex_tokval (const struct lexer *lexer)
958 return lex_next_tokval (lexer, 0);
961 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
963 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
964 this function will always return NULL.
966 The UTF-8 encoding of the returned string is correct for variable names and
967 other identifiers. Use filename_to_utf8() to use it as a filename. Use
968 data_in() to use it in a "union value". */
970 lex_tokcstr (const struct lexer *lexer)
972 return lex_next_tokcstr (lexer, 0);
975 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
976 null-terminated (but the null terminator is not included in the returned
977 substring's 'length').
979 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
980 this function will always return NULL.
982 The UTF-8 encoding of the returned string is correct for variable names and
983 other identifiers. Use filename_to_utf8() to use it as a filename. Use
984 data_in() to use it in a "union value". */
986 lex_tokss (const struct lexer *lexer)
988 return lex_next_tokss (lexer, 0);
993 A value of 0 for N as an argument to any of these functions refers to the
994 current token. Lookahead is limited to the current command. Any N greater
995 than the number of tokens remaining in the current command will be treated
996 as referring to a T_ENDCMD token. */
/* Returns the lex_token N tokens ahead of the current one, obtained from
   LEXER's current source with lex_source_next__().  A static T_STOP token is
   also defined here; NOTE(review): presumably it is what gets returned when
   LEXER has no source -- confirm. */
998 static const struct lex_token *
999 lex_next__ (const struct lexer *lexer_, int n)
/* lex_source__() takes a const lexer, so this cast is not needed for that
   call. */
1001 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1002 struct lex_source *src = lex_source__ (lexer);
1005 return lex_source_next__ (src, n);
1008 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
/* Returns the token at offset N from the current one in SRC.

   A backward offset indexes into the tokens held before parse_ofs in SRC's
   'parse' array; a static T_ENDCMD token stands in when that many are not
   held.  Otherwise, tokens are fetched with lex_source_get_parse() until the
   one at offset N is available, with special handling once the last token
   fetched is T_STOP or T_ENDCMD. */
1013 static const struct lex_token *
1014 lex_source_next__ (const struct lex_source *src_, int n)
/* SRC_ is const for callers, but lookahead may need to read more tokens into
   the source, so a writable pointer is required. */
1016 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Step -N tokens back from parse_ofs, if that many are held. */
1020 if (-n <= src->parse_ofs)
1021 return src->parse[src->parse_ofs - (-n)];
1024 static const struct lex_token endcmd_token
1025 = { .token = { .type = T_ENDCMD } };
1026 return &endcmd_token;
/* Fetch tokens until the one at offset N is available. */
1030 while (src->n_parse - src->parse_ofs <= n)
1032 if (src->n_parse > 0)
1034 const struct lex_token *t = src->parse[src->n_parse - 1];
1035 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1039 lex_source_get_parse (src);
1042 return src->parse[src->parse_ofs + n];
1045 /* Returns the "struct token" of the token N after the current one in LEXER.
1046 The returned pointer can be invalidated by pretty much any succeeding call
1047 into the lexer, although the string pointer within the returned token is
1048 only invalidated by consuming the token (e.g. with lex_get()). */
1049 const struct token *
1050 lex_next (const struct lexer *lexer, int n)
1052 return &lex_next__ (lexer, n)->token;
1055 /* Returns the type of the token N after the current one in LEXER. */
1057 lex_next_token (const struct lexer *lexer, int n)
1059 return lex_next (lexer, n)->type;
1062 /* Returns the number in the token N after the current one in LEXER.
1064 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1065 tokens this function will always return zero. */
1067 lex_next_tokval (const struct lexer *lexer, int n)
1069 return token_number (lex_next (lexer, n));
1072 /* Returns the null-terminated string in the token N after the current one, in
1075 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1076 this function will always return NULL.
1078 The UTF-8 encoding of the returned string is correct for variable names and
1079 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1080 data_in() to use it in a "union value". */
1082 lex_next_tokcstr (const struct lexer *lexer, int n)
1084 return lex_next_tokss (lexer, n).string;
1087 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1088 The string is null-terminated (but the null terminator is not included in
1089 the returned substring's 'length').
1091 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1092 tokens this function will always return NULL.
1094 The UTF-8 encoding of the returned string is correct for variable names and
1095 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1096 data_in() to use it in a "union value". */
1098 lex_next_tokss (const struct lexer *lexer, int n)
1100 return lex_next (lexer, n)->string;
1103 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1104 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1105 are both zero, this requests the syntax for the current token.) The caller
1106 must eventually free the returned string (with free()). The syntax is
1107 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1108 example, it may include comments, spaces, and new-lines if it spans multiple
1109 tokens. Macro expansion, however, has already been performed. */
1111 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1113 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1116 /* Returns true if the token N ahead of the current one was produced by macro
1117 expansion, false otherwise. */
1119 lex_next_is_from_macro (const struct lexer *lexer, int n)
1121 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Compares token ACTUAL against token EXPECTED.  The two types are checked
   for equality first.  Beyond that, depending on the token type, the
   comparison is by numeric value, by lex_id_match() on the two strings (with
   EXPECTED's string passed first), or by exact equality of the strings'
   lengths and bytes. */
1125 lex_tokens_match (const struct token *actual, const struct token *expected)
1127 if (actual->type != expected->type)
1130 switch (actual->type)
1134 return actual->number == expected->number;
1137 return lex_id_match (expected->string, actual->string);
1140 return (actual->string.length == expected->string.length
1141 && !memcmp (actual->string.string, expected->string.string,
1142 actual->string.length));
/* Tokenizes S with a string_lexer in SEG_MODE_INTERACTIVE and compares each
   token it yields, in order, against successive lookahead tokens of LEXER
   using lex_tokens_match().  Each token parsed from S is uninitialized right
   after its comparison.

   Callers in this file treat a positive result as a match and pass it to
   lex_get_n() as the number of tokens to skip. */
1150 lex_at_phrase__ (struct lexer *lexer, const char *s)
1152 struct string_lexer slex;
1156 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1157 while (string_lexer_next (&slex, &token))
1159 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1160 token_uninit (&token);
1167 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1168 returns true. Otherwise, returns false.
1170 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1171 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1172 first three letters. */
1174 lex_at_phrase (struct lexer *lexer, const char *s)
1176 return lex_at_phrase__ (lexer, s) > 0;
1179 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1180 skips it and returns true. Otherwise, returns false.
1182 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1183 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1184 first three letters. */
1186 lex_match_phrase (struct lexer *lexer, const char *s)
1188 size_t n = lex_at_phrase__ (lexer, s);
1190 lex_get_n (lexer, n);
/* Counts the new-line bytes in the LENGTH bytes starting at S.  Each one is
   located with memchr(), after which the remaining length is reduced to the
   bytes that follow it. */
1195 count_newlines (char *s, size_t length)
1200 while ((newline = memchr (s, '\n', length)) != NULL)
1203 length -= (newline + 1) - s;
/* Returns the line number just past the last line of TOKEN's source text in
   SRC: TOKEN's first line, plus the number of new-lines among the token's
   bytes in SRC's buffer, plus 1.  A first_line of 0 is handled separately
   before the buffer is examined. */
1211 lex_token_get_last_line_number (const struct lex_source *src,
1212 const struct lex_token *token)
1214 if (token->first_line == 0)
1218 char *token_str = &src->buffer[token->token_pos];
1219 return token->first_line + count_newlines (token_str, token->token_len) + 1;
/* Returns the 1-based column of byte OFFSET in SRC's buffer.  The line
   containing OFFSET starts just after the last new-line before OFFSET, or at
   the start of the buffer if there is none.  The result is 1 plus the value
   of utf8_count_columns() for the bytes from the start of that line up to
   OFFSET. */
1224 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1226 const char *newline = memrchr (src->buffer, '\n', offset);
1227 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1228 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
/* Returns the 1-based column in SRC's buffer at which TOKEN's source text
   begins (the column of byte offset token_pos). */
1232 lex_token_get_first_column (const struct lex_source *src,
1233 const struct lex_token *token)
1235 return lex_token_get_column__ (src, token->token_pos);
/* Returns the 1-based column in SRC's buffer of the byte just past the end of
   TOKEN's source text (byte offset token_pos + token_len). */
1239 lex_token_get_last_column (const struct lex_source *src,
1240 const struct lex_token *token)
1242 return lex_token_get_column__ (src, token->token_pos + token->token_len);
/* Returns, by value, the location in SRC spanned by tokens T0 through T1.
   The starting line and column come from T0 and the ending line and column
   from T1.  The file name is the pointer held by SRC's reader, not a copy. */
1245 static struct msg_location
1246 lex_token_location (const struct lex_source *src,
1247 const struct lex_token *t0,
1248 const struct lex_token *t1)
1250 return (struct msg_location) {
1251 .file_name = src->reader->file_name,
1252 .first_line = t0->first_line,
1253 .last_line = lex_token_get_last_line_number (src, t1),
1254 .first_column = lex_token_get_first_column (src, t0),
1255 .last_column = lex_token_get_last_column (src, t1),
/* Like lex_token_location(), but returns a pointer to a duplicate of the
   location made with msg_location_dup() instead of returning it by value. */
1259 static struct msg_location *
1260 lex_token_location_rw (const struct lex_source *src,
1261 const struct lex_token *t0,
1262 const struct lex_token *t1)
1264 struct msg_location location = lex_token_location (src, t0, t1);
1265 return msg_location_dup (&location);
/* Returns the location, as produced by lex_token_location_rw(), that spans
   the tokens at offsets N0 through N1 from the current token in SRC.  Both
   tokens are looked up with lex_source_next__(). */
1268 static struct msg_location *
1269 lex_source_get_location (const struct lex_source *src, int n0, int n1)
1271 return lex_token_location_rw (src,
1272 lex_source_next__ (src, n0),
1273 lex_source_next__ (src, n1));
1276 /* Returns the 1-based line number of the start of the syntax that represents
1277 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1278 if the token is drawn from a source that does not have line numbers. */
1280 lex_get_first_line_number (const struct lexer *lexer, int n)
1282 const struct lex_source *src = lex_source__ (lexer);
1283 return src ? lex_source_next__ (src, n)->first_line : 0;
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   source (as for a T_STOP token) or if the token is drawn from a source that
   does not have line numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   source (as for a T_STOP token).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no source (as for a T_STOP token).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1332 /* Returns the name of the syntax file from which the current command is drawn.
1333 Returns NULL for a T_STOP token or if the command's source does not have
1336 There is no version of this function that takes an N argument because
1337 lookahead only works to the end of a command and any given command is always
1338 within a single syntax file. */
1340 lex_get_file_name (const struct lexer *lexer)
1342 struct lex_source *src = lex_source__ (lexer);
1343 return src == NULL ? NULL : src->reader->file_name;
1346 /* Returns a newly allocated msg_location for the syntax that represents tokens
1347 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1348 must eventually free the location (with msg_location_destroy()). */
1349 struct msg_location *
1350 lex_get_location (const struct lexer *lexer, int n0, int n1)
1352 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1353 loc->first_column = lex_get_first_column (lexer, n0);
1354 loc->last_column = lex_get_last_column (lexer, n1);
1358 /* Returns a newly allocated msg_location for the syntax that represents tokens
1359 with 0-based offsets N0...N1, inclusive, from the current token. The
1360 location only covers the tokens' lines, not the columns. The caller must
1361 eventually free the location (with msg_location_destroy()). */
1362 struct msg_location *
1363 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1365 struct msg_location *loc = xmalloc (sizeof *loc);
1366 *loc = (struct msg_location) {
1367 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1368 .first_line = lex_get_first_line_number (lexer, n0),
1369 .last_line = lex_get_last_line_number (lexer, n1),
1375 lex_get_encoding (const struct lexer *lexer)
1377 struct lex_source *src = lex_source__ (lexer);
1378 return src == NULL ? NULL : src->reader->encoding;
1381 /* Returns the syntax mode for the syntax file from which the current drawn is
1382 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1383 does not have line numbers.
1385 There is no version of this function that takes an N argument because
1386 lookahead only works to the end of a command and any given command is always
1387 within a single syntax file. */
1389 lex_get_syntax_mode (const struct lexer *lexer)
1391 struct lex_source *src = lex_source__ (lexer);
1392 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1395 /* Returns the error mode for the syntax file from which the current drawn is
1396 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1397 source does not have line numbers.
1399 There is no version of this function that takes an N argument because
1400 lookahead only works to the end of a command and any given command is always
1401 within a single syntax file. */
1403 lex_get_error_mode (const struct lexer *lexer)
1405 struct lex_source *src = lex_source__ (lexer);
1406 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1409 /* If the source that LEXER is currently reading has error mode
1410 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1411 token to be read comes directly from whatever is next read from the stream.
1413 It makes sense to call this function after encountering an error in a
1414 command entered on the console, because usually the user would prefer not to
1415 have cascading errors. */
1417 lex_interactive_reset (struct lexer *lexer)
1419 struct lex_source *src = lex_source__ (lexer);
1420 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1423 src->journal_pos = src->seg_pos = 0;
1424 src->n_newlines = 0;
1425 src->suppress_next_newline = false;
1426 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1428 lex_stage_clear (&src->pp);
1429 lex_stage_clear (&src->merge);
1430 lex_source_clear_parse (src);
1431 lex_source_push_endcmd__ (src);
1435 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1437 lex_discard_rest_of_command (struct lexer *lexer)
1439 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1443 /* Discards all lookahead tokens in LEXER, then discards all input sources
1444 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1445 runs out of input sources. */
1447 lex_discard_noninteractive (struct lexer *lexer)
1449 struct lex_source *src = lex_source__ (lexer);
1453 lex_stage_clear (&src->pp);
1454 lex_stage_clear (&src->merge);
1455 lex_source_clear_parse (src);
1457 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1458 src = lex_source__ (lexer))
1459 lex_source_destroy (src);
1464 lex_source_expand__ (struct lex_source *src)
1466 if (src->length >= src->allocated)
1467 src->buffer = x2realloc (src->buffer, &src->allocated);
1471 lex_source_read__ (struct lex_source *src)
1475 lex_source_expand__ (src);
1477 size_t space = src->allocated - src->length;
1478 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1479 size_t n = src->reader->class->read (src->reader,
1480 &src->buffer[src->length],
1482 assert (n <= space);
1487 src->reader->eof = true;
1488 lex_source_expand__ (src);
1494 while (!memchr (&src->buffer[src->seg_pos], '\n',
1495 src->length - src->seg_pos));
1498 static struct lex_source *
1499 lex_source__ (const struct lexer *lexer)
1501 return (ll_is_empty (&lexer->sources) ? NULL
1502 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1505 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1506 one, through N1 ahead of the current one, inclusive. (For example, if N0
1507 and N1 are both zero, this requests the syntax for the current token.) The
1508 caller must eventually free the returned string (with free()). The syntax
1509 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1510 for example, it may include comments, spaces, and new-lines if it spans
1511 multiple tokens. Macro expansion, however, has already been performed. */
1513 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1515 struct string s = DS_EMPTY_INITIALIZER;
1516 for (size_t i = n0; i <= n1; )
1518 /* Find [I,J) as the longest sequence of tokens not produced by macro
1519 expansion, or otherwise the longest sequence expanded from a single
1521 const struct lex_token *first = lex_source_next__ (src, i);
1523 for (j = i + 1; j <= n1; j++)
1525 const struct lex_token *cur = lex_source_next__ (src, j);
1526 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1527 || first->macro_rep != cur->macro_rep)
1530 const struct lex_token *last = lex_source_next__ (src, j - 1);
1532 /* Now add the syntax for this sequence of tokens to SRC. */
1533 if (!ds_is_empty (&s))
1534 ds_put_byte (&s, ' ');
1535 if (!first->macro_rep)
1537 size_t start = first->token_pos;
1538 size_t end = last->token_pos + last->token_len;
1539 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1543 size_t start = first->ofs;
1544 size_t end = last->ofs + last->len;
1545 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1551 return ds_steal_cstr (&s);
1555 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1557 for (size_t i = n0; i <= n1; i++)
1558 if (lex_source_next__ (src, i)->macro_rep)
1563 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1564 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1565 other tokens included in that range. The syntax is encoded in UTF-8 and in
1566 the original form supplied to the lexer so that, for example, it may include
1567 comments, spaces, and new-lines if it spans multiple tokens.
1569 Returns an empty string if the token range doesn't include a macro call.
1571 The caller must not modify or free the returned string. */
1572 static struct substring
1573 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1575 if (!lex_source_contains_macro_call (src, n0, n1))
1578 const struct lex_token *token0 = lex_source_next__ (src, n0);
1579 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1580 size_t start = token0->token_pos;
1581 size_t end = token1->token_pos + token1->token_len;
1583 return ss_buffer (&src->buffer[start], end - start);
1587 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1588 const char *format, va_list args)
1590 const struct lex_token *token;
1595 token = lex_source_next__ (src, n0);
1596 if (token->token.type == T_ENDCMD)
1597 ds_put_cstr (&s, _("Syntax error at end of command"));
1600 /* Get the syntax that caused the error. */
1601 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1603 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1606 /* Get the macro call(s) that expanded to the syntax that caused the
1609 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1616 _("Syntax error at `%s' (in expansion of `%s')"),
1619 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1624 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1627 ds_put_cstr (&s, _("Syntax error"));
1633 ds_put_cstr (&s, ": ");
1634 ds_put_vformat (&s, format, args);
1636 if (ds_last (&s) != '.')
1637 ds_put_byte (&s, '.');
1639 struct msg *m = xmalloc (sizeof *m);
1641 .category = MSG_C_SYNTAX,
1642 .severity = MSG_S_ERROR,
1643 .location = lex_source_get_location (src, n0, n1),
1644 .text = ds_steal_cstr (&s),
1650 lex_get_error (struct lex_source *src, const struct lex_token *token)
1653 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1654 syntax, sizeof syntax);
1656 struct string s = DS_EMPTY_INITIALIZER;
1657 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1658 ds_put_format (&s, ": %s", token->token.string.string);
1660 struct msg *m = xmalloc (sizeof *m);
1662 .category = MSG_C_SYNTAX,
1663 .severity = MSG_S_ERROR,
1664 .location = lex_token_location_rw (src, token, token),
1665 .text = ds_steal_cstr (&s),
1670 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1671 underlying lex_reader if necessary. Returns true if a new token was added
1672 to SRC's deque, false otherwise. The caller should retry failures unless
1673 SRC's 'eof' marker was set to true indicating that there will be no more
1674 tokens from this source. */
1676 lex_source_try_get_pp (struct lex_source *src)
1678 /* Append a new token to SRC and initialize it. */
1679 struct lex_token *token = xmalloc (sizeof *token);
1680 token->token = (struct token) { .type = T_STOP };
1681 token->macro_rep = NULL;
1682 token->ref_cnt = NULL;
1683 token->token_pos = src->seg_pos;
1684 if (src->reader->line_number > 0)
1685 token->first_line = src->reader->line_number + src->n_newlines;
1687 token->first_line = 0;
1689 /* Extract a segment. */
1690 const char *segment;
1691 enum segment_type seg_type;
1695 segment = &src->buffer[src->seg_pos];
1696 seg_len = segmenter_push (&src->segmenter, segment,
1697 src->length - src->seg_pos,
1698 src->reader->eof, &seg_type);
1702 /* The segmenter needs more input to produce a segment. */
1703 assert (!src->reader->eof);
1704 lex_source_read__ (src);
1707 /* Update state based on the segment. */
1708 token->token_len = seg_len;
1709 src->seg_pos += seg_len;
1710 if (seg_type == SEG_NEWLINE)
1713 /* Get a token from the segment. */
1714 enum tokenize_result result = token_from_segment (
1715 seg_type, ss_buffer (segment, seg_len), &token->token);
1717 /* If we've reached the end of a line, or the end of a command, then pass
1718 the line to the output engine as a syntax text item. */
1719 int n_lines = seg_type == SEG_NEWLINE;
1720 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1723 src->suppress_next_newline = true;
1725 else if (n_lines > 0 && src->suppress_next_newline)
1728 src->suppress_next_newline = false;
1730 for (int i = 0; i < n_lines; i++)
1732 /* Beginning of line. */
1733 const char *line = &src->buffer[src->journal_pos];
1735 /* Calculate line length, including \n or \r\n end-of-line if present.
1737 We use src->head even though that may be beyond what we've actually
1738 converted to tokens (which is only through line_pos). That's because,
1739 if we're emitting the line due to SEG_END_COMMAND, we want to take the
1740 whole line through the newline, not just through the '.'. */
1741 size_t max_len = src->length - src->journal_pos;
1742 const char *newline = memchr (line, '\n', max_len);
1743 size_t line_len = newline ? newline - line + 1 : max_len;
1745 /* Calculate line length excluding end-of-line. */
1746 size_t copy_len = line_len;
1747 if (copy_len > 0 && line[copy_len - 1] == '\n')
1749 if (copy_len > 0 && line[copy_len - 1] == '\r')
1752 /* Submit the line as syntax. */
1753 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1754 xmemdup0 (line, copy_len),
1757 src->journal_pos += line_len;
1762 case TOKENIZE_ERROR:
1763 lex_get_error (src, token);
1765 case TOKENIZE_EMPTY:
1766 lex_token_destroy (token);
1769 case TOKENIZE_TOKEN:
1770 if (token->token.type == T_STOP)
1772 token->token.type = T_ENDCMD;
1775 lex_stage_push_last (&src->pp, token);
1781 /* Attempts to append a new token to SRC. Returns true if successful, false on
1782 failure. On failure, the end of SRC has been reached and no more tokens
1783 will be forthcoming from it.
1785 Does not make the new token available for lookahead yet; the caller must
1786 adjust SRC's 'middle' pointer to do so. */
1788 lex_source_get_pp (struct lex_source *src)
1791 if (lex_source_try_get_pp (src))
1797 lex_source_try_get_merge (const struct lex_source *src_)
1799 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1801 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1804 if (!settings_get_mexpand ())
1806 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1810 /* Now pass tokens one-by-one to the macro expander.
1812 In the common case where there is no macro to expand, the loop is not
1814 struct macro_call *mc;
1815 int n_call = macro_call_create (src->lexer->macros,
1816 &lex_stage_first (&src->pp)->token, &mc);
1817 for (int ofs = 1; !n_call; ofs++)
1819 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1821 /* This should not be reachable because we always get a T_ENDCMD at
1822 the end of an input file (transformed from T_STOP by
1823 lex_source_try_get_pp()) and the macro_expander should always
1824 terminate expansion on T_ENDCMD. */
1828 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1829 size_t start = t->token_pos;
1830 size_t end = t->token_pos + t->token_len;
1831 const struct macro_token mt = {
1833 .syntax = ss_buffer (&src->buffer[start], end - start),
1835 const struct msg_location loc = lex_token_location (src, t, t);
1836 n_call = macro_call_add (mc, &mt, &loc);
1840 /* False alarm: no macro expansion after all. Use first token as
1841 lookahead. We'll retry macro expansion from the second token next
1843 macro_call_destroy (mc);
1844 lex_stage_shift (&src->merge, &src->pp, 1);
1848 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1849 are a macro call. (These are likely to be the only tokens in 'pp'.)
1851 const struct lex_token *c0 = lex_stage_first (&src->pp);
1852 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1853 struct macro_tokens expansion = { .n = 0 };
1854 struct msg_location loc = lex_token_location (src, c0, c1);
1855 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1856 macro_call_destroy (mc);
1858 /* Convert the macro expansion into syntax for possible error messages
1860 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1861 size_t *len = xnmalloc (expansion.n, sizeof *len);
1862 struct string s = DS_EMPTY_INITIALIZER;
1863 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1865 if (settings_get_mprint ())
1866 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1867 _("Macro Expansion")));
1869 /* Append the macro expansion tokens to the lookahead. */
1870 if (expansion.n > 0)
1872 char *macro_rep = ds_steal_cstr (&s);
1873 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1874 *ref_cnt = expansion.n;
1875 for (size_t i = 0; i < expansion.n; i++)
1877 struct lex_token *token = xmalloc (sizeof *token);
1878 *token = (struct lex_token) {
1879 .token = expansion.mts[i].token,
1880 .token_pos = c0->token_pos,
1881 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1882 .first_line = c0->first_line,
1883 .macro_rep = macro_rep,
1888 lex_stage_push_last (&src->merge, token);
1890 ss_dealloc (&expansion.mts[i].syntax);
1895 free (expansion.mts);
1899 /* Destroy the tokens for the call. */
1900 for (size_t i = 0; i < n_call; i++)
1901 lex_stage_pop_first (&src->pp);
1903 return expansion.n > 0;
1906 /* Attempts to obtain at least one new token into 'merge' in SRC.
1908 Returns true if successful, false on failure. In the latter case, SRC is
1909 exhausted and 'src->eof' is now true. */
1911 lex_source_get_merge (struct lex_source *src)
1914 if (lex_source_try_get_merge (src))
1919 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1921 Returns true if successful, false on failure. In the latter case, SRC is
1922 exhausted and 'src->eof' is now true. */
1924 lex_source_get_parse (struct lex_source *src)
1926 struct merger m = MERGER_INIT;
1928 for (size_t i = 0; ; i++)
1930 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1932 /* We always get a T_ENDCMD at the end of an input file
1933 (transformed from T_STOP by lex_source_try_get_pp()) and
1934 merger_add() should never return -1 on T_ENDCMD. */
1935 assert (lex_stage_is_empty (&src->merge));
1939 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1943 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1946 else if (retval > 0)
1948 /* Add a token that merges all the tokens together. */
1949 const struct lex_token *first = lex_stage_first (&src->merge);
1950 const struct lex_token *last = lex_stage_nth (&src->merge,
1952 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1953 struct lex_token *t = xmalloc (sizeof *t);
1954 *t = (struct lex_token) {
1956 .token_pos = first->token_pos,
1957 .token_len = (last->token_pos - first->token_pos) + last->token_len,
1958 .first_line = first->first_line,
1960 /* This works well if all the tokens were not expanded from macros,
1961 or if they came from the same macro expansion. It just gives up
1962 in the other (corner) cases. */
1963 .macro_rep = macro ? first->macro_rep : NULL,
1964 .ofs = macro ? first->ofs : 0,
1965 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1966 .ref_cnt = macro ? first->ref_cnt : NULL,
1970 lex_source_push_parse (src, t);
1972 for (int i = 0; i < retval; i++)
1973 lex_stage_pop_first (&src->merge);
1980 lex_source_push_endcmd__ (struct lex_source *src)
1982 assert (src->n_parse == 0);
1984 struct lex_token *token = xmalloc (sizeof *token);
1985 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1986 lex_source_push_parse (src, token);
1990 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
1992 if (src->n_parse >= src->allocated_parse)
1993 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
1994 sizeof *src->parse);
1995 src->parse[src->n_parse++] = token;
1999 lex_source_clear_parse (struct lex_source *src)
2001 for (size_t i = 0; i < src->n_parse; i++)
2002 lex_token_destroy (src->parse[i]);
2003 src->n_parse = src->parse_ofs = 0;
2006 static struct lex_source *
2007 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2009 struct lex_source *src = xmalloc (sizeof *src);
2010 *src = (struct lex_source) {
2012 .segmenter = segmenter_init (reader->syntax, false),
2016 lex_source_push_endcmd__ (src);
2022 lex_source_destroy (struct lex_source *src)
2024 char *file_name = src->reader->file_name;
2025 char *encoding = src->reader->encoding;
2026 if (src->reader->class->destroy != NULL)
2027 src->reader->class->destroy (src->reader);
2031 lex_stage_uninit (&src->pp);
2032 lex_stage_uninit (&src->merge);
2033 lex_source_clear_parse (src);
2035 ll_remove (&src->ll);
2039 struct lex_file_reader
2041 struct lex_reader reader;
2042 struct u8_istream *istream;
2045 static struct lex_reader_class lex_file_reader_class;
2047 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2048 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2049 ENCODING, which should take one of the forms accepted by
2050 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2051 mode of the new reader, respectively.
2053 Returns a null pointer if FILE_NAME cannot be opened. */
2055 lex_reader_for_file (const char *file_name, const char *encoding,
2056 enum segmenter_mode syntax,
2057 enum lex_error_mode error)
2059 struct lex_file_reader *r;
2060 struct u8_istream *istream;
2062 istream = (!strcmp(file_name, "-")
2063 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2064 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2065 if (istream == NULL)
2067 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2071 r = xmalloc (sizeof *r);
2072 lex_reader_init (&r->reader, &lex_file_reader_class);
2073 r->reader.syntax = syntax;
2074 r->reader.error = error;
2075 r->reader.file_name = xstrdup (file_name);
2076 r->reader.encoding = xstrdup_if_nonnull (encoding);
2077 r->reader.line_number = 1;
2078 r->istream = istream;
2083 static struct lex_file_reader *
2084 lex_file_reader_cast (struct lex_reader *r)
2086 return UP_CAST (r, struct lex_file_reader, reader);
2090 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2091 enum prompt_style prompt_style UNUSED)
2093 struct lex_file_reader *r = lex_file_reader_cast (r_);
2094 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2097 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2104 lex_file_close (struct lex_reader *r_)
2106 struct lex_file_reader *r = lex_file_reader_cast (r_);
2108 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2110 if (u8_istream_close (r->istream) != 0)
2111 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2114 u8_istream_free (r->istream);
2119 static struct lex_reader_class lex_file_reader_class =
2125 struct lex_string_reader
2127 struct lex_reader reader;
2132 static struct lex_reader_class lex_string_reader_class;
2134 /* Creates and returns a new lex_reader for the contents of S, which must be
2135 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2136 with ss_dealloc() when it is closed. */
2138 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2140 struct lex_string_reader *r;
2142 r = xmalloc (sizeof *r);
2143 lex_reader_init (&r->reader, &lex_string_reader_class);
2144 r->reader.syntax = SEG_MODE_AUTO;
2145 r->reader.encoding = xstrdup_if_nonnull (encoding);
2152 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2153 which must be encoded in ENCODING. The caller retains ownership of S. */
2155 lex_reader_for_string (const char *s, const char *encoding)
2157 struct substring ss;
2158 ss_alloc_substring (&ss, ss_cstr (s));
2159 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string, with the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   result, which must be encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  struct lex_reader *reader
    = lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
  va_end (args);

  return reader;
}
2177 static struct lex_string_reader *
2178 lex_string_reader_cast (struct lex_reader *r)
2180 return UP_CAST (r, struct lex_string_reader, reader);
2184 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2185 enum prompt_style prompt_style UNUSED)
2187 struct lex_string_reader *r = lex_string_reader_cast (r_);
2190 chunk = MIN (n, r->s.length - r->offset);
2191 memcpy (buf, r->s.string + r->offset, chunk);
2198 lex_string_close (struct lex_reader *r_)
2200 struct lex_string_reader *r = lex_string_reader_cast (r_);
2206 static struct lex_reader_class lex_string_reader_class =