1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/intern.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
70 size_t token_pos; /* Offset into src->buffer of token start. */
71 size_t token_len; /* Length of source for token in bytes. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
84 lex_token_destroy (struct lex_token *t)
86 token_uninit (&t->token);
89 assert (*t->ref_cnt > 0);
99 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
104 struct lex_token **tokens;
107 static void lex_stage_clear (struct lex_stage *);
108 static void lex_stage_uninit (struct lex_stage *);
110 static size_t lex_stage_count (const struct lex_stage *);
111 static bool lex_stage_is_empty (const struct lex_stage *);
113 static struct lex_token *lex_stage_first (struct lex_stage *);
114 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
116 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
117 static void lex_stage_pop_first (struct lex_stage *);
119 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
/* Empties STAGE, destroying its tokens one at a time starting from the first.
   The array that backs the deque is kept so that STAGE can be refilled. */
static void
lex_stage_clear (struct lex_stage *stage)
{
  while (!lex_stage_is_empty (stage))
    lex_stage_pop_first (stage);
}
130 /* Deletes all the tokens from STAGE and frees storage for the deque. */
132 lex_stage_uninit (struct lex_stage *stage)
134 lex_stage_clear (stage);
135 free (stage->tokens);
138 /* Returns true if STAGE contains no tokens, otherwise false. */
140 lex_stage_is_empty (const struct lex_stage *stage)
142 return deque_is_empty (&stage->deque);
145 /* Returns the number of tokens in STAGE. */
147 lex_stage_count (const struct lex_stage *stage)
149 return deque_count (&stage->deque);
/* Returns the token at the head of STAGE, that is, the one reached with the
   least lookahead.  STAGE must not be empty. */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  const size_t least_lookahead = 0;
  return lex_stage_nth (stage, least_lookahead);
}
160 /* Returns the token the given INDEX in STAGE. The first token (with the least
161 lookahead) is 0, the second token is 1, and so on. There must be at least
162 INDEX + 1 tokens in STAGE. */
163 static struct lex_token *
164 lex_stage_nth (struct lex_stage *stage, size_t index)
166 return stage->tokens[deque_back (&stage->deque, index)];
169 /* Adds TOKEN so that it becomes the last token in STAGE. */
171 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
173 if (deque_is_full (&stage->deque))
174 stage->tokens = deque_expand (&stage->deque, stage->tokens,
175 sizeof *stage->tokens);
176 stage->tokens[deque_push_front (&stage->deque)] = token;
179 /* Removes and returns the first token from STAGE. */
180 static struct lex_token *
181 lex_stage_take_first (struct lex_stage *stage)
183 return stage->tokens[deque_pop_back (&stage->deque)];
/* Detaches the first token from STAGE and destroys it. */
static void
lex_stage_pop_first (struct lex_stage *stage)
{
  struct lex_token *first = lex_stage_take_first (stage);
  lex_token_destroy (first);
}
/* Moves the first N tokens out of SRC and appends them, in their original
   order, to the end of DST. */
static void
lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
{
  for (size_t remaining = n; remaining > 0; remaining--)
    {
      struct lex_token *moved = lex_stage_take_first (src);
      lex_stage_push_last (dst, moved);
    }
}
202 /* A source of tokens, corresponding to a syntax file.
204 This is conceptually a lex_reader wrapped with everything needed to convert
205 its UTF-8 bytes into tokens. */
208 struct ll ll; /* In lexer's list of sources. */
209 struct lex_reader *reader;
211 struct segmenter segmenter;
212 bool eof; /* True if T_STOP was read from 'reader'. */
214 /* Buffer of UTF-8 bytes. */
215 char *buffer; /* Source file contents. */
216 size_t length; /* Number of bytes filled. */
217 size_t allocated; /* Number of bytes allocated. */
219 /* Offsets into 'buffer'. */
220 size_t journal_pos; /* First byte not yet output to journal. */
221 size_t seg_pos; /* First byte not yet scanned as token. */
223 /* Offset into 'buffer' of starts of lines. */
225 size_t n_lines, allocated_lines;
227 bool suppress_next_newline;
231 This is a pipeline with the following stages. Each token eventually
232 made available to the parser passes through each of these stages. The stages
233 are named after the processing that happens in each one.
235 Initially, tokens come from the segmenter and scanner to 'pp':
237 - pp: Tokens that need to pass through the macro preprocessor to end up
240 - merge: Tokens that need to pass through scan_merge() to end up in
243 - parse: Tokens available to the client for parsing.
245 'pp' and 'merge' store tokens only temporarily until they pass into
246 'parse'. Tokens then live in 'parse' until the command is fully
247 consumed, at which time they are freed together. */
249 struct lex_stage merge;
250 struct lex_token **parse;
251 size_t n_parse, allocated_parse, parse_ofs;
254 static struct lex_source *lex_source_create (struct lexer *,
255 struct lex_reader *);
256 static void lex_source_destroy (struct lex_source *);
261 struct ll_list sources; /* Contains "struct lex_source"s. */
262 struct macro_set *macros;
265 static struct lex_source *lex_source__ (const struct lexer *);
266 static char *lex_source_get_syntax__ (const struct lex_source *,
268 static const struct lex_token *lex_next__ (const struct lexer *, int n);
269 static void lex_source_push_endcmd__ (struct lex_source *);
270 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
271 static void lex_source_clear_parse (struct lex_source *);
273 static bool lex_source_get_parse (struct lex_source *);
274 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
275 const char *format, va_list)
276 PRINTF_FORMAT (4, 0);
277 static const struct lex_token *lex_source_next__ (const struct lex_source *,
280 /* Initializes READER with the specified CLASS and otherwise some reasonable
281 defaults. The caller should fill in the other members as desired. */
283 lex_reader_init (struct lex_reader *reader,
284 const struct lex_reader_class *class)
286 reader->class = class;
287 reader->syntax = SEG_MODE_AUTO;
288 reader->error = LEX_ERROR_CONTINUE;
289 reader->file_name = NULL;
290 reader->encoding = NULL;
291 reader->line_number = 0;
295 /* Frees any file name already in READER and replaces it by a copy of
296 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
298 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
300 free (reader->file_name);
301 reader->file_name = xstrdup_if_nonnull (file_name);
304 /* Creates and returns a new lexer. */
308 struct lexer *lexer = xmalloc (sizeof *lexer);
309 *lexer = (struct lexer) {
310 .sources = LL_INITIALIZER (lexer->sources),
311 .macros = macro_set_create (),
316 /* Destroys LEXER. */
318 lex_destroy (struct lexer *lexer)
322 struct lex_source *source, *next;
324 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
325 lex_source_destroy (source);
326 macro_set_destroy (lexer->macros);
331 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
332 same name. Takes ownership of M. */
334 lex_define_macro (struct lexer *lexer, struct macro *m)
336 macro_set_add (lexer->macros, m);
339 /* Inserts READER into LEXER so that the next token read by LEXER comes from
340 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
343 lex_include (struct lexer *lexer, struct lex_reader *reader)
345 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
346 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
349 /* Appends READER to LEXER, so that it will be read after all other current
350 readers have already been read. */
352 lex_append (struct lexer *lexer, struct lex_reader *reader)
354 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
359 /* Advances LEXER to the next token, consuming the current token. */
361 lex_get (struct lexer *lexer)
363 struct lex_source *src;
365 src = lex_source__ (lexer);
369 if (src->parse_ofs < src->n_parse)
371 if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
372 lex_source_clear_parse (src);
377 while (src->parse_ofs == src->n_parse)
378 if (!lex_source_get_parse (src))
380 lex_source_destroy (src);
381 src = lex_source__ (lexer);
387 /* Advances LEXER by N tokens. */
389 lex_get_n (struct lexer *lexer, size_t n)
395 /* Issuing errors. */
397 /* Prints a syntax error message containing the current token and
398 the message formatted from FORMAT and its arguments (if FORMAT is non-null). */
400 lex_error (struct lexer *lexer, const char *format, ...)
404 va_start (args, format);
405 lex_next_error_valist (lexer, 0, 0, format, args);
409 /* Prints a syntax error message containing the current token and
410 the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
412 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
414 lex_next_error_valist (lexer, 0, 0, format, args);
417 /* Prints a syntax error message containing the current token and
418 the message formatted from FORMAT and its arguments (if FORMAT is non-null). */
420 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
424 va_start (args, format);
425 lex_next_error_valist (lexer, n0, n1, format, args);
429 /* Prints a syntax error message saying that one of the strings provided as
430 varargs, up to the first NULL, is expected. */
432 (lex_error_expecting) (struct lexer *lexer, ...)
436 va_start (args, lexer);
437 lex_error_expecting_valist (lexer, args);
441 /* Prints a syntax error message saying that one of the options provided in
442 ARGS, up to the first NULL, is expected. */
444 lex_error_expecting_valist (struct lexer *lexer, va_list args)
446 enum { MAX_OPTIONS = 9 };
447 const char *options[MAX_OPTIONS];
449 while (n < MAX_OPTIONS)
451 const char *option = va_arg (args, const char *);
455 options[n++] = option;
457 lex_error_expecting_array (lexer, options, n);
461 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
466 lex_error (lexer, NULL);
470 lex_error (lexer, _("expecting %s"), options[0]);
474 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
478 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
483 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
484 options[0], options[1], options[2], options[3]);
488 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
489 options[0], options[1], options[2], options[3], options[4]);
493 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
494 options[0], options[1], options[2], options[3], options[4],
499 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
500 options[0], options[1], options[2], options[3], options[4],
501 options[5], options[6]);
505 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
506 options[0], options[1], options[2], options[3], options[4],
507 options[5], options[6], options[7]);
511 lex_error (lexer, NULL);
515 /* Reports an error to the effect that subcommand SBC may only be specified
518 This function does not take a lexer as an argument or use lex_error(),
519 because the result would ordinarily just be redundant: "Syntax error at
520 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
521 not help the user find the error. */
523 lex_sbc_only_once (const char *sbc)
525 msg (SE, _("Subcommand %s may only be specified once."), sbc);
528 /* Reports an error to the effect that subcommand SBC is missing.
530 This function does not take a lexer as an argument or use lex_error(),
531 because a missing subcommand can normally be detected only after the whole
532 command has been parsed, and so lex_error() would always report "Syntax
533 error at end of command", which does not help the user find the error. */
535 lex_sbc_missing (const char *sbc)
537 msg (SE, _("Required subcommand %s was not specified."), sbc);
540 /* Reports an error to the effect that specification SPEC may only be specified
541 once within subcommand SBC. */
543 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
545 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
549 /* Reports an error to the effect that specification SPEC is missing within
552 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
554 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
558 /* Prints a syntax error message containing the current token and
559 the message formatted from FORMAT and ARGS (if FORMAT is non-null). */
561 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
562 const char *format, va_list args)
564 struct lex_source *src = lex_source__ (lexer);
567 lex_source_error_valist (src, n0, n1, format, args);
573 ds_put_format (&s, _("Syntax error at end of input"));
576 ds_put_cstr (&s, ": ");
577 ds_put_vformat (&s, format, args);
579 if (ds_last (&s) != '.')
580 ds_put_byte (&s, '.');
581 msg (SE, "%s", ds_cstr (&s));
586 /* Checks that we're at end of command.
587 If so, returns a successful command completion code.
588 If not, flags a syntax error and returns an error command
591 lex_end_of_command (struct lexer *lexer)
593 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
595 lex_error (lexer, _("expecting end of command"));
602 /* Token testing functions. */
604 /* Returns true if the current token is a number. */
606 lex_is_number (const struct lexer *lexer)
608 return lex_next_is_number (lexer, 0);
611 /* Returns true if the current token is a string. */
613 lex_is_string (const struct lexer *lexer)
615 return lex_next_is_string (lexer, 0);
618 /* Returns the value of the current token, which must be a
619 floating point number. */
621 lex_number (const struct lexer *lexer)
623 return lex_next_number (lexer, 0);
626 /* Returns true iff the current token is an integer. */
628 lex_is_integer (const struct lexer *lexer)
630 return lex_next_is_integer (lexer, 0);
633 /* Returns the value of the current token, which must be an
636 lex_integer (const struct lexer *lexer)
638 return lex_next_integer (lexer, 0);
641 /* Token testing functions with lookahead.
643 A value of 0 for N as an argument to any of these functions refers to the
644 current token. Lookahead is limited to the current command. Any N greater
645 than the number of tokens remaining in the current command will be treated
646 as referring to a T_ENDCMD token. */
648 /* Returns true if the token N ahead of the current token is a number. */
650 lex_next_is_number (const struct lexer *lexer, int n)
652 return token_is_number (lex_next (lexer, n));
655 /* Returns true if the token N ahead of the current token is a string. */
657 lex_next_is_string (const struct lexer *lexer, int n)
659 return token_is_string (lex_next (lexer, n));
662 /* Returns the value of the token N ahead of the current token, which must be a
663 floating point number. */
665 lex_next_number (const struct lexer *lexer, int n)
667 return token_number (lex_next (lexer, n));
670 /* Returns true if the token N ahead of the current token is an integer. */
672 lex_next_is_integer (const struct lexer *lexer, int n)
674 return token_is_integer (lex_next (lexer, n));
677 /* Returns the value of the token N ahead of the current token, which must be
680 lex_next_integer (const struct lexer *lexer, int n)
682 return token_integer (lex_next (lexer, n));
685 /* Token matching functions. */
687 /* If the current token has the specified TYPE, skips it and returns true.
688 Otherwise, returns false. */
690 lex_match (struct lexer *lexer, enum token_type type)
692 if (lex_token (lexer) == type)
701 /* If the current token matches IDENTIFIER, skips it and returns true.
702 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
705 IDENTIFIER must be an ASCII string. */
707 lex_match_id (struct lexer *lexer, const char *identifier)
709 return lex_match_id_n (lexer, identifier, 3);
712 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
713 may be abbreviated to its first N letters. Otherwise, returns false.
715 IDENTIFIER must be an ASCII string. */
717 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
719 if (lex_token (lexer) == T_ID
720 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
729 /* If the current token is integer X, skips it and returns true. Otherwise,
732 lex_match_int (struct lexer *lexer, int x)
734 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
743 /* Forced matches. */
745 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
746 abbreviated to its first 3 letters. Otherwise, reports an error and returns
749 IDENTIFIER must be an ASCII string. */
751 lex_force_match_id (struct lexer *lexer, const char *identifier)
753 if (lex_match_id (lexer, identifier))
757 lex_error_expecting (lexer, identifier);
762 /* If the current token has the specified TYPE, skips it and returns true.
763 Otherwise, reports an error and returns false. */
765 lex_force_match (struct lexer *lexer, enum token_type type)
767 if (lex_token (lexer) == type)
774 const char *type_string = token_type_to_string (type);
777 char *s = xasprintf ("`%s'", type_string);
778 lex_error_expecting (lexer, s);
782 lex_error_expecting (lexer, token_type_to_name (type));
788 /* If the current token is a string, does nothing and returns true.
789 Otherwise, reports an error and returns false. */
791 lex_force_string (struct lexer *lexer)
793 if (lex_is_string (lexer))
797 lex_error (lexer, _("expecting string"));
802 /* If the current token is a string or an identifier, does nothing and returns
803 true. Otherwise, reports an error and returns false.
805 This is meant for use in syntactic situations where we want to encourage the
806 user to supply a quoted string, but for compatibility we also accept
807 identifiers. (One example of such a situation is file names.) Therefore,
808 the error message issued when the current token is wrong only says that a
809 string is expected and doesn't mention that an identifier would also be
812 lex_force_string_or_id (struct lexer *lexer)
814 return lex_token (lexer) == T_ID || lex_force_string (lexer);
817 /* If the current token is an integer, does nothing and returns true.
818 Otherwise, reports an error and returns false. */
820 lex_force_int (struct lexer *lexer)
822 if (lex_is_integer (lexer))
826 lex_error (lexer, _("expecting integer"));
831 /* If the current token is an integer in the range MIN...MAX (inclusive), does
832 nothing and returns true. Otherwise, reports an error and returns false.
833 If NAME is nonnull, then it is used in the error message. */
835 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
837 bool is_integer = lex_is_integer (lexer);
838 bool too_small = is_integer && lex_integer (lexer) < min;
839 bool too_big = is_integer && lex_integer (lexer) > max;
840 if (is_integer && !too_small && !too_big)
845 /* Weird, maybe a bug in the caller. Just report that we needed an
848 lex_error (lexer, _("Integer expected for %s."), name);
850 lex_error (lexer, _("Integer expected."));
855 lex_error (lexer, _("Expected %ld for %s."), min, name);
857 lex_error (lexer, _("Expected %ld."), min);
859 else if (min + 1 == max)
862 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
864 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
868 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
869 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
871 if (report_lower_bound && report_upper_bound)
875 _("Expected integer between %ld and %ld for %s."),
878 lex_error (lexer, _("Expected integer between %ld and %ld."),
881 else if (report_lower_bound)
886 lex_error (lexer, _("Expected non-negative integer for %s."),
889 lex_error (lexer, _("Expected non-negative integer."));
894 lex_error (lexer, _("Expected positive integer for %s."),
897 lex_error (lexer, _("Expected positive integer."));
900 else if (report_upper_bound)
904 _("Expected integer less than or equal to %ld for %s."),
907 lex_error (lexer, _("Expected integer less than or equal to %ld."),
913 lex_error (lexer, _("Integer expected for %s."), name);
915 lex_error (lexer, _("Integer expected."));
921 /* If the current token is a number, does nothing and returns true.
922 Otherwise, reports an error and returns false. */
924 lex_force_num (struct lexer *lexer)
926 if (lex_is_number (lexer))
929 lex_error (lexer, _("expecting number"));
933 /* If the current token is an identifier, does nothing and returns true.
934 Otherwise, reports an error and returns false. */
936 lex_force_id (struct lexer *lexer)
938 if (lex_token (lexer) == T_ID)
941 lex_error (lexer, _("expecting identifier"));
945 /* Token accessors. */
947 /* Returns the type of LEXER's current token. */
949 lex_token (const struct lexer *lexer)
951 return lex_next_token (lexer, 0);
954 /* Returns the number in LEXER's current token.
956 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
957 tokens this function will always return zero. */
959 lex_tokval (const struct lexer *lexer)
961 return lex_next_tokval (lexer, 0);
964 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
966 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
967 this function will always return NULL.
969 The UTF-8 encoding of the returned string is correct for variable names and
970 other identifiers. Use filename_to_utf8() to use it as a filename. Use
971 data_in() to use it in a "union value". */
973 lex_tokcstr (const struct lexer *lexer)
975 return lex_next_tokcstr (lexer, 0);
978 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
979 null-terminated (but the null terminator is not included in the returned
980 substring's 'length').
982 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
983 this function will always return NULL.
985 The UTF-8 encoding of the returned string is correct for variable names and
986 other identifiers. Use filename_to_utf8() to use it as a filename. Use
987 data_in() to use it in a "union value". */
989 lex_tokss (const struct lexer *lexer)
991 return lex_next_tokss (lexer, 0);
996 A value of 0 for N as an argument to any of these functions refers to the
997 current token. Lookahead is limited to the current command. Any N greater
998 than the number of tokens remaining in the current command will be treated
999 as referring to a T_ENDCMD token. */
1001 static const struct lex_token *
1002 lex_next__ (const struct lexer *lexer_, int n)
1004 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1005 struct lex_source *src = lex_source__ (lexer);
1008 return lex_source_next__ (src, n);
1011 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1016 static const struct lex_token *
1017 lex_source_next__ (const struct lex_source *src_, int n)
1019 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1023 if (-n <= src->parse_ofs)
1024 return src->parse[src->parse_ofs - (-n)];
1027 static const struct lex_token endcmd_token
1028 = { .token = { .type = T_ENDCMD } };
1029 return &endcmd_token;
1033 while (src->n_parse - src->parse_ofs <= n)
1035 if (src->n_parse > 0)
1037 const struct lex_token *t = src->parse[src->n_parse - 1];
1038 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1042 lex_source_get_parse (src);
1045 return src->parse[src->parse_ofs + n];
1048 /* Returns the "struct token" of the token N after the current one in LEXER.
1049 The returned pointer can be invalidated by pretty much any succeeding call
1050 into the lexer, although the string pointer within the returned token is
1051 only invalidated by consuming the token (e.g. with lex_get()). */
1052 const struct token *
1053 lex_next (const struct lexer *lexer, int n)
1055 return &lex_next__ (lexer, n)->token;
1058 /* Returns the type of the token N after the current one in LEXER. */
1060 lex_next_token (const struct lexer *lexer, int n)
1062 return lex_next (lexer, n)->type;
1065 /* Returns the number in the token N after the current one in LEXER.
1067 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
1068 tokens this function will always return zero. */
1070 lex_next_tokval (const struct lexer *lexer, int n)
1072 return token_number (lex_next (lexer, n));
1075 /* Returns the null-terminated string in the token N after the current one, in
1078 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1079 this function will always return NULL.
1081 The UTF-8 encoding of the returned string is correct for variable names and
1082 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1083 data_in() to use it in a "union value". */
1085 lex_next_tokcstr (const struct lexer *lexer, int n)
1087 return lex_next_tokss (lexer, n).string;
1090 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1091 The string is null-terminated (but the null terminator is not included in
1092 the returned substring's 'length').
1094 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1095 tokens this function will always return NULL.
1097 The UTF-8 encoding of the returned string is correct for variable names and
1098 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1099 data_in() to use it in a "union value". */
1101 lex_next_tokss (const struct lexer *lexer, int n)
1103 return lex_next (lexer, n)->string;
1107 lex_ofs (const struct lexer *lexer)
1109 struct lex_source *src = lex_source__ (lexer);
1110 return src ? src->parse_ofs : 0;
1113 const struct token *
1114 lex_ofs_token (const struct lexer *lexer_, int ofs)
1116 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1117 struct lex_source *src = lex_source__ (lexer);
1120 return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1123 static const struct token stop_token = { .type = T_STOP };
/* Converts the token offsets OFS0 and OFS1 into positions relative to
   LEXER's current offset, as reported by lex_ofs(), and returns the location
   that lex_get_location() yields for that range of tokens. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  int current = lex_ofs (lexer);
  int n0 = ofs0 - current;
  int n1 = ofs1 - current;

  return lex_get_location (lexer, n0, n1);
}
1135 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1136 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
1137 are both zero, this requests the syntax for the current token.) The caller
1138 must eventually free the returned string (with free()). The syntax is
1139 encoded in UTF-8 and in the original form supplied to the lexer so that, for
1140 example, it may include comments, spaces, and new-lines if it spans multiple
1141 tokens. Macro expansion, however, has already been performed. */
1143 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1145 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1148 /* Returns true if the token N ahead of the current one was produced by macro
1149 expansion, false otherwise. */
1151 lex_next_is_from_macro (const struct lexer *lexer, int n)
1153 return lex_next__ (lexer, n)->macro_rep != NULL;
1157 lex_tokens_match (const struct token *actual, const struct token *expected)
1159 if (actual->type != expected->type)
1162 switch (actual->type)
1166 return actual->number == expected->number;
1169 return lex_id_match (expected->string, actual->string);
1172 return (actual->string.length == expected->string.length
1173 && !memcmp (actual->string.string, expected->string.string,
1174 actual->string.length));
1182 lex_at_phrase__ (struct lexer *lexer, const char *s)
1184 struct string_lexer slex;
1188 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1189 while (string_lexer_next (&slex, &token))
1191 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1192 token_uninit (&token);
1199 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1200 returns true. Otherwise, returns false.
1202 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1203 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1204 first three letters. */
1206 lex_at_phrase (struct lexer *lexer, const char *s)
1208 return lex_at_phrase__ (lexer, s) > 0;
1211 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1212 skips it and returns true. Otherwise, returns false.
1214 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1215 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1216 first three letters. */
1218 lex_match_phrase (struct lexer *lexer, const char *s)
1220 size_t n = lex_at_phrase__ (lexer, s);
1222 lex_get_n (lexer, n);
/* Returns the number of new-line characters among the LENGTH bytes that start
   at S.  S does not have to be null-terminated and is never modified, so it is
   taken as a pointer to const. */
static int
count_newlines (const char *s, size_t length)
{
  int n_newlines = 0;
  const char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      n_newlines++;

      /* Continue the search just past the new-line that was found, shrinking
         LENGTH by the number of bytes skipped. */
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
/* NOTE(review): as visible here, the first two statements below are
   unterminated: the initializer for 'end' has no closing semicolon, and the
   call to lex_source_ofs_to_line_number() is cut off after its first
   argument.  They look like the start of an offset-based lookup left
   unfinished next to the first_line-based computation that follows.  Confirm
   which of the two is intended, then complete it and remove the other.

   The remaining statements test whether TOKEN's first_line is 0 and otherwise
   compute the result as TOKEN's first_line, plus the number of new-lines in
   the token_len bytes of SRC's buffer starting at token_pos, plus 1. */
1243 lex_token_get_last_line_number (const struct lex_source *src,
1244 const struct lex_token *token)
1246 size_t end = token->token_pos + token->token_len
1247 return lex_source_ofs_to_line_number (src,
1248 if (token->first_line == 0)
1252 char *token_str = &src->buffer[token->token_pos];
1253 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1258 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1260 const char *newline = memrchr (src->buffer, '\n', offset);
1261 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1262 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1266 lex_token_get_first_column (const struct lex_source *src,
1267 const struct lex_token *token)
1269 return lex_token_get_column__ (src, token->token_pos);
1273 lex_token_get_last_column (const struct lex_source *src,
1274 const struct lex_token *token)
1276 return lex_token_get_column__ (src, token->token_pos + token->token_len);
1279 static struct msg_location
1280 lex_token_location (const struct lex_source *src,
1281 const struct lex_token *t0,
1282 const struct lex_token *t1)
1284 int first_column = lex_token_get_first_column (src, t0);
1285 int last_line = lex_token_get_last_line_number (src, t1) - 1;
1286 int last_column = lex_token_get_last_column (src, t1) - 1;
1287 return (struct msg_location) {
1288 .file_name = intern_new_if_nonnull (src->reader->file_name),
1289 .p[0] = { .line = t0->first_line, .column = first_column },
1290 .p[1] = { .line = last_line, .column = last_column },
1294 static struct msg_location *
1295 lex_token_location_rw (const struct lex_source *src,
1296 const struct lex_token *t0,
1297 const struct lex_token *t1)
1299 struct msg_location location = lex_token_location (src, t0, t1);
1300 return msg_location_dup (&location);
/* Returns a copy (see lex_token_location_rw()) of the location in SRC that
   spans the tokens N0 through N1 ahead of the current one. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1311 /* Returns the 1-based line number of the start of the syntax that represents
1312 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1313 if the token is drawn from a source that does not have line numbers. */
1315 lex_get_first_line_number (const struct lexer *lexer, int n)
1317 const struct lex_source *src = lex_source__ (lexer);
1318 return src ? lex_source_next__ (src, n)->first_line : 0;
/* Returns one more than the 1-based line number on which the syntax for the
   token N after the current one in LEXER ends.  Returns 0 if LEXER has no
   current source or if the token has no line number.

   Most of the time a token lies wholly within one line of syntax, but there
   are two exceptions: a T_STRING token can be made up of multiple segments on
   adjacent lines connected with "+" punctuators, and a T_NEG_NUM token can
   consist of a "-" on one line followed by the number on the next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_token_get_last_line_number (src, lex_source_next__ (src, n));
}
/* Returns the 1-based column number at which the syntax for the token N after
   the current one in LEXER begins.  Returns 0 if LEXER has no current source.

   Column numbers are measured according to the width of characters as shown
   in a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_token_get_first_column (src, lex_source_next__ (src, n));
}
/* Returns one more than the 1-based column number at which the syntax for the
   token N after the current one in LEXER ends.  Returns 0 if LEXER has no
   current source.

   Column numbers are measured according to the width of characters as shown
   in a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_token_get_last_column (src, lex_source_next__ (src, n));
}
1367 /* Returns the name of the syntax file from which the current command is drawn.
1368 Returns NULL for a T_STOP token or if the command's source does not have
1371 There is no version of this function that takes an N argument because
1372 lookahead only works to the end of a command and any given command is always
1373 within a single syntax file. */
1375 lex_get_file_name (const struct lexer *lexer)
1377 struct lex_source *src = lex_source__ (lexer);
1378 return src == NULL ? NULL : src->reader->file_name;
1381 /* Returns a newly allocated msg_location for the syntax that represents tokens
1382 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1383 must eventually free the location (with msg_location_destroy()). */
1384 struct msg_location *
1385 lex_get_location (const struct lexer *lexer, int n0, int n1)
1387 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1388 loc->p[0].column = lex_get_first_column (lexer, n0);
1389 loc->p[1].column = lex_get_last_column (lexer, n1) - 1;
1393 /* Returns a newly allocated msg_location for the syntax that represents tokens
1394 with 0-based offsets N0...N1, inclusive, from the current token. The
1395 location only covers the tokens' lines, not the columns. The caller must
1396 eventually free the location (with msg_location_destroy()). */
1397 struct msg_location *
1398 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1400 struct msg_location *loc = xmalloc (sizeof *loc);
1401 int first_line = lex_get_first_line_number (lexer, n0);
1402 int last_line = lex_get_last_line_number (lexer, n1) - 1;
1403 *loc = (struct msg_location) {
1404 .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1405 .p[0] = { .line = first_line },
1406 .p[1] = { .line = last_line },
/* Merges the location of the token N after the current one in LEXER into
   *LOC, using msg_location_merge(). */
void
lex_extend_location (const struct lexer *lexer, int n, struct msg_location **loc)
{
  struct msg_location *token_loc = lex_get_location (lexer, n, n);
  msg_location_merge (loc, token_loc);
  /* The temporary location is no longer needed once it has been merged. */
  msg_location_destroy (token_loc);
}
1420 lex_get_encoding (const struct lexer *lexer)
1422 struct lex_source *src = lex_source__ (lexer);
1423 return src == NULL ? NULL : src->reader->encoding;
1426 /* Returns the syntax mode for the syntax file from which the current drawn is
1427 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1428 does not have line numbers.
1430 There is no version of this function that takes an N argument because
1431 lookahead only works to the end of a command and any given command is always
1432 within a single syntax file. */
1434 lex_get_syntax_mode (const struct lexer *lexer)
1436 struct lex_source *src = lex_source__ (lexer);
1437 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1440 /* Returns the error mode for the syntax file from which the current drawn is
1441 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1442 source does not have line numbers.
1444 There is no version of this function that takes an N argument because
1445 lookahead only works to the end of a command and any given command is always
1446 within a single syntax file. */
1448 lex_get_error_mode (const struct lexer *lexer)
1450 struct lex_source *src = lex_source__ (lexer);
1451 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1454 /* If the source that LEXER is currently reading has error mode
1455 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1456 token to be read comes directly from whatever is next read from the stream.
1458 It makes sense to call this function after encountering an error in a
1459 command entered on the console, because usually the user would prefer not to
1460 have cascading errors. */
1462 lex_interactive_reset (struct lexer *lexer)
1464 struct lex_source *src = lex_source__ (lexer);
1465 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1468 src->journal_pos = src->seg_pos = 0;
1469 src->n_newlines = 0;
1470 src->suppress_next_newline = false;
1471 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1473 lex_stage_clear (&src->pp);
1474 lex_stage_clear (&src->merge);
1475 lex_source_clear_parse (src);
1476 lex_source_push_endcmd__ (src);
1480 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1482 lex_discard_rest_of_command (struct lexer *lexer)
1484 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1488 /* Discards all lookahead tokens in LEXER, then discards all input sources
1489 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1490 runs out of input sources. */
1492 lex_discard_noninteractive (struct lexer *lexer)
1494 struct lex_source *src = lex_source__ (lexer);
1498 lex_stage_clear (&src->pp);
1499 lex_stage_clear (&src->merge);
1500 lex_source_clear_parse (src);
1502 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1503 src = lex_source__ (lexer))
1504 lex_source_destroy (src);
1509 lex_source_expand__ (struct lex_source *src)
1511 if (src->length >= src->allocated)
1512 src->buffer = x2realloc (src->buffer, &src->allocated);
1516 lex_source_read__ (struct lex_source *src)
1520 lex_source_expand__ (src);
1522 size_t space = src->allocated - src->length;
1523 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1524 size_t n = src->reader->class->read (src->reader,
1525 &src->buffer[src->length],
1527 assert (n <= space);
1532 src->reader->eof = true;
1533 lex_source_expand__ (src);
1539 while (!memchr (&src->buffer[src->seg_pos], '\n',
1540 src->length - src->seg_pos));
1543 static struct lex_source *
1544 lex_source__ (const struct lexer *lexer)
1546 return (ll_is_empty (&lexer->sources) ? NULL
1547 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1550 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1551 one, through N1 ahead of the current one, inclusive. (For example, if N0
1552 and N1 are both zero, this requests the syntax for the current token.) The
1553 caller must eventually free the returned string (with free()). The syntax
1554 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1555 for example, it may include comments, spaces, and new-lines if it spans
1556 multiple tokens. Macro expansion, however, has already been performed. */
1558 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1560 struct string s = DS_EMPTY_INITIALIZER;
1561 for (size_t i = n0; i <= n1; )
1563 /* Find [I,J) as the longest sequence of tokens not produced by macro
1564 expansion, or otherwise the longest sequence expanded from a single
1566 const struct lex_token *first = lex_source_next__ (src, i);
1568 for (j = i + 1; j <= n1; j++)
1570 const struct lex_token *cur = lex_source_next__ (src, j);
1571 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1572 || first->macro_rep != cur->macro_rep)
1575 const struct lex_token *last = lex_source_next__ (src, j - 1);
1577 /* Now add the syntax for this sequence of tokens to SRC. */
1578 if (!ds_is_empty (&s))
1579 ds_put_byte (&s, ' ');
1580 if (!first->macro_rep)
1582 size_t start = first->token_pos;
1583 size_t end = last->token_pos + last->token_len;
1584 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1588 size_t start = first->ofs;
1589 size_t end = last->ofs + last->len;
1590 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1596 return ds_steal_cstr (&s);
1600 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1602 for (size_t i = n0; i <= n1; i++)
1603 if (lex_source_next__ (src, i)->macro_rep)
1608 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1609 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1610 other tokens included in that range. The syntax is encoded in UTF-8 and in
1611 the original form supplied to the lexer so that, for example, it may include
1612 comments, spaces, and new-lines if it spans multiple tokens.
1614 Returns an empty string if the token range doesn't include a macro call.
1616 The caller must not modify or free the returned string. */
1617 static struct substring
1618 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1620 if (!lex_source_contains_macro_call (src, n0, n1))
1623 const struct lex_token *token0 = lex_source_next__ (src, n0);
1624 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1625 size_t start = token0->token_pos;
1626 size_t end = token1->token_pos + token1->token_len;
1628 return ss_buffer (&src->buffer[start], end - start);
1632 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1633 const char *format, va_list args)
1635 const struct lex_token *token;
1640 token = lex_source_next__ (src, n0);
1641 if (token->token.type == T_ENDCMD)
1642 ds_put_cstr (&s, _("Syntax error at end of command"));
1645 /* Get the syntax that caused the error. */
1646 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1648 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1651 /* Get the macro call(s) that expanded to the syntax that caused the
1654 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1661 _("Syntax error at `%s' (in expansion of `%s')"),
1664 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1669 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1672 ds_put_cstr (&s, _("Syntax error"));
1678 ds_put_cstr (&s, ": ");
1679 ds_put_vformat (&s, format, args);
1681 if (ds_last (&s) != '.')
1682 ds_put_byte (&s, '.');
1684 struct msg *m = xmalloc (sizeof *m);
1686 .category = MSG_C_SYNTAX,
1687 .severity = MSG_S_ERROR,
1688 .location = lex_source_get_location (src, n0, n1),
1689 .text = ds_steal_cstr (&s),
1695 lex_get_error (struct lex_source *src, const struct lex_token *token)
1698 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1699 syntax, sizeof syntax);
1701 struct string s = DS_EMPTY_INITIALIZER;
1702 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1703 ds_put_format (&s, ": %s", token->token.string.string);
1705 struct msg *m = xmalloc (sizeof *m);
1707 .category = MSG_C_SYNTAX,
1708 .severity = MSG_S_ERROR,
1709 .location = lex_token_location_rw (src, token, token),
1710 .text = ds_steal_cstr (&s),
1715 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1716 underlying lex_reader if necessary. Returns true if a new token was added
1717 to SRC's deque, false otherwise. The caller should retry failures unless
1718 SRC's 'eof' marker was set to true indicating that there will be no more
1719 tokens from this source. */
1721 lex_source_try_get_pp (struct lex_source *src)
1723 /* Append a new token to SRC and initialize it. */
1724 struct lex_token *token = xmalloc (sizeof *token);
1725 token->token = (struct token) { .type = T_STOP };
1726 token->macro_rep = NULL;
1727 token->ref_cnt = NULL;
1728 token->token_pos = src->seg_pos;
1729 if (src->reader->line_number > 0)
1730 token->first_line = src->reader->line_number + src->n_newlines;
1732 token->first_line = 0;
1734 /* Extract a segment. */
1735 const char *segment;
1736 enum segment_type seg_type;
1740 segment = &src->buffer[src->seg_pos];
1741 seg_len = segmenter_push (&src->segmenter, segment,
1742 src->length - src->seg_pos,
1743 src->reader->eof, &seg_type);
1747 /* The segmenter needs more input to produce a segment. */
1748 assert (!src->reader->eof);
1749 lex_source_read__ (src);
1752 /* Update state based on the segment. */
1753 token->token_len = seg_len;
1754 src->seg_pos += seg_len;
1755 if (seg_type == SEG_NEWLINE)
1758 /* Get a token from the segment. */
1759 enum tokenize_result result = token_from_segment (
1760 seg_type, ss_buffer (segment, seg_len), &token->token);
1762 /* If we've reached the end of a line, or the end of a command, then pass
1763 the line to the output engine as a syntax text item. */
1764 int n_lines = seg_type == SEG_NEWLINE;
1765 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1768 src->suppress_next_newline = true;
1770 else if (n_lines > 0 && src->suppress_next_newline)
1773 src->suppress_next_newline = false;
1775 for (int i = 0; i < n_lines; i++)
1777 /* Beginning of line. */
1778 const char *line = &src->buffer[src->journal_pos];
1780 /* Calculate line length, including \n or \r\n end-of-line if present.
1782 We use src->head even though that may be beyond what we've actually
1783 converted to tokens (which is only through line_pos). That's because,
1784 if we're emitting the line due to SEG_END_COMMAND, we want to take the
1785 whole line through the newline, not just through the '.'. */
1786 size_t max_len = src->length - src->journal_pos;
1787 const char *newline = memchr (line, '\n', max_len);
1788 size_t line_len = newline ? newline - line + 1 : max_len;
1790 /* Calculate line length excluding end-of-line. */
1791 size_t copy_len = line_len;
1792 if (copy_len > 0 && line[copy_len - 1] == '\n')
1794 if (copy_len > 0 && line[copy_len - 1] == '\r')
1797 /* Submit the line as syntax. */
1798 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1799 xmemdup0 (line, copy_len),
1802 src->journal_pos += line_len;
1807 case TOKENIZE_ERROR:
1808 lex_get_error (src, token);
1810 case TOKENIZE_EMPTY:
1811 lex_token_destroy (token);
1814 case TOKENIZE_TOKEN:
1815 if (token->token.type == T_STOP)
1817 token->token.type = T_ENDCMD;
1820 lex_stage_push_last (&src->pp, token);
1826 /* Attempts to append a new token to SRC. Returns true if successful, false on
1827 failure. On failure, the end of SRC has been reached and no more tokens
1828 will be forthcoming from it.
1830 Does not make the new token available for lookahead yet; the caller must
1831 adjust SRC's 'middle' pointer to do so. */
1833 lex_source_get_pp (struct lex_source *src)
1836 if (lex_source_try_get_pp (src))
1842 lex_source_try_get_merge (const struct lex_source *src_)
1844 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1846 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1849 if (!settings_get_mexpand ())
1851 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1855 /* Now pass tokens one-by-one to the macro expander.
1857 In the common case where there is no macro to expand, the loop is not
1859 struct macro_call *mc;
1860 int n_call = macro_call_create (src->lexer->macros,
1861 &lex_stage_first (&src->pp)->token, &mc);
1862 for (int ofs = 1; !n_call; ofs++)
1864 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1866 /* This should not be reachable because we always get a T_ENDCMD at
1867 the end of an input file (transformed from T_STOP by
1868 lex_source_try_get_pp()) and the macro_expander should always
1869 terminate expansion on T_ENDCMD. */
1873 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1874 size_t start = t->token_pos;
1875 size_t end = t->token_pos + t->token_len;
1876 const struct macro_token mt = {
1878 .syntax = ss_buffer (&src->buffer[start], end - start),
1880 const struct msg_location loc = lex_token_location (src, t, t);
1881 n_call = macro_call_add (mc, &mt, &loc);
1885 /* False alarm: no macro expansion after all. Use first token as
1886 lookahead. We'll retry macro expansion from the second token next
1888 macro_call_destroy (mc);
1889 lex_stage_shift (&src->merge, &src->pp, 1);
1893 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1894 are a macro call. (These are likely to be the only tokens in 'pp'.)
1896 const struct lex_token *c0 = lex_stage_first (&src->pp);
1897 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1898 struct macro_tokens expansion = { .n = 0 };
1899 struct msg_location loc = lex_token_location (src, c0, c1);
1900 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1901 macro_call_destroy (mc);
1903 /* Convert the macro expansion into syntax for possible error messages
1905 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1906 size_t *len = xnmalloc (expansion.n, sizeof *len);
1907 struct string s = DS_EMPTY_INITIALIZER;
1908 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1910 if (settings_get_mprint ())
1911 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1912 _("Macro Expansion")));
1914 /* Append the macro expansion tokens to the lookahead. */
1915 if (expansion.n > 0)
1917 char *macro_rep = ds_steal_cstr (&s);
1918 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1919 *ref_cnt = expansion.n;
1920 for (size_t i = 0; i < expansion.n; i++)
1922 struct lex_token *token = xmalloc (sizeof *token);
1923 *token = (struct lex_token) {
1924 .token = expansion.mts[i].token,
1925 .token_pos = c0->token_pos,
1926 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1927 .first_line = c0->first_line,
1928 .macro_rep = macro_rep,
1933 lex_stage_push_last (&src->merge, token);
1935 ss_dealloc (&expansion.mts[i].syntax);
1940 free (expansion.mts);
1944 /* Destroy the tokens for the call. */
1945 for (size_t i = 0; i < n_call; i++)
1946 lex_stage_pop_first (&src->pp);
1948 return expansion.n > 0;
1951 /* Attempts to obtain at least one new token into 'merge' in SRC.
1953 Returns true if successful, false on failure. In the latter case, SRC is
1954 exhausted and 'src->eof' is now true. */
1956 lex_source_get_merge (struct lex_source *src)
1959 if (lex_source_try_get_merge (src))
1964 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1966 Returns true if successful, false on failure. In the latter case, SRC is
1967 exhausted and 'src->eof' is now true. */
1969 lex_source_get_parse (struct lex_source *src)
1971 struct merger m = MERGER_INIT;
1973 for (size_t i = 0; ; i++)
1975 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1977 /* We always get a T_ENDCMD at the end of an input file
1978 (transformed from T_STOP by lex_source_try_get_pp()) and
1979 merger_add() should never return -1 on T_ENDCMD. */
1980 assert (lex_stage_is_empty (&src->merge));
1984 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1988 lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1991 else if (retval > 0)
1993 /* Add a token that merges all the tokens together. */
1994 const struct lex_token *first = lex_stage_first (&src->merge);
1995 const struct lex_token *last = lex_stage_nth (&src->merge,
1997 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1998 struct lex_token *t = xmalloc (sizeof *t);
1999 *t = (struct lex_token) {
2001 .token_pos = first->token_pos,
2002 .token_len = (last->token_pos - first->token_pos) + last->token_len,
2003 .first_line = first->first_line,
2005 /* This works well if all the tokens were not expanded from macros,
2006 or if they came from the same macro expansion. It just gives up
2007 in the other (corner) cases. */
2008 .macro_rep = macro ? first->macro_rep : NULL,
2009 .ofs = macro ? first->ofs : 0,
2010 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2011 .ref_cnt = macro ? first->ref_cnt : NULL,
2015 lex_source_push_parse (src, t);
2017 for (int i = 0; i < retval; i++)
2018 lex_stage_pop_first (&src->merge);
2025 lex_source_push_endcmd__ (struct lex_source *src)
2027 assert (src->n_parse == 0);
2029 struct lex_token *token = xmalloc (sizeof *token);
2030 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2031 lex_source_push_parse (src, token);
2035 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2037 if (src->n_parse >= src->allocated_parse)
2038 src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2039 sizeof *src->parse);
2040 src->parse[src->n_parse++] = token;
2044 lex_source_clear_parse (struct lex_source *src)
2046 for (size_t i = 0; i < src->n_parse; i++)
2047 lex_token_destroy (src->parse[i]);
2048 src->n_parse = src->parse_ofs = 0;
2051 static struct lex_source *
2052 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2054 struct lex_source *src = xmalloc (sizeof *src);
2055 *src = (struct lex_source) {
2057 .segmenter = segmenter_init (reader->syntax, false),
2061 lex_source_push_endcmd__ (src);
2067 lex_source_destroy (struct lex_source *src)
2069 char *file_name = src->reader->file_name;
2070 char *encoding = src->reader->encoding;
2071 if (src->reader->class->destroy != NULL)
2072 src->reader->class->destroy (src->reader);
2076 lex_stage_uninit (&src->pp);
2077 lex_stage_uninit (&src->merge);
2078 lex_source_clear_parse (src);
2080 ll_remove (&src->ll);
2084 struct lex_file_reader
2086 struct lex_reader reader;
2087 struct u8_istream *istream;
2090 static struct lex_reader_class lex_file_reader_class;
2092 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2093 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2094 ENCODING, which should take one of the forms accepted by
2095 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2096 mode of the new reader, respectively.
2098 Returns a null pointer if FILE_NAME cannot be opened. */
2100 lex_reader_for_file (const char *file_name, const char *encoding,
2101 enum segmenter_mode syntax,
2102 enum lex_error_mode error)
2104 struct lex_file_reader *r;
2105 struct u8_istream *istream;
2107 istream = (!strcmp(file_name, "-")
2108 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2109 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2110 if (istream == NULL)
2112 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2116 r = xmalloc (sizeof *r);
2117 lex_reader_init (&r->reader, &lex_file_reader_class);
2118 r->reader.syntax = syntax;
2119 r->reader.error = error;
2120 r->reader.file_name = xstrdup (file_name);
2121 r->reader.encoding = xstrdup_if_nonnull (encoding);
2122 r->reader.line_number = 1;
2123 r->istream = istream;
2128 static struct lex_file_reader *
2129 lex_file_reader_cast (struct lex_reader *r)
2131 return UP_CAST (r, struct lex_file_reader, reader);
2135 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2136 enum prompt_style prompt_style UNUSED)
2138 struct lex_file_reader *r = lex_file_reader_cast (r_);
2139 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2142 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2149 lex_file_close (struct lex_reader *r_)
2151 struct lex_file_reader *r = lex_file_reader_cast (r_);
2153 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2155 if (u8_istream_close (r->istream) != 0)
2156 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2159 u8_istream_free (r->istream);
2164 static struct lex_reader_class lex_file_reader_class =
2170 struct lex_string_reader
2172 struct lex_reader reader;
2177 static struct lex_reader_class lex_string_reader_class;
2179 /* Creates and returns a new lex_reader for the contents of S, which must be
2180 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2181 with ss_dealloc() when it is closed. */
2183 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2185 struct lex_string_reader *r;
2187 r = xmalloc (sizeof *r);
2188 lex_reader_init (&r->reader, &lex_string_reader_class);
2189 r->reader.syntax = SEG_MODE_AUTO;
2190 r->reader.encoding = xstrdup_if_nonnull (encoding);
2197 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2198 which must be encoded in ENCODING. The caller retains ownership of S. */
2200 lex_reader_for_string (const char *s, const char *encoding)
2202 struct substring ss;
2203 ss_alloc_substring (&ss, ss_cstr (s));
2204 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string, taking its arguments from
   the variable arguments that follow ENCODING, and creates and returns a new
   lex_reader for the formatted result, which must be encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  /* The new reader takes ownership of the formatted string. */
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2222 static struct lex_string_reader *
2223 lex_string_reader_cast (struct lex_reader *r)
2225 return UP_CAST (r, struct lex_string_reader, reader);
2229 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2230 enum prompt_style prompt_style UNUSED)
2232 struct lex_string_reader *r = lex_string_reader_cast (r_);
2235 chunk = MIN (n, r->s.length - r->offset);
2236 memcpy (buf, r->s.string + r->offset, chunk);
2243 lex_string_close (struct lex_reader *r_)
2245 struct lex_string_reader *r = lex_string_reader_cast (r_);
2251 static struct lex_reader_class lex_string_reader_class =