1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
32 #include "language/command.h"
33 #include "language/lexer/macro.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/output-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* For a token obtained through the lexer in an ordinary way, this is the
65 location of the token in terms of the lex_source's buffer.
67 For a token produced through macro expansion, this is the entire macro
69 size_t token_pos; /* Offset into src->buffer of token start. */
70 size_t token_len; /* Length of source for token in bytes. */
71 int first_line; /* Line number at token_pos. */
73 /* For a token obtained through macro expansion, this is just this token.
75 For a token obtained through the lexer in an ordinary way, these are
77 char *macro_rep; /* The whole macro expansion. */
78 size_t ofs; /* Offset of this token in macro_rep. */
79 size_t len; /* Length of this token in macro_rep. */
80 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
84 lex_token_destroy (struct lex_token *t)
86 token_uninit (&t->token);
89 assert (*t->ref_cnt > 0);
99 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
104 struct lex_token **tokens;
107 static void lex_stage_clear (struct lex_stage *);
108 static void lex_stage_uninit (struct lex_stage *);
110 static size_t lex_stage_count (const struct lex_stage *);
111 static bool lex_stage_is_empty (const struct lex_stage *);
113 static struct lex_token *lex_stage_last (struct lex_stage *);
114 static struct lex_token *lex_stage_first (struct lex_stage *);
115 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
117 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
118 static void lex_stage_pop_first (struct lex_stage *);
120 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
/* Empties STAGE, destroying each token that it held. */
static void
lex_stage_clear (struct lex_stage *stage)
{
  for (;;)
    {
      if (lex_stage_is_empty (stage))
        break;
      lex_stage_pop_first (stage);
    }
}
131 /* Deletes all the tokens from STAGE and frees storage for the deque. */
133 lex_stage_uninit (struct lex_stage *stage)
135 lex_stage_clear (stage);
136 free (stage->tokens);
139 /* Returns true if STAGE contains no tokens, otherwise false. */
141 lex_stage_is_empty (const struct lex_stage *stage)
143 return deque_is_empty (&stage->deque);
146 /* Returns the number of tokens in STAGE. */
148 lex_stage_count (const struct lex_stage *stage)
150 return deque_count (&stage->deque);
153 /* Returns the last token in STAGE, which must be nonempty. The last token is
154 the one accessed with the greatest lookahead. */
155 static struct lex_token *
156 lex_stage_last (struct lex_stage *stage)
158 return stage->tokens[deque_front (&stage->deque, 0)];
/* Returns the first token in STAGE, that is, the one reached with the least
   lookahead.  STAGE must not be empty. */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  struct lex_token *first = lex_stage_nth (stage, 0);
  return first;
}
169 /* Returns the token the given INDEX in STAGE. The first token (with the least
170 lookahead) is 0, the second token is 1, and so on. There must be at least
171 INDEX + 1 tokens in STAGE. */
172 static struct lex_token *
173 lex_stage_nth (struct lex_stage *stage, size_t index)
175 return stage->tokens[deque_back (&stage->deque, index)];
178 /* Adds TOKEN so that it becomes the last token in STAGE. */
180 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
182 if (deque_is_full (&stage->deque))
183 stage->tokens = deque_expand (&stage->deque, stage->tokens,
184 sizeof *stage->tokens);
185 stage->tokens[deque_push_front (&stage->deque)] = token;
188 /* Removes the first token from STAGE and uninitializes it. */
190 lex_stage_pop_first (struct lex_stage *stage)
192 lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
195 /* Removes the first N tokens from SRC, appending them to DST as the last
198 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
200 for (size_t i = 0; i < n; i++)
202 lex_stage_push_last (dst, lex_stage_first (src));
203 deque_pop_back (&src->deque);
207 /* A source of tokens, corresponding to a syntax file.
209 This is conceptually a lex_reader wrapped with everything needed to convert
210 its UTF-8 bytes into tokens. */
213 struct ll ll; /* In lexer's list of sources. */
214 struct lex_reader *reader;
216 struct segmenter segmenter;
217 bool eof; /* True if T_STOP was read from 'reader'. */
219 /* Buffer of UTF-8 bytes. */
220 char *buffer; /* Source file contents. */
221 size_t length; /* Number of bytes filled. */
222 size_t allocated; /* Number of bytes allocated. */
224 /* Offsets into 'buffer'. */
225 size_t journal_pos; /* First byte not yet output to journal. */
226 size_t seg_pos; /* First byte not yet scanned as token. */
228 int n_newlines; /* Number of new-lines up to seg_pos. */
229 bool suppress_next_newline;
233 This is a pipeline with the following stages. Each token eventually
234 made available to the parser passes through all of these stages. The stages
235 are named after the processing that happens in each one.
237 Initially, tokens come from the segmenter and scanner to 'pp':
239 - pp: Tokens that need to pass through the macro preprocessor to end up
242 - merge: Tokens that need to pass through scan_merge() to end up in
245 - lookahead: Tokens available to the client for parsing. */
247 struct lex_stage merge;
248 struct lex_stage lookahead;
251 static struct lex_source *lex_source_create (struct lexer *,
252 struct lex_reader *);
253 static void lex_source_destroy (struct lex_source *);
258 struct ll_list sources; /* Contains "struct lex_source"s. */
259 struct macro_set *macros;
262 static struct lex_source *lex_source__ (const struct lexer *);
263 static char *lex_source_get_syntax__ (const struct lex_source *,
265 static const struct lex_token *lex_next__ (const struct lexer *, int n);
266 static void lex_source_push_endcmd__ (struct lex_source *);
268 static bool lex_source_get_lookahead (struct lex_source *);
269 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
270 const char *format, va_list)
271 PRINTF_FORMAT (4, 0);
272 static const struct lex_token *lex_source_next__ (const struct lex_source *,
275 /* Initializes READER with the specified CLASS and otherwise some reasonable
276 defaults. The caller should fill in the other members as desired. */
278 lex_reader_init (struct lex_reader *reader,
279 const struct lex_reader_class *class)
281 reader->class = class;
282 reader->syntax = SEG_MODE_AUTO;
283 reader->error = LEX_ERROR_CONTINUE;
284 reader->file_name = NULL;
285 reader->encoding = NULL;
286 reader->line_number = 0;
290 /* Frees any file name already in READER and replaces it by a copy of
291 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
293 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
295 free (reader->file_name);
296 reader->file_name = xstrdup_if_nonnull (file_name);
299 /* Creates and returns a new lexer. */
303 struct lexer *lexer = xmalloc (sizeof *lexer);
304 *lexer = (struct lexer) {
305 .sources = LL_INITIALIZER (lexer->sources),
306 .macros = macro_set_create (),
311 /* Destroys LEXER. */
313 lex_destroy (struct lexer *lexer)
317 struct lex_source *source, *next;
319 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
320 lex_source_destroy (source);
321 macro_set_destroy (lexer->macros);
326 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
327 same name. Takes ownership of M. */
329 lex_define_macro (struct lexer *lexer, struct macro *m)
331 macro_set_add (lexer->macros, m);
334 /* Inserts READER into LEXER so that the next token read by LEXER comes from
335 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
338 lex_include (struct lexer *lexer, struct lex_reader *reader)
340 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
341 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
344 /* Appends READER to LEXER, so that it will be read after all other current
345 readers have already been read. */
347 lex_append (struct lexer *lexer, struct lex_reader *reader)
349 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
354 /* Advances LEXER to the next token, consuming the current token. */
356 lex_get (struct lexer *lexer)
358 struct lex_source *src;
360 src = lex_source__ (lexer);
364 if (!lex_stage_is_empty (&src->lookahead))
365 lex_stage_pop_first (&src->lookahead);
367 while (lex_stage_is_empty (&src->lookahead))
368 if (!lex_source_get_lookahead (src))
370 lex_source_destroy (src);
371 src = lex_source__ (lexer);
377 /* Advances LEXER by N tokens. */
379 lex_get_n (struct lexer *lexer, size_t n)
385 /* Issuing errors. */
387 /* Prints a syntax error message containing the current token and
388 given message MESSAGE (if non-null). */
390 lex_error (struct lexer *lexer, const char *format, ...)
394 va_start (args, format);
395 lex_next_error_valist (lexer, 0, 0, format, args);
399 /* Prints a syntax error message containing the current token and
400 given message MESSAGE (if non-null). */
402 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
404 lex_next_error_valist (lexer, 0, 0, format, args);
407 /* Prints a syntax error message containing the current token and
408 given message MESSAGE (if non-null). */
410 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
414 va_start (args, format);
415 lex_next_error_valist (lexer, n0, n1, format, args);
419 /* Prints a syntax error message saying that one of the strings provided as
420 varargs, up to the first NULL, is expected. */
422 (lex_error_expecting) (struct lexer *lexer, ...)
426 va_start (args, lexer);
427 lex_error_expecting_valist (lexer, args);
431 /* Prints a syntax error message saying that one of the options provided in
432 ARGS, up to the first NULL, is expected. */
434 lex_error_expecting_valist (struct lexer *lexer, va_list args)
436 enum { MAX_OPTIONS = 9 };
437 const char *options[MAX_OPTIONS];
439 while (n < MAX_OPTIONS)
441 const char *option = va_arg (args, const char *);
445 options[n++] = option;
447 lex_error_expecting_array (lexer, options, n);
451 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
456 lex_error (lexer, NULL);
460 lex_error (lexer, _("expecting %s"), options[0]);
464 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
468 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
473 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
474 options[0], options[1], options[2], options[3]);
478 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
479 options[0], options[1], options[2], options[3], options[4]);
483 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
484 options[0], options[1], options[2], options[3], options[4],
489 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
490 options[0], options[1], options[2], options[3], options[4],
491 options[5], options[6]);
495 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
496 options[0], options[1], options[2], options[3], options[4],
497 options[5], options[6], options[7]);
501 lex_error (lexer, NULL);
505 /* Reports an error to the effect that subcommand SBC may only be specified
508 This function does not take a lexer as an argument or use lex_error(),
509 because the result would ordinarily just be redundant: "Syntax error at
510 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
511 not help the user find the error. */
513 lex_sbc_only_once (const char *sbc)
515 msg (SE, _("Subcommand %s may only be specified once."), sbc);
518 /* Reports an error to the effect that subcommand SBC is missing.
520 This function does not take a lexer as an argument or use lex_error(),
521 because a missing subcommand can normally be detected only after the whole
522 command has been parsed, and so lex_error() would always report "Syntax
523 error at end of command", which does not help the user find the error. */
525 lex_sbc_missing (const char *sbc)
527 msg (SE, _("Required subcommand %s was not specified."), sbc);
530 /* Reports an error to the effect that specification SPEC may only be specified
531 once within subcommand SBC. */
533 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
535 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
539 /* Reports an error to the effect that specification SPEC is missing within
542 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
544 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
548 /* Prints a syntax error message containing the current token and
549 given message MESSAGE (if non-null). */
551 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
552 const char *format, va_list args)
554 struct lex_source *src = lex_source__ (lexer);
557 lex_source_error_valist (src, n0, n1, format, args);
563 ds_put_format (&s, _("Syntax error at end of input"));
566 ds_put_cstr (&s, ": ");
567 ds_put_vformat (&s, format, args);
569 if (ds_last (&s) != '.')
570 ds_put_byte (&s, '.');
571 msg (SE, "%s", ds_cstr (&s));
576 /* Checks that we're at end of command.
577 If so, returns a successful command completion code.
578 If not, flags a syntax error and returns an error command
581 lex_end_of_command (struct lexer *lexer)
583 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
585 lex_error (lexer, _("expecting end of command"));
592 /* Token testing functions. */
/* Returns true if LEXER's current token is a number, false otherwise. */
bool
lex_is_number (const struct lexer *lexer)
{
  bool is_number = lex_next_is_number (lexer, 0);
  return is_number;
}
/* Returns true if LEXER's current token is a string, false otherwise. */
bool
lex_is_string (const struct lexer *lexer)
{
  bool is_string = lex_next_is_string (lexer, 0);
  return is_string;
}
/* Returns the value of LEXER's current token, which must be a floating
   point number. */
double
lex_number (const struct lexer *lexer)
{
  double value = lex_next_number (lexer, 0);
  return value;
}
/* Returns true if and only if LEXER's current token is an integer. */
bool
lex_is_integer (const struct lexer *lexer)
{
  bool is_integer = lex_next_is_integer (lexer, 0);
  return is_integer;
}
/* Returns the value of LEXER's current token, which must be an integer. */
long
lex_integer (const struct lexer *lexer)
{
  long value = lex_next_integer (lexer, 0);
  return value;
}
631 /* Token testing functions with lookahead.
633 A value of 0 for N as an argument to any of these functions refers to the
634 current token. Lookahead is limited to the current command. Any N greater
635 than the number of tokens remaining in the current command will be treated
636 as referring to a T_ENDCMD token. */
/* Returns true if the token N positions ahead of LEXER's current token is a
   number, false otherwise. */
bool
lex_next_is_number (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_is_number (t);
}
/* Returns true if the token N positions ahead of LEXER's current token is a
   string, false otherwise. */
bool
lex_next_is_string (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_is_string (t);
}
/* Returns the value of the token N positions ahead of LEXER's current token.
   That token must be a floating point number. */
double
lex_next_number (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
/* Returns true if the token N positions ahead of LEXER's current token is an
   integer, false otherwise. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_is_integer (t);
}
/* Returns the value of the token N positions ahead of LEXER's current token.
   That token must be an integer. */
long
lex_next_integer (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_integer (t);
}
675 /* Token matching functions. */
677 /* If the current token has the specified TYPE, skips it and returns true.
678 Otherwise, returns false. */
680 lex_match (struct lexer *lexer, enum token_type type)
682 if (lex_token (lexer) == type)
/* If LEXER's current token matches IDENTIFIER, consumes it and returns true;
   otherwise returns false.  IDENTIFIER may be abbreviated to its first three
   letters.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  /* Shortest abbreviation of IDENTIFIER that is accepted. */
  enum { MIN_ABBREVIATION = 3 };

  return lex_match_id_n (lexer, identifier, MIN_ABBREVIATION);
}
702 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
703 may be abbreviated to its first N letters. Otherwise, returns false.
705 IDENTIFIER must be an ASCII string. */
707 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
709 if (lex_token (lexer) == T_ID
710 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
719 /* If the current token is integer X, skips it and returns true. Otherwise,
722 lex_match_int (struct lexer *lexer, int x)
724 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
733 /* Forced matches. */
735 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
736 abbreviated to its first 3 letters. Otherwise, reports an error and returns
739 IDENTIFIER must be an ASCII string. */
741 lex_force_match_id (struct lexer *lexer, const char *identifier)
743 if (lex_match_id (lexer, identifier))
747 lex_error_expecting (lexer, identifier);
752 /* If the current token has the specified TYPE, skips it and returns true.
753 Otherwise, reports an error and returns false. */
755 lex_force_match (struct lexer *lexer, enum token_type type)
757 if (lex_token (lexer) == type)
764 const char *type_string = token_type_to_string (type);
767 char *s = xasprintf ("`%s'", type_string);
768 lex_error_expecting (lexer, s);
772 lex_error_expecting (lexer, token_type_to_name (type));
778 /* If the current token is a string, does nothing and returns true.
779 Otherwise, reports an error and returns false. */
781 lex_force_string (struct lexer *lexer)
783 if (lex_is_string (lexer))
787 lex_error (lexer, _("expecting string"));
792 /* If the current token is a string or an identifier, does nothing and returns
793 true. Otherwise, reports an error and returns false.
795 This is meant for use in syntactic situations where we want to encourage the
796 user to supply a quoted string, but for compatibility we also accept
797 identifiers. (One example of such a situation is file names.) Therefore,
798 the error message issued when the current token is wrong only says that a
799 string is expected and doesn't mention that an identifier would also be
802 lex_force_string_or_id (struct lexer *lexer)
804 return lex_token (lexer) == T_ID || lex_force_string (lexer);
807 /* If the current token is an integer, does nothing and returns true.
808 Otherwise, reports an error and returns false. */
810 lex_force_int (struct lexer *lexer)
812 if (lex_is_integer (lexer))
816 lex_error (lexer, _("expecting integer"));
821 /* If the current token is an integer in the range MIN...MAX (inclusive), does
822 nothing and returns true. Otherwise, reports an error and returns false.
823 If NAME is nonnull, then it is used in the error message. */
825 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
827 bool is_integer = lex_is_integer (lexer);
828 bool too_small = is_integer && lex_integer (lexer) < min;
829 bool too_big = is_integer && lex_integer (lexer) > max;
830 if (is_integer && !too_small && !too_big)
835 /* Weird, maybe a bug in the caller. Just report that we needed an
838 lex_error (lexer, _("Integer expected for %s."), name);
840 lex_error (lexer, _("Integer expected."));
845 lex_error (lexer, _("Expected %ld for %s."), min, name);
847 lex_error (lexer, _("Expected %ld."), min);
849 else if (min + 1 == max)
852 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
854 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
858 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
859 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
861 if (report_lower_bound && report_upper_bound)
865 _("Expected integer between %ld and %ld for %s."),
868 lex_error (lexer, _("Expected integer between %ld and %ld."),
871 else if (report_lower_bound)
876 lex_error (lexer, _("Expected non-negative integer for %s."),
879 lex_error (lexer, _("Expected non-negative integer."));
884 lex_error (lexer, _("Expected positive integer for %s."),
887 lex_error (lexer, _("Expected positive integer."));
890 else if (report_upper_bound)
894 _("Expected integer less than or equal to %ld for %s."),
897 lex_error (lexer, _("Expected integer less than or equal to %ld."),
903 lex_error (lexer, _("Integer expected for %s."), name);
905 lex_error (lexer, _("Integer expected."));
911 /* If the current token is a number, does nothing and returns true.
912 Otherwise, reports an error and returns false. */
914 lex_force_num (struct lexer *lexer)
916 if (lex_is_number (lexer))
919 lex_error (lexer, _("expecting number"));
923 /* If the current token is an identifier, does nothing and returns true.
924 Otherwise, reports an error and returns false. */
926 lex_force_id (struct lexer *lexer)
928 if (lex_token (lexer) == T_ID)
931 lex_error (lexer, _("expecting identifier"));
935 /* Token accessors. */
937 /* Returns the type of LEXER's current token. */
939 lex_token (const struct lexer *lexer)
941 return lex_next_token (lexer, 0);
/* Returns the number held by LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For any other
   token this function always returns zero. */
double
lex_tokval (const struct lexer *lexer)
{
  double value = lex_next_tokval (lexer, 0);
  return value;
}
/* Returns the null-terminated, UTF-8 encoded string in LEXER's current token.

   Only T_ID and T_STRING tokens have meaningful strings.  For any other
   token this function always returns NULL.

   The UTF-8 encoding of the result is correct for variable names and other
   identifiers.  Use filename_to_utf8() to use it as a file name, and
   data_in() to use it in a "union value". */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  const char *text = lex_next_tokcstr (lexer, 0);
  return text;
}
968 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
969 null-terminated (but the null terminator is not included in the returned
970 substring's 'length').
972 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
973 this functions this function will always return NULL.
975 The UTF-8 encoding of the returned string is correct for variable names and
976 other identifiers. Use filename_to_utf8() to use it as a filename. Use
977 data_in() to use it in a "union value". */
979 lex_tokss (const struct lexer *lexer)
981 return lex_next_tokss (lexer, 0);
986 A value of 0 for N as an argument to any of these functions refers to the
987 current token. Lookahead is limited to the current command. Any N greater
988 than the number of tokens remaining in the current command will be treated
989 as referring to a T_ENDCMD token. */
991 static const struct lex_token *
992 lex_next__ (const struct lexer *lexer_, int n)
994 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
995 struct lex_source *src = lex_source__ (lexer);
998 return lex_source_next__ (src, n);
1001 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1006 static const struct lex_token *
1007 lex_source_next__ (const struct lex_source *src_, int n)
1009 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1010 while (lex_stage_count (&src->lookahead) <= n)
1012 if (!lex_stage_is_empty (&src->lookahead))
1014 const struct lex_token *t = lex_stage_last (&src->lookahead);
1015 if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1019 lex_source_get_lookahead (src);
1022 return lex_stage_nth (&src->lookahead, n);
1025 /* Returns the "struct token" of the token N after the current one in LEXER.
1026 The returned pointer can be invalidated by pretty much any succeeding call
1027 into the lexer, although the string pointer within the returned token is
1028 only invalidated by consuming the token (e.g. with lex_get()). */
1029 const struct token *
1030 lex_next (const struct lexer *lexer, int n)
1032 return &lex_next__ (lexer, n)->token;
1035 /* Returns the type of the token N after the current one in LEXER. */
1037 lex_next_token (const struct lexer *lexer, int n)
1039 return lex_next (lexer, n)->type;
/* Returns the number in the token N positions after LEXER's current one.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For any other
   token this function always returns zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1052 /* Returns the null-terminated string in the token N after the current one, in
1055 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
1056 this functions this function will always return NULL.
1058 The UTF-8 encoding of the returned string is correct for variable names and
1059 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1060 data_in() to use it in a "union value". */
1062 lex_next_tokcstr (const struct lexer *lexer, int n)
1064 return lex_next_tokss (lexer, n).string;
1067 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1068 The string is null-terminated (but the null terminator is not included in
1069 the returned substring's 'length').
1071 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
1072 tokens this functions this function will always return NULL.
1074 The UTF-8 encoding of the returned string is correct for variable names and
1075 other identifiers. Use filename_to_utf8() to use it as a filename. Use
1076 data_in() to use it in a "union value". */
1078 lex_next_tokss (const struct lexer *lexer, int n)
1080 return lex_next (lexer, n)->string;
/* Returns the syntax text for the tokens from N0 ahead of LEXER's current
   token through N1 ahead of it, inclusive.  (With N0 and N1 both zero, this
   is the syntax for the current token alone.)

   The result is UTF-8 text in the form originally given to the lexer, so a
   span of several tokens may include comments, spaces, and new-lines.  Macro
   expansion, however, has already been applied.

   The caller must eventually free the returned string with free(). */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);
  return lex_source_get_syntax__ (src, n0, n1);
}
1096 /* Returns true if the token N ahead of the current one was produced by macro
1097 expansion, false otherwise. */
1099 lex_next_is_from_macro (const struct lexer *lexer, int n)
1101 return lex_next__ (lexer, n)->macro_rep != NULL;
1105 lex_tokens_match (const struct token *actual, const struct token *expected)
1107 if (actual->type != expected->type)
1110 switch (actual->type)
1114 return actual->number == expected->number;
1117 return lex_id_match (expected->string, actual->string);
1120 return (actual->string.length == expected->string.length
1121 && !memcmp (actual->string.string, expected->string.string,
1122 actual->string.length));
1130 lex_at_phrase__ (struct lexer *lexer, const char *s)
1132 struct string_lexer slex;
1136 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1137 while (string_lexer_next (&slex, &token))
1139 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1140 token_uninit (&token);
/* Returns true if LEXER is positioned at the sequence of tokens that S
   parses into, false otherwise.  LEXER's position is not advanced.

   S may be any sequence of tokens, e.g. "KRUSKAL-WALLIS", "2SLS", or
   "END INPUT PROGRAM".  Identifiers may be abbreviated to their first three
   letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  if (lex_at_phrase__ (lexer, s) > 0)
    return true;

  return false;
}
1159 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1160 skips it and returns true. Otherwise, returns false.
1162 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1163 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1164 first three letters. */
1166 lex_match_phrase (struct lexer *lexer, const char *s)
1168 size_t n = lex_at_phrase__ (lexer, s);
1170 lex_get_n (lexer, n);
/* Returns the number of new-line ('\n') bytes among the LENGTH bytes that
   start at S.  Returns 0 if LENGTH is 0 or if no new-line is present. */
static int
count_newlines (char *s, size_t length)
{
  int n_newlines = 0;
  char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      n_newlines++;

      /* Continue the search just past the new-line that was found. */
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
1191 lex_token_get_last_line_number (const struct lex_source *src,
1192 const struct lex_token *token)
1194 if (token->first_line == 0)
1198 char *token_str = &src->buffer[token->token_pos];
1199 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1204 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1206 const char *newline = memrchr (src->buffer, '\n', offset);
1207 size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1208 return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1212 lex_token_get_first_column (const struct lex_source *src,
1213 const struct lex_token *token)
1215 return lex_token_get_column__ (src, token->token_pos);
1219 lex_token_get_last_column (const struct lex_source *src,
1220 const struct lex_token *token)
1222 return lex_token_get_column__ (src, token->token_pos + token->token_len);
1225 static struct msg_location
1226 lex_token_location (const struct lex_source *src,
1227 const struct lex_token *t0,
1228 const struct lex_token *t1)
1230 return (struct msg_location) {
1231 .file_name = src->reader->file_name,
1232 .first_line = t0->first_line,
1233 .last_line = lex_token_get_last_line_number (src, t1),
1234 .first_column = lex_token_get_first_column (src, t0),
1235 .last_column = lex_token_get_last_column (src, t1),
1239 static struct msg_location *
1240 lex_token_location_rw (const struct lex_source *src,
1241 const struct lex_token *t0,
1242 const struct lex_token *t1)
1244 struct msg_location location = lex_token_location (src, t0, t1);
1245 return msg_location_dup (&location);
/* Returns a duplicated msg_location spanning the tokens from N0 ahead of
   SRC's current token through N1 ahead of it. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1256 /* Returns the 1-based line number of the start of the syntax that represents
1257 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1258 if the token is drawn from a source that does not have line numbers. */
1260 lex_get_first_line_number (const struct lexer *lexer, int n)
1262 const struct lex_source *src = lex_source__ (lexer);
1263 return src ? lex_source_next__ (src, n)->first_line : 0;
/* Returns the 1-based number of the line on which the syntax for the token N
   after the current one in LEXER ends, plus 1.  Returns 0 if LEXER has no
   source, for a T_STOP token, or if the token comes from a source that does
   not have line numbers.

   A token usually lies within one line of syntax.  The exceptions are a
   T_STRING token built from segments on adjacent lines joined by "+"
   punctuators, and a T_NEG_NUM token whose "-" is on one line and whose
   number is on the next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
/* Returns the 1-based column at which the syntax for the token N after the
   current one in LEXER begins.  Returns 0 if LEXER has no source.

   Columns are counted by display width in a typical fixed-width font: CJK
   characters have width 2 and combining characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
/* Returns the 1-based column at which the syntax for the token N after the
   current one in LEXER ends, plus 1.  Returns 0 if LEXER has no source.

   Columns are counted by display width in a typical fixed-width font: CJK
   characters have width 2 and combining characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1312 /* Returns the name of the syntax file from which the current command is drawn.
1313 Returns NULL for a T_STOP token or if the command's source does not have
1316 There is no version of this function that takes an N argument because
1317 lookahead only works to the end of a command and any given command is always
1318 within a single syntax file. */
1320 lex_get_file_name (const struct lexer *lexer)
1322 struct lex_source *src = lex_source__ (lexer);
1323 return src == NULL ? NULL : src->reader->file_name;
1326 /* Returns a newly allocated msg_location for the syntax that represents tokens
1327 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1328 must eventually free the location (with msg_location_destroy()). */
1329 struct msg_location *
1330 lex_get_location (const struct lexer *lexer, int n0, int n1)
1332 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1333 loc->first_column = lex_get_first_column (lexer, n0);
1334 loc->last_column = lex_get_last_column (lexer, n1);
1338 /* Returns a newly allocated msg_location for the syntax that represents tokens
1339 with 0-based offsets N0...N1, inclusive, from the current token. The
1340 location only covers the tokens' lines, not the columns. The caller must
1341 eventually free the location (with msg_location_destroy()). */
1342 struct msg_location *
1343 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1345 struct msg_location *loc = xmalloc (sizeof *loc);
1346 *loc = (struct msg_location) {
1347 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1348 .first_line = lex_get_first_line_number (lexer, n0),
1349 .last_line = lex_get_last_line_number (lexer, n1),
1355 lex_get_encoding (const struct lexer *lexer)
1357 struct lex_source *src = lex_source__ (lexer);
1358 return src == NULL ? NULL : src->reader->encoding;
1361 /* Returns the syntax mode for the syntax file from which the current drawn is
1362 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1363 does not have line numbers.
1365 There is no version of this function that takes an N argument because
1366 lookahead only works to the end of a command and any given command is always
1367 within a single syntax file. */
1369 lex_get_syntax_mode (const struct lexer *lexer)
1371 struct lex_source *src = lex_source__ (lexer);
1372 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1375 /* Returns the error mode for the syntax file from which the current drawn is
1376 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1377 source does not have line numbers.
1379 There is no version of this function that takes an N argument because
1380 lookahead only works to the end of a command and any given command is always
1381 within a single syntax file. */
1383 lex_get_error_mode (const struct lexer *lexer)
1385 struct lex_source *src = lex_source__ (lexer);
1386 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1389 /* If the source that LEXER is currently reading has error mode
1390 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1391 token to be read comes directly from whatever is next read from the stream.
1393 It makes sense to call this function after encountering an error in a
1394 command entered on the console, because usually the user would prefer not to
1395 have cascading errors. */
1397 lex_interactive_reset (struct lexer *lexer)
1399 struct lex_source *src = lex_source__ (lexer);
1400 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1403 src->journal_pos = src->seg_pos = 0;
1404 src->n_newlines = 0;
1405 src->suppress_next_newline = false;
1406 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1408 lex_stage_clear (&src->pp);
1409 lex_stage_clear (&src->merge);
1410 lex_stage_clear (&src->lookahead);
1411 lex_source_push_endcmd__ (src);
1415 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1417 lex_discard_rest_of_command (struct lexer *lexer)
1419 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1423 /* Discards all lookahead tokens in LEXER, then discards all input sources
1424 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1425 runs out of input sources. */
1427 lex_discard_noninteractive (struct lexer *lexer)
1429 struct lex_source *src = lex_source__ (lexer);
1433 lex_stage_clear (&src->pp);
1434 lex_stage_clear (&src->merge);
1435 lex_stage_clear (&src->lookahead);
1437 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1438 src = lex_source__ (lexer))
1439 lex_source_destroy (src);
1444 lex_source_expand__ (struct lex_source *src)
1446 if (src->length >= src->allocated)
1447 src->buffer = x2realloc (src->buffer, &src->allocated);
1451 lex_source_read__ (struct lex_source *src)
1455 lex_source_expand__ (src);
1457 size_t space = src->allocated - src->length;
1458 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1459 size_t n = src->reader->class->read (src->reader,
1460 &src->buffer[src->length],
1462 assert (n <= space);
1467 src->reader->eof = true;
1468 lex_source_expand__ (src);
1474 while (!memchr (&src->buffer[src->seg_pos], '\n',
1475 src->length - src->seg_pos));
1478 static struct lex_source *
1479 lex_source__ (const struct lexer *lexer)
1481 return (ll_is_empty (&lexer->sources) ? NULL
1482 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1485 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1486 one, through N1 ahead of the current one, inclusive. (For example, if N0
1487 and N1 are both zero, this requests the syntax for the current token.) The
1488 caller must eventually free the returned string (with free()). The syntax
1489 is encoded in UTF-8 and in the original form supplied to the lexer so that,
1490 for example, it may include comments, spaces, and new-lines if it spans
1491 multiple tokens. Macro expansion, however, has already been performed. */
1493 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1495 struct string s = DS_EMPTY_INITIALIZER;
1496 for (size_t i = n0; i <= n1; )
1498 /* Find [I,J) as the longest sequence of tokens not produced by macro
1499 expansion, or otherwise the longest sequence expanded from a single
1501 const struct lex_token *first = lex_source_next__ (src, i);
1503 for (j = i + 1; j <= n1; j++)
1505 const struct lex_token *cur = lex_source_next__ (src, j);
1506 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1507 || first->macro_rep != cur->macro_rep)
1510 const struct lex_token *last = lex_source_next__ (src, j - 1);
1512 /* Now add the syntax for this sequence of tokens to SRC. */
1513 if (!ds_is_empty (&s))
1514 ds_put_byte (&s, ' ');
1515 if (!first->macro_rep)
1517 size_t start = first->token_pos;
1518 size_t end = last->token_pos + last->token_len;
1519 ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1523 size_t start = first->ofs;
1524 size_t end = last->ofs + last->len;
1525 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1531 return ds_steal_cstr (&s);
1535 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1537 for (size_t i = n0; i <= n1; i++)
1538 if (lex_source_next__ (src, i)->macro_rep)
1543 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1544 raw UTF-8 syntax for the macro call (not for the expansion) and for any
1545 other tokens included in that range. The syntax is encoded in UTF-8 and in
1546 the original form supplied to the lexer so that, for example, it may include
1547 comments, spaces, and new-lines if it spans multiple tokens.
1549 Returns an empty string if the token range doesn't include a macro call.
1551 The caller must not modify or free the returned string. */
1552 static struct substring
1553 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1555 if (!lex_source_contains_macro_call (src, n0, n1))
1558 const struct lex_token *token0 = lex_source_next__ (src, n0);
1559 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1560 size_t start = token0->token_pos;
1561 size_t end = token1->token_pos + token1->token_len;
1563 return ss_buffer (&src->buffer[start], end - start);
1567 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1568 const char *format, va_list args)
1570 const struct lex_token *token;
1575 token = lex_source_next__ (src, n0);
1576 if (token->token.type == T_ENDCMD)
1577 ds_put_cstr (&s, _("Syntax error at end of command"));
1580 /* Get the syntax that caused the error. */
1581 char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1583 str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1586 /* Get the macro call(s) that expanded to the syntax that caused the
1589 str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1596 _("Syntax error at `%s' (in expansion of `%s')"),
1599 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1604 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1607 ds_put_cstr (&s, _("Syntax error"));
1613 ds_put_cstr (&s, ": ");
1614 ds_put_vformat (&s, format, args);
1616 if (ds_last (&s) != '.')
1617 ds_put_byte (&s, '.');
1619 struct msg *m = xmalloc (sizeof *m);
1621 .category = MSG_C_SYNTAX,
1622 .severity = MSG_S_ERROR,
1623 .location = lex_source_get_location (src, n0, n1),
1624 .text = ds_steal_cstr (&s),
1630 lex_get_error (struct lex_source *src, const struct lex_token *token)
1633 str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1634 syntax, sizeof syntax);
1636 struct string s = DS_EMPTY_INITIALIZER;
1637 ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1638 ds_put_format (&s, ": %s", token->token.string.string);
1640 struct msg *m = xmalloc (sizeof *m);
1642 .category = MSG_C_SYNTAX,
1643 .severity = MSG_S_ERROR,
1644 .location = lex_token_location_rw (src, token, token),
1645 .text = ds_steal_cstr (&s),
1650 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1651 underlying lex_reader if necessary. Returns true if a new token was added
1652 to SRC's deque, false otherwise. The caller should retry failures unless
1653 SRC's 'eof' marker was set to true indicating that there will be no more
1654 tokens from this source. */
1656 lex_source_try_get_pp (struct lex_source *src)
1658 /* Append a new token to SRC and initialize it. */
1659 struct lex_token *token = xmalloc (sizeof *token);
1660 token->token = (struct token) { .type = T_STOP };
1661 token->macro_rep = NULL;
1662 token->ref_cnt = NULL;
1663 token->token_pos = src->seg_pos;
/* A reader line number of 0 or less means that the source has no line
   numbers; such tokens get a first_line of 0. */
1664 if (src->reader->line_number > 0)
1665 token->first_line = src->reader->line_number + src->n_newlines;
1667 token->first_line = 0;
1669 /* Extract a segment. */
1670 const char *segment;
1671 enum segment_type seg_type;
/* SEGMENT is taken from src->buffer at the current seg_pos.
   NOTE(review): this is presumably recomputed on each retry because
   lex_source_read__() can reallocate the buffer -- confirm. */
1675 segment = &src->buffer[src->seg_pos];
1676 seg_len = segmenter_push (&src->segmenter, segment,
1677 src->length - src->seg_pos,
1678 src->reader->eof, &seg_type);
1682 /* The segmenter needs more input to produce a segment. */
1683 assert (!src->reader->eof);
1684 lex_source_read__ (src);
1687 /* Update state based on the segment. */
1688 token->token_len = seg_len;
1689 src->seg_pos += seg_len;
1690 if (seg_type == SEG_NEWLINE)
1693 /* Get a token from the segment. */
1694 enum tokenize_result result = token_from_segment (
1695 seg_type, ss_buffer (segment, seg_len), &token->token);
1697 /* If we've reached the end of a line, or the end of a command, then pass
1698 the line to the output engine as a syntax text item. */
/* N_LINES is the number of source lines that the loop below submits.  It
   starts at 1 for a new-line segment and 0 otherwise, and is adjusted
   according to 'suppress_next_newline'. */
1699 int n_lines = seg_type == SEG_NEWLINE;
1700 if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1703 src->suppress_next_newline = true;
1705 else if (n_lines > 0 && src->suppress_next_newline)
1708 src->suppress_next_newline = false;
1710 for (int i = 0; i < n_lines; i++)
1712 /* Beginning of line. */
1713 const char *line = &src->buffer[src->journal_pos];
1715 /* Calculate line length, including \n or \r\n end-of-line if present.
1717 We use src->length even though that may be beyond what we've actually
1718 converted to tokens (which is only through src->seg_pos). That's because,
1719 if we're emitting the line due to SEG_END_COMMAND, we want to take the
1720 whole line through the newline, not just through the '.'. */
1721 size_t max_len = src->length - src->journal_pos;
1722 const char *newline = memchr (line, '\n', max_len);
1723 size_t line_len = newline ? newline - line + 1 : max_len;
1725 /* Calculate line length excluding end-of-line. */
1726 size_t copy_len = line_len;
1727 if (copy_len > 0 && line[copy_len - 1] == '\n')
1729 if (copy_len > 0 && line[copy_len - 1] == '\r')
1732 /* Submit the line as syntax. */
1733 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1734 xmemdup0 (line, copy_len),
/* The next line to submit starts just past this one. */
1737 src->journal_pos += line_len;
/* lex_get_error() reports the message held in token->token.string. */
1742 case TOKENIZE_ERROR:
1743 lex_get_error (src, token);
1745 case TOKENIZE_EMPTY:
1746 lex_token_destroy (token);
/* A T_STOP from the tokenizer is passed on as T_ENDCMD. */
1749 case TOKENIZE_TOKEN:
1750 if (token->token.type == T_STOP)
1752 token->token.type = T_ENDCMD;
1755 lex_stage_push_last (&src->pp, token);
1761 /* Attempts to append a new token to SRC. Returns true if successful, false on
1762 failure. On failure, the end of SRC has been reached and no more tokens
1763 will be forthcoming from it.
1765 Does not make the new token available for lookahead yet; the caller must
1766 adjust SRC's 'middle' pointer to do so. */
1768 lex_source_get_pp (struct lex_source *src)
1771 if (lex_source_try_get_pp (src))
/* Tries to move tokens from SRC's 'pp' stage into its 'merge' stage, fetching
   tokens into 'pp' as needed.  When macro expansion is enabled, the tokens at
   the head of 'pp' are offered to the macro expander; if they form a macro
   call, the tokens of the expansion are appended to 'merge' in place of the
   call.  The final return reports whether the expansion produced any
   tokens. */
1777 lex_source_try_get_merge (const struct lex_source *src_)
1779 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1781 if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
/* With macro expansion turned off, everything in 'pp' passes straight
   through to 'merge'. */
1784 if (!settings_get_mexpand ())
1786 lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1790 /* Now pass tokens one-by-one to the macro expander.
1792 In the common case where there is no macro to expand, the loop is not
1794 struct macro_call *mc;
1795 int n_call = macro_call_create (src->lexer->macros,
1796 &lex_stage_first (&src->pp)->token, &mc);
1797 for (int ofs = 1; !n_call; ofs++)
1799 if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1801 /* This should not be reachable because we always get a T_ENDCMD at
1802 the end of an input file (transformed from T_STOP by
1803 lex_source_try_get_pp()) and the macro_expander should always
1804 terminate expansion on T_ENDCMD. */
/* Offer the token at offset OFS in 'pp' to the macro call, together with
   its original syntax from SRC's buffer and its location. */
1808 const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1809 size_t start = t->token_pos;
1810 size_t end = t->token_pos + t->token_len;
1811 const struct macro_token mt = {
1813 .syntax = ss_buffer (&src->buffer[start], end - start),
1815 const struct msg_location loc = lex_token_location (src, t, t);
1816 n_call = macro_call_add (mc, &mt, &loc);
1820 /* False alarm: no macro expansion after all. Use first token as
1821 lookahead. We'll retry macro expansion from the second token next
1823 macro_call_destroy (mc);
1824 lex_stage_shift (&src->merge, &src->pp, 1);
1828 /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1829 are a macro call. (These are likely to be the only tokens in 'pp'.)
1831 const struct lex_token *c0 = lex_stage_first (&src->pp);
1832 const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1833 struct macro_tokens expansion = { .n = 0 };
1834 struct msg_location loc = lex_token_location (src, c0, c1);
1835 macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1836 macro_call_destroy (mc);
1838 /* Convert the macro expansion into syntax for possible error messages
1840 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1841 size_t *len = xnmalloc (expansion.n, sizeof *len);
1842 struct string s = DS_EMPTY_INITIALIZER;
1843 macro_tokens_to_syntax (&expansion, &s, ofs, len);
1845 if (settings_get_mprint ())
1846 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1847 _("Macro Expansion")));
1849 /* Append the macro expansion tokens to the lookahead. */
1850 if (expansion.n > 0)
/* MACRO_REP takes over the expansion's text from S.  The reference count
   starts at the number of tokens in the expansion.
   NOTE(review): presumably every expanded token shares MACRO_REP and
   REF_CNT -- confirm. */
1852 char *macro_rep = ds_steal_cstr (&s);
1853 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1854 *ref_cnt = expansion.n;
1855 for (size_t i = 0; i < expansion.n; i++)
1857 struct lex_token *token = xmalloc (sizeof *token);
1858 *token = (struct lex_token) {
1859 .token = expansion.mts[i].token,
/* Each expanded token records the source span of the whole macro call,
   from the start of C0 to the end of C1. */
1860 .token_pos = c0->token_pos,
1861 .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1862 .first_line = c0->first_line,
1863 .macro_rep = macro_rep,
1868 lex_stage_push_last (&src->merge, token);
1870 ss_dealloc (&expansion.mts[i].syntax);
1875 free (expansion.mts);
1879 /* Destroy the tokens for the call. */
1880 for (size_t i = 0; i < n_call; i++)
1881 lex_stage_pop_first (&src->pp);
/* True only if the expansion contributed at least one token to 'merge'. */
1883 return expansion.n > 0;
1886 /* Attempts to obtain at least one new token into 'merge' in SRC.
1888 Returns true if successful, false on failure. In the latter case, SRC is
1889 exhausted and 'src->eof' is now true. */
1891 lex_source_get_merge (struct lex_source *src)
1894 if (lex_source_try_get_merge (src))
1899 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1901 Returns true if successful, false on failure. In the latter case, SRC is
1902 exhausted and 'src->eof' is now true. */
1904 lex_source_get_lookahead (struct lex_source *src)
1906 struct merger m = MERGER_INIT;
/* Feed the tokens in 'merge' to the merger one at a time, starting from the
   first, fetching more into 'merge' whenever token I is not yet there. */
1908 for (size_t i = 0; ; i++)
1910 while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1912 /* We always get a T_ENDCMD at the end of an input file
1913 (transformed from T_STOP by lex_source_try_get_pp()) and
1914 merger_add() should never return -1 on T_ENDCMD. */
1915 assert (lex_stage_is_empty (&src->merge));
1919 int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1923 lex_stage_shift (&src->lookahead, &src->merge, 1);
/* A positive RETVAL is the number of tokens at the head of 'merge' that are
   replaced by a single merged token. */
1926 else if (retval > 0)
1928 /* Add a token that merges all the tokens together. */
1929 const struct lex_token *first = lex_stage_first (&src->merge);
1930 const struct lex_token *last = lex_stage_nth (&src->merge,
/* MACRO is true only if FIRST came from a macro expansion and LAST came
   from the same one. */
1932 bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1933 struct lex_token *t = xmalloc (sizeof *t);
1934 *t = (struct lex_token) {
/* The merged token's source span runs from the start of FIRST to the end
   of LAST. */
1936 .token_pos = first->token_pos,
1937 .token_len = (last->token_pos - first->token_pos) + last->token_len,
1938 .first_line = first->first_line,
1940 /* This works well if all the tokens were not expanded from macros,
1941 or if they came from the same macro expansion. It just gives up
1942 in the other (corner) cases. */
1943 .macro_rep = macro ? first->macro_rep : NULL,
1944 .ofs = macro ? first->ofs : 0,
1945 .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1946 .ref_cnt = macro ? first->ref_cnt : NULL,
1950 lex_stage_push_last (&src->lookahead, t);
/* Remove the RETVAL original tokens that the merged token replaces. */
1952 for (int i = 0; i < retval; i++)
1953 lex_stage_pop_first (&src->merge);
1960 lex_source_push_endcmd__ (struct lex_source *src)
1962 assert (lex_stage_is_empty (&src->lookahead));
1963 struct lex_token *token = xmalloc (sizeof *token);
1964 *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1965 lex_stage_push_last (&src->lookahead, token);
1968 static struct lex_source *
1969 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1971 struct lex_source *src = xmalloc (sizeof *src);
1972 *src = (struct lex_source) {
1974 .segmenter = segmenter_init (reader->syntax, false),
1978 lex_source_push_endcmd__ (src);
1984 lex_source_destroy (struct lex_source *src)
1986 char *file_name = src->reader->file_name;
1987 char *encoding = src->reader->encoding;
1988 if (src->reader->class->destroy != NULL)
1989 src->reader->class->destroy (src->reader);
1993 lex_stage_uninit (&src->pp);
1994 lex_stage_uninit (&src->merge);
1995 lex_stage_uninit (&src->lookahead);
1996 ll_remove (&src->ll);
2000 struct lex_file_reader
2002 struct lex_reader reader;
2003 struct u8_istream *istream;
2006 static struct lex_reader_class lex_file_reader_class;
2008 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2009 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
2010 ENCODING, which should take one of the forms accepted by
2011 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
2012 mode of the new reader, respectively.
2014 Returns a null pointer if FILE_NAME cannot be opened. */
2016 lex_reader_for_file (const char *file_name, const char *encoding,
2017 enum segmenter_mode syntax,
2018 enum lex_error_mode error)
2020 struct lex_file_reader *r;
2021 struct u8_istream *istream;
2023 istream = (!strcmp(file_name, "-")
2024 ? u8_istream_for_fd (encoding, STDIN_FILENO)
2025 : u8_istream_for_file (encoding, file_name, O_RDONLY));
2026 if (istream == NULL)
2028 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2032 r = xmalloc (sizeof *r);
2033 lex_reader_init (&r->reader, &lex_file_reader_class);
2034 r->reader.syntax = syntax;
2035 r->reader.error = error;
2036 r->reader.file_name = xstrdup (file_name);
2037 r->reader.encoding = xstrdup_if_nonnull (encoding);
2038 r->reader.line_number = 1;
2039 r->istream = istream;
2044 static struct lex_file_reader *
2045 lex_file_reader_cast (struct lex_reader *r)
2047 return UP_CAST (r, struct lex_file_reader, reader);
2051 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2052 enum prompt_style prompt_style UNUSED)
2054 struct lex_file_reader *r = lex_file_reader_cast (r_);
2055 ssize_t n_read = u8_istream_read (r->istream, buf, n);
2058 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2065 lex_file_close (struct lex_reader *r_)
2067 struct lex_file_reader *r = lex_file_reader_cast (r_);
2069 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2071 if (u8_istream_close (r->istream) != 0)
2072 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2075 u8_istream_free (r->istream);
2080 static struct lex_reader_class lex_file_reader_class =
2086 struct lex_string_reader
2088 struct lex_reader reader;
2093 static struct lex_reader_class lex_string_reader_class;
2095 /* Creates and returns a new lex_reader for the contents of S, which must be
2096 encoded in the given ENCODING. The new reader takes ownership of S and will free it
2097 with ss_dealloc() when it is closed. */
2099 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2101 struct lex_string_reader *r;
2103 r = xmalloc (sizeof *r);
2104 lex_reader_init (&r->reader, &lex_string_reader_class);
2105 r->reader.syntax = SEG_MODE_AUTO;
2106 r->reader.encoding = xstrdup_if_nonnull (encoding);
2113 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2114 which must be encoded in ENCODING. The caller retains ownership of S. */
2116 lex_reader_for_string (const char *s, const char *encoding)
2118 struct substring ss;
2119 ss_alloc_substring (&ss, ss_cstr (s));
2120 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string, using the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   result.  The result must be encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes over TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2138 static struct lex_string_reader *
2139 lex_string_reader_cast (struct lex_reader *r)
2141 return UP_CAST (r, struct lex_string_reader, reader);
2145 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2146 enum prompt_style prompt_style UNUSED)
2148 struct lex_string_reader *r = lex_string_reader_cast (r_);
2151 chunk = MIN (n, r->s.length - r->offset);
2152 memcpy (buf, r->s.string + r->offset, chunk);
2159 lex_string_close (struct lex_reader *r_)
2161 struct lex_string_reader *r = lex_string_reader_cast (r_);
2167 static struct lex_reader_class lex_string_reader_class =