1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/output-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* Location of token in terms of the lex_source's buffer.
65 src->tail <= line_pos <= token_pos <= src->head. */
66 size_t token_pos; /* Start of token. */
67 size_t token_len; /* Length of source for token in bytes. */
68 size_t line_pos; /* Start of line containing token_pos. */
69 int first_line; /* Line number at token_pos. */
72 /* A source of tokens, corresponding to a syntax file.
74 This is conceptually a lex_reader wrapped with everything needed to convert
75 its UTF-8 bytes into tokens. */
78 struct ll ll; /* In lexer's list of sources. */
79 struct lex_reader *reader;
80 struct segmenter segmenter;
81 bool eof; /* True if T_STOP was read from 'reader'. */
83 /* Buffer of UTF-8 bytes. */
85 size_t allocated; /* Number of bytes allocated. */
86 size_t tail; /* &buffer[0] offset into UTF-8 source. */
87 size_t head; /* &buffer[head - tail] offset into source. */
89 /* Positions in source file, tail <= pos <= head for each member here. */
90 size_t journal_pos; /* First byte not yet output to journal. */
91 size_t seg_pos; /* First byte not yet scanned as token. */
92 size_t line_pos; /* First byte of line containing seg_pos. */
94 int n_newlines; /* Number of new-lines up to seg_pos. */
95 bool suppress_next_newline;
98 struct deque deque; /* Indexes into 'tokens'. */
99 struct lex_token *tokens; /* Lookahead tokens for parser. */
102 static struct lex_source *lex_source_create (struct lex_reader *);
103 static void lex_source_destroy (struct lex_source *);
108 struct ll_list sources; /* Contains "struct lex_source"s. */
111 static struct lex_source *lex_source__ (const struct lexer *);
112 static struct substring lex_source_get_syntax__ (const struct lex_source *,
114 static const struct lex_token *lex_next__ (const struct lexer *, int n);
115 static void lex_source_push_endcmd__ (struct lex_source *);
117 static void lex_source_pop__ (struct lex_source *);
118 static bool lex_source_get__ (const struct lex_source *);
119 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
120 const char *format, va_list)
121 PRINTF_FORMAT (4, 0);
122 static const struct lex_token *lex_source_next__ (const struct lex_source *,
125 /* Initializes READER with the specified CLASS and otherwise some reasonable
126 defaults. The caller should fill in the other members as desired. */
128 lex_reader_init (struct lex_reader *reader,
129 const struct lex_reader_class *class)
131 reader->class = class;
132 reader->syntax = SEG_MODE_AUTO;
133 reader->error = LEX_ERROR_CONTINUE;
134 reader->file_name = NULL;
135 reader->encoding = NULL;
136 reader->line_number = 0;
140 /* Frees any file name already in READER and replaces it by a copy of
141 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
143 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
145 free (reader->file_name);
146 reader->file_name = xstrdup_if_nonnull (file_name);
149 /* Creates and returns a new lexer. */
153 struct lexer *lexer = xzalloc (sizeof *lexer);
154 ll_init (&lexer->sources);
158 /* Destroys LEXER. */
160 lex_destroy (struct lexer *lexer)
164 struct lex_source *source, *next;
166 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
167 lex_source_destroy (source);
172 /* Inserts READER into LEXER so that the next token read by LEXER comes from
173 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
176 lex_include (struct lexer *lexer, struct lex_reader *reader)
178 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
179 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
182 /* Appends READER to LEXER, so that it will be read after all other current
183 readers have already been read. */
185 lex_append (struct lexer *lexer, struct lex_reader *reader)
187 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
/* Claims a new slot at the front of SRC's token deque, first growing
   SRC->tokens with deque_expand() if the deque is full. The new slot's token
   is initialized to type T_STOP.

   NOTE(review): the statement that hands the slot back to the caller is not
   visible in this excerpt; presumably the new slot is returned -- confirm. */
192 static struct lex_token *
193 lex_push_token__ (struct lex_source *src)
195 struct lex_token *token;
197 if (deque_is_full (&src->deque))
198 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
200 token = &src->tokens[deque_push_front (&src->deque)];
201 token->token = (struct token) { .type = T_STOP };
/* Removes the token at the back of SRC's deque and releases its contents with
   token_uninit(). The back of the deque is the current token:
   lex_source_next__() fetches the token N after the current one with
   deque_back (..., N). */
206 lex_source_pop__ (struct lex_source *src)
208 token_uninit (&src->tokens[deque_pop_back (&src->deque)].token);
/* Removes the token at the front of SRC's deque and releases its contents with
   token_uninit(). The front is the end at which lex_push_token__() adds
   tokens, so this drops the most recently pushed token. */
212 lex_source_pop_front (struct lex_source *src)
214 token_uninit (&src->tokens[deque_pop_front (&src->deque)].token);
217 /* Advances LEXER to the next token, consuming the current token. */
219 lex_get (struct lexer *lexer)
221 struct lex_source *src;
223 src = lex_source__ (lexer);
227 if (!deque_is_empty (&src->deque))
228 lex_source_pop__ (src);
230 while (deque_is_empty (&src->deque))
231 if (!lex_source_get__ (src))
233 lex_source_destroy (src);
234 src = lex_source__ (lexer);
240 /* Issuing errors. */
242 /* Prints a syntax error message containing the current token and the message
243 given by FORMAT and its arguments (if FORMAT is non-null). */
245 lex_error (struct lexer *lexer, const char *format, ...)
249 va_start (args, format);
250 lex_next_error_valist (lexer, 0, 0, format, args);
254 /* Same as lex_error(), except that the arguments for FORMAT are passed as the
255 va_list ARGS. FORMAT may be null. */
257 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
259 lex_next_error_valist (lexer, 0, 0, format, args);
262 /* Prints a syntax error message for the tokens N0 through N1 ahead of the
263 current one, with the message given by FORMAT and its arguments (if non-null). */
265 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
269 va_start (args, format);
270 lex_next_error_valist (lexer, n0, n1, format, args);
274 /* Prints a syntax error message saying that one of the strings provided as
275 varargs, up to the first NULL, is expected. */
277 (lex_error_expecting) (struct lexer *lexer, ...)
281 va_start (args, lexer);
282 lex_error_expecting_valist (lexer, args);
286 /* Prints a syntax error message saying that one of the options provided in
287 ARGS, up to the first NULL, is expected. */
289 lex_error_expecting_valist (struct lexer *lexer, va_list args)
291 enum { MAX_OPTIONS = 9 };
292 const char *options[MAX_OPTIONS];
294 while (n < MAX_OPTIONS)
296 const char *option = va_arg (args, const char *);
300 options[n++] = option;
302 lex_error_expecting_array (lexer, options, n);
/* Prints a syntax error message saying that one of the N strings in OPTIONS
   is expected. A separate translatable message is provided for each count
   from one through eight options.

   NOTE(review): the lines that select among the messages are not visible in
   this excerpt. The two plain lex_error (lexer, NULL) calls presumably handle
   N == 0 and N greater than eight -- confirm. */
306 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
311 lex_error (lexer, NULL);
315 lex_error (lexer, _("expecting %s"), options[0]);
319 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
323 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
328 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
329 options[0], options[1], options[2], options[3]);
333 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
334 options[0], options[1], options[2], options[3], options[4]);
338 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
339 options[0], options[1], options[2], options[3], options[4],
344 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
345 options[0], options[1], options[2], options[3], options[4],
346 options[5], options[6]);
350 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
351 options[0], options[1], options[2], options[3], options[4],
352 options[5], options[6], options[7]);
356 lex_error (lexer, NULL);
360 /* Reports an error to the effect that subcommand SBC may only be specified
363 This function does not take a lexer as an argument or use lex_error(),
364 because the result would ordinarily just be redundant: "Syntax error at
365 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
366 not help the user find the error. */
368 lex_sbc_only_once (const char *sbc)
370 msg (SE, _("Subcommand %s may only be specified once."), sbc);
373 /* Reports an error to the effect that subcommand SBC is missing.
375 This function does not take a lexer as an argument or use lex_error(),
376 because a missing subcommand can normally be detected only after the whole
377 command has been parsed, and so lex_error() would always report "Syntax
378 error at end of command", which does not help the user find the error. */
380 lex_sbc_missing (const char *sbc)
382 msg (SE, _("Required subcommand %s was not specified."), sbc);
385 /* Reports an error to the effect that specification SPEC may only be specified
386 once within subcommand SBC. */
388 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
390 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
394 /* Reports an error to the effect that specification SPEC is missing within
397 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
399 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
403 /* Prints a syntax error message for the tokens N0 through N1 ahead of the
404 current one, with the message given by FORMAT and ARGS (if FORMAT is non-null). */
406 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
407 const char *format, va_list args)
409 struct lex_source *src = lex_source__ (lexer);
412 lex_source_error_valist (src, n0, n1, format, args);
418 ds_put_format (&s, _("Syntax error at end of input"));
421 ds_put_cstr (&s, ": ");
422 ds_put_vformat (&s, format, args);
424 ds_put_byte (&s, '.');
425 msg (SE, "%s", ds_cstr (&s));
430 /* Checks that we're at end of command.
431 If so, returns a successful command completion code.
432 If not, flags a syntax error and returns an error command
435 lex_end_of_command (struct lexer *lexer)
437 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
439 lex_error (lexer, _("expecting end of command"));
446 /* Token testing functions. */
448 /* Returns true if the current token is a number. */
450 lex_is_number (const struct lexer *lexer)
452 return lex_next_is_number (lexer, 0);
455 /* Returns true if the current token is a string. */
457 lex_is_string (const struct lexer *lexer)
459 return lex_next_is_string (lexer, 0);
462 /* Returns the value of the current token, which must be a
463 floating point number. */
465 lex_number (const struct lexer *lexer)
467 return lex_next_number (lexer, 0);
470 /* Returns true iff the current token is an integer. */
472 lex_is_integer (const struct lexer *lexer)
474 return lex_next_is_integer (lexer, 0);
477 /* Returns the value of the current token, which must be an
480 lex_integer (const struct lexer *lexer)
482 return lex_next_integer (lexer, 0);
485 /* Token testing functions with lookahead.
487 A value of 0 for N as an argument to any of these functions refers to the
488 current token. Lookahead is limited to the current command. Any N greater
489 than the number of tokens remaining in the current command will be treated
490 as referring to a T_ENDCMD token. */
492 /* Returns true if the token N ahead of the current token is a number. */
494 lex_next_is_number (const struct lexer *lexer, int n)
496 return token_is_number (lex_next (lexer, n));
499 /* Returns true if the token N ahead of the current token is a string. */
501 lex_next_is_string (const struct lexer *lexer, int n)
503 return token_is_string (lex_next (lexer, n));
506 /* Returns the value of the token N ahead of the current token, which must be a
507 floating point number. */
509 lex_next_number (const struct lexer *lexer, int n)
511 return token_number (lex_next (lexer, n));
514 /* Returns true if the token N ahead of the current token is an integer. */
516 lex_next_is_integer (const struct lexer *lexer, int n)
518 return token_is_integer (lex_next (lexer, n));
521 /* Returns the value of the token N ahead of the current token, which must be
524 lex_next_integer (const struct lexer *lexer, int n)
526 return token_integer (lex_next (lexer, n));
529 /* Token matching functions. */
531 /* If the current token has the specified TYPE, skips it and returns true.
532 Otherwise, returns false. */
534 lex_match (struct lexer *lexer, enum token_type type)
536 if (lex_token (lexer) == type)
545 /* If the current token matches IDENTIFIER, skips it and returns true.
546 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
549 IDENTIFIER must be an ASCII string. */
551 lex_match_id (struct lexer *lexer, const char *identifier)
553 return lex_match_id_n (lexer, identifier, 3);
556 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
557 may be abbreviated to its first N letters. Otherwise, returns false.
559 IDENTIFIER must be an ASCII string. */
561 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
563 if (lex_token (lexer) == T_ID
564 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
573 /* If the current token is integer X, skips it and returns true. Otherwise,
576 lex_match_int (struct lexer *lexer, int x)
578 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
587 /* Forced matches. */
589 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
590 abbreviated to its first 3 letters. Otherwise, reports an error and returns
593 IDENTIFIER must be an ASCII string. */
595 lex_force_match_id (struct lexer *lexer, const char *identifier)
597 if (lex_match_id (lexer, identifier))
601 lex_error_expecting (lexer, identifier);
606 /* If the current token has the specified TYPE, skips it and returns true.
607 Otherwise, reports an error and returns false. */
609 lex_force_match (struct lexer *lexer, enum token_type type)
611 if (lex_token (lexer) == type)
618 const char *type_string = token_type_to_string (type);
621 char *s = xasprintf ("`%s'", type_string);
622 lex_error_expecting (lexer, s);
626 lex_error_expecting (lexer, token_type_to_name (type));
632 /* If the current token is a string, does nothing and returns true.
633 Otherwise, reports an error and returns false. */
635 lex_force_string (struct lexer *lexer)
637 if (lex_is_string (lexer))
641 lex_error (lexer, _("expecting string"));
646 /* If the current token is a string or an identifier, does nothing and returns
647 true. Otherwise, reports an error and returns false.
649 This is meant for use in syntactic situations where we want to encourage the
650 user to supply a quoted string, but for compatibility we also accept
651 identifiers. (One example of such a situation is file names.) Therefore,
652 the error message issued when the current token is wrong only says that a
653 string is expected and doesn't mention that an identifier would also be
656 lex_force_string_or_id (struct lexer *lexer)
658 return lex_token (lexer) == T_ID || lex_force_string (lexer);
661 /* If the current token is an integer, does nothing and returns true.
662 Otherwise, reports an error and returns false. */
664 lex_force_int (struct lexer *lexer)
666 if (lex_is_integer (lexer))
670 lex_error (lexer, _("expecting integer"));
675 /* If the current token is an integer in the range MIN...MAX (inclusive), does
676 nothing and returns true. Otherwise, reports an error and returns false.
677 If NAME is nonnull, then it is used in the error message. */
679 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
681 bool is_integer = lex_is_integer (lexer);
682 bool too_small = is_integer && lex_integer (lexer) < min;
683 bool too_big = is_integer && lex_integer (lexer) > max;
684 if (is_integer && !too_small && !too_big)
689 /* Weird, maybe a bug in the caller. Just report that we needed an
692 lex_error (lexer, _("Integer expected for %s."), name);
694 lex_error (lexer, _("Integer expected."));
699 lex_error (lexer, _("Expected %ld for %s."), min, name);
701 lex_error (lexer, _("Expected %ld."), min);
703 else if (min + 1 == max)
706 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
708 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
712 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
713 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
715 if (report_lower_bound && report_upper_bound)
719 _("Expected integer between %ld and %ld for %s."),
722 lex_error (lexer, _("Expected integer between %ld and %ld."),
725 else if (report_lower_bound)
730 lex_error (lexer, _("Expected non-negative integer for %s."),
733 lex_error (lexer, _("Expected non-negative integer."));
738 lex_error (lexer, _("Expected positive integer for %s."),
741 lex_error (lexer, _("Expected positive integer."));
744 else if (report_upper_bound)
748 _("Expected integer less than or equal to %ld for %s."),
751 lex_error (lexer, _("Expected integer less than or equal to %ld."),
757 lex_error (lexer, _("Integer expected for %s."), name);
759 lex_error (lexer, _("Integer expected."));
765 /* If the current token is a number, does nothing and returns true.
766 Otherwise, reports an error and returns false. */
768 lex_force_num (struct lexer *lexer)
770 if (lex_is_number (lexer))
773 lex_error (lexer, _("expecting number"));
777 /* If the current token is an identifier, does nothing and returns true.
778 Otherwise, reports an error and returns false. */
780 lex_force_id (struct lexer *lexer)
782 if (lex_token (lexer) == T_ID)
785 lex_error (lexer, _("expecting identifier"));
789 /* Token accessors. */
791 /* Returns the type of LEXER's current token. */
793 lex_token (const struct lexer *lexer)
795 return lex_next_token (lexer, 0);
798 /* Returns the number in LEXER's current token.
800 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
801 tokens this function will always return zero. */
803 lex_tokval (const struct lexer *lexer)
805 return lex_next_tokval (lexer, 0);
808 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
810 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
811 this function will always return NULL.
813 The UTF-8 encoding of the returned string is correct for variable names and
814 other identifiers. Use filename_to_utf8() to use it as a filename. Use
815 data_in() to use it in a "union value". */
817 lex_tokcstr (const struct lexer *lexer)
819 return lex_next_tokcstr (lexer, 0);
822 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
823 null-terminated (but the null terminator is not included in the returned
824 substring's 'length').
826 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
827 this function will always return NULL.
829 The UTF-8 encoding of the returned string is correct for variable names and
830 other identifiers. Use filename_to_utf8() to use it as a filename. Use
831 data_in() to use it in a "union value". */
833 lex_tokss (const struct lexer *lexer)
835 return lex_next_tokss (lexer, 0);
840 A value of 0 for N as an argument to any of these functions refers to the
841 current token. Lookahead is limited to the current command. Any N greater
842 than the number of tokens remaining in the current command will be treated
843 as referring to a T_ENDCMD token. */
845 static const struct lex_token *
846 lex_next__ (const struct lexer *lexer_, int n)
848 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
849 struct lex_source *src = lex_source__ (lexer);
852 return lex_source_next__ (src, n);
855 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
860 static const struct lex_token *
861 lex_source_next__ (const struct lex_source *src, int n)
863 while (deque_count (&src->deque) <= n)
865 if (!deque_is_empty (&src->deque))
867 struct lex_token *front;
869 front = &src->tokens[deque_front (&src->deque, 0)];
870 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
874 lex_source_get__ (src);
877 return &src->tokens[deque_back (&src->deque, n)];
880 /* Returns the "struct token" of the token N after the current one in LEXER.
881 The returned pointer can be invalidated by pretty much any succeeding call
882 into the lexer, although the string pointer within the returned token is
883 only invalidated by consuming the token (e.g. with lex_get()). */
885 lex_next (const struct lexer *lexer, int n)
887 return &lex_next__ (lexer, n)->token;
890 /* Returns the type of the token N after the current one in LEXER. */
892 lex_next_token (const struct lexer *lexer, int n)
894 return lex_next (lexer, n)->type;
897 /* Returns the number in the token N after the current one in LEXER.
899 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
900 tokens this function will always return zero. */
902 lex_next_tokval (const struct lexer *lexer, int n)
904 return token_number (lex_next (lexer, n));
907 /* Returns the null-terminated string in the token N after the current one, in
910 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
911 this function will always return NULL.
913 The UTF-8 encoding of the returned string is correct for variable names and
914 other identifiers. Use filename_to_utf8() to use it as a filename. Use
915 data_in() to use it in a "union value". */
917 lex_next_tokcstr (const struct lexer *lexer, int n)
919 return lex_next_tokss (lexer, n).string;
922 /* Returns the string in the token N after the current one, in UTF-8 encoding.
923 The string is null-terminated (but the null terminator is not included in
924 the returned substring's 'length').
926 Only T_ID, T_MACRO_ID, and T_STRING tokens have meaningful strings. For other
927 tokens this function will always return NULL.
929 The UTF-8 encoding of the returned string is correct for variable names and
930 other identifiers. Use filename_to_utf8() to use it as a filename. Use
931 data_in() to use it in a "union value". */
933 lex_next_tokss (const struct lexer *lexer, int n)
935 return lex_next (lexer, n)->string;
938 /* Returns the text of the syntax in tokens N0 ahead of the current one,
939 through N1 ahead of the current one, inclusive. (For example, if N0 and N1
940 are both zero, this requests the syntax for the current token.) The caller
941 must not modify or free the returned string. The syntax is encoded in UTF-8
942 and in the original form supplied to the lexer so that, for example, it may
943 include comments, spaces, and new-lines if it spans multiple tokens. */
945 lex_next_representation (const struct lexer *lexer, int n0, int n1)
947 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
/* Compares ACTUAL, a token taken from the lexer, against EXPECTED, a token
   scanned from a phrase (see lex_match_phrase()). The two must have the same
   type. Beyond that, the comparison depends on the type: numbers are compared
   for equality, identifiers with lex_id_match() (EXPECTED passed first, ACTUAL
   second), and strings by length and then byte for byte.

   NOTE(review): the case labels and the result for any remaining token types
   are not visible in this excerpt -- confirm against the full source. */
951 lex_tokens_match (const struct token *actual, const struct token *expected)
953 if (actual->type != expected->type)
956 switch (actual->type)
960 return actual->number == expected->number;
963 return lex_id_match (expected->string, actual->string);
966 return (actual->string.length == expected->string.length
967 && !memcmp (actual->string.string, expected->string.string,
968 actual->string.length));
975 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
976 skips it and returns true. Otherwise, returns false.
978 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
979 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
980 first three letters. */
982 lex_match_phrase (struct lexer *lexer, const char *s)
984 struct string_lexer slex;
989 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
990 while (string_lexer_next (&slex, &token))
991 if (token.type != SCAN_SKIP)
993 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
994 token_uninit (&token);
/* Returns the 'first_line' member recorded for the token N after the current
   one in SRC. */
1005 lex_source_get_first_line_number (const struct lex_source *src, int n)
1007 return lex_source_next__ (src, n)->first_line;
/* Scans the LENGTH bytes starting at S for new-line characters with memchr(),
   resuming the search just past each one found.

   NOTE(review): the counter and the return statement are not visible in this
   excerpt; judging by the name and by the caller in
   lex_source_get_last_line_number(), the result is presumably the number of
   new-lines found -- confirm. */
1011 count_newlines (char *s, size_t length)
1016 while ((newline = memchr (s, '\n', length)) != NULL)
1019 length -= (newline + 1) - s;
/* For the token N after the current one in SRC, returns the token's first
   line, plus the count_newlines() result for the token's syntax in SRC's
   buffer, plus 1.

   A token whose first_line is 0 is handled separately. NOTE(review): that
   branch's statement is not visible in this excerpt; presumably it returns 0,
   as documented for lex_get_last_line_number() -- confirm. */
1027 lex_source_get_last_line_number (const struct lex_source *src, int n)
1029 const struct lex_token *token = lex_source_next__ (src, n);
1031 if (token->first_line == 0)
1035 char *token_str = &src->buffer[token->token_pos - src->tail];
1036 return token->first_line + count_newlines (token_str, token->token_len) + 1;
/* Walks the LENGTH bytes of UTF-8 text at S_ one character at a time, decoding
   each with u8_mbtouc() and looking up its display width with uc_width().

   NOTE(review): several lines are not visible in this excerpt. One branch
   rounds COLUMNS + 1 up to a multiple of 8, presumably for tab characters.
   How the widths are accumulated and what is finally returned cannot be seen
   here -- confirm against the full source. */
1041 count_columns (const char *s_, size_t length)
1043 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1049 for (ofs = 0; ofs < length; ofs += mblen)
1053 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1056 int width = uc_width (uc, "UTF-8");
1061 columns = ROUND_UP (columns + 1, 8);
/* For the token N after the current one in SRC, returns count_columns()
   applied to the text between the start of the token's line (line_pos) and
   the start of the token (token_pos). */
1068 lex_source_get_first_column (const struct lex_source *src, int n)
1070 const struct lex_token *token = lex_source_next__ (src, n);
1071 return count_columns (&src->buffer[token->line_pos - src->tail],
1072 token->token_pos - token->line_pos);
/* For the token N after the current one in SRC, returns count_columns()
   applied to the text that precedes the end of the token on the line where
   the token ends.

   START initially points at the beginning of the line containing the start of
   the token and END just past the token's last byte. If that span contains a
   new-line (the token extends over more than one line), START is moved to
   just after the last new-line, so only the final line is measured. */
1076 lex_source_get_last_column (const struct lex_source *src, int n)
1078 const struct lex_token *token = lex_source_next__ (src, n);
1079 char *start, *end, *newline;
1081 start = &src->buffer[token->line_pos - src->tail];
1082 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1083 newline = memrchr (start, '\n', end - start);
1084 if (newline != NULL)
1085 start = newline + 1;
1086 return count_columns (start, end - start);
1089 /* Returns the 1-based line number of the start of the syntax that represents
1090 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1091 if the token is drawn from a source that does not have line numbers. */
1093 lex_get_first_line_number (const struct lexer *lexer, int n)
1095 const struct lex_source *src = lex_source__ (lexer);
1096 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1099 /* Returns the 1-based line number of the end of the syntax that represents the
1100 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1101 token or if the token is drawn from a source that does not have line
1104 Most of the time, a single token is wholly within a single line of syntax,
1105 but there are two exceptions: a T_STRING token can be made up of multiple
1106 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1107 token can consist of a "-" on one line followed by the number on the next.
1110 lex_get_last_line_number (const struct lexer *lexer, int n)
1112 const struct lex_source *src = lex_source__ (lexer);
1113 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1116 /* Returns the 1-based column number of the start of the syntax that represents
1117 the token N after the current one in LEXER. Returns 0 for a T_STOP
1120 Column numbers are measured according to the width of characters as shown in
1121 a typical fixed-width font, in which CJK characters have width 2 and
1122 combining characters have width 0. */
1124 lex_get_first_column (const struct lexer *lexer, int n)
1126 const struct lex_source *src = lex_source__ (lexer);
1127 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1130 /* Returns the 1-based column number of the end of the syntax that represents
1131 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1134 Column numbers are measured according to the width of characters as shown in
1135 a typical fixed-width font, in which CJK characters have width 2 and
1136 combining characters have width 0. */
1138 lex_get_last_column (const struct lexer *lexer, int n)
1140 const struct lex_source *src = lex_source__ (lexer);
1141 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1144 /* Returns the name of the syntax file from which the current command is drawn.
1145 Returns NULL for a T_STOP token or if the command's source does not have
1148 There is no version of this function that takes an N argument because
1149 lookahead only works to the end of a command and any given command is always
1150 within a single syntax file. */
1152 lex_get_file_name (const struct lexer *lexer)
1154 struct lex_source *src = lex_source__ (lexer);
1155 return src == NULL ? NULL : src->reader->file_name;
1158 /* Returns a newly allocated msg_location for the syntax that represents tokens
1159 with 0-based offsets N0...N1, inclusive, from the current token. The caller
1160 must eventually free the location (with msg_location_destroy()). */
1161 struct msg_location *
1162 lex_get_location (const struct lexer *lexer, int n0, int n1)
1164 struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1165 loc->first_column = lex_get_first_column (lexer, n0);
1166 loc->last_column = lex_get_last_column (lexer, n1);
1170 /* Returns a newly allocated msg_location for the syntax that represents tokens
1171 with 0-based offsets N0...N1, inclusive, from the current token. The
1172 location only covers the tokens' lines, not the columns. The caller must
1173 eventually free the location (with msg_location_destroy()). */
1174 struct msg_location *
1175 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1177 struct msg_location *loc = xmalloc (sizeof *loc);
1178 *loc = (struct msg_location) {
1179 .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1180 .first_line = lex_get_first_line_number (lexer, n0),
1181 .last_line = lex_get_last_line_number (lexer, n1),
/* Returns the 'encoding' member of the reader behind the source that
   lex_source__() reports for LEXER, or NULL if there is no such source. */
1187 lex_get_encoding (const struct lexer *lexer)
1189 struct lex_source *src = lex_source__ (lexer);
1190 return src == NULL ? NULL : src->reader->encoding;
1194 /* Returns the syntax mode for the syntax file from which the current command is
1195 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1196 does not have line numbers.
1198 There is no version of this function that takes an N argument because
1199 lookahead only works to the end of a command and any given command is always
1200 within a single syntax file. */
1202 lex_get_syntax_mode (const struct lexer *lexer)
1204 struct lex_source *src = lex_source__ (lexer);
1205 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1208 /* Returns the error mode for the syntax file from which the current command is
1209 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1210 source does not have line numbers.
1212 There is no version of this function that takes an N argument because
1213 lookahead only works to the end of a command and any given command is always
1214 within a single syntax file. */
1216 lex_get_error_mode (const struct lexer *lexer)
1218 struct lex_source *src = lex_source__ (lexer);
1219 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1222 /* If the source that LEXER is currently reading has error mode
1223 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1224 token to be read comes directly from whatever is next read from the stream.
1226 It makes sense to call this function after encountering an error in a
1227 command entered on the console, because usually the user would prefer not to
1228 have cascading errors. */
1230 lex_interactive_reset (struct lexer *lexer)
1232 struct lex_source *src = lex_source__ (lexer);
/* Nothing is reset unless a source exists and its reader's error mode is
   LEX_ERROR_TERMINAL. */
1233 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Zero every buffer position and the newline count. */
1235 src->head = src->tail = 0;
1236 src->journal_pos = src->seg_pos = src->line_pos = 0;
1237 src->n_newlines = 0;
1238 src->suppress_next_newline = false;
/* Restart the segmenter, keeping the mode it already had. */
1239 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Pop every queued token, then queue a single T_ENDCMD token. */
1241 while (!deque_is_empty (&src->deque))
1242 lex_source_pop__ (src);
1243 lex_source_push_endcmd__ (src);
1247 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1249 lex_discard_rest_of_command (struct lexer *lexer)
/* Keep going until the current token is either T_STOP or T_ENDCMD. */
1251 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1255 /* Discards all lookahead tokens in LEXER, then discards all input sources
1256 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1257 runs out of input sources. */
1259 lex_discard_noninteractive (struct lexer *lexer)
1261 struct lex_source *src = lex_source__ (lexer);
/* First empty the current source's token deque. */
1265 while (!deque_is_empty (&src->deque))
1266 lex_source_pop__ (src);
/* Then destroy sources one at a time, re-fetching the current source after
   each, until one has error mode LEX_ERROR_TERMINAL or none remain. */
1268 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1269 src = lex_source__ (lexer))
1270 lex_source_destroy (src);
/* Computes max_tail as the smallest of SRC's journal_pos, SRC's line_pos,
   and the line_pos of the token at deque_back (&src->deque, 0).
   lex_source_expand__() uses the result as the furthest position to which
   it may advance the tail. */
1275 lex_source_max_tail__ (const struct lex_source *src)
1277 const struct lex_token *token;
1280 assert (src->seg_pos >= src->line_pos);
1281 max_tail = MIN (src->journal_pos, src->line_pos);
1283 /* Use the oldest token also. (We know that src->deque cannot be empty
1284 because we are in the process of adding a new token, which is already
1285 initialized enough to use here.) */
1286 token = &src->tokens[deque_back (&src->deque, 0)];
1287 assert (token->token_pos >= token->line_pos);
1288 max_tail = MIN (max_tail, token->line_pos);
/* Makes room in SRC's buffer when the buffered span (head - tail) has
   reached the allocated size, either by advancing the tail or by growing
   the allocation. */
1294 lex_source_expand__ (struct lex_source *src)
1296 if (src->head - src->tail >= src->allocated)
1298 size_t max_tail = lex_source_max_tail__ (src);
1299 if (max_tail > src->tail)
1301 /* Advance the tail, freeing up room at the head. */
/* buffer[0] corresponds to position src->tail, so the bytes to keep start
   at offset max_tail - src->tail and number head - max_tail.  memmove() is
   used because source and destination can overlap. */
1302 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1303 src->head - max_tail);
1304 src->tail = max_tail;
1308 /* Buffer is completely full. Expand it. */
1309 src->buffer = x2realloc (src->buffer, &src->allocated);
1314 /* There's space available at the head of the buffer. Nothing to do. */
/* Pulls more input from SRC's reader into SRC's buffer.  The trailing loop
   condition repeats the work until the bytes from seg_pos up to head
   contain a newline. */
1319 lex_source_read__ (struct lex_source *src)
1323 lex_source_expand__ (src);
/* head_ofs is the buffer index that corresponds to src->head; space is what
   remains of the allocation past that index. */
1325 size_t head_ofs = src->head - src->tail;
1326 size_t space = src->allocated - head_ofs;
1327 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1328 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1330 assert (n <= space);
/* NOTE(review): the condition guarding the end-of-input handling below is
   not visible in this listing; presumably it is taken when the reader
   returned no data -- confirm. */
1335 src->reader->eof = true;
1336 lex_source_expand__ (src);
1342 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1343 src->head - src->seg_pos));
/* Returns the lex_source at the head of LEXER's list of sources, or NULL if
   that list is empty. */
1346 static struct lex_source *
1347 lex_source__ (const struct lexer *lexer)
1349 return (ll_is_empty (&lexer->sources) ? NULL
1350 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns the part of SRC's buffer that runs from the first byte of TOKEN0
   through the last byte of TOKEN1, as a substring that points into the
   buffer (nothing is copied here). */
1353 static struct substring
1354 lex_tokens_get_syntax__ (const struct lex_source *src,
1355 const struct lex_token *token0,
1356 const struct lex_token *token1)
1358 size_t start = token0->token_pos;
1359 size_t end = token1->token_pos + token1->token_len;
/* Token positions are relative to the same origin as src->tail, so subtract
   the tail to obtain a buffer index. */
1361 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Returns the syntax in SRC covered by the tokens that lex_source_next__()
   yields for N0 and N1. */
1364 static struct substring
1365 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1367 return lex_tokens_get_syntax__ (src,
1368 lex_source_next__ (src, n0),
/* MAX keeps the end token from being earlier than the start token when
   N1 < N0. */
1369 lex_source_next__ (src, MAX (n0, n1)));
/* Copies a prefix of IN into OUT, which has room for OUT_SIZE bytes, as a
   null-terminated string.  "..." is appended when fewer than IN.length bytes
   were copied.  OUT_SIZE must be at least 16. */
1373 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1379 assert (out_size >= 16);
/* Reserve one byte for the null terminator. */
1380 out_maxlen = out_size - 1;
/* The 3 matches the length of the "..." suffix appended below. */
1381 if (in.length > out_maxlen - 3)
/* Walk IN one UTF-8 character at a time; out_len counts the bytes accepted
   so far. */
1384 for (out_len = 0; out_len < in.length; out_len += mblen)
/* Line ends (LF, or CR immediately followed by LF) and null bytes are
   tested for here. */
1386 if (in.string[out_len] == '\n'
1387 || in.string[out_len] == '\0'
1388 || (in.string[out_len] == '\r'
1389 && out_len + 1 < in.length
1390 && in.string[out_len + 1] == '\n'))
1393 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1394 in.length - out_len);
/* Tests whether the next whole character would overflow the space
   available. */
1399 if (out_len + mblen > out_maxlen)
1403 memcpy (out, in.string, out_len);
/* Terminates OUT, with an ellipsis if part of IN was left out. */
1404 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Builds a syntax error message for tokens N0 through N1 of SRC.  The text
   starts with a "Syntax error" prefix and, when FORMAT is used, continues
   with FORMAT formatted with ARGS.  The message carries category
   MSG_C_SYNTAX, severity MSG_S_ERROR, and a location derived from SRC. */
1408 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1409 const char *format, va_list args)
1411 const struct lex_token *token;
/* Choose the prefix based on the token at N0. */
1416 token = lex_source_next__ (src, n0);
1417 if (token->token.type == T_ENDCMD)
1418 ds_put_cstr (&s, _("Syntax error at end of command"));
1421 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1422 if (!ss_is_empty (syntax))
/* Quote the offending syntax, shortened by lex_ellipsize__() to fit this
   64-byte buffer. */
1424 char syntax_cstr[64];
1426 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1427 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1430 ds_put_cstr (&s, _("Syntax error"));
/* Append the caller's message after a ": " separator. */
1435 ds_put_cstr (&s, ": ");
1436 ds_put_vformat (&s, format, args);
/* Make sure the text ends with a period. */
1438 if (ds_last (&s) != '.')
1439 ds_put_byte (&s, '.');
/* Location: file name from the reader, lines and columns from the first
   position of token N0 through the last position of token N1. */
1441 struct msg_location *location = xmalloc (sizeof *location);
1442 *location = (struct msg_location) {
1443 .file_name = xstrdup_if_nonnull (src->reader->file_name),
1444 .first_line = lex_source_get_first_line_number (src, n0),
1445 .last_line = lex_source_get_last_line_number (src, n1),
1446 .first_column = lex_source_get_first_column (src, n0),
1447 .last_column = lex_source_get_last_column (src, n1),
1449 struct msg *m = xmalloc (sizeof *m);
1451 .category = MSG_C_SYNTAX,
1452 .severity = MSG_S_ERROR,
1453 .location = location,
/* The message takes the accumulated string out of S. */
1454 .text = ds_steal_cstr (&s),
/* Variadic front end for lex_source_error_valist(): collects the arguments
   that follow FORMAT into a va_list and forwards everything unchanged. */
1459 static void PRINTF_FORMAT (4, 5)
1460 lex_source_error (struct lex_source *src, int n0, int n1,
1461 const char *format, ...)
1464 va_start (args, format);
1465 lex_source_error_valist (src, n0, n1, format, args);
/* Reports S as a syntax error in SRC and then calls lex_source_pop_front()
   on SRC. */
1470 lex_get_error (struct lex_source *src, const char *s)
/* n is the highest index currently valid in the deque; it is used as both
   ends of the error's token range. */
1472 int n = deque_count (&src->deque) - 1;
/* S is passed as an argument to "%s" so that it is never interpreted as a
   format string. */
1473 lex_source_error (src, n, n, "%s", s);
1474 lex_source_pop_front (src);
1477 /* Attempts to append an additional token into SRC's deque, reading more from
1478 the underlying lex_reader if necessary. Returns true if successful, false
1479 if the deque already represents (a suffix of) the whole lex_reader's contents. */
1482 lex_source_get__ (const struct lex_source *src_)
1484 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1488 /* State maintained while scanning tokens. Usually we only need a single
1489 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1490 needs to be saved and possibly restored later with SCAN_BACK. */
1493 struct segmenter segmenter;
1494 enum segment_type last_segment;
1495 int newlines; /* Number of newlines encountered so far. */
1496 /* Maintained here so we can update lex_source's similar members when we
1502 /* Initialize state. */
1503 struct state state =
1505 .segmenter = src->segmenter,
1507 .seg_pos = src->seg_pos,
1508 .line_pos = src->line_pos,
1510 struct state saved = state;
/* SAVED starts out as a copy of the initial state; the SCAN_SAVE and
   SCAN_BACK results tested further down are what use it. */
1512 /* Append a new token to SRC and initialize it. */
1513 struct lex_token *token = lex_push_token__ (src);
1514 struct scanner scanner;
1515 scanner_init (&scanner, &token->token);
1516 token->line_pos = src->line_pos;
1517 token->token_pos = src->seg_pos;
/* When the reader has a positive line number, the token's first line is that
   number plus the newlines SRC has consumed so far; the other assignment
   below sets it to 0. */
1518 if (src->reader->line_number > 0)
1519 token->first_line = src->reader->line_number + src->n_newlines;
1521 token->first_line = 0;
1523 /* Extract segments and pass them through the scanner until we obtain a
1527 /* Extract a segment. */
1528 const char *segment = &src->buffer[state.seg_pos - src->tail];
1529 size_t seg_maxlen = src->head - state.seg_pos;
1530 enum segment_type type;
1531 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1532 src->reader->eof, &type);
1535 /* The segmenter needs more input to produce a segment. */
1536 assert (!src->reader->eof);
1537 lex_source_read__ (src);
1541 /* Update state based on the segment. */
1542 state.last_segment = type;
1543 state.seg_pos += seg_len;
1544 if (type == SEG_NEWLINE)
1547 state.line_pos = state.seg_pos;
1550 /* Pass the segment into the scanner and try to get a token out. */
1551 enum scan_result result = scanner_push (&scanner, type,
1552 ss_buffer (segment, seg_len),
1554 if (result == SCAN_SAVE)
1556 else if (result == SCAN_BACK)
1561 else if (result == SCAN_DONE)
1565 /* If we've reached the end of a line, or the end of a command, then pass
1566 the line to the output engine as a syntax text item. */
1567 int n_lines = state.newlines;
1568 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1571 src->suppress_next_newline = true;
1573 else if (n_lines > 0 && src->suppress_next_newline)
1576 src->suppress_next_newline = false;
1578 for (int i = 0; i < n_lines; i++)
1580 /* Beginning of line. */
1581 const char *line = &src->buffer[src->journal_pos - src->tail];
1583 /* Calculate line length, including \n or \r\n end-of-line if present.
1585 We use src->head even though that may be beyond what we've actually
1586 converted to tokens (which is only through state.line_pos). That's
1587 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1588 take the whole line through the newline, not just through the '.'. */
1589 size_t max_len = src->head - src->journal_pos;
1590 const char *newline = memchr (line, '\n', max_len);
1591 size_t line_len = newline ? newline - line + 1 : max_len;
1593 /* Calculate line length excluding end-of-line. */
1594 size_t copy_len = line_len;
1595 if (copy_len > 0 && line[copy_len - 1] == '\n')
1597 if (copy_len > 0 && line[copy_len - 1] == '\r')
1600 /* Submit the line as syntax. */
1601 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1602 xmemdup0 (line, copy_len),
/* journal_pos moves past the whole line, end-of-line bytes included. */
1605 src->journal_pos += line_len;
/* The token's length is the number of bytes the scan advanced beyond SRC's
   previous seg_pos. */
1608 token->token_len = state.seg_pos - src->seg_pos;
/* Commit the local scanning state back into SRC. */
1610 src->segmenter = state.segmenter;
1611 src->seg_pos = state.seg_pos;
1612 src->line_pos = state.line_pos;
1613 src->n_newlines += state.newlines;
1615 switch (token->token.type)
1621 token->token.type = T_ENDCMD;
/* Scanner error token types: turn the token into an error message and hand
   it to lex_get_error(). */
1625 case SCAN_BAD_HEX_LENGTH:
1626 case SCAN_BAD_HEX_DIGIT:
1627 case SCAN_BAD_UNICODE_DIGIT:
1628 case SCAN_BAD_UNICODE_LENGTH:
1629 case SCAN_BAD_UNICODE_CODE_POINT:
1630 case SCAN_EXPECTED_QUOTE:
1631 case SCAN_EXPECTED_EXPONENT:
1632 case SCAN_UNEXPECTED_CHAR:
1633 char *msg = scan_token_to_error (&token->token);
1634 lex_get_error (src, msg);
1639 lex_source_pop_front (src);
/* Appends a token of type T_ENDCMD to SRC with a zero length and all of its
   position and line fields set to 0. */
1647 lex_source_push_endcmd__ (struct lex_source *src)
1649 struct lex_token *token = lex_push_token__ (src);
1650 token->token.type = T_ENDCMD;
1651 token->token_pos = 0;
1652 token->token_len = 0;
1653 token->line_pos = 0;
1654 token->first_line = 0;
/* Allocates and initializes a new lex_source for READER. */
1657 static struct lex_source *
1658 lex_source_create (struct lex_reader *reader)
1660 struct lex_source *src = xmalloc (sizeof *src);
1661 *src = (struct lex_source) {
/* The segmenter starts in READER's syntax mode. */
1663 .segmenter = segmenter_init (reader->syntax, false),
/* The token array comes from deque_init() with an initial capacity argument
   of 4. */
1664 .tokens = deque_init (&src->deque, 4, sizeof *src->tokens),
/* A new source begins with one T_ENDCMD token already queued. */
1667 lex_source_push_endcmd__ (src);
/* Tears down SRC: destroys its reader through the reader class's destroy
   hook (if one is set), pops all queued tokens, and unlinks SRC from the
   list it is on. */
1673 lex_source_destroy (struct lex_source *src)
/* Fetch these pointers before the reader is destroyed.
   NOTE(review): presumably they are freed afterward; the statements that
   would do so are not visible in this listing. */
1675 char *file_name = src->reader->file_name;
1676 char *encoding = src->reader->encoding;
1677 if (src->reader->class->destroy != NULL)
1678 src->reader->class->destroy (src->reader);
1682 while (!deque_is_empty (&src->deque))
1683 lex_source_pop__ (src);
1685 ll_remove (&src->ll);
/* A lex_reader that reads from a u8_istream. */
1689 struct lex_file_reader
/* Embedded base reader; lex_file_reader_cast() maps a pointer to this member
   back to the enclosing struct. */
1691 struct lex_reader reader;
/* Stream that lex_file_read() reads from. */
1692 struct u8_istream *istream;
/* Declared ahead of its definition so that lex_reader_for_file() can refer
   to it. */
1695 static struct lex_reader_class lex_file_reader_class;
1697 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1698 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1699 ENCODING, which should take one of the forms accepted by
1700 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1701 mode of the new reader, respectively.
1703 Returns a null pointer if FILE_NAME cannot be opened. */
1705 lex_reader_for_file (const char *file_name, const char *encoding,
1706 enum segmenter_mode syntax,
1707 enum lex_error_mode error)
1709 struct lex_file_reader *r;
1710 struct u8_istream *istream;
/* "-" selects standard input by file descriptor; any other name is opened
   read-only as a file. */
1712 istream = (!strcmp(file_name, "-")
1713 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1714 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1715 if (istream == NULL)
/* Report the failure along with the system's description of errno. */
1717 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1721 r = xmalloc (sizeof *r);
1722 lex_reader_init (&r->reader, &lex_file_reader_class);
1723 r->reader.syntax = syntax;
1724 r->reader.error = error;
/* The reader keeps its own copies of the name and (if non-null) encoding. */
1725 r->reader.file_name = xstrdup (file_name);
1726 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* Line numbering for a file starts at 1. */
1727 r->reader.line_number = 1;
1728 r->istream = istream;
/* Converts R, which must point to the `reader' member of a
   struct lex_file_reader, into a pointer to that enclosing struct. */
1733 static struct lex_file_reader *
1734 lex_file_reader_cast (struct lex_reader *r)
1736 return UP_CAST (r, struct lex_file_reader, reader);
/* Read function for file readers: asks the stream behind R_ for up to N
   bytes into BUF.  The prompt style is not used. */
1740 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1741 enum prompt_style prompt_style UNUSED)
1743 struct lex_file_reader *r = lex_file_reader_cast (r_);
1744 ssize_t n_read = u8_istream_read (r->istream, buf, n);
/* NOTE(review): the test that guards this error report is not visible in
   this listing; presumably it fires when n_read is negative -- confirm. */
1747 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Close function for file readers: releases the stream behind R_. */
1754 lex_file_close (struct lex_reader *r_)
1756 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* A stream on any descriptor other than standard input is closed, and a
   failed close is reported. */
1758 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1760 if (u8_istream_close (r->istream) != 0)
1761 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
/* NOTE(review): presumably the alternative branch, so that a stream on
   standard input is freed without closing the descriptor -- confirm. */
1764 u8_istream_free (r->istream);
/* Definition of the reader class used by lex_reader_for_file().
   NOTE(review): the initializer's members are not visible in this listing;
   presumably they are lex_file_read and lex_file_close -- confirm. */
1769 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that reads from an in-memory string.
   NOTE(review): lex_string_read() also uses members named `s' and `offset';
   their declarations are not visible in this listing. */
1775 struct lex_string_reader
/* Embedded base reader; lex_string_reader_cast() maps a pointer to this
   member back to the enclosing struct. */
1777 struct lex_reader reader;
/* Declared ahead of its definition so that
   lex_reader_for_substring_nocopy() can refer to it. */
1782 static struct lex_reader_class lex_string_reader_class;
1784 /* Creates and returns a new lex_reader for the contents of S, which must be
1785 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1786 with ss_dealloc() when it is closed. */
1788 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1790 struct lex_string_reader *r;
1792 r = xmalloc (sizeof *r);
1793 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always use SEG_MODE_AUTO syntax. */
1794 r->reader.syntax = SEG_MODE_AUTO;
/* The reader keeps its own copy of ENCODING when it is non-null. */
1795 r->reader.encoding = xstrdup_if_nonnull (encoding);
1802 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1803 which must be encoded in ENCODING. The caller retains ownership of S. */
1805 lex_reader_for_string (const char *s, const char *encoding)
/* Make a copy of S in SS, then hand that copy to the no-copy constructor,
   which takes ownership of it. */
1807 struct substring ss;
1808 ss_alloc_substring (&ss, ss_cstr (s));
1809 return lex_reader_for_substring_nocopy (ss, encoding);
1812 /* Formats FORMAT as a printf()-like format string and creates and returns a
1813 new lex_reader for the formatted result. */
1815 lex_reader_for_format (const char *format, const char *encoding, ...)
1817 struct lex_reader *r;
/* The variable arguments follow ENCODING, the last named parameter, even
   though FORMAT comes first. */
1820 va_start (args, encoding);
/* The string produced by xvasprintf() is passed straight to the no-copy
   constructor, which takes ownership of it. */
1821 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Converts R, which must point to the `reader' member of a
   struct lex_string_reader, into a pointer to that enclosing struct. */
1827 static struct lex_string_reader *
1828 lex_string_reader_cast (struct lex_reader *r)
1830 return UP_CAST (r, struct lex_string_reader, reader);
/* Read function for string readers: copies bytes of the reader's string,
   starting at its current offset, into BUF.  The prompt style is not
   used. */
1834 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1835 enum prompt_style prompt_style UNUSED)
1837 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Copy no more than N bytes and no more than what remains of the string
   past the offset. */
1840 chunk = MIN (n, r->s.length - r->offset);
1841 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string readers.
   NOTE(review): the statements that release the reader's resources are not
   visible in this listing. */
1848 lex_string_close (struct lex_reader *r_)
1850 struct lex_string_reader *r = lex_string_reader_cast (r_);
1856 static struct lex_reader_class lex_string_reader_class =