1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/macro.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* Location of token in terms of the lex_source's buffer.
66 src->tail <= line_pos <= token_pos <= src->head. */
67 size_t token_pos; /* Start of token. */
68 size_t token_len; /* Length of source for token in bytes. */
69 size_t line_pos; /* Start of line containing token_pos. */
70 int first_line; /* Line number at token_pos. */
73 /* A source of tokens, corresponding to a syntax file.
75 This is conceptually a lex_reader wrapped with everything needed to convert
76 its UTF-8 bytes into tokens. */
79 struct ll ll; /* In lexer's list of sources. */
80 struct lex_reader *reader;
82 struct segmenter segmenter;
83 bool eof; /* True if T_STOP was read from 'reader'. */
85 /* Buffer of UTF-8 bytes. */
87 size_t allocated; /* Number of bytes allocated. */
88 size_t tail; /* &buffer[0] offset into UTF-8 source. */
89 size_t head; /* &buffer[head - tail] offset into source. */
91 /* Positions in source file, tail <= pos <= head for each member here. */
92 size_t journal_pos; /* First byte not yet output to journal. */
93 size_t seg_pos; /* First byte not yet scanned as token. */
94 size_t line_pos; /* First byte of line containing seg_pos. */
96 int n_newlines; /* Number of new-lines up to seg_pos. */
97 bool suppress_next_newline;
100 struct deque deque; /* Indexes into 'tokens'. */
101 struct lex_token *tokens; /* Lookahead tokens for parser. */
104 static struct lex_source *lex_source_create (struct lex_reader *);
105 static void lex_source_destroy (struct lex_source *);
110 struct ll_list sources; /* Contains "struct lex_source"s. */
113 static struct lex_source *lex_source__ (const struct lexer *);
114 static const struct lex_token *lex_next__ (const struct lexer *, int n);
115 static void lex_source_push_endcmd__ (struct lex_source *);
117 static void lex_source_pop__ (struct lex_source *);
118 static bool lex_source_get (const struct lex_source *);
119 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
120 const char *format, va_list)
121 PRINTF_FORMAT (4, 0);
122 static const struct lex_token *lex_source_next__ (const struct lex_source *,
125 /* Initializes READER with the specified CLASS and otherwise some reasonable
126 defaults. The caller should fill in the others members as desired. */
128 lex_reader_init (struct lex_reader *reader,
129 const struct lex_reader_class *class)
131 reader->class = class;
132 reader->syntax = SEG_MODE_AUTO;
133 reader->error = LEX_ERROR_CONTINUE;
134 reader->file_name = NULL;
135 reader->encoding = NULL;
136 reader->line_number = 0;
140 /* Frees any file name already in READER and replaces it by a copy of
141 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
143 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
145 free (reader->file_name);
146 reader->file_name = xstrdup_if_nonnull (file_name);
149 /* Creates and returns a new lexer. */
153 struct lexer *lexer = xzalloc (sizeof *lexer);
154 ll_init (&lexer->sources);
158 /* Destroys LEXER. */
160 lex_destroy (struct lexer *lexer)
164 struct lex_source *source, *next;
166 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
167 lex_source_destroy (source);
172 /* Inserts READER into LEXER so that the next token read by LEXER comes from
173 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
176 lex_include (struct lexer *lexer, struct lex_reader *reader)
178 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
179 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
182 /* Appends READER to LEXER, so that it will be read after all other current
183 readers have already been read. */
185 lex_append (struct lexer *lexer, struct lex_reader *reader)
187 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
192 static struct lex_token *
193 lex_push_token__ (struct lex_source *src)
195 struct lex_token *token;
197 if (deque_is_full (&src->deque))
198 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
200 token = &src->tokens[deque_push_front (&src->deque)];
201 token->token = (struct token) { .type = T_STOP };
206 lex_source_pop__ (struct lex_source *src)
208 token_uninit (&src->tokens[deque_pop_back (&src->deque)].token);
212 lex_source_pop_front (struct lex_source *src)
214 token_uninit (&src->tokens[deque_pop_front (&src->deque)].token);
217 /* Advances LEXER to the next token, consuming the current token. */
219 lex_get (struct lexer *lexer)
221 struct lex_source *src;
223 src = lex_source__ (lexer);
227 if (!deque_is_empty (&src->deque))
228 lex_source_pop__ (src);
230 while (deque_is_empty (&src->deque))
231 if (!lex_source_get (src))
233 lex_source_destroy (src);
234 src = lex_source__ (lexer);
/* Issuing errors. */

/* Prints a syntax error message containing the current token and
   given message MESSAGE (if non-null). */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, 0, 0, format, args);
  va_end (args);
}

/* Prints a syntax error message containing the current token and
   given message MESSAGE (if non-null). */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  lex_next_error_valist (lexer, 0, 0, format, args);
}

/* Prints a syntax error message containing the tokens N0 through N1 after the
   current one and given message MESSAGE (if non-null). */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, n0, n1, format, args);
  va_end (args);
}

/* Prints a syntax error message saying that one of the strings provided as
   varargs, up to the first NULL, is expected. */
void
(lex_error_expecting) (struct lexer *lexer, ...)
{
  va_list args;

  va_start (args, lexer);
  lex_error_expecting_valist (lexer, args);
  va_end (args);
}
/* Prints a syntax error message saying that one of the options provided in
   ARGS, up to the first NULL, is expected. */
void
lex_error_expecting_valist (struct lexer *lexer, va_list args)
{
  enum { MAX_OPTIONS = 9 };
  const char *options[MAX_OPTIONS];
  size_t n = 0;

  /* Collect up to MAX_OPTIONS strings, stopping at the NULL terminator. */
  while (n < MAX_OPTIONS)
    {
      const char *option = va_arg (args, const char *);
      if (!option)
        break;

      options[n++] = option;
    }
  lex_error_expecting_array (lexer, options, n);
}
/* Prints a syntax error message saying that one of the N strings in OPTIONS
   is expected.  With more than 8 options (or none) the generic message is
   used because listing them all would be unreadable. */
void
lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
{
  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      break;

    case 1:
      lex_error (lexer, _("expecting %s"), options[0]);
      break;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
      break;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
                 options[2]);
      break;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3]);
      break;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4]);
      break;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5]);
      break;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6]);
      break;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6], options[7]);
      break;

    default:
      lex_error (lexer, NULL);
    }
}
360 /* Reports an error to the effect that subcommand SBC may only be specified
363 This function does not take a lexer as an argument or use lex_error(),
364 because the result would ordinarily just be redundant: "Syntax error at
365 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
366 not help the user find the error. */
368 lex_sbc_only_once (const char *sbc)
370 msg (SE, _("Subcommand %s may only be specified once."), sbc);
373 /* Reports an error to the effect that subcommand SBC is missing.
375 This function does not take a lexer as an argument or use lex_error(),
376 because a missing subcommand can normally be detected only after the whole
377 command has been parsed, and so lex_error() would always report "Syntax
378 error at end of command", which does not help the user find the error. */
380 lex_sbc_missing (const char *sbc)
382 msg (SE, _("Required subcommand %s was not specified."), sbc);
385 /* Reports an error to the effect that specification SPEC may only be specified
386 once within subcommand SBC. */
388 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
390 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
394 /* Reports an error to the effect that specification SPEC is missing within
397 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
399 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
403 /* Prints a syntax error message containing the current token and
404 given message MESSAGE (if non-null). */
406 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
407 const char *format, va_list args)
409 struct lex_source *src = lex_source__ (lexer);
412 lex_source_error_valist (src, n0, n1, format, args);
418 ds_put_format (&s, _("Syntax error at end of input"));
421 ds_put_cstr (&s, ": ");
422 ds_put_vformat (&s, format, args);
424 ds_put_byte (&s, '.');
425 msg (SE, "%s", ds_cstr (&s));
430 /* Checks that we're at end of command.
431 If so, returns a successful command completion code.
432 If not, flags a syntax error and returns an error command
435 lex_end_of_command (struct lexer *lexer)
437 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
439 lex_error (lexer, _("expecting end of command"));
/* Token testing functions. */

/* Returns true if the current token is a number. */
bool
lex_is_number (const struct lexer *lexer)
{
  return lex_next_is_number (lexer, 0);
}

/* Returns true if the current token is a string. */
bool
lex_is_string (const struct lexer *lexer)
{
  return lex_next_is_string (lexer, 0);
}

/* Returns the value of the current token, which must be a
   floating point number. */
double
lex_number (const struct lexer *lexer)
{
  return lex_next_number (lexer, 0);
}

/* Returns true iff the current token is an integer. */
bool
lex_is_integer (const struct lexer *lexer)
{
  return lex_next_is_integer (lexer, 0);
}

/* Returns the value of the current token, which must be an
   integer. */
long
lex_integer (const struct lexer *lexer)
{
  return lex_next_integer (lexer, 0);
}
485 /* Token testing functions with lookahead.
487 A value of 0 for N as an argument to any of these functions refers to the
488 current token. Lookahead is limited to the current command. Any N greater
489 than the number of tokens remaining in the current command will be treated
490 as referring to a T_ENDCMD token. */
492 /* Returns true if the token N ahead of the current token is a number. */
494 lex_next_is_number (const struct lexer *lexer, int n)
496 enum token_type next_token = lex_next_token (lexer, n);
497 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
500 /* Returns true if the token N ahead of the current token is a string. */
502 lex_next_is_string (const struct lexer *lexer, int n)
504 return lex_next_token (lexer, n) == T_STRING;
507 /* Returns the value of the token N ahead of the current token, which must be a
508 floating point number. */
510 lex_next_number (const struct lexer *lexer, int n)
512 assert (lex_next_is_number (lexer, n));
513 return lex_next_tokval (lexer, n);
516 /* Returns true if the token N ahead of the current token is an integer. */
518 lex_next_is_integer (const struct lexer *lexer, int n)
522 if (!lex_next_is_number (lexer, n))
525 value = lex_next_tokval (lexer, n);
526 return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
529 /* Returns the value of the token N ahead of the current token, which must be
532 lex_next_integer (const struct lexer *lexer, int n)
534 assert (lex_next_is_integer (lexer, n));
535 return lex_next_tokval (lexer, n);
538 /* Token matching functions. */
540 /* If the current token has the specified TYPE, skips it and returns true.
541 Otherwise, returns false. */
543 lex_match (struct lexer *lexer, enum token_type type)
545 if (lex_token (lexer) == type)
554 /* If the current token matches IDENTIFIER, skips it and returns true.
555 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
558 IDENTIFIER must be an ASCII string. */
560 lex_match_id (struct lexer *lexer, const char *identifier)
562 return lex_match_id_n (lexer, identifier, 3);
565 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
566 may be abbreviated to its first N letters. Otherwise, returns false.
568 IDENTIFIER must be an ASCII string. */
570 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
572 if (lex_token (lexer) == T_ID
573 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
582 /* If the current token is integer X, skips it and returns true. Otherwise,
585 lex_match_int (struct lexer *lexer, int x)
587 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
596 /* Forced matches. */
598 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
599 abbreviated to its first 3 letters. Otherwise, reports an error and returns
602 IDENTIFIER must be an ASCII string. */
604 lex_force_match_id (struct lexer *lexer, const char *identifier)
606 if (lex_match_id (lexer, identifier))
610 lex_error_expecting (lexer, identifier);
615 /* If the current token has the specified TYPE, skips it and returns true.
616 Otherwise, reports an error and returns false. */
618 lex_force_match (struct lexer *lexer, enum token_type type)
620 if (lex_token (lexer) == type)
627 const char *type_string = token_type_to_string (type);
630 char *s = xasprintf ("`%s'", type_string);
631 lex_error_expecting (lexer, s);
635 lex_error_expecting (lexer, token_type_to_name (type));
641 /* If the current token is a string, does nothing and returns true.
642 Otherwise, reports an error and returns false. */
644 lex_force_string (struct lexer *lexer)
646 if (lex_is_string (lexer))
650 lex_error (lexer, _("expecting string"));
655 /* If the current token is a string or an identifier, does nothing and returns
656 true. Otherwise, reports an error and returns false.
658 This is meant for use in syntactic situations where we want to encourage the
659 user to supply a quoted string, but for compatibility we also accept
660 identifiers. (One example of such a situation is file names.) Therefore,
661 the error message issued when the current token is wrong only says that a
662 string is expected and doesn't mention that an identifier would also be
665 lex_force_string_or_id (struct lexer *lexer)
667 return lex_token (lexer) == T_ID || lex_force_string (lexer);
670 /* If the current token is an integer, does nothing and returns true.
671 Otherwise, reports an error and returns false. */
673 lex_force_int (struct lexer *lexer)
675 if (lex_is_integer (lexer))
679 lex_error (lexer, _("expecting integer"));
/* If the current token is an integer in the range MIN...MAX (inclusive), does
   nothing and returns true.  Otherwise, reports an error and returns false.
   If NAME is nonnull, then it is used in the error message. */
bool
lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
{
  bool is_integer = lex_is_integer (lexer);
  bool too_small = is_integer && lex_integer (lexer) < min;
  bool too_big = is_integer && lex_integer (lexer) > max;
  if (is_integer && !too_small && !too_big)
    return true;

  if (min > max)
    {
      /* Weird, maybe a bug in the caller.  Just report that we needed an
         integer. */
      if (name)
        lex_error (lexer, _("Integer expected for %s."), name);
      else
        lex_error (lexer, _("Integer expected."));
    }
  else if (min == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld for %s."), min, name);
      else
        lex_error (lexer, _("Expected %ld."), min);
    }
  else if (min + 1 == max)
    {
      if (name)
        lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
      else
        lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
    }
  else
    {
      /* Only mention a bound if it is "interesting": either it was actually
         violated, or it is close enough to zero to be a real restriction. */
      bool report_lower_bound = (min > INT_MIN / 2) || too_small;
      bool report_upper_bound = (max < INT_MAX / 2) || too_big;

      if (report_lower_bound && report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer between %ld and %ld for %s."),
                       min, max, name);
          else
            lex_error (lexer, _("Expected integer between %ld and %ld."),
                       min, max);
        }
      else if (report_lower_bound)
        {
          if (min == 0)
            {
              if (name)
                lex_error (lexer, _("Expected non-negative integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected non-negative integer."));
            }
          else if (min == 1)
            {
              if (name)
                lex_error (lexer, _("Expected positive integer for %s."),
                           name);
              else
                lex_error (lexer, _("Expected positive integer."));
            }
        }
      else if (report_upper_bound)
        {
          if (name)
            lex_error (lexer,
                       _("Expected integer less than or equal to %ld for %s."),
                       max, name);
          else
            lex_error (lexer, _("Expected integer less than or equal to %ld."),
                       max);
        }
      else
        {
          if (name)
            lex_error (lexer, _("Integer expected for %s."), name);
          else
            lex_error (lexer, _("Integer expected."));
        }
    }
  return false;
}
774 /* If the current token is a number, does nothing and returns true.
775 Otherwise, reports an error and returns false. */
777 lex_force_num (struct lexer *lexer)
779 if (lex_is_number (lexer))
782 lex_error (lexer, _("expecting number"));
786 /* If the current token is an identifier, does nothing and returns true.
787 Otherwise, reports an error and returns false. */
789 lex_force_id (struct lexer *lexer)
791 if (lex_token (lexer) == T_ID)
794 lex_error (lexer, _("expecting identifier"));
798 /* Token accessors. */
800 /* Returns the type of LEXER's current token. */
802 lex_token (const struct lexer *lexer)
804 return lex_next_token (lexer, 0);
807 /* Returns the number in LEXER's current token.
809 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
810 tokens this function will always return zero. */
812 lex_tokval (const struct lexer *lexer)
814 return lex_next_tokval (lexer, 0);
817 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
819 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
820 this functions this function will always return NULL.
822 The UTF-8 encoding of the returned string is correct for variable names and
823 other identifiers. Use filename_to_utf8() to use it as a filename. Use
824 data_in() to use it in a "union value". */
826 lex_tokcstr (const struct lexer *lexer)
828 return lex_next_tokcstr (lexer, 0);
831 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
832 null-terminated (but the null terminator is not included in the returned
833 substring's 'length').
835 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
836 this functions this function will always return NULL.
838 The UTF-8 encoding of the returned string is correct for variable names and
839 other identifiers. Use filename_to_utf8() to use it as a filename. Use
840 data_in() to use it in a "union value". */
842 lex_tokss (const struct lexer *lexer)
844 return lex_next_tokss (lexer, 0);
849 A value of 0 for N as an argument to any of these functions refers to the
850 current token. Lookahead is limited to the current command. Any N greater
851 than the number of tokens remaining in the current command will be treated
852 as referring to a T_ENDCMD token. */
854 static const struct lex_token *
855 lex_next__ (const struct lexer *lexer_, int n)
857 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
858 struct lex_source *src = lex_source__ (lexer);
861 return lex_source_next__ (src, n);
864 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
869 static const struct token *
870 lex_source_front (const struct lex_source *src)
872 return &src->tokens[deque_front (&src->deque, 0)].token;
875 static const struct lex_token *
876 lex_source_next__ (const struct lex_source *src, int n)
878 while (deque_count (&src->deque) <= n)
880 if (!deque_is_empty (&src->deque))
882 const struct token *front = lex_source_front (src);
883 if (front->type == T_STOP || front->type == T_ENDCMD)
887 lex_source_get (src);
890 return &src->tokens[deque_back (&src->deque, n)];
893 /* Returns the "struct token" of the token N after the current one in LEXER.
894 The returned pointer can be invalidated by pretty much any succeeding call
895 into the lexer, although the string pointer within the returned token is
896 only invalidated by consuming the token (e.g. with lex_get()). */
898 lex_next (const struct lexer *lexer, int n)
900 return &lex_next__ (lexer, n)->token;
903 /* Returns the type of the token N after the current one in LEXER. */
905 lex_next_token (const struct lexer *lexer, int n)
907 return lex_next (lexer, n)->type;
910 /* Returns the number in the tokn N after the current one in LEXER.
912 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
913 tokens this function will always return zero. */
915 lex_next_tokval (const struct lexer *lexer, int n)
917 const struct token *token = lex_next (lexer, n);
918 return token->number;
921 /* Returns the null-terminated string in the token N after the current one, in
924 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
925 this functions this function will always return NULL.
927 The UTF-8 encoding of the returned string is correct for variable names and
928 other identifiers. Use filename_to_utf8() to use it as a filename. Use
929 data_in() to use it in a "union value". */
931 lex_next_tokcstr (const struct lexer *lexer, int n)
933 return lex_next_tokss (lexer, n).string;
936 /* Returns the string in the token N after the current one, in UTF-8 encoding.
937 The string is null-terminated (but the null terminator is not included in
938 the returned substring's 'length').
940 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
941 tokens this functions this function will always return NULL.
943 The UTF-8 encoding of the returned string is correct for variable names and
944 other identifiers. Use filename_to_utf8() to use it as a filename. Use
945 data_in() to use it in a "union value". */
947 lex_next_tokss (const struct lexer *lexer, int n)
949 return lex_next (lexer, n)->string;
953 lex_tokens_match (const struct token *actual, const struct token *expected)
955 if (actual->type != expected->type)
958 switch (actual->type)
962 return actual->number == expected->number;
965 return lex_id_match (expected->string, actual->string);
968 return (actual->string.length == expected->string.length
969 && !memcmp (actual->string.string, expected->string.string,
970 actual->string.length));
977 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
978 skips it and returns true. Otherwise, returns false.
980 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
981 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
982 first three letters. */
984 lex_match_phrase (struct lexer *lexer, const char *s)
986 struct string_lexer slex;
991 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
992 while (string_lexer_next (&slex, &token))
993 if (token.type != SCAN_SKIP)
995 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
996 token_uninit (&token);
1007 lex_source_get_first_line_number (const struct lex_source *src, int n)
1009 return lex_source_next__ (src, n)->first_line;
/* Returns the number of new-line characters in the LENGTH bytes starting
   at S. */
static int
count_newlines (char *s, size_t length)
{
  int n_newlines = 0;
  char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      n_newlines++;
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
1029 lex_source_get_last_line_number (const struct lex_source *src, int n)
1031 const struct lex_token *token = lex_source_next__ (src, n);
1033 if (token->first_line == 0)
1037 char *token_str = &src->buffer[token->token_pos - src->tail];
1038 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1043 count_columns (const char *s_, size_t length)
1045 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1051 for (ofs = 0; ofs < length; ofs += mblen)
1055 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1058 int width = uc_width (uc, "UTF-8");
1063 columns = ROUND_UP (columns + 1, 8);
1070 lex_source_get_first_column (const struct lex_source *src, int n)
1072 const struct lex_token *token = lex_source_next__ (src, n);
1073 return count_columns (&src->buffer[token->line_pos - src->tail],
1074 token->token_pos - token->line_pos);
1078 lex_source_get_last_column (const struct lex_source *src, int n)
1080 const struct lex_token *token = lex_source_next__ (src, n);
1081 char *start, *end, *newline;
1083 start = &src->buffer[token->line_pos - src->tail];
1084 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1085 newline = memrchr (start, '\n', end - start);
1086 if (newline != NULL)
1087 start = newline + 1;
1088 return count_columns (start, end - start);
/* Returns the 1-based line number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
   if the token is drawn from a source that does not have line numbers. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
}

/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if the token is drawn from a source that does not have line
   numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
}

/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP
   token.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_first_column (src, n) : 0;
}

/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_last_column (src, n) : 0;
}
1146 /* Returns the name of the syntax file from which the current command is drawn.
1147 Returns NULL for a T_STOP token or if the command's source does not have
1150 There is no version of this function that takes an N argument because
1151 lookahead only works to the end of a command and any given command is always
1152 within a single syntax file. */
1154 lex_get_file_name (const struct lexer *lexer)
1156 struct lex_source *src = lex_source__ (lexer);
1157 return src == NULL ? NULL : src->reader->file_name;
1161 lex_get_encoding (const struct lexer *lexer)
1163 struct lex_source *src = lex_source__ (lexer);
1164 return src == NULL ? NULL : src->reader->encoding;
1168 /* Returns the syntax mode for the syntax file from which the current drawn is
1169 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1170 does not have line numbers.
1172 There is no version of this function that takes an N argument because
1173 lookahead only works to the end of a command and any given command is always
1174 within a single syntax file. */
1176 lex_get_syntax_mode (const struct lexer *lexer)
1178 struct lex_source *src = lex_source__ (lexer);
1179 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1182 /* Returns the error mode for the syntax file from which the current drawn is
1183 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1184 source does not have line numbers.
1186 There is no version of this function that takes an N argument because
1187 lookahead only works to the end of a command and any given command is always
1188 within a single syntax file. */
1190 lex_get_error_mode (const struct lexer *lexer)
1192 struct lex_source *src = lex_source__ (lexer);
1193 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1196 /* If the source that LEXER is currently reading has error mode
1197 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1198 token to be read comes directly from whatever is next read from the stream.
1200 It makes sense to call this function after encountering an error in a
1201 command entered on the console, because usually the user would prefer not to
1202 have cascading errors. */
1204 lex_interactive_reset (struct lexer *lexer)
1206 struct lex_source *src = lex_source__ (lexer);
1207 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1209 src->head = src->tail = 0;
1210 src->journal_pos = src->seg_pos = src->line_pos = 0;
1211 src->n_newlines = 0;
1212 src->suppress_next_newline = false;
1213 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1214 while (!deque_is_empty (&src->deque))
1215 lex_source_pop__ (src);
1216 lex_source_push_endcmd__ (src);
1220 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1222 lex_discard_rest_of_command (struct lexer *lexer)
1224 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1228 /* Discards all lookahead tokens in LEXER, then discards all input sources
1229 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1230 runs out of input sources. */
1232 lex_discard_noninteractive (struct lexer *lexer)
1234 struct lex_source *src = lex_source__ (lexer);
1238 while (!deque_is_empty (&src->deque))
1239 lex_source_pop__ (src);
1241 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1242 src = lex_source__ (lexer))
1243 lex_source_destroy (src);
/* Computes the farthest position that SRC's buffer tail may advance to
   without discarding bytes still needed: the journal position, the current
   line, and the line containing the oldest buffered token all constrain it.
   NOTE(review): the return-type line, local "max_tail" declaration, braces,
   and the final return statement are missing from this extraction. */
1248 lex_source_max_tail__ (const struct lex_source *src)
1250 const struct lex_token *token;
/* Both the journal and the current line must stay inside the buffer. */
1253 assert (src->seg_pos >= src->line_pos);
1254 max_tail = MIN (src->journal_pos, src->line_pos);
1256 /* Use the oldest token also. (We know that src->deque cannot be empty
1257 because we are in the process of adding a new token, which is already
1258 initialized enough to use here.) */
1259 token = &src->tokens[deque_back (&src->deque, 0)];
1260 assert (token->token_pos >= token->line_pos);
1261 max_tail = MIN (max_tail, token->line_pos);
/* Ensures SRC's buffer has free space at its head: first by sliding
   still-needed bytes down to the start of the buffer (advancing the tail),
   and otherwise by growing the allocation with x2realloc(). */
1267 lex_source_expand__ (struct lex_source *src)
/* Only act when the in-use span fills the whole allocation. */
1269 if (src->head - src->tail >= src->allocated)
1271 size_t max_tail = lex_source_max_tail__ (src);
1272 if (max_tail > src->tail)
1274 /* Advance the tail, freeing up room at the head. */
1275 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1276 src->head - max_tail);
1277 src->tail = max_tail;
1281 /* Buffer is completely full. Expand it. */
1282 src->buffer = x2realloc (src->buffer, &src->allocated);
1287 /* There's space available at the head of the buffer. Nothing to do. */
/* Reads more input from SRC's underlying lex_reader until the not-yet-
   segmented part of the buffer contains a newline (or end of input).
   NOTE(review): the do { ... } framing, the short-read/EOF test, and the
   early-return on EOF are missing from this extraction. */
1292 lex_source_read__ (struct lex_source *src)
/* Make room before each read attempt. */
1296 lex_source_expand__ (src);
1298 size_t head_ofs = src->head - src->tail;
1299 size_t space = src->allocated - head_ofs;
/* The reader may use the segmenter's prompt style for interactive input. */
1300 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1301 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1303 assert (n <= space);
/* A short read marks end of input for this reader. */
1308 src->reader->eof = true;
1309 lex_source_expand__ (src);
/* Loop until a newline appears in the unsegmented region. */
1315 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1316 src->head - src->seg_pos));
/* Returns the source that LEXER is currently reading from (the head of its
   source list), or NULL if LEXER has no sources. */
1319 static struct lex_source *
1320 lex_source__ (const struct lexer *lexer)
1322 return (ll_is_empty (&lexer->sources) ? NULL
1323 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns the raw syntax text in SRC spanning lookahead tokens N0 through
   MAX(N0, N1), as a substring that aliases SRC's buffer (not a copy). */
1326 static struct substring
1327 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1329 const struct lex_token *token0 = lex_source_next__ (src, n0);
1330 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1331 size_t start = token0->token_pos;
1332 size_t end = token1->token_pos + token1->token_len;
/* token_pos is an absolute buffer offset; subtract the tail to index the
   in-memory buffer. */
1334 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Copies IN into OUT (which has OUT_SIZE bytes), stopping at the first
   new-line, null byte, or CR-LF pair, truncating on a UTF-8 character
   boundary, and appending "..." when IN was not copied in full.
   NOTE(review): local declarations (out_maxlen, out_len, mblen), braces, and
   some break statements are missing from this extraction. */
1338 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1344 assert (out_size >= 16);
1345 out_maxlen = out_size - 1;
/* Reserve room for the "..." suffix when IN cannot fit whole. */
1346 if (in.length > out_maxlen - 3)
1349 for (out_len = 0; out_len < in.length; out_len += mblen)
/* Stop at end of line or at a null byte. */
1351 if (in.string[out_len] == '\n'
1352 || in.string[out_len] == '\0'
1353 || (in.string[out_len] == '\r'
1354 && out_len + 1 < in.length
1355 && in.string[out_len + 1] == '\n'))
/* Measure the current UTF-8 character so truncation never splits one. */
1358 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1359 in.length - out_len);
1364 if (out_len + mblen > out_maxlen)
1368 memcpy (out, in.string, out_len);
/* Append "..." only if we stopped before consuming all of IN. */
1369 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Builds a syntax error message for lookahead tokens N0 through N1 in SRC,
   appends FORMAT expanded with ARGS, and emits the result as a syntax-
   category error annotated with the tokens' file/line/column range.
   NOTE(review): the ds_init/struct msg framing lines and the final emit call
   are missing from this extraction. */
1373 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1374 const char *format, va_list args)
1376 const struct lex_token *token;
1381 token = lex_source_next__ (src, n0);
1382 if (token->token.type == T_ENDCMD)
1383 ds_put_cstr (&s, _("Syntax error at end of command"));
/* Otherwise quote an ellipsized copy of the offending syntax, if any. */
1386 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1387 if (!ss_is_empty (syntax))
1389 char syntax_cstr[64];
1391 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1392 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1395 ds_put_cstr (&s, _("Syntax error"));
1400 ds_put_cstr (&s, ": ");
1401 ds_put_vformat (&s, format, args);
/* Ensure the message ends with a period. */
1403 if (ds_last (&s) != '.')
1404 ds_put_byte (&s, '.');
/* Designated initializers for the message that gets emitted; the text
   buffer's ownership transfers via ds_steal_cstr(). */
1407 .category = MSG_C_SYNTAX,
1408 .severity = MSG_S_ERROR,
1409 .file_name = src->reader->file_name,
1410 .first_line = lex_source_get_first_line_number (src, n0),
1411 .last_line = lex_source_get_last_line_number (src, n1),
1412 .first_column = lex_source_get_first_column (src, n0),
1413 .last_column = lex_source_get_last_column (src, n1),
1414 .text = ds_steal_cstr (&s),
/* Reports a syntax error, formatted printf()-style, against the newest token
   in SRC's deque, then pops that token so scanning can continue. */
1419 static void PRINTF_FORMAT (2, 3)
1420 lex_get_error (struct lex_source *src, const char *format, ...)
1425 va_start (args, format);
/* The newest token is the one farthest back in the lookahead deque. */
1427 n = deque_count (&src->deque) - 1;
1428 lex_source_error_valist (src, n, n, format, args);
1429 lex_source_pop_front (src);
/* NOTE(review): the matching va_end() call is not visible in this
   extraction — confirm against the full file. */
1434 /* Attempts to append an additional token into SRC's deque, reading more from
1435 the underlying lex_reader if necessary. Returns true if a new token was
1436 added to SRC's deque, false otherwise. */
1438 lex_source_try_get (struct lex_source *src)
1440 /* State maintained while scanning tokens. Usually we only need a single
1441 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1442 needs to be saved and possibly restored later with SCAN_BACK. */
1445 struct segmenter segmenter;
1446 enum segment_type last_segment;
1447 int newlines; /* Number of newlines encountered so far. */
1448 /* Maintained here so we can update lex_source's similar members when we
1454 /* Initialize state. */
1455 struct state state =
1457 .segmenter = src->segmenter,
1459 .seg_pos = src->seg_pos,
1460 .line_pos = src->line_pos,
/* SAVED holds the SCAN_SAVE restore point for the scanner. */
1462 struct state saved = state;
1464 /* Append a new token to SRC and initialize it. */
1465 struct lex_token *token = lex_push_token__ (src);
1466 struct scanner scanner;
1467 scanner_init (&scanner, &token->token);
1468 token->line_pos = src->line_pos;
1469 token->token_pos = src->seg_pos;
/* Track line numbers only for readers that report them (line_number > 0). */
1470 if (src->reader->line_number > 0)
1471 token->first_line = src->reader->line_number + src->n_newlines;
1473 token->first_line = 0;
1475 /* Extract segments and pass them through the scanner until we obtain a
1479 /* Extract a segment. */
1480 const char *segment = &src->buffer[state.seg_pos - src->tail];
1481 size_t seg_maxlen = src->head - state.seg_pos;
1482 enum segment_type type;
1483 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1484 src->reader->eof, &type);
1487 /* The segmenter needs more input to produce a segment. */
1488 assert (!src->reader->eof);
1489 lex_source_read__ (src);
1493 /* Update state based on the segment. */
1494 state.last_segment = type;
1495 state.seg_pos += seg_len;
1496 if (type == SEG_NEWLINE)
1499 state.line_pos = state.seg_pos;
1502 /* Pass the segment into the scanner and try to get a token out. */
1503 enum scan_result result = scanner_push (&scanner, type,
1504 ss_buffer (segment, seg_len),
1506 if (result == SCAN_SAVE)
1508 else if (result == SCAN_BACK)
1513 else if (result == SCAN_DONE)
1517 /* If we've reached the end of a line, or the end of a command, then pass
1518 the line to the output engine as a syntax text item. */
1519 int n_lines = state.newlines;
1520 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1523 src->suppress_next_newline = true;
1525 else if (n_lines > 0 && src->suppress_next_newline)
1528 src->suppress_next_newline = false;
1530 for (int i = 0; i < n_lines; i++)
1532 /* Beginning of line. */
1533 const char *line = &src->buffer[src->journal_pos - src->tail];
1535 /* Calculate line length, including \n or \r\n end-of-line if present.
1537 We use src->head even though that may be beyond what we've actually
1538 converted to tokens (which is only through state.line_pos). That's
1539 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1540 take the whole line through the newline, not just through the '.'. */
1541 size_t max_len = src->head - src->journal_pos;
1542 const char *newline = memchr (line, '\n', max_len);
1543 size_t line_len = newline ? newline - line + 1 : max_len;
1545 /* Calculate line length excluding end-of-line. */
1546 size_t copy_len = line_len;
1547 if (copy_len > 0 && line[copy_len - 1] == '\n')
1549 if (copy_len > 0 && line[copy_len - 1] == '\r')
1552 /* Submit the line as syntax. */
1553 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1554 xmemdup0 (line, copy_len),
1557 src->journal_pos += line_len;
/* A complete token was scanned: record its length and fold the local
   scanning state back into SRC. */
1560 token->token_len = state.seg_pos - src->seg_pos;
1562 src->segmenter = state.segmenter;
1563 src->seg_pos = state.seg_pos;
1564 src->line_pos = state.line_pos;
1565 src->n_newlines += state.newlines;
/* Post-process the scanned token: translate some token types and report
   scan-level errors (each error pops the bad token via lex_get_error()). */
1567 switch (token->token.type)
1573 token->token.type = T_ENDCMD;
1577 case SCAN_BAD_HEX_LENGTH:
1578 lex_get_error (src, _("String of hex digits has %d characters, which "
1579 "is not a multiple of 2"),
1580 (int) token->token.number);
1583 case SCAN_BAD_HEX_DIGIT:
1584 case SCAN_BAD_UNICODE_DIGIT:
1585 lex_get_error (src, _("`%c' is not a valid hex digit"),
1586 (int) token->token.number);
1589 case SCAN_BAD_UNICODE_LENGTH:
1590 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1591 "not in the valid range of 1 to 8 bytes"),
1592 (int) token->token.number);
1595 case SCAN_BAD_UNICODE_CODE_POINT:
1596 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1597 (int) token->token.number);
1600 case SCAN_EXPECTED_QUOTE:
1601 lex_get_error (src, _("Unterminated string constant"));
1604 case SCAN_EXPECTED_EXPONENT:
1605 lex_get_error (src, _("Missing exponent following `%s'"),
1606 token->token.string.string);
1609 case SCAN_UNEXPECTED_CHAR:
/* Presumably a char c_name[...] buffer declaration precedes this call in
   the full file — it is missing from this extraction. */
1612 lex_get_error (src, _("Bad character %s in input"),
1613 uc_name (token->token.number, c_name));
1618 lex_source_pop_front (src);
/* NOTE(review): only a fragment of this function survives extraction; it
   appears to retry lex_source_try_get() until a token is produced — confirm
   against the full file. */
1626 lex_source_get__ (struct lex_source *src)
1631 else if (lex_source_try_get (src))
/* Obtains the next token for SRC, running macro expansion on the token
   stream via a macro_expander before the token becomes visible to callers.
   NOTE(review): many lines are missing from this extraction; in particular
   the apparent recursive call below is suspicious — the full file likely
   calls a differently-named helper (e.g. lex_source_get__). Verify. */
1637 lex_source_get (const struct lex_source *src_)
/* The parameter is conceptually const but the implementation mutates SRC. */
1639 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1641 if (!lex_source_get (src))
1644 struct macro_expander *me = macro_expander_create (src->lexer,
1645 lex_source_front (src));
1651 if (!lex_source_get (src))
1653 /* This should not be reachable because we always get a T_STOP at the
1654 end of input and the macro_expander should always terminate
1655 expansion on T_STOP. */
/* Feed tokens into the expander until it reports completion. */
1659 int retval = macro_expander_add (me, lex_source_front (src));
/* Appends a synthetic T_ENDCMD token, with zero length and no source
   location, to SRC's deque.  Used as the sentinel first token of a source. */
1668 lex_source_push_endcmd__ (struct lex_source *src)
1670 struct lex_token *token = lex_push_token__ (src);
1671 token->token.type = T_ENDCMD;
1672 token->token_pos = 0;
1673 token->token_len = 0;
1674 token->line_pos = 0;
1675 token->first_line = 0;
/* Creates and returns a new lex_source that wraps READER, owned by LEXER.
   The source starts with a sentinel T_ENDCMD token in its deque. */
1678 static struct lex_source *
1679 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1681 struct lex_source *src;
/* xzalloc() zero-initializes buffer pointers and all positions. */
1683 src = xzalloc (sizeof *src);
1684 src->reader = reader;
1685 segmenter_init (&src->segmenter, reader->syntax);
/* Start with room for 4 lookahead tokens; the deque grows as needed. */
1687 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1689 lex_source_push_endcmd__ (src);
/* Destroys SRC: runs its reader's destructor, drops all buffered tokens,
   and unlinks SRC from its lexer's source list.
   NOTE(review): the lines that free file_name, encoding, src->buffer, and
   SRC itself are missing from this extraction; file_name/encoding are saved
   before the reader destructor runs because that call may invalidate
   READER's own fields. */
1695 lex_source_destroy (struct lex_source *src)
1697 char *file_name = src->reader->file_name;
1698 char *encoding = src->reader->encoding;
1699 if (src->reader->class->destroy != NULL)
1700 src->reader->class->destroy (src->reader);
1704 while (!deque_is_empty (&src->deque))
1705 lex_source_pop__ (src);
1707 ll_remove (&src->ll);
/* A lex_reader backed by a file (or stdin), read through a u8_istream that
   recodes the file's encoding to UTF-8. */
1711 struct lex_file_reader
1713 struct lex_reader reader; /* Base class; UP_CAST recovers the container. */
1714 struct u8_istream *istream; /* Underlying recoding input stream. */
/* Defined later in this file, after the class's callback functions. */
1717 static struct lex_reader_class lex_file_reader_class;
1719 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1720 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1721 ENCODING, which should take one of the forms accepted by
1722 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1723 mode of the new reader, respectively.
1725 Returns a null pointer if FILE_NAME cannot be opened. */
1727 lex_reader_for_file (const char *file_name, const char *encoding,
1728 enum segmenter_mode syntax,
1729 enum lex_error_mode error)
1731 struct lex_file_reader *r;
1732 struct u8_istream *istream;
/* "-" selects stdin; anything else is opened read-only. */
1734 istream = (!strcmp(file_name, "-")
1735 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1736 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1737 if (istream == NULL)
/* Report the open failure and (presumably) return NULL per the contract
   above — the return statement is missing from this extraction. */
1739 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1743 r = xmalloc (sizeof *r);
1744 lex_reader_init (&r->reader, &lex_file_reader_class);
1745 r->reader.syntax = syntax;
1746 r->reader.error = error;
/* The reader owns copies of the file name and encoding strings. */
1747 r->reader.file_name = xstrdup (file_name);
1748 r->reader.encoding = xstrdup_if_nonnull (encoding);
1749 r->reader.line_number = 1;
1750 r->istream = istream;
/* Downcasts the generic lex_reader R to its containing lex_file_reader. */
1755 static struct lex_file_reader *
1756 lex_file_reader_cast (struct lex_reader *r)
1758 return UP_CAST (r, struct lex_file_reader, reader);
/* "read" callback for file-backed readers: reads up to N bytes into BUF from
   the underlying u8_istream.  PROMPT_STYLE is ignored for files.
   NOTE(review): the return-type line, braces, and the return statements for
   both the error and success paths are missing from this extraction. */
1762 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1763 enum prompt_style prompt_style UNUSED)
1765 struct lex_file_reader *r = lex_file_reader_cast (r_);
1766 ssize_t n_read = u8_istream_read (r->istream, buf, n);
/* A negative result indicates a read error; report it with errno. */
1769 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* "close"/destroy callback for file-backed readers: closes the stream unless
   it is stdin (which u8_istream_free() releases without closing the fd).
   NOTE(review): the free(r) that presumably ends this function is missing
   from this extraction. */
1776 lex_file_close (struct lex_reader *r_)
1778 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* Never close stdin itself; other streams are closed and errors reported. */
1780 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1782 if (u8_istream_close (r->istream) != 0)
1783 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
/* For stdin, release the stream object without closing the descriptor. */
1786 u8_istream_free (r->istream);
1791 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader backed by an in-memory string.
   NOTE(review): the struct's remaining members (presumably a substring "s"
   and a read "offset") are missing from this extraction; they are referenced
   by lex_string_read() below. */
1797 struct lex_string_reader
1799 struct lex_reader reader; /* Base class; UP_CAST recovers the container. */
/* Defined later in this file, after the class's callback functions. */
1804 static struct lex_reader_class lex_string_reader_class;
1806 /* Creates and returns a new lex_reader for the contents of S, which must be
1807 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1808 with ss_dealloc() when it is closed. */
1810 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1812 struct lex_string_reader *r;
1814 r = xmalloc (sizeof *r);
1815 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always use automatic syntax-mode detection. */
1816 r->reader.syntax = SEG_MODE_AUTO;
1817 r->reader.encoding = xstrdup_if_nonnull (encoding);
/* NOTE(review): the assignments storing S and resetting the read offset,
   and the return statement, are missing from this extraction. */
1824 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1825 which must be encoded in ENCODING. The caller retains ownership of S. */
1827 lex_reader_for_string (const char *s, const char *encoding)
1829 struct substring ss;
/* Copy S so the nocopy variant can take ownership of the copy. */
1830 ss_alloc_substring (&ss, ss_cstr (s));
1831 return lex_reader_for_substring_nocopy (ss, encoding);
1834 /* Formats FORMAT as a printf()-like format string and creates and returns a
1835 new lex_reader for the formatted result. */
1837 lex_reader_for_format (const char *format, const char *encoding, ...)
1839 struct lex_reader *r;
1842 va_start (args, encoding);
/* xvasprintf() allocates the formatted string; the nocopy reader takes
   ownership and frees it on close. */
1843 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* NOTE(review): the matching va_end() and the return statement are missing
   from this extraction. */
/* Downcasts the generic lex_reader R to its containing lex_string_reader. */
1849 static struct lex_string_reader *
1850 lex_string_reader_cast (struct lex_reader *r)
1852 return UP_CAST (r, struct lex_string_reader, reader);
/* "read" callback for string-backed readers: copies up to N bytes of the
   remaining string into BUF.  PROMPT_STYLE is ignored for strings.
   NOTE(review): the return-type line, the offset advance, and the return of
   CHUNK are missing from this extraction. */
1856 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1857 enum prompt_style prompt_style UNUSED)
1859 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Never copy past the end of the backing string. */
1862 chunk = MIN (n, r->s.length - r->offset);
1863 memcpy (buf, r->s.string + r->offset, chunk);
/* "close"/destroy callback for string-backed readers.
   NOTE(review): the body lines that release the owned string (ss_dealloc,
   per the ownership contract documented at lex_reader_for_substring_nocopy)
   and free R are missing from this extraction. */
1870 lex_string_close (struct lex_reader *r_)
1872 struct lex_string_reader *r = lex_string_reader_cast (r_);
1878 static struct lex_reader_class lex_string_reader_class =