1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/output-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* Location of token in terms of the lex_source's buffer.
65 src->tail <= line_pos <= token_pos <= src->head. */
66 size_t token_pos; /* Start of token. */
67 size_t token_len; /* Length of source for token in bytes. */
68 size_t line_pos; /* Start of line containing token_pos. */
69 int first_line; /* Line number at token_pos. */
72 /* A source of tokens, corresponding to a syntax file.
74 This is conceptually a lex_reader wrapped with everything needed to convert
75 its UTF-8 bytes into tokens. */
78 struct ll ll; /* In lexer's list of sources. */
79 struct lex_reader *reader;
80 struct segmenter segmenter;
81 bool eof; /* True if T_STOP was read from 'reader'. */
83 /* Buffer of UTF-8 bytes. */
85 size_t allocated; /* Number of bytes allocated. */
86 size_t tail; /* &buffer[0] offset into UTF-8 source. */
87 size_t head; /* &buffer[head - tail] offset into source. */
89 /* Positions in source file, tail <= pos <= head for each member here. */
90 size_t journal_pos; /* First byte not yet output to journal. */
91 size_t seg_pos; /* First byte not yet scanned as token. */
92 size_t line_pos; /* First byte of line containing seg_pos. */
94 int n_newlines; /* Number of new-lines up to seg_pos. */
95 bool suppress_next_newline;
98 struct deque deque; /* Indexes into 'tokens'. */
99 struct lex_token *tokens; /* Lookahead tokens for parser. */
102 static struct lex_source *lex_source_create (struct lex_reader *);
103 static void lex_source_destroy (struct lex_source *);
108 struct ll_list sources; /* Contains "struct lex_source"s. */
111 static struct lex_source *lex_source__ (const struct lexer *);
112 static const struct lex_token *lex_next__ (const struct lexer *, int n);
113 static void lex_source_push_endcmd__ (struct lex_source *);
115 static void lex_source_pop__ (struct lex_source *);
116 static bool lex_source_get__ (const struct lex_source *);
117 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
118 const char *format, va_list)
119 PRINTF_FORMAT (4, 0);
120 static const struct lex_token *lex_source_next__ (const struct lex_source *,
123 /* Initializes READER with the specified CLASS and otherwise some reasonable
124 defaults. The caller should fill in the other members as desired. */
126 lex_reader_init (struct lex_reader *reader,
127 const struct lex_reader_class *class)
129 reader->class = class;
130 reader->syntax = SEG_MODE_AUTO;
131 reader->error = LEX_ERROR_CONTINUE;
132 reader->file_name = NULL;
133 reader->encoding = NULL;
134 reader->line_number = 0;
138 /* Frees any file name already in READER and replaces it by a copy of
139 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
141 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
143 free (reader->file_name);
144 reader->file_name = xstrdup_if_nonnull (file_name);
147 /* Creates and returns a new lexer. */
151 struct lexer *lexer = xzalloc (sizeof *lexer);
152 ll_init (&lexer->sources);
156 /* Destroys LEXER. */
158 lex_destroy (struct lexer *lexer)
162 struct lex_source *source, *next;
164 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
165 lex_source_destroy (source);
170 /* Inserts READER into LEXER so that the next token read by LEXER comes from
171 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
174 lex_include (struct lexer *lexer, struct lex_reader *reader)
176 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
177 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
180 /* Appends READER to LEXER, so that it will be read after all other current
181 readers have already been read. */
183 lex_append (struct lexer *lexer, struct lex_reader *reader)
185 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
/* Makes room for one more token in SRC and initializes it: grows SRC's token
   array with deque_expand() when the deque is full, then claims the slot at
   the front of the deque and runs token_init() on its 'token' member.

   NOTE(review): presumably hands the new slot back to the caller, as the
   return type suggests -- confirm. */
190 static struct lex_token *
191 lex_push_token__ (struct lex_source *src)
193 struct lex_token *token;
195 if (deque_is_full (&src->deque))
196 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
/* New tokens enter at the front; lookahead is indexed from the back. */
198 token = &src->tokens[deque_push_front (&src->deque)];
199 token_init (&token->token);
/* Removes the token at the back of SRC's deque and passes its 'token' member
   to token_uninit().  The back of the deque is the end from which
   lex_source_next__() indexes lookahead, so this drops the current token. */
204 lex_source_pop__ (struct lex_source *src)
206 token_uninit (&src->tokens[deque_pop_back (&src->deque)].token);
/* Removes the token at the front of SRC's deque and passes its 'token' member
   to token_uninit().  The front of the deque is the end at which
   lex_push_token__() adds tokens, so this drops the most recently added
   one. */
210 lex_source_pop_front (struct lex_source *src)
212 token_uninit (&src->tokens[deque_pop_front (&src->deque)].token);
215 /* Advances LEXER to the next token, consuming the current token. */
217 lex_get (struct lexer *lexer)
219 struct lex_source *src;
221 src = lex_source__ (lexer);
225 if (!deque_is_empty (&src->deque))
226 lex_source_pop__ (src);
228 while (deque_is_empty (&src->deque))
229 if (!lex_source_get__ (src))
231 lex_source_destroy (src);
232 src = lex_source__ (lexer);
238 /* Issuing errors. */
240 /* Prints a syntax error message containing the current token and
241 given message MESSAGE (if non-null). */
243 lex_error (struct lexer *lexer, const char *format, ...)
247 va_start (args, format);
248 lex_next_error_valist (lexer, 0, 0, format, args);
252 /* Prints a syntax error message containing the current token and
253 given message MESSAGE (if non-null). */
255 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
257 lex_next_error_valist (lexer, 0, 0, format, args);
260 /* Prints a syntax error message containing the current token and
261 given message MESSAGE (if non-null). */
263 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
267 va_start (args, format);
268 lex_next_error_valist (lexer, n0, n1, format, args);
272 /* Prints a syntax error message saying that one of the strings provided as
273 varargs, up to the first NULL, is expected. */
275 (lex_error_expecting) (struct lexer *lexer, ...)
279 va_start (args, lexer);
280 lex_error_expecting_valist (lexer, args);
284 /* Prints a syntax error message saying that one of the options provided in
285 ARGS, up to the first NULL, is expected. */
287 lex_error_expecting_valist (struct lexer *lexer, va_list args)
289 enum { MAX_OPTIONS = 9 };
290 const char *options[MAX_OPTIONS];
292 while (n < MAX_OPTIONS)
294 const char *option = va_arg (args, const char *);
298 options[n++] = option;
300 lex_error_expecting_array (lexer, options, n);
304 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
309 lex_error (lexer, NULL);
313 lex_error (lexer, _("expecting %s"), options[0]);
317 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
321 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
326 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
327 options[0], options[1], options[2], options[3]);
331 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
332 options[0], options[1], options[2], options[3], options[4]);
336 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
337 options[0], options[1], options[2], options[3], options[4],
342 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
343 options[0], options[1], options[2], options[3], options[4],
344 options[5], options[6]);
348 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
349 options[0], options[1], options[2], options[3], options[4],
350 options[5], options[6], options[7]);
354 lex_error (lexer, NULL);
358 /* Reports an error to the effect that subcommand SBC may only be specified
361 This function does not take a lexer as an argument or use lex_error(),
362 because the result would ordinarily just be redundant: "Syntax error at
363 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
364 not help the user find the error. */
366 lex_sbc_only_once (const char *sbc)
368 msg (SE, _("Subcommand %s may only be specified once."), sbc);
371 /* Reports an error to the effect that subcommand SBC is missing.
373 This function does not take a lexer as an argument or use lex_error(),
374 because a missing subcommand can normally be detected only after the whole
375 command has been parsed, and so lex_error() would always report "Syntax
376 error at end of command", which does not help the user find the error. */
378 lex_sbc_missing (const char *sbc)
380 msg (SE, _("Required subcommand %s was not specified."), sbc);
383 /* Reports an error to the effect that specification SPEC may only be specified
384 once within subcommand SBC. */
386 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
388 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
392 /* Reports an error to the effect that specification SPEC is missing within
395 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
397 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
401 /* Prints a syntax error message containing the current token and
402 given message MESSAGE (if non-null). */
404 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
405 const char *format, va_list args)
407 struct lex_source *src = lex_source__ (lexer);
410 lex_source_error_valist (src, n0, n1, format, args);
416 ds_put_format (&s, _("Syntax error at end of input"));
419 ds_put_cstr (&s, ": ");
420 ds_put_vformat (&s, format, args);
422 ds_put_byte (&s, '.');
423 msg (SE, "%s", ds_cstr (&s));
428 /* Checks that we're at end of command.
429 If so, returns a successful command completion code.
430 If not, flags a syntax error and returns an error command
433 lex_end_of_command (struct lexer *lexer)
435 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
437 lex_error (lexer, _("expecting end of command"));
444 /* Token testing functions. */
446 /* Returns true if the current token is a number. */
448 lex_is_number (const struct lexer *lexer)
450 return lex_next_is_number (lexer, 0);
453 /* Returns true if the current token is a string. */
455 lex_is_string (const struct lexer *lexer)
457 return lex_next_is_string (lexer, 0);
460 /* Returns the value of the current token, which must be a
461 floating point number. */
463 lex_number (const struct lexer *lexer)
465 return lex_next_number (lexer, 0);
468 /* Returns true iff the current token is an integer. */
470 lex_is_integer (const struct lexer *lexer)
472 return lex_next_is_integer (lexer, 0);
475 /* Returns the value of the current token, which must be an
478 lex_integer (const struct lexer *lexer)
480 return lex_next_integer (lexer, 0);
483 /* Token testing functions with lookahead.
485 A value of 0 for N as an argument to any of these functions refers to the
486 current token. Lookahead is limited to the current command. Any N greater
487 than the number of tokens remaining in the current command will be treated
488 as referring to a T_ENDCMD token. */
490 /* Returns true if the token N ahead of the current token is a number. */
492 lex_next_is_number (const struct lexer *lexer, int n)
494 return token_is_number (lex_next (lexer, n));
497 /* Returns true if the token N ahead of the current token is a string. */
499 lex_next_is_string (const struct lexer *lexer, int n)
501 return token_is_string (lex_next (lexer, n));
504 /* Returns the value of the token N ahead of the current token, which must be a
505 floating point number. */
507 lex_next_number (const struct lexer *lexer, int n)
509 return token_number (lex_next (lexer, n));
512 /* Returns true if the token N ahead of the current token is an integer. */
514 lex_next_is_integer (const struct lexer *lexer, int n)
516 return token_is_integer (lex_next (lexer, n));
519 /* Returns the value of the token N ahead of the current token, which must be
522 lex_next_integer (const struct lexer *lexer, int n)
524 return token_integer (lex_next (lexer, n));
527 /* Token matching functions. */
529 /* If the current token has the specified TYPE, skips it and returns true.
530 Otherwise, returns false. */
532 lex_match (struct lexer *lexer, enum token_type type)
534 if (lex_token (lexer) == type)
543 /* If the current token matches IDENTIFIER, skips it and returns true.
544 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
547 IDENTIFIER must be an ASCII string. */
549 lex_match_id (struct lexer *lexer, const char *identifier)
551 return lex_match_id_n (lexer, identifier, 3);
554 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
555 may be abbreviated to its first N letters. Otherwise, returns false.
557 IDENTIFIER must be an ASCII string. */
559 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
561 if (lex_token (lexer) == T_ID
562 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
571 /* If the current token is integer X, skips it and returns true. Otherwise,
574 lex_match_int (struct lexer *lexer, int x)
576 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
585 /* Forced matches. */
587 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
588 abbreviated to its first 3 letters. Otherwise, reports an error and returns
591 IDENTIFIER must be an ASCII string. */
593 lex_force_match_id (struct lexer *lexer, const char *identifier)
595 if (lex_match_id (lexer, identifier))
599 lex_error_expecting (lexer, identifier);
604 /* If the current token has the specified TYPE, skips it and returns true.
605 Otherwise, reports an error and returns false. */
607 lex_force_match (struct lexer *lexer, enum token_type type)
609 if (lex_token (lexer) == type)
616 const char *type_string = token_type_to_string (type);
619 char *s = xasprintf ("`%s'", type_string);
620 lex_error_expecting (lexer, s);
624 lex_error_expecting (lexer, token_type_to_name (type));
630 /* If the current token is a string, does nothing and returns true.
631 Otherwise, reports an error and returns false. */
633 lex_force_string (struct lexer *lexer)
635 if (lex_is_string (lexer))
639 lex_error (lexer, _("expecting string"));
644 /* If the current token is a string or an identifier, does nothing and returns
645 true. Otherwise, reports an error and returns false.
647 This is meant for use in syntactic situations where we want to encourage the
648 user to supply a quoted string, but for compatibility we also accept
649 identifiers. (One example of such a situation is file names.) Therefore,
650 the error message issued when the current token is wrong only says that a
651 string is expected and doesn't mention that an identifier would also be
654 lex_force_string_or_id (struct lexer *lexer)
656 return lex_token (lexer) == T_ID || lex_force_string (lexer);
659 /* If the current token is an integer, does nothing and returns true.
660 Otherwise, reports an error and returns false. */
662 lex_force_int (struct lexer *lexer)
664 if (lex_is_integer (lexer))
668 lex_error (lexer, _("expecting integer"));
673 /* If the current token is an integer in the range MIN...MAX (inclusive), does
674 nothing and returns true. Otherwise, reports an error and returns false.
675 If NAME is nonnull, then it is used in the error message. */
677 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
679 bool is_integer = lex_is_integer (lexer);
680 bool too_small = is_integer && lex_integer (lexer) < min;
681 bool too_big = is_integer && lex_integer (lexer) > max;
682 if (is_integer && !too_small && !too_big)
687 /* Weird, maybe a bug in the caller. Just report that we needed an
690 lex_error (lexer, _("Integer expected for %s."), name);
692 lex_error (lexer, _("Integer expected."));
697 lex_error (lexer, _("Expected %ld for %s."), min, name);
699 lex_error (lexer, _("Expected %ld."), min);
701 else if (min + 1 == max)
704 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
706 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
710 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
711 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
713 if (report_lower_bound && report_upper_bound)
717 _("Expected integer between %ld and %ld for %s."),
720 lex_error (lexer, _("Expected integer between %ld and %ld."),
723 else if (report_lower_bound)
728 lex_error (lexer, _("Expected non-negative integer for %s."),
731 lex_error (lexer, _("Expected non-negative integer."));
736 lex_error (lexer, _("Expected positive integer for %s."),
739 lex_error (lexer, _("Expected positive integer."));
742 else if (report_upper_bound)
746 _("Expected integer less than or equal to %ld for %s."),
749 lex_error (lexer, _("Expected integer less than or equal to %ld."),
755 lex_error (lexer, _("Integer expected for %s."), name);
757 lex_error (lexer, _("Integer expected."));
763 /* If the current token is a number, does nothing and returns true.
764 Otherwise, reports an error and returns false. */
766 lex_force_num (struct lexer *lexer)
768 if (lex_is_number (lexer))
771 lex_error (lexer, _("expecting number"));
775 /* If the current token is an identifier, does nothing and returns true.
776 Otherwise, reports an error and returns false. */
778 lex_force_id (struct lexer *lexer)
780 if (lex_token (lexer) == T_ID)
783 lex_error (lexer, _("expecting identifier"));
787 /* Token accessors. */
789 /* Returns the type of LEXER's current token. */
791 lex_token (const struct lexer *lexer)
793 return lex_next_token (lexer, 0);
796 /* Returns the number in LEXER's current token.
798 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
799 tokens this function will always return zero. */
801 lex_tokval (const struct lexer *lexer)
803 return lex_next_tokval (lexer, 0);
806 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
808 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
809 this function will always return NULL.
811 The UTF-8 encoding of the returned string is correct for variable names and
812 other identifiers. Use filename_to_utf8() to use it as a filename. Use
813 data_in() to use it in a "union value". */
815 lex_tokcstr (const struct lexer *lexer)
817 return lex_next_tokcstr (lexer, 0);
820 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
821 null-terminated (but the null terminator is not included in the returned
822 substring's 'length').
824 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
825 this function will always return NULL.
827 The UTF-8 encoding of the returned string is correct for variable names and
828 other identifiers. Use filename_to_utf8() to use it as a filename. Use
829 data_in() to use it in a "union value". */
831 lex_tokss (const struct lexer *lexer)
833 return lex_next_tokss (lexer, 0);
838 A value of 0 for N as an argument to any of these functions refers to the
839 current token. Lookahead is limited to the current command. Any N greater
840 than the number of tokens remaining in the current command will be treated
841 as referring to a T_ENDCMD token. */
843 static const struct lex_token *
844 lex_next__ (const struct lexer *lexer_, int n)
846 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
847 struct lex_source *src = lex_source__ (lexer);
850 return lex_source_next__ (src, n);
853 static const struct lex_token stop_token =
854 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
860 static const struct lex_token *
861 lex_source_next__ (const struct lex_source *src, int n)
863 while (deque_count (&src->deque) <= n)
865 if (!deque_is_empty (&src->deque))
867 struct lex_token *front;
869 front = &src->tokens[deque_front (&src->deque, 0)];
870 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
874 lex_source_get__ (src);
877 return &src->tokens[deque_back (&src->deque, n)];
880 /* Returns the "struct token" of the token N after the current one in LEXER.
881 The returned pointer can be invalidated by pretty much any succeeding call
882 into the lexer, although the string pointer within the returned token is
883 only invalidated by consuming the token (e.g. with lex_get()). */
885 lex_next (const struct lexer *lexer, int n)
887 return &lex_next__ (lexer, n)->token;
890 /* Returns the type of the token N after the current one in LEXER. */
892 lex_next_token (const struct lexer *lexer, int n)
894 return lex_next (lexer, n)->type;
897 /* Returns the number in the token N after the current one in LEXER.
899 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
900 tokens this function will always return zero. */
902 lex_next_tokval (const struct lexer *lexer, int n)
904 return token_number (lex_next (lexer, n));
907 /* Returns the null-terminated string in the token N after the current one, in
910 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
911 this function will always return NULL.
913 The UTF-8 encoding of the returned string is correct for variable names and
914 other identifiers. Use filename_to_utf8() to use it as a filename. Use
915 data_in() to use it in a "union value". */
917 lex_next_tokcstr (const struct lexer *lexer, int n)
919 return lex_next_tokss (lexer, n).string;
922 /* Returns the string in the token N after the current one, in UTF-8 encoding.
923 The string is null-terminated (but the null terminator is not included in
924 the returned substring's 'length').
926 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
927 tokens this function will always return NULL.
929 The UTF-8 encoding of the returned string is correct for variable names and
930 other identifiers. Use filename_to_utf8() to use it as a filename. Use
931 data_in() to use it in a "union value". */
933 lex_next_tokss (const struct lexer *lexer, int n)
935 return lex_next (lexer, n)->string;
939 lex_tokens_match (const struct token *actual, const struct token *expected)
941 if (actual->type != expected->type)
944 switch (actual->type)
948 return actual->number == expected->number;
951 return lex_id_match (expected->string, actual->string);
954 return (actual->string.length == expected->string.length
955 && !memcmp (actual->string.string, expected->string.string,
956 actual->string.length));
963 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
964 skips it and returns true. Otherwise, returns false.
966 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
967 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
968 first three letters. */
970 lex_match_phrase (struct lexer *lexer, const char *s)
972 struct string_lexer slex;
977 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
978 while (string_lexer_next (&slex, &token))
979 if (token.type != SCAN_SKIP)
981 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
982 token_uninit (&token);
/* Returns the 'first_line' member of the token N ahead of the current one in
   SRC, as located by lex_source_next__(). */
993 lex_source_get_first_line_number (const struct lex_source *src, int n)
995 return lex_source_next__ (src, n)->first_line;
999 count_newlines (char *s, size_t length)
1004 while ((newline = memchr (s, '\n', length)) != NULL)
1007 length -= (newline + 1) - s;
1015 lex_source_get_last_line_number (const struct lex_source *src, int n)
1017 const struct lex_token *token = lex_source_next__ (src, n);
1019 if (token->first_line == 0)
1023 char *token_str = &src->buffer[token->token_pos - src->tail];
1024 return token->first_line + count_newlines (token_str, token->token_len) + 1;
1029 count_columns (const char *s_, size_t length)
1031 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1037 for (ofs = 0; ofs < length; ofs += mblen)
1041 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1044 int width = uc_width (uc, "UTF-8");
1049 columns = ROUND_UP (columns + 1, 8);
/* Measures, with count_columns(), the text that precedes the token N ahead of
   the current one in SRC on the token's own line: the bytes from the start of
   the line containing the token up to, but not including, the token's first
   byte.  Returns whatever count_columns() reports for that text. */
1056 lex_source_get_first_column (const struct lex_source *src, int n)
1058 const struct lex_token *token = lex_source_next__ (src, n);
/* Token positions are offsets into the source as a whole; subtracting
   src->tail converts them to indexes into src->buffer. */
1059 return count_columns (&src->buffer[token->line_pos - src->tail],
1060 token->token_pos - token->line_pos);
/* Measures, with count_columns(), the text from the start of the line on
   which the token N ahead of the current one in SRC ends, up to the end of
   that token.  Returns whatever count_columns() reports for that text. */
1064 lex_source_get_last_column (const struct lex_source *src, int n)
1066 const struct lex_token *token = lex_source_next__ (src, n);
1067 char *start, *end, *newline;
/* START is the beginning of the line on which the token starts; END is just
   past the token's last byte.  Both are rebased by src->tail to index into
   src->buffer. */
1069 start = &src->buffer[token->line_pos - src->tail];
1070 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
/* If the span contains a new-line, the token ends on a later line than it
   starts, so measure only what follows the last new-line. */
1071 newline = memrchr (start, '\n', end - start);
1072 if (newline != NULL)
1073 start = newline + 1;
1074 return count_columns (start, end - start);
1077 /* Returns the 1-based line number of the start of the syntax that represents
1078 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1079 if the token is drawn from a source that does not have line numbers. */
1081 lex_get_first_line_number (const struct lexer *lexer, int n)
1083 const struct lex_source *src = lex_source__ (lexer);
1084 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1087 /* Returns the 1-based line number of the end of the syntax that represents the
1088 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1089 token or if the token is drawn from a source that does not have line
1092 Most of the time, a single token is wholly within a single line of syntax,
1093 but there are two exceptions: a T_STRING token can be made up of multiple
1094 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1095 token can consist of a "-" on one line followed by the number on the next.
1098 lex_get_last_line_number (const struct lexer *lexer, int n)
1100 const struct lex_source *src = lex_source__ (lexer);
1101 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1104 /* Returns the 1-based column number of the start of the syntax that represents
1105 the token N after the current one in LEXER. Returns 0 for a T_STOP
1108 Column numbers are measured according to the width of characters as shown in
1109 a typical fixed-width font, in which CJK characters have width 2 and
1110 combining characters have width 0. */
1112 lex_get_first_column (const struct lexer *lexer, int n)
1114 const struct lex_source *src = lex_source__ (lexer);
1115 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1118 /* Returns the 1-based column number of the end of the syntax that represents
1119 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1122 Column numbers are measured according to the width of characters as shown in
1123 a typical fixed-width font, in which CJK characters have width 2 and
1124 combining characters have width 0. */
1126 lex_get_last_column (const struct lexer *lexer, int n)
1128 const struct lex_source *src = lex_source__ (lexer);
1129 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1132 /* Returns the name of the syntax file from which the current command is drawn.
1133 Returns NULL for a T_STOP token or if the command's source does not have
1136 There is no version of this function that takes an N argument because
1137 lookahead only works to the end of a command and any given command is always
1138 within a single syntax file. */
1140 lex_get_file_name (const struct lexer *lexer)
1142 struct lex_source *src = lex_source__ (lexer);
1143 return src == NULL ? NULL : src->reader->file_name;
1147 lex_get_encoding (const struct lexer *lexer)
1149 struct lex_source *src = lex_source__ (lexer);
1150 return src == NULL ? NULL : src->reader->encoding;
1154 /* Returns the syntax mode for the syntax file from which the current command is
1155 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1156 does not have line numbers.
1158 There is no version of this function that takes an N argument because
1159 lookahead only works to the end of a command and any given command is always
1160 within a single syntax file. */
1162 lex_get_syntax_mode (const struct lexer *lexer)
1164 struct lex_source *src = lex_source__ (lexer);
1165 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1168 /* Returns the error mode for the syntax file from which the current command is
1169 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1170 source does not have line numbers.
1172 There is no version of this function that takes an N argument because
1173 lookahead only works to the end of a command and any given command is always
1174 within a single syntax file. */
1176 lex_get_error_mode (const struct lexer *lexer)
1178 struct lex_source *src = lex_source__ (lexer);
1179 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1182 /* If the source that LEXER is currently reading has error mode
1183 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1184 token to be read comes directly from whatever is next read from the stream.
1186 It makes sense to call this function after encountering an error in a
1187 command entered on the console, because usually the user would prefer not to
1188 have cascading errors. */
1190 lex_interactive_reset (struct lexer *lexer)
1192 struct lex_source *src = lex_source__ (lexer);
1193 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1195 src->head = src->tail = 0;
1196 src->journal_pos = src->seg_pos = src->line_pos = 0;
1197 src->n_newlines = 0;
1198 src->suppress_next_newline = false;
1199 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1200 while (!deque_is_empty (&src->deque))
1201 lex_source_pop__ (src);
1202 lex_source_push_endcmd__ (src);
1206 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1208 lex_discard_rest_of_command (struct lexer *lexer)
1210 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1214 /* Discards all lookahead tokens in LEXER, then discards all input sources
1215 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1216 runs out of input sources. */
1218 lex_discard_noninteractive (struct lexer *lexer)
1220 struct lex_source *src = lex_source__ (lexer);
1224 while (!deque_is_empty (&src->deque))
1225 lex_source_pop__ (src);
1227 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1228 src = lex_source__ (lexer))
1229 lex_source_destroy (src);
/* Computes the smallest of three source positions in SRC: journal_pos,
   line_pos, and the line_pos of the token at the back of SRC's deque.
   lex_source_expand__() uses the result as the furthest position to which
   SRC's tail may advance, so that none of the bytes at or after these
   positions is discarded from the buffer. */
1234 lex_source_max_tail__ (const struct lex_source *src)
1236 const struct lex_token *token;
/* line_pos is the start of the line containing seg_pos, so it can never
   exceed seg_pos; taking line_pos therefore covers seg_pos as well. */
1239 assert (src->seg_pos >= src->line_pos);
1240 max_tail = MIN (src->journal_pos, src->line_pos);
1242 /* Use the oldest token also. (We know that src->deque cannot be empty
1243 because we are in the process of adding a new token, which is already
1244 initialized enough to use here.) */
1245 token = &src->tokens[deque_back (&src->deque, 0)];
1246 assert (token->token_pos >= token->line_pos);
1247 max_tail = MIN (max_tail, token->line_pos);
1253 lex_source_expand__ (struct lex_source *src)
1255 if (src->head - src->tail >= src->allocated)
1257 size_t max_tail = lex_source_max_tail__ (src);
1258 if (max_tail > src->tail)
1260 /* Advance the tail, freeing up room at the head. */
1261 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1262 src->head - max_tail);
1263 src->tail = max_tail;
1267 /* Buffer is completely full. Expand it. */
1268 src->buffer = x2realloc (src->buffer, &src->allocated);
1273 /* There's space available at the head of the buffer. Nothing to do. */
1278 lex_source_read__ (struct lex_source *src)
1282 lex_source_expand__ (src);
1284 size_t head_ofs = src->head - src->tail;
1285 size_t space = src->allocated - head_ofs;
1286 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1287 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1289 assert (n <= space);
1294 src->reader->eof = true;
1295 lex_source_expand__ (src);
1301 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1302 src->head - src->seg_pos));
1305 static struct lex_source *
1306 lex_source__ (const struct lexer *lexer)
1308 return (ll_is_empty (&lexer->sources) ? NULL
1309 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1312 static struct substring
1313 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1315 const struct lex_token *token0 = lex_source_next__ (src, n0);
1316 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1317 size_t start = token0->token_pos;
1318 size_t end = token1->token_pos + token1->token_len;
1320 return ss_buffer (&src->buffer[start - src->tail], end - start);
1324 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1330 assert (out_size >= 16);
1331 out_maxlen = out_size - 1;
1332 if (in.length > out_maxlen - 3)
1335 for (out_len = 0; out_len < in.length; out_len += mblen)
1337 if (in.string[out_len] == '\n'
1338 || in.string[out_len] == '\0'
1339 || (in.string[out_len] == '\r'
1340 && out_len + 1 < in.length
1341 && in.string[out_len + 1] == '\n'))
1344 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1345 in.length - out_len);
1350 if (out_len + mblen > out_maxlen)
1354 memcpy (out, in.string, out_len);
1355 strcpy (&out[out_len], out_len < in.length ? "..." : "");
1359 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1360 const char *format, va_list args)
1362 const struct lex_token *token;
1367 token = lex_source_next__ (src, n0);
1368 if (token->token.type == T_ENDCMD)
1369 ds_put_cstr (&s, _("Syntax error at end of command"));
1372 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1373 if (!ss_is_empty (syntax))
1375 char syntax_cstr[64];
1377 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1378 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1381 ds_put_cstr (&s, _("Syntax error"));
1386 ds_put_cstr (&s, ": ");
1387 ds_put_vformat (&s, format, args);
1389 if (ds_last (&s) != '.')
1390 ds_put_byte (&s, '.');
1393 .category = MSG_C_SYNTAX,
1394 .severity = MSG_S_ERROR,
1395 .file_name = src->reader->file_name,
1396 .first_line = lex_source_get_first_line_number (src, n0),
1397 .last_line = lex_source_get_last_line_number (src, n1),
1398 .first_column = lex_source_get_first_column (src, n0),
1399 .last_column = lex_source_get_last_column (src, n1),
1400 .text = ds_steal_cstr (&s),
1405 static void PRINTF_FORMAT (2, 3)
1406 lex_get_error (struct lex_source *src, const char *format, ...)
1411 va_start (args, format);
1413 n = deque_count (&src->deque) - 1;
1414 lex_source_error_valist (src, n, n, format, args);
1415 lex_source_pop_front (src);
1420 /* Attempts to append an additional token into SRC's deque, reading more from
1421 the underlying lex_reader if necessary. Returns true if successful, false
1422 if the deque already represents (a suffix of) the whole lex_reader's
1425 lex_source_get__ (const struct lex_source *src_)
1427 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1431 /* State maintained while scanning tokens. Usually we only need a single
1432 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1433 needs to be saved and possibly restored later with SCAN_BACK. */
1436 struct segmenter segmenter;
1437 enum segment_type last_segment;
1438 int newlines; /* Number of newlines encountered so far. */
1439 /* Maintained here so we can update lex_source's similar members when we
1445 /* Initialize state. */
1446 struct state state =
1448 .segmenter = src->segmenter,
1450 .seg_pos = src->seg_pos,
1451 .line_pos = src->line_pos,
1453 struct state saved = state;
1455 /* Append a new token to SRC and initialize it. */
1456 struct lex_token *token = lex_push_token__ (src);
1457 struct scanner scanner;
1458 scanner_init (&scanner, &token->token);
1459 token->line_pos = src->line_pos;
1460 token->token_pos = src->seg_pos;
1461 if (src->reader->line_number > 0)
1462 token->first_line = src->reader->line_number + src->n_newlines;
1464 token->first_line = 0;
1466 /* Extract segments and pass them through the scanner until we obtain a
1470 /* Extract a segment. */
1471 const char *segment = &src->buffer[state.seg_pos - src->tail];
1472 size_t seg_maxlen = src->head - state.seg_pos;
1473 enum segment_type type;
1474 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1475 src->reader->eof, &type);
1478 /* The segmenter needs more input to produce a segment. */
1479 assert (!src->reader->eof);
1480 lex_source_read__ (src);
1484 /* Update state based on the segment. */
1485 state.last_segment = type;
1486 state.seg_pos += seg_len;
1487 if (type == SEG_NEWLINE)
1490 state.line_pos = state.seg_pos;
1493 /* Pass the segment into the scanner and try to get a token out. */
1494 enum scan_result result = scanner_push (&scanner, type,
1495 ss_buffer (segment, seg_len),
1497 if (result == SCAN_SAVE)
1499 else if (result == SCAN_BACK)
1504 else if (result == SCAN_DONE)
1508 /* If we've reached the end of a line, or the end of a command, then pass
1509 the line to the output engine as a syntax text item. */
1510 int n_lines = state.newlines;
1511 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1514 src->suppress_next_newline = true;
1516 else if (n_lines > 0 && src->suppress_next_newline)
1519 src->suppress_next_newline = false;
1521 for (int i = 0; i < n_lines; i++)
1523 /* Beginning of line. */
1524 const char *line = &src->buffer[src->journal_pos - src->tail];
1526 /* Calculate line length, including \n or \r\n end-of-line if present.
1528 We use src->head even though that may be beyond what we've actually
1529 converted to tokens (which is only through state.line_pos). That's
1530 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1531 take the whole line through the newline, not just through the '.'. */
1532 size_t max_len = src->head - src->journal_pos;
1533 const char *newline = memchr (line, '\n', max_len);
1534 size_t line_len = newline ? newline - line + 1 : max_len;
1536 /* Calculate line length excluding end-of-line. */
1537 size_t copy_len = line_len;
1538 if (copy_len > 0 && line[copy_len - 1] == '\n')
1540 if (copy_len > 0 && line[copy_len - 1] == '\r')
1543 /* Submit the line as syntax. */
1544 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1545 xmemdup0 (line, copy_len),
1548 src->journal_pos += line_len;
1551 token->token_len = state.seg_pos - src->seg_pos;
1553 src->segmenter = state.segmenter;
1554 src->seg_pos = state.seg_pos;
1555 src->line_pos = state.line_pos;
1556 src->n_newlines += state.newlines;
1558 switch (token->token.type)
1564 token->token.type = T_ENDCMD;
1568 case SCAN_BAD_HEX_LENGTH:
1569 lex_get_error (src, _("String of hex digits has %d characters, which "
1570 "is not a multiple of 2"),
1571 (int) token->token.number);
1574 case SCAN_BAD_HEX_DIGIT:
1575 case SCAN_BAD_UNICODE_DIGIT:
1576 lex_get_error (src, _("`%c' is not a valid hex digit"),
1577 (int) token->token.number);
1580 case SCAN_BAD_UNICODE_LENGTH:
1581 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1582 "not in the valid range of 1 to 8 bytes"),
1583 (int) token->token.number);
1586 case SCAN_BAD_UNICODE_CODE_POINT:
1587 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1588 (int) token->token.number);
1591 case SCAN_EXPECTED_QUOTE:
1592 lex_get_error (src, _("Unterminated string constant"));
1595 case SCAN_EXPECTED_EXPONENT:
1596 lex_get_error (src, _("Missing exponent following `%s'"),
1597 token->token.string.string);
1600 case SCAN_UNEXPECTED_CHAR:
1603 lex_get_error (src, _("Bad character %s in input"),
1604 uc_name (token->token.number, c_name));
1609 lex_source_pop_front (src);
1617 lex_source_push_endcmd__ (struct lex_source *src)
1619 struct lex_token *token = lex_push_token__ (src);
1620 token->token.type = T_ENDCMD;
1621 token->token_pos = 0;
1622 token->token_len = 0;
1623 token->line_pos = 0;
1624 token->first_line = 0;
1627 static struct lex_source *
1628 lex_source_create (struct lex_reader *reader)
1630 struct lex_source *src;
1632 src = xzalloc (sizeof *src);
1633 src->reader = reader;
1634 segmenter_init (&src->segmenter, reader->syntax);
1635 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1637 lex_source_push_endcmd__ (src);
1643 lex_source_destroy (struct lex_source *src)
1645 char *file_name = src->reader->file_name;
1646 char *encoding = src->reader->encoding;
1647 if (src->reader->class->destroy != NULL)
1648 src->reader->class->destroy (src->reader);
1652 while (!deque_is_empty (&src->deque))
1653 lex_source_pop__ (src);
1655 ll_remove (&src->ll);
1659 struct lex_file_reader
1661 struct lex_reader reader;
1662 struct u8_istream *istream;
1665 static struct lex_reader_class lex_file_reader_class;
1667 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1668 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1669 ENCODING, which should take one of the forms accepted by
1670 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1671 mode of the new reader, respectively.
1673 Returns a null pointer if FILE_NAME cannot be opened. */
1675 lex_reader_for_file (const char *file_name, const char *encoding,
1676 enum segmenter_mode syntax,
1677 enum lex_error_mode error)
1679 struct lex_file_reader *r;
1680 struct u8_istream *istream;
1682 istream = (!strcmp(file_name, "-")
1683 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1684 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1685 if (istream == NULL)
1687 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1691 r = xmalloc (sizeof *r);
1692 lex_reader_init (&r->reader, &lex_file_reader_class);
1693 r->reader.syntax = syntax;
1694 r->reader.error = error;
1695 r->reader.file_name = xstrdup (file_name);
1696 r->reader.encoding = xstrdup_if_nonnull (encoding);
1697 r->reader.line_number = 1;
1698 r->istream = istream;
1703 static struct lex_file_reader *
1704 lex_file_reader_cast (struct lex_reader *r)
1706 return UP_CAST (r, struct lex_file_reader, reader);
1710 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1711 enum prompt_style prompt_style UNUSED)
1713 struct lex_file_reader *r = lex_file_reader_cast (r_);
1714 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1717 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1724 lex_file_close (struct lex_reader *r_)
1726 struct lex_file_reader *r = lex_file_reader_cast (r_);
1728 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1730 if (u8_istream_close (r->istream) != 0)
1731 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1734 u8_istream_free (r->istream);
1739 static struct lex_reader_class lex_file_reader_class =
1745 struct lex_string_reader
1747 struct lex_reader reader;
1752 static struct lex_reader_class lex_string_reader_class;
1754 /* Creates and returns a new lex_reader for the contents of S, which must be
1755 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1756 with ss_dealloc() when it is closed. */
1758 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1760 struct lex_string_reader *r;
1762 r = xmalloc (sizeof *r);
1763 lex_reader_init (&r->reader, &lex_string_reader_class);
1764 r->reader.syntax = SEG_MODE_AUTO;
1765 r->reader.encoding = xstrdup_if_nonnull (encoding);
1772 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1773 which must be encoded in ENCODING. The caller retains ownership of S. */
1775 lex_reader_for_string (const char *s, const char *encoding)
1777 struct substring ss;
1778 ss_alloc_substring (&ss, ss_cstr (s));
1779 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats printf()-like FORMAT with the arguments that follow ENCODING, then
   creates and returns a new lex_reader for the formatted text, which must be
   encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The reader takes ownership of TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
1797 static struct lex_string_reader *
1798 lex_string_reader_cast (struct lex_reader *r)
1800 return UP_CAST (r, struct lex_string_reader, reader);
1804 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1805 enum prompt_style prompt_style UNUSED)
1807 struct lex_string_reader *r = lex_string_reader_cast (r_);
1810 chunk = MIN (n, r->s.length - r->offset);
1811 memcpy (buf, r->s.string + r->offset, chunk);
1818 lex_string_close (struct lex_reader *r_)
1820 struct lex_string_reader *r = lex_string_reader_cast (r_);
1826 static struct lex_reader_class lex_string_reader_class =