1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/text-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* Location of token in terms of the lex_source's buffer.
65 src->tail <= line_pos <= token_pos <= src->head. */
66 size_t token_pos; /* Start of token. */
67 size_t token_len; /* Length of source for token in bytes. */
68 size_t line_pos; /* Start of line containing token_pos. */
69 int first_line; /* Line number at token_pos. */
72 /* A source of tokens, corresponding to a syntax file.
74 This is conceptually a lex_reader wrapped with everything needed to convert
75 its UTF-8 bytes into tokens. */
78 struct ll ll; /* In lexer's list of sources. */
79 struct lex_reader *reader;
80 struct segmenter segmenter;
81 bool eof; /* True if T_STOP was read from 'reader'. */
83 /* Buffer of UTF-8 bytes. */
85 size_t allocated; /* Number of bytes allocated. */
86 size_t tail; /* &buffer[0] offset into UTF-8 source. */
87 size_t head; /* &buffer[head - tail] offset into source. */
89 /* Positions in source file, tail <= pos <= head for each member here. */
90 size_t journal_pos; /* First byte not yet output to journal. */
91 size_t seg_pos; /* First byte not yet scanned as token. */
92 size_t line_pos; /* First byte of line containing seg_pos. */
94 int n_newlines; /* Number of new-lines up to seg_pos. */
95 bool suppress_next_newline;
98 struct deque deque; /* Indexes into 'tokens'. */
99 struct lex_token *tokens; /* Lookahead tokens for parser. */
102 static struct lex_source *lex_source_create (struct lex_reader *);
103 static void lex_source_destroy (struct lex_source *);
108 struct ll_list sources; /* Contains "struct lex_source"s. */
111 static struct lex_source *lex_source__ (const struct lexer *);
112 static const struct lex_token *lex_next__ (const struct lexer *, int n);
113 static void lex_source_push_endcmd__ (struct lex_source *);
115 static void lex_source_pop__ (struct lex_source *);
116 static bool lex_source_get__ (const struct lex_source *);
117 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
118 const char *format, va_list)
119 PRINTF_FORMAT (4, 0);
120 static const struct lex_token *lex_source_next__ (const struct lex_source *,
123 /* Initializes READER with the specified CLASS and otherwise some reasonable
124 defaults. The caller should fill in the other members as desired. */
126 lex_reader_init (struct lex_reader *reader,
127 const struct lex_reader_class *class)
129 reader->class = class;
130 reader->syntax = LEX_SYNTAX_AUTO;
131 reader->error = LEX_ERROR_CONTINUE;
132 reader->file_name = NULL;
133 reader->encoding = NULL;
134 reader->line_number = 0;
138 /* Frees any file name already in READER and replaces it by a copy of
139 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
141 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
143 free (reader->file_name);
144 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
147 /* Creates and returns a new lexer. */
151 struct lexer *lexer = xzalloc (sizeof *lexer);
152 ll_init (&lexer->sources);
156 /* Destroys LEXER. */
158 lex_destroy (struct lexer *lexer)
162 struct lex_source *source, *next;
164 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
165 lex_source_destroy (source);
170 /* Inserts READER into LEXER so that the next token read by LEXER comes from
171 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
174 lex_include (struct lexer *lexer, struct lex_reader *reader)
176 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
177 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
180 /* Appends READER to LEXER, so that it will be read after all other current
181 readers have already been read. */
183 lex_append (struct lexer *lexer, struct lex_reader *reader)
185 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
/* Adds a new token slot at the front of SRC's deque.  If the deque is full,
   the 'tokens' array is first enlarged with deque_expand().  The token inside
   the new slot is initialized with token_init().
   NOTE(review): the remaining lines of this function are not visible here;
   presumably the new slot is returned to the caller -- confirm. */
190 static struct lex_token *
191 lex_push_token__ (struct lex_source *src)
193 struct lex_token *token;
195 if (deque_is_full (&src->deque))
196 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
198 token = &src->tokens[deque_push_front (&src->deque)];
199 token_init (&token->token);
204 lex_source_pop__ (struct lex_source *src)
/* Remove the entry at the back of SRC's deque and destroy the token that was
   stored in it. */
206 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
210 lex_source_pop_front (struct lex_source *src)
/* Remove the entry at the front of SRC's deque (the end that
   lex_push_token__() pushes onto) and destroy the token stored in it. */
212 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
215 /* Advances LEXER to the next token, consuming the current token. */
217 lex_get (struct lexer *lexer)
219 struct lex_source *src;
221 src = lex_source__ (lexer);
225 if (!deque_is_empty (&src->deque))
226 lex_source_pop__ (src);
228 while (deque_is_empty (&src->deque))
229 if (!lex_source_get__ (src))
231 lex_source_destroy (src);
232 src = lex_source__ (lexer);
238 /* Issuing errors. */
240 /* Prints a syntax error message containing the current token and
241 the message formatted from FORMAT (if non-null). */
243 lex_error (struct lexer *lexer, const char *format, ...)
247 va_start (args, format);
248 lex_next_error_valist (lexer, 0, 0, format, args);
252 /* Prints a syntax error message containing the current token and
253 the message given by FORMAT and ARGS (if FORMAT is non-null). */
255 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
257 lex_next_error_valist (lexer, 0, 0, format, args);
260 /* Prints a syntax error message containing the syntax of tokens N0 through N1
261 after the current token and the message formatted from FORMAT (if non-null). */
263 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
267 va_start (args, format);
268 lex_next_error_valist (lexer, n0, n1, format, args);
272 /* Prints a syntax error message saying that OPTION0 or one of the other
273 strings following it, up to the first NULL, is expected. */
275 (lex_error_expecting) (struct lexer *lexer, const char *option0, ...)
277 enum { MAX_OPTIONS = 8 };
278 const char *options[MAX_OPTIONS + 1];
282 va_start (args, option0);
283 options[0] = option0;
285 while (n + 1 < MAX_OPTIONS && options[n] != NULL)
286 options[++n] = va_arg (args, const char *);
292 lex_error (lexer, NULL);
296 lex_error (lexer, _("expecting %s"), options[0]);
300 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
304 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
309 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
310 options[0], options[1], options[2], options[3]);
314 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
315 options[0], options[1], options[2], options[3], options[4]);
319 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
320 options[0], options[1], options[2], options[3], options[4],
325 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
326 options[0], options[1], options[2], options[3], options[4],
327 options[5], options[6]);
331 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
332 options[0], options[1], options[2], options[3], options[4],
333 options[5], options[6], options[7]);
341 /* Reports an error to the effect that subcommand SBC may only be specified
344 This function does not take a lexer as an argument or use lex_error(),
345 because the result would ordinarily just be redundant: "Syntax error at
346 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
347 not help the user find the error. */
349 lex_sbc_only_once (const char *sbc)
351 msg (SE, _("Subcommand %s may only be specified once."), sbc);
354 /* Reports an error to the effect that subcommand SBC is missing.
356 This function does not take a lexer as an argument or use lex_error(),
357 because a missing subcommand can normally be detected only after the whole
358 command has been parsed, and so lex_error() would always report "Syntax
359 error at end of command", which does not help the user find the error. */
361 lex_sbc_missing (const char *sbc)
363 msg (SE, _("Required subcommand %s was not specified."), sbc);
366 /* Reports an error to the effect that specification SPEC may only be specified
367 once within subcommand SBC. */
369 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
371 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
375 /* Reports an error to the effect that specification SPEC is missing within
378 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
380 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
384 /* Prints a syntax error message containing the syntax of tokens N0 through N1
385 after the current token and the message given by FORMAT and ARGS (if non-null). */
387 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
388 const char *format, va_list args)
390 struct lex_source *src = lex_source__ (lexer);
393 lex_source_error_valist (src, n0, n1, format, args);
399 ds_put_format (&s, _("Syntax error at end of input"));
402 ds_put_cstr (&s, ": ");
403 ds_put_vformat (&s, format, args);
405 ds_put_byte (&s, '.');
406 msg (SE, "%s", ds_cstr (&s));
411 /* Checks that we're at end of command.
412 If so, returns a successful command completion code.
413 If not, flags a syntax error and returns an error command
416 lex_end_of_command (struct lexer *lexer)
418 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
420 lex_error (lexer, _("expecting end of command"));
427 /* Token testing functions. */
429 /* Returns true if the current token is a number. */
431 lex_is_number (const struct lexer *lexer)
433 return lex_next_is_number (lexer, 0);
436 /* Returns true if the current token is a string. */
438 lex_is_string (const struct lexer *lexer)
440 return lex_next_is_string (lexer, 0);
443 /* Returns the value of the current token, which must be a
444 floating point number. */
446 lex_number (const struct lexer *lexer)
448 return lex_next_number (lexer, 0);
451 /* Returns true iff the current token is an integer. */
453 lex_is_integer (const struct lexer *lexer)
455 return lex_next_is_integer (lexer, 0);
458 /* Returns the value of the current token, which must be an
461 lex_integer (const struct lexer *lexer)
463 return lex_next_integer (lexer, 0);
466 /* Token testing functions with lookahead.
468 A value of 0 for N as an argument to any of these functions refers to the
469 current token. Lookahead is limited to the current command. Any N greater
470 than the number of tokens remaining in the current command will be treated
471 as referring to a T_ENDCMD token. */
473 /* Returns true if the token N ahead of the current token is a number. */
475 lex_next_is_number (const struct lexer *lexer, int n)
477 enum token_type next_token = lex_next_token (lexer, n);
478 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
481 /* Returns true if the token N ahead of the current token is a string. */
483 lex_next_is_string (const struct lexer *lexer, int n)
485 return lex_next_token (lexer, n) == T_STRING;
488 /* Returns the value of the token N ahead of the current token, which must be a
489 floating point number. */
491 lex_next_number (const struct lexer *lexer, int n)
493 assert (lex_next_is_number (lexer, n));
494 return lex_next_tokval (lexer, n);
497 /* Returns true if the token N ahead of the current token is an integer. */
499 lex_next_is_integer (const struct lexer *lexer, int n)
503 if (!lex_next_is_number (lexer, n))
506 value = lex_next_tokval (lexer, n);
507 return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
510 /* Returns the value of the token N ahead of the current token, which must be
513 lex_next_integer (const struct lexer *lexer, int n)
515 assert (lex_next_is_integer (lexer, n));
516 return lex_next_tokval (lexer, n);
519 /* Token matching functions. */
521 /* If the current token has the specified TYPE, skips it and returns true.
522 Otherwise, returns false. */
524 lex_match (struct lexer *lexer, enum token_type type)
526 if (lex_token (lexer) == type)
535 /* If the current token matches IDENTIFIER, skips it and returns true.
536 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
539 IDENTIFIER must be an ASCII string. */
541 lex_match_id (struct lexer *lexer, const char *identifier)
543 return lex_match_id_n (lexer, identifier, 3);
546 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
547 may be abbreviated to its first N letters. Otherwise, returns false.
549 IDENTIFIER must be an ASCII string. */
551 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
553 if (lex_token (lexer) == T_ID
554 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
563 /* If the current token is integer X, skips it and returns true. Otherwise,
566 lex_match_int (struct lexer *lexer, int x)
568 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
577 /* Forced matches. */
579 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
580 abbreviated to its first 3 letters. Otherwise, reports an error and returns
583 IDENTIFIER must be an ASCII string. */
585 lex_force_match_id (struct lexer *lexer, const char *identifier)
587 if (lex_match_id (lexer, identifier))
591 lex_error_expecting (lexer, identifier);
596 /* If the current token has the specified TYPE, skips it and returns true.
597 Otherwise, reports an error and returns false. */
599 lex_force_match (struct lexer *lexer, enum token_type type)
601 if (lex_token (lexer) == type)
608 const char *type_string = token_type_to_string (type);
611 char *s = xasprintf ("`%s'", type_string);
612 lex_error_expecting (lexer, s);
616 lex_error_expecting (lexer, token_type_to_name (type));
622 /* If the current token is a string, does nothing and returns true.
623 Otherwise, reports an error and returns false. */
625 lex_force_string (struct lexer *lexer)
627 if (lex_is_string (lexer))
631 lex_error (lexer, _("expecting string"));
636 /* If the current token is a string or an identifier, does nothing and returns
637 true. Otherwise, reports an error and returns false.
639 This is meant for use in syntactic situations where we want to encourage the
640 user to supply a quoted string, but for compatibility we also accept
641 identifiers. (One example of such a situation is file names.) Therefore,
642 the error message issued when the current token is wrong only says that a
643 string is expected and doesn't mention that an identifier would also be
646 lex_force_string_or_id (struct lexer *lexer)
648 return lex_token (lexer) == T_ID || lex_force_string (lexer);
651 /* If the current token is an integer, does nothing and returns true.
652 Otherwise, reports an error and returns false. */
654 lex_force_int (struct lexer *lexer)
656 if (lex_is_integer (lexer))
660 lex_error (lexer, _("expecting integer"));
665 /* If the current token is a number, does nothing and returns true.
666 Otherwise, reports an error and returns false. */
668 lex_force_num (struct lexer *lexer)
670 if (lex_is_number (lexer))
673 lex_error (lexer, _("expecting number"));
677 /* If the current token is an identifier, does nothing and returns true.
678 Otherwise, reports an error and returns false. */
680 lex_force_id (struct lexer *lexer)
682 if (lex_token (lexer) == T_ID)
685 lex_error (lexer, _("expecting identifier"));
689 /* Token accessors. */
691 /* Returns the type of LEXER's current token. */
693 lex_token (const struct lexer *lexer)
695 return lex_next_token (lexer, 0);
698 /* Returns the number in LEXER's current token.
700 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
701 tokens this function will always return zero. */
703 lex_tokval (const struct lexer *lexer)
705 return lex_next_tokval (lexer, 0);
708 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
710 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
711 this function will always return NULL.
713 The UTF-8 encoding of the returned string is correct for variable names and
714 other identifiers. Use filename_to_utf8() to use it as a filename. Use
715 data_in() to use it in a "union value". */
717 lex_tokcstr (const struct lexer *lexer)
719 return lex_next_tokcstr (lexer, 0);
722 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
723 null-terminated (but the null terminator is not included in the returned
724 substring's 'length').
726 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
727 this function will always return NULL.
729 The UTF-8 encoding of the returned string is correct for variable names and
730 other identifiers. Use filename_to_utf8() to use it as a filename. Use
731 data_in() to use it in a "union value". */
733 lex_tokss (const struct lexer *lexer)
735 return lex_next_tokss (lexer, 0);
740 A value of 0 for N as an argument to any of these functions refers to the
741 current token. Lookahead is limited to the current command. Any N greater
742 than the number of tokens remaining in the current command will be treated
743 as referring to a T_ENDCMD token. */
745 static const struct lex_token *
746 lex_next__ (const struct lexer *lexer_, int n)
748 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
749 struct lex_source *src = lex_source__ (lexer);
752 return lex_source_next__ (src, n);
755 static const struct lex_token stop_token =
756 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
762 static const struct lex_token *
763 lex_source_next__ (const struct lex_source *src, int n)
765 while (deque_count (&src->deque) <= n)
767 if (!deque_is_empty (&src->deque))
769 struct lex_token *front;
771 front = &src->tokens[deque_front (&src->deque, 0)];
772 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
776 lex_source_get__ (src);
779 return &src->tokens[deque_back (&src->deque, n)];
782 /* Returns the "struct token" of the token N after the current one in LEXER.
783 The returned pointer can be invalidated by pretty much any succeeding call
784 into the lexer, although the string pointer within the returned token is
785 only invalidated by consuming the token (e.g. with lex_get()). */
787 lex_next (const struct lexer *lexer, int n)
789 return &lex_next__ (lexer, n)->token;
792 /* Returns the type of the token N after the current one in LEXER. */
794 lex_next_token (const struct lexer *lexer, int n)
796 return lex_next (lexer, n)->type;
799 /* Returns the number in the token N after the current one in LEXER.
801 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
802 tokens this function will always return zero. */
804 lex_next_tokval (const struct lexer *lexer, int n)
806 const struct token *token = lex_next (lexer, n);
807 return token->number;
810 /* Returns the null-terminated string in the token N after the current one, in
813 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
814 this function will always return NULL.
816 The UTF-8 encoding of the returned string is correct for variable names and
817 other identifiers. Use filename_to_utf8() to use it as a filename. Use
818 data_in() to use it in a "union value". */
820 lex_next_tokcstr (const struct lexer *lexer, int n)
822 return lex_next_tokss (lexer, n).string;
825 /* Returns the string in the token N after the current one, in UTF-8 encoding.
826 The string is null-terminated (but the null terminator is not included in
827 the returned substring's 'length').
829 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
830 this function will always return NULL.
832 The UTF-8 encoding of the returned string is correct for variable names and
833 other identifiers. Use filename_to_utf8() to use it as a filename. Use
834 data_in() to use it in a "union value". */
836 lex_next_tokss (const struct lexer *lexer, int n)
838 return lex_next (lexer, n)->string;
842 lex_tokens_match (const struct token *actual, const struct token *expected)
/* The token types are compared before anything else. */
844 if (actual->type != expected->type)
/* For equal types, the kind of comparison depends on the type.  (The case
   labels are not visible in this excerpt.) */
847 switch (actual->type)
/* Numeric comparison: the two numbers must be equal. */
851 return actual->number == expected->number;
/* Identifier-style comparison through lex_id_match(), with EXPECTED's string
   passed as the first argument and ACTUAL's as the second. */
854 return lex_id_match (expected->string, actual->string);
/* Exact comparison: the strings must have the same length and identical
   bytes. */
857 return (actual->string.length == expected->string.length
858 && !memcmp (actual->string.string, expected->string.string,
859 actual->string.length));
866 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
867 skips it and returns true. Otherwise, returns false.
869 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
870 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
871 first three letters. */
873 lex_match_phrase (struct lexer *lexer, const char *s)
875 struct string_lexer slex;
880 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
881 while (string_lexer_next (&slex, &token))
882 if (token.type != SCAN_SKIP)
884 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
885 token_destroy (&token);
896 lex_source_get_first_line_number (const struct lex_source *src, int n)
898 return lex_source_next__ (src, n)->first_line;
902 count_newlines (char *s, size_t length)
907 while ((newline = memchr (s, '\n', length)) != NULL)
910 length -= (newline + 1) - s;
918 lex_source_get_last_line_number (const struct lex_source *src, int n)
920 const struct lex_token *token = lex_source_next__ (src, n);
922 if (token->first_line == 0)
926 char *token_str = &src->buffer[token->token_pos - src->tail];
927 return token->first_line + count_newlines (token_str, token->token_len) + 1;
932 count_columns (const char *s_, size_t length)
934 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
940 for (ofs = 0; ofs < length; ofs += mblen)
944 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
947 int width = uc_width (uc, "UTF-8");
952 columns = ROUND_UP (columns + 1, 8);
959 lex_source_get_first_column (const struct lex_source *src, int n)
961 const struct lex_token *token = lex_source_next__ (src, n);
/* Measure, with count_columns(), the text that runs from the start of the
   token's line (line_pos) up to the start of the token itself (token_pos).
   Both positions are offsets into the source, so src->tail is subtracted to
   index into the buffer. */
962 return count_columns (&src->buffer[token->line_pos - src->tail],
963 token->token_pos - token->line_pos);
967 lex_source_get_last_column (const struct lex_source *src, int n)
969 const struct lex_token *token = lex_source_next__ (src, n);
970 char *start, *end, *newline;
972 start = &src->buffer[token->line_pos - src->tail];
973 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
974 newline = memrchr (start, '\n', end - start);
977 return count_columns (start, end - start);
980 /* Returns the 1-based line number of the start of the syntax that represents
981 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
982 if the token is drawn from a source that does not have line numbers. */
984 lex_get_first_line_number (const struct lexer *lexer, int n)
986 const struct lex_source *src = lex_source__ (lexer);
987 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
990 /* Returns the 1-based line number of the end of the syntax that represents the
991 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
992 token or if the token is drawn from a source that does not have line
995 Most of the time, a single token is wholly within a single line of syntax,
996 but there are two exceptions: a T_STRING token can be made up of multiple
997 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
998 token can consist of a "-" on one line followed by the number on the next.
1001 lex_get_last_line_number (const struct lexer *lexer, int n)
1003 const struct lex_source *src = lex_source__ (lexer);
1004 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1007 /* Returns the 1-based column number of the start of the syntax that represents
1008 the token N after the current one in LEXER. Returns 0 for a T_STOP
1011 Column numbers are measured according to the width of characters as shown in
1012 a typical fixed-width font, in which CJK characters have width 2 and
1013 combining characters have width 0. */
1015 lex_get_first_column (const struct lexer *lexer, int n)
1017 const struct lex_source *src = lex_source__ (lexer);
1018 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1021 /* Returns the 1-based column number of the end of the syntax that represents
1022 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1025 Column numbers are measured according to the width of characters as shown in
1026 a typical fixed-width font, in which CJK characters have width 2 and
1027 combining characters have width 0. */
1029 lex_get_last_column (const struct lexer *lexer, int n)
1031 const struct lex_source *src = lex_source__ (lexer);
1032 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1035 /* Returns the name of the syntax file from which the current command is drawn.
1036 Returns NULL for a T_STOP token or if the command's source does not have
1039 There is no version of this function that takes an N argument because
1040 lookahead only works to the end of a command and any given command is always
1041 within a single syntax file. */
1043 lex_get_file_name (const struct lexer *lexer)
1045 struct lex_source *src = lex_source__ (lexer);
1046 return src == NULL ? NULL : src->reader->file_name;
1050 lex_get_encoding (const struct lexer *lexer)
1052 struct lex_source *src = lex_source__ (lexer);
/* The result is the 'encoding' member of the current source's reader, or NULL
   when LEXER has no current source. */
1053 return src == NULL ? NULL : src->reader->encoding;
1057 /* Returns the syntax mode for the syntax file from which the current command is
1058 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1059 source does not have line numbers.
1061 There is no version of this function that takes an N argument because
1062 lookahead only works to the end of a command and any given command is always
1063 within a single syntax file. */
1064 enum lex_syntax_mode
1065 lex_get_syntax_mode (const struct lexer *lexer)
1067 struct lex_source *src = lex_source__ (lexer);
1068 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1071 /* Returns the error mode for the syntax file from which the current command is
1072 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1073 source does not have line numbers.
1075 There is no version of this function that takes an N argument because
1076 lookahead only works to the end of a command and any given command is always
1077 within a single syntax file. */
1079 lex_get_error_mode (const struct lexer *lexer)
1081 struct lex_source *src = lex_source__ (lexer);
1082 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1085 /* If the source that LEXER is currently reading has error mode
1086 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1087 token to be read comes directly from whatever is next read from the stream.
1089 It makes sense to call this function after encountering an error in a
1090 command entered on the console, because usually the user would prefer not to
1091 have cascading errors. */
1093 lex_interactive_reset (struct lexer *lexer)
1095 struct lex_source *src = lex_source__ (lexer);
1096 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1098 src->head = src->tail = 0;
1099 src->journal_pos = src->seg_pos = src->line_pos = 0;
1100 src->n_newlines = 0;
1101 src->suppress_next_newline = false;
1102 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1103 while (!deque_is_empty (&src->deque))
1104 lex_source_pop__ (src);
1105 lex_source_push_endcmd__ (src);
1109 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1111 lex_discard_rest_of_command (struct lexer *lexer)
1113 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1117 /* Discards all lookahead tokens in LEXER, then discards all input sources
1118 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1119 runs out of input sources. */
1121 lex_discard_noninteractive (struct lexer *lexer)
1123 struct lex_source *src = lex_source__ (lexer);
1127 while (!deque_is_empty (&src->deque))
1128 lex_source_pop__ (src);
1130 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1131 src = lex_source__ (lexer))
1132 lex_source_destroy (src);
1137 lex_source_max_tail__ (const struct lex_source *src)
1139 const struct lex_token *token;
1142 assert (src->seg_pos >= src->line_pos);
/* Start from the earlier of the journal position and the start of the line
   that contains seg_pos.  lex_source_expand__() uses the result as the
   position to which the buffer's tail may be advanced. */
1143 max_tail = MIN (src->journal_pos, src->line_pos);
1145 /* Use the oldest token also. (We know that src->deque cannot be empty
1146 because we are in the process of adding a new token, which is already
1147 initialized enough to use here.) */
1148 token = &src->tokens[deque_back (&src->deque, 0)];
1149 assert (token->token_pos >= token->line_pos);
/* Lower the limit to the start of the line that holds that token. */
1150 max_tail = MIN (max_tail, token->line_pos);
1156 lex_source_expand__ (struct lex_source *src)
1158 if (src->head - src->tail >= src->allocated)
1160 size_t max_tail = lex_source_max_tail__ (src);
1161 if (max_tail > src->tail)
1163 /* Advance the tail, freeing up room at the head. */
1164 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1165 src->head - max_tail);
1166 src->tail = max_tail;
1170 /* Buffer is completely full. Expand it. */
1171 src->buffer = x2realloc (src->buffer, &src->allocated);
1176 /* There's space available at the head of the buffer. Nothing to do. */
1181 lex_source_read__ (struct lex_source *src)
1185 lex_source_expand__ (src);
1187 size_t head_ofs = src->head - src->tail;
1188 size_t space = src->allocated - head_ofs;
1189 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1190 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1192 assert (n <= space);
1197 src->reader->eof = true;
1198 lex_source_expand__ (src);
1204 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1205 src->head - src->seg_pos));
/* Returns the lex_source at the head of LEXER's list of sources, or NULL if
   the list is empty. */
1208 static struct lex_source *
1209 lex_source__ (const struct lexer *lexer)
1211 return (ll_is_empty (&lexer->sources) ? NULL
1212 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns the source text in SRC that runs from the first byte of token N0
   through the last byte of token MAX (N0, N1), so an N1 smaller than N0 is
   treated as N0.  The substring is built with ss_buffer() directly on SRC's
   buffer, so it points into that buffer rather than at a copy. */
1215 static struct substring
1216 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1218 const struct lex_token *token0 = lex_source_next__ (src, n0);
1219 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1220 size_t start = token0->token_pos;
1221 size_t end = token1->token_pos + token1->token_len;
/* Token positions are offsets into the source; subtract src->tail to index
   into the buffer. */
1223 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Copies a prefix of IN into OUT, a buffer of OUT_SIZE bytes (asserted to be
   at least 16), as a null-terminated string, and appends three dots when the
   copied prefix is shorter than IN.  The scan steps through IN one UTF-8
   character at a time using u8_mblen() and tests for a line feed, a null
   byte, a carriage return followed by a line feed, and for exceeding the
   output limit.
   NOTE(review): the statements controlled by the visible tests are elided
   from this excerpt; presumably each one ends the scan -- confirm. */
1227 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1233 assert (out_size >= 16);
/* One byte of OUT is held back for the null terminator. */
1234 out_maxlen = out_size - 1;
/* NOTE(review): the body of this test is not visible; presumably it
   reserves room for the three-dot suffix when IN may not fit -- confirm. */
1235 if (in.length > out_maxlen - 3)
1238 for (out_len = 0; out_len < in.length; out_len += mblen)
1240 if (in.string[out_len] == '\n'
1241 || in.string[out_len] == '\0'
1242 || (in.string[out_len] == '\r'
1243 && out_len + 1 < in.length
1244 && in.string[out_len + 1] == '\n'))
/* Length in bytes of the character at out_len, so that the copy never ends
   in the middle of a multibyte sequence. */
1247 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1248 in.length - out_len);
1253 if (out_len + mblen > out_maxlen)
/* out_len is now the number of bytes to keep.  The suffix is added only if
   some of IN was left out. */
1257 memcpy (out, in.string, out_len);
1258 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Builds a syntax error message about tokens N0 through N1 of SRC.  The text
   starts with a description of where the error is: at end of command when
   token N0 is T_ENDCMD, otherwise at the (ellipsized) source text of the
   tokens, or a bare syntax error notice when that text is empty.  FORMAT,
   formatted with ARGS, is appended after a colon.  The message structure is
   then filled in with category MSG_C_SYNTAX, severity MSG_S_ERROR, the file
   name of the reader, and the line and column range of the tokens.
   NOTE(review): the tests guarding the colon, the formatted text and the
   final period, and the call that consumes the message structure, are
   elided from this excerpt -- confirm against the full source. */
1262 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1263 const char *format, va_list args)
1265 const struct lex_token *token;
1270 token = lex_source_next__ (src, n0);
1271 if (token->token.type == T_ENDCMD)
1272 ds_put_cstr (&s, _("Syntax error at end of command"));
1275 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1276 if (!ss_is_empty (syntax))
/* The quoted source text is limited to what fits in 64 bytes, including the
   terminator; lex_ellipsize__() shortens anything longer. */
1278 char syntax_cstr[64];
1280 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1281 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1284 ds_put_cstr (&s, _("Syntax error"));
1289 ds_put_cstr (&s, ": ");
1290 ds_put_vformat (&s, format, args);
1292 ds_put_byte (&s, '.');
/* The location starts at token N0 and ends at token N1.  The text is taken
   out of the dynamic string S with ds_steal_cstr(). */
1295 .category = MSG_C_SYNTAX,
1296 .severity = MSG_S_ERROR,
1297 .file_name = src->reader->file_name,
1298 .first_line = lex_source_get_first_line_number (src, n0),
1299 .last_line = lex_source_get_last_line_number (src, n1),
1300 .first_column = lex_source_get_first_column (src, n0),
1301 .last_column = lex_source_get_last_column (src, n1),
1302 .text = ds_steal_cstr (&s),
/* Reports an error, formatted printf-style from FORMAT and the following
   arguments, against the token at index deque_count - 1 of the deque in SRC,
   and then calls lex_source_pop_front() on SRC.
   NOTE(review): presumably that index designates the token most recently
   pushed, which pop_front then discards -- confirm against
   lex_push_token__() and lex_source_pop_front(). */
1307 static void PRINTF_FORMAT (2, 3)
1308 lex_get_error (struct lex_source *src, const char *format, ...)
1313 va_start (args, format);
1315 n = deque_count (&src->deque) - 1;
/* The same index is passed as both ends of the range, so the error covers
   exactly one token. */
1316 lex_source_error_valist (src, n, n, format, args);
1317 lex_source_pop_front (src);
1322 /* Attempts to append an additional token into SRC's deque, reading more from
1323 the underlying lex_reader if necessary. Returns true if successful, false
1324 if the deque already represents (a suffix of) the whole lex_reader's
1327 lex_source_get__ (const struct lex_source *src_)
1329 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1333 /* State maintained while scanning tokens. Usually we only need a single
1334 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1335 needs to be saved and possibly restored later with SCAN_BACK. */
/* (Overview of the whole function, from its visible steps.)  Although SRC_
   is declared const, the function modifies it through the CONST_CAST above.
   It copies the segmenter and position state out of SRC, pushes a new token,
   feeds segments from the buffer through the scanner (reading more input
   when the segmenter needs it), submits completed lines as TEXT_ITEM_SYNTAX
   text items, writes the scan state back to SRC, and finally turns scanner
   error token types into messages with lex_get_error(). */
1338 struct segmenter segmenter;
1339 enum segment_type last_segment;
1340 int newlines; /* Number of newlines encountered so far. */
1341 /* Maintained here so we can update lex_source's similar members when we
1347 /* Initialize state. */
1348 struct state state =
1350 .segmenter = src->segmenter,
1352 .seg_pos = src->seg_pos,
1353 .line_pos = src->line_pos,
1355 struct state saved = state;
1357 /* Append a new token to SRC and initialize it. */
1358 struct lex_token *token = lex_push_token__ (src);
1359 struct scanner scanner;
1360 scanner_init (&scanner, &token->token);
1361 token->line_pos = src->line_pos;
1362 token->token_pos = src->seg_pos;
/* A positive reader line_number yields a token line number of that value
   plus the new-lines consumed so far; the other visible assignment sets the
   token line number to 0. */
1363 if (src->reader->line_number > 0)
1364 token->first_line = src->reader->line_number + src->n_newlines;
1366 token->first_line = 0;
1368 /* Extract segments and pass them through the scanner until we obtain a
1372 /* Extract a segment. */
1373 const char *segment = &src->buffer[state.seg_pos - src->tail];
1374 size_t seg_maxlen = src->head - state.seg_pos;
1375 enum segment_type type;
1376 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1377 src->reader->eof, &type);
1380 /* The segmenter needs more input to produce a segment. */
1381 assert (!src->reader->eof);
1382 lex_source_read__ (src);
1386 /* Update state based on the segment. */
1387 state.last_segment = type;
1388 state.seg_pos += seg_len;
1389 if (type == SEG_NEWLINE)
1392 state.line_pos = state.seg_pos;
1395 /* Pass the segment into the scanner and try to get a token out. */
1396 enum scan_result result = scanner_push (&scanner, type,
1397 ss_buffer (segment, seg_len),
1399 if (result == SCAN_SAVE)
1401 else if (result == SCAN_BACK)
1406 else if (result == SCAN_DONE)
1410 /* If we've reached the end of a line, or the end of a command, then pass
1411 the line to the output engine as a syntax text item. */
1412 int n_lines = state.newlines;
1413 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1416 src->suppress_next_newline = true;
1418 else if (n_lines > 0 && src->suppress_next_newline)
1421 src->suppress_next_newline = false;
1423 for (int i = 0; i < n_lines; i++)
1425 /* Beginning of line. */
1426 const char *line = &src->buffer[src->journal_pos - src->tail];
1428 /* Calculate line length, including \n or \r\n end-of-line if present.
1430 We use src->head even though that may be beyond what we've actually
1431 converted to tokens (which is only through state.line_pos). That's
1432 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1433 take the whole line through the newline, not just through the '.'. */
1434 size_t max_len = src->head - src->journal_pos;
1435 const char *newline = memchr (line, '\n', max_len);
1436 size_t line_len = newline ? newline - line + 1 : max_len;
1438 /* Calculate line length excluding end-of-line. */
1439 size_t copy_len = line_len;
1440 if (copy_len > 0 && line[copy_len - 1] == '\n')
1442 if (copy_len > 0 && line[copy_len - 1] == '\r')
1445 /* Submit the line as syntax. */
1446 text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1447 xmemdup0 (line, copy_len)));
1449 src->journal_pos += line_len;
/* The token spans everything scanned since the segment position that SRC
   had on entry. */
1452 token->token_len = state.seg_pos - src->seg_pos;
/* Write the working scan state back into SRC. */
1454 src->segmenter = state.segmenter;
1455 src->seg_pos = state.seg_pos;
1456 src->line_pos = state.line_pos;
1457 src->n_newlines += state.newlines;
/* Scanner error token types are reported with lex_get_error().  Each message
   takes its detail from token->token.number or token->token.string. */
1459 switch (token->token.type)
1465 token->token.type = T_ENDCMD;
1469 case SCAN_BAD_HEX_LENGTH:
1470 lex_get_error (src, _("String of hex digits has %d characters, which "
1471 "is not a multiple of 2"),
1472 (int) token->token.number);
1475 case SCAN_BAD_HEX_DIGIT:
1476 case SCAN_BAD_UNICODE_DIGIT:
1477 lex_get_error (src, _("`%c' is not a valid hex digit"),
1478 (int) token->token.number);
1481 case SCAN_BAD_UNICODE_LENGTH:
1482 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1483 "not in the valid range of 1 to 8 bytes"),
1484 (int) token->token.number);
1487 case SCAN_BAD_UNICODE_CODE_POINT:
1488 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1489 (int) token->token.number);
1492 case SCAN_EXPECTED_QUOTE:
1493 lex_get_error (src, _("Unterminated string constant"));
1496 case SCAN_EXPECTED_EXPONENT:
1497 lex_get_error (src, _("Missing exponent following `%s'"),
1498 token->token.string.string);
1501 case SCAN_UNEXPECTED_DOT:
1502 lex_get_error (src, _("Unexpected `.' in middle of command"));
1505 case SCAN_UNEXPECTED_CHAR:
1508 lex_get_error (src, _("Bad character %s in input"),
1509 uc_name (token->token.number, c_name));
1514 lex_source_pop_front (src);
/* Pushes a new token onto SRC with lex_push_token__() and makes it a
   T_ENDCMD token whose position, length, line position and line number are
   all zero. */
1522 lex_source_push_endcmd__ (struct lex_source *src)
1524 struct lex_token *token = lex_push_token__ (src);
1525 token->token.type = T_ENDCMD;
1526 token->token_pos = 0;
1527 token->token_len = 0;
1528 token->line_pos = 0;
1529 token->first_line = 0;
/* Allocates a zero-filled lex_source that reads from READER, initializes its
   segmenter in the mode that corresponds to READER->syntax, sets up its
   token deque, and pushes an initial T_ENDCMD token with
   lex_source_push_endcmd__().
   NOTE(review): the final branch of the mode selection and the return
   statement are elided from this excerpt; presumably the new source is
   returned -- confirm. */
1532 static struct lex_source *
1533 lex_source_create (struct lex_reader *reader)
1535 struct lex_source *src;
1536 enum segmenter_mode mode;
1538 src = xzalloc (sizeof *src);
1539 src->reader = reader;
/* Translate the syntax mode of the reader into the equivalent segmenter
   mode. */
1541 if (reader->syntax == LEX_SYNTAX_AUTO)
1542 mode = SEG_MODE_AUTO;
1543 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1544 mode = SEG_MODE_INTERACTIVE;
1545 else if (reader->syntax == LEX_SYNTAX_BATCH)
1546 mode = SEG_MODE_BATCH;
1549 segmenter_init (&src->segmenter, mode);
/* NOTE(review): the 4 is presumably the initial capacity of the deque, in
   elements of sizeof *src->tokens bytes -- confirm against deque_init(). */
1551 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1553 lex_source_push_endcmd__ (src);
/* Tears down SRC: calls the destroy callback of the reader class if there is
   one, pops every token that remains in the deque with lex_source_pop__(),
   and unlinks SRC from its list with ll_remove().
   NOTE(review): statements that release the remaining storage are elided
   from this excerpt -- confirm against the full source. */
1559 lex_source_destroy (struct lex_source *src)
/* These pointers are read out of the reader before the destroy callback
   runs.  NOTE(review): their later use is not visible here; presumably they
   are freed afterward -- confirm. */
1561 char *file_name = src->reader->file_name;
1562 char *encoding = src->reader->encoding;
1563 if (src->reader->class->destroy != NULL)
1564 src->reader->class->destroy (src->reader);
1568 while (!deque_is_empty (&src->deque))
1569 lex_source_pop__ (src);
1571 ll_remove (&src->ll);
/* A lex_reader that takes its input from a u8_istream. */
1575 struct lex_file_reader
/* Embedded base structure; lex_file_reader_cast() recovers the enclosing
   lex_file_reader from a pointer to this member. */
1577 struct lex_reader reader;
/* Stream that lex_file_read() reads from and lex_file_close() releases. */
1578 struct u8_istream *istream;
/* Declared ahead of its definition because lex_reader_for_file() takes its
   address. */
1581 static struct lex_reader_class lex_file_reader_class;
1583 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1584 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1585 ENCODING, which should take one of the forms accepted by
1586 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1587 mode of the new reader, respectively.
1589 Returns a null pointer if FILE_NAME cannot be opened. */
1591 lex_reader_for_file (const char *file_name, const char *encoding,
1592 enum lex_syntax_mode syntax,
1593 enum lex_error_mode error)
1595 struct lex_file_reader *r;
1596 struct u8_istream *istream;
/* Standard input is wrapped by file descriptor; any other name is opened
   read-only. */
1598 istream = (!strcmp(file_name, "-")
1599 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1600 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1601 if (istream == NULL)
/* The failure is reported with the errno text before the function gives
   up. */
1603 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
/* The reader keeps its own copies of FILE_NAME and ENCODING (ENCODING may be
   null) and starts counting lines at 1. */
1607 r = xmalloc (sizeof *r);
1608 lex_reader_init (&r->reader, &lex_file_reader_class);
1609 r->reader.syntax = syntax;
1610 r->reader.error = error;
1611 r->reader.file_name = xstrdup (file_name);
1612 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1613 r->reader.line_number = 1;
1614 r->istream = istream;
/* Returns the lex_file_reader that contains R as its reader member, using
   UP_CAST. */
1619 static struct lex_file_reader *
1620 lex_file_reader_cast (struct lex_reader *r)
1622 return UP_CAST (r, struct lex_file_reader, reader);
/* Reads up to N bytes from the stream of file reader R_ into BUF with
   u8_istream_read().  PROMPT_STYLE is unused.  A read failure is reported
   with msg(), naming the file and the errno text.
   NOTE(review): the test that selects the error path and the return
   statements are elided from this excerpt; presumably the byte count is
   returned on success -- confirm. */
1626 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1627 enum prompt_style prompt_style UNUSED)
1629 struct lex_file_reader *r = lex_file_reader_cast (r_);
1630 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1633 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Releases the stream of file reader R_.  A stream that is not on
   STDIN_FILENO is closed with u8_istream_close(), and a failure to close is
   reported with msg().
   NOTE(review): u8_istream_free() is called in a branch whose keyword is
   elided from this excerpt; presumably it is the standard input case, which
   frees the stream without closing the descriptor -- confirm. */
1640 lex_file_close (struct lex_reader *r_)
1642 struct lex_file_reader *r = lex_file_reader_cast (r_);
1644 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1646 if (u8_istream_close (r->istream) != 0)
1647 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1650 u8_istream_free (r->istream);
/* Reader class that lex_reader_for_file() installs in the readers that it
   creates.
   NOTE(review): the initializer is elided from this excerpt; presumably it
   names lex_file_read() and lex_file_close() -- confirm. */
1655 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that takes its input from a string held in memory.
   NOTE(review): only the reader member is visible in this excerpt;
   lex_string_read() also uses members named s (a substring) and offset. */
1661 struct lex_string_reader
/* Embedded base structure; lex_string_reader_cast() recovers the enclosing
   lex_string_reader from a pointer to this member. */
1663 struct lex_reader reader;
/* Declared ahead of its definition because
   lex_reader_for_substring_nocopy() takes its address. */
1668 static struct lex_reader_class lex_string_reader_class;
1670 /* Creates and returns a new lex_reader for the contents of S, which must be
1671 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1672 with ss_dealloc() when it is closed. */
1674 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1676 struct lex_string_reader *r;
/* The reader always uses LEX_SYNTAX_AUTO and keeps a private copy of
   ENCODING, which may be null.
   NOTE(review): the statements that store S and return the reader are
   elided from this excerpt -- confirm against the full source. */
1678 r = xmalloc (sizeof *r);
1679 lex_reader_init (&r->reader, &lex_string_reader_class);
1680 r->reader.syntax = LEX_SYNTAX_AUTO;
1681 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1688 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1689 which must be encoded in ENCODING. The caller retains ownership of S. */
1691 lex_reader_for_string (const char *s, const char *encoding)
/* SS receives the copy of S, and that copy is what gets handed over to the
   new reader. */
1693 struct substring ss;
1694 ss_alloc_substring (&ss, ss_cstr (s));
1695 return lex_reader_for_substring_nocopy (ss, encoding);
1698 /* Formats FORMAT as a printf()-like format string and creates and returns a
1699 new lex_reader for the formatted result. */
1701 lex_reader_for_format (const char *format, const char *encoding, ...)
1703 struct lex_reader *r;
/* The variable arguments follow ENCODING, not FORMAT.  The string produced
   by xvasprintf() goes to the nocopy constructor, so the new reader becomes
   its owner. */
1706 va_start (args, encoding);
1707 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
/* Returns the lex_string_reader that contains R as its reader member, using
   UP_CAST. */
1713 static struct lex_string_reader *
1714 lex_string_reader_cast (struct lex_reader *r)
1716 return UP_CAST (r, struct lex_string_reader, reader);
/* Copies the next part of the string held by string reader R_ into BUF.
   PROMPT_STYLE is unused.
   NOTE(review): the statements that advance the offset and return are elided
   from this excerpt; presumably the offset moves forward by the amount
   copied and that amount is returned -- confirm. */
1720 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1721 enum prompt_style prompt_style UNUSED)
1723 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* The amount copied is limited both by the N bytes of room in BUF and by
   what is left of the string after the current offset. */
1726 chunk = MIN (n, r->s.length - r->offset);
1727 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string reader R_.
   NOTE(review): only the downcast is visible in this excerpt.  According to
   the comment on lex_reader_for_substring_nocopy(), the string is freed with
   ss_dealloc() when the reader is closed; presumably that happens here --
   confirm. */
1734 lex_string_close (struct lex_reader *r_)
1736 struct lex_string_reader *r = lex_string_reader_cast (r_);
1742 static struct lex_reader_class lex_string_reader_class =