1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "data/file-name.h"
34 #include "language/command.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/text-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* Location of token in terms of the lex_source's buffer.
66 src->tail <= line_pos <= token_pos <= src->head. */
67 size_t token_pos; /* Start of token. */
68 size_t token_len; /* Length of source for token in bytes. */
69 size_t line_pos; /* Start of line containing token_pos. */
70 int first_line; /* Line number at token_pos. */
73 /* A source of tokens, corresponding to a syntax file.
75 This is conceptually a lex_reader wrapped with everything needed to convert
76 its UTF-8 bytes into tokens. */
79 struct ll ll; /* In lexer's list of sources. */
80 struct lex_reader *reader;
81 struct segmenter segmenter;
82 bool eof; /* True if T_STOP was read from 'reader'. */
84 /* Buffer of UTF-8 bytes. */
86 size_t allocated; /* Number of bytes allocated. */
87 size_t tail; /* &buffer[0] offset into UTF-8 source. */
88 size_t head; /* &buffer[head - tail] offset into source. */
90 /* Positions in source file, tail <= pos <= head for each member here. */
91 size_t journal_pos; /* First byte not yet output to journal. */
92 size_t seg_pos; /* First byte not yet scanned as token. */
93 size_t line_pos; /* First byte of line containing seg_pos. */
95 int n_newlines; /* Number of new-lines up to seg_pos. */
96 bool suppress_next_newline;
99 struct deque deque; /* Indexes into 'tokens'. */
100 struct lex_token *tokens; /* Lookahead tokens for parser. */
103 static struct lex_source *lex_source_create (struct lex_reader *);
104 static void lex_source_destroy (struct lex_source *);
109 struct ll_list sources; /* Contains "struct lex_source"s. */
112 static struct lex_source *lex_source__ (const struct lexer *);
113 static const struct lex_token *lex_next__ (const struct lexer *, int n);
114 static void lex_source_push_endcmd__ (struct lex_source *);
116 static void lex_source_pop__ (struct lex_source *);
117 static bool lex_source_get__ (const struct lex_source *);
118 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
119 const char *format, va_list)
120 PRINTF_FORMAT (4, 0);
121 static const struct lex_token *lex_source_next__ (const struct lex_source *,
124 /* Initializes READER with the specified CLASS and otherwise some reasonable
125 defaults. The caller should fill in the others members as desired. */
127 lex_reader_init (struct lex_reader *reader,
128 const struct lex_reader_class *class)
130 reader->class = class;
131 reader->syntax = LEX_SYNTAX_AUTO;
132 reader->error = LEX_ERROR_INTERACTIVE;
133 reader->file_name = NULL;
134 reader->line_number = 0;
137 /* Frees any file name already in READER and replaces it by a copy of
138 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
142 free (reader->file_name);
143 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
146 /* Creates and returns a new lexer. */
150 struct lexer *lexer = xzalloc (sizeof *lexer);
151 ll_init (&lexer->sources);
155 /* Destroys LEXER. */
157 lex_destroy (struct lexer *lexer)
161 struct lex_source *source, *next;
163 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
164 lex_source_destroy (source);
169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
170 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
173 lex_include (struct lexer *lexer, struct lex_reader *reader)
175 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
176 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
179 /* Appends READER to LEXER, so that it will be read after all other current
180 readers have already been read. */
182 lex_append (struct lexer *lexer, struct lex_reader *reader)
184 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
189 static struct lex_token *
190 lex_push_token__ (struct lex_source *src)
192 struct lex_token *token;
194 if (deque_is_full (&src->deque))
195 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
197 token = &src->tokens[deque_push_front (&src->deque)];
198 token_init (&token->token);
203 lex_source_pop__ (struct lex_source *src)
205 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
209 lex_source_pop_front (struct lex_source *src)
211 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
214 /* Advances LEXER to the next token, consuming the current token. */
216 lex_get (struct lexer *lexer)
218 struct lex_source *src;
220 src = lex_source__ (lexer);
224 if (!deque_is_empty (&src->deque))
225 lex_source_pop__ (src);
227 while (deque_is_empty (&src->deque))
228 if (!lex_source_get__ (src))
230 lex_source_destroy (src);
231 src = lex_source__ (lexer);
237 /* Issuing errors. */
/* Reports a syntax error at the current token.  FORMAT, if non-null, is a
   printf-style template for additional message text, consuming the
   arguments that follow it. */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, 0, 0, format, ap);
  va_end (ap);
}
/* Reports a syntax error at the current token.  FORMAT, if non-null, is a
   printf-style template for additional message text, with its arguments
   taken from ARGS. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  const int current = 0;

  lex_next_error_valist (lexer, current, current, format, args);
}
/* Reports a syntax error covering the tokens N0 through N1 ahead of the
   current one.  FORMAT, if non-null, is a printf-style template for
   additional message text, consuming the arguments that follow it. */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, n0, n1, format, ap);
  va_end (ap);
}
/* Prints a syntax error message saying that OPTION0 or one of the other
   strings following it, up to the first NULL, is expected.

   At most MAX_OPTIONS (8) strings are reported; any beyond that are ignored.
   If OPTION0 itself is null, a generic syntax error is reported. */
void
lex_error_expecting (struct lexer *lexer, const char *option0, ...)
{
  enum { MAX_OPTIONS = 8 };
  const char *options[MAX_OPTIONS + 1];
  va_list args;
  int n;

  /* Collect the options.  On exit, N is the number of non-null options
     gathered (at most MAX_OPTIONS).  The array has one extra slot so that the
     terminating NULL (or a ninth, ignored option) can be stored after eight
     real options. */
  va_start (args, option0);
  options[0] = option0;
  n = 0;
  while (n < MAX_OPTIONS && options[n] != NULL)
    options[++n] = va_arg (args, const char *);
  va_end (args);

  /* Each count needs its own complete sentence so that it can be
     translated as a unit. */
  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      break;

    case 1:
      lex_error (lexer, _("expecting %s"), options[0]);
      break;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
      break;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
                 options[2]);
      break;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3]);
      break;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4]);
      break;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5]);
      break;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6]);
      break;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6], options[7]);
      break;

    default:
      NOT_REACHED ();
    }
}
340 /* Reports an error to the effect that subcommand SBC may only be
343 lex_sbc_only_once (const char *sbc)
345 msg (SE, _("Subcommand %s may only be specified once."), sbc);
/* Reports a syntax error at LEXER's current token to the effect that the
   required subcommand SBC is missing. */
void
lex_sbc_missing (struct lexer *lexer, const char *sbc)
{
  const char *subcommand = sbc;

  lex_error (lexer, _("missing required subcommand %s"), subcommand);
}
356 /* Prints a syntax error message containing the current token and
357 given message MESSAGE (if non-null). */
359 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
360 const char *format, va_list args)
362 struct lex_source *src = lex_source__ (lexer);
365 lex_source_error_valist (src, n0, n1, format, args);
371 ds_put_format (&s, _("Syntax error at end of input"));
374 ds_put_cstr (&s, ": ");
375 ds_put_vformat (&s, format, args);
377 ds_put_byte (&s, '.');
378 msg (SE, "%s", ds_cstr (&s));
383 /* Checks that we're at end of command.
384 If so, returns a successful command completion code.
385 If not, flags a syntax error and returns an error command
388 lex_end_of_command (struct lexer *lexer)
390 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
392 lex_error (lexer, _("expecting end of command"));
399 /* Token testing functions. */
/* Returns true if LEXER's current token is a number. */
bool
lex_is_number (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_number (lexer, current);
}
/* Returns true if LEXER's current token is a string. */
bool
lex_is_string (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_string (lexer, current);
}
/* Returns the value of LEXER's current token, which must be a floating
   point number. */
double
lex_number (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_number (lexer, current);
}
/* Returns true iff LEXER's current token is an integer. */
bool
lex_is_integer (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_integer (lexer, current);
}
/* Returns the value of LEXER's current token, which must be an integer. */
long
lex_integer (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_integer (lexer, current);
}
438 /* Token testing functions with lookahead.
440 A value of 0 for N as an argument to any of these functions refers to the
441 current token. Lookahead is limited to the current command. Any N greater
442 than the number of tokens remaining in the current command will be treated
443 as referring to a T_ENDCMD token. */
445 /* Returns true if the token N ahead of the current token is a number. */
447 lex_next_is_number (struct lexer *lexer, int n)
449 enum token_type next_token = lex_next_token (lexer, n);
450 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
453 /* Returns true if the token N ahead of the current token is a string. */
455 lex_next_is_string (struct lexer *lexer, int n)
457 return lex_next_token (lexer, n) == T_STRING;
/* Returns the value of the token N ahead of the current token, which must
   be a floating point number (this is asserted). */
double
lex_next_number (struct lexer *lexer, int n)
{
  double value;

  assert (lex_next_is_number (lexer, n));
  value = lex_next_tokval (lexer, n);
  return value;
}
/* Returns true if the token N ahead of the current token is an integer, that
   is, a number with no fractional part whose value can be converted to
   "long" without overflow.  (LONG_MIN itself is not accepted.) */
bool
lex_next_is_integer (struct lexer *lexer, int n)
{
  double value;

  if (!lex_next_is_number (lexer, n))
    return false;

  value = lex_next_tokval (lexer, n);

  /* The upper bound is written as "< -LONG_MIN" rather than "<= LONG_MAX"
     because LONG_MAX is not exactly representable as a double when "long" is
     64 bits wide: it rounds up to 2**63, which would then be accepted even
     though converting it to "long" overflows.  -LONG_MIN is a power of 2 and
     therefore exact. */
  return (value > LONG_MIN
          && value < -(double) LONG_MIN
          && floor (value) == value);
}
/* Returns the value of the token N ahead of the current token, which must
   be an integer (this is asserted). */
long
lex_next_integer (struct lexer *lexer, int n)
{
  long value;

  assert (lex_next_is_integer (lexer, n));
  value = lex_next_tokval (lexer, n);
  return value;
}
491 /* Token matching functions. */
493 /* If the current token has the specified TYPE, skips it and returns true.
494 Otherwise, returns false. */
496 lex_match (struct lexer *lexer, enum token_type type)
498 if (lex_token (lexer) == type)
/* If the current token matches IDENTIFIER, skips it and returns true.
   IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
   returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  enum { MIN_ABBREV = 3 };

  return lex_match_id_n (lexer, identifier, MIN_ABBREV);
}
518 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
519 may be abbreviated to its first N letters. Otherwise, returns false.
521 IDENTIFIER must be an ASCII string. */
523 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
525 if (lex_token (lexer) == T_ID
526 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
/* If the current token is integer X, skips it and returns true.  Otherwise,
   returns false. */
bool
lex_match_int (struct lexer *lexer, int x)
{
  if (!lex_is_integer (lexer) || lex_integer (lexer) != x)
    return false;

  lex_get (lexer);
  return true;
}
549 /* Forced matches. */
551 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
552 abbreviated to its first 3 letters. Otherwise, reports an error and returns
555 IDENTIFIER must be an ASCII string. */
557 lex_force_match_id (struct lexer *lexer, const char *identifier)
559 if (lex_match_id (lexer, identifier))
563 lex_error_expecting (lexer, identifier, NULL_SENTINEL);
568 /* If the current token has the specified TYPE, skips it and returns true.
569 Otherwise, reports an error and returns false. */
571 lex_force_match (struct lexer *lexer, enum token_type type)
573 if (lex_token (lexer) == type)
580 char *s = xasprintf ("`%s'", token_type_to_string (type));
581 lex_error_expecting (lexer, s, NULL_SENTINEL);
/* If the current token is a string, does nothing and returns true.
   Otherwise, reports an error and returns false. */
bool
lex_force_string (struct lexer *lexer)
{
  bool ok = lex_is_string (lexer);

  if (!ok)
    lex_error (lexer, _("expecting string"));
  return ok;
}
/* If the current token is an integer, does nothing and returns true.
   Otherwise, reports an error and returns false. */
bool
lex_force_int (struct lexer *lexer)
{
  bool ok = lex_is_integer (lexer);

  if (!ok)
    lex_error (lexer, _("expecting integer"));
  return ok;
}
/* If the current token is a number, does nothing and returns true.
   Otherwise, reports an error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  bool ok = lex_is_number (lexer);

  if (!ok)
    lex_error (lexer, _("expecting number"));
  return ok;
}
627 /* If the current token is an identifier, does nothing and returns true.
628 Otherwise, reports an error and returns false. */
630 lex_force_id (struct lexer *lexer)
632 if (lex_token (lexer) == T_ID)
635 lex_error (lexer, _("expecting identifier"));
639 /* Token accessors. */
641 /* Returns the type of LEXER's current token. */
643 lex_token (const struct lexer *lexer)
645 return lex_next_token (lexer, 0);
/* Returns the number in LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_tokval (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_tokval (lexer, current);
}
/* Returns the null-terminated string in LEXER's current token, UTF-8
   encoded.

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.

   The UTF-8 encoding of the returned string is correct for variable names
   and other identifiers.  Use filename_to_utf8() to use it as a filename.
   Use data_in() to use it in a "union value". */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_tokcstr (lexer, current);
}
672 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
673 null-terminated (but the null terminator is not included in the returned
674 substring's 'length').
676 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
677 this functions this function will always return NULL.
679 The UTF-8 encoding of the returned string is correct for variable names and
680 other identifiers. Use filename_to_utf8() to use it as a filename. Use
681 data_in() to use it in a "union value". */
683 lex_tokss (const struct lexer *lexer)
685 return lex_next_tokss (lexer, 0);
690 A value of 0 for N as an argument to any of these functions refers to the
691 current token. Lookahead is limited to the current command. Any N greater
692 than the number of tokens remaining in the current command will be treated
693 as referring to a T_ENDCMD token. */
695 static const struct lex_token *
696 lex_next__ (const struct lexer *lexer_, int n)
698 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
699 struct lex_source *src = lex_source__ (lexer);
702 return lex_source_next__ (src, n);
705 static const struct lex_token stop_token =
706 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
712 static const struct lex_token *
713 lex_source_next__ (const struct lex_source *src, int n)
715 while (deque_count (&src->deque) <= n)
717 if (!deque_is_empty (&src->deque))
719 struct lex_token *front;
721 front = &src->tokens[deque_front (&src->deque, 0)];
722 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
726 lex_source_get__ (src);
729 return &src->tokens[deque_back (&src->deque, n)];
732 /* Returns the "struct token" of the token N after the current one in LEXER.
733 The returned pointer can be invalidated by pretty much any succeeding call
734 into the lexer, although the string pointer within the returned token is
735 only invalidated by consuming the token (e.g. with lex_get()). */
737 lex_next (const struct lexer *lexer, int n)
739 return &lex_next__ (lexer, n)->token;
742 /* Returns the type of the token N after the current one in LEXER. */
744 lex_next_token (const struct lexer *lexer, int n)
746 return lex_next (lexer, n)->type;
749 /* Returns the number in the tokn N after the current one in LEXER.
751 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
752 tokens this function will always return zero. */
754 lex_next_tokval (const struct lexer *lexer, int n)
756 const struct token *token = lex_next (lexer, n);
757 return token->number;
760 /* Returns the null-terminated string in the token N after the current one, in
763 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
764 this functions this function will always return NULL.
766 The UTF-8 encoding of the returned string is correct for variable names and
767 other identifiers. Use filename_to_utf8() to use it as a filename. Use
768 data_in() to use it in a "union value". */
770 lex_next_tokcstr (const struct lexer *lexer, int n)
772 return lex_next_tokss (lexer, n).string;
775 /* Returns the string in the token N after the current one, in UTF-8 encoding.
776 The string is null-terminated (but the null terminator is not included in
777 the returned substring's 'length').
779 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
780 this functions this function will always return NULL.
782 The UTF-8 encoding of the returned string is correct for variable names and
783 other identifiers. Use filename_to_utf8() to use it as a filename. Use
784 data_in() to use it in a "union value". */
786 lex_next_tokss (const struct lexer *lexer, int n)
788 return lex_next (lexer, n)->string;
791 /* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
792 true. Otherwise, returns false.
794 S may consist of an arbitrary number of identifiers, integers, and
795 punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
796 Identifiers may be abbreviated to their first three letters. Currently only
797 hyphens, slashes, and equals signs are supported as punctuation (but it
798 would be easy to add more).
800 S must be an ASCII string. */
802 lex_match_phrase (struct lexer *lexer, const char *s)
806 for (tok_idx = 0; ; tok_idx++)
808 enum token_type token;
811 while (c_isspace (*s))
819 for (i = 0; i < tok_idx; i++)
824 token = lex_next_token (lexer, tok_idx);
834 if (token != T_SLASH)
840 if (token != T_EQUALS)
845 case '0': case '1': case '2': case '3': case '4':
846 case '5': case '6': case '7': case '8': case '9':
850 if (token != T_POS_NUM)
856 value = value * 10 + (*s++ - '0');
858 while (c_isdigit (*s));
860 if (lex_next_tokval (lexer, tok_idx) != value)
873 len = lex_id_get_length (ss_cstr (s));
874 if (!lex_id_match (ss_buffer (s, len),
875 lex_next_tokss (lexer, tok_idx)))
887 lex_source_get_first_line_number (const struct lex_source *src, int n)
889 return lex_source_next__ (src, n)->first_line;
/* Returns the number of new-line characters among the LENGTH bytes starting
   at S. */
static int
count_newlines (char *s, size_t length)
{
  char *const end = s + length;
  int count = 0;
  char *p;

  for (p = s; p < end; p++)
    if (*p == '\n')
      count++;

  return count;
}
909 lex_source_get_last_line_number (const struct lex_source *src, int n)
911 const struct lex_token *token = lex_source_next__ (src, n);
913 if (token->first_line == 0)
917 char *token_str = &src->buffer[token->token_pos - src->tail];
918 return token->first_line + count_newlines (token_str, token->token_len) + 1;
923 count_columns (const char *s_, size_t length)
925 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
931 for (ofs = 0; ofs < length; ofs += mblen)
935 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
938 int width = uc_width (uc, "UTF-8");
943 columns = ROUND_UP (columns + 1, 8);
950 lex_source_get_first_column (const struct lex_source *src, int n)
952 const struct lex_token *token = lex_source_next__ (src, n);
953 return count_columns (&src->buffer[token->line_pos - src->tail],
954 token->token_pos - token->line_pos);
958 lex_source_get_last_column (const struct lex_source *src, int n)
960 const struct lex_token *token = lex_source_next__ (src, n);
961 char *start, *end, *newline;
963 start = &src->buffer[token->line_pos - src->tail];
964 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
965 newline = memrchr (start, '\n', end - start);
968 return count_columns (start, end - start);
/* Returns the 1-based line number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP token
   or if the token is drawn from a source that does not have line numbers. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_first_line_number (src, n);
}
/* Returns the 1-based line number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if the token is drawn from a source that does not have line
   numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the
   next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_last_line_number (src, n);
}
/* Returns the 1-based column number of the start of the syntax that
   represents the token N after the current one in LEXER.  Returns 0 if LEXER
   has no source.

   Column numbers are measured according to the width of characters as shown
   in a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_first_column (src, n);
}
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no source.

   Column numbers are measured according to the width of characters as shown
   in a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_last_column (src, n);
}
1026 /* Returns the name of the syntax file from which the current command is drawn.
1027 Returns NULL for a T_STOP token or if the command's source does not have
1030 There is no version of this function that takes an N argument because
1031 lookahead only works to the end of a command and any given command is always
1032 within a single syntax file. */
1034 lex_get_file_name (const struct lexer *lexer)
1036 struct lex_source *src = lex_source__ (lexer);
1037 return src == NULL ? NULL : src->reader->file_name;
1040 /* Returns the syntax mode for the syntax file from which the current drawn is
1041 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1042 source does not have line numbers.
1044 There is no version of this function that takes an N argument because
1045 lookahead only works to the end of a command and any given command is always
1046 within a single syntax file. */
1047 enum lex_syntax_mode
1048 lex_get_syntax_mode (const struct lexer *lexer)
1050 struct lex_source *src = lex_source__ (lexer);
1051 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1054 /* Returns the error mode for the syntax file from which the current drawn is
1055 drawn. Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if the command's
1056 source does not have line numbers.
1058 There is no version of this function that takes an N argument because
1059 lookahead only works to the end of a command and any given command is always
1060 within a single syntax file. */
1062 lex_get_error_mode (const struct lexer *lexer)
1064 struct lex_source *src = lex_source__ (lexer);
1065 return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
1068 /* If the source that LEXER is currently reading has error mode
1069 LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
1070 next token to be read comes directly from whatever is next read from the
1073 It makes sense to call this function after encountering an error in a
1074 command entered on the console, because usually the user would prefer not to
1075 have cascading errors. */
1077 lex_interactive_reset (struct lexer *lexer)
1079 struct lex_source *src = lex_source__ (lexer);
1080 if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
1082 src->head = src->tail = 0;
1083 src->journal_pos = src->seg_pos = src->line_pos = 0;
1084 src->n_newlines = 0;
1085 src->suppress_next_newline = false;
1086 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1087 while (!deque_is_empty (&src->deque))
1088 lex_source_pop__ (src);
1089 lex_source_push_endcmd__ (src);
1093 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1095 lex_discard_rest_of_command (struct lexer *lexer)
1097 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1101 /* Discards all lookahead tokens in LEXER, then discards all input sources
1102 until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
1103 runs out of input sources. */
1105 lex_discard_noninteractive (struct lexer *lexer)
1107 struct lex_source *src = lex_source__ (lexer);
1111 while (!deque_is_empty (&src->deque))
1112 lex_source_pop__ (src);
1114 for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
1115 src = lex_source__ (lexer))
1116 lex_source_destroy (src);
1121 lex_source_max_tail__ (const struct lex_source *src)
1123 const struct lex_token *token;
1126 assert (src->seg_pos >= src->line_pos);
1127 max_tail = MIN (src->journal_pos, src->line_pos);
1129 /* Use the oldest token also. (We know that src->deque cannot be empty
1130 because we are in the process of adding a new token, which is already
1131 initialized enough to use here.) */
1132 token = &src->tokens[deque_back (&src->deque, 0)];
1133 assert (token->token_pos >= token->line_pos);
1134 max_tail = MIN (max_tail, token->line_pos);
1140 lex_source_expand__ (struct lex_source *src)
1142 if (src->head - src->tail >= src->allocated)
1144 size_t max_tail = lex_source_max_tail__ (src);
1145 if (max_tail > src->tail)
1147 /* Advance the tail, freeing up room at the head. */
1148 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1149 src->head - max_tail);
1150 src->tail = max_tail;
1154 /* Buffer is completely full. Expand it. */
1155 src->buffer = x2realloc (src->buffer, &src->allocated);
1160 /* There's space available at the head of the buffer. Nothing to do. */
1165 lex_source_read__ (struct lex_source *src)
1172 lex_source_expand__ (src);
1174 head_ofs = src->head - src->tail;
1175 n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1176 src->allocated - head_ofs,
1177 segmenter_get_prompt (&src->segmenter));
1182 Ensure that the input always ends in a new-line followed by a null
1183 byte, as required by the segmenter library. */
1185 if (src->head == src->tail
1186 || src->buffer[src->head - src->tail - 1] != '\n')
1187 src->buffer[src->head++ - src->tail] = '\n';
1189 lex_source_expand__ (src);
1190 src->buffer[src->head++ - src->tail] = '\0';
1197 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1198 src->head - src->seg_pos));
1201 static struct lex_source *
1202 lex_source__ (const struct lexer *lexer)
1204 return (ll_is_empty (&lexer->sources) ? NULL
1205 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1208 static struct substring
1209 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1211 const struct lex_token *token0 = lex_source_next__ (src, n0);
1212 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1213 size_t start = token0->token_pos;
1214 size_t end = token1->token_pos + token1->token_len;
1216 return ss_buffer (&src->buffer[start - src->tail], end - start);
1220 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1226 assert (out_size >= 16);
1227 out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1228 for (out_len = 0; out_len < in.length; out_len += mblen)
1230 if (in.string[out_len] == '\n'
1231 || (in.string[out_len] == '\r'
1232 && out_len + 1 < in.length
1233 && in.string[out_len + 1] == '\n'))
1236 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1237 in.length - out_len);
1238 if (out_len + mblen > out_maxlen)
1242 memcpy (out, in.string, out_len);
1243 strcpy (&out[out_len], out_len < in.length ? "..." : "");
1247 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1248 const char *format, va_list args)
1250 const struct lex_token *token;
1256 token = lex_source_next__ (src, n0);
1257 if (token->token.type == T_ENDCMD)
1258 ds_put_cstr (&s, _("Syntax error at end of command"));
1261 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1262 if (!ss_is_empty (syntax))
1264 char syntax_cstr[64];
1266 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1267 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1270 ds_put_cstr (&s, _("Syntax error"));
1275 ds_put_cstr (&s, ": ");
1276 ds_put_vformat (&s, format, args);
1278 ds_put_byte (&s, '.');
1280 m.category = MSG_C_SYNTAX;
1281 m.severity = MSG_S_ERROR;
1282 m.file_name = src->reader->file_name;
1283 m.first_line = lex_source_get_first_line_number (src, n0);
1284 m.last_line = lex_source_get_last_line_number (src, n1);
1285 m.first_column = lex_source_get_first_column (src, n0);
1286 m.last_column = lex_source_get_last_column (src, n1);
1287 m.text = ds_steal_cstr (&s);
1291 static void PRINTF_FORMAT (2, 3)
1292 lex_get_error (struct lex_source *src, const char *format, ...)
1297 va_start (args, format);
1299 n = deque_count (&src->deque) - 1;
1300 lex_source_error_valist (src, n, n, format, args);
1301 lex_source_pop_front (src);
1307 lex_source_get__ (const struct lex_source *src_)
1309 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1313 struct segmenter segmenter;
1314 enum segment_type last_segment;
1320 struct state state, saved;
1321 enum scan_result result;
1322 struct scanner scanner;
1323 struct lex_token *token;
1330 state.segmenter = src->segmenter;
1332 state.seg_pos = src->seg_pos;
1333 state.line_pos = src->line_pos;
1336 token = lex_push_token__ (src);
1337 scanner_init (&scanner, &token->token);
1338 token->line_pos = src->line_pos;
1339 token->token_pos = src->seg_pos;
1340 if (src->reader->line_number > 0)
1341 token->first_line = src->reader->line_number + src->n_newlines;
1343 token->first_line = 0;
1347 enum segment_type type;
1348 const char *segment;
1352 segment = &src->buffer[state.seg_pos - src->tail];
1353 seg_maxlen = src->head - state.seg_pos;
1354 seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
1357 lex_source_read__ (src);
1361 state.last_segment = type;
1362 state.seg_pos += seg_len;
1363 if (type == SEG_NEWLINE)
1366 state.line_pos = state.seg_pos;
1369 result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1371 if (result == SCAN_SAVE)
1373 else if (result == SCAN_BACK)
1378 else if (result == SCAN_DONE)
1382 n_lines = state.newlines;
1383 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1386 src->suppress_next_newline = true;
1388 else if (n_lines > 0 && src->suppress_next_newline)
1391 src->suppress_next_newline = false;
1393 for (i = 0; i < n_lines; i++)
1395 const char *newline;
1400 line = &src->buffer[src->journal_pos - src->tail];
1401 newline = rawmemchr (line, '\n');
1402 line_len = newline - line;
1403 if (line_len > 0 && line[line_len - 1] == '\r')
1406 syntax = malloc (line_len + 2);
1407 memcpy (syntax, line, line_len);
1408 syntax[line_len] = '\n';
1409 syntax[line_len + 1] = '\0';
1411 text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1413 src->journal_pos += newline - line + 1;
1416 token->token_len = state.seg_pos - src->seg_pos;
1418 src->segmenter = state.segmenter;
1419 src->seg_pos = state.seg_pos;
1420 src->line_pos = state.line_pos;
1421 src->n_newlines += state.newlines;
1423 switch (token->token.type)
1429 token->token.type = T_ENDCMD;
1433 case SCAN_BAD_HEX_LENGTH:
1434 lex_get_error (src, _("String of hex digits has %d characters, which "
1435 "is not a multiple of 2"),
1436 (int) token->token.number);
1439 case SCAN_BAD_HEX_DIGIT:
1440 case SCAN_BAD_UNICODE_DIGIT:
1441 lex_get_error (src, _("`%c' is not a valid hex digit"),
1442 (int) token->token.number);
1445 case SCAN_BAD_UNICODE_LENGTH:
1446 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1447 "not in the valid range of 1 to 8 bytes"),
1448 (int) token->token.number);
1451 case SCAN_BAD_UNICODE_CODE_POINT:
1452 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1453 (int) token->token.number);
1456 case SCAN_EXPECTED_QUOTE:
1457 lex_get_error (src, _("Unterminated string constant"));
1460 case SCAN_EXPECTED_EXPONENT:
1461 lex_get_error (src, _("Missing exponent following `%s'"),
1462 token->token.string.string);
1465 case SCAN_UNEXPECTED_DOT:
1466 lex_get_error (src, _("Unexpected `.' in middle of command"));
1469 case SCAN_UNEXPECTED_CHAR:
1472 lex_get_error (src, _("Bad character %s in input"),
1473 uc_name (token->token.number, c_name));
1478 lex_source_pop_front (src);
1486 lex_source_push_endcmd__ (struct lex_source *src)
1488 struct lex_token *token = lex_push_token__ (src);
1489 token->token.type = T_ENDCMD;
1490 token->token_pos = 0;
1491 token->token_len = 0;
1492 token->line_pos = 0;
1493 token->first_line = 0;
1496 static struct lex_source *
1497 lex_source_create (struct lex_reader *reader)
1499 struct lex_source *src;
1500 enum segmenter_mode mode;
1502 src = xzalloc (sizeof *src);
1503 src->reader = reader;
1505 if (reader->syntax == LEX_SYNTAX_AUTO)
1506 mode = SEG_MODE_AUTO;
1507 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1508 mode = SEG_MODE_INTERACTIVE;
1509 else if (reader->syntax == LEX_SYNTAX_BATCH)
1510 mode = SEG_MODE_BATCH;
1513 segmenter_init (&src->segmenter, mode);
1515 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1517 lex_source_push_endcmd__ (src);
1523 lex_source_destroy (struct lex_source *src)
1525 char *file_name = src->reader->file_name;
1526 if (src->reader->class->destroy != NULL)
1527 src->reader->class->destroy (src->reader);
1530 while (!deque_is_empty (&src->deque))
1531 lex_source_pop__ (src);
1533 ll_remove (&src->ll);
1537 struct lex_file_reader
1539 struct lex_reader reader;
1540 struct u8_istream *istream;
1544 static struct lex_reader_class lex_file_reader_class;
1546 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1547 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1548 ENCODING, which should take one of the forms accepted by
1549 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1550 mode of the new reader, respectively.
1552 Returns a null pointer if FILE_NAME cannot be opened. */
1554 lex_reader_for_file (const char *file_name, const char *encoding,
1555 enum lex_syntax_mode syntax,
1556 enum lex_error_mode error)
1558 struct lex_file_reader *r;
1559 struct u8_istream *istream;
1561 istream = (!strcmp(file_name, "-")
1562 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1563 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1564 if (istream == NULL)
1566 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1570 r = xmalloc (sizeof *r);
1571 lex_reader_init (&r->reader, &lex_file_reader_class);
1572 r->reader.syntax = syntax;
1573 r->reader.error = error;
1574 r->reader.file_name = xstrdup (file_name);
1575 r->reader.line_number = 1;
1576 r->istream = istream;
1577 r->file_name = xstrdup (file_name);
1582 static struct lex_file_reader *
1583 lex_file_reader_cast (struct lex_reader *r)
1585 return UP_CAST (r, struct lex_file_reader, reader);
1589 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1590 enum prompt_style prompt_style UNUSED)
1592 struct lex_file_reader *r = lex_file_reader_cast (r_);
1593 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1596 msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
1603 lex_file_close (struct lex_reader *r_)
1605 struct lex_file_reader *r = lex_file_reader_cast (r_);
1607 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1609 if (u8_istream_close (r->istream) != 0)
1610 msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
1613 u8_istream_free (r->istream);
1615 free (r->file_name);
1619 static struct lex_reader_class lex_file_reader_class =
1625 struct lex_string_reader
1627 struct lex_reader reader;
1632 static struct lex_reader_class lex_string_reader_class;
1634 /* Creates and returns a new lex_reader for the contents of S, which must be
1635 encoded in UTF-8. The new reader takes ownership of S and will free it
1636 with ss_dealloc() when it is closed. */
1638 lex_reader_for_substring_nocopy (struct substring s)
1640 struct lex_string_reader *r;
1642 r = xmalloc (sizeof *r);
1643 lex_reader_init (&r->reader, &lex_string_reader_class);
1644 r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
1651 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1652 which must be encoded in UTF-8. The caller retains ownership of S. */
1654 lex_reader_for_string (const char *s)
1656 struct substring ss;
1657 ss_alloc_substring (&ss, ss_cstr (s));
1658 return lex_reader_for_substring_nocopy (ss);
/* Expands FORMAT and the arguments that follow it as printf() would, and
   creates and returns a new lex_reader whose input is the resulting
   string. */
struct lex_reader *
lex_reader_for_format (const char *format, ...)
{
  struct lex_reader *reader;
  va_list ap;
  char *text;

  va_start (ap, format);
  text = xvasprintf (format, ap);
  va_end (ap);

  /* The reader takes ownership of the freshly allocated TEXT. */
  reader = lex_reader_for_substring_nocopy (ss_cstr (text));

  return reader;
}
1676 static struct lex_string_reader *
1677 lex_string_reader_cast (struct lex_reader *r)
1679 return UP_CAST (r, struct lex_string_reader, reader);
1683 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1684 enum prompt_style prompt_style UNUSED)
1686 struct lex_string_reader *r = lex_string_reader_cast (r_);
1689 chunk = MIN (n, r->s.length - r->offset);
1690 memcpy (buf, r->s.string + r->offset, chunk);
1697 lex_string_close (struct lex_reader *r_)
1699 struct lex_string_reader *r = lex_string_reader_cast (r_);
1705 static struct lex_reader_class lex_string_reader_class =