1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "data/file-name.h"
34 #include "language/command.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/text-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* Location of token in terms of the lex_source's buffer.
66 src->tail <= line_pos <= token_pos <= src->head. */
67 size_t token_pos; /* Start of token. */
68 size_t token_len; /* Length of source for token in bytes. */
69 size_t line_pos; /* Start of line containing token_pos. */
70 int first_line; /* Line number at token_pos. */
73 /* A source of tokens, corresponding to a syntax file.
75 This is conceptually a lex_reader wrapped with everything needed to convert
76 its UTF-8 bytes into tokens. */
79 struct ll ll; /* In lexer's list of sources. */
80 struct lex_reader *reader;
81 struct segmenter segmenter;
82 bool eof; /* True if T_STOP was read from 'reader'. */
84 /* Buffer of UTF-8 bytes. */
86 size_t allocated; /* Number of bytes allocated. */
87 size_t tail; /* &buffer[0] offset into UTF-8 source. */
88 size_t head; /* &buffer[head - tail] offset into source. */
90 /* Positions in source file, tail <= pos <= head for each member here. */
91 size_t journal_pos; /* First byte not yet output to journal. */
92 size_t seg_pos; /* First byte not yet scanned as token. */
93 size_t line_pos; /* First byte of line containing seg_pos. */
95 int n_newlines; /* Number of new-lines up to seg_pos. */
96 bool suppress_next_newline;
99 struct deque deque; /* Indexes into 'tokens'. */
100 struct lex_token *tokens; /* Lookahead tokens for parser. */
103 static struct lex_source *lex_source_create (struct lex_reader *);
104 static void lex_source_destroy (struct lex_source *);
109 struct ll_list sources; /* Contains "struct lex_source"s. */
112 static struct lex_source *lex_source__ (const struct lexer *);
113 static const struct lex_token *lex_next__ (const struct lexer *, int n);
114 static void lex_source_push_endcmd__ (struct lex_source *);
116 static void lex_source_pop__ (struct lex_source *);
117 static bool lex_source_get__ (const struct lex_source *);
118 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
119 const char *format, va_list)
120 PRINTF_FORMAT (4, 0);
121 static const struct lex_token *lex_source_next__ (const struct lex_source *,
124 /* Initializes READER with the specified CLASS and otherwise some reasonable
125 defaults. The caller should fill in the others members as desired. */
127 lex_reader_init (struct lex_reader *reader,
128 const struct lex_reader_class *class)
130 reader->class = class;
131 reader->syntax = LEX_SYNTAX_AUTO;
132 reader->error = LEX_ERROR_INTERACTIVE;
133 reader->file_name = NULL;
134 reader->line_number = 0;
137 /* Frees any file name already in READER and replaces it by a copy of
138 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
142 free (reader->file_name);
143 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
146 /* Creates and returns a new lexer. */
150 struct lexer *lexer = xzalloc (sizeof *lexer);
151 ll_init (&lexer->sources);
155 /* Destroys LEXER. */
157 lex_destroy (struct lexer *lexer)
161 struct lex_source *source, *next;
163 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
164 lex_source_destroy (source);
169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
170 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
173 lex_include (struct lexer *lexer, struct lex_reader *reader)
175 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
176 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
179 /* Appends READER to LEXER, so that it will be read after all other current
180 readers have already been read. */
182 lex_append (struct lexer *lexer, struct lex_reader *reader)
184 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
189 static struct lex_token *
190 lex_push_token__ (struct lex_source *src)
192 struct lex_token *token;
194 if (deque_is_full (&src->deque))
195 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
197 token = &src->tokens[deque_push_front (&src->deque)];
198 token_init (&token->token);
203 lex_source_pop__ (struct lex_source *src)
205 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
209 lex_source_pop_front (struct lex_source *src)
211 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
214 /* Advances LEXER to the next token, consuming the current token. */
216 lex_get (struct lexer *lexer)
218 struct lex_source *src;
220 src = lex_source__ (lexer);
224 if (!deque_is_empty (&src->deque))
225 lex_source_pop__ (src);
227 while (deque_is_empty (&src->deque))
228 if (!lex_source_get__ (src))
230 lex_source_destroy (src);
231 src = lex_source__ (lexer);
237 /* Issuing errors. */
/* Reports a syntax error at LEXER's current token.  FORMAT, if non-null, is a
   printf()-style format string that, with the arguments following it,
   supplies additional message text. */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, 0, 0, format, ap);
  va_end (ap);
}
/* Reports a syntax error at LEXER's current token.  This is the va_list
   counterpart of lex_error(): FORMAT, if non-null, is a printf()-style format
   string whose arguments are taken from ARGS. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  const int current = 0;

  lex_next_error_valist (lexer, current, current, format, args);
}
/* Reports a syntax error for the tokens numbered N0 through N1, counting from
   0 at LEXER's current token.  FORMAT, if non-null, is a printf()-style
   format string that, with the arguments following it, supplies additional
   message text. */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, n0, n1, format, ap);
  va_end (ap);
}
271 /* Reports an error to the effect that subcommand SBC may only be
274 lex_sbc_only_once (const char *sbc)
276 msg (SE, _("Subcommand %s may only be specified once."), sbc);
/* Issues a syntax error at LEXER's current token saying that required
   subcommand SBC is missing. */
void
lex_sbc_missing (struct lexer *lexer, const char *sbc)
{
  const char *subcommand = sbc;

  lex_error (lexer, _("missing required subcommand %s"), subcommand);
}
287 /* Prints a syntax error message containing the current token and
288 given message MESSAGE (if non-null). */
290 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
291 const char *format, va_list args)
293 struct lex_source *src = lex_source__ (lexer);
296 lex_source_error_valist (src, n0, n1, format, args);
302 ds_put_format (&s, _("Syntax error at end of input"));
305 ds_put_cstr (&s, ": ");
306 ds_put_vformat (&s, format, args);
308 ds_put_byte (&s, '.');
309 msg (SE, "%s", ds_cstr (&s));
314 /* Checks that we're at end of command.
315 If so, returns a successful command completion code.
316 If not, flags a syntax error and returns an error command
319 lex_end_of_command (struct lexer *lexer)
321 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
323 lex_error (lexer, _("expecting end of command"));
330 /* Token testing functions. */
/* Returns true if LEXER's current token is a number, false otherwise. */
bool
lex_is_number (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_number (lexer, current);
}
/* Returns true if LEXER's current token is a string, false otherwise. */
bool
lex_is_string (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_string (lexer, current);
}
/* Returns the value of LEXER's current token, which must be a number (this
   is asserted by lex_next_number()). */
double
lex_number (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_number (lexer, current);
}
/* Returns true if and only if LEXER's current token is an integer. */
bool
lex_is_integer (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_integer (lexer, current);
}
/* Returns the value of LEXER's current token, which must be an integer (this
   is asserted by lex_next_integer()). */
long
lex_integer (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_integer (lexer, current);
}
369 /* Token testing functions with lookahead.
371 A value of 0 for N as an argument to any of these functions refers to the
372 current token. Lookahead is limited to the current command. Any N greater
373 than the number of tokens remaining in the current command will be treated
374 as referring to a T_ENDCMD token. */
376 /* Returns true if the token N ahead of the current token is a number. */
378 lex_next_is_number (struct lexer *lexer, int n)
380 enum token_type next_token = lex_next_token (lexer, n);
381 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
384 /* Returns true if the token N ahead of the current token is a string. */
386 lex_next_is_string (struct lexer *lexer, int n)
388 return lex_next_token (lexer, n) == T_STRING;
/* Returns the value of the token N ahead of LEXER's current token.  That
   token must be a number; this is checked with an assertion. */
double
lex_next_number (struct lexer *lexer, int n)
{
  double value;

  assert (lex_next_is_number (lexer, n));
  value = lex_next_tokval (lexer, n);
  return value;
}
/* Returns true if the token N ahead of LEXER's current token is an integer:
   a number with no fractional part whose value can be converted to "long"
   without overflow.

   The upper bound is tested as VALUE < -(double) LONG_MIN rather than VALUE
   <= LONG_MAX.  Where "long" is 64 bits wide, LONG_MAX is not exactly
   representable as a double, so the latter comparison is really against
   2**63, which would accept a value that is too large for "long". */
bool
lex_next_is_integer (struct lexer *lexer, int n)
{
  double value;

  if (!lex_next_is_number (lexer, n))
    return false;

  value = lex_next_tokval (lexer, n);
  return (value > LONG_MIN
          && value < -(double) LONG_MIN
          && floor (value) == value);
}
/* Returns the value of the token N ahead of LEXER's current token.  That
   token must be an integer; this is checked with an assertion. */
long
lex_next_integer (struct lexer *lexer, int n)
{
  long value;

  assert (lex_next_is_integer (lexer, n));
  value = lex_next_tokval (lexer, n);
  return value;
}
422 /* Token matching functions. */
424 /* If the current token has the specified TYPE, skips it and returns true.
425 Otherwise, returns false. */
427 lex_match (struct lexer *lexer, enum token_type type)
429 if (lex_token (lexer) == type)
/* Skips LEXER's current token and returns true, if that token matches
   IDENTIFIER, which may be abbreviated to its first three letters.
   Otherwise, returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  const size_t min_abbrev_len = 3;

  return lex_match_id_n (lexer, identifier, min_abbrev_len);
}
449 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
450 may be abbreviated to its first N letters. Otherwise, returns false.
452 IDENTIFIER must be an ASCII string. */
454 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
456 if (lex_token (lexer) == T_ID
457 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
/* Skips LEXER's current token and returns true, if that token is an integer
   equal to X.  Otherwise, returns false. */
bool
lex_match_int (struct lexer *lexer, int x)
{
  if (!lex_is_integer (lexer) || lex_integer (lexer) != x)
    return false;

  lex_get (lexer);
  return true;
}
480 /* Forced matches. */
/* Skips LEXER's current token and returns true, if that token matches
   IDENTIFIER, which may be abbreviated to its first 3 letters.  Otherwise,
   reports a syntax error and returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_force_match_id (struct lexer *lexer, const char *identifier)
{
  bool matched = lex_match_id (lexer, identifier);

  if (!matched)
    lex_error (lexer, _("expecting `%s'"), identifier);
  return matched;
}
499 /* If the current token has the specified TYPE, skips it and returns true.
500 Otherwise, reports an error and returns false. */
502 lex_force_match (struct lexer *lexer, enum token_type type)
504 if (lex_token (lexer) == type)
511 lex_error (lexer, _("expecting `%s'"), token_type_to_string (type));
/* Returns true, without consuming anything, if LEXER's current token is a
   string.  Otherwise, reports a syntax error and returns false. */
bool
lex_force_string (struct lexer *lexer)
{
  bool ok = lex_is_string (lexer);

  if (!ok)
    lex_error (lexer, _("expecting string"));
  return ok;
}
/* Returns true, without consuming anything, if LEXER's current token is an
   integer.  Otherwise, reports a syntax error and returns false. */
bool
lex_force_int (struct lexer *lexer)
{
  bool ok = lex_is_integer (lexer);

  if (!ok)
    lex_error (lexer, _("expecting integer"));
  return ok;
}
/* Returns true, without consuming anything, if LEXER's current token is a
   number.  Otherwise, reports a syntax error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  bool ok = lex_is_number (lexer);

  if (!ok)
    lex_error (lexer, _("expecting number"));
  return ok;
}
556 /* If the current token is an identifier, does nothing and returns true.
557 Otherwise, reports an error and returns false. */
559 lex_force_id (struct lexer *lexer)
561 if (lex_token (lexer) == T_ID)
564 lex_error (lexer, _("expecting identifier"));
568 /* Token accessors. */
570 /* Returns the type of LEXER's current token. */
572 lex_token (const struct lexer *lexer)
574 return lex_next_token (lexer, 0);
/* Returns the number in LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_tokval (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_tokval (lexer, current);
}
/* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.

   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.

   The UTF-8 encoding of the returned string is correct for variable names and
   other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value". */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_tokcstr (lexer, current);
}
601 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
602 null-terminated (but the null terminator is not included in the returned
603 substring's 'length').
605 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
606 this functions this function will always return NULL.
608 The UTF-8 encoding of the returned string is correct for variable names and
609 other identifiers. Use filename_to_utf8() to use it as a filename. Use
610 data_in() to use it in a "union value". */
612 lex_tokss (const struct lexer *lexer)
614 return lex_next_tokss (lexer, 0);
619 A value of 0 for N as an argument to any of these functions refers to the
620 current token. Lookahead is limited to the current command. Any N greater
621 than the number of tokens remaining in the current command will be treated
622 as referring to a T_ENDCMD token. */
624 static const struct lex_token *
625 lex_next__ (const struct lexer *lexer_, int n)
627 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
628 struct lex_source *src = lex_source__ (lexer);
631 return lex_source_next__ (src, n);
634 static const struct lex_token stop_token =
635 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
641 static const struct lex_token *
642 lex_source_next__ (const struct lex_source *src, int n)
644 while (deque_count (&src->deque) <= n)
646 if (!deque_is_empty (&src->deque))
648 struct lex_token *front;
650 front = &src->tokens[deque_front (&src->deque, 0)];
651 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
655 lex_source_get__ (src);
658 return &src->tokens[deque_back (&src->deque, n)];
661 /* Returns the "struct token" of the token N after the current one in LEXER.
662 The returned pointer can be invalidated by pretty much any succeeding call
663 into the lexer, although the string pointer within the returned token is
664 only invalidated by consuming the token (e.g. with lex_get()). */
666 lex_next (const struct lexer *lexer, int n)
668 return &lex_next__ (lexer, n)->token;
671 /* Returns the type of the token N after the current one in LEXER. */
673 lex_next_token (const struct lexer *lexer, int n)
675 return lex_next (lexer, n)->type;
678 /* Returns the number in the tokn N after the current one in LEXER.
680 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
681 tokens this function will always return zero. */
683 lex_next_tokval (const struct lexer *lexer, int n)
685 const struct token *token = lex_next (lexer, n);
686 return token->number;
689 /* Returns the null-terminated string in the token N after the current one, in
692 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
693 this functions this function will always return NULL.
695 The UTF-8 encoding of the returned string is correct for variable names and
696 other identifiers. Use filename_to_utf8() to use it as a filename. Use
697 data_in() to use it in a "union value". */
699 lex_next_tokcstr (const struct lexer *lexer, int n)
701 return lex_next_tokss (lexer, n).string;
704 /* Returns the string in the token N after the current one, in UTF-8 encoding.
705 The string is null-terminated (but the null terminator is not included in
706 the returned substring's 'length').
708 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
709 this functions this function will always return NULL.
711 The UTF-8 encoding of the returned string is correct for variable names and
712 other identifiers. Use filename_to_utf8() to use it as a filename. Use
713 data_in() to use it in a "union value". */
715 lex_next_tokss (const struct lexer *lexer, int n)
717 return lex_next (lexer, n)->string;
720 /* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
721 true. Otherwise, returns false.
723 S may consist of an arbitrary number of identifiers, integers, and
724 punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
725 Identifiers may be abbreviated to their first three letters. Currently only
726 hyphens, slashes, and equals signs are supported as punctuation (but it
727 would be easy to add more).
729 S must be an ASCII string. */
731 lex_match_phrase (struct lexer *lexer, const char *s)
735 for (tok_idx = 0; ; tok_idx++)
737 enum token_type token;
740 while (c_isspace (*s))
748 for (i = 0; i < tok_idx; i++)
753 token = lex_next_token (lexer, tok_idx);
763 if (token != T_SLASH)
769 if (token != T_EQUALS)
774 case '0': case '1': case '2': case '3': case '4':
775 case '5': case '6': case '7': case '8': case '9':
779 if (token != T_POS_NUM)
785 value = value * 10 + (*s++ - '0');
787 while (c_isdigit (*s));
789 if (lex_next_tokval (lexer, tok_idx) != value)
802 len = lex_id_get_length (ss_cstr (s));
803 if (!lex_id_match (ss_buffer (s, len),
804 lex_next_tokss (lexer, tok_idx)))
816 lex_source_get_first_line_number (const struct lex_source *src, int n)
818 return lex_source_next__ (src, n)->first_line;
/* Returns the number of new-line characters among the LENGTH bytes that start
   at S.  S is only read, never modified. */
static int
count_newlines (const char *s, size_t length)
{
  int n_newlines = 0;
  const char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      n_newlines++;

      /* Continue the search just past the new-line that was found. */
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
838 lex_source_get_last_line_number (const struct lex_source *src, int n)
840 const struct lex_token *token = lex_source_next__ (src, n);
842 if (token->first_line == 0)
846 char *token_str = &src->buffer[token->token_pos - src->tail];
847 return token->first_line + count_newlines (token_str, token->token_len) + 1;
852 count_columns (const char *s_, size_t length)
854 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
860 for (ofs = 0; ofs < length; ofs += mblen)
864 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
867 int width = uc_width (uc, "UTF-8");
872 columns = ROUND_UP (columns + 1, 8);
879 lex_source_get_first_column (const struct lex_source *src, int n)
881 const struct lex_token *token = lex_source_next__ (src, n);
882 return count_columns (&src->buffer[token->line_pos - src->tail],
883 token->token_pos - token->line_pos);
887 lex_source_get_last_column (const struct lex_source *src, int n)
889 const struct lex_token *token = lex_source_next__ (src, n);
890 char *start, *end, *newline;
892 start = &src->buffer[token->line_pos - src->tail];
893 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
894 newline = memrchr (start, '\n', end - start);
897 return count_columns (start, end - start);
/* Returns the 1-based line number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   source, or if the token is drawn from a source that does not have line
   numbers. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_first_line_number (src, n);
}
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   source, or if the token is drawn from a source that does not have line
   numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_last_line_number (src, n);
}
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_first_column (src, n);
}
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_last_column (src, n);
}
955 /* Returns the name of the syntax file from which the current command is drawn.
956 Returns NULL for a T_STOP token or if the command's source does not have
959 There is no version of this function that takes an N argument because
960 lookahead only works to the end of a command and any given command is always
961 within a single syntax file. */
963 lex_get_file_name (const struct lexer *lexer)
965 struct lex_source *src = lex_source__ (lexer);
966 return src == NULL ? NULL : src->reader->file_name;
969 /* Returns the syntax mode for the syntax file from which the current drawn is
970 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
971 source does not have line numbers.
973 There is no version of this function that takes an N argument because
974 lookahead only works to the end of a command and any given command is always
975 within a single syntax file. */
977 lex_get_syntax_mode (const struct lexer *lexer)
979 struct lex_source *src = lex_source__ (lexer);
980 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
983 /* Returns the error mode for the syntax file from which the current drawn is
984 drawn. Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if the command's
985 source does not have line numbers.
987 There is no version of this function that takes an N argument because
988 lookahead only works to the end of a command and any given command is always
989 within a single syntax file. */
991 lex_get_error_mode (const struct lexer *lexer)
993 struct lex_source *src = lex_source__ (lexer);
994 return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
997 /* If the source that LEXER is currently reading has error mode
998 LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
999 next token to be read comes directly from whatever is next read from the
1002 It makes sense to call this function after encountering an error in a
1003 command entered on the console, because usually the user would prefer not to
1004 have cascading errors. */
1006 lex_interactive_reset (struct lexer *lexer)
1008 struct lex_source *src = lex_source__ (lexer);
1009 if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
1011 src->head = src->tail = 0;
1012 src->journal_pos = src->seg_pos = src->line_pos = 0;
1013 src->n_newlines = 0;
1014 src->suppress_next_newline = false;
1015 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1016 while (!deque_is_empty (&src->deque))
1017 lex_source_pop__ (src);
1018 lex_source_push_endcmd__ (src);
1022 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1024 lex_discard_rest_of_command (struct lexer *lexer)
1026 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1030 /* Discards all lookahead tokens in LEXER, then discards all input sources
1031 until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
1032 runs out of input sources. */
1034 lex_discard_noninteractive (struct lexer *lexer)
1036 struct lex_source *src = lex_source__ (lexer);
1040 while (!deque_is_empty (&src->deque))
1041 lex_source_pop__ (src);
1043 for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
1044 src = lex_source__ (lexer))
1045 lex_source_destroy (src);
1050 lex_source_max_tail__ (const struct lex_source *src)
1052 const struct lex_token *token;
1055 assert (src->seg_pos >= src->line_pos);
1056 max_tail = MIN (src->journal_pos, src->line_pos);
1058 /* Use the oldest token also. (We know that src->deque cannot be empty
1059 because we are in the process of adding a new token, which is already
1060 initialized enough to use here.) */
1061 token = &src->tokens[deque_back (&src->deque, 0)];
1062 assert (token->token_pos >= token->line_pos);
1063 max_tail = MIN (max_tail, token->line_pos);
1069 lex_source_expand__ (struct lex_source *src)
1071 if (src->head - src->tail >= src->allocated)
1073 size_t max_tail = lex_source_max_tail__ (src);
1074 if (max_tail > src->tail)
1076 /* Advance the tail, freeing up room at the head. */
1077 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1078 src->head - max_tail);
1079 src->tail = max_tail;
1083 /* Buffer is completely full. Expand it. */
1084 src->buffer = x2realloc (src->buffer, &src->allocated);
1089 /* There's space available at the head of the buffer. Nothing to do. */
1094 lex_source_read__ (struct lex_source *src)
1101 lex_source_expand__ (src);
1103 head_ofs = src->head - src->tail;
1104 n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1105 src->allocated - head_ofs,
1106 segmenter_get_prompt (&src->segmenter));
1111 Ensure that the input always ends in a new-line followed by a null
1112 byte, as required by the segmenter library. */
1114 if (src->head == src->tail
1115 || src->buffer[src->head - src->tail - 1] != '\n')
1116 src->buffer[src->head++ - src->tail] = '\n';
1118 lex_source_expand__ (src);
1119 src->buffer[src->head++ - src->tail] = '\0';
1126 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1127 src->head - src->seg_pos));
1130 static struct lex_source *
1131 lex_source__ (const struct lexer *lexer)
1133 return (ll_is_empty (&lexer->sources) ? NULL
1134 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1137 static struct substring
1138 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1140 const struct lex_token *token0 = lex_source_next__ (src, n0);
1141 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1142 size_t start = token0->token_pos;
1143 size_t end = token1->token_pos + token1->token_len;
1145 return ss_buffer (&src->buffer[start - src->tail], end - start);
1149 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1155 assert (out_size >= 16);
1156 out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1157 for (out_len = 0; out_len < in.length; out_len += mblen)
1159 if (in.string[out_len] == '\n'
1160 || (in.string[out_len] == '\r'
1161 && out_len + 1 < in.length
1162 && in.string[out_len + 1] == '\n'))
1165 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1166 in.length - out_len);
1167 if (out_len + mblen > out_maxlen)
1171 memcpy (out, in.string, out_len);
1172 strcpy (&out[out_len], out_len < in.length ? "..." : "");
1176 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1177 const char *format, va_list args)
1179 const struct lex_token *token;
1185 token = lex_source_next__ (src, n0);
1186 if (token->token.type == T_ENDCMD)
1187 ds_put_cstr (&s, _("Syntax error at end of command"));
1190 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1191 if (!ss_is_empty (syntax))
1193 char syntax_cstr[64];
1195 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1196 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1199 ds_put_cstr (&s, _("Syntax error"));
1204 ds_put_cstr (&s, ": ");
1205 ds_put_vformat (&s, format, args);
1207 ds_put_byte (&s, '.');
1209 m.category = MSG_C_SYNTAX;
1210 m.severity = MSG_S_ERROR;
1211 m.file_name = src->reader->file_name;
1212 m.first_line = lex_source_get_first_line_number (src, n0);
1213 m.last_line = lex_source_get_last_line_number (src, n1);
1214 m.first_column = lex_source_get_first_column (src, n0);
1215 m.last_column = lex_source_get_last_column (src, n1);
1216 m.text = ds_steal_cstr (&s);
1220 static void PRINTF_FORMAT (2, 3)
1221 lex_get_error (struct lex_source *src, const char *format, ...)
1226 va_start (args, format);
1228 n = deque_count (&src->deque) - 1;
1229 lex_source_error_valist (src, n, n, format, args);
1230 lex_source_pop_front (src);
/* Scans one more token from SRC_.

   The visible steps are: push a fresh token with lex_push_token__(), feed
   segments produced by segmenter_push() into a scanner until the scanner
   reports a result, submit the consumed source lines as TEXT_ITEM_SYNTAX
   text items, commit the working segmenter state back into the source, and
   finally translate scanner error token types into messages through
   lex_get_error().

   NOTE(review): the return type, the return statements and several branch
   bodies are not visible in this excerpt, so the exact success/failure
   contract should be confirmed against the full file. */
1236 lex_source_get__ (const struct lex_source *src_)
/* SRC_ is const-qualified in the signature, but the source is modified
   below, so the qualifier is cast away. */
1238 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Members of the function-local "struct state" used below.  Its remaining
   members are not visible in this excerpt; the code also refers to
   state.seg_pos, state.line_pos and state.newlines. */
1242 struct segmenter segmenter;
1243 enum segment_type last_segment;
1249 struct state state, saved;
1250 enum scan_result result;
1251 struct scanner scanner;
1252 struct lex_token *token;
/* Work on a copy of the source's segmenter state; it is written back to SRC
   only after the token is complete (see the assignments to src->segmenter,
   src->seg_pos and src->line_pos further down). */
1259 state.segmenter = src->segmenter;
1261 state.seg_pos = src->seg_pos;
1262 state.line_pos = src->line_pos;
/* Start a new token at the current segment position and line start. */
1265 token = lex_push_token__ (src);
1266 scanner_init (&scanner, &token->token);
1267 token->line_pos = src->line_pos;
1268 token->token_pos = src->seg_pos;
/* The token's first line is the reader's line number plus the newlines
   counted so far; when the reader's line number is not positive the token
   gets line 0 instead. */
1269 if (src->reader->line_number > 0)
1270 token->first_line = src->reader->line_number + src->n_newlines;
1272 token->first_line = 0;
1276 enum segment_type type;
1277 const char *segment;
/* The buffer is indexed relative to src->tail, and at most
   src->head - state.seg_pos bytes are offered to the segmenter. */
1281 segment = &src->buffer[state.seg_pos - src->tail];
1282 seg_maxlen = src->head - state.seg_pos;
1283 seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
/* NOTE(review): the condition guarding this read is not visible here;
   presumably it runs when the segmenter needs more input -- confirm. */
1286 lex_source_read__ (src);
1290 state.last_segment = type;
1291 state.seg_pos += seg_len;
/* After a newline segment, the current line starts just past it. */
1292 if (type == SEG_NEWLINE)
1295 state.line_pos = state.seg_pos;
/* Hand the segment to the scanner and act on its verdict.  The bodies of
   the branches below are not visible in this excerpt. */
1298 result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1300 if (result == SCAN_SAVE)
1302 else if (result == SCAN_BACK)
1307 else if (result == SCAN_DONE)
/* Decide how many source lines to submit as syntax text items.
   suppress_next_newline is set when the last segment ended a command and
   cleared once a later token spans at least one newline; any accompanying
   adjustment of n_lines is not visible in this excerpt. */
1311 n_lines = state.newlines;
1312 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1315 src->suppress_next_newline = true;
1317 else if (n_lines > 0 && src->suppress_next_newline)
1320 src->suppress_next_newline = false;
/* Submit each line starting at journal_pos as a TEXT_ITEM_SYNTAX item, as a
   freshly allocated copy terminated by "\n" and a null byte. */
1322 for (i = 0; i < n_lines; i++)
1324 const char *newline;
1329 line = &src->buffer[src->journal_pos - src->tail];
/* NOTE(review): rawmemchr() takes no length bound, so this relies on a
   '\n' being present in the buffer at or after journal_pos. */
1330 newline = rawmemchr (line, '\n');
1331 line_len = newline - line;
/* A line ending in '\r' is special-cased; the action taken is not visible
   here (presumably the '\r' is dropped -- confirm). */
1332 if (line_len > 0 && line[line_len - 1] == '\r')
/* NOTE(review): the result of malloc() is passed to memcpy() without a null
   check.  Other allocations in this file use xmalloc()/xzalloc(), so
   xmalloc() is probably intended here. */
1335 syntax = malloc (line_len + 2);
1336 memcpy (syntax, line, line_len);
1337 syntax[line_len] = '\n';
1338 syntax[line_len + 1] = '\0';
/* The "nocopy" constructor is given the allocated string directly. */
1340 text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
/* Advance past the line and its '\n'. */
1342 src->journal_pos += newline - line + 1;
/* The token covers everything consumed since the old segment position. */
1345 token->token_len = state.seg_pos - src->seg_pos;
/* Commit the working state back into the source. */
1347 src->segmenter = state.segmenter;
1348 src->seg_pos = state.seg_pos;
1349 src->line_pos = state.line_pos;
1350 src->n_newlines += state.newlines;
/* Post-process the scanned token by type.  Several case labels and the
   statements that end each case are not visible in this excerpt. */
1352 switch (token->token.type)
1358 token->token.type = T_ENDCMD;
/* Scanner error types: each is reported through lex_get_error(), which also
   calls lex_source_pop_front() on SRC. */
1362 case SCAN_BAD_HEX_LENGTH:
1363 lex_get_error (src, _("String of hex digits has %d characters, which "
1364 "is not a multiple of 2"),
1365 (int) token->token.number);
1368 case SCAN_BAD_HEX_DIGIT:
1369 case SCAN_BAD_UNICODE_DIGIT:
1370 lex_get_error (src, _("`%c' is not a valid hex digit"),
1371 (int) token->token.number);
1374 case SCAN_BAD_UNICODE_LENGTH:
1375 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1376 "not in the valid range of 1 to 8 bytes"),
1377 (int) token->token.number);
/* NOTE(review): %04X formally expects an unsigned int, but the argument
   below is cast to int. */
1380 case SCAN_BAD_UNICODE_CODE_POINT:
1381 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1382 (int) token->token.number);
1385 case SCAN_EXPECTED_QUOTE:
1386 lex_get_error (src, _("Unterminated string constant"));
1389 case SCAN_EXPECTED_EXPONENT:
1390 lex_get_error (src, _("Missing exponent following `%s'"),
1391 token->token.string.string);
1394 case SCAN_UNEXPECTED_DOT:
1395 lex_get_error (src, _("Unexpected `.' in middle of command"));
1398 case SCAN_UNEXPECTED_CHAR:
1401 lex_get_error (src, _("Bad character %s in input"),
1402 uc_name (token->token.number, c_name));
/* NOTE(review): the case label governing this pop is not visible here. */
1407 lex_source_pop_front (src);
/* Pushes a new token onto SRC with lex_push_token__() and makes it a
   T_ENDCMD token whose position, length, line start and line number are all
   zero, i.e. it does not refer to any text in the source buffer. */
1415 lex_source_push_endcmd__ (struct lex_source *src)
1417 struct lex_token *token = lex_push_token__ (src);
1418 token->token.type = T_ENDCMD;
1419 token->token_pos = 0;
1420 token->token_len = 0;
1421 token->line_pos = 0;
1422 token->first_line = 0;
/* Creates a new lex_source that wraps READER.

   The source is allocated zero-filled with xzalloc(), its segmenter is
   initialized in the mode matching READER's syntax mode, its token deque is
   set up, and one T_ENDCMD token is pushed onto it with
   lex_source_push_endcmd__().

   NOTE(review): the return statement is not visible in this excerpt;
   presumably the new source is returned. */
1425 static struct lex_source *
1426 lex_source_create (struct lex_reader *reader)
1428 struct lex_source *src;
1429 enum segmenter_mode mode;
1431 src = xzalloc (sizeof *src);
1432 src->reader = reader;
/* Map the reader's syntax mode onto the corresponding segmenter mode.  The
   fallback for any other value is not visible in this excerpt. */
1434 if (reader->syntax == LEX_SYNTAX_AUTO)
1435 mode = SEG_MODE_AUTO;
1436 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1437 mode = SEG_MODE_INTERACTIVE;
1438 else if (reader->syntax == LEX_SYNTAX_BATCH)
1439 mode = SEG_MODE_BATCH;
1442 segmenter_init (&src->segmenter, mode);
/* Elements of the deque are lex_tokens; the 4 is presumably the initial
   capacity -- confirm against libpspp/deque.h. */
1444 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1446 lex_source_push_endcmd__ (src);
/* Tears down SRC: invokes the reader's close callback if it has one, pops
   every remaining token from the deque, and unlinks SRC from the list it
   belongs to.

   NOTE(review): statements that free memory (for example the saved
   file_name or SRC itself) are not visible in this excerpt. */
1452 lex_source_destroy (struct lex_source *src)
/* Read the file name before invoking close; presumably the reader may no
   longer be valid afterward -- confirm. */
1454 char *file_name = src->reader->file_name;
1455 if (src->reader->class->close != NULL)
1456 src->reader->class->close (src->reader);
1459 while (!deque_is_empty (&src->deque))
1460 lex_source_pop__ (src);
1462 ll_remove (&src->ll);
/* A lex_reader that takes its input from a u8_istream.

   The embedded "reader" member lets lex_file_reader_cast() recover this
   structure from a generic lex_reader pointer.  Code elsewhere in this file
   also accesses a file_name member, which is not visible in this excerpt. */
1466 struct lex_file_reader
1468 struct lex_reader reader;
1469 struct u8_istream *istream;
/* Forward declaration; the definition appears after the callbacks that it
   refers to. */
1473 static struct lex_reader_class lex_file_reader_class;
1475 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1476 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1477 ENCODING, which should take one of the forms accepted by
1478 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1479 mode of the new reader, respectively.
1481 Returns a null pointer if FILE_NAME cannot be opened. */
1483 lex_reader_for_file (const char *file_name, const char *encoding,
1484 enum lex_syntax_mode syntax,
1485 enum lex_error_mode error)
1487 struct lex_file_reader *r;
1488 struct u8_istream *istream;
/* "-" wraps the already-open standard input descriptor; any other name is
   opened read-only. */
1490 istream = (!strcmp(file_name, "-")
1491 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1492 : u8_istream_for_file (encoding, file_name, O_RDONLY));
/* On failure, report the reason taken from errno. */
1493 if (istream == NULL)
1495 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1499 r = xmalloc (sizeof *r);
1500 lex_reader_init (&r->reader, &lex_file_reader_class);
1501 r->reader.syntax = syntax;
1502 r->reader.error = error;
/* The generic reader and the file reader each keep their own copy of the
   name; lex_file_close() frees only r->file_name. */
1503 r->reader.file_name = xstrdup (file_name);
/* Line numbering for the file starts at 1. */
1504 r->reader.line_number = 1;
1505 r->istream = istream;
1506 r->file_name = xstrdup (file_name);
/* Returns the lex_file_reader that contains R as its "reader" member.  R
   must actually be embedded in a struct lex_file_reader; UP_CAST does
   pointer arithmetic only and cannot verify this. */
1511 static struct lex_file_reader *
1512 lex_file_reader_cast (struct lex_reader *r)
1514 return UP_CAST (r, struct lex_file_reader, reader);
/* Read callback for file readers: reads up to N bytes from the reader's
   u8_istream into BUF.  PROMPT_STYLE is ignored.

   NOTE(review): the return type, the condition guarding the error message
   and the return statement are not visible in this excerpt; presumably the
   message is issued when u8_istream_read() reports failure -- confirm. */
1518 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1519 enum prompt_style prompt_style UNUSED)
1521 struct lex_file_reader *r = lex_file_reader_cast (r_);
1522 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1525 msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
/* Close callback for file readers: releases the reader's stream and its
   copy of the file name.

   A stream that is not backed by standard input is closed with
   u8_istream_close(), and a failure to close is reported. */
1532 lex_file_close (struct lex_reader *r_)
1534 struct lex_file_reader *r = lex_file_reader_cast (r_);
1536 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1538 if (u8_istream_close (r->istream) != 0)
1539 msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
/* NOTE(review): presumably this is the standard-input branch, where the
   stream object is freed without closing the descriptor; the "else" is not
   visible in this excerpt -- confirm. */
1542 u8_istream_free (r->istream);
1544 free (r->file_name);
/* The lex_reader_class for file readers, passed to lex_reader_init() by
   lex_reader_for_file().  The initializer is not visible in this excerpt;
   presumably it installs lex_file_read() and lex_file_close() -- confirm. */
1548 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that takes its input from an in-memory string.

   The embedded "reader" member lets lex_string_reader_cast() recover this
   structure from a generic lex_reader pointer.  The remaining members are
   not visible in this excerpt; lex_string_read() uses r->s (with "string"
   and "length" fields) and r->offset. */
1554 struct lex_string_reader
1556 struct lex_reader reader;
/* Forward declaration; the definition appears after the callbacks that it
   refers to. */
1561 static struct lex_reader_class lex_string_reader_class;
1563 /* Creates and returns a new lex_reader for the contents of S, which must be
1564 encoded in UTF-8. The new reader takes ownership of S and will free it
1565 with ss_dealloc() when it is closed. */
1567 lex_reader_for_substring_nocopy (struct substring s)
1569 struct lex_string_reader *r;
1571 r = xmalloc (sizeof *r);
1572 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always use interactive syntax mode.  The statements that
   store S in the reader and return it are not visible in this excerpt. */
1573 r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
1580 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1581 which must be encoded in UTF-8. The caller retains ownership of S. */
1583 lex_reader_for_string (const char *s)
1585 struct substring ss;
/* Make a private copy of S and hand it to the no-copy constructor, which
   takes ownership of it. */
1586 ss_alloc_substring (&ss, ss_cstr (s));
1587 return lex_reader_for_substring_nocopy (ss);
1590 /* Formats FORMAT as a printf()-like format string and creates and returns a
1591 new lex_reader for the formatted result. */
1593 lex_reader_for_format (const char *format, ...)
1595 struct lex_reader *r;
1598 va_start (args, format);
/* The string produced by xvasprintf() is wrapped as a substring and passed
   to the no-copy constructor, so the reader becomes its owner. */
1599 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)));
/* Returns the lex_string_reader that contains R as its "reader" member.  R
   must actually be embedded in a struct lex_string_reader; UP_CAST does
   pointer arithmetic only and cannot verify this. */
1605 static struct lex_string_reader *
1606 lex_string_reader_cast (struct lex_reader *r)
1608 return UP_CAST (r, struct lex_string_reader, reader);
/* Read callback for string readers: copies the next chunk of the reader's
   string into BUF.  PROMPT_STYLE is ignored.

   NOTE(review): the return type, the update of r->offset and the return
   statement are not visible in this excerpt; presumably the offset advances
   by the chunk size and that size is returned -- confirm. */
1612 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1613 enum prompt_style prompt_style UNUSED)
1615 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Copy no more than the caller asked for and no more than what remains
   after the current offset. */
1618 chunk = MIN (n, r->s.length - r->offset);
1619 memcpy (buf, r->s.string + r->offset, chunk);
/* Close callback for string readers.  Only the conversion to the concrete
   reader type is visible in this excerpt.  According to the comment on
   lex_reader_for_substring_nocopy(), closing the reader is expected to free
   its string with ss_dealloc(). */
1626 lex_string_close (struct lex_reader *r_)
1628 struct lex_string_reader *r = lex_string_reader_cast (r_);
1634 static struct lex_reader_class lex_string_reader_class =