1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "data/file-name.h"
34 #include "language/command.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/text-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* Location of token in terms of the lex_source's buffer.
66 src->tail <= line_pos <= token_pos <= src->head. */
/* The positions below are offsets into the whole source; code in this file
   subtracts src->tail from them to index src->buffer. */
67 size_t token_pos; /* Start of token. */
68 size_t token_len; /* Length of source for token in bytes. */
69 size_t line_pos; /* Start of line containing token_pos. */
/* first_line is set to 0 when the reader's line_number is not positive. */
70 int first_line; /* Line number at token_pos. */
73 /* A source of tokens, corresponding to a syntax file.
75 This is conceptually a lex_reader wrapped with everything needed to convert
76 its UTF-8 bytes into tokens. */
79 struct ll ll; /* In lexer's list of sources. */
/* Supplies the raw bytes: lex_source_read__() calls reader->class->read(). */
80 struct lex_reader *reader;
81 struct segmenter segmenter;
82 bool eof; /* True if T_STOP was read from 'reader'. */
84 /* Buffer of UTF-8 bytes. */
86 size_t allocated; /* Number of bytes allocated. */
87 size_t tail; /* &buffer[0] offset into UTF-8 source. */
88 size_t head; /* &buffer[head - tail] offset into source. */
90 /* Positions in source file, tail <= pos <= head for each member here. */
91 size_t journal_pos; /* First byte not yet output to journal. */
92 size_t seg_pos; /* First byte not yet scanned as token. */
93 size_t line_pos; /* First byte of line containing seg_pos. */
/* n_newlines is added to reader->line_number to obtain a new token's
   first_line. */
95 int n_newlines; /* Number of new-lines up to seg_pos. */
96 bool suppress_next_newline;
/* New tokens are pushed at the front of the deque; the current (oldest) token
   is at the back. */
99 struct deque deque; /* Indexes into 'tokens'. */
100 struct lex_token *tokens; /* Lookahead tokens for parser. */
/* Forward declarations; used by lex_include(), lex_append() and
   lex_destroy() below. */
103 static struct lex_source *lex_source_create (struct lex_reader *);
104 static void lex_source_destroy (struct lex_source *);
/* lex_source__() treats the head of this list as the source currently being
   read. */
109 struct ll_list sources; /* Contains "struct lex_source"s. */
/* Forward declarations of static helpers. */
112 static struct lex_source *lex_source__ (const struct lexer *);
113 static const struct lex_token *lex_next__ (const struct lexer *, int n);
114 static void lex_source_push_endcmd__ (struct lex_source *);
116 static void lex_source_pop__ (struct lex_source *);
117 static bool lex_source_get__ (const struct lex_source *);
118 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
119 const char *format, va_list)
120 PRINTF_FORMAT (4, 0);
121 static const struct lex_token *lex_source_next__ (const struct lex_source *,
124 /* Initializes READER with the specified CLASS and otherwise some reasonable
125 defaults. The caller should fill in the other members as desired. */
127 lex_reader_init (struct lex_reader *reader,
128 const struct lex_reader_class *class)
/* Defaults: automatic syntax mode, interactive error mode, no file name, and
   line number 0. */
130 reader->class = class;
131 reader->syntax = LEX_SYNTAX_AUTO;
132 reader->error = LEX_ERROR_INTERACTIVE;
133 reader->file_name = NULL;
134 reader->line_number = 0;
137 /* Frees any file name already in READER and replaces it by a copy of
138 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
/* The old name is released unconditionally; READER then holds its own
   xstrdup() copy, so the caller keeps ownership of FILE_NAME. */
142 free (reader->file_name);
143 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
146 /* Creates and returns a new lexer. */
/* The lexer is allocated with xzalloc() and its list of sources is
   initialized with ll_init(). */
150 struct lexer *lexer = xzalloc (sizeof *lexer);
151 ll_init (&lexer->sources);
155 /* Destroys LEXER. */
157 lex_destroy (struct lexer *lexer)
161 struct lex_source *source, *next;
/* Every source still on the list is destroyed.  NOTE(review): the _safe
   iterator takes NEXT as a second cursor, presumably so that SOURCE may be
   removed during iteration -- confirm in libpspp/ll.h. */
163 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
164 lex_source_destroy (source);
169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
170 READER. Before the call, LEXER must either be empty or at a T_ENDCMD token. */
173 lex_include (struct lexer *lexer, struct lex_reader *reader)
/* Check the precondition, then wrap READER in a new lex_source and put it at
   the head of the source list, which lex_source__() treats as the current
   source. */
175 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
176 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
179 /* Appends READER to LEXER, so that it will be read after all other current
180 readers have already been read. */
182 lex_append (struct lexer *lexer, struct lex_reader *reader)
/* The new source goes at the tail of the list, behind every existing
   source. */
184 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
/* Adds a new token at the front of SRC's deque, first growing the token array
   if the deque is full, and initializes its embedded "struct token".
   lex_source_get__() uses the result as the token it is about to fill in. */
189 static struct lex_token *
190 lex_push_token__ (struct lex_source *src)
192 struct lex_token *token;
194 if (deque_is_full (&src->deque))
195 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
197 token = &src->tokens[deque_push_front (&src->deque)];
198 token_init (&token->token);
/* Removes the token at the back of SRC's deque, which is the current (oldest)
   token, and destroys it. */
203 lex_source_pop__ (struct lex_source *src)
205 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
/* Removes the token at the front of SRC's deque, which is the most recently
   pushed token, and destroys it. */
209 lex_source_pop_front (struct lex_source *src)
211 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
214 /* Advances LEXER to the next token, consuming the current token. */
216 lex_get (struct lexer *lexer)
218 struct lex_source *src;
220 src = lex_source__ (lexer);
/* Drop the current token, if there is one. */
224 if (!deque_is_empty (&src->deque))
225 lex_source_pop__ (src);
/* Refill until a token is available.  A source that cannot supply one is
   destroyed and the next source on the list takes over. */
227 while (deque_is_empty (&src->deque))
228 if (!lex_source_get__ (src))
230 lex_source_destroy (src);
231 src = lex_source__ (lexer);
237 /* Issuing errors. */
239 /* Prints a syntax error message containing the current token and
240 given message MESSAGE (if non-null). */
/* The message is given by FORMAT, a printf-style format string, and the
   arguments after it; they are forwarded to lex_next_error_valist() for the
   token range 0...0, i.e. the current token only. */
242 lex_error (struct lexer *lexer, const char *format, ...)
246 va_start (args, format);
247 lex_next_error_valist (lexer, 0, 0, format, args);
251 /* Prints a syntax error message containing the current token and
252 given message MESSAGE (if non-null). */
/* Same as lex_error(), except that the arguments for FORMAT are supplied as
   the va_list ARGS. */
254 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
256 lex_next_error_valist (lexer, 0, 0, format, args);
259 /* Prints a syntax error message containing the current token and
260 given message MESSAGE (if non-null). */
/* Unlike lex_error(), the range of tokens reported is given by N0 and N1,
   counted ahead of the current token, which are passed on to
   lex_next_error_valist() along with FORMAT and its arguments. */
262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
266 va_start (args, format);
267 lex_next_error_valist (lexer, n0, n1, format, args);
/* Prints a syntax error message saying that OPTION0 or one of the other
   strings following it, up to the first NULL, is expected.

   The argument list must be terminated by a null pointer.  At most
   MAX_OPTIONS (8) options are reported; any beyond the eighth are ignored.
   If OPTION0 itself is null, a plain syntax error is reported. */
void
lex_error_expecting (struct lexer *lexer, const char *option0, ...)
{
  enum { MAX_OPTIONS = 8 };
  const char *options[MAX_OPTIONS + 1];
  va_list args;
  int n;

  /* Collect the options into OPTIONS[0...N-1].  The array has one extra slot
     so that OPTIONS[N] can receive the terminating null (or the first
     ignored option) even when N reaches MAX_OPTIONS.  The bound must be
     N < MAX_OPTIONS: with N + 1 < MAX_OPTIONS an eighth option would be
     silently dropped and the eight-option message could never be used. */
  va_start (args, option0);
  options[0] = option0;
  n = 0;
  while (n < MAX_OPTIONS && options[n] != NULL)
    options[++n] = va_arg (args, const char *);
  va_end (args);

  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      break;

    case 1:
      lex_error (lexer, _("expecting %s"), options[0]);
      break;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
      break;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
                 options[2]);
      break;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3]);
      break;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4]);
      break;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5]);
      break;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6]);
      break;

    default:
      /* N == MAX_OPTIONS, the only remaining possibility given the loop
         bound above. */
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6], options[7]);
      break;
    }
}
340 /* Reports an error to the effect that subcommand SBC may only be specified once.
343 This function does not take a lexer as an argument or use lex_error(),
344 because the result would ordinarily just be redundant: "Syntax error at
345 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
346 not help the user find the error. */
348 lex_sbc_only_once (const char *sbc)
350 msg (SE, _("Subcommand %s may only be specified once."), sbc);
353 /* Reports an error to the effect that subcommand SBC is missing.
355 This function does not take a lexer as an argument or use lex_error(),
356 because a missing subcommand can normally be detected only after the whole
357 command has been parsed, and so lex_error() would always report "Syntax
358 error at end of command", which does not help the user find the error. */
360 lex_sbc_missing (const char *sbc)
/* SBC is inserted into the message as the subcommand's name. */
362 msg (SE, _("Required subcommand %s was not specified."), sbc);
365 /* Prints a syntax error message containing the current token and
366 given message MESSAGE (if non-null). */
/* N0 and N1 select the range of tokens, counted ahead of the current token,
   that the error is reported against; FORMAT and ARGS give the message. */
368 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
369 const char *format, va_list args)
371 struct lex_source *src = lex_source__ (lexer);
/* NOTE(review): the branch conditions are not visible in this view.
   Presumably the next call handles the case where LEXER has a current source,
   and the "end of input" message below is built only when it has none. */
374 lex_source_error_valist (src, n0, n1, format, args);
380 ds_put_format (&s, _("Syntax error at end of input"));
383 ds_put_cstr (&s, ": ");
384 ds_put_vformat (&s, format, args);
386 ds_put_byte (&s, '.');
387 msg (SE, "%s", ds_cstr (&s));
392 /* Checks that we're at end of command.
393 If so, returns a successful command completion code.
394 If not, flags a syntax error and returns an error command code. */
397 lex_end_of_command (struct lexer *lexer)
/* Both T_ENDCMD and T_STOP count as the end of the command; any other token
   is reported as a syntax error. */
399 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
401 lex_error (lexer, _("expecting end of command"));
408 /* Token testing functions. */
410 /* Returns true if the current token is a number. */
/* Shorthand for lex_next_is_number (LEXER, 0). */
412 lex_is_number (struct lexer *lexer)
414 return lex_next_is_number (lexer, 0);
417 /* Returns true if the current token is a string. */
/* Shorthand for lex_next_is_string (LEXER, 0). */
419 lex_is_string (struct lexer *lexer)
421 return lex_next_is_string (lexer, 0);
424 /* Returns the value of the current token, which must be a
425 floating point number. */
/* Shorthand for lex_next_number (LEXER, 0), which asserts that the token is a
   number. */
427 lex_number (struct lexer *lexer)
429 return lex_next_number (lexer, 0);
432 /* Returns true iff the current token is an integer. */
/* Shorthand for lex_next_is_integer (LEXER, 0). */
434 lex_is_integer (struct lexer *lexer)
436 return lex_next_is_integer (lexer, 0);
439 /* Returns the value of the current token, which must be an integer. */
/* Shorthand for lex_next_integer (LEXER, 0), which asserts that the current
   token is an integer. */
442 lex_integer (struct lexer *lexer)
444 return lex_next_integer (lexer, 0);
447 /* Token testing functions with lookahead.
449 A value of 0 for N as an argument to any of these functions refers to the
450 current token. Lookahead is limited to the current command. Any N greater
451 than the number of tokens remaining in the current command will be treated
452 as referring to a T_ENDCMD token. */
454 /* Returns true if the token N ahead of the current token is a number. */
456 lex_next_is_number (struct lexer *lexer, int n)
/* Both positive (T_POS_NUM) and negative (T_NEG_NUM) number tokens count. */
458 enum token_type next_token = lex_next_token (lexer, n);
459 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
462 /* Returns true if the token N ahead of the current token is a string. */
/* Only tokens of type T_STRING count. */
464 lex_next_is_string (struct lexer *lexer, int n)
466 return lex_next_token (lexer, n) == T_STRING;
469 /* Returns the value of the token N ahead of the current token, which must be a
470 floating point number. */
472 lex_next_number (struct lexer *lexer, int n)
/* The precondition is checked with assert(). */
474 assert (lex_next_is_number (lexer, n));
475 return lex_next_tokval (lexer, n);
/* Returns true if the token N ahead of the current token is an integer, that
   is, a number token whose value has no fractional part and lies strictly
   between LONG_MIN and 2**(width of long - 1), so that it can be converted
   to "long" without overflow. */
bool
lex_next_is_integer (struct lexer *lexer, int n)
{
  double value;

  if (!lex_next_is_number (lexer, n))
    return false;

  value = lex_next_tokval (lexer, n);

  /* The upper bound is written as -(double) LONG_MIN, which is exactly
     representable, rather than as LONG_MAX: where "long" is 64 bits wide,
     LONG_MAX converted to double rounds up to 2**63, so "value <= LONG_MAX"
     would accept 2**63, which does not fit in a long. */
  return (value > LONG_MIN
          && value < -(double) LONG_MIN
          && floor (value) == value);
}
491 /* Returns the value of the token N ahead of the current token, which must be an integer. */
494 lex_next_integer (struct lexer *lexer, int n)
/* The precondition that the token is an integer is checked with assert();
   the token's numeric value is then returned. */
496 assert (lex_next_is_integer (lexer, n));
497 return lex_next_tokval (lexer, n);
500 /* Token matching functions. */
502 /* If the current token has the specified TYPE, skips it and returns true.
503 Otherwise, returns false. */
505 lex_match (struct lexer *lexer, enum token_type type)
/* Only the current token's type is compared. */
507 if (lex_token (lexer) == type)
516 /* If the current token matches IDENTIFIER, skips it and returns true.
517 IDENTIFIER may be abbreviated to its first three letters. Otherwise, returns false.
520 IDENTIFIER must be an ASCII string. */
/* Equivalent to lex_match_id_n (LEXER, IDENTIFIER, 3). */
522 lex_match_id (struct lexer *lexer, const char *identifier)
524 return lex_match_id_n (lexer, identifier, 3);
527 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
528 may be abbreviated to its first N letters. Otherwise, returns false.
530 IDENTIFIER must be an ASCII string. */
532 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
/* The token must be a T_ID whose text lex_id_match_n() accepts as a match for
   IDENTIFIER given N. */
534 if (lex_token (lexer) == T_ID
535 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
544 /* If the current token is integer X, skips it and returns true. Otherwise, returns false. */
547 lex_match_int (struct lexer *lexer, int x)
/* lex_integer() is only called once lex_is_integer() has succeeded, which
   satisfies its assertion. */
549 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
558 /* Forced matches. */
560 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
561 abbreviated to its first 3 letters. Otherwise, reports an error and returns false.
564 IDENTIFIER must be an ASCII string. */
566 lex_force_match_id (struct lexer *lexer, const char *identifier)
568 if (lex_match_id (lexer, identifier))
/* IDENTIFIER is the only expected option; NULL_SENTINEL ends the list. */
572 lex_error_expecting (lexer, identifier, NULL_SENTINEL);
577 /* If the current token has the specified TYPE, skips it and returns true.
578 Otherwise, reports an error and returns false. */
580 lex_force_match (struct lexer *lexer, enum token_type type)
582 if (lex_token (lexer) == type)
/* The expected token's text is quoted as `...' in a string built with
   xasprintf(), which becomes the only option passed to
   lex_error_expecting(). */
589 char *s = xasprintf ("`%s'", token_type_to_string (type));
590 lex_error_expecting (lexer, s, NULL_SENTINEL);
596 /* If the current token is a string, does nothing and returns true.
597 Otherwise, reports an error and returns false. */
/* The token is never consumed here; only its type is tested. */
599 lex_force_string (struct lexer *lexer)
601 if (lex_is_string (lexer))
605 lex_error (lexer, _("expecting string"));
610 /* If the current token is an integer, does nothing and returns true.
611 Otherwise, reports an error and returns false. */
/* "Integer" is as defined by lex_is_integer(). */
613 lex_force_int (struct lexer *lexer)
615 if (lex_is_integer (lexer))
619 lex_error (lexer, _("expecting integer"));
624 /* If the current token is a number, does nothing and returns true.
625 Otherwise, reports an error and returns false. */
/* "Number" is as defined by lex_is_number(). */
627 lex_force_num (struct lexer *lexer)
629 if (lex_is_number (lexer))
632 lex_error (lexer, _("expecting number"));
636 /* If the current token is an identifier, does nothing and returns true.
637 Otherwise, reports an error and returns false. */
/* An identifier is a token of type T_ID. */
639 lex_force_id (struct lexer *lexer)
641 if (lex_token (lexer) == T_ID)
644 lex_error (lexer, _("expecting identifier"));
648 /* Token accessors. */
650 /* Returns the type of LEXER's current token. */
/* Shorthand for lex_next_token (LEXER, 0). */
652 lex_token (const struct lexer *lexer)
654 return lex_next_token (lexer, 0);
657 /* Returns the number in LEXER's current token.
659 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
660 tokens this function will always return zero. */
/* Shorthand for lex_next_tokval (LEXER, 0). */
662 lex_tokval (const struct lexer *lexer)
664 return lex_next_tokval (lexer, 0);
667 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
669 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
670 this function will always return NULL.
672 The UTF-8 encoding of the returned string is correct for variable names and
673 other identifiers. Use filename_to_utf8() to use it as a filename. Use
674 data_in() to use it in a "union value". */
/* Shorthand for lex_next_tokcstr (LEXER, 0). */
676 lex_tokcstr (const struct lexer *lexer)
678 return lex_next_tokcstr (lexer, 0);
681 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
682 null-terminated (but the null terminator is not included in the returned
683 substring's 'length').
685 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
686 this function will always return NULL.
688 The UTF-8 encoding of the returned string is correct for variable names and
689 other identifiers. Use filename_to_utf8() to use it as a filename. Use
690 data_in() to use it in a "union value". */
/* Shorthand for lex_next_tokss (LEXER, 0). */
692 lex_tokss (const struct lexer *lexer)
694 return lex_next_tokss (lexer, 0);
699 /* A value of 0 for N as an argument to any of these functions refers to the
700 current token. Lookahead is limited to the current command. Any N greater
701 than the number of tokens remaining in the current command will be treated
702 as referring to a T_ENDCMD token. */
/* Returns the lex_token N tokens after the current one in LEXER's current
   source.  NOTE(review): the handling of a lexer that has no source is not
   visible in this view; presumably stop_token is returned in that case. */
704 static const struct lex_token *
705 lex_next__ (const struct lexer *lexer_, int n)
/* The const qualifier is cast away before looking up the current source. */
707 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
708 struct lex_source *src = lex_source__ (lexer);
711 return lex_source_next__ (src, n);
/* A constant token of type T_STOP with value 0.0, an empty string, and zeroed
   position fields. */
714 static const struct lex_token stop_token =
715 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
/* Returns the token N positions after SRC's current token, obtaining more
   tokens from SRC while fewer than N + 1 are queued.  The most recently read
   token is checked for T_STOP and T_ENDCMD before reading further. */
721 static const struct lex_token *
722 lex_source_next__ (const struct lex_source *src, int n)
724 while (deque_count (&src->deque) <= n)
726 if (!deque_is_empty (&src->deque))
728 struct lex_token *front;
/* FRONT is the token most recently pushed onto the deque. */
730 front = &src->tokens[deque_front (&src->deque, 0)];
731 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
735 lex_source_get__ (src);
/* Index N counted from the back of the deque, where the current token is. */
738 return &src->tokens[deque_back (&src->deque, n)];
741 /* Returns the "struct token" of the token N after the current one in LEXER.
742 The returned pointer can be invalidated by pretty much any succeeding call
743 into the lexer, although the string pointer within the returned token is
744 only invalidated by consuming the token (e.g. with lex_get()). */
/* The result points at the "token" member of the lex_token that lex_next__()
   returns. */
746 lex_next (const struct lexer *lexer, int n)
748 return &lex_next__ (lexer, n)->token;
751 /* Returns the type of the token N after the current one in LEXER. */
/* Reads the "type" member of the token that lex_next() returns. */
753 lex_next_token (const struct lexer *lexer, int n)
755 return lex_next (lexer, n)->type;
758 /* Returns the number in the token N after the current one in LEXER.
760 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
761 tokens this function will always return zero. */
763 lex_next_tokval (const struct lexer *lexer, int n)
765 const struct token *token = lex_next (lexer, n);
766 return token->number;
769 /* Returns the null-terminated string in the token N after the current one, in UTF-8 encoding.
772 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
773 this function will always return NULL.
775 The UTF-8 encoding of the returned string is correct for variable names and
776 other identifiers. Use filename_to_utf8() to use it as a filename. Use
777 data_in() to use it in a "union value". */
/* The result is the "string" pointer of the substring that lex_next_tokss()
   returns. */
779 lex_next_tokcstr (const struct lexer *lexer, int n)
781 return lex_next_tokss (lexer, n).string;
784 /* Returns the string in the token N after the current one, in UTF-8 encoding.
785 The string is null-terminated (but the null terminator is not included in
786 the returned substring's 'length').
788 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
789 this function will always return NULL.
791 The UTF-8 encoding of the returned string is correct for variable names and
792 other identifiers. Use filename_to_utf8() to use it as a filename. Use
793 data_in() to use it in a "union value". */
/* Reads the "string" member of the token that lex_next() returns. */
795 lex_next_tokss (const struct lexer *lexer, int n)
797 return lex_next (lexer, n)->string;
800 /* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
801 true. Otherwise, returns false.
803 S may consist of an arbitrary number of identifiers, integers, and
804 punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
805 Identifiers may be abbreviated to their first three letters. Currently only
806 hyphens, slashes, and equals signs are supported as punctuation (but it
807 would be easy to add more).
809 S must be an ASCII string. */
811 lex_match_phrase (struct lexer *lexer, const char *s)
/* TOK_IDX is the lookahead index of the token compared against the next
   element of S. */
815 for (tok_idx = 0; ; tok_idx++)
817 enum token_type token;
/* Skip white space in S before the next element. */
820 while (c_isspace (*s))
/* NOTE(review): the loop body is not visible in this view; presumably this
   runs once S is exhausted and consumes the TOK_IDX tokens that matched. */
828 for (i = 0; i < tok_idx; i++)
833 token = lex_next_token (lexer, tok_idx);
843 if (token != T_SLASH)
849 if (token != T_EQUALS)
854 case '0': case '1': case '2': case '3': case '4':
855 case '5': case '6': case '7': case '8': case '9':
859 if (token != T_POS_NUM)
/* Accumulate the decimal integer spelled out in S, advancing S past it. */
865 value = value * 10 + (*s++ - '0');
867 while (c_isdigit (*s));
869 if (lex_next_tokval (lexer, tok_idx) != value)
/* Identifier element: LEN is the length lex_id_get_length() reports for the
   identifier at the start of S, which is compared against the token's
   text. */
882 len = lex_id_get_length (ss_cstr (s));
883 if (!lex_id_match (ss_buffer (s, len),
884 lex_next_tokss (lexer, tok_idx)))
/* Returns the first_line recorded in the token N after SRC's current one. */
896 lex_source_get_first_line_number (const struct lex_source *src, int n)
898 return lex_source_next__ (src, n)->first_line;
/* Scans the LENGTH bytes starting at S for new-line bytes, with one memchr()
   call per new-line found. */
902 count_newlines (char *s, size_t length)
907 while ((newline = memchr (s, '\n', length)) != NULL)
/* Shrink LENGTH by the bytes up to and including this new-line. */
910 length -= (newline + 1) - s;
/* Computes, for the token N after SRC's current one, its first line number
   plus the number of new-lines inside the token's own syntax, plus 1. */
918 lex_source_get_last_line_number (const struct lex_source *src, int n)
920 const struct lex_token *token = lex_source_next__ (src, n);
/* A first_line of 0 is handled as a special case. */
922 if (token->first_line == 0)
/* The token's bytes start at offset token_pos - tail within the buffer. */
926 char *token_str = &src->buffer[token->token_pos - src->tail];
927 return token->first_line + count_newlines (token_str, token->token_len) + 1;
/* NOTE(review): appears to measure the display width, in columns, of the
   LENGTH bytes of UTF-8 text at S_; the accumulation and return statements
   are not visible in this view. */
932 count_columns (const char *s_, size_t length)
934 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
940 for (ofs = 0; ofs < length; ofs += mblen)
/* Decode one character; MBLEN is the number of bytes it occupies. */
944 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
947 int width = uc_width (uc, "UTF-8");
/* Advance to a multiple of 8 (presumably for a tab; the condition is not
   visible in this view). */
952 columns = ROUND_UP (columns + 1, 8);
/* Measures, with count_columns(), the text from the start of the line that
   contains the token N after SRC's current one up to the start of that
   token. */
959 lex_source_get_first_column (const struct lex_source *src, int n)
961 const struct lex_token *token = lex_source_next__ (src, n);
962 return count_columns (&src->buffer[token->line_pos - src->tail],
963 token->token_pos - token->line_pos);
/* Like lex_source_get_first_column(), but measures up to the end of the
   token rather than its start. */
967 lex_source_get_last_column (const struct lex_source *src, int n)
969 const struct lex_token *token = lex_source_next__ (src, n);
970 char *start, *end, *newline;
/* START is the beginning of the token's first line; END is just past the
   token's last byte. */
972 start = &src->buffer[token->line_pos - src->tail];
973 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
/* Find the last new-line before END, if any. */
974 newline = memrchr (start, '\n', end - start);
977 return count_columns (start, end - start);
980 /* Returns the 1-based line number of the start of the syntax that represents
981 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
982 if the token is drawn from a source that does not have line numbers. */
984 lex_get_first_line_number (const struct lexer *lexer, int n)
/* A lexer with no source yields 0. */
986 const struct lex_source *src = lex_source__ (lexer);
987 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
990 /* Returns the 1-based line number of the end of the syntax that represents the
991 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
992 token or if the token is drawn from a source that does not have line numbers.
995 Most of the time, a single token is wholly within a single line of syntax,
996 but there are two exceptions: a T_STRING token can be made up of multiple
997 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
998 token can consist of a "-" on one line followed by the number on the next. */
/* Delegates to lex_source_get_last_line_number() for LEXER's current source;
   a lexer with no source yields 0. */
1001 lex_get_last_line_number (const struct lexer *lexer, int n)
1003 const struct lex_source *src = lex_source__ (lexer);
1004 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1007 /* Returns the 1-based column number of the start of the syntax that represents
1008 the token N after the current one in LEXER. Returns 0 for a T_STOP token.
1011 Column numbers are measured according to the width of characters as shown in
1012 a typical fixed-width font, in which CJK characters have width 2 and
1013 combining characters have width 0. */
1015 lex_get_first_column (const struct lexer *lexer, int n)
/* A lexer with no source yields 0. */
1017 const struct lex_source *src = lex_source__ (lexer);
1018 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1021 /* Returns the 1-based column number of the end of the syntax that represents
1022 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP token.
1025 Column numbers are measured according to the width of characters as shown in
1026 a typical fixed-width font, in which CJK characters have width 2 and
1027 combining characters have width 0. */
1029 lex_get_last_column (const struct lexer *lexer, int n)
/* A lexer with no source yields 0. */
1031 const struct lex_source *src = lex_source__ (lexer);
1032 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1035 /* Returns the name of the syntax file from which the current command is drawn.
1036 Returns NULL for a T_STOP token or if the command's source does not have a file name.
1039 There is no version of this function that takes an N argument because
1040 lookahead only works to the end of a command and any given command is always
1041 within a single syntax file. */
1043 lex_get_file_name (const struct lexer *lexer)
/* A lexer with no source yields NULL; otherwise the result is the reader's
   own file_name pointer, not a copy. */
1045 struct lex_source *src = lex_source__ (lexer);
1046 return src == NULL ? NULL : src->reader->file_name;
1049 /* Returns the syntax mode for the syntax file from which the current command is
1050 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1051 source does not have line numbers.
1053 There is no version of this function that takes an N argument because
1054 lookahead only works to the end of a command and any given command is always
1055 within a single syntax file. */
1056 enum lex_syntax_mode
1057 lex_get_syntax_mode (const struct lexer *lexer)
/* LEX_SYNTAX_AUTO is returned when LEXER has no source; otherwise the mode
   stored in the current source's reader is returned. */
1059 struct lex_source *src = lex_source__ (lexer);
1060 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1063 /* Returns the error mode for the syntax file from which the current command is
1064 drawn. Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if the command's
1065 source does not have line numbers.
1067 There is no version of this function that takes an N argument because
1068 lookahead only works to the end of a command and any given command is always
1069 within a single syntax file. */
1071 lex_get_error_mode (const struct lexer *lexer)
/* LEX_ERROR_INTERACTIVE is returned when LEXER has no source; otherwise the
   mode stored in the current source's reader is returned. */
1073 struct lex_source *src = lex_source__ (lexer);
1074 return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
1077 /* If the source that LEXER is currently reading has error mode
1078 LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
1079 next token to be read comes directly from whatever is next read from the source.
1082 It makes sense to call this function after encountering an error in a
1083 command entered on the console, because usually the user would prefer not to
1084 have cascading errors. */
1086 lex_interactive_reset (struct lexer *lexer)
1088 struct lex_source *src = lex_source__ (lexer);
1089 if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
/* Forget all buffered bytes and every position derived from them. */
1091 src->head = src->tail = 0;
1092 src->journal_pos = src->seg_pos = src->line_pos = 0;
1093 src->n_newlines = 0;
1094 src->suppress_next_newline = false;
/* Restart segmentation, keeping the segmenter's current mode. */
1095 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
/* Drop every queued token.  NOTE(review): lex_source_push_endcmd__() is not
   visible in this view; presumably it queues a T_ENDCMD token. */
1096 while (!deque_is_empty (&src->deque))
1097 lex_source_pop__ (src);
1098 lex_source_push_endcmd__ (src);
1102 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1104 lex_discard_rest_of_command (struct lexer *lexer)
/* The loop ends when the current token is T_STOP or T_ENDCMD. */
1106 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1110 /* Discards all lookahead tokens in LEXER, then discards all input sources
1111 until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
1112 runs out of input sources. */
1114 lex_discard_noninteractive (struct lexer *lexer)
1116 struct lex_source *src = lex_source__ (lexer);
/* Empty the current source's token queue. */
1120 while (!deque_is_empty (&src->deque))
1121 lex_source_pop__ (src);
/* Destroy sources from the head of the list until an interactive one becomes
   current or none remain. */
1123 for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
1124 src = lex_source__ (lexer))
1125 lex_source_destroy (src);
/* Computes the minimum of SRC's journal_pos, SRC's line_pos, and the line_pos
   of SRC's oldest token.  lex_source_expand__() uses the result as the
   furthest position to which SRC's tail may advance. */
1130 lex_source_max_tail__ (const struct lex_source *src)
1132 const struct lex_token *token;
1135 assert (src->seg_pos >= src->line_pos);
1136 max_tail = MIN (src->journal_pos, src->line_pos);
1138 /* Use the oldest token also. (We know that src->deque cannot be empty
1139 because we are in the process of adding a new token, which is already
1140 initialized enough to use here.) */
1141 token = &src->tokens[deque_back (&src->deque, 0)];
1142 assert (token->token_pos >= token->line_pos);
1143 max_tail = MIN (max_tail, token->line_pos);
/* Makes room at the head of SRC's buffer when the buffered bytes fill the
   whole allocation, either by discarding bytes at the tail that are no longer
   needed or by enlarging the buffer. */
1149 lex_source_expand__ (struct lex_source *src)
1151 if (src->head - src->tail >= src->allocated)
1153 size_t max_tail = lex_source_max_tail__ (src);
1154 if (max_tail > src->tail)
1156 /* Advance the tail, freeing up room at the head. */
/* The bytes from MAX_TAIL up to the head are moved to the start of the
   buffer. */
1157 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1158 src->head - max_tail);
1159 src->tail = max_tail;
1163 /* Buffer is completely full. Expand it. */
1164 src->buffer = x2realloc (src->buffer, &src->allocated);
1169 /* There's space available at the head of the buffer. Nothing to do. */
/* Reads more input from SRC's reader into the free space at the head of SRC's
   buffer.  NOTE(review): parts of the loop, including the handling of the
   read result, are not visible in this view. */
1174 lex_source_read__ (struct lex_source *src)
1181 lex_source_expand__ (src);
/* HEAD_OFS is the buffer index of the first free byte; the reader is also
   passed the prompt style that the segmenter currently asks for. */
1183 head_ofs = src->head - src->tail;
1184 n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1185 src->allocated - head_ofs,
1186 segmenter_get_prompt (&src->segmenter));
1191 Ensure that the input always ends in a new-line followed by a null
1192 byte, as required by the segmenter library. */
1194 if (src->head == src->tail
1195 || src->buffer[src->head - src->tail - 1] != '\n')
1196 src->buffer[src->head++ - src->tail] = '\n';
1198 lex_source_expand__ (src);
1199 src->buffer[src->head++ - src->tail] = '\0';
/* Keep going until the bytes not yet scanned contain a new-line. */
1206 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1207 src->head - src->seg_pos));
/* Returns the source at the head of LEXER's list of sources, or a null
   pointer if the list is empty. */
1210 static struct lex_source *
1211 lex_source__ (const struct lexer *lexer)
1213 return (ll_is_empty (&lexer->sources) ? NULL
1214 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns the bytes of SRC's buffer that run from the start of token N0 to
   the end of token MAX (N0, N1), as a substring that points into the buffer
   rather than a copy. */
1217 static struct substring
1218 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1220 const struct lex_token *token0 = lex_source_next__ (src, n0);
1221 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1222 size_t start = token0->token_pos;
1223 size_t end = token1->token_pos + token1->token_len;
1225 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Copies a prefix of IN into OUT, which has room for OUT_SIZE bytes, as a
   null-terminated string, appending "..." if less than all of IN was
   copied.  OUT_SIZE must be at least 16. */
1229 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1235 assert (out_size >= 16);
/* Reserve one byte for the null terminator, plus 3 bytes for "..." when IN is
   too long to fit whole. */
1236 out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1237 for (out_len = 0; out_len < in.length; out_len += mblen)
/* Check for a line break (LF or CR LF) at this position. */
1239 if (in.string[out_len] == '\n'
1240 || (in.string[out_len] == '\r'
1241 && out_len + 1 < in.length
1242 && in.string[out_len + 1] == '\n'))
/* Advance by whole UTF-8 characters so that none is split. */
1245 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1246 in.length - out_len);
1247 if (out_len + mblen > out_maxlen)
1251 memcpy (out, in.string, out_len);
1252 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Builds a syntax error message for tokens N0 through N1 of SRC, counted from
   the current token, with text from FORMAT and ARGS, and fills in a message
   structure with it and with the file name, line and column range. */
1256 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1257 const char *format, va_list args)
1259 const struct lex_token *token;
/* The wording of the prefix depends on token N0 and on the syntax covered. */
1265 token = lex_source_next__ (src, n0);
1266 if (token->token.type == T_ENDCMD)
1267 ds_put_cstr (&s, _("Syntax error at end of command"));
1270 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1271 if (!ss_is_empty (syntax))
/* Quote the offending syntax, shortened to fit a 64-byte buffer. */
1273 char syntax_cstr[64];
1275 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1276 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1279 ds_put_cstr (&s, _("Syntax error"));
1284 ds_put_cstr (&s, ": ");
1285 ds_put_vformat (&s, format, args);
1287 ds_put_byte (&s, '.');
/* The location runs from the start of token N0 to the end of token N1. */
1289 m.category = MSG_C_SYNTAX;
1290 m.severity = MSG_S_ERROR;
1291 m.file_name = src->reader->file_name;
1292 m.first_line = lex_source_get_first_line_number (src, n0);
1293 m.last_line = lex_source_get_last_line_number (src, n1);
1294 m.first_column = lex_source_get_first_column (src, n0);
1295 m.last_column = lex_source_get_last_column (src, n1);
1296 m.text = ds_steal_cstr (&s);
1300 static void PRINTF_FORMAT (2, 3)
1301 lex_get_error (struct lex_source *src, const char *format, ...)
1306 va_start (args, format);
1308 n = deque_count (&src->deque) - 1;
1309 lex_source_error_valist (src, n, n, format, args);
1310 lex_source_pop_front (src);
/* Obtains one more token for SRC_.  In outline, from what is visible here:

     1. Pushes a new token onto the source and initializes a scanner for it.
     2. Repeatedly takes a segment from the segmenter and feeds it to the
        scanner until the scanner reports a result that ends the token.
     3. Submits the source lines that were consumed as syntax text items.
     4. Records the token's length and commits the scanning state to the
        source.
     5. Reports scanner error tokens through lex_get_error().

   NOTE(review): many short lines of this function (braces, `else', `break',
   short declarations and statements) are not visible in this extract, so the
   comments below describe only the visible statements. */
1316 lex_source_get__ (const struct lex_source *src_)
/* The parameter is const, but this function updates the source. */
1318 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Members of a local `struct state' snapshot of the scanning position.  The
   code below also uses its seg_pos, line_pos, and newlines members. */
1322 struct segmenter segmenter;
1323 enum segment_type last_segment;
1329 struct state state, saved;
1330 enum scan_result result;
1331 struct scanner scanner;
1332 struct lex_token *token;
/* Work on a copy of the source's scanning state; it is written back to the
   source only after the token is complete. */
1339 state.segmenter = src->segmenter;
1341 state.seg_pos = src->seg_pos;
1342 state.line_pos = src->line_pos;
/* Start a new token at the current position. */
1345 token = lex_push_token__ (src);
1346 scanner_init (&scanner, &token->token);
1347 token->line_pos = src->line_pos;
1348 token->token_pos = src->seg_pos;
/* A positive reader line number yields a real line number for the token: the
   reader's line number plus the new-lines consumed so far.  The assignment
   of 0 is presumably the `else' branch for readers without line numbers. */
1349 if (src->reader->line_number > 0)
1350 token->first_line = src->reader->line_number + src->n_newlines;
1352 token->first_line = 0;
1356 enum segment_type type;
1357 const char *segment;
/* Positions are absolute, so subtract SRC->tail to index the buffer.  The
   segmenter may look at everything from seg_pos up to SRC->head. */
1361 segment = &src->buffer[state.seg_pos - src->tail];
1362 seg_maxlen = src->head - state.seg_pos;
1363 seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
/* Reads more input into the source.  The condition guarding this call is not
   visible here; presumably it is taken when the segmenter needs more data. */
1366 lex_source_read__ (src);
1370 state.last_segment = type;
1371 state.seg_pos += seg_len;
/* After a new-line segment, the next line begins at the new position. */
1372 if (type == SEG_NEWLINE)
1375 state.line_pos = state.seg_pos;
/* Hand the segment to the scanner and act on its verdict.  The actions taken
   for SCAN_SAVE, SCAN_BACK, and SCAN_DONE are not visible in this extract. */
1378 result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1380 if (result == SCAN_SAVE)
1382 else if (result == SCAN_BACK)
1387 else if (result == SCAN_DONE)
/* Decide how many source lines to submit as syntax text.  The flag
   suppress_next_newline is set after a token that ended with an
   end-of-command segment and cleared again once a later token spans a
   new-line; the matching adjustments to n_lines are not visible here. */
1391 n_lines = state.newlines;
1392 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1395 src->suppress_next_newline = true;
1397 else if (n_lines > 0 && src->suppress_next_newline)
1400 src->suppress_next_newline = false;
1402 for (i = 0; i < n_lines; i++)
1404 const char *newline;
/* The next line to submit starts at journal_pos.  rawmemchr() has no length
   limit, so this relies on a '\n' being present at or after that position. */
1409 line = &src->buffer[src->journal_pos - src->tail];
1410 newline = rawmemchr (line, '\n');
1411 line_len = newline - line;
/* A carriage return just before the new-line is detected here; the statement
   that acts on it is not visible (presumably it shortens line_len). */
1412 if (line_len > 0 && line[line_len - 1] == '\r')
/* Copy the line into a fresh buffer, terminated by "\n" and a null byte.
   NOTE(review): the result of malloc() is used without a null check; this
   file includes gl/xalloc.h, so xmalloc() may be what is wanted -- confirm. */
1415 syntax = malloc (line_len + 2);
1416 memcpy (syntax, line, line_len);
1417 syntax[line_len] = '\n';
1418 syntax[line_len + 1] = '\0';
/* The text item is created with the "nocopy" constructor, so the buffer is
   handed over rather than duplicated (presumably transferring ownership). */
1420 text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
/* Step past this line and its new-line character. */
1422 src->journal_pos += newline - line + 1;
/* The token covers every byte consumed since scanning started. */
1425 token->token_len = state.seg_pos - src->seg_pos;
/* Commit the working state back to the source. */
1427 src->segmenter = state.segmenter;
1428 src->seg_pos = state.seg_pos;
1429 src->line_pos = state.line_pos;
1430 src->n_newlines += state.newlines;
/* Post-process the token by type.  The SCAN_* types are scanner error
   reports; each is turned into a syntax error message by lex_get_error(). */
1432 switch (token->token.type)
/* Some token type is rewritten as an end-of-command token; the `case' label
   that selects this statement is not visible in this extract. */
1438 token->token.type = T_ENDCMD;
1442 case SCAN_BAD_HEX_LENGTH:
1443 lex_get_error (src, _("String of hex digits has %d characters, which "
1444 "is not a multiple of 2"),
1445 (int) token->token.number);
/* Both cases print the token's number as a character with %c. */
1448 case SCAN_BAD_HEX_DIGIT:
1449 case SCAN_BAD_UNICODE_DIGIT:
1450 lex_get_error (src, _("`%c' is not a valid hex digit"),
1451 (int) token->token.number);
1454 case SCAN_BAD_UNICODE_LENGTH:
1455 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1456 "not in the valid range of 1 to 8 bytes"),
1457 (int) token->token.number);
1460 case SCAN_BAD_UNICODE_CODE_POINT:
1461 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1462 (int) token->token.number);
1465 case SCAN_EXPECTED_QUOTE:
1466 lex_get_error (src, _("Unterminated string constant"));
1469 case SCAN_EXPECTED_EXPONENT:
1470 lex_get_error (src, _("Missing exponent following `%s'"),
1471 token->token.string.string);
1474 case SCAN_UNEXPECTED_DOT:
1475 lex_get_error (src, _("Unexpected `.' in middle of command"));
/* The declaration of c_name, the buffer passed to uc_name(), is not visible
   in this extract. */
1478 case SCAN_UNEXPECTED_CHAR:
1481 lex_get_error (src, _("Bad character %s in input"),
1482 uc_name (token->token.number, c_name));
/* Discards a token without reporting anything; the `case' label that selects
   this statement is not visible in this extract. */
1487 lex_source_pop_front (src);
1495 lex_source_push_endcmd__ (struct lex_source *src)
1497 struct lex_token *token = lex_push_token__ (src);
1498 token->token.type = T_ENDCMD;
1499 token->token_pos = 0;
1500 token->token_len = 0;
1501 token->line_pos = 0;
1502 token->first_line = 0;
/* Creates a new lex_source that draws its input from READER.

   NOTE(review): the final branch of the mode selection and the return
   statement are not visible in this extract. */
1505 static struct lex_source *
1506 lex_source_create (struct lex_reader *reader)
1508 struct lex_source *src;
1509 enum segmenter_mode mode;
/* Allocate the source zero-filled, so only nonzero members need to be set. */
1511 src = xzalloc (sizeof *src);
1512 src->reader = reader;
/* Translate the reader's syntax mode into the matching segmenter mode. */
1514 if (reader->syntax == LEX_SYNTAX_AUTO)
1515 mode = SEG_MODE_AUTO;
1516 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1517 mode = SEG_MODE_INTERACTIVE;
1518 else if (reader->syntax == LEX_SYNTAX_BATCH)
1519 mode = SEG_MODE_BATCH;
1522 segmenter_init (&src->segmenter, mode);
/* The token deque is initialized with a capacity argument of 4. */
1524 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
/* A new source starts out holding a single end-of-command token. */
1526 lex_source_push_endcmd__ (src);
/* Destroys SRC: destroys its reader through the reader class's destroy
   function (if it has one), discards all of its tokens, and unlinks it from
   the list it is on.

   NOTE(review): the statements that release memory are not visible in this
   extract. */
1532 lex_source_destroy (struct lex_source *src)
/* Fetch the file name before the reader is destroyed, since SRC->reader must
   not be dereferenced afterward (presumably the name is freed separately). */
1534 char *file_name = src->reader->file_name;
1535 if (src->reader->class->destroy != NULL)
1536 src->reader->class->destroy (src->reader);
/* Discard every token still in the deque. */
1539 while (!deque_is_empty (&src->deque))
1540 lex_source_pop__ (src);
1542 ll_remove (&src->ll);
/* A lex_reader that obtains its input from a u8_istream.

   NOTE(review): this extract omits at least one member; the code below also
   uses a `file_name' member of this structure. */
1546 struct lex_file_reader
/* Embedded base structure; lex_file_reader_cast() converts a pointer to it
   back into a pointer to the enclosing lex_file_reader. */
1548 struct lex_reader reader;
/* Stream that the input is read from. */
1549 struct u8_istream *istream;
/* Forward declaration: lex_reader_for_file() needs the class's address before
   the class is defined further down. */
1553 static struct lex_reader_class lex_file_reader_class;
1555 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1556 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1557 ENCODING, which should take one of the forms accepted by
1558 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1559 mode of the new reader, respectively.
1561 Returns a null pointer if FILE_NAME cannot be opened. */
1563 lex_reader_for_file (const char *file_name, const char *encoding,
1564 enum lex_syntax_mode syntax,
1565 enum lex_error_mode error)
1567 struct lex_file_reader *r;
1568 struct u8_istream *istream;
/* A FILE_NAME of "-" selects standard input instead of a named file. */
1570 istream = (!strcmp(file_name, "-")
1571 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1572 : u8_istream_for_file (encoding, file_name, O_RDONLY));
/* On failure, report the reason from errno.  (The return that follows is not
   visible in this extract.) */
1573 if (istream == NULL)
1575 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1579 r = xmalloc (sizeof *r);
1580 lex_reader_init (&r->reader, &lex_file_reader_class);
1581 r->reader.syntax = syntax;
1582 r->reader.error = error;
/* The base reader and the file reader each get their own copy of the name. */
1583 r->reader.file_name = xstrdup (file_name);
/* Line numbering starts at 1. */
1584 r->reader.line_number = 1;
1585 r->istream = istream;
1586 r->file_name = xstrdup (file_name);
1591 static struct lex_file_reader *
1592 lex_file_reader_cast (struct lex_reader *r)
1594 return UP_CAST (r, struct lex_file_reader, reader);
/* Read function for file readers: reads up to N bytes from R_'s stream into
   BUF.  PROMPT_STYLE is not used.

   NOTE(review): the error test and the return statements are not visible in
   this extract. */
1598 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1599 enum prompt_style prompt_style UNUSED)
1601 struct lex_file_reader *r = lex_file_reader_cast (r_);
1602 ssize_t n_read = u8_istream_read (r->istream, buf, n);
/* Reports a read failure, with the reason taken from errno. */
1605 msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
/* Close function for file readers: disposes of R_'s stream and of the file
   reader's copy of the file name.

   NOTE(review): the braces and `else' that join these statements, and any
   statement that frees the reader itself, are not visible in this extract. */
1612 lex_file_close (struct lex_reader *r_)
1614 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* Only a stream that is not standard input is closed; a failure to close it
   is reported with the reason from errno. */
1616 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1618 if (u8_istream_close (r->istream) != 0)
1619 msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
/* Presumably the alternative for standard input: release the stream object
   without closing the file descriptor -- confirm against u8_istream_free. */
1622 u8_istream_free (r->istream);
1624 free (r->file_name);
/* The reader class whose address lex_reader_for_file() passes to
   lex_reader_init().  Its initializer is not visible in this extract. */
1628 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that obtains its input from a string held in memory.

   NOTE(review): this extract omits members; the code below also uses `s' (a
   substring) and `offset' members of this structure. */
1634 struct lex_string_reader
/* Embedded base structure; lex_string_reader_cast() converts a pointer to it
   back into a pointer to the enclosing lex_string_reader. */
1636 struct lex_reader reader;
/* Forward declaration: lex_reader_for_substring_nocopy() needs the class's
   address before the class is defined further down. */
1641 static struct lex_reader_class lex_string_reader_class;
1643 /* Creates and returns a new lex_reader for the contents of S, which must be
1644 encoded in UTF-8. The new reader takes ownership of S and will free it
1645 with ss_dealloc() when it is closed. */
1647 lex_reader_for_substring_nocopy (struct substring s)
1649 struct lex_string_reader *r;
1651 r = xmalloc (sizeof *r);
1652 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always use interactive syntax mode.  (The statements that
   store S in the reader and return it are not visible in this extract.) */
1653 r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
1660 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1661 which must be encoded in UTF-8. The caller retains ownership of S. */
1663 lex_reader_for_string (const char *s)
1665 struct substring ss;
1666 ss_alloc_substring (&ss, ss_cstr (s));
1667 return lex_reader_for_substring_nocopy (ss);
/* Creates and returns a new lex_reader whose input is the text produced by
   formatting the printf()-like format string FORMAT with the arguments that
   follow it. */
struct lex_reader *
lex_reader_for_format (const char *format, ...)
{
  struct lex_reader *reader;
  va_list ap;
  char *text;

  va_start (ap, format);
  text = xvasprintf (format, ap);
  va_end (ap);

  /* The formatted text is handed to the reader without copying it again. */
  reader = lex_reader_for_substring_nocopy (ss_cstr (text));
  return reader;
}
1685 static struct lex_string_reader *
1686 lex_string_reader_cast (struct lex_reader *r)
1688 return UP_CAST (r, struct lex_string_reader, reader);
/* Read function for string readers: copies the next part of R_'s string into
   BUF.  PROMPT_STYLE is not used.

   NOTE(review): the declaration of `chunk', the update of the reader's
   offset, and the return statement are not visible in this extract. */
1692 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1693 enum prompt_style prompt_style UNUSED)
1695 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Copy at most N bytes, and never more than remain past the current
   offset. */
1698 chunk = MIN (n, r->s.length - r->offset);
1699 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string readers.

   NOTE(review): only the conversion of R_ to a lex_string_reader is visible
   in this extract; the statements that release the reader's resources are
   not. */
1706 lex_string_close (struct lex_reader *r_)
1708 struct lex_string_reader *r = lex_string_reader_cast (r_);
1714 static struct lex_reader_class lex_string_reader_class =