1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "data/file-name.h"
34 #include "language/command.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/text-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* Location of token in terms of the lex_source's buffer.
66 src->tail <= line_pos <= token_pos <= src->head. */
67 size_t token_pos; /* Start of token. */
68 size_t token_len; /* Length of source for token in bytes. */
69 size_t line_pos; /* Start of line containing token_pos. */
70 int first_line; /* Line number at token_pos. */
73 /* A source of tokens, corresponding to a syntax file.
75 This is conceptually a lex_reader wrapped with everything needed to convert
76 its UTF-8 bytes into tokens. */
79 struct ll ll; /* In lexer's list of sources. */
80 struct lex_reader *reader;
81 struct segmenter segmenter;
82 bool eof; /* True if T_STOP was read from 'reader'. */
84 /* Buffer of UTF-8 bytes. */
86 size_t allocated; /* Number of bytes allocated. */
87 size_t tail; /* &buffer[0] offset into UTF-8 source. */
88 size_t head; /* &buffer[head - tail] offset into source. */
90 /* Positions in source file, tail <= pos <= head for each member here. */
91 size_t journal_pos; /* First byte not yet output to journal. */
92 size_t seg_pos; /* First byte not yet scanned as token. */
93 size_t line_pos; /* First byte of line containing seg_pos. */
95 int n_newlines; /* Number of new-lines up to seg_pos. */
96 bool suppress_next_newline;
99 struct deque deque; /* Indexes into 'tokens'. */
100 struct lex_token *tokens; /* Lookahead tokens for parser. */
103 static struct lex_source *lex_source_create (struct lex_reader *);
104 static void lex_source_destroy (struct lex_source *);
109 struct ll_list sources; /* Contains "struct lex_source"s. */
112 static struct lex_source *lex_source__ (const struct lexer *);
113 static const struct lex_token *lex_next__ (const struct lexer *, int n);
114 static void lex_source_push_endcmd__ (struct lex_source *);
116 static void lex_source_pop__ (struct lex_source *);
117 static bool lex_source_get__ (const struct lex_source *);
118 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
119 const char *format, va_list)
120 PRINTF_FORMAT (4, 0);
121 static const struct lex_token *lex_source_next__ (const struct lex_source *,
124 /* Initializes READER with the specified CLASS and otherwise some reasonable
125 defaults. The caller should fill in the others members as desired. */
127 lex_reader_init (struct lex_reader *reader,
128 const struct lex_reader_class *class)
130 reader->class = class;
131 reader->syntax = LEX_SYNTAX_AUTO;
132 reader->error = LEX_ERROR_INTERACTIVE;
133 reader->file_name = NULL;
134 reader->line_number = 0;
137 /* Frees any file name already in READER and replaces it by a copy of
138 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
142 free (reader->file_name);
143 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
146 /* Creates and returns a new lexer. */
150 struct lexer *lexer = xzalloc (sizeof *lexer);
151 ll_init (&lexer->sources);
155 /* Destroys LEXER. */
157 lex_destroy (struct lexer *lexer)
161 struct lex_source *source, *next;
163 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
164 lex_source_destroy (source);
169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
170 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
173 lex_include (struct lexer *lexer, struct lex_reader *reader)
175 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
176 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
179 /* Appends READER to LEXER, so that it will be read after all other current
180 readers have already been read. */
182 lex_append (struct lexer *lexer, struct lex_reader *reader)
184 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
189 static struct lex_token *
190 lex_push_token__ (struct lex_source *src)
192 struct lex_token *token;
194 if (deque_is_full (&src->deque))
195 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
197 token = &src->tokens[deque_push_front (&src->deque)];
198 token_init (&token->token);
203 lex_source_pop__ (struct lex_source *src)
205 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
209 lex_source_pop_front (struct lex_source *src)
211 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
214 /* Advances LEXER to the next token, consuming the current token. */
216 lex_get (struct lexer *lexer)
218 struct lex_source *src;
220 src = lex_source__ (lexer);
224 if (!deque_is_empty (&src->deque))
225 lex_source_pop__ (src);
227 while (deque_is_empty (&src->deque))
228 if (!lex_source_get__ (src))
230 lex_source_destroy (src);
231 src = lex_source__ (lexer);
237 /* Issuing errors. */
239 /* Prints a syntax error message containing the current token and
240 given message MESSAGE (if non-null). */
242 lex_error (struct lexer *lexer, const char *format, ...)
246 va_start (args, format);
247 lex_next_error_valist (lexer, 0, 0, format, args);
/* Reports a syntax error at LEXER's current token.  FORMAT and ARGS give an
   optional printf-style message; both are forwarded unchanged to
   lex_next_error_valist() with the token range 0 through 0. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  const int current = 0;        /* Offset of the current token. */

  lex_next_error_valist (lexer, current, current, format, args);
}
/* Prints a syntax error message for the tokens N0 through N1 (both counted
   ahead of the current token), followed by the given message FORMAT (if
   non-null). */
262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
266 va_start (args, format);
267 lex_next_error_valist (lexer, n0, n1, format, args);
271 /* Prints a syntax error message saying that OPTION0 or one of the other
272 strings following it, up to the first NULL, is expected. */
274 lex_error_expecting (struct lexer *lexer, const char *option0, ...)
276 enum { MAX_OPTIONS = 8 };
277 const char *options[MAX_OPTIONS + 1];
281 va_start (args, option0);
282 options[0] = option0;
284 while (n + 1 < MAX_OPTIONS && options[n] != NULL)
285 options[++n] = va_arg (args, const char *);
291 lex_error (lexer, NULL);
295 lex_error (lexer, _("expecting %s"), options[0]);
299 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
303 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
308 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
309 options[0], options[1], options[2], options[3]);
313 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
314 options[0], options[1], options[2], options[3], options[4]);
318 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
319 options[0], options[1], options[2], options[3], options[4],
324 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
325 options[0], options[1], options[2], options[3], options[4],
326 options[5], options[6]);
330 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
331 options[0], options[1], options[2], options[3], options[4],
332 options[5], options[6], options[7]);
340 /* Reports an error to the effect that subcommand SBC may only be specified
343 This function does not take a lexer as an argument or use lex_error(),
344 because the result would ordinarily just be redundant: "Syntax error at
345 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
346 not help the user find the error. */
348 lex_sbc_only_once (const char *sbc)
350 msg (SE, _("Subcommand %s may only be specified once."), sbc);
353 /* Reports an error to the effect that subcommand SBC is missing.
355 This function does not take a lexer as an argument or use lex_error(),
356 because a missing subcommand can normally be detected only after the whole
357 command has been parsed, and so lex_error() would always report "Syntax
358 error at end of command", which does not help the user find the error. */
360 lex_sbc_missing (const char *sbc)
362 msg (SE, _("Required subcommand %s was not specified."), sbc);
365 /* Reports an error to the effect that specification SPEC may only be specified
366 once within subcommand SBC. */
368 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
370 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
374 /* Reports an error to the effect that specification SPEC is missing within
377 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
379 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
/* Prints a syntax error message for the tokens N0 through N1 (both counted
   ahead of the current token), followed by the given message FORMAT with
   arguments ARGS (if FORMAT is non-null). */
386 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
387 const char *format, va_list args)
389 struct lex_source *src = lex_source__ (lexer);
392 lex_source_error_valist (src, n0, n1, format, args);
398 ds_put_format (&s, _("Syntax error at end of input"));
401 ds_put_cstr (&s, ": ");
402 ds_put_vformat (&s, format, args);
404 ds_put_byte (&s, '.');
405 msg (SE, "%s", ds_cstr (&s));
410 /* Checks that we're at end of command.
411 If so, returns a successful command completion code.
412 If not, flags a syntax error and returns an error command
415 lex_end_of_command (struct lexer *lexer)
417 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
419 lex_error (lexer, _("expecting end of command"));
426 /* Token testing functions. */
/* Reports whether LEXER's current token is a number, by asking
   lex_next_is_number() about the token at offset 0. */
bool
lex_is_number (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_number (lexer, current);
}
/* Reports whether LEXER's current token is a string, by asking
   lex_next_is_string() about the token at offset 0. */
bool
lex_is_string (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_string (lexer, current);
}
442 /* Returns the value of the current token, which must be a
443 floating point number. */
445 lex_number (struct lexer *lexer)
447 return lex_next_number (lexer, 0);
/* Reports whether LEXER's current token is an integer, by asking
   lex_next_is_integer() about the token at offset 0. */
bool
lex_is_integer (struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_integer (lexer, current);
}
457 /* Returns the value of the current token, which must be an
460 lex_integer (struct lexer *lexer)
462 return lex_next_integer (lexer, 0);
465 /* Token testing functions with lookahead.
467 A value of 0 for N as an argument to any of these functions refers to the
468 current token. Lookahead is limited to the current command. Any N greater
469 than the number of tokens remaining in the current command will be treated
470 as referring to a T_ENDCMD token. */
472 /* Returns true if the token N ahead of the current token is a number. */
474 lex_next_is_number (struct lexer *lexer, int n)
476 enum token_type next_token = lex_next_token (lexer, n);
477 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
480 /* Returns true if the token N ahead of the current token is a string. */
482 lex_next_is_string (struct lexer *lexer, int n)
484 return lex_next_token (lexer, n) == T_STRING;
487 /* Returns the value of the token N ahead of the current token, which must be a
488 floating point number. */
490 lex_next_number (struct lexer *lexer, int n)
492 assert (lex_next_is_number (lexer, n));
493 return lex_next_tokval (lexer, n);
496 /* Returns true if the token N ahead of the current token is an integer. */
498 lex_next_is_integer (struct lexer *lexer, int n)
502 if (!lex_next_is_number (lexer, n))
505 value = lex_next_tokval (lexer, n);
506 return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
509 /* Returns the value of the token N ahead of the current token, which must be
512 lex_next_integer (struct lexer *lexer, int n)
514 assert (lex_next_is_integer (lexer, n));
515 return lex_next_tokval (lexer, n);
518 /* Token matching functions. */
520 /* If the current token has the specified TYPE, skips it and returns true.
521 Otherwise, returns false. */
523 lex_match (struct lexer *lexer, enum token_type type)
525 if (lex_token (lexer) == type)
/* Tries to match the current token against IDENTIFIER, allowing IDENTIFIER to
   be abbreviated to its first three letters.  The work is delegated to
   lex_match_id_n(), whose result is returned: true if the token matched and
   was skipped, false otherwise.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  enum { MIN_ABBREV_LEN = 3 };

  return lex_match_id_n (lexer, identifier, MIN_ABBREV_LEN);
}
545 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
546 may be abbreviated to its first N letters. Otherwise, returns false.
548 IDENTIFIER must be an ASCII string. */
550 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
552 if (lex_token (lexer) == T_ID
553 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
562 /* If the current token is integer X, skips it and returns true. Otherwise,
565 lex_match_int (struct lexer *lexer, int x)
567 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
576 /* Forced matches. */
578 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
579 abbreviated to its first 3 letters. Otherwise, reports an error and returns
582 IDENTIFIER must be an ASCII string. */
584 lex_force_match_id (struct lexer *lexer, const char *identifier)
586 if (lex_match_id (lexer, identifier))
590 lex_error_expecting (lexer, identifier, NULL_SENTINEL);
595 /* If the current token has the specified TYPE, skips it and returns true.
596 Otherwise, reports an error and returns false. */
598 lex_force_match (struct lexer *lexer, enum token_type type)
600 if (lex_token (lexer) == type)
607 char *s = xasprintf ("`%s'", token_type_to_string (type));
608 lex_error_expecting (lexer, s, NULL_SENTINEL);
614 /* If the current token is a string, does nothing and returns true.
615 Otherwise, reports an error and returns false. */
617 lex_force_string (struct lexer *lexer)
619 if (lex_is_string (lexer))
623 lex_error (lexer, _("expecting string"));
628 /* If the current token is a string or an identifier, does nothing and returns
629 true. Otherwise, reports an error and returns false.
631 This is meant for use in syntactic situations where we want to encourage the
632 user to supply a quoted string, but for compatibility we also accept
633 identifiers. (One example of such a situation is file names.) Therefore,
634 the error message issued when the current token is wrong only says that a
635 string is expected and doesn't mention that an identifier would also be
638 lex_force_string_or_id (struct lexer *lexer)
640 return lex_is_integer (lexer) || lex_force_string (lexer);
643 /* If the current token is an integer, does nothing and returns true.
644 Otherwise, reports an error and returns false. */
646 lex_force_int (struct lexer *lexer)
648 if (lex_is_integer (lexer))
652 lex_error (lexer, _("expecting integer"));
657 /* If the current token is a number, does nothing and returns true.
658 Otherwise, reports an error and returns false. */
660 lex_force_num (struct lexer *lexer)
662 if (lex_is_number (lexer))
665 lex_error (lexer, _("expecting number"));
669 /* If the current token is an identifier, does nothing and returns true.
670 Otherwise, reports an error and returns false. */
672 lex_force_id (struct lexer *lexer)
674 if (lex_token (lexer) == T_ID)
677 lex_error (lexer, _("expecting identifier"));
681 /* Token accessors. */
683 /* Returns the type of LEXER's current token. */
685 lex_token (const struct lexer *lexer)
687 return lex_next_token (lexer, 0);
690 /* Returns the number in LEXER's current token.
692 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
693 tokens this function will always return zero. */
695 lex_tokval (const struct lexer *lexer)
697 return lex_next_tokval (lexer, 0);
700 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.
705 The UTF-8 encoding of the returned string is correct for variable names and
706 other identifiers. Use filename_to_utf8() to use it as a filename. Use
707 data_in() to use it in a "union value". */
709 lex_tokcstr (const struct lexer *lexer)
711 return lex_next_tokcstr (lexer, 0);
714 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
715 null-terminated (but the null terminator is not included in the returned
716 substring's 'length').
   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.
721 The UTF-8 encoding of the returned string is correct for variable names and
722 other identifiers. Use filename_to_utf8() to use it as a filename. Use
723 data_in() to use it in a "union value". */
725 lex_tokss (const struct lexer *lexer)
727 return lex_next_tokss (lexer, 0);
732 A value of 0 for N as an argument to any of these functions refers to the
733 current token. Lookahead is limited to the current command. Any N greater
734 than the number of tokens remaining in the current command will be treated
735 as referring to a T_ENDCMD token. */
737 static const struct lex_token *
738 lex_next__ (const struct lexer *lexer_, int n)
740 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
741 struct lex_source *src = lex_source__ (lexer);
744 return lex_source_next__ (src, n);
747 static const struct lex_token stop_token =
748 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
754 static const struct lex_token *
755 lex_source_next__ (const struct lex_source *src, int n)
757 while (deque_count (&src->deque) <= n)
759 if (!deque_is_empty (&src->deque))
761 struct lex_token *front;
763 front = &src->tokens[deque_front (&src->deque, 0)];
764 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
768 lex_source_get__ (src);
771 return &src->tokens[deque_back (&src->deque, n)];
774 /* Returns the "struct token" of the token N after the current one in LEXER.
775 The returned pointer can be invalidated by pretty much any succeeding call
776 into the lexer, although the string pointer within the returned token is
777 only invalidated by consuming the token (e.g. with lex_get()). */
779 lex_next (const struct lexer *lexer, int n)
781 return &lex_next__ (lexer, n)->token;
784 /* Returns the type of the token N after the current one in LEXER. */
786 lex_next_token (const struct lexer *lexer, int n)
788 return lex_next (lexer, n)->type;
/* Returns the number in the token N after the current one in LEXER.
793 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
794 tokens this function will always return zero. */
796 lex_next_tokval (const struct lexer *lexer, int n)
798 const struct token *token = lex_next (lexer, n);
799 return token->number;
802 /* Returns the null-terminated string in the token N after the current one, in
   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.
808 The UTF-8 encoding of the returned string is correct for variable names and
809 other identifiers. Use filename_to_utf8() to use it as a filename. Use
810 data_in() to use it in a "union value". */
812 lex_next_tokcstr (const struct lexer *lexer, int n)
814 return lex_next_tokss (lexer, n).string;
817 /* Returns the string in the token N after the current one, in UTF-8 encoding.
818 The string is null-terminated (but the null terminator is not included in
819 the returned substring's 'length').
   Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
   this function will always return NULL.
824 The UTF-8 encoding of the returned string is correct for variable names and
825 other identifiers. Use filename_to_utf8() to use it as a filename. Use
826 data_in() to use it in a "union value". */
828 lex_next_tokss (const struct lexer *lexer, int n)
830 return lex_next (lexer, n)->string;
833 /* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
834 true. Otherwise, returns false.
836 S may consist of an arbitrary number of identifiers, integers, and
837 punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
838 Identifiers may be abbreviated to their first three letters. Currently only
839 hyphens, slashes, and equals signs are supported as punctuation (but it
840 would be easy to add more).
842 S must be an ASCII string. */
844 lex_match_phrase (struct lexer *lexer, const char *s)
848 for (tok_idx = 0; ; tok_idx++)
850 enum token_type token;
853 while (c_isspace (*s))
861 for (i = 0; i < tok_idx; i++)
866 token = lex_next_token (lexer, tok_idx);
876 if (token != T_SLASH)
882 if (token != T_EQUALS)
887 case '0': case '1': case '2': case '3': case '4':
888 case '5': case '6': case '7': case '8': case '9':
892 if (token != T_POS_NUM)
898 value = value * 10 + (*s++ - '0');
900 while (c_isdigit (*s));
902 if (lex_next_tokval (lexer, tok_idx) != value)
915 len = lex_id_get_length (ss_cstr (s));
916 if (!lex_id_match (ss_buffer (s, len),
917 lex_next_tokss (lexer, tok_idx)))
929 lex_source_get_first_line_number (const struct lex_source *src, int n)
931 return lex_source_next__ (src, n)->first_line;
935 count_newlines (char *s, size_t length)
940 while ((newline = memchr (s, '\n', length)) != NULL)
943 length -= (newline + 1) - s;
951 lex_source_get_last_line_number (const struct lex_source *src, int n)
953 const struct lex_token *token = lex_source_next__ (src, n);
955 if (token->first_line == 0)
959 char *token_str = &src->buffer[token->token_pos - src->tail];
960 return token->first_line + count_newlines (token_str, token->token_len) + 1;
965 count_columns (const char *s_, size_t length)
967 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
973 for (ofs = 0; ofs < length; ofs += mblen)
977 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
980 int width = uc_width (uc, "UTF-8");
985 columns = ROUND_UP (columns + 1, 8);
992 lex_source_get_first_column (const struct lex_source *src, int n)
994 const struct lex_token *token = lex_source_next__ (src, n);
995 return count_columns (&src->buffer[token->line_pos - src->tail],
996 token->token_pos - token->line_pos);
1000 lex_source_get_last_column (const struct lex_source *src, int n)
1002 const struct lex_token *token = lex_source_next__ (src, n);
1003 char *start, *end, *newline;
1005 start = &src->buffer[token->line_pos - src->tail];
1006 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1007 newline = memrchr (start, '\n', end - start);
1008 if (newline != NULL)
1009 start = newline + 1;
1010 return count_columns (start, end - start);
1013 /* Returns the 1-based line number of the start of the syntax that represents
1014 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1015 if the token is drawn from a source that does not have line numbers. */
1017 lex_get_first_line_number (const struct lexer *lexer, int n)
1019 const struct lex_source *src = lex_source__ (lexer);
1020 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1023 /* Returns the 1-based line number of the end of the syntax that represents the
1024 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1025 token or if the token is drawn from a source that does not have line
1028 Most of the time, a single token is wholly within a single line of syntax,
1029 but there are two exceptions: a T_STRING token can be made up of multiple
1030 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1031 token can consist of a "-" on one line followed by the number on the next.
1034 lex_get_last_line_number (const struct lexer *lexer, int n)
1036 const struct lex_source *src = lex_source__ (lexer);
1037 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1040 /* Returns the 1-based column number of the start of the syntax that represents
1041 the token N after the current one in LEXER. Returns 0 for a T_STOP
1044 Column numbers are measured according to the width of characters as shown in
1045 a typical fixed-width font, in which CJK characters have width 2 and
1046 combining characters have width 0. */
1048 lex_get_first_column (const struct lexer *lexer, int n)
1050 const struct lex_source *src = lex_source__ (lexer);
1051 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1054 /* Returns the 1-based column number of the end of the syntax that represents
1055 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1058 Column numbers are measured according to the width of characters as shown in
1059 a typical fixed-width font, in which CJK characters have width 2 and
1060 combining characters have width 0. */
1062 lex_get_last_column (const struct lexer *lexer, int n)
1064 const struct lex_source *src = lex_source__ (lexer);
1065 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1068 /* Returns the name of the syntax file from which the current command is drawn.
1069 Returns NULL for a T_STOP token or if the command's source does not have
1072 There is no version of this function that takes an N argument because
1073 lookahead only works to the end of a command and any given command is always
1074 within a single syntax file. */
1076 lex_get_file_name (const struct lexer *lexer)
1078 struct lex_source *src = lex_source__ (lexer);
1079 return src == NULL ? NULL : src->reader->file_name;
1082 /* Returns the syntax mode for the syntax file from which the current drawn is
1083 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1084 source does not have line numbers.
1086 There is no version of this function that takes an N argument because
1087 lookahead only works to the end of a command and any given command is always
1088 within a single syntax file. */
1089 enum lex_syntax_mode
1090 lex_get_syntax_mode (const struct lexer *lexer)
1092 struct lex_source *src = lex_source__ (lexer);
1093 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
/* Returns the error mode for the syntax file from which the current command is
   drawn.  Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if LEXER has no
   source.
1100 There is no version of this function that takes an N argument because
1101 lookahead only works to the end of a command and any given command is always
1102 within a single syntax file. */
1104 lex_get_error_mode (const struct lexer *lexer)
1106 struct lex_source *src = lex_source__ (lexer);
1107 return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
1110 /* If the source that LEXER is currently reading has error mode
1111 LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
1112 next token to be read comes directly from whatever is next read from the
1115 It makes sense to call this function after encountering an error in a
1116 command entered on the console, because usually the user would prefer not to
1117 have cascading errors. */
1119 lex_interactive_reset (struct lexer *lexer)
1121 struct lex_source *src = lex_source__ (lexer);
1122 if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
1124 src->head = src->tail = 0;
1125 src->journal_pos = src->seg_pos = src->line_pos = 0;
1126 src->n_newlines = 0;
1127 src->suppress_next_newline = false;
1128 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1129 while (!deque_is_empty (&src->deque))
1130 lex_source_pop__ (src);
1131 lex_source_push_endcmd__ (src);
1135 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1137 lex_discard_rest_of_command (struct lexer *lexer)
1139 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1143 /* Discards all lookahead tokens in LEXER, then discards all input sources
1144 until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
1145 runs out of input sources. */
1147 lex_discard_noninteractive (struct lexer *lexer)
1149 struct lex_source *src = lex_source__ (lexer);
1153 while (!deque_is_empty (&src->deque))
1154 lex_source_pop__ (src);
1156 for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
1157 src = lex_source__ (lexer))
1158 lex_source_destroy (src);
1163 lex_source_max_tail__ (const struct lex_source *src)
1165 const struct lex_token *token;
1168 assert (src->seg_pos >= src->line_pos);
1169 max_tail = MIN (src->journal_pos, src->line_pos);
1171 /* Use the oldest token also. (We know that src->deque cannot be empty
1172 because we are in the process of adding a new token, which is already
1173 initialized enough to use here.) */
1174 token = &src->tokens[deque_back (&src->deque, 0)];
1175 assert (token->token_pos >= token->line_pos);
1176 max_tail = MIN (max_tail, token->line_pos);
1182 lex_source_expand__ (struct lex_source *src)
1184 if (src->head - src->tail >= src->allocated)
1186 size_t max_tail = lex_source_max_tail__ (src);
1187 if (max_tail > src->tail)
1189 /* Advance the tail, freeing up room at the head. */
1190 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1191 src->head - max_tail);
1192 src->tail = max_tail;
1196 /* Buffer is completely full. Expand it. */
1197 src->buffer = x2realloc (src->buffer, &src->allocated);
1202 /* There's space available at the head of the buffer. Nothing to do. */
1207 lex_source_read__ (struct lex_source *src)
1215 lex_source_expand__ (src);
1217 head_ofs = src->head - src->tail;
1218 space = src->allocated - head_ofs;
1219 n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1221 segmenter_get_prompt (&src->segmenter));
1222 assert (n <= space);
1228 Ensure that the input always ends in a new-line followed by a null
1229 byte, as required by the segmenter library. */
1231 if (src->head == src->tail
1232 || src->buffer[src->head - src->tail - 1] != '\n')
1233 src->buffer[src->head++ - src->tail] = '\n';
1235 lex_source_expand__ (src);
1236 src->buffer[src->head++ - src->tail] = '\0';
1243 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1244 src->head - src->seg_pos));
1247 static struct lex_source *
1248 lex_source__ (const struct lexer *lexer)
1250 return (ll_is_empty (&lexer->sources) ? NULL
1251 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1254 static struct substring
1255 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1257 const struct lex_token *token0 = lex_source_next__ (src, n0);
1258 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1259 size_t start = token0->token_pos;
1260 size_t end = token1->token_pos + token1->token_len;
1262 return ss_buffer (&src->buffer[start - src->tail], end - start);
1266 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1272 assert (out_size >= 16);
1273 out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1274 for (out_len = 0; out_len < in.length; out_len += mblen)
1276 if (in.string[out_len] == '\n'
1277 || (in.string[out_len] == '\r'
1278 && out_len + 1 < in.length
1279 && in.string[out_len + 1] == '\n'))
1282 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1283 in.length - out_len);
1284 if (out_len + mblen > out_maxlen)
1288 memcpy (out, in.string, out_len);
1289 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Composes a syntax error message about tokens N0 through N1 of SRC and
   fills in message M with its category, severity, location and text.  The
   text starts with a description of where the error is.  FORMAT and ARGS
   are printf-style and supply the detail that is appended after it. */
1293 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1294 const char *format, va_list args)
1296 const struct lex_token *token;
/* For an end-of-command token, use a fixed description instead of quoting
   any syntax. */
1302 token = lex_source_next__ (src, n0);
1303 if (token->token.type == T_ENDCMD)
1304 ds_put_cstr (&s, _("Syntax error at end of command"));
/* Otherwise quote the syntax of the tokens, unless it is empty.  The quote
   is abbreviated by lex_ellipsize__() to fit a 64-byte buffer. */
1307 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1308 if (!ss_is_empty (syntax))
1310 char syntax_cstr[64];
1312 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1313 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1316 ds_put_cstr (&s, _("Syntax error"));
/* NOTE(review): a condition that guards the next two calls may sit on a line
   that is not visible here. */
1321 ds_put_cstr (&s, ": ");
1322 ds_put_vformat (&s, format, args);
1324 ds_put_byte (&s, '.');
/* The location runs from the start of token N0 to the end of token N1. */
1326 m.category = MSG_C_SYNTAX;
1327 m.severity = MSG_S_ERROR;
1328 m.file_name = src->reader->file_name;
1329 m.first_line = lex_source_get_first_line_number (src, n0);
1330 m.last_line = lex_source_get_last_line_number (src, n1);
1331 m.first_column = lex_source_get_first_column (src, n0);
1332 m.last_column = lex_source_get_last_column (src, n1);
/* The message text is the string accumulated in S. */
1333 m.text = ds_steal_cstr (&s);
/* Passes FORMAT and its printf-style arguments to lex_source_error_valist()
   for the token at index deque_count - 1 in SRC, and then calls
   lex_source_pop_front() on SRC. */
1337 static void PRINTF_FORMAT (2, 3)
1338 lex_get_error (struct lex_source *src, const char *format, ...)
1343 va_start (args, format);
/* N is used as both the first and the last token of the error. */
1345 n = deque_count (&src->deque) - 1;
1346 lex_source_error_valist (src, n, n, format, args);
1347 lex_source_pop_front (src);
/* Scans one more token from SRC.  A new token is obtained from
   lex_push_token__(); then segments are taken from the buffer with
   segmenter_push() and fed to scanner_push() until the scanner reports
   SCAN_DONE or SCAN_BACK.  After that, completed lines are submitted as
   syntax text items, the position of SRC is advanced, and token types that
   stand for scanner errors are turned into messages with lex_get_error().

   SRC_ is const in its type only; it is modified through a cast. */
1353 lex_source_get__ (const struct lex_source *src_)
1355 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
/* Members of the local scanning state.  The state is kept in a struct so
   that it can be saved and restored as a unit. */
1359 struct segmenter segmenter;
1360 enum segment_type last_segment;
1366 struct state state, saved;
1367 enum scan_result result;
1368 struct scanner scanner;
1369 struct lex_token *token;
/* Scan on a copy of the position of SRC.  The copy is written back to SRC
   further down, once the token is complete. */
1376 state.segmenter = src->segmenter;
1378 state.seg_pos = src->seg_pos;
1379 state.line_pos = src->line_pos;
/* Start a new token at the current segmenting position. */
1382 token = lex_push_token__ (src);
1383 scanner_init (&scanner, &token->token);
1384 token->line_pos = src->line_pos;
1385 token->token_pos = src->seg_pos;
/* The line number of the token is the line number of the reader plus the
   new-lines consumed so far, or 0 when the reader line number is not
   positive. */
1386 if (src->reader->line_number > 0)
1387 token->first_line = src->reader->line_number + src->n_newlines;
1389 token->first_line = 0;
1393 enum segment_type type;
1394 const char *segment;
/* Try to take the next segment from the input buffered so far. */
1398 segment = &src->buffer[state.seg_pos - src->tail];
1399 seg_maxlen = src->head - state.seg_pos;
1400 seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
/* NOTE(review): the test that leads to this call is not visible here;
   presumably it is reached when segmenter_push() could not produce a segment
   from the data buffered so far. */
1403 lex_source_read__ (src);
1407 state.last_segment = type;
1408 state.seg_pos += seg_len;
/* A new-line segment starts a new line just past itself. */
1409 if (type == SEG_NEWLINE)
1412 state.line_pos = state.seg_pos;
1415 result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1417 if (result == SCAN_SAVE)
1419 else if (result == SCAN_BACK)
1424 else if (result == SCAN_DONE)
/* Work out how many lines to submit as syntax text.  The count starts from
   the new-lines seen while scanning this token; the two branches below also
   toggle suppress_next_newline (the adjustments made to N_LINES there are on
   lines that are not visible here). */
1428 n_lines = state.newlines;
1429 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1432 src->suppress_next_newline = true;
1434 else if (n_lines > 0 && src->suppress_next_newline)
1437 src->suppress_next_newline = false;
/* Submit each line, starting at journal_pos, as a syntax text item.  The
   copy drops a carriage return that precedes the new-line and always ends
   in a single new-line. */
1439 for (i = 0; i < n_lines; i++)
1441 const char *newline;
1446 line = &src->buffer[src->journal_pos - src->tail];
1447 newline = rawmemchr (line, '\n');
1448 line_len = newline - line;
1449 if (line_len > 0 && line[line_len - 1] == '\r')
/* NOTE(review): the result of malloc() is used without a null check.  Other
   allocations in this file go through xmalloc(), which would be the
   consistent choice here as well. */
1452 syntax = malloc (line_len + 2);
1453 memcpy (syntax, line, line_len);
1454 syntax[line_len] = '\n';
1455 syntax[line_len + 1] = '\0';
1457 text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
/* Move past the line and its new-line. */
1459 src->journal_pos += newline - line + 1;
/* The token covers everything that was segmented for it. */
1462 token->token_len = state.seg_pos - src->seg_pos;
/* Commit the scanning state to SRC. */
1464 src->segmenter = state.segmenter;
1465 src->seg_pos = state.seg_pos;
1466 src->line_pos = state.line_pos;
1467 src->n_newlines += state.newlines;
/* The scanner reports problems through special token types.  Each of the
   SCAN_* cases below turns one of them into an error message. */
1469 switch (token->token.type)
1475 token->token.type = T_ENDCMD;
1479 case SCAN_BAD_HEX_LENGTH:
1480 lex_get_error (src, _("String of hex digits has %d characters, which "
1481 "is not a multiple of 2"),
1482 (int) token->token.number);
1485 case SCAN_BAD_HEX_DIGIT:
1486 case SCAN_BAD_UNICODE_DIGIT:
1487 lex_get_error (src, _("`%c' is not a valid hex digit"),
1488 (int) token->token.number);
1491 case SCAN_BAD_UNICODE_LENGTH:
1492 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1493 "not in the valid range of 1 to 8 bytes"),
1494 (int) token->token.number);
1497 case SCAN_BAD_UNICODE_CODE_POINT:
1498 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1499 (int) token->token.number);
1502 case SCAN_EXPECTED_QUOTE:
1503 lex_get_error (src, _("Unterminated string constant"));
1506 case SCAN_EXPECTED_EXPONENT:
1507 lex_get_error (src, _("Missing exponent following `%s'"),
1508 token->token.string.string);
1511 case SCAN_UNEXPECTED_DOT:
1512 lex_get_error (src, _("Unexpected `.' in middle of command"));
1515 case SCAN_UNEXPECTED_CHAR:
1518 lex_get_error (src, _("Bad character %s in input"),
1519 uc_name (token->token.number, c_name));
/* NOTE(review): the case label that this call belongs to is not visible
   here. */
1524 lex_source_pop_front (src);
1532 lex_source_push_endcmd__ (struct lex_source *src)
1534 struct lex_token *token = lex_push_token__ (src);
1535 token->token.type = T_ENDCMD;
1536 token->token_pos = 0;
1537 token->token_len = 0;
1538 token->line_pos = 0;
1539 token->first_line = 0;
/* Allocates and initializes a lex_source that takes its input from
   READER. */
1542 static struct lex_source *
1543 lex_source_create (struct lex_reader *reader)
1545 struct lex_source *src;
1546 enum segmenter_mode mode;
/* Members that are not assigned below keep whatever xzalloc() left in them
   (presumably zeros -- TODO confirm against gnulib xalloc). */
1548 src = xzalloc (sizeof *src);
1549 src->reader = reader;
/* Translate the syntax mode of the reader into the matching segmenter
   mode. */
1551 if (reader->syntax == LEX_SYNTAX_AUTO)
1552 mode = SEG_MODE_AUTO;
1553 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1554 mode = SEG_MODE_INTERACTIVE;
1555 else if (reader->syntax == LEX_SYNTAX_BATCH)
1556 mode = SEG_MODE_BATCH;
1559 segmenter_init (&src->segmenter, mode);
/* The 4 is presumably the initial capacity of the token deque -- TODO
   confirm against deque_init(). */
1561 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
/* Every new source starts out with one T_ENDCMD token. */
1563 lex_source_push_endcmd__ (src);
/* Tears down SRC: calls the destroy function of its reader class, if there
   is one, pops every token that is still in its deque, and unlinks SRC from
   the list that it is on. */
1569 lex_source_destroy (struct lex_source *src)
/* Fetch the file name first, because the reader is destroyed just below.
   NOTE(review): the use of FILE_NAME is on a line that is not visible
   here. */
1571 char *file_name = src->reader->file_name;
1572 if (src->reader->class->destroy != NULL)
1573 src->reader->class->destroy (src->reader);
1576 while (!deque_is_empty (&src->deque))
1577 lex_source_pop__ (src);
1579 ll_remove (&src->ll);
/* A lex_reader that takes its input from a u8_istream. */
1583 struct lex_file_reader
/* The generic reader.  lex_file_reader_cast() converts a pointer to this
   member back into a pointer to the enclosing lex_file_reader. */
1585 struct lex_reader reader;
/* The stream that lex_file_read() reads from. */
1586 struct u8_istream *istream;
/* Forward declaration; the definition appears after the functions of the
   file reader. */
1590 static struct lex_reader_class lex_file_reader_class;
1592 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1593 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1594 ENCODING, which should take one of the forms accepted by
1595 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1596 mode of the new reader, respectively.
1598 Returns a null pointer if FILE_NAME cannot be opened. */
1600 lex_reader_for_file (const char *file_name, const char *encoding,
1601 enum lex_syntax_mode syntax,
1602 enum lex_error_mode error)
1604 struct lex_file_reader *r;
1605 struct u8_istream *istream;
/* "-" selects the standard input file descriptor; any other name is opened
   read-only. */
1607 istream = (!strcmp(file_name, "-")
1608 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1609 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1610 if (istream == NULL)
/* Report the failure together with the description of errno. */
1612 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1616 r = xmalloc (sizeof *r);
1617 lex_reader_init (&r->reader, &lex_file_reader_class);
1618 r->reader.syntax = syntax;
1619 r->reader.error = error;
/* The generic reader and the file reader each keep a separate copy of
   FILE_NAME. */
1620 r->reader.file_name = xstrdup (file_name);
/* Line numbering starts at 1. */
1621 r->reader.line_number = 1;
1622 r->istream = istream;
1623 r->file_name = xstrdup (file_name);
1628 static struct lex_file_reader *
1629 lex_file_reader_cast (struct lex_reader *r)
1631 return UP_CAST (r, struct lex_file_reader, reader);
/* Reads up to N bytes into BUF from the istream of file reader R_.
   PROMPT_STYLE is not used.  The signature matches the call made through
   reader->class->read, so this is presumably the read function of
   lex_file_reader_class -- the initializer is not visible here. */
1635 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1636 enum prompt_style prompt_style UNUSED)
1638 struct lex_file_reader *r = lex_file_reader_cast (r_);
1639 ssize_t n_read = u8_istream_read (r->istream, buf, n);
/* Report the failure together with the file name and the description of
   errno.  NOTE(review): the condition that guards this message is on a line
   that is not visible here. */
1642 msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
/* Releases the istream and the file name of file reader R_. */
1649 lex_file_close (struct lex_reader *r_)
1651 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* A stream that is not on the standard input descriptor is closed, and a
   failure to close it is reported. */
1653 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1655 if (u8_istream_close (r->istream) != 0)
1656 msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
/* NOTE(review): presumably this is the branch for the standard input
   descriptor, which frees the stream without closing the descriptor; the
   line that selects the branch is not visible here. */
1659 u8_istream_free (r->istream);
1661 free (r->file_name);
/* The reader class that lex_reader_for_file() passes to lex_reader_init()
   for the readers that it creates. */
1665 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that takes its input from a string held in memory. */
1671 struct lex_string_reader
/* The generic reader.  lex_string_reader_cast() converts a pointer to this
   member back into a pointer to the enclosing lex_string_reader. */
1673 struct lex_reader reader;
/* Forward declaration; the definition appears after the functions of the
   string reader. */
1678 static struct lex_reader_class lex_string_reader_class;
1680 /* Creates and returns a new lex_reader for the contents of S, which must be
1681 encoded in UTF-8. The new reader takes ownership of S and will free it
1682 with ss_dealloc() when it is closed. */
1684 lex_reader_for_substring_nocopy (struct substring s)
1686 struct lex_string_reader *r;
1688 r = xmalloc (sizeof *r);
1689 lex_reader_init (&r->reader, &lex_string_reader_class);
/* String readers always use the interactive syntax mode. */
1690 r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
1697 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1698 which must be encoded in UTF-8. The caller retains ownership of S. */
1700 lex_reader_for_string (const char *s)
1702 struct substring ss;
1703 ss_alloc_substring (&ss, ss_cstr (s));
1704 return lex_reader_for_substring_nocopy (ss);
1707 /* Formats FORMAT as a printf()-like format string and creates and returns a
1708 new lex_reader for the formatted result. */
1710 lex_reader_for_format (const char *format, ...)
1712 struct lex_reader *r;
1715 va_start (args, format);
/* The string that xvasprintf() returns is handed to the new reader without
   being copied. */
1716 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)));
1722 static struct lex_string_reader *
1723 lex_string_reader_cast (struct lex_reader *r)
1725 return UP_CAST (r, struct lex_string_reader, reader);
/* Copies into BUF up to N bytes of the string held by string reader R_,
   starting at the current offset of the reader.  PROMPT_STYLE is not
   used. */
1729 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1730 enum prompt_style prompt_style UNUSED)
1732 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* CHUNK is N or the number of bytes left past the offset, whichever is
   less. */
1735 chunk = MIN (n, r->s.length - r->offset);
1736 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string reader R_.  NOTE(review): the statements that
   release the reader are on lines that are not visible here.  The comment on
   lex_reader_for_substring_nocopy() says that the string is freed with
   ss_dealloc() when the reader is closed. */
1743 lex_string_close (struct lex_reader *r_)
1745 struct lex_string_reader *r = lex_string_reader_cast (r_);
1751 static struct lex_reader_class lex_string_reader_class =