1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "data/file-name.h"
34 #include "language/command.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/text-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* Location of token in terms of the lex_source's buffer.
66 src->tail <= line_pos <= token_pos <= src->head. */
67 size_t token_pos; /* Start of token. */
68 size_t token_len; /* Length of source for token in bytes. */
69 size_t line_pos; /* Start of line containing token_pos. */
70 int first_line; /* Line number at token_pos. */
73 /* A source of tokens, corresponding to a syntax file.
75 This is conceptually a lex_reader wrapped with everything needed to convert
76 its UTF-8 bytes into tokens. */
79 struct ll ll; /* In lexer's list of sources. */
80 struct lex_reader *reader;
81 struct segmenter segmenter;
82 bool eof; /* True if T_STOP was read from 'reader'. */
84 /* Buffer of UTF-8 bytes. */
86 size_t allocated; /* Number of bytes allocated. */
87 size_t tail; /* &buffer[0] offset into UTF-8 source. */
88 size_t head; /* &buffer[head - tail] offset into source. */
90 /* Positions in source file, tail <= pos <= head for each member here. */
91 size_t journal_pos; /* First byte not yet output to journal. */
92 size_t seg_pos; /* First byte not yet scanned as token. */
93 size_t line_pos; /* First byte of line containing seg_pos. */
95 int n_newlines; /* Number of new-lines up to seg_pos. */
96 bool suppress_next_newline;
99 struct deque deque; /* Indexes into 'tokens'. */
100 struct lex_token *tokens; /* Lookahead tokens for parser. */
103 static struct lex_source *lex_source_create (struct lex_reader *);
104 static void lex_source_destroy (struct lex_source *);
109 struct ll_list sources; /* Contains "struct lex_source"s. */
112 static struct lex_source *lex_source__ (const struct lexer *);
113 static const struct lex_token *lex_next__ (const struct lexer *, int n);
114 static void lex_source_push_endcmd__ (struct lex_source *);
116 static void lex_source_pop__ (struct lex_source *);
117 static bool lex_source_get__ (const struct lex_source *);
118 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
119 const char *format, va_list)
120 PRINTF_FORMAT (4, 0);
121 static const struct lex_token *lex_source_next__ (const struct lex_source *,
124 /* Initializes READER with the specified CLASS and otherwise some reasonable
125 defaults. The caller should fill in the other members as desired. */
127 lex_reader_init (struct lex_reader *reader,
128 const struct lex_reader_class *class)
130 reader->class = class;
/* Defaults: LEX_SYNTAX_AUTO syntax mode, LEX_ERROR_CONTINUE error mode, no
   file name, no encoding, and line number 0. */
131 reader->syntax = LEX_SYNTAX_AUTO;
132 reader->error = LEX_ERROR_CONTINUE;
133 reader->file_name = NULL;
134 reader->encoding = NULL;
135 reader->line_number = 0;
138 /* Frees any file name already in READER and replaces it by a copy of
139 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
141 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
143 free (reader->file_name);
144 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
147 /* Creates and returns a new lexer. */
/* NOTE(review): the signature and return statement are not visible in this
   listing.  The visible lines allocate the lexer with xzalloc() and
   initialize its list of sources. */
151 struct lexer *lexer = xzalloc (sizeof *lexer);
152 ll_init (&lexer->sources);
156 /* Destroys LEXER. */
158 lex_destroy (struct lexer *lexer)
162 struct lex_source *source, *next;
/* Each SOURCE is destroyed while the list is being walked, hence the "safe"
   iteration macro, which is given NEXT as a separate cursor. */
164 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
165 lex_source_destroy (source);
170 /* Inserts READER into LEXER so that the next token read by LEXER comes from
171 READER. Before the call, LEXER must either be empty or at a T_ENDCMD token. */
174 lex_include (struct lexer *lexer, struct lex_reader *reader)
/* Precondition: LEXER has no sources, or its current token is T_ENDCMD. */
176 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
/* The new source goes at the head of the list, which is the source that
   lex_source__() treats as current. */
177 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
180 /* Appends READER to LEXER, so that it will be read after all other current
181 readers have already been read. */
183 lex_append (struct lexer *lexer, struct lex_reader *reader)
/* Wrap READER in a new lex_source and link it at the tail of the list. */
185 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
/* Adds a new token slot at the front of SRC's deque and initializes its
   'token' member with token_init(). */
190 static struct lex_token *
191 lex_push_token__ (struct lex_source *src)
193 struct lex_token *token;
/* Grow the 'tokens' array first if the deque has no free slot. */
195 if (deque_is_full (&src->deque))
196 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
198 token = &src->tokens[deque_push_front (&src->deque)];
199 token_init (&token->token);
/* Removes the token at the back of SRC's deque and destroys it.  lex_get()
   uses this to consume the current token. */
204 lex_source_pop__ (struct lex_source *src)
206 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
/* Removes the token at the front of SRC's deque, which is the end where
   lex_push_token__() adds tokens, and destroys it. */
210 lex_source_pop_front (struct lex_source *src)
212 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
215 /* Advances LEXER to the next token, consuming the current token. */
217 lex_get (struct lexer *lexer)
219 struct lex_source *src;
221 src = lex_source__ (lexer);
/* Discard the current token, if there is one. */
225 if (!deque_is_empty (&src->deque))
226 lex_source_pop__ (src);
/* Refill until at least one token is queued.  When lex_source_get__() returns
   false for a source, destroy that source and continue with whichever source
   is now at the head of the lexer's list. */
228 while (deque_is_empty (&src->deque))
229 if (!lex_source_get__ (src))
231 lex_source_destroy (src);
232 src = lex_source__ (lexer);
238 /* Issuing errors. */
240 /* Prints a syntax error message containing the current token and the message
241 produced from printf-style FORMAT and its arguments (if FORMAT is non-null). */
243 lex_error (struct lexer *lexer, const char *format, ...)
247 va_start (args, format);
/* 0, 0 selects the current token as both ends of the reported range. */
248 lex_next_error_valist (lexer, 0, 0, format, args);
252 /* Like lex_error(), but takes the arguments for FORMAT as va_list ARGS
253 instead of as variable arguments. FORMAT may be null. */
255 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
257 lex_next_error_valist (lexer, 0, 0, format, args);
260 /* Prints a syntax error message that refers to lookahead tokens N0 through N1
261 and includes the message produced from printf-style FORMAT (if non-null). */
263 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
267 va_start (args, format);
268 lex_next_error_valist (lexer, n0, n1, format, args);
272 /* Prints a syntax error message saying that OPTION0 or one of the other
273 strings following it, up to the first NULL, is expected. */
275 lex_error_expecting (struct lexer *lexer, const char *option0, ...)
277 enum { MAX_OPTIONS = 8 };
278 const char *options[MAX_OPTIONS + 1];
282 va_start (args, option0);
283 options[0] = option0;
/* Store each following argument until a null pointer has been stored or
   N + 1 reaches MAX_OPTIONS. */
285 while (n + 1 < MAX_OPTIONS && options[n] != NULL)
286 options[++n] = va_arg (args, const char *);
/* Each possible number of options has its own complete, translatable format
   string.  (The lines that select among them are not visible in this
   listing.) */
292 lex_error (lexer, NULL);
296 lex_error (lexer, _("expecting %s"), options[0]);
300 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
304 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
309 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
310 options[0], options[1], options[2], options[3]);
314 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
315 options[0], options[1], options[2], options[3], options[4]);
319 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
320 options[0], options[1], options[2], options[3], options[4],
325 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
326 options[0], options[1], options[2], options[3], options[4],
327 options[5], options[6]);
331 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
332 options[0], options[1], options[2], options[3], options[4],
333 options[5], options[6], options[7]);
341 /* Reports an error to the effect that subcommand SBC may only be specified once.
344 This function does not take a lexer as an argument or use lex_error(),
345 because the result would ordinarily just be redundant: "Syntax error at
346 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
347 not help the user find the error. */
349 lex_sbc_only_once (const char *sbc)
/* SBC is substituted for %s in the message. */
351 msg (SE, _("Subcommand %s may only be specified once."), sbc);
354 /* Reports an error to the effect that subcommand SBC is missing.
356 This function does not take a lexer as an argument or use lex_error(),
357 because a missing subcommand can normally be detected only after the whole
358 command has been parsed, and so lex_error() would always report "Syntax
359 error at end of command", which does not help the user find the error. */
361 lex_sbc_missing (const char *sbc)
/* SBC is substituted for %s in the message. */
363 msg (SE, _("Required subcommand %s was not specified."), sbc);
366 /* Reports an error to the effect that specification SPEC may only be specified
367 once within subcommand SBC. */
369 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
/* Unlike lex_sbc_only_once(), this goes through lex_error(), so the message
   is attached to a syntax error at the current token.  In the format, the
   first %s is the specification and the second is the subcommand; the
   argument line itself is not visible in this listing. */
371 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
375 /* Reports an error to the effect that specification SPEC is missing within subcommand SBC. */
378 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
/* In the format, the first %s is the specification and the second is the
   subcommand.  NOTE(review): the argument line is not visible in this
   listing; confirm that SPEC is passed first and SBC second. */
380 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
384 /* Prints a syntax error message that refers to lookahead tokens N0 through N1
385 and includes the message produced from FORMAT and ARGS (if FORMAT is non-null). */
387 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
388 const char *format, va_list args)
390 struct lex_source *src = lex_source__ (lexer);
/* Two paths are visible below: one hands the work to the current source, the
   other builds an "end of input" message directly.  NOTE(review): presumably
   the choice depends on whether SRC is null; the test is not visible in this
   listing. */
393 lex_source_error_valist (src, n0, n1, format, args);
399 ds_put_format (&s, _("Syntax error at end of input"));
/* The caller's message follows after ": ", and the text ends with a period. */
402 ds_put_cstr (&s, ": ");
403 ds_put_vformat (&s, format, args);
405 ds_put_byte (&s, '.');
406 msg (SE, "%s", ds_cstr (&s));
411 /* Checks that we're at end of command.
412 If so, returns a successful command completion code.
413 If not, flags a syntax error and returns an error command completion code. */
416 lex_end_of_command (struct lexer *lexer)
/* Both T_ENDCMD and T_STOP count as being at the end of a command. */
418 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
420 lex_error (lexer, _("expecting end of command"));
427 /* Token testing functions. */
429 /* Returns true if the current token is a number. */
431 lex_is_number (struct lexer *lexer)
/* N = 0 selects the current token. */
433 return lex_next_is_number (lexer, 0);
436 /* Returns true if the current token is a string. */
438 lex_is_string (struct lexer *lexer)
/* N = 0 selects the current token. */
440 return lex_next_is_string (lexer, 0);
443 /* Returns the value of the current token, which must be a
444 floating point number. */
446 lex_number (struct lexer *lexer)
/* The precondition is enforced by an assertion in lex_next_number(). */
448 return lex_next_number (lexer, 0);
451 /* Returns true iff the current token is an integer. */
453 lex_is_integer (struct lexer *lexer)
/* N = 0 selects the current token. */
455 return lex_next_is_integer (lexer, 0);
458 /* Returns the value of the current token, which must be an integer. */
461 lex_integer (struct lexer *lexer)
/* The precondition is enforced by an assertion in lex_next_integer(). */
463 return lex_next_integer (lexer, 0);
466 /* Token testing functions with lookahead.
468 A value of 0 for N as an argument to any of these functions refers to the
469 current token. Lookahead is limited to the current command. Any N greater
470 than the number of tokens remaining in the current command will be treated
471 as referring to a T_ENDCMD token. */
473 /* Returns true if the token N ahead of the current token is a number. */
475 lex_next_is_number (struct lexer *lexer, int n)
477 enum token_type next_token = lex_next_token (lexer, n);
/* A number token has one of two types, T_POS_NUM or T_NEG_NUM. */
478 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
481 /* Returns true if the token N ahead of the current token is a string, that is,
481 a T_STRING token. */
483 lex_next_is_string (struct lexer *lexer, int n)
485 return lex_next_token (lexer, n) == T_STRING;
488 /* Returns the value of the token N ahead of the current token, which must be a
489 floating point number. */
491 lex_next_number (struct lexer *lexer, int n)
/* Calling this on a non-number token is a programming error. */
493 assert (lex_next_is_number (lexer, n));
494 return lex_next_tokval (lexer, n);
/* Returns true if the token N ahead of the current token is an integer, that
   is, a number with no fractional part that is strictly greater than LONG_MIN
   and no greater than LONG_MAX. */
bool
lex_next_is_integer (struct lexer *lexer, int n)
{
  double value;

  if (!lex_next_is_number (lexer, n))
    return false;

  value = lex_next_tokval (lexer, n);

  /* LONG_MAX need not be exactly representable as a double: with a 64-bit
     long it rounds up to 2**63, so "value <= LONG_MAX" would accept 2**63,
     which does not fit in a long.  -(double) LONG_MIN is exactly LONG_MAX + 1
     on two's complement systems, so a strict comparison against it is the
     precise upper bound. */
  return (value > LONG_MIN && value < -(double) LONG_MIN
          && floor (value) == value);
}
510 /* Returns the value of the token N ahead of the current token, which must be an integer. */
513 lex_next_integer (struct lexer *lexer, int n)
/* The assertion guarantees that the value passed the range and integrality
   checks in lex_next_is_integer(). */
515 assert (lex_next_is_integer (lexer, n));
516 return lex_next_tokval (lexer, n);
519 /* Token matching functions. */
521 /* If the current token has the specified TYPE, skips it and returns true.
522 Otherwise, returns false. */
524 lex_match (struct lexer *lexer, enum token_type type)
/* Only the type is compared; any number or string in the token is ignored. */
526 if (lex_token (lexer) == type)
535 /* If the current token matches IDENTIFIER, skips it and returns true.
536 IDENTIFIER may be abbreviated to its first three letters. Otherwise, returns false.
539 IDENTIFIER must be an ASCII string. */
541 lex_match_id (struct lexer *lexer, const char *identifier)
/* 3 is the minimum abbreviation length passed on to lex_match_id_n(). */
543 return lex_match_id_n (lexer, identifier, 3);
546 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
547 may be abbreviated to its first N letters. Otherwise, returns false.
549 IDENTIFIER must be an ASCII string. */
551 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
/* Only a T_ID token can match.  The comparison, including the abbreviation
   rule, is done by lex_id_match_n(). */
553 if (lex_token (lexer) == T_ID
554 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
563 /* If the current token is integer X, skips it and returns true. Otherwise, returns false. */
566 lex_match_int (struct lexer *lexer, int x)
/* lex_integer() asserts that the token is an integer, so it is called only
   after lex_is_integer() has confirmed that. */
568 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
577 /* Forced matches. */
579 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
580 abbreviated to its first 3 letters. Otherwise, reports an error and returns false.
583 IDENTIFIER must be an ASCII string. */
585 lex_force_match_id (struct lexer *lexer, const char *identifier)
587 if (lex_match_id (lexer, identifier))
/* On failure, the error names IDENTIFIER as the only expected option. */
591 lex_error_expecting (lexer, identifier, NULL_SENTINEL);
596 /* If the current token has the specified TYPE, skips it and returns true.
597 Otherwise, reports an error and returns false. */
599 lex_force_match (struct lexer *lexer, enum token_type type)
601 if (lex_token (lexer) == type)
/* Wrap the token type's string form as `...' for the "expecting" message.
   NOTE(review): S comes from xasprintf(); its release is not visible in this
   listing, so confirm that it is freed. */
608 char *s = xasprintf ("`%s'", token_type_to_string (type));
609 lex_error_expecting (lexer, s, NULL_SENTINEL);
615 /* If the current token is a string, does nothing and returns true.
616 Otherwise, reports an error and returns false. */
618 lex_force_string (struct lexer *lexer)
/* The token is only tested, not consumed. */
620 if (lex_is_string (lexer))
624 lex_error (lexer, _("expecting string"));
629 /* If the current token is a string or an identifier, does nothing and returns
630 true. Otherwise, reports an error and returns false.
632 This is meant for use in syntactic situations where we want to encourage the
633 user to supply a quoted string, but for compatibility we also accept
634 identifiers. (One example of such a situation is file names.) Therefore,
635 the error message issued when the current token is wrong only says that a
636 string is expected and doesn't mention that an identifier would also be
639 lex_force_string_or_id (struct lexer *lexer)
641 return lex_is_integer (lexer) || lex_force_string (lexer);
644 /* If the current token is an integer, does nothing and returns true.
645 Otherwise, reports an error and returns false. */
647 lex_force_int (struct lexer *lexer)
/* The token is only tested, not consumed. */
649 if (lex_is_integer (lexer))
653 lex_error (lexer, _("expecting integer"));
658 /* If the current token is a number, does nothing and returns true.
659 Otherwise, reports an error and returns false. */
661 lex_force_num (struct lexer *lexer)
/* The token is only tested, not consumed. */
663 if (lex_is_number (lexer))
666 lex_error (lexer, _("expecting number"));
670 /* If the current token is an identifier, does nothing and returns true.
671 Otherwise, reports an error and returns false. */
673 lex_force_id (struct lexer *lexer)
/* The token is only tested, not consumed. */
675 if (lex_token (lexer) == T_ID)
678 lex_error (lexer, _("expecting identifier"));
682 /* Token accessors. */
684 /* Returns the type of LEXER's current token. */
686 lex_token (const struct lexer *lexer)
/* N = 0 selects the current token. */
688 return lex_next_token (lexer, 0);
691 /* Returns the number in LEXER's current token.
693 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
694 tokens this function will always return zero. */
696 lex_tokval (const struct lexer *lexer)
/* N = 0 selects the current token. */
698 return lex_next_tokval (lexer, 0);
701 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
703 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
704 this function will always return NULL.
706 The UTF-8 encoding of the returned string is correct for variable names and
707 other identifiers. Use filename_to_utf8() to use it as a filename. Use
708 data_in() to use it in a "union value". */
710 lex_tokcstr (const struct lexer *lexer)
/* N = 0 selects the current token. */
712 return lex_next_tokcstr (lexer, 0);
715 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
716 null-terminated (but the null terminator is not included in the returned
717 substring's 'length').
719 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
720 this function will always return NULL.
722 The UTF-8 encoding of the returned string is correct for variable names and
723 other identifiers. Use filename_to_utf8() to use it as a filename. Use
724 data_in() to use it in a "union value". */
726 lex_tokss (const struct lexer *lexer)
/* N = 0 selects the current token. */
728 return lex_next_tokss (lexer, 0);
733 /* A value of 0 for N as an argument to any of these functions refers to the
734 current token. Lookahead is limited to the current command. Any N greater
735 than the number of tokens remaining in the current command will be treated
736 as referring to a T_ENDCMD token. */
/* Returns the lex_token N tokens after the current one, taken from LEXER_'s
   current source by way of lex_source_next__(). */
738 static const struct lex_token *
739 lex_next__ (const struct lexer *lexer_, int n)
/* Cast away const from LEXER_. */
741 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
742 struct lex_source *src = lex_source__ (lexer);
745 return lex_source_next__ (src, n);
/* A constant T_STOP token.  NOTE(review): presumably this is what is returned
   when LEXER_ has no source; the test is not visible in this listing. */
748 static const struct lex_token stop_token =
749 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
/* Returns the token N positions from the back of SRC's deque (N = 0 is the
   current token), reading more tokens from SRC as needed until the deque
   holds more than N of them. */
755 static const struct lex_token *
756 lex_source_next__ (const struct lex_source *src, int n)
758 while (deque_count (&src->deque) <= n)
760 if (!deque_is_empty (&src->deque))
762 struct lex_token *front;
/* FRONT is at the end of the deque where lex_push_token__() adds tokens.  A
   T_STOP or T_ENDCMD there gets special treatment.  NOTE(review): presumably
   FRONT itself is returned so that lookahead stops at the end of the command;
   that line is not visible in this listing. */
764 front = &src->tokens[deque_front (&src->deque, 0)];
765 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
769 lex_source_get__ (src);
772 return &src->tokens[deque_back (&src->deque, n)];
775 /* Returns the "struct token" of the token N after the current one in LEXER.
776 The returned pointer can be invalidated by pretty much any succeeding call
777 into the lexer, although the string pointer within the returned token is
778 only invalidated by consuming the token (e.g. with lex_get()). */
780 lex_next (const struct lexer *lexer, int n)
/* The result points at the 'token' member of the lexer's own lex_token. */
782 return &lex_next__ (lexer, n)->token;
785 /* Returns the type of the token N after the current one in LEXER. N = 0 refers
785 to the current token. */
787 lex_next_token (const struct lexer *lexer, int n)
789 return lex_next (lexer, n)->type;
792 /* Returns the number in the token N after the current one in LEXER.
794 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
795 tokens this function will always return zero. */
797 lex_next_tokval (const struct lexer *lexer, int n)
799 const struct token *token = lex_next (lexer, n);
/* The 'number' member is returned whatever the token's type. */
800 return token->number;
803 /* Returns the null-terminated string in the token N after the current one, in
803 UTF-8 encoding.
806 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
807 this function will always return NULL.
809 The UTF-8 encoding of the returned string is correct for variable names and
810 other identifiers. Use filename_to_utf8() to use it as a filename. Use
811 data_in() to use it in a "union value". */
813 lex_next_tokcstr (const struct lexer *lexer, int n)
/* This is the 'string' pointer of the substring from lex_next_tokss(). */
815 return lex_next_tokss (lexer, n).string;
818 /* Returns the string in the token N after the current one, in UTF-8 encoding.
819 The string is null-terminated (but the null terminator is not included in
820 the returned substring's 'length').
822 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
823 this function will always return NULL.
825 The UTF-8 encoding of the returned string is correct for variable names and
826 other identifiers. Use filename_to_utf8() to use it as a filename. Use
827 data_in() to use it in a "union value". */
829 lex_next_tokss (const struct lexer *lexer, int n)
/* The substring is returned by value and refers to the token's own storage. */
831 return lex_next (lexer, n)->string;
/* Compares token ACTUAL against token EXPECTED.  The types are compared
   first; further comparison then depends on the type.  (The case labels are
   not visible in this listing.) */
835 lex_tokens_match (const struct token *actual, const struct token *expected)
837 if (actual->type != expected->type)
840 switch (actual->type)
/* Tokens compared by numeric value. */
844 return actual->number == expected->number;
/* Tokens compared with lex_id_match(), with EXPECTED's string passed first. */
847 return lex_id_match (expected->string, actual->string);
/* Tokens compared for the same length and the same bytes. */
850 return (actual->string.length == expected->string.length
851 && !memcmp (actual->string.string, expected->string.string,
852 actual->string.length));
859 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
860 skips it and returns true. Otherwise, returns false.
862 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
863 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
864 first three letters. */
866 lex_match_phrase (struct lexer *lexer, const char *s)
868 struct string_lexer slex;
/* Tokenize S, comparing each token that is not SCAN_SKIP against the next
   lookahead token of LEXER.  I counts the lookahead tokens compared so far. */
873 string_lexer_init (&slex, s, SEG_MODE_INTERACTIVE);
874 while (string_lexer_next (&slex, &token))
875 if (token.type != SCAN_SKIP)
877 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
/* TOKEN is destroyed whether or not it matched. */
878 token_destroy (&token);
/* Returns the 'first_line' recorded for the token N after the current one in
   SRC. */
889 lex_source_get_first_line_number (const struct lex_source *src, int n)
891 return lex_source_next__ (src, n)->first_line;
/* Returns the number of new-line characters in the LENGTH bytes starting at
   S.  S need not be null-terminated and is not modified. */
static int
count_newlines (const char *s, size_t length)
{
  int n_newlines = 0;
  const char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      /* Resume the search just past the new-line that was found. */
      n_newlines++;
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
/* Computes the line number just past the end of the token N after the current
   one in SRC, from the token's first line and the new-lines in its text. */
911 lex_source_get_last_line_number (const struct lex_source *src, int n)
913 const struct lex_token *token = lex_source_next__ (src, n);
/* A first_line of 0 is handled separately.  (lex_get_first_line_number()
   documents 0 as meaning that no line number is available.) */
915 if (token->first_line == 0)
/* Token positions are offsets into the source, so subtract 'tail' to index
   the buffer.  The final + 1 makes the result one past the last line. */
919 char *token_str = &src->buffer[token->token_pos - src->tail];
920 return token->first_line + count_newlines (token_str, token->token_len) + 1;
/* Measures the LENGTH bytes at S_ in display columns, decoding one UTF-8
   character at a time.  (The initialization and return of 'columns' are not
   visible in this listing.) */
925 count_columns (const char *s_, size_t length)
927 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
/* MBLEN, set by u8_mbtouc(), is the number of bytes to step forward. */
933 for (ofs = 0; ofs < length; ofs += mblen)
937 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
940 int width = uc_width (uc, "UTF-8");
/* NOTE(review): presumably the tab case (its test is not visible here): round
   COLUMNS + 1 up to a multiple of 8. */
945 columns = ROUND_UP (columns + 1, 8);
/* Returns the column, as measured by count_columns(), at which the token N
   after the current one in SRC starts.  The text measured runs from the start
   of the token's line up to the start of the token. */
952 lex_source_get_first_column (const struct lex_source *src, int n)
954 const struct lex_token *token = lex_source_next__ (src, n);
955 return count_columns (&src->buffer[token->line_pos - src->tail],
956 token->token_pos - token->line_pos);
/* Returns the column, as measured by count_columns(), just past the end of
   the token N after the current one in SRC. */
960 lex_source_get_last_column (const struct lex_source *src, int n)
962 const struct lex_token *token = lex_source_next__ (src, n);
963 char *start, *end, *newline;
/* START is the beginning of the line on which the token starts; END is just
   past the token's last byte. */
965 start = &src->buffer[token->line_pos - src->tail];
966 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
/* Find the last new-line before END.  NOTE(review): presumably START is then
   moved past it so that only the token's final line is measured; that line is
   not visible in this listing. */
967 newline = memrchr (start, '\n', end - start);
970 return count_columns (start, end - start);
973 /* Returns the 1-based line number of the start of the syntax that represents
974 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
975 if the token is drawn from a source that does not have line numbers. */
977 lex_get_first_line_number (const struct lexer *lexer, int n)
979 const struct lex_source *src = lex_source__ (lexer);
/* 0 is also the result when LEXER has no source at all. */
980 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
983 /* Returns the 1-based line number of the end of the syntax that represents the
984 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
985 token or if the token is drawn from a source that does not have line numbers.
988 Most of the time, a single token is wholly within a single line of syntax,
989 but there are two exceptions: a T_STRING token can be made up of multiple
990 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
991 token can consist of a "-" on one line followed by the number on the next. */
994 lex_get_last_line_number (const struct lexer *lexer, int n)
996 const struct lex_source *src = lex_source__ (lexer);
/* 0 is the result when LEXER has no source at all. */
997 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1000 /* Returns the 1-based column number of the start of the syntax that represents
1001 the token N after the current one in LEXER. Returns 0 for a T_STOP token.
1004 Column numbers are measured according to the width of characters as shown in
1005 a typical fixed-width font, in which CJK characters have width 2 and
1006 combining characters have width 0. */
1008 lex_get_first_column (const struct lexer *lexer, int n)
1010 const struct lex_source *src = lex_source__ (lexer);
/* 0 is also the result when LEXER has no source at all. */
1011 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1014 /* Returns the 1-based column number of the end of the syntax that represents
1015 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP token.
1018 Column numbers are measured according to the width of characters as shown in
1019 a typical fixed-width font, in which CJK characters have width 2 and
1020 combining characters have width 0. */
1022 lex_get_last_column (const struct lexer *lexer, int n)
1024 const struct lex_source *src = lex_source__ (lexer);
/* 0 is also the result when LEXER has no source at all. */
1025 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1028 /* Returns the name of the syntax file from which the current command is drawn.
1029 Returns NULL if LEXER has no current source or if the source's reader has no file name.
1032 There is no version of this function that takes an N argument because
1033 lookahead only works to the end of a command and any given command is always
1034 within a single syntax file. */
1036 lex_get_file_name (const struct lexer *lexer)
1038 struct lex_source *src = lex_source__ (lexer);
/* The returned pointer is the reader's own 'file_name' member, not a copy. */
1039 return src == NULL ? NULL : src->reader->file_name;
/* Returns the 'encoding' member of the reader for LEXER's current source, or
   NULL if LEXER has no current source. */
1043 lex_get_encoding (const struct lexer *lexer)
1045 struct lex_source *src = lex_source__ (lexer);
1046 return src == NULL ? NULL : src->reader->encoding;
1050 /* Returns the syntax mode for the syntax file from which the current command is
1051 drawn. Returns LEX_SYNTAX_AUTO if LEXER has no current source (the code
1052 tests only for that case).
1054 There is no version of this function that takes an N argument because
1055 lookahead only works to the end of a command and any given command is always
1056 within a single syntax file. */
1057 enum lex_syntax_mode
1058 lex_get_syntax_mode (const struct lexer *lexer)
1060 struct lex_source *src = lex_source__ (lexer);
1061 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1064 /* Returns the error mode for the syntax file from which the current command is
1065 drawn. Returns LEX_ERROR_TERMINAL if LEXER has no current source (the code
1066 tests only for that case).
1068 There is no version of this function that takes an N argument because
1069 lookahead only works to the end of a command and any given command is always
1070 within a single syntax file. */
1072 lex_get_error_mode (const struct lexer *lexer)
1074 struct lex_source *src = lex_source__ (lexer);
1075 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1078 /* If the source that LEXER is currently reading has error mode
1079 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1080 token to be read comes directly from whatever is next read from the stream.
1082 It makes sense to call this function after encountering an error in a
1083 command entered on the console, because usually the user would prefer not to
1084 have cascading errors. */
1086 lex_interactive_reset (struct lexer *lexer)
1088 struct lex_source *src = lex_source__ (lexer);
1089 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Empty the byte buffer and reset every position into it. */
1091 src->head = src->tail = 0;
1092 src->journal_pos = src->seg_pos = src->line_pos = 0;
1093 src->n_newlines = 0;
1094 src->suppress_next_newline = false;
/* Restart the segmenter in the mode it already had. */
1095 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
/* Drop every queued token, then queue an end-of-command token with
   lex_source_push_endcmd__(). */
1096 while (!deque_is_empty (&src->deque))
1097 lex_source_pop__ (src);
1098 lex_source_push_endcmd__ (src);
1102 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1104 lex_discard_rest_of_command (struct lexer *lexer)
/* The loop ends with the T_ENDCMD or T_STOP itself as the current token. */
1106 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1110 /* Discards all lookahead tokens in LEXER, then discards all input sources
1111 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1112 runs out of input sources. */
1114 lex_discard_noninteractive (struct lexer *lexer)
1116 struct lex_source *src = lex_source__ (lexer);
/* Drop the tokens queued in the current source. */
1120 while (!deque_is_empty (&src->deque))
1121 lex_source_pop__ (src);
/* After each source is destroyed, lex_source__() supplies the next one, or
   NULL when none remain. */
1123 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1124 src = lex_source__ (lexer))
1125 lex_source_destroy (src);
/* Computes, in 'max_tail', the minimum of SRC's journal position, the start
   of the line being scanned, and the start of the line containing the oldest
   queued token.  lex_source_expand__() advances SRC's tail to this value. */
1130 lex_source_max_tail__ (const struct lex_source *src)
1132 const struct lex_token *token;
1135 assert (src->seg_pos >= src->line_pos);
1136 max_tail = MIN (src->journal_pos, src->line_pos);
1138 /* Use the oldest token also. (We know that src->deque cannot be empty
1139 because we are in the process of adding a new token, which is already
1140 initialized enough to use here.) */
1141 token = &src->tokens[deque_back (&src->deque, 0)];
1142 assert (token->token_pos >= token->line_pos);
1143 max_tail = MIN (max_tail, token->line_pos);
/* Makes room for more bytes at the head of SRC's buffer if it is full, either
   by sliding the bytes still needed down to the start of the buffer or by
   enlarging the buffer. */
1149 lex_source_expand__ (struct lex_source *src)
/* head - tail is the number of bytes currently held in the buffer. */
1151 if (src->head - src->tail >= src->allocated)
1153 size_t max_tail = lex_source_max_tail__ (src);
1154 if (max_tail > src->tail)
1156 /* Advance the tail, freeing up room at the head. */
1157 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1158 src->head - max_tail);
1159 src->tail = max_tail;
1163 /* Buffer is completely full. Expand it. */
1164 src->buffer = x2realloc (src->buffer, &src->allocated);
1169 /* There's space available at the head of the buffer. Nothing to do. */
/* Reads more input from SRC's reader into SRC's buffer, repeating until the
   bytes from seg_pos to head contain a new-line. */
1174 lex_source_read__ (struct lex_source *src)
1182 lex_source_expand__ (src);
/* Offer the reader all of the free space after the buffered bytes, along with
   the segmenter's current prompt style. */
1184 head_ofs = src->head - src->tail;
1185 space = src->allocated - head_ofs;
1186 n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1188 segmenter_get_prompt (&src->segmenter));
1189 assert (n <= space);
1195 Ensure that the input always ends in a new-line followed by a null
1196 byte, as required by the segmenter library. */
/* Append a new-line unless the buffered input is empty or already ends in
   one. */
1198 if (src->head == src->tail
1199 || src->buffer[src->head - src->tail - 1] != '\n')
1200 src->buffer[src->head++ - src->tail] = '\n';
/* Make sure there is room for the null byte before storing it. */
1202 lex_source_expand__ (src);
1203 src->buffer[src->head++ - src->tail] = '\0';
1210 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1211 src->head - src->seg_pos));
/* Returns the source at the head of LEXER's list of sources, or NULL if the
   list is empty. */
1214 static struct lex_source *
1215 lex_source__ (const struct lexer *lexer)
1217 return (ll_is_empty (&lexer->sources) ? NULL
1218 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns the bytes of SRC's buffer that run from the start of token N0 to the
   end of token N1, as a substring that points into the buffer (no copy is
   made).  If N1 is less than N0, N0 is used for both ends. */
1221 static struct substring
1222 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1224 const struct lex_token *token0 = lex_source_next__ (src, n0);
1225 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1226 size_t start = token0->token_pos;
1227 size_t end = token1->token_pos + token1->token_len;
/* Token positions are offsets into the source, so subtract 'tail' to index
   the buffer. */
1229 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Copies a prefix of IN into OUT, a buffer of OUT_SIZE bytes (at least 16), as
   a null-terminated string.  The scan considers IN's first line only and
   steps by whole UTF-8 characters.  "..." is appended when less than all of
   IN was copied. */
1233 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1239 assert (out_size >= 16);
/* Reserve one byte for the null terminator and, when IN cannot fit whole,
   three more for the "...". */
1240 out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1241 for (out_len = 0; out_len < in.length; out_len += mblen)
/* Look for a line end: a new-line, or a carriage return followed by one. */
1243 if (in.string[out_len] == '\n'
1244 || (in.string[out_len] == '\r'
1245 && out_len + 1 < in.length
1246 && in.string[out_len + 1] == '\n'))
/* Check whether the next whole character would exceed the space available. */
1249 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1250 in.length - out_len);
1251 if (out_len + mblen > out_maxlen)
1255 memcpy (out, in.string, out_len);
1256 strcpy (&out[out_len], out_len < in.length ? "..." : "");
1260 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1261 const char *format, va_list args)
1263 const struct lex_token *token;
1269 token = lex_source_next__ (src, n0);
1270 if (token->token.type == T_ENDCMD)
1271 ds_put_cstr (&s, _("Syntax error at end of command"));
1274 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1275 if (!ss_is_empty (syntax))
1277 char syntax_cstr[64];
1279 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1280 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1283 ds_put_cstr (&s, _("Syntax error"));
1288 ds_put_cstr (&s, ": ");
1289 ds_put_vformat (&s, format, args);
1291 ds_put_byte (&s, '.');
1293 m.category = MSG_C_SYNTAX;
1294 m.severity = MSG_S_ERROR;
1295 m.file_name = src->reader->file_name;
1296 m.first_line = lex_source_get_first_line_number (src, n0);
1297 m.last_line = lex_source_get_last_line_number (src, n1);
1298 m.first_column = lex_source_get_first_column (src, n0);
1299 m.last_column = lex_source_get_last_column (src, n1);
1300 m.text = ds_steal_cstr (&s);
1304 static void PRINTF_FORMAT (2, 3)
1305 lex_get_error (struct lex_source *src, const char *format, ...)
1310 va_start (args, format);
1312 n = deque_count (&src->deque) - 1;
1313 lex_source_error_valist (src, n, n, format, args);
1314 lex_source_pop_front (src);
1320 lex_source_get__ (const struct lex_source *src_)
1322 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1326 struct segmenter segmenter;
1327 enum segment_type last_segment;
1333 struct state state, saved;
1334 enum scan_result result;
1335 struct scanner scanner;
1336 struct lex_token *token;
1343 state.segmenter = src->segmenter;
1345 state.seg_pos = src->seg_pos;
1346 state.line_pos = src->line_pos;
1349 token = lex_push_token__ (src);
1350 scanner_init (&scanner, &token->token);
1351 token->line_pos = src->line_pos;
1352 token->token_pos = src->seg_pos;
1353 if (src->reader->line_number > 0)
1354 token->first_line = src->reader->line_number + src->n_newlines;
1356 token->first_line = 0;
1360 enum segment_type type;
1361 const char *segment;
1365 segment = &src->buffer[state.seg_pos - src->tail];
1366 seg_maxlen = src->head - state.seg_pos;
1367 seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
1370 lex_source_read__ (src);
1374 state.last_segment = type;
1375 state.seg_pos += seg_len;
1376 if (type == SEG_NEWLINE)
1379 state.line_pos = state.seg_pos;
1382 result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1384 if (result == SCAN_SAVE)
1386 else if (result == SCAN_BACK)
1391 else if (result == SCAN_DONE)
1395 n_lines = state.newlines;
1396 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1399 src->suppress_next_newline = true;
1401 else if (n_lines > 0 && src->suppress_next_newline)
1404 src->suppress_next_newline = false;
1406 for (i = 0; i < n_lines; i++)
1408 const char *newline;
1413 line = &src->buffer[src->journal_pos - src->tail];
1414 newline = rawmemchr (line, '\n');
1415 line_len = newline - line;
1416 if (line_len > 0 && line[line_len - 1] == '\r')
1419 syntax = malloc (line_len + 2);
1420 memcpy (syntax, line, line_len);
1421 syntax[line_len] = '\n';
1422 syntax[line_len + 1] = '\0';
1424 text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1426 src->journal_pos += newline - line + 1;
1429 token->token_len = state.seg_pos - src->seg_pos;
1431 src->segmenter = state.segmenter;
1432 src->seg_pos = state.seg_pos;
1433 src->line_pos = state.line_pos;
1434 src->n_newlines += state.newlines;
1436 switch (token->token.type)
1442 token->token.type = T_ENDCMD;
1446 case SCAN_BAD_HEX_LENGTH:
1447 lex_get_error (src, _("String of hex digits has %d characters, which "
1448 "is not a multiple of 2"),
1449 (int) token->token.number);
1452 case SCAN_BAD_HEX_DIGIT:
1453 case SCAN_BAD_UNICODE_DIGIT:
1454 lex_get_error (src, _("`%c' is not a valid hex digit"),
1455 (int) token->token.number);
1458 case SCAN_BAD_UNICODE_LENGTH:
1459 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1460 "not in the valid range of 1 to 8 bytes"),
1461 (int) token->token.number);
1464 case SCAN_BAD_UNICODE_CODE_POINT:
1465 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1466 (int) token->token.number);
1469 case SCAN_EXPECTED_QUOTE:
1470 lex_get_error (src, _("Unterminated string constant"));
1473 case SCAN_EXPECTED_EXPONENT:
1474 lex_get_error (src, _("Missing exponent following `%s'"),
1475 token->token.string.string);
1478 case SCAN_UNEXPECTED_DOT:
1479 lex_get_error (src, _("Unexpected `.' in middle of command"));
1482 case SCAN_UNEXPECTED_CHAR:
1485 lex_get_error (src, _("Bad character %s in input"),
1486 uc_name (token->token.number, c_name));
1491 lex_source_pop_front (src);
1499 lex_source_push_endcmd__ (struct lex_source *src)
1501 struct lex_token *token = lex_push_token__ (src);
1502 token->token.type = T_ENDCMD;
1503 token->token_pos = 0;
1504 token->token_len = 0;
1505 token->line_pos = 0;
1506 token->first_line = 0;
1509 static struct lex_source *
1510 lex_source_create (struct lex_reader *reader)
1512 struct lex_source *src;
1513 enum segmenter_mode mode;
1515 src = xzalloc (sizeof *src);
1516 src->reader = reader;
1518 if (reader->syntax == LEX_SYNTAX_AUTO)
1519 mode = SEG_MODE_AUTO;
1520 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1521 mode = SEG_MODE_INTERACTIVE;
1522 else if (reader->syntax == LEX_SYNTAX_BATCH)
1523 mode = SEG_MODE_BATCH;
1526 segmenter_init (&src->segmenter, mode);
1528 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1530 lex_source_push_endcmd__ (src);
1536 lex_source_destroy (struct lex_source *src)
1538 char *file_name = src->reader->file_name;
1539 char *encoding = src->reader->encoding;
1540 if (src->reader->class->destroy != NULL)
1541 src->reader->class->destroy (src->reader);
1545 while (!deque_is_empty (&src->deque))
1546 lex_source_pop__ (src);
1548 ll_remove (&src->ll);
1552 struct lex_file_reader
1554 struct lex_reader reader;
1555 struct u8_istream *istream;
1558 static struct lex_reader_class lex_file_reader_class;
1560 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1561 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1562 ENCODING, which should take one of the forms accepted by
1563 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1564 mode of the new reader, respectively.
1566 Returns a null pointer if FILE_NAME cannot be opened. */
1568 lex_reader_for_file (const char *file_name, const char *encoding,
1569 enum lex_syntax_mode syntax,
1570 enum lex_error_mode error)
1572 struct lex_file_reader *r;
1573 struct u8_istream *istream;
1575 istream = (!strcmp(file_name, "-")
1576 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1577 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1578 if (istream == NULL)
1580 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1584 r = xmalloc (sizeof *r);
1585 lex_reader_init (&r->reader, &lex_file_reader_class);
1586 r->reader.syntax = syntax;
1587 r->reader.error = error;
1588 r->reader.file_name = xstrdup (file_name);
1589 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1590 r->reader.line_number = 1;
1591 r->istream = istream;
1596 static struct lex_file_reader *
1597 lex_file_reader_cast (struct lex_reader *r)
1599 return UP_CAST (r, struct lex_file_reader, reader);
1603 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1604 enum prompt_style prompt_style UNUSED)
1606 struct lex_file_reader *r = lex_file_reader_cast (r_);
1607 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1610 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1617 lex_file_close (struct lex_reader *r_)
1619 struct lex_file_reader *r = lex_file_reader_cast (r_);
1621 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1623 if (u8_istream_close (r->istream) != 0)
1624 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1627 u8_istream_free (r->istream);
1632 static struct lex_reader_class lex_file_reader_class =
1638 struct lex_string_reader
1640 struct lex_reader reader;
1645 static struct lex_reader_class lex_string_reader_class;
1647 /* Creates and returns a new lex_reader for the contents of S, which must be
1648 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1649 with ss_dealloc() when it is closed. */
1651 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1653 struct lex_string_reader *r;
1655 r = xmalloc (sizeof *r);
1656 lex_reader_init (&r->reader, &lex_string_reader_class);
1657 r->reader.syntax = LEX_SYNTAX_AUTO;
1658 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1665 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1666 which must be encoded in ENCODING. The caller retains ownership of S. */
1668 lex_reader_for_string (const char *s, const char *encoding)
1670 struct substring ss;
1671 ss_alloc_substring (&ss, ss_cstr (s));
1672 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string, using the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   result, which must be encoded in ENCODING. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  struct lex_reader *reader;
  char *formatted;
  va_list args;

  va_start (args, encoding);
  formatted = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes ownership of the formatted string. */
  reader = lex_reader_for_substring_nocopy (ss_cstr (formatted), encoding);

  return reader;
}
1690 static struct lex_string_reader *
1691 lex_string_reader_cast (struct lex_reader *r)
1693 return UP_CAST (r, struct lex_string_reader, reader);
1697 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1698 enum prompt_style prompt_style UNUSED)
1700 struct lex_string_reader *r = lex_string_reader_cast (r_);
1703 chunk = MIN (n, r->s.length - r->offset);
1704 memcpy (buf, r->s.string + r->offset, chunk);
1711 lex_string_close (struct lex_reader *r_)
1713 struct lex_string_reader *r = lex_string_reader_cast (r_);
1719 static struct lex_reader_class lex_string_reader_class =