1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/output-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* Location of token in terms of the lex_source's buffer.
65 src->tail <= line_pos <= token_pos <= src->head. */
66 size_t token_pos; /* Start of token. */
67 size_t token_len; /* Length of source for token in bytes. */
68 size_t line_pos; /* Start of line containing token_pos. */
69 int first_line; /* Line number at token_pos. */
72 /* A source of tokens, corresponding to a syntax file.
74 This is conceptually a lex_reader wrapped with everything needed to convert
75 its UTF-8 bytes into tokens. */
78 struct ll ll; /* In lexer's list of sources. */
79 struct lex_reader *reader;
80 struct segmenter segmenter;
81 bool eof; /* True if T_STOP was read from 'reader'. */
83 /* Buffer of UTF-8 bytes. */
85 size_t allocated; /* Number of bytes allocated. */
86 size_t tail; /* &buffer[0] offset into UTF-8 source. */
87 size_t head; /* &buffer[head - tail] offset into source. */
89 /* Positions in source file, tail <= pos <= head for each member here. */
90 size_t journal_pos; /* First byte not yet output to journal. */
91 size_t seg_pos; /* First byte not yet scanned as token. */
92 size_t line_pos; /* First byte of line containing seg_pos. */
94 int n_newlines; /* Number of new-lines up to seg_pos. */
95 bool suppress_next_newline;
98 struct deque deque; /* Indexes into 'tokens'. */
99 struct lex_token *tokens; /* Lookahead tokens for parser. */
102 static struct lex_source *lex_source_create (struct lex_reader *);
103 static void lex_source_destroy (struct lex_source *);
108 struct ll_list sources; /* Contains "struct lex_source"s. */
111 static struct lex_source *lex_source__ (const struct lexer *);
112 static const struct lex_token *lex_next__ (const struct lexer *, int n);
113 static void lex_source_push_endcmd__ (struct lex_source *);
115 static void lex_source_pop__ (struct lex_source *);
116 static bool lex_source_get__ (const struct lex_source *);
117 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
118 const char *format, va_list)
119 PRINTF_FORMAT (4, 0);
120 static const struct lex_token *lex_source_next__ (const struct lex_source *,
123 /* Initializes READER with the specified CLASS and otherwise some reasonable
124 defaults. The caller should fill in the other members as desired. */
126 lex_reader_init (struct lex_reader *reader,
127 const struct lex_reader_class *class)
129 reader->class = class;
130 reader->syntax = LEX_SYNTAX_AUTO;
131 reader->error = LEX_ERROR_CONTINUE;
132 reader->file_name = NULL;
133 reader->encoding = NULL;
134 reader->line_number = 0;
138 /* Frees any file name already in READER and replaces it by a copy of
139 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
141 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
143 free (reader->file_name);
144 reader->file_name = xstrdup_if_nonnull (file_name);
147 /* Creates and returns a new lexer. */
151 struct lexer *lexer = xzalloc (sizeof *lexer);
152 ll_init (&lexer->sources);
156 /* Destroys LEXER. */
158 lex_destroy (struct lexer *lexer)
162 struct lex_source *source, *next;
164 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
165 lex_source_destroy (source);
170 /* Inserts READER into LEXER so that the next token read by LEXER comes from
171 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
174 lex_include (struct lexer *lexer, struct lex_reader *reader)
176 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
177 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
180 /* Appends READER to LEXER, so that it will be read after all other current
181 readers have already been read. */
183 lex_append (struct lexer *lexer, struct lex_reader *reader)
185 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
190 static struct lex_token *
191 lex_push_token__ (struct lex_source *src)
193 struct lex_token *token;
195 if (deque_is_full (&src->deque))
196 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
198 token = &src->tokens[deque_push_front (&src->deque)];
199 token_init (&token->token);
204 lex_source_pop__ (struct lex_source *src)
206 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
210 lex_source_pop_front (struct lex_source *src)
212 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
215 /* Advances LEXER to the next token, consuming the current token. */
217 lex_get (struct lexer *lexer)
219 struct lex_source *src;
221 src = lex_source__ (lexer);
225 if (!deque_is_empty (&src->deque))
226 lex_source_pop__ (src);
228 while (deque_is_empty (&src->deque))
229 if (!lex_source_get__ (src))
231 lex_source_destroy (src);
232 src = lex_source__ (lexer);
238 /* Issuing errors. */
240 /* Prints a syntax error message containing the current token and
241 given message MESSAGE (if non-null). */
243 lex_error (struct lexer *lexer, const char *format, ...)
247 va_start (args, format);
248 lex_next_error_valist (lexer, 0, 0, format, args);
252 /* Prints a syntax error message containing the current token and
253 given message MESSAGE (if non-null). */
255 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
257 lex_next_error_valist (lexer, 0, 0, format, args);
260 /* Prints a syntax error message containing the tokens N0 through N1 ahead of
261 the current token and the message given by FORMAT (if non-null). */
263 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
267 va_start (args, format);
268 lex_next_error_valist (lexer, n0, n1, format, args);
272 /* Prints a syntax error message saying that one of the strings provided as
273 varargs, up to the first NULL, is expected. */
275 (lex_error_expecting) (struct lexer *lexer, ...)
279 va_start (args, lexer);
280 lex_error_expecting_valist (lexer, args);
284 /* Prints a syntax error message saying that one of the options provided in
285 ARGS, up to the first NULL, is expected. */
287 lex_error_expecting_valist (struct lexer *lexer, va_list args)
289 enum { MAX_OPTIONS = 9 };
290 const char *options[MAX_OPTIONS];
292 while (n < MAX_OPTIONS)
294 const char *option = va_arg (args, const char *);
298 options[n++] = option;
300 lex_error_expecting_array (lexer, options, n);
304 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
309 lex_error (lexer, NULL);
313 lex_error (lexer, _("expecting %s"), options[0]);
317 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
321 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
326 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
327 options[0], options[1], options[2], options[3]);
331 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
332 options[0], options[1], options[2], options[3], options[4]);
336 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
337 options[0], options[1], options[2], options[3], options[4],
342 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
343 options[0], options[1], options[2], options[3], options[4],
344 options[5], options[6]);
348 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
349 options[0], options[1], options[2], options[3], options[4],
350 options[5], options[6], options[7]);
354 lex_error (lexer, NULL);
358 /* Reports an error to the effect that subcommand SBC may only be specified
361 This function does not take a lexer as an argument or use lex_error(),
362 because the result would ordinarily just be redundant: "Syntax error at
363 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
364 not help the user find the error. */
366 lex_sbc_only_once (const char *sbc)
368 msg (SE, _("Subcommand %s may only be specified once."), sbc);
371 /* Reports an error to the effect that subcommand SBC is missing.
373 This function does not take a lexer as an argument or use lex_error(),
374 because a missing subcommand can normally be detected only after the whole
375 command has been parsed, and so lex_error() would always report "Syntax
376 error at end of command", which does not help the user find the error. */
378 lex_sbc_missing (const char *sbc)
380 msg (SE, _("Required subcommand %s was not specified."), sbc);
383 /* Reports an error to the effect that specification SPEC may only be specified
384 once within subcommand SBC. */
386 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
388 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
392 /* Reports an error to the effect that specification SPEC is missing within
395 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
397 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
401 /* Prints a syntax error message containing the tokens N0 through N1 ahead of
402 the current token and the message given by FORMAT and ARGS (if FORMAT is non-null). */
404 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
405 const char *format, va_list args)
407 struct lex_source *src = lex_source__ (lexer);
410 lex_source_error_valist (src, n0, n1, format, args);
416 ds_put_format (&s, _("Syntax error at end of input"));
419 ds_put_cstr (&s, ": ");
420 ds_put_vformat (&s, format, args);
422 ds_put_byte (&s, '.');
423 msg (SE, "%s", ds_cstr (&s));
428 /* Checks that we're at end of command.
429 If so, returns a successful command completion code.
430 If not, flags a syntax error and returns an error command
433 lex_end_of_command (struct lexer *lexer)
435 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
437 lex_error (lexer, _("expecting end of command"));
444 /* Token testing functions. */
446 /* Returns true if the current token is a number. */
448 lex_is_number (const struct lexer *lexer)
450 return lex_next_is_number (lexer, 0);
453 /* Returns true if the current token is a string. */
455 lex_is_string (const struct lexer *lexer)
457 return lex_next_is_string (lexer, 0);
460 /* Returns the value of the current token, which must be a
461 floating point number. */
463 lex_number (const struct lexer *lexer)
465 return lex_next_number (lexer, 0);
468 /* Returns true iff the current token is an integer. */
470 lex_is_integer (const struct lexer *lexer)
472 return lex_next_is_integer (lexer, 0);
475 /* Returns the value of the current token, which must be an
478 lex_integer (const struct lexer *lexer)
480 return lex_next_integer (lexer, 0);
483 /* Token testing functions with lookahead.
485 A value of 0 for N as an argument to any of these functions refers to the
486 current token. Lookahead is limited to the current command. Any N greater
487 than the number of tokens remaining in the current command will be treated
488 as referring to a T_ENDCMD token. */
490 /* Returns true if the token N ahead of the current token is a number. */
492 lex_next_is_number (const struct lexer *lexer, int n)
494 enum token_type next_token = lex_next_token (lexer, n);
495 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
498 /* Returns true if the token N ahead of the current token is a string. */
500 lex_next_is_string (const struct lexer *lexer, int n)
502 return lex_next_token (lexer, n) == T_STRING;
505 /* Returns the value of the token N ahead of the current token, which must be a
506 floating point number. */
508 lex_next_number (const struct lexer *lexer, int n)
510 assert (lex_next_is_number (lexer, n));
511 return lex_next_tokval (lexer, n);
/* Returns true if the token N ahead of the current token is a number whose
   value is integral and can be converted to 'long' without overflow. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  double value;

  if (!lex_next_is_number (lexer, n))
    return false;

  value = lex_next_tokval (lexer, n);

  /* The upper bound is tested against -(double) LONG_MIN rather than
     LONG_MAX.  Where 'long' has more value bits than a double's significand
     (e.g. 64-bit 'long'), converting LONG_MAX to double rounds up to
     LONG_MAX + 1, so "value <= LONG_MAX" would accept a value that
     lex_next_integer() cannot convert to 'long'.  LONG_MIN is a negated
     power of two on two's complement machines and converts exactly.

     The lower bound stays exclusive, as it always has been. */
  return (value > LONG_MIN
          && value < -(double) LONG_MIN
          && floor (value) == value);
}
527 /* Returns the value of the token N ahead of the current token, which must be
530 lex_next_integer (const struct lexer *lexer, int n)
532 assert (lex_next_is_integer (lexer, n));
533 return lex_next_tokval (lexer, n);
536 /* Token matching functions. */
538 /* If the current token has the specified TYPE, skips it and returns true.
539 Otherwise, returns false. */
541 lex_match (struct lexer *lexer, enum token_type type)
543 if (lex_token (lexer) == type)
552 /* If the current token matches IDENTIFIER, skips it and returns true.
553 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
556 IDENTIFIER must be an ASCII string. */
558 lex_match_id (struct lexer *lexer, const char *identifier)
560 return lex_match_id_n (lexer, identifier, 3);
563 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
564 may be abbreviated to its first N letters. Otherwise, returns false.
566 IDENTIFIER must be an ASCII string. */
568 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
570 if (lex_token (lexer) == T_ID
571 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
580 /* If the current token is integer X, skips it and returns true. Otherwise,
583 lex_match_int (struct lexer *lexer, int x)
585 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
594 /* Forced matches. */
596 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
597 abbreviated to its first 3 letters. Otherwise, reports an error and returns
600 IDENTIFIER must be an ASCII string. */
602 lex_force_match_id (struct lexer *lexer, const char *identifier)
604 if (lex_match_id (lexer, identifier))
608 lex_error_expecting (lexer, identifier);
613 /* If the current token has the specified TYPE, skips it and returns true.
614 Otherwise, reports an error and returns false. */
616 lex_force_match (struct lexer *lexer, enum token_type type)
618 if (lex_token (lexer) == type)
625 const char *type_string = token_type_to_string (type);
628 char *s = xasprintf ("`%s'", type_string);
629 lex_error_expecting (lexer, s);
633 lex_error_expecting (lexer, token_type_to_name (type));
639 /* If the current token is a string, does nothing and returns true.
640 Otherwise, reports an error and returns false. */
642 lex_force_string (struct lexer *lexer)
644 if (lex_is_string (lexer))
648 lex_error (lexer, _("expecting string"));
653 /* If the current token is a string or an identifier, does nothing and returns
654 true. Otherwise, reports an error and returns false.
656 This is meant for use in syntactic situations where we want to encourage the
657 user to supply a quoted string, but for compatibility we also accept
658 identifiers. (One example of such a situation is file names.) Therefore,
659 the error message issued when the current token is wrong only says that a
660 string is expected and doesn't mention that an identifier would also be
663 lex_force_string_or_id (struct lexer *lexer)
665 return lex_token (lexer) == T_ID || lex_force_string (lexer);
668 /* If the current token is an integer, does nothing and returns true.
669 Otherwise, reports an error and returns false. */
671 lex_force_int (struct lexer *lexer)
673 if (lex_is_integer (lexer))
677 lex_error (lexer, _("expecting integer"));
682 /* If the current token is a number, does nothing and returns true.
683 Otherwise, reports an error and returns false. */
685 lex_force_num (struct lexer *lexer)
687 if (lex_is_number (lexer))
690 lex_error (lexer, _("expecting number"));
694 /* If the current token is an identifier, does nothing and returns true.
695 Otherwise, reports an error and returns false. */
697 lex_force_id (struct lexer *lexer)
699 if (lex_token (lexer) == T_ID)
702 lex_error (lexer, _("expecting identifier"));
706 /* Token accessors. */
708 /* Returns the type of LEXER's current token. */
710 lex_token (const struct lexer *lexer)
712 return lex_next_token (lexer, 0);
715 /* Returns the number in LEXER's current token.
717 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
718 tokens this function will always return zero. */
720 lex_tokval (const struct lexer *lexer)
722 return lex_next_tokval (lexer, 0);
725 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
727 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
728 this function will always return NULL.
730 The UTF-8 encoding of the returned string is correct for variable names and
731 other identifiers. Use filename_to_utf8() to use it as a filename. Use
732 data_in() to use it in a "union value". */
734 lex_tokcstr (const struct lexer *lexer)
736 return lex_next_tokcstr (lexer, 0);
739 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
740 null-terminated (but the null terminator is not included in the returned
741 substring's 'length').
743 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
744 this function will always return NULL.
746 The UTF-8 encoding of the returned string is correct for variable names and
747 other identifiers. Use filename_to_utf8() to use it as a filename. Use
748 data_in() to use it in a "union value". */
750 lex_tokss (const struct lexer *lexer)
752 return lex_next_tokss (lexer, 0);
757 A value of 0 for N as an argument to any of these functions refers to the
758 current token. Lookahead is limited to the current command. Any N greater
759 than the number of tokens remaining in the current command will be treated
760 as referring to a T_ENDCMD token. */
762 static const struct lex_token *
763 lex_next__ (const struct lexer *lexer_, int n)
765 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
766 struct lex_source *src = lex_source__ (lexer);
769 return lex_source_next__ (src, n);
772 static const struct lex_token stop_token =
773 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
779 static const struct lex_token *
780 lex_source_next__ (const struct lex_source *src, int n)
782 while (deque_count (&src->deque) <= n)
784 if (!deque_is_empty (&src->deque))
786 struct lex_token *front;
788 front = &src->tokens[deque_front (&src->deque, 0)];
789 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
793 lex_source_get__ (src);
796 return &src->tokens[deque_back (&src->deque, n)];
799 /* Returns the "struct token" of the token N after the current one in LEXER.
800 The returned pointer can be invalidated by pretty much any succeeding call
801 into the lexer, although the string pointer within the returned token is
802 only invalidated by consuming the token (e.g. with lex_get()). */
804 lex_next (const struct lexer *lexer, int n)
806 return &lex_next__ (lexer, n)->token;
809 /* Returns the type of the token N after the current one in LEXER. */
811 lex_next_token (const struct lexer *lexer, int n)
813 return lex_next (lexer, n)->type;
816 /* Returns the number in the token N after the current one in LEXER.
818 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
819 tokens this function will always return zero. */
821 lex_next_tokval (const struct lexer *lexer, int n)
823 const struct token *token = lex_next (lexer, n);
824 return token->number;
827 /* Returns the null-terminated string in the token N after the current one, in
830 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
831 this function will always return NULL.
833 The UTF-8 encoding of the returned string is correct for variable names and
834 other identifiers. Use filename_to_utf8() to use it as a filename. Use
835 data_in() to use it in a "union value". */
837 lex_next_tokcstr (const struct lexer *lexer, int n)
839 return lex_next_tokss (lexer, n).string;
842 /* Returns the string in the token N after the current one, in UTF-8 encoding.
843 The string is null-terminated (but the null terminator is not included in
844 the returned substring's 'length').
846 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
847 this function will always return NULL.
849 The UTF-8 encoding of the returned string is correct for variable names and
850 other identifiers. Use filename_to_utf8() to use it as a filename. Use
851 data_in() to use it in a "union value". */
853 lex_next_tokss (const struct lexer *lexer, int n)
855 return lex_next (lexer, n)->string;
859 lex_tokens_match (const struct token *actual, const struct token *expected)
861 if (actual->type != expected->type)
864 switch (actual->type)
868 return actual->number == expected->number;
871 return lex_id_match (expected->string, actual->string);
874 return (actual->string.length == expected->string.length
875 && !memcmp (actual->string.string, expected->string.string,
876 actual->string.length));
883 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
884 skips it and returns true. Otherwise, returns false.
886 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
887 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
888 first three letters. */
890 lex_match_phrase (struct lexer *lexer, const char *s)
892 struct string_lexer slex;
897 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
898 while (string_lexer_next (&slex, &token))
899 if (token.type != SCAN_SKIP)
901 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
902 token_destroy (&token);
913 lex_source_get_first_line_number (const struct lex_source *src, int n)
915 return lex_source_next__ (src, n)->first_line;
/* Returns the number of new-line ('\n') bytes among the first LENGTH bytes
   of S.  S need not be null-terminated; it is only read, never modified. */
static int
count_newlines (const char *s, size_t length)
{
  int n_newlines = 0;
  const char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      n_newlines++;

      /* Continue the search just past the new-line that was found. */
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
935 lex_source_get_last_line_number (const struct lex_source *src, int n)
937 const struct lex_token *token = lex_source_next__ (src, n);
939 if (token->first_line == 0)
943 char *token_str = &src->buffer[token->token_pos - src->tail];
944 return token->first_line + count_newlines (token_str, token->token_len) + 1;
949 count_columns (const char *s_, size_t length)
951 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
957 for (ofs = 0; ofs < length; ofs += mblen)
961 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
964 int width = uc_width (uc, "UTF-8");
969 columns = ROUND_UP (columns + 1, 8);
976 lex_source_get_first_column (const struct lex_source *src, int n)
978 const struct lex_token *token = lex_source_next__ (src, n);
979 return count_columns (&src->buffer[token->line_pos - src->tail],
980 token->token_pos - token->line_pos);
984 lex_source_get_last_column (const struct lex_source *src, int n)
986 const struct lex_token *token = lex_source_next__ (src, n);
987 char *start, *end, *newline;
989 start = &src->buffer[token->line_pos - src->tail];
990 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
991 newline = memrchr (start, '\n', end - start);
994 return count_columns (start, end - start);
997 /* Returns the 1-based line number of the start of the syntax that represents
998 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
999 if the token is drawn from a source that does not have line numbers. */
1001 lex_get_first_line_number (const struct lexer *lexer, int n)
1003 const struct lex_source *src = lex_source__ (lexer);
1004 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1007 /* Returns the 1-based line number of the end of the syntax that represents the
1008 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1009 token or if the token is drawn from a source that does not have line
1012 Most of the time, a single token is wholly within a single line of syntax,
1013 but there are two exceptions: a T_STRING token can be made up of multiple
1014 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1015 token can consist of a "-" on one line followed by the number on the next.
1018 lex_get_last_line_number (const struct lexer *lexer, int n)
1020 const struct lex_source *src = lex_source__ (lexer);
1021 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1024 /* Returns the 1-based column number of the start of the syntax that represents
1025 the token N after the current one in LEXER. Returns 0 for a T_STOP
1028 Column numbers are measured according to the width of characters as shown in
1029 a typical fixed-width font, in which CJK characters have width 2 and
1030 combining characters have width 0. */
1032 lex_get_first_column (const struct lexer *lexer, int n)
1034 const struct lex_source *src = lex_source__ (lexer);
1035 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1038 /* Returns the 1-based column number of the end of the syntax that represents
1039 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1042 Column numbers are measured according to the width of characters as shown in
1043 a typical fixed-width font, in which CJK characters have width 2 and
1044 combining characters have width 0. */
1046 lex_get_last_column (const struct lexer *lexer, int n)
1048 const struct lex_source *src = lex_source__ (lexer);
1049 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1052 /* Returns the name of the syntax file from which the current command is drawn.
1053 Returns NULL for a T_STOP token or if the command's source does not have
1056 There is no version of this function that takes an N argument because
1057 lookahead only works to the end of a command and any given command is always
1058 within a single syntax file. */
1060 lex_get_file_name (const struct lexer *lexer)
1062 struct lex_source *src = lex_source__ (lexer);
1063 return src == NULL ? NULL : src->reader->file_name;
1067 lex_get_encoding (const struct lexer *lexer)
1069 struct lex_source *src = lex_source__ (lexer);
1070 return src == NULL ? NULL : src->reader->encoding;
1074 /* Returns the syntax mode for the syntax file from which the current command is
1075 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1076 source does not have line numbers.
1078 There is no version of this function that takes an N argument because
1079 lookahead only works to the end of a command and any given command is always
1080 within a single syntax file. */
1081 enum lex_syntax_mode
1082 lex_get_syntax_mode (const struct lexer *lexer)
1084 struct lex_source *src = lex_source__ (lexer);
1085 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1088 /* Returns the error mode for the syntax file from which the current command is
1089 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1090 source does not have line numbers.
1092 There is no version of this function that takes an N argument because
1093 lookahead only works to the end of a command and any given command is always
1094 within a single syntax file. */
1096 lex_get_error_mode (const struct lexer *lexer)
1098 struct lex_source *src = lex_source__ (lexer);
1099 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1102 /* If the source that LEXER is currently reading has error mode
1103 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1104 token to be read comes directly from whatever is next read from the stream.
1106 It makes sense to call this function after encountering an error in a
1107 command entered on the console, because usually the user would prefer not to
1108 have cascading errors. */
1110 lex_interactive_reset (struct lexer *lexer)
1112 struct lex_source *src = lex_source__ (lexer);
1113 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1115 src->head = src->tail = 0;
1116 src->journal_pos = src->seg_pos = src->line_pos = 0;
1117 src->n_newlines = 0;
1118 src->suppress_next_newline = false;
1119 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1120 while (!deque_is_empty (&src->deque))
1121 lex_source_pop__ (src);
1122 lex_source_push_endcmd__ (src);
1126 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1128 lex_discard_rest_of_command (struct lexer *lexer)
1130 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1134 /* Discards all lookahead tokens in LEXER, then discards all input sources
1135 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1136 runs out of input sources. */
1138 lex_discard_noninteractive (struct lexer *lexer)
1140 struct lex_source *src = lex_source__ (lexer);
1144 while (!deque_is_empty (&src->deque))
1145 lex_source_pop__ (src);
1147 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1148 src = lex_source__ (lexer))
1149 lex_source_destroy (src);
1154 lex_source_max_tail__ (const struct lex_source *src)
1156 const struct lex_token *token;
1159 assert (src->seg_pos >= src->line_pos);
1160 max_tail = MIN (src->journal_pos, src->line_pos);
1162 /* Use the oldest token also. (We know that src->deque cannot be empty
1163 because we are in the process of adding a new token, which is already
1164 initialized enough to use here.) */
1165 token = &src->tokens[deque_back (&src->deque, 0)];
1166 assert (token->token_pos >= token->line_pos);
1167 max_tail = MIN (max_tail, token->line_pos);
1173 lex_source_expand__ (struct lex_source *src)
1175 if (src->head - src->tail >= src->allocated)
1177 size_t max_tail = lex_source_max_tail__ (src);
1178 if (max_tail > src->tail)
1180 /* Advance the tail, freeing up room at the head. */
1181 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1182 src->head - max_tail);
1183 src->tail = max_tail;
1187 /* Buffer is completely full. Expand it. */
1188 src->buffer = x2realloc (src->buffer, &src->allocated);
1193 /* There's space available at the head of the buffer. Nothing to do. */
1198 lex_source_read__ (struct lex_source *src)
1202 lex_source_expand__ (src);
1204 size_t head_ofs = src->head - src->tail;
1205 size_t space = src->allocated - head_ofs;
1206 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1207 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1209 assert (n <= space);
1214 src->reader->eof = true;
1215 lex_source_expand__ (src);
1221 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1222 src->head - src->seg_pos));
1225 static struct lex_source *
1226 lex_source__ (const struct lexer *lexer)
1228 return (ll_is_empty (&lexer->sources) ? NULL
1229 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1232 static struct substring
1233 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1235 const struct lex_token *token0 = lex_source_next__ (src, n0);
1236 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1237 size_t start = token0->token_pos;
1238 size_t end = token1->token_pos + token1->token_len;
1240 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Copies a prefix of IN into OUT, a buffer of OUT_SIZE bytes, as a
   null-terminated string, and appends "..." if fewer than IN.length bytes
   were copied.  OUT_SIZE must be at least 16. */
1244 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1250 assert (out_size >= 16);
/* One byte of OUT is held back for the null terminator.  NOTE(review): when
   IN cannot fit whole, presumably 3 more bytes are reserved for the "..."
   suffix -- confirm. */
1251 out_maxlen = out_size - 1;
1252 if (in.length > out_maxlen - 3)
/* Walk IN one UTF-8 character at a time; MBLEN is the byte length of the
   character at OUT_LEN as reported by u8_mblen(). */
1255 for (out_len = 0; out_len < in.length; out_len += mblen)
/* Tests for a newline, a null byte, or a CR LF pair at the current
   position.  NOTE(review): presumably any of these ends the scan. */
1257 if (in.string[out_len] == '\n'
1258 || in.string[out_len] == '\0'
1259 || (in.string[out_len] == '\r'
1260 && out_len + 1 < in.length
1261 && in.string[out_len + 1] == '\n'))
1264 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1265 in.length - out_len);
/* Tests whether the next character would exceed the space available. */
1270 if (out_len + mblen > out_maxlen)
/* Emit the first OUT_LEN bytes, then either "..." or just the terminator. */
1274 memcpy (out, in.string, out_len);
1275 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Composes a syntax error message about tokens N0 through N1 of SRC.  The
   text begins with one of three "Syntax error..." prefixes chosen below,
   and FORMAT, formatted with ARGS, supplies the detail.
   NOTE(review): the conditions guarding the ": " plus FORMAT step and the
   final '.' are not visible in this block -- confirm before relying on
   them. */
1279 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1280 const char *format, va_list args)
1282 const struct lex_token *token;
/* Choose the prefix from token N0: an end-of-command token gets a fixed
   phrase, otherwise the offending syntax is quoted when it is nonempty. */
1287 token = lex_source_next__ (src, n0);
1288 if (token->token.type == T_ENDCMD)
1289 ds_put_cstr (&s, _("Syntax error at end of command"));
1292 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1293 if (!ss_is_empty (syntax))
/* The quoted syntax is shortened by lex_ellipsize__() to fit this 64-byte
   buffer. */
1295 char syntax_cstr[64];
1297 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1298 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1301 ds_put_cstr (&s, _("Syntax error"));
1306 ds_put_cstr (&s, ": ");
1307 ds_put_vformat (&s, format, args);
1309 ds_put_byte (&s, '.');
/* Message fields: syntax category, error severity, the reader's file name,
   the line and column range covered by tokens N0..N1, and the text taken
   from S with ds_steal_cstr(). */
1312 .category = MSG_C_SYNTAX,
1313 .severity = MSG_S_ERROR,
1314 .file_name = src->reader->file_name,
1315 .first_line = lex_source_get_first_line_number (src, n0),
1316 .last_line = lex_source_get_last_line_number (src, n1),
1317 .first_column = lex_source_get_first_column (src, n0),
1318 .last_column = lex_source_get_last_column (src, n1),
1319 .text = ds_steal_cstr (&s),
/* Reports an error, formatted printf()-style from FORMAT and the arguments
   that follow it, against the token at index deque_count - 1 in SRC's
   deque, then removes a token with lex_source_pop_front().
   NOTE(review): that index is presumably the token most recently pushed --
   confirm against lex_source_next__(). */
1324 static void PRINTF_FORMAT (2, 3)
1325 lex_get_error (struct lex_source *src, const char *format, ...)
1330 va_start (args, format);
/* N is passed as both ends of the token range, so the message covers
   exactly one token. */
1332 n = deque_count (&src->deque) - 1;
1333 lex_source_error_valist (src, n, n, format, args);
1334 lex_source_pop_front (src);
1339 /* Attempts to append an additional token into SRC's deque, reading more from
1340 the underlying lex_reader if necessary. Returns true if successful, false
1341 if the deque already represents (a suffix of) the whole lex_reader's
/* Scans one more token from SRC.  A new lex_token is pushed, segments
   obtained from the segmenter are fed through a scanner, completed source
   lines are submitted as syntax text output items, and scanner error token
   types are turned into messages with lex_get_error().  SRC_ is declared
   const, but the source is modified through the cast below. */
1344 lex_source_get__ (const struct lex_source *src_)
1346 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1350 /* State maintained while scanning tokens. Usually we only need a single
1351 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1352 needs to be saved and possibly restored later with SCAN_BACK. */
1355 struct segmenter segmenter;
1356 enum segment_type last_segment;
1357 int newlines; /* Number of newlines encountered so far. */
1358 /* Maintained here so we can update lex_source's similar members when we
1364 /* Initialize state. */
/* STATE starts as a copy of SRC's segmenter and positions; SAVED is a
   second copy for the SCAN_SAVE / SCAN_BACK handling further down. */
1365 struct state state =
1367 .segmenter = src->segmenter,
1369 .seg_pos = src->seg_pos,
1370 .line_pos = src->line_pos,
1372 struct state saved = state;
1374 /* Append a new token to SRC and initialize it. */
1375 struct lex_token *token = lex_push_token__ (src);
1376 struct scanner scanner;
1377 scanner_init (&scanner, &token->token);
1378 token->line_pos = src->line_pos;
1379 token->token_pos = src->seg_pos;
/* With a positive reader line number, the token's line is that number plus
   the newlines consumed so far.  NOTE(review): the assignment of 0 is
   presumably the alternative branch -- confirm. */
1380 if (src->reader->line_number > 0)
1381 token->first_line = src->reader->line_number + src->n_newlines;
1383 token->first_line = 0;
1385 /* Extract segments and pass them through the scanner until we obtain a
1389 /* Extract a segment. */
/* SEGMENT is derived from SRC->buffer here; lex_source_read__() below can
   move or reallocate that buffer through lex_source_expand__(). */
1390 const char *segment = &src->buffer[state.seg_pos - src->tail];
1391 size_t seg_maxlen = src->head - state.seg_pos;
1392 enum segment_type type;
1393 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1394 src->reader->eof, &type);
1397 /* The segmenter needs more input to produce a segment. */
1398 assert (!src->reader->eof);
1399 lex_source_read__ (src);
1403 /* Update state based on the segment. */
1404 state.last_segment = type;
1405 state.seg_pos += seg_len;
/* After a newline segment, the next line begins at the updated segment
   position. */
1406 if (type == SEG_NEWLINE)
1409 state.line_pos = state.seg_pos;
1412 /* Pass the segment into the scanner and try to get a token out. */
1413 enum scan_result result = scanner_push (&scanner, type,
1414 ss_buffer (segment, seg_len),
1416 if (result == SCAN_SAVE)
1418 else if (result == SCAN_BACK)
1423 else if (result == SCAN_DONE)
1427 /* If we've reached the end of a line, or the end of a command, then pass
1428 the line to the output engine as a syntax text item. */
/* NOTE(review): SRC->suppress_next_newline is set after an end-of-command
   segment and cleared once newlines are seen; presumably N_LINES is
   adjusted alongside so that a line is not submitted twice -- confirm. */
1429 int n_lines = state.newlines;
1430 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1433 src->suppress_next_newline = true;
1435 else if (n_lines > 0 && src->suppress_next_newline)
1438 src->suppress_next_newline = false;
/* Each iteration handles one line starting at SRC->journal_pos and then
   advances SRC->journal_pos past it. */
1440 for (int i = 0; i < n_lines; i++)
1442 /* Beginning of line. */
1443 const char *line = &src->buffer[src->journal_pos - src->tail];
1445 /* Calculate line length, including \n or \r\n end-of-line if present.
1447 We use src->head even though that may be beyond what we've actually
1448 converted to tokens (which is only through state.line_pos). That's
1449 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1450 take the whole line through the newline, not just through the '.'. */
1451 size_t max_len = src->head - src->journal_pos;
1452 const char *newline = memchr (line, '\n', max_len);
1453 size_t line_len = newline ? newline - line + 1 : max_len;
1455 /* Calculate line length excluding end-of-line. */
1456 size_t copy_len = line_len;
1457 if (copy_len > 0 && line[copy_len - 1] == '\n')
1459 if (copy_len > 0 && line[copy_len - 1] == '\r')
1462 /* Submit the line as syntax. */
1463 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1464 xmemdup0 (line, copy_len),
1467 src->journal_pos += line_len;
/* The token's source text runs from the old SRC->seg_pos to the position
   reached by scanning. */
1470 token->token_len = state.seg_pos - src->seg_pos;
/* Commit the working state back into SRC. */
1472 src->segmenter = state.segmenter;
1473 src->seg_pos = state.seg_pos;
1474 src->line_pos = state.line_pos;
1475 src->n_newlines += state.newlines;
/* Post-process the token by type.  The SCAN_* error types are reported with
   lex_get_error(), which also calls lex_source_pop_front(). */
1477 switch (token->token.type)
1483 token->token.type = T_ENDCMD;
1487 case SCAN_BAD_HEX_LENGTH:
1488 lex_get_error (src, _("String of hex digits has %d characters, which "
1489 "is not a multiple of 2"),
1490 (int) token->token.number);
1493 case SCAN_BAD_HEX_DIGIT:
1494 case SCAN_BAD_UNICODE_DIGIT:
1495 lex_get_error (src, _("`%c' is not a valid hex digit"),
1496 (int) token->token.number);
1499 case SCAN_BAD_UNICODE_LENGTH:
1500 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1501 "not in the valid range of 1 to 8 bytes"),
1502 (int) token->token.number);
1505 case SCAN_BAD_UNICODE_CODE_POINT:
1506 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1507 (int) token->token.number);
1510 case SCAN_EXPECTED_QUOTE:
1511 lex_get_error (src, _("Unterminated string constant"));
1514 case SCAN_EXPECTED_EXPONENT:
1515 lex_get_error (src, _("Missing exponent following `%s'"),
1516 token->token.string.string);
1519 case SCAN_UNEXPECTED_DOT:
1520 lex_get_error (src, _("Unexpected `.' in middle of command"));
1523 case SCAN_UNEXPECTED_CHAR:
1526 lex_get_error (src, _("Bad character %s in input"),
1527 uc_name (token->token.number, c_name));
1532 lex_source_pop_front (src);
/* Pushes a new token onto SRC and makes it a T_ENDCMD token whose position,
   length, line position, and line number are all 0. */
1540 lex_source_push_endcmd__ (struct lex_source *src)
1542 struct lex_token *token = lex_push_token__ (src);
1543 token->token.type = T_ENDCMD;
1544 token->token_pos = 0;
1545 token->token_len = 0;
1546 token->line_pos = 0;
1547 token->first_line = 0;
/* Allocates a new lex_source that reads from READER.  Its segmenter is
   initialized in the mode that matches READER->syntax, its token deque is
   created, and an initial T_ENDCMD token is pushed with
   lex_source_push_endcmd__(). */
1550 static struct lex_source *
1551 lex_source_create (struct lex_reader *reader)
1553 struct lex_source *src;
1554 enum segmenter_mode mode;
/* xzalloc() returns zero-filled memory, so members not assigned below start
   out as 0. */
1556 src = xzalloc (sizeof *src);
1557 src->reader = reader;
/* Map the reader's syntax mode onto the corresponding segmenter mode. */
1559 if (reader->syntax == LEX_SYNTAX_AUTO)
1560 mode = SEG_MODE_AUTO;
1561 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1562 mode = SEG_MODE_INTERACTIVE;
1563 else if (reader->syntax == LEX_SYNTAX_BATCH)
1564 mode = SEG_MODE_BATCH;
1567 segmenter_init (&src->segmenter, mode);
/* NOTE(review): the 4 is presumably the deque's initial capacity -- confirm
   against deque_init(). */
1569 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1571 lex_source_push_endcmd__ (src);
/* Tears down SRC: invokes the reader class's destroy function if there is
   one, pops every token left in the deque, and unlinks SRC from the list it
   is on. */
1577 lex_source_destroy (struct lex_source *src)
/* The two string pointers are copied out before the reader is destroyed.
   NOTE(review): presumably so they can still be freed afterward -- confirm. */
1579 char *file_name = src->reader->file_name;
1580 char *encoding = src->reader->encoding;
1581 if (src->reader->class->destroy != NULL)
1582 src->reader->class->destroy (src->reader);
1586 while (!deque_is_empty (&src->deque))
1587 lex_source_pop__ (src);
1589 ll_remove (&src->ll);
/* A lex_reader that takes its input from a u8_istream.  The generic reader
   is embedded as member `reader', which is what lets lex_file_reader_cast()
   recover this structure from a struct lex_reader pointer. */
1593 struct lex_file_reader
1595 struct lex_reader reader;
1596 struct u8_istream *istream;
/* Forward declaration of the class passed to lex_reader_init() in
   lex_reader_for_file(); its definition follows the file reader
   functions. */
1599 static struct lex_reader_class lex_file_reader_class;
1601 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1602 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1603 ENCODING, which should take one of the forms accepted by
1604 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1605 mode of the new reader, respectively.
1607 Returns a null pointer if FILE_NAME cannot be opened. */
1609 lex_reader_for_file (const char *file_name, const char *encoding,
1610 enum lex_syntax_mode syntax,
1611 enum lex_error_mode error)
1613 struct lex_file_reader *r;
1614 struct u8_istream *istream;
/* "-" selects the standard input descriptor; any other name is opened
   read-only as a file. */
1616 istream = (!strcmp(file_name, "-")
1617 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1618 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1619 if (istream == NULL)
1621 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
/* The reader keeps its own copies of FILE_NAME and (when nonnull) ENCODING,
   and starts counting lines at 1. */
1625 r = xmalloc (sizeof *r);
1626 lex_reader_init (&r->reader, &lex_file_reader_class);
1627 r->reader.syntax = syntax;
1628 r->reader.error = error;
1629 r->reader.file_name = xstrdup (file_name);
1630 r->reader.encoding = xstrdup_if_nonnull (encoding);
1631 r->reader.line_number = 1;
1632 r->istream = istream;
1637 static struct lex_file_reader *
1638 lex_file_reader_cast (struct lex_reader *r)
1640 return UP_CAST (r, struct lex_file_reader, reader);
/* Read function for file readers: asks u8_istream_read() for up to N bytes
   from R_'s stream into BUF.  PROMPT_STYLE is not used.
   NOTE(review): the error message below is presumably issued only when the
   read reports failure -- confirm. */
1644 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1645 enum prompt_style prompt_style UNUSED)
1647 struct lex_file_reader *r = lex_file_reader_cast (r_);
1648 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1651 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* Close function for file readers.  A stream that is not on the standard
   input descriptor is closed with u8_istream_close(), and a failure there
   is reported with the file name.
   NOTE(review): u8_istream_free() is presumably the alternative taken for
   standard input, leaving the descriptor itself open -- confirm. */
1658 lex_file_close (struct lex_reader *r_)
1660 struct lex_file_reader *r = lex_file_reader_cast (r_);
1662 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1664 if (u8_istream_close (r->istream) != 0)
1665 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1668 u8_istream_free (r->istream);
/* The reader class that lex_reader_for_file() installs in the readers it
   creates. */
1673 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that takes its input from a string held in memory.  The
   generic reader is embedded as member `reader' so that
   lex_string_reader_cast() can recover this structure from a struct
   lex_reader pointer.  lex_string_read() accesses the string and the
   current read position as R->s and R->offset. */
1679 struct lex_string_reader
1681 struct lex_reader reader;
/* Forward declaration of the class passed to lex_reader_init() in
   lex_reader_for_substring_nocopy(). */
1686 static struct lex_reader_class lex_string_reader_class;
1688 /* Creates and returns a new lex_reader for the contents of S, which must be
1689 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1690 with ss_dealloc() when it is closed. */
1692 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1694 struct lex_string_reader *r;
/* String readers always use LEX_SYNTAX_AUTO, and keep their own copy of
   ENCODING when it is nonnull. */
1696 r = xmalloc (sizeof *r);
1697 lex_reader_init (&r->reader, &lex_string_reader_class);
1698 r->reader.syntax = LEX_SYNTAX_AUTO;
1699 r->reader.encoding = xstrdup_if_nonnull (encoding);
1706 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1707 which must be encoded in ENCODING. The caller retains ownership of S. */
1709 lex_reader_for_string (const char *s, const char *encoding)
/* SS receives a newly allocated copy of S; that copy, not S itself, is what
   is handed over to the new reader. */
1711 struct substring ss;
1712 ss_alloc_substring (&ss, ss_cstr (s));
1713 return lex_reader_for_substring_nocopy (ss, encoding);
1716 /* Formats FORMAT as a printf()-like format string and creates and returns a
1717 new lex_reader for the formatted result. */
1719 lex_reader_for_format (const char *format, const char *encoding, ...)
1721 struct lex_reader *r;
/* The variable arguments follow ENCODING rather than FORMAT, so ENCODING is
   the anchor for va_start().  The string produced by xvasprintf() is passed
   to lex_reader_for_substring_nocopy(), whose comment says the reader takes
   ownership of its argument. */
1724 va_start (args, encoding);
1725 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
1731 static struct lex_string_reader *
1732 lex_string_reader_cast (struct lex_reader *r)
1734 return UP_CAST (r, struct lex_string_reader, reader);
/* Read function for string readers: copies into BUF the next bytes of the
   reader's string, starting at R->offset.  PROMPT_STYLE is not used. */
1738 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1739 enum prompt_style prompt_style UNUSED)
1741 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* CHUNK is the smaller of N and the number of bytes remaining after
   R->offset.  NOTE(review): R->offset is presumably advanced by CHUNK
   afterward and CHUNK returned -- confirm. */
1744 chunk = MIN (n, r->s.length - r->offset);
1745 memcpy (buf, r->s.string + r->offset, chunk);
/* Close function for string readers.  R is recovered from the generic
   reader pointer R_.  According to the comment on
   lex_reader_for_substring_nocopy(), closing is when the reader's string is
   freed with ss_dealloc(). */
1752 lex_string_close (struct lex_reader *r_)
1754 struct lex_string_reader *r = lex_string_reader_cast (r_);
1760 static struct lex_reader_class lex_string_reader_class =