1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/text-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
61 /* The regular token information. */
64 /* Location of token in terms of the lex_source's buffer.
65 src->tail <= line_pos <= token_pos <= src->head. */
66 size_t token_pos; /* Start of token. */
67 size_t token_len; /* Length of source for token in bytes. */
68 size_t line_pos; /* Start of line containing token_pos. */
69 int first_line; /* Line number at token_pos. */
72 /* A source of tokens, corresponding to a syntax file.
74 This is conceptually a lex_reader wrapped with everything needed to convert
75 its UTF-8 bytes into tokens. */
78 struct ll ll; /* In lexer's list of sources. */
79 struct lex_reader *reader;
80 struct segmenter segmenter;
81 bool eof; /* True if T_STOP was read from 'reader'. */
83 /* Buffer of UTF-8 bytes. */
85 size_t allocated; /* Number of bytes allocated. */
86 size_t tail; /* &buffer[0] offset into UTF-8 source. */
87 size_t head; /* &buffer[head - tail] offset into source. */
89 /* Positions in source file, tail <= pos <= head for each member here. */
90 size_t journal_pos; /* First byte not yet output to journal. */
91 size_t seg_pos; /* First byte not yet scanned as token. */
92 size_t line_pos; /* First byte of line containing seg_pos. */
94 int n_newlines; /* Number of new-lines up to seg_pos. */
95 bool suppress_next_newline;
98 struct deque deque; /* Indexes into 'tokens'. */
99 struct lex_token *tokens; /* Lookahead tokens for parser. */
102 static struct lex_source *lex_source_create (struct lex_reader *);
103 static void lex_source_destroy (struct lex_source *);
108 struct ll_list sources; /* Contains "struct lex_source"s. */
111 static struct lex_source *lex_source__ (const struct lexer *);
112 static const struct lex_token *lex_next__ (const struct lexer *, int n);
113 static void lex_source_push_endcmd__ (struct lex_source *);
115 static void lex_source_pop__ (struct lex_source *);
116 static bool lex_source_get__ (const struct lex_source *);
117 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
118 const char *format, va_list)
119 PRINTF_FORMAT (4, 0);
120 static const struct lex_token *lex_source_next__ (const struct lex_source *,
123 /* Initializes READER with the specified CLASS and otherwise some reasonable
124 defaults. The caller should fill in the others members as desired. */
126 lex_reader_init (struct lex_reader *reader,
127 const struct lex_reader_class *class)
129 reader->class = class;
130 reader->syntax = LEX_SYNTAX_AUTO;
131 reader->error = LEX_ERROR_CONTINUE;
132 reader->file_name = NULL;
133 reader->encoding = NULL;
134 reader->line_number = 0;
137 /* Frees any file name already in READER and replaces it by a copy of
138 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
142 free (reader->file_name);
143 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
146 /* Creates and returns a new lexer. */
150 struct lexer *lexer = xzalloc (sizeof *lexer);
151 ll_init (&lexer->sources);
155 /* Destroys LEXER. */
157 lex_destroy (struct lexer *lexer)
161 struct lex_source *source, *next;
163 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
164 lex_source_destroy (source);
169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
170 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
173 lex_include (struct lexer *lexer, struct lex_reader *reader)
175 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
176 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
179 /* Appends READER to LEXER, so that it will be read after all other current
180 readers have already been read. */
182 lex_append (struct lexer *lexer, struct lex_reader *reader)
184 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
189 static struct lex_token *
190 lex_push_token__ (struct lex_source *src)
192 struct lex_token *token;
194 if (deque_is_full (&src->deque))
195 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
197 token = &src->tokens[deque_push_front (&src->deque)];
198 token_init (&token->token);
203 lex_source_pop__ (struct lex_source *src)
205 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
209 lex_source_pop_front (struct lex_source *src)
211 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
214 /* Advances LEXER to the next token, consuming the current token. */
216 lex_get (struct lexer *lexer)
218 struct lex_source *src;
220 src = lex_source__ (lexer);
224 if (!deque_is_empty (&src->deque))
225 lex_source_pop__ (src);
227 while (deque_is_empty (&src->deque))
228 if (!lex_source_get__ (src))
230 lex_source_destroy (src);
231 src = lex_source__ (lexer);
237 /* Issuing errors. */
239 /* Prints a syntax error message containing the current token and
240 given message MESSAGE (if non-null). */
242 lex_error (struct lexer *lexer, const char *format, ...)
246 va_start (args, format);
247 lex_next_error_valist (lexer, 0, 0, format, args);
251 /* Prints a syntax error message containing the current token and
252 given message MESSAGE (if non-null). */
254 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
256 lex_next_error_valist (lexer, 0, 0, format, args);
259 /* Prints a syntax error message containing the tokens N0 through N1 ahead
260 of the current token and the message given by FORMAT (if non-null). */
262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
266 va_start (args, format);
267 lex_next_error_valist (lexer, n0, n1, format, args);
/* Prints a syntax error message saying that OPTION0 or one of the other
   strings following it, up to the first NULL, is expected.

   The argument list must be terminated by a null pointer.  At most
   MAX_OPTIONS alternatives are reported; any further ones are ignored.  If
   OPTION0 itself is null, a bare syntax error is reported. */
void
lex_error_expecting (struct lexer *lexer, const char *option0, ...)
{
  enum { MAX_OPTIONS = 8 };
  const char *options[MAX_OPTIONS + 1];
  va_list args;
  int n;

  va_start (args, option0);
  options[0] = option0;
  n = 0;
  /* Collect alternatives until the terminating NULL or until MAX_OPTIONS
     non-null strings have been seen.  OPTIONS has one extra slot so that the
     element following the last reported alternative can always be stored.
     On exit N is the number of non-null alternatives, 0...MAX_OPTIONS. */
  while (n < MAX_OPTIONS && options[n] != NULL)
    options[++n] = va_arg (args, const char *);
  va_end (args);

  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      break;

    case 1:
      lex_error (lexer, _("expecting %s"), options[0]);
      break;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
      break;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
                 options[2]);
      break;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3]);
      break;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4]);
      break;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5]);
      break;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6]);
      break;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6], options[7]);
      break;

    default:
      NOT_REACHED ();
    }
}
340 /* Reports an error to the effect that subcommand SBC may only be specified
343 This function does not take a lexer as an argument or use lex_error(),
344 because the result would ordinarily just be redundant: "Syntax error at
345 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
346 not help the user find the error. */
348 lex_sbc_only_once (const char *sbc)
350 msg (SE, _("Subcommand %s may only be specified once."), sbc);
353 /* Reports an error to the effect that subcommand SBC is missing.
355 This function does not take a lexer as an argument or use lex_error(),
356 because a missing subcommand can normally be detected only after the whole
357 command has been parsed, and so lex_error() would always report "Syntax
358 error at end of command", which does not help the user find the error. */
360 lex_sbc_missing (const char *sbc)
362 msg (SE, _("Required subcommand %s was not specified."), sbc);
365 /* Reports an error to the effect that specification SPEC may only be specified
366 once within subcommand SBC. */
368 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
370 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
374 /* Reports an error to the effect that specification SPEC is missing within
377 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
379 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
383 /* Prints a syntax error message containing the current token and
384 given message MESSAGE (if non-null). */
386 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
387 const char *format, va_list args)
389 struct lex_source *src = lex_source__ (lexer);
392 lex_source_error_valist (src, n0, n1, format, args);
398 ds_put_format (&s, _("Syntax error at end of input"));
401 ds_put_cstr (&s, ": ");
402 ds_put_vformat (&s, format, args);
404 ds_put_byte (&s, '.');
405 msg (SE, "%s", ds_cstr (&s));
410 /* Checks that we're at end of command.
411 If so, returns a successful command completion code.
412 If not, flags a syntax error and returns an error command
415 lex_end_of_command (struct lexer *lexer)
417 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
419 lex_error (lexer, _("expecting end of command"));
426 /* Token testing functions. */
428 /* Returns true if the current token is a number. */
430 lex_is_number (struct lexer *lexer)
432 return lex_next_is_number (lexer, 0);
435 /* Returns true if the current token is a string. */
437 lex_is_string (struct lexer *lexer)
439 return lex_next_is_string (lexer, 0);
442 /* Returns the value of the current token, which must be a
443 floating point number. */
445 lex_number (struct lexer *lexer)
447 return lex_next_number (lexer, 0);
450 /* Returns true iff the current token is an integer. */
452 lex_is_integer (struct lexer *lexer)
454 return lex_next_is_integer (lexer, 0);
457 /* Returns the value of the current token, which must be an
460 lex_integer (struct lexer *lexer)
462 return lex_next_integer (lexer, 0);
465 /* Token testing functions with lookahead.
467 A value of 0 for N as an argument to any of these functions refers to the
468 current token. Lookahead is limited to the current command. Any N greater
469 than the number of tokens remaining in the current command will be treated
470 as referring to a T_ENDCMD token. */
472 /* Returns true if the token N ahead of the current token is a number. */
474 lex_next_is_number (struct lexer *lexer, int n)
476 enum token_type next_token = lex_next_token (lexer, n);
477 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
480 /* Returns true if the token N ahead of the current token is a string. */
482 lex_next_is_string (struct lexer *lexer, int n)
484 return lex_next_token (lexer, n) == T_STRING;
487 /* Returns the value of the token N ahead of the current token, which must be a
488 floating point number. */
490 lex_next_number (struct lexer *lexer, int n)
492 assert (lex_next_is_number (lexer, n));
493 return lex_next_tokval (lexer, n);
/* Returns true if the token N ahead of the current token is a number whose
   value is integral and can be converted to a "long" without overflow,
   false otherwise. */
bool
lex_next_is_integer (struct lexer *lexer, int n)
{
  double value;

  if (!lex_next_is_number (lexer, n))
    return false;

  value = lex_next_tokval (lexer, n);

  /* The upper bound is tested as "< -(double) LONG_MIN" rather than
     "<= LONG_MAX": where "long" is wider than a double's mantissa, LONG_MAX
     rounds up to a power of 2 when converted to double, which would admit a
     value one greater than LONG_MAX.  -(double) LONG_MIN is that power of 2
     exactly, so the strict comparison rejects it. */
  return (value > LONG_MIN
          && value < -(double) LONG_MIN
          && floor (value) == value);
}
509 /* Returns the value of the token N ahead of the current token, which must be
512 lex_next_integer (struct lexer *lexer, int n)
514 assert (lex_next_is_integer (lexer, n));
515 return lex_next_tokval (lexer, n);
518 /* Token matching functions. */
520 /* If the current token has the specified TYPE, skips it and returns true.
521 Otherwise, returns false. */
523 lex_match (struct lexer *lexer, enum token_type type)
525 if (lex_token (lexer) == type)
534 /* If the current token matches IDENTIFIER, skips it and returns true.
535 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
538 IDENTIFIER must be an ASCII string. */
540 lex_match_id (struct lexer *lexer, const char *identifier)
542 return lex_match_id_n (lexer, identifier, 3);
545 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
546 may be abbreviated to its first N letters. Otherwise, returns false.
548 IDENTIFIER must be an ASCII string. */
550 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
552 if (lex_token (lexer) == T_ID
553 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
562 /* If the current token is integer X, skips it and returns true. Otherwise,
565 lex_match_int (struct lexer *lexer, int x)
567 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
576 /* Forced matches. */
578 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
579 abbreviated to its first 3 letters. Otherwise, reports an error and returns
582 IDENTIFIER must be an ASCII string. */
584 lex_force_match_id (struct lexer *lexer, const char *identifier)
586 if (lex_match_id (lexer, identifier))
590 lex_error_expecting (lexer, identifier, NULL_SENTINEL);
595 /* If the current token has the specified TYPE, skips it and returns true.
596 Otherwise, reports an error and returns false. */
598 lex_force_match (struct lexer *lexer, enum token_type type)
600 if (lex_token (lexer) == type)
607 const char *type_string = token_type_to_string (type);
610 char *s = xasprintf ("`%s'", type_string);
611 lex_error_expecting (lexer, s, NULL_SENTINEL);
615 lex_error_expecting (lexer, token_type_to_name (type), NULL_SENTINEL);
621 /* If the current token is a string, does nothing and returns true.
622 Otherwise, reports an error and returns false. */
624 lex_force_string (struct lexer *lexer)
626 if (lex_is_string (lexer))
630 lex_error (lexer, _("expecting string"));
635 /* If the current token is a string or an identifier, does nothing and returns
636 true. Otherwise, reports an error and returns false.
638 This is meant for use in syntactic situations where we want to encourage the
639 user to supply a quoted string, but for compatibility we also accept
640 identifiers. (One example of such a situation is file names.) Therefore,
641 the error message issued when the current token is wrong only says that a
642 string is expected and doesn't mention that an identifier would also be
645 lex_force_string_or_id (struct lexer *lexer)
647 return lex_token (lexer) == T_ID || lex_force_string (lexer);
650 /* If the current token is an integer, does nothing and returns true.
651 Otherwise, reports an error and returns false. */
653 lex_force_int (struct lexer *lexer)
655 if (lex_is_integer (lexer))
659 lex_error (lexer, _("expecting integer"));
664 /* If the current token is a number, does nothing and returns true.
665 Otherwise, reports an error and returns false. */
667 lex_force_num (struct lexer *lexer)
669 if (lex_is_number (lexer))
672 lex_error (lexer, _("expecting number"));
676 /* If the current token is an identifier, does nothing and returns true.
677 Otherwise, reports an error and returns false. */
679 lex_force_id (struct lexer *lexer)
681 if (lex_token (lexer) == T_ID)
684 lex_error (lexer, _("expecting identifier"));
688 /* Token accessors. */
690 /* Returns the type of LEXER's current token. */
692 lex_token (const struct lexer *lexer)
694 return lex_next_token (lexer, 0);
697 /* Returns the number in LEXER's current token.
699 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
700 tokens this function will always return zero. */
702 lex_tokval (const struct lexer *lexer)
704 return lex_next_tokval (lexer, 0);
707 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
709 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
710 this function will always return NULL.
712 The UTF-8 encoding of the returned string is correct for variable names and
713 other identifiers. Use filename_to_utf8() to use it as a filename. Use
714 data_in() to use it in a "union value". */
716 lex_tokcstr (const struct lexer *lexer)
718 return lex_next_tokcstr (lexer, 0);
721 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
722 null-terminated (but the null terminator is not included in the returned
723 substring's 'length').
725 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
726 this function will always return NULL.
728 The UTF-8 encoding of the returned string is correct for variable names and
729 other identifiers. Use filename_to_utf8() to use it as a filename. Use
730 data_in() to use it in a "union value". */
732 lex_tokss (const struct lexer *lexer)
734 return lex_next_tokss (lexer, 0);
739 A value of 0 for N as an argument to any of these functions refers to the
740 current token. Lookahead is limited to the current command. Any N greater
741 than the number of tokens remaining in the current command will be treated
742 as referring to a T_ENDCMD token. */
744 static const struct lex_token *
745 lex_next__ (const struct lexer *lexer_, int n)
747 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
748 struct lex_source *src = lex_source__ (lexer);
751 return lex_source_next__ (src, n);
754 static const struct lex_token stop_token =
755 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
761 static const struct lex_token *
762 lex_source_next__ (const struct lex_source *src, int n)
764 while (deque_count (&src->deque) <= n)
766 if (!deque_is_empty (&src->deque))
768 struct lex_token *front;
770 front = &src->tokens[deque_front (&src->deque, 0)];
771 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
775 lex_source_get__ (src);
778 return &src->tokens[deque_back (&src->deque, n)];
781 /* Returns the "struct token" of the token N after the current one in LEXER.
782 The returned pointer can be invalidated by pretty much any succeeding call
783 into the lexer, although the string pointer within the returned token is
784 only invalidated by consuming the token (e.g. with lex_get()). */
786 lex_next (const struct lexer *lexer, int n)
788 return &lex_next__ (lexer, n)->token;
791 /* Returns the type of the token N after the current one in LEXER. */
793 lex_next_token (const struct lexer *lexer, int n)
795 return lex_next (lexer, n)->type;
798 /* Returns the number in the token N after the current one in LEXER.
800 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
801 tokens this function will always return zero. */
803 lex_next_tokval (const struct lexer *lexer, int n)
805 const struct token *token = lex_next (lexer, n);
806 return token->number;
809 /* Returns the null-terminated string in the token N after the current one, in
812 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
813 this function will always return NULL.
815 The UTF-8 encoding of the returned string is correct for variable names and
816 other identifiers. Use filename_to_utf8() to use it as a filename. Use
817 data_in() to use it in a "union value". */
819 lex_next_tokcstr (const struct lexer *lexer, int n)
821 return lex_next_tokss (lexer, n).string;
824 /* Returns the string in the token N after the current one, in UTF-8 encoding.
825 The string is null-terminated (but the null terminator is not included in
826 the returned substring's 'length').
828 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
829 this function will always return NULL.
831 The UTF-8 encoding of the returned string is correct for variable names and
832 other identifiers. Use filename_to_utf8() to use it as a filename. Use
833 data_in() to use it in a "union value". */
835 lex_next_tokss (const struct lexer *lexer, int n)
837 return lex_next (lexer, n)->string;
841 lex_tokens_match (const struct token *actual, const struct token *expected)
843 if (actual->type != expected->type)
846 switch (actual->type)
850 return actual->number == expected->number;
853 return lex_id_match (expected->string, actual->string);
856 return (actual->string.length == expected->string.length
857 && !memcmp (actual->string.string, expected->string.string,
858 actual->string.length));
865 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
866 skips it and returns true. Otherwise, returns false.
868 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
869 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
870 first three letters. */
872 lex_match_phrase (struct lexer *lexer, const char *s)
874 struct string_lexer slex;
879 string_lexer_init (&slex, s, SEG_MODE_INTERACTIVE);
880 while (string_lexer_next (&slex, &token))
881 if (token.type != SCAN_SKIP)
883 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
884 token_destroy (&token);
895 lex_source_get_first_line_number (const struct lex_source *src, int n)
897 return lex_source_next__ (src, n)->first_line;
/* Returns the number of new-line characters among the LENGTH bytes starting
   at S.  The bytes are only read, never modified. */
static int
count_newlines (const char *s, size_t length)
{
  int n_newlines = 0;
  const char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      n_newlines++;

      /* Resume the search just past the new-line that was found. */
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
917 lex_source_get_last_line_number (const struct lex_source *src, int n)
919 const struct lex_token *token = lex_source_next__ (src, n);
921 if (token->first_line == 0)
925 char *token_str = &src->buffer[token->token_pos - src->tail];
926 return token->first_line + count_newlines (token_str, token->token_len) + 1;
931 count_columns (const char *s_, size_t length)
933 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
939 for (ofs = 0; ofs < length; ofs += mblen)
943 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
946 int width = uc_width (uc, "UTF-8");
951 columns = ROUND_UP (columns + 1, 8);
958 lex_source_get_first_column (const struct lex_source *src, int n)
960 const struct lex_token *token = lex_source_next__ (src, n);
961 return count_columns (&src->buffer[token->line_pos - src->tail],
962 token->token_pos - token->line_pos);
966 lex_source_get_last_column (const struct lex_source *src, int n)
968 const struct lex_token *token = lex_source_next__ (src, n);
969 char *start, *end, *newline;
971 start = &src->buffer[token->line_pos - src->tail];
972 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
973 newline = memrchr (start, '\n', end - start);
976 return count_columns (start, end - start);
979 /* Returns the 1-based line number of the start of the syntax that represents
980 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
981 if the token is drawn from a source that does not have line numbers. */
983 lex_get_first_line_number (const struct lexer *lexer, int n)
985 const struct lex_source *src = lex_source__ (lexer);
986 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
989 /* Returns the 1-based line number of the end of the syntax that represents the
990 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
991 token or if the token is drawn from a source that does not have line
994 Most of the time, a single token is wholly within a single line of syntax,
995 but there are two exceptions: a T_STRING token can be made up of multiple
996 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
997 token can consist of a "-" on one line followed by the number on the next.
1000 lex_get_last_line_number (const struct lexer *lexer, int n)
1002 const struct lex_source *src = lex_source__ (lexer);
1003 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1006 /* Returns the 1-based column number of the start of the syntax that represents
1007 the token N after the current one in LEXER. Returns 0 for a T_STOP
1010 Column numbers are measured according to the width of characters as shown in
1011 a typical fixed-width font, in which CJK characters have width 2 and
1012 combining characters have width 0. */
1014 lex_get_first_column (const struct lexer *lexer, int n)
1016 const struct lex_source *src = lex_source__ (lexer);
1017 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1020 /* Returns the 1-based column number of the end of the syntax that represents
1021 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1024 Column numbers are measured according to the width of characters as shown in
1025 a typical fixed-width font, in which CJK characters have width 2 and
1026 combining characters have width 0. */
1028 lex_get_last_column (const struct lexer *lexer, int n)
1030 const struct lex_source *src = lex_source__ (lexer);
1031 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1034 /* Returns the name of the syntax file from which the current command is drawn.
1035 Returns NULL for a T_STOP token or if the command's source does not have
1038 There is no version of this function that takes an N argument because
1039 lookahead only works to the end of a command and any given command is always
1040 within a single syntax file. */
1042 lex_get_file_name (const struct lexer *lexer)
1044 struct lex_source *src = lex_source__ (lexer);
1045 return src == NULL ? NULL : src->reader->file_name;
1049 lex_get_encoding (const struct lexer *lexer)
1051 struct lex_source *src = lex_source__ (lexer);
1052 return src == NULL ? NULL : src->reader->encoding;
1056 /* Returns the syntax mode for the syntax file from which the current command is
1057 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1058 source does not have line numbers.
1060 There is no version of this function that takes an N argument because
1061 lookahead only works to the end of a command and any given command is always
1062 within a single syntax file. */
1063 enum lex_syntax_mode
1064 lex_get_syntax_mode (const struct lexer *lexer)
1066 struct lex_source *src = lex_source__ (lexer);
1067 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1070 /* Returns the error mode for the syntax file from which the current command is
1071 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1072 source does not have line numbers.
1074 There is no version of this function that takes an N argument because
1075 lookahead only works to the end of a command and any given command is always
1076 within a single syntax file. */
1078 lex_get_error_mode (const struct lexer *lexer)
1080 struct lex_source *src = lex_source__ (lexer);
1081 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1084 /* If the source that LEXER is currently reading has error mode
1085 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1086 token to be read comes directly from whatever is next read from the stream.
1088 It makes sense to call this function after encountering an error in a
1089 command entered on the console, because usually the user would prefer not to
1090 have cascading errors. */
1092 lex_interactive_reset (struct lexer *lexer)
1094 struct lex_source *src = lex_source__ (lexer);
1095 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1097 src->head = src->tail = 0;
1098 src->journal_pos = src->seg_pos = src->line_pos = 0;
1099 src->n_newlines = 0;
1100 src->suppress_next_newline = false;
1101 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1102 while (!deque_is_empty (&src->deque))
1103 lex_source_pop__ (src);
1104 lex_source_push_endcmd__ (src);
1108 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1110 lex_discard_rest_of_command (struct lexer *lexer)
1112 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1116 /* Discards all lookahead tokens in LEXER, then discards all input sources
1117 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1118 runs out of input sources. */
1120 lex_discard_noninteractive (struct lexer *lexer)
1122 struct lex_source *src = lex_source__ (lexer);
1126 while (!deque_is_empty (&src->deque))
1127 lex_source_pop__ (src);
1129 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1130 src = lex_source__ (lexer))
1131 lex_source_destroy (src);
1136 lex_source_max_tail__ (const struct lex_source *src)
1138 const struct lex_token *token;
1141 assert (src->seg_pos >= src->line_pos);
1142 max_tail = MIN (src->journal_pos, src->line_pos);
1144 /* Use the oldest token also. (We know that src->deque cannot be empty
1145 because we are in the process of adding a new token, which is already
1146 initialized enough to use here.) */
1147 token = &src->tokens[deque_back (&src->deque, 0)];
1148 assert (token->token_pos >= token->line_pos);
1149 max_tail = MIN (max_tail, token->line_pos);
1155 lex_source_expand__ (struct lex_source *src)
1157 if (src->head - src->tail >= src->allocated)
1159 size_t max_tail = lex_source_max_tail__ (src);
1160 if (max_tail > src->tail)
1162 /* Advance the tail, freeing up room at the head. */
1163 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1164 src->head - max_tail);
1165 src->tail = max_tail;
1169 /* Buffer is completely full. Expand it. */
1170 src->buffer = x2realloc (src->buffer, &src->allocated);
1175 /* There's space available at the head of the buffer. Nothing to do. */
1180 lex_source_read__ (struct lex_source *src)
1184 lex_source_expand__ (src);
1186 size_t head_ofs = src->head - src->tail;
1187 size_t space = src->allocated - head_ofs;
1188 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1189 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1191 assert (n <= space);
1193 for (char *p = &src->buffer[head_ofs]; p < &src->buffer[head_ofs + n];
1198 m.category = MSG_C_SYNTAX;
1199 m.severity = MSG_S_ERROR;
1200 m.file_name = src->reader->file_name;
1205 m.text = xstrdup ("Bad character U+0000 in input.");
1215 Ensure that the input always ends in a new-line followed by a null
1216 byte, as required by the segmenter library. */
1218 if (src->head == src->tail
1219 || src->buffer[src->head - src->tail - 1] != '\n')
1220 src->buffer[src->head++ - src->tail] = '\n';
1222 lex_source_expand__ (src);
1223 src->buffer[src->head++ - src->tail] = '\0';
1230 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1231 src->head - src->seg_pos));
1234 static struct lex_source *
1235 lex_source__ (const struct lexer *lexer)
1237 return (ll_is_empty (&lexer->sources) ? NULL
1238 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1241 static struct substring
1242 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1244 const struct lex_token *token0 = lex_source_next__ (src, n0);
1245 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1246 size_t start = token0->token_pos;
1247 size_t end = token1->token_pos + token1->token_len;
1249 return ss_buffer (&src->buffer[start - src->tail], end - start);
1253 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1259 assert (out_size >= 16);
1260 out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1261 for (out_len = 0; out_len < in.length; out_len += mblen)
1263 if (in.string[out_len] == '\n'
1264 || (in.string[out_len] == '\r'
1265 && out_len + 1 < in.length
1266 && in.string[out_len + 1] == '\n'))
1269 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1270 in.length - out_len);
1271 if (out_len + mblen > out_maxlen)
1275 memcpy (out, in.string, out_len);
1276 strcpy (&out[out_len], out_len < in.length ? "..." : "");
1280 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1281 const char *format, va_list args)
1283 const struct lex_token *token;
1289 token = lex_source_next__ (src, n0);
1290 if (token->token.type == T_ENDCMD)
1291 ds_put_cstr (&s, _("Syntax error at end of command"));
1294 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1295 if (!ss_is_empty (syntax))
1297 char syntax_cstr[64];
1299 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1300 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1303 ds_put_cstr (&s, _("Syntax error"));
1308 ds_put_cstr (&s, ": ");
1309 ds_put_vformat (&s, format, args);
1311 ds_put_byte (&s, '.');
1313 m.category = MSG_C_SYNTAX;
1314 m.severity = MSG_S_ERROR;
1315 m.file_name = src->reader->file_name;
1316 m.first_line = lex_source_get_first_line_number (src, n0);
1317 m.last_line = lex_source_get_last_line_number (src, n1);
1318 m.first_column = lex_source_get_first_column (src, n0);
1319 m.last_column = lex_source_get_last_column (src, n1);
1320 m.text = ds_steal_cstr (&s);
1324 static void PRINTF_FORMAT (2, 3)
1325 lex_get_error (struct lex_source *src, const char *format, ...)
1330 va_start (args, format);
1332 n = deque_count (&src->deque) - 1;
1333 lex_source_error_valist (src, n, n, format, args);
1334 lex_source_pop_front (src);
1339 /* Attempts to append an additional token into SRC's deque, reading more from
1340 the underlying lex_reader if necessary.. Returns true if successful, false
1341 if the deque already represents (a suffix of) the whole lex_reader's
1344 lex_source_get__ (const struct lex_source *src_)
1346 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1350 /* State maintained while scanning tokens. Usually we only need a single
1351 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1352 needs to be saved and possibly restored later with SCAN_BACK. */
1355 struct segmenter segmenter;
1356 enum segment_type last_segment;
1357 int newlines; /* Number of newlines encountered so far. */
1358 /* Maintained here so we can update lex_source's similar members when we
1364 /* Initialize state. */
1365 struct state state =
1367 .segmenter = src->segmenter,
1369 .seg_pos = src->seg_pos,
1370 .line_pos = src->line_pos,
1372 struct state saved = state;
1374 /* Append a new token to SRC and initialize it. */
1375 struct lex_token *token = lex_push_token__ (src);
1376 struct scanner scanner;
1377 scanner_init (&scanner, &token->token);
1378 token->line_pos = src->line_pos;
1379 token->token_pos = src->seg_pos;
1380 if (src->reader->line_number > 0)
1381 token->first_line = src->reader->line_number + src->n_newlines;
1383 token->first_line = 0;
1385 /* Extract segments and pass them through the scanner until we obtain a
1389 /* Extract a segment. */
1390 const char *segment = &src->buffer[state.seg_pos - src->tail];
1391 size_t seg_maxlen = src->head - state.seg_pos;
1392 enum segment_type type;
1393 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1397 /* The segmenter needs more input to produce a segment. */
1398 lex_source_read__ (src);
1402 /* Update state based on the segment. */
1403 state.last_segment = type;
1404 state.seg_pos += seg_len;
1405 if (type == SEG_NEWLINE)
1408 state.line_pos = state.seg_pos;
1411 /* Pass the segment into the scanner and try to get a token out. */
1412 enum scan_result result = scanner_push (&scanner, type,
1413 ss_buffer (segment, seg_len),
1415 if (result == SCAN_SAVE)
1417 else if (result == SCAN_BACK)
1422 else if (result == SCAN_DONE)
1426 /* If we've reached the end of a line, or the end of a command, then pass
1427 the line to the output engine as a syntax text item. */
1428 int n_lines = state.newlines;
1429 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1432 src->suppress_next_newline = true;
1434 else if (n_lines > 0 && src->suppress_next_newline)
1437 src->suppress_next_newline = false;
1439 for (int i = 0; i < n_lines; i++)
1441 const char *line = &src->buffer[src->journal_pos - src->tail];
1442 const char *newline = rawmemchr (line, '\n');
1443 size_t line_len = newline - line;
1444 if (line_len > 0 && line[line_len - 1] == '\r')
1447 char *syntax = malloc (line_len + 2);
1448 memcpy (syntax, line, line_len);
1449 syntax[line_len] = '\n';
1450 syntax[line_len + 1] = '\0';
1452 text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1454 src->journal_pos += newline - line + 1;
1457 token->token_len = state.seg_pos - src->seg_pos;
1459 src->segmenter = state.segmenter;
1460 src->seg_pos = state.seg_pos;
1461 src->line_pos = state.line_pos;
1462 src->n_newlines += state.newlines;
1464 switch (token->token.type)
1470 token->token.type = T_ENDCMD;
1474 case SCAN_BAD_HEX_LENGTH:
1475 lex_get_error (src, _("String of hex digits has %d characters, which "
1476 "is not a multiple of 2"),
1477 (int) token->token.number);
1480 case SCAN_BAD_HEX_DIGIT:
1481 case SCAN_BAD_UNICODE_DIGIT:
1482 lex_get_error (src, _("`%c' is not a valid hex digit"),
1483 (int) token->token.number);
1486 case SCAN_BAD_UNICODE_LENGTH:
1487 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1488 "not in the valid range of 1 to 8 bytes"),
1489 (int) token->token.number);
1492 case SCAN_BAD_UNICODE_CODE_POINT:
1493 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1494 (int) token->token.number);
1497 case SCAN_EXPECTED_QUOTE:
1498 lex_get_error (src, _("Unterminated string constant"));
1501 case SCAN_EXPECTED_EXPONENT:
1502 lex_get_error (src, _("Missing exponent following `%s'"),
1503 token->token.string.string);
1506 case SCAN_UNEXPECTED_DOT:
1507 lex_get_error (src, _("Unexpected `.' in middle of command"));
1510 case SCAN_UNEXPECTED_CHAR:
1513 lex_get_error (src, _("Bad character %s in input"),
1514 uc_name (token->token.number, c_name));
1519 lex_source_pop_front (src);
1527 lex_source_push_endcmd__ (struct lex_source *src)
1529 struct lex_token *token = lex_push_token__ (src);
1530 token->token.type = T_ENDCMD;
1531 token->token_pos = 0;
1532 token->token_len = 0;
1533 token->line_pos = 0;
1534 token->first_line = 0;
1537 static struct lex_source *
1538 lex_source_create (struct lex_reader *reader)
1540 struct lex_source *src;
1541 enum segmenter_mode mode;
1543 src = xzalloc (sizeof *src);
1544 src->reader = reader;
1546 if (reader->syntax == LEX_SYNTAX_AUTO)
1547 mode = SEG_MODE_AUTO;
1548 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1549 mode = SEG_MODE_INTERACTIVE;
1550 else if (reader->syntax == LEX_SYNTAX_BATCH)
1551 mode = SEG_MODE_BATCH;
1554 segmenter_init (&src->segmenter, mode);
1556 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1558 lex_source_push_endcmd__ (src);
1564 lex_source_destroy (struct lex_source *src)
1566 char *file_name = src->reader->file_name;
1567 char *encoding = src->reader->encoding;
1568 if (src->reader->class->destroy != NULL)
1569 src->reader->class->destroy (src->reader);
1573 while (!deque_is_empty (&src->deque))
1574 lex_source_pop__ (src);
1576 ll_remove (&src->ll);
1580 struct lex_file_reader
1582 struct lex_reader reader;
1583 struct u8_istream *istream;
1586 static struct lex_reader_class lex_file_reader_class;
1588 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1589 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1590 ENCODING, which should take one of the forms accepted by
1591 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1592 mode of the new reader, respectively.
1594 Returns a null pointer if FILE_NAME cannot be opened. */
1596 lex_reader_for_file (const char *file_name, const char *encoding,
1597 enum lex_syntax_mode syntax,
1598 enum lex_error_mode error)
1600 struct lex_file_reader *r;
1601 struct u8_istream *istream;
1603 istream = (!strcmp(file_name, "-")
1604 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1605 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1606 if (istream == NULL)
1608 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1612 r = xmalloc (sizeof *r);
1613 lex_reader_init (&r->reader, &lex_file_reader_class);
1614 r->reader.syntax = syntax;
1615 r->reader.error = error;
1616 r->reader.file_name = xstrdup (file_name);
1617 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1618 r->reader.line_number = 1;
1619 r->istream = istream;
1624 static struct lex_file_reader *
1625 lex_file_reader_cast (struct lex_reader *r)
1627 return UP_CAST (r, struct lex_file_reader, reader);
1631 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1632 enum prompt_style prompt_style UNUSED)
1634 struct lex_file_reader *r = lex_file_reader_cast (r_);
1635 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1638 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1645 lex_file_close (struct lex_reader *r_)
1647 struct lex_file_reader *r = lex_file_reader_cast (r_);
1649 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1651 if (u8_istream_close (r->istream) != 0)
1652 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1655 u8_istream_free (r->istream);
1660 static struct lex_reader_class lex_file_reader_class =
1666 struct lex_string_reader
1668 struct lex_reader reader;
1673 static struct lex_reader_class lex_string_reader_class;
1675 /* Creates and returns a new lex_reader for the contents of S, which must be
1676 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1677 with ss_dealloc() when it is closed. */
1679 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1681 struct lex_string_reader *r;
1683 r = xmalloc (sizeof *r);
1684 lex_reader_init (&r->reader, &lex_string_reader_class);
1685 r->reader.syntax = LEX_SYNTAX_AUTO;
1686 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1693 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1694 which must be encoded in ENCODING. The caller retains ownership of S. */
1696 lex_reader_for_string (const char *s, const char *encoding)
1698 struct substring ss;
1699 ss_alloc_substring (&ss, ss_cstr (s));
1700 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string, using the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   result.  The reader takes ownership of the formatted text. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
1718 static struct lex_string_reader *
1719 lex_string_reader_cast (struct lex_reader *r)
1721 return UP_CAST (r, struct lex_string_reader, reader);
1725 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1726 enum prompt_style prompt_style UNUSED)
1728 struct lex_string_reader *r = lex_string_reader_cast (r_);
1731 chunk = MIN (n, r->s.length - r->offset);
1732 memcpy (buf, r->s.string + r->offset, chunk);
1739 lex_string_close (struct lex_reader *r_)
1741 struct lex_string_reader *r = lex_string_reader_cast (r_);
1747 static struct lex_reader_class lex_string_reader_class =