1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "data/file-name.h"
34 #include "language/command.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/text-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* Location of token in terms of the lex_source's buffer.
66 src->tail <= line_pos <= token_pos <= src->head. */
67 size_t token_pos; /* Start of token. */
68 size_t token_len; /* Length of source for token in bytes. */
69 size_t line_pos; /* Start of line containing token_pos. */
70 int first_line; /* Line number at token_pos. */
73 /* A source of tokens, corresponding to a syntax file.
75 This is conceptually a lex_reader wrapped with everything needed to convert
76 its UTF-8 bytes into tokens. */
79 struct ll ll; /* In lexer's list of sources. */
80 struct lex_reader *reader;
81 struct segmenter segmenter;
82 bool eof; /* True if T_STOP was read from 'reader'. */
84 /* Buffer of UTF-8 bytes. */
86 size_t allocated; /* Number of bytes allocated. */
87 size_t tail; /* &buffer[0] offset into UTF-8 source. */
88 size_t head; /* &buffer[head - tail] offset into source. */
90 /* Positions in source file, tail <= pos <= head for each member here. */
91 size_t journal_pos; /* First byte not yet output to journal. */
92 size_t seg_pos; /* First byte not yet scanned as token. */
93 size_t line_pos; /* First byte of line containing seg_pos. */
95 int n_newlines; /* Number of new-lines up to seg_pos. */
96 bool suppress_next_newline;
99 struct deque deque; /* Indexes into 'tokens'. */
100 struct lex_token *tokens; /* Lookahead tokens for parser. */
103 static struct lex_source *lex_source_create (struct lex_reader *);
104 static void lex_source_destroy (struct lex_source *);
109 struct ll_list sources; /* Contains "struct lex_source"s. */
112 static struct lex_source *lex_source__ (const struct lexer *);
113 static const struct lex_token *lex_next__ (const struct lexer *, int n);
114 static void lex_source_push_endcmd__ (struct lex_source *);
116 static void lex_source_pop__ (struct lex_source *);
117 static bool lex_source_get__ (const struct lex_source *);
118 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
119 const char *format, va_list)
120 PRINTF_FORMAT (4, 0);
121 static const struct lex_token *lex_source_next__ (const struct lex_source *,
124 /* Initializes READER with the specified CLASS and otherwise some reasonable
125 defaults. The caller should fill in the other members as desired. */
127 lex_reader_init (struct lex_reader *reader,
128 const struct lex_reader_class *class)
130 reader->class = class;
131 reader->syntax = LEX_SYNTAX_AUTO;
132 reader->error = LEX_ERROR_INTERACTIVE;
133 reader->file_name = NULL;
134 reader->line_number = 0;
137 /* Frees any file name already in READER and replaces it by a copy of
138 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
142 free (reader->file_name);
143 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
146 /* Creates and returns a new lexer. */
150 struct lexer *lexer = xzalloc (sizeof *lexer);
151 ll_init (&lexer->sources);
155 /* Destroys LEXER. */
157 lex_destroy (struct lexer *lexer)
161 struct lex_source *source, *next;
163 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
164 lex_source_destroy (source);
169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
170 READER. Before the call, LEXER must either be empty or at a T_ENDCMD
173 lex_include (struct lexer *lexer, struct lex_reader *reader)
175 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
176 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
179 /* Appends READER to LEXER, so that it will be read after all other current
180 readers have already been read. */
182 lex_append (struct lexer *lexer, struct lex_reader *reader)
184 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
/* Adds a new token slot at the front of SRC's token deque, first growing the
   'tokens' array with deque_expand() if the deque is full, and initializes the
   slot's 'token' member with token_init().

   NOTE(review): the return statement is not among the visible lines;
   presumably the function returns TOKEN, the new slot -- confirm. */
189 static struct lex_token *
190 lex_push_token__ (struct lex_source *src)
192 struct lex_token *token;
194 if (deque_is_full (&src->deque))
195 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
197 token = &src->tokens[deque_push_front (&src->deque)];
198 token_init (&token->token);
/* Removes the token at the back of SRC's deque, which is the oldest token
   (new tokens are pushed at the front), and destroys it. */
203 lex_source_pop__ (struct lex_source *src)
205 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
/* Removes the token at the front of SRC's deque, the end at which
   lex_push_token__() adds tokens, and destroys it. */
209 lex_source_pop_front (struct lex_source *src)
211 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
214 /* Advances LEXER to the next token, consuming the current token. */
216 lex_get (struct lexer *lexer)
218 struct lex_source *src;
220 src = lex_source__ (lexer);
224 if (!deque_is_empty (&src->deque))
225 lex_source_pop__ (src);
227 while (deque_is_empty (&src->deque))
228 if (!lex_source_get__ (src))
230 lex_source_destroy (src);
231 src = lex_source__ (lexer);
237 /* Issuing errors. */
239 /* Prints a syntax error message containing the current token and
240 the message formatted from FORMAT (if non-null). */
242 lex_error (struct lexer *lexer, const char *format, ...)
246 va_start (args, format);
247 lex_next_error_valist (lexer, 0, 0, format, args);
251 /* Prints a syntax error message containing the current token and
252 the message formatted from FORMAT (if non-null). */
254 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
256 lex_next_error_valist (lexer, 0, 0, format, args);
259 /* Prints a syntax error message containing the syntax of tokens N0 through N1
260 ahead of the current token and the message formatted from FORMAT (if non-null). */
262 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
266 va_start (args, format);
267 lex_next_error_valist (lexer, n0, n1, format, args);
271 /* Prints a syntax error message saying that OPTION0 or one of the other
272 strings following it, up to the first NULL, is expected. */
274 lex_error_expecting (struct lexer *lexer, const char *option0, ...)
276 enum { MAX_OPTIONS = 8 };
277 const char *options[MAX_OPTIONS + 1];
281 va_start (args, option0);
282 options[0] = option0;
/* Copy the variable arguments into OPTIONS after OPTION0, stopping after a
   null pointer has been stored or when the limit below is reached.

   NOTE(review): the loop advances only while n + 1 < MAX_OPTIONS, so if N
   starts at 0 (its initialization is not among the visible lines) it can end
   no higher than MAX_OPTIONS - 1, that is, 7.  The last lex_error() call in
   this function formats eight options, reading options[7] as the eighth;
   that call looks unreachable, and a caller that passes eight options would
   have only seven reported.  OPTIONS has MAX_OPTIONS + 1 elements, which
   suggests the bound was meant to allow N to reach MAX_OPTIONS -- confirm
   against the full function before changing it. */
284 while (n + 1 < MAX_OPTIONS && options[n] != NULL)
285 options[++n] = va_arg (args, const char *);
/* One lex_error() call per number of collected options, from none (null
   format) up to eight. */
291 lex_error (lexer, NULL);
295 lex_error (lexer, _("expecting %s"), options[0]);
299 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
303 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
308 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
309 options[0], options[1], options[2], options[3]);
313 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
314 options[0], options[1], options[2], options[3], options[4]);
318 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
319 options[0], options[1], options[2], options[3], options[4],
324 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
325 options[0], options[1], options[2], options[3], options[4],
326 options[5], options[6]);
330 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
331 options[0], options[1], options[2], options[3], options[4],
332 options[5], options[6], options[7]);
340 /* Reports an error to the effect that subcommand SBC may only be specified
343 This function does not take a lexer as an argument or use lex_error(),
344 because the result would ordinarily just be redundant: "Syntax error at
345 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
346 not help the user find the error. */
348 lex_sbc_only_once (const char *sbc)
350 msg (SE, _("Subcommand %s may only be specified once."), sbc);
353 /* Reports an error to the effect that subcommand SBC is missing.
355 This function does not take a lexer as an argument or use lex_error(),
356 because a missing subcommand can normally be detected only after the whole
357 command has been parsed, and so lex_error() would always report "Syntax
358 error at end of command", which does not help the user find the error. */
360 lex_sbc_missing (const char *sbc)
362 msg (SE, _("Required subcommand %s was not specified."), sbc);
365 /* Reports an error to the effect that specification SPEC may only be specified
366 once within subcommand SBC. */
368 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
370 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
374 /* Reports an error to the effect that specification SPEC is missing within
377 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
379 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
383 /* Prints a syntax error message containing the syntax of tokens N0 through N1
384 ahead of the current token and the message formatted from FORMAT (if non-null). */
386 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
387 const char *format, va_list args)
389 struct lex_source *src = lex_source__ (lexer);
392 lex_source_error_valist (src, n0, n1, format, args);
398 ds_put_format (&s, _("Syntax error at end of input"));
401 ds_put_cstr (&s, ": ");
402 ds_put_vformat (&s, format, args);
404 ds_put_byte (&s, '.');
405 msg (SE, "%s", ds_cstr (&s));
410 /* Checks that we're at end of command.
411 If so, returns a successful command completion code.
412 If not, flags a syntax error and returns an error command
415 lex_end_of_command (struct lexer *lexer)
417 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
419 lex_error (lexer, _("expecting end of command"));
426 /* Token testing functions. */
428 /* Returns true if the current token is a number. */
430 lex_is_number (struct lexer *lexer)
432 return lex_next_is_number (lexer, 0);
435 /* Returns true if the current token is a string. */
437 lex_is_string (struct lexer *lexer)
439 return lex_next_is_string (lexer, 0);
442 /* Returns the value of the current token, which must be a
443 floating point number. */
445 lex_number (struct lexer *lexer)
447 return lex_next_number (lexer, 0);
450 /* Returns true iff the current token is an integer. */
452 lex_is_integer (struct lexer *lexer)
454 return lex_next_is_integer (lexer, 0);
457 /* Returns the value of the current token, which must be an
460 lex_integer (struct lexer *lexer)
462 return lex_next_integer (lexer, 0);
465 /* Token testing functions with lookahead.
467 A value of 0 for N as an argument to any of these functions refers to the
468 current token. Lookahead is limited to the current command. Any N greater
469 than the number of tokens remaining in the current command will be treated
470 as referring to a T_ENDCMD token. */
472 /* Returns true if the token N ahead of the current token is a number. */
474 lex_next_is_number (struct lexer *lexer, int n)
476 enum token_type next_token = lex_next_token (lexer, n);
477 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
480 /* Returns true if the token N ahead of the current token is a string. */
482 lex_next_is_string (struct lexer *lexer, int n)
484 return lex_next_token (lexer, n) == T_STRING;
487 /* Returns the value of the token N ahead of the current token, which must be a
488 floating point number. */
490 lex_next_number (struct lexer *lexer, int n)
492 assert (lex_next_is_number (lexer, n));
493 return lex_next_tokval (lexer, n);
496 /* Returns true if the token N ahead of the current token is an integer. */
498 lex_next_is_integer (struct lexer *lexer, int n)
/* A token that is not a number cannot be an integer. */
502 if (!lex_next_is_number (lexer, n))
505 value = lex_next_tokval (lexer, n);
/* Accept only whole numbers (floor (value) == value) that lie in the range
   (LONG_MIN, LONG_MAX].

   NOTE(review): the strict '>' rejects LONG_MIN itself.  Also, if VALUE is a
   double (its declaration is not among the visible lines) and 'long' is 64
   bits wide, LONG_MAX is not exactly representable and is likely compared as
   2**63, so a value of 2**63 would pass this test although it does not fit
   in a 'long' -- confirm. */
506 return value > LONG_MIN && value <= LONG_MAX && floor (value) == value;
509 /* Returns the value of the token N ahead of the current token, which must be
512 lex_next_integer (struct lexer *lexer, int n)
514 assert (lex_next_is_integer (lexer, n));
515 return lex_next_tokval (lexer, n);
518 /* Token matching functions. */
520 /* If the current token has the specified TYPE, skips it and returns true.
521 Otherwise, returns false. */
523 lex_match (struct lexer *lexer, enum token_type type)
525 if (lex_token (lexer) == type)
534 /* If the current token matches IDENTIFIER, skips it and returns true.
535 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
538 IDENTIFIER must be an ASCII string. */
540 lex_match_id (struct lexer *lexer, const char *identifier)
542 return lex_match_id_n (lexer, identifier, 3);
545 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
546 may be abbreviated to its first N letters. Otherwise, returns false.
548 IDENTIFIER must be an ASCII string. */
550 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
552 if (lex_token (lexer) == T_ID
553 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
562 /* If the current token is integer X, skips it and returns true. Otherwise,
565 lex_match_int (struct lexer *lexer, int x)
567 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
576 /* Forced matches. */
578 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
579 abbreviated to its first 3 letters. Otherwise, reports an error and returns
582 IDENTIFIER must be an ASCII string. */
584 lex_force_match_id (struct lexer *lexer, const char *identifier)
586 if (lex_match_id (lexer, identifier))
590 lex_error_expecting (lexer, identifier, NULL_SENTINEL);
595 /* If the current token has the specified TYPE, skips it and returns true.
596 Otherwise, reports an error and returns false. */
598 lex_force_match (struct lexer *lexer, enum token_type type)
600 if (lex_token (lexer) == type)
607 char *s = xasprintf ("`%s'", token_type_to_string (type));
608 lex_error_expecting (lexer, s, NULL_SENTINEL);
614 /* If the current token is a string, does nothing and returns true.
615 Otherwise, reports an error and returns false. */
617 lex_force_string (struct lexer *lexer)
619 if (lex_is_string (lexer))
623 lex_error (lexer, _("expecting string"));
628 /* If the current token is an integer, does nothing and returns true.
629 Otherwise, reports an error and returns false. */
631 lex_force_int (struct lexer *lexer)
633 if (lex_is_integer (lexer))
637 lex_error (lexer, _("expecting integer"));
642 /* If the current token is a number, does nothing and returns true.
643 Otherwise, reports an error and returns false. */
645 lex_force_num (struct lexer *lexer)
647 if (lex_is_number (lexer))
650 lex_error (lexer, _("expecting number"));
654 /* If the current token is an identifier, does nothing and returns true.
655 Otherwise, reports an error and returns false. */
657 lex_force_id (struct lexer *lexer)
659 if (lex_token (lexer) == T_ID)
662 lex_error (lexer, _("expecting identifier"));
666 /* Token accessors. */
668 /* Returns the type of LEXER's current token. */
670 lex_token (const struct lexer *lexer)
672 return lex_next_token (lexer, 0);
675 /* Returns the number in LEXER's current token.
677 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
678 tokens this function will always return zero. */
680 lex_tokval (const struct lexer *lexer)
682 return lex_next_tokval (lexer, 0);
685 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
687 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
688 this function will always return NULL.
690 The UTF-8 encoding of the returned string is correct for variable names and
691 other identifiers. Use filename_to_utf8() to use it as a filename. Use
692 data_in() to use it in a "union value". */
694 lex_tokcstr (const struct lexer *lexer)
696 return lex_next_tokcstr (lexer, 0);
699 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
700 null-terminated (but the null terminator is not included in the returned
701 substring's 'length').
703 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
704 this function will always return NULL.
706 The UTF-8 encoding of the returned string is correct for variable names and
707 other identifiers. Use filename_to_utf8() to use it as a filename. Use
708 data_in() to use it in a "union value". */
710 lex_tokss (const struct lexer *lexer)
712 return lex_next_tokss (lexer, 0);
717 A value of 0 for N as an argument to any of these functions refers to the
718 current token. Lookahead is limited to the current command. Any N greater
719 than the number of tokens remaining in the current command will be treated
720 as referring to a T_ENDCMD token. */
722 static const struct lex_token *
723 lex_next__ (const struct lexer *lexer_, int n)
725 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
726 struct lex_source *src = lex_source__ (lexer);
729 return lex_source_next__ (src, n);
732 static const struct lex_token stop_token =
733 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
/* Returns the token N positions after the current one in SRC, where the
   current token is the one at the back of SRC's deque.

   While the deque holds N or fewer tokens, more are read with
   lex_source_get__().  Before each read, the newest token (the front of the
   deque) is checked for type T_STOP or T_ENDCMD.

   NOTE(review): the statement guarded by that check is not among the visible
   lines; presumably it returns FRONT so that lookahead never reads past the
   end of a command -- confirm. */
739 static const struct lex_token *
740 lex_source_next__ (const struct lex_source *src, int n)
742 while (deque_count (&src->deque) <= n)
744 if (!deque_is_empty (&src->deque))
746 struct lex_token *front;
748 front = &src->tokens[deque_front (&src->deque, 0)];
749 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
753 lex_source_get__ (src);
756 return &src->tokens[deque_back (&src->deque, n)];
759 /* Returns the "struct token" of the token N after the current one in LEXER.
760 The returned pointer can be invalidated by pretty much any succeeding call
761 into the lexer, although the string pointer within the returned token is
762 only invalidated by consuming the token (e.g. with lex_get()). */
764 lex_next (const struct lexer *lexer, int n)
766 return &lex_next__ (lexer, n)->token;
769 /* Returns the type of the token N after the current one in LEXER. */
771 lex_next_token (const struct lexer *lexer, int n)
773 return lex_next (lexer, n)->type;
776 /* Returns the number in the token N after the current one in LEXER.
778 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
779 tokens this function will always return zero. */
781 lex_next_tokval (const struct lexer *lexer, int n)
783 const struct token *token = lex_next (lexer, n);
784 return token->number;
787 /* Returns the null-terminated string in the token N after the current one, in
790 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
791 this function will always return NULL.
793 The UTF-8 encoding of the returned string is correct for variable names and
794 other identifiers. Use filename_to_utf8() to use it as a filename. Use
795 data_in() to use it in a "union value". */
797 lex_next_tokcstr (const struct lexer *lexer, int n)
799 return lex_next_tokss (lexer, n).string;
802 /* Returns the string in the token N after the current one, in UTF-8 encoding.
803 The string is null-terminated (but the null terminator is not included in
804 the returned substring's 'length').
806 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
807 this function will always return NULL.
809 The UTF-8 encoding of the returned string is correct for variable names and
810 other identifiers. Use filename_to_utf8() to use it as a filename. Use
811 data_in() to use it in a "union value". */
813 lex_next_tokss (const struct lexer *lexer, int n)
815 return lex_next (lexer, n)->string;
818 /* If LEXER is positioned at the (pseudo)identifier S, skips it and returns
819 true. Otherwise, returns false.
821 S may consist of an arbitrary number of identifiers, integers, and
822 punctuation e.g. "KRUSKAL-WALLIS", "2SLS", or "END INPUT PROGRAM".
823 Identifiers may be abbreviated to their first three letters. Currently only
824 hyphens, slashes, and equals signs are supported as punctuation (but it
825 would be easy to add more).
827 S must be an ASCII string. */
829 lex_match_phrase (struct lexer *lexer, const char *s)
833 for (tok_idx = 0; ; tok_idx++)
835 enum token_type token;
838 while (c_isspace (*s))
846 for (i = 0; i < tok_idx; i++)
851 token = lex_next_token (lexer, tok_idx);
861 if (token != T_SLASH)
867 if (token != T_EQUALS)
872 case '0': case '1': case '2': case '3': case '4':
873 case '5': case '6': case '7': case '8': case '9':
877 if (token != T_POS_NUM)
883 value = value * 10 + (*s++ - '0');
885 while (c_isdigit (*s));
887 if (lex_next_tokval (lexer, tok_idx) != value)
900 len = lex_id_get_length (ss_cstr (s));
901 if (!lex_id_match (ss_buffer (s, len),
902 lex_next_tokss (lexer, tok_idx)))
914 lex_source_get_first_line_number (const struct lex_source *src, int n)
916 return lex_source_next__ (src, n)->first_line;
920 count_newlines (char *s, size_t length)
925 while ((newline = memchr (s, '\n', length)) != NULL)
928 length -= (newline + 1) - s;
936 lex_source_get_last_line_number (const struct lex_source *src, int n)
938 const struct lex_token *token = lex_source_next__ (src, n);
940 if (token->first_line == 0)
944 char *token_str = &src->buffer[token->token_pos - src->tail];
945 return token->first_line + count_newlines (token_str, token->token_len) + 1;
950 count_columns (const char *s_, size_t length)
952 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
958 for (ofs = 0; ofs < length; ofs += mblen)
962 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
965 int width = uc_width (uc, "UTF-8");
970 columns = ROUND_UP (columns + 1, 8);
977 lex_source_get_first_column (const struct lex_source *src, int n)
979 const struct lex_token *token = lex_source_next__ (src, n);
980 return count_columns (&src->buffer[token->line_pos - src->tail],
981 token->token_pos - token->line_pos);
985 lex_source_get_last_column (const struct lex_source *src, int n)
987 const struct lex_token *token = lex_source_next__ (src, n);
988 char *start, *end, *newline;
990 start = &src->buffer[token->line_pos - src->tail];
991 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
992 newline = memrchr (start, '\n', end - start);
995 return count_columns (start, end - start);
998 /* Returns the 1-based line number of the start of the syntax that represents
999 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1000 if the token is drawn from a source that does not have line numbers. */
1002 lex_get_first_line_number (const struct lexer *lexer, int n)
1004 const struct lex_source *src = lex_source__ (lexer);
1005 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1008 /* Returns the 1-based line number of the end of the syntax that represents the
1009 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1010 token or if the token is drawn from a source that does not have line
1013 Most of the time, a single token is wholly within a single line of syntax,
1014 but there are two exceptions: a T_STRING token can be made up of multiple
1015 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1016 token can consist of a "-" on one line followed by the number on the next.
1019 lex_get_last_line_number (const struct lexer *lexer, int n)
1021 const struct lex_source *src = lex_source__ (lexer);
1022 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1025 /* Returns the 1-based column number of the start of the syntax that represents
1026 the token N after the current one in LEXER. Returns 0 for a T_STOP
1029 Column numbers are measured according to the width of characters as shown in
1030 a typical fixed-width font, in which CJK characters have width 2 and
1031 combining characters have width 0. */
1033 lex_get_first_column (const struct lexer *lexer, int n)
1035 const struct lex_source *src = lex_source__ (lexer);
1036 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1039 /* Returns the 1-based column number of the end of the syntax that represents
1040 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1043 Column numbers are measured according to the width of characters as shown in
1044 a typical fixed-width font, in which CJK characters have width 2 and
1045 combining characters have width 0. */
1047 lex_get_last_column (const struct lexer *lexer, int n)
1049 const struct lex_source *src = lex_source__ (lexer);
1050 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1053 /* Returns the name of the syntax file from which the current command is drawn.
1054 Returns NULL for a T_STOP token or if the command's source does not have
1057 There is no version of this function that takes an N argument because
1058 lookahead only works to the end of a command and any given command is always
1059 within a single syntax file. */
1061 lex_get_file_name (const struct lexer *lexer)
1063 struct lex_source *src = lex_source__ (lexer);
1064 return src == NULL ? NULL : src->reader->file_name;
1067 /* Returns the syntax mode for the syntax file from which the current command is
1068 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1069 source does not have line numbers.
1071 There is no version of this function that takes an N argument because
1072 lookahead only works to the end of a command and any given command is always
1073 within a single syntax file. */
1074 enum lex_syntax_mode
1075 lex_get_syntax_mode (const struct lexer *lexer)
1077 struct lex_source *src = lex_source__ (lexer);
1078 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1081 /* Returns the error mode for the syntax file from which the current command is
1082 drawn. Returns LEX_ERROR_INTERACTIVE for a T_STOP token or if the command's
1083 source does not have line numbers.
1085 There is no version of this function that takes an N argument because
1086 lookahead only works to the end of a command and any given command is always
1087 within a single syntax file. */
1089 lex_get_error_mode (const struct lexer *lexer)
1091 struct lex_source *src = lex_source__ (lexer);
1092 return src == NULL ? LEX_ERROR_INTERACTIVE : src->reader->error;
1095 /* If the source that LEXER is currently reading has error mode
1096 LEX_ERROR_INTERACTIVE, discards all buffered input and tokens, so that the
1097 next token to be read comes directly from whatever is next read from the
1100 It makes sense to call this function after encountering an error in a
1101 command entered on the console, because usually the user would prefer not to
1102 have cascading errors. */
1104 lex_interactive_reset (struct lexer *lexer)
1106 struct lex_source *src = lex_source__ (lexer);
1107 if (src != NULL && src->reader->error == LEX_ERROR_INTERACTIVE)
1109 src->head = src->tail = 0;
1110 src->journal_pos = src->seg_pos = src->line_pos = 0;
1111 src->n_newlines = 0;
1112 src->suppress_next_newline = false;
1113 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1114 while (!deque_is_empty (&src->deque))
1115 lex_source_pop__ (src);
1116 lex_source_push_endcmd__ (src);
1120 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1122 lex_discard_rest_of_command (struct lexer *lexer)
1124 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1128 /* Discards all lookahead tokens in LEXER, then discards all input sources
1129 until it encounters one with error mode LEX_ERROR_INTERACTIVE or until it
1130 runs out of input sources. */
1132 lex_discard_noninteractive (struct lexer *lexer)
1134 struct lex_source *src = lex_source__ (lexer);
1138 while (!deque_is_empty (&src->deque))
1139 lex_source_pop__ (src);
1141 for (; src != NULL && src->reader->error != LEX_ERROR_INTERACTIVE;
1142 src = lex_source__ (lexer))
1143 lex_source_destroy (src);
/* Computes MAX_TAIL as the smallest of SRC's journal_pos, SRC's line_pos, and
   the line_pos of the oldest token in SRC's deque.  lex_source_expand__() uses
   the result as the position up to which buffered bytes may be discarded.

   The deque must not be empty when this is called. */
1148 lex_source_max_tail__ (const struct lex_source *src)
1150 const struct lex_token *token;
1153 assert (src->seg_pos >= src->line_pos);
1154 max_tail = MIN (src->journal_pos, src->line_pos);
1156 /* Use the oldest token also. (We know that src->deque cannot be empty
1157 because we are in the process of adding a new token, which is already
1158 initialized enough to use here.) */
1159 token = &src->tokens[deque_back (&src->deque, 0)];
1160 assert (token->token_pos >= token->line_pos);
1161 max_tail = MIN (max_tail, token->line_pos);
/* Makes sure that SRC's buffer has room for more input at its head.

   If the buffered bytes (head - tail) already fill the allocation, then either
   the bytes before lex_source_max_tail__() are discarded by sliding the rest
   down to the start of the buffer, or, when nothing can be discarded, the
   buffer is enlarged with x2realloc().  Otherwise nothing is done. */
1167 lex_source_expand__ (struct lex_source *src)
1169 if (src->head - src->tail >= src->allocated)
1171 size_t max_tail = lex_source_max_tail__ (src);
1172 if (max_tail > src->tail)
1174 /* Advance the tail, freeing up room at the head. */
1175 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1176 src->head - max_tail);
1177 src->tail = max_tail;
1181 /* Buffer is completely full. Expand it. */
1182 src->buffer = x2realloc (src->buffer, &src->allocated);
1187 /* There's space available at the head of the buffer. Nothing to do. */
1192 lex_source_read__ (struct lex_source *src)
1199 lex_source_expand__ (src);
1201 head_ofs = src->head - src->tail;
1202 n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1203 src->allocated - head_ofs,
1204 segmenter_get_prompt (&src->segmenter));
1209 Ensure that the input always ends in a new-line followed by a null
1210 byte, as required by the segmenter library. */
1212 if (src->head == src->tail
1213 || src->buffer[src->head - src->tail - 1] != '\n')
1214 src->buffer[src->head++ - src->tail] = '\n';
1216 lex_source_expand__ (src);
1217 src->buffer[src->head++ - src->tail] = '\0';
1224 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1225 src->head - src->seg_pos));
/* Returns the source at the head of LEXER's list of sources, or a null
   pointer if the list is empty. */
1228 static struct lex_source *
1229 lex_source__ (const struct lexer *lexer)
1231 return (ll_is_empty (&lexer->sources) ? NULL
1232 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns the bytes of SRC's buffer that run from the start of token N0
   through the end of token MAX (N0, N1), where tokens are counted from the
   current one.  Using MAX keeps the end token from preceding the start token
   when N1 is less than N0. */
1235 static struct substring
1236 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1238 const struct lex_token *token0 = lex_source_next__ (src, n0);
1239 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1240 size_t start = token0->token_pos;
1241 size_t end = token1->token_pos + token1->token_len;
1243 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Copies a prefix of IN into OUT as a null-terminated string, appending "..."
   if fewer than IN.length bytes were copied.  OUT_SIZE, the size of OUT in
   bytes, must be at least 16.

   The prefix is measured one UTF-8 character at a time (with u8_mblen()), so
   a multibyte character is not split.

   NOTE(review): the statements guarded by the two 'if' tests in the loop are
   not among the visible lines; presumably each ends the loop, so that copying
   stops at the first new-line (or CR LF pair) or when the next character
   would not fit -- confirm. */
1247 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1253 assert (out_size >= 16);
/* Leave room for the null terminator, plus the three bytes of "..." whenever
   IN cannot fit in OUT in full. */
1254 out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1255 for (out_len = 0; out_len < in.length; out_len += mblen)
1257 if (in.string[out_len] == '\n'
1258 || (in.string[out_len] == '\r'
1259 && out_len + 1 < in.length
1260 && in.string[out_len + 1] == '\n'))
1263 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1264 in.length - out_len);
1265 if (out_len + mblen > out_maxlen)
1269 memcpy (out, in.string, out_len);
1270 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Builds a syntax error message for tokens N0 through N1 of SRC, counted from
   the current token.

   The text starts with "Syntax error at end of command" if token N0 is
   T_ENDCMD; otherwise with "Syntax error at `SYNTAX'", where SYNTAX is the
   tokens' source text shortened by lex_ellipsize__() to fit a 64-byte buffer,
   or with plain "Syntax error" if that text is empty.  The message formatted
   from FORMAT and ARGS follows after ": ", and a period ends the text.

   The message structure M is filled in with the syntax category, error
   severity, SRC's file name, and the line and column range of the tokens.

   NOTE(review): the declarations of S and M, the condition (if any) guarding
   the use of FORMAT, and the statement that emits M are not among the visible
   lines -- confirm against the full function. */
1274 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1275 const char *format, va_list args)
1277 const struct lex_token *token;
1283 token = lex_source_next__ (src, n0);
1284 if (token->token.type == T_ENDCMD)
1285 ds_put_cstr (&s, _("Syntax error at end of command"));
1288 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1289 if (!ss_is_empty (syntax))
1291 char syntax_cstr[64];
1293 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1294 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1297 ds_put_cstr (&s, _("Syntax error"));
1302 ds_put_cstr (&s, ": ");
1303 ds_put_vformat (&s, format, args);
1305 ds_put_byte (&s, '.');
1307 m.category = MSG_C_SYNTAX;
1308 m.severity = MSG_S_ERROR;
1309 m.file_name = src->reader->file_name;
1310 m.first_line = lex_source_get_first_line_number (src, n0);
1311 m.last_line = lex_source_get_last_line_number (src, n1);
1312 m.first_column = lex_source_get_first_column (src, n0);
1313 m.last_column = lex_source_get_last_column (src, n1);
1314 m.text = ds_steal_cstr (&s);
1318 static void PRINTF_FORMAT (2, 3)
1319 lex_get_error (struct lex_source *src, const char *format, ...)
1324 va_start (args, format);
1326 n = deque_count (&src->deque) - 1;
1327 lex_source_error_valist (src, n, n, format, args);
1328 lex_source_pop_front (src);
1334 lex_source_get__ (const struct lex_source *src_)
1336 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1340 struct segmenter segmenter;
1341 enum segment_type last_segment;
1347 struct state state, saved;
1348 enum scan_result result;
1349 struct scanner scanner;
1350 struct lex_token *token;
1357 state.segmenter = src->segmenter;
1359 state.seg_pos = src->seg_pos;
1360 state.line_pos = src->line_pos;
1363 token = lex_push_token__ (src);
1364 scanner_init (&scanner, &token->token);
1365 token->line_pos = src->line_pos;
1366 token->token_pos = src->seg_pos;
1367 if (src->reader->line_number > 0)
1368 token->first_line = src->reader->line_number + src->n_newlines;
1370 token->first_line = 0;
1374 enum segment_type type;
1375 const char *segment;
1379 segment = &src->buffer[state.seg_pos - src->tail];
1380 seg_maxlen = src->head - state.seg_pos;
1381 seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen, &type);
1384 lex_source_read__ (src);
1388 state.last_segment = type;
1389 state.seg_pos += seg_len;
1390 if (type == SEG_NEWLINE)
1393 state.line_pos = state.seg_pos;
1396 result = scanner_push (&scanner, type, ss_buffer (segment, seg_len),
1398 if (result == SCAN_SAVE)
1400 else if (result == SCAN_BACK)
1405 else if (result == SCAN_DONE)
1409 n_lines = state.newlines;
1410 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1413 src->suppress_next_newline = true;
1415 else if (n_lines > 0 && src->suppress_next_newline)
1418 src->suppress_next_newline = false;
1420 for (i = 0; i < n_lines; i++)
1422 const char *newline;
1427 line = &src->buffer[src->journal_pos - src->tail];
1428 newline = rawmemchr (line, '\n');
1429 line_len = newline - line;
1430 if (line_len > 0 && line[line_len - 1] == '\r')
1433 syntax = malloc (line_len + 2);
1434 memcpy (syntax, line, line_len);
1435 syntax[line_len] = '\n';
1436 syntax[line_len + 1] = '\0';
1438 text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1440 src->journal_pos += newline - line + 1;
1443 token->token_len = state.seg_pos - src->seg_pos;
1445 src->segmenter = state.segmenter;
1446 src->seg_pos = state.seg_pos;
1447 src->line_pos = state.line_pos;
1448 src->n_newlines += state.newlines;
1450 switch (token->token.type)
1456 token->token.type = T_ENDCMD;
1460 case SCAN_BAD_HEX_LENGTH:
1461 lex_get_error (src, _("String of hex digits has %d characters, which "
1462 "is not a multiple of 2"),
1463 (int) token->token.number);
1466 case SCAN_BAD_HEX_DIGIT:
1467 case SCAN_BAD_UNICODE_DIGIT:
1468 lex_get_error (src, _("`%c' is not a valid hex digit"),
1469 (int) token->token.number);
1472 case SCAN_BAD_UNICODE_LENGTH:
1473 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1474 "not in the valid range of 1 to 8 bytes"),
1475 (int) token->token.number);
1478 case SCAN_BAD_UNICODE_CODE_POINT:
1479 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1480 (int) token->token.number);
1483 case SCAN_EXPECTED_QUOTE:
1484 lex_get_error (src, _("Unterminated string constant"));
1487 case SCAN_EXPECTED_EXPONENT:
1488 lex_get_error (src, _("Missing exponent following `%s'"),
1489 token->token.string.string);
1492 case SCAN_UNEXPECTED_DOT:
1493 lex_get_error (src, _("Unexpected `.' in middle of command"));
1496 case SCAN_UNEXPECTED_CHAR:
1499 lex_get_error (src, _("Bad character %s in input"),
1500 uc_name (token->token.number, c_name));
1505 lex_source_pop_front (src);
1513 lex_source_push_endcmd__ (struct lex_source *src)
1515 struct lex_token *token = lex_push_token__ (src);
1516 token->token.type = T_ENDCMD;
1517 token->token_pos = 0;
1518 token->token_len = 0;
1519 token->line_pos = 0;
1520 token->first_line = 0;
1523 static struct lex_source *
1524 lex_source_create (struct lex_reader *reader)
1526 struct lex_source *src;
1527 enum segmenter_mode mode;
1529 src = xzalloc (sizeof *src);
1530 src->reader = reader;
1532 if (reader->syntax == LEX_SYNTAX_AUTO)
1533 mode = SEG_MODE_AUTO;
1534 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1535 mode = SEG_MODE_INTERACTIVE;
1536 else if (reader->syntax == LEX_SYNTAX_BATCH)
1537 mode = SEG_MODE_BATCH;
1540 segmenter_init (&src->segmenter, mode);
1542 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1544 lex_source_push_endcmd__ (src);
1550 lex_source_destroy (struct lex_source *src)
1552 char *file_name = src->reader->file_name;
1553 if (src->reader->class->destroy != NULL)
1554 src->reader->class->destroy (src->reader);
1557 while (!deque_is_empty (&src->deque))
1558 lex_source_pop__ (src);
1560 ll_remove (&src->ll);
1564 struct lex_file_reader
1566 struct lex_reader reader;
1567 struct u8_istream *istream;
1571 static struct lex_reader_class lex_file_reader_class;
1573 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1574 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1575 ENCODING, which should take one of the forms accepted by
1576 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1577 mode of the new reader, respectively.
1579 Returns a null pointer if FILE_NAME cannot be opened. */
1581 lex_reader_for_file (const char *file_name, const char *encoding,
1582 enum lex_syntax_mode syntax,
1583 enum lex_error_mode error)
1585 struct lex_file_reader *r;
1586 struct u8_istream *istream;
1588 istream = (!strcmp(file_name, "-")
1589 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1590 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1591 if (istream == NULL)
1593 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1597 r = xmalloc (sizeof *r);
1598 lex_reader_init (&r->reader, &lex_file_reader_class);
1599 r->reader.syntax = syntax;
1600 r->reader.error = error;
1601 r->reader.file_name = xstrdup (file_name);
1602 r->reader.line_number = 1;
1603 r->istream = istream;
1604 r->file_name = xstrdup (file_name);
1609 static struct lex_file_reader *
1610 lex_file_reader_cast (struct lex_reader *r)
1612 return UP_CAST (r, struct lex_file_reader, reader);
1616 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1617 enum prompt_style prompt_style UNUSED)
1619 struct lex_file_reader *r = lex_file_reader_cast (r_);
1620 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1623 msg (ME, _("Error reading `%s': %s."), r->file_name, strerror (errno));
1630 lex_file_close (struct lex_reader *r_)
1632 struct lex_file_reader *r = lex_file_reader_cast (r_);
1634 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1636 if (u8_istream_close (r->istream) != 0)
1637 msg (ME, _("Error closing `%s': %s."), r->file_name, strerror (errno));
1640 u8_istream_free (r->istream);
1642 free (r->file_name);
1646 static struct lex_reader_class lex_file_reader_class =
1652 struct lex_string_reader
1654 struct lex_reader reader;
1659 static struct lex_reader_class lex_string_reader_class;
1661 /* Creates and returns a new lex_reader for the contents of S, which must be
1662 encoded in UTF-8. The new reader takes ownership of S and will free it
1663 with ss_dealloc() when it is closed. */
1665 lex_reader_for_substring_nocopy (struct substring s)
1667 struct lex_string_reader *r;
1669 r = xmalloc (sizeof *r);
1670 lex_reader_init (&r->reader, &lex_string_reader_class);
1671 r->reader.syntax = LEX_SYNTAX_INTERACTIVE;
1678 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1679 which must be encoded in UTF-8. The caller retains ownership of S. */
1681 lex_reader_for_string (const char *s)
1683 struct substring ss;
1684 ss_alloc_substring (&ss, ss_cstr (s));
1685 return lex_reader_for_substring_nocopy (ss);
/* Formats FORMAT and the arguments that follow it in the manner of printf()
   and creates and returns a new lex_reader for the formatted result.  The
   formatted text is handed over to the new reader. */
struct lex_reader *
lex_reader_for_format (const char *format, ...)
{
  va_list args;
  char *text;

  va_start (args, format);
  text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text));
}
1703 static struct lex_string_reader *
1704 lex_string_reader_cast (struct lex_reader *r)
1706 return UP_CAST (r, struct lex_string_reader, reader);
1710 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1711 enum prompt_style prompt_style UNUSED)
1713 struct lex_string_reader *r = lex_string_reader_cast (r_);
1716 chunk = MIN (n, r->s.length - r->offset);
1717 memcpy (buf, r->s.string + r->offset, chunk);
1724 lex_string_close (struct lex_reader *r_)
1726 struct lex_string_reader *r = lex_string_reader_cast (r_);
1732 static struct lex_reader_class lex_string_reader_class =