1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/lexer/lexer.h"
33 #include "language/command.h"
34 #include "language/lexer/macro.h"
35 #include "language/lexer/scan.h"
36 #include "language/lexer/segment.h"
37 #include "language/lexer/token.h"
38 #include "libpspp/assertion.h"
39 #include "libpspp/cast.h"
40 #include "libpspp/deque.h"
41 #include "libpspp/i18n.h"
42 #include "libpspp/ll.h"
43 #include "libpspp/message.h"
44 #include "libpspp/misc.h"
45 #include "libpspp/str.h"
46 #include "libpspp/u8-istream.h"
47 #include "output/journal.h"
48 #include "output/output-item.h"
50 #include "gl/c-ctype.h"
51 #include "gl/minmax.h"
52 #include "gl/xalloc.h"
53 #include "gl/xmemdup0.h"
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
59 /* A token within a lex_source. */
62 /* The regular token information. */
65 /* For a token obtained through the lexer in an ordinary way, this is the
66 location of the token in terms of the lex_source's buffer.
68 For a token produced through macro expansion, this is the entire macro
71 src->tail <= line_pos <= token_pos <= src->head. */
72 size_t token_pos; /* Start of token. */
73 size_t token_len; /* Length of source for token in bytes. */
74 size_t line_pos; /* Start of line containing token_pos. */
75 int first_line; /* Line number at token_pos. */
77 /* For a token obtained through macro expansion, this is just this token. */
78 char *macro_rep; /* The whole macro expansion. */
79 size_t ofs; /* Offset of this token in macro_rep. */
80 size_t len; /* Length of this token in macro_rep. */
81 size_t *ref_cnt; /* Number of lex_tokens that refer to macro_rep. */
/* Frees the resources owned by token T: releases the embedded token's
   storage via token_uninit(), and decrements the shared reference count on
   macro_rep.  NOTE(review): presumably macro_rep/ref_cnt are freed when the
   count reaches zero, but the freeing lines are not visible in this excerpt
   — confirm against the full source. */
85 lex_token_uninit (struct lex_token *t)
87 token_uninit (&t->token);
90 assert (*t->ref_cnt > 0);
99 /* A source of tokens, corresponding to a syntax file.
101 This is conceptually a lex_reader wrapped with everything needed to convert
102 its UTF-8 bytes into tokens. */
105 struct ll ll; /* In lexer's list of sources. */
106 struct lex_reader *reader;
108 struct segmenter segmenter;
109 bool eof; /* True if T_STOP was read from 'reader'. */
111 /* Buffer of UTF-8 bytes. */
113 size_t allocated; /* Number of bytes allocated. */
114 size_t tail; /* &buffer[0] offset into UTF-8 source. */
115 size_t head; /* &buffer[head - tail] offset into source. */
117 /* Positions in source file, tail <= pos <= head for each member here. */
118 size_t journal_pos; /* First byte not yet output to journal. */
119 size_t seg_pos; /* First byte not yet scanned as token. */
120 size_t line_pos; /* First byte of line containing seg_pos. */
122 int n_newlines; /* Number of new-lines up to seg_pos. */
123 bool suppress_next_newline;
/* Lookahead queue: 'deque' holds ring-buffer indexes into the 'tokens'
   array; tokens are pushed at the front as they are scanned and popped at
   the back as the parser consumes them. */
126 struct deque deque; /* Indexes into 'tokens'. */
127 struct lex_token *tokens; /* Lookahead tokens for parser. */
130 static struct lex_source *lex_source_create (struct lexer *,
131 struct lex_reader *);
132 static void lex_source_destroy (struct lex_source *);
/* The lexer itself: a stack of lex_sources (head = currently active) plus
   the set of macros defined so far. */
137 struct ll_list sources; /* Contains "struct lex_source"s. */
138 struct macro_set *macros;
/* Internal helpers; see definitions below for contracts. */
141 static struct lex_source *lex_source__ (const struct lexer *);
142 static char *lex_source_get_syntax__ (const struct lex_source *,
144 static const struct lex_token *lex_next__ (const struct lexer *, int n);
145 static void lex_source_push_endcmd__ (struct lex_source *);
147 static void lex_source_pop__ (struct lex_source *);
148 static bool lex_source_get (const struct lex_source *);
149 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
150 const char *format, va_list)
151 PRINTF_FORMAT (4, 0);
152 static const struct lex_token *lex_source_next__ (const struct lex_source *,
155 /* Initializes READER with the specified CLASS and otherwise some reasonable
156 defaults. The caller should fill in the others members as desired. */
158 lex_reader_init (struct lex_reader *reader,
159 const struct lex_reader_class *class)
161 reader->class = class;
162 reader->syntax = SEG_MODE_AUTO;
163 reader->error = LEX_ERROR_CONTINUE;
164 reader->file_name = NULL;
165 reader->encoding = NULL;
166 reader->line_number = 0;
170 /* Frees any file name already in READER and replaces it by a copy of
171 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
173 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
175 free (reader->file_name);
176 reader->file_name = xstrdup_if_nonnull (file_name);
179 /* Creates and returns a new lexer. */
183 struct lexer *lexer = xmalloc (sizeof *lexer);
184 *lexer = (struct lexer) {
185 .sources = LL_INITIALIZER (lexer->sources),
186 .macros = macro_set_create (),
/* Destroys LEXER: tears down every remaining source (safe iteration, since
   lex_source_destroy unlinks the node) and the macro set.  NOTE(review):
   the final free(lexer) is not visible in this excerpt. */
191 /* Destroys LEXER. */
193 lex_destroy (struct lexer *lexer)
197 struct lex_source *source, *next;
199 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
200 lex_source_destroy (source);
201 macro_set_destroy (lexer->macros);
206 /* Adds M to LEXER's set of macros. M replaces any existing macro with the
207 same name. Takes ownership of M. */
209 lex_define_macro (struct lexer *lexer, struct macro *m)
211 macro_set_add (lexer->macros, m);
214 /* Inserts READER into LEXER so that the next token read by LEXER comes from
215 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
218 lex_include (struct lexer *lexer, struct lex_reader *reader)
220 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
221 ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
/* Unlike lex_include(), which pushes at the head (read next), lex_append()
   pushes at the tail so READER is read only after all current sources. */
224 /* Appends READER to LEXER, so that it will be read after all other current
225 readers have already been read. */
227 lex_append (struct lexer *lexer, struct lex_reader *reader)
229 ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
/* Appends a fresh token slot at the front of SRC's lookahead queue, growing
   the backing array first if the deque is full.  The slot is initialized to
   T_STOP with no macro expansion attached; the caller fills it in. */
234 static struct lex_token *
235 lex_push_token__ (struct lex_source *src)
237 struct lex_token *token;
239 if (deque_is_full (&src->deque))
240 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
242 token = &src->tokens[deque_push_front (&src->deque)];
243 token->token = (struct token) { .type = T_STOP };
244 token->macro_rep = NULL;
245 token->ref_cnt = NULL;
/* Discards the oldest lookahead token (the one the parser consumed). */
250 lex_source_pop__ (struct lex_source *src)
252 lex_token_uninit (&src->tokens[deque_pop_back (&src->deque)]);
/* Discards the newest lookahead token (the one most recently scanned). */
256 lex_source_pop_front (struct lex_source *src)
258 lex_token_uninit (&src->tokens[deque_pop_front (&src->deque)]);
261 /* Advances LEXER to the next token, consuming the current token. */
263 lex_get (struct lexer *lexer)
265 struct lex_source *src;
267 src = lex_source__ (lexer)
271 if (!deque_is_empty (&src->deque))
272 lex_source_pop__ (src);
/* Refill the queue; when a source is exhausted, destroy it and fall
   through to the next one on the stack. */
274 while (deque_is_empty (&src->deque))
275 if (!lex_source_get (src))
277 lex_source_destroy (src);
278 src = lex_source__ (lexer);
284 /* Issuing errors. */
286 /* Prints a syntax error message containing the current token and
287 given message MESSAGE (if non-null). */
289 lex_error (struct lexer *lexer, const char *format, ...)
293 va_start (args, format);
294 lex_next_error_valist (lexer, 0, 0, format, args);
298 /* Prints a syntax error message containing the current token and
299 given message MESSAGE (if non-null). */
301 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
303 lex_next_error_valist (lexer, 0, 0, format, args);
306 /* Prints a syntax error message containing the current token and
307 given message MESSAGE (if non-null). */
309 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
313 va_start (args, format);
314 lex_next_error_valist (lexer, n0, n1, format, args);
318 /* Prints a syntax error message saying that one of the strings provided as
319 varargs, up to the first NULL, is expected. */
321 (lex_error_expecting) (struct lexer *lexer, ...)
325 va_start (args, lexer);
326 lex_error_expecting_valist (lexer, args);
330 /* Prints a syntax error message saying that one of the options provided in
331 ARGS, up to the first NULL, is expected. */
333 lex_error_expecting_valist (struct lexer *lexer, va_list args)
/* Collect up to MAX_OPTIONS strings from ARGS, stopping at the first NULL,
   then delegate formatting to lex_error_expecting_array(). */
335 enum { MAX_OPTIONS = 9 };
336 const char *options[MAX_OPTIONS];
338 while (n < MAX_OPTIONS)
340 const char *option = va_arg (args, const char *);
344 options[n++] = option;
346 lex_error_expecting_array (lexer, options, n);
/* Formats an "expecting X, Y, or Z" message with the proper conjunction for
   each arity from 1 to 8; with 0 or more than 8 options it falls back to a
   generic syntax error.  Each arity gets its own translatable string so
   translators can reorder freely. */
350 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
355 lex_error (lexer, NULL);
359 lex_error (lexer, _("expecting %s"), options[0]);
363 lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
367 lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
372 lex_error (lexer, _("expecting %s, %s, %s, or %s"),
373 options[0], options[1], options[2], options[3]);
377 lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
378 options[0], options[1], options[2], options[3], options[4]);
382 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
383 options[0], options[1], options[2], options[3], options[4],
388 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
389 options[0], options[1], options[2], options[3], options[4],
390 options[5], options[6]);
394 lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
395 options[0], options[1], options[2], options[3], options[4],
396 options[5], options[6], options[7]);
400 lex_error (lexer, NULL);
404 /* Reports an error to the effect that subcommand SBC may only be specified
407 This function does not take a lexer as an argument or use lex_error(),
408 because the result would ordinarily just be redundant: "Syntax error at
409 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
410 not help the user find the error. */
412 lex_sbc_only_once (const char *sbc)
414 msg (SE, _("Subcommand %s may only be specified once."), sbc);
417 /* Reports an error to the effect that subcommand SBC is missing.
419 This function does not take a lexer as an argument or use lex_error(),
420 because a missing subcommand can normally be detected only after the whole
421 command has been parsed, and so lex_error() would always report "Syntax
422 error at end of command", which does not help the user find the error. */
424 lex_sbc_missing (const char *sbc)
426 msg (SE, _("Required subcommand %s was not specified."), sbc);
429 /* Reports an error to the effect that specification SPEC may only be specified
430 once within subcommand SBC. */
432 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
434 lex_error (lexer, _("%s may only be specified once within subcommand %s"),
438 /* Reports an error to the effect that specification SPEC is missing within
441 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
443 lex_error (lexer, _("Required %s specification missing from %s subcommand"),
447 /* Prints a syntax error message containing the current token and
448 given message MESSAGE (if non-null). */
450 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
451 const char *format, va_list args)
/* If a source is active, let it report with location info; otherwise we are
   at end of input and build a location-less message ourselves. */
453 struct lex_source *src = lex_source__ (lexer);
456 lex_source_error_valist (src, n0, n1, format, args);
462 ds_put_format (&s, _("Syntax error at end of input"));
465 ds_put_cstr (&s, ": ");
466 ds_put_vformat (&s, format, args);
468 ds_put_byte (&s, '.');
469 msg (SE, "%s", ds_cstr (&s));
474 /* Checks that we're at end of command.
475 If so, returns a successful command completion code.
476 If not, flags a syntax error and returns an error command
479 lex_end_of_command (struct lexer *lexer)
481 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
483 lex_error (lexer, _("expecting end of command"));
490 /* Token testing functions. */
/* The lex_is_*/lex_number/lex_integer functions below are thin wrappers
   around their lex_next_* counterparts with lookahead distance 0, i.e. they
   examine the current token. */
492 /* Returns true if the current token is a number. */
494 lex_is_number (const struct lexer *lexer)
496 return lex_next_is_number (lexer, 0);
499 /* Returns true if the current token is a string. */
501 lex_is_string (const struct lexer *lexer)
503 return lex_next_is_string (lexer, 0);
506 /* Returns the value of the current token, which must be a
507 floating point number. */
509 lex_number (const struct lexer *lexer)
511 return lex_next_number (lexer, 0);
514 /* Returns true iff the current token is an integer. */
516 lex_is_integer (const struct lexer *lexer)
518 return lex_next_is_integer (lexer, 0);
521 /* Returns the value of the current token, which must be an
524 lex_integer (const struct lexer *lexer)
526 return lex_next_integer (lexer, 0);
529 /* Token testing functions with lookahead.
531 A value of 0 for N as an argument to any of these functions refers to the
532 current token. Lookahead is limited to the current command. Any N greater
533 than the number of tokens remaining in the current command will be treated
534 as referring to a T_ENDCMD token. */
536 /* Returns true if the token N ahead of the current token is a number. */
538 lex_next_is_number (const struct lexer *lexer, int n)
540 return token_is_number (lex_next (lexer, n));
543 /* Returns true if the token N ahead of the current token is a string. */
545 lex_next_is_string (const struct lexer *lexer, int n)
547 return token_is_string (lex_next (lexer, n));
550 /* Returns the value of the token N ahead of the current token, which must be a
551 floating point number. */
553 lex_next_number (const struct lexer *lexer, int n)
555 return token_number (lex_next (lexer, n));
558 /* Returns true if the token N ahead of the current token is an integer. */
560 lex_next_is_integer (const struct lexer *lexer, int n)
562 return token_is_integer (lex_next (lexer, n));
565 /* Returns the value of the token N ahead of the current token, which must be
568 lex_next_integer (const struct lexer *lexer, int n)
570 return token_integer (lex_next (lexer, n));
573 /* Token matching functions. */
575 /* If the current token has the specified TYPE, skips it and returns true.
576 Otherwise, returns false. */
578 lex_match (struct lexer *lexer, enum token_type type)
580 if (lex_token (lexer) == type)
589 /* If the current token matches IDENTIFIER, skips it and returns true.
590 IDENTIFIER may be abbreviated to its first three letters. Otherwise,
593 IDENTIFIER must be an ASCII string. */
595 lex_match_id (struct lexer *lexer, const char *identifier)
597 return lex_match_id_n (lexer, identifier, 3);
600 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
601 may be abbreviated to its first N letters. Otherwise, returns false.
603 IDENTIFIER must be an ASCII string. */
605 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
607 if (lex_token (lexer) == T_ID
608 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
617 /* If the current token is integer X, skips it and returns true. Otherwise,
620 lex_match_int (struct lexer *lexer, int x)
622 if (lex_is_integer (lexer) && lex_integer (lexer) == x)
631 /* Forced matches. */
633 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
634 abbreviated to its first 3 letters. Otherwise, reports an error and returns
637 IDENTIFIER must be an ASCII string. */
639 lex_force_match_id (struct lexer *lexer, const char *identifier)
641 if (lex_match_id (lexer, identifier))
645 lex_error_expecting (lexer, identifier);
650 /* If the current token has the specified TYPE, skips it and returns true.
651 Otherwise, reports an error and returns false. */
653 lex_force_match (struct lexer *lexer, enum token_type type)
655 if (lex_token (lexer) == type)
/* On mismatch, report using the punctuator's string form quoted in
   backticks when one exists, otherwise the token type's name. */
662 const char *type_string = token_type_to_string (type);
665 char *s = xasprintf ("`%s'", type_string);
666 lex_error_expecting (lexer, s);
670 lex_error_expecting (lexer, token_type_to_name (type));
676 /* If the current token is a string, does nothing and returns true.
677 Otherwise, reports an error and returns false. */
679 lex_force_string (struct lexer *lexer)
681 if (lex_is_string (lexer))
685 lex_error (lexer, _("expecting string"));
690 /* If the current token is a string or an identifier, does nothing and returns
691 true. Otherwise, reports an error and returns false.
693 This is meant for use in syntactic situations where we want to encourage the
694 user to supply a quoted string, but for compatibility we also accept
695 identifiers. (One example of such a situation is file names.) Therefore,
696 the error message issued when the current token is wrong only says that a
697 string is expected and doesn't mention that an identifier would also be
700 lex_force_string_or_id (struct lexer *lexer)
702 return lex_token (lexer) == T_ID || lex_force_string (lexer);
705 /* If the current token is an integer, does nothing and returns true.
706 Otherwise, reports an error and returns false. */
708 lex_force_int (struct lexer *lexer)
710 if (lex_is_integer (lexer))
714 lex_error (lexer, _("expecting integer"));
719 /* If the current token is an integer in the range MIN...MAX (inclusive), does
720 nothing and returns true. Otherwise, reports an error and returns false.
721 If NAME is nonnull, then it is used in the error message. */
723 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
725 bool is_integer = lex_is_integer (lexer);
726 bool too_small = is_integer && lex_integer (lexer) < min;
727 bool too_big = is_integer && lex_integer (lexer) > max;
728 if (is_integer && !too_small && !too_big)
/* The remainder chooses the most helpful message: degenerate ranges
   (min == max, or exactly two values) get exact expectations; otherwise a
   bound is mentioned only when it is informative (not effectively
   unbounded) or when it was actually violated, with special wording for
   lower bounds of 0 ("non-negative") and 1 ("positive"). */
733 /* Weird, maybe a bug in the caller. Just report that we needed an
736 lex_error (lexer, _("Integer expected for %s."), name);
738 lex_error (lexer, _("Integer expected."));
743 lex_error (lexer, _("Expected %ld for %s."), min, name);
745 lex_error (lexer, _("Expected %ld."), min);
747 else if (min + 1 == max)
750 lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
752 lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
756 bool report_lower_bound = (min > INT_MIN / 2) || too_small;
757 bool report_upper_bound = (max < INT_MAX / 2) || too_big;
759 if (report_lower_bound && report_upper_bound)
763 _("Expected integer between %ld and %ld for %s."),
766 lex_error (lexer, _("Expected integer between %ld and %ld."),
769 else if (report_lower_bound)
774 lex_error (lexer, _("Expected non-negative integer for %s."),
777 lex_error (lexer, _("Expected non-negative integer."));
782 lex_error (lexer, _("Expected positive integer for %s."),
785 lex_error (lexer, _("Expected positive integer."));
788 else if (report_upper_bound)
792 _("Expected integer less than or equal to %ld for %s."),
795 lex_error (lexer, _("Expected integer less than or equal to %ld."),
801 lex_error (lexer, _("Integer expected for %s."), name);
803 lex_error (lexer, _("Integer expected."));
809 /* If the current token is a number, does nothing and returns true.
810 Otherwise, reports an error and returns false. */
812 lex_force_num (struct lexer *lexer)
814 if (lex_is_number (lexer))
817 lex_error (lexer, _("expecting number"));
821 /* If the current token is an identifier, does nothing and returns true.
822 Otherwise, reports an error and returns false. */
824 lex_force_id (struct lexer *lexer)
826 if (lex_token (lexer) == T_ID)
829 lex_error (lexer, _("expecting identifier"));
833 /* Token accessors. */
835 /* Returns the type of LEXER's current token. */
837 lex_token (const struct lexer *lexer)
839 return lex_next_token (lexer, 0);
842 /* Returns the number in LEXER's current token.
844 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
845 tokens this function will always return zero. */
847 lex_tokval (const struct lexer *lexer)
849 return lex_next_tokval (lexer, 0);
852 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
854 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
855 this functions this function will always return NULL.
857 The UTF-8 encoding of the returned string is correct for variable names and
858 other identifiers. Use filename_to_utf8() to use it as a filename. Use
859 data_in() to use it in a "union value". */
861 lex_tokcstr (const struct lexer *lexer)
863 return lex_next_tokcstr (lexer, 0);
866 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
867 null-terminated (but the null terminator is not included in the returned
868 substring's 'length').
870 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
871 this functions this function will always return NULL.
873 The UTF-8 encoding of the returned string is correct for variable names and
874 other identifiers. Use filename_to_utf8() to use it as a filename. Use
875 data_in() to use it in a "union value". */
877 lex_tokss (const struct lexer *lexer)
879 return lex_next_tokss (lexer, 0);
884 A value of 0 for N as an argument to any of these functions refers to the
885 current token. Lookahead is limited to the current command. Any N greater
886 than the number of tokens remaining in the current command will be treated
887 as referring to a T_ENDCMD token. */
/* Internal lookahead: returns the token N past the current one from the
   active source.  NOTE(review): the fallback (presumably &stop_token when
   there is no source) is not visible in this excerpt. */
889 static const struct lex_token *
890 lex_next__ (const struct lexer *lexer_, int n)
892 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
893 struct lex_source *src = lex_source__ (lexer);
896 return lex_source_next__ (src, n);
/* Sentinel token returned when no real token is available. */
899 static const struct lex_token stop_token = { .token = { .type = T_STOP } };
/* Returns the newest (most recently scanned) token in SRC's queue. */
904 static const struct lex_token *
905 lex_source_front (const struct lex_source *src)
907 return &src->tokens[deque_front (&src->deque, 0)];
/* Ensures at least N+1 tokens are queued, scanning more as needed but never
   reading past a T_STOP or T_ENDCMD (lookahead stops at end of command),
   then returns the token N past the oldest queued token. */
910 static const struct lex_token *
911 lex_source_next__ (const struct lex_source *src, int n)
913 while (deque_count (&src->deque) <= n)
915 if (!deque_is_empty (&src->deque))
917 const struct lex_token *front = lex_source_front (src);
918 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
922 lex_source_get (src);
925 return &src->tokens[deque_back (&src->deque, n)];
928 /* Returns the "struct token" of the token N after the current one in LEXER.
929 The returned pointer can be invalidated by pretty much any succeeding call
930 into the lexer, although the string pointer within the returned token is
931 only invalidated by consuming the token (e.g. with lex_get()). */
933 lex_next (const struct lexer *lexer, int n)
935 return &lex_next__ (lexer, n)->token;
938 /* Returns the type of the token N after the current one in LEXER. */
940 lex_next_token (const struct lexer *lexer, int n)
942 return lex_next (lexer, n)->type;
945 /* Returns the number in the tokn N after the current one in LEXER.
947 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
948 tokens this function will always return zero. */
950 lex_next_tokval (const struct lexer *lexer, int n)
952 return token_number (lex_next (lexer, n));
955 /* Returns the null-terminated string in the token N after the current one, in
958 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
959 this functions this function will always return NULL.
961 The UTF-8 encoding of the returned string is correct for variable names and
962 other identifiers. Use filename_to_utf8() to use it as a filename. Use
963 data_in() to use it in a "union value". */
965 lex_next_tokcstr (const struct lexer *lexer, int n)
967 return lex_next_tokss (lexer, n).string;
970 /* Returns the string in the token N after the current one, in UTF-8 encoding.
971 The string is null-terminated (but the null terminator is not included in
972 the returned substring's 'length').
974 Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings. For other
975 tokens this functions this function will always return NULL.
977 The UTF-8 encoding of the returned string is correct for variable names and
978 other identifiers. Use filename_to_utf8() to use it as a filename. Use
979 data_in() to use it in a "union value". */
981 lex_next_tokss (const struct lexer *lexer, int n)
983 return lex_next (lexer, n)->string;
/* Returns the raw syntax (caller-owned string) spanning tokens N0 through N1
   after the current one. */
987 lex_next_representation (const struct lexer *lexer, int n0, int n1)
989 return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
/* Returns true if the token N ahead came from macro expansion rather than
   directly from the source text. */
993 lex_next_is_from_macro (const struct lexer *lexer, int n)
995 return lex_next__ (lexer, n)->macro_rep != NULL;
/* Compares ACTUAL against EXPECTED: types must match, then numbers by value,
   identifiers by abbreviation-aware lex_id_match(), strings byte-for-byte.
   NOTE(review): the default case for other token types is not visible in
   this excerpt. */
999 lex_tokens_match (const struct token *actual, const struct token *expected)
1001 if (actual->type != expected->type)
1004 switch (actual->type)
1008 return actual->number == expected->number;
1011 return lex_id_match (expected->string, actual->string);
1014 return (actual->string.length == expected->string.length
1015 && !memcmp (actual->string.string, expected->string.string,
1016 actual->string.length));
1023 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1024 skips it and returns true. Otherwise, returns false.
1026 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1027 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
1028 first three letters. */
1030 lex_match_phrase (struct lexer *lexer, const char *s)
1032 struct string_lexer slex;
/* Tokenize S on the fly with a string lexer and compare each resulting
   token against successive lookahead tokens. */
1037 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1038 while (string_lexer_next (&slex, &token))
1039 if (token.type != SCAN_SKIP)
1041 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1042 token_uninit (&token);
/* Returns the 1-based first line number of the token N past the oldest in
   SRC, as recorded when the token was scanned. */
1053 lex_source_get_first_line_number (const struct lex_source *src, int n)
1055 return lex_source_next__ (src, n)->first_line;
/* Counts '\n' bytes in the LENGTH bytes starting at S. */
1059 count_newlines (char *s, size_t length)
1064 while ((newline = memchr (s, '\n', length)) != NULL)
1067 length -= (newline + 1) - s;
/* Returns one past the last line number covered by token N's source text
   (tokens can span lines, e.g. multi-segment strings joined with "+"). */
1075 lex_source_get_last_line_number (const struct lex_source *src, int n)
1077 const struct lex_token *token = lex_source_next__ (src, n);
1079 if (token->first_line == 0)
1083 char *token_str = &src->buffer[token->token_pos - src->tail];
1084 return token->first_line + count_newlines (token_str, token->token_len) + 1;
/* Returns the display width, in columns, of the LENGTH-byte UTF-8 text at
   S_: sums uc_width() per code point (CJK = 2, combining = 0) and expands
   tabs to the next multiple of 8. */
1089 count_columns (const char *s_, size_t length)
1091 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1097 for (ofs = 0; ofs < length; ofs += mblen)
1101 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1104 int width = uc_width (uc, "UTF-8");
1109 columns = ROUND_UP (columns + 1, 8);
/* Returns the 1-based display column where token N begins, measured from
   the start of its line. */
1116 lex_source_get_first_column (const struct lex_source *src, int n)
1118 const struct lex_token *token = lex_source_next__ (src, n);
1119 return count_columns (&src->buffer[token->line_pos - src->tail],
1120 token->token_pos - token->line_pos);
/* Returns the 1-based display column just past the end of token N, measured
   from the start of the last line the token touches. */
1124 lex_source_get_last_column (const struct lex_source *src, int n)
1126 const struct lex_token *token = lex_source_next__ (src, n);
1127 char *start, *end, *newline;
1129 start = &src->buffer[token->line_pos - src->tail];
1130 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1131 newline = memrchr (start, '\n', end - start);
1132 if (newline != NULL)
1133 start = newline + 1;
1134 return count_columns (start, end - start);
1137 /* Returns the 1-based line number of the start of the syntax that represents
1138 the token N after the current one in LEXER. Returns 0 for a T_STOP token or
1139 if the token is drawn from a source that does not have line numbers. */
1141 lex_get_first_line_number (const struct lexer *lexer, int n)
1143 const struct lex_source *src = lex_source__ (lexer);
1144 return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1147 /* Returns the 1-based line number of the end of the syntax that represents the
1148 token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1149 token or if the token is drawn from a source that does not have line
1152 Most of the time, a single token is wholly within a single line of syntax,
1153 but there are two exceptions: a T_STRING token can be made up of multiple
1154 segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1155 token can consist of a "-" on one line followed by the number on the next.
1158 lex_get_last_line_number (const struct lexer *lexer, int n)
1160 const struct lex_source *src = lex_source__ (lexer);
1161 return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1164 /* Returns the 1-based column number of the start of the syntax that represents
1165 the token N after the current one in LEXER. Returns 0 for a T_STOP
1168 Column numbers are measured according to the width of characters as shown in
1169 a typical fixed-width font, in which CJK characters have width 2 and
1170 combining characters have width 0. */
1172 lex_get_first_column (const struct lexer *lexer, int n)
1174 const struct lex_source *src = lex_source__ (lexer);
1175 return src != NULL ? lex_source_get_first_column (src, n) : 0;
1178 /* Returns the 1-based column number of the end of the syntax that represents
1179 the token N after the current one in LEXER, plus 1. Returns 0 for a T_STOP
1182 Column numbers are measured according to the width of characters as shown in
1183 a typical fixed-width font, in which CJK characters have width 2 and
1184 combining characters have width 0. */
1186 lex_get_last_column (const struct lexer *lexer, int n)
1188 const struct lex_source *src = lex_source__ (lexer);
1189 return src != NULL ? lex_source_get_last_column (src, n) : 0;
1192 /* Returns the name of the syntax file from which the current command is drawn.
1193 Returns NULL for a T_STOP token or if the command's source does not have
1196 There is no version of this function that takes an N argument because
1197 lookahead only works to the end of a command and any given command is always
1198 within a single syntax file. */
1200 lex_get_file_name (const struct lexer *lexer)
1202 struct lex_source *src = lex_source__ (lexer);
1203 return src == NULL ? NULL : src->reader->file_name;
1207 lex_get_encoding (const struct lexer *lexer)
1209 struct lex_source *src = lex_source__ (lexer);
1210 return src == NULL ? NULL : src->reader->encoding;
1213 /* Returns the syntax mode for the syntax file from which the current drawn is
1214 drawn. Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1215 does not have line numbers.
1217 There is no version of this function that takes an N argument because
1218 lookahead only works to the end of a command and any given command is always
1219 within a single syntax file. */
1221 lex_get_syntax_mode (const struct lexer *lexer)
1223 struct lex_source *src = lex_source__ (lexer);
1224 return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1227 /* Returns the error mode for the syntax file from which the current drawn is
1228 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1229 source does not have line numbers.
1231 There is no version of this function that takes an N argument because
1232 lookahead only works to the end of a command and any given command is always
1233 within a single syntax file. */
1235 lex_get_error_mode (const struct lexer *lexer)
1237 struct lex_source *src = lex_source__ (lexer);
1238 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1241 /* If the source that LEXER is currently reading has error mode
1242 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1243 token to be read comes directly from whatever is next read from the stream.
1245 It makes sense to call this function after encountering an error in a
1246 command entered on the console, because usually the user would prefer not to
1247 have cascading errors. */
1249 lex_interactive_reset (struct lexer *lexer)
1251 struct lex_source *src = lex_source__ (lexer);
1252 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
/* Reset every buffer offset and counter to a clean slate. */
1254 src->head = src->tail = 0;
1255 src->journal_pos = src->seg_pos = src->line_pos = 0;
1256 src->n_newlines = 0;
1257 src->suppress_next_newline = false;
/* Re-initialize the segmenter, preserving its configured mode. */
1258 src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
/* Drop every buffered token, then restore the T_ENDCMD sentinel. */
1260 while (!deque_is_empty (&src->deque))
1261 lex_source_pop__ (src);
1262 lex_source_push_endcmd__ (src);
1266 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1268 lex_discard_rest_of_command (struct lexer *lexer)
/* Keep consuming tokens until the current one terminates the command or the
   input. */
1270 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1274 /* Discards all lookahead tokens in LEXER, then discards all input sources
1275 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1276 runs out of input sources. */
1278 lex_discard_noninteractive (struct lexer *lexer)
1280 struct lex_source *src = lex_source__ (lexer);
/* Drop every buffered lookahead token in the current source. */
1284 while (!deque_is_empty (&src->deque))
1285 lex_source_pop__ (src);
/* Destroy sources until we reach one whose error mode is terminal (or none
   are left); lex_source__() re-fetches the new head each iteration. */
1287 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1288 src = lex_source__ (lexer))
1289 lex_source_destroy (src);
/* Returns the greatest offset to which SRC's tail may safely advance: the
   minimum of the journal position, the current line start, and the line start
   of the oldest buffered token. */
1294 lex_source_max_tail__ (const struct lex_source *src)
1296 const struct lex_token *token;
1299 assert (src->seg_pos >= src->line_pos);
1300 max_tail = MIN (src->journal_pos, src->line_pos);
1302 /* Use the oldest token also. (We know that src->deque cannot be empty
1303 because we are in the process of adding a new token, which is already
1304 initialized enough to use here.) */
1305 token = &src->tokens[deque_back (&src->deque, 0)];
1306 assert (token->token_pos >= token->line_pos);
1307 max_tail = MIN (max_tail, token->line_pos);
/* Ensures that SRC's buffer has free space at its head, first by advancing
   the tail past data no longer needed, and only if that yields nothing by
   enlarging the buffer. */
1313 lex_source_expand__ (struct lex_source *src)
1315 if (src->head - src->tail >= src->allocated)
1317 size_t max_tail = lex_source_max_tail__ (src);
1318 if (max_tail > src->tail)
1320 /* Advance the tail, freeing up room at the head. */
1321 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1322 src->head - max_tail);
1323 src->tail = max_tail;
1327 /* Buffer is completely full. Expand it. */
1328 src->buffer = x2realloc (src->buffer, &src->allocated);
1333 /* There's space available at the head of the buffer. Nothing to do. */
/* Reads more input from SRC's underlying lex_reader into SRC's buffer,
   looping until the buffered data beyond seg_pos contains a new-line or the
   reader reports end of input. */
1338 lex_source_read__ (struct lex_source *src)
/* Make room, then append one chunk from the reader each iteration. */
1342 lex_source_expand__ (src);
1344 size_t head_ofs = src->head - src->tail;
1345 size_t space = src->allocated - head_ofs;
1346 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1347 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1349 assert (n <= space);
/* A zero-byte read marks end of input on this reader. */
1354 src->reader->eof = true;
1355 lex_source_expand__ (src);
1361 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1362 src->head - src->seg_pos));
1365 static struct lex_source *
/* Returns the lex_source at the head of LEXER's source list, or NULL if the
   list is empty. */
1366 lex_source__ (const struct lexer *lexer)
1368 return (ll_is_empty (&lexer->sources) ? NULL
1369 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
/* Returns the syntax text for lookahead tokens N0 through N1 in SRC as a
   malloc'd string, joining consecutive runs with single spaces.  Ordinary
   tokens are copied from SRC's buffer; macro-expanded tokens are copied from
   their expansion text.  The caller owns the returned string. */
1373 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1375 struct string s = DS_EMPTY_INITIALIZER;
1376 for (size_t i = n0; i <= n1; )
1378 /* Find [I,J) as the longest sequence of tokens not produced by macro
1379 expansion, or otherwise the longest sequence expanded from a single
1381 const struct lex_token *first = lex_source_next__ (src, i);
1383 for (j = i + 1; j <= n1; j++)
1385 const struct lex_token *cur = lex_source_next__ (src, j);
/* A run breaks where macro-ness changes or a different macro begins. */
1386 if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1387 || first->macro_rep != cur->macro_rep)
1390 const struct lex_token *last = lex_source_next__ (src, j - 1);
/* Separate runs with a single space. */
1392 if (!ds_is_empty (&s))
1393 ds_put_byte (&s, ' ');
1394 if (!first->macro_rep)
/* Ordinary tokens: copy the original source text out of the buffer. */
1396 size_t start = first->token_pos;
1397 size_t end = last->token_pos + last->token_len;
1398 ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
/* Macro-expanded tokens: copy from the shared expansion string. */
1403 size_t start = first->ofs;
1404 size_t end = last->ofs + last->len;
1405 ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1411 return ds_steal_cstr (&s);
/* Copies UTF-8 text IN into OUT, which has room for OUT_SIZE bytes (at least
   16), truncating at the first new-line, null byte, or CR-LF, or at a UTF-8
   character boundary if the text will not fit, appending "..." when anything
   was cut off. */
1415 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1421 assert (out_size >= 16);
1422 out_maxlen = out_size - 1;
/* Reserve room for the "..." ellipsis when the whole input cannot fit. */
1423 if (in.length > out_maxlen - 3)
1426 for (out_len = 0; out_len < in.length; out_len += mblen)
/* Stop copying at end-of-line or an embedded null byte. */
1428 if (in.string[out_len] == '\n'
1429 || in.string[out_len] == '\0'
1430 || (in.string[out_len] == '\r'
1431 && out_len + 1 < in.length
1432 && in.string[out_len + 1] == '\n'))
/* Advance by whole UTF-8 characters so truncation never splits one. */
1435 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1436 in.length - out_len);
1441 if (out_len + mblen > out_maxlen)
1445 memcpy (out, in.string, out_len);
/* Append "..." only if the input was actually truncated. */
1446 strcpy (&out[out_len], out_len < in.length ? "..." : "");
/* Returns true if any of lookahead tokens N0 through N1 in SRC was produced
   by macro expansion (has a nonnull macro_rep), false otherwise. */
1450 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1452 for (size_t i = n0; i <= n1; i++)
1453 if (lex_source_next__ (src, i)->macro_rep)
1458 static struct substring
/* Returns the source text of the macro call(s) whose expansion produced
   lookahead tokens N0 through N1 in SRC, as a substring of SRC's buffer.
   Returns an empty substring if none of those tokens came from a macro. */
1459 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1461 if (!lex_source_contains_macro_call (src, n0, n1))
1464 const struct lex_token *token0 = lex_source_next__ (src, n0);
1465 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1466 size_t start = token0->token_pos;
1467 size_t end = token1->token_pos + token1->token_len;
1469 return ss_buffer (&src->buffer[start - src->tail], end - start);
/* Composes and emits a syntax-error message covering lookahead tokens N0
   through N1 in SRC, appending text formatted from FORMAT and ARGS.  The
   message names the offending syntax, the macro call it expanded from (if
   any), and the source file location. */
1473 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1474 const char *format, va_list args)
1476 const struct lex_token *token;
1481 token = lex_source_next__ (src, n0);
1482 if (token->token.type == T_ENDCMD)
1483 ds_put_cstr (&s, _("Syntax error at end of command"));
1486 /* Get the syntax that caused the error. */
1487 char *syntax = lex_source_get_syntax__ (src, n0, n1);
1488 char syntax_cstr[64];
1489 lex_ellipsize__ (ss_cstr (syntax), syntax_cstr, sizeof syntax_cstr);
1492 /* Get the macro call(s) that expanded to the syntax that caused the
1495 struct substring call = lex_source_get_macro_call (src, n0, n1);
1496 lex_ellipsize__ (call, call_cstr, sizeof call_cstr);
/* Pick the message form based on which pieces of context are nonempty. */
1501 ds_put_format (&s, _("Syntax error at `%s' "
1502 "(in expansion of `%s')"),
1503 syntax_cstr, call_cstr);
1505 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1507 else if (call_cstr[0])
1508 ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1511 ds_put_cstr (&s, _("Syntax error"));
/* Append the caller's formatted detail text. */
1516 ds_put_cstr (&s, ": ");
1517 ds_put_vformat (&s, format, args);
/* Make sure the message ends with a period. */
1519 if (ds_last (&s) != '.')
1520 ds_put_byte (&s, '.');
/* Emit the finished message with its source location attached. */
1523 .category = MSG_C_SYNTAX,
1524 .severity = MSG_S_ERROR,
1525 .file_name = src->reader->file_name,
1526 .first_line = lex_source_get_first_line_number (src, n0),
1527 .last_line = lex_source_get_last_line_number (src, n1),
1528 .first_column = lex_source_get_first_column (src, n0),
1529 .last_column = lex_source_get_last_column (src, n1),
1530 .text = ds_steal_cstr (&s),
1535 static void PRINTF_FORMAT (2, 3)
/* Reports an error, formatted from FORMAT and the following arguments,
   against the most recently scanned token in SRC, then discards that token
   from the deque. */
1536 lex_get_error (struct lex_source *src, const char *format, ...)
1541 va_start (args, format);
/* The newest token sits at the front of the deque. */
1543 n = deque_count (&src->deque) - 1;
1544 lex_source_error_valist (src, n, n, format, args);
1545 lex_source_pop_front (src);
1550 /* Attempts to append an additional token into SRC's deque, reading more from
1551 the underlying lex_reader if necessary. Returns true if a new token was
1552 added to SRC's deque, false otherwise. */
1554 lex_source_try_get (struct lex_source *src)
1556 /* State maintained while scanning tokens. Usually we only need a single
1557 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1558 needs to be saved and possibly restored later with SCAN_BACK. */
1561 struct segmenter segmenter;
1562 enum segment_type last_segment;
1563 int newlines; /* Number of newlines encountered so far. */
1564 /* Maintained here so we can update lex_source's similar members when we
1570 /* Initialize state. */
1571 struct state state =
1573 .segmenter = src->segmenter,
1575 .seg_pos = src->seg_pos,
1576 .line_pos = src->line_pos,
/* SAVED is restored if the scanner later requests SCAN_BACK. */
1578 struct state saved = state;
1580 /* Append a new token to SRC and initialize it. */
1581 struct lex_token *token = lex_push_token__ (src);
1582 struct scanner scanner;
1583 scanner_init (&scanner, &token->token);
1584 token->line_pos = src->line_pos;
1585 token->token_pos = src->seg_pos;
1586 if (src->reader->line_number > 0)
1587 token->first_line = src->reader->line_number + src->n_newlines;
1589 token->first_line = 0;
1591 /* Extract segments and pass them through the scanner until we obtain a
1595 /* Extract a segment. */
1596 const char *segment = &src->buffer[state.seg_pos - src->tail];
1597 size_t seg_maxlen = src->head - state.seg_pos;
1598 enum segment_type type;
1599 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1600 src->reader->eof, &type);
1603 /* The segmenter needs more input to produce a segment. */
1604 assert (!src->reader->eof);
1605 lex_source_read__ (src);
1609 /* Update state based on the segment. */
1610 state.last_segment = type;
1611 state.seg_pos += seg_len;
1612 if (type == SEG_NEWLINE)
1615 state.line_pos = state.seg_pos;
1618 /* Pass the segment into the scanner and try to get a token out. */
1619 enum scan_result result = scanner_push (&scanner, type,
1620 ss_buffer (segment, seg_len),
1622 if (result == SCAN_SAVE)
1624 else if (result == SCAN_BACK)
1629 else if (result == SCAN_DONE)
1633 /* If we've reached the end of a line, or the end of a command, then pass
1634 the line to the output engine as a syntax text item. */
1635 int n_lines = state.newlines;
1636 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1639 src->suppress_next_newline = true;
1641 else if (n_lines > 0 && src->suppress_next_newline)
1644 src->suppress_next_newline = false;
1646 for (int i = 0; i < n_lines; i++)
1648 /* Beginning of line. */
1649 const char *line = &src->buffer[src->journal_pos - src->tail];
1651 /* Calculate line length, including \n or \r\n end-of-line if present.
1653 We use src->head even though that may be beyond what we've actually
1654 converted to tokens (which is only through state.line_pos). That's
1655 because, if we're emitting the line due to SEG_END_COMMAND, we want to
1656 take the whole line through the newline, not just through the '.'. */
1657 size_t max_len = src->head - src->journal_pos;
1658 const char *newline = memchr (line, '\n', max_len);
1659 size_t line_len = newline ? newline - line + 1 : max_len;
1661 /* Calculate line length excluding end-of-line. */
1662 size_t copy_len = line_len;
1663 if (copy_len > 0 && line[copy_len - 1] == '\n')
1665 if (copy_len > 0 && line[copy_len - 1] == '\r')
1668 /* Submit the line as syntax. */
1669 output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1670 xmemdup0 (line, copy_len),
1673 src->journal_pos += line_len;
/* Commit the scan's final state back into SRC. */
1676 token->token_len = state.seg_pos - src->seg_pos;
1678 src->segmenter = state.segmenter;
1679 src->seg_pos = state.seg_pos;
1680 src->line_pos = state.line_pos;
1681 src->n_newlines += state.newlines;
/* Translate scan-level problems into user-visible diagnostics; each
   lex_get_error() call also discards the offending token. */
1683 switch (token->token.type)
1689 token->token.type = T_ENDCMD;
1693 case SCAN_BAD_HEX_LENGTH:
1694 lex_get_error (src, _("String of hex digits has %d characters, which "
1695 "is not a multiple of 2"),
1696 (int) token->token.number);
1699 case SCAN_BAD_HEX_DIGIT:
1700 case SCAN_BAD_UNICODE_DIGIT:
1701 lex_get_error (src, _("`%c' is not a valid hex digit"),
1702 (int) token->token.number);
1705 case SCAN_BAD_UNICODE_LENGTH:
1706 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1707 "not in the valid range of 1 to 8 bytes"),
1708 (int) token->token.number);
1711 case SCAN_BAD_UNICODE_CODE_POINT:
1712 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1713 (int) token->token.number);
1716 case SCAN_EXPECTED_QUOTE:
1717 lex_get_error (src, _("Unterminated string constant"));
1720 case SCAN_EXPECTED_EXPONENT:
1721 lex_get_error (src, _("Missing exponent following `%s'"),
1722 token->token.string.string);
1725 case SCAN_UNEXPECTED_CHAR:
1728 lex_get_error (src, _("Bad character %s in input"),
1729 uc_name (token->token.number, c_name));
1734 lex_source_pop_front (src);
/* Adds one new token to SRC's deque by retrying lex_source_try_get() until it
   succeeds or input is exhausted; returns whether a token was added. */
1742 lex_source_get__ (struct lex_source *src)
1748 else if (lex_source_try_get (src))
/* Obtains one more token for SRC_, expanding any macro call that begins at
   the new token when macro expansion (SET MEXPAND) is enabled: the tokens of
   the call are replaced in the deque by the tokens of its expansion, all of
   which share the call's source position and a reference-counted copy of the
   expansion's text representation. */
1754 lex_source_get (const struct lex_source *src_)
1756 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1758 size_t old_count = deque_count (&src->deque);
1759 if (!lex_source_get__ (src))
/* Macro expansion disabled: keep the token as scanned. */
1762 if (!settings_get_mexpand ())
/* Does the new token start a macro call? */
1765 struct macro_expander *me;
1766 int retval = macro_expander_create (src->lexer->macros,
1767 &lex_source_front (src)->token,
/* Feed additional tokens to the expander until it finds the end of the
   macro call. */
1771 if (!lex_source_get__ (src))
1773 /* This should not be reachable because we always get a T_ENDCMD at
1774 the end of an input file (transformed from T_STOP by
1775 lex_source_try_get()) and the macro_expander should always
1776 terminate expansion on T_ENDCMD. */
1780 const struct lex_token *front = lex_source_front (src);
1781 size_t start = front->token_pos;
1782 size_t end = front->token_pos + front->token_len;
1783 const struct macro_token mt = {
1784 .token = front->token,
1785 .representation = ss_buffer (&src->buffer[start - src->tail],
1788 retval = macro_expander_add (me, &mt);
1792 /* XXX handle case where there's a macro invocation starting from some
1793 later token we've already obtained */
1794 macro_expander_destroy (me);
1798 /* XXX handle case where the macro invocation doesn't use all the tokens */
/* Record the source extent of the whole macro call, then discard the
   call's tokens from the deque. */
1799 const struct lex_token *call_first = lex_source_next__ (src, old_count);
1800 const struct lex_token *call_last = lex_source_front (src);
1801 size_t call_pos = call_first->token_pos;
1802 size_t call_len = (call_last->token_pos + call_last->token_len) - call_pos;
1803 size_t line_pos = call_first->line_pos;
1804 int first_line = call_first->first_line;
1805 while (deque_count (&src->deque) > old_count)
1806 lex_source_pop_front (src);
/* Obtain the expansion and its textual representation. */
1808 struct macro_tokens expansion = { .n = 0 };
1809 macro_expander_get_expansion (me, &expansion);
1810 macro_expander_destroy (me);
1812 size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1813 size_t *len = xnmalloc (expansion.n, sizeof *len);
1814 struct string s = DS_EMPTY_INITIALIZER;
1815 macro_tokens_to_representation (&expansion, &s, ofs, len);
/* With SET MPRINT, log the expansion text. */
1817 if (settings_get_mprint ())
1818 output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1819 _("Macro Expansion")));
/* Push the expansion's tokens; they share MACRO_REP via REF_CNT. */
1821 char *macro_rep = ds_steal_cstr (&s);
1822 size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1823 *ref_cnt = expansion.n;
1824 for (size_t i = 0; i < expansion.n; i++)
1826 *lex_push_token__ (src) = (struct lex_token) {
1827 .token = expansion.mts[i].token,
1828 .token_pos = call_pos,
1829 .token_len = call_len,
1830 .line_pos = line_pos,
1831 .first_line = first_line,
1832 .macro_rep = macro_rep,
1838 ss_dealloc (&expansion.mts[i].representation);
1840 free (expansion.mts);
/* Pushes a sentinel T_ENDCMD token onto SRC's deque. */
1848 lex_source_push_endcmd__ (struct lex_source *src)
1850 *lex_push_token__ (src) = (struct lex_token) { .token = { .type = T_ENDCMD } };
1853 static struct lex_source *
/* Creates and returns a new lex_source that reads from READER, taking
   ownership of it. */
1854 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1856 struct lex_source *src;
1858 src = xzalloc (sizeof *src);
1859 src->reader = reader;
1860 src->segmenter = segmenter_init (reader->syntax, false);
1862 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
/* Start with a T_ENDCMD sentinel so the deque is never empty. */
1864 lex_source_push_endcmd__ (src);
/* Destroys SRC: closes its reader, frees all of its buffered tokens, and
   removes it from its lexer's source list. */
1870 lex_source_destroy (struct lex_source *src)
/* Save these before the reader's destroy callback may free the reader. */
1872 char *file_name = src->reader->file_name;
1873 char *encoding = src->reader->encoding;
1874 if (src->reader->class->destroy != NULL)
1875 src->reader->class->destroy (src->reader);
1879 while (!deque_is_empty (&src->deque))
1880 lex_source_pop__ (src);
1882 ll_remove (&src->ll);
/* A lex_reader that reads from a file (or stdin). */
1886 struct lex_file_reader
1888 struct lex_reader reader;       /* Common lex_reader header. */
1889 struct u8_istream *istream;     /* Stream supplying the file's bytes. */
1892 static struct lex_reader_class lex_file_reader_class;
1894 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1895 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1896 ENCODING, which should take one of the forms accepted by
1897 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1898 mode of the new reader, respectively.
1900 Returns a null pointer if FILE_NAME cannot be opened. */
1902 lex_reader_for_file (const char *file_name, const char *encoding,
1903 enum segmenter_mode syntax,
1904 enum lex_error_mode error)
1906 struct lex_file_reader *r;
1907 struct u8_istream *istream;
/* "-" means read from standard input instead of a named file. */
1909 istream = (!strcmp(file_name, "-")
1910 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1911 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1912 if (istream == NULL)
1914 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1918 r = xmalloc (sizeof *r);
1919 lex_reader_init (&r->reader, &lex_file_reader_class);
1920 r->reader.syntax = syntax;
1921 r->reader.error = error;
1922 r->reader.file_name = xstrdup (file_name);
1923 r->reader.encoding = xstrdup_if_nonnull (encoding);
1924 r->reader.line_number = 1;
1925 r->istream = istream;
1930 static struct lex_file_reader *
/* Converts abstract lex_reader R into its containing lex_file_reader. */
1931 lex_file_reader_cast (struct lex_reader *r)
1933 return UP_CAST (r, struct lex_file_reader, reader);
/* lex_reader "read" callback for files: reads up to N bytes from the stream
   into BUF, reporting any read error via msg(). */
1937 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1938 enum prompt_style prompt_style UNUSED)
1940 struct lex_file_reader *r = lex_file_reader_cast (r_);
1941 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1944 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
/* lex_reader "close" callback for files: closes the underlying stream (unless
   it is stdin) and frees it. */
1951 lex_file_close (struct lex_reader *r_)
1953 struct lex_file_reader *r = lex_file_reader_cast (r_);
/* Leave stdin open; the process may still need it. */
1955 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1957 if (u8_istream_close (r->istream) != 0)
1958 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1961 u8_istream_free (r->istream);
/* Callback table implementing file-based lex_readers. */
1966 static struct lex_reader_class lex_file_reader_class =
/* A lex_reader that reads from an in-memory string. */
1972 struct lex_string_reader
1974 struct lex_reader reader;   /* Common lex_reader header. */
1979 static struct lex_reader_class lex_string_reader_class;
1981 /* Creates and returns a new lex_reader for the contents of S, which must be
1982 encoded in the given ENCODING. The new reader takes ownership of S and
1983 will free it with ss_dealloc() when it is closed. */
1985 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1987 struct lex_string_reader *r;
1989 r = xmalloc (sizeof *r);
1990 lex_reader_init (&r->reader, &lex_string_reader_class);
1991 r->reader.syntax = SEG_MODE_AUTO;
1992 r->reader.encoding = xstrdup_if_nonnull (encoding);
1999 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2000 which must be encoded in ENCODING. The caller retains ownership of S. */
2002 lex_reader_for_string (const char *s, const char *encoding)
2004 struct substring ss;
/* Copy S so the new reader owns its own storage. */
2005 ss_alloc_substring (&ss, ss_cstr (s));
2006 return lex_reader_for_substring_nocopy (ss, encoding);
2009 /* Formats FORMAT as a printf()-like format string and creates and returns a
2010 new lex_reader for the formatted result. */
2012 lex_reader_for_format (const char *format, const char *encoding, ...)
2014 struct lex_reader *r;
2017 va_start (args, encoding);
/* xvasprintf() allocates the formatted text; the nocopy reader owns it. */
2018 r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
2024 static struct lex_string_reader *
/* Converts abstract lex_reader R into its containing lex_string_reader. */
2025 lex_string_reader_cast (struct lex_reader *r)
2027 return UP_CAST (r, struct lex_string_reader, reader);
/* lex_reader "read" callback for strings: copies up to N bytes of the
   string's remaining contents into BUF. */
2031 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2032 enum prompt_style prompt_style UNUSED)
2034 struct lex_string_reader *r = lex_string_reader_cast (r_);
/* Never copy past the end of the string. */
2037 chunk = MIN (n, r->s.length - r->offset);
2038 memcpy (buf, r->s.string + r->offset, chunk);
/* lex_reader "close" callback for strings: frees the owned string storage
   and the reader itself. */
2045 lex_string_close (struct lex_reader *r_)
2047 struct lex_string_reader *r = lex_string_reader_cast (r_);
2053 static struct lex_reader_class lex_string_reader_class =