pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/output-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
/* A token within a lex_source.

   Besides the token itself, this records where the token came from, so that
   its source text can be located later. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call. */
    size_t token_pos;           /* Offset into src->buffer of token start. */
    size_t token_len;           /* Length of source for token in bytes. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' are shared: lex_token_destroy() frees both
       when the count in '*ref_cnt' drops to zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  82
  83 static void
  84 lex_token_destroy (struct lex_token *t)
  85 {
  86   token_uninit (&t->token);
  87   if (t->ref_cnt)
  88     {
  89       assert (*t->ref_cnt > 0);
  90       if (!--*t->ref_cnt)
  91         {
  92           free (t->macro_rep);
  93           free (t->ref_cnt);
  94         }
  95     }
  96   free (t);
  97 }
  98 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source. */
struct lex_stage
  {
    struct deque deque;         /* Bookkeeping for 'tokens'. */
    struct lex_token **tokens;  /* Token slots, indexed through 'deque'. */
  };
 106
/* Forward declarations for the lex_stage helper functions. */

/* Emptying and teardown. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

/* Size queries. */
static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

/* Token access. */
static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

/* Adding and removing tokens. */
static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

/* Moving tokens between stages. */
static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
 121
 122 /* Deletes all the tokens from STAGE. */
 123 static void
 124 lex_stage_clear (struct lex_stage *stage)
 125 {
 126   while (!deque_is_empty (&stage->deque))
 127     lex_stage_pop_first (stage);
 128 }
 129
 130 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 131 static void
 132 lex_stage_uninit (struct lex_stage *stage)
 133 {
 134   lex_stage_clear (stage);
 135   free (stage->tokens);
 136 }
 137
 138 /* Returns true if STAGE contains no tokens, otherwise false. */
 139 static bool
 140 lex_stage_is_empty (const struct lex_stage *stage)
 141 {
 142   return deque_is_empty (&stage->deque);
 143 }
 144
 145 /* Returns the number of tokens in STAGE. */
 146 static size_t
 147 lex_stage_count (const struct lex_stage *stage)
 148 {
 149   return deque_count (&stage->deque);
 150 }
 151
 152 /* Returns the first token in STAGE, which must be nonempty.
 153    The first token is the one accessed with the least lookahead. */
 154 static struct lex_token *
 155 lex_stage_first (struct lex_stage *stage)
 156 {
 157   return lex_stage_nth (stage, 0);
 158 }
 159
 160 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 161    lookahead) is 0, the second token is 1, and so on.  There must be at least
 162    INDEX + 1 tokens in STAGE. */
 163 static struct lex_token *
 164 lex_stage_nth (struct lex_stage *stage, size_t index)
 165 {
 166   return stage->tokens[deque_back (&stage->deque, index)];
 167 }
 168
 169 /* Adds TOKEN so that it becomes the last token in STAGE. */
 170 static void
 171 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 172 {
 173   if (deque_is_full (&stage->deque))
 174     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 175                                   sizeof *stage->tokens);
 176   stage->tokens[deque_push_front (&stage->deque)] = token;
 177 }
 178
 179 /* Removes and returns the first token from STAGE. */
 180 static struct lex_token *
 181 lex_stage_take_first (struct lex_stage *stage)
 182 {
 183   return stage->tokens[deque_pop_back (&stage->deque)];
 184 }
 185
 186 /* Removes the first token from STAGE and uninitializes it. */
 187 static void
 188 lex_stage_pop_first (struct lex_stage *stage)
 189 {
 190   lex_token_destroy (lex_stage_take_first (stage));
 191 }
 192
 193 /* Removes the first N tokens from SRC, appending them to DST as the last
 194    tokens. */
 195 static void
 196 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 197 {
 198   for (size_t i = 0; i < n; i++)
 199     lex_stage_push_last (dst, lex_stage_take_first (src));
 200 }
 201
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct lexer *lexer;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;               /* Source file contents. */
    size_t length;              /* Number of bytes filled. */
    size_t allocated;           /* Number of bytes allocated. */

    /* Offsets into 'buffer'. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'parse'.

       - parse: Tokens available to the client for parsing.

       'pp' and 'merge' store tokens only temporarily until they pass into
       'parse'.  Tokens then live in 'parse' until the command is fully
       consumed, at which time they are freed together. */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_token **parse;
    size_t n_parse, allocated_parse, parse_ofs;
  };
 250
/* Forward declarations for creating and destroying a lex_source. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 254
/* Lexer: a list of token sources plus the set of macros defined for it. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Macros added with lex_define_macro(). */
  };
 261
/* Forward declarations for internal helpers used by the functions below. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);
static void lex_source_push_parse (struct lex_source *, struct lex_token *);
static void lex_source_clear_parse (struct lex_source *);

static bool lex_source_get_parse (struct lex_source *);
/* FORMAT is a printf-style format string consumed together with the
   va_list. */
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 276 \f
 277 /* Initializes READER with the specified CLASS and otherwise some reasonable
 278    defaults.  The caller should fill in the others members as desired. */
 279 void
 280 lex_reader_init (struct lex_reader *reader,
 281                  const struct lex_reader_class *class)
 282 {
 283   reader->class = class;
 284   reader->syntax = SEG_MODE_AUTO;
 285   reader->error = LEX_ERROR_CONTINUE;
 286   reader->file_name = NULL;
 287   reader->encoding = NULL;
 288   reader->line_number = 0;
 289   reader->eof = false;
 290 }
 291
 292 /* Frees any file name already in READER and replaces it by a copy of
 293    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 294 void
 295 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 296 {
 297   free (reader->file_name);
 298   reader->file_name = xstrdup_if_nonnull (file_name);
 299 }
 300 \f
 301 /* Creates and returns a new lexer. */
 302 struct lexer *
 303 lex_create (void)
 304 {
 305   struct lexer *lexer = xmalloc (sizeof *lexer);
 306   *lexer = (struct lexer) {
 307     .sources = LL_INITIALIZER (lexer->sources),
 308     .macros = macro_set_create (),
 309   };
 310   return lexer;
 311 }
 312
 313 /* Destroys LEXER. */
 314 void
 315 lex_destroy (struct lexer *lexer)
 316 {
 317   if (lexer != NULL)
 318     {
 319       struct lex_source *source, *next;
 320
 321       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 322         lex_source_destroy (source);
 323       macro_set_destroy (lexer->macros);
 324       free (lexer);
 325     }
 326 }
 327
 328 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 329    same name.  Takes ownership of M. */
 330 void
 331 lex_define_macro (struct lexer *lexer, struct macro *m)
 332 {
 333   macro_set_add (lexer->macros, m);
 334 }
 335
 336 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 337    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 338    token. */
 339 void
 340 lex_include (struct lexer *lexer, struct lex_reader *reader)
 341 {
 342   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 343   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 344 }
 345
 346 /* Appends READER to LEXER, so that it will be read after all other current
 347    readers have already been read. */
 348 void
 349 lex_append (struct lexer *lexer, struct lex_reader *reader)
 350 {
 351   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 352 }
 353 \f
 354 /* Advancing. */
 355
 356 /* Advances LEXER to the next token, consuming the current token. */
 357 void
 358 lex_get (struct lexer *lexer)
 359 {
 360   struct lex_source *src;
 361
 362   src = lex_source__ (lexer);
 363   if (src == NULL)
 364     return;
 365
 366   if (src->parse_ofs < src->n_parse)
 367     {
 368       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 369         lex_source_clear_parse (src);
 370       else
 371         src->parse_ofs++;
 372     }
 373
 374   while (src->parse_ofs == src->n_parse)
 375     if (!lex_source_get_parse (src))
 376       {
 377         lex_source_destroy (src);
 378         src = lex_source__ (lexer);
 379         if (src == NULL)
 380           return;
 381       }
 382 }
 383
 384 /* Advances LEXER by N tokens. */
 385 void
 386 lex_get_n (struct lexer *lexer, size_t n)
 387 {
 388   while (n-- > 0)
 389     lex_get (lexer);
 390 }
 391 \f
 392 /* Issuing errors. */
 393
 394 /* Prints a syntax error message containing the current token and
 395    given message MESSAGE (if non-null). */
 396 void
 397 lex_error (struct lexer *lexer, const char *format, ...)
 398 {
 399   va_list args;
 400
 401   va_start (args, format);
 402   lex_next_error_valist (lexer, 0, 0, format, args);
 403   va_end (args);
 404 }
 405
 406 /* Prints a syntax error message containing the current token and
 407    given message MESSAGE (if non-null). */
 408 void
 409 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 410 {
 411   lex_next_error_valist (lexer, 0, 0, format, args);
 412 }
 413
 414 /* Prints a syntax error message containing the current token and
 415    given message MESSAGE (if non-null). */
 416 void
 417 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 418 {
 419   va_list args;
 420
 421   va_start (args, format);
 422   lex_next_error_valist (lexer, n0, n1, format, args);
 423   va_end (args);
 424 }
 425
 426 /* Prints a syntax error message saying that one of the strings provided as
 427    varargs, up to the first NULL, is expected. */
 428 void
 429 (lex_error_expecting) (struct lexer *lexer, ...)
 430 {
 431   va_list args;
 432
 433   va_start (args, lexer);
 434   lex_error_expecting_valist (lexer, args);
 435   va_end (args);
 436 }
 437
 438 /* Prints a syntax error message saying that one of the options provided in
 439    ARGS, up to the first NULL, is expected. */
 440 void
 441 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 442 {
 443   enum { MAX_OPTIONS = 9 };
 444   const char *options[MAX_OPTIONS];
 445   int n = 0;
 446   while (n < MAX_OPTIONS)
 447     {
 448       const char *option = va_arg (args, const char *);
 449       if (!option)
 450         break;
 451
 452       options[n++] = option;
 453     }
 454   lex_error_expecting_array (lexer, options, n);
 455 }
 456
 457 void
 458 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 459 {
 460   switch (n)
 461     {
 462     case 0:
 463       lex_error (lexer, NULL);
 464       break;
 465
 466     case 1:
 467       lex_error (lexer, _("expecting %s"), options[0]);
 468       break;
 469
 470     case 2:
 471       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 472       break;
 473
 474     case 3:
 475       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 476                  options[2]);
 477       break;
 478
 479     case 4:
 480       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 481                  options[0], options[1], options[2], options[3]);
 482       break;
 483
 484     case 5:
 485       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 486                  options[0], options[1], options[2], options[3], options[4]);
 487       break;
 488
 489     case 6:
 490       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 491                  options[0], options[1], options[2], options[3], options[4],
 492                  options[5]);
 493       break;
 494
 495     case 7:
 496       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 497                  options[0], options[1], options[2], options[3], options[4],
 498                  options[5], options[6]);
 499       break;
 500
 501     case 8:
 502       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 503                  options[0], options[1], options[2], options[3], options[4],
 504                  options[5], options[6], options[7]);
 505       break;
 506
 507     default:
 508       lex_error (lexer, NULL);
 509     }
 510 }
 511
 512 /* Reports an error to the effect that subcommand SBC may only be specified
 513    once.
 514
 515    This function does not take a lexer as an argument or use lex_error(),
 516    because the result would ordinarily just be redundant: "Syntax error at
 517    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 518    not help the user find the error. */
 519 void
 520 lex_sbc_only_once (const char *sbc)
 521 {
 522   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 523 }
 524
 525 /* Reports an error to the effect that subcommand SBC is missing.
 526
 527    This function does not take a lexer as an argument or use lex_error(),
 528    because a missing subcommand can normally be detected only after the whole
 529    command has been parsed, and so lex_error() would always report "Syntax
 530    error at end of command", which does not help the user find the error. */
 531 void
 532 lex_sbc_missing (const char *sbc)
 533 {
 534   msg (SE, _("Required subcommand %s was not specified."), sbc);
 535 }
 536
 537 /* Reports an error to the effect that specification SPEC may only be specified
 538    once within subcommand SBC. */
 539 void
 540 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 541 {
 542   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 543              spec, sbc);
 544 }
 545
 546 /* Reports an error to the effect that specification SPEC is missing within
 547    subcommand SBC. */
 548 void
 549 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 550 {
 551   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 552              sbc, spec);
 553 }
 554
 555 /* Prints a syntax error message containing the current token and
 556    given message MESSAGE (if non-null). */
 557 void
 558 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 559                        const char *format, va_list args)
 560 {
 561   struct lex_source *src = lex_source__ (lexer);
 562
 563   if (src != NULL)
 564     lex_source_error_valist (src, n0, n1, format, args);
 565   else
 566     {
 567       struct string s;
 568
 569       ds_init_empty (&s);
 570       ds_put_format (&s, _("Syntax error at end of input"));
 571       if (format != NULL)
 572         {
 573           ds_put_cstr (&s, ": ");
 574           ds_put_vformat (&s, format, args);
 575         }
 576       if (ds_last (&s) != '.')
 577         ds_put_byte (&s, '.');
 578       msg (SE, "%s", ds_cstr (&s));
 579       ds_destroy (&s);
 580     }
 581 }
 582
 583 /* Checks that we're at end of command.
 584    If so, returns a successful command completion code.
 585    If not, flags a syntax error and returns an error command
 586    completion code. */
 587 int
 588 lex_end_of_command (struct lexer *lexer)
 589 {
 590   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 591     {
 592       lex_error (lexer, _("expecting end of command"));
 593       return CMD_FAILURE;
 594     }
 595   else
 596     return CMD_SUCCESS;
 597 }
 598 \f
 599 /* Token testing functions. */
 600
 601 /* Returns true if the current token is a number. */
 602 bool
 603 lex_is_number (const struct lexer *lexer)
 604 {
 605   return lex_next_is_number (lexer, 0);
 606 }
 607
 608 /* Returns true if the current token is a string. */
 609 bool
 610 lex_is_string (const struct lexer *lexer)
 611 {
 612   return lex_next_is_string (lexer, 0);
 613 }
 614
 615 /* Returns the value of the current token, which must be a
 616    floating point number. */
 617 double
 618 lex_number (const struct lexer *lexer)
 619 {
 620   return lex_next_number (lexer, 0);
 621 }
 622
 623 /* Returns true iff the current token is an integer. */
 624 bool
 625 lex_is_integer (const struct lexer *lexer)
 626 {
 627   return lex_next_is_integer (lexer, 0);
 628 }
 629
 630 /* Returns the value of the current token, which must be an
 631    integer. */
 632 long
 633 lex_integer (const struct lexer *lexer)
 634 {
 635   return lex_next_integer (lexer, 0);
 636 }
 637 \f
 638 /* Token testing functions with lookahead.
 639
 640    A value of 0 for N as an argument to any of these functions refers to the
 641    current token.  Lookahead is limited to the current command.  Any N greater
 642    than the number of tokens remaining in the current command will be treated
 643    as referring to a T_ENDCMD token. */
 644
 645 /* Returns true if the token N ahead of the current token is a number. */
 646 bool
 647 lex_next_is_number (const struct lexer *lexer, int n)
 648 {
 649   return token_is_number (lex_next (lexer, n));
 650 }
 651
 652 /* Returns true if the token N ahead of the current token is a string. */
 653 bool
 654 lex_next_is_string (const struct lexer *lexer, int n)
 655 {
 656   return token_is_string (lex_next (lexer, n));
 657 }
 658
 659 /* Returns the value of the token N ahead of the current token, which must be a
 660    floating point number. */
 661 double
 662 lex_next_number (const struct lexer *lexer, int n)
 663 {
 664   return token_number (lex_next (lexer, n));
 665 }
 666
 667 /* Returns true if the token N ahead of the current token is an integer. */
 668 bool
 669 lex_next_is_integer (const struct lexer *lexer, int n)
 670 {
 671   return token_is_integer (lex_next (lexer, n));
 672 }
 673
 674 /* Returns the value of the token N ahead of the current token, which must be
 675    an integer. */
 676 long
 677 lex_next_integer (const struct lexer *lexer, int n)
 678 {
 679   return token_integer (lex_next (lexer, n));
 680 }
 681 \f
 682 /* Token matching functions. */
 683
 684 /* If the current token has the specified TYPE, skips it and returns true.
 685    Otherwise, returns false. */
 686 bool
 687 lex_match (struct lexer *lexer, enum token_type type)
 688 {
 689   if (lex_token (lexer) == type)
 690     {
 691       lex_get (lexer);
 692       return true;
 693     }
 694   else
 695     return false;
 696 }
 697
 698 /* If the current token matches IDENTIFIER, skips it and returns true.
 699    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 700    returns false.
 701
 702    IDENTIFIER must be an ASCII string. */
 703 bool
 704 lex_match_id (struct lexer *lexer, const char *identifier)
 705 {
 706   return lex_match_id_n (lexer, identifier, 3);
 707 }
 708
 709 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 710    may be abbreviated to its first N letters.  Otherwise, returns false.
 711
 712    IDENTIFIER must be an ASCII string. */
 713 bool
 714 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 715 {
 716   if (lex_token (lexer) == T_ID
 717       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 718     {
 719       lex_get (lexer);
 720       return true;
 721     }
 722   else
 723     return false;
 724 }
 725
 726 /* If the current token is integer X, skips it and returns true.  Otherwise,
 727    returns false. */
 728 bool
 729 lex_match_int (struct lexer *lexer, int x)
 730 {
 731   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 732     {
 733       lex_get (lexer);
 734       return true;
 735     }
 736   else
 737     return false;
 738 }
 739 \f
 740 /* Forced matches. */
 741
 742 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 743    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 744    false.
 745
 746    IDENTIFIER must be an ASCII string. */
 747 bool
 748 lex_force_match_id (struct lexer *lexer, const char *identifier)
 749 {
 750   if (lex_match_id (lexer, identifier))
 751     return true;
 752   else
 753     {
 754       lex_error_expecting (lexer, identifier);
 755       return false;
 756     }
 757 }
 758
 759 /* If the current token has the specified TYPE, skips it and returns true.
 760    Otherwise, reports an error and returns false. */
 761 bool
 762 lex_force_match (struct lexer *lexer, enum token_type type)
 763 {
 764   if (lex_token (lexer) == type)
 765     {
 766       lex_get (lexer);
 767       return true;
 768     }
 769   else
 770     {
 771       const char *type_string = token_type_to_string (type);
 772       if (type_string)
 773         {
 774           char *s = xasprintf ("`%s'", type_string);
 775           lex_error_expecting (lexer, s);
 776           free (s);
 777         }
 778       else
 779         lex_error_expecting (lexer, token_type_to_name (type));
 780
 781       return false;
 782     }
 783 }
 784
 785 /* If the current token is a string, does nothing and returns true.
 786    Otherwise, reports an error and returns false. */
 787 bool
 788 lex_force_string (struct lexer *lexer)
 789 {
 790   if (lex_is_string (lexer))
 791     return true;
 792   else
 793     {
 794       lex_error (lexer, _("expecting string"));
 795       return false;
 796     }
 797 }
 798
 799 /* If the current token is a string or an identifier, does nothing and returns
 800    true.  Otherwise, reports an error and returns false.
 801
 802    This is meant for use in syntactic situations where we want to encourage the
 803    user to supply a quoted string, but for compatibility we also accept
 804    identifiers.  (One example of such a situation is file names.)  Therefore,
 805    the error message issued when the current token is wrong only says that a
 806    string is expected and doesn't mention that an identifier would also be
 807    accepted. */
 808 bool
 809 lex_force_string_or_id (struct lexer *lexer)
 810 {
 811   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 812 }
 813
 814 /* If the current token is an integer, does nothing and returns true.
 815    Otherwise, reports an error and returns false. */
 816 bool
 817 lex_force_int (struct lexer *lexer)
 818 {
 819   if (lex_is_integer (lexer))
 820     return true;
 821   else
 822     {
 823       lex_error (lexer, _("expecting integer"));
 824       return false;
 825     }
 826 }
 827
 828 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 829    nothing and returns true.  Otherwise, reports an error and returns false.
 830    If NAME is nonnull, then it is used in the error message. */
 831 bool
 832 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 833 {
 834   bool is_number = lex_is_number (lexer);
 835   bool is_integer = lex_is_integer (lexer);
 836   bool too_small = (is_integer ? lex_integer (lexer) < min
 837                     : is_number ? lex_number (lexer) < min
 838                     : false);
 839   bool too_big = (is_integer ? lex_integer (lexer) > max
 840                   : is_number ? lex_number (lexer) > max
 841                   : false);
 842   if (is_integer && !too_small && !too_big)
 843     return true;
 844
 845   if (min > max)
 846     {
 847       /* Weird, maybe a bug in the caller.  Just report that we needed an
 848          integer. */
 849       if (name)
 850         lex_error (lexer, _("Integer expected for %s."), name);
 851       else
 852         lex_error (lexer, _("Integer expected."));
 853     }
 854   else if (min == max)
 855     {
 856       if (name)
 857         lex_error (lexer, _("Expected %ld for %s."), min, name);
 858       else
 859         lex_error (lexer, _("Expected %ld."), min);
 860     }
 861   else if (min + 1 == max)
 862     {
 863       if (name)
 864         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 865       else
 866         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 867     }
 868   else
 869     {
 870       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 871       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 872
 873       if (report_lower_bound && report_upper_bound)
 874         {
 875           if (name)
 876             lex_error (lexer,
 877                        _("Expected integer between %ld and %ld for %s."),
 878                        min, max, name);
 879           else
 880             lex_error (lexer, _("Expected integer between %ld and %ld."),
 881                        min, max);
 882         }
 883       else if (report_lower_bound)
 884         {
 885           if (min == 0)
 886             {
 887               if (name)
 888                 lex_error (lexer, _("Expected non-negative integer for %s."),
 889                            name);
 890               else
 891                 lex_error (lexer, _("Expected non-negative integer."));
 892             }
 893           else if (min == 1)
 894             {
 895               if (name)
 896                 lex_error (lexer, _("Expected positive integer for %s."),
 897                            name);
 898               else
 899                 lex_error (lexer, _("Expected positive integer."));
 900             }
 901           else
 902             {
 903               if (name)
 904                 lex_error (lexer, _("Expected integer %ld or greater for %s."),
 905                            min, name);
 906               else
 907                 lex_error (lexer, _("Expected integer %ld or greater."), min);
 908             }
 909         }
 910       else if (report_upper_bound)
 911         {
 912           if (name)
 913             lex_error (lexer,
 914                        _("Expected integer less than or equal to %ld for %s."),
 915                        max, name);
 916           else
 917             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 918                        max);
 919         }
 920       else
 921         {
 922           if (name)
 923             lex_error (lexer, _("Integer expected for %s."), name);
 924           else
 925             lex_error (lexer, _("Integer expected."));
 926         }
 927     }
 928   return false;
 929 }
 930
 931 /* If the current token is a number, does nothing and returns true.
 932    Otherwise, reports an error and returns false. */
 933 bool
 934 lex_force_num (struct lexer *lexer)
 935 {
 936   if (lex_is_number (lexer))
 937     return true;
 938
 939   lex_error (lexer, _("expecting number"));
 940   return false;
 941 }
 942
 943 /* If the current token is an identifier, does nothing and returns true.
 944    Otherwise, reports an error and returns false. */
 945 bool
 946 lex_force_id (struct lexer *lexer)
 947 {
 948   if (lex_token (lexer) == T_ID)
 949     return true;
 950
 951   lex_error (lexer, _("expecting identifier"));
 952   return false;
 953 }
 954 \f
 955 /* Token accessors. */
 956
 957 /* Returns the type of LEXER's current token. */
 958 enum token_type
 959 lex_token (const struct lexer *lexer)
 960 {
 961   return lex_next_token (lexer, 0);
 962 }
 963
 964 /* Returns the number in LEXER's current token.
 965
 966    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 967    tokens this function will always return zero. */
 968 double
 969 lex_tokval (const struct lexer *lexer)
 970 {
 971   return lex_next_tokval (lexer, 0);
 972 }
 973
 974 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 975
 976    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 977    this functions this function will always return NULL.
 978
 979    The UTF-8 encoding of the returned string is correct for variable names and
 980    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 981    data_in() to use it in a "union value".  */
 982 const char *
 983 lex_tokcstr (const struct lexer *lexer)
 984 {
 985   return lex_next_tokcstr (lexer, 0);
 986 }
 987
 988 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 989    null-terminated (but the null terminator is not included in the returned
 990    substring's 'length').
 991
 992    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 993    this functions this function will always return NULL.
 994
 995    The UTF-8 encoding of the returned string is correct for variable names and
 996    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 997    data_in() to use it in a "union value".  */
 998 struct substring
 999 lex_tokss (const struct lexer *lexer)
1000 {
1001   return lex_next_tokss (lexer, 0);
1002 }
1003 \f
1004 /* Looking ahead.
1005
1006    A value of 0 for N as an argument to any of these functions refers to the
1007    current token.  Lookahead is limited to the current command.  Any N greater
1008    than the number of tokens remaining in the current command will be treated
1009    as referring to a T_ENDCMD token. */
1010
1011 static const struct lex_token *
1012 lex_next__ (const struct lexer *lexer_, int n)
1013 {
1014   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1015   struct lex_source *src = lex_source__ (lexer);
1016
1017   if (src != NULL)
1018     return lex_source_next__ (src, n);
1019   else
1020     {
1021       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1022       return &stop_token;
1023     }
1024 }
1025
1026 static const struct lex_token *
1027 lex_source_next__ (const struct lex_source *src_, int n)
1028 {
1029   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1030
1031   if (n < 0)
1032     {
1033       if (-n <= src->parse_ofs)
1034         return src->parse[src->parse_ofs - (-n)];
1035       else
1036         {
1037           static const struct lex_token endcmd_token
1038             = { .token = { .type = T_ENDCMD } };
1039           return &endcmd_token;
1040         }
1041     }
1042
1043   while (src->n_parse - src->parse_ofs <= n)
1044     {
1045       if (src->n_parse > 0)
1046         {
1047           const struct lex_token *t = src->parse[src->n_parse - 1];
1048           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1049             return t;
1050         }
1051
1052       lex_source_get_parse (src);
1053     }
1054
1055   return src->parse[src->parse_ofs + n];
1056 }
1057
1058 /* Returns the "struct token" of the token N after the current one in LEXER.
1059    The returned pointer can be invalidated by pretty much any succeeding call
1060    into the lexer, although the string pointer within the returned token is
1061    only invalidated by consuming the token (e.g. with lex_get()). */
1062 const struct token *
1063 lex_next (const struct lexer *lexer, int n)
1064 {
1065   return &lex_next__ (lexer, n)->token;
1066 }
1067
1068 /* Returns the type of the token N after the current one in LEXER. */
1069 enum token_type
1070 lex_next_token (const struct lexer *lexer, int n)
1071 {
1072   return lex_next (lexer, n)->type;
1073 }
1074
/* Returns the numeric value of the token N after the current one in LEXER.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other token type this function always returns zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_number (token);
}
1084
1085 /* Returns the null-terminated string in the token N after the current one, in
1086    UTF-8 encoding.
1087
1088    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1089    this functions this function will always return NULL.
1090
1091    The UTF-8 encoding of the returned string is correct for variable names and
1092    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1093    data_in() to use it in a "union value".  */
1094 const char *
1095 lex_next_tokcstr (const struct lexer *lexer, int n)
1096 {
1097   return lex_next_tokss (lexer, n).string;
1098 }
1099
1100 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1101    The string is null-terminated (but the null terminator is not included in
1102    the returned substring's 'length').
1103
1104    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1105    tokens this functions this function will always return NULL.
1106
1107    The UTF-8 encoding of the returned string is correct for variable names and
1108    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1109    data_in() to use it in a "union value".  */
1110 struct substring
1111 lex_next_tokss (const struct lexer *lexer, int n)
1112 {
1113   return lex_next (lexer, n)->string;
1114 }
1115
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, returns a newly allocated empty string, so
   that the caller's obligation to free the result is the same in every
   case. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source, so guard against a
     lexer with no sources, as the other accessors in this file do. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1128
1129 /* Returns true if the token N ahead of the current one was produced by macro
1130    expansion, false otherwise. */
1131 bool
1132 lex_next_is_from_macro (const struct lexer *lexer, int n)
1133 {
1134   return lex_next__ (lexer, n)->macro_rep != NULL;
1135 }
1136
1137 static bool
1138 lex_tokens_match (const struct token *actual, const struct token *expected)
1139 {
1140   if (actual->type != expected->type)
1141     return false;
1142
1143   switch (actual->type)
1144     {
1145     case T_POS_NUM:
1146     case T_NEG_NUM:
1147       return actual->number == expected->number;
1148
1149     case T_ID:
1150       return lex_id_match (expected->string, actual->string);
1151
1152     case T_STRING:
1153       return (actual->string.length == expected->string.length
1154               && !memcmp (actual->string.string, expected->string.string,
1155                           actual->string.length));
1156
1157     default:
1158       return true;
1159     }
1160 }
1161
1162 static size_t
1163 lex_at_phrase__ (struct lexer *lexer, const char *s)
1164 {
1165   struct string_lexer slex;
1166   struct token token;
1167
1168   size_t i = 0;
1169   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1170   while (string_lexer_next (&slex, &token))
1171     {
1172       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1173       token_uninit (&token);
1174       if (!match)
1175         return 0;
1176     }
1177   return i;
1178 }
1179
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  No tokens are consumed.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_matched = lex_at_phrase__ (lexer, s);
  return n_matched > 0;
}
1191
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   consumes those tokens and returns true.  Otherwise, consumes nothing and
   returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_matched = lex_at_phrase__ (lexer, s);
  if (n_matched == 0)
    return false;

  lex_get_n (lexer, n_matched);
  return true;
}
1206
/* Returns the number of new-line ('\n') bytes among the LENGTH bytes starting
   at S. */
static int
count_newlines (char *s, size_t length)
{
  int count = 0;

  for (size_t ofs = 0; ofs < length; ofs++)
    if (s[ofs] == '\n')
      count++;

  return count;
}
1222
1223 static int
1224 lex_token_get_last_line_number (const struct lex_source *src,
1225                                 const struct lex_token *token)
1226 {
1227   if (token->first_line == 0)
1228     return 0;
1229   else
1230     {
1231       char *token_str = &src->buffer[token->token_pos];
1232       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1233     }
1234 }
1235
1236 static int
1237 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1238 {
1239   const char *newline = memrchr (src->buffer, '\n', offset);
1240   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1241   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1242 }
1243
1244 static int
1245 lex_token_get_first_column (const struct lex_source *src,
1246                             const struct lex_token *token)
1247 {
1248   return lex_token_get_column__ (src, token->token_pos);
1249 }
1250
1251 static int
1252 lex_token_get_last_column (const struct lex_source *src,
1253                            const struct lex_token *token)
1254 {
1255   return lex_token_get_column__ (src, token->token_pos + token->token_len);
1256 }
1257
1258 static struct msg_location
1259 lex_token_location (const struct lex_source *src,
1260                     const struct lex_token *t0,
1261                     const struct lex_token *t1)
1262 {
1263   return (struct msg_location) {
1264     .file_name = src->reader->file_name,
1265     .first_line = t0->first_line,
1266     .last_line = lex_token_get_last_line_number (src, t1),
1267     .first_column = lex_token_get_first_column (src, t0),
1268     .last_column = lex_token_get_last_column (src, t1),
1269   };
1270 }
1271
1272 static struct msg_location *
1273 lex_token_location_rw (const struct lex_source *src,
1274                        const struct lex_token *t0,
1275                        const struct lex_token *t1)
1276 {
1277   struct msg_location location = lex_token_location (src, t0, t1);
1278   return msg_location_dup (&location);
1279 }
1280
/* Returns a copy, made with msg_location_dup(), of the location for the syntax
   in SRC from the token N0 ahead of the current one through the token N1 ahead
   of the current one. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1288
1289 /* Returns the 1-based line number of the start of the syntax that represents
1290    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1291    if the token is drawn from a source that does not have line numbers. */
1292 int
1293 lex_get_first_line_number (const struct lexer *lexer, int n)
1294 {
1295   const struct lex_source *src = lex_source__ (lexer);
1296   return src ? lex_source_next__ (src, n)->first_line : 0;
1297 }
1298
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   current source or if the token is drawn from a source that does not have
   line numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (!src)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1316
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (!src)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1330
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (!src)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1344
1345 /* Returns the name of the syntax file from which the current command is drawn.
1346    Returns NULL for a T_STOP token or if the command's source does not have
1347    line numbers.
1348
1349    There is no version of this function that takes an N argument because
1350    lookahead only works to the end of a command and any given command is always
1351    within a single syntax file. */
1352 const char *
1353 lex_get_file_name (const struct lexer *lexer)
1354 {
1355   struct lex_source *src = lex_source__ (lexer);
1356   return src == NULL ? NULL : src->reader->file_name;
1357 }
1358
1359 /* Returns a newly allocated msg_location for the syntax that represents tokens
1360    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1361    must eventually free the location (with msg_location_destroy()). */
1362 struct msg_location *
1363 lex_get_location (const struct lexer *lexer, int n0, int n1)
1364 {
1365   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1366   loc->first_column = lex_get_first_column (lexer, n0);
1367   loc->last_column = lex_get_last_column (lexer, n1);
1368   return loc;
1369 }
1370
1371 /* Returns a newly allocated msg_location for the syntax that represents tokens
1372    with 0-based offsets N0...N1, inclusive, from the current token.  The
1373    location only covers the tokens' lines, not the columns.  The caller must
1374    eventually free the location (with msg_location_destroy()). */
1375 struct msg_location *
1376 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1377 {
1378   struct msg_location *loc = xmalloc (sizeof *loc);
1379   *loc = (struct msg_location) {
1380     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1381     .first_line = lex_get_first_line_number (lexer, n0),
1382     .last_line = lex_get_last_line_number (lexer, n1),
1383   };
1384   return loc;
1385 }
1386
1387 const char *
1388 lex_get_encoding (const struct lexer *lexer)
1389 {
1390   struct lex_source *src = lex_source__ (lexer);
1391   return src == NULL ? NULL : src->reader->encoding;
1392 }
1393
1394 /* Returns the syntax mode for the syntax file from which the current drawn is
1395    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1396    does not have line numbers.
1397
1398    There is no version of this function that takes an N argument because
1399    lookahead only works to the end of a command and any given command is always
1400    within a single syntax file. */
1401 enum segmenter_mode
1402 lex_get_syntax_mode (const struct lexer *lexer)
1403 {
1404   struct lex_source *src = lex_source__ (lexer);
1405   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1406 }
1407
1408 /* Returns the error mode for the syntax file from which the current drawn is
1409    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1410    source does not have line numbers.
1411
1412    There is no version of this function that takes an N argument because
1413    lookahead only works to the end of a command and any given command is always
1414    within a single syntax file. */
1415 enum lex_error_mode
1416 lex_get_error_mode (const struct lexer *lexer)
1417 {
1418   struct lex_source *src = lex_source__ (lexer);
1419   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1420 }
1421
1422 /* If the source that LEXER is currently reading has error mode
1423    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1424    token to be read comes directly from whatever is next read from the stream.
1425
1426    It makes sense to call this function after encountering an error in a
1427    command entered on the console, because usually the user would prefer not to
1428    have cascading errors. */
1429 void
1430 lex_interactive_reset (struct lexer *lexer)
1431 {
1432   struct lex_source *src = lex_source__ (lexer);
1433   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1434     {
1435       src->length = 0;
1436       src->journal_pos = src->seg_pos = 0;
1437       src->n_newlines = 0;
1438       src->suppress_next_newline = false;
1439       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1440                                        false);
1441       lex_stage_clear (&src->pp);
1442       lex_stage_clear (&src->merge);
1443       lex_source_clear_parse (src);
1444       lex_source_push_endcmd__ (src);
1445     }
1446 }
1447
1448 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1449 void
1450 lex_discard_rest_of_command (struct lexer *lexer)
1451 {
1452   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1453     lex_get (lexer);
1454 }
1455
1456 /* Discards all lookahead tokens in LEXER, then discards all input sources
1457    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1458    runs out of input sources. */
1459 void
1460 lex_discard_noninteractive (struct lexer *lexer)
1461 {
1462   struct lex_source *src = lex_source__ (lexer);
1463
1464   if (src != NULL)
1465     {
1466       lex_stage_clear (&src->pp);
1467       lex_stage_clear (&src->merge);
1468       lex_source_clear_parse (src);
1469
1470       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1471            src = lex_source__ (lexer))
1472         lex_source_destroy (src);
1473     }
1474 }
1475 \f
1476 static void
1477 lex_source_expand__ (struct lex_source *src)
1478 {
1479   if (src->length >= src->allocated)
1480     src->buffer = x2realloc (src->buffer, &src->allocated);
1481 }
1482
1483 static void
1484 lex_source_read__ (struct lex_source *src)
1485 {
1486   do
1487     {
1488       lex_source_expand__ (src);
1489
1490       size_t space = src->allocated - src->length;
1491       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1492       size_t n = src->reader->class->read (src->reader,
1493                                            &src->buffer[src->length],
1494                                            space, prompt);
1495       assert (n <= space);
1496
1497       if (n == 0)
1498         {
1499           /* End of input. */
1500           src->reader->eof = true;
1501           return;
1502         }
1503
1504       src->length += n;
1505     }
1506   while (!memchr (&src->buffer[src->seg_pos], '\n',
1507                   src->length - src->seg_pos));
1508 }
1509
1510 static struct lex_source *
1511 lex_source__ (const struct lexer *lexer)
1512 {
1513   return (ll_is_empty (&lexer->sources) ? NULL
1514           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1515 }
1516
1517 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1518    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1519    and N1 are both zero, this requests the syntax for the current token.)  The
1520    caller must eventually free the returned string (with free()).  The syntax
1521    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1522    for example, it may include comments, spaces, and new-lines if it spans
1523    multiple tokens.  Macro expansion, however, has already been performed. */
1524 static char *
1525 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1526 {
1527   struct string s = DS_EMPTY_INITIALIZER;
1528   for (size_t i = n0; i <= n1; )
1529     {
1530       /* Find [I,J) as the longest sequence of tokens not produced by macro
1531          expansion, or otherwise the longest sequence expanded from a single
1532          macro call. */
1533       const struct lex_token *first = lex_source_next__ (src, i);
1534       size_t j;
1535       for (j = i + 1; j <= n1; j++)
1536         {
1537           const struct lex_token *cur = lex_source_next__ (src, j);
1538           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1539               || first->macro_rep != cur->macro_rep)
1540             break;
1541         }
1542       const struct lex_token *last = lex_source_next__ (src, j - 1);
1543
1544       /* Now add the syntax for this sequence of tokens to SRC. */
1545       if (!ds_is_empty (&s))
1546         ds_put_byte (&s, ' ');
1547       if (!first->macro_rep)
1548         {
1549           size_t start = first->token_pos;
1550           size_t end = last->token_pos + last->token_len;
1551           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1552         }
1553       else
1554         {
1555           size_t start = first->ofs;
1556           size_t end = last->ofs + last->len;
1557           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1558                                            end - start));
1559         }
1560
1561       i = j;
1562     }
1563   return ds_steal_cstr (&s);
1564 }
1565
1566 static bool
1567 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1568 {
1569   for (size_t i = n0; i <= n1; i++)
1570     if (lex_source_next__ (src, i)->macro_rep)
1571       return true;
1572   return false;
1573 }
1574
1575 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1576    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1577    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1578    the original form supplied to the lexer so that, for example, it may include
1579    comments, spaces, and new-lines if it spans multiple tokens.
1580
1581    Returns an empty string if the token range doesn't include a macro call.
1582
1583    The caller must not modify or free the returned string. */
1584 static struct substring
1585 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1586 {
1587   if (!lex_source_contains_macro_call (src, n0, n1))
1588     return ss_empty ();
1589
1590   const struct lex_token *token0 = lex_source_next__ (src, n0);
1591   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1592   size_t start = token0->token_pos;
1593   size_t end = token1->token_pos + token1->token_len;
1594
1595   return ss_buffer (&src->buffer[start], end - start);
1596 }
1597
1598 static void
1599 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1600                          const char *format, va_list args)
1601 {
1602   const struct lex_token *token;
1603   struct string s;
1604
1605   ds_init_empty (&s);
1606
1607   token = lex_source_next__ (src, n0);
1608   if (token->token.type == T_ENDCMD)
1609     ds_put_cstr (&s, _("Syntax error at end of command"));
1610   else
1611     {
1612       /* Get the syntax that caused the error. */
1613       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1614       char syntax[64];
1615       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1616       free (raw_syntax);
1617
1618       /* Get the macro call(s) that expanded to the syntax that caused the
1619          error. */
1620       char call[64];
1621       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1622                      call, sizeof call);
1623
1624       if (syntax[0])
1625         {
1626           if (call[0])
1627             ds_put_format (&s,
1628                            _("Syntax error at `%s' (in expansion of `%s')"),
1629                            syntax, call);
1630           else
1631             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1632         }
1633       else
1634         {
1635           if (call[0])
1636             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1637                            call);
1638           else
1639             ds_put_cstr (&s, _("Syntax error"));
1640         }
1641     }
1642
1643   if (format)
1644     {
1645       ds_put_cstr (&s, ": ");
1646       ds_put_vformat (&s, format, args);
1647     }
1648   if (ds_last (&s) != '.')
1649     ds_put_byte (&s, '.');
1650
1651   struct msg *m = xmalloc (sizeof *m);
1652   *m = (struct msg) {
1653     .category = MSG_C_SYNTAX,
1654     .severity = MSG_S_ERROR,
1655     .location = lex_source_get_location (src, n0, n1),
1656     .text = ds_steal_cstr (&s),
1657   };
1658   msg_emit (m);
1659 }
1660
1661 static void
1662 lex_get_error (struct lex_source *src, const struct lex_token *token)
1663 {
1664   char syntax[64];
1665   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1666                  syntax, sizeof syntax);
1667
1668   struct string s = DS_EMPTY_INITIALIZER;
1669   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1670   ds_put_format (&s, ": %s", token->token.string.string);
1671
1672   struct msg *m = xmalloc (sizeof *m);
1673   *m = (struct msg) {
1674     .category = MSG_C_SYNTAX,
1675     .severity = MSG_S_ERROR,
1676     .location = lex_token_location_rw (src, token, token),
1677     .text = ds_steal_cstr (&s),
1678   };
1679   msg_emit (m);
1680 }
1681
1682 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1683    underlying lex_reader if necessary.  Returns true if a new token was added
1684    to SRC's deque, false otherwise.  The caller should retry failures unless
1685    SRC's 'eof' marker was set to true indicating that there will be no more
1686    tokens from this source. */
1687 static bool
1688 lex_source_try_get_pp (struct lex_source *src)
1689 {
1690   /* Append a new token to SRC and initialize it. */
1691   struct lex_token *token = xmalloc (sizeof *token);
1692   token->token = (struct token) { .type = T_STOP };
1693   token->macro_rep = NULL;
1694   token->ref_cnt = NULL;
1695   token->token_pos = src->seg_pos;
1696   if (src->reader->line_number > 0)
1697     token->first_line = src->reader->line_number + src->n_newlines;
1698   else
1699     token->first_line = 0;
1700
1701   /* Extract a segment. */
1702   const char *segment;
1703   enum segment_type seg_type;
1704   int seg_len;
1705   for (;;)
1706     {
1707       segment = &src->buffer[src->seg_pos];
1708       seg_len = segmenter_push (&src->segmenter, segment,
1709                                 src->length - src->seg_pos,
1710                                 src->reader->eof, &seg_type);
1711       if (seg_len >= 0)
1712         break;
1713
1714       /* The segmenter needs more input to produce a segment. */
1715       assert (!src->reader->eof);
1716       lex_source_read__ (src);
1717     }
1718
1719   /* Update state based on the segment. */
1720   token->token_len = seg_len;
1721   src->seg_pos += seg_len;
1722   if (seg_type == SEG_NEWLINE)
1723     src->n_newlines++;
1724
1725   /* Get a token from the segment. */
1726   enum tokenize_result result = token_from_segment (
1727     seg_type, ss_buffer (segment, seg_len), &token->token);
1728
1729   /* If we've reached the end of a line, or the end of a command, then pass
1730      the line to the output engine as a syntax text item.  */
1731   int n_lines = seg_type == SEG_NEWLINE;
1732   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1733     {
1734       n_lines++;
1735       src->suppress_next_newline = true;
1736     }
1737   else if (n_lines > 0 && src->suppress_next_newline)
1738     {
1739       n_lines--;
1740       src->suppress_next_newline = false;
1741     }
1742   for (int i = 0; i < n_lines; i++)
1743     {
1744       /* Beginning of line. */
1745       const char *line = &src->buffer[src->journal_pos];
1746
1747       /* Calculate line length, including \n or \r\n end-of-line if present.
1748
1749          We use src->length even though that may be beyond what we've actually
1750          converted to tokens.  That's because, if we're emitting the line due
1751          to SEG_END_COMMAND, we want to take the whole line through the
1752          newline, not just through the '.'. */
1753       size_t max_len = src->length - src->journal_pos;
1754       const char *newline = memchr (line, '\n', max_len);
1755       size_t line_len = newline ? newline - line + 1 : max_len;
1756
1757       /* Calculate line length excluding end-of-line. */
1758       size_t copy_len = line_len;
1759       if (copy_len > 0 && line[copy_len - 1] == '\n')
1760         copy_len--;
1761       if (copy_len > 0 && line[copy_len - 1] == '\r')
1762         copy_len--;
1763
1764       /* Submit the line as syntax. */
1765       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1766                                                    xmemdup0 (line, copy_len),
1767                                                    NULL));
1768
1769       src->journal_pos += line_len;
1770     }
1771
1772   switch (result)
1773     {
1774     case TOKENIZE_ERROR:
1775       lex_get_error (src, token);
1776       /* Fall through. */
1777     case TOKENIZE_EMPTY:
1778       lex_token_destroy (token);
1779       return false;
1780
1781     case TOKENIZE_TOKEN:
1782       if (token->token.type == T_STOP)
1783         {
1784           token->token.type = T_ENDCMD;
1785           src->eof = true;
1786         }
1787       lex_stage_push_last (&src->pp, token);
1788       return true;
1789     }
1790   NOT_REACHED ();
1791 }
1792
1793 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1794    failure.  On failure, the end of SRC has been reached and no more tokens
1795    will be forthcoming from it.
1796
1797    Does not make the new token available for lookahead yet; the caller must
1798    adjust SRC's 'middle' pointer to do so. */
1799 static bool
1800 lex_source_get_pp (struct lex_source *src)
1801 {
1802   while (!src->eof)
1803     if (lex_source_try_get_pp (src))
1804       return true;
1805   return false;
1806 }
1807
1808 static bool
1809 lex_source_try_get_merge (const struct lex_source *src_)
1810 {
1811   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1812
1813   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1814     return false;
1815
1816   if (!settings_get_mexpand ())
1817     {
1818       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1819       return true;
1820     }
1821
1822   /* Now pass tokens one-by-one to the macro expander.
1823
1824      In the common case where there is no macro to expand, the loop is not
1825      entered.  */
1826   struct macro_call *mc;
1827   int n_call = macro_call_create (src->lexer->macros,
1828                                   &lex_stage_first (&src->pp)->token, &mc);
1829   for (int ofs = 1; !n_call; ofs++)
1830     {
1831       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1832         {
1833           /* This should not be reachable because we always get a T_ENDCMD at
1834              the end of an input file (transformed from T_STOP by
1835              lex_source_try_get_pp()) and the macro_expander should always
1836              terminate expansion on T_ENDCMD. */
1837           NOT_REACHED ();
1838         }
1839
1840       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1841       size_t start = t->token_pos;
1842       size_t end = t->token_pos + t->token_len;
1843       const struct macro_token mt = {
1844         .token = t->token,
1845         .syntax = ss_buffer (&src->buffer[start], end - start),
1846       };
1847       const struct msg_location loc = lex_token_location (src, t, t);
1848       n_call = macro_call_add (mc, &mt, &loc);
1849     }
1850   if (n_call < 0)
1851     {
1852       /* False alarm: no macro expansion after all.  Use first token as
1853          lookahead.  We'll retry macro expansion from the second token next
1854          time around. */
1855       macro_call_destroy (mc);
1856       lex_stage_shift (&src->merge, &src->pp, 1);
1857       return true;
1858     }
1859
1860   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1861      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1862      Expand them.  */
1863   const struct lex_token *c0 = lex_stage_first (&src->pp);
1864   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1865   struct macro_tokens expansion = { .n = 0 };
1866   struct msg_location loc = lex_token_location (src, c0, c1);
1867   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1868   macro_call_destroy (mc);
1869
1870   /* Convert the macro expansion into syntax for possible error messages
1871      later. */
1872   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1873   size_t *len = xnmalloc (expansion.n, sizeof *len);
1874   struct string s = DS_EMPTY_INITIALIZER;
1875   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1876
1877   if (settings_get_mprint ())
1878     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1879                                           _("Macro Expansion")));
1880
1881   /* Append the macro expansion tokens to the lookahead. */
1882   if (expansion.n > 0)
1883     {
1884       char *macro_rep = ds_steal_cstr (&s);
1885       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1886       *ref_cnt = expansion.n;
1887       for (size_t i = 0; i < expansion.n; i++)
1888         {
1889           struct lex_token *token = xmalloc (sizeof *token);
1890           *token = (struct lex_token) {
1891             .token = expansion.mts[i].token,
1892             .token_pos = c0->token_pos,
1893             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1894             .first_line = c0->first_line,
1895             .macro_rep = macro_rep,
1896             .ofs = ofs[i],
1897             .len = len[i],
1898             .ref_cnt = ref_cnt,
1899           };
1900           lex_stage_push_last (&src->merge, token);
1901
1902           ss_dealloc (&expansion.mts[i].syntax);
1903         }
1904     }
1905   else
1906     ds_destroy (&s);
1907   free (expansion.mts);
1908   free (ofs);
1909   free (len);
1910
1911   /* Destroy the tokens for the call. */
1912   for (size_t i = 0; i < n_call; i++)
1913     lex_stage_pop_first (&src->pp);
1914
1915   return expansion.n > 0;
1916 }
1917
1918 /* Attempts to obtain at least one new token into 'merge' in SRC.
1919
1920    Returns true if successful, false on failure.  In the latter case, SRC is
1921    exhausted and 'src->eof' is now true. */
1922 static bool
1923 lex_source_get_merge (struct lex_source *src)
1924 {
1925   while (!src->eof)
1926     if (lex_source_try_get_merge (src))
1927       return true;
1928   return false;
1929 }
1930
1931 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1932
1933    Returns true if successful, false on failure.  In the latter case, SRC is
1934    exhausted and 'src->eof' is now true. */
1935 static bool
1936 lex_source_get_parse (struct lex_source *src)
1937 {
1938   struct merger m = MERGER_INIT;
1939   struct token out;
1940   for (size_t i = 0; ; i++)
1941     {
1942       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1943         {
1944           /* We always get a T_ENDCMD at the end of an input file
1945              (transformed from T_STOP by lex_source_try_get_pp()) and
1946              merger_add() should never return -1 on T_ENDCMD. */
1947           assert (lex_stage_is_empty (&src->merge));
1948           return false;
1949         }
1950
1951       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1952                                &out);
1953       if (!retval)
1954         {
1955           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1956           return true;
1957         }
1958       else if (retval > 0)
1959         {
1960           /* Add a token that merges all the tokens together. */
1961           const struct lex_token *first = lex_stage_first (&src->merge);
1962           const struct lex_token *last = lex_stage_nth (&src->merge,
1963                                                         retval - 1);
1964           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1965           struct lex_token *t = xmalloc (sizeof *t);
1966           *t = (struct lex_token) {
1967             .token = out,
1968             .token_pos = first->token_pos,
1969             .token_len = (last->token_pos - first->token_pos) + last->token_len,
1970             .first_line = first->first_line,
1971
1972             /* This works well if all the tokens were not expanded from macros,
1973                or if they came from the same macro expansion.  It just gives up
1974                in the other (corner) cases. */
1975             .macro_rep = macro ? first->macro_rep : NULL,
1976             .ofs = macro ? first->ofs : 0,
1977             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1978             .ref_cnt = macro ? first->ref_cnt : NULL,
1979           };
1980           if (t->ref_cnt)
1981             ++*t->ref_cnt;
1982           lex_source_push_parse (src, t);
1983
1984           for (int i = 0; i < retval; i++)
1985             lex_stage_pop_first (&src->merge);
1986           return true;
1987         }
1988     }
1989 }
1990 \f
1991 static void
1992 lex_source_push_endcmd__ (struct lex_source *src)
1993 {
1994   assert (src->n_parse == 0);
1995
1996   struct lex_token *token = xmalloc (sizeof *token);
1997   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1998   lex_source_push_parse (src, token);
1999 }
2000
2001 static void
2002 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2003 {
2004   if (src->n_parse >= src->allocated_parse)
2005     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2006                              sizeof *src->parse);
2007   src->parse[src->n_parse++] = token;
2008 }
2009
2010 static void
2011 lex_source_clear_parse (struct lex_source *src)
2012 {
2013   for (size_t i = 0; i < src->n_parse; i++)
2014     lex_token_destroy (src->parse[i]);
2015   src->n_parse = src->parse_ofs = 0;
2016 }
2017
2018 static struct lex_source *
2019 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2020 {
2021   struct lex_source *src = xmalloc (sizeof *src);
2022   *src = (struct lex_source) {
2023     .reader = reader,
2024     .segmenter = segmenter_init (reader->syntax, false),
2025     .lexer = lexer,
2026   };
2027
2028   lex_source_push_endcmd__ (src);
2029
2030   return src;
2031 }
2032
2033 static void
2034 lex_source_destroy (struct lex_source *src)
2035 {
2036   char *file_name = src->reader->file_name;
2037   char *encoding = src->reader->encoding;
2038   if (src->reader->class->destroy != NULL)
2039     src->reader->class->destroy (src->reader);
2040   free (file_name);
2041   free (encoding);
2042   free (src->buffer);
2043   lex_stage_uninit (&src->pp);
2044   lex_stage_uninit (&src->merge);
2045   lex_source_clear_parse (src);
2046   free (src->parse);
2047   ll_remove (&src->ll);
2048   free (src);
2049 }
2050 \f
2051 struct lex_file_reader
2052   {
2053     struct lex_reader reader;
2054     struct u8_istream *istream;
2055   };
2056
2057 static struct lex_reader_class lex_file_reader_class;
2058
2059 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2060    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2061    ENCODING, which should take one of the forms accepted by
2062    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2063    mode of the new reader, respectively.
2064
2065    Returns a null pointer if FILE_NAME cannot be opened. */
2066 struct lex_reader *
2067 lex_reader_for_file (const char *file_name, const char *encoding,
2068                      enum segmenter_mode syntax,
2069                      enum lex_error_mode error)
2070 {
2071   struct lex_file_reader *r;
2072   struct u8_istream *istream;
2073
2074   istream = (!strcmp(file_name, "-")
2075              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2076              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2077   if (istream == NULL)
2078     {
2079       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2080       return NULL;
2081     }
2082
2083   r = xmalloc (sizeof *r);
2084   lex_reader_init (&r->reader, &lex_file_reader_class);
2085   r->reader.syntax = syntax;
2086   r->reader.error = error;
2087   r->reader.file_name = xstrdup (file_name);
2088   r->reader.encoding = xstrdup_if_nonnull (encoding);
2089   r->reader.line_number = 1;
2090   r->istream = istream;
2091
2092   return &r->reader;
2093 }
2094
2095 static struct lex_file_reader *
2096 lex_file_reader_cast (struct lex_reader *r)
2097 {
2098   return UP_CAST (r, struct lex_file_reader, reader);
2099 }
2100
2101 static size_t
2102 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2103                enum prompt_style prompt_style UNUSED)
2104 {
2105   struct lex_file_reader *r = lex_file_reader_cast (r_);
2106   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2107   if (n_read < 0)
2108     {
2109       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2110       return 0;
2111     }
2112   return n_read;
2113 }
2114
2115 static void
2116 lex_file_close (struct lex_reader *r_)
2117 {
2118   struct lex_file_reader *r = lex_file_reader_cast (r_);
2119
2120   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2121     {
2122       if (u8_istream_close (r->istream) != 0)
2123         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2124     }
2125   else
2126     u8_istream_free (r->istream);
2127
2128   free (r);
2129 }
2130
2131 static struct lex_reader_class lex_file_reader_class =
2132   {
2133     lex_file_read,
2134     lex_file_close
2135   };
2136 \f
2137 struct lex_string_reader
2138   {
2139     struct lex_reader reader;
2140     struct substring s;
2141     size_t offset;
2142   };
2143
2144 static struct lex_reader_class lex_string_reader_class;
2145
2146 /* Creates and returns a new lex_reader for the contents of S, which must be
2147    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2148    with ss_dealloc() when it is closed. */
2149 struct lex_reader *
2150 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2151 {
2152   struct lex_string_reader *r;
2153
2154   r = xmalloc (sizeof *r);
2155   lex_reader_init (&r->reader, &lex_string_reader_class);
2156   r->reader.syntax = SEG_MODE_AUTO;
2157   r->reader.encoding = xstrdup_if_nonnull (encoding);
2158   r->s = s;
2159   r->offset = 0;
2160
2161   return &r->reader;
2162 }
2163
2164 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2165    which must be encoded in ENCODING.  The caller retains ownership of S. */
2166 struct lex_reader *
2167 lex_reader_for_string (const char *s, const char *encoding)
2168 {
2169   struct substring ss;
2170   ss_alloc_substring (&ss, ss_cstr (s));
2171   return lex_reader_for_substring_nocopy (ss, encoding);
2172 }
2173
/* Formats FORMAT, a printf()-style format string, and creates and returns a
   new lex_reader whose input is the formatted text, treated as encoded in
   ENCODING.

   Note that the arguments that FORMAT consumes come after ENCODING, not
   directly after FORMAT. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes over ownership of TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2188
2189 static struct lex_string_reader *
2190 lex_string_reader_cast (struct lex_reader *r)
2191 {
2192   return UP_CAST (r, struct lex_string_reader, reader);
2193 }
2194
2195 static size_t
2196 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2197                  enum prompt_style prompt_style UNUSED)
2198 {
2199   struct lex_string_reader *r = lex_string_reader_cast (r_);
2200   size_t chunk;
2201
2202   chunk = MIN (n, r->s.length - r->offset);
2203   memcpy (buf, r->s.string + r->offset, chunk);
2204   r->offset += chunk;
2205
2206   return chunk;
2207 }
2208
2209 static void
2210 lex_string_close (struct lex_reader *r_)
2211 {
2212   struct lex_string_reader *r = lex_string_reader_cast (r_);
2213
2214   ss_dealloc (&r->s);
2215   free (r);
2216 }
2217
2218 static struct lex_reader_class lex_string_reader_class =
2219   {
2220     lex_string_read,
2221     lex_string_close
2222   };