pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/output-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
/* A token within a lex_source, together with the bookkeeping that records
   where the token came from. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* Source location.

       For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call. */
    size_t token_pos;           /* Offset into src->buffer of token start. */
    size_t token_len;           /* Length of source for token in bytes. */
    int first_line;             /* Line number at token_pos. */

    /* Macro expansion.

       For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' are shared among tokens: lex_token_destroy()
       frees both when the count drops to zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  82
  83 static void
  84 lex_token_destroy (struct lex_token *t)
  85 {
  86   token_uninit (&t->token);
  87   if (t->ref_cnt)
  88     {
  89       assert (*t->ref_cnt > 0);
  90       if (!--*t->ref_cnt)
  91         {
  92           free (t->macro_rep);
  93           free (t->ref_cnt);
  94         }
  95     }
  96   free (t);
  97 }
  98 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source.

   The first token (least lookahead) is found at the back of the deque and the
   last token (greatest lookahead) at the front; see lex_stage_nth() and
   lex_stage_last(). */
struct lex_stage
  {
    struct deque deque;         /* Deque bookkeeping for 'tokens'. */
    struct lex_token **tokens;  /* Token pointers, indexed via 'deque'. */
  };
 106
 107 static void lex_stage_clear (struct lex_stage *);
 108 static void lex_stage_uninit (struct lex_stage *);
 109
 110 static size_t lex_stage_count (const struct lex_stage *);
 111 static bool lex_stage_is_empty (const struct lex_stage *);
 112
 113 static struct lex_token *lex_stage_last (struct lex_stage *);
 114 static struct lex_token *lex_stage_first (struct lex_stage *);
 115 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 116
 117 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 118 static void lex_stage_pop_first (struct lex_stage *);
 119
 120 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 121                              size_t n);
 122
 123 /* Deletes all the tokens from STAGE. */
 124 static void
 125 lex_stage_clear (struct lex_stage *stage)
 126 {
 127   while (!deque_is_empty (&stage->deque))
 128     lex_stage_pop_first (stage);
 129 }
 130
 131 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 132 static void
 133 lex_stage_uninit (struct lex_stage *stage)
 134 {
 135   lex_stage_clear (stage);
 136   free (stage->tokens);
 137 }
 138
 139 /* Returns true if STAGE contains no tokens, otherwise false. */
 140 static bool
 141 lex_stage_is_empty (const struct lex_stage *stage)
 142 {
 143   return deque_is_empty (&stage->deque);
 144 }
 145
 146 /* Returns the number of tokens in STAGE. */
 147 static size_t
 148 lex_stage_count (const struct lex_stage *stage)
 149 {
 150   return deque_count (&stage->deque);
 151 }
 152
 153 /* Returns the last token in STAGE, which must be nonempty.  The last token is
 154    the one accessed with the greatest lookahead. */
 155 static struct lex_token *
 156 lex_stage_last (struct lex_stage *stage)
 157 {
 158   return stage->tokens[deque_front (&stage->deque, 0)];
 159 }
 160
 161 /* Returns the first token in STAGE, which must be nonempty.
 162    The first token is the one accessed with the least lookahead. */
 163 static struct lex_token *
 164 lex_stage_first (struct lex_stage *stage)
 165 {
 166   return lex_stage_nth (stage, 0);
 167 }
 168
 169 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 170    lookahead) is 0, the second token is 1, and so on.  There must be at least
 171    INDEX + 1 tokens in STAGE. */
 172 static struct lex_token *
 173 lex_stage_nth (struct lex_stage *stage, size_t index)
 174 {
 175   return stage->tokens[deque_back (&stage->deque, index)];
 176 }
 177
 178 /* Adds TOKEN so that it becomes the last token in STAGE. */
 179 static void
 180 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 181 {
 182   if (deque_is_full (&stage->deque))
 183     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 184                                   sizeof *stage->tokens);
 185   stage->tokens[deque_push_front (&stage->deque)] = token;
 186 }
 187
 188 /* Removes the first token from STAGE and uninitializes it. */
 189 static void
 190 lex_stage_pop_first (struct lex_stage *stage)
 191 {
 192   lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
 193 }
 194
 195 /* Removes the first N tokens from SRC, appending them to DST as the last
 196    tokens. */
 197 static void
 198 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 199 {
 200   for (size_t i = 0; i < n; i++)
 201     {
 202       lex_stage_push_last (dst, lex_stage_first (src));
 203       deque_pop_back (&src->deque);
 204     }
 205 }
 206
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;  /* The wrapped reader. */
    struct lexer *lexer;        /* NOTE(review): presumably the lexer that
                                   this source belongs to -- confirm. */
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;               /* Source file contents. */
    size_t length;              /* Number of bytes filled. */
    size_t allocated;           /* Number of bytes allocated. */

    /* Offsets into 'buffer'. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'lookahead'.

       - lookahead: Tokens available to the client for parsing. */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_stage lookahead;
  };
 250
 251 static struct lex_source *lex_source_create (struct lexer *,
 252                                              struct lex_reader *);
 253 static void lex_source_destroy (struct lex_source *);
 254
/* Lexer: a list of token sources plus the set of macros defined for it. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Macros added with lex_define_macro(). */
  };
 261
 262 static struct lex_source *lex_source__ (const struct lexer *);
 263 static char *lex_source_get_syntax__ (const struct lex_source *,
 264                                       int n0, int n1);
 265 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 266 static void lex_source_push_endcmd__ (struct lex_source *);
 267
 268 static bool lex_source_get_lookahead (struct lex_source *);
 269 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 270                                      const char *format, va_list)
 271    PRINTF_FORMAT (4, 0);
 272 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 273                                                   int n);
 274 \f
 275 /* Initializes READER with the specified CLASS and otherwise some reasonable
 276    defaults.  The caller should fill in the others members as desired. */
 277 void
 278 lex_reader_init (struct lex_reader *reader,
 279                  const struct lex_reader_class *class)
 280 {
 281   reader->class = class;
 282   reader->syntax = SEG_MODE_AUTO;
 283   reader->error = LEX_ERROR_CONTINUE;
 284   reader->file_name = NULL;
 285   reader->encoding = NULL;
 286   reader->line_number = 0;
 287   reader->eof = false;
 288 }
 289
 290 /* Frees any file name already in READER and replaces it by a copy of
 291    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 292 void
 293 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 294 {
 295   free (reader->file_name);
 296   reader->file_name = xstrdup_if_nonnull (file_name);
 297 }
 298 \f
 299 /* Creates and returns a new lexer. */
 300 struct lexer *
 301 lex_create (void)
 302 {
 303   struct lexer *lexer = xmalloc (sizeof *lexer);
 304   *lexer = (struct lexer) {
 305     .sources = LL_INITIALIZER (lexer->sources),
 306     .macros = macro_set_create (),
 307   };
 308   return lexer;
 309 }
 310
 311 /* Destroys LEXER. */
 312 void
 313 lex_destroy (struct lexer *lexer)
 314 {
 315   if (lexer != NULL)
 316     {
 317       struct lex_source *source, *next;
 318
 319       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 320         lex_source_destroy (source);
 321       macro_set_destroy (lexer->macros);
 322       free (lexer);
 323     }
 324 }
 325
 326 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 327    same name.  Takes ownership of M. */
 328 void
 329 lex_define_macro (struct lexer *lexer, struct macro *m)
 330 {
 331   macro_set_add (lexer->macros, m);
 332 }
 333
 334 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 335    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 336    token. */
 337 void
 338 lex_include (struct lexer *lexer, struct lex_reader *reader)
 339 {
 340   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 341   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 342 }
 343
 344 /* Appends READER to LEXER, so that it will be read after all other current
 345    readers have already been read. */
 346 void
 347 lex_append (struct lexer *lexer, struct lex_reader *reader)
 348 {
 349   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 350 }
 351 \f
 352 /* Advancing. */
 353
 354 /* Advances LEXER to the next token, consuming the current token. */
 355 void
 356 lex_get (struct lexer *lexer)
 357 {
 358   struct lex_source *src;
 359
 360   src = lex_source__ (lexer);
 361   if (src == NULL)
 362     return;
 363
 364   if (!lex_stage_is_empty (&src->lookahead))
 365     lex_stage_pop_first (&src->lookahead);
 366
 367   while (lex_stage_is_empty (&src->lookahead))
 368     if (!lex_source_get_lookahead (src))
 369       {
 370         lex_source_destroy (src);
 371         src = lex_source__ (lexer);
 372         if (src == NULL)
 373           return;
 374       }
 375 }
 376
 377 /* Advances LEXER by N tokens. */
 378 void
 379 lex_get_n (struct lexer *lexer, size_t n)
 380 {
 381   while (n-- > 0)
 382     lex_get (lexer);
 383 }
 384 \f
 385 /* Issuing errors. */
 386
 387 /* Prints a syntax error message containing the current token and
 388    given message MESSAGE (if non-null). */
 389 void
 390 lex_error (struct lexer *lexer, const char *format, ...)
 391 {
 392   va_list args;
 393
 394   va_start (args, format);
 395   lex_next_error_valist (lexer, 0, 0, format, args);
 396   va_end (args);
 397 }
 398
 399 /* Prints a syntax error message containing the current token and
 400    given message MESSAGE (if non-null). */
 401 void
 402 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 403 {
 404   lex_next_error_valist (lexer, 0, 0, format, args);
 405 }
 406
 407 /* Prints a syntax error message containing the current token and
 408    given message MESSAGE (if non-null). */
 409 void
 410 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 411 {
 412   va_list args;
 413
 414   va_start (args, format);
 415   lex_next_error_valist (lexer, n0, n1, format, args);
 416   va_end (args);
 417 }
 418
 419 /* Prints a syntax error message saying that one of the strings provided as
 420    varargs, up to the first NULL, is expected. */
 421 void
 422 (lex_error_expecting) (struct lexer *lexer, ...)
 423 {
 424   va_list args;
 425
 426   va_start (args, lexer);
 427   lex_error_expecting_valist (lexer, args);
 428   va_end (args);
 429 }
 430
 431 /* Prints a syntax error message saying that one of the options provided in
 432    ARGS, up to the first NULL, is expected. */
 433 void
 434 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 435 {
 436   enum { MAX_OPTIONS = 9 };
 437   const char *options[MAX_OPTIONS];
 438   int n = 0;
 439   while (n < MAX_OPTIONS)
 440     {
 441       const char *option = va_arg (args, const char *);
 442       if (!option)
 443         break;
 444
 445       options[n++] = option;
 446     }
 447   lex_error_expecting_array (lexer, options, n);
 448 }
 449
 450 void
 451 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 452 {
 453   switch (n)
 454     {
 455     case 0:
 456       lex_error (lexer, NULL);
 457       break;
 458
 459     case 1:
 460       lex_error (lexer, _("expecting %s"), options[0]);
 461       break;
 462
 463     case 2:
 464       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 465       break;
 466
 467     case 3:
 468       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 469                  options[2]);
 470       break;
 471
 472     case 4:
 473       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 474                  options[0], options[1], options[2], options[3]);
 475       break;
 476
 477     case 5:
 478       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 479                  options[0], options[1], options[2], options[3], options[4]);
 480       break;
 481
 482     case 6:
 483       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 484                  options[0], options[1], options[2], options[3], options[4],
 485                  options[5]);
 486       break;
 487
 488     case 7:
 489       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 490                  options[0], options[1], options[2], options[3], options[4],
 491                  options[5], options[6]);
 492       break;
 493
 494     case 8:
 495       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 496                  options[0], options[1], options[2], options[3], options[4],
 497                  options[5], options[6], options[7]);
 498       break;
 499
 500     default:
 501       lex_error (lexer, NULL);
 502     }
 503 }
 504
 505 /* Reports an error to the effect that subcommand SBC may only be specified
 506    once.
 507
 508    This function does not take a lexer as an argument or use lex_error(),
 509    because the result would ordinarily just be redundant: "Syntax error at
 510    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 511    not help the user find the error. */
 512 void
 513 lex_sbc_only_once (const char *sbc)
 514 {
 515   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 516 }
 517
 518 /* Reports an error to the effect that subcommand SBC is missing.
 519
 520    This function does not take a lexer as an argument or use lex_error(),
 521    because a missing subcommand can normally be detected only after the whole
 522    command has been parsed, and so lex_error() would always report "Syntax
 523    error at end of command", which does not help the user find the error. */
 524 void
 525 lex_sbc_missing (const char *sbc)
 526 {
 527   msg (SE, _("Required subcommand %s was not specified."), sbc);
 528 }
 529
 530 /* Reports an error to the effect that specification SPEC may only be specified
 531    once within subcommand SBC. */
 532 void
 533 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 534 {
 535   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 536              spec, sbc);
 537 }
 538
 539 /* Reports an error to the effect that specification SPEC is missing within
 540    subcommand SBC. */
 541 void
 542 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 543 {
 544   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 545              sbc, spec);
 546 }
 547
 548 /* Prints a syntax error message containing the current token and
 549    given message MESSAGE (if non-null). */
 550 void
 551 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 552                        const char *format, va_list args)
 553 {
 554   struct lex_source *src = lex_source__ (lexer);
 555
 556   if (src != NULL)
 557     lex_source_error_valist (src, n0, n1, format, args);
 558   else
 559     {
 560       struct string s;
 561
 562       ds_init_empty (&s);
 563       ds_put_format (&s, _("Syntax error at end of input"));
 564       if (format != NULL)
 565         {
 566           ds_put_cstr (&s, ": ");
 567           ds_put_vformat (&s, format, args);
 568         }
 569       if (ds_last (&s) != '.')
 570         ds_put_byte (&s, '.');
 571       msg (SE, "%s", ds_cstr (&s));
 572       ds_destroy (&s);
 573     }
 574 }
 575
 576 /* Checks that we're at end of command.
 577    If so, returns a successful command completion code.
 578    If not, flags a syntax error and returns an error command
 579    completion code. */
 580 int
 581 lex_end_of_command (struct lexer *lexer)
 582 {
 583   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 584     {
 585       lex_error (lexer, _("expecting end of command"));
 586       return CMD_FAILURE;
 587     }
 588   else
 589     return CMD_SUCCESS;
 590 }
 591 \f
 592 /* Token testing functions. */
 593
 594 /* Returns true if the current token is a number. */
 595 bool
 596 lex_is_number (const struct lexer *lexer)
 597 {
 598   return lex_next_is_number (lexer, 0);
 599 }
 600
 601 /* Returns true if the current token is a string. */
 602 bool
 603 lex_is_string (const struct lexer *lexer)
 604 {
 605   return lex_next_is_string (lexer, 0);
 606 }
 607
 608 /* Returns the value of the current token, which must be a
 609    floating point number. */
 610 double
 611 lex_number (const struct lexer *lexer)
 612 {
 613   return lex_next_number (lexer, 0);
 614 }
 615
 616 /* Returns true iff the current token is an integer. */
 617 bool
 618 lex_is_integer (const struct lexer *lexer)
 619 {
 620   return lex_next_is_integer (lexer, 0);
 621 }
 622
 623 /* Returns the value of the current token, which must be an
 624    integer. */
 625 long
 626 lex_integer (const struct lexer *lexer)
 627 {
 628   return lex_next_integer (lexer, 0);
 629 }
 630 \f
 631 /* Token testing functions with lookahead.
 632
 633    A value of 0 for N as an argument to any of these functions refers to the
 634    current token.  Lookahead is limited to the current command.  Any N greater
 635    than the number of tokens remaining in the current command will be treated
 636    as referring to a T_ENDCMD token. */
 637
 638 /* Returns true if the token N ahead of the current token is a number. */
 639 bool
 640 lex_next_is_number (const struct lexer *lexer, int n)
 641 {
 642   return token_is_number (lex_next (lexer, n));
 643 }
 644
 645 /* Returns true if the token N ahead of the current token is a string. */
 646 bool
 647 lex_next_is_string (const struct lexer *lexer, int n)
 648 {
 649   return token_is_string (lex_next (lexer, n));
 650 }
 651
 652 /* Returns the value of the token N ahead of the current token, which must be a
 653    floating point number. */
 654 double
 655 lex_next_number (const struct lexer *lexer, int n)
 656 {
 657   return token_number (lex_next (lexer, n));
 658 }
 659
 660 /* Returns true if the token N ahead of the current token is an integer. */
 661 bool
 662 lex_next_is_integer (const struct lexer *lexer, int n)
 663 {
 664   return token_is_integer (lex_next (lexer, n));
 665 }
 666
 667 /* Returns the value of the token N ahead of the current token, which must be
 668    an integer. */
 669 long
 670 lex_next_integer (const struct lexer *lexer, int n)
 671 {
 672   return token_integer (lex_next (lexer, n));
 673 }
 674 \f
 675 /* Token matching functions. */
 676
 677 /* If the current token has the specified TYPE, skips it and returns true.
 678    Otherwise, returns false. */
 679 bool
 680 lex_match (struct lexer *lexer, enum token_type type)
 681 {
 682   if (lex_token (lexer) == type)
 683     {
 684       lex_get (lexer);
 685       return true;
 686     }
 687   else
 688     return false;
 689 }
 690
 691 /* If the current token matches IDENTIFIER, skips it and returns true.
 692    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 693    returns false.
 694
 695    IDENTIFIER must be an ASCII string. */
 696 bool
 697 lex_match_id (struct lexer *lexer, const char *identifier)
 698 {
 699   return lex_match_id_n (lexer, identifier, 3);
 700 }
 701
 702 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 703    may be abbreviated to its first N letters.  Otherwise, returns false.
 704
 705    IDENTIFIER must be an ASCII string. */
 706 bool
 707 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 708 {
 709   if (lex_token (lexer) == T_ID
 710       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 711     {
 712       lex_get (lexer);
 713       return true;
 714     }
 715   else
 716     return false;
 717 }
 718
 719 /* If the current token is integer X, skips it and returns true.  Otherwise,
 720    returns false. */
 721 bool
 722 lex_match_int (struct lexer *lexer, int x)
 723 {
 724   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 725     {
 726       lex_get (lexer);
 727       return true;
 728     }
 729   else
 730     return false;
 731 }
 732 \f
 733 /* Forced matches. */
 734
 735 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 736    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 737    false.
 738
 739    IDENTIFIER must be an ASCII string. */
 740 bool
 741 lex_force_match_id (struct lexer *lexer, const char *identifier)
 742 {
 743   if (lex_match_id (lexer, identifier))
 744     return true;
 745   else
 746     {
 747       lex_error_expecting (lexer, identifier);
 748       return false;
 749     }
 750 }
 751
 752 /* If the current token has the specified TYPE, skips it and returns true.
 753    Otherwise, reports an error and returns false. */
 754 bool
 755 lex_force_match (struct lexer *lexer, enum token_type type)
 756 {
 757   if (lex_token (lexer) == type)
 758     {
 759       lex_get (lexer);
 760       return true;
 761     }
 762   else
 763     {
 764       const char *type_string = token_type_to_string (type);
 765       if (type_string)
 766         {
 767           char *s = xasprintf ("`%s'", type_string);
 768           lex_error_expecting (lexer, s);
 769           free (s);
 770         }
 771       else
 772         lex_error_expecting (lexer, token_type_to_name (type));
 773
 774       return false;
 775     }
 776 }
 777
 778 /* If the current token is a string, does nothing and returns true.
 779    Otherwise, reports an error and returns false. */
 780 bool
 781 lex_force_string (struct lexer *lexer)
 782 {
 783   if (lex_is_string (lexer))
 784     return true;
 785   else
 786     {
 787       lex_error (lexer, _("expecting string"));
 788       return false;
 789     }
 790 }
 791
 792 /* If the current token is a string or an identifier, does nothing and returns
 793    true.  Otherwise, reports an error and returns false.
 794
 795    This is meant for use in syntactic situations where we want to encourage the
 796    user to supply a quoted string, but for compatibility we also accept
 797    identifiers.  (One example of such a situation is file names.)  Therefore,
 798    the error message issued when the current token is wrong only says that a
 799    string is expected and doesn't mention that an identifier would also be
 800    accepted. */
 801 bool
 802 lex_force_string_or_id (struct lexer *lexer)
 803 {
 804   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 805 }
 806
 807 /* If the current token is an integer, does nothing and returns true.
 808    Otherwise, reports an error and returns false. */
 809 bool
 810 lex_force_int (struct lexer *lexer)
 811 {
 812   if (lex_is_integer (lexer))
 813     return true;
 814   else
 815     {
 816       lex_error (lexer, _("expecting integer"));
 817       return false;
 818     }
 819 }
 820
 821 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 822    nothing and returns true.  Otherwise, reports an error and returns false.
 823    If NAME is nonnull, then it is used in the error message. */
 824 bool
 825 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 826 {
 827   bool is_integer = lex_is_integer (lexer);
 828   bool too_small = is_integer && lex_integer (lexer) < min;
 829   bool too_big = is_integer && lex_integer (lexer) > max;
 830   if (is_integer && !too_small && !too_big)
 831     return true;
 832
 833   if (min > max)
 834     {
 835       /* Weird, maybe a bug in the caller.  Just report that we needed an
 836          integer. */
 837       if (name)
 838         lex_error (lexer, _("Integer expected for %s."), name);
 839       else
 840         lex_error (lexer, _("Integer expected."));
 841     }
 842   else if (min == max)
 843     {
 844       if (name)
 845         lex_error (lexer, _("Expected %ld for %s."), min, name);
 846       else
 847         lex_error (lexer, _("Expected %ld."), min);
 848     }
 849   else if (min + 1 == max)
 850     {
 851       if (name)
 852         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 853       else
 854         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 855     }
 856   else
 857     {
 858       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 859       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 860
 861       if (report_lower_bound && report_upper_bound)
 862         {
 863           if (name)
 864             lex_error (lexer,
 865                        _("Expected integer between %ld and %ld for %s."),
 866                        min, max, name);
 867           else
 868             lex_error (lexer, _("Expected integer between %ld and %ld."),
 869                        min, max);
 870         }
 871       else if (report_lower_bound)
 872         {
 873           if (min == 0)
 874             {
 875               if (name)
 876                 lex_error (lexer, _("Expected non-negative integer for %s."),
 877                            name);
 878               else
 879                 lex_error (lexer, _("Expected non-negative integer."));
 880             }
 881           else if (min == 1)
 882             {
 883               if (name)
 884                 lex_error (lexer, _("Expected positive integer for %s."),
 885                            name);
 886               else
 887                 lex_error (lexer, _("Expected positive integer."));
 888             }
 889         }
 890       else if (report_upper_bound)
 891         {
 892           if (name)
 893             lex_error (lexer,
 894                        _("Expected integer less than or equal to %ld for %s."),
 895                        max, name);
 896           else
 897             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 898                        max);
 899         }
 900       else
 901         {
 902           if (name)
 903             lex_error (lexer, _("Integer expected for %s."), name);
 904           else
 905             lex_error (lexer, _("Integer expected."));
 906         }
 907     }
 908   return false;
 909 }
 910
 911 /* If the current token is a number, does nothing and returns true.
 912    Otherwise, reports an error and returns false. */
 913 bool
 914 lex_force_num (struct lexer *lexer)
 915 {
 916   if (lex_is_number (lexer))
 917     return true;
 918
 919   lex_error (lexer, _("expecting number"));
 920   return false;
 921 }
 922
 923 /* If the current token is an identifier, does nothing and returns true.
 924    Otherwise, reports an error and returns false. */
 925 bool
 926 lex_force_id (struct lexer *lexer)
 927 {
 928   if (lex_token (lexer) == T_ID)
 929     return true;
 930
 931   lex_error (lexer, _("expecting identifier"));
 932   return false;
 933 }
 934 \f
 935 /* Token accessors. */
 936
 937 /* Returns the type of LEXER's current token. */
 938 enum token_type
 939 lex_token (const struct lexer *lexer)
 940 {
 941   return lex_next_token (lexer, 0);
 942 }
 943
 944 /* Returns the number in LEXER's current token.
 945
 946    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 947    tokens this function will always return zero. */
 948 double
 949 lex_tokval (const struct lexer *lexer)
 950 {
 951   return lex_next_tokval (lexer, 0);
 952 }
 953
 954 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 955
 956    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 957    this functions this function will always return NULL.
 958
 959    The UTF-8 encoding of the returned string is correct for variable names and
 960    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 961    data_in() to use it in a "union value".  */
 962 const char *
 963 lex_tokcstr (const struct lexer *lexer)
 964 {
 965   return lex_next_tokcstr (lexer, 0);
 966 }
 967
 968 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 969    null-terminated (but the null terminator is not included in the returned
 970    substring's 'length').
 971
 972    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 973    this functions this function will always return NULL.
 974
 975    The UTF-8 encoding of the returned string is correct for variable names and
 976    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 977    data_in() to use it in a "union value".  */
 978 struct substring
 979 lex_tokss (const struct lexer *lexer)
 980 {
 981   return lex_next_tokss (lexer, 0);
 982 }
 983 \f
 984 /* Looking ahead.
 985
 986    A value of 0 for N as an argument to any of these functions refers to the
 987    current token.  Lookahead is limited to the current command.  Any N greater
 988    than the number of tokens remaining in the current command will be treated
 989    as referring to a T_ENDCMD token. */
 990
 991 static const struct lex_token *
 992 lex_next__ (const struct lexer *lexer_, int n)
 993 {
 994   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 995   struct lex_source *src = lex_source__ (lexer);
 996
 997   if (src != NULL)
 998     return lex_source_next__ (src, n);
 999   else
1000     {
1001       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1002       return &stop_token;
1003     }
1004 }
1005
1006 static const struct lex_token *
1007 lex_source_next__ (const struct lex_source *src_, int n)
1008 {
1009   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1010   while (lex_stage_count (&src->lookahead) <= n)
1011     {
1012       if (!lex_stage_is_empty (&src->lookahead))
1013         {
1014           const struct lex_token *t = lex_stage_last (&src->lookahead);
1015           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1016             return t;
1017         }
1018
1019       lex_source_get_lookahead (src);
1020     }
1021
1022   return lex_stage_nth (&src->lookahead, n);
1023 }
1024
1025 /* Returns the "struct token" of the token N after the current one in LEXER.
1026    The returned pointer can be invalidated by pretty much any succeeding call
1027    into the lexer, although the string pointer within the returned token is
1028    only invalidated by consuming the token (e.g. with lex_get()). */
1029 const struct token *
1030 lex_next (const struct lexer *lexer, int n)
1031 {
1032   return &lex_next__ (lexer, n)->token;
1033 }
1034
1035 /* Returns the type of the token N after the current one in LEXER. */
1036 enum token_type
1037 lex_next_token (const struct lexer *lexer, int n)
1038 {
1039   return lex_next (lexer, n)->type;
1040 }
1041
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1051
1052 /* Returns the null-terminated string in the token N after the current one, in
1053    UTF-8 encoding.
1054
1055    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1056    this functions this function will always return NULL.
1057
1058    The UTF-8 encoding of the returned string is correct for variable names and
1059    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1060    data_in() to use it in a "union value".  */
1061 const char *
1062 lex_next_tokcstr (const struct lexer *lexer, int n)
1063 {
1064   return lex_next_tokss (lexer, n).string;
1065 }
1066
1067 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1068    The string is null-terminated (but the null terminator is not included in
1069    the returned substring's 'length').
1070
1071    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1072    tokens this functions this function will always return NULL.
1073
1074    The UTF-8 encoding of the returned string is correct for variable names and
1075    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1076    data_in() to use it in a "union value".  */
1077 struct substring
1078 lex_next_tokss (const struct lexer *lexer, int n)
1079 {
1080   return lex_next (lexer, n)->string;
1081 }
1082
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, returns a newly allocated empty string, so
   that the caller can treat the result uniformly. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source argument, so it must
     not be called when there is no source. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1095
1096 /* Returns true if the token N ahead of the current one was produced by macro
1097    expansion, false otherwise. */
1098 bool
1099 lex_next_is_from_macro (const struct lexer *lexer, int n)
1100 {
1101   return lex_next__ (lexer, n)->macro_rep != NULL;
1102 }
1103
1104 static bool
1105 lex_tokens_match (const struct token *actual, const struct token *expected)
1106 {
1107   if (actual->type != expected->type)
1108     return false;
1109
1110   switch (actual->type)
1111     {
1112     case T_POS_NUM:
1113     case T_NEG_NUM:
1114       return actual->number == expected->number;
1115
1116     case T_ID:
1117       return lex_id_match (expected->string, actual->string);
1118
1119     case T_STRING:
1120       return (actual->string.length == expected->string.length
1121               && !memcmp (actual->string.string, expected->string.string,
1122                           actual->string.length));
1123
1124     default:
1125       return true;
1126     }
1127 }
1128
1129 static size_t
1130 lex_at_phrase__ (struct lexer *lexer, const char *s)
1131 {
1132   struct string_lexer slex;
1133   struct token token;
1134
1135   size_t i = 0;
1136   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1137   while (string_lexer_next (&slex, &token))
1138     {
1139       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1140       token_uninit (&token);
1141       if (!match)
1142         return 0;
1143     }
1144   return i;
1145 }
1146
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens != 0;
}
1158
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips past those tokens and returns true.  Otherwise, consumes nothing and
   returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1173
/* Returns the number of new-line ('\n') bytes among the LENGTH bytes starting
   at S.  The bytes need not be null-terminated. */
static int
count_newlines (char *s, size_t length)
{
  int n_newlines = 0;

  for (size_t i = 0; i < length; i++)
    if (s[i] == '\n')
      n_newlines++;

  return n_newlines;
}
1189
1190 static int
1191 lex_token_get_last_line_number (const struct lex_source *src,
1192                                 const struct lex_token *token)
1193 {
1194   if (token->first_line == 0)
1195     return 0;
1196   else
1197     {
1198       char *token_str = &src->buffer[token->token_pos];
1199       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1200     }
1201 }
1202
1203 static int
1204 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1205 {
1206   const char *newline = memrchr (src->buffer, '\n', offset);
1207   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1208   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1209 }
1210
1211 static int
1212 lex_token_get_first_column (const struct lex_source *src,
1213                             const struct lex_token *token)
1214 {
1215   return lex_token_get_column__ (src, token->token_pos);
1216 }
1217
1218 static int
1219 lex_token_get_last_column (const struct lex_source *src,
1220                            const struct lex_token *token)
1221 {
1222   return lex_token_get_column__ (src, token->token_pos + token->token_len);
1223 }
1224
1225 static struct msg_location
1226 lex_token_location (const struct lex_source *src,
1227                     const struct lex_token *t0,
1228                     const struct lex_token *t1)
1229 {
1230   return (struct msg_location) {
1231     .file_name = src->reader->file_name,
1232     .first_line = t0->first_line,
1233     .last_line = lex_token_get_last_line_number (src, t1),
1234     .first_column = lex_token_get_first_column (src, t0),
1235     .last_column = lex_token_get_last_column (src, t1),
1236   };
1237 }
1238
1239 static struct msg_location *
1240 lex_token_location_rw (const struct lex_source *src,
1241                        const struct lex_token *t0,
1242                        const struct lex_token *t1)
1243 {
1244   struct msg_location location = lex_token_location (src, t0, t1);
1245   return msg_location_dup (&location);
1246 }
1247
/* Returns a duplicated msg_location (see lex_token_location_rw()) for the
   syntax in SRC from the token N0 ahead of the current one through the token
   N1 ahead of the current one. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1255
1256 /* Returns the 1-based line number of the start of the syntax that represents
1257    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1258    if the token is drawn from a source that does not have line numbers. */
1259 int
1260 lex_get_first_line_number (const struct lexer *lexer, int n)
1261 {
1262   const struct lex_source *src = lex_source__ (lexer);
1263   return src ? lex_source_next__ (src, n)->first_line : 0;
1264 }
1265
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   current source (e.g. for a T_STOP token) or if the token is drawn from a
   source that does not have line numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1283
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   current source (e.g. for a T_STOP token).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1297
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no current source (e.g. for a T_STOP token).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1311
1312 /* Returns the name of the syntax file from which the current command is drawn.
1313    Returns NULL for a T_STOP token or if the command's source does not have
1314    line numbers.
1315
1316    There is no version of this function that takes an N argument because
1317    lookahead only works to the end of a command and any given command is always
1318    within a single syntax file. */
1319 const char *
1320 lex_get_file_name (const struct lexer *lexer)
1321 {
1322   struct lex_source *src = lex_source__ (lexer);
1323   return src == NULL ? NULL : src->reader->file_name;
1324 }
1325
1326 /* Returns a newly allocated msg_location for the syntax that represents tokens
1327    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1328    must eventually free the location (with msg_location_destroy()). */
1329 struct msg_location *
1330 lex_get_location (const struct lexer *lexer, int n0, int n1)
1331 {
1332   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1333   loc->first_column = lex_get_first_column (lexer, n0);
1334   loc->last_column = lex_get_last_column (lexer, n1);
1335   return loc;
1336 }
1337
1338 /* Returns a newly allocated msg_location for the syntax that represents tokens
1339    with 0-based offsets N0...N1, inclusive, from the current token.  The
1340    location only covers the tokens' lines, not the columns.  The caller must
1341    eventually free the location (with msg_location_destroy()). */
1342 struct msg_location *
1343 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1344 {
1345   struct msg_location *loc = xmalloc (sizeof *loc);
1346   *loc = (struct msg_location) {
1347     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1348     .first_line = lex_get_first_line_number (lexer, n0),
1349     .last_line = lex_get_last_line_number (lexer, n1),
1350   };
1351   return loc;
1352 }
1353
1354 const char *
1355 lex_get_encoding (const struct lexer *lexer)
1356 {
1357   struct lex_source *src = lex_source__ (lexer);
1358   return src == NULL ? NULL : src->reader->encoding;
1359 }
1360
1361 /* Returns the syntax mode for the syntax file from which the current drawn is
1362    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1363    does not have line numbers.
1364
1365    There is no version of this function that takes an N argument because
1366    lookahead only works to the end of a command and any given command is always
1367    within a single syntax file. */
1368 enum segmenter_mode
1369 lex_get_syntax_mode (const struct lexer *lexer)
1370 {
1371   struct lex_source *src = lex_source__ (lexer);
1372   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1373 }
1374
1375 /* Returns the error mode for the syntax file from which the current drawn is
1376    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1377    source does not have line numbers.
1378
1379    There is no version of this function that takes an N argument because
1380    lookahead only works to the end of a command and any given command is always
1381    within a single syntax file. */
1382 enum lex_error_mode
1383 lex_get_error_mode (const struct lexer *lexer)
1384 {
1385   struct lex_source *src = lex_source__ (lexer);
1386   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1387 }
1388
1389 /* If the source that LEXER is currently reading has error mode
1390    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1391    token to be read comes directly from whatever is next read from the stream.
1392
1393    It makes sense to call this function after encountering an error in a
1394    command entered on the console, because usually the user would prefer not to
1395    have cascading errors. */
1396 void
1397 lex_interactive_reset (struct lexer *lexer)
1398 {
1399   struct lex_source *src = lex_source__ (lexer);
1400   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1401     {
1402       src->length = 0;
1403       src->journal_pos = src->seg_pos = 0;
1404       src->n_newlines = 0;
1405       src->suppress_next_newline = false;
1406       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1407                                        false);
1408       lex_stage_clear (&src->pp);
1409       lex_stage_clear (&src->merge);
1410       lex_stage_clear (&src->lookahead);
1411       lex_source_push_endcmd__ (src);
1412     }
1413 }
1414
1415 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1416 void
1417 lex_discard_rest_of_command (struct lexer *lexer)
1418 {
1419   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1420     lex_get (lexer);
1421 }
1422
1423 /* Discards all lookahead tokens in LEXER, then discards all input sources
1424    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1425    runs out of input sources. */
1426 void
1427 lex_discard_noninteractive (struct lexer *lexer)
1428 {
1429   struct lex_source *src = lex_source__ (lexer);
1430
1431   if (src != NULL)
1432     {
1433       lex_stage_clear (&src->pp);
1434       lex_stage_clear (&src->merge);
1435       lex_stage_clear (&src->lookahead);
1436
1437       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1438            src = lex_source__ (lexer))
1439         lex_source_destroy (src);
1440     }
1441 }
1442 \f
1443 static void
1444 lex_source_expand__ (struct lex_source *src)
1445 {
1446   if (src->length >= src->allocated)
1447     src->buffer = x2realloc (src->buffer, &src->allocated);
1448 }
1449
1450 static void
1451 lex_source_read__ (struct lex_source *src)
1452 {
1453   do
1454     {
1455       lex_source_expand__ (src);
1456
1457       size_t space = src->allocated - src->length;
1458       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1459       size_t n = src->reader->class->read (src->reader,
1460                                            &src->buffer[src->length],
1461                                            space, prompt);
1462       assert (n <= space);
1463
1464       if (n == 0)
1465         {
1466           /* End of input. */
1467           src->reader->eof = true;
1468           lex_source_expand__ (src);
1469           return;
1470         }
1471
1472       src->length += n;
1473     }
1474   while (!memchr (&src->buffer[src->seg_pos], '\n',
1475                   src->length - src->seg_pos));
1476 }
1477
1478 static struct lex_source *
1479 lex_source__ (const struct lexer *lexer)
1480 {
1481   return (ll_is_empty (&lexer->sources) ? NULL
1482           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1483 }
1484
1485 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1486    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1487    and N1 are both zero, this requests the syntax for the current token.)  The
1488    caller must eventually free the returned string (with free()).  The syntax
1489    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1490    for example, it may include comments, spaces, and new-lines if it spans
1491    multiple tokens.  Macro expansion, however, has already been performed. */
1492 static char *
1493 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1494 {
1495   struct string s = DS_EMPTY_INITIALIZER;
1496   for (size_t i = n0; i <= n1; )
1497     {
1498       /* Find [I,J) as the longest sequence of tokens not produced by macro
1499          expansion, or otherwise the longest sequence expanded from a single
1500          macro call. */
1501       const struct lex_token *first = lex_source_next__ (src, i);
1502       size_t j;
1503       for (j = i + 1; j <= n1; j++)
1504         {
1505           const struct lex_token *cur = lex_source_next__ (src, j);
1506           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1507               || first->macro_rep != cur->macro_rep)
1508             break;
1509         }
1510       const struct lex_token *last = lex_source_next__ (src, j - 1);
1511
1512       /* Now add the syntax for this sequence of tokens to SRC. */
1513       if (!ds_is_empty (&s))
1514         ds_put_byte (&s, ' ');
1515       if (!first->macro_rep)
1516         {
1517           size_t start = first->token_pos;
1518           size_t end = last->token_pos + last->token_len;
1519           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1520         }
1521       else
1522         {
1523           size_t start = first->ofs;
1524           size_t end = last->ofs + last->len;
1525           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1526                                            end - start));
1527         }
1528
1529       i = j;
1530     }
1531   return ds_steal_cstr (&s);
1532 }
1533
1534 static bool
1535 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1536 {
1537   for (size_t i = n0; i <= n1; i++)
1538     if (lex_source_next__ (src, i)->macro_rep)
1539       return true;
1540   return false;
1541 }
1542
1543 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1544    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1545    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1546    the original form supplied to the lexer so that, for example, it may include
1547    comments, spaces, and new-lines if it spans multiple tokens.
1548
1549    Returns an empty string if the token range doesn't include a macro call.
1550
1551    The caller must not modify or free the returned string. */
1552 static struct substring
1553 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1554 {
1555   if (!lex_source_contains_macro_call (src, n0, n1))
1556     return ss_empty ();
1557
1558   const struct lex_token *token0 = lex_source_next__ (src, n0);
1559   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1560   size_t start = token0->token_pos;
1561   size_t end = token1->token_pos + token1->token_len;
1562
1563   return ss_buffer (&src->buffer[start], end - start);
1564 }
1565
1566 static void
1567 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1568                          const char *format, va_list args)
1569 {
1570   const struct lex_token *token;
1571   struct string s;
1572
1573   ds_init_empty (&s);
1574
1575   token = lex_source_next__ (src, n0);
1576   if (token->token.type == T_ENDCMD)
1577     ds_put_cstr (&s, _("Syntax error at end of command"));
1578   else
1579     {
1580       /* Get the syntax that caused the error. */
1581       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1582       char syntax[64];
1583       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1584       free (raw_syntax);
1585
1586       /* Get the macro call(s) that expanded to the syntax that caused the
1587          error. */
1588       char call[64];
1589       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1590                      call, sizeof call);
1591
1592       if (syntax[0])
1593         {
1594           if (call[0])
1595             ds_put_format (&s,
1596                            _("Syntax error at `%s' (in expansion of `%s')"),
1597                            syntax, call);
1598           else
1599             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1600         }
1601       else
1602         {
1603           if (call[0])
1604             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1605                            call);
1606           else
1607             ds_put_cstr (&s, _("Syntax error"));
1608         }
1609     }
1610
1611   if (format)
1612     {
1613       ds_put_cstr (&s, ": ");
1614       ds_put_vformat (&s, format, args);
1615     }
1616   if (ds_last (&s) != '.')
1617     ds_put_byte (&s, '.');
1618
1619   struct msg *m = xmalloc (sizeof *m);
1620   *m = (struct msg) {
1621     .category = MSG_C_SYNTAX,
1622     .severity = MSG_S_ERROR,
1623     .location = lex_source_get_location (src, n0, n1),
1624     .text = ds_steal_cstr (&s),
1625   };
1626   msg_emit (m);
1627 }
1628
1629 static void
1630 lex_get_error (struct lex_source *src, const struct lex_token *token)
1631 {
1632   char syntax[64];
1633   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1634                  syntax, sizeof syntax);
1635
1636   struct string s = DS_EMPTY_INITIALIZER;
1637   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1638   ds_put_format (&s, ": %s", token->token.string.string);
1639
1640   struct msg *m = xmalloc (sizeof *m);
1641   *m = (struct msg) {
1642     .category = MSG_C_SYNTAX,
1643     .severity = MSG_S_ERROR,
1644     .location = lex_token_location_rw (src, token, token),
1645     .text = ds_steal_cstr (&s),
1646   };
1647   msg_emit (m);
1648 }
1649
1650 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1651    underlying lex_reader if necessary.  Returns true if a new token was added
1652    to SRC's deque, false otherwise.  The caller should retry failures unless
1653    SRC's 'eof' marker was set to true indicating that there will be no more
1654    tokens from this source. */
1655 static bool
1656 lex_source_try_get_pp (struct lex_source *src)
1657 {
1658   /* Append a new token to SRC and initialize it. */
1659   struct lex_token *token = xmalloc (sizeof *token);
1660   token->token = (struct token) { .type = T_STOP };
1661   token->macro_rep = NULL;
1662   token->ref_cnt = NULL;
1663   token->token_pos = src->seg_pos;
1664   if (src->reader->line_number > 0)
1665     token->first_line = src->reader->line_number + src->n_newlines;
1666   else
1667     token->first_line = 0;
1668
1669   /* Extract a segment. */
1670   const char *segment;
1671   enum segment_type seg_type;
1672   int seg_len;
1673   for (;;)
1674     {
1675       segment = &src->buffer[src->seg_pos];
1676       seg_len = segmenter_push (&src->segmenter, segment,
1677                                 src->length - src->seg_pos,
1678                                 src->reader->eof, &seg_type);
1679       if (seg_len >= 0)
1680         break;
1681
1682       /* The segmenter needs more input to produce a segment. */
1683       assert (!src->reader->eof);
1684       lex_source_read__ (src);
1685     }
1686
1687   /* Update state based on the segment. */
1688   token->token_len = seg_len;
1689   src->seg_pos += seg_len;
1690   if (seg_type == SEG_NEWLINE)
1691     src->n_newlines++;
1692
1693   /* Get a token from the segment. */
1694   enum tokenize_result result = token_from_segment (
1695     seg_type, ss_buffer (segment, seg_len), &token->token);
1696
1697   /* If we've reached the end of a line, or the end of a command, then pass
1698      the line to the output engine as a syntax text item.  */
1699   int n_lines = seg_type == SEG_NEWLINE;
1700   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1701     {
1702       n_lines++;
1703       src->suppress_next_newline = true;
1704     }
1705   else if (n_lines > 0 && src->suppress_next_newline)
1706     {
1707       n_lines--;
1708       src->suppress_next_newline = false;
1709     }
1710   for (int i = 0; i < n_lines; i++)
1711     {
1712       /* Beginning of line. */
1713       const char *line = &src->buffer[src->journal_pos];
1714
1715       /* Calculate line length, including \n or \r\n end-of-line if present.
1716
1717          We use src->head even though that may be beyond what we've actually
1718          converted to tokens (which is only through line_pos).  That's because,
1719          if we're emitting the line due to SEG_END_COMMAND, we want to take the
1720          whole line through the newline, not just through the '.'. */
1721       size_t max_len = src->length - src->journal_pos;
1722       const char *newline = memchr (line, '\n', max_len);
1723       size_t line_len = newline ? newline - line + 1 : max_len;
1724
1725       /* Calculate line length excluding end-of-line. */
1726       size_t copy_len = line_len;
1727       if (copy_len > 0 && line[copy_len - 1] == '\n')
1728         copy_len--;
1729       if (copy_len > 0 && line[copy_len - 1] == '\r')
1730         copy_len--;
1731
1732       /* Submit the line as syntax. */
1733       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1734                                                    xmemdup0 (line, copy_len),
1735                                                    NULL));
1736
1737       src->journal_pos += line_len;
1738     }
1739
1740   switch (result)
1741     {
1742     case TOKENIZE_ERROR:
1743       lex_get_error (src, token);
1744       /* Fall through. */
1745     case TOKENIZE_EMPTY:
1746       lex_token_destroy (token);
1747       return false;
1748
1749     case TOKENIZE_TOKEN:
1750       if (token->token.type == T_STOP)
1751         {
1752           token->token.type = T_ENDCMD;
1753           src->eof = true;
1754         }
1755       lex_stage_push_last (&src->pp, token);
1756       return true;
1757     }
1758   NOT_REACHED ();
1759 }
1760
1761 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1762    failure.  On failure, the end of SRC has been reached and no more tokens
1763    will be forthcoming from it.
1764
1765    Does not make the new token available for lookahead yet; the caller must
1766    adjust SRC's 'middle' pointer to do so. */
1767 static bool
1768 lex_source_get_pp (struct lex_source *src)
1769 {
1770   while (!src->eof)
1771     if (lex_source_try_get_pp (src))
1772       return true;
1773   return false;
1774 }
1775
1776 static bool
1777 lex_source_try_get_merge (const struct lex_source *src_)
1778 {
1779   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1780
1781   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1782     return false;
1783
1784   if (!settings_get_mexpand ())
1785     {
1786       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1787       return true;
1788     }
1789
1790   /* Now pass tokens one-by-one to the macro expander.
1791
1792      In the common case where there is no macro to expand, the loop is not
1793      entered.  */
1794   struct macro_call *mc;
1795   int n_call = macro_call_create (src->lexer->macros,
1796                                   &lex_stage_first (&src->pp)->token, &mc);
1797   for (int ofs = 1; !n_call; ofs++)
1798     {
1799       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1800         {
1801           /* This should not be reachable because we always get a T_ENDCMD at
1802              the end of an input file (transformed from T_STOP by
1803              lex_source_try_get_pp()) and the macro_expander should always
1804              terminate expansion on T_ENDCMD. */
1805           NOT_REACHED ();
1806         }
1807
1808       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1809       size_t start = t->token_pos;
1810       size_t end = t->token_pos + t->token_len;
1811       const struct macro_token mt = {
1812         .token = t->token,
1813         .syntax = ss_buffer (&src->buffer[start], end - start),
1814       };
1815       const struct msg_location loc = lex_token_location (src, t, t);
1816       n_call = macro_call_add (mc, &mt, &loc);
1817     }
1818   if (n_call < 0)
1819     {
1820       /* False alarm: no macro expansion after all.  Use first token as
1821          lookahead.  We'll retry macro expansion from the second token next
1822          time around. */
1823       macro_call_destroy (mc);
1824       lex_stage_shift (&src->merge, &src->pp, 1);
1825       return true;
1826     }
1827
1828   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1829      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1830      Expand them.  */
1831   const struct lex_token *c0 = lex_stage_first (&src->pp);
1832   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1833   struct macro_tokens expansion = { .n = 0 };
1834   struct msg_location loc = lex_token_location (src, c0, c1);
1835   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1836   macro_call_destroy (mc);
1837
1838   /* Convert the macro expansion into syntax for possible error messages
1839      later. */
1840   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1841   size_t *len = xnmalloc (expansion.n, sizeof *len);
1842   struct string s = DS_EMPTY_INITIALIZER;
1843   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1844
1845   if (settings_get_mprint ())
1846     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1847                                           _("Macro Expansion")));
1848
1849   /* Append the macro expansion tokens to the lookahead. */
1850   if (expansion.n > 0)
1851     {
1852       char *macro_rep = ds_steal_cstr (&s);
1853       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1854       *ref_cnt = expansion.n;
1855       for (size_t i = 0; i < expansion.n; i++)
1856         {
1857           struct lex_token *token = xmalloc (sizeof *token);
1858           *token = (struct lex_token) {
1859             .token = expansion.mts[i].token,
1860             .token_pos = c0->token_pos,
1861             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1862             .first_line = c0->first_line,
1863             .macro_rep = macro_rep,
1864             .ofs = ofs[i],
1865             .len = len[i],
1866             .ref_cnt = ref_cnt,
1867           };
1868           lex_stage_push_last (&src->merge, token);
1869
1870           ss_dealloc (&expansion.mts[i].syntax);
1871         }
1872     }
1873   else
1874     ds_destroy (&s);
1875   free (expansion.mts);
1876   free (ofs);
1877   free (len);
1878
1879   /* Destroy the tokens for the call. */
1880   for (size_t i = 0; i < n_call; i++)
1881     lex_stage_pop_first (&src->pp);
1882
1883   return expansion.n > 0;
1884 }
1885
1886 /* Attempts to obtain at least one new token into 'merge' in SRC.
1887
1888    Returns true if successful, false on failure.  In the latter case, SRC is
1889    exhausted and 'src->eof' is now true. */
1890 static bool
1891 lex_source_get_merge (struct lex_source *src)
1892 {
1893   while (!src->eof)
1894     if (lex_source_try_get_merge (src))
1895       return true;
1896   return false;
1897 }
1898
1899 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1900
1901    Returns true if successful, false on failure.  In the latter case, SRC is
1902    exhausted and 'src->eof' is now true. */
1903 static bool
1904 lex_source_get_lookahead (struct lex_source *src)
1905 {
1906   struct merger m = MERGER_INIT;
1907   struct token out;
1908   for (size_t i = 0; ; i++)
1909     {
1910       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1911         {
1912           /* We always get a T_ENDCMD at the end of an input file
1913              (transformed from T_STOP by lex_source_try_get_pp()) and
1914              merger_add() should never return -1 on T_ENDCMD. */
1915           assert (lex_stage_is_empty (&src->merge));
1916           return false;
1917         }
1918
1919       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1920                                &out);
1921       if (!retval)
1922         {
1923           lex_stage_shift (&src->lookahead, &src->merge, 1);
1924           return true;
1925         }
1926       else if (retval > 0)
1927         {
1928           /* Add a token that merges all the tokens together. */
1929           const struct lex_token *first = lex_stage_first (&src->merge);
1930           const struct lex_token *last = lex_stage_nth (&src->merge,
1931                                                         retval - 1);
1932           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1933           struct lex_token *t = xmalloc (sizeof *t);
1934           *t = (struct lex_token) {
1935             .token = out,
1936             .token_pos = first->token_pos,
1937             .token_len = (last->token_pos - first->token_pos) + last->token_len,
1938             .first_line = first->first_line,
1939
1940             /* This works well if all the tokens were not expanded from macros,
1941                or if they came from the same macro expansion.  It just gives up
1942                in the other (corner) cases. */
1943             .macro_rep = macro ? first->macro_rep : NULL,
1944             .ofs = macro ? first->ofs : 0,
1945             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1946             .ref_cnt = macro ? first->ref_cnt : NULL,
1947           };
1948           if (t->ref_cnt)
1949             ++*t->ref_cnt;
1950           lex_stage_push_last (&src->lookahead, t);
1951
1952           for (int i = 0; i < retval; i++)
1953             lex_stage_pop_first (&src->merge);
1954           return true;
1955         }
1956     }
1957 }
1958 \f
1959 static void
1960 lex_source_push_endcmd__ (struct lex_source *src)
1961 {
1962   assert (lex_stage_is_empty (&src->lookahead));
1963   struct lex_token *token = xmalloc (sizeof *token);
1964   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1965   lex_stage_push_last (&src->lookahead, token);
1966 }
1967
1968 static struct lex_source *
1969 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1970 {
1971   struct lex_source *src = xmalloc (sizeof *src);
1972   *src = (struct lex_source) {
1973     .reader = reader,
1974     .segmenter = segmenter_init (reader->syntax, false),
1975     .lexer = lexer,
1976   };
1977
1978   lex_source_push_endcmd__ (src);
1979
1980   return src;
1981 }
1982
1983 static void
1984 lex_source_destroy (struct lex_source *src)
1985 {
1986   char *file_name = src->reader->file_name;
1987   char *encoding = src->reader->encoding;
1988   if (src->reader->class->destroy != NULL)
1989     src->reader->class->destroy (src->reader);
1990   free (file_name);
1991   free (encoding);
1992   free (src->buffer);
1993   lex_stage_uninit (&src->pp);
1994   lex_stage_uninit (&src->merge);
1995   lex_stage_uninit (&src->lookahead);
1996   ll_remove (&src->ll);
1997   free (src);
1998 }
1999 \f
2000 struct lex_file_reader
2001   {
2002     struct lex_reader reader;
2003     struct u8_istream *istream;
2004   };
2005
2006 static struct lex_reader_class lex_file_reader_class;
2007
2008 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2009    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2010    ENCODING, which should take one of the forms accepted by
2011    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2012    mode of the new reader, respectively.
2013
2014    Returns a null pointer if FILE_NAME cannot be opened. */
2015 struct lex_reader *
2016 lex_reader_for_file (const char *file_name, const char *encoding,
2017                      enum segmenter_mode syntax,
2018                      enum lex_error_mode error)
2019 {
2020   struct lex_file_reader *r;
2021   struct u8_istream *istream;
2022
2023   istream = (!strcmp(file_name, "-")
2024              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2025              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2026   if (istream == NULL)
2027     {
2028       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2029       return NULL;
2030     }
2031
2032   r = xmalloc (sizeof *r);
2033   lex_reader_init (&r->reader, &lex_file_reader_class);
2034   r->reader.syntax = syntax;
2035   r->reader.error = error;
2036   r->reader.file_name = xstrdup (file_name);
2037   r->reader.encoding = xstrdup_if_nonnull (encoding);
2038   r->reader.line_number = 1;
2039   r->istream = istream;
2040
2041   return &r->reader;
2042 }
2043
2044 static struct lex_file_reader *
2045 lex_file_reader_cast (struct lex_reader *r)
2046 {
2047   return UP_CAST (r, struct lex_file_reader, reader);
2048 }
2049
2050 static size_t
2051 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2052                enum prompt_style prompt_style UNUSED)
2053 {
2054   struct lex_file_reader *r = lex_file_reader_cast (r_);
2055   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2056   if (n_read < 0)
2057     {
2058       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2059       return 0;
2060     }
2061   return n_read;
2062 }
2063
2064 static void
2065 lex_file_close (struct lex_reader *r_)
2066 {
2067   struct lex_file_reader *r = lex_file_reader_cast (r_);
2068
2069   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2070     {
2071       if (u8_istream_close (r->istream) != 0)
2072         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2073     }
2074   else
2075     u8_istream_free (r->istream);
2076
2077   free (r);
2078 }
2079
2080 static struct lex_reader_class lex_file_reader_class =
2081   {
2082     lex_file_read,
2083     lex_file_close
2084   };
2085 \f
2086 struct lex_string_reader
2087   {
2088     struct lex_reader reader;
2089     struct substring s;
2090     size_t offset;
2091   };
2092
2093 static struct lex_reader_class lex_string_reader_class;
2094
2095 /* Creates and returns a new lex_reader for the contents of S, which must be
2096    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2097    with ss_dealloc() when it is closed. */
2098 struct lex_reader *
2099 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2100 {
2101   struct lex_string_reader *r;
2102
2103   r = xmalloc (sizeof *r);
2104   lex_reader_init (&r->reader, &lex_string_reader_class);
2105   r->reader.syntax = SEG_MODE_AUTO;
2106   r->reader.encoding = xstrdup_if_nonnull (encoding);
2107   r->s = s;
2108   r->offset = 0;
2109
2110   return &r->reader;
2111 }
2112
2113 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2114    which must be encoded in ENCODING.  The caller retains ownership of S. */
2115 struct lex_reader *
2116 lex_reader_for_string (const char *s, const char *encoding)
2117 {
2118   struct substring ss;
2119   ss_alloc_substring (&ss, ss_cstr (s));
2120   return lex_reader_for_substring_nocopy (ss, encoding);
2121 }
2122
/* Expands the printf()-like format string FORMAT with the arguments that
   follow ENCODING, then creates and returns a new lex_reader that reads the
   resulting text, treating it as encoded in ENCODING.  The reader owns the
   formatted text. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2137
2138 static struct lex_string_reader *
2139 lex_string_reader_cast (struct lex_reader *r)
2140 {
2141   return UP_CAST (r, struct lex_string_reader, reader);
2142 }
2143
2144 static size_t
2145 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2146                  enum prompt_style prompt_style UNUSED)
2147 {
2148   struct lex_string_reader *r = lex_string_reader_cast (r_);
2149   size_t chunk;
2150
2151   chunk = MIN (n, r->s.length - r->offset);
2152   memcpy (buf, r->s.string + r->offset, chunk);
2153   r->offset += chunk;
2154
2155   return chunk;
2156 }
2157
2158 static void
2159 lex_string_close (struct lex_reader *r_)
2160 {
2161   struct lex_string_reader *r = lex_string_reader_cast (r_);
2162
2163   ss_dealloc (&r->s);
2164   free (r);
2165 }
2166
2167 static struct lex_reader_class lex_string_reader_class =
2168   {
2169     lex_string_read,
2170     lex_string_close
2171   };