pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/output-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
  58 /* A token within a lex_source. */
  59 struct lex_token
  60   {
  61     /* The regular token information. */
  62     struct token token;
  63
  64     /* For a token obtained through the lexer in an ordinary way, this is the
  65        location of the token in terms of the lex_source's buffer.
  66
  67        For a token produced through macro expansion, this is the entire macro
  68        call. */
  69     size_t token_pos;           /* Offset into src->buffer of token start. */
  70     size_t token_len;           /* Length of source for token in bytes. */
  71     size_t line_pos;            /* Start of line containing token_pos. */
  72     int first_line;             /* Line number at token_pos. */
  73
  74     /* For a token obtained through macro expansion, this is just this token.
  75
  76        For a token obtained through the lexer in an ordinary way, these are
  77        nulls and zeros. */
  78     char *macro_rep;        /* The whole macro expansion. */
  79     size_t ofs;             /* Offset of this token in macro_rep. */
  80     size_t len;             /* Length of this token in macro_rep. */
  81     size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  82   };
  83
  84 static void
  85 lex_token_destroy (struct lex_token *t)
  86 {
  87   token_uninit (&t->token);
  88   if (t->ref_cnt)
  89     {
  90       assert (*t->ref_cnt > 0);
  91       if (!--*t->ref_cnt)
  92         {
  93           free (t->macro_rep);
  94           free (t->ref_cnt);
  95         }
  96     }
  97   free (t);
  98 }
  99 \f
 100 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
 101    lex_source. */
 102 struct lex_stage
 103   {
 104     struct deque deque;
 105     struct lex_token **tokens;
 106   };
 107
 108 static void lex_stage_clear (struct lex_stage *);
 109 static void lex_stage_uninit (struct lex_stage *);
 110
 111 static size_t lex_stage_count (const struct lex_stage *);
 112 static bool lex_stage_is_empty (const struct lex_stage *);
 113
 114 static struct lex_token *lex_stage_last (struct lex_stage *);
 115 static struct lex_token *lex_stage_first (struct lex_stage *);
 116 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 117
 118 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 119 static void lex_stage_pop_first (struct lex_stage *);
 120
 121 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 122                              size_t n);
 123
 124 /* Deletes all the tokens from STAGE. */
 125 static void
 126 lex_stage_clear (struct lex_stage *stage)
 127 {
 128   while (!deque_is_empty (&stage->deque))
 129     lex_stage_pop_first (stage);
 130 }
 131
 132 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 133 static void
 134 lex_stage_uninit (struct lex_stage *stage)
 135 {
 136   lex_stage_clear (stage);
 137   free (stage->tokens);
 138 }
 139
 140 /* Returns true if STAGE contains no tokens, otherwise false. */
 141 static bool
 142 lex_stage_is_empty (const struct lex_stage *stage)
 143 {
 144   return deque_is_empty (&stage->deque);
 145 }
 146
 147 /* Returns the number of tokens in STAGE. */
 148 static size_t
 149 lex_stage_count (const struct lex_stage *stage)
 150 {
 151   return deque_count (&stage->deque);
 152 }
 153
 154 /* Returns the last token in STAGE, which must be nonempty.  The last token is
 155    the one accessed with the greatest lookahead. */
 156 static struct lex_token *
 157 lex_stage_last (struct lex_stage *stage)
 158 {
 159   return stage->tokens[deque_front (&stage->deque, 0)];
 160 }
 161
 162 /* Returns the first token in STAGE, which must be nonempty.
 163    The first token is the one accessed with the least lookahead. */
 164 static struct lex_token *
 165 lex_stage_first (struct lex_stage *stage)
 166 {
 167   return lex_stage_nth (stage, 0);
 168 }
 169
 170 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 171    lookahead) is 0, the second token is 1, and so on.  There must be at least
 172    INDEX + 1 tokens in STAGE. */
 173 static struct lex_token *
 174 lex_stage_nth (struct lex_stage *stage, size_t index)
 175 {
 176   return stage->tokens[deque_back (&stage->deque, index)];
 177 }
 178
 179 /* Adds TOKEN so that it becomes the last token in STAGE. */
 180 static void
 181 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 182 {
 183   if (deque_is_full (&stage->deque))
 184     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 185                                   sizeof *stage->tokens);
 186   stage->tokens[deque_push_front (&stage->deque)] = token;
 187 }
 188
 189 /* Removes the first token from STAGE and uninitializes it. */
 190 static void
 191 lex_stage_pop_first (struct lex_stage *stage)
 192 {
 193   lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
 194 }
 195
 196 /* Removes the first N tokens from SRC, appending them to DST as the last
 197    tokens. */
 198 static void
 199 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 200 {
 201   for (size_t i = 0; i < n; i++)
 202     {
 203       lex_stage_push_last (dst, lex_stage_first (src));
 204       deque_pop_back (&src->deque);
 205     }
 206 }
 207
 208 /* A source of tokens, corresponding to a syntax file.
 209
 210    This is conceptually a lex_reader wrapped with everything needed to convert
 211    its UTF-8 bytes into tokens. */
 212 struct lex_source
 213   {
 214     struct ll ll;               /* In lexer's list of sources. */
 215     struct lex_reader *reader;
 216     struct lexer *lexer;
 217     struct segmenter segmenter;
 218     bool eof;                   /* True if T_STOP was read from 'reader'. */
 219
 220     /* Buffer of UTF-8 bytes. */
 221     char *buffer;               /* Source file contents. */
 222     size_t length;              /* Number of bytes filled. */
 223     size_t allocated;           /* Number of bytes allocated. */
 224
 225     /* Offsets into 'buffer'. */
 226     size_t journal_pos;         /* First byte not yet output to journal. */
 227     size_t seg_pos;             /* First byte not yet scanned as token. */
 228     size_t line_pos;            /* First byte of line containing seg_pos. */
 229
 230     int n_newlines;             /* Number of new-lines up to seg_pos. */
 231     bool suppress_next_newline;
 232
 233     /* Tokens.
 234
 235        This is a pipeline with the following stages.  Each token eventually
 236        made available to the parser passes through of these stages.  The stages
 237        are named after the processing that happens in each one.
 238
 239        Initially, tokens come from the segmenter and scanner to 'pp':
 240
 241        - pp: Tokens that need to pass through the macro preprocessor to end up
 242          in 'merge'.
 243
 244        - merge: Tokens that need to pass through scan_merge() to end up in
 245          'lookahead'.
 246
 247        - lookahead: Tokens available to the client for parsing. */
 248     struct lex_stage pp;
 249     struct lex_stage merge;
 250     struct lex_stage lookahead;
 251   };
 252
 253 static struct lex_source *lex_source_create (struct lexer *,
 254                                              struct lex_reader *);
 255 static void lex_source_destroy (struct lex_source *);
 256
 257 /* Lexer. */
 258 struct lexer
 259   {
 260     struct ll_list sources;     /* Contains "struct lex_source"s. */
 261     struct macro_set *macros;
 262   };
 263
 264 static struct lex_source *lex_source__ (const struct lexer *);
 265 static char *lex_source_get_syntax__ (const struct lex_source *,
 266                                       int n0, int n1);
 267 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 268 static void lex_source_push_endcmd__ (struct lex_source *);
 269
 270 static bool lex_source_get_lookahead (struct lex_source *);
 271 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 272                                      const char *format, va_list)
 273    PRINTF_FORMAT (4, 0);
 274 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 275                                                   int n);
 276 \f
 277 /* Initializes READER with the specified CLASS and otherwise some reasonable
 278    defaults.  The caller should fill in the others members as desired. */
 279 void
 280 lex_reader_init (struct lex_reader *reader,
 281                  const struct lex_reader_class *class)
 282 {
 283   reader->class = class;
 284   reader->syntax = SEG_MODE_AUTO;
 285   reader->error = LEX_ERROR_CONTINUE;
 286   reader->file_name = NULL;
 287   reader->encoding = NULL;
 288   reader->line_number = 0;
 289   reader->eof = false;
 290 }
 291
 292 /* Frees any file name already in READER and replaces it by a copy of
 293    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 294 void
 295 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 296 {
 297   free (reader->file_name);
 298   reader->file_name = xstrdup_if_nonnull (file_name);
 299 }
 300 \f
 301 /* Creates and returns a new lexer. */
 302 struct lexer *
 303 lex_create (void)
 304 {
 305   struct lexer *lexer = xmalloc (sizeof *lexer);
 306   *lexer = (struct lexer) {
 307     .sources = LL_INITIALIZER (lexer->sources),
 308     .macros = macro_set_create (),
 309   };
 310   return lexer;
 311 }
 312
 313 /* Destroys LEXER. */
 314 void
 315 lex_destroy (struct lexer *lexer)
 316 {
 317   if (lexer != NULL)
 318     {
 319       struct lex_source *source, *next;
 320
 321       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 322         lex_source_destroy (source);
 323       macro_set_destroy (lexer->macros);
 324       free (lexer);
 325     }
 326 }
 327
 328 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 329    same name.  Takes ownership of M. */
 330 void
 331 lex_define_macro (struct lexer *lexer, struct macro *m)
 332 {
 333   macro_set_add (lexer->macros, m);
 334 }
 335
 336 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 337    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 338    token. */
 339 void
 340 lex_include (struct lexer *lexer, struct lex_reader *reader)
 341 {
 342   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 343   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 344 }
 345
 346 /* Appends READER to LEXER, so that it will be read after all other current
 347    readers have already been read. */
 348 void
 349 lex_append (struct lexer *lexer, struct lex_reader *reader)
 350 {
 351   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 352 }
 353 \f
 354 /* Advancing. */
 355
 356 /* Advances LEXER to the next token, consuming the current token. */
 357 void
 358 lex_get (struct lexer *lexer)
 359 {
 360   struct lex_source *src;
 361
 362   src = lex_source__ (lexer);
 363   if (src == NULL)
 364     return;
 365
 366   if (!lex_stage_is_empty (&src->lookahead))
 367     lex_stage_pop_first (&src->lookahead);
 368
 369   while (lex_stage_is_empty (&src->lookahead))
 370     if (!lex_source_get_lookahead (src))
 371       {
 372         lex_source_destroy (src);
 373         src = lex_source__ (lexer);
 374         if (src == NULL)
 375           return;
 376       }
 377 }
 378
 379 /* Advances LEXER by N tokens. */
 380 void
 381 lex_get_n (struct lexer *lexer, size_t n)
 382 {
 383   while (n-- > 0)
 384     lex_get (lexer);
 385 }
 386 \f
 387 /* Issuing errors. */
 388
 389 /* Prints a syntax error message containing the current token and
 390    given message MESSAGE (if non-null). */
 391 void
 392 lex_error (struct lexer *lexer, const char *format, ...)
 393 {
 394   va_list args;
 395
 396   va_start (args, format);
 397   lex_next_error_valist (lexer, 0, 0, format, args);
 398   va_end (args);
 399 }
 400
 401 /* Prints a syntax error message containing the current token and
 402    given message MESSAGE (if non-null). */
 403 void
 404 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 405 {
 406   lex_next_error_valist (lexer, 0, 0, format, args);
 407 }
 408
 409 /* Prints a syntax error message containing the current token and
 410    given message MESSAGE (if non-null). */
 411 void
 412 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 413 {
 414   va_list args;
 415
 416   va_start (args, format);
 417   lex_next_error_valist (lexer, n0, n1, format, args);
 418   va_end (args);
 419 }
 420
 421 /* Prints a syntax error message saying that one of the strings provided as
 422    varargs, up to the first NULL, is expected. */
 423 void
 424 (lex_error_expecting) (struct lexer *lexer, ...)
 425 {
 426   va_list args;
 427
 428   va_start (args, lexer);
 429   lex_error_expecting_valist (lexer, args);
 430   va_end (args);
 431 }
 432
 433 /* Prints a syntax error message saying that one of the options provided in
 434    ARGS, up to the first NULL, is expected. */
 435 void
 436 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 437 {
 438   enum { MAX_OPTIONS = 9 };
 439   const char *options[MAX_OPTIONS];
 440   int n = 0;
 441   while (n < MAX_OPTIONS)
 442     {
 443       const char *option = va_arg (args, const char *);
 444       if (!option)
 445         break;
 446
 447       options[n++] = option;
 448     }
 449   lex_error_expecting_array (lexer, options, n);
 450 }
 451
 452 void
 453 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 454 {
 455   switch (n)
 456     {
 457     case 0:
 458       lex_error (lexer, NULL);
 459       break;
 460
 461     case 1:
 462       lex_error (lexer, _("expecting %s"), options[0]);
 463       break;
 464
 465     case 2:
 466       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 467       break;
 468
 469     case 3:
 470       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 471                  options[2]);
 472       break;
 473
 474     case 4:
 475       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 476                  options[0], options[1], options[2], options[3]);
 477       break;
 478
 479     case 5:
 480       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 481                  options[0], options[1], options[2], options[3], options[4]);
 482       break;
 483
 484     case 6:
 485       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 486                  options[0], options[1], options[2], options[3], options[4],
 487                  options[5]);
 488       break;
 489
 490     case 7:
 491       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 492                  options[0], options[1], options[2], options[3], options[4],
 493                  options[5], options[6]);
 494       break;
 495
 496     case 8:
 497       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 498                  options[0], options[1], options[2], options[3], options[4],
 499                  options[5], options[6], options[7]);
 500       break;
 501
 502     default:
 503       lex_error (lexer, NULL);
 504     }
 505 }
 506
 507 /* Reports an error to the effect that subcommand SBC may only be specified
 508    once.
 509
 510    This function does not take a lexer as an argument or use lex_error(),
 511    because the result would ordinarily just be redundant: "Syntax error at
 512    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 513    not help the user find the error. */
 514 void
 515 lex_sbc_only_once (const char *sbc)
 516 {
 517   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 518 }
 519
 520 /* Reports an error to the effect that subcommand SBC is missing.
 521
 522    This function does not take a lexer as an argument or use lex_error(),
 523    because a missing subcommand can normally be detected only after the whole
 524    command has been parsed, and so lex_error() would always report "Syntax
 525    error at end of command", which does not help the user find the error. */
 526 void
 527 lex_sbc_missing (const char *sbc)
 528 {
 529   msg (SE, _("Required subcommand %s was not specified."), sbc);
 530 }
 531
 532 /* Reports an error to the effect that specification SPEC may only be specified
 533    once within subcommand SBC. */
 534 void
 535 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 536 {
 537   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 538              spec, sbc);
 539 }
 540
 541 /* Reports an error to the effect that specification SPEC is missing within
 542    subcommand SBC. */
 543 void
 544 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 545 {
 546   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 547              sbc, spec);
 548 }
 549
 550 /* Prints a syntax error message containing the current token and
 551    given message MESSAGE (if non-null). */
 552 void
 553 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 554                        const char *format, va_list args)
 555 {
 556   struct lex_source *src = lex_source__ (lexer);
 557
 558   if (src != NULL)
 559     lex_source_error_valist (src, n0, n1, format, args);
 560   else
 561     {
 562       struct string s;
 563
 564       ds_init_empty (&s);
 565       ds_put_format (&s, _("Syntax error at end of input"));
 566       if (format != NULL)
 567         {
 568           ds_put_cstr (&s, ": ");
 569           ds_put_vformat (&s, format, args);
 570         }
 571       if (ds_last (&s) != '.')
 572         ds_put_byte (&s, '.');
 573       msg (SE, "%s", ds_cstr (&s));
 574       ds_destroy (&s);
 575     }
 576 }
 577
 578 /* Checks that we're at end of command.
 579    If so, returns a successful command completion code.
 580    If not, flags a syntax error and returns an error command
 581    completion code. */
 582 int
 583 lex_end_of_command (struct lexer *lexer)
 584 {
 585   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 586     {
 587       lex_error (lexer, _("expecting end of command"));
 588       return CMD_FAILURE;
 589     }
 590   else
 591     return CMD_SUCCESS;
 592 }
 593 \f
 594 /* Token testing functions. */
 595
 596 /* Returns true if the current token is a number. */
 597 bool
 598 lex_is_number (const struct lexer *lexer)
 599 {
 600   return lex_next_is_number (lexer, 0);
 601 }
 602
 603 /* Returns true if the current token is a string. */
 604 bool
 605 lex_is_string (const struct lexer *lexer)
 606 {
 607   return lex_next_is_string (lexer, 0);
 608 }
 609
 610 /* Returns the value of the current token, which must be a
 611    floating point number. */
 612 double
 613 lex_number (const struct lexer *lexer)
 614 {
 615   return lex_next_number (lexer, 0);
 616 }
 617
 618 /* Returns true iff the current token is an integer. */
 619 bool
 620 lex_is_integer (const struct lexer *lexer)
 621 {
 622   return lex_next_is_integer (lexer, 0);
 623 }
 624
 625 /* Returns the value of the current token, which must be an
 626    integer. */
 627 long
 628 lex_integer (const struct lexer *lexer)
 629 {
 630   return lex_next_integer (lexer, 0);
 631 }
 632 \f
 633 /* Token testing functions with lookahead.
 634
 635    A value of 0 for N as an argument to any of these functions refers to the
 636    current token.  Lookahead is limited to the current command.  Any N greater
 637    than the number of tokens remaining in the current command will be treated
 638    as referring to a T_ENDCMD token. */
 639
 640 /* Returns true if the token N ahead of the current token is a number. */
 641 bool
 642 lex_next_is_number (const struct lexer *lexer, int n)
 643 {
 644   return token_is_number (lex_next (lexer, n));
 645 }
 646
 647 /* Returns true if the token N ahead of the current token is a string. */
 648 bool
 649 lex_next_is_string (const struct lexer *lexer, int n)
 650 {
 651   return token_is_string (lex_next (lexer, n));
 652 }
 653
 654 /* Returns the value of the token N ahead of the current token, which must be a
 655    floating point number. */
 656 double
 657 lex_next_number (const struct lexer *lexer, int n)
 658 {
 659   return token_number (lex_next (lexer, n));
 660 }
 661
 662 /* Returns true if the token N ahead of the current token is an integer. */
 663 bool
 664 lex_next_is_integer (const struct lexer *lexer, int n)
 665 {
 666   return token_is_integer (lex_next (lexer, n));
 667 }
 668
 669 /* Returns the value of the token N ahead of the current token, which must be
 670    an integer. */
 671 long
 672 lex_next_integer (const struct lexer *lexer, int n)
 673 {
 674   return token_integer (lex_next (lexer, n));
 675 }
 676 \f
 677 /* Token matching functions. */
 678
 679 /* If the current token has the specified TYPE, skips it and returns true.
 680    Otherwise, returns false. */
 681 bool
 682 lex_match (struct lexer *lexer, enum token_type type)
 683 {
 684   if (lex_token (lexer) == type)
 685     {
 686       lex_get (lexer);
 687       return true;
 688     }
 689   else
 690     return false;
 691 }
 692
 693 /* If the current token matches IDENTIFIER, skips it and returns true.
 694    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 695    returns false.
 696
 697    IDENTIFIER must be an ASCII string. */
 698 bool
 699 lex_match_id (struct lexer *lexer, const char *identifier)
 700 {
 701   return lex_match_id_n (lexer, identifier, 3);
 702 }
 703
 704 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 705    may be abbreviated to its first N letters.  Otherwise, returns false.
 706
 707    IDENTIFIER must be an ASCII string. */
 708 bool
 709 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 710 {
 711   if (lex_token (lexer) == T_ID
 712       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 713     {
 714       lex_get (lexer);
 715       return true;
 716     }
 717   else
 718     return false;
 719 }
 720
 721 /* If the current token is integer X, skips it and returns true.  Otherwise,
 722    returns false. */
 723 bool
 724 lex_match_int (struct lexer *lexer, int x)
 725 {
 726   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 727     {
 728       lex_get (lexer);
 729       return true;
 730     }
 731   else
 732     return false;
 733 }
 734 \f
 735 /* Forced matches. */
 736
 737 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 738    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 739    false.
 740
 741    IDENTIFIER must be an ASCII string. */
 742 bool
 743 lex_force_match_id (struct lexer *lexer, const char *identifier)
 744 {
 745   if (lex_match_id (lexer, identifier))
 746     return true;
 747   else
 748     {
 749       lex_error_expecting (lexer, identifier);
 750       return false;
 751     }
 752 }
 753
 754 /* If the current token has the specified TYPE, skips it and returns true.
 755    Otherwise, reports an error and returns false. */
 756 bool
 757 lex_force_match (struct lexer *lexer, enum token_type type)
 758 {
 759   if (lex_token (lexer) == type)
 760     {
 761       lex_get (lexer);
 762       return true;
 763     }
 764   else
 765     {
 766       const char *type_string = token_type_to_string (type);
 767       if (type_string)
 768         {
 769           char *s = xasprintf ("`%s'", type_string);
 770           lex_error_expecting (lexer, s);
 771           free (s);
 772         }
 773       else
 774         lex_error_expecting (lexer, token_type_to_name (type));
 775
 776       return false;
 777     }
 778 }
 779
 780 /* If the current token is a string, does nothing and returns true.
 781    Otherwise, reports an error and returns false. */
 782 bool
 783 lex_force_string (struct lexer *lexer)
 784 {
 785   if (lex_is_string (lexer))
 786     return true;
 787   else
 788     {
 789       lex_error (lexer, _("expecting string"));
 790       return false;
 791     }
 792 }
 793
 794 /* If the current token is a string or an identifier, does nothing and returns
 795    true.  Otherwise, reports an error and returns false.
 796
 797    This is meant for use in syntactic situations where we want to encourage the
 798    user to supply a quoted string, but for compatibility we also accept
 799    identifiers.  (One example of such a situation is file names.)  Therefore,
 800    the error message issued when the current token is wrong only says that a
 801    string is expected and doesn't mention that an identifier would also be
 802    accepted. */
 803 bool
 804 lex_force_string_or_id (struct lexer *lexer)
 805 {
 806   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 807 }
 808
 809 /* If the current token is an integer, does nothing and returns true.
 810    Otherwise, reports an error and returns false. */
 811 bool
 812 lex_force_int (struct lexer *lexer)
 813 {
 814   if (lex_is_integer (lexer))
 815     return true;
 816   else
 817     {
 818       lex_error (lexer, _("expecting integer"));
 819       return false;
 820     }
 821 }
 822
 823 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 824    nothing and returns true.  Otherwise, reports an error and returns false.
 825    If NAME is nonnull, then it is used in the error message. */
 826 bool
 827 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 828 {
 829   bool is_integer = lex_is_integer (lexer);
 830   bool too_small = is_integer && lex_integer (lexer) < min;
 831   bool too_big = is_integer && lex_integer (lexer) > max;
 832   if (is_integer && !too_small && !too_big)
 833     return true;
 834
 835   if (min > max)
 836     {
 837       /* Weird, maybe a bug in the caller.  Just report that we needed an
 838          integer. */
 839       if (name)
 840         lex_error (lexer, _("Integer expected for %s."), name);
 841       else
 842         lex_error (lexer, _("Integer expected."));
 843     }
 844   else if (min == max)
 845     {
 846       if (name)
 847         lex_error (lexer, _("Expected %ld for %s."), min, name);
 848       else
 849         lex_error (lexer, _("Expected %ld."), min);
 850     }
 851   else if (min + 1 == max)
 852     {
 853       if (name)
 854         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 855       else
 856         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 857     }
 858   else
 859     {
 860       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 861       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 862
 863       if (report_lower_bound && report_upper_bound)
 864         {
 865           if (name)
 866             lex_error (lexer,
 867                        _("Expected integer between %ld and %ld for %s."),
 868                        min, max, name);
 869           else
 870             lex_error (lexer, _("Expected integer between %ld and %ld."),
 871                        min, max);
 872         }
 873       else if (report_lower_bound)
 874         {
 875           if (min == 0)
 876             {
 877               if (name)
 878                 lex_error (lexer, _("Expected non-negative integer for %s."),
 879                            name);
 880               else
 881                 lex_error (lexer, _("Expected non-negative integer."));
 882             }
 883           else if (min == 1)
 884             {
 885               if (name)
 886                 lex_error (lexer, _("Expected positive integer for %s."),
 887                            name);
 888               else
 889                 lex_error (lexer, _("Expected positive integer."));
 890             }
 891         }
 892       else if (report_upper_bound)
 893         {
 894           if (name)
 895             lex_error (lexer,
 896                        _("Expected integer less than or equal to %ld for %s."),
 897                        max, name);
 898           else
 899             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 900                        max);
 901         }
 902       else
 903         {
 904           if (name)
 905             lex_error (lexer, _("Integer expected for %s."), name);
 906           else
 907             lex_error (lexer, _("Integer expected."));
 908         }
 909     }
 910   return false;
 911 }
 912
 913 /* If the current token is a number, does nothing and returns true.
 914    Otherwise, reports an error and returns false. */
 915 bool
 916 lex_force_num (struct lexer *lexer)
 917 {
 918   if (lex_is_number (lexer))
 919     return true;
 920
 921   lex_error (lexer, _("expecting number"));
 922   return false;
 923 }
 924
 925 /* If the current token is an identifier, does nothing and returns true.
 926    Otherwise, reports an error and returns false. */
 927 bool
 928 lex_force_id (struct lexer *lexer)
 929 {
 930   if (lex_token (lexer) == T_ID)
 931     return true;
 932
 933   lex_error (lexer, _("expecting identifier"));
 934   return false;
 935 }
 936 \f
 937 /* Token accessors. */
 938
 939 /* Returns the type of LEXER's current token. */
 940 enum token_type
 941 lex_token (const struct lexer *lexer)
 942 {
 943   return lex_next_token (lexer, 0);
 944 }
 945
 946 /* Returns the number in LEXER's current token.
 947
 948    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 949    tokens this function will always return zero. */
 950 double
 951 lex_tokval (const struct lexer *lexer)
 952 {
 953   return lex_next_tokval (lexer, 0);
 954 }
 955
 956 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 957
 958    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 959    this functions this function will always return NULL.
 960
 961    The UTF-8 encoding of the returned string is correct for variable names and
 962    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 963    data_in() to use it in a "union value".  */
 964 const char *
 965 lex_tokcstr (const struct lexer *lexer)
 966 {
 967   return lex_next_tokcstr (lexer, 0);
 968 }
 969
 970 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 971    null-terminated (but the null terminator is not included in the returned
 972    substring's 'length').
 973
 974    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 975    this functions this function will always return NULL.
 976
 977    The UTF-8 encoding of the returned string is correct for variable names and
 978    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 979    data_in() to use it in a "union value".  */
 980 struct substring
 981 lex_tokss (const struct lexer *lexer)
 982 {
 983   return lex_next_tokss (lexer, 0);
 984 }
 985 \f
 986 /* Looking ahead.
 987
 988    A value of 0 for N as an argument to any of these functions refers to the
 989    current token.  Lookahead is limited to the current command.  Any N greater
 990    than the number of tokens remaining in the current command will be treated
 991    as referring to a T_ENDCMD token. */
 992
 993 static const struct lex_token *
 994 lex_next__ (const struct lexer *lexer_, int n)
 995 {
 996   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 997   struct lex_source *src = lex_source__ (lexer);
 998
 999   if (src != NULL)
1000     return lex_source_next__ (src, n);
1001   else
1002     {
1003       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1004       return &stop_token;
1005     }
1006 }
1007
1008 static const struct lex_token *
1009 lex_source_next__ (const struct lex_source *src_, int n)
1010 {
1011   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1012   while (lex_stage_count (&src->lookahead) <= n)
1013     {
1014       if (!lex_stage_is_empty (&src->lookahead))
1015         {
1016           const struct lex_token *t = lex_stage_last (&src->lookahead);
1017           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1018             return t;
1019         }
1020
1021       lex_source_get_lookahead (src);
1022     }
1023
1024   return lex_stage_nth (&src->lookahead, n);
1025 }
1026
1027 /* Returns the "struct token" of the token N after the current one in LEXER.
1028    The returned pointer can be invalidated by pretty much any succeeding call
1029    into the lexer, although the string pointer within the returned token is
1030    only invalidated by consuming the token (e.g. with lex_get()). */
1031 const struct token *
1032 lex_next (const struct lexer *lexer, int n)
1033 {
1034   return &lex_next__ (lexer, n)->token;
1035 }
1036
1037 /* Returns the type of the token N after the current one in LEXER. */
1038 enum token_type
1039 lex_next_token (const struct lexer *lexer, int n)
1040 {
1041   return lex_next (lexer, n)->type;
1042 }
1043
/* Returns the numeric value of the token N positions after the current one in
   LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *ahead = lex_next (lexer, n);
  return token_number (ahead);
}
1053
1054 /* Returns the null-terminated string in the token N after the current one, in
1055    UTF-8 encoding.
1056
1057    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1058    this functions this function will always return NULL.
1059
1060    The UTF-8 encoding of the returned string is correct for variable names and
1061    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1062    data_in() to use it in a "union value".  */
1063 const char *
1064 lex_next_tokcstr (const struct lexer *lexer, int n)
1065 {
1066   return lex_next_tokss (lexer, n).string;
1067 }
1068
1069 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1070    The string is null-terminated (but the null terminator is not included in
1071    the returned substring's 'length').
1072
1073    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1074    tokens this functions this function will always return NULL.
1075
1076    The UTF-8 encoding of the returned string is correct for variable names and
1077    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1078    data_in() to use it in a "union value".  */
1079 struct substring
1080 lex_next_tokss (const struct lexer *lexer, int n)
1081 {
1082   return lex_next (lexer, n)->string;
1083 }
1084
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no source to read from, returns a newly allocated empty
   string, so that the caller can still treat the result uniformly. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source argument, so it must
     not be called when there is no source. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1097
1098 /* Returns true if the token N ahead of the current one was produced by macro
1099    expansion, false otherwise. */
1100 bool
1101 lex_next_is_from_macro (const struct lexer *lexer, int n)
1102 {
1103   return lex_next__ (lexer, n)->macro_rep != NULL;
1104 }
1105
1106 static bool
1107 lex_tokens_match (const struct token *actual, const struct token *expected)
1108 {
1109   if (actual->type != expected->type)
1110     return false;
1111
1112   switch (actual->type)
1113     {
1114     case T_POS_NUM:
1115     case T_NEG_NUM:
1116       return actual->number == expected->number;
1117
1118     case T_ID:
1119       return lex_id_match (expected->string, actual->string);
1120
1121     case T_STRING:
1122       return (actual->string.length == expected->string.length
1123               && !memcmp (actual->string.string, expected->string.string,
1124                           actual->string.length));
1125
1126     default:
1127       return true;
1128     }
1129 }
1130
1131 static size_t
1132 lex_at_phrase__ (struct lexer *lexer, const char *s)
1133 {
1134   struct string_lexer slex;
1135   struct token token;
1136
1137   size_t i = 0;
1138   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1139   while (string_lexer_next (&slex, &token))
1140     {
1141       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1142       token_uninit (&token);
1143       if (!match)
1144         return 0;
1145     }
1146   return i;
1147 }
1148
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  const size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens != 0;
}
1160
/* Checks whether LEXER is positioned at the sequence of tokens that may be
   parsed from S.  If so, skips past those tokens with lex_get_n() and returns
   true.  If not, leaves LEXER where it is and returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  const size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1175
/* Returns the number of '\n' bytes among the LENGTH bytes starting at S. */
static int
count_newlines (char *s, size_t length)
{
  const char *const end = s + length;
  int count = 0;

  /* Hop from one new-line to the next, resuming the search just past each
     one found. */
  for (const char *p = s; (p = memchr (p, '\n', end - p)) != NULL; p++)
    count++;

  return count;
}
1191
1192 static int
1193 lex_token_get_last_line_number (const struct lex_source *src,
1194                                 const struct lex_token *token)
1195 {
1196   if (token->first_line == 0)
1197     return 0;
1198   else
1199     {
1200       char *token_str = &src->buffer[token->token_pos];
1201       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1202     }
1203 }
1204
1205 static int
1206 lex_token_get_first_column (const struct lex_source *src,
1207                             const struct lex_token *token)
1208 {
1209   return utf8_count_columns (&src->buffer[token->line_pos],
1210                              token->token_pos - token->line_pos) + 1;
1211 }
1212
1213 static int
1214 lex_token_get_last_column (const struct lex_source *src,
1215                            const struct lex_token *token)
1216 {
1217   char *start, *end, *newline;
1218
1219   start = &src->buffer[token->line_pos];
1220   end = &src->buffer[token->token_pos + token->token_len];
1221   newline = memrchr (start, '\n', end - start);
1222   if (newline != NULL)
1223     start = newline + 1;
1224   return utf8_count_columns (start, end - start) + 1;
1225 }
1226
1227 static struct msg_location
1228 lex_token_location (const struct lex_source *src,
1229                     const struct lex_token *t0,
1230                     const struct lex_token *t1)
1231 {
1232   return (struct msg_location) {
1233     .file_name = src->reader->file_name,
1234     .first_line = t0->first_line,
1235     .last_line = lex_token_get_last_line_number (src, t1),
1236     .first_column = lex_token_get_first_column (src, t0),
1237     .last_column = lex_token_get_last_column (src, t1),
1238   };
1239 }
1240
1241 static struct msg_location *
1242 lex_token_location_rw (const struct lex_source *src,
1243                        const struct lex_token *t0,
1244                        const struct lex_token *t1)
1245 {
1246   struct msg_location location = lex_token_location (src, t0, t1);
1247   return msg_location_dup (&location);
1248 }
1249
/* Returns a msg_location, obtained from lex_token_location_rw(), that spans
   the tokens at lookahead offsets N0 through N1 in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1257
1258 /* Returns the 1-based line number of the start of the syntax that represents
1259    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1260    if the token is drawn from a source that does not have line numbers. */
1261 int
1262 lex_get_first_line_number (const struct lexer *lexer, int n)
1263 {
1264   const struct lex_source *src = lex_source__ (lexer);
1265   return src ? lex_source_next__ (src, n)->first_line : 0;
1266 }
1267
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token, if the token is drawn from a source that does not have line numbers,
   or if LEXER has no source.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1285
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP token
   or if LEXER has no source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1299
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if LEXER has no source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1313
1314 /* Returns the name of the syntax file from which the current command is drawn.
1315    Returns NULL for a T_STOP token or if the command's source does not have
1316    line numbers.
1317
1318    There is no version of this function that takes an N argument because
1319    lookahead only works to the end of a command and any given command is always
1320    within a single syntax file. */
1321 const char *
1322 lex_get_file_name (const struct lexer *lexer)
1323 {
1324   struct lex_source *src = lex_source__ (lexer);
1325   return src == NULL ? NULL : src->reader->file_name;
1326 }
1327
1328 /* Returns a newly allocated msg_location for the syntax that represents tokens
1329    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1330    must eventually free the location (with msg_location_destroy()). */
1331 struct msg_location *
1332 lex_get_location (const struct lexer *lexer, int n0, int n1)
1333 {
1334   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1335   loc->first_column = lex_get_first_column (lexer, n0);
1336   loc->last_column = lex_get_last_column (lexer, n1);
1337   return loc;
1338 }
1339
1340 /* Returns a newly allocated msg_location for the syntax that represents tokens
1341    with 0-based offsets N0...N1, inclusive, from the current token.  The
1342    location only covers the tokens' lines, not the columns.  The caller must
1343    eventually free the location (with msg_location_destroy()). */
1344 struct msg_location *
1345 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1346 {
1347   struct msg_location *loc = xmalloc (sizeof *loc);
1348   *loc = (struct msg_location) {
1349     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1350     .first_line = lex_get_first_line_number (lexer, n0),
1351     .last_line = lex_get_last_line_number (lexer, n1),
1352   };
1353   return loc;
1354 }
1355
1356 const char *
1357 lex_get_encoding (const struct lexer *lexer)
1358 {
1359   struct lex_source *src = lex_source__ (lexer);
1360   return src == NULL ? NULL : src->reader->encoding;
1361 }
1362
1363 /* Returns the syntax mode for the syntax file from which the current drawn is
1364    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1365    does not have line numbers.
1366
1367    There is no version of this function that takes an N argument because
1368    lookahead only works to the end of a command and any given command is always
1369    within a single syntax file. */
1370 enum segmenter_mode
1371 lex_get_syntax_mode (const struct lexer *lexer)
1372 {
1373   struct lex_source *src = lex_source__ (lexer);
1374   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1375 }
1376
1377 /* Returns the error mode for the syntax file from which the current drawn is
1378    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1379    source does not have line numbers.
1380
1381    There is no version of this function that takes an N argument because
1382    lookahead only works to the end of a command and any given command is always
1383    within a single syntax file. */
1384 enum lex_error_mode
1385 lex_get_error_mode (const struct lexer *lexer)
1386 {
1387   struct lex_source *src = lex_source__ (lexer);
1388   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1389 }
1390
1391 /* If the source that LEXER is currently reading has error mode
1392    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1393    token to be read comes directly from whatever is next read from the stream.
1394
1395    It makes sense to call this function after encountering an error in a
1396    command entered on the console, because usually the user would prefer not to
1397    have cascading errors. */
1398 void
1399 lex_interactive_reset (struct lexer *lexer)
1400 {
1401   struct lex_source *src = lex_source__ (lexer);
1402   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1403     {
1404       src->length = 0;
1405       src->journal_pos = src->seg_pos = src->line_pos = 0;
1406       src->n_newlines = 0;
1407       src->suppress_next_newline = false;
1408       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1409                                        false);
1410       lex_stage_clear (&src->pp);
1411       lex_stage_clear (&src->merge);
1412       lex_stage_clear (&src->lookahead);
1413       lex_source_push_endcmd__ (src);
1414     }
1415 }
1416
1417 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1418 void
1419 lex_discard_rest_of_command (struct lexer *lexer)
1420 {
1421   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1422     lex_get (lexer);
1423 }
1424
1425 /* Discards all lookahead tokens in LEXER, then discards all input sources
1426    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1427    runs out of input sources. */
1428 void
1429 lex_discard_noninteractive (struct lexer *lexer)
1430 {
1431   struct lex_source *src = lex_source__ (lexer);
1432
1433   if (src != NULL)
1434     {
1435       lex_stage_clear (&src->pp);
1436       lex_stage_clear (&src->merge);
1437       lex_stage_clear (&src->lookahead);
1438
1439       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1440            src = lex_source__ (lexer))
1441         lex_source_destroy (src);
1442     }
1443 }
1444 \f
1445 static void
1446 lex_source_expand__ (struct lex_source *src)
1447 {
1448   if (src->length >= src->allocated)
1449     src->buffer = x2realloc (src->buffer, &src->allocated);
1450 }
1451
1452 static void
1453 lex_source_read__ (struct lex_source *src)
1454 {
1455   do
1456     {
1457       lex_source_expand__ (src);
1458
1459       size_t space = src->allocated - src->length;
1460       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1461       size_t n = src->reader->class->read (src->reader,
1462                                            &src->buffer[src->length],
1463                                            space, prompt);
1464       assert (n <= space);
1465
1466       if (n == 0)
1467         {
1468           /* End of input. */
1469           src->reader->eof = true;
1470           lex_source_expand__ (src);
1471           return;
1472         }
1473
1474       src->length += n;
1475     }
1476   while (!memchr (&src->buffer[src->seg_pos], '\n',
1477                   src->length - src->seg_pos));
1478 }
1479
1480 static struct lex_source *
1481 lex_source__ (const struct lexer *lexer)
1482 {
1483   return (ll_is_empty (&lexer->sources) ? NULL
1484           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1485 }
1486
1487 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1488    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1489    and N1 are both zero, this requests the syntax for the current token.)  The
1490    caller must eventually free the returned string (with free()).  The syntax
1491    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1492    for example, it may include comments, spaces, and new-lines if it spans
1493    multiple tokens.  Macro expansion, however, has already been performed. */
1494 static char *
1495 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1496 {
1497   struct string s = DS_EMPTY_INITIALIZER;
1498   for (size_t i = n0; i <= n1; )
1499     {
1500       /* Find [I,J) as the longest sequence of tokens not produced by macro
1501          expansion, or otherwise the longest sequence expanded from a single
1502          macro call. */
1503       const struct lex_token *first = lex_source_next__ (src, i);
1504       size_t j;
1505       for (j = i + 1; j <= n1; j++)
1506         {
1507           const struct lex_token *cur = lex_source_next__ (src, j);
1508           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1509               || first->macro_rep != cur->macro_rep)
1510             break;
1511         }
1512       const struct lex_token *last = lex_source_next__ (src, j - 1);
1513
1514       /* Now add the syntax for this sequence of tokens to SRC. */
1515       if (!ds_is_empty (&s))
1516         ds_put_byte (&s, ' ');
1517       if (!first->macro_rep)
1518         {
1519           size_t start = first->token_pos;
1520           size_t end = last->token_pos + last->token_len;
1521           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1522         }
1523       else
1524         {
1525           size_t start = first->ofs;
1526           size_t end = last->ofs + last->len;
1527           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1528                                            end - start));
1529         }
1530
1531       i = j;
1532     }
1533   return ds_steal_cstr (&s);
1534 }
1535
1536 static bool
1537 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1538 {
1539   for (size_t i = n0; i <= n1; i++)
1540     if (lex_source_next__ (src, i)->macro_rep)
1541       return true;
1542   return false;
1543 }
1544
1545 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1546    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1547    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1548    the original form supplied to the lexer so that, for example, it may include
1549    comments, spaces, and new-lines if it spans multiple tokens.
1550
1551    Returns an empty string if the token range doesn't include a macro call.
1552
1553    The caller must not modify or free the returned string. */
1554 static struct substring
1555 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1556 {
1557   if (!lex_source_contains_macro_call (src, n0, n1))
1558     return ss_empty ();
1559
1560   const struct lex_token *token0 = lex_source_next__ (src, n0);
1561   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1562   size_t start = token0->token_pos;
1563   size_t end = token1->token_pos + token1->token_len;
1564
1565   return ss_buffer (&src->buffer[start], end - start);
1566 }
1567
1568 static void
1569 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1570                          const char *format, va_list args)
1571 {
1572   const struct lex_token *token;
1573   struct string s;
1574
1575   ds_init_empty (&s);
1576
1577   token = lex_source_next__ (src, n0);
1578   if (token->token.type == T_ENDCMD)
1579     ds_put_cstr (&s, _("Syntax error at end of command"));
1580   else
1581     {
1582       /* Get the syntax that caused the error. */
1583       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1584       char syntax[64];
1585       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1586       free (raw_syntax);
1587
1588       /* Get the macro call(s) that expanded to the syntax that caused the
1589          error. */
1590       char call[64];
1591       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1592                      call, sizeof call);
1593
1594       if (syntax[0])
1595         {
1596           if (call[0])
1597             ds_put_format (&s,
1598                            _("Syntax error at `%s' (in expansion of `%s')"),
1599                            syntax, call);
1600           else
1601             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1602         }
1603       else
1604         {
1605           if (call[0])
1606             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1607                            call);
1608           else
1609             ds_put_cstr (&s, _("Syntax error"));
1610         }
1611     }
1612
1613   if (format)
1614     {
1615       ds_put_cstr (&s, ": ");
1616       ds_put_vformat (&s, format, args);
1617     }
1618   if (ds_last (&s) != '.')
1619     ds_put_byte (&s, '.');
1620
1621   struct msg *m = xmalloc (sizeof *m);
1622   *m = (struct msg) {
1623     .category = MSG_C_SYNTAX,
1624     .severity = MSG_S_ERROR,
1625     .location = lex_source_get_location (src, n0, n1),
1626     .text = ds_steal_cstr (&s),
1627   };
1628   msg_emit (m);
1629 }
1630
1631 static void
1632 lex_get_error (struct lex_source *src, const struct lex_token *token)
1633 {
1634   char syntax[64];
1635   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1636                  syntax, sizeof syntax);
1637
1638   struct string s = DS_EMPTY_INITIALIZER;
1639   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1640   ds_put_format (&s, ": %s", token->token.string.string);
1641
1642   struct msg *m = xmalloc (sizeof *m);
1643   *m = (struct msg) {
1644     .category = MSG_C_SYNTAX,
1645     .severity = MSG_S_ERROR,
1646     .location = lex_token_location_rw (src, token, token),
1647     .text = ds_steal_cstr (&s),
1648   };
1649   msg_emit (m);
1650 }
1651
/* Attempts to append an additional token to 'pp' in SRC, reading more from the
   underlying lex_reader if necessary.  Returns true if a new token was added
   to SRC's 'pp' stage, false otherwise.  The caller should retry failures
   unless SRC's 'eof' marker was set to true indicating that there will be no
   more tokens from this source.

   Exactly one segment is consumed from SRC's buffer per call.  A segment that
   yields no token (TOKENIZE_EMPTY) or a bad token (TOKENIZE_ERROR, which is
   also reported with lex_get_error()) makes the function return false.

   As a side effect, each line of input that is completed by the segment is
   submitted to the output engine as a syntax text item. */
static bool
lex_source_try_get_pp (struct lex_source *src)
{
  /* Allocate a new token and initialize it.  It is appended to SRC's 'pp'
     stage only at the end, if the segment turns out to yield a token. */
  struct lex_token *token = xmalloc (sizeof *token);
  token->token = (struct token) { .type = T_STOP };
  token->macro_rep = NULL;
  token->ref_cnt = NULL;
  token->line_pos = src->line_pos;
  token->token_pos = src->seg_pos;
  /* A reader whose 'line_number' is not positive gets no line numbers on its
     tokens. */
  if (src->reader->line_number > 0)
    token->first_line = src->reader->line_number + src->n_newlines;
  else
    token->first_line = 0;

  /* Extract a segment. */
  const char *segment;
  enum segment_type seg_type;
  int seg_len;
  for (;;)
    {
      /* SEGMENT is recomputed on every iteration because reading more input
         below may grow, and thereby move, SRC's buffer. */
      segment = &src->buffer[src->seg_pos];
      seg_len = segmenter_push (&src->segmenter, segment,
                                src->length - src->seg_pos,
                                src->reader->eof, &seg_type);
      if (seg_len >= 0)
        break;

      /* The segmenter needs more input to produce a segment. */
      assert (!src->reader->eof);
      lex_source_read__ (src);
    }

  /* Update state based on the segment. */
  token->token_len = seg_len;
  src->seg_pos += seg_len;
  if (seg_type == SEG_NEWLINE)
    {
      /* The next line starts just past this new-line. */
      src->line_pos = src->seg_pos;
      src->n_newlines++;
    }

  /* Get a token from the segment. */
  enum tokenize_result result = token_from_segment (
    seg_type, ss_buffer (segment, seg_len), &token->token);

  /* If we've reached the end of a line, or the end of a command, then pass
     the line to the output engine as a syntax text item.

     An end-of-command segment emits its line right away and sets
     'suppress_next_newline', so that the new-line segment that follows does
     not emit the same line a second time. */
  int n_lines = seg_type == SEG_NEWLINE;
  if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
    {
      n_lines++;
      src->suppress_next_newline = true;
    }
  else if (n_lines > 0 && src->suppress_next_newline)
    {
      n_lines--;
      src->suppress_next_newline = false;
    }
  for (int i = 0; i < n_lines; i++)
    {
      /* Beginning of line. */
      const char *line = &src->buffer[src->journal_pos];

      /* Calculate line length, including \n or \r\n end-of-line if present.

         We use src->length even though that may be beyond what we've actually
         converted to tokens (which is only through line_pos).  That's because,
         if we're emitting the line due to SEG_END_COMMAND, we want to take the
         whole line through the newline, not just through the '.'. */
      size_t max_len = src->length - src->journal_pos;
      const char *newline = memchr (line, '\n', max_len);
      size_t line_len = newline ? newline - line + 1 : max_len;

      /* Calculate line length excluding end-of-line. */
      size_t copy_len = line_len;
      if (copy_len > 0 && line[copy_len - 1] == '\n')
        copy_len--;
      if (copy_len > 0 && line[copy_len - 1] == '\r')
        copy_len--;

      /* Submit the line as syntax. */
      output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
                                                   xmemdup0 (line, copy_len),
                                                   NULL));

      src->journal_pos += line_len;
    }

  switch (result)
    {
    case TOKENIZE_ERROR:
      lex_get_error (src, token);
      /* Fall through. */
    case TOKENIZE_EMPTY:
      /* No token to add; the caller may try again. */
      lex_token_destroy (token);
      return false;

    case TOKENIZE_TOKEN:
      if (token->token.type == T_STOP)
        {
          /* End of input: hand out a T_ENDCMD instead of the T_STOP and mark
             SRC as finished. */
          token->token.type = T_ENDCMD;
          src->eof = true;
        }
      lex_stage_push_last (&src->pp, token);
      return true;
    }
  NOT_REACHED ();
}
1766
1767 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1768    failure.  On failure, the end of SRC has been reached and no more tokens
1769    will be forthcoming from it.
1770
1771    Does not make the new token available for lookahead yet; the caller must
1772    adjust SRC's 'middle' pointer to do so. */
1773 static bool
1774 lex_source_get_pp (struct lex_source *src)
1775 {
1776   while (!src->eof)
1777     if (lex_source_try_get_pp (src))
1778       return true;
1779   return false;
1780 }
1781
1782 static bool
1783 lex_source_try_get_merge (const struct lex_source *src_)
1784 {
1785   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1786
1787   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1788     return false;
1789
1790   if (!settings_get_mexpand ())
1791     {
1792       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1793       return true;
1794     }
1795
1796   /* Now pass tokens one-by-one to the macro expander.
1797
1798      In the common case where there is no macro to expand, the loop is not
1799      entered.  */
1800   struct macro_call *mc;
1801   int n_call = macro_call_create (src->lexer->macros,
1802                                   &lex_stage_first (&src->pp)->token, &mc);
1803   for (int ofs = 1; !n_call; ofs++)
1804     {
1805       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1806         {
1807           /* This should not be reachable because we always get a T_ENDCMD at
1808              the end of an input file (transformed from T_STOP by
1809              lex_source_try_get_pp()) and the macro_expander should always
1810              terminate expansion on T_ENDCMD. */
1811           NOT_REACHED ();
1812         }
1813
1814       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1815       size_t start = t->token_pos;
1816       size_t end = t->token_pos + t->token_len;
1817       const struct macro_token mt = {
1818         .token = t->token,
1819         .syntax = ss_buffer (&src->buffer[start], end - start),
1820       };
1821       const struct msg_location loc = lex_token_location (src, t, t);
1822       n_call = macro_call_add (mc, &mt, &loc);
1823     }
1824   if (n_call < 0)
1825     {
1826       /* False alarm: no macro expansion after all.  Use first token as
1827          lookahead.  We'll retry macro expansion from the second token next
1828          time around. */
1829       macro_call_destroy (mc);
1830       lex_stage_shift (&src->merge, &src->pp, 1);
1831       return true;
1832     }
1833
1834   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1835      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1836      Expand them.  */
1837   const struct lex_token *c0 = lex_stage_first (&src->pp);
1838   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1839   struct macro_tokens expansion = { .n = 0 };
1840   struct msg_location loc = lex_token_location (src, c0, c1);
1841   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1842   macro_call_destroy (mc);
1843
1844   /* Convert the macro expansion into syntax for possible error messages
1845      later. */
1846   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1847   size_t *len = xnmalloc (expansion.n, sizeof *len);
1848   struct string s = DS_EMPTY_INITIALIZER;
1849   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1850
1851   if (settings_get_mprint ())
1852     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1853                                           _("Macro Expansion")));
1854
1855   /* Append the macro expansion tokens to the lookahead. */
1856   if (expansion.n > 0)
1857     {
1858       char *macro_rep = ds_steal_cstr (&s);
1859       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1860       *ref_cnt = expansion.n;
1861       for (size_t i = 0; i < expansion.n; i++)
1862         {
1863           struct lex_token *token = xmalloc (sizeof *token);
1864           *token = (struct lex_token) {
1865             .token = expansion.mts[i].token,
1866             .token_pos = c0->token_pos,
1867             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1868             .line_pos = c0->line_pos,
1869             .first_line = c0->first_line,
1870             .macro_rep = macro_rep,
1871             .ofs = ofs[i],
1872             .len = len[i],
1873             .ref_cnt = ref_cnt,
1874           };
1875           lex_stage_push_last (&src->merge, token);
1876
1877           ss_dealloc (&expansion.mts[i].syntax);
1878         }
1879     }
1880   else
1881     ds_destroy (&s);
1882   free (expansion.mts);
1883   free (ofs);
1884   free (len);
1885
1886   /* Destroy the tokens for the call. */
1887   for (size_t i = 0; i < n_call; i++)
1888     lex_stage_pop_first (&src->pp);
1889
1890   return expansion.n > 0;
1891 }
1892
1893 /* Attempts to obtain at least one new token into 'merge' in SRC.
1894
1895    Returns true if successful, false on failure.  In the latter case, SRC is
1896    exhausted and 'src->eof' is now true. */
1897 static bool
1898 lex_source_get_merge (struct lex_source *src)
1899 {
1900   while (!src->eof)
1901     if (lex_source_try_get_merge (src))
1902       return true;
1903   return false;
1904 }
1905
1906 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1907
1908    Returns true if successful, false on failure.  In the latter case, SRC is
1909    exhausted and 'src->eof' is now true. */
1910 static bool
1911 lex_source_get_lookahead (struct lex_source *src)
1912 {
1913   struct merger m = MERGER_INIT;
1914   struct token out;
1915   for (size_t i = 0; ; i++)
1916     {
1917       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1918         {
1919           /* We always get a T_ENDCMD at the end of an input file
1920              (transformed from T_STOP by lex_source_try_get_pp()) and
1921              merger_add() should never return -1 on T_ENDCMD. */
1922           assert (lex_stage_is_empty (&src->merge));
1923           return false;
1924         }
1925
1926       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1927                                &out);
1928       if (!retval)
1929         {
1930           lex_stage_shift (&src->lookahead, &src->merge, 1);
1931           return true;
1932         }
1933       else if (retval > 0)
1934         {
1935           /* Add a token that merges all the tokens together. */
1936           const struct lex_token *first = lex_stage_first (&src->merge);
1937           const struct lex_token *last = lex_stage_nth (&src->merge,
1938                                                         retval - 1);
1939           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1940           struct lex_token *t = xmalloc (sizeof *t);
1941           *t = (struct lex_token) {
1942             .token = out,
1943             .token_pos = first->token_pos,
1944             .token_len = (last->token_pos - first->token_pos) + last->token_len,
1945             .line_pos = first->line_pos,
1946             .first_line = first->first_line,
1947
1948             /* This works well if all the tokens were not expanded from macros,
1949                or if they came from the same macro expansion.  It just gives up
1950                in the other (corner) cases. */
1951             .macro_rep = macro ? first->macro_rep : NULL,
1952             .ofs = macro ? first->ofs : 0,
1953             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1954             .ref_cnt = macro ? first->ref_cnt : NULL,
1955           };
1956           if (t->ref_cnt)
1957             ++*t->ref_cnt;
1958           lex_stage_push_last (&src->lookahead, t);
1959
1960           for (int i = 0; i < retval; i++)
1961             lex_stage_pop_first (&src->merge);
1962           return true;
1963         }
1964     }
1965 }
1966 \f
1967 static void
1968 lex_source_push_endcmd__ (struct lex_source *src)
1969 {
1970   assert (lex_stage_is_empty (&src->lookahead));
1971   struct lex_token *token = xmalloc (sizeof *token);
1972   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1973   lex_stage_push_last (&src->lookahead, token);
1974 }
1975
1976 static struct lex_source *
1977 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1978 {
1979   struct lex_source *src = xmalloc (sizeof *src);
1980   *src = (struct lex_source) {
1981     .reader = reader,
1982     .segmenter = segmenter_init (reader->syntax, false),
1983     .lexer = lexer,
1984   };
1985
1986   lex_source_push_endcmd__ (src);
1987
1988   return src;
1989 }
1990
1991 static void
1992 lex_source_destroy (struct lex_source *src)
1993 {
1994   char *file_name = src->reader->file_name;
1995   char *encoding = src->reader->encoding;
1996   if (src->reader->class->destroy != NULL)
1997     src->reader->class->destroy (src->reader);
1998   free (file_name);
1999   free (encoding);
2000   free (src->buffer);
2001   lex_stage_uninit (&src->pp);
2002   lex_stage_uninit (&src->merge);
2003   lex_stage_uninit (&src->lookahead);
2004   ll_remove (&src->ll);
2005   free (src);
2006 }
2007 \f
2008 struct lex_file_reader
2009   {
2010     struct lex_reader reader;
2011     struct u8_istream *istream;
2012   };
2013
2014 static struct lex_reader_class lex_file_reader_class;
2015
2016 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2017    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2018    ENCODING, which should take one of the forms accepted by
2019    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2020    mode of the new reader, respectively.
2021
2022    Returns a null pointer if FILE_NAME cannot be opened. */
2023 struct lex_reader *
2024 lex_reader_for_file (const char *file_name, const char *encoding,
2025                      enum segmenter_mode syntax,
2026                      enum lex_error_mode error)
2027 {
2028   struct lex_file_reader *r;
2029   struct u8_istream *istream;
2030
2031   istream = (!strcmp(file_name, "-")
2032              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2033              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2034   if (istream == NULL)
2035     {
2036       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2037       return NULL;
2038     }
2039
2040   r = xmalloc (sizeof *r);
2041   lex_reader_init (&r->reader, &lex_file_reader_class);
2042   r->reader.syntax = syntax;
2043   r->reader.error = error;
2044   r->reader.file_name = xstrdup (file_name);
2045   r->reader.encoding = xstrdup_if_nonnull (encoding);
2046   r->reader.line_number = 1;
2047   r->istream = istream;
2048
2049   return &r->reader;
2050 }
2051
2052 static struct lex_file_reader *
2053 lex_file_reader_cast (struct lex_reader *r)
2054 {
2055   return UP_CAST (r, struct lex_file_reader, reader);
2056 }
2057
2058 static size_t
2059 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2060                enum prompt_style prompt_style UNUSED)
2061 {
2062   struct lex_file_reader *r = lex_file_reader_cast (r_);
2063   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2064   if (n_read < 0)
2065     {
2066       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2067       return 0;
2068     }
2069   return n_read;
2070 }
2071
2072 static void
2073 lex_file_close (struct lex_reader *r_)
2074 {
2075   struct lex_file_reader *r = lex_file_reader_cast (r_);
2076
2077   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2078     {
2079       if (u8_istream_close (r->istream) != 0)
2080         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2081     }
2082   else
2083     u8_istream_free (r->istream);
2084
2085   free (r);
2086 }
2087
2088 static struct lex_reader_class lex_file_reader_class =
2089   {
2090     lex_file_read,
2091     lex_file_close
2092   };
2093 \f
2094 struct lex_string_reader
2095   {
2096     struct lex_reader reader;
2097     struct substring s;
2098     size_t offset;
2099   };
2100
2101 static struct lex_reader_class lex_string_reader_class;
2102
2103 /* Creates and returns a new lex_reader for the contents of S, which must be
2104    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2105    with ss_dealloc() when it is closed. */
2106 struct lex_reader *
2107 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2108 {
2109   struct lex_string_reader *r;
2110
2111   r = xmalloc (sizeof *r);
2112   lex_reader_init (&r->reader, &lex_string_reader_class);
2113   r->reader.syntax = SEG_MODE_AUTO;
2114   r->reader.encoding = xstrdup_if_nonnull (encoding);
2115   r->s = s;
2116   r->offset = 0;
2117
2118   return &r->reader;
2119 }
2120
2121 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2122    which must be encoded in ENCODING.  The caller retains ownership of S. */
2123 struct lex_reader *
2124 lex_reader_for_string (const char *s, const char *encoding)
2125 {
2126   struct substring ss;
2127   ss_alloc_substring (&ss, ss_cstr (s));
2128   return lex_reader_for_substring_nocopy (ss, encoding);
2129 }
2130
/* Formats FORMAT, with the arguments that follow ENCODING, as a
   printf()-like format string, and creates and returns a new lex_reader for
   the formatted result, which must be encoded in ENCODING.  The reader owns
   the formatted string.

   If the string cannot be formatted, the new reader supplies empty
   syntax. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *syntax = xvasprintf (format, args);
  va_end (args);

  /* xvasprintf() aborts on memory exhaustion, but it returns a null pointer
     for other failures, such as an invalid format string or a character
     conversion error.  ss_cstr() is meant for null-terminated strings, so do
     not hand it a null pointer; fall back to an empty string instead. */
  if (syntax == NULL)
    syntax = xstrdup ("");

  return lex_reader_for_substring_nocopy (ss_cstr (syntax), encoding);
}
2145
2146 static struct lex_string_reader *
2147 lex_string_reader_cast (struct lex_reader *r)
2148 {
2149   return UP_CAST (r, struct lex_string_reader, reader);
2150 }
2151
2152 static size_t
2153 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2154                  enum prompt_style prompt_style UNUSED)
2155 {
2156   struct lex_string_reader *r = lex_string_reader_cast (r_);
2157   size_t chunk;
2158
2159   chunk = MIN (n, r->s.length - r->offset);
2160   memcpy (buf, r->s.string + r->offset, chunk);
2161   r->offset += chunk;
2162
2163   return chunk;
2164 }
2165
2166 static void
2167 lex_string_close (struct lex_reader *r_)
2168 {
2169   struct lex_string_reader *r = lex_string_reader_cast (r_);
2170
2171   ss_dealloc (&r->s);
2172   free (r);
2173 }
2174
2175 static struct lex_reader_class lex_string_reader_class =
2176   {
2177     lex_string_read,
2178     lex_string_close
2179   };