pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/macro.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call.

       src->tail <= line_pos <= token_pos <= src->head. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' may be shared among several lex_tokens:
       lex_token_destroy() decrements *ref_cnt and frees both 'macro_rep' and
       'ref_cnt' once the count drops to zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  86
  87 static void
  88 lex_token_destroy (struct lex_token *t)
  89 {
  90   token_uninit (&t->token);
  91   if (t->ref_cnt)
  92     {
  93       assert (*t->ref_cnt > 0);
  94       if (!--*t->ref_cnt)
  95         {
  96           free (t->macro_rep);
  97           free (t->ref_cnt);
  98         }
  99     }
 100   free (t);
 101 }
 102 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source. */
struct lex_stage
  {
    struct deque deque;         /* Deque bookkeeping for 'tokens'. */
    struct lex_token **tokens;  /* Token pointers, indexed via 'deque'. */
  };
 110
/* Forward declarations for the lex_stage operations defined below. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

static struct lex_token *lex_stage_last (struct lex_stage *);
static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
 126
 127 /* Deletes all the tokens from STAGE. */
 128 static void
 129 lex_stage_clear (struct lex_stage *stage)
 130 {
 131   while (!deque_is_empty (&stage->deque))
 132     lex_stage_pop_first (stage);
 133 }
 134
 135 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 136 static void
 137 lex_stage_uninit (struct lex_stage *stage)
 138 {
 139   lex_stage_clear (stage);
 140   free (stage->tokens);
 141 }
 142
 143 /* Returns true if STAGE contains no tokens, otherwise false. */
 144 static bool
 145 lex_stage_is_empty (const struct lex_stage *stage)
 146 {
 147   return deque_is_empty (&stage->deque);
 148 }
 149
 150 /* Returns the number of tokens in STAGE. */
 151 static size_t
 152 lex_stage_count (const struct lex_stage *stage)
 153 {
 154   return deque_count (&stage->deque);
 155 }
 156
 157 /* Returns the last token in STAGE, which must be nonempty.  The last token is
 158    the one accessed with the greatest lookahead. */
 159 static struct lex_token *
 160 lex_stage_last (struct lex_stage *stage)
 161 {
 162   return stage->tokens[deque_front (&stage->deque, 0)];
 163 }
 164
 165 /* Returns the first token in STAGE, which must be nonempty.
 166    The first token is the one accessed with the least lookahead. */
 167 static struct lex_token *
 168 lex_stage_first (struct lex_stage *stage)
 169 {
 170   return lex_stage_nth (stage, 0);
 171 }
 172
 173 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 174    lookahead) is 0, the second token is 1, and so on.  There must be at least
 175    INDEX + 1 tokens in STAGE. */
 176 static struct lex_token *
 177 lex_stage_nth (struct lex_stage *stage, size_t index)
 178 {
 179   return stage->tokens[deque_back (&stage->deque, index)];
 180 }
 181
 182 /* Adds TOKEN so that it becomes the last token in STAGE. */
 183 static void
 184 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 185 {
 186   if (deque_is_full (&stage->deque))
 187     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 188                                   sizeof *stage->tokens);
 189   stage->tokens[deque_push_front (&stage->deque)] = token;
 190 }
 191
 192 /* Removes the first token from STAGE and uninitializes it. */
 193 static void
 194 lex_stage_pop_first (struct lex_stage *stage)
 195 {
 196   lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
 197 }
 198
 199 /* Removes the first N tokens from SRC, appending them to DST as the last
 200    tokens. */
 201 static void
 202 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 203 {
 204   for (size_t i = 0; i < n; i++)
 205     {
 206       lex_stage_push_last (dst, lex_stage_first (src));
 207       deque_pop_back (&src->deque);
 208     }
 209 }
 210
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct lexer *lexer;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'lookahead'.

       - lookahead: Tokens available to the client for parsing. */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_stage lookahead;
  };
 256
/* Creation and destruction of lex_sources. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 260
/* Lexer: a list of token sources plus the set of macros defined for it. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Created in lex_create(), destroyed in
                                   lex_destroy(). */
  };
 267
/* Forward declarations of internal helpers used throughout this file. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static bool lex_source_get_lookahead (struct lex_source *);
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 280 \f
 281 /* Initializes READER with the specified CLASS and otherwise some reasonable
 282    defaults.  The caller should fill in the others members as desired. */
 283 void
 284 lex_reader_init (struct lex_reader *reader,
 285                  const struct lex_reader_class *class)
 286 {
 287   reader->class = class;
 288   reader->syntax = SEG_MODE_AUTO;
 289   reader->error = LEX_ERROR_CONTINUE;
 290   reader->file_name = NULL;
 291   reader->encoding = NULL;
 292   reader->line_number = 0;
 293   reader->eof = false;
 294 }
 295
 296 /* Frees any file name already in READER and replaces it by a copy of
 297    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 298 void
 299 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 300 {
 301   free (reader->file_name);
 302   reader->file_name = xstrdup_if_nonnull (file_name);
 303 }
 304 \f
 305 /* Creates and returns a new lexer. */
 306 struct lexer *
 307 lex_create (void)
 308 {
 309   struct lexer *lexer = xmalloc (sizeof *lexer);
 310   *lexer = (struct lexer) {
 311     .sources = LL_INITIALIZER (lexer->sources),
 312     .macros = macro_set_create (),
 313   };
 314   return lexer;
 315 }
 316
 317 /* Destroys LEXER. */
 318 void
 319 lex_destroy (struct lexer *lexer)
 320 {
 321   if (lexer != NULL)
 322     {
 323       struct lex_source *source, *next;
 324
 325       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 326         lex_source_destroy (source);
 327       macro_set_destroy (lexer->macros);
 328       free (lexer);
 329     }
 330 }
 331
 332 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 333    same name.  Takes ownership of M. */
 334 void
 335 lex_define_macro (struct lexer *lexer, struct macro *m)
 336 {
 337   macro_set_add (lexer->macros, m);
 338 }
 339
 340 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 341    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 342    token. */
 343 void
 344 lex_include (struct lexer *lexer, struct lex_reader *reader)
 345 {
 346   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 347   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 348 }
 349
 350 /* Appends READER to LEXER, so that it will be read after all other current
 351    readers have already been read. */
 352 void
 353 lex_append (struct lexer *lexer, struct lex_reader *reader)
 354 {
 355   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 356 }
 357 \f
 358 /* Advancing. */
 359
 360 /* Advances LEXER to the next token, consuming the current token. */
 361 void
 362 lex_get (struct lexer *lexer)
 363 {
 364   struct lex_source *src;
 365
 366   src = lex_source__ (lexer);
 367   if (src == NULL)
 368     return;
 369
 370   if (!lex_stage_is_empty (&src->lookahead))
 371     lex_stage_pop_first (&src->lookahead);
 372
 373   while (lex_stage_is_empty (&src->lookahead))
 374     if (!lex_source_get_lookahead (src))
 375       {
 376         lex_source_destroy (src);
 377         src = lex_source__ (lexer);
 378         if (src == NULL)
 379           return;
 380       }
 381 }
 382
 383 /* Advances LEXER by N tokens. */
 384 void
 385 lex_get_n (struct lexer *lexer, size_t n)
 386 {
 387   while (n-- > 0)
 388     lex_get (lexer);
 389 }
 390 \f
 391 /* Issuing errors. */
 392
 393 /* Prints a syntax error message containing the current token and
 394    given message MESSAGE (if non-null). */
 395 void
 396 lex_error (struct lexer *lexer, const char *format, ...)
 397 {
 398   va_list args;
 399
 400   va_start (args, format);
 401   lex_next_error_valist (lexer, 0, 0, format, args);
 402   va_end (args);
 403 }
 404
 405 /* Prints a syntax error message containing the current token and
 406    given message MESSAGE (if non-null). */
 407 void
 408 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 409 {
 410   lex_next_error_valist (lexer, 0, 0, format, args);
 411 }
 412
 413 /* Prints a syntax error message containing the current token and
 414    given message MESSAGE (if non-null). */
 415 void
 416 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 417 {
 418   va_list args;
 419
 420   va_start (args, format);
 421   lex_next_error_valist (lexer, n0, n1, format, args);
 422   va_end (args);
 423 }
 424
 425 /* Prints a syntax error message saying that one of the strings provided as
 426    varargs, up to the first NULL, is expected. */
 427 void
 428 (lex_error_expecting) (struct lexer *lexer, ...)
 429 {
 430   va_list args;
 431
 432   va_start (args, lexer);
 433   lex_error_expecting_valist (lexer, args);
 434   va_end (args);
 435 }
 436
 437 /* Prints a syntax error message saying that one of the options provided in
 438    ARGS, up to the first NULL, is expected. */
 439 void
 440 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 441 {
 442   enum { MAX_OPTIONS = 9 };
 443   const char *options[MAX_OPTIONS];
 444   int n = 0;
 445   while (n < MAX_OPTIONS)
 446     {
 447       const char *option = va_arg (args, const char *);
 448       if (!option)
 449         break;
 450
 451       options[n++] = option;
 452     }
 453   lex_error_expecting_array (lexer, options, n);
 454 }
 455
 456 void
 457 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 458 {
 459   switch (n)
 460     {
 461     case 0:
 462       lex_error (lexer, NULL);
 463       break;
 464
 465     case 1:
 466       lex_error (lexer, _("expecting %s"), options[0]);
 467       break;
 468
 469     case 2:
 470       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 471       break;
 472
 473     case 3:
 474       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 475                  options[2]);
 476       break;
 477
 478     case 4:
 479       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 480                  options[0], options[1], options[2], options[3]);
 481       break;
 482
 483     case 5:
 484       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 485                  options[0], options[1], options[2], options[3], options[4]);
 486       break;
 487
 488     case 6:
 489       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 490                  options[0], options[1], options[2], options[3], options[4],
 491                  options[5]);
 492       break;
 493
 494     case 7:
 495       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 496                  options[0], options[1], options[2], options[3], options[4],
 497                  options[5], options[6]);
 498       break;
 499
 500     case 8:
 501       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 502                  options[0], options[1], options[2], options[3], options[4],
 503                  options[5], options[6], options[7]);
 504       break;
 505
 506     default:
 507       lex_error (lexer, NULL);
 508     }
 509 }
 510
 511 /* Reports an error to the effect that subcommand SBC may only be specified
 512    once.
 513
 514    This function does not take a lexer as an argument or use lex_error(),
 515    because the result would ordinarily just be redundant: "Syntax error at
 516    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 517    not help the user find the error. */
 518 void
 519 lex_sbc_only_once (const char *sbc)
 520 {
 521   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 522 }
 523
 524 /* Reports an error to the effect that subcommand SBC is missing.
 525
 526    This function does not take a lexer as an argument or use lex_error(),
 527    because a missing subcommand can normally be detected only after the whole
 528    command has been parsed, and so lex_error() would always report "Syntax
 529    error at end of command", which does not help the user find the error. */
 530 void
 531 lex_sbc_missing (const char *sbc)
 532 {
 533   msg (SE, _("Required subcommand %s was not specified."), sbc);
 534 }
 535
 536 /* Reports an error to the effect that specification SPEC may only be specified
 537    once within subcommand SBC. */
 538 void
 539 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 540 {
 541   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 542              spec, sbc);
 543 }
 544
 545 /* Reports an error to the effect that specification SPEC is missing within
 546    subcommand SBC. */
 547 void
 548 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 549 {
 550   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 551              sbc, spec);
 552 }
 553
 554 /* Prints a syntax error message containing the current token and
 555    given message MESSAGE (if non-null). */
 556 void
 557 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 558                        const char *format, va_list args)
 559 {
 560   struct lex_source *src = lex_source__ (lexer);
 561
 562   if (src != NULL)
 563     lex_source_error_valist (src, n0, n1, format, args);
 564   else
 565     {
 566       struct string s;
 567
 568       ds_init_empty (&s);
 569       ds_put_format (&s, _("Syntax error at end of input"));
 570       if (format != NULL)
 571         {
 572           ds_put_cstr (&s, ": ");
 573           ds_put_vformat (&s, format, args);
 574         }
 575       if (ds_last (&s) != '.')
 576         ds_put_byte (&s, '.');
 577       msg (SE, "%s", ds_cstr (&s));
 578       ds_destroy (&s);
 579     }
 580 }
 581
 582 /* Checks that we're at end of command.
 583    If so, returns a successful command completion code.
 584    If not, flags a syntax error and returns an error command
 585    completion code. */
 586 int
 587 lex_end_of_command (struct lexer *lexer)
 588 {
 589   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 590     {
 591       lex_error (lexer, _("expecting end of command"));
 592       return CMD_FAILURE;
 593     }
 594   else
 595     return CMD_SUCCESS;
 596 }
 597 \f
 598 /* Token testing functions. */
 599
 600 /* Returns true if the current token is a number. */
 601 bool
 602 lex_is_number (const struct lexer *lexer)
 603 {
 604   return lex_next_is_number (lexer, 0);
 605 }
 606
 607 /* Returns true if the current token is a string. */
 608 bool
 609 lex_is_string (const struct lexer *lexer)
 610 {
 611   return lex_next_is_string (lexer, 0);
 612 }
 613
 614 /* Returns the value of the current token, which must be a
 615    floating point number. */
 616 double
 617 lex_number (const struct lexer *lexer)
 618 {
 619   return lex_next_number (lexer, 0);
 620 }
 621
 622 /* Returns true iff the current token is an integer. */
 623 bool
 624 lex_is_integer (const struct lexer *lexer)
 625 {
 626   return lex_next_is_integer (lexer, 0);
 627 }
 628
 629 /* Returns the value of the current token, which must be an
 630    integer. */
 631 long
 632 lex_integer (const struct lexer *lexer)
 633 {
 634   return lex_next_integer (lexer, 0);
 635 }
 636 \f
 637 /* Token testing functions with lookahead.
 638
 639    A value of 0 for N as an argument to any of these functions refers to the
 640    current token.  Lookahead is limited to the current command.  Any N greater
 641    than the number of tokens remaining in the current command will be treated
 642    as referring to a T_ENDCMD token. */
 643
 644 /* Returns true if the token N ahead of the current token is a number. */
 645 bool
 646 lex_next_is_number (const struct lexer *lexer, int n)
 647 {
 648   return token_is_number (lex_next (lexer, n));
 649 }
 650
 651 /* Returns true if the token N ahead of the current token is a string. */
 652 bool
 653 lex_next_is_string (const struct lexer *lexer, int n)
 654 {
 655   return token_is_string (lex_next (lexer, n));
 656 }
 657
 658 /* Returns the value of the token N ahead of the current token, which must be a
 659    floating point number. */
 660 double
 661 lex_next_number (const struct lexer *lexer, int n)
 662 {
 663   return token_number (lex_next (lexer, n));
 664 }
 665
 666 /* Returns true if the token N ahead of the current token is an integer. */
 667 bool
 668 lex_next_is_integer (const struct lexer *lexer, int n)
 669 {
 670   return token_is_integer (lex_next (lexer, n));
 671 }
 672
 673 /* Returns the value of the token N ahead of the current token, which must be
 674    an integer. */
 675 long
 676 lex_next_integer (const struct lexer *lexer, int n)
 677 {
 678   return token_integer (lex_next (lexer, n));
 679 }
 680 \f
 681 /* Token matching functions. */
 682
 683 /* If the current token has the specified TYPE, skips it and returns true.
 684    Otherwise, returns false. */
 685 bool
 686 lex_match (struct lexer *lexer, enum token_type type)
 687 {
 688   if (lex_token (lexer) == type)
 689     {
 690       lex_get (lexer);
 691       return true;
 692     }
 693   else
 694     return false;
 695 }
 696
 697 /* If the current token matches IDENTIFIER, skips it and returns true.
 698    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 699    returns false.
 700
 701    IDENTIFIER must be an ASCII string. */
 702 bool
 703 lex_match_id (struct lexer *lexer, const char *identifier)
 704 {
 705   return lex_match_id_n (lexer, identifier, 3);
 706 }
 707
 708 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 709    may be abbreviated to its first N letters.  Otherwise, returns false.
 710
 711    IDENTIFIER must be an ASCII string. */
 712 bool
 713 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 714 {
 715   if (lex_token (lexer) == T_ID
 716       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 717     {
 718       lex_get (lexer);
 719       return true;
 720     }
 721   else
 722     return false;
 723 }
 724
 725 /* If the current token is integer X, skips it and returns true.  Otherwise,
 726    returns false. */
 727 bool
 728 lex_match_int (struct lexer *lexer, int x)
 729 {
 730   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 731     {
 732       lex_get (lexer);
 733       return true;
 734     }
 735   else
 736     return false;
 737 }
 738 \f
 739 /* Forced matches. */
 740
 741 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 742    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 743    false.
 744
 745    IDENTIFIER must be an ASCII string. */
 746 bool
 747 lex_force_match_id (struct lexer *lexer, const char *identifier)
 748 {
 749   if (lex_match_id (lexer, identifier))
 750     return true;
 751   else
 752     {
 753       lex_error_expecting (lexer, identifier);
 754       return false;
 755     }
 756 }
 757
 758 /* If the current token has the specified TYPE, skips it and returns true.
 759    Otherwise, reports an error and returns false. */
 760 bool
 761 lex_force_match (struct lexer *lexer, enum token_type type)
 762 {
 763   if (lex_token (lexer) == type)
 764     {
 765       lex_get (lexer);
 766       return true;
 767     }
 768   else
 769     {
 770       const char *type_string = token_type_to_string (type);
 771       if (type_string)
 772         {
 773           char *s = xasprintf ("`%s'", type_string);
 774           lex_error_expecting (lexer, s);
 775           free (s);
 776         }
 777       else
 778         lex_error_expecting (lexer, token_type_to_name (type));
 779
 780       return false;
 781     }
 782 }
 783
 784 /* If the current token is a string, does nothing and returns true.
 785    Otherwise, reports an error and returns false. */
 786 bool
 787 lex_force_string (struct lexer *lexer)
 788 {
 789   if (lex_is_string (lexer))
 790     return true;
 791   else
 792     {
 793       lex_error (lexer, _("expecting string"));
 794       return false;
 795     }
 796 }
 797
 798 /* If the current token is a string or an identifier, does nothing and returns
 799    true.  Otherwise, reports an error and returns false.
 800
 801    This is meant for use in syntactic situations where we want to encourage the
 802    user to supply a quoted string, but for compatibility we also accept
 803    identifiers.  (One example of such a situation is file names.)  Therefore,
 804    the error message issued when the current token is wrong only says that a
 805    string is expected and doesn't mention that an identifier would also be
 806    accepted. */
 807 bool
 808 lex_force_string_or_id (struct lexer *lexer)
 809 {
 810   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 811 }
 812
 813 /* If the current token is an integer, does nothing and returns true.
 814    Otherwise, reports an error and returns false. */
 815 bool
 816 lex_force_int (struct lexer *lexer)
 817 {
 818   if (lex_is_integer (lexer))
 819     return true;
 820   else
 821     {
 822       lex_error (lexer, _("expecting integer"));
 823       return false;
 824     }
 825 }
 826
 827 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 828    nothing and returns true.  Otherwise, reports an error and returns false.
 829    If NAME is nonnull, then it is used in the error message. */
 830 bool
 831 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 832 {
 833   bool is_number = lex_is_number (lexer);
 834   bool is_integer = lex_is_integer (lexer);
 835   bool too_small = (is_integer ? lex_integer (lexer) < min
 836                     : is_number ? lex_number (lexer) < min
 837                     : false);
 838   bool too_big = (is_integer ? lex_integer (lexer) > max
 839                   : is_number ? lex_number (lexer) > max
 840                   : false);
 841   if (is_integer && !too_small && !too_big)
 842     return true;
 843
 844   if (min > max)
 845     {
 846       /* Weird, maybe a bug in the caller.  Just report that we needed an
 847          integer. */
 848       if (name)
 849         lex_error (lexer, _("Integer expected for %s."), name);
 850       else
 851         lex_error (lexer, _("Integer expected."));
 852     }
 853   else if (min == max)
 854     {
 855       if (name)
 856         lex_error (lexer, _("Expected %ld for %s."), min, name);
 857       else
 858         lex_error (lexer, _("Expected %ld."), min);
 859     }
 860   else if (min + 1 == max)
 861     {
 862       if (name)
 863         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 864       else
 865         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 866     }
 867   else
 868     {
 869       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 870       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 871
 872       if (report_lower_bound && report_upper_bound)
 873         {
 874           if (name)
 875             lex_error (lexer,
 876                        _("Expected integer between %ld and %ld for %s."),
 877                        min, max, name);
 878           else
 879             lex_error (lexer, _("Expected integer between %ld and %ld."),
 880                        min, max);
 881         }
 882       else if (report_lower_bound)
 883         {
 884           if (min == 0)
 885             {
 886               if (name)
 887                 lex_error (lexer, _("Expected non-negative integer for %s."),
 888                            name);
 889               else
 890                 lex_error (lexer, _("Expected non-negative integer."));
 891             }
 892           else if (min == 1)
 893             {
 894               if (name)
 895                 lex_error (lexer, _("Expected positive integer for %s."),
 896                            name);
 897               else
 898                 lex_error (lexer, _("Expected positive integer."));
 899             }
 900           else
 901             {
 902               if (name)
 903                 lex_error (lexer, _("Expected integer %ld or greater for %s."),
 904                            min, name);
 905               else
 906                 lex_error (lexer, _("Expected integer %ld or greater."), min);
 907             }
 908         }
 909       else if (report_upper_bound)
 910         {
 911           if (name)
 912             lex_error (lexer,
 913                        _("Expected integer less than or equal to %ld for %s."),
 914                        max, name);
 915           else
 916             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 917                        max);
 918         }
 919       else
 920         {
 921           if (name)
 922             lex_error (lexer, _("Integer expected for %s."), name);
 923           else
 924             lex_error (lexer, _("Integer expected."));
 925         }
 926     }
 927   return false;
 928 }
 929
 930 /* If the current token is a number, does nothing and returns true.
 931    Otherwise, reports an error and returns false. */
 932 bool
 933 lex_force_num (struct lexer *lexer)
 934 {
 935   if (lex_is_number (lexer))
 936     return true;
 937
 938   lex_error (lexer, _("expecting number"));
 939   return false;
 940 }
 941
 942 /* If the current token is an identifier, does nothing and returns true.
 943    Otherwise, reports an error and returns false. */
 944 bool
 945 lex_force_id (struct lexer *lexer)
 946 {
 947   if (lex_token (lexer) == T_ID)
 948     return true;
 949
 950   lex_error (lexer, _("expecting identifier"));
 951   return false;
 952 }
 953 \f
 954 /* Token accessors. */
 955
 956 /* Returns the type of LEXER's current token. */
 957 enum token_type
 958 lex_token (const struct lexer *lexer)
 959 {
 960   return lex_next_token (lexer, 0);
 961 }
 962
 963 /* Returns the number in LEXER's current token.
 964
 965    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 966    tokens this function will always return zero. */
 967 double
 968 lex_tokval (const struct lexer *lexer)
 969 {
 970   return lex_next_tokval (lexer, 0);
 971 }
 972
 973 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 974
 975    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 976    this functions this function will always return NULL.
 977
 978    The UTF-8 encoding of the returned string is correct for variable names and
 979    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 980    data_in() to use it in a "union value".  */
 981 const char *
 982 lex_tokcstr (const struct lexer *lexer)
 983 {
 984   return lex_next_tokcstr (lexer, 0);
 985 }
 986
 987 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 988    null-terminated (but the null terminator is not included in the returned
 989    substring's 'length').
 990
 991    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 992    this functions this function will always return NULL.
 993
 994    The UTF-8 encoding of the returned string is correct for variable names and
 995    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 996    data_in() to use it in a "union value".  */
 997 struct substring
 998 lex_tokss (const struct lexer *lexer)
 999 {
1000   return lex_next_tokss (lexer, 0);
1001 }
1002 \f
1003 /* Looking ahead.
1004
1005    A value of 0 for N as an argument to any of these functions refers to the
1006    current token.  Lookahead is limited to the current command.  Any N greater
1007    than the number of tokens remaining in the current command will be treated
1008    as referring to a T_ENDCMD token. */
1009
1010 static const struct lex_token *
1011 lex_next__ (const struct lexer *lexer_, int n)
1012 {
1013   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1014   struct lex_source *src = lex_source__ (lexer);
1015
1016   if (src != NULL)
1017     return lex_source_next__ (src, n);
1018   else
1019     {
1020       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1021       return &stop_token;
1022     }
1023 }
1024
1025 static const struct lex_token *
1026 lex_source_next__ (const struct lex_source *src_, int n)
1027 {
1028   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1029   while (lex_stage_count (&src->lookahead) <= n)
1030     {
1031       if (!lex_stage_is_empty (&src->lookahead))
1032         {
1033           const struct lex_token *t = lex_stage_last (&src->lookahead);
1034           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1035             return t;
1036         }
1037
1038       lex_source_get_lookahead (src);
1039     }
1040
1041   return lex_stage_nth (&src->lookahead, n);
1042 }
1043
1044 /* Returns the "struct token" of the token N after the current one in LEXER.
1045    The returned pointer can be invalidated by pretty much any succeeding call
1046    into the lexer, although the string pointer within the returned token is
1047    only invalidated by consuming the token (e.g. with lex_get()). */
1048 const struct token *
1049 lex_next (const struct lexer *lexer, int n)
1050 {
1051   return &lex_next__ (lexer, n)->token;
1052 }
1053
1054 /* Returns the type of the token N after the current one in LEXER. */
1055 enum token_type
1056 lex_next_token (const struct lexer *lexer, int n)
1057 {
1058   return lex_next (lexer, n)->type;
1059 }
1060
/* Returns the numeric value of the token N positions after the current one in
   LEXER.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other kind of token the result is always zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1070
1071 /* Returns the null-terminated string in the token N after the current one, in
1072    UTF-8 encoding.
1073
1074    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1075    this functions this function will always return NULL.
1076
1077    The UTF-8 encoding of the returned string is correct for variable names and
1078    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1079    data_in() to use it in a "union value".  */
1080 const char *
1081 lex_next_tokcstr (const struct lexer *lexer, int n)
1082 {
1083   return lex_next_tokss (lexer, n).string;
1084 }
1085
1086 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1087    The string is null-terminated (but the null terminator is not included in
1088    the returned substring's 'length').
1089
1090    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1091    tokens this functions this function will always return NULL.
1092
1093    The UTF-8 encoding of the returned string is correct for variable names and
1094    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1095    data_in() to use it in a "union value".  */
1096 struct substring
1097 lex_next_tokss (const struct lexer *lexer, int n)
1098 {
1099   return lex_next (lexer, n)->string;
1100 }
1101
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, returns a newly allocated empty string, so
   that the caller's obligation to free the result is the same in every
   case. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  /* lex_source__() returns NULL when the lexer has run out of sources, and
     lex_source_get_syntax__() dereferences its source argument, so the
     no-source case must be handled here. */
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_syntax__ (src, n0, n1) : xstrdup ("");
}
1114
1115 /* Returns true if the token N ahead of the current one was produced by macro
1116    expansion, false otherwise. */
1117 bool
1118 lex_next_is_from_macro (const struct lexer *lexer, int n)
1119 {
1120   return lex_next__ (lexer, n)->macro_rep != NULL;
1121 }
1122
1123 static bool
1124 lex_tokens_match (const struct token *actual, const struct token *expected)
1125 {
1126   if (actual->type != expected->type)
1127     return false;
1128
1129   switch (actual->type)
1130     {
1131     case T_POS_NUM:
1132     case T_NEG_NUM:
1133       return actual->number == expected->number;
1134
1135     case T_ID:
1136       return lex_id_match (expected->string, actual->string);
1137
1138     case T_STRING:
1139       return (actual->string.length == expected->string.length
1140               && !memcmp (actual->string.string, expected->string.string,
1141                           actual->string.length));
1142
1143     default:
1144       return true;
1145     }
1146 }
1147
1148 static size_t
1149 lex_at_phrase__ (struct lexer *lexer, const char *s)
1150 {
1151   struct string_lexer slex;
1152   struct token token;
1153
1154   size_t i = 0;
1155   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1156   while (string_lexer_next (&slex, &token))
1157     {
1158       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1159       token_uninit (&token);
1160       if (!match)
1161         return 0;
1162     }
1163   return i;
1164 }
1165
/* Tests whether LEXER is positioned at the sequence of tokens that S parses
   into, without consuming anything.  Returns true if so, false otherwise.

   S may be any sequence of tokens, e.g. "KRUSKAL-WALLIS", "2SLS", or
   "END INPUT PROGRAM".  Identifiers may be abbreviated to their first three
   letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens != 0;
}
1177
/* Tests whether LEXER is positioned at the sequence of tokens that S parses
   into.  If so, consumes those tokens and returns true.  Otherwise leaves
   LEXER alone and returns false.

   S may be any sequence of tokens, e.g. "KRUSKAL-WALLIS", "2SLS", or
   "END INPUT PROGRAM".  Identifiers may be abbreviated to their first three
   letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1192
/* Returns the number of '\n' bytes among the LENGTH bytes starting at S. */
static int
count_newlines (char *s, size_t length)
{
  int total = 0;
  for (size_t i = 0; i < length; i++)
    if (s[i] == '\n')
      total++;
  return total;
}
1208
1209 static int
1210 lex_token_get_last_line_number (const struct lex_source *src,
1211                                 const struct lex_token *token)
1212 {
1213   if (token->first_line == 0)
1214     return 0;
1215   else
1216     {
1217       char *token_str = &src->buffer[token->token_pos - src->tail];
1218       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1219     }
1220 }
1221
1222 static int
1223 count_columns (const char *s_, size_t length)
1224 {
1225   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1226   int columns;
1227   size_t ofs;
1228   int mblen;
1229
1230   columns = 0;
1231   for (ofs = 0; ofs < length; ofs += mblen)
1232     {
1233       ucs4_t uc;
1234
1235       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1236       if (uc != '\t')
1237         {
1238           int width = uc_width (uc, "UTF-8");
1239           if (width > 0)
1240             columns += width;
1241         }
1242       else
1243         columns = ROUND_UP (columns + 1, 8);
1244     }
1245
1246   return columns + 1;
1247 }
1248
1249 static int
1250 lex_token_get_first_column (const struct lex_source *src,
1251                             const struct lex_token *token)
1252 {
1253   return count_columns (&src->buffer[token->line_pos - src->tail],
1254                         token->token_pos - token->line_pos);
1255 }
1256
1257 static int
1258 lex_token_get_last_column (const struct lex_source *src,
1259                            const struct lex_token *token)
1260 {
1261   char *start, *end, *newline;
1262
1263   start = &src->buffer[token->line_pos - src->tail];
1264   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1265   newline = memrchr (start, '\n', end - start);
1266   if (newline != NULL)
1267     start = newline + 1;
1268   return count_columns (start, end - start);
1269 }
1270
1271 static struct msg_location
1272 lex_token_location (const struct lex_source *src,
1273                     const struct lex_token *t0,
1274                     const struct lex_token *t1)
1275 {
1276   return (struct msg_location) {
1277     .file_name = src->reader->file_name,
1278     .first_line = t0->first_line,
1279     .last_line = lex_token_get_last_line_number (src, t1),
1280     .first_column = lex_token_get_first_column (src, t0),
1281     .last_column = lex_token_get_last_column (src, t1),
1282   };
1283 }
1284
1285 static struct msg_location *
1286 lex_token_location_rw (const struct lex_source *src,
1287                        const struct lex_token *t0,
1288                        const struct lex_token *t1)
1289 {
1290   struct msg_location location = lex_token_location (src, t0, t1);
1291   return msg_location_dup (&location);
1292 }
1293
/* Returns the location, as produced by lex_token_location_rw(), of the syntax
   in SRC from the token N0 ahead of the current one through the token N1
   ahead of the current one. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1301
1302 /* Returns the 1-based line number of the start of the syntax that represents
1303    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1304    if the token is drawn from a source that does not have line numbers. */
1305 int
1306 lex_get_first_line_number (const struct lexer *lexer, int n)
1307 {
1308   const struct lex_source *src = lex_source__ (lexer);
1309   return src ? lex_source_next__ (src, n)->first_line : 0;
1310 }
1311
/* Returns the 1-based line number on which the syntax for the token N after
   the current one in LEXER ends, plus 1.  Returns 0 if LEXER has no current
   source or if the token does not carry a line number.

   A token usually sits entirely on one line of syntax.  The two exceptions
   are a T_STRING token assembled from segments on adjacent lines joined with
   "+" punctuators, and a T_NEG_NUM token whose "-" is on one line and whose
   number is on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1329
/* Returns the 1-based column number at which the syntax for the token N after
   the current one in LEXER begins.  Returns 0 if LEXER has no current source.

   Columns are counted by displayed width in a typical fixed-width font, so
   that CJK characters count as 2 columns and combining characters as 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1343
/* Returns the 1-based column number at which the syntax for the token N after
   the current one in LEXER ends, plus 1.  Returns 0 if LEXER has no current
   source.

   Columns are counted by displayed width in a typical fixed-width font, so
   that CJK characters count as 2 columns and combining characters as 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1357
1358 /* Returns the name of the syntax file from which the current command is drawn.
1359    Returns NULL for a T_STOP token or if the command's source does not have
1360    line numbers.
1361
1362    There is no version of this function that takes an N argument because
1363    lookahead only works to the end of a command and any given command is always
1364    within a single syntax file. */
1365 const char *
1366 lex_get_file_name (const struct lexer *lexer)
1367 {
1368   struct lex_source *src = lex_source__ (lexer);
1369   return src == NULL ? NULL : src->reader->file_name;
1370 }
1371
1372 /* Returns a newly allocated msg_location for the syntax that represents tokens
1373    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1374    must eventually free the location (with msg_location_destroy()). */
1375 struct msg_location *
1376 lex_get_location (const struct lexer *lexer, int n0, int n1)
1377 {
1378   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1379   loc->first_column = lex_get_first_column (lexer, n0);
1380   loc->last_column = lex_get_last_column (lexer, n1);
1381   return loc;
1382 }
1383
1384 /* Returns a newly allocated msg_location for the syntax that represents tokens
1385    with 0-based offsets N0...N1, inclusive, from the current token.  The
1386    location only covers the tokens' lines, not the columns.  The caller must
1387    eventually free the location (with msg_location_destroy()). */
1388 struct msg_location *
1389 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1390 {
1391   struct msg_location *loc = xmalloc (sizeof *loc);
1392   *loc = (struct msg_location) {
1393     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1394     .first_line = lex_get_first_line_number (lexer, n0),
1395     .last_line = lex_get_last_line_number (lexer, n1),
1396   };
1397   return loc;
1398 }
1399
1400 const char *
1401 lex_get_encoding (const struct lexer *lexer)
1402 {
1403   struct lex_source *src = lex_source__ (lexer);
1404   return src == NULL ? NULL : src->reader->encoding;
1405 }
1406
1407 /* Returns the syntax mode for the syntax file from which the current drawn is
1408    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1409    does not have line numbers.
1410
1411    There is no version of this function that takes an N argument because
1412    lookahead only works to the end of a command and any given command is always
1413    within a single syntax file. */
1414 enum segmenter_mode
1415 lex_get_syntax_mode (const struct lexer *lexer)
1416 {
1417   struct lex_source *src = lex_source__ (lexer);
1418   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1419 }
1420
1421 /* Returns the error mode for the syntax file from which the current drawn is
1422    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1423    source does not have line numbers.
1424
1425    There is no version of this function that takes an N argument because
1426    lookahead only works to the end of a command and any given command is always
1427    within a single syntax file. */
1428 enum lex_error_mode
1429 lex_get_error_mode (const struct lexer *lexer)
1430 {
1431   struct lex_source *src = lex_source__ (lexer);
1432   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1433 }
1434
1435 /* If the source that LEXER is currently reading has error mode
1436    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1437    token to be read comes directly from whatever is next read from the stream.
1438
1439    It makes sense to call this function after encountering an error in a
1440    command entered on the console, because usually the user would prefer not to
1441    have cascading errors. */
1442 void
1443 lex_interactive_reset (struct lexer *lexer)
1444 {
1445   struct lex_source *src = lex_source__ (lexer);
1446   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1447     {
1448       src->head = src->tail = 0;
1449       src->journal_pos = src->seg_pos = src->line_pos = 0;
1450       src->n_newlines = 0;
1451       src->suppress_next_newline = false;
1452       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1453                                        false);
1454       lex_stage_clear (&src->pp);
1455       lex_stage_clear (&src->merge);
1456       lex_stage_clear (&src->lookahead);
1457       lex_source_push_endcmd__ (src);
1458     }
1459 }
1460
1461 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1462 void
1463 lex_discard_rest_of_command (struct lexer *lexer)
1464 {
1465   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1466     lex_get (lexer);
1467 }
1468
1469 /* Discards all lookahead tokens in LEXER, then discards all input sources
1470    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1471    runs out of input sources. */
1472 void
1473 lex_discard_noninteractive (struct lexer *lexer)
1474 {
1475   struct lex_source *src = lex_source__ (lexer);
1476
1477   if (src != NULL)
1478     {
1479       lex_stage_clear (&src->pp);
1480       lex_stage_clear (&src->merge);
1481       lex_stage_clear (&src->lookahead);
1482
1483       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1484            src = lex_source__ (lexer))
1485         lex_source_destroy (src);
1486     }
1487 }
1488 \f
1489 static size_t
1490 lex_source_max_tail__ (const struct lex_source *src_)
1491 {
1492   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1493
1494   assert (src->seg_pos >= src->line_pos);
1495   size_t max_tail = MIN (src->journal_pos, src->line_pos);
1496
1497   /* Use the oldest token also. */
1498   struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp };
1499   for (size_t i = 0; i < sizeof stages / sizeof *stages; i++)
1500     if (!lex_stage_is_empty (stages[i]))
1501       {
1502         struct lex_token *first = lex_stage_first (stages[i]);
1503         assert (first->token_pos >= first->line_pos);
1504         return MIN (max_tail, first->line_pos);
1505       }
1506
1507   return max_tail;
1508 }
1509
1510 static void
1511 lex_source_expand__ (struct lex_source *src)
1512 {
1513   if (src->head - src->tail >= src->allocated)
1514     {
1515       size_t max_tail = lex_source_max_tail__ (src);
1516       if (max_tail > src->tail)
1517         {
1518           /* Advance the tail, freeing up room at the head. */
1519           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1520                    src->head - max_tail);
1521           src->tail = max_tail;
1522         }
1523       else
1524         {
1525           /* Buffer is completely full.  Expand it. */
1526           src->buffer = x2realloc (src->buffer, &src->allocated);
1527         }
1528     }
1529   else
1530     {
1531       /* There's space available at the head of the buffer.  Nothing to do. */
1532     }
1533 }
1534
1535 static void
1536 lex_source_read__ (struct lex_source *src)
1537 {
1538   do
1539     {
1540       lex_source_expand__ (src);
1541
1542       size_t head_ofs = src->head - src->tail;
1543       size_t space = src->allocated - head_ofs;
1544       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1545       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1546                                            space, prompt);
1547       assert (n <= space);
1548
1549       if (n == 0)
1550         {
1551           /* End of input. */
1552           src->reader->eof = true;
1553           lex_source_expand__ (src);
1554           return;
1555         }
1556
1557       src->head += n;
1558     }
1559   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1560                   src->head - src->seg_pos));
1561 }
1562
1563 static struct lex_source *
1564 lex_source__ (const struct lexer *lexer)
1565 {
1566   return (ll_is_empty (&lexer->sources) ? NULL
1567           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1568 }
1569
1570 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1571    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1572    and N1 are both zero, this requests the syntax for the current token.)  The
1573    caller must eventually free the returned string (with free()).  The syntax
1574    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1575    for example, it may include comments, spaces, and new-lines if it spans
1576    multiple tokens.  Macro expansion, however, has already been performed. */
1577 static char *
1578 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1579 {
1580   struct string s = DS_EMPTY_INITIALIZER;
1581   for (size_t i = n0; i <= n1; )
1582     {
1583       /* Find [I,J) as the longest sequence of tokens not produced by macro
1584          expansion, or otherwise the longest sequence expanded from a single
1585          macro call. */
1586       const struct lex_token *first = lex_source_next__ (src, i);
1587       size_t j;
1588       for (j = i + 1; j <= n1; j++)
1589         {
1590           const struct lex_token *cur = lex_source_next__ (src, j);
1591           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1592               || first->macro_rep != cur->macro_rep)
1593             break;
1594         }
1595       const struct lex_token *last = lex_source_next__ (src, j - 1);
1596
1597       /* Now add the syntax for this sequence of tokens to SRC. */
1598       if (!ds_is_empty (&s))
1599         ds_put_byte (&s, ' ');
1600       if (!first->macro_rep)
1601         {
1602           size_t start = first->token_pos;
1603           size_t end = last->token_pos + last->token_len;
1604           ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1605                                            end - start));
1606         }
1607       else
1608         {
1609           size_t start = first->ofs;
1610           size_t end = last->ofs + last->len;
1611           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1612                                            end - start));
1613         }
1614
1615       i = j;
1616     }
1617   return ds_steal_cstr (&s);
1618 }
1619
1620 static bool
1621 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1622 {
1623   for (size_t i = n0; i <= n1; i++)
1624     if (lex_source_next__ (src, i)->macro_rep)
1625       return true;
1626   return false;
1627 }
1628
1629 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1630    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1631    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1632    the original form supplied to the lexer so that, for example, it may include
1633    comments, spaces, and new-lines if it spans multiple tokens.
1634
1635    Returns an empty string if the token range doesn't include a macro call.
1636
1637    The caller must not modify or free the returned string. */
1638 static struct substring
1639 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1640 {
1641   if (!lex_source_contains_macro_call (src, n0, n1))
1642     return ss_empty ();
1643
1644   const struct lex_token *token0 = lex_source_next__ (src, n0);
1645   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1646   size_t start = token0->token_pos;
1647   size_t end = token1->token_pos + token1->token_len;
1648
1649   return ss_buffer (&src->buffer[start - src->tail], end - start);
1650 }
1651
1652 static void
1653 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1654                          const char *format, va_list args)
1655 {
1656   const struct lex_token *token;
1657   struct string s;
1658
1659   ds_init_empty (&s);
1660
1661   token = lex_source_next__ (src, n0);
1662   if (token->token.type == T_ENDCMD)
1663     ds_put_cstr (&s, _("Syntax error at end of command"));
1664   else
1665     {
1666       /* Get the syntax that caused the error. */
1667       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1668       char syntax[64];
1669       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1670       free (raw_syntax);
1671
1672       /* Get the macro call(s) that expanded to the syntax that caused the
1673          error. */
1674       char call[64];
1675       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1676                      call, sizeof call);
1677
1678       if (syntax[0])
1679         {
1680           if (call[0])
1681             ds_put_format (&s,
1682                            _("Syntax error at `%s' (in expansion of `%s')"),
1683                            syntax, call);
1684           else
1685             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1686         }
1687       else
1688         {
1689           if (call[0])
1690             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1691                            call);
1692           else
1693             ds_put_cstr (&s, _("Syntax error"));
1694         }
1695     }
1696
1697   if (format)
1698     {
1699       ds_put_cstr (&s, ": ");
1700       ds_put_vformat (&s, format, args);
1701     }
1702   if (ds_last (&s) != '.')
1703     ds_put_byte (&s, '.');
1704
1705   struct msg *m = xmalloc (sizeof *m);
1706   *m = (struct msg) {
1707     .category = MSG_C_SYNTAX,
1708     .severity = MSG_S_ERROR,
1709     .location = lex_source_get_location (src, n0, n1),
1710     .text = ds_steal_cstr (&s),
1711   };
1712   msg_emit (m);
1713 }
1714
1715 static void
1716 lex_get_error (struct lex_source *src, const struct lex_token *token)
1717 {
1718   char syntax[64];
1719   str_ellipsize (ss_buffer (&src->buffer[token->token_pos - src->tail],
1720                             token->token_len),
1721                  syntax, sizeof syntax);
1722
1723   struct string s = DS_EMPTY_INITIALIZER;
1724   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1725   ds_put_format (&s, ": %s", token->token.string.string);
1726
1727   struct msg *m = xmalloc (sizeof *m);
1728   *m = (struct msg) {
1729     .category = MSG_C_SYNTAX,
1730     .severity = MSG_S_ERROR,
1731     .location = lex_token_location_rw (src, token, token),
1732     .text = ds_steal_cstr (&s),
1733   };
1734   msg_emit (m);
1735 }
1736
1737 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1738    underlying lex_reader if necessary.  Returns true if a new token was added
1739    to SRC's deque, false otherwise.  The caller should retry failures unless
1740    SRC's 'eof' marker was set to true indicating that there will be no more
1741    tokens from this source. */
1742 static bool
1743 lex_source_try_get_pp (struct lex_source *src)
1744 {
1745   /* Append a new token to SRC and initialize it. */
1746   struct lex_token *token = xmalloc (sizeof *token);
1747   token->token = (struct token) { .type = T_STOP };
1748   token->macro_rep = NULL;
1749   token->ref_cnt = NULL;
1750   token->line_pos = src->line_pos;
1751   token->token_pos = src->seg_pos;
1752   if (src->reader->line_number > 0)
1753     token->first_line = src->reader->line_number + src->n_newlines;
1754   else
1755     token->first_line = 0;
1756
1757   /* Extract a segment. */
1758   const char *segment;
1759   enum segment_type seg_type;
1760   int seg_len;
1761   for (;;)
1762     {
1763       segment = &src->buffer[src->seg_pos - src->tail];
1764       seg_len = segmenter_push (&src->segmenter, segment,
1765                                 src->head - src->seg_pos,
1766                                 src->reader->eof, &seg_type);
1767       if (seg_len >= 0)
1768         break;
1769
1770       /* The segmenter needs more input to produce a segment. */
1771       assert (!src->reader->eof);
1772       lex_source_read__ (src);
1773     }
1774
1775   /* Update state based on the segment. */
1776   token->token_len = seg_len;
1777   src->seg_pos += seg_len;
1778   if (seg_type == SEG_NEWLINE)
1779     {
1780       src->line_pos = src->seg_pos;
1781       src->n_newlines++;
1782     }
1783
1784   /* Get a token from the segment. */
1785   enum tokenize_result result = token_from_segment (
1786     seg_type, ss_buffer (segment, seg_len), &token->token);
1787
1788   /* If we've reached the end of a line, or the end of a command, then pass
1789      the line to the output engine as a syntax text item.  */
1790   int n_lines = seg_type == SEG_NEWLINE;
1791   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1792     {
1793       n_lines++;
1794       src->suppress_next_newline = true;
1795     }
1796   else if (n_lines > 0 && src->suppress_next_newline)
1797     {
1798       n_lines--;
1799       src->suppress_next_newline = false;
1800     }
1801   for (int i = 0; i < n_lines; i++)
1802     {
1803       /* Beginning of line. */
1804       const char *line = &src->buffer[src->journal_pos - src->tail];
1805
1806       /* Calculate line length, including \n or \r\n end-of-line if present.
1807
1808          We use src->head even though that may be beyond what we've actually
1809          converted to tokens (which is only through line_pos).  That's because,
1810          if we're emitting the line due to SEG_END_COMMAND, we want to take the
1811          whole line through the newline, not just through the '.'. */
1812       size_t max_len = src->head - src->journal_pos;
1813       const char *newline = memchr (line, '\n', max_len);
1814       size_t line_len = newline ? newline - line + 1 : max_len;
1815
1816       /* Calculate line length excluding end-of-line. */
1817       size_t copy_len = line_len;
1818       if (copy_len > 0 && line[copy_len - 1] == '\n')
1819         copy_len--;
1820       if (copy_len > 0 && line[copy_len - 1] == '\r')
1821         copy_len--;
1822
1823       /* Submit the line as syntax. */
1824       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1825                                                    xmemdup0 (line, copy_len),
1826                                                    NULL));
1827
1828       src->journal_pos += line_len;
1829     }
1830
1831   switch (result)
1832     {
1833     case TOKENIZE_ERROR:
1834       lex_get_error (src, token);
1835       /* Fall through. */
1836     case TOKENIZE_EMPTY:
1837       lex_token_destroy (token);
1838       return false;
1839
1840     case TOKENIZE_TOKEN:
1841       if (token->token.type == T_STOP)
1842         {
1843           token->token.type = T_ENDCMD;
1844           src->eof = true;
1845         }
1846       lex_stage_push_last (&src->pp, token);
1847       return true;
1848     }
1849   NOT_REACHED ();
1850 }
1851
1852 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1853    failure.  On failure, the end of SRC has been reached and no more tokens
1854    will be forthcoming from it.
1855
1856    Does not make the new token available for lookahead yet; the caller must
1857    adjust SRC's 'middle' pointer to do so. */
1858 static bool
1859 lex_source_get_pp (struct lex_source *src)
1860 {
1861   while (!src->eof)
1862     if (lex_source_try_get_pp (src))
1863       return true;
1864   return false;
1865 }
1866
1867 static bool
1868 lex_source_try_get_merge (const struct lex_source *src_)
1869 {
1870   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1871
1872   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1873     return false;
1874
1875   if (!settings_get_mexpand ())
1876     {
1877       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1878       return true;
1879     }
1880
1881   /* Now pass tokens one-by-one to the macro expander.
1882
1883      In the common case where there is no macro to expand, the loop is not
1884      entered.  */
1885   struct macro_call *mc;
1886   int n_call = macro_call_create (src->lexer->macros,
1887                                   &lex_stage_first (&src->pp)->token, &mc);
1888   for (int ofs = 1; !n_call; ofs++)
1889     {
1890       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1891         {
1892           /* This should not be reachable because we always get a T_ENDCMD at
1893              the end of an input file (transformed from T_STOP by
1894              lex_source_try_get_pp()) and the macro_expander should always
1895              terminate expansion on T_ENDCMD. */
1896           NOT_REACHED ();
1897         }
1898
1899       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1900       size_t start = t->token_pos;
1901       size_t end = t->token_pos + t->token_len;
1902       const struct macro_token mt = {
1903         .token = t->token,
1904         .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
1905       };
1906       const struct msg_location loc = lex_token_location (src, t, t);
1907       n_call = macro_call_add (mc, &mt, &loc);
1908     }
1909   if (n_call < 0)
1910     {
1911       /* False alarm: no macro expansion after all.  Use first token as
1912          lookahead.  We'll retry macro expansion from the second token next
1913          time around. */
1914       macro_call_destroy (mc);
1915       lex_stage_shift (&src->merge, &src->pp, 1);
1916       return true;
1917     }
1918
1919   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1920      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1921      Expand them.  */
1922   const struct lex_token *c0 = lex_stage_first (&src->pp);
1923   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1924   struct macro_tokens expansion = { .n = 0 };
1925   struct msg_location loc = lex_token_location (src, c0, c1);
1926   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1927   macro_call_destroy (mc);
1928
1929   /* Convert the macro expansion into syntax for possible error messages
1930      later. */
1931   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1932   size_t *len = xnmalloc (expansion.n, sizeof *len);
1933   struct string s = DS_EMPTY_INITIALIZER;
1934   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1935
1936   if (settings_get_mprint ())
1937     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1938                                           _("Macro Expansion")));
1939
1940   /* Append the macro expansion tokens to the lookahead. */
1941   if (expansion.n > 0)
1942     {
1943       char *macro_rep = ds_steal_cstr (&s);
1944       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1945       *ref_cnt = expansion.n;
1946       for (size_t i = 0; i < expansion.n; i++)
1947         {
1948           struct lex_token *token = xmalloc (sizeof *token);
1949           *token = (struct lex_token) {
1950             .token = expansion.mts[i].token,
1951             .token_pos = c0->token_pos,
1952             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1953             .line_pos = c0->line_pos,
1954             .first_line = c0->first_line,
1955             .macro_rep = macro_rep,
1956             .ofs = ofs[i],
1957             .len = len[i],
1958             .ref_cnt = ref_cnt,
1959           };
1960           lex_stage_push_last (&src->merge, token);
1961
1962           ss_dealloc (&expansion.mts[i].syntax);
1963         }
1964     }
1965   else
1966     ds_destroy (&s);
1967   free (expansion.mts);
1968   free (ofs);
1969   free (len);
1970
1971   /* Destroy the tokens for the call. */
1972   for (size_t i = 0; i < n_call; i++)
1973     lex_stage_pop_first (&src->pp);
1974
1975   return expansion.n > 0;
1976 }
1977
1978 /* Attempts to obtain at least one new token into 'merge' in SRC.
1979
1980    Returns true if successful, false on failure.  In the latter case, SRC is
1981    exhausted and 'src->eof' is now true. */
1982 static bool
1983 lex_source_get_merge (struct lex_source *src)
1984 {
1985   while (!src->eof)
1986     if (lex_source_try_get_merge (src))
1987       return true;
1988   return false;
1989 }
1990
1991 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1992
1993    Returns true if successful, false on failure.  In the latter case, SRC is
1994    exhausted and 'src->eof' is now true. */
1995 static bool
1996 lex_source_get_lookahead (struct lex_source *src)
1997 {
1998   struct merger m = MERGER_INIT;
1999   struct token out;
2000   for (size_t i = 0; ; i++)
2001     {
2002       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
2003         {
2004           /* We always get a T_ENDCMD at the end of an input file
2005              (transformed from T_STOP by lex_source_try_get_pp()) and
2006              merger_add() should never return -1 on T_ENDCMD. */
2007           assert (lex_stage_is_empty (&src->merge));
2008           return false;
2009         }
2010
2011       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
2012                                &out);
2013       if (!retval)
2014         {
2015           lex_stage_shift (&src->lookahead, &src->merge, 1);
2016           return true;
2017         }
2018       else if (retval > 0)
2019         {
2020           /* Add a token that merges all the tokens together. */
2021           const struct lex_token *first = lex_stage_first (&src->merge);
2022           const struct lex_token *last = lex_stage_nth (&src->merge,
2023                                                         retval - 1);
2024           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2025           struct lex_token *t = xmalloc (sizeof *t);
2026           *t = (struct lex_token) {
2027             .token = out,
2028             .token_pos = first->token_pos,
2029             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2030             .line_pos = first->line_pos,
2031             .first_line = first->first_line,
2032
2033             /* This works well if all the tokens were not expanded from macros,
2034                or if they came from the same macro expansion.  It just gives up
2035                in the other (corner) cases. */
2036             .macro_rep = macro ? first->macro_rep : NULL,
2037             .ofs = macro ? first->ofs : 0,
2038             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2039             .ref_cnt = macro ? first->ref_cnt : NULL,
2040           };
2041           if (t->ref_cnt)
2042             ++*t->ref_cnt;
2043           lex_stage_push_last (&src->lookahead, t);
2044
2045           for (int i = 0; i < retval; i++)
2046             lex_stage_pop_first (&src->merge);
2047           return true;
2048         }
2049     }
2050 }
2051 \f
2052 static void
2053 lex_source_push_endcmd__ (struct lex_source *src)
2054 {
2055   assert (lex_stage_is_empty (&src->lookahead));
2056   struct lex_token *token = xmalloc (sizeof *token);
2057   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2058   lex_stage_push_last (&src->lookahead, token);
2059 }
2060
2061 static struct lex_source *
2062 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2063 {
2064   struct lex_source *src = xmalloc (sizeof *src);
2065   *src = (struct lex_source) {
2066     .reader = reader,
2067     .segmenter = segmenter_init (reader->syntax, false),
2068     .lexer = lexer,
2069   };
2070
2071   lex_source_push_endcmd__ (src);
2072
2073   return src;
2074 }
2075
2076 static void
2077 lex_source_destroy (struct lex_source *src)
2078 {
2079   char *file_name = src->reader->file_name;
2080   char *encoding = src->reader->encoding;
2081   if (src->reader->class->destroy != NULL)
2082     src->reader->class->destroy (src->reader);
2083   free (file_name);
2084   free (encoding);
2085   free (src->buffer);
2086   lex_stage_uninit (&src->pp);
2087   lex_stage_uninit (&src->merge);
2088   lex_stage_uninit (&src->lookahead);
2089   ll_remove (&src->ll);
2090   free (src);
2091 }
2092 \f
/* A lex_reader that obtains its input from a u8_istream, which
   lex_reader_for_file() opens on a file or on stdin. */
struct lex_file_reader
  {
    /* Embedded base reader.  lex_file_reader_cast() converts a pointer to
       this member back into a pointer to the whole structure. */
    struct lex_reader reader;

    /* Stream that lex_file_read() reads from and lex_file_close() releases. */
    struct u8_istream *istream;
  };
2098
/* Forward declaration.  The definition follows the callback functions that
   it refers to. */
static struct lex_reader_class lex_file_reader_class;
2100
2101 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2102    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2103    ENCODING, which should take one of the forms accepted by
2104    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2105    mode of the new reader, respectively.
2106
2107    Returns a null pointer if FILE_NAME cannot be opened. */
2108 struct lex_reader *
2109 lex_reader_for_file (const char *file_name, const char *encoding,
2110                      enum segmenter_mode syntax,
2111                      enum lex_error_mode error)
2112 {
2113   struct lex_file_reader *r;
2114   struct u8_istream *istream;
2115
2116   istream = (!strcmp(file_name, "-")
2117              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2118              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2119   if (istream == NULL)
2120     {
2121       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2122       return NULL;
2123     }
2124
2125   r = xmalloc (sizeof *r);
2126   lex_reader_init (&r->reader, &lex_file_reader_class);
2127   r->reader.syntax = syntax;
2128   r->reader.error = error;
2129   r->reader.file_name = xstrdup (file_name);
2130   r->reader.encoding = xstrdup_if_nonnull (encoding);
2131   r->reader.line_number = 1;
2132   r->istream = istream;
2133
2134   return &r->reader;
2135 }
2136
2137 static struct lex_file_reader *
2138 lex_file_reader_cast (struct lex_reader *r)
2139 {
2140   return UP_CAST (r, struct lex_file_reader, reader);
2141 }
2142
2143 static size_t
2144 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2145                enum prompt_style prompt_style UNUSED)
2146 {
2147   struct lex_file_reader *r = lex_file_reader_cast (r_);
2148   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2149   if (n_read < 0)
2150     {
2151       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2152       return 0;
2153     }
2154   return n_read;
2155 }
2156
2157 static void
2158 lex_file_close (struct lex_reader *r_)
2159 {
2160   struct lex_file_reader *r = lex_file_reader_cast (r_);
2161
2162   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2163     {
2164       if (u8_istream_close (r->istream) != 0)
2165         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2166     }
2167   else
2168     u8_istream_free (r->istream);
2169
2170   free (r);
2171 }
2172
/* Callback table for readers created by lex_reader_for_file().

   The initializer is positional.  NOTE(review): presumably the first member
   of struct lex_reader_class is the read callback and the second the destroy
   callback; the struct is declared elsewhere, so confirm there. */
static struct lex_reader_class lex_file_reader_class =
  {
    lex_file_read,
    lex_file_close
  };
2178 \f
/* A lex_reader that obtains its input from an in-memory substring. */
struct lex_string_reader
  {
    /* Embedded base reader.  lex_string_reader_cast() converts a pointer to
       this member back into a pointer to the whole structure. */
    struct lex_reader reader;

    /* The text to read.  Freed with ss_dealloc() by lex_string_close(). */
    struct substring s;

    /* Number of bytes of S that lex_string_read() has handed out so far. */
    size_t offset;
  };
2185
/* Forward declaration.  The definition follows the callback functions that
   it refers to. */
static struct lex_reader_class lex_string_reader_class;
2187
2188 /* Creates and returns a new lex_reader for the contents of S, which must be
2189    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2190    with ss_dealloc() when it is closed. */
2191 struct lex_reader *
2192 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2193 {
2194   struct lex_string_reader *r;
2195
2196   r = xmalloc (sizeof *r);
2197   lex_reader_init (&r->reader, &lex_string_reader_class);
2198   r->reader.syntax = SEG_MODE_AUTO;
2199   r->reader.encoding = xstrdup_if_nonnull (encoding);
2200   r->s = s;
2201   r->offset = 0;
2202
2203   return &r->reader;
2204 }
2205
2206 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2207    which must be encoded in ENCODING.  The caller retains ownership of S. */
2208 struct lex_reader *
2209 lex_reader_for_string (const char *s, const char *encoding)
2210 {
2211   struct substring ss;
2212   ss_alloc_substring (&ss, ss_cstr (s));
2213   return lex_reader_for_substring_nocopy (ss, encoding);
2214 }
2215
/* Formats FORMAT, a printf()-like format string whose arguments follow
   ENCODING, and creates and returns a new lex_reader that reads the formatted
   result.  ENCODING is passed along as the encoding of that text.  The
   formatted string is handed over to the new reader. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list ap;
  va_start (ap, encoding);
  char *text = xvasprintf (format, ap);
  va_end (ap);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2230
2231 static struct lex_string_reader *
2232 lex_string_reader_cast (struct lex_reader *r)
2233 {
2234   return UP_CAST (r, struct lex_string_reader, reader);
2235 }
2236
2237 static size_t
2238 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2239                  enum prompt_style prompt_style UNUSED)
2240 {
2241   struct lex_string_reader *r = lex_string_reader_cast (r_);
2242   size_t chunk;
2243
2244   chunk = MIN (n, r->s.length - r->offset);
2245   memcpy (buf, r->s.string + r->offset, chunk);
2246   r->offset += chunk;
2247
2248   return chunk;
2249 }
2250
2251 static void
2252 lex_string_close (struct lex_reader *r_)
2253 {
2254   struct lex_string_reader *r = lex_string_reader_cast (r_);
2255
2256   ss_dealloc (&r->s);
2257   free (r);
2258 }
2259
/* Callback table for readers created by lex_reader_for_substring_nocopy().

   The initializer is positional.  NOTE(review): presumably the first member
   of struct lex_reader_class is the read callback and the second the destroy
   callback; the struct is declared elsewhere, so confirm there. */
static struct lex_reader_class lex_string_reader_class =
  {
    lex_string_read,
    lex_string_close
  };