pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source.

   A lex_token is heap-allocated; lex_token_destroy() uninitializes TOKEN,
   drops this token's reference on MACRO_REP (if any), and frees the
   structure. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call. */
    size_t token_pos;           /* Offset into src->buffer of token start. */
    size_t token_len;           /* Length of source for token in bytes. */

    /* For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       REF_CNT points to a counter that lex_token_destroy() decrements; when
       the counter reaches zero, both MACRO_REP and the counter itself are
       freed. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  82
  83 static void
  84 lex_token_destroy (struct lex_token *t)
  85 {
  86   token_uninit (&t->token);
  87   if (t->ref_cnt)
  88     {
  89       assert (*t->ref_cnt > 0);
  90       if (!--*t->ref_cnt)
  91         {
  92           free (t->macro_rep);
  93           free (t->ref_cnt);
  94         }
  95     }
  96   free (t);
  97 }
  98 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source.

   DEQUE supplies the indexes and TOKENS the storage: the lex_stage_*()
   functions subscript TOKENS with the values returned by the deque_*()
   functions. */
struct lex_stage
  {
    struct deque deque;         /* Indexing state for 'tokens'. */
    struct lex_token **tokens;  /* Token pointers; freed by lex_stage_uninit(). */
  };
 106
/* Forward declarations of the lex_stage operations. */

/* Emptying and tearing down a stage. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

/* Querying the number of tokens in a stage. */
static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

/* Peeking at tokens without removing them. */
static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

/* Adding at the end, removing from the start. */
static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

/* Moving N tokens from the start of SRC to the end of DST. */
static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
 121
 122 /* Deletes all the tokens from STAGE. */
 123 static void
 124 lex_stage_clear (struct lex_stage *stage)
 125 {
 126   while (!deque_is_empty (&stage->deque))
 127     lex_stage_pop_first (stage);
 128 }
 129
/* Deletes all the tokens from STAGE and frees storage for the deque.

   The tokens must be destroyed first because they are reached through
   STAGE->tokens, which is freed last. */
static void
lex_stage_uninit (struct lex_stage *stage)
{
  lex_stage_clear (stage);
  free (stage->tokens);
}
 137
/* Returns true if STAGE contains no tokens, otherwise false.  This simply
   reports whether STAGE's deque is empty. */
static bool
lex_stage_is_empty (const struct lex_stage *stage)
{
  return deque_is_empty (&stage->deque);
}
 144
/* Returns the number of tokens in STAGE, as counted by STAGE's deque. */
static size_t
lex_stage_count (const struct lex_stage *stage)
{
  return deque_count (&stage->deque);
}
 151
/* Returns the first token in STAGE, which must be nonempty.
   The first token is the one accessed with the least lookahead.

   Equivalent to lex_stage_nth (STAGE, 0).  The token stays in STAGE. */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  return lex_stage_nth (stage, 0);
}
 159
 160 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 161    lookahead) is 0, the second token is 1, and so on.  There must be at least
 162    INDEX + 1 tokens in STAGE. */
 163 static struct lex_token *
 164 lex_stage_nth (struct lex_stage *stage, size_t index)
 165 {
 166   return stage->tokens[deque_back (&stage->deque, index)];
 167 }
 168
 169 /* Adds TOKEN so that it becomes the last token in STAGE. */
 170 static void
 171 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 172 {
 173   if (deque_is_full (&stage->deque))
 174     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 175                                   sizeof *stage->tokens);
 176   stage->tokens[deque_push_front (&stage->deque)] = token;
 177 }
 178
 179 /* Removes and returns the first token from STAGE. */
 180 static struct lex_token *
 181 lex_stage_take_first (struct lex_stage *stage)
 182 {
 183   return stage->tokens[deque_pop_back (&stage->deque)];
 184 }
 185
 186 /* Removes the first token from STAGE and uninitializes it. */
 187 static void
 188 lex_stage_pop_first (struct lex_stage *stage)
 189 {
 190   lex_token_destroy (lex_stage_take_first (stage));
 191 }
 192
 193 /* Removes the first N tokens from SRC, appending them to DST as the last
 194    tokens. */
 195 static void
 196 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 197 {
 198   for (size_t i = 0; i < n; i++)
 199     lex_stage_push_last (dst, lex_stage_take_first (src));
 200 }
 201
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;  /* Reader passed to lex_source_create(). */
    struct lexer *lexer;        /* Lexer passed to lex_source_create(). */
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;               /* Source file contents. */
    size_t length;              /* Number of bytes filled. */
    size_t allocated;           /* Number of bytes allocated. */

    /* Offsets into 'buffer'. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */

    /* Offset into 'buffer' of starts of lines. */
    size_t *lines;
    size_t n_lines, allocated_lines;

    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'parse'.

       - parse: Tokens available to the client for parsing.

      'pp' and 'merge' store tokens only temporarily until they pass into
      'parse'.  Tokens then live in 'parse' until the command is fully
      consumed, at which time they are freed together.

      lex_get() treats parse[parse_ofs] as the current token and asks for more
      tokens whenever parse_ofs == n_parse.  'allocated_parse' is presumably
      the capacity of 'parse' -- confirm against lex_source_push_parse(). */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_token **parse;
    size_t n_parse, allocated_parse, parse_ofs;
  };
 253
/* Forward declarations: creating a lex_source for a lexer/reader pair, and
   destroying one. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 257
/* Lexer.

   Created by lex_create() and destroyed by lex_destroy(), which also destroys
   every source still in 'sources' as well as 'macros'. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Macros added with lex_define_macro(). */
  };
 264
/* Forward declarations of internal helpers.  By this file's convention,
   names ending in "__" are private to the lexer. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);
static void lex_source_push_parse (struct lex_source *, struct lex_token *);
static void lex_source_clear_parse (struct lex_source *);

static bool lex_source_get_parse (struct lex_source *);
/* FORMAT is a printf-style format string (argument 4) whose arguments are
   supplied through the va_list. */
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 279 \f
 280 /* Initializes READER with the specified CLASS and otherwise some reasonable
 281    defaults.  The caller should fill in the others members as desired. */
 282 void
 283 lex_reader_init (struct lex_reader *reader,
 284                  const struct lex_reader_class *class)
 285 {
 286   reader->class = class;
 287   reader->syntax = SEG_MODE_AUTO;
 288   reader->error = LEX_ERROR_CONTINUE;
 289   reader->file_name = NULL;
 290   reader->encoding = NULL;
 291   reader->line_number = 0;
 292   reader->eof = false;
 293 }
 294
 295 /* Frees any file name already in READER and replaces it by a copy of
 296    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 297 void
 298 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 299 {
 300   free (reader->file_name);
 301   reader->file_name = xstrdup_if_nonnull (file_name);
 302 }
 303 \f
 304 /* Creates and returns a new lexer. */
 305 struct lexer *
 306 lex_create (void)
 307 {
 308   struct lexer *lexer = xmalloc (sizeof *lexer);
 309   *lexer = (struct lexer) {
 310     .sources = LL_INITIALIZER (lexer->sources),
 311     .macros = macro_set_create (),
 312   };
 313   return lexer;
 314 }
 315
 316 /* Destroys LEXER. */
 317 void
 318 lex_destroy (struct lexer *lexer)
 319 {
 320   if (lexer != NULL)
 321     {
 322       struct lex_source *source, *next;
 323
 324       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 325         lex_source_destroy (source);
 326       macro_set_destroy (lexer->macros);
 327       free (lexer);
 328     }
 329 }
 330
/* Adds M to LEXER's set of macros.  M replaces any existing macro with the
   same name.  Takes ownership of M.

   This is a thin wrapper around macro_set_add() applied to LEXER->macros. */
void
lex_define_macro (struct lexer *lexer, struct macro *m)
{
  macro_set_add (lexer->macros, m);
}
 338
 339 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 340    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 341    token. */
 342 void
 343 lex_include (struct lexer *lexer, struct lex_reader *reader)
 344 {
 345   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 346   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 347 }
 348
 349 /* Appends READER to LEXER, so that it will be read after all other current
 350    readers have already been read. */
 351 void
 352 lex_append (struct lexer *lexer, struct lex_reader *reader)
 353 {
 354   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 355 }
 356 \f
 357 /* Advancing. */
 358
 359 /* Advances LEXER to the next token, consuming the current token. */
 360 void
 361 lex_get (struct lexer *lexer)
 362 {
 363   struct lex_source *src;
 364
 365   src = lex_source__ (lexer);
 366   if (src == NULL)
 367     return;
 368
 369   if (src->parse_ofs < src->n_parse)
 370     {
 371       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 372         lex_source_clear_parse (src);
 373       else
 374         src->parse_ofs++;
 375     }
 376
 377   while (src->parse_ofs == src->n_parse)
 378     if (!lex_source_get_parse (src))
 379       {
 380         lex_source_destroy (src);
 381         src = lex_source__ (lexer);
 382         if (src == NULL)
 383           return;
 384       }
 385 }
 386
 387 /* Advances LEXER by N tokens. */
 388 void
 389 lex_get_n (struct lexer *lexer, size_t n)
 390 {
 391   while (n-- > 0)
 392     lex_get (lexer);
 393 }
 394 \f
 395 /* Issuing errors. */
 396
 397 /* Prints a syntax error message containing the current token and
 398    given message MESSAGE (if non-null). */
 399 void
 400 lex_error (struct lexer *lexer, const char *format, ...)
 401 {
 402   va_list args;
 403
 404   va_start (args, format);
 405   lex_next_error_valist (lexer, 0, 0, format, args);
 406   va_end (args);
 407 }
 408
/* Prints a syntax error message for the current token.  FORMAT, if non-null,
   is a printf-style format string whose arguments are taken from ARGS.

   This is the va_list counterpart of lex_error(); it passes 0 for both token
   offsets to lex_next_error_valist(). */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  lex_next_error_valist (lexer, 0, 0, format, args);
}
 416
 417 /* Prints a syntax error message containing the current token and
 418    given message MESSAGE (if non-null). */
 419 void
 420 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 421 {
 422   va_list args;
 423
 424   va_start (args, format);
 425   lex_next_error_valist (lexer, n0, n1, format, args);
 426   va_end (args);
 427 }
 428
 429 /* Prints a syntax error message saying that one of the strings provided as
 430    varargs, up to the first NULL, is expected. */
 431 void
 432 (lex_error_expecting) (struct lexer *lexer, ...)
 433 {
 434   va_list args;
 435
 436   va_start (args, lexer);
 437   lex_error_expecting_valist (lexer, args);
 438   va_end (args);
 439 }
 440
 441 /* Prints a syntax error message saying that one of the options provided in
 442    ARGS, up to the first NULL, is expected. */
 443 void
 444 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 445 {
 446   enum { MAX_OPTIONS = 9 };
 447   const char *options[MAX_OPTIONS];
 448   int n = 0;
 449   while (n < MAX_OPTIONS)
 450     {
 451       const char *option = va_arg (args, const char *);
 452       if (!option)
 453         break;
 454
 455       options[n++] = option;
 456     }
 457   lex_error_expecting_array (lexer, options, n);
 458 }
 459
 460 void
 461 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 462 {
 463   switch (n)
 464     {
 465     case 0:
 466       lex_error (lexer, NULL);
 467       break;
 468
 469     case 1:
 470       lex_error (lexer, _("expecting %s"), options[0]);
 471       break;
 472
 473     case 2:
 474       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 475       break;
 476
 477     case 3:
 478       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 479                  options[2]);
 480       break;
 481
 482     case 4:
 483       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 484                  options[0], options[1], options[2], options[3]);
 485       break;
 486
 487     case 5:
 488       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 489                  options[0], options[1], options[2], options[3], options[4]);
 490       break;
 491
 492     case 6:
 493       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 494                  options[0], options[1], options[2], options[3], options[4],
 495                  options[5]);
 496       break;
 497
 498     case 7:
 499       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 500                  options[0], options[1], options[2], options[3], options[4],
 501                  options[5], options[6]);
 502       break;
 503
 504     case 8:
 505       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 506                  options[0], options[1], options[2], options[3], options[4],
 507                  options[5], options[6], options[7]);
 508       break;
 509
 510     default:
 511       lex_error (lexer, NULL);
 512     }
 513 }
 514
/* Reports an error to the effect that subcommand SBC may only be specified
   once.  SBC is the subcommand's name, inserted into the message as-is.

   This function does not take a lexer as an argument or use lex_error(),
   because the result would ordinarily just be redundant: "Syntax error at
   SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
   not help the user find the error.  Instead the message is issued directly
   with msg() in class SE. */
void
lex_sbc_only_once (const char *sbc)
{
  msg (SE, _("Subcommand %s may only be specified once."), sbc);
}
 527
/* Reports an error to the effect that subcommand SBC is missing.  SBC is the
   subcommand's name, inserted into the message as-is.

   This function does not take a lexer as an argument or use lex_error(),
   because a missing subcommand can normally be detected only after the whole
   command has been parsed, and so lex_error() would always report "Syntax
   error at end of command", which does not help the user find the error.
   Instead the message is issued directly with msg() in class SE. */
void
lex_sbc_missing (const char *sbc)
{
  msg (SE, _("Required subcommand %s was not specified."), sbc);
}
 539
/* Reports an error to the effect that specification SPEC may only be specified
   once within subcommand SBC.

   Unlike lex_sbc_only_once(), this goes through lex_error(), so the message
   is reported as a syntax error at LEXER's current token.  Note that SPEC
   fills the first placeholder and SBC the second. */
void
lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
{
  lex_error (lexer, _("%s may only be specified once within subcommand %s"),
             spec, sbc);
}
 548
 549 /* Reports an error to the effect that specification SPEC is missing within
 550    subcommand SBC. */
 551 void
 552 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 553 {
 554   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 555              sbc, spec);
 556 }
 557
 558 /* Prints a syntax error message containing the current token and
 559    given message MESSAGE (if non-null). */
 560 void
 561 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 562                        const char *format, va_list args)
 563 {
 564   struct lex_source *src = lex_source__ (lexer);
 565
 566   if (src != NULL)
 567     lex_source_error_valist (src, n0, n1, format, args);
 568   else
 569     {
 570       struct string s;
 571
 572       ds_init_empty (&s);
 573       ds_put_format (&s, _("Syntax error at end of input"));
 574       if (format != NULL)
 575         {
 576           ds_put_cstr (&s, ": ");
 577           ds_put_vformat (&s, format, args);
 578         }
 579       if (ds_last (&s) != '.')
 580         ds_put_byte (&s, '.');
 581       msg (SE, "%s", ds_cstr (&s));
 582       ds_destroy (&s);
 583     }
 584 }
 585
 586 /* Checks that we're at end of command.
 587    If so, returns a successful command completion code.
 588    If not, flags a syntax error and returns an error command
 589    completion code. */
 590 int
 591 lex_end_of_command (struct lexer *lexer)
 592 {
 593   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 594     {
 595       lex_error (lexer, _("expecting end of command"));
 596       return CMD_FAILURE;
 597     }
 598   else
 599     return CMD_SUCCESS;
 600 }
 601 \f
 602 /* Token testing functions. */
 603
/* Returns true if the current token is a number.  Shorthand for
   lex_next_is_number() with a lookahead of 0. */
bool
lex_is_number (const struct lexer *lexer)
{
  return lex_next_is_number (lexer, 0);
}
 610
/* Returns true if the current token is a string.  Shorthand for
   lex_next_is_string() with a lookahead of 0. */
bool
lex_is_string (const struct lexer *lexer)
{
  return lex_next_is_string (lexer, 0);
}
 617
/* Returns the value of the current token, which must be a
   floating point number.  Shorthand for lex_next_number() with a lookahead
   of 0. */
double
lex_number (const struct lexer *lexer)
{
  return lex_next_number (lexer, 0);
}
 625
/* Returns true iff the current token is an integer.  Shorthand for
   lex_next_is_integer() with a lookahead of 0. */
bool
lex_is_integer (const struct lexer *lexer)
{
  return lex_next_is_integer (lexer, 0);
}
 632
/* Returns the value of the current token, which must be an
   integer.  Shorthand for lex_next_integer() with a lookahead of 0. */
long
lex_integer (const struct lexer *lexer)
{
  return lex_next_integer (lexer, 0);
}
 640 \f
 641 /* Token testing functions with lookahead.
 642
 643    A value of 0 for N as an argument to any of these functions refers to the
 644    current token.  Lookahead is limited to the current command.  Any N greater
 645    than the number of tokens remaining in the current command will be treated
 646    as referring to a T_ENDCMD token. */
 647
 648 /* Returns true if the token N ahead of the current token is a number. */
 649 bool
 650 lex_next_is_number (const struct lexer *lexer, int n)
 651 {
 652   return token_is_number (lex_next (lexer, n));
 653 }
 654
 655 /* Returns true if the token N ahead of the current token is a string. */
 656 bool
 657 lex_next_is_string (const struct lexer *lexer, int n)
 658 {
 659   return token_is_string (lex_next (lexer, n));
 660 }
 661
 662 /* Returns the value of the token N ahead of the current token, which must be a
 663    floating point number. */
 664 double
 665 lex_next_number (const struct lexer *lexer, int n)
 666 {
 667   return token_number (lex_next (lexer, n));
 668 }
 669
 670 /* Returns true if the token N ahead of the current token is an integer. */
 671 bool
 672 lex_next_is_integer (const struct lexer *lexer, int n)
 673 {
 674   return token_is_integer (lex_next (lexer, n));
 675 }
 676
 677 /* Returns the value of the token N ahead of the current token, which must be
 678    an integer. */
 679 long
 680 lex_next_integer (const struct lexer *lexer, int n)
 681 {
 682   return token_integer (lex_next (lexer, n));
 683 }
 684 \f
 685 /* Token matching functions. */
 686
 687 /* If the current token has the specified TYPE, skips it and returns true.
 688    Otherwise, returns false. */
 689 bool
 690 lex_match (struct lexer *lexer, enum token_type type)
 691 {
 692   if (lex_token (lexer) == type)
 693     {
 694       lex_get (lexer);
 695       return true;
 696     }
 697   else
 698     return false;
 699 }
 700
 701 /* If the current token matches IDENTIFIER, skips it and returns true.
 702    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 703    returns false.
 704
 705    IDENTIFIER must be an ASCII string. */
 706 bool
 707 lex_match_id (struct lexer *lexer, const char *identifier)
 708 {
 709   return lex_match_id_n (lexer, identifier, 3);
 710 }
 711
 712 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 713    may be abbreviated to its first N letters.  Otherwise, returns false.
 714
 715    IDENTIFIER must be an ASCII string. */
 716 bool
 717 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 718 {
 719   if (lex_token (lexer) == T_ID
 720       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 721     {
 722       lex_get (lexer);
 723       return true;
 724     }
 725   else
 726     return false;
 727 }
 728
 729 /* If the current token is integer X, skips it and returns true.  Otherwise,
 730    returns false. */
 731 bool
 732 lex_match_int (struct lexer *lexer, int x)
 733 {
 734   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 735     {
 736       lex_get (lexer);
 737       return true;
 738     }
 739   else
 740     return false;
 741 }
 742 \f
 743 /* Forced matches. */
 744
 745 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 746    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 747    false.
 748
 749    IDENTIFIER must be an ASCII string. */
 750 bool
 751 lex_force_match_id (struct lexer *lexer, const char *identifier)
 752 {
 753   if (lex_match_id (lexer, identifier))
 754     return true;
 755   else
 756     {
 757       lex_error_expecting (lexer, identifier);
 758       return false;
 759     }
 760 }
 761
 762 /* If the current token has the specified TYPE, skips it and returns true.
 763    Otherwise, reports an error and returns false. */
 764 bool
 765 lex_force_match (struct lexer *lexer, enum token_type type)
 766 {
 767   if (lex_token (lexer) == type)
 768     {
 769       lex_get (lexer);
 770       return true;
 771     }
 772   else
 773     {
 774       const char *type_string = token_type_to_string (type);
 775       if (type_string)
 776         {
 777           char *s = xasprintf ("`%s'", type_string);
 778           lex_error_expecting (lexer, s);
 779           free (s);
 780         }
 781       else
 782         lex_error_expecting (lexer, token_type_to_name (type));
 783
 784       return false;
 785     }
 786 }
 787
 788 /* If the current token is a string, does nothing and returns true.
 789    Otherwise, reports an error and returns false. */
 790 bool
 791 lex_force_string (struct lexer *lexer)
 792 {
 793   if (lex_is_string (lexer))
 794     return true;
 795   else
 796     {
 797       lex_error (lexer, _("expecting string"));
 798       return false;
 799     }
 800 }
 801
 802 /* If the current token is a string or an identifier, does nothing and returns
 803    true.  Otherwise, reports an error and returns false.
 804
 805    This is meant for use in syntactic situations where we want to encourage the
 806    user to supply a quoted string, but for compatibility we also accept
 807    identifiers.  (One example of such a situation is file names.)  Therefore,
 808    the error message issued when the current token is wrong only says that a
 809    string is expected and doesn't mention that an identifier would also be
 810    accepted. */
 811 bool
 812 lex_force_string_or_id (struct lexer *lexer)
 813 {
 814   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 815 }
 816
 817 /* If the current token is an integer, does nothing and returns true.
 818    Otherwise, reports an error and returns false. */
 819 bool
 820 lex_force_int (struct lexer *lexer)
 821 {
 822   if (lex_is_integer (lexer))
 823     return true;
 824   else
 825     {
 826       lex_error (lexer, _("expecting integer"));
 827       return false;
 828     }
 829 }
 830
 831 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 832    nothing and returns true.  Otherwise, reports an error and returns false.
 833    If NAME is nonnull, then it is used in the error message. */
 834 bool
 835 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 836 {
 837   bool is_integer = lex_is_integer (lexer);
 838   bool too_small = is_integer && lex_integer (lexer) < min;
 839   bool too_big = is_integer && lex_integer (lexer) > max;
 840   if (is_integer && !too_small && !too_big)
 841     return true;
 842
 843   if (min > max)
 844     {
 845       /* Weird, maybe a bug in the caller.  Just report that we needed an
 846          integer. */
 847       if (name)
 848         lex_error (lexer, _("Integer expected for %s."), name);
 849       else
 850         lex_error (lexer, _("Integer expected."));
 851     }
 852   else if (min == max)
 853     {
 854       if (name)
 855         lex_error (lexer, _("Expected %ld for %s."), min, name);
 856       else
 857         lex_error (lexer, _("Expected %ld."), min);
 858     }
 859   else if (min + 1 == max)
 860     {
 861       if (name)
 862         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 863       else
 864         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 865     }
 866   else
 867     {
 868       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 869       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 870
 871       if (report_lower_bound && report_upper_bound)
 872         {
 873           if (name)
 874             lex_error (lexer,
 875                        _("Expected integer between %ld and %ld for %s."),
 876                        min, max, name);
 877           else
 878             lex_error (lexer, _("Expected integer between %ld and %ld."),
 879                        min, max);
 880         }
 881       else if (report_lower_bound)
 882         {
 883           if (min == 0)
 884             {
 885               if (name)
 886                 lex_error (lexer, _("Expected non-negative integer for %s."),
 887                            name);
 888               else
 889                 lex_error (lexer, _("Expected non-negative integer."));
 890             }
 891           else if (min == 1)
 892             {
 893               if (name)
 894                 lex_error (lexer, _("Expected positive integer for %s."),
 895                            name);
 896               else
 897                 lex_error (lexer, _("Expected positive integer."));
 898             }
 899         }
 900       else if (report_upper_bound)
 901         {
 902           if (name)
 903             lex_error (lexer,
 904                        _("Expected integer less than or equal to %ld for %s."),
 905                        max, name);
 906           else
 907             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 908                        max);
 909         }
 910       else
 911         {
 912           if (name)
 913             lex_error (lexer, _("Integer expected for %s."), name);
 914           else
 915             lex_error (lexer, _("Integer expected."));
 916         }
 917     }
 918   return false;
 919 }
 920
 921 /* If the current token is a number, does nothing and returns true.
 922    Otherwise, reports an error and returns false. */
 923 bool
 924 lex_force_num (struct lexer *lexer)
 925 {
 926   if (lex_is_number (lexer))
 927     return true;
 928
 929   lex_error (lexer, _("expecting number"));
 930   return false;
 931 }
 932
 933 /* If the current token is an identifier, does nothing and returns true.
 934    Otherwise, reports an error and returns false. */
 935 bool
 936 lex_force_id (struct lexer *lexer)
 937 {
 938   if (lex_token (lexer) == T_ID)
 939     return true;
 940
 941   lex_error (lexer, _("expecting identifier"));
 942   return false;
 943 }
 944 \f
 945 /* Token accessors. */
 946
/* Returns the type of LEXER's current token.  Shorthand for
   lex_next_token() with a lookahead of 0. */
enum token_type
lex_token (const struct lexer *lexer)
{
  return lex_next_token (lexer, 0);
}
 953
/* Returns the number in LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero.

   Shorthand for lex_next_tokval() with a lookahead of 0. */
double
lex_tokval (const struct lexer *lexer)
{
  return lex_next_tokval (lexer, 0);
}
 963
 964 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 965
 966    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 967    this functions this function will always return NULL.
 968
 969    The UTF-8 encoding of the returned string is correct for variable names and
 970    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 971    data_in() to use it in a "union value".  */
 972 const char *
 973 lex_tokcstr (const struct lexer *lexer)
 974 {
 975   return lex_next_tokcstr (lexer, 0);
 976 }
 977
 978 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 979    null-terminated (but the null terminator is not included in the returned
 980    substring's 'length').
 981
 982    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 983    this functions this function will always return NULL.
 984
 985    The UTF-8 encoding of the returned string is correct for variable names and
 986    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 987    data_in() to use it in a "union value".  */
 988 struct substring
 989 lex_tokss (const struct lexer *lexer)
 990 {
 991   return lex_next_tokss (lexer, 0);
 992 }
 993 \f
 994 /* Looking ahead.
 995
 996    A value of 0 for N as an argument to any of these functions refers to the
 997    current token.  Lookahead is limited to the current command.  Any N greater
 998    than the number of tokens remaining in the current command will be treated
 999    as referring to a T_ENDCMD token. */
1000
1001 static const struct lex_token *
1002 lex_next__ (const struct lexer *lexer_, int n)
1003 {
1004   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1005   struct lex_source *src = lex_source__ (lexer);
1006
1007   if (src != NULL)
1008     return lex_source_next__ (src, n);
1009   else
1010     {
1011       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1012       return &stop_token;
1013     }
1014 }
1015
1016 static const struct lex_token *
1017 lex_source_next__ (const struct lex_source *src_, int n)
1018 {
1019   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1020
1021   if (n < 0)
1022     {
1023       if (-n <= src->parse_ofs)
1024         return src->parse[src->parse_ofs - (-n)];
1025       else
1026         {
1027           static const struct lex_token endcmd_token
1028             = { .token = { .type = T_ENDCMD } };
1029           return &endcmd_token;
1030         }
1031     }
1032
1033   while (src->n_parse - src->parse_ofs <= n)
1034     {
1035       if (src->n_parse > 0)
1036         {
1037           const struct lex_token *t = src->parse[src->n_parse - 1];
1038           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1039             return t;
1040         }
1041
1042       lex_source_get_parse (src);
1043     }
1044
1045   return src->parse[src->parse_ofs + n];
1046 }
1047
1048 /* Returns the "struct token" of the token N after the current one in LEXER.
1049    The returned pointer can be invalidated by pretty much any succeeding call
1050    into the lexer, although the string pointer within the returned token is
1051    only invalidated by consuming the token (e.g. with lex_get()). */
1052 const struct token *
1053 lex_next (const struct lexer *lexer, int n)
1054 {
1055   return &lex_next__ (lexer, n)->token;
1056 }
1057
1058 /* Returns the type of the token N after the current one in LEXER. */
1059 enum token_type
1060 lex_next_token (const struct lexer *lexer, int n)
1061 {
1062   return lex_next (lexer, n)->type;
1063 }
1064
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens carry a meaningful value.  For any
   other token type this function always returns zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_number (token);
}
1074
1075 /* Returns the null-terminated string in the token N after the current one, in
1076    UTF-8 encoding.
1077
1078    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1079    this functions this function will always return NULL.
1080
1081    The UTF-8 encoding of the returned string is correct for variable names and
1082    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1083    data_in() to use it in a "union value".  */
1084 const char *
1085 lex_next_tokcstr (const struct lexer *lexer, int n)
1086 {
1087   return lex_next_tokss (lexer, n).string;
1088 }
1089
1090 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1091    The string is null-terminated (but the null terminator is not included in
1092    the returned substring's 'length').
1093
1094    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1095    tokens this functions this function will always return NULL.
1096
1097    The UTF-8 encoding of the returned string is correct for variable names and
1098    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1099    data_in() to use it in a "union value".  */
1100 struct substring
1101 lex_next_tokss (const struct lexer *lexer, int n)
1102 {
1103   return lex_next (lexer, n)->string;
1104 }
1105
1106 int
1107 lex_ofs (const struct lexer *lexer)
1108 {
1109   struct lex_source *src = lex_source__ (lexer);
1110   return src ? src->parse_ofs : 0;
1111 }
1112
1113 const struct token *
1114 lex_ofs_token (const struct lexer *lexer_, int ofs)
1115 {
1116   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1117   struct lex_source *src = lex_source__ (lexer);
1118
1119   if (src != NULL)
1120     return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1121   else
1122     {
1123       static const struct token stop_token = { .type = T_STOP };
1124       return &stop_token;
1125     }
1126 }
1127
/* Returns a newly allocated msg_location for the tokens with offsets OFS0
   through OFS1 in LEXER, where the offsets are in the same units as the
   return value of lex_ofs().  They are converted into distances from the
   current token and handed to lex_get_location(). */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  int current = lex_ofs (lexer);
  int n0 = ofs0 - current;
  int n1 = ofs1 - current;
  return lex_get_location (lexer, n0, n1);
}
1134
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no source, returns a newly allocated empty string. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source, so a lexer without
     any source must be handled here, as the other accessors in this file
     do. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1147
1148 /* Returns true if the token N ahead of the current one was produced by macro
1149    expansion, false otherwise. */
1150 bool
1151 lex_next_is_from_macro (const struct lexer *lexer, int n)
1152 {
1153   return lex_next__ (lexer, n)->macro_rep != NULL;
1154 }
1155
1156 static bool
1157 lex_tokens_match (const struct token *actual, const struct token *expected)
1158 {
1159   if (actual->type != expected->type)
1160     return false;
1161
1162   switch (actual->type)
1163     {
1164     case T_POS_NUM:
1165     case T_NEG_NUM:
1166       return actual->number == expected->number;
1167
1168     case T_ID:
1169       return lex_id_match (expected->string, actual->string);
1170
1171     case T_STRING:
1172       return (actual->string.length == expected->string.length
1173               && !memcmp (actual->string.string, expected->string.string,
1174                           actual->string.length));
1175
1176     default:
1177       return true;
1178     }
1179 }
1180
1181 static size_t
1182 lex_at_phrase__ (struct lexer *lexer, const char *s)
1183 {
1184   struct string_lexer slex;
1185   struct token token;
1186
1187   size_t i = 0;
1188   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1189   while (string_lexer_next (&slex, &token))
1190     {
1191       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1192       token_uninit (&token);
1193       if (!match)
1194         return 0;
1195     }
1196   return i;
1197 }
1198
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens != 0;
}
1210
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   consumes those tokens and returns true.  Otherwise, leaves LEXER alone and
   returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1225
/* Returns the number of '\n' bytes among the LENGTH bytes starting at S. */
static int
count_newlines (char *s, size_t length)
{
  char *end = s + length;
  int count = 0;

  /* Jump from one new-line to the next, searching only the bytes that remain
     after the previous hit. */
  for (char *p = memchr (s, '\n', length); p != NULL;
       p = memchr (p + 1, '\n', end - (p + 1)))
    count++;

  return count;
}
1241
1242 static int
1243 lex_token_get_last_line_number (const struct lex_source *src,
1244                                 const struct lex_token *token)
1245 {
1246   size_t end = token->token_pos + token->token_len
1247   return lex_source_ofs_to_line_number (src,
1248   if (token->first_line == 0)
1249     return 0;
1250   else
1251     {
1252       char *token_str = &src->buffer[token->token_pos];
1253       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1254     }
1255 }
1256
1257 static int
1258 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1259 {
1260   const char *newline = memrchr (src->buffer, '\n', offset);
1261   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1262   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1263 }
1264
1265 static int
1266 lex_token_get_first_column (const struct lex_source *src,
1267                             const struct lex_token *token)
1268 {
1269   return lex_token_get_column__ (src, token->token_pos);
1270 }
1271
1272 static int
1273 lex_token_get_last_column (const struct lex_source *src,
1274                            const struct lex_token *token)
1275 {
1276   return lex_token_get_column__ (src, token->token_pos + token->token_len);
1277 }
1278
1279 static struct msg_location
1280 lex_token_location (const struct lex_source *src,
1281                     const struct lex_token *t0,
1282                     const struct lex_token *t1)
1283 {
1284   int first_column = lex_token_get_first_column (src, t0);
1285   int last_line = lex_token_get_last_line_number (src, t1) - 1;
1286   int last_column = lex_token_get_last_column (src, t1) - 1;
1287   return (struct msg_location) {
1288     .file_name = intern_new_if_nonnull (src->reader->file_name),
1289     .p[0] = { .line = t0->first_line, .column = first_column },
1290     .p[1] = { .line = last_line, .column = last_column },
1291   };
1292 }
1293
1294 static struct msg_location *
1295 lex_token_location_rw (const struct lex_source *src,
1296                        const struct lex_token *t0,
1297                        const struct lex_token *t1)
1298 {
1299   struct msg_location location = lex_token_location (src, t0, t1);
1300   return msg_location_dup (&location);
1301 }
1302
/* Returns a duplicated msg_location covering the tokens N0 through N1 ahead
   of the current one in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1310
1311 /* Returns the 1-based line number of the start of the syntax that represents
1312    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1313    if the token is drawn from a source that does not have line numbers. */
1314 int
1315 lex_get_first_line_number (const struct lexer *lexer, int n)
1316 {
1317   const struct lex_source *src = lex_source__ (lexer);
1318   return src ? lex_source_next__ (src, n)->first_line : 0;
1319 }
1320
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   source or if the token is drawn from a source that does not have line
   numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1338
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1352
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1366
1367 /* Returns the name of the syntax file from which the current command is drawn.
1368    Returns NULL for a T_STOP token or if the command's source does not have
1369    line numbers.
1370
1371    There is no version of this function that takes an N argument because
1372    lookahead only works to the end of a command and any given command is always
1373    within a single syntax file. */
1374 const char *
1375 lex_get_file_name (const struct lexer *lexer)
1376 {
1377   struct lex_source *src = lex_source__ (lexer);
1378   return src == NULL ? NULL : src->reader->file_name;
1379 }
1380
1381 /* Returns a newly allocated msg_location for the syntax that represents tokens
1382    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1383    must eventually free the location (with msg_location_destroy()). */
1384 struct msg_location *
1385 lex_get_location (const struct lexer *lexer, int n0, int n1)
1386 {
1387   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1388   loc->p[0].column = lex_get_first_column (lexer, n0);
1389   loc->p[1].column = lex_get_last_column (lexer, n1) - 1;
1390   return loc;
1391 }
1392
1393 /* Returns a newly allocated msg_location for the syntax that represents tokens
1394    with 0-based offsets N0...N1, inclusive, from the current token.  The
1395    location only covers the tokens' lines, not the columns.  The caller must
1396    eventually free the location (with msg_location_destroy()). */
1397 struct msg_location *
1398 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1399 {
1400   struct msg_location *loc = xmalloc (sizeof *loc);
1401   int first_line = lex_get_first_line_number (lexer, n0);
1402   int last_line = lex_get_last_line_number (lexer, n1) - 1;
1403   *loc = (struct msg_location) {
1404     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1405     .p[0] = { .line = first_line },
1406     .p[1] = { .line = last_line },
1407   };
1408   return loc;
1409 }
1410
/* Obtains the location of the token N after the current one in LEXER, merges
   it into *LOC with msg_location_merge(), and then destroys the temporary
   location. */
void
lex_extend_location (const struct lexer *lexer, int n, struct msg_location **loc)
{
  struct msg_location *token_loc = lex_get_location (lexer, n, n);
  msg_location_merge (loc, token_loc);
  msg_location_destroy (token_loc);
}
1418
1419 const char *
1420 lex_get_encoding (const struct lexer *lexer)
1421 {
1422   struct lex_source *src = lex_source__ (lexer);
1423   return src == NULL ? NULL : src->reader->encoding;
1424 }
1425
1426 /* Returns the syntax mode for the syntax file from which the current drawn is
1427    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1428    does not have line numbers.
1429
1430    There is no version of this function that takes an N argument because
1431    lookahead only works to the end of a command and any given command is always
1432    within a single syntax file. */
1433 enum segmenter_mode
1434 lex_get_syntax_mode (const struct lexer *lexer)
1435 {
1436   struct lex_source *src = lex_source__ (lexer);
1437   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1438 }
1439
1440 /* Returns the error mode for the syntax file from which the current drawn is
1441    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1442    source does not have line numbers.
1443
1444    There is no version of this function that takes an N argument because
1445    lookahead only works to the end of a command and any given command is always
1446    within a single syntax file. */
1447 enum lex_error_mode
1448 lex_get_error_mode (const struct lexer *lexer)
1449 {
1450   struct lex_source *src = lex_source__ (lexer);
1451   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1452 }
1453
1454 /* If the source that LEXER is currently reading has error mode
1455    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1456    token to be read comes directly from whatever is next read from the stream.
1457
1458    It makes sense to call this function after encountering an error in a
1459    command entered on the console, because usually the user would prefer not to
1460    have cascading errors. */
1461 void
1462 lex_interactive_reset (struct lexer *lexer)
1463 {
1464   struct lex_source *src = lex_source__ (lexer);
1465   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1466     {
1467       src->length = 0;
1468       src->journal_pos = src->seg_pos = 0;
1469       src->n_newlines = 0;
1470       src->suppress_next_newline = false;
1471       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1472                                        false);
1473       lex_stage_clear (&src->pp);
1474       lex_stage_clear (&src->merge);
1475       lex_source_clear_parse (src);
1476       lex_source_push_endcmd__ (src);
1477     }
1478 }
1479
1480 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1481 void
1482 lex_discard_rest_of_command (struct lexer *lexer)
1483 {
1484   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1485     lex_get (lexer);
1486 }
1487
1488 /* Discards all lookahead tokens in LEXER, then discards all input sources
1489    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1490    runs out of input sources. */
1491 void
1492 lex_discard_noninteractive (struct lexer *lexer)
1493 {
1494   struct lex_source *src = lex_source__ (lexer);
1495
1496   if (src != NULL)
1497     {
1498       lex_stage_clear (&src->pp);
1499       lex_stage_clear (&src->merge);
1500       lex_source_clear_parse (src);
1501
1502       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1503            src = lex_source__ (lexer))
1504         lex_source_destroy (src);
1505     }
1506 }
1507 \f
1508 static void
1509 lex_source_expand__ (struct lex_source *src)
1510 {
1511   if (src->length >= src->allocated)
1512     src->buffer = x2realloc (src->buffer, &src->allocated);
1513 }
1514
1515 static void
1516 lex_source_read__ (struct lex_source *src)
1517 {
1518   do
1519     {
1520       lex_source_expand__ (src);
1521
1522       size_t space = src->allocated - src->length;
1523       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1524       size_t n = src->reader->class->read (src->reader,
1525                                            &src->buffer[src->length],
1526                                            space, prompt);
1527       assert (n <= space);
1528
1529       if (n == 0)
1530         {
1531           /* End of input. */
1532           src->reader->eof = true;
1533           lex_source_expand__ (src);
1534           return;
1535         }
1536
1537       src->length += n;
1538     }
1539   while (!memchr (&src->buffer[src->seg_pos], '\n',
1540                   src->length - src->seg_pos));
1541 }
1542
1543 static struct lex_source *
1544 lex_source__ (const struct lexer *lexer)
1545 {
1546   return (ll_is_empty (&lexer->sources) ? NULL
1547           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1548 }
1549
1550 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1551    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1552    and N1 are both zero, this requests the syntax for the current token.)  The
1553    caller must eventually free the returned string (with free()).  The syntax
1554    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1555    for example, it may include comments, spaces, and new-lines if it spans
1556    multiple tokens.  Macro expansion, however, has already been performed. */
1557 static char *
1558 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1559 {
1560   struct string s = DS_EMPTY_INITIALIZER;
1561   for (size_t i = n0; i <= n1; )
1562     {
1563       /* Find [I,J) as the longest sequence of tokens not produced by macro
1564          expansion, or otherwise the longest sequence expanded from a single
1565          macro call. */
1566       const struct lex_token *first = lex_source_next__ (src, i);
1567       size_t j;
1568       for (j = i + 1; j <= n1; j++)
1569         {
1570           const struct lex_token *cur = lex_source_next__ (src, j);
1571           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1572               || first->macro_rep != cur->macro_rep)
1573             break;
1574         }
1575       const struct lex_token *last = lex_source_next__ (src, j - 1);
1576
1577       /* Now add the syntax for this sequence of tokens to SRC. */
1578       if (!ds_is_empty (&s))
1579         ds_put_byte (&s, ' ');
1580       if (!first->macro_rep)
1581         {
1582           size_t start = first->token_pos;
1583           size_t end = last->token_pos + last->token_len;
1584           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1585         }
1586       else
1587         {
1588           size_t start = first->ofs;
1589           size_t end = last->ofs + last->len;
1590           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1591                                            end - start));
1592         }
1593
1594       i = j;
1595     }
1596   return ds_steal_cstr (&s);
1597 }
1598
1599 static bool
1600 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1601 {
1602   for (size_t i = n0; i <= n1; i++)
1603     if (lex_source_next__ (src, i)->macro_rep)
1604       return true;
1605   return false;
1606 }
1607
1608 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1609    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1610    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1611    the original form supplied to the lexer so that, for example, it may include
1612    comments, spaces, and new-lines if it spans multiple tokens.
1613
1614    Returns an empty string if the token range doesn't include a macro call.
1615
1616    The caller must not modify or free the returned string. */
1617 static struct substring
1618 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1619 {
1620   if (!lex_source_contains_macro_call (src, n0, n1))
1621     return ss_empty ();
1622
1623   const struct lex_token *token0 = lex_source_next__ (src, n0);
1624   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1625   size_t start = token0->token_pos;
1626   size_t end = token1->token_pos + token1->token_len;
1627
1628   return ss_buffer (&src->buffer[start], end - start);
1629 }
1630
1631 static void
1632 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1633                          const char *format, va_list args)
1634 {
1635   const struct lex_token *token;
1636   struct string s;
1637
1638   ds_init_empty (&s);
1639
1640   token = lex_source_next__ (src, n0);
1641   if (token->token.type == T_ENDCMD)
1642     ds_put_cstr (&s, _("Syntax error at end of command"));
1643   else
1644     {
1645       /* Get the syntax that caused the error. */
1646       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1647       char syntax[64];
1648       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1649       free (raw_syntax);
1650
1651       /* Get the macro call(s) that expanded to the syntax that caused the
1652          error. */
1653       char call[64];
1654       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1655                      call, sizeof call);
1656
1657       if (syntax[0])
1658         {
1659           if (call[0])
1660             ds_put_format (&s,
1661                            _("Syntax error at `%s' (in expansion of `%s')"),
1662                            syntax, call);
1663           else
1664             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1665         }
1666       else
1667         {
1668           if (call[0])
1669             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1670                            call);
1671           else
1672             ds_put_cstr (&s, _("Syntax error"));
1673         }
1674     }
1675
1676   if (format)
1677     {
1678       ds_put_cstr (&s, ": ");
1679       ds_put_vformat (&s, format, args);
1680     }
1681   if (ds_last (&s) != '.')
1682     ds_put_byte (&s, '.');
1683
1684   struct msg *m = xmalloc (sizeof *m);
1685   *m = (struct msg) {
1686     .category = MSG_C_SYNTAX,
1687     .severity = MSG_S_ERROR,
1688     .location = lex_source_get_location (src, n0, n1),
1689     .text = ds_steal_cstr (&s),
1690   };
1691   msg_emit (m);
1692 }
1693
1694 static void
1695 lex_get_error (struct lex_source *src, const struct lex_token *token)
1696 {
1697   char syntax[64];
1698   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1699                  syntax, sizeof syntax);
1700
1701   struct string s = DS_EMPTY_INITIALIZER;
1702   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1703   ds_put_format (&s, ": %s", token->token.string.string);
1704
1705   struct msg *m = xmalloc (sizeof *m);
1706   *m = (struct msg) {
1707     .category = MSG_C_SYNTAX,
1708     .severity = MSG_S_ERROR,
1709     .location = lex_token_location_rw (src, token, token),
1710     .text = ds_steal_cstr (&s),
1711   };
1712   msg_emit (m);
1713 }
1714
1715 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1716    underlying lex_reader if necessary.  Returns true if a new token was added
1717    to SRC's deque, false otherwise.  The caller should retry failures unless
1718    SRC's 'eof' marker was set to true indicating that there will be no more
1719    tokens from this source. */
1720 static bool
1721 lex_source_try_get_pp (struct lex_source *src)
1722 {
1723   /* Append a new token to SRC and initialize it. */
1724   struct lex_token *token = xmalloc (sizeof *token);
1725   token->token = (struct token) { .type = T_STOP };
1726   token->macro_rep = NULL;
1727   token->ref_cnt = NULL;
1728   token->token_pos = src->seg_pos;
1729   if (src->reader->line_number > 0)
1730     token->first_line = src->reader->line_number + src->n_newlines;
1731   else
1732     token->first_line = 0;
1733
1734   /* Extract a segment. */
1735   const char *segment;
1736   enum segment_type seg_type;
1737   int seg_len;
1738   for (;;)
1739     {
1740       segment = &src->buffer[src->seg_pos];
1741       seg_len = segmenter_push (&src->segmenter, segment,
1742                                 src->length - src->seg_pos,
1743                                 src->reader->eof, &seg_type);
1744       if (seg_len >= 0)
1745         break;
1746
1747       /* The segmenter needs more input to produce a segment. */
1748       assert (!src->reader->eof);
1749       lex_source_read__ (src);
1750     }
1751
1752   /* Update state based on the segment. */
1753   token->token_len = seg_len;
1754   src->seg_pos += seg_len;
1755   if (seg_type == SEG_NEWLINE)
1756     src->n_newlines++;
1757
1758   /* Get a token from the segment. */
1759   enum tokenize_result result = token_from_segment (
1760     seg_type, ss_buffer (segment, seg_len), &token->token);
1761
1762   /* If we've reached the end of a line, or the end of a command, then pass
1763      the line to the output engine as a syntax text item.  */
1764   int n_lines = seg_type == SEG_NEWLINE;
1765   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1766     {
1767       n_lines++;
1768       src->suppress_next_newline = true;
1769     }
1770   else if (n_lines > 0 && src->suppress_next_newline)
1771     {
1772       n_lines--;
1773       src->suppress_next_newline = false;
1774     }
1775   for (int i = 0; i < n_lines; i++)
1776     {
1777       /* Beginning of line. */
1778       const char *line = &src->buffer[src->journal_pos];
1779
1780       /* Calculate line length, including \n or \r\n end-of-line if present.
1781
1782          We use src->head even though that may be beyond what we've actually
1783          converted to tokens (which is only through line_pos).  That's because,
1784          if we're emitting the line due to SEG_END_COMMAND, we want to take the
1785          whole line through the newline, not just through the '.'. */
1786       size_t max_len = src->length - src->journal_pos;
1787       const char *newline = memchr (line, '\n', max_len);
1788       size_t line_len = newline ? newline - line + 1 : max_len;
1789
1790       /* Calculate line length excluding end-of-line. */
1791       size_t copy_len = line_len;
1792       if (copy_len > 0 && line[copy_len - 1] == '\n')
1793         copy_len--;
1794       if (copy_len > 0 && line[copy_len - 1] == '\r')
1795         copy_len--;
1796
1797       /* Submit the line as syntax. */
1798       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1799                                                    xmemdup0 (line, copy_len),
1800                                                    NULL));
1801
1802       src->journal_pos += line_len;
1803     }
1804
1805   switch (result)
1806     {
1807     case TOKENIZE_ERROR:
1808       lex_get_error (src, token);
1809       /* Fall through. */
1810     case TOKENIZE_EMPTY:
1811       lex_token_destroy (token);
1812       return false;
1813
1814     case TOKENIZE_TOKEN:
1815       if (token->token.type == T_STOP)
1816         {
1817           token->token.type = T_ENDCMD;
1818           src->eof = true;
1819         }
1820       lex_stage_push_last (&src->pp, token);
1821       return true;
1822     }
1823   NOT_REACHED ();
1824 }
1825
1826 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1827    failure.  On failure, the end of SRC has been reached and no more tokens
1828    will be forthcoming from it.
1829
1830    Does not make the new token available for lookahead yet; the caller must
1831    adjust SRC's 'middle' pointer to do so. */
1832 static bool
1833 lex_source_get_pp (struct lex_source *src)
1834 {
1835   while (!src->eof)
1836     if (lex_source_try_get_pp (src))
1837       return true;
1838   return false;
1839 }
1840
1841 static bool
1842 lex_source_try_get_merge (const struct lex_source *src_)
1843 {
1844   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1845
1846   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1847     return false;
1848
1849   if (!settings_get_mexpand ())
1850     {
1851       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1852       return true;
1853     }
1854
1855   /* Now pass tokens one-by-one to the macro expander.
1856
1857      In the common case where there is no macro to expand, the loop is not
1858      entered.  */
1859   struct macro_call *mc;
1860   int n_call = macro_call_create (src->lexer->macros,
1861                                   &lex_stage_first (&src->pp)->token, &mc);
1862   for (int ofs = 1; !n_call; ofs++)
1863     {
1864       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1865         {
1866           /* This should not be reachable because we always get a T_ENDCMD at
1867              the end of an input file (transformed from T_STOP by
1868              lex_source_try_get_pp()) and the macro_expander should always
1869              terminate expansion on T_ENDCMD. */
1870           NOT_REACHED ();
1871         }
1872
1873       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1874       size_t start = t->token_pos;
1875       size_t end = t->token_pos + t->token_len;
1876       const struct macro_token mt = {
1877         .token = t->token,
1878         .syntax = ss_buffer (&src->buffer[start], end - start),
1879       };
1880       const struct msg_location loc = lex_token_location (src, t, t);
1881       n_call = macro_call_add (mc, &mt, &loc);
1882     }
1883   if (n_call < 0)
1884     {
1885       /* False alarm: no macro expansion after all.  Use first token as
1886          lookahead.  We'll retry macro expansion from the second token next
1887          time around. */
1888       macro_call_destroy (mc);
1889       lex_stage_shift (&src->merge, &src->pp, 1);
1890       return true;
1891     }
1892
1893   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1894      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1895      Expand them.  */
1896   const struct lex_token *c0 = lex_stage_first (&src->pp);
1897   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1898   struct macro_tokens expansion = { .n = 0 };
1899   struct msg_location loc = lex_token_location (src, c0, c1);
1900   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1901   macro_call_destroy (mc);
1902
1903   /* Convert the macro expansion into syntax for possible error messages
1904      later. */
1905   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1906   size_t *len = xnmalloc (expansion.n, sizeof *len);
1907   struct string s = DS_EMPTY_INITIALIZER;
1908   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1909
1910   if (settings_get_mprint ())
1911     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1912                                           _("Macro Expansion")));
1913
1914   /* Append the macro expansion tokens to the lookahead. */
1915   if (expansion.n > 0)
1916     {
1917       char *macro_rep = ds_steal_cstr (&s);
1918       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1919       *ref_cnt = expansion.n;
1920       for (size_t i = 0; i < expansion.n; i++)
1921         {
1922           struct lex_token *token = xmalloc (sizeof *token);
1923           *token = (struct lex_token) {
1924             .token = expansion.mts[i].token,
1925             .token_pos = c0->token_pos,
1926             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1927             .first_line = c0->first_line,
1928             .macro_rep = macro_rep,
1929             .ofs = ofs[i],
1930             .len = len[i],
1931             .ref_cnt = ref_cnt,
1932           };
1933           lex_stage_push_last (&src->merge, token);
1934
1935           ss_dealloc (&expansion.mts[i].syntax);
1936         }
1937     }
1938   else
1939     ds_destroy (&s);
1940   free (expansion.mts);
1941   free (ofs);
1942   free (len);
1943
1944   /* Destroy the tokens for the call. */
1945   for (size_t i = 0; i < n_call; i++)
1946     lex_stage_pop_first (&src->pp);
1947
1948   return expansion.n > 0;
1949 }
1950
1951 /* Attempts to obtain at least one new token into 'merge' in SRC.
1952
1953    Returns true if successful, false on failure.  In the latter case, SRC is
1954    exhausted and 'src->eof' is now true. */
1955 static bool
1956 lex_source_get_merge (struct lex_source *src)
1957 {
1958   while (!src->eof)
1959     if (lex_source_try_get_merge (src))
1960       return true;
1961   return false;
1962 }
1963
1964 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1965
1966    Returns true if successful, false on failure.  In the latter case, SRC is
1967    exhausted and 'src->eof' is now true. */
1968 static bool
1969 lex_source_get_parse (struct lex_source *src)
1970 {
1971   struct merger m = MERGER_INIT;
1972   struct token out;
1973   for (size_t i = 0; ; i++)
1974     {
1975       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1976         {
1977           /* We always get a T_ENDCMD at the end of an input file
1978              (transformed from T_STOP by lex_source_try_get_pp()) and
1979              merger_add() should never return -1 on T_ENDCMD. */
1980           assert (lex_stage_is_empty (&src->merge));
1981           return false;
1982         }
1983
1984       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1985                                &out);
1986       if (!retval)
1987         {
1988           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1989           return true;
1990         }
1991       else if (retval > 0)
1992         {
1993           /* Add a token that merges all the tokens together. */
1994           const struct lex_token *first = lex_stage_first (&src->merge);
1995           const struct lex_token *last = lex_stage_nth (&src->merge,
1996                                                         retval - 1);
1997           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1998           struct lex_token *t = xmalloc (sizeof *t);
1999           *t = (struct lex_token) {
2000             .token = out,
2001             .token_pos = first->token_pos,
2002             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2003             .first_line = first->first_line,
2004
2005             /* This works well if all the tokens were not expanded from macros,
2006                or if they came from the same macro expansion.  It just gives up
2007                in the other (corner) cases. */
2008             .macro_rep = macro ? first->macro_rep : NULL,
2009             .ofs = macro ? first->ofs : 0,
2010             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2011             .ref_cnt = macro ? first->ref_cnt : NULL,
2012           };
2013           if (t->ref_cnt)
2014             ++*t->ref_cnt;
2015           lex_source_push_parse (src, t);
2016
2017           for (int i = 0; i < retval; i++)
2018             lex_stage_pop_first (&src->merge);
2019           return true;
2020         }
2021     }
2022 }
2023 \f
2024 static void
2025 lex_source_push_endcmd__ (struct lex_source *src)
2026 {
2027   assert (src->n_parse == 0);
2028
2029   struct lex_token *token = xmalloc (sizeof *token);
2030   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2031   lex_source_push_parse (src, token);
2032 }
2033
2034 static void
2035 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2036 {
2037   if (src->n_parse >= src->allocated_parse)
2038     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2039                              sizeof *src->parse);
2040   src->parse[src->n_parse++] = token;
2041 }
2042
2043 static void
2044 lex_source_clear_parse (struct lex_source *src)
2045 {
2046   for (size_t i = 0; i < src->n_parse; i++)
2047     lex_token_destroy (src->parse[i]);
2048   src->n_parse = src->parse_ofs = 0;
2049 }
2050
2051 static struct lex_source *
2052 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2053 {
2054   struct lex_source *src = xmalloc (sizeof *src);
2055   *src = (struct lex_source) {
2056     .reader = reader,
2057     .segmenter = segmenter_init (reader->syntax, false),
2058     .lexer = lexer,
2059   };
2060
2061   lex_source_push_endcmd__ (src);
2062
2063   return src;
2064 }
2065
2066 static void
2067 lex_source_destroy (struct lex_source *src)
2068 {
2069   char *file_name = src->reader->file_name;
2070   char *encoding = src->reader->encoding;
2071   if (src->reader->class->destroy != NULL)
2072     src->reader->class->destroy (src->reader);
2073   free (file_name);
2074   free (encoding);
2075   free (src->buffer);
2076   lex_stage_uninit (&src->pp);
2077   lex_stage_uninit (&src->merge);
2078   lex_source_clear_parse (src);
2079   free (src->parse);
2080   ll_remove (&src->ll);
2081   free (src);
2082 }
2083 \f
/* A lex_reader that obtains its input through a u8_istream; see
   lex_reader_for_file(). */
struct lex_file_reader
  {
    /* Generic reader.  lex_file_reader_cast() converts a pointer to this
       member back into a pointer to the enclosing structure. */
    struct lex_reader reader;

    /* Stream that lex_file_read() reads from and lex_file_close() disposes
       of. */
    struct u8_istream *istream;
  };

/* Declared here so that lex_reader_for_file() can refer to it; the definition
   follows the functions that it lists. */
static struct lex_reader_class lex_file_reader_class;
2091
2092 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2093    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2094    ENCODING, which should take one of the forms accepted by
2095    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2096    mode of the new reader, respectively.
2097
2098    Returns a null pointer if FILE_NAME cannot be opened. */
2099 struct lex_reader *
2100 lex_reader_for_file (const char *file_name, const char *encoding,
2101                      enum segmenter_mode syntax,
2102                      enum lex_error_mode error)
2103 {
2104   struct lex_file_reader *r;
2105   struct u8_istream *istream;
2106
2107   istream = (!strcmp(file_name, "-")
2108              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2109              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2110   if (istream == NULL)
2111     {
2112       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2113       return NULL;
2114     }
2115
2116   r = xmalloc (sizeof *r);
2117   lex_reader_init (&r->reader, &lex_file_reader_class);
2118   r->reader.syntax = syntax;
2119   r->reader.error = error;
2120   r->reader.file_name = xstrdup (file_name);
2121   r->reader.encoding = xstrdup_if_nonnull (encoding);
2122   r->reader.line_number = 1;
2123   r->istream = istream;
2124
2125   return &r->reader;
2126 }
2127
2128 static struct lex_file_reader *
2129 lex_file_reader_cast (struct lex_reader *r)
2130 {
2131   return UP_CAST (r, struct lex_file_reader, reader);
2132 }
2133
2134 static size_t
2135 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2136                enum prompt_style prompt_style UNUSED)
2137 {
2138   struct lex_file_reader *r = lex_file_reader_cast (r_);
2139   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2140   if (n_read < 0)
2141     {
2142       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2143       return 0;
2144     }
2145   return n_read;
2146 }
2147
2148 static void
2149 lex_file_close (struct lex_reader *r_)
2150 {
2151   struct lex_file_reader *r = lex_file_reader_cast (r_);
2152
2153   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2154     {
2155       if (u8_istream_close (r->istream) != 0)
2156         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2157     }
2158   else
2159     u8_istream_free (r->istream);
2160
2161   free (r);
2162 }
2163
/* Reader class for lex_file_reader.  The members are given positionally:
   first the function that reads input, then the one that disposes of the
   reader.  NOTE(review): presumably these fill the class's read and 'destroy'
   slots, in that order; confirm against struct lex_reader_class. */
static struct lex_reader_class lex_file_reader_class =
  {
    lex_file_read,
    lex_file_close
  };
2169 \f
/* A lex_reader that obtains its input from a substring held in memory; see
   lex_reader_for_substring_nocopy(). */
struct lex_string_reader
  {
    /* Generic reader.  lex_string_reader_cast() converts a pointer to this
       member back into a pointer to the enclosing structure. */
    struct lex_reader reader;

    /* The text to read.  lex_string_close() releases it with ss_dealloc(). */
    struct substring s;

    /* Number of bytes of S that lex_string_read() has handed out so far. */
    size_t offset;
  };

/* Declared here so that lex_reader_for_substring_nocopy() can refer to it; the
   definition follows the functions that it lists. */
static struct lex_reader_class lex_string_reader_class;
2178
2179 /* Creates and returns a new lex_reader for the contents of S, which must be
2180    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2181    with ss_dealloc() when it is closed. */
2182 struct lex_reader *
2183 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2184 {
2185   struct lex_string_reader *r;
2186
2187   r = xmalloc (sizeof *r);
2188   lex_reader_init (&r->reader, &lex_string_reader_class);
2189   r->reader.syntax = SEG_MODE_AUTO;
2190   r->reader.encoding = xstrdup_if_nonnull (encoding);
2191   r->s = s;
2192   r->offset = 0;
2193
2194   return &r->reader;
2195 }
2196
2197 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2198    which must be encoded in ENCODING.  The caller retains ownership of S. */
2199 struct lex_reader *
2200 lex_reader_for_string (const char *s, const char *encoding)
2201 {
2202   struct substring ss;
2203   ss_alloc_substring (&ss, ss_cstr (s));
2204   return lex_reader_for_substring_nocopy (ss, encoding);
2205 }
2206
/* Formats FORMAT, a printf()-like format string, and creates and returns a
   new lex_reader for the formatted result, which must be encoded in ENCODING.

   Note that the arguments consumed by FORMAT follow ENCODING, not FORMAT, in
   the argument list.  The formatted text is owned by the new reader. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2221
2222 static struct lex_string_reader *
2223 lex_string_reader_cast (struct lex_reader *r)
2224 {
2225   return UP_CAST (r, struct lex_string_reader, reader);
2226 }
2227
2228 static size_t
2229 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2230                  enum prompt_style prompt_style UNUSED)
2231 {
2232   struct lex_string_reader *r = lex_string_reader_cast (r_);
2233   size_t chunk;
2234
2235   chunk = MIN (n, r->s.length - r->offset);
2236   memcpy (buf, r->s.string + r->offset, chunk);
2237   r->offset += chunk;
2238
2239   return chunk;
2240 }
2241
2242 static void
2243 lex_string_close (struct lex_reader *r_)
2244 {
2245   struct lex_string_reader *r = lex_string_reader_cast (r_);
2246
2247   ss_dealloc (&r->s);
2248   free (r);
2249 }
2250
/* Reader class for lex_string_reader.  The members are given positionally:
   first the function that reads input, then the one that disposes of the
   reader.  NOTE(review): presumably these fill the class's read and 'destroy'
   slots, in that order; confirm against struct lex_reader_class. */
static struct lex_reader_class lex_string_reader_class =
  {
    lex_string_read,
    lex_string_close
  };