pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the location of
       the entire macro call. */
    size_t token_pos;           /* Offset into src->buffer of token start. */
    size_t token_len;           /* Length of source for token in bytes. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, these members describe
       just this token within the expansion.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' are shared between tokens:
       lex_token_destroy() decrements '*ref_cnt' and frees both of them once
       the count reaches zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  83
  84 static void
  85 lex_token_destroy (struct lex_token *t)
  86 {
  87   token_uninit (&t->token);
  88   if (t->ref_cnt)
  89     {
  90       assert (*t->ref_cnt > 0);
  91       if (!--*t->ref_cnt)
  92         {
  93           free (t->macro_rep);
  94           free (t->ref_cnt);
  95         }
  96     }
  97   free (t);
  98 }
  99 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source.

   'deque' does the index bookkeeping and 'tokens' is the array that those
   indexes refer to.  New tokens are pushed at the deque's front and the
   "first" token (the one with the least lookahead) is taken from its back. */
struct lex_stage
  {
    struct deque deque;
    struct lex_token **tokens;
  };
 107
/* Forward declarations for the lex_stage helpers defined below, so that they
   may be used before their definitions. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
 122
 123 /* Deletes all the tokens from STAGE. */
 124 static void
 125 lex_stage_clear (struct lex_stage *stage)
 126 {
 127   while (!deque_is_empty (&stage->deque))
 128     lex_stage_pop_first (stage);
 129 }
 130
 131 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 132 static void
 133 lex_stage_uninit (struct lex_stage *stage)
 134 {
 135   lex_stage_clear (stage);
 136   free (stage->tokens);
 137 }
 138
 139 /* Returns true if STAGE contains no tokens, otherwise false. */
 140 static bool
 141 lex_stage_is_empty (const struct lex_stage *stage)
 142 {
 143   return deque_is_empty (&stage->deque);
 144 }
 145
 146 /* Returns the number of tokens in STAGE. */
 147 static size_t
 148 lex_stage_count (const struct lex_stage *stage)
 149 {
 150   return deque_count (&stage->deque);
 151 }
 152
 153 /* Returns the first token in STAGE, which must be nonempty.
 154    The first token is the one accessed with the least lookahead. */
 155 static struct lex_token *
 156 lex_stage_first (struct lex_stage *stage)
 157 {
 158   return lex_stage_nth (stage, 0);
 159 }
 160
 161 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 162    lookahead) is 0, the second token is 1, and so on.  There must be at least
 163    INDEX + 1 tokens in STAGE. */
 164 static struct lex_token *
 165 lex_stage_nth (struct lex_stage *stage, size_t index)
 166 {
 167   return stage->tokens[deque_back (&stage->deque, index)];
 168 }
 169
 170 /* Adds TOKEN so that it becomes the last token in STAGE. */
 171 static void
 172 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 173 {
 174   if (deque_is_full (&stage->deque))
 175     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 176                                   sizeof *stage->tokens);
 177   stage->tokens[deque_push_front (&stage->deque)] = token;
 178 }
 179
 180 /* Removes and returns the first token from STAGE. */
 181 static struct lex_token *
 182 lex_stage_take_first (struct lex_stage *stage)
 183 {
 184   return stage->tokens[deque_pop_back (&stage->deque)];
 185 }
 186
 187 /* Removes the first token from STAGE and uninitializes it. */
 188 static void
 189 lex_stage_pop_first (struct lex_stage *stage)
 190 {
 191   lex_token_destroy (lex_stage_take_first (stage));
 192 }
 193
 194 /* Removes the first N tokens from SRC, appending them to DST as the last
 195    tokens. */
 196 static void
 197 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 198 {
 199   for (size_t i = 0; i < n; i++)
 200     lex_stage_push_last (dst, lex_stage_take_first (src));
 201 }
 202
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct lexer *lexer;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;               /* Source file contents. */
    size_t length;              /* Number of bytes filled. */
    size_t allocated;           /* Number of bytes allocated. */

    /* Offsets into 'buffer'. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through each of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'parse'.

       - parse: Tokens available to the client for parsing.

      'pp' and 'merge' store tokens only temporarily until they pass into
      'parse'.  Tokens then live in 'parse' until the command is fully
      consumed, at which time they are freed together.

      'parse' is an array of 'n_parse' tokens, and 'parse_ofs' is the index
      within it of the current token (see lex_get()).  'allocated_parse' is
      presumably the capacity of the array; confirm against
      lex_source_push_parse(). */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_token **parse;
    size_t n_parse, allocated_parse, parse_ofs;
  };
 251
/* Forward declarations for creating and destroying a lex_source. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 255
/* Lexer.

   Owns a list of token sources and a set of macros.  Both are released by
   lex_destroy(). */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Created by lex_create(), extended by
                                   lex_define_macro(). */
  };
 262
/* Forward declarations of internal helpers.  By this file's convention a
   trailing "__" marks a low-level, file-private function. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);
static void lex_source_push_parse (struct lex_source *, struct lex_token *);
static void lex_source_clear_parse (struct lex_source *);

static bool lex_source_get_parse (struct lex_source *);
/* The format string is the fourth parameter and its arguments arrive as a
   va_list. */
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 277 \f
 278 /* Initializes READER with the specified CLASS and otherwise some reasonable
 279    defaults.  The caller should fill in the others members as desired. */
 280 void
 281 lex_reader_init (struct lex_reader *reader,
 282                  const struct lex_reader_class *class)
 283 {
 284   reader->class = class;
 285   reader->syntax = SEG_MODE_AUTO;
 286   reader->error = LEX_ERROR_CONTINUE;
 287   reader->file_name = NULL;
 288   reader->encoding = NULL;
 289   reader->line_number = 0;
 290   reader->eof = false;
 291 }
 292
 293 /* Frees any file name already in READER and replaces it by a copy of
 294    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 295 void
 296 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 297 {
 298   free (reader->file_name);
 299   reader->file_name = xstrdup_if_nonnull (file_name);
 300 }
 301 \f
 302 /* Creates and returns a new lexer. */
 303 struct lexer *
 304 lex_create (void)
 305 {
 306   struct lexer *lexer = xmalloc (sizeof *lexer);
 307   *lexer = (struct lexer) {
 308     .sources = LL_INITIALIZER (lexer->sources),
 309     .macros = macro_set_create (),
 310   };
 311   return lexer;
 312 }
 313
 314 /* Destroys LEXER. */
 315 void
 316 lex_destroy (struct lexer *lexer)
 317 {
 318   if (lexer != NULL)
 319     {
 320       struct lex_source *source, *next;
 321
 322       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 323         lex_source_destroy (source);
 324       macro_set_destroy (lexer->macros);
 325       free (lexer);
 326     }
 327 }
 328
 329 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 330    same name.  Takes ownership of M. */
 331 void
 332 lex_define_macro (struct lexer *lexer, struct macro *m)
 333 {
 334   macro_set_add (lexer->macros, m);
 335 }
 336
 337 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 338    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 339    token. */
 340 void
 341 lex_include (struct lexer *lexer, struct lex_reader *reader)
 342 {
 343   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 344   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 345 }
 346
 347 /* Appends READER to LEXER, so that it will be read after all other current
 348    readers have already been read. */
 349 void
 350 lex_append (struct lexer *lexer, struct lex_reader *reader)
 351 {
 352   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 353 }
 354 \f
 355 /* Advancing. */
 356
 357 /* Advances LEXER to the next token, consuming the current token. */
 358 void
 359 lex_get (struct lexer *lexer)
 360 {
 361   struct lex_source *src;
 362
 363   src = lex_source__ (lexer);
 364   if (src == NULL)
 365     return;
 366
 367   if (src->parse_ofs < src->n_parse)
 368     {
 369       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 370         lex_source_clear_parse (src);
 371       else
 372         src->parse_ofs++;
 373     }
 374
 375   while (src->parse_ofs == src->n_parse)
 376     if (!lex_source_get_parse (src))
 377       {
 378         lex_source_destroy (src);
 379         src = lex_source__ (lexer);
 380         if (src == NULL)
 381           return;
 382       }
 383 }
 384
 385 /* Advances LEXER by N tokens. */
 386 void
 387 lex_get_n (struct lexer *lexer, size_t n)
 388 {
 389   while (n-- > 0)
 390     lex_get (lexer);
 391 }
 392 \f
 393 /* Issuing errors. */
 394
 395 /* Prints a syntax error message containing the current token and
 396    given message MESSAGE (if non-null). */
 397 void
 398 lex_error (struct lexer *lexer, const char *format, ...)
 399 {
 400   va_list args;
 401
 402   va_start (args, format);
 403   lex_next_error_valist (lexer, 0, 0, format, args);
 404   va_end (args);
 405 }
 406
 407 /* Prints a syntax error message containing the current token and
 408    given message MESSAGE (if non-null). */
 409 void
 410 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 411 {
 412   lex_next_error_valist (lexer, 0, 0, format, args);
 413 }
 414
 415 /* Prints a syntax error message containing the current token and
 416    given message MESSAGE (if non-null). */
 417 void
 418 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 419 {
 420   va_list args;
 421
 422   va_start (args, format);
 423   lex_next_error_valist (lexer, n0, n1, format, args);
 424   va_end (args);
 425 }
 426
 427 /* Prints a syntax error message saying that one of the strings provided as
 428    varargs, up to the first NULL, is expected. */
 429 void
 430 (lex_error_expecting) (struct lexer *lexer, ...)
 431 {
 432   va_list args;
 433
 434   va_start (args, lexer);
 435   lex_error_expecting_valist (lexer, args);
 436   va_end (args);
 437 }
 438
 439 /* Prints a syntax error message saying that one of the options provided in
 440    ARGS, up to the first NULL, is expected. */
 441 void
 442 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 443 {
 444   enum { MAX_OPTIONS = 9 };
 445   const char *options[MAX_OPTIONS];
 446   int n = 0;
 447   while (n < MAX_OPTIONS)
 448     {
 449       const char *option = va_arg (args, const char *);
 450       if (!option)
 451         break;
 452
 453       options[n++] = option;
 454     }
 455   lex_error_expecting_array (lexer, options, n);
 456 }
 457
 458 void
 459 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 460 {
 461   switch (n)
 462     {
 463     case 0:
 464       lex_error (lexer, NULL);
 465       break;
 466
 467     case 1:
 468       lex_error (lexer, _("expecting %s"), options[0]);
 469       break;
 470
 471     case 2:
 472       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 473       break;
 474
 475     case 3:
 476       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 477                  options[2]);
 478       break;
 479
 480     case 4:
 481       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 482                  options[0], options[1], options[2], options[3]);
 483       break;
 484
 485     case 5:
 486       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 487                  options[0], options[1], options[2], options[3], options[4]);
 488       break;
 489
 490     case 6:
 491       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 492                  options[0], options[1], options[2], options[3], options[4],
 493                  options[5]);
 494       break;
 495
 496     case 7:
 497       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 498                  options[0], options[1], options[2], options[3], options[4],
 499                  options[5], options[6]);
 500       break;
 501
 502     case 8:
 503       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 504                  options[0], options[1], options[2], options[3], options[4],
 505                  options[5], options[6], options[7]);
 506       break;
 507
 508     default:
 509       lex_error (lexer, NULL);
 510     }
 511 }
 512
 513 /* Reports an error to the effect that subcommand SBC may only be specified
 514    once.
 515
 516    This function does not take a lexer as an argument or use lex_error(),
 517    because the result would ordinarily just be redundant: "Syntax error at
 518    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 519    not help the user find the error. */
 520 void
 521 lex_sbc_only_once (const char *sbc)
 522 {
 523   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 524 }
 525
 526 /* Reports an error to the effect that subcommand SBC is missing.
 527
 528    This function does not take a lexer as an argument or use lex_error(),
 529    because a missing subcommand can normally be detected only after the whole
 530    command has been parsed, and so lex_error() would always report "Syntax
 531    error at end of command", which does not help the user find the error. */
 532 void
 533 lex_sbc_missing (const char *sbc)
 534 {
 535   msg (SE, _("Required subcommand %s was not specified."), sbc);
 536 }
 537
 538 /* Reports an error to the effect that specification SPEC may only be specified
 539    once within subcommand SBC. */
 540 void
 541 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 542 {
 543   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 544              spec, sbc);
 545 }
 546
 547 /* Reports an error to the effect that specification SPEC is missing within
 548    subcommand SBC. */
 549 void
 550 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 551 {
 552   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 553              sbc, spec);
 554 }
 555
 556 /* Prints a syntax error message containing the current token and
 557    given message MESSAGE (if non-null). */
 558 void
 559 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 560                        const char *format, va_list args)
 561 {
 562   struct lex_source *src = lex_source__ (lexer);
 563
 564   if (src != NULL)
 565     lex_source_error_valist (src, n0, n1, format, args);
 566   else
 567     {
 568       struct string s;
 569
 570       ds_init_empty (&s);
 571       ds_put_format (&s, _("Syntax error at end of input"));
 572       if (format != NULL)
 573         {
 574           ds_put_cstr (&s, ": ");
 575           ds_put_vformat (&s, format, args);
 576         }
 577       if (ds_last (&s) != '.')
 578         ds_put_byte (&s, '.');
 579       msg (SE, "%s", ds_cstr (&s));
 580       ds_destroy (&s);
 581     }
 582 }
 583
 584 /* Checks that we're at end of command.
 585    If so, returns a successful command completion code.
 586    If not, flags a syntax error and returns an error command
 587    completion code. */
 588 int
 589 lex_end_of_command (struct lexer *lexer)
 590 {
 591   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 592     {
 593       lex_error (lexer, _("expecting end of command"));
 594       return CMD_FAILURE;
 595     }
 596   else
 597     return CMD_SUCCESS;
 598 }
 599 \f
 600 /* Token testing functions. */
 601
 602 /* Returns true if the current token is a number. */
 603 bool
 604 lex_is_number (const struct lexer *lexer)
 605 {
 606   return lex_next_is_number (lexer, 0);
 607 }
 608
 609 /* Returns true if the current token is a string. */
 610 bool
 611 lex_is_string (const struct lexer *lexer)
 612 {
 613   return lex_next_is_string (lexer, 0);
 614 }
 615
 616 /* Returns the value of the current token, which must be a
 617    floating point number. */
 618 double
 619 lex_number (const struct lexer *lexer)
 620 {
 621   return lex_next_number (lexer, 0);
 622 }
 623
 624 /* Returns true iff the current token is an integer. */
 625 bool
 626 lex_is_integer (const struct lexer *lexer)
 627 {
 628   return lex_next_is_integer (lexer, 0);
 629 }
 630
 631 /* Returns the value of the current token, which must be an
 632    integer. */
 633 long
 634 lex_integer (const struct lexer *lexer)
 635 {
 636   return lex_next_integer (lexer, 0);
 637 }
 638 \f
 639 /* Token testing functions with lookahead.
 640
 641    A value of 0 for N as an argument to any of these functions refers to the
 642    current token.  Lookahead is limited to the current command.  Any N greater
 643    than the number of tokens remaining in the current command will be treated
 644    as referring to a T_ENDCMD token. */
 645
 646 /* Returns true if the token N ahead of the current token is a number. */
 647 bool
 648 lex_next_is_number (const struct lexer *lexer, int n)
 649 {
 650   return token_is_number (lex_next (lexer, n));
 651 }
 652
 653 /* Returns true if the token N ahead of the current token is a string. */
 654 bool
 655 lex_next_is_string (const struct lexer *lexer, int n)
 656 {
 657   return token_is_string (lex_next (lexer, n));
 658 }
 659
 660 /* Returns the value of the token N ahead of the current token, which must be a
 661    floating point number. */
 662 double
 663 lex_next_number (const struct lexer *lexer, int n)
 664 {
 665   return token_number (lex_next (lexer, n));
 666 }
 667
 668 /* Returns true if the token N ahead of the current token is an integer. */
 669 bool
 670 lex_next_is_integer (const struct lexer *lexer, int n)
 671 {
 672   return token_is_integer (lex_next (lexer, n));
 673 }
 674
 675 /* Returns the value of the token N ahead of the current token, which must be
 676    an integer. */
 677 long
 678 lex_next_integer (const struct lexer *lexer, int n)
 679 {
 680   return token_integer (lex_next (lexer, n));
 681 }
 682 \f
 683 /* Token matching functions. */
 684
 685 /* If the current token has the specified TYPE, skips it and returns true.
 686    Otherwise, returns false. */
 687 bool
 688 lex_match (struct lexer *lexer, enum token_type type)
 689 {
 690   if (lex_token (lexer) == type)
 691     {
 692       lex_get (lexer);
 693       return true;
 694     }
 695   else
 696     return false;
 697 }
 698
 699 /* If the current token matches IDENTIFIER, skips it and returns true.
 700    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 701    returns false.
 702
 703    IDENTIFIER must be an ASCII string. */
 704 bool
 705 lex_match_id (struct lexer *lexer, const char *identifier)
 706 {
 707   return lex_match_id_n (lexer, identifier, 3);
 708 }
 709
 710 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 711    may be abbreviated to its first N letters.  Otherwise, returns false.
 712
 713    IDENTIFIER must be an ASCII string. */
 714 bool
 715 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 716 {
 717   if (lex_token (lexer) == T_ID
 718       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 719     {
 720       lex_get (lexer);
 721       return true;
 722     }
 723   else
 724     return false;
 725 }
 726
 727 /* If the current token is integer X, skips it and returns true.  Otherwise,
 728    returns false. */
 729 bool
 730 lex_match_int (struct lexer *lexer, int x)
 731 {
 732   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 733     {
 734       lex_get (lexer);
 735       return true;
 736     }
 737   else
 738     return false;
 739 }
 740 \f
 741 /* Forced matches. */
 742
 743 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 744    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 745    false.
 746
 747    IDENTIFIER must be an ASCII string. */
 748 bool
 749 lex_force_match_id (struct lexer *lexer, const char *identifier)
 750 {
 751   if (lex_match_id (lexer, identifier))
 752     return true;
 753   else
 754     {
 755       lex_error_expecting (lexer, identifier);
 756       return false;
 757     }
 758 }
 759
 760 /* If the current token has the specified TYPE, skips it and returns true.
 761    Otherwise, reports an error and returns false. */
 762 bool
 763 lex_force_match (struct lexer *lexer, enum token_type type)
 764 {
 765   if (lex_token (lexer) == type)
 766     {
 767       lex_get (lexer);
 768       return true;
 769     }
 770   else
 771     {
 772       const char *type_string = token_type_to_string (type);
 773       if (type_string)
 774         {
 775           char *s = xasprintf ("`%s'", type_string);
 776           lex_error_expecting (lexer, s);
 777           free (s);
 778         }
 779       else
 780         lex_error_expecting (lexer, token_type_to_name (type));
 781
 782       return false;
 783     }
 784 }
 785
 786 /* If the current token is a string, does nothing and returns true.
 787    Otherwise, reports an error and returns false. */
 788 bool
 789 lex_force_string (struct lexer *lexer)
 790 {
 791   if (lex_is_string (lexer))
 792     return true;
 793   else
 794     {
 795       lex_error (lexer, _("expecting string"));
 796       return false;
 797     }
 798 }
 799
 800 /* If the current token is a string or an identifier, does nothing and returns
 801    true.  Otherwise, reports an error and returns false.
 802
 803    This is meant for use in syntactic situations where we want to encourage the
 804    user to supply a quoted string, but for compatibility we also accept
 805    identifiers.  (One example of such a situation is file names.)  Therefore,
 806    the error message issued when the current token is wrong only says that a
 807    string is expected and doesn't mention that an identifier would also be
 808    accepted. */
 809 bool
 810 lex_force_string_or_id (struct lexer *lexer)
 811 {
 812   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 813 }
 814
 815 /* If the current token is an integer, does nothing and returns true.
 816    Otherwise, reports an error and returns false. */
 817 bool
 818 lex_force_int (struct lexer *lexer)
 819 {
 820   if (lex_is_integer (lexer))
 821     return true;
 822   else
 823     {
 824       lex_error (lexer, _("expecting integer"));
 825       return false;
 826     }
 827 }
 828
 829 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 830    nothing and returns true.  Otherwise, reports an error and returns false.
 831    If NAME is nonnull, then it is used in the error message. */
 832 bool
 833 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 834 {
 835   bool is_integer = lex_is_integer (lexer);
 836   bool too_small = is_integer && lex_integer (lexer) < min;
 837   bool too_big = is_integer && lex_integer (lexer) > max;
 838   if (is_integer && !too_small && !too_big)
 839     return true;
 840
 841   if (min > max)
 842     {
 843       /* Weird, maybe a bug in the caller.  Just report that we needed an
 844          integer. */
 845       if (name)
 846         lex_error (lexer, _("Integer expected for %s."), name);
 847       else
 848         lex_error (lexer, _("Integer expected."));
 849     }
 850   else if (min == max)
 851     {
 852       if (name)
 853         lex_error (lexer, _("Expected %ld for %s."), min, name);
 854       else
 855         lex_error (lexer, _("Expected %ld."), min);
 856     }
 857   else if (min + 1 == max)
 858     {
 859       if (name)
 860         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 861       else
 862         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 863     }
 864   else
 865     {
 866       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 867       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 868
 869       if (report_lower_bound && report_upper_bound)
 870         {
 871           if (name)
 872             lex_error (lexer,
 873                        _("Expected integer between %ld and %ld for %s."),
 874                        min, max, name);
 875           else
 876             lex_error (lexer, _("Expected integer between %ld and %ld."),
 877                        min, max);
 878         }
 879       else if (report_lower_bound)
 880         {
 881           if (min == 0)
 882             {
 883               if (name)
 884                 lex_error (lexer, _("Expected non-negative integer for %s."),
 885                            name);
 886               else
 887                 lex_error (lexer, _("Expected non-negative integer."));
 888             }
 889           else if (min == 1)
 890             {
 891               if (name)
 892                 lex_error (lexer, _("Expected positive integer for %s."),
 893                            name);
 894               else
 895                 lex_error (lexer, _("Expected positive integer."));
 896             }
 897         }
 898       else if (report_upper_bound)
 899         {
 900           if (name)
 901             lex_error (lexer,
 902                        _("Expected integer less than or equal to %ld for %s."),
 903                        max, name);
 904           else
 905             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 906                        max);
 907         }
 908       else
 909         {
 910           if (name)
 911             lex_error (lexer, _("Integer expected for %s."), name);
 912           else
 913             lex_error (lexer, _("Integer expected."));
 914         }
 915     }
 916   return false;
 917 }
 918
 919 /* If the current token is a number, does nothing and returns true.
 920    Otherwise, reports an error and returns false. */
 921 bool
 922 lex_force_num (struct lexer *lexer)
 923 {
 924   if (lex_is_number (lexer))
 925     return true;
 926
 927   lex_error (lexer, _("expecting number"));
 928   return false;
 929 }
 930
 931 /* If the current token is an identifier, does nothing and returns true.
 932    Otherwise, reports an error and returns false. */
 933 bool
 934 lex_force_id (struct lexer *lexer)
 935 {
 936   if (lex_token (lexer) == T_ID)
 937     return true;
 938
 939   lex_error (lexer, _("expecting identifier"));
 940   return false;
 941 }
 942 \f
 943 /* Token accessors. */
 944
 945 /* Returns the type of LEXER's current token. */
 946 enum token_type
 947 lex_token (const struct lexer *lexer)
 948 {
 949   return lex_next_token (lexer, 0);
 950 }
 951
 /* Returns the numeric value of LEXER's current token.

    The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens; for any
    other token type the result is zero. */
 double
 lex_tokval (const struct lexer *lexer)
 {
   const double value = lex_next_tokval (lexer, 0);
   return value;
 }
 961
 /* Returns the UTF-8 encoded, null-terminated string in LEXER's current token.

    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
    this function always returns NULL.

    The UTF-8 encoding of the returned string is correct for variable names and
    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
    data_in() to use it in a "union value".  */
 const char *
 lex_tokcstr (const struct lexer *lexer)
 {
   const char *cstr = lex_next_tokcstr (lexer, 0);
   return cstr;
 }
 975
 976 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 977    null-terminated (but the null terminator is not included in the returned
 978    substring's 'length').
 979
 980    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 981    this functions this function will always return NULL.
 982
 983    The UTF-8 encoding of the returned string is correct for variable names and
 984    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 985    data_in() to use it in a "union value".  */
 986 struct substring
 987 lex_tokss (const struct lexer *lexer)
 988 {
 989   return lex_next_tokss (lexer, 0);
 990 }
 991 \f
 992 /* Looking ahead.
 993
 994    A value of 0 for N as an argument to any of these functions refers to the
 995    current token.  Lookahead is limited to the current command.  Any N greater
 996    than the number of tokens remaining in the current command will be treated
 997    as referring to a T_ENDCMD token. */
 998
 999 static const struct lex_token *
1000 lex_next__ (const struct lexer *lexer_, int n)
1001 {
1002   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1003   struct lex_source *src = lex_source__ (lexer);
1004
1005   if (src != NULL)
1006     return lex_source_next__ (src, n);
1007   else
1008     {
1009       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1010       return &stop_token;
1011     }
1012 }
1013
1014 static const struct lex_token *
1015 lex_source_next__ (const struct lex_source *src_, int n)
1016 {
1017   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1018
1019   if (n < 0)
1020     {
1021       if (-n <= src->parse_ofs)
1022         return src->parse[src->parse_ofs - (-n)];
1023       else
1024         {
1025           static const struct lex_token endcmd_token
1026             = { .token = { .type = T_ENDCMD } };
1027           return &endcmd_token;
1028         }
1029     }
1030
1031   while (src->n_parse - src->parse_ofs <= n)
1032     {
1033       if (src->n_parse > 0)
1034         {
1035           const struct lex_token *t = src->parse[src->n_parse - 1];
1036           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1037             return t;
1038         }
1039
1040       lex_source_get_parse (src);
1041     }
1042
1043   return src->parse[src->parse_ofs + n];
1044 }
1045
1046 /* Returns the "struct token" of the token N after the current one in LEXER.
1047    The returned pointer can be invalidated by pretty much any succeeding call
1048    into the lexer, although the string pointer within the returned token is
1049    only invalidated by consuming the token (e.g. with lex_get()). */
1050 const struct token *
1051 lex_next (const struct lexer *lexer, int n)
1052 {
1053   return &lex_next__ (lexer, n)->token;
1054 }
1055
1056 /* Returns the type of the token N after the current one in LEXER. */
1057 enum token_type
1058 lex_next_token (const struct lexer *lexer, int n)
1059 {
1060   return lex_next (lexer, n)->type;
1061 }
1062
/* Returns the number in the token N after the current one in LEXER.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens; for any
   other token type the result is zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_number (token);
}
1072
1073 /* Returns the null-terminated string in the token N after the current one, in
1074    UTF-8 encoding.
1075
1076    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1077    this functions this function will always return NULL.
1078
1079    The UTF-8 encoding of the returned string is correct for variable names and
1080    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1081    data_in() to use it in a "union value".  */
1082 const char *
1083 lex_next_tokcstr (const struct lexer *lexer, int n)
1084 {
1085   return lex_next_tokss (lexer, n).string;
1086 }
1087
1088 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1089    The string is null-terminated (but the null terminator is not included in
1090    the returned substring's 'length').
1091
1092    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1093    tokens this functions this function will always return NULL.
1094
1095    The UTF-8 encoding of the returned string is correct for variable names and
1096    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1097    data_in() to use it in a "union value".  */
1098 struct substring
1099 lex_next_tokss (const struct lexer *lexer, int n)
1100 {
1101   return lex_next (lexer, n)->string;
1102 }
1103
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, returns a newly allocated empty string, so
   that the caller can treat the result uniformly. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source, so it must not be
     handed a null pointer when the lexer has run out of sources. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1116
1117 /* Returns true if the token N ahead of the current one was produced by macro
1118    expansion, false otherwise. */
1119 bool
1120 lex_next_is_from_macro (const struct lexer *lexer, int n)
1121 {
1122   return lex_next__ (lexer, n)->macro_rep != NULL;
1123 }
1124
1125 static bool
1126 lex_tokens_match (const struct token *actual, const struct token *expected)
1127 {
1128   if (actual->type != expected->type)
1129     return false;
1130
1131   switch (actual->type)
1132     {
1133     case T_POS_NUM:
1134     case T_NEG_NUM:
1135       return actual->number == expected->number;
1136
1137     case T_ID:
1138       return lex_id_match (expected->string, actual->string);
1139
1140     case T_STRING:
1141       return (actual->string.length == expected->string.length
1142               && !memcmp (actual->string.string, expected->string.string,
1143                           actual->string.length));
1144
1145     default:
1146       return true;
1147     }
1148 }
1149
1150 static size_t
1151 lex_at_phrase__ (struct lexer *lexer, const char *s)
1152 {
1153   struct string_lexer slex;
1154   struct token token;
1155
1156   size_t i = 0;
1157   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1158   while (string_lexer_next (&slex, &token))
1159     {
1160       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1161       token_uninit (&token);
1162       if (!match)
1163         return 0;
1164     }
1165   return i;
1166 }
1167
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  No tokens are consumed.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens > 0;
}
1179
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   consumes those tokens and returns true.  Otherwise, leaves LEXER alone and
   returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1194
/* Returns the number of '\n' bytes among the LENGTH bytes starting at S.  S
   need not be null-terminated, and it is only read, never modified. */
static int
count_newlines (const char *s, size_t length)
{
  int n_newlines = 0;
  const char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      n_newlines++;

      /* Resume the search just past the new-line that was found. */
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
1210
1211 static int
1212 lex_token_get_last_line_number (const struct lex_source *src,
1213                                 const struct lex_token *token)
1214 {
1215   if (token->first_line == 0)
1216     return 0;
1217   else
1218     {
1219       char *token_str = &src->buffer[token->token_pos];
1220       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1221     }
1222 }
1223
1224 static int
1225 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1226 {
1227   const char *newline = memrchr (src->buffer, '\n', offset);
1228   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1229   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1230 }
1231
1232 static int
1233 lex_token_get_first_column (const struct lex_source *src,
1234                             const struct lex_token *token)
1235 {
1236   return lex_token_get_column__ (src, token->token_pos);
1237 }
1238
1239 static int
1240 lex_token_get_last_column (const struct lex_source *src,
1241                            const struct lex_token *token)
1242 {
1243   return lex_token_get_column__ (src, token->token_pos + token->token_len);
1244 }
1245
1246 static struct msg_location
1247 lex_token_location (const struct lex_source *src,
1248                     const struct lex_token *t0,
1249                     const struct lex_token *t1)
1250 {
1251   int first_column = lex_token_get_first_column (src, t0);
1252   int last_line = lex_token_get_last_line_number (src, t1) - 1;
1253   int last_column = lex_token_get_last_column (src, t1) - 1;
1254   return (struct msg_location) {
1255     .file_name = intern_new_if_nonnull (src->reader->file_name),
1256     .p[0] = { .line = t0->first_line, .column = first_column },
1257     .p[1] = { .line = last_line, .column = last_column },
1258   };
1259 }
1260
1261 static struct msg_location *
1262 lex_token_location_rw (const struct lex_source *src,
1263                        const struct lex_token *t0,
1264                        const struct lex_token *t1)
1265 {
1266   struct msg_location location = lex_token_location (src, t0, t1);
1267   return msg_location_dup (&location);
1268 }
1269
/* Returns a newly allocated location for the syntax in SRC that spans the
   tokens at offsets N0 through N1 relative to the current token. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1277
1278 /* Returns the 1-based line number of the start of the syntax that represents
1279    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1280    if the token is drawn from a source that does not have line numbers. */
1281 int
1282 lex_get_first_line_number (const struct lexer *lexer, int n)
1283 {
1284   const struct lex_source *src = lex_source__ (lexer);
1285   return src ? lex_source_next__ (src, n)->first_line : 0;
1286 }
1287
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if the token is drawn from a source that does not have line
   numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1305
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP
   token.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1319
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1333
1334 /* Returns the name of the syntax file from which the current command is drawn.
1335    Returns NULL for a T_STOP token or if the command's source does not have
1336    line numbers.
1337
1338    There is no version of this function that takes an N argument because
1339    lookahead only works to the end of a command and any given command is always
1340    within a single syntax file. */
1341 const char *
1342 lex_get_file_name (const struct lexer *lexer)
1343 {
1344   struct lex_source *src = lex_source__ (lexer);
1345   return src == NULL ? NULL : src->reader->file_name;
1346 }
1347
1348 /* Returns a newly allocated msg_location for the syntax that represents tokens
1349    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1350    must eventually free the location (with msg_location_destroy()). */
1351 struct msg_location *
1352 lex_get_location (const struct lexer *lexer, int n0, int n1)
1353 {
1354   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1355   loc->p[0].column = lex_get_first_column (lexer, n0);
1356   loc->p[1].column = lex_get_last_column (lexer, n1) - 1;
1357   return loc;
1358 }
1359
1360 /* Returns a newly allocated msg_location for the syntax that represents tokens
1361    with 0-based offsets N0...N1, inclusive, from the current token.  The
1362    location only covers the tokens' lines, not the columns.  The caller must
1363    eventually free the location (with msg_location_destroy()). */
1364 struct msg_location *
1365 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1366 {
1367   struct msg_location *loc = xmalloc (sizeof *loc);
1368   int first_line = lex_get_first_line_number (lexer, n0);
1369   int last_line = lex_get_last_line_number (lexer, n1) - 1;
1370   *loc = (struct msg_location) {
1371     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1372     .p[0] = { .line = first_line },
1373     .p[1] = { .line = last_line },
1374   };
1375   return loc;
1376 }
1377
/* Merges the location of the token N after the current one in LEXER into LOC,
   using msg_location_merge().  LOC is modified in place. */
void
lex_extend_location (const struct lexer *lexer, int n, struct msg_location *loc)
{
  struct msg_location *token_loc = lex_get_location (lexer, n, n);
  msg_location_merge (loc, token_loc);

  /* The temporary location was only needed for the merge. */
  msg_location_destroy (token_loc);
}
1385
1386 const char *
1387 lex_get_encoding (const struct lexer *lexer)
1388 {
1389   struct lex_source *src = lex_source__ (lexer);
1390   return src == NULL ? NULL : src->reader->encoding;
1391 }
1392
1393 /* Returns the syntax mode for the syntax file from which the current drawn is
1394    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1395    does not have line numbers.
1396
1397    There is no version of this function that takes an N argument because
1398    lookahead only works to the end of a command and any given command is always
1399    within a single syntax file. */
1400 enum segmenter_mode
1401 lex_get_syntax_mode (const struct lexer *lexer)
1402 {
1403   struct lex_source *src = lex_source__ (lexer);
1404   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1405 }
1406
1407 /* Returns the error mode for the syntax file from which the current drawn is
1408    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1409    source does not have line numbers.
1410
1411    There is no version of this function that takes an N argument because
1412    lookahead only works to the end of a command and any given command is always
1413    within a single syntax file. */
1414 enum lex_error_mode
1415 lex_get_error_mode (const struct lexer *lexer)
1416 {
1417   struct lex_source *src = lex_source__ (lexer);
1418   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1419 }
1420
1421 /* If the source that LEXER is currently reading has error mode
1422    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1423    token to be read comes directly from whatever is next read from the stream.
1424
1425    It makes sense to call this function after encountering an error in a
1426    command entered on the console, because usually the user would prefer not to
1427    have cascading errors. */
1428 void
1429 lex_interactive_reset (struct lexer *lexer)
1430 {
1431   struct lex_source *src = lex_source__ (lexer);
1432   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1433     {
1434       src->length = 0;
1435       src->journal_pos = src->seg_pos = 0;
1436       src->n_newlines = 0;
1437       src->suppress_next_newline = false;
1438       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1439                                        false);
1440       lex_stage_clear (&src->pp);
1441       lex_stage_clear (&src->merge);
1442       lex_source_clear_parse (src);
1443       lex_source_push_endcmd__ (src);
1444     }
1445 }
1446
1447 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1448 void
1449 lex_discard_rest_of_command (struct lexer *lexer)
1450 {
1451   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1452     lex_get (lexer);
1453 }
1454
1455 /* Discards all lookahead tokens in LEXER, then discards all input sources
1456    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1457    runs out of input sources. */
1458 void
1459 lex_discard_noninteractive (struct lexer *lexer)
1460 {
1461   struct lex_source *src = lex_source__ (lexer);
1462
1463   if (src != NULL)
1464     {
1465       lex_stage_clear (&src->pp);
1466       lex_stage_clear (&src->merge);
1467       lex_source_clear_parse (src);
1468
1469       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1470            src = lex_source__ (lexer))
1471         lex_source_destroy (src);
1472     }
1473 }
1474 \f
1475 static void
1476 lex_source_expand__ (struct lex_source *src)
1477 {
1478   if (src->length >= src->allocated)
1479     src->buffer = x2realloc (src->buffer, &src->allocated);
1480 }
1481
1482 static void
1483 lex_source_read__ (struct lex_source *src)
1484 {
1485   do
1486     {
1487       lex_source_expand__ (src);
1488
1489       size_t space = src->allocated - src->length;
1490       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1491       size_t n = src->reader->class->read (src->reader,
1492                                            &src->buffer[src->length],
1493                                            space, prompt);
1494       assert (n <= space);
1495
1496       if (n == 0)
1497         {
1498           /* End of input. */
1499           src->reader->eof = true;
1500           lex_source_expand__ (src);
1501           return;
1502         }
1503
1504       src->length += n;
1505     }
1506   while (!memchr (&src->buffer[src->seg_pos], '\n',
1507                   src->length - src->seg_pos));
1508 }
1509
1510 static struct lex_source *
1511 lex_source__ (const struct lexer *lexer)
1512 {
1513   return (ll_is_empty (&lexer->sources) ? NULL
1514           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1515 }
1516
1517 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1518    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1519    and N1 are both zero, this requests the syntax for the current token.)  The
1520    caller must eventually free the returned string (with free()).  The syntax
1521    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1522    for example, it may include comments, spaces, and new-lines if it spans
1523    multiple tokens.  Macro expansion, however, has already been performed. */
1524 static char *
1525 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1526 {
1527   struct string s = DS_EMPTY_INITIALIZER;
1528   for (size_t i = n0; i <= n1; )
1529     {
1530       /* Find [I,J) as the longest sequence of tokens not produced by macro
1531          expansion, or otherwise the longest sequence expanded from a single
1532          macro call. */
1533       const struct lex_token *first = lex_source_next__ (src, i);
1534       size_t j;
1535       for (j = i + 1; j <= n1; j++)
1536         {
1537           const struct lex_token *cur = lex_source_next__ (src, j);
1538           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1539               || first->macro_rep != cur->macro_rep)
1540             break;
1541         }
1542       const struct lex_token *last = lex_source_next__ (src, j - 1);
1543
1544       /* Now add the syntax for this sequence of tokens to SRC. */
1545       if (!ds_is_empty (&s))
1546         ds_put_byte (&s, ' ');
1547       if (!first->macro_rep)
1548         {
1549           size_t start = first->token_pos;
1550           size_t end = last->token_pos + last->token_len;
1551           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1552         }
1553       else
1554         {
1555           size_t start = first->ofs;
1556           size_t end = last->ofs + last->len;
1557           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1558                                            end - start));
1559         }
1560
1561       i = j;
1562     }
1563   return ds_steal_cstr (&s);
1564 }
1565
1566 static bool
1567 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1568 {
1569   for (size_t i = n0; i <= n1; i++)
1570     if (lex_source_next__ (src, i)->macro_rep)
1571       return true;
1572   return false;
1573 }
1574
1575 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1576    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1577    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1578    the original form supplied to the lexer so that, for example, it may include
1579    comments, spaces, and new-lines if it spans multiple tokens.
1580
1581    Returns an empty string if the token range doesn't include a macro call.
1582
1583    The caller must not modify or free the returned string. */
1584 static struct substring
1585 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1586 {
1587   if (!lex_source_contains_macro_call (src, n0, n1))
1588     return ss_empty ();
1589
1590   const struct lex_token *token0 = lex_source_next__ (src, n0);
1591   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1592   size_t start = token0->token_pos;
1593   size_t end = token1->token_pos + token1->token_len;
1594
1595   return ss_buffer (&src->buffer[start], end - start);
1596 }
1597
1598 static void
1599 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1600                          const char *format, va_list args)
1601 {
1602   const struct lex_token *token;
1603   struct string s;
1604
1605   ds_init_empty (&s);
1606
1607   token = lex_source_next__ (src, n0);
1608   if (token->token.type == T_ENDCMD)
1609     ds_put_cstr (&s, _("Syntax error at end of command"));
1610   else
1611     {
1612       /* Get the syntax that caused the error. */
1613       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1614       char syntax[64];
1615       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1616       free (raw_syntax);
1617
1618       /* Get the macro call(s) that expanded to the syntax that caused the
1619          error. */
1620       char call[64];
1621       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1622                      call, sizeof call);
1623
1624       if (syntax[0])
1625         {
1626           if (call[0])
1627             ds_put_format (&s,
1628                            _("Syntax error at `%s' (in expansion of `%s')"),
1629                            syntax, call);
1630           else
1631             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1632         }
1633       else
1634         {
1635           if (call[0])
1636             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1637                            call);
1638           else
1639             ds_put_cstr (&s, _("Syntax error"));
1640         }
1641     }
1642
1643   if (format)
1644     {
1645       ds_put_cstr (&s, ": ");
1646       ds_put_vformat (&s, format, args);
1647     }
1648   if (ds_last (&s) != '.')
1649     ds_put_byte (&s, '.');
1650
1651   struct msg *m = xmalloc (sizeof *m);
1652   *m = (struct msg) {
1653     .category = MSG_C_SYNTAX,
1654     .severity = MSG_S_ERROR,
1655     .location = lex_source_get_location (src, n0, n1),
1656     .text = ds_steal_cstr (&s),
1657   };
1658   msg_emit (m);
1659 }
1660
1661 static void
1662 lex_get_error (struct lex_source *src, const struct lex_token *token)
1663 {
1664   char syntax[64];
1665   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1666                  syntax, sizeof syntax);
1667
1668   struct string s = DS_EMPTY_INITIALIZER;
1669   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1670   ds_put_format (&s, ": %s", token->token.string.string);
1671
1672   struct msg *m = xmalloc (sizeof *m);
1673   *m = (struct msg) {
1674     .category = MSG_C_SYNTAX,
1675     .severity = MSG_S_ERROR,
1676     .location = lex_token_location_rw (src, token, token),
1677     .text = ds_steal_cstr (&s),
1678   };
1679   msg_emit (m);
1680 }
1681
1682 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1683    underlying lex_reader if necessary.  Returns true if a new token was added
1684    to SRC's deque, false otherwise.  The caller should retry failures unless
1685    SRC's 'eof' marker was set to true indicating that there will be no more
1686    tokens from this source. */
1687 static bool
1688 lex_source_try_get_pp (struct lex_source *src)
1689 {
1690   /* Append a new token to SRC and initialize it. */
1691   struct lex_token *token = xmalloc (sizeof *token);
1692   token->token = (struct token) { .type = T_STOP };
1693   token->macro_rep = NULL;
1694   token->ref_cnt = NULL;
1695   token->token_pos = src->seg_pos;
1696   if (src->reader->line_number > 0)
1697     token->first_line = src->reader->line_number + src->n_newlines;
1698   else
1699     token->first_line = 0;
1700
1701   /* Extract a segment. */
1702   const char *segment;
1703   enum segment_type seg_type;
1704   int seg_len;
1705   for (;;)
1706     {
1707       segment = &src->buffer[src->seg_pos];
1708       seg_len = segmenter_push (&src->segmenter, segment,
1709                                 src->length - src->seg_pos,
1710                                 src->reader->eof, &seg_type);
1711       if (seg_len >= 0)
1712         break;
1713
1714       /* The segmenter needs more input to produce a segment. */
1715       assert (!src->reader->eof);
1716       lex_source_read__ (src);
1717     }
1718
1719   /* Update state based on the segment. */
1720   token->token_len = seg_len;
1721   src->seg_pos += seg_len;
1722   if (seg_type == SEG_NEWLINE)
1723     src->n_newlines++;
1724
1725   /* Get a token from the segment. */
1726   enum tokenize_result result = token_from_segment (
1727     seg_type, ss_buffer (segment, seg_len), &token->token);
1728
1729   /* If we've reached the end of a line, or the end of a command, then pass
1730      the line to the output engine as a syntax text item.  */
1731   int n_lines = seg_type == SEG_NEWLINE;
1732   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1733     {
1734       n_lines++;
1735       src->suppress_next_newline = true;
1736     }
1737   else if (n_lines > 0 && src->suppress_next_newline)
1738     {
1739       n_lines--;
1740       src->suppress_next_newline = false;
1741     }
1742   for (int i = 0; i < n_lines; i++)
1743     {
1744       /* Beginning of line. */
1745       const char *line = &src->buffer[src->journal_pos];
1746
1747       /* Calculate line length, including \n or \r\n end-of-line if present.
1748
1749          We use src->head even though that may be beyond what we've actually
1750          converted to tokens (which is only through line_pos).  That's because,
1751          if we're emitting the line due to SEG_END_COMMAND, we want to take the
1752          whole line through the newline, not just through the '.'. */
1753       size_t max_len = src->length - src->journal_pos;
1754       const char *newline = memchr (line, '\n', max_len);
1755       size_t line_len = newline ? newline - line + 1 : max_len;
1756
1757       /* Calculate line length excluding end-of-line. */
1758       size_t copy_len = line_len;
1759       if (copy_len > 0 && line[copy_len - 1] == '\n')
1760         copy_len--;
1761       if (copy_len > 0 && line[copy_len - 1] == '\r')
1762         copy_len--;
1763
1764       /* Submit the line as syntax. */
1765       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1766                                                    xmemdup0 (line, copy_len),
1767                                                    NULL));
1768
1769       src->journal_pos += line_len;
1770     }
1771
1772   switch (result)
1773     {
1774     case TOKENIZE_ERROR:
1775       lex_get_error (src, token);
1776       /* Fall through. */
1777     case TOKENIZE_EMPTY:
1778       lex_token_destroy (token);
1779       return false;
1780
1781     case TOKENIZE_TOKEN:
1782       if (token->token.type == T_STOP)
1783         {
1784           token->token.type = T_ENDCMD;
1785           src->eof = true;
1786         }
1787       lex_stage_push_last (&src->pp, token);
1788       return true;
1789     }
1790   NOT_REACHED ();
1791 }
1792
1793 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1794    failure.  On failure, the end of SRC has been reached and no more tokens
1795    will be forthcoming from it.
1796
1797    Does not make the new token available for lookahead yet; the caller must
1798    adjust SRC's 'middle' pointer to do so. */
1799 static bool
1800 lex_source_get_pp (struct lex_source *src)
1801 {
1802   while (!src->eof)
1803     if (lex_source_try_get_pp (src))
1804       return true;
1805   return false;
1806 }
1807
1808 static bool
1809 lex_source_try_get_merge (const struct lex_source *src_)
1810 {
1811   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1812
1813   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1814     return false;
1815
1816   if (!settings_get_mexpand ())
1817     {
1818       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1819       return true;
1820     }
1821
1822   /* Now pass tokens one-by-one to the macro expander.
1823
1824      In the common case where there is no macro to expand, the loop is not
1825      entered.  */
1826   struct macro_call *mc;
1827   int n_call = macro_call_create (src->lexer->macros,
1828                                   &lex_stage_first (&src->pp)->token, &mc);
1829   for (int ofs = 1; !n_call; ofs++)
1830     {
1831       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1832         {
1833           /* This should not be reachable because we always get a T_ENDCMD at
1834              the end of an input file (transformed from T_STOP by
1835              lex_source_try_get_pp()) and the macro_expander should always
1836              terminate expansion on T_ENDCMD. */
1837           NOT_REACHED ();
1838         }
1839
1840       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1841       size_t start = t->token_pos;
1842       size_t end = t->token_pos + t->token_len;
1843       const struct macro_token mt = {
1844         .token = t->token,
1845         .syntax = ss_buffer (&src->buffer[start], end - start),
1846       };
1847       const struct msg_location loc = lex_token_location (src, t, t);
1848       n_call = macro_call_add (mc, &mt, &loc);
1849     }
1850   if (n_call < 0)
1851     {
1852       /* False alarm: no macro expansion after all.  Use first token as
1853          lookahead.  We'll retry macro expansion from the second token next
1854          time around. */
1855       macro_call_destroy (mc);
1856       lex_stage_shift (&src->merge, &src->pp, 1);
1857       return true;
1858     }
1859
1860   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1861      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1862      Expand them.  */
1863   const struct lex_token *c0 = lex_stage_first (&src->pp);
1864   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1865   struct macro_tokens expansion = { .n = 0 };
1866   struct msg_location loc = lex_token_location (src, c0, c1);
1867   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1868   macro_call_destroy (mc);
1869
1870   /* Convert the macro expansion into syntax for possible error messages
1871      later. */
1872   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1873   size_t *len = xnmalloc (expansion.n, sizeof *len);
1874   struct string s = DS_EMPTY_INITIALIZER;
1875   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1876
1877   if (settings_get_mprint ())
1878     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1879                                           _("Macro Expansion")));
1880
1881   /* Append the macro expansion tokens to the lookahead. */
1882   if (expansion.n > 0)
1883     {
1884       char *macro_rep = ds_steal_cstr (&s);
1885       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1886       *ref_cnt = expansion.n;
1887       for (size_t i = 0; i < expansion.n; i++)
1888         {
1889           struct lex_token *token = xmalloc (sizeof *token);
1890           *token = (struct lex_token) {
1891             .token = expansion.mts[i].token,
1892             .token_pos = c0->token_pos,
1893             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1894             .first_line = c0->first_line,
1895             .macro_rep = macro_rep,
1896             .ofs = ofs[i],
1897             .len = len[i],
1898             .ref_cnt = ref_cnt,
1899           };
1900           lex_stage_push_last (&src->merge, token);
1901
1902           ss_dealloc (&expansion.mts[i].syntax);
1903         }
1904     }
1905   else
1906     ds_destroy (&s);
1907   free (expansion.mts);
1908   free (ofs);
1909   free (len);
1910
1911   /* Destroy the tokens for the call. */
1912   for (size_t i = 0; i < n_call; i++)
1913     lex_stage_pop_first (&src->pp);
1914
1915   return expansion.n > 0;
1916 }
1917
1918 /* Attempts to obtain at least one new token into 'merge' in SRC.
1919
1920    Returns true if successful, false on failure.  In the latter case, SRC is
1921    exhausted and 'src->eof' is now true. */
1922 static bool
1923 lex_source_get_merge (struct lex_source *src)
1924 {
1925   while (!src->eof)
1926     if (lex_source_try_get_merge (src))
1927       return true;
1928   return false;
1929 }
1930
1931 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1932
1933    Returns true if successful, false on failure.  In the latter case, SRC is
1934    exhausted and 'src->eof' is now true. */
1935 static bool
1936 lex_source_get_parse (struct lex_source *src)
1937 {
1938   struct merger m = MERGER_INIT;
1939   struct token out;
1940   for (size_t i = 0; ; i++)
1941     {
1942       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1943         {
1944           /* We always get a T_ENDCMD at the end of an input file
1945              (transformed from T_STOP by lex_source_try_get_pp()) and
1946              merger_add() should never return -1 on T_ENDCMD. */
1947           assert (lex_stage_is_empty (&src->merge));
1948           return false;
1949         }
1950
1951       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1952                                &out);
1953       if (!retval)
1954         {
1955           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1956           return true;
1957         }
1958       else if (retval > 0)
1959         {
1960           /* Add a token that merges all the tokens together. */
1961           const struct lex_token *first = lex_stage_first (&src->merge);
1962           const struct lex_token *last = lex_stage_nth (&src->merge,
1963                                                         retval - 1);
1964           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1965           struct lex_token *t = xmalloc (sizeof *t);
1966           *t = (struct lex_token) {
1967             .token = out,
1968             .token_pos = first->token_pos,
1969             .token_len = (last->token_pos - first->token_pos) + last->token_len,
1970             .first_line = first->first_line,
1971
1972             /* This works well if all the tokens were not expanded from macros,
1973                or if they came from the same macro expansion.  It just gives up
1974                in the other (corner) cases. */
1975             .macro_rep = macro ? first->macro_rep : NULL,
1976             .ofs = macro ? first->ofs : 0,
1977             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1978             .ref_cnt = macro ? first->ref_cnt : NULL,
1979           };
1980           if (t->ref_cnt)
1981             ++*t->ref_cnt;
1982           lex_source_push_parse (src, t);
1983
1984           for (int i = 0; i < retval; i++)
1985             lex_stage_pop_first (&src->merge);
1986           return true;
1987         }
1988     }
1989 }
1990 \f
1991 static void
1992 lex_source_push_endcmd__ (struct lex_source *src)
1993 {
1994   assert (src->n_parse == 0);
1995
1996   struct lex_token *token = xmalloc (sizeof *token);
1997   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1998   lex_source_push_parse (src, token);
1999 }
2000
2001 static void
2002 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2003 {
2004   if (src->n_parse >= src->allocated_parse)
2005     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2006                              sizeof *src->parse);
2007   src->parse[src->n_parse++] = token;
2008 }
2009
2010 static void
2011 lex_source_clear_parse (struct lex_source *src)
2012 {
2013   for (size_t i = 0; i < src->n_parse; i++)
2014     lex_token_destroy (src->parse[i]);
2015   src->n_parse = src->parse_ofs = 0;
2016 }
2017
2018 static struct lex_source *
2019 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2020 {
2021   struct lex_source *src = xmalloc (sizeof *src);
2022   *src = (struct lex_source) {
2023     .reader = reader,
2024     .segmenter = segmenter_init (reader->syntax, false),
2025     .lexer = lexer,
2026   };
2027
2028   lex_source_push_endcmd__ (src);
2029
2030   return src;
2031 }
2032
2033 static void
2034 lex_source_destroy (struct lex_source *src)
2035 {
2036   char *file_name = src->reader->file_name;
2037   char *encoding = src->reader->encoding;
2038   if (src->reader->class->destroy != NULL)
2039     src->reader->class->destroy (src->reader);
2040   free (file_name);
2041   free (encoding);
2042   free (src->buffer);
2043   lex_stage_uninit (&src->pp);
2044   lex_stage_uninit (&src->merge);
2045   lex_source_clear_parse (src);
2046   free (src->parse);
2047   ll_remove (&src->ll);
2048   free (src);
2049 }
2050 \f
2051 struct lex_file_reader
2052   {
2053     struct lex_reader reader;
2054     struct u8_istream *istream;
2055   };
2056
2057 static struct lex_reader_class lex_file_reader_class;
2058
2059 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2060    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2061    ENCODING, which should take one of the forms accepted by
2062    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2063    mode of the new reader, respectively.
2064
2065    Returns a null pointer if FILE_NAME cannot be opened. */
2066 struct lex_reader *
2067 lex_reader_for_file (const char *file_name, const char *encoding,
2068                      enum segmenter_mode syntax,
2069                      enum lex_error_mode error)
2070 {
2071   struct lex_file_reader *r;
2072   struct u8_istream *istream;
2073
2074   istream = (!strcmp(file_name, "-")
2075              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2076              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2077   if (istream == NULL)
2078     {
2079       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2080       return NULL;
2081     }
2082
2083   r = xmalloc (sizeof *r);
2084   lex_reader_init (&r->reader, &lex_file_reader_class);
2085   r->reader.syntax = syntax;
2086   r->reader.error = error;
2087   r->reader.file_name = xstrdup (file_name);
2088   r->reader.encoding = xstrdup_if_nonnull (encoding);
2089   r->reader.line_number = 1;
2090   r->istream = istream;
2091
2092   return &r->reader;
2093 }
2094
2095 static struct lex_file_reader *
2096 lex_file_reader_cast (struct lex_reader *r)
2097 {
2098   return UP_CAST (r, struct lex_file_reader, reader);
2099 }
2100
2101 static size_t
2102 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2103                enum prompt_style prompt_style UNUSED)
2104 {
2105   struct lex_file_reader *r = lex_file_reader_cast (r_);
2106   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2107   if (n_read < 0)
2108     {
2109       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2110       return 0;
2111     }
2112   return n_read;
2113 }
2114
2115 static void
2116 lex_file_close (struct lex_reader *r_)
2117 {
2118   struct lex_file_reader *r = lex_file_reader_cast (r_);
2119
2120   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2121     {
2122       if (u8_istream_close (r->istream) != 0)
2123         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2124     }
2125   else
2126     u8_istream_free (r->istream);
2127
2128   free (r);
2129 }
2130
2131 static struct lex_reader_class lex_file_reader_class =
2132   {
2133     lex_file_read,
2134     lex_file_close
2135   };
2136 \f
2137 struct lex_string_reader
2138   {
2139     struct lex_reader reader;
2140     struct substring s;
2141     size_t offset;
2142   };
2143
2144 static struct lex_reader_class lex_string_reader_class;
2145
2146 /* Creates and returns a new lex_reader for the contents of S, which must be
2147    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2148    with ss_dealloc() when it is closed. */
2149 struct lex_reader *
2150 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2151 {
2152   struct lex_string_reader *r;
2153
2154   r = xmalloc (sizeof *r);
2155   lex_reader_init (&r->reader, &lex_string_reader_class);
2156   r->reader.syntax = SEG_MODE_AUTO;
2157   r->reader.encoding = xstrdup_if_nonnull (encoding);
2158   r->s = s;
2159   r->offset = 0;
2160
2161   return &r->reader;
2162 }
2163
2164 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2165    which must be encoded in ENCODING.  The caller retains ownership of S. */
2166 struct lex_reader *
2167 lex_reader_for_string (const char *s, const char *encoding)
2168 {
2169   struct substring ss;
2170   ss_alloc_substring (&ss, ss_cstr (s));
2171   return lex_reader_for_substring_nocopy (ss, encoding);
2172 }
2173
/* Formats FORMAT and the arguments that follow ENCODING in the manner of
   printf(), then creates and returns a new lex_reader whose input is the
   formatted text.  The reader takes ownership of that text. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2188
2189 static struct lex_string_reader *
2190 lex_string_reader_cast (struct lex_reader *r)
2191 {
2192   return UP_CAST (r, struct lex_string_reader, reader);
2193 }
2194
2195 static size_t
2196 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2197                  enum prompt_style prompt_style UNUSED)
2198 {
2199   struct lex_string_reader *r = lex_string_reader_cast (r_);
2200   size_t chunk;
2201
2202   chunk = MIN (n, r->s.length - r->offset);
2203   memcpy (buf, r->s.string + r->offset, chunk);
2204   r->offset += chunk;
2205
2206   return chunk;
2207 }
2208
2209 static void
2210 lex_string_close (struct lex_reader *r_)
2211 {
2212   struct lex_string_reader *r = lex_string_reader_cast (r_);
2213
2214   ss_dealloc (&r->s);
2215   free (r);
2216 }
2217
2218 static struct lex_reader_class lex_string_reader_class =
2219   {
2220     lex_string_read,
2221     lex_string_close
2222   };