pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
  59 /* A token within a lex_source. */
  60 struct lex_token
  61   {
  62     /* The regular token information. */
  63     struct token token;
  64
  65     /* For a token obtained through the lexer in an ordinary way, this is the
  66        location of the token in terms of the lex_source's buffer.
  67
  68        For a token produced through macro expansion, this is the entire macro
  69        call. */
  70     size_t token_pos;           /* Offset into src->buffer of token start. */
  71     size_t token_len;           /* Length of source for token in bytes. */
  72     int first_line;             /* Line number at token_pos. */
  73
  74     /* For a token obtained through macro expansion, this is just this token.
  75
  76        For a token obtained through the lexer in an ordinary way, these are
  77        nulls and zeros. */
  78     char *macro_rep;        /* The whole macro expansion. */
  79     size_t ofs;             /* Offset of this token in macro_rep. */
  80     size_t len;             /* Length of this token in macro_rep. */
  81     size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  82   };
  83
  84 static void
  85 lex_token_destroy (struct lex_token *t)
  86 {
  87   token_uninit (&t->token);
  88   if (t->ref_cnt)
  89     {
  90       assert (*t->ref_cnt > 0);
  91       if (!--*t->ref_cnt)
  92         {
  93           free (t->macro_rep);
  94           free (t->ref_cnt);
  95         }
  96     }
  97   free (t);
  98 }
  99 \f
 100 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
 101    lex_source. */
 102 struct lex_stage
 103   {
 104     struct deque deque;
 105     struct lex_token **tokens;
 106   };
 107
 108 static void lex_stage_clear (struct lex_stage *);
 109 static void lex_stage_uninit (struct lex_stage *);
 110
 111 static size_t lex_stage_count (const struct lex_stage *);
 112 static bool lex_stage_is_empty (const struct lex_stage *);
 113
 114 static struct lex_token *lex_stage_first (struct lex_stage *);
 115 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 116
 117 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 118 static void lex_stage_pop_first (struct lex_stage *);
 119
 120 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 121                              size_t n);
 122
 123 /* Deletes all the tokens from STAGE. */
 124 static void
 125 lex_stage_clear (struct lex_stage *stage)
 126 {
 127   while (!deque_is_empty (&stage->deque))
 128     lex_stage_pop_first (stage);
 129 }
 130
 131 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 132 static void
 133 lex_stage_uninit (struct lex_stage *stage)
 134 {
 135   lex_stage_clear (stage);
 136   free (stage->tokens);
 137 }
 138
 139 /* Returns true if STAGE contains no tokens, otherwise false. */
 140 static bool
 141 lex_stage_is_empty (const struct lex_stage *stage)
 142 {
 143   return deque_is_empty (&stage->deque);
 144 }
 145
 146 /* Returns the number of tokens in STAGE. */
 147 static size_t
 148 lex_stage_count (const struct lex_stage *stage)
 149 {
 150   return deque_count (&stage->deque);
 151 }
 152
 153 /* Returns the first token in STAGE, which must be nonempty.
 154    The first token is the one accessed with the least lookahead. */
 155 static struct lex_token *
 156 lex_stage_first (struct lex_stage *stage)
 157 {
 158   return lex_stage_nth (stage, 0);
 159 }
 160
 161 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 162    lookahead) is 0, the second token is 1, and so on.  There must be at least
 163    INDEX + 1 tokens in STAGE. */
 164 static struct lex_token *
 165 lex_stage_nth (struct lex_stage *stage, size_t index)
 166 {
 167   return stage->tokens[deque_back (&stage->deque, index)];
 168 }
 169
 170 /* Adds TOKEN so that it becomes the last token in STAGE. */
 171 static void
 172 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 173 {
 174   if (deque_is_full (&stage->deque))
 175     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 176                                   sizeof *stage->tokens);
 177   stage->tokens[deque_push_front (&stage->deque)] = token;
 178 }
 179
 180 /* Removes and returns the first token from STAGE. */
 181 static struct lex_token *
 182 lex_stage_take_first (struct lex_stage *stage)
 183 {
 184   return stage->tokens[deque_pop_back (&stage->deque)];
 185 }
 186
 187 /* Removes the first token from STAGE and uninitializes it. */
 188 static void
 189 lex_stage_pop_first (struct lex_stage *stage)
 190 {
 191   lex_token_destroy (lex_stage_take_first (stage));
 192 }
 193
 194 /* Removes the first N tokens from SRC, appending them to DST as the last
 195    tokens. */
 196 static void
 197 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 198 {
 199   for (size_t i = 0; i < n; i++)
 200     lex_stage_push_last (dst, lex_stage_take_first (src));
 201 }
 202
 203 /* A source of tokens, corresponding to a syntax file.
 204
 205    This is conceptually a lex_reader wrapped with everything needed to convert
 206    its UTF-8 bytes into tokens. */
 207 struct lex_source
 208   {
 209     struct ll ll;               /* In lexer's list of sources. */
 210     struct lex_reader *reader;
 211     struct lexer *lexer;
 212     struct segmenter segmenter;
 213     bool eof;                   /* True if T_STOP was read from 'reader'. */
 214
 215     /* Buffer of UTF-8 bytes. */
 216     char *buffer;               /* Source file contents. */
 217     size_t length;              /* Number of bytes filled. */
 218     size_t allocated;           /* Number of bytes allocated. */
 219
 220     /* Offsets into 'buffer'. */
 221     size_t journal_pos;         /* First byte not yet output to journal. */
 222     size_t seg_pos;             /* First byte not yet scanned as token. */
 223
 224     int n_newlines;             /* Number of new-lines up to seg_pos. */
 225     bool suppress_next_newline;
 226
 227     /* Tokens.
 228
 229        This is a pipeline with the following stages.  Each token eventually
 230        made available to the parser passes through of these stages.  The stages
 231        are named after the processing that happens in each one.
 232
 233        Initially, tokens come from the segmenter and scanner to 'pp':
 234
 235        - pp: Tokens that need to pass through the macro preprocessor to end up
 236          in 'merge'.
 237
 238        - merge: Tokens that need to pass through scan_merge() to end up in
 239          'parse'.
 240
 241        - parse: Tokens available to the client for parsing.
 242
 243       'pp' and 'merge' store tokens only temporarily until they pass into
 244       'parse'.  Tokens then live in 'parse' until the command is fully
 245       consumed, at which time they are freed together. */
 246     struct lex_stage pp;
 247     struct lex_stage merge;
 248     struct lex_token **parse;
 249     size_t n_parse, allocated_parse, parse_ofs;
 250   };
 251
 252 static struct lex_source *lex_source_create (struct lexer *,
 253                                              struct lex_reader *);
 254 static void lex_source_destroy (struct lex_source *);
 255
 256 /* Lexer. */
 257 struct lexer
 258   {
 259     struct ll_list sources;     /* Contains "struct lex_source"s. */
 260     struct macro_set *macros;
 261   };
 262
 263 static struct lex_source *lex_source__ (const struct lexer *);
 264 static char *lex_source_get_syntax__ (const struct lex_source *,
 265                                       int n0, int n1);
 266 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 267 static void lex_source_push_endcmd__ (struct lex_source *);
 268 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
 269 static void lex_source_clear_parse (struct lex_source *);
 270
 271 static bool lex_source_get_parse (struct lex_source *);
 272 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 273                                      const char *format, va_list)
 274    PRINTF_FORMAT (4, 0);
 275 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 276                                                   int n);
 277 \f
 278 /* Initializes READER with the specified CLASS and otherwise some reasonable
 279    defaults.  The caller should fill in the others members as desired. */
 280 void
 281 lex_reader_init (struct lex_reader *reader,
 282                  const struct lex_reader_class *class)
 283 {
 284   reader->class = class;
 285   reader->syntax = SEG_MODE_AUTO;
 286   reader->error = LEX_ERROR_CONTINUE;
 287   reader->file_name = NULL;
 288   reader->encoding = NULL;
 289   reader->line_number = 0;
 290   reader->eof = false;
 291 }
 292
 293 /* Frees any file name already in READER and replaces it by a copy of
 294    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 295 void
 296 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 297 {
 298   free (reader->file_name);
 299   reader->file_name = xstrdup_if_nonnull (file_name);
 300 }
 301 \f
 302 /* Creates and returns a new lexer. */
 303 struct lexer *
 304 lex_create (void)
 305 {
 306   struct lexer *lexer = xmalloc (sizeof *lexer);
 307   *lexer = (struct lexer) {
 308     .sources = LL_INITIALIZER (lexer->sources),
 309     .macros = macro_set_create (),
 310   };
 311   return lexer;
 312 }
 313
 314 /* Destroys LEXER. */
 315 void
 316 lex_destroy (struct lexer *lexer)
 317 {
 318   if (lexer != NULL)
 319     {
 320       struct lex_source *source, *next;
 321
 322       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 323         lex_source_destroy (source);
 324       macro_set_destroy (lexer->macros);
 325       free (lexer);
 326     }
 327 }
 328
 329 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 330    same name.  Takes ownership of M. */
 331 void
 332 lex_define_macro (struct lexer *lexer, struct macro *m)
 333 {
 334   macro_set_add (lexer->macros, m);
 335 }
 336
 337 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 338    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 339    token. */
 340 void
 341 lex_include (struct lexer *lexer, struct lex_reader *reader)
 342 {
 343   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 344   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 345 }
 346
 347 /* Appends READER to LEXER, so that it will be read after all other current
 348    readers have already been read. */
 349 void
 350 lex_append (struct lexer *lexer, struct lex_reader *reader)
 351 {
 352   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 353 }
 354 \f
 355 /* Advancing. */
 356
 357 /* Advances LEXER to the next token, consuming the current token. */
 358 void
 359 lex_get (struct lexer *lexer)
 360 {
 361   struct lex_source *src;
 362
 363   src = lex_source__ (lexer);
 364   if (src == NULL)
 365     return;
 366
 367   if (src->parse_ofs < src->n_parse)
 368     {
 369       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 370         lex_source_clear_parse (src);
 371       else
 372         src->parse_ofs++;
 373     }
 374
 375   while (src->parse_ofs == src->n_parse)
 376     if (!lex_source_get_parse (src))
 377       {
 378         lex_source_destroy (src);
 379         src = lex_source__ (lexer);
 380         if (src == NULL)
 381           return;
 382       }
 383 }
 384
 385 /* Advances LEXER by N tokens. */
 386 void
 387 lex_get_n (struct lexer *lexer, size_t n)
 388 {
 389   while (n-- > 0)
 390     lex_get (lexer);
 391 }
 392 \f
 393 /* Issuing errors. */
 394
 395 /* Prints a syntax error message containing the current token and
 396    given message MESSAGE (if non-null). */
 397 void
 398 lex_error (struct lexer *lexer, const char *format, ...)
 399 {
 400   va_list args;
 401
 402   va_start (args, format);
 403   lex_next_error_valist (lexer, 0, 0, format, args);
 404   va_end (args);
 405 }
 406
 407 /* Prints a syntax error message containing the current token and
 408    given message MESSAGE (if non-null). */
 409 void
 410 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 411 {
 412   lex_next_error_valist (lexer, 0, 0, format, args);
 413 }
 414
 415 /* Prints a syntax error message containing the current token and
 416    given message MESSAGE (if non-null). */
 417 void
 418 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 419 {
 420   va_list args;
 421
 422   va_start (args, format);
 423   lex_next_error_valist (lexer, n0, n1, format, args);
 424   va_end (args);
 425 }
 426
 427 /* Prints a syntax error message saying that one of the strings provided as
 428    varargs, up to the first NULL, is expected. */
 429 void
 430 (lex_error_expecting) (struct lexer *lexer, ...)
 431 {
 432   va_list args;
 433
 434   va_start (args, lexer);
 435   lex_error_expecting_valist (lexer, args);
 436   va_end (args);
 437 }
 438
 439 /* Prints a syntax error message saying that one of the options provided in
 440    ARGS, up to the first NULL, is expected. */
 441 void
 442 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 443 {
 444   enum { MAX_OPTIONS = 9 };
 445   const char *options[MAX_OPTIONS];
 446   int n = 0;
 447   while (n < MAX_OPTIONS)
 448     {
 449       const char *option = va_arg (args, const char *);
 450       if (!option)
 451         break;
 452
 453       options[n++] = option;
 454     }
 455   lex_error_expecting_array (lexer, options, n);
 456 }
 457
 458 void
 459 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 460 {
 461   switch (n)
 462     {
 463     case 0:
 464       lex_error (lexer, NULL);
 465       break;
 466
 467     case 1:
 468       lex_error (lexer, _("expecting %s"), options[0]);
 469       break;
 470
 471     case 2:
 472       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 473       break;
 474
 475     case 3:
 476       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 477                  options[2]);
 478       break;
 479
 480     case 4:
 481       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 482                  options[0], options[1], options[2], options[3]);
 483       break;
 484
 485     case 5:
 486       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 487                  options[0], options[1], options[2], options[3], options[4]);
 488       break;
 489
 490     case 6:
 491       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 492                  options[0], options[1], options[2], options[3], options[4],
 493                  options[5]);
 494       break;
 495
 496     case 7:
 497       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 498                  options[0], options[1], options[2], options[3], options[4],
 499                  options[5], options[6]);
 500       break;
 501
 502     case 8:
 503       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 504                  options[0], options[1], options[2], options[3], options[4],
 505                  options[5], options[6], options[7]);
 506       break;
 507
 508     default:
 509       lex_error (lexer, NULL);
 510     }
 511 }
 512
 513 /* Reports an error to the effect that subcommand SBC may only be specified
 514    once.
 515
 516    This function does not take a lexer as an argument or use lex_error(),
 517    because the result would ordinarily just be redundant: "Syntax error at
 518    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 519    not help the user find the error. */
 520 void
 521 lex_sbc_only_once (const char *sbc)
 522 {
 523   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 524 }
 525
 526 /* Reports an error to the effect that subcommand SBC is missing.
 527
 528    This function does not take a lexer as an argument or use lex_error(),
 529    because a missing subcommand can normally be detected only after the whole
 530    command has been parsed, and so lex_error() would always report "Syntax
 531    error at end of command", which does not help the user find the error. */
 532 void
 533 lex_sbc_missing (const char *sbc)
 534 {
 535   msg (SE, _("Required subcommand %s was not specified."), sbc);
 536 }
 537
 538 /* Reports an error to the effect that specification SPEC may only be specified
 539    once within subcommand SBC. */
 540 void
 541 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 542 {
 543   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 544              spec, sbc);
 545 }
 546
 547 /* Reports an error to the effect that specification SPEC is missing within
 548    subcommand SBC. */
 549 void
 550 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 551 {
 552   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 553              sbc, spec);
 554 }
 555
 556 /* Prints a syntax error message containing the current token and
 557    given message MESSAGE (if non-null). */
 558 void
 559 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 560                        const char *format, va_list args)
 561 {
 562   struct lex_source *src = lex_source__ (lexer);
 563
 564   if (src != NULL)
 565     lex_source_error_valist (src, n0, n1, format, args);
 566   else
 567     {
 568       struct string s;
 569
 570       ds_init_empty (&s);
 571       ds_put_format (&s, _("Syntax error at end of input"));
 572       if (format != NULL)
 573         {
 574           ds_put_cstr (&s, ": ");
 575           ds_put_vformat (&s, format, args);
 576         }
 577       if (ds_last (&s) != '.')
 578         ds_put_byte (&s, '.');
 579       msg (SE, "%s", ds_cstr (&s));
 580       ds_destroy (&s);
 581     }
 582 }
 583
 584 /* Checks that we're at end of command.
 585    If so, returns a successful command completion code.
 586    If not, flags a syntax error and returns an error command
 587    completion code. */
 588 int
 589 lex_end_of_command (struct lexer *lexer)
 590 {
 591   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 592     {
 593       lex_error (lexer, _("expecting end of command"));
 594       return CMD_FAILURE;
 595     }
 596   else
 597     return CMD_SUCCESS;
 598 }
 599 \f
 600 /* Token testing functions. */
 601
 602 /* Returns true if the current token is a number. */
 603 bool
 604 lex_is_number (const struct lexer *lexer)
 605 {
 606   return lex_next_is_number (lexer, 0);
 607 }
 608
 609 /* Returns true if the current token is a string. */
 610 bool
 611 lex_is_string (const struct lexer *lexer)
 612 {
 613   return lex_next_is_string (lexer, 0);
 614 }
 615
 616 /* Returns the value of the current token, which must be a
 617    floating point number. */
 618 double
 619 lex_number (const struct lexer *lexer)
 620 {
 621   return lex_next_number (lexer, 0);
 622 }
 623
 624 /* Returns true iff the current token is an integer. */
 625 bool
 626 lex_is_integer (const struct lexer *lexer)
 627 {
 628   return lex_next_is_integer (lexer, 0);
 629 }
 630
 631 /* Returns the value of the current token, which must be an
 632    integer. */
 633 long
 634 lex_integer (const struct lexer *lexer)
 635 {
 636   return lex_next_integer (lexer, 0);
 637 }
 638 \f
 639 /* Token testing functions with lookahead.
 640
 641    A value of 0 for N as an argument to any of these functions refers to the
 642    current token.  Lookahead is limited to the current command.  Any N greater
 643    than the number of tokens remaining in the current command will be treated
 644    as referring to a T_ENDCMD token. */
 645
 646 /* Returns true if the token N ahead of the current token is a number. */
 647 bool
 648 lex_next_is_number (const struct lexer *lexer, int n)
 649 {
 650   return token_is_number (lex_next (lexer, n));
 651 }
 652
 653 /* Returns true if the token N ahead of the current token is a string. */
 654 bool
 655 lex_next_is_string (const struct lexer *lexer, int n)
 656 {
 657   return token_is_string (lex_next (lexer, n));
 658 }
 659
 660 /* Returns the value of the token N ahead of the current token, which must be a
 661    floating point number. */
 662 double
 663 lex_next_number (const struct lexer *lexer, int n)
 664 {
 665   return token_number (lex_next (lexer, n));
 666 }
 667
 668 /* Returns true if the token N ahead of the current token is an integer. */
 669 bool
 670 lex_next_is_integer (const struct lexer *lexer, int n)
 671 {
 672   return token_is_integer (lex_next (lexer, n));
 673 }
 674
 675 /* Returns the value of the token N ahead of the current token, which must be
 676    an integer. */
 677 long
 678 lex_next_integer (const struct lexer *lexer, int n)
 679 {
 680   return token_integer (lex_next (lexer, n));
 681 }
 682 \f
 683 /* Token matching functions. */
 684
 685 /* If the current token has the specified TYPE, skips it and returns true.
 686    Otherwise, returns false. */
 687 bool
 688 lex_match (struct lexer *lexer, enum token_type type)
 689 {
 690   if (lex_token (lexer) == type)
 691     {
 692       lex_get (lexer);
 693       return true;
 694     }
 695   else
 696     return false;
 697 }
 698
 699 /* If the current token matches IDENTIFIER, skips it and returns true.
 700    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 701    returns false.
 702
 703    IDENTIFIER must be an ASCII string. */
 704 bool
 705 lex_match_id (struct lexer *lexer, const char *identifier)
 706 {
 707   return lex_match_id_n (lexer, identifier, 3);
 708 }
 709
 710 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 711    may be abbreviated to its first N letters.  Otherwise, returns false.
 712
 713    IDENTIFIER must be an ASCII string. */
 714 bool
 715 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 716 {
 717   if (lex_token (lexer) == T_ID
 718       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 719     {
 720       lex_get (lexer);
 721       return true;
 722     }
 723   else
 724     return false;
 725 }
 726
 727 /* If the current token is integer X, skips it and returns true.  Otherwise,
 728    returns false. */
 729 bool
 730 lex_match_int (struct lexer *lexer, int x)
 731 {
 732   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 733     {
 734       lex_get (lexer);
 735       return true;
 736     }
 737   else
 738     return false;
 739 }
 740 \f
 741 /* Forced matches. */
 742
 743 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 744    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 745    false.
 746
 747    IDENTIFIER must be an ASCII string. */
 748 bool
 749 lex_force_match_id (struct lexer *lexer, const char *identifier)
 750 {
 751   if (lex_match_id (lexer, identifier))
 752     return true;
 753   else
 754     {
 755       lex_error_expecting (lexer, identifier);
 756       return false;
 757     }
 758 }
 759
 760 /* If the current token has the specified TYPE, skips it and returns true.
 761    Otherwise, reports an error and returns false. */
 762 bool
 763 lex_force_match (struct lexer *lexer, enum token_type type)
 764 {
 765   if (lex_token (lexer) == type)
 766     {
 767       lex_get (lexer);
 768       return true;
 769     }
 770   else
 771     {
 772       const char *type_string = token_type_to_string (type);
 773       if (type_string)
 774         {
 775           char *s = xasprintf ("`%s'", type_string);
 776           lex_error_expecting (lexer, s);
 777           free (s);
 778         }
 779       else
 780         lex_error_expecting (lexer, token_type_to_name (type));
 781
 782       return false;
 783     }
 784 }
 785
 786 /* If the current token is a string, does nothing and returns true.
 787    Otherwise, reports an error and returns false. */
 788 bool
 789 lex_force_string (struct lexer *lexer)
 790 {
 791   if (lex_is_string (lexer))
 792     return true;
 793   else
 794     {
 795       lex_error (lexer, _("expecting string"));
 796       return false;
 797     }
 798 }
 799
 800 /* If the current token is a string or an identifier, does nothing and returns
 801    true.  Otherwise, reports an error and returns false.
 802
 803    This is meant for use in syntactic situations where we want to encourage the
 804    user to supply a quoted string, but for compatibility we also accept
 805    identifiers.  (One example of such a situation is file names.)  Therefore,
 806    the error message issued when the current token is wrong only says that a
 807    string is expected and doesn't mention that an identifier would also be
 808    accepted. */
 809 bool
 810 lex_force_string_or_id (struct lexer *lexer)
 811 {
 812   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 813 }
 814
 815 /* If the current token is an integer, does nothing and returns true.
 816    Otherwise, reports an error and returns false. */
 817 bool
 818 lex_force_int (struct lexer *lexer)
 819 {
 820   if (lex_is_integer (lexer))
 821     return true;
 822   else
 823     {
 824       lex_error (lexer, _("expecting integer"));
 825       return false;
 826     }
 827 }
 828
 829 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 830    nothing and returns true.  Otherwise, reports an error and returns false.
 831    If NAME is nonnull, then it is used in the error message. */
 832 bool
 833 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 834 {
 835   bool is_integer = lex_is_integer (lexer);
 836   bool too_small = is_integer && lex_integer (lexer) < min;
 837   bool too_big = is_integer && lex_integer (lexer) > max;
 838   if (is_integer && !too_small && !too_big)
 839     return true;
 840
 841   if (min > max)
 842     {
 843       /* Weird, maybe a bug in the caller.  Just report that we needed an
 844          integer. */
 845       if (name)
 846         lex_error (lexer, _("Integer expected for %s."), name);
 847       else
 848         lex_error (lexer, _("Integer expected."));
 849     }
 850   else if (min == max)
 851     {
 852       if (name)
 853         lex_error (lexer, _("Expected %ld for %s."), min, name);
 854       else
 855         lex_error (lexer, _("Expected %ld."), min);
 856     }
 857   else if (min + 1 == max)
 858     {
 859       if (name)
 860         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 861       else
 862         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 863     }
 864   else
 865     {
 866       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 867       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 868
 869       if (report_lower_bound && report_upper_bound)
 870         {
 871           if (name)
 872             lex_error (lexer,
 873                        _("Expected integer between %ld and %ld for %s."),
 874                        min, max, name);
 875           else
 876             lex_error (lexer, _("Expected integer between %ld and %ld."),
 877                        min, max);
 878         }
 879       else if (report_lower_bound)
 880         {
 881           if (min == 0)
 882             {
 883               if (name)
 884                 lex_error (lexer, _("Expected non-negative integer for %s."),
 885                            name);
 886               else
 887                 lex_error (lexer, _("Expected non-negative integer."));
 888             }
 889           else if (min == 1)
 890             {
 891               if (name)
 892                 lex_error (lexer, _("Expected positive integer for %s."),
 893                            name);
 894               else
 895                 lex_error (lexer, _("Expected positive integer."));
 896             }
 897         }
 898       else if (report_upper_bound)
 899         {
 900           if (name)
 901             lex_error (lexer,
 902                        _("Expected integer less than or equal to %ld for %s."),
 903                        max, name);
 904           else
 905             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 906                        max);
 907         }
 908       else
 909         {
 910           if (name)
 911             lex_error (lexer, _("Integer expected for %s."), name);
 912           else
 913             lex_error (lexer, _("Integer expected."));
 914         }
 915     }
 916   return false;
 917 }
 918
 919 /* If the current token is a number, does nothing and returns true.
 920    Otherwise, reports an error and returns false. */
 921 bool
 922 lex_force_num (struct lexer *lexer)
 923 {
 924   if (lex_is_number (lexer))
 925     return true;
 926
 927   lex_error (lexer, _("expecting number"));
 928   return false;
 929 }
 930
 931 /* If the current token is an identifier, does nothing and returns true.
 932    Otherwise, reports an error and returns false. */
 933 bool
 934 lex_force_id (struct lexer *lexer)
 935 {
 936   if (lex_token (lexer) == T_ID)
 937     return true;
 938
 939   lex_error (lexer, _("expecting identifier"));
 940   return false;
 941 }
 942 \f
 943 /* Token accessors. */
 944
 945 /* Returns the type of LEXER's current token. */
 946 enum token_type
 947 lex_token (const struct lexer *lexer)
 948 {
 949   return lex_next_token (lexer, 0);
 950 }
 951
 952 /* Returns the number in LEXER's current token.
 953
 954    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 955    tokens this function will always return zero. */
 956 double
 957 lex_tokval (const struct lexer *lexer)
 958 {
 959   return lex_next_tokval (lexer, 0);
 960 }
 961
 962 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 963
 964    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 965    this functions this function will always return NULL.
 966
 967    The UTF-8 encoding of the returned string is correct for variable names and
 968    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 969    data_in() to use it in a "union value".  */
 970 const char *
 971 lex_tokcstr (const struct lexer *lexer)
 972 {
 973   return lex_next_tokcstr (lexer, 0);
 974 }
 975
 976 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 977    null-terminated (but the null terminator is not included in the returned
 978    substring's 'length').
 979
 980    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 981    this functions this function will always return NULL.
 982
 983    The UTF-8 encoding of the returned string is correct for variable names and
 984    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 985    data_in() to use it in a "union value".  */
 986 struct substring
 987 lex_tokss (const struct lexer *lexer)
 988 {
 989   return lex_next_tokss (lexer, 0);
 990 }
 991 \f
 992 /* Looking ahead.
 993
 994    A value of 0 for N as an argument to any of these functions refers to the
 995    current token.  Lookahead is limited to the current command.  Any N greater
 996    than the number of tokens remaining in the current command will be treated
 997    as referring to a T_ENDCMD token. */
 998
 999 static const struct lex_token *
1000 lex_next__ (const struct lexer *lexer_, int n)
1001 {
1002   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1003   struct lex_source *src = lex_source__ (lexer);
1004
1005   if (src != NULL)
1006     return lex_source_next__ (src, n);
1007   else
1008     {
1009       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1010       return &stop_token;
1011     }
1012 }
1013
1014 static const struct lex_token *
1015 lex_source_next__ (const struct lex_source *src_, int n)
1016 {
1017   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1018
1019   if (n < 0)
1020     {
1021       if (-n <= src->parse_ofs)
1022         return src->parse[src->parse_ofs - (-n)];
1023       else
1024         {
1025           static const struct lex_token endcmd_token
1026             = { .token = { .type = T_ENDCMD } };
1027           return &endcmd_token;
1028         }
1029     }
1030
1031   while (src->n_parse - src->parse_ofs <= n)
1032     {
1033       if (src->n_parse > 0)
1034         {
1035           const struct lex_token *t = src->parse[src->n_parse - 1];
1036           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1037             return t;
1038         }
1039
1040       lex_source_get_parse (src);
1041     }
1042
1043   return src->parse[src->parse_ofs + n];
1044 }
1045
1046 /* Returns the "struct token" of the token N after the current one in LEXER.
1047    The returned pointer can be invalidated by pretty much any succeeding call
1048    into the lexer, although the string pointer within the returned token is
1049    only invalidated by consuming the token (e.g. with lex_get()). */
1050 const struct token *
1051 lex_next (const struct lexer *lexer, int n)
1052 {
1053   return &lex_next__ (lexer, n)->token;
1054 }
1055
1056 /* Returns the type of the token N after the current one in LEXER. */
1057 enum token_type
1058 lex_next_token (const struct lexer *lexer, int n)
1059 {
1060   return lex_next (lexer, n)->type;
1061 }
1062
/* Returns the numeric value of the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens carry a meaningful value.  For any
   other token this function always returns zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1072
1073 /* Returns the null-terminated string in the token N after the current one, in
1074    UTF-8 encoding.
1075
1076    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1077    this functions this function will always return NULL.
1078
1079    The UTF-8 encoding of the returned string is correct for variable names and
1080    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1081    data_in() to use it in a "union value".  */
1082 const char *
1083 lex_next_tokcstr (const struct lexer *lexer, int n)
1084 {
1085   return lex_next_tokss (lexer, n).string;
1086 }
1087
1088 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1089    The string is null-terminated (but the null terminator is not included in
1090    the returned substring's 'length').
1091
1092    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1093    tokens this functions this function will always return NULL.
1094
1095    The UTF-8 encoding of the returned string is correct for variable names and
1096    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1097    data_in() to use it in a "union value".  */
1098 struct substring
1099 lex_next_tokss (const struct lexer *lexer, int n)
1100 {
1101   return lex_next (lexer, n)->string;
1102 }
1103
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no source, returns a newly allocated empty string. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source__() returns NULL when there is no source, and
     lex_source_get_syntax__() would dereference it, so handle that case here
     the same way the other accessors in this file do. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1116
1117 /* Returns true if the token N ahead of the current one was produced by macro
1118    expansion, false otherwise. */
1119 bool
1120 lex_next_is_from_macro (const struct lexer *lexer, int n)
1121 {
1122   return lex_next__ (lexer, n)->macro_rep != NULL;
1123 }
1124
1125 static bool
1126 lex_tokens_match (const struct token *actual, const struct token *expected)
1127 {
1128   if (actual->type != expected->type)
1129     return false;
1130
1131   switch (actual->type)
1132     {
1133     case T_POS_NUM:
1134     case T_NEG_NUM:
1135       return actual->number == expected->number;
1136
1137     case T_ID:
1138       return lex_id_match (expected->string, actual->string);
1139
1140     case T_STRING:
1141       return (actual->string.length == expected->string.length
1142               && !memcmp (actual->string.string, expected->string.string,
1143                           actual->string.length));
1144
1145     default:
1146       return true;
1147     }
1148 }
1149
1150 static size_t
1151 lex_at_phrase__ (struct lexer *lexer, const char *s)
1152 {
1153   struct string_lexer slex;
1154   struct token token;
1155
1156   size_t i = 0;
1157   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1158   while (string_lexer_next (&slex, &token))
1159     {
1160       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1161       token_uninit (&token);
1162       if (!match)
1163         return 0;
1164     }
1165   return i;
1166 }
1167
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Nothing is consumed.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens != 0;
}
1179
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips past those tokens and returns true.  Otherwise, consumes nothing and
   returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1194
/* Returns the number of '\n' bytes among the LENGTH bytes starting at S. */
static int
count_newlines (char *s, size_t length)
{
  char *end = s + length;
  int count = 0;

  /* Each pass jumps to the next new-line at or after P, then steps over it. */
  for (char *p = s; (p = memchr (p, '\n', end - p)) != NULL; p++)
    count++;

  return count;
}
1210
1211 static int
1212 lex_token_get_last_line_number (const struct lex_source *src,
1213                                 const struct lex_token *token)
1214 {
1215   if (token->first_line == 0)
1216     return 0;
1217   else
1218     {
1219       char *token_str = &src->buffer[token->token_pos];
1220       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1221     }
1222 }
1223
1224 static int
1225 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1226 {
1227   const char *newline = memrchr (src->buffer, '\n', offset);
1228   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1229   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1230 }
1231
1232 static int
1233 lex_token_get_first_column (const struct lex_source *src,
1234                             const struct lex_token *token)
1235 {
1236   return lex_token_get_column__ (src, token->token_pos);
1237 }
1238
1239 static int
1240 lex_token_get_last_column (const struct lex_source *src,
1241                            const struct lex_token *token)
1242 {
1243   return lex_token_get_column__ (src, token->token_pos + token->token_len);
1244 }
1245
1246 static struct msg_location
1247 lex_token_location (const struct lex_source *src,
1248                     const struct lex_token *t0,
1249                     const struct lex_token *t1)
1250 {
1251   return (struct msg_location) {
1252     .file_name = intern_new_if_nonnull (src->reader->file_name),
1253     .first_line = t0->first_line,
1254     .last_line = lex_token_get_last_line_number (src, t1),
1255     .first_column = lex_token_get_first_column (src, t0),
1256     .last_column = lex_token_get_last_column (src, t1),
1257   };
1258 }
1259
1260 static struct msg_location *
1261 lex_token_location_rw (const struct lex_source *src,
1262                        const struct lex_token *t0,
1263                        const struct lex_token *t1)
1264 {
1265   struct msg_location location = lex_token_location (src, t0, t1);
1266   return msg_location_dup (&location);
1267 }
1268
/* Returns a location, obtained from lex_token_location_rw(), that spans the
   tokens at offsets N0 through N1 from the current token in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1276
1277 /* Returns the 1-based line number of the start of the syntax that represents
1278    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1279    if the token is drawn from a source that does not have line numbers. */
1280 int
1281 lex_get_first_line_number (const struct lexer *lexer, int n)
1282 {
1283   const struct lex_source *src = lex_source__ (lexer);
1284   return src ? lex_source_next__ (src, n)->first_line : 0;
1285 }
1286
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if the token is drawn from a source that does not have line
   numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1304
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 for a T_STOP
   token.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1318
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1332
1333 /* Returns the name of the syntax file from which the current command is drawn.
1334    Returns NULL for a T_STOP token or if the command's source does not have
1335    line numbers.
1336
1337    There is no version of this function that takes an N argument because
1338    lookahead only works to the end of a command and any given command is always
1339    within a single syntax file. */
1340 const char *
1341 lex_get_file_name (const struct lexer *lexer)
1342 {
1343   struct lex_source *src = lex_source__ (lexer);
1344   return src == NULL ? NULL : src->reader->file_name;
1345 }
1346
1347 /* Returns a newly allocated msg_location for the syntax that represents tokens
1348    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1349    must eventually free the location (with msg_location_destroy()). */
1350 struct msg_location *
1351 lex_get_location (const struct lexer *lexer, int n0, int n1)
1352 {
1353   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1354   loc->first_column = lex_get_first_column (lexer, n0);
1355   loc->last_column = lex_get_last_column (lexer, n1);
1356   return loc;
1357 }
1358
1359 /* Returns a newly allocated msg_location for the syntax that represents tokens
1360    with 0-based offsets N0...N1, inclusive, from the current token.  The
1361    location only covers the tokens' lines, not the columns.  The caller must
1362    eventually free the location (with msg_location_destroy()). */
1363 struct msg_location *
1364 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1365 {
1366   struct msg_location *loc = xmalloc (sizeof *loc);
1367   *loc = (struct msg_location) {
1368     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1369     .first_line = lex_get_first_line_number (lexer, n0),
1370     .last_line = lex_get_last_line_number (lexer, n1),
1371   };
1372   return loc;
1373 }
1374
1375 const char *
1376 lex_get_encoding (const struct lexer *lexer)
1377 {
1378   struct lex_source *src = lex_source__ (lexer);
1379   return src == NULL ? NULL : src->reader->encoding;
1380 }
1381
1382 /* Returns the syntax mode for the syntax file from which the current drawn is
1383    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1384    does not have line numbers.
1385
1386    There is no version of this function that takes an N argument because
1387    lookahead only works to the end of a command and any given command is always
1388    within a single syntax file. */
1389 enum segmenter_mode
1390 lex_get_syntax_mode (const struct lexer *lexer)
1391 {
1392   struct lex_source *src = lex_source__ (lexer);
1393   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1394 }
1395
1396 /* Returns the error mode for the syntax file from which the current drawn is
1397    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1398    source does not have line numbers.
1399
1400    There is no version of this function that takes an N argument because
1401    lookahead only works to the end of a command and any given command is always
1402    within a single syntax file. */
1403 enum lex_error_mode
1404 lex_get_error_mode (const struct lexer *lexer)
1405 {
1406   struct lex_source *src = lex_source__ (lexer);
1407   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1408 }
1409
1410 /* If the source that LEXER is currently reading has error mode
1411    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1412    token to be read comes directly from whatever is next read from the stream.
1413
1414    It makes sense to call this function after encountering an error in a
1415    command entered on the console, because usually the user would prefer not to
1416    have cascading errors. */
1417 void
1418 lex_interactive_reset (struct lexer *lexer)
1419 {
1420   struct lex_source *src = lex_source__ (lexer);
1421   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1422     {
1423       src->length = 0;
1424       src->journal_pos = src->seg_pos = 0;
1425       src->n_newlines = 0;
1426       src->suppress_next_newline = false;
1427       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1428                                        false);
1429       lex_stage_clear (&src->pp);
1430       lex_stage_clear (&src->merge);
1431       lex_source_clear_parse (src);
1432       lex_source_push_endcmd__ (src);
1433     }
1434 }
1435
1436 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1437 void
1438 lex_discard_rest_of_command (struct lexer *lexer)
1439 {
1440   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1441     lex_get (lexer);
1442 }
1443
1444 /* Discards all lookahead tokens in LEXER, then discards all input sources
1445    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1446    runs out of input sources. */
1447 void
1448 lex_discard_noninteractive (struct lexer *lexer)
1449 {
1450   struct lex_source *src = lex_source__ (lexer);
1451
1452   if (src != NULL)
1453     {
1454       lex_stage_clear (&src->pp);
1455       lex_stage_clear (&src->merge);
1456       lex_source_clear_parse (src);
1457
1458       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1459            src = lex_source__ (lexer))
1460         lex_source_destroy (src);
1461     }
1462 }
1463 \f
1464 static void
1465 lex_source_expand__ (struct lex_source *src)
1466 {
1467   if (src->length >= src->allocated)
1468     src->buffer = x2realloc (src->buffer, &src->allocated);
1469 }
1470
1471 static void
1472 lex_source_read__ (struct lex_source *src)
1473 {
1474   do
1475     {
1476       lex_source_expand__ (src);
1477
1478       size_t space = src->allocated - src->length;
1479       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1480       size_t n = src->reader->class->read (src->reader,
1481                                            &src->buffer[src->length],
1482                                            space, prompt);
1483       assert (n <= space);
1484
1485       if (n == 0)
1486         {
1487           /* End of input. */
1488           src->reader->eof = true;
1489           lex_source_expand__ (src);
1490           return;
1491         }
1492
1493       src->length += n;
1494     }
1495   while (!memchr (&src->buffer[src->seg_pos], '\n',
1496                   src->length - src->seg_pos));
1497 }
1498
1499 static struct lex_source *
1500 lex_source__ (const struct lexer *lexer)
1501 {
1502   return (ll_is_empty (&lexer->sources) ? NULL
1503           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1504 }
1505
1506 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1507    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1508    and N1 are both zero, this requests the syntax for the current token.)  The
1509    caller must eventually free the returned string (with free()).  The syntax
1510    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1511    for example, it may include comments, spaces, and new-lines if it spans
1512    multiple tokens.  Macro expansion, however, has already been performed. */
1513 static char *
1514 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1515 {
1516   struct string s = DS_EMPTY_INITIALIZER;
1517   for (size_t i = n0; i <= n1; )
1518     {
1519       /* Find [I,J) as the longest sequence of tokens not produced by macro
1520          expansion, or otherwise the longest sequence expanded from a single
1521          macro call. */
1522       const struct lex_token *first = lex_source_next__ (src, i);
1523       size_t j;
1524       for (j = i + 1; j <= n1; j++)
1525         {
1526           const struct lex_token *cur = lex_source_next__ (src, j);
1527           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1528               || first->macro_rep != cur->macro_rep)
1529             break;
1530         }
1531       const struct lex_token *last = lex_source_next__ (src, j - 1);
1532
1533       /* Now add the syntax for this sequence of tokens to SRC. */
1534       if (!ds_is_empty (&s))
1535         ds_put_byte (&s, ' ');
1536       if (!first->macro_rep)
1537         {
1538           size_t start = first->token_pos;
1539           size_t end = last->token_pos + last->token_len;
1540           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1541         }
1542       else
1543         {
1544           size_t start = first->ofs;
1545           size_t end = last->ofs + last->len;
1546           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1547                                            end - start));
1548         }
1549
1550       i = j;
1551     }
1552   return ds_steal_cstr (&s);
1553 }
1554
1555 static bool
1556 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1557 {
1558   for (size_t i = n0; i <= n1; i++)
1559     if (lex_source_next__ (src, i)->macro_rep)
1560       return true;
1561   return false;
1562 }
1563
1564 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1565    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1566    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1567    the original form supplied to the lexer so that, for example, it may include
1568    comments, spaces, and new-lines if it spans multiple tokens.
1569
1570    Returns an empty string if the token range doesn't include a macro call.
1571
1572    The caller must not modify or free the returned string. */
1573 static struct substring
1574 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1575 {
1576   if (!lex_source_contains_macro_call (src, n0, n1))
1577     return ss_empty ();
1578
1579   const struct lex_token *token0 = lex_source_next__ (src, n0);
1580   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1581   size_t start = token0->token_pos;
1582   size_t end = token1->token_pos + token1->token_len;
1583
1584   return ss_buffer (&src->buffer[start], end - start);
1585 }
1586
1587 static void
1588 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1589                          const char *format, va_list args)
1590 {
1591   const struct lex_token *token;
1592   struct string s;
1593
1594   ds_init_empty (&s);
1595
1596   token = lex_source_next__ (src, n0);
1597   if (token->token.type == T_ENDCMD)
1598     ds_put_cstr (&s, _("Syntax error at end of command"));
1599   else
1600     {
1601       /* Get the syntax that caused the error. */
1602       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1603       char syntax[64];
1604       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1605       free (raw_syntax);
1606
1607       /* Get the macro call(s) that expanded to the syntax that caused the
1608          error. */
1609       char call[64];
1610       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1611                      call, sizeof call);
1612
1613       if (syntax[0])
1614         {
1615           if (call[0])
1616             ds_put_format (&s,
1617                            _("Syntax error at `%s' (in expansion of `%s')"),
1618                            syntax, call);
1619           else
1620             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1621         }
1622       else
1623         {
1624           if (call[0])
1625             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1626                            call);
1627           else
1628             ds_put_cstr (&s, _("Syntax error"));
1629         }
1630     }
1631
1632   if (format)
1633     {
1634       ds_put_cstr (&s, ": ");
1635       ds_put_vformat (&s, format, args);
1636     }
1637   if (ds_last (&s) != '.')
1638     ds_put_byte (&s, '.');
1639
1640   struct msg *m = xmalloc (sizeof *m);
1641   *m = (struct msg) {
1642     .category = MSG_C_SYNTAX,
1643     .severity = MSG_S_ERROR,
1644     .location = lex_source_get_location (src, n0, n1),
1645     .text = ds_steal_cstr (&s),
1646   };
1647   msg_emit (m);
1648 }
1649
1650 static void
1651 lex_get_error (struct lex_source *src, const struct lex_token *token)
1652 {
1653   char syntax[64];
1654   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1655                  syntax, sizeof syntax);
1656
1657   struct string s = DS_EMPTY_INITIALIZER;
1658   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1659   ds_put_format (&s, ": %s", token->token.string.string);
1660
1661   struct msg *m = xmalloc (sizeof *m);
1662   *m = (struct msg) {
1663     .category = MSG_C_SYNTAX,
1664     .severity = MSG_S_ERROR,
1665     .location = lex_token_location_rw (src, token, token),
1666     .text = ds_steal_cstr (&s),
1667   };
1668   msg_emit (m);
1669 }
1670
1671 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1672    underlying lex_reader if necessary.  Returns true if a new token was added
1673    to SRC's deque, false otherwise.  The caller should retry failures unless
1674    SRC's 'eof' marker was set to true indicating that there will be no more
1675    tokens from this source. */
1676 static bool
1677 lex_source_try_get_pp (struct lex_source *src)
1678 {
1679   /* Append a new token to SRC and initialize it. */
1680   struct lex_token *token = xmalloc (sizeof *token);
1681   token->token = (struct token) { .type = T_STOP };
1682   token->macro_rep = NULL;
1683   token->ref_cnt = NULL;
1684   token->token_pos = src->seg_pos;
1685   if (src->reader->line_number > 0)
1686     token->first_line = src->reader->line_number + src->n_newlines;
1687   else
1688     token->first_line = 0;
1689
1690   /* Extract a segment. */
1691   const char *segment;
1692   enum segment_type seg_type;
1693   int seg_len;
1694   for (;;)
1695     {
1696       segment = &src->buffer[src->seg_pos];
1697       seg_len = segmenter_push (&src->segmenter, segment,
1698                                 src->length - src->seg_pos,
1699                                 src->reader->eof, &seg_type);
1700       if (seg_len >= 0)
1701         break;
1702
1703       /* The segmenter needs more input to produce a segment. */
1704       assert (!src->reader->eof);
1705       lex_source_read__ (src);
1706     }
1707
1708   /* Update state based on the segment. */
1709   token->token_len = seg_len;
1710   src->seg_pos += seg_len;
1711   if (seg_type == SEG_NEWLINE)
1712     src->n_newlines++;
1713
1714   /* Get a token from the segment. */
1715   enum tokenize_result result = token_from_segment (
1716     seg_type, ss_buffer (segment, seg_len), &token->token);
1717
1718   /* If we've reached the end of a line, or the end of a command, then pass
1719      the line to the output engine as a syntax text item.  */
1720   int n_lines = seg_type == SEG_NEWLINE;
1721   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1722     {
1723       n_lines++;
1724       src->suppress_next_newline = true;
1725     }
1726   else if (n_lines > 0 && src->suppress_next_newline)
1727     {
1728       n_lines--;
1729       src->suppress_next_newline = false;
1730     }
1731   for (int i = 0; i < n_lines; i++)
1732     {
1733       /* Beginning of line. */
1734       const char *line = &src->buffer[src->journal_pos];
1735
1736       /* Calculate line length, including \n or \r\n end-of-line if present.
1737
1738          We use src->head even though that may be beyond what we've actually
1739          converted to tokens (which is only through line_pos).  That's because,
1740          if we're emitting the line due to SEG_END_COMMAND, we want to take the
1741          whole line through the newline, not just through the '.'. */
1742       size_t max_len = src->length - src->journal_pos;
1743       const char *newline = memchr (line, '\n', max_len);
1744       size_t line_len = newline ? newline - line + 1 : max_len;
1745
1746       /* Calculate line length excluding end-of-line. */
1747       size_t copy_len = line_len;
1748       if (copy_len > 0 && line[copy_len - 1] == '\n')
1749         copy_len--;
1750       if (copy_len > 0 && line[copy_len - 1] == '\r')
1751         copy_len--;
1752
1753       /* Submit the line as syntax. */
1754       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1755                                                    xmemdup0 (line, copy_len),
1756                                                    NULL));
1757
1758       src->journal_pos += line_len;
1759     }
1760
1761   switch (result)
1762     {
1763     case TOKENIZE_ERROR:
1764       lex_get_error (src, token);
1765       /* Fall through. */
1766     case TOKENIZE_EMPTY:
1767       lex_token_destroy (token);
1768       return false;
1769
1770     case TOKENIZE_TOKEN:
1771       if (token->token.type == T_STOP)
1772         {
1773           token->token.type = T_ENDCMD;
1774           src->eof = true;
1775         }
1776       lex_stage_push_last (&src->pp, token);
1777       return true;
1778     }
1779   NOT_REACHED ();
1780 }
1781
1782 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1783    failure.  On failure, the end of SRC has been reached and no more tokens
1784    will be forthcoming from it.
1785
1786    Does not make the new token available for lookahead yet; the caller must
1787    adjust SRC's 'middle' pointer to do so. */
1788 static bool
1789 lex_source_get_pp (struct lex_source *src)
1790 {
1791   while (!src->eof)
1792     if (lex_source_try_get_pp (src))
1793       return true;
1794   return false;
1795 }
1796
/* Makes one attempt to move tokens from the 'pp' stage of SRC_ into its
   'merge' stage, expanding a macro call at the head of 'pp' if there is one.

   Returns true if at least one token was added to 'merge'.  Returns false if
   'pp' was empty and no further token could be read, or if a macro call was
   consumed but expanded to zero tokens.  In the latter case the caller may
   simply try again (lex_source_get_merge() does so until 'src->eof').

   SRC_ is declared const, but this function modifies it through a cast. */
static bool
lex_source_try_get_merge (const struct lex_source *src_)
{
  struct lex_source *src = CONST_CAST (struct lex_source *, src_);

  /* Make sure 'pp' holds at least one token to work with. */
  if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
    return false;

  /* With macro expansion disabled, everything in 'pp' passes straight
     through to 'merge'. */
  if (!settings_get_mexpand ())
    {
      lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
      return true;
    }

  /* Now pass tokens one-by-one to the macro expander.

     In the common case where there is no macro to expand, the loop is not
     entered.

     N_CALL is 0 while the expander needs more tokens, negative if the tokens
     turn out not to be a macro call, and otherwise the number of tokens at
     the head of 'pp' that make up the call. */
  struct macro_call *mc;
  int n_call = macro_call_create (src->lexer->macros,
                                  &lex_stage_first (&src->pp)->token, &mc);
  for (int ofs = 1; !n_call; ofs++)
    {
      if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
        {
          /* This should not be reachable because we always get a T_ENDCMD at
             the end of an input file (transformed from T_STOP by
             lex_source_try_get_pp()) and the macro_expander should always
             terminate expansion on T_ENDCMD. */
          NOT_REACHED ();
        }

      /* Hand the expander the token at offset OFS together with the slice of
         the source buffer that it was read from. */
      const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
      size_t start = t->token_pos;
      size_t end = t->token_pos + t->token_len;
      const struct macro_token mt = {
        .token = t->token,
        .syntax = ss_buffer (&src->buffer[start], end - start),
      };
      const struct msg_location loc = lex_token_location (src, t, t);
      n_call = macro_call_add (mc, &mt, &loc);
    }
  if (n_call < 0)
    {
      /* False alarm: no macro expansion after all.  Use first token as
         lookahead.  We'll retry macro expansion from the second token next
         time around. */
      macro_call_destroy (mc);
      lex_stage_shift (&src->merge, &src->pp, 1);
      return true;
    }

  /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
     are a macro call.  (These are likely to be the only tokens in 'pp'.)
     Expand them.  */
  const struct lex_token *c0 = lex_stage_first (&src->pp);
  const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
  struct macro_tokens expansion = { .n = 0 };
  struct msg_location loc = lex_token_location (src, c0, c1);
  macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
  macro_call_destroy (mc);

  /* Convert the macro expansion into syntax for possible error messages
     later.  OFS[i] and LEN[i] are stored in token I below as its 'ofs' and
     'len'; presumably they locate that token's text within S -- confirm
     against macro_tokens_to_syntax(). */
  size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
  size_t *len = xnmalloc (expansion.n, sizeof *len);
  struct string s = DS_EMPTY_INITIALIZER;
  macro_tokens_to_syntax (&expansion, &s, ofs, len);

  if (settings_get_mprint ())
    output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
                                          _("Macro Expansion")));

  /* Append the macro expansion tokens to the lookahead. */
  if (expansion.n > 0)
    {
      /* Every token from this expansion points at the same copy of the
         expansion text and at the same counter, which starts out equal to
         the number of tokens that refer to it. */
      char *macro_rep = ds_steal_cstr (&s);
      size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
      *ref_cnt = expansion.n;
      for (size_t i = 0; i < expansion.n; i++)
        {
          /* Each expanded token is given the source position of the whole
             macro call, C0 through C1. */
          struct lex_token *token = xmalloc (sizeof *token);
          *token = (struct lex_token) {
            .token = expansion.mts[i].token,
            .token_pos = c0->token_pos,
            .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
            .first_line = c0->first_line,
            .macro_rep = macro_rep,
            .ofs = ofs[i],
            .len = len[i],
            .ref_cnt = ref_cnt,
          };
          lex_stage_push_last (&src->merge, token);

          ss_dealloc (&expansion.mts[i].syntax);
        }
    }
  else
    ds_destroy (&s);
  free (expansion.mts);
  free (ofs);
  free (len);

  /* Destroy the tokens for the call.  This must come last because C0 and C1
     point at tokens that are still in 'pp'. */
  for (size_t i = 0; i < n_call; i++)
    lex_stage_pop_first (&src->pp);

  return expansion.n > 0;
}
1906
1907 /* Attempts to obtain at least one new token into 'merge' in SRC.
1908
1909    Returns true if successful, false on failure.  In the latter case, SRC is
1910    exhausted and 'src->eof' is now true. */
1911 static bool
1912 lex_source_get_merge (struct lex_source *src)
1913 {
1914   while (!src->eof)
1915     if (lex_source_try_get_merge (src))
1916       return true;
1917   return false;
1918 }
1919
1920 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1921
1922    Returns true if successful, false on failure.  In the latter case, SRC is
1923    exhausted and 'src->eof' is now true. */
1924 static bool
1925 lex_source_get_parse (struct lex_source *src)
1926 {
1927   struct merger m = MERGER_INIT;
1928   struct token out;
1929   for (size_t i = 0; ; i++)
1930     {
1931       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1932         {
1933           /* We always get a T_ENDCMD at the end of an input file
1934              (transformed from T_STOP by lex_source_try_get_pp()) and
1935              merger_add() should never return -1 on T_ENDCMD. */
1936           assert (lex_stage_is_empty (&src->merge));
1937           return false;
1938         }
1939
1940       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1941                                &out);
1942       if (!retval)
1943         {
1944           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1945           return true;
1946         }
1947       else if (retval > 0)
1948         {
1949           /* Add a token that merges all the tokens together. */
1950           const struct lex_token *first = lex_stage_first (&src->merge);
1951           const struct lex_token *last = lex_stage_nth (&src->merge,
1952                                                         retval - 1);
1953           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1954           struct lex_token *t = xmalloc (sizeof *t);
1955           *t = (struct lex_token) {
1956             .token = out,
1957             .token_pos = first->token_pos,
1958             .token_len = (last->token_pos - first->token_pos) + last->token_len,
1959             .first_line = first->first_line,
1960
1961             /* This works well if all the tokens were not expanded from macros,
1962                or if they came from the same macro expansion.  It just gives up
1963                in the other (corner) cases. */
1964             .macro_rep = macro ? first->macro_rep : NULL,
1965             .ofs = macro ? first->ofs : 0,
1966             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1967             .ref_cnt = macro ? first->ref_cnt : NULL,
1968           };
1969           if (t->ref_cnt)
1970             ++*t->ref_cnt;
1971           lex_source_push_parse (src, t);
1972
1973           for (int i = 0; i < retval; i++)
1974             lex_stage_pop_first (&src->merge);
1975           return true;
1976         }
1977     }
1978 }
1979 \f
1980 static void
1981 lex_source_push_endcmd__ (struct lex_source *src)
1982 {
1983   assert (src->n_parse == 0);
1984
1985   struct lex_token *token = xmalloc (sizeof *token);
1986   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1987   lex_source_push_parse (src, token);
1988 }
1989
1990 static void
1991 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
1992 {
1993   if (src->n_parse >= src->allocated_parse)
1994     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
1995                              sizeof *src->parse);
1996   src->parse[src->n_parse++] = token;
1997 }
1998
1999 static void
2000 lex_source_clear_parse (struct lex_source *src)
2001 {
2002   for (size_t i = 0; i < src->n_parse; i++)
2003     lex_token_destroy (src->parse[i]);
2004   src->n_parse = src->parse_ofs = 0;
2005 }
2006
2007 static struct lex_source *
2008 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2009 {
2010   struct lex_source *src = xmalloc (sizeof *src);
2011   *src = (struct lex_source) {
2012     .reader = reader,
2013     .segmenter = segmenter_init (reader->syntax, false),
2014     .lexer = lexer,
2015   };
2016
2017   lex_source_push_endcmd__ (src);
2018
2019   return src;
2020 }
2021
2022 static void
2023 lex_source_destroy (struct lex_source *src)
2024 {
2025   char *file_name = src->reader->file_name;
2026   char *encoding = src->reader->encoding;
2027   if (src->reader->class->destroy != NULL)
2028     src->reader->class->destroy (src->reader);
2029   free (file_name);
2030   free (encoding);
2031   free (src->buffer);
2032   lex_stage_uninit (&src->pp);
2033   lex_stage_uninit (&src->merge);
2034   lex_source_clear_parse (src);
2035   free (src->parse);
2036   ll_remove (&src->ll);
2037   free (src);
2038 }
2039 \f
/* A lex_reader that obtains its input from a u8_istream, which
   lex_reader_for_file() opens on a named file or on stdin. */
struct lex_file_reader
  {
    struct lex_reader reader;   /* Embedded base; see lex_file_reader_cast(). */
    struct u8_istream *istream; /* Stream that lex_file_read() reads from. */
  };

/* Forward declaration; the initializer follows the callbacks it names. */
static struct lex_reader_class lex_file_reader_class;
2047
2048 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2049    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2050    ENCODING, which should take one of the forms accepted by
2051    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2052    mode of the new reader, respectively.
2053
2054    Returns a null pointer if FILE_NAME cannot be opened. */
2055 struct lex_reader *
2056 lex_reader_for_file (const char *file_name, const char *encoding,
2057                      enum segmenter_mode syntax,
2058                      enum lex_error_mode error)
2059 {
2060   struct lex_file_reader *r;
2061   struct u8_istream *istream;
2062
2063   istream = (!strcmp(file_name, "-")
2064              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2065              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2066   if (istream == NULL)
2067     {
2068       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2069       return NULL;
2070     }
2071
2072   r = xmalloc (sizeof *r);
2073   lex_reader_init (&r->reader, &lex_file_reader_class);
2074   r->reader.syntax = syntax;
2075   r->reader.error = error;
2076   r->reader.file_name = xstrdup (file_name);
2077   r->reader.encoding = xstrdup_if_nonnull (encoding);
2078   r->reader.line_number = 1;
2079   r->istream = istream;
2080
2081   return &r->reader;
2082 }
2083
2084 static struct lex_file_reader *
2085 lex_file_reader_cast (struct lex_reader *r)
2086 {
2087   return UP_CAST (r, struct lex_file_reader, reader);
2088 }
2089
2090 static size_t
2091 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2092                enum prompt_style prompt_style UNUSED)
2093 {
2094   struct lex_file_reader *r = lex_file_reader_cast (r_);
2095   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2096   if (n_read < 0)
2097     {
2098       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2099       return 0;
2100     }
2101   return n_read;
2102 }
2103
2104 static void
2105 lex_file_close (struct lex_reader *r_)
2106 {
2107   struct lex_file_reader *r = lex_file_reader_cast (r_);
2108
2109   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2110     {
2111       if (u8_istream_close (r->istream) != 0)
2112         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2113     }
2114   else
2115     u8_istream_free (r->istream);
2116
2117   free (r);
2118 }
2119
/* Callback table for file readers, initialized positionally: first the
   function that reads input, then the function that closes and frees the
   reader. */
static struct lex_reader_class lex_file_reader_class =
  {
    lex_file_read,
    lex_file_close
  };
2125 \f
/* A lex_reader that hands out the contents of an in-memory substring. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Embedded base; see lex_string_reader_cast(). */
    struct substring s;         /* Text to read; freed by lex_string_close(). */
    size_t offset;              /* Number of bytes of S already handed out. */
  };

/* Forward declaration; the initializer follows the callbacks it names. */
static struct lex_reader_class lex_string_reader_class;
2134
2135 /* Creates and returns a new lex_reader for the contents of S, which must be
2136    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2137    with ss_dealloc() when it is closed. */
2138 struct lex_reader *
2139 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2140 {
2141   struct lex_string_reader *r;
2142
2143   r = xmalloc (sizeof *r);
2144   lex_reader_init (&r->reader, &lex_string_reader_class);
2145   r->reader.syntax = SEG_MODE_AUTO;
2146   r->reader.encoding = xstrdup_if_nonnull (encoding);
2147   r->s = s;
2148   r->offset = 0;
2149
2150   return &r->reader;
2151 }
2152
2153 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2154    which must be encoded in ENCODING.  The caller retains ownership of S. */
2155 struct lex_reader *
2156 lex_reader_for_string (const char *s, const char *encoding)
2157 {
2158   struct substring ss;
2159   ss_alloc_substring (&ss, ss_cstr (s));
2160   return lex_reader_for_substring_nocopy (ss, encoding);
2161 }
2162
/* Formats FORMAT as a printf()-like format string, taking its arguments from
   the variable arguments that follow ENCODING, and creates and returns a new
   lex_reader for the formatted result.  ENCODING is passed along as the
   encoding of the new reader. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes ownership of TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2177
2178 static struct lex_string_reader *
2179 lex_string_reader_cast (struct lex_reader *r)
2180 {
2181   return UP_CAST (r, struct lex_string_reader, reader);
2182 }
2183
2184 static size_t
2185 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2186                  enum prompt_style prompt_style UNUSED)
2187 {
2188   struct lex_string_reader *r = lex_string_reader_cast (r_);
2189   size_t chunk;
2190
2191   chunk = MIN (n, r->s.length - r->offset);
2192   memcpy (buf, r->s.string + r->offset, chunk);
2193   r->offset += chunk;
2194
2195   return chunk;
2196 }
2197
2198 static void
2199 lex_string_close (struct lex_reader *r_)
2200 {
2201   struct lex_string_reader *r = lex_string_reader_cast (r_);
2202
2203   ss_dealloc (&r->s);
2204   free (r);
2205 }
2206
/* Callback table for string readers, initialized positionally: first the
   function that reads input, then the function that frees the text and the
   reader. */
static struct lex_reader_class lex_string_reader_class =
  {
    lex_string_read,
    lex_string_close
  };