pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/output-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
/* A token within a lex_source, together with the information needed to relate
   it back to the syntax it came from. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call. */
    size_t token_pos;           /* Offset into src->buffer of token start. */
    size_t token_len;           /* Length of source for token in bytes. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' may be shared by several tokens:
       lex_token_destroy() decrements *ref_cnt and frees both of them only
       when the count drops to zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  82
  83 static void
  84 lex_token_destroy (struct lex_token *t)
  85 {
  86   token_uninit (&t->token);
  87   if (t->ref_cnt)
  88     {
  89       assert (*t->ref_cnt > 0);
  90       if (!--*t->ref_cnt)
  91         {
  92           free (t->macro_rep);
  93           free (t->ref_cnt);
  94         }
  95     }
  96   free (t);
  97 }
  98 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source.

   Tokens are added with lex_stage_push_last() and removed with
   lex_stage_pop_first(), so the "first" token is the oldest one. */
struct lex_stage
  {
    struct deque deque;         /* Bookkeeping for 'tokens'. */
    struct lex_token **tokens;  /* Token pointers, indexed via 'deque'. */
  };
 106
/* Forward declarations for the lex_stage helper functions. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

static struct lex_token *lex_stage_last (struct lex_stage *);
static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
 122
 123 /* Deletes all the tokens from STAGE. */
 124 static void
 125 lex_stage_clear (struct lex_stage *stage)
 126 {
 127   while (!deque_is_empty (&stage->deque))
 128     lex_stage_pop_first (stage);
 129 }
 130
 131 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 132 static void
 133 lex_stage_uninit (struct lex_stage *stage)
 134 {
 135   lex_stage_clear (stage);
 136   free (stage->tokens);
 137 }
 138
 139 /* Returns true if STAGE contains no tokens, otherwise false. */
 140 static bool
 141 lex_stage_is_empty (const struct lex_stage *stage)
 142 {
 143   return deque_is_empty (&stage->deque);
 144 }
 145
 146 /* Returns the number of tokens in STAGE. */
 147 static size_t
 148 lex_stage_count (const struct lex_stage *stage)
 149 {
 150   return deque_count (&stage->deque);
 151 }
 152
 153 /* Returns the last token in STAGE, which must be nonempty.  The last token is
 154    the one accessed with the greatest lookahead. */
 155 static struct lex_token *
 156 lex_stage_last (struct lex_stage *stage)
 157 {
 158   return stage->tokens[deque_front (&stage->deque, 0)];
 159 }
 160
 161 /* Returns the first token in STAGE, which must be nonempty.
 162    The first token is the one accessed with the least lookahead. */
 163 static struct lex_token *
 164 lex_stage_first (struct lex_stage *stage)
 165 {
 166   return lex_stage_nth (stage, 0);
 167 }
 168
 169 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 170    lookahead) is 0, the second token is 1, and so on.  There must be at least
 171    INDEX + 1 tokens in STAGE. */
 172 static struct lex_token *
 173 lex_stage_nth (struct lex_stage *stage, size_t index)
 174 {
 175   return stage->tokens[deque_back (&stage->deque, index)];
 176 }
 177
 178 /* Adds TOKEN so that it becomes the last token in STAGE. */
 179 static void
 180 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 181 {
 182   if (deque_is_full (&stage->deque))
 183     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 184                                   sizeof *stage->tokens);
 185   stage->tokens[deque_push_front (&stage->deque)] = token;
 186 }
 187
 188 /* Removes the first token from STAGE and uninitializes it. */
 189 static void
 190 lex_stage_pop_first (struct lex_stage *stage)
 191 {
 192   lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
 193 }
 194
 195 /* Removes the first N tokens from SRC, appending them to DST as the last
 196    tokens. */
 197 static void
 198 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 199 {
 200   for (size_t i = 0; i < n; i++)
 201     {
 202       lex_stage_push_last (dst, lex_stage_first (src));
 203       deque_pop_back (&src->deque);
 204     }
 205 }
 206
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct lexer *lexer;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;               /* Source file contents. */
    size_t length;              /* Number of bytes filled. */
    size_t allocated;           /* Number of bytes allocated. */

    /* Offsets into 'buffer'. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'lookahead'.

       - lookahead: Tokens available to the client for parsing. */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_stage lookahead;
  };
 250
/* Forward declarations for creating and destroying a lex_source. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 254
/* Lexer.

   Holds the list of syntax sources being read and the set of macros added
   through lex_define_macro(). */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Created in lex_create(), destroyed in
                                   lex_destroy(). */
  };
 261
/* Forward declarations for internal helpers used by the public functions
   below. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static bool lex_source_get_lookahead (struct lex_source *);
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 274 \f
 275 /* Initializes READER with the specified CLASS and otherwise some reasonable
 276    defaults.  The caller should fill in the others members as desired. */
 277 void
 278 lex_reader_init (struct lex_reader *reader,
 279                  const struct lex_reader_class *class)
 280 {
 281   reader->class = class;
 282   reader->syntax = SEG_MODE_AUTO;
 283   reader->error = LEX_ERROR_CONTINUE;
 284   reader->file_name = NULL;
 285   reader->encoding = NULL;
 286   reader->line_number = 0;
 287   reader->eof = false;
 288 }
 289
 290 /* Frees any file name already in READER and replaces it by a copy of
 291    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 292 void
 293 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 294 {
 295   free (reader->file_name);
 296   reader->file_name = xstrdup_if_nonnull (file_name);
 297 }
 298 \f
 299 /* Creates and returns a new lexer. */
 300 struct lexer *
 301 lex_create (void)
 302 {
 303   struct lexer *lexer = xmalloc (sizeof *lexer);
 304   *lexer = (struct lexer) {
 305     .sources = LL_INITIALIZER (lexer->sources),
 306     .macros = macro_set_create (),
 307   };
 308   return lexer;
 309 }
 310
 311 /* Destroys LEXER. */
 312 void
 313 lex_destroy (struct lexer *lexer)
 314 {
 315   if (lexer != NULL)
 316     {
 317       struct lex_source *source, *next;
 318
 319       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 320         lex_source_destroy (source);
 321       macro_set_destroy (lexer->macros);
 322       free (lexer);
 323     }
 324 }
 325
 326 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 327    same name.  Takes ownership of M. */
 328 void
 329 lex_define_macro (struct lexer *lexer, struct macro *m)
 330 {
 331   macro_set_add (lexer->macros, m);
 332 }
 333
 334 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 335    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 336    token. */
 337 void
 338 lex_include (struct lexer *lexer, struct lex_reader *reader)
 339 {
 340   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 341   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 342 }
 343
 344 /* Appends READER to LEXER, so that it will be read after all other current
 345    readers have already been read. */
 346 void
 347 lex_append (struct lexer *lexer, struct lex_reader *reader)
 348 {
 349   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 350 }
 351 \f
 352 /* Advancing. */
 353
 354 /* Advances LEXER to the next token, consuming the current token. */
 355 void
 356 lex_get (struct lexer *lexer)
 357 {
 358   struct lex_source *src;
 359
 360   src = lex_source__ (lexer);
 361   if (src == NULL)
 362     return;
 363
 364   if (!lex_stage_is_empty (&src->lookahead))
 365     lex_stage_pop_first (&src->lookahead);
 366
 367   while (lex_stage_is_empty (&src->lookahead))
 368     if (!lex_source_get_lookahead (src))
 369       {
 370         lex_source_destroy (src);
 371         src = lex_source__ (lexer);
 372         if (src == NULL)
 373           return;
 374       }
 375 }
 376
 377 /* Advances LEXER by N tokens. */
 378 void
 379 lex_get_n (struct lexer *lexer, size_t n)
 380 {
 381   while (n-- > 0)
 382     lex_get (lexer);
 383 }
 384 \f
 385 /* Issuing errors. */
 386
 387 /* Prints a syntax error message containing the current token and
 388    given message MESSAGE (if non-null). */
 389 void
 390 lex_error (struct lexer *lexer, const char *format, ...)
 391 {
 392   va_list args;
 393
 394   va_start (args, format);
 395   lex_next_error_valist (lexer, 0, 0, format, args);
 396   va_end (args);
 397 }
 398
 399 /* Prints a syntax error message containing the current token and
 400    given message MESSAGE (if non-null). */
 401 void
 402 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 403 {
 404   lex_next_error_valist (lexer, 0, 0, format, args);
 405 }
 406
 407 /* Prints a syntax error message containing the current token and
 408    given message MESSAGE (if non-null). */
 409 void
 410 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 411 {
 412   va_list args;
 413
 414   va_start (args, format);
 415   lex_next_error_valist (lexer, n0, n1, format, args);
 416   va_end (args);
 417 }
 418
 419 /* Prints a syntax error message saying that one of the strings provided as
 420    varargs, up to the first NULL, is expected. */
 421 void
 422 (lex_error_expecting) (struct lexer *lexer, ...)
 423 {
 424   va_list args;
 425
 426   va_start (args, lexer);
 427   lex_error_expecting_valist (lexer, args);
 428   va_end (args);
 429 }
 430
 431 /* Prints a syntax error message saying that one of the options provided in
 432    ARGS, up to the first NULL, is expected. */
 433 void
 434 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 435 {
 436   enum { MAX_OPTIONS = 9 };
 437   const char *options[MAX_OPTIONS];
 438   int n = 0;
 439   while (n < MAX_OPTIONS)
 440     {
 441       const char *option = va_arg (args, const char *);
 442       if (!option)
 443         break;
 444
 445       options[n++] = option;
 446     }
 447   lex_error_expecting_array (lexer, options, n);
 448 }
 449
 450 void
 451 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 452 {
 453   switch (n)
 454     {
 455     case 0:
 456       lex_error (lexer, NULL);
 457       break;
 458
 459     case 1:
 460       lex_error (lexer, _("expecting %s"), options[0]);
 461       break;
 462
 463     case 2:
 464       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 465       break;
 466
 467     case 3:
 468       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 469                  options[2]);
 470       break;
 471
 472     case 4:
 473       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 474                  options[0], options[1], options[2], options[3]);
 475       break;
 476
 477     case 5:
 478       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 479                  options[0], options[1], options[2], options[3], options[4]);
 480       break;
 481
 482     case 6:
 483       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 484                  options[0], options[1], options[2], options[3], options[4],
 485                  options[5]);
 486       break;
 487
 488     case 7:
 489       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 490                  options[0], options[1], options[2], options[3], options[4],
 491                  options[5], options[6]);
 492       break;
 493
 494     case 8:
 495       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 496                  options[0], options[1], options[2], options[3], options[4],
 497                  options[5], options[6], options[7]);
 498       break;
 499
 500     default:
 501       lex_error (lexer, NULL);
 502     }
 503 }
 504
 505 /* Reports an error to the effect that subcommand SBC may only be specified
 506    once.
 507
 508    This function does not take a lexer as an argument or use lex_error(),
 509    because the result would ordinarily just be redundant: "Syntax error at
 510    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 511    not help the user find the error. */
 512 void
 513 lex_sbc_only_once (const char *sbc)
 514 {
 515   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 516 }
 517
 518 /* Reports an error to the effect that subcommand SBC is missing.
 519
 520    This function does not take a lexer as an argument or use lex_error(),
 521    because a missing subcommand can normally be detected only after the whole
 522    command has been parsed, and so lex_error() would always report "Syntax
 523    error at end of command", which does not help the user find the error. */
 524 void
 525 lex_sbc_missing (const char *sbc)
 526 {
 527   msg (SE, _("Required subcommand %s was not specified."), sbc);
 528 }
 529
 530 /* Reports an error to the effect that specification SPEC may only be specified
 531    once within subcommand SBC. */
 532 void
 533 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 534 {
 535   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 536              spec, sbc);
 537 }
 538
 539 /* Reports an error to the effect that specification SPEC is missing within
 540    subcommand SBC. */
 541 void
 542 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 543 {
 544   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 545              sbc, spec);
 546 }
 547
 548 /* Prints a syntax error message containing the current token and
 549    given message MESSAGE (if non-null). */
 550 void
 551 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 552                        const char *format, va_list args)
 553 {
 554   struct lex_source *src = lex_source__ (lexer);
 555
 556   if (src != NULL)
 557     lex_source_error_valist (src, n0, n1, format, args);
 558   else
 559     {
 560       struct string s;
 561
 562       ds_init_empty (&s);
 563       ds_put_format (&s, _("Syntax error at end of input"));
 564       if (format != NULL)
 565         {
 566           ds_put_cstr (&s, ": ");
 567           ds_put_vformat (&s, format, args);
 568         }
 569       if (ds_last (&s) != '.')
 570         ds_put_byte (&s, '.');
 571       msg (SE, "%s", ds_cstr (&s));
 572       ds_destroy (&s);
 573     }
 574 }
 575
 576 /* Checks that we're at end of command.
 577    If so, returns a successful command completion code.
 578    If not, flags a syntax error and returns an error command
 579    completion code. */
 580 int
 581 lex_end_of_command (struct lexer *lexer)
 582 {
 583   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 584     {
 585       lex_error (lexer, _("expecting end of command"));
 586       return CMD_FAILURE;
 587     }
 588   else
 589     return CMD_SUCCESS;
 590 }
 591 \f
 592 /* Token testing functions. */
 593
 594 /* Returns true if the current token is a number. */
 595 bool
 596 lex_is_number (const struct lexer *lexer)
 597 {
 598   return lex_next_is_number (lexer, 0);
 599 }
 600
 601 /* Returns true if the current token is a string. */
 602 bool
 603 lex_is_string (const struct lexer *lexer)
 604 {
 605   return lex_next_is_string (lexer, 0);
 606 }
 607
 608 /* Returns the value of the current token, which must be a
 609    floating point number. */
 610 double
 611 lex_number (const struct lexer *lexer)
 612 {
 613   return lex_next_number (lexer, 0);
 614 }
 615
 616 /* Returns true iff the current token is an integer. */
 617 bool
 618 lex_is_integer (const struct lexer *lexer)
 619 {
 620   return lex_next_is_integer (lexer, 0);
 621 }
 622
 623 /* Returns the value of the current token, which must be an
 624    integer. */
 625 long
 626 lex_integer (const struct lexer *lexer)
 627 {
 628   return lex_next_integer (lexer, 0);
 629 }
 630 \f
 631 /* Token testing functions with lookahead.
 632
 633    A value of 0 for N as an argument to any of these functions refers to the
 634    current token.  Lookahead is limited to the current command.  Any N greater
 635    than the number of tokens remaining in the current command will be treated
 636    as referring to a T_ENDCMD token. */
 637
 638 /* Returns true if the token N ahead of the current token is a number. */
 639 bool
 640 lex_next_is_number (const struct lexer *lexer, int n)
 641 {
 642   return token_is_number (lex_next (lexer, n));
 643 }
 644
 645 /* Returns true if the token N ahead of the current token is a string. */
 646 bool
 647 lex_next_is_string (const struct lexer *lexer, int n)
 648 {
 649   return token_is_string (lex_next (lexer, n));
 650 }
 651
 652 /* Returns the value of the token N ahead of the current token, which must be a
 653    floating point number. */
 654 double
 655 lex_next_number (const struct lexer *lexer, int n)
 656 {
 657   return token_number (lex_next (lexer, n));
 658 }
 659
 660 /* Returns true if the token N ahead of the current token is an integer. */
 661 bool
 662 lex_next_is_integer (const struct lexer *lexer, int n)
 663 {
 664   return token_is_integer (lex_next (lexer, n));
 665 }
 666
 667 /* Returns the value of the token N ahead of the current token, which must be
 668    an integer. */
 669 long
 670 lex_next_integer (const struct lexer *lexer, int n)
 671 {
 672   return token_integer (lex_next (lexer, n));
 673 }
 674 \f
 675 /* Token matching functions. */
 676
 677 /* If the current token has the specified TYPE, skips it and returns true.
 678    Otherwise, returns false. */
 679 bool
 680 lex_match (struct lexer *lexer, enum token_type type)
 681 {
 682   if (lex_token (lexer) == type)
 683     {
 684       lex_get (lexer);
 685       return true;
 686     }
 687   else
 688     return false;
 689 }
 690
 691 /* If the current token matches IDENTIFIER, skips it and returns true.
 692    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 693    returns false.
 694
 695    IDENTIFIER must be an ASCII string. */
 696 bool
 697 lex_match_id (struct lexer *lexer, const char *identifier)
 698 {
 699   return lex_match_id_n (lexer, identifier, 3);
 700 }
 701
 702 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 703    may be abbreviated to its first N letters.  Otherwise, returns false.
 704
 705    IDENTIFIER must be an ASCII string. */
 706 bool
 707 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 708 {
 709   if (lex_token (lexer) == T_ID
 710       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 711     {
 712       lex_get (lexer);
 713       return true;
 714     }
 715   else
 716     return false;
 717 }
 718
 719 /* If the current token is integer X, skips it and returns true.  Otherwise,
 720    returns false. */
 721 bool
 722 lex_match_int (struct lexer *lexer, int x)
 723 {
 724   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 725     {
 726       lex_get (lexer);
 727       return true;
 728     }
 729   else
 730     return false;
 731 }
 732 \f
 733 /* Forced matches. */
 734
 735 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 736    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 737    false.
 738
 739    IDENTIFIER must be an ASCII string. */
 740 bool
 741 lex_force_match_id (struct lexer *lexer, const char *identifier)
 742 {
 743   if (lex_match_id (lexer, identifier))
 744     return true;
 745   else
 746     {
 747       lex_error_expecting (lexer, identifier);
 748       return false;
 749     }
 750 }
 751
 752 /* If the current token has the specified TYPE, skips it and returns true.
 753    Otherwise, reports an error and returns false. */
 754 bool
 755 lex_force_match (struct lexer *lexer, enum token_type type)
 756 {
 757   if (lex_token (lexer) == type)
 758     {
 759       lex_get (lexer);
 760       return true;
 761     }
 762   else
 763     {
 764       const char *type_string = token_type_to_string (type);
 765       if (type_string)
 766         {
 767           char *s = xasprintf ("`%s'", type_string);
 768           lex_error_expecting (lexer, s);
 769           free (s);
 770         }
 771       else
 772         lex_error_expecting (lexer, token_type_to_name (type));
 773
 774       return false;
 775     }
 776 }
 777
 778 /* If the current token is a string, does nothing and returns true.
 779    Otherwise, reports an error and returns false. */
 780 bool
 781 lex_force_string (struct lexer *lexer)
 782 {
 783   if (lex_is_string (lexer))
 784     return true;
 785   else
 786     {
 787       lex_error (lexer, _("expecting string"));
 788       return false;
 789     }
 790 }
 791
 792 /* If the current token is a string or an identifier, does nothing and returns
 793    true.  Otherwise, reports an error and returns false.
 794
 795    This is meant for use in syntactic situations where we want to encourage the
 796    user to supply a quoted string, but for compatibility we also accept
 797    identifiers.  (One example of such a situation is file names.)  Therefore,
 798    the error message issued when the current token is wrong only says that a
 799    string is expected and doesn't mention that an identifier would also be
 800    accepted. */
 801 bool
 802 lex_force_string_or_id (struct lexer *lexer)
 803 {
 804   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 805 }
 806
 807 /* If the current token is an integer, does nothing and returns true.
 808    Otherwise, reports an error and returns false. */
 809 bool
 810 lex_force_int (struct lexer *lexer)
 811 {
 812   if (lex_is_integer (lexer))
 813     return true;
 814   else
 815     {
 816       lex_error (lexer, _("expecting integer"));
 817       return false;
 818     }
 819 }
 820
 821 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 822    nothing and returns true.  Otherwise, reports an error and returns false.
 823    If NAME is nonnull, then it is used in the error message. */
 824 bool
 825 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 826 {
 827   bool is_number = lex_is_number (lexer);
 828   bool is_integer = lex_is_integer (lexer);
 829   bool too_small = (is_integer ? lex_integer (lexer) < min
 830                     : is_number ? lex_number (lexer) < min
 831                     : false);
 832   bool too_big = (is_integer ? lex_integer (lexer) > max
 833                   : is_number ? lex_number (lexer) > max
 834                   : false);
 835   if (is_integer && !too_small && !too_big)
 836     return true;
 837
 838   if (min > max)
 839     {
 840       /* Weird, maybe a bug in the caller.  Just report that we needed an
 841          integer. */
 842       if (name)
 843         lex_error (lexer, _("Integer expected for %s."), name);
 844       else
 845         lex_error (lexer, _("Integer expected."));
 846     }
 847   else if (min == max)
 848     {
 849       if (name)
 850         lex_error (lexer, _("Expected %ld for %s."), min, name);
 851       else
 852         lex_error (lexer, _("Expected %ld."), min);
 853     }
 854   else if (min + 1 == max)
 855     {
 856       if (name)
 857         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 858       else
 859         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 860     }
 861   else
 862     {
 863       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 864       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 865
 866       if (report_lower_bound && report_upper_bound)
 867         {
 868           if (name)
 869             lex_error (lexer,
 870                        _("Expected integer between %ld and %ld for %s."),
 871                        min, max, name);
 872           else
 873             lex_error (lexer, _("Expected integer between %ld and %ld."),
 874                        min, max);
 875         }
 876       else if (report_lower_bound)
 877         {
 878           if (min == 0)
 879             {
 880               if (name)
 881                 lex_error (lexer, _("Expected non-negative integer for %s."),
 882                            name);
 883               else
 884                 lex_error (lexer, _("Expected non-negative integer."));
 885             }
 886           else if (min == 1)
 887             {
 888               if (name)
 889                 lex_error (lexer, _("Expected positive integer for %s."),
 890                            name);
 891               else
 892                 lex_error (lexer, _("Expected positive integer."));
 893             }
 894           else
 895             {
 896               if (name)
 897                 lex_error (lexer, _("Expected integer %ld or greater for %s."),
 898                            min, name);
 899               else
 900                 lex_error (lexer, _("Expected integer %ld or greater."), min);
 901             }
 902         }
 903       else if (report_upper_bound)
 904         {
 905           if (name)
 906             lex_error (lexer,
 907                        _("Expected integer less than or equal to %ld for %s."),
 908                        max, name);
 909           else
 910             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 911                        max);
 912         }
 913       else
 914         {
 915           if (name)
 916             lex_error (lexer, _("Integer expected for %s."), name);
 917           else
 918             lex_error (lexer, _("Integer expected."));
 919         }
 920     }
 921   return false;
 922 }
 923
 924 /* If the current token is a number, does nothing and returns true.
 925    Otherwise, reports an error and returns false. */
 926 bool
 927 lex_force_num (struct lexer *lexer)
 928 {
 929   if (lex_is_number (lexer))
 930     return true;
 931
 932   lex_error (lexer, _("expecting number"));
 933   return false;
 934 }
 935
 936 /* If the current token is an identifier, does nothing and returns true.
 937    Otherwise, reports an error and returns false. */
 938 bool
 939 lex_force_id (struct lexer *lexer)
 940 {
 941   if (lex_token (lexer) == T_ID)
 942     return true;
 943
 944   lex_error (lexer, _("expecting identifier"));
 945   return false;
 946 }
 947 \f
 948 /* Token accessors. */
 949
 950 /* Returns the type of LEXER's current token. */
 951 enum token_type
 952 lex_token (const struct lexer *lexer)
 953 {
 954   return lex_next_token (lexer, 0);
 955 }
 956
 957 /* Returns the number in LEXER's current token.
 958
 959    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 960    tokens this function will always return zero. */
 961 double
 962 lex_tokval (const struct lexer *lexer)
 963 {
 964   return lex_next_tokval (lexer, 0);
 965 }
 966
 967 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 968
 969    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 970    this functions this function will always return NULL.
 971
 972    The UTF-8 encoding of the returned string is correct for variable names and
 973    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 974    data_in() to use it in a "union value".  */
 975 const char *
 976 lex_tokcstr (const struct lexer *lexer)
 977 {
 978   return lex_next_tokcstr (lexer, 0);
 979 }
 980
 981 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 982    null-terminated (but the null terminator is not included in the returned
 983    substring's 'length').
 984
 985    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 986    this functions this function will always return NULL.
 987
 988    The UTF-8 encoding of the returned string is correct for variable names and
 989    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 990    data_in() to use it in a "union value".  */
 991 struct substring
 992 lex_tokss (const struct lexer *lexer)
 993 {
 994   return lex_next_tokss (lexer, 0);
 995 }
 996 \f
 997 /* Looking ahead.
 998
 999    A value of 0 for N as an argument to any of these functions refers to the
1000    current token.  Lookahead is limited to the current command.  Any N greater
1001    than the number of tokens remaining in the current command will be treated
1002    as referring to a T_ENDCMD token. */
1003
1004 static const struct lex_token *
1005 lex_next__ (const struct lexer *lexer_, int n)
1006 {
1007   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1008   struct lex_source *src = lex_source__ (lexer);
1009
1010   if (src != NULL)
1011     return lex_source_next__ (src, n);
1012   else
1013     {
1014       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1015       return &stop_token;
1016     }
1017 }
1018
1019 static const struct lex_token *
1020 lex_source_next__ (const struct lex_source *src_, int n)
1021 {
1022   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1023   while (lex_stage_count (&src->lookahead) <= n)
1024     {
1025       if (!lex_stage_is_empty (&src->lookahead))
1026         {
1027           const struct lex_token *t = lex_stage_last (&src->lookahead);
1028           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1029             return t;
1030         }
1031
1032       lex_source_get_lookahead (src);
1033     }
1034
1035   return lex_stage_nth (&src->lookahead, n);
1036 }
1037
1038 /* Returns the "struct token" of the token N after the current one in LEXER.
1039    The returned pointer can be invalidated by pretty much any succeeding call
1040    into the lexer, although the string pointer within the returned token is
1041    only invalidated by consuming the token (e.g. with lex_get()). */
1042 const struct token *
1043 lex_next (const struct lexer *lexer, int n)
1044 {
1045   return &lex_next__ (lexer, n)->token;
1046 }
1047
1048 /* Returns the type of the token N after the current one in LEXER. */
1049 enum token_type
1050 lex_next_token (const struct lexer *lexer, int n)
1051 {
1052   return lex_next (lexer, n)->type;
1053 }
1054
/* Returns the number in the token N after the current one in LEXER, as
   extracted by token_number().

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1064
1065 /* Returns the null-terminated string in the token N after the current one, in
1066    UTF-8 encoding.
1067
1068    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1069    this functions this function will always return NULL.
1070
1071    The UTF-8 encoding of the returned string is correct for variable names and
1072    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1073    data_in() to use it in a "union value".  */
1074 const char *
1075 lex_next_tokcstr (const struct lexer *lexer, int n)
1076 {
1077   return lex_next_tokss (lexer, n).string;
1078 }
1079
1080 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1081    The string is null-terminated (but the null terminator is not included in
1082    the returned substring's 'length').
1083
1084    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1085    tokens this functions this function will always return NULL.
1086
1087    The UTF-8 encoding of the returned string is correct for variable names and
1088    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1089    data_in() to use it in a "union value".  */
1090 struct substring
1091 lex_next_tokss (const struct lexer *lexer, int n)
1092 {
1093   return lex_next (lexer, n)->string;
1094 }
1095
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, returns a newly allocated empty string, so
   that the caller can still free() the result unconditionally. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source argument, so it must
     not be called when there is no source at all. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1108
1109 /* Returns true if the token N ahead of the current one was produced by macro
1110    expansion, false otherwise. */
1111 bool
1112 lex_next_is_from_macro (const struct lexer *lexer, int n)
1113 {
1114   return lex_next__ (lexer, n)->macro_rep != NULL;
1115 }
1116
1117 static bool
1118 lex_tokens_match (const struct token *actual, const struct token *expected)
1119 {
1120   if (actual->type != expected->type)
1121     return false;
1122
1123   switch (actual->type)
1124     {
1125     case T_POS_NUM:
1126     case T_NEG_NUM:
1127       return actual->number == expected->number;
1128
1129     case T_ID:
1130       return lex_id_match (expected->string, actual->string);
1131
1132     case T_STRING:
1133       return (actual->string.length == expected->string.length
1134               && !memcmp (actual->string.string, expected->string.string,
1135                           actual->string.length));
1136
1137     default:
1138       return true;
1139     }
1140 }
1141
1142 static size_t
1143 lex_at_phrase__ (struct lexer *lexer, const char *s)
1144 {
1145   struct string_lexer slex;
1146   struct token token;
1147
1148   size_t i = 0;
1149   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1150   while (string_lexer_next (&slex, &token))
1151     {
1152       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1153       token_uninit (&token);
1154       if (!match)
1155         return 0;
1156     }
1157   return i;
1158 }
1159
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens != 0;
}
1171
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   consumes those tokens and returns true.  Otherwise, consumes nothing and
   returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1186
/* Returns the number of '\n' bytes among the LENGTH bytes starting at S. */
static int
count_newlines (char *s, size_t length)
{
  const char *p = s;
  const char *end = s + length;
  int count = 0;

  for (;;)
    {
      const char *newline = memchr (p, '\n', end - p);
      if (newline == NULL)
        break;

      count++;
      p = newline + 1;
    }

  return count;
}
1202
1203 static int
1204 lex_token_get_last_line_number (const struct lex_source *src,
1205                                 const struct lex_token *token)
1206 {
1207   if (token->first_line == 0)
1208     return 0;
1209   else
1210     {
1211       char *token_str = &src->buffer[token->token_pos];
1212       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1213     }
1214 }
1215
1216 static int
1217 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1218 {
1219   const char *newline = memrchr (src->buffer, '\n', offset);
1220   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1221   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1222 }
1223
1224 static int
1225 lex_token_get_first_column (const struct lex_source *src,
1226                             const struct lex_token *token)
1227 {
1228   return lex_token_get_column__ (src, token->token_pos);
1229 }
1230
1231 static int
1232 lex_token_get_last_column (const struct lex_source *src,
1233                            const struct lex_token *token)
1234 {
1235   return lex_token_get_column__ (src, token->token_pos + token->token_len);
1236 }
1237
1238 static struct msg_location
1239 lex_token_location (const struct lex_source *src,
1240                     const struct lex_token *t0,
1241                     const struct lex_token *t1)
1242 {
1243   return (struct msg_location) {
1244     .file_name = src->reader->file_name,
1245     .first_line = t0->first_line,
1246     .last_line = lex_token_get_last_line_number (src, t1),
1247     .first_column = lex_token_get_first_column (src, t0),
1248     .last_column = lex_token_get_last_column (src, t1),
1249   };
1250 }
1251
1252 static struct msg_location *
1253 lex_token_location_rw (const struct lex_source *src,
1254                        const struct lex_token *t0,
1255                        const struct lex_token *t1)
1256 {
1257   struct msg_location location = lex_token_location (src, t0, t1);
1258   return msg_location_dup (&location);
1259 }
1260
/* Returns a location, as created by lex_token_location_rw(), that spans the
   tokens with offsets N0 through N1 from the current token in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1268
1269 /* Returns the 1-based line number of the start of the syntax that represents
1270    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1271    if the token is drawn from a source that does not have line numbers. */
1272 int
1273 lex_get_first_line_number (const struct lexer *lexer, int n)
1274 {
1275   const struct lex_source *src = lex_source__ (lexer);
1276   return src ? lex_source_next__ (src, n)->first_line : 0;
1277 }
1278
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   current source (so that the token is T_STOP) or if the token is drawn from
   a source that does not have line numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_token_get_last_line_number (src, lex_source_next__ (src, n));
}
1296
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   current source (so that the token is T_STOP).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_token_get_first_column (src, lex_source_next__ (src, n));
}
1310
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no current source (so that the token is T_STOP).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_token_get_last_column (src, lex_source_next__ (src, n));
}
1324
1325 /* Returns the name of the syntax file from which the current command is drawn.
1326    Returns NULL for a T_STOP token or if the command's source does not have
1327    line numbers.
1328
1329    There is no version of this function that takes an N argument because
1330    lookahead only works to the end of a command and any given command is always
1331    within a single syntax file. */
1332 const char *
1333 lex_get_file_name (const struct lexer *lexer)
1334 {
1335   struct lex_source *src = lex_source__ (lexer);
1336   return src == NULL ? NULL : src->reader->file_name;
1337 }
1338
1339 /* Returns a newly allocated msg_location for the syntax that represents tokens
1340    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1341    must eventually free the location (with msg_location_destroy()). */
1342 struct msg_location *
1343 lex_get_location (const struct lexer *lexer, int n0, int n1)
1344 {
1345   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1346   loc->first_column = lex_get_first_column (lexer, n0);
1347   loc->last_column = lex_get_last_column (lexer, n1);
1348   return loc;
1349 }
1350
1351 /* Returns a newly allocated msg_location for the syntax that represents tokens
1352    with 0-based offsets N0...N1, inclusive, from the current token.  The
1353    location only covers the tokens' lines, not the columns.  The caller must
1354    eventually free the location (with msg_location_destroy()). */
1355 struct msg_location *
1356 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1357 {
1358   struct msg_location *loc = xmalloc (sizeof *loc);
1359   *loc = (struct msg_location) {
1360     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1361     .first_line = lex_get_first_line_number (lexer, n0),
1362     .last_line = lex_get_last_line_number (lexer, n1),
1363   };
1364   return loc;
1365 }
1366
1367 const char *
1368 lex_get_encoding (const struct lexer *lexer)
1369 {
1370   struct lex_source *src = lex_source__ (lexer);
1371   return src == NULL ? NULL : src->reader->encoding;
1372 }
1373
1374 /* Returns the syntax mode for the syntax file from which the current drawn is
1375    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1376    does not have line numbers.
1377
1378    There is no version of this function that takes an N argument because
1379    lookahead only works to the end of a command and any given command is always
1380    within a single syntax file. */
1381 enum segmenter_mode
1382 lex_get_syntax_mode (const struct lexer *lexer)
1383 {
1384   struct lex_source *src = lex_source__ (lexer);
1385   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1386 }
1387
1388 /* Returns the error mode for the syntax file from which the current drawn is
1389    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1390    source does not have line numbers.
1391
1392    There is no version of this function that takes an N argument because
1393    lookahead only works to the end of a command and any given command is always
1394    within a single syntax file. */
1395 enum lex_error_mode
1396 lex_get_error_mode (const struct lexer *lexer)
1397 {
1398   struct lex_source *src = lex_source__ (lexer);
1399   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1400 }
1401
1402 /* If the source that LEXER is currently reading has error mode
1403    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1404    token to be read comes directly from whatever is next read from the stream.
1405
1406    It makes sense to call this function after encountering an error in a
1407    command entered on the console, because usually the user would prefer not to
1408    have cascading errors. */
1409 void
1410 lex_interactive_reset (struct lexer *lexer)
1411 {
1412   struct lex_source *src = lex_source__ (lexer);
1413   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1414     {
1415       src->length = 0;
1416       src->journal_pos = src->seg_pos = 0;
1417       src->n_newlines = 0;
1418       src->suppress_next_newline = false;
1419       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1420                                        false);
1421       lex_stage_clear (&src->pp);
1422       lex_stage_clear (&src->merge);
1423       lex_stage_clear (&src->lookahead);
1424       lex_source_push_endcmd__ (src);
1425     }
1426 }
1427
1428 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1429 void
1430 lex_discard_rest_of_command (struct lexer *lexer)
1431 {
1432   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1433     lex_get (lexer);
1434 }
1435
1436 /* Discards all lookahead tokens in LEXER, then discards all input sources
1437    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1438    runs out of input sources. */
1439 void
1440 lex_discard_noninteractive (struct lexer *lexer)
1441 {
1442   struct lex_source *src = lex_source__ (lexer);
1443
1444   if (src != NULL)
1445     {
1446       lex_stage_clear (&src->pp);
1447       lex_stage_clear (&src->merge);
1448       lex_stage_clear (&src->lookahead);
1449
1450       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1451            src = lex_source__ (lexer))
1452         lex_source_destroy (src);
1453     }
1454 }
1455 \f
1456 static void
1457 lex_source_expand__ (struct lex_source *src)
1458 {
1459   if (src->length >= src->allocated)
1460     src->buffer = x2realloc (src->buffer, &src->allocated);
1461 }
1462
1463 static void
1464 lex_source_read__ (struct lex_source *src)
1465 {
1466   do
1467     {
1468       lex_source_expand__ (src);
1469
1470       size_t space = src->allocated - src->length;
1471       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1472       size_t n = src->reader->class->read (src->reader,
1473                                            &src->buffer[src->length],
1474                                            space, prompt);
1475       assert (n <= space);
1476
1477       if (n == 0)
1478         {
1479           /* End of input. */
1480           src->reader->eof = true;
1481           return;
1482         }
1483
1484       src->length += n;
1485     }
1486   while (!memchr (&src->buffer[src->seg_pos], '\n',
1487                   src->length - src->seg_pos));
1488 }
1489
1490 static struct lex_source *
1491 lex_source__ (const struct lexer *lexer)
1492 {
1493   return (ll_is_empty (&lexer->sources) ? NULL
1494           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1495 }
1496
1497 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1498    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1499    and N1 are both zero, this requests the syntax for the current token.)  The
1500    caller must eventually free the returned string (with free()).  The syntax
1501    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1502    for example, it may include comments, spaces, and new-lines if it spans
1503    multiple tokens.  Macro expansion, however, has already been performed. */
1504 static char *
1505 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1506 {
1507   struct string s = DS_EMPTY_INITIALIZER;
1508   for (size_t i = n0; i <= n1; )
1509     {
1510       /* Find [I,J) as the longest sequence of tokens not produced by macro
1511          expansion, or otherwise the longest sequence expanded from a single
1512          macro call. */
1513       const struct lex_token *first = lex_source_next__ (src, i);
1514       size_t j;
1515       for (j = i + 1; j <= n1; j++)
1516         {
1517           const struct lex_token *cur = lex_source_next__ (src, j);
1518           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1519               || first->macro_rep != cur->macro_rep)
1520             break;
1521         }
1522       const struct lex_token *last = lex_source_next__ (src, j - 1);
1523
1524       /* Now add the syntax for this sequence of tokens to SRC. */
1525       if (!ds_is_empty (&s))
1526         ds_put_byte (&s, ' ');
1527       if (!first->macro_rep)
1528         {
1529           size_t start = first->token_pos;
1530           size_t end = last->token_pos + last->token_len;
1531           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1532         }
1533       else
1534         {
1535           size_t start = first->ofs;
1536           size_t end = last->ofs + last->len;
1537           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1538                                            end - start));
1539         }
1540
1541       i = j;
1542     }
1543   return ds_steal_cstr (&s);
1544 }
1545
1546 static bool
1547 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1548 {
1549   for (size_t i = n0; i <= n1; i++)
1550     if (lex_source_next__ (src, i)->macro_rep)
1551       return true;
1552   return false;
1553 }
1554
1555 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1556    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1557    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1558    the original form supplied to the lexer so that, for example, it may include
1559    comments, spaces, and new-lines if it spans multiple tokens.
1560
1561    Returns an empty string if the token range doesn't include a macro call.
1562
1563    The caller must not modify or free the returned string. */
1564 static struct substring
1565 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1566 {
1567   if (!lex_source_contains_macro_call (src, n0, n1))
1568     return ss_empty ();
1569
1570   const struct lex_token *token0 = lex_source_next__ (src, n0);
1571   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1572   size_t start = token0->token_pos;
1573   size_t end = token1->token_pos + token1->token_len;
1574
1575   return ss_buffer (&src->buffer[start], end - start);
1576 }
1577
1578 static void
1579 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1580                          const char *format, va_list args)
1581 {
1582   const struct lex_token *token;
1583   struct string s;
1584
1585   ds_init_empty (&s);
1586
1587   token = lex_source_next__ (src, n0);
1588   if (token->token.type == T_ENDCMD)
1589     ds_put_cstr (&s, _("Syntax error at end of command"));
1590   else
1591     {
1592       /* Get the syntax that caused the error. */
1593       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1594       char syntax[64];
1595       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1596       free (raw_syntax);
1597
1598       /* Get the macro call(s) that expanded to the syntax that caused the
1599          error. */
1600       char call[64];
1601       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1602                      call, sizeof call);
1603
1604       if (syntax[0])
1605         {
1606           if (call[0])
1607             ds_put_format (&s,
1608                            _("Syntax error at `%s' (in expansion of `%s')"),
1609                            syntax, call);
1610           else
1611             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1612         }
1613       else
1614         {
1615           if (call[0])
1616             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1617                            call);
1618           else
1619             ds_put_cstr (&s, _("Syntax error"));
1620         }
1621     }
1622
1623   if (format)
1624     {
1625       ds_put_cstr (&s, ": ");
1626       ds_put_vformat (&s, format, args);
1627     }
1628   if (ds_last (&s) != '.')
1629     ds_put_byte (&s, '.');
1630
1631   struct msg *m = xmalloc (sizeof *m);
1632   *m = (struct msg) {
1633     .category = MSG_C_SYNTAX,
1634     .severity = MSG_S_ERROR,
1635     .location = lex_source_get_location (src, n0, n1),
1636     .text = ds_steal_cstr (&s),
1637   };
1638   msg_emit (m);
1639 }
1640
1641 static void
1642 lex_get_error (struct lex_source *src, const struct lex_token *token)
1643 {
1644   char syntax[64];
1645   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1646                  syntax, sizeof syntax);
1647
1648   struct string s = DS_EMPTY_INITIALIZER;
1649   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1650   ds_put_format (&s, ": %s", token->token.string.string);
1651
1652   struct msg *m = xmalloc (sizeof *m);
1653   *m = (struct msg) {
1654     .category = MSG_C_SYNTAX,
1655     .severity = MSG_S_ERROR,
1656     .location = lex_token_location_rw (src, token, token),
1657     .text = ds_steal_cstr (&s),
1658   };
1659   msg_emit (m);
1660 }
1661
1662 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1663    underlying lex_reader if necessary.  Returns true if a new token was added
1664    to SRC's deque, false otherwise.  The caller should retry failures unless
1665    SRC's 'eof' marker was set to true indicating that there will be no more
1666    tokens from this source. */
1667 static bool
1668 lex_source_try_get_pp (struct lex_source *src)
1669 {
1670   /* Append a new token to SRC and initialize it. */
1671   struct lex_token *token = xmalloc (sizeof *token);
1672   token->token = (struct token) { .type = T_STOP };
1673   token->macro_rep = NULL;
1674   token->ref_cnt = NULL;
1675   token->token_pos = src->seg_pos;
1676   if (src->reader->line_number > 0)
1677     token->first_line = src->reader->line_number + src->n_newlines;
1678   else
1679     token->first_line = 0;
1680
1681   /* Extract a segment. */
1682   const char *segment;
1683   enum segment_type seg_type;
1684   int seg_len;
1685   for (;;)
1686     {
1687       segment = &src->buffer[src->seg_pos];
1688       seg_len = segmenter_push (&src->segmenter, segment,
1689                                 src->length - src->seg_pos,
1690                                 src->reader->eof, &seg_type);
1691       if (seg_len >= 0)
1692         break;
1693
1694       /* The segmenter needs more input to produce a segment. */
1695       assert (!src->reader->eof);
1696       lex_source_read__ (src);
1697     }
1698
1699   /* Update state based on the segment. */
1700   token->token_len = seg_len;
1701   src->seg_pos += seg_len;
1702   if (seg_type == SEG_NEWLINE)
1703     src->n_newlines++;
1704
1705   /* Get a token from the segment. */
1706   enum tokenize_result result = token_from_segment (
1707     seg_type, ss_buffer (segment, seg_len), &token->token);
1708
1709   /* If we've reached the end of a line, or the end of a command, then pass
1710      the line to the output engine as a syntax text item.  */
1711   int n_lines = seg_type == SEG_NEWLINE;
1712   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1713     {
1714       n_lines++;
1715       src->suppress_next_newline = true;
1716     }
1717   else if (n_lines > 0 && src->suppress_next_newline)
1718     {
1719       n_lines--;
1720       src->suppress_next_newline = false;
1721     }
1722   for (int i = 0; i < n_lines; i++)
1723     {
1724       /* Beginning of line. */
1725       const char *line = &src->buffer[src->journal_pos];
1726
1727       /* Calculate line length, including \n or \r\n end-of-line if present.
1728
1729          We use src->length even though that may be beyond what we've actually
1730          converted to tokens.  That's because, if we're emitting the line due
1731          to SEG_END_COMMAND, we want to take the whole line through the
1732          newline, not just through the '.'. */
1733       size_t max_len = src->length - src->journal_pos;
1734       const char *newline = memchr (line, '\n', max_len);
1735       size_t line_len = newline ? newline - line + 1 : max_len;
1736
1737       /* Calculate line length excluding end-of-line. */
1738       size_t copy_len = line_len;
1739       if (copy_len > 0 && line[copy_len - 1] == '\n')
1740         copy_len--;
1741       if (copy_len > 0 && line[copy_len - 1] == '\r')
1742         copy_len--;
1743
1744       /* Submit the line as syntax. */
1745       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1746                                                    xmemdup0 (line, copy_len),
1747                                                    NULL));
1748
1749       src->journal_pos += line_len;
1750     }
1751
1752   switch (result)
1753     {
1754     case TOKENIZE_ERROR:
1755       lex_get_error (src, token);
1756       /* Fall through. */
1757     case TOKENIZE_EMPTY:
1758       lex_token_destroy (token);
1759       return false;
1760
1761     case TOKENIZE_TOKEN:
1762       if (token->token.type == T_STOP)
1763         {
1764           token->token.type = T_ENDCMD;
1765           src->eof = true;
1766         }
1767       lex_stage_push_last (&src->pp, token);
1768       return true;
1769     }
1770   NOT_REACHED ();
1771 }
1772
1773 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1774    failure.  On failure, the end of SRC has been reached and no more tokens
1775    will be forthcoming from it.
1776
1777    Does not make the new token available for lookahead yet; the caller must
1778    adjust SRC's 'middle' pointer to do so. */
1779 static bool
1780 lex_source_get_pp (struct lex_source *src)
1781 {
1782   while (!src->eof)
1783     if (lex_source_try_get_pp (src))
1784       return true;
1785   return false;
1786 }
1787
1788 static bool
1789 lex_source_try_get_merge (const struct lex_source *src_)
1790 {
1791   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1792
1793   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1794     return false;
1795
1796   if (!settings_get_mexpand ())
1797     {
1798       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1799       return true;
1800     }
1801
1802   /* Now pass tokens one-by-one to the macro expander.
1803
1804      In the common case where there is no macro to expand, the loop is not
1805      entered.  */
1806   struct macro_call *mc;
1807   int n_call = macro_call_create (src->lexer->macros,
1808                                   &lex_stage_first (&src->pp)->token, &mc);
1809   for (int ofs = 1; !n_call; ofs++)
1810     {
1811       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1812         {
1813           /* This should not be reachable because we always get a T_ENDCMD at
1814              the end of an input file (transformed from T_STOP by
1815              lex_source_try_get_pp()) and the macro_expander should always
1816              terminate expansion on T_ENDCMD. */
1817           NOT_REACHED ();
1818         }
1819
1820       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1821       size_t start = t->token_pos;
1822       size_t end = t->token_pos + t->token_len;
1823       const struct macro_token mt = {
1824         .token = t->token,
1825         .syntax = ss_buffer (&src->buffer[start], end - start),
1826       };
1827       const struct msg_location loc = lex_token_location (src, t, t);
1828       n_call = macro_call_add (mc, &mt, &loc);
1829     }
1830   if (n_call < 0)
1831     {
1832       /* False alarm: no macro expansion after all.  Use first token as
1833          lookahead.  We'll retry macro expansion from the second token next
1834          time around. */
1835       macro_call_destroy (mc);
1836       lex_stage_shift (&src->merge, &src->pp, 1);
1837       return true;
1838     }
1839
1840   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1841      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1842      Expand them.  */
1843   const struct lex_token *c0 = lex_stage_first (&src->pp);
1844   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1845   struct macro_tokens expansion = { .n = 0 };
1846   struct msg_location loc = lex_token_location (src, c0, c1);
1847   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1848   macro_call_destroy (mc);
1849
1850   /* Convert the macro expansion into syntax for possible error messages
1851      later. */
1852   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1853   size_t *len = xnmalloc (expansion.n, sizeof *len);
1854   struct string s = DS_EMPTY_INITIALIZER;
1855   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1856
1857   if (settings_get_mprint ())
1858     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1859                                           _("Macro Expansion")));
1860
1861   /* Append the macro expansion tokens to the lookahead. */
1862   if (expansion.n > 0)
1863     {
1864       char *macro_rep = ds_steal_cstr (&s);
1865       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1866       *ref_cnt = expansion.n;
1867       for (size_t i = 0; i < expansion.n; i++)
1868         {
1869           struct lex_token *token = xmalloc (sizeof *token);
1870           *token = (struct lex_token) {
1871             .token = expansion.mts[i].token,
1872             .token_pos = c0->token_pos,
1873             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1874             .first_line = c0->first_line,
1875             .macro_rep = macro_rep,
1876             .ofs = ofs[i],
1877             .len = len[i],
1878             .ref_cnt = ref_cnt,
1879           };
1880           lex_stage_push_last (&src->merge, token);
1881
1882           ss_dealloc (&expansion.mts[i].syntax);
1883         }
1884     }
1885   else
1886     ds_destroy (&s);
1887   free (expansion.mts);
1888   free (ofs);
1889   free (len);
1890
1891   /* Destroy the tokens for the call. */
1892   for (size_t i = 0; i < n_call; i++)
1893     lex_stage_pop_first (&src->pp);
1894
1895   return expansion.n > 0;
1896 }
1897
1898 /* Attempts to obtain at least one new token into 'merge' in SRC.
1899
1900    Returns true if successful, false on failure.  In the latter case, SRC is
1901    exhausted and 'src->eof' is now true. */
1902 static bool
1903 lex_source_get_merge (struct lex_source *src)
1904 {
1905   while (!src->eof)
1906     if (lex_source_try_get_merge (src))
1907       return true;
1908   return false;
1909 }
1910
1911 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1912
1913    Returns true if successful, false on failure.  In the latter case, SRC is
1914    exhausted and 'src->eof' is now true. */
1915 static bool
1916 lex_source_get_lookahead (struct lex_source *src)
1917 {
1918   struct merger m = MERGER_INIT;
1919   struct token out;
1920   for (size_t i = 0; ; i++)
1921     {
1922       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1923         {
1924           /* We always get a T_ENDCMD at the end of an input file
1925              (transformed from T_STOP by lex_source_try_get_pp()) and
1926              merger_add() should never return -1 on T_ENDCMD. */
1927           assert (lex_stage_is_empty (&src->merge));
1928           return false;
1929         }
1930
1931       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1932                                &out);
1933       if (!retval)
1934         {
1935           lex_stage_shift (&src->lookahead, &src->merge, 1);
1936           return true;
1937         }
1938       else if (retval > 0)
1939         {
1940           /* Add a token that merges all the tokens together. */
1941           const struct lex_token *first = lex_stage_first (&src->merge);
1942           const struct lex_token *last = lex_stage_nth (&src->merge,
1943                                                         retval - 1);
1944           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1945           struct lex_token *t = xmalloc (sizeof *t);
1946           *t = (struct lex_token) {
1947             .token = out,
1948             .token_pos = first->token_pos,
1949             .token_len = (last->token_pos - first->token_pos) + last->token_len,
1950             .first_line = first->first_line,
1951
1952             /* This works well if all the tokens were not expanded from macros,
1953                or if they came from the same macro expansion.  It just gives up
1954                in the other (corner) cases. */
1955             .macro_rep = macro ? first->macro_rep : NULL,
1956             .ofs = macro ? first->ofs : 0,
1957             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1958             .ref_cnt = macro ? first->ref_cnt : NULL,
1959           };
1960           if (t->ref_cnt)
1961             ++*t->ref_cnt;
1962           lex_stage_push_last (&src->lookahead, t);
1963
1964           for (int i = 0; i < retval; i++)
1965             lex_stage_pop_first (&src->merge);
1966           return true;
1967         }
1968     }
1969 }
1970 \f
1971 static void
1972 lex_source_push_endcmd__ (struct lex_source *src)
1973 {
1974   assert (lex_stage_is_empty (&src->lookahead));
1975   struct lex_token *token = xmalloc (sizeof *token);
1976   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1977   lex_stage_push_last (&src->lookahead, token);
1978 }
1979
1980 static struct lex_source *
1981 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1982 {
1983   struct lex_source *src = xmalloc (sizeof *src);
1984   *src = (struct lex_source) {
1985     .reader = reader,
1986     .segmenter = segmenter_init (reader->syntax, false),
1987     .lexer = lexer,
1988   };
1989
1990   lex_source_push_endcmd__ (src);
1991
1992   return src;
1993 }
1994
1995 static void
1996 lex_source_destroy (struct lex_source *src)
1997 {
1998   char *file_name = src->reader->file_name;
1999   char *encoding = src->reader->encoding;
2000   if (src->reader->class->destroy != NULL)
2001     src->reader->class->destroy (src->reader);
2002   free (file_name);
2003   free (encoding);
2004   free (src->buffer);
2005   lex_stage_uninit (&src->pp);
2006   lex_stage_uninit (&src->merge);
2007   lex_stage_uninit (&src->lookahead);
2008   ll_remove (&src->ll);
2009   free (src);
2010 }
2011 \f
2012 struct lex_file_reader
2013   {
2014     struct lex_reader reader;
2015     struct u8_istream *istream;
2016   };
2017
2018 static struct lex_reader_class lex_file_reader_class;
2019
2020 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2021    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2022    ENCODING, which should take one of the forms accepted by
2023    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2024    mode of the new reader, respectively.
2025
2026    Returns a null pointer if FILE_NAME cannot be opened. */
2027 struct lex_reader *
2028 lex_reader_for_file (const char *file_name, const char *encoding,
2029                      enum segmenter_mode syntax,
2030                      enum lex_error_mode error)
2031 {
2032   struct lex_file_reader *r;
2033   struct u8_istream *istream;
2034
2035   istream = (!strcmp(file_name, "-")
2036              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2037              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2038   if (istream == NULL)
2039     {
2040       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2041       return NULL;
2042     }
2043
2044   r = xmalloc (sizeof *r);
2045   lex_reader_init (&r->reader, &lex_file_reader_class);
2046   r->reader.syntax = syntax;
2047   r->reader.error = error;
2048   r->reader.file_name = xstrdup (file_name);
2049   r->reader.encoding = xstrdup_if_nonnull (encoding);
2050   r->reader.line_number = 1;
2051   r->istream = istream;
2052
2053   return &r->reader;
2054 }
2055
2056 static struct lex_file_reader *
2057 lex_file_reader_cast (struct lex_reader *r)
2058 {
2059   return UP_CAST (r, struct lex_file_reader, reader);
2060 }
2061
2062 static size_t
2063 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2064                enum prompt_style prompt_style UNUSED)
2065 {
2066   struct lex_file_reader *r = lex_file_reader_cast (r_);
2067   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2068   if (n_read < 0)
2069     {
2070       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2071       return 0;
2072     }
2073   return n_read;
2074 }
2075
2076 static void
2077 lex_file_close (struct lex_reader *r_)
2078 {
2079   struct lex_file_reader *r = lex_file_reader_cast (r_);
2080
2081   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2082     {
2083       if (u8_istream_close (r->istream) != 0)
2084         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2085     }
2086   else
2087     u8_istream_free (r->istream);
2088
2089   free (r);
2090 }
2091
2092 static struct lex_reader_class lex_file_reader_class =
2093   {
2094     lex_file_read,
2095     lex_file_close
2096   };
2097 \f
2098 struct lex_string_reader
2099   {
2100     struct lex_reader reader;
2101     struct substring s;
2102     size_t offset;
2103   };
2104
2105 static struct lex_reader_class lex_string_reader_class;
2106
2107 /* Creates and returns a new lex_reader for the contents of S, which must be
2108    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2109    with ss_dealloc() when it is closed. */
2110 struct lex_reader *
2111 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2112 {
2113   struct lex_string_reader *r;
2114
2115   r = xmalloc (sizeof *r);
2116   lex_reader_init (&r->reader, &lex_string_reader_class);
2117   r->reader.syntax = SEG_MODE_AUTO;
2118   r->reader.encoding = xstrdup_if_nonnull (encoding);
2119   r->s = s;
2120   r->offset = 0;
2121
2122   return &r->reader;
2123 }
2124
2125 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2126    which must be encoded in ENCODING.  The caller retains ownership of S. */
2127 struct lex_reader *
2128 lex_reader_for_string (const char *s, const char *encoding)
2129 {
2130   struct substring ss;
2131   ss_alloc_substring (&ss, ss_cstr (s));
2132   return lex_reader_for_substring_nocopy (ss, encoding);
2133 }
2134
/* Expands FORMAT, a printf()-like format string, using the arguments that
   follow ENCODING, and returns a new lex_reader that supplies the formatted
   text.  The reader owns that text.

   Note the argument order: ENCODING sits between FORMAT and the values to be
   formatted. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2149
2150 static struct lex_string_reader *
2151 lex_string_reader_cast (struct lex_reader *r)
2152 {
2153   return UP_CAST (r, struct lex_string_reader, reader);
2154 }
2155
2156 static size_t
2157 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2158                  enum prompt_style prompt_style UNUSED)
2159 {
2160   struct lex_string_reader *r = lex_string_reader_cast (r_);
2161   size_t chunk;
2162
2163   chunk = MIN (n, r->s.length - r->offset);
2164   memcpy (buf, r->s.string + r->offset, chunk);
2165   r->offset += chunk;
2166
2167   return chunk;
2168 }
2169
2170 static void
2171 lex_string_close (struct lex_reader *r_)
2172 {
2173   struct lex_string_reader *r = lex_string_reader_cast (r_);
2174
2175   ss_dealloc (&r->s);
2176   free (r);
2177 }
2178
2179 static struct lex_reader_class lex_string_reader_class =
2180   {
2181     lex_string_read,
2182     lex_string_close
2183   };