pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/macro.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source.

   Wraps the plain 'struct token' with the bookkeeping needed to tie the token
   back to the source text it came from and, for macro-expanded tokens, to the
   text of the macro expansion. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call.

       src->tail <= line_pos <= token_pos <= src->head. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       lex_token_destroy() decrements *ref_cnt (when ref_cnt is nonnull) and
       frees both macro_rep and ref_cnt once the count drops to zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  86
  87 static void
  88 lex_token_destroy (struct lex_token *t)
  89 {
  90   token_uninit (&t->token);
  91   if (t->ref_cnt)
  92     {
  93       assert (*t->ref_cnt > 0);
  94       if (!--*t->ref_cnt)
  95         {
  96           free (t->macro_rep);
  97           free (t->ref_cnt);
  98         }
  99     }
 100   free (t);
 101 }
 102 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source. */
struct lex_stage
  {
    struct deque deque;         /* Supplies the indexes used with 'tokens'. */
    struct lex_token **tokens;  /* Token pointers, indexed through 'deque'. */
  };
 110
/* Forward declarations of the operations on a lex_stage. */

/* Destruction. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

/* Size queries. */
static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

/* Element access. */
static struct lex_token *lex_stage_last (struct lex_stage *);
static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

/* Insertion and removal. */
static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

/* Moving tokens between stages. */
static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
 126
 127 /* Deletes all the tokens from STAGE. */
 128 static void
 129 lex_stage_clear (struct lex_stage *stage)
 130 {
 131   while (!deque_is_empty (&stage->deque))
 132     lex_stage_pop_first (stage);
 133 }
 134
 135 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 136 static void
 137 lex_stage_uninit (struct lex_stage *stage)
 138 {
 139   lex_stage_clear (stage);
 140   free (stage->tokens);
 141 }
 142
/* Returns true if STAGE contains no tokens, otherwise false.

   This simply reports whether STAGE's underlying deque is empty. */
static bool
lex_stage_is_empty (const struct lex_stage *stage)
{
  return deque_is_empty (&stage->deque);
}
 149
/* Returns the number of tokens in STAGE, as reported by its underlying
   deque. */
static size_t
lex_stage_count (const struct lex_stage *stage)
{
  return deque_count (&stage->deque);
}
 156
/* Returns the last token in STAGE, which must be nonempty.  The last token is
   the one accessed with the greatest lookahead.

   Tokens are appended with deque_push_front() (see lex_stage_push_last()), so
   the last token is the one at the front of the deque. */
static struct lex_token *
lex_stage_last (struct lex_stage *stage)
{
  return stage->tokens[deque_front (&stage->deque, 0)];
}
 164
/* Returns the first token in STAGE, which must be nonempty.
   The first token is the one accessed with the least lookahead.

   Equivalent to lex_stage_nth (STAGE, 0). */
static struct lex_token *
lex_stage_first (struct lex_stage *stage)
{
  return lex_stage_nth (stage, 0);
}
 172
/* Returns the token at the given INDEX in STAGE.  The first token (with the
   least lookahead) is 0, the second token is 1, and so on.  There must be at
   least INDEX + 1 tokens in STAGE.

   INDEX counts from the back of the underlying deque, which is the end that
   lex_stage_pop_first() removes tokens from. */
static struct lex_token *
lex_stage_nth (struct lex_stage *stage, size_t index)
{
  return stage->tokens[deque_back (&stage->deque, index)];
}
 181
 182 /* Adds TOKEN so that it becomes the last token in STAGE. */
 183 static void
 184 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 185 {
 186   if (deque_is_full (&stage->deque))
 187     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 188                                   sizeof *stage->tokens);
 189   stage->tokens[deque_push_front (&stage->deque)] = token;
 190 }
 191
 192 /* Removes the first token from STAGE and uninitializes it. */
 193 static void
 194 lex_stage_pop_first (struct lex_stage *stage)
 195 {
 196   lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
 197 }
 198
 199 /* Removes the first N tokens from SRC, appending them to DST as the last
 200    tokens. */
 201 static void
 202 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 203 {
 204   for (size_t i = 0; i < n; i++)
 205     {
 206       lex_stage_push_last (dst, lex_stage_first (src));
 207       deque_pop_back (&src->deque);
 208     }
 209 }
 210
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct lexer *lexer;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'lookahead'.

       - lookahead: Tokens available to the client for parsing. */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_stage lookahead;
  };
 256
/* Creation and destruction of lex_sources. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);

/* Lexer. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Macros added with lex_define_macro(). */
  };

/* Forward declarations of internal helpers. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static bool lex_source_get_lookahead (struct lex_source *);
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 280 \f
 281 /* Initializes READER with the specified CLASS and otherwise some reasonable
 282    defaults.  The caller should fill in the others members as desired. */
 283 void
 284 lex_reader_init (struct lex_reader *reader,
 285                  const struct lex_reader_class *class)
 286 {
 287   reader->class = class;
 288   reader->syntax = SEG_MODE_AUTO;
 289   reader->error = LEX_ERROR_CONTINUE;
 290   reader->file_name = NULL;
 291   reader->encoding = NULL;
 292   reader->line_number = 0;
 293   reader->eof = false;
 294 }
 295
 296 /* Frees any file name already in READER and replaces it by a copy of
 297    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 298 void
 299 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 300 {
 301   free (reader->file_name);
 302   reader->file_name = xstrdup_if_nonnull (file_name);
 303 }
 304 \f
 305 /* Creates and returns a new lexer. */
 306 struct lexer *
 307 lex_create (void)
 308 {
 309   struct lexer *lexer = xmalloc (sizeof *lexer);
 310   *lexer = (struct lexer) {
 311     .sources = LL_INITIALIZER (lexer->sources),
 312     .macros = macro_set_create (),
 313   };
 314   return lexer;
 315 }
 316
 317 /* Destroys LEXER. */
 318 void
 319 lex_destroy (struct lexer *lexer)
 320 {
 321   if (lexer != NULL)
 322     {
 323       struct lex_source *source, *next;
 324
 325       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 326         lex_source_destroy (source);
 327       macro_set_destroy (lexer->macros);
 328       free (lexer);
 329     }
 330 }
 331
/* Adds M to LEXER's set of macros.  M replaces any existing macro with the
   same name.  Takes ownership of M.

   This is a thin wrapper around macro_set_add() on LEXER's macro set. */
void
lex_define_macro (struct lexer *lexer, struct macro *m)
{
  macro_set_add (lexer->macros, m);
}
 339
 340 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 341    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 342    token. */
 343 void
 344 lex_include (struct lexer *lexer, struct lex_reader *reader)
 345 {
 346   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 347   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 348 }
 349
 350 /* Appends READER to LEXER, so that it will be read after all other current
 351    readers have already been read. */
 352 void
 353 lex_append (struct lexer *lexer, struct lex_reader *reader)
 354 {
 355   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 356 }
 357 \f
 358 /* Advancing. */
 359
 360 /* Advances LEXER to the next token, consuming the current token. */
 361 void
 362 lex_get (struct lexer *lexer)
 363 {
 364   struct lex_source *src;
 365
 366   src = lex_source__ (lexer);
 367   if (src == NULL)
 368     return;
 369
 370   if (!lex_stage_is_empty (&src->lookahead))
 371     lex_stage_pop_first (&src->lookahead);
 372
 373   while (lex_stage_is_empty (&src->lookahead))
 374     if (!lex_source_get_lookahead (src))
 375       {
 376         lex_source_destroy (src);
 377         src = lex_source__ (lexer);
 378         if (src == NULL)
 379           return;
 380       }
 381 }
 382 \f
 383 /* Issuing errors. */
 384
 385 /* Prints a syntax error message containing the current token and
 386    given message MESSAGE (if non-null). */
 387 void
 388 lex_error (struct lexer *lexer, const char *format, ...)
 389 {
 390   va_list args;
 391
 392   va_start (args, format);
 393   lex_next_error_valist (lexer, 0, 0, format, args);
 394   va_end (args);
 395 }
 396
/* Prints a syntax error message containing the current token and
   given message (if FORMAT is non-null), formatted printf-style from FORMAT
   and ARGS.

   Equivalent to lex_next_error_valist (LEXER, 0, 0, FORMAT, ARGS). */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  lex_next_error_valist (lexer, 0, 0, format, args);
}
 404
/* Prints a syntax error message for the tokens identified by lookahead
   offsets N0 and N1 (0 is the current token), with the given message (if
   FORMAT is non-null) formatted printf-style from FORMAT and the arguments
   that follow it.

   N0 and N1 are passed through unchanged to lex_next_error_valist().
   NOTE(review): presumably they delimit an inclusive range of tokens; confirm
   against lex_source_error_valist(). */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list args;

  va_start (args, format);
  lex_next_error_valist (lexer, n0, n1, format, args);
  va_end (args);
}
 416
/* Prints a syntax error message saying that one of the strings provided as
   varargs, up to the first NULL, is expected.

   The function name is parenthesized in this definition, which keeps any
   function-like macro of the same name from being expanded here.
   NOTE(review): presumably the header defines such a macro; confirm there. */
void
(lex_error_expecting) (struct lexer *lexer, ...)
{
  va_list args;

  va_start (args, lexer);
  lex_error_expecting_valist (lexer, args);
  va_end (args);
}
 428
 429 /* Prints a syntax error message saying that one of the options provided in
 430    ARGS, up to the first NULL, is expected. */
 431 void
 432 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 433 {
 434   enum { MAX_OPTIONS = 9 };
 435   const char *options[MAX_OPTIONS];
 436   int n = 0;
 437   while (n < MAX_OPTIONS)
 438     {
 439       const char *option = va_arg (args, const char *);
 440       if (!option)
 441         break;
 442
 443       options[n++] = option;
 444     }
 445   lex_error_expecting_array (lexer, options, n);
 446 }
 447
 448 void
 449 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 450 {
 451   switch (n)
 452     {
 453     case 0:
 454       lex_error (lexer, NULL);
 455       break;
 456
 457     case 1:
 458       lex_error (lexer, _("expecting %s"), options[0]);
 459       break;
 460
 461     case 2:
 462       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 463       break;
 464
 465     case 3:
 466       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 467                  options[2]);
 468       break;
 469
 470     case 4:
 471       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 472                  options[0], options[1], options[2], options[3]);
 473       break;
 474
 475     case 5:
 476       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 477                  options[0], options[1], options[2], options[3], options[4]);
 478       break;
 479
 480     case 6:
 481       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 482                  options[0], options[1], options[2], options[3], options[4],
 483                  options[5]);
 484       break;
 485
 486     case 7:
 487       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 488                  options[0], options[1], options[2], options[3], options[4],
 489                  options[5], options[6]);
 490       break;
 491
 492     case 8:
 493       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 494                  options[0], options[1], options[2], options[3], options[4],
 495                  options[5], options[6], options[7]);
 496       break;
 497
 498     default:
 499       lex_error (lexer, NULL);
 500     }
 501 }
 502
/* Reports an error to the effect that subcommand SBC may only be specified
   once.  SBC is the subcommand's name, which is inserted into the message.

   This function does not take a lexer as an argument or use lex_error(),
   because the result would ordinarily just be redundant: "Syntax error at
   SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
   not help the user find the error. */
void
lex_sbc_only_once (const char *sbc)
{
  msg (SE, _("Subcommand %s may only be specified once."), sbc);
}
 515
/* Reports an error to the effect that subcommand SBC is missing.  SBC is the
   subcommand's name, which is inserted into the message.

   This function does not take a lexer as an argument or use lex_error(),
   because a missing subcommand can normally be detected only after the whole
   command has been parsed, and so lex_error() would always report "Syntax
   error at end of command", which does not help the user find the error. */
void
lex_sbc_missing (const char *sbc)
{
  msg (SE, _("Required subcommand %s was not specified."), sbc);
}
 527
/* Reports an error to the effect that specification SPEC may only be specified
   once within subcommand SBC.

   Unlike lex_sbc_only_once(), this reports through lex_error() on LEXER. */
void
lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
{
  lex_error (lexer, _("%s may only be specified once within subcommand %s"),
             spec, sbc);
}
 536
 537 /* Reports an error to the effect that specification SPEC is missing within
 538    subcommand SBC. */
 539 void
 540 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 541 {
 542   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 543              sbc, spec);
 544 }
 545
 546 /* Prints a syntax error message containing the current token and
 547    given message MESSAGE (if non-null). */
 548 void
 549 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 550                        const char *format, va_list args)
 551 {
 552   struct lex_source *src = lex_source__ (lexer);
 553
 554   if (src != NULL)
 555     lex_source_error_valist (src, n0, n1, format, args);
 556   else
 557     {
 558       struct string s;
 559
 560       ds_init_empty (&s);
 561       ds_put_format (&s, _("Syntax error at end of input"));
 562       if (format != NULL)
 563         {
 564           ds_put_cstr (&s, ": ");
 565           ds_put_vformat (&s, format, args);
 566         }
 567       ds_put_byte (&s, '.');
 568       msg (SE, "%s", ds_cstr (&s));
 569       ds_destroy (&s);
 570     }
 571 }
 572
 573 /* Checks that we're at end of command.
 574    If so, returns a successful command completion code.
 575    If not, flags a syntax error and returns an error command
 576    completion code. */
 577 int
 578 lex_end_of_command (struct lexer *lexer)
 579 {
 580   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 581     {
 582       lex_error (lexer, _("expecting end of command"));
 583       return CMD_FAILURE;
 584     }
 585   else
 586     return CMD_SUCCESS;
 587 }
 588 \f
 589 /* Token testing functions. */
 590
/* Returns true if the current token is a number.

   Equivalent to lex_next_is_number (LEXER, 0). */
bool
lex_is_number (const struct lexer *lexer)
{
  return lex_next_is_number (lexer, 0);
}
 597
/* Returns true if the current token is a string.

   Equivalent to lex_next_is_string (LEXER, 0). */
bool
lex_is_string (const struct lexer *lexer)
{
  return lex_next_is_string (lexer, 0);
}
 604
/* Returns the value of the current token, which must be a
   floating point number.

   Equivalent to lex_next_number (LEXER, 0). */
double
lex_number (const struct lexer *lexer)
{
  return lex_next_number (lexer, 0);
}
 612
/* Returns true iff the current token is an integer.

   Equivalent to lex_next_is_integer (LEXER, 0). */
bool
lex_is_integer (const struct lexer *lexer)
{
  return lex_next_is_integer (lexer, 0);
}
 619
/* Returns the value of the current token, which must be an
   integer.

   Equivalent to lex_next_integer (LEXER, 0). */
long
lex_integer (const struct lexer *lexer)
{
  return lex_next_integer (lexer, 0);
}
 627 \f
 628 /* Token testing functions with lookahead.
 629
 630    A value of 0 for N as an argument to any of these functions refers to the
 631    current token.  Lookahead is limited to the current command.  Any N greater
 632    than the number of tokens remaining in the current command will be treated
 633    as referring to a T_ENDCMD token. */
 634
/* Returns true if the token N ahead of the current token is a number.

   The token is obtained with lex_next() and tested with token_is_number(). */
bool
lex_next_is_number (const struct lexer *lexer, int n)
{
  return token_is_number (lex_next (lexer, n));
}
 641
/* Returns true if the token N ahead of the current token is a string.

   The token is obtained with lex_next() and tested with token_is_string(). */
bool
lex_next_is_string (const struct lexer *lexer, int n)
{
  return token_is_string (lex_next (lexer, n));
}
 648
/* Returns the value of the token N ahead of the current token, which must be a
   floating point number.

   The token is obtained with lex_next() and converted with token_number(). */
double
lex_next_number (const struct lexer *lexer, int n)
{
  return token_number (lex_next (lexer, n));
}
 656
/* Returns true if the token N ahead of the current token is an integer.

   The token is obtained with lex_next() and tested with token_is_integer(). */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  return token_is_integer (lex_next (lexer, n));
}
 663
/* Returns the value of the token N ahead of the current token, which must be
   an integer.

   The token is obtained with lex_next() and converted with token_integer(). */
long
lex_next_integer (const struct lexer *lexer, int n)
{
  return token_integer (lex_next (lexer, n));
}
 671 \f
 672 /* Token matching functions. */
 673
 674 /* If the current token has the specified TYPE, skips it and returns true.
 675    Otherwise, returns false. */
 676 bool
 677 lex_match (struct lexer *lexer, enum token_type type)
 678 {
 679   if (lex_token (lexer) == type)
 680     {
 681       lex_get (lexer);
 682       return true;
 683     }
 684   else
 685     return false;
 686 }
 687
/* If the current token matches IDENTIFIER, skips it and returns true.
   IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
   returns false.

   IDENTIFIER must be an ASCII string.

   Equivalent to lex_match_id_n (LEXER, IDENTIFIER, 3). */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  return lex_match_id_n (lexer, identifier, 3);
}
 698
 699 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 700    may be abbreviated to its first N letters.  Otherwise, returns false.
 701
 702    IDENTIFIER must be an ASCII string. */
 703 bool
 704 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 705 {
 706   if (lex_token (lexer) == T_ID
 707       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 708     {
 709       lex_get (lexer);
 710       return true;
 711     }
 712   else
 713     return false;
 714 }
 715
 716 /* If the current token is integer X, skips it and returns true.  Otherwise,
 717    returns false. */
 718 bool
 719 lex_match_int (struct lexer *lexer, int x)
 720 {
 721   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 722     {
 723       lex_get (lexer);
 724       return true;
 725     }
 726   else
 727     return false;
 728 }
 729 \f
 730 /* Forced matches. */
 731
 732 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 733    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 734    false.
 735
 736    IDENTIFIER must be an ASCII string. */
 737 bool
 738 lex_force_match_id (struct lexer *lexer, const char *identifier)
 739 {
 740   if (lex_match_id (lexer, identifier))
 741     return true;
 742   else
 743     {
 744       lex_error_expecting (lexer, identifier);
 745       return false;
 746     }
 747 }
 748
 749 /* If the current token has the specified TYPE, skips it and returns true.
 750    Otherwise, reports an error and returns false. */
 751 bool
 752 lex_force_match (struct lexer *lexer, enum token_type type)
 753 {
 754   if (lex_token (lexer) == type)
 755     {
 756       lex_get (lexer);
 757       return true;
 758     }
 759   else
 760     {
 761       const char *type_string = token_type_to_string (type);
 762       if (type_string)
 763         {
 764           char *s = xasprintf ("`%s'", type_string);
 765           lex_error_expecting (lexer, s);
 766           free (s);
 767         }
 768       else
 769         lex_error_expecting (lexer, token_type_to_name (type));
 770
 771       return false;
 772     }
 773 }
 774
 775 /* If the current token is a string, does nothing and returns true.
 776    Otherwise, reports an error and returns false. */
 777 bool
 778 lex_force_string (struct lexer *lexer)
 779 {
 780   if (lex_is_string (lexer))
 781     return true;
 782   else
 783     {
 784       lex_error (lexer, _("expecting string"));
 785       return false;
 786     }
 787 }
 788
 789 /* If the current token is a string or an identifier, does nothing and returns
 790    true.  Otherwise, reports an error and returns false.
 791
 792    This is meant for use in syntactic situations where we want to encourage the
 793    user to supply a quoted string, but for compatibility we also accept
 794    identifiers.  (One example of such a situation is file names.)  Therefore,
 795    the error message issued when the current token is wrong only says that a
 796    string is expected and doesn't mention that an identifier would also be
 797    accepted. */
 798 bool
 799 lex_force_string_or_id (struct lexer *lexer)
 800 {
 801   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 802 }
 803
 804 /* If the current token is an integer, does nothing and returns true.
 805    Otherwise, reports an error and returns false. */
 806 bool
 807 lex_force_int (struct lexer *lexer)
 808 {
 809   if (lex_is_integer (lexer))
 810     return true;
 811   else
 812     {
 813       lex_error (lexer, _("expecting integer"));
 814       return false;
 815     }
 816 }
 817
 818 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 819    nothing and returns true.  Otherwise, reports an error and returns false.
 820    If NAME is nonnull, then it is used in the error message. */
 821 bool
 822 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 823 {
 824   bool is_integer = lex_is_integer (lexer);
 825   bool too_small = is_integer && lex_integer (lexer) < min;
 826   bool too_big = is_integer && lex_integer (lexer) > max;
 827   if (is_integer && !too_small && !too_big)
 828     return true;
 829
 830   if (min > max)
 831     {
 832       /* Weird, maybe a bug in the caller.  Just report that we needed an
 833          integer. */
 834       if (name)
 835         lex_error (lexer, _("Integer expected for %s."), name);
 836       else
 837         lex_error (lexer, _("Integer expected."));
 838     }
 839   else if (min == max)
 840     {
 841       if (name)
 842         lex_error (lexer, _("Expected %ld for %s."), min, name);
 843       else
 844         lex_error (lexer, _("Expected %ld."), min);
 845     }
 846   else if (min + 1 == max)
 847     {
 848       if (name)
 849         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 850       else
 851         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 852     }
 853   else
 854     {
 855       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 856       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 857
 858       if (report_lower_bound && report_upper_bound)
 859         {
 860           if (name)
 861             lex_error (lexer,
 862                        _("Expected integer between %ld and %ld for %s."),
 863                        min, max, name);
 864           else
 865             lex_error (lexer, _("Expected integer between %ld and %ld."),
 866                        min, max);
 867         }
 868       else if (report_lower_bound)
 869         {
 870           if (min == 0)
 871             {
 872               if (name)
 873                 lex_error (lexer, _("Expected non-negative integer for %s."),
 874                            name);
 875               else
 876                 lex_error (lexer, _("Expected non-negative integer."));
 877             }
 878           else if (min == 1)
 879             {
 880               if (name)
 881                 lex_error (lexer, _("Expected positive integer for %s."),
 882                            name);
 883               else
 884                 lex_error (lexer, _("Expected positive integer."));
 885             }
 886         }
 887       else if (report_upper_bound)
 888         {
 889           if (name)
 890             lex_error (lexer,
 891                        _("Expected integer less than or equal to %ld for %s."),
 892                        max, name);
 893           else
 894             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 895                        max);
 896         }
 897       else
 898         {
 899           if (name)
 900             lex_error (lexer, _("Integer expected for %s."), name);
 901           else
 902             lex_error (lexer, _("Integer expected."));
 903         }
 904     }
 905   return false;
 906 }
 907
 908 /* If the current token is a number, does nothing and returns true.
 909    Otherwise, reports an error and returns false. */
 910 bool
 911 lex_force_num (struct lexer *lexer)
 912 {
 913   if (lex_is_number (lexer))
 914     return true;
 915
 916   lex_error (lexer, _("expecting number"));
 917   return false;
 918 }
 919
 920 /* If the current token is an identifier, does nothing and returns true.
 921    Otherwise, reports an error and returns false. */
 922 bool
 923 lex_force_id (struct lexer *lexer)
 924 {
 925   if (lex_token (lexer) == T_ID)
 926     return true;
 927
 928   lex_error (lexer, _("expecting identifier"));
 929   return false;
 930 }
 931 \f
 932 /* Token accessors. */
 933
/* Returns the type of LEXER's current token.

   Equivalent to lex_next_token (LEXER, 0). */
enum token_type
lex_token (const struct lexer *lexer)
{
  return lex_next_token (lexer, 0);
}
 940
/* Returns the number in LEXER's current token.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero.

   Equivalent to lex_next_tokval (LEXER, 0). */
double
lex_tokval (const struct lexer *lexer)
{
  return lex_next_tokval (lexer, 0);
}
 950
 951 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 952
 953    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 954    this functions this function will always return NULL.
 955
 956    The UTF-8 encoding of the returned string is correct for variable names and
 957    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 958    data_in() to use it in a "union value".  */
 959 const char *
 960 lex_tokcstr (const struct lexer *lexer)
 961 {
 962   return lex_next_tokcstr (lexer, 0);
 963 }
 964
 965 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 966    null-terminated (but the null terminator is not included in the returned
 967    substring's 'length').
 968
 969    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 970    this functions this function will always return NULL.
 971
 972    The UTF-8 encoding of the returned string is correct for variable names and
 973    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 974    data_in() to use it in a "union value".  */
 975 struct substring
 976 lex_tokss (const struct lexer *lexer)
 977 {
 978   return lex_next_tokss (lexer, 0);
 979 }
 980 \f
 981 /* Looking ahead.
 982
 983    A value of 0 for N as an argument to any of these functions refers to the
 984    current token.  Lookahead is limited to the current command.  Any N greater
 985    than the number of tokens remaining in the current command will be treated
 986    as referring to a T_ENDCMD token. */
 987
 988 static const struct lex_token *
 989 lex_next__ (const struct lexer *lexer_, int n)
 990 {
 991   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 992   struct lex_source *src = lex_source__ (lexer);
 993
 994   if (src != NULL)
 995     return lex_source_next__ (src, n);
 996   else
 997     {
 998       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
 999       return &stop_token;
1000     }
1001 }
1002
1003 static const struct lex_token *
1004 lex_source_next__ (const struct lex_source *src_, int n)
1005 {
1006   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1007   while (lex_stage_count (&src->lookahead) <= n)
1008     {
1009       if (!lex_stage_is_empty (&src->lookahead))
1010         {
1011           const struct lex_token *t = lex_stage_last (&src->lookahead);
1012           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1013             return t;
1014         }
1015
1016       lex_source_get_lookahead (src);
1017     }
1018
1019   return lex_stage_nth (&src->lookahead, n);
1020 }
1021
1022 /* Returns the "struct token" of the token N after the current one in LEXER.
1023    The returned pointer can be invalidated by pretty much any succeeding call
1024    into the lexer, although the string pointer within the returned token is
1025    only invalidated by consuming the token (e.g. with lex_get()). */
1026 const struct token *
1027 lex_next (const struct lexer *lexer, int n)
1028 {
1029   return &lex_next__ (lexer, n)->token;
1030 }
1031
1032 /* Returns the type of the token N after the current one in LEXER. */
1033 enum token_type
1034 lex_next_token (const struct lexer *lexer, int n)
1035 {
1036   return lex_next (lexer, n)->type;
1037 }
1038
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_number (token);
}
1048
1049 /* Returns the null-terminated string in the token N after the current one, in
1050    UTF-8 encoding.
1051
1052    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1053    this functions this function will always return NULL.
1054
1055    The UTF-8 encoding of the returned string is correct for variable names and
1056    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1057    data_in() to use it in a "union value".  */
1058 const char *
1059 lex_next_tokcstr (const struct lexer *lexer, int n)
1060 {
1061   return lex_next_tokss (lexer, n).string;
1062 }
1063
1064 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1065    The string is null-terminated (but the null terminator is not included in
1066    the returned substring's 'length').
1067
1068    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1069    tokens this functions this function will always return NULL.
1070
1071    The UTF-8 encoding of the returned string is correct for variable names and
1072    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1073    data_in() to use it in a "union value".  */
1074 struct substring
1075 lex_next_tokss (const struct lexer *lexer, int n)
1076 {
1077   return lex_next (lexer, n)->string;
1078 }
1079
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, returns a newly allocated empty string, so
   that the caller can still use and free the result unconditionally. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source argument, so the
     no-source case must be handled here. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1092
1093 /* Returns true if the token N ahead of the current one was produced by macro
1094    expansion, false otherwise. */
1095 bool
1096 lex_next_is_from_macro (const struct lexer *lexer, int n)
1097 {
1098   return lex_next__ (lexer, n)->macro_rep != NULL;
1099 }
1100
1101 static bool
1102 lex_tokens_match (const struct token *actual, const struct token *expected)
1103 {
1104   if (actual->type != expected->type)
1105     return false;
1106
1107   switch (actual->type)
1108     {
1109     case T_POS_NUM:
1110     case T_NEG_NUM:
1111       return actual->number == expected->number;
1112
1113     case T_ID:
1114       return lex_id_match (expected->string, actual->string);
1115
1116     case T_STRING:
1117       return (actual->string.length == expected->string.length
1118               && !memcmp (actual->string.string, expected->string.string,
1119                           actual->string.length));
1120
1121     default:
1122       return true;
1123     }
1124 }
1125
1126 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1127    skips it and returns true.  Otherwise, returns false.
1128
1129    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1130    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
1131    first three letters. */
1132 bool
1133 lex_match_phrase (struct lexer *lexer, const char *s)
1134 {
1135   struct string_lexer slex;
1136   struct token token;
1137   int i;
1138
1139   i = 0;
1140   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1141   while (string_lexer_next (&slex, &token))
1142     {
1143       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1144       token_uninit (&token);
1145       if (!match)
1146         return false;
1147     }
1148
1149   while (i-- > 0)
1150     lex_get (lexer);
1151   return true;
1152 }
1153
/* Returns the number of '\n' bytes among the LENGTH bytes starting at S. */
static int
count_newlines (char *s, size_t length)
{
  const char *end = s + length;
  int n_newlines = 0;

  for (const char *p = s; p < end; p++)
    if (*p == '\n')
      n_newlines++;

  return n_newlines;
}
1169
1170 static int
1171 lex_token_get_last_line_number (const struct lex_source *src,
1172                                 const struct lex_token *token)
1173 {
1174   if (token->first_line == 0)
1175     return 0;
1176   else
1177     {
1178       char *token_str = &src->buffer[token->token_pos - src->tail];
1179       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1180     }
1181 }
1182
1183 static int
1184 count_columns (const char *s_, size_t length)
1185 {
1186   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1187   int columns;
1188   size_t ofs;
1189   int mblen;
1190
1191   columns = 0;
1192   for (ofs = 0; ofs < length; ofs += mblen)
1193     {
1194       ucs4_t uc;
1195
1196       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1197       if (uc != '\t')
1198         {
1199           int width = uc_width (uc, "UTF-8");
1200           if (width > 0)
1201             columns += width;
1202         }
1203       else
1204         columns = ROUND_UP (columns + 1, 8);
1205     }
1206
1207   return columns + 1;
1208 }
1209
1210 static int
1211 lex_token_get_first_column (const struct lex_source *src,
1212                             const struct lex_token *token)
1213 {
1214   return count_columns (&src->buffer[token->line_pos - src->tail],
1215                         token->token_pos - token->line_pos);
1216 }
1217
1218 static int
1219 lex_token_get_last_column (const struct lex_source *src,
1220                            const struct lex_token *token)
1221 {
1222   char *start, *end, *newline;
1223
1224   start = &src->buffer[token->line_pos - src->tail];
1225   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1226   newline = memrchr (start, '\n', end - start);
1227   if (newline != NULL)
1228     start = newline + 1;
1229   return count_columns (start, end - start);
1230 }
1231
1232 static struct msg_location
1233 lex_token_location (const struct lex_source *src,
1234                     const struct lex_token *t0,
1235                     const struct lex_token *t1)
1236 {
1237   return (struct msg_location) {
1238     .file_name = src->reader->file_name,
1239     .first_line = t0->first_line,
1240     .last_line = lex_token_get_last_line_number (src, t1),
1241     .first_column = lex_token_get_first_column (src, t0),
1242     .last_column = lex_token_get_last_column (src, t1),
1243   };
1244 }
1245
1246 static struct msg_location *
1247 lex_token_location_rw (const struct lex_source *src,
1248                        const struct lex_token *t0,
1249                        const struct lex_token *t1)
1250 {
1251   struct msg_location location = lex_token_location (src, t0, t1);
1252   return msg_location_dup (&location);
1253 }
1254
/* Returns a msg_location, obtained from lex_token_location_rw(), for the
   syntax in SRC from the token N0 ahead of the current one through the token
   N1 ahead of the current one. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *t0 = lex_source_next__ (src, n0);
  const struct lex_token *t1 = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, t0, t1);
}
1262
1263 /* Returns the 1-based line number of the start of the syntax that represents
1264    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1265    if the token is drawn from a source that does not have line numbers. */
1266 int
1267 lex_get_first_line_number (const struct lexer *lexer, int n)
1268 {
1269   const struct lex_source *src = lex_source__ (lexer);
1270   return src ? lex_source_next__ (src, n)->first_line : 0;
1271 }
1272
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   source, for a T_STOP token, or if the token is drawn from a source that
   does not have line numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1290
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1304
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1318
1319 /* Returns the name of the syntax file from which the current command is drawn.
1320    Returns NULL for a T_STOP token or if the command's source does not have
1321    line numbers.
1322
1323    There is no version of this function that takes an N argument because
1324    lookahead only works to the end of a command and any given command is always
1325    within a single syntax file. */
1326 const char *
1327 lex_get_file_name (const struct lexer *lexer)
1328 {
1329   struct lex_source *src = lex_source__ (lexer);
1330   return src == NULL ? NULL : src->reader->file_name;
1331 }
1332
1333 /* Returns a newly allocated msg_location for the syntax that represents tokens
1334    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1335    must eventually free the location (with msg_location_destroy()). */
1336 struct msg_location *
1337 lex_get_location (const struct lexer *lexer, int n0, int n1)
1338 {
1339   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1340   loc->first_column = lex_get_first_column (lexer, n0);
1341   loc->last_column = lex_get_last_column (lexer, n1);
1342   return loc;
1343 }
1344
1345 /* Returns a newly allocated msg_location for the syntax that represents tokens
1346    with 0-based offsets N0...N1, inclusive, from the current token.  The
1347    location only covers the tokens' lines, not the columns.  The caller must
1348    eventually free the location (with msg_location_destroy()). */
1349 struct msg_location *
1350 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1351 {
1352   struct msg_location *loc = xmalloc (sizeof *loc);
1353   *loc = (struct msg_location) {
1354     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1355     .first_line = lex_get_first_line_number (lexer, n0),
1356     .last_line = lex_get_last_line_number (lexer, n1),
1357   };
1358   return loc;
1359 }
1360
1361 const char *
1362 lex_get_encoding (const struct lexer *lexer)
1363 {
1364   struct lex_source *src = lex_source__ (lexer);
1365   return src == NULL ? NULL : src->reader->encoding;
1366 }
1367
1368 /* Returns the syntax mode for the syntax file from which the current drawn is
1369    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1370    does not have line numbers.
1371
1372    There is no version of this function that takes an N argument because
1373    lookahead only works to the end of a command and any given command is always
1374    within a single syntax file. */
1375 enum segmenter_mode
1376 lex_get_syntax_mode (const struct lexer *lexer)
1377 {
1378   struct lex_source *src = lex_source__ (lexer);
1379   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1380 }
1381
1382 /* Returns the error mode for the syntax file from which the current drawn is
1383    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1384    source does not have line numbers.
1385
1386    There is no version of this function that takes an N argument because
1387    lookahead only works to the end of a command and any given command is always
1388    within a single syntax file. */
1389 enum lex_error_mode
1390 lex_get_error_mode (const struct lexer *lexer)
1391 {
1392   struct lex_source *src = lex_source__ (lexer);
1393   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1394 }
1395
1396 /* If the source that LEXER is currently reading has error mode
1397    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1398    token to be read comes directly from whatever is next read from the stream.
1399
1400    It makes sense to call this function after encountering an error in a
1401    command entered on the console, because usually the user would prefer not to
1402    have cascading errors. */
1403 void
1404 lex_interactive_reset (struct lexer *lexer)
1405 {
1406   struct lex_source *src = lex_source__ (lexer);
1407   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1408     {
1409       src->head = src->tail = 0;
1410       src->journal_pos = src->seg_pos = src->line_pos = 0;
1411       src->n_newlines = 0;
1412       src->suppress_next_newline = false;
1413       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1414                                        false);
1415       lex_stage_clear (&src->pp);
1416       lex_stage_clear (&src->merge);
1417       lex_stage_clear (&src->lookahead);
1418       lex_source_push_endcmd__ (src);
1419     }
1420 }
1421
1422 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1423 void
1424 lex_discard_rest_of_command (struct lexer *lexer)
1425 {
1426   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1427     lex_get (lexer);
1428 }
1429
1430 /* Discards all lookahead tokens in LEXER, then discards all input sources
1431    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1432    runs out of input sources. */
1433 void
1434 lex_discard_noninteractive (struct lexer *lexer)
1435 {
1436   struct lex_source *src = lex_source__ (lexer);
1437
1438   if (src != NULL)
1439     {
1440       lex_stage_clear (&src->pp);
1441       lex_stage_clear (&src->merge);
1442       lex_stage_clear (&src->lookahead);
1443
1444       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1445            src = lex_source__ (lexer))
1446         lex_source_destroy (src);
1447     }
1448 }
1449 \f
1450 static size_t
1451 lex_source_max_tail__ (const struct lex_source *src_)
1452 {
1453   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1454
1455   assert (src->seg_pos >= src->line_pos);
1456   size_t max_tail = MIN (src->journal_pos, src->line_pos);
1457
1458   /* Use the oldest token also. */
1459   struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp };
1460   for (size_t i = 0; i < sizeof stages / sizeof *stages; i++)
1461     if (!lex_stage_is_empty (stages[i]))
1462       {
1463         struct lex_token *first = lex_stage_first (stages[i]);
1464         assert (first->token_pos >= first->line_pos);
1465         return MIN (max_tail, first->line_pos);
1466       }
1467
1468   return max_tail;
1469 }
1470
1471 static void
1472 lex_source_expand__ (struct lex_source *src)
1473 {
1474   if (src->head - src->tail >= src->allocated)
1475     {
1476       size_t max_tail = lex_source_max_tail__ (src);
1477       if (max_tail > src->tail)
1478         {
1479           /* Advance the tail, freeing up room at the head. */
1480           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1481                    src->head - max_tail);
1482           src->tail = max_tail;
1483         }
1484       else
1485         {
1486           /* Buffer is completely full.  Expand it. */
1487           src->buffer = x2realloc (src->buffer, &src->allocated);
1488         }
1489     }
1490   else
1491     {
1492       /* There's space available at the head of the buffer.  Nothing to do. */
1493     }
1494 }
1495
1496 static void
1497 lex_source_read__ (struct lex_source *src)
1498 {
1499   do
1500     {
1501       lex_source_expand__ (src);
1502
1503       size_t head_ofs = src->head - src->tail;
1504       size_t space = src->allocated - head_ofs;
1505       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1506       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1507                                            space, prompt);
1508       assert (n <= space);
1509
1510       if (n == 0)
1511         {
1512           /* End of input. */
1513           src->reader->eof = true;
1514           lex_source_expand__ (src);
1515           return;
1516         }
1517
1518       src->head += n;
1519     }
1520   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1521                   src->head - src->seg_pos));
1522 }
1523
1524 static struct lex_source *
1525 lex_source__ (const struct lexer *lexer)
1526 {
1527   return (ll_is_empty (&lexer->sources) ? NULL
1528           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1529 }
1530
1531 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1532    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1533    and N1 are both zero, this requests the syntax for the current token.)  The
1534    caller must eventually free the returned string (with free()).  The syntax
1535    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1536    for example, it may include comments, spaces, and new-lines if it spans
1537    multiple tokens.  Macro expansion, however, has already been performed. */
1538 static char *
1539 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1540 {
1541   struct string s = DS_EMPTY_INITIALIZER;
1542   for (size_t i = n0; i <= n1; )
1543     {
1544       /* Find [I,J) as the longest sequence of tokens not produced by macro
1545          expansion, or otherwise the longest sequence expanded from a single
1546          macro call. */
1547       const struct lex_token *first = lex_source_next__ (src, i);
1548       size_t j;
1549       for (j = i + 1; j <= n1; j++)
1550         {
1551           const struct lex_token *cur = lex_source_next__ (src, j);
1552           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1553               || first->macro_rep != cur->macro_rep)
1554             break;
1555         }
1556       const struct lex_token *last = lex_source_next__ (src, j - 1);
1557
1558       /* Now add the syntax for this sequence of tokens to SRC. */
1559       if (!ds_is_empty (&s))
1560         ds_put_byte (&s, ' ');
1561       if (!first->macro_rep)
1562         {
1563           size_t start = first->token_pos;
1564           size_t end = last->token_pos + last->token_len;
1565           ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1566                                            end - start));
1567         }
1568       else
1569         {
1570           size_t start = first->ofs;
1571           size_t end = last->ofs + last->len;
1572           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1573                                            end - start));
1574         }
1575
1576       i = j;
1577     }
1578   return ds_steal_cstr (&s);
1579 }
1580
1581 static bool
1582 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1583 {
1584   for (size_t i = n0; i <= n1; i++)
1585     if (lex_source_next__ (src, i)->macro_rep)
1586       return true;
1587   return false;
1588 }
1589
1590 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1591    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1592    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1593    the original form supplied to the lexer so that, for example, it may include
1594    comments, spaces, and new-lines if it spans multiple tokens.
1595
1596    Returns an empty string if the token range doesn't include a macro call.
1597
1598    The caller must not modify or free the returned string. */
1599 static struct substring
1600 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1601 {
1602   if (!lex_source_contains_macro_call (src, n0, n1))
1603     return ss_empty ();
1604
1605   const struct lex_token *token0 = lex_source_next__ (src, n0);
1606   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1607   size_t start = token0->token_pos;
1608   size_t end = token1->token_pos + token1->token_len;
1609
1610   return ss_buffer (&src->buffer[start - src->tail], end - start);
1611 }
1612
1613 static void
1614 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1615                          const char *format, va_list args)
1616 {
1617   const struct lex_token *token;
1618   struct string s;
1619
1620   ds_init_empty (&s);
1621
1622   token = lex_source_next__ (src, n0);
1623   if (token->token.type == T_ENDCMD)
1624     ds_put_cstr (&s, _("Syntax error at end of command"));
1625   else
1626     {
1627       /* Get the syntax that caused the error. */
1628       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1629       char syntax[64];
1630       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1631       free (raw_syntax);
1632
1633       /* Get the macro call(s) that expanded to the syntax that caused the
1634          error. */
1635       char call[64];
1636       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1637                      call, sizeof call);
1638
1639       if (syntax[0])
1640         {
1641           if (call[0])
1642             ds_put_format (&s,
1643                            _("Syntax error at `%s' (in expansion of `%s')"),
1644                            syntax, call);
1645           else
1646             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1647         }
1648       else
1649         {
1650           if (call[0])
1651             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1652                            call);
1653           else
1654             ds_put_cstr (&s, _("Syntax error"));
1655         }
1656     }
1657
1658   if (format)
1659     {
1660       ds_put_cstr (&s, ": ");
1661       ds_put_vformat (&s, format, args);
1662     }
1663   if (ds_last (&s) != '.')
1664     ds_put_byte (&s, '.');
1665
1666   struct msg *m = xmalloc (sizeof *m);
1667   *m = (struct msg) {
1668     .category = MSG_C_SYNTAX,
1669     .severity = MSG_S_ERROR,
1670     .location = lex_source_get_location (src, n0, n1),
1671     .text = ds_steal_cstr (&s),
1672   };
1673   msg_emit (m);
1674 }
1675
1676 static void
1677 lex_get_error (struct lex_source *src, const struct lex_token *token)
1678 {
1679   char syntax[64];
1680   str_ellipsize (ss_buffer (&src->buffer[token->token_pos - src->tail],
1681                             token->token_len),
1682                  syntax, sizeof syntax);
1683
1684   struct string s = DS_EMPTY_INITIALIZER;
1685   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1686   ds_put_format (&s, ": %s", token->token.string.string);
1687
1688   struct msg *m = xmalloc (sizeof *m);
1689   *m = (struct msg) {
1690     .category = MSG_C_SYNTAX,
1691     .severity = MSG_S_ERROR,
1692     .location = lex_token_location_rw (src, token, token),
1693     .text = ds_steal_cstr (&s),
1694   };
1695   msg_emit (m);
1696 }
1697
1698 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1699    underlying lex_reader if necessary.  Returns true if a new token was added
1700    to SRC's deque, false otherwise.  The caller should retry failures unless
1701    SRC's 'eof' marker was set to true indicating that there will be no more
1702    tokens from this source. */
1703 static bool
1704 lex_source_try_get_pp (struct lex_source *src)
1705 {
1706   /* Append a new token to SRC and initialize it. */
1707   struct lex_token *token = xmalloc (sizeof *token);
1708   token->token = (struct token) { .type = T_STOP };
1709   token->macro_rep = NULL;
1710   token->ref_cnt = NULL;
1711   token->line_pos = src->line_pos;
1712   token->token_pos = src->seg_pos;
1713   if (src->reader->line_number > 0)
1714     token->first_line = src->reader->line_number + src->n_newlines;
1715   else
1716     token->first_line = 0;
1717
1718   /* Extract a segment. */
1719   const char *segment;
1720   enum segment_type seg_type;
1721   int seg_len;
1722   for (;;)
1723     {
1724       segment = &src->buffer[src->seg_pos - src->tail];
1725       seg_len = segmenter_push (&src->segmenter, segment,
1726                                 src->head - src->seg_pos,
1727                                 src->reader->eof, &seg_type);
1728       if (seg_len >= 0)
1729         break;
1730
1731       /* The segmenter needs more input to produce a segment. */
1732       assert (!src->reader->eof);
1733       lex_source_read__ (src);
1734     }
1735
1736   /* Update state based on the segment. */
1737   token->token_len = seg_len;
1738   src->seg_pos += seg_len;
1739   if (seg_type == SEG_NEWLINE)
1740     {
1741       src->line_pos = src->seg_pos;
1742       src->n_newlines++;
1743     }
1744
1745   /* Get a token from the segment. */
1746   enum tokenize_result result = token_from_segment (
1747     seg_type, ss_buffer (segment, seg_len), &token->token);
1748
1749   /* If we've reached the end of a line, or the end of a command, then pass
1750      the line to the output engine as a syntax text item.  */
1751   int n_lines = seg_type == SEG_NEWLINE;
1752   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1753     {
1754       n_lines++;
1755       src->suppress_next_newline = true;
1756     }
1757   else if (n_lines > 0 && src->suppress_next_newline)
1758     {
1759       n_lines--;
1760       src->suppress_next_newline = false;
1761     }
1762   for (int i = 0; i < n_lines; i++)
1763     {
1764       /* Beginning of line. */
1765       const char *line = &src->buffer[src->journal_pos - src->tail];
1766
1767       /* Calculate line length, including \n or \r\n end-of-line if present.
1768
1769          We use src->head even though that may be beyond what we've actually
1770          converted to tokens (which is only through line_pos).  That's because,
1771          if we're emitting the line due to SEG_END_COMMAND, we want to take the
1772          whole line through the newline, not just through the '.'. */
1773       size_t max_len = src->head - src->journal_pos;
1774       const char *newline = memchr (line, '\n', max_len);
1775       size_t line_len = newline ? newline - line + 1 : max_len;
1776
1777       /* Calculate line length excluding end-of-line. */
1778       size_t copy_len = line_len;
1779       if (copy_len > 0 && line[copy_len - 1] == '\n')
1780         copy_len--;
1781       if (copy_len > 0 && line[copy_len - 1] == '\r')
1782         copy_len--;
1783
1784       /* Submit the line as syntax. */
1785       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1786                                                    xmemdup0 (line, copy_len),
1787                                                    NULL));
1788
1789       src->journal_pos += line_len;
1790     }
1791
1792   switch (result)
1793     {
1794     case TOKENIZE_ERROR:
1795       lex_get_error (src, token);
1796       /* Fall through. */
1797     case TOKENIZE_EMPTY:
1798       lex_token_destroy (token);
1799       return false;
1800
1801     case TOKENIZE_TOKEN:
1802       if (token->token.type == T_STOP)
1803         {
1804           token->token.type = T_ENDCMD;
1805           src->eof = true;
1806         }
1807       lex_stage_push_last (&src->pp, token);
1808       return true;
1809     }
1810   NOT_REACHED ();
1811 }
1812
1813 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1814    failure.  On failure, the end of SRC has been reached and no more tokens
1815    will be forthcoming from it.
1816
1817    Does not make the new token available for lookahead yet; the caller must
1818    adjust SRC's 'middle' pointer to do so. */
1819 static bool
1820 lex_source_get_pp (struct lex_source *src)
1821 {
1822   while (!src->eof)
1823     if (lex_source_try_get_pp (src))
1824       return true;
1825   return false;
1826 }
1827
1828 static bool
1829 lex_source_try_get_merge (const struct lex_source *src_)
1830 {
1831   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1832
1833   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1834     return false;
1835
1836   if (!settings_get_mexpand ())
1837     {
1838       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1839       return true;
1840     }
1841
1842   /* Now pass tokens one-by-one to the macro expander.
1843
1844      In the common case where there is no macro to expand, the loop is not
1845      entered.  */
1846   struct macro_call *mc;
1847   int n_call = macro_call_create (src->lexer->macros,
1848                                   &lex_stage_first (&src->pp)->token, &mc);
1849   for (int ofs = 1; !n_call; ofs++)
1850     {
1851       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1852         {
1853           /* This should not be reachable because we always get a T_ENDCMD at
1854              the end of an input file (transformed from T_STOP by
1855              lex_source_try_get_pp()) and the macro_expander should always
1856              terminate expansion on T_ENDCMD. */
1857           NOT_REACHED ();
1858         }
1859
1860       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1861       size_t start = t->token_pos;
1862       size_t end = t->token_pos + t->token_len;
1863       const struct macro_token mt = {
1864         .token = t->token,
1865         .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
1866       };
1867       const struct msg_location loc = lex_token_location (src, t, t);
1868       n_call = macro_call_add (mc, &mt, &loc);
1869     }
1870   if (n_call < 0)
1871     {
1872       /* False alarm: no macro expansion after all.  Use first token as
1873          lookahead.  We'll retry macro expansion from the second token next
1874          time around. */
1875       macro_call_destroy (mc);
1876       lex_stage_shift (&src->merge, &src->pp, 1);
1877       return true;
1878     }
1879
1880   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1881      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1882      Expand them.  */
1883   const struct lex_token *c0 = lex_stage_first (&src->pp);
1884   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1885   struct macro_tokens expansion = { .n = 0 };
1886   struct msg_location loc = lex_token_location (src, c0, c1);
1887   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1888   macro_call_destroy (mc);
1889
1890   /* Convert the macro expansion into syntax for possible error messages
1891      later. */
1892   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1893   size_t *len = xnmalloc (expansion.n, sizeof *len);
1894   struct string s = DS_EMPTY_INITIALIZER;
1895   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1896
1897   if (settings_get_mprint ())
1898     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1899                                           _("Macro Expansion")));
1900
1901   /* Append the macro expansion tokens to the lookahead. */
1902   if (expansion.n > 0)
1903     {
1904       char *macro_rep = ds_steal_cstr (&s);
1905       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1906       *ref_cnt = expansion.n;
1907       for (size_t i = 0; i < expansion.n; i++)
1908         {
1909           struct lex_token *token = xmalloc (sizeof *token);
1910           *token = (struct lex_token) {
1911             .token = expansion.mts[i].token,
1912             .token_pos = c0->token_pos,
1913             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1914             .line_pos = c0->line_pos,
1915             .first_line = c0->first_line,
1916             .macro_rep = macro_rep,
1917             .ofs = ofs[i],
1918             .len = len[i],
1919             .ref_cnt = ref_cnt,
1920           };
1921           lex_stage_push_last (&src->merge, token);
1922
1923           ss_dealloc (&expansion.mts[i].syntax);
1924         }
1925     }
1926   else
1927     ds_destroy (&s);
1928   free (expansion.mts);
1929   free (ofs);
1930   free (len);
1931
1932   /* Destroy the tokens for the call. */
1933   for (size_t i = 0; i < n_call; i++)
1934     lex_stage_pop_first (&src->pp);
1935
1936   return expansion.n > 0;
1937 }
1938
1939 /* Attempts to obtain at least one new token into 'merge' in SRC.
1940
1941    Returns true if successful, false on failure.  In the latter case, SRC is
1942    exhausted and 'src->eof' is now true. */
1943 static bool
1944 lex_source_get_merge (struct lex_source *src)
1945 {
1946   while (!src->eof)
1947     if (lex_source_try_get_merge (src))
1948       return true;
1949   return false;
1950 }
1951
1952 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1953
1954    Returns true if successful, false on failure.  In the latter case, SRC is
1955    exhausted and 'src->eof' is now true. */
1956 static bool
1957 lex_source_get_lookahead (struct lex_source *src)
1958 {
1959   struct merger m = MERGER_INIT;
1960   struct token out;
1961   for (size_t i = 0; ; i++)
1962     {
1963       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1964         {
1965           /* We always get a T_ENDCMD at the end of an input file
1966              (transformed from T_STOP by lex_source_try_get_pp()) and
1967              merger_add() should never return -1 on T_ENDCMD. */
1968           assert (lex_stage_is_empty (&src->merge));
1969           return false;
1970         }
1971
1972       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1973                                &out);
1974       if (!retval)
1975         {
1976           lex_stage_shift (&src->lookahead, &src->merge, 1);
1977           return true;
1978         }
1979       else if (retval > 0)
1980         {
1981           /* Add a token that merges all the tokens together. */
1982           const struct lex_token *first = lex_stage_first (&src->merge);
1983           const struct lex_token *last = lex_stage_nth (&src->merge,
1984                                                         retval - 1);
1985           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1986           struct lex_token *t = xmalloc (sizeof *t);
1987           *t = (struct lex_token) {
1988             .token = out,
1989             .token_pos = first->token_pos,
1990             .token_len = (last->token_pos - first->token_pos) + last->token_len,
1991             .line_pos = first->line_pos,
1992             .first_line = first->first_line,
1993
1994             /* This works well if all the tokens were not expanded from macros,
1995                or if they came from the same macro expansion.  It just gives up
1996                in the other (corner) cases. */
1997             .macro_rep = macro ? first->macro_rep : NULL,
1998             .ofs = macro ? first->ofs : 0,
1999             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2000             .ref_cnt = macro ? first->ref_cnt : NULL,
2001           };
2002           if (t->ref_cnt)
2003             ++*t->ref_cnt;
2004           lex_stage_push_last (&src->lookahead, t);
2005
2006           for (int i = 0; i < retval; i++)
2007             lex_stage_pop_first (&src->merge);
2008           return true;
2009         }
2010     }
2011 }
2012 \f
2013 static void
2014 lex_source_push_endcmd__ (struct lex_source *src)
2015 {
2016   assert (lex_stage_is_empty (&src->lookahead));
2017   struct lex_token *token = xmalloc (sizeof *token);
2018   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2019   lex_stage_push_last (&src->lookahead, token);
2020 }
2021
2022 static struct lex_source *
2023 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2024 {
2025   struct lex_source *src = xmalloc (sizeof *src);
2026   *src = (struct lex_source) {
2027     .reader = reader,
2028     .segmenter = segmenter_init (reader->syntax, false),
2029     .lexer = lexer,
2030   };
2031
2032   lex_source_push_endcmd__ (src);
2033
2034   return src;
2035 }
2036
2037 static void
2038 lex_source_destroy (struct lex_source *src)
2039 {
2040   char *file_name = src->reader->file_name;
2041   char *encoding = src->reader->encoding;
2042   if (src->reader->class->destroy != NULL)
2043     src->reader->class->destroy (src->reader);
2044   free (file_name);
2045   free (encoding);
2046   free (src->buffer);
2047   lex_stage_uninit (&src->pp);
2048   lex_stage_uninit (&src->merge);
2049   lex_stage_uninit (&src->lookahead);
2050   ll_remove (&src->ll);
2051   free (src);
2052 }
2053 \f
2054 struct lex_file_reader
2055   {
2056     struct lex_reader reader;
2057     struct u8_istream *istream;
2058   };
2059
2060 static struct lex_reader_class lex_file_reader_class;
2061
2062 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2063    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2064    ENCODING, which should take one of the forms accepted by
2065    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2066    mode of the new reader, respectively.
2067
2068    Returns a null pointer if FILE_NAME cannot be opened. */
2069 struct lex_reader *
2070 lex_reader_for_file (const char *file_name, const char *encoding,
2071                      enum segmenter_mode syntax,
2072                      enum lex_error_mode error)
2073 {
2074   struct lex_file_reader *r;
2075   struct u8_istream *istream;
2076
2077   istream = (!strcmp(file_name, "-")
2078              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2079              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2080   if (istream == NULL)
2081     {
2082       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2083       return NULL;
2084     }
2085
2086   r = xmalloc (sizeof *r);
2087   lex_reader_init (&r->reader, &lex_file_reader_class);
2088   r->reader.syntax = syntax;
2089   r->reader.error = error;
2090   r->reader.file_name = xstrdup (file_name);
2091   r->reader.encoding = xstrdup_if_nonnull (encoding);
2092   r->reader.line_number = 1;
2093   r->istream = istream;
2094
2095   return &r->reader;
2096 }
2097
2098 static struct lex_file_reader *
2099 lex_file_reader_cast (struct lex_reader *r)
2100 {
2101   return UP_CAST (r, struct lex_file_reader, reader);
2102 }
2103
2104 static size_t
2105 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2106                enum prompt_style prompt_style UNUSED)
2107 {
2108   struct lex_file_reader *r = lex_file_reader_cast (r_);
2109   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2110   if (n_read < 0)
2111     {
2112       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2113       return 0;
2114     }
2115   return n_read;
2116 }
2117
2118 static void
2119 lex_file_close (struct lex_reader *r_)
2120 {
2121   struct lex_file_reader *r = lex_file_reader_cast (r_);
2122
2123   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2124     {
2125       if (u8_istream_close (r->istream) != 0)
2126         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2127     }
2128   else
2129     u8_istream_free (r->istream);
2130
2131   free (r);
2132 }
2133
2134 static struct lex_reader_class lex_file_reader_class =
2135   {
2136     lex_file_read,
2137     lex_file_close
2138   };
2139 \f
2140 struct lex_string_reader
2141   {
2142     struct lex_reader reader;
2143     struct substring s;
2144     size_t offset;
2145   };
2146
2147 static struct lex_reader_class lex_string_reader_class;
2148
2149 /* Creates and returns a new lex_reader for the contents of S, which must be
2150    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2151    with ss_dealloc() when it is closed. */
2152 struct lex_reader *
2153 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2154 {
2155   struct lex_string_reader *r;
2156
2157   r = xmalloc (sizeof *r);
2158   lex_reader_init (&r->reader, &lex_string_reader_class);
2159   r->reader.syntax = SEG_MODE_AUTO;
2160   r->reader.encoding = xstrdup_if_nonnull (encoding);
2161   r->s = s;
2162   r->offset = 0;
2163
2164   return &r->reader;
2165 }
2166
2167 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2168    which must be encoded in ENCODING.  The caller retains ownership of S. */
2169 struct lex_reader *
2170 lex_reader_for_string (const char *s, const char *encoding)
2171 {
2172   struct substring ss;
2173   ss_alloc_substring (&ss, ss_cstr (s));
2174   return lex_reader_for_substring_nocopy (ss, encoding);
2175 }
2176
/* Formats FORMAT, a printf()-like format string, with the variable
   arguments, and creates and returns a new lex_reader for the formatted
   result, which takes ENCODING as its encoding.

   Note that the variable arguments follow ENCODING in the argument list, not
   FORMAT. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *formatted = xvasprintf (format, args);
  va_end (args);

  /* The reader takes ownership of the formatted string. */
  return lex_reader_for_substring_nocopy (ss_cstr (formatted), encoding);
}
2191
2192 static struct lex_string_reader *
2193 lex_string_reader_cast (struct lex_reader *r)
2194 {
2195   return UP_CAST (r, struct lex_string_reader, reader);
2196 }
2197
2198 static size_t
2199 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2200                  enum prompt_style prompt_style UNUSED)
2201 {
2202   struct lex_string_reader *r = lex_string_reader_cast (r_);
2203   size_t chunk;
2204
2205   chunk = MIN (n, r->s.length - r->offset);
2206   memcpy (buf, r->s.string + r->offset, chunk);
2207   r->offset += chunk;
2208
2209   return chunk;
2210 }
2211
2212 static void
2213 lex_string_close (struct lex_reader *r_)
2214 {
2215   struct lex_string_reader *r = lex_string_reader_cast (r_);
2216
2217   ss_dealloc (&r->s);
2218   free (r);
2219 }
2220
2221 static struct lex_reader_class lex_string_reader_class =
2222   {
2223     lex_string_read,
2224     lex_string_close
2225   };