pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/macro.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source.

   Besides the token itself, this records the token's location in the
   lex_source's buffer and, for a token produced by macro expansion, its
   location within the text of the expansion. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call.

       src->tail <= line_pos <= token_pos <= src->head. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, these members describe
       just this token within the expansion.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       When 'ref_cnt' is nonnull, lex_token_destroy() decrements '*ref_cnt'
       and frees both 'macro_rep' and 'ref_cnt' once the count reaches
       zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  86
  87 static void
  88 lex_token_destroy (struct lex_token *t)
  89 {
  90   token_uninit (&t->token);
  91   if (t->ref_cnt)
  92     {
  93       assert (*t->ref_cnt > 0);
  94       if (!--*t->ref_cnt)
  95         {
  96           free (t->macro_rep);
  97           free (t->ref_cnt);
  98         }
  99     }
 100   free (t);
 101 }
 102 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source.

   The helpers below treat the deque's back as the "first" token (least
   lookahead) and the deque's front as the "last" token (greatest
   lookahead). */
struct lex_stage
  {
    struct deque deque;             /* Index bookkeeping for 'tokens'. */
    struct lex_token **tokens;      /* Token pointers, indexed via 'deque'. */
  };
 110
/* Forward declarations for the lex_stage helpers defined below. */

/* Destruction. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

/* Size queries. */
static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

/* Element access. */
static struct lex_token *lex_stage_last (struct lex_stage *);
static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

/* Adding and removing tokens. */
static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

/* Moving tokens between stages. */
static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
 126
 127 /* Deletes all the tokens from STAGE. */
 128 static void
 129 lex_stage_clear (struct lex_stage *stage)
 130 {
 131   while (!deque_is_empty (&stage->deque))
 132     lex_stage_pop_first (stage);
 133 }
 134
 135 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 136 static void
 137 lex_stage_uninit (struct lex_stage *stage)
 138 {
 139   lex_stage_clear (stage);
 140   free (stage->tokens);
 141 }
 142
 143 /* Returns true if STAGE contains no tokens, otherwise false. */
 144 static bool
 145 lex_stage_is_empty (const struct lex_stage *stage)
 146 {
 147   return deque_is_empty (&stage->deque);
 148 }
 149
 150 /* Returns the number of tokens in STAGE. */
 151 static size_t
 152 lex_stage_count (const struct lex_stage *stage)
 153 {
 154   return deque_count (&stage->deque);
 155 }
 156
 157 /* Returns the last token in STAGE, which must be nonempty.  The last token is
 158    the one accessed with the greatest lookahead. */
 159 static struct lex_token *
 160 lex_stage_last (struct lex_stage *stage)
 161 {
 162   return stage->tokens[deque_front (&stage->deque, 0)];
 163 }
 164
 165 /* Returns the first token in STAGE, which must be nonempty.
 166    The first token is the one accessed with the least lookahead. */
 167 static struct lex_token *
 168 lex_stage_first (struct lex_stage *stage)
 169 {
 170   return lex_stage_nth (stage, 0);
 171 }
 172
 173 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 174    lookahead) is 0, the second token is 1, and so on.  There must be at least
 175    INDEX + 1 tokens in STAGE. */
 176 static struct lex_token *
 177 lex_stage_nth (struct lex_stage *stage, size_t index)
 178 {
 179   return stage->tokens[deque_back (&stage->deque, index)];
 180 }
 181
 182 /* Adds TOKEN so that it becomes the last token in STAGE. */
 183 static void
 184 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 185 {
 186   if (deque_is_full (&stage->deque))
 187     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 188                                   sizeof *stage->tokens);
 189   stage->tokens[deque_push_front (&stage->deque)] = token;
 190 }
 191
 192 /* Removes the first token from STAGE and uninitializes it. */
 193 static void
 194 lex_stage_pop_first (struct lex_stage *stage)
 195 {
 196   lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
 197 }
 198
 199 /* Removes the first N tokens from SRC, appending them to DST as the last
 200    tokens. */
 201 static void
 202 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 203 {
 204   for (size_t i = 0; i < n; i++)
 205     {
 206       lex_stage_push_last (dst, lex_stage_first (src));
 207       deque_pop_back (&src->deque);
 208     }
 209 }
 210
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;  /* Where the UTF-8 bytes come from. */
    struct lexer *lexer;        /* The lexer that this source belongs to. */
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'lookahead'.

       - lookahead: Tokens available to the client for parsing. */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_stage lookahead;
  };
 256
/* Creation and destruction of lex_sources (defined elsewhere in this
   file). */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 260
/* Lexer.

   Holds the list of sources that tokens are read from and the set of macros
   defined with lex_define_macro(). */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Created in lex_create(), destroyed in
                                   lex_destroy(). */
  };
 267
/* Forward declarations for internal helpers that are defined elsewhere in
   this file. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static bool lex_source_get_lookahead (struct lex_source *);
/* The format string is argument 4; the arguments arrive as a va_list. */
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 280 \f
 281 /* Initializes READER with the specified CLASS and otherwise some reasonable
 282    defaults.  The caller should fill in the others members as desired. */
 283 void
 284 lex_reader_init (struct lex_reader *reader,
 285                  const struct lex_reader_class *class)
 286 {
 287   reader->class = class;
 288   reader->syntax = SEG_MODE_AUTO;
 289   reader->error = LEX_ERROR_CONTINUE;
 290   reader->file_name = NULL;
 291   reader->encoding = NULL;
 292   reader->line_number = 0;
 293   reader->eof = false;
 294 }
 295
 296 /* Frees any file name already in READER and replaces it by a copy of
 297    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 298 void
 299 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 300 {
 301   free (reader->file_name);
 302   reader->file_name = xstrdup_if_nonnull (file_name);
 303 }
 304 \f
 305 /* Creates and returns a new lexer. */
 306 struct lexer *
 307 lex_create (void)
 308 {
 309   struct lexer *lexer = xmalloc (sizeof *lexer);
 310   *lexer = (struct lexer) {
 311     .sources = LL_INITIALIZER (lexer->sources),
 312     .macros = macro_set_create (),
 313   };
 314   return lexer;
 315 }
 316
 317 /* Destroys LEXER. */
 318 void
 319 lex_destroy (struct lexer *lexer)
 320 {
 321   if (lexer != NULL)
 322     {
 323       struct lex_source *source, *next;
 324
 325       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 326         lex_source_destroy (source);
 327       macro_set_destroy (lexer->macros);
 328       free (lexer);
 329     }
 330 }
 331
 332 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 333    same name.  Takes ownership of M. */
 334 void
 335 lex_define_macro (struct lexer *lexer, struct macro *m)
 336 {
 337   macro_set_add (lexer->macros, m);
 338 }
 339
 340 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 341    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 342    token. */
 343 void
 344 lex_include (struct lexer *lexer, struct lex_reader *reader)
 345 {
 346   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 347   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 348 }
 349
 350 /* Appends READER to LEXER, so that it will be read after all other current
 351    readers have already been read. */
 352 void
 353 lex_append (struct lexer *lexer, struct lex_reader *reader)
 354 {
 355   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 356 }
 357 \f
 358 /* Advancing. */
 359
 360 /* Advances LEXER to the next token, consuming the current token. */
 361 void
 362 lex_get (struct lexer *lexer)
 363 {
 364   struct lex_source *src;
 365
 366   src = lex_source__ (lexer);
 367   if (src == NULL)
 368     return;
 369
 370   if (!lex_stage_is_empty (&src->lookahead))
 371     lex_stage_pop_first (&src->lookahead);
 372
 373   while (lex_stage_is_empty (&src->lookahead))
 374     if (!lex_source_get_lookahead (src))
 375       {
 376         lex_source_destroy (src);
 377         src = lex_source__ (lexer);
 378         if (src == NULL)
 379           return;
 380       }
 381 }
 382 \f
 383 /* Issuing errors. */
 384
 385 /* Prints a syntax error message containing the current token and
 386    given message MESSAGE (if non-null). */
 387 void
 388 lex_error (struct lexer *lexer, const char *format, ...)
 389 {
 390   va_list args;
 391
 392   va_start (args, format);
 393   lex_next_error_valist (lexer, 0, 0, format, args);
 394   va_end (args);
 395 }
 396
 397 /* Prints a syntax error message containing the current token and
 398    given message MESSAGE (if non-null). */
 399 void
 400 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 401 {
 402   lex_next_error_valist (lexer, 0, 0, format, args);
 403 }
 404
 405 /* Prints a syntax error message containing the current token and
 406    given message MESSAGE (if non-null). */
 407 void
 408 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 409 {
 410   va_list args;
 411
 412   va_start (args, format);
 413   lex_next_error_valist (lexer, n0, n1, format, args);
 414   va_end (args);
 415 }
 416
 417 /* Prints a syntax error message saying that one of the strings provided as
 418    varargs, up to the first NULL, is expected. */
 419 void
 420 (lex_error_expecting) (struct lexer *lexer, ...)
 421 {
 422   va_list args;
 423
 424   va_start (args, lexer);
 425   lex_error_expecting_valist (lexer, args);
 426   va_end (args);
 427 }
 428
 429 /* Prints a syntax error message saying that one of the options provided in
 430    ARGS, up to the first NULL, is expected. */
 431 void
 432 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 433 {
 434   enum { MAX_OPTIONS = 9 };
 435   const char *options[MAX_OPTIONS];
 436   int n = 0;
 437   while (n < MAX_OPTIONS)
 438     {
 439       const char *option = va_arg (args, const char *);
 440       if (!option)
 441         break;
 442
 443       options[n++] = option;
 444     }
 445   lex_error_expecting_array (lexer, options, n);
 446 }
 447
 448 void
 449 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 450 {
 451   switch (n)
 452     {
 453     case 0:
 454       lex_error (lexer, NULL);
 455       break;
 456
 457     case 1:
 458       lex_error (lexer, _("expecting %s"), options[0]);
 459       break;
 460
 461     case 2:
 462       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 463       break;
 464
 465     case 3:
 466       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 467                  options[2]);
 468       break;
 469
 470     case 4:
 471       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 472                  options[0], options[1], options[2], options[3]);
 473       break;
 474
 475     case 5:
 476       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 477                  options[0], options[1], options[2], options[3], options[4]);
 478       break;
 479
 480     case 6:
 481       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 482                  options[0], options[1], options[2], options[3], options[4],
 483                  options[5]);
 484       break;
 485
 486     case 7:
 487       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 488                  options[0], options[1], options[2], options[3], options[4],
 489                  options[5], options[6]);
 490       break;
 491
 492     case 8:
 493       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 494                  options[0], options[1], options[2], options[3], options[4],
 495                  options[5], options[6], options[7]);
 496       break;
 497
 498     default:
 499       lex_error (lexer, NULL);
 500     }
 501 }
 502
 503 /* Reports an error to the effect that subcommand SBC may only be specified
 504    once.
 505
 506    This function does not take a lexer as an argument or use lex_error(),
 507    because the result would ordinarily just be redundant: "Syntax error at
 508    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 509    not help the user find the error. */
 510 void
 511 lex_sbc_only_once (const char *sbc)
 512 {
 513   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 514 }
 515
 516 /* Reports an error to the effect that subcommand SBC is missing.
 517
 518    This function does not take a lexer as an argument or use lex_error(),
 519    because a missing subcommand can normally be detected only after the whole
 520    command has been parsed, and so lex_error() would always report "Syntax
 521    error at end of command", which does not help the user find the error. */
 522 void
 523 lex_sbc_missing (const char *sbc)
 524 {
 525   msg (SE, _("Required subcommand %s was not specified."), sbc);
 526 }
 527
 528 /* Reports an error to the effect that specification SPEC may only be specified
 529    once within subcommand SBC. */
 530 void
 531 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 532 {
 533   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 534              spec, sbc);
 535 }
 536
 537 /* Reports an error to the effect that specification SPEC is missing within
 538    subcommand SBC. */
 539 void
 540 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 541 {
 542   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 543              sbc, spec);
 544 }
 545
 546 /* Prints a syntax error message containing the current token and
 547    given message MESSAGE (if non-null). */
 548 void
 549 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 550                        const char *format, va_list args)
 551 {
 552   struct lex_source *src = lex_source__ (lexer);
 553
 554   if (src != NULL)
 555     lex_source_error_valist (src, n0, n1, format, args);
 556   else
 557     {
 558       struct string s;
 559
 560       ds_init_empty (&s);
 561       ds_put_format (&s, _("Syntax error at end of input"));
 562       if (format != NULL)
 563         {
 564           ds_put_cstr (&s, ": ");
 565           ds_put_vformat (&s, format, args);
 566         }
 567       ds_put_byte (&s, '.');
 568       msg (SE, "%s", ds_cstr (&s));
 569       ds_destroy (&s);
 570     }
 571 }
 572
 573 /* Checks that we're at end of command.
 574    If so, returns a successful command completion code.
 575    If not, flags a syntax error and returns an error command
 576    completion code. */
 577 int
 578 lex_end_of_command (struct lexer *lexer)
 579 {
 580   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 581     {
 582       lex_error (lexer, _("expecting end of command"));
 583       return CMD_FAILURE;
 584     }
 585   else
 586     return CMD_SUCCESS;
 587 }
 588 \f
 589 /* Token testing functions. */
 590
 591 /* Returns true if the current token is a number. */
 592 bool
 593 lex_is_number (const struct lexer *lexer)
 594 {
 595   return lex_next_is_number (lexer, 0);
 596 }
 597
 598 /* Returns true if the current token is a string. */
 599 bool
 600 lex_is_string (const struct lexer *lexer)
 601 {
 602   return lex_next_is_string (lexer, 0);
 603 }
 604
 605 /* Returns the value of the current token, which must be a
 606    floating point number. */
 607 double
 608 lex_number (const struct lexer *lexer)
 609 {
 610   return lex_next_number (lexer, 0);
 611 }
 612
 613 /* Returns true iff the current token is an integer. */
 614 bool
 615 lex_is_integer (const struct lexer *lexer)
 616 {
 617   return lex_next_is_integer (lexer, 0);
 618 }
 619
 620 /* Returns the value of the current token, which must be an
 621    integer. */
 622 long
 623 lex_integer (const struct lexer *lexer)
 624 {
 625   return lex_next_integer (lexer, 0);
 626 }
 627 \f
 628 /* Token testing functions with lookahead.
 629
 630    A value of 0 for N as an argument to any of these functions refers to the
 631    current token.  Lookahead is limited to the current command.  Any N greater
 632    than the number of tokens remaining in the current command will be treated
 633    as referring to a T_ENDCMD token. */
 634
 635 /* Returns true if the token N ahead of the current token is a number. */
 636 bool
 637 lex_next_is_number (const struct lexer *lexer, int n)
 638 {
 639   return token_is_number (lex_next (lexer, n));
 640 }
 641
 642 /* Returns true if the token N ahead of the current token is a string. */
 643 bool
 644 lex_next_is_string (const struct lexer *lexer, int n)
 645 {
 646   return token_is_string (lex_next (lexer, n));
 647 }
 648
 649 /* Returns the value of the token N ahead of the current token, which must be a
 650    floating point number. */
 651 double
 652 lex_next_number (const struct lexer *lexer, int n)
 653 {
 654   return token_number (lex_next (lexer, n));
 655 }
 656
 657 /* Returns true if the token N ahead of the current token is an integer. */
 658 bool
 659 lex_next_is_integer (const struct lexer *lexer, int n)
 660 {
 661   return token_is_integer (lex_next (lexer, n));
 662 }
 663
 664 /* Returns the value of the token N ahead of the current token, which must be
 665    an integer. */
 666 long
 667 lex_next_integer (const struct lexer *lexer, int n)
 668 {
 669   return token_integer (lex_next (lexer, n));
 670 }
 671 \f
 672 /* Token matching functions. */
 673
 674 /* If the current token has the specified TYPE, skips it and returns true.
 675    Otherwise, returns false. */
 676 bool
 677 lex_match (struct lexer *lexer, enum token_type type)
 678 {
 679   if (lex_token (lexer) == type)
 680     {
 681       lex_get (lexer);
 682       return true;
 683     }
 684   else
 685     return false;
 686 }
 687
 688 /* If the current token matches IDENTIFIER, skips it and returns true.
 689    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 690    returns false.
 691
 692    IDENTIFIER must be an ASCII string. */
 693 bool
 694 lex_match_id (struct lexer *lexer, const char *identifier)
 695 {
 696   return lex_match_id_n (lexer, identifier, 3);
 697 }
 698
 699 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 700    may be abbreviated to its first N letters.  Otherwise, returns false.
 701
 702    IDENTIFIER must be an ASCII string. */
 703 bool
 704 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 705 {
 706   if (lex_token (lexer) == T_ID
 707       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 708     {
 709       lex_get (lexer);
 710       return true;
 711     }
 712   else
 713     return false;
 714 }
 715
 716 /* If the current token is integer X, skips it and returns true.  Otherwise,
 717    returns false. */
 718 bool
 719 lex_match_int (struct lexer *lexer, int x)
 720 {
 721   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 722     {
 723       lex_get (lexer);
 724       return true;
 725     }
 726   else
 727     return false;
 728 }
 729 \f
 730 /* Forced matches. */
 731
 732 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 733    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 734    false.
 735
 736    IDENTIFIER must be an ASCII string. */
 737 bool
 738 lex_force_match_id (struct lexer *lexer, const char *identifier)
 739 {
 740   if (lex_match_id (lexer, identifier))
 741     return true;
 742   else
 743     {
 744       lex_error_expecting (lexer, identifier);
 745       return false;
 746     }
 747 }
 748
 749 /* If the current token has the specified TYPE, skips it and returns true.
 750    Otherwise, reports an error and returns false. */
 751 bool
 752 lex_force_match (struct lexer *lexer, enum token_type type)
 753 {
 754   if (lex_token (lexer) == type)
 755     {
 756       lex_get (lexer);
 757       return true;
 758     }
 759   else
 760     {
 761       const char *type_string = token_type_to_string (type);
 762       if (type_string)
 763         {
 764           char *s = xasprintf ("`%s'", type_string);
 765           lex_error_expecting (lexer, s);
 766           free (s);
 767         }
 768       else
 769         lex_error_expecting (lexer, token_type_to_name (type));
 770
 771       return false;
 772     }
 773 }
 774
 775 /* If the current token is a string, does nothing and returns true.
 776    Otherwise, reports an error and returns false. */
 777 bool
 778 lex_force_string (struct lexer *lexer)
 779 {
 780   if (lex_is_string (lexer))
 781     return true;
 782   else
 783     {
 784       lex_error (lexer, _("expecting string"));
 785       return false;
 786     }
 787 }
 788
 789 /* If the current token is a string or an identifier, does nothing and returns
 790    true.  Otherwise, reports an error and returns false.
 791
 792    This is meant for use in syntactic situations where we want to encourage the
 793    user to supply a quoted string, but for compatibility we also accept
 794    identifiers.  (One example of such a situation is file names.)  Therefore,
 795    the error message issued when the current token is wrong only says that a
 796    string is expected and doesn't mention that an identifier would also be
 797    accepted. */
 798 bool
 799 lex_force_string_or_id (struct lexer *lexer)
 800 {
 801   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 802 }
 803
 804 /* If the current token is an integer, does nothing and returns true.
 805    Otherwise, reports an error and returns false. */
 806 bool
 807 lex_force_int (struct lexer *lexer)
 808 {
 809   if (lex_is_integer (lexer))
 810     return true;
 811   else
 812     {
 813       lex_error (lexer, _("expecting integer"));
 814       return false;
 815     }
 816 }
 817
 818 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 819    nothing and returns true.  Otherwise, reports an error and returns false.
 820    If NAME is nonnull, then it is used in the error message. */
 821 bool
 822 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 823 {
 824   bool is_number = lex_is_number (lexer);
 825   bool is_integer = lex_is_integer (lexer);
 826   bool too_small = (is_integer ? lex_integer (lexer) < min
 827                     : is_number ? lex_number (lexer) < min
 828                     : false);
 829   bool too_big = (is_integer ? lex_integer (lexer) > max
 830                   : is_number ? lex_number (lexer) > max
 831                   : false);
 832   if (is_integer && !too_small && !too_big)
 833     return true;
 834
 835   if (min > max)
 836     {
 837       /* Weird, maybe a bug in the caller.  Just report that we needed an
 838          integer. */
 839       if (name)
 840         lex_error (lexer, _("Integer expected for %s."), name);
 841       else
 842         lex_error (lexer, _("Integer expected."));
 843     }
 844   else if (min == max)
 845     {
 846       if (name)
 847         lex_error (lexer, _("Expected %ld for %s."), min, name);
 848       else
 849         lex_error (lexer, _("Expected %ld."), min);
 850     }
 851   else if (min + 1 == max)
 852     {
 853       if (name)
 854         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 855       else
 856         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 857     }
 858   else
 859     {
 860       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 861       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 862
 863       if (report_lower_bound && report_upper_bound)
 864         {
 865           if (name)
 866             lex_error (lexer,
 867                        _("Expected integer between %ld and %ld for %s."),
 868                        min, max, name);
 869           else
 870             lex_error (lexer, _("Expected integer between %ld and %ld."),
 871                        min, max);
 872         }
 873       else if (report_lower_bound)
 874         {
 875           if (min == 0)
 876             {
 877               if (name)
 878                 lex_error (lexer, _("Expected non-negative integer for %s."),
 879                            name);
 880               else
 881                 lex_error (lexer, _("Expected non-negative integer."));
 882             }
 883           else if (min == 1)
 884             {
 885               if (name)
 886                 lex_error (lexer, _("Expected positive integer for %s."),
 887                            name);
 888               else
 889                 lex_error (lexer, _("Expected positive integer."));
 890             }
 891           else
 892             {
 893               if (name)
 894                 lex_error (lexer, _("Expected integer %ld or greater for %s."),
 895                            min, name);
 896               else
 897                 lex_error (lexer, _("Expected integer %ld or greater."), min);
 898             }
 899         }
 900       else if (report_upper_bound)
 901         {
 902           if (name)
 903             lex_error (lexer,
 904                        _("Expected integer less than or equal to %ld for %s."),
 905                        max, name);
 906           else
 907             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 908                        max);
 909         }
 910       else
 911         {
 912           if (name)
 913             lex_error (lexer, _("Integer expected for %s."), name);
 914           else
 915             lex_error (lexer, _("Integer expected."));
 916         }
 917     }
 918   return false;
 919 }
 920
 921 /* If the current token is a number, does nothing and returns true.
 922    Otherwise, reports an error and returns false. */
 923 bool
 924 lex_force_num (struct lexer *lexer)
 925 {
 926   if (lex_is_number (lexer))
 927     return true;
 928
 929   lex_error (lexer, _("expecting number"));
 930   return false;
 931 }
 932
 933 /* If the current token is an identifier, does nothing and returns true.
 934    Otherwise, reports an error and returns false. */
 935 bool
 936 lex_force_id (struct lexer *lexer)
 937 {
 938   if (lex_token (lexer) == T_ID)
 939     return true;
 940
 941   lex_error (lexer, _("expecting identifier"));
 942   return false;
 943 }
 944 \f
 945 /* Token accessors. */
 946
 947 /* Returns the type of LEXER's current token. */
 948 enum token_type
 949 lex_token (const struct lexer *lexer)
 950 {
 951   return lex_next_token (lexer, 0);
 952 }
 953
 954 /* Returns the number in LEXER's current token.
 955
 956    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 957    tokens this function will always return zero. */
 958 double
 959 lex_tokval (const struct lexer *lexer)
 960 {
 961   return lex_next_tokval (lexer, 0);
 962 }
 963
 964 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 965
 966    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 967    this functions this function will always return NULL.
 968
 969    The UTF-8 encoding of the returned string is correct for variable names and
 970    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 971    data_in() to use it in a "union value".  */
 972 const char *
 973 lex_tokcstr (const struct lexer *lexer)
 974 {
 975   return lex_next_tokcstr (lexer, 0);
 976 }
 977
 978 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 979    null-terminated (but the null terminator is not included in the returned
 980    substring's 'length').
 981
 982    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 983    this functions this function will always return NULL.
 984
 985    The UTF-8 encoding of the returned string is correct for variable names and
 986    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 987    data_in() to use it in a "union value".  */
 988 struct substring
 989 lex_tokss (const struct lexer *lexer)
 990 {
 991   return lex_next_tokss (lexer, 0);
 992 }
 993 \f
 994 /* Looking ahead.
 995
 996    A value of 0 for N as an argument to any of these functions refers to the
 997    current token.  Lookahead is limited to the current command.  Any N greater
 998    than the number of tokens remaining in the current command will be treated
 999    as referring to a T_ENDCMD token. */
1000
1001 static const struct lex_token *
1002 lex_next__ (const struct lexer *lexer_, int n)
1003 {
1004   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1005   struct lex_source *src = lex_source__ (lexer);
1006
1007   if (src != NULL)
1008     return lex_source_next__ (src, n);
1009   else
1010     {
1011       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1012       return &stop_token;
1013     }
1014 }
1015
1016 static const struct lex_token *
1017 lex_source_next__ (const struct lex_source *src_, int n)
1018 {
1019   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1020   while (lex_stage_count (&src->lookahead) <= n)
1021     {
1022       if (!lex_stage_is_empty (&src->lookahead))
1023         {
1024           const struct lex_token *t = lex_stage_last (&src->lookahead);
1025           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1026             return t;
1027         }
1028
1029       lex_source_get_lookahead (src);
1030     }
1031
1032   return lex_stage_nth (&src->lookahead, n);
1033 }
1034
1035 /* Returns the "struct token" of the token N after the current one in LEXER.
1036    The returned pointer can be invalidated by pretty much any succeeding call
1037    into the lexer, although the string pointer within the returned token is
1038    only invalidated by consuming the token (e.g. with lex_get()). */
1039 const struct token *
1040 lex_next (const struct lexer *lexer, int n)
1041 {
1042   return &lex_next__ (lexer, n)->token;
1043 }
1044
1045 /* Returns the type of the token N after the current one in LEXER. */
1046 enum token_type
1047 lex_next_token (const struct lexer *lexer, int n)
1048 {
1049   return lex_next (lexer, n)->type;
1050 }
1051
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *token = lex_next (lexer, n);
  return token_number (token);
}
1061
1062 /* Returns the null-terminated string in the token N after the current one, in
1063    UTF-8 encoding.
1064
1065    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1066    this functions this function will always return NULL.
1067
1068    The UTF-8 encoding of the returned string is correct for variable names and
1069    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1070    data_in() to use it in a "union value".  */
1071 const char *
1072 lex_next_tokcstr (const struct lexer *lexer, int n)
1073 {
1074   return lex_next_tokss (lexer, n).string;
1075 }
1076
1077 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1078    The string is null-terminated (but the null terminator is not included in
1079    the returned substring's 'length').
1080
1081    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1082    tokens this functions this function will always return NULL.
1083
1084    The UTF-8 encoding of the returned string is correct for variable names and
1085    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1086    data_in() to use it in a "union value".  */
1087 struct substring
1088 lex_next_tokss (const struct lexer *lexer, int n)
1089 {
1090   return lex_next (lexer, n)->string;
1091 }
1092
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, returns a newly allocated empty string, so
   that the caller's obligation to free the result is the same either way. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  /* lex_source__() returns NULL when LEXER has no sources, and
     lex_source_get_syntax__() dereferences its source argument, so the
     no-source case must be handled here, as the other accessors do. */
  const struct lex_source *src = lex_source__ (lexer);
  return src != NULL ? lex_source_get_syntax__ (src, n0, n1) : xstrdup ("");
}
1105
1106 /* Returns true if the token N ahead of the current one was produced by macro
1107    expansion, false otherwise. */
1108 bool
1109 lex_next_is_from_macro (const struct lexer *lexer, int n)
1110 {
1111   return lex_next__ (lexer, n)->macro_rep != NULL;
1112 }
1113
1114 static bool
1115 lex_tokens_match (const struct token *actual, const struct token *expected)
1116 {
1117   if (actual->type != expected->type)
1118     return false;
1119
1120   switch (actual->type)
1121     {
1122     case T_POS_NUM:
1123     case T_NEG_NUM:
1124       return actual->number == expected->number;
1125
1126     case T_ID:
1127       return lex_id_match (expected->string, actual->string);
1128
1129     case T_STRING:
1130       return (actual->string.length == expected->string.length
1131               && !memcmp (actual->string.string, expected->string.string,
1132                           actual->string.length));
1133
1134     default:
1135       return true;
1136     }
1137 }
1138
1139 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1140    skips it and returns true.  Otherwise, returns false.
1141
1142    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1143    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
1144    first three letters. */
1145 bool
1146 lex_match_phrase (struct lexer *lexer, const char *s)
1147 {
1148   struct string_lexer slex;
1149   struct token token;
1150   int i;
1151
1152   i = 0;
1153   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1154   while (string_lexer_next (&slex, &token))
1155     {
1156       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1157       token_uninit (&token);
1158       if (!match)
1159         return false;
1160     }
1161
1162   while (i-- > 0)
1163     lex_get (lexer);
1164   return true;
1165 }
1166
/* Returns the number of new-line ('\n') bytes among the LENGTH bytes
   starting at S. */
static int
count_newlines (char *s, size_t length)
{
  char *end = s + length;
  int count = 0;

  /* Hop from one new-line to the next, searching only the remaining bytes. */
  for (char *p = memchr (s, '\n', length); p != NULL;
       p = memchr (p + 1, '\n', end - (p + 1)))
    count++;

  return count;
}
1182
1183 static int
1184 lex_token_get_last_line_number (const struct lex_source *src,
1185                                 const struct lex_token *token)
1186 {
1187   if (token->first_line == 0)
1188     return 0;
1189   else
1190     {
1191       char *token_str = &src->buffer[token->token_pos - src->tail];
1192       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1193     }
1194 }
1195
1196 static int
1197 count_columns (const char *s_, size_t length)
1198 {
1199   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1200   int columns;
1201   size_t ofs;
1202   int mblen;
1203
1204   columns = 0;
1205   for (ofs = 0; ofs < length; ofs += mblen)
1206     {
1207       ucs4_t uc;
1208
1209       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1210       if (uc != '\t')
1211         {
1212           int width = uc_width (uc, "UTF-8");
1213           if (width > 0)
1214             columns += width;
1215         }
1216       else
1217         columns = ROUND_UP (columns + 1, 8);
1218     }
1219
1220   return columns + 1;
1221 }
1222
1223 static int
1224 lex_token_get_first_column (const struct lex_source *src,
1225                             const struct lex_token *token)
1226 {
1227   return count_columns (&src->buffer[token->line_pos - src->tail],
1228                         token->token_pos - token->line_pos);
1229 }
1230
1231 static int
1232 lex_token_get_last_column (const struct lex_source *src,
1233                            const struct lex_token *token)
1234 {
1235   char *start, *end, *newline;
1236
1237   start = &src->buffer[token->line_pos - src->tail];
1238   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1239   newline = memrchr (start, '\n', end - start);
1240   if (newline != NULL)
1241     start = newline + 1;
1242   return count_columns (start, end - start);
1243 }
1244
1245 static struct msg_location
1246 lex_token_location (const struct lex_source *src,
1247                     const struct lex_token *t0,
1248                     const struct lex_token *t1)
1249 {
1250   return (struct msg_location) {
1251     .file_name = src->reader->file_name,
1252     .first_line = t0->first_line,
1253     .last_line = lex_token_get_last_line_number (src, t1),
1254     .first_column = lex_token_get_first_column (src, t0),
1255     .last_column = lex_token_get_last_column (src, t1),
1256   };
1257 }
1258
1259 static struct msg_location *
1260 lex_token_location_rw (const struct lex_source *src,
1261                        const struct lex_token *t0,
1262                        const struct lex_token *t1)
1263 {
1264   struct msg_location location = lex_token_location (src, t0, t1);
1265   return msg_location_dup (&location);
1266 }
1267
/* Returns a duplicated msg_location that spans from the token N0 ahead of
   the current one in SRC through the token N1 ahead of it. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1275
1276 /* Returns the 1-based line number of the start of the syntax that represents
1277    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1278    if the token is drawn from a source that does not have line numbers. */
1279 int
1280 lex_get_first_line_number (const struct lexer *lexer, int n)
1281 {
1282   const struct lex_source *src = lex_source__ (lexer);
1283   return src ? lex_source_next__ (src, n)->first_line : 0;
1284 }
1285
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   current source, or if the token is drawn from a source that does not have
   line numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1303
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1317
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1331
1332 /* Returns the name of the syntax file from which the current command is drawn.
1333    Returns NULL for a T_STOP token or if the command's source does not have
1334    line numbers.
1335
1336    There is no version of this function that takes an N argument because
1337    lookahead only works to the end of a command and any given command is always
1338    within a single syntax file. */
1339 const char *
1340 lex_get_file_name (const struct lexer *lexer)
1341 {
1342   struct lex_source *src = lex_source__ (lexer);
1343   return src == NULL ? NULL : src->reader->file_name;
1344 }
1345
1346 /* Returns a newly allocated msg_location for the syntax that represents tokens
1347    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1348    must eventually free the location (with msg_location_destroy()). */
1349 struct msg_location *
1350 lex_get_location (const struct lexer *lexer, int n0, int n1)
1351 {
1352   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1353   loc->first_column = lex_get_first_column (lexer, n0);
1354   loc->last_column = lex_get_last_column (lexer, n1);
1355   return loc;
1356 }
1357
1358 /* Returns a newly allocated msg_location for the syntax that represents tokens
1359    with 0-based offsets N0...N1, inclusive, from the current token.  The
1360    location only covers the tokens' lines, not the columns.  The caller must
1361    eventually free the location (with msg_location_destroy()). */
1362 struct msg_location *
1363 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1364 {
1365   struct msg_location *loc = xmalloc (sizeof *loc);
1366   *loc = (struct msg_location) {
1367     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1368     .first_line = lex_get_first_line_number (lexer, n0),
1369     .last_line = lex_get_last_line_number (lexer, n1),
1370   };
1371   return loc;
1372 }
1373
1374 const char *
1375 lex_get_encoding (const struct lexer *lexer)
1376 {
1377   struct lex_source *src = lex_source__ (lexer);
1378   return src == NULL ? NULL : src->reader->encoding;
1379 }
1380
1381 /* Returns the syntax mode for the syntax file from which the current drawn is
1382    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1383    does not have line numbers.
1384
1385    There is no version of this function that takes an N argument because
1386    lookahead only works to the end of a command and any given command is always
1387    within a single syntax file. */
1388 enum segmenter_mode
1389 lex_get_syntax_mode (const struct lexer *lexer)
1390 {
1391   struct lex_source *src = lex_source__ (lexer);
1392   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1393 }
1394
1395 /* Returns the error mode for the syntax file from which the current drawn is
1396    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1397    source does not have line numbers.
1398
1399    There is no version of this function that takes an N argument because
1400    lookahead only works to the end of a command and any given command is always
1401    within a single syntax file. */
1402 enum lex_error_mode
1403 lex_get_error_mode (const struct lexer *lexer)
1404 {
1405   struct lex_source *src = lex_source__ (lexer);
1406   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1407 }
1408
1409 /* If the source that LEXER is currently reading has error mode
1410    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1411    token to be read comes directly from whatever is next read from the stream.
1412
1413    It makes sense to call this function after encountering an error in a
1414    command entered on the console, because usually the user would prefer not to
1415    have cascading errors. */
1416 void
1417 lex_interactive_reset (struct lexer *lexer)
1418 {
1419   struct lex_source *src = lex_source__ (lexer);
1420   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1421     {
1422       src->head = src->tail = 0;
1423       src->journal_pos = src->seg_pos = src->line_pos = 0;
1424       src->n_newlines = 0;
1425       src->suppress_next_newline = false;
1426       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1427                                        false);
1428       lex_stage_clear (&src->pp);
1429       lex_stage_clear (&src->merge);
1430       lex_stage_clear (&src->lookahead);
1431       lex_source_push_endcmd__ (src);
1432     }
1433 }
1434
1435 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1436 void
1437 lex_discard_rest_of_command (struct lexer *lexer)
1438 {
1439   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1440     lex_get (lexer);
1441 }
1442
1443 /* Discards all lookahead tokens in LEXER, then discards all input sources
1444    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1445    runs out of input sources. */
1446 void
1447 lex_discard_noninteractive (struct lexer *lexer)
1448 {
1449   struct lex_source *src = lex_source__ (lexer);
1450
1451   if (src != NULL)
1452     {
1453       lex_stage_clear (&src->pp);
1454       lex_stage_clear (&src->merge);
1455       lex_stage_clear (&src->lookahead);
1456
1457       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1458            src = lex_source__ (lexer))
1459         lex_source_destroy (src);
1460     }
1461 }
1462 \f
1463 static size_t
1464 lex_source_max_tail__ (const struct lex_source *src_)
1465 {
1466   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1467
1468   assert (src->seg_pos >= src->line_pos);
1469   size_t max_tail = MIN (src->journal_pos, src->line_pos);
1470
1471   /* Use the oldest token also. */
1472   struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp };
1473   for (size_t i = 0; i < sizeof stages / sizeof *stages; i++)
1474     if (!lex_stage_is_empty (stages[i]))
1475       {
1476         struct lex_token *first = lex_stage_first (stages[i]);
1477         assert (first->token_pos >= first->line_pos);
1478         return MIN (max_tail, first->line_pos);
1479       }
1480
1481   return max_tail;
1482 }
1483
1484 static void
1485 lex_source_expand__ (struct lex_source *src)
1486 {
1487   if (src->head - src->tail >= src->allocated)
1488     {
1489       size_t max_tail = lex_source_max_tail__ (src);
1490       if (max_tail > src->tail)
1491         {
1492           /* Advance the tail, freeing up room at the head. */
1493           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1494                    src->head - max_tail);
1495           src->tail = max_tail;
1496         }
1497       else
1498         {
1499           /* Buffer is completely full.  Expand it. */
1500           src->buffer = x2realloc (src->buffer, &src->allocated);
1501         }
1502     }
1503   else
1504     {
1505       /* There's space available at the head of the buffer.  Nothing to do. */
1506     }
1507 }
1508
1509 static void
1510 lex_source_read__ (struct lex_source *src)
1511 {
1512   do
1513     {
1514       lex_source_expand__ (src);
1515
1516       size_t head_ofs = src->head - src->tail;
1517       size_t space = src->allocated - head_ofs;
1518       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1519       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1520                                            space, prompt);
1521       assert (n <= space);
1522
1523       if (n == 0)
1524         {
1525           /* End of input. */
1526           src->reader->eof = true;
1527           lex_source_expand__ (src);
1528           return;
1529         }
1530
1531       src->head += n;
1532     }
1533   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1534                   src->head - src->seg_pos));
1535 }
1536
1537 static struct lex_source *
1538 lex_source__ (const struct lexer *lexer)
1539 {
1540   return (ll_is_empty (&lexer->sources) ? NULL
1541           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1542 }
1543
1544 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1545    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1546    and N1 are both zero, this requests the syntax for the current token.)  The
1547    caller must eventually free the returned string (with free()).  The syntax
1548    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1549    for example, it may include comments, spaces, and new-lines if it spans
1550    multiple tokens.  Macro expansion, however, has already been performed. */
1551 static char *
1552 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1553 {
1554   struct string s = DS_EMPTY_INITIALIZER;
1555   for (size_t i = n0; i <= n1; )
1556     {
1557       /* Find [I,J) as the longest sequence of tokens not produced by macro
1558          expansion, or otherwise the longest sequence expanded from a single
1559          macro call. */
1560       const struct lex_token *first = lex_source_next__ (src, i);
1561       size_t j;
1562       for (j = i + 1; j <= n1; j++)
1563         {
1564           const struct lex_token *cur = lex_source_next__ (src, j);
1565           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1566               || first->macro_rep != cur->macro_rep)
1567             break;
1568         }
1569       const struct lex_token *last = lex_source_next__ (src, j - 1);
1570
1571       /* Now add the syntax for this sequence of tokens to SRC. */
1572       if (!ds_is_empty (&s))
1573         ds_put_byte (&s, ' ');
1574       if (!first->macro_rep)
1575         {
1576           size_t start = first->token_pos;
1577           size_t end = last->token_pos + last->token_len;
1578           ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1579                                            end - start));
1580         }
1581       else
1582         {
1583           size_t start = first->ofs;
1584           size_t end = last->ofs + last->len;
1585           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1586                                            end - start));
1587         }
1588
1589       i = j;
1590     }
1591   return ds_steal_cstr (&s);
1592 }
1593
1594 static bool
1595 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1596 {
1597   for (size_t i = n0; i <= n1; i++)
1598     if (lex_source_next__ (src, i)->macro_rep)
1599       return true;
1600   return false;
1601 }
1602
1603 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1604    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1605    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1606    the original form supplied to the lexer so that, for example, it may include
1607    comments, spaces, and new-lines if it spans multiple tokens.
1608
1609    Returns an empty string if the token range doesn't include a macro call.
1610
1611    The caller must not modify or free the returned string. */
1612 static struct substring
1613 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1614 {
1615   if (!lex_source_contains_macro_call (src, n0, n1))
1616     return ss_empty ();
1617
1618   const struct lex_token *token0 = lex_source_next__ (src, n0);
1619   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1620   size_t start = token0->token_pos;
1621   size_t end = token1->token_pos + token1->token_len;
1622
1623   return ss_buffer (&src->buffer[start - src->tail], end - start);
1624 }
1625
1626 static void
1627 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1628                          const char *format, va_list args)
1629 {
1630   const struct lex_token *token;
1631   struct string s;
1632
1633   ds_init_empty (&s);
1634
1635   token = lex_source_next__ (src, n0);
1636   if (token->token.type == T_ENDCMD)
1637     ds_put_cstr (&s, _("Syntax error at end of command"));
1638   else
1639     {
1640       /* Get the syntax that caused the error. */
1641       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1642       char syntax[64];
1643       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1644       free (raw_syntax);
1645
1646       /* Get the macro call(s) that expanded to the syntax that caused the
1647          error. */
1648       char call[64];
1649       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1650                      call, sizeof call);
1651
1652       if (syntax[0])
1653         {
1654           if (call[0])
1655             ds_put_format (&s,
1656                            _("Syntax error at `%s' (in expansion of `%s')"),
1657                            syntax, call);
1658           else
1659             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1660         }
1661       else
1662         {
1663           if (call[0])
1664             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1665                            call);
1666           else
1667             ds_put_cstr (&s, _("Syntax error"));
1668         }
1669     }
1670
1671   if (format)
1672     {
1673       ds_put_cstr (&s, ": ");
1674       ds_put_vformat (&s, format, args);
1675     }
1676   if (ds_last (&s) != '.')
1677     ds_put_byte (&s, '.');
1678
1679   struct msg *m = xmalloc (sizeof *m);
1680   *m = (struct msg) {
1681     .category = MSG_C_SYNTAX,
1682     .severity = MSG_S_ERROR,
1683     .location = lex_source_get_location (src, n0, n1),
1684     .text = ds_steal_cstr (&s),
1685   };
1686   msg_emit (m);
1687 }
1688
1689 static void
1690 lex_get_error (struct lex_source *src, const struct lex_token *token)
1691 {
1692   char syntax[64];
1693   str_ellipsize (ss_buffer (&src->buffer[token->token_pos - src->tail],
1694                             token->token_len),
1695                  syntax, sizeof syntax);
1696
1697   struct string s = DS_EMPTY_INITIALIZER;
1698   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1699   ds_put_format (&s, ": %s", token->token.string.string);
1700
1701   struct msg *m = xmalloc (sizeof *m);
1702   *m = (struct msg) {
1703     .category = MSG_C_SYNTAX,
1704     .severity = MSG_S_ERROR,
1705     .location = lex_token_location_rw (src, token, token),
1706     .text = ds_steal_cstr (&s),
1707   };
1708   msg_emit (m);
1709 }
1710
1711 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1712    underlying lex_reader if necessary.  Returns true if a new token was added
1713    to SRC's deque, false otherwise.  The caller should retry failures unless
1714    SRC's 'eof' marker was set to true indicating that there will be no more
1715    tokens from this source. */
1716 static bool
1717 lex_source_try_get_pp (struct lex_source *src)
1718 {
1719   /* Append a new token to SRC and initialize it. */
1720   struct lex_token *token = xmalloc (sizeof *token);
1721   token->token = (struct token) { .type = T_STOP };
1722   token->macro_rep = NULL;
1723   token->ref_cnt = NULL;
1724   token->line_pos = src->line_pos;
1725   token->token_pos = src->seg_pos;
1726   if (src->reader->line_number > 0)
1727     token->first_line = src->reader->line_number + src->n_newlines;
1728   else
1729     token->first_line = 0;
1730
1731   /* Extract a segment. */
1732   const char *segment;
1733   enum segment_type seg_type;
1734   int seg_len;
1735   for (;;)
1736     {
1737       segment = &src->buffer[src->seg_pos - src->tail];
1738       seg_len = segmenter_push (&src->segmenter, segment,
1739                                 src->head - src->seg_pos,
1740                                 src->reader->eof, &seg_type);
1741       if (seg_len >= 0)
1742         break;
1743
1744       /* The segmenter needs more input to produce a segment. */
1745       assert (!src->reader->eof);
1746       lex_source_read__ (src);
1747     }
1748
1749   /* Update state based on the segment. */
1750   token->token_len = seg_len;
1751   src->seg_pos += seg_len;
1752   if (seg_type == SEG_NEWLINE)
1753     {
1754       src->line_pos = src->seg_pos;
1755       src->n_newlines++;
1756     }
1757
1758   /* Get a token from the segment. */
1759   enum tokenize_result result = token_from_segment (
1760     seg_type, ss_buffer (segment, seg_len), &token->token);
1761
1762   /* If we've reached the end of a line, or the end of a command, then pass
1763      the line to the output engine as a syntax text item.  */
1764   int n_lines = seg_type == SEG_NEWLINE;
1765   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1766     {
1767       n_lines++;
1768       src->suppress_next_newline = true;
1769     }
1770   else if (n_lines > 0 && src->suppress_next_newline)
1771     {
1772       n_lines--;
1773       src->suppress_next_newline = false;
1774     }
1775   for (int i = 0; i < n_lines; i++)
1776     {
1777       /* Beginning of line. */
1778       const char *line = &src->buffer[src->journal_pos - src->tail];
1779
1780       /* Calculate line length, including \n or \r\n end-of-line if present.
1781
1782          We use src->head even though that may be beyond what we've actually
1783          converted to tokens (which is only through line_pos).  That's because,
1784          if we're emitting the line due to SEG_END_COMMAND, we want to take the
1785          whole line through the newline, not just through the '.'. */
1786       size_t max_len = src->head - src->journal_pos;
1787       const char *newline = memchr (line, '\n', max_len);
1788       size_t line_len = newline ? newline - line + 1 : max_len;
1789
1790       /* Calculate line length excluding end-of-line. */
1791       size_t copy_len = line_len;
1792       if (copy_len > 0 && line[copy_len - 1] == '\n')
1793         copy_len--;
1794       if (copy_len > 0 && line[copy_len - 1] == '\r')
1795         copy_len--;
1796
1797       /* Submit the line as syntax. */
1798       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1799                                                    xmemdup0 (line, copy_len),
1800                                                    NULL));
1801
1802       src->journal_pos += line_len;
1803     }
1804
1805   switch (result)
1806     {
1807     case TOKENIZE_ERROR:
1808       lex_get_error (src, token);
1809       /* Fall through. */
1810     case TOKENIZE_EMPTY:
1811       lex_token_destroy (token);
1812       return false;
1813
1814     case TOKENIZE_TOKEN:
1815       if (token->token.type == T_STOP)
1816         {
1817           token->token.type = T_ENDCMD;
1818           src->eof = true;
1819         }
1820       lex_stage_push_last (&src->pp, token);
1821       return true;
1822     }
1823   NOT_REACHED ();
1824 }
1825
1826 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1827    failure.  On failure, the end of SRC has been reached and no more tokens
1828    will be forthcoming from it.
1829
1830    Does not make the new token available for lookahead yet; the caller must
1831    adjust SRC's 'middle' pointer to do so. */
1832 static bool
1833 lex_source_get_pp (struct lex_source *src)
1834 {
1835   while (!src->eof)
1836     if (lex_source_try_get_pp (src))
1837       return true;
1838   return false;
1839 }
1840
1841 static bool
1842 lex_source_try_get_merge (const struct lex_source *src_)
1843 {
1844   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1845
1846   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1847     return false;
1848
1849   if (!settings_get_mexpand ())
1850     {
1851       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1852       return true;
1853     }
1854
1855   /* Now pass tokens one-by-one to the macro expander.
1856
1857      In the common case where there is no macro to expand, the loop is not
1858      entered.  */
1859   struct macro_call *mc;
1860   int n_call = macro_call_create (src->lexer->macros,
1861                                   &lex_stage_first (&src->pp)->token, &mc);
1862   for (int ofs = 1; !n_call; ofs++)
1863     {
1864       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1865         {
1866           /* This should not be reachable because we always get a T_ENDCMD at
1867              the end of an input file (transformed from T_STOP by
1868              lex_source_try_get_pp()) and the macro_expander should always
1869              terminate expansion on T_ENDCMD. */
1870           NOT_REACHED ();
1871         }
1872
1873       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1874       size_t start = t->token_pos;
1875       size_t end = t->token_pos + t->token_len;
1876       const struct macro_token mt = {
1877         .token = t->token,
1878         .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
1879       };
1880       const struct msg_location loc = lex_token_location (src, t, t);
1881       n_call = macro_call_add (mc, &mt, &loc);
1882     }
1883   if (n_call < 0)
1884     {
1885       /* False alarm: no macro expansion after all.  Use first token as
1886          lookahead.  We'll retry macro expansion from the second token next
1887          time around. */
1888       macro_call_destroy (mc);
1889       lex_stage_shift (&src->merge, &src->pp, 1);
1890       return true;
1891     }
1892
1893   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1894      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1895      Expand them.  */
1896   const struct lex_token *c0 = lex_stage_first (&src->pp);
1897   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1898   struct macro_tokens expansion = { .n = 0 };
1899   struct msg_location loc = lex_token_location (src, c0, c1);
1900   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1901   macro_call_destroy (mc);
1902
1903   /* Convert the macro expansion into syntax for possible error messages
1904      later. */
1905   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1906   size_t *len = xnmalloc (expansion.n, sizeof *len);
1907   struct string s = DS_EMPTY_INITIALIZER;
1908   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1909
1910   if (settings_get_mprint ())
1911     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1912                                           _("Macro Expansion")));
1913
1914   /* Append the macro expansion tokens to the lookahead. */
1915   if (expansion.n > 0)
1916     {
1917       char *macro_rep = ds_steal_cstr (&s);
1918       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1919       *ref_cnt = expansion.n;
1920       for (size_t i = 0; i < expansion.n; i++)
1921         {
1922           struct lex_token *token = xmalloc (sizeof *token);
1923           *token = (struct lex_token) {
1924             .token = expansion.mts[i].token,
1925             .token_pos = c0->token_pos,
1926             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1927             .line_pos = c0->line_pos,
1928             .first_line = c0->first_line,
1929             .macro_rep = macro_rep,
1930             .ofs = ofs[i],
1931             .len = len[i],
1932             .ref_cnt = ref_cnt,
1933           };
1934           lex_stage_push_last (&src->merge, token);
1935
1936           ss_dealloc (&expansion.mts[i].syntax);
1937         }
1938     }
1939   else
1940     ds_destroy (&s);
1941   free (expansion.mts);
1942   free (ofs);
1943   free (len);
1944
1945   /* Destroy the tokens for the call. */
1946   for (size_t i = 0; i < n_call; i++)
1947     lex_stage_pop_first (&src->pp);
1948
1949   return expansion.n > 0;
1950 }
1951
1952 /* Attempts to obtain at least one new token into 'merge' in SRC.
1953
1954    Returns true if successful, false on failure.  In the latter case, SRC is
1955    exhausted and 'src->eof' is now true. */
1956 static bool
1957 lex_source_get_merge (struct lex_source *src)
1958 {
1959   while (!src->eof)
1960     if (lex_source_try_get_merge (src))
1961       return true;
1962   return false;
1963 }
1964
1965 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1966
1967    Returns true if successful, false on failure.  In the latter case, SRC is
1968    exhausted and 'src->eof' is now true. */
1969 static bool
1970 lex_source_get_lookahead (struct lex_source *src)
1971 {
1972   struct merger m = MERGER_INIT;
1973   struct token out;
1974   for (size_t i = 0; ; i++)
1975     {
1976       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1977         {
1978           /* We always get a T_ENDCMD at the end of an input file
1979              (transformed from T_STOP by lex_source_try_get_pp()) and
1980              merger_add() should never return -1 on T_ENDCMD. */
1981           assert (lex_stage_is_empty (&src->merge));
1982           return false;
1983         }
1984
1985       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1986                                &out);
1987       if (!retval)
1988         {
1989           lex_stage_shift (&src->lookahead, &src->merge, 1);
1990           return true;
1991         }
1992       else if (retval > 0)
1993         {
1994           /* Add a token that merges all the tokens together. */
1995           const struct lex_token *first = lex_stage_first (&src->merge);
1996           const struct lex_token *last = lex_stage_nth (&src->merge,
1997                                                         retval - 1);
1998           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1999           struct lex_token *t = xmalloc (sizeof *t);
2000           *t = (struct lex_token) {
2001             .token = out,
2002             .token_pos = first->token_pos,
2003             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2004             .line_pos = first->line_pos,
2005             .first_line = first->first_line,
2006
2007             /* This works well if all the tokens were not expanded from macros,
2008                or if they came from the same macro expansion.  It just gives up
2009                in the other (corner) cases. */
2010             .macro_rep = macro ? first->macro_rep : NULL,
2011             .ofs = macro ? first->ofs : 0,
2012             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2013             .ref_cnt = macro ? first->ref_cnt : NULL,
2014           };
2015           if (t->ref_cnt)
2016             ++*t->ref_cnt;
2017           lex_stage_push_last (&src->lookahead, t);
2018
2019           for (int i = 0; i < retval; i++)
2020             lex_stage_pop_first (&src->merge);
2021           return true;
2022         }
2023     }
2024 }
2025 \f
2026 static void
2027 lex_source_push_endcmd__ (struct lex_source *src)
2028 {
2029   assert (lex_stage_is_empty (&src->lookahead));
2030   struct lex_token *token = xmalloc (sizeof *token);
2031   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2032   lex_stage_push_last (&src->lookahead, token);
2033 }
2034
2035 static struct lex_source *
2036 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2037 {
2038   struct lex_source *src = xmalloc (sizeof *src);
2039   *src = (struct lex_source) {
2040     .reader = reader,
2041     .segmenter = segmenter_init (reader->syntax, false),
2042     .lexer = lexer,
2043   };
2044
2045   lex_source_push_endcmd__ (src);
2046
2047   return src;
2048 }
2049
2050 static void
2051 lex_source_destroy (struct lex_source *src)
2052 {
2053   char *file_name = src->reader->file_name;
2054   char *encoding = src->reader->encoding;
2055   if (src->reader->class->destroy != NULL)
2056     src->reader->class->destroy (src->reader);
2057   free (file_name);
2058   free (encoding);
2059   free (src->buffer);
2060   lex_stage_uninit (&src->pp);
2061   lex_stage_uninit (&src->merge);
2062   lex_stage_uninit (&src->lookahead);
2063   ll_remove (&src->ll);
2064   free (src);
2065 }
2066 \f
/* A lex_reader that obtains syntax from a file, or from stdin, by way of a
   u8_istream. */
struct lex_file_reader
  {
    struct lex_reader reader;       /* Embedded generic reader; see
                                       lex_file_reader_cast(). */
    struct u8_istream *istream;     /* Stream that the syntax is read from. */
  };

/* Forward declaration; the definition follows the functions that it names. */
static struct lex_reader_class lex_file_reader_class;
2074
2075 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2076    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2077    ENCODING, which should take one of the forms accepted by
2078    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2079    mode of the new reader, respectively.
2080
2081    Returns a null pointer if FILE_NAME cannot be opened. */
2082 struct lex_reader *
2083 lex_reader_for_file (const char *file_name, const char *encoding,
2084                      enum segmenter_mode syntax,
2085                      enum lex_error_mode error)
2086 {
2087   struct lex_file_reader *r;
2088   struct u8_istream *istream;
2089
2090   istream = (!strcmp(file_name, "-")
2091              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2092              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2093   if (istream == NULL)
2094     {
2095       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2096       return NULL;
2097     }
2098
2099   r = xmalloc (sizeof *r);
2100   lex_reader_init (&r->reader, &lex_file_reader_class);
2101   r->reader.syntax = syntax;
2102   r->reader.error = error;
2103   r->reader.file_name = xstrdup (file_name);
2104   r->reader.encoding = xstrdup_if_nonnull (encoding);
2105   r->reader.line_number = 1;
2106   r->istream = istream;
2107
2108   return &r->reader;
2109 }
2110
2111 static struct lex_file_reader *
2112 lex_file_reader_cast (struct lex_reader *r)
2113 {
2114   return UP_CAST (r, struct lex_file_reader, reader);
2115 }
2116
2117 static size_t
2118 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2119                enum prompt_style prompt_style UNUSED)
2120 {
2121   struct lex_file_reader *r = lex_file_reader_cast (r_);
2122   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2123   if (n_read < 0)
2124     {
2125       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2126       return 0;
2127     }
2128   return n_read;
2129 }
2130
2131 static void
2132 lex_file_close (struct lex_reader *r_)
2133 {
2134   struct lex_file_reader *r = lex_file_reader_cast (r_);
2135
2136   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2137     {
2138       if (u8_istream_close (r->istream) != 0)
2139         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2140     }
2141   else
2142     u8_istream_free (r->istream);
2143
2144   free (r);
2145 }
2146
/* Reader class for struct lex_file_reader, installed in new readers by
   lex_reader_for_file().

   The initializers are positional, so they must stay in the order in which
   struct lex_reader_class (declared elsewhere) lists its members. */
static struct lex_reader_class lex_file_reader_class =
  {
    lex_file_read,
    lex_file_close
  };
2152 \f
/* A lex_reader that serves syntax out of an in-memory string. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Embedded generic reader; see
                                   lex_string_reader_cast(). */
    struct substring s;         /* The syntax; freed by lex_string_close(). */
    size_t offset;              /* Number of bytes of S handed out so far. */
  };

/* Forward declaration; the definition follows the functions that it names. */
static struct lex_reader_class lex_string_reader_class;
2161
2162 /* Creates and returns a new lex_reader for the contents of S, which must be
2163    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2164    with ss_dealloc() when it is closed. */
2165 struct lex_reader *
2166 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2167 {
2168   struct lex_string_reader *r;
2169
2170   r = xmalloc (sizeof *r);
2171   lex_reader_init (&r->reader, &lex_string_reader_class);
2172   r->reader.syntax = SEG_MODE_AUTO;
2173   r->reader.encoding = xstrdup_if_nonnull (encoding);
2174   r->s = s;
2175   r->offset = 0;
2176
2177   return &r->reader;
2178 }
2179
2180 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2181    which must be encoded in ENCODING.  The caller retains ownership of S. */
2182 struct lex_reader *
2183 lex_reader_for_string (const char *s, const char *encoding)
2184 {
2185   struct substring ss;
2186   ss_alloc_substring (&ss, ss_cstr (s));
2187   return lex_reader_for_substring_nocopy (ss, encoding);
2188 }
2189
/* Expands FORMAT, a printf()-like format string, with the arguments that
   follow ENCODING, and creates and returns a new lex_reader that reads the
   formatted text.  ENCODING is passed on to the new reader.  The reader
   owns the formatted text. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2204
2205 static struct lex_string_reader *
2206 lex_string_reader_cast (struct lex_reader *r)
2207 {
2208   return UP_CAST (r, struct lex_string_reader, reader);
2209 }
2210
2211 static size_t
2212 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2213                  enum prompt_style prompt_style UNUSED)
2214 {
2215   struct lex_string_reader *r = lex_string_reader_cast (r_);
2216   size_t chunk;
2217
2218   chunk = MIN (n, r->s.length - r->offset);
2219   memcpy (buf, r->s.string + r->offset, chunk);
2220   r->offset += chunk;
2221
2222   return chunk;
2223 }
2224
2225 static void
2226 lex_string_close (struct lex_reader *r_)
2227 {
2228   struct lex_string_reader *r = lex_string_reader_cast (r_);
2229
2230   ss_dealloc (&r->s);
2231   free (r);
2232 }
2233
/* Reader class for struct lex_string_reader, installed in new readers by
   lex_reader_for_substring_nocopy().

   The initializers are positional, so they must stay in the order in which
   struct lex_reader_class (declared elsewhere) lists its members. */
static struct lex_reader_class lex_string_reader_class =
  {
    lex_string_read,
    lex_string_close
  };