pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/macro.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source.

   Wraps the plain 'struct token' with the bookkeeping that records where the
   token came from: a span of the source buffer and, for tokens produced by
   macro expansion, a span of the expansion text. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call.

       src->tail <= line_pos <= token_pos <= src->head. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, these describe just this
       token within the expansion.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' are shared among tokens: lex_token_destroy()
       decrements *ref_cnt and frees both once the count reaches zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  86
  87 static void
  88 lex_token_destroy (struct lex_token *t)
  89 {
  90   token_uninit (&t->token);
  91   if (t->ref_cnt)
  92     {
  93       assert (*t->ref_cnt > 0);
  94       if (!--*t->ref_cnt)
  95         {
  96           free (t->macro_rep);
  97           free (t->ref_cnt);
  98         }
  99     }
 100   free (t);
 101 }
 102 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source. */
struct lex_stage
  {
    struct deque deque;         /* Bookkeeping; yields indexes into 'tokens'. */
    struct lex_token **tokens;  /* Token pointers, grown by deque_expand(). */
  };
 110
/* Forward declarations for the lex_stage helpers defined below, grouped as
   teardown, size queries, element access, insertion/removal, and transfer
   between stages. */
static void lex_stage_clear (struct lex_stage *);
static void lex_stage_uninit (struct lex_stage *);

static size_t lex_stage_count (const struct lex_stage *);
static bool lex_stage_is_empty (const struct lex_stage *);

static struct lex_token *lex_stage_last (struct lex_stage *);
static struct lex_token *lex_stage_first (struct lex_stage *);
static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);

static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
static void lex_stage_pop_first (struct lex_stage *);

static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
                             size_t n);
 126
 127 /* Deletes all the tokens from STAGE. */
 128 static void
 129 lex_stage_clear (struct lex_stage *stage)
 130 {
 131   while (!deque_is_empty (&stage->deque))
 132     lex_stage_pop_first (stage);
 133 }
 134
 135 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 136 static void
 137 lex_stage_uninit (struct lex_stage *stage)
 138 {
 139   lex_stage_clear (stage);
 140   free (stage->tokens);
 141 }
 142
 143 /* Returns true if STAGE contains no tokens, otherwise false. */
 144 static bool
 145 lex_stage_is_empty (const struct lex_stage *stage)
 146 {
 147   return deque_is_empty (&stage->deque);
 148 }
 149
 150 /* Returns the number of tokens in STAGE. */
 151 static size_t
 152 lex_stage_count (const struct lex_stage *stage)
 153 {
 154   return deque_count (&stage->deque);
 155 }
 156
 157 /* Returns the last token in STAGE, which must be nonempty.  The last token is
 158    the one accessed with the greatest lookahead. */
 159 static struct lex_token *
 160 lex_stage_last (struct lex_stage *stage)
 161 {
 162   return stage->tokens[deque_front (&stage->deque, 0)];
 163 }
 164
 165 /* Returns the first token in STAGE, which must be nonempty.
 166    The first token is the one accessed with the least lookahead. */
 167 static struct lex_token *
 168 lex_stage_first (struct lex_stage *stage)
 169 {
 170   return lex_stage_nth (stage, 0);
 171 }
 172
 173 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 174    lookahead) is 0, the second token is 1, and so on.  There must be at least
 175    INDEX + 1 tokens in STAGE. */
 176 static struct lex_token *
 177 lex_stage_nth (struct lex_stage *stage, size_t index)
 178 {
 179   return stage->tokens[deque_back (&stage->deque, index)];
 180 }
 181
 182 /* Adds TOKEN so that it becomes the last token in STAGE. */
 183 static void
 184 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 185 {
 186   if (deque_is_full (&stage->deque))
 187     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 188                                   sizeof *stage->tokens);
 189   stage->tokens[deque_push_front (&stage->deque)] = token;
 190 }
 191
 192 /* Removes the first token from STAGE and uninitializes it. */
 193 static void
 194 lex_stage_pop_first (struct lex_stage *stage)
 195 {
 196   lex_token_destroy (stage->tokens[deque_pop_back (&stage->deque)]);
 197 }
 198
 199 /* Removes the first N tokens from SRC, appending them to DST as the last
 200    tokens. */
 201 static void
 202 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 203 {
 204   for (size_t i = 0; i < n; i++)
 205     {
 206       lex_stage_push_last (dst, lex_stage_first (src));
 207       deque_pop_back (&src->deque);
 208     }
 209 }
 210
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;  /* The wrapped reader. */
    struct lexer *lexer;        /* Lexer passed to lex_source_create(). */
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'lookahead'.

       - lookahead: Tokens available to the client for parsing. */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_stage lookahead;
  };
 256
/* Forward declarations: construction and destruction of a lex_source.  Both
   are file-local and are used below before their definitions. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 260
/* Lexer.

   Holds the list of sources that tokens are drawn from and the set of macros
   defined so far. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Created by lex_create(), destroyed by
                                   lex_destroy(). */
  };
 267
/* Forward declarations of file-local helpers that the public functions below
   use before their definitions appear. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static bool lex_source_get_lookahead (struct lex_source *);
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 280 \f
 281 /* Initializes READER with the specified CLASS and otherwise some reasonable
 282    defaults.  The caller should fill in the others members as desired. */
 283 void
 284 lex_reader_init (struct lex_reader *reader,
 285                  const struct lex_reader_class *class)
 286 {
 287   reader->class = class;
 288   reader->syntax = SEG_MODE_AUTO;
 289   reader->error = LEX_ERROR_CONTINUE;
 290   reader->file_name = NULL;
 291   reader->encoding = NULL;
 292   reader->line_number = 0;
 293   reader->eof = false;
 294 }
 295
 296 /* Frees any file name already in READER and replaces it by a copy of
 297    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 298 void
 299 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 300 {
 301   free (reader->file_name);
 302   reader->file_name = xstrdup_if_nonnull (file_name);
 303 }
 304 \f
 305 /* Creates and returns a new lexer. */
 306 struct lexer *
 307 lex_create (void)
 308 {
 309   struct lexer *lexer = xmalloc (sizeof *lexer);
 310   *lexer = (struct lexer) {
 311     .sources = LL_INITIALIZER (lexer->sources),
 312     .macros = macro_set_create (),
 313   };
 314   return lexer;
 315 }
 316
 317 /* Destroys LEXER. */
 318 void
 319 lex_destroy (struct lexer *lexer)
 320 {
 321   if (lexer != NULL)
 322     {
 323       struct lex_source *source, *next;
 324
 325       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 326         lex_source_destroy (source);
 327       macro_set_destroy (lexer->macros);
 328       free (lexer);
 329     }
 330 }
 331
 332 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 333    same name.  Takes ownership of M. */
 334 void
 335 lex_define_macro (struct lexer *lexer, struct macro *m)
 336 {
 337   macro_set_add (lexer->macros, m);
 338 }
 339
 340 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 341    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 342    token. */
 343 void
 344 lex_include (struct lexer *lexer, struct lex_reader *reader)
 345 {
 346   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 347   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 348 }
 349
 350 /* Appends READER to LEXER, so that it will be read after all other current
 351    readers have already been read. */
 352 void
 353 lex_append (struct lexer *lexer, struct lex_reader *reader)
 354 {
 355   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 356 }
 357 \f
 358 /* Advancing. */
 359
 360 /* Advances LEXER to the next token, consuming the current token. */
 361 void
 362 lex_get (struct lexer *lexer)
 363 {
 364   struct lex_source *src;
 365
 366   src = lex_source__ (lexer);
 367   if (src == NULL)
 368     return;
 369
 370   if (!lex_stage_is_empty (&src->lookahead))
 371     lex_stage_pop_first (&src->lookahead);
 372
 373   while (lex_stage_is_empty (&src->lookahead))
 374     if (!lex_source_get_lookahead (src))
 375       {
 376         lex_source_destroy (src);
 377         src = lex_source__ (lexer);
 378         if (src == NULL)
 379           return;
 380       }
 381 }
 382 \f
 383 /* Issuing errors. */
 384
 385 /* Prints a syntax error message containing the current token and
 386    given message MESSAGE (if non-null). */
 387 void
 388 lex_error (struct lexer *lexer, const char *format, ...)
 389 {
 390   va_list args;
 391
 392   va_start (args, format);
 393   lex_next_error_valist (lexer, 0, 0, format, args);
 394   va_end (args);
 395 }
 396
 397 /* Prints a syntax error message containing the current token and
 398    given message MESSAGE (if non-null). */
 399 void
 400 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 401 {
 402   lex_next_error_valist (lexer, 0, 0, format, args);
 403 }
 404
 405 /* Prints a syntax error message containing the current token and
 406    given message MESSAGE (if non-null). */
 407 void
 408 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 409 {
 410   va_list args;
 411
 412   va_start (args, format);
 413   lex_next_error_valist (lexer, n0, n1, format, args);
 414   va_end (args);
 415 }
 416
 417 /* Prints a syntax error message saying that one of the strings provided as
 418    varargs, up to the first NULL, is expected. */
 419 void
 420 (lex_error_expecting) (struct lexer *lexer, ...)
 421 {
 422   va_list args;
 423
 424   va_start (args, lexer);
 425   lex_error_expecting_valist (lexer, args);
 426   va_end (args);
 427 }
 428
 429 /* Prints a syntax error message saying that one of the options provided in
 430    ARGS, up to the first NULL, is expected. */
 431 void
 432 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 433 {
 434   enum { MAX_OPTIONS = 9 };
 435   const char *options[MAX_OPTIONS];
 436   int n = 0;
 437   while (n < MAX_OPTIONS)
 438     {
 439       const char *option = va_arg (args, const char *);
 440       if (!option)
 441         break;
 442
 443       options[n++] = option;
 444     }
 445   lex_error_expecting_array (lexer, options, n);
 446 }
 447
 448 void
 449 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 450 {
 451   switch (n)
 452     {
 453     case 0:
 454       lex_error (lexer, NULL);
 455       break;
 456
 457     case 1:
 458       lex_error (lexer, _("expecting %s"), options[0]);
 459       break;
 460
 461     case 2:
 462       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 463       break;
 464
 465     case 3:
 466       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 467                  options[2]);
 468       break;
 469
 470     case 4:
 471       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 472                  options[0], options[1], options[2], options[3]);
 473       break;
 474
 475     case 5:
 476       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 477                  options[0], options[1], options[2], options[3], options[4]);
 478       break;
 479
 480     case 6:
 481       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 482                  options[0], options[1], options[2], options[3], options[4],
 483                  options[5]);
 484       break;
 485
 486     case 7:
 487       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 488                  options[0], options[1], options[2], options[3], options[4],
 489                  options[5], options[6]);
 490       break;
 491
 492     case 8:
 493       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 494                  options[0], options[1], options[2], options[3], options[4],
 495                  options[5], options[6], options[7]);
 496       break;
 497
 498     default:
 499       lex_error (lexer, NULL);
 500     }
 501 }
 502
 503 /* Reports an error to the effect that subcommand SBC may only be specified
 504    once.
 505
 506    This function does not take a lexer as an argument or use lex_error(),
 507    because the result would ordinarily just be redundant: "Syntax error at
 508    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 509    not help the user find the error. */
 510 void
 511 lex_sbc_only_once (const char *sbc)
 512 {
 513   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 514 }
 515
 516 /* Reports an error to the effect that subcommand SBC is missing.
 517
 518    This function does not take a lexer as an argument or use lex_error(),
 519    because a missing subcommand can normally be detected only after the whole
 520    command has been parsed, and so lex_error() would always report "Syntax
 521    error at end of command", which does not help the user find the error. */
 522 void
 523 lex_sbc_missing (const char *sbc)
 524 {
 525   msg (SE, _("Required subcommand %s was not specified."), sbc);
 526 }
 527
 528 /* Reports an error to the effect that specification SPEC may only be specified
 529    once within subcommand SBC. */
 530 void
 531 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 532 {
 533   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 534              spec, sbc);
 535 }
 536
 537 /* Reports an error to the effect that specification SPEC is missing within
 538    subcommand SBC. */
 539 void
 540 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 541 {
 542   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 543              sbc, spec);
 544 }
 545
 546 /* Prints a syntax error message containing the current token and
 547    given message MESSAGE (if non-null). */
 548 void
 549 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 550                        const char *format, va_list args)
 551 {
 552   struct lex_source *src = lex_source__ (lexer);
 553
 554   if (src != NULL)
 555     lex_source_error_valist (src, n0, n1, format, args);
 556   else
 557     {
 558       struct string s;
 559
 560       ds_init_empty (&s);
 561       ds_put_format (&s, _("Syntax error at end of input"));
 562       if (format != NULL)
 563         {
 564           ds_put_cstr (&s, ": ");
 565           ds_put_vformat (&s, format, args);
 566         }
 567       ds_put_byte (&s, '.');
 568       msg (SE, "%s", ds_cstr (&s));
 569       ds_destroy (&s);
 570     }
 571 }
 572
 573 /* Checks that we're at end of command.
 574    If so, returns a successful command completion code.
 575    If not, flags a syntax error and returns an error command
 576    completion code. */
 577 int
 578 lex_end_of_command (struct lexer *lexer)
 579 {
 580   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 581     {
 582       lex_error (lexer, _("expecting end of command"));
 583       return CMD_FAILURE;
 584     }
 585   else
 586     return CMD_SUCCESS;
 587 }
 588 \f
 589 /* Token testing functions. */
 590
 591 /* Returns true if the current token is a number. */
 592 bool
 593 lex_is_number (const struct lexer *lexer)
 594 {
 595   return lex_next_is_number (lexer, 0);
 596 }
 597
 598 /* Returns true if the current token is a string. */
 599 bool
 600 lex_is_string (const struct lexer *lexer)
 601 {
 602   return lex_next_is_string (lexer, 0);
 603 }
 604
 605 /* Returns the value of the current token, which must be a
 606    floating point number. */
 607 double
 608 lex_number (const struct lexer *lexer)
 609 {
 610   return lex_next_number (lexer, 0);
 611 }
 612
 613 /* Returns true iff the current token is an integer. */
 614 bool
 615 lex_is_integer (const struct lexer *lexer)
 616 {
 617   return lex_next_is_integer (lexer, 0);
 618 }
 619
 620 /* Returns the value of the current token, which must be an
 621    integer. */
 622 long
 623 lex_integer (const struct lexer *lexer)
 624 {
 625   return lex_next_integer (lexer, 0);
 626 }
 627 \f
 628 /* Token testing functions with lookahead.
 629
 630    A value of 0 for N as an argument to any of these functions refers to the
 631    current token.  Lookahead is limited to the current command.  Any N greater
 632    than the number of tokens remaining in the current command will be treated
 633    as referring to a T_ENDCMD token. */
 634
 635 /* Returns true if the token N ahead of the current token is a number. */
 636 bool
 637 lex_next_is_number (const struct lexer *lexer, int n)
 638 {
 639   return token_is_number (lex_next (lexer, n));
 640 }
 641
 642 /* Returns true if the token N ahead of the current token is a string. */
 643 bool
 644 lex_next_is_string (const struct lexer *lexer, int n)
 645 {
 646   return token_is_string (lex_next (lexer, n));
 647 }
 648
 649 /* Returns the value of the token N ahead of the current token, which must be a
 650    floating point number. */
 651 double
 652 lex_next_number (const struct lexer *lexer, int n)
 653 {
 654   return token_number (lex_next (lexer, n));
 655 }
 656
 657 /* Returns true if the token N ahead of the current token is an integer. */
 658 bool
 659 lex_next_is_integer (const struct lexer *lexer, int n)
 660 {
 661   return token_is_integer (lex_next (lexer, n));
 662 }
 663
 664 /* Returns the value of the token N ahead of the current token, which must be
 665    an integer. */
 666 long
 667 lex_next_integer (const struct lexer *lexer, int n)
 668 {
 669   return token_integer (lex_next (lexer, n));
 670 }
 671 \f
 672 /* Token matching functions. */
 673
 674 /* If the current token has the specified TYPE, skips it and returns true.
 675    Otherwise, returns false. */
 676 bool
 677 lex_match (struct lexer *lexer, enum token_type type)
 678 {
 679   if (lex_token (lexer) == type)
 680     {
 681       lex_get (lexer);
 682       return true;
 683     }
 684   else
 685     return false;
 686 }
 687
 688 /* If the current token matches IDENTIFIER, skips it and returns true.
 689    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 690    returns false.
 691
 692    IDENTIFIER must be an ASCII string. */
 693 bool
 694 lex_match_id (struct lexer *lexer, const char *identifier)
 695 {
 696   return lex_match_id_n (lexer, identifier, 3);
 697 }
 698
 699 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 700    may be abbreviated to its first N letters.  Otherwise, returns false.
 701
 702    IDENTIFIER must be an ASCII string. */
 703 bool
 704 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 705 {
 706   if (lex_token (lexer) == T_ID
 707       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 708     {
 709       lex_get (lexer);
 710       return true;
 711     }
 712   else
 713     return false;
 714 }
 715
 716 /* If the current token is integer X, skips it and returns true.  Otherwise,
 717    returns false. */
 718 bool
 719 lex_match_int (struct lexer *lexer, int x)
 720 {
 721   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 722     {
 723       lex_get (lexer);
 724       return true;
 725     }
 726   else
 727     return false;
 728 }
 729 \f
 730 /* Forced matches. */
 731
 732 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 733    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 734    false.
 735
 736    IDENTIFIER must be an ASCII string. */
 737 bool
 738 lex_force_match_id (struct lexer *lexer, const char *identifier)
 739 {
 740   if (lex_match_id (lexer, identifier))
 741     return true;
 742   else
 743     {
 744       lex_error_expecting (lexer, identifier);
 745       return false;
 746     }
 747 }
 748
 749 /* If the current token has the specified TYPE, skips it and returns true.
 750    Otherwise, reports an error and returns false. */
 751 bool
 752 lex_force_match (struct lexer *lexer, enum token_type type)
 753 {
 754   if (lex_token (lexer) == type)
 755     {
 756       lex_get (lexer);
 757       return true;
 758     }
 759   else
 760     {
 761       const char *type_string = token_type_to_string (type);
 762       if (type_string)
 763         {
 764           char *s = xasprintf ("`%s'", type_string);
 765           lex_error_expecting (lexer, s);
 766           free (s);
 767         }
 768       else
 769         lex_error_expecting (lexer, token_type_to_name (type));
 770
 771       return false;
 772     }
 773 }
 774
 775 /* If the current token is a string, does nothing and returns true.
 776    Otherwise, reports an error and returns false. */
 777 bool
 778 lex_force_string (struct lexer *lexer)
 779 {
 780   if (lex_is_string (lexer))
 781     return true;
 782   else
 783     {
 784       lex_error (lexer, _("expecting string"));
 785       return false;
 786     }
 787 }
 788
 789 /* If the current token is a string or an identifier, does nothing and returns
 790    true.  Otherwise, reports an error and returns false.
 791
 792    This is meant for use in syntactic situations where we want to encourage the
 793    user to supply a quoted string, but for compatibility we also accept
 794    identifiers.  (One example of such a situation is file names.)  Therefore,
 795    the error message issued when the current token is wrong only says that a
 796    string is expected and doesn't mention that an identifier would also be
 797    accepted. */
 798 bool
 799 lex_force_string_or_id (struct lexer *lexer)
 800 {
 801   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 802 }
 803
 804 /* If the current token is an integer, does nothing and returns true.
 805    Otherwise, reports an error and returns false. */
 806 bool
 807 lex_force_int (struct lexer *lexer)
 808 {
 809   if (lex_is_integer (lexer))
 810     return true;
 811   else
 812     {
 813       lex_error (lexer, _("expecting integer"));
 814       return false;
 815     }
 816 }
 817
 818 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 819    nothing and returns true.  Otherwise, reports an error and returns false.
 820    If NAME is nonnull, then it is used in the error message. */
 821 bool
 822 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 823 {
 824   bool is_number = lex_is_number (lexer);
 825   bool is_integer = lex_is_integer (lexer);
 826   bool too_small = (is_integer ? lex_integer (lexer) < min
 827                     : is_number ? lex_number (lexer) < min
 828                     : false);
 829   bool too_big = (is_integer ? lex_integer (lexer) > max
 830                   : is_number ? lex_number (lexer) > max
 831                   : false);
 832   if (is_integer && !too_small && !too_big)
 833     return true;
 834
 835   if (min > max)
 836     {
 837       /* Weird, maybe a bug in the caller.  Just report that we needed an
 838          integer. */
 839       if (name)
 840         lex_error (lexer, _("Integer expected for %s."), name);
 841       else
 842         lex_error (lexer, _("Integer expected."));
 843     }
 844   else if (min == max)
 845     {
 846       if (name)
 847         lex_error (lexer, _("Expected %ld for %s."), min, name);
 848       else
 849         lex_error (lexer, _("Expected %ld."), min);
 850     }
 851   else if (min + 1 == max)
 852     {
 853       if (name)
 854         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 855       else
 856         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 857     }
 858   else
 859     {
 860       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 861       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 862
 863       if (report_lower_bound && report_upper_bound)
 864         {
 865           if (name)
 866             lex_error (lexer,
 867                        _("Expected integer between %ld and %ld for %s."),
 868                        min, max, name);
 869           else
 870             lex_error (lexer, _("Expected integer between %ld and %ld."),
 871                        min, max);
 872         }
 873       else if (report_lower_bound)
 874         {
 875           if (min == 0)
 876             {
 877               if (name)
 878                 lex_error (lexer, _("Expected non-negative integer for %s."),
 879                            name);
 880               else
 881                 lex_error (lexer, _("Expected non-negative integer."));
 882             }
 883           else if (min == 1)
 884             {
 885               if (name)
 886                 lex_error (lexer, _("Expected positive integer for %s."),
 887                            name);
 888               else
 889                 lex_error (lexer, _("Expected positive integer."));
 890             }
 891         }
 892       else if (report_upper_bound)
 893         {
 894           if (name)
 895             lex_error (lexer,
 896                        _("Expected integer less than or equal to %ld for %s."),
 897                        max, name);
 898           else
 899             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 900                        max);
 901         }
 902       else
 903         {
 904           if (name)
 905             lex_error (lexer, _("Integer expected for %s."), name);
 906           else
 907             lex_error (lexer, _("Integer expected."));
 908         }
 909     }
 910   return false;
 911 }
 912
 913 /* If the current token is a number, does nothing and returns true.
 914    Otherwise, reports an error and returns false. */
 915 bool
 916 lex_force_num (struct lexer *lexer)
 917 {
 918   if (lex_is_number (lexer))
 919     return true;
 920
 921   lex_error (lexer, _("expecting number"));
 922   return false;
 923 }
 924
 925 /* If the current token is an identifier, does nothing and returns true.
 926    Otherwise, reports an error and returns false. */
 927 bool
 928 lex_force_id (struct lexer *lexer)
 929 {
 930   if (lex_token (lexer) == T_ID)
 931     return true;
 932
 933   lex_error (lexer, _("expecting identifier"));
 934   return false;
 935 }
 936 \f
 937 /* Token accessors. */
 938
 939 /* Returns the type of LEXER's current token. */
 940 enum token_type
 941 lex_token (const struct lexer *lexer)
 942 {
 943   return lex_next_token (lexer, 0);
 944 }
 945
 946 /* Returns the number in LEXER's current token.
 947
 948    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 949    tokens this function will always return zero. */
 950 double
 951 lex_tokval (const struct lexer *lexer)
 952 {
 953   return lex_next_tokval (lexer, 0);
 954 }
 955
 956 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 957
 958    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 959    this functions this function will always return NULL.
 960
 961    The UTF-8 encoding of the returned string is correct for variable names and
 962    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 963    data_in() to use it in a "union value".  */
 964 const char *
 965 lex_tokcstr (const struct lexer *lexer)
 966 {
 967   return lex_next_tokcstr (lexer, 0);
 968 }
 969
 970 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 971    null-terminated (but the null terminator is not included in the returned
 972    substring's 'length').
 973
 974    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 975    this functions this function will always return NULL.
 976
 977    The UTF-8 encoding of the returned string is correct for variable names and
 978    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 979    data_in() to use it in a "union value".  */
 980 struct substring
 981 lex_tokss (const struct lexer *lexer)
 982 {
 983   return lex_next_tokss (lexer, 0);
 984 }
 985 \f
 986 /* Looking ahead.
 987
 988    A value of 0 for N as an argument to any of these functions refers to the
 989    current token.  Lookahead is limited to the current command.  Any N greater
 990    than the number of tokens remaining in the current command will be treated
 991    as referring to a T_ENDCMD token. */
 992
 993 static const struct lex_token *
 994 lex_next__ (const struct lexer *lexer_, int n)
 995 {
 996   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 997   struct lex_source *src = lex_source__ (lexer);
 998
 999   if (src != NULL)
1000     return lex_source_next__ (src, n);
1001   else
1002     {
1003       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1004       return &stop_token;
1005     }
1006 }
1007
1008 static const struct lex_token *
1009 lex_source_next__ (const struct lex_source *src_, int n)
1010 {
1011   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1012   while (lex_stage_count (&src->lookahead) <= n)
1013     {
1014       if (!lex_stage_is_empty (&src->lookahead))
1015         {
1016           const struct lex_token *t = lex_stage_last (&src->lookahead);
1017           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1018             return t;
1019         }
1020
1021       lex_source_get_lookahead (src);
1022     }
1023
1024   return lex_stage_nth (&src->lookahead, n);
1025 }
1026
1027 /* Returns the "struct token" of the token N after the current one in LEXER.
1028    The returned pointer can be invalidated by pretty much any succeeding call
1029    into the lexer, although the string pointer within the returned token is
1030    only invalidated by consuming the token (e.g. with lex_get()). */
1031 const struct token *
1032 lex_next (const struct lexer *lexer, int n)
1033 {
1034   return &lex_next__ (lexer, n)->token;
1035 }
1036
1037 /* Returns the type of the token N after the current one in LEXER. */
1038 enum token_type
1039 lex_next_token (const struct lexer *lexer, int n)
1040 {
1041   return lex_next (lexer, n)->type;
1042 }
1043
/* Returns the number in the token N after the current one in LEXER.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other token type this function always returns zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1053
1054 /* Returns the null-terminated string in the token N after the current one, in
1055    UTF-8 encoding.
1056
1057    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1058    this functions this function will always return NULL.
1059
1060    The UTF-8 encoding of the returned string is correct for variable names and
1061    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1062    data_in() to use it in a "union value".  */
1063 const char *
1064 lex_next_tokcstr (const struct lexer *lexer, int n)
1065 {
1066   return lex_next_tokss (lexer, n).string;
1067 }
1068
1069 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1070    The string is null-terminated (but the null terminator is not included in
1071    the returned substring's 'length').
1072
1073    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1074    tokens this functions this function will always return NULL.
1075
1076    The UTF-8 encoding of the returned string is correct for variable names and
1077    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1078    data_in() to use it in a "union value".  */
1079 struct substring
1080 lex_next_tokss (const struct lexer *lexer, int n)
1081 {
1082   return lex_next (lexer, n)->string;
1083 }
1084
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, there is no syntax to report, so this
   returns a newly allocated empty string (which the caller must still
   free). */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source argument, so a lexer
     without any source must be handled here, in the same way that the other
     public accessors in this file check for a null source. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1097
1098 /* Returns true if the token N ahead of the current one was produced by macro
1099    expansion, false otherwise. */
1100 bool
1101 lex_next_is_from_macro (const struct lexer *lexer, int n)
1102 {
1103   return lex_next__ (lexer, n)->macro_rep != NULL;
1104 }
1105
1106 static bool
1107 lex_tokens_match (const struct token *actual, const struct token *expected)
1108 {
1109   if (actual->type != expected->type)
1110     return false;
1111
1112   switch (actual->type)
1113     {
1114     case T_POS_NUM:
1115     case T_NEG_NUM:
1116       return actual->number == expected->number;
1117
1118     case T_ID:
1119       return lex_id_match (expected->string, actual->string);
1120
1121     case T_STRING:
1122       return (actual->string.length == expected->string.length
1123               && !memcmp (actual->string.string, expected->string.string,
1124                           actual->string.length));
1125
1126     default:
1127       return true;
1128     }
1129 }
1130
1131 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1132    skips it and returns true.  Otherwise, returns false.
1133
1134    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1135    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
1136    first three letters. */
1137 bool
1138 lex_match_phrase (struct lexer *lexer, const char *s)
1139 {
1140   struct string_lexer slex;
1141   struct token token;
1142   int i;
1143
1144   i = 0;
1145   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1146   while (string_lexer_next (&slex, &token))
1147     {
1148       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1149       token_uninit (&token);
1150       if (!match)
1151         return false;
1152     }
1153
1154   while (i-- > 0)
1155     lex_get (lexer);
1156   return true;
1157 }
1158
/* Returns the number of new-line ('\n') bytes among the LENGTH bytes that
   start at S. */
static int
count_newlines (char *s, size_t length)
{
  char *end = s + length;
  int count = 0;

  /* Hop from one new-line to the next, each time searching only the bytes
     that remain after the previous hit. */
  for (char *p = memchr (s, '\n', length); p != NULL;
       p = memchr (p + 1, '\n', end - (p + 1)))
    count++;

  return count;
}
1174
1175 static int
1176 lex_token_get_last_line_number (const struct lex_source *src,
1177                                 const struct lex_token *token)
1178 {
1179   if (token->first_line == 0)
1180     return 0;
1181   else
1182     {
1183       char *token_str = &src->buffer[token->token_pos - src->tail];
1184       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1185     }
1186 }
1187
1188 static int
1189 count_columns (const char *s_, size_t length)
1190 {
1191   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1192   int columns;
1193   size_t ofs;
1194   int mblen;
1195
1196   columns = 0;
1197   for (ofs = 0; ofs < length; ofs += mblen)
1198     {
1199       ucs4_t uc;
1200
1201       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1202       if (uc != '\t')
1203         {
1204           int width = uc_width (uc, "UTF-8");
1205           if (width > 0)
1206             columns += width;
1207         }
1208       else
1209         columns = ROUND_UP (columns + 1, 8);
1210     }
1211
1212   return columns + 1;
1213 }
1214
1215 static int
1216 lex_token_get_first_column (const struct lex_source *src,
1217                             const struct lex_token *token)
1218 {
1219   return count_columns (&src->buffer[token->line_pos - src->tail],
1220                         token->token_pos - token->line_pos);
1221 }
1222
1223 static int
1224 lex_token_get_last_column (const struct lex_source *src,
1225                            const struct lex_token *token)
1226 {
1227   char *start, *end, *newline;
1228
1229   start = &src->buffer[token->line_pos - src->tail];
1230   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1231   newline = memrchr (start, '\n', end - start);
1232   if (newline != NULL)
1233     start = newline + 1;
1234   return count_columns (start, end - start);
1235 }
1236
1237 static struct msg_location
1238 lex_token_location (const struct lex_source *src,
1239                     const struct lex_token *t0,
1240                     const struct lex_token *t1)
1241 {
1242   return (struct msg_location) {
1243     .file_name = src->reader->file_name,
1244     .first_line = t0->first_line,
1245     .last_line = lex_token_get_last_line_number (src, t1),
1246     .first_column = lex_token_get_first_column (src, t0),
1247     .last_column = lex_token_get_last_column (src, t1),
1248   };
1249 }
1250
1251 static struct msg_location *
1252 lex_token_location_rw (const struct lex_source *src,
1253                        const struct lex_token *t0,
1254                        const struct lex_token *t1)
1255 {
1256   struct msg_location location = lex_token_location (src, t0, t1);
1257   return msg_location_dup (&location);
1258 }
1259
/* Returns a duplicated msg_location (see lex_token_location_rw()) for the
   syntax in SRC from the token N0 ahead of the current one through the token
   N1 ahead of the current one. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1267
1268 /* Returns the 1-based line number of the start of the syntax that represents
1269    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1270    if the token is drawn from a source that does not have line numbers. */
1271 int
1272 lex_get_first_line_number (const struct lexer *lexer, int n)
1273 {
1274   const struct lex_source *src = lex_source__ (lexer);
1275   return src ? lex_source_next__ (src, n)->first_line : 0;
1276 }
1277
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
   token or if the token is drawn from a source that does not have line
   numbers.  Also returns 0 if LEXER has no source.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_token_get_last_line_number (src, lex_source_next__ (src, n));
}
1295
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   source (the case in which the current token is reported as T_STOP).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_token_get_first_column (src, lex_source_next__ (src, n));
}
1309
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no source (the case in which the current token is reported as T_STOP).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_token_get_last_column (src, lex_source_next__ (src, n));
}
1323
1324 /* Returns the name of the syntax file from which the current command is drawn.
1325    Returns NULL for a T_STOP token or if the command's source does not have
1326    line numbers.
1327
1328    There is no version of this function that takes an N argument because
1329    lookahead only works to the end of a command and any given command is always
1330    within a single syntax file. */
1331 const char *
1332 lex_get_file_name (const struct lexer *lexer)
1333 {
1334   struct lex_source *src = lex_source__ (lexer);
1335   return src == NULL ? NULL : src->reader->file_name;
1336 }
1337
1338 /* Returns a newly allocated msg_location for the syntax that represents tokens
1339    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1340    must eventually free the location (with msg_location_destroy()). */
1341 struct msg_location *
1342 lex_get_location (const struct lexer *lexer, int n0, int n1)
1343 {
1344   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1345   loc->first_column = lex_get_first_column (lexer, n0);
1346   loc->last_column = lex_get_last_column (lexer, n1);
1347   return loc;
1348 }
1349
1350 /* Returns a newly allocated msg_location for the syntax that represents tokens
1351    with 0-based offsets N0...N1, inclusive, from the current token.  The
1352    location only covers the tokens' lines, not the columns.  The caller must
1353    eventually free the location (with msg_location_destroy()). */
1354 struct msg_location *
1355 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1356 {
1357   struct msg_location *loc = xmalloc (sizeof *loc);
1358   *loc = (struct msg_location) {
1359     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1360     .first_line = lex_get_first_line_number (lexer, n0),
1361     .last_line = lex_get_last_line_number (lexer, n1),
1362   };
1363   return loc;
1364 }
1365
1366 const char *
1367 lex_get_encoding (const struct lexer *lexer)
1368 {
1369   struct lex_source *src = lex_source__ (lexer);
1370   return src == NULL ? NULL : src->reader->encoding;
1371 }
1372
1373 /* Returns the syntax mode for the syntax file from which the current drawn is
1374    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1375    does not have line numbers.
1376
1377    There is no version of this function that takes an N argument because
1378    lookahead only works to the end of a command and any given command is always
1379    within a single syntax file. */
1380 enum segmenter_mode
1381 lex_get_syntax_mode (const struct lexer *lexer)
1382 {
1383   struct lex_source *src = lex_source__ (lexer);
1384   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1385 }
1386
1387 /* Returns the error mode for the syntax file from which the current drawn is
1388    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1389    source does not have line numbers.
1390
1391    There is no version of this function that takes an N argument because
1392    lookahead only works to the end of a command and any given command is always
1393    within a single syntax file. */
1394 enum lex_error_mode
1395 lex_get_error_mode (const struct lexer *lexer)
1396 {
1397   struct lex_source *src = lex_source__ (lexer);
1398   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1399 }
1400
1401 /* If the source that LEXER is currently reading has error mode
1402    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1403    token to be read comes directly from whatever is next read from the stream.
1404
1405    It makes sense to call this function after encountering an error in a
1406    command entered on the console, because usually the user would prefer not to
1407    have cascading errors. */
1408 void
1409 lex_interactive_reset (struct lexer *lexer)
1410 {
1411   struct lex_source *src = lex_source__ (lexer);
1412   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1413     {
1414       src->head = src->tail = 0;
1415       src->journal_pos = src->seg_pos = src->line_pos = 0;
1416       src->n_newlines = 0;
1417       src->suppress_next_newline = false;
1418       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1419                                        false);
1420       lex_stage_clear (&src->pp);
1421       lex_stage_clear (&src->merge);
1422       lex_stage_clear (&src->lookahead);
1423       lex_source_push_endcmd__ (src);
1424     }
1425 }
1426
1427 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1428 void
1429 lex_discard_rest_of_command (struct lexer *lexer)
1430 {
1431   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1432     lex_get (lexer);
1433 }
1434
1435 /* Discards all lookahead tokens in LEXER, then discards all input sources
1436    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1437    runs out of input sources. */
1438 void
1439 lex_discard_noninteractive (struct lexer *lexer)
1440 {
1441   struct lex_source *src = lex_source__ (lexer);
1442
1443   if (src != NULL)
1444     {
1445       lex_stage_clear (&src->pp);
1446       lex_stage_clear (&src->merge);
1447       lex_stage_clear (&src->lookahead);
1448
1449       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1450            src = lex_source__ (lexer))
1451         lex_source_destroy (src);
1452     }
1453 }
1454 \f
1455 static size_t
1456 lex_source_max_tail__ (const struct lex_source *src_)
1457 {
1458   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1459
1460   assert (src->seg_pos >= src->line_pos);
1461   size_t max_tail = MIN (src->journal_pos, src->line_pos);
1462
1463   /* Use the oldest token also. */
1464   struct lex_stage *stages[] = { &src->lookahead, &src->merge, &src->pp };
1465   for (size_t i = 0; i < sizeof stages / sizeof *stages; i++)
1466     if (!lex_stage_is_empty (stages[i]))
1467       {
1468         struct lex_token *first = lex_stage_first (stages[i]);
1469         assert (first->token_pos >= first->line_pos);
1470         return MIN (max_tail, first->line_pos);
1471       }
1472
1473   return max_tail;
1474 }
1475
1476 static void
1477 lex_source_expand__ (struct lex_source *src)
1478 {
1479   if (src->head - src->tail >= src->allocated)
1480     {
1481       size_t max_tail = lex_source_max_tail__ (src);
1482       if (max_tail > src->tail)
1483         {
1484           /* Advance the tail, freeing up room at the head. */
1485           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1486                    src->head - max_tail);
1487           src->tail = max_tail;
1488         }
1489       else
1490         {
1491           /* Buffer is completely full.  Expand it. */
1492           src->buffer = x2realloc (src->buffer, &src->allocated);
1493         }
1494     }
1495   else
1496     {
1497       /* There's space available at the head of the buffer.  Nothing to do. */
1498     }
1499 }
1500
1501 static void
1502 lex_source_read__ (struct lex_source *src)
1503 {
1504   do
1505     {
1506       lex_source_expand__ (src);
1507
1508       size_t head_ofs = src->head - src->tail;
1509       size_t space = src->allocated - head_ofs;
1510       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1511       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1512                                            space, prompt);
1513       assert (n <= space);
1514
1515       if (n == 0)
1516         {
1517           /* End of input. */
1518           src->reader->eof = true;
1519           lex_source_expand__ (src);
1520           return;
1521         }
1522
1523       src->head += n;
1524     }
1525   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1526                   src->head - src->seg_pos));
1527 }
1528
1529 static struct lex_source *
1530 lex_source__ (const struct lexer *lexer)
1531 {
1532   return (ll_is_empty (&lexer->sources) ? NULL
1533           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1534 }
1535
1536 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1537    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1538    and N1 are both zero, this requests the syntax for the current token.)  The
1539    caller must eventually free the returned string (with free()).  The syntax
1540    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1541    for example, it may include comments, spaces, and new-lines if it spans
1542    multiple tokens.  Macro expansion, however, has already been performed. */
1543 static char *
1544 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1545 {
1546   struct string s = DS_EMPTY_INITIALIZER;
1547   for (size_t i = n0; i <= n1; )
1548     {
1549       /* Find [I,J) as the longest sequence of tokens not produced by macro
1550          expansion, or otherwise the longest sequence expanded from a single
1551          macro call. */
1552       const struct lex_token *first = lex_source_next__ (src, i);
1553       size_t j;
1554       for (j = i + 1; j <= n1; j++)
1555         {
1556           const struct lex_token *cur = lex_source_next__ (src, j);
1557           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1558               || first->macro_rep != cur->macro_rep)
1559             break;
1560         }
1561       const struct lex_token *last = lex_source_next__ (src, j - 1);
1562
1563       /* Now add the syntax for this sequence of tokens to SRC. */
1564       if (!ds_is_empty (&s))
1565         ds_put_byte (&s, ' ');
1566       if (!first->macro_rep)
1567         {
1568           size_t start = first->token_pos;
1569           size_t end = last->token_pos + last->token_len;
1570           ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1571                                            end - start));
1572         }
1573       else
1574         {
1575           size_t start = first->ofs;
1576           size_t end = last->ofs + last->len;
1577           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1578                                            end - start));
1579         }
1580
1581       i = j;
1582     }
1583   return ds_steal_cstr (&s);
1584 }
1585
1586 static bool
1587 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1588 {
1589   for (size_t i = n0; i <= n1; i++)
1590     if (lex_source_next__ (src, i)->macro_rep)
1591       return true;
1592   return false;
1593 }
1594
1595 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1596    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1597    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1598    the original form supplied to the lexer so that, for example, it may include
1599    comments, spaces, and new-lines if it spans multiple tokens.
1600
1601    Returns an empty string if the token range doesn't include a macro call.
1602
1603    The caller must not modify or free the returned string. */
1604 static struct substring
1605 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1606 {
1607   if (!lex_source_contains_macro_call (src, n0, n1))
1608     return ss_empty ();
1609
1610   const struct lex_token *token0 = lex_source_next__ (src, n0);
1611   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1612   size_t start = token0->token_pos;
1613   size_t end = token1->token_pos + token1->token_len;
1614
1615   return ss_buffer (&src->buffer[start - src->tail], end - start);
1616 }
1617
1618 static void
1619 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1620                          const char *format, va_list args)
1621 {
1622   const struct lex_token *token;
1623   struct string s;
1624
1625   ds_init_empty (&s);
1626
1627   token = lex_source_next__ (src, n0);
1628   if (token->token.type == T_ENDCMD)
1629     ds_put_cstr (&s, _("Syntax error at end of command"));
1630   else
1631     {
1632       /* Get the syntax that caused the error. */
1633       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1634       char syntax[64];
1635       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1636       free (raw_syntax);
1637
1638       /* Get the macro call(s) that expanded to the syntax that caused the
1639          error. */
1640       char call[64];
1641       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1642                      call, sizeof call);
1643
1644       if (syntax[0])
1645         {
1646           if (call[0])
1647             ds_put_format (&s,
1648                            _("Syntax error at `%s' (in expansion of `%s')"),
1649                            syntax, call);
1650           else
1651             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1652         }
1653       else
1654         {
1655           if (call[0])
1656             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1657                            call);
1658           else
1659             ds_put_cstr (&s, _("Syntax error"));
1660         }
1661     }
1662
1663   if (format)
1664     {
1665       ds_put_cstr (&s, ": ");
1666       ds_put_vformat (&s, format, args);
1667     }
1668   if (ds_last (&s) != '.')
1669     ds_put_byte (&s, '.');
1670
1671   struct msg *m = xmalloc (sizeof *m);
1672   *m = (struct msg) {
1673     .category = MSG_C_SYNTAX,
1674     .severity = MSG_S_ERROR,
1675     .location = lex_source_get_location (src, n0, n1),
1676     .text = ds_steal_cstr (&s),
1677   };
1678   msg_emit (m);
1679 }
1680
1681 static void
1682 lex_get_error (struct lex_source *src, const struct lex_token *token)
1683 {
1684   char syntax[64];
1685   str_ellipsize (ss_buffer (&src->buffer[token->token_pos - src->tail],
1686                             token->token_len),
1687                  syntax, sizeof syntax);
1688
1689   struct string s = DS_EMPTY_INITIALIZER;
1690   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1691   ds_put_format (&s, ": %s", token->token.string.string);
1692
1693   struct msg *m = xmalloc (sizeof *m);
1694   *m = (struct msg) {
1695     .category = MSG_C_SYNTAX,
1696     .severity = MSG_S_ERROR,
1697     .location = lex_token_location_rw (src, token, token),
1698     .text = ds_steal_cstr (&s),
1699   };
1700   msg_emit (m);
1701 }
1702
1703 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1704    underlying lex_reader if necessary.  Returns true if a new token was added
1705    to SRC's deque, false otherwise.  The caller should retry failures unless
1706    SRC's 'eof' marker was set to true indicating that there will be no more
1707    tokens from this source. */
1708 static bool
1709 lex_source_try_get_pp (struct lex_source *src)
1710 {
1711   /* Append a new token to SRC and initialize it. */
1712   struct lex_token *token = xmalloc (sizeof *token);
1713   token->token = (struct token) { .type = T_STOP };
1714   token->macro_rep = NULL;
1715   token->ref_cnt = NULL;
1716   token->line_pos = src->line_pos;
1717   token->token_pos = src->seg_pos;
1718   if (src->reader->line_number > 0)
1719     token->first_line = src->reader->line_number + src->n_newlines;
1720   else
1721     token->first_line = 0;
1722
1723   /* Extract a segment. */
1724   const char *segment;
1725   enum segment_type seg_type;
1726   int seg_len;
1727   for (;;)
1728     {
1729       segment = &src->buffer[src->seg_pos - src->tail];
1730       seg_len = segmenter_push (&src->segmenter, segment,
1731                                 src->head - src->seg_pos,
1732                                 src->reader->eof, &seg_type);
1733       if (seg_len >= 0)
1734         break;
1735
1736       /* The segmenter needs more input to produce a segment. */
1737       assert (!src->reader->eof);
1738       lex_source_read__ (src);
1739     }
1740
1741   /* Update state based on the segment. */
1742   token->token_len = seg_len;
1743   src->seg_pos += seg_len;
1744   if (seg_type == SEG_NEWLINE)
1745     {
1746       src->line_pos = src->seg_pos;
1747       src->n_newlines++;
1748     }
1749
1750   /* Get a token from the segment. */
1751   enum tokenize_result result = token_from_segment (
1752     seg_type, ss_buffer (segment, seg_len), &token->token);
1753
1754   /* If we've reached the end of a line, or the end of a command, then pass
1755      the line to the output engine as a syntax text item.  */
1756   int n_lines = seg_type == SEG_NEWLINE;
1757   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1758     {
1759       n_lines++;
1760       src->suppress_next_newline = true;
1761     }
1762   else if (n_lines > 0 && src->suppress_next_newline)
1763     {
1764       n_lines--;
1765       src->suppress_next_newline = false;
1766     }
1767   for (int i = 0; i < n_lines; i++)
1768     {
1769       /* Beginning of line. */
1770       const char *line = &src->buffer[src->journal_pos - src->tail];
1771
1772       /* Calculate line length, including \n or \r\n end-of-line if present.
1773
1774          We use src->head even though that may be beyond what we've actually
1775          converted to tokens (which is only through line_pos).  That's because,
1776          if we're emitting the line due to SEG_END_COMMAND, we want to take the
1777          whole line through the newline, not just through the '.'. */
1778       size_t max_len = src->head - src->journal_pos;
1779       const char *newline = memchr (line, '\n', max_len);
1780       size_t line_len = newline ? newline - line + 1 : max_len;
1781
1782       /* Calculate line length excluding end-of-line. */
1783       size_t copy_len = line_len;
1784       if (copy_len > 0 && line[copy_len - 1] == '\n')
1785         copy_len--;
1786       if (copy_len > 0 && line[copy_len - 1] == '\r')
1787         copy_len--;
1788
1789       /* Submit the line as syntax. */
1790       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1791                                                    xmemdup0 (line, copy_len),
1792                                                    NULL));
1793
1794       src->journal_pos += line_len;
1795     }
1796
1797   switch (result)
1798     {
1799     case TOKENIZE_ERROR:
1800       lex_get_error (src, token);
1801       /* Fall through. */
1802     case TOKENIZE_EMPTY:
1803       lex_token_destroy (token);
1804       return false;
1805
1806     case TOKENIZE_TOKEN:
1807       if (token->token.type == T_STOP)
1808         {
1809           token->token.type = T_ENDCMD;
1810           src->eof = true;
1811         }
1812       lex_stage_push_last (&src->pp, token);
1813       return true;
1814     }
1815   NOT_REACHED ();
1816 }
1817
1818 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1819    failure.  On failure, the end of SRC has been reached and no more tokens
1820    will be forthcoming from it.
1821
1822    Does not make the new token available for lookahead yet; the caller must
1823    adjust SRC's 'middle' pointer to do so. */
1824 static bool
1825 lex_source_get_pp (struct lex_source *src)
1826 {
1827   while (!src->eof)
1828     if (lex_source_try_get_pp (src))
1829       return true;
1830   return false;
1831 }
1832
1833 static bool
1834 lex_source_try_get_merge (const struct lex_source *src_)
1835 {
1836   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1837
1838   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1839     return false;
1840
1841   if (!settings_get_mexpand ())
1842     {
1843       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1844       return true;
1845     }
1846
1847   /* Now pass tokens one-by-one to the macro expander.
1848
1849      In the common case where there is no macro to expand, the loop is not
1850      entered.  */
1851   struct macro_call *mc;
1852   int n_call = macro_call_create (src->lexer->macros,
1853                                   &lex_stage_first (&src->pp)->token, &mc);
1854   for (int ofs = 1; !n_call; ofs++)
1855     {
1856       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1857         {
1858           /* This should not be reachable because we always get a T_ENDCMD at
1859              the end of an input file (transformed from T_STOP by
1860              lex_source_try_get_pp()) and the macro_expander should always
1861              terminate expansion on T_ENDCMD. */
1862           NOT_REACHED ();
1863         }
1864
1865       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1866       size_t start = t->token_pos;
1867       size_t end = t->token_pos + t->token_len;
1868       const struct macro_token mt = {
1869         .token = t->token,
1870         .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
1871       };
1872       const struct msg_location loc = lex_token_location (src, t, t);
1873       n_call = macro_call_add (mc, &mt, &loc);
1874     }
1875   if (n_call < 0)
1876     {
1877       /* False alarm: no macro expansion after all.  Use first token as
1878          lookahead.  We'll retry macro expansion from the second token next
1879          time around. */
1880       macro_call_destroy (mc);
1881       lex_stage_shift (&src->merge, &src->pp, 1);
1882       return true;
1883     }
1884
1885   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1886      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1887      Expand them.  */
1888   const struct lex_token *c0 = lex_stage_first (&src->pp);
1889   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1890   struct macro_tokens expansion = { .n = 0 };
1891   struct msg_location loc = lex_token_location (src, c0, c1);
1892   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1893   macro_call_destroy (mc);
1894
1895   /* Convert the macro expansion into syntax for possible error messages
1896      later. */
1897   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1898   size_t *len = xnmalloc (expansion.n, sizeof *len);
1899   struct string s = DS_EMPTY_INITIALIZER;
1900   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1901
1902   if (settings_get_mprint ())
1903     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1904                                           _("Macro Expansion")));
1905
1906   /* Append the macro expansion tokens to the lookahead. */
1907   if (expansion.n > 0)
1908     {
1909       char *macro_rep = ds_steal_cstr (&s);
1910       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1911       *ref_cnt = expansion.n;
1912       for (size_t i = 0; i < expansion.n; i++)
1913         {
1914           struct lex_token *token = xmalloc (sizeof *token);
1915           *token = (struct lex_token) {
1916             .token = expansion.mts[i].token,
1917             .token_pos = c0->token_pos,
1918             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1919             .line_pos = c0->line_pos,
1920             .first_line = c0->first_line,
1921             .macro_rep = macro_rep,
1922             .ofs = ofs[i],
1923             .len = len[i],
1924             .ref_cnt = ref_cnt,
1925           };
1926           lex_stage_push_last (&src->merge, token);
1927
1928           ss_dealloc (&expansion.mts[i].syntax);
1929         }
1930     }
1931   else
1932     ds_destroy (&s);
1933   free (expansion.mts);
1934   free (ofs);
1935   free (len);
1936
1937   /* Destroy the tokens for the call. */
1938   for (size_t i = 0; i < n_call; i++)
1939     lex_stage_pop_first (&src->pp);
1940
1941   return expansion.n > 0;
1942 }
1943
1944 /* Attempts to obtain at least one new token into 'merge' in SRC.
1945
1946    Returns true if successful, false on failure.  In the latter case, SRC is
1947    exhausted and 'src->eof' is now true. */
1948 static bool
1949 lex_source_get_merge (struct lex_source *src)
1950 {
1951   while (!src->eof)
1952     if (lex_source_try_get_merge (src))
1953       return true;
1954   return false;
1955 }
1956
1957 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1958
1959    Returns true if successful, false on failure.  In the latter case, SRC is
1960    exhausted and 'src->eof' is now true. */
1961 static bool
1962 lex_source_get_lookahead (struct lex_source *src)
1963 {
1964   struct merger m = MERGER_INIT;
1965   struct token out;
1966   for (size_t i = 0; ; i++)
1967     {
1968       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1969         {
1970           /* We always get a T_ENDCMD at the end of an input file
1971              (transformed from T_STOP by lex_source_try_get_pp()) and
1972              merger_add() should never return -1 on T_ENDCMD. */
1973           assert (lex_stage_is_empty (&src->merge));
1974           return false;
1975         }
1976
1977       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1978                                &out);
1979       if (!retval)
1980         {
1981           lex_stage_shift (&src->lookahead, &src->merge, 1);
1982           return true;
1983         }
1984       else if (retval > 0)
1985         {
1986           /* Add a token that merges all the tokens together. */
1987           const struct lex_token *first = lex_stage_first (&src->merge);
1988           const struct lex_token *last = lex_stage_nth (&src->merge,
1989                                                         retval - 1);
1990           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1991           struct lex_token *t = xmalloc (sizeof *t);
1992           *t = (struct lex_token) {
1993             .token = out,
1994             .token_pos = first->token_pos,
1995             .token_len = (last->token_pos - first->token_pos) + last->token_len,
1996             .line_pos = first->line_pos,
1997             .first_line = first->first_line,
1998
1999             /* This works well if all the tokens were not expanded from macros,
2000                or if they came from the same macro expansion.  It just gives up
2001                in the other (corner) cases. */
2002             .macro_rep = macro ? first->macro_rep : NULL,
2003             .ofs = macro ? first->ofs : 0,
2004             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2005             .ref_cnt = macro ? first->ref_cnt : NULL,
2006           };
2007           if (t->ref_cnt)
2008             ++*t->ref_cnt;
2009           lex_stage_push_last (&src->lookahead, t);
2010
2011           for (int i = 0; i < retval; i++)
2012             lex_stage_pop_first (&src->merge);
2013           return true;
2014         }
2015     }
2016 }
2017 \f
2018 static void
2019 lex_source_push_endcmd__ (struct lex_source *src)
2020 {
2021   assert (lex_stage_is_empty (&src->lookahead));
2022   struct lex_token *token = xmalloc (sizeof *token);
2023   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2024   lex_stage_push_last (&src->lookahead, token);
2025 }
2026
2027 static struct lex_source *
2028 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2029 {
2030   struct lex_source *src = xmalloc (sizeof *src);
2031   *src = (struct lex_source) {
2032     .reader = reader,
2033     .segmenter = segmenter_init (reader->syntax, false),
2034     .lexer = lexer,
2035   };
2036
2037   lex_source_push_endcmd__ (src);
2038
2039   return src;
2040 }
2041
2042 static void
2043 lex_source_destroy (struct lex_source *src)
2044 {
2045   char *file_name = src->reader->file_name;
2046   char *encoding = src->reader->encoding;
2047   if (src->reader->class->destroy != NULL)
2048     src->reader->class->destroy (src->reader);
2049   free (file_name);
2050   free (encoding);
2051   free (src->buffer);
2052   lex_stage_uninit (&src->pp);
2053   lex_stage_uninit (&src->merge);
2054   lex_stage_uninit (&src->lookahead);
2055   ll_remove (&src->ll);
2056   free (src);
2057 }
2058 \f
2059 struct lex_file_reader
2060   {
2061     struct lex_reader reader;
2062     struct u8_istream *istream;
2063   };
2064
2065 static struct lex_reader_class lex_file_reader_class;
2066
2067 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2068    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2069    ENCODING, which should take one of the forms accepted by
2070    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2071    mode of the new reader, respectively.
2072
2073    Returns a null pointer if FILE_NAME cannot be opened. */
2074 struct lex_reader *
2075 lex_reader_for_file (const char *file_name, const char *encoding,
2076                      enum segmenter_mode syntax,
2077                      enum lex_error_mode error)
2078 {
2079   struct lex_file_reader *r;
2080   struct u8_istream *istream;
2081
2082   istream = (!strcmp(file_name, "-")
2083              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2084              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2085   if (istream == NULL)
2086     {
2087       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2088       return NULL;
2089     }
2090
2091   r = xmalloc (sizeof *r);
2092   lex_reader_init (&r->reader, &lex_file_reader_class);
2093   r->reader.syntax = syntax;
2094   r->reader.error = error;
2095   r->reader.file_name = xstrdup (file_name);
2096   r->reader.encoding = xstrdup_if_nonnull (encoding);
2097   r->reader.line_number = 1;
2098   r->istream = istream;
2099
2100   return &r->reader;
2101 }
2102
2103 static struct lex_file_reader *
2104 lex_file_reader_cast (struct lex_reader *r)
2105 {
2106   return UP_CAST (r, struct lex_file_reader, reader);
2107 }
2108
2109 static size_t
2110 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2111                enum prompt_style prompt_style UNUSED)
2112 {
2113   struct lex_file_reader *r = lex_file_reader_cast (r_);
2114   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2115   if (n_read < 0)
2116     {
2117       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2118       return 0;
2119     }
2120   return n_read;
2121 }
2122
2123 static void
2124 lex_file_close (struct lex_reader *r_)
2125 {
2126   struct lex_file_reader *r = lex_file_reader_cast (r_);
2127
2128   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2129     {
2130       if (u8_istream_close (r->istream) != 0)
2131         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2132     }
2133   else
2134     u8_istream_free (r->istream);
2135
2136   free (r);
2137 }
2138
2139 static struct lex_reader_class lex_file_reader_class =
2140   {
2141     lex_file_read,
2142     lex_file_close
2143   };
2144 \f
2145 struct lex_string_reader
2146   {
2147     struct lex_reader reader;
2148     struct substring s;
2149     size_t offset;
2150   };
2151
2152 static struct lex_reader_class lex_string_reader_class;
2153
2154 /* Creates and returns a new lex_reader for the contents of S, which must be
2155    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2156    with ss_dealloc() when it is closed. */
2157 struct lex_reader *
2158 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2159 {
2160   struct lex_string_reader *r;
2161
2162   r = xmalloc (sizeof *r);
2163   lex_reader_init (&r->reader, &lex_string_reader_class);
2164   r->reader.syntax = SEG_MODE_AUTO;
2165   r->reader.encoding = xstrdup_if_nonnull (encoding);
2166   r->s = s;
2167   r->offset = 0;
2168
2169   return &r->reader;
2170 }
2171
2172 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2173    which must be encoded in ENCODING.  The caller retains ownership of S. */
2174 struct lex_reader *
2175 lex_reader_for_string (const char *s, const char *encoding)
2176 {
2177   struct substring ss;
2178   ss_alloc_substring (&ss, ss_cstr (s));
2179   return lex_reader_for_substring_nocopy (ss, encoding);
2180 }
2181
/* Expands FORMAT, a printf()-like format string, and creates and returns a
   new lex_reader whose input is the formatted text, which must be encoded in
   ENCODING.

   Note the parameter order: the values that FORMAT consumes are the variable
   arguments that follow ENCODING, not FORMAT. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The new reader takes over the formatted string. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2196
2197 static struct lex_string_reader *
2198 lex_string_reader_cast (struct lex_reader *r)
2199 {
2200   return UP_CAST (r, struct lex_string_reader, reader);
2201 }
2202
2203 static size_t
2204 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2205                  enum prompt_style prompt_style UNUSED)
2206 {
2207   struct lex_string_reader *r = lex_string_reader_cast (r_);
2208   size_t chunk;
2209
2210   chunk = MIN (n, r->s.length - r->offset);
2211   memcpy (buf, r->s.string + r->offset, chunk);
2212   r->offset += chunk;
2213
2214   return chunk;
2215 }
2216
2217 static void
2218 lex_string_close (struct lex_reader *r_)
2219 {
2220   struct lex_string_reader *r = lex_string_reader_cast (r_);
2221
2222   ss_dealloc (&r->s);
2223   free (r);
2224 }
2225
2226 static struct lex_reader_class lex_string_reader_class =
2227   {
2228     lex_string_read,
2229     lex_string_close
2230   };