pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/macro.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
/* A token within a lex_source, together with the bookkeeping needed to
   relate it back to the syntax it came from. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call.

       src->tail <= line_pos <= token_pos <= src->head. */
    size_t token_pos;           /* Start of token. */
    size_t token_len;           /* Length of source for token in bytes. */
    size_t line_pos;            /* Start of line containing token_pos. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, these describe just this
       token within the expansion text.  'macro_rep' and 'ref_cnt' are shared
       by reference counting: lex_token_uninit() frees both when the count
       drops to zero.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  86
  87 static void
  88 lex_token_uninit (struct lex_token *t)
  89 {
  90   token_uninit (&t->token);
  91   if (t->ref_cnt)
  92     {
  93       assert (*t->ref_cnt > 0);
  94       if (!--*t->ref_cnt)
  95         {
  96           free (t->macro_rep);
  97           free (t->ref_cnt);
  98         }
  99     }
 100 }
 101
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;  /* Supplies the raw syntax. */
    struct lexer *lexer;        /* The lexer that this source belongs to. */
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;
    size_t allocated;           /* Number of bytes allocated. */
    size_t tail;                /* &buffer[0] offset into UTF-8 source. */
    size_t head;                /* &buffer[head - tail] offset into source. */

    /* Positions in source file, tail <= pos <= head for each member here. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */
    size_t line_pos;            /* First byte of line containing seg_pos. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline; /* NOTE(review): semantics are set by code
                                   outside this excerpt; confirm there. */

    /* Tokens.

       This is mostly like a deque, with the invariant that 'back <= middle <=
       front' (modulo SIZE_MAX+1).  The tokens available for parsing are
       between 'back' and 'middle': the token at 'back' is the current token,
       the token at 'back + 1' is the next token, and so on.  There are usually
       no tokens between 'middle' and 'front'; if there are, then they need to
       go through macro expansion and are not yet available for parsing.

       'capacity' is the current number of elements in 'tokens'.  It is always
       a power of 2.  'front', 'middle', and 'back' refer to indexes in
       'tokens' modulo 'capacity'. */
    size_t front;
    size_t middle;
    size_t back;
    size_t capacity;
    size_t mask;                /* capacity - 1 */
    struct lex_token *tokens;
  };
 147
/* Forward declarations of the lex_source constructor and destructor, which
   are used before their definitions. */
static struct lex_source *lex_source_create (struct lexer *,
                                             struct lex_reader *);
static void lex_source_destroy (struct lex_source *);
 151
/* Lexer: an ordered list of token sources plus the set of defined macros.
   The source at the head of the list is the one that lex_include() inserts
   in front of; lex_append() adds to the tail. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;   /* Macros added with lex_define_macro(). */
  };
 158
/* Forward declarations of file-local helpers that are used before their
   definitions. */
static struct lex_source *lex_source__ (const struct lexer *);
static char *lex_source_get_syntax__ (const struct lex_source *,
                                      int n0, int n1);
static const struct lex_token *lex_next__ (const struct lexer *, int n);
static void lex_source_push_endcmd__ (struct lex_source *);

static void lex_source_pop_back (struct lex_source *);
static bool lex_source_get (const struct lex_source *);
/* FORMAT is a printf-style format string whose arguments arrive as a
   va_list, hence the 0 for the first-argument index. */
static void lex_source_error_valist (struct lex_source *, int n0, int n1,
                                     const char *format, va_list)
   PRINTF_FORMAT (4, 0);
static const struct lex_token *lex_source_next__ (const struct lex_source *,
                                                  int n);
 172 \f
 173 /* Initializes READER with the specified CLASS and otherwise some reasonable
 174    defaults.  The caller should fill in the others members as desired. */
 175 void
 176 lex_reader_init (struct lex_reader *reader,
 177                  const struct lex_reader_class *class)
 178 {
 179   reader->class = class;
 180   reader->syntax = SEG_MODE_AUTO;
 181   reader->error = LEX_ERROR_CONTINUE;
 182   reader->file_name = NULL;
 183   reader->encoding = NULL;
 184   reader->line_number = 0;
 185   reader->eof = false;
 186 }
 187
 188 /* Frees any file name already in READER and replaces it by a copy of
 189    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 190 void
 191 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 192 {
 193   free (reader->file_name);
 194   reader->file_name = xstrdup_if_nonnull (file_name);
 195 }
 196 \f
 197 /* Creates and returns a new lexer. */
 198 struct lexer *
 199 lex_create (void)
 200 {
 201   struct lexer *lexer = xmalloc (sizeof *lexer);
 202   *lexer = (struct lexer) {
 203     .sources = LL_INITIALIZER (lexer->sources),
 204     .macros = macro_set_create (),
 205   };
 206   return lexer;
 207 }
 208
 209 /* Destroys LEXER. */
 210 void
 211 lex_destroy (struct lexer *lexer)
 212 {
 213   if (lexer != NULL)
 214     {
 215       struct lex_source *source, *next;
 216
 217       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 218         lex_source_destroy (source);
 219       macro_set_destroy (lexer->macros);
 220       free (lexer);
 221     }
 222 }
 223
 224 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 225    same name.  Takes ownership of M. */
 226 void
 227 lex_define_macro (struct lexer *lexer, struct macro *m)
 228 {
 229   macro_set_add (lexer->macros, m);
 230 }
 231
 232 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 233    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 234    token. */
 235 void
 236 lex_include (struct lexer *lexer, struct lex_reader *reader)
 237 {
 238   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 239   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 240 }
 241
 242 /* Appends READER to LEXER, so that it will be read after all other current
 243    readers have already been read. */
 244 void
 245 lex_append (struct lexer *lexer, struct lex_reader *reader)
 246 {
 247   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 248 }
 249 \f
 250 /* Advancing. */
 251
 252 /* Adds a new token at the front of SRC and returns a pointer to it.  The
 253    caller should initialize it.  Does not advance the middle pointer, so the
 254    token isn't immediately available to the parser. */
 255 static struct lex_token *
 256 lex_push_token__ (struct lex_source *src)
 257 {
 258   if (src->front - src->back >= src->capacity)
 259     {
 260       /* Expansion works just like a deque, so we reuse the code. */
 261       struct deque deque = {
 262         .capacity = src->capacity,
 263         .front = src->front,
 264         .back = src->back,
 265       };
 266       src->tokens = deque_expand (&deque, src->tokens, sizeof *src->tokens);
 267       src->capacity = deque.capacity;
 268       src->mask = src->capacity - 1;
 269     }
 270
 271   struct lex_token *token = &src->tokens[src->front++ & src->mask];
 272   token->token = (struct token) { .type = T_STOP };
 273   token->macro_rep = NULL;
 274   token->ref_cnt = NULL;
 275   return token;
 276 }
 277
 278 /* Removes the current token from SRC and uninitializes it. */
 279 static void
 280 lex_source_pop_back (struct lex_source *src)
 281 {
 282   assert (src->middle - src->back > 0);
 283   lex_token_uninit (&src->tokens[src->back++ & src->mask]);
 284 }
 285
 286 /* Removes the token at the greatest lookahead from SRC and uninitializes
 287    it. */
 288 static void
 289 lex_source_pop_front (struct lex_source *src)
 290 {
 291   assert (src->front - src->middle > 0);
 292   lex_token_uninit (&src->tokens[--src->front & src->mask]);
 293 }
 294
 295 /* Advances LEXER to the next token, consuming the current token. */
 296 void
 297 lex_get (struct lexer *lexer)
 298 {
 299   struct lex_source *src;
 300
 301   src = lex_source__ (lexer);
 302   if (src == NULL)
 303     return;
 304
 305   if (src->middle - src->back > 0)
 306     lex_source_pop_back (src);
 307
 308   while (src->back == src->middle)
 309     if (!lex_source_get (src))
 310       {
 311         lex_source_destroy (src);
 312         src = lex_source__ (lexer);
 313         if (src == NULL)
 314           return;
 315       }
 316 }
 317 \f
 318 /* Issuing errors. */
 319
 320 /* Prints a syntax error message containing the current token and
 321    given message MESSAGE (if non-null). */
 322 void
 323 lex_error (struct lexer *lexer, const char *format, ...)
 324 {
 325   va_list args;
 326
 327   va_start (args, format);
 328   lex_next_error_valist (lexer, 0, 0, format, args);
 329   va_end (args);
 330 }
 331
 332 /* Prints a syntax error message containing the current token and
 333    given message MESSAGE (if non-null). */
 334 void
 335 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 336 {
 337   lex_next_error_valist (lexer, 0, 0, format, args);
 338 }
 339
 340 /* Prints a syntax error message containing the current token and
 341    given message MESSAGE (if non-null). */
 342 void
 343 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 344 {
 345   va_list args;
 346
 347   va_start (args, format);
 348   lex_next_error_valist (lexer, n0, n1, format, args);
 349   va_end (args);
 350 }
 351
 352 /* Prints a syntax error message saying that one of the strings provided as
 353    varargs, up to the first NULL, is expected. */
 354 void
 355 (lex_error_expecting) (struct lexer *lexer, ...)
 356 {
 357   va_list args;
 358
 359   va_start (args, lexer);
 360   lex_error_expecting_valist (lexer, args);
 361   va_end (args);
 362 }
 363
 364 /* Prints a syntax error message saying that one of the options provided in
 365    ARGS, up to the first NULL, is expected. */
 366 void
 367 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 368 {
 369   enum { MAX_OPTIONS = 9 };
 370   const char *options[MAX_OPTIONS];
 371   int n = 0;
 372   while (n < MAX_OPTIONS)
 373     {
 374       const char *option = va_arg (args, const char *);
 375       if (!option)
 376         break;
 377
 378       options[n++] = option;
 379     }
 380   lex_error_expecting_array (lexer, options, n);
 381 }
 382
 383 void
 384 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 385 {
 386   switch (n)
 387     {
 388     case 0:
 389       lex_error (lexer, NULL);
 390       break;
 391
 392     case 1:
 393       lex_error (lexer, _("expecting %s"), options[0]);
 394       break;
 395
 396     case 2:
 397       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 398       break;
 399
 400     case 3:
 401       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 402                  options[2]);
 403       break;
 404
 405     case 4:
 406       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 407                  options[0], options[1], options[2], options[3]);
 408       break;
 409
 410     case 5:
 411       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 412                  options[0], options[1], options[2], options[3], options[4]);
 413       break;
 414
 415     case 6:
 416       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 417                  options[0], options[1], options[2], options[3], options[4],
 418                  options[5]);
 419       break;
 420
 421     case 7:
 422       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 423                  options[0], options[1], options[2], options[3], options[4],
 424                  options[5], options[6]);
 425       break;
 426
 427     case 8:
 428       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 429                  options[0], options[1], options[2], options[3], options[4],
 430                  options[5], options[6], options[7]);
 431       break;
 432
 433     default:
 434       lex_error (lexer, NULL);
 435     }
 436 }
 437
 438 /* Reports an error to the effect that subcommand SBC may only be specified
 439    once.
 440
 441    This function does not take a lexer as an argument or use lex_error(),
 442    because the result would ordinarily just be redundant: "Syntax error at
 443    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 444    not help the user find the error. */
 445 void
 446 lex_sbc_only_once (const char *sbc)
 447 {
 448   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 449 }
 450
 451 /* Reports an error to the effect that subcommand SBC is missing.
 452
 453    This function does not take a lexer as an argument or use lex_error(),
 454    because a missing subcommand can normally be detected only after the whole
 455    command has been parsed, and so lex_error() would always report "Syntax
 456    error at end of command", which does not help the user find the error. */
 457 void
 458 lex_sbc_missing (const char *sbc)
 459 {
 460   msg (SE, _("Required subcommand %s was not specified."), sbc);
 461 }
 462
 463 /* Reports an error to the effect that specification SPEC may only be specified
 464    once within subcommand SBC. */
 465 void
 466 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 467 {
 468   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 469              spec, sbc);
 470 }
 471
 472 /* Reports an error to the effect that specification SPEC is missing within
 473    subcommand SBC. */
 474 void
 475 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 476 {
 477   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 478              sbc, spec);
 479 }
 480
 481 /* Prints a syntax error message containing the current token and
 482    given message MESSAGE (if non-null). */
 483 void
 484 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 485                        const char *format, va_list args)
 486 {
 487   struct lex_source *src = lex_source__ (lexer);
 488
 489   if (src != NULL)
 490     lex_source_error_valist (src, n0, n1, format, args);
 491   else
 492     {
 493       struct string s;
 494
 495       ds_init_empty (&s);
 496       ds_put_format (&s, _("Syntax error at end of input"));
 497       if (format != NULL)
 498         {
 499           ds_put_cstr (&s, ": ");
 500           ds_put_vformat (&s, format, args);
 501         }
 502       ds_put_byte (&s, '.');
 503       msg (SE, "%s", ds_cstr (&s));
 504       ds_destroy (&s);
 505     }
 506 }
 507
 508 /* Checks that we're at end of command.
 509    If so, returns a successful command completion code.
 510    If not, flags a syntax error and returns an error command
 511    completion code. */
 512 int
 513 lex_end_of_command (struct lexer *lexer)
 514 {
 515   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 516     {
 517       lex_error (lexer, _("expecting end of command"));
 518       return CMD_FAILURE;
 519     }
 520   else
 521     return CMD_SUCCESS;
 522 }
 523 \f
 524 /* Token testing functions. */
 525
 526 /* Returns true if the current token is a number. */
 527 bool
 528 lex_is_number (const struct lexer *lexer)
 529 {
 530   return lex_next_is_number (lexer, 0);
 531 }
 532
 533 /* Returns true if the current token is a string. */
 534 bool
 535 lex_is_string (const struct lexer *lexer)
 536 {
 537   return lex_next_is_string (lexer, 0);
 538 }
 539
 540 /* Returns the value of the current token, which must be a
 541    floating point number. */
 542 double
 543 lex_number (const struct lexer *lexer)
 544 {
 545   return lex_next_number (lexer, 0);
 546 }
 547
 548 /* Returns true iff the current token is an integer. */
 549 bool
 550 lex_is_integer (const struct lexer *lexer)
 551 {
 552   return lex_next_is_integer (lexer, 0);
 553 }
 554
 555 /* Returns the value of the current token, which must be an
 556    integer. */
 557 long
 558 lex_integer (const struct lexer *lexer)
 559 {
 560   return lex_next_integer (lexer, 0);
 561 }
 562 \f
 563 /* Token testing functions with lookahead.
 564
 565    A value of 0 for N as an argument to any of these functions refers to the
 566    current token.  Lookahead is limited to the current command.  Any N greater
 567    than the number of tokens remaining in the current command will be treated
 568    as referring to a T_ENDCMD token. */
 569
 570 /* Returns true if the token N ahead of the current token is a number. */
 571 bool
 572 lex_next_is_number (const struct lexer *lexer, int n)
 573 {
 574   return token_is_number (lex_next (lexer, n));
 575 }
 576
 577 /* Returns true if the token N ahead of the current token is a string. */
 578 bool
 579 lex_next_is_string (const struct lexer *lexer, int n)
 580 {
 581   return token_is_string (lex_next (lexer, n));
 582 }
 583
 584 /* Returns the value of the token N ahead of the current token, which must be a
 585    floating point number. */
 586 double
 587 lex_next_number (const struct lexer *lexer, int n)
 588 {
 589   return token_number (lex_next (lexer, n));
 590 }
 591
 592 /* Returns true if the token N ahead of the current token is an integer. */
 593 bool
 594 lex_next_is_integer (const struct lexer *lexer, int n)
 595 {
 596   return token_is_integer (lex_next (lexer, n));
 597 }
 598
 599 /* Returns the value of the token N ahead of the current token, which must be
 600    an integer. */
 601 long
 602 lex_next_integer (const struct lexer *lexer, int n)
 603 {
 604   return token_integer (lex_next (lexer, n));
 605 }
 606 \f
 607 /* Token matching functions. */
 608
 609 /* If the current token has the specified TYPE, skips it and returns true.
 610    Otherwise, returns false. */
 611 bool
 612 lex_match (struct lexer *lexer, enum token_type type)
 613 {
 614   if (lex_token (lexer) == type)
 615     {
 616       lex_get (lexer);
 617       return true;
 618     }
 619   else
 620     return false;
 621 }
 622
 623 /* If the current token matches IDENTIFIER, skips it and returns true.
 624    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 625    returns false.
 626
 627    IDENTIFIER must be an ASCII string. */
 628 bool
 629 lex_match_id (struct lexer *lexer, const char *identifier)
 630 {
 631   return lex_match_id_n (lexer, identifier, 3);
 632 }
 633
 634 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 635    may be abbreviated to its first N letters.  Otherwise, returns false.
 636
 637    IDENTIFIER must be an ASCII string. */
 638 bool
 639 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 640 {
 641   if (lex_token (lexer) == T_ID
 642       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 643     {
 644       lex_get (lexer);
 645       return true;
 646     }
 647   else
 648     return false;
 649 }
 650
 651 /* If the current token is integer X, skips it and returns true.  Otherwise,
 652    returns false. */
 653 bool
 654 lex_match_int (struct lexer *lexer, int x)
 655 {
 656   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 657     {
 658       lex_get (lexer);
 659       return true;
 660     }
 661   else
 662     return false;
 663 }
 664 \f
 665 /* Forced matches. */
 666
 667 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 668    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 669    false.
 670
 671    IDENTIFIER must be an ASCII string. */
 672 bool
 673 lex_force_match_id (struct lexer *lexer, const char *identifier)
 674 {
 675   if (lex_match_id (lexer, identifier))
 676     return true;
 677   else
 678     {
 679       lex_error_expecting (lexer, identifier);
 680       return false;
 681     }
 682 }
 683
 684 /* If the current token has the specified TYPE, skips it and returns true.
 685    Otherwise, reports an error and returns false. */
 686 bool
 687 lex_force_match (struct lexer *lexer, enum token_type type)
 688 {
 689   if (lex_token (lexer) == type)
 690     {
 691       lex_get (lexer);
 692       return true;
 693     }
 694   else
 695     {
 696       const char *type_string = token_type_to_string (type);
 697       if (type_string)
 698         {
 699           char *s = xasprintf ("`%s'", type_string);
 700           lex_error_expecting (lexer, s);
 701           free (s);
 702         }
 703       else
 704         lex_error_expecting (lexer, token_type_to_name (type));
 705
 706       return false;
 707     }
 708 }
 709
 710 /* If the current token is a string, does nothing and returns true.
 711    Otherwise, reports an error and returns false. */
 712 bool
 713 lex_force_string (struct lexer *lexer)
 714 {
 715   if (lex_is_string (lexer))
 716     return true;
 717   else
 718     {
 719       lex_error (lexer, _("expecting string"));
 720       return false;
 721     }
 722 }
 723
 724 /* If the current token is a string or an identifier, does nothing and returns
 725    true.  Otherwise, reports an error and returns false.
 726
 727    This is meant for use in syntactic situations where we want to encourage the
 728    user to supply a quoted string, but for compatibility we also accept
 729    identifiers.  (One example of such a situation is file names.)  Therefore,
 730    the error message issued when the current token is wrong only says that a
 731    string is expected and doesn't mention that an identifier would also be
 732    accepted. */
 733 bool
 734 lex_force_string_or_id (struct lexer *lexer)
 735 {
 736   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 737 }
 738
 739 /* If the current token is an integer, does nothing and returns true.
 740    Otherwise, reports an error and returns false. */
 741 bool
 742 lex_force_int (struct lexer *lexer)
 743 {
 744   if (lex_is_integer (lexer))
 745     return true;
 746   else
 747     {
 748       lex_error (lexer, _("expecting integer"));
 749       return false;
 750     }
 751 }
 752
 753 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 754    nothing and returns true.  Otherwise, reports an error and returns false.
 755    If NAME is nonnull, then it is used in the error message. */
 756 bool
 757 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 758 {
 759   bool is_integer = lex_is_integer (lexer);
 760   bool too_small = is_integer && lex_integer (lexer) < min;
 761   bool too_big = is_integer && lex_integer (lexer) > max;
 762   if (is_integer && !too_small && !too_big)
 763     return true;
 764
 765   if (min > max)
 766     {
 767       /* Weird, maybe a bug in the caller.  Just report that we needed an
 768          integer. */
 769       if (name)
 770         lex_error (lexer, _("Integer expected for %s."), name);
 771       else
 772         lex_error (lexer, _("Integer expected."));
 773     }
 774   else if (min == max)
 775     {
 776       if (name)
 777         lex_error (lexer, _("Expected %ld for %s."), min, name);
 778       else
 779         lex_error (lexer, _("Expected %ld."), min);
 780     }
 781   else if (min + 1 == max)
 782     {
 783       if (name)
 784         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 785       else
 786         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 787     }
 788   else
 789     {
 790       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 791       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 792
 793       if (report_lower_bound && report_upper_bound)
 794         {
 795           if (name)
 796             lex_error (lexer,
 797                        _("Expected integer between %ld and %ld for %s."),
 798                        min, max, name);
 799           else
 800             lex_error (lexer, _("Expected integer between %ld and %ld."),
 801                        min, max);
 802         }
 803       else if (report_lower_bound)
 804         {
 805           if (min == 0)
 806             {
 807               if (name)
 808                 lex_error (lexer, _("Expected non-negative integer for %s."),
 809                            name);
 810               else
 811                 lex_error (lexer, _("Expected non-negative integer."));
 812             }
 813           else if (min == 1)
 814             {
 815               if (name)
 816                 lex_error (lexer, _("Expected positive integer for %s."),
 817                            name);
 818               else
 819                 lex_error (lexer, _("Expected positive integer."));
 820             }
 821         }
 822       else if (report_upper_bound)
 823         {
 824           if (name)
 825             lex_error (lexer,
 826                        _("Expected integer less than or equal to %ld for %s."),
 827                        max, name);
 828           else
 829             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 830                        max);
 831         }
 832       else
 833         {
 834           if (name)
 835             lex_error (lexer, _("Integer expected for %s."), name);
 836           else
 837             lex_error (lexer, _("Integer expected."));
 838         }
 839     }
 840   return false;
 841 }
 842
 843 /* If the current token is a number, does nothing and returns true.
 844    Otherwise, reports an error and returns false. */
 845 bool
 846 lex_force_num (struct lexer *lexer)
 847 {
 848   if (lex_is_number (lexer))
 849     return true;
 850
 851   lex_error (lexer, _("expecting number"));
 852   return false;
 853 }
 854
 855 /* If the current token is an identifier, does nothing and returns true.
 856    Otherwise, reports an error and returns false. */
 857 bool
 858 lex_force_id (struct lexer *lexer)
 859 {
 860   if (lex_token (lexer) == T_ID)
 861     return true;
 862
 863   lex_error (lexer, _("expecting identifier"));
 864   return false;
 865 }
 866 \f
 867 /* Token accessors. */
 868
 869 /* Returns the type of LEXER's current token. */
 870 enum token_type
 871 lex_token (const struct lexer *lexer)
 872 {
 873   return lex_next_token (lexer, 0);
 874 }
 875
 876 /* Returns the number in LEXER's current token.
 877
 878    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 879    tokens this function will always return zero. */
 880 double
 881 lex_tokval (const struct lexer *lexer)
 882 {
 883   return lex_next_tokval (lexer, 0);
 884 }
 885
 886 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 887
 888    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 889    this functions this function will always return NULL.
 890
 891    The UTF-8 encoding of the returned string is correct for variable names and
 892    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 893    data_in() to use it in a "union value".  */
 894 const char *
 895 lex_tokcstr (const struct lexer *lexer)
 896 {
 897   return lex_next_tokcstr (lexer, 0);
 898 }
 899
 900 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 901    null-terminated (but the null terminator is not included in the returned
 902    substring's 'length').
 903
 904    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 905    this functions this function will always return NULL.
 906
 907    The UTF-8 encoding of the returned string is correct for variable names and
 908    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 909    data_in() to use it in a "union value".  */
 910 struct substring
 911 lex_tokss (const struct lexer *lexer)
 912 {
 913   return lex_next_tokss (lexer, 0);
 914 }
 915 \f
 916 /* Looking ahead.
 917
 918    A value of 0 for N as an argument to any of these functions refers to the
 919    current token.  Lookahead is limited to the current command.  Any N greater
 920    than the number of tokens remaining in the current command will be treated
 921    as referring to a T_ENDCMD token. */
 922
 923 static const struct lex_token *
 924 lex_next__ (const struct lexer *lexer_, int n)
 925 {
 926   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 927   struct lex_source *src = lex_source__ (lexer);
 928
 929   if (src != NULL)
 930     return lex_source_next__ (src, n);
 931   else
 932     {
 933       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
 934       return &stop_token;
 935     }
 936 }
 937
 938 /* Returns the token in SRC with the greatest lookahead. */
 939 static const struct lex_token *
 940 lex_source_middle (const struct lex_source *src)
 941 {
 942   assert (src->middle - src->back > 0);
 943   return &src->tokens[(src->middle - 1) & src->mask];
 944 }
 945
 946 static const struct lex_token *
 947 lex_source_next__ (const struct lex_source *src, int n)
 948 {
 949   while (src->middle - src->back <= n)
 950     {
 951       if (src->middle - src->back > 0)
 952         {
 953           const struct lex_token *middle = lex_source_middle (src);
 954           if (middle->token.type == T_STOP || middle->token.type == T_ENDCMD)
 955             return middle;
 956         }
 957
 958       lex_source_get (src);
 959     }
 960
 961   return &src->tokens[(src->back + n) & src->mask];
 962 }
 963
 964 /* Returns the "struct token" of the token N after the current one in LEXER.
 965    The returned pointer can be invalidated by pretty much any succeeding call
 966    into the lexer, although the string pointer within the returned token is
 967    only invalidated by consuming the token (e.g. with lex_get()). */
 968 const struct token *
 969 lex_next (const struct lexer *lexer, int n)
 970 {
 971   return &lex_next__ (lexer, n)->token;
 972 }
 973
 974 /* Returns the type of the token N after the current one in LEXER. */
 975 enum token_type
 976 lex_next_token (const struct lexer *lexer, int n)
 977 {
 978   return lex_next (lexer, n)->type;
 979 }
 980
 981 /* Returns the number in the tokn N after the current one in LEXER.
 982
 983    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 984    tokens this function will always return zero. */
 985 double
 986 lex_next_tokval (const struct lexer *lexer, int n)
 987 {
 988   return token_number (lex_next (lexer, n));
 989 }
 990
 991 /* Returns the null-terminated string in the token N after the current one, in
 992    UTF-8 encoding.
 993
 994    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 995    this functions this function will always return NULL.
 996
 997    The UTF-8 encoding of the returned string is correct for variable names and
 998    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 999    data_in() to use it in a "union value".  */
1000 const char *
1001 lex_next_tokcstr (const struct lexer *lexer, int n)
1002 {
1003   return lex_next_tokss (lexer, n).string;
1004 }
1005
1006 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1007    The string is null-terminated (but the null terminator is not included in
1008    the returned substring's 'length').
1009
1010    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1011    tokens this functions this function will always return NULL.
1012
1013    The UTF-8 encoding of the returned string is correct for variable names and
1014    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1015    data_in() to use it in a "union value".  */
1016 struct substring
1017 lex_next_tokss (const struct lexer *lexer, int n)
1018 {
1019   return lex_next (lexer, n)->string;
1020 }
1021
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, there is no syntax to report, so this
   returns a newly allocated empty string. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source, so it must not be
     handed a null pointer. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1034
1035 /* Returns true if the token N ahead of the current one was produced by macro
1036    expansion, false otherwise. */
1037 bool
1038 lex_next_is_from_macro (const struct lexer *lexer, int n)
1039 {
1040   return lex_next__ (lexer, n)->macro_rep != NULL;
1041 }
1042
1043 static bool
1044 lex_tokens_match (const struct token *actual, const struct token *expected)
1045 {
1046   if (actual->type != expected->type)
1047     return false;
1048
1049   switch (actual->type)
1050     {
1051     case T_POS_NUM:
1052     case T_NEG_NUM:
1053       return actual->number == expected->number;
1054
1055     case T_ID:
1056       return lex_id_match (expected->string, actual->string);
1057
1058     case T_STRING:
1059       return (actual->string.length == expected->string.length
1060               && !memcmp (actual->string.string, expected->string.string,
1061                           actual->string.length));
1062
1063     default:
1064       return true;
1065     }
1066 }
1067
1068 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1069    skips it and returns true.  Otherwise, returns false.
1070
1071    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1072    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
1073    first three letters. */
1074 bool
1075 lex_match_phrase (struct lexer *lexer, const char *s)
1076 {
1077   struct string_lexer slex;
1078   struct token token;
1079   int i;
1080
1081   i = 0;
1082   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1083   while (string_lexer_next (&slex, &token))
1084     if (token.type != SCAN_SKIP)
1085       {
1086         bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1087         token_uninit (&token);
1088         if (!match)
1089           return false;
1090       }
1091
1092   while (i-- > 0)
1093     lex_get (lexer);
1094   return true;
1095 }
1096
1097 static int
1098 lex_source_get_first_line_number (const struct lex_source *src, int n)
1099 {
1100   return lex_source_next__ (src, n)->first_line;
1101 }
1102
/* Returns the number of new-line characters ('\n') among the LENGTH bytes
   starting at S.  The bytes are only read, never modified. */
static int
count_newlines (const char *s, size_t length)
{
  const char *end = s + length;
  int n_newlines = 0;

  /* Each iteration finds the next new-line at or after P, then resumes the
     search just past it. */
  for (const char *p = s; (p = memchr (p, '\n', end - p)) != NULL; p++)
    n_newlines++;

  return n_newlines;
}
1118
1119 static int
1120 lex_source_get_last_line_number (const struct lex_source *src, int n)
1121 {
1122   const struct lex_token *token = lex_source_next__ (src, n);
1123
1124   if (token->first_line == 0)
1125     return 0;
1126   else
1127     {
1128       char *token_str = &src->buffer[token->token_pos - src->tail];
1129       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1130     }
1131 }
1132
1133 static int
1134 count_columns (const char *s_, size_t length)
1135 {
1136   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1137   int columns;
1138   size_t ofs;
1139   int mblen;
1140
1141   columns = 0;
1142   for (ofs = 0; ofs < length; ofs += mblen)
1143     {
1144       ucs4_t uc;
1145
1146       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1147       if (uc != '\t')
1148         {
1149           int width = uc_width (uc, "UTF-8");
1150           if (width > 0)
1151             columns += width;
1152         }
1153       else
1154         columns = ROUND_UP (columns + 1, 8);
1155     }
1156
1157   return columns + 1;
1158 }
1159
1160 static int
1161 lex_source_get_first_column (const struct lex_source *src, int n)
1162 {
1163   const struct lex_token *token = lex_source_next__ (src, n);
1164   return count_columns (&src->buffer[token->line_pos - src->tail],
1165                         token->token_pos - token->line_pos);
1166 }
1167
1168 static int
1169 lex_source_get_last_column (const struct lex_source *src, int n)
1170 {
1171   const struct lex_token *token = lex_source_next__ (src, n);
1172   char *start, *end, *newline;
1173
1174   start = &src->buffer[token->line_pos - src->tail];
1175   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1176   newline = memrchr (start, '\n', end - start);
1177   if (newline != NULL)
1178     start = newline + 1;
1179   return count_columns (start, end - start);
1180 }
1181
/* Returns the 1-based line number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   current source (so that the token is T_STOP) or if the token is drawn from
   a source that does not have line numbers. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_first_line_number (src, n);
}
1191
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   current source (so that the token is T_STOP) or if the token is drawn from
   a source that does not have line numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_last_line_number (src, n);
}
1208
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   current source (so that the token is T_STOP).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_first_column (src, n);
}
1222
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no current source (so that the token is T_STOP).

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  return lex_source_get_last_column (src, n);
}
1236
1237 /* Returns the name of the syntax file from which the current command is drawn.
1238    Returns NULL for a T_STOP token or if the command's source does not have
1239    line numbers.
1240
1241    There is no version of this function that takes an N argument because
1242    lookahead only works to the end of a command and any given command is always
1243    within a single syntax file. */
1244 const char *
1245 lex_get_file_name (const struct lexer *lexer)
1246 {
1247   struct lex_source *src = lex_source__ (lexer);
1248   return src == NULL ? NULL : src->reader->file_name;
1249 }
1250
1251 /* Returns a newly allocated msg_location for the syntax that represents tokens
1252    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1253    must eventually free the location (with msg_location_destroy()). */
1254 struct msg_location *
1255 lex_get_location (const struct lexer *lexer, int n0, int n1)
1256 {
1257   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1258   loc->first_column = lex_get_first_column (lexer, n0);
1259   loc->last_column = lex_get_last_column (lexer, n1);
1260   return loc;
1261 }
1262
1263 /* Returns a newly allocated msg_location for the syntax that represents tokens
1264    with 0-based offsets N0...N1, inclusive, from the current token.  The
1265    location only covers the tokens' lines, not the columns.  The caller must
1266    eventually free the location (with msg_location_destroy()). */
1267 struct msg_location *
1268 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1269 {
1270   struct msg_location *loc = xmalloc (sizeof *loc);
1271   *loc = (struct msg_location) {
1272     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1273     .first_line = lex_get_first_line_number (lexer, n0),
1274     .last_line = lex_get_last_line_number (lexer, n1),
1275   };
1276   return loc;
1277 }
1278
1279 const char *
1280 lex_get_encoding (const struct lexer *lexer)
1281 {
1282   struct lex_source *src = lex_source__ (lexer);
1283   return src == NULL ? NULL : src->reader->encoding;
1284 }
1285
1286 /* Returns the syntax mode for the syntax file from which the current drawn is
1287    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1288    does not have line numbers.
1289
1290    There is no version of this function that takes an N argument because
1291    lookahead only works to the end of a command and any given command is always
1292    within a single syntax file. */
1293 enum segmenter_mode
1294 lex_get_syntax_mode (const struct lexer *lexer)
1295 {
1296   struct lex_source *src = lex_source__ (lexer);
1297   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1298 }
1299
1300 /* Returns the error mode for the syntax file from which the current drawn is
1301    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1302    source does not have line numbers.
1303
1304    There is no version of this function that takes an N argument because
1305    lookahead only works to the end of a command and any given command is always
1306    within a single syntax file. */
1307 enum lex_error_mode
1308 lex_get_error_mode (const struct lexer *lexer)
1309 {
1310   struct lex_source *src = lex_source__ (lexer);
1311   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1312 }
1313
1314 /* If the source that LEXER is currently reading has error mode
1315    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1316    token to be read comes directly from whatever is next read from the stream.
1317
1318    It makes sense to call this function after encountering an error in a
1319    command entered on the console, because usually the user would prefer not to
1320    have cascading errors. */
1321 void
1322 lex_interactive_reset (struct lexer *lexer)
1323 {
1324   struct lex_source *src = lex_source__ (lexer);
1325   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1326     {
1327       src->head = src->tail = 0;
1328       src->journal_pos = src->seg_pos = src->line_pos = 0;
1329       src->n_newlines = 0;
1330       src->suppress_next_newline = false;
1331       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1332                                        false);
1333       while (src->middle - src->back > 0)
1334         lex_source_pop_back (src);
1335       while (src->front - src->middle > 0)
1336         lex_source_pop_front (src);
1337       lex_source_push_endcmd__ (src);
1338     }
1339 }
1340
1341 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1342 void
1343 lex_discard_rest_of_command (struct lexer *lexer)
1344 {
1345   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1346     lex_get (lexer);
1347 }
1348
1349 /* Discards all lookahead tokens in LEXER, then discards all input sources
1350    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1351    runs out of input sources. */
1352 void
1353 lex_discard_noninteractive (struct lexer *lexer)
1354 {
1355   struct lex_source *src = lex_source__ (lexer);
1356
1357   if (src != NULL)
1358     {
1359       while (src->middle - src->back > 0)
1360         lex_source_pop_back (src);
1361
1362       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1363            src = lex_source__ (lexer))
1364         lex_source_destroy (src);
1365     }
1366 }
1367 \f
1368 static size_t
1369 lex_source_max_tail__ (const struct lex_source *src)
1370 {
1371   const struct lex_token *token;
1372   size_t max_tail;
1373
1374   assert (src->seg_pos >= src->line_pos);
1375   max_tail = MIN (src->journal_pos, src->line_pos);
1376
1377   /* Use the oldest token also.  (We know that src->deque cannot be empty
1378      because we are in the process of adding a new token, which is already
1379      initialized enough to use here.) */
1380   token = &src->tokens[src->back & src->mask];
1381   assert (token->token_pos >= token->line_pos);
1382   max_tail = MIN (max_tail, token->line_pos);
1383
1384   return max_tail;
1385 }
1386
1387 static void
1388 lex_source_expand__ (struct lex_source *src)
1389 {
1390   if (src->head - src->tail >= src->allocated)
1391     {
1392       size_t max_tail = lex_source_max_tail__ (src);
1393       if (max_tail > src->tail)
1394         {
1395           /* Advance the tail, freeing up room at the head. */
1396           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1397                    src->head - max_tail);
1398           src->tail = max_tail;
1399         }
1400       else
1401         {
1402           /* Buffer is completely full.  Expand it. */
1403           src->buffer = x2realloc (src->buffer, &src->allocated);
1404         }
1405     }
1406   else
1407     {
1408       /* There's space available at the head of the buffer.  Nothing to do. */
1409     }
1410 }
1411
1412 static void
1413 lex_source_read__ (struct lex_source *src)
1414 {
1415   do
1416     {
1417       lex_source_expand__ (src);
1418
1419       size_t head_ofs = src->head - src->tail;
1420       size_t space = src->allocated - head_ofs;
1421       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1422       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1423                                            space, prompt);
1424       assert (n <= space);
1425
1426       if (n == 0)
1427         {
1428           /* End of input. */
1429           src->reader->eof = true;
1430           lex_source_expand__ (src);
1431           return;
1432         }
1433
1434       src->head += n;
1435     }
1436   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1437                   src->head - src->seg_pos));
1438 }
1439
1440 static struct lex_source *
1441 lex_source__ (const struct lexer *lexer)
1442 {
1443   return (ll_is_empty (&lexer->sources) ? NULL
1444           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1445 }
1446
1447 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1448    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1449    and N1 are both zero, this requests the syntax for the current token.)  The
1450    caller must eventually free the returned string (with free()).  The syntax
1451    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1452    for example, it may include comments, spaces, and new-lines if it spans
1453    multiple tokens.  Macro expansion, however, has already been performed. */
1454 static char *
1455 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1456 {
1457   struct string s = DS_EMPTY_INITIALIZER;
1458   for (size_t i = n0; i <= n1; )
1459     {
1460       /* Find [I,J) as the longest sequence of tokens not produced by macro
1461          expansion, or otherwise the longest sequence expanded from a single
1462          macro call. */
1463       const struct lex_token *first = lex_source_next__ (src, i);
1464       size_t j;
1465       for (j = i + 1; j <= n1; j++)
1466         {
1467           const struct lex_token *cur = lex_source_next__ (src, j);
1468           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1469               || first->macro_rep != cur->macro_rep)
1470             break;
1471         }
1472       const struct lex_token *last = lex_source_next__ (src, j - 1);
1473
1474       /* Now add the syntax for this sequence of tokens to SRC. */
1475       if (!ds_is_empty (&s))
1476         ds_put_byte (&s, ' ');
1477       if (!first->macro_rep)
1478         {
1479           size_t start = first->token_pos;
1480           size_t end = last->token_pos + last->token_len;
1481           ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1482                                            end - start));
1483         }
1484       else
1485         {
1486           size_t start = first->ofs;
1487           size_t end = last->ofs + last->len;
1488           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1489                                            end - start));
1490         }
1491
1492       i = j;
1493     }
1494   return ds_steal_cstr (&s);
1495 }
1496
1497 static bool
1498 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1499 {
1500   for (size_t i = n0; i <= n1; i++)
1501     if (lex_source_next__ (src, i)->macro_rep)
1502       return true;
1503   return false;
1504 }
1505
1506 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1507    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1508    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1509    the original form supplied to the lexer so that, for example, it may include
1510    comments, spaces, and new-lines if it spans multiple tokens.
1511
1512    Returns an empty string if the token range doesn't include a macro call.
1513
1514    The caller must not modify or free the returned string. */
1515 static struct substring
1516 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1517 {
1518   if (!lex_source_contains_macro_call (src, n0, n1))
1519     return ss_empty ();
1520
1521   const struct lex_token *token0 = lex_source_next__ (src, n0);
1522   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1523   size_t start = token0->token_pos;
1524   size_t end = token1->token_pos + token1->token_len;
1525
1526   return ss_buffer (&src->buffer[start - src->tail], end - start);
1527 }
1528
1529 static void
1530 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1531                          const char *format, va_list args)
1532 {
1533   const struct lex_token *token;
1534   struct string s;
1535
1536   ds_init_empty (&s);
1537
1538   token = lex_source_next__ (src, n0);
1539   if (token->token.type == T_ENDCMD)
1540     ds_put_cstr (&s, _("Syntax error at end of command"));
1541   else
1542     {
1543       /* Get the syntax that caused the error. */
1544       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1545       char syntax[64];
1546       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1547       free (raw_syntax);
1548
1549       /* Get the macro call(s) that expanded to the syntax that caused the
1550          error. */
1551       char call[64];
1552       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1553                      call, sizeof call);
1554
1555       if (syntax[0])
1556         {
1557           if (call[0])
1558             ds_put_format (&s,
1559                            _("Syntax error at `%s' (in expansion of `%s')"),
1560                            syntax, call);
1561           else
1562             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1563         }
1564       else
1565         {
1566           if (call[0])
1567             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1568                            call);
1569           else
1570             ds_put_cstr (&s, _("Syntax error"));
1571         }
1572     }
1573
1574   if (format)
1575     {
1576       ds_put_cstr (&s, ": ");
1577       ds_put_vformat (&s, format, args);
1578     }
1579   if (ds_last (&s) != '.')
1580     ds_put_byte (&s, '.');
1581
1582   struct msg_location *location = xmalloc (sizeof *location);
1583   *location = (struct msg_location) {
1584     .file_name = xstrdup_if_nonnull (src->reader->file_name),
1585     .first_line = lex_source_get_first_line_number (src, n0),
1586     .last_line = lex_source_get_last_line_number (src, n1),
1587     .first_column = lex_source_get_first_column (src, n0),
1588     .last_column = lex_source_get_last_column (src, n1),
1589   };
1590   struct msg *m = xmalloc (sizeof *m);
1591   *m = (struct msg) {
1592     .category = MSG_C_SYNTAX,
1593     .severity = MSG_S_ERROR,
1594     .location = location,
1595     .text = ds_steal_cstr (&s),
1596   };
1597   msg_emit (m);
1598 }
1599
1600 static void PRINTF_FORMAT (4, 5)
1601 lex_source_error (struct lex_source *src, int n0, int n1,
1602                   const char *format, ...)
1603 {
1604   va_list args;
1605   va_start (args, format);
1606   lex_source_error_valist (src, n0, n1, format, args);
1607   va_end (args);
1608 }
1609
1610 static void
1611 lex_get_error (struct lex_source *src, const char *s)
1612 {
1613   size_t old_middle = src->middle;
1614   src->middle = src->front;
1615   size_t n = src->front - src->back - 1;
1616   lex_source_error (src, n, n, "%s", s);
1617   src->middle = old_middle;
1618
1619   lex_source_pop_front (src);
1620 }
1621
1622 /* Attempts to append an additional token at the front of SRC, reading more
1623    from the underlying lex_reader if necessary.  Returns true if a new token
1624    was added to SRC's deque, false otherwise.  The caller should retry failures
1625    unless SRC's 'eof' marker was set to true indicating that there will be no
1626    more tokens from this source.
1627
1628    Does not make the new token available for lookahead yet; the caller must
1629    adjust SRC's 'middle' pointer to do so. */
1630 static bool
1631 lex_source_try_get__ (struct lex_source *src)
1632 {
1633   /* State maintained while scanning tokens.  Usually we only need a single
1634      state, but scanner_push() can return SCAN_SAVE to indicate that the state
1635      needs to be saved and possibly restored later with SCAN_BACK. */
1636   struct state
1637     {
1638       struct segmenter segmenter;
1639       enum segment_type last_segment;
1640       int newlines;             /* Number of newlines encountered so far. */
1641       /* Maintained here so we can update lex_source's similar members when we
1642          finish. */
1643       size_t line_pos;
1644       size_t seg_pos;
1645     };
1646
1647   /* Initialize state. */
1648   struct state state =
1649     {
1650       .segmenter = src->segmenter,
1651       .newlines = 0,
1652       .seg_pos = src->seg_pos,
1653       .line_pos = src->line_pos,
1654     };
1655   struct state saved = state;
1656
1657   /* Append a new token to SRC and initialize it. */
1658   struct lex_token *token = lex_push_token__ (src);
1659   struct scanner scanner;
1660   scanner_init (&scanner, &token->token);
1661   token->line_pos = src->line_pos;
1662   token->token_pos = src->seg_pos;
1663   if (src->reader->line_number > 0)
1664     token->first_line = src->reader->line_number + src->n_newlines;
1665   else
1666     token->first_line = 0;
1667
1668   /* Extract segments and pass them through the scanner until we obtain a
1669      token. */
1670   for (;;)
1671     {
1672       /* Extract a segment. */
1673       const char *segment = &src->buffer[state.seg_pos - src->tail];
1674       size_t seg_maxlen = src->head - state.seg_pos;
1675       enum segment_type type;
1676       int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1677                                     src->reader->eof, &type);
1678       if (seg_len < 0)
1679         {
1680           /* The segmenter needs more input to produce a segment. */
1681           assert (!src->reader->eof);
1682           lex_source_read__ (src);
1683           continue;
1684         }
1685
1686       /* Update state based on the segment. */
1687       state.last_segment = type;
1688       state.seg_pos += seg_len;
1689       if (type == SEG_NEWLINE)
1690         {
1691           state.newlines++;
1692           state.line_pos = state.seg_pos;
1693         }
1694
1695       /* Pass the segment into the scanner and try to get a token out. */
1696       enum scan_result result = scanner_push (&scanner, type,
1697                                               ss_buffer (segment, seg_len),
1698                                               &token->token);
1699       if (result == SCAN_SAVE)
1700         saved = state;
1701       else if (result == SCAN_BACK)
1702         {
1703           state = saved;
1704           break;
1705         }
1706       else if (result == SCAN_DONE)
1707         break;
1708     }
1709
1710   /* If we've reached the end of a line, or the end of a command, then pass
1711      the line to the output engine as a syntax text item.  */
1712   int n_lines = state.newlines;
1713   if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1714     {
1715       n_lines++;
1716       src->suppress_next_newline = true;
1717     }
1718   else if (n_lines > 0 && src->suppress_next_newline)
1719     {
1720       n_lines--;
1721       src->suppress_next_newline = false;
1722     }
1723   for (int i = 0; i < n_lines; i++)
1724     {
1725       /* Beginning of line. */
1726       const char *line = &src->buffer[src->journal_pos - src->tail];
1727
1728       /* Calculate line length, including \n or \r\n end-of-line if present.
1729
1730          We use src->head even though that may be beyond what we've actually
1731          converted to tokens (which is only through state.line_pos).  That's
1732          because, if we're emitting the line due to SEG_END_COMMAND, we want to
1733          take the whole line through the newline, not just through the '.'. */
1734       size_t max_len = src->head - src->journal_pos;
1735       const char *newline = memchr (line, '\n', max_len);
1736       size_t line_len = newline ? newline - line + 1 : max_len;
1737
1738       /* Calculate line length excluding end-of-line. */
1739       size_t copy_len = line_len;
1740       if (copy_len > 0 && line[copy_len - 1] == '\n')
1741         copy_len--;
1742       if (copy_len > 0 && line[copy_len - 1] == '\r')
1743         copy_len--;
1744
1745       /* Submit the line as syntax. */
1746       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1747                                                    xmemdup0 (line, copy_len),
1748                                                    NULL));
1749
1750       src->journal_pos += line_len;
1751     }
1752
1753   token->token_len = state.seg_pos - src->seg_pos;
1754
1755   src->segmenter = state.segmenter;
1756   src->seg_pos = state.seg_pos;
1757   src->line_pos = state.line_pos;
1758   src->n_newlines += state.newlines;
1759
1760   switch (token->token.type)
1761     {
1762     default:
1763       return true;
1764
1765     case T_STOP:
1766       token->token.type = T_ENDCMD;
1767       src->eof = true;
1768       return true;
1769
1770     case SCAN_BAD_HEX_LENGTH:
1771     case SCAN_BAD_HEX_DIGIT:
1772     case SCAN_BAD_UNICODE_DIGIT:
1773     case SCAN_BAD_UNICODE_LENGTH:
1774     case SCAN_BAD_UNICODE_CODE_POINT:
1775     case SCAN_EXPECTED_QUOTE:
1776     case SCAN_EXPECTED_EXPONENT:
1777     case SCAN_UNEXPECTED_CHAR:
1778       char *msg = scan_token_to_error (&token->token);
1779       lex_get_error (src, msg);
1780       free (msg);
1781       return false;
1782
1783     case SCAN_SKIP:
1784       lex_source_pop_front (src);
1785       return false;
1786     }
1787
1788   NOT_REACHED ();
1789 }
1790
1791 /* Attempts to add a new token at the front of SRC.  Returns true if
1792    successful, false on failure.  On failure, the end of SRC has been reached
1793    and no more tokens will be forthcoming from it.
1794
1795    Does not make the new token available for lookahead yet; the caller must
1796    adjust SRC's 'middle' pointer to do so. */
1797 static bool
1798 lex_source_get__ (struct lex_source *src)
1799 {
1800   while (!src->eof)
1801     if (lex_source_try_get__ (src))
1802       return true;
1803   return false;
1804 }
1805
1806 /* Attempts to obtain a new token for SRC, in particular expanding the number
1807    of lookahead tokens (the tokens between 'back' and 'middle').
1808
1809    Returns true if successful, false on failure.  In the latter case, SRC is
1810    exhausted and 'src->eof' is now true. */
1811 static bool
1812 lex_source_get (const struct lex_source *src_)
1813 {
1814   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1815
1816   /* In the common case, call into the scanner and segmenter to obtain a new
1817      token between 'middle' and 'front'.  In the uncommon case, there can be one
1818      or a few tokens there already, leftovers from a macro expansion.
1819
1820      If we call into the scanner and it fails, then we've hit EOF and we're
1821      done. */
1822   if (src->front - src->middle == 0 && !lex_source_get__ (src))
1823     return false;
1824
1825   /* We have at least one token available between 'middle' and 'front'.
1826
1827      The remaining complication is all about macro expansion.  If macro
1828      expansion is disabled, we're done.  */
1829   if (!settings_get_mexpand ())
1830     {
1831       src->middle++;
1832       return true;
1833     }
1834
1835   /* Now pass tokens one-by-one to the macro expander.
1836
1837      In the common case where there is no macro to expand, the loop is not
1838      entered.  */
1839   struct macro_call *mc;
1840   int n_call = macro_call_create (
1841     src->lexer->macros, &src->tokens[src->middle & src->mask].token,
1842     &mc);
1843   for (int middle_ofs = 1; !n_call; middle_ofs++)
1844     {
1845       if (src->front - src->middle <= middle_ofs && !lex_source_get__ (src))
1846         {
1847           /* This should not be reachable because we always get a T_ENDCMD at
1848              the end of an input file (transformed from T_STOP by
1849              lex_source_try_get__()) and the macro_expander should always
1850              terminate expansion on T_ENDCMD. */
1851           NOT_REACHED ();
1852         }
1853
1854       const struct lex_token *t = &src->tokens[(src->middle + middle_ofs)
1855                                                & src->mask];
1856       size_t start = t->token_pos;
1857       size_t end = t->token_pos + t->token_len;
1858       const struct macro_token mt = {
1859         .token = t->token,
1860         .syntax = ss_buffer (&src->buffer[start - src->tail], end - start),
1861       };
1862
1863       /* We temporarily add the tokens to the source to avoid re-entry if
1864          macro_expander_add() reports an error and to give better error
1865          messages. */
1866       src->middle += middle_ofs + 1;
1867       n_call = macro_call_add (mc, &mt);
1868       src->middle -= middle_ofs + 1;
1869     }
1870   if (n_call < 0)
1871     {
1872       /* False alarm: no macro expansion after all.  Use first token as
1873          lookahead.  We'll retry macro expansion from the second token next
1874          time around. */
1875       macro_call_destroy (mc);
1876       src->middle++;
1877       return true;
1878     }
1879
1880   /* Now expand the macro.
1881
1882      We temporarily add the macro call's tokens to the source in case the macro
1883      expansion calls msg() to report an error and error processing tries to get
1884      the location of the error with, e.g. lex_get_first_line_number(), which
1885      would re-enter this code.  This is a kluge; it might be cleaner to pass
1886      the line number into macro_expander_get_expansion(). */
1887   src->middle += n_call;
1888   struct macro_tokens expansion = { .n = 0 };
1889   macro_call_expand (mc, src->reader->syntax, &expansion);
1890   macro_call_destroy (mc);
1891   src->middle -= n_call;
1892
1893   /* Convert the macro expansion into syntax for possible error messages later. */
1894   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1895   size_t *len = xnmalloc (expansion.n, sizeof *len);
1896   struct string s = DS_EMPTY_INITIALIZER;
1897   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1898
1899   if (settings_get_mprint ())
1900     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1901                                           _("Macro Expansion")));
1902
1903   /* The first 'n_call' tokens starting at 'middle' will be replaced by the
1904      macro expansion.  There might be more tokens after that, up to 'front'.
1905
1906      Figure out the boundary of the macro call in the syntax, to go into the
1907      lex_tokens for the expansion so that later error messages can report what
1908      macro was called. */
1909   const struct lex_token *call_first = &src->tokens[src->middle & src->mask];
1910   const struct lex_token *call_last
1911     = &src->tokens[(src->middle + n_call - 1) & src->mask];
1912   size_t call_pos = call_first->token_pos;
1913   size_t call_len = (call_last->token_pos + call_last->token_len) - call_pos;
1914   size_t line_pos = call_first->line_pos;
1915   int first_line = call_first->first_line;
1916
1917   /* Destroy the tokens for the call, and save any tokens following the call so
1918      we can add them back later. */
1919   for (size_t i = src->middle; i != src->middle + n_call; i++)
1920     lex_token_uninit (&src->tokens[i & src->mask]);
1921   size_t n_save = src->front - (src->middle + n_call);
1922   struct lex_token *save_tokens = xnmalloc (n_save, sizeof *save_tokens);
1923   for (size_t i = 0; i < n_save; i++)
1924     save_tokens[i] = src->tokens[(src->middle + n_call + i) & src->mask];
1925   src->front = src->middle;
1926
1927   /* Append the macro expansion tokens to the lookahead. */
1928   char *macro_rep = ds_steal_cstr (&s);
1929   size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1930   *ref_cnt = expansion.n;
1931   for (size_t i = 0; i < expansion.n; i++)
1932     {
1933       *lex_push_token__ (src) = (struct lex_token) {
1934         .token = expansion.mts[i].token,
1935         .token_pos = call_pos,
1936         .token_len = call_len,
1937         .line_pos = line_pos,
1938         .first_line = first_line,
1939         .macro_rep = macro_rep,
1940         .ofs = ofs[i],
1941         .len = len[i],
1942         .ref_cnt = ref_cnt,
1943       };
1944       src->middle++;
1945
1946       ss_dealloc (&expansion.mts[i].syntax);
1947     }
1948   free (expansion.mts);
1949   free (ofs);
1950   free (len);
1951
1952   /* Finally, put the saved tokens back. */
1953   for (size_t i = 0; i < n_save; i++)
1954     *lex_push_token__ (src) = save_tokens[i];
1955   free (save_tokens);
1956
1957   return true;
1958 }
1959 \f
1960 static void
1961 lex_source_push_endcmd__ (struct lex_source *src)
1962 {
1963   assert (src->back == src->middle && src->middle == src->front);
1964   *lex_push_token__ (src) = (struct lex_token) {
1965     .token = { .type = T_ENDCMD } };
1966   src->middle++;
1967 }
1968
1969 static struct lex_source *
1970 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1971 {
1972   struct lex_source *src = xmalloc (sizeof *src);
1973   *src = (struct lex_source) {
1974     .reader = reader,
1975     .segmenter = segmenter_init (reader->syntax, false),
1976     .lexer = lexer,
1977   };
1978
1979   lex_source_push_endcmd__ (src);
1980
1981   return src;
1982 }
1983
1984 static void
1985 lex_source_destroy (struct lex_source *src)
1986 {
1987   char *file_name = src->reader->file_name;
1988   char *encoding = src->reader->encoding;
1989   if (src->reader->class->destroy != NULL)
1990     src->reader->class->destroy (src->reader);
1991   free (file_name);
1992   free (encoding);
1993   free (src->buffer);
1994   while (src->middle - src->back > 0)
1995     lex_source_pop_back (src);
1996   while (src->front - src->middle > 0)
1997     lex_source_pop_front (src);
1998   free (src->tokens);
1999   ll_remove (&src->ll);
2000   free (src);
2001 }
2002 \f
2003 struct lex_file_reader
2004   {
2005     struct lex_reader reader;
2006     struct u8_istream *istream;
2007   };
2008
2009 static struct lex_reader_class lex_file_reader_class;
2010
2011 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2012    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2013    ENCODING, which should take one of the forms accepted by
2014    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2015    mode of the new reader, respectively.
2016
2017    Returns a null pointer if FILE_NAME cannot be opened. */
2018 struct lex_reader *
2019 lex_reader_for_file (const char *file_name, const char *encoding,
2020                      enum segmenter_mode syntax,
2021                      enum lex_error_mode error)
2022 {
2023   struct lex_file_reader *r;
2024   struct u8_istream *istream;
2025
2026   istream = (!strcmp(file_name, "-")
2027              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2028              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2029   if (istream == NULL)
2030     {
2031       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2032       return NULL;
2033     }
2034
2035   r = xmalloc (sizeof *r);
2036   lex_reader_init (&r->reader, &lex_file_reader_class);
2037   r->reader.syntax = syntax;
2038   r->reader.error = error;
2039   r->reader.file_name = xstrdup (file_name);
2040   r->reader.encoding = xstrdup_if_nonnull (encoding);
2041   r->reader.line_number = 1;
2042   r->istream = istream;
2043
2044   return &r->reader;
2045 }
2046
2047 static struct lex_file_reader *
2048 lex_file_reader_cast (struct lex_reader *r)
2049 {
2050   return UP_CAST (r, struct lex_file_reader, reader);
2051 }
2052
2053 static size_t
2054 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2055                enum prompt_style prompt_style UNUSED)
2056 {
2057   struct lex_file_reader *r = lex_file_reader_cast (r_);
2058   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2059   if (n_read < 0)
2060     {
2061       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2062       return 0;
2063     }
2064   return n_read;
2065 }
2066
2067 static void
2068 lex_file_close (struct lex_reader *r_)
2069 {
2070   struct lex_file_reader *r = lex_file_reader_cast (r_);
2071
2072   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2073     {
2074       if (u8_istream_close (r->istream) != 0)
2075         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2076     }
2077   else
2078     u8_istream_free (r->istream);
2079
2080   free (r);
2081 }
2082
2083 static struct lex_reader_class lex_file_reader_class =
2084   {
2085     lex_file_read,
2086     lex_file_close
2087   };
2088 \f
2089 struct lex_string_reader
2090   {
2091     struct lex_reader reader;
2092     struct substring s;
2093     size_t offset;
2094   };
2095
2096 static struct lex_reader_class lex_string_reader_class;
2097
2098 /* Creates and returns a new lex_reader for the contents of S, which must be
2099    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2100    with ss_dealloc() when it is closed. */
2101 struct lex_reader *
2102 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2103 {
2104   struct lex_string_reader *r;
2105
2106   r = xmalloc (sizeof *r);
2107   lex_reader_init (&r->reader, &lex_string_reader_class);
2108   r->reader.syntax = SEG_MODE_AUTO;
2109   r->reader.encoding = xstrdup_if_nonnull (encoding);
2110   r->s = s;
2111   r->offset = 0;
2112
2113   return &r->reader;
2114 }
2115
2116 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2117    which must be encoded in ENCODING.  The caller retains ownership of S. */
2118 struct lex_reader *
2119 lex_reader_for_string (const char *s, const char *encoding)
2120 {
2121   struct substring ss;
2122   ss_alloc_substring (&ss, ss_cstr (s));
2123   return lex_reader_for_substring_nocopy (ss, encoding);
2124 }
2125
/* Expands FORMAT, a printf()-like format string whose arguments follow
   ENCODING in the argument list, and creates and returns a new lex_reader
   whose input is the formatted result, taken to be encoded in ENCODING.  */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  /* The reader takes over the formatted string and frees it on close. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2140
2141 static struct lex_string_reader *
2142 lex_string_reader_cast (struct lex_reader *r)
2143 {
2144   return UP_CAST (r, struct lex_string_reader, reader);
2145 }
2146
2147 static size_t
2148 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2149                  enum prompt_style prompt_style UNUSED)
2150 {
2151   struct lex_string_reader *r = lex_string_reader_cast (r_);
2152   size_t chunk;
2153
2154   chunk = MIN (n, r->s.length - r->offset);
2155   memcpy (buf, r->s.string + r->offset, chunk);
2156   r->offset += chunk;
2157
2158   return chunk;
2159 }
2160
2161 static void
2162 lex_string_close (struct lex_reader *r_)
2163 {
2164   struct lex_string_reader *r = lex_string_reader_cast (r_);
2165
2166   ss_dealloc (&r->s);
2167   free (r);
2168 }
2169
2170 static struct lex_reader_class lex_string_reader_class =
2171   {
2172     lex_string_read,
2173     lex_string_close
2174   };