pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31 #include <uniwidth.h>
  32
  33 #include "language/command.h"
  34 #include "language/lexer/macro.h"
  35 #include "language/lexer/scan.h"
  36 #include "language/lexer/segment.h"
  37 #include "language/lexer/token.h"
  38 #include "libpspp/assertion.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/deque.h"
  41 #include "libpspp/i18n.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
  59 /* A token within a lex_source. */
  60 struct lex_token
  61   {
  62     /* The regular token information. */
  63     struct token token;
  64
  65     /* For a token obtained through the lexer in an ordinary way, this is the
  66        location of the token in terms of the lex_source's buffer.
  67
  68        For a token produced through macro expansion, this is the entire macro
  69        call.
  70
  71        src->tail <= line_pos <= token_pos <= src->head. */
  72     size_t token_pos;           /* Start of token. */
  73     size_t token_len;           /* Length of source for token in bytes. */
  74     size_t line_pos;            /* Start of line containing token_pos. */
  75     int first_line;             /* Line number at token_pos. */
  76
  77     /* For a token obtained through macro expansion, this is just this token. */
  78     char *macro_rep;        /* The whole macro expansion. */
  79     size_t ofs;             /* Offset of this token in macro_rep. */
  80     size_t len;             /* Length of this token in macro_rep. */
  81     size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  82   };
  83
  84 static void
  85 lex_token_uninit (struct lex_token *t)
  86 {
  87   token_uninit (&t->token);
  88   if (t->ref_cnt)
  89     {
  90       assert (*t->ref_cnt > 0);
  91       if (!--*t->ref_cnt)
  92         {
  93           free (t->macro_rep);
  94           free (t->ref_cnt);
  95         }
  96     }
  97 }
  98
  99 /* A source of tokens, corresponding to a syntax file.
 100
 101    This is conceptually a lex_reader wrapped with everything needed to convert
 102    its UTF-8 bytes into tokens. */
 103 struct lex_source
 104   {
 105     struct ll ll;               /* In lexer's list of sources. */
 106     struct lex_reader *reader;
 107     struct lexer *lexer;
 108     struct segmenter segmenter;
 109     bool eof;                   /* True if T_STOP was read from 'reader'. */
 110
 111     /* Buffer of UTF-8 bytes. */
 112     char *buffer;
 113     size_t allocated;           /* Number of bytes allocated. */
 114     size_t tail;                /* &buffer[0] offset into UTF-8 source. */
 115     size_t head;                /* &buffer[head - tail] offset into source. */
 116
 117     /* Positions in source file, tail <= pos <= head for each member here. */
 118     size_t journal_pos;         /* First byte not yet output to journal. */
 119     size_t seg_pos;             /* First byte not yet scanned as token. */
 120     size_t line_pos;            /* First byte of line containing seg_pos. */
 121
 122     int n_newlines;             /* Number of new-lines up to seg_pos. */
 123     bool suppress_next_newline;
 124
 125     /* Tokens.
 126
 127        This is mostly like a deque, with the conceptual invariant that back <=
 128        middle <= front (modulo SIZE_MAX+1).  The tokens available for parsing
 129        lie between 'back' and 'middle': the token at 'back' is the current
 130        token, the token at 'back + 1' is the next token, and so on.  There are
 131        usually no tokens between 'middle' and 'front'; if there are, then they
 132        need to go through macro expansion and are not yet available for
 133        parsing.
 134
 135        'capacity' is the current number of elements in 'tokens'.  It is always
 136        a power of 2.  'front', 'middle', and 'back' refer to indexes in
 137        'tokens' modulo 'capacity'. */
 138     size_t front;
 139     size_t middle;
 140     size_t back;
 141     size_t capacity;
 142     struct lex_token *tokens;
 143   };
 144
 145 static struct lex_source *lex_source_create (struct lexer *,
 146                                              struct lex_reader *);
 147 static void lex_source_destroy (struct lex_source *);
 148
 149 /* Lexer. */
 150 struct lexer
 151   {
 152     struct ll_list sources;     /* Contains "struct lex_source"s. */
 153     struct macro_set *macros;
 154   };
 155
 156 static struct lex_source *lex_source__ (const struct lexer *);
 157 static char *lex_source_get_syntax__ (const struct lex_source *,
 158                                       int n0, int n1);
 159 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 160 static void lex_source_push_endcmd__ (struct lex_source *);
 161
 162 static void lex_source_pop_back (struct lex_source *);
 163 static bool lex_source_get (const struct lex_source *);
 164 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 165                                      const char *format, va_list)
 166    PRINTF_FORMAT (4, 0);
 167 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 168                                                   int n);
 169 \f
 170 /* Initializes READER with the specified CLASS and otherwise some reasonable
 171    defaults.  The caller should fill in the others members as desired. */
 172 void
 173 lex_reader_init (struct lex_reader *reader,
 174                  const struct lex_reader_class *class)
 175 {
 176   reader->class = class;
 177   reader->syntax = SEG_MODE_AUTO;
 178   reader->error = LEX_ERROR_CONTINUE;
 179   reader->file_name = NULL;
 180   reader->encoding = NULL;
 181   reader->line_number = 0;
 182   reader->eof = false;
 183 }
 184
 185 /* Frees any file name already in READER and replaces it by a copy of
 186    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 187 void
 188 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 189 {
 190   free (reader->file_name);
 191   reader->file_name = xstrdup_if_nonnull (file_name);
 192 }
 193 \f
 194 /* Creates and returns a new lexer. */
 195 struct lexer *
 196 lex_create (void)
 197 {
 198   struct lexer *lexer = xmalloc (sizeof *lexer);
 199   *lexer = (struct lexer) {
 200     .sources = LL_INITIALIZER (lexer->sources),
 201     .macros = macro_set_create (),
 202   };
 203   return lexer;
 204 }
 205
 206 /* Destroys LEXER. */
 207 void
 208 lex_destroy (struct lexer *lexer)
 209 {
 210   if (lexer != NULL)
 211     {
 212       struct lex_source *source, *next;
 213
 214       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 215         lex_source_destroy (source);
 216       macro_set_destroy (lexer->macros);
 217       free (lexer);
 218     }
 219 }
 220
 221 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 222    same name.  Takes ownership of M. */
 223 void
 224 lex_define_macro (struct lexer *lexer, struct macro *m)
 225 {
 226   macro_set_add (lexer->macros, m);
 227 }
 228
 229 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 230    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 231    token. */
 232 void
 233 lex_include (struct lexer *lexer, struct lex_reader *reader)
 234 {
 235   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 236   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 237 }
 238
 239 /* Appends READER to LEXER, so that it will be read after all other current
 240    readers have already been read. */
 241 void
 242 lex_append (struct lexer *lexer, struct lex_reader *reader)
 243 {
 244   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 245 }
 246 \f
 247 /* Advancing. */
 248
 249 /* Adds a new token at the front of SRC and returns a pointer to it.  The
 250    caller should initialize it.  Does not advance the middle pointer, so the
 251    token isn't immediately available to the parser. */
 252 static struct lex_token *
 253 lex_push_token__ (struct lex_source *src)
 254 {
 255   if (src->front - src->back >= src->capacity)
 256     {
 257       /* Expansion works just like a deque, so we reuse the code. */
 258       struct deque deque = {
 259         .capacity = src->capacity,
 260         .front = src->front,
 261         .back = src->back,
 262       };
 263       src->tokens = deque_expand (&deque, src->tokens, sizeof *src->tokens);
 264       src->capacity = deque.capacity;
 265     }
 266
 267   struct lex_token *token = &src->tokens[src->front++ & (src->capacity - 1)];
 268   token->token = (struct token) { .type = T_STOP };
 269   token->macro_rep = NULL;
 270   token->ref_cnt = NULL;
 271   return token;
 272 }
 273
 274 /* Removes the current token from SRC and uninitializes it. */
 275 static void
 276 lex_source_pop_back (struct lex_source *src)
 277 {
 278   assert (src->middle - src->back > 0);
 279   lex_token_uninit (&src->tokens[src->back++ & (src->capacity - 1)]);
 280 }
 281
 282 /* Removes the token at the greatest lookahead from SRC and uninitializes
 283    it. */
 284 static void
 285 lex_source_pop_front (struct lex_source *src)
 286 {
 287   assert (src->front - src->middle > 0);
 288   lex_token_uninit (&src->tokens[--src->front & (src->capacity - 1)]);
 289 }
 290
 291 /* Advances LEXER to the next token, consuming the current token. */
 292 void
 293 lex_get (struct lexer *lexer)
 294 {
 295   struct lex_source *src;
 296
 297   src = lex_source__ (lexer);
 298   if (src == NULL)
 299     return;
 300
 301   if (src->middle - src->back > 0)
 302     lex_source_pop_back (src);
 303
 304   while (src->back == src->middle)
 305     if (!lex_source_get (src))
 306       {
 307         lex_source_destroy (src);
 308         src = lex_source__ (lexer);
 309         if (src == NULL)
 310           return;
 311       }
 312 }
 313 \f
 314 /* Issuing errors. */
 315
 316 /* Prints a syntax error message containing the current token and
 317    given message MESSAGE (if non-null). */
 318 void
 319 lex_error (struct lexer *lexer, const char *format, ...)
 320 {
 321   va_list args;
 322
 323   va_start (args, format);
 324   lex_next_error_valist (lexer, 0, 0, format, args);
 325   va_end (args);
 326 }
 327
 328 /* Prints a syntax error message containing the current token and
 329    given message MESSAGE (if non-null). */
 330 void
 331 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 332 {
 333   lex_next_error_valist (lexer, 0, 0, format, args);
 334 }
 335
 336 /* Prints a syntax error message containing the current token and
 337    given message MESSAGE (if non-null). */
 338 void
 339 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 340 {
 341   va_list args;
 342
 343   va_start (args, format);
 344   lex_next_error_valist (lexer, n0, n1, format, args);
 345   va_end (args);
 346 }
 347
 348 /* Prints a syntax error message saying that one of the strings provided as
 349    varargs, up to the first NULL, is expected. */
 350 void
 351 (lex_error_expecting) (struct lexer *lexer, ...)
 352 {
 353   va_list args;
 354
 355   va_start (args, lexer);
 356   lex_error_expecting_valist (lexer, args);
 357   va_end (args);
 358 }
 359
 360 /* Prints a syntax error message saying that one of the options provided in
 361    ARGS, up to the first NULL, is expected. */
 362 void
 363 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 364 {
 365   enum { MAX_OPTIONS = 9 };
 366   const char *options[MAX_OPTIONS];
 367   int n = 0;
 368   while (n < MAX_OPTIONS)
 369     {
 370       const char *option = va_arg (args, const char *);
 371       if (!option)
 372         break;
 373
 374       options[n++] = option;
 375     }
 376   lex_error_expecting_array (lexer, options, n);
 377 }
 378
 379 void
 380 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 381 {
 382   switch (n)
 383     {
 384     case 0:
 385       lex_error (lexer, NULL);
 386       break;
 387
 388     case 1:
 389       lex_error (lexer, _("expecting %s"), options[0]);
 390       break;
 391
 392     case 2:
 393       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 394       break;
 395
 396     case 3:
 397       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 398                  options[2]);
 399       break;
 400
 401     case 4:
 402       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 403                  options[0], options[1], options[2], options[3]);
 404       break;
 405
 406     case 5:
 407       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 408                  options[0], options[1], options[2], options[3], options[4]);
 409       break;
 410
 411     case 6:
 412       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 413                  options[0], options[1], options[2], options[3], options[4],
 414                  options[5]);
 415       break;
 416
 417     case 7:
 418       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 419                  options[0], options[1], options[2], options[3], options[4],
 420                  options[5], options[6]);
 421       break;
 422
 423     case 8:
 424       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 425                  options[0], options[1], options[2], options[3], options[4],
 426                  options[5], options[6], options[7]);
 427       break;
 428
 429     default:
 430       lex_error (lexer, NULL);
 431     }
 432 }
 433
 434 /* Reports an error to the effect that subcommand SBC may only be specified
 435    once.
 436
 437    This function does not take a lexer as an argument or use lex_error(),
 438    because the result would ordinarily just be redundant: "Syntax error at
 439    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 440    not help the user find the error. */
 441 void
 442 lex_sbc_only_once (const char *sbc)
 443 {
 444   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 445 }
 446
 447 /* Reports an error to the effect that subcommand SBC is missing.
 448
 449    This function does not take a lexer as an argument or use lex_error(),
 450    because a missing subcommand can normally be detected only after the whole
 451    command has been parsed, and so lex_error() would always report "Syntax
 452    error at end of command", which does not help the user find the error. */
 453 void
 454 lex_sbc_missing (const char *sbc)
 455 {
 456   msg (SE, _("Required subcommand %s was not specified."), sbc);
 457 }
 458
 459 /* Reports an error to the effect that specification SPEC may only be specified
 460    once within subcommand SBC. */
 461 void
 462 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 463 {
 464   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 465              spec, sbc);
 466 }
 467
 468 /* Reports an error to the effect that specification SPEC is missing within
 469    subcommand SBC. */
 470 void
 471 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 472 {
 473   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 474              sbc, spec);
 475 }
 476
 477 /* Prints a syntax error message containing the current token and
 478    given message MESSAGE (if non-null). */
 479 void
 480 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 481                        const char *format, va_list args)
 482 {
 483   struct lex_source *src = lex_source__ (lexer);
 484
 485   if (src != NULL)
 486     lex_source_error_valist (src, n0, n1, format, args);
 487   else
 488     {
 489       struct string s;
 490
 491       ds_init_empty (&s);
 492       ds_put_format (&s, _("Syntax error at end of input"));
 493       if (format != NULL)
 494         {
 495           ds_put_cstr (&s, ": ");
 496           ds_put_vformat (&s, format, args);
 497         }
 498       ds_put_byte (&s, '.');
 499       msg (SE, "%s", ds_cstr (&s));
 500       ds_destroy (&s);
 501     }
 502 }
 503
 504 /* Checks that we're at end of command.
 505    If so, returns a successful command completion code.
 506    If not, flags a syntax error and returns an error command
 507    completion code. */
 508 int
 509 lex_end_of_command (struct lexer *lexer)
 510 {
 511   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 512     {
 513       lex_error (lexer, _("expecting end of command"));
 514       return CMD_FAILURE;
 515     }
 516   else
 517     return CMD_SUCCESS;
 518 }
 519 \f
 520 /* Token testing functions. */
 521
 522 /* Returns true if the current token is a number. */
 523 bool
 524 lex_is_number (const struct lexer *lexer)
 525 {
 526   return lex_next_is_number (lexer, 0);
 527 }
 528
 529 /* Returns true if the current token is a string. */
 530 bool
 531 lex_is_string (const struct lexer *lexer)
 532 {
 533   return lex_next_is_string (lexer, 0);
 534 }
 535
 536 /* Returns the value of the current token, which must be a
 537    floating point number. */
 538 double
 539 lex_number (const struct lexer *lexer)
 540 {
 541   return lex_next_number (lexer, 0);
 542 }
 543
 544 /* Returns true iff the current token is an integer. */
 545 bool
 546 lex_is_integer (const struct lexer *lexer)
 547 {
 548   return lex_next_is_integer (lexer, 0);
 549 }
 550
 551 /* Returns the value of the current token, which must be an
 552    integer. */
 553 long
 554 lex_integer (const struct lexer *lexer)
 555 {
 556   return lex_next_integer (lexer, 0);
 557 }
 558 \f
 559 /* Token testing functions with lookahead.
 560
 561    A value of 0 for N as an argument to any of these functions refers to the
 562    current token.  Lookahead is limited to the current command.  Any N greater
 563    than the number of tokens remaining in the current command will be treated
 564    as referring to a T_ENDCMD token. */
 565
 566 /* Returns true if the token N ahead of the current token is a number. */
 567 bool
 568 lex_next_is_number (const struct lexer *lexer, int n)
 569 {
 570   return token_is_number (lex_next (lexer, n));
 571 }
 572
 573 /* Returns true if the token N ahead of the current token is a string. */
 574 bool
 575 lex_next_is_string (const struct lexer *lexer, int n)
 576 {
 577   return token_is_string (lex_next (lexer, n));
 578 }
 579
 580 /* Returns the value of the token N ahead of the current token, which must be a
 581    floating point number. */
 582 double
 583 lex_next_number (const struct lexer *lexer, int n)
 584 {
 585   return token_number (lex_next (lexer, n));
 586 }
 587
 588 /* Returns true if the token N ahead of the current token is an integer. */
 589 bool
 590 lex_next_is_integer (const struct lexer *lexer, int n)
 591 {
 592   return token_is_integer (lex_next (lexer, n));
 593 }
 594
 595 /* Returns the value of the token N ahead of the current token, which must be
 596    an integer. */
 597 long
 598 lex_next_integer (const struct lexer *lexer, int n)
 599 {
 600   return token_integer (lex_next (lexer, n));
 601 }
 602 \f
 603 /* Token matching functions. */
 604
 605 /* If the current token has the specified TYPE, skips it and returns true.
 606    Otherwise, returns false. */
 607 bool
 608 lex_match (struct lexer *lexer, enum token_type type)
 609 {
 610   if (lex_token (lexer) == type)
 611     {
 612       lex_get (lexer);
 613       return true;
 614     }
 615   else
 616     return false;
 617 }
 618
 619 /* If the current token matches IDENTIFIER, skips it and returns true.
 620    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 621    returns false.
 622
 623    IDENTIFIER must be an ASCII string. */
 624 bool
 625 lex_match_id (struct lexer *lexer, const char *identifier)
 626 {
 627   return lex_match_id_n (lexer, identifier, 3);
 628 }
 629
 630 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 631    may be abbreviated to its first N letters.  Otherwise, returns false.
 632
 633    IDENTIFIER must be an ASCII string. */
 634 bool
 635 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 636 {
 637   if (lex_token (lexer) == T_ID
 638       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 639     {
 640       lex_get (lexer);
 641       return true;
 642     }
 643   else
 644     return false;
 645 }
 646
 647 /* If the current token is integer X, skips it and returns true.  Otherwise,
 648    returns false. */
 649 bool
 650 lex_match_int (struct lexer *lexer, int x)
 651 {
 652   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 653     {
 654       lex_get (lexer);
 655       return true;
 656     }
 657   else
 658     return false;
 659 }
 660 \f
 661 /* Forced matches. */
 662
 663 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 664    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 665    false.
 666
 667    IDENTIFIER must be an ASCII string. */
 668 bool
 669 lex_force_match_id (struct lexer *lexer, const char *identifier)
 670 {
 671   if (lex_match_id (lexer, identifier))
 672     return true;
 673   else
 674     {
 675       lex_error_expecting (lexer, identifier);
 676       return false;
 677     }
 678 }
 679
 680 /* If the current token has the specified TYPE, skips it and returns true.
 681    Otherwise, reports an error and returns false. */
 682 bool
 683 lex_force_match (struct lexer *lexer, enum token_type type)
 684 {
 685   if (lex_token (lexer) == type)
 686     {
 687       lex_get (lexer);
 688       return true;
 689     }
 690   else
 691     {
 692       const char *type_string = token_type_to_string (type);
 693       if (type_string)
 694         {
 695           char *s = xasprintf ("`%s'", type_string);
 696           lex_error_expecting (lexer, s);
 697           free (s);
 698         }
 699       else
 700         lex_error_expecting (lexer, token_type_to_name (type));
 701
 702       return false;
 703     }
 704 }
 705
 706 /* If the current token is a string, does nothing and returns true.
 707    Otherwise, reports an error and returns false. */
 708 bool
 709 lex_force_string (struct lexer *lexer)
 710 {
 711   if (lex_is_string (lexer))
 712     return true;
 713   else
 714     {
 715       lex_error (lexer, _("expecting string"));
 716       return false;
 717     }
 718 }
 719
 720 /* If the current token is a string or an identifier, does nothing and returns
 721    true.  Otherwise, reports an error and returns false.
 722
 723    This is meant for use in syntactic situations where we want to encourage the
 724    user to supply a quoted string, but for compatibility we also accept
 725    identifiers.  (One example of such a situation is file names.)  Therefore,
 726    the error message issued when the current token is wrong only says that a
 727    string is expected and doesn't mention that an identifier would also be
 728    accepted. */
 729 bool
 730 lex_force_string_or_id (struct lexer *lexer)
 731 {
 732   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 733 }
 734
 735 /* If the current token is an integer, does nothing and returns true.
 736    Otherwise, reports an error and returns false. */
 737 bool
 738 lex_force_int (struct lexer *lexer)
 739 {
 740   if (lex_is_integer (lexer))
 741     return true;
 742   else
 743     {
 744       lex_error (lexer, _("expecting integer"));
 745       return false;
 746     }
 747 }
 748
 749 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 750    nothing and returns true.  Otherwise, reports an error and returns false.
 751    If NAME is nonnull, then it is used in the error message. */
 752 bool
 753 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 754 {
 755   bool is_integer = lex_is_integer (lexer);
 756   bool too_small = is_integer && lex_integer (lexer) < min;
 757   bool too_big = is_integer && lex_integer (lexer) > max;
 758   if (is_integer && !too_small && !too_big)
 759     return true;
 760
 761   if (min > max)
 762     {
 763       /* Weird, maybe a bug in the caller.  Just report that we needed an
 764          integer. */
 765       if (name)
 766         lex_error (lexer, _("Integer expected for %s."), name);
 767       else
 768         lex_error (lexer, _("Integer expected."));
 769     }
 770   else if (min == max)
 771     {
 772       if (name)
 773         lex_error (lexer, _("Expected %ld for %s."), min, name);
 774       else
 775         lex_error (lexer, _("Expected %ld."), min);
 776     }
 777   else if (min + 1 == max)
 778     {
 779       if (name)
 780         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 781       else
 782         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 783     }
 784   else
 785     {
 786       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 787       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 788
 789       if (report_lower_bound && report_upper_bound)
 790         {
 791           if (name)
 792             lex_error (lexer,
 793                        _("Expected integer between %ld and %ld for %s."),
 794                        min, max, name);
 795           else
 796             lex_error (lexer, _("Expected integer between %ld and %ld."),
 797                        min, max);
 798         }
 799       else if (report_lower_bound)
 800         {
 801           if (min == 0)
 802             {
 803               if (name)
 804                 lex_error (lexer, _("Expected non-negative integer for %s."),
 805                            name);
 806               else
 807                 lex_error (lexer, _("Expected non-negative integer."));
 808             }
 809           else if (min == 1)
 810             {
 811               if (name)
 812                 lex_error (lexer, _("Expected positive integer for %s."),
 813                            name);
 814               else
 815                 lex_error (lexer, _("Expected positive integer."));
 816             }
 817         }
 818       else if (report_upper_bound)
 819         {
 820           if (name)
 821             lex_error (lexer,
 822                        _("Expected integer less than or equal to %ld for %s."),
 823                        max, name);
 824           else
 825             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 826                        max);
 827         }
 828       else
 829         {
 830           if (name)
 831             lex_error (lexer, _("Integer expected for %s."), name);
 832           else
 833             lex_error (lexer, _("Integer expected."));
 834         }
 835     }
 836   return false;
 837 }
 838
 839 /* If the current token is a number, does nothing and returns true.
 840    Otherwise, reports an error and returns false. */
 841 bool
 842 lex_force_num (struct lexer *lexer)
 843 {
 844   if (lex_is_number (lexer))
 845     return true;
 846
 847   lex_error (lexer, _("expecting number"));
 848   return false;
 849 }
 850
 851 /* If the current token is an identifier, does nothing and returns true.
 852    Otherwise, reports an error and returns false. */
 853 bool
 854 lex_force_id (struct lexer *lexer)
 855 {
 856   if (lex_token (lexer) == T_ID)
 857     return true;
 858
 859   lex_error (lexer, _("expecting identifier"));
 860   return false;
 861 }
 862 \f
 863 /* Token accessors. */
 864
 865 /* Returns the type of LEXER's current token. */
 866 enum token_type
 867 lex_token (const struct lexer *lexer)
 868 {
 869   return lex_next_token (lexer, 0);
 870 }
 871
 872 /* Returns the number in LEXER's current token.
 873
 874    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 875    tokens this function will always return zero. */
 876 double
 877 lex_tokval (const struct lexer *lexer)
 878 {
 879   return lex_next_tokval (lexer, 0);
 880 }
 881
 882 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 883
 884    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 885    this functions this function will always return NULL.
 886
 887    The UTF-8 encoding of the returned string is correct for variable names and
 888    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 889    data_in() to use it in a "union value".  */
 890 const char *
 891 lex_tokcstr (const struct lexer *lexer)
 892 {
 893   return lex_next_tokcstr (lexer, 0);
 894 }
 895
 896 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 897    null-terminated (but the null terminator is not included in the returned
 898    substring's 'length').
 899
 900    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 901    this functions this function will always return NULL.
 902
 903    The UTF-8 encoding of the returned string is correct for variable names and
 904    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 905    data_in() to use it in a "union value".  */
 906 struct substring
 907 lex_tokss (const struct lexer *lexer)
 908 {
 909   return lex_next_tokss (lexer, 0);
 910 }
 911 \f
 912 /* Looking ahead.
 913
 914    A value of 0 for N as an argument to any of these functions refers to the
 915    current token.  Lookahead is limited to the current command.  Any N greater
 916    than the number of tokens remaining in the current command will be treated
 917    as referring to a T_ENDCMD token. */
 918
 919 static const struct lex_token *
 920 lex_next__ (const struct lexer *lexer_, int n)
 921 {
 922   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
 923   struct lex_source *src = lex_source__ (lexer);
 924
 925   if (src != NULL)
 926     return lex_source_next__ (src, n);
 927   else
 928     {
 929       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
 930       return &stop_token;
 931     }
 932 }
 933
 934 /* Returns the token in SRC with the greatest lookahead. */
 935 static const struct lex_token *
 936 lex_source_middle (const struct lex_source *src)
 937 {
 938   assert (src->middle - src->back > 0);
 939   return &src->tokens[(src->middle - 1) & (src->capacity - 1)];
 940 }
 941
 942 static const struct lex_token *
 943 lex_source_next__ (const struct lex_source *src, int n)
 944 {
 945   while (src->middle - src->back <= n)
 946     {
 947       if (src->middle - src->back > 0)
 948         {
 949           const struct lex_token *middle = lex_source_middle (src);
 950           if (middle->token.type == T_STOP || middle->token.type == T_ENDCMD)
 951             return middle;
 952         }
 953
 954       lex_source_get (src);
 955     }
 956
 957   return &src->tokens[(src->back + n) & (src->capacity - 1)];
 958 }
 959
 960 /* Returns the "struct token" of the token N after the current one in LEXER.
 961    The returned pointer can be invalidated by pretty much any succeeding call
 962    into the lexer, although the string pointer within the returned token is
 963    only invalidated by consuming the token (e.g. with lex_get()). */
 964 const struct token *
 965 lex_next (const struct lexer *lexer, int n)
 966 {
 967   return &lex_next__ (lexer, n)->token;
 968 }
 969
 970 /* Returns the type of the token N after the current one in LEXER. */
 971 enum token_type
 972 lex_next_token (const struct lexer *lexer, int n)
 973 {
 974   return lex_next (lexer, n)->type;
 975 }
 976
 977 /* Returns the number in the tokn N after the current one in LEXER.
 978
 979    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 980    tokens this function will always return zero. */
 981 double
 982 lex_next_tokval (const struct lexer *lexer, int n)
 983 {
 984   return token_number (lex_next (lexer, n));
 985 }
 986
 987 /* Returns the null-terminated string in the token N after the current one, in
 988    UTF-8 encoding.
 989
 990    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 991    this functions this function will always return NULL.
 992
 993    The UTF-8 encoding of the returned string is correct for variable names and
 994    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 995    data_in() to use it in a "union value".  */
 996 const char *
 997 lex_next_tokcstr (const struct lexer *lexer, int n)
 998 {
 999   return lex_next_tokss (lexer, n).string;
1000 }
1001
1002 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1003    The string is null-terminated (but the null terminator is not included in
1004    the returned substring's 'length').
1005
1006    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1007    tokens this functions this function will always return NULL.
1008
1009    The UTF-8 encoding of the returned string is correct for variable names and
1010    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1011    data_in() to use it in a "union value".  */
1012 struct substring
1013 lex_next_tokss (const struct lexer *lexer, int n)
1014 {
1015   return lex_next (lexer, n)->string;
1016 }
1017
1018 /* Returns the text of the syntax in tokens N0 ahead of the current one,
1019    through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
1020    are both zero, this requests the syntax for the current token.)  The caller
1021    must eventually free the returned string (with free()).  The syntax is
1022    encoded in UTF-8 and in the original form supplied to the lexer so that, for
1023    example, it may include comments, spaces, and new-lines if it spans multiple
1024    tokens.  Macro expansion, however, has already been performed. */
1025 char *
1026 lex_next_representation (const struct lexer *lexer, int n0, int n1)
1027 {
1028   return lex_source_get_syntax__ (lex_source__ (lexer), n0, n1);
1029 }
1030
1031 bool
1032 lex_next_is_from_macro (const struct lexer *lexer, int n)
1033 {
1034   return lex_next__ (lexer, n)->macro_rep != NULL;
1035 }
1036
1037 static bool
1038 lex_tokens_match (const struct token *actual, const struct token *expected)
1039 {
1040   if (actual->type != expected->type)
1041     return false;
1042
1043   switch (actual->type)
1044     {
1045     case T_POS_NUM:
1046     case T_NEG_NUM:
1047       return actual->number == expected->number;
1048
1049     case T_ID:
1050       return lex_id_match (expected->string, actual->string);
1051
1052     case T_STRING:
1053       return (actual->string.length == expected->string.length
1054               && !memcmp (actual->string.string, expected->string.string,
1055                           actual->string.length));
1056
1057     default:
1058       return true;
1059     }
1060 }
1061
1062 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
1063    skips it and returns true.  Otherwise, returns false.
1064
1065    S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
1066    "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
1067    first three letters. */
1068 bool
1069 lex_match_phrase (struct lexer *lexer, const char *s)
1070 {
1071   struct string_lexer slex;
1072   struct token token;
1073   int i;
1074
1075   i = 0;
1076   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1077   while (string_lexer_next (&slex, &token))
1078     if (token.type != SCAN_SKIP)
1079       {
1080         bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1081         token_uninit (&token);
1082         if (!match)
1083           return false;
1084       }
1085
1086   while (i-- > 0)
1087     lex_get (lexer);
1088   return true;
1089 }
1090
1091 static int
1092 lex_source_get_first_line_number (const struct lex_source *src, int n)
1093 {
1094   return lex_source_next__ (src, n)->first_line;
1095 }
1096
1097 static int
1098 count_newlines (char *s, size_t length)
1099 {
1100   int n_newlines = 0;
1101   char *newline;
1102
1103   while ((newline = memchr (s, '\n', length)) != NULL)
1104     {
1105       n_newlines++;
1106       length -= (newline + 1) - s;
1107       s = newline + 1;
1108     }
1109
1110   return n_newlines;
1111 }
1112
1113 static int
1114 lex_source_get_last_line_number (const struct lex_source *src, int n)
1115 {
1116   const struct lex_token *token = lex_source_next__ (src, n);
1117
1118   if (token->first_line == 0)
1119     return 0;
1120   else
1121     {
1122       char *token_str = &src->buffer[token->token_pos - src->tail];
1123       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1124     }
1125 }
1126
1127 static int
1128 count_columns (const char *s_, size_t length)
1129 {
1130   const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
1131   int columns;
1132   size_t ofs;
1133   int mblen;
1134
1135   columns = 0;
1136   for (ofs = 0; ofs < length; ofs += mblen)
1137     {
1138       ucs4_t uc;
1139
1140       mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
1141       if (uc != '\t')
1142         {
1143           int width = uc_width (uc, "UTF-8");
1144           if (width > 0)
1145             columns += width;
1146         }
1147       else
1148         columns = ROUND_UP (columns + 1, 8);
1149     }
1150
1151   return columns + 1;
1152 }
1153
1154 static int
1155 lex_source_get_first_column (const struct lex_source *src, int n)
1156 {
1157   const struct lex_token *token = lex_source_next__ (src, n);
1158   return count_columns (&src->buffer[token->line_pos - src->tail],
1159                         token->token_pos - token->line_pos);
1160 }
1161
1162 static int
1163 lex_source_get_last_column (const struct lex_source *src, int n)
1164 {
1165   const struct lex_token *token = lex_source_next__ (src, n);
1166   char *start, *end, *newline;
1167
1168   start = &src->buffer[token->line_pos - src->tail];
1169   end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
1170   newline = memrchr (start, '\n', end - start);
1171   if (newline != NULL)
1172     start = newline + 1;
1173   return count_columns (start, end - start);
1174 }
1175
1176 /* Returns the 1-based line number of the start of the syntax that represents
1177    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1178    if the token is drawn from a source that does not have line numbers. */
1179 int
1180 lex_get_first_line_number (const struct lexer *lexer, int n)
1181 {
1182   const struct lex_source *src = lex_source__ (lexer);
1183   return src != NULL ? lex_source_get_first_line_number (src, n) : 0;
1184 }
1185
1186 /* Returns the 1-based line number of the end of the syntax that represents the
1187    token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
1188    token or if the token is drawn from a source that does not have line
1189    numbers.
1190
1191    Most of the time, a single token is wholly within a single line of syntax,
1192    but there are two exceptions: a T_STRING token can be made up of multiple
1193    segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
1194    token can consist of a "-" on one line followed by the number on the next.
1195  */
1196 int
1197 lex_get_last_line_number (const struct lexer *lexer, int n)
1198 {
1199   const struct lex_source *src = lex_source__ (lexer);
1200   return src != NULL ? lex_source_get_last_line_number (src, n) : 0;
1201 }
1202
1203 /* Returns the 1-based column number of the start of the syntax that represents
1204    the token N after the current one in LEXER.  Returns 0 for a T_STOP
1205    token.
1206
1207    Column numbers are measured according to the width of characters as shown in
1208    a typical fixed-width font, in which CJK characters have width 2 and
1209    combining characters have width 0.  */
1210 int
1211 lex_get_first_column (const struct lexer *lexer, int n)
1212 {
1213   const struct lex_source *src = lex_source__ (lexer);
1214   return src != NULL ? lex_source_get_first_column (src, n) : 0;
1215 }
1216
1217 /* Returns the 1-based column number of the end of the syntax that represents
1218    the token N after the current one in LEXER, plus 1.  Returns 0 for a T_STOP
1219    token.
1220
1221    Column numbers are measured according to the width of characters as shown in
1222    a typical fixed-width font, in which CJK characters have width 2 and
1223    combining characters have width 0.  */
1224 int
1225 lex_get_last_column (const struct lexer *lexer, int n)
1226 {
1227   const struct lex_source *src = lex_source__ (lexer);
1228   return src != NULL ? lex_source_get_last_column (src, n) : 0;
1229 }
1230
1231 /* Returns the name of the syntax file from which the current command is drawn.
1232    Returns NULL for a T_STOP token or if the command's source does not have
1233    line numbers.
1234
1235    There is no version of this function that takes an N argument because
1236    lookahead only works to the end of a command and any given command is always
1237    within a single syntax file. */
1238 const char *
1239 lex_get_file_name (const struct lexer *lexer)
1240 {
1241   struct lex_source *src = lex_source__ (lexer);
1242   return src == NULL ? NULL : src->reader->file_name;
1243 }
1244
1245 /* Returns a newly allocated msg_location for the syntax that represents tokens
1246    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1247    must eventually free the location (with msg_location_destroy()). */
1248 struct msg_location *
1249 lex_get_location (const struct lexer *lexer, int n0, int n1)
1250 {
1251   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1252   loc->first_column = lex_get_first_column (lexer, n0);
1253   loc->last_column = lex_get_last_column (lexer, n1);
1254   return loc;
1255 }
1256
1257 /* Returns a newly allocated msg_location for the syntax that represents tokens
1258    with 0-based offsets N0...N1, inclusive, from the current token.  The
1259    location only covers the tokens' lines, not the columns.  The caller must
1260    eventually free the location (with msg_location_destroy()). */
1261 struct msg_location *
1262 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1263 {
1264   struct msg_location *loc = xmalloc (sizeof *loc);
1265   *loc = (struct msg_location) {
1266     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1267     .first_line = lex_get_first_line_number (lexer, n0),
1268     .last_line = lex_get_last_line_number (lexer, n1),
1269   };
1270   return loc;
1271 }
1272
1273 const char *
1274 lex_get_encoding (const struct lexer *lexer)
1275 {
1276   struct lex_source *src = lex_source__ (lexer);
1277   return src == NULL ? NULL : src->reader->encoding;
1278 }
1279
1280 /* Returns the syntax mode for the syntax file from which the current drawn is
1281    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1282    does not have line numbers.
1283
1284    There is no version of this function that takes an N argument because
1285    lookahead only works to the end of a command and any given command is always
1286    within a single syntax file. */
1287 enum segmenter_mode
1288 lex_get_syntax_mode (const struct lexer *lexer)
1289 {
1290   struct lex_source *src = lex_source__ (lexer);
1291   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1292 }
1293
1294 /* Returns the error mode for the syntax file from which the current drawn is
1295    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1296    source does not have line numbers.
1297
1298    There is no version of this function that takes an N argument because
1299    lookahead only works to the end of a command and any given command is always
1300    within a single syntax file. */
1301 enum lex_error_mode
1302 lex_get_error_mode (const struct lexer *lexer)
1303 {
1304   struct lex_source *src = lex_source__ (lexer);
1305   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1306 }
1307
1308 /* If the source that LEXER is currently reading has error mode
1309    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1310    token to be read comes directly from whatever is next read from the stream.
1311
1312    It makes sense to call this function after encountering an error in a
1313    command entered on the console, because usually the user would prefer not to
1314    have cascading errors. */
1315 void
1316 lex_interactive_reset (struct lexer *lexer)
1317 {
1318   struct lex_source *src = lex_source__ (lexer);
1319   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1320     {
1321       src->head = src->tail = 0;
1322       src->journal_pos = src->seg_pos = src->line_pos = 0;
1323       src->n_newlines = 0;
1324       src->suppress_next_newline = false;
1325       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1326                                        false);
1327       while (src->middle - src->back > 0)
1328         lex_source_pop_back (src);
1329       while (src->front - src->middle > 0)
1330         lex_source_pop_front (src);
1331       lex_source_push_endcmd__ (src);
1332     }
1333 }
1334
1335 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1336 void
1337 lex_discard_rest_of_command (struct lexer *lexer)
1338 {
1339   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1340     lex_get (lexer);
1341 }
1342
1343 /* Discards all lookahead tokens in LEXER, then discards all input sources
1344    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1345    runs out of input sources. */
1346 void
1347 lex_discard_noninteractive (struct lexer *lexer)
1348 {
1349   struct lex_source *src = lex_source__ (lexer);
1350
1351   if (src != NULL)
1352     {
1353       while (src->middle - src->back > 0)
1354         lex_source_pop_back (src);
1355
1356       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1357            src = lex_source__ (lexer))
1358         lex_source_destroy (src);
1359     }
1360 }
1361 \f
1362 static size_t
1363 lex_source_max_tail__ (const struct lex_source *src)
1364 {
1365   const struct lex_token *token;
1366   size_t max_tail;
1367
1368   assert (src->seg_pos >= src->line_pos);
1369   max_tail = MIN (src->journal_pos, src->line_pos);
1370
1371   /* Use the oldest token also.  (We know that src->deque cannot be empty
1372      because we are in the process of adding a new token, which is already
1373      initialized enough to use here.) */
1374   token = &src->tokens[src->back & (src->capacity - 1)];
1375   assert (token->token_pos >= token->line_pos);
1376   max_tail = MIN (max_tail, token->line_pos);
1377
1378   return max_tail;
1379 }
1380
1381 static void
1382 lex_source_expand__ (struct lex_source *src)
1383 {
1384   if (src->head - src->tail >= src->allocated)
1385     {
1386       size_t max_tail = lex_source_max_tail__ (src);
1387       if (max_tail > src->tail)
1388         {
1389           /* Advance the tail, freeing up room at the head. */
1390           memmove (src->buffer, src->buffer + (max_tail - src->tail),
1391                    src->head - max_tail);
1392           src->tail = max_tail;
1393         }
1394       else
1395         {
1396           /* Buffer is completely full.  Expand it. */
1397           src->buffer = x2realloc (src->buffer, &src->allocated);
1398         }
1399     }
1400   else
1401     {
1402       /* There's space available at the head of the buffer.  Nothing to do. */
1403     }
1404 }
1405
1406 static void
1407 lex_source_read__ (struct lex_source *src)
1408 {
1409   do
1410     {
1411       lex_source_expand__ (src);
1412
1413       size_t head_ofs = src->head - src->tail;
1414       size_t space = src->allocated - head_ofs;
1415       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1416       size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1417                                            space, prompt);
1418       assert (n <= space);
1419
1420       if (n == 0)
1421         {
1422           /* End of input. */
1423           src->reader->eof = true;
1424           lex_source_expand__ (src);
1425           return;
1426         }
1427
1428       src->head += n;
1429     }
1430   while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1431                   src->head - src->seg_pos));
1432 }
1433
1434 static struct lex_source *
1435 lex_source__ (const struct lexer *lexer)
1436 {
1437   return (ll_is_empty (&lexer->sources) ? NULL
1438           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1439 }
1440
1441 static char *
1442 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1443 {
1444   struct string s = DS_EMPTY_INITIALIZER;
1445   for (size_t i = n0; i <= n1; )
1446     {
1447       /* Find [I,J) as the longest sequence of tokens not produced by macro
1448          expansion, or otherwise the longest sequence expanded from a single
1449          macro call. */
1450       const struct lex_token *first = lex_source_next__ (src, i);
1451       size_t j;
1452       for (j = i + 1; j <= n1; j++)
1453         {
1454           const struct lex_token *cur = lex_source_next__ (src, j);
1455           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1456               || first->macro_rep != cur->macro_rep)
1457             break;
1458         }
1459       const struct lex_token *last = lex_source_next__ (src, j - 1);
1460
1461       if (!ds_is_empty (&s))
1462         ds_put_byte (&s, ' ');
1463       if (!first->macro_rep)
1464         {
1465           size_t start = first->token_pos;
1466           size_t end = last->token_pos + last->token_len;
1467           ds_put_substring (&s, ss_buffer (&src->buffer[start - src->tail],
1468                                            end - start));
1469         }
1470       else
1471         {
1472           size_t start = first->ofs;
1473           size_t end = last->ofs + last->len;
1474           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1475                                            end - start));
1476         }
1477
1478       i = j;
1479     }
1480   return ds_steal_cstr (&s);
1481 }
1482
1483 static void
1484 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1485 {
1486   size_t out_maxlen;
1487   size_t out_len;
1488   int mblen;
1489
1490   assert (out_size >= 16);
1491   out_maxlen = out_size - 1;
1492   if (in.length > out_maxlen - 3)
1493     out_maxlen -= 3;
1494
1495   for (out_len = 0; out_len < in.length; out_len += mblen)
1496     {
1497       if (in.string[out_len] == '\n'
1498           || in.string[out_len] == '\0'
1499           || (in.string[out_len] == '\r'
1500               && out_len + 1 < in.length
1501               && in.string[out_len + 1] == '\n'))
1502         break;
1503
1504       mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1505                         in.length - out_len);
1506
1507       if (mblen < 0)
1508         break;
1509
1510       if (out_len + mblen > out_maxlen)
1511         break;
1512     }
1513
1514   memcpy (out, in.string, out_len);
1515   strcpy (&out[out_len], out_len < in.length ? "..." : "");
1516 }
1517
1518 static bool
1519 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1520 {
1521   for (size_t i = n0; i <= n1; i++)
1522     if (lex_source_next__ (src, i)->macro_rep)
1523       return true;
1524   return false;
1525 }
1526
1527 static struct substring
1528 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1529 {
1530   if (!lex_source_contains_macro_call (src, n0, n1))
1531     return ss_empty ();
1532
1533   const struct lex_token *token0 = lex_source_next__ (src, n0);
1534   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1535   size_t start = token0->token_pos;
1536   size_t end = token1->token_pos + token1->token_len;
1537
1538   return ss_buffer (&src->buffer[start - src->tail], end - start);
1539 }
1540
1541 static void
1542 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1543                          const char *format, va_list args)
1544 {
1545   const struct lex_token *token;
1546   struct string s;
1547
1548   ds_init_empty (&s);
1549
1550   token = lex_source_next__ (src, n0);
1551   if (token->token.type == T_ENDCMD)
1552     ds_put_cstr (&s, _("Syntax error at end of command"));
1553   else
1554     {
1555       /* Get the syntax that caused the error. */
1556       char *syntax = lex_source_get_syntax__ (src, n0, n1);
1557       char syntax_cstr[64];
1558       lex_ellipsize__ (ss_cstr (syntax), syntax_cstr, sizeof syntax_cstr);
1559       free (syntax);
1560
1561       /* Get the macro call(s) that expanded to the syntax that caused the
1562          error. */
1563       char call_cstr[64];
1564       struct substring call = lex_source_get_macro_call (src, n0, n1);
1565       lex_ellipsize__ (call, call_cstr, sizeof call_cstr);
1566
1567       if (syntax_cstr[0])
1568         {
1569           if (call_cstr[0])
1570             ds_put_format (&s, _("Syntax error at `%s' "
1571                                  "(in expansion of `%s')"),
1572                            syntax_cstr, call_cstr);
1573           else
1574             ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1575         }
1576       else if (call_cstr[0])
1577         ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1578                        call_cstr);
1579       else
1580         ds_put_cstr (&s, _("Syntax error"));
1581     }
1582
1583   if (format)
1584     {
1585       ds_put_cstr (&s, ": ");
1586       ds_put_vformat (&s, format, args);
1587     }
1588   if (ds_last (&s) != '.')
1589     ds_put_byte (&s, '.');
1590
1591   struct msg_location *location = xmalloc (sizeof *location);
1592   *location = (struct msg_location) {
1593     .file_name = xstrdup_if_nonnull (src->reader->file_name),
1594     .first_line = lex_source_get_first_line_number (src, n0),
1595     .last_line = lex_source_get_last_line_number (src, n1),
1596     .first_column = lex_source_get_first_column (src, n0),
1597     .last_column = lex_source_get_last_column (src, n1),
1598   };
1599   struct msg *m = xmalloc (sizeof *m);
1600   *m = (struct msg) {
1601     .category = MSG_C_SYNTAX,
1602     .severity = MSG_S_ERROR,
1603     .location = location,
1604     .text = ds_steal_cstr (&s),
1605   };
1606   msg_emit (m);
1607 }
1608
1609 static void PRINTF_FORMAT (2, 3)
1610 lex_get_error (struct lex_source *src, const char *format, ...)
1611 {
1612   va_list args;
1613   va_start (args, format);
1614
1615   size_t old_middle = src->middle;
1616   src->middle = src->front;
1617   size_t n = src->front - src->back - 1;
1618   lex_source_error_valist (src, n, n, format, args);
1619   src->middle = old_middle;
1620
1621   lex_source_pop_front (src);
1622
1623   va_end (args);
1624 }
1625
1626 /* Attempts to append an additional token at the front of SRC, reading more
1627    from the underlying lex_reader if necessary.  Returns true if a new token
1628    was added to SRC's deque, false otherwise.  The caller should retry failures
1629    unless SRC's 'eof' marker was set to true indicating that there will be no
1630    more tokens from this source.
1631
1632    Does not make the new token available for lookahead yet; the caller must
1633    adjust SRC's 'middle' pointer to do so. */
1634 static bool
1635 lex_source_try_get__ (struct lex_source *src)
1636 {
1637   /* State maintained while scanning tokens.  Usually we only need a single
1638      state, but scanner_push() can return SCAN_SAVE to indicate that the state
1639      needs to be saved and possibly restored later with SCAN_BACK. */
1640   struct state
1641     {
1642       struct segmenter segmenter;
1643       enum segment_type last_segment;
1644       int newlines;             /* Number of newlines encountered so far. */
1645       /* Maintained here so we can update lex_source's similar members when we
1646          finish. */
1647       size_t line_pos;
1648       size_t seg_pos;
1649     };
1650
1651   /* Initialize state. */
1652   struct state state =
1653     {
1654       .segmenter = src->segmenter,
1655       .newlines = 0,
1656       .seg_pos = src->seg_pos,
1657       .line_pos = src->line_pos,
1658     };
1659   struct state saved = state;
1660
1661   /* Append a new token to SRC and initialize it. */
1662   struct lex_token *token = lex_push_token__ (src);
1663   struct scanner scanner;
1664   scanner_init (&scanner, &token->token);
1665   token->line_pos = src->line_pos;
1666   token->token_pos = src->seg_pos;
1667   if (src->reader->line_number > 0)
1668     token->first_line = src->reader->line_number + src->n_newlines;
1669   else
1670     token->first_line = 0;
1671
1672   /* Extract segments and pass them through the scanner until we obtain a
1673      token. */
1674   for (;;)
1675     {
1676       /* Extract a segment. */
1677       const char *segment = &src->buffer[state.seg_pos - src->tail];
1678       size_t seg_maxlen = src->head - state.seg_pos;
1679       enum segment_type type;
1680       int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1681                                     src->reader->eof, &type);
1682       if (seg_len < 0)
1683         {
1684           /* The segmenter needs more input to produce a segment. */
1685           assert (!src->reader->eof);
1686           lex_source_read__ (src);
1687           continue;
1688         }
1689
1690       /* Update state based on the segment. */
1691       state.last_segment = type;
1692       state.seg_pos += seg_len;
1693       if (type == SEG_NEWLINE)
1694         {
1695           state.newlines++;
1696           state.line_pos = state.seg_pos;
1697         }
1698
1699       /* Pass the segment into the scanner and try to get a token out. */
1700       enum scan_result result = scanner_push (&scanner, type,
1701                                               ss_buffer (segment, seg_len),
1702                                               &token->token);
1703       if (result == SCAN_SAVE)
1704         saved = state;
1705       else if (result == SCAN_BACK)
1706         {
1707           state = saved;
1708           break;
1709         }
1710       else if (result == SCAN_DONE)
1711         break;
1712     }
1713
1714   /* If we've reached the end of a line, or the end of a command, then pass
1715      the line to the output engine as a syntax text item.  */
1716   int n_lines = state.newlines;
1717   if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1718     {
1719       n_lines++;
1720       src->suppress_next_newline = true;
1721     }
1722   else if (n_lines > 0 && src->suppress_next_newline)
1723     {
1724       n_lines--;
1725       src->suppress_next_newline = false;
1726     }
1727   for (int i = 0; i < n_lines; i++)
1728     {
1729       /* Beginning of line. */
1730       const char *line = &src->buffer[src->journal_pos - src->tail];
1731
1732       /* Calculate line length, including \n or \r\n end-of-line if present.
1733
1734          We use src->head even though that may be beyond what we've actually
1735          converted to tokens (which is only through state.line_pos).  That's
1736          because, if we're emitting the line due to SEG_END_COMMAND, we want to
1737          take the whole line through the newline, not just through the '.'. */
1738       size_t max_len = src->head - src->journal_pos;
1739       const char *newline = memchr (line, '\n', max_len);
1740       size_t line_len = newline ? newline - line + 1 : max_len;
1741
1742       /* Calculate line length excluding end-of-line. */
1743       size_t copy_len = line_len;
1744       if (copy_len > 0 && line[copy_len - 1] == '\n')
1745         copy_len--;
1746       if (copy_len > 0 && line[copy_len - 1] == '\r')
1747         copy_len--;
1748
1749       /* Submit the line as syntax. */
1750       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1751                                                    xmemdup0 (line, copy_len),
1752                                                    NULL));
1753
1754       src->journal_pos += line_len;
1755     }
1756
1757   token->token_len = state.seg_pos - src->seg_pos;
1758
1759   src->segmenter = state.segmenter;
1760   src->seg_pos = state.seg_pos;
1761   src->line_pos = state.line_pos;
1762   src->n_newlines += state.newlines;
1763
1764   switch (token->token.type)
1765     {
1766     default:
1767       return true;
1768
1769     case T_STOP:
1770       token->token.type = T_ENDCMD;
1771       src->eof = true;
1772       return true;
1773
1774     case SCAN_BAD_HEX_LENGTH:
1775       lex_get_error (src, _("String of hex digits has %d characters, which "
1776                             "is not a multiple of 2"),
1777                      (int) token->token.number);
1778       return false;
1779
1780     case SCAN_BAD_HEX_DIGIT:
1781     case SCAN_BAD_UNICODE_DIGIT:
1782       lex_get_error (src, _("`%c' is not a valid hex digit"),
1783                      (int) token->token.number);
1784       return false;
1785
1786     case SCAN_BAD_UNICODE_LENGTH:
1787       lex_get_error (src, _("Unicode string contains %d bytes, which is "
1788                             "not in the valid range of 1 to 8 bytes"),
1789                      (int) token->token.number);
1790       return false;
1791
1792     case SCAN_BAD_UNICODE_CODE_POINT:
1793       lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1794                      (int) token->token.number);
1795       return false;
1796
1797     case SCAN_EXPECTED_QUOTE:
1798       lex_get_error (src, _("Unterminated string constant"));
1799       return false;
1800
1801     case SCAN_EXPECTED_EXPONENT:
1802       lex_get_error (src, _("Missing exponent following `%s'"),
1803                      token->token.string.string);
1804       return false;
1805
1806     case SCAN_UNEXPECTED_CHAR:
1807       {
1808         char c_name[16];
1809         lex_get_error (src, _("Bad character %s in input"),
1810                        uc_name (token->token.number, c_name));
1811         return false;
1812       }
1813
1814     case SCAN_SKIP:
1815       lex_source_pop_front (src);
1816       return false;
1817     }
1818
1819   NOT_REACHED ();
1820 }
1821
1822 /* Attempts to add a new token at the front of SRC.  Returns true if
1823    successful, false on failure.  On failure, the end of SRC has been reached
1824    and no more tokens will be forthcoming from it.
1825
1826    Does not make the new token available for lookahead yet; the caller must
1827    adjust SRC's 'middle' pointer to do so. */
1828 static bool
1829 lex_source_get__ (struct lex_source *src)
1830 {
1831   while (!src->eof)
1832     if (lex_source_try_get__ (src))
1833       return true;
1834   return false;
1835 }
1836
1837 static bool
1838 lex_source_get (const struct lex_source *src_)
1839 {
1840   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1841
1842   if (src->front - src->middle == 0)
1843     {
1844       if (!lex_source_get__ (src))
1845         return false;
1846     }
1847
1848   if (!settings_get_mexpand ())
1849     {
1850       src->middle++;
1851       return true;
1852     }
1853
1854   struct macro_expander *me;
1855   int n_call = macro_expander_create (
1856     src->lexer->macros, &src->tokens[src->middle & (src->capacity - 1)].token,
1857     &me);
1858   for (int middle_ofs = 1; !n_call; middle_ofs++)
1859     {
1860       if (src->front - src->middle <= middle_ofs && !lex_source_get__ (src))
1861         {
1862           /* This should not be reachable because we always get a T_ENDCMD at
1863              the end of an input file (transformed from T_STOP by
1864              lex_source_try_get__()) and the macro_expander should always
1865              terminate expansion on T_ENDCMD. */
1866           NOT_REACHED ();
1867         }
1868
1869       const struct lex_token *t = &src->tokens[(src->middle + middle_ofs)
1870                                                & (src->capacity - 1)];
1871       size_t start = t->token_pos;
1872       size_t end = t->token_pos + t->token_len;
1873       const struct macro_token mt = {
1874         .token = t->token,
1875         .representation = ss_buffer (&src->buffer[start - src->tail],
1876                                      end - start),
1877       };
1878       src->middle += middle_ofs + 1;
1879       n_call = macro_expander_add (me, &mt);
1880       src->middle -= middle_ofs + 1;
1881     }
1882   if (n_call < 0)
1883     {
1884       /* False alarm: no macro expansion after all.  Use first token as
1885          lookahead.  We'll retry macro expansion from the second token next
1886          time around. */
1887       macro_expander_destroy (me);
1888       src->middle++;
1889       return true;
1890     }
1891
1892   /* The first 'n_call' tokens starting at 'middle' will be replaced by a
1893      macro expansion.  There might be more tokens after that, up to 'front'.
1894
1895      Figure out the boundary of the macro call in the syntax, to go into the
1896      lex_tokens for the expansion so that later error messages can report what
1897      macro was called. */
1898   const struct lex_token *call_first
1899     = &src->tokens[src->middle & (src->capacity - 1)];
1900   const struct lex_token *call_last
1901     = &src->tokens[(src->middle + n_call - 1) & (src->capacity - 1)];
1902   size_t call_pos = call_first->token_pos;
1903   size_t call_len = (call_last->token_pos + call_last->token_len) - call_pos;
1904   size_t line_pos = call_first->line_pos;
1905   int first_line = call_first->first_line;
1906
1907   /* Destroy the tokens for the call, and save any tokens following the call so
1908      we can add them back later. */
1909   for (size_t i = src->middle; i != src->middle + n_call; i++)
1910     lex_token_uninit (&src->tokens[i & (src->capacity - 1)]);
1911   size_t n_save = src->front - (src->middle + n_call);
1912   struct lex_token *save_tokens = xnmalloc (n_save, sizeof *save_tokens);
1913   for (size_t i = 0; i < n_save; i++)
1914     save_tokens[i] = src->tokens[(src->middle + n_call + i)
1915                                  & (src->capacity - 1)];
1916   src->front = src->middle;
1917
1918   /* Now expand the macro. */
1919   struct macro_tokens expansion = { .n = 0 };
1920   macro_expander_get_expansion (me, &expansion);
1921   macro_expander_destroy (me);
1922
1923   /* Convert the macro expansion into syntax for possible error messages later. */
1924   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1925   size_t *len = xnmalloc (expansion.n, sizeof *len);
1926   struct string s = DS_EMPTY_INITIALIZER;
1927   macro_tokens_to_representation (&expansion, &s, ofs, len);
1928
1929   if (settings_get_mprint ())
1930     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1931                                           _("Macro Expansion")));
1932
1933   /* Append the macro expansion tokens to the lookahead. */
1934   char *macro_rep = ds_steal_cstr (&s);
1935   size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1936   *ref_cnt = expansion.n;
1937   for (size_t i = 0; i < expansion.n; i++)
1938     {
1939       *lex_push_token__ (src) = (struct lex_token) {
1940         .token = expansion.mts[i].token,
1941         .token_pos = call_pos,
1942         .token_len = call_len,
1943         .line_pos = line_pos,
1944         .first_line = first_line,
1945         .macro_rep = macro_rep,
1946         .ofs = ofs[i],
1947         .len = len[i],
1948         .ref_cnt = ref_cnt,
1949       };
1950       src->middle++;
1951
1952       ss_dealloc (&expansion.mts[i].representation);
1953     }
1954   free (expansion.mts);
1955   free (ofs);
1956   free (len);
1957
1958   /* Finally, put the saved tokens back. */
1959   for (size_t i = 0; i < n_save; i++)
1960     *lex_push_token__ (src) = save_tokens[i];
1961   free (save_tokens);
1962
1963   return true;
1964 }
1965 \f
1966 static void
1967 lex_source_push_endcmd__ (struct lex_source *src)
1968 {
1969   assert (src->back == src->middle && src->middle == src->front);
1970   *lex_push_token__ (src) = (struct lex_token) {
1971     .token = { .type = T_ENDCMD } };
1972   src->middle++;
1973 }
1974
1975 static struct lex_source *
1976 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
1977 {
1978   struct lex_source *src = xmalloc (sizeof *src);
1979   *src = (struct lex_source) {
1980     .reader = reader,
1981     .segmenter = segmenter_init (reader->syntax, false),
1982     .lexer = lexer,
1983   };
1984
1985   lex_source_push_endcmd__ (src);
1986
1987   return src;
1988 }
1989
1990 static void
1991 lex_source_destroy (struct lex_source *src)
1992 {
1993   char *file_name = src->reader->file_name;
1994   char *encoding = src->reader->encoding;
1995   if (src->reader->class->destroy != NULL)
1996     src->reader->class->destroy (src->reader);
1997   free (file_name);
1998   free (encoding);
1999   free (src->buffer);
2000   while (src->middle - src->back > 0)
2001     lex_source_pop_back (src);
2002   while (src->front - src->middle > 0)
2003     lex_source_pop_front (src);
2004   free (src->tokens);
2005   ll_remove (&src->ll);
2006   free (src);
2007 }
2008 \f
2009 struct lex_file_reader
2010   {
2011     struct lex_reader reader;
2012     struct u8_istream *istream;
2013   };
2014
2015 static struct lex_reader_class lex_file_reader_class;
2016
2017 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2018    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2019    ENCODING, which should take one of the forms accepted by
2020    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2021    mode of the new reader, respectively.
2022
2023    Returns a null pointer if FILE_NAME cannot be opened. */
2024 struct lex_reader *
2025 lex_reader_for_file (const char *file_name, const char *encoding,
2026                      enum segmenter_mode syntax,
2027                      enum lex_error_mode error)
2028 {
2029   struct lex_file_reader *r;
2030   struct u8_istream *istream;
2031
2032   istream = (!strcmp(file_name, "-")
2033              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2034              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2035   if (istream == NULL)
2036     {
2037       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2038       return NULL;
2039     }
2040
2041   r = xmalloc (sizeof *r);
2042   lex_reader_init (&r->reader, &lex_file_reader_class);
2043   r->reader.syntax = syntax;
2044   r->reader.error = error;
2045   r->reader.file_name = xstrdup (file_name);
2046   r->reader.encoding = xstrdup_if_nonnull (encoding);
2047   r->reader.line_number = 1;
2048   r->istream = istream;
2049
2050   return &r->reader;
2051 }
2052
2053 static struct lex_file_reader *
2054 lex_file_reader_cast (struct lex_reader *r)
2055 {
2056   return UP_CAST (r, struct lex_file_reader, reader);
2057 }
2058
2059 static size_t
2060 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2061                enum prompt_style prompt_style UNUSED)
2062 {
2063   struct lex_file_reader *r = lex_file_reader_cast (r_);
2064   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2065   if (n_read < 0)
2066     {
2067       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2068       return 0;
2069     }
2070   return n_read;
2071 }
2072
2073 static void
2074 lex_file_close (struct lex_reader *r_)
2075 {
2076   struct lex_file_reader *r = lex_file_reader_cast (r_);
2077
2078   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2079     {
2080       if (u8_istream_close (r->istream) != 0)
2081         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2082     }
2083   else
2084     u8_istream_free (r->istream);
2085
2086   free (r);
2087 }
2088
2089 static struct lex_reader_class lex_file_reader_class =
2090   {
2091     lex_file_read,
2092     lex_file_close
2093   };
2094 \f
2095 struct lex_string_reader
2096   {
2097     struct lex_reader reader;
2098     struct substring s;
2099     size_t offset;
2100   };
2101
2102 static struct lex_reader_class lex_string_reader_class;
2103
2104 /* Creates and returns a new lex_reader for the contents of S, which must be
2105    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2106    with ss_dealloc() when it is closed. */
2107 struct lex_reader *
2108 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2109 {
2110   struct lex_string_reader *r;
2111
2112   r = xmalloc (sizeof *r);
2113   lex_reader_init (&r->reader, &lex_string_reader_class);
2114   r->reader.syntax = SEG_MODE_AUTO;
2115   r->reader.encoding = xstrdup_if_nonnull (encoding);
2116   r->s = s;
2117   r->offset = 0;
2118
2119   return &r->reader;
2120 }
2121
2122 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2123    which must be encoded in ENCODING.  The caller retains ownership of S. */
2124 struct lex_reader *
2125 lex_reader_for_string (const char *s, const char *encoding)
2126 {
2127   struct substring ss;
2128   ss_alloc_substring (&ss, ss_cstr (s));
2129   return lex_reader_for_substring_nocopy (ss, encoding);
2130 }
2131
2132 /* Formats FORMAT as a printf()-like format string and creates and returns a
2133    new lex_reader for the formatted result.  */
2134 struct lex_reader *
2135 lex_reader_for_format (const char *format, const char *encoding, ...)
2136 {
2137   struct lex_reader *r;
2138   va_list args;
2139
2140   va_start (args, encoding);
2141   r = lex_reader_for_substring_nocopy (ss_cstr (xvasprintf (format, args)), encoding);
2142   va_end (args);
2143
2144   return r;
2145 }
2146
2147 static struct lex_string_reader *
2148 lex_string_reader_cast (struct lex_reader *r)
2149 {
2150   return UP_CAST (r, struct lex_string_reader, reader);
2151 }
2152
2153 static size_t
2154 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2155                  enum prompt_style prompt_style UNUSED)
2156 {
2157   struct lex_string_reader *r = lex_string_reader_cast (r_);
2158   size_t chunk;
2159
2160   chunk = MIN (n, r->s.length - r->offset);
2161   memcpy (buf, r->s.string + r->offset, chunk);
2162   r->offset += chunk;
2163
2164   return chunk;
2165 }
2166
2167 static void
2168 lex_string_close (struct lex_reader *r_)
2169 {
2170   struct lex_string_reader *r = lex_string_reader_cast (r_);
2171
2172   ss_dealloc (&r->s);
2173   free (r);
2174 }
2175
2176 static struct lex_reader_class lex_string_reader_class =
2177   {
2178     lex_string_read,
2179     lex_string_close
2180   };