pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/ll.h"
  42 #include "libpspp/message.h"
  43 #include "libpspp/misc.h"
  44 #include "libpspp/str.h"
  45 #include "libpspp/u8-istream.h"
  46 #include "output/journal.h"
  47 #include "output/output-item.h"
  48
  49 #include "gl/c-ctype.h"
  50 #include "gl/minmax.h"
  51 #include "gl/xalloc.h"
  52 #include "gl/xmemdup0.h"
  53
  54 #include "gettext.h"
  55 #define _(msgid) gettext (msgid)
  56 #define N_(msgid) msgid
  57
/* A token within a lex_source.

   In addition to the token proper, a lex_token remembers where the token came
   from: either a span of the lex_source's buffer, or (for tokens produced by
   macro expansion) a span of a shared, reference-counted expansion string. */
struct lex_token
  {
    /* The regular token information. */
    struct token token;

    /* For a token obtained through the lexer in an ordinary way, this is the
       location of the token in terms of the lex_source's buffer.

       For a token produced through macro expansion, this is the entire macro
       call. */
    size_t token_pos;           /* Offset into src->buffer of token start. */
    size_t token_len;           /* Length of source for token in bytes. */
    int first_line;             /* Line number at token_pos. */

    /* For a token obtained through macro expansion, this is just this token.

       For a token obtained through the lexer in an ordinary way, these are
       nulls and zeros.

       'macro_rep' and 'ref_cnt' are shared among tokens: lex_token_destroy()
       decrements *ref_cnt and frees both once it drops to zero. */
    char *macro_rep;        /* The whole macro expansion. */
    size_t ofs;             /* Offset of this token in macro_rep. */
    size_t len;             /* Length of this token in macro_rep. */
    size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  };
  82
  83 static void
  84 lex_token_destroy (struct lex_token *t)
  85 {
  86   token_uninit (&t->token);
  87   if (t->ref_cnt)
  88     {
  89       assert (*t->ref_cnt > 0);
  90       if (!--*t->ref_cnt)
  91         {
  92           free (t->macro_rep);
  93           free (t->ref_cnt);
  94         }
  95     }
  96   free (t);
  97 }
  98 \f
/* A deque of lex_tokens that comprises one stage in the token pipeline in a
   lex_source.

   'deque' holds the bookkeeping and 'tokens' is the array it indexes: the
   lex_stage_*() functions translate deque positions into slots in 'tokens'
   with deque_back(), deque_push_front() and deque_pop_back(). */
struct lex_stage
  {
    struct deque deque;
    struct lex_token **tokens;
  };
 106
 107 static void lex_stage_clear (struct lex_stage *);
 108 static void lex_stage_uninit (struct lex_stage *);
 109
 110 static size_t lex_stage_count (const struct lex_stage *);
 111 static bool lex_stage_is_empty (const struct lex_stage *);
 112
 113 static struct lex_token *lex_stage_first (struct lex_stage *);
 114 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 115
 116 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 117 static void lex_stage_pop_first (struct lex_stage *);
 118
 119 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 120                              size_t n);
 121
 122 /* Deletes all the tokens from STAGE. */
 123 static void
 124 lex_stage_clear (struct lex_stage *stage)
 125 {
 126   while (!deque_is_empty (&stage->deque))
 127     lex_stage_pop_first (stage);
 128 }
 129
 130 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 131 static void
 132 lex_stage_uninit (struct lex_stage *stage)
 133 {
 134   lex_stage_clear (stage);
 135   free (stage->tokens);
 136 }
 137
 138 /* Returns true if STAGE contains no tokens, otherwise false. */
 139 static bool
 140 lex_stage_is_empty (const struct lex_stage *stage)
 141 {
 142   return deque_is_empty (&stage->deque);
 143 }
 144
 145 /* Returns the number of tokens in STAGE. */
 146 static size_t
 147 lex_stage_count (const struct lex_stage *stage)
 148 {
 149   return deque_count (&stage->deque);
 150 }
 151
 152 /* Returns the first token in STAGE, which must be nonempty.
 153    The first token is the one accessed with the least lookahead. */
 154 static struct lex_token *
 155 lex_stage_first (struct lex_stage *stage)
 156 {
 157   return lex_stage_nth (stage, 0);
 158 }
 159
 160 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 161    lookahead) is 0, the second token is 1, and so on.  There must be at least
 162    INDEX + 1 tokens in STAGE. */
 163 static struct lex_token *
 164 lex_stage_nth (struct lex_stage *stage, size_t index)
 165 {
 166   return stage->tokens[deque_back (&stage->deque, index)];
 167 }
 168
 169 /* Adds TOKEN so that it becomes the last token in STAGE. */
 170 static void
 171 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 172 {
 173   if (deque_is_full (&stage->deque))
 174     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 175                                   sizeof *stage->tokens);
 176   stage->tokens[deque_push_front (&stage->deque)] = token;
 177 }
 178
 179 /* Removes and returns the first token from STAGE. */
 180 static struct lex_token *
 181 lex_stage_take_first (struct lex_stage *stage)
 182 {
 183   return stage->tokens[deque_pop_back (&stage->deque)];
 184 }
 185
 186 /* Removes the first token from STAGE and uninitializes it. */
 187 static void
 188 lex_stage_pop_first (struct lex_stage *stage)
 189 {
 190   lex_token_destroy (lex_stage_take_first (stage));
 191 }
 192
 193 /* Removes the first N tokens from SRC, appending them to DST as the last
 194    tokens. */
 195 static void
 196 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 197 {
 198   for (size_t i = 0; i < n; i++)
 199     lex_stage_push_last (dst, lex_stage_take_first (src));
 200 }
 201
/* A source of tokens, corresponding to a syntax file.

   This is conceptually a lex_reader wrapped with everything needed to convert
   its UTF-8 bytes into tokens. */
struct lex_source
  {
    struct ll ll;               /* In lexer's list of sources. */
    struct lex_reader *reader;
    struct lexer *lexer;
    struct segmenter segmenter;
    bool eof;                   /* True if T_STOP was read from 'reader'. */

    /* Buffer of UTF-8 bytes. */
    char *buffer;               /* Source file contents. */
    size_t length;              /* Number of bytes filled. */
    size_t allocated;           /* Number of bytes allocated. */

    /* Offsets into 'buffer'. */
    size_t journal_pos;         /* First byte not yet output to journal. */
    size_t seg_pos;             /* First byte not yet scanned as token. */

    int n_newlines;             /* Number of new-lines up to seg_pos. */
    bool suppress_next_newline;

    /* Tokens.

       This is a pipeline with the following stages.  Each token eventually
       made available to the parser passes through all of these stages.  The
       stages are named after the processing that happens in each one.

       Initially, tokens come from the segmenter and scanner to 'pp':

       - pp: Tokens that need to pass through the macro preprocessor to end up
         in 'merge'.

       - merge: Tokens that need to pass through scan_merge() to end up in
         'parse'.

       - parse: Tokens available to the client for parsing.

      'pp' and 'merge' store tokens only temporarily until they pass into
      'parse'.  Tokens then live in 'parse' until the command is fully
      consumed, at which time they are freed together.

      'parse' is an array of 'n_parse' tokens, and 'parse_ofs' is the index of
      the current token within it: lex_get() advances 'parse_ofs' and refills
      'parse' when 'parse_ofs' reaches 'n_parse'.  'allocated_parse' is
      presumably the capacity of 'parse' -- confirm against
      lex_source_push_parse(). */
    struct lex_stage pp;
    struct lex_stage merge;
    struct lex_token **parse;
    size_t n_parse, allocated_parse, parse_ofs;
  };
 250
 251 static struct lex_source *lex_source_create (struct lexer *,
 252                                              struct lex_reader *);
 253 static void lex_source_destroy (struct lex_source *);
 254
/* Lexer.

   A lexer owns an ordered list of lex_sources plus the set of macros that
   have been defined.  lex_destroy() destroys every source on the list and the
   macro set. */
struct lexer
  {
    struct ll_list sources;     /* Contains "struct lex_source"s. */
    struct macro_set *macros;
  };
 261
 262 static struct lex_source *lex_source__ (const struct lexer *);
 263 static char *lex_source_get_syntax__ (const struct lex_source *,
 264                                       int n0, int n1);
 265 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 266 static void lex_source_push_endcmd__ (struct lex_source *);
 267 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
 268 static void lex_source_clear_parse (struct lex_source *);
 269
 270 static bool lex_source_get_parse (struct lex_source *);
 271 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 272                                      const char *format, va_list)
 273    PRINTF_FORMAT (4, 0);
 274 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 275                                                   int n);
 276 \f
 277 /* Initializes READER with the specified CLASS and otherwise some reasonable
 278    defaults.  The caller should fill in the others members as desired. */
 279 void
 280 lex_reader_init (struct lex_reader *reader,
 281                  const struct lex_reader_class *class)
 282 {
 283   reader->class = class;
 284   reader->syntax = SEG_MODE_AUTO;
 285   reader->error = LEX_ERROR_CONTINUE;
 286   reader->file_name = NULL;
 287   reader->encoding = NULL;
 288   reader->line_number = 0;
 289   reader->eof = false;
 290 }
 291
 292 /* Frees any file name already in READER and replaces it by a copy of
 293    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 294 void
 295 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 296 {
 297   free (reader->file_name);
 298   reader->file_name = xstrdup_if_nonnull (file_name);
 299 }
 300 \f
 301 /* Creates and returns a new lexer. */
 302 struct lexer *
 303 lex_create (void)
 304 {
 305   struct lexer *lexer = xmalloc (sizeof *lexer);
 306   *lexer = (struct lexer) {
 307     .sources = LL_INITIALIZER (lexer->sources),
 308     .macros = macro_set_create (),
 309   };
 310   return lexer;
 311 }
 312
 313 /* Destroys LEXER. */
 314 void
 315 lex_destroy (struct lexer *lexer)
 316 {
 317   if (lexer != NULL)
 318     {
 319       struct lex_source *source, *next;
 320
 321       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 322         lex_source_destroy (source);
 323       macro_set_destroy (lexer->macros);
 324       free (lexer);
 325     }
 326 }
 327
 328 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 329    same name.  Takes ownership of M. */
 330 void
 331 lex_define_macro (struct lexer *lexer, struct macro *m)
 332 {
 333   macro_set_add (lexer->macros, m);
 334 }
 335
 336 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 337    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 338    token. */
 339 void
 340 lex_include (struct lexer *lexer, struct lex_reader *reader)
 341 {
 342   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 343   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 344 }
 345
 346 /* Appends READER to LEXER, so that it will be read after all other current
 347    readers have already been read. */
 348 void
 349 lex_append (struct lexer *lexer, struct lex_reader *reader)
 350 {
 351   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 352 }
 353 \f
 354 /* Advancing. */
 355
 356 /* Advances LEXER to the next token, consuming the current token. */
 357 void
 358 lex_get (struct lexer *lexer)
 359 {
 360   struct lex_source *src;
 361
 362   src = lex_source__ (lexer);
 363   if (src == NULL)
 364     return;
 365
 366   if (src->parse_ofs < src->n_parse)
 367     {
 368       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 369         lex_source_clear_parse (src);
 370       else
 371         src->parse_ofs++;
 372     }
 373
 374   while (src->parse_ofs == src->n_parse)
 375     if (!lex_source_get_parse (src))
 376       {
 377         lex_source_destroy (src);
 378         src = lex_source__ (lexer);
 379         if (src == NULL)
 380           return;
 381       }
 382 }
 383
 384 /* Advances LEXER by N tokens. */
 385 void
 386 lex_get_n (struct lexer *lexer, size_t n)
 387 {
 388   while (n-- > 0)
 389     lex_get (lexer);
 390 }
 391 \f
 392 /* Issuing errors. */
 393
 394 /* Prints a syntax error message containing the current token and
 395    given message MESSAGE (if non-null). */
 396 void
 397 lex_error (struct lexer *lexer, const char *format, ...)
 398 {
 399   va_list args;
 400
 401   va_start (args, format);
 402   lex_next_error_valist (lexer, 0, 0, format, args);
 403   va_end (args);
 404 }
 405
 406 /* Prints a syntax error message containing the current token and
 407    given message MESSAGE (if non-null). */
 408 void
 409 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 410 {
 411   lex_next_error_valist (lexer, 0, 0, format, args);
 412 }
 413
 414 /* Prints a syntax error message containing the current token and
 415    given message MESSAGE (if non-null). */
 416 void
 417 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 418 {
 419   va_list args;
 420
 421   va_start (args, format);
 422   lex_next_error_valist (lexer, n0, n1, format, args);
 423   va_end (args);
 424 }
 425
 426 /* Prints a syntax error message saying that one of the strings provided as
 427    varargs, up to the first NULL, is expected. */
 428 void
 429 (lex_error_expecting) (struct lexer *lexer, ...)
 430 {
 431   va_list args;
 432
 433   va_start (args, lexer);
 434   lex_error_expecting_valist (lexer, args);
 435   va_end (args);
 436 }
 437
 438 /* Prints a syntax error message saying that one of the options provided in
 439    ARGS, up to the first NULL, is expected. */
 440 void
 441 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 442 {
 443   enum { MAX_OPTIONS = 9 };
 444   const char *options[MAX_OPTIONS];
 445   int n = 0;
 446   while (n < MAX_OPTIONS)
 447     {
 448       const char *option = va_arg (args, const char *);
 449       if (!option)
 450         break;
 451
 452       options[n++] = option;
 453     }
 454   lex_error_expecting_array (lexer, options, n);
 455 }
 456
 457 void
 458 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 459 {
 460   switch (n)
 461     {
 462     case 0:
 463       lex_error (lexer, NULL);
 464       break;
 465
 466     case 1:
 467       lex_error (lexer, _("expecting %s"), options[0]);
 468       break;
 469
 470     case 2:
 471       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 472       break;
 473
 474     case 3:
 475       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 476                  options[2]);
 477       break;
 478
 479     case 4:
 480       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 481                  options[0], options[1], options[2], options[3]);
 482       break;
 483
 484     case 5:
 485       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 486                  options[0], options[1], options[2], options[3], options[4]);
 487       break;
 488
 489     case 6:
 490       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 491                  options[0], options[1], options[2], options[3], options[4],
 492                  options[5]);
 493       break;
 494
 495     case 7:
 496       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 497                  options[0], options[1], options[2], options[3], options[4],
 498                  options[5], options[6]);
 499       break;
 500
 501     case 8:
 502       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 503                  options[0], options[1], options[2], options[3], options[4],
 504                  options[5], options[6], options[7]);
 505       break;
 506
 507     default:
 508       lex_error (lexer, NULL);
 509     }
 510 }
 511
 512 /* Reports an error to the effect that subcommand SBC may only be specified
 513    once.
 514
 515    This function does not take a lexer as an argument or use lex_error(),
 516    because the result would ordinarily just be redundant: "Syntax error at
 517    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 518    not help the user find the error. */
 519 void
 520 lex_sbc_only_once (const char *sbc)
 521 {
 522   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 523 }
 524
 525 /* Reports an error to the effect that subcommand SBC is missing.
 526
 527    This function does not take a lexer as an argument or use lex_error(),
 528    because a missing subcommand can normally be detected only after the whole
 529    command has been parsed, and so lex_error() would always report "Syntax
 530    error at end of command", which does not help the user find the error. */
 531 void
 532 lex_sbc_missing (const char *sbc)
 533 {
 534   msg (SE, _("Required subcommand %s was not specified."), sbc);
 535 }
 536
 537 /* Reports an error to the effect that specification SPEC may only be specified
 538    once within subcommand SBC. */
 539 void
 540 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 541 {
 542   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 543              spec, sbc);
 544 }
 545
 546 /* Reports an error to the effect that specification SPEC is missing within
 547    subcommand SBC. */
 548 void
 549 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 550 {
 551   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 552              sbc, spec);
 553 }
 554
 555 /* Prints a syntax error message containing the current token and
 556    given message MESSAGE (if non-null). */
 557 void
 558 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 559                        const char *format, va_list args)
 560 {
 561   struct lex_source *src = lex_source__ (lexer);
 562
 563   if (src != NULL)
 564     lex_source_error_valist (src, n0, n1, format, args);
 565   else
 566     {
 567       struct string s;
 568
 569       ds_init_empty (&s);
 570       ds_put_format (&s, _("Syntax error at end of input"));
 571       if (format != NULL)
 572         {
 573           ds_put_cstr (&s, ": ");
 574           ds_put_vformat (&s, format, args);
 575         }
 576       if (ds_last (&s) != '.')
 577         ds_put_byte (&s, '.');
 578       msg (SE, "%s", ds_cstr (&s));
 579       ds_destroy (&s);
 580     }
 581 }
 582
 583 /* Checks that we're at end of command.
 584    If so, returns a successful command completion code.
 585    If not, flags a syntax error and returns an error command
 586    completion code. */
 587 int
 588 lex_end_of_command (struct lexer *lexer)
 589 {
 590   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 591     {
 592       lex_error (lexer, _("expecting end of command"));
 593       return CMD_FAILURE;
 594     }
 595   else
 596     return CMD_SUCCESS;
 597 }
 598 \f
 599 /* Token testing functions. */
 600
 601 /* Returns true if the current token is a number. */
 602 bool
 603 lex_is_number (const struct lexer *lexer)
 604 {
 605   return lex_next_is_number (lexer, 0);
 606 }
 607
 608 /* Returns true if the current token is a string. */
 609 bool
 610 lex_is_string (const struct lexer *lexer)
 611 {
 612   return lex_next_is_string (lexer, 0);
 613 }
 614
 615 /* Returns the value of the current token, which must be a
 616    floating point number. */
 617 double
 618 lex_number (const struct lexer *lexer)
 619 {
 620   return lex_next_number (lexer, 0);
 621 }
 622
 623 /* Returns true iff the current token is an integer. */
 624 bool
 625 lex_is_integer (const struct lexer *lexer)
 626 {
 627   return lex_next_is_integer (lexer, 0);
 628 }
 629
 630 /* Returns the value of the current token, which must be an
 631    integer. */
 632 long
 633 lex_integer (const struct lexer *lexer)
 634 {
 635   return lex_next_integer (lexer, 0);
 636 }
 637 \f
 638 /* Token testing functions with lookahead.
 639
 640    A value of 0 for N as an argument to any of these functions refers to the
 641    current token.  Lookahead is limited to the current command.  Any N greater
 642    than the number of tokens remaining in the current command will be treated
 643    as referring to a T_ENDCMD token. */
 644
 645 /* Returns true if the token N ahead of the current token is a number. */
 646 bool
 647 lex_next_is_number (const struct lexer *lexer, int n)
 648 {
 649   return token_is_number (lex_next (lexer, n));
 650 }
 651
 652 /* Returns true if the token N ahead of the current token is a string. */
 653 bool
 654 lex_next_is_string (const struct lexer *lexer, int n)
 655 {
 656   return token_is_string (lex_next (lexer, n));
 657 }
 658
 659 /* Returns the value of the token N ahead of the current token, which must be a
 660    floating point number. */
 661 double
 662 lex_next_number (const struct lexer *lexer, int n)
 663 {
 664   return token_number (lex_next (lexer, n));
 665 }
 666
 667 /* Returns true if the token N ahead of the current token is an integer. */
 668 bool
 669 lex_next_is_integer (const struct lexer *lexer, int n)
 670 {
 671   return token_is_integer (lex_next (lexer, n));
 672 }
 673
 674 /* Returns the value of the token N ahead of the current token, which must be
 675    an integer. */
 676 long
 677 lex_next_integer (const struct lexer *lexer, int n)
 678 {
 679   return token_integer (lex_next (lexer, n));
 680 }
 681 \f
 682 /* Token matching functions. */
 683
 684 /* If the current token has the specified TYPE, skips it and returns true.
 685    Otherwise, returns false. */
 686 bool
 687 lex_match (struct lexer *lexer, enum token_type type)
 688 {
 689   if (lex_token (lexer) == type)
 690     {
 691       lex_get (lexer);
 692       return true;
 693     }
 694   else
 695     return false;
 696 }
 697
 698 /* If the current token matches IDENTIFIER, skips it and returns true.
 699    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 700    returns false.
 701
 702    IDENTIFIER must be an ASCII string. */
 703 bool
 704 lex_match_id (struct lexer *lexer, const char *identifier)
 705 {
 706   return lex_match_id_n (lexer, identifier, 3);
 707 }
 708
 709 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 710    may be abbreviated to its first N letters.  Otherwise, returns false.
 711
 712    IDENTIFIER must be an ASCII string. */
 713 bool
 714 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 715 {
 716   if (lex_token (lexer) == T_ID
 717       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 718     {
 719       lex_get (lexer);
 720       return true;
 721     }
 722   else
 723     return false;
 724 }
 725
 726 /* If the current token is integer X, skips it and returns true.  Otherwise,
 727    returns false. */
 728 bool
 729 lex_match_int (struct lexer *lexer, int x)
 730 {
 731   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 732     {
 733       lex_get (lexer);
 734       return true;
 735     }
 736   else
 737     return false;
 738 }
 739 \f
 740 /* Forced matches. */
 741
 742 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 743    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 744    false.
 745
 746    IDENTIFIER must be an ASCII string. */
 747 bool
 748 lex_force_match_id (struct lexer *lexer, const char *identifier)
 749 {
 750   if (lex_match_id (lexer, identifier))
 751     return true;
 752   else
 753     {
 754       lex_error_expecting (lexer, identifier);
 755       return false;
 756     }
 757 }
 758
 759 /* If the current token has the specified TYPE, skips it and returns true.
 760    Otherwise, reports an error and returns false. */
 761 bool
 762 lex_force_match (struct lexer *lexer, enum token_type type)
 763 {
 764   if (lex_token (lexer) == type)
 765     {
 766       lex_get (lexer);
 767       return true;
 768     }
 769   else
 770     {
 771       const char *type_string = token_type_to_string (type);
 772       if (type_string)
 773         {
 774           char *s = xasprintf ("`%s'", type_string);
 775           lex_error_expecting (lexer, s);
 776           free (s);
 777         }
 778       else
 779         lex_error_expecting (lexer, token_type_to_name (type));
 780
 781       return false;
 782     }
 783 }
 784
 785 /* If the current token is a string, does nothing and returns true.
 786    Otherwise, reports an error and returns false. */
 787 bool
 788 lex_force_string (struct lexer *lexer)
 789 {
 790   if (lex_is_string (lexer))
 791     return true;
 792   else
 793     {
 794       lex_error (lexer, _("expecting string"));
 795       return false;
 796     }
 797 }
 798
 799 /* If the current token is a string or an identifier, does nothing and returns
 800    true.  Otherwise, reports an error and returns false.
 801
 802    This is meant for use in syntactic situations where we want to encourage the
 803    user to supply a quoted string, but for compatibility we also accept
 804    identifiers.  (One example of such a situation is file names.)  Therefore,
 805    the error message issued when the current token is wrong only says that a
 806    string is expected and doesn't mention that an identifier would also be
 807    accepted. */
 808 bool
 809 lex_force_string_or_id (struct lexer *lexer)
 810 {
 811   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 812 }
 813
 814 /* If the current token is an integer, does nothing and returns true.
 815    Otherwise, reports an error and returns false. */
 816 bool
 817 lex_force_int (struct lexer *lexer)
 818 {
 819   if (lex_is_integer (lexer))
 820     return true;
 821   else
 822     {
 823       lex_error (lexer, _("expecting integer"));
 824       return false;
 825     }
 826 }
 827
 828 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 829    nothing and returns true.  Otherwise, reports an error and returns false.
 830    If NAME is nonnull, then it is used in the error message. */
 831 bool
 832 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 833 {
 834   bool is_integer = lex_is_integer (lexer);
 835   bool too_small = is_integer && lex_integer (lexer) < min;
 836   bool too_big = is_integer && lex_integer (lexer) > max;
 837   if (is_integer && !too_small && !too_big)
 838     return true;
 839
 840   if (min > max)
 841     {
 842       /* Weird, maybe a bug in the caller.  Just report that we needed an
 843          integer. */
 844       if (name)
 845         lex_error (lexer, _("Integer expected for %s."), name);
 846       else
 847         lex_error (lexer, _("Integer expected."));
 848     }
 849   else if (min == max)
 850     {
 851       if (name)
 852         lex_error (lexer, _("Expected %ld for %s."), min, name);
 853       else
 854         lex_error (lexer, _("Expected %ld."), min);
 855     }
 856   else if (min + 1 == max)
 857     {
 858       if (name)
 859         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 860       else
 861         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 862     }
 863   else
 864     {
 865       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 866       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 867
 868       if (report_lower_bound && report_upper_bound)
 869         {
 870           if (name)
 871             lex_error (lexer,
 872                        _("Expected integer between %ld and %ld for %s."),
 873                        min, max, name);
 874           else
 875             lex_error (lexer, _("Expected integer between %ld and %ld."),
 876                        min, max);
 877         }
 878       else if (report_lower_bound)
 879         {
 880           if (min == 0)
 881             {
 882               if (name)
 883                 lex_error (lexer, _("Expected non-negative integer for %s."),
 884                            name);
 885               else
 886                 lex_error (lexer, _("Expected non-negative integer."));
 887             }
 888           else if (min == 1)
 889             {
 890               if (name)
 891                 lex_error (lexer, _("Expected positive integer for %s."),
 892                            name);
 893               else
 894                 lex_error (lexer, _("Expected positive integer."));
 895             }
 896         }
 897       else if (report_upper_bound)
 898         {
 899           if (name)
 900             lex_error (lexer,
 901                        _("Expected integer less than or equal to %ld for %s."),
 902                        max, name);
 903           else
 904             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 905                        max);
 906         }
 907       else
 908         {
 909           if (name)
 910             lex_error (lexer, _("Integer expected for %s."), name);
 911           else
 912             lex_error (lexer, _("Integer expected."));
 913         }
 914     }
 915   return false;
 916 }
 917
 918 /* If the current token is a number, does nothing and returns true.
 919    Otherwise, reports an error and returns false. */
 920 bool
 921 lex_force_num (struct lexer *lexer)
 922 {
 923   if (lex_is_number (lexer))
 924     return true;
 925
 926   lex_error (lexer, _("expecting number"));
 927   return false;
 928 }
 929
 930 /* If the current token is an identifier, does nothing and returns true.
 931    Otherwise, reports an error and returns false. */
 932 bool
 933 lex_force_id (struct lexer *lexer)
 934 {
 935   if (lex_token (lexer) == T_ID)
 936     return true;
 937
 938   lex_error (lexer, _("expecting identifier"));
 939   return false;
 940 }
 941 \f
 942 /* Token accessors. */
 943
 944 /* Returns the type of LEXER's current token. */
 945 enum token_type
 946 lex_token (const struct lexer *lexer)
 947 {
 948   return lex_next_token (lexer, 0);
 949 }
 950
 951 /* Returns the number in LEXER's current token.
 952
 953    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 954    tokens this function will always return zero. */
 955 double
 956 lex_tokval (const struct lexer *lexer)
 957 {
 958   return lex_next_tokval (lexer, 0);
 959 }
 960
 961 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
 962
 963    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 964    this functions this function will always return NULL.
 965
 966    The UTF-8 encoding of the returned string is correct for variable names and
 967    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 968    data_in() to use it in a "union value".  */
 969 const char *
 970 lex_tokcstr (const struct lexer *lexer)
 971 {
 972   return lex_next_tokcstr (lexer, 0);
 973 }
 974
 975 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
 976    null-terminated (but the null terminator is not included in the returned
 977    substring's 'length').
 978
 979    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
 980    this functions this function will always return NULL.
 981
 982    The UTF-8 encoding of the returned string is correct for variable names and
 983    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
 984    data_in() to use it in a "union value".  */
 985 struct substring
 986 lex_tokss (const struct lexer *lexer)
 987 {
 988   return lex_next_tokss (lexer, 0);
 989 }
 990 \f
 991 /* Looking ahead.
 992
 993    A value of 0 for N as an argument to any of these functions refers to the
 994    current token.  Lookahead is limited to the current command.  Any N greater
 995    than the number of tokens remaining in the current command will be treated
 996    as referring to a T_ENDCMD token. */
 997
 998 static const struct lex_token *
 999 lex_next__ (const struct lexer *lexer_, int n)
1000 {
1001   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1002   struct lex_source *src = lex_source__ (lexer);
1003
1004   if (src != NULL)
1005     return lex_source_next__ (src, n);
1006   else
1007     {
1008       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1009       return &stop_token;
1010     }
1011 }
1012
1013 static const struct lex_token *
1014 lex_source_next__ (const struct lex_source *src_, int n)
1015 {
1016   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1017
1018   if (n < 0)
1019     {
1020       if (-n <= src->parse_ofs)
1021         return src->parse[src->parse_ofs - (-n)];
1022       else
1023         {
1024           static const struct lex_token endcmd_token
1025             = { .token = { .type = T_ENDCMD } };
1026           return &endcmd_token;
1027         }
1028     }
1029
1030   while (src->n_parse - src->parse_ofs <= n)
1031     {
1032       if (src->n_parse > 0)
1033         {
1034           const struct lex_token *t = src->parse[src->n_parse - 1];
1035           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1036             return t;
1037         }
1038
1039       lex_source_get_parse (src);
1040     }
1041
1042   return src->parse[src->parse_ofs + n];
1043 }
1044
1045 /* Returns the "struct token" of the token N after the current one in LEXER.
1046    The returned pointer can be invalidated by pretty much any succeeding call
1047    into the lexer, although the string pointer within the returned token is
1048    only invalidated by consuming the token (e.g. with lex_get()). */
1049 const struct token *
1050 lex_next (const struct lexer *lexer, int n)
1051 {
1052   return &lex_next__ (lexer, n)->token;
1053 }
1054
1055 /* Returns the type of the token N after the current one in LEXER. */
1056 enum token_type
1057 lex_next_token (const struct lexer *lexer, int n)
1058 {
1059   return lex_next (lexer, n)->type;
1060 }
1061
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1071
1072 /* Returns the null-terminated string in the token N after the current one, in
1073    UTF-8 encoding.
1074
1075    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1076    this functions this function will always return NULL.
1077
1078    The UTF-8 encoding of the returned string is correct for variable names and
1079    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1080    data_in() to use it in a "union value".  */
1081 const char *
1082 lex_next_tokcstr (const struct lexer *lexer, int n)
1083 {
1084   return lex_next_tokss (lexer, n).string;
1085 }
1086
1087 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1088    The string is null-terminated (but the null terminator is not included in
1089    the returned substring's 'length').
1090
1091    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1092    tokens this functions this function will always return NULL.
1093
1094    The UTF-8 encoding of the returned string is correct for variable names and
1095    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1096    data_in() to use it in a "union value".  */
1097 struct substring
1098 lex_next_tokss (const struct lexer *lexer, int n)
1099 {
1100   return lex_next (lexer, n)->string;
1101 }
1102
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, returns a newly allocated empty string, so
   that the caller can still free() the result unconditionally. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source, so it must not be
     handed a null pointer when there is nothing left to read. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1115
1116 /* Returns true if the token N ahead of the current one was produced by macro
1117    expansion, false otherwise. */
1118 bool
1119 lex_next_is_from_macro (const struct lexer *lexer, int n)
1120 {
1121   return lex_next__ (lexer, n)->macro_rep != NULL;
1122 }
1123
1124 static bool
1125 lex_tokens_match (const struct token *actual, const struct token *expected)
1126 {
1127   if (actual->type != expected->type)
1128     return false;
1129
1130   switch (actual->type)
1131     {
1132     case T_POS_NUM:
1133     case T_NEG_NUM:
1134       return actual->number == expected->number;
1135
1136     case T_ID:
1137       return lex_id_match (expected->string, actual->string);
1138
1139     case T_STRING:
1140       return (actual->string.length == expected->string.length
1141               && !memcmp (actual->string.string, expected->string.string,
1142                           actual->string.length));
1143
1144     default:
1145       return true;
1146     }
1147 }
1148
1149 static size_t
1150 lex_at_phrase__ (struct lexer *lexer, const char *s)
1151 {
1152   struct string_lexer slex;
1153   struct token token;
1154
1155   size_t i = 0;
1156   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1157   while (string_lexer_next (&slex, &token))
1158     {
1159       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1160       token_uninit (&token);
1161       if (!match)
1162         return 0;
1163     }
1164   return i;
1165 }
1166
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  Does not consume any tokens.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens != 0;
}
1178
/* If LEXER is positioned at the sequence of tokens that may be parsed from S,
   skips past those tokens and returns true.  Otherwise, leaves LEXER alone
   and returns false.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1193
/* Returns the number of new-line characters ('\n') among the LENGTH bytes
   starting at S.  S need not be null-terminated and is not modified. */
static int
count_newlines (const char *s, size_t length)
{
  const char *end = s + length;
  int n_newlines = 0;

  /* Each iteration finds the next new-line at or after P, then resumes the
     search just past it. */
  for (const char *p = s; (p = memchr (p, '\n', end - p)) != NULL; p++)
    n_newlines++;

  return n_newlines;
}
1209
1210 static int
1211 lex_token_get_last_line_number (const struct lex_source *src,
1212                                 const struct lex_token *token)
1213 {
1214   if (token->first_line == 0)
1215     return 0;
1216   else
1217     {
1218       char *token_str = &src->buffer[token->token_pos];
1219       return token->first_line + count_newlines (token_str, token->token_len) + 1;
1220     }
1221 }
1222
1223 static int
1224 lex_token_get_column__ (const struct lex_source *src, size_t offset)
1225 {
1226   const char *newline = memrchr (src->buffer, '\n', offset);
1227   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1228   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1229 }
1230
1231 static int
1232 lex_token_get_first_column (const struct lex_source *src,
1233                             const struct lex_token *token)
1234 {
1235   return lex_token_get_column__ (src, token->token_pos);
1236 }
1237
1238 static int
1239 lex_token_get_last_column (const struct lex_source *src,
1240                            const struct lex_token *token)
1241 {
1242   return lex_token_get_column__ (src, token->token_pos + token->token_len);
1243 }
1244
1245 static struct msg_location
1246 lex_token_location (const struct lex_source *src,
1247                     const struct lex_token *t0,
1248                     const struct lex_token *t1)
1249 {
1250   return (struct msg_location) {
1251     .file_name = src->reader->file_name,
1252     .first_line = t0->first_line,
1253     .last_line = lex_token_get_last_line_number (src, t1),
1254     .first_column = lex_token_get_first_column (src, t0),
1255     .last_column = lex_token_get_last_column (src, t1),
1256   };
1257 }
1258
1259 static struct msg_location *
1260 lex_token_location_rw (const struct lex_source *src,
1261                        const struct lex_token *t0,
1262                        const struct lex_token *t1)
1263 {
1264   struct msg_location location = lex_token_location (src, t0, t1);
1265   return msg_location_dup (&location);
1266 }
1267
/* Returns a duplicated msg_location that spans the tokens at offsets N0
   through N1, inclusive, from the current token in SRC. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *t0 = lex_source_next__ (src, n0);
  const struct lex_token *t1 = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, t0, t1);
}
1275
1276 /* Returns the 1-based line number of the start of the syntax that represents
1277    the token N after the current one in LEXER.  Returns 0 for a T_STOP token or
1278    if the token is drawn from a source that does not have line numbers. */
1279 int
1280 lex_get_first_line_number (const struct lexer *lexer, int n)
1281 {
1282   const struct lex_source *src = lex_source__ (lexer);
1283   return src ? lex_source_next__ (src, n)->first_line : 0;
1284 }
1285
/* Returns the 1-based line number of the end of the syntax that represents the
   token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has no
   current source, for a T_STOP token, or if the token is drawn from a source
   that does not have line numbers.

   Most of the time, a single token is wholly within a single line of syntax,
   but there are two exceptions: a T_STRING token can be made up of multiple
   segments on adjacent lines connected with "+" punctuators, and a T_NEG_NUM
   token can consist of a "-" on one line followed by the number on the next.
 */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_line_number (src, token);
}
1303
/* Returns the 1-based column number of the start of the syntax that represents
   the token N after the current one in LEXER.  Returns 0 if LEXER has no
   current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_first_column (src, token);
}
1317
/* Returns the 1-based column number of the end of the syntax that represents
   the token N after the current one in LEXER, plus 1.  Returns 0 if LEXER has
   no current source.

   Column numbers are measured according to the width of characters as shown in
   a typical fixed-width font, in which CJK characters have width 2 and
   combining characters have width 0.  */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);
  if (src == NULL)
    return 0;

  const struct lex_token *token = lex_source_next__ (src, n);
  return lex_token_get_last_column (src, token);
}
1331
1332 /* Returns the name of the syntax file from which the current command is drawn.
1333    Returns NULL for a T_STOP token or if the command's source does not have
1334    line numbers.
1335
1336    There is no version of this function that takes an N argument because
1337    lookahead only works to the end of a command and any given command is always
1338    within a single syntax file. */
1339 const char *
1340 lex_get_file_name (const struct lexer *lexer)
1341 {
1342   struct lex_source *src = lex_source__ (lexer);
1343   return src == NULL ? NULL : src->reader->file_name;
1344 }
1345
1346 /* Returns a newly allocated msg_location for the syntax that represents tokens
1347    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1348    must eventually free the location (with msg_location_destroy()). */
1349 struct msg_location *
1350 lex_get_location (const struct lexer *lexer, int n0, int n1)
1351 {
1352   struct msg_location *loc = lex_get_lines (lexer, n0, n1);
1353   loc->first_column = lex_get_first_column (lexer, n0);
1354   loc->last_column = lex_get_last_column (lexer, n1);
1355   return loc;
1356 }
1357
1358 /* Returns a newly allocated msg_location for the syntax that represents tokens
1359    with 0-based offsets N0...N1, inclusive, from the current token.  The
1360    location only covers the tokens' lines, not the columns.  The caller must
1361    eventually free the location (with msg_location_destroy()). */
1362 struct msg_location *
1363 lex_get_lines (const struct lexer *lexer, int n0, int n1)
1364 {
1365   struct msg_location *loc = xmalloc (sizeof *loc);
1366   *loc = (struct msg_location) {
1367     .file_name = xstrdup_if_nonnull (lex_get_file_name (lexer)),
1368     .first_line = lex_get_first_line_number (lexer, n0),
1369     .last_line = lex_get_last_line_number (lexer, n1),
1370   };
1371   return loc;
1372 }
1373
1374 const char *
1375 lex_get_encoding (const struct lexer *lexer)
1376 {
1377   struct lex_source *src = lex_source__ (lexer);
1378   return src == NULL ? NULL : src->reader->encoding;
1379 }
1380
1381 /* Returns the syntax mode for the syntax file from which the current drawn is
1382    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1383    does not have line numbers.
1384
1385    There is no version of this function that takes an N argument because
1386    lookahead only works to the end of a command and any given command is always
1387    within a single syntax file. */
1388 enum segmenter_mode
1389 lex_get_syntax_mode (const struct lexer *lexer)
1390 {
1391   struct lex_source *src = lex_source__ (lexer);
1392   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1393 }
1394
1395 /* Returns the error mode for the syntax file from which the current drawn is
1396    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1397    source does not have line numbers.
1398
1399    There is no version of this function that takes an N argument because
1400    lookahead only works to the end of a command and any given command is always
1401    within a single syntax file. */
1402 enum lex_error_mode
1403 lex_get_error_mode (const struct lexer *lexer)
1404 {
1405   struct lex_source *src = lex_source__ (lexer);
1406   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1407 }
1408
1409 /* If the source that LEXER is currently reading has error mode
1410    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1411    token to be read comes directly from whatever is next read from the stream.
1412
1413    It makes sense to call this function after encountering an error in a
1414    command entered on the console, because usually the user would prefer not to
1415    have cascading errors. */
1416 void
1417 lex_interactive_reset (struct lexer *lexer)
1418 {
1419   struct lex_source *src = lex_source__ (lexer);
1420   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1421     {
1422       src->length = 0;
1423       src->journal_pos = src->seg_pos = 0;
1424       src->n_newlines = 0;
1425       src->suppress_next_newline = false;
1426       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1427                                        false);
1428       lex_stage_clear (&src->pp);
1429       lex_stage_clear (&src->merge);
1430       lex_source_clear_parse (src);
1431       lex_source_push_endcmd__ (src);
1432     }
1433 }
1434
1435 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1436 void
1437 lex_discard_rest_of_command (struct lexer *lexer)
1438 {
1439   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1440     lex_get (lexer);
1441 }
1442
1443 /* Discards all lookahead tokens in LEXER, then discards all input sources
1444    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1445    runs out of input sources. */
1446 void
1447 lex_discard_noninteractive (struct lexer *lexer)
1448 {
1449   struct lex_source *src = lex_source__ (lexer);
1450
1451   if (src != NULL)
1452     {
1453       lex_stage_clear (&src->pp);
1454       lex_stage_clear (&src->merge);
1455       lex_source_clear_parse (src);
1456
1457       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1458            src = lex_source__ (lexer))
1459         lex_source_destroy (src);
1460     }
1461 }
1462 \f
1463 static void
1464 lex_source_expand__ (struct lex_source *src)
1465 {
1466   if (src->length >= src->allocated)
1467     src->buffer = x2realloc (src->buffer, &src->allocated);
1468 }
1469
1470 static void
1471 lex_source_read__ (struct lex_source *src)
1472 {
1473   do
1474     {
1475       lex_source_expand__ (src);
1476
1477       size_t space = src->allocated - src->length;
1478       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1479       size_t n = src->reader->class->read (src->reader,
1480                                            &src->buffer[src->length],
1481                                            space, prompt);
1482       assert (n <= space);
1483
1484       if (n == 0)
1485         {
1486           /* End of input. */
1487           src->reader->eof = true;
1488           lex_source_expand__ (src);
1489           return;
1490         }
1491
1492       src->length += n;
1493     }
1494   while (!memchr (&src->buffer[src->seg_pos], '\n',
1495                   src->length - src->seg_pos));
1496 }
1497
1498 static struct lex_source *
1499 lex_source__ (const struct lexer *lexer)
1500 {
1501   return (ll_is_empty (&lexer->sources) ? NULL
1502           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1503 }
1504
1505 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1506    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1507    and N1 are both zero, this requests the syntax for the current token.)  The
1508    caller must eventually free the returned string (with free()).  The syntax
1509    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1510    for example, it may include comments, spaces, and new-lines if it spans
1511    multiple tokens.  Macro expansion, however, has already been performed. */
1512 static char *
1513 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1514 {
1515   struct string s = DS_EMPTY_INITIALIZER;
1516   for (size_t i = n0; i <= n1; )
1517     {
1518       /* Find [I,J) as the longest sequence of tokens not produced by macro
1519          expansion, or otherwise the longest sequence expanded from a single
1520          macro call. */
1521       const struct lex_token *first = lex_source_next__ (src, i);
1522       size_t j;
1523       for (j = i + 1; j <= n1; j++)
1524         {
1525           const struct lex_token *cur = lex_source_next__ (src, j);
1526           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1527               || first->macro_rep != cur->macro_rep)
1528             break;
1529         }
1530       const struct lex_token *last = lex_source_next__ (src, j - 1);
1531
1532       /* Now add the syntax for this sequence of tokens to SRC. */
1533       if (!ds_is_empty (&s))
1534         ds_put_byte (&s, ' ');
1535       if (!first->macro_rep)
1536         {
1537           size_t start = first->token_pos;
1538           size_t end = last->token_pos + last->token_len;
1539           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1540         }
1541       else
1542         {
1543           size_t start = first->ofs;
1544           size_t end = last->ofs + last->len;
1545           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1546                                            end - start));
1547         }
1548
1549       i = j;
1550     }
1551   return ds_steal_cstr (&s);
1552 }
1553
1554 static bool
1555 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1556 {
1557   for (size_t i = n0; i <= n1; i++)
1558     if (lex_source_next__ (src, i)->macro_rep)
1559       return true;
1560   return false;
1561 }
1562
1563 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1564    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1565    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1566    the original form supplied to the lexer so that, for example, it may include
1567    comments, spaces, and new-lines if it spans multiple tokens.
1568
1569    Returns an empty string if the token range doesn't include a macro call.
1570
1571    The caller must not modify or free the returned string. */
1572 static struct substring
1573 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1574 {
1575   if (!lex_source_contains_macro_call (src, n0, n1))
1576     return ss_empty ();
1577
1578   const struct lex_token *token0 = lex_source_next__ (src, n0);
1579   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1580   size_t start = token0->token_pos;
1581   size_t end = token1->token_pos + token1->token_len;
1582
1583   return ss_buffer (&src->buffer[start], end - start);
1584 }
1585
1586 static void
1587 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1588                          const char *format, va_list args)
1589 {
1590   const struct lex_token *token;
1591   struct string s;
1592
1593   ds_init_empty (&s);
1594
1595   token = lex_source_next__ (src, n0);
1596   if (token->token.type == T_ENDCMD)
1597     ds_put_cstr (&s, _("Syntax error at end of command"));
1598   else
1599     {
1600       /* Get the syntax that caused the error. */
1601       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1602       char syntax[64];
1603       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1604       free (raw_syntax);
1605
1606       /* Get the macro call(s) that expanded to the syntax that caused the
1607          error. */
1608       char call[64];
1609       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1610                      call, sizeof call);
1611
1612       if (syntax[0])
1613         {
1614           if (call[0])
1615             ds_put_format (&s,
1616                            _("Syntax error at `%s' (in expansion of `%s')"),
1617                            syntax, call);
1618           else
1619             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1620         }
1621       else
1622         {
1623           if (call[0])
1624             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1625                            call);
1626           else
1627             ds_put_cstr (&s, _("Syntax error"));
1628         }
1629     }
1630
1631   if (format)
1632     {
1633       ds_put_cstr (&s, ": ");
1634       ds_put_vformat (&s, format, args);
1635     }
1636   if (ds_last (&s) != '.')
1637     ds_put_byte (&s, '.');
1638
1639   struct msg *m = xmalloc (sizeof *m);
1640   *m = (struct msg) {
1641     .category = MSG_C_SYNTAX,
1642     .severity = MSG_S_ERROR,
1643     .location = lex_source_get_location (src, n0, n1),
1644     .text = ds_steal_cstr (&s),
1645   };
1646   msg_emit (m);
1647 }
1648
1649 static void
1650 lex_get_error (struct lex_source *src, const struct lex_token *token)
1651 {
1652   char syntax[64];
1653   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1654                  syntax, sizeof syntax);
1655
1656   struct string s = DS_EMPTY_INITIALIZER;
1657   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1658   ds_put_format (&s, ": %s", token->token.string.string);
1659
1660   struct msg *m = xmalloc (sizeof *m);
1661   *m = (struct msg) {
1662     .category = MSG_C_SYNTAX,
1663     .severity = MSG_S_ERROR,
1664     .location = lex_token_location_rw (src, token, token),
1665     .text = ds_steal_cstr (&s),
1666   };
1667   msg_emit (m);
1668 }
1669
1670 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1671    underlying lex_reader if necessary.  Returns true if a new token was added
1672    to SRC's deque, false otherwise.  The caller should retry failures unless
1673    SRC's 'eof' marker was set to true indicating that there will be no more
1674    tokens from this source. */
1675 static bool
1676 lex_source_try_get_pp (struct lex_source *src)
1677 {
1678   /* Append a new token to SRC and initialize it. */
1679   struct lex_token *token = xmalloc (sizeof *token);
1680   token->token = (struct token) { .type = T_STOP };
1681   token->macro_rep = NULL;
1682   token->ref_cnt = NULL;
1683   token->token_pos = src->seg_pos;
1684   if (src->reader->line_number > 0)
1685     token->first_line = src->reader->line_number + src->n_newlines;
1686   else
1687     token->first_line = 0;
1688
1689   /* Extract a segment. */
1690   const char *segment;
1691   enum segment_type seg_type;
1692   int seg_len;
1693   for (;;)
1694     {
1695       segment = &src->buffer[src->seg_pos];
1696       seg_len = segmenter_push (&src->segmenter, segment,
1697                                 src->length - src->seg_pos,
1698                                 src->reader->eof, &seg_type);
1699       if (seg_len >= 0)
1700         break;
1701
1702       /* The segmenter needs more input to produce a segment. */
1703       assert (!src->reader->eof);
1704       lex_source_read__ (src);
1705     }
1706
1707   /* Update state based on the segment. */
1708   token->token_len = seg_len;
1709   src->seg_pos += seg_len;
1710   if (seg_type == SEG_NEWLINE)
1711     src->n_newlines++;
1712
1713   /* Get a token from the segment. */
1714   enum tokenize_result result = token_from_segment (
1715     seg_type, ss_buffer (segment, seg_len), &token->token);
1716
1717   /* If we've reached the end of a line, or the end of a command, then pass
1718      the line to the output engine as a syntax text item.  */
1719   int n_lines = seg_type == SEG_NEWLINE;
1720   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1721     {
1722       n_lines++;
1723       src->suppress_next_newline = true;
1724     }
1725   else if (n_lines > 0 && src->suppress_next_newline)
1726     {
1727       n_lines--;
1728       src->suppress_next_newline = false;
1729     }
1730   for (int i = 0; i < n_lines; i++)
1731     {
1732       /* Beginning of line. */
1733       const char *line = &src->buffer[src->journal_pos];
1734
1735       /* Calculate line length, including \n or \r\n end-of-line if present.
1736
1737          We use src->head even though that may be beyond what we've actually
1738          converted to tokens (which is only through line_pos).  That's because,
1739          if we're emitting the line due to SEG_END_COMMAND, we want to take the
1740          whole line through the newline, not just through the '.'. */
1741       size_t max_len = src->length - src->journal_pos;
1742       const char *newline = memchr (line, '\n', max_len);
1743       size_t line_len = newline ? newline - line + 1 : max_len;
1744
1745       /* Calculate line length excluding end-of-line. */
1746       size_t copy_len = line_len;
1747       if (copy_len > 0 && line[copy_len - 1] == '\n')
1748         copy_len--;
1749       if (copy_len > 0 && line[copy_len - 1] == '\r')
1750         copy_len--;
1751
1752       /* Submit the line as syntax. */
1753       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1754                                                    xmemdup0 (line, copy_len),
1755                                                    NULL));
1756
1757       src->journal_pos += line_len;
1758     }
1759
1760   switch (result)
1761     {
1762     case TOKENIZE_ERROR:
1763       lex_get_error (src, token);
1764       /* Fall through. */
1765     case TOKENIZE_EMPTY:
1766       lex_token_destroy (token);
1767       return false;
1768
1769     case TOKENIZE_TOKEN:
1770       if (token->token.type == T_STOP)
1771         {
1772           token->token.type = T_ENDCMD;
1773           src->eof = true;
1774         }
1775       lex_stage_push_last (&src->pp, token);
1776       return true;
1777     }
1778   NOT_REACHED ();
1779 }
1780
1781 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1782    failure.  On failure, the end of SRC has been reached and no more tokens
1783    will be forthcoming from it.
1784
1785    Does not make the new token available for lookahead yet; the caller must
1786    adjust SRC's 'middle' pointer to do so. */
1787 static bool
1788 lex_source_get_pp (struct lex_source *src)
1789 {
1790   while (!src->eof)
1791     if (lex_source_try_get_pp (src))
1792       return true;
1793   return false;
1794 }
1795
1796 static bool
1797 lex_source_try_get_merge (const struct lex_source *src_)
1798 {
1799   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1800
1801   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1802     return false;
1803
1804   if (!settings_get_mexpand ())
1805     {
1806       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1807       return true;
1808     }
1809
1810   /* Now pass tokens one-by-one to the macro expander.
1811
1812      In the common case where there is no macro to expand, the loop is not
1813      entered.  */
1814   struct macro_call *mc;
1815   int n_call = macro_call_create (src->lexer->macros,
1816                                   &lex_stage_first (&src->pp)->token, &mc);
1817   for (int ofs = 1; !n_call; ofs++)
1818     {
1819       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1820         {
1821           /* This should not be reachable because we always get a T_ENDCMD at
1822              the end of an input file (transformed from T_STOP by
1823              lex_source_try_get_pp()) and the macro_expander should always
1824              terminate expansion on T_ENDCMD. */
1825           NOT_REACHED ();
1826         }
1827
1828       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1829       size_t start = t->token_pos;
1830       size_t end = t->token_pos + t->token_len;
1831       const struct macro_token mt = {
1832         .token = t->token,
1833         .syntax = ss_buffer (&src->buffer[start], end - start),
1834       };
1835       const struct msg_location loc = lex_token_location (src, t, t);
1836       n_call = macro_call_add (mc, &mt, &loc);
1837     }
1838   if (n_call < 0)
1839     {
1840       /* False alarm: no macro expansion after all.  Use first token as
1841          lookahead.  We'll retry macro expansion from the second token next
1842          time around. */
1843       macro_call_destroy (mc);
1844       lex_stage_shift (&src->merge, &src->pp, 1);
1845       return true;
1846     }
1847
1848   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1849      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1850      Expand them.  */
1851   const struct lex_token *c0 = lex_stage_first (&src->pp);
1852   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1853   struct macro_tokens expansion = { .n = 0 };
1854   struct msg_location loc = lex_token_location (src, c0, c1);
1855   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1856   macro_call_destroy (mc);
1857
1858   /* Convert the macro expansion into syntax for possible error messages
1859      later. */
1860   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1861   size_t *len = xnmalloc (expansion.n, sizeof *len);
1862   struct string s = DS_EMPTY_INITIALIZER;
1863   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1864
1865   if (settings_get_mprint ())
1866     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1867                                           _("Macro Expansion")));
1868
1869   /* Append the macro expansion tokens to the lookahead. */
1870   if (expansion.n > 0)
1871     {
1872       char *macro_rep = ds_steal_cstr (&s);
1873       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1874       *ref_cnt = expansion.n;
1875       for (size_t i = 0; i < expansion.n; i++)
1876         {
1877           struct lex_token *token = xmalloc (sizeof *token);
1878           *token = (struct lex_token) {
1879             .token = expansion.mts[i].token,
1880             .token_pos = c0->token_pos,
1881             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1882             .first_line = c0->first_line,
1883             .macro_rep = macro_rep,
1884             .ofs = ofs[i],
1885             .len = len[i],
1886             .ref_cnt = ref_cnt,
1887           };
1888           lex_stage_push_last (&src->merge, token);
1889
1890           ss_dealloc (&expansion.mts[i].syntax);
1891         }
1892     }
1893   else
1894     ds_destroy (&s);
1895   free (expansion.mts);
1896   free (ofs);
1897   free (len);
1898
1899   /* Destroy the tokens for the call. */
1900   for (size_t i = 0; i < n_call; i++)
1901     lex_stage_pop_first (&src->pp);
1902
1903   return expansion.n > 0;
1904 }
1905
1906 /* Attempts to obtain at least one new token into 'merge' in SRC.
1907
1908    Returns true if successful, false on failure.  In the latter case, SRC is
1909    exhausted and 'src->eof' is now true. */
1910 static bool
1911 lex_source_get_merge (struct lex_source *src)
1912 {
1913   while (!src->eof)
1914     if (lex_source_try_get_merge (src))
1915       return true;
1916   return false;
1917 }
1918
1919 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1920
1921    Returns true if successful, false on failure.  In the latter case, SRC is
1922    exhausted and 'src->eof' is now true. */
1923 static bool
1924 lex_source_get_parse (struct lex_source *src)
1925 {
1926   struct merger m = MERGER_INIT;
1927   struct token out;
1928   for (size_t i = 0; ; i++)
1929     {
1930       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1931         {
1932           /* We always get a T_ENDCMD at the end of an input file
1933              (transformed from T_STOP by lex_source_try_get_pp()) and
1934              merger_add() should never return -1 on T_ENDCMD. */
1935           assert (lex_stage_is_empty (&src->merge));
1936           return false;
1937         }
1938
1939       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1940                                &out);
1941       if (!retval)
1942         {
1943           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1944           return true;
1945         }
1946       else if (retval > 0)
1947         {
1948           /* Add a token that merges all the tokens together. */
1949           const struct lex_token *first = lex_stage_first (&src->merge);
1950           const struct lex_token *last = lex_stage_nth (&src->merge,
1951                                                         retval - 1);
1952           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
1953           struct lex_token *t = xmalloc (sizeof *t);
1954           *t = (struct lex_token) {
1955             .token = out,
1956             .token_pos = first->token_pos,
1957             .token_len = (last->token_pos - first->token_pos) + last->token_len,
1958             .first_line = first->first_line,
1959
1960             /* This works well if all the tokens were not expanded from macros,
1961                or if they came from the same macro expansion.  It just gives up
1962                in the other (corner) cases. */
1963             .macro_rep = macro ? first->macro_rep : NULL,
1964             .ofs = macro ? first->ofs : 0,
1965             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
1966             .ref_cnt = macro ? first->ref_cnt : NULL,
1967           };
1968           if (t->ref_cnt)
1969             ++*t->ref_cnt;
1970           lex_source_push_parse (src, t);
1971
1972           for (int i = 0; i < retval; i++)
1973             lex_stage_pop_first (&src->merge);
1974           return true;
1975         }
1976     }
1977 }
1978 \f
1979 static void
1980 lex_source_push_endcmd__ (struct lex_source *src)
1981 {
1982   assert (src->n_parse == 0);
1983
1984   struct lex_token *token = xmalloc (sizeof *token);
1985   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
1986   lex_source_push_parse (src, token);
1987 }
1988
1989 static void
1990 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
1991 {
1992   if (src->n_parse >= src->allocated_parse)
1993     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
1994                              sizeof *src->parse);
1995   src->parse[src->n_parse++] = token;
1996 }
1997
1998 static void
1999 lex_source_clear_parse (struct lex_source *src)
2000 {
2001   for (size_t i = 0; i < src->n_parse; i++)
2002     lex_token_destroy (src->parse[i]);
2003   src->n_parse = src->parse_ofs = 0;
2004 }
2005
2006 static struct lex_source *
2007 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2008 {
2009   struct lex_source *src = xmalloc (sizeof *src);
2010   *src = (struct lex_source) {
2011     .reader = reader,
2012     .segmenter = segmenter_init (reader->syntax, false),
2013     .lexer = lexer,
2014   };
2015
2016   lex_source_push_endcmd__ (src);
2017
2018   return src;
2019 }
2020
2021 static void
2022 lex_source_destroy (struct lex_source *src)
2023 {
2024   char *file_name = src->reader->file_name;
2025   char *encoding = src->reader->encoding;
2026   if (src->reader->class->destroy != NULL)
2027     src->reader->class->destroy (src->reader);
2028   free (file_name);
2029   free (encoding);
2030   free (src->buffer);
2031   lex_stage_uninit (&src->pp);
2032   lex_stage_uninit (&src->merge);
2033   lex_source_clear_parse (src);
2034   free (src->parse);
2035   ll_remove (&src->ll);
2036   free (src);
2037 }
2038 \f
2039 struct lex_file_reader
2040   {
2041     struct lex_reader reader;
2042     struct u8_istream *istream;
2043   };
2044
2045 static struct lex_reader_class lex_file_reader_class;
2046
2047 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2048    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2049    ENCODING, which should take one of the forms accepted by
2050    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2051    mode of the new reader, respectively.
2052
2053    Returns a null pointer if FILE_NAME cannot be opened. */
2054 struct lex_reader *
2055 lex_reader_for_file (const char *file_name, const char *encoding,
2056                      enum segmenter_mode syntax,
2057                      enum lex_error_mode error)
2058 {
2059   struct lex_file_reader *r;
2060   struct u8_istream *istream;
2061
2062   istream = (!strcmp(file_name, "-")
2063              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2064              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2065   if (istream == NULL)
2066     {
2067       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2068       return NULL;
2069     }
2070
2071   r = xmalloc (sizeof *r);
2072   lex_reader_init (&r->reader, &lex_file_reader_class);
2073   r->reader.syntax = syntax;
2074   r->reader.error = error;
2075   r->reader.file_name = xstrdup (file_name);
2076   r->reader.encoding = xstrdup_if_nonnull (encoding);
2077   r->reader.line_number = 1;
2078   r->istream = istream;
2079
2080   return &r->reader;
2081 }
2082
2083 static struct lex_file_reader *
2084 lex_file_reader_cast (struct lex_reader *r)
2085 {
2086   return UP_CAST (r, struct lex_file_reader, reader);
2087 }
2088
2089 static size_t
2090 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2091                enum prompt_style prompt_style UNUSED)
2092 {
2093   struct lex_file_reader *r = lex_file_reader_cast (r_);
2094   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2095   if (n_read < 0)
2096     {
2097       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2098       return 0;
2099     }
2100   return n_read;
2101 }
2102
2103 static void
2104 lex_file_close (struct lex_reader *r_)
2105 {
2106   struct lex_file_reader *r = lex_file_reader_cast (r_);
2107
2108   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2109     {
2110       if (u8_istream_close (r->istream) != 0)
2111         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2112     }
2113   else
2114     u8_istream_free (r->istream);
2115
2116   free (r);
2117 }
2118
2119 static struct lex_reader_class lex_file_reader_class =
2120   {
2121     lex_file_read,
2122     lex_file_close
2123   };
2124 \f
2125 struct lex_string_reader
2126   {
2127     struct lex_reader reader;
2128     struct substring s;
2129     size_t offset;
2130   };
2131
2132 static struct lex_reader_class lex_string_reader_class;
2133
2134 /* Creates and returns a new lex_reader for the contents of S, which must be
2135    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2136    with ss_dealloc() when it is closed. */
2137 struct lex_reader *
2138 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2139 {
2140   struct lex_string_reader *r;
2141
2142   r = xmalloc (sizeof *r);
2143   lex_reader_init (&r->reader, &lex_string_reader_class);
2144   r->reader.syntax = SEG_MODE_AUTO;
2145   r->reader.encoding = xstrdup_if_nonnull (encoding);
2146   r->s = s;
2147   r->offset = 0;
2148
2149   return &r->reader;
2150 }
2151
2152 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2153    which must be encoded in ENCODING.  The caller retains ownership of S. */
2154 struct lex_reader *
2155 lex_reader_for_string (const char *s, const char *encoding)
2156 {
2157   struct substring ss;
2158   ss_alloc_substring (&ss, ss_cstr (s));
2159   return lex_reader_for_substring_nocopy (ss, encoding);
2160 }
2161
/* Formats FORMAT, a printf()-like format string, with the arguments that
   follow ENCODING, and creates and returns a new lex_reader whose input is
   the formatted text.  The formatted text is handed to the reader without
   copying it. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;

  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  struct lex_reader *r = lex_reader_for_substring_nocopy (ss_cstr (text),
                                                          encoding);
  va_end (args);

  return r;
}
2176
2177 static struct lex_string_reader *
2178 lex_string_reader_cast (struct lex_reader *r)
2179 {
2180   return UP_CAST (r, struct lex_string_reader, reader);
2181 }
2182
2183 static size_t
2184 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2185                  enum prompt_style prompt_style UNUSED)
2186 {
2187   struct lex_string_reader *r = lex_string_reader_cast (r_);
2188   size_t chunk;
2189
2190   chunk = MIN (n, r->s.length - r->offset);
2191   memcpy (buf, r->s.string + r->offset, chunk);
2192   r->offset += chunk;
2193
2194   return chunk;
2195 }
2196
2197 static void
2198 lex_string_close (struct lex_reader *r_)
2199 {
2200   struct lex_string_reader *r = lex_string_reader_cast (r_);
2201
2202   ss_dealloc (&r->s);
2203   free (r);
2204 }
2205
2206 static struct lex_reader_class lex_string_reader_class =
2207   {
2208     lex_string_read,
2209     lex_string_close
2210   };