pintos-os.org Git - pspp/blob - src/language/lexer/lexer.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 #include <config.h>
  18
  19 #include "language/lexer/lexer.h"
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <limits.h>
  24 #include <math.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <unictype.h>
  29 #include <unistd.h>
  30 #include <unistr.h>
  31
  32 #include "language/command.h"
  33 #include "language/lexer/macro.h"
  34 #include "language/lexer/scan.h"
  35 #include "language/lexer/segment.h"
  36 #include "language/lexer/token.h"
  37 #include "libpspp/assertion.h"
  38 #include "libpspp/cast.h"
  39 #include "libpspp/deque.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/intern.h"
  42 #include "libpspp/ll.h"
  43 #include "libpspp/message.h"
  44 #include "libpspp/misc.h"
  45 #include "libpspp/str.h"
  46 #include "libpspp/u8-istream.h"
  47 #include "output/journal.h"
  48 #include "output/output-item.h"
  49
  50 #include "gl/c-ctype.h"
  51 #include "gl/minmax.h"
  52 #include "gl/xalloc.h"
  53 #include "gl/xmemdup0.h"
  54
  55 #include "gettext.h"
  56 #define _(msgid) gettext (msgid)
  57 #define N_(msgid) msgid
  58
  59 /* A token within a lex_source. */
  60 struct lex_token
  61   {
  62     /* The regular token information. */
  63     struct token token;
  64
  65     /* For a token obtained through the lexer in an ordinary way, this is the
  66        location of the token in terms of the lex_source's buffer.
  67
  68        For a token produced through macro expansion, this is the entire macro
  69        call. */
  70     size_t token_pos;           /* Offset into src->buffer of token start. */
  71     size_t token_len;           /* Length of source for token in bytes. */
  72
  73     /* For a token obtained through macro expansion, this is just this token.
  74
  75        For a token obtained through the lexer in an ordinary way, these are
  76        nulls and zeros. */
  77     char *macro_rep;        /* The whole macro expansion. */
  78     size_t ofs;             /* Offset of this token in macro_rep. */
  79     size_t len;             /* Length of this token in macro_rep. */
  80     size_t *ref_cnt;        /* Number of lex_tokens that refer to macro_rep. */
  81   };
  82
  83 static struct msg_point lex_token_start_point (const struct lex_source *,
  84                                                const struct lex_token *);
  85 static struct msg_point lex_token_end_point (const struct lex_source *,
  86                                              const struct lex_token *);
  87
  88 /* Source offset of the last byte in TOKEN. */
  89 static size_t
  90 lex_token_end (const struct lex_token *token)
  91 {
  92   return token->token_pos + MAX (token->token_len, 1) - 1;
  93 }
  94
  95 static void
  96 lex_token_destroy (struct lex_token *t)
  97 {
  98   token_uninit (&t->token);
  99   if (t->ref_cnt)
 100     {
 101       assert (*t->ref_cnt > 0);
 102       if (!--*t->ref_cnt)
 103         {
 104           free (t->macro_rep);
 105           free (t->ref_cnt);
 106         }
 107     }
 108   free (t);
 109 }
 110 \f
 111 /* A deque of lex_tokens that comprises one stage in the token pipeline in a
 112    lex_source. */
 113 struct lex_stage
 114   {
 115     struct deque deque;
 116     struct lex_token **tokens;
 117   };
 118
 119 static void lex_stage_clear (struct lex_stage *);
 120 static void lex_stage_uninit (struct lex_stage *);
 121
 122 static size_t lex_stage_count (const struct lex_stage *);
 123 static bool lex_stage_is_empty (const struct lex_stage *);
 124
 125 static struct lex_token *lex_stage_first (struct lex_stage *);
 126 static struct lex_token *lex_stage_nth (struct lex_stage *, size_t ofs);
 127
 128 static void lex_stage_push_last (struct lex_stage *, struct lex_token *);
 129 static void lex_stage_pop_first (struct lex_stage *);
 130
 131 static void lex_stage_shift (struct lex_stage *dst, struct lex_stage *src,
 132                              size_t n);
 133
 134 /* Deletes all the tokens from STAGE. */
 135 static void
 136 lex_stage_clear (struct lex_stage *stage)
 137 {
 138   while (!deque_is_empty (&stage->deque))
 139     lex_stage_pop_first (stage);
 140 }
 141
 142 /* Deletes all the tokens from STAGE and frees storage for the deque. */
 143 static void
 144 lex_stage_uninit (struct lex_stage *stage)
 145 {
 146   lex_stage_clear (stage);
 147   free (stage->tokens);
 148 }
 149
 150 /* Returns true if STAGE contains no tokens, otherwise false. */
 151 static bool
 152 lex_stage_is_empty (const struct lex_stage *stage)
 153 {
 154   return deque_is_empty (&stage->deque);
 155 }
 156
 157 /* Returns the number of tokens in STAGE. */
 158 static size_t
 159 lex_stage_count (const struct lex_stage *stage)
 160 {
 161   return deque_count (&stage->deque);
 162 }
 163
 164 /* Returns the first token in STAGE, which must be nonempty.
 165    The first token is the one accessed with the least lookahead. */
 166 static struct lex_token *
 167 lex_stage_first (struct lex_stage *stage)
 168 {
 169   return lex_stage_nth (stage, 0);
 170 }
 171
 172 /* Returns the token the given INDEX in STAGE.  The first token (with the least
 173    lookahead) is 0, the second token is 1, and so on.  There must be at least
 174    INDEX + 1 tokens in STAGE. */
 175 static struct lex_token *
 176 lex_stage_nth (struct lex_stage *stage, size_t index)
 177 {
 178   return stage->tokens[deque_back (&stage->deque, index)];
 179 }
 180
 181 /* Adds TOKEN so that it becomes the last token in STAGE. */
 182 static void
 183 lex_stage_push_last (struct lex_stage *stage, struct lex_token *token)
 184 {
 185   if (deque_is_full (&stage->deque))
 186     stage->tokens = deque_expand (&stage->deque, stage->tokens,
 187                                   sizeof *stage->tokens);
 188   stage->tokens[deque_push_front (&stage->deque)] = token;
 189 }
 190
 191 /* Removes and returns the first token from STAGE. */
 192 static struct lex_token *
 193 lex_stage_take_first (struct lex_stage *stage)
 194 {
 195   return stage->tokens[deque_pop_back (&stage->deque)];
 196 }
 197
 198 /* Removes the first token from STAGE and uninitializes it. */
 199 static void
 200 lex_stage_pop_first (struct lex_stage *stage)
 201 {
 202   lex_token_destroy (lex_stage_take_first (stage));
 203 }
 204
 205 /* Removes the first N tokens from SRC, appending them to DST as the last
 206    tokens. */
 207 static void
 208 lex_stage_shift (struct lex_stage *dst, struct lex_stage *src, size_t n)
 209 {
 210   for (size_t i = 0; i < n; i++)
 211     lex_stage_push_last (dst, lex_stage_take_first (src));
 212 }
 213
 214 /* A source of tokens, corresponding to a syntax file.
 215
 216    This is conceptually a lex_reader wrapped with everything needed to convert
 217    its UTF-8 bytes into tokens. */
 218 struct lex_source
 219   {
 220     struct ll ll;               /* In lexer's list of sources. */
 221
 222     /* Reference count:
 223
 224        - One for struct lexer.
 225
 226        - One for each struct msg_location that references this source. */
 227     size_t n_refs;
 228
 229     struct lex_reader *reader;
 230     struct lexer *lexer;
 231     struct segmenter segmenter;
 232     bool eof;                   /* True if T_STOP was read from 'reader'. */
 233
 234     /* Buffer of UTF-8 bytes. */
 235     char *buffer;               /* Source file contents. */
 236     size_t length;              /* Number of bytes filled. */
 237     size_t allocated;           /* Number of bytes allocated. */
 238
 239     /* Offsets into 'buffer'. */
 240     size_t journal_pos;         /* First byte not yet output to journal. */
 241     size_t seg_pos;             /* First byte not yet scanned as token. */
 242
 243     /* Offset into 'buffer' of starts of lines. */
 244     size_t *lines;
 245     size_t n_lines, allocated_lines;
 246
 247     bool suppress_next_newline;
 248
 249     /* Tokens.
 250
 251        This is a pipeline with the following stages.  Each token eventually
 252        made available to the parser passes through of these stages.  The stages
 253        are named after the processing that happens in each one.
 254
 255        Initially, tokens come from the segmenter and scanner to 'pp':
 256
 257        - pp: Tokens that need to pass through the macro preprocessor to end up
 258          in 'merge'.
 259
 260        - merge: Tokens that need to pass through scan_merge() to end up in
 261          'parse'.
 262
 263        - parse: Tokens available to the client for parsing.
 264
 265       'pp' and 'merge' store tokens only temporarily until they pass into
 266       'parse'.  Tokens then live in 'parse' until the command is fully
 267       consumed, at which time they are freed together. */
 268     struct lex_stage pp;
 269     struct lex_stage merge;
 270     struct lex_token **parse;
 271     size_t n_parse, allocated_parse, parse_ofs;
 272   };
 273
 274 static struct lex_source *lex_source_create (struct lexer *,
 275                                              struct lex_reader *);
 276
 277 /* Lexer. */
 278 struct lexer
 279   {
 280     struct ll_list sources;     /* Contains "struct lex_source"s. */
 281     struct macro_set *macros;
 282   };
 283
 284 static struct lex_source *lex_source__ (const struct lexer *);
 285 static char *lex_source_get_syntax__ (const struct lex_source *,
 286                                       int n0, int n1);
 287 static const struct lex_token *lex_next__ (const struct lexer *, int n);
 288 static void lex_source_push_endcmd__ (struct lex_source *);
 289 static void lex_source_push_parse (struct lex_source *, struct lex_token *);
 290 static void lex_source_clear_parse (struct lex_source *);
 291
 292 static bool lex_source_get_parse (struct lex_source *);
 293 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
 294                                      const char *format, va_list)
 295    PRINTF_FORMAT (4, 0);
 296 static const struct lex_token *lex_source_next__ (const struct lex_source *,
 297                                                   int n);
 298 \f
 299 /* Initializes READER with the specified CLASS and otherwise some reasonable
 300    defaults.  The caller should fill in the others members as desired. */
 301 void
 302 lex_reader_init (struct lex_reader *reader,
 303                  const struct lex_reader_class *class)
 304 {
 305   reader->class = class;
 306   reader->syntax = SEG_MODE_AUTO;
 307   reader->error = LEX_ERROR_CONTINUE;
 308   reader->file_name = NULL;
 309   reader->encoding = NULL;
 310   reader->line_number = 0;
 311   reader->eof = false;
 312 }
 313
 314 /* Frees any file name already in READER and replaces it by a copy of
 315    FILE_NAME, or if FILE_NAME is null then clears any existing name. */
 316 void
 317 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
 318 {
 319   free (reader->file_name);
 320   reader->file_name = xstrdup_if_nonnull (file_name);
 321 }
 322 \f
 323 /* Creates and returns a new lexer. */
 324 struct lexer *
 325 lex_create (void)
 326 {
 327   struct lexer *lexer = xmalloc (sizeof *lexer);
 328   *lexer = (struct lexer) {
 329     .sources = LL_INITIALIZER (lexer->sources),
 330     .macros = macro_set_create (),
 331   };
 332   return lexer;
 333 }
 334
 335 /* Destroys LEXER. */
 336 void
 337 lex_destroy (struct lexer *lexer)
 338 {
 339   if (lexer != NULL)
 340     {
 341       struct lex_source *source, *next;
 342
 343       ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
 344         {
 345           ll_remove (&source->ll);
 346           lex_source_unref (source);
 347         }
 348       macro_set_destroy (lexer->macros);
 349       free (lexer);
 350     }
 351 }
 352
 353 /* Adds M to LEXER's set of macros.  M replaces any existing macro with the
 354    same name.  Takes ownership of M. */
 355 void
 356 lex_define_macro (struct lexer *lexer, struct macro *m)
 357 {
 358   macro_set_add (lexer->macros, m);
 359 }
 360
 361 /* Inserts READER into LEXER so that the next token read by LEXER comes from
 362    READER.  Before the caller, LEXER must either be empty or at a T_ENDCMD
 363    token. */
 364 void
 365 lex_include (struct lexer *lexer, struct lex_reader *reader)
 366 {
 367   assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
 368   ll_push_head (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 369 }
 370
 371 /* Appends READER to LEXER, so that it will be read after all other current
 372    readers have already been read. */
 373 void
 374 lex_append (struct lexer *lexer, struct lex_reader *reader)
 375 {
 376   ll_push_tail (&lexer->sources, &lex_source_create (lexer, reader)->ll);
 377 }
 378 \f
 379 /* Advancing. */
 380
 381 /* Advances LEXER to the next token, consuming the current token. */
 382 void
 383 lex_get (struct lexer *lexer)
 384 {
 385   struct lex_source *src;
 386
 387   src = lex_source__ (lexer);
 388   if (src == NULL)
 389     return;
 390
 391   if (src->parse_ofs < src->n_parse)
 392     {
 393       if (src->parse[src->parse_ofs]->token.type == T_ENDCMD)
 394         lex_source_clear_parse (src);
 395       else
 396         src->parse_ofs++;
 397     }
 398
 399   while (src->parse_ofs == src->n_parse)
 400     if (!lex_source_get_parse (src))
 401       {
 402         ll_remove (&src->ll);
 403         lex_source_unref (src);
 404         src = lex_source__ (lexer);
 405         if (src == NULL)
 406           return;
 407       }
 408 }
 409
 410 /* Advances LEXER by N tokens. */
 411 void
 412 lex_get_n (struct lexer *lexer, size_t n)
 413 {
 414   while (n-- > 0)
 415     lex_get (lexer);
 416 }
 417 \f
 418 /* Issuing errors. */
 419
 420 /* Prints a syntax error message containing the current token and
 421    given message MESSAGE (if non-null). */
 422 void
 423 lex_error (struct lexer *lexer, const char *format, ...)
 424 {
 425   va_list args;
 426
 427   va_start (args, format);
 428   lex_next_error_valist (lexer, 0, 0, format, args);
 429   va_end (args);
 430 }
 431
 432 /* Prints a syntax error message containing the current token and
 433    given message MESSAGE (if non-null). */
 434 void
 435 lex_error_valist (struct lexer *lexer, const char *format, va_list args)
 436 {
 437   lex_next_error_valist (lexer, 0, 0, format, args);
 438 }
 439
 440 /* Prints a syntax error message containing the current token and
 441    given message MESSAGE (if non-null). */
 442 void
 443 lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
 444 {
 445   va_list args;
 446
 447   va_start (args, format);
 448   lex_next_error_valist (lexer, n0, n1, format, args);
 449   va_end (args);
 450 }
 451
 452 /* Prints a syntax error message saying that one of the strings provided as
 453    varargs, up to the first NULL, is expected. */
 454 void
 455 (lex_error_expecting) (struct lexer *lexer, ...)
 456 {
 457   va_list args;
 458
 459   va_start (args, lexer);
 460   lex_error_expecting_valist (lexer, args);
 461   va_end (args);
 462 }
 463
 464 /* Prints a syntax error message saying that one of the options provided in
 465    ARGS, up to the first NULL, is expected. */
 466 void
 467 lex_error_expecting_valist (struct lexer *lexer, va_list args)
 468 {
 469   enum { MAX_OPTIONS = 9 };
 470   const char *options[MAX_OPTIONS];
 471   int n = 0;
 472   while (n < MAX_OPTIONS)
 473     {
 474       const char *option = va_arg (args, const char *);
 475       if (!option)
 476         break;
 477
 478       options[n++] = option;
 479     }
 480   lex_error_expecting_array (lexer, options, n);
 481 }
 482
 483 void
 484 lex_error_expecting_array (struct lexer *lexer, const char **options, size_t n)
 485 {
 486   switch (n)
 487     {
 488     case 0:
 489       lex_error (lexer, NULL);
 490       break;
 491
 492     case 1:
 493       lex_error (lexer, _("expecting %s"), options[0]);
 494       break;
 495
 496     case 2:
 497       lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
 498       break;
 499
 500     case 3:
 501       lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
 502                  options[2]);
 503       break;
 504
 505     case 4:
 506       lex_error (lexer, _("expecting %s, %s, %s, or %s"),
 507                  options[0], options[1], options[2], options[3]);
 508       break;
 509
 510     case 5:
 511       lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
 512                  options[0], options[1], options[2], options[3], options[4]);
 513       break;
 514
 515     case 6:
 516       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
 517                  options[0], options[1], options[2], options[3], options[4],
 518                  options[5]);
 519       break;
 520
 521     case 7:
 522       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
 523                  options[0], options[1], options[2], options[3], options[4],
 524                  options[5], options[6]);
 525       break;
 526
 527     case 8:
 528       lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
 529                  options[0], options[1], options[2], options[3], options[4],
 530                  options[5], options[6], options[7]);
 531       break;
 532
 533     default:
 534       lex_error (lexer, NULL);
 535     }
 536 }
 537
 538 /* Reports an error to the effect that subcommand SBC may only be specified
 539    once.
 540
 541    This function does not take a lexer as an argument or use lex_error(),
 542    because the result would ordinarily just be redundant: "Syntax error at
 543    SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
 544    not help the user find the error. */
 545 void
 546 lex_sbc_only_once (const char *sbc)
 547 {
 548   msg (SE, _("Subcommand %s may only be specified once."), sbc);
 549 }
 550
 551 /* Reports an error to the effect that subcommand SBC is missing.
 552
 553    This function does not take a lexer as an argument or use lex_error(),
 554    because a missing subcommand can normally be detected only after the whole
 555    command has been parsed, and so lex_error() would always report "Syntax
 556    error at end of command", which does not help the user find the error. */
 557 void
 558 lex_sbc_missing (const char *sbc)
 559 {
 560   msg (SE, _("Required subcommand %s was not specified."), sbc);
 561 }
 562
 563 /* Reports an error to the effect that specification SPEC may only be specified
 564    once within subcommand SBC. */
 565 void
 566 lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
 567 {
 568   lex_error (lexer, _("%s may only be specified once within subcommand %s"),
 569              spec, sbc);
 570 }
 571
 572 /* Reports an error to the effect that specification SPEC is missing within
 573    subcommand SBC. */
 574 void
 575 lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
 576 {
 577   lex_error (lexer, _("Required %s specification missing from %s subcommand"),
 578              sbc, spec);
 579 }
 580
 581 /* Prints a syntax error message containing the current token and
 582    given message MESSAGE (if non-null). */
 583 void
 584 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
 585                        const char *format, va_list args)
 586 {
 587   struct lex_source *src = lex_source__ (lexer);
 588
 589   if (src != NULL)
 590     lex_source_error_valist (src, n0, n1, format, args);
 591   else
 592     {
 593       struct string s;
 594
 595       ds_init_empty (&s);
 596       ds_put_format (&s, _("Syntax error at end of input"));
 597       if (format != NULL)
 598         {
 599           ds_put_cstr (&s, ": ");
 600           ds_put_vformat (&s, format, args);
 601         }
 602       if (ds_last (&s) != '.')
 603         ds_put_byte (&s, '.');
 604       msg (SE, "%s", ds_cstr (&s));
 605       ds_destroy (&s);
 606     }
 607 }
 608
 609 /* Checks that we're at end of command.
 610    If so, returns a successful command completion code.
 611    If not, flags a syntax error and returns an error command
 612    completion code. */
 613 int
 614 lex_end_of_command (struct lexer *lexer)
 615 {
 616   if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
 617     {
 618       lex_error (lexer, _("expecting end of command"));
 619       return CMD_FAILURE;
 620     }
 621   else
 622     return CMD_SUCCESS;
 623 }
 624 \f
 625 /* Token testing functions. */
 626
 627 /* Returns true if the current token is a number. */
 628 bool
 629 lex_is_number (const struct lexer *lexer)
 630 {
 631   return lex_next_is_number (lexer, 0);
 632 }
 633
 634 /* Returns true if the current token is a string. */
 635 bool
 636 lex_is_string (const struct lexer *lexer)
 637 {
 638   return lex_next_is_string (lexer, 0);
 639 }
 640
 641 /* Returns the value of the current token, which must be a
 642    floating point number. */
 643 double
 644 lex_number (const struct lexer *lexer)
 645 {
 646   return lex_next_number (lexer, 0);
 647 }
 648
 649 /* Returns true iff the current token is an integer. */
 650 bool
 651 lex_is_integer (const struct lexer *lexer)
 652 {
 653   return lex_next_is_integer (lexer, 0);
 654 }
 655
 656 /* Returns the value of the current token, which must be an
 657    integer. */
 658 long
 659 lex_integer (const struct lexer *lexer)
 660 {
 661   return lex_next_integer (lexer, 0);
 662 }
 663 \f
 664 /* Token testing functions with lookahead.
 665
 666    A value of 0 for N as an argument to any of these functions refers to the
 667    current token.  Lookahead is limited to the current command.  Any N greater
 668    than the number of tokens remaining in the current command will be treated
 669    as referring to a T_ENDCMD token. */
 670
 671 /* Returns true if the token N ahead of the current token is a number. */
 672 bool
 673 lex_next_is_number (const struct lexer *lexer, int n)
 674 {
 675   return token_is_number (lex_next (lexer, n));
 676 }
 677
 678 /* Returns true if the token N ahead of the current token is a string. */
 679 bool
 680 lex_next_is_string (const struct lexer *lexer, int n)
 681 {
 682   return token_is_string (lex_next (lexer, n));
 683 }
 684
 685 /* Returns the value of the token N ahead of the current token, which must be a
 686    floating point number. */
 687 double
 688 lex_next_number (const struct lexer *lexer, int n)
 689 {
 690   return token_number (lex_next (lexer, n));
 691 }
 692
 693 /* Returns true if the token N ahead of the current token is an integer. */
 694 bool
 695 lex_next_is_integer (const struct lexer *lexer, int n)
 696 {
 697   return token_is_integer (lex_next (lexer, n));
 698 }
 699
 700 /* Returns the value of the token N ahead of the current token, which must be
 701    an integer. */
 702 long
 703 lex_next_integer (const struct lexer *lexer, int n)
 704 {
 705   return token_integer (lex_next (lexer, n));
 706 }
 707 \f
 708 /* Token matching functions. */
 709
 710 /* If the current token has the specified TYPE, skips it and returns true.
 711    Otherwise, returns false. */
 712 bool
 713 lex_match (struct lexer *lexer, enum token_type type)
 714 {
 715   if (lex_token (lexer) == type)
 716     {
 717       lex_get (lexer);
 718       return true;
 719     }
 720   else
 721     return false;
 722 }
 723
 724 /* If the current token matches IDENTIFIER, skips it and returns true.
 725    IDENTIFIER may be abbreviated to its first three letters.  Otherwise,
 726    returns false.
 727
 728    IDENTIFIER must be an ASCII string. */
 729 bool
 730 lex_match_id (struct lexer *lexer, const char *identifier)
 731 {
 732   return lex_match_id_n (lexer, identifier, 3);
 733 }
 734
 735 /* If the current token is IDENTIFIER, skips it and returns true.  IDENTIFIER
 736    may be abbreviated to its first N letters.  Otherwise, returns false.
 737
 738    IDENTIFIER must be an ASCII string. */
 739 bool
 740 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
 741 {
 742   if (lex_token (lexer) == T_ID
 743       && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
 744     {
 745       lex_get (lexer);
 746       return true;
 747     }
 748   else
 749     return false;
 750 }
 751
 752 /* If the current token is integer X, skips it and returns true.  Otherwise,
 753    returns false. */
 754 bool
 755 lex_match_int (struct lexer *lexer, int x)
 756 {
 757   if (lex_is_integer (lexer) && lex_integer (lexer) == x)
 758     {
 759       lex_get (lexer);
 760       return true;
 761     }
 762   else
 763     return false;
 764 }
 765 \f
 766 /* Forced matches. */
 767
 768 /* If this token is IDENTIFIER, skips it and returns true.  IDENTIFIER may be
 769    abbreviated to its first 3 letters.  Otherwise, reports an error and returns
 770    false.
 771
 772    IDENTIFIER must be an ASCII string. */
 773 bool
 774 lex_force_match_id (struct lexer *lexer, const char *identifier)
 775 {
 776   if (lex_match_id (lexer, identifier))
 777     return true;
 778   else
 779     {
 780       lex_error_expecting (lexer, identifier);
 781       return false;
 782     }
 783 }
 784
 785 /* If the current token has the specified TYPE, skips it and returns true.
 786    Otherwise, reports an error and returns false. */
 787 bool
 788 lex_force_match (struct lexer *lexer, enum token_type type)
 789 {
 790   if (lex_token (lexer) == type)
 791     {
 792       lex_get (lexer);
 793       return true;
 794     }
 795   else
 796     {
 797       const char *type_string = token_type_to_string (type);
 798       if (type_string)
 799         {
 800           char *s = xasprintf ("`%s'", type_string);
 801           lex_error_expecting (lexer, s);
 802           free (s);
 803         }
 804       else
 805         lex_error_expecting (lexer, token_type_to_name (type));
 806
 807       return false;
 808     }
 809 }
 810
 811 /* If the current token is a string, does nothing and returns true.
 812    Otherwise, reports an error and returns false. */
 813 bool
 814 lex_force_string (struct lexer *lexer)
 815 {
 816   if (lex_is_string (lexer))
 817     return true;
 818   else
 819     {
 820       lex_error (lexer, _("expecting string"));
 821       return false;
 822     }
 823 }
 824
 825 /* If the current token is a string or an identifier, does nothing and returns
 826    true.  Otherwise, reports an error and returns false.
 827
 828    This is meant for use in syntactic situations where we want to encourage the
 829    user to supply a quoted string, but for compatibility we also accept
 830    identifiers.  (One example of such a situation is file names.)  Therefore,
 831    the error message issued when the current token is wrong only says that a
 832    string is expected and doesn't mention that an identifier would also be
 833    accepted. */
 834 bool
 835 lex_force_string_or_id (struct lexer *lexer)
 836 {
 837   return lex_token (lexer) == T_ID || lex_force_string (lexer);
 838 }
 839
 840 /* If the current token is an integer, does nothing and returns true.
 841    Otherwise, reports an error and returns false. */
 842 bool
 843 lex_force_int (struct lexer *lexer)
 844 {
 845   if (lex_is_integer (lexer))
 846     return true;
 847   else
 848     {
 849       lex_error (lexer, _("expecting integer"));
 850       return false;
 851     }
 852 }
 853
 854 /* If the current token is an integer in the range MIN...MAX (inclusive), does
 855    nothing and returns true.  Otherwise, reports an error and returns false.
 856    If NAME is nonnull, then it is used in the error message. */
 857 bool
 858 lex_force_int_range (struct lexer *lexer, const char *name, long min, long max)
 859 {
 860   bool is_number = lex_is_number (lexer);
 861   bool is_integer = lex_is_integer (lexer);
 862   bool too_small = (is_integer ? lex_integer (lexer) < min
 863                     : is_number ? lex_number (lexer) < min
 864                     : false);
 865   bool too_big = (is_integer ? lex_integer (lexer) > max
 866                   : is_number ? lex_number (lexer) > max
 867                   : false);
 868   if (is_integer && !too_small && !too_big)
 869     return true;
 870
 871   if (min > max)
 872     {
 873       /* Weird, maybe a bug in the caller.  Just report that we needed an
 874          integer. */
 875       if (name)
 876         lex_error (lexer, _("Integer expected for %s."), name);
 877       else
 878         lex_error (lexer, _("Integer expected."));
 879     }
 880   else if (min == max)
 881     {
 882       if (name)
 883         lex_error (lexer, _("Expected %ld for %s."), min, name);
 884       else
 885         lex_error (lexer, _("Expected %ld."), min);
 886     }
 887   else if (min + 1 == max)
 888     {
 889       if (name)
 890         lex_error (lexer, _("Expected %ld or %ld for %s."), min, min + 1, name);
 891       else
 892         lex_error (lexer, _("Expected %ld or %ld."), min, min + 1);
 893     }
 894   else
 895     {
 896       bool report_lower_bound = (min > INT_MIN / 2) || too_small;
 897       bool report_upper_bound = (max < INT_MAX / 2) || too_big;
 898
 899       if (report_lower_bound && report_upper_bound)
 900         {
 901           if (name)
 902             lex_error (lexer,
 903                        _("Expected integer between %ld and %ld for %s."),
 904                        min, max, name);
 905           else
 906             lex_error (lexer, _("Expected integer between %ld and %ld."),
 907                        min, max);
 908         }
 909       else if (report_lower_bound)
 910         {
 911           if (min == 0)
 912             {
 913               if (name)
 914                 lex_error (lexer, _("Expected non-negative integer for %s."),
 915                            name);
 916               else
 917                 lex_error (lexer, _("Expected non-negative integer."));
 918             }
 919           else if (min == 1)
 920             {
 921               if (name)
 922                 lex_error (lexer, _("Expected positive integer for %s."),
 923                            name);
 924               else
 925                 lex_error (lexer, _("Expected positive integer."));
 926             }
 927           else
 928             {
 929               if (name)
 930                 lex_error (lexer, _("Expected integer %ld or greater for %s."),
 931                            min, name);
 932               else
 933                 lex_error (lexer, _("Expected integer %ld or greater."), min);
 934             }
 935         }
 936       else if (report_upper_bound)
 937         {
 938           if (name)
 939             lex_error (lexer,
 940                        _("Expected integer less than or equal to %ld for %s."),
 941                        max, name);
 942           else
 943             lex_error (lexer, _("Expected integer less than or equal to %ld."),
 944                        max);
 945         }
 946       else
 947         {
 948           if (name)
 949             lex_error (lexer, _("Integer expected for %s."), name);
 950           else
 951             lex_error (lexer, _("Integer expected."));
 952         }
 953     }
 954   return false;
 955 }
 956
 957 /* If the current token is a number, does nothing and returns true.
 958    Otherwise, reports an error and returns false. */
 959 bool
 960 lex_force_num (struct lexer *lexer)
 961 {
 962   if (lex_is_number (lexer))
 963     return true;
 964
 965   lex_error (lexer, _("expecting number"));
 966   return false;
 967 }
 968
 969 /* If the current token is an identifier, does nothing and returns true.
 970    Otherwise, reports an error and returns false. */
 971 bool
 972 lex_force_id (struct lexer *lexer)
 973 {
 974   if (lex_token (lexer) == T_ID)
 975     return true;
 976
 977   lex_error (lexer, _("expecting identifier"));
 978   return false;
 979 }
 980 \f
 981 /* Token accessors. */
 982
 983 /* Returns the type of LEXER's current token. */
 984 enum token_type
 985 lex_token (const struct lexer *lexer)
 986 {
 987   return lex_next_token (lexer, 0);
 988 }
 989
 990 /* Returns the number in LEXER's current token.
 991
 992    Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
 993    tokens this function will always return zero. */
 994 double
 995 lex_tokval (const struct lexer *lexer)
 996 {
 997   return lex_next_tokval (lexer, 0);
 998 }
 999
1000 /* Returns the null-terminated string in LEXER's current token, UTF-8 encoded.
1001
1002    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1003    this functions this function will always return NULL.
1004
1005    The UTF-8 encoding of the returned string is correct for variable names and
1006    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1007    data_in() to use it in a "union value".  */
1008 const char *
1009 lex_tokcstr (const struct lexer *lexer)
1010 {
1011   return lex_next_tokcstr (lexer, 0);
1012 }
1013
1014 /* Returns the string in LEXER's current token, UTF-8 encoded.  The string is
1015    null-terminated (but the null terminator is not included in the returned
1016    substring's 'length').
1017
1018    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1019    this functions this function will always return NULL.
1020
1021    The UTF-8 encoding of the returned string is correct for variable names and
1022    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1023    data_in() to use it in a "union value".  */
1024 struct substring
1025 lex_tokss (const struct lexer *lexer)
1026 {
1027   return lex_next_tokss (lexer, 0);
1028 }
1029 \f
1030 /* Looking ahead.
1031
1032    A value of 0 for N as an argument to any of these functions refers to the
1033    current token.  Lookahead is limited to the current command.  Any N greater
1034    than the number of tokens remaining in the current command will be treated
1035    as referring to a T_ENDCMD token. */
1036
1037 static const struct lex_token *
1038 lex_next__ (const struct lexer *lexer_, int n)
1039 {
1040   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1041   struct lex_source *src = lex_source__ (lexer);
1042
1043   if (src != NULL)
1044     return lex_source_next__ (src, n);
1045   else
1046     {
1047       static const struct lex_token stop_token = { .token = { .type = T_STOP } };
1048       return &stop_token;
1049     }
1050 }
1051
1052 static const struct lex_token *
1053 lex_source_ofs__ (const struct lex_source *src_, int ofs)
1054 {
1055   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1056
1057   if (ofs < 0)
1058     {
1059       static const struct lex_token endcmd_token
1060         = { .token = { .type = T_ENDCMD } };
1061       return &endcmd_token;
1062     }
1063
1064   while (ofs >= src->n_parse)
1065     {
1066       if (src->n_parse > 0)
1067         {
1068           const struct lex_token *t = src->parse[src->n_parse - 1];
1069           if (t->token.type == T_STOP || t->token.type == T_ENDCMD)
1070             return t;
1071         }
1072
1073       lex_source_get_parse (src);
1074     }
1075
1076   return src->parse[ofs];
1077 }
1078
1079 static const struct lex_token *
1080 lex_source_next__ (const struct lex_source *src, int n)
1081 {
1082   return lex_source_ofs__ (src, n + src->parse_ofs);
1083 }
1084
1085 /* Returns the "struct token" of the token N after the current one in LEXER.
1086    The returned pointer can be invalidated by pretty much any succeeding call
1087    into the lexer, although the string pointer within the returned token is
1088    only invalidated by consuming the token (e.g. with lex_get()). */
1089 const struct token *
1090 lex_next (const struct lexer *lexer, int n)
1091 {
1092   return &lex_next__ (lexer, n)->token;
1093 }
1094
1095 /* Returns the type of the token N after the current one in LEXER. */
1096 enum token_type
1097 lex_next_token (const struct lexer *lexer, int n)
1098 {
1099   return lex_next (lexer, n)->type;
1100 }
1101
/* Returns the number in the token N after the current one in LEXER.

   Only T_NEG_NUM and T_POS_NUM tokens have meaningful values.  For other
   tokens this function will always return zero. */
double
lex_next_tokval (const struct lexer *lexer, int n)
{
  const struct token *t = lex_next (lexer, n);
  return token_number (t);
}
1111
1112 /* Returns the null-terminated string in the token N after the current one, in
1113    UTF-8 encoding.
1114
1115    Only T_ID and T_STRING tokens have meaningful strings.  For other tokens
1116    this functions this function will always return NULL.
1117
1118    The UTF-8 encoding of the returned string is correct for variable names and
1119    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1120    data_in() to use it in a "union value".  */
1121 const char *
1122 lex_next_tokcstr (const struct lexer *lexer, int n)
1123 {
1124   return lex_next_tokss (lexer, n).string;
1125 }
1126
1127 /* Returns the string in the token N after the current one, in UTF-8 encoding.
1128    The string is null-terminated (but the null terminator is not included in
1129    the returned substring's 'length').
1130
1131    Only T_ID, T_MACRO_ID, T_STRING tokens have meaningful strings.  For other
1132    tokens this functions this function will always return NULL.
1133
1134    The UTF-8 encoding of the returned string is correct for variable names and
1135    other identifiers.  Use filename_to_utf8() to use it as a filename.  Use
1136    data_in() to use it in a "union value".  */
1137 struct substring
1138 lex_next_tokss (const struct lexer *lexer, int n)
1139 {
1140   return lex_next (lexer, n)->string;
1141 }
1142
1143 /* Returns the offset of the current token within the command being parsed in
1144    LEXER.  This is 0 for the first token in a command, 1 for the second, and so
1145    on.  The return value is useful later for referring to this token in calls
1146    to lex_ofs_*(). */
1147 int
1148 lex_ofs (const struct lexer *lexer)
1149 {
1150   struct lex_source *src = lex_source__ (lexer);
1151   return src ? src->parse_ofs : 0;
1152 }
1153
1154 /* Returns the token within LEXER's current command with offset OFS.  Use
1155    lex_ofs() to find out the offset of the current token. */
1156 const struct token *
1157 lex_ofs_token (const struct lexer *lexer_, int ofs)
1158 {
1159   struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
1160   struct lex_source *src = lex_source__ (lexer);
1161
1162   if (src != NULL)
1163     return &lex_source_next__ (src, ofs - src->parse_ofs)->token;
1164   else
1165     {
1166       static const struct token stop_token = { .type = T_STOP };
1167       return &stop_token;
1168     }
1169 }
1170
/* Allocates and returns a new struct msg_location covering the tokens with
   offsets OFS0 through OFS1, inclusive, within the current command in LEXER.
   Token offsets are explained in the comment on lex_ofs().

   The returned object belongs to the caller, who must eventually free it. */
struct msg_location *
lex_ofs_location (const struct lexer *lexer, int ofs0, int ofs1)
{
  /* lex_get_location() wants distances from the current token. */
  const int current = lex_ofs (lexer);
  const int n0 = ofs0 - current;
  const int n1 = ofs1 - current;
  return lex_get_location (lexer, n0, n1);
}
1182
1183 /* Returns a msg_point for the first character in the token with offset OFS,
1184    where offset 0 is the first token in the command currently being parsed, 1
1185    the second token, and so on.  These are absolute offsets, not relative to
1186    the token currently being parsed within the command.
1187
1188    Returns zeros for a T_STOP token.
1189  */
1190 struct msg_point
1191 lex_ofs_start_point (const struct lexer *lexer, int ofs)
1192 {
1193   const struct lex_source *src = lex_source__ (lexer);
1194   return (src
1195           ? lex_token_start_point (src, lex_source_ofs__ (src, ofs))
1196           : (struct msg_point) { 0, 0 });
1197 }
1198
1199 /* Returns a msg_point for the last character, inclusive, in the token with
1200    offset OFS, where offset 0 is the first token in the command currently being
1201    parsed, 1 the second token, and so on.  These are absolute offsets, not
1202    relative to the token currently being parsed within the command.
1203
1204    Returns zeros for a T_STOP token.
1205
1206    Most of the time, a single token is wholly within a single line of syntax,
1207    so that the start and end point for a given offset have the same line
1208    number.  There are two exceptions: a T_STRING token can be made up of
1209    multiple segments on adjacent lines connected with "+" punctuators, and a
1210    T_NEG_NUM token can consist of a "-" on one line followed by the number on
1211    the next.
1212  */
1213 struct msg_point
1214 lex_ofs_end_point (const struct lexer *lexer, int ofs)
1215 {
1216   const struct lex_source *src = lex_source__ (lexer);
1217   return (src
1218           ? lex_token_end_point (src, lex_source_ofs__ (src, ofs))
1219           : (struct msg_point) { 0, 0 });
1220 }
1221
/* Returns the text of the syntax in tokens N0 ahead of the current one,
   through N1 ahead of the current one, inclusive.  (For example, if N0 and N1
   are both zero, this requests the syntax for the current token.)  The caller
   must eventually free the returned string (with free()).  The syntax is
   encoded in UTF-8 and in the original form supplied to the lexer so that, for
   example, it may include comments, spaces, and new-lines if it spans multiple
   tokens.  Macro expansion, however, has already been performed.

   If LEXER has no current source, returns a newly allocated empty string, so
   that the caller can treat the result uniformly. */
char *
lex_next_representation (const struct lexer *lexer, int n0, int n1)
{
  const struct lex_source *src = lex_source__ (lexer);

  /* lex_source_get_syntax__() dereferences its source unconditionally, so a
     lexer without any source must be handled here, as the other accessors
     in this file do. */
  if (src == NULL)
    return xstrdup ("");

  return lex_source_get_syntax__ (src, n0, n1);
}
1234
1235 /* Returns true if the token N ahead of the current one was produced by macro
1236    expansion, false otherwise. */
1237 bool
1238 lex_next_is_from_macro (const struct lexer *lexer, int n)
1239 {
1240   return lex_next__ (lexer, n)->macro_rep != NULL;
1241 }
1242
1243 static bool
1244 lex_tokens_match (const struct token *actual, const struct token *expected)
1245 {
1246   if (actual->type != expected->type)
1247     return false;
1248
1249   switch (actual->type)
1250     {
1251     case T_POS_NUM:
1252     case T_NEG_NUM:
1253       return actual->number == expected->number;
1254
1255     case T_ID:
1256       return lex_id_match (expected->string, actual->string);
1257
1258     case T_STRING:
1259       return (actual->string.length == expected->string.length
1260               && !memcmp (actual->string.string, expected->string.string,
1261                           actual->string.length));
1262
1263     default:
1264       return true;
1265     }
1266 }
1267
1268 static size_t
1269 lex_at_phrase__ (struct lexer *lexer, const char *s)
1270 {
1271   struct string_lexer slex;
1272   struct token token;
1273
1274   size_t i = 0;
1275   string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE, true);
1276   while (string_lexer_next (&slex, &token))
1277     {
1278       bool match = lex_tokens_match (lex_next (lexer, i++), &token);
1279       token_uninit (&token);
1280       if (!match)
1281         return 0;
1282     }
1283   return i;
1284 }
1285
/* Returns true if LEXER is positioned at the sequence of tokens that may be
   parsed from S, false otherwise.  No tokens are skipped either way.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_at_phrase (struct lexer *lexer, const char *s)
{
  const size_t n_tokens = lex_at_phrase__ (lexer, s);
  return n_tokens != 0;
}
1297
/* Returns true, after skipping the matched tokens, if LEXER is positioned at
   the sequence of tokens that may be parsed from S.  Otherwise, returns false
   and skips nothing.

   S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
   "2SLS", or "END INPUT PROGRAM".  Identifiers may be abbreviated to their
   first three letters. */
bool
lex_match_phrase (struct lexer *lexer, const char *s)
{
  const size_t n_tokens = lex_at_phrase__ (lexer, s);
  if (n_tokens == 0)
    return false;

  lex_get_n (lexer, n_tokens);
  return true;
}
1312
1313 /* Returns the 1-based line number of the source text at the byte OFFSET in
1314    SRC. */
1315 static int
1316 lex_source_ofs_to_line_number (const struct lex_source *src, size_t offset)
1317 {
1318   size_t lo = 0;
1319   size_t hi = src->n_lines;
1320   for (;;)
1321     {
1322       size_t mid = (lo + hi) / 2;
1323       if (mid + 1 >= src->n_lines)
1324         return src->n_lines;
1325       else if (offset >= src->lines[mid + 1])
1326         lo = mid;
1327       else if (offset < src->lines[mid])
1328         hi = mid;
1329       else
1330         return mid + 1;
1331     }
1332 }
1333
1334 /* Returns the 1-based column number of the source text at the byte OFFSET in
1335    SRC. */
1336 static int
1337 lex_source_ofs_to_column_number (const struct lex_source *src, size_t offset)
1338 {
1339   const char *newline = memrchr (src->buffer, '\n', offset);
1340   size_t line_ofs = newline ? newline - src->buffer + 1 : 0;
1341   return utf8_count_columns (&src->buffer[line_ofs], offset - line_ofs) + 1;
1342 }
1343
1344 static struct msg_point
1345 lex_source_ofs_to_point__ (const struct lex_source *src, size_t offset)
1346 {
1347   return (struct msg_point) {
1348     .line = lex_source_ofs_to_line_number (src, offset),
1349     .column = lex_source_ofs_to_column_number (src, offset),
1350   };
1351 }
1352
1353 static struct msg_point
1354 lex_token_start_point (const struct lex_source *src,
1355                        const struct lex_token *token)
1356 {
1357   return lex_source_ofs_to_point__ (src, token->token_pos);
1358 }
1359
1360 static struct msg_point
1361 lex_token_end_point (const struct lex_source *src,
1362                      const struct lex_token *token)
1363 {
1364   return lex_source_ofs_to_point__ (src, lex_token_end (token));
1365 }
1366
1367 static struct msg_location
1368 lex_token_location (const struct lex_source *src,
1369                     const struct lex_token *t0,
1370                     const struct lex_token *t1)
1371 {
1372   return (struct msg_location) {
1373     .file_name = intern_new_if_nonnull (src->reader->file_name),
1374     .start = lex_token_start_point (src, t0),
1375     .end = lex_token_end_point (src, t1),
1376   };
1377 }
1378
1379 static struct msg_location *
1380 lex_token_location_rw (const struct lex_source *src,
1381                        const struct lex_token *t0,
1382                        const struct lex_token *t1)
1383 {
1384   struct msg_location location = lex_token_location (src, t0, t1);
1385   return msg_location_dup (&location);
1386 }
1387
/* Returns a newly allocated msg_location in SRC that covers the tokens N0
   ahead of the current one through N1 ahead of the current one, inclusive. */
static struct msg_location *
lex_source_get_location (const struct lex_source *src, int n0, int n1)
{
  const struct lex_token *first = lex_source_next__ (src, n0);
  const struct lex_token *last = lex_source_next__ (src, n1);
  return lex_token_location_rw (src, first, last);
}
1395
1396 /* Returns the name of the syntax file from which the current command is drawn.
1397    Returns NULL for a T_STOP token or if the command's source does not have
1398    line numbers.
1399
1400    There is no version of this function that takes an N argument because
1401    lookahead only works to the end of a command and any given command is always
1402    within a single syntax file. */
1403 const char *
1404 lex_get_file_name (const struct lexer *lexer)
1405 {
1406   struct lex_source *src = lex_source__ (lexer);
1407   return src == NULL ? NULL : src->reader->file_name;
1408 }
1409
1410 /* Returns a newly allocated msg_location for the syntax that represents tokens
1411    with 0-based offsets N0...N1, inclusive, from the current token.  The caller
1412    must eventually free the location (with msg_location_destroy()). */
1413 struct msg_location *
1414 lex_get_location (const struct lexer *lexer, int n0, int n1)
1415 {
1416   struct msg_location *loc = xmalloc (sizeof *loc);
1417   *loc = (struct msg_location) {
1418     .file_name = intern_new_if_nonnull (lex_get_file_name (lexer)),
1419     .start = lex_ofs_start_point (lexer, n0 + lex_ofs (lexer)),
1420     .end = lex_ofs_end_point (lexer, n1 + lex_ofs (lexer)),
1421     .src = lex_source__ (lexer),
1422   };
1423   lex_source_ref (loc->src);
1424   return loc;
1425 }
1426
1427 const char *
1428 lex_get_encoding (const struct lexer *lexer)
1429 {
1430   struct lex_source *src = lex_source__ (lexer);
1431   return src == NULL ? NULL : src->reader->encoding;
1432 }
1433
1434 /* Returns the syntax mode for the syntax file from which the current drawn is
1435    drawn.  Returns SEG_MODE_AUTO for a T_STOP token or if the command's source
1436    does not have line numbers.
1437
1438    There is no version of this function that takes an N argument because
1439    lookahead only works to the end of a command and any given command is always
1440    within a single syntax file. */
1441 enum segmenter_mode
1442 lex_get_syntax_mode (const struct lexer *lexer)
1443 {
1444   struct lex_source *src = lex_source__ (lexer);
1445   return src == NULL ? SEG_MODE_AUTO : src->reader->syntax;
1446 }
1447
1448 /* Returns the error mode for the syntax file from which the current drawn is
1449    drawn.  Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1450    source does not have line numbers.
1451
1452    There is no version of this function that takes an N argument because
1453    lookahead only works to the end of a command and any given command is always
1454    within a single syntax file. */
1455 enum lex_error_mode
1456 lex_get_error_mode (const struct lexer *lexer)
1457 {
1458   struct lex_source *src = lex_source__ (lexer);
1459   return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1460 }
1461
1462 /* If the source that LEXER is currently reading has error mode
1463    LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1464    token to be read comes directly from whatever is next read from the stream.
1465
1466    It makes sense to call this function after encountering an error in a
1467    command entered on the console, because usually the user would prefer not to
1468    have cascading errors. */
1469 void
1470 lex_interactive_reset (struct lexer *lexer)
1471 {
1472   struct lex_source *src = lex_source__ (lexer);
1473   if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1474     {
1475       src->length = 0;
1476       src->journal_pos = src->seg_pos = 0;
1477       src->n_lines = 0;
1478       src->suppress_next_newline = false;
1479       src->segmenter = segmenter_init (segmenter_get_mode (&src->segmenter),
1480                                        false);
1481       lex_stage_clear (&src->pp);
1482       lex_stage_clear (&src->merge);
1483       lex_source_clear_parse (src);
1484       lex_source_push_endcmd__ (src);
1485     }
1486 }
1487
1488 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1489 void
1490 lex_discard_rest_of_command (struct lexer *lexer)
1491 {
1492   while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1493     lex_get (lexer);
1494 }
1495
1496 /* Discards all lookahead tokens in LEXER, then discards all input sources
1497    until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1498    runs out of input sources. */
1499 void
1500 lex_discard_noninteractive (struct lexer *lexer)
1501 {
1502   struct lex_source *src = lex_source__ (lexer);
1503
1504   if (src != NULL)
1505     {
1506       lex_stage_clear (&src->pp);
1507       lex_stage_clear (&src->merge);
1508       lex_source_clear_parse (src);
1509
1510       for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1511            src = lex_source__ (lexer))
1512         {
1513           ll_remove (&src->ll);
1514           lex_source_unref (src);
1515         }
1516     }
1517 }
1518 \f
1519 static void
1520 lex_source_expand__ (struct lex_source *src)
1521 {
1522   if (src->length >= src->allocated)
1523     src->buffer = x2realloc (src->buffer, &src->allocated);
1524 }
1525
1526 static void
1527 lex_source_read__ (struct lex_source *src)
1528 {
1529   do
1530     {
1531       lex_source_expand__ (src);
1532
1533       size_t space = src->allocated - src->length;
1534       enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1535       size_t n = src->reader->class->read (src->reader,
1536                                            &src->buffer[src->length],
1537                                            space, prompt);
1538       assert (n <= space);
1539
1540       if (n == 0)
1541         {
1542           /* End of input. */
1543           src->reader->eof = true;
1544           return;
1545         }
1546
1547       src->length += n;
1548     }
1549   while (!memchr (&src->buffer[src->seg_pos], '\n',
1550                   src->length - src->seg_pos));
1551 }
1552
1553 static struct lex_source *
1554 lex_source__ (const struct lexer *lexer)
1555 {
1556   return (ll_is_empty (&lexer->sources) ? NULL
1557           : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1558 }
1559
1560 /* Returns the text of the syntax in SRC for tokens N0 ahead of the current
1561    one, through N1 ahead of the current one, inclusive.  (For example, if N0
1562    and N1 are both zero, this requests the syntax for the current token.)  The
1563    caller must eventually free the returned string (with free()).  The syntax
1564    is encoded in UTF-8 and in the original form supplied to the lexer so that,
1565    for example, it may include comments, spaces, and new-lines if it spans
1566    multiple tokens.  Macro expansion, however, has already been performed. */
1567 static char *
1568 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1569 {
1570   struct string s = DS_EMPTY_INITIALIZER;
1571   for (size_t i = n0; i <= n1; )
1572     {
1573       /* Find [I,J) as the longest sequence of tokens not produced by macro
1574          expansion, or otherwise the longest sequence expanded from a single
1575          macro call. */
1576       const struct lex_token *first = lex_source_next__ (src, i);
1577       size_t j;
1578       for (j = i + 1; j <= n1; j++)
1579         {
1580           const struct lex_token *cur = lex_source_next__ (src, j);
1581           if ((first->macro_rep != NULL) != (cur->macro_rep != NULL)
1582               || first->macro_rep != cur->macro_rep)
1583             break;
1584         }
1585       const struct lex_token *last = lex_source_next__ (src, j - 1);
1586
1587       /* Now add the syntax for this sequence of tokens to SRC. */
1588       if (!ds_is_empty (&s))
1589         ds_put_byte (&s, ' ');
1590       if (!first->macro_rep)
1591         {
1592           size_t start = first->token_pos;
1593           size_t end = last->token_pos + last->token_len;
1594           ds_put_substring (&s, ss_buffer (&src->buffer[start], end - start));
1595         }
1596       else
1597         {
1598           size_t start = first->ofs;
1599           size_t end = last->ofs + last->len;
1600           ds_put_substring (&s, ss_buffer (first->macro_rep + start,
1601                                            end - start));
1602         }
1603
1604       i = j;
1605     }
1606   return ds_steal_cstr (&s);
1607 }
1608
1609 static bool
1610 lex_source_contains_macro_call (struct lex_source *src, int n0, int n1)
1611 {
1612   for (size_t i = n0; i <= n1; i++)
1613     if (lex_source_next__ (src, i)->macro_rep)
1614       return true;
1615   return false;
1616 }
1617
1618 /* If tokens N0...N1 (inclusive) in SRC contains a macro call, this returns the
1619    raw UTF-8 syntax for the macro call (not for the expansion) and for any
1620    other tokens included in that range.  The syntax is encoded in UTF-8 and in
1621    the original form supplied to the lexer so that, for example, it may include
1622    comments, spaces, and new-lines if it spans multiple tokens.
1623
1624    Returns an empty string if the token range doesn't include a macro call.
1625
1626    The caller must not modify or free the returned string. */
1627 static struct substring
1628 lex_source_get_macro_call (struct lex_source *src, int n0, int n1)
1629 {
1630   if (!lex_source_contains_macro_call (src, n0, n1))
1631     return ss_empty ();
1632
1633   const struct lex_token *token0 = lex_source_next__ (src, n0);
1634   const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1635   size_t start = token0->token_pos;
1636   size_t end = token1->token_pos + token1->token_len;
1637
1638   return ss_buffer (&src->buffer[start], end - start);
1639 }
1640
1641 static void
1642 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1643                          const char *format, va_list args)
1644 {
1645   const struct lex_token *token;
1646   struct string s;
1647
1648   ds_init_empty (&s);
1649
1650   token = lex_source_next__ (src, n0);
1651   if (token->token.type == T_ENDCMD)
1652     ds_put_cstr (&s, _("Syntax error at end of command"));
1653   else
1654     {
1655       /* Get the syntax that caused the error. */
1656       char *raw_syntax = lex_source_get_syntax__ (src, n0, n1);
1657       char syntax[64];
1658       str_ellipsize (ss_cstr (raw_syntax), syntax, sizeof syntax);
1659       free (raw_syntax);
1660
1661       /* Get the macro call(s) that expanded to the syntax that caused the
1662          error. */
1663       char call[64];
1664       str_ellipsize (lex_source_get_macro_call (src, n0, n1),
1665                      call, sizeof call);
1666
1667       if (syntax[0])
1668         {
1669           if (call[0])
1670             ds_put_format (&s,
1671                            _("Syntax error at `%s' (in expansion of `%s')"),
1672                            syntax, call);
1673           else
1674             ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1675         }
1676       else
1677         {
1678           if (call[0])
1679             ds_put_format (&s, _("Syntax error in syntax expanded from `%s'"),
1680                            call);
1681           else
1682             ds_put_cstr (&s, _("Syntax error"));
1683         }
1684     }
1685
1686   if (format)
1687     {
1688       ds_put_cstr (&s, ": ");
1689       ds_put_vformat (&s, format, args);
1690     }
1691   if (ds_last (&s) != '.')
1692     ds_put_byte (&s, '.');
1693
1694   struct msg *m = xmalloc (sizeof *m);
1695   *m = (struct msg) {
1696     .category = MSG_C_SYNTAX,
1697     .severity = MSG_S_ERROR,
1698     .location = lex_source_get_location (src, n0, n1),
1699     .text = ds_steal_cstr (&s),
1700   };
1701   msg_emit (m);
1702 }
1703
1704 static void
1705 lex_get_error (struct lex_source *src, const struct lex_token *token)
1706 {
1707   char syntax[64];
1708   str_ellipsize (ss_buffer (&src->buffer[token->token_pos], token->token_len),
1709                  syntax, sizeof syntax);
1710
1711   struct string s = DS_EMPTY_INITIALIZER;
1712   ds_put_format (&s, _("Syntax error at `%s'"), syntax);
1713   ds_put_format (&s, ": %s", token->token.string.string);
1714
1715   struct msg *m = xmalloc (sizeof *m);
1716   *m = (struct msg) {
1717     .category = MSG_C_SYNTAX,
1718     .severity = MSG_S_ERROR,
1719     .location = lex_token_location_rw (src, token, token),
1720     .text = ds_steal_cstr (&s),
1721   };
1722   msg_emit (m);
1723 }
1724
1725 /* Attempts to append an additional token to 'pp' in SRC, reading more from the
1726    underlying lex_reader if necessary.  Returns true if a new token was added
1727    to SRC's deque, false otherwise.  The caller should retry failures unless
1728    SRC's 'eof' marker was set to true indicating that there will be no more
1729    tokens from this source. */
1730 static bool
1731 lex_source_try_get_pp (struct lex_source *src)
1732 {
1733   /* Append a new token to SRC and initialize it. */
1734   struct lex_token *token = xmalloc (sizeof *token);
1735   token->token = (struct token) { .type = T_STOP };
1736   token->macro_rep = NULL;
1737   token->ref_cnt = NULL;
1738   token->token_pos = src->seg_pos;
1739
1740   /* Extract a segment. */
1741   const char *segment;
1742   enum segment_type seg_type;
1743   int seg_len;
1744   for (;;)
1745     {
1746       segment = &src->buffer[src->seg_pos];
1747       seg_len = segmenter_push (&src->segmenter, segment,
1748                                 src->length - src->seg_pos,
1749                                 src->reader->eof, &seg_type);
1750       if (seg_len >= 0)
1751         break;
1752
1753       /* The segmenter needs more input to produce a segment. */
1754       assert (!src->reader->eof);
1755       lex_source_read__ (src);
1756     }
1757
1758   /* Update state based on the segment. */
1759   token->token_len = seg_len;
1760   src->seg_pos += seg_len;
1761   if (seg_type == SEG_NEWLINE)
1762     {
1763       if (src->n_lines >= src->allocated_lines)
1764         src->lines = x2nrealloc (src->lines, &src->allocated_lines,
1765                                  sizeof *src->lines);
1766       src->lines[src->n_lines++] = src->seg_pos;
1767     }
1768
1769   /* Get a token from the segment. */
1770   enum tokenize_result result = token_from_segment (
1771     seg_type, ss_buffer (segment, seg_len), &token->token);
1772
1773   /* If we've reached the end of a line, or the end of a command, then pass
1774      the line to the output engine as a syntax text item.  */
1775   int n_lines = seg_type == SEG_NEWLINE;
1776   if (seg_type == SEG_END_COMMAND && !src->suppress_next_newline)
1777     {
1778       n_lines++;
1779       src->suppress_next_newline = true;
1780     }
1781   else if (n_lines > 0 && src->suppress_next_newline)
1782     {
1783       n_lines--;
1784       src->suppress_next_newline = false;
1785     }
1786   for (int i = 0; i < n_lines; i++)
1787     {
1788       /* Beginning of line. */
1789       const char *line = &src->buffer[src->journal_pos];
1790
1791       /* Calculate line length, including \n or \r\n end-of-line if present.
1792
1793          We use src->length even though that may be beyond what we've actually
1794          converted to tokens.  That's because, if we're emitting the line due
1795          to SEG_END_COMMAND, we want to take the whole line through the
1796          newline, not just through the '.'. */
1797       size_t max_len = src->length - src->journal_pos;
1798       const char *newline = memchr (line, '\n', max_len);
1799       size_t line_len = newline ? newline - line + 1 : max_len;
1800
1801       /* Calculate line length excluding end-of-line. */
1802       size_t copy_len = line_len;
1803       if (copy_len > 0 && line[copy_len - 1] == '\n')
1804         copy_len--;
1805       if (copy_len > 0 && line[copy_len - 1] == '\r')
1806         copy_len--;
1807
1808       /* Submit the line as syntax. */
1809       output_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
1810                                                    xmemdup0 (line, copy_len),
1811                                                    NULL));
1812
1813       src->journal_pos += line_len;
1814     }
1815
1816   switch (result)
1817     {
1818     case TOKENIZE_ERROR:
1819       lex_get_error (src, token);
1820       /* Fall through. */
1821     case TOKENIZE_EMPTY:
1822       lex_token_destroy (token);
1823       return false;
1824
1825     case TOKENIZE_TOKEN:
1826       if (token->token.type == T_STOP)
1827         {
1828           token->token.type = T_ENDCMD;
1829           src->eof = true;
1830         }
1831       lex_stage_push_last (&src->pp, token);
1832       return true;
1833     }
1834   NOT_REACHED ();
1835 }
1836
1837 /* Attempts to append a new token to SRC.  Returns true if successful, false on
1838    failure.  On failure, the end of SRC has been reached and no more tokens
1839    will be forthcoming from it.
1840
1841    Does not make the new token available for lookahead yet; the caller must
1842    adjust SRC's 'middle' pointer to do so. */
1843 static bool
1844 lex_source_get_pp (struct lex_source *src)
1845 {
1846   while (!src->eof)
1847     if (lex_source_try_get_pp (src))
1848       return true;
1849   return false;
1850 }
1851
1852 static bool
1853 lex_source_try_get_merge (const struct lex_source *src_)
1854 {
1855   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1856
1857   if (lex_stage_is_empty (&src->pp) && !lex_source_get_pp (src))
1858     return false;
1859
1860   if (!settings_get_mexpand ())
1861     {
1862       lex_stage_shift (&src->merge, &src->pp, lex_stage_count (&src->pp));
1863       return true;
1864     }
1865
1866   /* Now pass tokens one-by-one to the macro expander.
1867
1868      In the common case where there is no macro to expand, the loop is not
1869      entered.  */
1870   struct macro_call *mc;
1871   int n_call = macro_call_create (src->lexer->macros,
1872                                   &lex_stage_first (&src->pp)->token, &mc);
1873   for (int ofs = 1; !n_call; ofs++)
1874     {
1875       if (lex_stage_count (&src->pp) <= ofs && !lex_source_get_pp (src))
1876         {
1877           /* This should not be reachable because we always get a T_ENDCMD at
1878              the end of an input file (transformed from T_STOP by
1879              lex_source_try_get_pp()) and the macro_expander should always
1880              terminate expansion on T_ENDCMD. */
1881           NOT_REACHED ();
1882         }
1883
1884       const struct lex_token *t = lex_stage_nth (&src->pp, ofs);
1885       const struct macro_token mt = {
1886         .token = t->token,
1887         .syntax = ss_buffer (&src->buffer[t->token_pos], t->token_len),
1888       };
1889       const struct msg_location loc = lex_token_location (src, t, t);
1890       n_call = macro_call_add (mc, &mt, &loc);
1891     }
1892   if (n_call < 0)
1893     {
1894       /* False alarm: no macro expansion after all.  Use first token as
1895          lookahead.  We'll retry macro expansion from the second token next
1896          time around. */
1897       macro_call_destroy (mc);
1898       lex_stage_shift (&src->merge, &src->pp, 1);
1899       return true;
1900     }
1901
1902   /* The first 'n_call' tokens in 'pp', which we bracket as C0...C1, inclusive,
1903      are a macro call.  (These are likely to be the only tokens in 'pp'.)
1904      Expand them.  */
1905   const struct lex_token *c0 = lex_stage_first (&src->pp);
1906   const struct lex_token *c1 = lex_stage_nth (&src->pp, n_call - 1);
1907   struct macro_tokens expansion = { .n = 0 };
1908   struct msg_location loc = lex_token_location (src, c0, c1);
1909   macro_call_expand (mc, src->reader->syntax, &loc, &expansion);
1910   macro_call_destroy (mc);
1911
1912   /* Convert the macro expansion into syntax for possible error messages
1913      later. */
1914   size_t *ofs = xnmalloc (expansion.n, sizeof *ofs);
1915   size_t *len = xnmalloc (expansion.n, sizeof *len);
1916   struct string s = DS_EMPTY_INITIALIZER;
1917   macro_tokens_to_syntax (&expansion, &s, ofs, len);
1918
1919   if (settings_get_mprint ())
1920     output_item_submit (text_item_create (TEXT_ITEM_LOG, ds_cstr (&s),
1921                                           _("Macro Expansion")));
1922
1923   /* Append the macro expansion tokens to the lookahead. */
1924   if (expansion.n > 0)
1925     {
1926       char *macro_rep = ds_steal_cstr (&s);
1927       size_t *ref_cnt = xmalloc (sizeof *ref_cnt);
1928       *ref_cnt = expansion.n;
1929       for (size_t i = 0; i < expansion.n; i++)
1930         {
1931           struct lex_token *token = xmalloc (sizeof *token);
1932           *token = (struct lex_token) {
1933             .token = expansion.mts[i].token,
1934             .token_pos = c0->token_pos,
1935             .token_len = (c1->token_pos + c1->token_len) - c0->token_pos,
1936             .macro_rep = macro_rep,
1937             .ofs = ofs[i],
1938             .len = len[i],
1939             .ref_cnt = ref_cnt,
1940           };
1941           lex_stage_push_last (&src->merge, token);
1942
1943           ss_dealloc (&expansion.mts[i].syntax);
1944         }
1945     }
1946   else
1947     ds_destroy (&s);
1948   free (expansion.mts);
1949   free (ofs);
1950   free (len);
1951
1952   /* Destroy the tokens for the call. */
1953   for (size_t i = 0; i < n_call; i++)
1954     lex_stage_pop_first (&src->pp);
1955
1956   return expansion.n > 0;
1957 }
1958
1959 /* Attempts to obtain at least one new token into 'merge' in SRC.
1960
1961    Returns true if successful, false on failure.  In the latter case, SRC is
1962    exhausted and 'src->eof' is now true. */
1963 static bool
1964 lex_source_get_merge (struct lex_source *src)
1965 {
1966   while (!src->eof)
1967     if (lex_source_try_get_merge (src))
1968       return true;
1969   return false;
1970 }
1971
1972 /* Attempts to obtain at least one new token into 'lookahead' in SRC.
1973
1974    Returns true if successful, false on failure.  In the latter case, SRC is
1975    exhausted and 'src->eof' is now true. */
1976 static bool
1977 lex_source_get_parse (struct lex_source *src)
1978 {
1979   struct merger m = MERGER_INIT;
1980   struct token out;
1981   for (size_t i = 0; ; i++)
1982     {
1983       while (lex_stage_count (&src->merge) <= i && !lex_source_get_merge (src))
1984         {
1985           /* We always get a T_ENDCMD at the end of an input file
1986              (transformed from T_STOP by lex_source_try_get_pp()) and
1987              merger_add() should never return -1 on T_ENDCMD. */
1988           assert (lex_stage_is_empty (&src->merge));
1989           return false;
1990         }
1991
1992       int retval = merger_add (&m, &lex_stage_nth (&src->merge, i)->token,
1993                                &out);
1994       if (!retval)
1995         {
1996           lex_source_push_parse (src, lex_stage_take_first (&src->merge));
1997           return true;
1998         }
1999       else if (retval > 0)
2000         {
2001           /* Add a token that merges all the tokens together. */
2002           const struct lex_token *first = lex_stage_first (&src->merge);
2003           const struct lex_token *last = lex_stage_nth (&src->merge,
2004                                                         retval - 1);
2005           bool macro = first->macro_rep && first->macro_rep == last->macro_rep;
2006           struct lex_token *t = xmalloc (sizeof *t);
2007           *t = (struct lex_token) {
2008             .token = out,
2009             .token_pos = first->token_pos,
2010             .token_len = (last->token_pos - first->token_pos) + last->token_len,
2011
2012             /* This works well if all the tokens were not expanded from macros,
2013                or if they came from the same macro expansion.  It just gives up
2014                in the other (corner) cases. */
2015             .macro_rep = macro ? first->macro_rep : NULL,
2016             .ofs = macro ? first->ofs : 0,
2017             .len = macro ? (last->ofs - first->ofs) + last->len : 0,
2018             .ref_cnt = macro ? first->ref_cnt : NULL,
2019           };
2020           if (t->ref_cnt)
2021             ++*t->ref_cnt;
2022           lex_source_push_parse (src, t);
2023
2024           for (int i = 0; i < retval; i++)
2025             lex_stage_pop_first (&src->merge);
2026           return true;
2027         }
2028     }
2029 }
2030 \f
2031 static void
2032 lex_source_push_endcmd__ (struct lex_source *src)
2033 {
2034   assert (src->n_parse == 0);
2035
2036   struct lex_token *token = xmalloc (sizeof *token);
2037   *token = (struct lex_token) { .token = { .type = T_ENDCMD } };
2038   lex_source_push_parse (src, token);
2039 }
2040
2041 static void
2042 lex_source_push_parse (struct lex_source *src, struct lex_token *token)
2043 {
2044   if (src->n_parse >= src->allocated_parse)
2045     src->parse = x2nrealloc (src->parse, &src->allocated_parse,
2046                              sizeof *src->parse);
2047   src->parse[src->n_parse++] = token;
2048 }
2049
2050 static void
2051 lex_source_clear_parse (struct lex_source *src)
2052 {
2053   for (size_t i = 0; i < src->n_parse; i++)
2054     lex_token_destroy (src->parse[i]);
2055   src->n_parse = src->parse_ofs = 0;
2056 }
2057
2058 static struct lex_source *
2059 lex_source_create (struct lexer *lexer, struct lex_reader *reader)
2060 {
2061   size_t allocated_lines = 4;
2062   size_t *lines = xmalloc (allocated_lines * sizeof *lines);
2063   *lines = 0;
2064
2065   struct lex_source *src = xmalloc (sizeof *src);
2066   *src = (struct lex_source) {
2067     .n_refs = 1,
2068     .reader = reader,
2069     .segmenter = segmenter_init (reader->syntax, false),
2070     .lexer = lexer,
2071     .lines = lines,
2072     .n_lines = 1,
2073     .allocated_lines = allocated_lines,
2074   };
2075
2076   lex_source_push_endcmd__ (src);
2077
2078   return src;
2079 }
2080
2081 void
2082 lex_set_message_handler (struct lexer *lexer,
2083                          void (*output_msg) (const struct msg *,
2084                                              struct lexer *))
2085 {
2086   struct msg_handler msg_handler = {
2087     .output_msg = (void (*)(const struct msg *, void *)) output_msg,
2088     .aux = lexer,
2089     .lex_source_ref = lex_source_ref,
2090     .lex_source_unref = lex_source_unref,
2091     .lex_source_get_line = lex_source_get_line,
2092   };
2093   msg_set_handler (&msg_handler);
2094 }
2095
2096 void
2097 lex_source_ref (const struct lex_source *src_)
2098 {
2099   struct lex_source *src = CONST_CAST (struct lex_source *, src_);
2100   if (src)
2101     {
2102       assert (src->n_refs > 0);
2103       src->n_refs++;
2104     }
2105 }
2106
2107 void
2108 lex_source_unref (struct lex_source *src)
2109 {
2110   if (!src)
2111     return;
2112
2113   assert (src->n_refs > 0);
2114   if (--src->n_refs > 0)
2115     return;
2116
2117   char *file_name = src->reader->file_name;
2118   char *encoding = src->reader->encoding;
2119   if (src->reader->class->destroy != NULL)
2120     src->reader->class->destroy (src->reader);
2121   free (file_name);
2122   free (encoding);
2123   free (src->buffer);
2124   free (src->lines);
2125   lex_stage_uninit (&src->pp);
2126   lex_stage_uninit (&src->merge);
2127   lex_source_clear_parse (src);
2128   free (src->parse);
2129   free (src);
2130 }
2131 \f
2132 struct lex_file_reader
2133   {
2134     struct lex_reader reader;
2135     struct u8_istream *istream;
2136   };
2137
2138 static struct lex_reader_class lex_file_reader_class;
2139
2140 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
2141    from stdin if FILE_NAME is "-").  The file is expected to be encoded with
2142    ENCODING, which should take one of the forms accepted by
2143    u8_istream_for_file().  SYNTAX and ERROR become the syntax mode and error
2144    mode of the new reader, respectively.
2145
2146    Returns a null pointer if FILE_NAME cannot be opened. */
2147 struct lex_reader *
2148 lex_reader_for_file (const char *file_name, const char *encoding,
2149                      enum segmenter_mode syntax,
2150                      enum lex_error_mode error)
2151 {
2152   struct lex_file_reader *r;
2153   struct u8_istream *istream;
2154
2155   istream = (!strcmp(file_name, "-")
2156              ? u8_istream_for_fd (encoding, STDIN_FILENO)
2157              : u8_istream_for_file (encoding, file_name, O_RDONLY));
2158   if (istream == NULL)
2159     {
2160       msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
2161       return NULL;
2162     }
2163
2164   r = xmalloc (sizeof *r);
2165   lex_reader_init (&r->reader, &lex_file_reader_class);
2166   r->reader.syntax = syntax;
2167   r->reader.error = error;
2168   r->reader.file_name = xstrdup (file_name);
2169   r->reader.encoding = xstrdup_if_nonnull (encoding);
2170   r->reader.line_number = 1;
2171   r->istream = istream;
2172
2173   return &r->reader;
2174 }
2175
2176 static struct lex_file_reader *
2177 lex_file_reader_cast (struct lex_reader *r)
2178 {
2179   return UP_CAST (r, struct lex_file_reader, reader);
2180 }
2181
2182 static size_t
2183 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
2184                enum prompt_style prompt_style UNUSED)
2185 {
2186   struct lex_file_reader *r = lex_file_reader_cast (r_);
2187   ssize_t n_read = u8_istream_read (r->istream, buf, n);
2188   if (n_read < 0)
2189     {
2190       msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
2191       return 0;
2192     }
2193   return n_read;
2194 }
2195
2196 static void
2197 lex_file_close (struct lex_reader *r_)
2198 {
2199   struct lex_file_reader *r = lex_file_reader_cast (r_);
2200
2201   if (u8_istream_fileno (r->istream) != STDIN_FILENO)
2202     {
2203       if (u8_istream_close (r->istream) != 0)
2204         msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
2205     }
2206   else
2207     u8_istream_free (r->istream);
2208
2209   free (r);
2210 }
2211
2212 static struct lex_reader_class lex_file_reader_class =
2213   {
2214     lex_file_read,
2215     lex_file_close
2216   };
2217 \f
2218 struct lex_string_reader
2219   {
2220     struct lex_reader reader;
2221     struct substring s;
2222     size_t offset;
2223   };
2224
2225 static struct lex_reader_class lex_string_reader_class;
2226
2227 /* Creates and returns a new lex_reader for the contents of S, which must be
2228    encoded in the given ENCODING.  The new reader takes ownership of S and will free it
2229    with ss_dealloc() when it is closed. */
2230 struct lex_reader *
2231 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
2232 {
2233   struct lex_string_reader *r;
2234
2235   r = xmalloc (sizeof *r);
2236   lex_reader_init (&r->reader, &lex_string_reader_class);
2237   r->reader.syntax = SEG_MODE_AUTO;
2238   r->reader.encoding = xstrdup_if_nonnull (encoding);
2239   r->s = s;
2240   r->offset = 0;
2241
2242   return &r->reader;
2243 }
2244
2245 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
2246    which must be encoded in ENCODING.  The caller retains ownership of S. */
2247 struct lex_reader *
2248 lex_reader_for_string (const char *s, const char *encoding)
2249 {
2250   struct substring ss;
2251   ss_alloc_substring (&ss, ss_cstr (s));
2252   return lex_reader_for_substring_nocopy (ss, encoding);
2253 }
2254
/* Formats FORMAT as a printf()-like format string, taking the values to
   format from the arguments that follow ENCODING, and creates and returns a
   new lex_reader for the formatted result.  The formatted text becomes the
   property of the new reader.  ENCODING is passed along to
   lex_reader_for_substring_nocopy(). */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *text = xvasprintf (format, args);
  va_end (args);

  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
2269
2270 static struct lex_string_reader *
2271 lex_string_reader_cast (struct lex_reader *r)
2272 {
2273   return UP_CAST (r, struct lex_string_reader, reader);
2274 }
2275
2276 static size_t
2277 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
2278                  enum prompt_style prompt_style UNUSED)
2279 {
2280   struct lex_string_reader *r = lex_string_reader_cast (r_);
2281   size_t chunk;
2282
2283   chunk = MIN (n, r->s.length - r->offset);
2284   memcpy (buf, r->s.string + r->offset, chunk);
2285   r->offset += chunk;
2286
2287   return chunk;
2288 }
2289
2290 static void
2291 lex_string_close (struct lex_reader *r_)
2292 {
2293   struct lex_string_reader *r = lex_string_reader_cast (r_);
2294
2295   ss_dealloc (&r->s);
2296   free (r);
2297 }
2298
2299 static struct lex_reader_class lex_string_reader_class =
2300   {
2301     lex_string_read,
2302     lex_string_close
2303   };
2304 \f
2305 struct substring
2306 lex_source_get_line (const struct lex_source *src, int line)
2307 {
2308   if (line < 1 || line > src->n_lines)
2309     return ss_empty ();
2310
2311   size_t ofs = src->lines[line - 1];
2312   size_t end = line >= src->n_lines ? src->length : src->lines[line];
2313   return ss_buffer (&src->buffer[ofs], end - ofs);
2314 }